diff --git a/.dockerignore b/.dockerignore
index e47f48873..60583dbf9 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -6,6 +6,7 @@ build/
 *.pyc
 *.pyo
 *.pyd
+.pytest_cache/
 
 # Git
 **/.git
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 509ac6d48..7ac2f1649 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -49,7 +49,7 @@ jobs:
     - name: Build
       run: |
         mkdir build && cd build
-        cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BUILD_PYTHON=ON -DARK_BYPASS_GPU_CHECK=ON -DARK_USE_CUDA=ON -DARK_BUILD_TESTS=OFF ..
+        cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BYPASS_GPU_CHECK=ON -DARK_USE_CUDA=ON -DARK_BUILD_TESTS=OFF ..
         make build ark_py
 
     - name: Perform CodeQL Analysis
@@ -95,7 +95,7 @@ jobs:
     - name: Build
       run: |
         mkdir build && cd build
-        CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BUILD_PYTHON=ON -DARK_BYPASS_GPU_CHECK=ON -DARK_USE_ROCM=ON -DARK_BUILD_TESTS=OFF ..
+        CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BYPASS_GPU_CHECK=ON -DARK_USE_ROCM=ON -DARK_BUILD_TESTS=OFF ..
         make -j build ark_py
 
     - name: Perform CodeQL Analysis
diff --git a/.github/workflows/ut-cuda.yml b/.github/workflows/ut-cuda.yml
index 363f1b771..10b0679da 100644
--- a/.github/workflows/ut-cuda.yml
+++ b/.github/workflows/ut-cuda.yml
@@ -44,7 +44,7 @@ jobs:
       - name: Build
         run: |
           mkdir build && cd build
-          cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BUILD_PYTHON=ON ..
+          cmake -DCMAKE_BUILD_TYPE=Debug ..
           make -j ut ark_py
 
       - name: Run C++ UT
@@ -71,7 +71,11 @@ jobs:
       - name: Run Python UT
         run: |
           cd build
-          ARK_ROOT=$PWD pytest --cov=../python/ark --cov-report lcov:py_coverage.info --verbose ../python/unittest/test.py
+          PYTHONPATH=$PWD/python ARK_ROOT=$PWD python3 -m pytest \
+              --cov=python/ark \
+              --cov-report lcov:py_coverage.info \
+              --verbose \
+              ../python/unittest/test.py
 
       - name: Report Coverage
         env:
diff --git a/.gitmodules b/.gitmodules
index ced5dcf94..ec484eb61 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -17,3 +17,7 @@
 [submodule "third_party/json"]
 	path = third_party/json
 	url = https://github.com/nlohmann/json
+
+[submodule "third_party/dlpack"]
+	path = third_party/dlpack
+	url = https://github.com/dmlc/dlpack
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 640196a66..00260f078 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -3,8 +3,6 @@
     "cmake.environment": {
         "ARK_ROOT": "${workspaceFolder}/build",
         "ARK_IGNORE_BINARY_CACHE": "1",
-        "ARK_DISABLE_GRAPH_OPT": "0",
-        "ARK_IPC_LISTEN_PORT_BASE": "42000",
         // "ARK_LOG_LEVEL": "DEBUG"
     },
     "cmake.ctestArgs": [
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2e80ea1e8..c3b09b0e6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -17,6 +17,7 @@ option(ARK_USE_CUDA "Use NVIDIA/CUDA." OFF)
 option(ARK_USE_ROCM "Use AMD/ROCm." OFF)
 option(ARK_BYPASS_GPU_CHECK "Bypass GPU check." OFF)
 option(ARK_BUILD_TESTS "Build unit tests." ON)
+option(ARK_BUILD_PYTHON "Build Python module." ON)
 
 if(ARK_BYPASS_GPU_CHECK)
     if(ARK_USE_CUDA)
diff --git a/ark/CMakeLists.txt b/ark/CMakeLists.txt
index 208d9f9cb..9616ea875 100644
--- a/ark/CMakeLists.txt
+++ b/ark/CMakeLists.txt
@@ -17,6 +17,7 @@ set(COMMON_LIBS ARK::numa ARK::ibverbs pthread rt)
 target_include_directories(ark_obj PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
 target_include_directories(ark_obj PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
 target_include_directories(ark_obj SYSTEM PRIVATE
+    ${DLPACK_INCLUDE_DIRS}
     ${JSON_INCLUDE_DIRS}
     ${MSCCLPP_INCLUDE_DIRS}
     ${IBVERBS_INCLUDE_DIRS}
diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index c8e2e7df6..c121328c2 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -4,15 +4,18 @@
 #include "ark/executor.hpp"
 
 #include <cmath>
+#include <list>
 #include <memory>
 #include <mscclpp/core.hpp>
 #include <mscclpp/proxy_channel.hpp>
 #include <mscclpp/sm_channel.hpp>
 
+#include "ark/data_type.hpp"
 #include "ark/model.hpp"
 #include "ark/planner.hpp"
 #include "codegen.hpp"
 #include "env.h"
+#include "external_buffer_registry.hpp"
 #include "file_io.h"
 #include "gpu/gpu.hpp"
 #include "gpu/gpu_event.hpp"
@@ -140,58 +143,89 @@ static size_t tensor_stride_bytes(const Json &tensor) {
 
 class Executor::Impl {
    public:
-    Impl(int device_id, Stream stream, const std::string &name, bool loop_mode);
+    // Value-initialize the stream handle and the loop flag so that stream()
+    // does not read an indeterminate value before they are assigned.
+    // NOTE(review): the reference member `buffer_manager_` is neither bound
+    // here nor given a default initializer in the visible declaration — TODO
+    // confirm how it is meant to be initialized.
+    Impl() : stream_raw_(), loop_mode_(false), plan_json_(), device_id_(-1) {}
     ~Impl();
 
-    void init(const PlanJson& plan);
-
     int device_id() const { return device_id_; }
 
     Stream stream() const { return reinterpret_cast<Stream>(stream_raw_); }
 
+    std::shared_ptr<GpuMemory> buffer() const {
+        return buffers_.empty() ? nullptr : buffers_.back();
+    }
+
     std::string plan() const { return plan_json_.dump_pretty(); }
 
-    void compile();
-    void launch();
+    const std::string &name() const { return name_; }
+
+    void compile(const std::string &plan, int device_id,
+                 const std::string &name,
+                 const std::unordered_map<Tensor, void *> &external_tensors);
+    // launch() and run() receive the `placeholder_data` map forwarded by
+    // Executor::launch() and Executor::run().
+    void launch(Stream stream, bool loop_mode,
+                const std::unordered_map<const Tensor, const void *>
+                    &placeholder_data);
-    void run(int iter);
+    void run(int iter,
+             const std::unordered_map<const Tensor, const void *>
+                 &placeholder_data);
     void wait(int64_t max_spin_count);
     float stop(int64_t max_spin_count);
     void barrier();
 
-    uintptr_t tensor_address(const Tensor &tensor) const;
+    void *tensor_address(const Tensor &tensor) const;
 
     void tensor_read(const Tensor &tensor, void *data, size_t bytes,
                      Stream stream, bool is_d2d) const;
     void tensor_write(const Tensor &tensor, const void *data, size_t bytes,
                       Stream stream, bool is_d2d) const;
 
+   protected:
+    friend class DefaultExecutor;
+
+    gpuStream stream_raw_;
+    bool loop_mode_;
+
    private:
+    void init(const PlanJson &plan_json, int device_id,
+              const std::string &name);
     void init_communicator();
     std::map<size_t, size_t> init_buffers(const Json &plan_json);
+    std::map<size_t, void *> init_buffer_addrs(
+        std::shared_ptr<GpuMemory> buffer,
+        const std::map<size_t, size_t> &buffer_id_to_offset);
     std::set<int> init_remote_ranks(const Json &plan_json) const;
     void init_channels(const std::set<int> &remote_ranks);
 
-   protected:
+    PlanJson plan_json_;
     int device_id_;
     std::string name_;
-    bool loop_mode_;
-
-    gpuStream stream_raw_;
 
     int rank_;
     int world_size_;
 
+    std::string kernel_name_;
+
     bool is_launched_ = false;
     bool is_recording_ = false;
     float elapsed_msec_ = -1;
 
-    PlanJson plan_json_;
+    ModelBufferManager &buffer_manager_;
+    std::vector<void *> external_buffers_;
+    std::vector<std::string> external_args_;
+    std::map<size_t, std::string> buffer_id_to_name_;
     std::map<size_t, size_t> buffer_id_to_offset_;
+    std::map<size_t, void *> buffer_id_to_addr_;
     size_t total_bytes_;
     std::shared_ptr<CodeGenerator> codegen_;
     std::shared_ptr<GpuEvent> timer_begin_;
     std::shared_ptr<GpuEvent> timer_end_;
-    std::shared_ptr<GpuMemory> buffer_;
+    std::list<std::shared_ptr<GpuMemory>> buffers_;
     std::shared_ptr<GpuHostMemory> flag_;
     std::shared_ptr<GpuStream> stream_;
     std::shared_ptr<GpuKernel> kernel_;
@@ -205,26 +228,25 @@ class Executor::Impl {
         rank_to_sm_channels_;
 };
 
-Executor::Impl::Impl(int device_id, Stream stream, const std::string &name,
-                     bool loop_mode)
-    : device_id_(device_id), name_(name), loop_mode_(loop_mode) {
-    if (device_id < 0) {
-        ERR(InvalidUsageError, "Invalid device ID ", device_id);
-    }
-    if (stream) {
-        stream_raw_ = reinterpret_cast<gpuStream>(stream);
-    } else {
-        stream_ = GpuManager::get_instance(device_id_)->create_stream();
-        stream_raw_ = stream_->get();
-    }
-}
-
 Executor::Impl::~Impl() {
     if (is_launched_) stop(-1);
 }
 
-void Executor::Impl::init(const PlanJson &plan_json) {
+void Executor::Impl::init(const PlanJson &plan_json, int device_id,
+                          const std::string &name) {
+    if (device_id < 0) {
+        ERR(InvalidUsageError, "Invalid device ID ", device_id);
+    }
+
     plan_json_ = plan_json;
+    device_id_ = device_id;
+    name_ = name;
+
+    external_buffers_.clear();
+    external_args_.clear();
+    buffer_id_to_name_.clear();
+    total_bytes_ = 0;
+
     rank_ = plan_json_["Rank"].get<int>();
     world_size_ = plan_json_["WorldSize"].get<int>();
 
@@ -232,11 +254,12 @@ void Executor::Impl::init(const PlanJson &plan_json) {
         ERR(InvalidUsageError, "Invalid rank ", rank_, " with world size ",
             world_size_);
     }
-    if (world_size_ > 1) {
+    if (world_size_ > 1 && !comm_) {
         init_communicator();
     }
 
     auto gpu_manager = GpuManager::get_instance(device_id_);
+
     if (!gpu_manager->info().arch->belongs_to(
             Arch::from_name(plan_json.at("Architecture")))) {
         LOG(WARN, "Architecture name of the plan `",
@@ -253,12 +276,18 @@ void Executor::Impl::init(const PlanJson &plan_json) {
             std::to_string(kv.first) + ": " + std::to_string(kv.second) + ", ";
     }
 
-    codegen_ = std::make_shared<CodeGenerator>(plan_json_, buffer_id_to_offset_,
-                                               name_);
-
     timer_begin_ = gpu_manager->create_event();
     timer_end_ = gpu_manager->create_event();
-    buffer_ = gpu_manager->malloc(total_bytes_, 65536);
+    if (total_bytes_ > 0) {
+        buffers_.push_back(gpu_manager->malloc(total_bytes_, 65536));
+        buffer_id_to_addr_ =
+            init_buffer_addrs(buffers_.back(), buffer_id_to_offset_);
+    }
+
+    codegen_ = std::make_shared<CodeGenerator>(plan_json_, buffer_id_to_offset_,
+                                               external_args_,
+                                               buffer_id_to_name_, name_);
+
     flag_ = gpu_manager->malloc_host(
         sizeof(int), gpuHostAllocMapped | gpuHostAllocWriteCombined);
 
@@ -268,24 +297,14 @@ void Executor::Impl::init(const PlanJson &plan_json) {
     size_t smem_block_total =
         static_cast<size_t>(gpu_manager->info().smem_block_total);
 
-    if (world_size_ > 1) {
+    if (world_size_ > 1 && total_bytes_ > 0) {
         auto remote_ranks = init_remote_ranks(plan_json_);
         init_channels(remote_ranks);
     }
 
-    std::string kernel_name;
-    if (loop_mode_) {
-        kernel_name = "ark_loop_kernel";
-    } else {
-        kernel_name = "ark_kernel";
-    }
-    if (!name_.empty()) {
-        kernel_name += "_" + name_;
-    }
-
-    kernel_ = std::shared_ptr<GpuKernel>(new GpuKernel(
-        device_id_, codegen_->code(), {threads_per_block, 1, 1}, {num_sm, 1, 1},
-        std::max(smem_block_total, size_t(4)), kernel_name));
+    kernel_ = std::shared_ptr<GpuKernel>(
+        new GpuKernel(device_id_, codegen_->code(), {threads_per_block, 1, 1},
+                      {num_sm, 1, 1}, std::max(smem_block_total, size_t(4))));
 }
 
 void Executor::Impl::init_communicator() {
@@ -297,6 +316,21 @@ void Executor::Impl::init_communicator() {
     comm_ = std::make_shared<mscclpp::Communicator>(bootstrap);
 }
 
+std::map<size_t, void *> Executor::Impl::init_buffer_addrs(
+    std::shared_ptr<GpuMemory> buffer,
+    const std::map<size_t, size_t> &buffer_id_to_offset) {
+    // Seed the result with every address recorded so far, so a later plan
+    // that refers to buffers of an earlier plan can still resolve them.
+    // Copying an empty map is a no-op, so no emptiness check is required.
+    std::map<size_t, void *> addrs = buffer_id_to_addr_;
+    // Add (or overwrite) the entries of this plan, resolved inside `buffer`.
+    for (const auto &entry : buffer_id_to_offset) {
+        const size_t offset = entry.second;
+        addrs[entry.first] = buffer->ref(offset);
+    }
+    return addrs;
+}
+
 std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
     class BufferInfo {
        public:
@@ -371,39 +405,65 @@ std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
     std::map<int, std::map<int, size_t>> remote_rank_to_send_tag_to_buffer_id;
     std::map<int, std::map<int, size_t>> remote_rank_to_recv_tag_to_buffer_id;
 
+    auto &ext_buf_reg = ExternalBufferRegistry::get_instance();
+
     // TODO: improve memory planning
     size_t offset = 0;
     for (auto &kv : buffer_id_to_info) {
         auto &buf_info = kv.second;
         int r = buf_info->buffer->rank();
+        const size_t buf_id = buf_info->buffer->id();
         if (r != rank_ && r != -1) {
             // this is a remote buffer
             for (const auto &tag_info : buf_info->buffer->send_tags()) {
                 remote_rank_to_send_tag_to_buffer_id[buf_info->buffer->rank()]
-                                                    [tag_info.second] =
-                                                        buf_info->buffer->id();
+                                                    [tag_info.second] = buf_id;
             }
             for (const auto &tag_info : buf_info->buffer->recv_tags()) {
                 remote_rank_to_recv_tag_to_buffer_id[buf_info->buffer->rank()]
-                                                    [tag_info.second] =
-                                                        buf_info->buffer->id();
+                                                    [tag_info.second] = buf_id;
             }
             continue;
         }
-        buffer_id_to_offset[buf_info->buffer->id()] = offset;
-        for (const auto &tag_info : buf_info->buffer->send_tags()) {
-            remote_rank_to_send_tags_and_offsets[tag_info.first]
-                .first.push_back(tag_info.second);
-            remote_rank_to_send_tags_and_offsets[tag_info.first]
-                .second.push_back(offset);
+        void *ext_data = ext_buf_reg.get(buf_id);
+        if (ext_data) {
+            gpuPointerAttributes attr;
+            GLOG(gpuPointerGetAttributes(&attr, ext_data));
+            if (attr.device != device_id_) {
+                ERR(InvalidUsageError,
+                    "External data provided is on a different GPU: ",
+                    attr.device, " vs ", device_id_);
+            }
+            external_buffers_.push_back(ext_data);
+            const std::string name = "extern_buf_" + std::to_string(buf_id);
+            external_args_.push_back(name);
+            buffer_id_to_name_[buf_id] = name;
+            continue;
         }
-        for (const auto &tag_info : buf_info->buffer->recv_tags()) {
-            remote_rank_to_recv_tags_and_offsets[tag_info.first]
-                .first.push_back(tag_info.second);
-            remote_rank_to_recv_tags_and_offsets[tag_info.first]
-                .second.push_back(offset);
+        // A buffer already placed by a previous plan is not re-allocated: its
+        // existing address is passed to the kernel as an external argument.
+        if (buffer_id_to_offset_.find(buf_id) != buffer_id_to_offset_.end()) {
+            external_buffers_.push_back(buffer_id_to_addr_[buf_id]);
+            const std::string name = "extern_buf_" + std::to_string(buf_id);
+            external_args_.push_back(name);
+            buffer_id_to_name_[buf_id] = name;
+            continue;
+        } else {
+            buffer_id_to_offset[buf_id] = offset;
+            for (const auto &tag_info : buf_info->buffer->send_tags()) {
+                remote_rank_to_send_tags_and_offsets[tag_info.first]
+                    .first.push_back(tag_info.second);
+                remote_rank_to_send_tags_and_offsets[tag_info.first]
+                    .second.push_back(offset);
+            }
+            for (const auto &tag_info : buf_info->buffer->recv_tags()) {
+                remote_rank_to_recv_tags_and_offsets[tag_info.first]
+                    .first.push_back(tag_info.second);
+                remote_rank_to_recv_tags_and_offsets[tag_info.first]
+                    .second.push_back(offset);
+            }
+            offset += buf_info->bytes;
         }
-        offset += buf_info->bytes;
     }
     total_bytes_ = offset;
 
@@ -479,7 +539,13 @@ std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
         bootstrap->recv(tags.data(), len * sizeof(int), remote_rank, 1);
         bootstrap->recv(offsets.data(), len * sizeof(size_t), remote_rank, 2);
         for (int i = 0; i < len; ++i) {
-            buffer_id_to_offset[send_tag_to_buffer_id[tags[i]]] = offsets[i];
+            const size_t buf_id =
+                buffer_id_to_info[send_tag_to_buffer_id[tags[i]]]->buffer->id();
+            void *buf_data = ext_buf_reg.get(buf_id);
+            if (buf_data == nullptr) {
+                buffer_id_to_offset[send_tag_to_buffer_id[tags[i]]] =
+                    offsets[i];
+            }
         }
     }
     for (auto &kv : remote_rank_to_recv_tag_to_buffer_id) {
@@ -495,10 +561,15 @@ std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
         bootstrap->recv(tags.data(), len * sizeof(int), remote_rank, 4);
         bootstrap->recv(offsets.data(), len * sizeof(size_t), remote_rank, 5);
         for (int i = 0; i < len; ++i) {
-            buffer_id_to_offset[recv_tag_to_buffer_id[tags[i]]] = offsets[i];
+            const size_t buf_id =
+                buffer_id_to_info[recv_tag_to_buffer_id[tags[i]]]->buffer->id();
+            void *buf_data = ext_buf_reg.get(buf_id);
+            if (buf_data == nullptr) {
+                buffer_id_to_offset[recv_tag_to_buffer_id[tags[i]]] =
+                    offsets[i];
+            }
         }
     }
-
     return buffer_id_to_offset;
 }
 
@@ -530,7 +601,9 @@ std::set<int> Executor::Impl::init_remote_ranks(const Json &plan_json) const {
 }
 
 void Executor::Impl::init_channels(const std::set<int> &remote_ranks) {
-    proxy_service_ = std::make_shared<mscclpp::ProxyService>();
+    if (!proxy_service_) {
+        proxy_service_ = std::make_shared<mscclpp::ProxyService>();
+    }
 
     int num_ranks_per_node = get_env().num_ranks_per_host;
     auto rank_to_node = [&](int rank) { return rank / num_ranks_per_node; };
@@ -547,8 +620,8 @@ void Executor::Impl::init_channels(const std::set<int> &remote_ranks) {
     if (!get_env().disable_ib) {
         all_transports |= IBs[device_id_];
     }
-    mscclpp::RegisteredMemory regmem =
-        comm_->registerMemory(buffer_->ref(), buffer_->bytes(), all_transports);
+    mscclpp::RegisteredMemory regmem = comm_->registerMemory(
+        buffers_.back()->ref(), buffers_.back()->bytes(), all_transports);
 
     std::map<int, std::vector<mscclpp::NonblockingFuture<
                       std::shared_ptr<mscclpp::Connection>>>>
@@ -622,16 +695,67 @@ void Executor::Impl::init_channels(const std::set<int> &remote_ranks) {
     }
 }
 
-void Executor::Impl::compile() { kernel_->compile(); }
+// Parses `plan`, re-initializes the executor for `device_id`, binds the
+// caller-provided `external_tensors` addresses, and compiles the kernel.
+void Executor::Impl::compile(
+    const std::string &plan, int device_id, const std::string &name,
+    const std::unordered_map<Tensor, void *> &external_tensors) {
+    if (is_launched_) {
+        ERR(InvalidUsageError, "Need to stop before re-compiling.");
+    }
+    try {
+        auto plan_json = Json::parse(plan);
+        init(plan_json, device_id, name);
+    } catch (const ::nlohmann::json::parse_error &e) {
+        ERR(InvalidUsageError, "Failed to parse the plan JSON: ", e.what());
+    }
+    // NOTE(review): init() already built codegen_; verify these names reach it.
+    for (const auto &[tensor, addr] : external_tensors) {
+        const size_t buf_id = tensor.ref()->buffer()->id();
+        if (!buffer_manager_.is_staged(buf_id)) {
+            ERR(InvalidUsageError,
+                "Cannot set the buffer address for tensor with buffer:", buf_id,
+                " the address is already bound. Address setting is only "
+                "allowed for delayed binding of uninitialized buffers.");
+        }
+        buffer_manager_.set_buffer_address(buf_id, addr);
+        external_buffers_.push_back(addr);
+        const std::string arg_name = "extern_buf_" + std::to_string(buf_id);
+        external_args_.push_back(arg_name);
+        buffer_id_to_name_[buf_id] = arg_name;
+    }
+    kernel_->compile();
+}
 
-void Executor::Impl::launch() {
-    if (!kernel_->is_compiled()) {
-        ERR(InvalidUsageError, "Need to compile first before initialization.");
+void Executor::Impl::launch(
+    Stream stream, bool loop_mode,
+    const std::unordered_map<const Tensor, const void *> &placeholder_data) {
+    if ((kernel_ == nullptr) || !kernel_->is_compiled()) {
+        ERR(InvalidUsageError, "Need to compile first before launch.");
     }
     if (is_launched_) {
         LOG(WARN, "Ignore launching twice.");
         return;
     }
+    if (stream) {
+        stream_raw_ = reinterpret_cast<gpuStream>(stream);
+    } else {
+        stream_ = GpuManager::get_instance(device_id_)->create_stream();
+        stream_raw_ = stream_->get();
+    }
+    loop_mode_ = loop_mode;
+
+    if (loop_mode_) {
+        // TODO: consider adding an identifier that tells which plan the
+        // kernel executes, e.g. `ark_loop_kernel_2` for the second plan.
+        kernel_name_ = "ark_loop_kernel";
+    } else {
+        kernel_name_ = "ark_kernel";
+    }
+    if (!name_.empty()) {
+        kernel_name_ += "_" + name_;
+    }
+
     auto get_global_rt = [&](const std::string &symbol) {
         return reinterpret_cast<void *>(kernel_->get_global(symbol));
     };
@@ -686,27 +810,35 @@ void Executor::Impl::launch() {
     if (loop_mode_) {
         // Initialize loop flags.
         atomicStoreRelaxed(flag_->ref<int>(), 0);
-        void *buf_ptr = buffer_->ref();
+        void *buf_ptr = (buffers_.empty()) ? nullptr : buffers_.back()->ref();
         void *flag_ptr = flag_->ref();
         std::vector<void *> args = {&buf_ptr, &flag_ptr};
-        kernel_->launch(stream_raw_, args);
+        for (auto &buffer : external_buffers_) {
+            args.push_back(&buffer);
+        }
+        kernel_->launch(kernel_name_, stream_raw_, args);
     }
     is_recording_ = true;
     is_launched_ = true;
 }
 
-void Executor::Impl::run(int iter) {
+void Executor::Impl::run(
+    int iter,
+    const std::unordered_map<const Tensor, const void *> &placeholder_data) {
     if (iter <= 0) return;
     if (loop_mode_) {
         while (atomicLoadRelaxed(flag_->ref<int>()) > 0) {
         }
         atomicStoreRelaxed(flag_->ref<int>(), iter);
     } else {
-        void *buf_ptr = buffer_->ref();
+        void *buf_ptr = (buffers_.empty()) ? nullptr : buffers_.back()->ref();
         int i = 0;
         std::vector<void *> args = {&buf_ptr, reinterpret_cast<void *>(&i)};
+        for (auto &buffer : external_buffers_) {
+            args.push_back(&buffer);
+        }
         for (; i < iter; i++) {
-            kernel_->launch(stream_raw_, args);
+            kernel_->launch(kernel_name_, stream_raw_, args);
         }
     }
 }
@@ -722,9 +854,8 @@ void Executor::Impl::wait(int64_t max_spin_count) {
             gpuError res = gpuStreamQuery(stream_raw_);
             if (res == gpuSuccess) {
                 if (atomicLoadRelaxed(flag_->ref<int>()) > 0) {
-                    LOG(WARN,
+                    ERR(InternalError,
                         "Stream is finished but the loop flag is still set.");
-                    break;
                 } else {
                     LOG(WARN,
                         "wait() is delayed by a stream query. Regarding "
@@ -771,13 +902,20 @@ void Executor::Impl::barrier() {
     }
 }
 
-uintptr_t Executor::Impl::tensor_address(const Tensor &tensor) const {
+void *Executor::Impl::tensor_address(const Tensor &tensor) const {
     size_t buffer_id = tensor.ref()->buffer()->id();
-    if (buffer_id_to_offset_.find(buffer_id) == buffer_id_to_offset_.end()) {
-        ERR(InternalError, "Invalid buffer ID: ", buffer_id);
+    // Externally registered addresses take precedence over executor-owned ones.
+    void *ext_data = ExternalBufferRegistry::get_instance().get(buffer_id);
+    if (ext_data) return ext_data;
+    // Look the ID up once and reuse the iterator for the return value.
+    const auto addr_it = buffer_id_to_addr_.find(buffer_id);
+    if (addr_it == buffer_id_to_addr_.end()) {
+        ERR(InvalidUsageError, "Tensor has an unknown buffer ID ", buffer_id,
+            ". This is likely caused by accessing a tensor that is optimized "
+            "out by the compiler or not used in any plan passed to the "
+            "executor.");
     }
-    size_t offset = buffer_id_to_offset_.at(buffer_id);
-    return reinterpret_cast<uintptr_t>(buffer_->ref(offset));
+    return addr_it->second;
 }
 
 void Executor::Impl::tensor_read(const Tensor &tensor, void *data, size_t bytes,
@@ -803,7 +941,7 @@ void Executor::Impl::tensor_read(const Tensor &tensor, void *data, size_t bytes,
             ") mismatches the tensor data bytes (", tensor_data_bytes, ").");
     }
     auto kind = (is_d2d) ? gpuMemcpyDeviceToDevice : gpuMemcpyDeviceToHost;
-    void *src = reinterpret_cast<void *>(tensor_address(tensor));
+    void *src = tensor_address(tensor);
     if (tensor.strides() == tensor.shape()) {
         GLOG(gpuMemcpyAsync(data, src, bytes, kind, copy_stream_raw));
     } else {
@@ -856,7 +994,7 @@ void Executor::Impl::tensor_write(const Tensor &tensor, const void *data,
     size_t tensor_bytes =
         tensor.strides().nelems() * tensor.data_type().bytes();
     auto kind = (is_d2d) ? gpuMemcpyDeviceToDevice : gpuMemcpyHostToDevice;
-    void *dst = reinterpret_cast<void *>(tensor_address(tensor));
+    void *dst = tensor_address(tensor);
     if (tensor.strides() == tensor.shape()) {
         GLOG(gpuMemcpyAsync(dst, data, tensor_bytes, kind, copy_stream_raw));
     } else {
@@ -885,18 +1023,7 @@ void Executor::Impl::tensor_write(const Tensor &tensor, const void *data,
     GLOG(gpuStreamSynchronize(copy_stream_raw));
 }
 
-Executor::Executor(int device_id, Stream stream, const std::string &name,
-                   const std::string &plan, bool loop_mode)
-    : impl_(std::make_unique<Executor::Impl>(device_id, stream, name,
-                                             loop_mode)) {
-    auto &plan_path = get_env().enforce_plan_path;
-    if (!plan_path.empty()) {
-        LOG(INFO, "Enforce executor plan path: ", plan_path);
-        impl_->init(Json::parse(read_file(plan_path)));
-    } else if (!plan.empty()) {
-        impl_->init(Json::parse(plan));
-    }
-}
+Executor::Executor() : impl_(std::make_unique<Executor::Impl>()) {}
 
 Executor::~Executor() = default;
 
@@ -904,13 +1031,29 @@ int Executor::device_id() const { return impl_->device_id(); }
 
 Stream Executor::stream() const { return impl_->stream(); }
 
+std::shared_ptr<GpuMemory> Executor::buffer() const { return impl_->buffer(); }
+
 std::string Executor::plan() const { return impl_->plan(); }
 
-void Executor::compile() { impl_->compile(); }
+const std::string &Executor::name() const { return impl_->name(); }
 
-void Executor::launch() { impl_->launch(); }
+void Executor::compile(
+    const std::string &plan, int device_id, const std::string &name,
+    const std::unordered_map<Tensor, void *> &external_tensors) {
+    impl_->compile(plan, device_id, name, external_tensors);
+}
+
+void Executor::launch(
+    Stream stream, bool loop_mode,
+    const std::unordered_map<const Tensor, const void *> &placeholder_data) {
+    impl_->launch(stream, loop_mode, placeholder_data);
+}
 
-void Executor::run(int iter) { impl_->run(iter); }
+void Executor::run(
+    int iter,
+    const std::unordered_map<const Tensor, const void *> &placeholder_data) {
+    impl_->run(iter, placeholder_data);
+}
 
 void Executor::wait(int64_t max_spin_count) { impl_->wait(max_spin_count); }
 
@@ -924,7 +1067,7 @@ void Executor::destroy() { impl_.reset(nullptr); }
 
 bool Executor::destroyed() const { return impl_.get() == nullptr; }
 
-uintptr_t Executor::tensor_address(const Tensor &tensor) const {
+void *Executor::tensor_address(const Tensor &tensor) const {
     return impl_->tensor_address(tensor);
 }
 
@@ -942,14 +1085,21 @@ DefaultExecutor::DefaultExecutor(
     const Model &model, int device_id, Stream stream,
     const std::vector<Planner::ConfigRule> &config_rules,
     const std::string &name, bool loop_mode)
-    : Executor((device_id < 0) ? (model.rank() % get_env().num_ranks_per_host)
-                               : device_id,
-               stream, name, "", loop_mode) {
-    Planner planner(model, impl_->device_id());
+    : Executor() {
+    device_id = (device_id < 0) ? (model.rank() % get_env().num_ranks_per_host)
+                                : device_id;
+    Planner planner(model, device_id);
     for (const auto &rule : config_rules) {
         planner.install_config_rule(rule);
     }
-    impl_->init(Json::parse(planner.plan()));
+    compile(planner.plan(), device_id, name);
+    impl_->stream_raw_ = reinterpret_cast<gpuStream>(stream);
+    impl_->loop_mode_ = loop_mode;
+}
+
+void DefaultExecutor::launch() {
+    const Stream stream = reinterpret_cast<Stream>(impl_->stream_raw_);
+    Executor::launch(stream, impl_->loop_mode_);  // values set by the ctor
+}
 
 }  // namespace ark
diff --git a/ark/api/executor_test.cpp b/ark/api/executor_test.cpp
index dad0e9d83..cf3495780 100644
--- a/ark/api/executor_test.cpp
+++ b/ark/api/executor_test.cpp
@@ -20,7 +20,6 @@ ark::unittest::State test_executor() {
         UNITTEST_EQ(executor.device_id(), 0);
         UNITTEST_EQ(executor.stream(), stream);
 
-        executor.compile();
         executor.launch();
         executor.run(1);
         executor.wait();
@@ -31,7 +30,6 @@ ark::unittest::State test_executor() {
     }
     {
         ark::DefaultExecutor executor(empty, 0, stream, {}, "test", LoopMode);
-        executor.compile();
         executor.launch();
         executor.run(1);
         executor.wait();
@@ -46,9 +44,7 @@ ark::unittest::State test_executor() {
     }
     {
         ark::DefaultExecutor executor(empty, 0, stream, {}, "test", LoopMode);
-        UNITTEST_THROW(executor.launch(), ark::InvalidUsageError);
 
-        executor.compile();
         executor.launch();
         executor.launch();  // Will be ignored with a warning.
         executor.run(1);
@@ -86,9 +82,8 @@ ark::unittest::State test_executor_tensor_read_write(ark::Dims shape,
     m.noop(tensor);
 
     ark::DefaultExecutor executor(m, 0);
-    executor.compile();
-    executor.launch();
-    UNITTEST_GT(executor.tensor_address(tensor), 0);
+
+    UNITTEST_NE(executor.tensor_address(tensor), nullptr);
 
     // Copy data from CPU array to ARK tensor
     executor.tensor_write(tensor, host_data.data(),
@@ -107,20 +102,28 @@ ark::unittest::State test_executor_tensor_read_write(ark::Dims shape,
         dev_data[i] = -1;
     }
 
+    ark::gpuStream stream;
     UNITTEST_EQ(
-        ark::gpuMemcpy(dev_data.data(), dev_ptr, shape.nelems() * sizeof(float),
-                       ark::gpuMemcpyDeviceToHost),
+        ark::gpuStreamCreateWithFlags(&stream, ark::gpuStreamNonBlocking),
         ark::gpuSuccess);
+
+    UNITTEST_EQ(ark::gpuMemcpyAsync(dev_data.data(), dev_ptr,
+                                    shape.nelems() * sizeof(float),
+                                    ark::gpuMemcpyDeviceToHost, stream),
+                ark::gpuSuccess);
+    UNITTEST_EQ(ark::gpuStreamSynchronize(stream), ark::gpuSuccess);
+
     for (size_t i = 0; i < dev_data.size(); ++i) {
         UNITTEST_EQ(dev_data[i], static_cast<float>(i));
         dev_data[i] = -1;
     }
 
     // Copy -1s back to GPU array
-    UNITTEST_EQ(
-        ark::gpuMemcpy(dev_ptr, dev_data.data(), shape.nelems() * sizeof(float),
-                       ark::gpuMemcpyHostToDevice),
-        ark::gpuSuccess);
+    UNITTEST_EQ(ark::gpuMemcpyAsync(dev_ptr, dev_data.data(),
+                                    shape.nelems() * sizeof(float),
+                                    ark::gpuMemcpyHostToDevice, stream),
+                ark::gpuSuccess);
+    UNITTEST_EQ(ark::gpuStreamSynchronize(stream), ark::gpuSuccess);
 
     // Copy data from GPU array to ARK tensor
     executor.tensor_write(tensor, dev_ptr, shape.nelems() * sizeof(float),
@@ -136,10 +139,6 @@ ark::unittest::State test_executor_tensor_read_write(ark::Dims shape,
     }
 
     // Provide a stream
-    ark::gpuStream stream;
-    UNITTEST_EQ(
-        ark::gpuStreamCreateWithFlags(&stream, ark::gpuStreamNonBlocking),
-        ark::gpuSuccess);
     executor.tensor_read(tensor, host_data.data(),
                          shape.nelems() * sizeof(float), stream);
     executor.tensor_write(tensor, host_data.data(),
@@ -169,15 +168,19 @@ ark::unittest::State test_executor_tensor_read_write_stride_offset() {
 }
 
 ark::unittest::State test_executor_invalid() {
+    ark::Executor exe;
+
+    // Invalid plan.
+    UNITTEST_THROW(exe.compile("not a json", 0), ark::InvalidUsageError);
+
     // Invalid device ID.
-    UNITTEST_THROW(ark::Executor(-1, nullptr, "test", ""),
+    UNITTEST_THROW(exe.compile(ark::PlanJson().dump(), -1),
                    ark::InvalidUsageError);
 
     // Invalid rank.
     ark::PlanJson plan;
     plan["Rank"] = 1;
-    UNITTEST_THROW(ark::Executor(0, nullptr, "test", plan.dump(), true),
-                   ark::InvalidUsageError);
+    UNITTEST_THROW(exe.compile(plan.dump(), 0), ark::InvalidUsageError);
 
     return ark::unittest::SUCCESS;
 }
diff --git a/ark/api/tensor.cpp b/ark/api/tensor.cpp
index 4b03c3ac8..fc44b4a58 100644
--- a/ark/api/tensor.cpp
+++ b/ark/api/tensor.cpp
@@ -3,6 +3,7 @@
 
 #include "ark/tensor.hpp"
 
+#include "model/model_buffer.hpp"
 #include "model/model_data_type.hpp"
 #include "model/model_tensor.hpp"
 
@@ -50,6 +51,23 @@ const DataType &Tensor::data_type() const {
     return NONE;
 }
 
+Dims Tensor::torch_strides() const {
+    if (ref_) {
+        Dims st = ref_->strides();
+        int ndims = st.ndims();
+        std::vector<DimType> tmp;
+        for (int i = 1; i < ndims; ++i) {
+            tmp.push_back(st[i]);
+        }
+        tmp.push_back(1);
+        for (int i = ndims - 2; i >= 0; --i) {
+            tmp[i] *= tmp[i + 1];
+        }
+        return Dims(tmp);
+    }
+    return Dims();
+}
+
 std::ostream &operator<<(std::ostream &os, const Tensor &tensor) {
     if (tensor.is_null()) {
         os << "null";
diff --git a/ark/codegen.cpp b/ark/codegen.cpp
index 54214277d..4a1c1ed81 100644
--- a/ark/codegen.cpp
+++ b/ark/codegen.cpp
@@ -7,6 +7,7 @@
 
 #include "ark/data_type.hpp"
 #include "env.h"
+#include "external_buffer_registry.hpp"
 #include "file_io.h"
 #include "logging.hpp"
 #include "model/model_buffer.hpp"
@@ -24,7 +25,18 @@ static std::string replace(
         size_t pos = 0;
         while ((pos = result.find(kv.first, pos)) != std::string::npos) {
             result.replace(pos, kv.first.length(), kv.second);
-            pos += kv.second.length();
+            if ((kv.first == "@GLOBAL_ARGS@" || kv.first == "@FUNCTION_ARGS@" ||
+                 kv.first == "@ARG_TYPES@") &&
+                kv.second.empty()) {
+                size_t comma_pos = pos;
+                if (comma_pos >= 2 && result.substr(comma_pos - 2, 2) == ", ") {
+                    result.erase(comma_pos - 2, 2);
+                    pos -= 2;
+                }
+
+            } else {
+                pos += kv.second.length();
+            }
         }
     }
     return result;
@@ -43,6 +55,8 @@ class CodeGenerator::Impl {
    public:
     Impl(const PlanJson &plan,
          const std::map<size_t, size_t> &buffer_id_to_offset,
+         const std::vector<std::string> &external_args,
+         const std::map<size_t, std::string> &buffer_id_to_name,
          const std::string &name);
     ~Impl() = default;
 
@@ -68,6 +82,8 @@ class CodeGenerator::Impl {
     friend class CodeGenerator;
 
     std::map<size_t, size_t> buffer_id_to_offset_;
+    std::vector<std::string> external_args_;
+    std::map<size_t, std::string> buffer_id_to_name_;
     std::string name_;
     int rank_;
     int world_size_;
@@ -76,10 +92,15 @@ class CodeGenerator::Impl {
     std::string code_;
 };
 
-CodeGenerator::Impl::Impl(const PlanJson &plan,
-                          const std::map<size_t, size_t> &buffer_id_to_offset,
-                          const std::string &name)
-    : buffer_id_to_offset_(buffer_id_to_offset), name_(name) {
+CodeGenerator::Impl::Impl(
+    const PlanJson &plan, const std::map<size_t, size_t> &buffer_id_to_offset,
+    const std::vector<std::string> &external_args,
+    const std::map<size_t, std::string> &buffer_id_to_name,
+    const std::string &name)
+    : buffer_id_to_offset_(buffer_id_to_offset),
+      external_args_(external_args),
+      buffer_id_to_name_(buffer_id_to_name),
+      name_(name) {
     rank_ = plan.at("Rank");
     world_size_ = plan.at("WorldSize");
     num_procs_ = plan.at("NumProcessors");
@@ -168,6 +189,30 @@ CodeGenerator::Impl::Impl(const PlanJson &plan,
     if (!is_file(template_path)) {
         ERR(InternalError, "kernel template file not found: ", template_path);
     }
+
+    // Build the parameter, argument, and type lists for the external args
+    std::stringstream global_args_ss, function_args_ss, arg_types_ss;
+    for (const auto &arg : external_args_) {
+        global_args_ss << "void *" << arg << ", ";
+        function_args_ss << arg << ", ";
+        arg_types_ss << "void *, ";
+    }
+    std::string global_args = global_args_ss.str();
+    std::string function_args = function_args_ss.str();
+    std::string arg_types = arg_types_ss.str();
+    if (!global_args.empty()) {
+        global_args.pop_back();
+        global_args.pop_back();
+    }
+    if (!function_args.empty()) {
+        function_args.pop_back();
+        function_args.pop_back();
+    }
+    if (!arg_types.empty()) {
+        arg_types.pop_back();
+        arg_types.pop_back();
+    }
+
     std::string template_code = read_file(template_path);
     std::map<std::string, std::string> replacements = {
         {"@NUM_BLOCKS@", std::to_string(num_procs_)},
@@ -175,6 +220,9 @@ CodeGenerator::Impl::Impl(const PlanJson &plan,
         {"@DEFINITIONS@", definitions_ss.str()},
         {"@BODY@", body_ss.str()},
         {"@NAME@", (name_.empty() ? "" : "_" + name_)},
+        {"@GLOBAL_ARGS@", global_args},
+        {"@FUNCTION_ARGS@", function_args},
+        {"@ARG_TYPES@", arg_types},
     };
     code_ = replace(template_code, replacements);
 }
@@ -214,7 +262,7 @@ std::string CodeGenerator::Impl::def_task(const Json &task_json) {
         ss << this->def_op(op_json, task_json["Id"], op_idx++);
     }
     ss << "__device__ void t" << task_json["Id"]
-       << "(char* _buf, int _idx, int _spw) {\n";
+       << "(char *_buf, int _idx, int _spw, @GLOBAL_ARGS@) {\n";
     op_idx = 0;
     for (auto &op_json : task_json["Ops"]) {
         auto op = ModelOp::deserialize(op_json);
@@ -224,17 +272,32 @@ std::string CodeGenerator::Impl::def_task(const Json &task_json) {
             auto &arg = impl_args[i];
             if (arg.type_name() == "TENSOR") {
                 auto tns = arg.value<ModelTensorRef>();
-                size_t buffer_offset =
-                    buffer_id_to_offset_.at(tns->buffer()->id());
-                size_t offset = buffer_offset + ModelOffset(tns).value();
-                ss << "(" << tns->data_type()->type_str() << "*)&_buf["
-                   << offset << "]";
+                size_t buffer_id = tns->buffer()->id();
+                if (buffer_id_to_name_.find(buffer_id) ==
+                    buffer_id_to_name_.end()) {
+                    size_t buffer_offset = buffer_id_to_offset_.at(buffer_id);
+                    size_t offset = buffer_offset + ModelOffset(tns).value();
+                    ss << "(" << tns->data_type()->type_str() << "*)&_buf["
+                       << offset << "]";
+                } else {
+                    ss << "(" << tns->data_type()->type_str() << "*)"
+                       << buffer_id_to_name_.at(buffer_id);
+                }
             } else if (arg.type_name() == "OFFSET") {
                 auto moff = arg.value<ModelOffset>();
-                size_t buffer_offset =
-                    buffer_id_to_offset_.at(moff.buffer_id());
-                size_t offset = buffer_offset + moff.value();
-                ss << offset;
+                size_t buffer_id = moff.buffer_id();
+                if (buffer_id_to_name_.find(buffer_id) ==
+                    buffer_id_to_name_.end()) {
+                    size_t buffer_offset = buffer_id_to_offset_.at(buffer_id);
+                    size_t offset = buffer_offset + moff.value();
+                    ss << offset;
+                } else {
+                    const std::string &buffer_name =
+                        buffer_id_to_name_.at(buffer_id);
+                    size_t offset = moff.value();
+                    ss << "(uint64_t)((char*)" << buffer_name << " + " << offset
+                       << ")";
+                }
             } else {
                 ss << arg.serialize().begin().value();
             }
@@ -265,7 +328,7 @@ std::string CodeGenerator::Impl::task_seq(
     ss << "task_seq<" << proc_b << ", " << proc_e << ", " << proc_s << ", "
        << proc_cur << ", " << task_b << ", " << task_e << ", " << task_s << ", "
        << task_gran << ", " << num_slots << ", " << slot_num_warps << ", "
-       << slot_sram_bytes << ", t" << task_id << ">(_buf);\n";
+       << slot_sram_bytes << ", t" << task_id << ">(_buf, @FUNCTION_ARGS@);\n";
     return ss.str();
 }
 
@@ -288,10 +351,14 @@ std::string CodeGenerator::Impl::resource_group(
     size_t proc_b = *rg_proc_range.begin();
     size_t proc_e = *rg_proc_range.end();
     size_t proc_s = rg_proc_range.step();
+    std::map<size_t, Json> task_infos_map;
+    for (auto &task_info : task_infos) {
+        task_infos_map[task_info.at("Id").get<size_t>()] = task_info;
+    }
     std::stringstream ss;
     for (auto &tg : rg_json["TaskGroups"]) {
         size_t task_id = tg["TaskId"];
-        auto &task_info = task_infos[task_id];
+        auto &task_info = task_infos_map.at(task_id);
         Range<size_t> task_range(tg["TaskRange"][0], tg["TaskRange"][1]);
         size_t task_gran = tg["Granularity"];
         size_t num_warps_per_task = task_info["NumWarps"];
@@ -305,7 +372,8 @@ std::string CodeGenerator::Impl::resource_group(
             n_slots = total_warps / num_warps_per_task;
         }
         if (n_slots == 0) {
-            ERR(PlanError, "not enough resources for task group");
+            ERR(PlanError, "not enough resources for task group: ",
+                tg.dump());
         }
 
         size_t task_b = *task_range.begin();
@@ -430,8 +498,11 @@ std::string CodeGenerator::Impl::sync_process_range(const Range<size_t> &range,
 
 CodeGenerator::CodeGenerator(
     const PlanJson &plan, const std::map<size_t, size_t> &buffer_id_to_offset,
+    const std::vector<std::string> &external_args,
+    const std::map<size_t, std::string> &buffer_id_to_name,
     const std::string &name)
-    : impl_(std::make_shared<Impl>(plan, buffer_id_to_offset, name)) {}
+    : impl_(std::make_shared<Impl>(plan, buffer_id_to_offset, external_args,
+                                   buffer_id_to_name, name)) {}
 
 std::string CodeGenerator::code() const { return impl_->code_; }
 
diff --git a/ark/codegen.hpp b/ark/codegen.hpp
index 4f8307e7e..89d89080e 100644
--- a/ark/codegen.hpp
+++ b/ark/codegen.hpp
@@ -16,6 +16,8 @@ class CodeGenerator {
    public:
     CodeGenerator(const PlanJson &plan,
                   const std::map<size_t, size_t> &buffer_id_to_offset,
+                  const std::vector<std::string> &external_args,
+                  const std::map<size_t, std::string> &buffer_id_to_name,
                   const std::string &name = "ark_kernel");
 
     ~CodeGenerator() = default;
diff --git a/ark/cpu_timer.cpp b/ark/cpu_timer.cpp
index c740de5f3..129ba7bd2 100644
--- a/ark/cpu_timer.cpp
+++ b/ark/cpu_timer.cpp
@@ -16,20 +16,4 @@ double cpu_timer(void) {
     return (tspec.tv_nsec / 1.0e9) + tspec.tv_sec;
 }
 
-// Sleep in second.
-int cpu_timer_sleep(double sec) {
-    struct timespec tspec;
-    tspec.tv_sec = (time_t)sec;
-    tspec.tv_nsec = (long)((sec - tspec.tv_sec) * 1.0e9);
-    return nanosleep(&tspec, 0);
-}
-
-// Sleep in nanosecond.
-int cpu_ntimer_sleep(long nsec) {
-    struct timespec tspec;
-    tspec.tv_sec = 0;
-    tspec.tv_nsec = nsec;
-    return nanosleep(&tspec, 0);
-}
-
 }  // namespace ark
diff --git a/ark/cpu_timer.h b/ark/cpu_timer.h
index 52bf63d92..eaac94061 100644
--- a/ark/cpu_timer.h
+++ b/ark/cpu_timer.h
@@ -8,10 +8,6 @@ namespace ark {
 
 // Measure current time in second.
 double cpu_timer(void);
-// Sleep in second.
-int cpu_timer_sleep(double sec);
-// Sleep in nanosecond.
-int cpu_ntimer_sleep(long nsec);
 
 }  // namespace ark
 
diff --git a/ark/env.cpp b/ark/env.cpp
index d8322378f..f9e7355ff 100644
--- a/ark/env.cpp
+++ b/ark/env.cpp
@@ -10,11 +10,11 @@
 #define DEFAULT_ARK_LOG_LEVEL "INFO"
 #define DEFAULT_ARK_ROOT "/usr/local/ark"
 #define DEFAULT_ARK_TMP "/tmp/ark"
-#define DEFAULT_ARK_KEEP_TMP true
+#define DEFAULT_ARK_KEEP_TMP false
 #define DEFAULT_ARK_HOSTFILE_NAME "hostfile"
 #define DEFAULT_ARK_NUM_RANKS_PER_HOST 8
 #define DEFAULT_ARK_DISABLE_IB false
-#define DEFAULT_ARK_IGNORE_BINARY_CACHE true
+#define DEFAULT_ARK_IGNORE_BINARY_CACHE false
 #define DEFAULT_ARK_ENFORCE_PLAN_PATH ""
 #define DEFAULT_ARK_MSCCLPP_PORT 50051
 
diff --git a/ark/external_buffer_registry.cpp b/ark/external_buffer_registry.cpp
new file mode 100644
index 000000000..450dd332b
--- /dev/null
+++ b/ark/external_buffer_registry.cpp
@@ -0,0 +1,32 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "external_buffer_registry.hpp"
+
+#include "logging.hpp"
+
+namespace ark {
+
+ExternalBufferRegistry &ExternalBufferRegistry::get_instance() {
+    static ExternalBufferRegistry instance;
+    return instance;
+}
+
+void ExternalBufferRegistry::set(const size_t id, void *data) {
+    if (data == nullptr) {
+        ERR(InternalError, "data is nullptr.");
+    }
+    buffers_[id] = data;
+}
+
+void *ExternalBufferRegistry::get(const size_t id) const {
+    auto it = buffers_.find(id);
+    if (it != buffers_.end()) {
+        return it->second;
+    }
+    return nullptr;
+}
+
+void ExternalBufferRegistry::clear() { buffers_.clear(); }
+
+}  // namespace ark
diff --git a/ark/external_buffer_registry.hpp b/ark/external_buffer_registry.hpp
new file mode 100644
index 000000000..ab199bafc
--- /dev/null
+++ b/ark/external_buffer_registry.hpp
@@ -0,0 +1,31 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef ARK_EXTERNAL_BUFFER_REGISTRY_HPP_
+#define ARK_EXTERNAL_BUFFER_REGISTRY_HPP_
+
+#include <unordered_map>
+
+namespace ark {
+// Manages externally allocated buffers (buffers corresponding to Tensors that
+// are the output of a `placeholder` operation) outside of ARK's memory space.
+class ExternalBufferRegistry {
+   public:
+    static ExternalBufferRegistry &get_instance();
+
+    void set(const size_t id, void *data);
+
+    void *get(const size_t id) const;
+
+    void clear();
+
+   private:
+    // Maps buffer IDs to externally allocated data pointers.
+    std::unordered_map<size_t, void *> buffers_;
+    ExternalBufferRegistry() {}
+    ExternalBufferRegistry(const ExternalBufferRegistry &) = delete;
+    ExternalBufferRegistry &operator=(const ExternalBufferRegistry &) = delete;
+};
+}  // namespace ark
+
+#endif  // ARK_EXTERNAL_BUFFER_REGISTRY_HPP_
diff --git a/ark/gpu/gpu.hpp b/ark/gpu/gpu.hpp
index 531d6c7ee..8ff3b2843 100644
--- a/ark/gpu/gpu.hpp
+++ b/ark/gpu/gpu.hpp
@@ -53,6 +53,8 @@ ARK_GPU_DEFINE_TYPE_ALIAS(gpuModule, CUmodule, hipModule_t);
 ARK_GPU_DEFINE_TYPE_ALIAS(gpuFunction, CUfunction, hipFunction_t);
 ARK_GPU_DEFINE_TYPE_ALIAS(gpuFunctionAttribute, CUfunction_attribute,
                           hipFunction_attribute);
+ARK_GPU_DEFINE_TYPE_ALIAS(gpuPointerAttributes, cudaPointerAttributes,
+                          hipPointerAttributes);
 
 // runtime API
 ARK_GPU_DEFINE_CONSTANT_ALIAS(gpuSuccess, cudaSuccess, hipSuccess);
@@ -126,6 +128,8 @@ ARK_GPU_DEFINE_CONSTANT_ALIAS(gpuPointerAttributeSyncMemops,
 ARK_GPU_DEFINE_FUNC_ALIAS(gpuGetErrorString, cudaGetErrorString,
                           hipGetErrorString);
 ARK_GPU_DEFINE_FUNC_ALIAS(gpuGetLastError, cudaGetLastError, hipGetLastError);
+ARK_GPU_DEFINE_FUNC_ALIAS(gpuPointerGetAttributes, cudaPointerGetAttributes,
+                          hipPointerGetAttributes);
 ARK_GPU_DEFINE_FUNC_ALIAS(gpuDeviceGetAttribute, cudaDeviceGetAttribute,
                           hipDeviceGetAttribute);
 ARK_GPU_DEFINE_FUNC_ALIAS(gpuDeviceSynchronize, cudaDeviceSynchronize,
diff --git a/ark/gpu/gpu_kernel.cpp b/ark/gpu/gpu_kernel.cpp
index d4412f80e..a474b32a7 100644
--- a/ark/gpu/gpu_kernel.cpp
+++ b/ark/gpu/gpu_kernel.cpp
@@ -15,24 +15,18 @@ namespace ark {
 
 GpuKernel::GpuKernel(int gpu_id, const std::string& code,
                      const std::array<int, 3>& block_dim,
-                     const std::array<int, 3>& grid_dim, size_t smem_bytes,
-                     const std::string& kernel_name) {
-    this->init(gpu_id, code, block_dim, grid_dim, smem_bytes, kernel_name);
+                     const std::array<int, 3>& grid_dim, size_t smem_bytes) {
+    this->init(gpu_id, code, block_dim, grid_dim, smem_bytes);
 }
 
 void GpuKernel::init(int gpu_id, const std::string& code,
                      const std::array<int, 3>& block_dim,
-                     const std::array<int, 3>& grid_dim, size_t smem_bytes,
-                     const std::string& kernel_name) {
+                     const std::array<int, 3>& grid_dim, size_t smem_bytes) {
     gpu_manager_ = GpuManager::get_instance(gpu_id);
     code_ = code;
     block_dim_ = block_dim;
     grid_dim_ = grid_dim;
     smem_bytes_ = smem_bytes;
-    kernel_name_ = kernel_name;
-    if (kernel_name_.size() == 0) {
-        ERR(InvalidUsageError, "Invalid kernel name: ", kernel_name_);
-    }
 }
 
 void GpuKernel::compile() {
@@ -45,21 +39,30 @@ void GpuKernel::compile() {
     }
     bin_ = gpu_compile({code_}, gpu_manager_->info().arch, max_reg_cnt);
     GLOG_DRV(gpuModuleLoadData(&module_, bin_.c_str()));
-    GLOG_DRV(gpuModuleGetFunction(&function_, module_, kernel_name_.c_str()));
-
-    int static_smem_size_bytes;
-    GLOG_DRV(gpuFuncGetAttribute(&static_smem_size_bytes,
-                                 gpuFuncAttributeSharedSizeBytes, function_));
-    int dynamic_smem_size_bytes = smem_bytes_ - static_smem_size_bytes;
-    GLOG_DRV(gpuFuncSetAttribute(function_,
-                                 gpuFuncAttributeMaxDynamicSharedSizeBytes,
-                                 dynamic_smem_size_bytes));
 }
 
-void GpuKernel::launch(gpuStream stream, std::vector<void*>& args) {
+void GpuKernel::launch(const std::string& kernel_name, gpuStream stream,
+                       std::vector<void*>& args) {
     if (!this->is_compiled()) {
         ERR(InvalidUsageError, "Kernel is not compiled yet.");
     }
+    if (kernel_name.size() == 0) {
+        ERR(InvalidUsageError, "Invalid kernel name: ", kernel_name);
+    }
+    if (kernel_name_ != kernel_name) {
+        GLOG_DRV(
+            gpuModuleGetFunction(&function_, module_, kernel_name.c_str()));
+
+        int static_smem_size_bytes;
+        GLOG_DRV(gpuFuncGetAttribute(&static_smem_size_bytes,
+                                     gpuFuncAttributeSharedSizeBytes,
+                                     function_));
+        int dynamic_smem_size_bytes = smem_bytes_ - static_smem_size_bytes;
+        GLOG_DRV(gpuFuncSetAttribute(function_,
+                                     gpuFuncAttributeMaxDynamicSharedSizeBytes,
+                                     dynamic_smem_size_bytes));
+        kernel_name_ = kernel_name;
+    }
     gpu_manager_->launch(function_, grid_dim_, block_dim_, smem_bytes_, stream,
                          args.data(), nullptr);
     GLOG(gpuGetLastError());
diff --git a/ark/gpu/gpu_kernel.hpp b/ark/gpu/gpu_kernel.hpp
index 5308cfead..1e02cc7a1 100644
--- a/ark/gpu/gpu_kernel.hpp
+++ b/ark/gpu/gpu_kernel.hpp
@@ -18,19 +18,18 @@ class GpuKernel {
    public:
     GpuKernel(int gpu_id, const std::string& codes,
               const std::array<int, 3>& block_dim,
-              const std::array<int, 3>& grid_dim, size_t smem_bytes,
-              const std::string& kernel_name);
+              const std::array<int, 3>& grid_dim, size_t smem_bytes);
 
     void init(int gpu_id, const std::string& codes,
               const std::array<int, 3>& block_dim,
-              const std::array<int, 3>& grid_dim, size_t smem_bytes,
-              const std::string& kernel_name);
+              const std::array<int, 3>& grid_dim, size_t smem_bytes);
     void compile();
-    void launch(gpuStream stream, std::vector<void*>& args);
+    void launch(const std::string& kernel_name, gpuStream stream,
+                std::vector<void*>& args);
 
     gpuDeviceptr get_global(const std::string& name,
                             bool ignore_not_found = false) const;
-    bool is_compiled() const { return function_ != nullptr; }
+    bool is_compiled() const { return !bin_.empty(); }
 
    protected:
     std::shared_ptr<GpuManager> gpu_manager_;
diff --git a/ark/gpu/gpu_kernel_test.cpp b/ark/gpu/gpu_kernel_test.cpp
index 342ef9656..7b9f7f176 100644
--- a/ark/gpu/gpu_kernel_test.cpp
+++ b/ark/gpu/gpu_kernel_test.cpp
@@ -8,13 +8,14 @@
 const std::string void_kernel = "extern \"C\" __global__ void kernel() {}";
 
 ark::unittest::State test_gpu_kernel() {
-    ark::GpuKernel kernel(0, void_kernel, {1, 1, 1}, {1, 1, 1}, 0, "kernel");
+    ark::GpuKernel kernel(0, void_kernel, {1, 1, 1}, {1, 1, 1}, 0);
     UNITTEST_TRUE(!kernel.is_compiled());
     kernel.compile();
     UNITTEST_TRUE(kernel.is_compiled());
     std::vector<void*> args;
+    UNITTEST_THROW(kernel.launch("", nullptr, args), ark::InvalidUsageError);
     for (int i = 0; i < 10; i++) {
-        kernel.launch(nullptr, args);
+        kernel.launch("kernel", nullptr, args);
     }
     return ark::unittest::SUCCESS;
 }
diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp
index 14ca87618..97597be14 100644
--- a/ark/include/ark/executor.hpp
+++ b/ark/include/ark/executor.hpp
@@ -9,18 +9,20 @@
 #include <ark/tensor.hpp>
 #include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 namespace ark {
 
 using Stream = void *;
 
+class GpuMemory;
+
 /// Convenience class for executing a model.
 class Executor {
    public:
     /// Constructor.
-    Executor(int device_id, Stream stream, const std::string &name,
-             const std::string &plan, bool loop_mode = true);
+    Executor();
 
     /// Destructor.
     ~Executor();
@@ -31,23 +33,32 @@ class Executor {
     /// Return the stream of the executor.
     Stream stream() const;
 
+    /// Return the buffer of the executor.
+    std::shared_ptr<GpuMemory> buffer() const;
+
     /// Return the plan string.
     std::string plan() const;
 
+    const std::string &name() const;
+
     /// Compile the model. This must be called before `launch()`.
-    void compile();
+    void compile(const std::string &plan, int device_id,
+                 const std::string &name = "executor",
+                 const std::unordered_map<Tensor, void *> &external_tensors = {});
 
-    /// Launch the model (not running yet). This must be called after
-    /// `compile()`.
-    void launch();
+    /// Launch the executor. This must be called after `compile()`.
+    void launch(Stream stream = nullptr, bool loop_mode = true,
+                const std::unordered_map<const Tensor, const void *>
+                    &placeholder_data = {});
 
-    /// Run the model for `iter` iterations.
-    void run(int iter);
+    /// Run the executor for `iter` iterations.
+    void run(int iter, const std::unordered_map<const Tensor, const void *>
+                           &placeholder_data = {});
 
     /// Wait for the previous run to finish.
     void wait(int64_t max_spin_count = -1);
 
-    /// Stop the model and return the elapsed time in milliseconds.
+    /// Stop the executor and return the elapsed time in milliseconds.
     /// Once this is called, we need to call `launch()` again to run the model
     /// again.
     float stop(int64_t max_spin_count = -1);
@@ -62,7 +73,7 @@ class Executor {
     bool destroyed() const;
 
     /// Return the raw virtual address of the tensor.
-    uintptr_t tensor_address(const Tensor &tensor) const;
+    void *tensor_address(const Tensor &tensor) const;
 
     template <typename T>
     void tensor_read(const Tensor &tensor, std::vector<T> &data,
@@ -93,10 +104,15 @@ class Model;
 
 class DefaultExecutor : public Executor {
    public:
-    DefaultExecutor(
-        const Model &model, int device_id = -1, Stream stream = nullptr,
-        const std::vector<Planner::ConfigRule> &config_rules = {},
-        const std::string &name = "DefaultExecutor", bool loop_mode = true);
+    DefaultExecutor(const Model &model, int device_id = -1,
+                    Stream stream = nullptr,
+                    const std::vector<Planner::ConfigRule> &config_rules = {},
+                    const std::string &name = "DefaultExecutor",
+                    bool loop_mode = true);
+
+    /// Launch the default executor.
+    void launch(const std::unordered_map<const Tensor, const void *>
+                    &placeholder_data = {});
 };
 
 }  // namespace ark
diff --git a/ark/include/ark/model.hpp b/ark/include/ark/model.hpp
index 3c4f22e22..e1b1f462b 100644
--- a/ark/include/ark/model.hpp
+++ b/ark/include/ark/model.hpp
@@ -76,6 +76,37 @@ class Model : public ModelGraph {
                   const Dims &padded_shape = {}, int rank = -1,
                   const std::string &name = "");
 
+    ///
+    /// Returns a tensor object associated with an external buffer.
+    ///
+    /// @param shape Shape of the tensor, where the data of interest is.
+    /// @param data_type Type of the tensor data.
+    /// @param strides Strides of each dimension of the tensor, which may be
+    /// different from the shape. @p strides can be considered as the actual
+    /// shape of the underlying data buffer.
+    /// @param offsets Offsets of the tensor. The data of interest starts at
+    /// @p offsets and ends at @p offsets + @p padded_shape.
+    /// @param padded_shape Padded shape of the tensor. Padding is used to
+    /// reserve extra space for the tensor when computation requires it.
+    /// Data on the padded region is allowed to be accessed by computation,
+    /// but it is not considered as the data of interest. The padded region is
+    /// initialized to zero only once when the Executor is launched. The padded
+    /// shape should be greater than or equal to the @p shape, and the
+    /// @p strides should be greater than or equal to the padded shape. If the
+    /// @p strides are not provided, they are set to the padded shape. If the
+    /// padded shape is not provided, it is set to the @p shape.
+    /// @param rank Rank of the tensor. -1 means the rank of this model.
+    /// @param data Address of data to pass through placeholder. If provided,
+    /// this buffer is registered with the ExternalBufferRegistry and associated
+    /// with the tensor.
+    /// @param name Name of the tensor.
+    /// @return Tensor object that references the external buffer.
+    ///
+    Tensor placeholder(const Dims &shape, const DataType &data_type,
+                       const Dims &strides = {}, const Dims &offsets = {},
+                       const Dims &padded_shape = {}, int rank = -1,
+                       void *data = nullptr, const std::string &name = "");
+
     Tensor refer(Tensor input, const Dims &shape = {}, const Dims &strides = {},
                  const Dims &offsets = {}, const Dims &padded_shape = {},
                  const std::string &name = "");
@@ -254,7 +285,6 @@ class Model : public ModelGraph {
 
     Tensor local_all_reduce(Tensor input, int gpu_id, int gpu_num,
                             const std::string &name = "");
-
 };
 
 }  // namespace ark
diff --git a/ark/include/ark/tensor.hpp b/ark/include/ark/tensor.hpp
index 747ce5fea..c2d9dbe94 100644
--- a/ark/include/ark/tensor.hpp
+++ b/ark/include/ark/tensor.hpp
@@ -50,6 +50,8 @@ class Tensor {
     Dims padded_shape() const;
 
     const DataType &data_type() const;
+
+    Dims torch_strides() const;
 };
 
 const Tensor NullTensor;
@@ -58,4 +60,13 @@ std::ostream &operator<<(std::ostream &os, const Tensor &tensor);
 
 }  // namespace ark
 
+namespace std {
+template <>
+struct hash<ark::Tensor> {
+    size_t operator()(const ark::Tensor &t) const noexcept {
+        return t.id();
+    }
+};
+}  // namespace std
+
 #endif  // ARK_TENSOR_HPP
diff --git a/ark/include/kernels/kernel_template.in b/ark/include/kernels/kernel_template.in
index a8a56f141..a05e143d3 100644
--- a/ark/include/kernels/kernel_template.in
+++ b/ark/include/kernels/kernel_template.in
@@ -6,8 +6,8 @@ using namespace ark;
 template <size_t ProcBegin, size_t ProcEnd, size_t ProcStep, size_t ProcCurrent,
           size_t TaskBegin, size_t TaskEnd, size_t TaskStep, size_t TaskGranularity,
           size_t NumSlots, size_t SlotNumWarps, size_t SlotSramBytes,
-          void (*task)(char*, int, int)>
-__forceinline__ __device__ void task_seq(char *_buf) {
+          void (*task)(char*, int, int, @ARG_TYPES@)>
+__forceinline__ __device__ void task_seq(char *_buf, @GLOBAL_ARGS@) {
   if (math::geq<ProcBegin>(blockIdx.x) && math::le<ProcEnd>(blockIdx.x) &&
       ((blockIdx.x - ProcBegin) % ProcStep == 0)) {
     constexpr size_t SlotNumThreads = SlotNumWarps * Arch::ThreadsPerWarp;
@@ -23,7 +23,7 @@ __forceinline__ __device__ void task_seq(char *_buf) {
       size_t task_id = task_id_base + TaskStep *
         (t % TaskGranularity + t / TaskGranularity * TaskGranularity * NumProcs);
       if (task_id >= TaskEnd) break;
-      task(_buf, task_id, SramBytesPerWarp);
+      task(_buf, task_id, SramBytesPerWarp, @FUNCTION_ARGS@);
     }
   }
 }
@@ -33,12 +33,12 @@ __device__ sync::State ARK_LOOP_SYNC_STATE;
 
 @DEFINITIONS@
 
-__device__ void ark_body(char *_buf, int _iter) {
+__device__ void ark_body(char *_buf, int _iter, @GLOBAL_ARGS@) {
 @BODY@
 }
 
 extern "C" __global__ __launch_bounds__(ARK_WARPS_PER_BLOCK * Arch::ThreadsPerWarp, 1)
-void ark_loop_kernel@NAME@(char *_buf, int *_iter) {
+void ark_loop_kernel@NAME@(char *_buf, int *_iter, @GLOBAL_ARGS@) {
   int *shared_mem = (int *)_ARK_SMEM;
   for (int i = threadIdx.x; i < ARK_SMEM_RESERVED_BYTES / sizeof(int); i += blockDim.x) {
     shared_mem[i] = 0;
@@ -52,10 +52,10 @@ void ark_loop_kernel@NAME@(char *_buf, int *_iter) {
     sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE);
     if (ARK_ITER < 0) return;
 
-    ark_body(_buf, 0);
+    ark_body(_buf, 0, @FUNCTION_ARGS@);
     for (int _i = 1; _i < ARK_ITER; ++_i) {
       sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE);
-      ark_body(_buf, _i);
+      ark_body(_buf, _i, @FUNCTION_ARGS@);
     }
     if (threadIdx.x == 0) {
       __threadfence_system();
@@ -69,10 +69,10 @@ void ark_loop_kernel@NAME@(char *_buf, int *_iter) {
 }
 
 extern "C" __global__ __launch_bounds__(ARK_WARPS_PER_BLOCK * Arch::ThreadsPerWarp, 1)
-void ark_kernel@NAME@(char *_buf, int _iter) {
+void ark_kernel@NAME@(char *_buf, int _iter, @GLOBAL_ARGS@) {
   int *shared_mem = (int *)_ARK_SMEM;
   for (int i = threadIdx.x; i < ARK_SMEM_RESERVED_BYTES / sizeof(int); i += blockDim.x) {
     shared_mem[i] = 0;
   }
-  ark_body(_buf, _iter);
+  ark_body(_buf, _iter, @FUNCTION_ARGS@);
 }
diff --git a/ark/model/model_buffer.cpp b/ark/model/model_buffer.cpp
index e637307fd..5e2409537 100644
--- a/ark/model/model_buffer.cpp
+++ b/ark/model/model_buffer.cpp
@@ -3,19 +3,22 @@
 
 #include "model_buffer.hpp"
 
+#include "external_buffer_registry.hpp"
 #include "logging.hpp"
 
 namespace ark {
 
-ModelBuffer::ModelBuffer(int rank) : rank_(rank) {
-    static size_t id = 0;
-    id_ = id++;
+size_t ModelBuffer::curr_id = 0;
+
+ModelBuffer::ModelBuffer(int rank, bool is_external)
+    : rank_(rank), is_external_(is_external) {
+    id_ = curr_id++;
 }
 
-ModelBuffer::ModelBuffer(size_t id, int rank,
+ModelBuffer::ModelBuffer(size_t id, int rank, bool is_external,
                          const std::vector<TagInfo> &send_tags,
                          const std::vector<TagInfo> &recv_tags)
-    : id_(id), rank_(rank) {
+    : id_(id), rank_(rank), is_external_(is_external) {
     for (const auto &info : send_tags) {
         send_tags_.insert(info);
     }
@@ -44,6 +47,7 @@ Json ModelBuffer::serialize() const {
     for (const auto &info : recv_tags_) {
         recv_tags.push_back({info.first, info.second});
     }
+    j["IsExternal"] = is_external_;
     j["SendTags"] = send_tags;
     j["RecvTags"] = recv_tags;
     return j;
@@ -57,11 +61,15 @@ std::shared_ptr<ModelBuffer> ModelBuffer::deserialize(const Json &serialized) {
     } else if (!serialized.contains("SendTags")) {
         ERR(ModelError, "ModelBuffer deserialization failed: missing SendTags");
     } else if (!serialized.contains("RecvTags")) {
-        ERR(ModelError, "ModelBuffer deserialization failed: missing RecvTags");
+        ERR(ModelError,
+            "ModelBuffer deserialization failed: missing RecvTags");
+    } else if (!serialized.contains("IsExternal")) {
+        ERR(ModelError,
+            "ModelBuffer deserialization failed: missing IsExternal");
     }
-    return std::make_shared<ModelBuffer>(serialized["Id"], serialized["Rank"],
-                                         serialized["SendTags"],
-                                         serialized["RecvTags"]);
+    return std::make_shared<ModelBuffer>(
+        serialized["Id"], serialized["Rank"], serialized["IsExternal"],
+        serialized["SendTags"], serialized["RecvTags"]);
 }
 
 }  // namespace ark
diff --git a/ark/model/model_buffer.hpp b/ark/model/model_buffer.hpp
index 7ad3db206..8b66356b1 100644
--- a/ark/model/model_buffer.hpp
+++ b/ark/model/model_buffer.hpp
@@ -17,15 +17,18 @@ class ModelBuffer {
     // (remote_rank, tag)
     using TagInfo = std::pair<int, int>;
 
-    ModelBuffer(int rank = -1);
+    ModelBuffer(int rank = -1, bool is_external = false);
 
-    ModelBuffer(size_t id, int rank, const std::vector<TagInfo> &send_tags,
+    ModelBuffer(size_t id, int rank, bool is_external,
+                const std::vector<TagInfo> &send_tags,
                 const std::vector<TagInfo> &recv_tags);
 
     size_t id() const { return id_; }
 
     int rank() const { return rank_; }
 
+    bool is_external() const { return is_external_; }
+
     const std::set<TagInfo> &send_tags() const { return send_tags_; }
 
     const std::set<TagInfo> &recv_tags() const { return recv_tags_; }
@@ -45,8 +48,10 @@ class ModelBuffer {
     static std::shared_ptr<ModelBuffer> deserialize(const Json &serialized);
 
    private:
+    static size_t curr_id;
     size_t id_;
     int rank_;
+    bool is_external_;
     std::set<TagInfo> send_tags_;
     std::set<TagInfo> recv_tags_;
 };
diff --git a/ark/model/model_json.cpp b/ark/model/model_json.cpp
index c2099e2c9..dad62cb4e 100644
--- a/ark/model/model_json.cpp
+++ b/ark/model/model_json.cpp
@@ -5,6 +5,7 @@
 
 #include <sstream>
 
+#include "ark/dims.hpp"
 #include "logging.hpp"
 
 static std::stringstream &idnt(std::stringstream &ss, int indent) {
@@ -26,14 +27,46 @@ static void verify_format_json(const std::string &name, const Json &json,
                                const std::vector<std::string> &array_fields) {
     for (const auto &field : required_fields) {
         if (!json.contains(field)) {
-            ERR(ErrorType,
-                name + ": " + field + " not found. Given: " + json.dump());
+            ERR(ErrorType, name, ": ", field,
+                " not found. Given: ", json.dump());
         }
     }
     for (const auto &field : array_fields) {
         if (!json.at(field).is_array()) {
-            ERR(ErrorType, name + ": " + field +
-                               " is not an array. Given: " + json.dump());
+            ERR(ErrorType, name, ": ", field,
+                " is not an array. Given: ", json.dump());
+        }
+    }
+}
+
+// For each field in `dims_fields`: require a JSON array convertible to
+// DimType values with no negatives (and no zeros if ZeroNotAllowed).
+template <typename ErrorType, bool ZeroNotAllowed>
+static void verify_format_dims(const std::string &name, const Json &json,
+                               const std::vector<std::string> &dims_fields) {
+    for (const auto &field : dims_fields) {
+        if (!json.at(field).is_array()) {
+            ERR(ErrorType, name, ": ", field,
+                " is not an array. Given: ", json.dump());
+        }
+        std::vector<DimType> dims;
+        try {
+            dims = json.at(field).get<std::vector<DimType>>();
+        } catch (const std::exception &) {
+            ERR(ErrorType, name, ": ", field,
+                " is not an array of integers. Given: ", json.dump());
+        }
+        for (const auto &dim : dims) {
+            if (dim < 0) {
+                ERR(ErrorType, name, ": ", field,
+                    " contains negative value. Given: ", json.dump());
+            }
+        }
+        for (const auto &dim : dims) {
+            if (ZeroNotAllowed && dim == 0) {
+                ERR(ErrorType, name, ": ", field,
+                    " contains zero value. Given: ", json.dump());
+            }
         }
     }
 }
@@ -52,10 +85,15 @@ static void verify_format_tensor(const Json &json) {
     const std::vector<std::string> required_fields = {
         "Id",      "DataType",    "Shape", "Strides",
         "Offsets", "PaddedShape", "Buffer"};
-    const std::vector<std::string> array_fields = {"Shape", "Strides",
-                                                   "Offsets", "PaddedShape"};
-    verify_format_json<ErrorType>("TensorJson", json, required_fields,
-                                  array_fields);
+    // Field presence is checked first; the dims checks below also verify that
+    // each field is an array. "Offsets" may contain zeros, while "Shape",
+    // "Strides", and "PaddedShape" are rejected if any entry is zero.
+    const std::vector<std::string> zero_ok_fields = {"Offsets"};
+    const std::vector<std::string> nonzero_fields = {"Shape", "Strides",
+                                                     "PaddedShape"};
+    verify_format_json<ErrorType>("TensorJson", json, required_fields, {});
+    verify_format_dims<ErrorType, false>("TensorJson", json, zero_ok_fields);
+    verify_format_dims<ErrorType, true>("TensorJson", json, nonzero_fields);
     verify_format_buffer<ErrorType>(json.at("Buffer"));
 }
 
diff --git a/ark/model/model_op.cpp b/ark/model/model_op.cpp
index 5db8576e8..8f222b75d 100644
--- a/ark/model/model_op.cpp
+++ b/ark/model/model_op.cpp
@@ -16,6 +16,7 @@
 #include "ops/ops_math.hpp"
 #include "ops/ops_matmul.hpp"
 #include "ops/ops_noop.hpp"
+#include "ops/ops_placeholder.hpp"
 #include "ops/ops_reduce.hpp"
 #include "ops/ops_refer.hpp"
 #include "ops/ops_reshape.hpp"
@@ -78,6 +79,7 @@ const ModelOpType ModelOpT::from_name(const std::string &type_name) {
         MODEL_OP_TYPE_REGISTER(Sqrt);
         MODEL_OP_TYPE_REGISTER(Sub);
         MODEL_OP_TYPE_REGISTER(Tensor);
+        MODEL_OP_TYPE_REGISTER(Placeholder);
         MODEL_OP_TYPE_REGISTER(Transpose);
         MODEL_OP_TYPE_REGISTER(SendPacket);
         MODEL_OP_TYPE_REGISTER(RecvPacket);
diff --git a/ark/ops/ops_communication_test.cpp b/ark/ops/ops_communication_test.cpp
index 8cdad41b2..39c466909 100644
--- a/ark/ops/ops_communication_test.cpp
+++ b/ark/ops/ops_communication_test.cpp
@@ -25,7 +25,6 @@ ark::unittest::State test_communication_send_recv_unidir() {
             }
 
             ark::DefaultExecutor exe(model, gpu_id);
-            exe.compile();
 
             if (gpu_id == 0) {
                 std::vector<ark::half_t> data(1024);
@@ -68,7 +67,6 @@ ark::unittest::State test_communication_send_recv_unidir() {
             }
 
             ark::DefaultExecutor exe(model, gpu_id);
-            exe.compile();
 
             if (gpu_id == 1) {
                 std::vector<ark::half_t> data(1024);
@@ -117,7 +115,6 @@ ark::unittest::State test_communication_send_recv_bidir() {
             tns2 = model.recv(tns2_data, remote_gpu_id, tag);
 
             ark::DefaultExecutor exe(model, gpu_id);
-            exe.compile();
 
             std::vector<ark::half_t> data(1024);
             std::iota(data.begin(), data.end(), ark::half_t(gpu_id + 1));
@@ -161,7 +158,6 @@ ark::unittest::State test_communication_send_recv_bidir() {
             ark::Tensor sum = model.add(tns2, tns_data);
 
             ark::DefaultExecutor exe(model, gpu_id);
-            exe.compile();
 
             std::vector<ark::half_t> data(1024);
             std::iota(data.begin(), data.end(), ark::half_t(gpu_id + 1));
@@ -232,7 +228,6 @@ ark::unittest::State test_communication_send_recv_bidir_sm() {
             tns2 = model.recv(tns2_data, remote_gpu_id, tag);
 
             ark::DefaultExecutor exe(model, gpu_id, nullptr, {config_rule});
-            exe.compile();
 
             std::vector<ark::half_t> data(1024);
             std::iota(data.begin(), data.end(), ark::half_t(gpu_id + 1));
@@ -276,7 +271,6 @@ ark::unittest::State test_communication_send_recv_bidir_sm() {
             ark::Tensor sum = model.add(tns2, tns_data);
 
             ark::DefaultExecutor exe(model, gpu_id, nullptr, {config_rule});
-            exe.compile();
 
             std::vector<ark::half_t> data(1024);
             std::iota(data.begin(), data.end(), ark::half_t(gpu_id + 1));
@@ -319,7 +313,6 @@ ark::unittest::State test_communication_send_packet() {
             }
 
             ark::DefaultExecutor exe(model, gpu_id);
-            exe.compile();
 
             if (gpu_id == 0) {
                 std::vector<ark::half_t> data(1024);
@@ -362,7 +355,6 @@ ark::unittest::State test_communication_send_recv_reduce_packet() {
             model.recv_packet(shard_tensors[peer_gpu_id], peer_gpu_id, 1, 1);
 
             ark::DefaultExecutor exe(model, gpu_id);
-            exe.compile();
 
             std::vector<ark::half_t> data(1024);
             std::iota(data.begin(), data.end(), 1.0f);
@@ -433,8 +425,8 @@ ark::unittest::State test_communication_send_recv_reduce() {
 
             ark::Planner planner(model, gpu_id);
             planner.install_config_rule(config_rule);
-            ark::Executor exe(gpu_id, nullptr, "Executor", planner.plan());
-            exe.compile();
+            ark::Executor exe;
+            exe.compile(planner.plan(), gpu_id);
 
             std::vector<ark::half_t> data(1024);
             std::iota(data.begin(), data.end(), 1.0f);
diff --git a/ark/ops/ops_identity_test.cpp b/ark/ops/ops_identity_test.cpp
index a6e49c9c0..eb8d3f4d4 100644
--- a/ark/ops/ops_identity_test.cpp
+++ b/ark/ops/ops_identity_test.cpp
@@ -58,7 +58,6 @@ ark::unittest::State test_ops_identity() {
 
     // Create an executor
     ark::DefaultExecutor exe(model);
-    exe.compile();
 
     int num_elem = 2 * 3 * 4 * 5;
 
diff --git a/ark/ops/ops_placeholder.cpp b/ark/ops/ops_placeholder.cpp
new file mode 100644
index 000000000..73c1c1b25
--- /dev/null
+++ b/ark/ops/ops_placeholder.cpp
@@ -0,0 +1,49 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "ops_placeholder.hpp"
+
+#include "external_buffer_registry.hpp"
+#include "logging.hpp"
+#include "ops_common.hpp"
+
+namespace ark {
+
+// Builds a Placeholder op with one result tensor over `buffer`; `data` is
+// recorded in ExternalBufferRegistry under the buffer's ID.
+ModelOpPlaceholder::ModelOpPlaceholder(ModelBufferRef buffer, const Dims &shape,
+                                       ModelDataType data_type,
+                                       const Dims &strides, const Dims &offsets,
+                                       const Dims &padded_shape, void *data)
+    : ModelOp("Placeholder", true) {
+    // No buffer supplied: create one flagged as external (rank -1).
+    if (!buffer) {
+        buffer = std::make_shared<ModelBuffer>(-1, true);
+    }
+    ExternalBufferRegistry::get_instance().set(buffer->id(), data);
+
+    ModelTensorRef tensor = std::make_shared<ModelTensor>(
+        data_type, buffer, shape, strides, offsets, padded_shape);
+    result_tensors_.emplace_back(tensor);
+    verify();
+}
+
+// Adds a Placeholder op whose result tensor uses a buffer flagged external.
+// A `rank` equal to this model's rank is normalized to -1 before use.
+Tensor Model::placeholder(const Dims &shape, const DataType &data_type,
+                          const Dims &strides, const Dims &offsets,
+                          const Dims &padded_shape, int rank, void *data,
+                          const std::string &name) {
+    if (rank == this->rank()) {
+        rank = -1;
+    } else if (rank != -1 && (rank < 0 || rank >= this->world_size())) {
+        ERR(ModelError, "Invalid rank ", rank);
+    }
+    auto buffer = std::make_shared<ModelBuffer>(rank, true);
+    return impl_
+        ->create_op<ModelOpPlaceholder>(name, buffer, shape, data_type.ref(),
+                                        strides, offsets, padded_shape, data)
+        ->result_tensors()[0];
+}
+
+}  // namespace ark
diff --git a/ark/ops/ops_placeholder.hpp b/ark/ops/ops_placeholder.hpp
new file mode 100644
index 000000000..91dd874ae
--- /dev/null
+++ b/ark/ops/ops_placeholder.hpp
@@ -0,0 +1,23 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef ARK_OPS_PLACEHOLDER_HPP_
+#define ARK_OPS_PLACEHOLDER_HPP_
+
+#include "ark/model.hpp"
+#include "model/model_op.hpp"
+
+namespace ark {
+
+// Op yielding one tensor; `data` is registered for its buffer (may be null).
+class ModelOpPlaceholder : public ModelOp {
+   public:
+    ModelOpPlaceholder() = default;
+    ModelOpPlaceholder(ModelBufferRef buffer, const Dims &shape,
+                       ModelDataType data_type, const Dims &strides,
+                       const Dims &offsets, const Dims &padded_shape,
+                       void *data = nullptr);
+};
+}  // namespace ark
+
+#endif  // ARK_OPS_PLACEHOLDER_HPP_
diff --git a/ark/ops/ops_placeholder_test.cpp b/ark/ops/ops_placeholder_test.cpp
new file mode 100644
index 000000000..dbbc0c90f
--- /dev/null
+++ b/ark/ops/ops_placeholder_test.cpp
@@ -0,0 +1,60 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "ark/executor.hpp"
+#include "gpu/gpu.hpp"
+#include "logging.hpp"
+#include "model/model_op.hpp"
+#include "ops_test_common.hpp"
+
+// Binds a test-allocated GPU buffer to a placeholder tensor, runs add(tns, 1),
+// and checks both the tensor's address and the computed values.
+ark::unittest::State test_ops_placeholder() {
+    ark::Model model;
+    ark::Dims shape{10, 1};
+
+    // Allocate GPU memory for the external buffer
+    float *d_ext_buffer = nullptr;
+    UNITTEST_EQ(ark::gpuMalloc(&d_ext_buffer, shape.nelems() * sizeof(float)),
+                ark::gpuSuccess);
+
+    // Initialize GPU memory with 1.0, 2.0, ..., one value per element
+    std::vector<float> h_ext_buffer(shape.nelems());
+    std::iota(h_ext_buffer.begin(), h_ext_buffer.end(), 1.0f);
+    UNITTEST_EQ(ark::gpuMemcpy(d_ext_buffer, h_ext_buffer.data(),
+                               shape.nelems() * sizeof(float),
+                               ark::gpuMemcpyHostToDevice),
+                ark::gpuSuccess);
+
+    // Associate the initialized device buffer with a tensor produced from a
+    // placeholder operation
+    ark::Tensor tns =
+        model.placeholder(shape, ark::FP32, {}, {}, {}, -1, d_ext_buffer);
+
+    ark::Tensor res = model.add(tns, 1.0);
+    ark::DefaultExecutor exe(model);
+    exe.launch();
+    exe.run(1);
+    exe.stop();
+    // The placeholder tensor must resolve to the external buffer's address
+    UNITTEST_EQ(exe.tensor_address(tns), d_ext_buffer);
+
+    // Copy tensor data from GPU to CPU
+    std::vector<float> h_res(shape.nelems(), 0.0f);
+    exe.tensor_read(res, h_res);
+    // Input element i was i + 1 and one was added, so expect i + 2
+    for (auto i = 0; i < shape.nelems(); ++i) {
+        UNITTEST_EQ(h_res[i], i + 2);
+    }
+
+    UNITTEST_EQ(ark::gpuFree(d_ext_buffer), ark::gpuSuccess);
+
+    return ark::unittest::SUCCESS;
+}
+
+// Test entry point. TODO: add a test for delayed binding.
+int main() {
+    ark::init();
+    UNITTEST(test_ops_placeholder);
+    return ark::unittest::SUCCESS;
+}
diff --git a/ark/ops/ops_reshape_test.cpp b/ark/ops/ops_reshape_test.cpp
index 1128c955a..7bb8aa4be 100644
--- a/ark/ops/ops_reshape_test.cpp
+++ b/ark/ops/ops_reshape_test.cpp
@@ -9,7 +9,6 @@
 void test_reshape_checker(ark::Model &m, ark::Tensor t0, ark::Tensor t1,
                           const std::string &) {
     ark::DefaultExecutor exe(m);
-    exe.compile();
 
     std::vector<float> data_vec(t0.shape().nelems());
     std::iota(data_vec.begin(), data_vec.end(), 1.0f);
diff --git a/ark/ops/ops_scalar_test.cpp b/ark/ops/ops_scalar_test.cpp
index 6afc9e1ad..47a5b40bd 100644
--- a/ark/ops/ops_scalar_test.cpp
+++ b/ark/ops/ops_scalar_test.cpp
@@ -66,7 +66,6 @@ ark::unittest::State test_scalar_assign_fp16() {
         ark::Tensor t = m.constant(7, ark::Dims(4, 2, 50), ark::FP16);
 
         ark::DefaultExecutor exe(m);
-        exe.compile();
 
         exe.launch();
         exe.run(1);
@@ -84,7 +83,6 @@ ark::unittest::State test_scalar_assign_fp16() {
         ark::Tensor out = m.copy(7, t);
 
         ark::DefaultExecutor exe(m);
-        exe.compile();
 
         std::vector<ark::half_t> data(4 * 2 * 50, 3);
         exe.tensor_write(t, data);
@@ -109,7 +107,6 @@ ark::unittest::State test_scalar_assign_fp32() {
         ark::Tensor out = m.copy(7);
 
         ark::DefaultExecutor exe(m);
-        exe.compile();
 
         exe.launch();
         exe.run(1);
diff --git a/ark/ops/ops_tensor_test.cpp b/ark/ops/ops_tensor_test.cpp
index be6488ef1..a2c36fd8c 100644
--- a/ark/ops/ops_tensor_test.cpp
+++ b/ark/ops/ops_tensor_test.cpp
@@ -20,7 +20,6 @@ ark::unittest::State test_tensor_strides() {
 
     // Create an executor
     ark::DefaultExecutor exe(model);
-    exe.compile();
 
     // Fill buffer data: {1.0, 2.0, 3.0, 4.0}
     std::vector<float> data(shape.nelems());
@@ -53,7 +52,6 @@ ark::unittest::State test_tensor_memcpy() {
 
     // Create an executor
     ark::DefaultExecutor exe(model);
-    exe.compile();
 
     // Fill buffer data: {1.0, 2.0, 3.0, ..., 3024.0}
     std::vector<float> data(strides.nelems());
@@ -138,7 +136,6 @@ ark::unittest::State test_tensor_layout() {
 
     // Create an executor
     ark::DefaultExecutor exe(model);
-    exe.compile();
 
     // Fill tensor data: {1.0, 2.0, 3.0, ..., 120.0}
     std::vector<float> data(2 * 3 * 4 * 5);
diff --git a/ark/ops/ops_test_common.cpp b/ark/ops/ops_test_common.cpp
index 4e94d06a7..42f7e670e 100644
--- a/ark/ops/ops_test_common.cpp
+++ b/ark/ops/ops_test_common.cpp
@@ -38,7 +38,6 @@ OpsTestResult op_test(
     const std::vector<Planner::ConfigRule> &config_rules,
     bool print_on_error) {
     DefaultExecutor exe(model, -1, nullptr, config_rules);
-    exe.compile();
 
     std::vector<std::shared_ptr<std::vector<char>>> inputs_data_storages;
     std::vector<void *> inputs_data_refs;
diff --git a/ark/utils/utils_net_test.cpp b/ark/utils/utils_net_test.cpp
index 4c3b6f162..95dda890c 100644
--- a/ark/utils/utils_net_test.cpp
+++ b/ark/utils/utils_net_test.cpp
@@ -12,6 +12,7 @@ ark::unittest::State test_ipc_hosts() {
     auto tmp_hostfile = tmp_dir + "/.test_ipc_hostfile";
     ark::write_file(tmp_hostfile, "127.0.0.1\n127.0.0.1\n127.0.0.1\n");
     ::setenv("ARK_HOSTFILE", tmp_hostfile.c_str(), 1);
+    ::setenv("ARK_KEEP_TMP", "1", 1);
     ark::init();
 
     UNITTEST_EQ(ark::get_host(0, true), "127.0.0.1");
@@ -31,6 +32,7 @@ ark::unittest::State test_ipc_hosts_unknown_host() {
     auto tmp_hostfile = tmp_dir + "/.test_ipc_hostfile";
     ark::write_file(tmp_hostfile, "unknown\nunknown\nunknown\n");
     ::setenv("ARK_HOSTFILE", tmp_hostfile.c_str(), 1);
+    ::setenv("ARK_KEEP_TMP", "1", 1);
     ark::init();
 
     UNITTEST_THROW(ark::get_host(0, true), ark::InvalidUsageError);
diff --git a/arkprof.py b/arkprof.py
new file mode 100644
index 000000000..5fb62e118
--- /dev/null
+++ b/arkprof.py
@@ -0,0 +1,15 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Profile an ARK plan file. Usage: python3 arkprof.py <plan_file>"""
+
+import ark
+import sys
+
+# Fail with a usage message instead of a bare IndexError when no plan is given.
+if len(sys.argv) < 2:
+    sys.exit(f"Usage: {sys.argv[0]} <plan_file>")
+
+ark.init()
+ark.Profiler(ark.Plan.from_file(sys.argv[1])).run(
+    iter=1000, profile_processor_groups=False
+)
diff --git a/docs/env.md b/docs/env.md
index 2d5839c3b..95330a032 100644
--- a/docs/env.md
+++ b/docs/env.md
@@ -27,3 +27,7 @@
 - `ARK_DISABLE_IB` (Default: `0`; Options: `0`, `1`)
 
     If set to `1`, disable ibverbs networking (i.e., disable multi-node execution).
+
+- `ARK_IGNORE_BINARY_CACHE` (Default: `1`; Options: `0`, `1`)
+
+    If set to `1`, ignore the binary cache and force ARK to recompile binaries on each run.
diff --git a/docs/plan_file.md b/docs/plan_file.md
index 90a4537a2..c06ccc35d 100644
--- a/docs/plan_file.md
+++ b/docs/plan_file.md
@@ -6,6 +6,7 @@ See an example plan file: [Example 1](../examples/tutorial/default_plan.json)
 
     - Rank (Int)
     - WorldSize (Int)
+    - Architecture (String)
     - NumProcessors (Int)
     - NumWarpsPerProcessor (Int)
     - TaskInfos (Array of TaskInfo)
@@ -42,6 +43,23 @@ See an example plan file: [Example 1](../examples/tutorial/default_plan.json)
 
 `ProcessorRange`, `WarpRange`, `SramRange`, and `TaskRange` are in the "range" format, i.e., `[Begin, End, Step]` that indicates an arithmetic integer sequence with a common difference of `Step`, starting from `Begin` and ends before `End` (does not include `End`). They alternatively can be in the format `[Begin, End]` that assumes `Step` is 1.
 
+## Architecture
+
+The name of the hardware architecture on which the plan is intended to run. The following names are currently supported.
+
+- `ANY`: compatible with all architectures.
+
+- NVIDIA Family
+    - `CUDA`: compatible with all supported NVIDIA architectures.
+    - `CUDA_70`: compatible with NVIDIA Volta architecture.
+    - `CUDA_80`: compatible with NVIDIA Ampere architecture.
+    - `CUDA_90`: compatible with NVIDIA Hopper architecture.
+
+- AMD Family
+    - `ROCM`: compatible with all supported AMD architectures.
+    - `ROCM_90A`: compatible with AMD CDNA 2 (GFX90A) architecture.
+    - `ROCM_942`: compatible with AMD CDNA 3 (GFX942) architecture.
+
 ## TaskInfo
 
 A `TaskInfo` object describes a sequential set of operators. The followings describe each field of `TaskInfo`.
diff --git a/examples/llama/README.md b/examples/llama/README.md
index 090dd1de3..1fe040ae0 100644
--- a/examples/llama/README.md
+++ b/examples/llama/README.md
@@ -29,10 +29,10 @@ Llama2 examples over ARK.
 4. Download Llama2 model weights and tokenizer weights.
     * The model and tokenizer should be compatible with the [official PyTorch implementation](https://github.com/facebookresearch/llama/blob/main/llama).
 
-5. Run the model accuracy test. `--pth_path` is the path to the model weights file (`consolidated.00.pth`).
+5. Run the model accuracy test. `--ckpt_dir` is the directory that contains the model weight files (e.g., `consolidated.00.pth`).
 
     ```bash
-    python3 model_test.py --pth_path=/path/to/model/weights.pth
+    python3 model_test.py --ckpt_dir=/directory/of/model/weights
     ```
 
 6. Test text generation. `--pth_path` is the path to the model weights file (`consolidated.00.pth`), `--tok_path` is the path to the tokenizer weights file (`tokenizer.model`), and `--params_path` is the path to the model parameters (`params.json`).
diff --git a/examples/llama/model_7b_b1_s2048.py b/examples/llama/model_7b_b1_s2048.py
new file mode 100644
index 000000000..73d349ccc
--- /dev/null
+++ b/examples/llama/model_7b_b1_s2048.py
@@ -0,0 +1,714 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""LLaMA 2 Transformer model.
+   Correspond to https://github.com/facebookresearch/llama/blob/main/llama/model.py
+"""
+
+import ark
+import math
+from dataclasses import dataclass
+from typing import Optional
+import os
+
+
+@dataclass
+class ModelArgs:
+    dim: int = 4096
+    n_layers: int = 32
+    n_heads: int = 32
+    n_kv_heads: Optional[int] = None
+    vocab_size: int = -1  # defined later by tokenizer
+    multiple_of: int = (
+        256  # make SwiGLU hidden layer size multiple of large power of 2
+    )
+    ffn_dim_multiplier: Optional[float] = None
+    norm_eps: float = 1e-5
+    max_batch_size: int = 32
+    max_seq_len: int = 2048
+
+
+@dataclass
+class ModelArgs7B(ModelArgs):
+    dim: int = 4096
+    n_layers: int = 32
+    n_heads: int = 32
+    n_kv_heads: Optional[int] = None
+    vocab_size: int = -1  # defined later by tokenizer
+    multiple_of: int = (
+        256  # make SwiGLU hidden layer size multiple of large power of 2
+    )
+    ffn_dim_multiplier: Optional[float] = None
+    norm_eps: float = 1e-5
+    max_batch_size: int = 32
+    max_seq_len: int = 2048
+
+
+@dataclass
+class ModelArgs13B(ModelArgs):
+    dim: int = 5120
+    n_layers: int = 40
+    n_heads: int = 40
+    n_kv_heads: Optional[int] = None
+    vocab_size: int = -1  # defined later by tokenizer
+    multiple_of: int = (
+        256  # make SwiGLU hidden layer size multiple of large power of 2
+    )
+    ffn_dim_multiplier: Optional[float] = None
+    norm_eps: float = 1e-5
+    max_batch_size: int = 32
+    max_seq_len: int = 2048
+
+
+@dataclass
+class ModelArgs70B(ModelArgs):
+    dim: int = 8192
+    n_layers: int = 80
+    n_heads: int = 64
+    n_kv_heads: Optional[int] = 8
+    vocab_size: int = -1
+    multiple_of: int = (
+        4096  # make SwiGLU hidden layer size multiple of large power of 2
+    )
+    ffn_dim_multiplier: Optional[float] = 1.3
+    norm_eps: float = 1e-5
+    max_batch_size: int = 32
+    max_seq_len: int = 4096
+
+
+class RMSNorm(ark.Module):
+    """
+    Root mean square layer normalization (RMSNorm).
+    """
+
+    def __init__(
+        self, dim: int, eps: float = 1e-6, dtype: ark.DataType = ark.fp16
+    ):
+        super().__init__()
+        self.eps = eps
+        self.dtype = dtype
+        self.weight = ark.parameter([1, 1, dim], ark.fp32)
+
+    def forward(self, x):
+        with ark.PlannerContext(
+            warp_range=[0, 8],
+            sync=False,
+            config={
+                "NumWarps": 1,
+                "SramBytes": 0,
+                "NumTasks": 2048,
+                "Granularity": 7,
+            },
+        ):
+            with ark.PlannerContext(config={"Tile": [1, 4096]}):
+                x = ark.cast(x, ark.fp32)
+                x2 = ark.mul(x, x)
+            with ark.PlannerContext(config={"ImplType": "WarpWise"}):
+                mean = ark.reduce_mean(x2, axis=-1)
+        with ark.PlannerContext(
+            config={
+                "NumWarps": 1,
+                "SramBytes": 0,
+                "Tile": [64, 1],
+                "NumTasks": 32,
+            }
+        ):
+            rrms = ark.rsqrt(mean)
+        with ark.PlannerContext(
+            warp_range=[0, 8],
+            sync=False,
+            config={
+                "NumWarps": 1,
+                "SramBytes": 0,
+                "NumTasks": 2048,
+                "Tile": [1, 4096],
+                "Granularity": 7,
+            },
+        ):
+            x = ark.mul(x, rrms)
+            x = ark.mul(x, self.weight, x)
+            return ark.cast(x, self.dtype)
+
+
+class ColumnParallelLinear(ark.Module):
+    """Linear layer with column parallelism.
+
+    The linear layer is defined as Y = XA (no bias term). A is parallelized
+    along its second dimension as A = [A_1, ..., A_p].
+    Here the weight = A^T, so we need to partition the weight matrix along
+    its first dimension.
+
+    """
+
+    def __init__(
+        self,
+        in_dim: int,
+        out_dim: int,
+        dtype: ark.DataType = ark.fp16,
+        gather_output: bool = True,
+        local_rank: int = 0,
+        world_size: int = 1,
+    ):
+        super().__init__()
+        self.in_dim = in_dim
+        self.out_dim = out_dim
+        self.dtype = dtype
+        self.local_rank = local_rank
+        self.world_size = world_size
+        self.gather_output = gather_output
+
+        self.weight = ark.parameter([out_dim // world_size, in_dim], dtype)
+
+    def forward(self, x):
+        if self.world_size == 1 or not self.gather_output:
+            return ark.matmul(x, self.weight, transpose_other=True)
+        # We need to concat the output_tensor_shards along the last dimension
+        output_tensor = ark.tensor(
+            [x.shape()[0], x.shape()[1], self.out_dim], self.dtype
+        )
+        output_tensor_shards = ark.sharding(
+            output_tensor,
+            axis=2,
+            dim_per_shard=self.out_dim // self.world_size,
+        )
+        local_result = ark.identity(
+            output_tensor_shards[self.local_rank], deps=output_tensor_shards
+        )
+        # (batch_size, seq_len, out_dim // world_size)
+        local_result = ark.matmul(
+            x, self.weight, local_result, transpose_other=True
+        )
+        gather_input = ark.identity(output_tensor, deps=[local_result])
+        # Collapse the first two axes for the all-gather; restored on return
+        gather_reshape = ark.reshape(
+            gather_input, [x.shape()[0] * x.shape()[1], self.out_dim]
+        )
+        gather_out = ark.local_all_gather(
+            gather_reshape, self.local_rank, self.world_size, 1
+        )
+        return ark.reshape(
+            gather_out, [x.shape()[0], x.shape()[1], self.out_dim]
+        )
+
+
+class RowParallelLinear(ark.Module):
+    """Linear layer with row parallelism.
+
+    The linear layer is defined as Y = XA + b. A is parallelized along
+    its first dimension and X along its second dimension as:
+               -   -
+              | A_1 |
+              | .   |
+          A = | .   |        X = [X_1, ..., X_p]
+              | .   |
+              | A_p |
+               -   -
+
+    Here the weight = A^T, so we need to partition the weight matrix along
+    its second dimension.
+    """
+
+    def __init__(
+        self,
+        in_dim: int,
+        out_dim: int,
+        dtype: ark.DataType = ark.fp16,
+        input_is_parallel: bool = False,
+        local_rank: int = 0,
+        world_size: int = 1,
+    ):
+        super().__init__()
+        self.in_dim = in_dim
+        self.out_dim = out_dim
+        self.dtype = dtype
+        self.local_rank = local_rank
+        self.world_size = world_size
+        self.input_is_parallel = input_is_parallel
+
+        self.weight = ark.parameter([out_dim, in_dim // world_size], dtype)
+
+    def forward(self, x):
+        if self.world_size == 1:
+            return ark.matmul(x, self.weight, transpose_other=True)
+        x_ndims = len(x.shape())
+        if self.input_is_parallel:
+            input_parallel = x
+        else:
+            x_shards = ark.sharding(
+                x, x_ndims - 1, self.in_dim // self.world_size
+            )
+            input_parallel = x_shards[self.local_rank]
+        local_result = ark.matmul(
+            input_parallel, self.weight, transpose_other=True
+        )
+        reduced_result = ark.local_all_reduce(
+            local_result, self.local_rank, self.world_size
+        )
+        return reduced_result
+
+
+class ParallelEmbedding(ark.Module):
+    """
+    Embedding layer with the weight split along the embedding dim.
+    """
+
+    def __init__(
+        self,
+        vocab_size: int,
+        dim: int,
+        dtype: ark.DataType,
+        local_rank: int = 0,
+        world_size: int = 1,
+    ):
+        super().__init__()
+        self.vocab_size = vocab_size
+        self.dim = dim
+        self.out_dim = dim
+        self.dtype = dtype
+        self.world_size = world_size
+        self.local_rank = local_rank
+        # Each rank holds dim // world_size columns of the table.
+        self.weight = ark.parameter([vocab_size, dim // world_size], dtype)
+
+    def forward(self, x):
+        if self.world_size == 1:
+            return ark.embedding(x, self.weight)
+
+        # Each rank writes its embedding slice into its own shard of a
+        # full-width output tensor; the identity ops tie the shard to the
+        # full tensor before local_all_gather runs on the flattened view.
+        batch, seqlen = x.shape()[0], x.shape()[1]
+        full_out = ark.tensor([batch, seqlen, self.out_dim], self.dtype)
+        shards = ark.sharding(
+            full_out, axis=2, dim_per_shard=self.out_dim // self.world_size
+        )
+        my_shard = ark.identity(shards[self.local_rank], deps=shards)
+        my_shard = ark.embedding(x, self.weight, my_shard)
+        gather_in = ark.identity(full_out, deps=[my_shard])
+        # Merge the first two axes for the gather, then restore them.
+        flat = ark.reshape(gather_in, [batch * seqlen, self.out_dim])
+        gathered = ark.local_all_gather(
+            flat, self.local_rank, self.world_size, 1
+        )
+        return ark.reshape(gathered, [batch, seqlen, self.out_dim])
+
+
+class Linear(ark.Module):
+    """Bias-free linear layer: matmul of the input with ``weight`` transposed."""
+
+    def __init__(
+        self, in_dim: int, out_dim: int, dtype: ark.DataType = ark.fp16
+    ):
+        super().__init__()
+        self.dtype = dtype
+        # Weight is created as [out_dim, in_dim]; forward transposes it.
+        self.weight = ark.parameter([out_dim, in_dim], dtype)
+
+    def forward(self, x):
+        projected = ark.matmul(x, self.weight, transpose_other=True)
+        return projected
+
+
+class Silu(ark.Module):
+    """SiLU activation: ``silu(x) = x * sigmoid(x)``."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x: ark.Tensor):
+        # Give sigmoid a dedicated output tensor so that it is not an
+        # in-place operator; `x` is still needed for the multiply below.
+        sigmoid_out = ark.tensor(x.shape(), x.dtype())
+        gate = ark.sigmoid(x, sigmoid_out)
+        activated = ark.mul(x, gate)
+        return activated
+
+
+class FeedForward(ark.Module):
+    """
+    Gated feed-forward block: ``w2(silu(w1(x)) * w3(x))``.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        hidden_dim: int,
+        multiple_of: int,
+        ffn_dim_multiplier: Optional[float],
+        dtype: ark.DataType = ark.fp16,
+        local_rank: int = 0,
+        world_size: int = 1,
+    ):
+        super().__init__()
+        # Scale the requested width by 2/3, apply the optional multiplier,
+        # then round up to the next multiple of `multiple_of`.
+        hidden_dim = int(2 * hidden_dim / 3)
+        if ffn_dim_multiplier is not None:
+            hidden_dim = int(ffn_dim_multiplier * hidden_dim)
+        num_chunks = (hidden_dim + multiple_of - 1) // multiple_of
+        hidden_dim = multiple_of * num_chunks
+
+        self.w1 = ColumnParallelLinear(
+            dim, hidden_dim, dtype, False, local_rank, world_size
+        )
+        self.w2 = RowParallelLinear(
+            hidden_dim, dim, dtype, True, local_rank, world_size
+        )
+        self.w3 = ColumnParallelLinear(
+            dim, hidden_dim, dtype, False, local_rank, world_size
+        )
+
+    def forward(self, x):
+        def stage_ctx():
+            # Outer planner settings shared by both stages below.
+            return ark.PlannerContext(
+                warp_range=[0, 8],
+                sram_range=[0, 49344],
+                sync=False,
+                config={"NumWarps": 4, "NumTasks": 688},
+            )
+
+        def matmul_ctx():
+            return ark.PlannerContext(
+                config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]}
+            )
+
+        def elementwise_ctx():
+            return ark.PlannerContext(
+                config={"SramBytes": 0, "Tile": [256, 128]}
+            )
+
+        # Stage 1: gate = silu(w1(x))
+        with stage_ctx():
+            with matmul_ctx():
+                gate = self.w1(x)
+            with elementwise_ctx():
+                gate = Silu()(gate)
+        # Stage 2: hidden = gate * w3(x)
+        with stage_ctx():
+            with matmul_ctx():
+                up = self.w3(x)
+            with elementwise_ctx():
+                hidden = ark.mul(gate, up)
+        # The w2 projection is issued outside the planner contexts.
+        return self.w2(hidden)
+
+
+def apply_rotary_emb(xq, xk, freqs_cis):
+    """
+    Run ``ark.rope`` on xq and then on xk with the same ``freqs_cis``.
+    """
+    rotated_q = ark.rope(xq, freqs_cis)
+    rotated_k = ark.rope(xk, freqs_cis)
+    return rotated_q, rotated_k
+
+
+class Softmax(ark.Module):
+    """Softmax along the last axis, built from ARK primitive ops."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, input):
+        with ark.PlannerContext(
+            warp_range=[0, 8],
+            sram_range=[0, 0],
+            sync=False,
+            config={"NumWarps": 1, "SramBytes": 0, "NumTasks": 65536},
+        ):
+            # Shift by the maximum along the last axis, then exponentiate.
+            with ark.PlannerContext(config={"ImplType": "WarpWise"}):
+                row_max = ark.reduce_max(input, axis=-1)
+            with ark.PlannerContext(config={"Tile": [1, 2048]}):
+                shifted = ark.sub(input, row_max)
+                exps = ark.exp(shifted)
+            # Normalize by the sum of the exponentials.
+            with ark.PlannerContext(config={"ImplType": "WarpWise"}):
+                denom = ark.reduce_sum(exps, axis=-1)
+            with ark.PlannerContext(config={"Tile": [1, 2048]}):
+                probs = ark.div(exps, denom)
+        return probs
+
+
+class Attention(ark.Module):
+    def __init__(
+        self,
+        args: ModelArgs,
+        dtype: ark.DataType = ark.fp16,
+        local_rank: int = 0,
+        world_size: int = 1,
+    ):
+        super().__init__()
+        self.n_kv_heads = (
+            args.n_heads if args.n_kv_heads is None else args.n_kv_heads
+        )  # falls back to n_heads when args.n_kv_heads is None
+        model_parallel_size = world_size
+        self.dtype = dtype
+        self.n_local_heads = args.n_heads // model_parallel_size  # per rank
+        self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
+        self.n_rep = self.n_local_heads // self.n_local_kv_heads  # unused below
+        self.head_dim = args.dim // args.n_heads
+        self.wq = ColumnParallelLinear(
+            args.dim,
+            args.n_heads * self.head_dim,
+            dtype,
+            False,
+            local_rank,
+            world_size,
+        )
+        self.wk = ColumnParallelLinear(
+            args.dim,
+            self.n_kv_heads * self.head_dim,
+            dtype,
+            False,
+            local_rank,
+            world_size,
+        )
+        self.wv = ColumnParallelLinear(
+            args.dim,
+            self.n_kv_heads * self.head_dim,
+            dtype,
+            False,
+            local_rank,
+            world_size,
+        )
+        self.wo = RowParallelLinear(
+            args.n_heads * self.head_dim,
+            args.dim,
+            dtype,
+            True,
+            local_rank,
+            world_size,
+        )
+
+    def forward(
+        self,
+        x: ark.Tensor,
+        start_pos: int,  # not read in this method
+        freqs_cis: ark.Tensor,
+        mask: Optional[ark.Tensor],
+    ):
+        bsz, seqlen, _ = x.shape()
+        # Query: project, split into heads, apply rope, swap axes 1 and 2.
+        with ark.PlannerContext(
+            warp_range=[0, 4],
+            sram_range=[0, 24672],
+            sync=False,
+            config={"NumWarps": 4, "NumTasks": 256},
+        ):
+            with ark.PlannerContext(
+                config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]}
+            ):
+                xq = self.wq(x)
+            xq = ark.reshape(
+                xq, [bsz, seqlen, self.n_local_heads, self.head_dim]
+            )
+            with ark.PlannerContext(
+                config={"SramBytes": 0, "Tile": [256, 1, 128]}
+            ):
+                if freqs_cis is not None:
+                    xq = ark.rope(xq, freqs_cis)
+            with ark.PlannerContext(
+                config={"SramBytes": 0, "Tile": [256, 128]}
+            ):
+                xq = ark.transpose(xq, [0, 2, 1, 3])
+        # Key: same steps as the query, using the local KV head count.
+        with ark.PlannerContext(
+            warp_range=[0, 4],
+            sram_range=[0, 24672],
+            sync=False,
+            config={"NumWarps": 4, "NumTasks": 256},
+        ):
+            with ark.PlannerContext(
+                config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]}
+            ):
+                xk = self.wk(x)
+            xk = ark.reshape(
+                xk, [bsz, seqlen, self.n_local_kv_heads, self.head_dim]
+            )
+            with ark.PlannerContext(
+                config={"SramBytes": 0, "Tile": [256, 1, 128]}
+            ):
+                if freqs_cis is not None:
+                    xk = ark.rope(xk, freqs_cis)
+            keys = xk
+            with ark.PlannerContext(
+                config={"SramBytes": 0, "Tile": [256, 128]}
+            ):
+                keys = ark.transpose(keys, [0, 2, 1, 3])
+        # Value: project, split into KV heads, swap axes 1 and 2 (no rope).
+        with ark.PlannerContext(
+            warp_range=[0, 4],
+            sram_range=[0, 24672],
+            sync=False,
+            config={
+                "NumWarps": 4,
+                "NumTasks": 256,
+                "SramBytes": 24672,
+                "TileShapeMNK": [256, 128, 32],
+            },
+        ):
+            with ark.PlannerContext(
+                config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]}
+            ):
+                xv = self.wv(x)
+            xv = ark.reshape(
+                xv, [bsz, seqlen, self.n_local_kv_heads, self.head_dim]
+            )
+            values = xv
+            with ark.PlannerContext(
+                config={"SramBytes": 0, "Tile": [256, 1, 128]}
+            ):
+                values = ark.transpose(values, [0, 2, 1, 3])
+        # scores = matmul(xq, keys transposed) * (1 / sqrt(head_dim))
+        with ark.PlannerContext(
+            warp_range=[0, 8],
+            sram_range=[0, 49344],
+            sync=False,
+            config={
+                "NumWarps": 4,
+                "NumTasks": 4096,
+                "Granularity": 2,
+            },
+        ):
+            with ark.PlannerContext(
+                config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]}
+            ):
+                scores = ark.matmul(xq, keys, transpose_other=True)
+            with ark.PlannerContext(
+                config={"SramBytes": 0, "Tile": [256, 128]}
+            ):
+                scores = ark.mul(scores, 1.0 / math.sqrt(self.head_dim))
+        # The mask, when given, is added outside any PlannerContext.
+        if mask is not None:
+            scores = ark.add(scores, mask)
+
+        scores = Softmax()(scores)
+        # output = matmul(scores, values); axes 1 and 2 are swapped back.
+        with ark.PlannerContext(
+            warp_range=[0, 4],
+            sram_range=[0, 24672],
+            sync=False,
+            config={
+                "NumWarps": 4,
+                "NumTasks": 256,
+            },
+        ):
+            with ark.PlannerContext(
+                config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]}
+            ):
+                output = ark.matmul(scores, values)
+            with ark.PlannerContext(
+                config={"SramBytes": 0, "Tile": [256, 1, 128]}
+            ):
+                output = ark.transpose(output, [0, 2, 1, 3])
+        output = ark.reshape(
+            output, [bsz, seqlen, self.head_dim * self.n_local_heads]
+        )
+        return self.wo(output)
+
+
+class TransformerBlock(ark.Module):
+    """RMSNorm -> attention -> add, then RMSNorm -> feed-forward -> add."""
+
+    def __init__(
+        self,
+        layer_id: int,
+        args: ModelArgs,
+        dtype: ark.DataType = ark.fp16,
+        local_rank: int = 0,
+        world_size: int = 1,
+    ):
+        super().__init__()
+        self.layer_id = layer_id
+        self.n_heads = args.n_heads
+        self.dim = args.dim
+        self.head_dim = args.dim // args.n_heads
+        self.attention = Attention(args, dtype, local_rank, world_size)
+        self.feed_forward = FeedForward(
+            dim=args.dim,
+            hidden_dim=4 * args.dim,
+            multiple_of=args.multiple_of,
+            ffn_dim_multiplier=args.ffn_dim_multiplier,
+            dtype=dtype,
+            local_rank=local_rank,
+            world_size=world_size,
+        )
+        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps, dtype=dtype)
+        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps, dtype=dtype)
+
+    def forward(
+        self,
+        x: ark.Tensor,
+        start_pos: int,
+        freqs_cis: ark.Tensor,
+        mask: Optional[ark.Tensor],
+    ):
+        def residual_ctx():
+            return ark.PlannerContext(
+                warp_range=[0, 4],
+                config={
+                    "NumWarps": 4,
+                    "Tile": [256, 128],
+                    "NumTasks": 256,
+                    "SramBytes": 0,
+                },
+            )
+
+        # Attention on the normed input, added back onto the raw input.
+        normed = self.attention_norm(x)
+        attn = self.attention.forward(normed, start_pos, freqs_cis, mask)
+        with residual_ctx():
+            h = ark.add(x, attn)
+        # Feed-forward on the normed hidden state, added back onto it.
+        normed_h = self.ffn_norm(h)
+        ff = self.feed_forward(normed_h)
+        with residual_ctx():
+            out = ark.add(h, ff)
+        return out
+
+
+class Transformer(ark.Module):
+    """Token embedding, a stack of TransformerBlocks, RMSNorm, output layer."""
+
+    def __init__(
+        self,
+        params: ModelArgs,
+        dtype: ark.DataType = ark.fp16,
+        local_rank: int = 0,
+        world_size: int = 1,
+    ):
+        super().__init__()
+        self.params = params
+        self.vocab_size = params.vocab_size
+        self.n_layers = params.n_layers
+
+        self.tok_embeddings = ParallelEmbedding(
+            params.vocab_size, params.dim, dtype, local_rank, world_size
+        )
+
+        # Blocks are kept in a plain list and also registered by name.
+        self.layers = []
+        for layer_id in range(self.n_layers):
+            block = TransformerBlock(
+                layer_id, params, dtype, local_rank, world_size
+            )
+            self.layers.append(block)
+            self.register_module(f"layers.{layer_id}", block)
+        self.norm = RMSNorm(params.dim, eps=params.norm_eps, dtype=dtype)
+        self.output = ColumnParallelLinear(
+            params.dim, params.vocab_size, dtype, True, local_rank, world_size
+        )
+
+    def forward(
+        self,
+        tokens: ark.Tensor,
+        start_pos: int,
+        freqs_cis: ark.Tensor,
+        mask: Optional[ark.Tensor],
+    ):
+        h = self.tok_embeddings(tokens)
+        for block in self.layers:
+            h = block(h, start_pos, freqs_cis, mask)
+        logits = self.output(self.norm(h))
+        return logits
diff --git a/examples/llama/model_test.py b/examples/llama/model_test.py
index 737d3ec8b..f559a826b 100644
--- a/examples/llama/model_test.py
+++ b/examples/llama/model_test.py
@@ -58,30 +58,37 @@ def run_ark(
     ]
     output = module(*module_inputs)
 
-    runtime = ark.Runtime()
-    # Prefer num_warps_per_sm = 16 for nvidia and 8 for amd
-    runtime.launch(num_warps_per_sm=8)
+    with ark.Runtime() as rt:
+        plan = ark.DefaultPlanner().plan()
+        with open("plan.json", "w") as f:
+            f.write(str(plan))
+        rt.launch(plan=plan)
 
-    # Load model parameters
-    if state_dict:
-        module.load_state_dict(state_dict)
+        # Load model parameters
+        if state_dict:
+            print("Loading state_dict")
+            module.load_state_dict(state_dict)
+            print("Loading state_dict done")
 
-    # Load input data into tensors
-    tensors = [i for i in module_inputs if isinstance(i, ark.Tensor)]
-    tensor_data = [i for i in inputs if isinstance(i, np.ndarray)]
-    for tensor, ndarray in zip(tensors, tensor_data):
-        tensor.from_numpy(ndarray)
+        # Load input data into tensors
+        tensors = [i for i in module_inputs if isinstance(i, ark.Tensor)]
+        tensor_data = [i for i in inputs if isinstance(i, np.ndarray)]
+        for tensor, ndarray in zip(tensors, tensor_data):
+            tensor.from_numpy(ndarray)
 
-    start_time = time.time()
+        start_time = time.time()
 
-    # Run the model
-    runtime.run(iter=iterations)
+        # Run the model
+        print("Run:", iterations)
 
-    end_time = time.time()
+        rt.run(iter=iterations)
+        print("Run done")
 
-    if isinstance(output, list) or isinstance(output, tuple):
-        outputs = [o.to_numpy() for o in output]
-    outputs = [output.to_numpy()]
+        end_time = time.time()
+
+        if not isinstance(output, (list, tuple)):
+            output = [output]  # wrap a single tensor; sequences pass through
+        outputs = [o.to_numpy() for o in output]
 
     return RunResults(outputs=outputs, runtime=end_time - start_time)
 
@@ -160,7 +167,9 @@ def test_module(
         else:
             prefix = module_name_prefix + "." if module_name_prefix else ""
             # Load the state_dict from the given path
+            print("Loading ckpt:", ckpt_path)
             state_dict_pt = torch.load(ckpt_path)
+            print("Loading ckpt done")
             state_dict_pt = {
                 k[len(prefix) :]: v
                 for k, v in state_dict_pt.items()
@@ -182,6 +191,7 @@ def test_module(
         rank=rank,
         world_size=world_size,
     )
+    print("Run ARK done")
 
     if not test_thru_ark_only:
         # PyTorch module
@@ -195,6 +205,7 @@ def test_module(
             inputs_pt,
             iterations=test_thru_iterations if test_thru else 1,
         )
+        print("Run PyTorch done")
 
         if test_thru:
             print(
@@ -430,43 +441,43 @@ def test_transformer_block(
         low=-1, high=1, size=(batch_size, seq_len, args.dim)
     ).astype(dtype)
 
-    module = model_ark.Attention(
-        args, ark.DataType.from_numpy(dtype), rank, world_size
-    )
+    # module = model_ark.Attention(
+    #     args, ark.DataType.from_numpy(dtype), rank, world_size
+    # )
     # module_inputs = [
     #     ark.tensor(list(i.shape), ark.DataType.from_numpy(i.dtype))
     #     if isinstance(i, np.ndarray)
     #     else i
     #     for i in inputs
     # ]
-    feature_tensor = ark.tensor(
-        list(feature.shape), ark.DataType.from_numpy(feature.dtype)
-    )
-    freqs_cis_ark_tensor = ark.tensor(
-        list(freqs_cis_ark.shape), ark.DataType.from_numpy(freqs_cis_ark.dtype)
-    )
-    output = module(feature_tensor, 0, freqs_cis_ark_tensor, None)
-
-    ark.Model.get_model().create_nodes()
-    print(ark.Model.get_model().serialize())
-
-    # test_module(
-    #     module_class_ark=model_ark.TransformerBlock,
-    #     module_args_ark=[
-    #         0,
-    #         args,
-    #         ark.DataType.from_numpy(dtype),
-    #         rank,
-    #         world_size,
-    #     ],
-    #     inputs_ark=[feature, 0, freqs_cis_ark, None],
-    #     module_class_pt=model_pt.TransformerBlock,
-    #     module_args_pt=[0, args],
-    #     inputs_pt=[feature.astype(dtype), 0, freqs_cis, None],
-    #     module_name_prefix="layers.0",
-    #     rank=rank,
-    #     world_size=world_size,
+    # feature_tensor = ark.tensor(
+    #     list(feature.shape), ark.DataType.from_numpy(feature.dtype)
     # )
+    # freqs_cis_ark_tensor = ark.tensor(
+    #     list(freqs_cis_ark.shape), ark.DataType.from_numpy(freqs_cis_ark.dtype)
+    # )
+    # output = module(feature_tensor, 0, freqs_cis_ark_tensor, None)
+
+    # print(ark.Model.get_model().serialize())
+
+    test_module(
+        module_class_ark=model_ark.TransformerBlock,
+        module_args_ark=[
+            0,
+            args,
+            ark.DataType.from_numpy(dtype),
+            rank,
+            world_size,
+        ],
+        inputs_ark=[feature, 0, freqs_cis_ark, None],
+        module_class_pt=model_pt.TransformerBlock,
+        module_args_pt=[0, args],
+        inputs_pt=[feature.astype(dtype), 0, freqs_cis, None],
+        module_name_prefix="layers.0",
+        rank=rank,
+        world_size=world_size,
+        test_thru=False,
+    )
 
 
 def test_transformer(
@@ -570,7 +581,7 @@ def worker(
     # Configurations
     args = ModelArgs7B()
     batch_size = 1
-    seq_len = 512
+    seq_len = 2048
     dtype = np.float16
     world_size = ngpus
 
@@ -578,7 +589,7 @@ def worker(
     args.vocab_size = 32000
 
     # Reduce max_seq_len due to OOM from the PyTorch model
-    args.max_seq_len = 512
+    args.max_seq_len = 2048
 
     # Verify the configurations
     assert batch_size <= args.max_batch_size
diff --git a/examples/llama/plan_llama2_7b_b1_s2048.json b/examples/llama/plan_llama2_7b_b1_s2048.json
new file mode 100644
index 000000000..b0bc757dc
--- /dev/null
+++ b/examples/llama/plan_llama2_7b_b1_s2048.json
@@ -0,0 +1,1206 @@
+{
+  "Rank": 0,
+  "WorldSize": 1,
+  "Architecture": "ROCM_942",
+  "NumProcessors": 304,
+  "NumWarpsPerProcessor": 8,
+  "TaskInfos": [
+    {
+      "Id": 0,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Cast",
+          "Name": "cast",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":9,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":11,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":12,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,4096],
+            "NumTasks": 2048
+          }
+        },
+        {
+          "Type": "Mul",
+          "Name": "mul",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":12,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":12,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":13,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":14,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,4096],
+            "NumTasks": 2048
+          }
+        },
+        {
+          "Type": "ReduceMean",
+          "Name": "reduce_mean",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":14,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":15,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":16,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "Axis": {"INT":2},
+            "KeepDim": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 1,
+            "ImplType": "WarpWise",
+            "SramBytes": 0,
+            "NumTasks": 2048
+          }
+        }
+      ]
+    },
+    {
+      "Id": 3,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Rsqrt",
+          "Name": "rsqrt",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":16,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":17,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":18,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [64,1],
+            "NumTasks": 32
+          }
+        }
+      ]
+    },
+    {
+      "Id": 4,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Mul",
+          "Name": "mul_1",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":12,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":18,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":19,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":20,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,4096],
+            "NumTasks": 2048
+          }
+        },
+        {
+          "Type": "Mul",
+          "Name": "mul_2",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":20,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":7,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":20,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":21,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,4096],
+            "NumTasks": 2048
+          }
+        },
+        {
+          "Type": "Cast",
+          "Name": "cast_1",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":21,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":22,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":23,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,4096],
+            "NumTasks": 2048
+          }
+        }
+      ]
+    },
+    {
+      "Id": 7,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":23,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":0,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":24,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":25,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [256,128,32],
+            "NumTasks": 256
+          }
+        },
+        {
+          "Type": "Rope",
+          "Name": "rope",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":30,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":10,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":33,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":34,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [256,1,128],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 10,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_1",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":23,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":1,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":26,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":27,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [256,128,32],
+            "NumTasks": 256
+          }
+        },
+        {
+          "Type": "Rope",
+          "Name": "rope_1",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":31,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":10,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":35,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":36,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [256,128],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 13,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_2",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":23,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":2,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":28,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":29,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [256,128,32],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 15,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_3",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":38,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":42,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":43,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":44,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [256,128,32],
+            "NumTasks": 4096
+          }
+        },
+        {
+          "Type": "ScalarMul",
+          "Name": "mul_3",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":44,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":45,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":46,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "Factor": {"FLOAT":0.0883883461356163}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [256,128],
+            "NumTasks": 4096
+          }
+        }
+      ]
+    },
+    {
+      "Id": 17,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "ReduceMax",
+          "Name": "reduce_max",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":46,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":47,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":27,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":48,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":27,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "Axis": {"INT":3},
+            "KeepDim": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 1,
+            "ImplType": "WarpWise",
+            "SramBytes": 0,
+            "NumTasks": 65536
+          }
+        },
+        {
+          "Type": "Sub",
+          "Name": "sub",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":46,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":48,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":27,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":46,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":49,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,2048],
+            "NumTasks": 65536
+          }
+        },
+        {
+          "Type": "Exp",
+          "Name": "exp",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":49,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":49,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":50,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,2048],
+            "NumTasks": 65536
+          }
+        },
+        {
+          "Type": "ReduceSum",
+          "Name": "reduce_sum",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":50,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":51,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":52,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "Axis": {"INT":3},
+            "KeepDim": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 1,
+            "ImplType": "WarpWise",
+            "SramBytes": 0,
+            "NumTasks": 65536
+          }
+        },
+        {
+          "Type": "Div",
+          "Name": "div",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":50,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":52,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":50,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":53,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,2048],
+            "NumTasks": 65536
+          }
+        }
+      ]
+    },
+    {
+      "Id": 22,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_4",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":53,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":40,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":23,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":54,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":29,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":55,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":29,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":false}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [256,128,32],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 24,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_5",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":58,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":3,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":59,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":60,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [256,128,32],
+            "NumTasks": 256
+          }
+        },
+        {
+          "Type": "Add",
+          "Name": "add",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":9,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":60,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":61,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":62,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [256,128],
+            "NumTasks": 256
+          }
+        },
+        {
+          "Type": "Cast",
+          "Name": "cast_2",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":62,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [256,128],
+            "NumTasks": 256
+          }
+        },
+        {
+          "Type": "Mul",
+          "Name": "mul_4",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":65,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":66,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [256,128],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 28,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "ReduceMean",
+          "Name": "reduce_mean_1",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":66,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":67,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":68,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "Axis": {"INT":2},
+            "KeepDim": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 1,
+            "ImplType": "WarpWise",
+            "SramBytes": 0,
+            "NumTasks": 2048
+          }
+        }
+      ]
+    },
+    {
+      "Id": 29,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Rsqrt",
+          "Name": "rsqrt_1",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":68,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":69,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":70,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [64,1],
+            "NumTasks": 32
+          }
+        }
+      ]
+    },
+    {
+      "Id": 30,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Mul",
+          "Name": "mul_5",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":70,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":71,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":72,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,4096],
+            "NumTasks": 2048
+          }
+        },
+        {
+          "Type": "Mul",
+          "Name": "mul_6",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":72,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":8,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":72,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":73,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,4096],
+            "NumTasks": 2048
+          }
+        },
+        {
+          "Type": "Cast",
+          "Name": "cast_3",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":73,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":74,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":75,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,4096],
+            "NumTasks": 2048
+          }
+        }
+      ]
+    },
+    {
+      "Id": 33,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_6",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":75,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":4,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":76,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":77,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [256,128,32],
+            "NumTasks": 688
+          }
+        },
+        {
+          "Type": "Sigmoid",
+          "Name": "sigmoid",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":77,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":78,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":79,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [256,128],
+            "NumTasks": 688
+          }
+        },
+        {
+          "Type": "Mul",
+          "Name": "mul_7",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":77,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":79,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":80,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":81,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [256,128],
+            "NumTasks": 688
+          }
+        }
+      ]
+    },
+    {
+      "Id": 36,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_7",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":75,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":6,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":82,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":83,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [256,128,32],
+            "NumTasks": 602
+          }
+        },
+        {
+          "Type": "Mul",
+          "Name": "mul_8",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":81,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":83,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":84,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":85,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [256,128],
+            "NumTasks": 602
+          }
+        }
+      ]
+    },
+    {
+      "Id": 37,
+      "NumWarps": 4,
+      "SramBytes": 16480,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_7",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":102,"DataType":"FP16","Shape":[1,1792,4096],"Strides":[1,2048,4096],"Offsets":[0,256,0],"PaddedShape":[1,1792,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":6,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":101,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":100,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 16480,
+            "TileShapeMNK": [128,128,32],
+            "NumTasks": 172
+          }
+        },
+        {
+          "Type": "Mul",
+          "Name": "mul_8",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":81,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":83,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":84,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":85,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [128,128],
+            "NumTasks": 172
+          }
+        }
+      ]
+    },
+    {
+      "Id": 38,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_8",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":85,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":5,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":86,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":87,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [256,128,32],
+            "NumTasks": 256
+          }
+        },
+        {
+          "Type": "Add",
+          "Name": "add_1",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":62,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":87,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":88,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":89,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [256,128],
+            "NumTasks": 256
+          }
+        }
+      ]
+    }
+  ],
+  "ProcessorGroups": [
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,8],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":0,"TaskRange":[0,2048],"Granularity":7}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,32],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,32],
+          "WarpRange": [0,1],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":3,"TaskRange":[0,32],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,8],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":4,"TaskRange":[0,2048],"Granularity":7}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,256],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,256],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":7,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,256],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,256],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":10,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,256],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,256],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":13,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,8],
+          "SramRange": [0,49344],
+          "TaskGroups": [
+            {"TaskId":15,"TaskRange":[0,4096],"Granularity":2}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,8],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":17,"TaskRange":[0,65536],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,256],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,256],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":22,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,256],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,256],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":24,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,8],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":28,"TaskRange":[0,2048],"Granularity":7}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,32],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,32],
+          "WarpRange": [0,1],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":29,"TaskRange":[0,32],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,8],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":30,"TaskRange":[0,2048],"Granularity":7}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,8],
+          "SramRange": [0,49344],
+          "TaskGroups": [
+            {"TaskId":33,"TaskRange":[0,688],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,8],
+          "SramRange": [0,49344],
+          "TaskGroups": [
+            {"TaskId":36,"TaskRange":[0,602],"Granularity":2},
+            {"TaskId":37,"TaskRange":[0,172],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,256],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,256],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":38,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    }
+  ]
+}
\ No newline at end of file
diff --git a/examples/tutorial/default_plan.json b/examples/tutorial/default_plan.json
index c6b4be243..bb774a5b8 100644
--- a/examples/tutorial/default_plan.json
+++ b/examples/tutorial/default_plan.json
@@ -1,36 +1,37 @@
 {
   "Rank": 0,
   "WorldSize": 1,
-  "NumProcessors": 108,
-  "NumWarpsPerProcessor": 8,
+  "Architecture": "ROCM_942",
+  "NumProcessors": 304,
+  "NumWarpsPerProcessor": 4,
   "TaskInfos": [
     {
       "Id": 0,
-      "NumWarps": 8,
-      "SramBytes": 147456,
+      "NumWarps": 4,
+      "SramBytes": 24672,
       "Ops": [
         {
           "Type": "Matmul",
           "Name": "matmul",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":1,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":1,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":4,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":4,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
             "TransposeOther": {"BOOL":true}
           },
           "Config": {
-            "NumWarps": 8,
-            "SramBytes": 147456,
-            "TileShapeMNK": [128,256,64],
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [128,256,32],
             "NumTasks": 172
           }
         }
@@ -46,13 +47,13 @@
           "Name": "sigmoid",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":6,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":6,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -74,14 +75,14 @@
           "Name": "mul",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":8,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":8,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -95,31 +96,31 @@
     },
     {
       "Id": 3,
-      "NumWarps": 8,
-      "SramBytes": 147456,
+      "NumWarps": 4,
+      "SramBytes": 24672,
       "Ops": [
         {
           "Type": "Matmul",
           "Name": "matmul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":3,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":3,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":10,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":10,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
             "TransposeOther": {"BOOL":true}
           },
           "Config": {
-            "NumWarps": 8,
-            "SramBytes": 147456,
-            "TileShapeMNK": [128,256,64],
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [128,256,32],
             "NumTasks": 172
           }
         }
@@ -135,14 +136,14 @@
           "Name": "mul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":12,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":12,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -156,31 +157,31 @@
     },
     {
       "Id": 5,
-      "NumWarps": 8,
-      "SramBytes": 147456,
+      "NumWarps": 4,
+      "SramBytes": 24672,
       "Ops": [
         {
           "Type": "Matmul",
           "Name": "matmul_2",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":2,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":2,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":14,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":14,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":15,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":15,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
             "TransposeOther": {"BOOL":true}
           },
           "Config": {
-            "NumWarps": 8,
-            "SramBytes": 147456,
-            "TileShapeMNK": [128,256,64],
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [128,256,32],
             "NumTasks": 64
           }
         }
@@ -189,12 +190,12 @@
   ],
   "ProcessorGroups": [
     {
-      "ProcessorRange": [0,108],
+      "ProcessorRange": [0,172],
       "ResourceGroups": [
         {
-          "ProcessorRange": [0,108],
-          "WarpRange": [0,8],
-          "SramRange": [0,147456],
+          "ProcessorRange": [0,172],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
           "TaskGroups": [
             {"TaskId":0,"TaskRange":[0,172],"Granularity":1}
           ]
@@ -202,10 +203,10 @@
       ]
     },
     {
-      "ProcessorRange": [0,108],
+      "ProcessorRange": [0,304],
       "ResourceGroups": [
         {
-          "ProcessorRange": [0,108],
+          "ProcessorRange": [0,304],
           "WarpRange": [0,1],
           "SramRange": [0,0],
           "TaskGroups": [
@@ -215,10 +216,10 @@
       ]
     },
     {
-      "ProcessorRange": [0,108],
+      "ProcessorRange": [0,304],
       "ResourceGroups": [
         {
-          "ProcessorRange": [0,108],
+          "ProcessorRange": [0,304],
           "WarpRange": [0,1],
           "SramRange": [0,0],
           "TaskGroups": [
@@ -228,12 +229,12 @@
       ]
     },
     {
-      "ProcessorRange": [0,108],
+      "ProcessorRange": [0,172],
       "ResourceGroups": [
         {
-          "ProcessorRange": [0,108],
-          "WarpRange": [0,8],
-          "SramRange": [0,147456],
+          "ProcessorRange": [0,172],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
           "TaskGroups": [
             {"TaskId":3,"TaskRange":[0,172],"Granularity":1}
           ]
@@ -241,10 +242,10 @@
       ]
     },
     {
-      "ProcessorRange": [0,108],
+      "ProcessorRange": [0,304],
       "ResourceGroups": [
         {
-          "ProcessorRange": [0,108],
+          "ProcessorRange": [0,304],
           "WarpRange": [0,1],
           "SramRange": [0,0],
           "TaskGroups": [
@@ -258,8 +259,8 @@
       "ResourceGroups": [
         {
           "ProcessorRange": [0,64],
-          "WarpRange": [0,8],
-          "SramRange": [0,147456],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
           "TaskGroups": [
             {"TaskId":5,"TaskRange":[0,64],"Granularity":1}
           ]
@@ -267,4 +268,4 @@
       ]
     }
   ]
-}
+}
\ No newline at end of file
diff --git a/examples/tutorial/model_test_tutorial.py b/examples/tutorial/model_test_tutorial.py
new file mode 100644
index 000000000..c83d0d15e
--- /dev/null
+++ b/examples/tutorial/model_test_tutorial.py
@@ -0,0 +1,164 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import ark
+import torch
+import torch.optim as optim
+
+
+# Set random seed for reproducibility.
+torch.manual_seed(42)
+
+
+# Let's first define a linear layer using ARK.
+class ARKLinear(ark.Module):
+    def __init__(self, weight):
+        super().__init__()
+        self.weight = weight  # second matmul operand in forward() and backward()
+
+    def forward(self, input):
+        # Keep the input around; backward() reads it for the weight gradient.
+        self.saved_input = input
+        return ark.matmul(input, self.weight, transpose_other=True)
+
+    def backward(self, grad_output):
+        # The weight gradient is built from grad_output and the saved input.
+        saved = self.saved_input
+        grad_weight = ark.matmul(grad_output, saved, transpose_input=True)
+        grad_input = ark.matmul(grad_output, self.weight, transpose_other=False)
+        self.weight.update_gradient(grad_weight)
+        return grad_input, grad_weight
+
+
+# Let's use our previous module to define a double linear layer.
+class MyARKModule(ark.Module):
+    def __init__(self, weight0, weight1):
+        super().__init__()
+        self.linear1, self.linear2 = ARKLinear(weight0), ARKLinear(weight1)
+
+    def forward(self, x):
+        # Chain the layers: linear1 first, then linear2 on its output.
+        hidden = self.linear1.forward(x)
+        return self.linear2.forward(hidden)
+
+    def backward(self, grad_output):
+        # Visit the layers in reverse, passing each input gradient backwards.
+        grad_hidden, grad_weight2 = self.linear2.backward(grad_output)
+        grad_x, grad_weight1 = self.linear1.backward(grad_hidden)
+        return grad_x, grad_weight1, grad_weight2
+
+
+# Define a PyTorch model.
+class SimpleModel(torch.nn.Module):
+    """Five bias-free 256x256 linear layers followed by a ReLU."""
+
+    def __init__(self):
+        super().__init__()
+        # Layers 0-4 are the linear layers, created in index order; the
+        # activation is appended after them.
+        linears = [torch.nn.Linear(256, 256, bias=False) for _ in range(5)]
+        self.layers = torch.nn.Sequential(*linears, torch.nn.ReLU())
+
+    def forward(self, x):
+        """Run `x` through the sequential stack and return its output."""
+        out = self.layers(x)
+        return out
+
+
+# Function to compare the gradients of two models of the same architecture and parameter order.
+def compare_grad(ark_model, torch_model, atol=1e-4, rtol=1e-2):
+    """Print every parameter pair whose gradients differ beyond atol/rtol.
+
+    Parameters are paired positionally via named_parameters(). A pair where
+    neither side has a gradient yet is skipped instead of reaching allclose.
+    """
+    pairs = zip(ark_model.named_parameters(), torch_model.named_parameters())
+    for (ark_name, ark_param), (torch_name, torch_param) in pairs:
+        ark_grad, torch_grad = ark_param.grad, torch_param.grad
+        if (ark_grad is None) ^ (torch_grad is None):
+            print("Exactly one of the gradients is None")
+        # Both None means nothing to compare; allclose(None, None) would raise.
+        elif ark_grad is not None and not torch.allclose(
+            ark_grad, torch_grad, atol=atol, rtol=rtol
+        ):
+            print(f"Gradient for {ark_name} when compared to {torch_name} is different:")
+            print(f"ARK gradient: {ark_grad}")
+            print(f"Torch gradient: {torch_grad}")
+
+
+# For our ARK model we will replace the first two layers with ARK layers.
+def replace_layers_with_ark(model):
+    """Swap layers 0 and 1 of `model` for one ARK module and return `model`."""
+    # Wrap both layer weights, moved to cuda:0, in new trainable parameters.
+    weight_0, weight_1 = (
+        torch.nn.Parameter(layer.weight.to("cuda:0").requires_grad_(True))
+        for layer in (model.layers[0], model.layers[1])
+    )
+    model.layers[0] = ark.RuntimeModule(MyARKModule(weight_0, weight_1))
+    # The ARK module uses both weights, so the old layer 1 is removed.
+    del model.layers[1]
+
+    # The ARK module took the place of PyTorch layers, so the parameters it uses
+    # are registered on the model itself for PyTorch's optimizers to pick up.
+    for name, weight in (("weight_0", weight_0), ("weight_1", weight_1)):
+        model.register_parameter(name, weight)
+
+    return model
+
+
+# Build two instances of the same architecture: a pure-PyTorch reference and one to convert.
+pytorch_model = SimpleModel()
+ark_model = SimpleModel()
+
+
+# Copy the reference weights first, then swap the first two layers for the ARK module.
+ark_model.load_state_dict(pytorch_model.state_dict())
+ark_model = replace_layers_with_ark(ark_model)
+
+
+# Move both models to GPU.
+pytorch_model.to("cuda:0")
+ark_model.to("cuda:0")
+
+# Create a random input batch; the ARK model gets its own detached copy of it.
+input_torch = torch.randn(128, 256).to("cuda:0").requires_grad_(True)
+input_ark = input_torch.clone().detach().requires_grad_(True)
+
+
+# Define an arbitrary target, shared by both models.
+target = torch.randn(128, 256).to("cuda:0")
+
+# Both models use MSE loss and an SGD optimizer with the same learning rate.
+loss_fn = torch.nn.MSELoss()
+optim_torch = optim.SGD(pytorch_model.parameters(), lr=0.01)
+optim_ark = optim.SGD(ark_model.parameters(), lr=0.01)
+
+num_iters = 5  # number of lock-step training iterations for both models
+for step in range(num_iters):  # `step`, so the built-in iter() is not shadowed
+    print(f"Iteration {step + 1}/{num_iters}")
+
+    optim_torch.zero_grad()
+    optim_ark.zero_grad()
+
+    pytorch_output = pytorch_model(input_torch)
+    ark_output = ark_model(input_ark)
+
+    assert torch.allclose(pytorch_output, ark_output, atol=1e-4, rtol=1e-2)
+
+    # Compute both losses against the same target.
+    torch_loss = loss_fn(pytorch_output, target)
+    ark_loss = loss_fn(ark_output, target)
+
+    # See how ARK's loss compares to PyTorch's loss.
+    print(f"\nPyTorch loss: {torch_loss.item()}")
+    print(f"\nARK loss: {ark_loss.item()}\n")
+    assert torch.allclose(torch_loss, ark_loss, atol=1e-4, rtol=1e-2)
+
+    # Perform a backward pass.
+    torch_loss.backward()
+    ark_loss.backward()
+
+    optim_torch.step()
+    optim_ark.step()
+
+    # Report any parameter whose gradients differ between the two models.
+    compare_grad(ark_model, pytorch_model)
diff --git a/examples/tutorial/plan.json b/examples/tutorial/plan.json
index c0854e505..335c27549 100644
--- a/examples/tutorial/plan.json
+++ b/examples/tutorial/plan.json
@@ -1,6 +1,7 @@
 {
   "Rank": 0,
   "WorldSize": 1,
+  "Architecture": "CUDA_80",
   "NumProcessors": 108,
   "NumWarpsPerProcessor": 8,
   "TaskInfos": [
@@ -14,14 +15,14 @@
           "Name": "matmul",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
-            {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
+            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
+            {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
           ],
           "WriteTensors": [
-            {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -46,13 +47,13 @@
           "Name": "sigmoid",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "WriteTensors": [
-            {"Id":6,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":6,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {},
           "Config": {
@@ -74,14 +75,14 @@
           "Name": "mul",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
-            {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
+            {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "WriteTensors": [
-            {"Id":8,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":8,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {},
           "Config": {
@@ -103,14 +104,14 @@
           "Name": "matmul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
-            {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
+            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
+            {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
           ],
           "WriteTensors": [
-            {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -135,14 +136,14 @@
           "Name": "mul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
-            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
+            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "WriteTensors": [
-            {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {},
           "Config": {
@@ -164,14 +165,14 @@
           "Name": "matmul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":16,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,8320],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,8320]},
-            {"Id":17,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[4096,8320],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,8320]}
+            {"Id":16,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,8320],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,8320]},
+            {"Id":17,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,8320],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,8320]}
           ],
           "WriteTensors": [
-            {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "ResultTensors": [
-            {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -196,14 +197,14 @@
           "Name": "matmul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":18,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,2688],"Strides":[1,512,11008],"Offsets":[0,0,8320],"PaddedShape":[1,512,2688]},
-            {"Id":19,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[4096,2688],"Strides":[4096,11008],"Offsets":[0,8320],"PaddedShape":[4096,2688]}
+            {"Id":18,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,2688],"Strides":[1,512,11008],"Offsets":[0,0,8320],"PaddedShape":[1,512,2688]},
+            {"Id":19,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,2688],"Strides":[4096,11008],"Offsets":[0,8320],"PaddedShape":[4096,2688]}
           ],
           "WriteTensors": [
-            {"Id":20,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":20,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "ResultTensors": [
-            {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -228,14 +229,14 @@
           "Name": "add_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
-            {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
+            {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "WriteTensors": [
-            {"Id":23,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":23,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "ResultTensors": [
-            {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "Args": {},
           "Config": {
diff --git a/examples/tutorial/plan_1_larger_tile.json b/examples/tutorial/plan_1_larger_tile.json
index 3a3f66530..04d2e9d60 100644
--- a/examples/tutorial/plan_1_larger_tile.json
+++ b/examples/tutorial/plan_1_larger_tile.json
@@ -1,6 +1,7 @@
 {
   "Rank": 0,
   "WorldSize": 1,
+  "Architecture": "CUDA_80",
   "NumProcessors": 108,
   "NumWarpsPerProcessor": 8,
   "TaskInfos": [
@@ -14,14 +15,14 @@
           "Name": "matmul",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
-            {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
+            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
+            {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
           ],
           "WriteTensors": [
-            {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -46,13 +47,13 @@
           "Name": "sigmoid",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "WriteTensors": [
-            {"Id":6,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":6,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {},
           "Config": {
@@ -74,14 +75,14 @@
           "Name": "mul",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
-            {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
+            {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "WriteTensors": [
-            {"Id":8,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":8,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {},
           "Config": {
@@ -103,14 +104,14 @@
           "Name": "matmul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
-            {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
+            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
+            {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
           ],
           "WriteTensors": [
-            {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -135,14 +136,14 @@
           "Name": "mul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
-            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
+            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "WriteTensors": [
-            {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {},
           "Config": {
@@ -164,14 +165,14 @@
           "Name": "matmul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
-            {"Id":2,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008]}
+            {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
+            {"Id":2,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008]}
           ],
           "WriteTensors": [
-            {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "ResultTensors": [
-            {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
diff --git a/examples/tutorial/plan_2_split_k.json b/examples/tutorial/plan_2_split_k.json
index 493515d8c..837944171 100644
--- a/examples/tutorial/plan_2_split_k.json
+++ b/examples/tutorial/plan_2_split_k.json
@@ -1,6 +1,7 @@
 {
   "Rank": 0,
   "WorldSize": 1,
+  "Architecture": "CUDA_80",
   "NumProcessors": 108,
   "NumWarpsPerProcessor": 8,
   "TaskInfos": [
@@ -14,14 +15,14 @@
           "Name": "matmul",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
-            {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
+            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
+            {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
           ],
           "WriteTensors": [
-            {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -46,13 +47,13 @@
           "Name": "sigmoid",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "WriteTensors": [
-            {"Id":6,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":6,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {},
           "Config": {
@@ -74,14 +75,14 @@
           "Name": "mul",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
-            {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
+            {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "WriteTensors": [
-            {"Id":8,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":8,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {},
           "Config": {
@@ -103,14 +104,14 @@
           "Name": "matmul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
-            {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
+            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
+            {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
           ],
           "WriteTensors": [
-            {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -135,14 +136,14 @@
           "Name": "mul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
-            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
+            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "WriteTensors": [
-            {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {},
           "Config": {
@@ -164,14 +165,14 @@
           "Name": "matmul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":16,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,8320],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,8320]},
-            {"Id":17,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[4096,8320],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,8320]}
+            {"Id":16,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,8320],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,8320]},
+            {"Id":17,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,8320],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,8320]}
           ],
           "WriteTensors": [
-            {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "ResultTensors": [
-            {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -196,14 +197,14 @@
           "Name": "matmul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":18,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,2688],"Strides":[1,512,11008],"Offsets":[0,0,8320],"PaddedShape":[1,512,2688]},
-            {"Id":19,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[4096,2688],"Strides":[4096,11008],"Offsets":[0,8320],"PaddedShape":[4096,2688]}
+            {"Id":18,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,2688],"Strides":[1,512,11008],"Offsets":[0,0,8320],"PaddedShape":[1,512,2688]},
+            {"Id":19,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,2688],"Strides":[4096,11008],"Offsets":[0,8320],"PaddedShape":[4096,2688]}
           ],
           "WriteTensors": [
-            {"Id":20,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":20,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "ResultTensors": [
-            {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -228,14 +229,14 @@
           "Name": "add_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
-            {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
+            {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "WriteTensors": [
-            {"Id":23,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":23,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "ResultTensors": [
-            {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "Args": {},
           "Config": {
diff --git a/examples/tutorial/plan_3_overwrite.json b/examples/tutorial/plan_3_overwrite.json
index c0854e505..335c27549 100644
--- a/examples/tutorial/plan_3_overwrite.json
+++ b/examples/tutorial/plan_3_overwrite.json
@@ -1,6 +1,7 @@
 {
   "Rank": 0,
   "WorldSize": 1,
+  "Architecture": "CUDA_80",
   "NumProcessors": 108,
   "NumWarpsPerProcessor": 8,
   "TaskInfos": [
@@ -14,14 +15,14 @@
           "Name": "matmul",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
-            {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
+            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
+            {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
           ],
           "WriteTensors": [
-            {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -46,13 +47,13 @@
           "Name": "sigmoid",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "WriteTensors": [
-            {"Id":6,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":6,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {},
           "Config": {
@@ -74,14 +75,14 @@
           "Name": "mul",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
-            {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
+            {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "WriteTensors": [
-            {"Id":8,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":8,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {},
           "Config": {
@@ -103,14 +104,14 @@
           "Name": "matmul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
-            {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
+            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
+            {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
           ],
           "WriteTensors": [
-            {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -135,14 +136,14 @@
           "Name": "mul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
-            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
+            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "WriteTensors": [
-            {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {},
           "Config": {
@@ -164,14 +165,14 @@
           "Name": "matmul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":16,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,8320],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,8320]},
-            {"Id":17,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[4096,8320],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,8320]}
+            {"Id":16,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,8320],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,8320]},
+            {"Id":17,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,8320],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,8320]}
           ],
           "WriteTensors": [
-            {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "ResultTensors": [
-            {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -196,14 +197,14 @@
           "Name": "matmul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":18,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,2688],"Strides":[1,512,11008],"Offsets":[0,0,8320],"PaddedShape":[1,512,2688]},
-            {"Id":19,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[4096,2688],"Strides":[4096,11008],"Offsets":[0,8320],"PaddedShape":[4096,2688]}
+            {"Id":18,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,2688],"Strides":[1,512,11008],"Offsets":[0,0,8320],"PaddedShape":[1,512,2688]},
+            {"Id":19,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,2688],"Strides":[4096,11008],"Offsets":[0,8320],"PaddedShape":[4096,2688]}
           ],
           "WriteTensors": [
-            {"Id":20,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":20,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "ResultTensors": [
-            {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -228,14 +229,14 @@
           "Name": "add_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
-            {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
+            {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "WriteTensors": [
-            {"Id":23,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":23,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "ResultTensors": [
-            {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "Args": {},
           "Config": {
diff --git a/examples/tutorial/plan_tutorial.py b/examples/tutorial/plan_tutorial.py
index 560021522..a2c5e3e57 100644
--- a/examples/tutorial/plan_tutorial.py
+++ b/examples/tutorial/plan_tutorial.py
@@ -339,7 +339,7 @@ def main(plan_path: str):
 
         plan = planner.plan()
         with open("default_plan.json", "w") as f:
-            f.write(plan)
+            f.write(str(plan))
         rt.launch(plan=plan)
 
         # Initialize
@@ -364,7 +364,7 @@ def main(plan_path: str):
         print(f"File {plan_path} does not exist. Exiting...")
         return
     with ark.Runtime.get_runtime() as rt:
-        rt.launch(plan_path=plan_path)
+        rt.launch(plan=ark.Plan.from_file(plan_path))
 
         # Initialize
         InputModule.initialize()
diff --git a/examples/tutorial/planner_tutorial.py b/examples/tutorial/planner_tutorial.py
index 1f6c3ac58..6153aaf8e 100644
--- a/examples/tutorial/planner_tutorial.py
+++ b/examples/tutorial/planner_tutorial.py
@@ -69,14 +69,13 @@ def perf():
 
     shape = (32, 2048, 2048)
 
-    # input = torch.randn(*shape).to("cuda:0")
-    input = ark.tensor(shape)
+    input = torch.randn(*shape).to("cuda:0")
 
-    output = Softmax()(input)
+    output = Softmax()(ark.Tensor.from_torch(input))
 
-    # if torch.allclose(eval(output), F.softmax(input, dim=-1), atol=1e-5):
-    #     print("Correct result")
-    # else:
-    #     print("Incorrect result")
+    if torch.allclose(eval(output), F.softmax(input, dim=-1), atol=1e-5):
+        print("Correct result")
+    else:
+        print("Incorrect result")
 
     print(f"Performance: {(perf() * 1e3):.3f} ms/iter")
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index efb9aea3e..597388e2d 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -18,5 +18,25 @@ FetchContent_MakeAvailable(pybind11)
 
 file(GLOB_RECURSE BIND_SOURCES CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
 pybind11_add_module(ark_py ${BIND_SOURCES})
-set_target_properties(ark_py PROPERTIES OUTPUT_NAME _ark_core)
+# Emit the extension module as <binary dir>/ark/_ark_core.
+set_target_properties(ark_py PROPERTIES OUTPUT_NAME _ark_core LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/ark)
+# Copy the Python package sources next to the built module after each build of ark_py.
+add_custom_command(TARGET ark_py POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_CURRENT_SOURCE_DIR}/ark" "${CMAKE_CURRENT_BINARY_DIR}/ark"
+    VERBATIM
+)
 target_link_libraries(ark_py PRIVATE ark_static)
+target_include_directories(ark_py SYSTEM PRIVATE ${DLPACK_INCLUDE_DIRS})
+target_include_directories(ark_py PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../ark)
+
+if(ARK_USE_CUDA)
+    target_include_directories(ark_py SYSTEM PRIVATE
+        ${CUDAToolkit_INCLUDE_DIRS}
+    )
+endif()
+
+if(ARK_USE_ROCM)
+    target_include_directories(ark_py SYSTEM PRIVATE
+        /opt/rocm/include
+    )
+endif()
diff --git a/python/ark/__init__.py b/python/ark/__init__.py
index b1d0f7873..1aebfa43f 100644
--- a/python/ark/__init__.py
+++ b/python/ark/__init__.py
@@ -1,15 +1,12 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-import sys
 import os
 
 if os.environ.get("ARK_ROOT", None) is None:
     os.environ["ARK_ROOT"] = os.path.abspath(os.path.dirname(__file__))
 
-sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
-
-import _ark_core
+from . import _ark_core
 from .model import Model
 
 
@@ -21,11 +18,6 @@ def version():
     return __version__
 
 
-def srand(seed):
-    """Sets the seed for random number generation."""
-    _ark_core.srand(seed)
-
-
 def set_rank(rank):
     """Sets the rank of the current process."""
     Model.set_rank(rank)
@@ -38,8 +30,8 @@ def set_world_size(world_size):
 
 from .init import init
 from .tensor import Dims, Tensor, Parameter
-from .module import Module
-from .runtime import Runtime
+from .module import Module, RuntimeModule
+from .runtime import *
 from .serialize import save, load
 from .data_type import (
     DataType,
@@ -51,6 +43,7 @@ def set_world_size(world_size):
     uint8,
     byte,
 )
+from .profiler import Profiler
 from .ops import *
 from .planner import *
 from .error import *
diff --git a/python/ark/data_type.py b/python/ark/data_type.py
index fe95d0d88..8ab982106 100644
--- a/python/ark/data_type.py
+++ b/python/ark/data_type.py
@@ -4,16 +4,20 @@
 import numpy
 from . import _ark_core
 
+try:
+    import torch
+except ImportError:
+    from . import torch_mock as torch
 
 _REGISTRY_DATA_TYPE = {
-    "fp32": {"np": numpy.float32},
-    "fp16": {"np": numpy.float16},
-    "bf16": {"np": None},
-    "int32": {"np": numpy.int32},
-    "uint32": {"np": numpy.uint32},
-    "int8": {"np": numpy.int8},
-    "uint8": {"np": numpy.uint8},
-    "byte": {"np": numpy.ubyte},
+    "fp32": {"np": numpy.float32, "torch": torch.float32},
+    "fp16": {"np": numpy.float16, "torch": torch.float16},
+    "bf16": {"np": None, "torch": torch.bfloat16},
+    "int32": {"np": numpy.int32, "torch": torch.int32},
+    "uint32": {"np": numpy.uint32, "torch": None},
+    "int8": {"np": numpy.int8, "torch": torch.int8},
+    "uint8": {"np": numpy.uint8, "torch": torch.uint8},
+    "byte": {"np": numpy.ubyte, "torch": torch.uint8},
 }
 
 
@@ -23,6 +27,7 @@ def __new__(cls, name, bases, attrs):
         if name in _REGISTRY_DATA_TYPE:
             reg = _REGISTRY_DATA_TYPE[name]
             new_class.to_numpy = staticmethod(lambda: reg["np"])
+            new_class.to_torch = staticmethod(lambda: reg["torch"])
             new_class.ctype = staticmethod(
                 lambda: getattr(_ark_core, name.upper())
             )
@@ -59,6 +64,28 @@ def from_numpy(np_type: numpy.dtype) -> "DataType":
             f" to ark data type."
         )
 
+    @staticmethod
+    def from_torch(torch_type: torch.dtype) -> "DataType":
+        """
+        Return the ark data type registered for a torch data type.
+
+        Parameters:
+            torch_type (torch.dtype): The torch data type.
+
+        Returns:
+            DataType: The first registry entry whose torch type matches.
+
+        Raises:
+            ValueError: If no entry maps to `torch_type` (None never matches).
+        """
+        for type_name, reg in _REGISTRY_DATA_TYPE.items():
+            if reg["torch"] is not None and reg["torch"] == torch_type:
+                return DataType.from_name(type_name)
+        raise ValueError(
+            f"Undefined conversion from torch data type {torch_type}"
+            f" to ark data type."
+        )
+
     @staticmethod
     def from_name(type_name: str) -> "DataType":
         """
@@ -104,6 +131,16 @@ def to_numpy() -> numpy.dtype:
         """
         ...
 
+    @staticmethod
+    def to_torch() -> torch.dtype:
+        """
+        Return the corresponding torch data type.
+
+        Returns:
+            torch.dtype: The corresponding torch data type.
+        """
+        ...
+
     @staticmethod
     def ctype() -> _ark_core._DataType:
         """
diff --git a/python/ark/error.py b/python/ark/error.py
index 4ffe6a3f8..cec8ab137 100644
--- a/python/ark/error.py
+++ b/python/ark/error.py
@@ -1,14 +1,14 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-from _ark_core import _BaseError as BaseError
-from _ark_core import _InternalError as InternalError
-from _ark_core import _InvalidUsageError as InvalidUsageError
-from _ark_core import _ModelError as ModelError
-from _ark_core import _PlanError as PlanError
-from _ark_core import _UnsupportedError as UnsupportedError
-from _ark_core import _SystemError as SystemError
-from _ark_core import _GpuError as GpuError
+from ._ark_core import _BaseError as BaseError
+from ._ark_core import _InternalError as InternalError
+from ._ark_core import _InvalidUsageError as InvalidUsageError
+from ._ark_core import _ModelError as ModelError
+from ._ark_core import _PlanError as PlanError
+from ._ark_core import _UnsupportedError as UnsupportedError
+from ._ark_core import _SystemError as SystemError
+from ._ark_core import _GpuError as GpuError
 
 __all__ = [
     "BaseError",
diff --git a/python/ark/init.py b/python/ark/init.py
index be71e8e02..7daa0771b 100644
--- a/python/ark/init.py
+++ b/python/ark/init.py
@@ -1,7 +1,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-import _ark_core
+from . import _ark_core
 from .model import Model
 from .runtime import _RuntimeState
 
@@ -9,7 +9,7 @@
 def init():
     """Initializes ARK."""
     Model.reset()
-    if _RuntimeState.executor is not None:
-        if not _RuntimeState.executor.destroyed():
-            _RuntimeState.executor.destroy()
+    if _RuntimeState.runtime is not None:
+        del _RuntimeState.runtime
+        _RuntimeState.runtime = None
     _ark_core.init()
diff --git a/python/ark/model.py b/python/ark/model.py
index e6208fc16..87af88f49 100644
--- a/python/ark/model.py
+++ b/python/ark/model.py
@@ -2,7 +2,7 @@
 # Licensed under the MIT license.
 
 from typing import NewType
-from _ark_core import _Model
+from ._ark_core import _Model
 
 _ModelState = NewType("_ModelState", None)
 
diff --git a/python/ark/module.py b/python/ark/module.py
index 62b941281..4809ea432 100644
--- a/python/ark/module.py
+++ b/python/ark/module.py
@@ -3,8 +3,20 @@
 
 import logging
 import numpy as np
-from typing import Any, Dict
+from typing import Any, Dict, Union
 from .tensor import Parameter
+from .runtime import Runtime
+from .model import Model
+
+try:
+    import torch
+    from .ops import placeholder
+
+    _no_torch = False
+except ImportError:
+    from . import torch_mock as torch
+
+    _no_torch = True
 
 
 class Module:
@@ -13,6 +25,7 @@ class Module:
     """
 
     def __init__(self):
+        super().__init__()
         # The submodules of the module.
         self.sub_modules: dict[str, "Module"] = dict()
         # The parameters of the module.
@@ -22,12 +35,16 @@ def __setattr__(self, __name: str, __value: Any) -> None:
         """
         When setting an attribute, if the attribute is a Module, add it to
         the sub_modules. If the attribute is a Tensor and this Tensor is a
-        parameter, add it to the parameters.
+        parameter, add it to the parameters. If the attribute is a
+        torch.nn.Parameter, convert it to an ARK Parameter before adding.
         """
         if isinstance(__value, Module):
             self.register_module(__name, __value)
         elif isinstance(__value, Parameter):
             self.register_parameter(__name, __value)
+        elif not _no_torch and isinstance(__value, torch.nn.Parameter):
+            __value = Parameter(placeholder(torch_tensor=__value), True)
+            self.register_parameter(__name, __value)
         super().__setattr__(__name, __value)
 
     def __call__(self, *args: Any, **kwargs: Any):
@@ -57,7 +74,10 @@ def params_dict(self, prefix="") -> Dict[str, Parameter]:
         return params_dict
 
     def load_state_dict(
-        self, state_dict: Dict[str, np.ndarray], prefix: str = ""
+        self,
+        state_dict: Dict[str, Union[np.ndarray, torch.Tensor]],
+        prefix: str = "",
+        stream: int = 0,
     ):
         """
         Loads a model from a state_dict and copy the parameters to the device GPU.
@@ -68,21 +88,125 @@ def load_state_dict(
         all_keys = set(state_dict.keys())
         pd = self.params_dict(prefix)
         for name, param in pd.items():
-            param.from_numpy(state_dict[name])
+            data = state_dict.get(name, None)
+            if data is None:
+                continue
+            param.copy(data, stream=stream)
             all_keys.remove(name)
         if all_keys:
             logging.warning(
                 f"{len(all_keys)} unused parameter(s) in state_dict"
             )
 
-    def state_dict(self, prefix: str = "") -> Dict[str, np.ndarray]:
+    def state_dict(
+        self,
+        prefix: str = "",
+        mode: str = "numpy",
+        stream: int = 0,
+    ) -> Dict[str, Union[np.ndarray, torch.Tensor]]:
         """
         Copies the parameters from the device GPU to the host and saves the
         model to a state_dict.
         Must be called after the executor is launched.
         """
-        return {k: v.to_numpy() for k, v in self.params_dict(prefix).items()}
+        if mode == "numpy":
+            return {
+                k: v.to_numpy(stream=stream)
+                for k, v in self.params_dict(prefix).items()
+            }
+        elif mode == "torch":
+            # `Tensor.to_torch()` takes no `stream` argument (it returns a
+            # view that shares device memory), so `stream` is unused here.
+            params = self.params_dict(prefix)
+            return {k: v.to_torch() for k, v in params.items()}
+        raise ValueError(f"Unsupported mode: {mode}")
 
     def forward(self, *args: Any, **kwargs: Any) -> Any: ...
 
     def backward(self, *args: Any, **kwargs: Any) -> Any: ...
+
+    def initialize(self):
+        """Run `initialize()` on own parameters, then on each sub-module."""
+        children = (*self.parameters.values(), *self.sub_modules.values())
+        for child in children:
+            child.initialize()
+
+
+class _Function(torch.autograd.Function):
+    """
+    Facilitates the integration of ARK modules with PyTorch's
+    autograd system by defining custom forward and backward passes that
+    utilize the user's defined ARK module.
+    """
+
+    @staticmethod
+    def forward(ctx, ark_module, *args, **kwargs):
+        """
+        Returns a PyTorch tensor that is the result
+        of the forward pass of the ARK module.
+        """
+        Model.reset()
+        ctx.ark_module = ark_module
+        input_args, input_kwargs = [], {}
+        input_requires_grad = 0
+        for arg in args:
+            if isinstance(arg, torch.Tensor):
+                input_args.append(placeholder(torch_tensor=arg))
+                if arg.requires_grad:
+                    input_requires_grad += 1
+            else:
+                input_args.append(arg)
+        for k, v in kwargs.items():
+            if isinstance(v, torch.Tensor):
+                input_kwargs[k] = placeholder(torch_tensor=v)
+                if v.requires_grad:
+                    input_requires_grad += 1
+            else:
+                input_kwargs[k] = v
+        ctx.num_inp_grad = input_requires_grad
+        output = ark_module.forward(*input_args, **input_kwargs)
+        rt = Runtime.get_runtime()
+        rt.launch()
+        rt.run()
+        rt.stop()
+        output = output.to_torch()
+        return output
+
+    @staticmethod
+    def backward(ctx, *grad_outputs):
+        """
+        Converts the gradient outputs to ARK format, computes the gradients for the input
+        and parameters using the ARK module backwards pass, and updates the gradients of the corresponding
+        PyTorch parameters.
+        """
+        Model.reset()
+        ark_grad_outputs = [placeholder(torch_tensor=grad) for grad in grad_outputs]
+        grads = ctx.ark_module.backward(*ark_grad_outputs)
+        grad_inputs, grad_weights = (
+            grads[: ctx.num_inp_grad],
+            grads[ctx.num_inp_grad :],
+        )
+        params_dict = ctx.ark_module.params_dict()
+        rt = Runtime.get_runtime()
+        rt.launch()
+        rt.run()
+        rt.stop()
+        grad_inputs = [grad.to_torch() for grad in grad_inputs]
+        for _, param in params_dict.items():
+            if param.staged_tensor is not None:
+                pytorch_grad = param.staged_tensor.to_torch()
+                param.torch_param.grad = pytorch_grad
+        return (None, *grad_inputs)
+
+
+class RuntimeModule(torch.nn.Module):
+    """
+    Wraps an ARK module to be used as a PyTorch autograd function.
+    """
+
+    def __init__(self, ark_module):
+        super().__init__()
+        self.ark_module = ark_module
+
+    def forward(self, *args, **kwargs):
+        return _Function.apply(self.ark_module, *args, **kwargs)
diff --git a/python/ark/ops.py b/python/ark/ops.py
index 484e248ca..6f937a03e 100644
--- a/python/ark/ops.py
+++ b/python/ark/ops.py
@@ -1,51 +1,24 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-from typing import List, Iterable, Union
+from typing import List, Iterable, Union, Optional
 
-from .tensor import Dims, Tensor, Parameter, NullTensor
+from .tensor import Dims, Tensor, Parameter, NullTensor, _cpp_tensor
 from .data_type import DataType, fp32
 from .model import Model
 
+try:
+    import torch
 
-def _is_list_or_tuple(obj):
-    return isinstance(obj, list) or isinstance(obj, tuple)
+    _no_torch = False
+except ImportError:
+    from . import torch_mock as torch
 
+    _no_torch = True
 
-def _tensor(
-    shape: Iterable[int],
-    dtype: DataType = fp32,
-    strides: Iterable[int] = [],
-    offsets: Iterable[int] = [],
-    padded_shape: Iterable[int] = [],
-    rank: int = -1,
-    name: str = "",
-) -> Tensor:
-    if not _is_list_or_tuple(shape):
-        raise ValueError("shape should be a list or tuple of integers")
-    if not _is_list_or_tuple(strides):
-        raise ValueError("strides should be a list or tuple of integers")
-    if not _is_list_or_tuple(offsets):
-        raise ValueError("offsets should be a list or tuple of integers")
-    if not _is_list_or_tuple(padded_shape):
-        raise ValueError("padded_shape should be a list or tuple of integers")
-    # only support tensors with up to 4 dimensions
-    if (
-        len(shape) > 4
-        or len(strides) > 4
-        or len(offsets) > 4
-        or len(padded_shape) > 4
-    ):
-        raise ValueError("Only support tensors with up to 4 dimensions")
-    return Model.get_model().tensor(
-        Dims(shape),
-        dtype.ctype(),
-        Dims(strides),
-        Dims(offsets),
-        Dims(padded_shape),
-        rank,
-        name,
-    )
+
+def _is_list_or_tuple(obj):
+    return isinstance(obj, list) or isinstance(obj, tuple)
 
 
 def add(
@@ -107,7 +80,9 @@ def constant(
 
 
 def copy(
-    input: Union[Tensor, float], output: Tensor = NullTensor, name: str = "copy"
+    input: Union[Tensor, float],
+    output: Tensor = NullTensor,
+    name: str = "copy",
 ) -> Tensor:
     """Data caopy."""
     if output is not NullTensor:
@@ -151,7 +126,9 @@ def embedding(
 
 
 def exp(
-    input: Tensor, output: Tensor = NullTensor, name: str = "exp"
+    input: Tensor,
+    output: Tensor = NullTensor,
+    name: str = "exp",
 ) -> Tensor:
     """
     Calculates the exponential of the `input` tensor, element-wise.
@@ -164,7 +141,9 @@ def exp(
 
 
 def gelu(
-    input: Tensor, output: Tensor = NullTensor, name: str = "gelu"
+    input: Tensor,
+    output: Tensor = NullTensor,
+    name: str = "gelu",
 ) -> Tensor:
     """
     Applies the Gaussian Error Linear Unit (GELU) activation
@@ -252,6 +231,37 @@ def noop(input: Tensor, name: str = "noop"):
     Model.get_model().noop(input._tensor, name)
 
 
+def placeholder(
+    shape: Iterable[int] = [],
+    dtype: DataType = fp32,
+    strides: Iterable[int] = [],
+    offsets: Iterable[int] = [],
+    padded_shape: Iterable[int] = [],
+    rank: int = -1,
+    data: int = 0,
+    name: str = "placeholder",
+    torch_tensor: Optional["torch.Tensor"] = None,
+) -> Tensor:
+    """
+    Construct a tensor backed by externally managed memory.
+
+    Args:
+        shape, dtype, strides, offsets, padded_shape, rank, name: forwarded
+            unchanged to `_cpp_tensor()`.
+        data: address of the external buffer, forwarded to `_cpp_tensor()`.
+        torch_tensor: if given, the result is `Tensor.from_torch(torch_tensor)`
+            and all other arguments are ignored.
+    """
+    if torch_tensor is not None:
+        return Tensor.from_torch(torch_tensor)
+    return Tensor(
+        _cpp_tensor(
+            shape, dtype, strides, offsets, padded_shape, rank, data, name
+        )
+    )
+
+
 def reduce_max(
     input: Tensor,
     axis: int,
@@ -321,7 +318,9 @@ def reduce_sum(
 
 
 def relu(
-    input: Tensor, output: Tensor = NullTensor, name: str = "relu"
+    input: Tensor,
+    output: Tensor = NullTensor,
+    name: str = "relu",
 ) -> Tensor:
     """
     Applies the ReLU activation function to the `input` tensor,
@@ -382,7 +381,9 @@ def rope(
 
 
 def rsqrt(
-    input: Tensor, output: Tensor = NullTensor, name: str = "rsqrt"
+    input: Tensor,
+    output: Tensor = NullTensor,
+    name: str = "rsqrt",
 ) -> Tensor:
     """
     Calculates the square root of the `input` tensor, element-wise.
@@ -413,7 +414,9 @@ def sharding(
 
 
 def sigmoid(
-    input: Tensor, output: Tensor = NullTensor, name: str = "sigmoid"
+    input: Tensor,
+    output: Tensor = NullTensor,
+    name: str = "sigmoid",
 ) -> Tensor:
     """
     Applies the Sigmoid activation function to the `input` tensor,
@@ -427,7 +430,9 @@ def sigmoid(
 
 
 def sqrt(
-    input: Tensor, output: Tensor = NullTensor, name: str = "sqrt"
+    input: Tensor,
+    output: Tensor = NullTensor,
+    name: str = "sqrt",
 ) -> Tensor:
     """
     Calculates the square root of the `input` tensor, element-wise.
@@ -474,7 +479,9 @@ def tensor(
     tensor = ark.tensor([1, 2], dtype=ark.fp16)
     """
     return Tensor(
-        _tensor(shape, dtype, strides, offsets, padded_shape, rank, name)
+        _cpp_tensor(
+            shape, dtype, strides, offsets, padded_shape, rank, None, name
+        )
     )
 
 
@@ -540,7 +547,10 @@ def parameter(
     Construct a parameter with given shape and data type.
     """
     return Parameter(
-        _tensor(shape, dtype, strides, offsets, padded_shape, name)
+        # Pass `name` by keyword: the 6th and 7th positional parameters of
+        # `_cpp_tensor()` are `rank` and `data`, not `name`.
+        _cpp_tensor(shape, dtype, strides, offsets, padded_shape, name=name),
+        from_torch=False,
     )
 
 
@@ -574,7 +581,9 @@ def layernorm(
 
 
 def zeros(
-    shape: Iterable[int], dtype: DataType = fp32, name: str = "zeros"
+    shape: Iterable[int],
+    dtype: DataType = fp32,
+    name: str = "zeros",
 ) -> Tensor:
     """Zeros."""
     return Tensor(
@@ -614,13 +623,15 @@ def all_reduce(
 
 __all__ = [
     "tensor",
+    "placeholder",
     "parameter",
     "reshape",
     "identity",
     "sharding",
-    "reduce_sum",
-    "reduce_mean",
+    "noop",
     "reduce_max",
+    "reduce_mean",
+    "reduce_sum",
     "layernorm",
     "softmax",
     "transpose",
diff --git a/python/ark/planner.py b/python/ark/planner.py
index e7eb2e7ed..e5291bbce 100644
--- a/python/ark/planner.py
+++ b/python/ark/planner.py
@@ -5,7 +5,7 @@
 import json
 from typing import Callable, Dict, List, Any
 
-from _ark_core import _Planner, _PlannerContext
+from ._ark_core import _Planner, _PlannerContext
 from .model import Model
 
 
diff --git a/python/ark/profiler.py b/python/ark/profiler.py
new file mode 100644
index 000000000..e47f5b7aa
--- /dev/null
+++ b/python/ark/profiler.py
@@ -0,0 +1,51 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import sys
+import time
+
+from .runtime import Runtime
+from .planner import Plan
+
+
+def timeit(plan: Plan, iter: int, loop_mode: bool):
+    """Return the mean seconds per iteration over `iter` runs of `plan`."""
+    with Runtime() as rt:
+        rt.launch(plan=plan, loop_mode=loop_mode)
+        start_time = time.perf_counter()  # monotonic, unlike time.time()
+        rt.run(iter=iter)
+        return (time.perf_counter() - start_time) / iter
+
+
+class Profiler:
+    def __init__(self, plan: Plan):
+        self.plan = plan
+
+    def run(
+        self,
+        iter: int = 1000,
+        loop_mode: bool = True,
+        profile_processor_groups: bool = False,
+    ):
+        """Write end-to-end (and optionally per-group) latency to stderr."""
+        end_to_end = timeit(self.plan, iter, loop_mode)
+        sys.stderr.write(f"End-to-end: {end_to_end:.6f} seconds/iter\n")
+        if not profile_processor_groups:
+            return
+        plan = self.plan
+        new_plan = {
+            "Rank": plan.rank,
+            "WorldSize": plan.world_size,
+            "Architecture": plan.architecture,
+            "NumProcessors": plan.num_processors,
+            "NumWarpsPerProcessor": plan.num_warps_per_processor,
+            "TaskInfos": plan.task_infos,
+            "ProcessorGroups": [None],
+        }
+        # Re-time the plan with a single processor group swapped in each time.
+        for i, group in enumerate(plan.processor_groups):
+            new_plan["ProcessorGroups"][0] = group
+            lat_per_iter = timeit(Plan(new_plan), iter, loop_mode)
+            sys.stderr.write(
+                f"Processor group {i}: {lat_per_iter:.6f} seconds/iter\n"
+            )
diff --git a/python/ark/runtime.py b/python/ark/runtime.py
index 495fc1c24..1625ca71a 100644
--- a/python/ark/runtime.py
+++ b/python/ark/runtime.py
@@ -4,8 +4,17 @@
 import logging
 from enum import Enum
 
-from _ark_core import _Executor
+from ._ark_core import _Executor
 from .planner import Planner, Plan
+from typing import Dict
+try:
+    import torch
+
+    _no_torch = False
+except ImportError:
+    from . import torch_mock as torch
+
+    _no_torch = True
 
 
 class _RuntimeState:
@@ -14,11 +23,6 @@ class _RuntimeState:
     """
 
     runtime = None
-    executor = None
-
-
-class Executor(_Executor):
-    pass
 
 
 class Runtime:
@@ -35,23 +39,22 @@ class State(Enum):
         LaunchedNotRunning = 1
         Running = 2
 
+    def __init__(self):
+        self.executor: _Executor = _Executor()
+        self.state: Runtime.State = Runtime.State.Init
+        self.loop_mode = True
+        _RuntimeState.runtime = self
+
     @staticmethod
     def get_runtime() -> "Runtime":
         """
         Get the runtime.
+        If the runtime does not exist, create a new runtime.
         """
         if _RuntimeState.runtime is None:
             _RuntimeState.runtime = Runtime()
         return _RuntimeState.runtime
 
-    def __init__(self):
-        self.executor: Executor = None
-        self.state: Runtime.State = Runtime.State.Init
-        _RuntimeState.runtime = self
-
-    def __del__(self):
-        self.reset()
-
     def __enter__(self):
         return self
 
@@ -79,34 +82,36 @@ def launch(
         device_id: int = 0,
         stream: int = 0,
         loop_mode: bool = True,
+        tensor_mappings: Dict = {}
     ):
         """
         Create an executor and schedule the ARK model. The scheduler will generate
         the CUDA kernels. The GPU context and the connection between GPUs will be
         initialized. The executor will compile the cuda kernels and launch the ARK runtime.
         """
-        if self.launched():
-            logging.warn("Runtime is already launched, skip launching")
-            return
+        if device_id < 0:
+            logging.error(f"Invalid device_id: {device_id}")
+            raise ValueError(f"Invalid device_id: {device_id}")
         plan = Planner(device_id).plan() if plan is None else plan
-        # If the RuntimeState is init, we need to create a new executor and
-        # compile the kernels
-        if self.state == Runtime.State.Init:
-            if _RuntimeState.executor is not None:
-                if not _RuntimeState.executor.destroyed():
-                    logging.warn("Destroying an old executor")
-                    _RuntimeState.executor.destroy()
-
-            _RuntimeState.executor = Executor(
-                device_id,
-                stream,
-                "ArkRuntime",
-                str(plan),
-                loop_mode,
-            )
-            self.executor = _RuntimeState.executor
-            self.executor.compile()
-        self.executor.launch()
+        plan_str = str(plan)
+        if self.launched():
+            # Stop the current running model
+            self.stop()
+        
+        # Rebind to a fresh dict: never mutate the caller's mapping or `{}`.
+        for torch_tensor in tensor_mappings.values():
+            if not isinstance(torch_tensor, torch.Tensor):
+                raise ValueError("Must bind PyTorch tensor")
+        tensor_mappings = {k: v.data_ptr() for k, v in tensor_mappings.items()}
+
+        # Recompile if the previous launch was not compiled with the same info
+        # or if this is the first launch
+        if (
+            plan_str != self.executor.plan()
+            or device_id != self.executor.device_id()
+        ):
+            self.executor.compile(plan_str, device_id, tensor_mappings)
+        self.executor.launch(stream, loop_mode)
         self.state = Runtime.State.LaunchedNotRunning
 
     def run(self, iter=1, non_blocking=False):
@@ -114,8 +119,8 @@ def run(self, iter=1, non_blocking=False):
         Run the ARK program for iter iterations and wait for the kernel to finish.
         """
         if self.state != Runtime.State.LaunchedNotRunning:
-            logging.error("ARK runtime is not launched")
-            raise RuntimeError("ARK runtime is not launched")
+            logging.error(f"ARK runtime is not launched")
+            raise RuntimeError(f"ARK runtime is not launched")
         self.state = Runtime.State.Running
         self.executor.run(iter)
         if not non_blocking:
@@ -135,7 +140,7 @@ def wait(self):
         Wait for the kernel to finish.
         """
         if self.state != Runtime.State.Running:
-            logging.warn("ARK runtime is not running, skip waiting")
+            logging.warning(f"ARK runtime is not running, skip waiting")
             return
         self.executor.wait()
         self.state = Runtime.State.LaunchedNotRunning
@@ -146,7 +151,7 @@ def stop(self) -> float:
         Once this is called, we need to call `launch()` again to run the model again.
         """
         if not self.launched():
-            logging.warn("ARK runtime is never launched, skip stopping")
+            logging.warning(f"ARK runtime is never launched, skip stopping")
             return
         elapsed = self.executor.stop()
         self.state = Runtime.State.LaunchedNotRunning
@@ -158,8 +163,9 @@ def reset(self):
         """
         if self.launched():
             self.stop()
-        if self.executor is not None:
-            if not self.executor.destroyed():
-                self.executor.destroy()
-            self.executor = None
+        self.executor.destroy()
+        self.executor = _Executor()
         self.state = Runtime.State.Init
+
+
+__all__ = ["Runtime"]
diff --git a/python/ark/tensor.py b/python/ark/tensor.py
index d69f2aabc..fb4544cc5 100644
--- a/python/ark/tensor.py
+++ b/python/ark/tensor.py
@@ -2,12 +2,21 @@
 # Licensed under the MIT license.
 
 import numpy as np
-from typing import List
+from typing import Callable, Iterable, List, Union, Type
 
-from _ark_core import _Dims, _Tensor, _NullTensor
-from .data_type import DataType
+from ._ark_core import _Dims, _Tensor, _NullTensor
+from .data_type import DataType, fp32
 from .runtime import Runtime
 
+try:
+    import torch
+
+    _no_torch = False
+except ImportError:
+    from . import torch_mock as torch
+
+    _no_torch = True
+
 NullTensor = _NullTensor
 
 
@@ -15,14 +24,35 @@ class Dims(_Dims):
     pass
 
 
+Initializer = Callable[[], Union[torch.Tensor, np.ndarray]]
+
+
 class Tensor:
-    def __init__(self, _tensor: _Tensor):
+    def __init__(
+        self,
+        _tensor: _Tensor,
+        initializer: Initializer = None,
+        requires_grad: bool = False,
+    ):
         """
         Initializes a new instance of the Tensor class.
         Args:
             _tensor (_ark_core._Tensor): The underlying _Tensor object.
+            initializer (Initializer): The initializer for the Tensor.
+            requires_grad (bool): Whether the tensor requires gradient. Defaults to False.
         """
         self._tensor = _tensor
+        self.initializer: Initializer = initializer
+        self.requires_grad = requires_grad
+    
+    def __hash__(self):
+        return self._tensor.id()
+    
+    def __eq__(self, other):
+        if not isinstance(other, Tensor):
+            return False
+        return self._tensor.id() == other._tensor.id()
+
 
     def shape(self) -> List[int]:
         """
@@ -57,10 +87,17 @@ def to_numpy(
         an empty numpy array without the data buffer will be returned.
         """
         np_type = self.dtype().to_numpy()
+        if np_type is None:
+            raise ValueError(
+                f"Tensor data type {self.dtype().__name__} is not supported by numpy."
+            )
         rt = Runtime.get_runtime()
         if not rt.launched():
-            return np.ndarray(self.shape(), dtype=np_type, buffer=None)
-        if ndarray is None:
+            raise RuntimeError(
+                "Tensor is not allocated yet. `Tensor.to_numpy()` is "
+                "usable only after you call `Runtime.launch()`."
+            )
+        elif ndarray is None:
             ndarray = np.zeros(self.shape(), dtype=np_type)
         elif not ndarray.flags["C_CONTIGUOUS"]:
             raise ValueError("ndarray is not contiguous in memory")
@@ -91,14 +128,211 @@ def from_numpy(self, ndarray: np.ndarray, stream: int = 0) -> "Tensor":
         rt.executor.tensor_write(self._tensor, ndarray, stream)
         return self
 
+    def to_dlpack(self):
+        """
+        Returns a DLPack tensor that shares the same memory with the device tensor.
+        """
+        rt = Runtime.get_runtime()
+        if not rt.launched():
+            raise RuntimeError(
+                "Tensor is not allocated yet. `Tensor.to_dlpack()` is "
+                "usable only after you call `Runtime.launch()`."
+            )
+        return rt.executor.tensor_to_dlpack(self._tensor)
+
+    @staticmethod
+    def from_dlpack(ext_tensor) -> "Tensor":
+        """
+        Copies the tensor from a DLPack tensor to the device.
+        """
+        # return Tensor(_Tensor(ext_tensor))
+        raise NotImplementedError("from_dlpack is not implemented yet")
+
+    def to_torch(self) -> torch.Tensor:
+        """
+        Returns a torch tensor that shares the same memory with the device tensor.
+        """
+        if _no_torch:
+            raise ImportError("torch is not available")
+        dl_capsule = self.to_dlpack()
+        torch_view = torch.utils.dlpack.from_dlpack(dl_capsule)
+        # Keep dl_capsule alive not to free the memory
+        torch_view.__ark_buffer__ = dl_capsule
+        return torch_view
+
+    @staticmethod
+    def from_torch(tensor: torch.Tensor) -> "Tensor":
+        """
+        Returns an ARK tensor that shares the same memory with the torch tensor.
+        """
+        if _no_torch:
+            raise ImportError("torch is not available")
+        elif not tensor.is_contiguous():
+            raise ValueError("Torch tensor must be contiguous.")
+        elif tensor.device.type == "cpu":
+            raise ValueError("Torch tensor must be on a device.")
+        # TODO: support strides and offsets
+        ark_tensor = Tensor(
+            _cpp_tensor(
+                shape=list(tensor.shape),
+                dtype=DataType.from_torch(tensor.dtype),
+                data=tensor.data_ptr(),
+            )
+        )
+        # Share ownership of the memory with the torch tensor
+        ark_tensor.__torch_buffer__ = tensor
+        return ark_tensor
+
+    def copy(
+        self, data: Union[np.ndarray, torch.Tensor], stream: int = 0
+    ) -> "Tensor":
+        """
+        Copies data into this tensor. The data type may differ,
+        but the size must match.
+        """
+        rt = Runtime.get_runtime()
+        if not rt.launched():
+            raise RuntimeError(
+                "Tensor is not allocated yet. `Tensor.copy()` is "
+                "usable only after you call `Runtime.launch()`."
+            )
+        tensor_bytes = self.nelems() * self.dtype().element_size()
+        if isinstance(data, torch.Tensor):
+            if not data.is_contiguous():
+                data = data.contiguous()
+            if data.numel() * data.element_size() != tensor_bytes:
+                raise ValueError("data size does not match the tensor")
+            rt.executor.tensor_write(
+                self._tensor,
+                data.data_ptr(),
+                tensor_bytes,
+                stream,
+                data.device.type == "cuda",
+            )
+            data.requires_grad = self.requires_grad
+            if isinstance(self, Parameter):
+                self.torch_param = data
+        elif isinstance(data, np.ndarray):
+            if not data.flags["C_CONTIGUOUS"]:
+                data = np.ascontiguousarray(data)
+            if data.nbytes != tensor_bytes:
+                raise ValueError("data size does not match the tensor")
+            rt.executor.tensor_write(self._tensor, data, stream)
+        else:
+            raise ValueError("data must be a numpy array or a torch tensor")
+        return self
+
+    def initialize(self) -> "Tensor":
+        """
+        Initializes the tensor.
+        """
+        if self.initializer is not None:
+            data = self.initializer()
+            self.copy(data)
+        return self
+
 
 class Parameter(Tensor):
     """
     A tensor as a parameter.
     """
 
-    def __init__(self, _tensor: _Tensor):
+    def __init__(
+        self,
+        tensor: _Tensor,
+        from_torch: bool = False,
+    ):
         """
         Initializes a new instance of the Parameter class.
+        Args:
+            tensor: The underlying _ark_core._Tensor object, or, when
+                `from_torch` is set and torch is available, an object that
+                exposes `_tensor` and `requires_grad`.
+            from_torch (bool): Indicates if the Parameter is tied to a
+                torch.nn.Parameter. Defaults to False.
+        Raises:
+            TypeError: if `tensor` is not a _Tensor in the non-torch path.
+        """
+        self.staged_tensor = None
+        if not _no_torch and from_torch:
+            self.torch_param = tensor
+            Tensor.__init__(
+                self, tensor._tensor, requires_grad=tensor.requires_grad
+            )
+        elif isinstance(tensor, _Tensor):
+            self.torch_param = None
+            Tensor.__init__(self, tensor, requires_grad=False)
+        else:
+            raise TypeError(
+                "tensor must be an ARK tensor or a torch.nn.Parameter"
+            )
+
+    def update_gradient(self, ark_tensor: Tensor):
         """
-        super().__init__(_tensor)
+        Stages an ARK tensor to be used for updating the gradient of its associated parameter.
+        """
+        if _no_torch:
+            raise ImportError("torch is not available")
+        if self.torch_param is None:
+            raise ValueError(
+                "there is no PyTorch parameter associated with this ARK parameter"
+            )
+        if not self.torch_param.requires_grad:
+            raise ValueError("parameter does not require gradient updates")
+        if ark_tensor is None or not isinstance(ark_tensor, Tensor):
+            raise ValueError("cannot use non-ARK tensor to update ARK gradient")
+        self.staged_tensor = ark_tensor
+
+
+def _is_list_or_tuple(obj):
+    return isinstance(obj, list) or isinstance(obj, tuple)
+
+
+def _cpp_tensor(
+    shape: Iterable[int],
+    dtype: DataType = fp32,
+    strides: Iterable[int] = [],
+    offsets: Iterable[int] = [],
+    padded_shape: Iterable[int] = [],
+    rank: int = -1,
+    data: int = None,
+    name: str = "",
+) -> _Tensor:
+    """
+    Creates a core `_Tensor` in the model returned by `Model.get_model()`.
+
+    Calls the model's `placeholder()` with the external address `data` when
+    `data` is not None, and the model's `tensor()` otherwise.
+
+    Raises:
+        ValueError: if `shape`, `strides`, `offsets`, or `padded_shape` is
+            not a list/tuple, or has more than 4 dimensions.
+    """
+    # `Model` is not imported at module level in this file; import it here.
+    from .model import Model
+    dims = {
+        "shape": shape,
+        "strides": strides,
+        "offsets": offsets,
+        "padded_shape": padded_shape,
+    }
+    for dims_name, dims_value in dims.items():
+        if not _is_list_or_tuple(dims_value):
+            raise ValueError(
+                f"{dims_name} should be a list or tuple of integers"
+            )
+    # only support tensors with up to 4 dimensions
+    if any(len(dims_value) > 4 for dims_value in dims.values()):
+        raise ValueError("Only support tensors with up to 4 dimensions")
+    model = Model.get_model()
+    args = [
+        Dims(shape),
+        dtype.ctype(),
+        Dims(strides),
+        Dims(offsets),
+        Dims(padded_shape),
+        rank,
+    ]
+    if data is not None:
+        return model.placeholder(*args, data, name)
+    return model.tensor(*args, name)
diff --git a/python/ark/torch_mock.py b/python/ark/torch_mock.py
new file mode 100644
index 000000000..7a7de0ae6
--- /dev/null
+++ b/python/ark/torch_mock.py
@@ -0,0 +1,43 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+
+class dtype: ...
+
+
+class float32: ...
+
+
+class float16: ...
+
+
+class bfloat16: ...
+
+
+class int32: ...
+
+
+class int8: ...
+
+
+class uint8: ...
+
+
+class ubyte: ...
+
+
+class Tensor: ...
+
+
+class nn:
+
+    class Module: ...
+
+    class Parameter: ...
+
+
+class autograd:
+    class Function:
+        # Mirrors torch.autograd.Function.apply, which is called on the class.
+        @classmethod
+        def apply(cls, *args, **kwargs): ...
diff --git a/python/executor_py.cpp b/python/executor_py.cpp
index b1e468608..08fc94883 100644
--- a/python/executor_py.cpp
+++ b/python/executor_py.cpp
@@ -1,12 +1,17 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
+#include <dlpack/dlpack.h>
 #include <pybind11/operators.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
 #include <ark/executor.hpp>
 #include <ark/model.hpp>
+#include <unordered_map>
+
+#include "gpu/gpu_memory.hpp"
+#include "logging.hpp"
 
 namespace py = pybind11;
 
@@ -40,31 +45,161 @@ static void tensor_read(ark::Executor *exe, const ark::Tensor &tensor,
                      reinterpret_cast<ark::Stream>(stream), is_d2d);
 }
 
+// Translates an ARK element type into the equivalent DLPack descriptor.
+// A type with no mapping below is reported through ERR as ark::InternalError.
+static DLDataType to_dl_dtype(const ark::DataType &ark_dtype) {
+    DLDataType dl_dtype;
+    dl_dtype.lanes = 1;  // every supported type is exported with one lane
+    // Fills in the two fields that differ between the supported types.
+    auto assign = [&dl_dtype](DLDataTypeCode code, uint8_t bits) {
+        dl_dtype.code = code;
+        dl_dtype.bits = bits;
+    };
+    if (ark_dtype == ark::FP32) {
+        assign(kDLFloat, 32);
+    } else if (ark_dtype == ark::FP16) {
+        assign(kDLFloat, 16);
+    } else if (ark_dtype == ark::BF16) {
+        assign(kDLBfloat, 16);
+    } else if (ark_dtype == ark::INT32) {
+        assign(kDLInt, 32);
+    } else if (ark_dtype == ark::UINT32) {
+        assign(kDLUInt, 32);
+    } else if (ark_dtype == ark::INT8) {
+        assign(kDLInt, 8);
+    } else if (ark_dtype == ark::UINT8) {
+        assign(kDLUInt, 8);
+    } else if (ark_dtype == ark::BYTE) {
+        // BYTE is exported as an unsigned 8-bit integer, same as UINT8.
+        assign(kDLUInt, 8);
+    } else {
+        ERR(ark::InternalError, "unexpected");
+    }
+    return dl_dtype;
+}
+
+static DLDeviceType get_device_type() {  // backend chosen at compile time
+#if defined(ARK_CUDA)
+    return kDLCUDA;
+#elif defined(ARK_ROCM)
+    return kDLROCM;
+#else
+    return kDLCPU;  // neither ARK_CUDA nor ARK_ROCM is defined
+#endif
+}
+
+namespace ark {
+
+// Holds a shared reference to the executor buffer plus copies of the metadata.
+class SharedTensor {
+   public:
+    SharedTensor(Executor &exe, const Tensor &tensor);
+    ~SharedTensor() = default;
+    // Builds a DLTensor whose shape/strides point into this object's vectors.
+    DLTensor dl_tensor() const;
+
+   private:
+    std::shared_ptr<GpuMemory> buffer_;
+    void *data_;
+    int device_id_;
+    DataType dtype_;
+    std::shared_ptr<std::vector<int64_t>> shape_;
+    std::shared_ptr<std::vector<int64_t>> strides_;
+    std::shared_ptr<std::vector<int64_t>> offsets_;
+};
+
+SharedTensor::SharedTensor(Executor &exe, const Tensor &tensor)
+    : buffer_(exe.buffer()),
+      data_(reinterpret_cast<void *>(exe.tensor_address(tensor))),
+      device_id_(exe.device_id()),
+      dtype_(tensor.data_type()),
+      shape_(std::make_shared<std::vector<int64_t>>(tensor.shape().vector())),
+      strides_(std::make_shared<std::vector<int64_t>>(
+          tensor.torch_strides().vector())),
+      offsets_(std::make_shared<std::vector<int64_t>>(
+          tensor.offsets().vector())) {}
+
+DLTensor SharedTensor::dl_tensor() const {
+    const size_t first_offset = offsets_->empty() ? 0 : offsets_->at(0);
+    DLTensor out;
+    out.data = data_;
+    out.device.device_type = get_device_type();
+    out.device.device_id = device_id_;
+    out.ndim = static_cast<int32_t>(shape_->size());
+    out.dtype = to_dl_dtype(dtype_);
+    out.shape = shape_->data();
+    out.strides = strides_->data();
+    out.byte_offset = first_offset * dtype_.bytes();
+    return out;
+}
+
+}  // namespace ark
+
+// Exports `tensor` as a DLPack capsule named "dltensor". A consumer that takes
+// the tensor renames the capsule and becomes responsible for the deleter.
+static py::capsule tensor_to_dlpack(ark::Executor &self,
+                                    const ark::Tensor &tensor) {
+    auto *shared_tensor = new ark::SharedTensor(self, tensor);
+    auto *managed = new DLManagedTensor();
+    managed->dl_tensor = shared_tensor->dl_tensor();
+    managed->manager_ctx = shared_tensor;
+    managed->deleter = [](DLManagedTensor *self) {
+        // Per dlpack.h the deleter releases manager_ctx and `self` itself.
+        delete static_cast<ark::SharedTensor *>(self->manager_ctx);
+        delete self;
+    };
+    PyObject *dl_capsule = PyCapsule_New(managed, "dltensor", [](PyObject *c) {
+        // Once renamed by a consumer, the tensor is no longer ours to free.
+        if (!PyCapsule_IsValid(c, "dltensor")) return;
+        auto *t = static_cast<DLManagedTensor *>(
+            PyCapsule_GetPointer(c, "dltensor"));
+        t->deleter(t);
+    });
+    if (dl_capsule == nullptr) {
+        managed->deleter(managed);
+        throw py::error_already_set();
+    }
+    return py::reinterpret_steal<py::capsule>(dl_capsule);
+}
+
 void register_executor(py::module &m) {
     py::class_<ark::Executor>(m, "_Executor")
-        .def(py::init([](int device_id, uintptr_t stream,
-                         const std::string &name, const std::string &plan,
-                         bool loop_mode) {
-            return new ark::Executor(device_id,
-                                     reinterpret_cast<ark::Stream>(stream),
-                                     name, plan, loop_mode);
-        }))
+        .def(py::init<>())
         .def("device_id", &ark::Executor::device_id)
         .def("stream",
              [](ark::Executor *self) {
                  return reinterpret_cast<uintptr_t>(self->stream());
              })
         .def("plan", &ark::Executor::plan)
-        .def("compile", &ark::Executor::compile)
-        .def("launch", &ark::Executor::launch)
+        .def("name", &ark::Executor::name)
+        .def("compile", 
+            [](ark::Executor *self, int device_id, std::string &plan, const std::string &name,
+               const std::unordered_map<ark::Tensor, uintptr_t> &external_tensors) {
+                std::unordered_map<ark::Tensor, void *> tensor_map;
+                for (const auto &[tensor, ptr] : external_tensors) {
+                    tensor_map[tensor] = reinterpret_cast<void *>(ptr);
+                }
+                self->compile(plan, device_id, name, tensor_map);
+            },
+            py::arg("device_id"), py::arg("plan"), py::arg("name") = "executor",
+            py::arg("external_tensors") = std::unordered_map<ark::Tensor, uintptr_t>())
+        .def("launch", [](ark::Executor *self, uintptr_t stream, bool loop_mode) {
+                 self->launch(reinterpret_cast<ark::Stream>(stream), loop_mode);
+             },
+             py::arg("stream") = 0, py::arg("loop_mode") = true)
         .def("run", &ark::Executor::run, py::arg("iter"))
         .def("wait", &ark::Executor::wait, py::arg("max_spin_count") = -1)
         .def("stop", &ark::Executor::stop, py::arg("max_spin_count") = -1)
         .def("barrier", &ark::Executor::barrier)
         .def("destroy", &ark::Executor::destroy)
         .def("destroyed", &ark::Executor::destroyed)
-        .def("tensor_address", &ark::Executor::tensor_address,
-             py::arg("tensor"))
+        .def(
+            "tensor_address",
+            [](ark::Executor *self, const ark::Tensor &tensor) {
+                return reinterpret_cast<uintptr_t>(
+                    self->tensor_address(tensor));
+            },
+            py::arg("tensor"))
         .def("tensor_read",
              py::overload_cast<ark::Executor *, const ark::Tensor &, py::buffer,
                                uintptr_t>(&tensor_read),
@@ -82,5 +217,6 @@ void register_executor(py::module &m) {
              py::overload_cast<ark::Executor *, const ark::Tensor &, size_t,
                                size_t, uintptr_t, bool>(&tensor_write),
              py::arg("tensor"), py::arg("address"), py::arg("bytes"),
-             py::arg("stream"), py::arg("is_d2d"));
+             py::arg("stream"), py::arg("is_d2d"))
+        .def("tensor_to_dlpack", &tensor_to_dlpack);
 }
diff --git a/python/model_py.cpp b/python/model_py.cpp
index c224a3d5b..55ab5e9fe 100644
--- a/python/model_py.cpp
+++ b/python/model_py.cpp
@@ -1,6 +1,7 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
+#include <dlpack/dlpack.h>
 #include <pybind11/operators.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
@@ -8,8 +9,65 @@
 #include <ark/model.hpp>
 #include <ark/model_graph.hpp>
 
+#include "logging.hpp"
+
 namespace py = pybind11;
 
+struct DLTensorMetadata {  // by-value snapshot of a DLTensor's fields
+    void *data_ptr;  // DLTensor::data
+    int32_t device_id;
+    DLDeviceType device_type;
+    int32_t ndim;
+    DLDataType dtype;
+    std::vector<int64_t> shape;    // ndim entries
+    std::vector<int64_t> strides;  // ndim entries, or empty if source was null
+    uint64_t byte_offset;
+};
+
+// Copies the fields of the DLTensor embedded in `dl_tensor` into a value
+// struct, so shape and strides no longer point into the producer's memory.
+static DLTensorMetadata extractDLTensorMetadata(DLManagedTensor *dl_tensor) {
+    const DLTensor &src = dl_tensor->dl_tensor;
+    DLTensorMetadata metadata;
+    metadata.data_ptr = src.data;
+    metadata.device_id = src.device.device_id;
+    metadata.device_type = src.device.device_type;
+    metadata.ndim = src.ndim;
+    metadata.dtype = src.dtype;
+    metadata.byte_offset = src.byte_offset;
+    metadata.shape.assign(src.shape, src.shape + src.ndim);
+    // A null strides pointer is tolerated; the vector then stays empty.
+    if (src.strides != nullptr) {
+        metadata.strides.assign(src.strides, src.strides + src.ndim);
+    }
+    return metadata;
+}
+
+// Maps a DLPack element descriptor onto the matching ARK data type. Types
+// with more than one lane, and any (code, bits) pair not listed below, are
+// reported through ERR as ark::UnsupportedError.
+static ark::DataType from_dl_dtype(const DLDataType &dl_dtype) {
+    if (dl_dtype.lanes != 1) {
+        ERR(ark::UnsupportedError, "unsupported data type");
+    }
+    const auto code = dl_dtype.code;
+    const auto bits = dl_dtype.bits;
+    // Floating-point types.
+    if (code == kDLFloat && bits == 32) return ark::FP32;
+    if (code == kDLFloat && bits == 16) return ark::FP16;
+    if (code == kDLBfloat && bits == 16) return ark::BF16;
+    // Integer types.
+    if (code == kDLInt && bits == 32) return ark::INT32;
+    if (code == kDLUInt && bits == 32) return ark::UINT32;
+    if (code == kDLInt && bits == 8) return ark::INT8;
+    if (code == kDLUInt && bits == 8) return ark::UINT8;
+    ERR(ark::UnsupportedError, "unsupported data type");
+    // Reached only if ERR returns control: hand back a default-constructed
+    // data type in that case.
+    ark::DataType ark_dtype;
+    return ark_dtype;
+}
+
 void register_model(py::module &m) {
     py::class_<ark::Model, ark::ModelGraph>(m, "_Model")
         .def(py::init<int, int>(), py::arg("rank"), py::arg("world_size"))
@@ -71,6 +129,19 @@ void register_model(py::module &m) {
              py::arg("input"), py::arg("other"), py::arg("output"),
              py::arg("name"))
         .def("noop", &ark::Model::noop, py::arg("input"), py::arg("name"))
+        .def(
+            "placeholder",
+            [](ark::Model &model, const ark::Dims &shape,
+               const ark::DataType &data_type, const ark::Dims &strides,
+               const ark::Dims &offsets, const ark::Dims &padded_shape,
+               int rank, uintptr_t data, const std::string &name) {
+                return model.placeholder(shape, data_type, strides, offsets,
+                                         padded_shape, rank,
+                                         reinterpret_cast<void *>(data), name);
+            },
+            py::arg("shape"), py::arg("data_type"), py::arg("strides"),
+            py::arg("offsets"), py::arg("padded_shape"), py::arg("rank"),
+            py::arg("data"), py::arg("name"))
         .def("reduce_max", &ark::Model::reduce_max, py::arg("input"),
              py::arg("axis"), py::arg("keepdims"), py::arg("output"),
              py::arg("name"))
@@ -104,14 +175,30 @@ void register_model(py::module &m) {
                                const std::string &>(&ark::Model::sub),
              py::arg("input"), py::arg("other"), py::arg("output"),
              py::arg("name"))
-        .def("tensor",
+        .def("tensor", &ark::Model::tensor, py::arg("shape"),
+             py::arg("data_type"), py::arg("strides"), py::arg("offsets"),
+             py::arg("padded_shape"), py::arg("rank"), py::arg("name"))
+        .def("placeholder",
              py::overload_cast<const ark::Dims &, const ark::DataType &,
                                const ark::Dims &, const ark::Dims &,
-                               const ark::Dims &, int, const std::string &>(
-                 &ark::Model::tensor),
+                               const ark::Dims &, int, const std::string &,
+                               void *>(&ark::Model::placeholder),
              py::arg("shape"), py::arg("data_type"), py::arg("strides"),
              py::arg("offsets"), py::arg("padded_shape"), py::arg("rank"),
-             py::arg("name"))
+             py::arg("name"), py::arg("data"))
+        .def(
+            "placeholder",
+            [](ark::Model &self, py::capsule input, int rank,
+               const std::string &name) {
+                DLManagedTensor *dl_tensor =
+                    static_cast<DLManagedTensor *>(input.get_pointer());
+                DLTensorMetadata metadata = extractDLTensorMetadata(dl_tensor);
+                ark::DataType ark_dtype = from_dl_dtype(metadata.dtype);
+                ark::Dims shape(metadata.shape);
+                return self.placeholder(shape, ark_dtype, {}, {}, {}, rank,
+                                        name, metadata.data_ptr);
+            },
+            py::arg("external_tensor"), py::arg("rank"), py::arg("name"))
         .def("transpose", &ark::Model::transpose, py::arg("input"),
              py::arg("permutation"), py::arg("output"), py::arg("name"))
         .def("all_reduce", &ark::Model::all_reduce, py::arg("input"),
diff --git a/python/tensor_py.cpp b/python/tensor_py.cpp
index fbd909d3d..5c28563de 100644
--- a/python/tensor_py.cpp
+++ b/python/tensor_py.cpp
@@ -9,18 +9,15 @@
 
 namespace py = pybind11;
 
-void register_tensor(py::module &m) {
+void register_tensor(py::module& m) {
     py::class_<ark::Tensor>(m, "_Tensor")
         .def("id", &ark::Tensor::id)
-        .def("shape", &ark::Tensor::shape, py::return_value_policy::reference)
-        .def("strides", &ark::Tensor::strides,
-             py::return_value_policy::reference)
-        .def("offsets", &ark::Tensor::offsets,
-             py::return_value_policy::reference)
-        .def("padded_shape", &ark::Tensor::padded_shape,
-             py::return_value_policy::reference)
-        .def("data_type", &ark::Tensor::data_type,
-             py::return_value_policy::reference);
+        .def("shape", &ark::Tensor::shape)
+        .def("strides", &ark::Tensor::strides)
+        .def("offsets", &ark::Tensor::offsets)
+        .def("padded_shape", &ark::Tensor::padded_shape)
+        .def("data_type", &ark::Tensor::data_type)
+        .def("torch_strides", &ark::Tensor::torch_strides);
 
     m.attr("_NullTensor") = &ark::NullTensor;
 }
diff --git a/python/unittest/test.py b/python/unittest/test.py
index 2d9647e3a..e8f22fdae 100644
--- a/python/unittest/test.py
+++ b/python/unittest/test.py
@@ -1,12 +1,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-import sys
-import os
-
-sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)) + "/..")
-sys.path.insert(0, os.environ.get("ARK_ROOT", ".") + "/python")
-
 from test_error import *
 from test_model import *
 from test_runtime import *
+from test_tensor import *
diff --git a/python/unittest/test_conversion.py b/python/unittest/test_conversion.py
new file mode 100644
index 000000000..83fb77b31
--- /dev/null
+++ b/python/unittest/test_conversion.py
@@ -0,0 +1,269 @@
+import pytest
+import numpy as np
+import ark
+from typing import Callable
+
+# The parametrize lists and type aliases below touch `torch` at import time,
+# so a missing PyTorch must skip the whole module during collection.
+torch = pytest.importorskip("torch")
+
+# Kept for the per-test guard further down; always False past this point.
+_no_torch = False
+
+# ARK to Torch tests
+
+
+def initialize_tensor(dimensions, dtype):
+    """Create an ARK tensor plus a random host array of the same shape."""
+    device_side = ark.tensor(dimensions, dtype)
+    return device_side, np.random.rand(*dimensions).astype(dtype.to_numpy())
+
+
+# Checks that torch views of ARK tensors report the values written from the
+# host, and that the view of the result agrees with ARK's own read-back.
+@pytest.mark.parametrize("num_dims,size", [(1, 5), (1, 1024), (2, 5), (2, 32)])
+@pytest.mark.parametrize("dtype", [ark.fp16, ark.fp32])
+def test_values_fixed_dims(num_dims: int, size: int, dtype: ark.DataType):
+    ark.init()
+    dims = [size] * num_dims
+
+    lhs, lhs_host = initialize_tensor(dims, dtype)
+    rhs, rhs_host = initialize_tensor(dims, dtype)
+    total = ark.add(lhs, rhs)
+
+    rt = ark.Runtime()
+    rt.launch()
+
+    lhs.from_numpy(lhs_host)
+    rhs.from_numpy(rhs_host)
+
+    # Views are taken before the run; results are read through them after it.
+    lhs_view = lhs.to_torch()
+    rhs_view = rhs.to_torch()
+    total_view = total.to_torch()
+    rt.run()
+
+    lhs_seen = lhs_view.cpu().numpy()
+    rhs_seen = rhs_view.cpu().numpy()
+    total_seen = total_view.cpu().numpy()
+    total_host = total.to_numpy()
+
+    # Shut the runtime down before asserting on the host-side copies.
+    rt.stop()
+    rt.reset()
+
+    assert np.allclose(lhs_host, lhs_seen)
+    assert np.allclose(rhs_host, rhs_seen)
+    assert np.allclose(total_host, total_seen)
+
+
+# True only if the view equals `value` at `index` and the host data elsewhere.
+def check_diff(input_tensor_host, input_view_numpy, value, index):
+    keep = np.ones(input_tensor_host.shape, dtype=bool)
+    keep[index] = False
+    if not np.allclose(input_tensor_host[keep], input_view_numpy[keep]):
+        print("Difference found at index: ", index)
+        return False
+    mismatch = bool(input_view_numpy[index] != value)
+    if mismatch:
+        print(input_view_numpy[index], value)
+    return not mismatch
+
+
+# Test function to check if changes to the torch views are reflected in the original tensors
+@pytest.mark.parametrize("dtype", [ark.fp16, ark.fp32])
+def test_ark_to_torch_aliasing(dtype: ark.DataType):
+    ark.init()
+    dimensions = [4, 4]
+    input_tensor, input_tensor_host = initialize_tensor(dimensions, dtype)
+    other_tensor, other_tensor_host = initialize_tensor(dimensions, dtype)
+    output_tensor = ark.mul(input_tensor, other_tensor)
+    runtime = ark.Runtime()
+    runtime.launch()
+    input_tensor.from_numpy(input_tensor_host)
+    other_tensor.from_numpy(other_tensor_host)
+
+    input_view = input_tensor.to_torch()
+    other_view = other_tensor.to_torch()
+    output_view = output_tensor.to_torch()
+    # make changes to the views
+    input_view[1, 1] = 20
+    other_view[0, 0] = 30
+    runtime.run()
+    output_view[3, 0] = 40
+
+    output_tensor_host = output_tensor.to_numpy()
+    input_view_numpy = input_view.cpu().numpy()
+    other_view_numpy = other_view.cpu().numpy()
+    output_view_numpy = output_view.cpu().numpy()
+    # Everything needed is on the host now: shut the runtime down first so a
+    # failing assert cannot leave it launched for later tests.
+    runtime.stop()
+    runtime.reset()
+    # Check if changes to the views are reflected in the original tensors
+    assert check_diff(input_tensor_host, input_view_numpy, 20, (1, 1))
+    assert check_diff(other_tensor_host, other_view_numpy, 30, (0, 0))
+    assert check_diff(output_tensor_host, output_view_numpy, 40, (3, 0))
+
+@pytest.mark.skip()
+def test_conversion_torch():
+    if _no_torch:
+        pytest.skip("PyTorch not available")
+
+    dimensions = [4, 4]
+
+    ark.init()
+    t = ark.constant(7, dimensions)
+
+    with ark.Runtime() as rt:
+        rt.launch()
+        # View taken before the first run: the test expects zeroed memory.
+        torch_tensor = t.to_torch()
+
+        assert torch_tensor.shape == (4, 4)
+        assert torch_tensor.dtype == torch.float32
+        assert torch_tensor.device.type == "cuda"
+        assert torch.all(torch_tensor == 0)
+        # After running, a fresh view must show the constant value.
+        rt.run()
+
+        torch_tensor = t.to_torch()
+        assert torch.all(torch_tensor == 7)
+
+
+# Torch to ARK tests
+# Callable shapes used to annotate the parametrized ARK/torch operator pairs.
+ArkBinOp = Callable[[ark.Tensor, ark.Tensor], ark.Tensor]
+TorchBinOp = Callable[[torch.Tensor, torch.Tensor], torch.Tensor]
+ArkUnOp = Callable[[ark.Tensor], ark.Tensor]
+TorchUnOp = Callable[[torch.Tensor], torch.Tensor]
+
+
+# A binary ARK op on placeholders created from torch tensors must match torch.
+@pytest.mark.parametrize(
+    "dtype, ark_op, torch_op, tensor_dims",
+    [(torch.float16, ark.add, torch.add, (2, 3))],
+)
+def test_bin_op(dtype, ark_op: ArkBinOp, torch_op: TorchBinOp, tensor_dims):
+    ark.init()
+    lhs = torch.randn(tensor_dims, dtype=dtype, device="cuda:0")
+    rhs = torch.randn(tensor_dims, dtype=dtype, device="cuda:0")
+    want = torch_op(lhs, rhs).cpu().numpy()
+    lhs_view = ark.placeholder(torch_tensor=lhs)
+    rhs_view = ark.placeholder(torch_tensor=rhs)
+    result = ark_op(lhs_view, rhs_view)
+    rt = ark.Runtime()
+    rt.launch()
+    rt.run()
+    got = result.to_numpy()
+    rt.stop()
+    rt.reset()
+    assert np.allclose(got, want)
+
+
+# A unary ARK op on a placeholder created from a torch tensor must match torch.
+@pytest.mark.parametrize(
+    "dtype, ark_op, torch_op, tensor_dims",
+    [(torch.float16, ark.exp, torch.exp, (3, 3))],
+)
+def test_unary_op(dtype, ark_op: ArkUnOp, torch_op: TorchUnOp, tensor_dims):
+    ark.init()
+    source = torch.randn(tensor_dims, dtype=dtype, device="cuda:0")
+    want = torch_op(source).cpu().numpy()
+    source_view = ark.placeholder(torch_tensor=source)
+    result = ark_op(source_view)
+    rt = ark.Runtime()
+    rt.launch()
+    rt.run()
+    got = result.to_numpy()
+    rt.stop()
+    rt.reset()
+    assert np.allclose(got, want)
+
+
+# In-place edits made to the torch tensors after the placeholders were created
+@pytest.mark.parametrize("dtype, tensor_dims", [(torch.float16, (64, 64))])
+def test_torch_to_ark_aliasing(dtype, tensor_dims):
+    ark.init()
+    # must be what ARK reads when the plan finally runs.
+    lhs = torch.randn(tensor_dims, dtype=dtype, device="cuda:0")
+    rhs = torch.randn(tensor_dims, dtype=dtype, device="cuda:0")
+
+    lhs_view = ark.placeholder(torch_tensor=lhs)
+    rhs_view = ark.placeholder(torch_tensor=rhs)
+
+    result = ark.add(lhs_view, rhs_view)
+    # Mutate both operands in place, then compute the reference from them.
+    lhs += rhs
+    rhs += lhs
+    want = (lhs + rhs).cpu().numpy()
+
+    rt = ark.Runtime()
+    rt.launch()
+    rt.run()
+    got = result.to_numpy()
+    rt.stop()
+    rt.reset()
+    assert np.allclose(got, want)
+
+
+# Staged View Tests
+
+
+@pytest.mark.parametrize(
+    "dtype, ark_op, torch_op, tensor_dims",
+    [(torch.float16, ark.add, torch.add, (2, 3))],
+)
+def test_bin_op_staged(
+    dtype, ark_op: ArkBinOp, torch_op: TorchBinOp, tensor_dims
+):
+    # Placeholders are declared from shape/dtype only; the torch tensors are
+    # handed over through `tensor_mappings` when the runtime is launched.
+    ark.init()
+    input_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0")
+    other_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0")
+    expected_output = torch_op(input_tensor, other_tensor).cpu().numpy()
+    input_ark_view = ark.placeholder(
+        shape=tensor_dims, dtype=ark.DataType.from_torch(dtype)
+    )
+    other_ark_view = ark.placeholder(
+        shape=tensor_dims, dtype=ark.DataType.from_torch(dtype)
+    )
+    output = ark_op(input_ark_view, other_ark_view)
+    runtime = ark.Runtime()
+    tensor_mapping = {
+        input_ark_view: input_tensor,
+        other_ark_view: other_tensor,
+    }
+    runtime.launch(tensor_mappings=tensor_mapping)
+    runtime.run()
+    output_host = output.to_numpy()
+    runtime.stop()
+    runtime.reset()
+    assert np.allclose(output_host, expected_output)
+
+
+@pytest.mark.parametrize(
+    "dtype, ark_op, torch_op, tensor_dims",
+    [(torch.float16, ark.exp, torch.exp, (3, 3))],
+)
+def test_unary_op_staged(
+    dtype, ark_op: ArkUnOp, torch_op: TorchUnOp, tensor_dims
+):
+    # The placeholder is declared from shape/dtype only; here the torch tensor
+    # is handed over through `tensor_mappings` at run() rather than launch().
+    ark.init()
+    input_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0")
+    expected_output = torch_op(input_tensor).cpu().numpy()
+    input_ark_view = ark.placeholder(
+        shape=tensor_dims, dtype=ark.DataType.from_torch(dtype)
+    )
+    output = ark_op(input_ark_view)
+    runtime = ark.Runtime()
+    tensor_mapping = {input_ark_view: input_tensor}
+    runtime.launch()
+    runtime.run(tensor_mappings=tensor_mapping)
+    output_host = output.to_numpy()
+    runtime.stop()
+    runtime.reset()
+    assert np.allclose(output_host, expected_output)
diff --git a/python/unittest/test_error.py b/python/unittest/test_error.py
index 299e2675e..115dd1a15 100644
--- a/python/unittest/test_error.py
+++ b/python/unittest/test_error.py
@@ -1,11 +1,11 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-import ark
+from unittest_common import ark, pytest_ark
 
 
+@pytest_ark()
 def test_error():
-    ark.init()
     try:
         ark.tensor([0])
     except ark.BaseError as e:
diff --git a/python/unittest/test_model.py b/python/unittest/test_model.py
index da8ae399a..d65191e54 100644
--- a/python/unittest/test_model.py
+++ b/python/unittest/test_model.py
@@ -1,13 +1,12 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-import ark
+from unittest_common import ark, pytest_ark
 import json
 
 
+@pytest_ark()
 def test_model():
-    ark.init()
-
     input_tensor = ark.tensor([64, 64], ark.fp16)
     other_tensor = ark.tensor([64, 64], ark.fp16)
     ark.add(input_tensor, other_tensor)
diff --git a/python/unittest/test_runtime.py b/python/unittest/test_runtime.py
index d91fd85c5..dd8064d85 100644
--- a/python/unittest/test_runtime.py
+++ b/python/unittest/test_runtime.py
@@ -1,12 +1,12 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-import ark
+from unittest_common import ark, pytest_ark
+import numpy as np
 
 
+@pytest_ark()
 def test_runtime_relaunch():
-    ark.init()
-
     with ark.Runtime.get_runtime() as rt:
         assert rt.launched() == False
         rt.launch()
@@ -16,3 +16,64 @@ def test_runtime_relaunch():
         assert rt.launched() == False
         rt.launch()
         assert rt.launched() == True
+
+
+@pytest_ark()
+def test_runtime_init():
+    # Stage 1: build a single add, launch the runtime, upload both operands
+    # and verify the sum that comes back.
+    M, N = 64, 64
+    lhs = ark.tensor([M, N], ark.fp16)
+    rhs = ark.tensor([M, N], ark.fp16)
+    first_sum = ark.add(lhs, rhs)
+    rt = ark.Runtime()
+    rt.launch()
+    lhs_host = np.random.rand(M, N).astype(np.float16)
+    lhs.from_numpy(lhs_host)
+    rhs_host = np.random.rand(M, N).astype(np.float16)
+    rhs.from_numpy(rhs_host)
+    rt.run()
+    first_sum_host = first_sum.to_numpy()
+    np.testing.assert_allclose(first_sum_host, lhs_host + rhs_host)
+    rt.stop()
+
+    # Stage 2: reset the model, add a new operand to the previous output,
+    # and launch the same runtime object again.
+    ark.Model.reset()
+    extra = ark.tensor([M, N], ark.fp16)
+    second_sum = ark.add(first_sum, extra)
+    rt.launch()
+    extra_host = np.random.rand(M, N).astype(np.float16)
+    extra.from_numpy(extra_host)
+    rt.run()
+    second_sum_host = second_sum.to_numpy()
+    np.testing.assert_allclose(second_sum_host, first_sum_host + extra_host)
+    rt.reset()
+
+
+@pytest_ark()
+def test_runtime_reuse_plans():
+    # First pass: build an add, launch, upload operands and verify the sum.
+    M, N = 64, 64
+    lhs = ark.tensor([M, N], ark.fp16)
+    rhs = ark.tensor([M, N], ark.fp16)
+    total = ark.add(lhs, rhs)
+    rt = ark.Runtime()
+    rt.launch()
+    lhs_host = np.random.rand(M, N).astype(np.float16)
+    lhs.from_numpy(lhs_host)
+    rhs_host = np.random.rand(M, N).astype(np.float16)
+    rhs.from_numpy(rhs_host)
+    rt.run()
+    total_host = total.to_numpy()
+    np.testing.assert_allclose(total_host, lhs_host + rhs_host)
+    rt.stop()
+
+    # Second pass: reset the model, then launch and run again without
+    # defining new operators or uploading new data; the same sum is expected.
+    ark.Model.reset()
+    rt.launch()
+    rt.run()
+    total_host = total.to_numpy()
+    np.testing.assert_allclose(total_host, lhs_host + rhs_host)
+    rt.reset()
diff --git a/python/unittest/test_tensor.py b/python/unittest/test_tensor.py
new file mode 100644
index 000000000..213264e3b
--- /dev/null
+++ b/python/unittest/test_tensor.py
@@ -0,0 +1,22 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+from unittest_common import ark, pytest_ark
+
+
+@pytest_ark(need_torch=True)
+def test_tensor_torch():
+    # torch is imported inside the test, not at module level.
+    import torch
+
+    source = torch.ones(2, 1024, device=torch.device("cuda:0"))
+    # Wrap the torch tensor as an ARK tensor and scale it by 5.
+    scaled = ark.mul(ark.Tensor.from_torch(source), 5)
+
+    with ark.Runtime() as runtime:
+        runtime.launch()
+        runtime.run()
+        # Read the result back as a torch tensor while the runtime is open.
+        result = scaled.to_torch()
+
+    assert torch.allclose(result, source * 5)
diff --git a/python/unittest/unittest_common.py b/python/unittest/unittest_common.py
new file mode 100644
index 000000000..0c385e89a
--- /dev/null
+++ b/python/unittest/unittest_common.py
@@ -0,0 +1,28 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import pytest
+import ark
+
+
+def pytest_ark(need_torch: bool = False):
+    """Decorator for ARK unit tests: calls `ark.init()` before the test body.
+    With `need_torch=True`, the test is skipped if torch cannot be imported."""
+    import functools
+
+    def decorator(test_func):
+        if need_torch:
+            try:
+                import torch
+            except ImportError:
+                skip = pytest.mark.skip(reason="torch is not installed")
+                return skip(test_func)
+
+        @functools.wraps(test_func)  # keep name/signature visible to pytest
+        def wrapper(*args, **kwargs):
+            ark.init()
+            test_func(*args, **kwargs)
+
+        return wrapper
+
+    return decorator
diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt
index 96e442289..49251be74 100644
--- a/third_party/CMakeLists.txt
+++ b/third_party/CMakeLists.txt
@@ -40,6 +40,19 @@ if (NOT json_POPULATED)
 endif()
 set(JSON_INCLUDE_DIRS ${json_SOURCE_DIR}/include PARENT_SCOPE)
 
+# DLPack: populated for its headers; only the include path is exported below.
+FetchContent_Declare(
+    dlpack
+    GIT_REPOSITORY https://github.com/dmlc/dlpack
+    GIT_TAG v0.8
+    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/dlpack  # checked out in-tree
+)
+FetchContent_GetProperties(dlpack)
+if (NOT dlpack_POPULATED)
+    FetchContent_Populate(dlpack)
+endif()
+set(DLPACK_INCLUDE_DIRS ${dlpack_SOURCE_DIR}/include PARENT_SCOPE)
+
 if(ARK_USE_CUDA)
     # Configure CUTLASS
     FetchContent_Declare(
diff --git a/third_party/dlpack b/third_party/dlpack
new file mode 160000
index 000000000..365b823ce
--- /dev/null
+++ b/third_party/dlpack
@@ -0,0 +1 @@
+Subproject commit 365b823cedb281cd0240ca601aba9b78771f91a3