From 432ab5de2f59d2c10c08a7ce08fa4528e83caa79 Mon Sep 17 00:00:00 2001 From: Chao Mei Date: Mon, 9 Aug 2021 16:49:54 -0700 Subject: [PATCH] Dump more memory usage info (like per-layer live tensors etc.) w/ the benchmark tool. 1. Utilize weak symbols as a non-intrusive way to dump the memory planner info, and add such a strong definition to print out tensor's life span, the memory space used at each op execution from the memory arena. 2. Integrate this strong definition w/ the TfLite model benchmark tool. PiperOrigin-RevId: 389755683 Change-Id: Ib53006ad6eee566432f5a412737b7c6278b21e3d --- tensorflow/lite/BUILD | 16 +- tensorflow/lite/arena_planner.cc | 6 + tensorflow/lite/arena_planner.h | 1 + tensorflow/lite/core/subgraph.cc | 5 + tensorflow/lite/core/subgraph.h | 11 + tensorflow/lite/memory_planner.h | 6 + tensorflow/lite/optional_debug_tools.cc | 5 + tensorflow/lite/simple_memory_arena.cc | 14 ++ tensorflow/lite/simple_memory_arena.h | 18 ++ .../lite/simple_memory_arena_debug_dump.cc | 196 ++++++++++++++++++ tensorflow/lite/simple_planner.h | 1 + tensorflow/lite/tools/benchmark/BUILD | 1 + 12 files changed, 279 insertions(+), 1 deletion(-) create mode 100644 tensorflow/lite/simple_memory_arena_debug_dump.cc diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD index 5369a39047fda9..15830e07096857 100644 --- a/tensorflow/lite/BUILD +++ b/tensorflow/lite/BUILD @@ -219,7 +219,21 @@ cc_library( hdrs = ["simple_memory_arena.h"], compatible_with = get_compatible_with_portable(), copts = tflite_copts_warnings(), - deps = ["//tensorflow/lite/c:common"], + deps = [ + ":macros", + "//tensorflow/lite/c:common", + ], +) + +cc_library( + name = "simple_memory_arena_debug_dump", + srcs = ["simple_memory_arena_debug_dump.cc"], + compatible_with = get_compatible_with_portable(), + copts = tflite_copts_warnings(), + deps = [ + ":simple_memory_arena", + ], + alwayslink = 1, ) cc_library( diff --git a/tensorflow/lite/arena_planner.cc b/tensorflow/lite/arena_planner.cc index 
1a4e023e3f7b7a..e715ac2627406b 100644 --- a/tensorflow/lite/arena_planner.cc +++ b/tensorflow/lite/arena_planner.cc @@ -249,6 +249,12 @@ bool ArenaPlanner::HasNonPersistentMemory() { return arena_.GetBufferSize() != 0; } +void ArenaPlanner::DumpDebugInfo(const std::vector& execution_plan) const { + arena_.DumpDebugInfo("kTfLiteArenaRw Dump:", execution_plan); + persistent_arena_.DumpDebugInfo("kTfLiteArenaRwPersistent Dump:", + execution_plan); +} + TfLiteStatus ArenaPlanner::Commit() { TF_LITE_ENSURE_STATUS(arena_.Commit(context_)); TF_LITE_ENSURE_STATUS(persistent_arena_.Commit(context_)); diff --git a/tensorflow/lite/arena_planner.h b/tensorflow/lite/arena_planner.h index 5092b0421c8a6e..1c2f4201800e46 100644 --- a/tensorflow/lite/arena_planner.h +++ b/tensorflow/lite/arena_planner.h @@ -62,6 +62,7 @@ class ArenaPlanner : public MemoryPlanner { TfLiteStatus ReleaseNonPersistentMemory() override; TfLiteStatus AcquireNonPersistentMemory() override; bool HasNonPersistentMemory() override; + void DumpDebugInfo(const std::vector& execution_plan) const override; // Returns the base arena location for a given allocation type. 
std::intptr_t BasePointer(TfLiteAllocationType type); diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc index e42bd9e53ec5cd..7c7d6f8f1e90d9 100644 --- a/tensorflow/lite/core/subgraph.cc +++ b/tensorflow/lite/core/subgraph.cc @@ -1750,6 +1750,11 @@ void Subgraph::SetName(const char* name) { const std::string& Subgraph::GetName() const { return name_; } +void Subgraph::DumpMemoryPlannerDebugInfo() const { + if (memory_planner_ == nullptr) return; + memory_planner_->DumpDebugInfo(execution_plan()); +} + TfLiteStatus Subgraph::PreserveAllTensorsExperimental() { if (memory_planner_) { ReportError( diff --git a/tensorflow/lite/core/subgraph.h b/tensorflow/lite/core/subgraph.h index 990c4944efa081..e149a90fa2a14f 100644 --- a/tensorflow/lite/core/subgraph.h +++ b/tensorflow/lite/core/subgraph.h @@ -359,6 +359,17 @@ class Subgraph { void SetName(const char* name); const std::string& GetName() const; + // WARNING: This is an experimental API and subject to change. + // Dumps debugging info by the underlying memory planner. + // Note: to have minimal binary increase caused by this debug info dump for + // the TfLite library and allow users to plug-in their own memory planner + // debugger, we have utilized weak symbols to meet these two requirements. By + // default, there is no debugging info dumped. However, if the TfLite-provided + // lite:simple_memory_arena_debug_dump (i.e. containing the strong definition) + // is linked to the program, calling this function will output memory usage + // information about tensors and ops. + void DumpMemoryPlannerDebugInfo() const; + private: friend class InterpreterBuilder; friend class TestDelegate; diff --git a/tensorflow/lite/memory_planner.h b/tensorflow/lite/memory_planner.h index e4b6aee26b161b..eebdc49d140a5a 100644 --- a/tensorflow/lite/memory_planner.h +++ b/tensorflow/lite/memory_planner.h @@ -15,6 +15,8 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_MEMORY_PLANNER_H_ #define TENSORFLOW_LITE_MEMORY_PLANNER_H_ +#include + #include "tensorflow/lite/c/common.h" namespace tflite { @@ -59,6 +61,10 @@ class MemoryPlanner { // Returns true if the non-persistent memory is available. virtual bool HasNonPersistentMemory() = 0; + + // Dumps the memory planning information against the specified op node + // execution plan (i.e. `execution_plan`) for the purpose of debugging. + virtual void DumpDebugInfo(const std::vector& execution_plan) const = 0; }; } // namespace tflite diff --git a/tensorflow/lite/optional_debug_tools.cc b/tensorflow/lite/optional_debug_tools.cc index ea33a1d7142bb8..69296d9f1934b2 100644 --- a/tensorflow/lite/optional_debug_tools.cc +++ b/tensorflow/lite/optional_debug_tools.cc @@ -390,6 +390,11 @@ void PrintInterpreterState(const Interpreter* interpreter) { } tensor_mem_info.Print(); + // Dumps debugging info provided by the underlying memory planner. + // Note that this will output nothing unless the + // ":simple_memory_arena_debug_dump" is added as an extra dependence. + subgraph.DumpMemoryPlannerDebugInfo(); + // Going to print out all nodes (i.e. op kernels) in this subgraph. std::vector replaced_node_bits; std::vector replaced_by_node; diff --git a/tensorflow/lite/simple_memory_arena.cc b/tensorflow/lite/simple_memory_arena.cc index 615460d686846d..1c7a03846f5f3e 100644 --- a/tensorflow/lite/simple_memory_arena.cc +++ b/tensorflow/lite/simple_memory_arena.cc @@ -23,9 +23,11 @@ limitations under the License. #include #include #include +#include #include #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/core/macros.h" namespace { @@ -166,4 +168,16 @@ TfLiteStatus SimpleMemoryArena::ReleaseBuffer() { return kTfLiteOk; } +// Using weak symbols to create a pluggable debugging module. 
+TFLITE_ATTRIBUTE_WEAK void DumpArenaInfo( + const std::string& name, const std::vector& execution_plan, + size_t arena_size, const std::vector& allocs) { +} + +void SimpleMemoryArena::DumpDebugInfo( + const std::string& name, const std::vector& execution_plan) const { + tflite::DumpArenaInfo(name, execution_plan, underlying_buffer_size_, + ordered_allocs_); +} + } // namespace tflite diff --git a/tensorflow/lite/simple_memory_arena.h b/tensorflow/lite/simple_memory_arena.h index b83ac606b62ba7..ab086ab2bee0ae 100644 --- a/tensorflow/lite/simple_memory_arena.h +++ b/tensorflow/lite/simple_memory_arena.h @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include #include #include "tensorflow/lite/c/common.h" @@ -107,6 +108,23 @@ class SimpleMemoryArena { return reinterpret_cast(underlying_buffer_aligned_ptr_); } + // Dumps the memory allocation information of this memory arena (which could + // be differentiated from others by the `name`) against the specified op node + // execution plan (i.e. `execution_plan`) for the purpose of debugging. + // Note: in order to have minimal binary increase caused by this debug info + // dump implementation for the TfLite library, and allow users to plug-in + // their own memory planner debugger, we have utilized weak symbols to meet + // these two requirements. By default, there is no debugging info + // dumped. To override this, provide a strong definition of + // tflite::DumpArenaInfo(...) whose weak definition is in + // simple_memory_arena.cc. TfLite provides a sample one as + // "lite:simple_memory_arena_debug_dump". When this dep is added to the + // program, calling this function will output information of this memory arena + // about tensors and ops, such as memory arena utilization rate, live tensors + // at each op etc.
+ void DumpDebugInfo(const std::string& name, + const std::vector& execution_plan) const; + private: bool committed_; size_t arena_alignment_; diff --git a/tensorflow/lite/simple_memory_arena_debug_dump.cc b/tensorflow/lite/simple_memory_arena_debug_dump.cc new file mode 100644 index 00000000000000..256fa0b9a7e04c --- /dev/null +++ b/tensorflow/lite/simple_memory_arena_debug_dump.cc @@ -0,0 +1,196 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include +#include +#include +#include +#include + +#include "tensorflow/lite/simple_memory_arena.h" + +namespace tflite { +namespace { +// Same w/ that defined in tensorflow/lite/arena_planner.cc. 
+constexpr int32_t kNodeNotAssigned = std::numeric_limits::max(); + +void PrintIntVector(const std::vector& v) { + if (v.empty()) { + printf("[]"); + return; + } + + int range_start = v[0]; + int range_end = range_start; + std::function print_range = [&](const char* suffix) { + if (range_end == range_start) { + printf("%d%s", range_start, suffix); + } else if (range_end == range_start + 1) { + printf("%d,%d%s", range_start, range_end, suffix); + } else { + printf("%d-%d%s", range_start, range_end, suffix); + } + }; + + printf("["); + for (int i = 1; i < v.size(); ++i) { + int current = v[i]; + if (current == range_end + 1) { + range_end = current; + } else { + print_range(","); + range_start = range_end = current; + } + } + print_range("]"); +} + +struct PerLayerInfo { + PerLayerInfo() {} + PerLayerInfo(int id, size_t bytes, const std::vector& tensors) + : node_id(id), total_bytes(bytes), live_tensors(tensors) {} + int node_id; + size_t total_bytes = 0; + std::vector live_tensors; +}; +struct PerLayerInfoGreater { + bool operator()(const PerLayerInfo& l, const PerLayerInfo& r) { + return l.total_bytes > r.total_bytes; + } +}; +class PerLayerMinHeap + : public std::priority_queue, + PerLayerInfoGreater> { + public: + // Just to expose iterators to simplify iterating over contained elements. 
+ std::vector::const_iterator begin() const { return c.begin(); } + std::vector::const_iterator end() const { return c.end(); } +}; + +class TopKLayers { + public: + TopKLayers(size_t top_k, size_t arena_size) + : top_k_(top_k), arena_size_(arena_size) {} + + void Add(int node_id, size_t total_bytes, + const std::vector& live_tensors) { + if (topk_usage_.size() < top_k_) { + topk_usage_.emplace(PerLayerInfo(node_id, total_bytes, live_tensors)); + return; + } + if (total_bytes < topk_usage_.top().total_bytes) return; + topk_usage_.pop(); + topk_usage_.emplace(PerLayerInfo(node_id, total_bytes, live_tensors)); + } + + void Print() const { + printf("\nTop %zu memory-consuming layers:\n", + topk_usage_.size() < top_k_ ? topk_usage_.size() : top_k_); + // As we use a min-heap but want to print out usage in decreasing order, we + // use a temporary vector to hold pointers to top memory-consuming layers + // and do a sorting on it. + std::vector tops; + for (const auto& usage : topk_usage_) tops.push_back(&usage); + std::sort(tops.begin(), tops.end(), + [](const PerLayerInfo* l, const PerLayerInfo* r) { + return l->total_bytes > r->total_bytes; + }); + for (const auto* usage : tops) { + printf( + "Node %d: %zu bytes (%.3f MB), utilization rate: %.3f%%, %zu live " + "tensors: ", + usage->node_id, usage->total_bytes, + static_cast(usage->total_bytes) / (1 << 20), + static_cast(usage->total_bytes) / arena_size_ * 100.0, + usage->live_tensors.size()); + PrintIntVector(usage->live_tensors); + printf("\n"); + } + printf("\n"); + } + + private: + const size_t top_k_; + const size_t arena_size_; + PerLayerMinHeap topk_usage_; +}; +} // namespace + +// Corresponding weak declaration found in lite/simple_memory_arena.cc +void DumpArenaInfo(const std::string& name, + const std::vector& execution_plan, size_t arena_size, + const std::vector& allocs) { + if (allocs.empty() || execution_plan.empty()) return; + + const int max_node_id = + *std::max_element(execution_plan.begin(), 
execution_plan.end()); + printf("=== Beginning of %s ===\n", name.c_str()); + printf("Total size is %zu bytes (%.3f MB), holding %zu tensors.\n", + arena_size, static_cast(arena_size) / (1 << 20), allocs.size()); + std::vector max_size_tensors; + size_t max_tensor_size = 0; + for (const auto& alloc_info : allocs) { + printf("tensor %d: life_span: node [%d, %d], size: %zu bytes (%.3f MB).\n", + alloc_info.tensor, alloc_info.first_node, + alloc_info.last_node == kNodeNotAssigned ? max_node_id + : alloc_info.last_node, + alloc_info.size, static_cast(alloc_info.size) / (1 << 20)); + if (alloc_info.size > max_tensor_size) { + max_size_tensors.clear(); + max_size_tensors.push_back(alloc_info.tensor); + max_tensor_size = alloc_info.size; + } else if (alloc_info.size == max_tensor_size) { + max_size_tensors.push_back(alloc_info.tensor); + } + } + std::sort(max_size_tensors.begin(), max_size_tensors.end()); + printf("%zu tensors are of same max size (%zu B (%.3f MB)): ", + max_size_tensors.size(), max_tensor_size, + static_cast(max_tensor_size) / (1 << 20)); + PrintIntVector(max_size_tensors); + + printf("\nPer-layer-info in the order of op execution:\n"); + // A straightforward way of computing per-op memory consumption + // in the order of O(execution_plan.size() * allocs.size()). + std::vector per_op_mem_bytes(execution_plan.size()); + // Track top 5 layers that consume most memory.
+ TopKLayers top_usage(5, arena_size); + for (int i = 0; i < execution_plan.size(); ++i) { + const int node_id = execution_plan[i]; + size_t total_bytes = 0; + std::vector live_tensors; + for (const auto& alloc_info : allocs) { + if (node_id >= alloc_info.first_node && node_id <= alloc_info.last_node) { + total_bytes += alloc_info.size; + live_tensors.push_back(alloc_info.tensor); + } + } + per_op_mem_bytes[i] = total_bytes; + std::sort(live_tensors.begin(), live_tensors.end()); + printf( + "Node %d: %zu bytes (%.3f MB), utilization rate: %.3f%%, %zu live " + "tensors: ", + node_id, total_bytes, static_cast(total_bytes) / (1 << 20), + static_cast(total_bytes) / arena_size * 100.0, + live_tensors.size()); + PrintIntVector(live_tensors); + printf("\n"); + top_usage.Add(node_id, total_bytes, live_tensors); + } + top_usage.Print(); + printf("===End of %s ===\n\n", name.c_str()); +} +} // namespace tflite diff --git a/tensorflow/lite/simple_planner.h b/tensorflow/lite/simple_planner.h index 11a9d66a7e0014..c601844950ff35 100644 --- a/tensorflow/lite/simple_planner.h +++ b/tensorflow/lite/simple_planner.h @@ -90,6 +90,7 @@ class SimplePlanner : public MemoryPlanner { TfLiteStatus ReleaseNonPersistentMemory() override; TfLiteStatus AcquireNonPersistentMemory() override; bool HasNonPersistentMemory() override { return true; }; + void DumpDebugInfo(const std::vector& execution_plan) const override{}; private: // Free all the all allocations. diff --git a/tensorflow/lite/tools/benchmark/BUILD b/tensorflow/lite/tools/benchmark/BUILD index f94fb6e2d388e3..b6e21763c8e082 100644 --- a/tensorflow/lite/tools/benchmark/BUILD +++ b/tensorflow/lite/tools/benchmark/BUILD @@ -154,6 +154,7 @@ cc_library( ":benchmark_utils", ":profiling_listener", "//tensorflow/lite:framework", + "//tensorflow/lite:simple_memory_arena_debug_dump", "//tensorflow/lite:string_util", "//tensorflow/lite/c:common", "//tensorflow/lite/kernels:builtin_ops",