From 432ab5de2f59d2c10c08a7ce08fa4528e83caa79 Mon Sep 17 00:00:00 2001 From: Chao Mei Date: Mon, 9 Aug 2021 16:49:54 -0700 Subject: [PATCH] Dump more memory usage info (like per-layer live tensors etc.) w/ the benchmark tool. 1. Utilize weak symbols as a non-intrusive way to dump the memory planner info, and add such a strong definition to print out tensor's life span, the memory space used at each op execution from the memory arena. 2. Integrate this strong definition w/ the TfLite model benchmark tool. PiperOrigin-RevId: 389755683 Change-Id: Ib53006ad6eee566432f5a412737b7c6278b21e3d --- tensorflow/lite/BUILD | 16 +- tensorflow/lite/arena_planner.cc | 6 + tensorflow/lite/arena_planner.h | 1 + tensorflow/lite/core/subgraph.cc | 5 + tensorflow/lite/core/subgraph.h | 11 + tensorflow/lite/memory_planner.h | 6 + tensorflow/lite/optional_debug_tools.cc | 5 + tensorflow/lite/simple_memory_arena.cc | 14 ++ tensorflow/lite/simple_memory_arena.h | 18 ++ .../lite/simple_memory_arena_debug_dump.cc | 196 ++++++++++++++++++ tensorflow/lite/simple_planner.h | 1 + tensorflow/lite/tools/benchmark/BUILD | 1 + 12 files changed, 279 insertions(+), 1 deletion(-) create mode 100644 tensorflow/lite/simple_memory_arena_debug_dump.cc diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD index 5369a39047fda9..15830e07096857 100644 --- a/tensorflow/lite/BUILD +++ b/tensorflow/lite/BUILD @@ -219,7 +219,21 @@ cc_library( hdrs = ["simple_memory_arena.h"], compatible_with = get_compatible_with_portable(), copts = tflite_copts_warnings(), - deps = ["//tensorflow/lite/c:common"], + deps = [ + ":macros", + "//tensorflow/lite/c:common", + ], +) + +cc_library( + name = "simple_memory_arena_debug_dump", + srcs = ["simple_memory_arena_debug_dump.cc"], + compatible_with = get_compatible_with_portable(), + copts = tflite_copts_warnings(), + deps = [ + ":simple_memory_arena", + ], + alwayslink = 1, ) cc_library( diff --git a/tensorflow/lite/arena_planner.cc b/tensorflow/lite/arena_planner.cc index 
1a4e023e3f7b7a..e715ac2627406b 100644 --- a/tensorflow/lite/arena_planner.cc +++ b/tensorflow/lite/arena_planner.cc @@ -249,6 +249,12 @@ bool ArenaPlanner::HasNonPersistentMemory() { return arena_.GetBufferSize() != 0; } +void ArenaPlanner::DumpDebugInfo(const std::vector& execution_plan) const { + arena_.DumpDebugInfo("kTfLiteArenaRw Dump:", execution_plan); + persistent_arena_.DumpDebugInfo("kTfLiteArenaRwPersistent Dump:", + execution_plan); +} + TfLiteStatus ArenaPlanner::Commit() { TF_LITE_ENSURE_STATUS(arena_.Commit(context_)); TF_LITE_ENSURE_STATUS(persistent_arena_.Commit(context_)); diff --git a/tensorflow/lite/arena_planner.h b/tensorflow/lite/arena_planner.h index 5092b0421c8a6e..1c2f4201800e46 100644 --- a/tensorflow/lite/arena_planner.h +++ b/tensorflow/lite/arena_planner.h @@ -62,6 +62,7 @@ class ArenaPlanner : public MemoryPlanner { TfLiteStatus ReleaseNonPersistentMemory() override; TfLiteStatus AcquireNonPersistentMemory() override; bool HasNonPersistentMemory() override; + void DumpDebugInfo(const std::vector& execution_plan) const override; // Returns the base arena location for a given allocation type. 
std::intptr_t BasePointer(TfLiteAllocationType type); diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc index e42bd9e53ec5cd..7c7d6f8f1e90d9 100644 --- a/tensorflow/lite/core/subgraph.cc +++ b/tensorflow/lite/core/subgraph.cc @@ -1750,6 +1750,11 @@ void Subgraph::SetName(const char* name) { const std::string& Subgraph::GetName() const { return name_; } +void Subgraph::DumpMemoryPlannerDebugInfo() const { + if (memory_planner_ == nullptr) return; + memory_planner_->DumpDebugInfo(execution_plan()); +} + TfLiteStatus Subgraph::PreserveAllTensorsExperimental() { if (memory_planner_) { ReportError( diff --git a/tensorflow/lite/core/subgraph.h b/tensorflow/lite/core/subgraph.h index 990c4944efa081..e149a90fa2a14f 100644 --- a/tensorflow/lite/core/subgraph.h +++ b/tensorflow/lite/core/subgraph.h @@ -359,6 +359,17 @@ class Subgraph { void SetName(const char* name); const std::string& GetName() const; + // WARNING: This is an experimental API and subject to change. + // Dumps debugging info by the underlying memory planner. + // Note: to have minimal binary increase caused by this debug info dump for + // the TfLite library and allow users to plug-in their own memory planner + // debugger, we have utilized weak symbols to meet these two requirements. By + // default, there is no debugging info dumped. However, if the TfLite-provided + // lite:simple_memory_arena_debug_dump (i.e. containing the strong definition) + // is linked to the program, calling this function will output memory usage + // information about tensors and ops. + void DumpMemoryPlannerDebugInfo() const; + private: friend class InterpreterBuilder; friend class TestDelegate; diff --git a/tensorflow/lite/memory_planner.h b/tensorflow/lite/memory_planner.h index e4b6aee26b161b..eebdc49d140a5a 100644 --- a/tensorflow/lite/memory_planner.h +++ b/tensorflow/lite/memory_planner.h @@ -15,6 +15,8 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_MEMORY_PLANNER_H_ #define TENSORFLOW_LITE_MEMORY_PLANNER_H_ +#include + #include "tensorflow/lite/c/common.h" namespace tflite { @@ -59,6 +61,10 @@ class MemoryPlanner { // Returns true if the non-persistent memory is available. virtual bool HasNonPersistentMemory() = 0; + + // Dumps the memory planning information against the specified op node + // execution plan (i.e. `execution_plan`) for the purpose of debugging. + virtual void DumpDebugInfo(const std::vector& execution_plan) const = 0; }; } // namespace tflite diff --git a/tensorflow/lite/optional_debug_tools.cc b/tensorflow/lite/optional_debug_tools.cc index ea33a1d7142bb8..69296d9f1934b2 100644 --- a/tensorflow/lite/optional_debug_tools.cc +++ b/tensorflow/lite/optional_debug_tools.cc @@ -390,6 +390,11 @@ void PrintInterpreterState(const Interpreter* interpreter) { } tensor_mem_info.Print(); + // Dumps debugging info provided by the underlying memory planner. + // Note that this will output nothing unless the + // ":simple_memory_arena_debug_dump" is added as an extra dependence. + subgraph.DumpMemoryPlannerDebugInfo(); + // Going to print out all nodes (i.e. op kernels) in this subgraph. std::vector replaced_node_bits; std::vector replaced_by_node; diff --git a/tensorflow/lite/simple_memory_arena.cc b/tensorflow/lite/simple_memory_arena.cc index 615460d686846d..1c7a03846f5f3e 100644 --- a/tensorflow/lite/simple_memory_arena.cc +++ b/tensorflow/lite/simple_memory_arena.cc @@ -23,9 +23,11 @@ limitations under the License. #include #include #include +#include #include #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/core/macros.h" namespace { @@ -166,4 +168,16 @@ TfLiteStatus SimpleMemoryArena::ReleaseBuffer() { return kTfLiteOk; } +// Using weak symbols to create a pluggable debugging module. 
+TFLITE_ATTRIBUTE_WEAK void DumpArenaInfo( + const std::string& name, const std::vector& execution_plan, + size_t arena_size, const std::vector& allocs) { +} + +void SimpleMemoryArena::DumpDebugInfo( + const std::string& name, const std::vector& execution_plan) const { + tflite::DumpArenaInfo(name, execution_plan, underlying_buffer_size_, + ordered_allocs_); +} + } // namespace tflite diff --git a/tensorflow/lite/simple_memory_arena.h b/tensorflow/lite/simple_memory_arena.h index b83ac606b62ba7..ab086ab2bee0ae 100644 --- a/tensorflow/lite/simple_memory_arena.h +++ b/tensorflow/lite/simple_memory_arena.h @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include #include #include "tensorflow/lite/c/common.h" @@ -107,6 +108,23 @@ class SimpleMemoryArena { return reinterpret_cast(underlying_buffer_aligned_ptr_); } + // Dumps the memory allocation information of this memory arena (which could + // be differentiated from others by the `name`) against the specified op node + // execution plan (i.e. `execution_plan`) for the purpose of debugging. + // Note: in order to have minimal binary increase caused by this debug info + // dump implementation for the TfLite library, and allow users to plug-in + // their own memory planner debugger, we have utilized weak symbols to meet + // these two requirements. By default, there is no debugging info + // dumped. To override this, provide a strong definition of + // tflite::DumpArenaInfo(...) whose weak definition is in + // simple_memory_arena.cc. TfLite provides a sample one as + // "lite:simple_memory_arena_debug_dump". When this dep is added to the + // program, calling this function will output information of this memory arena + // about tensors and ops, such as memory arena utilization rate, live tensors + // at each op etc.
+ void DumpDebugInfo(const std::string& name, + const std::vector& execution_plan) const; + private: bool committed_; size_t arena_alignment_; diff --git a/tensorflow/lite/simple_memory_arena_debug_dump.cc b/tensorflow/lite/simple_memory_arena_debug_dump.cc new file mode 100644 index 00000000000000..256fa0b9a7e04c --- /dev/null +++ b/tensorflow/lite/simple_memory_arena_debug_dump.cc @@ -0,0 +1,196 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include +#include +#include +#include +#include + +#include "tensorflow/lite/simple_memory_arena.h" + +namespace tflite { +namespace { +// Same w/ that defined in tensorflow/lite/arena_planner.cc. 
+constexpr int32_t kNodeNotAssigned = std::numeric_limits::max(); + +void PrintIntVector(const std::vector& v) { + if (v.empty()) { + printf("[]"); + return; + } + + int range_start = v[0]; + int range_end = range_start; + std::function print_range = [&](const char* suffix) { + if (range_end == range_start) { + printf("%d%s", range_start, suffix); + } else if (range_end == range_start + 1) { + printf("%d,%d%s", range_start, range_end, suffix); + } else { + printf("%d-%d%s", range_start, range_end, suffix); + } + }; + + printf("["); + for (int i = 1; i < v.size(); ++i) { + int current = v[i]; + if (current == range_end + 1) { + range_end = current; + } else { + print_range(","); + range_start = range_end = current; + } + } + print_range("]"); +} + +struct PerLayerInfo { + PerLayerInfo() {} + PerLayerInfo(int id, size_t bytes, const std::vector& tensors) + : node_id(id), total_bytes(bytes), live_tensors(tensors) {} + int node_id; + size_t total_bytes = 0; + std::vector live_tensors; +}; +struct PerLayerInfoGreater { + bool operator()(const PerLayerInfo& l, const PerLayerInfo& r) { + return l.total_bytes > r.total_bytes; + } +}; +class PerLayerMinHeap + : public std::priority_queue, + PerLayerInfoGreater> { + public: + // Just to expose iterators to simplify iterating over contained elements. 
+ std::vector::const_iterator begin() const { return c.begin(); } + std::vector::const_iterator end() const { return c.end(); } +}; + +class TopKLayers { + public: + TopKLayers(size_t top_k, size_t arena_size) + : top_k_(top_k), arena_size_(arena_size) {} + + void Add(int node_id, size_t total_bytes, + const std::vector& live_tensors) { + if (topk_usage_.size() < top_k_) { + topk_usage_.emplace(PerLayerInfo(node_id, total_bytes, live_tensors)); + return; + } + if (total_bytes < topk_usage_.top().total_bytes) return; + topk_usage_.pop(); + topk_usage_.emplace(PerLayerInfo(node_id, total_bytes, live_tensors)); + } + + void Print() const { + printf("\nTop %zu memory-consuming layers:\n", + topk_usage_.size() < top_k_ ? topk_usage_.size() : top_k_); + // As we use a min-heap but want to print out usage in decreasing order, we + // use a temporary vector to hold pointers to top memory-consuming layers + // and do a sorting on it. + std::vector tops; + for (const auto& usage : topk_usage_) tops.push_back(&usage); + std::sort(tops.begin(), tops.end(), + [](const PerLayerInfo* l, const PerLayerInfo* r) { + return l->total_bytes > r->total_bytes; + }); + for (const auto* usage : tops) { + printf( + "Node %d: %zu bytes (%.3f MB), utilization rate: %.3f%%, %zu live " + "tensors: ", + usage->node_id, usage->total_bytes, + static_cast(usage->total_bytes) / (1 << 20), + static_cast(usage->total_bytes) / arena_size_ * 100.0, + usage->live_tensors.size()); + PrintIntVector(usage->live_tensors); + printf("\n"); + } + printf("\n"); + } + + private: + const size_t top_k_; + const size_t arena_size_; + PerLayerMinHeap topk_usage_; +}; +} // namespace + +// Corresponding weak declaration found in lite/simple_memory_arena.cc +void DumpArenaInfo(const std::string& name, + const std::vector& execution_plan, size_t arena_size, + const std::vector& allocs) { + if (allocs.empty() || execution_plan.empty()) return; + + const int max_node_id = + *std::max_element(execution_plan.begin(), 
execution_plan.end()); + printf("=== Beginning of %s ===\n", name.c_str()); + printf("Total size is %zu bytes (%.3f MB), holding %zu tensors.\n", + arena_size, static_cast(arena_size) / (1 << 20), allocs.size()); + std::vector max_size_tensors; + size_t max_tensor_size = 0; + for (const auto& alloc_info : allocs) { + printf("tensor %d: life_span: node [%d, %d], size: %zu bytes (%.3f MB).\n", + alloc_info.tensor, alloc_info.first_node, + alloc_info.last_node == kNodeNotAssigned ? max_node_id + : alloc_info.last_node, + alloc_info.size, static_cast(alloc_info.size) / (1 << 20)); + if (alloc_info.size > max_tensor_size) { + max_size_tensors.clear(); + max_size_tensors.push_back(alloc_info.tensor); + max_tensor_size = alloc_info.size; + } else if (alloc_info.size == max_tensor_size) { + max_size_tensors.push_back(alloc_info.tensor); + } + } + std::sort(max_size_tensors.begin(), max_size_tensors.end()); + printf("%zu tensors are of same max size (%zu B (%.3f MB)): ", + max_size_tensors.size(), max_tensor_size, + static_cast(max_tensor_size) / (1 << 20)); + PrintIntVector(max_size_tensors); + + printf("\nPer-layer-info in the order of op execution:\n"); + // A straightforward way of computing per-op memory consumption + // in the order of O(execution_plan.size() * allocs.size()). + std::vector per_op_mem_bytes(execution_plan.size()); + // Track top 5 layers that consume most memory.
+ TopKLayers top_usage(5, arena_size); + for (int i = 0; i < execution_plan.size(); ++i) { + const int node_id = execution_plan[i]; + size_t total_bytes = 0; + std::vector live_tensors; + for (const auto& alloc_info : allocs) { + if (node_id >= alloc_info.first_node && node_id <= alloc_info.last_node) { + total_bytes += alloc_info.size; + live_tensors.push_back(alloc_info.tensor); + } + } + per_op_mem_bytes[i] = total_bytes; + std::sort(live_tensors.begin(), live_tensors.end()); + printf( + "Node %d: %zu bytes (%.3f MB), utilization rate: %.3f%%, %zu live " + "tensors: ", + node_id, total_bytes, static_cast(total_bytes) / (1 << 20), + static_cast(total_bytes) / arena_size * 100.0, + live_tensors.size()); + PrintIntVector(live_tensors); + printf("\n"); + top_usage.Add(node_id, total_bytes, live_tensors); + } + top_usage.Print(); + printf("===End of %s ===\n\n", name.c_str()); +} +} // namespace tflite diff --git a/tensorflow/lite/simple_planner.h b/tensorflow/lite/simple_planner.h index 11a9d66a7e0014..c601844950ff35 100644 --- a/tensorflow/lite/simple_planner.h +++ b/tensorflow/lite/simple_planner.h @@ -90,6 +90,7 @@ class SimplePlanner : public MemoryPlanner { TfLiteStatus ReleaseNonPersistentMemory() override; TfLiteStatus AcquireNonPersistentMemory() override; bool HasNonPersistentMemory() override { return true; }; + void DumpDebugInfo(const std::vector& execution_plan) const override{}; private: // Free all the all allocations. diff --git a/tensorflow/lite/tools/benchmark/BUILD b/tensorflow/lite/tools/benchmark/BUILD index f94fb6e2d388e3..b6e21763c8e082 100644 --- a/tensorflow/lite/tools/benchmark/BUILD +++ b/tensorflow/lite/tools/benchmark/BUILD @@ -154,6 +154,7 @@ cc_library( ":benchmark_utils", ":profiling_listener", "//tensorflow/lite:framework", + "//tensorflow/lite:simple_memory_arena_debug_dump", "//tensorflow/lite:string_util", "//tensorflow/lite/c:common", "//tensorflow/lite/kernels:builtin_ops",