[go: nahoru, domu]

Skip to content

Commit

Permalink
Dump more memory usage info (like per-layer live tensors etc.) w/ the…
Browse files Browse the repository at this point in the history
… benchmark tool.

1. Utilize weak symbols as a non-intrusive way to dump the memory planner info, and add such a strong definition to print out tensor's life span, the memory space used at each op execution from the memory arena.

2. Integrate this strong definition w/ the TfLite model benchmark tool.

PiperOrigin-RevId: 389755683
Change-Id: Ib53006ad6eee566432f5a412737b7c6278b21e3d
  • Loading branch information
multiverse-tf authored and tensorflower-gardener committed Aug 9, 2021
1 parent b13084a commit 432ab5d
Show file tree
Hide file tree
Showing 12 changed files with 279 additions and 1 deletion.
16 changes: 15 additions & 1 deletion tensorflow/lite/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,21 @@ cc_library(
hdrs = ["simple_memory_arena.h"],
compatible_with = get_compatible_with_portable(),
copts = tflite_copts_warnings(),
deps = ["//tensorflow/lite/c:common"],
deps = [
":macros",
"//tensorflow/lite/c:common",
],
)

cc_library(
name = "simple_memory_arena_debug_dump",
srcs = ["simple_memory_arena_debug_dump.cc"],
compatible_with = get_compatible_with_portable(),
copts = tflite_copts_warnings(),
deps = [
":simple_memory_arena",
],
alwayslink = 1,
)

cc_library(
Expand Down
6 changes: 6 additions & 0 deletions tensorflow/lite/arena_planner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,12 @@ bool ArenaPlanner::HasNonPersistentMemory() {
return arena_.GetBufferSize() != 0;
}

// Dumps allocation/usage details for both arenas owned by this planner,
// keyed by the given op execution order.
void ArenaPlanner::DumpDebugInfo(const std::vector<int>& execution_plan) const {
  // Dump the non-persistent (kTfLiteArenaRw) arena first, then the
  // persistent one, so the two sections appear in a fixed order.
  const std::vector<int>& plan = execution_plan;
  arena_.DumpDebugInfo("kTfLiteArenaRw Dump:", plan);
  persistent_arena_.DumpDebugInfo("kTfLiteArenaRwPersistent Dump:", plan);
}

TfLiteStatus ArenaPlanner::Commit() {
TF_LITE_ENSURE_STATUS(arena_.Commit(context_));
TF_LITE_ENSURE_STATUS(persistent_arena_.Commit(context_));
Expand Down
1 change: 1 addition & 0 deletions tensorflow/lite/arena_planner.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ class ArenaPlanner : public MemoryPlanner {
TfLiteStatus ReleaseNonPersistentMemory() override;
TfLiteStatus AcquireNonPersistentMemory() override;
bool HasNonPersistentMemory() override;
void DumpDebugInfo(const std::vector<int>& execution_plan) const override;

// Returns the base arena location for a given allocation type.
std::intptr_t BasePointer(TfLiteAllocationType type);
Expand Down
5 changes: 5 additions & 0 deletions tensorflow/lite/core/subgraph.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1750,6 +1750,11 @@ void Subgraph::SetName(const char* name) {

const std::string& Subgraph::GetName() const { return name_; }

// Asks the underlying memory planner (if one exists) to dump its debugging
// info; a no-op when no planner has been created yet.
void Subgraph::DumpMemoryPlannerDebugInfo() const {
  if (memory_planner_ != nullptr) {
    memory_planner_->DumpDebugInfo(execution_plan());
  }
}

TfLiteStatus Subgraph::PreserveAllTensorsExperimental() {
if (memory_planner_) {
ReportError(
Expand Down
11 changes: 11 additions & 0 deletions tensorflow/lite/core/subgraph.h
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,17 @@ class Subgraph {
void SetName(const char* name);
const std::string& GetName() const;

// WARNING: This is an experimental API and subject to change.
// Dumps debugging info by the underlying memory planner.
// Note: to have minimal binary increase caused by this debug info dump for
// the TfLite library and allow users to plug-in their own memory planner
// debugger, we have utilized weak symbols to meet these two requirements. By
// default, there is no debugging info dumped. However, if the TfLite-provided
// lite:simple_memory_arena_debug_dump (i.e. containing the strong definition)
// is linked to the program, calling this function will output memory usage
// information about tensors and ops.
void DumpMemoryPlannerDebugInfo() const;

private:
friend class InterpreterBuilder;
friend class TestDelegate;
Expand Down
6 changes: 6 additions & 0 deletions tensorflow/lite/memory_planner.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ limitations under the License.
#ifndef TENSORFLOW_LITE_MEMORY_PLANNER_H_
#define TENSORFLOW_LITE_MEMORY_PLANNER_H_

#include <vector>

#include "tensorflow/lite/c/common.h"

namespace tflite {
Expand Down Expand Up @@ -59,6 +61,10 @@ class MemoryPlanner {

// Returns true if the non-persistent memory is available.
virtual bool HasNonPersistentMemory() = 0;

// Dumps the memory planning information against the specified op node
// execution plan (i.e. `execution_plan`) for the purpose of debugging.
virtual void DumpDebugInfo(const std::vector<int>& execution_plan) const = 0;
};

} // namespace tflite
Expand Down
5 changes: 5 additions & 0 deletions tensorflow/lite/optional_debug_tools.cc
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,11 @@ void PrintInterpreterState(const Interpreter* interpreter) {
}
tensor_mem_info.Print();

// Dumps debugging info provided by the underlying memory planner.
// Note that this will output nothing unless the
// ":simple_memory_arena_debug_dump" is added as an extra dependency.
subgraph.DumpMemoryPlannerDebugInfo();

// Going to print out all nodes (i.e. op kernels) in this subgraph.
std::vector<bool> replaced_node_bits;
std::vector<size_t> replaced_by_node;
Expand Down
14 changes: 14 additions & 0 deletions tensorflow/lite/simple_memory_arena.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,11 @@ limitations under the License.
#include <iterator>
#include <limits>
#include <memory>
#include <string>
#include <vector>

#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/core/macros.h"

namespace {

Expand Down Expand Up @@ -166,4 +168,16 @@ TfLiteStatus SimpleMemoryArena::ReleaseBuffer() {
return kTfLiteOk;
}

// Using weak symbols to create a pluggable debugging module.
// This default (weak) definition intentionally does nothing, keeping the
// binary-size cost of debug dumping at zero. Linking a strong definition of
// tflite::DumpArenaInfo (e.g. the one in simple_memory_arena_debug_dump.cc)
// overrides it and performs the actual printing.
TFLITE_ATTRIBUTE_WEAK void DumpArenaInfo(
    const std::string& name, const std::vector<int>& execution_plan,
    size_t arena_size, const std::vector<ArenaAllocWithUsageInterval>& allocs) {
}

// Forwards this arena's total size and allocation records to DumpArenaInfo.
// With only the weak definition linked in (the default), this is a no-op.
void SimpleMemoryArena::DumpDebugInfo(
    const std::string& name, const std::vector<int>& execution_plan) const {
  tflite::DumpArenaInfo(name, execution_plan, underlying_buffer_size_,
                        ordered_allocs_);
}

} // namespace tflite
18 changes: 18 additions & 0 deletions tensorflow/lite/simple_memory_arena.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ limitations under the License.

#include <cstdint>
#include <memory>
#include <string>
#include <vector>

#include "tensorflow/lite/c/common.h"
Expand Down Expand Up @@ -107,6 +108,23 @@ class SimpleMemoryArena {
return reinterpret_cast<std::intptr_t>(underlying_buffer_aligned_ptr_);
}

// Dumps the memory allocation information of this memory arena (which could
// be differentiated from others by the `name`) against the specified op node
// execution plan (i.e. `execution_plan`) for the purpose of debugging.
// Note: in order to have minimal binary increase caused by this debug info
// dump implementation for the TfLite library, and allow users to plug-in
// their own memory planner debugger, we have utilized weak symbols to meet
// these two requirements. By default, there is no debugging info
// dumped. To override this, provide a strong definition of
// tflite::DumpArenaInfo(...) whose weak definition is in
// simple_memory_arena.cc. TfLite provides a sample one as
// "lite:simple_memory_arena_debug_dump". When this dep is added to the
// program, calling this function will output information of this memory arena
// about tensors and ops, such as memory arena utilization rate, live tensors
// at each op etc.
void DumpDebugInfo(const std::string& name,
const std::vector<int>& execution_plan) const;

private:
bool committed_;
size_t arena_alignment_;
Expand Down
196 changes: 196 additions & 0 deletions tensorflow/lite/simple_memory_arena_debug_dump.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <functional>
#include <limits>
#include <queue>
#include <string>
#include <vector>

#include "tensorflow/lite/simple_memory_arena.h"

namespace tflite {
namespace {
// Same w/ that defined in tensorflow/lite/arena_planner.cc.
constexpr int32_t kNodeNotAssigned = std::numeric_limits<int32_t>::max();

// Prints `v` to stdout as a compact range list, e.g. {1,2,3,5} -> "[1-3,5]".
// Runs of consecutive values of length 1, 2 and >=3 are rendered as "a",
// "a,b" and "a-b" respectively; an empty vector prints "[]".
void PrintIntVector(const std::vector<int>& v) {
  if (v.empty()) {
    printf("[]");
    return;
  }

  printf("[");
  size_t i = 0;
  while (i < v.size()) {
    const int run_start = v[i];
    int run_end = run_start;
    // Greedily extend the current run while the next value is consecutive.
    while (i + 1 < v.size() && v[i + 1] == run_end + 1) {
      run_end = v[++i];
    }
    ++i;
    // Separate runs with ',' and close the list after the final run.
    const char* suffix = (i < v.size()) ? "," : "]";
    if (run_end == run_start) {
      printf("%d%s", run_start, suffix);
    } else if (run_end == run_start + 1) {
      printf("%d,%d%s", run_start, run_end, suffix);
    } else {
      printf("%d-%d%s", run_start, run_end, suffix);
    }
  }
}

// Memory-usage summary for a single op (node) in the execution plan.
struct PerLayerInfo {
  PerLayerInfo() = default;
  PerLayerInfo(int id, size_t bytes, const std::vector<int>& tensors)
      : node_id(id), total_bytes(bytes), live_tensors(tensors) {}
  // Node id from the execution plan. Initialized to -1 so a
  // default-constructed entry is recognizably invalid (the original left
  // this member uninitialized).
  int node_id = -1;
  // Total bytes of all tensors live while this node executes.
  size_t total_bytes = 0;
  // Ids of the tensors live while this node executes.
  std::vector<int> live_tensors;
};
// Comparator ordering PerLayerInfo by descending total_bytes, so that a
// std::priority_queue using it behaves as a min-heap (smallest usage on top).
struct PerLayerInfoGreater {
  // Marked const (the original call operator was not) so the comparator is
  // usable on const heap objects; a stateless comparator should be const.
  bool operator()(const PerLayerInfo& l, const PerLayerInfo& r) const {
    return l.total_bytes > r.total_bytes;
  }
};
class PerLayerMinHeap
: public std::priority_queue<PerLayerInfo, std::vector<PerLayerInfo>,
PerLayerInfoGreater> {
public:
// Just to expose iterators to simplify iterating over contained elements.
std::vector<PerLayerInfo>::const_iterator begin() const { return c.begin(); }
std::vector<PerLayerInfo>::const_iterator end() const { return c.end(); }
};

// Tracks the `top_k` most memory-consuming layers (op nodes) seen so far and
// prints them in decreasing order of usage.
class TopKLayers {
 public:
  // `arena_size` is the total arena size in bytes, used to compute each
  // layer's utilization rate when printing.
  TopKLayers(size_t top_k, size_t arena_size)
      : top_k_(top_k), arena_size_(arena_size) {}

  // Records the memory usage of one node, keeping only the top_k largest
  // entries seen so far.
  void Add(int node_id, size_t total_bytes,
           const std::vector<int>& live_tensors) {
    if (topk_usage_.size() < top_k_) {
      // Heap not full yet: always keep the entry. Construct in place rather
      // than building a temporary PerLayerInfo and moving it.
      topk_usage_.emplace(node_id, total_bytes, live_tensors);
      return;
    }
    // Heap full: replace the current minimum only if this node uses at least
    // as much memory (ties evict the older minimum, matching the original).
    if (total_bytes < topk_usage_.top().total_bytes) return;
    topk_usage_.pop();
    topk_usage_.emplace(node_id, total_bytes, live_tensors);
  }

  // Prints the tracked layers in decreasing order of total_bytes.
  void Print() const {
    printf("\nTop %zu memory-consuming layers:\n",
           topk_usage_.size() < top_k_ ? topk_usage_.size() : top_k_);
    // As we use a min-heap but want to print out usage in decreasing order,
    // we sort a temporary vector of pointers to the contained elements
    // instead of copying them.
    std::vector<const PerLayerInfo*> tops;
    for (const auto& usage : topk_usage_) tops.push_back(&usage);
    std::sort(tops.begin(), tops.end(),
              [](const PerLayerInfo* l, const PerLayerInfo* r) {
                return l->total_bytes > r->total_bytes;
              });
    for (const auto* usage : tops) {
      printf(
          "Node %d: %zu bytes (%.3f MB), utilization rate: %.3f%%, %zu live "
          "tensors: ",
          usage->node_id, usage->total_bytes,
          static_cast<float>(usage->total_bytes) / (1 << 20),
          static_cast<float>(usage->total_bytes) / arena_size_ * 100.0,
          usage->live_tensors.size());
      PrintIntVector(usage->live_tensors);
      printf("\n");
    }
    printf("\n");
  }

 private:
  const size_t top_k_;
  const size_t arena_size_;
  PerLayerMinHeap topk_usage_;
};
} // namespace

// Corresponding weak declaration found in lite/simple_memory_arena.cc.
// Strong definition: prints, for the arena named `name`:
//  - total arena size and each tensor's life span and size,
//  - the largest tensor(s),
//  - per-node memory usage (bytes + live tensors) in execution order,
//  - the top-5 memory-consuming nodes.
void DumpArenaInfo(const std::string& name,
                   const std::vector<int>& execution_plan, size_t arena_size,
                   const std::vector<ArenaAllocWithUsageInterval>& allocs) {
  if (allocs.empty() || execution_plan.empty()) return;

  const int max_node_id =
      *std::max_element(execution_plan.begin(), execution_plan.end());

  printf("=== Beginning of %s ===\n", name.c_str());
  printf("Total size is %zu bytes (%.3f MB), holding %zu tensors.\n",
         arena_size, static_cast<float>(arena_size) / (1 << 20), allocs.size());
  // Track the tensor(s) of maximum size while printing each tensor's info.
  std::vector<int> max_size_tensors;
  size_t max_tensor_size = 0;
  for (const auto& alloc_info : allocs) {
    // A last_node of kNodeNotAssigned means the tensor stays alive to the
    // end of the plan; report the last node id instead.
    printf("tensor %d: life_span: node [%d, %d], size: %zu bytes (%.3f MB).\n",
           alloc_info.tensor, alloc_info.first_node,
           alloc_info.last_node == kNodeNotAssigned ? max_node_id
                                                    : alloc_info.last_node,
           alloc_info.size, static_cast<float>(alloc_info.size) / (1 << 20));
    if (alloc_info.size > max_tensor_size) {
      max_size_tensors.clear();
      max_size_tensors.push_back(alloc_info.tensor);
      max_tensor_size = alloc_info.size;
    } else if (alloc_info.size == max_tensor_size) {
      max_size_tensors.push_back(alloc_info.tensor);
    }
  }
  std::sort(max_size_tensors.begin(), max_size_tensors.end());
  printf("%zu tensors are of same max size (%zu B (%.3f MB)): ",
         max_size_tensors.size(), max_tensor_size,
         static_cast<float>(max_tensor_size) / (1 << 20));
  PrintIntVector(max_size_tensors);

  printf("\nPer-layer-info in the order of op execution:\n");
  // A straightforward O(execution_plan.size() * allocs.size()) computation of
  // per-op memory consumption. (The original also filled a per_op_mem_bytes
  // vector that was never read; it has been removed.)
  // Track the top 5 layers that consume the most memory.
  TopKLayers top_usage(5, arena_size);
  for (size_t i = 0; i < execution_plan.size(); ++i) {
    const int node_id = execution_plan[i];
    size_t total_bytes = 0;
    std::vector<int> live_tensors;
    for (const auto& alloc_info : allocs) {
      // A tensor is live at this node iff the node lies within its life span.
      if (node_id >= alloc_info.first_node && node_id <= alloc_info.last_node) {
        total_bytes += alloc_info.size;
        live_tensors.push_back(alloc_info.tensor);
      }
    }
    std::sort(live_tensors.begin(), live_tensors.end());
    printf(
        "Node %d: %zu bytes (%.3f MB), utilization rate: %.3f%%, %zu live "
        "tensors: ",
        node_id, total_bytes, static_cast<float>(total_bytes) / (1 << 20),
        static_cast<float>(total_bytes) / arena_size * 100.0,
        live_tensors.size());
    PrintIntVector(live_tensors);
    printf("\n");
    top_usage.Add(node_id, total_bytes, live_tensors);
  }
  top_usage.Print();
  // Spacing fixed to match the "=== Beginning of" banner above.
  printf("=== End of %s ===\n\n", name.c_str());
}
} // namespace tflite
1 change: 1 addition & 0 deletions tensorflow/lite/simple_planner.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ class SimplePlanner : public MemoryPlanner {
TfLiteStatus ReleaseNonPersistentMemory() override;
TfLiteStatus AcquireNonPersistentMemory() override;
bool HasNonPersistentMemory() override { return true; };
void DumpDebugInfo(const std::vector<int>& execution_plan) const override{};

private:
// Free all the all allocations.
Expand Down
1 change: 1 addition & 0 deletions tensorflow/lite/tools/benchmark/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ cc_library(
":benchmark_utils",
":profiling_listener",
"//tensorflow/lite:framework",
"//tensorflow/lite:simple_memory_arena_debug_dump",
"//tensorflow/lite:string_util",
"//tensorflow/lite/c:common",
"//tensorflow/lite/kernels:builtin_ops",
Expand Down

0 comments on commit 432ab5d

Please sign in to comment.