[go: nahoru, domu]

Skip to content

Commit

Permalink
Add ModelFlops to OpMetrics.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 601142809
  • Loading branch information
cliveverghese authored and tensorflower-gardener committed Jan 24, 2024
1 parent a58d93f commit 543f036
Show file tree
Hide file tree
Showing 5 changed files with 17 additions and 27 deletions.
1 change: 1 addition & 0 deletions tensorflow/core/profiler/convert/op_metrics_db_combiner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ void CombineOpMetrics(const OpMetrics& src, OpMetrics* dst,
dst->set_time_ps(src.time_ps() + dst->time_ps());
dst->set_self_time_ps(src.self_time_ps() + dst->self_time_ps());
dst->set_flops(src.flops() + dst->flops());
dst->set_model_flops(src.model_flops() + dst->model_flops());
dst->set_bytes_accessed(src.bytes_accessed() + dst->bytes_accessed());
dst->set_autotuned(dst->autotuned() || src.autotuned());
if (update_num_cores) {
Expand Down
23 changes: 0 additions & 23 deletions tensorflow/core/profiler/convert/op_profile_builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -146,21 +146,6 @@ void FinalizeDeduplicatedNodes(bool by_program, Node* root) {
}
}

// Recursively find computation size for HLOs -- applied only for convolutions.
// This is only for convolutions, not other HLOs, categories or whole programs.
// TODO(b/243596435) Find a permanent fix to this problem.
int64_t GetComputationSize(Node node) {
if (node.has_xla() && node.xla().computation_primitive_size() > 0) {
return node.xla().computation_primitive_size();
}
for (auto child_iter = node.children().rbegin();
child_iter != node.children().rend(); ++child_iter) {
if (const int64_t computation_size = GetComputationSize(*child_iter))
return computation_size;
}
return 0;
}

// Fills op metrics into a node.
void PopulateOpMetricsNode(
const OpMetrics& op_metrics, double peak_gigaflops_per_second_per_core,
Expand Down Expand Up @@ -189,14 +174,6 @@ void PopulateOpMetricsNode(
metrics->set_avg_time_ps(
SafeDivide(op_metrics.time_ps(), op_metrics.occurrences()));

// Hack to approximate utilization for INT8/4 convolution HLOs:
// Since MXU BW is 2x/4x for INT8/4, multiply peak BW by the factor determined
// by the computation size
if (GetComputationSize(*node) == 8) {
peak_gigaflops_per_second_per_core *= 2;
} else if (GetComputationSize(*node) == 4) {
peak_gigaflops_per_second_per_core *= 4;
}
double flops_utilization = SafeDivide(GigaFlopsPerSecondPerCore(op_metrics),
peak_gigaflops_per_second_per_core);
// The UI expects flops_utilization = flop_util / time_fraction. See:
Expand Down
7 changes: 5 additions & 2 deletions tensorflow/core/profiler/protobuf/op_metrics.proto
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ message MemoryAccessBreakdown {
}

// Metrics for an operation (accumulated over all occurrences).
// Next ID: 24
// Next ID: 25
message OpMetrics {
// HLO module id. 0 for TF ops.
uint64 hlo_module_id = 13;
Expand All @@ -84,8 +84,11 @@ message OpMetrics {
uint64 min_time_ps = 17;
// Total self time in picoseconds.
uint64 self_time_ps = 1;
// Total FLOPs.
// Total FLOPs, normalized to the device's peak bandwidth.
uint64 flops = 2;
// Total FLOPs for the model. Can be 0, in which case it is assumed to be the
// same as flops.
uint64 model_flops = 24;
// Total bytes accessed.
uint64 bytes_accessed = 5;
// Breakdown of memory accessed by operation type and memory space.
Expand Down
10 changes: 9 additions & 1 deletion tensorflow/core/profiler/utils/op_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@ void DeviceOpMetricsDbBuilder::EnterOp(
uint64 time_ps, uint64 children_time_ps, int64_t flops,
int64_t bytes_accessed,
const protobuf::RepeatedPtrField<OpMetrics::MemoryAccessed>&
memory_accessed_breakdown) {
memory_accessed_breakdown,
int64_t model_flops) {
uint64 self_time_ps = time_ps - children_time_ps;
DCHECK_GE(time_ps, self_time_ps);
OpMetrics* op_metrics = LookupOrInsertNewOpMetrics(program_id, name);
Expand All @@ -95,6 +96,13 @@ void DeviceOpMetricsDbBuilder::EnterOp(
op_metrics->set_time_ps(op_metrics->time_ps() + time_ps);
op_metrics->set_self_time_ps(op_metrics->self_time_ps() + self_time_ps);
op_metrics->set_flops(op_metrics->flops() + flops * occurrences);
if (model_flops == 0) {
// If model_flops is 0, use the same value as the device flops.
op_metrics->set_model_flops(op_metrics->flops());
} else {
op_metrics->set_model_flops(op_metrics->model_flops() +
model_flops * occurrences);
}
op_metrics->set_bytes_accessed(op_metrics->bytes_accessed() +
bytes_accessed * occurrences);
CombineMemoryAccessedBreakdown(
Expand Down
3 changes: 2 additions & 1 deletion tensorflow/core/profiler/utils/op_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,8 @@ class DeviceOpMetricsDbBuilder : public OpMetricsDbBuilder {
bool is_eager, uint64 occurrences, uint64 time_ps,
uint64 children_time_ps, int64_t flops, int64_t bytes_accessed,
const protobuf::RepeatedPtrField<OpMetrics::MemoryAccessed>&
memory_accessed_breakdown = {});
memory_accessed_breakdown = {},
int64_t model_flops = 0);
};

} // namespace profiler
Expand Down

0 comments on commit 543f036

Please sign in to comment.