Add ModelFlops to OpMetrics.

PiperOrigin-RevId: 601142809
tensorflow · Jan 24, 2024 · 543f036 · 543f036
1 parent a58d93f
commit 543f036
Show file tree

Hide file tree

Showing 5 changed files with 17 additions and 27 deletions.
diff --git a/tensorflow/core/profiler/convert/op_metrics_db_combiner.cc b/tensorflow/core/profiler/convert/op_metrics_db_combiner.cc
@@ -72,6 +72,7 @@ void CombineOpMetrics(const OpMetrics& src, OpMetrics* dst,
   dst->set_time_ps(src.time_ps() + dst->time_ps());
   dst->set_self_time_ps(src.self_time_ps() + dst->self_time_ps());
   dst->set_flops(src.flops() + dst->flops());
+  dst->set_model_flops(src.model_flops() + dst->model_flops());
   dst->set_bytes_accessed(src.bytes_accessed() + dst->bytes_accessed());
   dst->set_autotuned(dst->autotuned() || src.autotuned());
   if (update_num_cores) {

diff --git a/tensorflow/core/profiler/convert/op_profile_builder.cc b/tensorflow/core/profiler/convert/op_profile_builder.cc
@@ -146,21 +146,6 @@ void FinalizeDeduplicatedNodes(bool by_program, Node* root) {
   }
 }
 
-// Recursively find computation size for HLOs -- applied only for convolutions.
-// This is only for convolutions, not other HLOs, categories or whole programs.
-// TODO(b/243596435) Find a permanent fix to this problem.
-int64_t GetComputationSize(Node node) {
-  if (node.has_xla() && node.xla().computation_primitive_size() > 0) {
-    return node.xla().computation_primitive_size();
-  }
-  for (auto child_iter = node.children().rbegin();
-       child_iter != node.children().rend(); ++child_iter) {
-    if (const int64_t computation_size = GetComputationSize(*child_iter))
-      return computation_size;
-  }
-  return 0;
-}
-
 // Fills op metrics into a node.
 void PopulateOpMetricsNode(
     const OpMetrics& op_metrics, double peak_gigaflops_per_second_per_core,
@@ -189,14 +174,6 @@ void PopulateOpMetricsNode(
   metrics->set_avg_time_ps(
       SafeDivide(op_metrics.time_ps(), op_metrics.occurrences()));
 
-  // Hack to approximate utilization for INT8/4 convolution HLOs:
-  // Since MXU BW is 2x/4x for INT8/4, multiply peak BW by the factor determined
-  // by the computation size
-  if (GetComputationSize(*node) == 8) {
-    peak_gigaflops_per_second_per_core *= 2;
-  } else if (GetComputationSize(*node) == 4) {
-    peak_gigaflops_per_second_per_core *= 4;
-  }
   double flops_utilization = SafeDivide(GigaFlopsPerSecondPerCore(op_metrics),
                                         peak_gigaflops_per_second_per_core);
   // The UI expects flops_utilization = flop_util / time_fraction. See:

diff --git a/tensorflow/core/profiler/protobuf/op_metrics.proto b/tensorflow/core/profiler/protobuf/op_metrics.proto
@@ -62,7 +62,7 @@ message MemoryAccessBreakdown {
 }
 
 // Metrics for an operation (accumulated over all occurrences).
-// Next ID: 24
+// Next ID: 25
 message OpMetrics {
   // HLO module id. 0 for TF ops.
   uint64 hlo_module_id = 13;
@@ -84,8 +84,11 @@ message OpMetrics {
   uint64 min_time_ps = 17;
   // Total self time in picoseconds.
   uint64 self_time_ps = 1;
-  // Total FLOPs.
+  // Total FLOPs. Normalized to the device's peak bandwidth.
   uint64 flops = 2;
+  // Total FLOPs for the model. Can be 0, in which case assume it's the same
+  // as flops.
+  uint64 model_flops = 24;
   // Total bytes accessed.
   uint64 bytes_accessed = 5;
   // Breakdown of memory accessed by operation type and memory space.

diff --git a/tensorflow/core/profiler/utils/op_utils.cc b/tensorflow/core/profiler/utils/op_utils.cc
@@ -79,7 +79,8 @@ void DeviceOpMetricsDbBuilder::EnterOp(
     uint64 time_ps, uint64 children_time_ps, int64_t flops,
     int64_t bytes_accessed,
     const protobuf::RepeatedPtrField<OpMetrics::MemoryAccessed>&
-        memory_accessed_breakdown) {
+        memory_accessed_breakdown,
+    int64_t model_flops) {
   uint64 self_time_ps = time_ps - children_time_ps;
   DCHECK_GE(time_ps, self_time_ps);
   OpMetrics* op_metrics = LookupOrInsertNewOpMetrics(program_id, name);
@@ -95,6 +96,13 @@ void DeviceOpMetricsDbBuilder::EnterOp(
   op_metrics->set_time_ps(op_metrics->time_ps() + time_ps);
   op_metrics->set_self_time_ps(op_metrics->self_time_ps() + self_time_ps);
   op_metrics->set_flops(op_metrics->flops() + flops * occurrences);
+  if (model_flops == 0) {
+    // If model_flops is 0, mirror the accumulated device flops value.
+    op_metrics->set_model_flops(op_metrics->flops());
+  } else {
+    op_metrics->set_model_flops(op_metrics->model_flops() +
+                                model_flops * occurrences);
+  }
   op_metrics->set_bytes_accessed(op_metrics->bytes_accessed() +
                                  bytes_accessed * occurrences);
   CombineMemoryAccessedBreakdown(

diff --git a/tensorflow/core/profiler/utils/op_utils.h b/tensorflow/core/profiler/utils/op_utils.h
@@ -76,7 +76,8 @@ class DeviceOpMetricsDbBuilder : public OpMetricsDbBuilder {
                bool is_eager, uint64 occurrences, uint64 time_ps,
                uint64 children_time_ps, int64_t flops, int64_t bytes_accessed,
                const protobuf::RepeatedPtrField<OpMetrics::MemoryAccessed>&
-                   memory_accessed_breakdown = {});
+                   memory_accessed_breakdown = {},
+               int64_t model_flops = 0);
 };
 
 }  // namespace profiler