From e1e1de22a7e7c9e9f9ad512afeccd3ad64165e6b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 3 Apr 2024 10:31:52 -0700 Subject: [PATCH] XProf GPU: Using Per-Thread callback api data for CUPTI collector for better overhead. PiperOrigin-RevId: 621568181 --- .../xla/xla/backends/profiler/gpu/BUILD | 4 + .../profiler/gpu/cupti_buffer_events.cc | 100 ++++-- .../profiler/gpu/cupti_buffer_events.h | 98 ++++-- .../backends/profiler/gpu/cupti_collector.cc | 57 +++- .../backends/profiler/gpu/cupti_collector.h | 14 +- .../xla/backends/profiler/gpu/cupti_tracer.cc | 296 ++++++++++-------- .../xla/backends/profiler/gpu/cupti_tracer.h | 23 +- 7 files changed, 403 insertions(+), 189 deletions(-) diff --git a/third_party/xla/xla/backends/profiler/gpu/BUILD b/third_party/xla/xla/backends/profiler/gpu/BUILD index 37c5f6c06c9918..cf16d0099335e7 100644 --- a/third_party/xla/xla/backends/profiler/gpu/BUILD +++ b/third_party/xla/xla/backends/profiler/gpu/BUILD @@ -183,6 +183,8 @@ tsl_gpu_library( "@local_tsl//tsl/platform:platform_port", "@local_tsl//tsl/platform:types", "@local_tsl//tsl/profiler/backends/cpu:annotation_stack", + "@local_tsl//tsl/profiler/utils:lock_free_queue", + "@local_tsl//tsl/profiler/utils:per_thread", ], ) @@ -302,6 +304,7 @@ tsl_gpu_library( "@local_tsl//tsl/platform:mutex", "@local_tsl//tsl/platform:platform_port", "@local_tsl//tsl/profiler/protobuf:xplane_proto_cc", + "@local_tsl//tsl/profiler/utils:lock_free_queue", "@local_tsl//tsl/profiler/utils:parse_annotation", "@local_tsl//tsl/profiler/utils:trace_utils", "@local_tsl//tsl/profiler/utils:xplane_builder", @@ -328,6 +331,7 @@ tsl_gpu_library( "@local_tsl//tsl/platform:platform_port", "@local_tsl//tsl/platform:thread_annotations", "@local_tsl//tsl/profiler/utils:buffer_pool", + "@local_tsl//tsl/profiler/utils:lock_free_queue", ] + if_cuda(["//xla/tsl/cuda:cupti"]), ) diff --git a/third_party/xla/xla/backends/profiler/gpu/cupti_buffer_events.cc 
b/third_party/xla/xla/backends/profiler/gpu/cupti_buffer_events.cc index 047e04a8063f8a..933e1ef311d864 100644 --- a/third_party/xla/xla/backends/profiler/gpu/cupti_buffer_events.cc +++ b/third_party/xla/xla/backends/profiler/gpu/cupti_buffer_events.cc @@ -373,7 +373,8 @@ static absl::Status ConvertActivityBuffer( CUptiResult status = cupti_interface->ActivityGetNextRecord(buffer, size, &record); if (status == CUPTI_SUCCESS) { - if (total_activity_event_count >= max_activity_event_count) { + if (max_activity_event_count > 0 && + total_activity_event_count >= max_activity_event_count) { dropped_activity_event_count++; continue; } @@ -438,33 +439,30 @@ static absl::Status ConvertActivityBuffer( } // namespace -void AnnotationMap::Add(uint32_t device_id, uint32_t correlation_id, - const absl::string_view annotation, - const absl::string_view nvtx_range) { - if (annotation.empty() && nvtx_range.empty()) return; - VLOG(3) << "Add annotation: device_id: " << device_id - << " correlation_id: " << correlation_id - << " annotation: " << annotation; - if (device_id >= per_device_map_.size()) return; - auto &per_device_map = per_device_map_[device_id]; - tsl::mutex_lock lock(per_device_map.mutex); - if (per_device_map.annotations.size() < max_size_) { - AnnotationInfo info; - info.annotation = *per_device_map.annotations.emplace(annotation).first; - if (!nvtx_range.empty()) - info.nvtx_range = *per_device_map.nvtx_ranges.emplace(nvtx_range).first; - per_device_map.correlation_map.emplace(correlation_id, info); - } +absl::string_view StringDeduper::Dedup(absl::string_view str, + size_t max_unique_count) { + if (str.empty()) return absl::string_view(); + if (max_unique_count == 0 || strings_.size() < max_unique_count) + return *strings_.emplace(str).first; + auto it = strings_.find(str); + if (it != strings_.end()) return *it; + return absl::string_view(); +} + +void AnnotationMap::AddAnnotation(uint32_t correlation_id, + absl::string_view annotation, + absl::string_view 
nvtx_range) { + auto annotation_view = string_deduper_.Dedup(annotation); + auto nvtx_range_view = string_deduper_.Dedup(nvtx_range); + if (annotation_view.empty() && nvtx_range_view.empty()) return; + map_.emplace(correlation_id, + AnnotationInfo{annotation_view, nvtx_range_view}); } -AnnotationMap::AnnotationInfo AnnotationMap::LookUp(uint32_t device_id, - uint32_t correlation_id) { - if (device_id >= per_device_map_.size()) return AnnotationInfo(); - auto &per_device_map = per_device_map_[device_id]; - tsl::mutex_lock lock(per_device_map.mutex); - auto it = per_device_map.correlation_map.find(correlation_id); - return it != per_device_map.correlation_map.end() ? it->second - : AnnotationInfo(); +AnnotationMap::AnnotationInfo AnnotationMap::LookUp( + uint32_t device_id, uint32_t correlation_id) const { + const auto it = map_.find(correlation_id); + return it != map_.end() ? it->second : AnnotationInfo(); } CuptiActivityBufferManager::ActivityBufferAndSize::ActivityBufferAndSize( @@ -493,5 +491,55 @@ void CuptiActivityBufferManager::AddCachedActivityEventsTo( } } +CallbackAnnotationsAndEvents::CallbackAnnotationsAndEvents( + CallbackAnnotationsAndEvents &&another) + : annotations_(std::move(another.annotations_)), + nvtx_ranges_(std::move(another.nvtx_ranges_)), + num_dropped_events_(another.num_dropped_events_), + event_annotation_queue_(std::move(another.event_annotation_queue_)) { + another.Clear(); +} + +CallbackAnnotationsAndEvents &CallbackAnnotationsAndEvents::operator=( + CallbackAnnotationsAndEvents &&another) { + annotations_ = std::move(another.annotations_); + nvtx_ranges_ = std::move(another.nvtx_ranges_); + num_dropped_events_ = another.num_dropped_events_; + event_annotation_queue_ = std::move(another.event_annotation_queue_); + another.Clear(); + return *this; +} + +bool CallbackAnnotationsAndEvents::PrepareAnnotation( + uint32_t device_id, uint32_t correlation_id, size_t max_annotation_strings, + size_t max_callback_api_events, + std::atomic 
&callback_api_event_count, + absl::string_view &annotation, absl::string_view &nvtx_range) { + if (max_callback_api_events == 0 || + callback_api_event_count < max_callback_api_events) { + ++callback_api_event_count; + // Some logic change as no cross thread string comparison should be + // made here. The max_annotation_strings is used to limit per-thread + // annotation string count. And annotation string is not collected + // if total callback event could overflow. + bool too_many_annotations = (max_annotation_strings > 0) && + (annotations_.size() >= max_annotation_strings); + annotation = annotations_.Dedup(too_many_annotations ? absl::string_view() + : annotation); + nvtx_range = nvtx_ranges_.Dedup(too_many_annotations ? absl::string_view() + : nvtx_range); + return true; + } + num_dropped_events_++; + return false; +} + +void CallbackAnnotationsAndEvents::Clear() { + annotations_.clear(); + nvtx_ranges_.clear(); + num_dropped_events_ = 0; + event_annotation_queue_.Clear(); +} + } // namespace profiler } // namespace xla diff --git a/third_party/xla/xla/backends/profiler/gpu/cupti_buffer_events.h b/third_party/xla/xla/backends/profiler/gpu/cupti_buffer_events.h index 4ffb4f0ef626e2..f950bebb5f4096 100644 --- a/third_party/xla/xla/backends/profiler/gpu/cupti_buffer_events.h +++ b/third_party/xla/xla/backends/profiler/gpu/cupti_buffer_events.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef XLA_BACKENDS_PROFILER_GPU_CUPTI_BUFFER_EVENTS_H_ #define XLA_BACKENDS_PROFILER_GPU_CUPTI_BUFFER_EVENTS_H_ +#include #include #include #include @@ -25,7 +26,6 @@ limitations under the License. #include #include -#include "absl/container/fixed_array.h" #include "absl/container/flat_hash_map.h" #include "absl/container/node_hash_set.h" #include "absl/strings/str_cat.h" @@ -33,6 +33,7 @@ limitations under the License. 
#include "tsl/platform/mutex.h" #include "tsl/platform/thread_annotations.h" #include "tsl/profiler/utils/buffer_pool.h" +#include "tsl/profiler/utils/lock_free_queue.h" namespace xla { namespace profiler { @@ -217,6 +218,16 @@ struct CuptiTracerEvent { }; }; +class StringDeduper { + public: + void clear() { strings_.clear(); } + absl::string_view Dedup(absl::string_view str, size_t max_unique_count = 0); + size_t size() const { return strings_.size(); } + + private: + absl::node_hash_set strings_; +}; + class AnnotationMap { public: struct AnnotationInfo { @@ -224,29 +235,18 @@ class AnnotationMap { absl::string_view nvtx_range; }; - explicit AnnotationMap(uint64_t max_size, uint32_t num_gpus) - : max_size_(max_size), per_device_map_(num_gpus) {} - void Add(uint32_t device_id, uint32_t correlation_id, - absl::string_view annotation, absl::string_view nvtx_range); - AnnotationInfo LookUp(uint32_t device_id, uint32_t correlation_id); + void clear() { + map_.clear(); + string_deduper_.clear(); + } + size_t size() const { return map_.size(); } + void AddAnnotation(uint32_t correlation_id, absl::string_view annotation, + absl::string_view nvtx_range); + AnnotationInfo LookUp(uint32_t device_id, uint32_t correlation_id) const; private: - struct PerDeviceAnnotationMap { - // The population/consumption of annotations might happen from multiple - // callback/activity api related threads. - tsl::mutex mutex; - // Annotation tends to be repetitive, use a hash_set to store the strings, - // an use the reference to the string in the map. 
- absl::node_hash_set annotations TF_GUARDED_BY(mutex); - absl::node_hash_set nvtx_ranges TF_GUARDED_BY(mutex); - absl::flat_hash_map correlation_map - TF_GUARDED_BY(mutex); - }; - const uint64_t max_size_; - absl::FixedArray per_device_map_; - - AnnotationMap(const AnnotationMap&) = delete; - void operator=(const AnnotationMap&) = delete; + StringDeduper string_deduper_; + absl::flat_hash_map map_; }; struct CuptiEventCollectorDelegate { @@ -290,6 +290,60 @@ class CuptiActivityBufferManager { std::list cached_buffers_ TF_GUARDED_BY(buffer_mutex_); }; +class CallbackAnnotationsAndEvents { + public: + struct EventWithAnnotation { + uint32_t correlation_id = 0; + absl::string_view annotation; + absl::string_view nvtx_range; + CuptiTracerEvent event = {}; + + EventWithAnnotation() = default; + + EventWithAnnotation(uint32_t corr_id, absl::string_view ann, + absl::string_view nvtx) + : correlation_id(corr_id), annotation(ann), nvtx_range(nvtx) {} + }; + + static constexpr size_t kQueueBlockSize = 64 * 1024; + using EventAnnotationQueue = + tsl::profiler::BlockedQueue; + + CallbackAnnotationsAndEvents() = default; + + CallbackAnnotationsAndEvents(CallbackAnnotationsAndEvents&& another); + + CallbackAnnotationsAndEvents& operator=( + CallbackAnnotationsAndEvents&& another); + + void Clear(); + + // Check limits on annotation count, event count allow us to add more event + // or not, if allowed, save the annotation/nvtx range in StringDeduper to + // keep their life cycle, and update their string_view. return true if we + // could add more event. 
+ bool PrepareAnnotation(uint32_t device_id, uint32_t correlation_id, + size_t max_annotation_strings, + size_t max_callback_api_events, + std::atomic& callback_api_event_count, + absl::string_view& annotation, + absl::string_view& nvtx_range); + + EventAnnotationQueue& event_annotation_queue() { + return event_annotation_queue_; + } + + size_t num_dropped_events() { return num_dropped_events_; } + + private: + // Annotation tends to be repetitive, use a hash_set to store the strings, + // and use the reference to the string in the map. + StringDeduper annotations_; + StringDeduper nvtx_ranges_; + size_t num_dropped_events_ = 0; + EventAnnotationQueue event_annotation_queue_; +}; + } // namespace profiler } // namespace xla diff --git a/third_party/xla/xla/backends/profiler/gpu/cupti_collector.cc b/third_party/xla/xla/backends/profiler/gpu/cupti_collector.cc index f719c8d1daeb74..ec546e7e3fa52a 100644 --- a/third_party/xla/xla/backends/profiler/gpu/cupti_collector.cc +++ b/third_party/xla/xla/backends/profiler/gpu/cupti_collector.cc @@ -25,6 +25,7 @@ limitations under the License. #include "third_party/gpus/cuda/include/cuda_occupancy.h" #include "tsl/platform/abi.h" #include "tsl/platform/host_info.h" +#include "tsl/platform/mem.h" #include "tsl/platform/mutex.h" #include "tsl/profiler/utils/parse_annotation.h" #include "tsl/profiler/utils/trace_utils.h" @@ -483,6 +484,35 @@ void CuptiTraceCollector::OnTracerCachedActivityBuffers( } } +void CuptiTraceCollector::OnTracerCollectedCallbackData( + std::list callback_annotations_and_events) { + // Create merged annotation first. 
+ annotation_map_.clear(); + for (auto& annotations_and_events : callback_annotations_and_events) { + auto& event_queue = annotations_and_events.event_annotation_queue(); + for (auto it = event_queue.begin(); it != event_queue.end(); ++it) { + annotation_map_.AddAnnotation((*it).correlation_id, (*it).annotation, + (*it).nvtx_range); + } + } + VLOG(3) << "Total merged annotation map: " << annotation_map_.size(); + + size_t total_dropped_callback_event_count = 0; + for (auto& annotations_and_events : callback_annotations_and_events) { + auto& queue = annotations_and_events.event_annotation_queue(); + for (auto it = queue.begin(); it != queue.end(); ++it) { + AddEvent(std::move((*it).event)); + } + total_dropped_callback_event_count += + annotations_and_events.num_dropped_events(); + annotations_and_events.Clear(); + } + if (total_dropped_callback_event_count > 0) { + OnEventsDropped("total driver(callback) events reaches max", + total_dropped_callback_event_count); + } +} + // CuptiTraceCollectorImpl store the CuptiTracerEvents from CuptiTracer and // eventually convert and filter them to XSpace. 
class CuptiTraceCollectorImpl : public CuptiTraceCollector { @@ -500,16 +530,8 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector { void AddEvent(CuptiTracerEvent&& event) override { if (event.device_id >= num_gpus_) return; if (event.source == CuptiTracerEventSource::DriverCallback) { - if (num_callback_events_ > options_.max_callback_api_events) { - OnEventsDropped("total driver(callback) events reaches max", 1); - return; - } num_callback_events_++; } else { - if (num_activity_events_ > options_.max_activity_api_events) { - OnEventsDropped("total device(activity) events reaches max", 1); - return; - } num_activity_events_++; } per_device_collector_[event.device_id].AddEvent(std::move(event)); @@ -525,15 +547,25 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector { activity_buffers_ = std::move(activity_buffers); } + void OnTracerCollectedCallbackData( + std::list callback_events) override { + callback_events_ = std::move(callback_events); + } + void Flush() override {} // Returns true if some GPU events are captured. bool Export(XSpace* space, uint64_t end_gpu_ns) override { + CuptiTraceCollector::OnTracerCollectedCallbackData( + std::move(callback_events_)); CuptiTraceCollector::OnTracerCachedActivityBuffers( std::move(activity_buffers_)); LOG(INFO) << " GpuTracer has collected " << num_callback_events_ << " callback api events and " << num_activity_events_ << " activity events. 
" << ReportDroppedEvents(); + LOG(INFO) << " GpuTracer max callback_events: " + << options_.max_callback_api_events + << ", max activity events: " << options_.max_activity_api_events; size_t num_events = 0; XPlaneBuilder host_plane( FindOrAddMutablePlaneWithName(space, kCuptiDriverApiPlaneName)); @@ -571,15 +603,16 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector { if (events_dropped.empty()) return ""; return absl::StrCat("Detected GPU events dropped on ", tsl::port::Hostname(), ": Profiler has collected ", - num_callback_events_.load(), " driver events and ", - num_activity_events_.load(), " device events.", + num_callback_events_, " driver events and ", + num_activity_events_, " device events.", events_dropped); } private: - std::atomic num_callback_events_; - std::atomic num_activity_events_; + size_t num_callback_events_ = 0; + size_t num_activity_events_ = 0; std::unique_ptr activity_buffers_; + std::list callback_events_; absl::Mutex mutex_; absl::flat_hash_map dropped_events_ ABSL_GUARDED_BY(mutex_); diff --git a/third_party/xla/xla/backends/profiler/gpu/cupti_collector.h b/third_party/xla/xla/backends/profiler/gpu/cupti_collector.h index 98fdbb9d05478a..576b0b3eebf355 100644 --- a/third_party/xla/xla/backends/profiler/gpu/cupti_collector.h +++ b/third_party/xla/xla/backends/profiler/gpu/cupti_collector.h @@ -17,7 +17,9 @@ limitations under the License. #define XLA_BACKENDS_PROFILER_GPU_CUPTI_COLLECTOR_H_ #include +#include #include +#include #include "xla/backends/profiler/gpu/cupti_buffer_events.h" #include "tsl/profiler/protobuf/xplane.pb.h" @@ -41,8 +43,7 @@ struct CuptiTracerCollectorOptions { class CuptiTraceCollector { public: explicit CuptiTraceCollector(const CuptiTracerCollectorOptions& options) - : options_(options), - annotation_map_(options.max_annotation_strings, options.num_gpus) {} + : options_(options) {} virtual ~CuptiTraceCollector() {} // Producer side functions (i.e. called by CuptiTracer). 
@@ -62,6 +63,15 @@ class CuptiTraceCollector { virtual void OnTracerCachedActivityBuffers( std::unique_ptr activity_buffers); + // After CuptiTracer stops, collected per-thread callback data from threads + // will be sent here. Default behavior is: a) create merged annotation map + // (for later activity event usage), and b) direct add all event by calling + // AddEvent(). Yet collector could just save those callback events without + // processing now, but merge annotation and AddEvent() later when needed, such + // as during export(). + virtual void OnTracerCollectedCallbackData( + std::list callback_events); + // Consumer side functions (i.e. called by GPU tracer); virtual bool Export(tensorflow::profiler::XSpace* space, uint64_t end_gpu_ns) { diff --git a/third_party/xla/xla/backends/profiler/gpu/cupti_tracer.cc b/third_party/xla/xla/backends/profiler/gpu/cupti_tracer.cc index d36efcf17c5709..9a88b6416df113 100644 --- a/third_party/xla/xla/backends/profiler/gpu/cupti_tracer.cc +++ b/third_party/xla/xla/backends/profiler/gpu/cupti_tracer.cc @@ -27,7 +27,9 @@ limitations under the License. #include "tsl/platform/host_info.h" #include "tsl/platform/logging.h" #include "tsl/platform/macros.h" +#include "tsl/platform/mem.h" #include "tsl/profiler/backends/cpu/annotation_stack.h" +#include "tsl/profiler/utils/per_thread.h" namespace xla { namespace profiler { @@ -308,11 +310,9 @@ void CUPTIAPI ProcessCuptiActivityBuffer(CUcontext context, uint32_t stream_id, } } -void AddKernelEventUponApiExit(CuptiTraceCollector *collector, - uint32_t device_id, +void SetKernelEventUponApiExit(CuptiTracerEvent &event, uint32_t device_id, const CUpti_CallbackData *cbdata, uint64_t start_time, uint64_t end_time) { - CuptiTracerEvent event{}; event.type = CuptiTracerEventType::Kernel; event.source = CuptiTracerEventSource::DriverCallback; event.name = cbdata->symbolName ? 
cbdata->symbolName : cbdata->functionName; @@ -323,15 +323,15 @@ void AddKernelEventUponApiExit(CuptiTraceCollector *collector, event.context_id = cbdata->contextUid; event.correlation_id = cbdata->correlationId; VLOG(3) << "Cuda Kernel launch API exit. name=" << event.name; - collector->AddEvent(std::move(event)); } // Performs the actual callback for both normal and P2P memcpy operations. -CuptiTracerEvent PopulateMemcpyCallbackEvent( - CuptiTracerEventType type, const CUpti_CallbackData *cbdata, - size_t num_bytes, uint32_t src_device, uint32_t dst_device, bool async, - uint64_t start_time, uint64_t end_time) { - CuptiTracerEvent event{}; +void PopulateMemcpyCallbackEvent(CuptiTracerEvent &event, + CuptiTracerEventType type, + const CUpti_CallbackData *cbdata, + size_t num_bytes, uint32_t src_device, + uint32_t dst_device, bool async, + uint64_t start_time, uint64_t end_time) { event.type = type; event.source = CuptiTracerEventSource::DriverCallback; event.start_time_ns = start_time; @@ -347,10 +347,9 @@ CuptiTracerEvent PopulateMemcpyCallbackEvent( event.memcpy_info.copy_kind = CUPTI_ACTIVITY_MEMCPY_KIND_UNKNOWN; event.memcpy_info.dst_mem_kind = CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN; event.memcpy_info.src_mem_kind = CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN; - return event; } -void AddNormalMemcpyEventUponApiExit(CuptiTraceCollector *collector, +void SetNormalMemcpyEventUponApiExit(CuptiTracerEvent &event, uint32_t device_id, CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata, uint64_t start_time, uint64_t end_time) { @@ -361,14 +360,12 @@ void AddNormalMemcpyEventUponApiExit(CuptiTraceCollector *collector, DecodeDriverMemcpy(cbid, cbdata->functionParams); VLOG(3) << "Cuda Memcpy API exit. 
sz=" << num_bytes; - CuptiTracerEvent event = - PopulateMemcpyCallbackEvent(type, cbdata, num_bytes, device_id, device_id, - async, start_time, end_time); - collector->AddEvent(std::move(event)); + PopulateMemcpyCallbackEvent(event, type, cbdata, num_bytes, device_id, + device_id, async, start_time, end_time); } -void AddCuMemsetEventUponApiExit(CuptiTraceCollector *collector, - uint32_t device_id, CUpti_CallbackId cbid, +void SetCuMemsetEventUponApiExit(CuptiTracerEvent &event, uint32_t device_id, + CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata, uint64_t start_time, uint64_t end_time) { // We are casting all variants of cuMemset to cuMemsetD8 for accessing the @@ -381,7 +378,6 @@ void AddCuMemsetEventUponApiExit(CuptiTraceCollector *collector, std::tie(num_bytes, type, async) = DecodeDriverMemset(cbid, cbdata->functionParams); - CuptiTracerEvent event{}; event.type = type; event.source = CuptiTracerEventSource::DriverCallback; event.start_time_ns = start_time; @@ -396,10 +392,9 @@ void AddCuMemsetEventUponApiExit(CuptiTraceCollector *collector, VLOG(3) << "Cuda Memset API exit." 
<< " dptr=" << reinterpret_cast(params->dstDevice) << " sz=" << num_bytes; - collector->AddEvent(std::move(event)); } -void AddP2PMemcpyEventUponApiExit(CuptiTraceCollector *collector, +void SetP2PMemcpyEventUponApiExit(CuptiTracerEvent &event, CuptiInterface *cupti_interface, uint32_t device_id, CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata, @@ -417,20 +412,17 @@ void AddP2PMemcpyEventUponApiExit(CuptiTraceCollector *collector, cupti_interface->GetDeviceId(p2p_params->dstContext, &dst_device); VLOG(3) << "Cuda P2P Memcpy API exit, src: " << src_device << " dst: " << dst_device << " size:" << num_bytes; - CuptiTracerEvent event = - PopulateMemcpyCallbackEvent(type, cbdata, num_bytes, src_device, - dst_device, async, start_time, end_time); - collector->AddEvent(std::move(event)); + PopulateMemcpyCallbackEvent(event, type, cbdata, num_bytes, src_device, + dst_device, async, start_time, end_time); } -void AddCuMemAllocEventUponApiExit(CuptiTraceCollector *collector, - uint32_t device_id, CUpti_CallbackId cbid, +void SetCuMemAllocEventUponApiExit(CuptiTracerEvent &event, uint32_t device_id, + CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata, uint64_t start_time, uint64_t end_time) { const auto *params = static_cast(cbdata->functionParams); const void *dptr = reinterpret_cast(*params->dptr); - CuptiTracerEvent event{}; event.type = CuptiTracerEventType::MemoryAlloc; event.source = CuptiTracerEventSource::DriverCallback; event.name = cbdata->functionName; @@ -444,16 +436,14 @@ void AddCuMemAllocEventUponApiExit(CuptiTraceCollector *collector, event.memalloc_info.num_bytes = params->bytesize; VLOG(3) << "Cuda MemAlloc API exit." 
<< " dptr=" << dptr << " sz=" << params->bytesize; - collector->AddEvent(std::move(event)); } -void AddCuMemAllocPitchEventUponApiExit( - CuptiTraceCollector *collector, uint32_t device_id, CUpti_CallbackId cbid, +void SetCuMemAllocPitchEventUponApiExit( + CuptiTracerEvent &event, uint32_t device_id, CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata, uint64_t start_time, uint64_t end_time) { const auto *params = static_cast(cbdata->functionParams); const void *dptr = reinterpret_cast(*params->dptr); - CuptiTracerEvent event{}; event.type = CuptiTracerEventType::MemoryAlloc; event.source = CuptiTracerEventSource::DriverCallback; event.name = cbdata->functionName; @@ -468,16 +458,14 @@ void AddCuMemAllocPitchEventUponApiExit( event.memalloc_info.num_bytes = size_in_bytes; VLOG(3) << "Cuda MemAllocPitch API exit." << " dptr=" << dptr << " sz=" << size_in_bytes; - collector->AddEvent(std::move(event)); } -void AddCuMemAllocManagedEventUponApiExit( - CuptiTraceCollector *collector, uint32_t device_id, CUpti_CallbackId cbid, +void SetCuMemAllocManagedEventUponApiExit( + CuptiTracerEvent &event, uint32_t device_id, CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata, uint64_t start_time, uint64_t end_time) { const auto *params = static_cast(cbdata->functionParams); const void *dptr = reinterpret_cast(*params->dptr); - CuptiTracerEvent event{}; event.type = CuptiTracerEventType::MemoryAlloc; event.source = CuptiTracerEventSource::DriverCallback; event.name = cbdata->functionName; @@ -491,17 +479,15 @@ void AddCuMemAllocManagedEventUponApiExit( event.memalloc_info.num_bytes = params->bytesize; VLOG(3) << "Cuda MemAllocManaged API exit." 
<< " dptr=" << dptr << " sz=" << params->bytesize; - collector->AddEvent(std::move(event)); } -void AddCuMemAllocHostEventUponApiExit(CuptiTraceCollector *collector, +void SetCuMemAllocHostEventUponApiExit(CuptiTracerEvent &event, uint32_t device_id, CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata, uint64_t start_time, uint64_t end_time) { const auto *params = static_cast(cbdata->functionParams); - CuptiTracerEvent event{}; event.type = CuptiTracerEventType::MemoryAlloc; event.source = CuptiTracerEventSource::DriverCallback; event.name = cbdata->functionName; @@ -515,17 +501,15 @@ void AddCuMemAllocHostEventUponApiExit(CuptiTraceCollector *collector, event.memalloc_info.num_bytes = params->bytesize; VLOG(3) << "Cuda MemAllocHost API exit." << " pp=" << *params->pp << " sz=" << params->bytesize; - collector->AddEvent(std::move(event)); } -void AddCuMemHostAllocEventUponApiExit(CuptiTraceCollector *collector, +void SetCuMemHostAllocEventUponApiExit(CuptiTracerEvent &event, uint32_t device_id, CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata, uint64_t start_time, uint64_t end_time) { const auto *params = static_cast(cbdata->functionParams); - CuptiTracerEvent event{}; event.type = CuptiTracerEventType::MemoryAlloc; event.source = CuptiTracerEventSource::DriverCallback; event.name = cbdata->functionName; @@ -539,17 +523,15 @@ void AddCuMemHostAllocEventUponApiExit(CuptiTraceCollector *collector, event.memalloc_info.num_bytes = params->bytesize; VLOG(3) << "Cuda MemHostAlloc API exit." 
<< " pp=" << *params->pp << " sz=" << params->bytesize << " Flags=" << params->Flags; - collector->AddEvent(std::move(event)); } -void AddCuMemFreeEventUponApiExit(CuptiTraceCollector *collector, - uint32_t device_id, CUpti_CallbackId cbid, +void SetCuMemFreeEventUponApiExit(CuptiTracerEvent &event, uint32_t device_id, + CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata, uint64_t start_time, uint64_t end_time) { const auto *params = static_cast(cbdata->functionParams); const void *dptr = reinterpret_cast(params->dptr); - CuptiTracerEvent event{}; event.type = CuptiTracerEventType::MemoryFree; event.source = CuptiTracerEventSource::DriverCallback; event.name = cbdata->functionName; @@ -561,16 +543,14 @@ void AddCuMemFreeEventUponApiExit(CuptiTraceCollector *collector, event.correlation_id = cbdata->correlationId; event.memfree_info.address = reinterpret_cast(dptr); VLOG(3) << "Cuda MemFree API exit." << " dptr=" << dptr; - collector->AddEvent(std::move(event)); } -void AddCuMemFreeHostEventUponApiExit(CuptiTraceCollector *collector, +void SetCuMemFreeHostEventUponApiExit(CuptiTracerEvent &event, uint32_t device_id, CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata, uint64_t start_time, uint64_t end_time) { const auto *params = static_cast(cbdata->functionParams); - CuptiTracerEvent event{}; event.type = CuptiTracerEventType::MemoryFree; event.source = CuptiTracerEventSource::DriverCallback; event.name = cbdata->functionName; @@ -582,15 +562,13 @@ void AddCuMemFreeHostEventUponApiExit(CuptiTraceCollector *collector, event.correlation_id = cbdata->correlationId; event.memfree_info.address = reinterpret_cast(params->p); VLOG(3) << "Cuda MemFreeHost API exit." 
<< " p=" << params->p; - collector->AddEvent(std::move(event)); } -void AddCuMemHostRegisterEventUponApiExit( - CuptiTraceCollector *collector, uint32_t device_id, CUpti_CallbackId cbid, +void SetCuMemHostRegisterEventUponApiExit( + CuptiTracerEvent &event, uint32_t device_id, CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata, uint64_t start_time, uint64_t end_time) { const auto *params = static_cast(cbdata->functionParams); - CuptiTracerEvent event{}; event.type = CuptiTracerEventType::HostRegister; event.source = CuptiTracerEventSource::DriverCallback; event.name = cbdata->functionName; @@ -605,15 +583,13 @@ void AddCuMemHostRegisterEventUponApiExit( event.host_register_info.flags = params->Flags; VLOG(3) << "Cuda HostRegister API exit." << " p=" << params->p << " bytesize=" << params->bytesize << " flags=" << params->Flags; - collector->AddEvent(std::move(event)); } -void AddCuMemHostUnregisterEventUponApiExit( - CuptiTraceCollector *collector, uint32_t device_id, CUpti_CallbackId cbid, +void SetCuMemHostUnregisterEventUponApiExit( + CuptiTracerEvent &event, uint32_t device_id, CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata, uint64_t start_time, uint64_t end_time) { const auto *params = static_cast(cbdata->functionParams); - CuptiTracerEvent event{}; event.type = CuptiTracerEventType::HostUnregister; event.source = CuptiTracerEventSource::DriverCallback; event.name = cbdata->functionName; @@ -625,14 +601,12 @@ void AddCuMemHostUnregisterEventUponApiExit( event.correlation_id = cbdata->correlationId; event.host_unregister_info.address = reinterpret_cast(params->p); VLOG(3) << "Cuda HostUnregister API exit." 
<< " p=" << params->p; - collector->AddEvent(std::move(event)); } -void AddGenericEventUponApiExit(CuptiTraceCollector *collector, - uint32_t device_id, CUpti_CallbackId cbid, +void SetGenericEventUponApiExit(CuptiTracerEvent &event, uint32_t device_id, + CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata, uint64_t start_time, uint64_t end_time) { - CuptiTracerEvent event{}; event.type = CuptiTracerEventType::Generic; event.source = CuptiTracerEventSource::DriverCallback; event.name = cbdata->functionName; @@ -643,7 +617,6 @@ void AddGenericEventUponApiExit(CuptiTraceCollector *collector, event.context_id = cbdata->contextUid; event.correlation_id = cbdata->correlationId; VLOG(3) << "Observed generic API exit." << " name=" << cbdata->functionName; - collector->AddEvent(std::move(event)); } // This hook uses cupti activity api to measure device side activities. @@ -651,10 +624,8 @@ class CuptiDriverApiHookWithActivityApi : public CuptiDriverApiHook { public: CuptiDriverApiHookWithActivityApi(const CuptiTracerOptions &option, CuptiInterface *cupti_interface, - CuptiTraceCollector *collector) - : option_(option), - cupti_interface_(cupti_interface), - collector_(collector) {} + CuptiTracer *tracer) + : option_(option), cupti_interface_(cupti_interface), tracer_(tracer) {} absl::Status OnDriverApiEnter(int device_id, CUpti_CallbackDomain domain, CUpti_CallbackId cbid, @@ -664,7 +635,9 @@ class CuptiDriverApiHookWithActivityApi : public CuptiDriverApiHook { option_.required_callback_api_events ? CuptiTracer::GetTimestamp() : 0; return absl::OkStatus(); } - absl::Status OnDriverApiExit(int device_id, CUpti_CallbackDomain domain, + absl::Status OnDriverApiExit(absl::string_view annotation, + absl::string_view nvtx_range, int device_id, + CUpti_CallbackDomain domain, CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata) override { // If we are not collecting CPU events from Callback API, we can return now. 
@@ -676,8 +649,9 @@ class CuptiDriverApiHookWithActivityApi : public CuptiDriverApiHook { uint64_t end_tsc = CuptiTracer::GetTimestamp(); uint64_t start_tsc = *cbdata->correlationData; TrackContext(cbid, cbdata->context); - return AddDriverApiCallbackEvent(collector_, cupti_interface_, device_id, - start_tsc, end_tsc, domain, cbid, cbdata); + return AddDriverApiCallbackEvent(tracer_, cupti_interface_, annotation, + nvtx_range, device_id, start_tsc, end_tsc, + domain, cbid, cbdata); } absl::Status SyncAndFlush() override { if (option_.sync_devices_before_stop) { @@ -708,7 +682,7 @@ class CuptiDriverApiHookWithActivityApi : public CuptiDriverApiHook { const CuptiTracerOptions option_; CuptiInterface *cupti_interface_; - CuptiTraceCollector *collector_; + CuptiTracer *tracer_; absl::Mutex mutex_; absl::flat_hash_set contexts_ TF_GUARDED_BY(mutex_); @@ -721,13 +695,70 @@ class CuptiDriverApiHookWithActivityApi : public CuptiDriverApiHook { return absl::StrCat(tsl::port::Hostname(), ": ", error_message); } +// This is in fact for per thread data structure. The +// contention will happen at the moment of start/stop the tracing, +// when control thread is clearing all thread local data, while worker +// threads are injecting events. The mutex in practice will have no +// contention at all, so still cheap. +struct GuardedCallbackAnnotationsAndEvents { + tsl::mutex mu; + CallbackAnnotationsAndEvents annotations_and_events TF_GUARDED_BY(mu); + + bool Push( + CallbackAnnotationsAndEvents::EventWithAnnotation event_with_annotation) { + tsl::mutex_lock lock(mu, std::try_to_lock_t()); + if ((bool)lock) { + annotations_and_events.event_annotation_queue().Push( + std::move(event_with_annotation)); + return true; + } + // if control thread holding the mutex, worker could just ignore appending + // more event, as now the tracing is either not yet started, or already been + // stopped. 
+ return false; + } + + CallbackAnnotationsAndEvents Fetch() { + tsl::mutex_lock lock(mu); + CallbackAnnotationsAndEvents grabed = std::move(annotations_and_events); + return grabed; + } + + void Clear() { + tsl::mutex_lock lock(mu); + annotations_and_events.Clear(); + } + + bool PrepareAnnotation(uint32_t device_id, uint32_t correlation_id, + size_t max_annotation_strings, + size_t max_callback_api_events, + std::atomic &callback_api_event_count, + absl::string_view &annotation, + absl::string_view &nvtx_range) { + tsl::mutex_lock lock(mu, std::try_to_lock_t()); + if ((bool)lock) { + return this->annotations_and_events.PrepareAnnotation( + device_id, correlation_id, max_annotation_strings, + max_callback_api_events, callback_api_event_count, annotation, + nvtx_range); + } + return false; + } +}; + } // namespace +using PerThreadCallbackAnnotationsAndEvents = + tsl::profiler::PerThread; + /*static*/ absl::Status CuptiDriverApiHook::AddDriverApiCallbackEvent( - CuptiTraceCollector *collector, CuptiInterface *cupti_interface, - int device_id, uint64_t start_tsc, uint64_t end_tsc, - CUpti_CallbackDomain domain, CUpti_CallbackId cbid, - const CUpti_CallbackData *cbdata) { + CuptiTracer *tracer, CuptiInterface *cupti_interface, + absl::string_view annotation, absl::string_view nvtx_range, int device_id, + uint64_t start_tsc, uint64_t end_tsc, CUpti_CallbackDomain domain, + CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata) { + CallbackAnnotationsAndEvents::EventWithAnnotation event_with_annotation( + cbdata->correlationId, annotation, nvtx_range); + CuptiTracerEvent &event = event_with_annotation.event; switch (cbid) { case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel: #if CUDA_VERSION >= 11080 // CUDA 11.8 @@ -735,8 +766,7 @@ class CuptiDriverApiHookWithActivityApi : public CuptiDriverApiHook { #endif // CUDA_VERSION >= 11080 case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel: case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice: - 
AddKernelEventUponApiExit(collector, device_id, cbdata, start_tsc, - end_tsc); + SetKernelEventUponApiExit(event, device_id, cbdata, start_tsc, end_tsc); break; case CUPTI_DRIVER_TRACE_CBID_cuMemcpy: case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync: @@ -762,48 +792,48 @@ class CuptiDriverApiHookWithActivityApi : public CuptiDriverApiHook { // dst memory kind by casting cbdata->functionParams. However, we are not // doing that because that will incur significant overhead to get the // memory aperture of each argument. - AddNormalMemcpyEventUponApiExit(collector, device_id, cbid, cbdata, - start_tsc, end_tsc); + SetNormalMemcpyEventUponApiExit(event, device_id, cbid, cbdata, start_tsc, + end_tsc); break; case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer: case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync: - AddP2PMemcpyEventUponApiExit(collector, cupti_interface, device_id, cbid, + SetP2PMemcpyEventUponApiExit(event, cupti_interface, device_id, cbid, cbdata, start_tsc, end_tsc); break; case CUPTI_DRIVER_TRACE_CBID_cuMemAlloc_v2: - AddCuMemAllocEventUponApiExit(collector, device_id, cbid, cbdata, - start_tsc, end_tsc); + SetCuMemAllocEventUponApiExit(event, device_id, cbid, cbdata, start_tsc, + end_tsc); break; case CUPTI_DRIVER_TRACE_CBID_cuMemAllocPitch_v2: - AddCuMemAllocPitchEventUponApiExit(collector, device_id, cbid, cbdata, + SetCuMemAllocPitchEventUponApiExit(event, device_id, cbid, cbdata, start_tsc, end_tsc); break; case CUPTI_DRIVER_TRACE_CBID_cuMemAllocManaged: - AddCuMemAllocManagedEventUponApiExit(collector, device_id, cbid, cbdata, + SetCuMemAllocManagedEventUponApiExit(event, device_id, cbid, cbdata, start_tsc, end_tsc); break; case CUPTI_DRIVER_TRACE_CBID_cuMemAllocHost_v2: - AddCuMemAllocHostEventUponApiExit(collector, device_id, cbid, cbdata, + SetCuMemAllocHostEventUponApiExit(event, device_id, cbid, cbdata, start_tsc, end_tsc); break; case CUPTI_DRIVER_TRACE_CBID_cuMemHostAlloc: - AddCuMemHostAllocEventUponApiExit(collector, device_id, cbid, cbdata, + 
SetCuMemHostAllocEventUponApiExit(event, device_id, cbid, cbdata, start_tsc, end_tsc); break; case CUPTI_DRIVER_TRACE_CBID_cuMemFree_v2: - AddCuMemFreeEventUponApiExit(collector, device_id, cbid, cbdata, - start_tsc, end_tsc); + SetCuMemFreeEventUponApiExit(event, device_id, cbid, cbdata, start_tsc, + end_tsc); break; case CUPTI_DRIVER_TRACE_CBID_cuMemFreeHost: - AddCuMemFreeHostEventUponApiExit(collector, device_id, cbid, cbdata, + SetCuMemFreeHostEventUponApiExit(event, device_id, cbid, cbdata, start_tsc, end_tsc); break; case CUPTI_DRIVER_TRACE_CBID_cuMemHostRegister_v2: - AddCuMemHostRegisterEventUponApiExit(collector, device_id, cbid, cbdata, + SetCuMemHostRegisterEventUponApiExit(event, device_id, cbid, cbdata, start_tsc, end_tsc); break; case CUPTI_DRIVER_TRACE_CBID_cuMemHostUnregister: - AddCuMemHostUnregisterEventUponApiExit(collector, device_id, cbid, cbdata, + SetCuMemHostUnregisterEventUponApiExit(event, device_id, cbid, cbdata, start_tsc, end_tsc); break; case CUPTI_DRIVER_TRACE_CBID_cuMemsetD8_v2: @@ -818,14 +848,16 @@ class CuptiDriverApiHookWithActivityApi : public CuptiDriverApiHook { case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8Async: case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16Async: case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32Async: - AddCuMemsetEventUponApiExit(collector, device_id, cbid, cbdata, start_tsc, + SetCuMemsetEventUponApiExit(event, device_id, cbid, cbdata, start_tsc, end_tsc); break; default: - AddGenericEventUponApiExit(collector, device_id, cbid, cbdata, start_tsc, + SetGenericEventUponApiExit(event, device_id, cbid, cbdata, start_tsc, end_tsc); break; } + PerThreadCallbackAnnotationsAndEvents::Get().Push( + std::move(event_with_annotation)); return absl::OkStatus(); } @@ -900,8 +932,8 @@ void CuptiTracer::Enable(const CuptiTracerOptions &option, option_ = option; collector_ = collector; - cupti_driver_api_hook_.reset(new CuptiDriverApiHookWithActivityApi( - option, cupti_interface_, collector)); + cupti_driver_api_hook_ = 
std::make_unique( + option, cupti_interface_, this); absl::Status status = EnableApiTracing(); need_root_access_ |= status.code() == tsl::error::PERMISSION_DENIED; @@ -918,6 +950,8 @@ void CuptiTracer::Disable() { Finalize().IgnoreError(); cupti_driver_api_hook_->SyncAndFlush().IgnoreError(); + collector_->OnTracerCollectedCallbackData( + GatherCallbackAnnotationsAndEvents()); collector_->OnTracerCachedActivityBuffers(std::move(activity_buffers_)); if (cupti_dropped_activity_event_count_ > 0) { collector_->OnEventsDropped("Activity Event dropped by Cupti Lib:", @@ -938,6 +972,8 @@ void CuptiTracer::Disable() { absl::Status CuptiTracer::EnableApiTracing() { if (api_tracing_enabled_) return absl::OkStatus(); + PrepareCallbackStart(); + VLOG(1) << "Enable subscriber"; // Subscribe can return CUPTI_ERROR_MAX_LIMIT_REACHED. // The application which calls CUPTI APIs cannot be used with Nvidia tools @@ -966,8 +1002,6 @@ absl::Status CuptiTracer::EnableApiTracing() { absl::Status CuptiTracer::DisableApiTracing() { if (!api_tracing_enabled_) return absl::OkStatus(); - api_tracing_enabled_ = false; - if (!option_->cbids_selected.empty()) { for (auto cbid : option_->cbids_selected) { RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableCallback( @@ -985,6 +1019,8 @@ absl::Status CuptiTracer::DisableApiTracing() { VLOG(1) << "Disable subscriber"; RETURN_IF_CUPTI_ERROR(cupti_interface_->Unsubscribe(subscriber_)); + + api_tracing_enabled_ = false; return absl::OkStatus(); } @@ -1002,7 +1038,6 @@ absl::Status CuptiTracer::EnableActivityTracing() { } RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityRegisterCallbacks( RequestCuptiActivityBuffer, ProcessCuptiActivityBuffer)); - VLOG(1) << "Enabling activity tracing for " << option_->activities_selected.size() << " activities"; for (auto activity : option_->activities_selected) { @@ -1013,6 +1048,7 @@ absl::Status CuptiTracer::EnableActivityTracing() { RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityEnable(activity)); } } + 
activity_tracing_enabled_ = true; return absl::OkStatus(); } @@ -1107,25 +1143,24 @@ absl::Status CuptiTracer::HandleCallback(CUpti_CallbackDomain domain, device_id, domain, cbid, cbdata)); } else if (cbdata->callbackSite == CUPTI_API_EXIT) { // Set up the map from correlation id to annotation string. - const auto &annotation = AnnotationStack::Get(); - if (!annotation.empty()) { - if (cbid == - CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice) { - // Kernels are launched on different devices by this API call, therefore - // we need to populate per device annotation map respectively. - for (int i = 0; i < num_gpus_; ++i) { - collector_->annotation_map()->Add(i, cbdata->correlationId, - annotation, ""); - } - } else { - absl::string_view nvtx_range = NVTXRangeTracker::CurrentRange(); - collector_->annotation_map()->Add(device_id, cbdata->correlationId, - annotation, nvtx_range); - } + absl::string_view annotation = AnnotationStack::Get(); + absl::string_view nvtx_range = NVTXRangeTracker::CurrentRange(); + if (cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice) { + // Kernels are launched on different devices by this API call, therefore + // we need to populate per device annotation map respectively. 
+ nvtx_range = ""; } - TF_RETURN_IF_ERROR(cupti_driver_api_hook_->OnDriverApiExit( - device_id, domain, cbid, cbdata)); + bool could_add_more_event = + PerThreadCallbackAnnotationsAndEvents::Get().PrepareAnnotation( + device_id, cbdata->correlationId, + collector_->GetOptions().max_annotation_strings, + collector_->GetOptions().max_activity_api_events, + num_callback_events_, annotation, nvtx_range); + if (could_add_more_event) { + TF_RETURN_IF_ERROR(cupti_driver_api_hook_->OnDriverApiExit( + annotation, nvtx_range, device_id, domain, cbid, cbdata)); + } } return absl::OkStatus(); } @@ -1180,8 +1215,9 @@ static size_t CountCuptiActivityEvent(uint8_t *buffer, size_t size) { CuptiInterface *cupti_interface = GetCuptiInterface(); CUpti_Activity *record = nullptr; while (true) { - if (cupti_interface->ActivityGetNextRecord(buffer, size, &record) == - CUPTI_SUCCESS) { + CUptiResult status = + cupti_interface->ActivityGetNextRecord(buffer, size, &record); + if (status == CUPTI_SUCCESS) { ++total_event_count; } else { break; @@ -1205,13 +1241,6 @@ absl::Status CuptiTracer::ProcessActivityBuffer(CUcontext context, } if (cupti_interface_->Disabled()) return tsl::errors::Internal("Disabled."); - // Report dropped records. 
- size_t dropped = 0; - if (cupti_interface_->ActivityGetNumDroppedRecords( - context, stream_id, &dropped) == CUPTI_SUCCESS) { - cupti_dropped_activity_event_count_ += dropped; - } - size_t event_count_in_buffer = CountCuptiActivityEvent(buffer, size); auto max_activity_event_count = collector_->GetOptions().max_activity_api_events; @@ -1248,6 +1277,29 @@ absl::Status CuptiTracer::ProcessActivityBuffer(CUcontext context, return ""; } +std::list +CuptiTracer::GatherCallbackAnnotationsAndEvents() { + auto guarded_collection = + PerThreadCallbackAnnotationsAndEvents::StopRecording(); + VLOG(3) << "Total grabbed per thread annotated events buffer: " + << guarded_collection.size(); + + std::list result; + for (auto &guarded_annotations_events : guarded_collection) { + result.emplace_back(guarded_annotations_events->Fetch()); + } + return result; +} + +void CuptiTracer::PrepareCallbackStart() { + auto guarded_collection = + PerThreadCallbackAnnotationsAndEvents::StartRecording(); + for (auto &guarded_annotations_events : guarded_collection) { + guarded_annotations_events->Clear(); + } + num_callback_events_ = 0; +} + void CuptiTracer::PrepareActivityStart() { activity_buffers_ = std::make_unique(kBufferSizeInBytes); diff --git a/third_party/xla/xla/backends/profiler/gpu/cupti_tracer.h b/third_party/xla/xla/backends/profiler/gpu/cupti_tracer.h index 54dc0253f3f4d4..f08c7a3b56e184 100644 --- a/third_party/xla/xla/backends/profiler/gpu/cupti_tracer.h +++ b/third_party/xla/xla/backends/profiler/gpu/cupti_tracer.h @@ -16,6 +16,9 @@ limitations under the License. 
#ifndef XLA_BACKENDS_PROFILER_GPU_CUPTI_TRACER_H_ #define XLA_BACKENDS_PROFILER_GPU_CUPTI_TRACER_H_ +#include +#include + #include "absl/status/status.h" #include "absl/types/optional.h" #include "third_party/gpus/cuda/extras/CUPTI/include/cupti.h" @@ -46,6 +49,8 @@ struct CuptiTracerOptions { bool enable_nvtx_tracking = false; }; +class CuptiTracer; + class CuptiDriverApiHook { public: virtual ~CuptiDriverApiHook() {} @@ -54,16 +59,17 @@ class CuptiDriverApiHook { int device_id, CUpti_CallbackDomain domain, CUpti_CallbackId cbid, const CUpti_CallbackData* callback_info) = 0; virtual absl::Status OnDriverApiExit( - int device_id, CUpti_CallbackDomain domain, CUpti_CallbackId cbid, + absl::string_view annotation, absl::string_view nvtx_range, int device_id, + CUpti_CallbackDomain domain, CUpti_CallbackId cbid, const CUpti_CallbackData* callback_info) = 0; virtual absl::Status SyncAndFlush() = 0; protected: static absl::Status AddDriverApiCallbackEvent( - CuptiTraceCollector* collector, CuptiInterface* cupti_interface, - int device_id, uint64_t start_tsc, uint64_t end_tsc, - CUpti_CallbackDomain domain, CUpti_CallbackId cbid, - const CUpti_CallbackData* callback_info); + CuptiTracer* tracer, CuptiInterface* cupti_interface, + absl::string_view annotation, absl::string_view nvtx_range, int device_id, + uint64_t start_tsc, uint64_t end_tsc, CUpti_CallbackDomain domain, + CUpti_CallbackId cbid, const CUpti_CallbackData* callback_info); }; // The class use to enable cupti callback/activity API and forward the collected @@ -119,10 +125,17 @@ class CuptiTracer { std::atomic cupti_dropped_activity_event_count_ = 0; std::atomic num_activity_events_in_dropped_buffer_ = 0; std::atomic num_activity_events_in_cached_buffer_ = 0; + std::atomic num_callback_events_ = 0; // Clear activity_buffers, reset activity event counters. void PrepareActivityStart(); + // Empty all per-thread callback annotations, reset callback event counter. 
+ void PrepareCallbackStart(); + + // Gather all per-thread callback events and annotations. + std::list GatherCallbackAnnotationsAndEvents(); + absl::Status EnableApiTracing(); absl::Status EnableActivityTracing(); absl::Status DisableApiTracing();