[go: nahoru, domu]

Skip to content

Commit

Permalink
XProf GPU: Using Per-Thread callback api data for CUPTI collector for…
Browse files Browse the repository at this point in the history
… better overhead.

PiperOrigin-RevId: 621568181
  • Loading branch information
tensorflower-gardener committed Jun 6, 2024
1 parent a2c4fbf commit e1e1de2
Show file tree
Hide file tree
Showing 7 changed files with 403 additions and 189 deletions.
4 changes: 4 additions & 0 deletions third_party/xla/xla/backends/profiler/gpu/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,8 @@ tsl_gpu_library(
"@local_tsl//tsl/platform:platform_port",
"@local_tsl//tsl/platform:types",
"@local_tsl//tsl/profiler/backends/cpu:annotation_stack",
"@local_tsl//tsl/profiler/utils:lock_free_queue",
"@local_tsl//tsl/profiler/utils:per_thread",
],
)

Expand Down Expand Up @@ -302,6 +304,7 @@ tsl_gpu_library(
"@local_tsl//tsl/platform:mutex",
"@local_tsl//tsl/platform:platform_port",
"@local_tsl//tsl/profiler/protobuf:xplane_proto_cc",
"@local_tsl//tsl/profiler/utils:lock_free_queue",
"@local_tsl//tsl/profiler/utils:parse_annotation",
"@local_tsl//tsl/profiler/utils:trace_utils",
"@local_tsl//tsl/profiler/utils:xplane_builder",
Expand All @@ -328,6 +331,7 @@ tsl_gpu_library(
"@local_tsl//tsl/platform:platform_port",
"@local_tsl//tsl/platform:thread_annotations",
"@local_tsl//tsl/profiler/utils:buffer_pool",
"@local_tsl//tsl/profiler/utils:lock_free_queue",
] + if_cuda(["//xla/tsl/cuda:cupti"]),
)

Expand Down
100 changes: 74 additions & 26 deletions third_party/xla/xla/backends/profiler/gpu/cupti_buffer_events.cc
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,8 @@ static absl::Status ConvertActivityBuffer(
CUptiResult status =
cupti_interface->ActivityGetNextRecord(buffer, size, &record);
if (status == CUPTI_SUCCESS) {
if (total_activity_event_count >= max_activity_event_count) {
if (max_activity_event_count > 0 &&
total_activity_event_count >= max_activity_event_count) {
dropped_activity_event_count++;
continue;
}
Expand Down Expand Up @@ -438,33 +439,30 @@ static absl::Status ConvertActivityBuffer(

} // namespace

void AnnotationMap::Add(uint32_t device_id, uint32_t correlation_id,
const absl::string_view annotation,
const absl::string_view nvtx_range) {
if (annotation.empty() && nvtx_range.empty()) return;
VLOG(3) << "Add annotation: device_id: " << device_id
<< " correlation_id: " << correlation_id
<< " annotation: " << annotation;
if (device_id >= per_device_map_.size()) return;
auto &per_device_map = per_device_map_[device_id];
tsl::mutex_lock lock(per_device_map.mutex);
if (per_device_map.annotations.size() < max_size_) {
AnnotationInfo info;
info.annotation = *per_device_map.annotations.emplace(annotation).first;
if (!nvtx_range.empty())
info.nvtx_range = *per_device_map.nvtx_ranges.emplace(nvtx_range).first;
per_device_map.correlation_map.emplace(correlation_id, info);
}
// Returns a view of the stored copy of `str`, inserting it when allowed.
// An empty `str` always yields an empty view. A `max_unique_count` of 0 means
// no limit; otherwise, once `max_unique_count` strings are stored, only
// strings that are already present are returned and unseen strings yield an
// empty view.
absl::string_view StringDeduper::Dedup(absl::string_view str,
                                       size_t max_unique_count) {
  if (str.empty()) return absl::string_view();

  const bool at_capacity =
      max_unique_count != 0 && strings_.size() >= max_unique_count;
  if (at_capacity) {
    // No room for new entries: only hand back strings we already own.
    const auto existing = strings_.find(str);
    if (existing == strings_.end()) return absl::string_view();
    return *existing;
  }
  return *strings_.emplace(str).first;
}

void AnnotationMap::AddAnnotation(uint32_t correlation_id,
absl::string_view annotation,
absl::string_view nvtx_range) {
auto annotation_view = string_deduper_.Dedup(annotation);
auto nvtx_range_view = string_deduper_.Dedup(nvtx_range);
if (annotation_view.empty() && nvtx_range_view.empty()) return;
map_.emplace(correlation_id,
AnnotationInfo{annotation_view, nvtx_range_view});
}

AnnotationMap::AnnotationInfo AnnotationMap::LookUp(uint32_t device_id,
uint32_t correlation_id) {
if (device_id >= per_device_map_.size()) return AnnotationInfo();
auto &per_device_map = per_device_map_[device_id];
tsl::mutex_lock lock(per_device_map.mutex);
auto it = per_device_map.correlation_map.find(correlation_id);
return it != per_device_map.correlation_map.end() ? it->second
: AnnotationInfo();
// Returns the AnnotationInfo stored for `correlation_id`, or a
// default-constructed AnnotationInfo when there is no entry. `device_id` is
// not consulted by this implementation.
AnnotationMap::AnnotationInfo AnnotationMap::LookUp(
    uint32_t device_id, uint32_t correlation_id) const {
  const auto entry = map_.find(correlation_id);
  if (entry == map_.end()) return AnnotationInfo();
  return entry->second;
}

CuptiActivityBufferManager::ActivityBufferAndSize::ActivityBufferAndSize(
Expand Down Expand Up @@ -493,5 +491,55 @@ void CuptiActivityBufferManager::AddCachedActivityEventsTo(
}
}

// Move constructor: move-constructs the two string dedupers and the event
// queue from `another`, copies its dropped-event count, and then calls
// another.Clear() on the source object.
CallbackAnnotationsAndEvents::CallbackAnnotationsAndEvents(
CallbackAnnotationsAndEvents &&another)
: annotations_(std::move(another.annotations_)),
nvtx_ranges_(std::move(another.nvtx_ranges_)),
num_dropped_events_(another.num_dropped_events_),
event_annotation_queue_(std::move(another.event_annotation_queue_)) {
// Reset the source explicitly instead of relying on its moved-from state.
another.Clear();
}

// Move assignment: takes over the annotation strings, NVTX range strings,
// dropped-event count and queued events of `another`, then calls
// another.Clear() on the source object. Returns *this.
CallbackAnnotationsAndEvents &CallbackAnnotationsAndEvents::operator=(
    CallbackAnnotationsAndEvents &&another) {
  // Self-move must be a no-op: without this guard the trailing
  // another.Clear() would wipe the very object being assigned to.
  if (this == &another) return *this;

  annotations_ = std::move(another.annotations_);
  nvtx_ranges_ = std::move(another.nvtx_ranges_);
  num_dropped_events_ = another.num_dropped_events_;
  event_annotation_queue_ = std::move(another.event_annotation_queue_);
  another.Clear();
  return *this;
}

// Decides whether one more callback API event may be recorded and, if so,
// interns its annotation strings.
//
// When `max_callback_api_events` is 0 (no limit) or
// `callback_api_event_count` is still below it, the counter is incremented,
// `annotation` and `nvtx_range` are replaced by views returned from this
// object's StringDedupers, and true is returned. Otherwise the event is
// counted in num_dropped_events_ and false is returned; the two views are
// left untouched.
//
// `max_annotation_strings` (0 means no limit) caps the number of distinct
// annotation strings kept by this object. Once the cap is reached, both
// output views are set to empty. `device_id` and `correlation_id` are not
// used by this implementation.
bool CallbackAnnotationsAndEvents::PrepareAnnotation(
    uint32_t device_id, uint32_t correlation_id, size_t max_annotation_strings,
    size_t max_callback_api_events,
    std::atomic<size_t> &callback_api_event_count,
    absl::string_view &annotation, absl::string_view &nvtx_range) {
  const bool within_event_limit =
      max_callback_api_events == 0 ||
      callback_api_event_count < max_callback_api_events;
  if (!within_event_limit) {
    num_dropped_events_++;
    return false;
  }

  ++callback_api_event_count;
  // No cross-thread string comparison should be made here, so
  // max_annotation_strings only limits the annotation string count of this
  // (per-thread) object. Annotation strings are not collected at all once
  // the total callback event count would overflow (handled above).
  const bool too_many_annotations =
      (max_annotation_strings > 0) &&
      (annotations_.size() >= max_annotation_strings);
  // Two separate statements: the original joined them with a stray comma
  // operator.
  annotation = annotations_.Dedup(too_many_annotations ? absl::string_view()
                                                       : annotation);
  nvtx_range = nvtx_ranges_.Dedup(too_many_annotations ? absl::string_view()
                                                       : nvtx_range);
  return true;
}

// Empties this object: clears the queued events, zeroes the dropped-event
// counter and clears both string dedupers.
void CallbackAnnotationsAndEvents::Clear() {
  event_annotation_queue_.Clear();
  num_dropped_events_ = 0;
  nvtx_ranges_.clear();
  annotations_.clear();
}

} // namespace profiler
} // namespace xla
98 changes: 76 additions & 22 deletions third_party/xla/xla/backends/profiler/gpu/cupti_buffer_events.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ limitations under the License.
#ifndef XLA_BACKENDS_PROFILER_GPU_CUPTI_BUFFER_EVENTS_H_
#define XLA_BACKENDS_PROFILER_GPU_CUPTI_BUFFER_EVENTS_H_

#include <atomic>
#include <cstddef>
#include <cstdint>
#include <functional>
Expand All @@ -25,14 +26,14 @@ limitations under the License.
#include <string>
#include <utility>

#include "absl/container/fixed_array.h"
#include "absl/container/flat_hash_map.h"
#include "absl/container/node_hash_set.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "tsl/platform/mutex.h"
#include "tsl/platform/thread_annotations.h"
#include "tsl/profiler/utils/buffer_pool.h"
#include "tsl/profiler/utils/lock_free_queue.h"

namespace xla {
namespace profiler {
Expand Down Expand Up @@ -217,36 +218,35 @@ struct CuptiTracerEvent {
};
};

// Stores unique strings in a node_hash_set and hands out string_views that
// refer to the stored copies.
class StringDeduper {
 public:
  // Returns a view of the stored copy of `str`. `max_unique_count` limits how
  // many distinct strings may be stored; the default of 0 means no limit.
  absl::string_view Dedup(absl::string_view str, size_t max_unique_count = 0);

  // Number of distinct strings currently stored.
  size_t size() const { return strings_.size(); }

  // Removes every stored string.
  void clear() { strings_.clear(); }

 private:
  absl::node_hash_set<std::string> strings_;
};

// Maps a correlation id to the annotation / NVTX range strings recorded for
// it.
// NOTE(review): this span is taken from a diff hunk and appears to contain
// declarations from both the previous and the new revision of the class —
// confirm against the checked-in header before relying on any one member.
class AnnotationMap {
public:
// The pair of string views associated with one correlation id.
struct AnnotationInfo {
absl::string_view annotation;
absl::string_view nvtx_range;
};

explicit AnnotationMap(uint64_t max_size, uint32_t num_gpus)
: max_size_(max_size), per_device_map_(num_gpus) {}
void Add(uint32_t device_id, uint32_t correlation_id,
absl::string_view annotation, absl::string_view nvtx_range);
AnnotationInfo LookUp(uint32_t device_id, uint32_t correlation_id);
// Removes all entries from map_ and all strings from string_deduper_.
void clear() {
map_.clear();
string_deduper_.clear();
}
// Number of entries currently in map_.
size_t size() const { return map_.size(); }
void AddAnnotation(uint32_t correlation_id, absl::string_view annotation,
absl::string_view nvtx_range);
AnnotationInfo LookUp(uint32_t device_id, uint32_t correlation_id) const;

private:
struct PerDeviceAnnotationMap {
// The population/consumption of annotations might happen from multiple
// callback/activity api related threads.
tsl::mutex mutex;
// Annotations tend to be repetitive, so use a hash_set to store the strings
// and use the reference to the string in the map.
absl::node_hash_set<std::string> annotations TF_GUARDED_BY(mutex);
absl::node_hash_set<std::string> nvtx_ranges TF_GUARDED_BY(mutex);
absl::flat_hash_map<uint32_t, AnnotationInfo> correlation_map
TF_GUARDED_BY(mutex);
};
const uint64_t max_size_;
absl::FixedArray<PerDeviceAnnotationMap> per_device_map_;

AnnotationMap(const AnnotationMap&) = delete;
void operator=(const AnnotationMap&) = delete;
// Backing storage for the strings referenced by the views kept in map_.
StringDeduper string_deduper_;
// Keyed by correlation id.
absl::flat_hash_map<uint32_t, AnnotationInfo> map_;
};

struct CuptiEventCollectorDelegate {
Expand Down Expand Up @@ -290,6 +290,60 @@ class CuptiActivityBufferManager {
std::list<ActivityBufferAndSize> cached_buffers_ TF_GUARDED_BY(buffer_mutex_);
};

// Holds a queue of callback API events together with the annotation and NVTX
// range strings those events refer to, plus a count of events that were
// dropped. The class declares move operations only, so it is movable but not
// copyable.
// NOTE(review): presumably one instance exists per callback thread — confirm
// against the tracer that owns these objects.
class CallbackAnnotationsAndEvents {
 public:
  // One queued entry: a tracer event plus the correlation id and the
  // annotation / NVTX range views that belong to it.
  struct EventWithAnnotation {
    uint32_t correlation_id = 0;
    absl::string_view annotation;
    absl::string_view nvtx_range;
    CuptiTracerEvent event = {};

    EventWithAnnotation() = default;

    // Leaves `event` value-initialized.
    EventWithAnnotation(uint32_t corr_id, absl::string_view ann,
                        absl::string_view nvtx)
        : correlation_id(corr_id), annotation(ann), nvtx_range(nvtx) {}
  };

  // Block-size template argument passed to BlockedQueue below.
  static constexpr size_t kQueueBlockSize = 64 * 1024;
  using EventAnnotationQueue =
      tsl::profiler::BlockedQueue<EventWithAnnotation, kQueueBlockSize>;

  CallbackAnnotationsAndEvents() = default;

  CallbackAnnotationsAndEvents(CallbackAnnotationsAndEvents&& another);

  CallbackAnnotationsAndEvents& operator=(
      CallbackAnnotationsAndEvents&& another);

  // Drops all queued events and stored strings and resets the dropped-event
  // counter.
  void Clear();

  // Checks whether the limits on annotation count and event count allow one
  // more event to be added. If so, saves the annotation / NVTX range in the
  // StringDedupers to keep the strings alive, updates the two string_views to
  // refer to the stored copies, and returns true. Returns false when no more
  // events may be added.
  bool PrepareAnnotation(uint32_t device_id, uint32_t correlation_id,
                         size_t max_annotation_strings,
                         size_t max_callback_api_events,
                         std::atomic<size_t>& callback_api_event_count,
                         absl::string_view& annotation,
                         absl::string_view& nvtx_range);

  // Mutable access to the queue of recorded events.
  EventAnnotationQueue& event_annotation_queue() {
    return event_annotation_queue_;
  }

  // Number of events rejected so far. Const-qualified so it can also be read
  // through a const reference.
  size_t num_dropped_events() const { return num_dropped_events_; }

 private:
  // Annotations tend to be repetitive, so use a hash_set to store the strings
  // and use the reference to the string in the map.
  StringDeduper annotations_;
  StringDeduper nvtx_ranges_;
  size_t num_dropped_events_ = 0;
  EventAnnotationQueue event_annotation_queue_;
};

} // namespace profiler
} // namespace xla

Expand Down
57 changes: 45 additions & 12 deletions third_party/xla/xla/backends/profiler/gpu/cupti_collector.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ limitations under the License.
#include "third_party/gpus/cuda/include/cuda_occupancy.h"
#include "tsl/platform/abi.h"
#include "tsl/platform/host_info.h"
#include "tsl/platform/mem.h"
#include "tsl/platform/mutex.h"
#include "tsl/profiler/utils/parse_annotation.h"
#include "tsl/profiler/utils/trace_utils.h"
Expand Down Expand Up @@ -483,6 +484,35 @@ void CuptiTraceCollector::OnTracerCachedActivityBuffers(
}
}

// Merges the callback data collected by the tracer into this collector.
// The first pass rebuilds annotation_map_ from the correlation id, annotation
// and NVTX range of every queued entry. The second pass forwards each queued
// event to AddEvent(), sums the dropped-event counts, and clears each list
// element. A non-zero dropped total is reported through OnEventsDropped().
void CuptiTraceCollector::OnTracerCollectedCallbackData(
    std::list<CallbackAnnotationsAndEvents> callback_annotations_and_events) {
  // Pass 1: build the merged annotation map.
  annotation_map_.clear();
  for (auto& collected : callback_annotations_and_events) {
    for (auto&& queued : collected.event_annotation_queue()) {
      annotation_map_.AddAnnotation(queued.correlation_id, queued.annotation,
                                    queued.nvtx_range);
    }
  }
  VLOG(3) << "Total merged annotation map: " << annotation_map_.size();

  // Pass 2: hand over the events and tally what was dropped.
  size_t dropped_total = 0;
  for (auto& collected : callback_annotations_and_events) {
    for (auto&& queued : collected.event_annotation_queue()) {
      AddEvent(std::move(queued.event));
    }
    dropped_total += collected.num_dropped_events();
    collected.Clear();
  }
  if (dropped_total > 0) {
    OnEventsDropped("total driver(callback) events reaches max",
                    dropped_total);
  }
}

// CuptiTraceCollectorImpl store the CuptiTracerEvents from CuptiTracer and
// eventually convert and filter them to XSpace.
class CuptiTraceCollectorImpl : public CuptiTraceCollector {
Expand All @@ -500,16 +530,8 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
void AddEvent(CuptiTracerEvent&& event) override {
if (event.device_id >= num_gpus_) return;
if (event.source == CuptiTracerEventSource::DriverCallback) {
if (num_callback_events_ > options_.max_callback_api_events) {
OnEventsDropped("total driver(callback) events reaches max", 1);
return;
}
num_callback_events_++;
} else {
if (num_activity_events_ > options_.max_activity_api_events) {
OnEventsDropped("total device(activity) events reaches max", 1);
return;
}
num_activity_events_++;
}
per_device_collector_[event.device_id].AddEvent(std::move(event));
Expand All @@ -525,15 +547,25 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
activity_buffers_ = std::move(activity_buffers);
}

// Takes ownership of the callback data handed over by the tracer and keeps it
// in callback_events_; no processing happens here.
void OnTracerCollectedCallbackData(
std::list<CallbackAnnotationsAndEvents> callback_events) override {
callback_events_ = std::move(callback_events);
}

void Flush() override {}
// Returns true if some GPU events are captured.
bool Export(XSpace* space, uint64_t end_gpu_ns) override {
CuptiTraceCollector::OnTracerCollectedCallbackData(
std::move(callback_events_));
CuptiTraceCollector::OnTracerCachedActivityBuffers(
std::move(activity_buffers_));

LOG(INFO) << " GpuTracer has collected " << num_callback_events_
<< " callback api events and " << num_activity_events_
<< " activity events. " << ReportDroppedEvents();
// Log the configured limits. The "max callback_events" label must print the
// callback API limit, not the activity API limit.
LOG(INFO) << " GpuTracer max callback_events: "
          << options_.max_callback_api_events
          << ", max activity events: " << options_.max_activity_api_events;
size_t num_events = 0;
XPlaneBuilder host_plane(
FindOrAddMutablePlaneWithName(space, kCuptiDriverApiPlaneName));
Expand Down Expand Up @@ -571,15 +603,16 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
if (events_dropped.empty()) return "";
return absl::StrCat("Detected GPU events dropped on ",
tsl::port::Hostname(), ": Profiler has collected ",
num_callback_events_.load(), " driver events and ",
num_activity_events_.load(), " device events.",
num_callback_events_, " driver events and ",
num_activity_events_, " device events.",
events_dropped);
}

private:
std::atomic<int> num_callback_events_;
std::atomic<int> num_activity_events_;
size_t num_callback_events_ = 0;
size_t num_activity_events_ = 0;
std::unique_ptr<CuptiActivityBufferManager> activity_buffers_;
std::list<CallbackAnnotationsAndEvents> callback_events_;
absl::Mutex mutex_;
absl::flat_hash_map<std::string, uint64_t> dropped_events_
ABSL_GUARDED_BY(mutex_);
Expand Down
Loading

0 comments on commit e1e1de2

Please sign in to comment.