/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
 */

#include <string.h>

#include <executorch/runtime/platform/assert.h>
#include <executorch/runtime/platform/platform.h>
#include <executorch/runtime/platform/profiler.h>

#include <inttypes.h>

namespace executorch {
namespace runtime {

namespace {
static uint8_t prof_buf[prof_buf_size * MAX_PROFILE_BLOCKS];
// Base pointer for header
static prof_header_t* prof_header =
(prof_header_t*)((uintptr_t)prof_buf + prof_header_offset);
// Base pointer for profiling entries
static prof_event_t* prof_arr =
(prof_event_t*)((uintptr_t)prof_buf + prof_events_offset);
// Base pointer for memory allocator info array
static prof_allocator_t* mem_allocator_arr =
(prof_allocator_t*)((uintptr_t)prof_buf + prof_mem_alloc_info_offset);
// Base pointer for memory profiling entries
static mem_prof_event_t* mem_prof_arr =
(mem_prof_event_t*)((uintptr_t)prof_buf + prof_mem_alloc_events_offset);
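
// Number of profiling blocks in use, and whether event-name strings have
// already been copied into the buffer by dump_profile_stats().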
static uint32_t num_blocks = 0;
static bool prof_stats_dumped = false;
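// Current {chain_idx, instruction_idx} profiling context; the default
// {-1, 0} means no instruction scope is active.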
prof_state_t profile_state_tls{-1, 0u};
} // namespace

const prof_state_t& get_profile_tls_state() {
  return profile_state_tls;
}

void set_profile_tls_state(const prof_state_t& state) {
  profile_state_tls = state;
}

ExecutorchProfilerInstructionScope::ExecutorchProfilerInstructionScope(
    const prof_state_t& state)
    : old_state_(get_profile_tls_state()) {
  set_profile_tls_state(state);
}

ExecutorchProfilerInstructionScope::~ExecutorchProfilerInstructionScope() {
  set_profile_tls_state(old_state_);
}
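
// Usage sketch for ExecutorchProfilerInstructionScope above: the executor can
// tag every profiling event issued while one instruction runs (the indices
// here are illustrative):
//
//   ExecutorchProfilerInstructionScope scope({/*chain_idx=*/0,
//                                             /*instruction_idx=*/3});
//   // ... execute the instruction; events recorded here carry {0, 3} ...
//   // The previous TLS state is restored when `scope` goes out of scope.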
uint32_t begin_profiling(const char* name) {
ET_CHECK_MSG(
prof_header->prof_entries < MAX_PROFILE_EVENTS,
"Out of profiling buffer space. Increase MAX_PROFILE_EVENTS and re-compile.");
uint32_t curr_counter = prof_header->prof_entries;
prof_header->prof_entries++;
prof_arr[curr_counter].end_time = 0;
prof_arr[curr_counter].name_str = name;
prof_state_t state = get_profile_tls_state();
prof_arr[curr_counter].chain_idx = state.chain_idx;
prof_arr[curr_counter].instruction_idx = state.instruction_idx;
  // Set the start time last so that the overhead of this function itself is
  // not captured in the measured interval.
prof_arr[curr_counter].start_time = et_pal_current_ticks();
return curr_counter;
}

void end_profiling(uint32_t token_id) {
ET_CHECK_MSG(token_id < MAX_PROFILE_EVENTS, "Invalid token id.");
prof_arr[token_id].end_time = et_pal_current_ticks();
}
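
// Usage sketch for begin_profiling()/end_profiling() above; the RAII wrapper
// ExecutorchProfiler at the bottom of this file packages the same pattern:
//
//   uint32_t tok = begin_profiling("my_kernel"); // name is illustrative
//   run_my_kernel();                             // hypothetical workload
//   end_profiling(tok);
//
// Note that the name pointer must stay valid until dump_profile_stats()
// copies the string into the buffer.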
void dump_profile_stats(prof_result_t* prof_result) {
prof_result->prof_data = (uint8_t*)prof_buf;
prof_result->num_bytes = num_blocks * prof_buf_size;
prof_result->num_blocks = num_blocks;
if (!prof_stats_dumped) {
for (size_t i = 0; i < num_blocks; i++) {
prof_header_t* prof_header_local =
(prof_header_t*)(prof_buf + prof_buf_size * i);
prof_event_t* prof_event_local =
(prof_event_t*)(prof_buf + prof_buf_size * i + prof_events_offset);
// Copy over the string names into the space allocated in prof_event_t. We
// avoided doing this earlier to keep the overhead in begin_profiling and
// end_profiling as low as possible.
for (size_t j = 0; j < prof_header_local->prof_entries; j++) {
size_t str_len = strlen(prof_event_local[j].name_str);
const char* str_ptr = prof_event_local[j].name_str;
memset(prof_event_local[j].name, 0, PROF_NAME_MAX_LEN);
if (str_len > PROF_NAME_MAX_LEN) {
memcpy(prof_event_local[j].name, str_ptr, PROF_NAME_MAX_LEN);
} else {
memcpy(prof_event_local[j].name, str_ptr, str_len);
}
}
}
}
prof_stats_dumped = true;
}
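
// Retrieval sketch for dump_profile_stats() above: a host-side harness might
// drain the buffer like this (write_to_host is a hypothetical transport, not
// part of this file):
//
//   prof_result_t result;
//   dump_profile_stats(&result);
//   write_to_host(result.prof_data, result.num_bytes);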
void reset_profile_stats() {
prof_stats_dumped = false;
prof_header->prof_entries = 0;
prof_header->allocator_entries = 0;
prof_header->mem_prof_entries = 0;
}

void track_allocation(int32_t id, uint32_t size) {
  if (id == -1) {
    return;
  }
ET_CHECK_MSG(
prof_header->mem_prof_entries < MAX_MEM_PROFILE_EVENTS,
"Out of memory profiling buffer space. Increase MAX_MEM_PROFILE_EVENTS\
to %" PRIu32 " and re-compile.",
prof_header->mem_prof_entries);
mem_prof_arr[prof_header->mem_prof_entries].allocator_id = id;
mem_prof_arr[prof_header->mem_prof_entries].allocation_size = size;
prof_header->mem_prof_entries++;
}

uint32_t track_allocator(const char* name) {
ET_CHECK_MSG(
prof_header->allocator_entries < MEM_PROFILE_MAX_ALLOCATORS,
"Out of allocator tracking space, %" PRIu32
" is needed. Increase MEM_PROFILE_MAX_ALLOCATORS and re-compile",
prof_header->allocator_entries);
size_t str_len = strlen(name);
size_t num_allocators = prof_header->allocator_entries;
memset(mem_allocator_arr[num_allocators].name, 0, PROF_NAME_MAX_LEN);
if (str_len > PROF_NAME_MAX_LEN) {
memcpy(mem_allocator_arr[num_allocators].name, name, PROF_NAME_MAX_LEN);
} else {
memcpy(mem_allocator_arr[num_allocators].name, name, str_len);
}
mem_allocator_arr[num_allocators].allocator_id = num_allocators;
return prof_header->allocator_entries++;
}
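
// Usage sketch for track_allocator()/track_allocation() above: register an
// allocator once, then tag each allocation with the returned id (names and
// sizes are illustrative):
//
//   uint32_t heap_id = track_allocator("activation_heap");
//   track_allocation(heap_id, /*size=*/1024);
//   track_allocation(-1, 512); // id -1: ignored by the profiler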
void profiling_create_block(const char* name) {
  // If the current profiling block is unused, keep using it; otherwise move
  // on to the next block.
if (prof_header->prof_entries != 0 || prof_header->mem_prof_entries != 0 ||
prof_header->allocator_entries != 0 || num_blocks == 0) {
num_blocks += 1;
ET_CHECK_MSG(
num_blocks <= MAX_PROFILE_BLOCKS,
"Only %d blocks are supported and they've all been used up but %" PRIu32
" is used. Increment MAX_PROFILE_BLOCKS and re-run",
MAX_PROFILE_BLOCKS,
num_blocks);
}
// Copy over the name of this profiling block.
size_t str_len =
strlen(name) >= PROF_NAME_MAX_LEN ? PROF_NAME_MAX_LEN : strlen(name);
uintptr_t base = (uintptr_t)prof_buf + (num_blocks - 1) * prof_buf_size;
prof_header = (prof_header_t*)(base + prof_header_offset);
memset(prof_header->name, 0, PROF_NAME_MAX_LEN);
memcpy(prof_header->name, name, str_len);
  // Set the profiler version for compatibility checks in the post-processing
  // tool.
prof_header->prof_ver = ET_PROF_VER;
// Set the maximum number of entries that this block can support.
prof_header->max_prof_entries = MAX_PROFILE_EVENTS;
prof_header->max_allocator_entries = MEM_PROFILE_MAX_ALLOCATORS;
prof_header->max_mem_prof_entries = MAX_MEM_PROFILE_EVENTS;
reset_profile_stats();
// Set the base addresses for the various profiling entries arrays.
prof_arr = (prof_event_t*)(base + prof_events_offset);
mem_allocator_arr = (prof_allocator_t*)(base + prof_mem_alloc_info_offset);
mem_prof_arr = (mem_prof_event_t*)(base + prof_mem_alloc_events_offset);
}
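
// Usage sketch for profiling_create_block() above: start a fresh block per
// phase so events are attributed separately (phase names are illustrative):
//
//   profiling_create_block("load_model");
//   // ... load the program ...
//   profiling_create_block("inference");
//   // ... execute; note that an unused current block is reused rather
//   // than advanced ...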
void profiler_init(void) {
profiling_create_block("default");
}

ExecutorchProfiler::ExecutorchProfiler(const char* name) {
prof_tok = begin_profiling(name);
}

ExecutorchProfiler::~ExecutorchProfiler() {
end_profiling(prof_tok);
}
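
// RAII usage sketch for ExecutorchProfiler above: the recorded event spans
// the enclosing scope (the event name is illustrative):
//
//   {
//     ExecutorchProfiler profiler("operator_add");
//     // ... work being measured ...
//   } // end_profiling(prof_tok) runs here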
} // namespace runtime
} // namespace executorch