/* Copyright (C) 2020 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */


#ifndef _TU_TRACEPOINTS_H
#define _TU_TRACEPOINTS_H

#include "vk_format.h"
#include "util/sha1/sha1.h"
#include "common/freedreno_lrz.h"

#include "util/perf/u_trace.h"

#ifdef __cplusplus
extern "C" {
#endif

/* Forward declarations so the prototypes in this header can take pointers to
 * these driver types without pulling in their full definitions. */
struct tu_cmd_buffer;
struct tu_device;
struct tu_framebuffer;
struct tu_tiling_config;
/* Character array of SHA1_DIGEST_STRING_LENGTH bytes.  Used for the *_sha1
 * members of struct trace_start_draw and the matching wrapper parameters. */
typedef char tu_sha1_str[SHA1_DIGEST_STRING_LENGTH];

/*
 * One bit per GPU tracepoint group.  The inline trace_*() wrappers in this
 * header test these bits against the global tu_gpu_tracepoint mask before
 * calling their out-of-line __trace_*() counterpart.  A start_* wrapper and
 * its end_* partner test the same bit (e.g. trace_start_cmd_buffer() and
 * trace_end_cmd_buffer() both test TU_GPU_TRACEPOINT_CMD_BUFFER).
 *
 * Note that the enum tag has the same spelling as the global mask variable.
 */
enum tu_gpu_tracepoint {
   TU_GPU_TRACEPOINT_CMD_BUFFER = 1ull << 0,
   TU_GPU_TRACEPOINT_SECONDARY_CMD_BUFFER = 1ull << 1,
   TU_GPU_TRACEPOINT_RENDER_PASS = 1ull << 2,
   TU_GPU_TRACEPOINT_DRAW = 1ull << 3,
   TU_GPU_TRACEPOINT_BINNING_IB = 1ull << 4,
   TU_GPU_TRACEPOINT_DRAW_IB_SYSMEM = 1ull << 5,
   TU_GPU_TRACEPOINT_DRAW_IB_GMEM = 1ull << 6,
   TU_GPU_TRACEPOINT_GENERIC_CLEAR = 1ull << 7,
   TU_GPU_TRACEPOINT_GMEM_CLEAR = 1ull << 8,
   TU_GPU_TRACEPOINT_SYSMEM_CLEAR = 1ull << 9,
   TU_GPU_TRACEPOINT_SYSMEM_CLEAR_ALL = 1ull << 10,
   TU_GPU_TRACEPOINT_GMEM_LOAD = 1ull << 11,
   TU_GPU_TRACEPOINT_GMEM_STORE = 1ull << 12,
   TU_GPU_TRACEPOINT_SYSMEM_RESOLVE = 1ull << 13,
   TU_GPU_TRACEPOINT_BLIT = 1ull << 14,
   TU_GPU_TRACEPOINT_COMPUTE = 1ull << 15,
   TU_GPU_TRACEPOINT_COMPUTE_INDIRECT = 1ull << 16,
   TU_GPU_TRACEPOINT_CMD_BUFFER_ANNOTATION = 1ull << 17,
   TU_GPU_TRACEPOINT_CMD_BUFFER_ANNOTATION_RP = 1ull << 18,
};

/*
 * Global mask of TU_GPU_TRACEPOINT_* bits.  Every inline trace_*() wrapper in
 * this header forwards to its __trace_*() function only when the matching bit
 * is set here and the u_trace context reports a non-zero enabled_traces
 * value.  This is a declaration only; the definition lives elsewhere.
 */
extern uint64_t tu_gpu_tracepoint;

/* NOTE(review): presumably initializes tu_gpu_tracepoint from configuration;
 * the implementation is not visible in this header -- confirm. */
void tu_gpu_tracepoint_config_variable(void);


/*
 * start_cmd_buffer
 */
/*
 * Payload of the start_cmd_buffer tracepoint; this is the type the Perfetto
 * entry point below receives as `payload`.
 * NOTE(review): only the two debug-flag strings are passed to
 * __trace_start_cmd_buffer() directly, so the other members are presumably
 * filled in from `cmd` -- confirm against the implementation.
 */
struct trace_start_cmd_buffer {
   VkCommandBuffer command_buffer_handle;
   const char * appName;
   const char * engineName;
   uint8_t oneTimeSubmit;
   uint8_t simultaneousUse;
   /* 96 characters plus one extra byte, presumably for the terminating NUL */
   char TUdebugFlags[96 + 1];
   /* 96 characters plus one extra byte, presumably for the terminating NUL */
   char IR3debugFlags[96 + 1];
};
#ifdef HAVE_PERFETTO
/* Perfetto entry point for start_cmd_buffer; declared only when HAVE_PERFETTO
 * is defined.  Implemented elsewhere. */
void tu_perfetto_start_cmd_buffer(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_start_cmd_buffer *payload,
   const void *indirect_data);
#endif
/* Out-of-line half of the tracepoint; the inline trace_start_cmd_buffer()
 * wrapper calls it once its enable checks pass. */
void __trace_start_cmd_buffer(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
     , struct tu_cmd_buffer * cmd
     , const char *TUdebugFlags
     , const char *IR3debugFlags
);
static ALWAYS_INLINE void trace_start_cmd_buffer(
     struct u_trace *ut
   , void *cs
   , struct tu_cmd_buffer * cmd
   , const char *TUdebugFlags
   , const char *IR3debugFlags
) {
   enum u_trace_type enabled_traces = p_atomic_read_relaxed(&ut->utctx->enabled_traces);
   if (!unlikely(enabled_traces != 0 &&
                 (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_CMD_BUFFER)))
      return;
   __trace_start_cmd_buffer(
        ut
      , enabled_traces
      , cs
      , cmd
      , TUdebugFlags
      , IR3debugFlags
   );
}

/*
 * end_cmd_buffer
 */
/*
 * Payload of the end_cmd_buffer tracepoint; this is the type the Perfetto
 * entry point below receives as `payload`.
 * NOTE(review): __trace_end_cmd_buffer() only takes `cmd`, so both counters
 * are presumably read from it -- confirm against the implementation.
 */
struct trace_end_cmd_buffer {
   uint32_t renderpasses;
   uint32_t dispatches;
};
#ifdef HAVE_PERFETTO
/* Perfetto entry point for end_cmd_buffer; declared only when HAVE_PERFETTO
 * is defined.  Implemented elsewhere. */
void tu_perfetto_end_cmd_buffer(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_end_cmd_buffer *payload,
   const void *indirect_data);
#endif
/* Out-of-line half of the tracepoint; the inline trace_end_cmd_buffer()
 * wrapper calls it once its enable checks pass. */
void __trace_end_cmd_buffer(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
     , const struct tu_cmd_buffer * cmd
);
static ALWAYS_INLINE void trace_end_cmd_buffer(
     struct u_trace *ut
   , void *cs
   , const struct tu_cmd_buffer * cmd
) {
   enum u_trace_type enabled_traces = p_atomic_read_relaxed(&ut->utctx->enabled_traces);
   if (!unlikely(enabled_traces != 0 &&
                 (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_CMD_BUFFER)))
      return;
   __trace_end_cmd_buffer(
        ut
      , enabled_traces
      , cs
      , cmd
   );
}

/*
 * start_secondary_cmd_buffer
 */
/*
 * Payload of the start_secondary_cmd_buffer tracepoint; this is the type the
 * Perfetto entry point below receives as `payload`.
 * NOTE(review): members are presumably filled in from `cmd` -- confirm.
 */
struct trace_start_secondary_cmd_buffer {
   VkCommandBuffer command_buffer_handle;
   uint8_t render_pass_continue;
};
#ifdef HAVE_PERFETTO
/* Perfetto entry point for start_secondary_cmd_buffer; declared only when
 * HAVE_PERFETTO is defined.  Implemented elsewhere. */
void tu_perfetto_start_secondary_cmd_buffer(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_start_secondary_cmd_buffer *payload,
   const void *indirect_data);
#endif
/* Out-of-line half of the tracepoint; the inline
 * trace_start_secondary_cmd_buffer() wrapper calls it once its enable checks
 * pass. */
void __trace_start_secondary_cmd_buffer(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
     , struct tu_cmd_buffer * cmd
);
static ALWAYS_INLINE void trace_start_secondary_cmd_buffer(
     struct u_trace *ut
   , void *cs
   , struct tu_cmd_buffer * cmd
) {
   enum u_trace_type enabled_traces = p_atomic_read_relaxed(&ut->utctx->enabled_traces);
   if (!unlikely(enabled_traces != 0 &&
                 (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_SECONDARY_CMD_BUFFER)))
      return;
   __trace_start_secondary_cmd_buffer(
        ut
      , enabled_traces
      , cs
      , cmd
   );
}

/*
 * end_secondary_cmd_buffer
 */
/*
 * Payload of the end_secondary_cmd_buffer tracepoint.  It carries no data:
 * the struct has no members when compiled as C and only a placeholder byte
 * when compiled as C++.
 */
struct trace_end_secondary_cmd_buffer {
#ifdef __cplusplus
   /* avoid warnings about empty struct size mis-match in C vs C++..
    * the size mis-match is harmless because (a) nothing will deref
    * the empty struct, and (b) the code that cares about allocating
    * sizeof(struct trace_end_secondary_cmd_buffer) (and wants this to be zero
    * if there is no payload) is C
    */
   uint8_t dummy;
#endif
};
#ifdef HAVE_PERFETTO
/* Perfetto entry point for end_secondary_cmd_buffer; declared only when
 * HAVE_PERFETTO is defined.  Implemented elsewhere. */
void tu_perfetto_end_secondary_cmd_buffer(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_end_secondary_cmd_buffer *payload,
   const void *indirect_data);
#endif
/* Out-of-line half of the tracepoint; the inline
 * trace_end_secondary_cmd_buffer() wrapper calls it once its enable checks
 * pass. */
void __trace_end_secondary_cmd_buffer(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
);
static ALWAYS_INLINE void trace_end_secondary_cmd_buffer(
     struct u_trace *ut
   , void *cs
) {
   enum u_trace_type enabled_traces = p_atomic_read_relaxed(&ut->utctx->enabled_traces);
   if (!unlikely(enabled_traces != 0 &&
                 (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_SECONDARY_CMD_BUFFER)))
      return;
   __trace_end_secondary_cmd_buffer(
        ut
      , enabled_traces
      , cs
   );
}

/*
 * start_render_pass
 */
/*
 * Payload of the start_render_pass tracepoint; this is the type the Perfetto
 * entry point below receives as `payload`.
 * NOTE(review): maxSamples, clearCPP, loadCPP, storeCPP, hasDepth and ubwc
 * have same-named parameters on __trace_start_render_pass(); the remaining
 * members are presumably derived from `cmd`, `fb` and `tiling` -- confirm.
 */
struct trace_start_render_pass {
   VkCommandBuffer command_buffer_handle;
   uint16_t width;
   uint16_t height;
   uint8_t attachment_count;
   uint16_t numberOfBins;
   uint16_t binWidth;
   uint16_t binHeight;
   uint8_t maxSamples;
   uint8_t clearCPP;
   uint8_t loadCPP;
   uint8_t storeCPP;
   bool hasDepth;
   /* 11 characters plus one extra byte, presumably for the terminating NUL */
   char ubwc[11 + 1];
};
#ifdef HAVE_PERFETTO
/* Perfetto entry point for start_render_pass; declared only when
 * HAVE_PERFETTO is defined.  Implemented elsewhere. */
void tu_perfetto_start_render_pass(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_start_render_pass *payload,
   const void *indirect_data);
#endif
/* Out-of-line half of the tracepoint; the inline trace_start_render_pass()
 * wrapper calls it once its enable checks pass. */
void __trace_start_render_pass(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
     , struct tu_cmd_buffer * cmd
     , const struct tu_framebuffer * fb
     , const struct tu_tiling_config * tiling
     , uint8_t maxSamples
     , uint8_t clearCPP
     , uint8_t loadCPP
     , uint8_t storeCPP
     , bool hasDepth
     , const char *ubwc
);
/*
 * Inline gate for the start_render_pass tracepoint.
 *
 * Forwards every argument unchanged to __trace_start_render_pass() only if
 * the context's enabled_traces value (read with p_atomic_read_relaxed()) is
 * non-zero and TU_GPU_TRACEPOINT_RENDER_PASS is set in tu_gpu_tracepoint.
 */
static ALWAYS_INLINE void
trace_start_render_pass(struct u_trace *ut,
                        void *cs,
                        struct tu_cmd_buffer *cmd,
                        const struct tu_framebuffer *fb,
                        const struct tu_tiling_config *tiling,
                        uint8_t maxSamples,
                        uint8_t clearCPP,
                        uint8_t loadCPP,
                        uint8_t storeCPP,
                        bool hasDepth,
                        const char *ubwc)
{
   const enum u_trace_type traces =
      p_atomic_read_relaxed(&ut->utctx->enabled_traces);

   /* Tracing is expected to be off most of the time. */
   if (unlikely(traces != 0 &&
                (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_RENDER_PASS))) {
      __trace_start_render_pass(ut, traces, cs, cmd, fb, tiling,
                                maxSamples, clearCPP, loadCPP, storeCPP,
                                hasDepth, ubwc);
   }
}

/*
 * end_render_pass
 */
/*
 * Payload of the end_render_pass tracepoint; this is the type the Perfetto
 * entry point below receives as `payload`.  Each member has a same-named
 * parameter on __trace_end_render_pass().
 */
struct trace_end_render_pass {
   bool tiledRender;
   const char * tilingDisableReason;
   uint32_t drawCount;
   uint32_t avgPerSampleBandwidth;
   bool lrz;
   const char * lrzDisableReason;
   int32_t lrzDisabledAtDraw;
   int32_t lrzWriteDisabledAtDraw;
};
#ifdef HAVE_PERFETTO
/* Perfetto entry point for end_render_pass; declared only when HAVE_PERFETTO
 * is defined.  Implemented elsewhere. */
void tu_perfetto_end_render_pass(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_end_render_pass *payload,
   const void *indirect_data);
#endif
/*
 * Out-of-line half of the tracepoint; the inline trace_end_render_pass()
 * wrapper calls it once its enable checks pass.
 * NOTE(review): `lrzStatus` has no matching member in
 * struct trace_end_render_pass; presumably it is delivered through the
 * `indirect_data` path instead -- confirm against the implementation.
 */
void __trace_end_render_pass(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
     , bool tiledRender
     , const char * tilingDisableReason
     , uint32_t drawCount
     , uint32_t avgPerSampleBandwidth
     , bool lrz
     , const char * lrzDisableReason
     , int32_t lrzDisabledAtDraw
     , int32_t lrzWriteDisabledAtDraw
     , struct u_trace_address lrzStatus
);
/*
 * Inline gate for the end_render_pass tracepoint.
 *
 * Forwards every argument unchanged to __trace_end_render_pass() only if the
 * context's enabled_traces value (read with p_atomic_read_relaxed()) is
 * non-zero and TU_GPU_TRACEPOINT_RENDER_PASS is set in tu_gpu_tracepoint.
 */
static ALWAYS_INLINE void
trace_end_render_pass(struct u_trace *ut,
                      void *cs,
                      bool tiledRender,
                      const char *tilingDisableReason,
                      uint32_t drawCount,
                      uint32_t avgPerSampleBandwidth,
                      bool lrz,
                      const char *lrzDisableReason,
                      int32_t lrzDisabledAtDraw,
                      int32_t lrzWriteDisabledAtDraw,
                      struct u_trace_address lrzStatus)
{
   const enum u_trace_type traces =
      p_atomic_read_relaxed(&ut->utctx->enabled_traces);

   /* Tracing is expected to be off most of the time. */
   if (unlikely(traces != 0 &&
                (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_RENDER_PASS))) {
      __trace_end_render_pass(ut, traces, cs, tiledRender,
                              tilingDisableReason, drawCount,
                              avgPerSampleBandwidth, lrz, lrzDisableReason,
                              lrzDisabledAtDraw, lrzWriteDisabledAtDraw,
                              lrzStatus);
   }
}

/*
 * start_draw
 */
/*
 * Payload of the start_draw tracepoint; this is the type the Perfetto entry
 * point below receives as `payload`.  The five *_sha1 members are
 * tu_sha1_str character arrays stored inline in the struct.
 */
struct trace_start_draw {
   VkCommandBuffer command_buffer_handle;
   uint32_t count;
   tu_sha1_str vs_sha1;
   tu_sha1_str tcs_sha1;
   tu_sha1_str tes_sha1;
   tu_sha1_str gs_sha1;
   tu_sha1_str fs_sha1;
};
#ifdef HAVE_PERFETTO
/* Perfetto entry point for start_draw; declared only when HAVE_PERFETTO is
 * defined.  Implemented elsewhere. */
void tu_perfetto_start_draw(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_start_draw *payload,
   const void *indirect_data);
#endif
/*
 * Out-of-line half of the tracepoint; the inline trace_start_draw() wrapper
 * calls it once its enable checks pass.  tu_sha1_str is an array type, so
 * the *_sha1 parameters are adjusted to `char *` here.
 */
void __trace_start_draw(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
     , struct tu_cmd_buffer * cmd
     , uint32_t count
     , tu_sha1_str vs_sha1
     , tu_sha1_str tcs_sha1
     , tu_sha1_str tes_sha1
     , tu_sha1_str gs_sha1
     , tu_sha1_str fs_sha1
);
/*
 * Inline gate for the start_draw tracepoint.
 *
 * Forwards every argument unchanged to __trace_start_draw() only if the
 * context's enabled_traces value (read with p_atomic_read_relaxed()) is
 * non-zero and TU_GPU_TRACEPOINT_DRAW is set in tu_gpu_tracepoint.
 */
static ALWAYS_INLINE void
trace_start_draw(struct u_trace *ut,
                 void *cs,
                 struct tu_cmd_buffer *cmd,
                 uint32_t count,
                 tu_sha1_str vs_sha1,
                 tu_sha1_str tcs_sha1,
                 tu_sha1_str tes_sha1,
                 tu_sha1_str gs_sha1,
                 tu_sha1_str fs_sha1)
{
   const enum u_trace_type traces =
      p_atomic_read_relaxed(&ut->utctx->enabled_traces);

   /* Tracing is expected to be off most of the time. */
   if (unlikely(traces != 0 &&
                (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_DRAW))) {
      __trace_start_draw(ut, traces, cs, cmd, count,
                         vs_sha1, tcs_sha1, tes_sha1, gs_sha1, fs_sha1);
   }
}

/*
 * end_draw
 */
/*
 * Payload of the end_draw tracepoint.  It carries no data: the struct has no
 * members when compiled as C and only a placeholder byte when compiled as
 * C++.
 */
struct trace_end_draw {
#ifdef __cplusplus
   /* avoid warnings about empty struct size mis-match in C vs C++..
    * the size mis-match is harmless because (a) nothing will deref
    * the empty struct, and (b) the code that cares about allocating
    * sizeof(struct trace_end_draw) (and wants this to be zero
    * if there is no payload) is C
    */
   uint8_t dummy;
#endif
};
#ifdef HAVE_PERFETTO
/* Perfetto entry point for end_draw; declared only when HAVE_PERFETTO is
 * defined.  Implemented elsewhere. */
void tu_perfetto_end_draw(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_end_draw *payload,
   const void *indirect_data);
#endif
/* Out-of-line half of the tracepoint; the inline trace_end_draw() wrapper
 * calls it once its enable checks pass. */
void __trace_end_draw(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
);
static ALWAYS_INLINE void trace_end_draw(
     struct u_trace *ut
   , void *cs
) {
   enum u_trace_type enabled_traces = p_atomic_read_relaxed(&ut->utctx->enabled_traces);
   if (!unlikely(enabled_traces != 0 &&
                 (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_DRAW)))
      return;
   __trace_end_draw(
        ut
      , enabled_traces
      , cs
   );
}

/*
 * start_binning_ib
 */
/*
 * Payload of the start_binning_ib tracepoint; this is the type the Perfetto
 * entry point below receives as `payload`.
 * NOTE(review): the handle is presumably taken from `cmd` -- confirm.
 */
struct trace_start_binning_ib {
   VkCommandBuffer command_buffer_handle;
};
#ifdef HAVE_PERFETTO
/* Perfetto entry point for start_binning_ib; declared only when
 * HAVE_PERFETTO is defined.  Implemented elsewhere. */
void tu_perfetto_start_binning_ib(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_start_binning_ib *payload,
   const void *indirect_data);
#endif
/* Out-of-line half of the tracepoint; the inline trace_start_binning_ib()
 * wrapper calls it once its enable checks pass. */
void __trace_start_binning_ib(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
     , struct tu_cmd_buffer * cmd
);
static ALWAYS_INLINE void trace_start_binning_ib(
     struct u_trace *ut
   , void *cs
   , struct tu_cmd_buffer * cmd
) {
   enum u_trace_type enabled_traces = p_atomic_read_relaxed(&ut->utctx->enabled_traces);
   if (!unlikely(enabled_traces != 0 &&
                 (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_BINNING_IB)))
      return;
   __trace_start_binning_ib(
        ut
      , enabled_traces
      , cs
      , cmd
   );
}

/*
 * end_binning_ib
 */
/*
 * Payload of the end_binning_ib tracepoint.  It carries no data: the struct
 * has no members when compiled as C and only a placeholder byte when
 * compiled as C++.
 */
struct trace_end_binning_ib {
#ifdef __cplusplus
   /* avoid warnings about empty struct size mis-match in C vs C++..
    * the size mis-match is harmless because (a) nothing will deref
    * the empty struct, and (b) the code that cares about allocating
    * sizeof(struct trace_end_binning_ib) (and wants this to be zero
    * if there is no payload) is C
    */
   uint8_t dummy;
#endif
};
#ifdef HAVE_PERFETTO
/* Perfetto entry point for end_binning_ib; declared only when HAVE_PERFETTO
 * is defined.  Implemented elsewhere. */
void tu_perfetto_end_binning_ib(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_end_binning_ib *payload,
   const void *indirect_data);
#endif
/* Out-of-line half of the tracepoint; the inline trace_end_binning_ib()
 * wrapper calls it once its enable checks pass. */
void __trace_end_binning_ib(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
);
static ALWAYS_INLINE void trace_end_binning_ib(
     struct u_trace *ut
   , void *cs
) {
   enum u_trace_type enabled_traces = p_atomic_read_relaxed(&ut->utctx->enabled_traces);
   if (!unlikely(enabled_traces != 0 &&
                 (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_BINNING_IB)))
      return;
   __trace_end_binning_ib(
        ut
      , enabled_traces
      , cs
   );
}

/*
 * start_draw_ib_sysmem
 */
/*
 * Payload of the start_draw_ib_sysmem tracepoint; this is the type the
 * Perfetto entry point below receives as `payload`.
 * NOTE(review): the handle is presumably taken from `cmd` -- confirm.
 */
struct trace_start_draw_ib_sysmem {
   VkCommandBuffer command_buffer_handle;
};
#ifdef HAVE_PERFETTO
/* Perfetto entry point for start_draw_ib_sysmem; declared only when
 * HAVE_PERFETTO is defined.  Implemented elsewhere. */
void tu_perfetto_start_draw_ib_sysmem(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_start_draw_ib_sysmem *payload,
   const void *indirect_data);
#endif
/* Out-of-line half of the tracepoint; the inline
 * trace_start_draw_ib_sysmem() wrapper calls it once its enable checks pass. */
void __trace_start_draw_ib_sysmem(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
     , struct tu_cmd_buffer * cmd
);
static ALWAYS_INLINE void trace_start_draw_ib_sysmem(
     struct u_trace *ut
   , void *cs
   , struct tu_cmd_buffer * cmd
) {
   enum u_trace_type enabled_traces = p_atomic_read_relaxed(&ut->utctx->enabled_traces);
   if (!unlikely(enabled_traces != 0 &&
                 (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_DRAW_IB_SYSMEM)))
      return;
   __trace_start_draw_ib_sysmem(
        ut
      , enabled_traces
      , cs
      , cmd
   );
}

/*
 * end_draw_ib_sysmem
 */
/*
 * Payload of the end_draw_ib_sysmem tracepoint.  It carries no data: the
 * struct has no members when compiled as C and only a placeholder byte when
 * compiled as C++.
 */
struct trace_end_draw_ib_sysmem {
#ifdef __cplusplus
   /* avoid warnings about empty struct size mis-match in C vs C++..
    * the size mis-match is harmless because (a) nothing will deref
    * the empty struct, and (b) the code that cares about allocating
    * sizeof(struct trace_end_draw_ib_sysmem) (and wants this to be zero
    * if there is no payload) is C
    */
   uint8_t dummy;
#endif
};
#ifdef HAVE_PERFETTO
/* Perfetto entry point for end_draw_ib_sysmem; declared only when
 * HAVE_PERFETTO is defined.  Implemented elsewhere. */
void tu_perfetto_end_draw_ib_sysmem(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_end_draw_ib_sysmem *payload,
   const void *indirect_data);
#endif
/* Out-of-line half of the tracepoint; the inline trace_end_draw_ib_sysmem()
 * wrapper calls it once its enable checks pass. */
void __trace_end_draw_ib_sysmem(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
);
static ALWAYS_INLINE void trace_end_draw_ib_sysmem(
     struct u_trace *ut
   , void *cs
) {
   enum u_trace_type enabled_traces = p_atomic_read_relaxed(&ut->utctx->enabled_traces);
   if (!unlikely(enabled_traces != 0 &&
                 (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_DRAW_IB_SYSMEM)))
      return;
   __trace_end_draw_ib_sysmem(
        ut
      , enabled_traces
      , cs
   );
}

/*
 * start_draw_ib_gmem
 */
/*
 * Payload of the start_draw_ib_gmem tracepoint; this is the type the
 * Perfetto entry point below receives as `payload`.
 * NOTE(review): the handle is presumably taken from `cmd` -- confirm.
 */
struct trace_start_draw_ib_gmem {
   VkCommandBuffer command_buffer_handle;
};
#ifdef HAVE_PERFETTO
/* Perfetto entry point for start_draw_ib_gmem; declared only when
 * HAVE_PERFETTO is defined.  Implemented elsewhere. */
void tu_perfetto_start_draw_ib_gmem(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_start_draw_ib_gmem *payload,
   const void *indirect_data);
#endif
/* Out-of-line half of the tracepoint; the inline trace_start_draw_ib_gmem()
 * wrapper calls it once its enable checks pass. */
void __trace_start_draw_ib_gmem(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
     , struct tu_cmd_buffer * cmd
);
static ALWAYS_INLINE void trace_start_draw_ib_gmem(
     struct u_trace *ut
   , void *cs
   , struct tu_cmd_buffer * cmd
) {
   enum u_trace_type enabled_traces = p_atomic_read_relaxed(&ut->utctx->enabled_traces);
   if (!unlikely(enabled_traces != 0 &&
                 (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_DRAW_IB_GMEM)))
      return;
   __trace_start_draw_ib_gmem(
        ut
      , enabled_traces
      , cs
      , cmd
   );
}

/*
 * end_draw_ib_gmem
 */
/*
 * Payload of the end_draw_ib_gmem tracepoint.  It carries no data: the
 * struct has no members when compiled as C and only a placeholder byte when
 * compiled as C++.
 */
struct trace_end_draw_ib_gmem {
#ifdef __cplusplus
   /* avoid warnings about empty struct size mis-match in C vs C++..
    * the size mis-match is harmless because (a) nothing will deref
    * the empty struct, and (b) the code that cares about allocating
    * sizeof(struct trace_end_draw_ib_gmem) (and wants this to be zero
    * if there is no payload) is C
    */
   uint8_t dummy;
#endif
};
#ifdef HAVE_PERFETTO
/* Perfetto entry point for end_draw_ib_gmem; declared only when
 * HAVE_PERFETTO is defined.  Implemented elsewhere. */
void tu_perfetto_end_draw_ib_gmem(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_end_draw_ib_gmem *payload,
   const void *indirect_data);
#endif
/* Out-of-line half of the tracepoint; the inline trace_end_draw_ib_gmem()
 * wrapper calls it once its enable checks pass. */
void __trace_end_draw_ib_gmem(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
);
static ALWAYS_INLINE void trace_end_draw_ib_gmem(
     struct u_trace *ut
   , void *cs
) {
   enum u_trace_type enabled_traces = p_atomic_read_relaxed(&ut->utctx->enabled_traces);
   if (!unlikely(enabled_traces != 0 &&
                 (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_DRAW_IB_GMEM)))
      return;
   __trace_end_draw_ib_gmem(
        ut
      , enabled_traces
      , cs
   );
}

/*
 * start_generic_clear
 */
/*
 * Payload of the start_generic_clear tracepoint; this is the type the
 * Perfetto entry point below receives as `payload`.  format, ubwc and
 * samples have same-named parameters on __trace_start_generic_clear().
 * NOTE(review): the handle is presumably taken from `cmd` -- confirm.
 */
struct trace_start_generic_clear {
   VkCommandBuffer command_buffer_handle;
   enum VkFormat format;
   bool ubwc;
   uint8_t samples;
};
#ifdef HAVE_PERFETTO
/* Perfetto entry point for start_generic_clear; declared only when
 * HAVE_PERFETTO is defined.  Implemented elsewhere. */
void tu_perfetto_start_generic_clear(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_start_generic_clear *payload,
   const void *indirect_data);
#endif
/* Out-of-line half of the tracepoint; the inline trace_start_generic_clear()
 * wrapper calls it once its enable checks pass. */
void __trace_start_generic_clear(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
     , struct tu_cmd_buffer * cmd
     , enum VkFormat format
     , bool ubwc
     , uint8_t samples
);
/*
 * Inline gate for the start_generic_clear tracepoint.
 *
 * Forwards every argument unchanged to __trace_start_generic_clear() only if
 * the context's enabled_traces value (read with p_atomic_read_relaxed()) is
 * non-zero and TU_GPU_TRACEPOINT_GENERIC_CLEAR is set in tu_gpu_tracepoint.
 */
static ALWAYS_INLINE void
trace_start_generic_clear(struct u_trace *ut,
                          void *cs,
                          struct tu_cmd_buffer *cmd,
                          enum VkFormat format,
                          bool ubwc,
                          uint8_t samples)
{
   const enum u_trace_type traces =
      p_atomic_read_relaxed(&ut->utctx->enabled_traces);

   /* Tracing is expected to be off most of the time. */
   if (unlikely(traces != 0 &&
                (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_GENERIC_CLEAR))) {
      __trace_start_generic_clear(ut, traces, cs, cmd, format, ubwc, samples);
   }
}

/*
 * end_generic_clear
 */
/*
 * Payload of the end_generic_clear tracepoint.  It carries no data: the
 * struct has no members when compiled as C and only a placeholder byte when
 * compiled as C++.
 */
struct trace_end_generic_clear {
#ifdef __cplusplus
   /* avoid warnings about empty struct size mis-match in C vs C++..
    * the size mis-match is harmless because (a) nothing will deref
    * the empty struct, and (b) the code that cares about allocating
    * sizeof(struct trace_end_generic_clear) (and wants this to be zero
    * if there is no payload) is C
    */
   uint8_t dummy;
#endif
};
#ifdef HAVE_PERFETTO
/* Perfetto entry point for end_generic_clear; declared only when
 * HAVE_PERFETTO is defined.  Implemented elsewhere. */
void tu_perfetto_end_generic_clear(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_end_generic_clear *payload,
   const void *indirect_data);
#endif
/* Out-of-line half of the tracepoint; the inline trace_end_generic_clear()
 * wrapper calls it once its enable checks pass. */
void __trace_end_generic_clear(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
);
static ALWAYS_INLINE void trace_end_generic_clear(
     struct u_trace *ut
   , void *cs
) {
   enum u_trace_type enabled_traces = p_atomic_read_relaxed(&ut->utctx->enabled_traces);
   if (!unlikely(enabled_traces != 0 &&
                 (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_GENERIC_CLEAR)))
      return;
   __trace_end_generic_clear(
        ut
      , enabled_traces
      , cs
   );
}

/*
 * start_gmem_clear
 */
/*
 * Payload of the start_gmem_clear tracepoint; this is the type the Perfetto
 * entry point below receives as `payload`.  format and samples have
 * same-named parameters on __trace_start_gmem_clear().
 * NOTE(review): the handle is presumably taken from `cmd` -- confirm.
 */
struct trace_start_gmem_clear {
   VkCommandBuffer command_buffer_handle;
   enum VkFormat format;
   uint8_t samples;
};
#ifdef HAVE_PERFETTO
/* Perfetto entry point for start_gmem_clear; declared only when
 * HAVE_PERFETTO is defined.  Implemented elsewhere. */
void tu_perfetto_start_gmem_clear(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_start_gmem_clear *payload,
   const void *indirect_data);
#endif
/* Out-of-line half of the tracepoint; the inline trace_start_gmem_clear()
 * wrapper calls it once its enable checks pass. */
void __trace_start_gmem_clear(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
     , struct tu_cmd_buffer * cmd
     , enum VkFormat format
     , uint8_t samples
);
/*
 * Inline gate for the start_gmem_clear tracepoint.
 *
 * Forwards every argument unchanged to __trace_start_gmem_clear() only if
 * the context's enabled_traces value (read with p_atomic_read_relaxed()) is
 * non-zero and TU_GPU_TRACEPOINT_GMEM_CLEAR is set in tu_gpu_tracepoint.
 */
static ALWAYS_INLINE void
trace_start_gmem_clear(struct u_trace *ut,
                       void *cs,
                       struct tu_cmd_buffer *cmd,
                       enum VkFormat format,
                       uint8_t samples)
{
   const enum u_trace_type traces =
      p_atomic_read_relaxed(&ut->utctx->enabled_traces);

   /* Tracing is expected to be off most of the time. */
   if (unlikely(traces != 0 &&
                (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_GMEM_CLEAR))) {
      __trace_start_gmem_clear(ut, traces, cs, cmd, format, samples);
   }
}

/*
 * end_gmem_clear
 */
/*
 * Payload of the end_gmem_clear tracepoint.  It carries no data: the struct
 * has no members when compiled as C and only a placeholder byte when
 * compiled as C++.
 */
struct trace_end_gmem_clear {
#ifdef __cplusplus
   /* avoid warnings about empty struct size mis-match in C vs C++..
    * the size mis-match is harmless because (a) nothing will deref
    * the empty struct, and (b) the code that cares about allocating
    * sizeof(struct trace_end_gmem_clear) (and wants this to be zero
    * if there is no payload) is C
    */
   uint8_t dummy;
#endif
};
#ifdef HAVE_PERFETTO
/* Perfetto entry point for end_gmem_clear; declared only when HAVE_PERFETTO
 * is defined.  Implemented elsewhere. */
void tu_perfetto_end_gmem_clear(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_end_gmem_clear *payload,
   const void *indirect_data);
#endif
/* Out-of-line half of the tracepoint; the inline trace_end_gmem_clear()
 * wrapper calls it once its enable checks pass. */
void __trace_end_gmem_clear(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
);
static ALWAYS_INLINE void trace_end_gmem_clear(
     struct u_trace *ut
   , void *cs
) {
   enum u_trace_type enabled_traces = p_atomic_read_relaxed(&ut->utctx->enabled_traces);
   if (!unlikely(enabled_traces != 0 &&
                 (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_GMEM_CLEAR)))
      return;
   __trace_end_gmem_clear(
        ut
      , enabled_traces
      , cs
   );
}

/*
 * start_sysmem_clear
 */
/*
 * Payload of the start_sysmem_clear tracepoint; this is the type the
 * Perfetto entry point below receives as `payload`.  format, uses_3d_ops and
 * samples have same-named parameters on __trace_start_sysmem_clear().
 * NOTE(review): the handle is presumably taken from `cmd` -- confirm.
 */
struct trace_start_sysmem_clear {
   VkCommandBuffer command_buffer_handle;
   enum VkFormat format;
   uint8_t uses_3d_ops;
   uint8_t samples;
};
#ifdef HAVE_PERFETTO
/* Perfetto entry point for start_sysmem_clear; declared only when
 * HAVE_PERFETTO is defined.  Implemented elsewhere. */
void tu_perfetto_start_sysmem_clear(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_start_sysmem_clear *payload,
   const void *indirect_data);
#endif
/* Out-of-line half of the tracepoint; the inline trace_start_sysmem_clear()
 * wrapper calls it once its enable checks pass. */
void __trace_start_sysmem_clear(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
     , struct tu_cmd_buffer * cmd
     , enum VkFormat format
     , uint8_t uses_3d_ops
     , uint8_t samples
);
/*
 * Inline gate for the start_sysmem_clear tracepoint.
 *
 * Forwards every argument unchanged to __trace_start_sysmem_clear() only if
 * the context's enabled_traces value (read with p_atomic_read_relaxed()) is
 * non-zero and TU_GPU_TRACEPOINT_SYSMEM_CLEAR is set in tu_gpu_tracepoint.
 */
static ALWAYS_INLINE void
trace_start_sysmem_clear(struct u_trace *ut,
                         void *cs,
                         struct tu_cmd_buffer *cmd,
                         enum VkFormat format,
                         uint8_t uses_3d_ops,
                         uint8_t samples)
{
   const enum u_trace_type traces =
      p_atomic_read_relaxed(&ut->utctx->enabled_traces);

   /* Tracing is expected to be off most of the time. */
   if (unlikely(traces != 0 &&
                (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_SYSMEM_CLEAR))) {
      __trace_start_sysmem_clear(ut, traces, cs, cmd, format, uses_3d_ops,
                                 samples);
   }
}

/*
 * end_sysmem_clear
 */
/*
 * Payload of the end_sysmem_clear tracepoint.  It carries no data: the
 * struct has no members when compiled as C and only a placeholder byte when
 * compiled as C++.
 */
struct trace_end_sysmem_clear {
#ifdef __cplusplus
   /* avoid warnings about empty struct size mis-match in C vs C++..
    * the size mis-match is harmless because (a) nothing will deref
    * the empty struct, and (b) the code that cares about allocating
    * sizeof(struct trace_end_sysmem_clear) (and wants this to be zero
    * if there is no payload) is C
    */
   uint8_t dummy;
#endif
};
#ifdef HAVE_PERFETTO
/* Perfetto entry point for end_sysmem_clear; declared only when
 * HAVE_PERFETTO is defined.  Implemented elsewhere. */
void tu_perfetto_end_sysmem_clear(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_end_sysmem_clear *payload,
   const void *indirect_data);
#endif
/* Out-of-line half of the tracepoint; the inline trace_end_sysmem_clear()
 * wrapper calls it once its enable checks pass. */
void __trace_end_sysmem_clear(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
);
static ALWAYS_INLINE void trace_end_sysmem_clear(
     struct u_trace *ut
   , void *cs
) {
   enum u_trace_type enabled_traces = p_atomic_read_relaxed(&ut->utctx->enabled_traces);
   if (!unlikely(enabled_traces != 0 &&
                 (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_SYSMEM_CLEAR)))
      return;
   __trace_end_sysmem_clear(
        ut
      , enabled_traces
      , cs
   );
}

/*
 * start_sysmem_clear_all
 */
/*
 * Payload of the start_sysmem_clear_all tracepoint; this is the type the
 * Perfetto entry point below receives as `payload`.  mrt_count and
 * rect_count have same-named parameters on __trace_start_sysmem_clear_all().
 * NOTE(review): the handle is presumably taken from `cmd` -- confirm.
 */
struct trace_start_sysmem_clear_all {
   VkCommandBuffer command_buffer_handle;
   uint8_t mrt_count;
   uint8_t rect_count;
};
#ifdef HAVE_PERFETTO
/* Perfetto entry point for start_sysmem_clear_all; declared only when
 * HAVE_PERFETTO is defined.  Implemented elsewhere. */
void tu_perfetto_start_sysmem_clear_all(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_start_sysmem_clear_all *payload,
   const void *indirect_data);
#endif
/* Out-of-line half of the tracepoint; the inline
 * trace_start_sysmem_clear_all() wrapper calls it once its enable checks
 * pass. */
void __trace_start_sysmem_clear_all(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
     , struct tu_cmd_buffer * cmd
     , uint8_t mrt_count
     , uint8_t rect_count
);
/*
 * Inline gate for the start_sysmem_clear_all tracepoint.
 *
 * Forwards every argument unchanged to __trace_start_sysmem_clear_all() only
 * if the context's enabled_traces value (read with p_atomic_read_relaxed())
 * is non-zero and TU_GPU_TRACEPOINT_SYSMEM_CLEAR_ALL is set in
 * tu_gpu_tracepoint.
 */
static ALWAYS_INLINE void
trace_start_sysmem_clear_all(struct u_trace *ut,
                             void *cs,
                             struct tu_cmd_buffer *cmd,
                             uint8_t mrt_count,
                             uint8_t rect_count)
{
   const enum u_trace_type traces =
      p_atomic_read_relaxed(&ut->utctx->enabled_traces);

   /* Tracing is expected to be off most of the time. */
   if (unlikely(traces != 0 &&
                (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_SYSMEM_CLEAR_ALL))) {
      __trace_start_sysmem_clear_all(ut, traces, cs, cmd, mrt_count,
                                     rect_count);
   }
}

/*
 * end_sysmem_clear_all
 */
/*
 * Payload of the end_sysmem_clear_all tracepoint.  It carries no data: the
 * struct has no members when compiled as C and only a placeholder byte when
 * compiled as C++.
 */
struct trace_end_sysmem_clear_all {
#ifdef __cplusplus
   /* avoid warnings about empty struct size mis-match in C vs C++..
    * the size mis-match is harmless because (a) nothing will deref
    * the empty struct, and (b) the code that cares about allocating
    * sizeof(struct trace_end_sysmem_clear_all) (and wants this to be zero
    * if there is no payload) is C
    */
   uint8_t dummy;
#endif
};
#ifdef HAVE_PERFETTO
/* Perfetto entry point for end_sysmem_clear_all; declared only when
 * HAVE_PERFETTO is defined.  Implemented elsewhere. */
void tu_perfetto_end_sysmem_clear_all(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_end_sysmem_clear_all *payload,
   const void *indirect_data);
#endif
/* Out-of-line half of the tracepoint; the inline
 * trace_end_sysmem_clear_all() wrapper calls it once its enable checks pass. */
void __trace_end_sysmem_clear_all(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
);
static ALWAYS_INLINE void trace_end_sysmem_clear_all(
     struct u_trace *ut
   , void *cs
) {
   enum u_trace_type enabled_traces = p_atomic_read_relaxed(&ut->utctx->enabled_traces);
   if (!unlikely(enabled_traces != 0 &&
                 (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_SYSMEM_CLEAR_ALL)))
      return;
   __trace_end_sysmem_clear_all(
        ut
      , enabled_traces
      , cs
   );
}

/*
 * start_gmem_load
 */
/*
 * Payload of the start_gmem_load tracepoint; this is the type the Perfetto
 * entry point below receives as `payload`.  format and force_load have
 * same-named parameters on __trace_start_gmem_load().
 * NOTE(review): the handle is presumably taken from `cmd` -- confirm.
 */
struct trace_start_gmem_load {
   VkCommandBuffer command_buffer_handle;
   enum VkFormat format;
   uint8_t force_load;
};
#ifdef HAVE_PERFETTO
/* Perfetto entry point for start_gmem_load; declared only when HAVE_PERFETTO
 * is defined.  Implemented elsewhere. */
void tu_perfetto_start_gmem_load(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_start_gmem_load *payload,
   const void *indirect_data);
#endif
/* Out-of-line half of the tracepoint; the inline trace_start_gmem_load()
 * wrapper calls it once its enable checks pass. */
void __trace_start_gmem_load(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
     , struct tu_cmd_buffer * cmd
     , enum VkFormat format
     , uint8_t force_load
);
/*
 * Inline gate for the start_gmem_load tracepoint.
 *
 * Forwards every argument unchanged to __trace_start_gmem_load() only if the
 * context's enabled_traces value (read with p_atomic_read_relaxed()) is
 * non-zero and TU_GPU_TRACEPOINT_GMEM_LOAD is set in tu_gpu_tracepoint.
 */
static ALWAYS_INLINE void
trace_start_gmem_load(struct u_trace *ut,
                      void *cs,
                      struct tu_cmd_buffer *cmd,
                      enum VkFormat format,
                      uint8_t force_load)
{
   const enum u_trace_type traces =
      p_atomic_read_relaxed(&ut->utctx->enabled_traces);

   /* Tracing is expected to be off most of the time. */
   if (unlikely(traces != 0 &&
                (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_GMEM_LOAD))) {
      __trace_start_gmem_load(ut, traces, cs, cmd, format, force_load);
   }
}

/*
 * end_gmem_load
 */
/*
 * Payload of the end_gmem_load tracepoint.  It carries no data: the struct
 * has no members when compiled as C and only a placeholder byte when
 * compiled as C++.
 */
struct trace_end_gmem_load {
#ifdef __cplusplus
   /* avoid warnings about empty struct size mis-match in C vs C++..
    * the size mis-match is harmless because (a) nothing will deref
    * the empty struct, and (b) the code that cares about allocating
    * sizeof(struct trace_end_gmem_load) (and wants this to be zero
    * if there is no payload) is C
    */
   uint8_t dummy;
#endif
};
#ifdef HAVE_PERFETTO
/* Perfetto entry point for end_gmem_load; declared only when HAVE_PERFETTO
 * is defined.  Implemented elsewhere. */
void tu_perfetto_end_gmem_load(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_end_gmem_load *payload,
   const void *indirect_data);
#endif
/* Out-of-line half of the tracepoint; the inline trace_end_gmem_load()
 * wrapper calls it once its enable checks pass. */
void __trace_end_gmem_load(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
);
static ALWAYS_INLINE void trace_end_gmem_load(
     struct u_trace *ut
   , void *cs
) {
   enum u_trace_type enabled_traces = p_atomic_read_relaxed(&ut->utctx->enabled_traces);
   if (!unlikely(enabled_traces != 0 &&
                 (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_GMEM_LOAD)))
      return;
   __trace_end_gmem_load(
        ut
      , enabled_traces
      , cs
   );
}

/*
 * start_gmem_store
 */
/*
 * Payload of the start_gmem_store tracepoint; this is the type the Perfetto
 * entry point below receives as `payload`.  format, fast_path and unaligned
 * have same-named parameters on __trace_start_gmem_store().
 * NOTE(review): the handle is presumably taken from `cmd` -- confirm.
 */
struct trace_start_gmem_store {
   VkCommandBuffer command_buffer_handle;
   enum VkFormat format;
   uint8_t fast_path;
   uint8_t unaligned;
};
#ifdef HAVE_PERFETTO
/* Perfetto entry point for start_gmem_store; declared only when
 * HAVE_PERFETTO is defined.  Implemented elsewhere. */
void tu_perfetto_start_gmem_store(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_start_gmem_store *payload,
   const void *indirect_data);
#endif
/* Out-of-line half of the tracepoint; the inline trace_start_gmem_store()
 * wrapper calls it once its enable checks pass. */
void __trace_start_gmem_store(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
     , struct tu_cmd_buffer * cmd
     , enum VkFormat format
     , uint8_t fast_path
     , uint8_t unaligned
);
/*
 * Inline gate for the start_gmem_store tracepoint.
 *
 * Forwards every argument unchanged to __trace_start_gmem_store() only if
 * the context's enabled_traces value (read with p_atomic_read_relaxed()) is
 * non-zero and TU_GPU_TRACEPOINT_GMEM_STORE is set in tu_gpu_tracepoint.
 */
static ALWAYS_INLINE void
trace_start_gmem_store(struct u_trace *ut,
                       void *cs,
                       struct tu_cmd_buffer *cmd,
                       enum VkFormat format,
                       uint8_t fast_path,
                       uint8_t unaligned)
{
   const enum u_trace_type traces =
      p_atomic_read_relaxed(&ut->utctx->enabled_traces);

   /* Tracing is expected to be off most of the time. */
   if (unlikely(traces != 0 &&
                (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_GMEM_STORE))) {
      __trace_start_gmem_store(ut, traces, cs, cmd, format, fast_path,
                               unaligned);
   }
}

/*
 * end_gmem_store
 */
/* Payload type for the end_gmem_store tracepoint.  It carries no data: the
 * struct is empty in C and holds a single placeholder byte in C++.
 */
struct trace_end_gmem_store {
#ifdef __cplusplus
   /* avoid warnings about empty struct size mis-match in C vs C++..
    * the size mis-match is harmless because (a) nothing will deref
    * the empty struct, and (b) the code that cares about allocating
    * sizeof(struct trace_end_gmem_store) (and wants this to be zero
    * if there is no payload) is C
    */
   uint8_t dummy;
#endif
};
#ifdef HAVE_PERFETTO
/* Perfetto hook for end_gmem_store.  Declared only when HAVE_PERFETTO is
 * defined; the definition lives outside this header.  ts_ns is presumably
 * the event timestamp in nanoseconds -- confirm against the implementation.
 */
void tu_perfetto_end_gmem_store(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_end_gmem_store *payload,
   const void *indirect_data);
#endif
/* Out-of-line emitter for end_gmem_store (declaration only).  The inline
 * trace_end_gmem_store() wrapper calls it with the enabled_traces mask it
 * read from ut->utctx.
 */
void __trace_end_gmem_store(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
);
static ALWAYS_INLINE void trace_end_gmem_store(
     struct u_trace *ut
   , void *cs
) {
   enum u_trace_type enabled_traces = p_atomic_read_relaxed(&ut->utctx->enabled_traces);
   if (!unlikely(enabled_traces != 0 &&
                 (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_GMEM_STORE)))
      return;
   __trace_end_gmem_store(
        ut
      , enabled_traces
      , cs
   );
}

/*
 * start_sysmem_resolve
 */
/* Payload recorded for the start_sysmem_resolve tracepoint; handed by const
 * pointer to tu_perfetto_start_sysmem_resolve().
 */
struct trace_start_sysmem_resolve {
   /* Presumably the Vulkan handle of the emitter's `cmd` argument --
    * confirm in the emitter implementation.
    */
   VkCommandBuffer command_buffer_handle;
   /* Shares its name and type with the emitter's `format` argument. */
   enum VkFormat format;
};
#ifdef HAVE_PERFETTO
/* Perfetto hook for start_sysmem_resolve.  Declared only when HAVE_PERFETTO
 * is defined; the definition lives outside this header.  ts_ns is presumably
 * the event timestamp in nanoseconds -- confirm against the implementation.
 */
void tu_perfetto_start_sysmem_resolve(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_start_sysmem_resolve *payload,
   const void *indirect_data);
#endif
/* Out-of-line emitter for start_sysmem_resolve (declaration only).  The
 * inline trace_start_sysmem_resolve() wrapper calls it with the
 * enabled_traces mask it read from ut->utctx.
 */
void __trace_start_sysmem_resolve(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
     , struct tu_cmd_buffer * cmd
     , enum VkFormat format
);
static ALWAYS_INLINE void trace_start_sysmem_resolve(
     struct u_trace *ut
   , void *cs
   , struct tu_cmd_buffer * cmd
   , enum VkFormat format
) {
   enum u_trace_type enabled_traces = p_atomic_read_relaxed(&ut->utctx->enabled_traces);
   if (!unlikely(enabled_traces != 0 &&
                 (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_SYSMEM_RESOLVE)))
      return;
   __trace_start_sysmem_resolve(
        ut
      , enabled_traces
      , cs
      , cmd
      , format
   );
}

/*
 * end_sysmem_resolve
 */
/* Payload type for the end_sysmem_resolve tracepoint.  It carries no data:
 * the struct is empty in C and holds a single placeholder byte in C++.
 */
struct trace_end_sysmem_resolve {
#ifdef __cplusplus
   /* avoid warnings about empty struct size mis-match in C vs C++..
    * the size mis-match is harmless because (a) nothing will deref
    * the empty struct, and (b) the code that cares about allocating
    * sizeof(struct trace_end_sysmem_resolve) (and wants this to be zero
    * if there is no payload) is C
    */
   uint8_t dummy;
#endif
};
#ifdef HAVE_PERFETTO
/* Perfetto hook for end_sysmem_resolve.  Declared only when HAVE_PERFETTO is
 * defined; the definition lives outside this header.  ts_ns is presumably
 * the event timestamp in nanoseconds -- confirm against the implementation.
 */
void tu_perfetto_end_sysmem_resolve(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_end_sysmem_resolve *payload,
   const void *indirect_data);
#endif
/* Out-of-line emitter for end_sysmem_resolve (declaration only).  The inline
 * trace_end_sysmem_resolve() wrapper calls it with the enabled_traces mask
 * it read from ut->utctx.
 */
void __trace_end_sysmem_resolve(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
);
static ALWAYS_INLINE void trace_end_sysmem_resolve(
     struct u_trace *ut
   , void *cs
) {
   enum u_trace_type enabled_traces = p_atomic_read_relaxed(&ut->utctx->enabled_traces);
   if (!unlikely(enabled_traces != 0 &&
                 (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_SYSMEM_RESOLVE)))
      return;
   __trace_end_sysmem_resolve(
        ut
      , enabled_traces
      , cs
   );
}

/*
 * start_blit
 */
/* Payload recorded for the start_blit tracepoint; handed by const pointer
 * to tu_perfetto_start_blit().
 */
struct trace_start_blit {
   /* Presumably the Vulkan handle of the emitter's `cmd` argument --
    * confirm in the emitter implementation.
    */
   VkCommandBuffer command_buffer_handle;
   /* The fields below share names and types with the emitter arguments;
    * presumably copied from them verbatim.
    */
   uint8_t uses_3d_blit;
   enum VkFormat src_format;
   enum VkFormat dst_format;
   uint8_t layers;
};
#ifdef HAVE_PERFETTO
/* Perfetto hook for start_blit.  Declared only when HAVE_PERFETTO is
 * defined; the definition lives outside this header.  ts_ns is presumably
 * the event timestamp in nanoseconds -- confirm against the implementation.
 */
void tu_perfetto_start_blit(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_start_blit *payload,
   const void *indirect_data);
#endif
/* Out-of-line emitter for start_blit (declaration only).  The inline
 * trace_start_blit() wrapper calls it with the enabled_traces mask it read
 * from ut->utctx.
 */
void __trace_start_blit(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
     , struct tu_cmd_buffer * cmd
     , uint8_t uses_3d_blit
     , enum VkFormat src_format
     , enum VkFormat dst_format
     , uint8_t layers
);
/**
 * Inline gate for the start_blit tracepoint.
 *
 * Reads ut->utctx->enabled_traces once (relaxed atomic read) and forwards
 * all arguments to __trace_start_blit() only when that mask is non-zero and
 * the TU_GPU_TRACEPOINT_BLIT bit is set in tu_gpu_tracepoint.  Otherwise it
 * does nothing.
 */
static ALWAYS_INLINE void
trace_start_blit(struct u_trace *ut, void *cs,
                 struct tu_cmd_buffer * cmd,
                 uint8_t uses_3d_blit,
                 enum VkFormat src_format,
                 enum VkFormat dst_format,
                 uint8_t layers)
{
   const enum u_trace_type active_traces =
      p_atomic_read_relaxed(&ut->utctx->enabled_traces);

   if (unlikely(active_traces != 0 &&
                (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_BLIT))) {
      __trace_start_blit(ut, active_traces, cs, cmd,
                         uses_3d_blit, src_format, dst_format, layers);
   }
}

/*
 * end_blit
 */
/* Payload type for the end_blit tracepoint.  It carries no data: the struct
 * is empty in C and holds a single placeholder byte in C++.
 */
struct trace_end_blit {
#ifdef __cplusplus
   /* avoid warnings about empty struct size mis-match in C vs C++..
    * the size mis-match is harmless because (a) nothing will deref
    * the empty struct, and (b) the code that cares about allocating
    * sizeof(struct trace_end_blit) (and wants this to be zero
    * if there is no payload) is C
    */
   uint8_t dummy;
#endif
};
#ifdef HAVE_PERFETTO
/* Perfetto hook for end_blit.  Declared only when HAVE_PERFETTO is defined;
 * the definition lives outside this header.  ts_ns is presumably the event
 * timestamp in nanoseconds -- confirm against the implementation.
 */
void tu_perfetto_end_blit(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_end_blit *payload,
   const void *indirect_data);
#endif
/* Out-of-line emitter for end_blit (declaration only).  The inline
 * trace_end_blit() wrapper calls it with the enabled_traces mask it read
 * from ut->utctx.
 */
void __trace_end_blit(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
);
static ALWAYS_INLINE void trace_end_blit(
     struct u_trace *ut
   , void *cs
) {
   enum u_trace_type enabled_traces = p_atomic_read_relaxed(&ut->utctx->enabled_traces);
   if (!unlikely(enabled_traces != 0 &&
                 (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_BLIT)))
      return;
   __trace_end_blit(
        ut
      , enabled_traces
      , cs
   );
}

/*
 * start_compute
 */
/* Payload recorded for the start_compute tracepoint; handed by const
 * pointer to tu_perfetto_start_compute().
 */
struct trace_start_compute {
   /* Presumably the Vulkan handle of the emitter's `cmd` argument --
    * confirm in the emitter implementation.
    */
   VkCommandBuffer command_buffer_handle;
   /* The fields below share names and types with the emitter arguments;
    * presumably copied from them verbatim.
    */
   uint8_t indirect;
   uint8_t unaligned;
   uint16_t local_size_x;
   uint16_t local_size_y;
   uint16_t local_size_z;
   uint16_t num_groups_x;
   uint16_t num_groups_y;
   uint16_t num_groups_z;
   /* Fixed-size char array of SHA1_DIGEST_STRING_LENGTH bytes (see the
    * tu_sha1_str typedef); stored inline in the payload.
    */
   tu_sha1_str cs_sha1;
};
#ifdef HAVE_PERFETTO
/* Perfetto hook for start_compute.  Declared only when HAVE_PERFETTO is
 * defined; the definition lives outside this header.  ts_ns is presumably
 * the event timestamp in nanoseconds -- confirm against the implementation.
 */
void tu_perfetto_start_compute(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_start_compute *payload,
   const void *indirect_data);
#endif
/* Out-of-line emitter for start_compute (declaration only).  The inline
 * trace_start_compute() wrapper calls it with the enabled_traces mask it
 * read from ut->utctx.  Note that cs_sha1 is an array typedef, so as a
 * parameter it is passed as a pointer to its first char.
 */
void __trace_start_compute(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
     , struct tu_cmd_buffer * cmd
     , uint8_t indirect
     , uint8_t unaligned
     , uint16_t local_size_x
     , uint16_t local_size_y
     , uint16_t local_size_z
     , uint16_t num_groups_x
     , uint16_t num_groups_y
     , uint16_t num_groups_z
     , tu_sha1_str cs_sha1
);
/**
 * Inline gate for the start_compute tracepoint.
 *
 * Reads ut->utctx->enabled_traces once (relaxed atomic read) and forwards
 * all arguments to __trace_start_compute() only when that mask is non-zero
 * and the TU_GPU_TRACEPOINT_COMPUTE bit is set in tu_gpu_tracepoint.
 * Otherwise it does nothing.
 */
static ALWAYS_INLINE void
trace_start_compute(struct u_trace *ut, void *cs,
                    struct tu_cmd_buffer * cmd,
                    uint8_t indirect,
                    uint8_t unaligned,
                    uint16_t local_size_x,
                    uint16_t local_size_y,
                    uint16_t local_size_z,
                    uint16_t num_groups_x,
                    uint16_t num_groups_y,
                    uint16_t num_groups_z,
                    tu_sha1_str cs_sha1)
{
   const enum u_trace_type active_traces =
      p_atomic_read_relaxed(&ut->utctx->enabled_traces);

   if (unlikely(active_traces != 0 &&
                (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_COMPUTE))) {
      __trace_start_compute(ut, active_traces, cs, cmd,
                            indirect, unaligned,
                            local_size_x, local_size_y, local_size_z,
                            num_groups_x, num_groups_y, num_groups_z,
                            cs_sha1);
   }
}

/*
 * end_compute
 */
/* Payload type for the end_compute tracepoint.  It carries no data: the
 * struct is empty in C and holds a single placeholder byte in C++.
 */
struct trace_end_compute {
#ifdef __cplusplus
   /* avoid warnings about empty struct size mis-match in C vs C++..
    * the size mis-match is harmless because (a) nothing will deref
    * the empty struct, and (b) the code that cares about allocating
    * sizeof(struct trace_end_compute) (and wants this to be zero
    * if there is no payload) is C
    */
   uint8_t dummy;
#endif
};
#ifdef HAVE_PERFETTO
/* Perfetto hook for end_compute.  Declared only when HAVE_PERFETTO is
 * defined; the definition lives outside this header.  ts_ns is presumably
 * the event timestamp in nanoseconds -- confirm against the implementation.
 */
void tu_perfetto_end_compute(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_end_compute *payload,
   const void *indirect_data);
#endif
/* Out-of-line emitter for end_compute (declaration only).  The inline
 * trace_end_compute() wrapper calls it with the enabled_traces mask it read
 * from ut->utctx.
 */
void __trace_end_compute(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
);
static ALWAYS_INLINE void trace_end_compute(
     struct u_trace *ut
   , void *cs
) {
   enum u_trace_type enabled_traces = p_atomic_read_relaxed(&ut->utctx->enabled_traces);
   if (!unlikely(enabled_traces != 0 &&
                 (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_COMPUTE)))
      return;
   __trace_end_compute(
        ut
      , enabled_traces
      , cs
   );
}

/*
 * start_compute_indirect
 */
/* Payload recorded for the start_compute_indirect tracepoint; handed by
 * const pointer to tu_perfetto_start_compute_indirect().
 */
struct trace_start_compute_indirect {
   /* Presumably the Vulkan handle of the emitter's `cmd` argument --
    * confirm in the emitter implementation.
    */
   VkCommandBuffer command_buffer_handle;
   /* Shares its name and type with the emitter's `unaligned` argument. */
   uint8_t unaligned;
   /* Fixed-size char array of SHA1_DIGEST_STRING_LENGTH bytes (see the
    * tu_sha1_str typedef); stored inline in the payload.
    */
   tu_sha1_str cs_sha1;
};
#ifdef HAVE_PERFETTO
/* Perfetto hook for start_compute_indirect.  Declared only when
 * HAVE_PERFETTO is defined; the definition lives outside this header.
 * ts_ns is presumably the event timestamp in nanoseconds -- confirm against
 * the implementation.
 */
void tu_perfetto_start_compute_indirect(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_start_compute_indirect *payload,
   const void *indirect_data);
#endif
/* Out-of-line emitter for start_compute_indirect (declaration only).  The
 * inline trace_start_compute_indirect() wrapper calls it with the
 * enabled_traces mask it read from ut->utctx.
 */
void __trace_start_compute_indirect(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
     , struct tu_cmd_buffer * cmd
     , uint8_t unaligned
     , tu_sha1_str cs_sha1
);
/**
 * Inline gate for the start_compute_indirect tracepoint.
 *
 * Reads ut->utctx->enabled_traces once (relaxed atomic read) and forwards
 * all arguments to __trace_start_compute_indirect() only when that mask is
 * non-zero and the TU_GPU_TRACEPOINT_COMPUTE_INDIRECT bit is set in
 * tu_gpu_tracepoint.  Otherwise it does nothing.
 */
static ALWAYS_INLINE void
trace_start_compute_indirect(struct u_trace *ut, void *cs,
                             struct tu_cmd_buffer * cmd,
                             uint8_t unaligned,
                             tu_sha1_str cs_sha1)
{
   const enum u_trace_type active_traces =
      p_atomic_read_relaxed(&ut->utctx->enabled_traces);

   if (unlikely(active_traces != 0 &&
                (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_COMPUTE_INDIRECT))) {
      __trace_start_compute_indirect(ut, active_traces, cs, cmd,
                                     unaligned, cs_sha1);
   }
}

/*
 * end_compute_indirect
 */
/* Payload type for the end_compute_indirect tracepoint.  It carries no
 * data: the struct is empty in C and holds a single placeholder byte in
 * C++.  The emitter's `size` argument is not a member of this struct.
 */
struct trace_end_compute_indirect {
#ifdef __cplusplus
   /* avoid warnings about empty struct size mis-match in C vs C++..
    * the size mis-match is harmless because (a) nothing will deref
    * the empty struct, and (b) the code that cares about allocating
    * sizeof(struct trace_end_compute_indirect) (and wants this to be zero
    * if there is no payload) is C
    */
   uint8_t dummy;
#endif
};
#ifdef HAVE_PERFETTO
/* Perfetto hook for end_compute_indirect.  Declared only when HAVE_PERFETTO
 * is defined; the definition lives outside this header.  ts_ns is presumably
 * the event timestamp in nanoseconds, and indirect_data presumably carries
 * what was captured from the emitter's `size` address -- confirm both
 * against the implementation.
 */
void tu_perfetto_end_compute_indirect(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_end_compute_indirect *payload,
   const void *indirect_data);
#endif
/* Out-of-line emitter for end_compute_indirect (declaration only).  Unlike
 * the other end_* emitters visible here it takes an extra by-value
 * struct u_trace_address `size`.  The inline trace_end_compute_indirect()
 * wrapper calls it with the enabled_traces mask it read from ut->utctx.
 */
void __trace_end_compute_indirect(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
     , struct u_trace_address size
);
static ALWAYS_INLINE void trace_end_compute_indirect(
     struct u_trace *ut
   , void *cs
   , struct u_trace_address size
) {
   enum u_trace_type enabled_traces = p_atomic_read_relaxed(&ut->utctx->enabled_traces);
   if (!unlikely(enabled_traces != 0 &&
                 (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_COMPUTE_INDIRECT)))
      return;
   __trace_end_compute_indirect(
        ut
      , enabled_traces
      , cs
      , size
   );
}

/*
 * start_cmd_buffer_annotation
 */
/* Payload recorded for the start_cmd_buffer_annotation tracepoint; handed
 * by const pointer to tu_perfetto_start_cmd_buffer_annotation().  The
 * struct ends in a zero-length array, so its sizeof does not include the
 * string bytes.
 */
struct trace_start_cmd_buffer_annotation {
   /* Presumably the Vulkan handle of the emitter's `cmd` argument --
    * confirm in the emitter implementation.
    */
   VkCommandBuffer command_buffer_handle;
   /* Placeholder member; its purpose is not evident from this header. */
   uint8_t dummy;
   /* Zero-length trailing array (compiler extension).  Presumably holds a
    * copy of the emitter's `str` argument sized from `len` -- confirm in
    * the emitter implementation.
    */
   char str[0];
};
#ifdef HAVE_PERFETTO
/* Perfetto hook for start_cmd_buffer_annotation.  Declared only when
 * HAVE_PERFETTO is defined; the definition lives outside this header.
 * ts_ns is presumably the event timestamp in nanoseconds -- confirm against
 * the implementation.
 */
void tu_perfetto_start_cmd_buffer_annotation(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_start_cmd_buffer_annotation *payload,
   const void *indirect_data);
#endif
/* Out-of-line emitter for start_cmd_buffer_annotation (declaration only).
 * The inline trace_start_cmd_buffer_annotation() wrapper calls it with the
 * enabled_traces mask it read from ut->utctx.  Whether `len` counts the
 * terminating NUL is not visible here.
 */
void __trace_start_cmd_buffer_annotation(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
     , struct tu_cmd_buffer * cmd
     , unsigned len
     , const char *str
);
static ALWAYS_INLINE void trace_start_cmd_buffer_annotation(
     struct u_trace *ut
   , void *cs
   , struct tu_cmd_buffer * cmd
   , unsigned len
   , const char *str
) {
   enum u_trace_type enabled_traces = p_atomic_read_relaxed(&ut->utctx->enabled_traces);
   if (!unlikely(enabled_traces != 0 &&
                 (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_CMD_BUFFER_ANNOTATION)))
      return;
   __trace_start_cmd_buffer_annotation(
        ut
      , enabled_traces
      , cs
      , cmd
      , len
      , str
   );
}

/*
 * end_cmd_buffer_annotation
 */
/* Payload type for the end_cmd_buffer_annotation tracepoint.  It carries no
 * data: the struct is empty in C and holds a single placeholder byte in
 * C++.
 */
struct trace_end_cmd_buffer_annotation {
#ifdef __cplusplus
   /* avoid warnings about empty struct size mis-match in C vs C++..
    * the size mis-match is harmless because (a) nothing will deref
    * the empty struct, and (b) the code that cares about allocating
    * sizeof(struct trace_end_cmd_buffer_annotation) (and wants this to be zero
    * if there is no payload) is C
    */
   uint8_t dummy;
#endif
};
#ifdef HAVE_PERFETTO
/* Perfetto hook for end_cmd_buffer_annotation.  Declared only when
 * HAVE_PERFETTO is defined; the definition lives outside this header.
 * ts_ns is presumably the event timestamp in nanoseconds -- confirm against
 * the implementation.
 */
void tu_perfetto_end_cmd_buffer_annotation(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_end_cmd_buffer_annotation *payload,
   const void *indirect_data);
#endif
/* Out-of-line emitter for end_cmd_buffer_annotation (declaration only).
 * The inline trace_end_cmd_buffer_annotation() wrapper calls it with the
 * enabled_traces mask it read from ut->utctx.
 */
void __trace_end_cmd_buffer_annotation(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
);
static ALWAYS_INLINE void trace_end_cmd_buffer_annotation(
     struct u_trace *ut
   , void *cs
) {
   enum u_trace_type enabled_traces = p_atomic_read_relaxed(&ut->utctx->enabled_traces);
   if (!unlikely(enabled_traces != 0 &&
                 (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_CMD_BUFFER_ANNOTATION)))
      return;
   __trace_end_cmd_buffer_annotation(
        ut
      , enabled_traces
      , cs
   );
}

/*
 * start_cmd_buffer_annotation_rp
 */
/* Payload recorded for the start_cmd_buffer_annotation_rp tracepoint;
 * handed by const pointer to tu_perfetto_start_cmd_buffer_annotation_rp().
 * The struct ends in a zero-length array, so its sizeof does not include
 * the string bytes.
 */
struct trace_start_cmd_buffer_annotation_rp {
   /* Presumably the Vulkan handle of the emitter's `cmd` argument --
    * confirm in the emitter implementation.
    */
   VkCommandBuffer command_buffer_handle;
   /* Placeholder member; its purpose is not evident from this header. */
   uint8_t dummy;
   /* Zero-length trailing array (compiler extension).  Presumably holds a
    * copy of the emitter's `str` argument sized from `len` -- confirm in
    * the emitter implementation.
    */
   char str[0];
};
#ifdef HAVE_PERFETTO
/* Perfetto hook for start_cmd_buffer_annotation_rp.  Declared only when
 * HAVE_PERFETTO is defined; the definition lives outside this header.
 * ts_ns is presumably the event timestamp in nanoseconds -- confirm against
 * the implementation.
 */
void tu_perfetto_start_cmd_buffer_annotation_rp(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_start_cmd_buffer_annotation_rp *payload,
   const void *indirect_data);
#endif
/* Out-of-line emitter for start_cmd_buffer_annotation_rp (declaration
 * only).  The inline trace_start_cmd_buffer_annotation_rp() wrapper calls
 * it with the enabled_traces mask it read from ut->utctx.  Whether `len`
 * counts the terminating NUL is not visible here.
 */
void __trace_start_cmd_buffer_annotation_rp(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
     , struct tu_cmd_buffer * cmd
     , unsigned len
     , const char *str
);
static ALWAYS_INLINE void trace_start_cmd_buffer_annotation_rp(
     struct u_trace *ut
   , void *cs
   , struct tu_cmd_buffer * cmd
   , unsigned len
   , const char *str
) {
   enum u_trace_type enabled_traces = p_atomic_read_relaxed(&ut->utctx->enabled_traces);
   if (!unlikely(enabled_traces != 0 &&
                 (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_CMD_BUFFER_ANNOTATION_RP)))
      return;
   __trace_start_cmd_buffer_annotation_rp(
        ut
      , enabled_traces
      , cs
      , cmd
      , len
      , str
   );
}

/*
 * end_cmd_buffer_annotation_rp
 */
/* Payload type for the end_cmd_buffer_annotation_rp tracepoint.  It carries
 * no data: the struct is empty in C and holds a single placeholder byte in
 * C++.
 */
struct trace_end_cmd_buffer_annotation_rp {
#ifdef __cplusplus
   /* avoid warnings about empty struct size mis-match in C vs C++..
    * the size mis-match is harmless because (a) nothing will deref
    * the empty struct, and (b) the code that cares about allocating
    * sizeof(struct trace_end_cmd_buffer_annotation_rp) (and wants this to be zero
    * if there is no payload) is C
    */
   uint8_t dummy;
#endif
};
#ifdef HAVE_PERFETTO
/* Perfetto hook for end_cmd_buffer_annotation_rp.  Declared only when
 * HAVE_PERFETTO is defined; the definition lives outside this header.
 * ts_ns is presumably the event timestamp in nanoseconds -- confirm against
 * the implementation.
 */
void tu_perfetto_end_cmd_buffer_annotation_rp(
   struct tu_device *dev,
   uint64_t ts_ns,
   uint16_t tp_idx,
   const void *flush_data,
   const struct trace_end_cmd_buffer_annotation_rp *payload,
   const void *indirect_data);
#endif
/* Out-of-line emitter for end_cmd_buffer_annotation_rp (declaration only).
 * The inline trace_end_cmd_buffer_annotation_rp() wrapper calls it with the
 * enabled_traces mask it read from ut->utctx.
 */
void __trace_end_cmd_buffer_annotation_rp(
       struct u_trace *ut
     , enum u_trace_type enabled_traces
     , void *cs
);
static ALWAYS_INLINE void trace_end_cmd_buffer_annotation_rp(
     struct u_trace *ut
   , void *cs
) {
   enum u_trace_type enabled_traces = p_atomic_read_relaxed(&ut->utctx->enabled_traces);
   if (!unlikely(enabled_traces != 0 &&
                 (tu_gpu_tracepoint & TU_GPU_TRACEPOINT_CMD_BUFFER_ANNOTATION_RP)))
      return;
   __trace_end_cmd_buffer_annotation_rp(
        ut
      , enabled_traces
      , cs
   );
}

#ifdef __cplusplus
}
#endif

#endif /* _TU_TRACEPOINTS_H */
