/*******************************************************************************
* Copyright 2019-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef NGEN_INTERFACE_HPP
#define NGEN_INTERFACE_HPP


#include "ngen_core.hpp"
#include <sstream>


namespace NGEN_NAMESPACE {

template <HW hw> class OpenCLCodeGenerator;
template <HW hw> class L0CodeGenerator;

// Exceptions.
#ifdef NGEN_SAFE
class unknown_argument_exception : public std::runtime_error {
public:
    unknown_argument_exception() : std::runtime_error("Argument not found") {}
};

class bad_argument_type_exception : public std::runtime_error {
public:
    bad_argument_type_exception() : std::runtime_error("Bad argument type") {}
};

class interface_not_finalized : public std::runtime_error {
public:
    interface_not_finalized() : std::runtime_error("Interface has not been finalized") {}
};

class use_simd1_local_id_exception : public std::runtime_error {
public:
    use_simd1_local_id_exception() : std::runtime_error("Use getSIMD1LocalID for SIMD1 kernels") {}
};

class unsupported_argument_location_override : public std::runtime_error {
public:
    unsupported_argument_location_override() : std::runtime_error("Argument register location is invalid") {}
};

#endif

enum class ExternalArgumentType { Scalar, GlobalPtr, LocalPtr, Hidden };
enum class GlobalAccessType { None = 0, Stateless = 1, Surface = 2, All = 3, Default = 4 };

static inline GlobalAccessType operator|(GlobalAccessType access1, GlobalAccessType access2)
{
    return static_cast<GlobalAccessType>(static_cast<int>(access1) | static_cast<int>(access2));
}

enum class ThreadArbitrationMode { Default, OldestFirst, RoundRobin, RoundRobinOnStall };

class InterfaceHandler
{
    template <HW hw> friend class OpenCLCodeGenerator;
    template <HW hw> friend class L0CodeGenerator;

public:
    InterfaceHandler(HW hw_) : hw(hw_), simd(GRF::bytes(hw_) >> 2)
                             , requestedInlineGRFs(defaultInlineGRFs(hw))
    {}

    inline void externalName(const std::string &name)   { kernelName = name; }

    template <typename DT>
    inline void newArgument(const std::string &name)    { newArgument(name, getDataType<DT>()); }
    inline void newArgument(const std::string &name, DataType type, ExternalArgumentType exttype = ExternalArgumentType::Scalar, GlobalAccessType access = GlobalAccessType::Default);
    inline void newArgument(const std::string &name, Subregister reg, ExternalArgumentType exttype = ExternalArgumentType::Scalar, GlobalAccessType access = GlobalAccessType::Default);
    inline void newArgument(const std::string &name, ExternalArgumentType exttype, GlobalAccessType access = GlobalAccessType::Default);

    void allowArgumentRearrangement(bool allow)          { rearrangeArgs = allow; }

    inline Subregister getArgument(const std::string &name) const;
    inline Subregister getArgumentIfExists(const std::string &name) const;
    inline int getArgumentSurface(const std::string &name) const;
    inline int getArgumentSurfaceIfExists(const std::string &name) const;
    inline GRF getLocalID(int dim) const;
    inline Subregister getSIMD1LocalID(int dim) const;
    inline Subregister getLocalSize(int dim) const;

    const std::string &getExternalName() const           { return kernelName; }
    int getSIMD() const                                  { return simd; }
    int getBarrierCount() const                          { return barrierCount; }
    int getGRFCount() const                              { return needGRF; }
    size_t getSLMSize() const                            { return slmSize; }

    void require32BitBuffers()                           { allow64BitBuffers = false; }
    void requireArbitrationMode(ThreadArbitrationMode m) { arbitrationMode = m; }
    void requireBarrier()                                { barrierCount = 1; }
    void requireBarriers(int nBarriers)                  { barrierCount = nBarriers; }
    void requireDPAS()                                   { needDPAS = true; }
    void requireGlobalAtomics()                          { needGlobalAtomics = true; }
    void requireGRF(int grfs)                            { needGRF = grfs; }
    void requireLocalID(int dimensions)                  { needLocalID = dimensions; }
    void requireLocalSize()                              { needLocalSize = true; }
    void requireNonuniformWGs()                          { needNonuniformWGs = true; }
    void requireNoPreemption()                           { needNoPreemption = true; }
    void requirePartitionDim(int dim)                    { needPartitionDim = dim; }
    void requireScratch(size_t bytes = 1)                { scratchSize = bytes; }
    void requireSIMD(int simd_)                          { simd = simd_; }
    void requireSLM(size_t bytes)                        { slmSize = bytes; }
    void requireStatelessWrites(bool req = true)         { needStatelessWrites = req; }
    inline void requireType(DataType type);
    template <typename T> void requireType()             { requireType(getDataType<T>()); }
    void requireWalkOrder(int o1, int o2)                { walkOrder[0] = o1; walkOrder[1] = o2; walkOrder[2] = -1; }
    void requireWalkOrder(int o1, int o2, int o3)        { walkOrder[0] = o1; walkOrder[1] = o2; walkOrder[2] = o3; }
    void requireWorkgroup(size_t x, size_t y = 1,
                          size_t z = 1)                  { wg[0] = x; wg[1] = y; wg[2] = z; }

    void setArgumentBase(RegData base)                   { baseOverride = base; }
    void setInlineGRFCount(int grfs)                     { requestedInlineGRFs = grfs; }
    void setSkipPerThreadOffset(int32_t offset)          { offsetSkipPerThread = offset; }
    void setSkipCrossThreadOffset(int32_t offset)        { offsetSkipCrossThread = offset; }
    int32_t getSkipCrossThreadOffset() const             { return offsetSkipCrossThread; }

    inline GRF getCrossthreadBase(bool effective = true) const;
    inline GRF getArgLoadBase() const;

    inline void finalize();

    template <typename CodeGenerator>
    inline void generatePrologue(CodeGenerator &generator, const GRF &temp = GRF(127)) const;

    inline void generateDummyCL(std::ostream &stream) const;
    inline std::string generateZeInfo() const;

    static constexpr int noSurface = 0x80;        // Returned by getArgumentSurfaceIfExists in case of no surface assignment

protected:
    struct Assignment {
        std::string name;
        DataType type;
        ExternalArgumentType exttype;
        GlobalAccessType access;
        Subregister reg;
        int surface;
        int index;

        bool globalSurfaceAccess()   const { return (static_cast<int>(access) & static_cast<int>(GlobalAccessType::Surface)); }
        bool globalStatelessAccess() const { return (static_cast<int>(access) & static_cast<int>(GlobalAccessType::Stateless)); }
    };

    HW hw;

    std::vector<Assignment> assignments;
    std::string kernelName = "default_kernel";

    int nextArgIndex = 0;
    bool finalized = false;
    bool hasArgLocOverride = false;
    bool rearrangeArgs = true;

    bool allow64BitBuffers = false;
    ThreadArbitrationMode arbitrationMode = ThreadArbitrationMode::Default;
    int barrierCount = 0;
    RegData baseOverride;
    bool needDPAS = false;
    bool needGlobalAtomics = false;
    int32_t needGRF = 128;
    int needLocalID = 0;
    bool needLocalSize = false;
    bool needNonuniformWGs = false;
    bool needNoPreemption = false;
    int needPartitionDim = -1;
    bool needHalf = false;
    bool needDouble = false;
    bool needStatelessWrites = true;
    int32_t offsetSkipPerThread = 0;
    int32_t offsetSkipCrossThread = 0;
    size_t scratchSize = 0;
    int simd = 8;
    size_t slmSize = 0;
    int walkOrder[3] = {-1, -1, -1};
    size_t wg[3] = {0, 0, 0};

    int crossthreadBytes = 0;
    int crossthreadGRFs = 0;
    int requestedInlineGRFs = 0;
    inline int inlineGRFs() const;
    inline int getCrossthreadGRFs() const;
    inline int getCrossthreadBytes() const;
    int grfsPerLID() const { return (simd > 16 && GRF::bytes(hw) < 64) ? 2 : 1; }

    static inline GlobalAccessType defaultGlobalAccess(HW hw);
    static inline int defaultInlineGRFs(HW hw);
};

using NEOInterfaceHandler = InterfaceHandler;   /* Deprecated -- do not use in new code. */

void InterfaceHandler::newArgument(const std::string &name, Subregister reg, ExternalArgumentType exttype, GlobalAccessType access)
{
    auto type = reg.getType();
    if (reg.isNull())
        reg.invalidate();
    else
        hasArgLocOverride = true;
    if (exttype != ExternalArgumentType::GlobalPtr)
        access = GlobalAccessType::None;
    if (access == GlobalAccessType::Default)
        access = defaultGlobalAccess(hw);
    assignments.push_back({name, type, exttype, access, reg, noSurface, nextArgIndex++});
}

void InterfaceHandler::newArgument(const std::string &name, DataType type, ExternalArgumentType exttype, GlobalAccessType access)
{
    newArgument(name, NullRegister().sub(0, type), exttype, access);
}

void InterfaceHandler::newArgument(const std::string &name, ExternalArgumentType exttype, GlobalAccessType access)
{
    DataType type = DataType::invalid;

    switch (exttype) {
        case ExternalArgumentType::GlobalPtr: type = DataType::uq; break;
        case ExternalArgumentType::LocalPtr:  type = DataType::ud; break;
        default:
#ifdef NGEN_SAFE
            throw bad_argument_type_exception();
#else
        break;
#endif
    }

    newArgument(name, type, exttype, access);
}

Subregister InterfaceHandler::getArgumentIfExists(const std::string &name) const
{
    for (auto &assignment : assignments) {
        if (assignment.name == name)
            return assignment.reg;
    }

    return Subregister{};
}

Subregister InterfaceHandler::getArgument(const std::string &name) const
{
    Subregister arg = getArgumentIfExists(name);

#ifdef NGEN_SAFE
    if (arg.isInvalid())
        throw unknown_argument_exception();
#endif

    return arg;
}

int InterfaceHandler::getArgumentSurfaceIfExists(const std::string &name) const
{
    for (auto &assignment : assignments)
        if (assignment.name == name)
            return assignment.surface;
    return noSurface;
}

int InterfaceHandler::getArgumentSurface(const std::string &name) const
{
    int surface = getArgumentSurfaceIfExists(name);

#ifdef NGEN_SAFE
    if (surface == noSurface)
        throw unknown_argument_exception();
#endif
    return surface;
}

Subregister InterfaceHandler::getSIMD1LocalID(int dim) const
{
#ifdef NGEN_SAFE
    if (dim > needLocalID || simd != 1) throw unknown_argument_exception();
#endif

    return GRF(1).uw(dim);
}

GRF InterfaceHandler::getLocalID(int dim) const
{
#ifdef NGEN_SAFE
    if (dim > needLocalID) throw unknown_argument_exception();
    if (simd == 1) throw use_simd1_local_id_exception();
#endif

    if (simd == 1)
        return GRF(1).uw();
    else
        return GRF(1 + dim * grfsPerLID()).uw();
}

void InterfaceHandler::requireType(DataType type)
{
    switch (type) {
        case DataType::hf: needHalf = true;   break;
        case DataType::df: needDouble = true; break;
        default: break;
    }
}

static inline const char *getCLDataType(DataType type)
{
    static const char *names[16] = {"uint", "int", "ushort", "short", "uchar", "char", "double", "float", "ulong", "long", "half", "ushort", "INVALID", "INVALID", "INVALID", "INVALID"};
    return names[static_cast<uint8_t>(type) & 0xF];
}

void InterfaceHandler::generateDummyCL(std::ostream &stream) const
{
#ifdef NGEN_SAFE
    if (!finalized) throw interface_not_finalized();
    if (hasArgLocOverride || !rearrangeArgs) throw unsupported_argument_location_override();
#endif
    const char *dpasDummy = "    int __builtin_IB_sub_group_idpas_s8_s8_8_1(int, int, int8) __attribute__((const));\n"
                            "    int z = __builtin_IB_sub_group_idpas_s8_s8_8_1(0, ____[0], 1);\n"
                            "    for (int i = 0; i < z; i++) (void) ____[0];\n";

    if (needHalf)   stream << "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
    if (needDouble) stream << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";

    if (wg[0] > 0 && wg[1] > 0 && wg[2] > 0)
        stream << "__attribute__((reqd_work_group_size(" << wg[0] << ',' << wg[1] << ',' << wg[2] << ")))\n";
    if (walkOrder[0] >= 0) {
        stream << "__attribute__((intel_reqd_workgroup_walk_order(" << walkOrder[0] << ',' << walkOrder[1];
        if (walkOrder[2] >= 0)
            stream << ',' << walkOrder[2];
        stream << ")))\n";
    }
    stream << "__attribute__((intel_reqd_sub_group_size(" << simd << ")))\n";
    stream << "kernel void " << kernelName << '(';

    bool firstArg = true;
    for (const auto &assignment : assignments) {
        if (assignment.exttype == ExternalArgumentType::Hidden) continue;

        if (!firstArg) stream << ", ";

        switch (assignment.exttype) {
            case ExternalArgumentType::GlobalPtr: stream << "global uint *"; break;
            case ExternalArgumentType::LocalPtr: stream << "local uint *"; break;
            case ExternalArgumentType::Scalar: stream << getCLDataType(assignment.type) << ' '; break;
            default: break;
        }

        stream << assignment.name;
        firstArg = false;
    }
    stream << ") {\n";
    stream << "    global volatile int *____;\n";

    if (needLocalID)        stream << "    (void) ____[get_local_id(0)];\n";
    if (needLocalSize)      stream << "    (void) ____[get_enqueued_local_size(0)];\n";
    if (barrierCount > 0)   stream << "    __asm__ volatile(\"barrier\");\n";
    for (int i = 1; i < barrierCount; i++) {
        stream << "    local NamedBarrier_t *bar" << i << ";\n"
                  "    bar" << i << " = named_barrier_init(1);\n"
                  "    work_group_named_barrier(bar" << i << ", 0);\n";
    }
    if (needDPAS)           stream << dpasDummy;
    if (needGlobalAtomics)  stream << "    atomic_inc(____);\n";
    if (scratchSize > 0)    stream << "    volatile char scratch[" << scratchSize << "] = {0};\n";
    if (slmSize > 0)        stream << "    volatile local char slm[" << slmSize << "]; slm[0]++;\n";
    if (needNoPreemption) {
        if (hw == HW::Gen9)
            stream << "    volatile double *__df; *__df = 1.1 / *__df;\n"; // IEEE macro causes IGC to disable MTP.
        /* To do: Gen11 */
    }

    if (hw >= HW::XeHP) for (const auto &assignment : assignments) {
        // Force IGC to assume stateless accesses could occur if necessary.
        if (assignment.exttype == ExternalArgumentType::GlobalPtr && assignment.globalStatelessAccess())
            stream << "    __asm__ volatile(\"\" :: \"rw.u\"(" << assignment.name << "));\n";
    }

    for (const auto &assignment : assignments) {
        // Force IGC to assume surface accesses could occur if necessary.
        if (assignment.exttype == ExternalArgumentType::GlobalPtr && assignment.globalSurfaceAccess())
            stream << "    { volatile uchar __load = ((global uchar *) " << assignment.name << ")[get_local_id(0)];}\n";
    }

    stream << "}\n";
}

inline Subregister InterfaceHandler::getLocalSize(int dim) const
{
    static const std::string localSizeArgs[3] = {"__local_size0", "__local_size1", "__local_size2"};
    return getArgument(localSizeArgs[dim]);
}

void InterfaceHandler::finalize()
{
    // Make assignments, following NEO rules:
    //  - all inputs are naturally aligned
    //  - all sub-DWord inputs are DWord-aligned
    //  - first register is
    //      r3 (no local IDs)
    //      r5 (SIMD8/16, local IDs)
    //      r8 (SIMD32, local IDs)
    //  - assign local ptr arguments left-to-right
    //  - assign global pointer arguments left-to-right
    //  - assign scalar arguments left-to-right
    //  - assign surface indices left-to-right for global pointers
    //  - no arguments can cross a GRF boundary. Arrays like work size count
    //     as 1 argument for this rule.

    static const std::string localSizeArgs[3] = {"__local_size0", "__local_size1", "__local_size2"};
    static const std::string scratchSizeArg = "__scratch_size";

    GRF base;
    int offset;
    int nextSurface = 0;
    const int grfSize = GRF::bytes(hw);

    if (baseOverride.isValid()) {
        base = GRF(baseOverride.getBase());
        offset = baseOverride.getByteOffset();
    } else {
        base = getCrossthreadBase();
        offset = 32;
    }

    auto assignArgsOfType = [&](ExternalArgumentType which) {
        for (auto &assignment : assignments) {
            auto exttype = assignment.exttype;
            if (!rearrangeArgs)
                exttype = ExternalArgumentType::Scalar;
            if (exttype != which) continue;

            auto bytes = getBytes(assignment.type);
            auto size = getDwords(assignment.type) << 2;

            if (assignment.reg.isInvalid()) {
                if (assignment.name == localSizeArgs[0]) {
                    // Move to next GRF if local size arguments won't fit in this one.
                    if (offset > grfSize - (3 * 4)) {
                        offset = 0;
                        base++;
                    }
                }

                offset = (offset + size - 1) & -size;
                if (offset >= grfSize) {
                    offset = 0;
                    base++;
                }

                assignment.reg = base.sub(offset / bytes, assignment.type);
            } else {
                int obase = assignment.reg.getBase();
                int ooffset = assignment.reg.getByteOffset();
                if (base.getBase() < obase) {
                    base = GRF(obase);
                    offset = ooffset;
                } else if (base.getBase() == obase)
                    offset = std::max(offset, ooffset);
            }

            offset += size;

            if (assignment.exttype == ExternalArgumentType::GlobalPtr) {
                if (!assignment.globalStatelessAccess())
                    assignment.reg = Subregister{};
                if (assignment.globalSurfaceAccess())
                    assignment.surface = nextSurface;
                nextSurface++;
            } else if (assignment.exttype == ExternalArgumentType::Scalar)
                requireType(assignment.type);
        }
    };

    assignArgsOfType(ExternalArgumentType::LocalPtr);
    assignArgsOfType(ExternalArgumentType::GlobalPtr);
    assignArgsOfType(ExternalArgumentType::Scalar);

    // Add private memory size arguments.
    if (scratchSize > 0)
        newArgument(scratchSizeArg, DataType::uq, ExternalArgumentType::Hidden);

    // Add enqueued local size arguments.
    if (needLocalSize && needNonuniformWGs)
        for (int dim = 0; dim < 3; dim++)
            newArgument(localSizeArgs[dim], DataType::ud, ExternalArgumentType::Hidden);

    assignArgsOfType(ExternalArgumentType::Hidden);

    crossthreadBytes = (base.getBase() - getCrossthreadBase().getBase()) * GRF::bytes(hw)
                     + ((offset + 31) & -32);
    crossthreadGRFs = GRF::bytesToGRFs(hw, crossthreadBytes);

    // Manually add regular local size arguments.
    if (needLocalSize && !needNonuniformWGs)
        for (int dim = 0; dim < 3; dim++)
            assignments.push_back({localSizeArgs[dim], DataType::ud, ExternalArgumentType::Hidden,
                                   GlobalAccessType::None, GRF(getCrossthreadBase()).ud(dim + 3), noSurface, -1});

    finalized = true;
}

int InterfaceHandler::inlineGRFs() const
{
    return requestedInlineGRFs;
}

GRF InterfaceHandler::getCrossthreadBase(bool effective) const
{
    if (!needLocalID)
        return GRF((!effective || (hw >= HW::XeHP)) ? 1 : 2);
    else if (simd == 1)
        return GRF(2);
    else
        return GRF(1 + 3 * grfsPerLID());
}

GRF InterfaceHandler::getArgLoadBase() const
{
    return getCrossthreadBase().advance(inlineGRFs());
}

int InterfaceHandler::getCrossthreadBytes() const
{
#ifdef NGEN_SAFE
    if (!finalized) throw interface_not_finalized();
#endif
    return crossthreadBytes;
}

int InterfaceHandler::getCrossthreadGRFs() const
{
#ifdef NGEN_SAFE
    if (!finalized) throw interface_not_finalized();
#endif
    return crossthreadGRFs;
}

GlobalAccessType InterfaceHandler::defaultGlobalAccess(HW hw)
{
    if (hw >= HW::XeHPC) return GlobalAccessType::Stateless;
    return GlobalAccessType::All;
}

int InterfaceHandler::defaultInlineGRFs(HW hw)
{
    if (hw == HW::XeHP) return 1;
    if (hw == HW::XeHPG) return 1;
    return 0;
}

template <typename CodeGenerator>
void InterfaceHandler::generatePrologue(CodeGenerator &generator, const GRF &temp) const
{
    if (needLocalID)
        generator.loadlid(getCrossthreadBytes(), needLocalID, simd, temp, -1);
    if (getCrossthreadGRFs() > inlineGRFs())
        generator.loadargs(getArgLoadBase(), getCrossthreadGRFs() - inlineGRFs(), temp);
}

std::string InterfaceHandler::generateZeInfo() const
{
#ifdef NGEN_SAFE
    if (!finalized) throw interface_not_finalized();
#endif

    std::stringstream md;

    const char *version = "1.8";

    md << "version: " << version << "\n"
          "kernels: \n"
          "  - name: \"" << kernelName << "\"\n"
          "    execution_env: \n"
          "      grf_count: " << needGRF << "\n"
          "      simd_size: " << simd << "\n";
    if (simd > 1)
        md << "      required_sub_group_size: " << simd << "\n";
    if (wg[0] > 0 && wg[1] > 0 && wg[2] > 0) {
        md << "      required_work_group_size:\n"
           << "        - " << wg[0] << "\n"
           << "        - " << wg[1] << "\n"
           << "        - " << wg[2] << "\n";
    }
    if (walkOrder[0] >= 0) {
        md << "      work_group_walk_order_dimensions:\n"
           << "        - " << walkOrder[0] << "\n"
           << "        - " << walkOrder[1] << "\n"
           << "        - " << std::max(walkOrder[2], 0) << "\n";
    }
    if (offsetSkipPerThread > 0)
        md << "      offset_to_skip_per_thread_data_load: " << offsetSkipPerThread << '\n';
    if (barrierCount > 0)
        md << "      barrier_count: " << utils::roundup_pow2(barrierCount) << '\n';
    if (allow64BitBuffers)
        md << "      has_4gb_buffers: true\n";
    if (needDPAS)
        md << "      has_dpas: true\n";
    if (needGlobalAtomics)
        md << "      has_global_atomics: true\n";
    if (slmSize > 0)
        md << "      slm_size: " << slmSize << '\n';
    if (!needStatelessWrites)
        md << "      has_no_stateless_write: true\n";
    if (needNoPreemption)
        md << "      disable_mid_thread_preemption: true\n";
    if (arbitrationMode != ThreadArbitrationMode::Default) {
        md << "      thread_scheduling_mode: ";
        switch (arbitrationMode) {
            case ThreadArbitrationMode::OldestFirst:       md << "age_based\n";         break;
            case ThreadArbitrationMode::RoundRobin:        md << "round_robin\n";       break;
            case ThreadArbitrationMode::RoundRobinOnStall: md << "round_robin_stall\n"; break;
            default: break;
        }
    }
    if (inlineGRFs() > 0)
        md << "      inline_data_payload_size: " << inlineGRFs() * GRF::bytes(hw) << "\n";
    if (!assignments.empty()) {
        md << "\n"
              "    payload_arguments: \n";
    }
    for (auto &assignment : assignments) {
        uint32_t size = 0;
        bool skipArg = false;
        bool explicitArg = true;

        if (assignment.globalSurfaceAccess()) {
            md << "      - arg_type: arg_bypointer\n"
                  "        arg_index: " << assignment.index << "\n"
                  "        offset: 0\n"
                  "        size: 0\n"
                  "        addrmode: stateful\n"
                  "        addrspace: global\n"
                  "        access_type: readwrite\n"
                  "\n";
        }

        switch (assignment.exttype) {
            case ExternalArgumentType::Scalar:
                md << "      - arg_type: arg_byvalue\n";
                size = (assignment.reg.getDwords() << 2);
                break;
            case ExternalArgumentType::GlobalPtr:
                if (!assignment.globalStatelessAccess()) {
                    skipArg = true;
                    break;
                }
                // fall through
            case ExternalArgumentType::LocalPtr:
                md << "      - arg_type: arg_bypointer\n";
                size = (assignment.reg.getDwords() << 2);
                break;
            case ExternalArgumentType::Hidden: {
                explicitArg = false;
                if (assignment.name == "__local_size0") {
                    // from Zebin spec : local_size Argument size : int32x3
                    // may need refining to allow
                    // either int32x1, int32x2, int32x3 (x, xy, xyz)
                    // or fine grain : local_size_x, local_size_y, local_size_z
                    md << "      - arg_type: "
                       << (needNonuniformWGs ? "enqueued_local_size\n" : "local_size\n");
                    size = (assignment.reg.getDwords() << 2) * 3;
                } else
                    skipArg = true;
                break;
            }
        }
        if (skipArg)
            continue;

        auto offset = (assignment.reg.getBase() - getCrossthreadBase().getBase()) * GRF::bytes(hw) + assignment.reg.getByteOffset();

#ifdef NGEN_SAFE
        if (offset < 0) throw unsupported_argument_location_override();
#endif

        if (explicitArg)
            md << "        arg_index: " << assignment.index << "\n";
        md << "        offset: " << offset << "\n"
              "        size: " << size << '\n';

        if (assignment.globalStatelessAccess()) {
            md << "        addrmode: stateless\n"
                  "        addrspace: global\n"
                  "        access_type: readwrite\n";
        } else if (assignment.exttype == ExternalArgumentType::LocalPtr) {
            md << "        addrmode: slm\n"
                  "        addrspace: local\n"
                  "        access_type: readwrite\n";
        }
        md << "\n";
    }

    bool firstSurface = true;
    for (auto &assignment : assignments) {
        if (assignment.globalSurfaceAccess()) {
            if (firstSurface) {
                md << "\n"
                      "    binding_table_indices: \n";
                firstSurface = false;
            }
            md << "      - bti_value: " << assignment.surface << "\n"
                  "        arg_index: " << assignment.index << "\n"
                  " \n";
        }
    }

    if (needLocalID) {
        md << "\n"
              "    per_thread_payload_arguments: \n";

        if (simd == 1) {
            md << "      - arg_type: packed_local_ids\n"
                  "        offset: 0\n"
                  "        size: 6\n"
                  "  \n";
        } else {
            auto localIDBytes = grfsPerLID() * GRF::bytes(hw);
            localIDBytes *= 3; // runtime currently supports 0 or 3 localId channels in per thread data
            md << "      - arg_type: local_id\n"
                  "        offset: 0\n"
                  "        size: " << localIDBytes << "\n"
                  "  \n";
        }
    }

    md << "\n"; // ensure file ends with newline

#ifdef NGEN_DUMP_ZE_INFO
    std::cerr << md.str();
#endif

    return md.str();
}

} /* namespace NGEN_NAMESPACE */

#endif /* header guard */
