diff --git a/clang/lib/Headers/__clang_cuda_intrinsics.h b/clang/lib/Headers/__clang_cuda_intrinsics.h
index 8b230af6f6647..cca97cb21ef50 100644
--- a/clang/lib/Headers/__clang_cuda_intrinsics.h
+++ b/clang/lib/Headers/__clang_cuda_intrinsics.h
@@ -479,6 +479,293 @@ inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
   return ret;
 }
 
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+
+// Cache-hint load/store intrinsics (__ldcg/__ldcv/__ldcs/__stwt), implemented
+// with PTX ld.global.{cg,cv,cs} / st.global.wt instructions.
+//
+// __INTRINSIC_LOAD emits a scalar load named __FnName: the value is read into
+// a __TmpType register bound via the given asm constraint (__AsmType) and cast
+// back to __DeclType on return.  __Clobber is either empty or `: "memory"`
+// (the latter for the volatile .cv loads, which must re-read memory).
+#pragma push_macro("__INTRINSIC_LOAD")
+#define __INTRINSIC_LOAD(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
+                         __Clobber)                                           \
+  inline __device__ __DeclType __FnName(const __DeclType *__ptr) {            \
+    __TmpType __ret;                                                          \
+    asm(__AsmOp " %0, [%1];" : __AsmType(__ret) : "l"(__ptr)__Clobber);       \
+    return (__DeclType)__ret;                                                 \
+  }
+
+// Two-element vector form: loads {x,y} with a single v2 PTX instruction, then
+// narrows each lane from the __TmpType element type to the declared one.
+#pragma push_macro("__INTRINSIC_LOAD2")
+#define __INTRINSIC_LOAD2(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
+                          __Clobber)                                           \
+  inline __device__ __DeclType __FnName(const __DeclType *__ptr) {             \
+    __DeclType __ret;                                                          \
+    __TmpType __tmp;                                                           \
+    asm(__AsmOp " {%0,%1}, [%2];"                                              \
+        : __AsmType(__tmp.x), __AsmType(__tmp.y)                               \
+        : "l"(__ptr)__Clobber);                                                \
+    using __ElementType = decltype(__ret.x);                                   \
+    __ret.x = (__ElementType)(__tmp.x);                                        \
+    __ret.y = (__ElementType)__tmp.y;                                          \
+    return __ret;                                                              \
+  }
+
+// Four-element vector form: loads {x,y,z,w} with a single v4 PTX instruction.
+#pragma push_macro("__INTRINSIC_LOAD4")
+#define __INTRINSIC_LOAD4(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType, \
+                          __Clobber)                                           \
+  inline __device__ __DeclType __FnName(const __DeclType *__ptr) {             \
+    __DeclType __ret;                                                          \
+    __TmpType __tmp;                                                           \
+    asm(__AsmOp " {%0,%1,%2,%3}, [%4];"                                        \
+        : __AsmType(__tmp.x), __AsmType(__tmp.y), __AsmType(__tmp.z),          \
+          __AsmType(__tmp.w)                                                   \
+        : "l"(__ptr)__Clobber);                                                \
+    using __ElementType = decltype(__ret.x);                                   \
+    __ret.x = (__ElementType)__tmp.x;                                          \
+    __ret.y = (__ElementType)__tmp.y;                                          \
+    __ret.z = (__ElementType)__tmp.z;                                          \
+    __ret.w = (__ElementType)__tmp.w;                                          \
+    return __ret;                                                              \
+  }
+
+// __ldcg: load, cache at the global (L2) level only, bypassing L1.
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", char, unsigned int, "=r", );
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s8", signed char, unsigned int,
+                 "=r", );
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s16", short, unsigned short, "=h", );
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s32", int, unsigned int, "=r", );
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.s64", long long, unsigned long long,
+                 "=l", );
+
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s8", char2, int2, "=r", );
+__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s8", char4, int4, "=r", );
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s16", short2, short2, "=h", );
+__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s16", short4, short4, "=h", );
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s32", int2, int2, "=r", );
+__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.s32", int4, int4, "=r", );
+// NOTE(review): removed a stray trailing space that was inside the
+// "ld.global.cg.v2.s64" opcode string; it emitted a doubled space in the
+// generated PTX and was inconsistent with every sibling instantiation.
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.s64", longlong2, longlong2, "=l", );
+
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u8", unsigned char, unsigned int,
+                 "=r", );
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u16", unsigned short, unsigned short,
+                 "=h", );
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u32", unsigned int, unsigned int,
+                 "=r", );
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.u64", unsigned long long,
+                 unsigned long long, "=l", );
+
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u8", uchar2, int2, "=r", );
+__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u8", uchar4, int4, "=r", );
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u16", ushort2, ushort2, "=h", );
+__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u16", ushort4, ushort4, "=h", );
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u32", uint2, uint2, "=r", );
+__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.u32", uint4, uint4, "=r", );
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.u64", ulonglong2, ulonglong2,
+                  "=l", );
+
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.f32", float, float, "=f", );
+__INTRINSIC_LOAD(__ldcg, "ld.global.cg.f64", double, double, "=d", );
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f32", float2, float2, "=f", );
+__INTRINSIC_LOAD4(__ldcg, "ld.global.cg.v4.f32", float4, float4, "=f", );
+__INTRINSIC_LOAD2(__ldcg, "ld.global.cg.v2.f64", double2, double2, "=d", );
+
+// `long` is 64-bit on LP64 targets and 32-bit elsewhere (e.g. Windows), so
+// pick the PTX width at compile time; the dead branch is discarded after
+// inlining.  NOTE(review): the "=r" (32-bit) branch binds an `unsigned long`
+// operand -- it is only reachable when sizeof(long) == 4, where sizes agree.
+inline __device__ long __ldcg(const long *__ptr) {
+  unsigned long __ret;
+  if (sizeof(long) == 8) {
+    asm("ld.global.cg.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
+  } else {
+    asm("ld.global.cg.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
+  }
+  return (long)__ret;
+}
+
+// __ldcv: volatile load (ld.global.cv) -- don't cache, fetch again on every
+// read.  Every variant therefore also clobbers "memory" so the compiler
+// cannot cache or reorder the load.
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u8", unsigned char, unsigned int,
+                 "=r", : "memory");
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u16", unsigned short, unsigned short,
+                 "=h", : "memory");
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u32", unsigned int, unsigned int,
+                 "=r", : "memory");
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.u64", unsigned long long,
+                 unsigned long long, "=l", : "memory");
+
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", char, unsigned int,
+                 "=r", : "memory");
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s8", signed char, unsigned int,
+                 "=r", : "memory");
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s16", short, unsigned short,
+                 "=h", : "memory");
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s32", int, unsigned int,
+                 "=r", : "memory");
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.s64", long long, unsigned long long,
+                 "=l", : "memory");
+
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u8", uchar2, uint2,
+                  "=r", : "memory");
+__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u8", uchar4, uint4,
+                  "=r", : "memory");
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u16", ushort2, ushort2,
+                  "=h", : "memory");
+__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u16", ushort4, ushort4,
+                  "=h", : "memory");
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u32", uint2, uint2,
+                  "=r", : "memory");
+__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.u32", uint4, uint4,
+                  "=r", : "memory");
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.u64", ulonglong2, ulonglong2,
+                  "=l", : "memory");
+
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s8", char2, int2, "=r", : "memory");
+__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s8", char4, int4, "=r", : "memory");
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s16", short2, short2,
+                  "=h", : "memory");
+__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s16", short4, short4,
+                  "=h", : "memory");
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s32", int2, int2, "=r", : "memory");
+__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.s32", int4, int4, "=r", : "memory");
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.s64", longlong2, longlong2,
+                  "=l", : "memory");
+
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.f32", float, float, "=f", : "memory");
+__INTRINSIC_LOAD(__ldcv, "ld.global.cv.f64", double, double, "=d", : "memory");
+
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f32", float2, float2,
+                  "=f", : "memory");
+__INTRINSIC_LOAD4(__ldcv, "ld.global.cv.v4.f32", float4, float4,
+                  "=f", : "memory");
+__INTRINSIC_LOAD2(__ldcv, "ld.global.cv.v2.f64", double2, double2,
+                  "=d", : "memory");
+
+// `long`-width dispatch, as for __ldcg above.
+inline __device__ long __ldcv(const long *__ptr) {
+  unsigned long __ret;
+  if (sizeof(long) == 8) {
+    asm("ld.global.cv.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
+  } else {
+    asm("ld.global.cv.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
+  }
+  return (long)__ret;
+}
+
+// __ldcs: streaming load (ld.global.cs) -- cache with an evict-first policy
+// for data accessed once.
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", char, unsigned int, "=r", );
+// NOTE(review): tmp type changed from `signed int` to `unsigned int` for
+// consistency with every other s8 scalar load in this file; the value is
+// truncated to the declared type on return either way.
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s8", signed char, unsigned int, "=r", );
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s16", short, unsigned short, "=h", );
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s32", int, unsigned int, "=r", );
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.s64", long long, unsigned long long,
+                 "=l", );
+
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s8", char2, int2, "=r", );
+__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s8", char4, int4, "=r", );
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s16", short2, short2, "=h", );
+__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s16", short4, short4, "=h", );
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s32", int2, int2, "=r", );
+__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.s32", int4, int4, "=r", );
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.s64", longlong2, longlong2, "=l", );
+
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u8", unsigned char, unsigned int,
+                 "=r", );
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u16", unsigned short, unsigned short,
+                 "=h", );
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u32", unsigned int, unsigned int,
+                 "=r", );
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.u64", unsigned long long,
+                 unsigned long long, "=l", );
+
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u8", uchar2, uint2, "=r", );
+__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u8", uchar4, uint4, "=r", );
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u16", ushort2, ushort2, "=h", );
+__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u16", ushort4, ushort4, "=h", );
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u32", uint2, uint2, "=r", );
+__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.u32", uint4, uint4, "=r", );
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.u64", ulonglong2, ulonglong2,
+                  "=l", );
+
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.f32", float, float, "=f", );
+__INTRINSIC_LOAD(__ldcs, "ld.global.cs.f64", double, double, "=d", );
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f32", float2, float2, "=f", );
+__INTRINSIC_LOAD4(__ldcs, "ld.global.cs.v4.f32", float4, float4, "=f", );
+__INTRINSIC_LOAD2(__ldcs, "ld.global.cs.v2.f64", double2, double2, "=d", );
+
+#pragma pop_macro("__INTRINSIC_LOAD")
+#pragma pop_macro("__INTRINSIC_LOAD2")
+#pragma pop_macro("__INTRINSIC_LOAD4")
+
+// `long`-width dispatch, as for __ldcg above.
+inline __device__ long __ldcs(const long *__ptr) {
+  unsigned long __ret;
+  if (sizeof(long) == 8) {
+    asm("ld.global.cs.s64 %0, [%1];" : "=l"(__ret) : "l"(__ptr));
+  } else {
+    asm("ld.global.cs.s32 %0, [%1];" : "=r"(__ret) : "l"(__ptr));
+  }
+  return (long)__ret;
+}
+
+// __stwt: write-through store (st.global.wt), writing through the L2 cache.
+// Scalar form: the value is cast to __TmpType before being bound to the
+// given register constraint (__AsmType).
+#pragma push_macro("__INTRINSIC_STORE")
+#define __INTRINSIC_STORE(__FnName, __AsmOp, __DeclType, __TmpType, __AsmType) \
+  inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) {     \
+    __TmpType __tmp = (__TmpType)__value;                                      \
+    asm(__AsmOp " [%0], %1;" ::"l"(__ptr), __AsmType(__tmp) : "memory");       \
+  }
+
+// Two-element vector form: widens each lane to the __TmpType element type and
+// stores {x,y} with a single v2 PTX instruction.
+#pragma push_macro("__INTRINSIC_STORE2")
+#define __INTRINSIC_STORE2(__FnName, __AsmOp, __DeclType, __TmpType,           \
+                           __AsmType)                                          \
+  inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) {     \
+    __TmpType __tmp;                                                           \
+    using __ElementType = decltype(__tmp.x);                                   \
+    __tmp.x = (__ElementType)(__value.x);                                      \
+    __tmp.y = (__ElementType)(__value.y);                                      \
+    asm(__AsmOp " [%0], {%1,%2};" ::"l"(__ptr), __AsmType(__tmp.x),            \
+        __AsmType(__tmp.y)                                                     \
+        : "memory");                                                           \
+  }
+
+// Four-element vector form: stores {x,y,z,w} with a single v4 PTX instruction.
+#pragma push_macro("__INTRINSIC_STORE4")
+#define __INTRINSIC_STORE4(__FnName, __AsmOp, __DeclType, __TmpType,           \
+                           __AsmType)                                          \
+  inline __device__ void __FnName(__DeclType *__ptr, __DeclType __value) {     \
+    __TmpType __tmp;                                                           \
+    using __ElementType = decltype(__tmp.x);                                   \
+    __tmp.x = (__ElementType)(__value.x);                                      \
+    __tmp.y = (__ElementType)(__value.y);                                      \
+    __tmp.z = (__ElementType)(__value.z);                                      \
+    __tmp.w = (__ElementType)(__value.w);                                      \
+    asm(__AsmOp " [%0], {%1,%2,%3,%4};" ::"l"(__ptr), __AsmType(__tmp.x),      \
+        __AsmType(__tmp.y), __AsmType(__tmp.z), __AsmType(__tmp.w)             \
+        : "memory");                                                           \
+  }
+
+__INTRINSIC_STORE(__stwt, "st.global.wt.s8", char, int, "r");
+__INTRINSIC_STORE(__stwt, "st.global.wt.s8", signed char, int, "r");
+__INTRINSIC_STORE(__stwt, "st.global.wt.s16", short, short, "h");
+__INTRINSIC_STORE(__stwt, "st.global.wt.s32", int, int, "r");
+__INTRINSIC_STORE(__stwt, "st.global.wt.s64", long long, long long, "l");
+
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s8", char2, int2, "r");
+__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s8", char4, int4, "r");
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s16", short2, short2, "h");
+__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s16", short4, short4, "h");
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s32", int2, int2, "r");
+__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.s32", int4, int4, "r");
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.s64", longlong2, longlong2, "l");
+
+__INTRINSIC_STORE(__stwt, "st.global.wt.u8", unsigned char, int, "r");
+__INTRINSIC_STORE(__stwt, "st.global.wt.u16", unsigned short, unsigned short,
+                  "h");
+__INTRINSIC_STORE(__stwt, "st.global.wt.u32", unsigned int, unsigned int, "r");
+__INTRINSIC_STORE(__stwt, "st.global.wt.u64", unsigned long long,
+                  unsigned long long, "l");
+
+// NOTE(review): tmp type for v2.u8 changed from uchar2 to uint2 so the lane
+// operands are 32-bit, matching the "r" constraint and consistent with the
+// v4.u8 (uint4) and v2/v4.s8 (int2/int4) instantiations.
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u8", uchar2, uint2, "r");
+__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u8", uchar4, uint4, "r");
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u16", ushort2, ushort2, "h");
+__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u16", ushort4, ushort4, "h");
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u32", uint2, uint2, "r");
+__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.u32", uint4, uint4, "r");
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.u64", ulonglong2, ulonglong2, "l");
+
+__INTRINSIC_STORE(__stwt, "st.global.wt.f32", float, float, "f");
+__INTRINSIC_STORE(__stwt, "st.global.wt.f64", double, double, "d");
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f32", float2, float2, "f");
+__INTRINSIC_STORE4(__stwt, "st.global.wt.v4.f32", float4, float4, "f");
+__INTRINSIC_STORE2(__stwt, "st.global.wt.v2.f64", double2, double2, "d");
+
+#pragma pop_macro("__INTRINSIC_STORE")
+#pragma pop_macro("__INTRINSIC_STORE2")
+#pragma pop_macro("__INTRINSIC_STORE4")
+
+#endif // defined(__cplusplus) && (__cplusplus >= 201103L)
 #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
 
 #if CUDA_VERSION >= 11000