d_mbh_reduce_cu_db79ce976thrust12placeholders2_1E[1];
/* The fragment above is the tail of a declaration that starts before this
   part of the file; it is reproduced unchanged. */
.global .align 1 .b8 _ZN58_INTERNAL_c82393cb_27_genmmha_fused_mbh_reduce_cu_db79ce976thrust12placeholders2_2E[1];
.global .align 1 .b8 _ZN58_INTERNAL_c82393cb_27_genmmha_fused_mbh_reduce_cu_db79ce976thrust12placeholders2_3E[1];
.global .align 1 .b8 _ZN58_INTERNAL_c82393cb_27_genmmha_fused_mbh_reduce_cu_db79ce976thrust12placeholders2_4E[1];
.global .align 1 .b8 _ZN58_INTERNAL_c82393cb_27_genmmha_fused_mbh_reduce_cu_db79ce976thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN58_INTERNAL_c82393cb_27_genmmha_fused_mbh_reduce_cu_db79ce976thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN58_INTERNAL_c82393cb_27_genmmha_fused_mbh_reduce_cu_db79ce976thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN58_INTERNAL_c82393cb_27_genmmha_fused_mbh_reduce_cu_db79ce976thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN58_INTERNAL_c82393cb_27_genmmha_fused_mbh_reduce_cu_db79ce976thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN58_INTERNAL_c82393cb_27_genmmha_fused_mbh_reduce_cu_db79ce976thrust12placeholders3_10E[1];

/* Dynamically sized shared-memory window used by the kernel below. */
.extern .shared .align 16 .b8 smem[];

/*----------------------------------------------------------------------------
 * MultiHeadAttention_mbh_reduce_kernel
 *
 * Syntax: NVIDIA PTX (compiler-generated). One statement per line.
 * All comments are block comments so that a loss of line breaks cannot turn
 * a comment into something that swallows the instructions following it.
 *
 * Input: a single 144-byte, 8-byte-aligned by-value parameter block.
 * Byte offsets read by this kernel:
 *     +0    u64  generic pointer; destination of the final 32-bit stores
 *     +80   u64  generic pointer, may be null; when non-null it is indexed
 *                by %ctaid.z as an array of 32-bit integers
 *     +88   u64  generic pointer; source of the 8-byte pairs staged in smem
 *     +96   u64  generic pointer; source of the per-lane (f32, f32) pairs
 *     +104  s32  count compared against both the warp index and the lane
 *     +112  s32  upper bound used in the early-exit test
 *     +116  s32  multiplier applied to %ctaid.z when forming the row index
 *     +124  s32  row length; half of it bounds the column indices
 * NOTE(review): the field names/meanings are not visible here; the roles
 * above are only what the address arithmetic shows - confirm against the
 * CUDA source of the parameter struct.
 *
 * Long-lived registers:
 *     %r1   = %ctaid.z
 *     %r5   = param[+116] * %ctaid.z + %ctaid.y
 *     %r7   = %tid.x / WARP_SZ          (warp index inside the block)
 *     %r8   = %tid.x - %r7 * WARP_SZ    (lane inside the warp)
 *     %r9   = param[+124] / 2           (signed, rounds toward zero)
 *     %r10  = %ctaid.x * WARP_SZ + %r7  (column pair handled by this warp)
 *     %rd2  = global(param[+0])  + 2 * (%r5 * param[+124])   bytes
 *     %rd3  = global(param[+88]) + 4 * (%r5 * param[+124] * param[+104]
 *                                       + %r7 * param[+124]) bytes
 *
 * Output: lane 0 of every warp that passes the bounds checks stores one
 * 32-bit word (two packed f16 values) at %rd2 + 4 * %r10.
 *--------------------------------------------------------------------------*/
.visible .entry MultiHeadAttention_mbh_reduce_kernel(
    .param .align 8 .b8 MultiHeadAttention_mbh_reduce_kernel_param_0[144]
)
{
    .reg .pred %p<22>;
    .reg .f32 %f<50>;
    .reg .b32 %r<105>;
    .reg .b64 %rd<27>;

    /* %rd1 = address of the parameter block; later loads go through it. */
    mov.b64 %rd4, MultiHeadAttention_mbh_reduce_kernel_param_0;
    mov.u64 %rd1, %rd4;
    mov.u32 %r1, %ctaid.z;
    ld.param.u32 %r2, [MultiHeadAttention_mbh_reduce_kernel_param_0+112];
    ld.param.u64 %rd5, [MultiHeadAttention_mbh_reduce_kernel_param_0+80];
    /* Default %r97 = param[+112]; overridden only if the +80 pointer is set. */
    setp.eq.s64 %p1, %rd5, 0;
    mov.u32 %r97, %r2;
    @%p1 bra $L__BB0_2;

    /* %r97 = ((s32 *)param[+80])[%ctaid.z] + 1 */
    ld.param.u64 %rd6, [%rd1+80];
    cvta.to.global.u64 %rd7, %rd6;
    mul.wide.s32 %rd8, %r1, 4;
    add.s64 %rd9, %rd7, %rd8;
    ld.global.u32 %r27, [%rd9];
    add.s32 %r97, %r27, 1;

$L__BB0_2:
    /* Whole block exits unless min(%r97, param[+112]) >= 257. The test only
       depends on %ctaid.z and parameters, so it is uniform across the block. */
    min.s32 %r28, %r97, %r2;
    setp.lt.s32 %p2, %r28, 257;
    @%p2 bra $L__BB0_21;

    ld.param.u64 %rd10, [%rd1];
    cvta.to.global.u64 %rd11, %rd10;
    ld.param.u64 %rd12, [%rd1+88];
    cvta.to.global.u64 %rd13, %rd12;
    ld.param.u32 %r29, [%rd1+116];
    mov.u32 %r30, %ctaid.y;
    mad.lo.s32 %r5, %r29, %r1, %r30;
    ld.param.u32 %r31, [%rd1+124];
    mul.lo.s32 %r32, %r5, %r31;
    /* Split %tid.x into warp index (%r7) and lane (%r8). */
    mov.u32 %r104, WARP_SZ;
    mov.u32 %r33, %tid.x;
    div.u32 %r7, %r33, %r104;
    mul.lo.s32 %r34, %r7, %r104;
    sub.s32 %r8, %r33, %r34;
    /* %rd2: output row base, 2 bytes per element. */
    mul.wide.s32 %rd14, %r32, 2;
    add.s64 %rd2, %rd11, %rd14;
    /* %rd3: input row base for this warp, 4 bytes per element. */
    ld.param.u32 %r35, [%rd1+104];
    mul.lo.s32 %r36, %r32, %r35;
    cvt.s64.s32 %rd15, %r36;
    mul.lo.s32 %r37, %r7, %r31;
    cvt.s64.s32 %rd16, %r37;
    add.s64 %rd17, %rd15, %rd16;
    shl.b64 %rd18, %rd17, 2;
    add.s64 %rd3, %rd13, %rd18;
    mov.u32 %r38, %ctaid.x;
    mad.lo.s32 %r39, %r38, %r104, %r8;
    /* %r9 = param[+124] / 2 using the sign-bit correction for signed divide. */
    shr.u32 %r40, %r31, 31;
    add.s32 %r41, %r31, %r40;
    shr.s32 %r9, %r41, 1;
    /* Skip the staging load when the column pair or the warp index is out
       of range. */
    setp.ge.s32 %p3, %r39, %r9;
    setp.ge.s32 %p4, %r7, %r35;
    or.pred %p5, %p3, %p4;
    @%p5 bra $L__BB0_5;

    /* Stage one 8-byte pair into shared memory at
       smem + 4 * warp * (2 * WARP_SZ + 2) + 8 * lane
       (row = warp index, column = lane; rows are padded by two words). */
    mov.u32 %r42, WARP_SZ;
    shl.b32 %r43, %r42, 1;
    add.s32 %r44, %r43, 2;
    mul.lo.s32 %r45, %r7, %r44;
    shl.b32 %r46, %r45, 2;
    mov.u32 %r47, smem;
    add.s32 %r48, %r47, %r46;
    shl.b32 %r49, %r8, 3;
    add.s32 %r50, %r48, %r49;
    mad.lo.s32 %r52, %r38, %r42, %r8;
    mul.wide.s32 %rd19, %r52, 8;
    add.s64 %rd20, %rd3, %rd19;
    ld.global.v2.u32 {%r53, %r54}, [%rd20];
    st.shared.v2.u32 [%r50], {%r53, %r54};

$L__BB0_5:
    /* Block-wide barrier: all staged pairs are visible before they are read. */
    bar.sync 0;
    mov.u32 %r57, WARP_SZ;
    mad.lo.s32 %r10, %r38, %r57, %r7;
    /* %r10 depends on the warp index only, so a warp leaves as a whole here. */
    setp.ge.s32 %p6, %r10, %r9;
    @%p6 bra $L__BB0_21;

    /* Defaults for lanes >= param[+104]: %f41 = -FLT_MAX (0fFF7FFFFF),
       %f40 = 0.0, i.e. neutral for the max and the sum below. */
    ld.param.u32 %r11, [%rd1+104];
    setp.ge.s32 %p7, %r8, %r11;
    mov.f32 %f41, 0fFF7FFFFF;
    mov.f32 %f40, 0f00000000;
    @%p7 bra $L__BB0_8;

    /* Load the (f32, f32) pair at index param[+104] * %r5 + lane.
       NOTE(review): the use below (max-reduce the first, rescale the second
       by exp(first - max)) suggests (running max, running sum) - confirm. */
    ld.param.u64 %rd21, [%rd1+96];
    cvta.to.global.u64 %rd22, %rd21;
    mad.lo.s32 %r59, %r11, %r5, %r8;
    mul.wide.s32 %rd23, %r59, 8;
    add.s64 %rd24, %rd22, %rd23;
    ld.global.f32 %f41, [%rd24];
    ld.global.f32 %f40, [%rd24+4];

$L__BB0_8:
    /* %p8 (WARP_SZ < 2) is reused to bypass all three reduction loops. */
    mov.b32 %r100, %f41;
    setp.lt.s32 %p8, %r104, 2;
    @%p8 bra $L__BB0_11;

    mov.b32 %r100, %f41;
    mov.u32 %r99, %r104;
    mov.f32 %f42, %f41;

$L__BB0_10:
    /* Warp max-reduction: offset = %r99 / 2, repeated while the previous
       %r99 was > 3, i.e. offsets WARP_SZ/2, ..., 2, 1. */
    shr.u32 %r60, %r99, 31;
    add.s32 %r61, %r99, %r60;
    shr.s32 %r16, %r61, 1;
    mov.u32 %r62, 31;
    mov.u32 %r63, -1;
    shfl.sync.down.b32 %r64|%p9, %r100, %r16, %r62, %r63;
    mov.b32 %f25, %r64;
    max.ftz.f32 %f42, %f25, %f42;
    mov.b32 %r100, %f42;
    setp.gt.s32 %p10, %r99, 3;
    mov.u32 %r99, %r16;
    @%p10 bra $L__BB0_10;

$L__BB0_11:
    /* Broadcast lane 0's reduced maximum to the whole warp (%f26). */
    mov.u32 %r65, -1;
    bar.warp.sync -1;
    mov.u32 %r66, 31;
    mov.u32 %r67, 0;
    shfl.sync.idx.b32 %r68|%p11, %r100, %r67, %r66, %r65;
    mov.b32 %f26, %r68;
    bar.warp.sync -1;
    /* %f7 = %f40 * 2^((%f41 - max) * log2(e)); 0f3FB8AA3B is ~1.442695. */
    sub.ftz.f32 %f27, %f41, %f26;
    mul.ftz.f32 %f28, %f27, 0f3FB8AA3B;
    ex2.approx.ftz.f32 %f29, %f28;
    mul.ftz.f32 %f7, %f40, %f29;
    mov.b32 %r103, %f7;
    @%p8 bra $L__BB0_14;

    mov.u32 %r102, %r104;
    mov.f32 %f43, %f7;

$L__BB0_13:
    /* Warp sum-reduction of %f7 with the same halving offsets as above. */
    shr.u32 %r69, %r102, 31;
    add.s32 %r70, %r102, %r69;
    shr.s32 %r22, %r70, 1;
    mov.u32 %r71, 31;
    mov.u32 %r72, -1;
    shfl.sync.down.b32 %r73|%p13, %r103, %r22, %r71, %r72;
    mov.b32 %f30, %r73;
    add.ftz.f32 %f43, %f43, %f30;
    mov.b32 %r103, %f43;
    setp.gt.s32 %p14, %r102, 3;
    mov.u32 %r102, %r22;
    @%p14 bra $L__BB0_13;

$L__BB0_14:
    /* Broadcast lane 0's sum (%f33); %f10 = this lane's share %f7 / sum. */
    bar.warp.sync -1;
    shfl.sync.idx.b32 %r77|%p15, %r103, %r67, %r66, %r65;
    mov.b32 %f33, %r77;
    bar.warp.sync -1;
    div.approx.ftz.f32 %f10, %f7, %f33;
    /* Lanes >= param[+104] contribute (0.0, 0.0). */
    ld.param.u32 %r78, [%rd1+104];
    setp.ge.s32 %p16, %r8, %r78;
    mov.f32 %f44, 0f00000000;
    mov.f32 %f45, %f44;
    @%p16 bra $L__BB0_16;

    /* Transposed read of the staged tile:
       smem + 4 * lane * (2 * WARP_SZ + 2) + 8 * warp
       (row = lane, column = warp index - the roles are swapped relative to
       the store before the barrier). */
    shl.b32 %r80, %r57, 1;
    add.s32 %r81, %r80, 2;
    mul.lo.s32 %r82, %r8, %r81;
    shl.b32 %r83, %r82, 2;
    mov.u32 %r84, smem;
    add.s32 %r85, %r84, %r83;
    shl.b32 %r86, %r7, 3;
    add.s32 %r87, %r85, %r86;
    ld.shared.v2.f32 {%f45, %f44}, [%r87];

$L__BB0_16:
    /* Scale both components by this lane's weight. */
    mul.ftz.f32 %f49, %f10, %f45;
    mul.ftz.f32 %f48, %f10, %f44;
    bar.warp.sync -1;
    @%p8 bra $L__BB0_19;

$L__BB0_18:
    /* Warp sum-reduction of both components; %r104 itself is the counter. */
    mov.b32 %r88, %f49;
    shr.u32 %r89, %r104, 31;
    add.s32 %r90, %r104, %r89;
    shr.s32 %r26, %r90, 1;
    mov.u32 %r91, 31;
    mov.u32 %r92, -1;
    shfl.sync.down.b32 %r93|%p18, %r88, %r26, %r91, %r92;
    mov.b32 %f36, %r93;
    add.ftz.f32 %f49, %f49, %f36;
    mov.b32 %r94, %f48;
    shfl.sync.down.b32 %r95|%p19, %r94, %r26, %r91, %r92;
    mov.b32 %f37, %r95;
    add.ftz.f32 %f48, %f48, %f37;
    setp.gt.s32 %p20, %r104, 3;
    mov.u32 %r104, %r26;
    @%p20 bra $L__BB0_18;

$L__BB0_19:
    /* Only lane 0 holds the complete sums and performs the store. */
    setp.ne.s32 %p21, %r8, 0;
    @%p21 bra $L__BB0_21;

    mul.wide.s32 %rd25, %r10, 4;
    add.s64 %rd26, %rd2, %rd25;
    /* begin inline asm */
    /* Round both sums to f16 and pack them: %f49 in the low half-word,
       %f48 in the high half-word of %r96. */
    {.reg .f16 low,high; cvt.rn.f16.f32 low, %f49; cvt.rn.f16.f32 high, %f48; mov.b32 %r96, {low,high};}
    /* end inline asm */
    st.global.u32 [%rd26], %r96;

$L__BB0_21:
    ret;

}

/* .globl _ZN3cub11EmptyKernelIvEEvv */
/* Entry point with no parameters whose body returns immediately. */
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
    ret;
}