Lj128EEviPT_E2rZ has been demoted
// NOTE(review): the line above appears to be the tail of a compiler note whose
// beginning lies before this part of the file; it is kept verbatim.

// One-byte global objects. The mangled names refer to
// thrust::system::detail::sequential::seq and thrust::placeholders::_1 .. _10.
// Neither kernel below references them.
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_4df7aeae6thrust6system6detail10sequential3seqE[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_4df7aeae6thrust12placeholders2_1E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_4df7aeae6thrust12placeholders2_2E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_4df7aeae6thrust12placeholders2_3E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_4df7aeae6thrust12placeholders2_4E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_4df7aeae6thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_4df7aeae6thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_4df7aeae6thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_4df7aeae6thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_4df7aeae6thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_4df7aeae6thrust12placeholders3_10E[1];

//-----------------------------------------------------------------------------
// SoftMaxKernel_fp16_medium
//
// In-place softmax over one run of fp16 values per thread block; all
// arithmetic is done in f32 with flush-to-zero and approximate ex2/rcp.
//
// Parameter block (16 bytes, 8-byte aligned):
//   +0  u64  generic address of the fp16 data (converted to a global address)
//   +8  u32  n: number of elements handled by a block, and also the element
//            stride between the runs of consecutive blocks
//
// Thread tid.x of block ctaid.x owns element [n * ctaid.x + tid.x] when
// tid.x < n. Threads with tid.x >= n never touch global memory but still take
// part in every shuffle and barrier.
//
// Phases:
//   1. block-wide maximum            -> published through shared `rZ`
//   2. e = exp(x - max), block sum   -> 1/sum published through shared `rZ`
//   3. result = e * (1/sum), rounded to fp16 and stored over the input
//
// NOTE(review): thread 0 merges exactly the per-warp slots of warps 1..3 with
// its own warp's value, so the code presumably expects a 128-thread launch
// (this matches the 128 in the mangled shared-variable names). The launch
// configuration is not visible here; confirm on the host side.
// NOTE(review): each thread handles a single element and there is no loop, so
// an n larger than the block size would leave elements unprocessed.
// NOTE(review): the element index is formed with 32-bit arithmetic
// (mad.lo.s32) before being widened; very large buffers could wrap.
//-----------------------------------------------------------------------------
.visible .entry SoftMaxKernel_fp16_medium(
    .param .align 8 .b8 SoftMaxKernel_fp16_medium_param_0[16]
)
{
    .reg .pred %p<28>;
    .reg .b16 %rs<3>;
    .reg .f32 %f<58>;
    .reg .b32 %r<59>;
    .reg .b64 %rd<5>;
    // 24-byte reduction scratch. This kernel only uses the f32 slots at
    // byte offset 4 + 4*warp (written by lane 0 of each warp) and reads
    // offsets 8, 12 and 16 from thread 0.
    // demoted variable
    .shared .align 4 .b8 _ZZ21nchwSoftmaxAxisWSmallI6__halfLj128EEviPT_E11tempStorage[24];
    // Single f32 broadcast slot: holds the block maximum first and the
    // reciprocal of the block sum later.
    // demoted variable
    .shared .align 4 .f32 _ZZ21nchwSoftmaxAxisWSmallI6__halfLj128EEviPT_E2rZ;

    // ---- Setup: load parameters and compute this thread's element address --
    ld.param.u32 %r1, [SoftMaxKernel_fp16_medium_param_0+8];    // r1 = n
    ld.param.u64 %rd2, [SoftMaxKernel_fp16_medium_param_0];     // rd2 = data pointer
    cvta.to.global.u64 %rd3, %rd2;
    mov.u32 %r4, %ctaid.x;
    mov.u32 %r2, %tid.x;
    mad.lo.s32 %r5, %r1, %r4, %r2;      // r5 = n * ctaid.x + tid.x
    setp.ge.u32 %p1, %r2, %r1;          // p1 = thread is out of range (tid.x >= n)
    mul.wide.s32 %rd4, %r5, 2;          // byte offset: 2 bytes per fp16 element
    add.s64 %rd1, %rd3, %rd4;           // rd1 = address of this thread's element
    // 0fFF7FFFFF is the most negative finite f32, so an out-of-range thread
    // can never supply the maximum.
    mov.f32 %f54, 0fFF7FFFFF;
    @%p1 bra $L__BB0_2;
    ld.global.u16 %rs1, [%rd1];
    // begin inline asm
    { cvt.f32.f16 %f54, %rs1;}
    // end inline asm

    // ---- Phase 1a: per-warp maximum ---------------------------------------
    // Five shuffle-down steps with offsets 1, 2, 4, 8, 16 (clamp operand 31,
    // member mask -1 = all 32 lanes). After each step a lane adopts the
    // shuffled value only if lane + offset <= 31 and that value is strictly
    // greater than its own. Lane 0 ends up with the warp maximum in %f55.
$L__BB0_2:
    // begin inline asm
    mov.u32 %r6, %laneid;
    // end inline asm
    // begin inline asm
    mov.u32 %r7, %laneid;
    // end inline asm
    mov.b32 %r9, %f54;
    mov.u32 %r10, 1;
    mov.u32 %r31, 31;
    mov.u32 %r32, -1;
    // begin inline asm
    shfl.sync.down.b32 %r8, %r9, %r10, %r31, %r32;
    // end inline asm
    mov.b32 %f13, %r8;
    setp.lt.s32 %p2, %r7, 31;           // source lane (lane + 1) exists
    setp.lt.ftz.f32 %p3, %f54, %f13;
    and.pred %p4, %p2, %p3;
    selp.f32 %f14, %f13, %f54, %p4;
    mov.b32 %r14, %f14;
    mov.u32 %r15, 2;
    // begin inline asm
    shfl.sync.down.b32 %r13, %r14, %r15, %r31, %r32;
    // end inline asm
    mov.b32 %f15, %r13;
    setp.lt.s32 %p5, %r7, 30;           // source lane (lane + 2) exists
    setp.lt.ftz.f32 %p6, %f14, %f15;
    and.pred %p7, %p5, %p6;
    selp.f32 %f16, %f15, %f14, %p7;
    mov.b32 %r19, %f16;
    mov.u32 %r20, 4;
    // begin inline asm
    shfl.sync.down.b32 %r18, %r19, %r20, %r31, %r32;
    // end inline asm
    mov.b32 %f17, %r18;
    setp.lt.s32 %p8, %r7, 28;           // source lane (lane + 4) exists
    setp.lt.ftz.f32 %p9, %f16, %f17;
    and.pred %p10, %p8, %p9;
    selp.f32 %f18, %f17, %f16, %p10;
    mov.b32 %r24, %f18;
    mov.u32 %r25, 8;
    // begin inline asm
    shfl.sync.down.b32 %r23, %r24, %r25, %r31, %r32;
    // end inline asm
    mov.b32 %f19, %r23;
    setp.lt.s32 %p11, %r7, 24;          // source lane (lane + 8) exists
    setp.lt.ftz.f32 %p12, %f18, %f19;
    and.pred %p13, %p11, %p12;
    selp.f32 %f20, %f19, %f18, %p13;
    mov.b32 %r29, %f20;
    mov.u32 %r30, 16;
    // begin inline asm
    shfl.sync.down.b32 %r28, %r29, %r30, %r31, %r32;
    // end inline asm
    mov.b32 %f21, %r28;
    setp.lt.s32 %p14, %r7, 16;          // source lane (lane + 16) exists
    setp.lt.ftz.f32 %p15, %f20, %f21;
    and.pred %p16, %p14, %p15;
    selp.f32 %f55, %f21, %f20, %p16;
    setp.ne.s32 %p17, %r6, 0;           // p17 = not lane 0
    // r36 = tid.x / 32 as a signed division (the sign-derived bias in r34
    // makes the arithmetic shift round toward zero); r37 = 4 * r36.
    shr.s32 %r33, %r2, 31;
    shr.u32 %r34, %r33, 27;
    add.s32 %r35, %r2, %r34;
    shr.s32 %r36, %r35, 5;
    shl.b32 %r37, %r36, 2;
    mov.u32 %r38, _ZZ21nchwSoftmaxAxisWSmallI6__halfLj128EEviPT_E11tempStorage;
    add.s32 %r39, %r38, %r37;           // r39 = tempStorage + 4 * warp (reused in phase 2)
    @%p17 bra $L__BB0_4;
    // Lane 0 of each warp publishes its warp maximum at tempStorage + 4 + 4*warp.
    add.s32 %r57, %r39, 4;
    st.shared.f32 [%r57], %f55;

    // ---- Phase 1b: thread 0 merges the warp maxima ------------------------
$L__BB0_4:
    bar.sync 0;                         // per-warp maxima are now visible
    setp.ne.s32 %p18, %r2, 0;           // p18 = not thread 0
    @%p18 bra $L__BB0_6;
    // Thread 0 already holds its own warp's maximum in %f55; offsets 8, 12
    // and 16 are the slots written by warps 1, 2 and 3.
    ld.shared.f32 %f22, [_ZZ21nchwSoftmaxAxisWSmallI6__halfLj128EEviPT_E11tempStorage+8];
    setp.gt.ftz.f32 %p19, %f22, %f55;
    selp.f32 %f23, %f22, %f55, %p19;
    ld.shared.f32 %f24, [_ZZ21nchwSoftmaxAxisWSmallI6__halfLj128EEviPT_E11tempStorage+12];
    setp.gt.ftz.f32 %p20, %f24, %f23;
    selp.f32 %f25, %f24, %f23, %p20;
    ld.shared.f32 %f26, [_ZZ21nchwSoftmaxAxisWSmallI6__halfLj128EEviPT_E11tempStorage+16];
    setp.gt.ftz.f32 %p21, %f26, %f25;
    selp.f32 %f55, %f26, %f25, %p21;
$L__BB0_6:
    @%p18 bra $L__BB0_8;
    // Thread 0 broadcasts the block maximum through rZ.
    st.shared.f32 [_ZZ21nchwSoftmaxAxisWSmallI6__halfLj128EEviPT_E2rZ], %f55;

    // ---- Phase 2a: exponentials -------------------------------------------
$L__BB0_8:
    bar.sync 0;                         // rZ (block maximum) is now visible
    mov.f32 %f56, 0f00000000;           // out-of-range threads contribute 0 to the sum
    @%p1 bra $L__BB0_10;
    ld.shared.f32 %f28, [_ZZ21nchwSoftmaxAxisWSmallI6__halfLj128EEviPT_E2rZ];
    sub.ftz.f32 %f29, %f54, %f28;       // x - max
    mul.ftz.f32 %f30, %f29, 0f3FB8AA3B; // 0f3FB8AA3B ~= log2(e): exp(t) = 2^(t * log2 e)
    ex2.approx.ftz.f32 %f56, %f30;      // f56 = exp(x - max), approximate

    // ---- Phase 2b: per-warp sum -------------------------------------------
    // Same offsets 1, 2, 4, 8, 16. The shuffle's predicate output `p` reports
    // whether the source lane was in range; only then is the thread's own
    // partial sum added to the received value. Lane 0 ends up with the warp
    // sum in %f57.
$L__BB0_10:
    // begin inline asm
    mov.u32 %r40, %laneid;
    // end inline asm
    mov.u32 %r42, 1;
    mov.u32 %r55, 31;
    mov.u32 %r56, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f56, %r42, %r55, %r56; @p add.f32 r0, r0, %f56; mov.f32 %f31, r0;}
    // end inline asm
    mov.u32 %r45, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f31, %r45, %r55, %r56; @p add.f32 r0, r0, %f31; mov.f32 %f34, r0;}
    // end inline asm
    mov.u32 %r48, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f34, %r48, %r55, %r56; @p add.f32 r0, r0, %f34; mov.f32 %f37, r0;}
    // end inline asm
    mov.u32 %r51, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f37, %r51, %r55, %r56; @p add.f32 r0, r0, %f37; mov.f32 %f40, r0;}
    // end inline asm
    mov.u32 %r54, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f40, %r54, %r55, %r56; @p add.f32 r0, r0, %f40; mov.f32 %f57, r0;}
    // end inline asm
    setp.ne.s32 %p24, %r40, 0;          // p24 = not lane 0
    @%p24 bra $L__BB0_12;
    // Lane 0 of each warp publishes its warp sum in the same slot used for
    // the warp maximum; thread 0 finished reading the maxima before the
    // barrier at $L__BB0_8.
    add.s32 %r58, %r39, 4;
    st.shared.f32 [%r58], %f57;

    // ---- Phase 2c: thread 0 merges the warp sums --------------------------
$L__BB0_12:
    bar.sync 0;                         // per-warp sums are now visible
    @%p18 bra $L__BB0_14;
    ld.shared.f32 %f46, [_ZZ21nchwSoftmaxAxisWSmallI6__halfLj128EEviPT_E11tempStorage+8];
    add.ftz.f32 %f47, %f57, %f46;
    ld.shared.f32 %f48, [_ZZ21nchwSoftmaxAxisWSmallI6__halfLj128EEviPT_E11tempStorage+12];
    add.ftz.f32 %f49, %f47, %f48;
    ld.shared.f32 %f50, [_ZZ21nchwSoftmaxAxisWSmallI6__halfLj128EEviPT_E11tempStorage+16];
    add.ftz.f32 %f57, %f49, %f50;
$L__BB0_14:
    @%p18 bra $L__BB0_16;
    // Thread 0 overwrites rZ with the approximate reciprocal of the block
    // sum; every reader of the old value passed the barrier at $L__BB0_12.
    rcp.approx.ftz.f32 %f51, %f57;
    st.shared.f32 [_ZZ21nchwSoftmaxAxisWSmallI6__halfLj128EEviPT_E2rZ], %f51;

    // ---- Phase 3: normalise and store -------------------------------------
$L__BB0_16:
    bar.sync 0;                         // rZ (1 / sum) is now visible
    @%p1 bra $L__BB0_18;
    ld.shared.f32 %f53, [_ZZ21nchwSoftmaxAxisWSmallI6__halfLj128EEviPT_E2rZ];
    mul.ftz.f32 %f52, %f56, %f53;       // exp(x - max) * (1 / sum)
    // begin inline asm
    { cvt.rn.f16.f32 %rs2, %f52;}
    // end inline asm
    st.global.u16 [%rd1], %rs2;         // written back over the input element
$L__BB0_18:
    ret;
}

// Kernel with no parameters that returns immediately; the mangled name is
// cub::EmptyKernel<void>().
// .globl _ZN3cub11EmptyKernelIvEEvv
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
    ret;
}