// Internal-linkage definitions of thrust::placeholders (_2 .. _10) emitted by
// the compiler for this translation unit of softmax.cu.
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_4f1619496thrust12placeholders2_2E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_4f1619496thrust12placeholders2_3E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_4f1619496thrust12placeholders2_4E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_4f1619496thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_4f1619496thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_4f1619496thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_4f1619496thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_4f1619496thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_4f1619496thrust12placeholders3_10E[1];

// Softmax along the innermost (W) axis for rows of width n <= 32: one thread
// block of 32 threads (a single warp) per row, fp16 in/out, f32 accumulation.
// The shared symbols below demangle to locals of
// nchwSoftmaxAxisWSmall<__half, 32u>(int, __half*).
.visible .entry SoftMaxKernel_fp16_small(
	.param .align 8 .b8 SoftMaxKernel_fp16_small_param_0[16]
)
{
	.reg .pred 	%p<23>;
	.reg .b16 	%rs<3>;
	.reg .f32 	%f<42>;
	.reg .b32 	%r<49>;
	.reg .b64 	%rd<5>;
	// demoted variable: CUB warp-reduce temp storage
	.shared .align 4 .b8 _ZZ21nchwSoftmaxAxisWSmallI6__halfLj32EEviPT_E11tempStorage[12];
	// demoted variable: per-row broadcast slot (row max, later 1/sum)
	.shared .align 4 .f32 _ZZ21nchwSoftmaxAxisWSmallI6__halfLj32EEviPT_E2rZ;

	// param_0 packs the fp16 data pointer (+0) and the row width n (+8).
	ld.param.u32 	%r1, [SoftMaxKernel_fp16_small_param_0+8];
	ld.param.u64 	%rd2, [SoftMaxKernel_fp16_small_param_0];
	cvta.to.global.u64 	%rd3, %rd2;
	mov.u32 	%r3, %ctaid.x;
	mov.u32 	%r2, %tid.x;
	// idx = n * blockIdx.x + threadIdx.x; byte address = base + 2 * idx (fp16).
	mad.lo.s32 	%r4, %r1, %r3, %r2;
	setp.ge.u32 	%p1, %r2, %r1;     // %p1 = (tid >= n): lane is past the end of the row
	mul.wide.s32 	%rd4, %r4, 2;
	add.s64 	%rd1, %rd3, %rd4;
	mov.f32 	%f40, 0fFF7FFFFF;      // -FLT_MAX, identity for the max reduction
	@%p1 bra 	$L__BB0_2;

	// Active lanes load their fp16 element and widen it to f32.
	ld.global.u16 	%rs1, [%rd1];
	// begin inline asm
	{  cvt.f32.f16 %f40, %rs1;}
	// end inline asm

$L__BB0_2:
	// begin inline asm
	mov.u32 	%r5, %laneid;
	// end inline asm
	// begin inline asm
	mov.u32 	%r6, %laneid;
	// end inline asm
	// Warp-wide max reduction: five shfl.sync.down steps with offsets
	// 1, 2, 4, 8, 16; a lane takes the neighbor's value only if that
	// neighbor is in range (laneid check) and holds a larger value.
	mov.b32 	%r8, %f40;
	mov.u32 	%r9, 1;
	mov.u32 	%r30, 31;
	mov.u32 	%r31, -1;
	// begin inline asm
	shfl.sync.down.b32 %r7, %r8, %r9, %r30, %r31;
	// end inline asm
	mov.b32 	%f9, %r7;
	setp.lt.s32 	%p2, %r6, 31;
	setp.lt.ftz.f32 	%p3, %f40, %f9;
	and.pred  	%p4, %p2, %p3;
	selp.f32 	%f10, %f9, %f40, %p4;
	mov.b32 	%r13, %f10;
	mov.u32 	%r14, 2;
	// begin inline asm
	shfl.sync.down.b32 %r12, %r13, %r14, %r30, %r31;
	// end inline asm
	mov.b32 	%f11, %r12;
	setp.lt.s32 	%p5, %r6, 30;
	setp.lt.ftz.f32 	%p6, %f10, %f11;
	and.pred  	%p7, %p5, %p6;
	selp.f32 	%f12, %f11, %f10, %p7;
	mov.b32 	%r18, %f12;
	mov.u32 	%r19, 4;
	// begin inline asm
	shfl.sync.down.b32 %r17, %r18, %r19, %r30, %r31;
	// end inline asm
	mov.b32 	%f13, %r17;
	setp.lt.s32 	%p8, %r6, 28;
	setp.lt.ftz.f32 	%p9, %f12, %f13;
	and.pred  	%p10, %p8, %p9;
	selp.f32 	%f14, %f13, %f12, %p10;
	mov.b32 	%r23, %f14;
	mov.u32 	%r24, 8;
	// begin inline asm
	shfl.sync.down.b32 %r22, %r23, %r24, %r30, %r31;
	// end inline asm
	mov.b32 	%f15, %r22;
	setp.lt.s32 	%p11, %r6, 24;
	setp.lt.ftz.f32 	%p12, %f14, %f15;
	and.pred  	%p13, %p11, %p12;
	selp.f32 	%f16, %f15, %f14, %p13;
	mov.b32 	%r28, %f16;
	mov.u32 	%r29, 16;
	// begin inline asm
	shfl.sync.down.b32 %r27, %r28, %r29, %r30, %r31;
	// end inline asm
	mov.b32 	%f17, %r27;
	setp.lt.s32 	%p14, %r6, 16;
	setp.lt.ftz.f32 	%p15, %f16, %f17;
	and.pred  	%p16, %p14, %p15;
	selp.f32 	%f3, %f17, %f16, %p16;
	// Lane 0 holds the row max; it is staged through the CUB temp storage,
	// then thread 0 broadcasts it to the block through rZ.
	setp.ne.s32 	%p17, %r5, 0;
	@%p17 bra 	$L__BB0_4;

	st.shared.f32 	[_ZZ21nchwSoftmaxAxisWSmallI6__halfLj32EEviPT_E11tempStorage+4], %f3;

$L__BB0_4:
	bar.sync 	0;
	setp.ne.s32 	%p18, %r2, 0;
	@%p18 bra 	$L__BB0_6;

	st.shared.f32 	[_ZZ21nchwSoftmaxAxisWSmallI6__halfLj32EEviPT_E2rZ], %f3;

$L__BB0_6:
	bar.sync 	0;
	mov.f32 	%f41, 0f00000000;
	@%p1 bra 	$L__BB0_8;

	// exp(x - rowMax), computed as 2^((x - rowMax) * log2(e));
	// the constant 0f3FB8AA3B is log2(e).
	ld.shared.f32 	%f19, [_ZZ21nchwSoftmaxAxisWSmallI6__halfLj32EEviPT_E2rZ];
	sub.ftz.f32 	%f20, %f40, %f19;
	mul.ftz.f32 	%f21, %f20, 0f3FB8AA3B;
	ex2.approx.ftz.f32 	%f41, %f21;

$L__BB0_8:
	// begin inline asm
	mov.u32 	%r32, %laneid;
	// end inline asm
	// Warp-wide sum reduction: predicated shuffle-add with offsets
	// 1, 2, 4, 8, 16; the add fires only when the source lane is valid.
	mov.u32 	%r34, 1;
	mov.u32 	%r47, 31;
	mov.u32 	%r48, -1;
	// begin inline asm
	{  .reg .f32 r0;
	   .reg .pred p;
	   shfl.sync.down.b32 r0|p, %f41, %r34, %r47, %r48;
	   @p add.f32 r0, r0, %f41;
	   mov.f32 %f22, r0;}
	// end inline asm
	mov.u32 	%r37, 2;
	// begin inline asm
	{  .reg .f32 r0;
	   .reg .pred p;
	   shfl.sync.down.b32 r0|p, %f22, %r37, %r47, %r48;
	   @p add.f32 r0, r0, %f22;
	   mov.f32 %f25, r0;}
	// end inline asm
	mov.u32 	%r40, 4;
	// begin inline asm
	{  .reg .f32 r0;
	   .reg .pred p;
	   shfl.sync.down.b32 r0|p, %f25, %r40, %r47, %r48;
	   @p add.f32 r0, r0, %f25;
	   mov.f32 %f28, r0;}
	// end inline asm
	mov.u32 	%r43, 8;
	// begin inline asm
	{  .reg .f32 r0;
	   .reg .pred p;
	   shfl.sync.down.b32 r0|p, %f28, %r43, %r47, %r48;
	   @p add.f32 r0, r0, %f28;
	   mov.f32 %f31, r0;}
	// end inline asm
	mov.u32 	%r46, 16;
	// begin inline asm
	{  .reg .f32 r0;
	   .reg .pred p;
	   shfl.sync.down.b32 r0|p, %f31, %r46, %r47, %r48;
	   @p add.f32 r0, r0, %f31;
	   mov.f32 %f34, r0;}
	// end inline asm
	setp.ne.s32 	%p20, %r32, 0;
	@%p20 bra 	$L__BB0_10;

	st.shared.f32 	[_ZZ21nchwSoftmaxAxisWSmallI6__halfLj32EEviPT_E11tempStorage+4], %f34;

$L__BB0_10:
	bar.sync 	0;
	@%p18 bra 	$L__BB0_12;

	// Thread 0 stores the reciprocal of the row sum into rZ.
	rcp.approx.ftz.f32 	%f37, %f34;
	st.shared.f32 	[_ZZ21nchwSoftmaxAxisWSmallI6__halfLj32EEviPT_E2rZ], %f37;

$L__BB0_12:
	bar.sync 	0;
	@%p1 bra 	$L__BB0_14;

	// y = exp(x - rowMax) * (1 / rowSum), narrowed back to fp16 and stored.
	ld.shared.f32 	%f39, [_ZZ21nchwSoftmaxAxisWSmallI6__halfLj32EEviPT_E2rZ];
	mul.ftz.f32 	%f38, %f41, %f39;
	// begin inline asm
	{  cvt.rn.f16.f32 %rs2, %f38;}
	// end inline asm
	st.global.u16 	[%rd1], %rs2;

$L__BB0_14:
	ret;

}
	// .globl	_ZN3cub11EmptyKernelIvEEvv
// cub::EmptyKernel<void>, an empty kernel instantiated by CUB for internal use.
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
	ret;
}
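The shared-memory symbols demangle to locals of void nchwSoftmaxAxisWSmall<__half, 32u>(int, __half*), and the two shuffle reductions match CUB's cub::WarpReduce code paths, so the PTX plausibly compiles from source along the lines of the sketch below. This is a minimal reconstruction from the PTX, not the actual softmax.cu: the kernel name, template arguments, and the tempStorage/rZ variables come from the demangled symbols, while the use of __expf, the exact CUB calls, and how the SoftMaxKernel_fp16_small entry packs its parameters are assumptions.

// Hypothetical reconstruction from the PTX above -- not the original source.
#include <cub/cub.cuh>      // cub::WarpReduce, cub::Max
#include <cuda_fp16.h>
#include <cfloat>           // FLT_MAX

// Softmax along W for rows of width n <= 32: one 32-thread block (one warp)
// per row. Instantiated here as T = __half, TPB = 32.
template <typename T, unsigned int TPB>
__global__ void nchwSoftmaxAxisWSmall(int n, T* data)
{
    using WarpReduce = cub::WarpReduce<float>;
    __shared__ typename WarpReduce::TempStorage tempStorage; // the shared array in the PTX
    __shared__ float rZ;            // broadcast slot: row max, later 1/rowSum

    const int idx = n * blockIdx.x + threadIdx.x;  // mad.lo.s32 %r4, %r1, %r3, %r2
    float x = -FLT_MAX;                            // 0fFF7FFFFF, identity for max
    if (threadIdx.x < n)
        x = __half2float(data[idx]);               // valid for T = __half

    // 1) Row max via the five-step shfl.sync.down reduction;
    //    the result is valid in lane 0, which broadcasts it through rZ.
    float m = WarpReduce(tempStorage).Reduce(x, cub::Max());
    __syncthreads();
    if (threadIdx.x == 0) rZ = m;
    __syncthreads();

    // 2) exp(x - max); __expf lowers to the mul-by-log2(e) + ex2.approx pair.
    float e = (threadIdx.x < n) ? __expf(x - rZ) : 0.f;

    // 3) Row sum via the predicated shuffle-add reduction, then its
    //    reciprocal (rcp.approx.ftz.f32 under fast math) broadcast via rZ.
    float s = WarpReduce(tempStorage).Sum(e);
    __syncthreads();
    if (threadIdx.x == 0) rZ = 1.f / s;
    __syncthreads();

    // 4) Normalize and store back as fp16.
    if (threadIdx.x < n)
        data[idx] = __float2half(e * rZ);
}

Under this reading, the one-warp-per-row layout keeps both reductions in registers via shfl.sync, so shared memory is touched only to broadcast the row max and the reciprocal sum; the ex2.approx.ftz.f32 and rcp.approx.ftz.f32 instructions in the PTX are the fast-math approximations of exp and 1/x, and the four bar.sync instructions line up with the four __syncthreads() calls in the sketch.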