moted
// NOTE(review): the line above is the tail of text that starts before this
// excerpt (presumably a "... has been demoted" comment); it is kept verbatim.

// One-byte module-scope placeholder objects (declarations only, no initialisers).
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_e74b88d26thrust6system6detail10sequential3seqE[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_e74b88d26thrust12placeholders2_1E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_e74b88d26thrust12placeholders2_2E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_e74b88d26thrust12placeholders2_3E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_e74b88d26thrust12placeholders2_4E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_e74b88d26thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_e74b88d26thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_e74b88d26thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_e74b88d26thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_e74b88d26thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_e74b88d26thrust12placeholders3_10E[1];

//-----------------------------------------------------------------------------
// SoftMaxKernel_fp32_medium
//
// In-place fp32 softmax: every in-range thread loads one float, the block
// reduces to a maximum, each thread forms exp(x - max), the block reduces to
// a sum, and each thread stores exp(x - max) * (1 / sum) back to the address
// it loaded from.
//
// Param block (16 bytes, 8-byte aligned):
//   +0  u64  generic address of the float buffer (converted to global space)
//   +8  u32  n, the number of elements handled by one block
//
// Element index for a thread is n * ctaid.x + tid.x.  Threads with
// tid.x >= n (unsigned compare) neither load nor store; they take part in the
// reductions with -FLT_MAX (max pass) and 0.0 (sum pass).
//
// Shared memory:
//   tempStorage[24]  one float per warp at byte offset 4 + 4 * (tid.x / 32)
//   rZ               block-wide broadcast value (max first, then 1/sum)
//
// NOTE(review): thread 0 combines its own warp's result with the slots at
// byte offsets 8, 12 and 16 only, i.e. warps 1..3.  That appears to assume a
// block of at most 128 threads (consistent with "Lj128" in the mangled shared
// variable names) -- confirm against the launch configuration.
//-----------------------------------------------------------------------------
.visible .entry SoftMaxKernel_fp32_medium(
	.param .align 8 .b8 SoftMaxKernel_fp32_medium_param_0[16]
)
{
	.reg .pred	%p_oob, %p_src, %p_lt, %p_take, %p_gt, %p_valid;
	.reg .pred	%p_lane_nz, %p_tid_nz;
	.reg .f32	%f_x, %f_peer, %f_max, %f_agg, %f_bcast;
	.reg .f32	%f_t, %f_e, %f_got, %f_sum, %f_inv, %f_y;
	.reg .b32	%r_n, %r_cta, %r_tid, %r_idx, %r_lane, %r_send, %r_recv;
	.reg .b32	%r_sign, %r_bias, %r_adj, %r_warp, %r_woff, %r_tmp, %r_slot;
	.reg .b64	%rd_arg, %rd_base, %rd_off, %rd_elem;

	// demoted variable
	.shared .align 4 .b8 _ZZ21nchwSoftmaxAxisWSmallIfLj128EEviPT_E11tempStorage[24];
	// demoted variable
	.shared .align 4 .f32 _ZZ21nchwSoftmaxAxisWSmallIfLj128EEviPT_E2rZ;

	// ---- arguments and per-thread element address ---------------------------
	ld.param.u32		%r_n, [SoftMaxKernel_fp32_medium_param_0+8];
	ld.param.u64		%rd_arg, [SoftMaxKernel_fp32_medium_param_0];
	cvta.to.global.u64	%rd_base, %rd_arg;
	mov.u32			%r_cta, %ctaid.x;
	mov.u32			%r_tid, %tid.x;
	mad.lo.s32		%r_idx, %r_n, %r_cta, %r_tid;	// idx = n * ctaid.x + tid.x
	setp.ge.u32		%p_oob, %r_tid, %r_n;		// true when this thread has no element
	mul.wide.s32		%rd_off, %r_idx, 4;		// byte offset, 4 bytes per float
	add.s64			%rd_elem, %rd_base, %rd_off;

	// x = in-range ? *elem : -FLT_MAX (0fFF7FFFFF), the identity for max
	mov.f32			%f_x, 0fFF7FFFFF;
@!%p_oob	ld.global.f32		%f_x, [%rd_elem];

	mov.u32			%r_lane, %laneid;

	// ---- warp-level max: shuffle-down by 1, 2, 4, 8, 16 ----------------------
	// A peer value is accepted only when the source lane (lane + offset) is
	// still inside the warp and the peer is strictly greater than the current
	// running maximum.
	mov.b32			%r_send, %f_x;
	shfl.sync.down.b32	%r_recv, %r_send, 1, 31, 0xffffffff;
	mov.b32			%f_peer, %r_recv;
	setp.lt.s32		%p_src, %r_lane, 31;
	setp.lt.ftz.f32		%p_lt, %f_x, %f_peer;
	and.pred		%p_take, %p_src, %p_lt;
	selp.f32		%f_max, %f_peer, %f_x, %p_take;

	mov.b32			%r_send, %f_max;
	shfl.sync.down.b32	%r_recv, %r_send, 2, 31, 0xffffffff;
	mov.b32			%f_peer, %r_recv;
	setp.lt.s32		%p_src, %r_lane, 30;
	setp.lt.ftz.f32		%p_lt, %f_max, %f_peer;
	and.pred		%p_take, %p_src, %p_lt;
	selp.f32		%f_max, %f_peer, %f_max, %p_take;

	mov.b32			%r_send, %f_max;
	shfl.sync.down.b32	%r_recv, %r_send, 4, 31, 0xffffffff;
	mov.b32			%f_peer, %r_recv;
	setp.lt.s32		%p_src, %r_lane, 28;
	setp.lt.ftz.f32		%p_lt, %f_max, %f_peer;
	and.pred		%p_take, %p_src, %p_lt;
	selp.f32		%f_max, %f_peer, %f_max, %p_take;

	mov.b32			%r_send, %f_max;
	shfl.sync.down.b32	%r_recv, %r_send, 8, 31, 0xffffffff;
	mov.b32			%f_peer, %r_recv;
	setp.lt.s32		%p_src, %r_lane, 24;
	setp.lt.ftz.f32		%p_lt, %f_max, %f_peer;
	and.pred		%p_take, %p_src, %p_lt;
	selp.f32		%f_max, %f_peer, %f_max, %p_take;

	mov.b32			%r_send, %f_max;
	shfl.sync.down.b32	%r_recv, %r_send, 16, 31, 0xffffffff;
	mov.b32			%f_peer, %r_recv;
	setp.lt.s32		%p_src, %r_lane, 16;
	setp.lt.ftz.f32		%p_lt, %f_max, %f_peer;
	and.pred		%p_take, %p_src, %p_lt;
	selp.f32		%f_max, %f_peer, %f_max, %p_take;

	// ---- per-warp slot address: tempStorage + 4 * (tid.x / 32) ---------------
	// Signed divide-by-32 sequence (adds 31 before the shift when tid is
	// negative).  The extra +4 is applied in the store's address operand.
	shr.s32			%r_sign, %r_tid, 31;
	shr.u32			%r_bias, %r_sign, 27;
	add.s32			%r_adj, %r_tid, %r_bias;
	shr.s32			%r_warp, %r_adj, 5;
	shl.b32			%r_woff, %r_warp, 2;
	mov.u32			%r_tmp, _ZZ21nchwSoftmaxAxisWSmallIfLj128EEviPT_E11tempStorage;
	add.s32			%r_slot, %r_tmp, %r_woff;

	// Lane 0 of every warp publishes its warp maximum.
	setp.ne.s32		%p_lane_nz, %r_lane, 0;
@!%p_lane_nz	st.shared.f32		[%r_slot+4], %f_max;
	bar.sync		0;

	// ---- thread 0: fold in the other warps' maxima, broadcast through rZ -----
	setp.ne.s32		%p_tid_nz, %r_tid, 0;
@%p_tid_nz	bra			$L_max_done;

	ld.shared.f32		%f_agg, [_ZZ21nchwSoftmaxAxisWSmallIfLj128EEviPT_E11tempStorage+8];
	setp.gt.ftz.f32		%p_gt, %f_agg, %f_max;
	selp.f32		%f_max, %f_agg, %f_max, %p_gt;
	ld.shared.f32		%f_agg, [_ZZ21nchwSoftmaxAxisWSmallIfLj128EEviPT_E11tempStorage+12];
	setp.gt.ftz.f32		%p_gt, %f_agg, %f_max;
	selp.f32		%f_max, %f_agg, %f_max, %p_gt;
	ld.shared.f32		%f_agg, [_ZZ21nchwSoftmaxAxisWSmallIfLj128EEviPT_E11tempStorage+16];
	setp.gt.ftz.f32		%p_gt, %f_agg, %f_max;
	selp.f32		%f_max, %f_agg, %f_max, %p_gt;
	st.shared.f32		[_ZZ21nchwSoftmaxAxisWSmallIfLj128EEviPT_E2rZ], %f_max;

$L_max_done:
	bar.sync		0;

	// ---- e = exp(x - max), computed as 2^((x - max) * log2(e)) ---------------
	// 0f3FB8AA3B is log2(e) ~= 1.442695.  Out-of-range threads keep e = 0 so
	// they add nothing to the sum.
	mov.f32			%f_e, 0f00000000;
@%p_oob	bra			$L_exp_done;

	ld.shared.f32		%f_bcast, [_ZZ21nchwSoftmaxAxisWSmallIfLj128EEviPT_E2rZ];
	sub.ftz.f32		%f_t, %f_x, %f_bcast;
	mul.ftz.f32		%f_t, %f_t, 0f3FB8AA3B;
	ex2.approx.ftz.f32	%f_e, %f_t;

$L_exp_done:
	// ---- warp-level sum: shuffle-down by 1, 2, 4, 8, 16 ----------------------
	// The shuffle's predicate output says whether the source lane was in
	// range; the add is applied only in that case.
	shfl.sync.down.b32	%f_got|%p_valid, %f_e, 1, 31, 0xffffffff;
@%p_valid	add.f32			%f_got, %f_got, %f_e;
	mov.f32			%f_sum, %f_got;

	shfl.sync.down.b32	%f_got|%p_valid, %f_sum, 2, 31, 0xffffffff;
@%p_valid	add.f32			%f_got, %f_got, %f_sum;
	mov.f32			%f_sum, %f_got;

	shfl.sync.down.b32	%f_got|%p_valid, %f_sum, 4, 31, 0xffffffff;
@%p_valid	add.f32			%f_got, %f_got, %f_sum;
	mov.f32			%f_sum, %f_got;

	shfl.sync.down.b32	%f_got|%p_valid, %f_sum, 8, 31, 0xffffffff;
@%p_valid	add.f32			%f_got, %f_got, %f_sum;
	mov.f32			%f_sum, %f_got;

	shfl.sync.down.b32	%f_got|%p_valid, %f_sum, 16, 31, 0xffffffff;
@%p_valid	add.f32			%f_got, %f_got, %f_sum;
	mov.f32			%f_sum, %f_got;

	// Lane 0 of every warp publishes its warp sum (same slots as the max pass;
	// the barrier after the max broadcast separates the two uses).
@!%p_lane_nz	st.shared.f32		[%r_slot+4], %f_sum;
	bar.sync		0;

	// ---- thread 0: total the warp sums, broadcast the reciprocal via rZ ------
@%p_tid_nz	bra			$L_inv_done;

	ld.shared.f32		%f_agg, [_ZZ21nchwSoftmaxAxisWSmallIfLj128EEviPT_E11tempStorage+8];
	add.ftz.f32		%f_sum, %f_sum, %f_agg;
	ld.shared.f32		%f_agg, [_ZZ21nchwSoftmaxAxisWSmallIfLj128EEviPT_E11tempStorage+12];
	add.ftz.f32		%f_sum, %f_sum, %f_agg;
	ld.shared.f32		%f_agg, [_ZZ21nchwSoftmaxAxisWSmallIfLj128EEviPT_E11tempStorage+16];
	add.ftz.f32		%f_sum, %f_sum, %f_agg;
	rcp.approx.ftz.f32	%f_inv, %f_sum;
	st.shared.f32		[_ZZ21nchwSoftmaxAxisWSmallIfLj128EEviPT_E2rZ], %f_inv;

$L_inv_done:
	bar.sync		0;

	// ---- normalise and write back in place -----------------------------------
@%p_oob	bra			$L_exit;

	ld.shared.f32		%f_bcast, [_ZZ21nchwSoftmaxAxisWSmallIfLj128EEviPT_E2rZ];
	mul.ftz.f32		%f_y, %f_e, %f_bcast;
	st.global.f32		[%rd_elem], %f_y;

$L_exit:
	ret;
}

	// .globl	_ZN3cub11EmptyKernelIvEEvv
// Kernel with no parameters and no work: returns immediately.
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
	ret;
}