// _ZZ16nchwSoftmaxAxisWI6__halfLj256EEviPT_E11tempStorage has been demoted
// _ZZ16nchwSoftmaxAxisWI6__halfLj256EEviPT_E2rZ has been demoted

// Zero-sized placeholder symbols emitted for thrust's tag/placeholder objects.
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_6338dae46thrust6system6detail10sequential3seqE[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_6338dae46thrust12placeholders2_1E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_6338dae46thrust12placeholders2_2E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_6338dae46thrust12placeholders2_3E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_6338dae46thrust12placeholders2_4E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_6338dae46thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_6338dae46thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_6338dae46thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_6338dae46thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_6338dae46thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN41_INTERNAL_63c66d19_10_softmax_cu_6338dae46thrust12placeholders3_10E[1];

//-----------------------------------------------------------------------------
// SoftMaxKernel_fp16_large
//
// In-place numerically-stabilized softmax over one row of fp16 values per
// thread block (mangled template names indicate 256 threads/block; the
// per-thread loops below stride by 256, consistent with that).
//
// Param layout (16 bytes): [+0] u64 pointer to the fp16 data, [+8] u32 row
// length.  Row base = length * blockIdx.x; each thread covers elements
// tid, tid+256, tid+512, ... within the row.
//
// Three phases, each ending in a block-wide reduction through shared memory:
//   1. max of the row            -> rZ (shared)
//   2. exp(x - max), stored back in place; sum of exps -> 1/sum in rZ
//   3. multiply each stored exp by 1/sum
//-----------------------------------------------------------------------------
.visible .entry SoftMaxKernel_fp16_large(
	.param .align 8 .b8 SoftMaxKernel_fp16_large_param_0[16]
)
{
	.reg .pred %p<36>;
	.reg .b16 %rs<6>;
	.reg .f32 %f<83>;
	.reg .b32 %r<71>;
	.reg .b64 %rd<9>;

	// demoted variable
	// Per-warp partial results; warps write at byte offsets 8 + 4*warp
	// (i.e. +8 .. +36 for warps 0..7).  The extra leading bytes are part of
	// the demoted cub-style temp storage layout — TODO confirm against source.
	.shared .align 4 .b8 _ZZ16nchwSoftmaxAxisWI6__halfLj256EEviPT_E11tempStorage[44];
	// demoted variable
	// Broadcast slot: holds the row max after phase 1, then 1/sum after phase 2.
	.shared .align 4 .f32 _ZZ16nchwSoftmaxAxisWI6__halfLj256EEviPT_E2rZ;

	ld.param.u32 %r1, [SoftMaxKernel_fp16_large_param_0+8];   // %r1 = row length
	ld.param.u64 %rd2, [SoftMaxKernel_fp16_large_param_0];    // data pointer
	cvta.to.global.u64 %rd1, %rd2;
	mov.u32 %r11, %ctaid.x;
	mul.lo.s32 %r2, %r1, %r11;                                 // %r2 = row start index
	mov.u32 %r70, %tid.x;
	setp.ge.s32 %p1, %r70, %r1;                                // %p1: tid has no element
	mov.f32 %f78, 0fFF7FFFFF;                                  // seed max with -FLT_MAX
	@%p1 bra $L__BB0_3;

	mov.u32 %r68, %r70;

// Phase 1a: per-thread max over elements tid, tid+256, ... (%f78 = running max).
$L__BB0_2:
	add.s32 %r12, %r68, %r2;
	mul.wide.s32 %rd3, %r12, 2;                                // *2: fp16 elements
	add.s64 %rd4, %rd1, %rd3;
	ld.global.u16 %rs1, [%rd4];
	// begin inline asm
	{ cvt.f32.f16 %f17, %rs1;}
	// end inline asm
	setp.gt.ftz.f32 %p2, %f17, %f78;
	selp.f32 %f78, %f17, %f78, %p2;                            // %f78 = max(%f78, x)
	add.s32 %r68, %r68, 256;                                   // stride = block size
	setp.lt.s32 %p3, %r68, %r1;
	@%p3 bra $L__BB0_2;

// Phase 1b: warp-level max reduction via shfl.sync.down (offsets 1,2,4,8,16).
// Each step only accepts the shuffled value when the source lane is in range
// (lane < 31/30/28/24/16) and strictly greater than the current max.
$L__BB0_3:
	// begin inline asm
	mov.u32 %r13, %laneid;
	// end inline asm
	// begin inline asm
	mov.u32 %r14, %laneid;
	// end inline asm
	mov.b32 %r16, %f78;
	mov.u32 %r17, 1;
	mov.u32 %r38, 31;                                          // shfl clamp = warp width-1
	mov.u32 %r39, -1;                                          // full-warp member mask
	// begin inline asm
	shfl.sync.down.b32 %r15, %r16, %r17, %r38, %r39;
	// end inline asm
	mov.b32 %f18, %r15;
	setp.lt.s32 %p4, %r14, 31;
	setp.lt.ftz.f32 %p5, %f78, %f18;
	and.pred %p6, %p4, %p5;
	selp.f32 %f19, %f18, %f78, %p6;
	mov.b32 %r21, %f19;
	mov.u32 %r22, 2;
	// begin inline asm
	shfl.sync.down.b32 %r20, %r21, %r22, %r38, %r39;
	// end inline asm
	mov.b32 %f20, %r20;
	setp.lt.s32 %p7, %r14, 30;
	setp.lt.ftz.f32 %p8, %f19, %f20;
	and.pred %p9, %p7, %p8;
	selp.f32 %f21, %f20, %f19, %p9;
	mov.b32 %r26, %f21;
	mov.u32 %r27, 4;
	// begin inline asm
	shfl.sync.down.b32 %r25, %r26, %r27, %r38, %r39;
	// end inline asm
	mov.b32 %f22, %r25;
	setp.lt.s32 %p10, %r14, 28;
	setp.lt.ftz.f32 %p11, %f21, %f22;
	and.pred %p12, %p10, %p11;
	selp.f32 %f23, %f22, %f21, %p12;
	mov.b32 %r31, %f23;
	mov.u32 %r32, 8;
	// begin inline asm
	shfl.sync.down.b32 %r30, %r31, %r32, %r38, %r39;
	// end inline asm
	mov.b32 %f24, %r30;
	setp.lt.s32 %p13, %r14, 24;
	setp.lt.ftz.f32 %p14, %f23, %f24;
	and.pred %p15, %p13, %p14;
	selp.f32 %f25, %f24, %f23, %p15;
	mov.b32 %r36, %f25;
	mov.u32 %r37, 16;
	// begin inline asm
	shfl.sync.down.b32 %r35, %r36, %r37, %r38, %r39;
	// end inline asm
	mov.b32 %f26, %r35;
	setp.lt.s32 %p16, %r14, 16;
	setp.lt.ftz.f32 %p17, %f25, %f26;
	and.pred %p18, %p16, %p17;
	selp.f32 %f79, %f26, %f25, %p18;                           // %f79 = warp max (lane 0)
	setp.ne.s32 %p19, %r13, 0;                                 // %p19: lane != 0
	// Signed tid/32 (warp index), then *4 to a byte offset into tempStorage.
	shr.s32 %r40, %r70, 31;
	shr.u32 %r41, %r40, 27;
	add.s32 %r42, %r70, %r41;
	shr.s32 %r43, %r42, 5;
	shl.b32 %r44, %r43, 2;
	mov.u32 %r45, _ZZ16nchwSoftmaxAxisWI6__halfLj256EEviPT_E11tempStorage;
	add.s32 %r46, %r45, %r44;                                  // %r46 = &tempStorage[warp]
	@%p19 bra $L__BB0_5;

	// Lane 0 of each warp publishes its warp max at tempStorage + 8 + 4*warp.
	add.s32 %r66, %r46, 8;
	st.shared.f32 [%r66], %f79;

$L__BB0_5:
	bar.sync 0;
	setp.ne.s32 %p20, %r70, 0;                                 // %p20: tid != 0
	@%p20 bra $L__BB0_7;

	// Thread 0 folds the other warps' maxima (+12..+36, warps 1..7) into its own.
	ld.shared.f32 %f27, [_ZZ16nchwSoftmaxAxisWI6__halfLj256EEviPT_E11tempStorage+12];
	setp.gt.ftz.f32 %p21, %f27, %f79;
	selp.f32 %f28, %f27, %f79, %p21;
	ld.shared.f32 %f29, [_ZZ16nchwSoftmaxAxisWI6__halfLj256EEviPT_E11tempStorage+16];
	setp.gt.ftz.f32 %p22, %f29, %f28;
	selp.f32 %f30, %f29, %f28, %p22;
	ld.shared.f32 %f31, [_ZZ16nchwSoftmaxAxisWI6__halfLj256EEviPT_E11tempStorage+20];
	setp.gt.ftz.f32 %p23, %f31, %f30;
	selp.f32 %f32, %f31, %f30, %p23;
	ld.shared.f32 %f33, [_ZZ16nchwSoftmaxAxisWI6__halfLj256EEviPT_E11tempStorage+24];
	setp.gt.ftz.f32 %p24, %f33, %f32;
	selp.f32 %f34, %f33, %f32, %p24;
	ld.shared.f32 %f35, [_ZZ16nchwSoftmaxAxisWI6__halfLj256EEviPT_E11tempStorage+28];
	setp.gt.ftz.f32 %p25, %f35, %f34;
	selp.f32 %f36, %f35, %f34, %p25;
	ld.shared.f32 %f37, [_ZZ16nchwSoftmaxAxisWI6__halfLj256EEviPT_E11tempStorage+32];
	setp.gt.ftz.f32 %p26, %f37, %f36;
	selp.f32 %f38, %f37, %f36, %p26;
	ld.shared.f32 %f39, [_ZZ16nchwSoftmaxAxisWI6__halfLj256EEviPT_E11tempStorage+36];
	setp.gt.ftz.f32 %p27, %f39, %f38;
	selp.f32 %f79, %f39, %f38, %p27;                           // %f79 = block max

$L__BB0_7:
	@%p20 bra $L__BB0_9;

	// Thread 0 broadcasts the row max through rZ.
	st.shared.f32 [_ZZ16nchwSoftmaxAxisWI6__halfLj256EEviPT_E2rZ], %f79;

$L__BB0_9:
	bar.sync 0;
	mov.f32 %f81, 0f00000000;                                  // %f81 = per-thread exp-sum
	@%p1 bra $L__BB0_12;

	ld.shared.f32 %f7, [_ZZ16nchwSoftmaxAxisWI6__halfLj256EEviPT_E2rZ]; // row max
	mov.u32 %r69, %r70;

// Phase 2a: e = exp(x - max) via ex2(d * log2(e)); store e back in place and
// accumulate the per-thread sum.  0f3FB8AA3B = 1.4426950 = log2(e).
$L__BB0_11:
	add.s32 %r47, %r69, %r2;
	mul.wide.s32 %rd5, %r47, 2;
	add.s64 %rd6, %rd1, %rd5;
	ld.global.u16 %rs2, [%rd6];
	// begin inline asm
	{ cvt.f32.f16 %f42, %rs2;}
	// end inline asm
	sub.ftz.f32 %f44, %f42, %f7;                               // d = x - max  (<= 0)
	mul.ftz.f32 %f45, %f44, 0f3FB8AA3B;                        // d * log2(e)
	ex2.approx.ftz.f32 %f43, %f45;                             // e = 2^(d*log2 e) = exp(d)
	add.ftz.f32 %f81, %f81, %f43;
	// begin inline asm
	{ cvt.rn.f16.f32 %rs3, %f43;}
	// end inline asm
	st.global.u16 [%rd6], %rs3;                                // overwrite x with exp(d)
	add.s32 %r69, %r69, 256;
	setp.lt.s32 %p30, %r69, %r1;
	@%p30 bra $L__BB0_11;

// Phase 2b: warp-level sum reduction (shfl.sync.down with the out-of-range
// predicate guarding the add, offsets 1,2,4,8,16).
$L__BB0_12:
	// begin inline asm
	mov.u32 %r48, %laneid;
	// end inline asm
	mov.u32 %r50, 1;
	mov.u32 %r63, 31;
	mov.u32 %r64, -1;
	// begin inline asm
	{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f81, %r50, %r63, %r64; @p add.f32 r0, r0, %f81; mov.f32 %f46, r0;}
	// end inline asm
	mov.u32 %r53, 2;
	// begin inline asm
	{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f46, %r53, %r63, %r64; @p add.f32 r0, r0, %f46; mov.f32 %f49, r0;}
	// end inline asm
	mov.u32 %r56, 4;
	// begin inline asm
	{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f49, %r56, %r63, %r64; @p add.f32 r0, r0, %f49; mov.f32 %f52, r0;}
	// end inline asm
	mov.u32 %r59, 8;
	// begin inline asm
	{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f52, %r59, %r63, %r64; @p add.f32 r0, r0, %f52; mov.f32 %f55, r0;}
	// end inline asm
	mov.u32 %r62, 16;
	// begin inline asm
	{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f55, %r62, %r63, %r64; @p add.f32 r0, r0, %f55; mov.f32 %f82, r0;}
	// end inline asm
	setp.ne.s32 %p31, %r48, 0;
	@%p31 bra $L__BB0_14;

	// Lane 0 publishes the warp sum (same slot as in phase 1).
	add.s32 %r67, %r46, 8;
	st.shared.f32 [%r67], %f82;

$L__BB0_14:
	bar.sync 0;
	@%p20 bra $L__BB0_16;

	// Thread 0 adds the other warps' partial sums (+12..+36).
	ld.shared.f32 %f61, [_ZZ16nchwSoftmaxAxisWI6__halfLj256EEviPT_E11tempStorage+12];
	add.ftz.f32 %f62, %f82, %f61;
	ld.shared.f32 %f63, [_ZZ16nchwSoftmaxAxisWI6__halfLj256EEviPT_E11tempStorage+16];
	add.ftz.f32 %f64, %f62, %f63;
	ld.shared.f32 %f65, [_ZZ16nchwSoftmaxAxisWI6__halfLj256EEviPT_E11tempStorage+20];
	add.ftz.f32 %f66, %f64, %f65;
	ld.shared.f32 %f67, [_ZZ16nchwSoftmaxAxisWI6__halfLj256EEviPT_E11tempStorage+24];
	add.ftz.f32 %f68, %f66, %f67;
	ld.shared.f32 %f69, [_ZZ16nchwSoftmaxAxisWI6__halfLj256EEviPT_E11tempStorage+28];
	add.ftz.f32 %f70, %f68, %f69;
	ld.shared.f32 %f71, [_ZZ16nchwSoftmaxAxisWI6__halfLj256EEviPT_E11tempStorage+32];
	add.ftz.f32 %f72, %f70, %f71;
	ld.shared.f32 %f73, [_ZZ16nchwSoftmaxAxisWI6__halfLj256EEviPT_E11tempStorage+36];
	add.ftz.f32 %f82, %f72, %f73;                              // %f82 = block exp-sum

$L__BB0_16:
	@%p20 bra $L__BB0_18;

	// Thread 0 broadcasts 1/sum (approximate reciprocal) through rZ.
	rcp.approx.ftz.f32 %f74, %f82;
	st.shared.f32 [_ZZ16nchwSoftmaxAxisWI6__halfLj256EEviPT_E2rZ], %f74;

$L__BB0_18:
	bar.sync 0;
	@%p1 bra $L__BB0_21;

	ld.shared.f32 %f14, [_ZZ16nchwSoftmaxAxisWI6__halfLj256EEviPT_E2rZ]; // 1/sum

// Phase 3: scale each stored exp value by 1/sum (softmax result, in place).
$L__BB0_20:
	add.s32 %r65, %r70, %r2;
	mul.wide.s32 %rd7, %r65, 2;
	add.s64 %rd8, %rd1, %rd7;
	ld.global.u16 %rs4, [%rd8];
	// begin inline asm
	{ cvt.f32.f16 %f75, %rs4;}
	// end inline asm
	mul.ftz.f32 %f76, %f75, %f14;                              // e * (1/sum)
	// begin inline asm
	{ cvt.rn.f16.f32 %rs5, %f76;}
	// end inline asm
	st.global.u16 [%rd8], %rs5;
	add.s32 %r70, %r70, 256;
	setp.lt.s32 %p35, %r70, %r1;
	@%p35 bra $L__BB0_20;

$L__BB0_21:
	ret;

}
	// .globl	_ZN3cub11EmptyKernelIvEEvv

// cub's no-op kernel, instantiated as part of the CUB/thrust machinery.
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
	ret;
}