// .globl	mvn_normalize_fp32

.visible .entry mvn_normalize_fp32(
	.param .align 8 .b8 mvn_normalize_fp32_param_0[96]
)
{
	.reg .pred 	%p<8>;
	.reg .b16 	%rs<20>;
	.reg .f32 	%f<24>;
	.reg .b32 	%r<27>;
	.reg .f64 	%fd<3>;
	.reg .b64 	%rd<24>;

	// Unpack the 96-byte parameter block: shape fields, flag bytes,
	// epsilon, and six pointers.
	ld.param.v2.u32 	{%r12, %r13}, [mvn_normalize_fp32_param_0+48];
	ld.param.v2.u32 	{%r14, %r15}, [mvn_normalize_fp32_param_0+56];
	ld.param.v2.u32 	{%r16, %r17}, [mvn_normalize_fp32_param_0+64];
	ld.param.v4.u8 	{%rs9, %rs10, %rs11, %rs12}, [mvn_normalize_fp32_param_0+88];
	ld.param.f32 	%f4, [mvn_normalize_fp32_param_0+84];
	ld.param.u64 	%rd7, [mvn_normalize_fp32_param_0+40];
	ld.param.u64 	%rd6, [mvn_normalize_fp32_param_0+32];
	ld.param.u64 	%rd5, [mvn_normalize_fp32_param_0+24];
	ld.param.u64 	%rd4, [mvn_normalize_fp32_param_0+16];
	ld.param.u64 	%rd3, [mvn_normalize_fp32_param_0+8];
	ld.param.u64 	%rd2, [mvn_normalize_fp32_param_0];

	// Global element index: ctaid.x * 512 + tid.x (512-thread blocks),
	// bounds-checked against the total element count in %r16.
	mov.u32 	%r20, %ctaid.x;
	shl.b32 	%r21, %r20, 9;
	mov.u32 	%r22, %tid.x;
	add.s32 	%r2, %r21, %r22;
	setp.ge.u32 	%p1, %r2, %r16;
	@%p1 bra 	$L__BB0_4;

	cvta.to.global.u64 	%rd8, %rd4;
	cvta.to.global.u64 	%rd9, %rd3;
	cvta.to.global.u64 	%rd10, %rd2;
	// Per-group statistics: group = index / group_size (%r17). The two
	// f64 loads fetch the group's sum of squares (%fd1) and sum (%fd2).
	div.u32 	%r23, %r2, %r17;
	mul.wide.u32 	%rd11, %r23, 8;
	add.s64 	%rd12, %rd8, %rd11;
	ld.global.f64 	%fd1, [%rd12];
	cvt.rn.f32.f64 	%f5, %fd1;
	add.s64 	%rd13, %rd9, %rd11;
	ld.global.f64 	%fd2, [%rd13];
	cvt.rn.f32.f64 	%f6, %fd2;
	// mean = sum / n; variance = (sumsq - sum*sum/n) / n.
	mul.f32 	%f7, %f6, %f6;
	cvt.rn.f32.u32 	%f8, %r17;
	div.rn.f32 	%f9, %f7, %f8;
	sub.f32 	%f10, %f5, %f9;
	div.rn.f32 	%f11, %f10, %f8;
	div.rn.f32 	%f12, %f6, %f8;
	add.f32 	%f13, %f4, %f11;            // eps + variance
	rsqrt.approx.f32 	%f14, %f13;         // 1 / sqrt(eps + variance)
	cvt.u64.u32 	%rd1, %r2;
	mul.wide.u32 	%rd14, %r2, 4;
	add.s64 	%rd15, %rd10, %rd14;
	ld.global.f32 	%f15, [%rd15];
	sub.f32 	%f16, %f15, %f12;           // x - mean
	// Multiply by 1/stddev only when the first flag byte is zero.
	setp.eq.s16 	%p2, %rs9, 0;
	mul.f32 	%f17, %f14, %f16;
	selp.f32 	%f23, %f17, %f16, %p2;
	// Skip the affine step when the third flag byte is zero.
	setp.eq.s16 	%p3, %rs11, 0;
	@%p3 bra 	$L__BB0_3;

	// Optional per-channel affine: channel = (index / (%r14*%r15)) % %r13,
	// then y = y * scale[channel] + bias[channel].
	mul.lo.s32 	%r24, %r14, %r15;
	div.u32 	%r25, %r2, %r24;
	rem.u32 	%r26, %r25, %r13;
	cvta.to.global.u64 	%rd16, %rd5;
	mul.wide.u32 	%rd17, %r26, 4;
	add.s64 	%rd18, %rd16, %rd17;
	cvta.to.global.u64 	%rd19, %rd6;
	add.s64 	%rd20, %rd19, %rd17;
	ld.global.f32 	%f18, [%rd18];
	ld.global.f32 	%f19, [%rd20];
	fma.rn.f32 	%f23, %f23, %f18, %f19;

$L__BB0_3:
	// Clamp negatives to zero when the fourth flag byte is set (ReLU-style),
	// flush NaN to zero (|y| <= +inf fails only for NaN), then store.
	setp.lt.f32 	%p4, %f23, 0f00000000;
	setp.ne.s16 	%p5, %rs12, 0;
	and.pred 	%p6, %p5, %p4;
	selp.f32 	%f20, 0f00000000, %f23, %p6;
	abs.f32 	%f21, %f20;
	setp.le.f32 	%p7, %f21, 0f7F800000;
	selp.f32 	%f22, %f20, 0f00000000, %p7;
	cvta.to.global.u64 	%rd21, %rd7;
	shl.b64 	%rd22, %rd1, 2;
	add.s64 	%rd23, %rd21, %rd22;
	st.global.f32 	[%rd23], %f22;

$L__BB0_4:
	ret;
}
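
// ---- Hypothetical CUDA C reconstruction of the PTX above ----
// A minimal sketch, not the original source: the struct field names, the
// flag semantics, and the assumed NCHW channel layout are inferred from the
// loads and address arithmetic; only the parameter offsets and the math are
// taken directly from the PTX. The per-group statistics arrive precomputed
// as double-precision sums, matching the ld.global.f64 / cvt.rn.f32.f64
// pairs above.

struct MvnParams {                        // 96-byte .param block
    const float  *x;                      // +0   input elements
    const double *sum;                    // +8   per-group sum
    const double *sumsq;                  // +16  per-group sum of squares
    const float  *scale;                  // +24  per-channel scale
    const float  *bias;                   // +32  per-channel bias
    float        *y;                      // +40  output elements
    unsigned n, c;                        // +48, +52 (n is never read)
    unsigned h, w;                        // +56, +60
    unsigned total, group_size;           // +64, +68
    unsigned pad0, pad1, pad2;            // +72..+83, not referenced
    float eps;                            // +84
    unsigned char flag0, flag1, flag2, flag3; // +88..+91 (flag1 unused)
};

extern "C" __global__ void mvn_normalize_fp32(MvnParams p)
{
    // shl.b32 %r21, %r20, 9 implies 512 threads per block.
    unsigned i = blockIdx.x * 512u + threadIdx.x;
    if (i >= p.total) return;

    unsigned g   = i / p.group_size;
    float n      = (float)p.group_size;
    float sum    = (float)p.sum[g];
    float sumsq  = (float)p.sumsq[g];
    float mean   = sum / n;
    float var    = (sumsq - sum * sum / n) / n;  // E[x^2] - E[x]^2
    float invstd = rsqrtf(p.eps + var);          // rsqrt.approx.f32

    float v = p.x[i] - mean;
    if (p.flag0 == 0) v *= invstd;   // nonzero flag0 skips variance scaling

    if (p.flag2 != 0) {              // optional per-channel affine
        unsigned ch = (i / (p.h * p.w)) % p.c;
        v = fmaf(v, p.scale[ch], p.bias[ch]);
    }
    if (p.flag3 != 0 && v < 0.f) v = 0.f;  // optional clamp of negatives
    if (isnan(v)) v = 0.f;                 // flush NaN to zero before store
    p.y[i] = v;
}

// Note on the design implied by the PTX: computing variance from the
// running sums as (sumsq - sum*sum/n) / n is the single-pass formula,
// which avoids a second read over the data at the cost of potential
// cancellation; the double-precision accumulators mitigate that.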