    .shared .align 4 .f32 _ZZ20LayerNormSmallKernelI6__halffEvN8dxml_mvn15MVNKernelParamsEE4temp[128];
    // demoted variable
    .shared .align 4 .f32 _ZZ20LayerNormSmallKernelI6__halffEvN8dxml_mvn15MVNKernelParamsEE2mu;
    // demoted variable
    .shared .align 4 .f32 _ZZ20LayerNormSmallKernelI6__halffEvN8dxml_mvn15MVNKernelParamsEE6rsigma;
    ld.param.v4.u8 {%rs19, %rs20, %rs21, %rs22}, [MVNKernel_fp16_small_param_0+44];
    ld.param.v2.u8 {%rs23, %rs24}, [MVNKernel_fp16_small_param_0+48];
    ld.param.v2.u32 {%r46, %r47}, [MVNKernel_fp16_small_param_0+56];
    ld.param.v2.u32 {%r48, %r49}, [MVNKernel_fp16_small_param_0+64];
    ld.param.v2.u32 {%r50, %r51}, [MVNKernel_fp16_small_param_0+72];
    ld.param.v4.u8 {%rs25, %rs26, %rs27, %rs28}, [MVNKernel_fp16_small_param_0+84];
    ld.param.u32 %r45, [MVNKernel_fp16_small_param_0+80];
    ld.param.u8 %rs14, [MVNKernel_fp16_small_param_0+50];
    ld.param.f32 %f28, [MVNKernel_fp16_small_param_0+40];
    ld.param.f32 %f27, [MVNKernel_fp16_small_param_0+36];
    ld.param.u32 %r37, [MVNKernel_fp16_small_param_0+32];
    ld.param.u64 %rd4, [MVNKernel_fp16_small_param_0+24];
    ld.param.u64 %rd3, [MVNKernel_fp16_small_param_0+16];
    ld.param.u64 %rd2, [MVNKernel_fp16_small_param_0+8];
    ld.param.u64 %rd1, [MVNKernel_fp16_small_param_0];
    mov.u32 %r1, %ctaid.x;
    mov.u32 %r3, %tid.x;
    mad.lo.s32 %r4, %r37, %r1, %r3;
    setp.ge.u32 %p1, %r3, %r37;
    mov.f32 %f56, 0f00000000;
    mov.f32 %f50, %f56;
    @%p1 bra $L__BB0_2;
    cvta.to.global.u64 %rd5, %rd1;
    mul.wide.s32 %rd6, %r4, 2;
    add.s64 %rd7, %rd5, %rd6;
    ld.global.u16 %rs29, [%rd7];
    // begin inline asm
    { cvt.f32.f16 %f50, %rs29;}
    // end inline asm
$L__BB0_2:
    setp.eq.s16 %p2, %rs19, 0;
    @%p2 bra $L__BB0_17;
    mul.ftz.f32 %f55, %f27, %f50;
    mov.u32 %r110, WARP_SZ;
    setp.lt.s32 %p3, %r110, 2;
    @%p3 bra $L__BB0_6;
    mov.u32 %r109, %r110;
$L__BB0_5:
    mov.b32 %r52, %f55;
    shr.u32 %r53, %r109, 31;
    add.s32 %r54, %r109, %r53;
    shr.s32 %r7, %r54, 1;
    mov.u32 %r55, 31;
    mov.u32 %r56, -1;
    shfl.sync.down.b32 %r57|%p4, %r52, %r7, %r55, %r56;
    mov.b32 %f32, %r57;
    add.ftz.f32 %f55, %f55, %f32;
    setp.gt.s32 %p5, %r109, 3;
    mov.u32 %r109, %r7;
    @%p5 bra $L__BB0_5;
$L__BB0_6:
    rem.u32 %r8, %r3, %r110;
    setp.ne.s32 %p6, %r8, 0;
    @%p6 bra $L__BB0_8;
    div.u32 %r58, %r3, %r110;
    shl.b32 %r59, %r58, 2;
    mov.u32 %r60, _ZZ20LayerNormSmallKernelI6__halffEvN8dxml_mvn15MVNKernelParamsEE4temp;
    add.s32 %r61, %r60, %r59;
    st.shared.f32 [%r61], %f55;
$L__BB0_8:
    bar.sync 0;
    setp.le.u32 %p7, %r110, %r3;
    @%p7 bra $L__BB0_14;
    mov.u32 %r62, %ntid.x;
    div.u32 %r63, %r62, %r110;
    setp.ge.s32 %p8, %r8, %r63;
    mov.f32 %f55, 0f00000000;
    @%p8 bra $L__BB0_11;
    shl.b32 %r64, %r8, 2;
    mov.u32 %r65, _ZZ20LayerNormSmallKernelI6__halffEvN8dxml_mvn15MVNKernelParamsEE4temp;
    add.s32 %r66, %r65, %r64;
    ld.shared.f32 %f55, [%r66];
$L__BB0_11:
    @%p3 bra $L__BB0_14;
$L__BB0_13:
    mov.b32 %r67, %f55;
    shr.u32 %r68, %r110, 31;
    add.s32 %r69, %r110, %r68;
    shr.s32 %r10, %r69, 1;
    mov.u32 %r70, 31;
    mov.u32 %r71, -1;
    shfl.sync.down.b32 %r72|%p10, %r67, %r10, %r70, %r71;
    mov.b32 %f34, %r72;
    add.ftz.f32 %f55, %f55, %f34;
    setp.gt.s32 %p11, %r110, 3;
    mov.u32 %r110, %r10;
    @%p11 bra $L__BB0_13;
$L__BB0_14:
    bar.sync 0;
    setp.ne.s32 %p12, %r3, 0;
    @%p12 bra $L__BB0_16;
    st.shared.f32 [_ZZ20LayerNormSmallKernelI6__halffEvN8dxml_mvn15MVNKernelParamsEE2mu], %f55;
$L__BB0_16:
    bar.sync 0;
    ld.shared.f32 %f56, [_ZZ20LayerNormSmallKernelI6__halffEvN8dxml_mvn15MVNKernelParamsEE2mu];
$L__BB0_17:
    setp.eq.s16 %p13, %rs20, 0;
    mov.f32 %f62, 0f3F800000;
    @%p13 bra $L__BB0_32;
    setp.lt.u32 %p14, %r3, %r37;
    sub.ftz.f32 %f36, %f50, %f56;
    mul.ftz.f32 %f37, %f36, %f36;
    mul.ftz.f32 %f38, %f27, %f37;
    selp.f32 %f61, %f38, 0f00000000, %p14;
    mov.u32 %r112, WARP_SZ;
    setp.lt.s32 %p15, %r112, 2;
    @%p15 bra $L__BB0_21;
    mov.u32 %r111, %r112;
$L__BB0_20:
    mov.b32 %r73, %f61;
    shr.u32 %r74, %r111, 31;
    add.s32 %r75, %r111, %r74;
    shr.s32 %r13, %r75, 1;
    mov.u32 %r76, 31;
    mov.u32 %r77, -1;
    shfl.sync.down.b32 %r78|%p16, %r73, %r13, %r76, %r77;
    mov.b32 %f39, %r78;
    add.ftz.f32 %f61, %f61, %f39;
    setp.gt.s32 %p17, %r111, 3;
    mov.u32 %r111, %r13;
    @%p17 bra $L__BB0_20;
$L__BB0_21:
    rem.u32 %r14, %r3, %r112;
    setp.ne.s32 %p18, %r14, 0;
    @%p18 bra $L__BB0_23;
    div.u32 %r79, %r3, %r112;
    shl.b32 %r80, %r79, 2;
    mov.u32 %r81, _ZZ20LayerNormSmallKernelI6__halffEvN8dxml_mvn15MVNKernelParamsEE4temp;
    add.s32 %r82, %r81, %r80;
    st.shared.f32 [%r82], %f61;
$L__BB0_23:
    bar.sync 0;
    setp.le.u32 %p19, %r112, %r3;
    @%p19 bra $L__BB0_29;
    mov.u32 %r83, %ntid.x;
    div.u32 %r84, %r83, %r112;
    setp.ge.s32 %p20, %r14, %r84;
    mov.f32 %f61, 0f00000000;
    @%p20 bra $L__BB0_26;
    shl.b32 %r85, %r14, 2;
    mov.u32 %r86, _ZZ20LayerNormSmallKernelI6__halffEvN8dxml_mvn15MVNKernelParamsEE4temp;
    add.s32 %r87, %r86, %r85;
    ld.shared.f32 %f61, [%r87];
$L__BB0_26:
    @%p15 bra $L__BB0_29;
$L__BB0_28:
    mov.b32 %r88, %f61;
    shr.u32 %r89, %r112, 31;
    add.s32 %r90, %r112, %r89;
    shr.s32 %r16, %r90, 1;
    mov.u32 %r91, 31;
    mov.u32 %r92, -1;
    shfl.sync.down.b32 %r93|%p22, %r88, %r16, %r91, %r92;
    mov.b32 %f41, %r93;
    add.ftz.f32 %f61, %f61, %f41;
    setp.gt.s32 %p23, %r112, 3;
    mov.u32 %r112, %r16;
    @%p23 bra $L__BB0_28;
$L__BB0_29:
    bar.sync 0;
    setp.ne.s32 %p24, %r3, 0;
    @%p24 bra $L__BB0_31;
    add.ftz.f32 %f42, %f28, %f61;
    rsqrt.approx.ftz.f32 %f43, %f42;
    st.shared.f32 [_ZZ20LayerNormSmallKernelI6__halffEvN8dxml_mvn15MVNKernelParamsEE6rsigma], %f43;
$L__BB0_31:
    bar.sync 0;
    ld.shared.f32 %f62, [_ZZ20LayerNormSmallKernelI6__halffEvN8dxml_mvn15MVNKernelParamsEE6rsigma];
$L__BB0_32:
    setp.eq.s16 %p25, %rs14, 0;
    @%p25 bra $L__BB0_39;
    setp.ne.s16 %p26, %rs24, 0;
    selp.b32 %r113, %r1, 0, %p26;
    setp.eq.s16 %p27, %rs26, 0;
    @%p27 bra $L__BB0_37;
    setp.eq.s16 %p28, %rs25, 0;
    @%p28 bra $L__BB0_36;
    mul.lo.s32 %r94, %r47, %r48;
    div.u32 %r114, %r3, %r94;
    mul.lo.s32 %r95, %r114, %r94;
    sub.s32 %r96, %r3, %r95;
    div.u32 %r115, %r96, %r48;
    mul.lo.s32 %r97, %r115, %r48;
    sub.s32 %r116, %r96, %r97;
    bra.uni $L__BB0_38;
$L__BB0_39:
    setp.ne.s16 %p29, %rs24, 0;
    selp.b32 %r117, %r4, %r3, %p29;
    bra.uni $L__BB0_40;
$L__BB0_37:
    div.u32 %r26, %r113, %r46;
    mul.lo.s32 %r102, %r26, %r46;
    sub.s32 %r114, %r113, %r102;
    div.u32 %r115, %r3, %r48;
    mul.lo.s32 %r103, %r115, %r48;
    sub.s32 %r116, %r3, %r103;
    mov.u32 %r113, %r26;
    bra.uni $L__BB0_38;
$L__BB0_36:
    mul.lo.s32 %r98, %r48, %r46;
    div.u32 %r115, %r3, %r98;
    mul.lo.s32 %r99, %r115, %r98;
    sub.s32 %r100, %r3, %r99;
    div.u32 %r116, %r100, %r46;
    mul.lo.s32 %r101, %r116, %r46;
    sub.s32 %r114, %r100, %r101;
$L__BB0_38:
    mul.lo.s32 %r104, %r113, %r49;
    mad.lo.s32 %r105, %r114, %r50, %r104;
    mad.lo.s32 %r106, %r115, %r51, %r105;
    mad.lo.s32 %r117, %r116, %r45, %r106;
$L__BB0_40:
    @%p1 bra $L__BB0_48;
    setp.eq.s16 %p31, %rs22, 0;
    @%p31 bra $L__BB0_43;
    cvta.to.global.u64 %rd8, %rd2;
    mul.wide.u32 %rd9, %r117, 2;
    add.s64 %rd10, %rd8, %rd9;
    ld.global.u16 %rs45, [%rd10];
    bra.uni $L__BB0_44;
$L__BB0_43:
    mov.u32 %r107, 1;
    // begin inline asm
    cvt.rn.f16.s32 %rs45, %r107;
    // end inline asm
$L__BB0_44:
    // begin inline asm
    { cvt.f32.f16 %f44, %rs45;}
    // end inline asm
    setp.eq.s16 %p32, %rs21, 0;
    @%p32 bra $L__BB0_46;
    cvta.to.global.u64 %rd11, %rd3;
    mul.wide.u32 %rd12, %r117, 2;
    add.s64 %rd13, %rd11, %rd12;
    ld.global.u16 %rs46, [%rd13];
    bra.uni $L__BB0_47;
$L__BB0_46:
    mov.u32 %r108, 0;
    // begin inline asm
    cvt.rn.f16.s32 %rs46, %r108;
    // end inline asm
$L__BB0_47:
    // begin inline asm
    { cvt.f32.f16 %f45, %rs46;}
    // end inline asm
    sub.ftz.f32 %f47, %f50, %f56;
    mul.ftz.f32 %f48, %f47, %f62;
    fma.rn.ftz.f32 %f49, %f48, %f44, %f45;
    setp.lt.ftz.f32 %p33, %f49, 0f00000000;
    setp.ne.s16 %p34, %rs23, 0;
    and.pred %p35, %p34, %p33;
    selp.f32 %f46, 0f00000000, %f49, %p35;
    // begin inline asm
    { cvt.rn.f16.f32 %rs43, %f46;}
    // end inline asm
    cvta.to.global.u64 %rd14, %rd4;
    mul.wide.s32 %rd15, %r4, 2;
    add.s64 %rd16, %rd14, %rd15;
    st.global.u16 [%rd16], %rs43;
$L__BB0_48:
    ret;
}
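
For reference, the listing follows the usual small-row layer-norm pattern: one row per block, a mean and then a variance reduction built from shfl.sync.down with a halving offset (the $L__BB0_5, $L__BB0_13, $L__BB0_20 and $L__BB0_28 loops), per-warp partials exchanged through the shared temp array around bar.sync 0, thread 0 publishing mu and rsigma (rsqrt.approx of %f28, presumably an epsilon, plus the reduced sum) through shared memory, and a final normalize, scale, bias, optional clamp-at-zero and fp16 store. The CUDA sketch below is a hypothetical reconstruction of that logic, not the original dxml_mvn kernel: the parameter names (len, invLen, eps, clampNegative), the null-pointer checks standing in for the scale/bias/clamp flags held in %rs22, %rs21 and %rs23, and the simple per-element indexing of scale and bias (the PTX computes a layout-dependent multi-dimensional index between $L__BB0_32 and $L__BB0_40) are all assumptions made for readability.

#include <cuda_fp16.h>

// Hypothetical sketch of the reduction/normalization structure above;
// names and the simplified parameter layout are assumptions.
__global__ void LayerNormSmallSketch(const __half* in, const __half* scale,
                                     const __half* bias, __half* out,
                                     int len, float invLen, float eps,
                                     bool clampNegative)
{
    // One row per block, one element per thread (len <= blockDim.x).
    const int tid = threadIdx.x;
    const int idx = blockIdx.x * len + tid;
    const float x = (tid < len) ? __half2float(in[idx]) : 0.0f;

    __shared__ float temp[128];   // per-warp partial sums ("temp" in the PTX)
    __shared__ float mu, rsigma;

    const int warp = tid / warpSize;
    const int lane = tid % warpSize;
    const int numWarps = blockDim.x / warpSize;

    // Mean: intra-warp shuffle-down reduction, then a cross-warp pass
    // through shared memory, with thread 0 publishing the result.
    float v = x * invLen;
    for (int off = warpSize / 2; off > 0; off /= 2)
        v += __shfl_down_sync(0xffffffffu, v, off);
    if (lane == 0) temp[warp] = v;
    __syncthreads();
    if (tid < warpSize) {
        v = (lane < numWarps) ? temp[lane] : 0.0f;
        for (int off = warpSize / 2; off > 0; off /= 2)
            v += __shfl_down_sync(0xffffffffu, v, off);
        if (tid == 0) mu = v;
    }
    __syncthreads();

    // Variance: the same reduction over (x - mu)^2 * invLen,
    // then rsigma = rsqrt(eps + var).
    const float d = x - mu;
    v = (tid < len) ? d * d * invLen : 0.0f;
    for (int off = warpSize / 2; off > 0; off /= 2)
        v += __shfl_down_sync(0xffffffffu, v, off);
    if (lane == 0) temp[warp] = v;
    __syncthreads();
    if (tid < warpSize) {
        v = (lane < numWarps) ? temp[lane] : 0.0f;
        for (int off = warpSize / 2; off > 0; off /= 2)
            v += __shfl_down_sync(0xffffffffu, v, off);
        if (tid == 0) rsigma = rsqrtf(eps + v);
    }
    __syncthreads();

    // Normalize, apply scale/bias, optionally clamp negatives to zero,
    // and store the result as fp16.
    if (tid < len) {
        const float g = scale ? __half2float(scale[tid]) : 1.0f;
        const float b = bias  ? __half2float(bias[tid])  : 0.0f;
        float y = (x - mu) * rsigma * g + b;
        if (clampNegative && y < 0.0f) y = 0.0f;
        out[idx] = __float2half_rn(y);
    }
}

The halving shuffle offsets and the __syncthreads() pairs around each temp exchange correspond to the shr/shfl.sync.down.b32 loops and bar.sync 0 points in the listing; the .ftz and rsqrt.approx forms there are what such code produces when compiled with flush-to-zero and fast-math options, which the sketch does not force.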