.shared .align 4 .f32 _ZZ24LayerNormSmallKernel_vecI6float4EvN8dxml_mvn15MVNKernelParamsEE2mu;
// demoted variable
.shared .align 4 .f32 _ZZ24LayerNormSmallKernel_vecI6float4EvN8dxml_mvn15MVNKernelParamsEE6rsigma;

    // Unpack the single params struct: six byte flags, two scalars
    // (%f134 and %f135; by their later use, presumably 1/N and an epsilon),
    // the element count %r17, and four buffer pointers.
    ld.param.v4.u8 {%rs12, %rs13, %rs14, %rs15}, [MVNKernel_vec_fp32_small_param_0+44];
    ld.param.v2.u8 {%rs16, %rs17}, [MVNKernel_vec_fp32_small_param_0+48];
    ld.param.f32 %f135, [MVNKernel_vec_fp32_small_param_0+40];
    ld.param.f32 %f134, [MVNKernel_vec_fp32_small_param_0+36];
    ld.param.u32 %r17, [MVNKernel_vec_fp32_small_param_0+32];
    ld.param.u64 %rd4, [MVNKernel_vec_fp32_small_param_0+24];
    ld.param.u64 %rd3, [MVNKernel_vec_fp32_small_param_0+16];
    ld.param.u64 %rd2, [MVNKernel_vec_fp32_small_param_0+8];
    ld.param.u64 %rd1, [MVNKernel_vec_fp32_small_param_0];
    // %r1 = N/4 float4s per row; block %ctaid.x owns one row and thread
    // %tid.x owns one float4. %p1 marks lanes past the end of the row.
    shr.u32 %r1, %r17, 2;
    mov.u32 %r32, %ctaid.x;
    mov.u32 %r2, %tid.x;
    mad.lo.s32 %r3, %r1, %r32, %r2;
    setp.ge.u32 %p1, %r2, %r1;
    mov.f32 %f259, 0f00000000;
    mov.f32 %f231, %f259;
    mov.f32 %f232, %f259;
    mov.f32 %f233, %f259;
    mov.f32 %f234, %f259;
    @%p1 bra $L__BB0_2;

    // In-range lanes load their float4; the rest keep zeros.
    cvta.to.global.u64 %rd5, %rd1;
    mul.wide.s32 %rd6, %r3, 16;
    add.s64 %rd7, %rd5, %rd6;
    ld.global.v4.f32 {%f234, %f233, %f232, %f231}, [%rd7];

$L__BB0_2:
    // Mean pass; skipped when flag %rs12 is clear, leaving the mean at 0.
    setp.eq.s16 %p2, %rs12, 0;
    @%p2 bra $L__BB0_18;

    mov.u32 %r100, WARP_SZ;
    setp.lt.s32 %p3, %r100, 2;
    mov.f32 %f239, %f234;
    mov.f32 %f240, %f233;
    mov.f32 %f241, %f232;
    mov.f32 %f242, %f231;
    @%p3 bra $L__BB0_6;

    mov.u32 %r99, %r100;
    mov.f32 %f242, %f231;
    mov.f32 %f241, %f232;
    mov.f32 %f240, %f233;
    mov.f32 %f239, %f234;

$L__BB0_5:
    // Intra-warp tree reduction: each trip halves the active span %r99 and
    // adds the value shuffled down %r6 lanes, per float4 component.
    mov.b32 %r33, %f239;
    shr.u32 %r34, %r99, 31;
    add.s32 %r35, %r99, %r34;
    shr.s32 %r6, %r35, 1;
    mov.u32 %r36, 31;
    mov.u32 %r37, -1;
    shfl.sync.down.b32 %r38|%p4, %r33, %r6, %r36, %r37;
    mov.b32 %f145, %r38;
    add.ftz.f32 %f239, %f239, %f145;
    mov.b32 %r39, %f240;
    shfl.sync.down.b32 %r40|%p5, %r39, %r6, %r36, %r37;
    mov.b32 %f146, %r40;
    add.ftz.f32 %f240, %f240, %f146;
    mov.b32 %r41, %f241;
    shfl.sync.down.b32 %r42|%p6, %r41, %r6, %r36, %r37;
    mov.b32 %f147, %r42;
    add.ftz.f32 %f241, %f241, %f147;
    mov.b32 %r43, %f242;
    shfl.sync.down.b32 %r44|%p7, %r43, %r6, %r36, %r37;
    mov.b32 %f148, %r44;
    add.ftz.f32 %f242, %f242, %f148;
    setp.gt.s32 %p8, %r99, 3;
    mov.u32 %r99, %r6;
    @%p8 bra $L__BB0_5;

$L__BB0_6:
    // Lane 0 of each warp stages its partial sums in shared temp[].
    rem.u32 %r7, %r2, %r100;
    setp.ne.s32 %p9, %r7, 0;
    @%p9 bra $L__BB0_8;

    div.u32 %r45, %r2, %r100;
    shl.b32 %r46, %r45, 4;
    mov.u32 %r47, _ZZ24LayerNormSmallKernel_vecI6float4EvN8dxml_mvn15MVNKernelParamsEE4temp;
    add.s32 %r48, %r47, %r46;
    st.shared.v4.f32 [%r48], {%f239, %f240, %f241, %f242};

$L__BB0_8:
    // The first warp reloads the per-warp partials (zeros beyond the warp
    // count) and folds them with the same shuffle loop.
    bar.sync 0;
    setp.le.u32 %p10, %r100, %r2;
    @%p10 bra $L__BB0_15;

    mov.u32 %r49, %ntid.x;
    div.u32 %r50, %r49, %r100;
    setp.ge.s32 %p11, %r7, %r50;
    mov.f32 %f239, 0f00000000;
    mov.f32 %f240, %f239;
    mov.f32 %f241, %f239;
    mov.f32 %f242, %f239;
    @%p11 bra $L__BB0_11;

    shl.b32 %r51, %r7, 4;
    mov.u32 %r52, _ZZ24LayerNormSmallKernel_vecI6float4EvN8dxml_mvn15MVNKernelParamsEE4temp;
    add.s32 %r53, %r52, %r51;
    ld.shared.v4.f32 {%f239, %f240, %f241, %f242}, [%r53];

$L__BB0_11:
    @%p3 bra $L__BB0_15;

$L__BB0_13:
    mov.b32 %r54, %f239;
    shr.u32 %r55, %r100, 31;
    add.s32 %r56, %r100, %r55;
    shr.s32 %r9, %r56, 1;
    mov.u32 %r57, 31;
    mov.u32 %r58, -1;
    shfl.sync.down.b32 %r59|%p13, %r54, %r9, %r57, %r58;
    mov.b32 %f157, %r59;
    add.ftz.f32 %f239, %f239, %f157;
    mov.b32 %r60, %f240;
    shfl.sync.down.b32 %r61|%p14, %r60, %r9, %r57, %r58;
    mov.b32 %f158, %r61;
    add.ftz.f32 %f240, %f240, %f158;
    mov.b32 %r62, %f241;
    shfl.sync.down.b32 %r63|%p15, %r62, %r9, %r57, %r58;
    mov.b32 %f159, %r63;
    add.ftz.f32 %f241, %f241, %f159;
    mov.b32 %r64, %f242;
    shfl.sync.down.b32 %r65|%p16, %r64, %r9, %r57, %r58;
    mov.b32 %f160, %r65;
    add.ftz.f32 %f242, %f242, %f160;
    setp.gt.s32 %p17, %r100, 3;
    mov.u32 %r100, %r9;
    @%p17 bra $L__BB0_13;

$L__BB0_15:
    // Thread 0 folds the four components, scales the sum by %f134
    // (presumably 1/N), and publishes the mean in shared mu.
    bar.sync 0;
    setp.ne.s32 %p18, %r2, 0;
    @%p18 bra $L__BB0_17;

    add.ftz.f32 %f161, %f239, %f240;
    add.ftz.f32 %f162, %f241, %f161;
    add.ftz.f32 %f163, %f242, %f162;
    mul.ftz.f32 %f164, %f134, %f163;
    st.shared.f32 [_ZZ24LayerNormSmallKernel_vecI6float4EvN8dxml_mvn15MVNKernelParamsEE2mu], %f164;

$L__BB0_17:
    bar.sync 0;
    ld.shared.f32 %f259, [_ZZ24LayerNormSmallKernel_vecI6float4EvN8dxml_mvn15MVNKernelParamsEE2mu];

$L__BB0_18:
    // Variance pass; skipped when flag %rs13 is clear, leaving rsigma at 1.0.
    setp.eq.s16 %p19, %rs13, 0;
    mov.f32 %f288, 0f3F800000;
    @%p19 bra $L__BB0_36;

    // Per-lane partials: %f134 * (x - mu)^2, zero for out-of-range lanes.
    mov.f32 %f271, 0f00000000;
    mov.f32 %f270, %f271;
    mov.f32 %f269, %f271;
    mov.f32 %f268, %f271;
    @%p1 bra $L__BB0_21;

    sub.ftz.f32 %f170, %f234, %f259;
    mul.ftz.f32 %f171, %f170, %f170;
    mul.ftz.f32 %f268, %f134, %f171;
    sub.ftz.f32 %f172, %f233, %f259;
    mul.ftz.f32 %f173, %f172, %f172;
    mul.ftz.f32 %f269, %f134, %f173;
    sub.ftz.f32 %f174, %f232, %f259;
    mul.ftz.f32 %f175, %f174, %f174;
    mul.ftz.f32 %f270, %f134, %f175;
    sub.ftz.f32 %f176, %f231, %f259;
    mul.ftz.f32 %f177, %f176, %f176;
    mul.ftz.f32 %f271, %f134, %f177;

$L__BB0_21:
    // Same two-level reduction as the mean pass.
    mov.u32 %r102, WARP_SZ;
    setp.lt.s32 %p21, %r102, 2;
    @%p21 bra $L__BB0_24;

    mov.u32 %r101, %r102;

$L__BB0_23:
    mov.b32 %r66, %f268;
    shr.u32 %r67, %r101, 31;
    add.s32 %r68, %r101, %r67;
    shr.s32 %r12, %r68, 1;
    mov.u32 %r69, 31;
    mov.u32 %r70, -1;
    shfl.sync.down.b32 %r71|%p22, %r66, %r12, %r69, %r70;
    mov.b32 %f178, %r71;
    add.ftz.f32 %f268, %f268, %f178;
    mov.b32 %r72, %f269;
    shfl.sync.down.b32 %r73|%p23, %r72, %r12, %r69, %r70;
    mov.b32 %f179, %r73;
    add.ftz.f32 %f269, %f269, %f179;
    mov.b32 %r74, %f270;
    shfl.sync.down.b32 %r75|%p24, %r74, %r12, %r69, %r70;
    mov.b32 %f180, %r75;
    add.ftz.f32 %f270, %f270, %f180;
    mov.b32 %r76, %f271;
    shfl.sync.down.b32 %r77|%p25, %r76, %r12, %r69, %r70;
    mov.b32 %f181, %r77;
    add.ftz.f32 %f271, %f271, %f181;
    setp.gt.s32 %p26, %r101, 3;
    mov.u32 %r101, %r12;
    @%p26 bra $L__BB0_23;

$L__BB0_24:
    rem.u32 %r13, %r2, %r102;
    setp.ne.s32 %p27, %r13, 0;
    @%p27 bra $L__BB0_26;

    div.u32 %r78, %r2, %r102;
    shl.b32 %r79, %r78, 4;
    mov.u32 %r80, _ZZ24LayerNormSmallKernel_vecI6float4EvN8dxml_mvn15MVNKernelParamsEE4temp;
    add.s32 %r81, %r80, %r79;
    st.shared.v4.f32 [%r81], {%f268, %f269, %f270, %f271};

$L__BB0_26:
    bar.sync 0;
    setp.le.u32 %p28, %r102, %r2;
    @%p28 bra $L__BB0_33;

    mov.u32 %r82, %ntid.x;
    div.u32 %r83, %r82, %r102;
    setp.ge.s32 %p29, %r13, %r83;
    mov.f32 %f268, 0f00000000;
    mov.f32 %f269, %f268;
    mov.f32 %f270, %f268;
    mov.f32 %f271, %f268;
    @%p29 bra $L__BB0_29;

    shl.b32 %r84, %r13, 4;
    mov.u32 %r85, _ZZ24LayerNormSmallKernel_vecI6float4EvN8dxml_mvn15MVNKernelParamsEE4temp;
    add.s32 %r86, %r85, %r84;
    ld.shared.v4.f32 {%f268, %f269, %f270, %f271}, [%r86];

$L__BB0_29:
    @%p21 bra $L__BB0_33;

$L__BB0_31:
    mov.b32 %r87, %f268;
    shr.u32 %r88, %r102, 31;
    add.s32 %r89, %r102, %r88;
    shr.s32 %r15, %r89, 1;
    mov.u32 %r90, 31;
    mov.u32 %r91, -1;
    shfl.sync.down.b32 %r92|%p31, %r87, %r15, %r90, %r91;
    mov.b32 %f190, %r92;
    add.ftz.f32 %f268, %f268, %f190;
    mov.b32 %r93, %f269;
    shfl.sync.down.b32 %r94|%p32, %r93, %r15, %r90, %r91;
    mov.b32 %f191, %r94;
    add.ftz.f32 %f269, %f269, %f191;
    mov.b32 %r95, %f270;
    shfl.sync.down.b32 %r96|%p33, %r95, %r15, %r90, %r91;
    mov.b32 %f192, %r96;
    add.ftz.f32 %f270, %f270, %f192;
    mov.b32 %r97, %f271;
    shfl.sync.down.b32 %r98|%p34, %r97, %r15, %r90, %r91;
    mov.b32 %f193, %r98;
    add.ftz.f32 %f271, %f271, %f193;
    setp.gt.s32 %p35, %r102, 3;
    mov.u32 %r102, %r15;
    @%p35 bra $L__BB0_31;

$L__BB0_33:
    // Thread 0 computes rsqrt(variance + %f135) (presumably the epsilon)
    // and publishes it in shared rsigma.
    bar.sync 0;
    setp.ne.s32 %p36, %r2, 0;
    @%p36 bra $L__BB0_35;

    add.ftz.f32 %f194, %f268, %f269;
    add.ftz.f32 %f195, %f270, %f194;
    add.ftz.f32 %f196, %f271, %f195;
    add.ftz.f32 %f197, %f135, %f196;
    rsqrt.approx.ftz.f32 %f198, %f197;
    st.shared.f32 [_ZZ24LayerNormSmallKernel_vecI6float4EvN8dxml_mvn15MVNKernelParamsEE6rsigma], %f198;

$L__BB0_35:
    bar.sync 0;
    ld.shared.f32 %f288, [_ZZ24LayerNormSmallKernel_vecI6float4EvN8dxml_mvn15MVNKernelParamsEE6rsigma];

$L__BB0_36:
    // Epilogue: index scale/bias by the global element (%r3) when %rs17 is
    // set, by the column within the row (%r2) otherwise; out-of-range
    // lanes are done.
    setp.eq.s16 %p37, %rs17, 0;
    selp.b32 %r16, %r2, %r3, %p37;
    @%p1 bra $L__BB0_42;

    // Optional scale load (flag %rs15), defaulting to 1.0.
    setp.eq.s16 %p39, %rs15, 0;
    mov.f32 %f289, 0f3F800000;
    mov.f32 %f290, %f289;
    mov.f32 %f291, %f289;
    mov.f32 %f292, %f289;
    @%p39 bra $L__BB0_39;

    cvta.to.global.u64 %rd8, %rd2;
    mul.wide.s32 %rd9, %r16, 16;
    add.s64 %rd10, %rd8, %rd9;
    ld.global.v4.f32 {%f292, %f291, %f290, %f289}, [%rd10];

$L__BB0_39:
    // Optional bias load (flag %rs14), defaulting to 0.0.
    setp.eq.s16 %p40, %rs14, 0;
    mov.f32 %f293, 0f00000000;
    mov.f32 %f294, %f293;
    mov.f32 %f295, %f293;
    mov.f32 %f296, %f293;
    @%p40 bra $L__BB0_41;

    cvta.to.global.u64 %rd11, %rd3;
    mul.wide.s32 %rd12, %r16, 16;
    add.s64 %rd13, %rd11, %rd12;
    ld.global.v4.f32 {%f296, %f295, %f294, %f293}, [%rd13];

$L__BB0_41:
    // Normalize each component: (x - mu) * rsigma * scale + bias, then
    // zero negative results when flag %rs16 is set (fused ReLU).
    sub.ftz.f32 %f215, %f234, %f259;
    mul.ftz.f32 %f216, %f215, %f288;
    fma.rn.ftz.f32 %f217, %f216, %f292, %f296;
    sub.ftz.f32 %f218, %f233, %f259;
    mul.ftz.f32 %f219, %f218, %f288;
    fma.rn.ftz.f32 %f220, %f219, %f291, %f295;
    sub.ftz.f32 %f221, %f232, %f259;
    mul.ftz.f32 %f222, %f221, %f288;
    fma.rn.ftz.f32 %f223, %f222, %f290, %f294;
    sub.ftz.f32 %f224, %f231, %f259;
    mul.ftz.f32 %f225, %f224, %f288;
    fma.rn.ftz.f32 %f226, %f225, %f289, %f293;
    setp.lt.ftz.f32 %p41, %f217, 0f00000000;
    setp.ne.s16 %p42, %rs16, 0;
    and.pred %p43, %p42, %p41;
    setp.lt.ftz.f32 %p44, %f220, 0f00000000;
    and.pred %p45, %p42, %p44;
    setp.lt.ftz.f32 %p46, %f223, 0f00000000;
    and.pred %p47, %p42, %p46;
    setp.lt.ftz.f32 %p48, %f226, 0f00000000;
    and.pred %p49, %p42, %p48;
    cvta.to.global.u64 %rd14, %rd4;
    mul.wide.s32 %rd15, %r3, 16;
    add.s64 %rd16, %rd14, %rd15;
    selp.f32 %f227, 0f00000000, %f226, %p49;
    selp.f32 %f228, 0f00000000, %f223, %p47;
    selp.f32 %f229, 0f00000000, %f220, %p45;
    selp.f32 %f230, 0f00000000, %f217, %p43;
    st.global.v4.f32 [%rd16], {%f230, %f229, %f228, %f227};

$L__BB0_42:
    ret;

}
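For orientation, the PTX above is the kind of code nvcc emits for a kernel of roughly the following shape. This is a reconstructed sketch, not the recovered dxml_mvn source: the struct layout and field names, the meaning of the six flag bytes, the temp[] size, and the requirement that blockDim.x be a multiple of the warp size are all inferred from the instruction stream, so treat every identifier below as hypothetical.

#include <cuda_runtime.h>

// Hypothetical params block; field names and flag meanings are read off
// the ld.param offsets and branches above, not an original header.
struct MVNKernelParams {
    const float4 *input;    // param_0 + 0
    const float4 *scale;    // param_0 + 8
    const float4 *bias;     // param_0 + 16
    float4 *output;         // param_0 + 24
    int n;                  // param_0 + 32: elements per row (n/4 float4s)
    float invN;             // param_0 + 36: presumably 1/n
    float eps;              // param_0 + 40: presumably the variance epsilon
    unsigned char doMean;   // +44: compute and subtract the mean
    unsigned char doSigma;  // +45: compute and apply rsigma
    unsigned char hasBias;  // +46: load bias (else 0)
    unsigned char hasScale; // +47: load scale (else 1)
    unsigned char doRelu;   // +48: zero negative outputs
    unsigned char perElem;  // +49: index scale/bias per element, not per column
};

// Intra-warp tree reduction, the CUDA-level shape of the
// shfl.sync.down.b32 loops at $L__BB0_5, _13, _23 and _31.
__device__ float4 warpSum(float4 v) {
    for (int d = warpSize / 2; d > 0; d >>= 1) {
        v.x += __shfl_down_sync(0xffffffffu, v.x, d);
        v.y += __shfl_down_sync(0xffffffffu, v.y, d);
        v.z += __shfl_down_sync(0xffffffffu, v.z, d);
        v.w += __shfl_down_sync(0xffffffffu, v.w, d);
    }
    return v; // total lands in lane 0
}

// Two-level block sum: warp leaders stage partials in shared memory and
// the first warp folds them. Assumes blockDim.x is a multiple of warpSize.
__device__ float4 blockSum(float4 v, float4 *temp) {
    v = warpSum(v);
    if (threadIdx.x % warpSize == 0) temp[threadIdx.x / warpSize] = v;
    __syncthreads();
    if (threadIdx.x < warpSize) {
        bool has = threadIdx.x < blockDim.x / warpSize;
        v = has ? temp[threadIdx.x] : make_float4(0.f, 0.f, 0.f, 0.f);
        v = warpSum(v);
    }
    return v; // block total valid in thread 0 only
}

__global__ void LayerNormSmallKernel_vec(MVNKernelParams p) {
    __shared__ float4 temp[32]; // per-warp partials; the real size is unknown
    __shared__ float mu, rsigma;
    const int vecN = p.n / 4;                 // float4s per row
    const int col = threadIdx.x;              // one float4 per thread
    const int idx = blockIdx.x * vecN + col;  // one row per block
    const bool live = col < vecN;

    float4 x = make_float4(0.f, 0.f, 0.f, 0.f);
    if (live) x = p.input[idx];

    float mean = 0.f; // %f259 stays 0 when the mean pass is skipped
    if (p.doMean) {
        float4 s = blockSum(x, temp);
        __syncthreads(); // bar.sync at $L__BB0_15
        if (threadIdx.x == 0) mu = p.invN * (s.x + s.y + s.z + s.w);
        __syncthreads(); // bar.sync at $L__BB0_17
        mean = mu;
    }

    float rs = 1.f; // %f288 stays 1 when the variance pass is skipped
    if (p.doSigma) {
        float4 d = make_float4(0.f, 0.f, 0.f, 0.f);
        if (live) { // invN * (x - mu)^2, per component
            d.x = p.invN * (x.x - mean) * (x.x - mean);
            d.y = p.invN * (x.y - mean) * (x.y - mean);
            d.z = p.invN * (x.z - mean) * (x.z - mean);
            d.w = p.invN * (x.w - mean) * (x.w - mean);
        }
        float4 s = blockSum(d, temp);
        __syncthreads(); // bar.sync at $L__BB0_33
        if (threadIdx.x == 0) rsigma = rsqrtf(s.x + s.y + s.z + s.w + p.eps);
        __syncthreads(); // bar.sync at $L__BB0_35
        rs = rsigma;
    }

    if (!live) return;
    const int sb = p.perElem ? idx : col; // selp at $L__BB0_36
    float4 g = p.hasScale ? p.scale[sb] : make_float4(1.f, 1.f, 1.f, 1.f);
    float4 b = p.hasBias ? p.bias[sb] : make_float4(0.f, 0.f, 0.f, 0.f);
    float4 y;
    y.x = fmaf((x.x - mean) * rs, g.x, b.x);
    y.y = fmaf((x.y - mean) * rs, g.y, b.y);
    y.z = fmaf((x.z - mean) * rs, g.z, b.z);
    y.w = fmaf((x.w - mean) * rs, g.w, b.w);
    if (p.doRelu) { // fused ReLU, cf. the selp sequence at $L__BB0_41
        y.x = fmaxf(y.x, 0.f); y.y = fmaxf(y.y, 0.f);
        y.z = fmaxf(y.z, 0.f); y.w = fmaxf(y.w, 0.f);
    }
    p.output[idx] = y;
}

The paired __syncthreads() around each shared store correspond to the bar.sync 0 pairs in the PTX: the first keeps the reduction's reuse of temp[] from racing across passes, the second makes the freshly stored mu or rsigma visible to every thread before it is read back.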