// _ZZ24LayerNormLargeKernel_vecI6float4EvN8dxml_mvn15MVNKernelParamsEE6rsigma has been demoted

.visible .entry MVNKernel_vec_fp32_large(
    .param .align 8 .b8 MVNKernel_vec_fp32_large_param_0[88]
)
{
    .reg .pred %p<173>;
    .reg .b16 %rs<55>;
    .reg .f32 %f<959>;
    .reg .b32 %r<191>;
    .reg .b64 %rd<120>;
    // demoted variable
    .shared .align 16 .b8 _ZZ24LayerNormLargeKernel_vecI6float4EvN8dxml_mvn15MVNKernelParamsEE4temp[512];
    // demoted variable
    .shared .align 4 .f32 _ZZ24LayerNormLargeKernel_vecI6float4EvN8dxml_mvn15MVNKernelParamsEE2mu;
    // demoted variable
    .shared .align 4 .f32 _ZZ24LayerNormLargeKernel_vecI6float4EvN8dxml_mvn15MVNKernelParamsEE6rsigma;

    // Load the 88-byte parameter block: flag bytes (+44..+49), eps (+40),
    // 1/N (+36), element count (+32), and four buffer pointers (+0..+24).
    ld.param.v2.u8 {%rs15, %rs16}, [MVNKernel_vec_fp32_large_param_0+48];
    ld.param.v4.u8 {%rs21, %rs22, %rs23, %rs24}, [MVNKernel_vec_fp32_large_param_0+44];
    ld.param.f32 %f275, [MVNKernel_vec_fp32_large_param_0+40];
    ld.param.f32 %f274, [MVNKernel_vec_fp32_large_param_0+36];
    ld.param.u32 %r61, [MVNKernel_vec_fp32_large_param_0+32];
    ld.param.u64 %rd70, [MVNKernel_vec_fp32_large_param_0+24];
    ld.param.u64 %rd69, [MVNKernel_vec_fp32_large_param_0+16];
    ld.param.u64 %rd68, [MVNKernel_vec_fp32_large_param_0+8];
    ld.param.u64 %rd67, [MVNKernel_vec_fp32_large_param_0];
    cvta.to.global.u64 %rd1, %rd67;
    cvta.to.global.u64 %rd2, %rd68;
    cvta.to.global.u64 %rd3, %rd69;
    cvta.to.global.u64 %rd4, %rd70;
    // %r1 = float4 elements per row; %r2 = this block's row base offset.
    shr.u32 %r1, %r61, 2;
    mov.u32 %r76, %ctaid.x;
    mul.lo.s32 %r2, %r1, %r76;
    setp.eq.s16 %p1, %rs21, 0;
    mov.f32 %f869, 0f00000000;
    mov.u32 %r181, %tid.x;
    // Pass 1 (mean) is skipped when %rs21 (flag byte at +44) is zero; mu stays 0.
    @%p1 bra $L__BB0_21;
    setp.ge.s32 %p2, %r181, %r1;
    mov.f32 %f841, 0f00000000;
    mov.f32 %f842, %f841;
    mov.f32 %f843, %f841;
    mov.f32 %f844, %f841;
    @%p2 bra $L__BB0_8;
    // Block-stride accumulation of four partial sums, one float4 per step;
    // the trip count is split into a remainder loop (BB0_4) and a
    // 4x-unrolled main loop (BB0_7).
    not.b32 %r77, %r181;
    add.s32 %r4, %r1, %r77;
    shr.u32 %r78, %r4, 10;
    add.s32 %r79, %r78, 1;
    and.b32 %r170, %r79, 3;
    setp.eq.s32 %p3, %r170, 0;
    mov.f32 %f844, 0f00000000;
    mov.u32 %r171, %r181;
    mov.f32 %f843, %f844;
    mov.f32 %f842, %f844;
    mov.f32 %f841, %f844;
    @%p3 bra $L__BB0_5;
    add.s32 %r80, %r181, %r2;
    mul.wide.s32 %rd71, %r80, 16;
    add.s64 %rd104, %rd1, %rd71;
    mov.u32 %r171, %r181;

$L__BB0_4:
    .pragma "nounroll";
    ld.global.v4.f32 {%f290, %f291, %f292, %f293}, [%rd104];
    add.ftz.f32 %f841, %f841, %f290;
    add.ftz.f32 %f842, %f842, %f291;
    add.ftz.f32 %f843, %f843, %f292;
    add.ftz.f32 %f844, %f844, %f293;
    add.s32 %r171, %r171, 1024;
    add.s64 %rd104, %rd104, 16384;
    add.s32 %r170, %r170, -1;
    setp.ne.s32 %p4, %r170, 0;
    @%p4 bra $L__BB0_4;

$L__BB0_5:
    setp.lt.u32 %p5, %r4, 3072;
    @%p5 bra $L__BB0_8;
    add.s32 %r81, %r171, %r2;
    mul.wide.s32 %rd72, %r81, 16;
    add.s64 %rd73, %rd1, %rd72;
    add.s64 %rd105, %rd73, 32768;

$L__BB0_7:
    ld.global.v4.f32 {%f298, %f299, %f300, %f301}, [%rd105+-32768];
    add.ftz.f32 %f306, %f841, %f298;
    add.ftz.f32 %f307, %f842, %f299;
    add.ftz.f32 %f308, %f843, %f300;
    add.ftz.f32 %f309, %f844, %f301;
    ld.global.v4.f32 {%f310, %f311, %f312, %f313}, [%rd105+-16384];
    add.ftz.f32 %f318, %f306, %f310;
    add.ftz.f32 %f319, %f307, %f311;
    add.ftz.f32 %f320, %f308, %f312;
    add.ftz.f32 %f321, %f309, %f313;
    ld.global.v4.f32 {%f322, %f323, %f324, %f325}, [%rd105];
    add.ftz.f32 %f330, %f318, %f322;
    add.ftz.f32 %f331, %f319, %f323;
    add.ftz.f32 %f332, %f320, %f324;
    add.ftz.f32 %f333, %f321, %f325;
    ld.global.v4.f32 {%f334, %f335, %f336, %f337}, [%rd105+16384];
    add.ftz.f32 %f841, %f330, %f334;
    add.ftz.f32 %f842, %f331, %f335;
    add.ftz.f32 %f843, %f332, %f336;
    add.ftz.f32 %f844, %f333, %f337;
    add.s64 %rd105, %rd105, 65536;
    add.s32 %r171, %r171, 4096;
    setp.lt.s32 %p6, %r171, %r1;
    @%p6 bra $L__BB0_7;

$L__BB0_8:
    // Warp-level tree reduction of the four partials via shfl.sync.down.
    mov.u32 %r174, WARP_SZ;
    setp.lt.s32 %p7, %r174, 2;
    @%p7 bra $L__BB0_11;
    mov.u32 %r173, %r174;

$L__BB0_10:
    mov.b32 %r82, %f841;
    shr.u32 %r83, %r173, 31;
    add.s32 %r84, %r173, %r83;
    shr.s32 %r15, %r84, 1;
    mov.u32 %r85, 31;
    mov.u32 %r86, -1;
    shfl.sync.down.b32 %r87|%p8, %r82, %r15, %r85, %r86;
    mov.b32 %f342, %r87;
    add.ftz.f32 %f841, %f841, %f342;
    mov.b32 %r88, %f842;
    shfl.sync.down.b32 %r89|%p9, %r88, %r15, %r85, %r86;
    mov.b32 %f343, %r89;
    add.ftz.f32 %f842, %f842, %f343;
    mov.b32 %r90, %f843;
    shfl.sync.down.b32 %r91|%p10, %r90, %r15, %r85, %r86;
    mov.b32 %f344, %r91;
    add.ftz.f32 %f843, %f843, %f344;
    mov.b32 %r92, %f844;
    shfl.sync.down.b32 %r93|%p11, %r92, %r15, %r85, %r86;
    mov.b32 %f345, %r93;
    add.ftz.f32 %f844, %f844, %f345;
    setp.gt.s32 %p12, %r173, 3;
    mov.u32 %r173, %r15;
    @%p12 bra $L__BB0_10;

$L__BB0_11:
    // Lane 0 of each warp stores its partials to shared temp[].
    rem.u32 %r16, %r181, %r174;
    setp.ne.s32 %p13, %r16, 0;
    @%p13 bra $L__BB0_13;
    div.u32 %r94, %r181, %r174;
    shl.b32 %r95, %r94, 4;
    mov.u32 %r96, _ZZ24LayerNormLargeKernel_vecI6float4EvN8dxml_mvn15MVNKernelParamsEE4temp;
    add.s32 %r97, %r96, %r95;
    st.shared.v4.f32 [%r97], {%f841, %f842, %f843, %f844};

$L__BB0_13:
    // The first warp reloads the per-warp partials and reduces across warps.
    bar.sync 0;
    setp.le.u32 %p14, %r174, %r181;
    @%p14 bra $L__BB0_18;
    mov.u32 %r98, %ntid.x;
    div.u32 %r99, %r98, %r174;
    setp.ge.s32 %p15, %r16, %r99;
    mov.f32 %f841, 0f00000000;
    mov.f32 %f842, %f841;
    mov.f32 %f843, %f841;
    mov.f32 %f844, %f841;
    @%p15 bra $L__BB0_16;
    shl.b32 %r100, %r16, 4;
    mov.u32 %r101, _ZZ24LayerNormLargeKernel_vecI6float4EvN8dxml_mvn15MVNKernelParamsEE4temp;
    add.s32 %r102, %r101, %r100;
    ld.shared.v4.f32 {%f841, %f842, %f843, %f844}, [%r102];

$L__BB0_16:
    @%p7 bra $L__BB0_18;

$L__BB0_17:
    mov.b32 %r103, %f841;
    shr.u32 %r104, %r174, 31;
    add.s32 %r105, %r174, %r104;
    shr.s32 %r18, %r105, 1;
    mov.u32 %r106, 31;
    mov.u32 %r107, -1;
    shfl.sync.down.b32 %r108|%p17, %r103, %r18, %r106, %r107;
    mov.b32 %f354, %r108;
    add.ftz.f32 %f841, %f841, %f354;
    mov.b32 %r109, %f842;
    shfl.sync.down.b32 %r110|%p18, %r109, %r18, %r106, %r107;
    mov.b32 %f355, %r110;
    add.ftz.f32 %f842, %f842, %f355;
    mov.b32 %r111, %f843;
    shfl.sync.down.b32 %r112|%p19, %r111, %r18, %r106, %r107;
    mov.b32 %f356, %r112;
    add.ftz.f32 %f843, %f843, %f356;
    mov.b32 %r113, %f844;
    shfl.sync.down.b32 %r114|%p20, %r113, %r18, %r106, %r107;
    mov.b32 %f357, %r114;
    add.ftz.f32 %f844, %f844, %f357;
    setp.gt.s32 %p21, %r174, 3;
    mov.u32 %r174, %r18;
    @%p21 bra $L__BB0_17;

$L__BB0_18:
    // Thread 0 folds the four component sums and scales by 1/N (%f274) to get mu.
    bar.sync 0;
    setp.ne.s32 %p22, %r181, 0;
    @%p22 bra $L__BB0_20;
    add.ftz.f32 %f358, %f841, %f842;
    add.ftz.f32 %f359, %f843, %f358;
    add.ftz.f32 %f360, %f844, %f359;
    mul.ftz.f32 %f361, %f274, %f360;
    st.shared.f32 [_ZZ24LayerNormLargeKernel_vecI6float4EvN8dxml_mvn15MVNKernelParamsEE2mu], %f361;

$L__BB0_20:
    bar.sync 0;
    ld.shared.f32 %f869, [_ZZ24LayerNormLargeKernel_vecI6float4EvN8dxml_mvn15MVNKernelParamsEE2mu];

$L__BB0_21:
    // Pass 2 (variance) is skipped when %rs22 is zero; rsigma then stays
    // at 1.0 (0f3F800000).
    setp.eq.s16 %p23, %rs22, 0;
    mov.f32 %f914, 0f3F800000;
    @%p23 bra $L__BB0_42;
    setp.ge.s32 %p24, %r181, %r1;
    mov.f32 %f886, 0f00000000;
    mov.f32 %f887, %f886;
    mov.f32 %f888, %f886;
    mov.f32 %f889, %f886;
    @%p24 bra $L__BB0_29;
    not.b32 %r115, %r181;
    add.s32 %r19, %r1, %r115;
    shr.u32 %r116, %r19, 10;
    add.s32 %r117, %r116, 1;
    and.b32 %r176, %r117, 3;
    setp.eq.s32 %p25, %r176, 0;
    mov.f32 %f889, 0f00000000;
    mov.u32 %r177, %r181;
    mov.f32 %f888, %f889;
    mov.f32 %f887, %f889;
    mov.f32 %f886, %f889;
    @%p25 bra $L__BB0_26;
    add.s32 %r118, %r181, %r2;
    mul.wide.s32 %rd74, %r118, 16;
    add.s64 %rd106, %rd1, %rd74;
    mov.u32 %r177, %r181;

$L__BB0_25:
    .pragma "nounroll";
    // Accumulate (x - mu)^2 into four partials.
    ld.global.v4.f32 {%f376, %f377, %f378, %f379}, [%rd106];
    sub.ftz.f32 %f384, %f376, %f869;
    fma.rn.ftz.f32 %f889, %f384, %f384, %f889;
    sub.ftz.f32 %f385, %f377, %f869;
    fma.rn.ftz.f32 %f888, %f385, %f385, %f888;
    sub.ftz.f32 %f386, %f378, %f869;
    fma.rn.ftz.f32 %f887, %f386, %f386, %f887;
    sub.ftz.f32 %f387, %f379, %f869;
    fma.rn.ftz.f32 %f886, %f387, %f387, %f886;
    add.s32 %r177, %r177, 1024;
    add.s64 %rd106, %rd106, 16384;
    add.s32 %r176, %r176, -1;
    setp.ne.s32 %p26, %r176, 0;
    @%p26 bra $L__BB0_25;

$L__BB0_26:
    setp.lt.u32 %p27, %r19, 3072;
    @%p27 bra $L__BB0_29;
    add.s32 %r119, %r177, %r2;
    mul.wide.s32 %rd75, %r119, 16;
    add.s64 %rd76, %rd1, %rd75;
    add.s64 %rd107, %rd76, 32768;

$L__BB0_28:
    ld.global.v4.f32 {%f388, %f389, %f390, %f391}, [%rd107+-32768];
    sub.ftz.f32 %f396, %f388, %f869;
    fma.rn.ftz.f32 %f397, %f396, %f396, %f889;
    sub.ftz.f32 %f398, %f389, %f869;
    fma.rn.ftz.f32 %f399, %f398, %f398, %f888;
    sub.ftz.f32 %f400, %f390, %f869;
    fma.rn.ftz.f32 %f401, %f400, %f400, %f887;
    sub.ftz.f32 %f402, %f391, %f869;
    fma.rn.ftz.f32 %f403, %f402, %f402, %f886;
    ld.global.v4.f32 {%f404, %f405, %f406, %f407}, [%rd107+-16384];
    sub.ftz.f32 %f412, %f404, %f869;
    fma.rn.ftz.f32 %f413, %f412, %f412, %f397;
    sub.ftz.f32 %f414, %f405, %f869;
    fma.rn.ftz.f32 %f415, %f414, %f414, %f399;
    sub.ftz.f32 %f416, %f406, %f869;
    fma.rn.ftz.f32 %f417, %f416, %f416, %f401;
    sub.ftz.f32 %f418, %f407, %f869;
    fma.rn.ftz.f32 %f419, %f418, %f418, %f403;
    ld.global.v4.f32 {%f420, %f421, %f422, %f423}, [%rd107];
    sub.ftz.f32 %f428, %f420, %f869;
    fma.rn.ftz.f32 %f429, %f428, %f428, %f413;
    sub.ftz.f32 %f430, %f421, %f869;
    fma.rn.ftz.f32 %f431, %f430, %f430, %f415;
    sub.ftz.f32 %f432, %f422, %f869;
    fma.rn.ftz.f32 %f433, %f432, %f432, %f417;
    sub.ftz.f32 %f434, %f423, %f869;
    fma.rn.ftz.f32 %f435, %f434, %f434, %f419;
    ld.global.v4.f32 {%f436, %f437, %f438, %f439}, [%rd107+16384];
    sub.ftz.f32 %f444, %f436, %f869;
    fma.rn.ftz.f32 %f889, %f444, %f444, %f429;
    sub.ftz.f32 %f445, %f437, %f869;
    fma.rn.ftz.f32 %f888, %f445, %f445, %f431;
    sub.ftz.f32 %f446, %f438, %f869;
    fma.rn.ftz.f32 %f887, %f446, %f446, %f433;
    sub.ftz.f32 %f447, %f439, %f869;
    fma.rn.ftz.f32 %f886, %f447, %f447, %f435;
    add.s64 %rd107, %rd107, 65536;
    add.s32 %r177, %r177, 4096;
    setp.lt.s32 %p28, %r177, %r1;
    @%p28 bra $L__BB0_28;

$L__BB0_29:
    // Scale the partials by 1/N, then repeat the warp/block reduction.
    mul.ftz.f32 %f894, %f274, %f889;
    mul.ftz.f32 %f895, %f274, %f888;
    mul.ftz.f32 %f896, %f274, %f887;
    mul.ftz.f32 %f897, %f274, %f886;
    mov.u32 %r180, WARP_SZ;
    setp.lt.s32 %p29, %r180, 2;
    @%p29 bra $L__BB0_32;
    mov.u32 %r179, %r180;

$L__BB0_31:
    mov.b32 %r120, %f894;
    shr.u32 %r121, %r179, 31;
    add.s32 %r122, %r179, %r121;
    shr.s32 %r30, %r122, 1;
    mov.u32 %r123, 31;
    mov.u32 %r124, -1;
    shfl.sync.down.b32 %r125|%p30, %r120, %r30, %r123, %r124;
    mov.b32 %f448, %r125;
    add.ftz.f32 %f894, %f894, %f448;
    mov.b32 %r126, %f895;
    shfl.sync.down.b32 %r127|%p31, %r126, %r30, %r123, %r124;
    mov.b32 %f449, %r127;
    add.ftz.f32 %f895, %f895, %f449;
    mov.b32 %r128, %f896;
    shfl.sync.down.b32 %r129|%p32, %r128, %r30, %r123, %r124;
    mov.b32 %f450, %r129;
    add.ftz.f32 %f896, %f896, %f450;
    mov.b32 %r130, %f897;
    shfl.sync.down.b32 %r131|%p33, %r130, %r30, %r123, %r124;
    mov.b32 %f451, %r131;
    add.ftz.f32 %f897, %f897, %f451;
    setp.gt.s32 %p34, %r179, 3;
    mov.u32 %r179, %r30;
    @%p34 bra $L__BB0_31;

$L__BB0_32:
    rem.u32 %r31, %r181, %r180;
    setp.ne.s32 %p35, %r31, 0;
    @%p35 bra $L__BB0_34;
    div.u32 %r132, %r181, %r180;
    shl.b32 %r133, %r132, 4;
    mov.u32 %r134, _ZZ24LayerNormLargeKernel_vecI6float4EvN8dxml_mvn15MVNKernelParamsEE4temp;
    add.s32 %r135, %r134, %r133;
    st.shared.v4.f32 [%r135], {%f894, %f895, %f896, %f897};

$L__BB0_34:
    bar.sync 0;
    setp.le.u32 %p36, %r180, %r181;
    @%p36 bra $L__BB0_39;
    mov.u32 %r136, %ntid.x;
    div.u32 %r137, %r136, %r180;
    setp.ge.s32 %p37, %r31, %r137;
    mov.f32 %f894, 0f00000000;
    mov.f32 %f895, %f894;
    mov.f32 %f896, %f894;
    mov.f32 %f897, %f894;
    @%p37 bra $L__BB0_37;
    shl.b32 %r138, %r31, 4;
    mov.u32 %r139, _ZZ24LayerNormLargeKernel_vecI6float4EvN8dxml_mvn15MVNKernelParamsEE4temp;
    add.s32 %r140, %r139, %r138;
    ld.shared.v4.f32 {%f894, %f895, %f896, %f897}, [%r140];

$L__BB0_37:
    @%p29 bra $L__BB0_39;

$L__BB0_38:
    mov.b32 %r141, %f894;
    shr.u32 %r142, %r180, 31;
    add.s32 %r143, %r180, %r142;
    shr.s32 %r33, %r143, 1;
    mov.u32 %r144, 31;
    mov.u32 %r145, -1;
    shfl.sync.down.b32 %r146|%p39, %r141, %r33, %r144, %r145;
    mov.b32 %f460, %r146;
    add.ftz.f32 %f894, %f894, %f460;
    mov.b32 %r147, %f895;
    shfl.sync.down.b32 %r148|%p40, %r147, %r33, %r144, %r145;
    mov.b32 %f461, %r148;
    add.ftz.f32 %f895, %f895, %f461;
    mov.b32 %r149, %f896;
    shfl.sync.down.b32 %r150|%p41, %r149, %r33, %r144, %r145;
    mov.b32 %f462, %r150;
    add.ftz.f32 %f896, %f896, %f462;
    mov.b32 %r151, %f897;
    shfl.sync.down.b32 %r152|%p42, %r151, %r33, %r144, %r145;
    mov.b32 %f463, %r152;
    add.ftz.f32 %f897, %f897, %f463;
    setp.gt.s32 %p43, %r180, 3;
    mov.u32 %r180, %r33;
    @%p43 bra $L__BB0_38;

$L__BB0_39:
    // Thread 0: rsigma = rsqrt(eps + var), the partials already carry 1/N.
    bar.sync 0;
    setp.ne.s32 %p44, %r181, 0;
    @%p44 bra $L__BB0_41;
    add.ftz.f32 %f464, %f894, %f895;
    add.ftz.f32 %f465, %f896, %f464;
    add.ftz.f32 %f466, %f897, %f465;
    add.ftz.f32 %f467, %f275, %f466;
    rsqrt.approx.ftz.f32 %f468, %f467;
    st.shared.f32 [_ZZ24LayerNormLargeKernel_vecI6float4EvN8dxml_mvn15MVNKernelParamsEE6rsigma], %f468;

$L__BB0_41:
    bar.sync 0;
    ld.shared.f32 %f914, [_ZZ24LayerNormLargeKernel_vecI6float4EvN8dxml_mvn15MVNKernelParamsEE6rsigma];

$L__BB0_42:
    // Pass 3: write scale * rsigma * (x - mu) + shift, optionally clamped at
    // zero (%rs15). %rs24 selects the with-scale path; %rs16 selects whether
    // scale/shift are indexed per row or shared across rows.
    setp.ge.s32 %p45, %r181, %r1;
    @%p45 bra $L__BB0_83;
    setp.eq.s16 %p46, %rs24, 0;
    not.b32 %r153, %r181;
    add.s32 %r34, %r1, %r153;
    @%p46 bra $L__BB0_67;
    setp.eq.s16 %p47, %rs16, 0;
    and.b32 %r35, %r34, 1024;
    add.s32 %r154, %r181, %r2;
    mul.wide.s32 %rd77, %r154, 16;
    add.s64 %rd17, %rd1, %rd77;
    add.s64 %rd18, %rd4, %rd77;
    @%p47 bra $L__BB0_56;
    setp.ne.s32 %p48, %r35, 0;
    @%p48 bra $L__BB0_49;
    setp.eq.s16 %p49, %rs23, 0;
    cvt.s64.s32 %rd19, %r154;
    add.s64 %rd79, %rd2, %rd77;
    ld.global.v4.f32 {%f473, %f474, %f475, %f476}, [%rd79];
    mov.f32 %f915, 0f00000000;
    mov.f32 %f916, %f915;
    mov.f32 %f917, %f915;
    mov.f32 %f918, %f915;
    @%p49 bra $L__BB0_48;
    shl.b64 %rd80, %rd19, 4;
    add.s64 %rd81, %rd3, %rd80;
    ld.global.v4.f32 {%f918, %f917, %f916, %f915}, [%rd81];

$L__BB0_48:
    ld.global.v4.f32 {%f481, %f482, %f483, %f484}, [%rd17];
    sub.ftz.f32 %f489, %f481, %f869;
    mul.ftz.f32 %f490, %f914, %f489;
    fma.rn.ftz.f32 %f491, %f473, %f490, %f918;
    sub.ftz.f32 %f492, %f482, %f869;
    mul.ftz.f32 %f493, %f914, %f492;
    fma.rn.ftz.f32 %f494, %f474, %f493, %f917;
    sub.ftz.f32 %f495, %f483, %f869;
    mul.ftz.f32 %f496, %f914, %f495;
    fma.rn.ftz.f32 %f497, %f475, %f496, %f916;
    sub.ftz.f32 %f498, %f484, %f869;
    mul.ftz.f32 %f499, %f914, %f498;
    fma.rn.ftz.f32 %f500, %f476, %f499, %f915;
    setp.lt.ftz.f32 %p50, %f491, 0f00000000;
    setp.ne.s16 %p51, %rs15, 0;
    and.pred %p52, %p51, %p50;
    setp.lt.ftz.f32 %p53, %f494, 0f00000000;
    and.pred %p54, %p51, %p53;
    setp.lt.ftz.f32 %p55, %f497, 0f00000000;
    and.pred %p56, %p51, %p55;
    setp.lt.ftz.f32 %p57, %f500, 0f00000000;
    and.pred %p58, %p51, %p57;
    selp.f32 %f501, 0f00000000, %f500, %p58;
    selp.f32 %f502, 0f00000000, %f497, %p56;
    selp.f32 %f503, 0f00000000, %f494, %p54;
    selp.f32 %f504, 0f00000000, %f491, %p52;
    st.global.v4.f32 [%rd18], {%f504, %f503, %f502, %f501};
    add.s32 %r181, %r181, 1024;

$L__BB0_49:
    setp.lt.u32 %p59, %r34, 1024;
    @%p59 bra $L__BB0_83;
    add.s32 %r156, %r181, %r2;
    mul.wide.s32 %rd82, %r156, 16;
    add.s64 %rd83, %rd82, 16384;
    add.s64 %rd111, %rd1, %rd83;
    add.s64 %rd110, %rd3, %rd83;
    add.s64 %rd109, %rd2, %rd83;
    add.s64 %rd108, %rd4, %rd83;

$L__BB0_51:
    // Main output loop of the with-scale, per-row-indexed path, 2x unrolled.
    add.s64 %rd28, %rd109, -16384;
    ld.global.v4.f32 {%f509, %f510, %f511, %f512}, [%rd109+-16384];
    add.s64 %rd29, %rd110, -16384;
    setp.eq.s16 %p60, %rs23, 0;
    mov.f32 %f923, 0f00000000;
    mov.f32 %f919, %f923;
    mov.f32 %f920, %f923;
    mov.f32 %f921, %f923;
    mov.f32 %f922, %f923;
    @%p60 bra $L__BB0_53;
    ld.global.v4.f32 {%f922, %f921, %f920, %f919}, [%rd29];

$L__BB0_53:
    add.s64 %rd30, %rd111, -16384;
    ld.global.v4.f32 {%f521, %f522, %f523, %f524}, [%rd111+-16384];
    sub.ftz.f32 %f529, %f521, %f869;
    mul.ftz.f32 %f530, %f914, %f529;
    fma.rn.ftz.f32 %f531, %f509, %f530, %f922;
    sub.ftz.f32 %f532, %f522, %f869;
    mul.ftz.f32 %f533, %f914, %f532;
    fma.rn.ftz.f32 %f534, %f510, %f533, %f921;
    sub.ftz.f32 %f535, %f523, %f869;
    mul.ftz.f32 %f536, %f914, %f535;
    fma.rn.ftz.f32 %f537, %f511, %f536, %f920;
    sub.ftz.f32 %f538, %f524, %f869;
    mul.ftz.f32 %f539, %f914, %f538;
    fma.rn.ftz.f32 %f540, %f512, %f539, %f919;
    setp.lt.ftz.f32 %p61, %f531, 0f00000000;
    setp.ne.s16 %p62, %rs15, 0;
    and.pred %p63, %p62, %p61;
    setp.lt.ftz.f32 %p64, %f534, 0f00000000;
    and.pred %p65, %p62, %p64;
    setp.lt.ftz.f32 %p66, %f537, 0f00000000;
    and.pred %p67, %p62, %p66;
    setp.lt.ftz.f32 %p68, %f540, 0f00000000;
    and.pred %p69, %p62, %p68;
    selp.f32 %f541, 0f00000000, %f540, %p69;
    selp.f32 %f542, 0f00000000, %f537, %p67;
    selp.f32 %f543, 0f00000000, %f534, %p65;
    selp.f32 %f544, 0f00000000, %f531, %p63;
    add.s64 %rd31, %rd108, -16384;
    st.global.v4.f32 [%rd108+-16384], {%f544, %f543, %f542, %f541};
    ld.global.v4.f32 {%f545, %f546, %f547, %f548}, [%rd28+16384];
    mov.f32 %f924, %f923;
    mov.f32 %f925, %f923;
    mov.f32 %f926, %f923;
    @%p60 bra $L__BB0_55;
    ld.global.v4.f32 {%f926, %f925, %f924, %f923}, [%rd29+16384];

$L__BB0_55:
    ld.global.v4.f32 {%f553, %f554, %f555, %f556}, [%rd30+16384];
    sub.ftz.f32 %f561, %f553, %f869;
    mul.ftz.f32 %f562, %f914, %f561;
    fma.rn.ftz.f32 %f563, %f545, %f562, %f926;
    sub.ftz.f32 %f564, %f554, %f869;
    mul.ftz.f32 %f565, %f914, %f564;
    fma.rn.ftz.f32 %f566, %f546, %f565, %f925;
    sub.ftz.f32 %f567, %f555, %f869;
    mul.ftz.f32 %f568, %f914, %f567;
    fma.rn.ftz.f32 %f569, %f547, %f568, %f924;
    sub.ftz.f32 %f570, %f556, %f869;
    mul.ftz.f32 %f571, %f914, %f570;
    fma.rn.ftz.f32 %f572, %f548, %f571, %f923;
    setp.lt.ftz.f32 %p71, %f563, 0f00000000;
    and.pred %p73, %p62, %p71;
    setp.lt.ftz.f32 %p74, %f566, 0f00000000;
    and.pred %p75, %p62, %p74;
    setp.lt.ftz.f32 %p76, %f569, 0f00000000;
    and.pred %p77, %p62, %p76;
    setp.lt.ftz.f32 %p78, %f572, 0f00000000;
    and.pred %p79, %p62, %p78;
    selp.f32 %f573, 0f00000000, %f572, %p79;
    selp.f32 %f574, 0f00000000, %f569, %p77;
    selp.f32 %f575, 0f00000000, %f566, %p75;
    selp.f32 %f576, 0f00000000, %f563, %p73;
    st.global.v4.f32 [%rd31+16384], {%f576, %f575, %f574, %f573};
    add.s64 %rd111, %rd111, 32768;
    add.s64 %rd110, %rd110, 32768;
    add.s64 %rd109, %rd109, 32768;
    add.s64 %rd108, %rd108, 32768;
    add.s32 %r181, %r181, 2048;
    setp.lt.s32 %p80, %r181, %r1;
    @%p80 bra $L__BB0_51;
    bra.uni $L__BB0_83;

$L__BB0_67:
    // No-scale path: out = rsigma * (x - mu) + shift.
    shr.u32 %r158, %r34, 10;
    add.s32 %r159, %r158, 1;
    and.b32 %r187, %r159, 3;
    setp.eq.s32 %p114, %r187, 0;
    @%p114 bra $L__BB0_72;
    add.s32 %r185, %r181, %r2;
    mul.wide.s32 %rd92, %r185, 16;
    add.s64 %rd117, %rd1, %rd92;
    add.s64 %rd116, %rd4, %rd92;

$L__BB0_69:
    .pragma "nounroll";
    setp.eq.s16 %p115, %rs23, 0;
    mov.f32 %f939, 0f00000000;
    mov.f32 %f940, %f939;
    mov.f32 %f941, %f939;
    mov.f32 %f942, %f939;
    @%p115 bra $L__BB0_71;
    setp.eq.s16 %p116, %rs16, 0;
    selp.b32 %r160, %r181, %r185, %p116;
    mul.wide.s32 %rd93, %r160, 16;
    add.s64 %rd94, %rd3, %rd93;
    ld.global.v4.f32 {%f942, %f941, %f940, %f939}, [%rd94];

$L__BB0_71:
    ld.global.v4.f32 {%f693, %f694, %f695, %f696}, [%rd117];
    sub.ftz.f32 %f701, %f693, %f869;
    fma.rn.ftz.f32 %f702, %f914, %f701, %f942;
    sub.ftz.f32 %f703, %f694, %f869;
    fma.rn.ftz.f32 %f704, %f914, %f703, %f941;
    sub.ftz.f32 %f705, %f695, %f869;
    fma.rn.ftz.f32 %f706, %f914, %f705, %f940;
    sub.ftz.f32 %f707, %f696, %f869;
    fma.rn.ftz.f32 %f708, %f914, %f707, %f939;
    setp.lt.ftz.f32 %p117, %f702, 0f00000000;
    setp.ne.s16 %p118, %rs15, 0;
    and.pred %p119, %p118, %p117;
    setp.lt.ftz.f32 %p120, %f704, 0f00000000;
    and.pred %p121, %p118, %p120;
    setp.lt.ftz.f32 %p122, %f706, 0f00000000;
    and.pred %p123, %p118, %p122;
    setp.lt.ftz.f32 %p124, %f708, 0f00000000;
    and.pred %p125, %p118, %p124;
    selp.f32 %f709, 0f00000000, %f708, %p125;
    selp.f32 %f710, 0f00000000, %f706, %p123;
    selp.f32 %f711, 0f00000000, %f704, %p121;
    selp.f32 %f712, 0f00000000, %f702, %p119;
    st.global.v4.f32 [%rd116], {%f712, %f711, %f710, %f709};
    add.s32 %r181, %r181, 1024;
    add.s64 %rd117, %rd117, 16384;
    add.s64 %rd116, %rd116, 16384;
    add.s32 %r185, %r185, 1024;
    add.s32 %r187, %r187, -1;
    setp.ne.s32 %p126, %r187, 0;
    @%p126 bra $L__BB0_69;

$L__BB0_72:
    setp.lt.u32 %p127, %r34, 3072;
    @%p127 bra $L__BB0_83;
    add.s32 %r161, %r181, %r2;
    add.s32 %r189, %r161, 3072;
    mul.wide.s32 %rd95, %r161, 16;
    add.s64 %rd118, %rd1, %rd95;
    add.s64 %rd119, %rd4, %rd95;

$L__BB0_74:
    // 4x-unrolled main loop of the no-scale path.
    setp.eq.s16 %p128, %rs23, 0;
    mov.f32 %f947, 0f00000000;
    mov.f32 %f943, %f947;
    mov.f32 %f944, %f947;
    mov.f32 %f945, %f947;
    mov.f32 %f946, %f947;
    @%p128 bra $L__BB0_76;
    setp.eq.s16 %p129, %rs16, 0;
    add.s32 %r162, %r2, %r181;
    selp.b32 %r163, %r181, %r162, %p129;
    mul.wide.s32 %rd96, %r163, 16;
    add.s64 %rd97, %rd3, %rd96;
    ld.global.v4.f32 {%f946, %f945, %f944, %f943}, [%rd97];

$L__BB0_76:
    ld.global.v4.f32 {%f725, %f726, %f727, %f728}, [%rd118];
    sub.ftz.f32 %f733, %f725, %f869;
    fma.rn.ftz.f32 %f734, %f914, %f733, %f946;
    sub.ftz.f32 %f735, %f726, %f869;
    fma.rn.ftz.f32 %f736, %f914, %f735, %f945;
    sub.ftz.f32 %f737, %f727, %f869;
    fma.rn.ftz.f32 %f738, %f914, %f737, %f944;
    sub.ftz.f32 %f739, %f728, %f869;
    fma.rn.ftz.f32 %f740, %f914, %f739, %f943;
    setp.lt.ftz.f32 %p130, %f734, 0f00000000;
    setp.ne.s16 %p131, %rs15, 0;
    and.pred %p132, %p131, %p130;
    setp.lt.ftz.f32 %p133, %f736, 0f00000000;
    and.pred %p134, %p131, %p133;
    setp.lt.ftz.f32 %p135, %f738, 0f00000000;
    and.pred %p136, %p131, %p135;
    setp.lt.ftz.f32 %p137, %f740, 0f00000000;
    and.pred %p138, %p131, %p137;
    selp.f32 %f741, 0f00000000, %f740, %p138;
    selp.f32 %f742, 0f00000000, %f738, %p136;
    selp.f32 %f743, 0f00000000, %f736, %p134;
    selp.f32 %f744, 0f00000000, %f734, %p132;
    st.global.v4.f32 [%rd119], {%f744, %f743, %f742, %f741};
    add.s32 %r56, %r181, 1024;
    mov.f32 %f948, %f947;
    mov.f32 %f949, %f947;
    mov.f32 %f950, %f947;
    @%p128 bra $L__BB0_78;
    setp.eq.s16 %p140, %rs16, 0;
    add.s32 %r164, %r189, -2048;
    selp.b32 %r165, %r56, %r164, %p140;
    mul.wide.s32 %rd98, %r165, 16;
    add.s64 %rd99, %rd3, %rd98;
    ld.global.v4.f32 {%f950, %f949, %f948, %f947}, [%rd99];

$L__BB0_78:
    ld.global.v4.f32 {%f753, %f754, %f755, %f756}, [%rd118+16384];
    sub.ftz.f32 %f761, %f753, %f869;
    fma.rn.ftz.f32 %f762, %f914, %f761, %f950;
    sub.ftz.f32 %f763, %f754, %f869;
    fma.rn.ftz.f32 %f764, %f914, %f763, %f949;
    sub.ftz.f32 %f765, %f755, %f869;
    fma.rn.ftz.f32 %f766, %f914, %f765, %f948;
    sub.ftz.f32 %f767, %f756, %f869;
    fma.rn.ftz.f32 %f768, %f914, %f767, %f947;
    setp.lt.ftz.f32 %p141, %f762, 0f00000000;
    mov.f32 %f955, 0f00000000;
    and.pred %p143, %p131, %p141;
    setp.lt.ftz.f32 %p144, %f764, 0f00000000;
    and.pred %p145, %p131, %p144;
    setp.lt.ftz.f32 %p146, %f766, 0f00000000;
    and.pred %p147, %p131, %p146;
    setp.lt.ftz.f32 %p148, %f768, 0f00000000;
    and.pred %p149, %p131, %p148;
    selp.f32 %f769, 0f00000000, %f768, %p149;
    selp.f32 %f770, 0f00000000, %f766, %p147;
    selp.f32 %f771, 0f00000000, %f764, %p145;
    selp.f32 %f772, 0f00000000, %f762, %p143;
    st.global.v4.f32 [%rd119+16384], {%f772, %f771, %f770, %f769};
    add.s32 %r57, %r56, 1024;
    mov.f32 %f951, %f955;
    mov.f32 %f952, %f955;
    mov.f32 %f953, %f955;
    mov.f32 %f954, %f955;
    @%p128 bra $L__BB0_80;
    setp.eq.s16 %p151, %rs16, 0;
    add.s32 %r166, %r189, -1024;
    selp.b32 %r167, %r57, %r166, %p151;
    mul.wide.s32 %rd100, %r167, 16;
    add.s64 %rd101, %rd3, %rd100;
    ld.global.v4.f32 {%f954, %f953, %f952, %f951}, [%rd101];

$L__BB0_80:
    ld.global.v4.f32 {%f781, %f782, %f783, %f784}, [%rd118+32768];
    sub.ftz.f32 %f789, %f781, %f869;
    fma.rn.ftz.f32 %f790, %f914, %f789, %f954;
    sub.ftz.f32 %f791, %f782, %f869;
    fma.rn.ftz.f32 %f792, %f914, %f791, %f953;
    sub.ftz.f32 %f793, %f783, %f869;
    fma.rn.ftz.f32 %f794, %f914, %f793, %f952;
    sub.ftz.f32 %f795, %f784, %f869;
    fma.rn.ftz.f32 %f796, %f914, %f795, %f951;
    setp.lt.ftz.f32 %p152, %f790, 0f00000000;
    and.pred %p154, %p131, %p152;
    setp.lt.ftz.f32 %p155, %f792, 0f00000000;
    and.pred %p156, %p131, %p155;
    setp.lt.ftz.f32 %p157, %f794, 0f00000000;
    and.pred %p158, %p131, %p157;
    setp.lt.ftz.f32 %p159, %f796, 0f00000000;
    and.pred %p160, %p131, %p159;
    selp.f32 %f797, 0f00000000, %f796, %p160;
    selp.f32 %f798, 0f00000000, %f794, %p158;
    selp.f32 %f799, 0f00000000, %f792, %p156;
    selp.f32 %f800, 0f00000000, %f790, %p154;
    st.global.v4.f32 [%rd119+32768], {%f800, %f799, %f798, %f797};
    add.s32 %r58, %r57, 1024;
    mov.f32 %f956, %f955;
    mov.f32 %f957, %f955;
    mov.f32 %f958, %f955;
    @%p128 bra $L__BB0_82;
    setp.eq.s16 %p162, %rs16, 0;
    selp.b32 %r168, %r58, %r189, %p162;
    mul.wide.s32 %rd102, %r168, 16;
    add.s64 %rd103, %rd3, %rd102;
    ld.global.v4.f32 {%f958, %f957, %f956, %f955}, [%rd103];

$L__BB0_82:
    add.s64 %rd65, %rd118, 65536;
    ld.global.v4.f32 {%f805, %f806, %f807, %f808}, [%rd118+49152];
    sub.ftz.f32 %f813, %f805, %f869;
    fma.rn.ftz.f32 %f814, %f914, %f813, %f958;
    sub.ftz.f32 %f815, %f806, %f869;
    fma.rn.ftz.f32 %f816, %f914, %f815, %f957;
    sub.ftz.f32 %f817, %f807, %f869;
    fma.rn.ftz.f32 %f818, %f914, %f817, %f956;
    sub.ftz.f32 %f819, %f808, %f869;
    fma.rn.ftz.f32 %f820, %f914, %f819, %f955;
    setp.lt.ftz.f32 %p163, %f814, 0f00000000;
    and.pred %p165, %p131, %p163;
    setp.lt.ftz.f32 %p166, %f816, 0f00000000;
    and.pred %p167, %p131, %p166;
    setp.lt.ftz.f32 %p168, %f818, 0f00000000;
    and.pred %p169, %p131, %p168;
    setp.lt.ftz.f32 %p170, %f820, 0f00000000;
    and.pred %p171, %p131, %p170;
    selp.f32 %f821, 0f00000000, %f820, %p171;
    selp.f32 %f822, 0f00000000, %f818, %p169;
    selp.f32 %f823, 0f00000000, %f816, %p167;
    selp.f32 %f824, 0f00000000, %f814, %p165;
    add.s64 %rd66, %rd119, 65536;
    st.global.v4.f32 [%rd119+49152], {%f824, %f823, %f822, %f821};
    add.s32 %r189, %r189, 4096;
    add.s32 %r181, %r58, 1024;
    setp.lt.s32 %p172, %r181, %r1;
    mov.u64 %rd118, %rd65;
    mov.u64 %rd119, %rd66;
    @%p172 bra $L__BB0_74;
    bra.uni $L__BB0_83;

$L__BB0_56:
    // With-scale path where scale/shift are shared across rows (%rs16 == 0).
    setp.ne.s32 %p81, %r35, 0;
    @%p81 bra $L__BB0_60;
    setp.eq.s16 %p82, %rs23, 0;
    cvt.s64.s32 %rd36, %r181;
    mul.wide.s32 %rd84, %r181, 16;
    add.s64 %rd85, %rd2, %rd84;
    ld.global.v4.f32 {%f581, %f582, %f583, %f584}, [%rd85];
    mov.f32 %f927, 0f00000000;
    mov.f32 %f928, %f927;
    mov.f32 %f929, %f927;
    mov.f32 %f930, %f927;
    @%p82 bra $L__BB0_59;
    shl.b64 %rd86, %rd36, 4;
    add.s64 %rd87, %rd3, %rd86;
    ld.global.v4.f32 {%f930, %f929, %f928, %f927}, [%rd87];

$L__BB0_59:
    ld.global.v4.f32 {%f589, %f590, %f591, %f592}, [%rd17];
    sub.ftz.f32 %f597, %f589, %f869;
    mul.ftz.f32 %f598, %f914, %f597;
    fma.rn.ftz.f32 %f599, %f581, %f598, %f930;
    sub.ftz.f32 %f600, %f590, %f869;
    mul.ftz.f32 %f601, %f914, %f600;
    fma.rn.ftz.f32 %f602, %f582, %f601, %f929;
    sub.ftz.f32 %f603, %f591, %f869;
    mul.ftz.f32 %f604, %f914, %f603;
    fma.rn.ftz.f32 %f605, %f583, %f604, %f928;
    sub.ftz.f32 %f606, %f592, %f869;
    mul.ftz.f32 %f607, %f914, %f606;
    fma.rn.ftz.f32 %f608, %f584, %f607, %f927;
    setp.lt.ftz.f32 %p83, %f599, 0f00000000;
    setp.ne.s16 %p84, %rs15, 0;
    and.pred %p85, %p84, %p83;
    setp.lt.ftz.f32 %p86, %f602, 0f00000000;
    and.pred %p87, %p84, %p86;
    setp.lt.ftz.f32 %p88, %f605, 0f00000000;
    and.pred %p89, %p84, %p88;
    setp.lt.ftz.f32 %p90, %f608, 0f00000000;
    and.pred %p91, %p84, %p90;
    selp.f32 %f609, 0f00000000, %f608, %p91;
    selp.f32 %f610, 0f00000000, %f605, %p89;
    selp.f32 %f611, 0f00000000, %f602, %p87;
    selp.f32 %f612, 0f00000000, %f599, %p85;
    st.global.v4.f32 [%rd18], {%f612, %f611, %f610, %f609};
    add.s32 %r181, %r181, 1024;

$L__BB0_60:
    setp.lt.u32 %p92, %r34, 1024;
    @%p92 bra $L__BB0_83;
    add.s32 %r157, %r181, %r2;
    mul.wide.s32 %rd88, %r157, 16;
    add.s64 %rd89, %rd88, 16384;
    add.s64 %rd115, %rd4, %rd89;
    add.s64 %rd114, %rd1, %rd89;
    mul.wide.s32 %rd90, %r181, 16;
    add.s64 %rd91, %rd90, 16384;
    add.s64 %rd113, %rd2, %rd91;
    add.s64 %rd112, %rd3, %rd91;

$L__BB0_62:
    add.s64 %rd45, %rd113, -16384;
    ld.global.v4.f32 {%f617, %f618, %f619, %f620}, [%rd113+-16384];
    add.s64 %rd46, %rd112, -16384;
    setp.eq.s16 %p93, %rs23, 0;
    mov.f32 %f935, 0f00000000;
    mov.f32 %f931, %f935;
    mov.f32 %f932, %f935;
    mov.f32 %f933, %f935;
    mov.f32 %f934, %f935;
    @%p93 bra $L__BB0_64;
    ld.global.v4.f32 {%f934, %f933, %f932, %f931}, [%rd46];

$L__BB0_64:
    add.s64 %rd47, %rd114, -16384;
    ld.global.v4.f32 {%f629, %f630, %f631, %f632}, [%rd114+-16384];
    sub.ftz.f32 %f637, %f629, %f869;
    mul.ftz.f32 %f638, %f914, %f637;
    fma.rn.ftz.f32 %f639, %f617, %f638, %f934;
    sub.ftz.f32 %f640, %f630, %f869;
    mul.ftz.f32 %f641, %f914, %f640;
    fma.rn.ftz.f32 %f642, %f618, %f641, %f933;
    sub.ftz.f32 %f643, %f631, %f869;
    mul.ftz.f32 %f644, %f914, %f643;
    fma.rn.ftz.f32 %f645, %f619, %f644, %f932;
    sub.ftz.f32 %f646, %f632, %f869;
    mul.ftz.f32 %f647, %f914, %f646;
    fma.rn.ftz.f32 %f648, %f620, %f647, %f931;
    setp.lt.ftz.f32 %p94, %f639, 0f00000000;
    setp.ne.s16 %p95, %rs15, 0;
    and.pred %p96, %p95, %p94;
    setp.lt.ftz.f32 %p97, %f642, 0f00000000;
    and.pred %p98, %p95, %p97;
    setp.lt.ftz.f32 %p99, %f645, 0f00000000;
    and.pred %p100, %p95, %p99;
    setp.lt.ftz.f32 %p101, %f648, 0f00000000;
    and.pred %p102, %p95, %p101;
    selp.f32 %f649, 0f00000000, %f648, %p102;
    selp.f32 %f650, 0f00000000, %f645, %p100;
    selp.f32 %f651, 0f00000000, %f642, %p98;
    selp.f32 %f652, 0f00000000, %f639, %p96;
    add.s64 %rd48, %rd115, -16384;
    st.global.v4.f32 [%rd115+-16384], {%f652, %f651, %f650, %f649};
    ld.global.v4.f32 {%f653, %f654, %f655, %f656}, [%rd45+16384];
    mov.f32 %f936, %f935;
    mov.f32 %f937, %f935;
    mov.f32 %f938, %f935;
    @%p93 bra $L__BB0_66;
    ld.global.v4.f32 {%f938, %f937, %f936, %f935}, [%rd46+16384];

$L__BB0_66:
    ld.global.v4.f32 {%f661, %f662, %f663, %f664}, [%rd47+16384];
    sub.ftz.f32 %f669, %f661, %f869;
    mul.ftz.f32 %f670, %f914, %f669;
    fma.rn.ftz.f32 %f671, %f653, %f670, %f938;
    sub.ftz.f32 %f672, %f662, %f869;
    mul.ftz.f32 %f673, %f914, %f672;
    fma.rn.ftz.f32 %f674, %f654, %f673, %f937;
    sub.ftz.f32 %f675, %f663, %f869;
    mul.ftz.f32 %f676, %f914, %f675;
    fma.rn.ftz.f32 %f677, %f655, %f676, %f936;
    sub.ftz.f32 %f678, %f664, %f869;
    mul.ftz.f32 %f679, %f914, %f678;
    fma.rn.ftz.f32 %f680, %f656, %f679, %f935;
    setp.lt.ftz.f32 %p104, %f671, 0f00000000;
    and.pred %p106, %p95, %p104;
    setp.lt.ftz.f32 %p107, %f674, 0f00000000;
    and.pred %p108, %p95, %p107;
    setp.lt.ftz.f32 %p109, %f677, 0f00000000;
    and.pred %p110, %p95, %p109;
    setp.lt.ftz.f32 %p111, %f680, 0f00000000;
    and.pred %p112, %p95, %p111;
    selp.f32 %f681, 0f00000000, %f680, %p112;
    selp.f32 %f682, 0f00000000, %f677, %p110;
    selp.f32 %f683, 0f00000000, %f674, %p108;
    selp.f32 %f684, 0f00000000, %f671, %p106;
    st.global.v4.f32 [%rd48+16384], {%f684, %f683, %f682, %f681};
    add.s64 %rd115, %rd115, 32768;
    add.s64 %rd114, %rd114, 32768;
    add.s64 %rd113, %rd113, 32768;
    add.s64 %rd112, %rd112, 32768;
    add.s32 %r181, %r181, 2048;
    setp.lt.s32 %p113, %r181, %r1;
    @%p113 bra $L__BB0_62;

$L__BB0_83:
    ret;
}
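
// ------------------------------------------------------------------------
// For orientation, a minimal CUDA C++ sketch of the algorithm the PTX above
// implements, reconstructed from the instruction flow. The struct layout,
// field names, flag meanings, and the kernel/helper names are assumptions
// inferred from the parameter-load offsets and branch structure; this is a
// sketch, not the shipped source. The real kernel also keeps four parallel
// accumulators and unrolls its loops, which is elided here for clarity.
// ------------------------------------------------------------------------
#include <cuda_runtime.h>

struct MVNKernelParams {           // hypothetical view of the 88-byte param block
    const float4 *in;              // +0   input, one row per block
    const float4 *scale;           // +8   gamma, read when has_scale
    const float4 *shift;           // +16  beta,  read when has_shift
    float4       *out;             // +24  output
    int           n;               // +32  floats per row
    float         inv_n;           // +36  1.0f / n
    float         eps;             // +40
    bool          compute_mean;    // +44  else mu stays 0
    bool          compute_rsigma;  // +45  else rsigma stays 1
    bool          has_shift;       // +46
    bool          has_scale;       // +47
    bool          relu;            // +48  clamp negative outputs to 0
    bool          per_row_affine;  // +49  index gamma/beta per row vs shared
};

// Block-wide sum: warp shfl-down tree, one shared-memory exchange, then a
// second tree in the first warp -- the same shape as the PTX reduction.
__device__ float block_sum(float v, float *smem) {
    for (int off = warpSize / 2; off > 0; off >>= 1)
        v += __shfl_down_sync(0xffffffffu, v, off);
    int lane = threadIdx.x % warpSize, warp = threadIdx.x / warpSize;
    if (lane == 0) smem[warp] = v;
    __syncthreads();
    if (threadIdx.x < warpSize) {
        int nwarps = blockDim.x / warpSize;
        v = lane < nwarps ? smem[lane] : 0.0f;
        for (int off = warpSize / 2; off > 0; off >>= 1)
            v += __shfl_down_sync(0xffffffffu, v, off);
    }
    return v;                      // meaningful in thread 0 only
}

__global__ void mvn_vec_fp32_large(MVNKernelParams p) {
    __shared__ float smem[32];
    __shared__ float s_mu, s_rsigma;
    const int vecs = p.n / 4;           // float4 elements per row (%r1)
    const int row  = blockIdx.x * vecs; // this block's row base (%r2)

    float mu = 0.0f;
    if (p.compute_mean) {               // pass 1: mean
        float s = 0.0f;
        for (int i = threadIdx.x; i < vecs; i += blockDim.x) {
            float4 v = p.in[row + i];
            s += v.x + v.y + v.z + v.w;
        }
        s = block_sum(s, smem);
        if (threadIdx.x == 0) s_mu = p.inv_n * s;
        __syncthreads();
        mu = s_mu;
    }

    float rs = 1.0f;
    if (p.compute_rsigma) {             // pass 2: rsigma = rsqrt(eps + var)
        float s = 0.0f;
        for (int i = threadIdx.x; i < vecs; i += blockDim.x) {
            float4 v = p.in[row + i];
            float dx = v.x - mu, dy = v.y - mu, dz = v.z - mu, dw = v.w - mu;
            s += dx * dx + dy * dy + dz * dz + dw * dw;
        }
        s = block_sum(p.inv_n * s, smem);   // partials pre-scaled by 1/N, as in the PTX
        if (threadIdx.x == 0) s_rsigma = rsqrtf(p.eps + s);
        __syncthreads();
        rs = s_rsigma;
    }

    // pass 3: out = gamma * rsigma * (x - mu) + beta, optional ReLU clamp
    for (int i = threadIdx.x; i < vecs; i += blockDim.x) {
        int gi = row + i;
        int ai = p.per_row_affine ? gi : i;  // affine index mode (flag at +49)
        float4 v = p.in[gi];
        float4 g = p.has_scale ? p.scale[ai] : make_float4(1.f, 1.f, 1.f, 1.f);
        float4 b = p.has_shift ? p.shift[ai] : make_float4(0.f, 0.f, 0.f, 0.f);
        float4 o;
        o.x = g.x * (rs * (v.x - mu)) + b.x;
        o.y = g.y * (rs * (v.y - mu)) + b.y;
        o.z = g.z * (rs * (v.z - mu)) + b.z;
        o.w = g.w * (rs * (v.w - mu)) + b.w;
        if (p.relu) {
            o.x = fmaxf(o.x, 0.f); o.y = fmaxf(o.y, 0.f);
            o.z = fmaxf(o.z, 0.f); o.w = fmaxf(o.w, 0.f);
        }
        p.out[gi] = o;
    }
}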