LayerNormLargeKernel_vecI7__half4EvN8dxml_mvn15MVNKernelParamsEE4temp has been demoted // _ZZ24LayerNormLargeKernel_vecI7__half4EvN8dxml_mvn15MVNKernelParamsEE2mu has been demoted // _ZZ24LayerNormLargeKernel_vecI7__half4EvN8dxml_mvn15MVNKernelParamsEE6rsigma has been demoted .visible .entry MVNKernel_vec_fp16_large( .param .align 8 .b8 MVNKernel_vec_fp16_large_param_0[88] ) { .reg .pred %p<173>; .reg .b16 %rs<403>; .reg .f32 %f<875>; .reg .b32 %r<191>; .reg .b64 %rd<120>; // demoted variable .shared .align 16 .b8 _ZZ24LayerNormLargeKernel_vecI7__half4EvN8dxml_mvn15MVNKernelParamsEE4temp[512]; // demoted variable .shared .align 4 .f32 _ZZ24LayerNormLargeKernel_vecI7__half4EvN8dxml_mvn15MVNKernelParamsEE2mu; // demoted variable .shared .align 4 .f32 _ZZ24LayerNormLargeKernel_vecI7__half4EvN8dxml_mvn15MVNKernelParamsEE6rsigma; ld.param.v2.u8 {%rs15, %rs16}, [MVNKernel_vec_fp16_large_param_0+48]; ld.param.v4.u8 {%rs21, %rs22, %rs23, %rs24}, [MVNKernel_vec_fp16_large_param_0+44]; ld.param.f32 %f275, [MVNKernel_vec_fp16_large_param_0+40]; ld.param.f32 %f274, [MVNKernel_vec_fp16_large_param_0+36]; ld.param.u32 %r61, [MVNKernel_vec_fp16_large_param_0+32]; ld.param.u64 %rd70, [MVNKernel_vec_fp16_large_param_0+24]; ld.param.u64 %rd69, [MVNKernel_vec_fp16_large_param_0+16]; ld.param.u64 %rd68, [MVNKernel_vec_fp16_large_param_0+8]; ld.param.u64 %rd67, [MVNKernel_vec_fp16_large_param_0]; cvta.to.global.u64 %rd1, %rd67; cvta.to.global.u64 %rd2, %rd68; cvta.to.global.u64 %rd3, %rd69; cvta.to.global.u64 %rd4, %rd70; shr.u32 %r1, %r61, 2; mov.u32 %r76, %ctaid.x; mul.lo.s32 %r2, %r1, %r76; setp.eq.s16 %p1, %rs21, 0; mov.f32 %f785, 0f00000000; mov.u32 %r181, %tid.x; @%p1 bra $L__BB0_21; setp.ge.s32 %p2, %r181, %r1; mov.f32 %f757, 0f00000000; mov.f32 %f758, %f757; mov.f32 %f759, %f757; mov.f32 %f760, %f757; @%p2 bra $L__BB0_8; not.b32 %r77, %r181; add.s32 %r4, %r1, %r77; shr.u32 %r78, %r4, 10; add.s32 %r79, %r78, 1; and.b32 %r170, %r79, 3; setp.eq.s32 %p3, %r170, 0; mov.f32 %f760, 0f00000000; mov.u32 %r171, %r181; mov.f32 %f759, %f760; mov.f32 %f758, %f760; mov.f32 %f757, %f760; @%p3 bra $L__BB0_5; add.s32 %r80, %r181, %r2; mul.wide.s32 %rd71, %r80, 8; add.s64 %rd104, %rd1, %rd71; mov.u32 %r171, %r181; $L__BB0_4: .pragma "nounroll"; ld.global.v4.u16 {%rs29, %rs30, %rs31, %rs32}, [%rd104]; // begin inline asm { cvt.f32.f16 %f290, %rs29;} // end inline asm // begin inline asm { cvt.f32.f16 %f291, %rs30;} // end inline asm // begin inline asm { cvt.f32.f16 %f292, %rs31;} // end inline asm // begin inline asm { cvt.f32.f16 %f293, %rs32;} // end inline asm add.ftz.f32 %f757, %f757, %f290; add.ftz.f32 %f758, %f758, %f291; add.ftz.f32 %f759, %f759, %f292; add.ftz.f32 %f760, %f760, %f293; add.s32 %r171, %r171, 1024; add.s64 %rd104, %rd104, 8192; add.s32 %r170, %r170, -1; setp.ne.s32 %p4, %r170, 0; @%p4 bra $L__BB0_4; $L__BB0_5: setp.lt.u32 %p5, %r4, 3072; @%p5 bra $L__BB0_8; add.s32 %r81, %r171, %r2; mul.wide.s32 %rd72, %r81, 8; add.s64 %rd73, %rd1, %rd72; add.s64 %rd105, %rd73, 16384; $L__BB0_7: ld.global.v4.u16 {%rs49, %rs50, %rs51, %rs52}, [%rd105+-16384]; // begin inline asm { cvt.f32.f16 %f294, %rs49;} // end inline asm // begin inline asm { cvt.f32.f16 %f295, %rs50;} // end inline asm // begin inline asm { cvt.f32.f16 %f296, %rs51;} // end inline asm // begin inline asm { cvt.f32.f16 %f297, %rs52;} // end inline asm add.ftz.f32 %f310, %f757, %f294; add.ftz.f32 %f311, %f758, %f295; add.ftz.f32 %f312, %f759, %f296; add.ftz.f32 %f313, %f760, %f297; ld.global.v4.u16 {%rs53, %rs54, %rs55, %rs56}, [%rd105+-8192]; // 
begin inline asm { cvt.f32.f16 %f298, %rs53;} // end inline asm // begin inline asm { cvt.f32.f16 %f299, %rs54;} // end inline asm // begin inline asm { cvt.f32.f16 %f300, %rs55;} // end inline asm // begin inline asm { cvt.f32.f16 %f301, %rs56;} // end inline asm add.ftz.f32 %f314, %f310, %f298; add.ftz.f32 %f315, %f311, %f299; add.ftz.f32 %f316, %f312, %f300; add.ftz.f32 %f317, %f313, %f301; ld.global.v4.u16 {%rs57, %rs58, %rs59, %rs60}, [%rd105]; // begin inline asm { cvt.f32.f16 %f302, %rs57;} // end inline asm // begin inline asm { cvt.f32.f16 %f303, %rs58;} // end inline asm // begin inline asm { cvt.f32.f16 %f304, %rs59;} // end inline asm // begin inline asm { cvt.f32.f16 %f305, %rs60;} // end inline asm add.ftz.f32 %f318, %f314, %f302; add.ftz.f32 %f319, %f315, %f303; add.ftz.f32 %f320, %f316, %f304; add.ftz.f32 %f321, %f317, %f305; ld.global.v4.u16 {%rs61, %rs62, %rs63, %rs64}, [%rd105+8192]; // begin inline asm { cvt.f32.f16 %f306, %rs61;} // end inline asm // begin inline asm { cvt.f32.f16 %f307, %rs62;} // end inline asm // begin inline asm { cvt.f32.f16 %f308, %rs63;} // end inline asm // begin inline asm { cvt.f32.f16 %f309, %rs64;} // end inline asm add.ftz.f32 %f757, %f318, %f306; add.ftz.f32 %f758, %f319, %f307; add.ftz.f32 %f759, %f320, %f308; add.ftz.f32 %f760, %f321, %f309; add.s64 %rd105, %rd105, 32768; add.s32 %r171, %r171, 4096; setp.lt.s32 %p6, %r171, %r1; @%p6 bra $L__BB0_7; $L__BB0_8: mov.u32 %r174, WARP_SZ; setp.lt.s32 %p7, %r174, 2; @%p7 bra $L__BB0_11; mov.u32 %r173, %r174; $L__BB0_10: mov.b32 %r82, %f757; shr.u32 %r83, %r173, 31; add.s32 %r84, %r173, %r83; shr.s32 %r15, %r84, 1; mov.u32 %r85, 31; mov.u32 %r86, -1; shfl.sync.down.b32 %r87|%p8, %r82, %r15, %r85, %r86; mov.b32 %f322, %r87; add.ftz.f32 %f757, %f757, %f322; mov.b32 %r88, %f758; shfl.sync.down.b32 %r89|%p9, %r88, %r15, %r85, %r86; mov.b32 %f323, %r89; add.ftz.f32 %f758, %f758, %f323; mov.b32 %r90, %f759; shfl.sync.down.b32 %r91|%p10, %r90, %r15, %r85, %r86; mov.b32 %f324, %r91; add.ftz.f32 %f759, %f759, %f324; mov.b32 %r92, %f760; shfl.sync.down.b32 %r93|%p11, %r92, %r15, %r85, %r86; mov.b32 %f325, %r93; add.ftz.f32 %f760, %f760, %f325; setp.gt.s32 %p12, %r173, 3; mov.u32 %r173, %r15; @%p12 bra $L__BB0_10; $L__BB0_11: rem.u32 %r16, %r181, %r174; setp.ne.s32 %p13, %r16, 0; @%p13 bra $L__BB0_13; div.u32 %r94, %r181, %r174; shl.b32 %r95, %r94, 4; mov.u32 %r96, _ZZ24LayerNormLargeKernel_vecI7__half4EvN8dxml_mvn15MVNKernelParamsEE4temp; add.s32 %r97, %r96, %r95; st.shared.v4.f32 [%r97], {%f757, %f758, %f759, %f760}; $L__BB0_13: bar.sync 0; setp.le.u32 %p14, %r174, %r181; @%p14 bra $L__BB0_18; mov.u32 %r98, %ntid.x; div.u32 %r99, %r98, %r174; setp.ge.s32 %p15, %r16, %r99; mov.f32 %f757, 0f00000000; mov.f32 %f758, %f757; mov.f32 %f759, %f757; mov.f32 %f760, %f757; @%p15 bra $L__BB0_16; shl.b32 %r100, %r16, 4; mov.u32 %r101, _ZZ24LayerNormLargeKernel_vecI7__half4EvN8dxml_mvn15MVNKernelParamsEE4temp; add.s32 %r102, %r101, %r100; ld.shared.v4.f32 {%f757, %f758, %f759, %f760}, [%r102]; $L__BB0_16: @%p7 bra $L__BB0_18; $L__BB0_17: mov.b32 %r103, %f757; shr.u32 %r104, %r174, 31; add.s32 %r105, %r174, %r104; shr.s32 %r18, %r105, 1; mov.u32 %r106, 31; mov.u32 %r107, -1; shfl.sync.down.b32 %r108|%p17, %r103, %r18, %r106, %r107; mov.b32 %f334, %r108; add.ftz.f32 %f757, %f757, %f334; mov.b32 %r109, %f758; shfl.sync.down.b32 %r110|%p18, %r109, %r18, %r106, %r107; mov.b32 %f335, %r110; add.ftz.f32 %f758, %f758, %f335; mov.b32 %r111, %f759; shfl.sync.down.b32 %r112|%p19, %r111, %r18, %r106, %r107; mov.b32 %f336, %r112; 
add.ftz.f32 %f759, %f759, %f336; mov.b32 %r113, %f760; shfl.sync.down.b32 %r114|%p20, %r113, %r18, %r106, %r107; mov.b32 %f337, %r114; add.ftz.f32 %f760, %f760, %f337; setp.gt.s32 %p21, %r174, 3; mov.u32 %r174, %r18; @%p21 bra $L__BB0_17; $L__BB0_18: bar.sync 0; setp.ne.s32 %p22, %r181, 0; @%p22 bra $L__BB0_20; add.ftz.f32 %f338, %f757, %f758; add.ftz.f32 %f339, %f759, %f338; add.ftz.f32 %f340, %f760, %f339; mul.ftz.f32 %f341, %f274, %f340; st.shared.f32 [_ZZ24LayerNormLargeKernel_vecI7__half4EvN8dxml_mvn15MVNKernelParamsEE2mu], %f341; $L__BB0_20: bar.sync 0; ld.shared.f32 %f785, [_ZZ24LayerNormLargeKernel_vecI7__half4EvN8dxml_mvn15MVNKernelParamsEE2mu]; $L__BB0_21: setp.eq.s16 %p23, %rs22, 0; mov.f32 %f830, 0f3F800000; @%p23 bra $L__BB0_42; setp.ge.s32 %p24, %r181, %r1; mov.f32 %f802, 0f00000000; mov.f32 %f803, %f802; mov.f32 %f804, %f802; mov.f32 %f805, %f802; @%p24 bra $L__BB0_29; not.b32 %r115, %r181; add.s32 %r19, %r1, %r115; shr.u32 %r116, %r19, 10; add.s32 %r117, %r116, 1; and.b32 %r176, %r117, 3; setp.eq.s32 %p25, %r176, 0; mov.f32 %f805, 0f00000000; mov.u32 %r177, %r181; mov.f32 %f804, %f805; mov.f32 %f803, %f805; mov.f32 %f802, %f805; @%p25 bra $L__BB0_26; add.s32 %r118, %r181, %r2; mul.wide.s32 %rd74, %r118, 8; add.s64 %rd106, %rd1, %rd74; mov.u32 %r177, %r181; $L__BB0_25: .pragma "nounroll"; ld.global.v4.u16 {%rs70, %rs71, %rs72, %rs73}, [%rd106]; // begin inline asm { cvt.f32.f16 %f356, %rs70;} // end inline asm // begin inline asm { cvt.f32.f16 %f357, %rs71;} // end inline asm // begin inline asm { cvt.f32.f16 %f358, %rs72;} // end inline asm // begin inline asm { cvt.f32.f16 %f359, %rs73;} // end inline asm sub.ftz.f32 %f360, %f356, %f785; fma.rn.ftz.f32 %f805, %f360, %f360, %f805; sub.ftz.f32 %f361, %f357, %f785; fma.rn.ftz.f32 %f804, %f361, %f361, %f804; sub.ftz.f32 %f362, %f358, %f785; fma.rn.ftz.f32 %f803, %f362, %f362, %f803; sub.ftz.f32 %f363, %f359, %f785; fma.rn.ftz.f32 %f802, %f363, %f363, %f802; add.s32 %r177, %r177, 1024; add.s64 %rd106, %rd106, 8192; add.s32 %r176, %r176, -1; setp.ne.s32 %p26, %r176, 0; @%p26 bra $L__BB0_25; $L__BB0_26: setp.lt.u32 %p27, %r19, 3072; @%p27 bra $L__BB0_29; add.s32 %r119, %r177, %r2; mul.wide.s32 %rd75, %r119, 8; add.s64 %rd76, %rd1, %rd75; add.s64 %rd107, %rd76, 16384; $L__BB0_28: ld.global.v4.u16 {%rs90, %rs91, %rs92, %rs93}, [%rd107+-16384]; // begin inline asm { cvt.f32.f16 %f364, %rs90;} // end inline asm // begin inline asm { cvt.f32.f16 %f365, %rs91;} // end inline asm // begin inline asm { cvt.f32.f16 %f366, %rs92;} // end inline asm // begin inline asm { cvt.f32.f16 %f367, %rs93;} // end inline asm sub.ftz.f32 %f380, %f364, %f785; fma.rn.ftz.f32 %f381, %f380, %f380, %f805; sub.ftz.f32 %f382, %f365, %f785; fma.rn.ftz.f32 %f383, %f382, %f382, %f804; sub.ftz.f32 %f384, %f366, %f785; fma.rn.ftz.f32 %f385, %f384, %f384, %f803; sub.ftz.f32 %f386, %f367, %f785; fma.rn.ftz.f32 %f387, %f386, %f386, %f802; ld.global.v4.u16 {%rs94, %rs95, %rs96, %rs97}, [%rd107+-8192]; // begin inline asm { cvt.f32.f16 %f368, %rs94;} // end inline asm // begin inline asm { cvt.f32.f16 %f369, %rs95;} // end inline asm // begin inline asm { cvt.f32.f16 %f370, %rs96;} // end inline asm // begin inline asm { cvt.f32.f16 %f371, %rs97;} // end inline asm sub.ftz.f32 %f388, %f368, %f785; fma.rn.ftz.f32 %f389, %f388, %f388, %f381; sub.ftz.f32 %f390, %f369, %f785; fma.rn.ftz.f32 %f391, %f390, %f390, %f383; sub.ftz.f32 %f392, %f370, %f785; fma.rn.ftz.f32 %f393, %f392, %f392, %f385; sub.ftz.f32 %f394, %f371, %f785; fma.rn.ftz.f32 %f395, %f394, %f394, %f387; 
ld.global.v4.u16 {%rs98, %rs99, %rs100, %rs101}, [%rd107]; // begin inline asm { cvt.f32.f16 %f372, %rs98;} // end inline asm // begin inline asm { cvt.f32.f16 %f373, %rs99;} // end inline asm // begin inline asm { cvt.f32.f16 %f374, %rs100;} // end inline asm // begin inline asm { cvt.f32.f16 %f375, %rs101;} // end inline asm sub.ftz.f32 %f396, %f372, %f785; fma.rn.ftz.f32 %f397, %f396, %f396, %f389; sub.ftz.f32 %f398, %f373, %f785; fma.rn.ftz.f32 %f399, %f398, %f398, %f391; sub.ftz.f32 %f400, %f374, %f785; fma.rn.ftz.f32 %f401, %f400, %f400, %f393; sub.ftz.f32 %f402, %f375, %f785; fma.rn.ftz.f32 %f403, %f402, %f402, %f395; ld.global.v4.u16 {%rs102, %rs103, %rs104, %rs105}, [%rd107+8192]; // begin inline asm { cvt.f32.f16 %f376, %rs102;} // end inline asm // begin inline asm { cvt.f32.f16 %f377, %rs103;} // end inline asm // begin inline asm { cvt.f32.f16 %f378, %rs104;} // end inline asm // begin inline asm { cvt.f32.f16 %f379, %rs105;} // end inline asm sub.ftz.f32 %f404, %f376, %f785; fma.rn.ftz.f32 %f805, %f404, %f404, %f397; sub.ftz.f32 %f405, %f377, %f785; fma.rn.ftz.f32 %f804, %f405, %f405, %f399; sub.ftz.f32 %f406, %f378, %f785; fma.rn.ftz.f32 %f803, %f406, %f406, %f401; sub.ftz.f32 %f407, %f379, %f785; fma.rn.ftz.f32 %f802, %f407, %f407, %f403; add.s64 %rd107, %rd107, 32768; add.s32 %r177, %r177, 4096; setp.lt.s32 %p28, %r177, %r1; @%p28 bra $L__BB0_28; $L__BB0_29: mul.ftz.f32 %f810, %f274, %f805; mul.ftz.f32 %f811, %f274, %f804; mul.ftz.f32 %f812, %f274, %f803; mul.ftz.f32 %f813, %f274, %f802; mov.u32 %r180, WARP_SZ; setp.lt.s32 %p29, %r180, 2; @%p29 bra $L__BB0_32; mov.u32 %r179, %r180; $L__BB0_31: mov.b32 %r120, %f810; shr.u32 %r121, %r179, 31; add.s32 %r122, %r179, %r121; shr.s32 %r30, %r122, 1; mov.u32 %r123, 31; mov.u32 %r124, -1; shfl.sync.down.b32 %r125|%p30, %r120, %r30, %r123, %r124; mov.b32 %f408, %r125; add.ftz.f32 %f810, %f810, %f408; mov.b32 %r126, %f811; shfl.sync.down.b32 %r127|%p31, %r126, %r30, %r123, %r124; mov.b32 %f409, %r127; add.ftz.f32 %f811, %f811, %f409; mov.b32 %r128, %f812; shfl.sync.down.b32 %r129|%p32, %r128, %r30, %r123, %r124; mov.b32 %f410, %r129; add.ftz.f32 %f812, %f812, %f410; mov.b32 %r130, %f813; shfl.sync.down.b32 %r131|%p33, %r130, %r30, %r123, %r124; mov.b32 %f411, %r131; add.ftz.f32 %f813, %f813, %f411; setp.gt.s32 %p34, %r179, 3; mov.u32 %r179, %r30; @%p34 bra $L__BB0_31; $L__BB0_32: rem.u32 %r31, %r181, %r180; setp.ne.s32 %p35, %r31, 0; @%p35 bra $L__BB0_34; div.u32 %r132, %r181, %r180; shl.b32 %r133, %r132, 4; mov.u32 %r134, _ZZ24LayerNormLargeKernel_vecI7__half4EvN8dxml_mvn15MVNKernelParamsEE4temp; add.s32 %r135, %r134, %r133; st.shared.v4.f32 [%r135], {%f810, %f811, %f812, %f813}; $L__BB0_34: bar.sync 0; setp.le.u32 %p36, %r180, %r181; @%p36 bra $L__BB0_39; mov.u32 %r136, %ntid.x; div.u32 %r137, %r136, %r180; setp.ge.s32 %p37, %r31, %r137; mov.f32 %f810, 0f00000000; mov.f32 %f811, %f810; mov.f32 %f812, %f810; mov.f32 %f813, %f810; @%p37 bra $L__BB0_37; shl.b32 %r138, %r31, 4; mov.u32 %r139, _ZZ24LayerNormLargeKernel_vecI7__half4EvN8dxml_mvn15MVNKernelParamsEE4temp; add.s32 %r140, %r139, %r138; ld.shared.v4.f32 {%f810, %f811, %f812, %f813}, [%r140]; $L__BB0_37: @%p29 bra $L__BB0_39; $L__BB0_38: mov.b32 %r141, %f810; shr.u32 %r142, %r180, 31; add.s32 %r143, %r180, %r142; shr.s32 %r33, %r143, 1; mov.u32 %r144, 31; mov.u32 %r145, -1; shfl.sync.down.b32 %r146|%p39, %r141, %r33, %r144, %r145; mov.b32 %f420, %r146; add.ftz.f32 %f810, %f810, %f420; mov.b32 %r147, %f811; shfl.sync.down.b32 %r148|%p40, %r147, %r33, %r144, %r145; mov.b32 %f421, 
%r148; add.ftz.f32 %f811, %f811, %f421; mov.b32 %r149, %f812; shfl.sync.down.b32 %r150|%p41, %r149, %r33, %r144, %r145; mov.b32 %f422, %r150; add.ftz.f32 %f812, %f812, %f422; mov.b32 %r151, %f813; shfl.sync.down.b32 %r152|%p42, %r151, %r33, %r144, %r145; mov.b32 %f423, %r152; add.ftz.f32 %f813, %f813, %f423; setp.gt.s32 %p43, %r180, 3; mov.u32 %r180, %r33; @%p43 bra $L__BB0_38; $L__BB0_39: bar.sync 0; setp.ne.s32 %p44, %r181, 0; @%p44 bra $L__BB0_41; add.ftz.f32 %f424, %f810, %f811; add.ftz.f32 %f425, %f812, %f424; add.ftz.f32 %f426, %f813, %f425; add.ftz.f32 %f427, %f275, %f426; rsqrt.approx.ftz.f32 %f428, %f427; st.shared.f32 [_ZZ24LayerNormLargeKernel_vecI7__half4EvN8dxml_mvn15MVNKernelParamsEE6rsigma], %f428; $L__BB0_41: bar.sync 0; ld.shared.f32 %f830, [_ZZ24LayerNormLargeKernel_vecI7__half4EvN8dxml_mvn15MVNKernelParamsEE6rsigma]; $L__BB0_42: setp.ge.s32 %p45, %r181, %r1; @%p45 bra $L__BB0_83; setp.eq.s16 %p46, %rs24, 0; not.b32 %r153, %r181; add.s32 %r34, %r1, %r153; @%p46 bra $L__BB0_67; setp.eq.s16 %p47, %rs16, 0; and.b32 %r35, %r34, 1024; add.s32 %r154, %r181, %r2; mul.wide.s32 %rd77, %r154, 8; add.s64 %rd17, %rd1, %rd77; add.s64 %rd18, %rd4, %rd77; @%p47 bra $L__BB0_56; setp.ne.s32 %p48, %r35, 0; @%p48 bra $L__BB0_49; setp.eq.s16 %p49, %rs23, 0; cvt.s64.s32 %rd19, %r154; add.s64 %rd79, %rd2, %rd77; ld.global.v4.u16 {%rs113, %rs114, %rs115, %rs116}, [%rd79]; // begin inline asm { cvt.f32.f16 %f429, %rs113;} // end inline asm // begin inline asm { cvt.f32.f16 %f430, %rs114;} // end inline asm // begin inline asm { cvt.f32.f16 %f431, %rs115;} // end inline asm // begin inline asm { cvt.f32.f16 %f432, %rs116;} // end inline asm mov.f32 %f831, 0f00000000; mov.f32 %f832, %f831; mov.f32 %f833, %f831; mov.f32 %f834, %f831; @%p49 bra $L__BB0_48; shl.b64 %rd80, %rd19, 3; add.s64 %rd81, %rd3, %rd80; ld.global.v4.u16 {%rs121, %rs122, %rs123, %rs124}, [%rd81]; // begin inline asm { cvt.f32.f16 %f834, %rs121;} // end inline asm // begin inline asm { cvt.f32.f16 %f833, %rs122;} // end inline asm // begin inline asm { cvt.f32.f16 %f832, %rs123;} // end inline asm // begin inline asm { cvt.f32.f16 %f831, %rs124;} // end inline asm $L__BB0_48: ld.global.v4.u16 {%rs133, %rs134, %rs135, %rs136}, [%rd17]; // begin inline asm { cvt.f32.f16 %f441, %rs133;} // end inline asm // begin inline asm { cvt.f32.f16 %f442, %rs134;} // end inline asm // begin inline asm { cvt.f32.f16 %f443, %rs135;} // end inline asm // begin inline asm { cvt.f32.f16 %f444, %rs136;} // end inline asm sub.ftz.f32 %f449, %f441, %f785; mul.ftz.f32 %f450, %f830, %f449; fma.rn.ftz.f32 %f451, %f429, %f450, %f834; sub.ftz.f32 %f452, %f442, %f785; mul.ftz.f32 %f453, %f830, %f452; fma.rn.ftz.f32 %f454, %f430, %f453, %f833; sub.ftz.f32 %f455, %f443, %f785; mul.ftz.f32 %f456, %f830, %f455; fma.rn.ftz.f32 %f457, %f431, %f456, %f832; sub.ftz.f32 %f458, %f444, %f785; mul.ftz.f32 %f459, %f830, %f458; fma.rn.ftz.f32 %f460, %f432, %f459, %f831; setp.lt.ftz.f32 %p50, %f451, 0f00000000; setp.ne.s16 %p51, %rs15, 0; and.pred %p52, %p51, %p50; selp.f32 %f445, 0f00000000, %f451, %p52; setp.lt.ftz.f32 %p53, %f454, 0f00000000; and.pred %p54, %p51, %p53; selp.f32 %f446, 0f00000000, %f454, %p54; setp.lt.ftz.f32 %p55, %f457, 0f00000000; and.pred %p56, %p51, %p55; selp.f32 %f447, 0f00000000, %f457, %p56; setp.lt.ftz.f32 %p57, %f460, 0f00000000; and.pred %p58, %p51, %p57; selp.f32 %f448, 0f00000000, %f460, %p58; // begin inline asm { cvt.rn.f16.f32 %rs132, %f448;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs131, %f447;} // end inline asm // 
begin inline asm { cvt.rn.f16.f32 %rs130, %f446;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs129, %f445;} // end inline asm st.global.v4.u16 [%rd18], {%rs129, %rs130, %rs131, %rs132}; add.s32 %r181, %r181, 1024; $L__BB0_49: setp.lt.u32 %p59, %r34, 1024; @%p59 bra $L__BB0_83; add.s32 %r156, %r181, %r2; mul.wide.s32 %rd82, %r156, 8; add.s64 %rd83, %rd82, 8192; add.s64 %rd111, %rd1, %rd83; add.s64 %rd110, %rd3, %rd83; add.s64 %rd109, %rd2, %rd83; add.s64 %rd108, %rd4, %rd83; $L__BB0_51: add.s64 %rd28, %rd109, -8192; ld.global.v4.u16 {%rs142, %rs143, %rs144, %rs145}, [%rd109+-8192]; // begin inline asm { cvt.f32.f16 %f461, %rs142;} // end inline asm // begin inline asm { cvt.f32.f16 %f462, %rs143;} // end inline asm // begin inline asm { cvt.f32.f16 %f463, %rs144;} // end inline asm // begin inline asm { cvt.f32.f16 %f464, %rs145;} // end inline asm add.s64 %rd29, %rd110, -8192; setp.eq.s16 %p60, %rs23, 0; mov.f32 %f839, 0f00000000; mov.f32 %f835, %f839; mov.f32 %f836, %f839; mov.f32 %f837, %f839; mov.f32 %f838, %f839; @%p60 bra $L__BB0_53; ld.global.v4.u16 {%rs151, %rs152, %rs153, %rs154}, [%rd29]; // begin inline asm { cvt.f32.f16 %f838, %rs151;} // end inline asm // begin inline asm { cvt.f32.f16 %f837, %rs152;} // end inline asm // begin inline asm { cvt.f32.f16 %f836, %rs153;} // end inline asm // begin inline asm { cvt.f32.f16 %f835, %rs154;} // end inline asm $L__BB0_53: add.s64 %rd30, %rd111, -8192; ld.global.v4.u16 {%rs167, %rs168, %rs169, %rs170}, [%rd111+-8192]; // begin inline asm { cvt.f32.f16 %f473, %rs167;} // end inline asm // begin inline asm { cvt.f32.f16 %f474, %rs168;} // end inline asm // begin inline asm { cvt.f32.f16 %f475, %rs169;} // end inline asm // begin inline asm { cvt.f32.f16 %f476, %rs170;} // end inline asm sub.ftz.f32 %f489, %f473, %f785; mul.ftz.f32 %f490, %f830, %f489; fma.rn.ftz.f32 %f491, %f461, %f490, %f838; sub.ftz.f32 %f492, %f474, %f785; mul.ftz.f32 %f493, %f830, %f492; fma.rn.ftz.f32 %f494, %f462, %f493, %f837; sub.ftz.f32 %f495, %f475, %f785; mul.ftz.f32 %f496, %f830, %f495; fma.rn.ftz.f32 %f497, %f463, %f496, %f836; sub.ftz.f32 %f498, %f476, %f785; mul.ftz.f32 %f499, %f830, %f498; fma.rn.ftz.f32 %f500, %f464, %f499, %f835; setp.lt.ftz.f32 %p61, %f491, 0f00000000; setp.ne.s16 %p62, %rs15, 0; and.pred %p63, %p62, %p61; selp.f32 %f477, 0f00000000, %f491, %p63; setp.lt.ftz.f32 %p64, %f494, 0f00000000; and.pred %p65, %p62, %p64; selp.f32 %f478, 0f00000000, %f494, %p65; setp.lt.ftz.f32 %p66, %f497, 0f00000000; and.pred %p67, %p62, %p66; selp.f32 %f479, 0f00000000, %f497, %p67; setp.lt.ftz.f32 %p68, %f500, 0f00000000; and.pred %p69, %p62, %p68; selp.f32 %f480, 0f00000000, %f500, %p69; // begin inline asm { cvt.rn.f16.f32 %rs162, %f480;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs161, %f479;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs160, %f478;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs159, %f477;} // end inline asm add.s64 %rd31, %rd108, -8192; st.global.v4.u16 [%rd108+-8192], {%rs159, %rs160, %rs161, %rs162}; ld.global.v4.u16 {%rs172, %rs173, %rs174, %rs175}, [%rd28+8192]; // begin inline asm { cvt.f32.f16 %f481, %rs172;} // end inline asm // begin inline asm { cvt.f32.f16 %f482, %rs173;} // end inline asm // begin inline asm { cvt.f32.f16 %f483, %rs174;} // end inline asm // begin inline asm { cvt.f32.f16 %f484, %rs175;} // end inline asm mov.f32 %f840, %f839; mov.f32 %f841, %f839; mov.f32 %f842, %f839; @%p60 bra $L__BB0_55; ld.global.v4.u16 {%rs181, %rs182, %rs183, %rs184}, [%rd29+8192]; // 
begin inline asm { cvt.f32.f16 %f842, %rs181;} // end inline asm // begin inline asm { cvt.f32.f16 %f841, %rs182;} // end inline asm // begin inline asm { cvt.f32.f16 %f840, %rs183;} // end inline asm // begin inline asm { cvt.f32.f16 %f839, %rs184;} // end inline asm $L__BB0_55: ld.global.v4.u16 {%rs193, %rs194, %rs195, %rs196}, [%rd30+8192]; // begin inline asm { cvt.f32.f16 %f505, %rs193;} // end inline asm // begin inline asm { cvt.f32.f16 %f506, %rs194;} // end inline asm // begin inline asm { cvt.f32.f16 %f507, %rs195;} // end inline asm // begin inline asm { cvt.f32.f16 %f508, %rs196;} // end inline asm sub.ftz.f32 %f513, %f505, %f785; mul.ftz.f32 %f514, %f830, %f513; fma.rn.ftz.f32 %f515, %f481, %f514, %f842; sub.ftz.f32 %f516, %f506, %f785; mul.ftz.f32 %f517, %f830, %f516; fma.rn.ftz.f32 %f518, %f482, %f517, %f841; sub.ftz.f32 %f519, %f507, %f785; mul.ftz.f32 %f520, %f830, %f519; fma.rn.ftz.f32 %f521, %f483, %f520, %f840; sub.ftz.f32 %f522, %f508, %f785; mul.ftz.f32 %f523, %f830, %f522; fma.rn.ftz.f32 %f524, %f484, %f523, %f839; setp.lt.ftz.f32 %p71, %f515, 0f00000000; and.pred %p73, %p62, %p71; selp.f32 %f509, 0f00000000, %f515, %p73; setp.lt.ftz.f32 %p74, %f518, 0f00000000; and.pred %p75, %p62, %p74; selp.f32 %f510, 0f00000000, %f518, %p75; setp.lt.ftz.f32 %p76, %f521, 0f00000000; and.pred %p77, %p62, %p76; selp.f32 %f511, 0f00000000, %f521, %p77; setp.lt.ftz.f32 %p78, %f524, 0f00000000; and.pred %p79, %p62, %p78; selp.f32 %f512, 0f00000000, %f524, %p79; // begin inline asm { cvt.rn.f16.f32 %rs192, %f512;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs191, %f511;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs190, %f510;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs189, %f509;} // end inline asm st.global.v4.u16 [%rd31+8192], {%rs189, %rs190, %rs191, %rs192}; add.s64 %rd111, %rd111, 16384; add.s64 %rd110, %rd110, 16384; add.s64 %rd109, %rd109, 16384; add.s64 %rd108, %rd108, 16384; add.s32 %r181, %r181, 2048; setp.lt.s32 %p80, %r181, %r1; @%p80 bra $L__BB0_51; bra.uni $L__BB0_83; $L__BB0_67: shr.u32 %r158, %r34, 10; add.s32 %r159, %r158, 1; and.b32 %r187, %r159, 3; setp.eq.s32 %p114, %r187, 0; @%p114 bra $L__BB0_72; add.s32 %r185, %r181, %r2; mul.wide.s32 %rd92, %r185, 8; add.s64 %rd117, %rd1, %rd92; add.s64 %rd116, %rd4, %rd92; $L__BB0_69: .pragma "nounroll"; setp.eq.s16 %p115, %rs23, 0; mov.f32 %f855, 0f00000000; mov.f32 %f856, %f855; mov.f32 %f857, %f855; mov.f32 %f858, %f855; @%p115 bra $L__BB0_71; setp.eq.s16 %p116, %rs16, 0; selp.b32 %r160, %r181, %r185, %p116; mul.wide.s32 %rd93, %r160, 8; add.s64 %rd94, %rd3, %rd93; ld.global.v4.u16 {%rs294, %rs295, %rs296, %rs297}, [%rd94]; // begin inline asm { cvt.f32.f16 %f858, %rs294;} // end inline asm // begin inline asm { cvt.f32.f16 %f857, %rs295;} // end inline asm // begin inline asm { cvt.f32.f16 %f856, %rs296;} // end inline asm // begin inline asm { cvt.f32.f16 %f855, %rs297;} // end inline asm $L__BB0_71: ld.global.v4.u16 {%rs306, %rs307, %rs308, %rs309}, [%rd117]; // begin inline asm { cvt.f32.f16 %f629, %rs306;} // end inline asm // begin inline asm { cvt.f32.f16 %f630, %rs307;} // end inline asm // begin inline asm { cvt.f32.f16 %f631, %rs308;} // end inline asm // begin inline asm { cvt.f32.f16 %f632, %rs309;} // end inline asm sub.ftz.f32 %f637, %f629, %f785; fma.rn.ftz.f32 %f638, %f830, %f637, %f858; sub.ftz.f32 %f639, %f630, %f785; fma.rn.ftz.f32 %f640, %f830, %f639, %f857; sub.ftz.f32 %f641, %f631, %f785; fma.rn.ftz.f32 %f642, %f830, %f641, %f856; sub.ftz.f32 %f643, %f632, 
%f785; fma.rn.ftz.f32 %f644, %f830, %f643, %f855; setp.lt.ftz.f32 %p117, %f638, 0f00000000; setp.ne.s16 %p118, %rs15, 0; and.pred %p119, %p118, %p117; selp.f32 %f633, 0f00000000, %f638, %p119; setp.lt.ftz.f32 %p120, %f640, 0f00000000; and.pred %p121, %p118, %p120; selp.f32 %f634, 0f00000000, %f640, %p121; setp.lt.ftz.f32 %p122, %f642, 0f00000000; and.pred %p123, %p118, %p122; selp.f32 %f635, 0f00000000, %f642, %p123; setp.lt.ftz.f32 %p124, %f644, 0f00000000; and.pred %p125, %p118, %p124; selp.f32 %f636, 0f00000000, %f644, %p125; // begin inline asm { cvt.rn.f16.f32 %rs305, %f636;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs304, %f635;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs303, %f634;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs302, %f633;} // end inline asm st.global.v4.u16 [%rd116], {%rs302, %rs303, %rs304, %rs305}; add.s32 %r181, %r181, 1024; add.s64 %rd117, %rd117, 8192; add.s64 %rd116, %rd116, 8192; add.s32 %r185, %r185, 1024; add.s32 %r187, %r187, -1; setp.ne.s32 %p126, %r187, 0; @%p126 bra $L__BB0_69; $L__BB0_72: setp.lt.u32 %p127, %r34, 3072; @%p127 bra $L__BB0_83; add.s32 %r161, %r181, %r2; add.s32 %r189, %r161, 3072; mul.wide.s32 %rd95, %r161, 8; add.s64 %rd118, %rd1, %rd95; add.s64 %rd119, %rd4, %rd95; $L__BB0_74: setp.eq.s16 %p128, %rs23, 0; mov.f32 %f863, 0f00000000; mov.f32 %f859, %f863; mov.f32 %f860, %f863; mov.f32 %f861, %f863; mov.f32 %f862, %f863; @%p128 bra $L__BB0_76; setp.eq.s16 %p129, %rs16, 0; add.s32 %r162, %r2, %r181; selp.b32 %r163, %r181, %r162, %p129; mul.wide.s32 %rd96, %r163, 8; add.s64 %rd97, %rd3, %rd96; ld.global.v4.u16 {%rs317, %rs318, %rs319, %rs320}, [%rd97]; // begin inline asm { cvt.f32.f16 %f862, %rs317;} // end inline asm // begin inline asm { cvt.f32.f16 %f861, %rs318;} // end inline asm // begin inline asm { cvt.f32.f16 %f860, %rs319;} // end inline asm // begin inline asm { cvt.f32.f16 %f859, %rs320;} // end inline asm $L__BB0_76: ld.global.v4.u16 {%rs329, %rs330, %rs331, %rs332}, [%rd118]; // begin inline asm { cvt.f32.f16 %f653, %rs329;} // end inline asm // begin inline asm { cvt.f32.f16 %f654, %rs330;} // end inline asm // begin inline asm { cvt.f32.f16 %f655, %rs331;} // end inline asm // begin inline asm { cvt.f32.f16 %f656, %rs332;} // end inline asm sub.ftz.f32 %f665, %f653, %f785; fma.rn.ftz.f32 %f666, %f830, %f665, %f862; sub.ftz.f32 %f667, %f654, %f785; fma.rn.ftz.f32 %f668, %f830, %f667, %f861; sub.ftz.f32 %f669, %f655, %f785; fma.rn.ftz.f32 %f670, %f830, %f669, %f860; sub.ftz.f32 %f671, %f656, %f785; fma.rn.ftz.f32 %f672, %f830, %f671, %f859; setp.lt.ftz.f32 %p130, %f666, 0f00000000; setp.ne.s16 %p131, %rs15, 0; and.pred %p132, %p131, %p130; selp.f32 %f657, 0f00000000, %f666, %p132; setp.lt.ftz.f32 %p133, %f668, 0f00000000; and.pred %p134, %p131, %p133; selp.f32 %f658, 0f00000000, %f668, %p134; setp.lt.ftz.f32 %p135, %f670, 0f00000000; and.pred %p136, %p131, %p135; selp.f32 %f659, 0f00000000, %f670, %p136; setp.lt.ftz.f32 %p137, %f672, 0f00000000; and.pred %p138, %p131, %p137; selp.f32 %f660, 0f00000000, %f672, %p138; // begin inline asm { cvt.rn.f16.f32 %rs328, %f660;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs327, %f659;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs326, %f658;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs325, %f657;} // end inline asm st.global.v4.u16 [%rd119], {%rs325, %rs326, %rs327, %rs328}; add.s32 %r56, %r181, 1024; mov.f32 %f864, %f863; mov.f32 %f865, %f863; mov.f32 %f866, %f863; @%p128 bra $L__BB0_78; setp.eq.s16 
%p140, %rs16, 0; add.s32 %r164, %r189, -2048; selp.b32 %r165, %r56, %r164, %p140; mul.wide.s32 %rd98, %r165, 8; add.s64 %rd99, %rd3, %rd98; ld.global.v4.u16 {%rs340, %rs341, %rs342, %rs343}, [%rd99]; // begin inline asm { cvt.f32.f16 %f866, %rs340;} // end inline asm // begin inline asm { cvt.f32.f16 %f865, %rs341;} // end inline asm // begin inline asm { cvt.f32.f16 %f864, %rs342;} // end inline asm // begin inline asm { cvt.f32.f16 %f863, %rs343;} // end inline asm $L__BB0_78: ld.global.v4.u16 {%rs352, %rs353, %rs354, %rs355}, [%rd118+8192]; // begin inline asm { cvt.f32.f16 %f677, %rs352;} // end inline asm // begin inline asm { cvt.f32.f16 %f678, %rs353;} // end inline asm // begin inline asm { cvt.f32.f16 %f679, %rs354;} // end inline asm // begin inline asm { cvt.f32.f16 %f680, %rs355;} // end inline asm sub.ftz.f32 %f689, %f677, %f785; fma.rn.ftz.f32 %f690, %f830, %f689, %f866; sub.ftz.f32 %f691, %f678, %f785; fma.rn.ftz.f32 %f692, %f830, %f691, %f865; sub.ftz.f32 %f693, %f679, %f785; fma.rn.ftz.f32 %f694, %f830, %f693, %f864; sub.ftz.f32 %f695, %f680, %f785; fma.rn.ftz.f32 %f696, %f830, %f695, %f863; setp.lt.ftz.f32 %p141, %f690, 0f00000000; mov.f32 %f871, 0f00000000; and.pred %p143, %p131, %p141; selp.f32 %f681, 0f00000000, %f690, %p143; setp.lt.ftz.f32 %p144, %f692, 0f00000000; and.pred %p145, %p131, %p144; selp.f32 %f682, 0f00000000, %f692, %p145; setp.lt.ftz.f32 %p146, %f694, 0f00000000; and.pred %p147, %p131, %p146; selp.f32 %f683, 0f00000000, %f694, %p147; setp.lt.ftz.f32 %p148, %f696, 0f00000000; and.pred %p149, %p131, %p148; selp.f32 %f684, 0f00000000, %f696, %p149; // begin inline asm { cvt.rn.f16.f32 %rs351, %f684;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs350, %f683;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs349, %f682;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs348, %f681;} // end inline asm st.global.v4.u16 [%rd119+8192], {%rs348, %rs349, %rs350, %rs351}; add.s32 %r57, %r56, 1024; mov.f32 %f867, %f871; mov.f32 %f868, %f871; mov.f32 %f869, %f871; mov.f32 %f870, %f871; @%p128 bra $L__BB0_80; setp.eq.s16 %p151, %rs16, 0; add.s32 %r166, %r189, -1024; selp.b32 %r167, %r57, %r166, %p151; mul.wide.s32 %rd100, %r167, 8; add.s64 %rd101, %rd3, %rd100; ld.global.v4.u16 {%rs363, %rs364, %rs365, %rs366}, [%rd101]; // begin inline asm { cvt.f32.f16 %f870, %rs363;} // end inline asm // begin inline asm { cvt.f32.f16 %f869, %rs364;} // end inline asm // begin inline asm { cvt.f32.f16 %f868, %rs365;} // end inline asm // begin inline asm { cvt.f32.f16 %f867, %rs366;} // end inline asm $L__BB0_80: ld.global.v4.u16 {%rs375, %rs376, %rs377, %rs378}, [%rd118+16384]; // begin inline asm { cvt.f32.f16 %f701, %rs375;} // end inline asm // begin inline asm { cvt.f32.f16 %f702, %rs376;} // end inline asm // begin inline asm { cvt.f32.f16 %f703, %rs377;} // end inline asm // begin inline asm { cvt.f32.f16 %f704, %rs378;} // end inline asm sub.ftz.f32 %f713, %f701, %f785; fma.rn.ftz.f32 %f714, %f830, %f713, %f870; sub.ftz.f32 %f715, %f702, %f785; fma.rn.ftz.f32 %f716, %f830, %f715, %f869; sub.ftz.f32 %f717, %f703, %f785; fma.rn.ftz.f32 %f718, %f830, %f717, %f868; sub.ftz.f32 %f719, %f704, %f785; fma.rn.ftz.f32 %f720, %f830, %f719, %f867; setp.lt.ftz.f32 %p152, %f714, 0f00000000; and.pred %p154, %p131, %p152; selp.f32 %f705, 0f00000000, %f714, %p154; setp.lt.ftz.f32 %p155, %f716, 0f00000000; and.pred %p156, %p131, %p155; selp.f32 %f706, 0f00000000, %f716, %p156; setp.lt.ftz.f32 %p157, %f718, 0f00000000; and.pred %p158, %p131, %p157; selp.f32 %f707, 
0f00000000, %f718, %p158; setp.lt.ftz.f32 %p159, %f720, 0f00000000; and.pred %p160, %p131, %p159; selp.f32 %f708, 0f00000000, %f720, %p160; // begin inline asm { cvt.rn.f16.f32 %rs374, %f708;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs373, %f707;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs372, %f706;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs371, %f705;} // end inline asm st.global.v4.u16 [%rd119+16384], {%rs371, %rs372, %rs373, %rs374}; add.s32 %r58, %r57, 1024; mov.f32 %f872, %f871; mov.f32 %f873, %f871; mov.f32 %f874, %f871; @%p128 bra $L__BB0_82; setp.eq.s16 %p162, %rs16, 0; selp.b32 %r168, %r58, %r189, %p162; mul.wide.s32 %rd102, %r168, 8; add.s64 %rd103, %rd3, %rd102; ld.global.v4.u16 {%rs386, %rs387, %rs388, %rs389}, [%rd103]; // begin inline asm { cvt.f32.f16 %f874, %rs386;} // end inline asm // begin inline asm { cvt.f32.f16 %f873, %rs387;} // end inline asm // begin inline asm { cvt.f32.f16 %f872, %rs388;} // end inline asm // begin inline asm { cvt.f32.f16 %f871, %rs389;} // end inline asm $L__BB0_82: add.s64 %rd65, %rd118, 32768; ld.global.v4.u16 {%rs398, %rs399, %rs400, %rs401}, [%rd118+24576]; // begin inline asm { cvt.f32.f16 %f725, %rs398;} // end inline asm // begin inline asm { cvt.f32.f16 %f726, %rs399;} // end inline asm // begin inline asm { cvt.f32.f16 %f727, %rs400;} // end inline asm // begin inline asm { cvt.f32.f16 %f728, %rs401;} // end inline asm sub.ftz.f32 %f733, %f725, %f785; fma.rn.ftz.f32 %f734, %f830, %f733, %f874; sub.ftz.f32 %f735, %f726, %f785; fma.rn.ftz.f32 %f736, %f830, %f735, %f873; sub.ftz.f32 %f737, %f727, %f785; fma.rn.ftz.f32 %f738, %f830, %f737, %f872; sub.ftz.f32 %f739, %f728, %f785; fma.rn.ftz.f32 %f740, %f830, %f739, %f871; setp.lt.ftz.f32 %p163, %f734, 0f00000000; and.pred %p165, %p131, %p163; selp.f32 %f729, 0f00000000, %f734, %p165; setp.lt.ftz.f32 %p166, %f736, 0f00000000; and.pred %p167, %p131, %p166; selp.f32 %f730, 0f00000000, %f736, %p167; setp.lt.ftz.f32 %p168, %f738, 0f00000000; and.pred %p169, %p131, %p168; selp.f32 %f731, 0f00000000, %f738, %p169; setp.lt.ftz.f32 %p170, %f740, 0f00000000; and.pred %p171, %p131, %p170; selp.f32 %f732, 0f00000000, %f740, %p171; // begin inline asm { cvt.rn.f16.f32 %rs397, %f732;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs396, %f731;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs395, %f730;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs394, %f729;} // end inline asm add.s64 %rd66, %rd119, 32768; st.global.v4.u16 [%rd119+24576], {%rs394, %rs395, %rs396, %rs397}; add.s32 %r189, %r189, 4096; add.s32 %r181, %r58, 1024; setp.lt.s32 %p172, %r181, %r1; mov.u64 %rd118, %rd65; mov.u64 %rd119, %rd66; @%p172 bra $L__BB0_74; bra.uni $L__BB0_83; $L__BB0_56: setp.ne.s32 %p81, %r35, 0; @%p81 bra $L__BB0_60; setp.eq.s16 %p82, %rs23, 0; cvt.s64.s32 %rd36, %r181; mul.wide.s32 %rd84, %r181, 8; add.s64 %rd85, %rd2, %rd84; ld.global.v4.u16 {%rs203, %rs204, %rs205, %rs206}, [%rd85]; // begin inline asm { cvt.f32.f16 %f525, %rs203;} // end inline asm // begin inline asm { cvt.f32.f16 %f526, %rs204;} // end inline asm // begin inline asm { cvt.f32.f16 %f527, %rs205;} // end inline asm // begin inline asm { cvt.f32.f16 %f528, %rs206;} // end inline asm mov.f32 %f843, 0f00000000; mov.f32 %f844, %f843; mov.f32 %f845, %f843; mov.f32 %f846, %f843; @%p82 bra $L__BB0_59; shl.b64 %rd86, %rd36, 3; add.s64 %rd87, %rd3, %rd86; ld.global.v4.u16 {%rs211, %rs212, %rs213, %rs214}, [%rd87]; // begin inline asm { cvt.f32.f16 %f846, %rs211;} // end 
inline asm // begin inline asm { cvt.f32.f16 %f845, %rs212;} // end inline asm // begin inline asm { cvt.f32.f16 %f844, %rs213;} // end inline asm // begin inline asm { cvt.f32.f16 %f843, %rs214;} // end inline asm $L__BB0_59: ld.global.v4.u16 {%rs223, %rs224, %rs225, %rs226}, [%rd17]; // begin inline asm { cvt.f32.f16 %f537, %rs223;} // end inline asm // begin inline asm { cvt.f32.f16 %f538, %rs224;} // end inline asm // begin inline asm { cvt.f32.f16 %f539, %rs225;} // end inline asm // begin inline asm { cvt.f32.f16 %f540, %rs226;} // end inline asm sub.ftz.f32 %f545, %f537, %f785; mul.ftz.f32 %f546, %f830, %f545; fma.rn.ftz.f32 %f547, %f525, %f546, %f846; sub.ftz.f32 %f548, %f538, %f785; mul.ftz.f32 %f549, %f830, %f548; fma.rn.ftz.f32 %f550, %f526, %f549, %f845; sub.ftz.f32 %f551, %f539, %f785; mul.ftz.f32 %f552, %f830, %f551; fma.rn.ftz.f32 %f553, %f527, %f552, %f844; sub.ftz.f32 %f554, %f540, %f785; mul.ftz.f32 %f555, %f830, %f554; fma.rn.ftz.f32 %f556, %f528, %f555, %f843; setp.lt.ftz.f32 %p83, %f547, 0f00000000; setp.ne.s16 %p84, %rs15, 0; and.pred %p85, %p84, %p83; selp.f32 %f541, 0f00000000, %f547, %p85; setp.lt.ftz.f32 %p86, %f550, 0f00000000; and.pred %p87, %p84, %p86; selp.f32 %f542, 0f00000000, %f550, %p87; setp.lt.ftz.f32 %p88, %f553, 0f00000000; and.pred %p89, %p84, %p88; selp.f32 %f543, 0f00000000, %f553, %p89; setp.lt.ftz.f32 %p90, %f556, 0f00000000; and.pred %p91, %p84, %p90; selp.f32 %f544, 0f00000000, %f556, %p91; // begin inline asm { cvt.rn.f16.f32 %rs222, %f544;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs221, %f543;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs220, %f542;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs219, %f541;} // end inline asm st.global.v4.u16 [%rd18], {%rs219, %rs220, %rs221, %rs222}; add.s32 %r181, %r181, 1024; $L__BB0_60: setp.lt.u32 %p92, %r34, 1024; @%p92 bra $L__BB0_83; add.s32 %r157, %r181, %r2; mul.wide.s32 %rd88, %r157, 8; add.s64 %rd89, %rd88, 8192; add.s64 %rd115, %rd4, %rd89; add.s64 %rd114, %rd1, %rd89; mul.wide.s32 %rd90, %r181, 8; add.s64 %rd91, %rd90, 8192; add.s64 %rd113, %rd2, %rd91; add.s64 %rd112, %rd3, %rd91; $L__BB0_62: add.s64 %rd45, %rd113, -8192; ld.global.v4.u16 {%rs232, %rs233, %rs234, %rs235}, [%rd113+-8192]; // begin inline asm { cvt.f32.f16 %f557, %rs232;} // end inline asm // begin inline asm { cvt.f32.f16 %f558, %rs233;} // end inline asm // begin inline asm { cvt.f32.f16 %f559, %rs234;} // end inline asm // begin inline asm { cvt.f32.f16 %f560, %rs235;} // end inline asm add.s64 %rd46, %rd112, -8192; setp.eq.s16 %p93, %rs23, 0; mov.f32 %f851, 0f00000000; mov.f32 %f847, %f851; mov.f32 %f848, %f851; mov.f32 %f849, %f851; mov.f32 %f850, %f851; @%p93 bra $L__BB0_64; ld.global.v4.u16 {%rs241, %rs242, %rs243, %rs244}, [%rd46]; // begin inline asm { cvt.f32.f16 %f850, %rs241;} // end inline asm // begin inline asm { cvt.f32.f16 %f849, %rs242;} // end inline asm // begin inline asm { cvt.f32.f16 %f848, %rs243;} // end inline asm // begin inline asm { cvt.f32.f16 %f847, %rs244;} // end inline asm $L__BB0_64: add.s64 %rd47, %rd114, -8192; ld.global.v4.u16 {%rs257, %rs258, %rs259, %rs260}, [%rd114+-8192]; // begin inline asm { cvt.f32.f16 %f569, %rs257;} // end inline asm // begin inline asm { cvt.f32.f16 %f570, %rs258;} // end inline asm // begin inline asm { cvt.f32.f16 %f571, %rs259;} // end inline asm // begin inline asm { cvt.f32.f16 %f572, %rs260;} // end inline asm sub.ftz.f32 %f585, %f569, %f785; mul.ftz.f32 %f586, %f830, %f585; fma.rn.ftz.f32 %f587, %f557, %f586, %f850; 
sub.ftz.f32 %f588, %f570, %f785; mul.ftz.f32 %f589, %f830, %f588; fma.rn.ftz.f32 %f590, %f558, %f589, %f849; sub.ftz.f32 %f591, %f571, %f785; mul.ftz.f32 %f592, %f830, %f591; fma.rn.ftz.f32 %f593, %f559, %f592, %f848; sub.ftz.f32 %f594, %f572, %f785; mul.ftz.f32 %f595, %f830, %f594; fma.rn.ftz.f32 %f596, %f560, %f595, %f847; setp.lt.ftz.f32 %p94, %f587, 0f00000000; setp.ne.s16 %p95, %rs15, 0; and.pred %p96, %p95, %p94; selp.f32 %f573, 0f00000000, %f587, %p96; setp.lt.ftz.f32 %p97, %f590, 0f00000000; and.pred %p98, %p95, %p97; selp.f32 %f574, 0f00000000, %f590, %p98; setp.lt.ftz.f32 %p99, %f593, 0f00000000; and.pred %p100, %p95, %p99; selp.f32 %f575, 0f00000000, %f593, %p100; setp.lt.ftz.f32 %p101, %f596, 0f00000000; and.pred %p102, %p95, %p101; selp.f32 %f576, 0f00000000, %f596, %p102; // begin inline asm { cvt.rn.f16.f32 %rs252, %f576;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs251, %f575;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs250, %f574;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs249, %f573;} // end inline asm add.s64 %rd48, %rd115, -8192; st.global.v4.u16 [%rd115+-8192], {%rs249, %rs250, %rs251, %rs252}; ld.global.v4.u16 {%rs262, %rs263, %rs264, %rs265}, [%rd45+8192]; // begin inline asm { cvt.f32.f16 %f577, %rs262;} // end inline asm // begin inline asm { cvt.f32.f16 %f578, %rs263;} // end inline asm // begin inline asm { cvt.f32.f16 %f579, %rs264;} // end inline asm // begin inline asm { cvt.f32.f16 %f580, %rs265;} // end inline asm mov.f32 %f852, %f851; mov.f32 %f853, %f851; mov.f32 %f854, %f851; @%p93 bra $L__BB0_66; ld.global.v4.u16 {%rs271, %rs272, %rs273, %rs274}, [%rd46+8192]; // begin inline asm { cvt.f32.f16 %f854, %rs271;} // end inline asm // begin inline asm { cvt.f32.f16 %f853, %rs272;} // end inline asm // begin inline asm { cvt.f32.f16 %f852, %rs273;} // end inline asm // begin inline asm { cvt.f32.f16 %f851, %rs274;} // end inline asm $L__BB0_66: ld.global.v4.u16 {%rs283, %rs284, %rs285, %rs286}, [%rd47+8192]; // begin inline asm { cvt.f32.f16 %f601, %rs283;} // end inline asm // begin inline asm { cvt.f32.f16 %f602, %rs284;} // end inline asm // begin inline asm { cvt.f32.f16 %f603, %rs285;} // end inline asm // begin inline asm { cvt.f32.f16 %f604, %rs286;} // end inline asm sub.ftz.f32 %f609, %f601, %f785; mul.ftz.f32 %f610, %f830, %f609; fma.rn.ftz.f32 %f611, %f577, %f610, %f854; sub.ftz.f32 %f612, %f602, %f785; mul.ftz.f32 %f613, %f830, %f612; fma.rn.ftz.f32 %f614, %f578, %f613, %f853; sub.ftz.f32 %f615, %f603, %f785; mul.ftz.f32 %f616, %f830, %f615; fma.rn.ftz.f32 %f617, %f579, %f616, %f852; sub.ftz.f32 %f618, %f604, %f785; mul.ftz.f32 %f619, %f830, %f618; fma.rn.ftz.f32 %f620, %f580, %f619, %f851; setp.lt.ftz.f32 %p104, %f611, 0f00000000; and.pred %p106, %p95, %p104; selp.f32 %f605, 0f00000000, %f611, %p106; setp.lt.ftz.f32 %p107, %f614, 0f00000000; and.pred %p108, %p95, %p107; selp.f32 %f606, 0f00000000, %f614, %p108; setp.lt.ftz.f32 %p109, %f617, 0f00000000; and.pred %p110, %p95, %p109; selp.f32 %f607, 0f00000000, %f617, %p110; setp.lt.ftz.f32 %p111, %f620, 0f00000000; and.pred %p112, %p95, %p111; selp.f32 %f608, 0f00000000, %f620, %p112; // begin inline asm { cvt.rn.f16.f32 %rs282, %f608;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs281, %f607;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs280, %f606;} // end inline asm // begin inline asm { cvt.rn.f16.f32 %rs279, %f605;} // end inline asm st.global.v4.u16 [%rd48+8192], {%rs279, %rs280, %rs281, %rs282}; add.s64 %rd115, %rd115, 
16384; add.s64 %rd114, %rd114, 16384; add.s64 %rd113, %rd113, 16384; add.s64 %rd112, %rd112, 16384; add.s32 %r181, %r181, 2048; setp.lt.s32 %p113, %r181, %r1; @%p113 bra $L__BB0_62; $L__BB0_83: ret; }
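The listing above is the complete PTX for MVNKernel_vec_fp16_large: a block-per-row mean-variance normalization (LayerNorm-style) kernel over __half4 data, with a two-level sum reduction (warp shuffles, then one float4 partial per warp staged in the 512-byte shared `temp` buffer), the shared scalars `mu` and `rsigma`, and a final normalize/scale/bias pass with an optional clamp of negative results to zero. The CUDA C++ below is a minimal sketch of the computation this PTX appears to implement, written only to make the structure easier to follow; it is not the original dxml_mvn source. The kernel and helper names, the parameter names, and the meanings assigned to the flag bytes at parameter offsets 44-49 (do_mean, do_var, scale/bias presence, per_row_params, relu) are inferred from the control flow and should be treated as assumptions. The real kernel runs 1024 threads, vectorizes over __half4, unrolls the row loops by four, and uses FTZ arithmetic; the sketch omits all of that for clarity.

#include <cuda_fp16.h>

// Block-wide sum reduction: intra-warp shuffle first, then one partial per
// warp staged through shared memory and folded by warp 0.  The PTX above
// carries four partials (one per __half4 lane) and a float4 slot per warp in
// its 512-byte `temp` buffer; this helper reduces a single float for clarity.
// The returned value is only meaningful in thread 0.
__device__ float block_reduce_sum(float v, float* warp_sums) {
    const unsigned full_mask = 0xffffffffu;
    int lane = threadIdx.x % warpSize;
    int warp = threadIdx.x / warpSize;

    for (int offset = warpSize / 2; offset > 0; offset /= 2)
        v += __shfl_down_sync(full_mask, v, offset);

    __syncthreads();                          // protect warp_sums between calls
    if (lane == 0) warp_sums[warp] = v;       // one partial per warp
    __syncthreads();

    int nwarps = blockDim.x / warpSize;
    v = (threadIdx.x < nwarps) ? warp_sums[threadIdx.x] : 0.0f;
    if (warp == 0)
        for (int offset = warpSize / 2; offset > 0; offset /= 2)
            v += __shfl_down_sync(full_mask, v, offset);
    return v;
}

// One block per row of `size` half-precision values (scalar sketch of the
// __half4-vectorized original).  inv_size and eps correspond to the two f32
// fields of the 88-byte parameter struct; the bools stand in for its flag bytes.
__global__ void mvn_large_sketch(const __half* __restrict__ in,
                                 const __half* __restrict__ scale,  // gamma, may be nullptr
                                 const __half* __restrict__ bias,   // beta, may be nullptr
                                 __half* __restrict__ out,
                                 int size, float inv_size, float eps,
                                 bool do_mean, bool do_var,
                                 bool relu, bool per_row_params) {
    __shared__ float warp_sums[32];
    __shared__ float mu_s, rsigma_s;

    const size_t row = (size_t)blockIdx.x * size;

    // Pass 1: row mean (skipped when the corresponding flag is clear).
    float mu = 0.0f;
    if (do_mean) {
        float s = 0.0f;
        for (int i = threadIdx.x; i < size; i += blockDim.x)
            s += __half2float(in[row + i]);
        s = block_reduce_sum(s, warp_sums);
        if (threadIdx.x == 0) mu_s = inv_size * s;
        __syncthreads();
        mu = mu_s;
    }

    // Pass 2: rsigma = 1 / sqrt(eps + mean((x - mu)^2)).
    float rsigma = 1.0f;
    if (do_var) {
        float s = 0.0f;
        for (int i = threadIdx.x; i < size; i += blockDim.x) {
            float d = __half2float(in[row + i]) - mu;
            s += d * d;
        }
        s = block_reduce_sum(s, warp_sums);
        if (threadIdx.x == 0) rsigma_s = rsqrtf(eps + inv_size * s);
        __syncthreads();
        rsigma = rsigma_s;
    }

    // Pass 3: y = scale * (x - mu) * rsigma + bias, with optional clamp at 0.
    for (int i = threadIdx.x; i < size; i += blockDim.x) {
        size_t p = per_row_params ? row + i : (size_t)i;
        float g = scale ? __half2float(scale[p]) : 1.0f;
        float b = bias  ? __half2float(bias[p])  : 0.0f;
        float y = g * ((__half2float(in[row + i]) - mu) * rsigma) + b;
        if (relu && y < 0.0f) y = 0.0f;
        out[row + i] = __float2half_rn(y);
    }
}

The two __syncthreads-bracketed shared scalars mu_s and rsigma_s in the sketch correspond to the bar.sync pairs around the shared `mu` and `rsigma` stores in the PTX; the relu branch corresponds to the selp.f32 chains guarded by the flag byte loaded from parameter offset 48, and per_row_params corresponds to the byte at offset 49 that switches the scale/bias addressing between a per-row and a shared index.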