7_gemv_cu_0ac0658e6thrust12placeholders2_3E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_0ac0658e6thrust12placeholders2_4E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_0ac0658e6thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_0ac0658e6thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_0ac0658e6thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_0ac0658e6thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_0ac0658e6thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_0ac0658e6thrust12placeholders3_10E[1];

.visible .entry _Z28dequant_gemv_group128_batch723DequantGemvKernelParams(
    .param .align 8 .b8 _Z28dequant_gemv_group128_batch723DequantGemvKernelParams_param_0[80]
)
{
    .reg .pred %p<24>;
    .reg .b16 %rs<775>;
    .reg .f32 %f<1341>;
    .reg .b32 %r<683>;
    .reg .b64 %rd<73>;
    // demoted variable
    .shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi128ELi7EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[4480];

    // unpack the 80-byte kernel-parameter struct
    ld.param.v2.u32 {%r55, %r56}, [_Z28dequant_gemv_group128_batch723DequantGemvKernelParams_param_0+48];
    ld.param.v2.u32 {%r57, %r58}, [_Z28dequant_gemv_group128_batch723DequantGemvKernelParams_param_0+56];
    ld.param.v2.f32 {%f68, %f69}, [_Z28dequant_gemv_group128_batch723DequantGemvKernelParams_param_0+64];
    ld.param.v4.u8 {%rs13, %rs14, %rs15, %rs16}, [_Z28dequant_gemv_group128_batch723DequantGemvKernelParams_param_0+72];
    ld.param.u64 %rd25, [_Z28dequant_gemv_group128_batch723DequantGemvKernelParams_param_0+40];
    ld.param.u64 %rd24, [_Z28dequant_gemv_group128_batch723DequantGemvKernelParams_param_0+32];
    ld.param.u64 %rd23, [_Z28dequant_gemv_group128_batch723DequantGemvKernelParams_param_0+24];
    ld.param.u64 %rd22, [_Z28dequant_gemv_group128_batch723DequantGemvKernelParams_param_0+16];
    // per-thread index into the packed weights: (%tid.y * 32 + %tid.x) * 4, guarded against %r57
    mov.u32 %r682, %tid.y;
    shl.b32 %r59, %r682, 5;
    mov.u32 %r60, %tid.x;
    add.s32 %r681, %r59, %r60;
    shl.b32 %r680, %r681, 2;
    setp.ge.u32 %p1, %r680, %r57;
    // zero the seven dot-product accumulators
    mov.f32 %f1320, 0f00000000;
    mov.f32 %f1321, %f1320;
    mov.f32 %f1322, %f1320;
    mov.f32 %f1323, %f1320;
    mov.f32 %f1324, %f1320;
    mov.f32 %f1325, %f1320;
    mov.f32 %f1326, %f1320;
    @%p1 bra $L__BB0_9;

    cvta.to.global.u64 %rd2, %rd22;
    mov.u32 %r61, %ctaid.x;
    mul.lo.s32 %r6, %r58, %r61;
    shl.b16 %rs2, %rs13, 3;
    cvta.to.global.u64 %rd3, %rd23;
    cvta.to.global.u64 %rd4, %rd25;

$L__BB0_2:
    // load 4 x u32 = 32 packed int4 weights and the fp16 group scale (%f77)
    mad.lo.s32 %r63, %r57, %r61, %r680;
    mul.wide.u32 %rd26, %r63, 4;
    add.s64 %rd27, %rd3, %rd26;
    ld.global.v4.u32 {%r64, %r65, %r66, %r67}, [%rd27];
    shr.u32 %r69, %r60, 2;
    shl.b32 %r70, %r682, 3;
    add.s32 %r14, %r70, %r69;
    add.s32 %r15, %r14, %r6;
    mul.wide.s32 %rd28, %r15, 2;
    add.s64 %rd29, %rd4, %rd28;
    ld.global.u16 %rs21, [%rd29];
    // begin inline asm
    { cvt.f32.f16 %f77, %rs21;}
    // end inline asm
    // zero point: defaults to %rs2 (= %rs13 << 3) when the zeros pointer is null,
    // otherwise a nibble unpacked from the zeros array
    setp.eq.s64 %p2, %rd24, 0;
    mov.u16 %rs774, %rs2;
    @%p2 bra $L__BB0_4;

    shr.u32 %r71, %r15, 31;
    add.s32 %r72, %r15, %r71;
    shr.s32 %r73, %r72, 1;
    cvt.s64.s32 %rd30, %r73;
    cvta.to.global.u64 %rd31, %rd24;
    add.s64 %rd32, %rd31, %rd30;
    ld.global.u8 %r74, [%rd32];
    shl.b32 %r75, %r14, 2;
    and.b32 %r76, %r75, 4;
    shr.u32 %r77, %r74, %r76;
    cvt.u16.u32 %rs22, %r77;
    and.b16 %rs774, %rs22, 15;

$L__BB0_4:
    shl.b32 %r16, %r681, 5;
    setp.ge.s32 %p3, %r16, %r55;
    @%p3 bra $L__BB0_8;

    // convert the 4-bit zero point to f32 (%f9); when %rs13 == 0 it is sign-extended first
    setp.eq.s16 %p4, %rs13, 0;
    shr.u16 %rs24, %rs774, 3;
    and.b16 %rs25, %rs24, 1;
    setp.eq.b16 %p5, %rs25, 1;
    and.pred %p6, %p4, %p5;
    selp.b16 %rs26, -16, 0, %p6;
    or.b16 %rs27, %rs26, %rs774;
    cvt.s16.s8 %rs28, %rs27;
    cvt.rn.f32.s16 %f9, %rs28;
    mul.wide.s32 %rd33, %r16, 2;
    add.s64 %rd5,
%rd2, %rd33; ld.global.v4.u32 {%r78, %r79, %r80, %r81}, [%rd5]; mul.wide.s32 %rd34, %r55, 2; add.s64 %rd35, %rd5, %rd34; ld.global.v4.u32 {%r82, %r83, %r84, %r85}, [%rd35]; add.s32 %r86, %r16, %r55; add.s32 %r87, %r86, %r55; mul.wide.s32 %rd36, %r87, 2; add.s64 %rd6, %rd2, %rd36; ld.global.v4.u32 {%r88, %r89, %r90, %r91}, [%rd6]; add.s64 %rd37, %rd6, %rd34; ld.global.v4.u32 {%r92, %r93, %r94, %r95}, [%rd37]; add.s64 %rd38, %rd37, %rd34; ld.global.v4.u32 {%r96, %r97, %r98, %r99}, [%rd38]; add.s64 %rd39, %rd38, %rd34; ld.global.v4.u32 {%r100, %r101, %r102, %r103}, [%rd39]; add.s64 %rd40, %rd39, %rd34; ld.global.v4.u32 {%r104, %r105, %r106, %r107}, [%rd40]; add.s32 %r108, %r86, 8; mul.wide.s32 %rd41, %r108, 2; add.s64 %rd7, %rd2, %rd41; add.s32 %r109, %r108, %r55; add.s32 %r110, %r109, %r55; mul.wide.s32 %rd42, %r110, 2; add.s64 %rd8, %rd2, %rd42; add.s32 %r111, %r110, %r55; mul.wide.s32 %rd43, %r111, 2; add.s64 %rd9, %rd2, %rd43; add.s32 %r112, %r111, %r55; mul.wide.s32 %rd44, %r112, 2; add.s64 %rd10, %rd2, %rd44; add.s32 %r113, %r112, %r55; mul.wide.s32 %rd45, %r113, 2; add.s64 %rd11, %rd2, %rd45; @%p4 bra $L__BB0_7; cvt.u16.u32 %rs253, %r64; and.b16 %rs254, %rs253, 240; and.b16 %rs255, %rs253, 15; cvt.rn.f32.s16 %f302, %rs255; sub.ftz.f32 %f303, %f302, %f9; mul.ftz.f32 %f304, %f77, %f303; mov.b32 {%rs29, %rs36}, %r78; // begin inline asm { cvt.f32.f16 %f78, %rs29;} // end inline asm fma.rn.ftz.f32 %f305, %f304, %f78, %f1326; mov.b32 {%rs30, %rs37}, %r82; // begin inline asm { cvt.f32.f16 %f79, %rs30;} // end inline asm fma.rn.ftz.f32 %f306, %f304, %f79, %f1325; mov.b32 {%rs31, %rs38}, %r88; // begin inline asm { cvt.f32.f16 %f80, %rs31;} // end inline asm fma.rn.ftz.f32 %f307, %f304, %f80, %f1324; mov.b32 {%rs32, %rs39}, %r92; // begin inline asm { cvt.f32.f16 %f81, %rs32;} // end inline asm fma.rn.ftz.f32 %f308, %f304, %f81, %f1323; mov.b32 {%rs33, %rs40}, %r96; // begin inline asm { cvt.f32.f16 %f82, %rs33;} // end inline asm fma.rn.ftz.f32 %f309, %f304, %f82, %f1322; mov.b32 {%rs34, %rs41}, %r100; // begin inline asm { cvt.f32.f16 %f83, %rs34;} // end inline asm fma.rn.ftz.f32 %f310, %f304, %f83, %f1321; mov.b32 {%rs35, %rs42}, %r104; // begin inline asm { cvt.f32.f16 %f84, %rs35;} // end inline asm fma.rn.ftz.f32 %f311, %f304, %f84, %f1320; shr.u16 %rs256, %rs254, 4; cvt.rn.f32.s16 %f312, %rs256; sub.ftz.f32 %f313, %f312, %f9; mul.ftz.f32 %f314, %f77, %f313; // begin inline asm { cvt.f32.f16 %f85, %rs36;} // end inline asm fma.rn.ftz.f32 %f315, %f314, %f85, %f305; // begin inline asm { cvt.f32.f16 %f86, %rs37;} // end inline asm fma.rn.ftz.f32 %f316, %f314, %f86, %f306; // begin inline asm { cvt.f32.f16 %f87, %rs38;} // end inline asm fma.rn.ftz.f32 %f317, %f314, %f87, %f307; // begin inline asm { cvt.f32.f16 %f88, %rs39;} // end inline asm fma.rn.ftz.f32 %f318, %f314, %f88, %f308; // begin inline asm { cvt.f32.f16 %f89, %rs40;} // end inline asm fma.rn.ftz.f32 %f319, %f314, %f89, %f309; // begin inline asm { cvt.f32.f16 %f90, %rs41;} // end inline asm fma.rn.ftz.f32 %f320, %f314, %f90, %f310; // begin inline asm { cvt.f32.f16 %f91, %rs42;} // end inline asm fma.rn.ftz.f32 %f321, %f314, %f91, %f311; shr.u16 %rs257, %rs253, 8; and.b16 %rs258, %rs257, 15; cvt.rn.f32.s16 %f322, %rs258; sub.ftz.f32 %f323, %f322, %f9; mul.ftz.f32 %f324, %f77, %f323; mov.b32 {%rs43, %rs50}, %r79; // begin inline asm { cvt.f32.f16 %f92, %rs43;} // end inline asm fma.rn.ftz.f32 %f325, %f324, %f92, %f315; mov.b32 {%rs44, %rs51}, %r83; // begin inline asm { cvt.f32.f16 %f93, %rs44;} // end inline asm 
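// zero-point path (%rs13 != 0): each 4-bit weight q is dequantized as
// w = %f77 * (q - %f9) and fma'd against the seven fp16 activation vectors,
// feeding the accumulators %f1320-%f1326 at the end of the block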
fma.rn.ftz.f32 %f326, %f324, %f93, %f316; mov.b32 {%rs45, %rs52}, %r89; // begin inline asm { cvt.f32.f16 %f94, %rs45;} // end inline asm fma.rn.ftz.f32 %f327, %f324, %f94, %f317; mov.b32 {%rs46, %rs53}, %r93; // begin inline asm { cvt.f32.f16 %f95, %rs46;} // end inline asm fma.rn.ftz.f32 %f328, %f324, %f95, %f318; mov.b32 {%rs47, %rs54}, %r97; // begin inline asm { cvt.f32.f16 %f96, %rs47;} // end inline asm fma.rn.ftz.f32 %f329, %f324, %f96, %f319; mov.b32 {%rs48, %rs55}, %r101; // begin inline asm { cvt.f32.f16 %f97, %rs48;} // end inline asm fma.rn.ftz.f32 %f330, %f324, %f97, %f320; mov.b32 {%rs49, %rs56}, %r105; // begin inline asm { cvt.f32.f16 %f98, %rs49;} // end inline asm fma.rn.ftz.f32 %f331, %f324, %f98, %f321; shr.u16 %rs259, %rs253, 12; cvt.rn.f32.s16 %f332, %rs259; sub.ftz.f32 %f333, %f332, %f9; mul.ftz.f32 %f334, %f77, %f333; // begin inline asm { cvt.f32.f16 %f99, %rs50;} // end inline asm fma.rn.ftz.f32 %f335, %f334, %f99, %f325; // begin inline asm { cvt.f32.f16 %f100, %rs51;} // end inline asm fma.rn.ftz.f32 %f336, %f334, %f100, %f326; // begin inline asm { cvt.f32.f16 %f101, %rs52;} // end inline asm fma.rn.ftz.f32 %f337, %f334, %f101, %f327; // begin inline asm { cvt.f32.f16 %f102, %rs53;} // end inline asm fma.rn.ftz.f32 %f338, %f334, %f102, %f328; // begin inline asm { cvt.f32.f16 %f103, %rs54;} // end inline asm fma.rn.ftz.f32 %f339, %f334, %f103, %f329; // begin inline asm { cvt.f32.f16 %f104, %rs55;} // end inline asm fma.rn.ftz.f32 %f340, %f334, %f104, %f330; // begin inline asm { cvt.f32.f16 %f105, %rs56;} // end inline asm fma.rn.ftz.f32 %f341, %f334, %f105, %f331; shr.u32 %r114, %r64, 16; cvt.u16.u32 %rs260, %r114; and.b16 %rs261, %rs260, 15; cvt.rn.f32.s16 %f342, %rs261; sub.ftz.f32 %f343, %f342, %f9; mul.ftz.f32 %f344, %f77, %f343; mov.b32 {%rs57, %rs64}, %r80; // begin inline asm { cvt.f32.f16 %f106, %rs57;} // end inline asm fma.rn.ftz.f32 %f345, %f344, %f106, %f335; mov.b32 {%rs58, %rs65}, %r84; // begin inline asm { cvt.f32.f16 %f107, %rs58;} // end inline asm fma.rn.ftz.f32 %f346, %f344, %f107, %f336; mov.b32 {%rs59, %rs66}, %r90; // begin inline asm { cvt.f32.f16 %f108, %rs59;} // end inline asm fma.rn.ftz.f32 %f347, %f344, %f108, %f337; mov.b32 {%rs60, %rs67}, %r94; // begin inline asm { cvt.f32.f16 %f109, %rs60;} // end inline asm fma.rn.ftz.f32 %f348, %f344, %f109, %f338; mov.b32 {%rs61, %rs68}, %r98; // begin inline asm { cvt.f32.f16 %f110, %rs61;} // end inline asm fma.rn.ftz.f32 %f349, %f344, %f110, %f339; mov.b32 {%rs62, %rs69}, %r102; // begin inline asm { cvt.f32.f16 %f111, %rs62;} // end inline asm fma.rn.ftz.f32 %f350, %f344, %f111, %f340; mov.b32 {%rs63, %rs70}, %r106; // begin inline asm { cvt.f32.f16 %f112, %rs63;} // end inline asm fma.rn.ftz.f32 %f351, %f344, %f112, %f341; shr.u32 %r115, %r64, 20; cvt.u16.u32 %rs262, %r115; and.b16 %rs263, %rs262, 15; cvt.rn.f32.s16 %f352, %rs263; sub.ftz.f32 %f353, %f352, %f9; mul.ftz.f32 %f354, %f77, %f353; // begin inline asm { cvt.f32.f16 %f113, %rs64;} // end inline asm fma.rn.ftz.f32 %f355, %f354, %f113, %f345; // begin inline asm { cvt.f32.f16 %f114, %rs65;} // end inline asm fma.rn.ftz.f32 %f356, %f354, %f114, %f346; // begin inline asm { cvt.f32.f16 %f115, %rs66;} // end inline asm fma.rn.ftz.f32 %f357, %f354, %f115, %f347; // begin inline asm { cvt.f32.f16 %f116, %rs67;} // end inline asm fma.rn.ftz.f32 %f358, %f354, %f116, %f348; // begin inline asm { cvt.f32.f16 %f117, %rs68;} // end inline asm fma.rn.ftz.f32 %f359, %f354, %f117, %f349; // begin inline asm { cvt.f32.f16 %f118, %rs69;} // 
end inline asm fma.rn.ftz.f32 %f360, %f354, %f118, %f350; // begin inline asm { cvt.f32.f16 %f119, %rs70;} // end inline asm fma.rn.ftz.f32 %f361, %f354, %f119, %f351; shr.u32 %r116, %r64, 24; cvt.u16.u32 %rs264, %r116; and.b16 %rs265, %rs264, 15; cvt.rn.f32.s16 %f362, %rs265; sub.ftz.f32 %f363, %f362, %f9; mul.ftz.f32 %f364, %f77, %f363; mov.b32 {%rs71, %rs78}, %r81; // begin inline asm { cvt.f32.f16 %f120, %rs71;} // end inline asm fma.rn.ftz.f32 %f365, %f364, %f120, %f355; mov.b32 {%rs72, %rs79}, %r85; // begin inline asm { cvt.f32.f16 %f121, %rs72;} // end inline asm fma.rn.ftz.f32 %f366, %f364, %f121, %f356; mov.b32 {%rs73, %rs80}, %r91; // begin inline asm { cvt.f32.f16 %f122, %rs73;} // end inline asm fma.rn.ftz.f32 %f367, %f364, %f122, %f357; mov.b32 {%rs74, %rs81}, %r95; // begin inline asm { cvt.f32.f16 %f123, %rs74;} // end inline asm fma.rn.ftz.f32 %f368, %f364, %f123, %f358; mov.b32 {%rs75, %rs82}, %r99; // begin inline asm { cvt.f32.f16 %f124, %rs75;} // end inline asm fma.rn.ftz.f32 %f369, %f364, %f124, %f359; mov.b32 {%rs76, %rs83}, %r103; // begin inline asm { cvt.f32.f16 %f125, %rs76;} // end inline asm fma.rn.ftz.f32 %f370, %f364, %f125, %f360; mov.b32 {%rs77, %rs84}, %r107; // begin inline asm { cvt.f32.f16 %f126, %rs77;} // end inline asm fma.rn.ftz.f32 %f371, %f364, %f126, %f361; shr.u32 %r117, %r64, 28; cvt.u16.u32 %rs266, %r117; cvt.rn.f32.s16 %f372, %rs266; sub.ftz.f32 %f373, %f372, %f9; mul.ftz.f32 %f374, %f77, %f373; // begin inline asm { cvt.f32.f16 %f127, %rs78;} // end inline asm fma.rn.ftz.f32 %f375, %f374, %f127, %f365; // begin inline asm { cvt.f32.f16 %f128, %rs79;} // end inline asm fma.rn.ftz.f32 %f376, %f374, %f128, %f366; // begin inline asm { cvt.f32.f16 %f129, %rs80;} // end inline asm fma.rn.ftz.f32 %f377, %f374, %f129, %f367; // begin inline asm { cvt.f32.f16 %f130, %rs81;} // end inline asm fma.rn.ftz.f32 %f378, %f374, %f130, %f368; // begin inline asm { cvt.f32.f16 %f131, %rs82;} // end inline asm fma.rn.ftz.f32 %f379, %f374, %f131, %f369; // begin inline asm { cvt.f32.f16 %f132, %rs83;} // end inline asm fma.rn.ftz.f32 %f380, %f374, %f132, %f370; // begin inline asm { cvt.f32.f16 %f133, %rs84;} // end inline asm fma.rn.ftz.f32 %f381, %f374, %f133, %f371; ld.global.v4.u32 {%r118, %r119, %r120, %r121}, [%rd5+16]; ld.global.v4.u32 {%r126, %r127, %r128, %r129}, [%rd7]; ld.global.v4.u32 {%r134, %r135, %r136, %r137}, [%rd6+16]; ld.global.v4.u32 {%r142, %r143, %r144, %r145}, [%rd8]; ld.global.v4.u32 {%r150, %r151, %r152, %r153}, [%rd9]; ld.global.v4.u32 {%r158, %r159, %r160, %r161}, [%rd10]; ld.global.v4.u32 {%r166, %r167, %r168, %r169}, [%rd11]; cvt.u16.u32 %rs267, %r65; and.b16 %rs268, %rs267, 240; and.b16 %rs269, %rs267, 15; cvt.rn.f32.s16 %f382, %rs269; sub.ftz.f32 %f383, %f382, %f9; mul.ftz.f32 %f384, %f77, %f383; mov.b32 {%rs85, %rs92}, %r118; // begin inline asm { cvt.f32.f16 %f134, %rs85;} // end inline asm fma.rn.ftz.f32 %f385, %f384, %f134, %f375; mov.b32 {%rs86, %rs93}, %r126; // begin inline asm { cvt.f32.f16 %f135, %rs86;} // end inline asm fma.rn.ftz.f32 %f386, %f384, %f135, %f376; mov.b32 {%rs87, %rs94}, %r134; // begin inline asm { cvt.f32.f16 %f136, %rs87;} // end inline asm fma.rn.ftz.f32 %f387, %f384, %f136, %f377; mov.b32 {%rs88, %rs95}, %r142; // begin inline asm { cvt.f32.f16 %f137, %rs88;} // end inline asm fma.rn.ftz.f32 %f388, %f384, %f137, %f378; mov.b32 {%rs89, %rs96}, %r150; // begin inline asm { cvt.f32.f16 %f138, %rs89;} // end inline asm fma.rn.ftz.f32 %f389, %f384, %f138, %f379; mov.b32 {%rs90, %rs97}, %r158; // begin 
inline asm { cvt.f32.f16 %f139, %rs90;} // end inline asm fma.rn.ftz.f32 %f390, %f384, %f139, %f380; mov.b32 {%rs91, %rs98}, %r166; // begin inline asm { cvt.f32.f16 %f140, %rs91;} // end inline asm fma.rn.ftz.f32 %f391, %f384, %f140, %f381; shr.u16 %rs270, %rs268, 4; cvt.rn.f32.s16 %f392, %rs270; sub.ftz.f32 %f393, %f392, %f9; mul.ftz.f32 %f394, %f77, %f393; // begin inline asm { cvt.f32.f16 %f141, %rs92;} // end inline asm fma.rn.ftz.f32 %f395, %f394, %f141, %f385; // begin inline asm { cvt.f32.f16 %f142, %rs93;} // end inline asm fma.rn.ftz.f32 %f396, %f394, %f142, %f386; // begin inline asm { cvt.f32.f16 %f143, %rs94;} // end inline asm fma.rn.ftz.f32 %f397, %f394, %f143, %f387; // begin inline asm { cvt.f32.f16 %f144, %rs95;} // end inline asm fma.rn.ftz.f32 %f398, %f394, %f144, %f388; // begin inline asm { cvt.f32.f16 %f145, %rs96;} // end inline asm fma.rn.ftz.f32 %f399, %f394, %f145, %f389; // begin inline asm { cvt.f32.f16 %f146, %rs97;} // end inline asm fma.rn.ftz.f32 %f400, %f394, %f146, %f390; // begin inline asm { cvt.f32.f16 %f147, %rs98;} // end inline asm fma.rn.ftz.f32 %f401, %f394, %f147, %f391; shr.u16 %rs271, %rs267, 8; and.b16 %rs272, %rs271, 15; cvt.rn.f32.s16 %f402, %rs272; sub.ftz.f32 %f403, %f402, %f9; mul.ftz.f32 %f404, %f77, %f403; mov.b32 {%rs99, %rs106}, %r119; // begin inline asm { cvt.f32.f16 %f148, %rs99;} // end inline asm fma.rn.ftz.f32 %f405, %f404, %f148, %f395; mov.b32 {%rs100, %rs107}, %r127; // begin inline asm { cvt.f32.f16 %f149, %rs100;} // end inline asm fma.rn.ftz.f32 %f406, %f404, %f149, %f396; mov.b32 {%rs101, %rs108}, %r135; // begin inline asm { cvt.f32.f16 %f150, %rs101;} // end inline asm fma.rn.ftz.f32 %f407, %f404, %f150, %f397; mov.b32 {%rs102, %rs109}, %r143; // begin inline asm { cvt.f32.f16 %f151, %rs102;} // end inline asm fma.rn.ftz.f32 %f408, %f404, %f151, %f398; mov.b32 {%rs103, %rs110}, %r151; // begin inline asm { cvt.f32.f16 %f152, %rs103;} // end inline asm fma.rn.ftz.f32 %f409, %f404, %f152, %f399; mov.b32 {%rs104, %rs111}, %r159; // begin inline asm { cvt.f32.f16 %f153, %rs104;} // end inline asm fma.rn.ftz.f32 %f410, %f404, %f153, %f400; mov.b32 {%rs105, %rs112}, %r167; // begin inline asm { cvt.f32.f16 %f154, %rs105;} // end inline asm fma.rn.ftz.f32 %f411, %f404, %f154, %f401; shr.u16 %rs273, %rs267, 12; cvt.rn.f32.s16 %f412, %rs273; sub.ftz.f32 %f413, %f412, %f9; mul.ftz.f32 %f414, %f77, %f413; // begin inline asm { cvt.f32.f16 %f155, %rs106;} // end inline asm fma.rn.ftz.f32 %f415, %f414, %f155, %f405; // begin inline asm { cvt.f32.f16 %f156, %rs107;} // end inline asm fma.rn.ftz.f32 %f416, %f414, %f156, %f406; // begin inline asm { cvt.f32.f16 %f157, %rs108;} // end inline asm fma.rn.ftz.f32 %f417, %f414, %f157, %f407; // begin inline asm { cvt.f32.f16 %f158, %rs109;} // end inline asm fma.rn.ftz.f32 %f418, %f414, %f158, %f408; // begin inline asm { cvt.f32.f16 %f159, %rs110;} // end inline asm fma.rn.ftz.f32 %f419, %f414, %f159, %f409; // begin inline asm { cvt.f32.f16 %f160, %rs111;} // end inline asm fma.rn.ftz.f32 %f420, %f414, %f160, %f410; // begin inline asm { cvt.f32.f16 %f161, %rs112;} // end inline asm fma.rn.ftz.f32 %f421, %f414, %f161, %f411; shr.u32 %r174, %r65, 16; cvt.u16.u32 %rs274, %r174; and.b16 %rs275, %rs274, 15; cvt.rn.f32.s16 %f422, %rs275; sub.ftz.f32 %f423, %f422, %f9; mul.ftz.f32 %f424, %f77, %f423; mov.b32 {%rs113, %rs120}, %r120; // begin inline asm { cvt.f32.f16 %f162, %rs113;} // end inline asm fma.rn.ftz.f32 %f425, %f424, %f162, %f415; mov.b32 {%rs114, %rs121}, %r128; // begin inline asm 
{ cvt.f32.f16 %f163, %rs114;} // end inline asm fma.rn.ftz.f32 %f426, %f424, %f163, %f416; mov.b32 {%rs115, %rs122}, %r136; // begin inline asm { cvt.f32.f16 %f164, %rs115;} // end inline asm fma.rn.ftz.f32 %f427, %f424, %f164, %f417; mov.b32 {%rs116, %rs123}, %r144; // begin inline asm { cvt.f32.f16 %f165, %rs116;} // end inline asm fma.rn.ftz.f32 %f428, %f424, %f165, %f418; mov.b32 {%rs117, %rs124}, %r152; // begin inline asm { cvt.f32.f16 %f166, %rs117;} // end inline asm fma.rn.ftz.f32 %f429, %f424, %f166, %f419; mov.b32 {%rs118, %rs125}, %r160; // begin inline asm { cvt.f32.f16 %f167, %rs118;} // end inline asm fma.rn.ftz.f32 %f430, %f424, %f167, %f420; mov.b32 {%rs119, %rs126}, %r168; // begin inline asm { cvt.f32.f16 %f168, %rs119;} // end inline asm fma.rn.ftz.f32 %f431, %f424, %f168, %f421; shr.u32 %r175, %r65, 20; cvt.u16.u32 %rs276, %r175; and.b16 %rs277, %rs276, 15; cvt.rn.f32.s16 %f432, %rs277; sub.ftz.f32 %f433, %f432, %f9; mul.ftz.f32 %f434, %f77, %f433; // begin inline asm { cvt.f32.f16 %f169, %rs120;} // end inline asm fma.rn.ftz.f32 %f435, %f434, %f169, %f425; // begin inline asm { cvt.f32.f16 %f170, %rs121;} // end inline asm fma.rn.ftz.f32 %f436, %f434, %f170, %f426; // begin inline asm { cvt.f32.f16 %f171, %rs122;} // end inline asm fma.rn.ftz.f32 %f437, %f434, %f171, %f427; // begin inline asm { cvt.f32.f16 %f172, %rs123;} // end inline asm fma.rn.ftz.f32 %f438, %f434, %f172, %f428; // begin inline asm { cvt.f32.f16 %f173, %rs124;} // end inline asm fma.rn.ftz.f32 %f439, %f434, %f173, %f429; // begin inline asm { cvt.f32.f16 %f174, %rs125;} // end inline asm fma.rn.ftz.f32 %f440, %f434, %f174, %f430; // begin inline asm { cvt.f32.f16 %f175, %rs126;} // end inline asm fma.rn.ftz.f32 %f441, %f434, %f175, %f431; shr.u32 %r176, %r65, 24; cvt.u16.u32 %rs278, %r176; and.b16 %rs279, %rs278, 15; cvt.rn.f32.s16 %f442, %rs279; sub.ftz.f32 %f443, %f442, %f9; mul.ftz.f32 %f444, %f77, %f443; mov.b32 {%rs127, %rs134}, %r121; // begin inline asm { cvt.f32.f16 %f176, %rs127;} // end inline asm fma.rn.ftz.f32 %f445, %f444, %f176, %f435; mov.b32 {%rs128, %rs135}, %r129; // begin inline asm { cvt.f32.f16 %f177, %rs128;} // end inline asm fma.rn.ftz.f32 %f446, %f444, %f177, %f436; mov.b32 {%rs129, %rs136}, %r137; // begin inline asm { cvt.f32.f16 %f178, %rs129;} // end inline asm fma.rn.ftz.f32 %f447, %f444, %f178, %f437; mov.b32 {%rs130, %rs137}, %r145; // begin inline asm { cvt.f32.f16 %f179, %rs130;} // end inline asm fma.rn.ftz.f32 %f448, %f444, %f179, %f438; mov.b32 {%rs131, %rs138}, %r153; // begin inline asm { cvt.f32.f16 %f180, %rs131;} // end inline asm fma.rn.ftz.f32 %f449, %f444, %f180, %f439; mov.b32 {%rs132, %rs139}, %r161; // begin inline asm { cvt.f32.f16 %f181, %rs132;} // end inline asm fma.rn.ftz.f32 %f450, %f444, %f181, %f440; mov.b32 {%rs133, %rs140}, %r169; // begin inline asm { cvt.f32.f16 %f182, %rs133;} // end inline asm fma.rn.ftz.f32 %f451, %f444, %f182, %f441; shr.u32 %r177, %r65, 28; cvt.u16.u32 %rs280, %r177; cvt.rn.f32.s16 %f452, %rs280; sub.ftz.f32 %f453, %f452, %f9; mul.ftz.f32 %f454, %f77, %f453; // begin inline asm { cvt.f32.f16 %f183, %rs134;} // end inline asm fma.rn.ftz.f32 %f455, %f454, %f183, %f445; // begin inline asm { cvt.f32.f16 %f184, %rs135;} // end inline asm fma.rn.ftz.f32 %f456, %f454, %f184, %f446; // begin inline asm { cvt.f32.f16 %f185, %rs136;} // end inline asm fma.rn.ftz.f32 %f457, %f454, %f185, %f447; // begin inline asm { cvt.f32.f16 %f186, %rs137;} // end inline asm fma.rn.ftz.f32 %f458, %f454, %f186, %f448; // begin inline asm { 
cvt.f32.f16 %f187, %rs138;} // end inline asm fma.rn.ftz.f32 %f459, %f454, %f187, %f449; // begin inline asm { cvt.f32.f16 %f188, %rs139;} // end inline asm fma.rn.ftz.f32 %f460, %f454, %f188, %f450; // begin inline asm { cvt.f32.f16 %f189, %rs140;} // end inline asm fma.rn.ftz.f32 %f461, %f454, %f189, %f451; ld.global.v4.u32 {%r178, %r179, %r180, %r181}, [%rd5+32]; ld.global.v4.u32 {%r186, %r187, %r188, %r189}, [%rd7+16]; ld.global.v4.u32 {%r194, %r195, %r196, %r197}, [%rd6+32]; ld.global.v4.u32 {%r202, %r203, %r204, %r205}, [%rd8+16]; ld.global.v4.u32 {%r210, %r211, %r212, %r213}, [%rd9+16]; ld.global.v4.u32 {%r218, %r219, %r220, %r221}, [%rd10+16]; ld.global.v4.u32 {%r226, %r227, %r228, %r229}, [%rd11+16]; cvt.u16.u32 %rs281, %r66; and.b16 %rs282, %rs281, 240; and.b16 %rs283, %rs281, 15; cvt.rn.f32.s16 %f462, %rs283; sub.ftz.f32 %f463, %f462, %f9; mul.ftz.f32 %f464, %f77, %f463; mov.b32 {%rs141, %rs148}, %r178; // begin inline asm { cvt.f32.f16 %f190, %rs141;} // end inline asm fma.rn.ftz.f32 %f465, %f464, %f190, %f455; mov.b32 {%rs142, %rs149}, %r186; // begin inline asm { cvt.f32.f16 %f191, %rs142;} // end inline asm fma.rn.ftz.f32 %f466, %f464, %f191, %f456; mov.b32 {%rs143, %rs150}, %r194; // begin inline asm { cvt.f32.f16 %f192, %rs143;} // end inline asm fma.rn.ftz.f32 %f467, %f464, %f192, %f457; mov.b32 {%rs144, %rs151}, %r202; // begin inline asm { cvt.f32.f16 %f193, %rs144;} // end inline asm fma.rn.ftz.f32 %f468, %f464, %f193, %f458; mov.b32 {%rs145, %rs152}, %r210; // begin inline asm { cvt.f32.f16 %f194, %rs145;} // end inline asm fma.rn.ftz.f32 %f469, %f464, %f194, %f459; mov.b32 {%rs146, %rs153}, %r218; // begin inline asm { cvt.f32.f16 %f195, %rs146;} // end inline asm fma.rn.ftz.f32 %f470, %f464, %f195, %f460; mov.b32 {%rs147, %rs154}, %r226; // begin inline asm { cvt.f32.f16 %f196, %rs147;} // end inline asm fma.rn.ftz.f32 %f471, %f464, %f196, %f461; shr.u16 %rs284, %rs282, 4; cvt.rn.f32.s16 %f472, %rs284; sub.ftz.f32 %f473, %f472, %f9; mul.ftz.f32 %f474, %f77, %f473; // begin inline asm { cvt.f32.f16 %f197, %rs148;} // end inline asm fma.rn.ftz.f32 %f475, %f474, %f197, %f465; // begin inline asm { cvt.f32.f16 %f198, %rs149;} // end inline asm fma.rn.ftz.f32 %f476, %f474, %f198, %f466; // begin inline asm { cvt.f32.f16 %f199, %rs150;} // end inline asm fma.rn.ftz.f32 %f477, %f474, %f199, %f467; // begin inline asm { cvt.f32.f16 %f200, %rs151;} // end inline asm fma.rn.ftz.f32 %f478, %f474, %f200, %f468; // begin inline asm { cvt.f32.f16 %f201, %rs152;} // end inline asm fma.rn.ftz.f32 %f479, %f474, %f201, %f469; // begin inline asm { cvt.f32.f16 %f202, %rs153;} // end inline asm fma.rn.ftz.f32 %f480, %f474, %f202, %f470; // begin inline asm { cvt.f32.f16 %f203, %rs154;} // end inline asm fma.rn.ftz.f32 %f481, %f474, %f203, %f471; shr.u16 %rs285, %rs281, 8; and.b16 %rs286, %rs285, 15; cvt.rn.f32.s16 %f482, %rs286; sub.ftz.f32 %f483, %f482, %f9; mul.ftz.f32 %f484, %f77, %f483; mov.b32 {%rs155, %rs162}, %r179; // begin inline asm { cvt.f32.f16 %f204, %rs155;} // end inline asm fma.rn.ftz.f32 %f485, %f484, %f204, %f475; mov.b32 {%rs156, %rs163}, %r187; // begin inline asm { cvt.f32.f16 %f205, %rs156;} // end inline asm fma.rn.ftz.f32 %f486, %f484, %f205, %f476; mov.b32 {%rs157, %rs164}, %r195; // begin inline asm { cvt.f32.f16 %f206, %rs157;} // end inline asm fma.rn.ftz.f32 %f487, %f484, %f206, %f477; mov.b32 {%rs158, %rs165}, %r203; // begin inline asm { cvt.f32.f16 %f207, %rs158;} // end inline asm fma.rn.ftz.f32 %f488, %f484, %f207, %f478; mov.b32 {%rs159, %rs166}, 
%r211; // begin inline asm { cvt.f32.f16 %f208, %rs159;} // end inline asm fma.rn.ftz.f32 %f489, %f484, %f208, %f479; mov.b32 {%rs160, %rs167}, %r219; // begin inline asm { cvt.f32.f16 %f209, %rs160;} // end inline asm fma.rn.ftz.f32 %f490, %f484, %f209, %f480; mov.b32 {%rs161, %rs168}, %r227; // begin inline asm { cvt.f32.f16 %f210, %rs161;} // end inline asm fma.rn.ftz.f32 %f491, %f484, %f210, %f481; shr.u16 %rs287, %rs281, 12; cvt.rn.f32.s16 %f492, %rs287; sub.ftz.f32 %f493, %f492, %f9; mul.ftz.f32 %f494, %f77, %f493; // begin inline asm { cvt.f32.f16 %f211, %rs162;} // end inline asm fma.rn.ftz.f32 %f495, %f494, %f211, %f485; // begin inline asm { cvt.f32.f16 %f212, %rs163;} // end inline asm fma.rn.ftz.f32 %f496, %f494, %f212, %f486; // begin inline asm { cvt.f32.f16 %f213, %rs164;} // end inline asm fma.rn.ftz.f32 %f497, %f494, %f213, %f487; // begin inline asm { cvt.f32.f16 %f214, %rs165;} // end inline asm fma.rn.ftz.f32 %f498, %f494, %f214, %f488; // begin inline asm { cvt.f32.f16 %f215, %rs166;} // end inline asm fma.rn.ftz.f32 %f499, %f494, %f215, %f489; // begin inline asm { cvt.f32.f16 %f216, %rs167;} // end inline asm fma.rn.ftz.f32 %f500, %f494, %f216, %f490; // begin inline asm { cvt.f32.f16 %f217, %rs168;} // end inline asm fma.rn.ftz.f32 %f501, %f494, %f217, %f491; shr.u32 %r234, %r66, 16; cvt.u16.u32 %rs288, %r234; and.b16 %rs289, %rs288, 15; cvt.rn.f32.s16 %f502, %rs289; sub.ftz.f32 %f503, %f502, %f9; mul.ftz.f32 %f504, %f77, %f503; mov.b32 {%rs169, %rs176}, %r180; // begin inline asm { cvt.f32.f16 %f218, %rs169;} // end inline asm fma.rn.ftz.f32 %f505, %f504, %f218, %f495; mov.b32 {%rs170, %rs177}, %r188; // begin inline asm { cvt.f32.f16 %f219, %rs170;} // end inline asm fma.rn.ftz.f32 %f506, %f504, %f219, %f496; mov.b32 {%rs171, %rs178}, %r196; // begin inline asm { cvt.f32.f16 %f220, %rs171;} // end inline asm fma.rn.ftz.f32 %f507, %f504, %f220, %f497; mov.b32 {%rs172, %rs179}, %r204; // begin inline asm { cvt.f32.f16 %f221, %rs172;} // end inline asm fma.rn.ftz.f32 %f508, %f504, %f221, %f498; mov.b32 {%rs173, %rs180}, %r212; // begin inline asm { cvt.f32.f16 %f222, %rs173;} // end inline asm fma.rn.ftz.f32 %f509, %f504, %f222, %f499; mov.b32 {%rs174, %rs181}, %r220; // begin inline asm { cvt.f32.f16 %f223, %rs174;} // end inline asm fma.rn.ftz.f32 %f510, %f504, %f223, %f500; mov.b32 {%rs175, %rs182}, %r228; // begin inline asm { cvt.f32.f16 %f224, %rs175;} // end inline asm fma.rn.ftz.f32 %f511, %f504, %f224, %f501; shr.u32 %r235, %r66, 20; cvt.u16.u32 %rs290, %r235; and.b16 %rs291, %rs290, 15; cvt.rn.f32.s16 %f512, %rs291; sub.ftz.f32 %f513, %f512, %f9; mul.ftz.f32 %f514, %f77, %f513; // begin inline asm { cvt.f32.f16 %f225, %rs176;} // end inline asm fma.rn.ftz.f32 %f515, %f514, %f225, %f505; // begin inline asm { cvt.f32.f16 %f226, %rs177;} // end inline asm fma.rn.ftz.f32 %f516, %f514, %f226, %f506; // begin inline asm { cvt.f32.f16 %f227, %rs178;} // end inline asm fma.rn.ftz.f32 %f517, %f514, %f227, %f507; // begin inline asm { cvt.f32.f16 %f228, %rs179;} // end inline asm fma.rn.ftz.f32 %f518, %f514, %f228, %f508; // begin inline asm { cvt.f32.f16 %f229, %rs180;} // end inline asm fma.rn.ftz.f32 %f519, %f514, %f229, %f509; // begin inline asm { cvt.f32.f16 %f230, %rs181;} // end inline asm fma.rn.ftz.f32 %f520, %f514, %f230, %f510; // begin inline asm { cvt.f32.f16 %f231, %rs182;} // end inline asm fma.rn.ftz.f32 %f521, %f514, %f231, %f511; shr.u32 %r236, %r66, 24; cvt.u16.u32 %rs292, %r236; and.b16 %rs293, %rs292, 15; cvt.rn.f32.s16 %f522, %rs293; 
sub.ftz.f32 %f523, %f522, %f9; mul.ftz.f32 %f524, %f77, %f523; mov.b32 {%rs183, %rs190}, %r181; // begin inline asm { cvt.f32.f16 %f232, %rs183;} // end inline asm fma.rn.ftz.f32 %f525, %f524, %f232, %f515; mov.b32 {%rs184, %rs191}, %r189; // begin inline asm { cvt.f32.f16 %f233, %rs184;} // end inline asm fma.rn.ftz.f32 %f526, %f524, %f233, %f516; mov.b32 {%rs185, %rs192}, %r197; // begin inline asm { cvt.f32.f16 %f234, %rs185;} // end inline asm fma.rn.ftz.f32 %f527, %f524, %f234, %f517; mov.b32 {%rs186, %rs193}, %r205; // begin inline asm { cvt.f32.f16 %f235, %rs186;} // end inline asm fma.rn.ftz.f32 %f528, %f524, %f235, %f518; mov.b32 {%rs187, %rs194}, %r213; // begin inline asm { cvt.f32.f16 %f236, %rs187;} // end inline asm fma.rn.ftz.f32 %f529, %f524, %f236, %f519; mov.b32 {%rs188, %rs195}, %r221; // begin inline asm { cvt.f32.f16 %f237, %rs188;} // end inline asm fma.rn.ftz.f32 %f530, %f524, %f237, %f520; mov.b32 {%rs189, %rs196}, %r229; // begin inline asm { cvt.f32.f16 %f238, %rs189;} // end inline asm fma.rn.ftz.f32 %f531, %f524, %f238, %f521; shr.u32 %r237, %r66, 28; cvt.u16.u32 %rs294, %r237; cvt.rn.f32.s16 %f532, %rs294; sub.ftz.f32 %f533, %f532, %f9; mul.ftz.f32 %f534, %f77, %f533; // begin inline asm { cvt.f32.f16 %f239, %rs190;} // end inline asm fma.rn.ftz.f32 %f535, %f534, %f239, %f525; // begin inline asm { cvt.f32.f16 %f240, %rs191;} // end inline asm fma.rn.ftz.f32 %f536, %f534, %f240, %f526; // begin inline asm { cvt.f32.f16 %f241, %rs192;} // end inline asm fma.rn.ftz.f32 %f537, %f534, %f241, %f527; // begin inline asm { cvt.f32.f16 %f242, %rs193;} // end inline asm fma.rn.ftz.f32 %f538, %f534, %f242, %f528; // begin inline asm { cvt.f32.f16 %f243, %rs194;} // end inline asm fma.rn.ftz.f32 %f539, %f534, %f243, %f529; // begin inline asm { cvt.f32.f16 %f244, %rs195;} // end inline asm fma.rn.ftz.f32 %f540, %f534, %f244, %f530; // begin inline asm { cvt.f32.f16 %f245, %rs196;} // end inline asm fma.rn.ftz.f32 %f541, %f534, %f245, %f531; ld.global.v4.u32 {%r238, %r239, %r240, %r241}, [%rd5+48]; ld.global.v4.u32 {%r246, %r247, %r248, %r249}, [%rd7+32]; ld.global.v4.u32 {%r254, %r255, %r256, %r257}, [%rd6+48]; ld.global.v4.u32 {%r262, %r263, %r264, %r265}, [%rd8+32]; ld.global.v4.u32 {%r270, %r271, %r272, %r273}, [%rd9+32]; ld.global.v4.u32 {%r278, %r279, %r280, %r281}, [%rd10+32]; ld.global.v4.u32 {%r286, %r287, %r288, %r289}, [%rd11+32]; cvt.u16.u32 %rs295, %r67; and.b16 %rs296, %rs295, 240; and.b16 %rs297, %rs295, 15; cvt.rn.f32.s16 %f542, %rs297; sub.ftz.f32 %f543, %f542, %f9; mul.ftz.f32 %f544, %f77, %f543; mov.b32 {%rs197, %rs204}, %r238; // begin inline asm { cvt.f32.f16 %f246, %rs197;} // end inline asm fma.rn.ftz.f32 %f545, %f544, %f246, %f535; mov.b32 {%rs198, %rs205}, %r246; // begin inline asm { cvt.f32.f16 %f247, %rs198;} // end inline asm fma.rn.ftz.f32 %f546, %f544, %f247, %f536; mov.b32 {%rs199, %rs206}, %r254; // begin inline asm { cvt.f32.f16 %f248, %rs199;} // end inline asm fma.rn.ftz.f32 %f547, %f544, %f248, %f537; mov.b32 {%rs200, %rs207}, %r262; // begin inline asm { cvt.f32.f16 %f249, %rs200;} // end inline asm fma.rn.ftz.f32 %f548, %f544, %f249, %f538; mov.b32 {%rs201, %rs208}, %r270; // begin inline asm { cvt.f32.f16 %f250, %rs201;} // end inline asm fma.rn.ftz.f32 %f549, %f544, %f250, %f539; mov.b32 {%rs202, %rs209}, %r278; // begin inline asm { cvt.f32.f16 %f251, %rs202;} // end inline asm fma.rn.ftz.f32 %f550, %f544, %f251, %f540; mov.b32 {%rs203, %rs210}, %r286; // begin inline asm { cvt.f32.f16 %f252, %rs203;} // end inline asm 
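// fourth packed word (%r67): same per-nibble dequantize-and-fma sequence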
fma.rn.ftz.f32 %f551, %f544, %f252, %f541; shr.u16 %rs298, %rs296, 4; cvt.rn.f32.s16 %f552, %rs298; sub.ftz.f32 %f553, %f552, %f9; mul.ftz.f32 %f554, %f77, %f553; // begin inline asm { cvt.f32.f16 %f253, %rs204;} // end inline asm fma.rn.ftz.f32 %f555, %f554, %f253, %f545; // begin inline asm { cvt.f32.f16 %f254, %rs205;} // end inline asm fma.rn.ftz.f32 %f556, %f554, %f254, %f546; // begin inline asm { cvt.f32.f16 %f255, %rs206;} // end inline asm fma.rn.ftz.f32 %f557, %f554, %f255, %f547; // begin inline asm { cvt.f32.f16 %f256, %rs207;} // end inline asm fma.rn.ftz.f32 %f558, %f554, %f256, %f548; // begin inline asm { cvt.f32.f16 %f257, %rs208;} // end inline asm fma.rn.ftz.f32 %f559, %f554, %f257, %f549; // begin inline asm { cvt.f32.f16 %f258, %rs209;} // end inline asm fma.rn.ftz.f32 %f560, %f554, %f258, %f550; // begin inline asm { cvt.f32.f16 %f259, %rs210;} // end inline asm fma.rn.ftz.f32 %f561, %f554, %f259, %f551; shr.u16 %rs299, %rs295, 8; and.b16 %rs300, %rs299, 15; cvt.rn.f32.s16 %f562, %rs300; sub.ftz.f32 %f563, %f562, %f9; mul.ftz.f32 %f564, %f77, %f563; mov.b32 {%rs211, %rs218}, %r239; // begin inline asm { cvt.f32.f16 %f260, %rs211;} // end inline asm fma.rn.ftz.f32 %f565, %f564, %f260, %f555; mov.b32 {%rs212, %rs219}, %r247; // begin inline asm { cvt.f32.f16 %f261, %rs212;} // end inline asm fma.rn.ftz.f32 %f566, %f564, %f261, %f556; mov.b32 {%rs213, %rs220}, %r255; // begin inline asm { cvt.f32.f16 %f262, %rs213;} // end inline asm fma.rn.ftz.f32 %f567, %f564, %f262, %f557; mov.b32 {%rs214, %rs221}, %r263; // begin inline asm { cvt.f32.f16 %f263, %rs214;} // end inline asm fma.rn.ftz.f32 %f568, %f564, %f263, %f558; mov.b32 {%rs215, %rs222}, %r271; // begin inline asm { cvt.f32.f16 %f264, %rs215;} // end inline asm fma.rn.ftz.f32 %f569, %f564, %f264, %f559; mov.b32 {%rs216, %rs223}, %r279; // begin inline asm { cvt.f32.f16 %f265, %rs216;} // end inline asm fma.rn.ftz.f32 %f570, %f564, %f265, %f560; mov.b32 {%rs217, %rs224}, %r287; // begin inline asm { cvt.f32.f16 %f266, %rs217;} // end inline asm fma.rn.ftz.f32 %f571, %f564, %f266, %f561; shr.u16 %rs301, %rs295, 12; cvt.rn.f32.s16 %f572, %rs301; sub.ftz.f32 %f573, %f572, %f9; mul.ftz.f32 %f574, %f77, %f573; // begin inline asm { cvt.f32.f16 %f267, %rs218;} // end inline asm fma.rn.ftz.f32 %f575, %f574, %f267, %f565; // begin inline asm { cvt.f32.f16 %f268, %rs219;} // end inline asm fma.rn.ftz.f32 %f576, %f574, %f268, %f566; // begin inline asm { cvt.f32.f16 %f269, %rs220;} // end inline asm fma.rn.ftz.f32 %f577, %f574, %f269, %f567; // begin inline asm { cvt.f32.f16 %f270, %rs221;} // end inline asm fma.rn.ftz.f32 %f578, %f574, %f270, %f568; // begin inline asm { cvt.f32.f16 %f271, %rs222;} // end inline asm fma.rn.ftz.f32 %f579, %f574, %f271, %f569; // begin inline asm { cvt.f32.f16 %f272, %rs223;} // end inline asm fma.rn.ftz.f32 %f580, %f574, %f272, %f570; // begin inline asm { cvt.f32.f16 %f273, %rs224;} // end inline asm fma.rn.ftz.f32 %f581, %f574, %f273, %f571; shr.u32 %r294, %r67, 16; cvt.u16.u32 %rs302, %r294; and.b16 %rs303, %rs302, 15; cvt.rn.f32.s16 %f582, %rs303; sub.ftz.f32 %f583, %f582, %f9; mul.ftz.f32 %f584, %f77, %f583; mov.b32 {%rs225, %rs232}, %r240; // begin inline asm { cvt.f32.f16 %f274, %rs225;} // end inline asm fma.rn.ftz.f32 %f585, %f584, %f274, %f575; mov.b32 {%rs226, %rs233}, %r248; // begin inline asm { cvt.f32.f16 %f275, %rs226;} // end inline asm fma.rn.ftz.f32 %f586, %f584, %f275, %f576; mov.b32 {%rs227, %rs234}, %r256; // begin inline asm { cvt.f32.f16 %f276, %rs227;} // end inline 
asm fma.rn.ftz.f32 %f587, %f584, %f276, %f577; mov.b32 {%rs228, %rs235}, %r264; // begin inline asm { cvt.f32.f16 %f277, %rs228;} // end inline asm fma.rn.ftz.f32 %f588, %f584, %f277, %f578; mov.b32 {%rs229, %rs236}, %r272; // begin inline asm { cvt.f32.f16 %f278, %rs229;} // end inline asm fma.rn.ftz.f32 %f589, %f584, %f278, %f579; mov.b32 {%rs230, %rs237}, %r280; // begin inline asm { cvt.f32.f16 %f279, %rs230;} // end inline asm fma.rn.ftz.f32 %f590, %f584, %f279, %f580; mov.b32 {%rs231, %rs238}, %r288; // begin inline asm { cvt.f32.f16 %f280, %rs231;} // end inline asm fma.rn.ftz.f32 %f591, %f584, %f280, %f581; shr.u32 %r295, %r67, 20; cvt.u16.u32 %rs304, %r295; and.b16 %rs305, %rs304, 15; cvt.rn.f32.s16 %f592, %rs305; sub.ftz.f32 %f593, %f592, %f9; mul.ftz.f32 %f594, %f77, %f593; // begin inline asm { cvt.f32.f16 %f281, %rs232;} // end inline asm fma.rn.ftz.f32 %f595, %f594, %f281, %f585; // begin inline asm { cvt.f32.f16 %f282, %rs233;} // end inline asm fma.rn.ftz.f32 %f596, %f594, %f282, %f586; // begin inline asm { cvt.f32.f16 %f283, %rs234;} // end inline asm fma.rn.ftz.f32 %f597, %f594, %f283, %f587; // begin inline asm { cvt.f32.f16 %f284, %rs235;} // end inline asm fma.rn.ftz.f32 %f598, %f594, %f284, %f588; // begin inline asm { cvt.f32.f16 %f285, %rs236;} // end inline asm fma.rn.ftz.f32 %f599, %f594, %f285, %f589; // begin inline asm { cvt.f32.f16 %f286, %rs237;} // end inline asm fma.rn.ftz.f32 %f600, %f594, %f286, %f590; // begin inline asm { cvt.f32.f16 %f287, %rs238;} // end inline asm fma.rn.ftz.f32 %f601, %f594, %f287, %f591; shr.u32 %r296, %r67, 24; cvt.u16.u32 %rs306, %r296; and.b16 %rs307, %rs306, 15; cvt.rn.f32.s16 %f602, %rs307; sub.ftz.f32 %f603, %f602, %f9; mul.ftz.f32 %f604, %f77, %f603; mov.b32 {%rs239, %rs246}, %r241; // begin inline asm { cvt.f32.f16 %f288, %rs239;} // end inline asm fma.rn.ftz.f32 %f605, %f604, %f288, %f595; mov.b32 {%rs240, %rs247}, %r249; // begin inline asm { cvt.f32.f16 %f289, %rs240;} // end inline asm fma.rn.ftz.f32 %f606, %f604, %f289, %f596; mov.b32 {%rs241, %rs248}, %r257; // begin inline asm { cvt.f32.f16 %f290, %rs241;} // end inline asm fma.rn.ftz.f32 %f607, %f604, %f290, %f597; mov.b32 {%rs242, %rs249}, %r265; // begin inline asm { cvt.f32.f16 %f291, %rs242;} // end inline asm fma.rn.ftz.f32 %f608, %f604, %f291, %f598; mov.b32 {%rs243, %rs250}, %r273; // begin inline asm { cvt.f32.f16 %f292, %rs243;} // end inline asm fma.rn.ftz.f32 %f609, %f604, %f292, %f599; mov.b32 {%rs244, %rs251}, %r281; // begin inline asm { cvt.f32.f16 %f293, %rs244;} // end inline asm fma.rn.ftz.f32 %f610, %f604, %f293, %f600; mov.b32 {%rs245, %rs252}, %r289; // begin inline asm { cvt.f32.f16 %f294, %rs245;} // end inline asm fma.rn.ftz.f32 %f611, %f604, %f294, %f601; shr.u32 %r297, %r67, 28; cvt.u16.u32 %rs308, %r297; cvt.rn.f32.s16 %f612, %rs308; sub.ftz.f32 %f613, %f612, %f9; mul.ftz.f32 %f614, %f77, %f613; // begin inline asm { cvt.f32.f16 %f295, %rs246;} // end inline asm fma.rn.ftz.f32 %f1326, %f614, %f295, %f605; // begin inline asm { cvt.f32.f16 %f296, %rs247;} // end inline asm fma.rn.ftz.f32 %f1325, %f614, %f296, %f606; // begin inline asm { cvt.f32.f16 %f297, %rs248;} // end inline asm fma.rn.ftz.f32 %f1324, %f614, %f297, %f607; // begin inline asm { cvt.f32.f16 %f298, %rs249;} // end inline asm fma.rn.ftz.f32 %f1323, %f614, %f298, %f608; // begin inline asm { cvt.f32.f16 %f299, %rs250;} // end inline asm fma.rn.ftz.f32 %f1322, %f614, %f299, %f609; // begin inline asm { cvt.f32.f16 %f300, %rs251;} // end inline asm fma.rn.ftz.f32 %f1321, 
%f614, %f300, %f610; // begin inline asm { cvt.f32.f16 %f301, %rs252;} // end inline asm fma.rn.ftz.f32 %f1320, %f614, %f301, %f611; bra.uni $L__BB0_8; $L__BB0_7: cvt.u16.u32 %rs533, %r64; shl.b16 %rs534, %rs533, 4; cvt.s16.s8 %rs535, %rs534; shr.s16 %rs536, %rs535, 7; and.b16 %rs537, %rs536, -16; and.b16 %rs538, %rs533, 15; or.b16 %rs539, %rs537, %rs538; cvt.rn.f32.s16 %f839, %rs539; sub.ftz.f32 %f840, %f839, %f9; mul.ftz.f32 %f841, %f77, %f840; mov.b32 {%rs309, %rs316}, %r78; // begin inline asm { cvt.f32.f16 %f615, %rs309;} // end inline asm fma.rn.ftz.f32 %f842, %f841, %f615, %f1326; mov.b32 {%rs310, %rs317}, %r82; // begin inline asm { cvt.f32.f16 %f616, %rs310;} // end inline asm fma.rn.ftz.f32 %f843, %f841, %f616, %f1325; mov.b32 {%rs311, %rs318}, %r88; // begin inline asm { cvt.f32.f16 %f617, %rs311;} // end inline asm fma.rn.ftz.f32 %f844, %f841, %f617, %f1324; mov.b32 {%rs312, %rs319}, %r92; // begin inline asm { cvt.f32.f16 %f618, %rs312;} // end inline asm fma.rn.ftz.f32 %f845, %f841, %f618, %f1323; mov.b32 {%rs313, %rs320}, %r96; // begin inline asm { cvt.f32.f16 %f619, %rs313;} // end inline asm fma.rn.ftz.f32 %f846, %f841, %f619, %f1322; mov.b32 {%rs314, %rs321}, %r100; // begin inline asm { cvt.f32.f16 %f620, %rs314;} // end inline asm fma.rn.ftz.f32 %f847, %f841, %f620, %f1321; mov.b32 {%rs315, %rs322}, %r104; // begin inline asm { cvt.f32.f16 %f621, %rs315;} // end inline asm fma.rn.ftz.f32 %f848, %f841, %f621, %f1320; shr.u32 %r298, %r64, 4; cvt.u16.u32 %rs540, %r298; shl.b16 %rs541, %rs540, 4; cvt.s16.s8 %rs542, %rs541; shr.s16 %rs543, %rs542, 7; and.b16 %rs544, %rs543, -16; and.b16 %rs545, %rs540, 15; or.b16 %rs546, %rs544, %rs545; cvt.rn.f32.s16 %f849, %rs546; sub.ftz.f32 %f850, %f849, %f9; mul.ftz.f32 %f851, %f77, %f850; // begin inline asm { cvt.f32.f16 %f622, %rs316;} // end inline asm fma.rn.ftz.f32 %f852, %f851, %f622, %f842; // begin inline asm { cvt.f32.f16 %f623, %rs317;} // end inline asm fma.rn.ftz.f32 %f853, %f851, %f623, %f843; // begin inline asm { cvt.f32.f16 %f624, %rs318;} // end inline asm fma.rn.ftz.f32 %f854, %f851, %f624, %f844; // begin inline asm { cvt.f32.f16 %f625, %rs319;} // end inline asm fma.rn.ftz.f32 %f855, %f851, %f625, %f845; // begin inline asm { cvt.f32.f16 %f626, %rs320;} // end inline asm fma.rn.ftz.f32 %f856, %f851, %f626, %f846; // begin inline asm { cvt.f32.f16 %f627, %rs321;} // end inline asm fma.rn.ftz.f32 %f857, %f851, %f627, %f847; // begin inline asm { cvt.f32.f16 %f628, %rs322;} // end inline asm fma.rn.ftz.f32 %f858, %f851, %f628, %f848; shr.u32 %r299, %r64, 8; cvt.u16.u32 %rs547, %r299; shl.b16 %rs548, %rs547, 4; cvt.s16.s8 %rs549, %rs548; shr.s16 %rs550, %rs549, 7; and.b16 %rs551, %rs550, -16; and.b16 %rs552, %rs547, 15; or.b16 %rs553, %rs551, %rs552; cvt.rn.f32.s16 %f859, %rs553; sub.ftz.f32 %f860, %f859, %f9; mul.ftz.f32 %f861, %f77, %f860; mov.b32 {%rs323, %rs330}, %r79; // begin inline asm { cvt.f32.f16 %f629, %rs323;} // end inline asm fma.rn.ftz.f32 %f862, %f861, %f629, %f852; mov.b32 {%rs324, %rs331}, %r83; // begin inline asm { cvt.f32.f16 %f630, %rs324;} // end inline asm fma.rn.ftz.f32 %f863, %f861, %f630, %f853; mov.b32 {%rs325, %rs332}, %r89; // begin inline asm { cvt.f32.f16 %f631, %rs325;} // end inline asm fma.rn.ftz.f32 %f864, %f861, %f631, %f854; mov.b32 {%rs326, %rs333}, %r93; // begin inline asm { cvt.f32.f16 %f632, %rs326;} // end inline asm fma.rn.ftz.f32 %f865, %f861, %f632, %f855; mov.b32 {%rs327, %rs334}, %r97; // begin inline asm { cvt.f32.f16 %f633, %rs327;} // end inline asm fma.rn.ftz.f32 
%f866, %f861, %f633, %f856; mov.b32 {%rs328, %rs335}, %r101; // begin inline asm { cvt.f32.f16 %f634, %rs328;} // end inline asm fma.rn.ftz.f32 %f867, %f861, %f634, %f857; mov.b32 {%rs329, %rs336}, %r105; // begin inline asm { cvt.f32.f16 %f635, %rs329;} // end inline asm fma.rn.ftz.f32 %f868, %f861, %f635, %f858; shr.u32 %r300, %r64, 12; cvt.u16.u32 %rs554, %r300; shl.b16 %rs555, %rs554, 4; cvt.s16.s8 %rs556, %rs555; shr.s16 %rs557, %rs556, 7; and.b16 %rs558, %rs557, -16; and.b16 %rs559, %rs554, 15; or.b16 %rs560, %rs558, %rs559; cvt.rn.f32.s16 %f869, %rs560; sub.ftz.f32 %f870, %f869, %f9; mul.ftz.f32 %f871, %f77, %f870; // begin inline asm { cvt.f32.f16 %f636, %rs330;} // end inline asm fma.rn.ftz.f32 %f872, %f871, %f636, %f862; // begin inline asm { cvt.f32.f16 %f637, %rs331;} // end inline asm fma.rn.ftz.f32 %f873, %f871, %f637, %f863; // begin inline asm { cvt.f32.f16 %f638, %rs332;} // end inline asm fma.rn.ftz.f32 %f874, %f871, %f638, %f864; // begin inline asm { cvt.f32.f16 %f639, %rs333;} // end inline asm fma.rn.ftz.f32 %f875, %f871, %f639, %f865; // begin inline asm { cvt.f32.f16 %f640, %rs334;} // end inline asm fma.rn.ftz.f32 %f876, %f871, %f640, %f866; // begin inline asm { cvt.f32.f16 %f641, %rs335;} // end inline asm fma.rn.ftz.f32 %f877, %f871, %f641, %f867; // begin inline asm { cvt.f32.f16 %f642, %rs336;} // end inline asm fma.rn.ftz.f32 %f878, %f871, %f642, %f868; shr.u32 %r301, %r64, 16; cvt.u16.u32 %rs561, %r301; shl.b16 %rs562, %rs561, 4; cvt.s16.s8 %rs563, %rs562; shr.s16 %rs564, %rs563, 7; and.b16 %rs565, %rs564, -16; and.b16 %rs566, %rs561, 15; or.b16 %rs567, %rs565, %rs566; cvt.rn.f32.s16 %f879, %rs567; sub.ftz.f32 %f880, %f879, %f9; mul.ftz.f32 %f881, %f77, %f880; mov.b32 {%rs337, %rs344}, %r80; // begin inline asm { cvt.f32.f16 %f643, %rs337;} // end inline asm fma.rn.ftz.f32 %f882, %f881, %f643, %f872; mov.b32 {%rs338, %rs345}, %r84; // begin inline asm { cvt.f32.f16 %f644, %rs338;} // end inline asm fma.rn.ftz.f32 %f883, %f881, %f644, %f873; mov.b32 {%rs339, %rs346}, %r90; // begin inline asm { cvt.f32.f16 %f645, %rs339;} // end inline asm fma.rn.ftz.f32 %f884, %f881, %f645, %f874; mov.b32 {%rs340, %rs347}, %r94; // begin inline asm { cvt.f32.f16 %f646, %rs340;} // end inline asm fma.rn.ftz.f32 %f885, %f881, %f646, %f875; mov.b32 {%rs341, %rs348}, %r98; // begin inline asm { cvt.f32.f16 %f647, %rs341;} // end inline asm fma.rn.ftz.f32 %f886, %f881, %f647, %f876; mov.b32 {%rs342, %rs349}, %r102; // begin inline asm { cvt.f32.f16 %f648, %rs342;} // end inline asm fma.rn.ftz.f32 %f887, %f881, %f648, %f877; mov.b32 {%rs343, %rs350}, %r106; // begin inline asm { cvt.f32.f16 %f649, %rs343;} // end inline asm fma.rn.ftz.f32 %f888, %f881, %f649, %f878; shr.u32 %r302, %r64, 20; cvt.u16.u32 %rs568, %r302; shl.b16 %rs569, %rs568, 4; cvt.s16.s8 %rs570, %rs569; shr.s16 %rs571, %rs570, 7; and.b16 %rs572, %rs571, -16; and.b16 %rs573, %rs568, 15; or.b16 %rs574, %rs572, %rs573; cvt.rn.f32.s16 %f889, %rs574; sub.ftz.f32 %f890, %f889, %f9; mul.ftz.f32 %f891, %f77, %f890; // begin inline asm { cvt.f32.f16 %f650, %rs344;} // end inline asm fma.rn.ftz.f32 %f892, %f891, %f650, %f882; // begin inline asm { cvt.f32.f16 %f651, %rs345;} // end inline asm fma.rn.ftz.f32 %f893, %f891, %f651, %f883; // begin inline asm { cvt.f32.f16 %f652, %rs346;} // end inline asm fma.rn.ftz.f32 %f894, %f891, %f652, %f884; // begin inline asm { cvt.f32.f16 %f653, %rs347;} // end inline asm fma.rn.ftz.f32 %f895, %f891, %f653, %f885; // begin inline asm { cvt.f32.f16 %f654, %rs348;} // end inline asm 
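// signed path ($L__BB0_7, %rs13 == 0): each nibble is sign-extended to [-8, 7]
// (shl 4 / cvt.s16.s8 / shr.s16 7 / or) before the same (q - %f9) * %f77 fma sequence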
fma.rn.ftz.f32 %f896, %f891, %f654, %f886; // begin inline asm { cvt.f32.f16 %f655, %rs349;} // end inline asm fma.rn.ftz.f32 %f897, %f891, %f655, %f887; // begin inline asm { cvt.f32.f16 %f656, %rs350;} // end inline asm fma.rn.ftz.f32 %f898, %f891, %f656, %f888; shr.u32 %r303, %r64, 24; cvt.u16.u32 %rs575, %r303; shl.b16 %rs576, %rs575, 4; cvt.s16.s8 %rs577, %rs576; shr.s16 %rs578, %rs577, 7; and.b16 %rs579, %rs578, -16; and.b16 %rs580, %rs575, 15; or.b16 %rs581, %rs579, %rs580; cvt.rn.f32.s16 %f899, %rs581; sub.ftz.f32 %f900, %f899, %f9; mul.ftz.f32 %f901, %f77, %f900; mov.b32 {%rs351, %rs358}, %r81; // begin inline asm { cvt.f32.f16 %f657, %rs351;} // end inline asm fma.rn.ftz.f32 %f902, %f901, %f657, %f892; mov.b32 {%rs352, %rs359}, %r85; // begin inline asm { cvt.f32.f16 %f658, %rs352;} // end inline asm fma.rn.ftz.f32 %f903, %f901, %f658, %f893; mov.b32 {%rs353, %rs360}, %r91; // begin inline asm { cvt.f32.f16 %f659, %rs353;} // end inline asm fma.rn.ftz.f32 %f904, %f901, %f659, %f894; mov.b32 {%rs354, %rs361}, %r95; // begin inline asm { cvt.f32.f16 %f660, %rs354;} // end inline asm fma.rn.ftz.f32 %f905, %f901, %f660, %f895; mov.b32 {%rs355, %rs362}, %r99; // begin inline asm { cvt.f32.f16 %f661, %rs355;} // end inline asm fma.rn.ftz.f32 %f906, %f901, %f661, %f896; mov.b32 {%rs356, %rs363}, %r103; // begin inline asm { cvt.f32.f16 %f662, %rs356;} // end inline asm fma.rn.ftz.f32 %f907, %f901, %f662, %f897; mov.b32 {%rs357, %rs364}, %r107; // begin inline asm { cvt.f32.f16 %f663, %rs357;} // end inline asm fma.rn.ftz.f32 %f908, %f901, %f663, %f898; shr.u32 %r304, %r64, 28; cvt.u16.u32 %rs582, %r304; shl.b16 %rs583, %rs582, 4; cvt.s16.s8 %rs584, %rs583; shr.s16 %rs585, %rs584, 7; and.b16 %rs586, %rs585, -16; or.b16 %rs587, %rs586, %rs582; cvt.rn.f32.s16 %f909, %rs587; sub.ftz.f32 %f910, %f909, %f9; mul.ftz.f32 %f911, %f77, %f910; // begin inline asm { cvt.f32.f16 %f664, %rs358;} // end inline asm fma.rn.ftz.f32 %f912, %f911, %f664, %f902; // begin inline asm { cvt.f32.f16 %f665, %rs359;} // end inline asm fma.rn.ftz.f32 %f913, %f911, %f665, %f903; // begin inline asm { cvt.f32.f16 %f666, %rs360;} // end inline asm fma.rn.ftz.f32 %f914, %f911, %f666, %f904; // begin inline asm { cvt.f32.f16 %f667, %rs361;} // end inline asm fma.rn.ftz.f32 %f915, %f911, %f667, %f905; // begin inline asm { cvt.f32.f16 %f668, %rs362;} // end inline asm fma.rn.ftz.f32 %f916, %f911, %f668, %f906; // begin inline asm { cvt.f32.f16 %f669, %rs363;} // end inline asm fma.rn.ftz.f32 %f917, %f911, %f669, %f907; // begin inline asm { cvt.f32.f16 %f670, %rs364;} // end inline asm fma.rn.ftz.f32 %f918, %f911, %f670, %f908; ld.global.v4.u32 {%r305, %r306, %r307, %r308}, [%rd5+16]; ld.global.v4.u32 {%r313, %r314, %r315, %r316}, [%rd7]; ld.global.v4.u32 {%r321, %r322, %r323, %r324}, [%rd6+16]; ld.global.v4.u32 {%r329, %r330, %r331, %r332}, [%rd8]; ld.global.v4.u32 {%r337, %r338, %r339, %r340}, [%rd9]; ld.global.v4.u32 {%r345, %r346, %r347, %r348}, [%rd10]; ld.global.v4.u32 {%r353, %r354, %r355, %r356}, [%rd11]; cvt.u16.u32 %rs588, %r65; shl.b16 %rs589, %rs588, 4; cvt.s16.s8 %rs590, %rs589; shr.s16 %rs591, %rs590, 7; and.b16 %rs592, %rs591, -16; and.b16 %rs593, %rs588, 15; or.b16 %rs594, %rs592, %rs593; cvt.rn.f32.s16 %f919, %rs594; sub.ftz.f32 %f920, %f919, %f9; mul.ftz.f32 %f921, %f77, %f920; mov.b32 {%rs365, %rs372}, %r305; // begin inline asm { cvt.f32.f16 %f671, %rs365;} // end inline asm fma.rn.ftz.f32 %f922, %f921, %f671, %f912; mov.b32 {%rs366, %rs373}, %r313; // begin inline asm { cvt.f32.f16 %f672, %rs366;} 
// end inline asm fma.rn.ftz.f32 %f923, %f921, %f672, %f913; mov.b32 {%rs367, %rs374}, %r321; // begin inline asm { cvt.f32.f16 %f673, %rs367;} // end inline asm fma.rn.ftz.f32 %f924, %f921, %f673, %f914; mov.b32 {%rs368, %rs375}, %r329; // begin inline asm { cvt.f32.f16 %f674, %rs368;} // end inline asm fma.rn.ftz.f32 %f925, %f921, %f674, %f915; mov.b32 {%rs369, %rs376}, %r337; // begin inline asm { cvt.f32.f16 %f675, %rs369;} // end inline asm fma.rn.ftz.f32 %f926, %f921, %f675, %f916; mov.b32 {%rs370, %rs377}, %r345; // begin inline asm { cvt.f32.f16 %f676, %rs370;} // end inline asm fma.rn.ftz.f32 %f927, %f921, %f676, %f917; mov.b32 {%rs371, %rs378}, %r353; // begin inline asm { cvt.f32.f16 %f677, %rs371;} // end inline asm fma.rn.ftz.f32 %f928, %f921, %f677, %f918; shr.u32 %r361, %r65, 4; cvt.u16.u32 %rs595, %r361; shl.b16 %rs596, %rs595, 4; cvt.s16.s8 %rs597, %rs596; shr.s16 %rs598, %rs597, 7; and.b16 %rs599, %rs598, -16; and.b16 %rs600, %rs595, 15; or.b16 %rs601, %rs599, %rs600; cvt.rn.f32.s16 %f929, %rs601; sub.ftz.f32 %f930, %f929, %f9; mul.ftz.f32 %f931, %f77, %f930; // begin inline asm { cvt.f32.f16 %f678, %rs372;} // end inline asm fma.rn.ftz.f32 %f932, %f931, %f678, %f922; // begin inline asm { cvt.f32.f16 %f679, %rs373;} // end inline asm fma.rn.ftz.f32 %f933, %f931, %f679, %f923; // begin inline asm { cvt.f32.f16 %f680, %rs374;} // end inline asm fma.rn.ftz.f32 %f934, %f931, %f680, %f924; // begin inline asm { cvt.f32.f16 %f681, %rs375;} // end inline asm fma.rn.ftz.f32 %f935, %f931, %f681, %f925; // begin inline asm { cvt.f32.f16 %f682, %rs376;} // end inline asm fma.rn.ftz.f32 %f936, %f931, %f682, %f926; // begin inline asm { cvt.f32.f16 %f683, %rs377;} // end inline asm fma.rn.ftz.f32 %f937, %f931, %f683, %f927; // begin inline asm { cvt.f32.f16 %f684, %rs378;} // end inline asm fma.rn.ftz.f32 %f938, %f931, %f684, %f928; shr.u32 %r362, %r65, 8; cvt.u16.u32 %rs602, %r362; shl.b16 %rs603, %rs602, 4; cvt.s16.s8 %rs604, %rs603; shr.s16 %rs605, %rs604, 7; and.b16 %rs606, %rs605, -16; and.b16 %rs607, %rs602, 15; or.b16 %rs608, %rs606, %rs607; cvt.rn.f32.s16 %f939, %rs608; sub.ftz.f32 %f940, %f939, %f9; mul.ftz.f32 %f941, %f77, %f940; mov.b32 {%rs379, %rs386}, %r306; // begin inline asm { cvt.f32.f16 %f685, %rs379;} // end inline asm fma.rn.ftz.f32 %f942, %f941, %f685, %f932; mov.b32 {%rs380, %rs387}, %r314; // begin inline asm { cvt.f32.f16 %f686, %rs380;} // end inline asm fma.rn.ftz.f32 %f943, %f941, %f686, %f933; mov.b32 {%rs381, %rs388}, %r322; // begin inline asm { cvt.f32.f16 %f687, %rs381;} // end inline asm fma.rn.ftz.f32 %f944, %f941, %f687, %f934; mov.b32 {%rs382, %rs389}, %r330; // begin inline asm { cvt.f32.f16 %f688, %rs382;} // end inline asm fma.rn.ftz.f32 %f945, %f941, %f688, %f935; mov.b32 {%rs383, %rs390}, %r338; // begin inline asm { cvt.f32.f16 %f689, %rs383;} // end inline asm fma.rn.ftz.f32 %f946, %f941, %f689, %f936; mov.b32 {%rs384, %rs391}, %r346; // begin inline asm { cvt.f32.f16 %f690, %rs384;} // end inline asm fma.rn.ftz.f32 %f947, %f941, %f690, %f937; mov.b32 {%rs385, %rs392}, %r354; // begin inline asm { cvt.f32.f16 %f691, %rs385;} // end inline asm fma.rn.ftz.f32 %f948, %f941, %f691, %f938; shr.u32 %r363, %r65, 12; cvt.u16.u32 %rs609, %r363; shl.b16 %rs610, %rs609, 4; cvt.s16.s8 %rs611, %rs610; shr.s16 %rs612, %rs611, 7; and.b16 %rs613, %rs612, -16; and.b16 %rs614, %rs609, 15; or.b16 %rs615, %rs613, %rs614; cvt.rn.f32.s16 %f949, %rs615; sub.ftz.f32 %f950, %f949, %f9; mul.ftz.f32 %f951, %f77, %f950; // begin inline asm { cvt.f32.f16 %f692, 
%rs386;} // end inline asm fma.rn.ftz.f32 %f952, %f951, %f692, %f942; // begin inline asm { cvt.f32.f16 %f693, %rs387;} // end inline asm fma.rn.ftz.f32 %f953, %f951, %f693, %f943; // begin inline asm { cvt.f32.f16 %f694, %rs388;} // end inline asm fma.rn.ftz.f32 %f954, %f951, %f694, %f944; // begin inline asm { cvt.f32.f16 %f695, %rs389;} // end inline asm fma.rn.ftz.f32 %f955, %f951, %f695, %f945; // begin inline asm { cvt.f32.f16 %f696, %rs390;} // end inline asm fma.rn.ftz.f32 %f956, %f951, %f696, %f946; // begin inline asm { cvt.f32.f16 %f697, %rs391;} // end inline asm fma.rn.ftz.f32 %f957, %f951, %f697, %f947; // begin inline asm { cvt.f32.f16 %f698, %rs392;} // end inline asm fma.rn.ftz.f32 %f958, %f951, %f698, %f948; shr.u32 %r364, %r65, 16; cvt.u16.u32 %rs616, %r364; shl.b16 %rs617, %rs616, 4; cvt.s16.s8 %rs618, %rs617; shr.s16 %rs619, %rs618, 7; and.b16 %rs620, %rs619, -16; and.b16 %rs621, %rs616, 15; or.b16 %rs622, %rs620, %rs621; cvt.rn.f32.s16 %f959, %rs622; sub.ftz.f32 %f960, %f959, %f9; mul.ftz.f32 %f961, %f77, %f960; mov.b32 {%rs393, %rs400}, %r307; // begin inline asm { cvt.f32.f16 %f699, %rs393;} // end inline asm fma.rn.ftz.f32 %f962, %f961, %f699, %f952; mov.b32 {%rs394, %rs401}, %r315; // begin inline asm { cvt.f32.f16 %f700, %rs394;} // end inline asm fma.rn.ftz.f32 %f963, %f961, %f700, %f953; mov.b32 {%rs395, %rs402}, %r323; // begin inline asm { cvt.f32.f16 %f701, %rs395;} // end inline asm fma.rn.ftz.f32 %f964, %f961, %f701, %f954; mov.b32 {%rs396, %rs403}, %r331; // begin inline asm { cvt.f32.f16 %f702, %rs396;} // end inline asm fma.rn.ftz.f32 %f965, %f961, %f702, %f955; mov.b32 {%rs397, %rs404}, %r339; // begin inline asm { cvt.f32.f16 %f703, %rs397;} // end inline asm fma.rn.ftz.f32 %f966, %f961, %f703, %f956; mov.b32 {%rs398, %rs405}, %r347; // begin inline asm { cvt.f32.f16 %f704, %rs398;} // end inline asm fma.rn.ftz.f32 %f967, %f961, %f704, %f957; mov.b32 {%rs399, %rs406}, %r355; // begin inline asm { cvt.f32.f16 %f705, %rs399;} // end inline asm fma.rn.ftz.f32 %f968, %f961, %f705, %f958; shr.u32 %r365, %r65, 20; cvt.u16.u32 %rs623, %r365; shl.b16 %rs624, %rs623, 4; cvt.s16.s8 %rs625, %rs624; shr.s16 %rs626, %rs625, 7; and.b16 %rs627, %rs626, -16; and.b16 %rs628, %rs623, 15; or.b16 %rs629, %rs627, %rs628; cvt.rn.f32.s16 %f969, %rs629; sub.ftz.f32 %f970, %f969, %f9; mul.ftz.f32 %f971, %f77, %f970; // begin inline asm { cvt.f32.f16 %f706, %rs400;} // end inline asm fma.rn.ftz.f32 %f972, %f971, %f706, %f962; // begin inline asm { cvt.f32.f16 %f707, %rs401;} // end inline asm fma.rn.ftz.f32 %f973, %f971, %f707, %f963; // begin inline asm { cvt.f32.f16 %f708, %rs402;} // end inline asm fma.rn.ftz.f32 %f974, %f971, %f708, %f964; // begin inline asm { cvt.f32.f16 %f709, %rs403;} // end inline asm fma.rn.ftz.f32 %f975, %f971, %f709, %f965; // begin inline asm { cvt.f32.f16 %f710, %rs404;} // end inline asm fma.rn.ftz.f32 %f976, %f971, %f710, %f966; // begin inline asm { cvt.f32.f16 %f711, %rs405;} // end inline asm fma.rn.ftz.f32 %f977, %f971, %f711, %f967; // begin inline asm { cvt.f32.f16 %f712, %rs406;} // end inline asm fma.rn.ftz.f32 %f978, %f971, %f712, %f968; shr.u32 %r366, %r65, 24; cvt.u16.u32 %rs630, %r366; shl.b16 %rs631, %rs630, 4; cvt.s16.s8 %rs632, %rs631; shr.s16 %rs633, %rs632, 7; and.b16 %rs634, %rs633, -16; and.b16 %rs635, %rs630, 15; or.b16 %rs636, %rs634, %rs635; cvt.rn.f32.s16 %f979, %rs636; sub.ftz.f32 %f980, %f979, %f9; mul.ftz.f32 %f981, %f77, %f980; mov.b32 {%rs407, %rs414}, %r308; // begin inline asm { cvt.f32.f16 %f713, %rs407;} // end 
inline asm fma.rn.ftz.f32 %f982, %f981, %f713, %f972; mov.b32 {%rs408, %rs415}, %r316; // begin inline asm { cvt.f32.f16 %f714, %rs408;} // end inline asm fma.rn.ftz.f32 %f983, %f981, %f714, %f973; mov.b32 {%rs409, %rs416}, %r324; // begin inline asm { cvt.f32.f16 %f715, %rs409;} // end inline asm fma.rn.ftz.f32 %f984, %f981, %f715, %f974; mov.b32 {%rs410, %rs417}, %r332; // begin inline asm { cvt.f32.f16 %f716, %rs410;} // end inline asm fma.rn.ftz.f32 %f985, %f981, %f716, %f975; mov.b32 {%rs411, %rs418}, %r340; // begin inline asm { cvt.f32.f16 %f717, %rs411;} // end inline asm fma.rn.ftz.f32 %f986, %f981, %f717, %f976; mov.b32 {%rs412, %rs419}, %r348; // begin inline asm { cvt.f32.f16 %f718, %rs412;} // end inline asm fma.rn.ftz.f32 %f987, %f981, %f718, %f977; mov.b32 {%rs413, %rs420}, %r356; // begin inline asm { cvt.f32.f16 %f719, %rs413;} // end inline asm fma.rn.ftz.f32 %f988, %f981, %f719, %f978; shr.u32 %r367, %r65, 28; cvt.u16.u32 %rs637, %r367; shl.b16 %rs638, %rs637, 4; cvt.s16.s8 %rs639, %rs638; shr.s16 %rs640, %rs639, 7; and.b16 %rs641, %rs640, -16; or.b16 %rs642, %rs641, %rs637; cvt.rn.f32.s16 %f989, %rs642; sub.ftz.f32 %f990, %f989, %f9; mul.ftz.f32 %f991, %f77, %f990; // begin inline asm { cvt.f32.f16 %f720, %rs414;} // end inline asm fma.rn.ftz.f32 %f992, %f991, %f720, %f982; // begin inline asm { cvt.f32.f16 %f721, %rs415;} // end inline asm fma.rn.ftz.f32 %f993, %f991, %f721, %f983; // begin inline asm { cvt.f32.f16 %f722, %rs416;} // end inline asm fma.rn.ftz.f32 %f994, %f991, %f722, %f984; // begin inline asm { cvt.f32.f16 %f723, %rs417;} // end inline asm fma.rn.ftz.f32 %f995, %f991, %f723, %f985; // begin inline asm { cvt.f32.f16 %f724, %rs418;} // end inline asm fma.rn.ftz.f32 %f996, %f991, %f724, %f986; // begin inline asm { cvt.f32.f16 %f725, %rs419;} // end inline asm fma.rn.ftz.f32 %f997, %f991, %f725, %f987; // begin inline asm { cvt.f32.f16 %f726, %rs420;} // end inline asm fma.rn.ftz.f32 %f998, %f991, %f726, %f988; ld.global.v4.u32 {%r368, %r369, %r370, %r371}, [%rd5+32]; ld.global.v4.u32 {%r376, %r377, %r378, %r379}, [%rd7+16]; ld.global.v4.u32 {%r384, %r385, %r386, %r387}, [%rd6+32]; ld.global.v4.u32 {%r392, %r393, %r394, %r395}, [%rd8+16]; ld.global.v4.u32 {%r400, %r401, %r402, %r403}, [%rd9+16]; ld.global.v4.u32 {%r408, %r409, %r410, %r411}, [%rd10+16]; ld.global.v4.u32 {%r416, %r417, %r418, %r419}, [%rd11+16]; cvt.u16.u32 %rs643, %r66; shl.b16 %rs644, %rs643, 4; cvt.s16.s8 %rs645, %rs644; shr.s16 %rs646, %rs645, 7; and.b16 %rs647, %rs646, -16; and.b16 %rs648, %rs643, 15; or.b16 %rs649, %rs647, %rs648; cvt.rn.f32.s16 %f999, %rs649; sub.ftz.f32 %f1000, %f999, %f9; mul.ftz.f32 %f1001, %f77, %f1000; mov.b32 {%rs421, %rs428}, %r368; // begin inline asm { cvt.f32.f16 %f727, %rs421;} // end inline asm fma.rn.ftz.f32 %f1002, %f1001, %f727, %f992; mov.b32 {%rs422, %rs429}, %r376; // begin inline asm { cvt.f32.f16 %f728, %rs422;} // end inline asm fma.rn.ftz.f32 %f1003, %f1001, %f728, %f993; mov.b32 {%rs423, %rs430}, %r384; // begin inline asm { cvt.f32.f16 %f729, %rs423;} // end inline asm fma.rn.ftz.f32 %f1004, %f1001, %f729, %f994; mov.b32 {%rs424, %rs431}, %r392; // begin inline asm { cvt.f32.f16 %f730, %rs424;} // end inline asm fma.rn.ftz.f32 %f1005, %f1001, %f730, %f995; mov.b32 {%rs425, %rs432}, %r400; // begin inline asm { cvt.f32.f16 %f731, %rs425;} // end inline asm fma.rn.ftz.f32 %f1006, %f1001, %f731, %f996; mov.b32 {%rs426, %rs433}, %r408; // begin inline asm { cvt.f32.f16 %f732, %rs426;} // end inline asm fma.rn.ftz.f32 %f1007, %f1001, %f732, 
%f997; mov.b32 {%rs427, %rs434}, %r416; // begin inline asm { cvt.f32.f16 %f733, %rs427;} // end inline asm fma.rn.ftz.f32 %f1008, %f1001, %f733, %f998; shr.u32 %r424, %r66, 4; cvt.u16.u32 %rs650, %r424; shl.b16 %rs651, %rs650, 4; cvt.s16.s8 %rs652, %rs651; shr.s16 %rs653, %rs652, 7; and.b16 %rs654, %rs653, -16; and.b16 %rs655, %rs650, 15; or.b16 %rs656, %rs654, %rs655; cvt.rn.f32.s16 %f1009, %rs656; sub.ftz.f32 %f1010, %f1009, %f9; mul.ftz.f32 %f1011, %f77, %f1010; // begin inline asm { cvt.f32.f16 %f734, %rs428;} // end inline asm fma.rn.ftz.f32 %f1012, %f1011, %f734, %f1002; // begin inline asm { cvt.f32.f16 %f735, %rs429;} // end inline asm fma.rn.ftz.f32 %f1013, %f1011, %f735, %f1003; // begin inline asm { cvt.f32.f16 %f736, %rs430;} // end inline asm fma.rn.ftz.f32 %f1014, %f1011, %f736, %f1004; // begin inline asm { cvt.f32.f16 %f737, %rs431;} // end inline asm fma.rn.ftz.f32 %f1015, %f1011, %f737, %f1005; // begin inline asm { cvt.f32.f16 %f738, %rs432;} // end inline asm fma.rn.ftz.f32 %f1016, %f1011, %f738, %f1006; // begin inline asm { cvt.f32.f16 %f739, %rs433;} // end inline asm fma.rn.ftz.f32 %f1017, %f1011, %f739, %f1007; // begin inline asm { cvt.f32.f16 %f740, %rs434;} // end inline asm fma.rn.ftz.f32 %f1018, %f1011, %f740, %f1008; shr.u32 %r425, %r66, 8; cvt.u16.u32 %rs657, %r425; shl.b16 %rs658, %rs657, 4; cvt.s16.s8 %rs659, %rs658; shr.s16 %rs660, %rs659, 7; and.b16 %rs661, %rs660, -16; and.b16 %rs662, %rs657, 15; or.b16 %rs663, %rs661, %rs662; cvt.rn.f32.s16 %f1019, %rs663; sub.ftz.f32 %f1020, %f1019, %f9; mul.ftz.f32 %f1021, %f77, %f1020; mov.b32 {%rs435, %rs442}, %r369; // begin inline asm { cvt.f32.f16 %f741, %rs435;} // end inline asm fma.rn.ftz.f32 %f1022, %f1021, %f741, %f1012; mov.b32 {%rs436, %rs443}, %r377; // begin inline asm { cvt.f32.f16 %f742, %rs436;} // end inline asm fma.rn.ftz.f32 %f1023, %f1021, %f742, %f1013; mov.b32 {%rs437, %rs444}, %r385; // begin inline asm { cvt.f32.f16 %f743, %rs437;} // end inline asm fma.rn.ftz.f32 %f1024, %f1021, %f743, %f1014; mov.b32 {%rs438, %rs445}, %r393; // begin inline asm { cvt.f32.f16 %f744, %rs438;} // end inline asm fma.rn.ftz.f32 %f1025, %f1021, %f744, %f1015; mov.b32 {%rs439, %rs446}, %r401; // begin inline asm { cvt.f32.f16 %f745, %rs439;} // end inline asm fma.rn.ftz.f32 %f1026, %f1021, %f745, %f1016; mov.b32 {%rs440, %rs447}, %r409; // begin inline asm { cvt.f32.f16 %f746, %rs440;} // end inline asm fma.rn.ftz.f32 %f1027, %f1021, %f746, %f1017; mov.b32 {%rs441, %rs448}, %r417; // begin inline asm { cvt.f32.f16 %f747, %rs441;} // end inline asm fma.rn.ftz.f32 %f1028, %f1021, %f747, %f1018; shr.u32 %r426, %r66, 12; cvt.u16.u32 %rs664, %r426; shl.b16 %rs665, %rs664, 4; cvt.s16.s8 %rs666, %rs665; shr.s16 %rs667, %rs666, 7; and.b16 %rs668, %rs667, -16; and.b16 %rs669, %rs664, 15; or.b16 %rs670, %rs668, %rs669; cvt.rn.f32.s16 %f1029, %rs670; sub.ftz.f32 %f1030, %f1029, %f9; mul.ftz.f32 %f1031, %f77, %f1030; // begin inline asm { cvt.f32.f16 %f748, %rs442;} // end inline asm fma.rn.ftz.f32 %f1032, %f1031, %f748, %f1022; // begin inline asm { cvt.f32.f16 %f749, %rs443;} // end inline asm fma.rn.ftz.f32 %f1033, %f1031, %f749, %f1023; // begin inline asm { cvt.f32.f16 %f750, %rs444;} // end inline asm fma.rn.ftz.f32 %f1034, %f1031, %f750, %f1024; // begin inline asm { cvt.f32.f16 %f751, %rs445;} // end inline asm fma.rn.ftz.f32 %f1035, %f1031, %f751, %f1025; // begin inline asm { cvt.f32.f16 %f752, %rs446;} // end inline asm fma.rn.ftz.f32 %f1036, %f1031, %f752, %f1026; // begin inline asm { cvt.f32.f16 %f753, %rs447;} 
// end inline asm fma.rn.ftz.f32 %f1037, %f1031, %f753, %f1027; // begin inline asm { cvt.f32.f16 %f754, %rs448;} // end inline asm fma.rn.ftz.f32 %f1038, %f1031, %f754, %f1028; shr.u32 %r427, %r66, 16; cvt.u16.u32 %rs671, %r427; shl.b16 %rs672, %rs671, 4; cvt.s16.s8 %rs673, %rs672; shr.s16 %rs674, %rs673, 7; and.b16 %rs675, %rs674, -16; and.b16 %rs676, %rs671, 15; or.b16 %rs677, %rs675, %rs676; cvt.rn.f32.s16 %f1039, %rs677; sub.ftz.f32 %f1040, %f1039, %f9; mul.ftz.f32 %f1041, %f77, %f1040; mov.b32 {%rs449, %rs456}, %r370; // begin inline asm { cvt.f32.f16 %f755, %rs449;} // end inline asm fma.rn.ftz.f32 %f1042, %f1041, %f755, %f1032; mov.b32 {%rs450, %rs457}, %r378; // begin inline asm { cvt.f32.f16 %f756, %rs450;} // end inline asm fma.rn.ftz.f32 %f1043, %f1041, %f756, %f1033; mov.b32 {%rs451, %rs458}, %r386; // begin inline asm { cvt.f32.f16 %f757, %rs451;} // end inline asm fma.rn.ftz.f32 %f1044, %f1041, %f757, %f1034; mov.b32 {%rs452, %rs459}, %r394; // begin inline asm { cvt.f32.f16 %f758, %rs452;} // end inline asm fma.rn.ftz.f32 %f1045, %f1041, %f758, %f1035; mov.b32 {%rs453, %rs460}, %r402; // begin inline asm { cvt.f32.f16 %f759, %rs453;} // end inline asm fma.rn.ftz.f32 %f1046, %f1041, %f759, %f1036; mov.b32 {%rs454, %rs461}, %r410; // begin inline asm { cvt.f32.f16 %f760, %rs454;} // end inline asm fma.rn.ftz.f32 %f1047, %f1041, %f760, %f1037; mov.b32 {%rs455, %rs462}, %r418; // begin inline asm { cvt.f32.f16 %f761, %rs455;} // end inline asm fma.rn.ftz.f32 %f1048, %f1041, %f761, %f1038; shr.u32 %r428, %r66, 20; cvt.u16.u32 %rs678, %r428; shl.b16 %rs679, %rs678, 4; cvt.s16.s8 %rs680, %rs679; shr.s16 %rs681, %rs680, 7; and.b16 %rs682, %rs681, -16; and.b16 %rs683, %rs678, 15; or.b16 %rs684, %rs682, %rs683; cvt.rn.f32.s16 %f1049, %rs684; sub.ftz.f32 %f1050, %f1049, %f9; mul.ftz.f32 %f1051, %f77, %f1050; // begin inline asm { cvt.f32.f16 %f762, %rs456;} // end inline asm fma.rn.ftz.f32 %f1052, %f1051, %f762, %f1042; // begin inline asm { cvt.f32.f16 %f763, %rs457;} // end inline asm fma.rn.ftz.f32 %f1053, %f1051, %f763, %f1043; // begin inline asm { cvt.f32.f16 %f764, %rs458;} // end inline asm fma.rn.ftz.f32 %f1054, %f1051, %f764, %f1044; // begin inline asm { cvt.f32.f16 %f765, %rs459;} // end inline asm fma.rn.ftz.f32 %f1055, %f1051, %f765, %f1045; // begin inline asm { cvt.f32.f16 %f766, %rs460;} // end inline asm fma.rn.ftz.f32 %f1056, %f1051, %f766, %f1046; // begin inline asm { cvt.f32.f16 %f767, %rs461;} // end inline asm fma.rn.ftz.f32 %f1057, %f1051, %f767, %f1047; // begin inline asm { cvt.f32.f16 %f768, %rs462;} // end inline asm fma.rn.ftz.f32 %f1058, %f1051, %f768, %f1048; shr.u32 %r429, %r66, 24; cvt.u16.u32 %rs685, %r429; shl.b16 %rs686, %rs685, 4; cvt.s16.s8 %rs687, %rs686; shr.s16 %rs688, %rs687, 7; and.b16 %rs689, %rs688, -16; and.b16 %rs690, %rs685, 15; or.b16 %rs691, %rs689, %rs690; cvt.rn.f32.s16 %f1059, %rs691; sub.ftz.f32 %f1060, %f1059, %f9; mul.ftz.f32 %f1061, %f77, %f1060; mov.b32 {%rs463, %rs470}, %r371; // begin inline asm { cvt.f32.f16 %f769, %rs463;} // end inline asm fma.rn.ftz.f32 %f1062, %f1061, %f769, %f1052; mov.b32 {%rs464, %rs471}, %r379; // begin inline asm { cvt.f32.f16 %f770, %rs464;} // end inline asm fma.rn.ftz.f32 %f1063, %f1061, %f770, %f1053; mov.b32 {%rs465, %rs472}, %r387; // begin inline asm { cvt.f32.f16 %f771, %rs465;} // end inline asm fma.rn.ftz.f32 %f1064, %f1061, %f771, %f1054; mov.b32 {%rs466, %rs473}, %r395; // begin inline asm { cvt.f32.f16 %f772, %rs466;} // end inline asm fma.rn.ftz.f32 %f1065, %f1061, %f772, %f1055; 
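// NOTE: the recurring pattern in this region -- shr.u32 to bring a 4-bit field into the
// low bits, then shl.b16 4 / cvt.s16.s8 / shr.s16 7 / and.b16 -16 / or.b16 (q & 15) --
// sign-extends each packed nibble of %r65/%r66/%r67 to a signed 16-bit value (bit 3 of
// the nibble replicated into the upper bits) before it is converted to f32 with
// cvt.rn.f32.s16.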
mov.b32 {%rs467, %rs474}, %r403; // begin inline asm { cvt.f32.f16 %f773, %rs467;} // end inline asm fma.rn.ftz.f32 %f1066, %f1061, %f773, %f1056; mov.b32 {%rs468, %rs475}, %r411; // begin inline asm { cvt.f32.f16 %f774, %rs468;} // end inline asm fma.rn.ftz.f32 %f1067, %f1061, %f774, %f1057; mov.b32 {%rs469, %rs476}, %r419; // begin inline asm { cvt.f32.f16 %f775, %rs469;} // end inline asm fma.rn.ftz.f32 %f1068, %f1061, %f775, %f1058; shr.u32 %r430, %r66, 28; cvt.u16.u32 %rs692, %r430; shl.b16 %rs693, %rs692, 4; cvt.s16.s8 %rs694, %rs693; shr.s16 %rs695, %rs694, 7; and.b16 %rs696, %rs695, -16; or.b16 %rs697, %rs696, %rs692; cvt.rn.f32.s16 %f1069, %rs697; sub.ftz.f32 %f1070, %f1069, %f9; mul.ftz.f32 %f1071, %f77, %f1070; // begin inline asm { cvt.f32.f16 %f776, %rs470;} // end inline asm fma.rn.ftz.f32 %f1072, %f1071, %f776, %f1062; // begin inline asm { cvt.f32.f16 %f777, %rs471;} // end inline asm fma.rn.ftz.f32 %f1073, %f1071, %f777, %f1063; // begin inline asm { cvt.f32.f16 %f778, %rs472;} // end inline asm fma.rn.ftz.f32 %f1074, %f1071, %f778, %f1064; // begin inline asm { cvt.f32.f16 %f779, %rs473;} // end inline asm fma.rn.ftz.f32 %f1075, %f1071, %f779, %f1065; // begin inline asm { cvt.f32.f16 %f780, %rs474;} // end inline asm fma.rn.ftz.f32 %f1076, %f1071, %f780, %f1066; // begin inline asm { cvt.f32.f16 %f781, %rs475;} // end inline asm fma.rn.ftz.f32 %f1077, %f1071, %f781, %f1067; // begin inline asm { cvt.f32.f16 %f782, %rs476;} // end inline asm fma.rn.ftz.f32 %f1078, %f1071, %f782, %f1068; ld.global.v4.u32 {%r431, %r432, %r433, %r434}, [%rd5+48]; ld.global.v4.u32 {%r439, %r440, %r441, %r442}, [%rd7+32]; ld.global.v4.u32 {%r447, %r448, %r449, %r450}, [%rd6+48]; ld.global.v4.u32 {%r455, %r456, %r457, %r458}, [%rd8+32]; ld.global.v4.u32 {%r463, %r464, %r465, %r466}, [%rd9+32]; ld.global.v4.u32 {%r471, %r472, %r473, %r474}, [%rd10+32]; ld.global.v4.u32 {%r479, %r480, %r481, %r482}, [%rd11+32]; cvt.u16.u32 %rs698, %r67; shl.b16 %rs699, %rs698, 4; cvt.s16.s8 %rs700, %rs699; shr.s16 %rs701, %rs700, 7; and.b16 %rs702, %rs701, -16; and.b16 %rs703, %rs698, 15; or.b16 %rs704, %rs702, %rs703; cvt.rn.f32.s16 %f1079, %rs704; sub.ftz.f32 %f1080, %f1079, %f9; mul.ftz.f32 %f1081, %f77, %f1080; mov.b32 {%rs477, %rs484}, %r431; // begin inline asm { cvt.f32.f16 %f783, %rs477;} // end inline asm fma.rn.ftz.f32 %f1082, %f1081, %f783, %f1072; mov.b32 {%rs478, %rs485}, %r439; // begin inline asm { cvt.f32.f16 %f784, %rs478;} // end inline asm fma.rn.ftz.f32 %f1083, %f1081, %f784, %f1073; mov.b32 {%rs479, %rs486}, %r447; // begin inline asm { cvt.f32.f16 %f785, %rs479;} // end inline asm fma.rn.ftz.f32 %f1084, %f1081, %f785, %f1074; mov.b32 {%rs480, %rs487}, %r455; // begin inline asm { cvt.f32.f16 %f786, %rs480;} // end inline asm fma.rn.ftz.f32 %f1085, %f1081, %f786, %f1075; mov.b32 {%rs481, %rs488}, %r463; // begin inline asm { cvt.f32.f16 %f787, %rs481;} // end inline asm fma.rn.ftz.f32 %f1086, %f1081, %f787, %f1076; mov.b32 {%rs482, %rs489}, %r471; // begin inline asm { cvt.f32.f16 %f788, %rs482;} // end inline asm fma.rn.ftz.f32 %f1087, %f1081, %f788, %f1077; mov.b32 {%rs483, %rs490}, %r479; // begin inline asm { cvt.f32.f16 %f789, %rs483;} // end inline asm fma.rn.ftz.f32 %f1088, %f1081, %f789, %f1078; shr.u32 %r487, %r67, 4; cvt.u16.u32 %rs705, %r487; shl.b16 %rs706, %rs705, 4; cvt.s16.s8 %rs707, %rs706; shr.s16 %rs708, %rs707, 7; and.b16 %rs709, %rs708, -16; and.b16 %rs710, %rs705, 15; or.b16 %rs711, %rs709, %rs710; cvt.rn.f32.s16 %f1089, %rs711; sub.ftz.f32 %f1090, %f1089, %f9; 
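// NOTE: each extracted nibble q is then dequantized as %f77 * (q - %f9); %f77 and %f9
// appear to be the per-group scale and zero point of this group-128 int4 layout, both
// set up once per iteration of the outer $L__BB0_2 loop. The dequantized value is
// multiplied against fp16 operand values unpacked by the cvt.f32.f16 inline-asm
// fragments and accumulated with fma.rn.ftz.f32.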
mul.ftz.f32 %f1091, %f77, %f1090; // begin inline asm { cvt.f32.f16 %f790, %rs484;} // end inline asm fma.rn.ftz.f32 %f1092, %f1091, %f790, %f1082; // begin inline asm { cvt.f32.f16 %f791, %rs485;} // end inline asm fma.rn.ftz.f32 %f1093, %f1091, %f791, %f1083; // begin inline asm { cvt.f32.f16 %f792, %rs486;} // end inline asm fma.rn.ftz.f32 %f1094, %f1091, %f792, %f1084; // begin inline asm { cvt.f32.f16 %f793, %rs487;} // end inline asm fma.rn.ftz.f32 %f1095, %f1091, %f793, %f1085; // begin inline asm { cvt.f32.f16 %f794, %rs488;} // end inline asm fma.rn.ftz.f32 %f1096, %f1091, %f794, %f1086; // begin inline asm { cvt.f32.f16 %f795, %rs489;} // end inline asm fma.rn.ftz.f32 %f1097, %f1091, %f795, %f1087; // begin inline asm { cvt.f32.f16 %f796, %rs490;} // end inline asm fma.rn.ftz.f32 %f1098, %f1091, %f796, %f1088; shr.u32 %r488, %r67, 8; cvt.u16.u32 %rs712, %r488; shl.b16 %rs713, %rs712, 4; cvt.s16.s8 %rs714, %rs713; shr.s16 %rs715, %rs714, 7; and.b16 %rs716, %rs715, -16; and.b16 %rs717, %rs712, 15; or.b16 %rs718, %rs716, %rs717; cvt.rn.f32.s16 %f1099, %rs718; sub.ftz.f32 %f1100, %f1099, %f9; mul.ftz.f32 %f1101, %f77, %f1100; mov.b32 {%rs491, %rs498}, %r432; // begin inline asm { cvt.f32.f16 %f797, %rs491;} // end inline asm fma.rn.ftz.f32 %f1102, %f1101, %f797, %f1092; mov.b32 {%rs492, %rs499}, %r440; // begin inline asm { cvt.f32.f16 %f798, %rs492;} // end inline asm fma.rn.ftz.f32 %f1103, %f1101, %f798, %f1093; mov.b32 {%rs493, %rs500}, %r448; // begin inline asm { cvt.f32.f16 %f799, %rs493;} // end inline asm fma.rn.ftz.f32 %f1104, %f1101, %f799, %f1094; mov.b32 {%rs494, %rs501}, %r456; // begin inline asm { cvt.f32.f16 %f800, %rs494;} // end inline asm fma.rn.ftz.f32 %f1105, %f1101, %f800, %f1095; mov.b32 {%rs495, %rs502}, %r464; // begin inline asm { cvt.f32.f16 %f801, %rs495;} // end inline asm fma.rn.ftz.f32 %f1106, %f1101, %f801, %f1096; mov.b32 {%rs496, %rs503}, %r472; // begin inline asm { cvt.f32.f16 %f802, %rs496;} // end inline asm fma.rn.ftz.f32 %f1107, %f1101, %f802, %f1097; mov.b32 {%rs497, %rs504}, %r480; // begin inline asm { cvt.f32.f16 %f803, %rs497;} // end inline asm fma.rn.ftz.f32 %f1108, %f1101, %f803, %f1098; shr.u32 %r489, %r67, 12; cvt.u16.u32 %rs719, %r489; shl.b16 %rs720, %rs719, 4; cvt.s16.s8 %rs721, %rs720; shr.s16 %rs722, %rs721, 7; and.b16 %rs723, %rs722, -16; and.b16 %rs724, %rs719, 15; or.b16 %rs725, %rs723, %rs724; cvt.rn.f32.s16 %f1109, %rs725; sub.ftz.f32 %f1110, %f1109, %f9; mul.ftz.f32 %f1111, %f77, %f1110; // begin inline asm { cvt.f32.f16 %f804, %rs498;} // end inline asm fma.rn.ftz.f32 %f1112, %f1111, %f804, %f1102; // begin inline asm { cvt.f32.f16 %f805, %rs499;} // end inline asm fma.rn.ftz.f32 %f1113, %f1111, %f805, %f1103; // begin inline asm { cvt.f32.f16 %f806, %rs500;} // end inline asm fma.rn.ftz.f32 %f1114, %f1111, %f806, %f1104; // begin inline asm { cvt.f32.f16 %f807, %rs501;} // end inline asm fma.rn.ftz.f32 %f1115, %f1111, %f807, %f1105; // begin inline asm { cvt.f32.f16 %f808, %rs502;} // end inline asm fma.rn.ftz.f32 %f1116, %f1111, %f808, %f1106; // begin inline asm { cvt.f32.f16 %f809, %rs503;} // end inline asm fma.rn.ftz.f32 %f1117, %f1111, %f809, %f1107; // begin inline asm { cvt.f32.f16 %f810, %rs504;} // end inline asm fma.rn.ftz.f32 %f1118, %f1111, %f810, %f1108; shr.u32 %r490, %r67, 16; cvt.u16.u32 %rs726, %r490; shl.b16 %rs727, %rs726, 4; cvt.s16.s8 %rs728, %rs727; shr.s16 %rs729, %rs728, 7; and.b16 %rs730, %rs729, -16; and.b16 %rs731, %rs726, 15; or.b16 %rs732, %rs730, %rs731; cvt.rn.f32.s16 %f1119, %rs732; 
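// NOTE: the seven fma.rn.ftz.f32 chains issued per nibble ultimately feed the running
// sums %f1320..%f1326, one for each of the seven outputs this batch-7 kernel produces
// per block; the fp16 operand data is streamed eight halfwords at a time through the
// vectorized ld.global.v4.u32 loads from pointers %rd5..%rd11.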
sub.ftz.f32 %f1120, %f1119, %f9; mul.ftz.f32 %f1121, %f77, %f1120; mov.b32 {%rs505, %rs512}, %r433; // begin inline asm { cvt.f32.f16 %f811, %rs505;} // end inline asm fma.rn.ftz.f32 %f1122, %f1121, %f811, %f1112; mov.b32 {%rs506, %rs513}, %r441; // begin inline asm { cvt.f32.f16 %f812, %rs506;} // end inline asm fma.rn.ftz.f32 %f1123, %f1121, %f812, %f1113; mov.b32 {%rs507, %rs514}, %r449; // begin inline asm { cvt.f32.f16 %f813, %rs507;} // end inline asm fma.rn.ftz.f32 %f1124, %f1121, %f813, %f1114; mov.b32 {%rs508, %rs515}, %r457; // begin inline asm { cvt.f32.f16 %f814, %rs508;} // end inline asm fma.rn.ftz.f32 %f1125, %f1121, %f814, %f1115; mov.b32 {%rs509, %rs516}, %r465; // begin inline asm { cvt.f32.f16 %f815, %rs509;} // end inline asm fma.rn.ftz.f32 %f1126, %f1121, %f815, %f1116; mov.b32 {%rs510, %rs517}, %r473; // begin inline asm { cvt.f32.f16 %f816, %rs510;} // end inline asm fma.rn.ftz.f32 %f1127, %f1121, %f816, %f1117; mov.b32 {%rs511, %rs518}, %r481; // begin inline asm { cvt.f32.f16 %f817, %rs511;} // end inline asm fma.rn.ftz.f32 %f1128, %f1121, %f817, %f1118; shr.u32 %r491, %r67, 20; cvt.u16.u32 %rs733, %r491; shl.b16 %rs734, %rs733, 4; cvt.s16.s8 %rs735, %rs734; shr.s16 %rs736, %rs735, 7; and.b16 %rs737, %rs736, -16; and.b16 %rs738, %rs733, 15; or.b16 %rs739, %rs737, %rs738; cvt.rn.f32.s16 %f1129, %rs739; sub.ftz.f32 %f1130, %f1129, %f9; mul.ftz.f32 %f1131, %f77, %f1130; // begin inline asm { cvt.f32.f16 %f818, %rs512;} // end inline asm fma.rn.ftz.f32 %f1132, %f1131, %f818, %f1122; // begin inline asm { cvt.f32.f16 %f819, %rs513;} // end inline asm fma.rn.ftz.f32 %f1133, %f1131, %f819, %f1123; // begin inline asm { cvt.f32.f16 %f820, %rs514;} // end inline asm fma.rn.ftz.f32 %f1134, %f1131, %f820, %f1124; // begin inline asm { cvt.f32.f16 %f821, %rs515;} // end inline asm fma.rn.ftz.f32 %f1135, %f1131, %f821, %f1125; // begin inline asm { cvt.f32.f16 %f822, %rs516;} // end inline asm fma.rn.ftz.f32 %f1136, %f1131, %f822, %f1126; // begin inline asm { cvt.f32.f16 %f823, %rs517;} // end inline asm fma.rn.ftz.f32 %f1137, %f1131, %f823, %f1127; // begin inline asm { cvt.f32.f16 %f824, %rs518;} // end inline asm fma.rn.ftz.f32 %f1138, %f1131, %f824, %f1128; shr.u32 %r492, %r67, 24; cvt.u16.u32 %rs740, %r492; shl.b16 %rs741, %rs740, 4; cvt.s16.s8 %rs742, %rs741; shr.s16 %rs743, %rs742, 7; and.b16 %rs744, %rs743, -16; and.b16 %rs745, %rs740, 15; or.b16 %rs746, %rs744, %rs745; cvt.rn.f32.s16 %f1139, %rs746; sub.ftz.f32 %f1140, %f1139, %f9; mul.ftz.f32 %f1141, %f77, %f1140; mov.b32 {%rs519, %rs526}, %r434; // begin inline asm { cvt.f32.f16 %f825, %rs519;} // end inline asm fma.rn.ftz.f32 %f1142, %f1141, %f825, %f1132; mov.b32 {%rs520, %rs527}, %r442; // begin inline asm { cvt.f32.f16 %f826, %rs520;} // end inline asm fma.rn.ftz.f32 %f1143, %f1141, %f826, %f1133; mov.b32 {%rs521, %rs528}, %r450; // begin inline asm { cvt.f32.f16 %f827, %rs521;} // end inline asm fma.rn.ftz.f32 %f1144, %f1141, %f827, %f1134; mov.b32 {%rs522, %rs529}, %r458; // begin inline asm { cvt.f32.f16 %f828, %rs522;} // end inline asm fma.rn.ftz.f32 %f1145, %f1141, %f828, %f1135; mov.b32 {%rs523, %rs530}, %r466; // begin inline asm { cvt.f32.f16 %f829, %rs523;} // end inline asm fma.rn.ftz.f32 %f1146, %f1141, %f829, %f1136; mov.b32 {%rs524, %rs531}, %r474; // begin inline asm { cvt.f32.f16 %f830, %rs524;} // end inline asm fma.rn.ftz.f32 %f1147, %f1141, %f830, %f1137; mov.b32 {%rs525, %rs532}, %r482; // begin inline asm { cvt.f32.f16 %f831, %rs525;} // end inline asm fma.rn.ftz.f32 %f1148, %f1141, %f831, 
%f1138; shr.u32 %r493, %r67, 28; cvt.u16.u32 %rs747, %r493; shl.b16 %rs748, %rs747, 4; cvt.s16.s8 %rs749, %rs748; shr.s16 %rs750, %rs749, 7; and.b16 %rs751, %rs750, -16; or.b16 %rs752, %rs751, %rs747; cvt.rn.f32.s16 %f1149, %rs752; sub.ftz.f32 %f1150, %f1149, %f9; mul.ftz.f32 %f1151, %f77, %f1150; // begin inline asm { cvt.f32.f16 %f832, %rs526;} // end inline asm fma.rn.ftz.f32 %f1326, %f1151, %f832, %f1142; // begin inline asm { cvt.f32.f16 %f833, %rs527;} // end inline asm fma.rn.ftz.f32 %f1325, %f1151, %f833, %f1143; // begin inline asm { cvt.f32.f16 %f834, %rs528;} // end inline asm fma.rn.ftz.f32 %f1324, %f1151, %f834, %f1144; // begin inline asm { cvt.f32.f16 %f835, %rs529;} // end inline asm fma.rn.ftz.f32 %f1323, %f1151, %f835, %f1145; // begin inline asm { cvt.f32.f16 %f836, %rs530;} // end inline asm fma.rn.ftz.f32 %f1322, %f1151, %f836, %f1146; // begin inline asm { cvt.f32.f16 %f837, %rs531;} // end inline asm fma.rn.ftz.f32 %f1321, %f1151, %f837, %f1147; // begin inline asm { cvt.f32.f16 %f838, %rs532;} // end inline asm fma.rn.ftz.f32 %f1320, %f1151, %f838, %f1148; $L__BB0_8: add.s32 %r682, %r682, 4; shl.b32 %r494, %r682, 5; add.s32 %r681, %r494, %r60; shl.b32 %r680, %r681, 2; setp.lt.u32 %p7, %r680, %r57; @%p7 bra $L__BB0_2; $L__BB0_9: mov.u32 %r496, %tid.y; shl.b32 %r497, %r496, 5; add.s32 %r48, %r497, %r60; setp.lt.u32 %p8, %r48, 32; shl.b32 %r499, %r48, 2; mov.u32 %r500, _ZZ9gemv_int4ILi4ELi128ELi7EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage; add.s32 %r501, %r500, %r499; @%p8 bra $L__BB0_11; add.s32 %r673, %r501, -112; st.shared.f32 [%r673], %f1326; $L__BB0_11: setp.gt.u32 %p9, %r48, 31; bar.sync 0; mad.lo.s32 %r50, %r48, 12, %r500; @%p9 bra $L__BB0_13; mov.u32 %r520, 16; ld.shared.f32 %f1167, [%r50+16]; add.ftz.f32 %f1168, %f1326, %f1167; ld.shared.f32 %f1169, [%r50+20]; add.ftz.f32 %f1170, %f1168, %f1169; ld.shared.f32 %f1171, [%r50+24]; add.ftz.f32 %f1154, %f1170, %f1171; mov.u32 %r508, 1; mov.u32 %r521, 31; mov.u32 %r522, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1154, %r508, %r521, %r522; @p add.f32 r0, r0, %f1154; mov.f32 %f1152, r0;} // end inline asm mov.u32 %r511, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1152, %r511, %r521, %r522; @p add.f32 r0, r0, %f1152; mov.f32 %f1155, r0;} // end inline asm mov.u32 %r514, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1155, %r514, %r521, %r522; @p add.f32 r0, r0, %f1155; mov.f32 %f1158, r0;} // end inline asm mov.u32 %r517, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1158, %r517, %r521, %r522; @p add.f32 r0, r0, %f1158; mov.f32 %f1161, r0;} // end inline asm // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1161, %r520, %r521, %r522; @p add.f32 r0, r0, %f1161; mov.f32 %f1326, r0;} // end inline asm $L__BB0_13: @%p8 bra $L__BB0_15; add.s32 %r674, %r501, -112; st.shared.f32 [%r674+640], %f1325; $L__BB0_15: bar.sync 0; @%p9 bra $L__BB0_17; ld.shared.f32 %f1187, [%r50+656]; add.ftz.f32 %f1188, %f1325, %f1187; ld.shared.f32 %f1189, [%r50+660]; add.ftz.f32 %f1190, %f1188, %f1189; ld.shared.f32 %f1191, [%r50+664]; add.ftz.f32 %f1174, %f1190, %f1191; mov.u32 %r532, 1; mov.u32 %r545, 31; mov.u32 %r546, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1174, %r532, %r545, %r546; @p add.f32 r0, r0, %f1174; mov.f32 %f1172, r0;} // end inline asm mov.u32 %r535, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; 
shfl.sync.down.b32 r0|p, %f1172, %r535, %r545, %r546; @p add.f32 r0, r0, %f1172; mov.f32 %f1175, r0;} // end inline asm mov.u32 %r538, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1175, %r538, %r545, %r546; @p add.f32 r0, r0, %f1175; mov.f32 %f1178, r0;} // end inline asm mov.u32 %r541, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1178, %r541, %r545, %r546; @p add.f32 r0, r0, %f1178; mov.f32 %f1181, r0;} // end inline asm mov.u32 %r544, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1181, %r544, %r545, %r546; @p add.f32 r0, r0, %f1181; mov.f32 %f1325, r0;} // end inline asm $L__BB0_17: @%p8 bra $L__BB0_19; add.s32 %r675, %r501, -112; st.shared.f32 [%r675+1280], %f1324; $L__BB0_19: bar.sync 0; @%p9 bra $L__BB0_21; ld.shared.f32 %f1207, [%r50+1296]; add.ftz.f32 %f1208, %f1324, %f1207; ld.shared.f32 %f1209, [%r50+1300]; add.ftz.f32 %f1210, %f1208, %f1209; ld.shared.f32 %f1211, [%r50+1304]; add.ftz.f32 %f1194, %f1210, %f1211; mov.u32 %r556, 1; mov.u32 %r569, 31; mov.u32 %r570, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1194, %r556, %r569, %r570; @p add.f32 r0, r0, %f1194; mov.f32 %f1192, r0;} // end inline asm mov.u32 %r559, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1192, %r559, %r569, %r570; @p add.f32 r0, r0, %f1192; mov.f32 %f1195, r0;} // end inline asm mov.u32 %r562, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1195, %r562, %r569, %r570; @p add.f32 r0, r0, %f1195; mov.f32 %f1198, r0;} // end inline asm mov.u32 %r565, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1198, %r565, %r569, %r570; @p add.f32 r0, r0, %f1198; mov.f32 %f1201, r0;} // end inline asm mov.u32 %r568, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1201, %r568, %r569, %r570; @p add.f32 r0, r0, %f1201; mov.f32 %f1324, r0;} // end inline asm $L__BB0_21: @%p8 bra $L__BB0_23; add.s32 %r676, %r501, -112; st.shared.f32 [%r676+1920], %f1323; $L__BB0_23: bar.sync 0; @%p9 bra $L__BB0_25; ld.shared.f32 %f1227, [%r50+1936]; add.ftz.f32 %f1228, %f1323, %f1227; ld.shared.f32 %f1229, [%r50+1940]; add.ftz.f32 %f1230, %f1228, %f1229; ld.shared.f32 %f1231, [%r50+1944]; add.ftz.f32 %f1214, %f1230, %f1231; mov.u32 %r580, 1; mov.u32 %r593, 31; mov.u32 %r594, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1214, %r580, %r593, %r594; @p add.f32 r0, r0, %f1214; mov.f32 %f1212, r0;} // end inline asm mov.u32 %r583, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1212, %r583, %r593, %r594; @p add.f32 r0, r0, %f1212; mov.f32 %f1215, r0;} // end inline asm mov.u32 %r586, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1215, %r586, %r593, %r594; @p add.f32 r0, r0, %f1215; mov.f32 %f1218, r0;} // end inline asm mov.u32 %r589, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1218, %r589, %r593, %r594; @p add.f32 r0, r0, %f1218; mov.f32 %f1221, r0;} // end inline asm mov.u32 %r592, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1221, %r592, %r593, %r594; @p add.f32 r0, r0, %f1221; mov.f32 %f1323, r0;} // end inline asm $L__BB0_25: @%p8 bra $L__BB0_27; add.s32 %r677, %r501, -112; st.shared.f32 [%r677+2560], %f1322; $L__BB0_27: bar.sync 0; @%p9 bra $L__BB0_29; ld.shared.f32 %f1247, [%r50+2576]; add.ftz.f32 %f1248, 
%f1322, %f1247; ld.shared.f32 %f1249, [%r50+2580]; add.ftz.f32 %f1250, %f1248, %f1249; ld.shared.f32 %f1251, [%r50+2584]; add.ftz.f32 %f1234, %f1250, %f1251; mov.u32 %r604, 1; mov.u32 %r617, 31; mov.u32 %r618, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1234, %r604, %r617, %r618; @p add.f32 r0, r0, %f1234; mov.f32 %f1232, r0;} // end inline asm mov.u32 %r607, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1232, %r607, %r617, %r618; @p add.f32 r0, r0, %f1232; mov.f32 %f1235, r0;} // end inline asm mov.u32 %r610, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1235, %r610, %r617, %r618; @p add.f32 r0, r0, %f1235; mov.f32 %f1238, r0;} // end inline asm mov.u32 %r613, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1238, %r613, %r617, %r618; @p add.f32 r0, r0, %f1238; mov.f32 %f1241, r0;} // end inline asm mov.u32 %r616, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1241, %r616, %r617, %r618; @p add.f32 r0, r0, %f1241; mov.f32 %f1322, r0;} // end inline asm $L__BB0_29: @%p8 bra $L__BB0_31; add.s32 %r678, %r501, -112; st.shared.f32 [%r678+3200], %f1321; $L__BB0_31: bar.sync 0; @%p9 bra $L__BB0_33; ld.shared.f32 %f1267, [%r50+3216]; add.ftz.f32 %f1268, %f1321, %f1267; ld.shared.f32 %f1269, [%r50+3220]; add.ftz.f32 %f1270, %f1268, %f1269; ld.shared.f32 %f1271, [%r50+3224]; add.ftz.f32 %f1254, %f1270, %f1271; mov.u32 %r628, 1; mov.u32 %r641, 31; mov.u32 %r642, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1254, %r628, %r641, %r642; @p add.f32 r0, r0, %f1254; mov.f32 %f1252, r0;} // end inline asm mov.u32 %r631, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1252, %r631, %r641, %r642; @p add.f32 r0, r0, %f1252; mov.f32 %f1255, r0;} // end inline asm mov.u32 %r634, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1255, %r634, %r641, %r642; @p add.f32 r0, r0, %f1255; mov.f32 %f1258, r0;} // end inline asm mov.u32 %r637, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1258, %r637, %r641, %r642; @p add.f32 r0, r0, %f1258; mov.f32 %f1261, r0;} // end inline asm mov.u32 %r640, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1261, %r640, %r641, %r642; @p add.f32 r0, r0, %f1261; mov.f32 %f1321, r0;} // end inline asm $L__BB0_33: @%p8 bra $L__BB0_35; add.s32 %r679, %r501, -112; st.shared.f32 [%r679+3840], %f1320; $L__BB0_35: bar.sync 0; @%p9 bra $L__BB0_37; ld.shared.f32 %f1287, [%r50+3856]; add.ftz.f32 %f1288, %f1320, %f1287; ld.shared.f32 %f1289, [%r50+3860]; add.ftz.f32 %f1290, %f1288, %f1289; ld.shared.f32 %f1291, [%r50+3864]; add.ftz.f32 %f1274, %f1290, %f1291; mov.u32 %r652, 1; mov.u32 %r665, 31; mov.u32 %r666, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1274, %r652, %r665, %r666; @p add.f32 r0, r0, %f1274; mov.f32 %f1272, r0;} // end inline asm mov.u32 %r655, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1272, %r655, %r665, %r666; @p add.f32 r0, r0, %f1272; mov.f32 %f1275, r0;} // end inline asm mov.u32 %r658, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1275, %r658, %r665, %r666; @p add.f32 r0, r0, %f1275; mov.f32 %f1278, r0;} // end inline asm mov.u32 %r661, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1278, %r661, %r665, %r666; @p 
add.f32 r0, r0, %f1278; mov.f32 %f1281, r0;} // end inline asm mov.u32 %r664, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1281, %r664, %r665, %r666; @p add.f32 r0, r0, %f1281; mov.f32 %f1320, r0;} // end inline asm $L__BB0_37: or.b32 %r669, %r60, %r496; setp.ne.s32 %p22, %r669, 0; @%p22 bra $L__BB0_41; ld.param.u64 %rd71, [_Z28dequant_gemv_group128_batch723DequantGemvKernelParams_param_0+8]; ld.param.u64 %rd70, [_Z28dequant_gemv_group128_batch723DequantGemvKernelParams_param_0]; cvta.to.global.u64 %rd46, %rd70; setp.eq.s64 %p23, %rd71, 0; mul.ftz.f32 %f52, %f68, %f1326; mov.u32 %r670, %ctaid.x; cvt.s64.s32 %rd13, %r670; mul.wide.s32 %rd47, %r670, 2; add.s64 %rd14, %rd46, %rd47; mul.ftz.f32 %f53, %f68, %f1325; add.s32 %r671, %r56, %r670; cvt.s64.s32 %rd15, %r56; mul.wide.s32 %rd48, %r56, 2; add.s64 %rd16, %rd14, %rd48; mul.ftz.f32 %f54, %f68, %f1324; add.s32 %r672, %r671, %r56; cvt.s64.s32 %rd17, %r672; mul.wide.s32 %rd49, %r672, 2; add.s64 %rd19, %rd46, %rd49; mul.ftz.f32 %f55, %f68, %f1323; mul.ftz.f32 %f56, %f68, %f1322; mul.ftz.f32 %f57, %f68, %f1321; mul.ftz.f32 %f58, %f68, %f1320; @%p23 bra $L__BB0_40; ld.param.u64 %rd72, [_Z28dequant_gemv_group128_batch723DequantGemvKernelParams_param_0+8]; cvta.to.global.u64 %rd50, %rd72; shl.b64 %rd51, %rd13, 1; add.s64 %rd52, %rd50, %rd51; ld.global.u16 %rs753, [%rd52]; // begin inline asm { cvt.f32.f16 %f1292, %rs753;} // end inline asm fma.rn.ftz.f32 %f1293, %f69, %f1292, %f52; // begin inline asm { cvt.rn.f16.f32 %rs754, %f1293;} // end inline asm st.global.u16 [%rd14], %rs754; shl.b64 %rd53, %rd15, 1; add.s64 %rd54, %rd52, %rd53; ld.global.u16 %rs755, [%rd54]; // begin inline asm { cvt.f32.f16 %f1294, %rs755;} // end inline asm fma.rn.ftz.f32 %f1295, %f69, %f1294, %f53; // begin inline asm { cvt.rn.f16.f32 %rs756, %f1295;} // end inline asm st.global.u16 [%rd16], %rs756; shl.b64 %rd55, %rd17, 1; add.s64 %rd56, %rd50, %rd55; ld.global.u16 %rs757, [%rd56]; // begin inline asm { cvt.f32.f16 %f1296, %rs757;} // end inline asm fma.rn.ftz.f32 %f1297, %f69, %f1296, %f54; // begin inline asm { cvt.rn.f16.f32 %rs758, %f1297;} // end inline asm st.global.u16 [%rd19], %rs758; add.s64 %rd57, %rd56, %rd53; ld.global.u16 %rs759, [%rd57]; // begin inline asm { cvt.f32.f16 %f1298, %rs759;} // end inline asm fma.rn.ftz.f32 %f1299, %f69, %f1298, %f55; // begin inline asm { cvt.rn.f16.f32 %rs760, %f1299;} // end inline asm add.s64 %rd58, %rd19, %rd53; st.global.u16 [%rd58], %rs760; add.s64 %rd59, %rd57, %rd53; ld.global.u16 %rs761, [%rd59]; // begin inline asm { cvt.f32.f16 %f1300, %rs761;} // end inline asm fma.rn.ftz.f32 %f1301, %f69, %f1300, %f56; // begin inline asm { cvt.rn.f16.f32 %rs762, %f1301;} // end inline asm add.s64 %rd60, %rd58, %rd53; st.global.u16 [%rd60], %rs762; add.s64 %rd61, %rd59, %rd53; ld.global.u16 %rs763, [%rd61]; // begin inline asm { cvt.f32.f16 %f1302, %rs763;} // end inline asm fma.rn.ftz.f32 %f1303, %f69, %f1302, %f57; // begin inline asm { cvt.rn.f16.f32 %rs764, %f1303;} // end inline asm add.s64 %rd62, %rd60, %rd53; st.global.u16 [%rd62], %rs764; add.s64 %rd63, %rd61, %rd53; ld.global.u16 %rs765, [%rd63]; // begin inline asm { cvt.f32.f16 %f1304, %rs765;} // end inline asm fma.rn.ftz.f32 %f1305, %f69, %f1304, %f58; // begin inline asm { cvt.rn.f16.f32 %rs766, %f1305;} // end inline asm add.s64 %rd64, %rd62, %rd53; st.global.u16 [%rd64], %rs766; bra.uni $L__BB0_41; $L__BB0_40: // begin inline asm { cvt.rn.f16.f32 %rs767, %f52;} // end inline asm st.global.u16 [%rd14], %rs767; // begin inline asm 
{ cvt.rn.f16.f32 %rs768, %f53;} // end inline asm st.global.u16 [%rd16], %rs768; // begin inline asm { cvt.rn.f16.f32 %rs769, %f54;} // end inline asm st.global.u16 [%rd19], %rs769; // begin inline asm { cvt.rn.f16.f32 %rs770, %f55;} // end inline asm shl.b64 %rd65, %rd15, 1; add.s64 %rd66, %rd19, %rd65; st.global.u16 [%rd66], %rs770; // begin inline asm { cvt.rn.f16.f32 %rs771, %f56;} // end inline asm add.s64 %rd67, %rd66, %rd65; st.global.u16 [%rd67], %rs771; // begin inline asm { cvt.rn.f16.f32 %rs772, %f57;} // end inline asm add.s64 %rd68, %rd67, %rd65; st.global.u16 [%rd68], %rs772; // begin inline asm { cvt.rn.f16.f32 %rs773, %f58;} // end inline asm add.s64 %rd69, %rd68, %rd65; st.global.u16 [%rd69], %rs773;
$L__BB0_41: ret; }
// .globl _ZN3cub11EmptyKernelIvEEvv
.visible .entry _ZN3cub11EmptyKernelIvEEvv() { ret; }
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-31678015
// Cuda compilation tools, release 11.8, V11.8.85
// Based on NVVM 7.0.1
//
.version 7.8
.target sm_60
.address_size 64
// .globl _Z28dequant_gemv_group128_batch523DequantGemvKernelParams
// _ZZ9gemv_int4ILi4ELi128ELi5EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage has been demoted
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_1f8a2ee16thrust6system6detail10sequential3seqE[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_1f8a2ee16thrust12placeholders2_1E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_1f8a2ee16thrust12placeholders2_2E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_1f8a2ee16thrust12placeholders2_3E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_1f8a2ee16thrust12placeholders2_4E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_1f8a2ee16thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_1f8a2ee16thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_1f8a2ee16thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_1f8a2ee16thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_1f8a2ee16thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_1f8a2ee16thrust12placeholders3_10E[1];
.visible .entry _Z28dequant_gemv_group128_batch523DequantGemvKernelParams( .param .align 8 .b8 _Z28dequant_gemv_group128_batch523DequantGemvKernelParams_param_0[80] ) {
.reg .pred %p<20>; .reg .b16 %rs<598>; .reg .f32 %f<1015>; .reg .b32 %r<476>; .reg .b64 %rd<62>;
// demoted variable
.shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi128ELi5EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[3200];
ld.param.v2.u32 {%r47, %r48}, [_Z28dequant_gemv_group128_batch523DequantGemvKernelParams_param_0+48];
ld.param.v2.u32 {%r49, %r50}, [_Z28dequant_gemv_group128_batch523DequantGemvKernelParams_param_0+56];
ld.param.v2.f32 {%f50, %f51}, [_Z28dequant_gemv_group128_batch523DequantGemvKernelParams_param_0+64];
ld.param.v4.u8 {%rs37, %rs38, %rs39, %rs40}, [_Z28dequant_gemv_group128_batch523DequantGemvKernelParams_param_0+72];
ld.param.u64 %rd23, [_Z28dequant_gemv_group128_batch523DequantGemvKernelParams_param_0+40];
ld.param.u64 %rd22, [_Z28dequant_gemv_group128_batch523DequantGemvKernelParams_param_0+32];
ld.param.u64 %rd21, [_Z28dequant_gemv_group128_batch523DequantGemvKernelParams_param_0+24];
ld.param.u64 %rd20, [_Z28dequant_gemv_group128_batch523DequantGemvKernelParams_param_0+16];
mov.u32 %r475, %tid.y; shl.b32 %r51, %r475, 5; mov.u32 %r52, %tid.x; add.s32 %r474, %r51, %r52; shl.b32 %r473, %r474, 2; setp.ge.u32
%p1, %r473, %r49; mov.f32 %f1000, 0f00000000; mov.f32 %f1001, %f1000; mov.f32 %f1002, %f1000; mov.f32 %f1003, %f1000; mov.f32 %f1004, %f1000; @%p1 bra $L__BB0_9; cvta.to.global.u64 %rd2, %rd20; mov.u32 %r53, %ctaid.x; mul.lo.s32 %r6, %r50, %r53; shl.b16 %rs2, %rs37, 3; cvta.to.global.u64 %rd3, %rd21; cvta.to.global.u64 %rd4, %rd23; $L__BB0_2: mad.lo.s32 %r55, %r49, %r53, %r473; mul.wide.u32 %rd24, %r55, 4; add.s64 %rd25, %rd3, %rd24; ld.global.v4.u32 {%r56, %r57, %r58, %r59}, [%rd25]; shr.u32 %r61, %r52, 2; shl.b32 %r62, %r475, 3; add.s32 %r14, %r62, %r61; add.s32 %r15, %r14, %r6; mul.wide.s32 %rd26, %r15, 2; add.s64 %rd27, %rd4, %rd26; ld.global.u16 %rs45, [%rd27]; // begin inline asm { cvt.f32.f16 %f57, %rs45;} // end inline asm setp.eq.s64 %p2, %rd22, 0; mov.u16 %rs597, %rs2; @%p2 bra $L__BB0_4; shr.u32 %r63, %r15, 31; add.s32 %r64, %r15, %r63; shr.s32 %r65, %r64, 1; cvt.s64.s32 %rd28, %r65; cvta.to.global.u64 %rd29, %rd22; add.s64 %rd30, %rd29, %rd28; ld.global.u8 %r66, [%rd30]; shl.b32 %r67, %r14, 2; and.b32 %r68, %r67, 4; shr.u32 %r69, %r66, %r68; cvt.u16.u32 %rs46, %r69; and.b16 %rs597, %rs46, 15; $L__BB0_4: shl.b32 %r16, %r474, 5; setp.ge.s32 %p3, %r16, %r47; @%p3 bra $L__BB0_8; setp.eq.s16 %p4, %rs37, 0; shr.u16 %rs48, %rs597, 3; and.b16 %rs49, %rs48, 1; setp.eq.b16 %p5, %rs49, 1; and.pred %p6, %p4, %p5; selp.b16 %rs50, -16, 0, %p6; or.b16 %rs51, %rs50, %rs597; cvt.s16.s8 %rs52, %rs51; cvt.rn.f32.s16 %f7, %rs52; mul.wide.s32 %rd31, %r16, 2; add.s64 %rd5, %rd2, %rd31; ld.global.v4.u32 {%r70, %r71, %r72, %r73}, [%rd5]; mul.wide.s32 %rd32, %r47, 2; add.s64 %rd33, %rd5, %rd32; ld.global.v4.u32 {%r74, %r75, %r76, %r77}, [%rd33]; add.s32 %r78, %r16, %r47; add.s32 %r79, %r78, %r47; shl.b32 %r80, %r47, 1; mul.wide.s32 %rd34, %r80, 2; add.s64 %rd35, %rd5, %rd34; ld.global.v4.u32 {%r81, %r82, %r83, %r84}, [%rd35]; add.s32 %r85, %r79, %r47; mul.wide.s32 %rd36, %r85, 2; add.s64 %rd6, %rd2, %rd36; ld.global.v4.u32 {%r86, %r87, %r88, %r89}, [%rd6]; add.s64 %rd37, %rd35, %rd34; ld.global.v4.u32 {%r90, %r91, %r92, %r93}, [%rd37]; cvt.u16.u32 %rs53, %r56; shr.u16 %rs54, %rs53, 4; and.b16 %rs5, %rs54, 15; shr.u16 %rs6, %rs53, 12; shr.u32 %r94, %r56, 16; cvt.u16.u32 %rs55, %r94; and.b16 %rs7, %rs55, 15; shr.u32 %r95, %r56, 20; cvt.u16.u32 %rs56, %r95; and.b16 %rs8, %rs56, 15; shr.u32 %r96, %r56, 24; cvt.u16.u32 %rs57, %r96; and.b16 %rs9, %rs57, 15; add.s32 %r97, %r78, 8; mul.wide.s32 %rd38, %r97, 2; add.s64 %rd7, %rd2, %rd38; add.s32 %r98, %r97, %r47; mul.wide.s32 %rd39, %r98, 2; add.s64 %rd8, %rd2, %rd39; add.s32 %r99, %r98, %r47; add.s32 %r100, %r99, %r47; mul.wide.s32 %rd40, %r100, 2; add.s64 %rd9, %rd2, %rd40; cvt.u16.u32 %rs58, %r57; shr.u16 %rs59, %rs58, 4; and.b16 %rs10, %rs59, 15; shr.u16 %rs60, %rs58, 8; and.b16 %rs11, %rs60, 15; shr.u16 %rs12, %rs58, 12; shr.u32 %r101, %r57, 16; cvt.u16.u32 %rs61, %r101; and.b16 %rs13, %rs61, 15; cvt.u16.u32 %rs62, %r58; shr.u16 %rs63, %rs62, 4; and.b16 %rs14, %rs63, 15; shr.u16 %rs64, %rs62, 8; and.b16 %rs15, %rs64, 15; shr.u16 %rs16, %rs62, 12; shr.u32 %r102, %r58, 16; cvt.u16.u32 %rs65, %r102; and.b16 %rs17, %rs65, 15; shr.u32 %r103, %r58, 20; cvt.u16.u32 %rs66, %r103; and.b16 %rs18, %rs66, 15; shr.u32 %r104, %r58, 24; cvt.u16.u32 %rs67, %r104; and.b16 %rs19, %rs67, 15; cvt.u16.u32 %rs68, %r59; shr.u16 %rs69, %rs68, 4; and.b16 %rs20, %rs69, 15; shr.u16 %rs70, %rs68, 8; and.b16 %rs21, %rs70, 15; shr.u16 %rs22, %rs68, 12; shr.u32 %r105, %r59, 16; cvt.u16.u32 %rs71, %r105; and.b16 %rs23, %rs71, 15; shr.u32 %r106, %r59, 20; cvt.u16.u32 %rs24, %r106; and.b16 
%rs25, %rs24, 15; shr.u32 %r107, %r59, 24; cvt.u16.u32 %rs26, %r107; and.b16 %rs27, %rs26, 15; shr.u32 %r108, %r59, 28; cvt.u16.u32 %rs28, %r108; @%p4 bra $L__BB0_7; and.b16 %rs233, %rs53, 15; cvt.rn.f32.s16 %f218, %rs233; sub.ftz.f32 %f219, %f218, %f7; mul.ftz.f32 %f220, %f57, %f219; mov.b32 {%rs72, %rs77}, %r70; // begin inline asm { cvt.f32.f16 %f58, %rs72;} // end inline asm fma.rn.ftz.f32 %f221, %f220, %f58, %f1004; mov.b32 {%rs73, %rs78}, %r74; // begin inline asm { cvt.f32.f16 %f59, %rs73;} // end inline asm fma.rn.ftz.f32 %f222, %f220, %f59, %f1003; mov.b32 {%rs74, %rs79}, %r81; // begin inline asm { cvt.f32.f16 %f60, %rs74;} // end inline asm fma.rn.ftz.f32 %f223, %f220, %f60, %f1002; mov.b32 {%rs75, %rs80}, %r86; // begin inline asm { cvt.f32.f16 %f61, %rs75;} // end inline asm fma.rn.ftz.f32 %f224, %f220, %f61, %f1001; mov.b32 {%rs76, %rs81}, %r90; // begin inline asm { cvt.f32.f16 %f62, %rs76;} // end inline asm fma.rn.ftz.f32 %f225, %f220, %f62, %f1000; cvt.rn.f32.s16 %f226, %rs5; sub.ftz.f32 %f227, %f226, %f7; mul.ftz.f32 %f228, %f57, %f227; // begin inline asm { cvt.f32.f16 %f63, %rs77;} // end inline asm fma.rn.ftz.f32 %f229, %f228, %f63, %f221; // begin inline asm { cvt.f32.f16 %f64, %rs78;} // end inline asm fma.rn.ftz.f32 %f230, %f228, %f64, %f222; // begin inline asm { cvt.f32.f16 %f65, %rs79;} // end inline asm fma.rn.ftz.f32 %f231, %f228, %f65, %f223; // begin inline asm { cvt.f32.f16 %f66, %rs80;} // end inline asm fma.rn.ftz.f32 %f232, %f228, %f66, %f224; // begin inline asm { cvt.f32.f16 %f67, %rs81;} // end inline asm fma.rn.ftz.f32 %f233, %f228, %f67, %f225; shr.u16 %rs234, %rs53, 8; and.b16 %rs235, %rs234, 15; cvt.rn.f32.s16 %f234, %rs235; sub.ftz.f32 %f235, %f234, %f7; mul.ftz.f32 %f236, %f57, %f235; mov.b32 {%rs82, %rs87}, %r71; // begin inline asm { cvt.f32.f16 %f68, %rs82;} // end inline asm fma.rn.ftz.f32 %f237, %f236, %f68, %f229; mov.b32 {%rs83, %rs88}, %r75; // begin inline asm { cvt.f32.f16 %f69, %rs83;} // end inline asm fma.rn.ftz.f32 %f238, %f236, %f69, %f230; mov.b32 {%rs84, %rs89}, %r82; // begin inline asm { cvt.f32.f16 %f70, %rs84;} // end inline asm fma.rn.ftz.f32 %f239, %f236, %f70, %f231; mov.b32 {%rs85, %rs90}, %r87; // begin inline asm { cvt.f32.f16 %f71, %rs85;} // end inline asm fma.rn.ftz.f32 %f240, %f236, %f71, %f232; mov.b32 {%rs86, %rs91}, %r91; // begin inline asm { cvt.f32.f16 %f72, %rs86;} // end inline asm fma.rn.ftz.f32 %f241, %f236, %f72, %f233; cvt.rn.f32.s16 %f242, %rs6; sub.ftz.f32 %f243, %f242, %f7; mul.ftz.f32 %f244, %f57, %f243; // begin inline asm { cvt.f32.f16 %f73, %rs87;} // end inline asm fma.rn.ftz.f32 %f245, %f244, %f73, %f237; // begin inline asm { cvt.f32.f16 %f74, %rs88;} // end inline asm fma.rn.ftz.f32 %f246, %f244, %f74, %f238; // begin inline asm { cvt.f32.f16 %f75, %rs89;} // end inline asm fma.rn.ftz.f32 %f247, %f244, %f75, %f239; // begin inline asm { cvt.f32.f16 %f76, %rs90;} // end inline asm fma.rn.ftz.f32 %f248, %f244, %f76, %f240; // begin inline asm { cvt.f32.f16 %f77, %rs91;} // end inline asm fma.rn.ftz.f32 %f249, %f244, %f77, %f241; cvt.rn.f32.s16 %f250, %rs7; sub.ftz.f32 %f251, %f250, %f7; mul.ftz.f32 %f252, %f57, %f251; mov.b32 {%rs92, %rs97}, %r72; // begin inline asm { cvt.f32.f16 %f78, %rs92;} // end inline asm fma.rn.ftz.f32 %f253, %f252, %f78, %f245; mov.b32 {%rs93, %rs98}, %r76; // begin inline asm { cvt.f32.f16 %f79, %rs93;} // end inline asm fma.rn.ftz.f32 %f254, %f252, %f79, %f246; mov.b32 {%rs94, %rs99}, %r83; // begin inline asm { cvt.f32.f16 %f80, %rs94;} // end inline asm 
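// NOTE: this is the body of the second entry point, the batch-5 instantiation of the
// same GEMV; the path executing here (taken when the flag byte %rs37 is non-zero)
// masks each nibble to 0..15 without sign extension, dequantizes it as %f57 * (q - %f7),
// and accumulates into five running sums that end up in %f1000..%f1004.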
fma.rn.ftz.f32 %f255, %f252, %f80, %f247; mov.b32 {%rs95, %rs100}, %r88; // begin inline asm { cvt.f32.f16 %f81, %rs95;} // end inline asm fma.rn.ftz.f32 %f256, %f252, %f81, %f248; mov.b32 {%rs96, %rs101}, %r92; // begin inline asm { cvt.f32.f16 %f82, %rs96;} // end inline asm fma.rn.ftz.f32 %f257, %f252, %f82, %f249; cvt.rn.f32.s16 %f258, %rs8; sub.ftz.f32 %f259, %f258, %f7; mul.ftz.f32 %f260, %f57, %f259; // begin inline asm { cvt.f32.f16 %f83, %rs97;} // end inline asm fma.rn.ftz.f32 %f261, %f260, %f83, %f253; // begin inline asm { cvt.f32.f16 %f84, %rs98;} // end inline asm fma.rn.ftz.f32 %f262, %f260, %f84, %f254; // begin inline asm { cvt.f32.f16 %f85, %rs99;} // end inline asm fma.rn.ftz.f32 %f263, %f260, %f85, %f255; // begin inline asm { cvt.f32.f16 %f86, %rs100;} // end inline asm fma.rn.ftz.f32 %f264, %f260, %f86, %f256; // begin inline asm { cvt.f32.f16 %f87, %rs101;} // end inline asm fma.rn.ftz.f32 %f265, %f260, %f87, %f257; cvt.rn.f32.s16 %f266, %rs9; sub.ftz.f32 %f267, %f266, %f7; mul.ftz.f32 %f268, %f57, %f267; mov.b32 {%rs102, %rs107}, %r73; // begin inline asm { cvt.f32.f16 %f88, %rs102;} // end inline asm fma.rn.ftz.f32 %f269, %f268, %f88, %f261; mov.b32 {%rs103, %rs108}, %r77; // begin inline asm { cvt.f32.f16 %f89, %rs103;} // end inline asm fma.rn.ftz.f32 %f270, %f268, %f89, %f262; mov.b32 {%rs104, %rs109}, %r84; // begin inline asm { cvt.f32.f16 %f90, %rs104;} // end inline asm fma.rn.ftz.f32 %f271, %f268, %f90, %f263; mov.b32 {%rs105, %rs110}, %r89; // begin inline asm { cvt.f32.f16 %f91, %rs105;} // end inline asm fma.rn.ftz.f32 %f272, %f268, %f91, %f264; mov.b32 {%rs106, %rs111}, %r93; // begin inline asm { cvt.f32.f16 %f92, %rs106;} // end inline asm fma.rn.ftz.f32 %f273, %f268, %f92, %f265; shr.u32 %r109, %r56, 28; cvt.u16.u32 %rs236, %r109; cvt.rn.f32.s16 %f274, %rs236; sub.ftz.f32 %f275, %f274, %f7; mul.ftz.f32 %f276, %f57, %f275; // begin inline asm { cvt.f32.f16 %f93, %rs107;} // end inline asm fma.rn.ftz.f32 %f277, %f276, %f93, %f269; // begin inline asm { cvt.f32.f16 %f94, %rs108;} // end inline asm fma.rn.ftz.f32 %f278, %f276, %f94, %f270; // begin inline asm { cvt.f32.f16 %f95, %rs109;} // end inline asm fma.rn.ftz.f32 %f279, %f276, %f95, %f271; // begin inline asm { cvt.f32.f16 %f96, %rs110;} // end inline asm fma.rn.ftz.f32 %f280, %f276, %f96, %f272; // begin inline asm { cvt.f32.f16 %f97, %rs111;} // end inline asm fma.rn.ftz.f32 %f281, %f276, %f97, %f273; ld.global.v4.u32 {%r110, %r111, %r112, %r113}, [%rd5+16]; ld.global.v4.u32 {%r118, %r119, %r120, %r121}, [%rd7]; ld.global.v4.u32 {%r126, %r127, %r128, %r129}, [%rd8]; ld.global.v4.u32 {%r134, %r135, %r136, %r137}, [%rd6+16]; ld.global.v4.u32 {%r142, %r143, %r144, %r145}, [%rd9]; and.b16 %rs238, %rs58, 15; cvt.rn.f32.s16 %f282, %rs238; sub.ftz.f32 %f283, %f282, %f7; mul.ftz.f32 %f284, %f57, %f283; mov.b32 {%rs112, %rs117}, %r110; // begin inline asm { cvt.f32.f16 %f98, %rs112;} // end inline asm fma.rn.ftz.f32 %f285, %f284, %f98, %f277; mov.b32 {%rs113, %rs118}, %r118; // begin inline asm { cvt.f32.f16 %f99, %rs113;} // end inline asm fma.rn.ftz.f32 %f286, %f284, %f99, %f278; mov.b32 {%rs114, %rs119}, %r126; // begin inline asm { cvt.f32.f16 %f100, %rs114;} // end inline asm fma.rn.ftz.f32 %f287, %f284, %f100, %f279; mov.b32 {%rs115, %rs120}, %r134; // begin inline asm { cvt.f32.f16 %f101, %rs115;} // end inline asm fma.rn.ftz.f32 %f288, %f284, %f101, %f280; mov.b32 {%rs116, %rs121}, %r142; // begin inline asm { cvt.f32.f16 %f102, %rs116;} // end inline asm fma.rn.ftz.f32 %f289, %f284, %f102, 
%f281; cvt.rn.f32.s16 %f290, %rs10; sub.ftz.f32 %f291, %f290, %f7; mul.ftz.f32 %f292, %f57, %f291; // begin inline asm { cvt.f32.f16 %f103, %rs117;} // end inline asm fma.rn.ftz.f32 %f293, %f292, %f103, %f285; // begin inline asm { cvt.f32.f16 %f104, %rs118;} // end inline asm fma.rn.ftz.f32 %f294, %f292, %f104, %f286; // begin inline asm { cvt.f32.f16 %f105, %rs119;} // end inline asm fma.rn.ftz.f32 %f295, %f292, %f105, %f287; // begin inline asm { cvt.f32.f16 %f106, %rs120;} // end inline asm fma.rn.ftz.f32 %f296, %f292, %f106, %f288; // begin inline asm { cvt.f32.f16 %f107, %rs121;} // end inline asm fma.rn.ftz.f32 %f297, %f292, %f107, %f289; cvt.rn.f32.s16 %f298, %rs11; sub.ftz.f32 %f299, %f298, %f7; mul.ftz.f32 %f300, %f57, %f299; mov.b32 {%rs122, %rs127}, %r111; // begin inline asm { cvt.f32.f16 %f108, %rs122;} // end inline asm fma.rn.ftz.f32 %f301, %f300, %f108, %f293; mov.b32 {%rs123, %rs128}, %r119; // begin inline asm { cvt.f32.f16 %f109, %rs123;} // end inline asm fma.rn.ftz.f32 %f302, %f300, %f109, %f294; mov.b32 {%rs124, %rs129}, %r127; // begin inline asm { cvt.f32.f16 %f110, %rs124;} // end inline asm fma.rn.ftz.f32 %f303, %f300, %f110, %f295; mov.b32 {%rs125, %rs130}, %r135; // begin inline asm { cvt.f32.f16 %f111, %rs125;} // end inline asm fma.rn.ftz.f32 %f304, %f300, %f111, %f296; mov.b32 {%rs126, %rs131}, %r143; // begin inline asm { cvt.f32.f16 %f112, %rs126;} // end inline asm fma.rn.ftz.f32 %f305, %f300, %f112, %f297; cvt.rn.f32.s16 %f306, %rs12; sub.ftz.f32 %f307, %f306, %f7; mul.ftz.f32 %f308, %f57, %f307; // begin inline asm { cvt.f32.f16 %f113, %rs127;} // end inline asm fma.rn.ftz.f32 %f309, %f308, %f113, %f301; // begin inline asm { cvt.f32.f16 %f114, %rs128;} // end inline asm fma.rn.ftz.f32 %f310, %f308, %f114, %f302; // begin inline asm { cvt.f32.f16 %f115, %rs129;} // end inline asm fma.rn.ftz.f32 %f311, %f308, %f115, %f303; // begin inline asm { cvt.f32.f16 %f116, %rs130;} // end inline asm fma.rn.ftz.f32 %f312, %f308, %f116, %f304; // begin inline asm { cvt.f32.f16 %f117, %rs131;} // end inline asm fma.rn.ftz.f32 %f313, %f308, %f117, %f305; cvt.rn.f32.s16 %f314, %rs13; sub.ftz.f32 %f315, %f314, %f7; mul.ftz.f32 %f316, %f57, %f315; mov.b32 {%rs132, %rs137}, %r112; // begin inline asm { cvt.f32.f16 %f118, %rs132;} // end inline asm fma.rn.ftz.f32 %f317, %f316, %f118, %f309; mov.b32 {%rs133, %rs138}, %r120; // begin inline asm { cvt.f32.f16 %f119, %rs133;} // end inline asm fma.rn.ftz.f32 %f318, %f316, %f119, %f310; mov.b32 {%rs134, %rs139}, %r128; // begin inline asm { cvt.f32.f16 %f120, %rs134;} // end inline asm fma.rn.ftz.f32 %f319, %f316, %f120, %f311; mov.b32 {%rs135, %rs140}, %r136; // begin inline asm { cvt.f32.f16 %f121, %rs135;} // end inline asm fma.rn.ftz.f32 %f320, %f316, %f121, %f312; mov.b32 {%rs136, %rs141}, %r144; // begin inline asm { cvt.f32.f16 %f122, %rs136;} // end inline asm fma.rn.ftz.f32 %f321, %f316, %f122, %f313; shr.u32 %r150, %r57, 20; cvt.u16.u32 %rs239, %r150; and.b16 %rs240, %rs239, 15; cvt.rn.f32.s16 %f322, %rs240; sub.ftz.f32 %f323, %f322, %f7; mul.ftz.f32 %f324, %f57, %f323; // begin inline asm { cvt.f32.f16 %f123, %rs137;} // end inline asm fma.rn.ftz.f32 %f325, %f324, %f123, %f317; // begin inline asm { cvt.f32.f16 %f124, %rs138;} // end inline asm fma.rn.ftz.f32 %f326, %f324, %f124, %f318; // begin inline asm { cvt.f32.f16 %f125, %rs139;} // end inline asm fma.rn.ftz.f32 %f327, %f324, %f125, %f319; // begin inline asm { cvt.f32.f16 %f126, %rs140;} // end inline asm fma.rn.ftz.f32 %f328, %f324, %f126, %f320; // begin 
inline asm { cvt.f32.f16 %f127, %rs141;} // end inline asm fma.rn.ftz.f32 %f329, %f324, %f127, %f321; shr.u32 %r151, %r57, 24; cvt.u16.u32 %rs241, %r151; and.b16 %rs242, %rs241, 15; cvt.rn.f32.s16 %f330, %rs242; sub.ftz.f32 %f331, %f330, %f7; mul.ftz.f32 %f332, %f57, %f331; mov.b32 {%rs142, %rs147}, %r113; // begin inline asm { cvt.f32.f16 %f128, %rs142;} // end inline asm fma.rn.ftz.f32 %f333, %f332, %f128, %f325; mov.b32 {%rs143, %rs148}, %r121; // begin inline asm { cvt.f32.f16 %f129, %rs143;} // end inline asm fma.rn.ftz.f32 %f334, %f332, %f129, %f326; mov.b32 {%rs144, %rs149}, %r129; // begin inline asm { cvt.f32.f16 %f130, %rs144;} // end inline asm fma.rn.ftz.f32 %f335, %f332, %f130, %f327; mov.b32 {%rs145, %rs150}, %r137; // begin inline asm { cvt.f32.f16 %f131, %rs145;} // end inline asm fma.rn.ftz.f32 %f336, %f332, %f131, %f328; mov.b32 {%rs146, %rs151}, %r145; // begin inline asm { cvt.f32.f16 %f132, %rs146;} // end inline asm fma.rn.ftz.f32 %f337, %f332, %f132, %f329; shr.u32 %r152, %r57, 28; cvt.u16.u32 %rs243, %r152; cvt.rn.f32.s16 %f338, %rs243; sub.ftz.f32 %f339, %f338, %f7; mul.ftz.f32 %f340, %f57, %f339; // begin inline asm { cvt.f32.f16 %f133, %rs147;} // end inline asm fma.rn.ftz.f32 %f341, %f340, %f133, %f333; // begin inline asm { cvt.f32.f16 %f134, %rs148;} // end inline asm fma.rn.ftz.f32 %f342, %f340, %f134, %f334; // begin inline asm { cvt.f32.f16 %f135, %rs149;} // end inline asm fma.rn.ftz.f32 %f343, %f340, %f135, %f335; // begin inline asm { cvt.f32.f16 %f136, %rs150;} // end inline asm fma.rn.ftz.f32 %f344, %f340, %f136, %f336; // begin inline asm { cvt.f32.f16 %f137, %rs151;} // end inline asm fma.rn.ftz.f32 %f345, %f340, %f137, %f337; ld.global.v4.u32 {%r153, %r154, %r155, %r156}, [%rd5+32]; ld.global.v4.u32 {%r161, %r162, %r163, %r164}, [%rd7+16]; ld.global.v4.u32 {%r169, %r170, %r171, %r172}, [%rd8+16]; ld.global.v4.u32 {%r177, %r178, %r179, %r180}, [%rd6+32]; ld.global.v4.u32 {%r185, %r186, %r187, %r188}, [%rd9+16]; and.b16 %rs245, %rs62, 15; cvt.rn.f32.s16 %f346, %rs245; sub.ftz.f32 %f347, %f346, %f7; mul.ftz.f32 %f348, %f57, %f347; mov.b32 {%rs152, %rs157}, %r153; // begin inline asm { cvt.f32.f16 %f138, %rs152;} // end inline asm fma.rn.ftz.f32 %f349, %f348, %f138, %f341; mov.b32 {%rs153, %rs158}, %r161; // begin inline asm { cvt.f32.f16 %f139, %rs153;} // end inline asm fma.rn.ftz.f32 %f350, %f348, %f139, %f342; mov.b32 {%rs154, %rs159}, %r169; // begin inline asm { cvt.f32.f16 %f140, %rs154;} // end inline asm fma.rn.ftz.f32 %f351, %f348, %f140, %f343; mov.b32 {%rs155, %rs160}, %r177; // begin inline asm { cvt.f32.f16 %f141, %rs155;} // end inline asm fma.rn.ftz.f32 %f352, %f348, %f141, %f344; mov.b32 {%rs156, %rs161}, %r185; // begin inline asm { cvt.f32.f16 %f142, %rs156;} // end inline asm fma.rn.ftz.f32 %f353, %f348, %f142, %f345; cvt.rn.f32.s16 %f354, %rs14; sub.ftz.f32 %f355, %f354, %f7; mul.ftz.f32 %f356, %f57, %f355; // begin inline asm { cvt.f32.f16 %f143, %rs157;} // end inline asm fma.rn.ftz.f32 %f357, %f356, %f143, %f349; // begin inline asm { cvt.f32.f16 %f144, %rs158;} // end inline asm fma.rn.ftz.f32 %f358, %f356, %f144, %f350; // begin inline asm { cvt.f32.f16 %f145, %rs159;} // end inline asm fma.rn.ftz.f32 %f359, %f356, %f145, %f351; // begin inline asm { cvt.f32.f16 %f146, %rs160;} // end inline asm fma.rn.ftz.f32 %f360, %f356, %f146, %f352; // begin inline asm { cvt.f32.f16 %f147, %rs161;} // end inline asm fma.rn.ftz.f32 %f361, %f356, %f147, %f353; cvt.rn.f32.s16 %f362, %rs15; sub.ftz.f32 %f363, %f362, %f7; mul.ftz.f32 %f364, 
%f57, %f363; mov.b32 {%rs162, %rs167}, %r154; // begin inline asm { cvt.f32.f16 %f148, %rs162;} // end inline asm fma.rn.ftz.f32 %f365, %f364, %f148, %f357; mov.b32 {%rs163, %rs168}, %r162; // begin inline asm { cvt.f32.f16 %f149, %rs163;} // end inline asm fma.rn.ftz.f32 %f366, %f364, %f149, %f358; mov.b32 {%rs164, %rs169}, %r170; // begin inline asm { cvt.f32.f16 %f150, %rs164;} // end inline asm fma.rn.ftz.f32 %f367, %f364, %f150, %f359; mov.b32 {%rs165, %rs170}, %r178; // begin inline asm { cvt.f32.f16 %f151, %rs165;} // end inline asm fma.rn.ftz.f32 %f368, %f364, %f151, %f360; mov.b32 {%rs166, %rs171}, %r186; // begin inline asm { cvt.f32.f16 %f152, %rs166;} // end inline asm fma.rn.ftz.f32 %f369, %f364, %f152, %f361; cvt.rn.f32.s16 %f370, %rs16; sub.ftz.f32 %f371, %f370, %f7; mul.ftz.f32 %f372, %f57, %f371; // begin inline asm { cvt.f32.f16 %f153, %rs167;} // end inline asm fma.rn.ftz.f32 %f373, %f372, %f153, %f365; // begin inline asm { cvt.f32.f16 %f154, %rs168;} // end inline asm fma.rn.ftz.f32 %f374, %f372, %f154, %f366; // begin inline asm { cvt.f32.f16 %f155, %rs169;} // end inline asm fma.rn.ftz.f32 %f375, %f372, %f155, %f367; // begin inline asm { cvt.f32.f16 %f156, %rs170;} // end inline asm fma.rn.ftz.f32 %f376, %f372, %f156, %f368; // begin inline asm { cvt.f32.f16 %f157, %rs171;} // end inline asm fma.rn.ftz.f32 %f377, %f372, %f157, %f369; cvt.rn.f32.s16 %f378, %rs17; sub.ftz.f32 %f379, %f378, %f7; mul.ftz.f32 %f380, %f57, %f379; mov.b32 {%rs172, %rs177}, %r155; // begin inline asm { cvt.f32.f16 %f158, %rs172;} // end inline asm fma.rn.ftz.f32 %f381, %f380, %f158, %f373; mov.b32 {%rs173, %rs178}, %r163; // begin inline asm { cvt.f32.f16 %f159, %rs173;} // end inline asm fma.rn.ftz.f32 %f382, %f380, %f159, %f374; mov.b32 {%rs174, %rs179}, %r171; // begin inline asm { cvt.f32.f16 %f160, %rs174;} // end inline asm fma.rn.ftz.f32 %f383, %f380, %f160, %f375; mov.b32 {%rs175, %rs180}, %r179; // begin inline asm { cvt.f32.f16 %f161, %rs175;} // end inline asm fma.rn.ftz.f32 %f384, %f380, %f161, %f376; mov.b32 {%rs176, %rs181}, %r187; // begin inline asm { cvt.f32.f16 %f162, %rs176;} // end inline asm fma.rn.ftz.f32 %f385, %f380, %f162, %f377; cvt.rn.f32.s16 %f386, %rs18; sub.ftz.f32 %f387, %f386, %f7; mul.ftz.f32 %f388, %f57, %f387; // begin inline asm { cvt.f32.f16 %f163, %rs177;} // end inline asm fma.rn.ftz.f32 %f389, %f388, %f163, %f381; // begin inline asm { cvt.f32.f16 %f164, %rs178;} // end inline asm fma.rn.ftz.f32 %f390, %f388, %f164, %f382; // begin inline asm { cvt.f32.f16 %f165, %rs179;} // end inline asm fma.rn.ftz.f32 %f391, %f388, %f165, %f383; // begin inline asm { cvt.f32.f16 %f166, %rs180;} // end inline asm fma.rn.ftz.f32 %f392, %f388, %f166, %f384; // begin inline asm { cvt.f32.f16 %f167, %rs181;} // end inline asm fma.rn.ftz.f32 %f393, %f388, %f167, %f385; cvt.rn.f32.s16 %f394, %rs19; sub.ftz.f32 %f395, %f394, %f7; mul.ftz.f32 %f396, %f57, %f395; mov.b32 {%rs182, %rs187}, %r156; // begin inline asm { cvt.f32.f16 %f168, %rs182;} // end inline asm fma.rn.ftz.f32 %f397, %f396, %f168, %f389; mov.b32 {%rs183, %rs188}, %r164; // begin inline asm { cvt.f32.f16 %f169, %rs183;} // end inline asm fma.rn.ftz.f32 %f398, %f396, %f169, %f390; mov.b32 {%rs184, %rs189}, %r172; // begin inline asm { cvt.f32.f16 %f170, %rs184;} // end inline asm fma.rn.ftz.f32 %f399, %f396, %f170, %f391; mov.b32 {%rs185, %rs190}, %r180; // begin inline asm { cvt.f32.f16 %f171, %rs185;} // end inline asm fma.rn.ftz.f32 %f400, %f396, %f171, %f392; mov.b32 {%rs186, %rs191}, %r188; // begin 
inline asm { cvt.f32.f16 %f172, %rs186;} // end inline asm fma.rn.ftz.f32 %f401, %f396, %f172, %f393; shr.u32 %r193, %r58, 28; cvt.u16.u32 %rs246, %r193; cvt.rn.f32.s16 %f402, %rs246; sub.ftz.f32 %f403, %f402, %f7; mul.ftz.f32 %f404, %f57, %f403; // begin inline asm { cvt.f32.f16 %f173, %rs187;} // end inline asm fma.rn.ftz.f32 %f405, %f404, %f173, %f397; // begin inline asm { cvt.f32.f16 %f174, %rs188;} // end inline asm fma.rn.ftz.f32 %f406, %f404, %f174, %f398; // begin inline asm { cvt.f32.f16 %f175, %rs189;} // end inline asm fma.rn.ftz.f32 %f407, %f404, %f175, %f399; // begin inline asm { cvt.f32.f16 %f176, %rs190;} // end inline asm fma.rn.ftz.f32 %f408, %f404, %f176, %f400; // begin inline asm { cvt.f32.f16 %f177, %rs191;} // end inline asm fma.rn.ftz.f32 %f409, %f404, %f177, %f401; ld.global.v4.u32 {%r194, %r195, %r196, %r197}, [%rd5+48]; ld.global.v4.u32 {%r202, %r203, %r204, %r205}, [%rd7+32]; ld.global.v4.u32 {%r210, %r211, %r212, %r213}, [%rd8+32]; ld.global.v4.u32 {%r218, %r219, %r220, %r221}, [%rd6+48]; ld.global.v4.u32 {%r226, %r227, %r228, %r229}, [%rd9+32]; and.b16 %rs248, %rs68, 15; cvt.rn.f32.s16 %f410, %rs248; sub.ftz.f32 %f411, %f410, %f7; mul.ftz.f32 %f412, %f57, %f411; mov.b32 {%rs192, %rs197}, %r194; // begin inline asm { cvt.f32.f16 %f178, %rs192;} // end inline asm fma.rn.ftz.f32 %f413, %f412, %f178, %f405; mov.b32 {%rs193, %rs198}, %r202; // begin inline asm { cvt.f32.f16 %f179, %rs193;} // end inline asm fma.rn.ftz.f32 %f414, %f412, %f179, %f406; mov.b32 {%rs194, %rs199}, %r210; // begin inline asm { cvt.f32.f16 %f180, %rs194;} // end inline asm fma.rn.ftz.f32 %f415, %f412, %f180, %f407; mov.b32 {%rs195, %rs200}, %r218; // begin inline asm { cvt.f32.f16 %f181, %rs195;} // end inline asm fma.rn.ftz.f32 %f416, %f412, %f181, %f408; mov.b32 {%rs196, %rs201}, %r226; // begin inline asm { cvt.f32.f16 %f182, %rs196;} // end inline asm fma.rn.ftz.f32 %f417, %f412, %f182, %f409; cvt.rn.f32.s16 %f418, %rs20; sub.ftz.f32 %f419, %f418, %f7; mul.ftz.f32 %f420, %f57, %f419; // begin inline asm { cvt.f32.f16 %f183, %rs197;} // end inline asm fma.rn.ftz.f32 %f421, %f420, %f183, %f413; // begin inline asm { cvt.f32.f16 %f184, %rs198;} // end inline asm fma.rn.ftz.f32 %f422, %f420, %f184, %f414; // begin inline asm { cvt.f32.f16 %f185, %rs199;} // end inline asm fma.rn.ftz.f32 %f423, %f420, %f185, %f415; // begin inline asm { cvt.f32.f16 %f186, %rs200;} // end inline asm fma.rn.ftz.f32 %f424, %f420, %f186, %f416; // begin inline asm { cvt.f32.f16 %f187, %rs201;} // end inline asm fma.rn.ftz.f32 %f425, %f420, %f187, %f417; cvt.rn.f32.s16 %f426, %rs21; sub.ftz.f32 %f427, %f426, %f7; mul.ftz.f32 %f428, %f57, %f427; mov.b32 {%rs202, %rs207}, %r195; // begin inline asm { cvt.f32.f16 %f188, %rs202;} // end inline asm fma.rn.ftz.f32 %f429, %f428, %f188, %f421; mov.b32 {%rs203, %rs208}, %r203; // begin inline asm { cvt.f32.f16 %f189, %rs203;} // end inline asm fma.rn.ftz.f32 %f430, %f428, %f189, %f422; mov.b32 {%rs204, %rs209}, %r211; // begin inline asm { cvt.f32.f16 %f190, %rs204;} // end inline asm fma.rn.ftz.f32 %f431, %f428, %f190, %f423; mov.b32 {%rs205, %rs210}, %r219; // begin inline asm { cvt.f32.f16 %f191, %rs205;} // end inline asm fma.rn.ftz.f32 %f432, %f428, %f191, %f424; mov.b32 {%rs206, %rs211}, %r227; // begin inline asm { cvt.f32.f16 %f192, %rs206;} // end inline asm fma.rn.ftz.f32 %f433, %f428, %f192, %f425; cvt.rn.f32.s16 %f434, %rs22; sub.ftz.f32 %f435, %f434, %f7; mul.ftz.f32 %f436, %f57, %f435; // begin inline asm { cvt.f32.f16 %f193, %rs207;} // end inline asm 
fma.rn.ftz.f32 %f437, %f436, %f193, %f429; // begin inline asm { cvt.f32.f16 %f194, %rs208;} // end inline asm fma.rn.ftz.f32 %f438, %f436, %f194, %f430; // begin inline asm { cvt.f32.f16 %f195, %rs209;} // end inline asm fma.rn.ftz.f32 %f439, %f436, %f195, %f431; // begin inline asm { cvt.f32.f16 %f196, %rs210;} // end inline asm fma.rn.ftz.f32 %f440, %f436, %f196, %f432; // begin inline asm { cvt.f32.f16 %f197, %rs211;} // end inline asm fma.rn.ftz.f32 %f441, %f436, %f197, %f433; cvt.rn.f32.s16 %f442, %rs23; sub.ftz.f32 %f443, %f442, %f7; mul.ftz.f32 %f444, %f57, %f443; mov.b32 {%rs212, %rs217}, %r196; // begin inline asm { cvt.f32.f16 %f198, %rs212;} // end inline asm fma.rn.ftz.f32 %f445, %f444, %f198, %f437; mov.b32 {%rs213, %rs218}, %r204; // begin inline asm { cvt.f32.f16 %f199, %rs213;} // end inline asm fma.rn.ftz.f32 %f446, %f444, %f199, %f438; mov.b32 {%rs214, %rs219}, %r212; // begin inline asm { cvt.f32.f16 %f200, %rs214;} // end inline asm fma.rn.ftz.f32 %f447, %f444, %f200, %f439; mov.b32 {%rs215, %rs220}, %r220; // begin inline asm { cvt.f32.f16 %f201, %rs215;} // end inline asm fma.rn.ftz.f32 %f448, %f444, %f201, %f440; mov.b32 {%rs216, %rs221}, %r228; // begin inline asm { cvt.f32.f16 %f202, %rs216;} // end inline asm fma.rn.ftz.f32 %f449, %f444, %f202, %f441; cvt.rn.f32.s16 %f450, %rs25; sub.ftz.f32 %f451, %f450, %f7; mul.ftz.f32 %f452, %f57, %f451; // begin inline asm { cvt.f32.f16 %f203, %rs217;} // end inline asm fma.rn.ftz.f32 %f453, %f452, %f203, %f445; // begin inline asm { cvt.f32.f16 %f204, %rs218;} // end inline asm fma.rn.ftz.f32 %f454, %f452, %f204, %f446; // begin inline asm { cvt.f32.f16 %f205, %rs219;} // end inline asm fma.rn.ftz.f32 %f455, %f452, %f205, %f447; // begin inline asm { cvt.f32.f16 %f206, %rs220;} // end inline asm fma.rn.ftz.f32 %f456, %f452, %f206, %f448; // begin inline asm { cvt.f32.f16 %f207, %rs221;} // end inline asm fma.rn.ftz.f32 %f457, %f452, %f207, %f449; cvt.rn.f32.s16 %f458, %rs27; sub.ftz.f32 %f459, %f458, %f7; mul.ftz.f32 %f460, %f57, %f459; mov.b32 {%rs222, %rs227}, %r197; // begin inline asm { cvt.f32.f16 %f208, %rs222;} // end inline asm fma.rn.ftz.f32 %f461, %f460, %f208, %f453; mov.b32 {%rs223, %rs228}, %r205; // begin inline asm { cvt.f32.f16 %f209, %rs223;} // end inline asm fma.rn.ftz.f32 %f462, %f460, %f209, %f454; mov.b32 {%rs224, %rs229}, %r213; // begin inline asm { cvt.f32.f16 %f210, %rs224;} // end inline asm fma.rn.ftz.f32 %f463, %f460, %f210, %f455; mov.b32 {%rs225, %rs230}, %r221; // begin inline asm { cvt.f32.f16 %f211, %rs225;} // end inline asm fma.rn.ftz.f32 %f464, %f460, %f211, %f456; mov.b32 {%rs226, %rs231}, %r229; // begin inline asm { cvt.f32.f16 %f212, %rs226;} // end inline asm fma.rn.ftz.f32 %f465, %f460, %f212, %f457; cvt.rn.f32.s16 %f466, %rs28; sub.ftz.f32 %f467, %f466, %f7; mul.ftz.f32 %f468, %f57, %f467; // begin inline asm { cvt.f32.f16 %f213, %rs227;} // end inline asm fma.rn.ftz.f32 %f1004, %f468, %f213, %f461; // begin inline asm { cvt.f32.f16 %f214, %rs228;} // end inline asm fma.rn.ftz.f32 %f1003, %f468, %f214, %f462; // begin inline asm { cvt.f32.f16 %f215, %rs229;} // end inline asm fma.rn.ftz.f32 %f1002, %f468, %f215, %f463; // begin inline asm { cvt.f32.f16 %f216, %rs230;} // end inline asm fma.rn.ftz.f32 %f1001, %f468, %f216, %f464; // begin inline asm { cvt.f32.f16 %f217, %rs231;} // end inline asm fma.rn.ftz.f32 %f1000, %f468, %f217, %f465; bra.uni $L__BB0_8; $L__BB0_7: shl.b16 %rs410, %rs53, 4; cvt.s16.s8 %rs411, %rs410; shr.s16 %rs412, %rs411, 7; and.b16 %rs413, %rs412, -16; 
and.b16 %rs414, %rs53, 15; or.b16 %rs415, %rs413, %rs414; cvt.rn.f32.s16 %f629, %rs415; sub.ftz.f32 %f630, %f629, %f7; mul.ftz.f32 %f631, %f57, %f630; mov.b32 {%rs249, %rs254}, %r70; // begin inline asm { cvt.f32.f16 %f469, %rs249;} // end inline asm fma.rn.ftz.f32 %f632, %f631, %f469, %f1004; mov.b32 {%rs250, %rs255}, %r74; // begin inline asm { cvt.f32.f16 %f470, %rs250;} // end inline asm fma.rn.ftz.f32 %f633, %f631, %f470, %f1003; mov.b32 {%rs251, %rs256}, %r81; // begin inline asm { cvt.f32.f16 %f471, %rs251;} // end inline asm fma.rn.ftz.f32 %f634, %f631, %f471, %f1002; mov.b32 {%rs252, %rs257}, %r86; // begin inline asm { cvt.f32.f16 %f472, %rs252;} // end inline asm fma.rn.ftz.f32 %f635, %f631, %f472, %f1001; mov.b32 {%rs253, %rs258}, %r90; // begin inline asm { cvt.f32.f16 %f473, %rs253;} // end inline asm fma.rn.ftz.f32 %f636, %f631, %f473, %f1000; cvt.s16.s8 %rs416, %rs53; shr.s16 %rs417, %rs416, 7; and.b16 %rs418, %rs417, -16; or.b16 %rs419, %rs418, %rs5; cvt.rn.f32.s16 %f637, %rs419; sub.ftz.f32 %f638, %f637, %f7; mul.ftz.f32 %f639, %f57, %f638; // begin inline asm { cvt.f32.f16 %f474, %rs254;} // end inline asm fma.rn.ftz.f32 %f640, %f639, %f474, %f632; // begin inline asm { cvt.f32.f16 %f475, %rs255;} // end inline asm fma.rn.ftz.f32 %f641, %f639, %f475, %f633; // begin inline asm { cvt.f32.f16 %f476, %rs256;} // end inline asm fma.rn.ftz.f32 %f642, %f639, %f476, %f634; // begin inline asm { cvt.f32.f16 %f477, %rs257;} // end inline asm fma.rn.ftz.f32 %f643, %f639, %f477, %f635; // begin inline asm { cvt.f32.f16 %f478, %rs258;} // end inline asm fma.rn.ftz.f32 %f644, %f639, %f478, %f636; shr.u32 %r234, %r56, 8; cvt.u16.u32 %rs420, %r234; shl.b16 %rs421, %rs420, 4; cvt.s16.s8 %rs422, %rs421; shr.s16 %rs423, %rs422, 7; and.b16 %rs424, %rs423, -16; and.b16 %rs425, %rs420, 15; or.b16 %rs426, %rs424, %rs425; cvt.rn.f32.s16 %f645, %rs426; sub.ftz.f32 %f646, %f645, %f7; mul.ftz.f32 %f647, %f57, %f646; mov.b32 {%rs259, %rs264}, %r71; // begin inline asm { cvt.f32.f16 %f479, %rs259;} // end inline asm fma.rn.ftz.f32 %f648, %f647, %f479, %f640; mov.b32 {%rs260, %rs265}, %r75; // begin inline asm { cvt.f32.f16 %f480, %rs260;} // end inline asm fma.rn.ftz.f32 %f649, %f647, %f480, %f641; mov.b32 {%rs261, %rs266}, %r82; // begin inline asm { cvt.f32.f16 %f481, %rs261;} // end inline asm fma.rn.ftz.f32 %f650, %f647, %f481, %f642; mov.b32 {%rs262, %rs267}, %r87; // begin inline asm { cvt.f32.f16 %f482, %rs262;} // end inline asm fma.rn.ftz.f32 %f651, %f647, %f482, %f643; mov.b32 {%rs263, %rs268}, %r91; // begin inline asm { cvt.f32.f16 %f483, %rs263;} // end inline asm fma.rn.ftz.f32 %f652, %f647, %f483, %f644; shr.s16 %rs427, %rs53, 15; and.b16 %rs428, %rs427, -16; or.b16 %rs429, %rs428, %rs6; cvt.rn.f32.s16 %f653, %rs429; sub.ftz.f32 %f654, %f653, %f7; mul.ftz.f32 %f655, %f57, %f654; // begin inline asm { cvt.f32.f16 %f484, %rs264;} // end inline asm fma.rn.ftz.f32 %f656, %f655, %f484, %f648; // begin inline asm { cvt.f32.f16 %f485, %rs265;} // end inline asm fma.rn.ftz.f32 %f657, %f655, %f485, %f649; // begin inline asm { cvt.f32.f16 %f486, %rs266;} // end inline asm fma.rn.ftz.f32 %f658, %f655, %f486, %f650; // begin inline asm { cvt.f32.f16 %f487, %rs267;} // end inline asm fma.rn.ftz.f32 %f659, %f655, %f487, %f651; // begin inline asm { cvt.f32.f16 %f488, %rs268;} // end inline asm fma.rn.ftz.f32 %f660, %f655, %f488, %f652; shl.b16 %rs431, %rs55, 4; cvt.s16.s8 %rs432, %rs431; shr.s16 %rs433, %rs432, 7; and.b16 %rs434, %rs433, -16; or.b16 %rs435, %rs434, %rs7; cvt.rn.f32.s16 %f661, 
%rs435; sub.ftz.f32 %f662, %f661, %f7; mul.ftz.f32 %f663, %f57, %f662; mov.b32 {%rs269, %rs274}, %r72; // begin inline asm { cvt.f32.f16 %f489, %rs269;} // end inline asm fma.rn.ftz.f32 %f664, %f663, %f489, %f656; mov.b32 {%rs270, %rs275}, %r76; // begin inline asm { cvt.f32.f16 %f490, %rs270;} // end inline asm fma.rn.ftz.f32 %f665, %f663, %f490, %f657; mov.b32 {%rs271, %rs276}, %r83; // begin inline asm { cvt.f32.f16 %f491, %rs271;} // end inline asm fma.rn.ftz.f32 %f666, %f663, %f491, %f658; mov.b32 {%rs272, %rs277}, %r88; // begin inline asm { cvt.f32.f16 %f492, %rs272;} // end inline asm fma.rn.ftz.f32 %f667, %f663, %f492, %f659; mov.b32 {%rs273, %rs278}, %r92; // begin inline asm { cvt.f32.f16 %f493, %rs273;} // end inline asm fma.rn.ftz.f32 %f668, %f663, %f493, %f660; shl.b16 %rs437, %rs56, 4; cvt.s16.s8 %rs438, %rs437; shr.s16 %rs439, %rs438, 7; and.b16 %rs440, %rs439, -16; or.b16 %rs441, %rs440, %rs8; cvt.rn.f32.s16 %f669, %rs441; sub.ftz.f32 %f670, %f669, %f7; mul.ftz.f32 %f671, %f57, %f670; // begin inline asm { cvt.f32.f16 %f494, %rs274;} // end inline asm fma.rn.ftz.f32 %f672, %f671, %f494, %f664; // begin inline asm { cvt.f32.f16 %f495, %rs275;} // end inline asm fma.rn.ftz.f32 %f673, %f671, %f495, %f665; // begin inline asm { cvt.f32.f16 %f496, %rs276;} // end inline asm fma.rn.ftz.f32 %f674, %f671, %f496, %f666; // begin inline asm { cvt.f32.f16 %f497, %rs277;} // end inline asm fma.rn.ftz.f32 %f675, %f671, %f497, %f667; // begin inline asm { cvt.f32.f16 %f498, %rs278;} // end inline asm fma.rn.ftz.f32 %f676, %f671, %f498, %f668; shl.b16 %rs443, %rs57, 4; cvt.s16.s8 %rs444, %rs443; shr.s16 %rs445, %rs444, 7; and.b16 %rs446, %rs445, -16; or.b16 %rs447, %rs446, %rs9; cvt.rn.f32.s16 %f677, %rs447; sub.ftz.f32 %f678, %f677, %f7; mul.ftz.f32 %f679, %f57, %f678; mov.b32 {%rs279, %rs284}, %r73; // begin inline asm { cvt.f32.f16 %f499, %rs279;} // end inline asm fma.rn.ftz.f32 %f680, %f679, %f499, %f672; mov.b32 {%rs280, %rs285}, %r77; // begin inline asm { cvt.f32.f16 %f500, %rs280;} // end inline asm fma.rn.ftz.f32 %f681, %f679, %f500, %f673; mov.b32 {%rs281, %rs286}, %r84; // begin inline asm { cvt.f32.f16 %f501, %rs281;} // end inline asm fma.rn.ftz.f32 %f682, %f679, %f501, %f674; mov.b32 {%rs282, %rs287}, %r89; // begin inline asm { cvt.f32.f16 %f502, %rs282;} // end inline asm fma.rn.ftz.f32 %f683, %f679, %f502, %f675; mov.b32 {%rs283, %rs288}, %r93; // begin inline asm { cvt.f32.f16 %f503, %rs283;} // end inline asm fma.rn.ftz.f32 %f684, %f679, %f503, %f676; shr.u32 %r238, %r56, 28; cvt.u16.u32 %rs448, %r238; shl.b16 %rs449, %rs448, 4; cvt.s16.s8 %rs450, %rs449; shr.s16 %rs451, %rs450, 7; and.b16 %rs452, %rs451, -16; or.b16 %rs453, %rs452, %rs448; cvt.rn.f32.s16 %f685, %rs453; sub.ftz.f32 %f686, %f685, %f7; mul.ftz.f32 %f687, %f57, %f686; // begin inline asm { cvt.f32.f16 %f504, %rs284;} // end inline asm fma.rn.ftz.f32 %f688, %f687, %f504, %f680; // begin inline asm { cvt.f32.f16 %f505, %rs285;} // end inline asm fma.rn.ftz.f32 %f689, %f687, %f505, %f681; // begin inline asm { cvt.f32.f16 %f506, %rs286;} // end inline asm fma.rn.ftz.f32 %f690, %f687, %f506, %f682; // begin inline asm { cvt.f32.f16 %f507, %rs287;} // end inline asm fma.rn.ftz.f32 %f691, %f687, %f507, %f683; // begin inline asm { cvt.f32.f16 %f508, %rs288;} // end inline asm fma.rn.ftz.f32 %f692, %f687, %f508, %f684; ld.global.v4.u32 {%r239, %r240, %r241, %r242}, [%rd5+16]; ld.global.v4.u32 {%r247, %r248, %r249, %r250}, [%rd7]; ld.global.v4.u32 {%r255, %r256, %r257, %r258}, [%rd8]; ld.global.v4.u32 {%r263, 
%r264, %r265, %r266}, [%rd6+16]; ld.global.v4.u32 {%r271, %r272, %r273, %r274}, [%rd9]; shl.b16 %rs455, %rs58, 4; cvt.s16.s8 %rs456, %rs455; shr.s16 %rs457, %rs456, 7; and.b16 %rs458, %rs457, -16; and.b16 %rs459, %rs58, 15; or.b16 %rs460, %rs458, %rs459; cvt.rn.f32.s16 %f693, %rs460; sub.ftz.f32 %f694, %f693, %f7; mul.ftz.f32 %f695, %f57, %f694; mov.b32 {%rs289, %rs294}, %r239; // begin inline asm { cvt.f32.f16 %f509, %rs289;} // end inline asm fma.rn.ftz.f32 %f696, %f695, %f509, %f688; mov.b32 {%rs290, %rs295}, %r247; // begin inline asm { cvt.f32.f16 %f510, %rs290;} // end inline asm fma.rn.ftz.f32 %f697, %f695, %f510, %f689; mov.b32 {%rs291, %rs296}, %r255; // begin inline asm { cvt.f32.f16 %f511, %rs291;} // end inline asm fma.rn.ftz.f32 %f698, %f695, %f511, %f690; mov.b32 {%rs292, %rs297}, %r263; // begin inline asm { cvt.f32.f16 %f512, %rs292;} // end inline asm fma.rn.ftz.f32 %f699, %f695, %f512, %f691; mov.b32 {%rs293, %rs298}, %r271; // begin inline asm { cvt.f32.f16 %f513, %rs293;} // end inline asm fma.rn.ftz.f32 %f700, %f695, %f513, %f692; cvt.s16.s8 %rs461, %rs58; shr.s16 %rs462, %rs461, 7; and.b16 %rs463, %rs462, -16; or.b16 %rs464, %rs463, %rs10; cvt.rn.f32.s16 %f701, %rs464; sub.ftz.f32 %f702, %f701, %f7; mul.ftz.f32 %f703, %f57, %f702; // begin inline asm { cvt.f32.f16 %f514, %rs294;} // end inline asm fma.rn.ftz.f32 %f704, %f703, %f514, %f696; // begin inline asm { cvt.f32.f16 %f515, %rs295;} // end inline asm fma.rn.ftz.f32 %f705, %f703, %f515, %f697; // begin inline asm { cvt.f32.f16 %f516, %rs296;} // end inline asm fma.rn.ftz.f32 %f706, %f703, %f516, %f698; // begin inline asm { cvt.f32.f16 %f517, %rs297;} // end inline asm fma.rn.ftz.f32 %f707, %f703, %f517, %f699; // begin inline asm { cvt.f32.f16 %f518, %rs298;} // end inline asm fma.rn.ftz.f32 %f708, %f703, %f518, %f700; cvt.s16.s8 %rs466, %rs59; shr.s16 %rs467, %rs466, 7; and.b16 %rs468, %rs467, -16; or.b16 %rs469, %rs468, %rs11; cvt.rn.f32.s16 %f709, %rs469; sub.ftz.f32 %f710, %f709, %f7; mul.ftz.f32 %f711, %f57, %f710; mov.b32 {%rs299, %rs304}, %r240; // begin inline asm { cvt.f32.f16 %f519, %rs299;} // end inline asm fma.rn.ftz.f32 %f712, %f711, %f519, %f704; mov.b32 {%rs300, %rs305}, %r248; // begin inline asm { cvt.f32.f16 %f520, %rs300;} // end inline asm fma.rn.ftz.f32 %f713, %f711, %f520, %f705; mov.b32 {%rs301, %rs306}, %r256; // begin inline asm { cvt.f32.f16 %f521, %rs301;} // end inline asm fma.rn.ftz.f32 %f714, %f711, %f521, %f706; mov.b32 {%rs302, %rs307}, %r264; // begin inline asm { cvt.f32.f16 %f522, %rs302;} // end inline asm fma.rn.ftz.f32 %f715, %f711, %f522, %f707; mov.b32 {%rs303, %rs308}, %r272; // begin inline asm { cvt.f32.f16 %f523, %rs303;} // end inline asm fma.rn.ftz.f32 %f716, %f711, %f523, %f708; shr.s16 %rs470, %rs58, 15; and.b16 %rs471, %rs470, -16; or.b16 %rs472, %rs471, %rs12; cvt.rn.f32.s16 %f717, %rs472; sub.ftz.f32 %f718, %f717, %f7; mul.ftz.f32 %f719, %f57, %f718; // begin inline asm { cvt.f32.f16 %f524, %rs304;} // end inline asm fma.rn.ftz.f32 %f720, %f719, %f524, %f712; // begin inline asm { cvt.f32.f16 %f525, %rs305;} // end inline asm fma.rn.ftz.f32 %f721, %f719, %f525, %f713; // begin inline asm { cvt.f32.f16 %f526, %rs306;} // end inline asm fma.rn.ftz.f32 %f722, %f719, %f526, %f714; // begin inline asm { cvt.f32.f16 %f527, %rs307;} // end inline asm fma.rn.ftz.f32 %f723, %f719, %f527, %f715; // begin inline asm { cvt.f32.f16 %f528, %rs308;} // end inline asm fma.rn.ftz.f32 %f724, %f719, %f528, %f716; shl.b16 %rs474, %rs61, 4; cvt.s16.s8 %rs475, %rs474; shr.s16 
%rs476, %rs475, 7; and.b16 %rs477, %rs476, -16; or.b16 %rs478, %rs477, %rs13; cvt.rn.f32.s16 %f725, %rs478; sub.ftz.f32 %f726, %f725, %f7; mul.ftz.f32 %f727, %f57, %f726; mov.b32 {%rs309, %rs314}, %r241; // begin inline asm { cvt.f32.f16 %f529, %rs309;} // end inline asm fma.rn.ftz.f32 %f728, %f727, %f529, %f720; mov.b32 {%rs310, %rs315}, %r249; // begin inline asm { cvt.f32.f16 %f530, %rs310;} // end inline asm fma.rn.ftz.f32 %f729, %f727, %f530, %f721; mov.b32 {%rs311, %rs316}, %r257; // begin inline asm { cvt.f32.f16 %f531, %rs311;} // end inline asm fma.rn.ftz.f32 %f730, %f727, %f531, %f722; mov.b32 {%rs312, %rs317}, %r265; // begin inline asm { cvt.f32.f16 %f532, %rs312;} // end inline asm fma.rn.ftz.f32 %f731, %f727, %f532, %f723; mov.b32 {%rs313, %rs318}, %r273; // begin inline asm { cvt.f32.f16 %f533, %rs313;} // end inline asm fma.rn.ftz.f32 %f732, %f727, %f533, %f724; shr.u32 %r280, %r57, 20; cvt.u16.u32 %rs479, %r280; shl.b16 %rs480, %rs479, 4; cvt.s16.s8 %rs481, %rs480; shr.s16 %rs482, %rs481, 7; and.b16 %rs483, %rs482, -16; and.b16 %rs484, %rs479, 15; or.b16 %rs485, %rs483, %rs484; cvt.rn.f32.s16 %f733, %rs485; sub.ftz.f32 %f734, %f733, %f7; mul.ftz.f32 %f735, %f57, %f734; // begin inline asm { cvt.f32.f16 %f534, %rs314;} // end inline asm fma.rn.ftz.f32 %f736, %f735, %f534, %f728; // begin inline asm { cvt.f32.f16 %f535, %rs315;} // end inline asm fma.rn.ftz.f32 %f737, %f735, %f535, %f729; // begin inline asm { cvt.f32.f16 %f536, %rs316;} // end inline asm fma.rn.ftz.f32 %f738, %f735, %f536, %f730; // begin inline asm { cvt.f32.f16 %f537, %rs317;} // end inline asm fma.rn.ftz.f32 %f739, %f735, %f537, %f731; // begin inline asm { cvt.f32.f16 %f538, %rs318;} // end inline asm fma.rn.ftz.f32 %f740, %f735, %f538, %f732; shr.u32 %r281, %r57, 24; cvt.u16.u32 %rs486, %r281; shl.b16 %rs487, %rs486, 4; cvt.s16.s8 %rs488, %rs487; shr.s16 %rs489, %rs488, 7; and.b16 %rs490, %rs489, -16; and.b16 %rs491, %rs486, 15; or.b16 %rs492, %rs490, %rs491; cvt.rn.f32.s16 %f741, %rs492; sub.ftz.f32 %f742, %f741, %f7; mul.ftz.f32 %f743, %f57, %f742; mov.b32 {%rs319, %rs324}, %r242; // begin inline asm { cvt.f32.f16 %f539, %rs319;} // end inline asm fma.rn.ftz.f32 %f744, %f743, %f539, %f736; mov.b32 {%rs320, %rs325}, %r250; // begin inline asm { cvt.f32.f16 %f540, %rs320;} // end inline asm fma.rn.ftz.f32 %f745, %f743, %f540, %f737; mov.b32 {%rs321, %rs326}, %r258; // begin inline asm { cvt.f32.f16 %f541, %rs321;} // end inline asm fma.rn.ftz.f32 %f746, %f743, %f541, %f738; mov.b32 {%rs322, %rs327}, %r266; // begin inline asm { cvt.f32.f16 %f542, %rs322;} // end inline asm fma.rn.ftz.f32 %f747, %f743, %f542, %f739; mov.b32 {%rs323, %rs328}, %r274; // begin inline asm { cvt.f32.f16 %f543, %rs323;} // end inline asm fma.rn.ftz.f32 %f748, %f743, %f543, %f740; shr.u32 %r282, %r57, 28; cvt.u16.u32 %rs493, %r282; shl.b16 %rs494, %rs493, 4; cvt.s16.s8 %rs495, %rs494; shr.s16 %rs496, %rs495, 7; and.b16 %rs497, %rs496, -16; or.b16 %rs498, %rs497, %rs493; cvt.rn.f32.s16 %f749, %rs498; sub.ftz.f32 %f750, %f749, %f7; mul.ftz.f32 %f751, %f57, %f750; // begin inline asm { cvt.f32.f16 %f544, %rs324;} // end inline asm fma.rn.ftz.f32 %f752, %f751, %f544, %f744; // begin inline asm { cvt.f32.f16 %f545, %rs325;} // end inline asm fma.rn.ftz.f32 %f753, %f751, %f545, %f745; // begin inline asm { cvt.f32.f16 %f546, %rs326;} // end inline asm fma.rn.ftz.f32 %f754, %f751, %f546, %f746; // begin inline asm { cvt.f32.f16 %f547, %rs327;} // end inline asm fma.rn.ftz.f32 %f755, %f751, %f547, %f747; // begin inline asm { 
cvt.f32.f16 %f548, %rs328;} // end inline asm fma.rn.ftz.f32 %f756, %f751, %f548, %f748; ld.global.v4.u32 {%r283, %r284, %r285, %r286}, [%rd5+32]; ld.global.v4.u32 {%r291, %r292, %r293, %r294}, [%rd7+16]; ld.global.v4.u32 {%r299, %r300, %r301, %r302}, [%rd8+16]; ld.global.v4.u32 {%r307, %r308, %r309, %r310}, [%rd6+32]; ld.global.v4.u32 {%r315, %r316, %r317, %r318}, [%rd9+16]; shl.b16 %rs500, %rs62, 4; cvt.s16.s8 %rs501, %rs500; shr.s16 %rs502, %rs501, 7; and.b16 %rs503, %rs502, -16; and.b16 %rs504, %rs62, 15; or.b16 %rs505, %rs503, %rs504; cvt.rn.f32.s16 %f757, %rs505; sub.ftz.f32 %f758, %f757, %f7; mul.ftz.f32 %f759, %f57, %f758; mov.b32 {%rs329, %rs334}, %r283; // begin inline asm { cvt.f32.f16 %f549, %rs329;} // end inline asm fma.rn.ftz.f32 %f760, %f759, %f549, %f752; mov.b32 {%rs330, %rs335}, %r291; // begin inline asm { cvt.f32.f16 %f550, %rs330;} // end inline asm fma.rn.ftz.f32 %f761, %f759, %f550, %f753; mov.b32 {%rs331, %rs336}, %r299; // begin inline asm { cvt.f32.f16 %f551, %rs331;} // end inline asm fma.rn.ftz.f32 %f762, %f759, %f551, %f754; mov.b32 {%rs332, %rs337}, %r307; // begin inline asm { cvt.f32.f16 %f552, %rs332;} // end inline asm fma.rn.ftz.f32 %f763, %f759, %f552, %f755; mov.b32 {%rs333, %rs338}, %r315; // begin inline asm { cvt.f32.f16 %f553, %rs333;} // end inline asm fma.rn.ftz.f32 %f764, %f759, %f553, %f756; cvt.s16.s8 %rs506, %rs62; shr.s16 %rs507, %rs506, 7; and.b16 %rs508, %rs507, -16; or.b16 %rs509, %rs508, %rs14; cvt.rn.f32.s16 %f765, %rs509; sub.ftz.f32 %f766, %f765, %f7; mul.ftz.f32 %f767, %f57, %f766; // begin inline asm { cvt.f32.f16 %f554, %rs334;} // end inline asm fma.rn.ftz.f32 %f768, %f767, %f554, %f760; // begin inline asm { cvt.f32.f16 %f555, %rs335;} // end inline asm fma.rn.ftz.f32 %f769, %f767, %f555, %f761; // begin inline asm { cvt.f32.f16 %f556, %rs336;} // end inline asm fma.rn.ftz.f32 %f770, %f767, %f556, %f762; // begin inline asm { cvt.f32.f16 %f557, %rs337;} // end inline asm fma.rn.ftz.f32 %f771, %f767, %f557, %f763; // begin inline asm { cvt.f32.f16 %f558, %rs338;} // end inline asm fma.rn.ftz.f32 %f772, %f767, %f558, %f764; cvt.s16.s8 %rs511, %rs63; shr.s16 %rs512, %rs511, 7; and.b16 %rs513, %rs512, -16; or.b16 %rs514, %rs513, %rs15; cvt.rn.f32.s16 %f773, %rs514; sub.ftz.f32 %f774, %f773, %f7; mul.ftz.f32 %f775, %f57, %f774; mov.b32 {%rs339, %rs344}, %r284; // begin inline asm { cvt.f32.f16 %f559, %rs339;} // end inline asm fma.rn.ftz.f32 %f776, %f775, %f559, %f768; mov.b32 {%rs340, %rs345}, %r292; // begin inline asm { cvt.f32.f16 %f560, %rs340;} // end inline asm fma.rn.ftz.f32 %f777, %f775, %f560, %f769; mov.b32 {%rs341, %rs346}, %r300; // begin inline asm { cvt.f32.f16 %f561, %rs341;} // end inline asm fma.rn.ftz.f32 %f778, %f775, %f561, %f770; mov.b32 {%rs342, %rs347}, %r308; // begin inline asm { cvt.f32.f16 %f562, %rs342;} // end inline asm fma.rn.ftz.f32 %f779, %f775, %f562, %f771; mov.b32 {%rs343, %rs348}, %r316; // begin inline asm { cvt.f32.f16 %f563, %rs343;} // end inline asm fma.rn.ftz.f32 %f780, %f775, %f563, %f772; shr.s16 %rs515, %rs62, 15; and.b16 %rs516, %rs515, -16; or.b16 %rs517, %rs516, %rs16; cvt.rn.f32.s16 %f781, %rs517; sub.ftz.f32 %f782, %f781, %f7; mul.ftz.f32 %f783, %f57, %f782; // begin inline asm { cvt.f32.f16 %f564, %rs344;} // end inline asm fma.rn.ftz.f32 %f784, %f783, %f564, %f776; // begin inline asm { cvt.f32.f16 %f565, %rs345;} // end inline asm fma.rn.ftz.f32 %f785, %f783, %f565, %f777; // begin inline asm { cvt.f32.f16 %f566, %rs346;} // end inline asm fma.rn.ftz.f32 %f786, %f783, %f566, 
%f778; // begin inline asm { cvt.f32.f16 %f567, %rs347;} // end inline asm fma.rn.ftz.f32 %f787, %f783, %f567, %f779; // begin inline asm { cvt.f32.f16 %f568, %rs348;} // end inline asm fma.rn.ftz.f32 %f788, %f783, %f568, %f780; shl.b16 %rs519, %rs65, 4; cvt.s16.s8 %rs520, %rs519; shr.s16 %rs521, %rs520, 7; and.b16 %rs522, %rs521, -16; or.b16 %rs523, %rs522, %rs17; cvt.rn.f32.s16 %f789, %rs523; sub.ftz.f32 %f790, %f789, %f7; mul.ftz.f32 %f791, %f57, %f790; mov.b32 {%rs349, %rs354}, %r285; // begin inline asm { cvt.f32.f16 %f569, %rs349;} // end inline asm fma.rn.ftz.f32 %f792, %f791, %f569, %f784; mov.b32 {%rs350, %rs355}, %r293; // begin inline asm { cvt.f32.f16 %f570, %rs350;} // end inline asm fma.rn.ftz.f32 %f793, %f791, %f570, %f785; mov.b32 {%rs351, %rs356}, %r301; // begin inline asm { cvt.f32.f16 %f571, %rs351;} // end inline asm fma.rn.ftz.f32 %f794, %f791, %f571, %f786; mov.b32 {%rs352, %rs357}, %r309; // begin inline asm { cvt.f32.f16 %f572, %rs352;} // end inline asm fma.rn.ftz.f32 %f795, %f791, %f572, %f787; mov.b32 {%rs353, %rs358}, %r317; // begin inline asm { cvt.f32.f16 %f573, %rs353;} // end inline asm fma.rn.ftz.f32 %f796, %f791, %f573, %f788; shl.b16 %rs525, %rs66, 4; cvt.s16.s8 %rs526, %rs525; shr.s16 %rs527, %rs526, 7; and.b16 %rs528, %rs527, -16; or.b16 %rs529, %rs528, %rs18; cvt.rn.f32.s16 %f797, %rs529; sub.ftz.f32 %f798, %f797, %f7; mul.ftz.f32 %f799, %f57, %f798; // begin inline asm { cvt.f32.f16 %f574, %rs354;} // end inline asm fma.rn.ftz.f32 %f800, %f799, %f574, %f792; // begin inline asm { cvt.f32.f16 %f575, %rs355;} // end inline asm fma.rn.ftz.f32 %f801, %f799, %f575, %f793; // begin inline asm { cvt.f32.f16 %f576, %rs356;} // end inline asm fma.rn.ftz.f32 %f802, %f799, %f576, %f794; // begin inline asm { cvt.f32.f16 %f577, %rs357;} // end inline asm fma.rn.ftz.f32 %f803, %f799, %f577, %f795; // begin inline asm { cvt.f32.f16 %f578, %rs358;} // end inline asm fma.rn.ftz.f32 %f804, %f799, %f578, %f796; shl.b16 %rs531, %rs67, 4; cvt.s16.s8 %rs532, %rs531; shr.s16 %rs533, %rs532, 7; and.b16 %rs534, %rs533, -16; or.b16 %rs535, %rs534, %rs19; cvt.rn.f32.s16 %f805, %rs535; sub.ftz.f32 %f806, %f805, %f7; mul.ftz.f32 %f807, %f57, %f806; mov.b32 {%rs359, %rs364}, %r286; // begin inline asm { cvt.f32.f16 %f579, %rs359;} // end inline asm fma.rn.ftz.f32 %f808, %f807, %f579, %f800; mov.b32 {%rs360, %rs365}, %r294; // begin inline asm { cvt.f32.f16 %f580, %rs360;} // end inline asm fma.rn.ftz.f32 %f809, %f807, %f580, %f801; mov.b32 {%rs361, %rs366}, %r302; // begin inline asm { cvt.f32.f16 %f581, %rs361;} // end inline asm fma.rn.ftz.f32 %f810, %f807, %f581, %f802; mov.b32 {%rs362, %rs367}, %r310; // begin inline asm { cvt.f32.f16 %f582, %rs362;} // end inline asm fma.rn.ftz.f32 %f811, %f807, %f582, %f803; mov.b32 {%rs363, %rs368}, %r318; // begin inline asm { cvt.f32.f16 %f583, %rs363;} // end inline asm fma.rn.ftz.f32 %f812, %f807, %f583, %f804; shr.u32 %r326, %r58, 28; cvt.u16.u32 %rs536, %r326; shl.b16 %rs537, %rs536, 4; cvt.s16.s8 %rs538, %rs537; shr.s16 %rs539, %rs538, 7; and.b16 %rs540, %rs539, -16; or.b16 %rs541, %rs540, %rs536; cvt.rn.f32.s16 %f813, %rs541; sub.ftz.f32 %f814, %f813, %f7; mul.ftz.f32 %f815, %f57, %f814; // begin inline asm { cvt.f32.f16 %f584, %rs364;} // end inline asm fma.rn.ftz.f32 %f816, %f815, %f584, %f808; // begin inline asm { cvt.f32.f16 %f585, %rs365;} // end inline asm fma.rn.ftz.f32 %f817, %f815, %f585, %f809; // begin inline asm { cvt.f32.f16 %f586, %rs366;} // end inline asm fma.rn.ftz.f32 %f818, %f815, %f586, %f810; // begin inline 
asm { cvt.f32.f16 %f587, %rs367;} // end inline asm fma.rn.ftz.f32 %f819, %f815, %f587, %f811; // begin inline asm { cvt.f32.f16 %f588, %rs368;} // end inline asm fma.rn.ftz.f32 %f820, %f815, %f588, %f812; ld.global.v4.u32 {%r327, %r328, %r329, %r330}, [%rd5+48]; ld.global.v4.u32 {%r335, %r336, %r337, %r338}, [%rd7+32]; ld.global.v4.u32 {%r343, %r344, %r345, %r346}, [%rd8+32]; ld.global.v4.u32 {%r351, %r352, %r353, %r354}, [%rd6+48]; ld.global.v4.u32 {%r359, %r360, %r361, %r362}, [%rd9+32]; shl.b16 %rs543, %rs68, 4; cvt.s16.s8 %rs544, %rs543; shr.s16 %rs545, %rs544, 7; and.b16 %rs546, %rs545, -16; and.b16 %rs547, %rs68, 15; or.b16 %rs548, %rs546, %rs547; cvt.rn.f32.s16 %f821, %rs548; sub.ftz.f32 %f822, %f821, %f7; mul.ftz.f32 %f823, %f57, %f822; mov.b32 {%rs369, %rs374}, %r327; // begin inline asm { cvt.f32.f16 %f589, %rs369;} // end inline asm fma.rn.ftz.f32 %f824, %f823, %f589, %f816; mov.b32 {%rs370, %rs375}, %r335; // begin inline asm { cvt.f32.f16 %f590, %rs370;} // end inline asm fma.rn.ftz.f32 %f825, %f823, %f590, %f817; mov.b32 {%rs371, %rs376}, %r343; // begin inline asm { cvt.f32.f16 %f591, %rs371;} // end inline asm fma.rn.ftz.f32 %f826, %f823, %f591, %f818; mov.b32 {%rs372, %rs377}, %r351; // begin inline asm { cvt.f32.f16 %f592, %rs372;} // end inline asm fma.rn.ftz.f32 %f827, %f823, %f592, %f819; mov.b32 {%rs373, %rs378}, %r359; // begin inline asm { cvt.f32.f16 %f593, %rs373;} // end inline asm fma.rn.ftz.f32 %f828, %f823, %f593, %f820; cvt.s16.s8 %rs549, %rs68; shr.s16 %rs550, %rs549, 7; and.b16 %rs551, %rs550, -16; or.b16 %rs552, %rs551, %rs20; cvt.rn.f32.s16 %f829, %rs552; sub.ftz.f32 %f830, %f829, %f7; mul.ftz.f32 %f831, %f57, %f830; // begin inline asm { cvt.f32.f16 %f594, %rs374;} // end inline asm fma.rn.ftz.f32 %f832, %f831, %f594, %f824; // begin inline asm { cvt.f32.f16 %f595, %rs375;} // end inline asm fma.rn.ftz.f32 %f833, %f831, %f595, %f825; // begin inline asm { cvt.f32.f16 %f596, %rs376;} // end inline asm fma.rn.ftz.f32 %f834, %f831, %f596, %f826; // begin inline asm { cvt.f32.f16 %f597, %rs377;} // end inline asm fma.rn.ftz.f32 %f835, %f831, %f597, %f827; // begin inline asm { cvt.f32.f16 %f598, %rs378;} // end inline asm fma.rn.ftz.f32 %f836, %f831, %f598, %f828; cvt.s16.s8 %rs554, %rs69; shr.s16 %rs555, %rs554, 7; and.b16 %rs556, %rs555, -16; or.b16 %rs557, %rs556, %rs21; cvt.rn.f32.s16 %f837, %rs557; sub.ftz.f32 %f838, %f837, %f7; mul.ftz.f32 %f839, %f57, %f838; mov.b32 {%rs379, %rs384}, %r328; // begin inline asm { cvt.f32.f16 %f599, %rs379;} // end inline asm fma.rn.ftz.f32 %f840, %f839, %f599, %f832; mov.b32 {%rs380, %rs385}, %r336; // begin inline asm { cvt.f32.f16 %f600, %rs380;} // end inline asm fma.rn.ftz.f32 %f841, %f839, %f600, %f833; mov.b32 {%rs381, %rs386}, %r344; // begin inline asm { cvt.f32.f16 %f601, %rs381;} // end inline asm fma.rn.ftz.f32 %f842, %f839, %f601, %f834; mov.b32 {%rs382, %rs387}, %r352; // begin inline asm { cvt.f32.f16 %f602, %rs382;} // end inline asm fma.rn.ftz.f32 %f843, %f839, %f602, %f835; mov.b32 {%rs383, %rs388}, %r360; // begin inline asm { cvt.f32.f16 %f603, %rs383;} // end inline asm fma.rn.ftz.f32 %f844, %f839, %f603, %f836; shr.s16 %rs558, %rs68, 15; and.b16 %rs559, %rs558, -16; or.b16 %rs560, %rs559, %rs22; cvt.rn.f32.s16 %f845, %rs560; sub.ftz.f32 %f846, %f845, %f7; mul.ftz.f32 %f847, %f57, %f846; // begin inline asm { cvt.f32.f16 %f604, %rs384;} // end inline asm fma.rn.ftz.f32 %f848, %f847, %f604, %f840; // begin inline asm { cvt.f32.f16 %f605, %rs385;} // end inline asm fma.rn.ftz.f32 %f849, %f847, %f605, 
%f841; // begin inline asm { cvt.f32.f16 %f606, %rs386;} // end inline asm fma.rn.ftz.f32 %f850, %f847, %f606, %f842; // begin inline asm { cvt.f32.f16 %f607, %rs387;} // end inline asm fma.rn.ftz.f32 %f851, %f847, %f607, %f843; // begin inline asm { cvt.f32.f16 %f608, %rs388;} // end inline asm fma.rn.ftz.f32 %f852, %f847, %f608, %f844; shl.b16 %rs562, %rs71, 4; cvt.s16.s8 %rs563, %rs562; shr.s16 %rs564, %rs563, 7; and.b16 %rs565, %rs564, -16; or.b16 %rs566, %rs565, %rs23; cvt.rn.f32.s16 %f853, %rs566; sub.ftz.f32 %f854, %f853, %f7; mul.ftz.f32 %f855, %f57, %f854; mov.b32 {%rs389, %rs394}, %r329; // begin inline asm { cvt.f32.f16 %f609, %rs389;} // end inline asm fma.rn.ftz.f32 %f856, %f855, %f609, %f848; mov.b32 {%rs390, %rs395}, %r337; // begin inline asm { cvt.f32.f16 %f610, %rs390;} // end inline asm fma.rn.ftz.f32 %f857, %f855, %f610, %f849; mov.b32 {%rs391, %rs396}, %r345; // begin inline asm { cvt.f32.f16 %f611, %rs391;} // end inline asm fma.rn.ftz.f32 %f858, %f855, %f611, %f850; mov.b32 {%rs392, %rs397}, %r353; // begin inline asm { cvt.f32.f16 %f612, %rs392;} // end inline asm fma.rn.ftz.f32 %f859, %f855, %f612, %f851; mov.b32 {%rs393, %rs398}, %r361; // begin inline asm { cvt.f32.f16 %f613, %rs393;} // end inline asm fma.rn.ftz.f32 %f860, %f855, %f613, %f852; shl.b16 %rs567, %rs24, 4; cvt.s16.s8 %rs568, %rs567; shr.s16 %rs569, %rs568, 7; and.b16 %rs570, %rs569, -16; or.b16 %rs571, %rs570, %rs25; cvt.rn.f32.s16 %f861, %rs571; sub.ftz.f32 %f862, %f861, %f7; mul.ftz.f32 %f863, %f57, %f862; // begin inline asm { cvt.f32.f16 %f614, %rs394;} // end inline asm fma.rn.ftz.f32 %f864, %f863, %f614, %f856; // begin inline asm { cvt.f32.f16 %f615, %rs395;} // end inline asm fma.rn.ftz.f32 %f865, %f863, %f615, %f857; // begin inline asm { cvt.f32.f16 %f616, %rs396;} // end inline asm fma.rn.ftz.f32 %f866, %f863, %f616, %f858; // begin inline asm { cvt.f32.f16 %f617, %rs397;} // end inline asm fma.rn.ftz.f32 %f867, %f863, %f617, %f859; // begin inline asm { cvt.f32.f16 %f618, %rs398;} // end inline asm fma.rn.ftz.f32 %f868, %f863, %f618, %f860; shl.b16 %rs572, %rs26, 4; cvt.s16.s8 %rs573, %rs572; shr.s16 %rs574, %rs573, 7; and.b16 %rs575, %rs574, -16; or.b16 %rs576, %rs575, %rs27; cvt.rn.f32.s16 %f869, %rs576; sub.ftz.f32 %f870, %f869, %f7; mul.ftz.f32 %f871, %f57, %f870; mov.b32 {%rs399, %rs404}, %r330; // begin inline asm { cvt.f32.f16 %f619, %rs399;} // end inline asm fma.rn.ftz.f32 %f872, %f871, %f619, %f864; mov.b32 {%rs400, %rs405}, %r338; // begin inline asm { cvt.f32.f16 %f620, %rs400;} // end inline asm fma.rn.ftz.f32 %f873, %f871, %f620, %f865; mov.b32 {%rs401, %rs406}, %r346; // begin inline asm { cvt.f32.f16 %f621, %rs401;} // end inline asm fma.rn.ftz.f32 %f874, %f871, %f621, %f866; mov.b32 {%rs402, %rs407}, %r354; // begin inline asm { cvt.f32.f16 %f622, %rs402;} // end inline asm fma.rn.ftz.f32 %f875, %f871, %f622, %f867; mov.b32 {%rs403, %rs408}, %r362; // begin inline asm { cvt.f32.f16 %f623, %rs403;} // end inline asm fma.rn.ftz.f32 %f876, %f871, %f623, %f868; shl.b16 %rs577, %rs28, 4; cvt.s16.s8 %rs578, %rs577; shr.s16 %rs579, %rs578, 7; and.b16 %rs580, %rs579, -16; or.b16 %rs581, %rs580, %rs28; cvt.rn.f32.s16 %f877, %rs581; sub.ftz.f32 %f878, %f877, %f7; mul.ftz.f32 %f879, %f57, %f878; // begin inline asm { cvt.f32.f16 %f624, %rs404;} // end inline asm fma.rn.ftz.f32 %f1004, %f879, %f624, %f872; // begin inline asm { cvt.f32.f16 %f625, %rs405;} // end inline asm fma.rn.ftz.f32 %f1003, %f879, %f625, %f873; // begin inline asm { cvt.f32.f16 %f626, %rs406;} // end inline asm 
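// Annotation: the three FMAs just below complete this branch; both dequantization paths leave
// the five per-column partial sums in %f1000-%f1004. $L__BB0_8 then advances the loop index
// (add.s32 %r475, %r475, 4) and branches back to $L__BB0_2 while %p7 holds; from $L__BB0_9 on,
// the partials are staged in the shared temp_storage buffer and folded with shfl.sync.down,
// consistent with cub::BlockReduce (note the cub::EmptyKernel<void> stub at the end of the module).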
fma.rn.ftz.f32 %f1002, %f879, %f626, %f874; // begin inline asm { cvt.f32.f16 %f627, %rs407;} // end inline asm fma.rn.ftz.f32 %f1001, %f879, %f627, %f875; // begin inline asm { cvt.f32.f16 %f628, %rs408;} // end inline asm fma.rn.ftz.f32 %f1000, %f879, %f628, %f876; $L__BB0_8: add.s32 %r475, %r475, 4; shl.b32 %r368, %r475, 5; add.s32 %r474, %r368, %r52; shl.b32 %r473, %r474, 2; setp.lt.u32 %p7, %r473, %r49; @%p7 bra $L__BB0_2; $L__BB0_9: mov.u32 %r468, %tid.y; mov.u32 %r467, %tid.x; shl.b32 %r466, %r468, 5; add.s32 %r465, %r466, %r467; shl.b32 %r370, %r465, 2; mov.u32 %r371, _ZZ9gemv_int4ILi4ELi128ELi5EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage; add.s32 %r372, %r371, %r370; setp.lt.u32 %p8, %r465, 32; @%p8 bra $L__BB0_11; add.s32 %r460, %r372, -112; st.shared.f32 [%r460], %f1004; $L__BB0_11: mov.u32 %r472, %tid.y; mov.u32 %r471, %tid.x; shl.b32 %r470, %r472, 5; add.s32 %r469, %r470, %r471; setp.gt.u32 %p9, %r469, 31; bar.sync 0; mad.lo.s32 %r42, %r469, 12, %r371; @%p9 bra $L__BB0_13; mov.u32 %r387, 16; ld.shared.f32 %f895, [%r42+16]; add.ftz.f32 %f896, %f1004, %f895; ld.shared.f32 %f897, [%r42+20]; add.ftz.f32 %f898, %f896, %f897; ld.shared.f32 %f899, [%r42+24]; add.ftz.f32 %f882, %f898, %f899; mov.u32 %r375, 1; mov.u32 %r388, 31; mov.u32 %r389, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f882, %r375, %r388, %r389; @p add.f32 r0, r0, %f882; mov.f32 %f880, r0;} // end inline asm mov.u32 %r378, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f880, %r378, %r388, %r389; @p add.f32 r0, r0, %f880; mov.f32 %f883, r0;} // end inline asm mov.u32 %r381, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f883, %r381, %r388, %r389; @p add.f32 r0, r0, %f883; mov.f32 %f886, r0;} // end inline asm mov.u32 %r384, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f886, %r384, %r388, %r389; @p add.f32 r0, r0, %f886; mov.f32 %f889, r0;} // end inline asm // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f889, %r387, %r388, %r389; @p add.f32 r0, r0, %f889; mov.f32 %f1004, r0;} // end inline asm $L__BB0_13: @%p8 bra $L__BB0_15; add.s32 %r461, %r372, -112; st.shared.f32 [%r461+640], %f1003; $L__BB0_15: bar.sync 0; @%p9 bra $L__BB0_17; ld.shared.f32 %f915, [%r42+656]; add.ftz.f32 %f916, %f1003, %f915; ld.shared.f32 %f917, [%r42+660]; add.ftz.f32 %f918, %f916, %f917; ld.shared.f32 %f919, [%r42+664]; add.ftz.f32 %f902, %f918, %f919; mov.u32 %r391, 1; mov.u32 %r404, 31; mov.u32 %r405, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f902, %r391, %r404, %r405; @p add.f32 r0, r0, %f902; mov.f32 %f900, r0;} // end inline asm mov.u32 %r394, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f900, %r394, %r404, %r405; @p add.f32 r0, r0, %f900; mov.f32 %f903, r0;} // end inline asm mov.u32 %r397, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f903, %r397, %r404, %r405; @p add.f32 r0, r0, %f903; mov.f32 %f906, r0;} // end inline asm mov.u32 %r400, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f906, %r400, %r404, %r405; @p add.f32 r0, r0, %f906; mov.f32 %f909, r0;} // end inline asm mov.u32 %r403, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f909, %r403, %r404, %r405; @p add.f32 r0, r0, %f909; mov.f32 %f1003, r0;} // end inline asm $L__BB0_17: @%p8 bra $L__BB0_19; add.s32 %r462, %r372, -112; st.shared.f32 
[%r462+1280], %f1002; $L__BB0_19: bar.sync 0; @%p9 bra $L__BB0_21; ld.shared.f32 %f935, [%r42+1296]; add.ftz.f32 %f936, %f1002, %f935; ld.shared.f32 %f937, [%r42+1300]; add.ftz.f32 %f938, %f936, %f937; ld.shared.f32 %f939, [%r42+1304]; add.ftz.f32 %f922, %f938, %f939; mov.u32 %r407, 1; mov.u32 %r420, 31; mov.u32 %r421, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f922, %r407, %r420, %r421; @p add.f32 r0, r0, %f922; mov.f32 %f920, r0;} // end inline asm mov.u32 %r410, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f920, %r410, %r420, %r421; @p add.f32 r0, r0, %f920; mov.f32 %f923, r0;} // end inline asm mov.u32 %r413, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f923, %r413, %r420, %r421; @p add.f32 r0, r0, %f923; mov.f32 %f926, r0;} // end inline asm mov.u32 %r416, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f926, %r416, %r420, %r421; @p add.f32 r0, r0, %f926; mov.f32 %f929, r0;} // end inline asm mov.u32 %r419, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f929, %r419, %r420, %r421; @p add.f32 r0, r0, %f929; mov.f32 %f1002, r0;} // end inline asm $L__BB0_21: @%p8 bra $L__BB0_23; add.s32 %r463, %r372, -112; st.shared.f32 [%r463+1920], %f1001; $L__BB0_23: bar.sync 0; @%p9 bra $L__BB0_25; ld.shared.f32 %f955, [%r42+1936]; add.ftz.f32 %f956, %f1001, %f955; ld.shared.f32 %f957, [%r42+1940]; add.ftz.f32 %f958, %f956, %f957; ld.shared.f32 %f959, [%r42+1944]; add.ftz.f32 %f942, %f958, %f959; mov.u32 %r423, 1; mov.u32 %r436, 31; mov.u32 %r437, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f942, %r423, %r436, %r437; @p add.f32 r0, r0, %f942; mov.f32 %f940, r0;} // end inline asm mov.u32 %r426, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f940, %r426, %r436, %r437; @p add.f32 r0, r0, %f940; mov.f32 %f943, r0;} // end inline asm mov.u32 %r429, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f943, %r429, %r436, %r437; @p add.f32 r0, r0, %f943; mov.f32 %f946, r0;} // end inline asm mov.u32 %r432, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f946, %r432, %r436, %r437; @p add.f32 r0, r0, %f946; mov.f32 %f949, r0;} // end inline asm mov.u32 %r435, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f949, %r435, %r436, %r437; @p add.f32 r0, r0, %f949; mov.f32 %f1001, r0;} // end inline asm $L__BB0_25: @%p8 bra $L__BB0_27; add.s32 %r464, %r372, -112; st.shared.f32 [%r464+2560], %f1000; $L__BB0_27: bar.sync 0; @%p9 bra $L__BB0_29; ld.shared.f32 %f975, [%r42+2576]; add.ftz.f32 %f976, %f1000, %f975; ld.shared.f32 %f977, [%r42+2580]; add.ftz.f32 %f978, %f976, %f977; ld.shared.f32 %f979, [%r42+2584]; add.ftz.f32 %f962, %f978, %f979; mov.u32 %r439, 1; mov.u32 %r452, 31; mov.u32 %r453, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f962, %r439, %r452, %r453; @p add.f32 r0, r0, %f962; mov.f32 %f960, r0;} // end inline asm mov.u32 %r442, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f960, %r442, %r452, %r453; @p add.f32 r0, r0, %f960; mov.f32 %f963, r0;} // end inline asm mov.u32 %r445, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f963, %r445, %r452, %r453; @p add.f32 r0, r0, %f963; mov.f32 %f966, r0;} // end inline asm mov.u32 %r448, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; 
shfl.sync.down.b32 r0|p, %f966, %r448, %r452, %r453; @p add.f32 r0, r0, %f966; mov.f32 %f969, r0;} // end inline asm mov.u32 %r451, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f969, %r451, %r452, %r453; @p add.f32 r0, r0, %f969; mov.f32 %f1000, r0;} // end inline asm $L__BB0_29: mov.u32 %r454, %tid.y; or.b32 %r456, %r52, %r454; setp.ne.s32 %p18, %r456, 0; @%p18 bra $L__BB0_33; ld.param.u64 %rd60, [_Z28dequant_gemv_group128_batch523DequantGemvKernelParams_param_0+8]; ld.param.u64 %rd59, [_Z28dequant_gemv_group128_batch523DequantGemvKernelParams_param_0]; cvta.to.global.u64 %rd41, %rd59; setp.eq.s64 %p19, %rd60, 0; mul.ftz.f32 %f38, %f50, %f1004; mov.u32 %r457, %ctaid.x; cvt.s64.s32 %rd11, %r457; mul.wide.s32 %rd42, %r457, 2; add.s64 %rd12, %rd41, %rd42; mul.ftz.f32 %f39, %f50, %f1003; add.s32 %r458, %r48, %r457; cvt.s64.s32 %rd13, %r48; mul.wide.s32 %rd43, %r48, 2; add.s64 %rd14, %rd12, %rd43; mul.ftz.f32 %f40, %f50, %f1002; add.s32 %r459, %r458, %r48; cvt.s64.s32 %rd15, %r459; mul.wide.s32 %rd44, %r459, 2; add.s64 %rd17, %rd41, %rd44; mul.ftz.f32 %f41, %f50, %f1001; mul.ftz.f32 %f42, %f50, %f1000; @%p19 bra $L__BB0_32; ld.param.u64 %rd61, [_Z28dequant_gemv_group128_batch523DequantGemvKernelParams_param_0+8]; cvta.to.global.u64 %rd45, %rd61; shl.b64 %rd46, %rd11, 1; add.s64 %rd47, %rd45, %rd46; ld.global.u16 %rs582, [%rd47]; // begin inline asm { cvt.f32.f16 %f980, %rs582;} // end inline asm fma.rn.ftz.f32 %f981, %f51, %f980, %f38; // begin inline asm { cvt.rn.f16.f32 %rs583, %f981;} // end inline asm st.global.u16 [%rd12], %rs583; shl.b64 %rd48, %rd13, 1; add.s64 %rd49, %rd47, %rd48; ld.global.u16 %rs584, [%rd49]; // begin inline asm { cvt.f32.f16 %f982, %rs584;} // end inline asm fma.rn.ftz.f32 %f983, %f51, %f982, %f39; // begin inline asm { cvt.rn.f16.f32 %rs585, %f983;} // end inline asm st.global.u16 [%rd14], %rs585; shl.b64 %rd50, %rd15, 1; add.s64 %rd51, %rd45, %rd50; ld.global.u16 %rs586, [%rd51]; // begin inline asm { cvt.f32.f16 %f984, %rs586;} // end inline asm fma.rn.ftz.f32 %f985, %f51, %f984, %f40; // begin inline asm { cvt.rn.f16.f32 %rs587, %f985;} // end inline asm st.global.u16 [%rd17], %rs587; add.s64 %rd52, %rd51, %rd48; ld.global.u16 %rs588, [%rd52]; // begin inline asm { cvt.f32.f16 %f986, %rs588;} // end inline asm fma.rn.ftz.f32 %f987, %f51, %f986, %f41; // begin inline asm { cvt.rn.f16.f32 %rs589, %f987;} // end inline asm add.s64 %rd53, %rd17, %rd48; st.global.u16 [%rd53], %rs589; add.s64 %rd54, %rd52, %rd48; ld.global.u16 %rs590, [%rd54]; // begin inline asm { cvt.f32.f16 %f988, %rs590;} // end inline asm fma.rn.ftz.f32 %f989, %f51, %f988, %f42; // begin inline asm { cvt.rn.f16.f32 %rs591, %f989;} // end inline asm add.s64 %rd55, %rd53, %rd48; st.global.u16 [%rd55], %rs591; bra.uni $L__BB0_33; $L__BB0_32: // begin inline asm { cvt.rn.f16.f32 %rs592, %f38;} // end inline asm st.global.u16 [%rd12], %rs592; // begin inline asm { cvt.rn.f16.f32 %rs593, %f39;} // end inline asm st.global.u16 [%rd14], %rs593; // begin inline asm { cvt.rn.f16.f32 %rs594, %f40;} // end inline asm st.global.u16 [%rd17], %rs594; // begin inline asm { cvt.rn.f16.f32 %rs595, %f41;} // end inline asm shl.b64 %rd56, %rd13, 1; add.s64 %rd57, %rd17, %rd56; st.global.u16 [%rd57], %rs595; // begin inline asm { cvt.rn.f16.f32 %rs596, %f42;} // end inline asm add.s64 %rd58, %rd57, %rd56; st.global.u16 [%rd58], %rs596; $L__BB0_33: ret; } // .globl _ZN3cub11EmptyKernelIvEEvv .visible .entry _ZN3cub11EmptyKernelIvEEvv() { ret; }
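// End of module. Everything below this line is annotation, not compiler output.
//
// The last entry, _ZN3cub11EmptyKernelIvEEvv, is cub::EmptyKernel<void>, the stub CUB emits into
// every translation unit that uses it. The entry above it demangles to
// dequant_gemv_group128_batch5(DequantGemvKernelParams), and its shared-buffer symbol,
// _ZZ9gemv_int4ILi4ELi128ELi5EE...E12temp_storage, demangles to
//   gemv_int4<4, 128, 5>(__half*, const __half*, const __half*, const unsigned int*,
//                        const unsigned char*, const __half*, int, int, int, int,
//                        float, float, bool)::temp_storage
// i.e. the batch-5 instantiation of an int4-dequant GEMV template with quantization group
// size 128 (the leading template argument 4 is not identifiable from the PTX alone; the launch
// appears to use 4 warps of 32 threads, with warp 0 reducing the 96 partials staged by the rest).
//
// The sketch below is a hedged reconstruction of what the generated code does, not the original
// source. Parameter names, the index arithmetic, which operand is int4-quantized, the meaning of
// the bool flag, and the launch shape are assumptions; only the dequantization recipe
// (scale * (q - zero), with unsigned or sign-extended nibbles), the per-thread FMA accumulation
// over BATCH columns, the shared-memory plus warp-shuffle block reduction, and the alpha/beta
// epilogue are read off the PTX. Two of the four int arguments are dropped for brevity.

#include <cuda_fp16.h>

// Hypothetical parameter names; only the types come from the demangled signature above.
template <int WARPS, int GROUP_SIZE, int BATCH>
__global__ void gemv_int4_sketch(
    __half* out,                  // BATCH output columns
    const __half* dense,          // fp16 operand, column stride k assumed
    const __half* scales,         // one fp16 scale per GROUP_SIZE block
    const unsigned int* quant,    // packed int4, 8 values per 32-bit word
    const unsigned char* zeros,   // packed int4 zero-points, 2 per byte (may be null)
    const __half* bias,           // optional epilogue addend (may be null)
    int k, int n,
    float alpha, float beta, bool unsigned_quant)
{
    const int lane = threadIdx.x, warp = threadIdx.y;
    const int tid  = warp * 32 + lane;
    const int row  = blockIdx.x;

    float acc[BATCH] = {};

    // Each thread walks the reduction dimension in strides of 8 packed values.
    for (int col = tid * 8; col < k; col += WARPS * 32 * 8) {
        const int group   = col / GROUP_SIZE;
        const float scale = __half2float(scales[row * (k / GROUP_SIZE) + group]);

        // Zero-point: a packed 4-bit value if provided, otherwise 8 for the
        // unsigned encoding and 0 for the signed one.
        int zp = unsigned_quant ? 8 : 0;
        if (zeros) {
            const int zidx = row * (k / GROUP_SIZE) + group;
            zp = (zeros[zidx / 2] >> ((zidx & 1) * 4)) & 0xF;
        }

        const unsigned int packed = quant[(row * k + col) / 8];
        for (int j = 0; j < 8; ++j) {
            int q = (packed >> (4 * j)) & 0xF;
            if (!unsigned_quant && (q & 0x8)) q -= 16;   // sign-extend the nibble
            const float w = scale * float(q - zp);
            for (int b = 0; b < BATCH; ++b)
                acc[b] += w * __half2float(dense[b * k + col + j]);
        }
    }

    // Block reduction: stage per-thread partials in shared memory, let warp 0
    // fold them, then finish with warp shuffles (the temp_storage symbol in the
    // PTX suggests cub::BlockReduce performs this step in the real kernel).
    __shared__ float stage[BATCH][WARPS * 32];
    for (int b = 0; b < BATCH; ++b) stage[b][tid] = acc[b];
    __syncthreads();

    if (warp == 0) {
        for (int b = 0; b < BATCH; ++b) {
            float v = 0.f;
            for (int w = 0; w < WARPS; ++w) v += stage[b][w * 32 + lane];
            for (int off = 16; off > 0; off >>= 1)
                v += __shfl_down_sync(0xffffffffu, v, off);
            if (lane == 0) {
                float r = alpha * v;
                if (bias) r += beta * __half2float(bias[b * n + row]);
                out[b * n + row] = __float2half(r);
            }
        }
    }
}

// Batching several right-hand-side columns per thread block is what makes the dequantization pay
// off: each 4-bit value is unpacked and scaled once and then reused in BATCH FMAs, which is
// presumably why the template is instantiated once per batch size.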