.b8 _ZN37_INTERNAL_36609967_7_gemv_cu_edddc3196thrust12placeholders2_5E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_edddc3196thrust12placeholders2_6E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_edddc3196thrust12placeholders2_7E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_edddc3196thrust12placeholders2_8E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_edddc3196thrust12placeholders2_9E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_edddc3196thrust12placeholders3_10E[1]; .visible .entry _Z28dequant_gemv_group128_batch623DequantGemvKernelParams( .param .align 8 .b8 _Z28dequant_gemv_group128_batch623DequantGemvKernelParams_param_0[80] ) { .reg .pred %p<27>; .reg .b16 %rs<654>; .reg .f32 %f<1192>; .reg .b32 %r<595>; .reg .b64 %rd<90>; // demoted variable .shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi128ELi6EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[3840]; ld.param.v2.u32 {%r53, %r54}, [_Z28dequant_gemv_group128_batch623DequantGemvKernelParams_param_0+48]; ld.param.v2.u32 {%r55, %r56}, [_Z28dequant_gemv_group128_batch623DequantGemvKernelParams_param_0+56]; ld.param.v2.f32 {%f73, %f74}, [_Z28dequant_gemv_group128_batch623DequantGemvKernelParams_param_0+64]; ld.param.v4.u8 {%rs73, %rs74, %rs75, %rs76}, [_Z28dequant_gemv_group128_batch623DequantGemvKernelParams_param_0+72]; ld.param.u64 %rd30, [_Z28dequant_gemv_group128_batch623DequantGemvKernelParams_param_0+40]; ld.param.u64 %rd29, [_Z28dequant_gemv_group128_batch623DequantGemvKernelParams_param_0+32]; ld.param.u64 %rd28, [_Z28dequant_gemv_group128_batch623DequantGemvKernelParams_param_0+24]; ld.param.u64 %rd27, [_Z28dequant_gemv_group128_batch623DequantGemvKernelParams_param_0+16]; ld.param.u64 %rd26, [_Z28dequant_gemv_group128_batch623DequantGemvKernelParams_param_0+8]; cvta.to.global.u64 %rd2, %rd26; mov.u32 %r594, %tid.y; shl.b32 %r57, %r594, 5; mov.u32 %r58, %tid.x; add.s32 %r593, %r57, %r58; shl.b32 %r592, %r593, 2; setp.ge.u32 %p1, %r592, %r55; 
mov.f32 %f1168, 0f00000000; mov.f32 %f1169, %f1168; mov.f32 %f1170, %f1168; mov.f32 %f1171, %f1168; mov.f32 %f1172, %f1168; mov.f32 %f1173, %f1168; @%p1 bra $L__BB0_9; cvta.to.global.u64 %rd3, %rd27; mov.u32 %r59, %ctaid.x; mul.lo.s32 %r6, %r56, %r59; $L__BB0_2: mad.lo.s32 %r65, %r55, %r59, %r592; mul.wide.u32 %rd37, %r65, 4; add.s64 %rd32, %rd28, %rd37; // begin inline asm createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd31, 1.0; // end inline asm // begin inline asm ld.global.L1::no_allocate.L2::cache_hint.v4.u32 {%r60,%r61,%r62,%r63}, [%rd32], %rd31; // end inline asm shr.u32 %r67, %r58, 2; shl.b32 %r68, %r594, 3; add.s32 %r14, %r68, %r67; add.s32 %r15, %r14, %r6; mul.wide.s32 %rd38, %r15, 2; add.s64 %rd35, %rd30, %rd38; // begin inline asm createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd34, 1.0; // end inline asm // begin inline asm ld.global.L1::evict_last.L2::cache_hint.u16 %rs81, [%rd35], %rd34; // end inline asm // begin inline asm { cvt.f32.f16 %f81, %rs81;} // end inline asm shl.b16 %rs653, %rs73, 3; setp.eq.s64 %p2, %rd29, 0; @%p2 bra $L__BB0_4; shr.u32 %r69, %r15, 31; add.s32 %r70, %r15, %r69; shr.s32 %r71, %r70, 1; cvt.s64.s32 %rd42, %r71; add.s64 %rd40, %rd29, %rd42; // begin inline asm createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd39, 1.0; // end inline asm // begin inline asm ld.global.L1::evict_last.L2::cache_hint.u8 %rs83, [%rd40], %rd39; // end inline asm cvt.u32.u16 %r72, %rs83; and.b32 %r73, %r72, 255; shl.b32 %r74, %r14, 2; and.b32 %r75, %r74, 4; shr.u32 %r76, %r73, %r75; cvt.u16.u32 %rs84, %r76; and.b16 %rs653, %rs84, 15; $L__BB0_4: shl.b32 %r16, %r593, 5; setp.ge.s32 %p3, %r16, %r53; @%p3 bra $L__BB0_8; setp.eq.s16 %p4, %rs73, 0; shr.u16 %rs86, %rs653, 3; and.b16 %rs87, %rs86, 1; setp.eq.b16 %p5, %rs87, 1; and.pred %p6, %p4, %p5; selp.b16 %rs88, -16, 0, %p6; or.b16 %rs89, %rs88, %rs653; cvt.s16.s8 %rs90, %rs89; cvt.rn.f32.s16 %f8, %rs90; mul.wide.s32 %rd43, %r16, 2; add.s64 %rd7, %rd3, %rd43; 
ld.global.v4.u32 {%r77, %r78, %r79, %r80}, [%rd7]; mul.wide.s32 %rd44, %r53, 2; add.s64 %rd45, %rd7, %rd44; ld.global.v4.u32 {%r81, %r82, %r83, %r84}, [%rd45]; add.s32 %r85, %r16, %r53; add.s32 %r86, %r85, %r53; mul.wide.s32 %rd46, %r86, 2; add.s64 %rd8, %rd3, %rd46; ld.global.v4.u32 {%r87, %r88, %r89, %r90}, [%rd8]; add.s64 %rd47, %rd8, %rd44; ld.global.v4.u32 {%r91, %r92, %r93, %r94}, [%rd47]; add.s64 %rd48, %rd47, %rd44; ld.global.v4.u32 {%r95, %r96, %r97, %r98}, [%rd48]; add.s64 %rd49, %rd48, %rd44; ld.global.v4.u32 {%r99, %r100, %r101, %r102}, [%rd49]; cvt.u16.u32 %rs5, %r60; and.b16 %rs6, %rs5, 15; shr.u32 %r103, %r60, 4; cvt.u16.u32 %rs7, %r103; and.b16 %rs8, %rs7, 15; shr.u32 %r104, %r60, 8; cvt.u16.u32 %rs9, %r104; and.b16 %rs10, %rs9, 15; shr.u32 %r105, %r60, 12; cvt.u16.u32 %rs11, %r105; and.b16 %rs12, %rs11, 15; shr.u32 %r106, %r60, 16; cvt.u16.u32 %rs13, %r106; and.b16 %rs14, %rs13, 15; shr.u32 %r107, %r60, 20; cvt.u16.u32 %rs15, %r107; and.b16 %rs16, %rs15, 15; shr.u32 %r108, %r60, 24; cvt.u16.u32 %rs17, %r108; and.b16 %rs18, %rs17, 15; shr.u32 %r109, %r60, 28; cvt.u16.u32 %rs19, %r109; add.s32 %r110, %r85, 8; mul.wide.s32 %rd50, %r110, 2; add.s64 %rd9, %rd3, %rd50; add.s32 %r111, %r110, %r53; add.s32 %r112, %r111, %r53; mul.wide.s32 %rd51, %r112, 2; add.s64 %rd10, %rd3, %rd51; add.s32 %r113, %r112, %r53; mul.wide.s32 %rd52, %r113, 2; add.s64 %rd11, %rd3, %rd52; add.s32 %r114, %r113, %r53; mul.wide.s32 %rd53, %r114, 2; add.s64 %rd12, %rd3, %rd53; cvt.u16.u32 %rs20, %r61; and.b16 %rs21, %rs20, 15; shr.u32 %r115, %r61, 4; cvt.u16.u32 %rs22, %r115; and.b16 %rs23, %rs22, 15; shr.u32 %r116, %r61, 8; cvt.u16.u32 %rs24, %r116; and.b16 %rs25, %rs24, 15; shr.u32 %r117, %r61, 12; cvt.u16.u32 %rs26, %r117; and.b16 %rs27, %rs26, 15; shr.u32 %r118, %r61, 16; cvt.u16.u32 %rs28, %r118; and.b16 %rs29, %rs28, 15; shr.u32 %r119, %r61, 20; cvt.u16.u32 %rs30, %r119; and.b16 %rs31, %rs30, 15; shr.u32 %r120, %r61, 24; cvt.u16.u32 %rs32, %r120; and.b16 %rs33, %rs32, 15; 
shr.u32 %r121, %r61, 28; cvt.u16.u32 %rs34, %r121; cvt.u16.u32 %rs35, %r62; and.b16 %rs36, %rs35, 15; shr.u32 %r122, %r62, 4; cvt.u16.u32 %rs37, %r122; and.b16 %rs38, %rs37, 15; shr.u32 %r123, %r62, 8; cvt.u16.u32 %rs39, %r123; and.b16 %rs40, %rs39, 15; shr.u32 %r124, %r62, 12; cvt.u16.u32 %rs41, %r124; and.b16 %rs42, %rs41, 15; shr.u32 %r125, %r62, 16; cvt.u16.u32 %rs43, %r125; and.b16 %rs44, %rs43, 15; shr.u32 %r126, %r62, 20; cvt.u16.u32 %rs45, %r126; and.b16 %rs46, %rs45, 15; shr.u32 %r127, %r62, 24; cvt.u16.u32 %rs47, %r127; and.b16 %rs48, %rs47, 15; shr.u32 %r128, %r62, 28; cvt.u16.u32 %rs49, %r128; cvt.u16.u32 %rs50, %r63; and.b16 %rs51, %rs50, 15; shr.u32 %r129, %r63, 4; cvt.u16.u32 %rs52, %r129; and.b16 %rs53, %rs52, 15; shr.u32 %r130, %r63, 8; cvt.u16.u32 %rs54, %r130; and.b16 %rs55, %rs54, 15; shr.u32 %r131, %r63, 12; cvt.u16.u32 %rs56, %r131; and.b16 %rs57, %rs56, 15; shr.u32 %r132, %r63, 16; cvt.u16.u32 %rs58, %r132; and.b16 %rs59, %rs58, 15; shr.u32 %r133, %r63, 20; cvt.u16.u32 %rs60, %r133; and.b16 %rs61, %rs60, 15; shr.u32 %r134, %r63, 24; cvt.u16.u32 %rs62, %r134; and.b16 %rs63, %rs62, 15; shr.u32 %r135, %r63, 28; cvt.u16.u32 %rs64, %r135; @%p4 bra $L__BB0_7; cvt.rn.f32.s16 %f274, %rs6; sub.ftz.f32 %f275, %f274, %f8; mul.ftz.f32 %f276, %f81, %f275; mov.b32 {%rs91, %rs97}, %r77; // begin inline asm { cvt.f32.f16 %f82, %rs91;} // end inline asm fma.rn.ftz.f32 %f277, %f276, %f82, %f1173; mov.b32 {%rs92, %rs98}, %r81; // begin inline asm { cvt.f32.f16 %f83, %rs92;} // end inline asm fma.rn.ftz.f32 %f278, %f276, %f83, %f1172; mov.b32 {%rs93, %rs99}, %r87; // begin inline asm { cvt.f32.f16 %f84, %rs93;} // end inline asm fma.rn.ftz.f32 %f279, %f276, %f84, %f1171; mov.b32 {%rs94, %rs100}, %r91; // begin inline asm { cvt.f32.f16 %f85, %rs94;} // end inline asm fma.rn.ftz.f32 %f280, %f276, %f85, %f1170; mov.b32 {%rs95, %rs101}, %r95; // begin inline asm { cvt.f32.f16 %f86, %rs95;} // end inline asm fma.rn.ftz.f32 %f281, %f276, %f86, %f1169; mov.b32 {%rs96, 
%rs102}, %r99; // begin inline asm { cvt.f32.f16 %f87, %rs96;} // end inline asm fma.rn.ftz.f32 %f282, %f276, %f87, %f1168; cvt.rn.f32.s16 %f283, %rs8; sub.ftz.f32 %f284, %f283, %f8; mul.ftz.f32 %f285, %f81, %f284; // begin inline asm { cvt.f32.f16 %f88, %rs97;} // end inline asm fma.rn.ftz.f32 %f286, %f285, %f88, %f277; // begin inline asm { cvt.f32.f16 %f89, %rs98;} // end inline asm fma.rn.ftz.f32 %f287, %f285, %f89, %f278; // begin inline asm { cvt.f32.f16 %f90, %rs99;} // end inline asm fma.rn.ftz.f32 %f288, %f285, %f90, %f279; // begin inline asm { cvt.f32.f16 %f91, %rs100;} // end inline asm fma.rn.ftz.f32 %f289, %f285, %f91, %f280; // begin inline asm { cvt.f32.f16 %f92, %rs101;} // end inline asm fma.rn.ftz.f32 %f290, %f285, %f92, %f281; // begin inline asm { cvt.f32.f16 %f93, %rs102;} // end inline asm fma.rn.ftz.f32 %f291, %f285, %f93, %f282; cvt.rn.f32.s16 %f292, %rs10; sub.ftz.f32 %f293, %f292, %f8; mul.ftz.f32 %f294, %f81, %f293; mov.b32 {%rs103, %rs109}, %r78; // begin inline asm { cvt.f32.f16 %f94, %rs103;} // end inline asm fma.rn.ftz.f32 %f295, %f294, %f94, %f286; mov.b32 {%rs104, %rs110}, %r82; // begin inline asm { cvt.f32.f16 %f95, %rs104;} // end inline asm fma.rn.ftz.f32 %f296, %f294, %f95, %f287; mov.b32 {%rs105, %rs111}, %r88; // begin inline asm { cvt.f32.f16 %f96, %rs105;} // end inline asm fma.rn.ftz.f32 %f297, %f294, %f96, %f288; mov.b32 {%rs106, %rs112}, %r92; // begin inline asm { cvt.f32.f16 %f97, %rs106;} // end inline asm fma.rn.ftz.f32 %f298, %f294, %f97, %f289; mov.b32 {%rs107, %rs113}, %r96; // begin inline asm { cvt.f32.f16 %f98, %rs107;} // end inline asm fma.rn.ftz.f32 %f299, %f294, %f98, %f290; mov.b32 {%rs108, %rs114}, %r100; // begin inline asm { cvt.f32.f16 %f99, %rs108;} // end inline asm fma.rn.ftz.f32 %f300, %f294, %f99, %f291; cvt.rn.f32.s16 %f301, %rs12; sub.ftz.f32 %f302, %f301, %f8; mul.ftz.f32 %f303, %f81, %f302; // begin inline asm { cvt.f32.f16 %f100, %rs109;} // end inline asm fma.rn.ftz.f32 %f304, %f303, 
%f100, %f295; // begin inline asm { cvt.f32.f16 %f101, %rs110;} // end inline asm fma.rn.ftz.f32 %f305, %f303, %f101, %f296; // begin inline asm { cvt.f32.f16 %f102, %rs111;} // end inline asm fma.rn.ftz.f32 %f306, %f303, %f102, %f297; // begin inline asm { cvt.f32.f16 %f103, %rs112;} // end inline asm fma.rn.ftz.f32 %f307, %f303, %f103, %f298; // begin inline asm { cvt.f32.f16 %f104, %rs113;} // end inline asm fma.rn.ftz.f32 %f308, %f303, %f104, %f299; // begin inline asm { cvt.f32.f16 %f105, %rs114;} // end inline asm fma.rn.ftz.f32 %f309, %f303, %f105, %f300; cvt.rn.f32.s16 %f310, %rs14; sub.ftz.f32 %f311, %f310, %f8; mul.ftz.f32 %f312, %f81, %f311; mov.b32 {%rs115, %rs121}, %r79; // begin inline asm { cvt.f32.f16 %f106, %rs115;} // end inline asm fma.rn.ftz.f32 %f313, %f312, %f106, %f304; mov.b32 {%rs116, %rs122}, %r83; // begin inline asm { cvt.f32.f16 %f107, %rs116;} // end inline asm fma.rn.ftz.f32 %f314, %f312, %f107, %f305; mov.b32 {%rs117, %rs123}, %r89; // begin inline asm { cvt.f32.f16 %f108, %rs117;} // end inline asm fma.rn.ftz.f32 %f315, %f312, %f108, %f306; mov.b32 {%rs118, %rs124}, %r93; // begin inline asm { cvt.f32.f16 %f109, %rs118;} // end inline asm fma.rn.ftz.f32 %f316, %f312, %f109, %f307; mov.b32 {%rs119, %rs125}, %r97; // begin inline asm { cvt.f32.f16 %f110, %rs119;} // end inline asm fma.rn.ftz.f32 %f317, %f312, %f110, %f308; mov.b32 {%rs120, %rs126}, %r101; // begin inline asm { cvt.f32.f16 %f111, %rs120;} // end inline asm fma.rn.ftz.f32 %f318, %f312, %f111, %f309; cvt.rn.f32.s16 %f319, %rs16; sub.ftz.f32 %f320, %f319, %f8; mul.ftz.f32 %f321, %f81, %f320; // begin inline asm { cvt.f32.f16 %f112, %rs121;} // end inline asm fma.rn.ftz.f32 %f322, %f321, %f112, %f313; // begin inline asm { cvt.f32.f16 %f113, %rs122;} // end inline asm fma.rn.ftz.f32 %f323, %f321, %f113, %f314; // begin inline asm { cvt.f32.f16 %f114, %rs123;} // end inline asm fma.rn.ftz.f32 %f324, %f321, %f114, %f315; // begin inline asm { cvt.f32.f16 %f115, %rs124;} // 
end inline asm fma.rn.ftz.f32 %f325, %f321, %f115, %f316; // begin inline asm { cvt.f32.f16 %f116, %rs125;} // end inline asm fma.rn.ftz.f32 %f326, %f321, %f116, %f317; // begin inline asm { cvt.f32.f16 %f117, %rs126;} // end inline asm fma.rn.ftz.f32 %f327, %f321, %f117, %f318; cvt.rn.f32.s16 %f328, %rs18; sub.ftz.f32 %f329, %f328, %f8; mul.ftz.f32 %f330, %f81, %f329; mov.b32 {%rs127, %rs133}, %r80; // begin inline asm { cvt.f32.f16 %f118, %rs127;} // end inline asm fma.rn.ftz.f32 %f331, %f330, %f118, %f322; mov.b32 {%rs128, %rs134}, %r84; // begin inline asm { cvt.f32.f16 %f119, %rs128;} // end inline asm fma.rn.ftz.f32 %f332, %f330, %f119, %f323; mov.b32 {%rs129, %rs135}, %r90; // begin inline asm { cvt.f32.f16 %f120, %rs129;} // end inline asm fma.rn.ftz.f32 %f333, %f330, %f120, %f324; mov.b32 {%rs130, %rs136}, %r94; // begin inline asm { cvt.f32.f16 %f121, %rs130;} // end inline asm fma.rn.ftz.f32 %f334, %f330, %f121, %f325; mov.b32 {%rs131, %rs137}, %r98; // begin inline asm { cvt.f32.f16 %f122, %rs131;} // end inline asm fma.rn.ftz.f32 %f335, %f330, %f122, %f326; mov.b32 {%rs132, %rs138}, %r102; // begin inline asm { cvt.f32.f16 %f123, %rs132;} // end inline asm fma.rn.ftz.f32 %f336, %f330, %f123, %f327; cvt.rn.f32.s16 %f337, %rs19; sub.ftz.f32 %f338, %f337, %f8; mul.ftz.f32 %f339, %f81, %f338; // begin inline asm { cvt.f32.f16 %f124, %rs133;} // end inline asm fma.rn.ftz.f32 %f340, %f339, %f124, %f331; // begin inline asm { cvt.f32.f16 %f125, %rs134;} // end inline asm fma.rn.ftz.f32 %f341, %f339, %f125, %f332; // begin inline asm { cvt.f32.f16 %f126, %rs135;} // end inline asm fma.rn.ftz.f32 %f342, %f339, %f126, %f333; // begin inline asm { cvt.f32.f16 %f127, %rs136;} // end inline asm fma.rn.ftz.f32 %f343, %f339, %f127, %f334; // begin inline asm { cvt.f32.f16 %f128, %rs137;} // end inline asm fma.rn.ftz.f32 %f344, %f339, %f128, %f335; // begin inline asm { cvt.f32.f16 %f129, %rs138;} // end inline asm fma.rn.ftz.f32 %f345, %f339, %f129, %f336; 
ld.global.v4.u32 {%r136, %r137, %r138, %r139}, [%rd7+16]; ld.global.v4.u32 {%r144, %r145, %r146, %r147}, [%rd9]; ld.global.v4.u32 {%r152, %r153, %r154, %r155}, [%rd8+16]; ld.global.v4.u32 {%r160, %r161, %r162, %r163}, [%rd10]; ld.global.v4.u32 {%r168, %r169, %r170, %r171}, [%rd11]; ld.global.v4.u32 {%r176, %r177, %r178, %r179}, [%rd12]; cvt.rn.f32.s16 %f346, %rs21; sub.ftz.f32 %f347, %f346, %f8; mul.ftz.f32 %f348, %f81, %f347; mov.b32 {%rs139, %rs145}, %r136; // begin inline asm { cvt.f32.f16 %f130, %rs139;} // end inline asm fma.rn.ftz.f32 %f349, %f348, %f130, %f340; mov.b32 {%rs140, %rs146}, %r144; // begin inline asm { cvt.f32.f16 %f131, %rs140;} // end inline asm fma.rn.ftz.f32 %f350, %f348, %f131, %f341; mov.b32 {%rs141, %rs147}, %r152; // begin inline asm { cvt.f32.f16 %f132, %rs141;} // end inline asm fma.rn.ftz.f32 %f351, %f348, %f132, %f342; mov.b32 {%rs142, %rs148}, %r160; // begin inline asm { cvt.f32.f16 %f133, %rs142;} // end inline asm fma.rn.ftz.f32 %f352, %f348, %f133, %f343; mov.b32 {%rs143, %rs149}, %r168; // begin inline asm { cvt.f32.f16 %f134, %rs143;} // end inline asm fma.rn.ftz.f32 %f353, %f348, %f134, %f344; mov.b32 {%rs144, %rs150}, %r176; // begin inline asm { cvt.f32.f16 %f135, %rs144;} // end inline asm fma.rn.ftz.f32 %f354, %f348, %f135, %f345; cvt.rn.f32.s16 %f355, %rs23; sub.ftz.f32 %f356, %f355, %f8; mul.ftz.f32 %f357, %f81, %f356; // begin inline asm { cvt.f32.f16 %f136, %rs145;} // end inline asm fma.rn.ftz.f32 %f358, %f357, %f136, %f349; // begin inline asm { cvt.f32.f16 %f137, %rs146;} // end inline asm fma.rn.ftz.f32 %f359, %f357, %f137, %f350; // begin inline asm { cvt.f32.f16 %f138, %rs147;} // end inline asm fma.rn.ftz.f32 %f360, %f357, %f138, %f351; // begin inline asm { cvt.f32.f16 %f139, %rs148;} // end inline asm fma.rn.ftz.f32 %f361, %f357, %f139, %f352; // begin inline asm { cvt.f32.f16 %f140, %rs149;} // end inline asm fma.rn.ftz.f32 %f362, %f357, %f140, %f353; // begin inline asm { cvt.f32.f16 %f141, %rs150;} // end 
inline asm fma.rn.ftz.f32 %f363, %f357, %f141, %f354; cvt.rn.f32.s16 %f364, %rs25; sub.ftz.f32 %f365, %f364, %f8; mul.ftz.f32 %f366, %f81, %f365; mov.b32 {%rs151, %rs157}, %r137; // begin inline asm { cvt.f32.f16 %f142, %rs151;} // end inline asm fma.rn.ftz.f32 %f367, %f366, %f142, %f358; mov.b32 {%rs152, %rs158}, %r145; // begin inline asm { cvt.f32.f16 %f143, %rs152;} // end inline asm fma.rn.ftz.f32 %f368, %f366, %f143, %f359; mov.b32 {%rs153, %rs159}, %r153; // begin inline asm { cvt.f32.f16 %f144, %rs153;} // end inline asm fma.rn.ftz.f32 %f369, %f366, %f144, %f360; mov.b32 {%rs154, %rs160}, %r161; // begin inline asm { cvt.f32.f16 %f145, %rs154;} // end inline asm fma.rn.ftz.f32 %f370, %f366, %f145, %f361; mov.b32 {%rs155, %rs161}, %r169; // begin inline asm { cvt.f32.f16 %f146, %rs155;} // end inline asm fma.rn.ftz.f32 %f371, %f366, %f146, %f362; mov.b32 {%rs156, %rs162}, %r177; // begin inline asm { cvt.f32.f16 %f147, %rs156;} // end inline asm fma.rn.ftz.f32 %f372, %f366, %f147, %f363; cvt.rn.f32.s16 %f373, %rs27; sub.ftz.f32 %f374, %f373, %f8; mul.ftz.f32 %f375, %f81, %f374; // begin inline asm { cvt.f32.f16 %f148, %rs157;} // end inline asm fma.rn.ftz.f32 %f376, %f375, %f148, %f367; // begin inline asm { cvt.f32.f16 %f149, %rs158;} // end inline asm fma.rn.ftz.f32 %f377, %f375, %f149, %f368; // begin inline asm { cvt.f32.f16 %f150, %rs159;} // end inline asm fma.rn.ftz.f32 %f378, %f375, %f150, %f369; // begin inline asm { cvt.f32.f16 %f151, %rs160;} // end inline asm fma.rn.ftz.f32 %f379, %f375, %f151, %f370; // begin inline asm { cvt.f32.f16 %f152, %rs161;} // end inline asm fma.rn.ftz.f32 %f380, %f375, %f152, %f371; // begin inline asm { cvt.f32.f16 %f153, %rs162;} // end inline asm fma.rn.ftz.f32 %f381, %f375, %f153, %f372; cvt.rn.f32.s16 %f382, %rs29; sub.ftz.f32 %f383, %f382, %f8; mul.ftz.f32 %f384, %f81, %f383; mov.b32 {%rs163, %rs169}, %r138; // begin inline asm { cvt.f32.f16 %f154, %rs163;} // end inline asm fma.rn.ftz.f32 %f385, %f384, %f154, 
%f376; mov.b32 {%rs164, %rs170}, %r146; // begin inline asm { cvt.f32.f16 %f155, %rs164;} // end inline asm fma.rn.ftz.f32 %f386, %f384, %f155, %f377; mov.b32 {%rs165, %rs171}, %r154; // begin inline asm { cvt.f32.f16 %f156, %rs165;} // end inline asm fma.rn.ftz.f32 %f387, %f384, %f156, %f378; mov.b32 {%rs166, %rs172}, %r162; // begin inline asm { cvt.f32.f16 %f157, %rs166;} // end inline asm fma.rn.ftz.f32 %f388, %f384, %f157, %f379; mov.b32 {%rs167, %rs173}, %r170; // begin inline asm { cvt.f32.f16 %f158, %rs167;} // end inline asm fma.rn.ftz.f32 %f389, %f384, %f158, %f380; mov.b32 {%rs168, %rs174}, %r178; // begin inline asm { cvt.f32.f16 %f159, %rs168;} // end inline asm fma.rn.ftz.f32 %f390, %f384, %f159, %f381; cvt.rn.f32.s16 %f391, %rs31; sub.ftz.f32 %f392, %f391, %f8; mul.ftz.f32 %f393, %f81, %f392; // begin inline asm { cvt.f32.f16 %f160, %rs169;} // end inline asm fma.rn.ftz.f32 %f394, %f393, %f160, %f385; // begin inline asm { cvt.f32.f16 %f161, %rs170;} // end inline asm fma.rn.ftz.f32 %f395, %f393, %f161, %f386; // begin inline asm { cvt.f32.f16 %f162, %rs171;} // end inline asm fma.rn.ftz.f32 %f396, %f393, %f162, %f387; // begin inline asm { cvt.f32.f16 %f163, %rs172;} // end inline asm fma.rn.ftz.f32 %f397, %f393, %f163, %f388; // begin inline asm { cvt.f32.f16 %f164, %rs173;} // end inline asm fma.rn.ftz.f32 %f398, %f393, %f164, %f389; // begin inline asm { cvt.f32.f16 %f165, %rs174;} // end inline asm fma.rn.ftz.f32 %f399, %f393, %f165, %f390; cvt.rn.f32.s16 %f400, %rs33; sub.ftz.f32 %f401, %f400, %f8; mul.ftz.f32 %f402, %f81, %f401; mov.b32 {%rs175, %rs181}, %r139; // begin inline asm { cvt.f32.f16 %f166, %rs175;} // end inline asm fma.rn.ftz.f32 %f403, %f402, %f166, %f394; mov.b32 {%rs176, %rs182}, %r147; // begin inline asm { cvt.f32.f16 %f167, %rs176;} // end inline asm fma.rn.ftz.f32 %f404, %f402, %f167, %f395; mov.b32 {%rs177, %rs183}, %r155; // begin inline asm { cvt.f32.f16 %f168, %rs177;} // end inline asm fma.rn.ftz.f32 %f405, %f402, 
%f168, %f396; mov.b32 {%rs178, %rs184}, %r163; // begin inline asm { cvt.f32.f16 %f169, %rs178;} // end inline asm fma.rn.ftz.f32 %f406, %f402, %f169, %f397; mov.b32 {%rs179, %rs185}, %r171; // begin inline asm { cvt.f32.f16 %f170, %rs179;} // end inline asm fma.rn.ftz.f32 %f407, %f402, %f170, %f398; mov.b32 {%rs180, %rs186}, %r179; // begin inline asm { cvt.f32.f16 %f171, %rs180;} // end inline asm fma.rn.ftz.f32 %f408, %f402, %f171, %f399; cvt.rn.f32.s16 %f409, %rs34; sub.ftz.f32 %f410, %f409, %f8; mul.ftz.f32 %f411, %f81, %f410; // begin inline asm { cvt.f32.f16 %f172, %rs181;} // end inline asm fma.rn.ftz.f32 %f412, %f411, %f172, %f403; // begin inline asm { cvt.f32.f16 %f173, %rs182;} // end inline asm fma.rn.ftz.f32 %f413, %f411, %f173, %f404; // begin inline asm { cvt.f32.f16 %f174, %rs183;} // end inline asm fma.rn.ftz.f32 %f414, %f411, %f174, %f405; // begin inline asm { cvt.f32.f16 %f175, %rs184;} // end inline asm fma.rn.ftz.f32 %f415, %f411, %f175, %f406; // begin inline asm { cvt.f32.f16 %f176, %rs185;} // end inline asm fma.rn.ftz.f32 %f416, %f411, %f176, %f407; // begin inline asm { cvt.f32.f16 %f177, %rs186;} // end inline asm fma.rn.ftz.f32 %f417, %f411, %f177, %f408; ld.global.v4.u32 {%r184, %r185, %r186, %r187}, [%rd7+32]; ld.global.v4.u32 {%r192, %r193, %r194, %r195}, [%rd9+16]; ld.global.v4.u32 {%r200, %r201, %r202, %r203}, [%rd8+32]; ld.global.v4.u32 {%r208, %r209, %r210, %r211}, [%rd10+16]; ld.global.v4.u32 {%r216, %r217, %r218, %r219}, [%rd11+16]; ld.global.v4.u32 {%r224, %r225, %r226, %r227}, [%rd12+16]; cvt.rn.f32.s16 %f418, %rs36; sub.ftz.f32 %f419, %f418, %f8; mul.ftz.f32 %f420, %f81, %f419; mov.b32 {%rs187, %rs193}, %r184; // begin inline asm { cvt.f32.f16 %f178, %rs187;} // end inline asm fma.rn.ftz.f32 %f421, %f420, %f178, %f412; mov.b32 {%rs188, %rs194}, %r192; // begin inline asm { cvt.f32.f16 %f179, %rs188;} // end inline asm fma.rn.ftz.f32 %f422, %f420, %f179, %f413; mov.b32 {%rs189, %rs195}, %r200; // begin inline asm { 
cvt.f32.f16 %f180, %rs189;} // end inline asm fma.rn.ftz.f32 %f423, %f420, %f180, %f414; mov.b32 {%rs190, %rs196}, %r208; // begin inline asm { cvt.f32.f16 %f181, %rs190;} // end inline asm fma.rn.ftz.f32 %f424, %f420, %f181, %f415; mov.b32 {%rs191, %rs197}, %r216; // begin inline asm { cvt.f32.f16 %f182, %rs191;} // end inline asm fma.rn.ftz.f32 %f425, %f420, %f182, %f416; mov.b32 {%rs192, %rs198}, %r224; // begin inline asm { cvt.f32.f16 %f183, %rs192;} // end inline asm fma.rn.ftz.f32 %f426, %f420, %f183, %f417; cvt.rn.f32.s16 %f427, %rs38; sub.ftz.f32 %f428, %f427, %f8; mul.ftz.f32 %f429, %f81, %f428; // begin inline asm { cvt.f32.f16 %f184, %rs193;} // end inline asm fma.rn.ftz.f32 %f430, %f429, %f184, %f421; // begin inline asm { cvt.f32.f16 %f185, %rs194;} // end inline asm fma.rn.ftz.f32 %f431, %f429, %f185, %f422; // begin inline asm { cvt.f32.f16 %f186, %rs195;} // end inline asm fma.rn.ftz.f32 %f432, %f429, %f186, %f423; // begin inline asm { cvt.f32.f16 %f187, %rs196;} // end inline asm fma.rn.ftz.f32 %f433, %f429, %f187, %f424; // begin inline asm { cvt.f32.f16 %f188, %rs197;} // end inline asm fma.rn.ftz.f32 %f434, %f429, %f188, %f425; // begin inline asm { cvt.f32.f16 %f189, %rs198;} // end inline asm fma.rn.ftz.f32 %f435, %f429, %f189, %f426; cvt.rn.f32.s16 %f436, %rs40; sub.ftz.f32 %f437, %f436, %f8; mul.ftz.f32 %f438, %f81, %f437; mov.b32 {%rs199, %rs205}, %r185; // begin inline asm { cvt.f32.f16 %f190, %rs199;} // end inline asm fma.rn.ftz.f32 %f439, %f438, %f190, %f430; mov.b32 {%rs200, %rs206}, %r193; // begin inline asm { cvt.f32.f16 %f191, %rs200;} // end inline asm fma.rn.ftz.f32 %f440, %f438, %f191, %f431; mov.b32 {%rs201, %rs207}, %r201; // begin inline asm { cvt.f32.f16 %f192, %rs201;} // end inline asm fma.rn.ftz.f32 %f441, %f438, %f192, %f432; mov.b32 {%rs202, %rs208}, %r209; // begin inline asm { cvt.f32.f16 %f193, %rs202;} // end inline asm fma.rn.ftz.f32 %f442, %f438, %f193, %f433; mov.b32 {%rs203, %rs209}, %r217; // begin inline asm 
{ cvt.f32.f16 %f194, %rs203;} // end inline asm fma.rn.ftz.f32 %f443, %f438, %f194, %f434; mov.b32 {%rs204, %rs210}, %r225; // begin inline asm { cvt.f32.f16 %f195, %rs204;} // end inline asm fma.rn.ftz.f32 %f444, %f438, %f195, %f435; cvt.rn.f32.s16 %f445, %rs42; sub.ftz.f32 %f446, %f445, %f8; mul.ftz.f32 %f447, %f81, %f446; // begin inline asm { cvt.f32.f16 %f196, %rs205;} // end inline asm fma.rn.ftz.f32 %f448, %f447, %f196, %f439; // begin inline asm { cvt.f32.f16 %f197, %rs206;} // end inline asm fma.rn.ftz.f32 %f449, %f447, %f197, %f440; // begin inline asm { cvt.f32.f16 %f198, %rs207;} // end inline asm fma.rn.ftz.f32 %f450, %f447, %f198, %f441; // begin inline asm { cvt.f32.f16 %f199, %rs208;} // end inline asm fma.rn.ftz.f32 %f451, %f447, %f199, %f442; // begin inline asm { cvt.f32.f16 %f200, %rs209;} // end inline asm fma.rn.ftz.f32 %f452, %f447, %f200, %f443; // begin inline asm { cvt.f32.f16 %f201, %rs210;} // end inline asm fma.rn.ftz.f32 %f453, %f447, %f201, %f444; cvt.rn.f32.s16 %f454, %rs44; sub.ftz.f32 %f455, %f454, %f8; mul.ftz.f32 %f456, %f81, %f455; mov.b32 {%rs211, %rs217}, %r186; // begin inline asm { cvt.f32.f16 %f202, %rs211;} // end inline asm fma.rn.ftz.f32 %f457, %f456, %f202, %f448; mov.b32 {%rs212, %rs218}, %r194; // begin inline asm { cvt.f32.f16 %f203, %rs212;} // end inline asm fma.rn.ftz.f32 %f458, %f456, %f203, %f449; mov.b32 {%rs213, %rs219}, %r202; // begin inline asm { cvt.f32.f16 %f204, %rs213;} // end inline asm fma.rn.ftz.f32 %f459, %f456, %f204, %f450; mov.b32 {%rs214, %rs220}, %r210; // begin inline asm { cvt.f32.f16 %f205, %rs214;} // end inline asm fma.rn.ftz.f32 %f460, %f456, %f205, %f451; mov.b32 {%rs215, %rs221}, %r218; // begin inline asm { cvt.f32.f16 %f206, %rs215;} // end inline asm fma.rn.ftz.f32 %f461, %f456, %f206, %f452; mov.b32 {%rs216, %rs222}, %r226; // begin inline asm { cvt.f32.f16 %f207, %rs216;} // end inline asm fma.rn.ftz.f32 %f462, %f456, %f207, %f453; cvt.rn.f32.s16 %f463, %rs46; sub.ftz.f32 %f464, 
%f463, %f8; mul.ftz.f32 %f465, %f81, %f464; // begin inline asm { cvt.f32.f16 %f208, %rs217;} // end inline asm fma.rn.ftz.f32 %f466, %f465, %f208, %f457; // begin inline asm { cvt.f32.f16 %f209, %rs218;} // end inline asm fma.rn.ftz.f32 %f467, %f465, %f209, %f458; // begin inline asm { cvt.f32.f16 %f210, %rs219;} // end inline asm fma.rn.ftz.f32 %f468, %f465, %f210, %f459; // begin inline asm { cvt.f32.f16 %f211, %rs220;} // end inline asm fma.rn.ftz.f32 %f469, %f465, %f211, %f460; // begin inline asm { cvt.f32.f16 %f212, %rs221;} // end inline asm fma.rn.ftz.f32 %f470, %f465, %f212, %f461; // begin inline asm { cvt.f32.f16 %f213, %rs222;} // end inline asm fma.rn.ftz.f32 %f471, %f465, %f213, %f462; cvt.rn.f32.s16 %f472, %rs48; sub.ftz.f32 %f473, %f472, %f8; mul.ftz.f32 %f474, %f81, %f473; mov.b32 {%rs223, %rs229}, %r187; // begin inline asm { cvt.f32.f16 %f214, %rs223;} // end inline asm fma.rn.ftz.f32 %f475, %f474, %f214, %f466; mov.b32 {%rs224, %rs230}, %r195; // begin inline asm { cvt.f32.f16 %f215, %rs224;} // end inline asm fma.rn.ftz.f32 %f476, %f474, %f215, %f467; mov.b32 {%rs225, %rs231}, %r203; // begin inline asm { cvt.f32.f16 %f216, %rs225;} // end inline asm fma.rn.ftz.f32 %f477, %f474, %f216, %f468; mov.b32 {%rs226, %rs232}, %r211; // begin inline asm { cvt.f32.f16 %f217, %rs226;} // end inline asm fma.rn.ftz.f32 %f478, %f474, %f217, %f469; mov.b32 {%rs227, %rs233}, %r219; // begin inline asm { cvt.f32.f16 %f218, %rs227;} // end inline asm fma.rn.ftz.f32 %f479, %f474, %f218, %f470; mov.b32 {%rs228, %rs234}, %r227; // begin inline asm { cvt.f32.f16 %f219, %rs228;} // end inline asm fma.rn.ftz.f32 %f480, %f474, %f219, %f471; cvt.rn.f32.s16 %f481, %rs49; sub.ftz.f32 %f482, %f481, %f8; mul.ftz.f32 %f483, %f81, %f482; // begin inline asm { cvt.f32.f16 %f220, %rs229;} // end inline asm fma.rn.ftz.f32 %f484, %f483, %f220, %f475; // begin inline asm { cvt.f32.f16 %f221, %rs230;} // end inline asm fma.rn.ftz.f32 %f485, %f483, %f221, %f476; // begin inline asm 
{ cvt.f32.f16 %f222, %rs231;} // end inline asm fma.rn.ftz.f32 %f486, %f483, %f222, %f477; // begin inline asm { cvt.f32.f16 %f223, %rs232;} // end inline asm fma.rn.ftz.f32 %f487, %f483, %f223, %f478; // begin inline asm { cvt.f32.f16 %f224, %rs233;} // end inline asm fma.rn.ftz.f32 %f488, %f483, %f224, %f479; // begin inline asm { cvt.f32.f16 %f225, %rs234;} // end inline asm fma.rn.ftz.f32 %f489, %f483, %f225, %f480; ld.global.v4.u32 {%r232, %r233, %r234, %r235}, [%rd7+48]; ld.global.v4.u32 {%r240, %r241, %r242, %r243}, [%rd9+32]; ld.global.v4.u32 {%r248, %r249, %r250, %r251}, [%rd8+48]; ld.global.v4.u32 {%r256, %r257, %r258, %r259}, [%rd10+32]; ld.global.v4.u32 {%r264, %r265, %r266, %r267}, [%rd11+32]; ld.global.v4.u32 {%r272, %r273, %r274, %r275}, [%rd12+32]; cvt.rn.f32.s16 %f490, %rs51; sub.ftz.f32 %f491, %f490, %f8; mul.ftz.f32 %f492, %f81, %f491; mov.b32 {%rs235, %rs241}, %r232; // begin inline asm { cvt.f32.f16 %f226, %rs235;} // end inline asm fma.rn.ftz.f32 %f493, %f492, %f226, %f484; mov.b32 {%rs236, %rs242}, %r240; // begin inline asm { cvt.f32.f16 %f227, %rs236;} // end inline asm fma.rn.ftz.f32 %f494, %f492, %f227, %f485; mov.b32 {%rs237, %rs243}, %r248; // begin inline asm { cvt.f32.f16 %f228, %rs237;} // end inline asm fma.rn.ftz.f32 %f495, %f492, %f228, %f486; mov.b32 {%rs238, %rs244}, %r256; // begin inline asm { cvt.f32.f16 %f229, %rs238;} // end inline asm fma.rn.ftz.f32 %f496, %f492, %f229, %f487; mov.b32 {%rs239, %rs245}, %r264; // begin inline asm { cvt.f32.f16 %f230, %rs239;} // end inline asm fma.rn.ftz.f32 %f497, %f492, %f230, %f488; mov.b32 {%rs240, %rs246}, %r272; // begin inline asm { cvt.f32.f16 %f231, %rs240;} // end inline asm fma.rn.ftz.f32 %f498, %f492, %f231, %f489; cvt.rn.f32.s16 %f499, %rs53; sub.ftz.f32 %f500, %f499, %f8; mul.ftz.f32 %f501, %f81, %f500; // begin inline asm { cvt.f32.f16 %f232, %rs241;} // end inline asm fma.rn.ftz.f32 %f502, %f501, %f232, %f493; // begin inline asm { cvt.f32.f16 %f233, %rs242;} // end inline 
asm fma.rn.ftz.f32 %f503, %f501, %f233, %f494; // begin inline asm { cvt.f32.f16 %f234, %rs243;} // end inline asm fma.rn.ftz.f32 %f504, %f501, %f234, %f495; // begin inline asm { cvt.f32.f16 %f235, %rs244;} // end inline asm fma.rn.ftz.f32 %f505, %f501, %f235, %f496; // begin inline asm { cvt.f32.f16 %f236, %rs245;} // end inline asm fma.rn.ftz.f32 %f506, %f501, %f236, %f497; // begin inline asm { cvt.f32.f16 %f237, %rs246;} // end inline asm fma.rn.ftz.f32 %f507, %f501, %f237, %f498; cvt.rn.f32.s16 %f508, %rs55; sub.ftz.f32 %f509, %f508, %f8; mul.ftz.f32 %f510, %f81, %f509; mov.b32 {%rs247, %rs253}, %r233; // begin inline asm { cvt.f32.f16 %f238, %rs247;} // end inline asm fma.rn.ftz.f32 %f511, %f510, %f238, %f502; mov.b32 {%rs248, %rs254}, %r241; // begin inline asm { cvt.f32.f16 %f239, %rs248;} // end inline asm fma.rn.ftz.f32 %f512, %f510, %f239, %f503; mov.b32 {%rs249, %rs255}, %r249; // begin inline asm { cvt.f32.f16 %f240, %rs249;} // end inline asm fma.rn.ftz.f32 %f513, %f510, %f240, %f504; mov.b32 {%rs250, %rs256}, %r257; // begin inline asm { cvt.f32.f16 %f241, %rs250;} // end inline asm fma.rn.ftz.f32 %f514, %f510, %f241, %f505; mov.b32 {%rs251, %rs257}, %r265; // begin inline asm { cvt.f32.f16 %f242, %rs251;} // end inline asm fma.rn.ftz.f32 %f515, %f510, %f242, %f506; mov.b32 {%rs252, %rs258}, %r273; // begin inline asm { cvt.f32.f16 %f243, %rs252;} // end inline asm fma.rn.ftz.f32 %f516, %f510, %f243, %f507; cvt.rn.f32.s16 %f517, %rs57; sub.ftz.f32 %f518, %f517, %f8; mul.ftz.f32 %f519, %f81, %f518; // begin inline asm { cvt.f32.f16 %f244, %rs253;} // end inline asm fma.rn.ftz.f32 %f520, %f519, %f244, %f511; // begin inline asm { cvt.f32.f16 %f245, %rs254;} // end inline asm fma.rn.ftz.f32 %f521, %f519, %f245, %f512; // begin inline asm { cvt.f32.f16 %f246, %rs255;} // end inline asm fma.rn.ftz.f32 %f522, %f519, %f246, %f513; // begin inline asm { cvt.f32.f16 %f247, %rs256;} // end inline asm fma.rn.ftz.f32 %f523, %f519, %f247, %f514; // begin inline 
asm { cvt.f32.f16 %f248, %rs257;} // end inline asm fma.rn.ftz.f32 %f524, %f519, %f248, %f515; // begin inline asm { cvt.f32.f16 %f249, %rs258;} // end inline asm fma.rn.ftz.f32 %f525, %f519, %f249, %f516; cvt.rn.f32.s16 %f526, %rs59; sub.ftz.f32 %f527, %f526, %f8; mul.ftz.f32 %f528, %f81, %f527; mov.b32 {%rs259, %rs265}, %r234; // begin inline asm { cvt.f32.f16 %f250, %rs259;} // end inline asm fma.rn.ftz.f32 %f529, %f528, %f250, %f520; mov.b32 {%rs260, %rs266}, %r242; // begin inline asm { cvt.f32.f16 %f251, %rs260;} // end inline asm fma.rn.ftz.f32 %f530, %f528, %f251, %f521; mov.b32 {%rs261, %rs267}, %r250; // begin inline asm { cvt.f32.f16 %f252, %rs261;} // end inline asm fma.rn.ftz.f32 %f531, %f528, %f252, %f522; mov.b32 {%rs262, %rs268}, %r258; // begin inline asm { cvt.f32.f16 %f253, %rs262;} // end inline asm fma.rn.ftz.f32 %f532, %f528, %f253, %f523; mov.b32 {%rs263, %rs269}, %r266; // begin inline asm { cvt.f32.f16 %f254, %rs263;} // end inline asm fma.rn.ftz.f32 %f533, %f528, %f254, %f524; mov.b32 {%rs264, %rs270}, %r274; // begin inline asm { cvt.f32.f16 %f255, %rs264;} // end inline asm fma.rn.ftz.f32 %f534, %f528, %f255, %f525; cvt.rn.f32.s16 %f535, %rs61; sub.ftz.f32 %f536, %f535, %f8; mul.ftz.f32 %f537, %f81, %f536; // begin inline asm { cvt.f32.f16 %f256, %rs265;} // end inline asm fma.rn.ftz.f32 %f538, %f537, %f256, %f529; // begin inline asm { cvt.f32.f16 %f257, %rs266;} // end inline asm fma.rn.ftz.f32 %f539, %f537, %f257, %f530; // begin inline asm { cvt.f32.f16 %f258, %rs267;} // end inline asm fma.rn.ftz.f32 %f540, %f537, %f258, %f531; // begin inline asm { cvt.f32.f16 %f259, %rs268;} // end inline asm fma.rn.ftz.f32 %f541, %f537, %f259, %f532; // begin inline asm { cvt.f32.f16 %f260, %rs269;} // end inline asm fma.rn.ftz.f32 %f542, %f537, %f260, %f533; // begin inline asm { cvt.f32.f16 %f261, %rs270;} // end inline asm fma.rn.ftz.f32 %f543, %f537, %f261, %f534; cvt.rn.f32.s16 %f544, %rs63; sub.ftz.f32 %f545, %f544, %f8; mul.ftz.f32 %f546, 
%f81, %f545; mov.b32 {%rs271, %rs277}, %r235; // begin inline asm { cvt.f32.f16 %f262, %rs271;} // end inline asm fma.rn.ftz.f32 %f547, %f546, %f262, %f538; mov.b32 {%rs272, %rs278}, %r243; // begin inline asm { cvt.f32.f16 %f263, %rs272;} // end inline asm fma.rn.ftz.f32 %f548, %f546, %f263, %f539; mov.b32 {%rs273, %rs279}, %r251; // begin inline asm { cvt.f32.f16 %f264, %rs273;} // end inline asm fma.rn.ftz.f32 %f549, %f546, %f264, %f540; mov.b32 {%rs274, %rs280}, %r259; // begin inline asm { cvt.f32.f16 %f265, %rs274;} // end inline asm fma.rn.ftz.f32 %f550, %f546, %f265, %f541; mov.b32 {%rs275, %rs281}, %r267; // begin inline asm { cvt.f32.f16 %f266, %rs275;} // end inline asm fma.rn.ftz.f32 %f551, %f546, %f266, %f542; mov.b32 {%rs276, %rs282}, %r275; // begin inline asm { cvt.f32.f16 %f267, %rs276;} // end inline asm fma.rn.ftz.f32 %f552, %f546, %f267, %f543; cvt.rn.f32.s16 %f553, %rs64; sub.ftz.f32 %f554, %f553, %f8; mul.ftz.f32 %f555, %f81, %f554; // begin inline asm { cvt.f32.f16 %f268, %rs277;} // end inline asm fma.rn.ftz.f32 %f1173, %f555, %f268, %f547; // begin inline asm { cvt.f32.f16 %f269, %rs278;} // end inline asm fma.rn.ftz.f32 %f1172, %f555, %f269, %f548; // begin inline asm { cvt.f32.f16 %f270, %rs279;} // end inline asm fma.rn.ftz.f32 %f1171, %f555, %f270, %f549; // begin inline asm { cvt.f32.f16 %f271, %rs280;} // end inline asm fma.rn.ftz.f32 %f1170, %f555, %f271, %f550; // begin inline asm { cvt.f32.f16 %f272, %rs281;} // end inline asm fma.rn.ftz.f32 %f1169, %f555, %f272, %f551; // begin inline asm { cvt.f32.f16 %f273, %rs282;} // end inline asm fma.rn.ftz.f32 %f1168, %f555, %f273, %f552; bra.uni $L__BB0_8; $L__BB0_7: shl.b16 %rs475, %rs5, 4; cvt.s16.s8 %rs476, %rs475; shr.s16 %rs477, %rs476, 7; and.b16 %rs478, %rs477, -16; or.b16 %rs479, %rs478, %rs6; cvt.rn.f32.s16 %f748, %rs479; sub.ftz.f32 %f749, %f748, %f8; mul.ftz.f32 %f750, %f81, %f749; mov.b32 {%rs283, %rs289}, %r77; // begin inline asm { cvt.f32.f16 %f556, %rs283;} // end inline 
asm fma.rn.ftz.f32 %f751, %f750, %f556, %f1173; mov.b32 {%rs284, %rs290}, %r81; // begin inline asm { cvt.f32.f16 %f557, %rs284;} // end inline asm fma.rn.ftz.f32 %f752, %f750, %f557, %f1172; mov.b32 {%rs285, %rs291}, %r87; // begin inline asm { cvt.f32.f16 %f558, %rs285;} // end inline asm fma.rn.ftz.f32 %f753, %f750, %f558, %f1171; mov.b32 {%rs286, %rs292}, %r91; // begin inline asm { cvt.f32.f16 %f559, %rs286;} // end inline asm fma.rn.ftz.f32 %f754, %f750, %f559, %f1170; mov.b32 {%rs287, %rs293}, %r95; // begin inline asm { cvt.f32.f16 %f560, %rs287;} // end inline asm fma.rn.ftz.f32 %f755, %f750, %f560, %f1169; mov.b32 {%rs288, %rs294}, %r99; // begin inline asm { cvt.f32.f16 %f561, %rs288;} // end inline asm fma.rn.ftz.f32 %f756, %f750, %f561, %f1168; shl.b16 %rs480, %rs7, 4; cvt.s16.s8 %rs481, %rs480; shr.s16 %rs482, %rs481, 7; and.b16 %rs483, %rs482, -16; or.b16 %rs484, %rs483, %rs8; cvt.rn.f32.s16 %f757, %rs484; sub.ftz.f32 %f758, %f757, %f8; mul.ftz.f32 %f759, %f81, %f758; // begin inline asm { cvt.f32.f16 %f562, %rs289;} // end inline asm fma.rn.ftz.f32 %f760, %f759, %f562, %f751; // begin inline asm { cvt.f32.f16 %f563, %rs290;} // end inline asm fma.rn.ftz.f32 %f761, %f759, %f563, %f752; // begin inline asm { cvt.f32.f16 %f564, %rs291;} // end inline asm fma.rn.ftz.f32 %f762, %f759, %f564, %f753; // begin inline asm { cvt.f32.f16 %f565, %rs292;} // end inline asm fma.rn.ftz.f32 %f763, %f759, %f565, %f754; // begin inline asm { cvt.f32.f16 %f566, %rs293;} // end inline asm fma.rn.ftz.f32 %f764, %f759, %f566, %f755; // begin inline asm { cvt.f32.f16 %f567, %rs294;} // end inline asm fma.rn.ftz.f32 %f765, %f759, %f567, %f756; shl.b16 %rs485, %rs9, 4; cvt.s16.s8 %rs486, %rs485; shr.s16 %rs487, %rs486, 7; and.b16 %rs488, %rs487, -16; or.b16 %rs489, %rs488, %rs10; cvt.rn.f32.s16 %f766, %rs489; sub.ftz.f32 %f767, %f766, %f8; mul.ftz.f32 %f768, %f81, %f767; mov.b32 {%rs295, %rs301}, %r78; // begin inline asm { cvt.f32.f16 %f568, %rs295;} // end inline asm 
fma.rn.ftz.f32 %f769, %f768, %f568, %f760; mov.b32 {%rs296, %rs302}, %r82; // begin inline asm { cvt.f32.f16 %f569, %rs296;} // end inline asm fma.rn.ftz.f32 %f770, %f768, %f569, %f761; mov.b32 {%rs297, %rs303}, %r88; // begin inline asm { cvt.f32.f16 %f570, %rs297;} // end inline asm fma.rn.ftz.f32 %f771, %f768, %f570, %f762; mov.b32 {%rs298, %rs304}, %r92; // begin inline asm { cvt.f32.f16 %f571, %rs298;} // end inline asm fma.rn.ftz.f32 %f772, %f768, %f571, %f763; mov.b32 {%rs299, %rs305}, %r96; // begin inline asm { cvt.f32.f16 %f572, %rs299;} // end inline asm fma.rn.ftz.f32 %f773, %f768, %f572, %f764; mov.b32 {%rs300, %rs306}, %r100; // begin inline asm { cvt.f32.f16 %f573, %rs300;} // end inline asm fma.rn.ftz.f32 %f774, %f768, %f573, %f765; shl.b16 %rs490, %rs11, 4; cvt.s16.s8 %rs491, %rs490; shr.s16 %rs492, %rs491, 7; and.b16 %rs493, %rs492, -16; or.b16 %rs494, %rs493, %rs12; cvt.rn.f32.s16 %f775, %rs494; sub.ftz.f32 %f776, %f775, %f8; mul.ftz.f32 %f777, %f81, %f776; // begin inline asm { cvt.f32.f16 %f574, %rs301;} // end inline asm fma.rn.ftz.f32 %f778, %f777, %f574, %f769; // begin inline asm { cvt.f32.f16 %f575, %rs302;} // end inline asm fma.rn.ftz.f32 %f779, %f777, %f575, %f770; // begin inline asm { cvt.f32.f16 %f576, %rs303;} // end inline asm fma.rn.ftz.f32 %f780, %f777, %f576, %f771; // begin inline asm { cvt.f32.f16 %f577, %rs304;} // end inline asm fma.rn.ftz.f32 %f781, %f777, %f577, %f772; // begin inline asm { cvt.f32.f16 %f578, %rs305;} // end inline asm fma.rn.ftz.f32 %f782, %f777, %f578, %f773; // begin inline asm { cvt.f32.f16 %f579, %rs306;} // end inline asm fma.rn.ftz.f32 %f783, %f777, %f579, %f774; shl.b16 %rs495, %rs13, 4; cvt.s16.s8 %rs496, %rs495; shr.s16 %rs497, %rs496, 7; and.b16 %rs498, %rs497, -16; or.b16 %rs499, %rs498, %rs14; cvt.rn.f32.s16 %f784, %rs499; sub.ftz.f32 %f785, %f784, %f8; mul.ftz.f32 %f786, %f81, %f785; mov.b32 {%rs307, %rs313}, %r79; // begin inline asm { cvt.f32.f16 %f580, %rs307;} // end inline asm 
fma.rn.ftz.f32 %f787, %f786, %f580, %f778; mov.b32 {%rs308, %rs314}, %r83; // begin inline asm { cvt.f32.f16 %f581, %rs308;} // end inline asm fma.rn.ftz.f32 %f788, %f786, %f581, %f779; mov.b32 {%rs309, %rs315}, %r89; // begin inline asm { cvt.f32.f16 %f582, %rs309;} // end inline asm fma.rn.ftz.f32 %f789, %f786, %f582, %f780; mov.b32 {%rs310, %rs316}, %r93; // begin inline asm { cvt.f32.f16 %f583, %rs310;} // end inline asm fma.rn.ftz.f32 %f790, %f786, %f583, %f781; mov.b32 {%rs311, %rs317}, %r97; // begin inline asm { cvt.f32.f16 %f584, %rs311;} // end inline asm fma.rn.ftz.f32 %f791, %f786, %f584, %f782; mov.b32 {%rs312, %rs318}, %r101; // begin inline asm { cvt.f32.f16 %f585, %rs312;} // end inline asm fma.rn.ftz.f32 %f792, %f786, %f585, %f783; shl.b16 %rs500, %rs15, 4; cvt.s16.s8 %rs501, %rs500; shr.s16 %rs502, %rs501, 7; and.b16 %rs503, %rs502, -16; or.b16 %rs504, %rs503, %rs16; cvt.rn.f32.s16 %f793, %rs504; sub.ftz.f32 %f794, %f793, %f8; mul.ftz.f32 %f795, %f81, %f794; // begin inline asm { cvt.f32.f16 %f586, %rs313;} // end inline asm fma.rn.ftz.f32 %f796, %f795, %f586, %f787; // begin inline asm { cvt.f32.f16 %f587, %rs314;} // end inline asm fma.rn.ftz.f32 %f797, %f795, %f587, %f788; // begin inline asm { cvt.f32.f16 %f588, %rs315;} // end inline asm fma.rn.ftz.f32 %f798, %f795, %f588, %f789; // begin inline asm { cvt.f32.f16 %f589, %rs316;} // end inline asm fma.rn.ftz.f32 %f799, %f795, %f589, %f790; // begin inline asm { cvt.f32.f16 %f590, %rs317;} // end inline asm fma.rn.ftz.f32 %f800, %f795, %f590, %f791; // begin inline asm { cvt.f32.f16 %f591, %rs318;} // end inline asm fma.rn.ftz.f32 %f801, %f795, %f591, %f792; shl.b16 %rs505, %rs17, 4; cvt.s16.s8 %rs506, %rs505; shr.s16 %rs507, %rs506, 7; and.b16 %rs508, %rs507, -16; or.b16 %rs509, %rs508, %rs18; cvt.rn.f32.s16 %f802, %rs509; sub.ftz.f32 %f803, %f802, %f8; mul.ftz.f32 %f804, %f81, %f803; mov.b32 {%rs319, %rs325}, %r80; // begin inline asm { cvt.f32.f16 %f592, %rs319;} // end inline asm 
fma.rn.ftz.f32 %f805, %f804, %f592, %f796; mov.b32 {%rs320, %rs326}, %r84; // begin inline asm { cvt.f32.f16 %f593, %rs320;} // end inline asm fma.rn.ftz.f32 %f806, %f804, %f593, %f797; mov.b32 {%rs321, %rs327}, %r90; // begin inline asm { cvt.f32.f16 %f594, %rs321;} // end inline asm fma.rn.ftz.f32 %f807, %f804, %f594, %f798; mov.b32 {%rs322, %rs328}, %r94; // begin inline asm { cvt.f32.f16 %f595, %rs322;} // end inline asm fma.rn.ftz.f32 %f808, %f804, %f595, %f799; mov.b32 {%rs323, %rs329}, %r98; // begin inline asm { cvt.f32.f16 %f596, %rs323;} // end inline asm fma.rn.ftz.f32 %f809, %f804, %f596, %f800; mov.b32 {%rs324, %rs330}, %r102; // begin inline asm { cvt.f32.f16 %f597, %rs324;} // end inline asm fma.rn.ftz.f32 %f810, %f804, %f597, %f801; shl.b16 %rs510, %rs19, 4; cvt.s16.s8 %rs511, %rs510; shr.s16 %rs512, %rs511, 7; and.b16 %rs513, %rs512, -16; or.b16 %rs514, %rs513, %rs19; cvt.rn.f32.s16 %f811, %rs514; sub.ftz.f32 %f812, %f811, %f8; mul.ftz.f32 %f813, %f81, %f812; // begin inline asm { cvt.f32.f16 %f598, %rs325;} // end inline asm fma.rn.ftz.f32 %f814, %f813, %f598, %f805; // begin inline asm { cvt.f32.f16 %f599, %rs326;} // end inline asm fma.rn.ftz.f32 %f815, %f813, %f599, %f806; // begin inline asm { cvt.f32.f16 %f600, %rs327;} // end inline asm fma.rn.ftz.f32 %f816, %f813, %f600, %f807; // begin inline asm { cvt.f32.f16 %f601, %rs328;} // end inline asm fma.rn.ftz.f32 %f817, %f813, %f601, %f808; // begin inline asm { cvt.f32.f16 %f602, %rs329;} // end inline asm fma.rn.ftz.f32 %f818, %f813, %f602, %f809; // begin inline asm { cvt.f32.f16 %f603, %rs330;} // end inline asm fma.rn.ftz.f32 %f819, %f813, %f603, %f810; ld.global.v4.u32 {%r280, %r281, %r282, %r283}, [%rd7+16]; ld.global.v4.u32 {%r288, %r289, %r290, %r291}, [%rd9]; ld.global.v4.u32 {%r296, %r297, %r298, %r299}, [%rd8+16]; ld.global.v4.u32 {%r304, %r305, %r306, %r307}, [%rd10]; ld.global.v4.u32 {%r312, %r313, %r314, %r315}, [%rd11]; ld.global.v4.u32 {%r320, %r321, %r322, %r323}, [%rd12]; 
shl.b16 %rs515, %rs20, 4; cvt.s16.s8 %rs516, %rs515; shr.s16 %rs517, %rs516, 7; and.b16 %rs518, %rs517, -16; or.b16 %rs519, %rs518, %rs21; cvt.rn.f32.s16 %f820, %rs519; sub.ftz.f32 %f821, %f820, %f8; mul.ftz.f32 %f822, %f81, %f821; mov.b32 {%rs331, %rs337}, %r280; // begin inline asm { cvt.f32.f16 %f604, %rs331;} // end inline asm fma.rn.ftz.f32 %f823, %f822, %f604, %f814; mov.b32 {%rs332, %rs338}, %r288; // begin inline asm { cvt.f32.f16 %f605, %rs332;} // end inline asm fma.rn.ftz.f32 %f824, %f822, %f605, %f815; mov.b32 {%rs333, %rs339}, %r296; // begin inline asm { cvt.f32.f16 %f606, %rs333;} // end inline asm fma.rn.ftz.f32 %f825, %f822, %f606, %f816; mov.b32 {%rs334, %rs340}, %r304; // begin inline asm { cvt.f32.f16 %f607, %rs334;} // end inline asm fma.rn.ftz.f32 %f826, %f822, %f607, %f817; mov.b32 {%rs335, %rs341}, %r312; // begin inline asm { cvt.f32.f16 %f608, %rs335;} // end inline asm fma.rn.ftz.f32 %f827, %f822, %f608, %f818; mov.b32 {%rs336, %rs342}, %r320; // begin inline asm { cvt.f32.f16 %f609, %rs336;} // end inline asm fma.rn.ftz.f32 %f828, %f822, %f609, %f819; shl.b16 %rs520, %rs22, 4; cvt.s16.s8 %rs521, %rs520; shr.s16 %rs522, %rs521, 7; and.b16 %rs523, %rs522, -16; or.b16 %rs524, %rs523, %rs23; cvt.rn.f32.s16 %f829, %rs524; sub.ftz.f32 %f830, %f829, %f8; mul.ftz.f32 %f831, %f81, %f830; // begin inline asm { cvt.f32.f16 %f610, %rs337;} // end inline asm fma.rn.ftz.f32 %f832, %f831, %f610, %f823; // begin inline asm { cvt.f32.f16 %f611, %rs338;} // end inline asm fma.rn.ftz.f32 %f833, %f831, %f611, %f824; // begin inline asm { cvt.f32.f16 %f612, %rs339;} // end inline asm fma.rn.ftz.f32 %f834, %f831, %f612, %f825; // begin inline asm { cvt.f32.f16 %f613, %rs340;} // end inline asm fma.rn.ftz.f32 %f835, %f831, %f613, %f826; // begin inline asm { cvt.f32.f16 %f614, %rs341;} // end inline asm fma.rn.ftz.f32 %f836, %f831, %f614, %f827; // begin inline asm { cvt.f32.f16 %f615, %rs342;} // end inline asm fma.rn.ftz.f32 %f837, %f831, %f615, %f828; 
shl.b16 %rs525, %rs24, 4; cvt.s16.s8 %rs526, %rs525; shr.s16 %rs527, %rs526, 7; and.b16 %rs528, %rs527, -16; or.b16 %rs529, %rs528, %rs25; cvt.rn.f32.s16 %f838, %rs529; sub.ftz.f32 %f839, %f838, %f8; mul.ftz.f32 %f840, %f81, %f839; mov.b32 {%rs343, %rs349}, %r281; // begin inline asm { cvt.f32.f16 %f616, %rs343;} // end inline asm fma.rn.ftz.f32 %f841, %f840, %f616, %f832; mov.b32 {%rs344, %rs350}, %r289; // begin inline asm { cvt.f32.f16 %f617, %rs344;} // end inline asm fma.rn.ftz.f32 %f842, %f840, %f617, %f833; mov.b32 {%rs345, %rs351}, %r297; // begin inline asm { cvt.f32.f16 %f618, %rs345;} // end inline asm fma.rn.ftz.f32 %f843, %f840, %f618, %f834; mov.b32 {%rs346, %rs352}, %r305; // begin inline asm { cvt.f32.f16 %f619, %rs346;} // end inline asm fma.rn.ftz.f32 %f844, %f840, %f619, %f835; mov.b32 {%rs347, %rs353}, %r313; // begin inline asm { cvt.f32.f16 %f620, %rs347;} // end inline asm fma.rn.ftz.f32 %f845, %f840, %f620, %f836; mov.b32 {%rs348, %rs354}, %r321; // begin inline asm { cvt.f32.f16 %f621, %rs348;} // end inline asm fma.rn.ftz.f32 %f846, %f840, %f621, %f837; shl.b16 %rs530, %rs26, 4; cvt.s16.s8 %rs531, %rs530; shr.s16 %rs532, %rs531, 7; and.b16 %rs533, %rs532, -16; or.b16 %rs534, %rs533, %rs27; cvt.rn.f32.s16 %f847, %rs534; sub.ftz.f32 %f848, %f847, %f8; mul.ftz.f32 %f849, %f81, %f848; // begin inline asm { cvt.f32.f16 %f622, %rs349;} // end inline asm fma.rn.ftz.f32 %f850, %f849, %f622, %f841; // begin inline asm { cvt.f32.f16 %f623, %rs350;} // end inline asm fma.rn.ftz.f32 %f851, %f849, %f623, %f842; // begin inline asm { cvt.f32.f16 %f624, %rs351;} // end inline asm fma.rn.ftz.f32 %f852, %f849, %f624, %f843; // begin inline asm { cvt.f32.f16 %f625, %rs352;} // end inline asm fma.rn.ftz.f32 %f853, %f849, %f625, %f844; // begin inline asm { cvt.f32.f16 %f626, %rs353;} // end inline asm fma.rn.ftz.f32 %f854, %f849, %f626, %f845; // begin inline asm { cvt.f32.f16 %f627, %rs354;} // end inline asm fma.rn.ftz.f32 %f855, %f849, %f627, %f846; 
shl.b16 %rs535, %rs28, 4; cvt.s16.s8 %rs536, %rs535; shr.s16 %rs537, %rs536, 7; and.b16 %rs538, %rs537, -16; or.b16 %rs539, %rs538, %rs29; cvt.rn.f32.s16 %f856, %rs539; sub.ftz.f32 %f857, %f856, %f8; mul.ftz.f32 %f858, %f81, %f857; mov.b32 {%rs355, %rs361}, %r282; // begin inline asm { cvt.f32.f16 %f628, %rs355;} // end inline asm fma.rn.ftz.f32 %f859, %f858, %f628, %f850; mov.b32 {%rs356, %rs362}, %r290; // begin inline asm { cvt.f32.f16 %f629, %rs356;} // end inline asm fma.rn.ftz.f32 %f860, %f858, %f629, %f851; mov.b32 {%rs357, %rs363}, %r298; // begin inline asm { cvt.f32.f16 %f630, %rs357;} // end inline asm fma.rn.ftz.f32 %f861, %f858, %f630, %f852; mov.b32 {%rs358, %rs364}, %r306; // begin inline asm { cvt.f32.f16 %f631, %rs358;} // end inline asm fma.rn.ftz.f32 %f862, %f858, %f631, %f853; mov.b32 {%rs359, %rs365}, %r314; // begin inline asm { cvt.f32.f16 %f632, %rs359;} // end inline asm fma.rn.ftz.f32 %f863, %f858, %f632, %f854; mov.b32 {%rs360, %rs366}, %r322; // begin inline asm { cvt.f32.f16 %f633, %rs360;} // end inline asm fma.rn.ftz.f32 %f864, %f858, %f633, %f855; shl.b16 %rs540, %rs30, 4; cvt.s16.s8 %rs541, %rs540; shr.s16 %rs542, %rs541, 7; and.b16 %rs543, %rs542, -16; or.b16 %rs544, %rs543, %rs31; cvt.rn.f32.s16 %f865, %rs544; sub.ftz.f32 %f866, %f865, %f8; mul.ftz.f32 %f867, %f81, %f866; // begin inline asm { cvt.f32.f16 %f634, %rs361;} // end inline asm fma.rn.ftz.f32 %f868, %f867, %f634, %f859; // begin inline asm { cvt.f32.f16 %f635, %rs362;} // end inline asm fma.rn.ftz.f32 %f869, %f867, %f635, %f860; // begin inline asm { cvt.f32.f16 %f636, %rs363;} // end inline asm fma.rn.ftz.f32 %f870, %f867, %f636, %f861; // begin inline asm { cvt.f32.f16 %f637, %rs364;} // end inline asm fma.rn.ftz.f32 %f871, %f867, %f637, %f862; // begin inline asm { cvt.f32.f16 %f638, %rs365;} // end inline asm fma.rn.ftz.f32 %f872, %f867, %f638, %f863; // begin inline asm { cvt.f32.f16 %f639, %rs366;} // end inline asm fma.rn.ftz.f32 %f873, %f867, %f639, %f864; 
shl.b16 %rs545, %rs32, 4; cvt.s16.s8 %rs546, %rs545; shr.s16 %rs547, %rs546, 7; and.b16 %rs548, %rs547, -16; or.b16 %rs549, %rs548, %rs33; cvt.rn.f32.s16 %f874, %rs549; sub.ftz.f32 %f875, %f874, %f8; mul.ftz.f32 %f876, %f81, %f875; mov.b32 {%rs367, %rs373}, %r283; // begin inline asm { cvt.f32.f16 %f640, %rs367;} // end inline asm fma.rn.ftz.f32 %f877, %f876, %f640, %f868; mov.b32 {%rs368, %rs374}, %r291; // begin inline asm { cvt.f32.f16 %f641, %rs368;} // end inline asm fma.rn.ftz.f32 %f878, %f876, %f641, %f869; mov.b32 {%rs369, %rs375}, %r299; // begin inline asm { cvt.f32.f16 %f642, %rs369;} // end inline asm fma.rn.ftz.f32 %f879, %f876, %f642, %f870; mov.b32 {%rs370, %rs376}, %r307; // begin inline asm { cvt.f32.f16 %f643, %rs370;} // end inline asm fma.rn.ftz.f32 %f880, %f876, %f643, %f871; mov.b32 {%rs371, %rs377}, %r315; // begin inline asm { cvt.f32.f16 %f644, %rs371;} // end inline asm fma.rn.ftz.f32 %f881, %f876, %f644, %f872; mov.b32 {%rs372, %rs378}, %r323; // begin inline asm { cvt.f32.f16 %f645, %rs372;} // end inline asm fma.rn.ftz.f32 %f882, %f876, %f645, %f873; shl.b16 %rs550, %rs34, 4; cvt.s16.s8 %rs551, %rs550; shr.s16 %rs552, %rs551, 7; and.b16 %rs553, %rs552, -16; or.b16 %rs554, %rs553, %rs34; cvt.rn.f32.s16 %f883, %rs554; sub.ftz.f32 %f884, %f883, %f8; mul.ftz.f32 %f885, %f81, %f884; // begin inline asm { cvt.f32.f16 %f646, %rs373;} // end inline asm fma.rn.ftz.f32 %f886, %f885, %f646, %f877; // begin inline asm { cvt.f32.f16 %f647, %rs374;} // end inline asm fma.rn.ftz.f32 %f887, %f885, %f647, %f878; // begin inline asm { cvt.f32.f16 %f648, %rs375;} // end inline asm fma.rn.ftz.f32 %f888, %f885, %f648, %f879; // begin inline asm { cvt.f32.f16 %f649, %rs376;} // end inline asm fma.rn.ftz.f32 %f889, %f885, %f649, %f880; // begin inline asm { cvt.f32.f16 %f650, %rs377;} // end inline asm fma.rn.ftz.f32 %f890, %f885, %f650, %f881; // begin inline asm { cvt.f32.f16 %f651, %rs378;} // end inline asm fma.rn.ftz.f32 %f891, %f885, %f651, %f882; 
ld.global.v4.u32 {%r328, %r329, %r330, %r331}, [%rd7+32]; ld.global.v4.u32 {%r336, %r337, %r338, %r339}, [%rd9+16]; ld.global.v4.u32 {%r344, %r345, %r346, %r347}, [%rd8+32]; ld.global.v4.u32 {%r352, %r353, %r354, %r355}, [%rd10+16]; ld.global.v4.u32 {%r360, %r361, %r362, %r363}, [%rd11+16]; ld.global.v4.u32 {%r368, %r369, %r370, %r371}, [%rd12+16]; shl.b16 %rs555, %rs35, 4; cvt.s16.s8 %rs556, %rs555; shr.s16 %rs557, %rs556, 7; and.b16 %rs558, %rs557, -16; or.b16 %rs559, %rs558, %rs36; cvt.rn.f32.s16 %f892, %rs559; sub.ftz.f32 %f893, %f892, %f8; mul.ftz.f32 %f894, %f81, %f893; mov.b32 {%rs379, %rs385}, %r328; // begin inline asm { cvt.f32.f16 %f652, %rs379;} // end inline asm fma.rn.ftz.f32 %f895, %f894, %f652, %f886; mov.b32 {%rs380, %rs386}, %r336; // begin inline asm { cvt.f32.f16 %f653, %rs380;} // end inline asm fma.rn.ftz.f32 %f896, %f894, %f653, %f887; mov.b32 {%rs381, %rs387}, %r344; // begin inline asm { cvt.f32.f16 %f654, %rs381;} // end inline asm fma.rn.ftz.f32 %f897, %f894, %f654, %f888; mov.b32 {%rs382, %rs388}, %r352; // begin inline asm { cvt.f32.f16 %f655, %rs382;} // end inline asm fma.rn.ftz.f32 %f898, %f894, %f655, %f889; mov.b32 {%rs383, %rs389}, %r360; // begin inline asm { cvt.f32.f16 %f656, %rs383;} // end inline asm fma.rn.ftz.f32 %f899, %f894, %f656, %f890; mov.b32 {%rs384, %rs390}, %r368; // begin inline asm { cvt.f32.f16 %f657, %rs384;} // end inline asm fma.rn.ftz.f32 %f900, %f894, %f657, %f891; shl.b16 %rs560, %rs37, 4; cvt.s16.s8 %rs561, %rs560; shr.s16 %rs562, %rs561, 7; and.b16 %rs563, %rs562, -16; or.b16 %rs564, %rs563, %rs38; cvt.rn.f32.s16 %f901, %rs564; sub.ftz.f32 %f902, %f901, %f8; mul.ftz.f32 %f903, %f81, %f902; // begin inline asm { cvt.f32.f16 %f658, %rs385;} // end inline asm fma.rn.ftz.f32 %f904, %f903, %f658, %f895; // begin inline asm { cvt.f32.f16 %f659, %rs386;} // end inline asm fma.rn.ftz.f32 %f905, %f903, %f659, %f896; // begin inline asm { cvt.f32.f16 %f660, %rs387;} // end inline asm fma.rn.ftz.f32 %f906, %f903, 
%f660, %f897; // begin inline asm { cvt.f32.f16 %f661, %rs388;} // end inline asm fma.rn.ftz.f32 %f907, %f903, %f661, %f898; // begin inline asm { cvt.f32.f16 %f662, %rs389;} // end inline asm fma.rn.ftz.f32 %f908, %f903, %f662, %f899; // begin inline asm { cvt.f32.f16 %f663, %rs390;} // end inline asm fma.rn.ftz.f32 %f909, %f903, %f663, %f900; shl.b16 %rs565, %rs39, 4; cvt.s16.s8 %rs566, %rs565; shr.s16 %rs567, %rs566, 7; and.b16 %rs568, %rs567, -16; or.b16 %rs569, %rs568, %rs40; cvt.rn.f32.s16 %f910, %rs569; sub.ftz.f32 %f911, %f910, %f8; mul.ftz.f32 %f912, %f81, %f911; mov.b32 {%rs391, %rs397}, %r329; // begin inline asm { cvt.f32.f16 %f664, %rs391;} // end inline asm fma.rn.ftz.f32 %f913, %f912, %f664, %f904; mov.b32 {%rs392, %rs398}, %r337; // begin inline asm { cvt.f32.f16 %f665, %rs392;} // end inline asm fma.rn.ftz.f32 %f914, %f912, %f665, %f905; mov.b32 {%rs393, %rs399}, %r345; // begin inline asm { cvt.f32.f16 %f666, %rs393;} // end inline asm fma.rn.ftz.f32 %f915, %f912, %f666, %f906; mov.b32 {%rs394, %rs400}, %r353; // begin inline asm { cvt.f32.f16 %f667, %rs394;} // end inline asm fma.rn.ftz.f32 %f916, %f912, %f667, %f907; mov.b32 {%rs395, %rs401}, %r361; // begin inline asm { cvt.f32.f16 %f668, %rs395;} // end inline asm fma.rn.ftz.f32 %f917, %f912, %f668, %f908; mov.b32 {%rs396, %rs402}, %r369; // begin inline asm { cvt.f32.f16 %f669, %rs396;} // end inline asm fma.rn.ftz.f32 %f918, %f912, %f669, %f909; shl.b16 %rs570, %rs41, 4; cvt.s16.s8 %rs571, %rs570; shr.s16 %rs572, %rs571, 7; and.b16 %rs573, %rs572, -16; or.b16 %rs574, %rs573, %rs42; cvt.rn.f32.s16 %f919, %rs574; sub.ftz.f32 %f920, %f919, %f8; mul.ftz.f32 %f921, %f81, %f920; // begin inline asm { cvt.f32.f16 %f670, %rs397;} // end inline asm fma.rn.ftz.f32 %f922, %f921, %f670, %f913; // begin inline asm { cvt.f32.f16 %f671, %rs398;} // end inline asm fma.rn.ftz.f32 %f923, %f921, %f671, %f914; // begin inline asm { cvt.f32.f16 %f672, %rs399;} // end inline asm fma.rn.ftz.f32 %f924, %f921, 
%f672, %f915; // begin inline asm { cvt.f32.f16 %f673, %rs400;} // end inline asm fma.rn.ftz.f32 %f925, %f921, %f673, %f916; // begin inline asm { cvt.f32.f16 %f674, %rs401;} // end inline asm fma.rn.ftz.f32 %f926, %f921, %f674, %f917; // begin inline asm { cvt.f32.f16 %f675, %rs402;} // end inline asm fma.rn.ftz.f32 %f927, %f921, %f675, %f918; shl.b16 %rs575, %rs43, 4; cvt.s16.s8 %rs576, %rs575; shr.s16 %rs577, %rs576, 7; and.b16 %rs578, %rs577, -16; or.b16 %rs579, %rs578, %rs44; cvt.rn.f32.s16 %f928, %rs579; sub.ftz.f32 %f929, %f928, %f8; mul.ftz.f32 %f930, %f81, %f929; mov.b32 {%rs403, %rs409}, %r330; // begin inline asm { cvt.f32.f16 %f676, %rs403;} // end inline asm fma.rn.ftz.f32 %f931, %f930, %f676, %f922; mov.b32 {%rs404, %rs410}, %r338; // begin inline asm { cvt.f32.f16 %f677, %rs404;} // end inline asm fma.rn.ftz.f32 %f932, %f930, %f677, %f923; mov.b32 {%rs405, %rs411}, %r346; // begin inline asm { cvt.f32.f16 %f678, %rs405;} // end inline asm fma.rn.ftz.f32 %f933, %f930, %f678, %f924; mov.b32 {%rs406, %rs412}, %r354; // begin inline asm { cvt.f32.f16 %f679, %rs406;} // end inline asm fma.rn.ftz.f32 %f934, %f930, %f679, %f925; mov.b32 {%rs407, %rs413}, %r362; // begin inline asm { cvt.f32.f16 %f680, %rs407;} // end inline asm fma.rn.ftz.f32 %f935, %f930, %f680, %f926; mov.b32 {%rs408, %rs414}, %r370; // begin inline asm { cvt.f32.f16 %f681, %rs408;} // end inline asm fma.rn.ftz.f32 %f936, %f930, %f681, %f927; shl.b16 %rs580, %rs45, 4; cvt.s16.s8 %rs581, %rs580; shr.s16 %rs582, %rs581, 7; and.b16 %rs583, %rs582, -16; or.b16 %rs584, %rs583, %rs46; cvt.rn.f32.s16 %f937, %rs584; sub.ftz.f32 %f938, %f937, %f8; mul.ftz.f32 %f939, %f81, %f938; // begin inline asm { cvt.f32.f16 %f682, %rs409;} // end inline asm fma.rn.ftz.f32 %f940, %f939, %f682, %f931; // begin inline asm { cvt.f32.f16 %f683, %rs410;} // end inline asm fma.rn.ftz.f32 %f941, %f939, %f683, %f932; // begin inline asm { cvt.f32.f16 %f684, %rs411;} // end inline asm fma.rn.ftz.f32 %f942, %f939, 
%f684, %f933; // begin inline asm { cvt.f32.f16 %f685, %rs412;} // end inline asm fma.rn.ftz.f32 %f943, %f939, %f685, %f934; // begin inline asm { cvt.f32.f16 %f686, %rs413;} // end inline asm fma.rn.ftz.f32 %f944, %f939, %f686, %f935; // begin inline asm { cvt.f32.f16 %f687, %rs414;} // end inline asm fma.rn.ftz.f32 %f945, %f939, %f687, %f936; shl.b16 %rs585, %rs47, 4; cvt.s16.s8 %rs586, %rs585; shr.s16 %rs587, %rs586, 7; and.b16 %rs588, %rs587, -16; or.b16 %rs589, %rs588, %rs48; cvt.rn.f32.s16 %f946, %rs589; sub.ftz.f32 %f947, %f946, %f8; mul.ftz.f32 %f948, %f81, %f947; mov.b32 {%rs415, %rs421}, %r331; // begin inline asm { cvt.f32.f16 %f688, %rs415;} // end inline asm fma.rn.ftz.f32 %f949, %f948, %f688, %f940; mov.b32 {%rs416, %rs422}, %r339; // begin inline asm { cvt.f32.f16 %f689, %rs416;} // end inline asm fma.rn.ftz.f32 %f950, %f948, %f689, %f941; mov.b32 {%rs417, %rs423}, %r347; // begin inline asm { cvt.f32.f16 %f690, %rs417;} // end inline asm fma.rn.ftz.f32 %f951, %f948, %f690, %f942; mov.b32 {%rs418, %rs424}, %r355; // begin inline asm { cvt.f32.f16 %f691, %rs418;} // end inline asm fma.rn.ftz.f32 %f952, %f948, %f691, %f943; mov.b32 {%rs419, %rs425}, %r363; // begin inline asm { cvt.f32.f16 %f692, %rs419;} // end inline asm fma.rn.ftz.f32 %f953, %f948, %f692, %f944; mov.b32 {%rs420, %rs426}, %r371; // begin inline asm { cvt.f32.f16 %f693, %rs420;} // end inline asm fma.rn.ftz.f32 %f954, %f948, %f693, %f945; shl.b16 %rs590, %rs49, 4; cvt.s16.s8 %rs591, %rs590; shr.s16 %rs592, %rs591, 7; and.b16 %rs593, %rs592, -16; or.b16 %rs594, %rs593, %rs49; cvt.rn.f32.s16 %f955, %rs594; sub.ftz.f32 %f956, %f955, %f8; mul.ftz.f32 %f957, %f81, %f956; // begin inline asm { cvt.f32.f16 %f694, %rs421;} // end inline asm fma.rn.ftz.f32 %f958, %f957, %f694, %f949; // begin inline asm { cvt.f32.f16 %f695, %rs422;} // end inline asm fma.rn.ftz.f32 %f959, %f957, %f695, %f950; // begin inline asm { cvt.f32.f16 %f696, %rs423;} // end inline asm fma.rn.ftz.f32 %f960, %f957, 
%f696, %f951; // begin inline asm { cvt.f32.f16 %f697, %rs424;} // end inline asm fma.rn.ftz.f32 %f961, %f957, %f697, %f952; // begin inline asm { cvt.f32.f16 %f698, %rs425;} // end inline asm fma.rn.ftz.f32 %f962, %f957, %f698, %f953; // begin inline asm { cvt.f32.f16 %f699, %rs426;} // end inline asm fma.rn.ftz.f32 %f963, %f957, %f699, %f954; ld.global.v4.u32 {%r376, %r377, %r378, %r379}, [%rd7+48]; ld.global.v4.u32 {%r384, %r385, %r386, %r387}, [%rd9+32]; ld.global.v4.u32 {%r392, %r393, %r394, %r395}, [%rd8+48]; ld.global.v4.u32 {%r400, %r401, %r402, %r403}, [%rd10+32]; ld.global.v4.u32 {%r408, %r409, %r410, %r411}, [%rd11+32]; ld.global.v4.u32 {%r416, %r417, %r418, %r419}, [%rd12+32]; shl.b16 %rs595, %rs50, 4; cvt.s16.s8 %rs596, %rs595; shr.s16 %rs597, %rs596, 7; and.b16 %rs598, %rs597, -16; or.b16 %rs599, %rs598, %rs51; cvt.rn.f32.s16 %f964, %rs599; sub.ftz.f32 %f965, %f964, %f8; mul.ftz.f32 %f966, %f81, %f965; mov.b32 {%rs427, %rs433}, %r376; // begin inline asm { cvt.f32.f16 %f700, %rs427;} // end inline asm fma.rn.ftz.f32 %f967, %f966, %f700, %f958; mov.b32 {%rs428, %rs434}, %r384; // begin inline asm { cvt.f32.f16 %f701, %rs428;} // end inline asm fma.rn.ftz.f32 %f968, %f966, %f701, %f959; mov.b32 {%rs429, %rs435}, %r392; // begin inline asm { cvt.f32.f16 %f702, %rs429;} // end inline asm fma.rn.ftz.f32 %f969, %f966, %f702, %f960; mov.b32 {%rs430, %rs436}, %r400; // begin inline asm { cvt.f32.f16 %f703, %rs430;} // end inline asm fma.rn.ftz.f32 %f970, %f966, %f703, %f961; mov.b32 {%rs431, %rs437}, %r408; // begin inline asm { cvt.f32.f16 %f704, %rs431;} // end inline asm fma.rn.ftz.f32 %f971, %f966, %f704, %f962; mov.b32 {%rs432, %rs438}, %r416; // begin inline asm { cvt.f32.f16 %f705, %rs432;} // end inline asm fma.rn.ftz.f32 %f972, %f966, %f705, %f963; shl.b16 %rs600, %rs52, 4; cvt.s16.s8 %rs601, %rs600; shr.s16 %rs602, %rs601, 7; and.b16 %rs603, %rs602, -16; or.b16 %rs604, %rs603, %rs53; cvt.rn.f32.s16 %f973, %rs604; sub.ftz.f32 %f974, %f973, %f8; 
mul.ftz.f32 %f975, %f81, %f974; // begin inline asm { cvt.f32.f16 %f706, %rs433;} // end inline asm fma.rn.ftz.f32 %f976, %f975, %f706, %f967; // begin inline asm { cvt.f32.f16 %f707, %rs434;} // end inline asm fma.rn.ftz.f32 %f977, %f975, %f707, %f968; // begin inline asm { cvt.f32.f16 %f708, %rs435;} // end inline asm fma.rn.ftz.f32 %f978, %f975, %f708, %f969; // begin inline asm { cvt.f32.f16 %f709, %rs436;} // end inline asm fma.rn.ftz.f32 %f979, %f975, %f709, %f970; // begin inline asm { cvt.f32.f16 %f710, %rs437;} // end inline asm fma.rn.ftz.f32 %f980, %f975, %f710, %f971; // begin inline asm { cvt.f32.f16 %f711, %rs438;} // end inline asm fma.rn.ftz.f32 %f981, %f975, %f711, %f972; shl.b16 %rs605, %rs54, 4; cvt.s16.s8 %rs606, %rs605; shr.s16 %rs607, %rs606, 7; and.b16 %rs608, %rs607, -16; or.b16 %rs609, %rs608, %rs55; cvt.rn.f32.s16 %f982, %rs609; sub.ftz.f32 %f983, %f982, %f8; mul.ftz.f32 %f984, %f81, %f983; mov.b32 {%rs439, %rs445}, %r377; // begin inline asm { cvt.f32.f16 %f712, %rs439;} // end inline asm fma.rn.ftz.f32 %f985, %f984, %f712, %f976; mov.b32 {%rs440, %rs446}, %r385; // begin inline asm { cvt.f32.f16 %f713, %rs440;} // end inline asm fma.rn.ftz.f32 %f986, %f984, %f713, %f977; mov.b32 {%rs441, %rs447}, %r393; // begin inline asm { cvt.f32.f16 %f714, %rs441;} // end inline asm fma.rn.ftz.f32 %f987, %f984, %f714, %f978; mov.b32 {%rs442, %rs448}, %r401; // begin inline asm { cvt.f32.f16 %f715, %rs442;} // end inline asm fma.rn.ftz.f32 %f988, %f984, %f715, %f979; mov.b32 {%rs443, %rs449}, %r409; // begin inline asm { cvt.f32.f16 %f716, %rs443;} // end inline asm fma.rn.ftz.f32 %f989, %f984, %f716, %f980; mov.b32 {%rs444, %rs450}, %r417; // begin inline asm { cvt.f32.f16 %f717, %rs444;} // end inline asm fma.rn.ftz.f32 %f990, %f984, %f717, %f981; shl.b16 %rs610, %rs56, 4; cvt.s16.s8 %rs611, %rs610; shr.s16 %rs612, %rs611, 7; and.b16 %rs613, %rs612, -16; or.b16 %rs614, %rs613, %rs57; cvt.rn.f32.s16 %f991, %rs614; sub.ftz.f32 %f992, %f991, %f8; 
mul.ftz.f32 %f993, %f81, %f992; // begin inline asm { cvt.f32.f16 %f718, %rs445;} // end inline asm fma.rn.ftz.f32 %f994, %f993, %f718, %f985; // begin inline asm { cvt.f32.f16 %f719, %rs446;} // end inline asm fma.rn.ftz.f32 %f995, %f993, %f719, %f986; // begin inline asm { cvt.f32.f16 %f720, %rs447;} // end inline asm fma.rn.ftz.f32 %f996, %f993, %f720, %f987; // begin inline asm { cvt.f32.f16 %f721, %rs448;} // end inline asm fma.rn.ftz.f32 %f997, %f993, %f721, %f988; // begin inline asm { cvt.f32.f16 %f722, %rs449;} // end inline asm fma.rn.ftz.f32 %f998, %f993, %f722, %f989; // begin inline asm { cvt.f32.f16 %f723, %rs450;} // end inline asm fma.rn.ftz.f32 %f999, %f993, %f723, %f990; shl.b16 %rs615, %rs58, 4; cvt.s16.s8 %rs616, %rs615; shr.s16 %rs617, %rs616, 7; and.b16 %rs618, %rs617, -16; or.b16 %rs619, %rs618, %rs59; cvt.rn.f32.s16 %f1000, %rs619; sub.ftz.f32 %f1001, %f1000, %f8; mul.ftz.f32 %f1002, %f81, %f1001; mov.b32 {%rs451, %rs457}, %r378; // begin inline asm { cvt.f32.f16 %f724, %rs451;} // end inline asm fma.rn.ftz.f32 %f1003, %f1002, %f724, %f994; mov.b32 {%rs452, %rs458}, %r386; // begin inline asm { cvt.f32.f16 %f725, %rs452;} // end inline asm fma.rn.ftz.f32 %f1004, %f1002, %f725, %f995; mov.b32 {%rs453, %rs459}, %r394; // begin inline asm { cvt.f32.f16 %f726, %rs453;} // end inline asm fma.rn.ftz.f32 %f1005, %f1002, %f726, %f996; mov.b32 {%rs454, %rs460}, %r402; // begin inline asm { cvt.f32.f16 %f727, %rs454;} // end inline asm fma.rn.ftz.f32 %f1006, %f1002, %f727, %f997; mov.b32 {%rs455, %rs461}, %r410; // begin inline asm { cvt.f32.f16 %f728, %rs455;} // end inline asm fma.rn.ftz.f32 %f1007, %f1002, %f728, %f998; mov.b32 {%rs456, %rs462}, %r418; // begin inline asm { cvt.f32.f16 %f729, %rs456;} // end inline asm fma.rn.ftz.f32 %f1008, %f1002, %f729, %f999; shl.b16 %rs620, %rs60, 4; cvt.s16.s8 %rs621, %rs620; shr.s16 %rs622, %rs621, 7; and.b16 %rs623, %rs622, -16; or.b16 %rs624, %rs623, %rs61; cvt.rn.f32.s16 %f1009, %rs624; sub.ftz.f32 
%f1010, %f1009, %f8; mul.ftz.f32 %f1011, %f81, %f1010; // begin inline asm { cvt.f32.f16 %f730, %rs457;} // end inline asm fma.rn.ftz.f32 %f1012, %f1011, %f730, %f1003; // begin inline asm { cvt.f32.f16 %f731, %rs458;} // end inline asm fma.rn.ftz.f32 %f1013, %f1011, %f731, %f1004; // begin inline asm { cvt.f32.f16 %f732, %rs459;} // end inline asm fma.rn.ftz.f32 %f1014, %f1011, %f732, %f1005; // begin inline asm { cvt.f32.f16 %f733, %rs460;} // end inline asm fma.rn.ftz.f32 %f1015, %f1011, %f733, %f1006; // begin inline asm { cvt.f32.f16 %f734, %rs461;} // end inline asm fma.rn.ftz.f32 %f1016, %f1011, %f734, %f1007; // begin inline asm { cvt.f32.f16 %f735, %rs462;} // end inline asm fma.rn.ftz.f32 %f1017, %f1011, %f735, %f1008; shl.b16 %rs625, %rs62, 4; cvt.s16.s8 %rs626, %rs625; shr.s16 %rs627, %rs626, 7; and.b16 %rs628, %rs627, -16; or.b16 %rs629, %rs628, %rs63; cvt.rn.f32.s16 %f1018, %rs629; sub.ftz.f32 %f1019, %f1018, %f8; mul.ftz.f32 %f1020, %f81, %f1019; mov.b32 {%rs463, %rs469}, %r379; // begin inline asm { cvt.f32.f16 %f736, %rs463;} // end inline asm fma.rn.ftz.f32 %f1021, %f1020, %f736, %f1012; mov.b32 {%rs464, %rs470}, %r387; // begin inline asm { cvt.f32.f16 %f737, %rs464;} // end inline asm fma.rn.ftz.f32 %f1022, %f1020, %f737, %f1013; mov.b32 {%rs465, %rs471}, %r395; // begin inline asm { cvt.f32.f16 %f738, %rs465;} // end inline asm fma.rn.ftz.f32 %f1023, %f1020, %f738, %f1014; mov.b32 {%rs466, %rs472}, %r403; // begin inline asm { cvt.f32.f16 %f739, %rs466;} // end inline asm fma.rn.ftz.f32 %f1024, %f1020, %f739, %f1015; mov.b32 {%rs467, %rs473}, %r411; // begin inline asm { cvt.f32.f16 %f740, %rs467;} // end inline asm fma.rn.ftz.f32 %f1025, %f1020, %f740, %f1016; mov.b32 {%rs468, %rs474}, %r419; // begin inline asm { cvt.f32.f16 %f741, %rs468;} // end inline asm fma.rn.ftz.f32 %f1026, %f1020, %f741, %f1017; shl.b16 %rs630, %rs64, 4; cvt.s16.s8 %rs631, %rs630; shr.s16 %rs632, %rs631, 7; and.b16 %rs633, %rs632, -16; or.b16 %rs634, %rs633, %rs64; 
cvt.rn.f32.s16 %f1027, %rs634; sub.ftz.f32 %f1028, %f1027, %f8; mul.ftz.f32 %f1029, %f81, %f1028; // begin inline asm { cvt.f32.f16 %f742, %rs469;} // end inline asm fma.rn.ftz.f32 %f1173, %f1029, %f742, %f1021; // begin inline asm { cvt.f32.f16 %f743, %rs470;} // end inline asm fma.rn.ftz.f32 %f1172, %f1029, %f743, %f1022; // begin inline asm { cvt.f32.f16 %f744, %rs471;} // end inline asm fma.rn.ftz.f32 %f1171, %f1029, %f744, %f1023; // begin inline asm { cvt.f32.f16 %f745, %rs472;} // end inline asm fma.rn.ftz.f32 %f1170, %f1029, %f745, %f1024; // begin inline asm { cvt.f32.f16 %f746, %rs473;} // end inline asm fma.rn.ftz.f32 %f1169, %f1029, %f746, %f1025; // begin inline asm { cvt.f32.f16 %f747, %rs474;} // end inline asm fma.rn.ftz.f32 %f1168, %f1029, %f747, %f1026; $L__BB0_8: add.s32 %r594, %r594, 4; shl.b32 %r424, %r594, 5; add.s32 %r593, %r424, %r58; shl.b32 %r592, %r593, 2; setp.lt.u32 %p7, %r592, %r55; @%p7 bra $L__BB0_2; $L__BB0_9: mov.u32 %r426, %tid.y; shl.b32 %r427, %r426, 5; add.s32 %r44, %r427, %r58; setp.lt.u32 %p8, %r44, 32; shl.b32 %r429, %r44, 2; mov.u32 %r430, _ZZ9gemv_int4ILi4ELi128ELi6EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage; add.s32 %r431, %r430, %r429; @%p8 bra $L__BB0_11; add.s32 %r586, %r431, -112; st.shared.f32 [%r586], %f1173; $L__BB0_11: setp.gt.u32 %p9, %r44, 31; bar.sync 0; mad.lo.s32 %r46, %r44, 12, %r430; @%p9 bra $L__BB0_13; mov.u32 %r450, 16; ld.shared.f32 %f1045, [%r46+16]; add.ftz.f32 %f1046, %f1173, %f1045; ld.shared.f32 %f1047, [%r46+20]; add.ftz.f32 %f1048, %f1046, %f1047; ld.shared.f32 %f1049, [%r46+24]; add.ftz.f32 %f1032, %f1048, %f1049; mov.u32 %r438, 1; mov.u32 %r451, 31; mov.u32 %r452, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1032, %r438, %r451, %r452; @p add.f32 r0, r0, %f1032; mov.f32 %f1030, r0;} // end inline asm mov.u32 %r441, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1030, %r441, %r451, %r452; @p add.f32 r0, r0, %f1030; 
mov.f32 %f1033, r0;} // end inline asm mov.u32 %r444, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1033, %r444, %r451, %r452; @p add.f32 r0, r0, %f1033; mov.f32 %f1036, r0;} // end inline asm mov.u32 %r447, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1036, %r447, %r451, %r452; @p add.f32 r0, r0, %f1036; mov.f32 %f1039, r0;} // end inline asm // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1039, %r450, %r451, %r452; @p add.f32 r0, r0, %f1039; mov.f32 %f1173, r0;} // end inline asm $L__BB0_13: @%p8 bra $L__BB0_15; add.s32 %r587, %r431, -112; st.shared.f32 [%r587+640], %f1172; $L__BB0_15: bar.sync 0; @%p9 bra $L__BB0_17; ld.shared.f32 %f1065, [%r46+656]; add.ftz.f32 %f1066, %f1172, %f1065; ld.shared.f32 %f1067, [%r46+660]; add.ftz.f32 %f1068, %f1066, %f1067; ld.shared.f32 %f1069, [%r46+664]; add.ftz.f32 %f1052, %f1068, %f1069; mov.u32 %r462, 1; mov.u32 %r475, 31; mov.u32 %r476, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1052, %r462, %r475, %r476; @p add.f32 r0, r0, %f1052; mov.f32 %f1050, r0;} // end inline asm mov.u32 %r465, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1050, %r465, %r475, %r476; @p add.f32 r0, r0, %f1050; mov.f32 %f1053, r0;} // end inline asm mov.u32 %r468, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1053, %r468, %r475, %r476; @p add.f32 r0, r0, %f1053; mov.f32 %f1056, r0;} // end inline asm mov.u32 %r471, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1056, %r471, %r475, %r476; @p add.f32 r0, r0, %f1056; mov.f32 %f1059, r0;} // end inline asm mov.u32 %r474, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1059, %r474, %r475, %r476; @p add.f32 r0, r0, %f1059; mov.f32 %f1172, r0;} // end inline asm $L__BB0_17: @%p8 bra $L__BB0_19; add.s32 %r588, %r431, -112; st.shared.f32 
[%r588+1280], %f1171; $L__BB0_19: bar.sync 0; @%p9 bra $L__BB0_21; ld.shared.f32 %f1085, [%r46+1296]; add.ftz.f32 %f1086, %f1171, %f1085; ld.shared.f32 %f1087, [%r46+1300]; add.ftz.f32 %f1088, %f1086, %f1087; ld.shared.f32 %f1089, [%r46+1304]; add.ftz.f32 %f1072, %f1088, %f1089; mov.u32 %r486, 1; mov.u32 %r499, 31; mov.u32 %r500, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1072, %r486, %r499, %r500; @p add.f32 r0, r0, %f1072; mov.f32 %f1070, r0;} // end inline asm mov.u32 %r489, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1070, %r489, %r499, %r500; @p add.f32 r0, r0, %f1070; mov.f32 %f1073, r0;} // end inline asm mov.u32 %r492, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1073, %r492, %r499, %r500; @p add.f32 r0, r0, %f1073; mov.f32 %f1076, r0;} // end inline asm mov.u32 %r495, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1076, %r495, %r499, %r500; @p add.f32 r0, r0, %f1076; mov.f32 %f1079, r0;} // end inline asm mov.u32 %r498, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1079, %r498, %r499, %r500; @p add.f32 r0, r0, %f1079; mov.f32 %f1171, r0;} // end inline asm $L__BB0_21: @%p8 bra $L__BB0_23; add.s32 %r589, %r431, -112; st.shared.f32 [%r589+1920], %f1170; $L__BB0_23: bar.sync 0; @%p9 bra $L__BB0_25; ld.shared.f32 %f1105, [%r46+1936]; add.ftz.f32 %f1106, %f1170, %f1105; ld.shared.f32 %f1107, [%r46+1940]; add.ftz.f32 %f1108, %f1106, %f1107; ld.shared.f32 %f1109, [%r46+1944]; add.ftz.f32 %f1092, %f1108, %f1109; mov.u32 %r510, 1; mov.u32 %r523, 31; mov.u32 %r524, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1092, %r510, %r523, %r524; @p add.f32 r0, r0, %f1092; mov.f32 %f1090, r0;} // end inline asm mov.u32 %r513, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1090, %r513, %r523, %r524; @p add.f32 r0, r0, %f1090; mov.f32 
%f1093, r0;} // end inline asm mov.u32 %r516, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1093, %r516, %r523, %r524; @p add.f32 r0, r0, %f1093; mov.f32 %f1096, r0;} // end inline asm mov.u32 %r519, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1096, %r519, %r523, %r524; @p add.f32 r0, r0, %f1096; mov.f32 %f1099, r0;} // end inline asm mov.u32 %r522, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1099, %r522, %r523, %r524; @p add.f32 r0, r0, %f1099; mov.f32 %f1170, r0;} // end inline asm $L__BB0_25: @%p8 bra $L__BB0_27; add.s32 %r590, %r431, -112; st.shared.f32 [%r590+2560], %f1169; $L__BB0_27: bar.sync 0; @%p9 bra $L__BB0_29; ld.shared.f32 %f1125, [%r46+2576]; add.ftz.f32 %f1126, %f1169, %f1125; ld.shared.f32 %f1127, [%r46+2580]; add.ftz.f32 %f1128, %f1126, %f1127; ld.shared.f32 %f1129, [%r46+2584]; add.ftz.f32 %f1112, %f1128, %f1129; mov.u32 %r534, 1; mov.u32 %r547, 31; mov.u32 %r548, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1112, %r534, %r547, %r548; @p add.f32 r0, r0, %f1112; mov.f32 %f1110, r0;} // end inline asm mov.u32 %r537, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1110, %r537, %r547, %r548; @p add.f32 r0, r0, %f1110; mov.f32 %f1113, r0;} // end inline asm mov.u32 %r540, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1113, %r540, %r547, %r548; @p add.f32 r0, r0, %f1113; mov.f32 %f1116, r0;} // end inline asm mov.u32 %r543, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1116, %r543, %r547, %r548; @p add.f32 r0, r0, %f1116; mov.f32 %f1119, r0;} // end inline asm mov.u32 %r546, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1119, %r546, %r547, %r548; @p add.f32 r0, r0, %f1119; mov.f32 %f1169, r0;} // end inline asm $L__BB0_29: @%p8 bra $L__BB0_31; add.s32 %r591, %r431, -112; 
st.shared.f32 [%r591+3200], %f1168; $L__BB0_31: bar.sync 0; @%p9 bra $L__BB0_33; ld.shared.f32 %f1145, [%r46+3216]; add.ftz.f32 %f1146, %f1168, %f1145; ld.shared.f32 %f1147, [%r46+3220]; add.ftz.f32 %f1148, %f1146, %f1147; ld.shared.f32 %f1149, [%r46+3224]; add.ftz.f32 %f1132, %f1148, %f1149; mov.u32 %r558, 1; mov.u32 %r571, 31; mov.u32 %r572, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1132, %r558, %r571, %r572; @p add.f32 r0, r0, %f1132; mov.f32 %f1130, r0;} // end inline asm mov.u32 %r561, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1130, %r561, %r571, %r572; @p add.f32 r0, r0, %f1130; mov.f32 %f1133, r0;} // end inline asm mov.u32 %r564, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1133, %r564, %r571, %r572; @p add.f32 r0, r0, %f1133; mov.f32 %f1136, r0;} // end inline asm mov.u32 %r567, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1136, %r567, %r571, %r572; @p add.f32 r0, r0, %f1136; mov.f32 %f1139, r0;} // end inline asm mov.u32 %r570, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f1139, %r570, %r571, %r572; @p add.f32 r0, r0, %f1139; mov.f32 %f1168, r0;} // end inline asm $L__BB0_33: or.b32 %r575, %r58, %r426; setp.ne.s32 %p20, %r575, 0; @%p20 bra $L__BB0_47; ld.param.u64 %rd87, [_Z28dequant_gemv_group128_batch623DequantGemvKernelParams_param_0+8]; setp.eq.s64 %p21, %rd87, 0; mul.ftz.f32 %f1186, %f73, %f1173; mov.u32 %r576, %ctaid.x; cvt.s64.s32 %rd13, %r576; @%p21 bra $L__BB0_36; shl.b64 %rd54, %rd13, 1; add.s64 %rd55, %rd2, %rd54; ld.global.u16 %rs635, [%rd55]; // begin inline asm { cvt.f32.f16 %f1150, %rs635;} // end inline asm fma.rn.ftz.f32 %f1186, %f74, %f1150, %f1186; $L__BB0_36: ld.param.u64 %rd88, [_Z28dequant_gemv_group128_batch623DequantGemvKernelParams_param_0]; // begin inline asm { cvt.rn.f16.f32 %rs636, %f1186;} // end inline asm // begin inline asm 
createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd56, 1.0; // end inline asm shl.b64 %rd59, %rd13, 1; add.s64 %rd57, %rd88, %rd59; // begin inline asm st.global.L1::evict_normal.L2::cache_hint.u16 [%rd57], %rs636, %rd56; // end inline asm mul.ftz.f32 %f1187, %f73, %f1172; add.s32 %r578, %r54, %r576; cvt.s64.s32 %rd16, %r578; @%p21 bra $L__BB0_38; shl.b64 %rd60, %rd16, 1; add.s64 %rd61, %rd2, %rd60; ld.global.u16 %rs638, [%rd61]; // begin inline asm { cvt.f32.f16 %f1152, %rs638;} // end inline asm fma.rn.ftz.f32 %f1187, %f74, %f1152, %f1187; $L__BB0_38: cvt.s64.s32 %rd17, %r54; mul.wide.s32 %rd65, %r54, 2; add.s64 %rd63, %rd57, %rd65; // begin inline asm { cvt.rn.f16.f32 %rs639, %f1187;} // end inline asm // begin inline asm createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd62, 1.0; // end inline asm // begin inline asm st.global.L1::evict_normal.L2::cache_hint.u16 [%rd63], %rs639, %rd62; // end inline asm mul.ftz.f32 %f1188, %f73, %f1171; cvt.u32.u64 %r579, %rd16; add.s32 %r580, %r579, %r54; cvt.s64.s32 %rd18, %r580; @%p21 bra $L__BB0_40; shl.b64 %rd66, %rd18, 1; add.s64 %rd67, %rd2, %rd66; ld.global.u16 %rs641, [%rd67]; // begin inline asm { cvt.f32.f16 %f1154, %rs641;} // end inline asm fma.rn.ftz.f32 %f1188, %f74, %f1154, %f1188; $L__BB0_40: ld.param.u64 %rd89, [_Z28dequant_gemv_group128_batch623DequantGemvKernelParams_param_0]; shl.b64 %rd71, %rd18, 1; add.s64 %rd69, %rd89, %rd71; // begin inline asm { cvt.rn.f16.f32 %rs642, %f1188;} // end inline asm // begin inline asm createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd68, 1.0; // end inline asm // begin inline asm st.global.L1::evict_normal.L2::cache_hint.u16 [%rd69], %rs642, %rd68; // end inline asm mul.ftz.f32 %f1189, %f73, %f1170; cvt.u32.u64 %r581, %rd18; add.s32 %r582, %r581, %r54; cvt.s64.s32 %rd20, %r582; @%p21 bra $L__BB0_42; shl.b64 %rd72, %rd20, 1; add.s64 %rd73, %rd2, %rd72; ld.global.u16 %rs644, [%rd73]; // begin inline asm { cvt.f32.f16 %f1156, 
%rs644;}
// end inline asm
fma.rn.ftz.f32 %f1189, %f74, %f1156, %f1189;
$L__BB0_42:
// Convert the f32 accumulator to f16 and store it through an L2
// evict_last / evict_unchanged cache policy (created via createpolicy,
// applied by st.global...L2::cache_hint).
// begin inline asm
{ cvt.rn.f16.f32 %rs645, %f1189;}
// end inline asm
// begin inline asm
createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd74, 1.0;
// end inline asm
shl.b64 %rd21, %rd17, 1;               // %rd21 = row stride in bytes (2 bytes per half)
add.s64 %rd75, %rd69, %rd21;           // advance output pointer by one stride
// begin inline asm
st.global.L1::evict_normal.L2::cache_hint.u16 [%rd75], %rs645, %rd74;
// end inline asm
mul.ftz.f32 %f1190, %f73, %f1169;      // scale next accumulator by %f73 (alpha-like param at param_0+64)
cvt.u32.u64 %r583, %rd20;
add.s32 %r584, %r583, %r54;
cvt.s64.s32 %rd23, %r584;
// %p21 was set earlier from (param_0+8 pointer == 0); when null, skip the
// load-and-accumulate from that buffer (presumably a bias/residual input —
// TODO confirm against the CUDA source).
@%p21 bra $L__BB0_44;
shl.b64 %rd77, %rd23, 1;
add.s64 %rd78, %rd2, %rd77;
ld.global.u16 %rs647, [%rd78];         // load half from the optional buffer
// begin inline asm
{ cvt.f32.f16 %f1158, %rs647;}
// end inline asm
fma.rn.ftz.f32 %f1190, %f74, %f1158, %f1190;  // accumulate, scaled by %f74 (beta-like param at param_0+68)
$L__BB0_44:
// begin inline asm
{ cvt.rn.f16.f32 %rs648, %f1190;}
// end inline asm
// begin inline asm
createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd79, 1.0;
// end inline asm
add.s64 %rd80, %rd75, %rd21;           // next output slot, one stride further
// begin inline asm
st.global.L1::evict_normal.L2::cache_hint.u16 [%rd80], %rs648, %rd79;
// end inline asm
mul.ftz.f32 %f1191, %f73, %f1168;      // last accumulator of the batch-of-6 epilogue
cvt.u32.u64 %r585, %rd23;
add.s32 %r48, %r585, %r54;
@%p21 bra $L__BB0_46;                  // same null-pointer guard as above
mul.wide.s32 %rd82, %r48, 2;
add.s64 %rd83, %rd2, %rd82;
ld.global.u16 %rs650, [%rd83];
// begin inline asm
{ cvt.f32.f16 %f1160, %rs650;}
// end inline asm
fma.rn.ftz.f32 %f1191, %f74, %f1160, %f1191;
$L__BB0_46:
// begin inline asm
{ cvt.rn.f16.f32 %rs651, %f1191;}
// end inline asm
// begin inline asm
createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd84, 1.0;
// end inline asm
add.s64 %rd85, %rd80, %rd21;
// begin inline asm
st.global.L1::evict_normal.L2::cache_hint.u16 [%rd85], %rs651, %rd84;
// end inline asm
$L__BB0_47:
ret;
}
// .globl _ZN3cub11EmptyKernelIvEEvv
// CUB's EmptyKernel<void> instantiation: a no-op entry point whose body is a
// single ret. CUB emits it for device-side bookkeeping; it takes no
// parameters and has no effects.
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
ret;
}