8 _ZN37_INTERNAL_36609967_7_gemv_cu_1f8a2ee16thrust12placeholders2_6E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_1f8a2ee16thrust12placeholders2_7E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_1f8a2ee16thrust12placeholders2_8E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_1f8a2ee16thrust12placeholders2_9E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_1f8a2ee16thrust12placeholders3_10E[1]; .visible .entry _Z28dequant_gemv_group128_batch523DequantGemvKernelParams( .param .align 8 .b8 _Z28dequant_gemv_group128_batch523DequantGemvKernelParams_param_0[80] ) { .reg .pred %p<24>; .reg .b16 %rs<587>; .reg .f32 %f<1027>; .reg .b32 %r<513>; .reg .b64 %rd<82>; // demoted variable .shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi128ELi5EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[3200]; ld.param.v2.u32 {%r48, %r49}, [_Z28dequant_gemv_group128_batch523DequantGemvKernelParams_param_0+48]; ld.param.v2.u32 {%r50, %r51}, [_Z28dequant_gemv_group128_batch523DequantGemvKernelParams_param_0+56]; ld.param.v2.f32 {%f62, %f63}, [_Z28dequant_gemv_group128_batch523DequantGemvKernelParams_param_0+64]; ld.param.v4.u8 {%rs73, %rs74, %rs75, %rs76}, [_Z28dequant_gemv_group128_batch523DequantGemvKernelParams_param_0+72]; ld.param.u64 %rd24, [_Z28dequant_gemv_group128_batch523DequantGemvKernelParams_param_0+40]; ld.param.u64 %rd23, [_Z28dequant_gemv_group128_batch523DequantGemvKernelParams_param_0+32]; ld.param.u64 %rd22, [_Z28dequant_gemv_group128_batch523DequantGemvKernelParams_param_0+24]; ld.param.u64 %rd21, [_Z28dequant_gemv_group128_batch523DequantGemvKernelParams_param_0+16]; ld.param.u64 %rd20, [_Z28dequant_gemv_group128_batch523DequantGemvKernelParams_param_0+8]; cvta.to.global.u64 %rd2, %rd20; mov.u32 %r512, %tid.y; shl.b32 %r52, %r512, 5; mov.u32 %r53, %tid.x; add.s32 %r511, %r52, %r53; shl.b32 %r510, %r511, 2; setp.ge.u32 %p1, %r510, %r50; mov.f32 %f1007, 0f00000000; mov.f32 %f1008, %f1007; mov.f32 %f1009, %f1007; mov.f32 %f1010, 
%f1007; mov.f32 %f1011, %f1007; @%p1 bra $L__BB0_9; cvta.to.global.u64 %rd3, %rd21; mov.u32 %r54, %ctaid.x; mul.lo.s32 %r6, %r51, %r54; $L__BB0_2: mad.lo.s32 %r60, %r50, %r54, %r510; mul.wide.u32 %rd31, %r60, 4; add.s64 %rd26, %rd22, %rd31; // begin inline asm createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd25, 1.0; // end inline asm // begin inline asm ld.global.L1::no_allocate.L2::cache_hint.v4.u32 {%r55,%r56,%r57,%r58}, [%rd26], %rd25; // end inline asm shr.u32 %r62, %r53, 2; shl.b32 %r63, %r512, 3; add.s32 %r14, %r63, %r62; add.s32 %r15, %r14, %r6; mul.wide.s32 %rd32, %r15, 2; add.s64 %rd29, %rd24, %rd32; // begin inline asm createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd28, 1.0; // end inline asm // begin inline asm ld.global.L1::evict_last.L2::cache_hint.u16 %rs81, [%rd29], %rd28; // end inline asm // begin inline asm { cvt.f32.f16 %f69, %rs81;} // end inline asm shl.b16 %rs586, %rs73, 3; setp.eq.s64 %p2, %rd23, 0; @%p2 bra $L__BB0_4; shr.u32 %r64, %r15, 31; add.s32 %r65, %r15, %r64; shr.s32 %r66, %r65, 1; cvt.s64.s32 %rd36, %r66; add.s64 %rd34, %rd23, %rd36; // begin inline asm createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd33, 1.0; // end inline asm // begin inline asm ld.global.L1::evict_last.L2::cache_hint.u8 %rs83, [%rd34], %rd33; // end inline asm cvt.u32.u16 %r67, %rs83; and.b32 %r68, %r67, 255; shl.b32 %r69, %r14, 2; and.b32 %r70, %r69, 4; shr.u32 %r71, %r68, %r70; cvt.u16.u32 %rs84, %r71; and.b16 %rs586, %rs84, 15; $L__BB0_4: shl.b32 %r16, %r511, 5; setp.ge.s32 %p3, %r16, %r48; @%p3 bra $L__BB0_8; setp.eq.s16 %p4, %rs73, 0; shr.u16 %rs86, %rs586, 3; and.b16 %rs87, %rs86, 1; setp.eq.b16 %p5, %rs87, 1; and.pred %p6, %p4, %p5; selp.b16 %rs88, -16, 0, %p6; or.b16 %rs89, %rs88, %rs586; cvt.s16.s8 %rs90, %rs89; cvt.rn.f32.s16 %f7, %rs90; mul.wide.s32 %rd37, %r16, 2; add.s64 %rd7, %rd3, %rd37; ld.global.v4.u32 {%r72, %r73, %r74, %r75}, [%rd7]; mul.wide.s32 %rd38, %r48, 2; add.s64 %rd39, %rd7, %rd38; 
ld.global.v4.u32 {%r76, %r77, %r78, %r79}, [%rd39]; add.s32 %r80, %r16, %r48; add.s32 %r81, %r80, %r48; shl.b32 %r82, %r48, 1; mul.wide.s32 %rd40, %r82, 2; add.s64 %rd41, %rd7, %rd40; ld.global.v4.u32 {%r83, %r84, %r85, %r86}, [%rd41]; add.s32 %r87, %r81, %r48; mul.wide.s32 %rd42, %r87, 2; add.s64 %rd8, %rd3, %rd42; ld.global.v4.u32 {%r88, %r89, %r90, %r91}, [%rd8]; add.s64 %rd43, %rd41, %rd40; ld.global.v4.u32 {%r92, %r93, %r94, %r95}, [%rd43]; cvt.u16.u32 %rs5, %r55; and.b16 %rs6, %rs5, 15; shr.u32 %r96, %r55, 4; cvt.u16.u32 %rs7, %r96; and.b16 %rs8, %rs7, 15; shr.u32 %r97, %r55, 8; cvt.u16.u32 %rs9, %r97; and.b16 %rs10, %rs9, 15; shr.u32 %r98, %r55, 12; cvt.u16.u32 %rs11, %r98; and.b16 %rs12, %rs11, 15; shr.u32 %r99, %r55, 16; cvt.u16.u32 %rs13, %r99; and.b16 %rs14, %rs13, 15; shr.u32 %r100, %r55, 20; cvt.u16.u32 %rs15, %r100; and.b16 %rs16, %rs15, 15; shr.u32 %r101, %r55, 24; cvt.u16.u32 %rs17, %r101; and.b16 %rs18, %rs17, 15; shr.u32 %r102, %r55, 28; cvt.u16.u32 %rs19, %r102; add.s32 %r103, %r80, 8; mul.wide.s32 %rd44, %r103, 2; add.s64 %rd9, %rd3, %rd44; add.s32 %r104, %r103, %r48; mul.wide.s32 %rd45, %r104, 2; add.s64 %rd10, %rd3, %rd45; add.s32 %r105, %r104, %r48; add.s32 %r106, %r105, %r48; mul.wide.s32 %rd46, %r106, 2; add.s64 %rd11, %rd3, %rd46; cvt.u16.u32 %rs20, %r56; and.b16 %rs21, %rs20, 15; shr.u32 %r107, %r56, 4; cvt.u16.u32 %rs22, %r107; and.b16 %rs23, %rs22, 15; shr.u32 %r108, %r56, 8; cvt.u16.u32 %rs24, %r108; and.b16 %rs25, %rs24, 15; shr.u32 %r109, %r56, 12; cvt.u16.u32 %rs26, %r109; and.b16 %rs27, %rs26, 15; shr.u32 %r110, %r56, 16; cvt.u16.u32 %rs28, %r110; and.b16 %rs29, %rs28, 15; shr.u32 %r111, %r56, 20; cvt.u16.u32 %rs30, %r111; and.b16 %rs31, %rs30, 15; shr.u32 %r112, %r56, 24; cvt.u16.u32 %rs32, %r112; and.b16 %rs33, %rs32, 15; shr.u32 %r113, %r56, 28; cvt.u16.u32 %rs34, %r113; cvt.u16.u32 %rs35, %r57; and.b16 %rs36, %rs35, 15; shr.u32 %r114, %r57, 4; cvt.u16.u32 %rs37, %r114; and.b16 %rs38, %rs37, 15; shr.u32 %r115, %r57, 8; 
cvt.u16.u32 %rs39, %r115; and.b16 %rs40, %rs39, 15; shr.u32 %r116, %r57, 12; cvt.u16.u32 %rs41, %r116; and.b16 %rs42, %rs41, 15; shr.u32 %r117, %r57, 16; cvt.u16.u32 %rs43, %r117; and.b16 %rs44, %rs43, 15; shr.u32 %r118, %r57, 20; cvt.u16.u32 %rs45, %r118; and.b16 %rs46, %rs45, 15; shr.u32 %r119, %r57, 24; cvt.u16.u32 %rs47, %r119; and.b16 %rs48, %rs47, 15; shr.u32 %r120, %r57, 28; cvt.u16.u32 %rs49, %r120; cvt.u16.u32 %rs50, %r58; and.b16 %rs51, %rs50, 15; shr.u32 %r121, %r58, 4; cvt.u16.u32 %rs52, %r121; and.b16 %rs53, %rs52, 15; shr.u32 %r122, %r58, 8; cvt.u16.u32 %rs54, %r122; and.b16 %rs55, %rs54, 15; shr.u32 %r123, %r58, 12; cvt.u16.u32 %rs56, %r123; and.b16 %rs57, %rs56, 15; shr.u32 %r124, %r58, 16; cvt.u16.u32 %rs58, %r124; and.b16 %rs59, %rs58, 15; shr.u32 %r125, %r58, 20; cvt.u16.u32 %rs60, %r125; and.b16 %rs61, %rs60, 15; shr.u32 %r126, %r58, 24; cvt.u16.u32 %rs62, %r126; and.b16 %rs63, %rs62, 15; shr.u32 %r127, %r58, 28; cvt.u16.u32 %rs64, %r127; @%p4 bra $L__BB0_7; cvt.rn.f32.s16 %f230, %rs6; sub.ftz.f32 %f231, %f230, %f7; mul.ftz.f32 %f232, %f69, %f231; mov.b32 {%rs91, %rs96}, %r72; // begin inline asm { cvt.f32.f16 %f70, %rs91;} // end inline asm fma.rn.ftz.f32 %f233, %f232, %f70, %f1011; mov.b32 {%rs92, %rs97}, %r76; // begin inline asm { cvt.f32.f16 %f71, %rs92;} // end inline asm fma.rn.ftz.f32 %f234, %f232, %f71, %f1010; mov.b32 {%rs93, %rs98}, %r83; // begin inline asm { cvt.f32.f16 %f72, %rs93;} // end inline asm fma.rn.ftz.f32 %f235, %f232, %f72, %f1009; mov.b32 {%rs94, %rs99}, %r88; // begin inline asm { cvt.f32.f16 %f73, %rs94;} // end inline asm fma.rn.ftz.f32 %f236, %f232, %f73, %f1008; mov.b32 {%rs95, %rs100}, %r92; // begin inline asm { cvt.f32.f16 %f74, %rs95;} // end inline asm fma.rn.ftz.f32 %f237, %f232, %f74, %f1007; cvt.rn.f32.s16 %f238, %rs8; sub.ftz.f32 %f239, %f238, %f7; mul.ftz.f32 %f240, %f69, %f239; // begin inline asm { cvt.f32.f16 %f75, %rs96;} // end inline asm fma.rn.ftz.f32 %f241, %f240, %f75, %f233; // begin inline asm 
{ cvt.f32.f16 %f76, %rs97;} // end inline asm fma.rn.ftz.f32 %f242, %f240, %f76, %f234; // begin inline asm { cvt.f32.f16 %f77, %rs98;} // end inline asm fma.rn.ftz.f32 %f243, %f240, %f77, %f235; // begin inline asm { cvt.f32.f16 %f78, %rs99;} // end inline asm fma.rn.ftz.f32 %f244, %f240, %f78, %f236; // begin inline asm { cvt.f32.f16 %f79, %rs100;} // end inline asm fma.rn.ftz.f32 %f245, %f240, %f79, %f237; cvt.rn.f32.s16 %f246, %rs10; sub.ftz.f32 %f247, %f246, %f7; mul.ftz.f32 %f248, %f69, %f247; mov.b32 {%rs101, %rs106}, %r73; // begin inline asm { cvt.f32.f16 %f80, %rs101;} // end inline asm fma.rn.ftz.f32 %f249, %f248, %f80, %f241; mov.b32 {%rs102, %rs107}, %r77; // begin inline asm { cvt.f32.f16 %f81, %rs102;} // end inline asm fma.rn.ftz.f32 %f250, %f248, %f81, %f242; mov.b32 {%rs103, %rs108}, %r84; // begin inline asm { cvt.f32.f16 %f82, %rs103;} // end inline asm fma.rn.ftz.f32 %f251, %f248, %f82, %f243; mov.b32 {%rs104, %rs109}, %r89; // begin inline asm { cvt.f32.f16 %f83, %rs104;} // end inline asm fma.rn.ftz.f32 %f252, %f248, %f83, %f244; mov.b32 {%rs105, %rs110}, %r93; // begin inline asm { cvt.f32.f16 %f84, %rs105;} // end inline asm fma.rn.ftz.f32 %f253, %f248, %f84, %f245; cvt.rn.f32.s16 %f254, %rs12; sub.ftz.f32 %f255, %f254, %f7; mul.ftz.f32 %f256, %f69, %f255; // begin inline asm { cvt.f32.f16 %f85, %rs106;} // end inline asm fma.rn.ftz.f32 %f257, %f256, %f85, %f249; // begin inline asm { cvt.f32.f16 %f86, %rs107;} // end inline asm fma.rn.ftz.f32 %f258, %f256, %f86, %f250; // begin inline asm { cvt.f32.f16 %f87, %rs108;} // end inline asm fma.rn.ftz.f32 %f259, %f256, %f87, %f251; // begin inline asm { cvt.f32.f16 %f88, %rs109;} // end inline asm fma.rn.ftz.f32 %f260, %f256, %f88, %f252; // begin inline asm { cvt.f32.f16 %f89, %rs110;} // end inline asm fma.rn.ftz.f32 %f261, %f256, %f89, %f253; cvt.rn.f32.s16 %f262, %rs14; sub.ftz.f32 %f263, %f262, %f7; mul.ftz.f32 %f264, %f69, %f263; mov.b32 {%rs111, %rs116}, %r74; // begin inline asm { 
cvt.f32.f16 %f90, %rs111;} // end inline asm fma.rn.ftz.f32 %f265, %f264, %f90, %f257; mov.b32 {%rs112, %rs117}, %r78; // begin inline asm { cvt.f32.f16 %f91, %rs112;} // end inline asm fma.rn.ftz.f32 %f266, %f264, %f91, %f258; mov.b32 {%rs113, %rs118}, %r85; // begin inline asm { cvt.f32.f16 %f92, %rs113;} // end inline asm fma.rn.ftz.f32 %f267, %f264, %f92, %f259; mov.b32 {%rs114, %rs119}, %r90; // begin inline asm { cvt.f32.f16 %f93, %rs114;} // end inline asm fma.rn.ftz.f32 %f268, %f264, %f93, %f260; mov.b32 {%rs115, %rs120}, %r94; // begin inline asm { cvt.f32.f16 %f94, %rs115;} // end inline asm fma.rn.ftz.f32 %f269, %f264, %f94, %f261; cvt.rn.f32.s16 %f270, %rs16; sub.ftz.f32 %f271, %f270, %f7; mul.ftz.f32 %f272, %f69, %f271; // begin inline asm { cvt.f32.f16 %f95, %rs116;} // end inline asm fma.rn.ftz.f32 %f273, %f272, %f95, %f265; // begin inline asm { cvt.f32.f16 %f96, %rs117;} // end inline asm fma.rn.ftz.f32 %f274, %f272, %f96, %f266; // begin inline asm { cvt.f32.f16 %f97, %rs118;} // end inline asm fma.rn.ftz.f32 %f275, %f272, %f97, %f267; // begin inline asm { cvt.f32.f16 %f98, %rs119;} // end inline asm fma.rn.ftz.f32 %f276, %f272, %f98, %f268; // begin inline asm { cvt.f32.f16 %f99, %rs120;} // end inline asm fma.rn.ftz.f32 %f277, %f272, %f99, %f269; cvt.rn.f32.s16 %f278, %rs18; sub.ftz.f32 %f279, %f278, %f7; mul.ftz.f32 %f280, %f69, %f279; mov.b32 {%rs121, %rs126}, %r75; // begin inline asm { cvt.f32.f16 %f100, %rs121;} // end inline asm fma.rn.ftz.f32 %f281, %f280, %f100, %f273; mov.b32 {%rs122, %rs127}, %r79; // begin inline asm { cvt.f32.f16 %f101, %rs122;} // end inline asm fma.rn.ftz.f32 %f282, %f280, %f101, %f274; mov.b32 {%rs123, %rs128}, %r86; // begin inline asm { cvt.f32.f16 %f102, %rs123;} // end inline asm fma.rn.ftz.f32 %f283, %f280, %f102, %f275; mov.b32 {%rs124, %rs129}, %r91; // begin inline asm { cvt.f32.f16 %f103, %rs124;} // end inline asm fma.rn.ftz.f32 %f284, %f280, %f103, %f276; mov.b32 {%rs125, %rs130}, %r95; // begin inline 
asm { cvt.f32.f16 %f104, %rs125;} // end inline asm fma.rn.ftz.f32 %f285, %f280, %f104, %f277; cvt.rn.f32.s16 %f286, %rs19; sub.ftz.f32 %f287, %f286, %f7; mul.ftz.f32 %f288, %f69, %f287; // begin inline asm { cvt.f32.f16 %f105, %rs126;} // end inline asm fma.rn.ftz.f32 %f289, %f288, %f105, %f281; // begin inline asm { cvt.f32.f16 %f106, %rs127;} // end inline asm fma.rn.ftz.f32 %f290, %f288, %f106, %f282; // begin inline asm { cvt.f32.f16 %f107, %rs128;} // end inline asm fma.rn.ftz.f32 %f291, %f288, %f107, %f283; // begin inline asm { cvt.f32.f16 %f108, %rs129;} // end inline asm fma.rn.ftz.f32 %f292, %f288, %f108, %f284; // begin inline asm { cvt.f32.f16 %f109, %rs130;} // end inline asm fma.rn.ftz.f32 %f293, %f288, %f109, %f285; ld.global.v4.u32 {%r128, %r129, %r130, %r131}, [%rd7+16]; ld.global.v4.u32 {%r136, %r137, %r138, %r139}, [%rd9]; ld.global.v4.u32 {%r144, %r145, %r146, %r147}, [%rd10]; ld.global.v4.u32 {%r152, %r153, %r154, %r155}, [%rd8+16]; ld.global.v4.u32 {%r160, %r161, %r162, %r163}, [%rd11]; cvt.rn.f32.s16 %f294, %rs21; sub.ftz.f32 %f295, %f294, %f7; mul.ftz.f32 %f296, %f69, %f295; mov.b32 {%rs131, %rs136}, %r128; // begin inline asm { cvt.f32.f16 %f110, %rs131;} // end inline asm fma.rn.ftz.f32 %f297, %f296, %f110, %f289; mov.b32 {%rs132, %rs137}, %r136; // begin inline asm { cvt.f32.f16 %f111, %rs132;} // end inline asm fma.rn.ftz.f32 %f298, %f296, %f111, %f290; mov.b32 {%rs133, %rs138}, %r144; // begin inline asm { cvt.f32.f16 %f112, %rs133;} // end inline asm fma.rn.ftz.f32 %f299, %f296, %f112, %f291; mov.b32 {%rs134, %rs139}, %r152; // begin inline asm { cvt.f32.f16 %f113, %rs134;} // end inline asm fma.rn.ftz.f32 %f300, %f296, %f113, %f292; mov.b32 {%rs135, %rs140}, %r160; // begin inline asm { cvt.f32.f16 %f114, %rs135;} // end inline asm fma.rn.ftz.f32 %f301, %f296, %f114, %f293; cvt.rn.f32.s16 %f302, %rs23; sub.ftz.f32 %f303, %f302, %f7; mul.ftz.f32 %f304, %f69, %f303; // begin inline asm { cvt.f32.f16 %f115, %rs136;} // end inline asm 
fma.rn.ftz.f32 %f305, %f304, %f115, %f297; // begin inline asm { cvt.f32.f16 %f116, %rs137;} // end inline asm fma.rn.ftz.f32 %f306, %f304, %f116, %f298; // begin inline asm { cvt.f32.f16 %f117, %rs138;} // end inline asm fma.rn.ftz.f32 %f307, %f304, %f117, %f299; // begin inline asm { cvt.f32.f16 %f118, %rs139;} // end inline asm fma.rn.ftz.f32 %f308, %f304, %f118, %f300; // begin inline asm { cvt.f32.f16 %f119, %rs140;} // end inline asm fma.rn.ftz.f32 %f309, %f304, %f119, %f301; cvt.rn.f32.s16 %f310, %rs25; sub.ftz.f32 %f311, %f310, %f7; mul.ftz.f32 %f312, %f69, %f311; mov.b32 {%rs141, %rs146}, %r129; // begin inline asm { cvt.f32.f16 %f120, %rs141;} // end inline asm fma.rn.ftz.f32 %f313, %f312, %f120, %f305; mov.b32 {%rs142, %rs147}, %r137; // begin inline asm { cvt.f32.f16 %f121, %rs142;} // end inline asm fma.rn.ftz.f32 %f314, %f312, %f121, %f306; mov.b32 {%rs143, %rs148}, %r145; // begin inline asm { cvt.f32.f16 %f122, %rs143;} // end inline asm fma.rn.ftz.f32 %f315, %f312, %f122, %f307; mov.b32 {%rs144, %rs149}, %r153; // begin inline asm { cvt.f32.f16 %f123, %rs144;} // end inline asm fma.rn.ftz.f32 %f316, %f312, %f123, %f308; mov.b32 {%rs145, %rs150}, %r161; // begin inline asm { cvt.f32.f16 %f124, %rs145;} // end inline asm fma.rn.ftz.f32 %f317, %f312, %f124, %f309; cvt.rn.f32.s16 %f318, %rs27; sub.ftz.f32 %f319, %f318, %f7; mul.ftz.f32 %f320, %f69, %f319; // begin inline asm { cvt.f32.f16 %f125, %rs146;} // end inline asm fma.rn.ftz.f32 %f321, %f320, %f125, %f313; // begin inline asm { cvt.f32.f16 %f126, %rs147;} // end inline asm fma.rn.ftz.f32 %f322, %f320, %f126, %f314; // begin inline asm { cvt.f32.f16 %f127, %rs148;} // end inline asm fma.rn.ftz.f32 %f323, %f320, %f127, %f315; // begin inline asm { cvt.f32.f16 %f128, %rs149;} // end inline asm fma.rn.ftz.f32 %f324, %f320, %f128, %f316; // begin inline asm { cvt.f32.f16 %f129, %rs150;} // end inline asm fma.rn.ftz.f32 %f325, %f320, %f129, %f317; cvt.rn.f32.s16 %f326, %rs29; sub.ftz.f32 %f327, 
%f326, %f7; mul.ftz.f32 %f328, %f69, %f327; mov.b32 {%rs151, %rs156}, %r130; // begin inline asm { cvt.f32.f16 %f130, %rs151;} // end inline asm fma.rn.ftz.f32 %f329, %f328, %f130, %f321; mov.b32 {%rs152, %rs157}, %r138; // begin inline asm { cvt.f32.f16 %f131, %rs152;} // end inline asm fma.rn.ftz.f32 %f330, %f328, %f131, %f322; mov.b32 {%rs153, %rs158}, %r146; // begin inline asm { cvt.f32.f16 %f132, %rs153;} // end inline asm fma.rn.ftz.f32 %f331, %f328, %f132, %f323; mov.b32 {%rs154, %rs159}, %r154; // begin inline asm { cvt.f32.f16 %f133, %rs154;} // end inline asm fma.rn.ftz.f32 %f332, %f328, %f133, %f324; mov.b32 {%rs155, %rs160}, %r162; // begin inline asm { cvt.f32.f16 %f134, %rs155;} // end inline asm fma.rn.ftz.f32 %f333, %f328, %f134, %f325; cvt.rn.f32.s16 %f334, %rs31; sub.ftz.f32 %f335, %f334, %f7; mul.ftz.f32 %f336, %f69, %f335; // begin inline asm { cvt.f32.f16 %f135, %rs156;} // end inline asm fma.rn.ftz.f32 %f337, %f336, %f135, %f329; // begin inline asm { cvt.f32.f16 %f136, %rs157;} // end inline asm fma.rn.ftz.f32 %f338, %f336, %f136, %f330; // begin inline asm { cvt.f32.f16 %f137, %rs158;} // end inline asm fma.rn.ftz.f32 %f339, %f336, %f137, %f331; // begin inline asm { cvt.f32.f16 %f138, %rs159;} // end inline asm fma.rn.ftz.f32 %f340, %f336, %f138, %f332; // begin inline asm { cvt.f32.f16 %f139, %rs160;} // end inline asm fma.rn.ftz.f32 %f341, %f336, %f139, %f333; cvt.rn.f32.s16 %f342, %rs33; sub.ftz.f32 %f343, %f342, %f7; mul.ftz.f32 %f344, %f69, %f343; mov.b32 {%rs161, %rs166}, %r131; // begin inline asm { cvt.f32.f16 %f140, %rs161;} // end inline asm fma.rn.ftz.f32 %f345, %f344, %f140, %f337; mov.b32 {%rs162, %rs167}, %r139; // begin inline asm { cvt.f32.f16 %f141, %rs162;} // end inline asm fma.rn.ftz.f32 %f346, %f344, %f141, %f338; mov.b32 {%rs163, %rs168}, %r147; // begin inline asm { cvt.f32.f16 %f142, %rs163;} // end inline asm fma.rn.ftz.f32 %f347, %f344, %f142, %f339; mov.b32 {%rs164, %rs169}, %r155; // begin inline asm { 
cvt.f32.f16 %f143, %rs164;} // end inline asm fma.rn.ftz.f32 %f348, %f344, %f143, %f340; mov.b32 {%rs165, %rs170}, %r163; // begin inline asm { cvt.f32.f16 %f144, %rs165;} // end inline asm fma.rn.ftz.f32 %f349, %f344, %f144, %f341; cvt.rn.f32.s16 %f350, %rs34; sub.ftz.f32 %f351, %f350, %f7; mul.ftz.f32 %f352, %f69, %f351; // begin inline asm { cvt.f32.f16 %f145, %rs166;} // end inline asm fma.rn.ftz.f32 %f353, %f352, %f145, %f345; // begin inline asm { cvt.f32.f16 %f146, %rs167;} // end inline asm fma.rn.ftz.f32 %f354, %f352, %f146, %f346; // begin inline asm { cvt.f32.f16 %f147, %rs168;} // end inline asm fma.rn.ftz.f32 %f355, %f352, %f147, %f347; // begin inline asm { cvt.f32.f16 %f148, %rs169;} // end inline asm fma.rn.ftz.f32 %f356, %f352, %f148, %f348; // begin inline asm { cvt.f32.f16 %f149, %rs170;} // end inline asm fma.rn.ftz.f32 %f357, %f352, %f149, %f349; ld.global.v4.u32 {%r168, %r169, %r170, %r171}, [%rd7+32]; ld.global.v4.u32 {%r176, %r177, %r178, %r179}, [%rd9+16]; ld.global.v4.u32 {%r184, %r185, %r186, %r187}, [%rd10+16]; ld.global.v4.u32 {%r192, %r193, %r194, %r195}, [%rd8+32]; ld.global.v4.u32 {%r200, %r201, %r202, %r203}, [%rd11+16]; cvt.rn.f32.s16 %f358, %rs36; sub.ftz.f32 %f359, %f358, %f7; mul.ftz.f32 %f360, %f69, %f359; mov.b32 {%rs171, %rs176}, %r168; // begin inline asm { cvt.f32.f16 %f150, %rs171;} // end inline asm fma.rn.ftz.f32 %f361, %f360, %f150, %f353; mov.b32 {%rs172, %rs177}, %r176; // begin inline asm { cvt.f32.f16 %f151, %rs172;} // end inline asm fma.rn.ftz.f32 %f362, %f360, %f151, %f354; mov.b32 {%rs173, %rs178}, %r184; // begin inline asm { cvt.f32.f16 %f152, %rs173;} // end inline asm fma.rn.ftz.f32 %f363, %f360, %f152, %f355; mov.b32 {%rs174, %rs179}, %r192; // begin inline asm { cvt.f32.f16 %f153, %rs174;} // end inline asm fma.rn.ftz.f32 %f364, %f360, %f153, %f356; mov.b32 {%rs175, %rs180}, %r200; // begin inline asm { cvt.f32.f16 %f154, %rs175;} // end inline asm fma.rn.ftz.f32 %f365, %f360, %f154, %f357; cvt.rn.f32.s16 
%f366, %rs38; sub.ftz.f32 %f367, %f366, %f7; mul.ftz.f32 %f368, %f69, %f367; // begin inline asm { cvt.f32.f16 %f155, %rs176;} // end inline asm fma.rn.ftz.f32 %f369, %f368, %f155, %f361; // begin inline asm { cvt.f32.f16 %f156, %rs177;} // end inline asm fma.rn.ftz.f32 %f370, %f368, %f156, %f362; // begin inline asm { cvt.f32.f16 %f157, %rs178;} // end inline asm fma.rn.ftz.f32 %f371, %f368, %f157, %f363; // begin inline asm { cvt.f32.f16 %f158, %rs179;} // end inline asm fma.rn.ftz.f32 %f372, %f368, %f158, %f364; // begin inline asm { cvt.f32.f16 %f159, %rs180;} // end inline asm fma.rn.ftz.f32 %f373, %f368, %f159, %f365; cvt.rn.f32.s16 %f374, %rs40; sub.ftz.f32 %f375, %f374, %f7; mul.ftz.f32 %f376, %f69, %f375; mov.b32 {%rs181, %rs186}, %r169; // begin inline asm { cvt.f32.f16 %f160, %rs181;} // end inline asm fma.rn.ftz.f32 %f377, %f376, %f160, %f369; mov.b32 {%rs182, %rs187}, %r177; // begin inline asm { cvt.f32.f16 %f161, %rs182;} // end inline asm fma.rn.ftz.f32 %f378, %f376, %f161, %f370; mov.b32 {%rs183, %rs188}, %r185; // begin inline asm { cvt.f32.f16 %f162, %rs183;} // end inline asm fma.rn.ftz.f32 %f379, %f376, %f162, %f371; mov.b32 {%rs184, %rs189}, %r193; // begin inline asm { cvt.f32.f16 %f163, %rs184;} // end inline asm fma.rn.ftz.f32 %f380, %f376, %f163, %f372; mov.b32 {%rs185, %rs190}, %r201; // begin inline asm { cvt.f32.f16 %f164, %rs185;} // end inline asm fma.rn.ftz.f32 %f381, %f376, %f164, %f373; cvt.rn.f32.s16 %f382, %rs42; sub.ftz.f32 %f383, %f382, %f7; mul.ftz.f32 %f384, %f69, %f383; // begin inline asm { cvt.f32.f16 %f165, %rs186;} // end inline asm fma.rn.ftz.f32 %f385, %f384, %f165, %f377; // begin inline asm { cvt.f32.f16 %f166, %rs187;} // end inline asm fma.rn.ftz.f32 %f386, %f384, %f166, %f378; // begin inline asm { cvt.f32.f16 %f167, %rs188;} // end inline asm fma.rn.ftz.f32 %f387, %f384, %f167, %f379; // begin inline asm { cvt.f32.f16 %f168, %rs189;} // end inline asm fma.rn.ftz.f32 %f388, %f384, %f168, %f380; // begin inline asm 
{ cvt.f32.f16 %f169, %rs190;} // end inline asm fma.rn.ftz.f32 %f389, %f384, %f169, %f381; cvt.rn.f32.s16 %f390, %rs44; sub.ftz.f32 %f391, %f390, %f7; mul.ftz.f32 %f392, %f69, %f391; mov.b32 {%rs191, %rs196}, %r170; // begin inline asm { cvt.f32.f16 %f170, %rs191;} // end inline asm fma.rn.ftz.f32 %f393, %f392, %f170, %f385; mov.b32 {%rs192, %rs197}, %r178; // begin inline asm { cvt.f32.f16 %f171, %rs192;} // end inline asm fma.rn.ftz.f32 %f394, %f392, %f171, %f386; mov.b32 {%rs193, %rs198}, %r186; // begin inline asm { cvt.f32.f16 %f172, %rs193;} // end inline asm fma.rn.ftz.f32 %f395, %f392, %f172, %f387; mov.b32 {%rs194, %rs199}, %r194; // begin inline asm { cvt.f32.f16 %f173, %rs194;} // end inline asm fma.rn.ftz.f32 %f396, %f392, %f173, %f388; mov.b32 {%rs195, %rs200}, %r202; // begin inline asm { cvt.f32.f16 %f174, %rs195;} // end inline asm fma.rn.ftz.f32 %f397, %f392, %f174, %f389; cvt.rn.f32.s16 %f398, %rs46; sub.ftz.f32 %f399, %f398, %f7; mul.ftz.f32 %f400, %f69, %f399; // begin inline asm { cvt.f32.f16 %f175, %rs196;} // end inline asm fma.rn.ftz.f32 %f401, %f400, %f175, %f393; // begin inline asm { cvt.f32.f16 %f176, %rs197;} // end inline asm fma.rn.ftz.f32 %f402, %f400, %f176, %f394; // begin inline asm { cvt.f32.f16 %f177, %rs198;} // end inline asm fma.rn.ftz.f32 %f403, %f400, %f177, %f395; // begin inline asm { cvt.f32.f16 %f178, %rs199;} // end inline asm fma.rn.ftz.f32 %f404, %f400, %f178, %f396; // begin inline asm { cvt.f32.f16 %f179, %rs200;} // end inline asm fma.rn.ftz.f32 %f405, %f400, %f179, %f397; cvt.rn.f32.s16 %f406, %rs48; sub.ftz.f32 %f407, %f406, %f7; mul.ftz.f32 %f408, %f69, %f407; mov.b32 {%rs201, %rs206}, %r171; // begin inline asm { cvt.f32.f16 %f180, %rs201;} // end inline asm fma.rn.ftz.f32 %f409, %f408, %f180, %f401; mov.b32 {%rs202, %rs207}, %r179; // begin inline asm { cvt.f32.f16 %f181, %rs202;} // end inline asm fma.rn.ftz.f32 %f410, %f408, %f181, %f402; mov.b32 {%rs203, %rs208}, %r187; // begin inline asm { cvt.f32.f16 
%f182, %rs203;} // end inline asm fma.rn.ftz.f32 %f411, %f408, %f182, %f403; mov.b32 {%rs204, %rs209}, %r195; // begin inline asm { cvt.f32.f16 %f183, %rs204;} // end inline asm fma.rn.ftz.f32 %f412, %f408, %f183, %f404; mov.b32 {%rs205, %rs210}, %r203; // begin inline asm { cvt.f32.f16 %f184, %rs205;} // end inline asm fma.rn.ftz.f32 %f413, %f408, %f184, %f405; cvt.rn.f32.s16 %f414, %rs49; sub.ftz.f32 %f415, %f414, %f7; mul.ftz.f32 %f416, %f69, %f415; // begin inline asm { cvt.f32.f16 %f185, %rs206;} // end inline asm fma.rn.ftz.f32 %f417, %f416, %f185, %f409; // begin inline asm { cvt.f32.f16 %f186, %rs207;} // end inline asm fma.rn.ftz.f32 %f418, %f416, %f186, %f410; // begin inline asm { cvt.f32.f16 %f187, %rs208;} // end inline asm fma.rn.ftz.f32 %f419, %f416, %f187, %f411; // begin inline asm { cvt.f32.f16 %f188, %rs209;} // end inline asm fma.rn.ftz.f32 %f420, %f416, %f188, %f412; // begin inline asm { cvt.f32.f16 %f189, %rs210;} // end inline asm fma.rn.ftz.f32 %f421, %f416, %f189, %f413; ld.global.v4.u32 {%r208, %r209, %r210, %r211}, [%rd7+48]; ld.global.v4.u32 {%r216, %r217, %r218, %r219}, [%rd9+32]; ld.global.v4.u32 {%r224, %r225, %r226, %r227}, [%rd10+32]; ld.global.v4.u32 {%r232, %r233, %r234, %r235}, [%rd8+48]; ld.global.v4.u32 {%r240, %r241, %r242, %r243}, [%rd11+32]; cvt.rn.f32.s16 %f422, %rs51; sub.ftz.f32 %f423, %f422, %f7; mul.ftz.f32 %f424, %f69, %f423; mov.b32 {%rs211, %rs216}, %r208; // begin inline asm { cvt.f32.f16 %f190, %rs211;} // end inline asm fma.rn.ftz.f32 %f425, %f424, %f190, %f417; mov.b32 {%rs212, %rs217}, %r216; // begin inline asm { cvt.f32.f16 %f191, %rs212;} // end inline asm fma.rn.ftz.f32 %f426, %f424, %f191, %f418; mov.b32 {%rs213, %rs218}, %r224; // begin inline asm { cvt.f32.f16 %f192, %rs213;} // end inline asm fma.rn.ftz.f32 %f427, %f424, %f192, %f419; mov.b32 {%rs214, %rs219}, %r232; // begin inline asm { cvt.f32.f16 %f193, %rs214;} // end inline asm fma.rn.ftz.f32 %f428, %f424, %f193, %f420; mov.b32 {%rs215, %rs220}, 
%r240; // begin inline asm { cvt.f32.f16 %f194, %rs215;} // end inline asm fma.rn.ftz.f32 %f429, %f424, %f194, %f421; cvt.rn.f32.s16 %f430, %rs53; sub.ftz.f32 %f431, %f430, %f7; mul.ftz.f32 %f432, %f69, %f431; // begin inline asm { cvt.f32.f16 %f195, %rs216;} // end inline asm fma.rn.ftz.f32 %f433, %f432, %f195, %f425; // begin inline asm { cvt.f32.f16 %f196, %rs217;} // end inline asm fma.rn.ftz.f32 %f434, %f432, %f196, %f426; // begin inline asm { cvt.f32.f16 %f197, %rs218;} // end inline asm fma.rn.ftz.f32 %f435, %f432, %f197, %f427; // begin inline asm { cvt.f32.f16 %f198, %rs219;} // end inline asm fma.rn.ftz.f32 %f436, %f432, %f198, %f428; // begin inline asm { cvt.f32.f16 %f199, %rs220;} // end inline asm fma.rn.ftz.f32 %f437, %f432, %f199, %f429; cvt.rn.f32.s16 %f438, %rs55; sub.ftz.f32 %f439, %f438, %f7; mul.ftz.f32 %f440, %f69, %f439; mov.b32 {%rs221, %rs226}, %r209; // begin inline asm { cvt.f32.f16 %f200, %rs221;} // end inline asm fma.rn.ftz.f32 %f441, %f440, %f200, %f433; mov.b32 {%rs222, %rs227}, %r217; // begin inline asm { cvt.f32.f16 %f201, %rs222;} // end inline asm fma.rn.ftz.f32 %f442, %f440, %f201, %f434; mov.b32 {%rs223, %rs228}, %r225; // begin inline asm { cvt.f32.f16 %f202, %rs223;} // end inline asm fma.rn.ftz.f32 %f443, %f440, %f202, %f435; mov.b32 {%rs224, %rs229}, %r233; // begin inline asm { cvt.f32.f16 %f203, %rs224;} // end inline asm fma.rn.ftz.f32 %f444, %f440, %f203, %f436; mov.b32 {%rs225, %rs230}, %r241; // begin inline asm { cvt.f32.f16 %f204, %rs225;} // end inline asm fma.rn.ftz.f32 %f445, %f440, %f204, %f437; cvt.rn.f32.s16 %f446, %rs57; sub.ftz.f32 %f447, %f446, %f7; mul.ftz.f32 %f448, %f69, %f447; // begin inline asm { cvt.f32.f16 %f205, %rs226;} // end inline asm fma.rn.ftz.f32 %f449, %f448, %f205, %f441; // begin inline asm { cvt.f32.f16 %f206, %rs227;} // end inline asm fma.rn.ftz.f32 %f450, %f448, %f206, %f442; // begin inline asm { cvt.f32.f16 %f207, %rs228;} // end inline asm fma.rn.ftz.f32 %f451, %f448, %f207, 
%f443; // begin inline asm { cvt.f32.f16 %f208, %rs229;} // end inline asm fma.rn.ftz.f32 %f452, %f448, %f208, %f444; // begin inline asm { cvt.f32.f16 %f209, %rs230;} // end inline asm fma.rn.ftz.f32 %f453, %f448, %f209, %f445; cvt.rn.f32.s16 %f454, %rs59; sub.ftz.f32 %f455, %f454, %f7; mul.ftz.f32 %f456, %f69, %f455; mov.b32 {%rs231, %rs236}, %r210; // begin inline asm { cvt.f32.f16 %f210, %rs231;} // end inline asm fma.rn.ftz.f32 %f457, %f456, %f210, %f449; mov.b32 {%rs232, %rs237}, %r218; // begin inline asm { cvt.f32.f16 %f211, %rs232;} // end inline asm fma.rn.ftz.f32 %f458, %f456, %f211, %f450; mov.b32 {%rs233, %rs238}, %r226; // begin inline asm { cvt.f32.f16 %f212, %rs233;} // end inline asm fma.rn.ftz.f32 %f459, %f456, %f212, %f451; mov.b32 {%rs234, %rs239}, %r234; // begin inline asm { cvt.f32.f16 %f213, %rs234;} // end inline asm fma.rn.ftz.f32 %f460, %f456, %f213, %f452; mov.b32 {%rs235, %rs240}, %r242; // begin inline asm { cvt.f32.f16 %f214, %rs235;} // end inline asm fma.rn.ftz.f32 %f461, %f456, %f214, %f453; cvt.rn.f32.s16 %f462, %rs61; sub.ftz.f32 %f463, %f462, %f7; mul.ftz.f32 %f464, %f69, %f463; // begin inline asm { cvt.f32.f16 %f215, %rs236;} // end inline asm fma.rn.ftz.f32 %f465, %f464, %f215, %f457; // begin inline asm { cvt.f32.f16 %f216, %rs237;} // end inline asm fma.rn.ftz.f32 %f466, %f464, %f216, %f458; // begin inline asm { cvt.f32.f16 %f217, %rs238;} // end inline asm fma.rn.ftz.f32 %f467, %f464, %f217, %f459; // begin inline asm { cvt.f32.f16 %f218, %rs239;} // end inline asm fma.rn.ftz.f32 %f468, %f464, %f218, %f460; // begin inline asm { cvt.f32.f16 %f219, %rs240;} // end inline asm fma.rn.ftz.f32 %f469, %f464, %f219, %f461; cvt.rn.f32.s16 %f470, %rs63; sub.ftz.f32 %f471, %f470, %f7; mul.ftz.f32 %f472, %f69, %f471; mov.b32 {%rs241, %rs246}, %r211; // begin inline asm { cvt.f32.f16 %f220, %rs241;} // end inline asm fma.rn.ftz.f32 %f473, %f472, %f220, %f465; mov.b32 {%rs242, %rs247}, %r219; // begin inline asm { cvt.f32.f16 %f221, 
%rs242;} // end inline asm fma.rn.ftz.f32 %f474, %f472, %f221, %f466; mov.b32 {%rs243, %rs248}, %r227; // begin inline asm { cvt.f32.f16 %f222, %rs243;} // end inline asm fma.rn.ftz.f32 %f475, %f472, %f222, %f467; mov.b32 {%rs244, %rs249}, %r235; // begin inline asm { cvt.f32.f16 %f223, %rs244;} // end inline asm fma.rn.ftz.f32 %f476, %f472, %f223, %f468; mov.b32 {%rs245, %rs250}, %r243; // begin inline asm { cvt.f32.f16 %f224, %rs245;} // end inline asm fma.rn.ftz.f32 %f477, %f472, %f224, %f469; cvt.rn.f32.s16 %f478, %rs64; sub.ftz.f32 %f479, %f478, %f7; mul.ftz.f32 %f480, %f69, %f479; // begin inline asm { cvt.f32.f16 %f225, %rs246;} // end inline asm fma.rn.ftz.f32 %f1011, %f480, %f225, %f473; // begin inline asm { cvt.f32.f16 %f226, %rs247;} // end inline asm fma.rn.ftz.f32 %f1010, %f480, %f226, %f474; // begin inline asm { cvt.f32.f16 %f227, %rs248;} // end inline asm fma.rn.ftz.f32 %f1009, %f480, %f227, %f475; // begin inline asm { cvt.f32.f16 %f228, %rs249;} // end inline asm fma.rn.ftz.f32 %f1008, %f480, %f228, %f476; // begin inline asm { cvt.f32.f16 %f229, %rs250;} // end inline asm fma.rn.ftz.f32 %f1007, %f480, %f229, %f477; bra.uni $L__BB0_8; $L__BB0_7: shl.b16 %rs411, %rs5, 4; cvt.s16.s8 %rs412, %rs411; shr.s16 %rs413, %rs412, 7; and.b16 %rs414, %rs413, -16; or.b16 %rs415, %rs414, %rs6; cvt.rn.f32.s16 %f641, %rs415; sub.ftz.f32 %f642, %f641, %f7; mul.ftz.f32 %f643, %f69, %f642; mov.b32 {%rs251, %rs256}, %r72; // begin inline asm { cvt.f32.f16 %f481, %rs251;} // end inline asm fma.rn.ftz.f32 %f644, %f643, %f481, %f1011; mov.b32 {%rs252, %rs257}, %r76; // begin inline asm { cvt.f32.f16 %f482, %rs252;} // end inline asm fma.rn.ftz.f32 %f645, %f643, %f482, %f1010; mov.b32 {%rs253, %rs258}, %r83; // begin inline asm { cvt.f32.f16 %f483, %rs253;} // end inline asm fma.rn.ftz.f32 %f646, %f643, %f483, %f1009; mov.b32 {%rs254, %rs259}, %r88; // begin inline asm { cvt.f32.f16 %f484, %rs254;} // end inline asm fma.rn.ftz.f32 %f647, %f643, %f484, %f1008; mov.b32 
{%rs255, %rs260}, %r92; // begin inline asm { cvt.f32.f16 %f485, %rs255;} // end inline asm fma.rn.ftz.f32 %f648, %f643, %f485, %f1007; shl.b16 %rs416, %rs7, 4; cvt.s16.s8 %rs417, %rs416; shr.s16 %rs418, %rs417, 7; and.b16 %rs419, %rs418, -16; or.b16 %rs420, %rs419, %rs8; cvt.rn.f32.s16 %f649, %rs420; sub.ftz.f32 %f650, %f649, %f7; mul.ftz.f32 %f651, %f69, %f650; // begin inline asm { cvt.f32.f16 %f486, %rs256;} // end inline asm fma.rn.ftz.f32 %f652, %f651, %f486, %f644; // begin inline asm { cvt.f32.f16 %f487, %rs257;} // end inline asm fma.rn.ftz.f32 %f653, %f651, %f487, %f645; // begin inline asm { cvt.f32.f16 %f488, %rs258;} // end inline asm fma.rn.ftz.f32 %f654, %f651, %f488, %f646; // begin inline asm { cvt.f32.f16 %f489, %rs259;} // end inline asm fma.rn.ftz.f32 %f655, %f651, %f489, %f647; // begin inline asm { cvt.f32.f16 %f490, %rs260;} // end inline asm fma.rn.ftz.f32 %f656, %f651, %f490, %f648; shl.b16 %rs421, %rs9, 4; cvt.s16.s8 %rs422, %rs421; shr.s16 %rs423, %rs422, 7; and.b16 %rs424, %rs423, -16; or.b16 %rs425, %rs424, %rs10; cvt.rn.f32.s16 %f657, %rs425; sub.ftz.f32 %f658, %f657, %f7; mul.ftz.f32 %f659, %f69, %f658; mov.b32 {%rs261, %rs266}, %r73; // begin inline asm { cvt.f32.f16 %f491, %rs261;} // end inline asm fma.rn.ftz.f32 %f660, %f659, %f491, %f652; mov.b32 {%rs262, %rs267}, %r77; // begin inline asm { cvt.f32.f16 %f492, %rs262;} // end inline asm fma.rn.ftz.f32 %f661, %f659, %f492, %f653; mov.b32 {%rs263, %rs268}, %r84; // begin inline asm { cvt.f32.f16 %f493, %rs263;} // end inline asm fma.rn.ftz.f32 %f662, %f659, %f493, %f654; mov.b32 {%rs264, %rs269}, %r89; // begin inline asm { cvt.f32.f16 %f494, %rs264;} // end inline asm fma.rn.ftz.f32 %f663, %f659, %f494, %f655; mov.b32 {%rs265, %rs270}, %r93; // begin inline asm { cvt.f32.f16 %f495, %rs265;} // end inline asm fma.rn.ftz.f32 %f664, %f659, %f495, %f656; shl.b16 %rs426, %rs11, 4; cvt.s16.s8 %rs427, %rs426; shr.s16 %rs428, %rs427, 7; and.b16 %rs429, %rs428, -16; or.b16 %rs430, %rs429, 
%rs12; cvt.rn.f32.s16 %f665, %rs430; sub.ftz.f32 %f666, %f665, %f7; mul.ftz.f32 %f667, %f69, %f666; // begin inline asm { cvt.f32.f16 %f496, %rs266;} // end inline asm fma.rn.ftz.f32 %f668, %f667, %f496, %f660; // begin inline asm { cvt.f32.f16 %f497, %rs267;} // end inline asm fma.rn.ftz.f32 %f669, %f667, %f497, %f661; // begin inline asm { cvt.f32.f16 %f498, %rs268;} // end inline asm fma.rn.ftz.f32 %f670, %f667, %f498, %f662; // begin inline asm { cvt.f32.f16 %f499, %rs269;} // end inline asm fma.rn.ftz.f32 %f671, %f667, %f499, %f663; // begin inline asm { cvt.f32.f16 %f500, %rs270;} // end inline asm fma.rn.ftz.f32 %f672, %f667, %f500, %f664; shl.b16 %rs431, %rs13, 4; cvt.s16.s8 %rs432, %rs431; shr.s16 %rs433, %rs432, 7; and.b16 %rs434, %rs433, -16; or.b16 %rs435, %rs434, %rs14; cvt.rn.f32.s16 %f673, %rs435; sub.ftz.f32 %f674, %f673, %f7; mul.ftz.f32 %f675, %f69, %f674; mov.b32 {%rs271, %rs276}, %r74; // begin inline asm { cvt.f32.f16 %f501, %rs271;} // end inline asm fma.rn.ftz.f32 %f676, %f675, %f501, %f668; mov.b32 {%rs272, %rs277}, %r78; // begin inline asm { cvt.f32.f16 %f502, %rs272;} // end inline asm fma.rn.ftz.f32 %f677, %f675, %f502, %f669; mov.b32 {%rs273, %rs278}, %r85; // begin inline asm { cvt.f32.f16 %f503, %rs273;} // end inline asm fma.rn.ftz.f32 %f678, %f675, %f503, %f670; mov.b32 {%rs274, %rs279}, %r90; // begin inline asm { cvt.f32.f16 %f504, %rs274;} // end inline asm fma.rn.ftz.f32 %f679, %f675, %f504, %f671; mov.b32 {%rs275, %rs280}, %r94; // begin inline asm { cvt.f32.f16 %f505, %rs275;} // end inline asm fma.rn.ftz.f32 %f680, %f675, %f505, %f672; shl.b16 %rs436, %rs15, 4; cvt.s16.s8 %rs437, %rs436; shr.s16 %rs438, %rs437, 7; and.b16 %rs439, %rs438, -16; or.b16 %rs440, %rs439, %rs16; cvt.rn.f32.s16 %f681, %rs440; sub.ftz.f32 %f682, %f681, %f7; mul.ftz.f32 %f683, %f69, %f682; // begin inline asm { cvt.f32.f16 %f506, %rs276;} // end inline asm fma.rn.ftz.f32 %f684, %f683, %f506, %f676; // begin inline asm { cvt.f32.f16 %f507, %rs277;} // 
end inline asm fma.rn.ftz.f32 %f685, %f683, %f507, %f677; // begin inline asm { cvt.f32.f16 %f508, %rs278;} // end inline asm fma.rn.ftz.f32 %f686, %f683, %f508, %f678; // begin inline asm { cvt.f32.f16 %f509, %rs279;} // end inline asm fma.rn.ftz.f32 %f687, %f683, %f509, %f679; // begin inline asm { cvt.f32.f16 %f510, %rs280;} // end inline asm fma.rn.ftz.f32 %f688, %f683, %f510, %f680; shl.b16 %rs441, %rs17, 4; cvt.s16.s8 %rs442, %rs441; shr.s16 %rs443, %rs442, 7; and.b16 %rs444, %rs443, -16; or.b16 %rs445, %rs444, %rs18; cvt.rn.f32.s16 %f689, %rs445; sub.ftz.f32 %f690, %f689, %f7; mul.ftz.f32 %f691, %f69, %f690; mov.b32 {%rs281, %rs286}, %r75; // begin inline asm { cvt.f32.f16 %f511, %rs281;} // end inline asm fma.rn.ftz.f32 %f692, %f691, %f511, %f684; mov.b32 {%rs282, %rs287}, %r79; // begin inline asm { cvt.f32.f16 %f512, %rs282;} // end inline asm fma.rn.ftz.f32 %f693, %f691, %f512, %f685; mov.b32 {%rs283, %rs288}, %r86; // begin inline asm { cvt.f32.f16 %f513, %rs283;} // end inline asm fma.rn.ftz.f32 %f694, %f691, %f513, %f686; mov.b32 {%rs284, %rs289}, %r91; // begin inline asm { cvt.f32.f16 %f514, %rs284;} // end inline asm fma.rn.ftz.f32 %f695, %f691, %f514, %f687; mov.b32 {%rs285, %rs290}, %r95; // begin inline asm { cvt.f32.f16 %f515, %rs285;} // end inline asm fma.rn.ftz.f32 %f696, %f691, %f515, %f688; shl.b16 %rs446, %rs19, 4; cvt.s16.s8 %rs447, %rs446; shr.s16 %rs448, %rs447, 7; and.b16 %rs449, %rs448, -16; or.b16 %rs450, %rs449, %rs19; cvt.rn.f32.s16 %f697, %rs450; sub.ftz.f32 %f698, %f697, %f7; mul.ftz.f32 %f699, %f69, %f698; // begin inline asm { cvt.f32.f16 %f516, %rs286;} // end inline asm fma.rn.ftz.f32 %f700, %f699, %f516, %f692; // begin inline asm { cvt.f32.f16 %f517, %rs287;} // end inline asm fma.rn.ftz.f32 %f701, %f699, %f517, %f693; // begin inline asm { cvt.f32.f16 %f518, %rs288;} // end inline asm fma.rn.ftz.f32 %f702, %f699, %f518, %f694; // begin inline asm { cvt.f32.f16 %f519, %rs289;} // end inline asm fma.rn.ftz.f32 %f703, %f699, 
%f519, %f695; // begin inline asm { cvt.f32.f16 %f520, %rs290;} // end inline asm fma.rn.ftz.f32 %f704, %f699, %f520, %f696; ld.global.v4.u32 {%r248, %r249, %r250, %r251}, [%rd7+16]; ld.global.v4.u32 {%r256, %r257, %r258, %r259}, [%rd9]; ld.global.v4.u32 {%r264, %r265, %r266, %r267}, [%rd10]; ld.global.v4.u32 {%r272, %r273, %r274, %r275}, [%rd8+16]; ld.global.v4.u32 {%r280, %r281, %r282, %r283}, [%rd11]; shl.b16 %rs451, %rs20, 4; cvt.s16.s8 %rs452, %rs451; shr.s16 %rs453, %rs452, 7; and.b16 %rs454, %rs453, -16; or.b16 %rs455, %rs454, %rs21; cvt.rn.f32.s16 %f705, %rs455; sub.ftz.f32 %f706, %f705, %f7; mul.ftz.f32 %f707, %f69, %f706; mov.b32 {%rs291, %rs296}, %r248; // begin inline asm { cvt.f32.f16 %f521, %rs291;} // end inline asm fma.rn.ftz.f32 %f708, %f707, %f521, %f700; mov.b32 {%rs292, %rs297}, %r256; // begin inline asm { cvt.f32.f16 %f522, %rs292;} // end inline asm fma.rn.ftz.f32 %f709, %f707, %f522, %f701; mov.b32 {%rs293, %rs298}, %r264; // begin inline asm { cvt.f32.f16 %f523, %rs293;} // end inline asm fma.rn.ftz.f32 %f710, %f707, %f523, %f702; mov.b32 {%rs294, %rs299}, %r272; // begin inline asm { cvt.f32.f16 %f524, %rs294;} // end inline asm fma.rn.ftz.f32 %f711, %f707, %f524, %f703; mov.b32 {%rs295, %rs300}, %r280; // begin inline asm { cvt.f32.f16 %f525, %rs295;} // end inline asm fma.rn.ftz.f32 %f712, %f707, %f525, %f704; shl.b16 %rs456, %rs22, 4; cvt.s16.s8 %rs457, %rs456; shr.s16 %rs458, %rs457, 7; and.b16 %rs459, %rs458, -16; or.b16 %rs460, %rs459, %rs23; cvt.rn.f32.s16 %f713, %rs460; sub.ftz.f32 %f714, %f713, %f7; mul.ftz.f32 %f715, %f69, %f714; // begin inline asm { cvt.f32.f16 %f526, %rs296;} // end inline asm fma.rn.ftz.f32 %f716, %f715, %f526, %f708; // begin inline asm { cvt.f32.f16 %f527, %rs297;} // end inline asm fma.rn.ftz.f32 %f717, %f715, %f527, %f709; // begin inline asm { cvt.f32.f16 %f528, %rs298;} // end inline asm fma.rn.ftz.f32 %f718, %f715, %f528, %f710; // begin inline asm { cvt.f32.f16 %f529, %rs299;} // end inline asm 
fma.rn.ftz.f32 %f719, %f715, %f529, %f711; // begin inline asm { cvt.f32.f16 %f530, %rs300;} // end inline asm fma.rn.ftz.f32 %f720, %f715, %f530, %f712; shl.b16 %rs461, %rs24, 4; cvt.s16.s8 %rs462, %rs461; shr.s16 %rs463, %rs462, 7; and.b16 %rs464, %rs463, -16; or.b16 %rs465, %rs464, %rs25; cvt.rn.f32.s16 %f721, %rs465; sub.ftz.f32 %f722, %f721, %f7; mul.ftz.f32 %f723, %f69, %f722; mov.b32 {%rs301, %rs306}, %r249; // begin inline asm { cvt.f32.f16 %f531, %rs301;} // end inline asm fma.rn.ftz.f32 %f724, %f723, %f531, %f716; mov.b32 {%rs302, %rs307}, %r257; // begin inline asm { cvt.f32.f16 %f532, %rs302;} // end inline asm fma.rn.ftz.f32 %f725, %f723, %f532, %f717; mov.b32 {%rs303, %rs308}, %r265; // begin inline asm { cvt.f32.f16 %f533, %rs303;} // end inline asm fma.rn.ftz.f32 %f726, %f723, %f533, %f718; mov.b32 {%rs304, %rs309}, %r273; // begin inline asm { cvt.f32.f16 %f534, %rs304;} // end inline asm fma.rn.ftz.f32 %f727, %f723, %f534, %f719; mov.b32 {%rs305, %rs310}, %r281; // begin inline asm { cvt.f32.f16 %f535, %rs305;} // end inline asm fma.rn.ftz.f32 %f728, %f723, %f535, %f720; shl.b16 %rs466, %rs26, 4; cvt.s16.s8 %rs467, %rs466; shr.s16 %rs468, %rs467, 7; and.b16 %rs469, %rs468, -16; or.b16 %rs470, %rs469, %rs27; cvt.rn.f32.s16 %f729, %rs470; sub.ftz.f32 %f730, %f729, %f7; mul.ftz.f32 %f731, %f69, %f730; // begin inline asm { cvt.f32.f16 %f536, %rs306;} // end inline asm fma.rn.ftz.f32 %f732, %f731, %f536, %f724; // begin inline asm { cvt.f32.f16 %f537, %rs307;} // end inline asm fma.rn.ftz.f32 %f733, %f731, %f537, %f725; // begin inline asm { cvt.f32.f16 %f538, %rs308;} // end inline asm fma.rn.ftz.f32 %f734, %f731, %f538, %f726; // begin inline asm { cvt.f32.f16 %f539, %rs309;} // end inline asm fma.rn.ftz.f32 %f735, %f731, %f539, %f727; // begin inline asm { cvt.f32.f16 %f540, %rs310;} // end inline asm fma.rn.ftz.f32 %f736, %f731, %f540, %f728; shl.b16 %rs471, %rs28, 4; cvt.s16.s8 %rs472, %rs471; shr.s16 %rs473, %rs472, 7; and.b16 %rs474, %rs473, 
-16; or.b16 %rs475, %rs474, %rs29; cvt.rn.f32.s16 %f737, %rs475; sub.ftz.f32 %f738, %f737, %f7; mul.ftz.f32 %f739, %f69, %f738; mov.b32 {%rs311, %rs316}, %r250; // begin inline asm { cvt.f32.f16 %f541, %rs311;} // end inline asm fma.rn.ftz.f32 %f740, %f739, %f541, %f732; mov.b32 {%rs312, %rs317}, %r258; // begin inline asm { cvt.f32.f16 %f542, %rs312;} // end inline asm fma.rn.ftz.f32 %f741, %f739, %f542, %f733; mov.b32 {%rs313, %rs318}, %r266; // begin inline asm { cvt.f32.f16 %f543, %rs313;} // end inline asm fma.rn.ftz.f32 %f742, %f739, %f543, %f734; mov.b32 {%rs314, %rs319}, %r274; // begin inline asm { cvt.f32.f16 %f544, %rs314;} // end inline asm fma.rn.ftz.f32 %f743, %f739, %f544, %f735; mov.b32 {%rs315, %rs320}, %r282; // begin inline asm { cvt.f32.f16 %f545, %rs315;} // end inline asm fma.rn.ftz.f32 %f744, %f739, %f545, %f736; shl.b16 %rs476, %rs30, 4; cvt.s16.s8 %rs477, %rs476; shr.s16 %rs478, %rs477, 7; and.b16 %rs479, %rs478, -16; or.b16 %rs480, %rs479, %rs31; cvt.rn.f32.s16 %f745, %rs480; sub.ftz.f32 %f746, %f745, %f7; mul.ftz.f32 %f747, %f69, %f746; // begin inline asm { cvt.f32.f16 %f546, %rs316;} // end inline asm fma.rn.ftz.f32 %f748, %f747, %f546, %f740; // begin inline asm { cvt.f32.f16 %f547, %rs317;} // end inline asm fma.rn.ftz.f32 %f749, %f747, %f547, %f741; // begin inline asm { cvt.f32.f16 %f548, %rs318;} // end inline asm fma.rn.ftz.f32 %f750, %f747, %f548, %f742; // begin inline asm { cvt.f32.f16 %f549, %rs319;} // end inline asm fma.rn.ftz.f32 %f751, %f747, %f549, %f743; // begin inline asm { cvt.f32.f16 %f550, %rs320;} // end inline asm fma.rn.ftz.f32 %f752, %f747, %f550, %f744; shl.b16 %rs481, %rs32, 4; cvt.s16.s8 %rs482, %rs481; shr.s16 %rs483, %rs482, 7; and.b16 %rs484, %rs483, -16; or.b16 %rs485, %rs484, %rs33; cvt.rn.f32.s16 %f753, %rs485; sub.ftz.f32 %f754, %f753, %f7; mul.ftz.f32 %f755, %f69, %f754; mov.b32 {%rs321, %rs326}, %r251; // begin inline asm { cvt.f32.f16 %f551, %rs321;} // end inline asm fma.rn.ftz.f32 %f756, %f755, 
%f551, %f748; mov.b32 {%rs322, %rs327}, %r259; // begin inline asm { cvt.f32.f16 %f552, %rs322;} // end inline asm fma.rn.ftz.f32 %f757, %f755, %f552, %f749; mov.b32 {%rs323, %rs328}, %r267; // begin inline asm { cvt.f32.f16 %f553, %rs323;} // end inline asm fma.rn.ftz.f32 %f758, %f755, %f553, %f750; mov.b32 {%rs324, %rs329}, %r275; // begin inline asm { cvt.f32.f16 %f554, %rs324;} // end inline asm fma.rn.ftz.f32 %f759, %f755, %f554, %f751; mov.b32 {%rs325, %rs330}, %r283; // begin inline asm { cvt.f32.f16 %f555, %rs325;} // end inline asm fma.rn.ftz.f32 %f760, %f755, %f555, %f752; shl.b16 %rs486, %rs34, 4; cvt.s16.s8 %rs487, %rs486; shr.s16 %rs488, %rs487, 7; and.b16 %rs489, %rs488, -16; or.b16 %rs490, %rs489, %rs34; cvt.rn.f32.s16 %f761, %rs490; sub.ftz.f32 %f762, %f761, %f7; mul.ftz.f32 %f763, %f69, %f762; // begin inline asm { cvt.f32.f16 %f556, %rs326;} // end inline asm fma.rn.ftz.f32 %f764, %f763, %f556, %f756; // begin inline asm { cvt.f32.f16 %f557, %rs327;} // end inline asm fma.rn.ftz.f32 %f765, %f763, %f557, %f757; // begin inline asm { cvt.f32.f16 %f558, %rs328;} // end inline asm fma.rn.ftz.f32 %f766, %f763, %f558, %f758; // begin inline asm { cvt.f32.f16 %f559, %rs329;} // end inline asm fma.rn.ftz.f32 %f767, %f763, %f559, %f759; // begin inline asm { cvt.f32.f16 %f560, %rs330;} // end inline asm fma.rn.ftz.f32 %f768, %f763, %f560, %f760; ld.global.v4.u32 {%r288, %r289, %r290, %r291}, [%rd7+32]; ld.global.v4.u32 {%r296, %r297, %r298, %r299}, [%rd9+16]; ld.global.v4.u32 {%r304, %r305, %r306, %r307}, [%rd10+16]; ld.global.v4.u32 {%r312, %r313, %r314, %r315}, [%rd8+32]; ld.global.v4.u32 {%r320, %r321, %r322, %r323}, [%rd11+16]; shl.b16 %rs491, %rs35, 4; cvt.s16.s8 %rs492, %rs491; shr.s16 %rs493, %rs492, 7; and.b16 %rs494, %rs493, -16; or.b16 %rs495, %rs494, %rs36; cvt.rn.f32.s16 %f769, %rs495; sub.ftz.f32 %f770, %f769, %f7; mul.ftz.f32 %f771, %f69, %f770; mov.b32 {%rs331, %rs336}, %r288; // begin inline asm { cvt.f32.f16 %f561, %rs331;} // end inline 
asm fma.rn.ftz.f32 %f772, %f771, %f561, %f764; mov.b32 {%rs332, %rs337}, %r296; // begin inline asm { cvt.f32.f16 %f562, %rs332;} // end inline asm fma.rn.ftz.f32 %f773, %f771, %f562, %f765; mov.b32 {%rs333, %rs338}, %r304; // begin inline asm { cvt.f32.f16 %f563, %rs333;} // end inline asm fma.rn.ftz.f32 %f774, %f771, %f563, %f766; mov.b32 {%rs334, %rs339}, %r312; // begin inline asm { cvt.f32.f16 %f564, %rs334;} // end inline asm fma.rn.ftz.f32 %f775, %f771, %f564, %f767; mov.b32 {%rs335, %rs340}, %r320; // begin inline asm { cvt.f32.f16 %f565, %rs335;} // end inline asm fma.rn.ftz.f32 %f776, %f771, %f565, %f768; shl.b16 %rs496, %rs37, 4; cvt.s16.s8 %rs497, %rs496; shr.s16 %rs498, %rs497, 7; and.b16 %rs499, %rs498, -16; or.b16 %rs500, %rs499, %rs38; cvt.rn.f32.s16 %f777, %rs500; sub.ftz.f32 %f778, %f777, %f7; mul.ftz.f32 %f779, %f69, %f778; // begin inline asm { cvt.f32.f16 %f566, %rs336;} // end inline asm fma.rn.ftz.f32 %f780, %f779, %f566, %f772; // begin inline asm { cvt.f32.f16 %f567, %rs337;} // end inline asm fma.rn.ftz.f32 %f781, %f779, %f567, %f773; // begin inline asm { cvt.f32.f16 %f568, %rs338;} // end inline asm fma.rn.ftz.f32 %f782, %f779, %f568, %f774; // begin inline asm { cvt.f32.f16 %f569, %rs339;} // end inline asm fma.rn.ftz.f32 %f783, %f779, %f569, %f775; // begin inline asm { cvt.f32.f16 %f570, %rs340;} // end inline asm fma.rn.ftz.f32 %f784, %f779, %f570, %f776; shl.b16 %rs501, %rs39, 4; cvt.s16.s8 %rs502, %rs501; shr.s16 %rs503, %rs502, 7; and.b16 %rs504, %rs503, -16; or.b16 %rs505, %rs504, %rs40; cvt.rn.f32.s16 %f785, %rs505; sub.ftz.f32 %f786, %f785, %f7; mul.ftz.f32 %f787, %f69, %f786; mov.b32 {%rs341, %rs346}, %r289; // begin inline asm { cvt.f32.f16 %f571, %rs341;} // end inline asm fma.rn.ftz.f32 %f788, %f787, %f571, %f780; mov.b32 {%rs342, %rs347}, %r297; // begin inline asm { cvt.f32.f16 %f572, %rs342;} // end inline asm fma.rn.ftz.f32 %f789, %f787, %f572, %f781; mov.b32 {%rs343, %rs348}, %r305; // begin inline asm { cvt.f32.f16 
%f573, %rs343;} // end inline asm fma.rn.ftz.f32 %f790, %f787, %f573, %f782; mov.b32 {%rs344, %rs349}, %r313; // begin inline asm { cvt.f32.f16 %f574, %rs344;} // end inline asm fma.rn.ftz.f32 %f791, %f787, %f574, %f783; mov.b32 {%rs345, %rs350}, %r321; // begin inline asm { cvt.f32.f16 %f575, %rs345;} // end inline asm fma.rn.ftz.f32 %f792, %f787, %f575, %f784; shl.b16 %rs506, %rs41, 4; cvt.s16.s8 %rs507, %rs506; shr.s16 %rs508, %rs507, 7; and.b16 %rs509, %rs508, -16; or.b16 %rs510, %rs509, %rs42; cvt.rn.f32.s16 %f793, %rs510; sub.ftz.f32 %f794, %f793, %f7; mul.ftz.f32 %f795, %f69, %f794; // begin inline asm { cvt.f32.f16 %f576, %rs346;} // end inline asm fma.rn.ftz.f32 %f796, %f795, %f576, %f788; // begin inline asm { cvt.f32.f16 %f577, %rs347;} // end inline asm fma.rn.ftz.f32 %f797, %f795, %f577, %f789; // begin inline asm { cvt.f32.f16 %f578, %rs348;} // end inline asm fma.rn.ftz.f32 %f798, %f795, %f578, %f790; // begin inline asm { cvt.f32.f16 %f579, %rs349;} // end inline asm fma.rn.ftz.f32 %f799, %f795, %f579, %f791; // begin inline asm { cvt.f32.f16 %f580, %rs350;} // end inline asm fma.rn.ftz.f32 %f800, %f795, %f580, %f792; shl.b16 %rs511, %rs43, 4; cvt.s16.s8 %rs512, %rs511; shr.s16 %rs513, %rs512, 7; and.b16 %rs514, %rs513, -16; or.b16 %rs515, %rs514, %rs44; cvt.rn.f32.s16 %f801, %rs515; sub.ftz.f32 %f802, %f801, %f7; mul.ftz.f32 %f803, %f69, %f802; mov.b32 {%rs351, %rs356}, %r290; // begin inline asm { cvt.f32.f16 %f581, %rs351;} // end inline asm fma.rn.ftz.f32 %f804, %f803, %f581, %f796; mov.b32 {%rs352, %rs357}, %r298; // begin inline asm { cvt.f32.f16 %f582, %rs352;} // end inline asm fma.rn.ftz.f32 %f805, %f803, %f582, %f797; mov.b32 {%rs353, %rs358}, %r306; // begin inline asm { cvt.f32.f16 %f583, %rs353;} // end inline asm fma.rn.ftz.f32 %f806, %f803, %f583, %f798; mov.b32 {%rs354, %rs359}, %r314; // begin inline asm { cvt.f32.f16 %f584, %rs354;} // end inline asm fma.rn.ftz.f32 %f807, %f803, %f584, %f799; mov.b32 {%rs355, %rs360}, %r322; // 
begin inline asm { cvt.f32.f16 %f585, %rs355;} // end inline asm fma.rn.ftz.f32 %f808, %f803, %f585, %f800; shl.b16 %rs516, %rs45, 4; cvt.s16.s8 %rs517, %rs516; shr.s16 %rs518, %rs517, 7; and.b16 %rs519, %rs518, -16; or.b16 %rs520, %rs519, %rs46; cvt.rn.f32.s16 %f809, %rs520; sub.ftz.f32 %f810, %f809, %f7; mul.ftz.f32 %f811, %f69, %f810; // begin inline asm { cvt.f32.f16 %f586, %rs356;} // end inline asm fma.rn.ftz.f32 %f812, %f811, %f586, %f804; // begin inline asm { cvt.f32.f16 %f587, %rs357;} // end inline asm fma.rn.ftz.f32 %f813, %f811, %f587, %f805; // begin inline asm { cvt.f32.f16 %f588, %rs358;} // end inline asm fma.rn.ftz.f32 %f814, %f811, %f588, %f806; // begin inline asm { cvt.f32.f16 %f589, %rs359;} // end inline asm fma.rn.ftz.f32 %f815, %f811, %f589, %f807; // begin inline asm { cvt.f32.f16 %f590, %rs360;} // end inline asm fma.rn.ftz.f32 %f816, %f811, %f590, %f808; shl.b16 %rs521, %rs47, 4; cvt.s16.s8 %rs522, %rs521; shr.s16 %rs523, %rs522, 7; and.b16 %rs524, %rs523, -16; or.b16 %rs525, %rs524, %rs48; cvt.rn.f32.s16 %f817, %rs525; sub.ftz.f32 %f818, %f817, %f7; mul.ftz.f32 %f819, %f69, %f818; mov.b32 {%rs361, %rs366}, %r291; // begin inline asm { cvt.f32.f16 %f591, %rs361;} // end inline asm fma.rn.ftz.f32 %f820, %f819, %f591, %f812; mov.b32 {%rs362, %rs367}, %r299; // begin inline asm { cvt.f32.f16 %f592, %rs362;} // end inline asm fma.rn.ftz.f32 %f821, %f819, %f592, %f813; mov.b32 {%rs363, %rs368}, %r307; // begin inline asm { cvt.f32.f16 %f593, %rs363;} // end inline asm fma.rn.ftz.f32 %f822, %f819, %f593, %f814; mov.b32 {%rs364, %rs369}, %r315; // begin inline asm { cvt.f32.f16 %f594, %rs364;} // end inline asm fma.rn.ftz.f32 %f823, %f819, %f594, %f815; mov.b32 {%rs365, %rs370}, %r323; // begin inline asm { cvt.f32.f16 %f595, %rs365;} // end inline asm fma.rn.ftz.f32 %f824, %f819, %f595, %f816; shl.b16 %rs526, %rs49, 4; cvt.s16.s8 %rs527, %rs526; shr.s16 %rs528, %rs527, 7; and.b16 %rs529, %rs528, -16; or.b16 %rs530, %rs529, %rs49; 
cvt.rn.f32.s16 %f825, %rs530; sub.ftz.f32 %f826, %f825, %f7; mul.ftz.f32 %f827, %f69, %f826; // begin inline asm { cvt.f32.f16 %f596, %rs366;} // end inline asm fma.rn.ftz.f32 %f828, %f827, %f596, %f820; // begin inline asm { cvt.f32.f16 %f597, %rs367;} // end inline asm fma.rn.ftz.f32 %f829, %f827, %f597, %f821; // begin inline asm { cvt.f32.f16 %f598, %rs368;} // end inline asm fma.rn.ftz.f32 %f830, %f827, %f598, %f822; // begin inline asm { cvt.f32.f16 %f599, %rs369;} // end inline asm fma.rn.ftz.f32 %f831, %f827, %f599, %f823; // begin inline asm { cvt.f32.f16 %f600, %rs370;} // end inline asm fma.rn.ftz.f32 %f832, %f827, %f600, %f824; ld.global.v4.u32 {%r328, %r329, %r330, %r331}, [%rd7+48]; ld.global.v4.u32 {%r336, %r337, %r338, %r339}, [%rd9+32]; ld.global.v4.u32 {%r344, %r345, %r346, %r347}, [%rd10+32]; ld.global.v4.u32 {%r352, %r353, %r354, %r355}, [%rd8+48]; ld.global.v4.u32 {%r360, %r361, %r362, %r363}, [%rd11+32]; shl.b16 %rs531, %rs50, 4; cvt.s16.s8 %rs532, %rs531; shr.s16 %rs533, %rs532, 7; and.b16 %rs534, %rs533, -16; or.b16 %rs535, %rs534, %rs51; cvt.rn.f32.s16 %f833, %rs535; sub.ftz.f32 %f834, %f833, %f7; mul.ftz.f32 %f835, %f69, %f834; mov.b32 {%rs371, %rs376}, %r328; // begin inline asm { cvt.f32.f16 %f601, %rs371;} // end inline asm fma.rn.ftz.f32 %f836, %f835, %f601, %f828; mov.b32 {%rs372, %rs377}, %r336; // begin inline asm { cvt.f32.f16 %f602, %rs372;} // end inline asm fma.rn.ftz.f32 %f837, %f835, %f602, %f829; mov.b32 {%rs373, %rs378}, %r344; // begin inline asm { cvt.f32.f16 %f603, %rs373;} // end inline asm fma.rn.ftz.f32 %f838, %f835, %f603, %f830; mov.b32 {%rs374, %rs379}, %r352; // begin inline asm { cvt.f32.f16 %f604, %rs374;} // end inline asm fma.rn.ftz.f32 %f839, %f835, %f604, %f831; mov.b32 {%rs375, %rs380}, %r360; // begin inline asm { cvt.f32.f16 %f605, %rs375;} // end inline asm fma.rn.ftz.f32 %f840, %f835, %f605, %f832; shl.b16 %rs536, %rs52, 4; cvt.s16.s8 %rs537, %rs536; shr.s16 %rs538, %rs537, 7; and.b16 %rs539, %rs538, 
-16; or.b16 %rs540, %rs539, %rs53; cvt.rn.f32.s16 %f841, %rs540; sub.ftz.f32 %f842, %f841, %f7; mul.ftz.f32 %f843, %f69, %f842; // begin inline asm { cvt.f32.f16 %f606, %rs376;} // end inline asm fma.rn.ftz.f32 %f844, %f843, %f606, %f836; // begin inline asm { cvt.f32.f16 %f607, %rs377;} // end inline asm fma.rn.ftz.f32 %f845, %f843, %f607, %f837; // begin inline asm { cvt.f32.f16 %f608, %rs378;} // end inline asm fma.rn.ftz.f32 %f846, %f843, %f608, %f838; // begin inline asm { cvt.f32.f16 %f609, %rs379;} // end inline asm fma.rn.ftz.f32 %f847, %f843, %f609, %f839; // begin inline asm { cvt.f32.f16 %f610, %rs380;} // end inline asm fma.rn.ftz.f32 %f848, %f843, %f610, %f840; shl.b16 %rs541, %rs54, 4; cvt.s16.s8 %rs542, %rs541; shr.s16 %rs543, %rs542, 7; and.b16 %rs544, %rs543, -16; or.b16 %rs545, %rs544, %rs55; cvt.rn.f32.s16 %f849, %rs545; sub.ftz.f32 %f850, %f849, %f7; mul.ftz.f32 %f851, %f69, %f850; mov.b32 {%rs381, %rs386}, %r329; // begin inline asm { cvt.f32.f16 %f611, %rs381;} // end inline asm fma.rn.ftz.f32 %f852, %f851, %f611, %f844; mov.b32 {%rs382, %rs387}, %r337; // begin inline asm { cvt.f32.f16 %f612, %rs382;} // end inline asm fma.rn.ftz.f32 %f853, %f851, %f612, %f845; mov.b32 {%rs383, %rs388}, %r345; // begin inline asm { cvt.f32.f16 %f613, %rs383;} // end inline asm fma.rn.ftz.f32 %f854, %f851, %f613, %f846; mov.b32 {%rs384, %rs389}, %r353; // begin inline asm { cvt.f32.f16 %f614, %rs384;} // end inline asm fma.rn.ftz.f32 %f855, %f851, %f614, %f847; mov.b32 {%rs385, %rs390}, %r361; // begin inline asm { cvt.f32.f16 %f615, %rs385;} // end inline asm fma.rn.ftz.f32 %f856, %f851, %f615, %f848; shl.b16 %rs546, %rs56, 4; cvt.s16.s8 %rs547, %rs546; shr.s16 %rs548, %rs547, 7; and.b16 %rs549, %rs548, -16; or.b16 %rs550, %rs549, %rs57; cvt.rn.f32.s16 %f857, %rs550; sub.ftz.f32 %f858, %f857, %f7; mul.ftz.f32 %f859, %f69, %f858; // begin inline asm { cvt.f32.f16 %f616, %rs386;} // end inline asm fma.rn.ftz.f32 %f860, %f859, %f616, %f852; // begin inline asm { 
cvt.f32.f16 %f617, %rs387;} // end inline asm fma.rn.ftz.f32 %f861, %f859, %f617, %f853; // begin inline asm { cvt.f32.f16 %f618, %rs388;} // end inline asm fma.rn.ftz.f32 %f862, %f859, %f618, %f854; // begin inline asm { cvt.f32.f16 %f619, %rs389;} // end inline asm fma.rn.ftz.f32 %f863, %f859, %f619, %f855; // begin inline asm { cvt.f32.f16 %f620, %rs390;} // end inline asm fma.rn.ftz.f32 %f864, %f859, %f620, %f856; shl.b16 %rs551, %rs58, 4; cvt.s16.s8 %rs552, %rs551; shr.s16 %rs553, %rs552, 7; and.b16 %rs554, %rs553, -16; or.b16 %rs555, %rs554, %rs59; cvt.rn.f32.s16 %f865, %rs555; sub.ftz.f32 %f866, %f865, %f7; mul.ftz.f32 %f867, %f69, %f866; mov.b32 {%rs391, %rs396}, %r330; // begin inline asm { cvt.f32.f16 %f621, %rs391;} // end inline asm fma.rn.ftz.f32 %f868, %f867, %f621, %f860; mov.b32 {%rs392, %rs397}, %r338; // begin inline asm { cvt.f32.f16 %f622, %rs392;} // end inline asm fma.rn.ftz.f32 %f869, %f867, %f622, %f861; mov.b32 {%rs393, %rs398}, %r346; // begin inline asm { cvt.f32.f16 %f623, %rs393;} // end inline asm fma.rn.ftz.f32 %f870, %f867, %f623, %f862; mov.b32 {%rs394, %rs399}, %r354; // begin inline asm { cvt.f32.f16 %f624, %rs394;} // end inline asm fma.rn.ftz.f32 %f871, %f867, %f624, %f863; mov.b32 {%rs395, %rs400}, %r362; // begin inline asm { cvt.f32.f16 %f625, %rs395;} // end inline asm fma.rn.ftz.f32 %f872, %f867, %f625, %f864; shl.b16 %rs556, %rs60, 4; cvt.s16.s8 %rs557, %rs556; shr.s16 %rs558, %rs557, 7; and.b16 %rs559, %rs558, -16; or.b16 %rs560, %rs559, %rs61; cvt.rn.f32.s16 %f873, %rs560; sub.ftz.f32 %f874, %f873, %f7; mul.ftz.f32 %f875, %f69, %f874; // begin inline asm { cvt.f32.f16 %f626, %rs396;} // end inline asm fma.rn.ftz.f32 %f876, %f875, %f626, %f868; // begin inline asm { cvt.f32.f16 %f627, %rs397;} // end inline asm fma.rn.ftz.f32 %f877, %f875, %f627, %f869; // begin inline asm { cvt.f32.f16 %f628, %rs398;} // end inline asm fma.rn.ftz.f32 %f878, %f875, %f628, %f870; // begin inline asm { cvt.f32.f16 %f629, %rs399;} // end 
inline asm fma.rn.ftz.f32 %f879, %f875, %f629, %f871; // begin inline asm { cvt.f32.f16 %f630, %rs400;} // end inline asm fma.rn.ftz.f32 %f880, %f875, %f630, %f872; shl.b16 %rs561, %rs62, 4; cvt.s16.s8 %rs562, %rs561; shr.s16 %rs563, %rs562, 7; and.b16 %rs564, %rs563, -16; or.b16 %rs565, %rs564, %rs63; cvt.rn.f32.s16 %f881, %rs565; sub.ftz.f32 %f882, %f881, %f7; mul.ftz.f32 %f883, %f69, %f882; mov.b32 {%rs401, %rs406}, %r331; // begin inline asm { cvt.f32.f16 %f631, %rs401;} // end inline asm fma.rn.ftz.f32 %f884, %f883, %f631, %f876; mov.b32 {%rs402, %rs407}, %r339; // begin inline asm { cvt.f32.f16 %f632, %rs402;} // end inline asm fma.rn.ftz.f32 %f885, %f883, %f632, %f877; mov.b32 {%rs403, %rs408}, %r347; // begin inline asm { cvt.f32.f16 %f633, %rs403;} // end inline asm fma.rn.ftz.f32 %f886, %f883, %f633, %f878; mov.b32 {%rs404, %rs409}, %r355; // begin inline asm { cvt.f32.f16 %f634, %rs404;} // end inline asm fma.rn.ftz.f32 %f887, %f883, %f634, %f879; mov.b32 {%rs405, %rs410}, %r363; // begin inline asm { cvt.f32.f16 %f635, %rs405;} // end inline asm fma.rn.ftz.f32 %f888, %f883, %f635, %f880; shl.b16 %rs566, %rs64, 4; cvt.s16.s8 %rs567, %rs566; shr.s16 %rs568, %rs567, 7; and.b16 %rs569, %rs568, -16; or.b16 %rs570, %rs569, %rs64; cvt.rn.f32.s16 %f889, %rs570; sub.ftz.f32 %f890, %f889, %f7; mul.ftz.f32 %f891, %f69, %f890; // begin inline asm { cvt.f32.f16 %f636, %rs406;} // end inline asm fma.rn.ftz.f32 %f1011, %f891, %f636, %f884; // begin inline asm { cvt.f32.f16 %f637, %rs407;} // end inline asm fma.rn.ftz.f32 %f1010, %f891, %f637, %f885; // begin inline asm { cvt.f32.f16 %f638, %rs408;} // end inline asm fma.rn.ftz.f32 %f1009, %f891, %f638, %f886; // begin inline asm { cvt.f32.f16 %f639, %rs409;} // end inline asm fma.rn.ftz.f32 %f1008, %f891, %f639, %f887; // begin inline asm { cvt.f32.f16 %f640, %rs410;} // end inline asm fma.rn.ftz.f32 %f1007, %f891, %f640, %f888; $L__BB0_8: add.s32 %r512, %r512, 4; shl.b32 %r368, %r512, 5; add.s32 %r511, %r368, %r53; 
shl.b32 %r510, %r511, 2; setp.lt.u32 %p7, %r510, %r50; @%p7 bra $L__BB0_2; $L__BB0_9: mov.u32 %r370, %tid.y; shl.b32 %r371, %r370, 5; add.s32 %r40, %r371, %r53; setp.lt.u32 %p8, %r40, 32; shl.b32 %r373, %r40, 2; mov.u32 %r374, _ZZ9gemv_int4ILi4ELi128ELi5EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage; add.s32 %r375, %r374, %r373; @%p8 bra $L__BB0_11; add.s32 %r505, %r375, -112; st.shared.f32 [%r505], %f1011; $L__BB0_11: setp.gt.u32 %p9, %r40, 31; bar.sync 0; mad.lo.s32 %r42, %r40, 12, %r374; @%p9 bra $L__BB0_13; mov.u32 %r394, 16; ld.shared.f32 %f907, [%r42+16]; add.ftz.f32 %f908, %f1011, %f907; ld.shared.f32 %f909, [%r42+20]; add.ftz.f32 %f910, %f908, %f909; ld.shared.f32 %f911, [%r42+24]; add.ftz.f32 %f894, %f910, %f911; mov.u32 %r382, 1; mov.u32 %r395, 31; mov.u32 %r396, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f894, %r382, %r395, %r396; @p add.f32 r0, r0, %f894; mov.f32 %f892, r0;} // end inline asm mov.u32 %r385, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f892, %r385, %r395, %r396; @p add.f32 r0, r0, %f892; mov.f32 %f895, r0;} // end inline asm mov.u32 %r388, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f895, %r388, %r395, %r396; @p add.f32 r0, r0, %f895; mov.f32 %f898, r0;} // end inline asm mov.u32 %r391, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f898, %r391, %r395, %r396; @p add.f32 r0, r0, %f898; mov.f32 %f901, r0;} // end inline asm // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f901, %r394, %r395, %r396; @p add.f32 r0, r0, %f901; mov.f32 %f1011, r0;} // end inline asm $L__BB0_13: @%p8 bra $L__BB0_15; add.s32 %r506, %r375, -112; st.shared.f32 [%r506+640], %f1010; $L__BB0_15: bar.sync 0; @%p9 bra $L__BB0_17; ld.shared.f32 %f927, [%r42+656]; add.ftz.f32 %f928, %f1010, %f927; ld.shared.f32 %f929, [%r42+660]; add.ftz.f32 %f930, %f928, %f929; ld.shared.f32 %f931, [%r42+664]; 
add.ftz.f32 %f914, %f930, %f931; mov.u32 %r406, 1; mov.u32 %r419, 31; mov.u32 %r420, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f914, %r406, %r419, %r420; @p add.f32 r0, r0, %f914; mov.f32 %f912, r0;} // end inline asm mov.u32 %r409, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f912, %r409, %r419, %r420; @p add.f32 r0, r0, %f912; mov.f32 %f915, r0;} // end inline asm mov.u32 %r412, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f915, %r412, %r419, %r420; @p add.f32 r0, r0, %f915; mov.f32 %f918, r0;} // end inline asm mov.u32 %r415, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f918, %r415, %r419, %r420; @p add.f32 r0, r0, %f918; mov.f32 %f921, r0;} // end inline asm mov.u32 %r418, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f921, %r418, %r419, %r420; @p add.f32 r0, r0, %f921; mov.f32 %f1010, r0;} // end inline asm $L__BB0_17: @%p8 bra $L__BB0_19; add.s32 %r507, %r375, -112; st.shared.f32 [%r507+1280], %f1009; $L__BB0_19: bar.sync 0; @%p9 bra $L__BB0_21; ld.shared.f32 %f947, [%r42+1296]; add.ftz.f32 %f948, %f1009, %f947; ld.shared.f32 %f949, [%r42+1300]; add.ftz.f32 %f950, %f948, %f949; ld.shared.f32 %f951, [%r42+1304]; add.ftz.f32 %f934, %f950, %f951; mov.u32 %r430, 1; mov.u32 %r443, 31; mov.u32 %r444, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f934, %r430, %r443, %r444; @p add.f32 r0, r0, %f934; mov.f32 %f932, r0;} // end inline asm mov.u32 %r433, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f932, %r433, %r443, %r444; @p add.f32 r0, r0, %f932; mov.f32 %f935, r0;} // end inline asm mov.u32 %r436, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f935, %r436, %r443, %r444; @p add.f32 r0, r0, %f935; mov.f32 %f938, r0;} // end inline asm mov.u32 %r439, 8; // begin inline asm { .reg .f32 r0; .reg .pred 
p; shfl.sync.down.b32 r0|p, %f938, %r439, %r443, %r444; @p add.f32 r0, r0, %f938; mov.f32 %f941, r0;} // end inline asm mov.u32 %r442, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f941, %r442, %r443, %r444; @p add.f32 r0, r0, %f941; mov.f32 %f1009, r0;} // end inline asm $L__BB0_21: @%p8 bra $L__BB0_23; add.s32 %r508, %r375, -112; st.shared.f32 [%r508+1920], %f1008; $L__BB0_23: bar.sync 0; @%p9 bra $L__BB0_25; ld.shared.f32 %f967, [%r42+1936]; add.ftz.f32 %f968, %f1008, %f967; ld.shared.f32 %f969, [%r42+1940]; add.ftz.f32 %f970, %f968, %f969; ld.shared.f32 %f971, [%r42+1944]; add.ftz.f32 %f954, %f970, %f971; mov.u32 %r454, 1; mov.u32 %r467, 31; mov.u32 %r468, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f954, %r454, %r467, %r468; @p add.f32 r0, r0, %f954; mov.f32 %f952, r0;} // end inline asm mov.u32 %r457, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f952, %r457, %r467, %r468; @p add.f32 r0, r0, %f952; mov.f32 %f955, r0;} // end inline asm mov.u32 %r460, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f955, %r460, %r467, %r468; @p add.f32 r0, r0, %f955; mov.f32 %f958, r0;} // end inline asm mov.u32 %r463, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f958, %r463, %r467, %r468; @p add.f32 r0, r0, %f958; mov.f32 %f961, r0;} // end inline asm mov.u32 %r466, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f961, %r466, %r467, %r468; @p add.f32 r0, r0, %f961; mov.f32 %f1008, r0;} // end inline asm $L__BB0_25: @%p8 bra $L__BB0_27; add.s32 %r509, %r375, -112; st.shared.f32 [%r509+2560], %f1007; $L__BB0_27: bar.sync 0; @%p9 bra $L__BB0_29; ld.shared.f32 %f987, [%r42+2576]; add.ftz.f32 %f988, %f1007, %f987; ld.shared.f32 %f989, [%r42+2580]; add.ftz.f32 %f990, %f988, %f989; ld.shared.f32 %f991, [%r42+2584]; add.ftz.f32 %f974, %f990, %f991; mov.u32 %r478, 1; mov.u32 %r491, 31; 
mov.u32 %r492, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f974, %r478, %r491, %r492; @p add.f32 r0, r0, %f974; mov.f32 %f972, r0;} // end inline asm mov.u32 %r481, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f972, %r481, %r491, %r492; @p add.f32 r0, r0, %f972; mov.f32 %f975, r0;} // end inline asm mov.u32 %r484, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f975, %r484, %r491, %r492; @p add.f32 r0, r0, %f975; mov.f32 %f978, r0;} // end inline asm mov.u32 %r487, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f978, %r487, %r491, %r492; @p add.f32 r0, r0, %f978; mov.f32 %f981, r0;} // end inline asm mov.u32 %r490, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f981, %r490, %r491, %r492; @p add.f32 r0, r0, %f981; mov.f32 %f1007, r0;} // end inline asm $L__BB0_29: or.b32 %r495, %r53, %r370; setp.ne.s32 %p18, %r495, 0; @%p18 bra $L__BB0_41; ld.param.u64 %rd77, [_Z28dequant_gemv_group128_batch523DequantGemvKernelParams_param_0+8]; setp.eq.s64 %p19, %rd77, 0; mul.ftz.f32 %f1022, %f62, %f1011; mov.u32 %r496, %ctaid.x; cvt.s64.s32 %rd12, %r496; @%p19 bra $L__BB0_32; shl.b64 %rd47, %rd12, 1; add.s64 %rd48, %rd2, %rd47; ld.global.u16 %rs571, [%rd48]; // begin inline asm { cvt.f32.f16 %f992, %rs571;} // end inline asm fma.rn.ftz.f32 %f1022, %f63, %f992, %f1022; $L__BB0_32: ld.param.u64 %rd78, [_Z28dequant_gemv_group128_batch523DequantGemvKernelParams_param_0]; // begin inline asm { cvt.rn.f16.f32 %rs572, %f1022;} // end inline asm // begin inline asm createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd49, 1.0; // end inline asm shl.b64 %rd52, %rd12, 1; add.s64 %rd50, %rd78, %rd52; // begin inline asm st.global.L1::evict_normal.L2::cache_hint.u16 [%rd50], %rs572, %rd49; // end inline asm mul.ftz.f32 %f1023, %f62, %f1010; add.s32 %r498, %r49, %r496; cvt.s64.s32 %rd15, %r498; @%p19 bra $L__BB0_34; 
shl.b64 %rd53, %rd15, 1; add.s64 %rd54, %rd2, %rd53; ld.global.u16 %rs574, [%rd54]; // begin inline asm { cvt.f32.f16 %f994, %rs574;} // end inline asm fma.rn.ftz.f32 %f1023, %f63, %f994, %f1023; $L__BB0_34: mul.wide.s32 %rd58, %r49, 2; add.s64 %rd56, %rd50, %rd58; // begin inline asm { cvt.rn.f16.f32 %rs575, %f1023;} // end inline asm // begin inline asm createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd55, 1.0; // end inline asm // begin inline asm st.global.L1::evict_normal.L2::cache_hint.u16 [%rd56], %rs575, %rd55; // end inline asm mul.ftz.f32 %f1024, %f62, %f1009; cvt.u32.u64 %r499, %rd15; add.s32 %r500, %r499, %r49; cvt.s64.s32 %rd16, %r500; @%p19 bra $L__BB0_36; shl.b64 %rd59, %rd16, 1; add.s64 %rd60, %rd2, %rd59; ld.global.u16 %rs577, [%rd60]; // begin inline asm { cvt.f32.f16 %f996, %rs577;} // end inline asm fma.rn.ftz.f32 %f1024, %f63, %f996, %f1024; $L__BB0_36: ld.param.u64 %rd79, [_Z28dequant_gemv_group128_batch523DequantGemvKernelParams_param_0]; // begin inline asm { cvt.rn.f16.f32 %rs578, %f1024;} // end inline asm // begin inline asm createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd61, 1.0; // end inline asm shl.b64 %rd64, %rd16, 1; add.s64 %rd62, %rd79, %rd64; // begin inline asm st.global.L1::evict_normal.L2::cache_hint.u16 [%rd62], %rs578, %rd61; // end inline asm mul.ftz.f32 %f1025, %f62, %f1008; cvt.u32.u64 %r501, %rd16; add.s32 %r502, %r501, %r49; cvt.s64.s32 %rd17, %r502; @%p19 bra $L__BB0_38; shl.b64 %rd65, %rd17, 1; add.s64 %rd66, %rd2, %rd65; ld.global.u16 %rs580, [%rd66]; // begin inline asm { cvt.f32.f16 %f998, %rs580;} // end inline asm fma.rn.ftz.f32 %f1025, %f63, %f998, %f1025; $L__BB0_38: ld.param.u64 %rd80, [_Z28dequant_gemv_group128_batch523DequantGemvKernelParams_param_0]; // begin inline asm { cvt.rn.f16.f32 %rs581, %f1025;} // end inline asm // begin inline asm createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd67, 1.0; // end inline asm shl.b64 %rd70, %rd17, 1; add.s64 %rd68, 
%rd80, %rd70;    // operand tail of the `add.s64 %rd68,` that begins at the end of the previous line
    // Store the 16-bit value %rs581 to [%rd68]; %rd67 is passed as the L2 cache-hint operand.
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd68], %rs581, %rd67;
    // end inline asm

    // ---- Final output element of this epilogue --------------------------------
    // %f1026 = %f62 * %f1007 (%f62 is the first f32 of the pair loaded from param_0+64).
    mul.ftz.f32 %f1026, %f62, %f1007;
    // Element index: low 32 bits of %rd17 plus %r49 (the u32 at param_0+52),
    // sign-extended to 64 bits in %rd18.
    cvt.u32.u64 %r503, %rd17;
    add.s32 %r504, %r503, %r49;
    cvt.s64.s32 %rd18, %r504;
    // %p19 is true when the u64 at param_0+8 is zero; in that case skip the load/fma below.
    @%p19 bra $L__BB0_40;
    // Load one 16-bit value at %rd2 + 2*%rd18 (%rd2 is the global-space form of the
    // pointer at param_0+8), widen it from f16 to f32, then
    // %f1026 = %f63 * %f1000 + %f1026 (%f63 is the second f32 loaded from param_0+64).
    shl.b64 %rd71, %rd18, 1;
    add.s64 %rd72, %rd2, %rd71;
    ld.global.u16 %rs583, [%rd72];
    // begin inline asm
    { cvt.f32.f16 %f1000, %rs583;}
    // end inline asm
    fma.rn.ftz.f32 %f1026, %f63, %f1000, %f1026;
$L__BB0_40:
    // Re-read the u64 at param_0+0, convert %f1026 to f16 (cvt.rn), create an L2 cache
    // policy in %rd73, and store the f16 at (param_0+0 pointer) + 2*%rd18.
    // NOTE(review): %rd81 is used as a st.global address without a cvta.to.global,
    // unlike %rd2 -- presumably the inline asm is handed the pointer as-is; confirm intended.
    ld.param.u64 %rd81, [_Z28dequant_gemv_group128_batch523DequantGemvKernelParams_param_0];
    // begin inline asm
    { cvt.rn.f16.f32 %rs584, %f1026;}
    // end inline asm
    // begin inline asm
    createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd73, 1.0;
    // end inline asm
    shl.b64 %rd76, %rd18, 1;
    add.s64 %rd74, %rd81, %rd76;
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd74], %rs584, %rd73;
    // end inline asm
$L__BB0_41:
    // Exit of the kernel whose body starts before this chunk; threads that took the
    // `@%p18 bra $L__BB0_41` branch arrive here without executing the stores above.
    ret;
}

// .globl _ZN3cub11EmptyKernelIvEEvv
// Entry point with the mangled name _ZN3cub11EmptyKernelIvEEvv, i.e.
// `void cub::EmptyKernel<void>()` under Itanium C++ name mangling.
// It declares no parameters and no registers, and its body is a single `ret`.
// NOTE(review): presumably emitted because this translation unit pulls in CUB/Thrust
// headers (the file also declares thrust placeholder globals) -- confirm before
// treating it as dead code.
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
    ret;
}