.version 7.8 .target sm_80 .address_size 64 // .globl _Z28dequant_gemv_group128_batch423DequantGemvKernelParams // _ZZ9gemv_int4ILi4ELi128ELi4EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage has been demoted .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_f89788766thrust6system6detail10sequential3seqE[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_f89788766thrust12placeholders2_1E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_f89788766thrust12placeholders2_2E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_f89788766thrust12placeholders2_3E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_f89788766thrust12placeholders2_4E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_f89788766thrust12placeholders2_5E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_f89788766thrust12placeholders2_6E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_f89788766thrust12placeholders2_7E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_f89788766thrust12placeholders2_8E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_f89788766thrust12placeholders2_9E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_f89788766thrust12placeholders3_10E[1]; .visible .entry _Z28dequant_gemv_group128_batch423DequantGemvKernelParams( .param .align 8 .b8 _Z28dequant_gemv_group128_batch423DequantGemvKernelParams_param_0[80] ) { .reg .pred %p<21>; .reg .b16 %rs<520>; .reg .f32 %f<862>; .reg .b32 %r<426>; .reg .b64 %rd<69>; // demoted variable .shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi128ELi4EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[2560]; ld.param.v2.u32 {%r44, %r45}, [_Z28dequant_gemv_group128_batch423DequantGemvKernelParams_param_0+48]; ld.param.v2.u32 {%r46, %r47}, [_Z28dequant_gemv_group128_batch423DequantGemvKernelParams_param_0+56]; ld.param.v2.f32 {%f51, %f52}, [_Z28dequant_gemv_group128_batch423DequantGemvKernelParams_param_0+64]; ld.param.v4.u8 {%rs73, %rs74, %rs75, %rs76}, 
[_Z28dequant_gemv_group128_batch423DequantGemvKernelParams_param_0+72]; ld.param.u64 %rd22, [_Z28dequant_gemv_group128_batch423DequantGemvKernelParams_param_0+40]; ld.param.u64 %rd21, [_Z28dequant_gemv_group128_batch423DequantGemvKernelParams_param_0+32]; ld.param.u64 %rd20, [_Z28dequant_gemv_group128_batch423DequantGemvKernelParams_param_0+24]; ld.param.u64 %rd19, [_Z28dequant_gemv_group128_batch423DequantGemvKernelParams_param_0+16]; ld.param.u64 %rd18, [_Z28dequant_gemv_group128_batch423DequantGemvKernelParams_param_0+8]; cvta.to.global.u64 %rd2, %rd18; mov.u32 %r425, %tid.y; shl.b32 %r48, %r425, 5; mov.u32 %r49, %tid.x; add.s32 %r424, %r48, %r49; shl.b32 %r423, %r424, 2; setp.ge.u32 %p1, %r423, %r46; mov.f32 %f846, 0f00000000; mov.f32 %f847, %f846; mov.f32 %f848, %f846; mov.f32 %f849, %f846; @%p1 bra $L__BB0_9; cvta.to.global.u64 %rd3, %rd19; mov.u32 %r50, %ctaid.x; mul.lo.s32 %r6, %r47, %r50; $L__BB0_2: mad.lo.s32 %r56, %r46, %r50, %r423; mul.wide.u32 %rd29, %r56, 4; add.s64 %rd24, %rd20, %rd29; // begin inline asm createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd23, 1.0; // end inline asm // begin inline asm ld.global.L1::no_allocate.L2::cache_hint.v4.u32 {%r51,%r52,%r53,%r54}, [%rd24], %rd23; // end inline asm shr.u32 %r58, %r49, 2; shl.b32 %r59, %r425, 3; add.s32 %r14, %r59, %r58; add.s32 %r15, %r14, %r6; mul.wide.s32 %rd30, %r15, 2; add.s64 %rd27, %rd22, %rd30; // begin inline asm createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd26, 1.0; // end inline asm // begin inline asm ld.global.L1::evict_last.L2::cache_hint.u16 %rs81, [%rd27], %rd26; // end inline asm // begin inline asm { cvt.f32.f16 %f57, %rs81;} // end inline asm shl.b16 %rs519, %rs73, 3; setp.eq.s64 %p2, %rd21, 0; @%p2 bra $L__BB0_4; shr.u32 %r60, %r15, 31; add.s32 %r61, %r15, %r60; shr.s32 %r62, %r61, 1; cvt.s64.s32 %rd34, %r62; add.s64 %rd32, %rd21, %rd34; // begin inline asm createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd31, 1.0; // end inline asm 
// begin inline asm ld.global.L1::evict_last.L2::cache_hint.u8 %rs83, [%rd32], %rd31; // end inline asm cvt.u32.u16 %r63, %rs83; and.b32 %r64, %r63, 255; shl.b32 %r65, %r14, 2; and.b32 %r66, %r65, 4; shr.u32 %r67, %r64, %r66; cvt.u16.u32 %rs84, %r67; and.b16 %rs519, %rs84, 15; $L__BB0_4: shl.b32 %r16, %r424, 5; setp.ge.s32 %p3, %r16, %r44; @%p3 bra $L__BB0_8; setp.eq.s16 %p4, %rs73, 0; shr.u16 %rs86, %rs519, 3; and.b16 %rs87, %rs86, 1; setp.eq.b16 %p5, %rs87, 1; and.pred %p6, %p4, %p5; selp.b16 %rs88, -16, 0, %p6; or.b16 %rs89, %rs88, %rs519; cvt.s16.s8 %rs90, %rs89; cvt.rn.f32.s16 %f6, %rs90; mul.wide.s32 %rd35, %r16, 2; add.s64 %rd7, %rd3, %rd35; ld.global.v4.u32 {%r68, %r69, %r70, %r71}, [%rd7]; mul.wide.s32 %rd36, %r44, 2; add.s64 %rd37, %rd7, %rd36; ld.global.v4.u32 {%r72, %r73, %r74, %r75}, [%rd37]; add.s32 %r76, %r16, %r44; add.s32 %r77, %r76, %r44; mul.wide.s32 %rd38, %r77, 2; add.s64 %rd8, %rd3, %rd38; ld.global.v4.u32 {%r78, %r79, %r80, %r81}, [%rd8]; add.s32 %r82, %r77, %r44; mul.wide.s32 %rd39, %r82, 2; add.s64 %rd9, %rd3, %rd39; ld.global.v4.u32 {%r83, %r84, %r85, %r86}, [%rd9]; cvt.u16.u32 %rs5, %r51; and.b16 %rs6, %rs5, 15; shr.u32 %r87, %r51, 4; cvt.u16.u32 %rs7, %r87; and.b16 %rs8, %rs7, 15; shr.u32 %r88, %r51, 8; cvt.u16.u32 %rs9, %r88; and.b16 %rs10, %rs9, 15; shr.u32 %r89, %r51, 12; cvt.u16.u32 %rs11, %r89; and.b16 %rs12, %rs11, 15; shr.u32 %r90, %r51, 16; cvt.u16.u32 %rs13, %r90; and.b16 %rs14, %rs13, 15; shr.u32 %r91, %r51, 20; cvt.u16.u32 %rs15, %r91; and.b16 %rs16, %rs15, 15; shr.u32 %r92, %r51, 24; cvt.u16.u32 %rs17, %r92; and.b16 %rs18, %rs17, 15; shr.u32 %r93, %r51, 28; cvt.u16.u32 %rs19, %r93; add.s32 %r94, %r76, 8; mul.wide.s32 %rd40, %r94, 2; add.s64 %rd10, %rd3, %rd40; cvt.u16.u32 %rs20, %r52; and.b16 %rs21, %rs20, 15; shr.u32 %r95, %r52, 4; cvt.u16.u32 %rs22, %r95; and.b16 %rs23, %rs22, 15; shr.u32 %r96, %r52, 8; cvt.u16.u32 %rs24, %r96; and.b16 %rs25, %rs24, 15; shr.u32 %r97, %r52, 12; cvt.u16.u32 %rs26, %r97; and.b16 %rs27, %rs26, 
15; shr.u32 %r98, %r52, 16; cvt.u16.u32 %rs28, %r98; and.b16 %rs29, %rs28, 15; shr.u32 %r99, %r52, 20; cvt.u16.u32 %rs30, %r99; and.b16 %rs31, %rs30, 15; shr.u32 %r100, %r52, 24; cvt.u16.u32 %rs32, %r100; and.b16 %rs33, %rs32, 15; shr.u32 %r101, %r52, 28; cvt.u16.u32 %rs34, %r101; cvt.u16.u32 %rs35, %r53; and.b16 %rs36, %rs35, 15; shr.u32 %r102, %r53, 4; cvt.u16.u32 %rs37, %r102; and.b16 %rs38, %rs37, 15; shr.u32 %r103, %r53, 8; cvt.u16.u32 %rs39, %r103; and.b16 %rs40, %rs39, 15; shr.u32 %r104, %r53, 12; cvt.u16.u32 %rs41, %r104; and.b16 %rs42, %rs41, 15; shr.u32 %r105, %r53, 16; cvt.u16.u32 %rs43, %r105; and.b16 %rs44, %rs43, 15; shr.u32 %r106, %r53, 20; cvt.u16.u32 %rs45, %r106; and.b16 %rs46, %rs45, 15; shr.u32 %r107, %r53, 24; cvt.u16.u32 %rs47, %r107; and.b16 %rs48, %rs47, 15; shr.u32 %r108, %r53, 28; cvt.u16.u32 %rs49, %r108; cvt.u16.u32 %rs50, %r54; and.b16 %rs51, %rs50, 15; shr.u32 %r109, %r54, 4; cvt.u16.u32 %rs52, %r109; and.b16 %rs53, %rs52, 15; shr.u32 %r110, %r54, 8; cvt.u16.u32 %rs54, %r110; and.b16 %rs55, %rs54, 15; shr.u32 %r111, %r54, 12; cvt.u16.u32 %rs56, %r111; and.b16 %rs57, %rs56, 15; shr.u32 %r112, %r54, 16; cvt.u16.u32 %rs58, %r112; and.b16 %rs59, %rs58, 15; shr.u32 %r113, %r54, 20; cvt.u16.u32 %rs60, %r113; and.b16 %rs61, %rs60, 15; shr.u32 %r114, %r54, 24; cvt.u16.u32 %rs62, %r114; and.b16 %rs63, %rs62, 15; shr.u32 %r115, %r54, 28; cvt.u16.u32 %rs64, %r115; @%p4 bra $L__BB0_7; cvt.rn.f32.s16 %f186, %rs6; sub.ftz.f32 %f187, %f186, %f6; mul.ftz.f32 %f188, %f57, %f187; mov.b32 {%rs91, %rs95}, %r68; // begin inline asm { cvt.f32.f16 %f58, %rs91;} // end inline asm fma.rn.ftz.f32 %f189, %f188, %f58, %f849; mov.b32 {%rs92, %rs96}, %r72; // begin inline asm { cvt.f32.f16 %f59, %rs92;} // end inline asm fma.rn.ftz.f32 %f190, %f188, %f59, %f848; mov.b32 {%rs93, %rs97}, %r78; // begin inline asm { cvt.f32.f16 %f60, %rs93;} // end inline asm fma.rn.ftz.f32 %f191, %f188, %f60, %f847; mov.b32 {%rs94, %rs98}, %r83; // begin inline asm { cvt.f32.f16 
%f61, %rs94;} // end inline asm fma.rn.ftz.f32 %f192, %f188, %f61, %f846; cvt.rn.f32.s16 %f193, %rs8; sub.ftz.f32 %f194, %f193, %f6; mul.ftz.f32 %f195, %f57, %f194; // begin inline asm { cvt.f32.f16 %f62, %rs95;} // end inline asm fma.rn.ftz.f32 %f196, %f195, %f62, %f189; // begin inline asm { cvt.f32.f16 %f63, %rs96;} // end inline asm fma.rn.ftz.f32 %f197, %f195, %f63, %f190; // begin inline asm { cvt.f32.f16 %f64, %rs97;} // end inline asm fma.rn.ftz.f32 %f198, %f195, %f64, %f191; // begin inline asm { cvt.f32.f16 %f65, %rs98;} // end inline asm fma.rn.ftz.f32 %f199, %f195, %f65, %f192; cvt.rn.f32.s16 %f200, %rs10; sub.ftz.f32 %f201, %f200, %f6; mul.ftz.f32 %f202, %f57, %f201; mov.b32 {%rs99, %rs103}, %r69; // begin inline asm { cvt.f32.f16 %f66, %rs99;} // end inline asm fma.rn.ftz.f32 %f203, %f202, %f66, %f196; mov.b32 {%rs100, %rs104}, %r73; // begin inline asm { cvt.f32.f16 %f67, %rs100;} // end inline asm fma.rn.ftz.f32 %f204, %f202, %f67, %f197; mov.b32 {%rs101, %rs105}, %r79; // begin inline asm { cvt.f32.f16 %f68, %rs101;} // end inline asm fma.rn.ftz.f32 %f205, %f202, %f68, %f198; mov.b32 {%rs102, %rs106}, %r84; // begin inline asm { cvt.f32.f16 %f69, %rs102;} // end inline asm fma.rn.ftz.f32 %f206, %f202, %f69, %f199; cvt.rn.f32.s16 %f207, %rs12; sub.ftz.f32 %f208, %f207, %f6; mul.ftz.f32 %f209, %f57, %f208; // begin inline asm { cvt.f32.f16 %f70, %rs103;} // end inline asm fma.rn.ftz.f32 %f210, %f209, %f70, %f203; // begin inline asm { cvt.f32.f16 %f71, %rs104;} // end inline asm fma.rn.ftz.f32 %f211, %f209, %f71, %f204; // begin inline asm { cvt.f32.f16 %f72, %rs105;} // end inline asm fma.rn.ftz.f32 %f212, %f209, %f72, %f205; // begin inline asm { cvt.f32.f16 %f73, %rs106;} // end inline asm fma.rn.ftz.f32 %f213, %f209, %f73, %f206; cvt.rn.f32.s16 %f214, %rs14; sub.ftz.f32 %f215, %f214, %f6; mul.ftz.f32 %f216, %f57, %f215; mov.b32 {%rs107, %rs111}, %r70; // begin inline asm { cvt.f32.f16 %f74, %rs107;} // end inline asm fma.rn.ftz.f32 %f217, %f216, 
%f74, %f210; mov.b32 {%rs108, %rs112}, %r74; // begin inline asm { cvt.f32.f16 %f75, %rs108;} // end inline asm fma.rn.ftz.f32 %f218, %f216, %f75, %f211; mov.b32 {%rs109, %rs113}, %r80; // begin inline asm { cvt.f32.f16 %f76, %rs109;} // end inline asm fma.rn.ftz.f32 %f219, %f216, %f76, %f212; mov.b32 {%rs110, %rs114}, %r85; // begin inline asm { cvt.f32.f16 %f77, %rs110;} // end inline asm fma.rn.ftz.f32 %f220, %f216, %f77, %f213; cvt.rn.f32.s16 %f221, %rs16; sub.ftz.f32 %f222, %f221, %f6; mul.ftz.f32 %f223, %f57, %f222; // begin inline asm { cvt.f32.f16 %f78, %rs111;} // end inline asm fma.rn.ftz.f32 %f224, %f223, %f78, %f217; // begin inline asm { cvt.f32.f16 %f79, %rs112;} // end inline asm fma.rn.ftz.f32 %f225, %f223, %f79, %f218; // begin inline asm { cvt.f32.f16 %f80, %rs113;} // end inline asm fma.rn.ftz.f32 %f226, %f223, %f80, %f219; // begin inline asm { cvt.f32.f16 %f81, %rs114;} // end inline asm fma.rn.ftz.f32 %f227, %f223, %f81, %f220; cvt.rn.f32.s16 %f228, %rs18; sub.ftz.f32 %f229, %f228, %f6; mul.ftz.f32 %f230, %f57, %f229; mov.b32 {%rs115, %rs119}, %r71; // begin inline asm { cvt.f32.f16 %f82, %rs115;} // end inline asm fma.rn.ftz.f32 %f231, %f230, %f82, %f224; mov.b32 {%rs116, %rs120}, %r75; // begin inline asm { cvt.f32.f16 %f83, %rs116;} // end inline asm fma.rn.ftz.f32 %f232, %f230, %f83, %f225; mov.b32 {%rs117, %rs121}, %r81; // begin inline asm { cvt.f32.f16 %f84, %rs117;} // end inline asm fma.rn.ftz.f32 %f233, %f230, %f84, %f226; mov.b32 {%rs118, %rs122}, %r86; // begin inline asm { cvt.f32.f16 %f85, %rs118;} // end inline asm fma.rn.ftz.f32 %f234, %f230, %f85, %f227; cvt.rn.f32.s16 %f235, %rs19; sub.ftz.f32 %f236, %f235, %f6; mul.ftz.f32 %f237, %f57, %f236; // begin inline asm { cvt.f32.f16 %f86, %rs119;} // end inline asm fma.rn.ftz.f32 %f238, %f237, %f86, %f231; // begin inline asm { cvt.f32.f16 %f87, %rs120;} // end inline asm fma.rn.ftz.f32 %f239, %f237, %f87, %f232; // begin inline asm { cvt.f32.f16 %f88, %rs121;} // end inline asm 
fma.rn.ftz.f32 %f240, %f237, %f88, %f233; // begin inline asm { cvt.f32.f16 %f89, %rs122;} // end inline asm fma.rn.ftz.f32 %f241, %f237, %f89, %f234; ld.global.v4.u32 {%r116, %r117, %r118, %r119}, [%rd7+16]; ld.global.v4.u32 {%r124, %r125, %r126, %r127}, [%rd10]; ld.global.v4.u32 {%r132, %r133, %r134, %r135}, [%rd8+16]; ld.global.v4.u32 {%r140, %r141, %r142, %r143}, [%rd9+16]; cvt.rn.f32.s16 %f242, %rs21; sub.ftz.f32 %f243, %f242, %f6; mul.ftz.f32 %f244, %f57, %f243; mov.b32 {%rs123, %rs127}, %r116; // begin inline asm { cvt.f32.f16 %f90, %rs123;} // end inline asm fma.rn.ftz.f32 %f245, %f244, %f90, %f238; mov.b32 {%rs124, %rs128}, %r124; // begin inline asm { cvt.f32.f16 %f91, %rs124;} // end inline asm fma.rn.ftz.f32 %f246, %f244, %f91, %f239; mov.b32 {%rs125, %rs129}, %r132; // begin inline asm { cvt.f32.f16 %f92, %rs125;} // end inline asm fma.rn.ftz.f32 %f247, %f244, %f92, %f240; mov.b32 {%rs126, %rs130}, %r140; // begin inline asm { cvt.f32.f16 %f93, %rs126;} // end inline asm fma.rn.ftz.f32 %f248, %f244, %f93, %f241; cvt.rn.f32.s16 %f249, %rs23; sub.ftz.f32 %f250, %f249, %f6; mul.ftz.f32 %f251, %f57, %f250; // begin inline asm { cvt.f32.f16 %f94, %rs127;} // end inline asm fma.rn.ftz.f32 %f252, %f251, %f94, %f245; // begin inline asm { cvt.f32.f16 %f95, %rs128;} // end inline asm fma.rn.ftz.f32 %f253, %f251, %f95, %f246; // begin inline asm { cvt.f32.f16 %f96, %rs129;} // end inline asm fma.rn.ftz.f32 %f254, %f251, %f96, %f247; // begin inline asm { cvt.f32.f16 %f97, %rs130;} // end inline asm fma.rn.ftz.f32 %f255, %f251, %f97, %f248; cvt.rn.f32.s16 %f256, %rs25; sub.ftz.f32 %f257, %f256, %f6; mul.ftz.f32 %f258, %f57, %f257; mov.b32 {%rs131, %rs135}, %r117; // begin inline asm { cvt.f32.f16 %f98, %rs131;} // end inline asm fma.rn.ftz.f32 %f259, %f258, %f98, %f252; mov.b32 {%rs132, %rs136}, %r125; // begin inline asm { cvt.f32.f16 %f99, %rs132;} // end inline asm fma.rn.ftz.f32 %f260, %f258, %f99, %f253; mov.b32 {%rs133, %rs137}, %r133; // begin inline asm { 
cvt.f32.f16 %f100, %rs133;} // end inline asm fma.rn.ftz.f32 %f261, %f258, %f100, %f254; mov.b32 {%rs134, %rs138}, %r141; // begin inline asm { cvt.f32.f16 %f101, %rs134;} // end inline asm fma.rn.ftz.f32 %f262, %f258, %f101, %f255; cvt.rn.f32.s16 %f263, %rs27; sub.ftz.f32 %f264, %f263, %f6; mul.ftz.f32 %f265, %f57, %f264; // begin inline asm { cvt.f32.f16 %f102, %rs135;} // end inline asm fma.rn.ftz.f32 %f266, %f265, %f102, %f259; // begin inline asm { cvt.f32.f16 %f103, %rs136;} // end inline asm fma.rn.ftz.f32 %f267, %f265, %f103, %f260; // begin inline asm { cvt.f32.f16 %f104, %rs137;} // end inline asm fma.rn.ftz.f32 %f268, %f265, %f104, %f261; // begin inline asm { cvt.f32.f16 %f105, %rs138;} // end inline asm fma.rn.ftz.f32 %f269, %f265, %f105, %f262; cvt.rn.f32.s16 %f270, %rs29; sub.ftz.f32 %f271, %f270, %f6; mul.ftz.f32 %f272, %f57, %f271; mov.b32 {%rs139, %rs143}, %r118; // begin inline asm { cvt.f32.f16 %f106, %rs139;} // end inline asm fma.rn.ftz.f32 %f273, %f272, %f106, %f266; mov.b32 {%rs140, %rs144}, %r126; // begin inline asm { cvt.f32.f16 %f107, %rs140;} // end inline asm fma.rn.ftz.f32 %f274, %f272, %f107, %f267; mov.b32 {%rs141, %rs145}, %r134; // begin inline asm { cvt.f32.f16 %f108, %rs141;} // end inline asm fma.rn.ftz.f32 %f275, %f272, %f108, %f268; mov.b32 {%rs142, %rs146}, %r142; // begin inline asm { cvt.f32.f16 %f109, %rs142;} // end inline asm fma.rn.ftz.f32 %f276, %f272, %f109, %f269; cvt.rn.f32.s16 %f277, %rs31; sub.ftz.f32 %f278, %f277, %f6; mul.ftz.f32 %f279, %f57, %f278; // begin inline asm { cvt.f32.f16 %f110, %rs143;} // end inline asm fma.rn.ftz.f32 %f280, %f279, %f110, %f273; // begin inline asm { cvt.f32.f16 %f111, %rs144;} // end inline asm fma.rn.ftz.f32 %f281, %f279, %f111, %f274; // begin inline asm { cvt.f32.f16 %f112, %rs145;} // end inline asm fma.rn.ftz.f32 %f282, %f279, %f112, %f275; // begin inline asm { cvt.f32.f16 %f113, %rs146;} // end inline asm fma.rn.ftz.f32 %f283, %f279, %f113, %f276; cvt.rn.f32.s16 %f284, 
%rs33; sub.ftz.f32 %f285, %f284, %f6; mul.ftz.f32 %f286, %f57, %f285; mov.b32 {%rs147, %rs151}, %r119; // begin inline asm { cvt.f32.f16 %f114, %rs147;} // end inline asm fma.rn.ftz.f32 %f287, %f286, %f114, %f280; mov.b32 {%rs148, %rs152}, %r127; // begin inline asm { cvt.f32.f16 %f115, %rs148;} // end inline asm fma.rn.ftz.f32 %f288, %f286, %f115, %f281; mov.b32 {%rs149, %rs153}, %r135; // begin inline asm { cvt.f32.f16 %f116, %rs149;} // end inline asm fma.rn.ftz.f32 %f289, %f286, %f116, %f282; mov.b32 {%rs150, %rs154}, %r143; // begin inline asm { cvt.f32.f16 %f117, %rs150;} // end inline asm fma.rn.ftz.f32 %f290, %f286, %f117, %f283; cvt.rn.f32.s16 %f291, %rs34; sub.ftz.f32 %f292, %f291, %f6; mul.ftz.f32 %f293, %f57, %f292; // begin inline asm { cvt.f32.f16 %f118, %rs151;} // end inline asm fma.rn.ftz.f32 %f294, %f293, %f118, %f287; // begin inline asm { cvt.f32.f16 %f119, %rs152;} // end inline asm fma.rn.ftz.f32 %f295, %f293, %f119, %f288; // begin inline asm { cvt.f32.f16 %f120, %rs153;} // end inline asm fma.rn.ftz.f32 %f296, %f293, %f120, %f289; // begin inline asm { cvt.f32.f16 %f121, %rs154;} // end inline asm fma.rn.ftz.f32 %f297, %f293, %f121, %f290; ld.global.v4.u32 {%r148, %r149, %r150, %r151}, [%rd7+32]; ld.global.v4.u32 {%r156, %r157, %r158, %r159}, [%rd10+16]; ld.global.v4.u32 {%r164, %r165, %r166, %r167}, [%rd8+32]; ld.global.v4.u32 {%r172, %r173, %r174, %r175}, [%rd9+32]; cvt.rn.f32.s16 %f298, %rs36; sub.ftz.f32 %f299, %f298, %f6; mul.ftz.f32 %f300, %f57, %f299; mov.b32 {%rs155, %rs159}, %r148; // begin inline asm { cvt.f32.f16 %f122, %rs155;} // end inline asm fma.rn.ftz.f32 %f301, %f300, %f122, %f294; mov.b32 {%rs156, %rs160}, %r156; // begin inline asm { cvt.f32.f16 %f123, %rs156;} // end inline asm fma.rn.ftz.f32 %f302, %f300, %f123, %f295; mov.b32 {%rs157, %rs161}, %r164; // begin inline asm { cvt.f32.f16 %f124, %rs157;} // end inline asm fma.rn.ftz.f32 %f303, %f300, %f124, %f296; mov.b32 {%rs158, %rs162}, %r172; // begin inline asm { 
cvt.f32.f16 %f125, %rs158;} // end inline asm fma.rn.ftz.f32 %f304, %f300, %f125, %f297; cvt.rn.f32.s16 %f305, %rs38; sub.ftz.f32 %f306, %f305, %f6; mul.ftz.f32 %f307, %f57, %f306; // begin inline asm { cvt.f32.f16 %f126, %rs159;} // end inline asm fma.rn.ftz.f32 %f308, %f307, %f126, %f301; // begin inline asm { cvt.f32.f16 %f127, %rs160;} // end inline asm fma.rn.ftz.f32 %f309, %f307, %f127, %f302; // begin inline asm { cvt.f32.f16 %f128, %rs161;} // end inline asm fma.rn.ftz.f32 %f310, %f307, %f128, %f303; // begin inline asm { cvt.f32.f16 %f129, %rs162;} // end inline asm fma.rn.ftz.f32 %f311, %f307, %f129, %f304; cvt.rn.f32.s16 %f312, %rs40; sub.ftz.f32 %f313, %f312, %f6; mul.ftz.f32 %f314, %f57, %f313; mov.b32 {%rs163, %rs167}, %r149; // begin inline asm { cvt.f32.f16 %f130, %rs163;} // end inline asm fma.rn.ftz.f32 %f315, %f314, %f130, %f308; mov.b32 {%rs164, %rs168}, %r157; // begin inline asm { cvt.f32.f16 %f131, %rs164;} // end inline asm fma.rn.ftz.f32 %f316, %f314, %f131, %f309; mov.b32 {%rs165, %rs169}, %r165; // begin inline asm { cvt.f32.f16 %f132, %rs165;} // end inline asm fma.rn.ftz.f32 %f317, %f314, %f132, %f310; mov.b32 {%rs166, %rs170}, %r173; // begin inline asm { cvt.f32.f16 %f133, %rs166;} // end inline asm fma.rn.ftz.f32 %f318, %f314, %f133, %f311; cvt.rn.f32.s16 %f319, %rs42; sub.ftz.f32 %f320, %f319, %f6; mul.ftz.f32 %f321, %f57, %f320; // begin inline asm { cvt.f32.f16 %f134, %rs167;} // end inline asm fma.rn.ftz.f32 %f322, %f321, %f134, %f315; // begin inline asm { cvt.f32.f16 %f135, %rs168;} // end inline asm fma.rn.ftz.f32 %f323, %f321, %f135, %f316; // begin inline asm { cvt.f32.f16 %f136, %rs169;} // end inline asm fma.rn.ftz.f32 %f324, %f321, %f136, %f317; // begin inline asm { cvt.f32.f16 %f137, %rs170;} // end inline asm fma.rn.ftz.f32 %f325, %f321, %f137, %f318; cvt.rn.f32.s16 %f326, %rs44; sub.ftz.f32 %f327, %f326, %f6; mul.ftz.f32 %f328, %f57, %f327; mov.b32 {%rs171, %rs175}, %r150; // begin inline asm { cvt.f32.f16 %f138, 
%rs171;} // end inline asm fma.rn.ftz.f32 %f329, %f328, %f138, %f322; mov.b32 {%rs172, %rs176}, %r158; // begin inline asm { cvt.f32.f16 %f139, %rs172;} // end inline asm fma.rn.ftz.f32 %f330, %f328, %f139, %f323; mov.b32 {%rs173, %rs177}, %r166; // begin inline asm { cvt.f32.f16 %f140, %rs173;} // end inline asm fma.rn.ftz.f32 %f331, %f328, %f140, %f324; mov.b32 {%rs174, %rs178}, %r174; // begin inline asm { cvt.f32.f16 %f141, %rs174;} // end inline asm fma.rn.ftz.f32 %f332, %f328, %f141, %f325; cvt.rn.f32.s16 %f333, %rs46; sub.ftz.f32 %f334, %f333, %f6; mul.ftz.f32 %f335, %f57, %f334; // begin inline asm { cvt.f32.f16 %f142, %rs175;} // end inline asm fma.rn.ftz.f32 %f336, %f335, %f142, %f329; // begin inline asm { cvt.f32.f16 %f143, %rs176;} // end inline asm fma.rn.ftz.f32 %f337, %f335, %f143, %f330; // begin inline asm { cvt.f32.f16 %f144, %rs177;} // end inline asm fma.rn.ftz.f32 %f338, %f335, %f144, %f331; // begin inline asm { cvt.f32.f16 %f145, %rs178;} // end inline asm fma.rn.ftz.f32 %f339, %f335, %f145, %f332; cvt.rn.f32.s16 %f340, %rs48; sub.ftz.f32 %f341, %f340, %f6; mul.ftz.f32 %f342, %f57, %f341; mov.b32 {%rs179, %rs183}, %r151; // begin inline asm { cvt.f32.f16 %f146, %rs179;} // end inline asm fma.rn.ftz.f32 %f343, %f342, %f146, %f336; mov.b32 {%rs180, %rs184}, %r159; // begin inline asm { cvt.f32.f16 %f147, %rs180;} // end inline asm fma.rn.ftz.f32 %f344, %f342, %f147, %f337; mov.b32 {%rs181, %rs185}, %r167; // begin inline asm { cvt.f32.f16 %f148, %rs181;} // end inline asm fma.rn.ftz.f32 %f345, %f342, %f148, %f338; mov.b32 {%rs182, %rs186}, %r175; // begin inline asm { cvt.f32.f16 %f149, %rs182;} // end inline asm fma.rn.ftz.f32 %f346, %f342, %f149, %f339; cvt.rn.f32.s16 %f347, %rs49; sub.ftz.f32 %f348, %f347, %f6; mul.ftz.f32 %f349, %f57, %f348; // begin inline asm { cvt.f32.f16 %f150, %rs183;} // end inline asm fma.rn.ftz.f32 %f350, %f349, %f150, %f343; // begin inline asm { cvt.f32.f16 %f151, %rs184;} // end inline asm fma.rn.ftz.f32 %f351, 
%f349, %f151, %f344; // begin inline asm { cvt.f32.f16 %f152, %rs185;} // end inline asm fma.rn.ftz.f32 %f352, %f349, %f152, %f345; // begin inline asm { cvt.f32.f16 %f153, %rs186;} // end inline asm fma.rn.ftz.f32 %f353, %f349, %f153, %f346; ld.global.v4.u32 {%r180, %r181, %r182, %r183}, [%rd7+48]; ld.global.v4.u32 {%r188, %r189, %r190, %r191}, [%rd10+32]; ld.global.v4.u32 {%r196, %r197, %r198, %r199}, [%rd8+48]; ld.global.v4.u32 {%r204, %r205, %r206, %r207}, [%rd9+48]; cvt.rn.f32.s16 %f354, %rs51; sub.ftz.f32 %f355, %f354, %f6; mul.ftz.f32 %f356, %f57, %f355; mov.b32 {%rs187, %rs191}, %r180; // begin inline asm { cvt.f32.f16 %f154, %rs187;} // end inline asm fma.rn.ftz.f32 %f357, %f356, %f154, %f350; mov.b32 {%rs188, %rs192}, %r188; // begin inline asm { cvt.f32.f16 %f155, %rs188;} // end inline asm fma.rn.ftz.f32 %f358, %f356, %f155, %f351; mov.b32 {%rs189, %rs193}, %r196; // begin inline asm { cvt.f32.f16 %f156, %rs189;} // end inline asm fma.rn.ftz.f32 %f359, %f356, %f156, %f352; mov.b32 {%rs190, %rs194}, %r204; // begin inline asm { cvt.f32.f16 %f157, %rs190;} // end inline asm fma.rn.ftz.f32 %f360, %f356, %f157, %f353; cvt.rn.f32.s16 %f361, %rs53; sub.ftz.f32 %f362, %f361, %f6; mul.ftz.f32 %f363, %f57, %f362; // begin inline asm { cvt.f32.f16 %f158, %rs191;} // end inline asm fma.rn.ftz.f32 %f364, %f363, %f158, %f357; // begin inline asm { cvt.f32.f16 %f159, %rs192;} // end inline asm fma.rn.ftz.f32 %f365, %f363, %f159, %f358; // begin inline asm { cvt.f32.f16 %f160, %rs193;} // end inline asm fma.rn.ftz.f32 %f366, %f363, %f160, %f359; // begin inline asm { cvt.f32.f16 %f161, %rs194;} // end inline asm fma.rn.ftz.f32 %f367, %f363, %f161, %f360; cvt.rn.f32.s16 %f368, %rs55; sub.ftz.f32 %f369, %f368, %f6; mul.ftz.f32 %f370, %f57, %f369; mov.b32 {%rs195, %rs199}, %r181; // begin inline asm { cvt.f32.f16 %f162, %rs195;} // end inline asm fma.rn.ftz.f32 %f371, %f370, %f162, %f364; mov.b32 {%rs196, %rs200}, %r189; // begin inline asm { cvt.f32.f16 %f163, %rs196;} 
// end inline asm fma.rn.ftz.f32 %f372, %f370, %f163, %f365; mov.b32 {%rs197, %rs201}, %r197; // begin inline asm { cvt.f32.f16 %f164, %rs197;} // end inline asm fma.rn.ftz.f32 %f373, %f370, %f164, %f366; mov.b32 {%rs198, %rs202}, %r205; // begin inline asm { cvt.f32.f16 %f165, %rs198;} // end inline asm fma.rn.ftz.f32 %f374, %f370, %f165, %f367; cvt.rn.f32.s16 %f375, %rs57; sub.ftz.f32 %f376, %f375, %f6; mul.ftz.f32 %f377, %f57, %f376; // begin inline asm { cvt.f32.f16 %f166, %rs199;} // end inline asm fma.rn.ftz.f32 %f378, %f377, %f166, %f371; // begin inline asm { cvt.f32.f16 %f167, %rs200;} // end inline asm fma.rn.ftz.f32 %f379, %f377, %f167, %f372; // begin inline asm { cvt.f32.f16 %f168, %rs201;} // end inline asm fma.rn.ftz.f32 %f380, %f377, %f168, %f373; // begin inline asm { cvt.f32.f16 %f169, %rs202;} // end inline asm fma.rn.ftz.f32 %f381, %f377, %f169, %f374; cvt.rn.f32.s16 %f382, %rs59; sub.ftz.f32 %f383, %f382, %f6; mul.ftz.f32 %f384, %f57, %f383; mov.b32 {%rs203, %rs207}, %r182; // begin inline asm { cvt.f32.f16 %f170, %rs203;} // end inline asm fma.rn.ftz.f32 %f385, %f384, %f170, %f378; mov.b32 {%rs204, %rs208}, %r190; // begin inline asm { cvt.f32.f16 %f171, %rs204;} // end inline asm fma.rn.ftz.f32 %f386, %f384, %f171, %f379; mov.b32 {%rs205, %rs209}, %r198; // begin inline asm { cvt.f32.f16 %f172, %rs205;} // end inline asm fma.rn.ftz.f32 %f387, %f384, %f172, %f380; mov.b32 {%rs206, %rs210}, %r206; // begin inline asm { cvt.f32.f16 %f173, %rs206;} // end inline asm fma.rn.ftz.f32 %f388, %f384, %f173, %f381; cvt.rn.f32.s16 %f389, %rs61; sub.ftz.f32 %f390, %f389, %f6; mul.ftz.f32 %f391, %f57, %f390; // begin inline asm { cvt.f32.f16 %f174, %rs207;} // end inline asm fma.rn.ftz.f32 %f392, %f391, %f174, %f385; // begin inline asm { cvt.f32.f16 %f175, %rs208;} // end inline asm fma.rn.ftz.f32 %f393, %f391, %f175, %f386; // begin inline asm { cvt.f32.f16 %f176, %rs209;} // end inline asm fma.rn.ftz.f32 %f394, %f391, %f176, %f387; // begin inline asm { 
cvt.f32.f16 %f177, %rs210;} // end inline asm fma.rn.ftz.f32 %f395, %f391, %f177, %f388; cvt.rn.f32.s16 %f396, %rs63; sub.ftz.f32 %f397, %f396, %f6; mul.ftz.f32 %f398, %f57, %f397; mov.b32 {%rs211, %rs215}, %r183; // begin inline asm { cvt.f32.f16 %f178, %rs211;} // end inline asm fma.rn.ftz.f32 %f399, %f398, %f178, %f392; mov.b32 {%rs212, %rs216}, %r191; // begin inline asm { cvt.f32.f16 %f179, %rs212;} // end inline asm fma.rn.ftz.f32 %f400, %f398, %f179, %f393; mov.b32 {%rs213, %rs217}, %r199; // begin inline asm { cvt.f32.f16 %f180, %rs213;} // end inline asm fma.rn.ftz.f32 %f401, %f398, %f180, %f394; mov.b32 {%rs214, %rs218}, %r207; // begin inline asm { cvt.f32.f16 %f181, %rs214;} // end inline asm fma.rn.ftz.f32 %f402, %f398, %f181, %f395; cvt.rn.f32.s16 %f403, %rs64; sub.ftz.f32 %f404, %f403, %f6; mul.ftz.f32 %f405, %f57, %f404; // begin inline asm { cvt.f32.f16 %f182, %rs215;} // end inline asm fma.rn.ftz.f32 %f849, %f405, %f182, %f399; // begin inline asm { cvt.f32.f16 %f183, %rs216;} // end inline asm fma.rn.ftz.f32 %f848, %f405, %f183, %f400; // begin inline asm { cvt.f32.f16 %f184, %rs217;} // end inline asm fma.rn.ftz.f32 %f847, %f405, %f184, %f401; // begin inline asm { cvt.f32.f16 %f185, %rs218;} // end inline asm fma.rn.ftz.f32 %f846, %f405, %f185, %f402; bra.uni $L__BB0_8; $L__BB0_7: shl.b16 %rs347, %rs5, 4; cvt.s16.s8 %rs348, %rs347; shr.s16 %rs349, %rs348, 7; and.b16 %rs350, %rs349, -16; or.b16 %rs351, %rs350, %rs6; cvt.rn.f32.s16 %f534, %rs351; sub.ftz.f32 %f535, %f534, %f6; mul.ftz.f32 %f536, %f57, %f535; mov.b32 {%rs219, %rs223}, %r68; // begin inline asm { cvt.f32.f16 %f406, %rs219;} // end inline asm fma.rn.ftz.f32 %f537, %f536, %f406, %f849; mov.b32 {%rs220, %rs224}, %r72; // begin inline asm { cvt.f32.f16 %f407, %rs220;} // end inline asm fma.rn.ftz.f32 %f538, %f536, %f407, %f848; mov.b32 {%rs221, %rs225}, %r78; // begin inline asm { cvt.f32.f16 %f408, %rs221;} // end inline asm fma.rn.ftz.f32 %f539, %f536, %f408, %f847; mov.b32 {%rs222, 
%rs226}, %r83; // begin inline asm { cvt.f32.f16 %f409, %rs222;} // end inline asm fma.rn.ftz.f32 %f540, %f536, %f409, %f846; shl.b16 %rs352, %rs7, 4; cvt.s16.s8 %rs353, %rs352; shr.s16 %rs354, %rs353, 7; and.b16 %rs355, %rs354, -16; or.b16 %rs356, %rs355, %rs8; cvt.rn.f32.s16 %f541, %rs356; sub.ftz.f32 %f542, %f541, %f6; mul.ftz.f32 %f543, %f57, %f542; // begin inline asm { cvt.f32.f16 %f410, %rs223;} // end inline asm fma.rn.ftz.f32 %f544, %f543, %f410, %f537; // begin inline asm { cvt.f32.f16 %f411, %rs224;} // end inline asm fma.rn.ftz.f32 %f545, %f543, %f411, %f538; // begin inline asm { cvt.f32.f16 %f412, %rs225;} // end inline asm fma.rn.ftz.f32 %f546, %f543, %f412, %f539; // begin inline asm { cvt.f32.f16 %f413, %rs226;} // end inline asm fma.rn.ftz.f32 %f547, %f543, %f413, %f540; shl.b16 %rs357, %rs9, 4; cvt.s16.s8 %rs358, %rs357; shr.s16 %rs359, %rs358, 7; and.b16 %rs360, %rs359, -16; or.b16 %rs361, %rs360, %rs10; cvt.rn.f32.s16 %f548, %rs361; sub.ftz.f32 %f549, %f548, %f6; mul.ftz.f32 %f550, %f57, %f549; mov.b32 {%rs227, %rs231}, %r69; // begin inline asm { cvt.f32.f16 %f414, %rs227;} // end inline asm fma.rn.ftz.f32 %f551, %f550, %f414, %f544; mov.b32 {%rs228, %rs232}, %r73; // begin inline asm { cvt.f32.f16 %f415, %rs228;} // end inline asm fma.rn.ftz.f32 %f552, %f550, %f415, %f545; mov.b32 {%rs229, %rs233}, %r79; // begin inline asm { cvt.f32.f16 %f416, %rs229;} // end inline asm fma.rn.ftz.f32 %f553, %f550, %f416, %f546; mov.b32 {%rs230, %rs234}, %r84; // begin inline asm { cvt.f32.f16 %f417, %rs230;} // end inline asm fma.rn.ftz.f32 %f554, %f550, %f417, %f547; shl.b16 %rs362, %rs11, 4; cvt.s16.s8 %rs363, %rs362; shr.s16 %rs364, %rs363, 7; and.b16 %rs365, %rs364, -16; or.b16 %rs366, %rs365, %rs12; cvt.rn.f32.s16 %f555, %rs366; sub.ftz.f32 %f556, %f555, %f6; mul.ftz.f32 %f557, %f57, %f556; // begin inline asm { cvt.f32.f16 %f418, %rs231;} // end inline asm fma.rn.ftz.f32 %f558, %f557, %f418, %f551; // begin inline asm { cvt.f32.f16 %f419, %rs232;} // 
end inline asm fma.rn.ftz.f32 %f559, %f557, %f419, %f552; // begin inline asm { cvt.f32.f16 %f420, %rs233;} // end inline asm fma.rn.ftz.f32 %f560, %f557, %f420, %f553; // begin inline asm { cvt.f32.f16 %f421, %rs234;} // end inline asm fma.rn.ftz.f32 %f561, %f557, %f421, %f554; shl.b16 %rs367, %rs13, 4; cvt.s16.s8 %rs368, %rs367; shr.s16 %rs369, %rs368, 7; and.b16 %rs370, %rs369, -16; or.b16 %rs371, %rs370, %rs14; cvt.rn.f32.s16 %f562, %rs371; sub.ftz.f32 %f563, %f562, %f6; mul.ftz.f32 %f564, %f57, %f563; mov.b32 {%rs235, %rs239}, %r70; // begin inline asm { cvt.f32.f16 %f422, %rs235;} // end inline asm fma.rn.ftz.f32 %f565, %f564, %f422, %f558; mov.b32 {%rs236, %rs240}, %r74; // begin inline asm { cvt.f32.f16 %f423, %rs236;} // end inline asm fma.rn.ftz.f32 %f566, %f564, %f423, %f559; mov.b32 {%rs237, %rs241}, %r80; // begin inline asm { cvt.f32.f16 %f424, %rs237;} // end inline asm fma.rn.ftz.f32 %f567, %f564, %f424, %f560; mov.b32 {%rs238, %rs242}, %r85; // begin inline asm { cvt.f32.f16 %f425, %rs238;} // end inline asm fma.rn.ftz.f32 %f568, %f564, %f425, %f561; shl.b16 %rs372, %rs15, 4; cvt.s16.s8 %rs373, %rs372; shr.s16 %rs374, %rs373, 7; and.b16 %rs375, %rs374, -16; or.b16 %rs376, %rs375, %rs16; cvt.rn.f32.s16 %f569, %rs376; sub.ftz.f32 %f570, %f569, %f6; mul.ftz.f32 %f571, %f57, %f570; // begin inline asm { cvt.f32.f16 %f426, %rs239;} // end inline asm fma.rn.ftz.f32 %f572, %f571, %f426, %f565; // begin inline asm { cvt.f32.f16 %f427, %rs240;} // end inline asm fma.rn.ftz.f32 %f573, %f571, %f427, %f566; // begin inline asm { cvt.f32.f16 %f428, %rs241;} // end inline asm fma.rn.ftz.f32 %f574, %f571, %f428, %f567; // begin inline asm { cvt.f32.f16 %f429, %rs242;} // end inline asm fma.rn.ftz.f32 %f575, %f571, %f429, %f568; shl.b16 %rs377, %rs17, 4; cvt.s16.s8 %rs378, %rs377; shr.s16 %rs379, %rs378, 7; and.b16 %rs380, %rs379, -16; or.b16 %rs381, %rs380, %rs18; cvt.rn.f32.s16 %f576, %rs381; sub.ftz.f32 %f577, %f576, %f6; mul.ftz.f32 %f578, %f57, %f577; mov.b32 
{%rs243, %rs247}, %r71; // begin inline asm { cvt.f32.f16 %f430, %rs243;} // end inline asm fma.rn.ftz.f32 %f579, %f578, %f430, %f572; mov.b32 {%rs244, %rs248}, %r75; // begin inline asm { cvt.f32.f16 %f431, %rs244;} // end inline asm fma.rn.ftz.f32 %f580, %f578, %f431, %f573; mov.b32 {%rs245, %rs249}, %r81; // begin inline asm { cvt.f32.f16 %f432, %rs245;} // end inline asm fma.rn.ftz.f32 %f581, %f578, %f432, %f574; mov.b32 {%rs246, %rs250}, %r86; // begin inline asm { cvt.f32.f16 %f433, %rs246;} // end inline asm fma.rn.ftz.f32 %f582, %f578, %f433, %f575; shl.b16 %rs382, %rs19, 4; cvt.s16.s8 %rs383, %rs382; shr.s16 %rs384, %rs383, 7; and.b16 %rs385, %rs384, -16; or.b16 %rs386, %rs385, %rs19; cvt.rn.f32.s16 %f583, %rs386; sub.ftz.f32 %f584, %f583, %f6; mul.ftz.f32 %f585, %f57, %f584; // begin inline asm { cvt.f32.f16 %f434, %rs247;} // end inline asm fma.rn.ftz.f32 %f586, %f585, %f434, %f579; // begin inline asm { cvt.f32.f16 %f435, %rs248;} // end inline asm fma.rn.ftz.f32 %f587, %f585, %f435, %f580; // begin inline asm { cvt.f32.f16 %f436, %rs249;} // end inline asm fma.rn.ftz.f32 %f588, %f585, %f436, %f581; // begin inline asm { cvt.f32.f16 %f437, %rs250;} // end inline asm fma.rn.ftz.f32 %f589, %f585, %f437, %f582; ld.global.v4.u32 {%r212, %r213, %r214, %r215}, [%rd7+16]; ld.global.v4.u32 {%r220, %r221, %r222, %r223}, [%rd10]; ld.global.v4.u32 {%r228, %r229, %r230, %r231}, [%rd8+16]; ld.global.v4.u32 {%r236, %r237, %r238, %r239}, [%rd9+16]; shl.b16 %rs387, %rs20, 4; cvt.s16.s8 %rs388, %rs387; shr.s16 %rs389, %rs388, 7; and.b16 %rs390, %rs389, -16; or.b16 %rs391, %rs390, %rs21; cvt.rn.f32.s16 %f590, %rs391; sub.ftz.f32 %f591, %f590, %f6; mul.ftz.f32 %f592, %f57, %f591; mov.b32 {%rs251, %rs255}, %r212; // begin inline asm { cvt.f32.f16 %f438, %rs251;} // end inline asm fma.rn.ftz.f32 %f593, %f592, %f438, %f586; mov.b32 {%rs252, %rs256}, %r220; // begin inline asm { cvt.f32.f16 %f439, %rs252;} // end inline asm fma.rn.ftz.f32 %f594, %f592, %f439, %f587; mov.b32 
{%rs253, %rs257}, %r228; // begin inline asm { cvt.f32.f16 %f440, %rs253;} // end inline asm fma.rn.ftz.f32 %f595, %f592, %f440, %f588; mov.b32 {%rs254, %rs258}, %r236; // begin inline asm { cvt.f32.f16 %f441, %rs254;} // end inline asm fma.rn.ftz.f32 %f596, %f592, %f441, %f589; shl.b16 %rs392, %rs22, 4; cvt.s16.s8 %rs393, %rs392; shr.s16 %rs394, %rs393, 7; and.b16 %rs395, %rs394, -16; or.b16 %rs396, %rs395, %rs23; cvt.rn.f32.s16 %f597, %rs396; sub.ftz.f32 %f598, %f597, %f6; mul.ftz.f32 %f599, %f57, %f598; // begin inline asm { cvt.f32.f16 %f442, %rs255;} // end inline asm fma.rn.ftz.f32 %f600, %f599, %f442, %f593; // begin inline asm { cvt.f32.f16 %f443, %rs256;} // end inline asm fma.rn.ftz.f32 %f601, %f599, %f443, %f594; // begin inline asm { cvt.f32.f16 %f444, %rs257;} // end inline asm fma.rn.ftz.f32 %f602, %f599, %f444, %f595; // begin inline asm { cvt.f32.f16 %f445, %rs258;} // end inline asm fma.rn.ftz.f32 %f603, %f599, %f445, %f596; shl.b16 %rs397, %rs24, 4; cvt.s16.s8 %rs398, %rs397; shr.s16 %rs399, %rs398, 7; and.b16 %rs400, %rs399, -16; or.b16 %rs401, %rs400, %rs25; cvt.rn.f32.s16 %f604, %rs401; sub.ftz.f32 %f605, %f604, %f6; mul.ftz.f32 %f606, %f57, %f605; mov.b32 {%rs259, %rs263}, %r213; // begin inline asm { cvt.f32.f16 %f446, %rs259;} // end inline asm fma.rn.ftz.f32 %f607, %f606, %f446, %f600; mov.b32 {%rs260, %rs264}, %r221; // begin inline asm { cvt.f32.f16 %f447, %rs260;} // end inline asm fma.rn.ftz.f32 %f608, %f606, %f447, %f601; mov.b32 {%rs261, %rs265}, %r229; // begin inline asm { cvt.f32.f16 %f448, %rs261;} // end inline asm fma.rn.ftz.f32 %f609, %f606, %f448, %f602; mov.b32 {%rs262, %rs266}, %r237; // begin inline asm { cvt.f32.f16 %f449, %rs262;} // end inline asm fma.rn.ftz.f32 %f610, %f606, %f449, %f603; shl.b16 %rs402, %rs26, 4; cvt.s16.s8 %rs403, %rs402; shr.s16 %rs404, %rs403, 7; and.b16 %rs405, %rs404, -16; or.b16 %rs406, %rs405, %rs27; cvt.rn.f32.s16 %f611, %rs406; sub.ftz.f32 %f612, %f611, %f6; mul.ftz.f32 %f613, %f57, %f612; // 
begin inline asm { cvt.f32.f16 %f450, %rs263;} // end inline asm fma.rn.ftz.f32 %f614, %f613, %f450, %f607; // begin inline asm { cvt.f32.f16 %f451, %rs264;} // end inline asm fma.rn.ftz.f32 %f615, %f613, %f451, %f608; // begin inline asm { cvt.f32.f16 %f452, %rs265;} // end inline asm fma.rn.ftz.f32 %f616, %f613, %f452, %f609; // begin inline asm { cvt.f32.f16 %f453, %rs266;} // end inline asm fma.rn.ftz.f32 %f617, %f613, %f453, %f610; shl.b16 %rs407, %rs28, 4; cvt.s16.s8 %rs408, %rs407; shr.s16 %rs409, %rs408, 7; and.b16 %rs410, %rs409, -16; or.b16 %rs411, %rs410, %rs29; cvt.rn.f32.s16 %f618, %rs411; sub.ftz.f32 %f619, %f618, %f6; mul.ftz.f32 %f620, %f57, %f619; mov.b32 {%rs267, %rs271}, %r214; // begin inline asm { cvt.f32.f16 %f454, %rs267;} // end inline asm fma.rn.ftz.f32 %f621, %f620, %f454, %f614; mov.b32 {%rs268, %rs272}, %r222; // begin inline asm { cvt.f32.f16 %f455, %rs268;} // end inline asm fma.rn.ftz.f32 %f622, %f620, %f455, %f615; mov.b32 {%rs269, %rs273}, %r230; // begin inline asm { cvt.f32.f16 %f456, %rs269;} // end inline asm fma.rn.ftz.f32 %f623, %f620, %f456, %f616; mov.b32 {%rs270, %rs274}, %r238; // begin inline asm { cvt.f32.f16 %f457, %rs270;} // end inline asm fma.rn.ftz.f32 %f624, %f620, %f457, %f617; shl.b16 %rs412, %rs30, 4; cvt.s16.s8 %rs413, %rs412; shr.s16 %rs414, %rs413, 7; and.b16 %rs415, %rs414, -16; or.b16 %rs416, %rs415, %rs31; cvt.rn.f32.s16 %f625, %rs416; sub.ftz.f32 %f626, %f625, %f6; mul.ftz.f32 %f627, %f57, %f626; // begin inline asm { cvt.f32.f16 %f458, %rs271;} // end inline asm fma.rn.ftz.f32 %f628, %f627, %f458, %f621; // begin inline asm { cvt.f32.f16 %f459, %rs272;} // end inline asm fma.rn.ftz.f32 %f629, %f627, %f459, %f622; // begin inline asm { cvt.f32.f16 %f460, %rs273;} // end inline asm fma.rn.ftz.f32 %f630, %f627, %f460, %f623; // begin inline asm { cvt.f32.f16 %f461, %rs274;} // end inline asm fma.rn.ftz.f32 %f631, %f627, %f461, %f624; shl.b16 %rs417, %rs32, 4; cvt.s16.s8 %rs418, %rs417; shr.s16 %rs419, 
%rs418, 7; and.b16 %rs420, %rs419, -16; or.b16 %rs421, %rs420, %rs33; cvt.rn.f32.s16 %f632, %rs421; sub.ftz.f32 %f633, %f632, %f6; mul.ftz.f32 %f634, %f57, %f633; mov.b32 {%rs275, %rs279}, %r215; // begin inline asm { cvt.f32.f16 %f462, %rs275;} // end inline asm fma.rn.ftz.f32 %f635, %f634, %f462, %f628; mov.b32 {%rs276, %rs280}, %r223; // begin inline asm { cvt.f32.f16 %f463, %rs276;} // end inline asm fma.rn.ftz.f32 %f636, %f634, %f463, %f629; mov.b32 {%rs277, %rs281}, %r231; // begin inline asm { cvt.f32.f16 %f464, %rs277;} // end inline asm fma.rn.ftz.f32 %f637, %f634, %f464, %f630; mov.b32 {%rs278, %rs282}, %r239; // begin inline asm { cvt.f32.f16 %f465, %rs278;} // end inline asm fma.rn.ftz.f32 %f638, %f634, %f465, %f631; shl.b16 %rs422, %rs34, 4; cvt.s16.s8 %rs423, %rs422; shr.s16 %rs424, %rs423, 7; and.b16 %rs425, %rs424, -16; or.b16 %rs426, %rs425, %rs34; cvt.rn.f32.s16 %f639, %rs426; sub.ftz.f32 %f640, %f639, %f6; mul.ftz.f32 %f641, %f57, %f640; // begin inline asm { cvt.f32.f16 %f466, %rs279;} // end inline asm fma.rn.ftz.f32 %f642, %f641, %f466, %f635; // begin inline asm { cvt.f32.f16 %f467, %rs280;} // end inline asm fma.rn.ftz.f32 %f643, %f641, %f467, %f636; // begin inline asm { cvt.f32.f16 %f468, %rs281;} // end inline asm fma.rn.ftz.f32 %f644, %f641, %f468, %f637; // begin inline asm { cvt.f32.f16 %f469, %rs282;} // end inline asm fma.rn.ftz.f32 %f645, %f641, %f469, %f638; ld.global.v4.u32 {%r244, %r245, %r246, %r247}, [%rd7+32]; ld.global.v4.u32 {%r252, %r253, %r254, %r255}, [%rd10+16]; ld.global.v4.u32 {%r260, %r261, %r262, %r263}, [%rd8+32]; ld.global.v4.u32 {%r268, %r269, %r270, %r271}, [%rd9+32]; shl.b16 %rs427, %rs35, 4; cvt.s16.s8 %rs428, %rs427; shr.s16 %rs429, %rs428, 7; and.b16 %rs430, %rs429, -16; or.b16 %rs431, %rs430, %rs36; cvt.rn.f32.s16 %f646, %rs431; sub.ftz.f32 %f647, %f646, %f6; mul.ftz.f32 %f648, %f57, %f647; mov.b32 {%rs283, %rs287}, %r244; // begin inline asm { cvt.f32.f16 %f470, %rs283;} // end inline asm fma.rn.ftz.f32 
%f649, %f648, %f470, %f642; mov.b32 {%rs284, %rs288}, %r252; // begin inline asm { cvt.f32.f16 %f471, %rs284;} // end inline asm fma.rn.ftz.f32 %f650, %f648, %f471, %f643; mov.b32 {%rs285, %rs289}, %r260; // begin inline asm { cvt.f32.f16 %f472, %rs285;} // end inline asm fma.rn.ftz.f32 %f651, %f648, %f472, %f644; mov.b32 {%rs286, %rs290}, %r268; // begin inline asm { cvt.f32.f16 %f473, %rs286;} // end inline asm fma.rn.ftz.f32 %f652, %f648, %f473, %f645; shl.b16 %rs432, %rs37, 4; cvt.s16.s8 %rs433, %rs432; shr.s16 %rs434, %rs433, 7; and.b16 %rs435, %rs434, -16; or.b16 %rs436, %rs435, %rs38; cvt.rn.f32.s16 %f653, %rs436; sub.ftz.f32 %f654, %f653, %f6; mul.ftz.f32 %f655, %f57, %f654; // begin inline asm { cvt.f32.f16 %f474, %rs287;} // end inline asm fma.rn.ftz.f32 %f656, %f655, %f474, %f649; // begin inline asm { cvt.f32.f16 %f475, %rs288;} // end inline asm fma.rn.ftz.f32 %f657, %f655, %f475, %f650; // begin inline asm { cvt.f32.f16 %f476, %rs289;} // end inline asm fma.rn.ftz.f32 %f658, %f655, %f476, %f651; // begin inline asm { cvt.f32.f16 %f477, %rs290;} // end inline asm fma.rn.ftz.f32 %f659, %f655, %f477, %f652; shl.b16 %rs437, %rs39, 4; cvt.s16.s8 %rs438, %rs437; shr.s16 %rs439, %rs438, 7; and.b16 %rs440, %rs439, -16; or.b16 %rs441, %rs440, %rs40; cvt.rn.f32.s16 %f660, %rs441; sub.ftz.f32 %f661, %f660, %f6; mul.ftz.f32 %f662, %f57, %f661; mov.b32 {%rs291, %rs295}, %r245; // begin inline asm { cvt.f32.f16 %f478, %rs291;} // end inline asm fma.rn.ftz.f32 %f663, %f662, %f478, %f656; mov.b32 {%rs292, %rs296}, %r253; // begin inline asm { cvt.f32.f16 %f479, %rs292;} // end inline asm fma.rn.ftz.f32 %f664, %f662, %f479, %f657; mov.b32 {%rs293, %rs297}, %r261; // begin inline asm { cvt.f32.f16 %f480, %rs293;} // end inline asm fma.rn.ftz.f32 %f665, %f662, %f480, %f658; mov.b32 {%rs294, %rs298}, %r269; // begin inline asm { cvt.f32.f16 %f481, %rs294;} // end inline asm fma.rn.ftz.f32 %f666, %f662, %f481, %f659; shl.b16 %rs442, %rs41, 4; cvt.s16.s8 %rs443, %rs442; 
shr.s16 %rs444, %rs443, 7; and.b16 %rs445, %rs444, -16; or.b16 %rs446, %rs445, %rs42; cvt.rn.f32.s16 %f667, %rs446; sub.ftz.f32 %f668, %f667, %f6; mul.ftz.f32 %f669, %f57, %f668; // begin inline asm { cvt.f32.f16 %f482, %rs295;} // end inline asm fma.rn.ftz.f32 %f670, %f669, %f482, %f663; // begin inline asm { cvt.f32.f16 %f483, %rs296;} // end inline asm fma.rn.ftz.f32 %f671, %f669, %f483, %f664; // begin inline asm { cvt.f32.f16 %f484, %rs297;} // end inline asm fma.rn.ftz.f32 %f672, %f669, %f484, %f665; // begin inline asm { cvt.f32.f16 %f485, %rs298;} // end inline asm fma.rn.ftz.f32 %f673, %f669, %f485, %f666; shl.b16 %rs447, %rs43, 4; cvt.s16.s8 %rs448, %rs447; shr.s16 %rs449, %rs448, 7; and.b16 %rs450, %rs449, -16; or.b16 %rs451, %rs450, %rs44; cvt.rn.f32.s16 %f674, %rs451; sub.ftz.f32 %f675, %f674, %f6; mul.ftz.f32 %f676, %f57, %f675; mov.b32 {%rs299, %rs303}, %r246; // begin inline asm { cvt.f32.f16 %f486, %rs299;} // end inline asm fma.rn.ftz.f32 %f677, %f676, %f486, %f670; mov.b32 {%rs300, %rs304}, %r254; // begin inline asm { cvt.f32.f16 %f487, %rs300;} // end inline asm fma.rn.ftz.f32 %f678, %f676, %f487, %f671; mov.b32 {%rs301, %rs305}, %r262; // begin inline asm { cvt.f32.f16 %f488, %rs301;} // end inline asm fma.rn.ftz.f32 %f679, %f676, %f488, %f672; mov.b32 {%rs302, %rs306}, %r270; // begin inline asm { cvt.f32.f16 %f489, %rs302;} // end inline asm fma.rn.ftz.f32 %f680, %f676, %f489, %f673; shl.b16 %rs452, %rs45, 4; cvt.s16.s8 %rs453, %rs452; shr.s16 %rs454, %rs453, 7; and.b16 %rs455, %rs454, -16; or.b16 %rs456, %rs455, %rs46; cvt.rn.f32.s16 %f681, %rs456; sub.ftz.f32 %f682, %f681, %f6; mul.ftz.f32 %f683, %f57, %f682; // begin inline asm { cvt.f32.f16 %f490, %rs303;} // end inline asm fma.rn.ftz.f32 %f684, %f683, %f490, %f677; // begin inline asm { cvt.f32.f16 %f491, %rs304;} // end inline asm fma.rn.ftz.f32 %f685, %f683, %f491, %f678; // begin inline asm { cvt.f32.f16 %f492, %rs305;} // end inline asm fma.rn.ftz.f32 %f686, %f683, %f492, %f679; // 
begin inline asm { cvt.f32.f16 %f493, %rs306;} // end inline asm fma.rn.ftz.f32 %f687, %f683, %f493, %f680; shl.b16 %rs457, %rs47, 4; cvt.s16.s8 %rs458, %rs457; shr.s16 %rs459, %rs458, 7; and.b16 %rs460, %rs459, -16; or.b16 %rs461, %rs460, %rs48; cvt.rn.f32.s16 %f688, %rs461; sub.ftz.f32 %f689, %f688, %f6; mul.ftz.f32 %f690, %f57, %f689; mov.b32 {%rs307, %rs311}, %r247; // begin inline asm { cvt.f32.f16 %f494, %rs307;} // end inline asm fma.rn.ftz.f32 %f691, %f690, %f494, %f684; mov.b32 {%rs308, %rs312}, %r255; // begin inline asm { cvt.f32.f16 %f495, %rs308;} // end inline asm fma.rn.ftz.f32 %f692, %f690, %f495, %f685; mov.b32 {%rs309, %rs313}, %r263; // begin inline asm { cvt.f32.f16 %f496, %rs309;} // end inline asm fma.rn.ftz.f32 %f693, %f690, %f496, %f686; mov.b32 {%rs310, %rs314}, %r271; // begin inline asm { cvt.f32.f16 %f497, %rs310;} // end inline asm fma.rn.ftz.f32 %f694, %f690, %f497, %f687; shl.b16 %rs462, %rs49, 4; cvt.s16.s8 %rs463, %rs462; shr.s16 %rs464, %rs463, 7; and.b16 %rs465, %rs464, -16; or.b16 %rs466, %rs465, %rs49; cvt.rn.f32.s16 %f695, %rs466; sub.ftz.f32 %f696, %f695, %f6; mul.ftz.f32 %f697, %f57, %f696; // begin inline asm { cvt.f32.f16 %f498, %rs311;} // end inline asm fma.rn.ftz.f32 %f698, %f697, %f498, %f691; // begin inline asm { cvt.f32.f16 %f499, %rs312;} // end inline asm fma.rn.ftz.f32 %f699, %f697, %f499, %f692; // begin inline asm { cvt.f32.f16 %f500, %rs313;} // end inline asm fma.rn.ftz.f32 %f700, %f697, %f500, %f693; // begin inline asm { cvt.f32.f16 %f501, %rs314;} // end inline asm fma.rn.ftz.f32 %f701, %f697, %f501, %f694; ld.global.v4.u32 {%r276, %r277, %r278, %r279}, [%rd7+48]; ld.global.v4.u32 {%r284, %r285, %r286, %r287}, [%rd10+32]; ld.global.v4.u32 {%r292, %r293, %r294, %r295}, [%rd8+48]; ld.global.v4.u32 {%r300, %r301, %r302, %r303}, [%rd9+48]; shl.b16 %rs467, %rs50, 4; cvt.s16.s8 %rs468, %rs467; shr.s16 %rs469, %rs468, 7; and.b16 %rs470, %rs469, -16; or.b16 %rs471, %rs470, %rs51; cvt.rn.f32.s16 %f702, %rs471; 
sub.ftz.f32 %f703, %f702, %f6; mul.ftz.f32 %f704, %f57, %f703; mov.b32 {%rs315, %rs319}, %r276; // begin inline asm { cvt.f32.f16 %f502, %rs315;} // end inline asm fma.rn.ftz.f32 %f705, %f704, %f502, %f698; mov.b32 {%rs316, %rs320}, %r284; // begin inline asm { cvt.f32.f16 %f503, %rs316;} // end inline asm fma.rn.ftz.f32 %f706, %f704, %f503, %f699; mov.b32 {%rs317, %rs321}, %r292; // begin inline asm { cvt.f32.f16 %f504, %rs317;} // end inline asm fma.rn.ftz.f32 %f707, %f704, %f504, %f700; mov.b32 {%rs318, %rs322}, %r300; // begin inline asm { cvt.f32.f16 %f505, %rs318;} // end inline asm fma.rn.ftz.f32 %f708, %f704, %f505, %f701; shl.b16 %rs472, %rs52, 4; cvt.s16.s8 %rs473, %rs472; shr.s16 %rs474, %rs473, 7; and.b16 %rs475, %rs474, -16; or.b16 %rs476, %rs475, %rs53; cvt.rn.f32.s16 %f709, %rs476; sub.ftz.f32 %f710, %f709, %f6; mul.ftz.f32 %f711, %f57, %f710; // begin inline asm { cvt.f32.f16 %f506, %rs319;} // end inline asm fma.rn.ftz.f32 %f712, %f711, %f506, %f705; // begin inline asm { cvt.f32.f16 %f507, %rs320;} // end inline asm fma.rn.ftz.f32 %f713, %f711, %f507, %f706; // begin inline asm { cvt.f32.f16 %f508, %rs321;} // end inline asm fma.rn.ftz.f32 %f714, %f711, %f508, %f707; // begin inline asm { cvt.f32.f16 %f509, %rs322;} // end inline asm fma.rn.ftz.f32 %f715, %f711, %f509, %f708; shl.b16 %rs477, %rs54, 4; cvt.s16.s8 %rs478, %rs477; shr.s16 %rs479, %rs478, 7; and.b16 %rs480, %rs479, -16; or.b16 %rs481, %rs480, %rs55; cvt.rn.f32.s16 %f716, %rs481; sub.ftz.f32 %f717, %f716, %f6; mul.ftz.f32 %f718, %f57, %f717; mov.b32 {%rs323, %rs327}, %r277; // begin inline asm { cvt.f32.f16 %f510, %rs323;} // end inline asm fma.rn.ftz.f32 %f719, %f718, %f510, %f712; mov.b32 {%rs324, %rs328}, %r285; // begin inline asm { cvt.f32.f16 %f511, %rs324;} // end inline asm fma.rn.ftz.f32 %f720, %f718, %f511, %f713; mov.b32 {%rs325, %rs329}, %r293; // begin inline asm { cvt.f32.f16 %f512, %rs325;} // end inline asm fma.rn.ftz.f32 %f721, %f718, %f512, %f714; mov.b32 {%rs326, 
%rs330}, %r301; // begin inline asm { cvt.f32.f16 %f513, %rs326;} // end inline asm fma.rn.ftz.f32 %f722, %f718, %f513, %f715; shl.b16 %rs482, %rs56, 4; cvt.s16.s8 %rs483, %rs482; shr.s16 %rs484, %rs483, 7; and.b16 %rs485, %rs484, -16; or.b16 %rs486, %rs485, %rs57; cvt.rn.f32.s16 %f723, %rs486; sub.ftz.f32 %f724, %f723, %f6; mul.ftz.f32 %f725, %f57, %f724; // begin inline asm { cvt.f32.f16 %f514, %rs327;} // end inline asm fma.rn.ftz.f32 %f726, %f725, %f514, %f719; // begin inline asm { cvt.f32.f16 %f515, %rs328;} // end inline asm fma.rn.ftz.f32 %f727, %f725, %f515, %f720; // begin inline asm { cvt.f32.f16 %f516, %rs329;} // end inline asm fma.rn.ftz.f32 %f728, %f725, %f516, %f721; // begin inline asm { cvt.f32.f16 %f517, %rs330;} // end inline asm fma.rn.ftz.f32 %f729, %f725, %f517, %f722; shl.b16 %rs487, %rs58, 4; cvt.s16.s8 %rs488, %rs487; shr.s16 %rs489, %rs488, 7; and.b16 %rs490, %rs489, -16; or.b16 %rs491, %rs490, %rs59; cvt.rn.f32.s16 %f730, %rs491; sub.ftz.f32 %f731, %f730, %f6; mul.ftz.f32 %f732, %f57, %f731; mov.b32 {%rs331, %rs335}, %r278; // begin inline asm { cvt.f32.f16 %f518, %rs331;} // end inline asm fma.rn.ftz.f32 %f733, %f732, %f518, %f726; mov.b32 {%rs332, %rs336}, %r286; // begin inline asm { cvt.f32.f16 %f519, %rs332;} // end inline asm fma.rn.ftz.f32 %f734, %f732, %f519, %f727; mov.b32 {%rs333, %rs337}, %r294; // begin inline asm { cvt.f32.f16 %f520, %rs333;} // end inline asm fma.rn.ftz.f32 %f735, %f732, %f520, %f728; mov.b32 {%rs334, %rs338}, %r302; // begin inline asm { cvt.f32.f16 %f521, %rs334;} // end inline asm fma.rn.ftz.f32 %f736, %f732, %f521, %f729; shl.b16 %rs492, %rs60, 4; cvt.s16.s8 %rs493, %rs492; shr.s16 %rs494, %rs493, 7; and.b16 %rs495, %rs494, -16; or.b16 %rs496, %rs495, %rs61; cvt.rn.f32.s16 %f737, %rs496; sub.ftz.f32 %f738, %f737, %f6; mul.ftz.f32 %f739, %f57, %f738; // begin inline asm { cvt.f32.f16 %f522, %rs335;} // end inline asm fma.rn.ftz.f32 %f740, %f739, %f522, %f733; // begin inline asm { cvt.f32.f16 %f523, 
%rs336;} // end inline asm fma.rn.ftz.f32 %f741, %f739, %f523, %f734; // begin inline asm { cvt.f32.f16 %f524, %rs337;} // end inline asm fma.rn.ftz.f32 %f742, %f739, %f524, %f735; // begin inline asm { cvt.f32.f16 %f525, %rs338;} // end inline asm fma.rn.ftz.f32 %f743, %f739, %f525, %f736; shl.b16 %rs497, %rs62, 4; cvt.s16.s8 %rs498, %rs497; shr.s16 %rs499, %rs498, 7; and.b16 %rs500, %rs499, -16; or.b16 %rs501, %rs500, %rs63; cvt.rn.f32.s16 %f744, %rs501; sub.ftz.f32 %f745, %f744, %f6; mul.ftz.f32 %f746, %f57, %f745; mov.b32 {%rs339, %rs343}, %r279; // begin inline asm { cvt.f32.f16 %f526, %rs339;} // end inline asm fma.rn.ftz.f32 %f747, %f746, %f526, %f740; mov.b32 {%rs340, %rs344}, %r287; // begin inline asm { cvt.f32.f16 %f527, %rs340;} // end inline asm fma.rn.ftz.f32 %f748, %f746, %f527, %f741; mov.b32 {%rs341, %rs345}, %r295; // begin inline asm { cvt.f32.f16 %f528, %rs341;} // end inline asm fma.rn.ftz.f32 %f749, %f746, %f528, %f742; mov.b32 {%rs342, %rs346}, %r303; // begin inline asm { cvt.f32.f16 %f529, %rs342;} // end inline asm fma.rn.ftz.f32 %f750, %f746, %f529, %f743; shl.b16 %rs502, %rs64, 4; cvt.s16.s8 %rs503, %rs502; shr.s16 %rs504, %rs503, 7; and.b16 %rs505, %rs504, -16; or.b16 %rs506, %rs505, %rs64; cvt.rn.f32.s16 %f751, %rs506; sub.ftz.f32 %f752, %f751, %f6; mul.ftz.f32 %f753, %f57, %f752; // begin inline asm { cvt.f32.f16 %f530, %rs343;} // end inline asm fma.rn.ftz.f32 %f849, %f753, %f530, %f747; // begin inline asm { cvt.f32.f16 %f531, %rs344;} // end inline asm fma.rn.ftz.f32 %f848, %f753, %f531, %f748; // begin inline asm { cvt.f32.f16 %f532, %rs345;} // end inline asm fma.rn.ftz.f32 %f847, %f753, %f532, %f749; // begin inline asm { cvt.f32.f16 %f533, %rs346;} // end inline asm fma.rn.ftz.f32 %f846, %f753, %f533, %f750; $L__BB0_8: add.s32 %r425, %r425, 4; shl.b32 %r308, %r425, 5; add.s32 %r424, %r308, %r49; shl.b32 %r423, %r424, 2; setp.lt.u32 %p7, %r423, %r46; @%p7 bra $L__BB0_2; $L__BB0_9: mov.u32 %r310, %tid.y; shl.b32 %r311, %r310, 5; 
add.s32 %r36, %r311, %r49; setp.lt.u32 %p8, %r36, 32; shl.b32 %r313, %r36, 2; mov.u32 %r314, _ZZ9gemv_int4ILi4ELi128ELi4EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage; add.s32 %r315, %r314, %r313; @%p8 bra $L__BB0_11; add.s32 %r419, %r315, -112; st.shared.f32 [%r419], %f849; $L__BB0_11: setp.gt.u32 %p9, %r36, 31; bar.sync 0; mad.lo.s32 %r38, %r36, 12, %r314; @%p9 bra $L__BB0_13; mov.u32 %r334, 16; ld.shared.f32 %f769, [%r38+16]; add.ftz.f32 %f770, %f849, %f769; ld.shared.f32 %f771, [%r38+20]; add.ftz.f32 %f772, %f770, %f771; ld.shared.f32 %f773, [%r38+24]; add.ftz.f32 %f756, %f772, %f773; mov.u32 %r322, 1; mov.u32 %r335, 31; mov.u32 %r336, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f756, %r322, %r335, %r336; @p add.f32 r0, r0, %f756; mov.f32 %f754, r0;} // end inline asm mov.u32 %r325, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f754, %r325, %r335, %r336; @p add.f32 r0, r0, %f754; mov.f32 %f757, r0;} // end inline asm mov.u32 %r328, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f757, %r328, %r335, %r336; @p add.f32 r0, r0, %f757; mov.f32 %f760, r0;} // end inline asm mov.u32 %r331, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f760, %r331, %r335, %r336; @p add.f32 r0, r0, %f760; mov.f32 %f763, r0;} // end inline asm // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f763, %r334, %r335, %r336; @p add.f32 r0, r0, %f763; mov.f32 %f849, r0;} // end inline asm $L__BB0_13: @%p8 bra $L__BB0_15; add.s32 %r420, %r315, -112; st.shared.f32 [%r420+640], %f848; $L__BB0_15: bar.sync 0; @%p9 bra $L__BB0_17; ld.shared.f32 %f789, [%r38+656]; add.ftz.f32 %f790, %f848, %f789; ld.shared.f32 %f791, [%r38+660]; add.ftz.f32 %f792, %f790, %f791; ld.shared.f32 %f793, [%r38+664]; add.ftz.f32 %f776, %f792, %f793; mov.u32 %r346, 1; mov.u32 %r359, 31; mov.u32 %r360, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; 
shfl.sync.down.b32 r0|p, %f776, %r346, %r359, %r360; @p add.f32 r0, r0, %f776; mov.f32 %f774, r0;} // end inline asm mov.u32 %r349, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f774, %r349, %r359, %r360; @p add.f32 r0, r0, %f774; mov.f32 %f777, r0;} // end inline asm mov.u32 %r352, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f777, %r352, %r359, %r360; @p add.f32 r0, r0, %f777; mov.f32 %f780, r0;} // end inline asm mov.u32 %r355, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f780, %r355, %r359, %r360; @p add.f32 r0, r0, %f780; mov.f32 %f783, r0;} // end inline asm mov.u32 %r358, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f783, %r358, %r359, %r360; @p add.f32 r0, r0, %f783; mov.f32 %f848, r0;} // end inline asm $L__BB0_17: @%p8 bra $L__BB0_19; add.s32 %r421, %r315, -112; st.shared.f32 [%r421+1280], %f847; $L__BB0_19: bar.sync 0; @%p9 bra $L__BB0_21; ld.shared.f32 %f809, [%r38+1296]; add.ftz.f32 %f810, %f847, %f809; ld.shared.f32 %f811, [%r38+1300]; add.ftz.f32 %f812, %f810, %f811; ld.shared.f32 %f813, [%r38+1304]; add.ftz.f32 %f796, %f812, %f813; mov.u32 %r370, 1; mov.u32 %r383, 31; mov.u32 %r384, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f796, %r370, %r383, %r384; @p add.f32 r0, r0, %f796; mov.f32 %f794, r0;} // end inline asm mov.u32 %r373, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f794, %r373, %r383, %r384; @p add.f32 r0, r0, %f794; mov.f32 %f797, r0;} // end inline asm mov.u32 %r376, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f797, %r376, %r383, %r384; @p add.f32 r0, r0, %f797; mov.f32 %f800, r0;} // end inline asm mov.u32 %r379, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f800, %r379, %r383, %r384; @p add.f32 r0, r0, %f800; mov.f32 %f803, r0;} // end inline asm mov.u32 %r382, 16; // 
begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f803, %r382, %r383, %r384; @p add.f32 r0, r0, %f803; mov.f32 %f847, r0;} // end inline asm $L__BB0_21: @%p8 bra $L__BB0_23; add.s32 %r422, %r315, -112; st.shared.f32 [%r422+1920], %f846; $L__BB0_23: bar.sync 0; @%p9 bra $L__BB0_25; ld.shared.f32 %f829, [%r38+1936]; add.ftz.f32 %f830, %f846, %f829; ld.shared.f32 %f831, [%r38+1940]; add.ftz.f32 %f832, %f830, %f831; ld.shared.f32 %f833, [%r38+1944]; add.ftz.f32 %f816, %f832, %f833; mov.u32 %r394, 1; mov.u32 %r407, 31; mov.u32 %r408, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f816, %r394, %r407, %r408; @p add.f32 r0, r0, %f816; mov.f32 %f814, r0;} // end inline asm mov.u32 %r397, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f814, %r397, %r407, %r408; @p add.f32 r0, r0, %f814; mov.f32 %f817, r0;} // end inline asm mov.u32 %r400, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f817, %r400, %r407, %r408; @p add.f32 r0, r0, %f817; mov.f32 %f820, r0;} // end inline asm mov.u32 %r403, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f820, %r403, %r407, %r408; @p add.f32 r0, r0, %f820; mov.f32 %f823, r0;} // end inline asm mov.u32 %r406, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f823, %r406, %r407, %r408; @p add.f32 r0, r0, %f823; mov.f32 %f846, r0;} // end inline asm $L__BB0_25: or.b32 %r411, %r49, %r310; setp.ne.s32 %p16, %r411, 0; @%p16 bra $L__BB0_35; ld.param.u64 %rd65, [_Z28dequant_gemv_group128_batch423DequantGemvKernelParams_param_0+8]; setp.eq.s64 %p17, %rd65, 0; mul.ftz.f32 %f858, %f51, %f849; mov.u32 %r412, %ctaid.x; cvt.s64.s32 %rd11, %r412; @%p17 bra $L__BB0_28; shl.b64 %rd41, %rd11, 1; add.s64 %rd42, %rd2, %rd41; ld.global.u16 %rs507, [%rd42]; // begin inline asm { cvt.f32.f16 %f834, %rs507;} // end inline asm fma.rn.ftz.f32 %f858, %f52, %f834, %f858; $L__BB0_28: 
ld.param.u64 %rd66, [_Z28dequant_gemv_group128_batch423DequantGemvKernelParams_param_0]; // begin inline asm { cvt.rn.f16.f32 %rs508, %f858;} // end inline asm // begin inline asm createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd43, 1.0; // end inline asm shl.b64 %rd46, %rd11, 1; add.s64 %rd44, %rd66, %rd46; // begin inline asm st.global.L1::evict_normal.L2::cache_hint.u16 [%rd44], %rs508, %rd43; // end inline asm mul.ftz.f32 %f859, %f51, %f848; add.s32 %r414, %r45, %r412; cvt.s64.s32 %rd14, %r414; @%p17 bra $L__BB0_30; shl.b64 %rd47, %rd14, 1; add.s64 %rd48, %rd2, %rd47; ld.global.u16 %rs510, [%rd48]; // begin inline asm { cvt.f32.f16 %f836, %rs510;} // end inline asm fma.rn.ftz.f32 %f859, %f52, %f836, %f859; $L__BB0_30: mul.wide.s32 %rd52, %r45, 2; add.s64 %rd50, %rd44, %rd52; // begin inline asm { cvt.rn.f16.f32 %rs511, %f859;} // end inline asm // begin inline asm createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd49, 1.0; // end inline asm // begin inline asm st.global.L1::evict_normal.L2::cache_hint.u16 [%rd50], %rs511, %rd49; // end inline asm mul.ftz.f32 %f860, %f51, %f847; cvt.u32.u64 %r415, %rd14; add.s32 %r416, %r415, %r45; cvt.s64.s32 %rd15, %r416; @%p17 bra $L__BB0_32; shl.b64 %rd53, %rd15, 1; add.s64 %rd54, %rd2, %rd53; ld.global.u16 %rs513, [%rd54]; // begin inline asm { cvt.f32.f16 %f838, %rs513;} // end inline asm fma.rn.ftz.f32 %f860, %f52, %f838, %f860; $L__BB0_32: ld.param.u64 %rd67, [_Z28dequant_gemv_group128_batch423DequantGemvKernelParams_param_0]; // begin inline asm { cvt.rn.f16.f32 %rs514, %f860;} // end inline asm // begin inline asm createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd55, 1.0; // end inline asm shl.b64 %rd58, %rd15, 1; add.s64 %rd56, %rd67, %rd58; // begin inline asm st.global.L1::evict_normal.L2::cache_hint.u16 [%rd56], %rs514, %rd55; // end inline asm mul.ftz.f32 %f861, %f51, %f846; cvt.u32.u64 %r417, %rd15; add.s32 %r418, %r417, %r45; cvt.s64.s32 %rd16, %r418; @%p17 bra 
$L__BB0_34;
// --- Batch row 3: optional accumulate of existing output ---------------------
// This path is skipped (branch above on %p17) when the pointer loaded from
// kernel-param offset +8 is null.  %rd16 is the row-3 element index; shl by 1
// converts it to a byte offset for 16-bit (fp16) elements.
shl.b64 %rd59, %rd16, 1;
add.s64 %rd60, %rd2, %rd59;
ld.global.u16 %rs516, [%rd60];
// begin inline asm
{ cvt.f32.f16 %f840, %rs516;}
// end inline asm
// %f861 += %f52 * loaded value.  %f52 is the second of the two f32 scales
// read from param_0+64 in the prologue -- NOTE(review): presumably the
// beta/residual scale; confirm against the CUDA source.
fma.rn.ftz.f32 %f861, %f52, %f840, %f861;
$L__BB0_34:
// Convert the row-3 accumulator to fp16 and store it through the pointer held
// at param_0+0, at byte offset %rd16*2, using an L2 cache-hint store policy
// (evict_last / evict_unchanged).
ld.param.u64 %rd68, [_Z28dequant_gemv_group128_batch423DequantGemvKernelParams_param_0];
// begin inline asm
{ cvt.rn.f16.f32 %rs517, %f861;}
// end inline asm
// begin inline asm
createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd61, 1.0;
// end inline asm
shl.b64 %rd64, %rd16, 1;
add.s64 %rd62, %rd68, %rd64;
// begin inline asm
st.global.L1::evict_normal.L2::cache_hint.u16 [%rd62], %rs517, %rd61;
// end inline asm
$L__BB0_35:
ret;
}
// .globl _ZN3cub11EmptyKernelIvEEvv
// cub::EmptyKernel<void>() -- CUB/Thrust's deliberately empty kernel
// (mangled name demangles to that template instantiation); its body is a
// single ret and it performs no work.
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
ret;
}