// .globl	_Z27dequant_gemv_group64_batch623DequantGemvKernelParams
// _ZZ9gemv_int4ILi4ELi64ELi6EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage has been demoted

.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf9f49796thrust6system6detail10sequential3seqE[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf9f49796thrust12placeholders2_1E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf9f49796thrust12placeholders2_2E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf9f49796thrust12placeholders2_3E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf9f49796thrust12placeholders2_4E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf9f49796thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf9f49796thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf9f49796thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf9f49796thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf9f49796thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf9f49796thrust12placeholders3_10E[1];

.visible .entry _Z27dequant_gemv_group64_batch623DequantGemvKernelParams(
.param .align 8 .b8 _Z27dequant_gemv_group64_batch623DequantGemvKernelParams_param_0[80]
)
{
.reg .pred %p<27>;
.reg .b16 %rs<384>;
.reg .f32 %f<712>;
.reg .b32 %r<340>;
.reg .b64 %rd<87>;
// demoted variable
.shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi64ELi6EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[3840];

ld.param.v2.u32 {%r50, %r51}, [_Z27dequant_gemv_group64_batch623DequantGemvKernelParams_param_0+48];
ld.param.v2.u32 {%r52, %r53}, [_Z27dequant_gemv_group64_batch623DequantGemvKernelParams_param_0+56];
ld.param.v2.f32 {%f73, %f74}, [_Z27dequant_gemv_group64_batch623DequantGemvKernelParams_param_0+64];
ld.param.v4.u8 {%rs65, %rs66, %rs67, %rs68}, [_Z27dequant_gemv_group64_batch623DequantGemvKernelParams_param_0+72];
ld.param.u64 %rd30, [_Z27dequant_gemv_group64_batch623DequantGemvKernelParams_param_0+40];
ld.param.u64 %rd29, [_Z27dequant_gemv_group64_batch623DequantGemvKernelParams_param_0+32];
ld.param.u64 %rd28, [_Z27dequant_gemv_group64_batch623DequantGemvKernelParams_param_0+24];
ld.param.u64 %rd27, [_Z27dequant_gemv_group64_batch623DequantGemvKernelParams_param_0+16];
ld.param.u64 %rd26, [_Z27dequant_gemv_group64_batch623DequantGemvKernelParams_param_0+8];
cvta.to.global.u64 %rd2, %rd26;
mov.u32 %r339, %tid.y;
shl.b32 %r54, %r339, 5;
mov.u32 %r55, %tid.x;
add.s32 %r338, %r54, %r55;
shl.b32 %r337, %r338, 1;
setp.ge.u32 %p1, %r337, %r52;
mov.f32 %f688, 0f00000000;
mov.f32 %f689, %f688;
mov.f32 %f690, %f688;
mov.f32 %f691, %f688;
mov.f32 %f692, %f688;
mov.f32 %f693, %f688;
@%p1 bra $L__BB0_9;
cvta.to.global.u64 %rd3, %rd27;
mov.u32 %r56, %ctaid.x;
mul.lo.s32 %r6, %r53, %r56;
$L__BB0_2:
mad.lo.s32 %r60, %r52, %r56, %r337;
mul.wide.u32 %rd37, %r60, 4;
add.s64 %rd32, %rd28, %rd37;
// begin inline asm
createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd31, 1.0;
// end inline asm
// begin inline asm
ld.global.L1::no_allocate.L2::cache_hint.v2.u32 {%r57,%r58}, [%rd32], %rd31;
// end inline asm
shr.u32 %r62, %r55, 2;
shl.b32 %r63, %r339, 3;
add.s32 %r12, %r63, %r62;
add.s32 %r13, %r12, %r6;
mul.wide.s32 %rd38, %r13, 2;
add.s64 %rd35, %rd30, %rd38;
// begin inline asm
createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd34, 1.0;
// end inline asm
// begin inline asm
ld.global.L1::evict_last.L2::cache_hint.u16 %rs73, [%rd35], %rd34;
// end
inline asm // begin inline asm { cvt.f32.f16 %f81, %rs73;} // end inline asm shl.b16 %rs383, %rs65, 3; setp.eq.s64 %p2, %rd29, 0; @%p2 bra $L__BB0_4; shr.u32 %r64, %r13, 31; add.s32 %r65, %r13, %r64; shr.s32 %r66, %r65, 1; cvt.s64.s32 %rd42, %r66; add.s64 %rd40, %rd29, %rd42; // begin inline asm createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd39, 1.0; // end inline asm // begin inline asm ld.global.L1::evict_last.L2::cache_hint.u8 %rs75, [%rd40], %rd39; // end inline asm cvt.u32.u16 %r67, %rs75; and.b32 %r68, %r67, 255; shl.b32 %r69, %r12, 2; and.b32 %r70, %r69, 4; shr.u32 %r71, %r68, %r70; cvt.u16.u32 %rs76, %r71; and.b16 %rs383, %rs76, 15; $L__BB0_4: shl.b32 %r14, %r338, 4; setp.ge.s32 %p3, %r14, %r50; @%p3 bra $L__BB0_8; setp.eq.s16 %p4, %rs65, 0; shr.u16 %rs78, %rs383, 3; and.b16 %rs79, %rs78, 1; setp.eq.b16 %p5, %rs79, 1; and.pred %p6, %p4, %p5; selp.b16 %rs80, -16, 0, %p6; or.b16 %rs81, %rs80, %rs383; cvt.s16.s8 %rs82, %rs81; cvt.rn.f32.s16 %f8, %rs82; mul.wide.s32 %rd43, %r14, 2; add.s64 %rd7, %rd3, %rd43; ld.global.v4.u32 {%r72, %r73, %r74, %r75}, [%rd7]; mul.wide.s32 %rd44, %r50, 2; add.s64 %rd8, %rd7, %rd44; ld.global.v4.u32 {%r76, %r77, %r78, %r79}, [%rd8]; add.s32 %r80, %r14, %r50; add.s32 %r81, %r80, %r50; mul.wide.s32 %rd45, %r81, 2; add.s64 %rd46, %rd3, %rd45; ld.global.v4.u32 {%r82, %r83, %r84, %r85}, [%rd46]; add.s64 %rd47, %rd46, %rd44; ld.global.v4.u32 {%r86, %r87, %r88, %r89}, [%rd47]; add.s64 %rd48, %rd47, %rd44; ld.global.v4.u32 {%r90, %r91, %r92, %r93}, [%rd48]; add.s64 %rd49, %rd48, %rd44; ld.global.v4.u32 {%r94, %r95, %r96, %r97}, [%rd49]; cvt.u16.u32 %rs5, %r57; and.b16 %rs6, %rs5, 15; mov.b32 {%rs9, %rs8}, %r72; mov.b32 {%rs12, %rs11}, %r76; mov.b32 {%rs15, %rs14}, %r82; mov.b32 {%rs18, %rs17}, %r86; mov.b32 {%rs21, %rs20}, %r90; mov.b32 {%rs24, %rs23}, %r94; shr.u32 %r98, %r57, 4; cvt.u16.u32 %rs25, %r98; and.b16 %rs26, %rs25, 15; shr.u32 %r99, %r57, 8; cvt.u16.u32 %rs27, %r99; and.b16 %rs28, %rs27, 15; shr.u32 %r100, %r57, 12; cvt.u16.u32 %rs29, %r100; and.b16 %rs30, %rs29, 15; shr.u32 %r101, %r57, 16; cvt.u16.u32 %rs31, %r101; and.b16 %rs32, %rs31, 15; shr.u32 %r102, %r57, 20; cvt.u16.u32 %rs33, %r102; and.b16 %rs34, %rs33, 15; mov.b32 {%rs83, %rs35}, %r84; mov.b32 {%rs84, %rs36}, %r88; mov.b32 {%rs85, %rs37}, %r92; mov.b32 {%rs86, %rs38}, %r96; shr.u32 %r103, %r57, 24; cvt.u16.u32 %rs39, %r103; and.b16 %rs40, %rs39, 15; shr.u32 %r104, %r57, 28; cvt.u16.u32 %rs41, %r104; cvt.u16.u32 %rs42, %r58; and.b16 %rs43, %rs42, 15; shr.u32 %r105, %r58, 4; cvt.u16.u32 %rs44, %r105; and.b16 %rs45, %rs44, 15; shr.u32 %r106, %r58, 8; cvt.u16.u32 %rs46, %r106; and.b16 %rs47, %rs46, 15; shr.u32 %r107, %r58, 12; cvt.u16.u32 %rs48, %r107; and.b16 %rs49, %rs48, 15; shr.u32 %r108, %r58, 16; cvt.u16.u32 %rs50, %r108; and.b16 %rs51, %rs50, 15; shr.u32 %r109, %r58, 20; cvt.u16.u32 %rs52, %r109; and.b16 %rs53, %rs52, 15; shr.u32 %r110, %r58, 24; cvt.u16.u32 %rs54, %r110; and.b16 %rs55, %rs54, 15; shr.u32 %r111, %r58, 28; cvt.u16.u32 %rs56, %r111; add.s64 %rd50, %rd8, %rd44; add.s64 %rd9, %rd50, 16; add.s64 %rd10, %rd9, %rd44; add.s64 %rd11, %rd10, %rd44; add.s64 %rd12, %rd11, %rd44; @%p4 bra $L__BB0_7; cvt.rn.f32.s16 %f178, %rs6; sub.ftz.f32 %f179, %f178, %f8; mul.ftz.f32 %f180, %f81, %f179; // begin inline asm { cvt.f32.f16 %f82, %rs9;} // end inline asm fma.rn.ftz.f32 %f181, %f180, %f82, %f693; // begin inline asm { cvt.f32.f16 %f83, %rs12;} // end inline asm fma.rn.ftz.f32 %f182, %f180, %f83, %f692; // begin inline asm { cvt.f32.f16 %f84, %rs15;} // end inline asm 
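// (zero-point path: each of the 16 int4 nibbles unpacked from {%r57,%r58} is
// dequantized as scale * (nibble - zero_point) and accumulated by FMA into the
// six batch accumulators %f688-%f693, one per row of x)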
fma.rn.ftz.f32 %f183, %f180, %f84, %f691; // begin inline asm { cvt.f32.f16 %f85, %rs18;} // end inline asm fma.rn.ftz.f32 %f184, %f180, %f85, %f690; // begin inline asm { cvt.f32.f16 %f86, %rs21;} // end inline asm fma.rn.ftz.f32 %f185, %f180, %f86, %f689; // begin inline asm { cvt.f32.f16 %f87, %rs24;} // end inline asm fma.rn.ftz.f32 %f186, %f180, %f87, %f688; cvt.rn.f32.s16 %f187, %rs26; sub.ftz.f32 %f188, %f187, %f8; mul.ftz.f32 %f189, %f81, %f188; // begin inline asm { cvt.f32.f16 %f88, %rs8;} // end inline asm fma.rn.ftz.f32 %f190, %f189, %f88, %f181; // begin inline asm { cvt.f32.f16 %f89, %rs11;} // end inline asm fma.rn.ftz.f32 %f191, %f189, %f89, %f182; // begin inline asm { cvt.f32.f16 %f90, %rs14;} // end inline asm fma.rn.ftz.f32 %f192, %f189, %f90, %f183; // begin inline asm { cvt.f32.f16 %f91, %rs17;} // end inline asm fma.rn.ftz.f32 %f193, %f189, %f91, %f184; // begin inline asm { cvt.f32.f16 %f92, %rs20;} // end inline asm fma.rn.ftz.f32 %f194, %f189, %f92, %f185; // begin inline asm { cvt.f32.f16 %f93, %rs23;} // end inline asm fma.rn.ftz.f32 %f195, %f189, %f93, %f186; cvt.rn.f32.s16 %f196, %rs28; sub.ftz.f32 %f197, %f196, %f8; mul.ftz.f32 %f198, %f81, %f197; mov.b32 {%rs99, %rs105}, %r73; // begin inline asm { cvt.f32.f16 %f94, %rs99;} // end inline asm fma.rn.ftz.f32 %f199, %f198, %f94, %f190; mov.b32 {%rs100, %rs106}, %r77; // begin inline asm { cvt.f32.f16 %f95, %rs100;} // end inline asm fma.rn.ftz.f32 %f200, %f198, %f95, %f191; mov.b32 {%rs101, %rs107}, %r83; // begin inline asm { cvt.f32.f16 %f96, %rs101;} // end inline asm fma.rn.ftz.f32 %f201, %f198, %f96, %f192; mov.b32 {%rs102, %rs108}, %r87; // begin inline asm { cvt.f32.f16 %f97, %rs102;} // end inline asm fma.rn.ftz.f32 %f202, %f198, %f97, %f193; mov.b32 {%rs103, %rs109}, %r91; // begin inline asm { cvt.f32.f16 %f98, %rs103;} // end inline asm fma.rn.ftz.f32 %f203, %f198, %f98, %f194; mov.b32 {%rs104, %rs110}, %r95; // begin inline asm { cvt.f32.f16 %f99, %rs104;} // end inline asm fma.rn.ftz.f32 %f204, %f198, %f99, %f195; cvt.rn.f32.s16 %f205, %rs30; sub.ftz.f32 %f206, %f205, %f8; mul.ftz.f32 %f207, %f81, %f206; // begin inline asm { cvt.f32.f16 %f100, %rs105;} // end inline asm fma.rn.ftz.f32 %f208, %f207, %f100, %f199; // begin inline asm { cvt.f32.f16 %f101, %rs106;} // end inline asm fma.rn.ftz.f32 %f209, %f207, %f101, %f200; // begin inline asm { cvt.f32.f16 %f102, %rs107;} // end inline asm fma.rn.ftz.f32 %f210, %f207, %f102, %f201; // begin inline asm { cvt.f32.f16 %f103, %rs108;} // end inline asm fma.rn.ftz.f32 %f211, %f207, %f103, %f202; // begin inline asm { cvt.f32.f16 %f104, %rs109;} // end inline asm fma.rn.ftz.f32 %f212, %f207, %f104, %f203; // begin inline asm { cvt.f32.f16 %f105, %rs110;} // end inline asm fma.rn.ftz.f32 %f213, %f207, %f105, %f204; cvt.rn.f32.s16 %f214, %rs32; sub.ftz.f32 %f215, %f214, %f8; mul.ftz.f32 %f216, %f81, %f215; mov.b32 {%rs111, %rs117}, %r74; // begin inline asm { cvt.f32.f16 %f106, %rs111;} // end inline asm fma.rn.ftz.f32 %f217, %f216, %f106, %f208; mov.b32 {%rs112, %rs118}, %r78; // begin inline asm { cvt.f32.f16 %f107, %rs112;} // end inline asm fma.rn.ftz.f32 %f218, %f216, %f107, %f209; cvt.u16.u32 %rs113, %r84; // begin inline asm { cvt.f32.f16 %f108, %rs113;} // end inline asm fma.rn.ftz.f32 %f219, %f216, %f108, %f210; cvt.u16.u32 %rs114, %r88; // begin inline asm { cvt.f32.f16 %f109, %rs114;} // end inline asm fma.rn.ftz.f32 %f220, %f216, %f109, %f211; cvt.u16.u32 %rs115, %r92; // begin inline asm { cvt.f32.f16 %f110, %rs115;} // end inline asm 
fma.rn.ftz.f32 %f221, %f216, %f110, %f212; cvt.u16.u32 %rs116, %r96; // begin inline asm { cvt.f32.f16 %f111, %rs116;} // end inline asm fma.rn.ftz.f32 %f222, %f216, %f111, %f213; cvt.rn.f32.s16 %f223, %rs34; sub.ftz.f32 %f224, %f223, %f8; mul.ftz.f32 %f225, %f81, %f224; // begin inline asm { cvt.f32.f16 %f112, %rs117;} // end inline asm fma.rn.ftz.f32 %f226, %f225, %f112, %f217; // begin inline asm { cvt.f32.f16 %f113, %rs118;} // end inline asm fma.rn.ftz.f32 %f227, %f225, %f113, %f218; // begin inline asm { cvt.f32.f16 %f114, %rs35;} // end inline asm fma.rn.ftz.f32 %f228, %f225, %f114, %f219; // begin inline asm { cvt.f32.f16 %f115, %rs36;} // end inline asm fma.rn.ftz.f32 %f229, %f225, %f115, %f220; // begin inline asm { cvt.f32.f16 %f116, %rs37;} // end inline asm fma.rn.ftz.f32 %f230, %f225, %f116, %f221; // begin inline asm { cvt.f32.f16 %f117, %rs38;} // end inline asm fma.rn.ftz.f32 %f231, %f225, %f117, %f222; cvt.rn.f32.s16 %f232, %rs40; sub.ftz.f32 %f233, %f232, %f8; mul.ftz.f32 %f234, %f81, %f233; mov.b32 {%rs123, %rs129}, %r75; // begin inline asm { cvt.f32.f16 %f118, %rs123;} // end inline asm fma.rn.ftz.f32 %f235, %f234, %f118, %f226; mov.b32 {%rs124, %rs130}, %r79; // begin inline asm { cvt.f32.f16 %f119, %rs124;} // end inline asm fma.rn.ftz.f32 %f236, %f234, %f119, %f227; mov.b32 {%rs125, %rs131}, %r85; // begin inline asm { cvt.f32.f16 %f120, %rs125;} // end inline asm fma.rn.ftz.f32 %f237, %f234, %f120, %f228; mov.b32 {%rs126, %rs132}, %r89; // begin inline asm { cvt.f32.f16 %f121, %rs126;} // end inline asm fma.rn.ftz.f32 %f238, %f234, %f121, %f229; mov.b32 {%rs127, %rs133}, %r93; // begin inline asm { cvt.f32.f16 %f122, %rs127;} // end inline asm fma.rn.ftz.f32 %f239, %f234, %f122, %f230; mov.b32 {%rs128, %rs134}, %r97; // begin inline asm { cvt.f32.f16 %f123, %rs128;} // end inline asm fma.rn.ftz.f32 %f240, %f234, %f123, %f231; cvt.rn.f32.s16 %f241, %rs41; sub.ftz.f32 %f242, %f241, %f8; mul.ftz.f32 %f243, %f81, %f242; // begin inline asm { cvt.f32.f16 %f124, %rs129;} // end inline asm fma.rn.ftz.f32 %f244, %f243, %f124, %f235; // begin inline asm { cvt.f32.f16 %f125, %rs130;} // end inline asm fma.rn.ftz.f32 %f245, %f243, %f125, %f236; // begin inline asm { cvt.f32.f16 %f126, %rs131;} // end inline asm fma.rn.ftz.f32 %f246, %f243, %f126, %f237; // begin inline asm { cvt.f32.f16 %f127, %rs132;} // end inline asm fma.rn.ftz.f32 %f247, %f243, %f127, %f238; // begin inline asm { cvt.f32.f16 %f128, %rs133;} // end inline asm fma.rn.ftz.f32 %f248, %f243, %f128, %f239; // begin inline asm { cvt.f32.f16 %f129, %rs134;} // end inline asm fma.rn.ftz.f32 %f249, %f243, %f129, %f240; ld.global.v4.u32 {%r112, %r113, %r114, %r115}, [%rd7+16]; ld.global.v4.u32 {%r120, %r121, %r122, %r123}, [%rd8+16]; ld.global.v4.u32 {%r128, %r129, %r130, %r131}, [%rd9]; ld.global.v4.u32 {%r136, %r137, %r138, %r139}, [%rd10]; ld.global.v4.u32 {%r144, %r145, %r146, %r147}, [%rd11]; ld.global.v4.u32 {%r152, %r153, %r154, %r155}, [%rd12]; cvt.rn.f32.s16 %f250, %rs43; sub.ftz.f32 %f251, %f250, %f8; mul.ftz.f32 %f252, %f81, %f251; mov.b32 {%rs135, %rs141}, %r112; // begin inline asm { cvt.f32.f16 %f130, %rs135;} // end inline asm fma.rn.ftz.f32 %f253, %f252, %f130, %f244; mov.b32 {%rs136, %rs142}, %r120; // begin inline asm { cvt.f32.f16 %f131, %rs136;} // end inline asm fma.rn.ftz.f32 %f254, %f252, %f131, %f245; mov.b32 {%rs137, %rs143}, %r128; // begin inline asm { cvt.f32.f16 %f132, %rs137;} // end inline asm fma.rn.ftz.f32 %f255, %f252, %f132, %f246; mov.b32 {%rs138, %rs144}, %r136; // begin inline 
asm { cvt.f32.f16 %f133, %rs138;} // end inline asm fma.rn.ftz.f32 %f256, %f252, %f133, %f247; mov.b32 {%rs139, %rs145}, %r144; // begin inline asm { cvt.f32.f16 %f134, %rs139;} // end inline asm fma.rn.ftz.f32 %f257, %f252, %f134, %f248; mov.b32 {%rs140, %rs146}, %r152; // begin inline asm { cvt.f32.f16 %f135, %rs140;} // end inline asm fma.rn.ftz.f32 %f258, %f252, %f135, %f249; cvt.rn.f32.s16 %f259, %rs45; sub.ftz.f32 %f260, %f259, %f8; mul.ftz.f32 %f261, %f81, %f260; // begin inline asm { cvt.f32.f16 %f136, %rs141;} // end inline asm fma.rn.ftz.f32 %f262, %f261, %f136, %f253; // begin inline asm { cvt.f32.f16 %f137, %rs142;} // end inline asm fma.rn.ftz.f32 %f263, %f261, %f137, %f254; // begin inline asm { cvt.f32.f16 %f138, %rs143;} // end inline asm fma.rn.ftz.f32 %f264, %f261, %f138, %f255; // begin inline asm { cvt.f32.f16 %f139, %rs144;} // end inline asm fma.rn.ftz.f32 %f265, %f261, %f139, %f256; // begin inline asm { cvt.f32.f16 %f140, %rs145;} // end inline asm fma.rn.ftz.f32 %f266, %f261, %f140, %f257; // begin inline asm { cvt.f32.f16 %f141, %rs146;} // end inline asm fma.rn.ftz.f32 %f267, %f261, %f141, %f258; cvt.rn.f32.s16 %f268, %rs47; sub.ftz.f32 %f269, %f268, %f8; mul.ftz.f32 %f270, %f81, %f269; mov.b32 {%rs147, %rs153}, %r113; // begin inline asm { cvt.f32.f16 %f142, %rs147;} // end inline asm fma.rn.ftz.f32 %f271, %f270, %f142, %f262; mov.b32 {%rs148, %rs154}, %r121; // begin inline asm { cvt.f32.f16 %f143, %rs148;} // end inline asm fma.rn.ftz.f32 %f272, %f270, %f143, %f263; mov.b32 {%rs149, %rs155}, %r129; // begin inline asm { cvt.f32.f16 %f144, %rs149;} // end inline asm fma.rn.ftz.f32 %f273, %f270, %f144, %f264; mov.b32 {%rs150, %rs156}, %r137; // begin inline asm { cvt.f32.f16 %f145, %rs150;} // end inline asm fma.rn.ftz.f32 %f274, %f270, %f145, %f265; mov.b32 {%rs151, %rs157}, %r145; // begin inline asm { cvt.f32.f16 %f146, %rs151;} // end inline asm fma.rn.ftz.f32 %f275, %f270, %f146, %f266; mov.b32 {%rs152, %rs158}, %r153; // begin inline asm { cvt.f32.f16 %f147, %rs152;} // end inline asm fma.rn.ftz.f32 %f276, %f270, %f147, %f267; cvt.rn.f32.s16 %f277, %rs49; sub.ftz.f32 %f278, %f277, %f8; mul.ftz.f32 %f279, %f81, %f278; // begin inline asm { cvt.f32.f16 %f148, %rs153;} // end inline asm fma.rn.ftz.f32 %f280, %f279, %f148, %f271; // begin inline asm { cvt.f32.f16 %f149, %rs154;} // end inline asm fma.rn.ftz.f32 %f281, %f279, %f149, %f272; // begin inline asm { cvt.f32.f16 %f150, %rs155;} // end inline asm fma.rn.ftz.f32 %f282, %f279, %f150, %f273; // begin inline asm { cvt.f32.f16 %f151, %rs156;} // end inline asm fma.rn.ftz.f32 %f283, %f279, %f151, %f274; // begin inline asm { cvt.f32.f16 %f152, %rs157;} // end inline asm fma.rn.ftz.f32 %f284, %f279, %f152, %f275; // begin inline asm { cvt.f32.f16 %f153, %rs158;} // end inline asm fma.rn.ftz.f32 %f285, %f279, %f153, %f276; cvt.rn.f32.s16 %f286, %rs51; sub.ftz.f32 %f287, %f286, %f8; mul.ftz.f32 %f288, %f81, %f287; mov.b32 {%rs159, %rs165}, %r114; // begin inline asm { cvt.f32.f16 %f154, %rs159;} // end inline asm fma.rn.ftz.f32 %f289, %f288, %f154, %f280; mov.b32 {%rs160, %rs166}, %r122; // begin inline asm { cvt.f32.f16 %f155, %rs160;} // end inline asm fma.rn.ftz.f32 %f290, %f288, %f155, %f281; mov.b32 {%rs161, %rs167}, %r130; // begin inline asm { cvt.f32.f16 %f156, %rs161;} // end inline asm fma.rn.ftz.f32 %f291, %f288, %f156, %f282; mov.b32 {%rs162, %rs168}, %r138; // begin inline asm { cvt.f32.f16 %f157, %rs162;} // end inline asm fma.rn.ftz.f32 %f292, %f288, %f157, %f283; mov.b32 {%rs163, %rs169}, 
%r146; // begin inline asm { cvt.f32.f16 %f158, %rs163;} // end inline asm fma.rn.ftz.f32 %f293, %f288, %f158, %f284; mov.b32 {%rs164, %rs170}, %r154; // begin inline asm { cvt.f32.f16 %f159, %rs164;} // end inline asm fma.rn.ftz.f32 %f294, %f288, %f159, %f285; cvt.rn.f32.s16 %f295, %rs53; sub.ftz.f32 %f296, %f295, %f8; mul.ftz.f32 %f297, %f81, %f296; // begin inline asm { cvt.f32.f16 %f160, %rs165;} // end inline asm fma.rn.ftz.f32 %f298, %f297, %f160, %f289; // begin inline asm { cvt.f32.f16 %f161, %rs166;} // end inline asm fma.rn.ftz.f32 %f299, %f297, %f161, %f290; // begin inline asm { cvt.f32.f16 %f162, %rs167;} // end inline asm fma.rn.ftz.f32 %f300, %f297, %f162, %f291; // begin inline asm { cvt.f32.f16 %f163, %rs168;} // end inline asm fma.rn.ftz.f32 %f301, %f297, %f163, %f292; // begin inline asm { cvt.f32.f16 %f164, %rs169;} // end inline asm fma.rn.ftz.f32 %f302, %f297, %f164, %f293; // begin inline asm { cvt.f32.f16 %f165, %rs170;} // end inline asm fma.rn.ftz.f32 %f303, %f297, %f165, %f294; cvt.rn.f32.s16 %f304, %rs55; sub.ftz.f32 %f305, %f304, %f8; mul.ftz.f32 %f306, %f81, %f305; mov.b32 {%rs171, %rs177}, %r115; // begin inline asm { cvt.f32.f16 %f166, %rs171;} // end inline asm fma.rn.ftz.f32 %f307, %f306, %f166, %f298; mov.b32 {%rs172, %rs178}, %r123; // begin inline asm { cvt.f32.f16 %f167, %rs172;} // end inline asm fma.rn.ftz.f32 %f308, %f306, %f167, %f299; mov.b32 {%rs173, %rs179}, %r131; // begin inline asm { cvt.f32.f16 %f168, %rs173;} // end inline asm fma.rn.ftz.f32 %f309, %f306, %f168, %f300; mov.b32 {%rs174, %rs180}, %r139; // begin inline asm { cvt.f32.f16 %f169, %rs174;} // end inline asm fma.rn.ftz.f32 %f310, %f306, %f169, %f301; mov.b32 {%rs175, %rs181}, %r147; // begin inline asm { cvt.f32.f16 %f170, %rs175;} // end inline asm fma.rn.ftz.f32 %f311, %f306, %f170, %f302; mov.b32 {%rs176, %rs182}, %r155; // begin inline asm { cvt.f32.f16 %f171, %rs176;} // end inline asm fma.rn.ftz.f32 %f312, %f306, %f171, %f303; cvt.rn.f32.s16 %f313, %rs56; sub.ftz.f32 %f314, %f313, %f8; mul.ftz.f32 %f315, %f81, %f314; // begin inline asm { cvt.f32.f16 %f172, %rs177;} // end inline asm fma.rn.ftz.f32 %f693, %f315, %f172, %f307; // begin inline asm { cvt.f32.f16 %f173, %rs178;} // end inline asm fma.rn.ftz.f32 %f692, %f315, %f173, %f308; // begin inline asm { cvt.f32.f16 %f174, %rs179;} // end inline asm fma.rn.ftz.f32 %f691, %f315, %f174, %f309; // begin inline asm { cvt.f32.f16 %f175, %rs180;} // end inline asm fma.rn.ftz.f32 %f690, %f315, %f175, %f310; // begin inline asm { cvt.f32.f16 %f176, %rs181;} // end inline asm fma.rn.ftz.f32 %f689, %f315, %f176, %f311; // begin inline asm { cvt.f32.f16 %f177, %rs182;} // end inline asm fma.rn.ftz.f32 %f688, %f315, %f177, %f312; bra.uni $L__BB0_8; $L__BB0_7: shl.b16 %rs279, %rs5, 4; cvt.s16.s8 %rs280, %rs279; shr.s16 %rs281, %rs280, 7; and.b16 %rs282, %rs281, -16; or.b16 %rs283, %rs282, %rs6; cvt.rn.f32.s16 %f412, %rs283; sub.ftz.f32 %f413, %f412, %f8; mul.ftz.f32 %f414, %f81, %f413; // begin inline asm { cvt.f32.f16 %f316, %rs9;} // end inline asm fma.rn.ftz.f32 %f415, %f414, %f316, %f693; // begin inline asm { cvt.f32.f16 %f317, %rs12;} // end inline asm fma.rn.ftz.f32 %f416, %f414, %f317, %f692; // begin inline asm { cvt.f32.f16 %f318, %rs15;} // end inline asm fma.rn.ftz.f32 %f417, %f414, %f318, %f691; // begin inline asm { cvt.f32.f16 %f319, %rs18;} // end inline asm fma.rn.ftz.f32 %f418, %f414, %f319, %f690; // begin inline asm { cvt.f32.f16 %f320, %rs21;} // end inline asm fma.rn.ftz.f32 %f419, %f414, %f320, %f689; // begin 
inline asm { cvt.f32.f16 %f321, %rs24;} // end inline asm fma.rn.ftz.f32 %f420, %f414, %f321, %f688; shl.b16 %rs284, %rs25, 4; cvt.s16.s8 %rs285, %rs284; shr.s16 %rs286, %rs285, 7; and.b16 %rs287, %rs286, -16; or.b16 %rs288, %rs287, %rs26; cvt.rn.f32.s16 %f421, %rs288; sub.ftz.f32 %f422, %f421, %f8; mul.ftz.f32 %f423, %f81, %f422; // begin inline asm { cvt.f32.f16 %f322, %rs8;} // end inline asm fma.rn.ftz.f32 %f424, %f423, %f322, %f415; // begin inline asm { cvt.f32.f16 %f323, %rs11;} // end inline asm fma.rn.ftz.f32 %f425, %f423, %f323, %f416; // begin inline asm { cvt.f32.f16 %f324, %rs14;} // end inline asm fma.rn.ftz.f32 %f426, %f423, %f324, %f417; // begin inline asm { cvt.f32.f16 %f325, %rs17;} // end inline asm fma.rn.ftz.f32 %f427, %f423, %f325, %f418; // begin inline asm { cvt.f32.f16 %f326, %rs20;} // end inline asm fma.rn.ftz.f32 %f428, %f423, %f326, %f419; // begin inline asm { cvt.f32.f16 %f327, %rs23;} // end inline asm fma.rn.ftz.f32 %f429, %f423, %f327, %f420; shl.b16 %rs295, %rs27, 4; cvt.s16.s8 %rs296, %rs295; shr.s16 %rs297, %rs296, 7; and.b16 %rs298, %rs297, -16; or.b16 %rs299, %rs298, %rs28; cvt.rn.f32.s16 %f430, %rs299; sub.ftz.f32 %f431, %f430, %f8; mul.ftz.f32 %f432, %f81, %f431; mov.b32 {%rs195, %rs201}, %r73; // begin inline asm { cvt.f32.f16 %f328, %rs195;} // end inline asm fma.rn.ftz.f32 %f433, %f432, %f328, %f424; mov.b32 {%rs196, %rs202}, %r77; // begin inline asm { cvt.f32.f16 %f329, %rs196;} // end inline asm fma.rn.ftz.f32 %f434, %f432, %f329, %f425; mov.b32 {%rs197, %rs203}, %r83; // begin inline asm { cvt.f32.f16 %f330, %rs197;} // end inline asm fma.rn.ftz.f32 %f435, %f432, %f330, %f426; mov.b32 {%rs198, %rs204}, %r87; // begin inline asm { cvt.f32.f16 %f331, %rs198;} // end inline asm fma.rn.ftz.f32 %f436, %f432, %f331, %f427; mov.b32 {%rs199, %rs205}, %r91; // begin inline asm { cvt.f32.f16 %f332, %rs199;} // end inline asm fma.rn.ftz.f32 %f437, %f432, %f332, %f428; mov.b32 {%rs200, %rs206}, %r95; // begin inline asm { cvt.f32.f16 %f333, %rs200;} // end inline asm fma.rn.ftz.f32 %f438, %f432, %f333, %f429; shl.b16 %rs300, %rs29, 4; cvt.s16.s8 %rs301, %rs300; shr.s16 %rs302, %rs301, 7; and.b16 %rs303, %rs302, -16; or.b16 %rs304, %rs303, %rs30; cvt.rn.f32.s16 %f439, %rs304; sub.ftz.f32 %f440, %f439, %f8; mul.ftz.f32 %f441, %f81, %f440; // begin inline asm { cvt.f32.f16 %f334, %rs201;} // end inline asm fma.rn.ftz.f32 %f442, %f441, %f334, %f433; // begin inline asm { cvt.f32.f16 %f335, %rs202;} // end inline asm fma.rn.ftz.f32 %f443, %f441, %f335, %f434; // begin inline asm { cvt.f32.f16 %f336, %rs203;} // end inline asm fma.rn.ftz.f32 %f444, %f441, %f336, %f435; // begin inline asm { cvt.f32.f16 %f337, %rs204;} // end inline asm fma.rn.ftz.f32 %f445, %f441, %f337, %f436; // begin inline asm { cvt.f32.f16 %f338, %rs205;} // end inline asm fma.rn.ftz.f32 %f446, %f441, %f338, %f437; // begin inline asm { cvt.f32.f16 %f339, %rs206;} // end inline asm fma.rn.ftz.f32 %f447, %f441, %f339, %f438; shl.b16 %rs305, %rs31, 4; cvt.s16.s8 %rs306, %rs305; shr.s16 %rs307, %rs306, 7; and.b16 %rs308, %rs307, -16; or.b16 %rs309, %rs308, %rs32; cvt.rn.f32.s16 %f448, %rs309; sub.ftz.f32 %f449, %f448, %f8; mul.ftz.f32 %f450, %f81, %f449; mov.b32 {%rs207, %rs213}, %r74; // begin inline asm { cvt.f32.f16 %f340, %rs207;} // end inline asm fma.rn.ftz.f32 %f451, %f450, %f340, %f442; mov.b32 {%rs208, %rs214}, %r78; // begin inline asm { cvt.f32.f16 %f341, %rs208;} // end inline asm fma.rn.ftz.f32 %f452, %f450, %f341, %f443; cvt.u16.u32 %rs209, %r84; // begin inline asm { 
cvt.f32.f16 %f342, %rs209;} // end inline asm fma.rn.ftz.f32 %f453, %f450, %f342, %f444; cvt.u16.u32 %rs210, %r88; // begin inline asm { cvt.f32.f16 %f343, %rs210;} // end inline asm fma.rn.ftz.f32 %f454, %f450, %f343, %f445; cvt.u16.u32 %rs211, %r92; // begin inline asm { cvt.f32.f16 %f344, %rs211;} // end inline asm fma.rn.ftz.f32 %f455, %f450, %f344, %f446; cvt.u16.u32 %rs212, %r96; // begin inline asm { cvt.f32.f16 %f345, %rs212;} // end inline asm fma.rn.ftz.f32 %f456, %f450, %f345, %f447; shl.b16 %rs310, %rs33, 4; cvt.s16.s8 %rs311, %rs310; shr.s16 %rs312, %rs311, 7; and.b16 %rs313, %rs312, -16; or.b16 %rs314, %rs313, %rs34; cvt.rn.f32.s16 %f457, %rs314; sub.ftz.f32 %f458, %f457, %f8; mul.ftz.f32 %f459, %f81, %f458; // begin inline asm { cvt.f32.f16 %f346, %rs213;} // end inline asm fma.rn.ftz.f32 %f460, %f459, %f346, %f451; // begin inline asm { cvt.f32.f16 %f347, %rs214;} // end inline asm fma.rn.ftz.f32 %f461, %f459, %f347, %f452; // begin inline asm { cvt.f32.f16 %f348, %rs35;} // end inline asm fma.rn.ftz.f32 %f462, %f459, %f348, %f453; // begin inline asm { cvt.f32.f16 %f349, %rs36;} // end inline asm fma.rn.ftz.f32 %f463, %f459, %f349, %f454; // begin inline asm { cvt.f32.f16 %f350, %rs37;} // end inline asm fma.rn.ftz.f32 %f464, %f459, %f350, %f455; // begin inline asm { cvt.f32.f16 %f351, %rs38;} // end inline asm fma.rn.ftz.f32 %f465, %f459, %f351, %f456; shl.b16 %rs315, %rs39, 4; cvt.s16.s8 %rs316, %rs315; shr.s16 %rs317, %rs316, 7; and.b16 %rs318, %rs317, -16; or.b16 %rs319, %rs318, %rs40; cvt.rn.f32.s16 %f466, %rs319; sub.ftz.f32 %f467, %f466, %f8; mul.ftz.f32 %f468, %f81, %f467; mov.b32 {%rs219, %rs225}, %r75; // begin inline asm { cvt.f32.f16 %f352, %rs219;} // end inline asm fma.rn.ftz.f32 %f469, %f468, %f352, %f460; mov.b32 {%rs220, %rs226}, %r79; // begin inline asm { cvt.f32.f16 %f353, %rs220;} // end inline asm fma.rn.ftz.f32 %f470, %f468, %f353, %f461; mov.b32 {%rs221, %rs227}, %r85; // begin inline asm { cvt.f32.f16 %f354, %rs221;} // end inline asm fma.rn.ftz.f32 %f471, %f468, %f354, %f462; mov.b32 {%rs222, %rs228}, %r89; // begin inline asm { cvt.f32.f16 %f355, %rs222;} // end inline asm fma.rn.ftz.f32 %f472, %f468, %f355, %f463; mov.b32 {%rs223, %rs229}, %r93; // begin inline asm { cvt.f32.f16 %f356, %rs223;} // end inline asm fma.rn.ftz.f32 %f473, %f468, %f356, %f464; mov.b32 {%rs224, %rs230}, %r97; // begin inline asm { cvt.f32.f16 %f357, %rs224;} // end inline asm fma.rn.ftz.f32 %f474, %f468, %f357, %f465; shl.b16 %rs320, %rs41, 4; cvt.s16.s8 %rs321, %rs320; shr.s16 %rs322, %rs321, 7; and.b16 %rs323, %rs322, -16; or.b16 %rs324, %rs323, %rs41; cvt.rn.f32.s16 %f475, %rs324; sub.ftz.f32 %f476, %f475, %f8; mul.ftz.f32 %f477, %f81, %f476; // begin inline asm { cvt.f32.f16 %f358, %rs225;} // end inline asm fma.rn.ftz.f32 %f478, %f477, %f358, %f469; // begin inline asm { cvt.f32.f16 %f359, %rs226;} // end inline asm fma.rn.ftz.f32 %f479, %f477, %f359, %f470; // begin inline asm { cvt.f32.f16 %f360, %rs227;} // end inline asm fma.rn.ftz.f32 %f480, %f477, %f360, %f471; // begin inline asm { cvt.f32.f16 %f361, %rs228;} // end inline asm fma.rn.ftz.f32 %f481, %f477, %f361, %f472; // begin inline asm { cvt.f32.f16 %f362, %rs229;} // end inline asm fma.rn.ftz.f32 %f482, %f477, %f362, %f473; // begin inline asm { cvt.f32.f16 %f363, %rs230;} // end inline asm fma.rn.ftz.f32 %f483, %f477, %f363, %f474; ld.global.v4.u32 {%r160, %r161, %r162, %r163}, [%rd7+16]; ld.global.v4.u32 {%r168, %r169, %r170, %r171}, [%rd8+16]; ld.global.v4.u32 {%r176, %r177, %r178, %r179}, [%rd9]; 
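// (the six v4.u32 loads around this point fetch elements 8..15 of the current
// 16-element x slice for each of the six batch rows)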
ld.global.v4.u32 {%r184, %r185, %r186, %r187}, [%rd10]; ld.global.v4.u32 {%r192, %r193, %r194, %r195}, [%rd11]; ld.global.v4.u32 {%r200, %r201, %r202, %r203}, [%rd12]; shl.b16 %rs325, %rs42, 4; cvt.s16.s8 %rs326, %rs325; shr.s16 %rs327, %rs326, 7; and.b16 %rs328, %rs327, -16; or.b16 %rs329, %rs328, %rs43; cvt.rn.f32.s16 %f484, %rs329; sub.ftz.f32 %f485, %f484, %f8; mul.ftz.f32 %f486, %f81, %f485; mov.b32 {%rs231, %rs237}, %r160; // begin inline asm { cvt.f32.f16 %f364, %rs231;} // end inline asm fma.rn.ftz.f32 %f487, %f486, %f364, %f478; mov.b32 {%rs232, %rs238}, %r168; // begin inline asm { cvt.f32.f16 %f365, %rs232;} // end inline asm fma.rn.ftz.f32 %f488, %f486, %f365, %f479; mov.b32 {%rs233, %rs239}, %r176; // begin inline asm { cvt.f32.f16 %f366, %rs233;} // end inline asm fma.rn.ftz.f32 %f489, %f486, %f366, %f480; mov.b32 {%rs234, %rs240}, %r184; // begin inline asm { cvt.f32.f16 %f367, %rs234;} // end inline asm fma.rn.ftz.f32 %f490, %f486, %f367, %f481; mov.b32 {%rs235, %rs241}, %r192; // begin inline asm { cvt.f32.f16 %f368, %rs235;} // end inline asm fma.rn.ftz.f32 %f491, %f486, %f368, %f482; mov.b32 {%rs236, %rs242}, %r200; // begin inline asm { cvt.f32.f16 %f369, %rs236;} // end inline asm fma.rn.ftz.f32 %f492, %f486, %f369, %f483; shl.b16 %rs330, %rs44, 4; cvt.s16.s8 %rs331, %rs330; shr.s16 %rs332, %rs331, 7; and.b16 %rs333, %rs332, -16; or.b16 %rs334, %rs333, %rs45; cvt.rn.f32.s16 %f493, %rs334; sub.ftz.f32 %f494, %f493, %f8; mul.ftz.f32 %f495, %f81, %f494; // begin inline asm { cvt.f32.f16 %f370, %rs237;} // end inline asm fma.rn.ftz.f32 %f496, %f495, %f370, %f487; // begin inline asm { cvt.f32.f16 %f371, %rs238;} // end inline asm fma.rn.ftz.f32 %f497, %f495, %f371, %f488; // begin inline asm { cvt.f32.f16 %f372, %rs239;} // end inline asm fma.rn.ftz.f32 %f498, %f495, %f372, %f489; // begin inline asm { cvt.f32.f16 %f373, %rs240;} // end inline asm fma.rn.ftz.f32 %f499, %f495, %f373, %f490; // begin inline asm { cvt.f32.f16 %f374, %rs241;} // end inline asm fma.rn.ftz.f32 %f500, %f495, %f374, %f491; // begin inline asm { cvt.f32.f16 %f375, %rs242;} // end inline asm fma.rn.ftz.f32 %f501, %f495, %f375, %f492; shl.b16 %rs335, %rs46, 4; cvt.s16.s8 %rs336, %rs335; shr.s16 %rs337, %rs336, 7; and.b16 %rs338, %rs337, -16; or.b16 %rs339, %rs338, %rs47; cvt.rn.f32.s16 %f502, %rs339; sub.ftz.f32 %f503, %f502, %f8; mul.ftz.f32 %f504, %f81, %f503; mov.b32 {%rs243, %rs249}, %r161; // begin inline asm { cvt.f32.f16 %f376, %rs243;} // end inline asm fma.rn.ftz.f32 %f505, %f504, %f376, %f496; mov.b32 {%rs244, %rs250}, %r169; // begin inline asm { cvt.f32.f16 %f377, %rs244;} // end inline asm fma.rn.ftz.f32 %f506, %f504, %f377, %f497; mov.b32 {%rs245, %rs251}, %r177; // begin inline asm { cvt.f32.f16 %f378, %rs245;} // end inline asm fma.rn.ftz.f32 %f507, %f504, %f378, %f498; mov.b32 {%rs246, %rs252}, %r185; // begin inline asm { cvt.f32.f16 %f379, %rs246;} // end inline asm fma.rn.ftz.f32 %f508, %f504, %f379, %f499; mov.b32 {%rs247, %rs253}, %r193; // begin inline asm { cvt.f32.f16 %f380, %rs247;} // end inline asm fma.rn.ftz.f32 %f509, %f504, %f380, %f500; mov.b32 {%rs248, %rs254}, %r201; // begin inline asm { cvt.f32.f16 %f381, %rs248;} // end inline asm fma.rn.ftz.f32 %f510, %f504, %f381, %f501; shl.b16 %rs340, %rs48, 4; cvt.s16.s8 %rs341, %rs340; shr.s16 %rs342, %rs341, 7; and.b16 %rs343, %rs342, -16; or.b16 %rs344, %rs343, %rs49; cvt.rn.f32.s16 %f511, %rs344; sub.ftz.f32 %f512, %f511, %f8; mul.ftz.f32 %f513, %f81, %f512; // begin inline asm { cvt.f32.f16 %f382, %rs249;} // end inline 
asm fma.rn.ftz.f32 %f514, %f513, %f382, %f505; // begin inline asm { cvt.f32.f16 %f383, %rs250;} // end inline asm fma.rn.ftz.f32 %f515, %f513, %f383, %f506; // begin inline asm { cvt.f32.f16 %f384, %rs251;} // end inline asm fma.rn.ftz.f32 %f516, %f513, %f384, %f507; // begin inline asm { cvt.f32.f16 %f385, %rs252;} // end inline asm fma.rn.ftz.f32 %f517, %f513, %f385, %f508; // begin inline asm { cvt.f32.f16 %f386, %rs253;} // end inline asm fma.rn.ftz.f32 %f518, %f513, %f386, %f509; // begin inline asm { cvt.f32.f16 %f387, %rs254;} // end inline asm fma.rn.ftz.f32 %f519, %f513, %f387, %f510; shl.b16 %rs345, %rs50, 4; cvt.s16.s8 %rs346, %rs345; shr.s16 %rs347, %rs346, 7; and.b16 %rs348, %rs347, -16; or.b16 %rs349, %rs348, %rs51; cvt.rn.f32.s16 %f520, %rs349; sub.ftz.f32 %f521, %f520, %f8; mul.ftz.f32 %f522, %f81, %f521; mov.b32 {%rs255, %rs261}, %r162; // begin inline asm { cvt.f32.f16 %f388, %rs255;} // end inline asm fma.rn.ftz.f32 %f523, %f522, %f388, %f514; mov.b32 {%rs256, %rs262}, %r170; // begin inline asm { cvt.f32.f16 %f389, %rs256;} // end inline asm fma.rn.ftz.f32 %f524, %f522, %f389, %f515; mov.b32 {%rs257, %rs263}, %r178; // begin inline asm { cvt.f32.f16 %f390, %rs257;} // end inline asm fma.rn.ftz.f32 %f525, %f522, %f390, %f516; mov.b32 {%rs258, %rs264}, %r186; // begin inline asm { cvt.f32.f16 %f391, %rs258;} // end inline asm fma.rn.ftz.f32 %f526, %f522, %f391, %f517; mov.b32 {%rs259, %rs265}, %r194; // begin inline asm { cvt.f32.f16 %f392, %rs259;} // end inline asm fma.rn.ftz.f32 %f527, %f522, %f392, %f518; mov.b32 {%rs260, %rs266}, %r202; // begin inline asm { cvt.f32.f16 %f393, %rs260;} // end inline asm fma.rn.ftz.f32 %f528, %f522, %f393, %f519; shl.b16 %rs350, %rs52, 4; cvt.s16.s8 %rs351, %rs350; shr.s16 %rs352, %rs351, 7; and.b16 %rs353, %rs352, -16; or.b16 %rs354, %rs353, %rs53; cvt.rn.f32.s16 %f529, %rs354; sub.ftz.f32 %f530, %f529, %f8; mul.ftz.f32 %f531, %f81, %f530; // begin inline asm { cvt.f32.f16 %f394, %rs261;} // end inline asm fma.rn.ftz.f32 %f532, %f531, %f394, %f523; // begin inline asm { cvt.f32.f16 %f395, %rs262;} // end inline asm fma.rn.ftz.f32 %f533, %f531, %f395, %f524; // begin inline asm { cvt.f32.f16 %f396, %rs263;} // end inline asm fma.rn.ftz.f32 %f534, %f531, %f396, %f525; // begin inline asm { cvt.f32.f16 %f397, %rs264;} // end inline asm fma.rn.ftz.f32 %f535, %f531, %f397, %f526; // begin inline asm { cvt.f32.f16 %f398, %rs265;} // end inline asm fma.rn.ftz.f32 %f536, %f531, %f398, %f527; // begin inline asm { cvt.f32.f16 %f399, %rs266;} // end inline asm fma.rn.ftz.f32 %f537, %f531, %f399, %f528; shl.b16 %rs355, %rs54, 4; cvt.s16.s8 %rs356, %rs355; shr.s16 %rs357, %rs356, 7; and.b16 %rs358, %rs357, -16; or.b16 %rs359, %rs358, %rs55; cvt.rn.f32.s16 %f538, %rs359; sub.ftz.f32 %f539, %f538, %f8; mul.ftz.f32 %f540, %f81, %f539; mov.b32 {%rs267, %rs273}, %r163; // begin inline asm { cvt.f32.f16 %f400, %rs267;} // end inline asm fma.rn.ftz.f32 %f541, %f540, %f400, %f532; mov.b32 {%rs268, %rs274}, %r171; // begin inline asm { cvt.f32.f16 %f401, %rs268;} // end inline asm fma.rn.ftz.f32 %f542, %f540, %f401, %f533; mov.b32 {%rs269, %rs275}, %r179; // begin inline asm { cvt.f32.f16 %f402, %rs269;} // end inline asm fma.rn.ftz.f32 %f543, %f540, %f402, %f534; mov.b32 {%rs270, %rs276}, %r187; // begin inline asm { cvt.f32.f16 %f403, %rs270;} // end inline asm fma.rn.ftz.f32 %f544, %f540, %f403, %f535; mov.b32 {%rs271, %rs277}, %r195; // begin inline asm { cvt.f32.f16 %f404, %rs271;} // end inline asm fma.rn.ftz.f32 %f545, %f540, %f404, %f536; 
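// (the recurring shl.b16 ..., 4 / cvt.s16.s8 / shr.s16 ..., 7 / and ..., -16 / or
// sequence sign-extends each 4-bit nibble to a signed 16-bit value before scaling)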
mov.b32 {%rs272, %rs278}, %r203; // begin inline asm { cvt.f32.f16 %f405, %rs272;} // end inline asm fma.rn.ftz.f32 %f546, %f540, %f405, %f537; shl.b16 %rs360, %rs56, 4; cvt.s16.s8 %rs361, %rs360; shr.s16 %rs362, %rs361, 7; and.b16 %rs363, %rs362, -16; or.b16 %rs364, %rs363, %rs56; cvt.rn.f32.s16 %f547, %rs364; sub.ftz.f32 %f548, %f547, %f8; mul.ftz.f32 %f549, %f81, %f548; // begin inline asm { cvt.f32.f16 %f406, %rs273;} // end inline asm fma.rn.ftz.f32 %f693, %f549, %f406, %f541; // begin inline asm { cvt.f32.f16 %f407, %rs274;} // end inline asm fma.rn.ftz.f32 %f692, %f549, %f407, %f542; // begin inline asm { cvt.f32.f16 %f408, %rs275;} // end inline asm fma.rn.ftz.f32 %f691, %f549, %f408, %f543; // begin inline asm { cvt.f32.f16 %f409, %rs276;} // end inline asm fma.rn.ftz.f32 %f690, %f549, %f409, %f544; // begin inline asm { cvt.f32.f16 %f410, %rs277;} // end inline asm fma.rn.ftz.f32 %f689, %f549, %f410, %f545; // begin inline asm { cvt.f32.f16 %f411, %rs278;} // end inline asm fma.rn.ftz.f32 %f688, %f549, %f411, %f546; $L__BB0_8: add.s32 %r339, %r339, 4; shl.b32 %r208, %r339, 5; add.s32 %r338, %r208, %r55; shl.b32 %r337, %r338, 1; setp.lt.u32 %p7, %r337, %r52; @%p7 bra $L__BB0_2; $L__BB0_9: mov.u32 %r332, %tid.y; mov.u32 %r331, %tid.x; shl.b32 %r330, %r332, 5; add.s32 %r329, %r330, %r331; shl.b32 %r210, %r329, 2; mov.u32 %r211, _ZZ9gemv_int4ILi4ELi64ELi6EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage; add.s32 %r212, %r211, %r210; setp.lt.u32 %p8, %r329, 32; @%p8 bra $L__BB0_11; add.s32 %r323, %r212, -112; st.shared.f32 [%r323], %f693; $L__BB0_11: mov.u32 %r336, %tid.y; mov.u32 %r335, %tid.x; shl.b32 %r334, %r336, 5; add.s32 %r333, %r334, %r335; setp.gt.u32 %p9, %r333, 31; bar.sync 0; mad.lo.s32 %r43, %r333, 12, %r211; @%p9 bra $L__BB0_13; mov.u32 %r227, 16; ld.shared.f32 %f565, [%r43+16]; add.ftz.f32 %f566, %f693, %f565; ld.shared.f32 %f567, [%r43+20]; add.ftz.f32 %f568, %f566, %f567; ld.shared.f32 %f569, [%r43+24]; add.ftz.f32 %f552, %f568, %f569; mov.u32 %r215, 1; mov.u32 %r228, 31; mov.u32 %r229, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f552, %r215, %r228, %r229; @p add.f32 r0, r0, %f552; mov.f32 %f550, r0;} // end inline asm mov.u32 %r218, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f550, %r218, %r228, %r229; @p add.f32 r0, r0, %f550; mov.f32 %f553, r0;} // end inline asm mov.u32 %r221, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f553, %r221, %r228, %r229; @p add.f32 r0, r0, %f553; mov.f32 %f556, r0;} // end inline asm mov.u32 %r224, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f556, %r224, %r228, %r229; @p add.f32 r0, r0, %f556; mov.f32 %f559, r0;} // end inline asm // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f559, %r227, %r228, %r229; @p add.f32 r0, r0, %f559; mov.f32 %f693, r0;} // end inline asm $L__BB0_13: @%p8 bra $L__BB0_15; add.s32 %r324, %r212, -112; st.shared.f32 [%r324+640], %f692; $L__BB0_15: bar.sync 0; @%p9 bra $L__BB0_17; ld.shared.f32 %f585, [%r43+656]; add.ftz.f32 %f586, %f692, %f585; ld.shared.f32 %f587, [%r43+660]; add.ftz.f32 %f588, %f586, %f587; ld.shared.f32 %f589, [%r43+664]; add.ftz.f32 %f572, %f588, %f589; mov.u32 %r231, 1; mov.u32 %r244, 31; mov.u32 %r245, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f572, %r231, %r244, %r245; @p add.f32 r0, r0, %f572; mov.f32 %f570, r0;} // end inline asm mov.u32 %r234, 2; // begin inline asm { .reg .f32 r0; 
.reg .pred p; shfl.sync.down.b32 r0|p, %f570, %r234, %r244, %r245; @p add.f32 r0, r0, %f570; mov.f32 %f573, r0;} // end inline asm mov.u32 %r237, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f573, %r237, %r244, %r245; @p add.f32 r0, r0, %f573; mov.f32 %f576, r0;} // end inline asm mov.u32 %r240, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f576, %r240, %r244, %r245; @p add.f32 r0, r0, %f576; mov.f32 %f579, r0;} // end inline asm mov.u32 %r243, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f579, %r243, %r244, %r245; @p add.f32 r0, r0, %f579; mov.f32 %f692, r0;} // end inline asm $L__BB0_17: @%p8 bra $L__BB0_19; add.s32 %r325, %r212, -112; st.shared.f32 [%r325+1280], %f691; $L__BB0_19: bar.sync 0; @%p9 bra $L__BB0_21; ld.shared.f32 %f605, [%r43+1296]; add.ftz.f32 %f606, %f691, %f605; ld.shared.f32 %f607, [%r43+1300]; add.ftz.f32 %f608, %f606, %f607; ld.shared.f32 %f609, [%r43+1304]; add.ftz.f32 %f592, %f608, %f609; mov.u32 %r247, 1; mov.u32 %r260, 31; mov.u32 %r261, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f592, %r247, %r260, %r261; @p add.f32 r0, r0, %f592; mov.f32 %f590, r0;} // end inline asm mov.u32 %r250, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f590, %r250, %r260, %r261; @p add.f32 r0, r0, %f590; mov.f32 %f593, r0;} // end inline asm mov.u32 %r253, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f593, %r253, %r260, %r261; @p add.f32 r0, r0, %f593; mov.f32 %f596, r0;} // end inline asm mov.u32 %r256, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f596, %r256, %r260, %r261; @p add.f32 r0, r0, %f596; mov.f32 %f599, r0;} // end inline asm mov.u32 %r259, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f599, %r259, %r260, %r261; @p add.f32 r0, r0, %f599; mov.f32 %f691, r0;} // end inline asm $L__BB0_21: @%p8 bra $L__BB0_23; add.s32 %r326, %r212, -112; st.shared.f32 [%r326+1920], %f690; $L__BB0_23: bar.sync 0; @%p9 bra $L__BB0_25; ld.shared.f32 %f625, [%r43+1936]; add.ftz.f32 %f626, %f690, %f625; ld.shared.f32 %f627, [%r43+1940]; add.ftz.f32 %f628, %f626, %f627; ld.shared.f32 %f629, [%r43+1944]; add.ftz.f32 %f612, %f628, %f629; mov.u32 %r263, 1; mov.u32 %r276, 31; mov.u32 %r277, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f612, %r263, %r276, %r277; @p add.f32 r0, r0, %f612; mov.f32 %f610, r0;} // end inline asm mov.u32 %r266, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f610, %r266, %r276, %r277; @p add.f32 r0, r0, %f610; mov.f32 %f613, r0;} // end inline asm mov.u32 %r269, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f613, %r269, %r276, %r277; @p add.f32 r0, r0, %f613; mov.f32 %f616, r0;} // end inline asm mov.u32 %r272, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f616, %r272, %r276, %r277; @p add.f32 r0, r0, %f616; mov.f32 %f619, r0;} // end inline asm mov.u32 %r275, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f619, %r275, %r276, %r277; @p add.f32 r0, r0, %f619; mov.f32 %f690, r0;} // end inline asm $L__BB0_25: @%p8 bra $L__BB0_27; add.s32 %r327, %r212, -112; st.shared.f32 [%r327+2560], %f689; $L__BB0_27: bar.sync 0; @%p9 bra $L__BB0_29; ld.shared.f32 %f645, [%r43+2576]; add.ftz.f32 %f646, %f689, %f645; ld.shared.f32 %f647, [%r43+2580]; add.ftz.f32 
%f648, %f646, %f647; ld.shared.f32 %f649, [%r43+2584]; add.ftz.f32 %f632, %f648, %f649; mov.u32 %r279, 1; mov.u32 %r292, 31; mov.u32 %r293, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f632, %r279, %r292, %r293; @p add.f32 r0, r0, %f632; mov.f32 %f630, r0;} // end inline asm mov.u32 %r282, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f630, %r282, %r292, %r293; @p add.f32 r0, r0, %f630; mov.f32 %f633, r0;} // end inline asm mov.u32 %r285, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f633, %r285, %r292, %r293; @p add.f32 r0, r0, %f633; mov.f32 %f636, r0;} // end inline asm mov.u32 %r288, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f636, %r288, %r292, %r293; @p add.f32 r0, r0, %f636; mov.f32 %f639, r0;} // end inline asm mov.u32 %r291, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f639, %r291, %r292, %r293; @p add.f32 r0, r0, %f639; mov.f32 %f689, r0;} // end inline asm $L__BB0_29: @%p8 bra $L__BB0_31; add.s32 %r328, %r212, -112; st.shared.f32 [%r328+3200], %f688; $L__BB0_31: bar.sync 0; @%p9 bra $L__BB0_33; ld.shared.f32 %f665, [%r43+3216]; add.ftz.f32 %f666, %f688, %f665; ld.shared.f32 %f667, [%r43+3220]; add.ftz.f32 %f668, %f666, %f667; ld.shared.f32 %f669, [%r43+3224]; add.ftz.f32 %f652, %f668, %f669; mov.u32 %r295, 1; mov.u32 %r308, 31; mov.u32 %r309, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f652, %r295, %r308, %r309; @p add.f32 r0, r0, %f652; mov.f32 %f650, r0;} // end inline asm mov.u32 %r298, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f650, %r298, %r308, %r309; @p add.f32 r0, r0, %f650; mov.f32 %f653, r0;} // end inline asm mov.u32 %r301, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f653, %r301, %r308, %r309; @p add.f32 r0, r0, %f653; mov.f32 %f656, r0;} // end inline asm mov.u32 %r304, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f656, %r304, %r308, %r309; @p add.f32 r0, r0, %f656; mov.f32 %f659, r0;} // end inline asm mov.u32 %r307, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f659, %r307, %r308, %r309; @p add.f32 r0, r0, %f659; mov.f32 %f688, r0;} // end inline asm $L__BB0_33: mov.u32 %r310, %tid.y; or.b32 %r312, %r55, %r310; setp.ne.s32 %p20, %r312, 0; @%p20 bra $L__BB0_47; ld.param.u64 %rd84, [_Z27dequant_gemv_group64_batch623DequantGemvKernelParams_param_0+8]; setp.eq.s64 %p21, %rd84, 0; mul.ftz.f32 %f706, %f73, %f693; mov.u32 %r313, %ctaid.x; cvt.s64.s32 %rd13, %r313; @%p21 bra $L__BB0_36; shl.b64 %rd51, %rd13, 1; add.s64 %rd52, %rd2, %rd51; ld.global.u16 %rs365, [%rd52]; // begin inline asm { cvt.f32.f16 %f670, %rs365;} // end inline asm fma.rn.ftz.f32 %f706, %f74, %f670, %f706; $L__BB0_36: ld.param.u64 %rd85, [_Z27dequant_gemv_group64_batch623DequantGemvKernelParams_param_0]; // begin inline asm { cvt.rn.f16.f32 %rs366, %f706;} // end inline asm // begin inline asm createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd53, 1.0; // end inline asm shl.b64 %rd56, %rd13, 1; add.s64 %rd54, %rd85, %rd56; // begin inline asm st.global.L1::evict_normal.L2::cache_hint.u16 [%rd54], %rs366, %rd53; // end inline asm mul.ftz.f32 %f707, %f73, %f692; add.s32 %r315, %r51, %r313; cvt.s64.s32 %rd16, %r315; @%p21 bra $L__BB0_38; shl.b64 %rd57, %rd16, 1; add.s64 %rd58, %rd2, %rd57; ld.global.u16 %rs368, [%rd58]; // begin inline asm { 
{ cvt.f32.f16 %f672, %rs368;}
// end inline asm
fma.rn.ftz.f32 %f707, %f74, %f672, %f707;
$L__BB0_38:
cvt.s64.s32 %rd17, %r51;
mul.wide.s32 %rd62, %r51, 2;
add.s64 %rd60, %rd54, %rd62;
// begin inline asm
{ cvt.rn.f16.f32 %rs369, %f707;}
// end inline asm
// begin inline asm
createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd59, 1.0;
// end inline asm
// begin inline asm
st.global.L1::evict_normal.L2::cache_hint.u16 [%rd60], %rs369, %rd59;
// end inline asm
mul.ftz.f32 %f708, %f73, %f691;
cvt.u32.u64 %r316, %rd16;
add.s32 %r317, %r316, %r51;
cvt.s64.s32 %rd18, %r317;
@%p21 bra $L__BB0_40;
shl.b64 %rd63, %rd18, 1;
add.s64 %rd64, %rd2, %rd63;
ld.global.u16 %rs371, [%rd64];
// begin inline asm
{ cvt.f32.f16 %f674, %rs371;}
// end inline asm
fma.rn.ftz.f32 %f708, %f74, %f674, %f708;
$L__BB0_40:
ld.param.u64 %rd86, [_Z27dequant_gemv_group64_batch623DequantGemvKernelParams_param_0];
shl.b64 %rd68, %rd18, 1;
add.s64 %rd66, %rd86, %rd68;
// begin inline asm
{ cvt.rn.f16.f32 %rs372, %f708;}
// end inline asm
// begin inline asm
createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd65, 1.0;
// end inline asm
// begin inline asm
st.global.L1::evict_normal.L2::cache_hint.u16 [%rd66], %rs372, %rd65;
// end inline asm
mul.ftz.f32 %f709, %f73, %f690;
cvt.u32.u64 %r318, %rd18;
add.s32 %r319, %r318, %r51;
cvt.s64.s32 %rd20, %r319;
@%p21 bra $L__BB0_42;
shl.b64 %rd69, %rd20, 1;
add.s64 %rd70, %rd2, %rd69;
ld.global.u16 %rs374, [%rd70];
// begin inline asm
{ cvt.f32.f16 %f676, %rs374;}
// end inline asm
fma.rn.ftz.f32 %f709, %f74, %f676, %f709;
$L__BB0_42:
// begin inline asm
{ cvt.rn.f16.f32 %rs375, %f709;}
// end inline asm
// begin inline asm
createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd71, 1.0;
// end inline asm
shl.b64 %rd21, %rd17, 1;
add.s64 %rd72, %rd66, %rd21;
// begin inline asm
st.global.L1::evict_normal.L2::cache_hint.u16 [%rd72], %rs375, %rd71;
// end inline asm
mul.ftz.f32 %f710, %f73, %f689;
cvt.u32.u64 %r320, %rd20;
add.s32 %r321, %r320, %r51;
cvt.s64.s32 %rd23, %r321;
@%p21 bra $L__BB0_44;
shl.b64 %rd74, %rd23, 1;
add.s64 %rd75, %rd2, %rd74;
ld.global.u16 %rs377, [%rd75];
// begin inline asm
{ cvt.f32.f16 %f678, %rs377;}
// end inline asm
fma.rn.ftz.f32 %f710, %f74, %f678, %f710;
$L__BB0_44:
// begin inline asm
{ cvt.rn.f16.f32 %rs378, %f710;}
// end inline asm
// begin inline asm
createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd76, 1.0;
// end inline asm
add.s64 %rd77, %rd72, %rd21;
// begin inline asm
st.global.L1::evict_normal.L2::cache_hint.u16 [%rd77], %rs378, %rd76;
// end inline asm
mul.ftz.f32 %f711, %f73, %f688;
cvt.u32.u64 %r322, %rd23;
add.s32 %r45, %r322, %r51;
@%p21 bra $L__BB0_46;
mul.wide.s32 %rd79, %r45, 2;
add.s64 %rd80, %rd2, %rd79;
ld.global.u16 %rs380, [%rd80];
// begin inline asm
{ cvt.f32.f16 %f680, %rs380;}
// end inline asm
fma.rn.ftz.f32 %f711, %f74, %f680, %f711;
$L__BB0_46:
// begin inline asm
{ cvt.rn.f16.f32 %rs381, %f711;}
// end inline asm
// begin inline asm
createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd81, 1.0;
// end inline asm
add.s64 %rd82, %rd77, %rd21;
// begin inline asm
st.global.L1::evict_normal.L2::cache_hint.u16 [%rd82], %rs381, %rd81;
// end inline asm
$L__BB0_47:
ret;
}

// .globl	_ZN3cub11EmptyKernelIvEEvv
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
ret;
}
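For reference: _Z27dequant_gemv_group64_batch623DequantGemvKernelParams demangles to
dequant_gemv_group64_batch6(DequantGemvKernelParams), and the demoted shared variable
belongs to gemv_int4<4, 64, 6>(__half*, __half const*, __half const*, unsigned int const*,
unsigned char const*, __half const*, int, int, int, int, float, float, bool). Below is a
minimal CUDA-side sketch of the 80-byte parameter block and two of the compiled idioms.
The field and helper names, and the meaning assigned to each scalar, are inferences from
the PTX loads above (offsets in the comments), not the original source.

#include <cuda_fp16.h>

// Hypothetical layout; the offsets are the ones the PTX reads from param_0.
struct DequantGemvKernelParams {              // .param .align 8 .b8 [80]
    half*                out;                 // +0   outputs, one row per CTA (%ctaid.x)
    const half*          bias;                // +8   optional (null-checked), scaled by beta
    const half*          x;                   // +16  fp16 input, six batch rows of length k
    const unsigned int*  qweight;             // +24  packed int4 weights, 8 per u32
    const unsigned char* zeros;               // +32  optional packed 4-bit zero-points, 2 per byte
    const half*          scales;              // +40  one fp16 scale per 64-element group
    int                  k;                   // +48  row length in elements (x row stride)
    int                  out_stride;          // +52  distance between the six batch outputs
    int                  k_packed;            // +56  row length in u32 words (k / 8)
    int                  groups_per_row;      // +60  quantization groups per row (k / 64)
    float                alpha;               // +64
    float                beta;                // +68
    bool                 asymmetric;          // +72  nonzero: nibbles unsigned; zero: signed
                                              //      int4 path ($L__BB0_7); +73..79 pad/unused
};

// The sign-extension idiom the signed path compiles to
// (shl.b16 4 / cvt.s16.s8 / shr.s16 7 / and -16 / or):
__device__ __forceinline__ short sext4(unsigned int nibble) {
    short t = (short)(signed char)(nibble << 4);   // nibble bit 3 becomes the sign bit
    short mask = (short)((t >> 7) & -16);          // 0xfff0 if negative, 0 otherwise
    return (short)(mask | (short)nibble);          // e.g. 0x8 -> -8, 0xF -> -1
}

// The tail reduction ($L__BB0_11 onward) is the classic predicated
// shfl.sync.down ladder with offsets 1, 2, 4, 8, 16; only lane 0's
// result is meaningful, and it ends up holding the warp sum:
__device__ __forceinline__ float warp_reduce(float v) {
    for (int off = 1; off <= 16; off <<= 1)
        v += __shfl_down_sync(0xffffffffu, v, off, 32);
    return v;
}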