// [editor's note: the dump begins mid-kernel; the destination operands of the
// first ld.param below are reconstructed by analogy with the batch-7 kernel at
// the end of this listing]
    ld.param.v2.u32 {%r26, %r27}, [_Z27dequant_gemv_group32_batch523DequantGemvKernelParams_param_0+56];
    ld.param.v2.f32 {%f50, %f51}, [_Z27dequant_gemv_group32_batch523DequantGemvKernelParams_param_0+64];
    ld.param.v4.u8 {%rs68, %rs69, %rs70, %rs71}, [_Z27dequant_gemv_group32_batch523DequantGemvKernelParams_param_0+72];
    ld.param.u64 %rd20, [_Z27dequant_gemv_group32_batch523DequantGemvKernelParams_param_0+40];
    ld.param.u64 %rd19, [_Z27dequant_gemv_group32_batch523DequantGemvKernelParams_param_0+32];
    ld.param.u64 %rd18, [_Z27dequant_gemv_group32_batch523DequantGemvKernelParams_param_0+24];
    ld.param.u64 %rd17, [_Z27dequant_gemv_group32_batch523DequantGemvKernelParams_param_0+16];
    mov.u32 %r1, %ctaid.x;
    mov.u32 %r191, %tid.y;
    shl.b32 %r28, %r191, 5;
    mov.u32 %r3, %tid.x;
    add.s32 %r190, %r28, %r3;
    setp.ge.u32 %p1, %r190, %r26;
    mov.f32 %f376, 0f00000000;
    mov.f32 %f377, %f376;
    mov.f32 %f378, %f376;
    mov.f32 %f379, %f376;
    mov.f32 %f380, %f376;
    @%p1 bra $L__BB0_9;
    cvta.to.global.u64 %rd1, %rd17;
    mul.lo.s32 %r7, %r26, %r1;
    shr.u32 %r8, %r3, 2;
    mul.lo.s32 %r9, %r27, %r1;
    shl.b16 %rs2, %rs68, 3;
    shl.b32 %r29, %r24, 1;
    mul.wide.s32 %rd3, %r29, 2;
    cvta.to.global.u64 %rd4, %rd19;
    cvta.to.global.u64 %rd5, %rd18;
    cvta.to.global.u64 %rd6, %rd20;
$L__BB0_2:
    add.s32 %r30, %r190, %r7;
    mul.wide.u32 %rd21, %r30, 4;
    add.s64 %rd22, %rd5, %rd21;
    ld.global.u32 %r12, [%rd22];
    shl.b32 %r31, %r191, 3;
    add.s32 %r13, %r31, %r8;
    add.s32 %r14, %r13, %r9;
    mul.wide.s32 %rd23, %r14, 2;
    add.s64 %rd24, %rd6, %rd23;
    ld.global.u16 %rs76, [%rd24];
    // begin inline asm
    { cvt.f32.f16 %f57, %rs76;}
    // end inline asm
    setp.eq.s64 %p2, %rd19, 0;
    mov.u16 %rs219, %rs2;
    @%p2 bra $L__BB0_4;
    shr.u32 %r32, %r14, 31;
    add.s32 %r33, %r14, %r32;
    shr.s32 %r34, %r33, 1;
    cvt.s64.s32 %rd25, %r34;
    add.s64 %rd26, %rd4, %rd25;
    ld.global.u8 %r35, [%rd26];
    shl.b32 %r36, %r13, 2;
    and.b32 %r37, %r36, 4;
    shr.u32 %r38, %r35, %r37;
    cvt.u16.u32 %rs77, %r38;
    and.b16 %rs219, %rs77, 15;
$L__BB0_4:
    shl.b32 %r15, %r190, 3;
    setp.ge.s32 %p3, %r15, %r24;
    @%p3 bra $L__BB0_8;
    setp.eq.s16 %p4, %rs68, 0;
    mul.wide.s32 %rd27, %r15, 2;
    add.s64 %rd28, %rd1, %rd27;
    ld.global.v4.u32 {%r39, %r40, %r41, %r42}, [%rd28];
    mul.wide.s32 %rd29, %r24, 2;
    add.s64 %rd30, %rd28, %rd29;
    ld.global.v4.u32 {%r47, %r48, %r49, %r50}, [%rd30];
    add.s32 %r55, %r24, %r15;
    add.s32 %r56, %r55, %r24;
    add.s64 %rd31, %rd28, %rd3;
    ld.global.v4.u32 {%r57, %r58, %r59, %r60}, [%rd31];
    add.s32 %r65, %r56, %r24;
    mul.wide.s32 %rd32, %r65, 2;
    add.s64 %rd33, %rd1, %rd32;
    ld.global.v4.u32 {%r66, %r67, %r68, %r69}, [%rd33];
    add.s64 %rd34, %rd31, %rd3;
    ld.global.v4.u32 {%r74, %r75, %r76, %r77}, [%rd34];
    shr.u16 %rs79, %rs219, 3;
    and.b16 %rs80, %rs79, 1;
    setp.eq.b16 %p5, %rs80, 1;
    and.pred %p6, %p4, %p5;
    selp.b16 %rs81, -16, 0, %p6;
    or.b16 %rs82, %rs81, %rs219;
    cvt.s16.s8 %rs83, %rs82;
    cvt.rn.f32.s16 %f7, %rs83;
    cvt.u16.u32 %rs5, %r12;
    and.b16 %rs6, %rs5, 15;
    mov.b32 {%rs7, %rs14}, %r39;
    mov.b32 {%rs8, %rs15}, %r47;
    mov.b32 {%rs9, %rs16}, %r57;
    mov.b32 {%rs10, %rs17}, %r66;
    mov.b32 {%rs11, %rs18}, %r74;
    shr.u32 %r82, %r12, 4;
    cvt.u16.u32 %rs12, %r82;
    and.b16 %rs13, %rs12, 15;
    shr.u32 %r83, %r12, 8;
    cvt.u16.u32 %rs19, %r83;
    and.b16 %rs20, %rs19, 15;
    mov.b32 {%rs21, %rs28}, %r40;
    mov.b32 {%rs22, %rs29}, %r48;
    mov.b32 {%rs23, %rs30}, %r58;
    mov.b32 {%rs24, %rs31}, %r67;
    mov.b32 {%rs25, %rs32}, %r75;
    shr.u32 %r84, %r12, 12;
    cvt.u16.u32 %rs26, %r84;
    and.b16 %rs27, %rs26, 15;
    shr.u32 %r85, %r12, 16;
    cvt.u16.u32 %rs33, %r85;
    and.b16 %rs34, %rs33, 15;
    mov.b32 {%rs35, %rs42}, %r41;
    mov.b32 {%rs36, %rs43}, %r49;
    mov.b32 {%rs37, %rs44}, %r59;
    mov.b32 {%rs38, %rs45}, %r68;
    mov.b32 {%rs39, %rs46}, %r76;
    shr.u32 %r86, %r12, 20;
    cvt.u16.u32 %rs40, %r86;
    and.b16 %rs41, %rs40, 15;
    shr.u32 %r87, %r12, 24;
    cvt.u16.u32 %rs47, %r87;
    and.b16 %rs48, %rs47, 15;
    mov.b32 {%rs49, %rs55}, %r42;
    mov.b32 {%rs50, %rs56}, %r50;
    mov.b32 {%rs51, %rs57}, %r60;
    mov.b32 {%rs52, %rs58}, %r69;
    mov.b32 {%rs53, %rs59}, %r77;
    shr.u32 %r88, %r12, 28;
    cvt.u16.u32 %rs54, %r88;
    @%p4 bra $L__BB0_7;
    cvt.rn.f32.s16 %f98, %rs6;
    sub.ftz.f32 %f99, %f98, %f7;
    mul.ftz.f32 %f100, %f57, %f99;
    // begin inline asm
    { cvt.f32.f16 %f58, %rs7;}
    // end inline asm
    fma.rn.ftz.f32 %f101, %f100, %f58, %f376;
    // begin inline asm
    { cvt.f32.f16 %f59, %rs8;}
    // end inline asm
    fma.rn.ftz.f32 %f102, %f100, %f59, %f377;
    // begin inline asm
    { cvt.f32.f16 %f60, %rs9;}
    // end inline asm
    fma.rn.ftz.f32 %f103, %f100, %f60, %f378;
    // begin inline asm
    { cvt.f32.f16 %f61, %rs10;}
    // end inline asm
    fma.rn.ftz.f32 %f104, %f100, %f61, %f379;
    // begin inline asm
    { cvt.f32.f16 %f62, %rs11;}
    // end inline asm
    fma.rn.ftz.f32 %f105, %f100, %f62, %f380;
    cvt.rn.f32.s16 %f106, %rs13;
    sub.ftz.f32 %f107, %f106, %f7;
    mul.ftz.f32 %f108, %f57, %f107;
    // begin inline asm
    { cvt.f32.f16 %f63, %rs14;}
    // end inline asm
    fma.rn.ftz.f32 %f109, %f108, %f63, %f101;
    // begin inline asm
    { cvt.f32.f16 %f64, %rs15;}
    // end inline asm
    fma.rn.ftz.f32 %f110, %f108, %f64, %f102;
    // begin inline asm
    { cvt.f32.f16 %f65, %rs16;}
    // end inline asm
    fma.rn.ftz.f32 %f111, %f108, %f65, %f103;
    // begin inline asm
    { cvt.f32.f16 %f66, %rs17;}
    // end inline asm
    fma.rn.ftz.f32 %f112, %f108, %f66, %f104;
    // begin inline asm
    { cvt.f32.f16 %f67, %rs18;}
    // end inline asm
    fma.rn.ftz.f32 %f113, %f108, %f67, %f105;
    cvt.rn.f32.s16 %f114, %rs20;
    sub.ftz.f32 %f115, %f114, %f7;
    mul.ftz.f32 %f116, %f57, %f115;
    // begin inline asm
    { cvt.f32.f16 %f68, %rs21;}
    // end inline asm
    fma.rn.ftz.f32 %f117, %f116, %f68, %f109;
    // begin inline asm
    { cvt.f32.f16 %f69, %rs22;}
    // end inline asm
    fma.rn.ftz.f32 %f118, %f116, %f69, %f110;
    // begin inline asm
    { cvt.f32.f16 %f70, %rs23;}
    // end inline asm
    fma.rn.ftz.f32 %f119, %f116, %f70, %f111;
    // begin inline asm
    { cvt.f32.f16 %f71, %rs24;}
    // end inline asm
    fma.rn.ftz.f32 %f120, %f116, %f71, %f112;
    // begin inline asm
    { cvt.f32.f16 %f72, %rs25;}
    // end inline asm
    fma.rn.ftz.f32 %f121, %f116, %f72, %f113;
    cvt.rn.f32.s16 %f122, %rs27;
    sub.ftz.f32 %f123, %f122, %f7;
    mul.ftz.f32 %f124, %f57, %f123;
    // begin inline asm
    { cvt.f32.f16 %f73, %rs28;}
    // end inline asm
    fma.rn.ftz.f32 %f125, %f124, %f73, %f117;
    // begin inline asm
    { cvt.f32.f16 %f74, %rs29;}
    // end inline asm
    fma.rn.ftz.f32 %f126, %f124, %f74, %f118;
    // begin inline asm
    { cvt.f32.f16 %f75, %rs30;}
    // end inline asm
    fma.rn.ftz.f32 %f127, %f124, %f75, %f119;
    // begin inline asm
    { cvt.f32.f16 %f76, %rs31;}
    // end inline asm
    fma.rn.ftz.f32 %f128, %f124, %f76, %f120;
    // begin inline asm
    { cvt.f32.f16 %f77, %rs32;}
    // end inline asm
    fma.rn.ftz.f32 %f129, %f124, %f77, %f121;
    cvt.rn.f32.s16 %f130, %rs34;
    sub.ftz.f32 %f131, %f130, %f7;
    mul.ftz.f32 %f132, %f57, %f131;
    // begin inline asm
    { cvt.f32.f16 %f78, %rs35;}
    // end inline asm
    fma.rn.ftz.f32 %f133, %f132, %f78, %f125;
    // begin inline asm
    { cvt.f32.f16 %f79, %rs36;}
    // end inline asm
    fma.rn.ftz.f32 %f134, %f132, %f79, %f126;
    // begin inline asm
    { cvt.f32.f16 %f80, %rs37;}
    // end inline asm
    fma.rn.ftz.f32 %f135, %f132, %f80, %f127;
    // begin inline asm
    { cvt.f32.f16 %f81, %rs38;}
    // end inline asm
    fma.rn.ftz.f32 %f136, %f132, %f81, %f128;
    // begin inline asm
    { cvt.f32.f16 %f82, %rs39;}
    // end inline asm
    fma.rn.ftz.f32 %f137, %f132, %f82, %f129;
    cvt.rn.f32.s16 %f138, %rs41;
    sub.ftz.f32 %f139, %f138, %f7;
    mul.ftz.f32 %f140, %f57, %f139;
    // begin inline asm
    { cvt.f32.f16 %f83, %rs42;}
    // end inline asm
    fma.rn.ftz.f32 %f141, %f140, %f83, %f133;
    // begin inline asm
    { cvt.f32.f16 %f84, %rs43;}
    // end inline asm
    fma.rn.ftz.f32 %f142, %f140, %f84, %f134;
    // begin inline asm
    { cvt.f32.f16 %f85, %rs44;}
    // end inline asm
    fma.rn.ftz.f32 %f143, %f140, %f85, %f135;
    // begin inline asm
    { cvt.f32.f16 %f86, %rs45;}
    // end inline asm
    fma.rn.ftz.f32 %f144, %f140, %f86, %f136;
    // begin inline asm
    { cvt.f32.f16 %f87, %rs46;}
    // end inline asm
    fma.rn.ftz.f32 %f145, %f140, %f87, %f137;
    cvt.rn.f32.s16 %f146, %rs48;
    sub.ftz.f32 %f147, %f146, %f7;
    mul.ftz.f32 %f148, %f57, %f147;
    // begin inline asm
    { cvt.f32.f16 %f88, %rs49;}
    // end inline asm
    fma.rn.ftz.f32 %f149, %f148, %f88, %f141;
    // begin inline asm
    { cvt.f32.f16 %f89, %rs50;}
    // end inline asm
    fma.rn.ftz.f32 %f150, %f148, %f89, %f142;
    // begin inline asm
    { cvt.f32.f16 %f90, %rs51;}
    // end inline asm
    fma.rn.ftz.f32 %f151, %f148, %f90, %f143;
    // begin inline asm
    { cvt.f32.f16 %f91, %rs52;}
    // end inline asm
    fma.rn.ftz.f32 %f152, %f148, %f91, %f144;
    // begin inline asm
    { cvt.f32.f16 %f92, %rs53;}
    // end inline asm
    fma.rn.ftz.f32 %f153, %f148, %f92, %f145;
    cvt.rn.f32.s16 %f154, %rs54;
    sub.ftz.f32 %f155, %f154, %f7;
    mul.ftz.f32 %f156, %f57, %f155;
    // begin inline asm
    { cvt.f32.f16 %f93, %rs55;}
    // end inline asm
    fma.rn.ftz.f32 %f376, %f156, %f93, %f149;
    // begin inline asm
    { cvt.f32.f16 %f94, %rs56;}
    // end inline asm
    fma.rn.ftz.f32 %f377, %f156, %f94, %f150;
    // begin inline asm
    { cvt.f32.f16 %f95, %rs57;}
    // end inline asm
    fma.rn.ftz.f32 %f378, %f156, %f95, %f151;
    // begin inline asm
    { cvt.f32.f16 %f96, %rs58;}
    // end inline asm
    fma.rn.ftz.f32 %f379, %f156, %f96, %f152;
    // begin inline asm
    { cvt.f32.f16 %f97, %rs59;}
    // end inline asm
    fma.rn.ftz.f32 %f380, %f156, %f97, %f153;
    bra.uni $L__BB0_8;
$L__BB0_7:
    shl.b16 %rs164, %rs5, 4;
    cvt.s16.s8 %rs165, %rs164;
    shr.s16 %rs166, %rs165, 7;
    and.b16 %rs167, %rs166, -16;
    or.b16 %rs168, %rs167, %rs6;
    cvt.rn.f32.s16 %f197, %rs168;
    sub.ftz.f32 %f198, %f197, %f7;
    mul.ftz.f32 %f199, %f57, %f198;
    // begin inline asm
    { cvt.f32.f16 %f157, %rs7;}
    // end inline asm
    fma.rn.ftz.f32 %f200, %f199, %f157, %f376;
    // begin inline asm
    { cvt.f32.f16 %f158, %rs8;}
    // end inline asm
    fma.rn.ftz.f32 %f201, %f199, %f158, %f377;
    // begin inline asm
    { cvt.f32.f16 %f159, %rs9;}
    // end inline asm
    fma.rn.ftz.f32 %f202, %f199, %f159, %f378;
    // begin inline asm
    { cvt.f32.f16 %f160, %rs10;}
    // end inline asm
    fma.rn.ftz.f32 %f203, %f199, %f160, %f379;
    // begin inline asm
    { cvt.f32.f16 %f161, %rs11;}
    // end inline asm
    fma.rn.ftz.f32 %f204, %f199, %f161, %f380;
    shl.b16 %rs169, %rs12, 4;
    cvt.s16.s8 %rs170, %rs169;
    shr.s16 %rs171, %rs170, 7;
    and.b16 %rs172, %rs171, -16;
    or.b16 %rs173, %rs172, %rs13;
    cvt.rn.f32.s16 %f205, %rs173;
    sub.ftz.f32 %f206, %f205, %f7;
    mul.ftz.f32 %f207, %f57, %f206;
    // begin inline asm
    { cvt.f32.f16 %f162, %rs14;}
    // end inline asm
    fma.rn.ftz.f32 %f208, %f207, %f162, %f200;
    // begin inline asm
    { cvt.f32.f16 %f163, %rs15;}
    // end inline asm
    fma.rn.ftz.f32 %f209, %f207, %f163, %f201;
    // begin inline asm
    { cvt.f32.f16 %f164, %rs16;}
    // end inline asm
    fma.rn.ftz.f32 %f210, %f207, %f164, %f202;
    // begin inline asm
    { cvt.f32.f16 %f165, %rs17;}
    // end inline asm
    fma.rn.ftz.f32 %f211, %f207, %f165, %f203;
    // begin inline asm
    { cvt.f32.f16 %f166, %rs18;}
    // end inline asm
    fma.rn.ftz.f32 %f212, %f207, %f166, %f204;
    shl.b16 %rs174, %rs19, 4;
    cvt.s16.s8 %rs175, %rs174;
    shr.s16 %rs176, %rs175, 7;
    and.b16 %rs177, %rs176, -16;
    or.b16 %rs178, %rs177, %rs20;
    cvt.rn.f32.s16 %f213, %rs178;
    sub.ftz.f32 %f214, %f213, %f7;
    mul.ftz.f32 %f215, %f57, %f214;
    // begin inline asm
    { cvt.f32.f16 %f167, %rs21;}
    // end inline asm
    fma.rn.ftz.f32 %f216, %f215, %f167, %f208;
    // begin inline asm
    { cvt.f32.f16 %f168, %rs22;}
    // end inline asm
    fma.rn.ftz.f32 %f217, %f215, %f168, %f209;
    // begin inline asm
    { cvt.f32.f16 %f169, %rs23;}
    // end inline asm
    fma.rn.ftz.f32 %f218, %f215, %f169, %f210;
    // begin inline asm
    { cvt.f32.f16 %f170, %rs24;}
    // end inline asm
    fma.rn.ftz.f32 %f219, %f215, %f170, %f211;
    // begin inline asm
    { cvt.f32.f16 %f171, %rs25;}
    // end inline asm
    fma.rn.ftz.f32 %f220, %f215, %f171, %f212;
    shl.b16 %rs179, %rs26, 4;
    cvt.s16.s8 %rs180, %rs179;
    shr.s16 %rs181, %rs180, 7;
    and.b16 %rs182, %rs181, -16;
    or.b16 %rs183, %rs182, %rs27;
    cvt.rn.f32.s16 %f221, %rs183;
    sub.ftz.f32 %f222, %f221, %f7;
    mul.ftz.f32 %f223, %f57, %f222;
    // begin inline asm
    { cvt.f32.f16 %f172, %rs28;}
    // end inline asm
    fma.rn.ftz.f32 %f224, %f223, %f172, %f216;
    // begin inline asm
    { cvt.f32.f16 %f173, %rs29;}
    // end inline asm
    fma.rn.ftz.f32 %f225, %f223, %f173, %f217;
    // begin inline asm
    { cvt.f32.f16 %f174, %rs30;}
    // end inline asm
    fma.rn.ftz.f32 %f226, %f223, %f174, %f218;
    // begin inline asm
    { cvt.f32.f16 %f175, %rs31;}
    // end inline asm
    fma.rn.ftz.f32 %f227, %f223, %f175, %f219;
    // begin inline asm
    { cvt.f32.f16 %f176, %rs32;}
    // end inline asm
    fma.rn.ftz.f32 %f228, %f223, %f176, %f220;
    shl.b16 %rs184, %rs33, 4;
    cvt.s16.s8 %rs185, %rs184;
    shr.s16 %rs186, %rs185, 7;
    and.b16 %rs187, %rs186, -16;
    or.b16 %rs188, %rs187, %rs34;
    cvt.rn.f32.s16 %f229, %rs188;
    sub.ftz.f32 %f230, %f229, %f7;
    mul.ftz.f32 %f231, %f57, %f230;
    // begin inline asm
    { cvt.f32.f16 %f177, %rs35;}
    // end inline asm
    fma.rn.ftz.f32 %f232, %f231, %f177, %f224;
    // begin inline asm
    { cvt.f32.f16 %f178, %rs36;}
    // end inline asm
    fma.rn.ftz.f32 %f233, %f231, %f178, %f225;
    // begin inline asm
    { cvt.f32.f16 %f179, %rs37;}
    // end inline asm
    fma.rn.ftz.f32 %f234, %f231, %f179, %f226;
    // begin inline asm
    { cvt.f32.f16 %f180, %rs38;}
    // end inline asm
    fma.rn.ftz.f32 %f235, %f231, %f180, %f227;
    // begin inline asm
    { cvt.f32.f16 %f181, %rs39;}
    // end inline asm
    fma.rn.ftz.f32 %f236, %f231, %f181, %f228;
    shl.b16 %rs189, %rs40, 4;
    cvt.s16.s8 %rs190, %rs189;
    shr.s16 %rs191, %rs190, 7;
    and.b16 %rs192, %rs191, -16;
    or.b16 %rs193, %rs192, %rs41;
    cvt.rn.f32.s16 %f237, %rs193;
    sub.ftz.f32 %f238, %f237, %f7;
    mul.ftz.f32 %f239, %f57, %f238;
    // begin inline asm
    { cvt.f32.f16 %f182, %rs42;}
    // end inline asm
    fma.rn.ftz.f32 %f240, %f239, %f182, %f232;
    // begin inline asm
    { cvt.f32.f16 %f183, %rs43;}
    // end inline asm
    fma.rn.ftz.f32 %f241, %f239, %f183, %f233;
    // begin inline asm
    { cvt.f32.f16 %f184, %rs44;}
    // end inline asm
    fma.rn.ftz.f32 %f242, %f239, %f184, %f234;
    // begin inline asm
    { cvt.f32.f16 %f185, %rs45;}
    // end inline asm
    fma.rn.ftz.f32 %f243, %f239, %f185, %f235;
    // begin inline asm
    { cvt.f32.f16 %f186, %rs46;}
    // end inline asm
    fma.rn.ftz.f32 %f244, %f239, %f186, %f236;
    shl.b16 %rs194, %rs47, 4;
    cvt.s16.s8 %rs195, %rs194;
    shr.s16 %rs196, %rs195, 7;
    and.b16 %rs197, %rs196, -16;
    or.b16 %rs198, %rs197, %rs48;
    cvt.rn.f32.s16 %f245, %rs198;
    sub.ftz.f32 %f246, %f245, %f7;
    mul.ftz.f32 %f247, %f57, %f246;
    // begin inline asm
    { cvt.f32.f16 %f187, %rs49;}
    // end inline asm
    fma.rn.ftz.f32 %f248, %f247, %f187, %f240;
    // begin inline asm
    { cvt.f32.f16 %f188, %rs50;}
    // end inline asm
    fma.rn.ftz.f32 %f249, %f247, %f188, %f241;
    // begin inline asm
    { cvt.f32.f16 %f189, %rs51;}
    // end inline asm
    fma.rn.ftz.f32 %f250, %f247, %f189, %f242;
    // begin inline asm
    { cvt.f32.f16 %f190, %rs52;}
    // end inline asm
    fma.rn.ftz.f32 %f251, %f247, %f190, %f243;
    // begin inline asm
    { cvt.f32.f16 %f191, %rs53;}
    // end inline asm
    fma.rn.ftz.f32 %f252, %f247, %f191, %f244;
    shl.b16 %rs199, %rs54, 4;
    cvt.s16.s8 %rs200, %rs199;
    shr.s16 %rs201, %rs200, 7;
    and.b16 %rs202, %rs201, -16;
    or.b16 %rs203, %rs202, %rs54;
    cvt.rn.f32.s16 %f253, %rs203;
    sub.ftz.f32 %f254, %f253, %f7;
    mul.ftz.f32 %f255, %f57, %f254;
    // begin inline asm
    { cvt.f32.f16 %f192, %rs55;}
    // end inline asm
    fma.rn.ftz.f32 %f376, %f255, %f192, %f248;
    // begin inline asm
    { cvt.f32.f16 %f193, %rs56;}
    // end inline asm
    fma.rn.ftz.f32 %f377, %f255, %f193, %f249;
    // begin inline asm
    { cvt.f32.f16 %f194, %rs57;}
    // end inline asm
    fma.rn.ftz.f32 %f378, %f255, %f194, %f250;
    // begin inline asm
    { cvt.f32.f16 %f195, %rs58;}
    // end inline asm
    fma.rn.ftz.f32 %f379, %f255, %f195, %f251;
    // begin inline asm
    { cvt.f32.f16 %f196, %rs59;}
    // end inline asm
    fma.rn.ftz.f32 %f380, %f255, %f196, %f252;
$L__BB0_8:
    add.s32 %r191, %r191, 4;
    shl.b32 %r89, %r191, 5;
    add.s32 %r190, %r89, %r3;
    setp.lt.u32 %p7, %r190, %r26;
    @%p7 bra $L__BB0_2;
$L__BB0_9:
    mov.u32 %r186, %tid.y;
    shl.b32 %r185, %r186, 5;
    add.s32 %r184, %r185, %r3;
    shl.b32 %r90, %r184, 2;
    mov.u32 %r91, _ZZ9gemv_int4ILi4ELi32ELi5EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage;
    add.s32 %r92, %r91, %r90;
    setp.lt.u32 %p8, %r184, 32;
    @%p8 bra $L__BB0_11;
    add.s32 %r177, %r92, -112;
    st.shared.f32 [%r177], %f376;
$L__BB0_11:
    mov.u32 %r189, %tid.y;
    shl.b32 %r188, %r189, 5;
    add.s32 %r187, %r188, %r3;
    setp.gt.u32 %p9, %r187, 31;
    bar.sync 0;
    mad.lo.s32 %r19, %r187, 12, %r91;
    @%p9 bra $L__BB0_13;
    mov.u32 %r107, 16;
    ld.shared.f32 %f271, [%r19+16];
    add.ftz.f32 %f272, %f376, %f271;
    ld.shared.f32 %f273, [%r19+20];
    add.ftz.f32 %f274, %f272, %f273;
    ld.shared.f32 %f275, [%r19+24];
    add.ftz.f32 %f258, %f274, %f275;
    mov.u32 %r95, 1;
    mov.u32 %r108, 31;
    mov.u32 %r109, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f258, %r95, %r108, %r109; @p add.f32 r0, r0, %f258; mov.f32 %f256, r0;}
    // end inline asm
    mov.u32 %r98, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f256, %r98, %r108, %r109; @p add.f32 r0, r0, %f256; mov.f32 %f259, r0;}
    // end inline asm
    mov.u32 %r101, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f259, %r101, %r108, %r109; @p add.f32 r0, r0, %f259; mov.f32 %f262, r0;}
    // end inline asm
    mov.u32 %r104, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f262, %r104, %r108, %r109; @p add.f32 r0, r0, %f262; mov.f32 %f265, r0;}
    // end inline asm
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f265, %r107, %r108, %r109; @p add.f32 r0, r0, %f265; mov.f32 %f376, r0;}
    // end inline asm
$L__BB0_13:
    @%p8 bra $L__BB0_15;
    add.s32 %r178, %r92, -112;
    st.shared.f32 [%r178+640], %f377;
$L__BB0_15:
    bar.sync 0;
    @%p9 bra $L__BB0_17;
    ld.shared.f32 %f291, [%r19+656];
    add.ftz.f32 %f292, %f377, %f291;
    ld.shared.f32 %f293, [%r19+660];
    add.ftz.f32 %f294, %f292, %f293;
    ld.shared.f32 %f295, [%r19+664];
    add.ftz.f32 %f278, %f294, %f295;
    mov.u32 %r111, 1;
    mov.u32 %r124, 31;
    mov.u32 %r125, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f278, %r111, %r124, %r125; @p add.f32 r0, r0, %f278; mov.f32 %f276, r0;}
    // end inline asm
    mov.u32 %r114, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f276, %r114, %r124, %r125; @p add.f32 r0, r0, %f276; mov.f32 %f279, r0;}
    // end inline asm
    mov.u32 %r117, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f279, %r117, %r124, %r125; @p add.f32 r0, r0, %f279; mov.f32 %f282, r0;}
    // end inline asm
    mov.u32 %r120, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f282, %r120, %r124, %r125; @p add.f32 r0, r0, %f282; mov.f32 %f285, r0;}
    // end inline asm
    mov.u32 %r123, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f285, %r123, %r124, %r125; @p add.f32 r0, r0, %f285; mov.f32 %f377, r0;}
    // end inline asm
$L__BB0_17:
    @%p8 bra $L__BB0_19;
    add.s32 %r179, %r92, -112;
    st.shared.f32 [%r179+1280], %f378;
$L__BB0_19:
    bar.sync 0;
    @%p9 bra $L__BB0_21;
    ld.shared.f32 %f311, [%r19+1296];
    add.ftz.f32 %f312, %f378, %f311;
    ld.shared.f32 %f313, [%r19+1300];
    add.ftz.f32 %f314, %f312, %f313;
    ld.shared.f32 %f315, [%r19+1304];
    add.ftz.f32 %f298, %f314, %f315;
    mov.u32 %r127, 1;
    mov.u32 %r140, 31;
    mov.u32 %r141, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f298, %r127, %r140, %r141; @p add.f32 r0, r0, %f298; mov.f32 %f296, r0;}
    // end inline asm
    mov.u32 %r130, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f296, %r130, %r140, %r141; @p add.f32 r0, r0, %f296; mov.f32 %f299, r0;}
    // end inline asm
    mov.u32 %r133, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f299, %r133, %r140, %r141; @p add.f32 r0, r0, %f299; mov.f32 %f302, r0;}
    // end inline asm
    mov.u32 %r136, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f302, %r136, %r140, %r141; @p add.f32 r0, r0, %f302; mov.f32 %f305, r0;}
    // end inline asm
    mov.u32 %r139, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f305, %r139, %r140, %r141; @p add.f32 r0, r0, %f305; mov.f32 %f378, r0;}
    // end inline asm
$L__BB0_21:
    @%p8 bra $L__BB0_23;
    add.s32 %r180, %r92, -112;
    st.shared.f32 [%r180+1920], %f379;
$L__BB0_23:
    bar.sync 0;
    @%p9 bra $L__BB0_25;
    ld.shared.f32 %f331, [%r19+1936];
    add.ftz.f32 %f332, %f379, %f331;
    ld.shared.f32 %f333, [%r19+1940];
    add.ftz.f32 %f334, %f332, %f333;
    ld.shared.f32 %f335, [%r19+1944];
    add.ftz.f32 %f318, %f334, %f335;
    mov.u32 %r143, 1;
    mov.u32 %r156, 31;
    mov.u32 %r157, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f318, %r143, %r156, %r157; @p add.f32 r0, r0, %f318; mov.f32 %f316, r0;}
    // end inline asm
    mov.u32 %r146, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f316, %r146, %r156, %r157; @p add.f32 r0, r0, %f316; mov.f32 %f319, r0;}
    // end inline asm
    mov.u32 %r149, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f319, %r149, %r156, %r157; @p add.f32 r0, r0, %f319; mov.f32 %f322, r0;}
    // end inline asm
    mov.u32 %r152, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f322, %r152, %r156, %r157; @p add.f32 r0, r0, %f322; mov.f32 %f325, r0;}
    // end inline asm
    mov.u32 %r155, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f325, %r155, %r156, %r157; @p add.f32 r0, r0, %f325; mov.f32 %f379, r0;}
    // end inline asm
$L__BB0_25:
    @%p8 bra $L__BB0_27;
    add.s32 %r181, %r92, -112;
    st.shared.f32 [%r181+2560], %f380;
$L__BB0_27:
    bar.sync 0;
    @%p9 bra $L__BB0_29;
    ld.shared.f32 %f351, [%r19+2576];
    add.ftz.f32 %f352, %f380, %f351;
    ld.shared.f32 %f353, [%r19+2580];
    add.ftz.f32 %f354, %f352, %f353;
    ld.shared.f32 %f355, [%r19+2584];
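    // [editor's annotation, not compiler output: each of the five accumulators
    // %f376..%f380 is reduced the same way -- warp 0 folds in the tail partials
    // from the demoted __shared__ temp_storage, then runs a shfl.sync.down.b32
    // ladder with offsets 1, 2, 4, 8, 16. A minimal CUDA sketch of that warp
    // reduction, with illustrative names only:
    //
    //   __device__ float warp_reduce_sum(float v) {
    //       for (int off = 1; off < 32; off <<= 1)      // offsets 1,2,4,8,16
    //           v += __shfl_down_sync(0xffffffffu, v, off);
    //       return v;                                    // total lands in lane 0
    //   }
    // ]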
    add.ftz.f32 %f338, %f354, %f355;
    mov.u32 %r159, 1;
    mov.u32 %r172, 31;
    mov.u32 %r173, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f338, %r159, %r172, %r173; @p add.f32 r0, r0, %f338; mov.f32 %f336, r0;}
    // end inline asm
    mov.u32 %r162, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f336, %r162, %r172, %r173; @p add.f32 r0, r0, %f336; mov.f32 %f339, r0;}
    // end inline asm
    mov.u32 %r165, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f339, %r165, %r172, %r173; @p add.f32 r0, r0, %f339; mov.f32 %f342, r0;}
    // end inline asm
    mov.u32 %r168, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f342, %r168, %r172, %r173; @p add.f32 r0, r0, %f342; mov.f32 %f345, r0;}
    // end inline asm
    mov.u32 %r171, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f345, %r171, %r172, %r173; @p add.f32 r0, r0, %f345; mov.f32 %f380, r0;}
    // end inline asm
$L__BB0_29:
    mov.u32 %r182, %tid.y;
    or.b32 %r174, %r3, %r182;
    setp.ne.s32 %p18, %r174, 0;
    @%p18 bra $L__BB0_33;
    ld.param.u64 %rd54, [_Z27dequant_gemv_group32_batch523DequantGemvKernelParams_param_0+8];
    ld.param.u64 %rd53, [_Z27dequant_gemv_group32_batch523DequantGemvKernelParams_param_0];
    mov.u32 %r183, %ctaid.x;
    cvta.to.global.u64 %rd35, %rd53;
    setp.eq.s64 %p19, %rd54, 0;
    mul.ftz.f32 %f38, %f50, %f376;
    cvt.s64.s32 %rd8, %r183;
    mul.wide.s32 %rd36, %r183, 2;
    add.s64 %rd9, %rd35, %rd36;
    mul.ftz.f32 %f39, %f50, %f377;
    cvt.s64.s32 %rd10, %r25;
    mul.wide.s32 %rd37, %r25, 2;
    add.s64 %rd11, %rd9, %rd37;
    mul.ftz.f32 %f40, %f50, %f378;
    add.s32 %r175, %r25, %r183;
    add.s32 %r176, %r175, %r25;
    cvt.s64.s32 %rd12, %r176;
    mul.wide.s32 %rd38, %r176, 2;
    add.s64 %rd14, %rd35, %rd38;
    mul.ftz.f32 %f41, %f50, %f379;
    mul.ftz.f32 %f42, %f50, %f380;
    @%p19 bra $L__BB0_32;
    ld.param.u64 %rd55, [_Z27dequant_gemv_group32_batch523DequantGemvKernelParams_param_0+8];
    cvta.to.global.u64 %rd39, %rd55;
    shl.b64 %rd40, %rd8, 1;
    add.s64 %rd41, %rd39, %rd40;
    ld.global.u16 %rs204, [%rd41];
    // begin inline asm
    { cvt.f32.f16 %f356, %rs204;}
    // end inline asm
    fma.rn.ftz.f32 %f357, %f51, %f356, %f38;
    // begin inline asm
    { cvt.rn.f16.f32 %rs205, %f357;}
    // end inline asm
    st.global.u16 [%rd9], %rs205;
    shl.b64 %rd42, %rd10, 1;
    add.s64 %rd43, %rd41, %rd42;
    ld.global.u16 %rs206, [%rd43];
    // begin inline asm
    { cvt.f32.f16 %f358, %rs206;}
    // end inline asm
    fma.rn.ftz.f32 %f359, %f51, %f358, %f39;
    // begin inline asm
    { cvt.rn.f16.f32 %rs207, %f359;}
    // end inline asm
    st.global.u16 [%rd11], %rs207;
    shl.b64 %rd44, %rd12, 1;
    add.s64 %rd45, %rd39, %rd44;
    ld.global.u16 %rs208, [%rd45];
    // begin inline asm
    { cvt.f32.f16 %f360, %rs208;}
    // end inline asm
    fma.rn.ftz.f32 %f361, %f51, %f360, %f40;
    // begin inline asm
    { cvt.rn.f16.f32 %rs209, %f361;}
    // end inline asm
    st.global.u16 [%rd14], %rs209;
    add.s64 %rd46, %rd45, %rd42;
    ld.global.u16 %rs210, [%rd46];
    // begin inline asm
    { cvt.f32.f16 %f362, %rs210;}
    // end inline asm
    fma.rn.ftz.f32 %f363, %f51, %f362, %f41;
    // begin inline asm
    { cvt.rn.f16.f32 %rs211, %f363;}
    // end inline asm
    add.s64 %rd47, %rd14, %rd42;
    st.global.u16 [%rd47], %rs211;
    add.s64 %rd48, %rd46, %rd42;
    ld.global.u16 %rs212, [%rd48];
    // begin inline asm
    { cvt.f32.f16 %f364, %rs212;}
    // end inline asm
    fma.rn.ftz.f32 %f365, %f51, %f364, %f42;
    // begin inline asm
    { cvt.rn.f16.f32 %rs213, %f365;}
    // end inline asm
    add.s64 %rd49, %rd47, %rd42;
    st.global.u16 [%rd49], %rs213;
    bra.uni $L__BB0_33;
$L__BB0_32:
    // begin inline asm
    { cvt.rn.f16.f32 %rs214, %f38;}
    // end inline asm
    st.global.u16 [%rd9], %rs214;
    // begin inline asm
    { cvt.rn.f16.f32 %rs215, %f39;}
    // end inline asm
    st.global.u16 [%rd11], %rs215;
    // begin inline asm
    { cvt.rn.f16.f32 %rs216, %f40;}
    // end inline asm
    st.global.u16 [%rd14], %rs216;
    // begin inline asm
    { cvt.rn.f16.f32 %rs217, %f41;}
    // end inline asm
    shl.b64 %rd50, %rd10, 1;
    add.s64 %rd51, %rd14, %rd50;
    st.global.u16 [%rd51], %rs217;
    // begin inline asm
    { cvt.rn.f16.f32 %rs218, %f42;}
    // end inline asm
    add.s64 %rd52, %rd51, %rd50;
    st.global.u16 [%rd52], %rs218;
$L__BB0_33:
    ret;
}
    // .globl _ZN3cub11EmptyKernelIvEEvv
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
    ret;
}

//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-31678015
// Cuda compilation tools, release 11.8, V11.8.85
// Based on NVVM 7.0.1
//

.version 7.8
.target sm_60
.address_size 64

    // .globl _Z27dequant_gemv_group32_batch723DequantGemvKernelParams
// _ZZ9gemv_int4ILi4ELi32ELi7EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage has been demoted
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_2826982a6thrust6system6detail10sequential3seqE[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_2826982a6thrust12placeholders2_1E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_2826982a6thrust12placeholders2_2E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_2826982a6thrust12placeholders2_3E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_2826982a6thrust12placeholders2_4E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_2826982a6thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_2826982a6thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_2826982a6thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_2826982a6thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_2826982a6thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_2826982a6thrust12placeholders3_10E[1];

.visible .entry _Z27dequant_gemv_group32_batch723DequantGemvKernelParams(
    .param .align 8 .b8 _Z27dequant_gemv_group32_batch723DequantGemvKernelParams_param_0[80]
)
{
    .reg .pred %p<24>;
    .reg .b16 %rs<262>;
    .reg .f32 %f<525>;
    .reg .b32 %r<245>;
    .reg .b64 %rd<63>;
    // demoted variable
    .shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi32ELi7EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[4480];
    ld.param.v2.u32 {%r48, %r49}, [_Z27dequant_gemv_group32_batch723DequantGemvKernelParams_param_0+48];
    ld.param.v2.u32 {%r50, %r51}, [_Z27dequant_gemv_group32_batch723DequantGemvKernelParams_param_0+56];
    ld.param.v2.f32 {%f68, %f69}, [_Z27dequant_gemv_group32_batch723DequantGemvKernelParams_param_0+64];
    ld.param.v4.u8 {%rs70, %rs71, %rs72, %rs73}, [_Z27dequant_gemv_group32_batch723DequantGemvKernelParams_param_0+72];
    ld.param.u64 %rd18, [_Z27dequant_gemv_group32_batch723DequantGemvKernelParams_param_0+40];
    ld.param.u64 %rd17, [_Z27dequant_gemv_group32_batch723DequantGemvKernelParams_param_0+32];
    ld.param.u64 %rd16, [_Z27dequant_gemv_group32_batch723DequantGemvKernelParams_param_0+24];
    ld.param.u64 %rd15, [_Z27dequant_gemv_group32_batch723DequantGemvKernelParams_param_0+16];
    mov.u32 %r244, %tid.y;
    shl.b32 %r52, %r244, 5;
    mov.u32 %r53, %tid.x;
    add.s32 %r243, %r52, %r53;
    setp.ge.u32 %p1, %r243, %r50;
    mov.f32 %f504, 0f00000000;
    mov.f32 %f505, %f504;
    mov.f32 %f506, %f504;
    mov.f32 %f507, %f504;
    mov.f32 %f508, %f504;
    mov.f32 %f509, %f504;
    mov.f32 %f510, %f504;
    @%p1 bra $L__BB0_9;
    cvta.to.global.u64 %rd2, %rd15;
    mov.u32 %r54, %ctaid.x;
    mul.lo.s32 %r5, %r51, %r54;
    shl.b16 %rs2, %rs70, 3;
    cvta.to.global.u64 %rd3, %rd16;
    cvta.to.global.u64 %rd4, %rd18;
$L__BB0_2:
    mad.lo.s32 %r56, %r50, %r54, %r243;
    mul.wide.u32 %rd19, %r56, 4;
    add.s64 %rd20, %rd3, %rd19;
    ld.global.u32 %r8, [%rd20];
    shr.u32 %r58, %r53, 2;
    shl.b32 %r59, %r244, 3;
    add.s32 %r9, %r59, %r58;
    add.s32 %r10, %r9, %r5;
    mul.wide.s32 %rd21, %r10, 2;
    add.s64 %rd22, %rd4, %rd21;
    ld.global.u16 %rs78, [%rd22];
    // begin inline asm
    { cvt.f32.f16 %f77, %rs78;}
    // end inline asm
    setp.eq.s64 %p2, %rd17, 0;
    mov.u16 %rs261, %rs2;
    @%p2 bra $L__BB0_4;
    shr.u32 %r60, %r10, 31;
    add.s32 %r61, %r10, %r60;
    shr.s32 %r62, %r61, 1;
    cvt.s64.s32 %rd23, %r62;
    cvta.to.global.u64 %rd24, %rd17;
    add.s64 %rd25, %rd24, %rd23;
    ld.global.u8 %r63, [%rd25];
    shl.b32 %r64, %r9, 2;
    and.b32 %r65, %r64, 4;
    shr.u32 %r66, %r63, %r65;
    cvt.u16.u32 %rs79, %r66;
    and.b16 %rs261, %rs79, 15;
$L__BB0_4:
    shl.b32 %r11, %r243, 3;
    setp.ge.s32 %p3, %r11, %r48;
    @%p3 bra $L__BB0_8;
    setp.eq.s16 %p4, %rs70, 0;
    mul.wide.s32 %rd26, %r11, 2;
    add.s64 %rd27, %rd2, %rd26;
    ld.global.v4.u32 {%r67, %r68, %r69, %r70}, [%rd27];
    mul.wide.s32 %rd28, %r48, 2;
    add.s64 %rd29, %rd27, %rd28;
    ld.global.v4.u32 {%r71, %r72, %r73, %r74}, [%rd29];
    add.s32 %r75, %r48, %r11;
    add.s32 %r76, %r75, %r48;
    mul.wide.s32 %rd30, %r76, 2;
    add.s64 %rd31, %rd2, %rd30;
    ld.global.v4.u32 {%r77, %r78, %r79, %r80}, [%rd31];
    add.s64 %rd32, %rd31, %rd28;
    ld.global.v4.u32 {%r81, %r82, %r83, %r84}, [%rd32];
    add.s64 %rd33, %rd32, %rd28;
    ld.global.v4.u32 {%r85, %r86, %r87, %r88}, [%rd33];
    add.s64 %rd34, %rd33, %rd28;
    ld.global.v4.u32 {%r89, %r90, %r91, %r92}, [%rd34];
    add.s64 %rd35, %rd34, %rd28;
    ld.global.v4.u32 {%r93, %r94, %r95, %r96}, [%rd35];
    shr.u16 %rs81, %rs261, 3;
    and.b16 %rs82, %rs81, 1;
    setp.eq.b16 %p5, %rs82, 1;
    and.pred %p6, %p4, %p5;
    selp.b16 %rs83, -16, 0, %p6;
    or.b16 %rs84, %rs83, %rs261;
    cvt.s16.s8 %rs85, %rs84;
    cvt.rn.f32.s16 %f9, %rs85;
    cvt.u16.u32 %rs5, %r8;
    and.b16 %rs6, %rs5, 15;
    mov.b32 {%rs7, %rs16}, %r67;
    mov.b32 {%rs9, %rs17}, %r71;
    mov.b32 {%rs11, %rs20}, %r85;
    mov.b32 {%rs12, %rs21}, %r89;
    mov.b32 {%rs13, %rs22}, %r93;
    shr.u32 %r97, %r8, 4;
    cvt.u16.u32 %rs14, %r97;
    and.b16 %rs15, %rs14, 15;
    mov.b32 {%rs86, %rs18}, %r77;
    mov.b32 {%rs87, %rs19}, %r81;
    shr.u32 %r98, %r8, 8;
    cvt.u16.u32 %rs23, %r98;
    and.b16 %rs24, %rs23, 15;
    mov.b32 {%rs25, %rs34}, %r68;
    mov.b32 {%rs26, %rs35}, %r72;
    mov.b32 {%rs27, %rs36}, %r78;
    mov.b32 {%rs28, %rs37}, %r82;
    mov.b32 {%rs29, %rs38}, %r86;
    mov.b32 {%rs30, %rs39}, %r90;
    mov.b32 {%rs31, %rs40}, %r94;
    shr.u32 %r99, %r8, 12;
    cvt.u16.u32 %rs32, %r99;
    and.b16 %rs33, %rs32, 15;
    shr.u32 %r100, %r8, 16;
    cvt.u16.u32 %rs41, %r100;
    and.b16 %rs42, %rs41, 15;
    mov.b32 {%rs43, %rs52}, %r69;
    mov.b32 {%rs44, %rs53}, %r73;
    mov.b32 {%rs45, %rs54}, %r79;
    mov.b32 {%rs46, %rs55}, %r83;
    mov.b32 {%rs47, %rs56}, %r87;
    mov.b32 {%rs48, %rs57}, %r91;
    mov.b32 {%rs49, %rs58}, %r95;
    shr.u32 %r101, %r8, 20;
    cvt.u16.u32 %rs50, %r101;
    and.b16 %rs51, %rs50, 15;
    shr.u32 %r102, %r8, 24;
    cvt.u16.u32 %rs59, %r102;
    and.b16 %rs60, %rs59, 15;
    shr.u32 %r103, %r8, 28;
    cvt.u16.u32 %rs61, %r103;
    @%p4 bra $L__BB0_7;
    cvt.rn.f32.s16 %f134, %rs6;
    sub.ftz.f32 %f135, %f134, %f9;
    mul.ftz.f32 %f136, %f77, %f135;
    // begin inline asm
    { cvt.f32.f16 %f78, %rs7;}
    // end inline asm
    fma.rn.ftz.f32 %f137, %f136, %f78, %f504;
    // begin inline asm
    { cvt.f32.f16 %f79, %rs9;}
    // end inline asm
    fma.rn.ftz.f32 %f138, %f136, %f79, %f505;
    cvt.u16.u32 %rs90, %r77;
    // begin inline asm
    { cvt.f32.f16 %f80, %rs90;}
    // end inline asm
    fma.rn.ftz.f32 %f139, %f136, %f80, %f506;
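    // [editor's annotation, not compiler output: the pattern above repeats for
    // all 8 nibbles of the packed word %r8 -- each 4-bit weight q becomes
    // w = (float(q) - zero) * scale, and one fma.rn.ftz.f32 per batch column
    // accumulates w * x. A sketch of one step in CUDA, illustrative names only:
    //
    //   float w = (float(q) - zero) * scale;   // dequantize one int4 weight
    //   #pragma unroll
    //   for (int b = 0; b < 7; ++b)            // batch-7 kernel: 7 columns
    //       acc[b] = fmaf(w, __half2float(x[b]), acc[b]);
    // ]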
cvt.u16.u32 %rs91, %r81; // begin inline asm { cvt.f32.f16 %f81, %rs91;} // end inline asm fma.rn.ftz.f32 %f140, %f136, %f81, %f507; // begin inline asm { cvt.f32.f16 %f82, %rs11;} // end inline asm fma.rn.ftz.f32 %f141, %f136, %f82, %f508; // begin inline asm { cvt.f32.f16 %f83, %rs12;} // end inline asm fma.rn.ftz.f32 %f142, %f136, %f83, %f509; // begin inline asm { cvt.f32.f16 %f84, %rs13;} // end inline asm fma.rn.ftz.f32 %f143, %f136, %f84, %f510; cvt.rn.f32.s16 %f144, %rs15; sub.ftz.f32 %f145, %f144, %f9; mul.ftz.f32 %f146, %f77, %f145; // begin inline asm { cvt.f32.f16 %f85, %rs16;} // end inline asm fma.rn.ftz.f32 %f147, %f146, %f85, %f137; // begin inline asm { cvt.f32.f16 %f86, %rs17;} // end inline asm fma.rn.ftz.f32 %f148, %f146, %f86, %f138; // begin inline asm { cvt.f32.f16 %f87, %rs18;} // end inline asm fma.rn.ftz.f32 %f149, %f146, %f87, %f139; // begin inline asm { cvt.f32.f16 %f88, %rs19;} // end inline asm fma.rn.ftz.f32 %f150, %f146, %f88, %f140; // begin inline asm { cvt.f32.f16 %f89, %rs20;} // end inline asm fma.rn.ftz.f32 %f151, %f146, %f89, %f141; // begin inline asm { cvt.f32.f16 %f90, %rs21;} // end inline asm fma.rn.ftz.f32 %f152, %f146, %f90, %f142; // begin inline asm { cvt.f32.f16 %f91, %rs22;} // end inline asm fma.rn.ftz.f32 %f153, %f146, %f91, %f143; cvt.rn.f32.s16 %f154, %rs24; sub.ftz.f32 %f155, %f154, %f9; mul.ftz.f32 %f156, %f77, %f155; // begin inline asm { cvt.f32.f16 %f92, %rs25;} // end inline asm fma.rn.ftz.f32 %f157, %f156, %f92, %f147; // begin inline asm { cvt.f32.f16 %f93, %rs26;} // end inline asm fma.rn.ftz.f32 %f158, %f156, %f93, %f148; // begin inline asm { cvt.f32.f16 %f94, %rs27;} // end inline asm fma.rn.ftz.f32 %f159, %f156, %f94, %f149; // begin inline asm { cvt.f32.f16 %f95, %rs28;} // end inline asm fma.rn.ftz.f32 %f160, %f156, %f95, %f150; // begin inline asm { cvt.f32.f16 %f96, %rs29;} // end inline asm fma.rn.ftz.f32 %f161, %f156, %f96, %f151; // begin inline asm { cvt.f32.f16 %f97, %rs30;} // end inline asm fma.rn.ftz.f32 %f162, %f156, %f97, %f152; // begin inline asm { cvt.f32.f16 %f98, %rs31;} // end inline asm fma.rn.ftz.f32 %f163, %f156, %f98, %f153; cvt.rn.f32.s16 %f164, %rs33; sub.ftz.f32 %f165, %f164, %f9; mul.ftz.f32 %f166, %f77, %f165; // begin inline asm { cvt.f32.f16 %f99, %rs34;} // end inline asm fma.rn.ftz.f32 %f167, %f166, %f99, %f157; // begin inline asm { cvt.f32.f16 %f100, %rs35;} // end inline asm fma.rn.ftz.f32 %f168, %f166, %f100, %f158; // begin inline asm { cvt.f32.f16 %f101, %rs36;} // end inline asm fma.rn.ftz.f32 %f169, %f166, %f101, %f159; // begin inline asm { cvt.f32.f16 %f102, %rs37;} // end inline asm fma.rn.ftz.f32 %f170, %f166, %f102, %f160; // begin inline asm { cvt.f32.f16 %f103, %rs38;} // end inline asm fma.rn.ftz.f32 %f171, %f166, %f103, %f161; // begin inline asm { cvt.f32.f16 %f104, %rs39;} // end inline asm fma.rn.ftz.f32 %f172, %f166, %f104, %f162; // begin inline asm { cvt.f32.f16 %f105, %rs40;} // end inline asm fma.rn.ftz.f32 %f173, %f166, %f105, %f163; cvt.rn.f32.s16 %f174, %rs42; sub.ftz.f32 %f175, %f174, %f9; mul.ftz.f32 %f176, %f77, %f175; // begin inline asm { cvt.f32.f16 %f106, %rs43;} // end inline asm fma.rn.ftz.f32 %f177, %f176, %f106, %f167; // begin inline asm { cvt.f32.f16 %f107, %rs44;} // end inline asm fma.rn.ftz.f32 %f178, %f176, %f107, %f168; // begin inline asm { cvt.f32.f16 %f108, %rs45;} // end inline asm fma.rn.ftz.f32 %f179, %f176, %f108, %f169; // begin inline asm { cvt.f32.f16 %f109, %rs46;} // end inline asm fma.rn.ftz.f32 %f180, %f176, %f109, %f170; // begin 
inline asm { cvt.f32.f16 %f110, %rs47;} // end inline asm fma.rn.ftz.f32 %f181, %f176, %f110, %f171; // begin inline asm { cvt.f32.f16 %f111, %rs48;} // end inline asm fma.rn.ftz.f32 %f182, %f176, %f111, %f172; // begin inline asm { cvt.f32.f16 %f112, %rs49;} // end inline asm fma.rn.ftz.f32 %f183, %f176, %f112, %f173; cvt.rn.f32.s16 %f184, %rs51; sub.ftz.f32 %f185, %f184, %f9; mul.ftz.f32 %f186, %f77, %f185; // begin inline asm { cvt.f32.f16 %f113, %rs52;} // end inline asm fma.rn.ftz.f32 %f187, %f186, %f113, %f177; // begin inline asm { cvt.f32.f16 %f114, %rs53;} // end inline asm fma.rn.ftz.f32 %f188, %f186, %f114, %f178; // begin inline asm { cvt.f32.f16 %f115, %rs54;} // end inline asm fma.rn.ftz.f32 %f189, %f186, %f115, %f179; // begin inline asm { cvt.f32.f16 %f116, %rs55;} // end inline asm fma.rn.ftz.f32 %f190, %f186, %f116, %f180; // begin inline asm { cvt.f32.f16 %f117, %rs56;} // end inline asm fma.rn.ftz.f32 %f191, %f186, %f117, %f181; // begin inline asm { cvt.f32.f16 %f118, %rs57;} // end inline asm fma.rn.ftz.f32 %f192, %f186, %f118, %f182; // begin inline asm { cvt.f32.f16 %f119, %rs58;} // end inline asm fma.rn.ftz.f32 %f193, %f186, %f119, %f183; cvt.rn.f32.s16 %f194, %rs60; sub.ftz.f32 %f195, %f194, %f9; mul.ftz.f32 %f196, %f77, %f195; mov.b32 {%rs130, %rs137}, %r70; // begin inline asm { cvt.f32.f16 %f120, %rs130;} // end inline asm fma.rn.ftz.f32 %f197, %f196, %f120, %f187; mov.b32 {%rs131, %rs138}, %r74; // begin inline asm { cvt.f32.f16 %f121, %rs131;} // end inline asm fma.rn.ftz.f32 %f198, %f196, %f121, %f188; mov.b32 {%rs132, %rs139}, %r80; // begin inline asm { cvt.f32.f16 %f122, %rs132;} // end inline asm fma.rn.ftz.f32 %f199, %f196, %f122, %f189; mov.b32 {%rs133, %rs140}, %r84; // begin inline asm { cvt.f32.f16 %f123, %rs133;} // end inline asm fma.rn.ftz.f32 %f200, %f196, %f123, %f190; mov.b32 {%rs134, %rs141}, %r88; // begin inline asm { cvt.f32.f16 %f124, %rs134;} // end inline asm fma.rn.ftz.f32 %f201, %f196, %f124, %f191; mov.b32 {%rs135, %rs142}, %r92; // begin inline asm { cvt.f32.f16 %f125, %rs135;} // end inline asm fma.rn.ftz.f32 %f202, %f196, %f125, %f192; mov.b32 {%rs136, %rs143}, %r96; // begin inline asm { cvt.f32.f16 %f126, %rs136;} // end inline asm fma.rn.ftz.f32 %f203, %f196, %f126, %f193; cvt.rn.f32.s16 %f204, %rs61; sub.ftz.f32 %f205, %f204, %f9; mul.ftz.f32 %f206, %f77, %f205; // begin inline asm { cvt.f32.f16 %f127, %rs137;} // end inline asm fma.rn.ftz.f32 %f504, %f206, %f127, %f197; // begin inline asm { cvt.f32.f16 %f128, %rs138;} // end inline asm fma.rn.ftz.f32 %f505, %f206, %f128, %f198; // begin inline asm { cvt.f32.f16 %f129, %rs139;} // end inline asm fma.rn.ftz.f32 %f506, %f206, %f129, %f199; // begin inline asm { cvt.f32.f16 %f130, %rs140;} // end inline asm fma.rn.ftz.f32 %f507, %f206, %f130, %f200; // begin inline asm { cvt.f32.f16 %f131, %rs141;} // end inline asm fma.rn.ftz.f32 %f508, %f206, %f131, %f201; // begin inline asm { cvt.f32.f16 %f132, %rs142;} // end inline asm fma.rn.ftz.f32 %f509, %f206, %f132, %f202; // begin inline asm { cvt.f32.f16 %f133, %rs143;} // end inline asm fma.rn.ftz.f32 %f510, %f206, %f133, %f203; bra.uni $L__BB0_8; $L__BB0_7: shl.b16 %rs200, %rs5, 4; cvt.s16.s8 %rs201, %rs200; shr.s16 %rs202, %rs201, 7; and.b16 %rs203, %rs202, -16; or.b16 %rs204, %rs203, %rs6; cvt.rn.f32.s16 %f263, %rs204; sub.ftz.f32 %f264, %f263, %f9; mul.ftz.f32 %f265, %f77, %f264; cvt.u16.u32 %rs144, %r67; // begin inline asm { cvt.f32.f16 %f207, %rs144;} // end inline asm fma.rn.ftz.f32 %f266, %f265, %f207, %f504; cvt.u16.u32 
%rs145, %r71; // begin inline asm { cvt.f32.f16 %f208, %rs145;} // end inline asm fma.rn.ftz.f32 %f267, %f265, %f208, %f505; cvt.u16.u32 %rs146, %r77; // begin inline asm { cvt.f32.f16 %f209, %rs146;} // end inline asm fma.rn.ftz.f32 %f268, %f265, %f209, %f506; cvt.u16.u32 %rs147, %r81; // begin inline asm { cvt.f32.f16 %f210, %rs147;} // end inline asm fma.rn.ftz.f32 %f269, %f265, %f210, %f507; // begin inline asm { cvt.f32.f16 %f211, %rs11;} // end inline asm fma.rn.ftz.f32 %f270, %f265, %f211, %f508; // begin inline asm { cvt.f32.f16 %f212, %rs12;} // end inline asm fma.rn.ftz.f32 %f271, %f265, %f212, %f509; // begin inline asm { cvt.f32.f16 %f213, %rs13;} // end inline asm fma.rn.ftz.f32 %f272, %f265, %f213, %f510; shl.b16 %rs205, %rs14, 4; cvt.s16.s8 %rs206, %rs205; shr.s16 %rs207, %rs206, 7; and.b16 %rs208, %rs207, -16; or.b16 %rs209, %rs208, %rs15; cvt.rn.f32.s16 %f273, %rs209; sub.ftz.f32 %f274, %f273, %f9; mul.ftz.f32 %f275, %f77, %f274; // begin inline asm { cvt.f32.f16 %f214, %rs16;} // end inline asm fma.rn.ftz.f32 %f276, %f275, %f214, %f266; // begin inline asm { cvt.f32.f16 %f215, %rs17;} // end inline asm fma.rn.ftz.f32 %f277, %f275, %f215, %f267; // begin inline asm { cvt.f32.f16 %f216, %rs18;} // end inline asm fma.rn.ftz.f32 %f278, %f275, %f216, %f268; // begin inline asm { cvt.f32.f16 %f217, %rs19;} // end inline asm fma.rn.ftz.f32 %f279, %f275, %f217, %f269; // begin inline asm { cvt.f32.f16 %f218, %rs20;} // end inline asm fma.rn.ftz.f32 %f280, %f275, %f218, %f270; // begin inline asm { cvt.f32.f16 %f219, %rs21;} // end inline asm fma.rn.ftz.f32 %f281, %f275, %f219, %f271; // begin inline asm { cvt.f32.f16 %f220, %rs22;} // end inline asm fma.rn.ftz.f32 %f282, %f275, %f220, %f272; shl.b16 %rs210, %rs23, 4; cvt.s16.s8 %rs211, %rs210; shr.s16 %rs212, %rs211, 7; and.b16 %rs213, %rs212, -16; or.b16 %rs214, %rs213, %rs24; cvt.rn.f32.s16 %f283, %rs214; sub.ftz.f32 %f284, %f283, %f9; mul.ftz.f32 %f285, %f77, %f284; // begin inline asm { cvt.f32.f16 %f221, %rs25;} // end inline asm fma.rn.ftz.f32 %f286, %f285, %f221, %f276; // begin inline asm { cvt.f32.f16 %f222, %rs26;} // end inline asm fma.rn.ftz.f32 %f287, %f285, %f222, %f277; // begin inline asm { cvt.f32.f16 %f223, %rs27;} // end inline asm fma.rn.ftz.f32 %f288, %f285, %f223, %f278; // begin inline asm { cvt.f32.f16 %f224, %rs28;} // end inline asm fma.rn.ftz.f32 %f289, %f285, %f224, %f279; // begin inline asm { cvt.f32.f16 %f225, %rs29;} // end inline asm fma.rn.ftz.f32 %f290, %f285, %f225, %f280; // begin inline asm { cvt.f32.f16 %f226, %rs30;} // end inline asm fma.rn.ftz.f32 %f291, %f285, %f226, %f281; // begin inline asm { cvt.f32.f16 %f227, %rs31;} // end inline asm fma.rn.ftz.f32 %f292, %f285, %f227, %f282; shl.b16 %rs215, %rs32, 4; cvt.s16.s8 %rs216, %rs215; shr.s16 %rs217, %rs216, 7; and.b16 %rs218, %rs217, -16; or.b16 %rs219, %rs218, %rs33; cvt.rn.f32.s16 %f293, %rs219; sub.ftz.f32 %f294, %f293, %f9; mul.ftz.f32 %f295, %f77, %f294; // begin inline asm { cvt.f32.f16 %f228, %rs34;} // end inline asm fma.rn.ftz.f32 %f296, %f295, %f228, %f286; // begin inline asm { cvt.f32.f16 %f229, %rs35;} // end inline asm fma.rn.ftz.f32 %f297, %f295, %f229, %f287; // begin inline asm { cvt.f32.f16 %f230, %rs36;} // end inline asm fma.rn.ftz.f32 %f298, %f295, %f230, %f288; // begin inline asm { cvt.f32.f16 %f231, %rs37;} // end inline asm fma.rn.ftz.f32 %f299, %f295, %f231, %f289; // begin inline asm { cvt.f32.f16 %f232, %rs38;} // end inline asm fma.rn.ftz.f32 %f300, %f295, %f232, %f290; // begin inline asm { cvt.f32.f16 
%f233, %rs39;} // end inline asm fma.rn.ftz.f32 %f301, %f295, %f233, %f291; // begin inline asm { cvt.f32.f16 %f234, %rs40;} // end inline asm fma.rn.ftz.f32 %f302, %f295, %f234, %f292; shl.b16 %rs220, %rs41, 4; cvt.s16.s8 %rs221, %rs220; shr.s16 %rs222, %rs221, 7; and.b16 %rs223, %rs222, -16; or.b16 %rs224, %rs223, %rs42; cvt.rn.f32.s16 %f303, %rs224; sub.ftz.f32 %f304, %f303, %f9; mul.ftz.f32 %f305, %f77, %f304; // begin inline asm { cvt.f32.f16 %f235, %rs43;} // end inline asm fma.rn.ftz.f32 %f306, %f305, %f235, %f296; // begin inline asm { cvt.f32.f16 %f236, %rs44;} // end inline asm fma.rn.ftz.f32 %f307, %f305, %f236, %f297; // begin inline asm { cvt.f32.f16 %f237, %rs45;} // end inline asm fma.rn.ftz.f32 %f308, %f305, %f237, %f298; // begin inline asm { cvt.f32.f16 %f238, %rs46;} // end inline asm fma.rn.ftz.f32 %f309, %f305, %f238, %f299; // begin inline asm { cvt.f32.f16 %f239, %rs47;} // end inline asm fma.rn.ftz.f32 %f310, %f305, %f239, %f300; // begin inline asm { cvt.f32.f16 %f240, %rs48;} // end inline asm fma.rn.ftz.f32 %f311, %f305, %f240, %f301; // begin inline asm { cvt.f32.f16 %f241, %rs49;} // end inline asm fma.rn.ftz.f32 %f312, %f305, %f241, %f302; shl.b16 %rs225, %rs50, 4; cvt.s16.s8 %rs226, %rs225; shr.s16 %rs227, %rs226, 7; and.b16 %rs228, %rs227, -16; or.b16 %rs229, %rs228, %rs51; cvt.rn.f32.s16 %f313, %rs229; sub.ftz.f32 %f314, %f313, %f9; mul.ftz.f32 %f315, %f77, %f314; // begin inline asm { cvt.f32.f16 %f242, %rs52;} // end inline asm fma.rn.ftz.f32 %f316, %f315, %f242, %f306; // begin inline asm { cvt.f32.f16 %f243, %rs53;} // end inline asm fma.rn.ftz.f32 %f317, %f315, %f243, %f307; // begin inline asm { cvt.f32.f16 %f244, %rs54;} // end inline asm fma.rn.ftz.f32 %f318, %f315, %f244, %f308; // begin inline asm { cvt.f32.f16 %f245, %rs55;} // end inline asm fma.rn.ftz.f32 %f319, %f315, %f245, %f309; // begin inline asm { cvt.f32.f16 %f246, %rs56;} // end inline asm fma.rn.ftz.f32 %f320, %f315, %f246, %f310; // begin inline asm { cvt.f32.f16 %f247, %rs57;} // end inline asm fma.rn.ftz.f32 %f321, %f315, %f247, %f311; // begin inline asm { cvt.f32.f16 %f248, %rs58;} // end inline asm fma.rn.ftz.f32 %f322, %f315, %f248, %f312; shl.b16 %rs230, %rs59, 4; cvt.s16.s8 %rs231, %rs230; shr.s16 %rs232, %rs231, 7; and.b16 %rs233, %rs232, -16; or.b16 %rs234, %rs233, %rs60; cvt.rn.f32.s16 %f323, %rs234; sub.ftz.f32 %f324, %f323, %f9; mul.ftz.f32 %f325, %f77, %f324; mov.b32 {%rs186, %rs193}, %r70; // begin inline asm { cvt.f32.f16 %f249, %rs186;} // end inline asm fma.rn.ftz.f32 %f326, %f325, %f249, %f316; mov.b32 {%rs187, %rs194}, %r74; // begin inline asm { cvt.f32.f16 %f250, %rs187;} // end inline asm fma.rn.ftz.f32 %f327, %f325, %f250, %f317; mov.b32 {%rs188, %rs195}, %r80; // begin inline asm { cvt.f32.f16 %f251, %rs188;} // end inline asm fma.rn.ftz.f32 %f328, %f325, %f251, %f318; mov.b32 {%rs189, %rs196}, %r84; // begin inline asm { cvt.f32.f16 %f252, %rs189;} // end inline asm fma.rn.ftz.f32 %f329, %f325, %f252, %f319; mov.b32 {%rs190, %rs197}, %r88; // begin inline asm { cvt.f32.f16 %f253, %rs190;} // end inline asm fma.rn.ftz.f32 %f330, %f325, %f253, %f320; mov.b32 {%rs191, %rs198}, %r92; // begin inline asm { cvt.f32.f16 %f254, %rs191;} // end inline asm fma.rn.ftz.f32 %f331, %f325, %f254, %f321; mov.b32 {%rs192, %rs199}, %r96; // begin inline asm { cvt.f32.f16 %f255, %rs192;} // end inline asm fma.rn.ftz.f32 %f332, %f325, %f255, %f322; shl.b16 %rs235, %rs61, 4; cvt.s16.s8 %rs236, %rs235; shr.s16 %rs237, %rs236, 7; and.b16 %rs238, %rs237, -16; or.b16 %rs239, %rs238, 
%rs61; cvt.rn.f32.s16 %f333, %rs239; sub.ftz.f32 %f334, %f333, %f9; mul.ftz.f32 %f335, %f77, %f334; // begin inline asm { cvt.f32.f16 %f256, %rs193;} // end inline asm fma.rn.ftz.f32 %f504, %f335, %f256, %f326; // begin inline asm { cvt.f32.f16 %f257, %rs194;} // end inline asm fma.rn.ftz.f32 %f505, %f335, %f257, %f327; // begin inline asm { cvt.f32.f16 %f258, %rs195;} // end inline asm fma.rn.ftz.f32 %f506, %f335, %f258, %f328; // begin inline asm { cvt.f32.f16 %f259, %rs196;} // end inline asm fma.rn.ftz.f32 %f507, %f335, %f259, %f329; // begin inline asm { cvt.f32.f16 %f260, %rs197;} // end inline asm fma.rn.ftz.f32 %f508, %f335, %f260, %f330; // begin inline asm { cvt.f32.f16 %f261, %rs198;} // end inline asm fma.rn.ftz.f32 %f509, %f335, %f261, %f331; // begin inline asm { cvt.f32.f16 %f262, %rs199;} // end inline asm fma.rn.ftz.f32 %f510, %f335, %f262, %f332; $L__BB0_8: add.s32 %r244, %r244, 4; shl.b32 %r104, %r244, 5; add.s32 %r243, %r104, %r53; setp.lt.u32 %p7, %r243, %r50; @%p7 bra $L__BB0_2; $L__BB0_9: mov.u32 %r238, %tid.y; mov.u32 %r237, %tid.x; shl.b32 %r236, %r238, 5; add.s32 %r235, %r236, %r237; shl.b32 %r106, %r235, 2; mov.u32 %r107, _ZZ9gemv_int4ILi4ELi32ELi7EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage; add.s32 %r108, %r107, %r106; setp.lt.u32 %p8, %r235, 32; @%p8 bra $L__BB0_11; add.s32 %r228, %r108, -112; st.shared.f32 [%r228], %f504; $L__BB0_11: mov.u32 %r242, %tid.y; mov.u32 %r241, %tid.x; shl.b32 %r240, %r242, 5; add.s32 %r239, %r240, %r241; setp.gt.u32 %p9, %r239, 31; bar.sync 0; mad.lo.s32 %r43, %r239, 12, %r107; @%p9 bra $L__BB0_13; mov.u32 %r123, 16; ld.shared.f32 %f351, [%r43+16]; add.ftz.f32 %f352, %f504, %f351; ld.shared.f32 %f353, [%r43+20]; add.ftz.f32 %f354, %f352, %f353; ld.shared.f32 %f355, [%r43+24]; add.ftz.f32 %f338, %f354, %f355; mov.u32 %r111, 1; mov.u32 %r124, 31; mov.u32 %r125, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f338, %r111, %r124, %r125; @p add.f32 r0, r0, %f338; mov.f32 %f336, r0;} // end inline asm mov.u32 %r114, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f336, %r114, %r124, %r125; @p add.f32 r0, r0, %f336; mov.f32 %f339, r0;} // end inline asm mov.u32 %r117, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f339, %r117, %r124, %r125; @p add.f32 r0, r0, %f339; mov.f32 %f342, r0;} // end inline asm mov.u32 %r120, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f342, %r120, %r124, %r125; @p add.f32 r0, r0, %f342; mov.f32 %f345, r0;} // end inline asm // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f345, %r123, %r124, %r125; @p add.f32 r0, r0, %f345; mov.f32 %f504, r0;} // end inline asm $L__BB0_13: @%p8 bra $L__BB0_15; add.s32 %r229, %r108, -112; st.shared.f32 [%r229+640], %f505; $L__BB0_15: bar.sync 0; @%p9 bra $L__BB0_17; ld.shared.f32 %f371, [%r43+656]; add.ftz.f32 %f372, %f505, %f371; ld.shared.f32 %f373, [%r43+660]; add.ftz.f32 %f374, %f372, %f373; ld.shared.f32 %f375, [%r43+664]; add.ftz.f32 %f358, %f374, %f375; mov.u32 %r127, 1; mov.u32 %r140, 31; mov.u32 %r141, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f358, %r127, %r140, %r141; @p add.f32 r0, r0, %f358; mov.f32 %f356, r0;} // end inline asm mov.u32 %r130, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f356, %r130, %r140, %r141; @p add.f32 r0, r0, %f356; mov.f32 %f359, r0;} // end inline asm mov.u32 %r133, 4; // begin inline asm { .reg .f32 r0; .reg 
.pred p; shfl.sync.down.b32 r0|p, %f359, %r133, %r140, %r141; @p add.f32 r0, r0, %f359; mov.f32 %f362, r0;} // end inline asm mov.u32 %r136, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f362, %r136, %r140, %r141; @p add.f32 r0, r0, %f362; mov.f32 %f365, r0;} // end inline asm mov.u32 %r139, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f365, %r139, %r140, %r141; @p add.f32 r0, r0, %f365; mov.f32 %f505, r0;} // end inline asm $L__BB0_17: @%p8 bra $L__BB0_19; add.s32 %r230, %r108, -112; st.shared.f32 [%r230+1280], %f506; $L__BB0_19: bar.sync 0; @%p9 bra $L__BB0_21; ld.shared.f32 %f391, [%r43+1296]; add.ftz.f32 %f392, %f506, %f391; ld.shared.f32 %f393, [%r43+1300]; add.ftz.f32 %f394, %f392, %f393; ld.shared.f32 %f395, [%r43+1304]; add.ftz.f32 %f378, %f394, %f395; mov.u32 %r143, 1; mov.u32 %r156, 31; mov.u32 %r157, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f378, %r143, %r156, %r157; @p add.f32 r0, r0, %f378; mov.f32 %f376, r0;} // end inline asm mov.u32 %r146, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f376, %r146, %r156, %r157; @p add.f32 r0, r0, %f376; mov.f32 %f379, r0;} // end inline asm mov.u32 %r149, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f379, %r149, %r156, %r157; @p add.f32 r0, r0, %f379; mov.f32 %f382, r0;} // end inline asm mov.u32 %r152, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f382, %r152, %r156, %r157; @p add.f32 r0, r0, %f382; mov.f32 %f385, r0;} // end inline asm mov.u32 %r155, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f385, %r155, %r156, %r157; @p add.f32 r0, r0, %f385; mov.f32 %f506, r0;} // end inline asm $L__BB0_21: @%p8 bra $L__BB0_23; add.s32 %r231, %r108, -112; st.shared.f32 [%r231+1920], %f507; $L__BB0_23: bar.sync 0; @%p9 bra $L__BB0_25; ld.shared.f32 %f411, [%r43+1936]; add.ftz.f32 %f412, %f507, %f411; ld.shared.f32 %f413, [%r43+1940]; add.ftz.f32 %f414, %f412, %f413; ld.shared.f32 %f415, [%r43+1944]; add.ftz.f32 %f398, %f414, %f415; mov.u32 %r159, 1; mov.u32 %r172, 31; mov.u32 %r173, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f398, %r159, %r172, %r173; @p add.f32 r0, r0, %f398; mov.f32 %f396, r0;} // end inline asm mov.u32 %r162, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f396, %r162, %r172, %r173; @p add.f32 r0, r0, %f396; mov.f32 %f399, r0;} // end inline asm mov.u32 %r165, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f399, %r165, %r172, %r173; @p add.f32 r0, r0, %f399; mov.f32 %f402, r0;} // end inline asm mov.u32 %r168, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f402, %r168, %r172, %r173; @p add.f32 r0, r0, %f402; mov.f32 %f405, r0;} // end inline asm mov.u32 %r171, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f405, %r171, %r172, %r173; @p add.f32 r0, r0, %f405; mov.f32 %f507, r0;} // end inline asm $L__BB0_25: @%p8 bra $L__BB0_27; add.s32 %r232, %r108, -112; st.shared.f32 [%r232+2560], %f508; $L__BB0_27: bar.sync 0; @%p9 bra $L__BB0_29; ld.shared.f32 %f431, [%r43+2576]; add.ftz.f32 %f432, %f508, %f431; ld.shared.f32 %f433, [%r43+2580]; add.ftz.f32 %f434, %f432, %f433; ld.shared.f32 %f435, [%r43+2584]; add.ftz.f32 %f418, %f434, %f435; mov.u32 %r175, 1; mov.u32 %r188, 31; mov.u32 %r189, -1; // begin inline asm { .reg .f32 r0; .reg .pred 
p; shfl.sync.down.b32 r0|p, %f418, %r175, %r188, %r189; @p add.f32 r0, r0, %f418; mov.f32 %f416, r0;} // end inline asm mov.u32 %r178, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f416, %r178, %r188, %r189; @p add.f32 r0, r0, %f416; mov.f32 %f419, r0;} // end inline asm mov.u32 %r181, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f419, %r181, %r188, %r189; @p add.f32 r0, r0, %f419; mov.f32 %f422, r0;} // end inline asm mov.u32 %r184, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f422, %r184, %r188, %r189; @p add.f32 r0, r0, %f422; mov.f32 %f425, r0;} // end inline asm mov.u32 %r187, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f425, %r187, %r188, %r189; @p add.f32 r0, r0, %f425; mov.f32 %f508, r0;} // end inline asm $L__BB0_29: @%p8 bra $L__BB0_31; add.s32 %r233, %r108, -112; st.shared.f32 [%r233+3200], %f509; $L__BB0_31: bar.sync 0; @%p9 bra $L__BB0_33; ld.shared.f32 %f451, [%r43+3216]; add.ftz.f32 %f452, %f509, %f451; ld.shared.f32 %f453, [%r43+3220]; add.ftz.f32 %f454, %f452, %f453; ld.shared.f32 %f455, [%r43+3224]; add.ftz.f32 %f438, %f454, %f455; mov.u32 %r191, 1; mov.u32 %r204, 31; mov.u32 %r205, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f438, %r191, %r204, %r205; @p add.f32 r0, r0, %f438; mov.f32 %f436, r0;} // end inline asm mov.u32 %r194, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f436, %r194, %r204, %r205; @p add.f32 r0, r0, %f436; mov.f32 %f439, r0;} // end inline asm mov.u32 %r197, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f439, %r197, %r204, %r205; @p add.f32 r0, r0, %f439; mov.f32 %f442, r0;} // end inline asm mov.u32 %r200, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f442, %r200, %r204, %r205; @p add.f32 r0, r0, %f442; mov.f32 %f445, r0;} // end inline asm mov.u32 %r203, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f445, %r203, %r204, %r205; @p add.f32 r0, r0, %f445; mov.f32 %f509, r0;} // end inline asm $L__BB0_33: @%p8 bra $L__BB0_35; add.s32 %r234, %r108, -112; st.shared.f32 [%r234+3840], %f510; $L__BB0_35: bar.sync 0; @%p9 bra $L__BB0_37; ld.shared.f32 %f471, [%r43+3856]; add.ftz.f32 %f472, %f510, %f471; ld.shared.f32 %f473, [%r43+3860]; add.ftz.f32 %f474, %f472, %f473; ld.shared.f32 %f475, [%r43+3864]; add.ftz.f32 %f458, %f474, %f475; mov.u32 %r207, 1; mov.u32 %r220, 31; mov.u32 %r221, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f458, %r207, %r220, %r221; @p add.f32 r0, r0, %f458; mov.f32 %f456, r0;} // end inline asm mov.u32 %r210, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f456, %r210, %r220, %r221; @p add.f32 r0, r0, %f456; mov.f32 %f459, r0;} // end inline asm mov.u32 %r213, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f459, %r213, %r220, %r221; @p add.f32 r0, r0, %f459; mov.f32 %f462, r0;} // end inline asm mov.u32 %r216, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f462, %r216, %r220, %r221; @p add.f32 r0, r0, %f462; mov.f32 %f465, r0;} // end inline asm mov.u32 %r219, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f465, %r219, %r220, %r221; @p add.f32 r0, r0, %f465; mov.f32 %f510, r0;} // end inline asm $L__BB0_37: mov.u32 %r223, %tid.y; or.b32 %r224, %r53, %r223; setp.ne.s32 %p22, 
%r224, 0; @%p22 bra $L__BB0_41; ld.param.u64 %rd61, [_Z27dequant_gemv_group32_batch723DequantGemvKernelParams_param_0+8]; ld.param.u64 %rd60, [_Z27dequant_gemv_group32_batch723DequantGemvKernelParams_param_0]; cvta.to.global.u64 %rd36, %rd60; setp.eq.s64 %p23, %rd61, 0; mul.ftz.f32 %f52, %f68, %f504; mov.u32 %r225, %ctaid.x; cvt.s64.s32 %rd6, %r225; mul.wide.s32 %rd37, %r225, 2; add.s64 %rd7, %rd36, %rd37; mul.ftz.f32 %f53, %f68, %f505; add.s32 %r226, %r49, %r225; cvt.s64.s32 %rd8, %r49; mul.wide.s32 %rd38, %r49, 2; add.s64 %rd9, %rd7, %rd38; mul.ftz.f32 %f54, %f68, %f506; add.s32 %r227, %r226, %r49; cvt.s64.s32 %rd10, %r227; mul.wide.s32 %rd39, %r227, 2; add.s64 %rd12, %rd36, %rd39; mul.ftz.f32 %f55, %f68, %f507; mul.ftz.f32 %f56, %f68, %f508; mul.ftz.f32 %f57, %f68, %f509; mul.ftz.f32 %f58, %f68, %f510; @%p23 bra $L__BB0_40; ld.param.u64 %rd62, [_Z27dequant_gemv_group32_batch723DequantGemvKernelParams_param_0+8]; cvta.to.global.u64 %rd40, %rd62; shl.b64 %rd41, %rd6, 1; add.s64 %rd42, %rd40, %rd41; ld.global.u16 %rs240, [%rd42]; // begin inline asm { cvt.f32.f16 %f476, %rs240;} // end inline asm fma.rn.ftz.f32 %f477, %f69, %f476, %f52; // begin inline asm { cvt.rn.f16.f32 %rs241, %f477;} // end inline asm st.global.u16 [%rd7], %rs241; shl.b64 %rd43, %rd8, 1; add.s64 %rd44, %rd42, %rd43; ld.global.u16 %rs242, [%rd44]; // begin inline asm { cvt.f32.f16 %f478, %rs242;} // end inline asm fma.rn.ftz.f32 %f479, %f69, %f478, %f53; // begin inline asm { cvt.rn.f16.f32 %rs243, %f479;} // end inline asm st.global.u16 [%rd9], %rs243; shl.b64 %rd45, %rd10, 1; add.s64 %rd46, %rd40, %rd45; ld.global.u16 %rs244, [%rd46]; // begin inline asm { cvt.f32.f16 %f480, %rs244;} // end inline asm fma.rn.ftz.f32 %f481, %f69, %f480, %f54; // begin inline asm { cvt.rn.f16.f32 %rs245, %f481;} // end inline asm st.global.u16 [%rd12], %rs245; add.s64 %rd47, %rd46, %rd43; ld.global.u16 %rs246, [%rd47]; // begin inline asm { cvt.f32.f16 %f482, %rs246;} // end inline asm fma.rn.ftz.f32 %f483, %f69, %f482, %f55; // begin inline asm { cvt.rn.f16.f32 %rs247, %f483;} // end inline asm add.s64 %rd48, %rd12, %rd43; st.global.u16 [%rd48], %rs247; add.s64 %rd49, %rd47, %rd43; ld.global.u16 %rs248, [%rd49]; // begin inline asm { cvt.f32.f16 %f484, %rs248;} // end inline asm fma.rn.ftz.f32 %f485, %f69, %f484, %f56; // begin inline asm { cvt.rn.f16.f32 %rs249, %f485;} // end inline asm add.s64 %rd50, %rd48, %rd43; st.global.u16 [%rd50], %rs249; add.s64 %rd51, %rd49, %rd43; ld.global.u16 %rs250, [%rd51]; // begin inline asm { cvt.f32.f16 %f486, %rs250;} // end inline asm fma.rn.ftz.f32 %f487, %f69, %f486, %f57; // begin inline asm { cvt.rn.f16.f32 %rs251, %f487;} // end inline asm add.s64 %rd52, %rd50, %rd43; st.global.u16 [%rd52], %rs251; add.s64 %rd53, %rd51, %rd43; ld.global.u16 %rs252, [%rd53]; // begin inline asm { cvt.f32.f16 %f488, %rs252;} // end inline asm fma.rn.ftz.f32 %f489, %f69, %f488, %f58; // begin inline asm { cvt.rn.f16.f32 %rs253, %f489;} // end inline asm add.s64 %rd54, %rd52, %rd43; st.global.u16 [%rd54], %rs253; bra.uni $L__BB0_41; $L__BB0_40: // begin inline asm { cvt.rn.f16.f32 %rs254, %f52;} // end inline asm st.global.u16 [%rd7], %rs254; // begin inline asm { cvt.rn.f16.f32 %rs255, %f53;} // end inline asm st.global.u16 [%rd9], %rs255; // begin inline asm { cvt.rn.f16.f32 %rs256, %f54;} // end inline asm st.global.u16 [%rd12], %rs256; // begin inline asm { cvt.rn.f16.f32 %rs257, %f55;} // end inline asm shl.b64 %rd55, %rd8, 1; add.s64 %rd56, %rd12, %rd55; st.global.u16 [%rd56], %rs257; // begin inline asm { 
cvt.rn.f16.f32 %rs258, %f56;} // end inline asm add.s64 %rd57, %rd56, %rd55; st.global.u16 [%rd57], %rs258; // begin inline asm { cvt.rn.f16.f32 %rs259, %f57;} // end inline asm add.s64 %rd58, %rd57, %rd55; st.global.u16 [%rd58], %rs259; // begin inline asm { cvt.rn.f16.f32 %rs260, %f58;} // end inline asm add.s64 %rd59, %rd58, %rd55; st.global.u16 [%rd59], %rs260; $L__BB0_41: ret; } // .globl _ZN3cub11EmptyKernelIvEEvv .visible .entry _ZN3cub11EmptyKernelIvEEvv() { ret; }
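//
// [editor's annotation, not compiler output: both modules above are sm_60 PTX
// from CUDA 11.8 for instantiations of the same templated kernel,
// gemv_int4<4, 32, BATCH> (BATCH = 5 and 7, per the mangled temp_storage
// names). Each thread unpacks 8 int4 weights per u32, dequantizes them with a
// per-32-element group scale and a 4-bit zero point, FMAs against BATCH
// half-precision columns, block-reduces through shared memory plus warp
// shuffles, and thread (0,0) scales the sums by the first f32 parameter,
// optionally adding the second f32 parameter times values from a second half
// buffer before the half stores. A compact CUDA sketch of the unpack step,
// illustrative names only:
//
//   uint32_t packed = qweight[idx];            // eight 4-bit weights
//   for (int i = 0; i < 8; ++i) {
//       int q = (packed >> (4 * i)) & 0xF;     // extract one nibble
//       float w = (float(q) - zero) * scale;   // group-wise dequantization
//       /* accumulate w * x[b] into acc[b] for each batch column b */
//   }
// ]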