l .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_da7175d26thrust6system6detail10sequential3seqE[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_da7175d26thrust12placeholders2_1E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_da7175d26thrust12placeholders2_2E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_da7175d26thrust12placeholders2_3E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_da7175d26thrust12placeholders2_4E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_da7175d26thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_da7175d26thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_da7175d26thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_da7175d26thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_da7175d26thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_da7175d26thrust12placeholders3_10E[1];

// Kernel: dequant_gemv_group32_batch4(DequantGemvKernelParams) -- int4-dequant GEMV, group size 32, batch 4
.visible .entry _Z27dequant_gemv_group32_batch423DequantGemvKernelParams(
    .param .align 8 .b8 _Z27dequant_gemv_group32_batch423DequantGemvKernelParams_param_0[80]
)
{
    .reg .pred %p<21>;
    .reg .b16 %rs<195>;
    .reg .f32 %f<334>;
    .reg .b32 %r<173>;
    .reg .b64 %rd<67>;
    // demoted variable
    .shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi32ELi4EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[2560];

    ld.param.v2.u32 {%r25, %r26}, [_Z27dequant_gemv_group32_batch423DequantGemvKernelParams_param_0+48];
    ld.param.v2.u32 {%r27, %r28}, [_Z27dequant_gemv_group32_batch423DequantGemvKernelParams_param_0+56];
    ld.param.v2.f32 {%f51, %f52}, [_Z27dequant_gemv_group32_batch423DequantGemvKernelParams_param_0+64];
    ld.param.v4.u8 {%rs60, %rs61, %rs62, %rs63}, [_Z27dequant_gemv_group32_batch423DequantGemvKernelParams_param_0+72];
    ld.param.u64 %rd18, [_Z27dequant_gemv_group32_batch423DequantGemvKernelParams_param_0+40];
    ld.param.u64 %rd17, [_Z27dequant_gemv_group32_batch423DequantGemvKernelParams_param_0+32];
    ld.param.u64 %rd16, [_Z27dequant_gemv_group32_batch423DequantGemvKernelParams_param_0+24];
    ld.param.u64 %rd15, [_Z27dequant_gemv_group32_batch423DequantGemvKernelParams_param_0+16];
    ld.param.u64 %rd14, [_Z27dequant_gemv_group32_batch423DequantGemvKernelParams_param_0+8];
    mov.u32 %r1, %ctaid.x;
    cvta.to.global.u64 %rd2, %rd14;
    mov.u32 %r172, %tid.y;
    shl.b32 %r29, %r172, 5;
    mov.u32 %r3, %tid.x;
    add.s32 %r171, %r29, %r3;
    setp.ge.u32 %p1, %r171, %r27;
    mov.f32 %f318, 0f00000000;
    mov.f32 %f319, %f318;
    mov.f32 %f320, %f318;
    mov.f32 %f321, %f318;
    @%p1 bra $L__BB0_9;

    cvta.to.global.u64 %rd3, %rd15;
    mul.lo.s32 %r7, %r27, %r1;
    shr.u32 %r8, %r3, 2;
    mul.lo.s32 %r9, %r28, %r1;
    shl.b16 %rs2, %rs60, 3;

// Main loop: each thread loads one u32 of eight packed int4 weights and the group's fp16 scale
$L__BB0_2:
    add.s32 %r31, %r171, %r7;
    mul.wide.u32 %rd25, %r31, 4;
    add.s64 %rd20, %rd16, %rd25;
    // begin inline asm
    createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd19, 1.0;
    // end inline asm
    // begin inline asm
    ld.global.L1::no_allocate.L2::cache_hint.u32 %r30, [%rd20], %rd19;
    // end inline asm
    shl.b32 %r32, %r172, 3;
    add.s32 %r13, %r32, %r8;
    add.s32 %r14, %r13, %r9;
    mul.wide.s32 %rd26, %r14, 2;
    add.s64 %rd23, %rd18, %rd26;
    // begin inline asm
    createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd22, 1.0;
    // end inline asm
    // begin inline asm
    ld.global.L1::evict_last.L2::cache_hint.u16 %rs68, [%rd23], %rd22;
    // end inline asm
    // begin inline asm
    { cvt.f32.f16 %f57, %rs68;}
    // end inline asm
    setp.eq.s64 %p2, %rd17, 0;
    mov.u16 %rs194, %rs2;
    @%p2 bra $L__BB0_4;

    // Optional zero points: two 4-bit zeros per byte; pick the nibble for this group
    shr.u32 %r33, %r14, 31;
    add.s32 %r34, %r14, %r33;
    shr.s32 %r35, %r34, 1;
    cvt.s64.s32 %rd30, %r35;
    add.s64 %rd28, %rd17, %rd30;
    // begin inline asm
    createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd27, 1.0;
    // end inline asm
    // begin inline asm
    ld.global.L1::evict_last.L2::cache_hint.u8 %rs70, [%rd28], %rd27;
    // end inline asm
    cvt.u32.u16 %r36, %rs70;
    and.b32 %r37, %r36, 255;
    shl.b32 %r38, %r13, 2;
    and.b32 %r39, %r38, 4;
    shr.u32 %r40, %r37, %r39;
    cvt.u16.u32 %rs71, %r40;
    and.b16 %rs194, %rs71, 15;

$L__BB0_4:
    shl.b32 %r15, %r171, 3;
    setp.ge.s32 %p3, %r15, %r25;
    @%p3 bra $L__BB0_8;

    // Load eight fp16 activations from each of the 4 batch rows (row stride %r25 elements)
    setp.eq.s16 %p4, %rs60, 0;
    mul.wide.s32 %rd31, %r15, 2;
    add.s64 %rd32, %rd3, %rd31;
    ld.global.v4.u32 {%r41, %r42, %r43, %r44}, [%rd32];
    mul.wide.s32 %rd33, %r25, 2;
    add.s64 %rd34, %rd32, %rd33;
    ld.global.v4.u32 {%r49, %r50, %r51, %r52}, [%rd34];
    add.s32 %r57, %r25, %r15;
    add.s32 %r58, %r57, %r25;
    mul.wide.s32 %rd35, %r58, 2;
    add.s64 %rd36, %rd3, %rd35;
    ld.global.v4.u32 {%r59, %r60, %r61, %r62}, [%rd36];
    add.s32 %r67, %r58, %r25;
    mul.wide.s32 %rd37, %r67, 2;
    add.s64 %rd38, %rd3, %rd37;
    ld.global.v4.u32 {%r68, %r69, %r70, %r71}, [%rd38];
    shr.u16 %rs73, %rs194, 3;
    and.b16 %rs74, %rs73, 1;
    setp.eq.b16 %p5, %rs74, 1;
    and.pred %p6, %p4, %p5;
    selp.b16 %rs75, -16, 0, %p6;
    or.b16 %rs76, %rs75, %rs194;
    cvt.s16.s8 %rs77, %rs76;
    cvt.rn.f32.s16 %f6, %rs77;
    cvt.u16.u32 %rs5, %r30;
    and.b16 %rs6, %rs5, 15;
    mov.b32 {%rs7, %rs13}, %r41;
    mov.b32 {%rs8, %rs14}, %r49;
    mov.b32 {%rs9, %rs15}, %r59;
    mov.b32 {%rs10, %rs16}, %r68;
    shr.u32 %r76, %r30, 4;
    cvt.u16.u32 %rs11, %r76;
    and.b16 %rs12, %rs11, 15;
    shr.u32 %r77, %r30, 8;
    cvt.u16.u32 %rs17, %r77;
    and.b16 %rs18, %rs17, 15;
    mov.b32 {%rs19, %rs25}, %r42;
    mov.b32 {%rs20, %rs26}, %r50;
    mov.b32 {%rs21, %rs27}, %r60;
    mov.b32 {%rs22, %rs28}, %r69;
    shr.u32 %r78, %r30, 12;
    cvt.u16.u32 %rs23, %r78;
    and.b16 %rs24, %rs23, 15;
    shr.u32 %r79, %r30, 16;
    cvt.u16.u32 %rs29, %r79;
    and.b16 %rs30, %rs29, 15;
    mov.b32 {%rs31, %rs37}, %r43;
    mov.b32 {%rs32, %rs38}, %r51;
    mov.b32 {%rs33, %rs39}, %r61;
    mov.b32 {%rs34, %rs40}, %r70;
    shr.u32 %r80, %r30, 20;
    cvt.u16.u32 %rs35, %r80;
    and.b16 %rs36, %rs35, 15;
    shr.u32 %r81, %r30, 24;
    cvt.u16.u32 %rs41, %r81;
    and.b16 %rs42, %rs41, 15;
    mov.b32 {%rs43, %rs48}, %r44;
    mov.b32 {%rs44, %rs49}, %r52;
    mov.b32 {%rs45, %rs50}, %r62;
    mov.b32 {%rs46, %rs51}, %r71;
    shr.u32 %r82, %r30, 28;
    cvt.u16.u32 %rs47, %r82;
    // Dequant paths: fall-through treats nibbles as unsigned; $L__BB0_7 sign-extends them first
    @%p4 bra $L__BB0_7;

    // Unsigned path: acc[b] += scale * (q - zero) * x[b][k], unrolled over the 8 nibbles and 4 batch rows
    cvt.rn.f32.s16 %f90, %rs6;
    sub.ftz.f32 %f91, %f90, %f6;
    mul.ftz.f32 %f92, %f57, %f91;
    // begin inline asm
    { cvt.f32.f16 %f58, %rs7;}
    // end inline asm
    fma.rn.ftz.f32 %f93, %f92, %f58, %f318;
    // begin inline asm
    { cvt.f32.f16 %f59, %rs8;}
    // end inline asm
    fma.rn.ftz.f32 %f94, %f92, %f59, %f319;
    // begin inline asm
    { cvt.f32.f16 %f60, %rs9;}
    // end inline asm
    fma.rn.ftz.f32 %f95, %f92, %f60, %f320;
    // begin inline asm
    { cvt.f32.f16 %f61, %rs10;}
    // end inline asm
    fma.rn.ftz.f32 %f96, %f92, %f61, %f321;
    cvt.rn.f32.s16 %f97, %rs12;
    sub.ftz.f32 %f98, %f97, %f6;
    mul.ftz.f32 %f99, %f57, %f98;
    // begin inline asm
    { cvt.f32.f16 %f62, %rs13;}
    // end inline asm
    fma.rn.ftz.f32 %f100, %f99, %f62, %f93;
    // begin inline asm
    { cvt.f32.f16 %f63, %rs14;}
    // end inline asm
    fma.rn.ftz.f32 %f101, %f99, %f63, %f94;
    // begin inline asm
    { cvt.f32.f16 %f64, %rs15;}
    // end inline asm
    fma.rn.ftz.f32 %f102, %f99, %f64, %f95;
    // begin inline asm
    { cvt.f32.f16 %f65, %rs16;}
    // end inline asm
    fma.rn.ftz.f32 %f103, %f99, %f65, %f96;
    cvt.rn.f32.s16 %f104, %rs18;
    sub.ftz.f32 %f105, %f104, %f6;
    mul.ftz.f32 %f106, %f57, %f105;
    // begin inline asm
    { cvt.f32.f16 %f66, %rs19;}
    // end inline asm
    fma.rn.ftz.f32 %f107, %f106, %f66, %f100;
    // begin inline asm
    { cvt.f32.f16 %f67, %rs20;}
    // end inline asm
    fma.rn.ftz.f32 %f108, %f106, %f67, %f101;
    // begin inline asm
    { cvt.f32.f16 %f68, %rs21;}
    // end inline asm
    fma.rn.ftz.f32 %f109, %f106, %f68, %f102;
    // begin inline asm
    { cvt.f32.f16 %f69, %rs22;}
    // end inline asm
    fma.rn.ftz.f32 %f110, %f106, %f69, %f103;
    cvt.rn.f32.s16 %f111, %rs24;
    sub.ftz.f32 %f112, %f111, %f6;
    mul.ftz.f32 %f113, %f57, %f112;
    // begin inline asm
    { cvt.f32.f16 %f70, %rs25;}
    // end inline asm
    fma.rn.ftz.f32 %f114, %f113, %f70, %f107;
    // begin inline asm
    { cvt.f32.f16 %f71, %rs26;}
    // end inline asm
    fma.rn.ftz.f32 %f115, %f113, %f71, %f108;
    // begin inline asm
    { cvt.f32.f16 %f72, %rs27;}
    // end inline asm
    fma.rn.ftz.f32 %f116, %f113, %f72, %f109;
    // begin inline asm
    { cvt.f32.f16 %f73, %rs28;}
    // end inline asm
    fma.rn.ftz.f32 %f117, %f113, %f73, %f110;
    cvt.rn.f32.s16 %f118, %rs30;
    sub.ftz.f32 %f119, %f118, %f6;
    mul.ftz.f32 %f120, %f57, %f119;
    // begin inline asm
    { cvt.f32.f16 %f74, %rs31;}
    // end inline asm
    fma.rn.ftz.f32 %f121, %f120, %f74, %f114;
    // begin inline asm
    { cvt.f32.f16 %f75, %rs32;}
    // end inline asm
    fma.rn.ftz.f32 %f122, %f120, %f75, %f115;
    // begin inline asm
    { cvt.f32.f16 %f76, %rs33;}
    // end inline asm
    fma.rn.ftz.f32 %f123, %f120, %f76, %f116;
    // begin inline asm
    { cvt.f32.f16 %f77, %rs34;}
    // end inline asm
    fma.rn.ftz.f32 %f124, %f120, %f77, %f117;
    cvt.rn.f32.s16 %f125, %rs36;
    sub.ftz.f32 %f126, %f125, %f6;
    mul.ftz.f32 %f127, %f57, %f126;
    // begin inline asm
    { cvt.f32.f16 %f78, %rs37;}
    // end inline asm
    fma.rn.ftz.f32 %f128, %f127, %f78, %f121;
    // begin inline asm
    { cvt.f32.f16 %f79, %rs38;}
    // end inline asm
    fma.rn.ftz.f32 %f129, %f127, %f79, %f122;
    // begin inline asm
    { cvt.f32.f16 %f80, %rs39;}
    // end inline asm
    fma.rn.ftz.f32 %f130, %f127, %f80, %f123;
    // begin inline asm
    { cvt.f32.f16 %f81, %rs40;}
    // end inline asm
    fma.rn.ftz.f32 %f131, %f127, %f81, %f124;
    cvt.rn.f32.s16 %f132, %rs42;
    sub.ftz.f32 %f133, %f132, %f6;
    mul.ftz.f32 %f134, %f57, %f133;
    // begin inline asm
    { cvt.f32.f16 %f82, %rs43;}
    // end inline asm
    fma.rn.ftz.f32 %f135, %f134, %f82, %f128;
    // begin inline asm
    { cvt.f32.f16 %f83, %rs44;}
    // end inline asm
    fma.rn.ftz.f32 %f136, %f134, %f83, %f129;
    // begin inline asm
    { cvt.f32.f16 %f84, %rs45;}
    // end inline asm
    fma.rn.ftz.f32 %f137, %f134, %f84, %f130;
    // begin inline asm
    { cvt.f32.f16 %f85, %rs46;}
    // end inline asm
    fma.rn.ftz.f32 %f138, %f134, %f85, %f131;
    cvt.rn.f32.s16 %f139, %rs47;
    sub.ftz.f32 %f140, %f139, %f6;
    mul.ftz.f32 %f141, %f57, %f140;
    // begin inline asm
    { cvt.f32.f16 %f86, %rs48;}
    // end inline asm
    fma.rn.ftz.f32 %f318, %f141, %f86, %f135;
    // begin inline asm
    { cvt.f32.f16 %f87, %rs49;}
    // end inline asm
    fma.rn.ftz.f32 %f319, %f141, %f87, %f136;
    // begin inline asm
    { cvt.f32.f16 %f88, %rs50;}
    // end inline asm
    fma.rn.ftz.f32 %f320, %f141, %f88, %f137;
    // begin inline asm
    { cvt.f32.f16 %f89, %rs51;}
    // end inline asm
    fma.rn.ftz.f32 %f321, %f141, %f89, %f138;
    bra.uni $L__BB0_8;

// Signed path: same unrolled FMA sequence, with each nibble sign-extended to a signed 4-bit value
$L__BB0_7:
    shl.b16 %rs142, %rs5, 4;
    cvt.s16.s8 %rs143, %rs142;
    shr.s16 %rs144, %rs143, 7;
    and.b16 %rs145, %rs144, -16;
    or.b16 %rs146, %rs145, %rs6;
    cvt.rn.f32.s16 %f174, %rs146;
    sub.ftz.f32 %f175, %f174, %f6;
    mul.ftz.f32 %f176, %f57, %f175;
    // begin inline asm
    { cvt.f32.f16 %f142, %rs7;}
    // end inline asm
    fma.rn.ftz.f32 %f177, %f176, %f142, %f318;
    // begin inline asm
    { cvt.f32.f16 %f143, %rs8;}
    // end inline asm
    fma.rn.ftz.f32 %f178, %f176, %f143, %f319;
    // begin inline asm
    { cvt.f32.f16 %f144, %rs9;}
    // end inline asm
    fma.rn.ftz.f32 %f179, %f176, %f144, %f320;
    // begin inline asm
    { cvt.f32.f16 %f145, %rs10;}
    // end inline asm
    fma.rn.ftz.f32 %f180, %f176, %f145, %f321;
    shl.b16 %rs147, %rs11, 4;
    cvt.s16.s8 %rs148, %rs147;
    shr.s16 %rs149, %rs148, 7;
    and.b16 %rs150, %rs149, -16;
    or.b16 %rs151, %rs150, %rs12;
    cvt.rn.f32.s16 %f181, %rs151;
    sub.ftz.f32 %f182, %f181, %f6;
    mul.ftz.f32 %f183, %f57, %f182;
    // begin inline asm
    { cvt.f32.f16 %f146, %rs13;}
    // end inline asm
    fma.rn.ftz.f32 %f184, %f183, %f146, %f177;
    // begin inline asm
    { cvt.f32.f16 %f147, %rs14;}
    // end inline asm
    fma.rn.ftz.f32 %f185, %f183, %f147, %f178;
    // begin inline asm
    { cvt.f32.f16 %f148, %rs15;}
    // end inline asm
    fma.rn.ftz.f32 %f186, %f183, %f148, %f179;
    // begin inline asm
    { cvt.f32.f16 %f149, %rs16;}
    // end inline asm
    fma.rn.ftz.f32 %f187, %f183, %f149, %f180;
    shl.b16 %rs152, %rs17, 4;
    cvt.s16.s8 %rs153, %rs152;
    shr.s16 %rs154, %rs153, 7;
    and.b16 %rs155, %rs154, -16;
    or.b16 %rs156, %rs155, %rs18;
    cvt.rn.f32.s16 %f188, %rs156;
    sub.ftz.f32 %f189, %f188, %f6;
    mul.ftz.f32 %f190, %f57, %f189;
    // begin inline asm
    { cvt.f32.f16 %f150, %rs19;}
    // end inline asm
    fma.rn.ftz.f32 %f191, %f190, %f150, %f184;
    // begin inline asm
    { cvt.f32.f16 %f151, %rs20;}
    // end inline asm
    fma.rn.ftz.f32 %f192, %f190, %f151, %f185;
    // begin inline asm
    { cvt.f32.f16 %f152, %rs21;}
    // end inline asm
    fma.rn.ftz.f32 %f193, %f190, %f152, %f186;
    // begin inline asm
    { cvt.f32.f16 %f153, %rs22;}
    // end inline asm
    fma.rn.ftz.f32 %f194, %f190, %f153, %f187;
    shl.b16 %rs157, %rs23, 4;
    cvt.s16.s8 %rs158, %rs157;
    shr.s16 %rs159, %rs158, 7;
    and.b16 %rs160, %rs159, -16;
    or.b16 %rs161, %rs160, %rs24;
    cvt.rn.f32.s16 %f195, %rs161;
    sub.ftz.f32 %f196, %f195, %f6;
    mul.ftz.f32 %f197, %f57, %f196;
    // begin inline asm
    { cvt.f32.f16 %f154, %rs25;}
    // end inline asm
    fma.rn.ftz.f32 %f198, %f197, %f154, %f191;
    // begin inline asm
    { cvt.f32.f16 %f155, %rs26;}
    // end inline asm
    fma.rn.ftz.f32 %f199, %f197, %f155, %f192;
    // begin inline asm
    { cvt.f32.f16 %f156, %rs27;}
    // end inline asm
    fma.rn.ftz.f32 %f200, %f197, %f156, %f193;
    // begin inline asm
    { cvt.f32.f16 %f157, %rs28;}
    // end inline asm
    fma.rn.ftz.f32 %f201, %f197, %f157, %f194;
    shl.b16 %rs162, %rs29, 4;
    cvt.s16.s8 %rs163, %rs162;
    shr.s16 %rs164, %rs163, 7;
    and.b16 %rs165, %rs164, -16;
    or.b16 %rs166, %rs165, %rs30;
    cvt.rn.f32.s16 %f202, %rs166;
    sub.ftz.f32 %f203, %f202, %f6;
    mul.ftz.f32 %f204, %f57, %f203;
    // begin inline asm
    { cvt.f32.f16 %f158, %rs31;}
    // end inline asm
    fma.rn.ftz.f32 %f205, %f204, %f158, %f198;
    // begin inline asm
    { cvt.f32.f16 %f159, %rs32;}
    // end inline asm
    fma.rn.ftz.f32 %f206, %f204, %f159, %f199;
    // begin inline asm
    { cvt.f32.f16 %f160, %rs33;}
    // end inline asm
    fma.rn.ftz.f32 %f207, %f204, %f160, %f200;
    // begin inline asm
    { cvt.f32.f16 %f161, %rs34;}
    // end inline asm
    fma.rn.ftz.f32 %f208, %f204, %f161, %f201;
    shl.b16 %rs167, %rs35, 4;
    cvt.s16.s8 %rs168, %rs167;
    shr.s16 %rs169, %rs168, 7;
    and.b16 %rs170, %rs169, -16;
    or.b16 %rs171, %rs170, %rs36;
    cvt.rn.f32.s16 %f209, %rs171;
    sub.ftz.f32 %f210, %f209, %f6;
    mul.ftz.f32 %f211, %f57, %f210;
    // begin inline asm
    { cvt.f32.f16 %f162, %rs37;}
    // end inline asm
    fma.rn.ftz.f32 %f212, %f211, %f162, %f205;
    // begin inline asm
    { cvt.f32.f16 %f163, %rs38;}
    // end inline asm
    fma.rn.ftz.f32 %f213, %f211, %f163, %f206;
    // begin inline asm
    { cvt.f32.f16 %f164, %rs39;}
    // end inline asm
    fma.rn.ftz.f32 %f214, %f211, %f164, %f207;
    // begin inline asm
    { cvt.f32.f16 %f165, %rs40;}
    // end inline asm
    fma.rn.ftz.f32 %f215, %f211, %f165, %f208;
    shl.b16 %rs172, %rs41, 4;
    cvt.s16.s8 %rs173, %rs172;
    shr.s16 %rs174, %rs173, 7;
    and.b16 %rs175, %rs174, -16;
    or.b16 %rs176, %rs175, %rs42;
    cvt.rn.f32.s16 %f216, %rs176;
    sub.ftz.f32 %f217, %f216, %f6;
    mul.ftz.f32 %f218, %f57, %f217;
    // begin inline asm
    { cvt.f32.f16 %f166, %rs43;}
    // end inline asm
    fma.rn.ftz.f32 %f219, %f218, %f166, %f212;
    // begin inline asm
    { cvt.f32.f16 %f167, %rs44;}
    // end inline asm
    fma.rn.ftz.f32 %f220, %f218, %f167, %f213;
    // begin inline asm
    { cvt.f32.f16 %f168, %rs45;}
    // end inline asm
    fma.rn.ftz.f32 %f221, %f218, %f168, %f214;
    // begin inline asm
    { cvt.f32.f16 %f169, %rs46;}
    // end inline asm
    fma.rn.ftz.f32 %f222, %f218, %f169, %f215;
    shl.b16 %rs177, %rs47, 4;
    cvt.s16.s8 %rs178, %rs177;
    shr.s16 %rs179, %rs178, 7;
    and.b16 %rs180, %rs179, -16;
    or.b16 %rs181, %rs180, %rs47;
    cvt.rn.f32.s16 %f223, %rs181;
    sub.ftz.f32 %f224, %f223, %f6;
    mul.ftz.f32 %f225, %f57, %f224;
    // begin inline asm
    { cvt.f32.f16 %f170, %rs48;}
    // end inline asm
    fma.rn.ftz.f32 %f318, %f225, %f170, %f219;
    // begin inline asm
    { cvt.f32.f16 %f171, %rs49;}
    // end inline asm
    fma.rn.ftz.f32 %f319, %f225, %f171, %f220;
    // begin inline asm
    { cvt.f32.f16 %f172, %rs50;}
    // end inline asm
    fma.rn.ftz.f32 %f320, %f225, %f172, %f221;
    // begin inline asm
    { cvt.f32.f16 %f173, %rs51;}
    // end inline asm
    fma.rn.ftz.f32 %f321, %f225, %f173, %f222;

// Advance this thread's element index by one full block (blockDim.x * blockDim.y = 128) and loop while in range
$L__BB0_8:
    add.s32 %r172, %r172, 4;
    shl.b32 %r83, %r172, 5;
    add.s32 %r171, %r83, %r3;
    setp.lt.u32 %p7, %r171, %r27;
    @%p7 bra $L__BB0_2;

// Block reduction: partial sums staged in shared memory, then warp-level shfl.sync.down tree reductions, once per batch accumulator
$L__BB0_9:
    mov.u32 %r164, %tid.y;
    shl.b32 %r163, %r164, 5;
    add.s32 %r162, %r163, %r3;
    shl.b32 %r84, %r162, 2;
    mov.u32 %r85, _ZZ9gemv_int4ILi4ELi32ELi4EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage;
    add.s32 %r86, %r85, %r84;
    setp.lt.u32 %p8, %r162, 32;
    @%p8 bra $L__BB0_11;

    add.s32 %r158, %r86, -112;
    st.shared.f32 [%r158], %f318;

$L__BB0_11:
    mov.u32 %r167, %tid.y;
    shl.b32 %r166, %r167, 5;
    add.s32 %r165, %r166, %r3;
    setp.gt.u32 %p9, %r165, 31;
    bar.sync 0;
    mad.lo.s32 %r19, %r165, 12, %r85;
    @%p9 bra $L__BB0_13;

    mov.u32 %r101, 16;
    ld.shared.f32 %f241, [%r19+16];
    add.ftz.f32 %f242, %f318, %f241;
    ld.shared.f32 %f243, [%r19+20];
    add.ftz.f32 %f244, %f242, %f243;
    ld.shared.f32 %f245, [%r19+24];
    add.ftz.f32 %f228, %f244, %f245;
    mov.u32 %r89, 1;
    mov.u32 %r102, 31;
    mov.u32 %r103, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f228, %r89, %r102, %r103; @p add.f32 r0, r0, %f228; mov.f32 %f226, r0;}
    // end inline asm
    mov.u32 %r92, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f226, %r92, %r102, %r103; @p add.f32 r0, r0, %f226; mov.f32 %f229, r0;}
    // end inline asm
    mov.u32 %r95, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f229, %r95, %r102, %r103; @p add.f32 r0, r0, %f229; mov.f32 %f232, r0;}
    // end inline asm
    mov.u32 %r98, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f232, %r98, %r102, %r103; @p add.f32 r0, r0, %f232; mov.f32 %f235, r0;}
    // end inline asm
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f235, %r101, %r102, %r103; @p add.f32 r0, r0, %f235; mov.f32 %f318, r0;}
    // end inline asm

$L__BB0_13:
    @%p8 bra $L__BB0_15;

    add.s32 %r159, %r86, -112;
    st.shared.f32 [%r159+640], %f319;

$L__BB0_15:
    bar.sync 0;
    @%p9 bra $L__BB0_17;

    ld.shared.f32 %f261, [%r19+656];
    add.ftz.f32 %f262, %f319, %f261;
    ld.shared.f32 %f263, [%r19+660];
    add.ftz.f32 %f264, %f262, %f263;
    ld.shared.f32 %f265, [%r19+664];
    add.ftz.f32 %f248, %f264, %f265;
    mov.u32 %r105, 1;
    mov.u32 %r118, 31;
    mov.u32 %r119, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f248, %r105, %r118, %r119; @p add.f32 r0, r0, %f248; mov.f32 %f246, r0;}
    // end inline asm
    mov.u32 %r108, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f246, %r108, %r118, %r119; @p add.f32 r0, r0, %f246; mov.f32 %f249, r0;}
    // end inline asm
    mov.u32 %r111, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f249, %r111, %r118, %r119; @p add.f32 r0, r0, %f249; mov.f32 %f252, r0;}
    // end inline asm
    mov.u32 %r114, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f252, %r114, %r118, %r119; @p add.f32 r0, r0, %f252; mov.f32 %f255, r0;}
    // end inline asm
    mov.u32 %r117, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f255, %r117, %r118, %r119; @p add.f32 r0, r0, %f255; mov.f32 %f319, r0;}
    // end inline asm

$L__BB0_17:
    @%p8 bra $L__BB0_19;

    add.s32 %r160, %r86, -112;
    st.shared.f32 [%r160+1280], %f320;

$L__BB0_19:
    bar.sync 0;
    @%p9 bra $L__BB0_21;

    ld.shared.f32 %f281, [%r19+1296];
    add.ftz.f32 %f282, %f320, %f281;
    ld.shared.f32 %f283, [%r19+1300];
    add.ftz.f32 %f284, %f282, %f283;
    ld.shared.f32 %f285, [%r19+1304];
    add.ftz.f32 %f268, %f284, %f285;
    mov.u32 %r121, 1;
    mov.u32 %r134, 31;
    mov.u32 %r135, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f268, %r121, %r134, %r135; @p add.f32 r0, r0, %f268; mov.f32 %f266, r0;}
    // end inline asm
    mov.u32 %r124, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f266, %r124, %r134, %r135; @p add.f32 r0, r0, %f266; mov.f32 %f269, r0;}
    // end inline asm
    mov.u32 %r127, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f269, %r127, %r134, %r135; @p add.f32 r0, r0, %f269; mov.f32 %f272, r0;}
    // end inline asm
    mov.u32 %r130, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f272, %r130, %r134, %r135; @p add.f32 r0, r0, %f272; mov.f32 %f275, r0;}
    // end inline asm
    mov.u32 %r133, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f275, %r133, %r134, %r135; @p add.f32 r0, r0, %f275; mov.f32 %f320, r0;}
    // end inline asm

$L__BB0_21:
    @%p8 bra $L__BB0_23;

    add.s32 %r161, %r86, -112;
    st.shared.f32 [%r161+1920], %f321;

$L__BB0_23:
    bar.sync 0;
    @%p9 bra $L__BB0_25;

    ld.shared.f32 %f301, [%r19+1936];
    add.ftz.f32 %f302, %f321, %f301;
    ld.shared.f32 %f303, [%r19+1940];
    add.ftz.f32 %f304, %f302, %f303;
    ld.shared.f32 %f305, [%r19+1944];
    add.ftz.f32 %f288, %f304, %f305;
    mov.u32 %r137, 1;
    mov.u32 %r150, 31;
    mov.u32 %r151, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f288, %r137, %r150, %r151; @p add.f32 r0, r0, %f288; mov.f32 %f286, r0;}
    // end inline asm
    mov.u32 %r140, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f286, %r140, %r150, %r151; @p add.f32 r0, r0, %f286; mov.f32 %f289, r0;}
    // end inline asm
    mov.u32 %r143, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f289, %r143, %r150, %r151; @p add.f32 r0, r0, %f289; mov.f32 %f292, r0;}
    // end inline asm
    mov.u32 %r146, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f292, %r146, %r150, %r151; @p add.f32 r0, r0, %f292; mov.f32 %f295, r0;}
    // end inline asm
    mov.u32 %r149, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f295, %r149, %r150, %r151; @p add.f32 r0, r0, %f295; mov.f32 %f321, r0;}
    // end inline asm

// Epilogue: thread (0,0) scales each sum by %f51, optionally adds %f52 * an fp16 operand loaded via param+8, and stores the four fp16 results
$L__BB0_25:
    mov.u32 %r168, %tid.y;
    or.b32 %r152, %r3, %r168;
    setp.ne.s32 %p16, %r152, 0;
    @%p16 bra $L__BB0_35;

    ld.param.u64 %rd63, [_Z27dequant_gemv_group32_batch423DequantGemvKernelParams_param_0+8];
    mov.u32 %r169, %ctaid.x;
    setp.eq.s64 %p17, %rd63, 0;
    mul.ftz.f32 %f330, %f51, %f318;
    cvt.s64.s32 %rd7, %r169;
    @%p17 bra $L__BB0_28;

    shl.b64 %rd39, %rd7, 1;
    add.s64 %rd40, %rd2, %rd39;
    ld.global.u16 %rs182, [%rd40];
    // begin inline asm
    { cvt.f32.f16 %f306, %rs182;}
    // end inline asm
    fma.rn.ftz.f32 %f330, %f52, %f306, %f330;

$L__BB0_28:
    ld.param.u64 %rd64, [_Z27dequant_gemv_group32_batch423DequantGemvKernelParams_param_0];
    mov.u32 %r170, %ctaid.x;
    // begin inline asm
    { cvt.rn.f16.f32 %rs183, %f330;}
    // end inline asm
    // begin inline asm
    createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd41, 1.0;
    // end inline asm
    shl.b64 %rd44, %rd7, 1;
    add.s64 %rd42, %rd64, %rd44;
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd42], %rs183, %rd41;
    // end inline asm
    mul.ftz.f32 %f331, %f51, %f319;
    add.s32 %r153, %r26, %r170;
    cvt.s64.s32 %rd10, %r153;
    @%p17 bra $L__BB0_30;

    shl.b64 %rd45, %rd10, 1;
    add.s64 %rd46, %rd2, %rd45;
    ld.global.u16 %rs185, [%rd46];
    // begin inline asm
    { cvt.f32.f16 %f308, %rs185;}
    // end inline asm
    fma.rn.ftz.f32 %f331, %f52, %f308, %f331;

$L__BB0_30:
    mul.wide.s32 %rd50, %r26, 2;
    add.s64 %rd48, %rd42, %rd50;
    // begin inline asm
    { cvt.rn.f16.f32 %rs186, %f331;}
    // end inline asm
    // begin inline asm
    createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd47, 1.0;
    // end inline asm
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd48], %rs186, %rd47;
    // end inline asm
    mul.ftz.f32 %f332, %f51, %f320;
    cvt.u32.u64 %r154, %rd10;
    add.s32 %r155, %r154, %r26;
    cvt.s64.s32 %rd11, %r155;
    @%p17 bra $L__BB0_32;

    shl.b64 %rd51, %rd11, 1;
    add.s64 %rd52, %rd2, %rd51;
    ld.global.u16 %rs188, [%rd52];
    // begin inline asm
    { cvt.f32.f16 %f310, %rs188;}
    // end inline asm
    fma.rn.ftz.f32 %f332, %f52, %f310, %f332;

$L__BB0_32:
    ld.param.u64 %rd65, [_Z27dequant_gemv_group32_batch423DequantGemvKernelParams_param_0];
    // begin inline asm
    { cvt.rn.f16.f32 %rs189, %f332;}
    // end inline asm
    // begin inline asm
    createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd53, 1.0;
    // end inline asm
    shl.b64 %rd56, %rd11, 1;
    add.s64 %rd54, %rd65, %rd56;
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd54], %rs189, %rd53;
    // end inline asm
    mul.ftz.f32 %f333, %f51, %f321;
    cvt.u32.u64 %r156, %rd11;
    add.s32 %r157, %r156, %r26;
    cvt.s64.s32 %rd12, %r157;
    @%p17 bra $L__BB0_34;

    shl.b64 %rd57, %rd12, 1;
    add.s64 %rd58, %rd2, %rd57;
    ld.global.u16 %rs191, [%rd58];
    // begin inline asm
    { cvt.f32.f16 %f312, %rs191;}
    // end inline asm
    fma.rn.ftz.f32 %f333, %f52, %f312, %f333;

$L__BB0_34:
    ld.param.u64 %rd66, [_Z27dequant_gemv_group32_batch423DequantGemvKernelParams_param_0];
    // begin inline asm
    { cvt.rn.f16.f32 %rs192, %f333;}
    // end inline asm
    // begin inline asm
    createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd59, 1.0;
    // end inline asm
    shl.b64 %rd62, %rd12, 1;
    add.s64 %rd60, %rd66, %rd62;
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd60], %rs192, %rd59;
    // end inline asm

$L__BB0_35:
    ret;
}

    // .globl _ZN3cub11EmptyKernelIvEEvv
// cub::EmptyKernel<void>() -- empty placeholder kernel emitted by the CUB headers
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
    ret;
}
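For orientation, here is a minimal CUDA C++ sketch (editorial, not the original gemv.cu source) of the dequantize-and-accumulate step that the unrolled loop body above compiles to: one 32-bit word of eight packed 4-bit weights is unpacked nibble by nibble, optionally sign-extended, offset by the group zero point, scaled by the group's fp16 scale, and FMA'd against eight fp16 activations from each of the four batch rows. The helper name, parameter names, and layout assumptions are inferred from the PTX and should not be read as the original kernel code.

#include <cuda_fp16.h>
#include <cstdint>

// Hypothetical helper mirroring one iteration of the PTX main loop.
// packed   : eight 4-bit weights in one 32-bit word
// scale    : per-group scale (the fp16 value loaded into %f57), as float
// zp       : per-group zero point (a nibble from the zeros tensor, or a flag-derived default)
// sign_ext : whether nibbles are sign-extended (the %rs60 flag tested in the PTX, assumed)
// x        : fp16 activations, 4 batch rows with leading dimension ldx
// k0       : column index of the first of the 8 weights
// acc[4]   : per-batch-row float accumulators (the PTX registers %f318..%f321)
__device__ void dequant8_accumulate(uint32_t packed, float scale, int zp,
                                    bool sign_ext, const __half* x, int ldx,
                                    int k0, float acc[4]) {
    #pragma unroll
    for (int j = 0; j < 8; ++j) {
        int q = (packed >> (4 * j)) & 0xF;      // extract one 4-bit weight
        if (sign_ext && (q & 0x8)) q -= 16;     // sign-extend the nibble
        float w = scale * float(q - zp);        // dequantize
        #pragma unroll
        for (int b = 0; b < 4; ++b)             // the weight is reused by all 4 batch rows
            acc[b] = fmaf(w, __half2float(x[b * ldx + k0 + j]), acc[b]);
    }
}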