// .globl	_Z28dequant_gemv_group128_batch123DequantGemvKernelParams
// _ZZ9gemv_int4ILi4ELi128ELi1EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage has been demoted

.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_351eb83f6thrust6system6detail10sequential3seqE[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_351eb83f6thrust12placeholders2_1E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_351eb83f6thrust12placeholders2_2E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_351eb83f6thrust12placeholders2_3E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_351eb83f6thrust12placeholders2_4E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_351eb83f6thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_351eb83f6thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_351eb83f6thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_351eb83f6thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_351eb83f6thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_351eb83f6thrust12placeholders3_10E[1];

.visible .entry _Z28dequant_gemv_group128_batch123DequantGemvKernelParams(
    .param .align 8 .b8 _Z28dequant_gemv_group128_batch123DequantGemvKernelParams_param_0[80]
)
{
    .reg .pred %p<12>;
    .reg .b16 %rs<327>;
    .reg .f32 %f<365>;
    .reg .b32 %r<159>;
    .reg .b64 %rd<37>;
    // demoted variable
    .shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi128ELi1EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[640];

    // unpack the 80-byte kernel-parameter struct by offset
    ld.param.v2.u32 {%r28, %r29}, [_Z28dequant_gemv_group128_batch123DequantGemvKernelParams_param_0+48];
    ld.param.v2.u32 {%r30, %r31}, [_Z28dequant_gemv_group128_batch123DequantGemvKernelParams_param_0+56];
    ld.param.v2.f32 {%f16, %f17}, [_Z28dequant_gemv_group128_batch123DequantGemvKernelParams_param_0+64];
    ld.param.v4.u8 {%rs81, %rs82, %rs83, %rs84}, [_Z28dequant_gemv_group128_batch123DequantGemvKernelParams_param_0+72];
    ld.param.u64 %rd13, [_Z28dequant_gemv_group128_batch123DequantGemvKernelParams_param_0+40];
    ld.param.u64 %rd12, [_Z28dequant_gemv_group128_batch123DequantGemvKernelParams_param_0+32];
    ld.param.u64 %rd11, [_Z28dequant_gemv_group128_batch123DequantGemvKernelParams_param_0+24];
    ld.param.u64 %rd10, [_Z28dequant_gemv_group128_batch123DequantGemvKernelParams_param_0+16];
    mov.u32 %r1, %ctaid.x;
    mov.u32 %r158, %tid.y;
    shl.b32 %r32, %r158, 5;
    mov.u32 %r3, %tid.x;
    add.s32 %r4, %r32, %r3;          // flat thread id: tid.y*32 + tid.x
    shl.b32 %r156, %r4, 2;
    setp.ge.u32 %p1, %r156, %r30;
    mov.f32 %f361, 0f00000000;       // per-thread accumulator
    @%p1 bra $L__BB0_9;

    mul.lo.s32 %r7, %r30, %r1;
    shr.u32 %r8, %r3, 2;
    mul.lo.s32 %r9, %r31, %r1;
    shl.b16 %rs2, %rs81, 3;          // default zero point (8 when the flag is set)
    cvta.to.global.u64 %rd4, %rd10;
    mov.u32 %r157, %r4;

$L__BB0_2:
    add.s32 %r37, %r156, %r7;
    mul.wide.u32 %rd20, %r37, 4;
    add.s64 %rd15, %rd11, %rd20;
    // begin inline asm
    createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd14, 1.0;
    // end inline asm
    // begin inline asm
    ld.global.L1::no_allocate.L2::cache_hint.v4.u32 {%r33,%r34,%r35,%r36}, [%rd15], %rd14;
    // end inline asm
    shl.b32 %r38, %r158, 3;
    add.s32 %r18, %r38, %r8;
    add.s32 %r19, %r18, %r9;
    mul.wide.s32 %rd21, %r19, 2;
    add.s64 %rd18, %rd13, %rd21;
    // begin inline asm
    createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd17, 1.0;
    // end inline asm
    // begin inline asm
    ld.global.L1::evict_last.L2::cache_hint.u16 %rs89, [%rd18], %rd17;
    // end inline asm
    // begin inline asm
    { cvt.f32.f16 %f19, %rs89;}
    // end inline asm
    setp.eq.s64 %p2, %rd12, 0;
    mov.u16 %rs326, %rs2;
    @%p2 bra $L__BB0_4;
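// ---------------------------------------------------------------------------
// Note: the createpolicy / ld.global...L2::cache_hint pairs in $L__BB0_2 above
// stream the packed int4 weights through L1 without allocating (each word is
// read exactly once) and keep the per-group scale resident (L1::evict_last),
// both under an L2 evict_first policy. A CUDA-side sketch of the weight load,
// using the same inline PTX (requires sm_80+; `load_streamed_u4` is a
// hypothetical helper name, not from the original source):
//
//   __device__ uint4 load_streamed_u4(const uint4* __restrict__ p) {
//       unsigned long long pol;
//       uint4 v;
//       // Build an L2 "evict first" access-policy descriptor.
//       asm("createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %0, 1.0;"
//           : "=l"(pol));
//       // Streaming vector load: no L1 allocation, L2 policy applied.
//       asm volatile(
//           "ld.global.L1::no_allocate.L2::cache_hint.v4.u32 {%0,%1,%2,%3}, [%4], %5;"
//           : "=r"(v.x), "=r"(v.y), "=r"(v.z), "=r"(v.w)
//           : "l"(p), "l"(pol));
//       return v;
//   }
// ---------------------------------------------------------------------------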
    // zeros pointer is non-null: fetch the packed 4-bit zero point
    shr.u32 %r39, %r19, 31;
    add.s32 %r40, %r19, %r39;
    shr.s32 %r41, %r40, 1;           // byte index: two 4-bit zero points per byte
    cvt.s64.s32 %rd25, %r41;
    add.s64 %rd23, %rd12, %rd25;
    // begin inline asm
    createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd22, 1.0;
    // end inline asm
    // begin inline asm
    ld.global.L1::evict_last.L2::cache_hint.u8 %rs91, [%rd23], %rd22;
    // end inline asm
    cvt.u32.u16 %r42, %rs91;
    and.b32 %r43, %r42, 255;
    shl.b32 %r44, %r18, 2;
    and.b32 %r45, %r44, 4;           // select low/high nibble of the zero-point byte
    shr.u32 %r46, %r43, %r45;
    cvt.u16.u32 %rs92, %r46;
    and.b16 %rs326, %rs92, 15;

$L__BB0_4:
    shl.b32 %r20, %r157, 5;
    setp.ge.s32 %p3, %r20, %r28;
    @%p3 bra $L__BB0_8;

    setp.eq.s16 %p4, %rs81, 0;
    shr.u16 %rs94, %rs326, 3;
    and.b16 %rs95, %rs94, 1;
    setp.eq.b16 %p5, %rs95, 1;
    and.pred %p6, %p4, %p5;
    selp.b16 %rs96, -16, 0, %p6;     // sign-extend the zero point in the signed case
    or.b16 %rs97, %rs96, %rs326;
    cvt.s16.s8 %rs98, %rs97;
    cvt.rn.f32.s16 %f3, %rs98;
    mul.wide.s32 %rd26, %r20, 2;
    add.s64 %rd5, %rd4, %rd26;
    ld.global.v4.u32 {%r47, %r48, %r49, %r50}, [%rd5];   // 8 fp16 activations
    // split the four packed weight words %r33..%r36 into 32 four-bit quants
    cvt.u16.u32 %rs5, %r33;
    and.b16 %rs6, %rs5, 15;
    mov.b32 {%rs7, %rs10}, %r47;
    shr.u32 %r55, %r33, 4;
    cvt.u16.u32 %rs8, %r55;
    and.b16 %rs9, %rs8, 15;
    shr.u32 %r56, %r33, 8;
    cvt.u16.u32 %rs11, %r56;
    and.b16 %rs12, %rs11, 15;
    mov.b32 {%rs13, %rs16}, %r48;
    shr.u32 %r57, %r33, 12;
    cvt.u16.u32 %rs14, %r57;
    and.b16 %rs15, %rs14, 15;
    shr.u32 %r58, %r33, 16;
    cvt.u16.u32 %rs17, %r58;
    and.b16 %rs18, %rs17, 15;
    mov.b32 {%rs19, %rs22}, %r49;
    shr.u32 %r59, %r33, 20;
    cvt.u16.u32 %rs20, %r59;
    and.b16 %rs21, %rs20, 15;
    shr.u32 %r60, %r33, 24;
    cvt.u16.u32 %rs23, %r60;
    and.b16 %rs24, %rs23, 15;
    mov.b32 {%rs25, %rs27}, %r50;
    shr.u32 %r61, %r33, 28;
    cvt.u16.u32 %rs26, %r61;
    cvt.u16.u32 %rs28, %r34;
    and.b16 %rs29, %rs28, 15;
    shr.u32 %r62, %r34, 4;
    cvt.u16.u32 %rs30, %r62;
    and.b16 %rs31, %rs30, 15;
    shr.u32 %r63, %r34, 8;
    cvt.u16.u32 %rs32, %r63;
    and.b16 %rs33, %rs32, 15;
    shr.u32 %r64, %r34, 12;
    cvt.u16.u32 %rs34, %r64;
    and.b16 %rs35, %rs34, 15;
    shr.u32 %r65, %r34, 16;
    cvt.u16.u32 %rs36, %r65;
    and.b16 %rs37, %rs36, 15;
    shr.u32 %r66, %r34, 20;
    cvt.u16.u32 %rs38, %r66;
    and.b16 %rs39, %rs38, 15;
    shr.u32 %r67, %r34, 24;
    cvt.u16.u32 %rs40, %r67;
    and.b16 %rs41, %rs40, 15;
    shr.u32 %r68, %r34, 28;
    cvt.u16.u32 %rs42, %r68;
    cvt.u16.u32 %rs43, %r35;
    and.b16 %rs44, %rs43, 15;
    shr.u32 %r69, %r35, 4;
    cvt.u16.u32 %rs45, %r69;
    and.b16 %rs46, %rs45, 15;
    shr.u32 %r70, %r35, 8;
    cvt.u16.u32 %rs47, %r70;
    and.b16 %rs48, %rs47, 15;
    shr.u32 %r71, %r35, 12;
    cvt.u16.u32 %rs49, %r71;
    and.b16 %rs50, %rs49, 15;
    shr.u32 %r72, %r35, 16;
    cvt.u16.u32 %rs51, %r72;
    and.b16 %rs52, %rs51, 15;
    shr.u32 %r73, %r35, 20;
    cvt.u16.u32 %rs53, %r73;
    and.b16 %rs54, %rs53, 15;
    shr.u32 %r74, %r35, 24;
    cvt.u16.u32 %rs55, %r74;
    and.b16 %rs56, %rs55, 15;
    shr.u32 %r75, %r35, 28;
    cvt.u16.u32 %rs57, %r75;
    cvt.u16.u32 %rs58, %r36;
    and.b16 %rs59, %rs58, 15;
    shr.u32 %r76, %r36, 4;
    cvt.u16.u32 %rs60, %r76;
    and.b16 %rs61, %rs60, 15;
    shr.u32 %r77, %r36, 8;
    cvt.u16.u32 %rs62, %r77;
    and.b16 %rs63, %rs62, 15;
    shr.u32 %r78, %r36, 12;
    cvt.u16.u32 %rs64, %r78;
    and.b16 %rs65, %rs64, 15;
    shr.u32 %r79, %r36, 16;
    cvt.u16.u32 %rs66, %r79;
    and.b16 %rs67, %rs66, 15;
    shr.u32 %r80, %r36, 20;
    cvt.u16.u32 %rs68, %r80;
    and.b16 %rs69, %rs68, 15;
    shr.u32 %r81, %r36, 24;
    cvt.u16.u32 %rs70, %r81;
    and.b16 %rs71, %rs70, 15;
    shr.u32 %r82, %r36, 28;
    cvt.u16.u32 %rs72, %r82;
    @%p4 bra $L__BB0_7;              // take the signed int4 path

    // unsigned path: acc += scale * (q - zero) * x, fully unrolled over 32 elements
    cvt.rn.f32.s16 %f52, %rs6;
    sub.ftz.f32 %f53, %f52, %f3;
    mul.ftz.f32 %f54, %f19, %f53;
    // begin inline asm
    { cvt.f32.f16 %f20, %rs7;}
    // end inline asm
    fma.rn.ftz.f32 %f55, %f54, %f20, %f361;
    cvt.rn.f32.s16 %f56, %rs9;
    sub.ftz.f32 %f57, %f56, %f3;
    mul.ftz.f32 %f58, %f19, %f57;
    // begin inline asm
    { cvt.f32.f16 %f21, %rs10;}
    // end inline asm
    fma.rn.ftz.f32 %f59, %f58, %f21, %f55;
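// ---------------------------------------------------------------------------
// The shr/and ladder above splits the four weight words %r33..%r36 into 32
// four-bit quants, and each cvt/sub/mul/fma group in this chain accumulates
// one element of acc += scale * (q - zero) * x. The zero point itself was
// fetched one byte at a time near $L__BB0_4, with bit 0 of the group index
// selecting the low or high nibble. A minimal CUDA sketch of one such step
// (helper names `unpack_nibble` / `dequant_fma_step` are hypothetical):
//
//   #include <cuda_fp16.h>
//
//   __device__ __forceinline__ unsigned unpack_nibble(unsigned word, int k) {
//       return (word >> (4 * k)) & 0xFu;      // k-th int4 quant, k in [0, 8)
//   }
//
//   __device__ __forceinline__ float dequant_fma_step(
//       float acc, unsigned q, float zero, float scale, __half x) {
//       // acc += scale * (q - zero) * x, matching the cvt/sub/mul/fma pattern
//       // above (the .ftz flush-to-zero behaviour is not modelled here).
//       return fmaf(scale * ((float)q - zero), __half2float(x), acc);
//   }
// ---------------------------------------------------------------------------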
    cvt.rn.f32.s16 %f60, %rs12;
    sub.ftz.f32 %f61, %f60, %f3;
    mul.ftz.f32 %f62, %f19, %f61;
    // begin inline asm
    { cvt.f32.f16 %f22, %rs13;}
    // end inline asm
    fma.rn.ftz.f32 %f63, %f62, %f22, %f59;
    cvt.rn.f32.s16 %f64, %rs15;
    sub.ftz.f32 %f65, %f64, %f3;
    mul.ftz.f32 %f66, %f19, %f65;
    // begin inline asm
    { cvt.f32.f16 %f23, %rs16;}
    // end inline asm
    fma.rn.ftz.f32 %f67, %f66, %f23, %f63;
    cvt.rn.f32.s16 %f68, %rs18;
    sub.ftz.f32 %f69, %f68, %f3;
    mul.ftz.f32 %f70, %f19, %f69;
    // begin inline asm
    { cvt.f32.f16 %f24, %rs19;}
    // end inline asm
    fma.rn.ftz.f32 %f71, %f70, %f24, %f67;
    cvt.rn.f32.s16 %f72, %rs21;
    sub.ftz.f32 %f73, %f72, %f3;
    mul.ftz.f32 %f74, %f19, %f73;
    // begin inline asm
    { cvt.f32.f16 %f25, %rs22;}
    // end inline asm
    fma.rn.ftz.f32 %f75, %f74, %f25, %f71;
    cvt.rn.f32.s16 %f76, %rs24;
    sub.ftz.f32 %f77, %f76, %f3;
    mul.ftz.f32 %f78, %f19, %f77;
    // begin inline asm
    { cvt.f32.f16 %f26, %rs25;}
    // end inline asm
    fma.rn.ftz.f32 %f79, %f78, %f26, %f75;
    cvt.rn.f32.s16 %f80, %rs26;
    sub.ftz.f32 %f81, %f80, %f3;
    mul.ftz.f32 %f82, %f19, %f81;
    // begin inline asm
    { cvt.f32.f16 %f27, %rs27;}
    // end inline asm
    fma.rn.ftz.f32 %f83, %f82, %f27, %f79;
    ld.global.v4.u32 {%r83, %r84, %r85, %r86}, [%rd5+16];
    cvt.rn.f32.s16 %f84, %rs29;
    sub.ftz.f32 %f85, %f84, %f3;
    mul.ftz.f32 %f86, %f19, %f85;
    mov.b32 {%rs107, %rs108}, %r83;
    // begin inline asm
    { cvt.f32.f16 %f28, %rs107;}
    // end inline asm
    fma.rn.ftz.f32 %f87, %f86, %f28, %f83;
    cvt.rn.f32.s16 %f88, %rs31;
    sub.ftz.f32 %f89, %f88, %f3;
    mul.ftz.f32 %f90, %f19, %f89;
    // begin inline asm
    { cvt.f32.f16 %f29, %rs108;}
    // end inline asm
    fma.rn.ftz.f32 %f91, %f90, %f29, %f87;
    cvt.rn.f32.s16 %f92, %rs33;
    sub.ftz.f32 %f93, %f92, %f3;
    mul.ftz.f32 %f94, %f19, %f93;
    mov.b32 {%rs109, %rs110}, %r84;
    // begin inline asm
    { cvt.f32.f16 %f30, %rs109;}
    // end inline asm
    fma.rn.ftz.f32 %f95, %f94, %f30, %f91;
    cvt.rn.f32.s16 %f96, %rs35;
    sub.ftz.f32 %f97, %f96, %f3;
    mul.ftz.f32 %f98, %f19, %f97;
    // begin inline asm
    { cvt.f32.f16 %f31, %rs110;}
    // end inline asm
    fma.rn.ftz.f32 %f99, %f98, %f31, %f95;
    cvt.rn.f32.s16 %f100, %rs37;
    sub.ftz.f32 %f101, %f100, %f3;
    mul.ftz.f32 %f102, %f19, %f101;
    mov.b32 {%rs111, %rs112}, %r85;
    // begin inline asm
    { cvt.f32.f16 %f32, %rs111;}
    // end inline asm
    fma.rn.ftz.f32 %f103, %f102, %f32, %f99;
    cvt.rn.f32.s16 %f104, %rs39;
    sub.ftz.f32 %f105, %f104, %f3;
    mul.ftz.f32 %f106, %f19, %f105;
    // begin inline asm
    { cvt.f32.f16 %f33, %rs112;}
    // end inline asm
    fma.rn.ftz.f32 %f107, %f106, %f33, %f103;
    cvt.rn.f32.s16 %f108, %rs41;
    sub.ftz.f32 %f109, %f108, %f3;
    mul.ftz.f32 %f110, %f19, %f109;
    mov.b32 {%rs113, %rs114}, %r86;
    // begin inline asm
    { cvt.f32.f16 %f34, %rs113;}
    // end inline asm
    fma.rn.ftz.f32 %f111, %f110, %f34, %f107;
    cvt.rn.f32.s16 %f112, %rs42;
    sub.ftz.f32 %f113, %f112, %f3;
    mul.ftz.f32 %f114, %f19, %f113;
    // begin inline asm
    { cvt.f32.f16 %f35, %rs114;}
    // end inline asm
    fma.rn.ftz.f32 %f115, %f114, %f35, %f111;
    ld.global.v4.u32 {%r91, %r92, %r93, %r94}, [%rd5+32];
    cvt.rn.f32.s16 %f116, %rs44;
    sub.ftz.f32 %f117, %f116, %f3;
    mul.ftz.f32 %f118, %f19, %f117;
    mov.b32 {%rs115, %rs116}, %r91;
    // begin inline asm
    { cvt.f32.f16 %f36, %rs115;}
    // end inline asm
    fma.rn.ftz.f32 %f119, %f118, %f36, %f115;
    cvt.rn.f32.s16 %f120, %rs46;
    sub.ftz.f32 %f121, %f120, %f3;
    mul.ftz.f32 %f122, %f19, %f121;
    // begin inline asm
    { cvt.f32.f16 %f37, %rs116;}
    // end inline asm
    fma.rn.ftz.f32 %f123, %f122, %f37, %f119;
    cvt.rn.f32.s16 %f124, %rs48;
    sub.ftz.f32 %f125, %f124, %f3;
    mul.ftz.f32 %f126, %f19, %f125;
    mov.b32 {%rs117, %rs118}, %r92;
    // begin inline asm
    { cvt.f32.f16 %f38, %rs117;}
    // end inline asm
    fma.rn.ftz.f32 %f127, %f126, %f38, %f123;
    cvt.rn.f32.s16 %f128, %rs50;
    sub.ftz.f32 %f129, %f128, %f3;
    mul.ftz.f32 %f130, %f19, %f129;
    // begin inline asm
    { cvt.f32.f16 %f39, %rs118;}
    // end inline asm
    fma.rn.ftz.f32 %f131, %f130, %f39, %f127;
    cvt.rn.f32.s16 %f132, %rs52;
    sub.ftz.f32 %f133, %f132, %f3;
    mul.ftz.f32 %f134, %f19, %f133;
    mov.b32 {%rs119, %rs120}, %r93;
    // begin inline asm
    { cvt.f32.f16 %f40, %rs119;}
    // end inline asm
    fma.rn.ftz.f32 %f135, %f134, %f40, %f131;
    cvt.rn.f32.s16 %f136, %rs54;
    sub.ftz.f32 %f137, %f136, %f3;
    mul.ftz.f32 %f138, %f19, %f137;
    // begin inline asm
    { cvt.f32.f16 %f41, %rs120;}
    // end inline asm
    fma.rn.ftz.f32 %f139, %f138, %f41, %f135;
    cvt.rn.f32.s16 %f140, %rs56;
    sub.ftz.f32 %f141, %f140, %f3;
    mul.ftz.f32 %f142, %f19, %f141;
    mov.b32 {%rs121, %rs122}, %r94;
    // begin inline asm
    { cvt.f32.f16 %f42, %rs121;}
    // end inline asm
    fma.rn.ftz.f32 %f143, %f142, %f42, %f139;
    cvt.rn.f32.s16 %f144, %rs57;
    sub.ftz.f32 %f145, %f144, %f3;
    mul.ftz.f32 %f146, %f19, %f145;
    // begin inline asm
    { cvt.f32.f16 %f43, %rs122;}
    // end inline asm
    fma.rn.ftz.f32 %f147, %f146, %f43, %f143;
    ld.global.v4.u32 {%r99, %r100, %r101, %r102}, [%rd5+48];
    cvt.rn.f32.s16 %f148, %rs59;
    sub.ftz.f32 %f149, %f148, %f3;
    mul.ftz.f32 %f150, %f19, %f149;
    mov.b32 {%rs123, %rs124}, %r99;
    // begin inline asm
    { cvt.f32.f16 %f44, %rs123;}
    // end inline asm
    fma.rn.ftz.f32 %f151, %f150, %f44, %f147;
    cvt.rn.f32.s16 %f152, %rs61;
    sub.ftz.f32 %f153, %f152, %f3;
    mul.ftz.f32 %f154, %f19, %f153;
    // begin inline asm
    { cvt.f32.f16 %f45, %rs124;}
    // end inline asm
    fma.rn.ftz.f32 %f155, %f154, %f45, %f151;
    cvt.rn.f32.s16 %f156, %rs63;
    sub.ftz.f32 %f157, %f156, %f3;
    mul.ftz.f32 %f158, %f19, %f157;
    mov.b32 {%rs125, %rs126}, %r100;
    // begin inline asm
    { cvt.f32.f16 %f46, %rs125;}
    // end inline asm
    fma.rn.ftz.f32 %f159, %f158, %f46, %f155;
    cvt.rn.f32.s16 %f160, %rs65;
    sub.ftz.f32 %f161, %f160, %f3;
    mul.ftz.f32 %f162, %f19, %f161;
    // begin inline asm
    { cvt.f32.f16 %f47, %rs126;}
    // end inline asm
    fma.rn.ftz.f32 %f163, %f162, %f47, %f159;
    cvt.rn.f32.s16 %f164, %rs67;
    sub.ftz.f32 %f165, %f164, %f3;
    mul.ftz.f32 %f166, %f19, %f165;
    mov.b32 {%rs127, %rs128}, %r101;
    // begin inline asm
    { cvt.f32.f16 %f48, %rs127;}
    // end inline asm
    fma.rn.ftz.f32 %f167, %f166, %f48, %f163;
    cvt.rn.f32.s16 %f168, %rs69;
    sub.ftz.f32 %f169, %f168, %f3;
    mul.ftz.f32 %f170, %f19, %f169;
    // begin inline asm
    { cvt.f32.f16 %f49, %rs128;}
    // end inline asm
    fma.rn.ftz.f32 %f171, %f170, %f49, %f167;
    cvt.rn.f32.s16 %f172, %rs71;
    sub.ftz.f32 %f173, %f172, %f3;
    mul.ftz.f32 %f174, %f19, %f173;
    mov.b32 {%rs129, %rs130}, %r102;
    // begin inline asm
    { cvt.f32.f16 %f50, %rs129;}
    // end inline asm
    fma.rn.ftz.f32 %f175, %f174, %f50, %f171;
    cvt.rn.f32.s16 %f176, %rs72;
    sub.ftz.f32 %f177, %f176, %f3;
    mul.ftz.f32 %f178, %f19, %f177;
    // begin inline asm
    { cvt.f32.f16 %f51, %rs130;}
    // end inline asm
    fma.rn.ftz.f32 %f361, %f178, %f51, %f175;
    bra.uni $L__BB0_8;

$L__BB0_7:
    // signed path: sign-extend each nibble from 4 bits, then the same dequant FMA
    shl.b16 %rs163, %rs5, 4;
    cvt.s16.s8 %rs164, %rs163;
    shr.s16 %rs165, %rs164, 7;
    and.b16 %rs166, %rs165, -16;
    or.b16 %rs167, %rs166, %rs6;
    cvt.rn.f32.s16 %f211, %rs167;
    sub.ftz.f32 %f212, %f211, %f3;
    mul.ftz.f32 %f213, %f19, %f212;
    // begin inline asm
    { cvt.f32.f16 %f179, %rs7;}
    // end inline asm
    fma.rn.ftz.f32 %f214, %f213, %f179, %f361;
    shl.b16 %rs168, %rs8, 4;
    cvt.s16.s8 %rs169, %rs168;
    shr.s16 %rs170, %rs169, 7;
    and.b16 %rs171, %rs170, -16;
    or.b16 %rs172, %rs171, %rs9;
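// ---------------------------------------------------------------------------
// The shl.b16 / cvt.s16.s8 / shr.s16 / and / or groups in this path sign-extend
// each 4-bit quant: bit 3 is moved up into a byte's sign bit, arithmetic-shifted
// back down, masked to 0xFFF0, and OR-ed over the raw nibble. The usual C form
// of the same trick (sketch; `sext4` is a hypothetical name):
//
//   __device__ __forceinline__ int sext4(unsigned q) {   // q in [0, 15]
//       // Park bit 3 of q in the sign bit of a signed byte, then shift it
//       // back so bits 4..31 replicate the sign; result is in [-8, 7].
//       return ((int)(signed char)(q << 4)) >> 4;
//   }
// ---------------------------------------------------------------------------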
    cvt.rn.f32.s16 %f215, %rs172;
    sub.ftz.f32 %f216, %f215, %f3;
    mul.ftz.f32 %f217, %f19, %f216;
    // begin inline asm
    { cvt.f32.f16 %f180, %rs10;}
    // end inline asm
    fma.rn.ftz.f32 %f218, %f217, %f180, %f214;
    shl.b16 %rs173, %rs11, 4;
    cvt.s16.s8 %rs174, %rs173;
    shr.s16 %rs175, %rs174, 7;
    and.b16 %rs176, %rs175, -16;
    or.b16 %rs177, %rs176, %rs12;
    cvt.rn.f32.s16 %f219, %rs177;
    sub.ftz.f32 %f220, %f219, %f3;
    mul.ftz.f32 %f221, %f19, %f220;
    // begin inline asm
    { cvt.f32.f16 %f181, %rs13;}
    // end inline asm
    fma.rn.ftz.f32 %f222, %f221, %f181, %f218;
    shl.b16 %rs178, %rs14, 4;
    cvt.s16.s8 %rs179, %rs178;
    shr.s16 %rs180, %rs179, 7;
    and.b16 %rs181, %rs180, -16;
    or.b16 %rs182, %rs181, %rs15;
    cvt.rn.f32.s16 %f223, %rs182;
    sub.ftz.f32 %f224, %f223, %f3;
    mul.ftz.f32 %f225, %f19, %f224;
    // begin inline asm
    { cvt.f32.f16 %f182, %rs16;}
    // end inline asm
    fma.rn.ftz.f32 %f226, %f225, %f182, %f222;
    shl.b16 %rs183, %rs17, 4;
    cvt.s16.s8 %rs184, %rs183;
    shr.s16 %rs185, %rs184, 7;
    and.b16 %rs186, %rs185, -16;
    or.b16 %rs187, %rs186, %rs18;
    cvt.rn.f32.s16 %f227, %rs187;
    sub.ftz.f32 %f228, %f227, %f3;
    mul.ftz.f32 %f229, %f19, %f228;
    // begin inline asm
    { cvt.f32.f16 %f183, %rs19;}
    // end inline asm
    fma.rn.ftz.f32 %f230, %f229, %f183, %f226;
    shl.b16 %rs188, %rs20, 4;
    cvt.s16.s8 %rs189, %rs188;
    shr.s16 %rs190, %rs189, 7;
    and.b16 %rs191, %rs190, -16;
    or.b16 %rs192, %rs191, %rs21;
    cvt.rn.f32.s16 %f231, %rs192;
    sub.ftz.f32 %f232, %f231, %f3;
    mul.ftz.f32 %f233, %f19, %f232;
    // begin inline asm
    { cvt.f32.f16 %f184, %rs22;}
    // end inline asm
    fma.rn.ftz.f32 %f234, %f233, %f184, %f230;
    shl.b16 %rs193, %rs23, 4;
    cvt.s16.s8 %rs194, %rs193;
    shr.s16 %rs195, %rs194, 7;
    and.b16 %rs196, %rs195, -16;
    or.b16 %rs197, %rs196, %rs24;
    cvt.rn.f32.s16 %f235, %rs197;
    sub.ftz.f32 %f236, %f235, %f3;
    mul.ftz.f32 %f237, %f19, %f236;
    // begin inline asm
    { cvt.f32.f16 %f185, %rs25;}
    // end inline asm
    fma.rn.ftz.f32 %f238, %f237, %f185, %f234;
    shl.b16 %rs198, %rs26, 4;
    cvt.s16.s8 %rs199, %rs198;
    shr.s16 %rs200, %rs199, 7;
    and.b16 %rs201, %rs200, -16;
    or.b16 %rs202, %rs201, %rs26;
    cvt.rn.f32.s16 %f239, %rs202;
    sub.ftz.f32 %f240, %f239, %f3;
    mul.ftz.f32 %f241, %f19, %f240;
    // begin inline asm
    { cvt.f32.f16 %f186, %rs27;}
    // end inline asm
    fma.rn.ftz.f32 %f242, %f241, %f186, %f238;
    ld.global.v4.u32 {%r107, %r108, %r109, %r110}, [%rd5+16];
    shl.b16 %rs203, %rs28, 4;
    cvt.s16.s8 %rs204, %rs203;
    shr.s16 %rs205, %rs204, 7;
    and.b16 %rs206, %rs205, -16;
    or.b16 %rs207, %rs206, %rs29;
    cvt.rn.f32.s16 %f243, %rs207;
    sub.ftz.f32 %f244, %f243, %f3;
    mul.ftz.f32 %f245, %f19, %f244;
    mov.b32 {%rs139, %rs140}, %r107;
    // begin inline asm
    { cvt.f32.f16 %f187, %rs139;}
    // end inline asm
    fma.rn.ftz.f32 %f246, %f245, %f187, %f242;
    shl.b16 %rs208, %rs30, 4;
    cvt.s16.s8 %rs209, %rs208;
    shr.s16 %rs210, %rs209, 7;
    and.b16 %rs211, %rs210, -16;
    or.b16 %rs212, %rs211, %rs31;
    cvt.rn.f32.s16 %f247, %rs212;
    sub.ftz.f32 %f248, %f247, %f3;
    mul.ftz.f32 %f249, %f19, %f248;
    // begin inline asm
    { cvt.f32.f16 %f188, %rs140;}
    // end inline asm
    fma.rn.ftz.f32 %f250, %f249, %f188, %f246;
    shl.b16 %rs213, %rs32, 4;
    cvt.s16.s8 %rs214, %rs213;
    shr.s16 %rs215, %rs214, 7;
    and.b16 %rs216, %rs215, -16;
    or.b16 %rs217, %rs216, %rs33;
    cvt.rn.f32.s16 %f251, %rs217;
    sub.ftz.f32 %f252, %f251, %f3;
    mul.ftz.f32 %f253, %f19, %f252;
    mov.b32 {%rs141, %rs142}, %r108;
    // begin inline asm
    { cvt.f32.f16 %f189, %rs141;}
    // end inline asm
    fma.rn.ftz.f32 %f254, %f253, %f189, %f250;
    shl.b16 %rs218, %rs34, 4;
    cvt.s16.s8 %rs219, %rs218;
    shr.s16 %rs220, %rs219, 7;
    and.b16 %rs221, %rs220, -16;
    or.b16 %rs222, %rs221, %rs35;
    cvt.rn.f32.s16 %f255, %rs222;
    sub.ftz.f32 %f256, %f255, %f3;
    mul.ftz.f32 %f257, %f19, %f256;
    // begin inline asm
    { cvt.f32.f16 %f190, %rs142;}
    // end inline asm
    fma.rn.ftz.f32 %f258, %f257, %f190, %f254;
    shl.b16 %rs223, %rs36, 4;
    cvt.s16.s8 %rs224, %rs223;
    shr.s16 %rs225, %rs224, 7;
    and.b16 %rs226, %rs225, -16;
    or.b16 %rs227, %rs226, %rs37;
    cvt.rn.f32.s16 %f259, %rs227;
    sub.ftz.f32 %f260, %f259, %f3;
    mul.ftz.f32 %f261, %f19, %f260;
    mov.b32 {%rs143, %rs144}, %r109;
    // begin inline asm
    { cvt.f32.f16 %f191, %rs143;}
    // end inline asm
    fma.rn.ftz.f32 %f262, %f261, %f191, %f258;
    shl.b16 %rs228, %rs38, 4;
    cvt.s16.s8 %rs229, %rs228;
    shr.s16 %rs230, %rs229, 7;
    and.b16 %rs231, %rs230, -16;
    or.b16 %rs232, %rs231, %rs39;
    cvt.rn.f32.s16 %f263, %rs232;
    sub.ftz.f32 %f264, %f263, %f3;
    mul.ftz.f32 %f265, %f19, %f264;
    // begin inline asm
    { cvt.f32.f16 %f192, %rs144;}
    // end inline asm
    fma.rn.ftz.f32 %f266, %f265, %f192, %f262;
    shl.b16 %rs233, %rs40, 4;
    cvt.s16.s8 %rs234, %rs233;
    shr.s16 %rs235, %rs234, 7;
    and.b16 %rs236, %rs235, -16;
    or.b16 %rs237, %rs236, %rs41;
    cvt.rn.f32.s16 %f267, %rs237;
    sub.ftz.f32 %f268, %f267, %f3;
    mul.ftz.f32 %f269, %f19, %f268;
    mov.b32 {%rs145, %rs146}, %r110;
    // begin inline asm
    { cvt.f32.f16 %f193, %rs145;}
    // end inline asm
    fma.rn.ftz.f32 %f270, %f269, %f193, %f266;
    shl.b16 %rs238, %rs42, 4;
    cvt.s16.s8 %rs239, %rs238;
    shr.s16 %rs240, %rs239, 7;
    and.b16 %rs241, %rs240, -16;
    or.b16 %rs242, %rs241, %rs42;
    cvt.rn.f32.s16 %f271, %rs242;
    sub.ftz.f32 %f272, %f271, %f3;
    mul.ftz.f32 %f273, %f19, %f272;
    // begin inline asm
    { cvt.f32.f16 %f194, %rs146;}
    // end inline asm
    fma.rn.ftz.f32 %f274, %f273, %f194, %f270;
    ld.global.v4.u32 {%r115, %r116, %r117, %r118}, [%rd5+32];
    shl.b16 %rs243, %rs43, 4;
    cvt.s16.s8 %rs244, %rs243;
    shr.s16 %rs245, %rs244, 7;
    and.b16 %rs246, %rs245, -16;
    or.b16 %rs247, %rs246, %rs44;
    cvt.rn.f32.s16 %f275, %rs247;
    sub.ftz.f32 %f276, %f275, %f3;
    mul.ftz.f32 %f277, %f19, %f276;
    mov.b32 {%rs147, %rs148}, %r115;
    // begin inline asm
    { cvt.f32.f16 %f195, %rs147;}
    // end inline asm
    fma.rn.ftz.f32 %f278, %f277, %f195, %f274;
    shl.b16 %rs248, %rs45, 4;
    cvt.s16.s8 %rs249, %rs248;
    shr.s16 %rs250, %rs249, 7;
    and.b16 %rs251, %rs250, -16;
    or.b16 %rs252, %rs251, %rs46;
    cvt.rn.f32.s16 %f279, %rs252;
    sub.ftz.f32 %f280, %f279, %f3;
    mul.ftz.f32 %f281, %f19, %f280;
    // begin inline asm
    { cvt.f32.f16 %f196, %rs148;}
    // end inline asm
    fma.rn.ftz.f32 %f282, %f281, %f196, %f278;
    shl.b16 %rs253, %rs47, 4;
    cvt.s16.s8 %rs254, %rs253;
    shr.s16 %rs255, %rs254, 7;
    and.b16 %rs256, %rs255, -16;
    or.b16 %rs257, %rs256, %rs48;
    cvt.rn.f32.s16 %f283, %rs257;
    sub.ftz.f32 %f284, %f283, %f3;
    mul.ftz.f32 %f285, %f19, %f284;
    mov.b32 {%rs149, %rs150}, %r116;
    // begin inline asm
    { cvt.f32.f16 %f197, %rs149;}
    // end inline asm
    fma.rn.ftz.f32 %f286, %f285, %f197, %f282;
    shl.b16 %rs258, %rs49, 4;
    cvt.s16.s8 %rs259, %rs258;
    shr.s16 %rs260, %rs259, 7;
    and.b16 %rs261, %rs260, -16;
    or.b16 %rs262, %rs261, %rs50;
    cvt.rn.f32.s16 %f287, %rs262;
    sub.ftz.f32 %f288, %f287, %f3;
    mul.ftz.f32 %f289, %f19, %f288;
    // begin inline asm
    { cvt.f32.f16 %f198, %rs150;}
    // end inline asm
    fma.rn.ftz.f32 %f290, %f289, %f198, %f286;
    shl.b16 %rs263, %rs51, 4;
    cvt.s16.s8 %rs264, %rs263;
    shr.s16 %rs265, %rs264, 7;
    and.b16 %rs266, %rs265, -16;
    or.b16 %rs267, %rs266, %rs52;
    cvt.rn.f32.s16 %f291, %rs267;
    sub.ftz.f32 %f292, %f291, %f3;
    mul.ftz.f32 %f293, %f19, %f292;
    mov.b32 {%rs151, %rs152}, %r117;
    // begin inline asm
    { cvt.f32.f16 %f199, %rs151;}
    // end inline asm
    fma.rn.ftz.f32 %f294, %f293, %f199, %f290;
    shl.b16 %rs268, %rs53, 4;
    cvt.s16.s8 %rs269, %rs268;
    shr.s16 %rs270, %rs269, 7;
    and.b16 %rs271, %rs270, -16;
    or.b16 %rs272, %rs271, %rs54;
    cvt.rn.f32.s16 %f295, %rs272;
    sub.ftz.f32 %f296, %f295, %f3;
    mul.ftz.f32 %f297, %f19, %f296;
    // begin inline asm
    { cvt.f32.f16 %f200, %rs152;}
    // end inline asm
    fma.rn.ftz.f32 %f298, %f297, %f200, %f294;
    shl.b16 %rs273, %rs55, 4;
    cvt.s16.s8 %rs274, %rs273;
    shr.s16 %rs275, %rs274, 7;
    and.b16 %rs276, %rs275, -16;
    or.b16 %rs277, %rs276, %rs56;
    cvt.rn.f32.s16 %f299, %rs277;
    sub.ftz.f32 %f300, %f299, %f3;
    mul.ftz.f32 %f301, %f19, %f300;
    mov.b32 {%rs153, %rs154}, %r118;
    // begin inline asm
    { cvt.f32.f16 %f201, %rs153;}
    // end inline asm
    fma.rn.ftz.f32 %f302, %f301, %f201, %f298;
    shl.b16 %rs278, %rs57, 4;
    cvt.s16.s8 %rs279, %rs278;
    shr.s16 %rs280, %rs279, 7;
    and.b16 %rs281, %rs280, -16;
    or.b16 %rs282, %rs281, %rs57;
    cvt.rn.f32.s16 %f303, %rs282;
    sub.ftz.f32 %f304, %f303, %f3;
    mul.ftz.f32 %f305, %f19, %f304;
    // begin inline asm
    { cvt.f32.f16 %f202, %rs154;}
    // end inline asm
    fma.rn.ftz.f32 %f306, %f305, %f202, %f302;
    ld.global.v4.u32 {%r123, %r124, %r125, %r126}, [%rd5+48];
    shl.b16 %rs283, %rs58, 4;
    cvt.s16.s8 %rs284, %rs283;
    shr.s16 %rs285, %rs284, 7;
    and.b16 %rs286, %rs285, -16;
    or.b16 %rs287, %rs286, %rs59;
    cvt.rn.f32.s16 %f307, %rs287;
    sub.ftz.f32 %f308, %f307, %f3;
    mul.ftz.f32 %f309, %f19, %f308;
    mov.b32 {%rs155, %rs156}, %r123;
    // begin inline asm
    { cvt.f32.f16 %f203, %rs155;}
    // end inline asm
    fma.rn.ftz.f32 %f310, %f309, %f203, %f306;
    shl.b16 %rs288, %rs60, 4;
    cvt.s16.s8 %rs289, %rs288;
    shr.s16 %rs290, %rs289, 7;
    and.b16 %rs291, %rs290, -16;
    or.b16 %rs292, %rs291, %rs61;
    cvt.rn.f32.s16 %f311, %rs292;
    sub.ftz.f32 %f312, %f311, %f3;
    mul.ftz.f32 %f313, %f19, %f312;
    // begin inline asm
    { cvt.f32.f16 %f204, %rs156;}
    // end inline asm
    fma.rn.ftz.f32 %f314, %f313, %f204, %f310;
    shl.b16 %rs293, %rs62, 4;
    cvt.s16.s8 %rs294, %rs293;
    shr.s16 %rs295, %rs294, 7;
    and.b16 %rs296, %rs295, -16;
    or.b16 %rs297, %rs296, %rs63;
    cvt.rn.f32.s16 %f315, %rs297;
    sub.ftz.f32 %f316, %f315, %f3;
    mul.ftz.f32 %f317, %f19, %f316;
    mov.b32 {%rs157, %rs158}, %r124;
    // begin inline asm
    { cvt.f32.f16 %f205, %rs157;}
    // end inline asm
    fma.rn.ftz.f32 %f318, %f317, %f205, %f314;
    shl.b16 %rs298, %rs64, 4;
    cvt.s16.s8 %rs299, %rs298;
    shr.s16 %rs300, %rs299, 7;
    and.b16 %rs301, %rs300, -16;
    or.b16 %rs302, %rs301, %rs65;
    cvt.rn.f32.s16 %f319, %rs302;
    sub.ftz.f32 %f320, %f319, %f3;
    mul.ftz.f32 %f321, %f19, %f320;
    // begin inline asm
    { cvt.f32.f16 %f206, %rs158;}
    // end inline asm
    fma.rn.ftz.f32 %f322, %f321, %f206, %f318;
    shl.b16 %rs303, %rs66, 4;
    cvt.s16.s8 %rs304, %rs303;
    shr.s16 %rs305, %rs304, 7;
    and.b16 %rs306, %rs305, -16;
    or.b16 %rs307, %rs306, %rs67;
    cvt.rn.f32.s16 %f323, %rs307;
    sub.ftz.f32 %f324, %f323, %f3;
    mul.ftz.f32 %f325, %f19, %f324;
    mov.b32 {%rs159, %rs160}, %r125;
    // begin inline asm
    { cvt.f32.f16 %f207, %rs159;}
    // end inline asm
    fma.rn.ftz.f32 %f326, %f325, %f207, %f322;
    shl.b16 %rs308, %rs68, 4;
    cvt.s16.s8 %rs309, %rs308;
    shr.s16 %rs310, %rs309, 7;
    and.b16 %rs311, %rs310, -16;
    or.b16 %rs312, %rs311, %rs69;
    cvt.rn.f32.s16 %f327, %rs312;
    sub.ftz.f32 %f328, %f327, %f3;
    mul.ftz.f32 %f329, %f19, %f328;
    // begin inline asm
    { cvt.f32.f16 %f208, %rs160;}
    // end inline asm
    fma.rn.ftz.f32 %f330, %f329, %f208, %f326;
    shl.b16 %rs313, %rs70, 4;
    cvt.s16.s8 %rs314, %rs313;
    shr.s16 %rs315, %rs314, 7;
    and.b16 %rs316, %rs315, -16;
    or.b16 %rs317, %rs316, %rs71;
    cvt.rn.f32.s16 %f331, %rs317;
    sub.ftz.f32 %f332, %f331, %f3;
    mul.ftz.f32 %f333, %f19, %f332;
    mov.b32 {%rs161, %rs162}, %r126;
    // begin inline asm
    { cvt.f32.f16 %f209, %rs161;}
    // end inline asm
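// ---------------------------------------------------------------------------
// Zooming out before the loop bookkeeping at $L__BB0_8 below: $L__BB0_2 ..
// $L__BB0_8 is one trip of a strided per-thread loop. With the 32x4 thread
// block this PTX implies, each thread consumes one uint4 (32 int4 quants) and
// 32 fp16 activations per trip. A rough CUDA reconstruction of the control
// flow (field names are guesses from the parameter offsets at +16..+56, not
// the original source; bounds checks simplified):
//
//   float acc = 0.0f;                                     // %f361
//   int t = threadIdx.y * 32 + threadIdx.x;               // %r4
//   for (; 4 * t < p.packed_words_per_row; t += 128) {    // %p1 / %p7
//       const uint4* w4 = (const uint4*)p.qweight;
//       uint4 w = w4[(blockIdx.x * p.packed_words_per_row) / 4 + t];
//       int   g = t / 4;                                   // 128-wide group
//       float s = __half2float(p.scales[blockIdx.x * p.groups_per_row + g]);
//       // ...select the zero point, unpack the 32 quants of w, and fma them
//       //    against p.x[32*t .. 32*t + 31] via one of the two paths above...
//   }
// ---------------------------------------------------------------------------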
    fma.rn.ftz.f32 %f334, %f333, %f209, %f330;
    shl.b16 %rs318, %rs72, 4;
    cvt.s16.s8 %rs319, %rs318;
    shr.s16 %rs320, %rs319, 7;
    and.b16 %rs321, %rs320, -16;
    or.b16 %rs322, %rs321, %rs72;
    cvt.rn.f32.s16 %f335, %rs322;
    sub.ftz.f32 %f336, %f335, %f3;
    mul.ftz.f32 %f337, %f19, %f336;
    // begin inline asm
    { cvt.f32.f16 %f210, %rs162;}
    // end inline asm
    fma.rn.ftz.f32 %f361, %f337, %f210, %f334;

$L__BB0_8:
    // advance the strided loop by 128 threads' worth of elements
    add.s32 %r158, %r158, 4;
    shl.b32 %r131, %r158, 5;
    add.s32 %r157, %r131, %r3;
    shl.b32 %r156, %r157, 2;
    setp.lt.u32 %p7, %r156, %r30;
    @%p7 bra $L__BB0_2;

$L__BB0_9:
    // block-level reduction; see the commented CUDA sketch at the end of this listing
    setp.lt.u32 %p8, %r4, 32;
    @%p8 bra $L__BB0_11;
    shl.b32 %r132, %r4, 2;
    mov.u32 %r133, _ZZ9gemv_int4ILi4ELi128ELi1EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage;
    add.s32 %r134, %r133, %r132;
    st.shared.f32 [%r134+-112], %f361;   // warps 1..3 stash partials (transposed layout)

$L__BB0_11:
    bar.sync 0;
    setp.gt.u32 %p9, %r4, 31;
    @%p9 bra $L__BB0_13;
    // warp 0 folds in the three stashed partials, then reduces across lanes
    mov.u32 %r151, _ZZ9gemv_int4ILi4ELi128ELi1EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage;
    mad.lo.s32 %r152, %r4, 12, %r151;
    mov.u32 %r139, 2;
    mov.u32 %r148, 16;
    ld.shared.f32 %f353, [%r152+16];
    add.ftz.f32 %f354, %f361, %f353;
    ld.shared.f32 %f355, [%r152+20];
    add.ftz.f32 %f356, %f354, %f355;
    ld.shared.f32 %f357, [%r152+24];
    add.ftz.f32 %f340, %f356, %f357;
    mov.u32 %r136, 1;
    mov.u32 %r149, 31;
    mov.u32 %r150, -1;
    // begin inline asm
    {
    .reg .f32 r0;
    .reg .pred p;
    shfl.sync.down.b32 r0|p, %f340, %r136, %r149, %r150;
    @p add.f32 r0, r0, %f340;
    mov.f32 %f338, r0;
    }
    // end inline asm
    // begin inline asm
    {
    .reg .f32 r0;
    .reg .pred p;
    shfl.sync.down.b32 r0|p, %f338, %r139, %r149, %r150;
    @p add.f32 r0, r0, %f338;
    mov.f32 %f341, r0;
    }
    // end inline asm
    mov.u32 %r142, 4;
    // begin inline asm
    {
    .reg .f32 r0;
    .reg .pred p;
    shfl.sync.down.b32 r0|p, %f341, %r142, %r149, %r150;
    @p add.f32 r0, r0, %f341;
    mov.f32 %f344, r0;
    }
    // end inline asm
    mov.u32 %r145, 8;
    // begin inline asm
    {
    .reg .f32 r0;
    .reg .pred p;
    shfl.sync.down.b32 r0|p, %f344, %r145, %r149, %r150;
    @p add.f32 r0, r0, %f344;
    mov.f32 %f347, r0;
    }
    // end inline asm
    // begin inline asm
    {
    .reg .f32 r0;
    .reg .pred p;
    shfl.sync.down.b32 r0|p, %f347, %r148, %r149, %r150;
    @p add.f32 r0, r0, %f347;
    mov.f32 %f361, r0;
    }
    // end inline asm

$L__BB0_13:
    // only thread (0,0) writes the result
    mov.u32 %r154, %tid.y;
    or.b32 %r153, %r3, %r154;
    setp.ne.s32 %p10, %r153, 0;
    @%p10 bra $L__BB0_17;
    ld.param.u64 %rd34, [_Z28dequant_gemv_group128_batch123DequantGemvKernelParams_param_0+8];
    mov.u32 %r155, %ctaid.x;
    mul.ftz.f32 %f364, %f16, %f361;
    cvt.s64.s32 %rd7, %r155;
    setp.eq.s64 %p11, %rd34, 0;
    @%p11 bra $L__BB0_16;
    // optional second term: beta * bias[ctaid.x]
    ld.param.u64 %rd35, [_Z28dequant_gemv_group128_batch123DequantGemvKernelParams_param_0+8];
    cvta.to.global.u64 %rd27, %rd35;
    shl.b64 %rd28, %rd7, 1;
    add.s64 %rd29, %rd27, %rd28;
    ld.global.u16 %rs323, [%rd29];
    // begin inline asm
    { cvt.f32.f16 %f358, %rs323;}
    // end inline asm
    fma.rn.ftz.f32 %f364, %f17, %f358, %f364;

$L__BB0_16:
    ld.param.u64 %rd36, [_Z28dequant_gemv_group128_batch123DequantGemvKernelParams_param_0];
    // begin inline asm
    { cvt.rn.f16.f32 %rs324, %f364;}
    // end inline asm
    // begin inline asm
    createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd30, 1.0;
    // end inline asm
    shl.b64 %rd33, %rd7, 1;
    add.s64 %rd31, %rd36, %rd33;
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd31], %rs324, %rd30;
    // end inline asm

$L__BB0_17:
    ret;
}

// .globl	_ZN3cub11EmptyKernelIvEEvv
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
    ret;
}
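// ---------------------------------------------------------------------------
// Reduction and epilogue ($L__BB0_9 .. $L__BB0_17 above): warps 1..3 park
// their partial sums in the demoted temp_storage buffer, warp 0 folds them in
// after bar.sync, reduces across lanes with five predicated shfl.sync.down
// steps (offsets 1, 2, 4, 8, 16), and thread (0,0) writes alpha * sum
// (+ beta * bias when the pointer at param+8 is non-null) as fp16. A
// simplified CUDA sketch (the real code uses a transposed shared-memory
// layout, and `alpha`, `beta`, `bias`, `out` are hypothetical names for the
// params at +64, +68, +8, +0):
//
//   __shared__ float partial[4][32];
//   if (threadIdx.y > 0) partial[threadIdx.y][threadIdx.x] = acc;
//   __syncthreads();
//   if (threadIdx.y == 0) {
//       float sum = acc + partial[1][threadIdx.x]
//                       + partial[2][threadIdx.x]
//                       + partial[3][threadIdx.x];
//       for (int off = 1; off <= 16; off <<= 1)             // lane 0 ends up
//           sum += __shfl_down_sync(0xffffffffu, sum, off); // with the full sum
//       if (threadIdx.x == 0) {
//           float r = alpha * sum;
//           if (bias != nullptr)
//               r = fmaf(beta, __half2float(bias[blockIdx.x]), r);
//           out[blockIdx.x] = __float2half_rn(r);
//       }
//   }
// ---------------------------------------------------------------------------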