storage has been demoted
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_e50bdfa76thrust6system6detail10sequential3seqE[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_e50bdfa76thrust12placeholders2_1E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_e50bdfa76thrust12placeholders2_2E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_e50bdfa76thrust12placeholders2_3E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_e50bdfa76thrust12placeholders2_4E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_e50bdfa76thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_e50bdfa76thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_e50bdfa76thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_e50bdfa76thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_e50bdfa76thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_e50bdfa76thrust12placeholders3_10E[1];

.visible .entry _Z27dequant_gemv_group64_batch223DequantGemvKernelParams(
    .param .align 8 .b8 _Z27dequant_gemv_group64_batch223DequantGemvKernelParams_param_0[80]
)
{
    .reg .pred %p<15>;
    .reg .b16 %rs<228>;
    .reg .f32 %f<308>;
    .reg .b32 %r<161>;
    .reg .b64 %rd<49>;
    // demoted variable
    .shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi64ELi2EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[1280];

    ld.param.v2.u32 {%r29, %r30}, [_Z27dequant_gemv_group64_batch223DequantGemvKernelParams_param_0+48];
    ld.param.v2.u32 {%r31, %r32}, [_Z27dequant_gemv_group64_batch223DequantGemvKernelParams_param_0+56];
    ld.param.v2.f32 {%f29, %f30}, [_Z27dequant_gemv_group64_batch223DequantGemvKernelParams_param_0+64];
    ld.param.v4.u8 {%rs59, %rs60, %rs61, %rs62}, [_Z27dequant_gemv_group64_batch223DequantGemvKernelParams_param_0+72];
    ld.param.u64 %rd16, [_Z27dequant_gemv_group64_batch223DequantGemvKernelParams_param_0+40];
    ld.param.u64 %rd15, [_Z27dequant_gemv_group64_batch223DequantGemvKernelParams_param_0+32];
    ld.param.u64 %rd14, [_Z27dequant_gemv_group64_batch223DequantGemvKernelParams_param_0+24];
    ld.param.u64 %rd13, [_Z27dequant_gemv_group64_batch223DequantGemvKernelParams_param_0+16];
    mov.u32 %r1, %ctaid.x;
    mov.u32 %r160, %tid.y;
    shl.b32 %r33, %r160, 5;
    mov.u32 %r3, %tid.x;
    add.s32 %r159, %r33, %r3;
    shl.b32 %r158, %r159, 1;
    setp.ge.u32 %p1, %r158, %r31;
    mov.f32 %f300, 0f00000000;
    mov.f32 %f301, %f300;
    @%p1 bra $L__BB0_9;

    mul.lo.s32 %r8, %r31, %r1;
    shr.u32 %r9, %r3, 2;
    mul.lo.s32 %r10, %r32, %r1;
    shl.b16 %rs2, %rs59, 3;
    cvta.to.global.u64 %rd6, %rd13;

$L__BB0_2:
    add.s32 %r36, %r158, %r8;
    mul.wide.u32 %rd23, %r36, 4;
    add.s64 %rd18, %rd14, %rd23;
    // begin inline asm
    createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd17, 1.0;
    // end inline asm
    // begin inline asm
    ld.global.L1::no_allocate.L2::cache_hint.v2.u32 {%r34,%r35}, [%rd18], %rd17;
    // end inline asm
    shl.b32 %r37, %r160, 3;
    add.s32 %r16, %r37, %r9;
    add.s32 %r17, %r16, %r10;
    mul.wide.s32 %rd24, %r17, 2;
    add.s64 %rd21, %rd16, %rd24;
    // begin inline asm
    createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd20, 1.0;
    // end inline asm
    // begin inline asm
    ld.global.L1::evict_last.L2::cache_hint.u16 %rs67, [%rd21], %rd20;
    // end inline asm
    // begin inline asm
    { cvt.f32.f16 %f33, %rs67;}
    // end inline asm
    setp.eq.s64 %p2, %rd15, 0;
    mov.u16 %rs227, %rs2;
    @%p2 bra $L__BB0_4;

    shr.u32 %r38, %r17, 31;
    add.s32 %r39, %r17, %r38;
    shr.s32 %r40, %r39, 1;
    cvt.s64.s32 %rd28, %r40;
    add.s64 %rd26, %rd15, %rd28;
    // begin inline asm
    createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd25, 1.0;
    // end inline asm
    // begin inline asm
    ld.global.L1::evict_last.L2::cache_hint.u8 %rs69, [%rd26], %rd25;
    // end inline asm
    cvt.u32.u16 %r41, %rs69;
    and.b32 %r42, %r41, 255;
    shl.b32 %r43, %r16, 2;
    and.b32 %r44, %r43, 4;
    shr.u32 %r45, %r42, %r44;
    cvt.u16.u32 %rs70, %r45;
    and.b16 %rs227, %rs70, 15;

$L__BB0_4:
    shl.b32 %r18, %r159, 4;
    setp.ge.s32 %p3, %r18, %r29;
    @%p3 bra $L__BB0_8;

    setp.eq.s16 %p4, %rs59, 0;
    shr.u16 %rs72, %rs227, 3;
    and.b16 %rs73, %rs72, 1;
    setp.eq.b16 %p5, %rs73, 1;
    and.pred %p6, %p4, %p5;
    selp.b16 %rs74, -16, 0, %p6;
    or.b16 %rs75, %rs74, %rs227;
    cvt.s16.s8 %rs76, %rs75;
    cvt.rn.f32.s16 %f4, %rs76;
    mul.wide.s32 %rd29, %r18, 2;
    add.s64 %rd7, %rd6, %rd29;
    ld.global.v4.u32 {%r46, %r47, %r48, %r49}, [%rd7];
    mul.wide.s32 %rd30, %r29, 2;
    add.s64 %rd8, %rd7, %rd30;
    ld.global.v4.u32 {%r54, %r55, %r56, %r57}, [%rd8];
    cvt.u16.u32 %rs5, %r34;
    and.b16 %rs6, %rs5, 15;
    mov.b32 {%rs7, %rs11}, %r46;
    mov.b32 {%rs8, %rs12}, %r54;
    shr.u32 %r62, %r34, 4;
    cvt.u16.u32 %rs9, %r62;
    and.b16 %rs10, %rs9, 15;
    shr.u32 %r63, %r34, 8;
    cvt.u16.u32 %rs13, %r63;
    and.b16 %rs14, %rs13, 15;
    mov.b32 {%rs15, %rs19}, %r47;
    mov.b32 {%rs16, %rs20}, %r55;
    shr.u32 %r64, %r34, 12;
    cvt.u16.u32 %rs17, %r64;
    and.b16 %rs18, %rs17, 15;
    shr.u32 %r65, %r34, 16;
    cvt.u16.u32 %rs21, %r65;
    and.b16 %rs22, %rs21, 15;
    mov.b32 {%rs23, %rs27}, %r48;
    mov.b32 {%rs24, %rs28}, %r56;
    shr.u32 %r66, %r34, 20;
    cvt.u16.u32 %rs25, %r66;
    and.b16 %rs26, %rs25, 15;
    shr.u32 %r67, %r34, 24;
    cvt.u16.u32 %rs29, %r67;
    and.b16 %rs30, %rs29, 15;
    mov.b32 {%rs31, %rs34}, %r49;
    mov.b32 {%rs32, %rs35}, %r57;
    shr.u32 %r68, %r34, 28;
    cvt.u16.u32 %rs33, %r68;
    cvt.u16.u32 %rs36, %r35;
    and.b16 %rs37, %rs36, 15;
    shr.u32 %r69, %r35, 4;
    cvt.u16.u32 %rs38, %r69;
    and.b16 %rs39, %rs38, 15;
    shr.u32 %r70, %r35, 8;
    cvt.u16.u32 %rs40, %r70;
    and.b16 %rs41, %rs40, 15;
    shr.u32 %r71, %r35, 12;
    cvt.u16.u32 %rs42, %r71;
    and.b16 %rs43, %rs42, 15;
    shr.u32 %r72, %r35, 16;
    cvt.u16.u32 %rs44, %r72;
    and.b16 %rs45, %rs44, 15;
    shr.u32 %r73, %r35, 20;
    cvt.u16.u32 %rs46, %r73;
    and.b16 %rs47, %rs46, 15;
    shr.u32 %r74, %r35, 24;
    cvt.u16.u32 %rs48, %r74;
    and.b16 %rs49, %rs48, 15;
    shr.u32 %r75, %r35, 28;
    cvt.u16.u32 %rs50, %r75;
    @%p4 bra $L__BB0_7;

    cvt.rn.f32.s16 %f66, %rs6;
    sub.ftz.f32 %f67, %f66, %f4;
    mul.ftz.f32 %f68, %f33, %f67;
    // begin inline asm
    { cvt.f32.f16 %f34, %rs7;}
    // end inline asm
    fma.rn.ftz.f32 %f69, %f68, %f34, %f301;
    // begin inline asm
    { cvt.f32.f16 %f35, %rs8;}
    // end inline asm
    fma.rn.ftz.f32 %f70, %f68, %f35, %f300;
    cvt.rn.f32.s16 %f71, %rs10;
    sub.ftz.f32 %f72, %f71, %f4;
    mul.ftz.f32 %f73, %f33, %f72;
    // begin inline asm
    { cvt.f32.f16 %f36, %rs11;}
    // end inline asm
    fma.rn.ftz.f32 %f74, %f73, %f36, %f69;
    // begin inline asm
    { cvt.f32.f16 %f37, %rs12;}
    // end inline asm
    fma.rn.ftz.f32 %f75, %f73, %f37, %f70;
    cvt.rn.f32.s16 %f76, %rs14;
    sub.ftz.f32 %f77, %f76, %f4;
    mul.ftz.f32 %f78, %f33, %f77;
    // begin inline asm
    { cvt.f32.f16 %f38, %rs15;}
    // end inline asm
    fma.rn.ftz.f32 %f79, %f78, %f38, %f74;
    // begin inline asm
    { cvt.f32.f16 %f39, %rs16;}
    // end inline asm
    fma.rn.ftz.f32 %f80, %f78, %f39, %f75;
    cvt.rn.f32.s16 %f81, %rs18;
    sub.ftz.f32 %f82, %f81, %f4;
    mul.ftz.f32 %f83, %f33, %f82;
    // begin inline asm
    { cvt.f32.f16 %f40, %rs19;}
    // end inline asm
    fma.rn.ftz.f32 %f84, %f83, %f40, %f79;
    // begin inline asm
    { cvt.f32.f16 %f41, %rs20;}
    // end inline asm
    fma.rn.ftz.f32 %f85, %f83, %f41, %f80;
    cvt.rn.f32.s16 %f86, %rs22;
    sub.ftz.f32 %f87, %f86, %f4;
    mul.ftz.f32 %f88, %f33, %f87;
    // begin inline asm
    { cvt.f32.f16 %f42, %rs23;}
    // end inline asm
    fma.rn.ftz.f32 %f89, %f88, %f42, %f84;
    // begin inline asm
    { cvt.f32.f16 %f43, %rs24;}
    // end inline asm
    fma.rn.ftz.f32 %f90, %f88, %f43, %f85;
    cvt.rn.f32.s16 %f91, %rs26;
    sub.ftz.f32 %f92, %f91, %f4;
    mul.ftz.f32 %f93, %f33, %f92;
    // begin inline asm
    { cvt.f32.f16 %f44, %rs27;}
    // end inline asm
    fma.rn.ftz.f32 %f94, %f93, %f44, %f89;
    // begin inline asm
    { cvt.f32.f16 %f45, %rs28;}
    // end inline asm
    fma.rn.ftz.f32 %f95, %f93, %f45, %f90;
    cvt.rn.f32.s16 %f96, %rs30;
    sub.ftz.f32 %f97, %f96, %f4;
    mul.ftz.f32 %f98, %f33, %f97;
    // begin inline asm
    { cvt.f32.f16 %f46, %rs31;}
    // end inline asm
    fma.rn.ftz.f32 %f99, %f98, %f46, %f94;
    // begin inline asm
    { cvt.f32.f16 %f47, %rs32;}
    // end inline asm
    fma.rn.ftz.f32 %f100, %f98, %f47, %f95;
    cvt.rn.f32.s16 %f101, %rs33;
    sub.ftz.f32 %f102, %f101, %f4;
    mul.ftz.f32 %f103, %f33, %f102;
    // begin inline asm
    { cvt.f32.f16 %f48, %rs34;}
    // end inline asm
    fma.rn.ftz.f32 %f104, %f103, %f48, %f99;
    // begin inline asm
    { cvt.f32.f16 %f49, %rs35;}
    // end inline asm
    fma.rn.ftz.f32 %f105, %f103, %f49, %f100;
    ld.global.v4.u32 {%r76, %r77, %r78, %r79}, [%rd7+16];
    ld.global.v4.u32 {%r84, %r85, %r86, %r87}, [%rd8+16];
    cvt.rn.f32.s16 %f106, %rs37;
    sub.ftz.f32 %f107, %f106, %f4;
    mul.ftz.f32 %f108, %f33, %f107;
    mov.b32 {%rs93, %rs95}, %r76;
    // begin inline asm
    { cvt.f32.f16 %f50, %rs93;}
    // end inline asm
    fma.rn.ftz.f32 %f109, %f108, %f50, %f104;
    mov.b32 {%rs94, %rs96}, %r84;
    // begin inline asm
    { cvt.f32.f16 %f51, %rs94;}
    // end inline asm
    fma.rn.ftz.f32 %f110, %f108, %f51, %f105;
    cvt.rn.f32.s16 %f111, %rs39;
    sub.ftz.f32 %f112, %f111, %f4;
    mul.ftz.f32 %f113, %f33, %f112;
    // begin inline asm
    { cvt.f32.f16 %f52, %rs95;}
    // end inline asm
    fma.rn.ftz.f32 %f114, %f113, %f52, %f109;
    // begin inline asm
    { cvt.f32.f16 %f53, %rs96;}
    // end inline asm
    fma.rn.ftz.f32 %f115, %f113, %f53, %f110;
    cvt.rn.f32.s16 %f116, %rs41;
    sub.ftz.f32 %f117, %f116, %f4;
    mul.ftz.f32 %f118, %f33, %f117;
    mov.b32 {%rs97, %rs99}, %r77;
    // begin inline asm
    { cvt.f32.f16 %f54, %rs97;}
    // end inline asm
    fma.rn.ftz.f32 %f119, %f118, %f54, %f114;
    mov.b32 {%rs98, %rs100}, %r85;
    // begin inline asm
    { cvt.f32.f16 %f55, %rs98;}
    // end inline asm
    fma.rn.ftz.f32 %f120, %f118, %f55, %f115;
    cvt.rn.f32.s16 %f121, %rs43;
    sub.ftz.f32 %f122, %f121, %f4;
    mul.ftz.f32 %f123, %f33, %f122;
    // begin inline asm
    { cvt.f32.f16 %f56, %rs99;}
    // end inline asm
    fma.rn.ftz.f32 %f124, %f123, %f56, %f119;
    // begin inline asm
    { cvt.f32.f16 %f57, %rs100;}
    // end inline asm
    fma.rn.ftz.f32 %f125, %f123, %f57, %f120;
    cvt.rn.f32.s16 %f126, %rs45;
    sub.ftz.f32 %f127, %f126, %f4;
    mul.ftz.f32 %f128, %f33, %f127;
    mov.b32 {%rs101, %rs103}, %r78;
    // begin inline asm
    { cvt.f32.f16 %f58, %rs101;}
    // end inline asm
    fma.rn.ftz.f32 %f129, %f128, %f58, %f124;
    mov.b32 {%rs102, %rs104}, %r86;
    // begin inline asm
    { cvt.f32.f16 %f59, %rs102;}
    // end inline asm
    fma.rn.ftz.f32 %f130, %f128, %f59, %f125;
    cvt.rn.f32.s16 %f131, %rs47;
    sub.ftz.f32 %f132, %f131, %f4;
    mul.ftz.f32 %f133, %f33, %f132;
    // begin inline asm
    { cvt.f32.f16 %f60, %rs103;}
    // end inline asm
    fma.rn.ftz.f32 %f134, %f133, %f60, %f129;
    // begin inline asm
    { cvt.f32.f16 %f61, %rs104;}
    // end inline asm
    fma.rn.ftz.f32 %f135, %f133, %f61, %f130;
    cvt.rn.f32.s16 %f136, %rs49;
    sub.ftz.f32 %f137, %f136, %f4;
    mul.ftz.f32 %f138, %f33, %f137;
    mov.b32 {%rs105, %rs107}, %r79;
    // begin inline asm
    { cvt.f32.f16 %f62, %rs105;}
    // end inline asm
    fma.rn.ftz.f32 %f139, %f138, %f62, %f134;
    mov.b32 {%rs106, %rs108}, %r87;
    // begin inline asm
    { cvt.f32.f16 %f63, %rs106;}
    // end inline asm
    fma.rn.ftz.f32 %f140, %f138, %f63, %f135;
    cvt.rn.f32.s16 %f141, %rs50;
    sub.ftz.f32 %f142, %f141, %f4;
    mul.ftz.f32 %f143, %f33, %f142;
    // begin inline asm
    { cvt.f32.f16 %f64, %rs107;}
    // end inline asm
    fma.rn.ftz.f32 %f301, %f143, %f64, %f139;
    // begin inline asm
    { cvt.f32.f16 %f65, %rs108;}
    // end inline asm
    fma.rn.ftz.f32 %f300, %f143, %f65, %f140;
    bra.uni $L__BB0_8;

$L__BB0_7:
    shl.b16 %rs141, %rs5, 4;
    cvt.s16.s8 %rs142, %rs141;
    shr.s16 %rs143, %rs142, 7;
    and.b16 %rs144, %rs143, -16;
    or.b16 %rs145, %rs144, %rs6;
    cvt.rn.f32.s16 %f176, %rs145;
    sub.ftz.f32 %f177, %f176, %f4;
    mul.ftz.f32 %f178, %f33, %f177;
    // begin inline asm
    { cvt.f32.f16 %f144, %rs7;}
    // end inline asm
    fma.rn.ftz.f32 %f179, %f178, %f144, %f301;
    // begin inline asm
    { cvt.f32.f16 %f145, %rs8;}
    // end inline asm
    fma.rn.ftz.f32 %f180, %f178, %f145, %f300;
    shl.b16 %rs146, %rs9, 4;
    cvt.s16.s8 %rs147, %rs146;
    shr.s16 %rs148, %rs147, 7;
    and.b16 %rs149, %rs148, -16;
    or.b16 %rs150, %rs149, %rs10;
    cvt.rn.f32.s16 %f181, %rs150;
    sub.ftz.f32 %f182, %f181, %f4;
    mul.ftz.f32 %f183, %f33, %f182;
    // begin inline asm
    { cvt.f32.f16 %f146, %rs11;}
    // end inline asm
    fma.rn.ftz.f32 %f184, %f183, %f146, %f179;
    // begin inline asm
    { cvt.f32.f16 %f147, %rs12;}
    // end inline asm
    fma.rn.ftz.f32 %f185, %f183, %f147, %f180;
    shl.b16 %rs151, %rs13, 4;
    cvt.s16.s8 %rs152, %rs151;
    shr.s16 %rs153, %rs152, 7;
    and.b16 %rs154, %rs153, -16;
    or.b16 %rs155, %rs154, %rs14;
    cvt.rn.f32.s16 %f186, %rs155;
    sub.ftz.f32 %f187, %f186, %f4;
    mul.ftz.f32 %f188, %f33, %f187;
    // begin inline asm
    { cvt.f32.f16 %f148, %rs15;}
    // end inline asm
    fma.rn.ftz.f32 %f189, %f188, %f148, %f184;
    // begin inline asm
    { cvt.f32.f16 %f149, %rs16;}
    // end inline asm
    fma.rn.ftz.f32 %f190, %f188, %f149, %f185;
    shl.b16 %rs156, %rs17, 4;
    cvt.s16.s8 %rs157, %rs156;
    shr.s16 %rs158, %rs157, 7;
    and.b16 %rs159, %rs158, -16;
    or.b16 %rs160, %rs159, %rs18;
    cvt.rn.f32.s16 %f191, %rs160;
    sub.ftz.f32 %f192, %f191, %f4;
    mul.ftz.f32 %f193, %f33, %f192;
    // begin inline asm
    { cvt.f32.f16 %f150, %rs19;}
    // end inline asm
    fma.rn.ftz.f32 %f194, %f193, %f150, %f189;
    // begin inline asm
    { cvt.f32.f16 %f151, %rs20;}
    // end inline asm
    fma.rn.ftz.f32 %f195, %f193, %f151, %f190;
    shl.b16 %rs161, %rs21, 4;
    cvt.s16.s8 %rs162, %rs161;
    shr.s16 %rs163, %rs162, 7;
    and.b16 %rs164, %rs163, -16;
    or.b16 %rs165, %rs164, %rs22;
    cvt.rn.f32.s16 %f196, %rs165;
    sub.ftz.f32 %f197, %f196, %f4;
    mul.ftz.f32 %f198, %f33, %f197;
    // begin inline asm
    { cvt.f32.f16 %f152, %rs23;}
    // end inline asm
    fma.rn.ftz.f32 %f199, %f198, %f152, %f194;
    // begin inline asm
    { cvt.f32.f16 %f153, %rs24;}
    // end inline asm
    fma.rn.ftz.f32 %f200, %f198, %f153, %f195;
    shl.b16 %rs166, %rs25, 4;
    cvt.s16.s8 %rs167, %rs166;
    shr.s16 %rs168, %rs167, 7;
    and.b16 %rs169, %rs168, -16;
    or.b16 %rs170, %rs169, %rs26;
    cvt.rn.f32.s16 %f201, %rs170;
    sub.ftz.f32 %f202, %f201, %f4;
    mul.ftz.f32 %f203, %f33, %f202;
    // begin inline asm
    { cvt.f32.f16 %f154, %rs27;}
    // end inline asm
    fma.rn.ftz.f32 %f204, %f203, %f154, %f199;
    // begin inline asm
    { cvt.f32.f16 %f155, %rs28;}
    // end inline asm
    fma.rn.ftz.f32 %f205, %f203, %f155, %f200;
    shl.b16 %rs171, %rs29, 4;
    cvt.s16.s8 %rs172, %rs171;
    shr.s16 %rs173, %rs172, 7;
    and.b16 %rs174, %rs173, -16;
    or.b16 %rs175, %rs174, %rs30;
    cvt.rn.f32.s16 %f206, %rs175;
    sub.ftz.f32 %f207, %f206, %f4;
    mul.ftz.f32 %f208, %f33, %f207;
    // begin inline asm
    { cvt.f32.f16 %f156, %rs31;}
    // end inline asm
    fma.rn.ftz.f32 %f209, %f208, %f156, %f204;
    // begin inline asm
    { cvt.f32.f16 %f157, %rs32;}
    // end inline asm
    fma.rn.ftz.f32 %f210, %f208, %f157, %f205;
    shl.b16 %rs176, %rs33, 4;
    cvt.s16.s8 %rs177, %rs176;
    shr.s16 %rs178, %rs177, 7;
    and.b16 %rs179, %rs178, -16;
    or.b16 %rs180, %rs179, %rs33;
    cvt.rn.f32.s16 %f211, %rs180;
    sub.ftz.f32 %f212, %f211, %f4;
    mul.ftz.f32 %f213, %f33, %f212;
    // begin inline asm
    { cvt.f32.f16 %f158, %rs34;}
    // end inline asm
    fma.rn.ftz.f32 %f214, %f213, %f158, %f209;
    // begin inline asm
    { cvt.f32.f16 %f159, %rs35;}
    // end inline asm
    fma.rn.ftz.f32 %f215, %f213, %f159, %f210;
    ld.global.v4.u32 {%r92, %r93, %r94, %r95}, [%rd7+16];
    ld.global.v4.u32 {%r100, %r101, %r102, %r103}, [%rd8+16];
    shl.b16 %rs181, %rs36, 4;
    cvt.s16.s8 %rs182, %rs181;
    shr.s16 %rs183, %rs182, 7;
    and.b16 %rs184, %rs183, -16;
    or.b16 %rs185, %rs184, %rs37;
    cvt.rn.f32.s16 %f216, %rs185;
    sub.ftz.f32 %f217, %f216, %f4;
    mul.ftz.f32 %f218, %f33, %f217;
    mov.b32 {%rs125, %rs127}, %r92;
    // begin inline asm
    { cvt.f32.f16 %f160, %rs125;}
    // end inline asm
    fma.rn.ftz.f32 %f219, %f218, %f160, %f214;
    mov.b32 {%rs126, %rs128}, %r100;
    // begin inline asm
    { cvt.f32.f16 %f161, %rs126;}
    // end inline asm
    fma.rn.ftz.f32 %f220, %f218, %f161, %f215;
    shl.b16 %rs186, %rs38, 4;
    cvt.s16.s8 %rs187, %rs186;
    shr.s16 %rs188, %rs187, 7;
    and.b16 %rs189, %rs188, -16;
    or.b16 %rs190, %rs189, %rs39;
    cvt.rn.f32.s16 %f221, %rs190;
    sub.ftz.f32 %f222, %f221, %f4;
    mul.ftz.f32 %f223, %f33, %f222;
    // begin inline asm
    { cvt.f32.f16 %f162, %rs127;}
    // end inline asm
    fma.rn.ftz.f32 %f224, %f223, %f162, %f219;
    // begin inline asm
    { cvt.f32.f16 %f163, %rs128;}
    // end inline asm
    fma.rn.ftz.f32 %f225, %f223, %f163, %f220;
    shl.b16 %rs191, %rs40, 4;
    cvt.s16.s8 %rs192, %rs191;
    shr.s16 %rs193, %rs192, 7;
    and.b16 %rs194, %rs193, -16;
    or.b16 %rs195, %rs194, %rs41;
    cvt.rn.f32.s16 %f226, %rs195;
    sub.ftz.f32 %f227, %f226, %f4;
    mul.ftz.f32 %f228, %f33, %f227;
    mov.b32 {%rs129, %rs131}, %r93;
    // begin inline asm
    { cvt.f32.f16 %f164, %rs129;}
    // end inline asm
    fma.rn.ftz.f32 %f229, %f228, %f164, %f224;
    mov.b32 {%rs130, %rs132}, %r101;
    // begin inline asm
    { cvt.f32.f16 %f165, %rs130;}
    // end inline asm
    fma.rn.ftz.f32 %f230, %f228, %f165, %f225;
    shl.b16 %rs196, %rs42, 4;
    cvt.s16.s8 %rs197, %rs196;
    shr.s16 %rs198, %rs197, 7;
    and.b16 %rs199, %rs198, -16;
    or.b16 %rs200, %rs199, %rs43;
    cvt.rn.f32.s16 %f231, %rs200;
    sub.ftz.f32 %f232, %f231, %f4;
    mul.ftz.f32 %f233, %f33, %f232;
    // begin inline asm
    { cvt.f32.f16 %f166, %rs131;}
    // end inline asm
    fma.rn.ftz.f32 %f234, %f233, %f166, %f229;
    // begin inline asm
    { cvt.f32.f16 %f167, %rs132;}
    // end inline asm
    fma.rn.ftz.f32 %f235, %f233, %f167, %f230;
    shl.b16 %rs201, %rs44, 4;
    cvt.s16.s8 %rs202, %rs201;
    shr.s16 %rs203, %rs202, 7;
    and.b16 %rs204, %rs203, -16;
    or.b16 %rs205, %rs204, %rs45;
    cvt.rn.f32.s16 %f236, %rs205;
    sub.ftz.f32 %f237, %f236, %f4;
    mul.ftz.f32 %f238, %f33, %f237;
    mov.b32 {%rs133, %rs135}, %r94;
    // begin inline asm
    { cvt.f32.f16 %f168, %rs133;}
    // end inline asm
    fma.rn.ftz.f32 %f239, %f238, %f168, %f234;
    mov.b32 {%rs134, %rs136}, %r102;
    // begin inline asm
    { cvt.f32.f16 %f169, %rs134;}
    // end inline asm
    fma.rn.ftz.f32 %f240, %f238, %f169, %f235;
    shl.b16 %rs206, %rs46, 4;
    cvt.s16.s8 %rs207, %rs206;
    shr.s16 %rs208, %rs207, 7;
    and.b16 %rs209, %rs208, -16;
    or.b16 %rs210, %rs209, %rs47;
    cvt.rn.f32.s16 %f241, %rs210;
    sub.ftz.f32 %f242, %f241, %f4;
    mul.ftz.f32 %f243, %f33, %f242;
    // begin inline asm
    { cvt.f32.f16 %f170, %rs135;}
    // end inline asm
    fma.rn.ftz.f32 %f244, %f243, %f170, %f239;
    // begin inline asm
    { cvt.f32.f16 %f171, %rs136;}
    // end inline asm
    fma.rn.ftz.f32 %f245, %f243, %f171, %f240;
    shl.b16 %rs211, %rs48, 4;
    cvt.s16.s8 %rs212, %rs211;
    shr.s16 %rs213, %rs212, 7;
    and.b16 %rs214, %rs213, -16;
    or.b16 %rs215, %rs214, %rs49;
    cvt.rn.f32.s16 %f246, %rs215;
    sub.ftz.f32 %f247, %f246, %f4;
    mul.ftz.f32 %f248, %f33, %f247;
    mov.b32 {%rs137, %rs139}, %r95;
    // begin inline asm
    { cvt.f32.f16 %f172, %rs137;}
    // end inline asm
    fma.rn.ftz.f32 %f249, %f248, %f172, %f244;
    mov.b32 {%rs138, %rs140}, %r103;
    // begin inline asm
    { cvt.f32.f16 %f173, %rs138;}
    // end inline asm
    fma.rn.ftz.f32 %f250, %f248, %f173, %f245;
    shl.b16 %rs216, %rs50, 4;
    cvt.s16.s8 %rs217, %rs216;
    shr.s16 %rs218, %rs217, 7;
    and.b16 %rs219, %rs218, -16;
    or.b16 %rs220, %rs219, %rs50;
    cvt.rn.f32.s16 %f251, %rs220;
    sub.ftz.f32 %f252, %f251, %f4;
    mul.ftz.f32 %f253, %f33, %f252;
    // begin inline asm
    { cvt.f32.f16 %f174, %rs139;}
    // end inline asm
    fma.rn.ftz.f32 %f301, %f253, %f174, %f249;
    // begin inline asm
    { cvt.f32.f16 %f175, %rs140;}
    // end inline asm
    fma.rn.ftz.f32 %f300, %f253, %f175, %f250;

$L__BB0_8:
    add.s32 %r160, %r160, 4;
    shl.b32 %r108, %r160, 5;
    add.s32 %r159, %r108, %r3;
    shl.b32 %r158, %r159, 1;
    setp.lt.u32 %p7, %r158, %r31;
    @%p7 bra $L__BB0_2;

$L__BB0_9:
    mov.u32 %r154, %tid.y;
    shl.b32 %r153, %r154, 5;
    add.s32 %r152, %r153, %r3;
    shl.b32 %r109, %r152, 2;
    mov.u32 %r110, _ZZ9gemv_int4ILi4ELi64ELi2EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage;
    add.s32 %r111, %r110, %r109;
    setp.lt.u32 %p8, %r152, 32;
    @%p8 bra $L__BB0_11;

    add.s32 %r147, %r111, -112;
    st.shared.f32 [%r147], %f301;

$L__BB0_11:
    mov.u32 %r157, %tid.y;
    shl.b32 %r156, %r157, 5;
    add.s32 %r155, %r156, %r3;
    setp.gt.u32 %p9, %r155, 31;
    bar.sync 0;
    mad.lo.s32 %r23, %r155, 12, %r110;
    @%p9 bra $L__BB0_13;

    mov.u32 %r126, 16;
    ld.shared.f32 %f269, [%r23+16];
    add.ftz.f32 %f270, %f301, %f269;
    ld.shared.f32 %f271, [%r23+20];
    add.ftz.f32 %f272, %f270, %f271;
    ld.shared.f32 %f273, [%r23+24];
    add.ftz.f32 %f256, %f272, %f273;
    mov.u32 %r114, 1;
    mov.u32 %r127, 31;
    mov.u32 %r128, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f256, %r114, %r127, %r128; @p add.f32 r0, r0, %f256; mov.f32 %f254, r0;}
    // end inline asm
    mov.u32 %r117, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f254, %r117, %r127, %r128; @p add.f32 r0, r0, %f254; mov.f32 %f257, r0;}
    // end inline asm
    mov.u32 %r120, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f257, %r120, %r127, %r128; @p add.f32 r0, r0, %f257; mov.f32 %f260, r0;}
    // end inline asm
    mov.u32 %r123, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f260, %r123, %r127, %r128; @p add.f32 r0, r0, %f260; mov.f32 %f263, r0;}
    // end inline asm
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f263, %r126, %r127, %r128; @p add.f32 r0, r0, %f263; mov.f32 %f301, r0;}
    // end inline asm

$L__BB0_13:
    @%p8 bra $L__BB0_15;

    add.s32 %r148, %r111, -112;
    st.shared.f32 [%r148+640], %f300;

$L__BB0_15:
    bar.sync 0;
    @%p9 bra $L__BB0_17;

    ld.shared.f32 %f289, [%r23+656];
    add.ftz.f32 %f290, %f300, %f289;
    ld.shared.f32 %f291, [%r23+660];
    add.ftz.f32 %f292, %f290, %f291;
    ld.shared.f32 %f293, [%r23+664];
    add.ftz.f32 %f276, %f292, %f293;
    mov.u32 %r130, 1;
    mov.u32 %r143, 31;
    mov.u32 %r144, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f276, %r130, %r143, %r144; @p add.f32 r0, r0, %f276; mov.f32 %f274, r0;}
    // end inline asm
    mov.u32 %r133, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f274, %r133, %r143, %r144; @p add.f32 r0, r0, %f274; mov.f32 %f277, r0;}
    // end inline asm
    mov.u32 %r136, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f277, %r136, %r143, %r144; @p add.f32 r0, r0, %f277; mov.f32 %f280, r0;}
    // end inline asm
    mov.u32 %r139, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f280, %r139, %r143, %r144; @p add.f32 r0, r0, %f280; mov.f32 %f283, r0;}
    // end inline asm
    mov.u32 %r142, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f283, %r142, %r143, %r144; @p add.f32 r0, r0, %f283; mov.f32 %f300, r0;}
    // end inline asm

$L__BB0_17:
    mov.u32 %r149, %tid.y;
    or.b32 %r145, %r3, %r149;
    setp.ne.s32 %p12, %r145, 0;
    @%p12 bra $L__BB0_23;

    ld.param.u64 %rd43, [_Z27dequant_gemv_group64_batch223DequantGemvKernelParams_param_0+8];
    mov.u32 %r150, %ctaid.x;
    setp.eq.s64 %p13, %rd43, 0;
    mul.ftz.f32 %f306, %f29, %f301;
    cvt.s64.s32 %rd9, %r150;
    @%p13 bra $L__BB0_20;

    ld.param.u64 %rd46, [_Z27dequant_gemv_group64_batch223DequantGemvKernelParams_param_0+8];
    cvta.to.global.u64 %rd45, %rd46;
    shl.b64 %rd31, %rd9, 1;
    add.s64 %rd32, %rd45, %rd31;
    ld.global.u16 %rs221, [%rd32];
    // begin inline asm
    { cvt.f32.f16 %f294, %rs221;}
    // end inline asm
    fma.rn.ftz.f32 %f306, %f30, %f294, %f306;

$L__BB0_20:
    ld.param.u64 %rd44, [_Z27dequant_gemv_group64_batch223DequantGemvKernelParams_param_0];
    // begin inline asm
    { cvt.rn.f16.f32 %rs222, %f306;}
    // end inline asm
    // begin inline asm
    createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd33, 1.0;
    // end inline asm
    shl.b64 %rd36, %rd9, 1;
    add.s64 %rd34, %rd44, %rd36;
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd34], %rs222, %rd33;
    // end inline asm
    mul.ftz.f32 %f307, %f29, %f300;
    @%p13 bra $L__BB0_22;

    ld.param.u64 %rd48, [_Z27dequant_gemv_group64_batch223DequantGemvKernelParams_param_0+8];
    cvta.to.global.u64 %rd47, %rd48;
    mov.u32 %r151, %ctaid.x;
    add.s32 %r146, %r30, %r151;
    mul.wide.s32 %rd37, %r146, 2;
    add.s64 %rd38, %rd47, %rd37;
    ld.global.u16 %rs224, [%rd38];
    // begin inline asm
    { cvt.f32.f16 %f296, %rs224;}
    // end inline asm
    fma.rn.ftz.f32 %f307, %f30, %f296, %f307;

$L__BB0_22:
    mul.wide.s32 %rd42, %r30, 2;
    add.s64 %rd40, %rd34, %rd42;
    // begin inline asm
    { cvt.rn.f16.f32 %rs225, %f307;}
    // end inline asm
    // begin inline asm
    createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd39, 1.0;
    // end inline asm
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd40], %rs225, %rd39;
    // end inline asm

$L__BB0_23:
    ret;
}

// .globl _ZN3cub11EmptyKernelIvEEvv
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
    ret;
}
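The unrolled bodies above repeat one dequantize-and-accumulate step per packed 32-bit word: each 4-bit weight is extracted, offset by the per-group zero point, multiplied by the per-group scale, and FMA'd with an fp16 activation for each of the two batch columns. The following is only a rough, hypothetical CUDA sketch of that inner step, inferred from the PTX and the demangled symbol gemv_int4<4, 64, 2>; the function name, parameter layout, and signed/unsigned handling are assumptions, not the kernel's actual source.

// Hypothetical sketch (not the original kernel source): the arithmetic one
// packed u32 word contributes to the running dot product in the loop above.
#include <cuda_fp16.h>

__device__ float dequant_dot8(unsigned int packed,   // 8 packed int4 weights
                              float scale,           // per-group scale (loaded as __half)
                              float zero_point,      // per-group zero point
                              const __half* x,       // 8 fp16 activations
                              float acc)             // running accumulator
{
    for (int i = 0; i < 8; ++i) {
        int q = (packed >> (4 * i)) & 0xF;                        // one 4-bit weight
        float w = (static_cast<float>(q) - zero_point) * scale;   // dequantize
        acc = fmaf(w, __half2float(x[i]), acc);                   // accumulate
    }
    return acc;
}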