// NOTE(review): Compiler-generated PTX (nvcc 11.8 / NVVM 7.0.1, .target sm_60, 64-bit).
// Do not hand-edit instructions: this file is regenerated from the CUDA source
// (gemv.cu, template instance gemv_int4<4,64,3>). Comments below annotate intent only.
//
// Kernel: dequant_gemv_group64_batch32(DequantGemvKernelParams)
//   - Single 80-byte by-value param struct (param_0) holding pointers + scalars.
//   - 1920 B of .shared memory (demoted CUB-style "temp_storage") is used for the
//     cross-warp part of three partial-sum reductions (three accumulators).
//   - Thrust placeholder globals are 1-byte dummies emitted by the toolchain.
// First line: param loads at struct offsets +48/+56 (four i32 scalars) and +64 (two f32 scales).
tion tools, release 11.8, V11.8.85 // Based on NVVM 7.0.1 // .version 7.8 .target sm_60 .address_size 64 // .globl _Z27dequant_gemv_group64_batch323DequantGemvKernelParams // _ZZ9gemv_int4ILi4ELi64ELi3EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage has been demoted .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_021679306thrust6system6detail10sequential3seqE[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_021679306thrust12placeholders2_1E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_021679306thrust12placeholders2_2E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_021679306thrust12placeholders2_3E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_021679306thrust12placeholders2_4E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_021679306thrust12placeholders2_5E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_021679306thrust12placeholders2_6E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_021679306thrust12placeholders2_7E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_021679306thrust12placeholders2_8E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_021679306thrust12placeholders2_9E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_021679306thrust12placeholders3_10E[1]; .visible .entry _Z27dequant_gemv_group64_batch323DequantGemvKernelParams( .param .align 8 .b8 _Z27dequant_gemv_group64_batch323DequantGemvKernelParams_param_0[80] ) { .reg .pred %p<16>; .reg .b16 %rs<269>; .reg .f32 %f<401>; .reg .b32 %r<202>; .reg .b64 %rd<49>; // demoted variable .shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi64ELi3EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[1920]; ld.param.v2.u32 {%r28, %r29}, [_Z27dequant_gemv_group64_batch323DequantGemvKernelParams_param_0+48]; ld.param.v2.u32 {%r30, %r31}, [_Z27dequant_gemv_group64_batch323DequantGemvKernelParams_param_0+56]; ld.param.v2.f32 {%f32, %f33}, [_Z27dequant_gemv_group64_batch323DequantGemvKernelParams_param_0+64]; 
// Prologue (cont.): load a 4-byte flag pack at +72 (%rs67 is tested later as the
// signed/unsigned dequant mode switch) and four 64-bit pointers at +16/+24/+32/+40.
// Thread indexing: %r200 = tid.y*32 + tid.x (linear lane id across the 4 warps of
// 32 threads each implied by the "+4" loop stride below); %r199 = %r200*2 is the
// element index guarded against %r30. Accumulators %f392/%f393/%f394 (three output
// columns — the "3" in gemv_int4<4,64,3>) start at 0.0; if the guard fails we skip
// straight to the reduction at $L__BB0_9.
// $L__BB0_2 is the head of the per-tile loop: load two packed u32 of int4 weights
// (%r34,%r35 = 16 nibbles), load the f16 scale for this group (%rs75 -> %f37), and,
// when the zero-point pointer (%rd21) is non-null, fetch the packed 4-bit zero-point
// (two per byte: shr by (%r16*4 & 4), mask 15) into %rs268; otherwise %rs268 keeps
// the default %rs2 = %rs67 << 3 (presumably mid-range default zero-point — TODO confirm
// against the CUDA source).
ld.param.v4.u8 {%rs67, %rs68, %rs69, %rs70}, [_Z27dequant_gemv_group64_batch323DequantGemvKernelParams_param_0+72]; ld.param.u64 %rd22, [_Z27dequant_gemv_group64_batch323DequantGemvKernelParams_param_0+40]; ld.param.u64 %rd21, [_Z27dequant_gemv_group64_batch323DequantGemvKernelParams_param_0+32]; ld.param.u64 %rd20, [_Z27dequant_gemv_group64_batch323DequantGemvKernelParams_param_0+24]; ld.param.u64 %rd19, [_Z27dequant_gemv_group64_batch323DequantGemvKernelParams_param_0+16]; mov.u32 %r1, %ctaid.x; mov.u32 %r201, %tid.y; shl.b32 %r32, %r201, 5; mov.u32 %r3, %tid.x; add.s32 %r200, %r32, %r3; shl.b32 %r199, %r200, 1; setp.ge.u32 %p1, %r199, %r30; mov.f32 %f392, 0f00000000; mov.f32 %f393, %f392; mov.f32 %f394, %f392; @%p1 bra $L__BB0_9; cvta.to.global.u64 %rd1, %rd19; mul.lo.s32 %r8, %r30, %r1; shr.u32 %r9, %r3, 2; mul.lo.s32 %r10, %r31, %r1; shl.b16 %rs2, %rs67, 3; mul.wide.s32 %rd3, %r28, 2; cvta.to.global.u64 %rd4, %rd21; cvta.to.global.u64 %rd5, %rd20; cvta.to.global.u64 %rd6, %rd22; $L__BB0_2: add.s32 %r33, %r199, %r8; mul.wide.u32 %rd23, %r33, 4; add.s64 %rd24, %rd5, %rd23; ld.global.v2.u32 {%r34, %r35}, [%rd24]; shl.b32 %r36, %r201, 3; add.s32 %r16, %r36, %r9; add.s32 %r17, %r16, %r10; mul.wide.s32 %rd25, %r17, 2; add.s64 %rd26, %rd6, %rd25; ld.global.u16 %rs75, [%rd26]; // begin inline asm { cvt.f32.f16 %f37, %rs75;} // end inline asm setp.eq.s64 %p2, %rd21, 0; mov.u16 %rs268, %rs2; @%p2 bra $L__BB0_4; shr.u32 %r37, %r17, 31; add.s32 %r38, %r17, %r37; shr.s32 %r39, %r38, 1; cvt.s64.s32 %rd27, %r39; add.s64 %rd28, %rd4, %rd27; ld.global.u8 %r40, [%rd28]; shl.b32 %r41, %r16, 2; and.b32 %r42, %r41, 4; shr.u32 %r43, %r40, %r42; cvt.u16.u32 %rs76, %r43; and.b16 %rs268, %rs76, 15; $L__BB0_4: shl.b32 %r18, %r200, 4; setp.ge.s32 %p3, %r18, %r28; @%p3 bra $L__BB0_8; setp.eq.s16 %p4, %rs67, 0; shr.u16 %rs78, %rs268, 3; and.b16 %rs79, %rs78, 1; setp.eq.b16 %p5, %rs79, 1; and.pred %p6, %p4, %p5; selp.b16 %rs80, -16, 0, %p6; or.b16 %rs81, %rs80, %rs268; cvt.s16.s8 %rs82, 
// Zero-point finalized: when %rs67==0 (signed mode) the nibble's bit 3 is used to
// sign-extend the 4-bit zero-point (OR in -16) before cvt.s16.s8 -> %f5 (float zp).
// Then load three rows of 8 f16 activations each via v4.u32 vector loads:
//   %rd7  = row 0 (base), %rd8 = row 0 + %rd3 (row pitch = %r28*2 bytes),
//   %rd32 = row 2 (base + 2*%r28 elements) — the three batch columns accumulated
//   into %f394/%f393/%f392 respectively.
// The sixteen 4-bit weights are unpacked from %r34/%r35 by shr/and-15 into
// %rs6,%rs11,%rs16,...; f16 activation halves are split out of the u32 pairs with
// mov.b32 {lo,hi}. %p4 (flag==0) selects the *unsigned*-nibble fast path below vs
// the sign-extending path at $L__BB0_7. First dequant step of the fast path starts
// at the end of this line: w = (nibble - zp) * scale (%f88), then three fused FMAs.
%rs81; cvt.rn.f32.s16 %f5, %rs82; mul.wide.s32 %rd29, %r18, 2; add.s64 %rd7, %rd1, %rd29; ld.global.v4.u32 {%r44, %r45, %r46, %r47}, [%rd7]; add.s64 %rd8, %rd7, %rd3; ld.global.v4.u32 {%r52, %r53, %r54, %r55}, [%rd8]; add.s32 %r60, %r18, %r28; add.s32 %r61, %r60, %r28; mul.wide.s32 %rd31, %r61, 2; add.s64 %rd32, %rd1, %rd31; ld.global.v4.u32 {%r62, %r63, %r64, %r65}, [%rd32]; cvt.u16.u32 %rs5, %r34; and.b16 %rs6, %rs5, 15; mov.b32 {%rs7, %rs12}, %r44; mov.b32 {%rs8, %rs13}, %r52; mov.b32 {%rs9, %rs14}, %r62; shr.u32 %r70, %r34, 4; cvt.u16.u32 %rs10, %r70; and.b16 %rs11, %rs10, 15; shr.u32 %r71, %r34, 8; cvt.u16.u32 %rs15, %r71; and.b16 %rs16, %rs15, 15; mov.b32 {%rs17, %rs22}, %r45; mov.b32 {%rs18, %rs23}, %r53; mov.b32 {%rs19, %rs24}, %r63; shr.u32 %r72, %r34, 12; cvt.u16.u32 %rs20, %r72; and.b16 %rs21, %rs20, 15; shr.u32 %r73, %r34, 16; cvt.u16.u32 %rs25, %r73; and.b16 %rs26, %rs25, 15; mov.b32 {%rs27, %rs32}, %r46; mov.b32 {%rs28, %rs33}, %r54; mov.b32 {%rs29, %rs34}, %r64; shr.u32 %r74, %r34, 20; cvt.u16.u32 %rs30, %r74; and.b16 %rs31, %rs30, 15; shr.u32 %r75, %r34, 24; cvt.u16.u32 %rs35, %r75; and.b16 %rs36, %rs35, 15; mov.b32 {%rs37, %rs41}, %r47; mov.b32 {%rs38, %rs42}, %r55; mov.b32 {%rs39, %rs43}, %r65; shr.u32 %r76, %r34, 28; cvt.u16.u32 %rs40, %r76; cvt.u16.u32 %rs44, %r35; and.b16 %rs45, %rs44, 15; shr.u32 %r77, %r35, 4; cvt.u16.u32 %rs46, %r77; and.b16 %rs47, %rs46, 15; shr.u32 %r78, %r35, 8; cvt.u16.u32 %rs48, %r78; and.b16 %rs49, %rs48, 15; shr.u32 %r79, %r35, 12; cvt.u16.u32 %rs50, %r79; and.b16 %rs51, %rs50, 15; shr.u32 %r80, %r35, 16; cvt.u16.u32 %rs52, %r80; and.b16 %rs53, %rs52, 15; shr.u32 %r81, %r35, 20; cvt.u16.u32 %rs54, %r81; and.b16 %rs55, %rs54, 15; shr.u32 %r82, %r35, 24; cvt.u16.u32 %rs56, %r82; and.b16 %rs57, %rs56, 15; shr.u32 %r83, %r35, 28; cvt.u16.u32 %rs58, %r83; add.s64 %rd34, %rd8, %rd3; add.s64 %rd9, %rd34, 16; @%p4 bra $L__BB0_7; cvt.rn.f32.s16 %f86, %rs6; sub.ftz.f32 %f87, %f86, %f5; mul.ftz.f32 %f88, %f37, %f87; // begin 
// Unsigned-nibble dequant path, nibbles 0..5 of %r34: for each nibble k the pattern is
//   wk = scale * (float(nibble_k) - zp)            (cvt / sub.ftz / mul.ftz)
//   acc_col{0,1,2} += wk * f16_to_f32(act_row{0,1,2}[k])   (three fma.rn.ftz chains)
// The inline-asm brackets are the compiler's wrapping of __half -> float conversion
// (cvt.f32.f16); .ftz (flush-to-zero) arithmetic throughout.
inline asm { cvt.f32.f16 %f38, %rs7;} // end inline asm fma.rn.ftz.f32 %f89, %f88, %f38, %f394; // begin inline asm { cvt.f32.f16 %f39, %rs8;} // end inline asm fma.rn.ftz.f32 %f90, %f88, %f39, %f393; // begin inline asm { cvt.f32.f16 %f40, %rs9;} // end inline asm fma.rn.ftz.f32 %f91, %f88, %f40, %f392; cvt.rn.f32.s16 %f92, %rs11; sub.ftz.f32 %f93, %f92, %f5; mul.ftz.f32 %f94, %f37, %f93; // begin inline asm { cvt.f32.f16 %f41, %rs12;} // end inline asm fma.rn.ftz.f32 %f95, %f94, %f41, %f89; // begin inline asm { cvt.f32.f16 %f42, %rs13;} // end inline asm fma.rn.ftz.f32 %f96, %f94, %f42, %f90; // begin inline asm { cvt.f32.f16 %f43, %rs14;} // end inline asm fma.rn.ftz.f32 %f97, %f94, %f43, %f91; cvt.rn.f32.s16 %f98, %rs16; sub.ftz.f32 %f99, %f98, %f5; mul.ftz.f32 %f100, %f37, %f99; // begin inline asm { cvt.f32.f16 %f44, %rs17;} // end inline asm fma.rn.ftz.f32 %f101, %f100, %f44, %f95; // begin inline asm { cvt.f32.f16 %f45, %rs18;} // end inline asm fma.rn.ftz.f32 %f102, %f100, %f45, %f96; // begin inline asm { cvt.f32.f16 %f46, %rs19;} // end inline asm fma.rn.ftz.f32 %f103, %f100, %f46, %f97; cvt.rn.f32.s16 %f104, %rs21; sub.ftz.f32 %f105, %f104, %f5; mul.ftz.f32 %f106, %f37, %f105; // begin inline asm { cvt.f32.f16 %f47, %rs22;} // end inline asm fma.rn.ftz.f32 %f107, %f106, %f47, %f101; // begin inline asm { cvt.f32.f16 %f48, %rs23;} // end inline asm fma.rn.ftz.f32 %f108, %f106, %f48, %f102; // begin inline asm { cvt.f32.f16 %f49, %rs24;} // end inline asm fma.rn.ftz.f32 %f109, %f106, %f49, %f103; cvt.rn.f32.s16 %f110, %rs26; sub.ftz.f32 %f111, %f110, %f5; mul.ftz.f32 %f112, %f37, %f111; // begin inline asm { cvt.f32.f16 %f50, %rs27;} // end inline asm fma.rn.ftz.f32 %f113, %f112, %f50, %f107; // begin inline asm { cvt.f32.f16 %f51, %rs28;} // end inline asm fma.rn.ftz.f32 %f114, %f112, %f51, %f108; // begin inline asm { cvt.f32.f16 %f52, %rs29;} // end inline asm fma.rn.ftz.f32 %f115, %f112, %f52, %f109; cvt.rn.f32.s16 %f116, %rs31; sub.ftz.f32 %f117, 
// Unsigned path (cont.): nibbles 5..7 of %r34, then the second half: load the next
// 8 f16 activations per row ([%rd7+16], [%rd8+16], [%rd9]) and continue the same
// dequant+FMA pattern with nibbles of %r35 (%rs45, %rs47, ...).
%f116, %f5; mul.ftz.f32 %f118, %f37, %f117; // begin inline asm { cvt.f32.f16 %f53, %rs32;} // end inline asm fma.rn.ftz.f32 %f119, %f118, %f53, %f113; // begin inline asm { cvt.f32.f16 %f54, %rs33;} // end inline asm fma.rn.ftz.f32 %f120, %f118, %f54, %f114; // begin inline asm { cvt.f32.f16 %f55, %rs34;} // end inline asm fma.rn.ftz.f32 %f121, %f118, %f55, %f115; cvt.rn.f32.s16 %f122, %rs36; sub.ftz.f32 %f123, %f122, %f5; mul.ftz.f32 %f124, %f37, %f123; // begin inline asm { cvt.f32.f16 %f56, %rs37;} // end inline asm fma.rn.ftz.f32 %f125, %f124, %f56, %f119; // begin inline asm { cvt.f32.f16 %f57, %rs38;} // end inline asm fma.rn.ftz.f32 %f126, %f124, %f57, %f120; // begin inline asm { cvt.f32.f16 %f58, %rs39;} // end inline asm fma.rn.ftz.f32 %f127, %f124, %f58, %f121; cvt.rn.f32.s16 %f128, %rs40; sub.ftz.f32 %f129, %f128, %f5; mul.ftz.f32 %f130, %f37, %f129; // begin inline asm { cvt.f32.f16 %f59, %rs41;} // end inline asm fma.rn.ftz.f32 %f131, %f130, %f59, %f125; // begin inline asm { cvt.f32.f16 %f60, %rs42;} // end inline asm fma.rn.ftz.f32 %f132, %f130, %f60, %f126; // begin inline asm { cvt.f32.f16 %f61, %rs43;} // end inline asm fma.rn.ftz.f32 %f133, %f130, %f61, %f127; ld.global.v4.u32 {%r84, %r85, %r86, %r87}, [%rd7+16]; ld.global.v4.u32 {%r92, %r93, %r94, %r95}, [%rd8+16]; ld.global.v4.u32 {%r100, %r101, %r102, %r103}, [%rd9]; cvt.rn.f32.s16 %f134, %rs45; sub.ftz.f32 %f135, %f134, %f5; mul.ftz.f32 %f136, %f37, %f135; mov.b32 {%rs107, %rs110}, %r84; // begin inline asm { cvt.f32.f16 %f62, %rs107;} // end inline asm fma.rn.ftz.f32 %f137, %f136, %f62, %f131; mov.b32 {%rs108, %rs111}, %r92; // begin inline asm { cvt.f32.f16 %f63, %rs108;} // end inline asm fma.rn.ftz.f32 %f138, %f136, %f63, %f132; mov.b32 {%rs109, %rs112}, %r100; // begin inline asm { cvt.f32.f16 %f64, %rs109;} // end inline asm fma.rn.ftz.f32 %f139, %f136, %f64, %f133; cvt.rn.f32.s16 %f140, %rs47; sub.ftz.f32 %f141, %f140, %f5; mul.ftz.f32 %f142, %f37, %f141; // begin inline asm { 
// Unsigned path (cont.): nibbles 1..6 of %r35 against activation halves unpacked
// from %r84..%r87 / %r92..%r95 / %r100..%r103. Same wk = scale*(nibble-zp) then
// three FMAs per nibble.
cvt.f32.f16 %f65, %rs110;} // end inline asm fma.rn.ftz.f32 %f143, %f142, %f65, %f137; // begin inline asm { cvt.f32.f16 %f66, %rs111;} // end inline asm fma.rn.ftz.f32 %f144, %f142, %f66, %f138; // begin inline asm { cvt.f32.f16 %f67, %rs112;} // end inline asm fma.rn.ftz.f32 %f145, %f142, %f67, %f139; cvt.rn.f32.s16 %f146, %rs49; sub.ftz.f32 %f147, %f146, %f5; mul.ftz.f32 %f148, %f37, %f147; mov.b32 {%rs113, %rs116}, %r85; // begin inline asm { cvt.f32.f16 %f68, %rs113;} // end inline asm fma.rn.ftz.f32 %f149, %f148, %f68, %f143; mov.b32 {%rs114, %rs117}, %r93; // begin inline asm { cvt.f32.f16 %f69, %rs114;} // end inline asm fma.rn.ftz.f32 %f150, %f148, %f69, %f144; mov.b32 {%rs115, %rs118}, %r101; // begin inline asm { cvt.f32.f16 %f70, %rs115;} // end inline asm fma.rn.ftz.f32 %f151, %f148, %f70, %f145; cvt.rn.f32.s16 %f152, %rs51; sub.ftz.f32 %f153, %f152, %f5; mul.ftz.f32 %f154, %f37, %f153; // begin inline asm { cvt.f32.f16 %f71, %rs116;} // end inline asm fma.rn.ftz.f32 %f155, %f154, %f71, %f149; // begin inline asm { cvt.f32.f16 %f72, %rs117;} // end inline asm fma.rn.ftz.f32 %f156, %f154, %f72, %f150; // begin inline asm { cvt.f32.f16 %f73, %rs118;} // end inline asm fma.rn.ftz.f32 %f157, %f154, %f73, %f151; cvt.rn.f32.s16 %f158, %rs53; sub.ftz.f32 %f159, %f158, %f5; mul.ftz.f32 %f160, %f37, %f159; mov.b32 {%rs119, %rs122}, %r86; // begin inline asm { cvt.f32.f16 %f74, %rs119;} // end inline asm fma.rn.ftz.f32 %f161, %f160, %f74, %f155; mov.b32 {%rs120, %rs123}, %r94; // begin inline asm { cvt.f32.f16 %f75, %rs120;} // end inline asm fma.rn.ftz.f32 %f162, %f160, %f75, %f156; mov.b32 {%rs121, %rs124}, %r102; // begin inline asm { cvt.f32.f16 %f76, %rs121;} // end inline asm fma.rn.ftz.f32 %f163, %f160, %f76, %f157; cvt.rn.f32.s16 %f164, %rs55; sub.ftz.f32 %f165, %f164, %f5; mul.ftz.f32 %f166, %f37, %f165; // begin inline asm { cvt.f32.f16 %f77, %rs122;} // end inline asm fma.rn.ftz.f32 %f167, %f166, %f77, %f161; // begin inline asm { cvt.f32.f16 %f78, 
// Unsigned path tail: last two nibbles of %r35 feed the loop-carried accumulators
// %f394/%f393/%f392 directly, then bra.uni to the loop latch $L__BB0_8.
// $L__BB0_7 starts here: *sign-extending* dequant path (taken when flag %rs67==0).
// Each nibble is sign-extended with the shift trick:
//   shl.b16 4; cvt.s16.s8; shr.s16 7; and -16; or nibble  ==> int4 -> int16
// then the same wk = scale*(w - zp) + three-FMA accumulation as the other path.
%rs123;} // end inline asm fma.rn.ftz.f32 %f168, %f166, %f78, %f162; // begin inline asm { cvt.f32.f16 %f79, %rs124;} // end inline asm fma.rn.ftz.f32 %f169, %f166, %f79, %f163; cvt.rn.f32.s16 %f170, %rs57; sub.ftz.f32 %f171, %f170, %f5; mul.ftz.f32 %f172, %f37, %f171; mov.b32 {%rs125, %rs128}, %r87; // begin inline asm { cvt.f32.f16 %f80, %rs125;} // end inline asm fma.rn.ftz.f32 %f173, %f172, %f80, %f167; mov.b32 {%rs126, %rs129}, %r95; // begin inline asm { cvt.f32.f16 %f81, %rs126;} // end inline asm fma.rn.ftz.f32 %f174, %f172, %f81, %f168; mov.b32 {%rs127, %rs130}, %r103; // begin inline asm { cvt.f32.f16 %f82, %rs127;} // end inline asm fma.rn.ftz.f32 %f175, %f172, %f82, %f169; cvt.rn.f32.s16 %f176, %rs58; sub.ftz.f32 %f177, %f176, %f5; mul.ftz.f32 %f178, %f37, %f177; // begin inline asm { cvt.f32.f16 %f83, %rs128;} // end inline asm fma.rn.ftz.f32 %f394, %f178, %f83, %f173; // begin inline asm { cvt.f32.f16 %f84, %rs129;} // end inline asm fma.rn.ftz.f32 %f393, %f178, %f84, %f174; // begin inline asm { cvt.f32.f16 %f85, %rs130;} // end inline asm fma.rn.ftz.f32 %f392, %f178, %f85, %f175; bra.uni $L__BB0_8; $L__BB0_7: shl.b16 %rs179, %rs5, 4; cvt.s16.s8 %rs180, %rs179; shr.s16 %rs181, %rs180, 7; and.b16 %rs182, %rs181, -16; or.b16 %rs183, %rs182, %rs6; cvt.rn.f32.s16 %f227, %rs183; sub.ftz.f32 %f228, %f227, %f5; mul.ftz.f32 %f229, %f37, %f228; // begin inline asm { cvt.f32.f16 %f179, %rs7;} // end inline asm fma.rn.ftz.f32 %f230, %f229, %f179, %f394; // begin inline asm { cvt.f32.f16 %f180, %rs8;} // end inline asm fma.rn.ftz.f32 %f231, %f229, %f180, %f393; // begin inline asm { cvt.f32.f16 %f181, %rs9;} // end inline asm fma.rn.ftz.f32 %f232, %f229, %f181, %f392; shl.b16 %rs184, %rs10, 4; cvt.s16.s8 %rs185, %rs184; shr.s16 %rs186, %rs185, 7; and.b16 %rs187, %rs186, -16; or.b16 %rs188, %rs187, %rs11; cvt.rn.f32.s16 %f233, %rs188; sub.ftz.f32 %f234, %f233, %f5; mul.ftz.f32 %f235, %f37, %f234; // begin inline asm { cvt.f32.f16 %f182, %rs12;} // end inline asm 
// Signed path (cont.): nibbles 1..5 of %r34 — sign-extend, dequant, accumulate
// into the three column chains (%f23x running sums).
fma.rn.ftz.f32 %f236, %f235, %f182, %f230; // begin inline asm { cvt.f32.f16 %f183, %rs13;} // end inline asm fma.rn.ftz.f32 %f237, %f235, %f183, %f231; // begin inline asm { cvt.f32.f16 %f184, %rs14;} // end inline asm fma.rn.ftz.f32 %f238, %f235, %f184, %f232; shl.b16 %rs189, %rs15, 4; cvt.s16.s8 %rs190, %rs189; shr.s16 %rs191, %rs190, 7; and.b16 %rs192, %rs191, -16; or.b16 %rs193, %rs192, %rs16; cvt.rn.f32.s16 %f239, %rs193; sub.ftz.f32 %f240, %f239, %f5; mul.ftz.f32 %f241, %f37, %f240; // begin inline asm { cvt.f32.f16 %f185, %rs17;} // end inline asm fma.rn.ftz.f32 %f242, %f241, %f185, %f236; // begin inline asm { cvt.f32.f16 %f186, %rs18;} // end inline asm fma.rn.ftz.f32 %f243, %f241, %f186, %f237; // begin inline asm { cvt.f32.f16 %f187, %rs19;} // end inline asm fma.rn.ftz.f32 %f244, %f241, %f187, %f238; shl.b16 %rs194, %rs20, 4; cvt.s16.s8 %rs195, %rs194; shr.s16 %rs196, %rs195, 7; and.b16 %rs197, %rs196, -16; or.b16 %rs198, %rs197, %rs21; cvt.rn.f32.s16 %f245, %rs198; sub.ftz.f32 %f246, %f245, %f5; mul.ftz.f32 %f247, %f37, %f246; // begin inline asm { cvt.f32.f16 %f188, %rs22;} // end inline asm fma.rn.ftz.f32 %f248, %f247, %f188, %f242; // begin inline asm { cvt.f32.f16 %f189, %rs23;} // end inline asm fma.rn.ftz.f32 %f249, %f247, %f189, %f243; // begin inline asm { cvt.f32.f16 %f190, %rs24;} // end inline asm fma.rn.ftz.f32 %f250, %f247, %f190, %f244; shl.b16 %rs199, %rs25, 4; cvt.s16.s8 %rs200, %rs199; shr.s16 %rs201, %rs200, 7; and.b16 %rs202, %rs201, -16; or.b16 %rs203, %rs202, %rs26; cvt.rn.f32.s16 %f251, %rs203; sub.ftz.f32 %f252, %f251, %f5; mul.ftz.f32 %f253, %f37, %f252; // begin inline asm { cvt.f32.f16 %f191, %rs27;} // end inline asm fma.rn.ftz.f32 %f254, %f253, %f191, %f248; // begin inline asm { cvt.f32.f16 %f192, %rs28;} // end inline asm fma.rn.ftz.f32 %f255, %f253, %f192, %f249; // begin inline asm { cvt.f32.f16 %f193, %rs29;} // end inline asm fma.rn.ftz.f32 %f256, %f253, %f193, %f250; shl.b16 %rs204, %rs30, 4; cvt.s16.s8 %rs205, 
// Signed path (cont.): nibbles 5..7 of %r34, then the second vector load of
// activations ([%rd7+16]/[%rd8+16]/[%rd9]) and the first nibble of %r35.
// NOTE(review): for the top nibble the OR uses %rs40 (the unmasked shr-28 value)
// on both sides of the sign-extend (or.b16 %rs218, %rs217, %rs40) — harmless since
// shr by 28 already leaves only 4 bits, but worth confirming against the source.
%rs204; shr.s16 %rs206, %rs205, 7; and.b16 %rs207, %rs206, -16; or.b16 %rs208, %rs207, %rs31; cvt.rn.f32.s16 %f257, %rs208; sub.ftz.f32 %f258, %f257, %f5; mul.ftz.f32 %f259, %f37, %f258; // begin inline asm { cvt.f32.f16 %f194, %rs32;} // end inline asm fma.rn.ftz.f32 %f260, %f259, %f194, %f254; // begin inline asm { cvt.f32.f16 %f195, %rs33;} // end inline asm fma.rn.ftz.f32 %f261, %f259, %f195, %f255; // begin inline asm { cvt.f32.f16 %f196, %rs34;} // end inline asm fma.rn.ftz.f32 %f262, %f259, %f196, %f256; shl.b16 %rs209, %rs35, 4; cvt.s16.s8 %rs210, %rs209; shr.s16 %rs211, %rs210, 7; and.b16 %rs212, %rs211, -16; or.b16 %rs213, %rs212, %rs36; cvt.rn.f32.s16 %f263, %rs213; sub.ftz.f32 %f264, %f263, %f5; mul.ftz.f32 %f265, %f37, %f264; // begin inline asm { cvt.f32.f16 %f197, %rs37;} // end inline asm fma.rn.ftz.f32 %f266, %f265, %f197, %f260; // begin inline asm { cvt.f32.f16 %f198, %rs38;} // end inline asm fma.rn.ftz.f32 %f267, %f265, %f198, %f261; // begin inline asm { cvt.f32.f16 %f199, %rs39;} // end inline asm fma.rn.ftz.f32 %f268, %f265, %f199, %f262; shl.b16 %rs214, %rs40, 4; cvt.s16.s8 %rs215, %rs214; shr.s16 %rs216, %rs215, 7; and.b16 %rs217, %rs216, -16; or.b16 %rs218, %rs217, %rs40; cvt.rn.f32.s16 %f269, %rs218; sub.ftz.f32 %f270, %f269, %f5; mul.ftz.f32 %f271, %f37, %f270; // begin inline asm { cvt.f32.f16 %f200, %rs41;} // end inline asm fma.rn.ftz.f32 %f272, %f271, %f200, %f266; // begin inline asm { cvt.f32.f16 %f201, %rs42;} // end inline asm fma.rn.ftz.f32 %f273, %f271, %f201, %f267; // begin inline asm { cvt.f32.f16 %f202, %rs43;} // end inline asm fma.rn.ftz.f32 %f274, %f271, %f202, %f268; ld.global.v4.u32 {%r108, %r109, %r110, %r111}, [%rd7+16]; ld.global.v4.u32 {%r116, %r117, %r118, %r119}, [%rd8+16]; ld.global.v4.u32 {%r124, %r125, %r126, %r127}, [%rd9]; shl.b16 %rs219, %rs44, 4; cvt.s16.s8 %rs220, %rs219; shr.s16 %rs221, %rs220, 7; and.b16 %rs222, %rs221, -16; or.b16 %rs223, %rs222, %rs45; cvt.rn.f32.s16 %f275, %rs223; sub.ftz.f32 %f276, 
// Signed path (cont.): nibbles 0..3 of %r35 against activation halves from
// %r108..%r111 / %r116..%r119 / %r124..%r127.
%f275, %f5; mul.ftz.f32 %f277, %f37, %f276; mov.b32 {%rs155, %rs158}, %r108; // begin inline asm { cvt.f32.f16 %f203, %rs155;} // end inline asm fma.rn.ftz.f32 %f278, %f277, %f203, %f272; mov.b32 {%rs156, %rs159}, %r116; // begin inline asm { cvt.f32.f16 %f204, %rs156;} // end inline asm fma.rn.ftz.f32 %f279, %f277, %f204, %f273; mov.b32 {%rs157, %rs160}, %r124; // begin inline asm { cvt.f32.f16 %f205, %rs157;} // end inline asm fma.rn.ftz.f32 %f280, %f277, %f205, %f274; shl.b16 %rs224, %rs46, 4; cvt.s16.s8 %rs225, %rs224; shr.s16 %rs226, %rs225, 7; and.b16 %rs227, %rs226, -16; or.b16 %rs228, %rs227, %rs47; cvt.rn.f32.s16 %f281, %rs228; sub.ftz.f32 %f282, %f281, %f5; mul.ftz.f32 %f283, %f37, %f282; // begin inline asm { cvt.f32.f16 %f206, %rs158;} // end inline asm fma.rn.ftz.f32 %f284, %f283, %f206, %f278; // begin inline asm { cvt.f32.f16 %f207, %rs159;} // end inline asm fma.rn.ftz.f32 %f285, %f283, %f207, %f279; // begin inline asm { cvt.f32.f16 %f208, %rs160;} // end inline asm fma.rn.ftz.f32 %f286, %f283, %f208, %f280; shl.b16 %rs229, %rs48, 4; cvt.s16.s8 %rs230, %rs229; shr.s16 %rs231, %rs230, 7; and.b16 %rs232, %rs231, -16; or.b16 %rs233, %rs232, %rs49; cvt.rn.f32.s16 %f287, %rs233; sub.ftz.f32 %f288, %f287, %f5; mul.ftz.f32 %f289, %f37, %f288; mov.b32 {%rs161, %rs164}, %r109; // begin inline asm { cvt.f32.f16 %f209, %rs161;} // end inline asm fma.rn.ftz.f32 %f290, %f289, %f209, %f284; mov.b32 {%rs162, %rs165}, %r117; // begin inline asm { cvt.f32.f16 %f210, %rs162;} // end inline asm fma.rn.ftz.f32 %f291, %f289, %f210, %f285; mov.b32 {%rs163, %rs166}, %r125; // begin inline asm { cvt.f32.f16 %f211, %rs163;} // end inline asm fma.rn.ftz.f32 %f292, %f289, %f211, %f286; shl.b16 %rs234, %rs50, 4; cvt.s16.s8 %rs235, %rs234; shr.s16 %rs236, %rs235, 7; and.b16 %rs237, %rs236, -16; or.b16 %rs238, %rs237, %rs51; cvt.rn.f32.s16 %f293, %rs238; sub.ftz.f32 %f294, %f293, %f5; mul.ftz.f32 %f295, %f37, %f294; // begin inline asm { cvt.f32.f16 %f212, %rs164;} // end 
// Signed path (cont.): nibbles 3..6 of %r35.
inline asm fma.rn.ftz.f32 %f296, %f295, %f212, %f290; // begin inline asm { cvt.f32.f16 %f213, %rs165;} // end inline asm fma.rn.ftz.f32 %f297, %f295, %f213, %f291; // begin inline asm { cvt.f32.f16 %f214, %rs166;} // end inline asm fma.rn.ftz.f32 %f298, %f295, %f214, %f292; shl.b16 %rs239, %rs52, 4; cvt.s16.s8 %rs240, %rs239; shr.s16 %rs241, %rs240, 7; and.b16 %rs242, %rs241, -16; or.b16 %rs243, %rs242, %rs53; cvt.rn.f32.s16 %f299, %rs243; sub.ftz.f32 %f300, %f299, %f5; mul.ftz.f32 %f301, %f37, %f300; mov.b32 {%rs167, %rs170}, %r110; // begin inline asm { cvt.f32.f16 %f215, %rs167;} // end inline asm fma.rn.ftz.f32 %f302, %f301, %f215, %f296; mov.b32 {%rs168, %rs171}, %r118; // begin inline asm { cvt.f32.f16 %f216, %rs168;} // end inline asm fma.rn.ftz.f32 %f303, %f301, %f216, %f297; mov.b32 {%rs169, %rs172}, %r126; // begin inline asm { cvt.f32.f16 %f217, %rs169;} // end inline asm fma.rn.ftz.f32 %f304, %f301, %f217, %f298; shl.b16 %rs244, %rs54, 4; cvt.s16.s8 %rs245, %rs244; shr.s16 %rs246, %rs245, 7; and.b16 %rs247, %rs246, -16; or.b16 %rs248, %rs247, %rs55; cvt.rn.f32.s16 %f305, %rs248; sub.ftz.f32 %f306, %f305, %f5; mul.ftz.f32 %f307, %f37, %f306; // begin inline asm { cvt.f32.f16 %f218, %rs170;} // end inline asm fma.rn.ftz.f32 %f308, %f307, %f218, %f302; // begin inline asm { cvt.f32.f16 %f219, %rs171;} // end inline asm fma.rn.ftz.f32 %f309, %f307, %f219, %f303; // begin inline asm { cvt.f32.f16 %f220, %rs172;} // end inline asm fma.rn.ftz.f32 %f310, %f307, %f220, %f304; shl.b16 %rs249, %rs56, 4; cvt.s16.s8 %rs250, %rs249; shr.s16 %rs251, %rs250, 7; and.b16 %rs252, %rs251, -16; or.b16 %rs253, %rs252, %rs57; cvt.rn.f32.s16 %f311, %rs253; sub.ftz.f32 %f312, %f311, %f5; mul.ftz.f32 %f313, %f37, %f312; mov.b32 {%rs173, %rs176}, %r111; // begin inline asm { cvt.f32.f16 %f221, %rs173;} // end inline asm fma.rn.ftz.f32 %f314, %f313, %f221, %f308; mov.b32 {%rs174, %rs177}, %r119; // begin inline asm { cvt.f32.f16 %f222, %rs174;} // end inline asm fma.rn.ftz.f32 
// Signed path tail: last nibble of %r35 (note the same unmasked-%rs58 OR as the
// top nibble of %r34) writes the loop-carried accumulators %f394/%f393/%f392.
// $L__BB0_8: loop latch — advance %r201 by 4 (tid.y stride across the 4 warps),
// recompute the linear/element indices, loop back to $L__BB0_2 while %r199 < %r30.
// $L__BB0_9: block reduction, column 0. Threads with linear id >= 32 spill %f394
// into shared temp_storage at (%r135 - 112); after bar.sync, threads < 32 gather
// three partials ([%r23+16/+20/+24], %r23 = id*12 + base) and finish with a
// 5-step shfl.sync.down warp reduction (offsets 1,2,4,8,16; full 0xffffffff mask).
%f315, %f313, %f222, %f309; mov.b32 {%rs175, %rs178}, %r127; // begin inline asm { cvt.f32.f16 %f223, %rs175;} // end inline asm fma.rn.ftz.f32 %f316, %f313, %f223, %f310; shl.b16 %rs254, %rs58, 4; cvt.s16.s8 %rs255, %rs254; shr.s16 %rs256, %rs255, 7; and.b16 %rs257, %rs256, -16; or.b16 %rs258, %rs257, %rs58; cvt.rn.f32.s16 %f317, %rs258; sub.ftz.f32 %f318, %f317, %f5; mul.ftz.f32 %f319, %f37, %f318; // begin inline asm { cvt.f32.f16 %f224, %rs176;} // end inline asm fma.rn.ftz.f32 %f394, %f319, %f224, %f314; // begin inline asm { cvt.f32.f16 %f225, %rs177;} // end inline asm fma.rn.ftz.f32 %f393, %f319, %f225, %f315; // begin inline asm { cvt.f32.f16 %f226, %rs178;} // end inline asm fma.rn.ftz.f32 %f392, %f319, %f226, %f316; $L__BB0_8: add.s32 %r201, %r201, 4; shl.b32 %r132, %r201, 5; add.s32 %r200, %r132, %r3; shl.b32 %r199, %r200, 1; setp.lt.u32 %p7, %r199, %r30; @%p7 bra $L__BB0_2; $L__BB0_9: mov.u32 %r195, %tid.y; shl.b32 %r194, %r195, 5; add.s32 %r193, %r194, %r3; shl.b32 %r133, %r193, 2; mov.u32 %r134, _ZZ9gemv_int4ILi4ELi64ELi3EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage; add.s32 %r135, %r134, %r133; setp.lt.u32 %p8, %r193, 32; @%p8 bra $L__BB0_11; add.s32 %r188, %r135, -112; st.shared.f32 [%r188], %f394; $L__BB0_11: mov.u32 %r198, %tid.y; shl.b32 %r197, %r198, 5; add.s32 %r196, %r197, %r3; setp.gt.u32 %p9, %r196, 31; bar.sync 0; mad.lo.s32 %r23, %r196, 12, %r134; @%p9 bra $L__BB0_13; mov.u32 %r150, 16; ld.shared.f32 %f335, [%r23+16]; add.ftz.f32 %f336, %f394, %f335; ld.shared.f32 %f337, [%r23+20]; add.ftz.f32 %f338, %f336, %f337; ld.shared.f32 %f339, [%r23+24]; add.ftz.f32 %f322, %f338, %f339; mov.u32 %r138, 1; mov.u32 %r151, 31; mov.u32 %r152, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f322, %r138, %r151, %r152; @p add.f32 r0, r0, %f322; mov.f32 %f320, r0;} // end inline asm mov.u32 %r141, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f320, %r141, %r151, %r152; @p add.f32 r0, r0, 
// Reduction column 0 (cont.): shfl.sync.down offsets 4, 8, 16 finish the warp sum
// into %f394 on lane 0. Then columns 1 and 2 repeat the same spill/sync/gather/
// warp-shuffle pattern at shared-memory offsets +640 and +1280 (the 1920-byte
// temp_storage is three 640-byte slabs, one per output column).
%f320; mov.f32 %f323, r0;} // end inline asm mov.u32 %r144, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f323, %r144, %r151, %r152; @p add.f32 r0, r0, %f323; mov.f32 %f326, r0;} // end inline asm mov.u32 %r147, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f326, %r147, %r151, %r152; @p add.f32 r0, r0, %f326; mov.f32 %f329, r0;} // end inline asm // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f329, %r150, %r151, %r152; @p add.f32 r0, r0, %f329; mov.f32 %f394, r0;} // end inline asm $L__BB0_13: @%p8 bra $L__BB0_15; add.s32 %r189, %r135, -112; st.shared.f32 [%r189+640], %f393; $L__BB0_15: bar.sync 0; @%p9 bra $L__BB0_17; ld.shared.f32 %f355, [%r23+656]; add.ftz.f32 %f356, %f393, %f355; ld.shared.f32 %f357, [%r23+660]; add.ftz.f32 %f358, %f356, %f357; ld.shared.f32 %f359, [%r23+664]; add.ftz.f32 %f342, %f358, %f359; mov.u32 %r154, 1; mov.u32 %r167, 31; mov.u32 %r168, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f342, %r154, %r167, %r168; @p add.f32 r0, r0, %f342; mov.f32 %f340, r0;} // end inline asm mov.u32 %r157, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f340, %r157, %r167, %r168; @p add.f32 r0, r0, %f340; mov.f32 %f343, r0;} // end inline asm mov.u32 %r160, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f343, %r160, %r167, %r168; @p add.f32 r0, r0, %f343; mov.f32 %f346, r0;} // end inline asm mov.u32 %r163, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f346, %r163, %r167, %r168; @p add.f32 r0, r0, %f346; mov.f32 %f349, r0;} // end inline asm mov.u32 %r166, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f349, %r166, %r167, %r168; @p add.f32 r0, r0, %f349; mov.f32 %f393, r0;} // end inline asm $L__BB0_17: @%p8 bra $L__BB0_19; add.s32 %r190, %r135, -112; st.shared.f32 [%r190+1280], %f392; $L__BB0_19: 
// Reduction column 2 at shared offset +1280, same pattern. Afterwards ($L__BB0_21)
// only thread (tid.x|tid.y)==0 proceeds to the epilogue: it reloads the output and
// bias pointers (param +0 / +8), scales each reduced sum by %f32 (alpha), and
// computes the three output addresses for this block row (%ctaid.x, stride %r29).
// If the bias pointer is null it branches to $L__BB0_24 (store without bias).
bar.sync 0; @%p9 bra $L__BB0_21; ld.shared.f32 %f375, [%r23+1296]; add.ftz.f32 %f376, %f392, %f375; ld.shared.f32 %f377, [%r23+1300]; add.ftz.f32 %f378, %f376, %f377; ld.shared.f32 %f379, [%r23+1304]; add.ftz.f32 %f362, %f378, %f379; mov.u32 %r170, 1; mov.u32 %r183, 31; mov.u32 %r184, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f362, %r170, %r183, %r184; @p add.f32 r0, r0, %f362; mov.f32 %f360, r0;} // end inline asm mov.u32 %r173, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f360, %r173, %r183, %r184; @p add.f32 r0, r0, %f360; mov.f32 %f363, r0;} // end inline asm mov.u32 %r176, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f363, %r176, %r183, %r184; @p add.f32 r0, r0, %f363; mov.f32 %f366, r0;} // end inline asm mov.u32 %r179, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f366, %r179, %r183, %r184; @p add.f32 r0, r0, %f366; mov.f32 %f369, r0;} // end inline asm mov.u32 %r182, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f369, %r182, %r183, %r184; @p add.f32 r0, r0, %f369; mov.f32 %f392, r0;} // end inline asm $L__BB0_21: mov.u32 %r191, %tid.y; or.b32 %r185, %r3, %r191; setp.ne.s32 %p14, %r185, 0; @%p14 bra $L__BB0_25; ld.param.u64 %rd47, [_Z27dequant_gemv_group64_batch323DequantGemvKernelParams_param_0+8]; ld.param.u64 %rd46, [_Z27dequant_gemv_group64_batch323DequantGemvKernelParams_param_0]; mov.u32 %r192, %ctaid.x; cvta.to.global.u64 %rd35, %rd46; setp.eq.s64 %p15, %rd47, 0; mul.ftz.f32 %f24, %f32, %f394; cvt.s64.s32 %rd11, %r192; mul.wide.s32 %rd36, %r192, 2; add.s64 %rd12, %rd35, %rd36; mul.ftz.f32 %f25, %f32, %f393; cvt.s64.s32 %rd13, %r29; mul.wide.s32 %rd37, %r29, 2; add.s64 %rd14, %rd12, %rd37; mul.ftz.f32 %f26, %f32, %f392; add.s32 %r186, %r29, %r192; add.s32 %r187, %r186, %r29; cvt.s64.s32 %rd15, %r187; mul.wide.s32 %rd38, %r187, 2; add.s64 %rd16, %rd35, %rd38; @%p15 bra $L__BB0_24; 
// Epilogue with bias: for each of the three outputs, out = f16(beta(%f33) * bias
// + alpha*sum), read as f16 and stored as f16 (cvt.rn.f16.f32 rounds to nearest).
// $L__BB0_24: bias-less store path — just convert the three scaled sums to f16.
// $L__BB0_25: exit. The trailing _ZN3cub11EmptyKernelIvEEvv is CUB's no-op kernel
// stub (emitted by the toolchain; intentionally empty).
ld.param.u64 %rd48, [_Z27dequant_gemv_group64_batch323DequantGemvKernelParams_param_0+8]; cvta.to.global.u64 %rd39, %rd48; shl.b64 %rd40, %rd11, 1; add.s64 %rd41, %rd39, %rd40; ld.global.u16 %rs259, [%rd41]; // begin inline asm { cvt.f32.f16 %f380, %rs259;} // end inline asm fma.rn.ftz.f32 %f381, %f33, %f380, %f24; // begin inline asm { cvt.rn.f16.f32 %rs260, %f381;} // end inline asm st.global.u16 [%rd12], %rs260; shl.b64 %rd42, %rd13, 1; add.s64 %rd43, %rd41, %rd42; ld.global.u16 %rs261, [%rd43]; // begin inline asm { cvt.f32.f16 %f382, %rs261;} // end inline asm fma.rn.ftz.f32 %f383, %f33, %f382, %f25; // begin inline asm { cvt.rn.f16.f32 %rs262, %f383;} // end inline asm st.global.u16 [%rd14], %rs262; shl.b64 %rd44, %rd15, 1; add.s64 %rd45, %rd39, %rd44; ld.global.u16 %rs263, [%rd45]; // begin inline asm { cvt.f32.f16 %f384, %rs263;} // end inline asm fma.rn.ftz.f32 %f385, %f33, %f384, %f26; // begin inline asm { cvt.rn.f16.f32 %rs264, %f385;} // end inline asm st.global.u16 [%rd16], %rs264; bra.uni $L__BB0_25; $L__BB0_24: // begin inline asm { cvt.rn.f16.f32 %rs265, %f24;} // end inline asm st.global.u16 [%rd12], %rs265; // begin inline asm { cvt.rn.f16.f32 %rs266, %f25;} // end inline asm st.global.u16 [%rd14], %rs266; // begin inline asm { cvt.rn.f16.f32 %rs267, %f26;} // end inline asm st.global.u16 [%rd16], %rs267; $L__BB0_25: ret; } // .globl _ZN3cub11EmptyKernelIvEEvv .visible .entry _ZN3cub11EmptyKernelIvEEvv() { ret; }