.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_021679306thrust12placeholders2_2E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_021679306thrust12placeholders2_3E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_021679306thrust12placeholders2_4E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_021679306thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_021679306thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_021679306thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_021679306thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_021679306thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_021679306thrust12placeholders3_10E[1];

// kernel: dequant_gemv_group64_batch3(DequantGemvKernelParams), instantiated
// from gemv_int4<4, 64, 3> (cf. the temp_storage symbol below): GEMV over
// 4-bit weights with group size 64, three dot products accumulated per thread.
.visible .entry _Z27dequant_gemv_group64_batch323DequantGemvKernelParams(
    .param .align 8 .b8 _Z27dequant_gemv_group64_batch323DequantGemvKernelParams_param_0[80]
)
{
    .reg .pred %p<18>;
    .reg .b16 %rs<271>;
    .reg .f32 %f<409>;
    .reg .b32 %r<206>;
    .reg .b64 %rd<67>;
    // demoted variable
    .shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi64ELi3EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[1920];

    ld.param.v2.u32 {%r29, %r30}, [_Z27dequant_gemv_group64_batch323DequantGemvKernelParams_param_0+48];
    ld.param.v2.u32 {%r31, %r32}, [_Z27dequant_gemv_group64_batch323DequantGemvKernelParams_param_0+56];
    ld.param.v2.f32 {%f40, %f41}, [_Z27dequant_gemv_group64_batch323DequantGemvKernelParams_param_0+64];
    ld.param.v4.u8 {%rs67, %rs68, %rs69, %rs70}, [_Z27dequant_gemv_group64_batch323DequantGemvKernelParams_param_0+72];
    ld.param.u64 %rd21, [_Z27dequant_gemv_group64_batch323DequantGemvKernelParams_param_0+40];
    ld.param.u64 %rd20, [_Z27dequant_gemv_group64_batch323DequantGemvKernelParams_param_0+32];
    ld.param.u64 %rd19, [_Z27dequant_gemv_group64_batch323DequantGemvKernelParams_param_0+24];
    ld.param.u64 %rd18, [_Z27dequant_gemv_group64_batch323DequantGemvKernelParams_param_0+16];
    mov.u32 %r1, %ctaid.x;
    mov.u32 %r205, %tid.y;
    shl.b32 %r33, %r205, 5;
    mov.u32 %r3, %tid.x;
    add.s32 %r204, %r33, %r3;
    shl.b32 %r203, %r204, 1;
    setp.ge.u32 %p1, %r203, %r31;
    // zero the three per-thread accumulators (%f399, %f398, %f397)
    mov.f32 %f397, 0f00000000;
    mov.f32 %f398, %f397;
    mov.f32 %f399, %f397;
    @%p1 bra $L__BB0_9;

    cvta.to.global.u64 %rd3, %rd18;
    mul.lo.s32 %r8, %r31, %r1;
    shr.u32 %r9, %r3, 2;
    mul.lo.s32 %r10, %r32, %r1;
    shl.b16 %rs2, %rs67, 3;
    mul.wide.s32 %rd5, %r29, 2;

// main loop: each iteration streams two u32 words (16 packed int4 weights)
// with an evict-first L2 policy and an L1 no-allocate hint, plus the f16
// per-group scale with an evict-last hint.
$L__BB0_2:
    add.s32 %r36, %r203, %r8;
    mul.wide.u32 %rd28, %r36, 4;
    add.s64 %rd23, %rd19, %rd28;
    // begin inline asm
    createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd22, 1.0;
    // end inline asm
    // begin inline asm
    ld.global.L1::no_allocate.L2::cache_hint.v2.u32 {%r34,%r35}, [%rd23], %rd22;
    // end inline asm
    shl.b32 %r37, %r205, 3;
    add.s32 %r16, %r37, %r9;
    add.s32 %r17, %r16, %r10;
    mul.wide.s32 %rd29, %r17, 2;
    add.s64 %rd26, %rd21, %rd29;
    // begin inline asm
    createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd25, 1.0;
    // end inline asm
    // begin inline asm
    ld.global.L1::evict_last.L2::cache_hint.u16 %rs75, [%rd26], %rd25;
    // end inline asm
    // begin inline asm
    { cvt.f32.f16 %f45, %rs75;}
    // end inline asm
    setp.eq.s64 %p2, %rd20, 0;
    mov.u16 %rs270, %rs2;
    @%p2 bra $L__BB0_4;

    // zeros != nullptr: load the packed zero points (two 4-bit values per byte)
    shr.u32 %r38, %r17, 31;
    add.s32 %r39, %r17, %r38;
    shr.s32 %r40, %r39, 1;
    cvt.s64.s32 %rd33, %r40;
    add.s64 %rd31, %rd20, %rd33;
    // begin inline asm
    createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd30, 1.0;
    // end inline asm
    // begin inline asm
    ld.global.L1::evict_last.L2::cache_hint.u8 %rs77, [%rd31], %rd30;
    // end inline asm
    cvt.u32.u16 %r41, %rs77;
    and.b32 %r42, %r41, 255;
    shl.b32 %r43, %r16, 2;
    and.b32 %r44, %r43, 4;
    shr.u32 %r45, %r42, %r44;
    cvt.u16.u32 %rs78, %r45;
    and.b16 %rs270, %rs78, 15;

// %rs270 now holds this group's 4-bit zero point (or the default %rs2)
$L__BB0_4:
    shl.b32 %r18, %r204, 4;
    setp.ge.s32 %p3, %r18, %r29;
    @%p3 bra $L__BB0_8;

    setp.eq.s16 %p4, %rs67, 0;
    shr.u16 %rs80, %rs270, 3;
    and.b16 %rs81, %rs80, 1;
    setp.eq.b16 %p5, %rs81, 1;
    and.pred %p6, %p4, %p5;
    selp.b16 %rs82, -16, 0, %p6;
    or.b16 %rs83, %rs82, %rs270;
    cvt.s16.s8 %rs84, %rs83;
    cvt.rn.f32.s16 %f5, %rs84;
    mul.wide.s32 %rd34, %r18, 2;
    add.s64 %rd8, %rd3, %rd34;
    // three activation columns, %r29 elements apart
    ld.global.v4.u32 {%r46, %r47, %r48, %r49}, [%rd8];
    add.s64 %rd9, %rd8, %rd5;
    ld.global.v4.u32 {%r54, %r55, %r56, %r57}, [%rd9];
    add.s32 %r62, %r18, %r29;
    add.s32 %r63, %r62, %r29;
    mul.wide.s32 %rd36, %r63, 2;
    add.s64 %rd37, %rd3, %rd36;
    ld.global.v4.u32 {%r64, %r65, %r66, %r67}, [%rd37];
    // unpack the 16 nibbles of %r34/%r35
    cvt.u16.u32 %rs5, %r34;
    and.b16 %rs6, %rs5, 15;
    mov.b32 {%rs7, %rs12}, %r46;
    mov.b32 {%rs8, %rs13}, %r54;
    mov.b32 {%rs9, %rs14}, %r64;
    shr.u32 %r72, %r34, 4;
    cvt.u16.u32 %rs10, %r72;
    and.b16 %rs11, %rs10, 15;
    shr.u32 %r73, %r34, 8;
    cvt.u16.u32 %rs15, %r73;
    and.b16 %rs16, %rs15, 15;
    mov.b32 {%rs17, %rs22}, %r47;
    mov.b32 {%rs18, %rs23}, %r55;
    mov.b32 {%rs19, %rs24}, %r65;
    shr.u32 %r74, %r34, 12;
    cvt.u16.u32 %rs20, %r74;
    and.b16 %rs21, %rs20, 15;
    shr.u32 %r75, %r34, 16;
    cvt.u16.u32 %rs25, %r75;
    and.b16 %rs26, %rs25, 15;
    mov.b32 {%rs27, %rs32}, %r48;
    mov.b32 {%rs28, %rs33}, %r56;
    mov.b32 {%rs29, %rs34}, %r66;
    shr.u32 %r76, %r34, 20;
    cvt.u16.u32 %rs30, %r76;
    and.b16 %rs31, %rs30, 15;
    shr.u32 %r77, %r34, 24;
    cvt.u16.u32 %rs35, %r77;
    and.b16 %rs36, %rs35, 15;
    mov.b32 {%rs37, %rs41}, %r49;
    mov.b32 {%rs38, %rs42}, %r57;
    mov.b32 {%rs39, %rs43}, %r67;
    shr.u32 %r78, %r34, 28;
    cvt.u16.u32 %rs40, %r78;
    cvt.u16.u32 %rs44, %r35;
    and.b16 %rs45, %rs44, 15;
    shr.u32 %r79, %r35, 4;
    cvt.u16.u32 %rs46, %r79;
    and.b16 %rs47, %rs46, 15;
    shr.u32 %r80, %r35, 8;
    cvt.u16.u32 %rs48, %r80;
    and.b16 %rs49, %rs48, 15;
    shr.u32 %r81, %r35, 12;
    cvt.u16.u32 %rs50, %r81;
    and.b16 %rs51, %rs50, 15;
    shr.u32 %r82, %r35, 16;
    cvt.u16.u32 %rs52, %r82;
    and.b16 %rs53, %rs52, 15;
    shr.u32 %r83, %r35, 20;
    cvt.u16.u32 %rs54, %r83;
    and.b16 %rs55, %rs54, 15;
    shr.u32 %r84, %r35, 24;
    cvt.u16.u32 %rs56, %r84;
    and.b16 %rs57, %rs56, 15;
    shr.u32 %r85, %r35, 28;
    cvt.u16.u32 %rs58, %r85;
    add.s64 %rd39, %rd9, %rd5;
    add.s64 %rd10, %rd39, 16;
    // %rs67 == 0 selects the signed-int4 variant at $L__BB0_7
    @%p4 bra $L__BB0_7;

    // unsigned path: w = scale * (q - zp), FMA'd into the three accumulators
    cvt.rn.f32.s16 %f94, %rs6;
    sub.ftz.f32 %f95, %f94, %f5;
    mul.ftz.f32 %f96, %f45, %f95;
    // begin inline asm
    { cvt.f32.f16 %f46, %rs7;}
    // end inline asm
    fma.rn.ftz.f32 %f97, %f96, %f46, %f399;
    // begin inline asm
    { cvt.f32.f16 %f47, %rs8;}
    // end inline asm
    fma.rn.ftz.f32 %f98, %f96, %f47, %f398;
    // begin inline asm
    { cvt.f32.f16 %f48, %rs9;}
    // end inline asm
    fma.rn.ftz.f32 %f99, %f96, %f48, %f397;
    cvt.rn.f32.s16 %f100, %rs11;
    sub.ftz.f32 %f101, %f100, %f5;
    mul.ftz.f32 %f102, %f45, %f101;
    // begin inline asm
    { cvt.f32.f16 %f49, %rs12;}
    // end inline asm
    fma.rn.ftz.f32 %f103, %f102, %f49, %f97;
    // begin inline asm
    { cvt.f32.f16 %f50, %rs13;}
    // end inline asm
    fma.rn.ftz.f32 %f104, %f102, %f50, %f98;
    // begin inline asm
    { cvt.f32.f16 %f51, %rs14;}
    // end inline asm
    fma.rn.ftz.f32 %f105, %f102, %f51, %f99;
    cvt.rn.f32.s16 %f106, %rs16;
    sub.ftz.f32 %f107, %f106, %f5;
    mul.ftz.f32 %f108, %f45, %f107;
    // begin inline asm
    { cvt.f32.f16 %f52, %rs17;}
    // end inline asm
    fma.rn.ftz.f32 %f109, %f108, %f52, %f103;
    // begin inline asm
    { cvt.f32.f16 %f53, %rs18;}
    // end inline asm
    fma.rn.ftz.f32 %f110, %f108, %f53, %f104;
    // begin inline asm
    { cvt.f32.f16 %f54, %rs19;}
    // end inline asm
    fma.rn.ftz.f32 %f111, %f108, %f54, %f105;
    cvt.rn.f32.s16 %f112, %rs21;
    sub.ftz.f32 %f113, %f112, %f5;
    mul.ftz.f32 %f114, %f45, %f113;
    // begin inline asm
    { cvt.f32.f16 %f55, %rs22;}
    // end inline asm
    fma.rn.ftz.f32 %f115, %f114, %f55, %f109;
    // begin inline asm
    { cvt.f32.f16 %f56, %rs23;}
    // end inline asm
    fma.rn.ftz.f32 %f116, %f114, %f56, %f110;
    // begin inline asm
    { cvt.f32.f16 %f57, %rs24;}
    // end inline asm
    fma.rn.ftz.f32 %f117, %f114, %f57, %f111;
    cvt.rn.f32.s16 %f118, %rs26;
    sub.ftz.f32 %f119, %f118, %f5;
    mul.ftz.f32 %f120, %f45, %f119;
    // begin inline asm
    { cvt.f32.f16 %f58, %rs27;}
    // end inline asm
    fma.rn.ftz.f32 %f121, %f120, %f58, %f115;
    // begin inline asm
    { cvt.f32.f16 %f59, %rs28;}
    // end inline asm
    fma.rn.ftz.f32 %f122, %f120, %f59, %f116;
    // begin inline asm
    { cvt.f32.f16 %f60, %rs29;}
    // end inline asm
    fma.rn.ftz.f32 %f123, %f120, %f60, %f117;
    cvt.rn.f32.s16 %f124, %rs31;
    sub.ftz.f32 %f125, %f124, %f5;
    mul.ftz.f32 %f126, %f45, %f125;
    // begin inline asm
    { cvt.f32.f16 %f61, %rs32;}
    // end inline asm
    fma.rn.ftz.f32 %f127, %f126, %f61, %f121;
    // begin inline asm
    { cvt.f32.f16 %f62, %rs33;}
    // end inline asm
    fma.rn.ftz.f32 %f128, %f126, %f62, %f122;
    // begin inline asm
    { cvt.f32.f16 %f63, %rs34;}
    // end inline asm
    fma.rn.ftz.f32 %f129, %f126, %f63, %f123;
    cvt.rn.f32.s16 %f130, %rs36;
    sub.ftz.f32 %f131, %f130, %f5;
    mul.ftz.f32 %f132, %f45, %f131;
    // begin inline asm
    { cvt.f32.f16 %f64, %rs37;}
    // end inline asm
    fma.rn.ftz.f32 %f133, %f132, %f64, %f127;
    // begin inline asm
    { cvt.f32.f16 %f65, %rs38;}
    // end inline asm
    fma.rn.ftz.f32 %f134, %f132, %f65, %f128;
    // begin inline asm
    { cvt.f32.f16 %f66, %rs39;}
    // end inline asm
    fma.rn.ftz.f32 %f135, %f132, %f66, %f129;
    cvt.rn.f32.s16 %f136, %rs40;
    sub.ftz.f32 %f137, %f136, %f5;
    mul.ftz.f32 %f138, %f45, %f137;
    // begin inline asm
    { cvt.f32.f16 %f67, %rs41;}
    // end inline asm
    fma.rn.ftz.f32 %f139, %f138, %f67, %f133;
    // begin inline asm
    { cvt.f32.f16 %f68, %rs42;}
    // end inline asm
    fma.rn.ftz.f32 %f140, %f138, %f68, %f134;
    // begin inline asm
    { cvt.f32.f16 %f69, %rs43;}
    // end inline asm
    fma.rn.ftz.f32 %f141, %f138, %f69, %f135;
    ld.global.v4.u32 {%r86, %r87, %r88, %r89}, [%rd8+16];
    ld.global.v4.u32 {%r94, %r95, %r96, %r97}, [%rd9+16];
    ld.global.v4.u32 {%r102, %r103, %r104, %r105}, [%rd10];
    cvt.rn.f32.s16 %f142, %rs45;
    sub.ftz.f32 %f143, %f142, %f5;
    mul.ftz.f32 %f144, %f45, %f143;
    mov.b32 {%rs109, %rs112}, %r86;
    // begin inline asm
    { cvt.f32.f16 %f70, %rs109;}
    // end inline asm
    fma.rn.ftz.f32 %f145, %f144, %f70, %f139;
    mov.b32 {%rs110, %rs113}, %r94;
    // begin inline asm
    { cvt.f32.f16 %f71, %rs110;}
    // end inline asm
    fma.rn.ftz.f32 %f146, %f144, %f71, %f140;
    mov.b32 {%rs111, %rs114}, %r102;
    // begin inline asm
    { cvt.f32.f16 %f72, %rs111;}
    // end inline asm
    fma.rn.ftz.f32 %f147, %f144, %f72, %f141;
    cvt.rn.f32.s16 %f148, %rs47;
    sub.ftz.f32 %f149, %f148, %f5;
    mul.ftz.f32 %f150, %f45, %f149;
    // begin inline asm
    { cvt.f32.f16 %f73, %rs112;}
    // end inline asm
    fma.rn.ftz.f32 %f151, %f150, %f73, %f145;
    // begin inline asm
    { cvt.f32.f16 %f74, %rs113;}
    // end inline asm
    fma.rn.ftz.f32 %f152, %f150, %f74, %f146;
    // begin inline asm
    { cvt.f32.f16 %f75, %rs114;}
    // end inline asm
    fma.rn.ftz.f32 %f153, %f150, %f75, %f147;
    cvt.rn.f32.s16 %f154, %rs49;
    sub.ftz.f32 %f155, %f154, %f5;
    mul.ftz.f32 %f156, %f45, %f155;
    mov.b32 {%rs115, %rs118}, %r87;
    // begin inline asm
    { cvt.f32.f16 %f76, %rs115;}
    // end inline asm
    fma.rn.ftz.f32 %f157, %f156, %f76, %f151;
    mov.b32 {%rs116, %rs119}, %r95;
    // begin inline asm
    { cvt.f32.f16 %f77, %rs116;}
    // end inline asm
    fma.rn.ftz.f32 %f158, %f156, %f77, %f152;
    mov.b32 {%rs117, %rs120}, %r103;
    // begin inline asm
    { cvt.f32.f16 %f78, %rs117;}
    // end inline asm
    fma.rn.ftz.f32 %f159, %f156, %f78, %f153;
    cvt.rn.f32.s16 %f160, %rs51;
    sub.ftz.f32 %f161, %f160, %f5;
    mul.ftz.f32 %f162, %f45, %f161;
    // begin inline asm
    { cvt.f32.f16 %f79, %rs118;}
    // end inline asm
    fma.rn.ftz.f32 %f163, %f162, %f79, %f157;
    // begin inline asm
    { cvt.f32.f16 %f80, %rs119;}
    // end inline asm
    fma.rn.ftz.f32 %f164, %f162, %f80, %f158;
    // begin inline asm
    { cvt.f32.f16 %f81, %rs120;}
    // end inline asm
    fma.rn.ftz.f32 %f165, %f162, %f81, %f159;
    cvt.rn.f32.s16 %f166, %rs53;
    sub.ftz.f32 %f167, %f166, %f5;
    mul.ftz.f32 %f168, %f45, %f167;
    mov.b32 {%rs121, %rs124}, %r88;
    // begin inline asm
    { cvt.f32.f16 %f82, %rs121;}
    // end inline asm
    fma.rn.ftz.f32 %f169, %f168, %f82, %f163;
    mov.b32 {%rs122, %rs125}, %r96;
    // begin inline asm
    { cvt.f32.f16 %f83, %rs122;}
    // end inline asm
    fma.rn.ftz.f32 %f170, %f168, %f83, %f164;
    mov.b32 {%rs123, %rs126}, %r104;
    // begin inline asm
    { cvt.f32.f16 %f84, %rs123;}
    // end inline asm
    fma.rn.ftz.f32 %f171, %f168, %f84, %f165;
    cvt.rn.f32.s16 %f172, %rs55;
    sub.ftz.f32 %f173, %f172, %f5;
    mul.ftz.f32 %f174, %f45, %f173;
    // begin inline asm
    { cvt.f32.f16 %f85, %rs124;}
    // end inline asm
    fma.rn.ftz.f32 %f175, %f174, %f85, %f169;
    // begin inline asm
    { cvt.f32.f16 %f86, %rs125;}
    // end inline asm
    fma.rn.ftz.f32 %f176, %f174, %f86, %f170;
    // begin inline asm
    { cvt.f32.f16 %f87, %rs126;}
    // end inline asm
    fma.rn.ftz.f32 %f177, %f174, %f87, %f171;
    cvt.rn.f32.s16 %f178, %rs57;
    sub.ftz.f32 %f179, %f178, %f5;
    mul.ftz.f32 %f180, %f45, %f179;
    mov.b32 {%rs127, %rs130}, %r89;
    // begin inline asm
    { cvt.f32.f16 %f88, %rs127;}
    // end inline asm
    fma.rn.ftz.f32 %f181, %f180, %f88, %f175;
    mov.b32 {%rs128, %rs131}, %r97;
    // begin inline asm
    { cvt.f32.f16 %f89, %rs128;}
    // end inline asm
    fma.rn.ftz.f32 %f182, %f180, %f89, %f176;
    mov.b32 {%rs129, %rs132}, %r105;
    // begin inline asm
    { cvt.f32.f16 %f90, %rs129;}
    // end inline asm
    fma.rn.ftz.f32 %f183, %f180, %f90, %f177;
    cvt.rn.f32.s16 %f184, %rs58;
    sub.ftz.f32 %f185, %f184, %f5;
    mul.ftz.f32 %f186, %f45, %f185;
    // begin inline asm
    { cvt.f32.f16 %f91, %rs130;}
    // end inline asm
    fma.rn.ftz.f32 %f399, %f186, %f91, %f181;
    // begin inline asm
    { cvt.f32.f16 %f92, %rs131;}
    // end inline asm
    fma.rn.ftz.f32 %f398, %f186, %f92, %f182;
    // begin inline asm
    { cvt.f32.f16 %f93, %rs132;}
    // end inline asm
    fma.rn.ftz.f32 %f397, %f186, %f93, %f183;
    bra.uni $L__BB0_8;

// signed-int4 path: sign-extend each nibble (shl 4, s8 convert, arithmetic
// shift, or) before the same (q - zp) * scale dequant and FMA accumulation
$L__BB0_7:
    shl.b16 %rs181, %rs5, 4;
    cvt.s16.s8 %rs182, %rs181;
    shr.s16 %rs183, %rs182, 7;
    and.b16 %rs184, %rs183, -16;
    or.b16 %rs185, %rs184, %rs6;
    cvt.rn.f32.s16 %f235, %rs185;
    sub.ftz.f32 %f236, %f235, %f5;
    mul.ftz.f32 %f237, %f45, %f236;
    // begin inline asm
    { cvt.f32.f16 %f187, %rs7;}
    // end inline asm
    fma.rn.ftz.f32 %f238, %f237, %f187, %f399;
    // begin inline asm
    { cvt.f32.f16 %f188, %rs8;}
    // end inline asm
    fma.rn.ftz.f32 %f239, %f237, %f188, %f398;
    // begin inline asm
    { cvt.f32.f16 %f189, %rs9;}
    // end inline asm
    fma.rn.ftz.f32 %f240, %f237, %f189, %f397;
    shl.b16 %rs186, %rs10, 4;
    cvt.s16.s8 %rs187, %rs186;
    shr.s16 %rs188, %rs187, 7;
    and.b16 %rs189, %rs188, -16;
    or.b16 %rs190, %rs189, %rs11;
    cvt.rn.f32.s16 %f241, %rs190;
    sub.ftz.f32 %f242, %f241, %f5;
    mul.ftz.f32 %f243, %f45, %f242;
    // begin inline asm
    { cvt.f32.f16 %f190, %rs12;}
    // end inline asm
    fma.rn.ftz.f32 %f244, %f243, %f190, %f238;
    // begin inline asm
    { cvt.f32.f16 %f191, %rs13;}
    // end inline asm
    fma.rn.ftz.f32 %f245, %f243, %f191, %f239;
    // begin inline asm
    { cvt.f32.f16 %f192, %rs14;}
    // end inline asm
    fma.rn.ftz.f32 %f246, %f243, %f192, %f240;
    shl.b16 %rs191, %rs15, 4;
    cvt.s16.s8 %rs192, %rs191;
    shr.s16 %rs193, %rs192, 7;
    and.b16 %rs194, %rs193, -16;
    or.b16 %rs195, %rs194, %rs16;
    cvt.rn.f32.s16 %f247, %rs195;
    sub.ftz.f32 %f248, %f247, %f5;
    mul.ftz.f32 %f249, %f45, %f248;
    // begin inline asm
    { cvt.f32.f16 %f193, %rs17;}
    // end inline asm
    fma.rn.ftz.f32 %f250, %f249, %f193, %f244;
    // begin inline asm
    { cvt.f32.f16 %f194, %rs18;}
    // end inline asm
    fma.rn.ftz.f32 %f251, %f249, %f194, %f245;
    // begin inline asm
    { cvt.f32.f16 %f195, %rs19;}
    // end inline asm
    fma.rn.ftz.f32 %f252, %f249, %f195, %f246;
    shl.b16 %rs196, %rs20, 4;
    cvt.s16.s8 %rs197, %rs196;
    shr.s16 %rs198, %rs197, 7;
    and.b16 %rs199, %rs198, -16;
    or.b16 %rs200, %rs199, %rs21;
    cvt.rn.f32.s16 %f253, %rs200;
    sub.ftz.f32 %f254, %f253, %f5;
    mul.ftz.f32 %f255, %f45, %f254;
    // begin inline asm
    { cvt.f32.f16 %f196, %rs22;}
    // end inline asm
    fma.rn.ftz.f32 %f256, %f255, %f196, %f250;
    // begin inline asm
    { cvt.f32.f16 %f197, %rs23;}
    // end inline asm
    fma.rn.ftz.f32 %f257, %f255, %f197, %f251;
    // begin inline asm
    { cvt.f32.f16 %f198, %rs24;}
    // end inline asm
    fma.rn.ftz.f32 %f258, %f255, %f198, %f252;
    shl.b16 %rs201, %rs25, 4;
    cvt.s16.s8 %rs202, %rs201;
    shr.s16 %rs203, %rs202, 7;
    and.b16 %rs204, %rs203, -16;
    or.b16 %rs205, %rs204, %rs26;
    cvt.rn.f32.s16 %f259, %rs205;
    sub.ftz.f32 %f260, %f259, %f5;
    mul.ftz.f32 %f261, %f45, %f260;
    // begin inline asm
    { cvt.f32.f16 %f199, %rs27;}
    // end inline asm
    fma.rn.ftz.f32 %f262, %f261, %f199, %f256;
    // begin inline asm
    { cvt.f32.f16 %f200, %rs28;}
    // end inline asm
    fma.rn.ftz.f32 %f263, %f261, %f200, %f257;
    // begin inline asm
    { cvt.f32.f16 %f201, %rs29;}
    // end inline asm
    fma.rn.ftz.f32 %f264, %f261, %f201, %f258;
    shl.b16 %rs206, %rs30, 4;
    cvt.s16.s8 %rs207, %rs206;
    shr.s16 %rs208, %rs207, 7;
    and.b16 %rs209, %rs208, -16;
    or.b16 %rs210, %rs209, %rs31;
    cvt.rn.f32.s16 %f265, %rs210;
    sub.ftz.f32 %f266, %f265, %f5;
    mul.ftz.f32 %f267, %f45, %f266;
    // begin inline asm
    { cvt.f32.f16 %f202, %rs32;}
    // end inline asm
    fma.rn.ftz.f32 %f268, %f267, %f202, %f262;
    // begin inline asm
    { cvt.f32.f16 %f203, %rs33;}
    // end inline asm
    fma.rn.ftz.f32 %f269, %f267, %f203, %f263;
    // begin inline asm
    { cvt.f32.f16 %f204, %rs34;}
    // end inline asm
    fma.rn.ftz.f32 %f270, %f267, %f204, %f264;
    shl.b16 %rs211, %rs35, 4;
    cvt.s16.s8 %rs212, %rs211;
    shr.s16 %rs213, %rs212, 7;
    and.b16 %rs214, %rs213, -16;
    or.b16 %rs215, %rs214, %rs36;
    cvt.rn.f32.s16 %f271, %rs215;
    sub.ftz.f32 %f272, %f271, %f5;
    mul.ftz.f32 %f273, %f45, %f272;
    // begin inline asm
    { cvt.f32.f16 %f205, %rs37;}
    // end inline asm
    fma.rn.ftz.f32 %f274, %f273, %f205, %f268;
    // begin inline asm
    { cvt.f32.f16 %f206, %rs38;}
    // end inline asm
    fma.rn.ftz.f32 %f275, %f273, %f206, %f269;
    // begin inline asm
    { cvt.f32.f16 %f207, %rs39;}
    // end inline asm
    fma.rn.ftz.f32 %f276, %f273, %f207, %f270;
    shl.b16 %rs216, %rs40, 4;
    cvt.s16.s8 %rs217, %rs216;
    shr.s16 %rs218, %rs217, 7;
    and.b16 %rs219, %rs218, -16;
    or.b16 %rs220, %rs219, %rs40;
    cvt.rn.f32.s16 %f277, %rs220;
    sub.ftz.f32 %f278, %f277, %f5;
    mul.ftz.f32 %f279, %f45, %f278;
    // begin inline asm
    { cvt.f32.f16 %f208, %rs41;}
    // end inline asm
    fma.rn.ftz.f32 %f280, %f279, %f208, %f274;
    // begin inline asm
    { cvt.f32.f16 %f209, %rs42;}
    // end inline asm
    fma.rn.ftz.f32 %f281, %f279, %f209, %f275;
    // begin inline asm
    { cvt.f32.f16 %f210, %rs43;}
    // end inline asm
    fma.rn.ftz.f32 %f282, %f279, %f210, %f276;
    ld.global.v4.u32 {%r110, %r111, %r112, %r113}, [%rd8+16];
    ld.global.v4.u32 {%r118, %r119, %r120, %r121}, [%rd9+16];
    ld.global.v4.u32 {%r126, %r127, %r128, %r129}, [%rd10];
    shl.b16 %rs221, %rs44, 4;
    cvt.s16.s8 %rs222, %rs221;
    shr.s16 %rs223, %rs222, 7;
    and.b16 %rs224, %rs223, -16;
    or.b16 %rs225, %rs224, %rs45;
    cvt.rn.f32.s16 %f283, %rs225;
    sub.ftz.f32 %f284, %f283, %f5;
    mul.ftz.f32 %f285, %f45, %f284;
    mov.b32 {%rs157, %rs160}, %r110;
    // begin inline asm
    { cvt.f32.f16 %f211, %rs157;}
    // end inline asm
    fma.rn.ftz.f32 %f286, %f285, %f211, %f280;
    mov.b32 {%rs158, %rs161}, %r118;
    // begin inline asm
    { cvt.f32.f16 %f212, %rs158;}
    // end inline asm
    fma.rn.ftz.f32 %f287, %f285, %f212, %f281;
    mov.b32 {%rs159, %rs162}, %r126;
    // begin inline asm
    { cvt.f32.f16 %f213, %rs159;}
    // end inline asm
    fma.rn.ftz.f32 %f288, %f285, %f213, %f282;
    shl.b16 %rs226, %rs46, 4;
    cvt.s16.s8 %rs227, %rs226;
    shr.s16 %rs228, %rs227, 7;
    and.b16 %rs229, %rs228, -16;
    or.b16 %rs230, %rs229, %rs47;
    cvt.rn.f32.s16 %f289, %rs230;
    sub.ftz.f32 %f290, %f289, %f5;
    mul.ftz.f32 %f291, %f45, %f290;
    // begin inline asm
    { cvt.f32.f16 %f214, %rs160;}
    // end inline asm
    fma.rn.ftz.f32 %f292, %f291, %f214, %f286;
    // begin inline asm
    { cvt.f32.f16 %f215, %rs161;}
    // end inline asm
    fma.rn.ftz.f32 %f293, %f291, %f215, %f287;
    // begin inline asm
    { cvt.f32.f16 %f216, %rs162;}
    // end inline asm
    fma.rn.ftz.f32 %f294, %f291, %f216, %f288;
    shl.b16 %rs231, %rs48, 4;
    cvt.s16.s8 %rs232, %rs231;
    shr.s16 %rs233, %rs232, 7;
    and.b16 %rs234, %rs233, -16;
    or.b16 %rs235, %rs234, %rs49;
    cvt.rn.f32.s16 %f295, %rs235;
    sub.ftz.f32 %f296, %f295, %f5;
    mul.ftz.f32 %f297, %f45, %f296;
    mov.b32 {%rs163, %rs166}, %r111;
    // begin inline asm
    { cvt.f32.f16 %f217, %rs163;}
    // end inline asm
    fma.rn.ftz.f32 %f298, %f297, %f217, %f292;
    mov.b32 {%rs164, %rs167}, %r119;
    // begin inline asm
    { cvt.f32.f16 %f218, %rs164;}
    // end inline asm
    fma.rn.ftz.f32 %f299, %f297, %f218, %f293;
    mov.b32 {%rs165, %rs168}, %r127;
    // begin inline asm
    { cvt.f32.f16 %f219, %rs165;}
    // end inline asm
    fma.rn.ftz.f32 %f300, %f297, %f219, %f294;
    shl.b16 %rs236, %rs50, 4;
    cvt.s16.s8 %rs237, %rs236;
    shr.s16 %rs238, %rs237, 7;
    and.b16 %rs239, %rs238, -16;
    or.b16 %rs240, %rs239, %rs51;
    cvt.rn.f32.s16 %f301, %rs240;
    sub.ftz.f32 %f302, %f301, %f5;
    mul.ftz.f32 %f303, %f45, %f302;
    // begin inline asm
    { cvt.f32.f16 %f220, %rs166;}
    // end inline asm
    fma.rn.ftz.f32 %f304, %f303, %f220, %f298;
    // begin inline asm
    { cvt.f32.f16 %f221, %rs167;}
    // end inline asm
    fma.rn.ftz.f32 %f305, %f303, %f221, %f299;
    // begin inline asm
    { cvt.f32.f16 %f222, %rs168;}
    // end inline asm
    fma.rn.ftz.f32 %f306, %f303, %f222, %f300;
    shl.b16 %rs241, %rs52, 4;
    cvt.s16.s8 %rs242, %rs241;
    shr.s16 %rs243, %rs242, 7;
    and.b16 %rs244, %rs243, -16;
    or.b16 %rs245, %rs244, %rs53;
    cvt.rn.f32.s16 %f307, %rs245;
    sub.ftz.f32 %f308, %f307, %f5;
    mul.ftz.f32 %f309, %f45, %f308;
    mov.b32 {%rs169, %rs172}, %r112;
    // begin inline asm
    { cvt.f32.f16 %f223, %rs169;}
    // end inline asm
    fma.rn.ftz.f32 %f310, %f309, %f223, %f304;
    mov.b32 {%rs170, %rs173}, %r120;
    // begin inline asm
    { cvt.f32.f16 %f224, %rs170;}
    // end inline asm
    fma.rn.ftz.f32 %f311, %f309, %f224, %f305;
    mov.b32 {%rs171, %rs174}, %r128;
    // begin inline asm
    { cvt.f32.f16 %f225, %rs171;}
    // end inline asm
    fma.rn.ftz.f32 %f312, %f309, %f225, %f306;
    shl.b16 %rs246, %rs54, 4;
    cvt.s16.s8 %rs247, %rs246;
    shr.s16 %rs248, %rs247, 7;
    and.b16 %rs249, %rs248, -16;
    or.b16 %rs250, %rs249, %rs55;
    cvt.rn.f32.s16 %f313, %rs250;
    sub.ftz.f32 %f314, %f313, %f5;
    mul.ftz.f32 %f315, %f45, %f314;
    // begin inline asm
    { cvt.f32.f16 %f226, %rs172;}
    // end inline asm
    fma.rn.ftz.f32 %f316, %f315, %f226, %f310;
    // begin inline asm
    { cvt.f32.f16 %f227, %rs173;}
    // end inline asm
    fma.rn.ftz.f32 %f317, %f315, %f227, %f311;
    // begin inline asm
    { cvt.f32.f16 %f228, %rs174;}
    // end inline asm
    fma.rn.ftz.f32 %f318, %f315, %f228, %f312;
    shl.b16 %rs251, %rs56, 4;
    cvt.s16.s8 %rs252, %rs251;
    shr.s16 %rs253, %rs252, 7;
    and.b16 %rs254, %rs253, -16;
    or.b16 %rs255, %rs254, %rs57;
    cvt.rn.f32.s16 %f319, %rs255;
    sub.ftz.f32 %f320, %f319, %f5;
    mul.ftz.f32 %f321, %f45, %f320;
    mov.b32 {%rs175, %rs178}, %r113;
    // begin inline asm
    { cvt.f32.f16 %f229, %rs175;}
    // end inline asm
    fma.rn.ftz.f32 %f322, %f321, %f229, %f316;
    mov.b32 {%rs176, %rs179}, %r121;
    // begin inline asm
    { cvt.f32.f16 %f230, %rs176;}
    // end inline asm
    fma.rn.ftz.f32 %f323, %f321, %f230, %f317;
    mov.b32 {%rs177, %rs180}, %r129;
    // begin inline asm
    { cvt.f32.f16 %f231, %rs177;}
    // end inline asm
    fma.rn.ftz.f32 %f324, %f321, %f231, %f318;
    shl.b16 %rs256, %rs58, 4;
    cvt.s16.s8 %rs257, %rs256;
    shr.s16 %rs258, %rs257, 7;
    and.b16 %rs259, %rs258, -16;
    or.b16 %rs260, %rs259, %rs58;
    cvt.rn.f32.s16 %f325, %rs260;
    sub.ftz.f32 %f326, %f325, %f5;
    mul.ftz.f32 %f327, %f45, %f326;
    // begin inline asm
    { cvt.f32.f16 %f232, %rs178;}
    // end inline asm
    fma.rn.ftz.f32 %f399, %f327, %f232, %f322;
    // begin inline asm
    { cvt.f32.f16 %f233, %rs179;}
    // end inline asm
    fma.rn.ftz.f32 %f398, %f327, %f233, %f323;
    // begin inline asm
    { cvt.f32.f16 %f234, %rs180;}
    // end inline asm
    fma.rn.ftz.f32 %f397, %f327, %f234, %f324;

// advance the column window and loop while in range
$L__BB0_8:
    add.s32 %r205, %r205, 4;
    shl.b32 %r134, %r205, 5;
    add.s32 %r204, %r134, %r3;
    shl.b32 %r203, %r204, 1;
    setp.lt.u32 %p7, %r203, %r31;
    @%p7 bra $L__BB0_2;

// cross-warp reduction: warps 1..3 stage their lane partials in shared
// memory; warp 0 folds them in and finishes with shuffles
$L__BB0_9:
    mov.u32 %r199, %tid.y;
    shl.b32 %r198, %r199, 5;
    add.s32 %r197, %r198, %r3;
    shl.b32 %r135, %r197, 2;
    mov.u32 %r136, _ZZ9gemv_int4ILi4ELi64ELi3EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage;
    add.s32 %r137, %r136, %r135;
    setp.lt.u32 %p8, %r197, 32;
    @%p8 bra $L__BB0_11;

    add.s32 %r191, %r137, -112;
    st.shared.f32 [%r191], %f399;

$L__BB0_11:
    mov.u32 %r202, %tid.y;
    shl.b32 %r201, %r202, 5;
    add.s32 %r200, %r201, %r3;
    setp.gt.u32 %p9, %r200, 31;
    bar.sync 0;
    mad.lo.s32 %r23, %r200, 12, %r136;
    @%p9 bra $L__BB0_13;

    // warp reduction via predicated shfl.sync.down, offsets 1, 2, 4, 8, 16
    mov.u32 %r152, 16;
    ld.shared.f32 %f343, [%r23+16];
    add.ftz.f32 %f344, %f399, %f343;
    ld.shared.f32 %f345, [%r23+20];
    add.ftz.f32 %f346, %f344, %f345;
    ld.shared.f32 %f347, [%r23+24];
    add.ftz.f32 %f330, %f346, %f347;
    mov.u32 %r140, 1;
    mov.u32 %r153, 31;
    mov.u32 %r154, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f330, %r140, %r153, %r154; @p add.f32 r0, r0, %f330; mov.f32 %f328, r0;}
    // end inline asm
    mov.u32 %r143, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f328, %r143, %r153, %r154; @p add.f32 r0, r0, %f328; mov.f32 %f331, r0;}
    // end inline asm
    mov.u32 %r146, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f331, %r146, %r153, %r154; @p add.f32 r0, r0, %f331; mov.f32 %f334, r0;}
    // end inline asm
    mov.u32 %r149, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f334, %r149, %r153, %r154; @p add.f32 r0, r0, %f334; mov.f32 %f337, r0;}
    // end inline asm
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f337, %r152, %r153, %r154; @p add.f32 r0, r0, %f337; mov.f32 %f399, r0;}
    // end inline asm

$L__BB0_13:
    @%p8 bra $L__BB0_15;

    add.s32 %r192, %r137, -112;
    st.shared.f32 [%r192+640], %f398;

$L__BB0_15:
    bar.sync 0;
    @%p9 bra $L__BB0_17;

    ld.shared.f32 %f363, [%r23+656];
    add.ftz.f32 %f364, %f398, %f363;
    ld.shared.f32 %f365, [%r23+660];
    add.ftz.f32 %f366, %f364, %f365;
    ld.shared.f32 %f367, [%r23+664];
    add.ftz.f32 %f350, %f366, %f367;
    mov.u32 %r156, 1;
    mov.u32 %r169, 31;
    mov.u32 %r170, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f350, %r156, %r169, %r170; @p add.f32 r0, r0, %f350; mov.f32 %f348, r0;}
    // end inline asm
    mov.u32 %r159, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f348, %r159, %r169, %r170; @p add.f32 r0, r0, %f348; mov.f32 %f351, r0;}
    // end inline asm
    mov.u32 %r162, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f351, %r162, %r169, %r170; @p add.f32 r0, r0, %f351; mov.f32 %f354, r0;}
    // end inline asm
    mov.u32 %r165, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f354, %r165, %r169, %r170; @p add.f32 r0, r0, %f354; mov.f32 %f357, r0;}
    // end inline asm
    mov.u32 %r168, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f357, %r168, %r169, %r170; @p add.f32 r0, r0, %f357; mov.f32 %f398, r0;}
    // end inline asm

$L__BB0_17:
    @%p8 bra $L__BB0_19;

    add.s32 %r193, %r137, -112;
    st.shared.f32 [%r193+1280], %f397;

$L__BB0_19:
    bar.sync 0;
    @%p9 bra $L__BB0_21;

    ld.shared.f32 %f383, [%r23+1296];
    add.ftz.f32 %f384, %f397, %f383;
    ld.shared.f32 %f385, [%r23+1300];
    add.ftz.f32 %f386, %f384, %f385;
    ld.shared.f32 %f387, [%r23+1304];
    add.ftz.f32 %f370, %f386, %f387;
    mov.u32 %r172, 1;
    mov.u32 %r185, 31;
    mov.u32 %r186, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f370, %r172, %r185, %r186; @p add.f32 r0, r0, %f370; mov.f32 %f368, r0;}
    // end inline asm
    mov.u32 %r175, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f368, %r175, %r185, %r186; @p add.f32 r0, r0, %f368; mov.f32 %f371, r0;}
    // end inline asm
    mov.u32 %r178, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f371, %r178, %r185, %r186; @p add.f32 r0, r0, %f371; mov.f32 %f374, r0;}
    // end inline asm
    mov.u32 %r181, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f374, %r181, %r185, %r186; @p add.f32 r0, r0, %f374; mov.f32 %f377, r0;}
    // end inline asm
    mov.u32 %r184, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f377, %r184, %r185, %r186; @p add.f32 r0, r0, %f377; mov.f32 %f397, r0;}
    // end inline asm

// epilogue: only thread (0,0) writes the three outputs
$L__BB0_21:
    mov.u32 %r194, %tid.y;
    or.b32 %r187, %r3, %r194;
    setp.ne.s32 %p14, %r187, 0;
    @%p14 bra $L__BB0_29;

    // y0 = alpha * acc0, with an optional beta * bias[ctaid.x] FMA
    ld.param.u64 %rd58, [_Z27dequant_gemv_group64_batch323DequantGemvKernelParams_param_0+8];
    mov.u32 %r195, %ctaid.x;
    setp.eq.s64 %p15, %rd58, 0;
    mul.ftz.f32 %f406, %f40, %f399;
    cvt.s64.s32 %rd11, %r195;
    @%p15 bra $L__BB0_24;

    ld.param.u64 %rd62, [_Z27dequant_gemv_group64_batch323DequantGemvKernelParams_param_0+8];
    cvta.to.global.u64 %rd61, %rd62;
    shl.b64 %rd40, %rd11, 1;
    add.s64 %rd41, %rd61, %rd40;
    ld.global.u16 %rs261, [%rd41];
    // begin inline asm
    { cvt.f32.f16 %f388, %rs261;}
    // end inline asm
    fma.rn.ftz.f32 %f406, %f41, %f388, %f406;

$L__BB0_24:
    ld.param.u64 %rd59, [_Z27dequant_gemv_group64_batch323DequantGemvKernelParams_param_0];
    mov.u32 %r196, %ctaid.x;
    // begin inline asm
    { cvt.rn.f16.f32 %rs262, %f406;}
    // end inline asm
    // begin inline asm
    createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd42, 1.0;
    // end inline asm
    shl.b64 %rd45, %rd11, 1;
    add.s64 %rd43, %rd59, %rd45;
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd43], %rs262, %rd42;
    // end inline asm
    mul.ftz.f32 %f407, %f40, %f398;
    add.s32 %r188, %r30, %r196;
    cvt.s64.s32 %rd14, %r188;
    @%p15 bra $L__BB0_26;
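    // (the second and third outputs below repeat the same epilogue pattern:
    // optional beta * bias FMA, f32 -> f16 convert, cache-hinted store,
    // with each output index offset by a further %r30.)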
    ld.param.u64 %rd64, [_Z27dequant_gemv_group64_batch323DequantGemvKernelParams_param_0+8];
    cvta.to.global.u64 %rd63, %rd64;
    shl.b64 %rd46, %rd14, 1;
    add.s64 %rd47, %rd63, %rd46;
    ld.global.u16 %rs264, [%rd47];
    // begin inline asm
    { cvt.f32.f16 %f390, %rs264;}
    // end inline asm
    fma.rn.ftz.f32 %f407, %f41, %f390, %f407;

$L__BB0_26:
    mul.wide.s32 %rd51, %r30, 2;
    add.s64 %rd49, %rd43, %rd51;
    // begin inline asm
    { cvt.rn.f16.f32 %rs265, %f407;}
    // end inline asm
    // begin inline asm
    createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd48, 1.0;
    // end inline asm
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd49], %rs265, %rd48;
    // end inline asm
    mul.ftz.f32 %f408, %f40, %f397;
    cvt.u32.u64 %r189, %rd14;
    add.s32 %r190, %r189, %r30;
    cvt.s64.s32 %rd15, %r190;
    @%p15 bra $L__BB0_28;

    ld.param.u64 %rd66, [_Z27dequant_gemv_group64_batch323DequantGemvKernelParams_param_0+8];
    cvta.to.global.u64 %rd65, %rd66;
    shl.b64 %rd52, %rd15, 1;
    add.s64 %rd53, %rd65, %rd52;
    ld.global.u16 %rs267, [%rd53];
    // begin inline asm
    { cvt.f32.f16 %f392, %rs267;}
    // end inline asm
    fma.rn.ftz.f32 %f408, %f41, %f392, %f408;

$L__BB0_28:
    ld.param.u64 %rd60, [_Z27dequant_gemv_group64_batch323DequantGemvKernelParams_param_0];
    // begin inline asm
    { cvt.rn.f16.f32 %rs268, %f408;}
    // end inline asm
    // begin inline asm
    createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd54, 1.0;
    // end inline asm
    shl.b64 %rd57, %rd15, 1;
    add.s64 %rd55, %rd60, %rd57;
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd55], %rs268, %rd54;
    // end inline asm

$L__BB0_29:
    ret;
}

// .globl _ZN3cub11EmptyKernelIvEEvv
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
    ret;
}
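The listing above is dense, so here is a minimal CUDA-level sketch of the pattern it compiles from. It is written for illustration only, not recovered source: the names dequant_dot8 and warp_reduce_sum and their signatures are assumptions. It shows the two nibble-decoding variants visible at $L__BB0_7 and its fall-through sibling, the (q - zp) * scale dequantization feeding FMAs against fp16 activations, and the predicated shfl.sync.down warp reduction.

#include <cuda_fp16.h>
#include <stdint.h>

// One thread's contribution: unpack eight 4-bit weights from a 32-bit word,
// dequantize each as (q - zero_point) * scale, and accumulate against
// eight fp16 activations.
__device__ float dequant_dot8(uint32_t packed,   // eight packed int4 weights
                              const __half* x,   // fp16 activations
                              float scale,       // per-group scale
                              int zp,            // per-group zero point
                              bool is_signed)    // sign-extend nibbles?
{
    float acc = 0.0f;
    #pragma unroll
    for (int i = 0; i < 8; ++i) {
        int q = (packed >> (4 * i)) & 0xF;       // extract nibble i
        if (is_signed)
            q = (int)(int8_t)(q << 4) >> 4;      // mirrors the shl.b16/cvt.s16.s8/shr sequence
        float w = scale * (float)(q - zp);       // dequantize
        acc = fmaf(w, __half2float(x[i]), acc);  // fused multiply-add
    }
    return acc;
}

// Warp-level sum matching the predicated shfl.sync.down.b32 blocks
// (offsets 1, 2, 4, 8, 16); lane 0 ends up holding the warp total.
__device__ float warp_reduce_sum(float v)
{
    #pragma unroll
    for (int off = 1; off < 32; off <<= 1)
        v += __shfl_down_sync(0xFFFFFFFFu, v, off);
    return v;
}

In the epilogue of the PTX, %f40 and %f41 play the roles of alpha and beta: each output is alpha * acc, plus beta * bias[row] only when the bias pointer (param offset +8) is non-null, then converted to __half and stored with an evict-last/evict-unchanged L2 policy.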