s2_5E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_da7175d26thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_da7175d26thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_da7175d26thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_da7175d26thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_da7175d26thrust12placeholders3_10E[1];

.visible .entry _Z27dequant_gemv_group32_batch423DequantGemvKernelParams(
    .param .align 8 .b8 _Z27dequant_gemv_group32_batch423DequantGemvKernelParams_param_0[80]
)
{
    .reg .pred %p<18>;
    .reg .b16 %rs<193>;
    .reg .f32 %f<324>;
    .reg .b32 %r<167>;
    .reg .b64 %rd<52>;
    // demoted variable
    .shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi32ELi4EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[2560];

    ld.param.v2.u32 {%r24, %r25}, [_Z27dequant_gemv_group32_batch423DequantGemvKernelParams_param_0+48];
    ld.param.v2.u32 {%r26, %r27}, [_Z27dequant_gemv_group32_batch423DequantGemvKernelParams_param_0+56];
    ld.param.v2.f32 {%f41, %f42}, [_Z27dequant_gemv_group32_batch423DequantGemvKernelParams_param_0+64];
    ld.param.v4.u8 {%rs60, %rs61, %rs62, %rs63}, [_Z27dequant_gemv_group32_batch423DequantGemvKernelParams_param_0+72];
    ld.param.u64 %rd20, [_Z27dequant_gemv_group32_batch423DequantGemvKernelParams_param_0+40];
    ld.param.u64 %rd19, [_Z27dequant_gemv_group32_batch423DequantGemvKernelParams_param_0+32];
    ld.param.u64 %rd18, [_Z27dequant_gemv_group32_batch423DequantGemvKernelParams_param_0+24];
    ld.param.u64 %rd17, [_Z27dequant_gemv_group32_batch423DequantGemvKernelParams_param_0+16];
    mov.u32 %r1, %ctaid.x;
    mov.u32 %r166, %tid.y;
    shl.b32 %r28, %r166, 5;
    mov.u32 %r3, %tid.x;
    add.s32 %r165, %r28, %r3;
    setp.ge.u32 %p1, %r165, %r26;
    mov.f32 %f312, 0f00000000;
    mov.f32 %f313, %f312;
    mov.f32 %f314, %f312;
    mov.f32 %f315, %f312;
    @%p1 bra $L__BB0_9;

    cvta.to.global.u64 %rd1, %rd17;
    mul.lo.s32 %r7, %r26, %r1;
    shr.u32 %r8, %r3, 2;
    mul.lo.s32 %r9, %r27, %r1;
    shl.b16 %rs2, %rs60, 3;
    cvta.to.global.u64 %rd3, %rd19;
    cvta.to.global.u64 %rd4, %rd18;
    cvta.to.global.u64 %rd5, %rd20;

$L__BB0_2:
    add.s32 %r29, %r165, %r7;
    mul.wide.u32 %rd21, %r29, 4;
    add.s64 %rd22, %rd4, %rd21;
    ld.global.u32 %r12, [%rd22];
    shl.b32 %r30, %r166, 3;
    add.s32 %r13, %r30, %r8;
    add.s32 %r14, %r13, %r9;
    mul.wide.s32 %rd23, %r14, 2;
    add.s64 %rd24, %rd5, %rd23;
    ld.global.u16 %rs68, [%rd24];
    // begin inline asm
    { cvt.f32.f16 %f47, %rs68;}
    // end inline asm
    setp.eq.s64 %p2, %rd19, 0;
    mov.u16 %rs192, %rs2;
    @%p2 bra $L__BB0_4;

    shr.u32 %r31, %r14, 31;
    add.s32 %r32, %r14, %r31;
    shr.s32 %r33, %r32, 1;
    cvt.s64.s32 %rd25, %r33;
    add.s64 %rd26, %rd3, %rd25;
    ld.global.u8 %r34, [%rd26];
    shl.b32 %r35, %r13, 2;
    and.b32 %r36, %r35, 4;
    shr.u32 %r37, %r34, %r36;
    cvt.u16.u32 %rs69, %r37;
    and.b16 %rs192, %rs69, 15;

$L__BB0_4:
    shl.b32 %r15, %r165, 3;
    setp.ge.s32 %p3, %r15, %r24;
    @%p3 bra $L__BB0_8;

    setp.eq.s16 %p4, %rs60, 0;
    mul.wide.s32 %rd27, %r15, 2;
    add.s64 %rd28, %rd1, %rd27;
    ld.global.v4.u32 {%r38, %r39, %r40, %r41}, [%rd28];
    mul.wide.s32 %rd29, %r24, 2;
    add.s64 %rd30, %rd28, %rd29;
    ld.global.v4.u32 {%r46, %r47, %r48, %r49}, [%rd30];
    add.s32 %r54, %r24, %r15;
    add.s32 %r55, %r54, %r24;
    mul.wide.s32 %rd31, %r55, 2;
    add.s64 %rd32, %rd1, %rd31;
    ld.global.v4.u32 {%r56, %r57, %r58, %r59}, [%rd32];
    add.s32 %r64, %r55, %r24;
    mul.wide.s32 %rd33, %r64, 2;
    add.s64 %rd34, %rd1, %rd33;
    ld.global.v4.u32 {%r65, %r66, %r67, %r68}, [%rd34];
    shr.u16 %rs71, %rs192, 3;
    and.b16 %rs72, %rs71, 1;
    setp.eq.b16 %p5, %rs72, 1;
    and.pred %p6, %p4, %p5;
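    // %p6: the zero point needs sign extension when the mode flag loaded
    // from param+72 is zero (%p4) and bit 3 of the 4-bit zero point %rs192
    // is set (%p5).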
    selp.b16 %rs73, -16, 0, %p6;
    or.b16 %rs74, %rs73, %rs192;
    cvt.s16.s8 %rs75, %rs74;
    cvt.rn.f32.s16 %f6, %rs75;
    cvt.u16.u32 %rs5, %r12;
    and.b16 %rs6, %rs5, 15;
    mov.b32 {%rs7, %rs13}, %r38;
    mov.b32 {%rs8, %rs14}, %r46;
    mov.b32 {%rs9, %rs15}, %r56;
    mov.b32 {%rs10, %rs16}, %r65;
    shr.u32 %r73, %r12, 4;
    cvt.u16.u32 %rs11, %r73;
    and.b16 %rs12, %rs11, 15;
    shr.u32 %r74, %r12, 8;
    cvt.u16.u32 %rs17, %r74;
    and.b16 %rs18, %rs17, 15;
    mov.b32 {%rs19, %rs25}, %r39;
    mov.b32 {%rs20, %rs26}, %r47;
    mov.b32 {%rs21, %rs27}, %r57;
    mov.b32 {%rs22, %rs28}, %r66;
    shr.u32 %r75, %r12, 12;
    cvt.u16.u32 %rs23, %r75;
    and.b16 %rs24, %rs23, 15;
    shr.u32 %r76, %r12, 16;
    cvt.u16.u32 %rs29, %r76;
    and.b16 %rs30, %rs29, 15;
    mov.b32 {%rs31, %rs37}, %r40;
    mov.b32 {%rs32, %rs38}, %r48;
    mov.b32 {%rs33, %rs39}, %r58;
    mov.b32 {%rs34, %rs40}, %r67;
    shr.u32 %r77, %r12, 20;
    cvt.u16.u32 %rs35, %r77;
    and.b16 %rs36, %rs35, 15;
    shr.u32 %r78, %r12, 24;
    cvt.u16.u32 %rs41, %r78;
    and.b16 %rs42, %rs41, 15;
    mov.b32 {%rs43, %rs48}, %r41;
    mov.b32 {%rs44, %rs49}, %r49;
    mov.b32 {%rs45, %rs50}, %r59;
    mov.b32 {%rs46, %rs51}, %r68;
    shr.u32 %r79, %r12, 28;
    cvt.u16.u32 %rs47, %r79;
    @%p4 bra $L__BB0_7;

    cvt.rn.f32.s16 %f80, %rs6;
    sub.ftz.f32 %f81, %f80, %f6;
    mul.ftz.f32 %f82, %f47, %f81;
    // begin inline asm
    { cvt.f32.f16 %f48, %rs7;}
    // end inline asm
    fma.rn.ftz.f32 %f83, %f82, %f48, %f312;
    // begin inline asm
    { cvt.f32.f16 %f49, %rs8;}
    // end inline asm
    fma.rn.ftz.f32 %f84, %f82, %f49, %f313;
    // begin inline asm
    { cvt.f32.f16 %f50, %rs9;}
    // end inline asm
    fma.rn.ftz.f32 %f85, %f82, %f50, %f314;
    // begin inline asm
    { cvt.f32.f16 %f51, %rs10;}
    // end inline asm
    fma.rn.ftz.f32 %f86, %f82, %f51, %f315;
    cvt.rn.f32.s16 %f87, %rs12;
    sub.ftz.f32 %f88, %f87, %f6;
    mul.ftz.f32 %f89, %f47, %f88;
    // begin inline asm
    { cvt.f32.f16 %f52, %rs13;}
    // end inline asm
    fma.rn.ftz.f32 %f90, %f89, %f52, %f83;
    // begin inline asm
    { cvt.f32.f16 %f53, %rs14;}
    // end inline asm
    fma.rn.ftz.f32 %f91, %f89, %f53, %f84;
    // begin inline asm
    { cvt.f32.f16 %f54, %rs15;}
    // end inline asm
    fma.rn.ftz.f32 %f92, %f89, %f54, %f85;
    // begin inline asm
    { cvt.f32.f16 %f55, %rs16;}
    // end inline asm
    fma.rn.ftz.f32 %f93, %f89, %f55, %f86;
    cvt.rn.f32.s16 %f94, %rs18;
    sub.ftz.f32 %f95, %f94, %f6;
    mul.ftz.f32 %f96, %f47, %f95;
    // begin inline asm
    { cvt.f32.f16 %f56, %rs19;}
    // end inline asm
    fma.rn.ftz.f32 %f97, %f96, %f56, %f90;
    // begin inline asm
    { cvt.f32.f16 %f57, %rs20;}
    // end inline asm
    fma.rn.ftz.f32 %f98, %f96, %f57, %f91;
    // begin inline asm
    { cvt.f32.f16 %f58, %rs21;}
    // end inline asm
    fma.rn.ftz.f32 %f99, %f96, %f58, %f92;
    // begin inline asm
    { cvt.f32.f16 %f59, %rs22;}
    // end inline asm
    fma.rn.ftz.f32 %f100, %f96, %f59, %f93;
    cvt.rn.f32.s16 %f101, %rs24;
    sub.ftz.f32 %f102, %f101, %f6;
    mul.ftz.f32 %f103, %f47, %f102;
    // begin inline asm
    { cvt.f32.f16 %f60, %rs25;}
    // end inline asm
    fma.rn.ftz.f32 %f104, %f103, %f60, %f97;
    // begin inline asm
    { cvt.f32.f16 %f61, %rs26;}
    // end inline asm
    fma.rn.ftz.f32 %f105, %f103, %f61, %f98;
    // begin inline asm
    { cvt.f32.f16 %f62, %rs27;}
    // end inline asm
    fma.rn.ftz.f32 %f106, %f103, %f62, %f99;
    // begin inline asm
    { cvt.f32.f16 %f63, %rs28;}
    // end inline asm
    fma.rn.ftz.f32 %f107, %f103, %f63, %f100;
    cvt.rn.f32.s16 %f108, %rs30;
    sub.ftz.f32 %f109, %f108, %f6;
    mul.ftz.f32 %f110, %f47, %f109;
    // begin inline asm
    { cvt.f32.f16 %f64, %rs31;}
    // end inline asm
    fma.rn.ftz.f32 %f111, %f110, %f64, %f104;
    // begin inline asm
    { cvt.f32.f16 %f65, %rs32;}
    // end inline asm
    fma.rn.ftz.f32 %f112, %f110, %f65, %f105;
    // begin inline asm
    { cvt.f32.f16 %f66, %rs33;}
    // end inline asm
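    // The dequant/FMA pattern repeats for each remaining nibble of %r12:
    // nibble n yields w = %f47 * (n - %f6), which is multiplied into the
    // four batch activations and accumulated in %f312-%f315.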
    fma.rn.ftz.f32 %f113, %f110, %f66, %f106;
    // begin inline asm
    { cvt.f32.f16 %f67, %rs34;}
    // end inline asm
    fma.rn.ftz.f32 %f114, %f110, %f67, %f107;
    cvt.rn.f32.s16 %f115, %rs36;
    sub.ftz.f32 %f116, %f115, %f6;
    mul.ftz.f32 %f117, %f47, %f116;
    // begin inline asm
    { cvt.f32.f16 %f68, %rs37;}
    // end inline asm
    fma.rn.ftz.f32 %f118, %f117, %f68, %f111;
    // begin inline asm
    { cvt.f32.f16 %f69, %rs38;}
    // end inline asm
    fma.rn.ftz.f32 %f119, %f117, %f69, %f112;
    // begin inline asm
    { cvt.f32.f16 %f70, %rs39;}
    // end inline asm
    fma.rn.ftz.f32 %f120, %f117, %f70, %f113;
    // begin inline asm
    { cvt.f32.f16 %f71, %rs40;}
    // end inline asm
    fma.rn.ftz.f32 %f121, %f117, %f71, %f114;
    cvt.rn.f32.s16 %f122, %rs42;
    sub.ftz.f32 %f123, %f122, %f6;
    mul.ftz.f32 %f124, %f47, %f123;
    // begin inline asm
    { cvt.f32.f16 %f72, %rs43;}
    // end inline asm
    fma.rn.ftz.f32 %f125, %f124, %f72, %f118;
    // begin inline asm
    { cvt.f32.f16 %f73, %rs44;}
    // end inline asm
    fma.rn.ftz.f32 %f126, %f124, %f73, %f119;
    // begin inline asm
    { cvt.f32.f16 %f74, %rs45;}
    // end inline asm
    fma.rn.ftz.f32 %f127, %f124, %f74, %f120;
    // begin inline asm
    { cvt.f32.f16 %f75, %rs46;}
    // end inline asm
    fma.rn.ftz.f32 %f128, %f124, %f75, %f121;
    cvt.rn.f32.s16 %f129, %rs47;
    sub.ftz.f32 %f130, %f129, %f6;
    mul.ftz.f32 %f131, %f47, %f130;
    // begin inline asm
    { cvt.f32.f16 %f76, %rs48;}
    // end inline asm
    fma.rn.ftz.f32 %f312, %f131, %f76, %f125;
    // begin inline asm
    { cvt.f32.f16 %f77, %rs49;}
    // end inline asm
    fma.rn.ftz.f32 %f313, %f131, %f77, %f126;
    // begin inline asm
    { cvt.f32.f16 %f78, %rs50;}
    // end inline asm
    fma.rn.ftz.f32 %f314, %f131, %f78, %f127;
    // begin inline asm
    { cvt.f32.f16 %f79, %rs51;}
    // end inline asm
    fma.rn.ftz.f32 %f315, %f131, %f79, %f128;
    bra.uni $L__BB0_8;

$L__BB0_7:
    shl.b16 %rs140, %rs5, 4;
    cvt.s16.s8 %rs141, %rs140;
    shr.s16 %rs142, %rs141, 7;
    and.b16 %rs143, %rs142, -16;
    or.b16 %rs144, %rs143, %rs6;
    cvt.rn.f32.s16 %f164, %rs144;
    sub.ftz.f32 %f165, %f164, %f6;
    mul.ftz.f32 %f166, %f47, %f165;
    // begin inline asm
    { cvt.f32.f16 %f132, %rs7;}
    // end inline asm
    fma.rn.ftz.f32 %f167, %f166, %f132, %f312;
    // begin inline asm
    { cvt.f32.f16 %f133, %rs8;}
    // end inline asm
    fma.rn.ftz.f32 %f168, %f166, %f133, %f313;
    // begin inline asm
    { cvt.f32.f16 %f134, %rs9;}
    // end inline asm
    fma.rn.ftz.f32 %f169, %f166, %f134, %f314;
    // begin inline asm
    { cvt.f32.f16 %f135, %rs10;}
    // end inline asm
    fma.rn.ftz.f32 %f170, %f166, %f135, %f315;
    shl.b16 %rs145, %rs11, 4;
    cvt.s16.s8 %rs146, %rs145;
    shr.s16 %rs147, %rs146, 7;
    and.b16 %rs148, %rs147, -16;
    or.b16 %rs149, %rs148, %rs12;
    cvt.rn.f32.s16 %f171, %rs149;
    sub.ftz.f32 %f172, %f171, %f6;
    mul.ftz.f32 %f173, %f47, %f172;
    // begin inline asm
    { cvt.f32.f16 %f136, %rs13;}
    // end inline asm
    fma.rn.ftz.f32 %f174, %f173, %f136, %f167;
    // begin inline asm
    { cvt.f32.f16 %f137, %rs14;}
    // end inline asm
    fma.rn.ftz.f32 %f175, %f173, %f137, %f168;
    // begin inline asm
    { cvt.f32.f16 %f138, %rs15;}
    // end inline asm
    fma.rn.ftz.f32 %f176, %f173, %f138, %f169;
    // begin inline asm
    { cvt.f32.f16 %f139, %rs16;}
    // end inline asm
    fma.rn.ftz.f32 %f177, %f173, %f139, %f170;
    shl.b16 %rs150, %rs17, 4;
    cvt.s16.s8 %rs151, %rs150;
    shr.s16 %rs152, %rs151, 7;
    and.b16 %rs153, %rs152, -16;
    or.b16 %rs154, %rs153, %rs18;
    cvt.rn.f32.s16 %f178, %rs154;
    sub.ftz.f32 %f179, %f178, %f6;
    mul.ftz.f32 %f180, %f47, %f179;
    // begin inline asm
    { cvt.f32.f16 %f140, %rs19;}
    // end inline asm
    fma.rn.ftz.f32 %f181, %f180, %f140, %f174;
    // begin inline asm
    { cvt.f32.f16 %f141, %rs20;}
    // end inline asm
    fma.rn.ftz.f32 %f182, %f180, %f141, %f175;
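    // $L__BB0_7 (continued): each nibble is sign-extended to int4 via the
    // shl-4 / cvt.s16.s8 / shr-7 mask trick, then fed through the same
    // scale-and-FMA pattern as the unsigned branch.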
    // begin inline asm
    { cvt.f32.f16 %f142, %rs21;}
    // end inline asm
    fma.rn.ftz.f32 %f183, %f180, %f142, %f176;
    // begin inline asm
    { cvt.f32.f16 %f143, %rs22;}
    // end inline asm
    fma.rn.ftz.f32 %f184, %f180, %f143, %f177;
    shl.b16 %rs155, %rs23, 4;
    cvt.s16.s8 %rs156, %rs155;
    shr.s16 %rs157, %rs156, 7;
    and.b16 %rs158, %rs157, -16;
    or.b16 %rs159, %rs158, %rs24;
    cvt.rn.f32.s16 %f185, %rs159;
    sub.ftz.f32 %f186, %f185, %f6;
    mul.ftz.f32 %f187, %f47, %f186;
    // begin inline asm
    { cvt.f32.f16 %f144, %rs25;}
    // end inline asm
    fma.rn.ftz.f32 %f188, %f187, %f144, %f181;
    // begin inline asm
    { cvt.f32.f16 %f145, %rs26;}
    // end inline asm
    fma.rn.ftz.f32 %f189, %f187, %f145, %f182;
    // begin inline asm
    { cvt.f32.f16 %f146, %rs27;}
    // end inline asm
    fma.rn.ftz.f32 %f190, %f187, %f146, %f183;
    // begin inline asm
    { cvt.f32.f16 %f147, %rs28;}
    // end inline asm
    fma.rn.ftz.f32 %f191, %f187, %f147, %f184;
    shl.b16 %rs160, %rs29, 4;
    cvt.s16.s8 %rs161, %rs160;
    shr.s16 %rs162, %rs161, 7;
    and.b16 %rs163, %rs162, -16;
    or.b16 %rs164, %rs163, %rs30;
    cvt.rn.f32.s16 %f192, %rs164;
    sub.ftz.f32 %f193, %f192, %f6;
    mul.ftz.f32 %f194, %f47, %f193;
    // begin inline asm
    { cvt.f32.f16 %f148, %rs31;}
    // end inline asm
    fma.rn.ftz.f32 %f195, %f194, %f148, %f188;
    // begin inline asm
    { cvt.f32.f16 %f149, %rs32;}
    // end inline asm
    fma.rn.ftz.f32 %f196, %f194, %f149, %f189;
    // begin inline asm
    { cvt.f32.f16 %f150, %rs33;}
    // end inline asm
    fma.rn.ftz.f32 %f197, %f194, %f150, %f190;
    // begin inline asm
    { cvt.f32.f16 %f151, %rs34;}
    // end inline asm
    fma.rn.ftz.f32 %f198, %f194, %f151, %f191;
    shl.b16 %rs165, %rs35, 4;
    cvt.s16.s8 %rs166, %rs165;
    shr.s16 %rs167, %rs166, 7;
    and.b16 %rs168, %rs167, -16;
    or.b16 %rs169, %rs168, %rs36;
    cvt.rn.f32.s16 %f199, %rs169;
    sub.ftz.f32 %f200, %f199, %f6;
    mul.ftz.f32 %f201, %f47, %f200;
    // begin inline asm
    { cvt.f32.f16 %f152, %rs37;}
    // end inline asm
    fma.rn.ftz.f32 %f202, %f201, %f152, %f195;
    // begin inline asm
    { cvt.f32.f16 %f153, %rs38;}
    // end inline asm
    fma.rn.ftz.f32 %f203, %f201, %f153, %f196;
    // begin inline asm
    { cvt.f32.f16 %f154, %rs39;}
    // end inline asm
    fma.rn.ftz.f32 %f204, %f201, %f154, %f197;
    // begin inline asm
    { cvt.f32.f16 %f155, %rs40;}
    // end inline asm
    fma.rn.ftz.f32 %f205, %f201, %f155, %f198;
    shl.b16 %rs170, %rs41, 4;
    cvt.s16.s8 %rs171, %rs170;
    shr.s16 %rs172, %rs171, 7;
    and.b16 %rs173, %rs172, -16;
    or.b16 %rs174, %rs173, %rs42;
    cvt.rn.f32.s16 %f206, %rs174;
    sub.ftz.f32 %f207, %f206, %f6;
    mul.ftz.f32 %f208, %f47, %f207;
    // begin inline asm
    { cvt.f32.f16 %f156, %rs43;}
    // end inline asm
    fma.rn.ftz.f32 %f209, %f208, %f156, %f202;
    // begin inline asm
    { cvt.f32.f16 %f157, %rs44;}
    // end inline asm
    fma.rn.ftz.f32 %f210, %f208, %f157, %f203;
    // begin inline asm
    { cvt.f32.f16 %f158, %rs45;}
    // end inline asm
    fma.rn.ftz.f32 %f211, %f208, %f158, %f204;
    // begin inline asm
    { cvt.f32.f16 %f159, %rs46;}
    // end inline asm
    fma.rn.ftz.f32 %f212, %f208, %f159, %f205;
    shl.b16 %rs175, %rs47, 4;
    cvt.s16.s8 %rs176, %rs175;
    shr.s16 %rs177, %rs176, 7;
    and.b16 %rs178, %rs177, -16;
    or.b16 %rs179, %rs178, %rs47;
    cvt.rn.f32.s16 %f213, %rs179;
    sub.ftz.f32 %f214, %f213, %f6;
    mul.ftz.f32 %f215, %f47, %f214;
    // begin inline asm
    { cvt.f32.f16 %f160, %rs48;}
    // end inline asm
    fma.rn.ftz.f32 %f312, %f215, %f160, %f209;
    // begin inline asm
    { cvt.f32.f16 %f161, %rs49;}
    // end inline asm
    fma.rn.ftz.f32 %f313, %f215, %f161, %f210;
    // begin inline asm
    { cvt.f32.f16 %f162, %rs50;}
    // end inline asm
    fma.rn.ftz.f32 %f314, %f215, %f162, %f211;
    // begin inline asm
    { cvt.f32.f16 %f163, %rs51;}
    // end inline asm
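    // Final FMA of the signed branch; %f312-%f315 now hold this iteration's
    // running dot products for the four batch columns.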
    fma.rn.ftz.f32 %f315, %f215, %f163, %f212;

$L__BB0_8:
    add.s32 %r166, %r166, 4;
    shl.b32 %r80, %r166, 5;
    add.s32 %r165, %r80, %r3;
    setp.lt.u32 %p7, %r165, %r26;
    @%p7 bra $L__BB0_2;

$L__BB0_9:
    mov.u32 %r161, %tid.y;
    shl.b32 %r160, %r161, 5;
    add.s32 %r159, %r160, %r3;
    shl.b32 %r81, %r159, 2;
    mov.u32 %r82, _ZZ9gemv_int4ILi4ELi32ELi4EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage;
    add.s32 %r83, %r82, %r81;
    setp.lt.u32 %p8, %r159, 32;
    @%p8 bra $L__BB0_11;

    add.s32 %r153, %r83, -112;
    st.shared.f32 [%r153], %f312;

$L__BB0_11:
    mov.u32 %r164, %tid.y;
    shl.b32 %r163, %r164, 5;
    add.s32 %r162, %r163, %r3;
    setp.gt.u32 %p9, %r162, 31;
    bar.sync 0;
    mad.lo.s32 %r19, %r162, 12, %r82;
    @%p9 bra $L__BB0_13;

    mov.u32 %r98, 16;
    ld.shared.f32 %f231, [%r19+16];
    add.ftz.f32 %f232, %f312, %f231;
    ld.shared.f32 %f233, [%r19+20];
    add.ftz.f32 %f234, %f232, %f233;
    ld.shared.f32 %f235, [%r19+24];
    add.ftz.f32 %f218, %f234, %f235;
    mov.u32 %r86, 1;
    mov.u32 %r99, 31;
    mov.u32 %r100, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f218, %r86, %r99, %r100; @p add.f32 r0, r0, %f218; mov.f32 %f216, r0;}
    // end inline asm
    mov.u32 %r89, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f216, %r89, %r99, %r100; @p add.f32 r0, r0, %f216; mov.f32 %f219, r0;}
    // end inline asm
    mov.u32 %r92, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f219, %r92, %r99, %r100; @p add.f32 r0, r0, %f219; mov.f32 %f222, r0;}
    // end inline asm
    mov.u32 %r95, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f222, %r95, %r99, %r100; @p add.f32 r0, r0, %f222; mov.f32 %f225, r0;}
    // end inline asm
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f225, %r98, %r99, %r100; @p add.f32 r0, r0, %f225; mov.f32 %f312, r0;}
    // end inline asm

$L__BB0_13:
    @%p8 bra $L__BB0_15;

    add.s32 %r154, %r83, -112;
    st.shared.f32 [%r154+640], %f313;

$L__BB0_15:
    bar.sync 0;
    @%p9 bra $L__BB0_17;

    ld.shared.f32 %f251, [%r19+656];
    add.ftz.f32 %f252, %f313, %f251;
    ld.shared.f32 %f253, [%r19+660];
    add.ftz.f32 %f254, %f252, %f253;
    ld.shared.f32 %f255, [%r19+664];
    add.ftz.f32 %f238, %f254, %f255;
    mov.u32 %r102, 1;
    mov.u32 %r115, 31;
    mov.u32 %r116, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f238, %r102, %r115, %r116; @p add.f32 r0, r0, %f238; mov.f32 %f236, r0;}
    // end inline asm
    mov.u32 %r105, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f236, %r105, %r115, %r116; @p add.f32 r0, r0, %f236; mov.f32 %f239, r0;}
    // end inline asm
    mov.u32 %r108, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f239, %r108, %r115, %r116; @p add.f32 r0, r0, %f239; mov.f32 %f242, r0;}
    // end inline asm
    mov.u32 %r111, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f242, %r111, %r115, %r116; @p add.f32 r0, r0, %f242; mov.f32 %f245, r0;}
    // end inline asm
    mov.u32 %r114, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f245, %r114, %r115, %r116; @p add.f32 r0, r0, %f245; mov.f32 %f313, r0;}
    // end inline asm

$L__BB0_17:
    @%p8 bra $L__BB0_19;

    add.s32 %r155, %r83, -112;
    st.shared.f32 [%r155+1280], %f314;

$L__BB0_19:
    bar.sync 0;
    @%p9 bra $L__BB0_21;

    ld.shared.f32 %f271, [%r19+1296];
    add.ftz.f32 %f272, %f314, %f271;
    ld.shared.f32 %f273, [%r19+1300];
    add.ftz.f32 %f274, %f272, %f273;
    ld.shared.f32 %f275, [%r19+1304];
    add.ftz.f32 %f258, %f274, %f275;
    mov.u32 %r118, 1;
    mov.u32 %r131, 31;
    mov.u32 %r132, -1;
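    // Third of the four accumulator reductions (%f314): lanes 0-31 combine
    // the raked partial sums from shared memory, then shfl.sync.down with
    // offsets 1, 2, 4, 8, 16 folds the warp into lane 0 (the same pattern
    // cub's BlockReduce emits for the other three accumulators).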
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f258, %r118, %r131, %r132; @p add.f32 r0, r0, %f258; mov.f32 %f256, r0;}
    // end inline asm
    mov.u32 %r121, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f256, %r121, %r131, %r132; @p add.f32 r0, r0, %f256; mov.f32 %f259, r0;}
    // end inline asm
    mov.u32 %r124, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f259, %r124, %r131, %r132; @p add.f32 r0, r0, %f259; mov.f32 %f262, r0;}
    // end inline asm
    mov.u32 %r127, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f262, %r127, %r131, %r132; @p add.f32 r0, r0, %f262; mov.f32 %f265, r0;}
    // end inline asm
    mov.u32 %r130, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f265, %r130, %r131, %r132; @p add.f32 r0, r0, %f265; mov.f32 %f314, r0;}
    // end inline asm

$L__BB0_21:
    @%p8 bra $L__BB0_23;

    add.s32 %r156, %r83, -112;
    st.shared.f32 [%r156+1920], %f315;

$L__BB0_23:
    bar.sync 0;
    @%p9 bra $L__BB0_25;

    ld.shared.f32 %f291, [%r19+1936];
    add.ftz.f32 %f292, %f315, %f291;
    ld.shared.f32 %f293, [%r19+1940];
    add.ftz.f32 %f294, %f292, %f293;
    ld.shared.f32 %f295, [%r19+1944];
    add.ftz.f32 %f278, %f294, %f295;
    mov.u32 %r134, 1;
    mov.u32 %r147, 31;
    mov.u32 %r148, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f278, %r134, %r147, %r148; @p add.f32 r0, r0, %f278; mov.f32 %f276, r0;}
    // end inline asm
    mov.u32 %r137, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f276, %r137, %r147, %r148; @p add.f32 r0, r0, %f276; mov.f32 %f279, r0;}
    // end inline asm
    mov.u32 %r140, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f279, %r140, %r147, %r148; @p add.f32 r0, r0, %f279; mov.f32 %f282, r0;}
    // end inline asm
    mov.u32 %r143, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f282, %r143, %r147, %r148; @p add.f32 r0, r0, %f282; mov.f32 %f285, r0;}
    // end inline asm
    mov.u32 %r146, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f285, %r146, %r147, %r148; @p add.f32 r0, r0, %f285; mov.f32 %f315, r0;}
    // end inline asm

$L__BB0_25:
    mov.u32 %r157, %tid.y;
    or.b32 %r149, %r3, %r157;
    setp.ne.s32 %p16, %r149, 0;
    @%p16 bra $L__BB0_29;

    ld.param.u64 %rd50, [_Z27dequant_gemv_group32_batch423DequantGemvKernelParams_param_0+8];
    ld.param.u64 %rd49, [_Z27dequant_gemv_group32_batch423DequantGemvKernelParams_param_0];
    mov.u32 %r158, %ctaid.x;
    cvta.to.global.u64 %rd35, %rd49;
    setp.eq.s64 %p17, %rd50, 0;
    mul.ftz.f32 %f31, %f41, %f312;
    cvt.s64.s32 %rd7, %r158;
    mul.wide.s32 %rd36, %r158, 2;
    add.s64 %rd8, %rd35, %rd36;
    mul.ftz.f32 %f32, %f41, %f313;
    cvt.s64.s32 %rd9, %r25;
    mul.wide.s32 %rd37, %r25, 2;
    add.s64 %rd10, %rd8, %rd37;
    mul.ftz.f32 %f33, %f41, %f314;
    add.s32 %r150, %r25, %r158;
    add.s32 %r151, %r150, %r25;
    cvt.s64.s32 %rd11, %r151;
    mul.wide.s32 %rd38, %r151, 2;
    add.s64 %rd12, %rd35, %rd38;
    mul.ftz.f32 %f34, %f41, %f315;
    add.s32 %r152, %r151, %r25;
    cvt.s64.s32 %rd13, %r152;
    mul.wide.s32 %rd39, %r152, 2;
    add.s64 %rd14, %rd35, %rd39;
    @%p17 bra $L__BB0_28;

    ld.param.u64 %rd51, [_Z27dequant_gemv_group32_batch423DequantGemvKernelParams_param_0+8];
    cvta.to.global.u64 %rd40, %rd51;
    shl.b64 %rd41, %rd7, 1;
    add.s64 %rd42, %rd40, %rd41;
    ld.global.u16 %rs180, [%rd42];
    // begin inline asm
    { cvt.f32.f16 %f296, %rs180;}
    // end inline asm
    fma.rn.ftz.f32 %f297, %f42, %f296, %f31;
    // begin inline asm
    { cvt.rn.f16.f32 %rs181, %f297;}
    // end inline asm
    st.global.u16 [%rd8], %rs181;
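    // Batch columns 1-3 follow: load the residual half at the next %r25
    // stride, apply beta * residual + alpha * acc, and store.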
    shl.b64 %rd43, %rd9, 1;
    add.s64 %rd44, %rd42, %rd43;
    ld.global.u16 %rs182, [%rd44];
    // begin inline asm
    { cvt.f32.f16 %f298, %rs182;}
    // end inline asm
    fma.rn.ftz.f32 %f299, %f42, %f298, %f32;
    // begin inline asm
    { cvt.rn.f16.f32 %rs183, %f299;}
    // end inline asm
    st.global.u16 [%rd10], %rs183;
    shl.b64 %rd45, %rd11, 1;
    add.s64 %rd46, %rd40, %rd45;
    ld.global.u16 %rs184, [%rd46];
    // begin inline asm
    { cvt.f32.f16 %f300, %rs184;}
    // end inline asm
    fma.rn.ftz.f32 %f301, %f42, %f300, %f33;
    // begin inline asm
    { cvt.rn.f16.f32 %rs185, %f301;}
    // end inline asm
    st.global.u16 [%rd12], %rs185;
    shl.b64 %rd47, %rd13, 1;
    add.s64 %rd48, %rd40, %rd47;
    ld.global.u16 %rs186, [%rd48];
    // begin inline asm
    { cvt.f32.f16 %f302, %rs186;}
    // end inline asm
    fma.rn.ftz.f32 %f303, %f42, %f302, %f34;
    // begin inline asm
    { cvt.rn.f16.f32 %rs187, %f303;}
    // end inline asm
    st.global.u16 [%rd14], %rs187;
    bra.uni $L__BB0_29;

$L__BB0_28:
    // begin inline asm
    { cvt.rn.f16.f32 %rs188, %f31;}
    // end inline asm
    st.global.u16 [%rd8], %rs188;
    // begin inline asm
    { cvt.rn.f16.f32 %rs189, %f32;}
    // end inline asm
    st.global.u16 [%rd10], %rs189;
    // begin inline asm
    { cvt.rn.f16.f32 %rs190, %f33;}
    // end inline asm
    st.global.u16 [%rd12], %rs190;
    // begin inline asm
    { cvt.rn.f16.f32 %rs191, %f34;}
    // end inline asm
    st.global.u16 [%rd14], %rs191;

$L__BB0_29:
    ret;
}

// .globl _ZN3cub11EmptyKernelIvEEvv
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
    ret;
}
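For orientation, the following is a minimal CUDA sketch of the kernel this PTX plausibly corresponds to (the gemv_int4<4, 32, 4> template behind dequant_gemv_group32_batch4). The struct layout, field names, and index arithmetic are assumptions inferred from the PTX above, and the cub-style raking block reduce is replaced by a simpler shared-memory-plus-shuffle reduction; treat it as an orientation aid, not the original source.

// Hedged reconstruction; all names and index math are assumptions.
#include <cuda_fp16.h>

struct DequantGemvKernelParams {        // 80-byte kernel parameter block
    __half*              out;           // +0   output, 4 batch columns
    const __half*        residual;      // +8   optional beta input (may be null)
    const __half*        x;             // +16  activations, 4 batches of `cols` halves
    const unsigned int*  qweight;       // +24  packed int4, 8 values per u32
    const unsigned char* zeros;         // +32  packed 4-bit zero points (may be null)
    const __half*        scales;        // +40  one scale per 32-weight group
    int   cols;                         // +48  weights per row
    int   out_stride;                   // +52  halves between batch outputs
    int   words_per_row;                // +56  u32 words per weight row
    int   scales_per_row;               // +60
    float alpha, beta;                  // +64, +68
    bool  asym;                         // +72  false selects the signed-int4 path
};

__global__ void dequant_gemv_group32_batch4(DequantGemvKernelParams p) {
    __shared__ float red[4][128];       // stand-in for the 2560-byte temp_storage
    const int row  = blockIdx.x;
    const int lane = threadIdx.x;       // blockDim assumed to be (32, 4)
    float acc[4] = {0.f, 0.f, 0.f, 0.f};

    // Block-stride loop over the packed u32 words of this output row.
    for (int w = threadIdx.y * 32 + lane; w < p.words_per_row; w += 128) {
        unsigned int q = p.qweight[row * p.words_per_row + w];
        int g = w / 4;                  // 4 u32 words per 32-weight group
        float scale = __half2float(p.scales[row * p.scales_per_row + g]);
        float zp = p.asym ? 8.f : 0.f;  // default when no zero-point tensor
        if (p.zeros) {
            unsigned char z = p.zeros[(row * p.scales_per_row + g) / 2];
            int nib = (z >> ((g & 1) * 4)) & 0xF;
            zp = p.asym ? (float)nib : (float)((nib << 28) >> 28);
        }
        if (w * 8 < p.cols) {
            for (int k = 0; k < 8; ++k) {            // 8 nibbles per word
                int v = (q >> (4 * k)) & 0xF;
                if (!p.asym) v = (v << 28) >> 28;    // sign-extend int4
                float wt = scale * ((float)v - zp);
                for (int b = 0; b < 4; ++b)          // 4 batch columns
                    acc[b] += wt * __half2float(p.x[b * p.cols + w * 8 + k]);
            }
        }
    }

    // Block reduction: a plain shared-memory + shuffle reduce standing in
    // for the cub BlockReduce raking pattern visible in the PTX.
    int t = threadIdx.y * 32 + lane;
    for (int b = 0; b < 4; ++b) red[b][t] = acc[b];
    __syncthreads();
    if (threadIdx.y == 0) {
        for (int b = 0; b < 4; ++b) {
            float s = red[b][lane] + red[b][lane + 32] +
                      red[b][lane + 64] + red[b][lane + 96];
            for (int off = 16; off > 0; off >>= 1)
                s += __shfl_down_sync(0xffffffffu, s, off);
            if (lane == 0) {
                float y = p.alpha * s;
                int idx = row + b * p.out_stride;
                if (p.residual)                      // beta path is optional
                    y += p.beta * __half2float(p.residual[idx]);
                p.out[idx] = __float2half(y);
            }
        }
    }
}

The sketch preserves the observable structure of the PTX: a thread-block-stride loop over packed u32 words, one scale (and optional packed zero point) per 32-weight group, signed or unsigned nibble decoding selected by a flag, four batch accumulators, a block reduction, and an alpha/beta epilogue guarded by a null check on the residual pointer.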