.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_dad502166thrust12placeholders2_4E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_dad502166thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_dad502166thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_dad502166thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_dad502166thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_dad502166thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_dad502166thrust12placeholders3_10E[1];

.visible .entry _Z27dequant_gemv_group64_batch423DequantGemvKernelParams(
    .param .align 8 .b8 _Z27dequant_gemv_group64_batch423DequantGemvKernelParams_param_0[80]
)
{
    .reg .pred %p<21>;
    .reg .b16 %rs<314>;
    .reg .f32 %f<510>;
    .reg .b32 %r<250>;
    .reg .b64 %rd<72>;
    // demoted variable
    .shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi64ELi4EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[2560];

    ld.param.v2.u32 {%r29, %r30}, [_Z27dequant_gemv_group64_batch423DequantGemvKernelParams_param_0+48];
    ld.param.v2.u32 {%r31, %r32}, [_Z27dequant_gemv_group64_batch423DequantGemvKernelParams_param_0+56];
    ld.param.v2.f32 {%f51, %f52}, [_Z27dequant_gemv_group64_batch423DequantGemvKernelParams_param_0+64];
    ld.param.v4.u8 {%rs75, %rs76, %rs77, %rs78}, [_Z27dequant_gemv_group64_batch423DequantGemvKernelParams_param_0+72];
    ld.param.u64 %rd23, [_Z27dequant_gemv_group64_batch423DequantGemvKernelParams_param_0+40];
    ld.param.u64 %rd22, [_Z27dequant_gemv_group64_batch423DequantGemvKernelParams_param_0+32];
    ld.param.u64 %rd21, [_Z27dequant_gemv_group64_batch423DequantGemvKernelParams_param_0+24];
    ld.param.u64 %rd20, [_Z27dequant_gemv_group64_batch423DequantGemvKernelParams_param_0+16];
    ld.param.u64 %rd19, [_Z27dequant_gemv_group64_batch423DequantGemvKernelParams_param_0+8];
    mov.u32 %r1, %ctaid.x;
    cvta.to.global.u64 %rd2, %rd19;
    mov.u32 %r249, %tid.y;
    shl.b32 %r33, %r249, 5;
    mov.u32 %r3, %tid.x;
    add.s32 %r248, %r33, %r3;
    shl.b32 %r247, %r248, 1;
    setp.ge.u32 %p1, %r247, %r31;
    mov.f32 %f494, 0f00000000;
    mov.f32 %f495, %f494;
    mov.f32 %f496, %f494;
    mov.f32 %f497, %f494;
    @%p1 bra $L__BB0_9;

    cvta.to.global.u64 %rd3, %rd20;
    mul.lo.s32 %r8, %r31, %r1;
    shr.u32 %r9, %r3, 2;
    mul.lo.s32 %r10, %r32, %r1;
    shl.b16 %rs2, %rs75, 3;
    mul.wide.s32 %rd5, %r29, 2;

$L__BB0_2:
    add.s32 %r36, %r247, %r8;
    mul.wide.u32 %rd30, %r36, 4;
    add.s64 %rd25, %rd21, %rd30;
    // begin inline asm
    createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd24, 1.0;
    // end inline asm
    // begin inline asm
    ld.global.L1::no_allocate.L2::cache_hint.v2.u32 {%r34,%r35}, [%rd25], %rd24;
    // end inline asm
    shl.b32 %r37, %r249, 3;
    add.s32 %r16, %r37, %r9;
    add.s32 %r17, %r16, %r10;
    mul.wide.s32 %rd31, %r17, 2;
    add.s64 %rd28, %rd23, %rd31;
    // begin inline asm
    createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd27, 1.0;
    // end inline asm
    // begin inline asm
    ld.global.L1::evict_last.L2::cache_hint.u16 %rs83, [%rd28], %rd27;
    // end inline asm
    // begin inline asm
    { cvt.f32.f16 %f57, %rs83;}
    // end inline asm
    setp.eq.s64 %p2, %rd22, 0;
    mov.u16 %rs313, %rs2;
    @%p2 bra $L__BB0_4;

    shr.u32 %r38, %r17, 31;
    add.s32 %r39, %r17, %r38;
    shr.s32 %r40, %r39, 1;
    cvt.s64.s32 %rd35, %r40;
    add.s64 %rd33, %rd22, %rd35;
    // begin inline asm
    createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd32, 1.0;
    // end inline asm
    // begin inline asm
    ld.global.L1::evict_last.L2::cache_hint.u8 %rs85, [%rd33], %rd32;
    // end inline asm
    cvt.u32.u16 %r41, %rs85;
    and.b32 %r42, %r41, 255;
    shl.b32 %r43, %r16, 2;
    and.b32 %r44, %r43, 4;
    shr.u32 %r45, %r42, %r44;
    cvt.u16.u32 %rs86, %r45;
    and.b16 %rs313, %rs86, 15;
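    // The zeros[] byte holds two 4-bit zero points; the shr/and above select
    // this group's nibble. When no zeros pointer is given, %rs313 keeps its
    // default of %rs75 << 3 (set before the branch).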
$L__BB0_4:
    shl.b32 %r18, %r248, 4;
    setp.ge.s32 %p3, %r18, %r29;
    @%p3 bra $L__BB0_8;

    setp.eq.s16 %p4, %rs75, 0;
    shr.u16 %rs88, %rs313, 3;
    and.b16 %rs89, %rs88, 1;
    setp.eq.b16 %p5, %rs89, 1;
    and.pred %p6, %p4, %p5;
    selp.b16 %rs90, -16, 0, %p6;
    or.b16 %rs91, %rs90, %rs313;
    cvt.s16.s8 %rs92, %rs91;
    cvt.rn.f32.s16 %f6, %rs92;
    mul.wide.s32 %rd36, %r18, 2;
    add.s64 %rd8, %rd3, %rd36;
    ld.global.v4.u32 {%r46, %r47, %r48, %r49}, [%rd8];
    add.s64 %rd9, %rd8, %rd5;
    ld.global.v4.u32 {%r54, %r55, %r56, %r57}, [%rd9];
    add.s32 %r62, %r18, %r29;
    add.s32 %r63, %r62, %r29;
    mul.wide.s32 %rd38, %r63, 2;
    add.s64 %rd39, %rd3, %rd38;
    ld.global.v4.u32 {%r64, %r65, %r66, %r67}, [%rd39];
    add.s32 %r72, %r63, %r29;
    mul.wide.s32 %rd40, %r72, 2;
    add.s64 %rd41, %rd3, %rd40;
    ld.global.v4.u32 {%r73, %r74, %r75, %r76}, [%rd41];
    cvt.u16.u32 %rs5, %r34;
    and.b16 %rs6, %rs5, 15;
    mov.b32 {%rs7, %rs13}, %r46;
    mov.b32 {%rs8, %rs14}, %r54;
    mov.b32 {%rs9, %rs15}, %r64;
    mov.b32 {%rs10, %rs16}, %r73;
    shr.u32 %r81, %r34, 4;
    cvt.u16.u32 %rs11, %r81;
    and.b16 %rs12, %rs11, 15;
    shr.u32 %r82, %r34, 8;
    cvt.u16.u32 %rs17, %r82;
    and.b16 %rs18, %rs17, 15;
    mov.b32 {%rs19, %rs25}, %r47;
    mov.b32 {%rs20, %rs26}, %r55;
    mov.b32 {%rs21, %rs27}, %r65;
    mov.b32 {%rs22, %rs28}, %r74;
    shr.u32 %r83, %r34, 12;
    cvt.u16.u32 %rs23, %r83;
    and.b16 %rs24, %rs23, 15;
    shr.u32 %r84, %r34, 16;
    cvt.u16.u32 %rs29, %r84;
    and.b16 %rs30, %rs29, 15;
    mov.b32 {%rs31, %rs37}, %r48;
    mov.b32 {%rs32, %rs38}, %r56;
    mov.b32 {%rs33, %rs39}, %r66;
    mov.b32 {%rs34, %rs40}, %r75;
    shr.u32 %r85, %r34, 20;
    cvt.u16.u32 %rs35, %r85;
    and.b16 %rs36, %rs35, 15;
    shr.u32 %r86, %r34, 24;
    cvt.u16.u32 %rs41, %r86;
    and.b16 %rs42, %rs41, 15;
    mov.b32 {%rs43, %rs48}, %r49;
    mov.b32 {%rs44, %rs49}, %r57;
    mov.b32 {%rs45, %rs50}, %r67;
    mov.b32 {%rs46, %rs51}, %r76;
    shr.u32 %r87, %r34, 28;
    cvt.u16.u32 %rs47, %r87;
    cvt.u16.u32 %rs52, %r35;
    and.b16 %rs53, %rs52, 15;
    shr.u32 %r88, %r35, 4;
    cvt.u16.u32 %rs54, %r88;
    and.b16 %rs55, %rs54, 15;
    shr.u32 %r89, %r35, 8;
    cvt.u16.u32 %rs56, %r89;
    and.b16 %rs57, %rs56, 15;
    shr.u32 %r90, %r35, 12;
    cvt.u16.u32 %rs58, %r90;
    and.b16 %rs59, %rs58, 15;
    shr.u32 %r91, %r35, 16;
    cvt.u16.u32 %rs60, %r91;
    and.b16 %rs61, %rs60, 15;
    shr.u32 %r92, %r35, 20;
    cvt.u16.u32 %rs62, %r92;
    and.b16 %rs63, %rs62, 15;
    shr.u32 %r93, %r35, 24;
    cvt.u16.u32 %rs64, %r93;
    and.b16 %rs65, %rs64, 15;
    shr.u32 %r94, %r35, 28;
    cvt.u16.u32 %rs66, %r94;
    add.s64 %rd43, %rd9, %rd5;
    add.s64 %rd10, %rd43, 16;
    add.s64 %rd11, %rd10, %rd5;
    @%p4 bra $L__BB0_7;
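    // Fall-through path (%rs75 != 0): the 16 nibbles of {%r34,%r35} are taken
    // as unsigned 0..15 and accumulated as scale * (q - zero) * x into the
    // four batch accumulators %f497..%f494, fully unrolled.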
    cvt.rn.f32.s16 %f122, %rs6;
    sub.ftz.f32 %f123, %f122, %f6;
    mul.ftz.f32 %f124, %f57, %f123;
    // begin inline asm
    { cvt.f32.f16 %f58, %rs7;}
    // end inline asm
    fma.rn.ftz.f32 %f125, %f124, %f58, %f497;
    // begin inline asm
    { cvt.f32.f16 %f59, %rs8;}
    // end inline asm
    fma.rn.ftz.f32 %f126, %f124, %f59, %f496;
    // begin inline asm
    { cvt.f32.f16 %f60, %rs9;}
    // end inline asm
    fma.rn.ftz.f32 %f127, %f124, %f60, %f495;
    // begin inline asm
    { cvt.f32.f16 %f61, %rs10;}
    // end inline asm
    fma.rn.ftz.f32 %f128, %f124, %f61, %f494;
    cvt.rn.f32.s16 %f129, %rs12;
    sub.ftz.f32 %f130, %f129, %f6;
    mul.ftz.f32 %f131, %f57, %f130;
    // begin inline asm
    { cvt.f32.f16 %f62, %rs13;}
    // end inline asm
    fma.rn.ftz.f32 %f132, %f131, %f62, %f125;
    // begin inline asm
    { cvt.f32.f16 %f63, %rs14;}
    // end inline asm
    fma.rn.ftz.f32 %f133, %f131, %f63, %f126;
    // begin inline asm
    { cvt.f32.f16 %f64, %rs15;}
    // end inline asm
    fma.rn.ftz.f32 %f134, %f131, %f64, %f127;
    // begin inline asm
    { cvt.f32.f16 %f65, %rs16;}
    // end inline asm
    fma.rn.ftz.f32 %f135, %f131, %f65, %f128;
    cvt.rn.f32.s16 %f136, %rs18;
    sub.ftz.f32 %f137, %f136, %f6;
    mul.ftz.f32 %f138, %f57, %f137;
    // begin inline asm
    { cvt.f32.f16 %f66, %rs19;}
    // end inline asm
    fma.rn.ftz.f32 %f139, %f138, %f66, %f132;
    // begin inline asm
    { cvt.f32.f16 %f67, %rs20;}
    // end inline asm
    fma.rn.ftz.f32 %f140, %f138, %f67, %f133;
    // begin inline asm
    { cvt.f32.f16 %f68, %rs21;}
    // end inline asm
    fma.rn.ftz.f32 %f141, %f138, %f68, %f134;
    // begin inline asm
    { cvt.f32.f16 %f69, %rs22;}
    // end inline asm
    fma.rn.ftz.f32 %f142, %f138, %f69, %f135;
    cvt.rn.f32.s16 %f143, %rs24;
    sub.ftz.f32 %f144, %f143, %f6;
    mul.ftz.f32 %f145, %f57, %f144;
    // begin inline asm
    { cvt.f32.f16 %f70, %rs25;}
    // end inline asm
    fma.rn.ftz.f32 %f146, %f145, %f70, %f139;
    // begin inline asm
    { cvt.f32.f16 %f71, %rs26;}
    // end inline asm
    fma.rn.ftz.f32 %f147, %f145, %f71, %f140;
    // begin inline asm
    { cvt.f32.f16 %f72, %rs27;}
    // end inline asm
    fma.rn.ftz.f32 %f148, %f145, %f72, %f141;
    // begin inline asm
    { cvt.f32.f16 %f73, %rs28;}
    // end inline asm
    fma.rn.ftz.f32 %f149, %f145, %f73, %f142;
    cvt.rn.f32.s16 %f150, %rs30;
    sub.ftz.f32 %f151, %f150, %f6;
    mul.ftz.f32 %f152, %f57, %f151;
    // begin inline asm
    { cvt.f32.f16 %f74, %rs31;}
    // end inline asm
    fma.rn.ftz.f32 %f153, %f152, %f74, %f146;
    // begin inline asm
    { cvt.f32.f16 %f75, %rs32;}
    // end inline asm
    fma.rn.ftz.f32 %f154, %f152, %f75, %f147;
    // begin inline asm
    { cvt.f32.f16 %f76, %rs33;}
    // end inline asm
    fma.rn.ftz.f32 %f155, %f152, %f76, %f148;
    // begin inline asm
    { cvt.f32.f16 %f77, %rs34;}
    // end inline asm
    fma.rn.ftz.f32 %f156, %f152, %f77, %f149;
    cvt.rn.f32.s16 %f157, %rs36;
    sub.ftz.f32 %f158, %f157, %f6;
    mul.ftz.f32 %f159, %f57, %f158;
    // begin inline asm
    { cvt.f32.f16 %f78, %rs37;}
    // end inline asm
    fma.rn.ftz.f32 %f160, %f159, %f78, %f153;
    // begin inline asm
    { cvt.f32.f16 %f79, %rs38;}
    // end inline asm
    fma.rn.ftz.f32 %f161, %f159, %f79, %f154;
    // begin inline asm
    { cvt.f32.f16 %f80, %rs39;}
    // end inline asm
    fma.rn.ftz.f32 %f162, %f159, %f80, %f155;
    // begin inline asm
    { cvt.f32.f16 %f81, %rs40;}
    // end inline asm
    fma.rn.ftz.f32 %f163, %f159, %f81, %f156;
    cvt.rn.f32.s16 %f164, %rs42;
    sub.ftz.f32 %f165, %f164, %f6;
    mul.ftz.f32 %f166, %f57, %f165;
    // begin inline asm
    { cvt.f32.f16 %f82, %rs43;}
    // end inline asm
    fma.rn.ftz.f32 %f167, %f166, %f82, %f160;
    // begin inline asm
    { cvt.f32.f16 %f83, %rs44;}
    // end inline asm
    fma.rn.ftz.f32 %f168, %f166, %f83, %f161;
    // begin inline asm
    { cvt.f32.f16 %f84, %rs45;}
    // end inline asm
    fma.rn.ftz.f32 %f169, %f166, %f84, %f162;
    // begin inline asm
    { cvt.f32.f16 %f85, %rs46;}
    // end inline asm
    fma.rn.ftz.f32 %f170, %f166, %f85, %f163;
    cvt.rn.f32.s16 %f171, %rs47;
    sub.ftz.f32 %f172, %f171, %f6;
    mul.ftz.f32 %f173, %f57, %f172;
    // begin inline asm
    { cvt.f32.f16 %f86, %rs48;}
    // end inline asm
    fma.rn.ftz.f32 %f174, %f173, %f86, %f167;
    // begin inline asm
    { cvt.f32.f16 %f87, %rs49;}
    // end inline asm
    fma.rn.ftz.f32 %f175, %f173, %f87, %f168;
    // begin inline asm
    { cvt.f32.f16 %f88, %rs50;}
    // end inline asm
    fma.rn.ftz.f32 %f176, %f173, %f88, %f169;
    // begin inline asm
    { cvt.f32.f16 %f89, %rs51;}
    // end inline asm
    fma.rn.ftz.f32 %f177, %f173, %f89, %f170;
    ld.global.v4.u32 {%r95, %r96, %r97, %r98}, [%rd8+16];
    ld.global.v4.u32 {%r103, %r104, %r105, %r106}, [%rd9+16];
    ld.global.v4.u32 {%r111, %r112, %r113, %r114}, [%rd10];
    ld.global.v4.u32 {%r119, %r120, %r121, %r122}, [%rd11];
    cvt.rn.f32.s16 %f178, %rs53;
    sub.ftz.f32 %f179, %f178, %f6;
    mul.ftz.f32 %f180, %f57, %f179;
    mov.b32 {%rs125, %rs129}, %r95;
    // begin inline asm
    { cvt.f32.f16 %f90, %rs125;}
    // end inline asm
    fma.rn.ftz.f32 %f181, %f180, %f90, %f174;
    mov.b32 {%rs126, %rs130}, %r103;
    // begin inline asm
    { cvt.f32.f16 %f91, %rs126;}
    // end inline asm
    fma.rn.ftz.f32 %f182, %f180, %f91, %f175;
    mov.b32 {%rs127, %rs131}, %r111;
    // begin inline asm
    { cvt.f32.f16 %f92, %rs127;}
    // end inline asm
    fma.rn.ftz.f32 %f183, %f180, %f92, %f176;
    mov.b32 {%rs128, %rs132}, %r119;
    // begin inline asm
    { cvt.f32.f16 %f93, %rs128;}
    // end inline asm
    fma.rn.ftz.f32 %f184, %f180, %f93, %f177;
    cvt.rn.f32.s16 %f185, %rs55;
    sub.ftz.f32 %f186, %f185, %f6;
    mul.ftz.f32 %f187, %f57, %f186;
    // begin inline asm
    { cvt.f32.f16 %f94, %rs129;}
    // end inline asm
    fma.rn.ftz.f32 %f188, %f187, %f94, %f181;
    // begin inline asm
    { cvt.f32.f16 %f95, %rs130;}
    // end inline asm
    fma.rn.ftz.f32 %f189, %f187, %f95, %f182;
    // begin inline asm
    { cvt.f32.f16 %f96, %rs131;}
    // end inline asm
    fma.rn.ftz.f32 %f190, %f187, %f96, %f183;
    // begin inline asm
    { cvt.f32.f16 %f97, %rs132;}
    // end inline asm
    fma.rn.ftz.f32 %f191, %f187, %f97, %f184;
    cvt.rn.f32.s16 %f192, %rs57;
    sub.ftz.f32 %f193, %f192, %f6;
    mul.ftz.f32 %f194, %f57, %f193;
    mov.b32 {%rs133, %rs137}, %r96;
    // begin inline asm
    { cvt.f32.f16 %f98, %rs133;}
    // end inline asm
    fma.rn.ftz.f32 %f195, %f194, %f98, %f188;
    mov.b32 {%rs134, %rs138}, %r104;
    // begin inline asm
    { cvt.f32.f16 %f99, %rs134;}
    // end inline asm
    fma.rn.ftz.f32 %f196, %f194, %f99, %f189;
    mov.b32 {%rs135, %rs139}, %r112;
    // begin inline asm
    { cvt.f32.f16 %f100, %rs135;}
    // end inline asm
    fma.rn.ftz.f32 %f197, %f194, %f100, %f190;
    mov.b32 {%rs136, %rs140}, %r120;
    // begin inline asm
    { cvt.f32.f16 %f101, %rs136;}
    // end inline asm
    fma.rn.ftz.f32 %f198, %f194, %f101, %f191;
    cvt.rn.f32.s16 %f199, %rs59;
    sub.ftz.f32 %f200, %f199, %f6;
    mul.ftz.f32 %f201, %f57, %f200;
    // begin inline asm
    { cvt.f32.f16 %f102, %rs137;}
    // end inline asm
    fma.rn.ftz.f32 %f202, %f201, %f102, %f195;
    // begin inline asm
    { cvt.f32.f16 %f103, %rs138;}
    // end inline asm
    fma.rn.ftz.f32 %f203, %f201, %f103, %f196;
    // begin inline asm
    { cvt.f32.f16 %f104, %rs139;}
    // end inline asm
    fma.rn.ftz.f32 %f204, %f201, %f104, %f197;
    // begin inline asm
    { cvt.f32.f16 %f105, %rs140;}
    // end inline asm
    fma.rn.ftz.f32 %f205, %f201, %f105, %f198;
    cvt.rn.f32.s16 %f206, %rs61;
    sub.ftz.f32 %f207, %f206, %f6;
    mul.ftz.f32 %f208, %f57, %f207;
    mov.b32 {%rs141, %rs145}, %r97;
    // begin inline asm
    { cvt.f32.f16 %f106, %rs141;}
    // end inline asm
    fma.rn.ftz.f32 %f209, %f208, %f106, %f202;
    mov.b32 {%rs142, %rs146}, %r105;
    // begin inline asm
    { cvt.f32.f16 %f107, %rs142;}
    // end inline asm
    fma.rn.ftz.f32 %f210, %f208, %f107, %f203;
    mov.b32 {%rs143, %rs147}, %r113;
    // begin inline asm
    { cvt.f32.f16 %f108, %rs143;}
    // end inline asm
    fma.rn.ftz.f32 %f211, %f208, %f108, %f204;
    mov.b32 {%rs144, %rs148}, %r121;
    // begin inline asm
    { cvt.f32.f16 %f109, %rs144;}
    // end inline asm
    fma.rn.ftz.f32 %f212, %f208, %f109, %f205;
    cvt.rn.f32.s16 %f213, %rs63;
    sub.ftz.f32 %f214, %f213, %f6;
    mul.ftz.f32 %f215, %f57, %f214;
    // begin inline asm
    { cvt.f32.f16 %f110, %rs145;}
    // end inline asm
    fma.rn.ftz.f32 %f216, %f215, %f110, %f209;
    // begin inline asm
    { cvt.f32.f16 %f111, %rs146;}
    // end inline asm
    fma.rn.ftz.f32 %f217, %f215, %f111, %f210;
    // begin inline asm
    { cvt.f32.f16 %f112, %rs147;}
    // end inline asm
    fma.rn.ftz.f32 %f218, %f215, %f112, %f211;
    // begin inline asm
    { cvt.f32.f16 %f113, %rs148;}
    // end inline asm
    fma.rn.ftz.f32 %f219, %f215, %f113, %f212;
    cvt.rn.f32.s16 %f220, %rs65;
    sub.ftz.f32 %f221, %f220, %f6;
    mul.ftz.f32 %f222, %f57, %f221;
    mov.b32 {%rs149, %rs153}, %r98;
    // begin inline asm
    { cvt.f32.f16 %f114, %rs149;}
    // end inline asm
    fma.rn.ftz.f32 %f223, %f222, %f114, %f216;
    mov.b32 {%rs150, %rs154}, %r106;
    // begin inline asm
    { cvt.f32.f16 %f115, %rs150;}
    // end inline asm
    fma.rn.ftz.f32 %f224, %f222, %f115, %f217;
    mov.b32 {%rs151, %rs155}, %r114;
    // begin inline asm
    { cvt.f32.f16 %f116, %rs151;}
    // end inline asm
    fma.rn.ftz.f32 %f225, %f222, %f116, %f218;
    mov.b32 {%rs152, %rs156}, %r122;
    // begin inline asm
    { cvt.f32.f16 %f117, %rs152;}
    // end inline asm
    fma.rn.ftz.f32 %f226, %f222, %f117, %f219;
    cvt.rn.f32.s16 %f227, %rs66;
    sub.ftz.f32 %f228, %f227, %f6;
    mul.ftz.f32 %f229, %f57, %f228;
    // begin inline asm
    { cvt.f32.f16 %f118, %rs153;}
    // end inline asm
    fma.rn.ftz.f32 %f497, %f229, %f118, %f223;
    // begin inline asm
    { cvt.f32.f16 %f119, %rs154;}
    // end inline asm
    fma.rn.ftz.f32 %f496, %f229, %f119, %f224;
    // begin inline asm
    { cvt.f32.f16 %f120, %rs155;}
    // end inline asm
    fma.rn.ftz.f32 %f495, %f229, %f120, %f225;
    // begin inline asm
    { cvt.f32.f16 %f121, %rs156;}
    // end inline asm
    fma.rn.ftz.f32 %f494, %f229, %f121, %f226;
    bra.uni $L__BB0_8;
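    // $L__BB0_7: same unrolled accumulation for the signed case
    // (%rs75 == 0); each nibble is first sign-extended from 4 bits
    // (shl.b16 4 / cvt.s16.s8 / shr.s16 7 / and -16 / or).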
$L__BB0_7:
    shl.b16 %rs221, %rs5, 4;
    cvt.s16.s8 %rs222, %rs221;
    shr.s16 %rs223, %rs222, 7;
    and.b16 %rs224, %rs223, -16;
    or.b16 %rs225, %rs224, %rs6;
    cvt.rn.f32.s16 %f294, %rs225;
    sub.ftz.f32 %f295, %f294, %f6;
    mul.ftz.f32 %f296, %f57, %f295;
    // begin inline asm
    { cvt.f32.f16 %f230, %rs7;}
    // end inline asm
    fma.rn.ftz.f32 %f297, %f296, %f230, %f497;
    // begin inline asm
    { cvt.f32.f16 %f231, %rs8;}
    // end inline asm
    fma.rn.ftz.f32 %f298, %f296, %f231, %f496;
    // begin inline asm
    { cvt.f32.f16 %f232, %rs9;}
    // end inline asm
    fma.rn.ftz.f32 %f299, %f296, %f232, %f495;
    // begin inline asm
    { cvt.f32.f16 %f233, %rs10;}
    // end inline asm
    fma.rn.ftz.f32 %f300, %f296, %f233, %f494;
    shl.b16 %rs226, %rs11, 4;
    cvt.s16.s8 %rs227, %rs226;
    shr.s16 %rs228, %rs227, 7;
    and.b16 %rs229, %rs228, -16;
    or.b16 %rs230, %rs229, %rs12;
    cvt.rn.f32.s16 %f301, %rs230;
    sub.ftz.f32 %f302, %f301, %f6;
    mul.ftz.f32 %f303, %f57, %f302;
    // begin inline asm
    { cvt.f32.f16 %f234, %rs13;}
    // end inline asm
    fma.rn.ftz.f32 %f304, %f303, %f234, %f297;
    // begin inline asm
    { cvt.f32.f16 %f235, %rs14;}
    // end inline asm
    fma.rn.ftz.f32 %f305, %f303, %f235, %f298;
    // begin inline asm
    { cvt.f32.f16 %f236, %rs15;}
    // end inline asm
    fma.rn.ftz.f32 %f306, %f303, %f236, %f299;
    // begin inline asm
    { cvt.f32.f16 %f237, %rs16;}
    // end inline asm
    fma.rn.ftz.f32 %f307, %f303, %f237, %f300;
    shl.b16 %rs231, %rs17, 4;
    cvt.s16.s8 %rs232, %rs231;
    shr.s16 %rs233, %rs232, 7;
    and.b16 %rs234, %rs233, -16;
    or.b16 %rs235, %rs234, %rs18;
    cvt.rn.f32.s16 %f308, %rs235;
    sub.ftz.f32 %f309, %f308, %f6;
    mul.ftz.f32 %f310, %f57, %f309;
    // begin inline asm
    { cvt.f32.f16 %f238, %rs19;}
    // end inline asm
    fma.rn.ftz.f32 %f311, %f310, %f238, %f304;
    // begin inline asm
    { cvt.f32.f16 %f239, %rs20;}
    // end inline asm
    fma.rn.ftz.f32 %f312, %f310, %f239, %f305;
    // begin inline asm
    { cvt.f32.f16 %f240, %rs21;}
    // end inline asm
    fma.rn.ftz.f32 %f313, %f310, %f240, %f306;
    // begin inline asm
    { cvt.f32.f16 %f241, %rs22;}
    // end inline asm
    fma.rn.ftz.f32 %f314, %f310, %f241, %f307;
    shl.b16 %rs236, %rs23, 4;
    cvt.s16.s8 %rs237, %rs236;
    shr.s16 %rs238, %rs237, 7;
    and.b16 %rs239, %rs238, -16;
    or.b16 %rs240, %rs239, %rs24;
    cvt.rn.f32.s16 %f315, %rs240;
    sub.ftz.f32 %f316, %f315, %f6;
    mul.ftz.f32 %f317, %f57, %f316;
    // begin inline asm
    { cvt.f32.f16 %f242, %rs25;}
    // end inline asm
    fma.rn.ftz.f32 %f318, %f317, %f242, %f311;
    // begin inline asm
    { cvt.f32.f16 %f243, %rs26;}
    // end inline asm
    fma.rn.ftz.f32 %f319, %f317, %f243, %f312;
    // begin inline asm
    { cvt.f32.f16 %f244, %rs27;}
    // end inline asm
    fma.rn.ftz.f32 %f320, %f317, %f244, %f313;
    // begin inline asm
    { cvt.f32.f16 %f245, %rs28;}
    // end inline asm
    fma.rn.ftz.f32 %f321, %f317, %f245, %f314;
    shl.b16 %rs241, %rs29, 4;
    cvt.s16.s8 %rs242, %rs241;
    shr.s16 %rs243, %rs242, 7;
    and.b16 %rs244, %rs243, -16;
    or.b16 %rs245, %rs244, %rs30;
    cvt.rn.f32.s16 %f322, %rs245;
    sub.ftz.f32 %f323, %f322, %f6;
    mul.ftz.f32 %f324, %f57, %f323;
    // begin inline asm
    { cvt.f32.f16 %f246, %rs31;}
    // end inline asm
    fma.rn.ftz.f32 %f325, %f324, %f246, %f318;
    // begin inline asm
    { cvt.f32.f16 %f247, %rs32;}
    // end inline asm
    fma.rn.ftz.f32 %f326, %f324, %f247, %f319;
    // begin inline asm
    { cvt.f32.f16 %f248, %rs33;}
    // end inline asm
    fma.rn.ftz.f32 %f327, %f324, %f248, %f320;
    // begin inline asm
    { cvt.f32.f16 %f249, %rs34;}
    // end inline asm
    fma.rn.ftz.f32 %f328, %f324, %f249, %f321;
    shl.b16 %rs246, %rs35, 4;
    cvt.s16.s8 %rs247, %rs246;
    shr.s16 %rs248, %rs247, 7;
    and.b16 %rs249, %rs248, -16;
    or.b16 %rs250, %rs249, %rs36;
    cvt.rn.f32.s16 %f329, %rs250;
    sub.ftz.f32 %f330, %f329, %f6;
    mul.ftz.f32 %f331, %f57, %f330;
    // begin inline asm
    { cvt.f32.f16 %f250, %rs37;}
    // end inline asm
    fma.rn.ftz.f32 %f332, %f331, %f250, %f325;
    // begin inline asm
    { cvt.f32.f16 %f251, %rs38;}
    // end inline asm
    fma.rn.ftz.f32 %f333, %f331, %f251, %f326;
    // begin inline asm
    { cvt.f32.f16 %f252, %rs39;}
    // end inline asm
    fma.rn.ftz.f32 %f334, %f331, %f252, %f327;
    // begin inline asm
    { cvt.f32.f16 %f253, %rs40;}
    // end inline asm
    fma.rn.ftz.f32 %f335, %f331, %f253, %f328;
    shl.b16 %rs251, %rs41, 4;
    cvt.s16.s8 %rs252, %rs251;
    shr.s16 %rs253, %rs252, 7;
    and.b16 %rs254, %rs253, -16;
    or.b16 %rs255, %rs254, %rs42;
    cvt.rn.f32.s16 %f336, %rs255;
    sub.ftz.f32 %f337, %f336, %f6;
    mul.ftz.f32 %f338, %f57, %f337;
    // begin inline asm
    { cvt.f32.f16 %f254, %rs43;}
    // end inline asm
    fma.rn.ftz.f32 %f339, %f338, %f254, %f332;
    // begin inline asm
    { cvt.f32.f16 %f255, %rs44;}
    // end inline asm
    fma.rn.ftz.f32 %f340, %f338, %f255, %f333;
    // begin inline asm
    { cvt.f32.f16 %f256, %rs45;}
    // end inline asm
    fma.rn.ftz.f32 %f341, %f338, %f256, %f334;
    // begin inline asm
    { cvt.f32.f16 %f257, %rs46;}
    // end inline asm
    fma.rn.ftz.f32 %f342, %f338, %f257, %f335;
    shl.b16 %rs256, %rs47, 4;
    cvt.s16.s8 %rs257, %rs256;
    shr.s16 %rs258, %rs257, 7;
    and.b16 %rs259, %rs258, -16;
    or.b16 %rs260, %rs259, %rs47;
    cvt.rn.f32.s16 %f343, %rs260;
    sub.ftz.f32 %f344, %f343, %f6;
    mul.ftz.f32 %f345, %f57, %f344;
    // begin inline asm
    { cvt.f32.f16 %f258, %rs48;}
    // end inline asm
    fma.rn.ftz.f32 %f346, %f345, %f258, %f339;
    // begin inline asm
    { cvt.f32.f16 %f259, %rs49;}
    // end inline asm
    fma.rn.ftz.f32 %f347, %f345, %f259, %f340;
    // begin inline asm
    { cvt.f32.f16 %f260, %rs50;}
    // end inline asm
    fma.rn.ftz.f32 %f348, %f345, %f260, %f341;
    // begin inline asm
    { cvt.f32.f16 %f261, %rs51;}
    // end inline asm
    fma.rn.ftz.f32 %f349, %f345, %f261, %f342;
    ld.global.v4.u32 {%r127, %r128, %r129, %r130}, [%rd8+16];
    ld.global.v4.u32 {%r135, %r136, %r137, %r138}, [%rd9+16];
    ld.global.v4.u32 {%r143, %r144, %r145, %r146}, [%rd10];
    ld.global.v4.u32 {%r151, %r152, %r153, %r154}, [%rd11];
    shl.b16 %rs261, %rs52, 4;
    cvt.s16.s8 %rs262, %rs261;
    shr.s16 %rs263, %rs262, 7;
    and.b16 %rs264, %rs263, -16;
    or.b16 %rs265, %rs264, %rs53;
    cvt.rn.f32.s16 %f350, %rs265;
    sub.ftz.f32 %f351, %f350, %f6;
    mul.ftz.f32 %f352, %f57, %f351;
    mov.b32 {%rs189, %rs193}, %r127;
    // begin inline asm
    { cvt.f32.f16 %f262, %rs189;}
    // end inline asm
    fma.rn.ftz.f32 %f353, %f352, %f262, %f346;
    mov.b32 {%rs190, %rs194}, %r135;
    // begin inline asm
    { cvt.f32.f16 %f263, %rs190;}
    // end inline asm
    fma.rn.ftz.f32 %f354, %f352, %f263, %f347;
    mov.b32 {%rs191, %rs195}, %r143;
    // begin inline asm
    { cvt.f32.f16 %f264, %rs191;}
    // end inline asm
    fma.rn.ftz.f32 %f355, %f352, %f264, %f348;
    mov.b32 {%rs192, %rs196}, %r151;
    // begin inline asm
    { cvt.f32.f16 %f265, %rs192;}
    // end inline asm
    fma.rn.ftz.f32 %f356, %f352, %f265, %f349;
    shl.b16 %rs266, %rs54, 4;
    cvt.s16.s8 %rs267, %rs266;
    shr.s16 %rs268, %rs267, 7;
    and.b16 %rs269, %rs268, -16;
    or.b16 %rs270, %rs269, %rs55;
    cvt.rn.f32.s16 %f357, %rs270;
    sub.ftz.f32 %f358, %f357, %f6;
    mul.ftz.f32 %f359, %f57, %f358;
    // begin inline asm
    { cvt.f32.f16 %f266, %rs193;}
    // end inline asm
    fma.rn.ftz.f32 %f360, %f359, %f266, %f353;
    // begin inline asm
    { cvt.f32.f16 %f267, %rs194;}
    // end inline asm
    fma.rn.ftz.f32 %f361, %f359, %f267, %f354;
    // begin inline asm
    { cvt.f32.f16 %f268, %rs195;}
    // end inline asm
    fma.rn.ftz.f32 %f362, %f359, %f268, %f355;
    // begin inline asm
    { cvt.f32.f16 %f269, %rs196;}
    // end inline asm
    fma.rn.ftz.f32 %f363, %f359, %f269, %f356;
    shl.b16 %rs271, %rs56, 4;
    cvt.s16.s8 %rs272, %rs271;
    shr.s16 %rs273, %rs272, 7;
    and.b16 %rs274, %rs273, -16;
    or.b16 %rs275, %rs274, %rs57;
    cvt.rn.f32.s16 %f364, %rs275;
    sub.ftz.f32 %f365, %f364, %f6;
    mul.ftz.f32 %f366, %f57, %f365;
    mov.b32 {%rs197, %rs201}, %r128;
    // begin inline asm
    { cvt.f32.f16 %f270, %rs197;}
    // end inline asm
    fma.rn.ftz.f32 %f367, %f366, %f270, %f360;
    mov.b32 {%rs198, %rs202}, %r136;
    // begin inline asm
    { cvt.f32.f16 %f271, %rs198;}
    // end inline asm
    fma.rn.ftz.f32 %f368, %f366, %f271, %f361;
    mov.b32 {%rs199, %rs203}, %r144;
    // begin inline asm
    { cvt.f32.f16 %f272, %rs199;}
    // end inline asm
    fma.rn.ftz.f32 %f369, %f366, %f272, %f362;
    mov.b32 {%rs200, %rs204}, %r152;
    // begin inline asm
    { cvt.f32.f16 %f273, %rs200;}
    // end inline asm
    fma.rn.ftz.f32 %f370, %f366, %f273, %f363;
    shl.b16 %rs276, %rs58, 4;
    cvt.s16.s8 %rs277, %rs276;
    shr.s16 %rs278, %rs277, 7;
    and.b16 %rs279, %rs278, -16;
    or.b16 %rs280, %rs279, %rs59;
    cvt.rn.f32.s16 %f371, %rs280;
    sub.ftz.f32 %f372, %f371, %f6;
    mul.ftz.f32 %f373, %f57, %f372;
    // begin inline asm
    { cvt.f32.f16 %f274, %rs201;}
    // end inline asm
    fma.rn.ftz.f32 %f374, %f373, %f274, %f367;
    // begin inline asm
    { cvt.f32.f16 %f275, %rs202;}
    // end inline asm
    fma.rn.ftz.f32 %f375, %f373, %f275, %f368;
    // begin inline asm
    { cvt.f32.f16 %f276, %rs203;}
    // end inline asm
    fma.rn.ftz.f32 %f376, %f373, %f276, %f369;
    // begin inline asm
    { cvt.f32.f16 %f277, %rs204;}
    // end inline asm
    fma.rn.ftz.f32 %f377, %f373, %f277, %f370;
    shl.b16 %rs281, %rs60, 4;
    cvt.s16.s8 %rs282, %rs281;
    shr.s16 %rs283, %rs282, 7;
    and.b16 %rs284, %rs283, -16;
    or.b16 %rs285, %rs284, %rs61;
    cvt.rn.f32.s16 %f378, %rs285;
    sub.ftz.f32 %f379, %f378, %f6;
    mul.ftz.f32 %f380, %f57, %f379;
    mov.b32 {%rs205, %rs209}, %r129;
    // begin inline asm
    { cvt.f32.f16 %f278, %rs205;}
    // end inline asm
    fma.rn.ftz.f32 %f381, %f380, %f278, %f374;
    mov.b32 {%rs206, %rs210}, %r137;
    // begin inline asm
    { cvt.f32.f16 %f279, %rs206;}
    // end inline asm
    fma.rn.ftz.f32 %f382, %f380, %f279, %f375;
    mov.b32 {%rs207, %rs211}, %r145;
    // begin inline asm
    { cvt.f32.f16 %f280, %rs207;}
    // end inline asm
    fma.rn.ftz.f32 %f383, %f380, %f280, %f376;
    mov.b32 {%rs208, %rs212}, %r153;
    // begin inline asm
    { cvt.f32.f16 %f281, %rs208;}
    // end inline asm
    fma.rn.ftz.f32 %f384, %f380, %f281, %f377;
    shl.b16 %rs286, %rs62, 4;
    cvt.s16.s8 %rs287, %rs286;
    shr.s16 %rs288, %rs287, 7;
    and.b16 %rs289, %rs288, -16;
    or.b16 %rs290, %rs289, %rs63;
    cvt.rn.f32.s16 %f385, %rs290;
    sub.ftz.f32 %f386, %f385, %f6;
    mul.ftz.f32 %f387, %f57, %f386;
    // begin inline asm
    { cvt.f32.f16 %f282, %rs209;}
    // end inline asm
    fma.rn.ftz.f32 %f388, %f387, %f282, %f381;
    // begin inline asm
    { cvt.f32.f16 %f283, %rs210;}
    // end inline asm
    fma.rn.ftz.f32 %f389, %f387, %f283, %f382;
    // begin inline asm
    { cvt.f32.f16 %f284, %rs211;}
    // end inline asm
    fma.rn.ftz.f32 %f390, %f387, %f284, %f383;
    // begin inline asm
    { cvt.f32.f16 %f285, %rs212;}
    // end inline asm
    fma.rn.ftz.f32 %f391, %f387, %f285, %f384;
    shl.b16 %rs291, %rs64, 4;
    cvt.s16.s8 %rs292, %rs291;
    shr.s16 %rs293, %rs292, 7;
    and.b16 %rs294, %rs293, -16;
    or.b16 %rs295, %rs294, %rs65;
    cvt.rn.f32.s16 %f392, %rs295;
    sub.ftz.f32 %f393, %f392, %f6;
    mul.ftz.f32 %f394, %f57, %f393;
    mov.b32 {%rs213, %rs217}, %r130;
    // begin inline asm
    { cvt.f32.f16 %f286, %rs213;}
    // end inline asm
    fma.rn.ftz.f32 %f395, %f394, %f286, %f388;
    mov.b32 {%rs214, %rs218}, %r138;
    // begin inline asm
    { cvt.f32.f16 %f287, %rs214;}
    // end inline asm
    fma.rn.ftz.f32 %f396, %f394, %f287, %f389;
    mov.b32 {%rs215, %rs219}, %r146;
    // begin inline asm
    { cvt.f32.f16 %f288, %rs215;}
    // end inline asm
    fma.rn.ftz.f32 %f397, %f394, %f288, %f390;
    mov.b32 {%rs216, %rs220}, %r154;
    // begin inline asm
    { cvt.f32.f16 %f289, %rs216;}
    // end inline asm
    fma.rn.ftz.f32 %f398, %f394, %f289, %f391;
    shl.b16 %rs296, %rs66, 4;
    cvt.s16.s8 %rs297, %rs296;
    shr.s16 %rs298, %rs297, 7;
    and.b16 %rs299, %rs298, -16;
    or.b16 %rs300, %rs299, %rs66;
    cvt.rn.f32.s16 %f399, %rs300;
    sub.ftz.f32 %f400, %f399, %f6;
    mul.ftz.f32 %f401, %f57, %f400;
    // begin inline asm
    { cvt.f32.f16 %f290, %rs217;}
    // end inline asm
    fma.rn.ftz.f32 %f497, %f401, %f290, %f395;
    // begin inline asm
    { cvt.f32.f16 %f291, %rs218;}
    // end inline asm
    fma.rn.ftz.f32 %f496, %f401, %f291, %f396;
    // begin inline asm
    { cvt.f32.f16 %f292, %rs219;}
    // end inline asm
    fma.rn.ftz.f32 %f495, %f401, %f292, %f397;
    // begin inline asm
    { cvt.f32.f16 %f293, %rs220;}
    // end inline asm
    fma.rn.ftz.f32 %f494, %f401, %f293, %f398;

$L__BB0_8:
    add.s32 %r249, %r249, 4;
    shl.b32 %r159, %r249, 5;
    add.s32 %r248, %r159, %r3;
    shl.b32 %r247, %r248, 1;
    setp.lt.u32 %p7, %r247, %r31;
    @%p7 bra $L__BB0_2;
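    // $L__BB0_9: block-level reduction, one round per batch accumulator.
    // Threads outside warp 0 spill their partial sum into temp_storage
    // (the four rounds use slices 640 bytes apart); after bar.sync,
    // warp 0 folds in three staged values and finishes with
    // shfl.sync.down at strides 1, 2, 4, 8, 16.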
$L__BB0_9:
    mov.u32 %r240, %tid.y;
    shl.b32 %r239, %r240, 5;
    add.s32 %r238, %r239, %r3;
    shl.b32 %r160, %r238, 2;
    mov.u32 %r161, _ZZ9gemv_int4ILi4ELi64ELi4EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage;
    add.s32 %r162, %r161, %r160;
    setp.lt.u32 %p8, %r238, 32;
    @%p8 bra $L__BB0_11;

    add.s32 %r234, %r162, -112;
    st.shared.f32 [%r234], %f497;

$L__BB0_11:
    mov.u32 %r243, %tid.y;
    shl.b32 %r242, %r243, 5;
    add.s32 %r241, %r242, %r3;
    setp.gt.u32 %p9, %r241, 31;
    bar.sync 0;
    mad.lo.s32 %r23, %r241, 12, %r161;
    @%p9 bra $L__BB0_13;

    mov.u32 %r177, 16;
    ld.shared.f32 %f417, [%r23+16];
    add.ftz.f32 %f418, %f497, %f417;
    ld.shared.f32 %f419, [%r23+20];
    add.ftz.f32 %f420, %f418, %f419;
    ld.shared.f32 %f421, [%r23+24];
    add.ftz.f32 %f404, %f420, %f421;
    mov.u32 %r165, 1;
    mov.u32 %r178, 31;
    mov.u32 %r179, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f404, %r165, %r178, %r179; @p add.f32 r0, r0, %f404; mov.f32 %f402, r0;}
    // end inline asm
    mov.u32 %r168, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f402, %r168, %r178, %r179; @p add.f32 r0, r0, %f402; mov.f32 %f405, r0;}
    // end inline asm
    mov.u32 %r171, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f405, %r171, %r178, %r179; @p add.f32 r0, r0, %f405; mov.f32 %f408, r0;}
    // end inline asm
    mov.u32 %r174, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f408, %r174, %r178, %r179; @p add.f32 r0, r0, %f408; mov.f32 %f411, r0;}
    // end inline asm
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f411, %r177, %r178, %r179; @p add.f32 r0, r0, %f411; mov.f32 %f497, r0;}
    // end inline asm

$L__BB0_13:
    @%p8 bra $L__BB0_15;

    add.s32 %r235, %r162, -112;
    st.shared.f32 [%r235+640], %f496;

$L__BB0_15:
    bar.sync 0;
    @%p9 bra $L__BB0_17;

    ld.shared.f32 %f437, [%r23+656];
    add.ftz.f32 %f438, %f496, %f437;
    ld.shared.f32 %f439, [%r23+660];
    add.ftz.f32 %f440, %f438, %f439;
    ld.shared.f32 %f441, [%r23+664];
    add.ftz.f32 %f424, %f440, %f441;
    mov.u32 %r181, 1;
    mov.u32 %r194, 31;
    mov.u32 %r195, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f424, %r181, %r194, %r195; @p add.f32 r0, r0, %f424; mov.f32 %f422, r0;}
    // end inline asm
    mov.u32 %r184, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f422, %r184, %r194, %r195; @p add.f32 r0, r0, %f422; mov.f32 %f425, r0;}
    // end inline asm
    mov.u32 %r187, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f425, %r187, %r194, %r195; @p add.f32 r0, r0, %f425; mov.f32 %f428, r0;}
    // end inline asm
    mov.u32 %r190, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f428, %r190, %r194, %r195; @p add.f32 r0, r0, %f428; mov.f32 %f431, r0;}
    // end inline asm
    mov.u32 %r193, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f431, %r193, %r194, %r195; @p add.f32 r0, r0, %f431; mov.f32 %f496, r0;}
    // end inline asm

$L__BB0_17:
    @%p8 bra $L__BB0_19;

    add.s32 %r236, %r162, -112;
    st.shared.f32 [%r236+1280], %f495;

$L__BB0_19:
    bar.sync 0;
    @%p9 bra $L__BB0_21;

    ld.shared.f32 %f457, [%r23+1296];
    add.ftz.f32 %f458, %f495, %f457;
    ld.shared.f32 %f459, [%r23+1300];
    add.ftz.f32 %f460, %f458, %f459;
    ld.shared.f32 %f461, [%r23+1304];
    add.ftz.f32 %f444, %f460, %f461;
    mov.u32 %r197, 1;
    mov.u32 %r210, 31;
    mov.u32 %r211, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f444, %r197, %r210, %r211; @p add.f32 r0, r0, %f444; mov.f32 %f442, r0;}
    // end inline asm
    mov.u32 %r200, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f442, %r200, %r210, %r211; @p add.f32 r0, r0, %f442; mov.f32 %f445, r0;}
    // end inline asm
    mov.u32 %r203, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f445, %r203, %r210, %r211; @p add.f32 r0, r0, %f445; mov.f32 %f448, r0;}
    // end inline asm
    mov.u32 %r206, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f448, %r206, %r210, %r211; @p add.f32 r0, r0, %f448; mov.f32 %f451, r0;}
    // end inline asm
    mov.u32 %r209, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f451, %r209, %r210, %r211; @p add.f32 r0, r0, %f451; mov.f32 %f495, r0;}
    // end inline asm

$L__BB0_21:
    @%p8 bra $L__BB0_23;

    add.s32 %r237, %r162, -112;
    st.shared.f32 [%r237+1920], %f494;

$L__BB0_23:
    bar.sync 0;
    @%p9 bra $L__BB0_25;

    ld.shared.f32 %f477, [%r23+1936];
    add.ftz.f32 %f478, %f494, %f477;
    ld.shared.f32 %f479, [%r23+1940];
    add.ftz.f32 %f480, %f478, %f479;
    ld.shared.f32 %f481, [%r23+1944];
    add.ftz.f32 %f464, %f480, %f481;
    mov.u32 %r213, 1;
    mov.u32 %r226, 31;
    mov.u32 %r227, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f464, %r213, %r226, %r227; @p add.f32 r0, r0, %f464; mov.f32 %f462, r0;}
    // end inline asm
    mov.u32 %r216, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f462, %r216, %r226, %r227; @p add.f32 r0, r0, %f462; mov.f32 %f465, r0;}
    // end inline asm
    mov.u32 %r219, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f465, %r219, %r226, %r227; @p add.f32 r0, r0, %f465; mov.f32 %f468, r0;}
    // end inline asm
    mov.u32 %r222, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f468, %r222, %r226, %r227; @p add.f32 r0, r0, %f468; mov.f32 %f471, r0;}
    // end inline asm
    mov.u32 %r225, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f471, %r225, %r226, %r227; @p add.f32 r0, r0, %f471; mov.f32 %f494, r0;}
    // end inline asm
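    // $L__BB0_25: thread (0,0) now holds the four reduced dot products.
    // Each is scaled by %f51; when the pointer at param+8 is non-null, an
    // f16 operand is loaded and fused in as %f52 * operand. Results are
    // converted back to f16 and stored at stride %r30 elements, with L2
    // cache-policy hints on the stores.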
$L__BB0_25:
    mov.u32 %r244, %tid.y;
    or.b32 %r228, %r3, %r244;
    setp.ne.s32 %p16, %r228, 0;
    @%p16 bra $L__BB0_35;

    ld.param.u64 %rd68, [_Z27dequant_gemv_group64_batch423DequantGemvKernelParams_param_0+8];
    mov.u32 %r245, %ctaid.x;
    setp.eq.s64 %p17, %rd68, 0;
    mul.ftz.f32 %f506, %f51, %f497;
    cvt.s64.s32 %rd12, %r245;
    @%p17 bra $L__BB0_28;

    shl.b64 %rd44, %rd12, 1;
    add.s64 %rd45, %rd2, %rd44;
    ld.global.u16 %rs301, [%rd45];
    // begin inline asm
    { cvt.f32.f16 %f482, %rs301;}
    // end inline asm
    fma.rn.ftz.f32 %f506, %f52, %f482, %f506;

$L__BB0_28:
    ld.param.u64 %rd69, [_Z27dequant_gemv_group64_batch423DequantGemvKernelParams_param_0];
    mov.u32 %r246, %ctaid.x;
    // begin inline asm
    { cvt.rn.f16.f32 %rs302, %f506;}
    // end inline asm
    // begin inline asm
    createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd46, 1.0;
    // end inline asm
    shl.b64 %rd49, %rd12, 1;
    add.s64 %rd47, %rd69, %rd49;
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd47], %rs302, %rd46;
    // end inline asm
    mul.ftz.f32 %f507, %f51, %f496;
    add.s32 %r229, %r30, %r246;
    cvt.s64.s32 %rd15, %r229;
    @%p17 bra $L__BB0_30;

    shl.b64 %rd50, %rd15, 1;
    add.s64 %rd51, %rd2, %rd50;
    ld.global.u16 %rs304, [%rd51];
    // begin inline asm
    { cvt.f32.f16 %f484, %rs304;}
    // end inline asm
    fma.rn.ftz.f32 %f507, %f52, %f484, %f507;

$L__BB0_30:
    mul.wide.s32 %rd55, %r30, 2;
    add.s64 %rd53, %rd47, %rd55;
    // begin inline asm
    { cvt.rn.f16.f32 %rs305, %f507;}
    // end inline asm
    // begin inline asm
    createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd52, 1.0;
    // end inline asm
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd53], %rs305, %rd52;
    // end inline asm
    mul.ftz.f32 %f508, %f51, %f495;
    cvt.u32.u64 %r230, %rd15;
    add.s32 %r231, %r230, %r30;
    cvt.s64.s32 %rd16, %r231;
    @%p17 bra $L__BB0_32;

    shl.b64 %rd56, %rd16, 1;
    add.s64 %rd57, %rd2, %rd56;
    ld.global.u16 %rs307, [%rd57];
    // begin inline asm
    { cvt.f32.f16 %f486, %rs307;}
    // end inline asm
    fma.rn.ftz.f32 %f508, %f52, %f486, %f508;

$L__BB0_32:
    ld.param.u64 %rd70, [_Z27dequant_gemv_group64_batch423DequantGemvKernelParams_param_0];
    // begin inline asm
    { cvt.rn.f16.f32 %rs308, %f508;}
    // end inline asm
    // begin inline asm
    createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd58, 1.0;
    // end inline asm
    shl.b64 %rd61, %rd16, 1;
    add.s64 %rd59, %rd70, %rd61;
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd59], %rs308, %rd58;
    // end inline asm
    mul.ftz.f32 %f509, %f51, %f494;
    cvt.u32.u64 %r232, %rd16;
    add.s32 %r233, %r232, %r30;
    cvt.s64.s32 %rd17, %r233;
    @%p17 bra $L__BB0_34;

    shl.b64 %rd62, %rd17, 1;
    add.s64 %rd63, %rd2, %rd62;
    ld.global.u16 %rs310, [%rd63];
    // begin inline asm
    { cvt.f32.f16 %f488, %rs310;}
    // end inline asm
    fma.rn.ftz.f32 %f509, %f52, %f488, %f509;

$L__BB0_34:
    ld.param.u64 %rd71, [_Z27dequant_gemv_group64_batch423DequantGemvKernelParams_param_0];
    // begin inline asm
    { cvt.rn.f16.f32 %rs311, %f509;}
    // end inline asm
    // begin inline asm
    createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd64, 1.0;
    // end inline asm
    shl.b64 %rd67, %rd17, 1;
    add.s64 %rd65, %rd71, %rd67;
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd65], %rs311, %rd64;
    // end inline asm

$L__BB0_35:
    ret;
}

    // .globl _ZN3cub11EmptyKernelIvEEvv
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
    ret;
}
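// Demangled symbols, for reference:
//   _Z27dequant_gemv_group64_batch423DequantGemvKernelParams
//       -> dequant_gemv_group64_batch4(DequantGemvKernelParams)
//   _ZZ9gemv_int4ILi4ELi64ELi4EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage
//       -> shared storage local to gemv_int4<4, 64, 4>(__half*, __half const*,
//          __half const*, unsigned int const*, unsigned char const*,
//          __half const*, int, int, int, int, float, float, bool)
//   _ZN3cub11EmptyKernelIvEEvv -> void cub::EmptyKernel<void>()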