.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf3b3ebd6thrust6system6detail10sequential3seqE[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf3b3ebd6thrust12placeholders2_1E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf3b3ebd6thrust12placeholders2_2E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf3b3ebd6thrust12placeholders2_3E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf3b3ebd6thrust12placeholders2_4E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf3b3ebd6thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf3b3ebd6thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf3b3ebd6thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf3b3ebd6thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf3b3ebd6thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf3b3ebd6thrust12placeholders3_10E[1];

.visible .entry _Z27dequant_gemv_group32_batch623DequantGemvKernelParams(
    .param .align 8 .b8 _Z27dequant_gemv_group32_batch623DequantGemvKernelParams_param_0[80]
)
{
    .reg .pred %p<27>;
    .reg .b16 %rs<249>;
    .reg .f32 %f<472>;
    .reg .b32 %r<226>;
    .reg .b64 %rd<83>;
    // demoted variable
    .shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi32ELi6EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[3840];

    ld.param.v2.u32 {%r26, %r27}, [_Z27dequant_gemv_group32_batch623DequantGemvKernelParams_param_0+48];
    ld.param.v2.u32 {%r28, %r29}, [_Z27dequant_gemv_group32_batch623DequantGemvKernelParams_param_0+56];
    ld.param.v2.f32 {%f73, %f74}, [_Z27dequant_gemv_group32_batch623DequantGemvKernelParams_param_0+64];
    ld.param.v4.u8 {%rs76, %rs77, %rs78, %rs79}, [_Z27dequant_gemv_group32_batch623DequantGemvKernelParams_param_0+72];
    ld.param.u64 %rd25, [_Z27dequant_gemv_group32_batch623DequantGemvKernelParams_param_0+40];
    ld.param.u64 %rd24, [_Z27dequant_gemv_group32_batch623DequantGemvKernelParams_param_0+32];
    ld.param.u64 %rd23, [_Z27dequant_gemv_group32_batch623DequantGemvKernelParams_param_0+24];
    ld.param.u64 %rd22, [_Z27dequant_gemv_group32_batch623DequantGemvKernelParams_param_0+16];
    ld.param.u64 %rd21, [_Z27dequant_gemv_group32_batch623DequantGemvKernelParams_param_0+8];
    mov.u32 %r1, %ctaid.x;
    cvta.to.global.u64 %rd2, %rd21;
    mov.u32 %r225, %tid.y;
    shl.b32 %r30, %r225, 5;
    mov.u32 %r4, %tid.x;
    add.s32 %r224, %r30, %r4;
    setp.ge.u32 %p1, %r224, %r28;
    // zero the six batched accumulators
    mov.f32 %f448, 0f00000000;
    mov.f32 %f449, %f448;
    mov.f32 %f450, %f448;
    mov.f32 %f451, %f448;
    mov.f32 %f452, %f448;
    mov.f32 %f453, %f448;
    @%p1 bra $L__BB0_9;

    cvta.to.global.u64 %rd3, %rd22;
    mul.lo.s32 %r7, %r28, %r1;
    shr.u32 %r8, %r4, 2;
    mul.lo.s32 %r9, %r29, %r1;
    shl.b16 %rs2, %rs76, 3;
    mul.wide.s32 %rd5, %r26, 2;

// main loop: each iteration loads one u32 of packed weights (8 int4 values)
// plus the fp16 scale for the 32-wide quantization group
$L__BB0_2:
    add.s32 %r32, %r224, %r7;
    mul.wide.u32 %rd32, %r32, 4;
    add.s64 %rd27, %rd23, %rd32;
    // begin inline asm
    createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd26, 1.0;
    // end inline asm
    // begin inline asm
    ld.global.L1::no_allocate.L2::cache_hint.u32 %r31, [%rd27], %rd26;
    // end inline asm
    shl.b32 %r33, %r225, 3;
    add.s32 %r13, %r33, %r8;
    add.s32 %r14, %r13, %r9;
    mul.wide.s32 %rd33, %r14, 2;
    add.s64 %rd30, %rd25, %rd33;
    // begin inline asm
    createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd29, 1.0;
    // end inline asm
    // begin inline asm
    ld.global.L1::evict_last.L2::cache_hint.u16 %rs84, [%rd30], %rd29;
    // end inline asm
    // begin inline asm
    { cvt.f32.f16 %f81, %rs84;}
    // end inline asm
    setp.eq.s64 %p2, %rd24, 0;
    // default zero-point is %rs76 << 3 (8 when the flag byte is set)
    mov.u16 %rs248, %rs2;
    @%p2 bra $L__BB0_4;

    // zero-point tensor present: fetch the packed byte and select one 4-bit nibble
    shr.u32 %r34, %r14, 31;
    add.s32 %r35, %r14, %r34;
    shr.s32 %r36, %r35, 1;
    cvt.s64.s32 %rd37, %r36;
    add.s64 %rd35, %rd24, %rd37;
    // begin inline asm
    createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd34, 1.0;
    // end inline asm
    // begin inline asm
    ld.global.L1::evict_last.L2::cache_hint.u8 %rs86, [%rd35], %rd34;
    // end inline asm
    cvt.u32.u16 %r37, %rs86;
    and.b32 %r38, %r37, 255;
    shl.b32 %r39, %r13, 2;
    and.b32 %r40, %r39, 4;
    shr.u32 %r41, %r38, %r40;
    cvt.u16.u32 %rs87, %r41;
    and.b16 %rs248, %rs87, 15;

$L__BB0_4:
    shl.b32 %r15, %r224, 3;
    setp.ge.s32 %p3, %r15, %r26;
    @%p3 bra $L__BB0_8;

    setp.eq.s16 %p4, %rs76, 0;
    // load 8 fp16 elements from each of the six batched input vectors
    mul.wide.s32 %rd38, %r15, 2;
    add.s64 %rd39, %rd3, %rd38;
    ld.global.v4.u32 {%r42, %r43, %r44, %r45}, [%rd39];
    add.s64 %rd41, %rd39, %rd5;
    ld.global.v4.u32 {%r50, %r51, %r52, %r53}, [%rd41];
    add.s32 %r58, %r26, %r15;
    add.s32 %r59, %r58, %r26;
    mul.wide.s32 %rd42, %r59, 2;
    add.s64 %rd43, %rd3, %rd42;
    ld.global.v4.u32 {%r60, %r61, %r62, %r63}, [%rd43];
    add.s64 %rd44, %rd43, %rd5;
    ld.global.v4.u32 {%r68, %r69, %r70, %r71}, [%rd44];
    add.s64 %rd45, %rd44, %rd5;
    ld.global.v4.u32 {%r76, %r77, %r78, %r79}, [%rd45];
    add.s64 %rd46, %rd45, %rd5;
    ld.global.v4.u32 {%r84, %r85, %r86, %r87}, [%rd46];
    shr.u16 %rs89, %rs248, 3;
    and.b16 %rs90, %rs89, 1;
    setp.eq.b16 %p5, %rs90, 1;
    and.pred %p6, %p4, %p5;
    selp.b16 %rs91, -16, 0, %p6;
    or.b16 %rs92, %rs91, %rs248;
    cvt.s16.s8 %rs93, %rs92;
    cvt.rn.f32.s16 %f8, %rs93;
    // split the packed u32 into eight 4-bit fields, and each fp16 pair into halves
    cvt.u16.u32 %rs5, %r31;
    and.b16 %rs6, %rs5, 15;
    mov.b32 {%rs7, %rs15}, %r42;
    mov.b32 {%rs8, %rs16}, %r50;
    mov.b32 {%rs9, %rs17}, %r60;
    mov.b32 {%rs10, %rs18}, %r68;
    mov.b32 {%rs11, %rs19}, %r76;
    mov.b32 {%rs12, %rs20}, %r84;
    shr.u32 %r92, %r31, 4;
    cvt.u16.u32 %rs13, %r92;
    and.b16 %rs14, %rs13, 15;
    shr.u32 %r93, %r31, 8;
    cvt.u16.u32 %rs21, %r93;
    and.b16 %rs22, %rs21, 15;
    mov.b32 {%rs23, %rs31}, %r43;
    mov.b32 {%rs24, %rs32}, %r51;
    mov.b32 {%rs25, %rs33}, %r61;
    mov.b32 {%rs26, %rs34}, %r69;
    mov.b32 {%rs27, %rs35}, %r77;
    mov.b32 {%rs28, %rs36}, %r85;
    shr.u32 %r94, %r31, 12;
    cvt.u16.u32 %rs29, %r94;
    and.b16 %rs30, %rs29, 15;
    shr.u32 %r95, %r31, 16;
    cvt.u16.u32 %rs37, %r95;
    and.b16 %rs38, %rs37, 15;
    mov.b32 {%rs39, %rs47}, %r44;
    mov.b32 {%rs40, %rs48}, %r52;
    mov.b32 {%rs41, %rs49}, %r62;
    mov.b32 {%rs42, %rs50}, %r70;
    mov.b32 {%rs43, %rs51}, %r78;
    mov.b32 {%rs44, %rs52}, %r86;
    shr.u32 %r96, %r31, 20;
    cvt.u16.u32 %rs45, %r96;
    and.b16 %rs46, %rs45, 15;
    shr.u32 %r97, %r31, 24;
    cvt.u16.u32 %rs53, %r97;
    and.b16 %rs54, %rs53, 15;
    mov.b32 {%rs55, %rs62}, %r45;
    mov.b32 {%rs56, %rs63}, %r53;
    mov.b32 {%rs57, %rs64}, %r63;
    mov.b32 {%rs58, %rs65}, %r71;
    mov.b32 {%rs59, %rs66}, %r79;
    mov.b32 {%rs60, %rs67}, %r87;
    shr.u32 %r98, %r31, 28;
    cvt.u16.u32 %rs61, %r98;
    @%p4 bra $L__BB0_7;

    // unsigned path: dequantize each nibble as scale * (q - zero_point),
    // then one FMA per batched accumulator
    cvt.rn.f32.s16 %f130, %rs6;
    sub.ftz.f32 %f131, %f130, %f8;
    mul.ftz.f32 %f132, %f81, %f131;
    // begin inline asm
    { cvt.f32.f16 %f82, %rs7;}
    // end inline asm
    fma.rn.ftz.f32 %f133, %f132, %f82, %f448;
    // begin inline asm
    { cvt.f32.f16 %f83, %rs8;}
    // end inline asm
    fma.rn.ftz.f32 %f134, %f132, %f83, %f449;
    // begin inline asm
    { cvt.f32.f16 %f84, %rs9;}
    // end inline asm
    fma.rn.ftz.f32 %f135, %f132, %f84, %f450;
    // begin inline asm
    { cvt.f32.f16 %f85, %rs10;}
    // end inline asm
    fma.rn.ftz.f32 %f136, %f132, %f85, %f451;
    // begin inline asm
    { cvt.f32.f16 %f86, %rs11;}
    // end inline asm
    fma.rn.ftz.f32 %f137, %f132, %f86, %f452;
    // begin inline asm
    { cvt.f32.f16 %f87, %rs12;}
    // end inline asm
    fma.rn.ftz.f32 %f138, %f132, %f87, %f453;
    cvt.rn.f32.s16 %f139, %rs14;
    sub.ftz.f32 %f140, %f139, %f8;
    mul.ftz.f32 %f141, %f81, %f140;
    // begin inline asm
    { cvt.f32.f16 %f88, %rs15;}
    // end inline asm
    fma.rn.ftz.f32 %f142, %f141, %f88, %f133;
    // begin inline asm
    { cvt.f32.f16 %f89, %rs16;}
    // end inline asm
    fma.rn.ftz.f32 %f143, %f141, %f89, %f134;
    // begin inline asm
    { cvt.f32.f16 %f90, %rs17;}
    // end inline asm
    fma.rn.ftz.f32 %f144, %f141, %f90, %f135;
    // begin inline asm
    { cvt.f32.f16 %f91, %rs18;}
    // end inline asm
    fma.rn.ftz.f32 %f145, %f141, %f91, %f136;
    // begin inline asm
    { cvt.f32.f16 %f92, %rs19;}
    // end inline asm
    fma.rn.ftz.f32 %f146, %f141, %f92, %f137;
    // begin inline asm
    { cvt.f32.f16 %f93, %rs20;}
    // end inline asm
    fma.rn.ftz.f32 %f147, %f141, %f93, %f138;
    cvt.rn.f32.s16 %f148, %rs22;
    sub.ftz.f32 %f149, %f148, %f8;
    mul.ftz.f32 %f150, %f81, %f149;
    // begin inline asm
    { cvt.f32.f16 %f94, %rs23;}
    // end inline asm
    fma.rn.ftz.f32 %f151, %f150, %f94, %f142;
    // begin inline asm
    { cvt.f32.f16 %f95, %rs24;}
    // end inline asm
    fma.rn.ftz.f32 %f152, %f150, %f95, %f143;
    // begin inline asm
    { cvt.f32.f16 %f96, %rs25;}
    // end inline asm
    fma.rn.ftz.f32 %f153, %f150, %f96, %f144;
    // begin inline asm
    { cvt.f32.f16 %f97, %rs26;}
    // end inline asm
    fma.rn.ftz.f32 %f154, %f150, %f97, %f145;
    // begin inline asm
    { cvt.f32.f16 %f98, %rs27;}
    // end inline asm
    fma.rn.ftz.f32 %f155, %f150, %f98, %f146;
    // begin inline asm
    { cvt.f32.f16 %f99, %rs28;}
    // end inline asm
    fma.rn.ftz.f32 %f156, %f150, %f99, %f147;
    cvt.rn.f32.s16 %f157, %rs30;
    sub.ftz.f32 %f158, %f157, %f8;
    mul.ftz.f32 %f159, %f81, %f158;
    // begin inline asm
    { cvt.f32.f16 %f100, %rs31;}
    // end inline asm
    fma.rn.ftz.f32 %f160, %f159, %f100, %f151;
    // begin inline asm
    { cvt.f32.f16 %f101, %rs32;}
    // end inline asm
    fma.rn.ftz.f32 %f161, %f159, %f101, %f152;
    // begin inline asm
    { cvt.f32.f16 %f102, %rs33;}
    // end inline asm
    fma.rn.ftz.f32 %f162, %f159, %f102, %f153;
    // begin inline asm
    { cvt.f32.f16 %f103, %rs34;}
    // end inline asm
    fma.rn.ftz.f32 %f163, %f159, %f103, %f154;
    // begin inline asm
    { cvt.f32.f16 %f104, %rs35;}
    // end inline asm
    fma.rn.ftz.f32 %f164, %f159, %f104, %f155;
    // begin inline asm
    { cvt.f32.f16 %f105, %rs36;}
    // end inline asm
    fma.rn.ftz.f32 %f165, %f159, %f105, %f156;
    cvt.rn.f32.s16 %f166, %rs38;
    sub.ftz.f32 %f167, %f166, %f8;
    mul.ftz.f32 %f168, %f81, %f167;
    // begin inline asm
    { cvt.f32.f16 %f106, %rs39;}
    // end inline asm
    fma.rn.ftz.f32 %f169, %f168, %f106, %f160;
    // begin inline asm
    { cvt.f32.f16 %f107, %rs40;}
    // end inline asm
    fma.rn.ftz.f32 %f170, %f168, %f107, %f161;
    // begin inline asm
    { cvt.f32.f16 %f108, %rs41;}
    // end inline asm
    fma.rn.ftz.f32 %f171, %f168, %f108, %f162;
    // begin inline asm
    { cvt.f32.f16 %f109, %rs42;}
    // end inline asm
    fma.rn.ftz.f32 %f172, %f168, %f109, %f163;
    // begin inline asm
    { cvt.f32.f16 %f110, %rs43;}
    // end inline asm
    fma.rn.ftz.f32 %f173, %f168, %f110, %f164;
    // begin inline asm
    { cvt.f32.f16 %f111, %rs44;}
    // end inline asm
    fma.rn.ftz.f32 %f174, %f168, %f111, %f165;
    cvt.rn.f32.s16 %f175, %rs46;
    sub.ftz.f32 %f176, %f175, %f8;
    mul.ftz.f32 %f177, %f81, %f176;
    // begin inline asm
    { cvt.f32.f16 %f112, %rs47;}
    // end inline asm
    fma.rn.ftz.f32 %f178, %f177, %f112, %f169;
    // begin inline asm
    { cvt.f32.f16 %f113, %rs48;}
    // end inline asm
    fma.rn.ftz.f32 %f179, %f177, %f113, %f170;
    // begin inline asm
    { cvt.f32.f16 %f114, %rs49;}
    // end inline asm
    fma.rn.ftz.f32 %f180, %f177, %f114, %f171;
    // begin inline asm
    { cvt.f32.f16 %f115, %rs50;}
    // end inline asm
    fma.rn.ftz.f32 %f181, %f177, %f115, %f172;
    // begin inline asm
    { cvt.f32.f16 %f116, %rs51;}
    // end inline asm
    fma.rn.ftz.f32 %f182, %f177, %f116, %f173;
    // begin inline asm
    { cvt.f32.f16 %f117, %rs52;}
    // end inline asm
    fma.rn.ftz.f32 %f183, %f177, %f117, %f174;
    cvt.rn.f32.s16 %f184, %rs54;
    sub.ftz.f32 %f185, %f184, %f8;
    mul.ftz.f32 %f186, %f81, %f185;
    // begin inline asm
    { cvt.f32.f16 %f118, %rs55;}
    // end inline asm
    fma.rn.ftz.f32 %f187, %f186, %f118, %f178;
    // begin inline asm
    { cvt.f32.f16 %f119, %rs56;}
    // end inline asm
    fma.rn.ftz.f32 %f188, %f186, %f119, %f179;
    // begin inline asm
    { cvt.f32.f16 %f120, %rs57;}
    // end inline asm
    fma.rn.ftz.f32 %f189, %f186, %f120, %f180;
    // begin inline asm
    { cvt.f32.f16 %f121, %rs58;}
    // end inline asm
    fma.rn.ftz.f32 %f190, %f186, %f121, %f181;
    // begin inline asm
    { cvt.f32.f16 %f122, %rs59;}
    // end inline asm
    fma.rn.ftz.f32 %f191, %f186, %f122, %f182;
    // begin inline asm
    { cvt.f32.f16 %f123, %rs60;}
    // end inline asm
    fma.rn.ftz.f32 %f192, %f186, %f123, %f183;
    cvt.rn.f32.s16 %f193, %rs61;
    sub.ftz.f32 %f194, %f193, %f8;
    mul.ftz.f32 %f195, %f81, %f194;
    // begin inline asm
    { cvt.f32.f16 %f124, %rs62;}
    // end inline asm
    fma.rn.ftz.f32 %f448, %f195, %f124, %f187;
    // begin inline asm
    { cvt.f32.f16 %f125, %rs63;}
    // end inline asm
    fma.rn.ftz.f32 %f449, %f195, %f125, %f188;
    // begin inline asm
    { cvt.f32.f16 %f126, %rs64;}
    // end inline asm
    fma.rn.ftz.f32 %f450, %f195, %f126, %f189;
    // begin inline asm
    { cvt.f32.f16 %f127, %rs65;}
    // end inline asm
    fma.rn.ftz.f32 %f451, %f195, %f127, %f190;
    // begin inline asm
    { cvt.f32.f16 %f128, %rs66;}
    // end inline asm
    fma.rn.ftz.f32 %f452, %f195, %f128, %f191;
    // begin inline asm
    { cvt.f32.f16 %f129, %rs67;}
    // end inline asm
    fma.rn.ftz.f32 %f453, %f195, %f129, %f192;
    bra.uni $L__BB0_8;

// signed path: sign-extend each 4-bit value ((s8)(x << 4) >> 7 gives the sign
// mask) before the same dequantize-and-accumulate sequence
$L__BB0_7:
    shl.b16 %rs190, %rs5, 4;
    cvt.s16.s8 %rs191, %rs190;
    shr.s16 %rs192, %rs191, 7;
    and.b16 %rs193, %rs192, -16;
    or.b16 %rs194, %rs193, %rs6;
    cvt.rn.f32.s16 %f244, %rs194;
    sub.ftz.f32 %f245, %f244, %f8;
    mul.ftz.f32 %f246, %f81, %f245;
    // begin inline asm
    { cvt.f32.f16 %f196, %rs7;}
    // end inline asm
    fma.rn.ftz.f32 %f247, %f246, %f196, %f448;
    // begin inline asm
    { cvt.f32.f16 %f197, %rs8;}
    // end inline asm
    fma.rn.ftz.f32 %f248, %f246, %f197, %f449;
    // begin inline asm
    { cvt.f32.f16 %f198, %rs9;}
    // end inline asm
    fma.rn.ftz.f32 %f249, %f246, %f198, %f450;
    // begin inline asm
    { cvt.f32.f16 %f199, %rs10;}
    // end inline asm
    fma.rn.ftz.f32 %f250, %f246, %f199, %f451;
    // begin inline asm
    { cvt.f32.f16 %f200, %rs11;}
    // end inline asm
    fma.rn.ftz.f32 %f251, %f246, %f200, %f452;
    // begin inline asm
    { cvt.f32.f16 %f201, %rs12;}
    // end inline asm
    fma.rn.ftz.f32 %f252, %f246, %f201, %f453;
    shl.b16 %rs195, %rs13, 4;
    cvt.s16.s8 %rs196, %rs195;
    shr.s16 %rs197, %rs196, 7;
    and.b16 %rs198, %rs197, -16;
    or.b16 %rs199, %rs198, %rs14;
    cvt.rn.f32.s16 %f253, %rs199;
    sub.ftz.f32 %f254, %f253, %f8;
    mul.ftz.f32 %f255, %f81, %f254;
    // begin inline asm
    { cvt.f32.f16 %f202, %rs15;}
    // end inline asm
    fma.rn.ftz.f32 %f256, %f255, %f202, %f247;
    // begin inline asm
    { cvt.f32.f16 %f203, %rs16;}
    // end inline asm
    fma.rn.ftz.f32 %f257, %f255, %f203, %f248;
    // begin inline asm
    { cvt.f32.f16 %f204, %rs17;}
    // end inline asm
    fma.rn.ftz.f32 %f258, %f255, %f204, %f249;
    // begin inline asm
    { cvt.f32.f16 %f205, %rs18;}
    // end inline asm
    fma.rn.ftz.f32 %f259, %f255, %f205, %f250;
    // begin inline asm
    { cvt.f32.f16 %f206, %rs19;}
    // end inline asm
    fma.rn.ftz.f32 %f260, %f255, %f206, %f251;
    // begin inline asm
    { cvt.f32.f16 %f207, %rs20;}
    // end inline asm
    fma.rn.ftz.f32 %f261, %f255, %f207, %f252;
    shl.b16 %rs200, %rs21, 4;
    cvt.s16.s8 %rs201, %rs200;
    shr.s16 %rs202, %rs201, 7;
    and.b16 %rs203, %rs202, -16;
    or.b16 %rs204, %rs203, %rs22;
    cvt.rn.f32.s16 %f262, %rs204;
    sub.ftz.f32 %f263, %f262, %f8;
    mul.ftz.f32 %f264, %f81, %f263;
    // begin inline asm
    { cvt.f32.f16 %f208, %rs23;}
    // end inline asm
    fma.rn.ftz.f32 %f265, %f264, %f208, %f256;
    // begin inline asm
    { cvt.f32.f16 %f209, %rs24;}
    // end inline asm
    fma.rn.ftz.f32 %f266, %f264, %f209, %f257;
    // begin inline asm
    { cvt.f32.f16 %f210, %rs25;}
    // end inline asm
    fma.rn.ftz.f32 %f267, %f264, %f210, %f258;
    // begin inline asm
    { cvt.f32.f16 %f211, %rs26;}
    // end inline asm
    fma.rn.ftz.f32 %f268, %f264, %f211, %f259;
    // begin inline asm
    { cvt.f32.f16 %f212, %rs27;}
    // end inline asm
    fma.rn.ftz.f32 %f269, %f264, %f212, %f260;
    // begin inline asm
    { cvt.f32.f16 %f213, %rs28;}
    // end inline asm
    fma.rn.ftz.f32 %f270, %f264, %f213, %f261;
    shl.b16 %rs205, %rs29, 4;
    cvt.s16.s8 %rs206, %rs205;
    shr.s16 %rs207, %rs206, 7;
    and.b16 %rs208, %rs207, -16;
    or.b16 %rs209, %rs208, %rs30;
    cvt.rn.f32.s16 %f271, %rs209;
    sub.ftz.f32 %f272, %f271, %f8;
    mul.ftz.f32 %f273, %f81, %f272;
    // begin inline asm
    { cvt.f32.f16 %f214, %rs31;}
    // end inline asm
    fma.rn.ftz.f32 %f274, %f273, %f214, %f265;
    // begin inline asm
    { cvt.f32.f16 %f215, %rs32;}
    // end inline asm
    fma.rn.ftz.f32 %f275, %f273, %f215, %f266;
    // begin inline asm
    { cvt.f32.f16 %f216, %rs33;}
    // end inline asm
    fma.rn.ftz.f32 %f276, %f273, %f216, %f267;
    // begin inline asm
    { cvt.f32.f16 %f217, %rs34;}
    // end inline asm
    fma.rn.ftz.f32 %f277, %f273, %f217, %f268;
    // begin inline asm
    { cvt.f32.f16 %f218, %rs35;}
    // end inline asm
    fma.rn.ftz.f32 %f278, %f273, %f218, %f269;
    // begin inline asm
    { cvt.f32.f16 %f219, %rs36;}
    // end inline asm
    fma.rn.ftz.f32 %f279, %f273, %f219, %f270;
    shl.b16 %rs210, %rs37, 4;
    cvt.s16.s8 %rs211, %rs210;
    shr.s16 %rs212, %rs211, 7;
    and.b16 %rs213, %rs212, -16;
    or.b16 %rs214, %rs213, %rs38;
    cvt.rn.f32.s16 %f280, %rs214;
    sub.ftz.f32 %f281, %f280, %f8;
    mul.ftz.f32 %f282, %f81, %f281;
    // begin inline asm
    { cvt.f32.f16 %f220, %rs39;}
    // end inline asm
    fma.rn.ftz.f32 %f283, %f282, %f220, %f274;
    // begin inline asm
    { cvt.f32.f16 %f221, %rs40;}
    // end inline asm
    fma.rn.ftz.f32 %f284, %f282, %f221, %f275;
    // begin inline asm
    { cvt.f32.f16 %f222, %rs41;}
    // end inline asm
    fma.rn.ftz.f32 %f285, %f282, %f222, %f276;
    // begin inline asm
    { cvt.f32.f16 %f223, %rs42;}
    // end inline asm
    fma.rn.ftz.f32 %f286, %f282, %f223, %f277;
    // begin inline asm
    { cvt.f32.f16 %f224, %rs43;}
    // end inline asm
    fma.rn.ftz.f32 %f287, %f282, %f224, %f278;
    // begin inline asm
    { cvt.f32.f16 %f225, %rs44;}
    // end inline asm
    fma.rn.ftz.f32 %f288, %f282, %f225, %f279;
    shl.b16 %rs215, %rs45, 4;
    cvt.s16.s8 %rs216, %rs215;
    shr.s16 %rs217, %rs216, 7;
    and.b16 %rs218, %rs217, -16;
    or.b16 %rs219, %rs218, %rs46;
    cvt.rn.f32.s16 %f289, %rs219;
    sub.ftz.f32 %f290, %f289, %f8;
    mul.ftz.f32 %f291, %f81, %f290;
    // begin inline asm
    { cvt.f32.f16 %f226, %rs47;}
    // end inline asm
    fma.rn.ftz.f32 %f292, %f291, %f226, %f283;
    // begin inline asm
    { cvt.f32.f16 %f227, %rs48;}
    // end inline asm
    fma.rn.ftz.f32 %f293, %f291, %f227, %f284;
    // begin inline asm
    { cvt.f32.f16 %f228, %rs49;}
    // end inline asm
    fma.rn.ftz.f32 %f294, %f291, %f228, %f285;
    // begin inline asm
    { cvt.f32.f16 %f229, %rs50;}
    // end inline asm
    fma.rn.ftz.f32 %f295, %f291, %f229, %f286;
    // begin inline asm
    { cvt.f32.f16 %f230, %rs51;}
    // end inline asm
    fma.rn.ftz.f32 %f296, %f291, %f230, %f287;
    // begin inline asm
    { cvt.f32.f16 %f231, %rs52;}
    // end inline asm
    fma.rn.ftz.f32 %f297, %f291, %f231, %f288;
    shl.b16 %rs220, %rs53, 4;
    cvt.s16.s8 %rs221, %rs220;
    shr.s16 %rs222, %rs221, 7;
    and.b16 %rs223, %rs222, -16;
    or.b16 %rs224, %rs223, %rs54;
    cvt.rn.f32.s16 %f298, %rs224;
    sub.ftz.f32 %f299, %f298, %f8;
    mul.ftz.f32 %f300, %f81, %f299;
    // begin inline asm
    { cvt.f32.f16 %f232, %rs55;}
    // end inline asm
    fma.rn.ftz.f32 %f301, %f300, %f232, %f292;
    // begin inline asm
    { cvt.f32.f16 %f233, %rs56;}
    // end inline asm
    fma.rn.ftz.f32 %f302, %f300, %f233, %f293;
    // begin inline asm
    { cvt.f32.f16 %f234, %rs57;}
    // end inline asm
    fma.rn.ftz.f32 %f303, %f300, %f234, %f294;
    // begin inline asm
    { cvt.f32.f16 %f235, %rs58;}
    // end inline asm
    fma.rn.ftz.f32 %f304, %f300, %f235, %f295;
    // begin inline asm
    { cvt.f32.f16 %f236, %rs59;}
    // end inline asm
    fma.rn.ftz.f32 %f305, %f300, %f236, %f296;
    // begin inline asm
    { cvt.f32.f16 %f237, %rs60;}
    // end inline asm
    fma.rn.ftz.f32 %f306, %f300, %f237, %f297;
    shl.b16 %rs225, %rs61, 4;
    cvt.s16.s8 %rs226, %rs225;
    shr.s16 %rs227, %rs226, 7;
    and.b16 %rs228, %rs227, -16;
    or.b16 %rs229, %rs228, %rs61;
    cvt.rn.f32.s16 %f307, %rs229;
    sub.ftz.f32 %f308, %f307, %f8;
    mul.ftz.f32 %f309, %f81, %f308;
    // begin inline asm
    { cvt.f32.f16 %f238, %rs62;}
    // end inline asm
    fma.rn.ftz.f32 %f448, %f309, %f238, %f301;
    // begin inline asm
    { cvt.f32.f16 %f239, %rs63;}
    // end inline asm
    fma.rn.ftz.f32 %f449, %f309, %f239, %f302;
    // begin inline asm
    { cvt.f32.f16 %f240, %rs64;}
    // end inline asm
    fma.rn.ftz.f32 %f450, %f309, %f240, %f303;
    // begin inline asm
    { cvt.f32.f16 %f241, %rs65;}
    // end inline asm
    fma.rn.ftz.f32 %f451, %f309, %f241, %f304;
    // begin inline asm
    { cvt.f32.f16 %f242, %rs66;}
    // end inline asm
    fma.rn.ftz.f32 %f452, %f309, %f242, %f305;
    // begin inline asm
    { cvt.f32.f16 %f243, %rs67;}
    // end inline asm
    fma.rn.ftz.f32 %f453, %f309, %f243, %f306;

// loop tail: step the tid.y cursor by 4 and continue while the index is in range
$L__BB0_8:
    add.s32 %r225, %r225, 4;
    shl.b32 %r99, %r225, 5;
    add.s32 %r224, %r99, %r4;
    setp.lt.u32 %p7, %r224, %r28;
    @%p7 bra $L__BB0_2;

// cross-warp reduction: warps other than warp 0 stage their partials in shared
// memory, warp 0 adds them and finishes with a shfl.sync.down tree; the stanza
// below repeats once per batched accumulator (%f448 through %f453)
$L__BB0_9:
    mov.u32 %r220, %tid.y;
    shl.b32 %r219, %r220, 5;
    add.s32 %r218, %r219, %r4;
    shl.b32 %r100, %r218, 2;
    mov.u32 %r101, _ZZ9gemv_int4ILi4ELi32ELi6EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage;
    add.s32 %r102, %r101, %r100;
    setp.lt.u32 %p8, %r218, 32;
    @%p8 bra $L__BB0_11;

    add.s32 %r209, %r102, -112;
    st.shared.f32 [%r209], %f448;

$L__BB0_11:
    mov.u32 %r223, %tid.y;
    shl.b32 %r222, %r223, 5;
    add.s32 %r221, %r222, %r4;
    setp.gt.u32 %p9, %r221, 31;
    bar.sync 0;
    mad.lo.s32 %r19, %r221, 12, %r101;
    @%p9 bra $L__BB0_13;

    mov.u32 %r117, 16;
    ld.shared.f32 %f325, [%r19+16];
    add.ftz.f32 %f326, %f448, %f325;
    ld.shared.f32 %f327, [%r19+20];
    add.ftz.f32 %f328, %f326, %f327;
    ld.shared.f32 %f329, [%r19+24];
    add.ftz.f32 %f312, %f328, %f329;
    mov.u32 %r105, 1;
    mov.u32 %r118, 31;
    mov.u32 %r119, -1;
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f312, %r105, %r118, %r119;
      @p add.f32 r0, r0, %f312;
      mov.f32 %f310, r0;}
    // end inline asm
    mov.u32 %r108, 2;
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f310, %r108, %r118, %r119;
      @p add.f32 r0, r0, %f310;
      mov.f32 %f313, r0;}
    // end inline asm
    mov.u32 %r111, 4;
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f313, %r111, %r118, %r119;
      @p add.f32 r0, r0, %f313;
      mov.f32 %f316, r0;}
    // end inline asm
    mov.u32 %r114, 8;
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f316, %r114, %r118, %r119;
      @p add.f32 r0, r0, %f316;
      mov.f32 %f319, r0;}
    // end inline asm
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f319, %r117, %r118, %r119;
      @p add.f32 r0, r0, %f319;
      mov.f32 %f448, r0;}
    // end inline asm

$L__BB0_13:
    @%p8 bra $L__BB0_15;

    add.s32 %r210, %r102, -112;
    st.shared.f32 [%r210+640], %f449;

$L__BB0_15:
    bar.sync 0;
    @%p9 bra $L__BB0_17;

    ld.shared.f32 %f345, [%r19+656];
    add.ftz.f32 %f346, %f449, %f345;
    ld.shared.f32 %f347, [%r19+660];
    add.ftz.f32 %f348, %f346, %f347;
    ld.shared.f32 %f349, [%r19+664];
    add.ftz.f32 %f332, %f348, %f349;
    mov.u32 %r121, 1;
    mov.u32 %r134, 31;
    mov.u32 %r135, -1;
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f332, %r121, %r134, %r135;
      @p add.f32 r0, r0, %f332;
      mov.f32 %f330, r0;}
    // end inline asm
    mov.u32 %r124, 2;
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f330, %r124, %r134, %r135;
      @p add.f32 r0, r0, %f330;
      mov.f32 %f333, r0;}
    // end inline asm
    mov.u32 %r127, 4;
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f333, %r127, %r134, %r135;
      @p add.f32 r0, r0, %f333;
      mov.f32 %f336, r0;}
    // end inline asm
    mov.u32 %r130, 8;
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f336, %r130, %r134, %r135;
      @p add.f32 r0, r0, %f336;
      mov.f32 %f339, r0;}
    // end inline asm
    mov.u32 %r133, 16;
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f339, %r133, %r134, %r135;
      @p add.f32 r0, r0, %f339;
      mov.f32 %f449, r0;}
    // end inline asm

$L__BB0_17:
    @%p8 bra $L__BB0_19;

    add.s32 %r211, %r102, -112;
    st.shared.f32 [%r211+1280], %f450;

$L__BB0_19:
    bar.sync 0;
    @%p9 bra $L__BB0_21;

    ld.shared.f32 %f365, [%r19+1296];
    add.ftz.f32 %f366, %f450, %f365;
    ld.shared.f32 %f367, [%r19+1300];
    add.ftz.f32 %f368, %f366, %f367;
    ld.shared.f32 %f369, [%r19+1304];
    add.ftz.f32 %f352, %f368, %f369;
    mov.u32 %r137, 1;
    mov.u32 %r150, 31;
    mov.u32 %r151, -1;
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f352, %r137, %r150, %r151;
      @p add.f32 r0, r0, %f352;
      mov.f32 %f350, r0;}
    // end inline asm
    mov.u32 %r140, 2;
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f350, %r140, %r150, %r151;
      @p add.f32 r0, r0, %f350;
      mov.f32 %f353, r0;}
    // end inline asm
    mov.u32 %r143, 4;
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f353, %r143, %r150, %r151;
      @p add.f32 r0, r0, %f353;
      mov.f32 %f356, r0;}
    // end inline asm
    mov.u32 %r146, 8;
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f356, %r146, %r150, %r151;
      @p add.f32 r0, r0, %f356;
      mov.f32 %f359, r0;}
    // end inline asm
    mov.u32 %r149, 16;
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f359, %r149, %r150, %r151;
      @p add.f32 r0, r0, %f359;
      mov.f32 %f450, r0;}
    // end inline asm

$L__BB0_21:
    @%p8 bra $L__BB0_23;

    add.s32 %r212, %r102, -112;
    st.shared.f32 [%r212+1920], %f451;

$L__BB0_23:
    bar.sync 0;
    @%p9 bra $L__BB0_25;

    ld.shared.f32 %f385, [%r19+1936];
    add.ftz.f32 %f386, %f451, %f385;
    ld.shared.f32 %f387, [%r19+1940];
    add.ftz.f32 %f388, %f386, %f387;
    ld.shared.f32 %f389, [%r19+1944];
    add.ftz.f32 %f372, %f388, %f389;
    mov.u32 %r153, 1;
    mov.u32 %r166, 31;
    mov.u32 %r167, -1;
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f372, %r153, %r166, %r167;
      @p add.f32 r0, r0, %f372;
      mov.f32 %f370, r0;}
    // end inline asm
    mov.u32 %r156, 2;
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f370, %r156, %r166, %r167;
      @p add.f32 r0, r0, %f370;
      mov.f32 %f373, r0;}
    // end inline asm
    mov.u32 %r159, 4;
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f373, %r159, %r166, %r167;
      @p add.f32 r0, r0, %f373;
      mov.f32 %f376, r0;}
    // end inline asm
    mov.u32 %r162, 8;
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f376, %r162, %r166, %r167;
      @p add.f32 r0, r0, %f376;
      mov.f32 %f379, r0;}
    // end inline asm
    mov.u32 %r165, 16;
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f379, %r165, %r166, %r167;
      @p add.f32 r0, r0, %f379;
      mov.f32 %f451, r0;}
    // end inline asm

$L__BB0_25:
    @%p8 bra $L__BB0_27;

    add.s32 %r213, %r102, -112;
    st.shared.f32 [%r213+2560], %f452;

$L__BB0_27:
    bar.sync 0;
    @%p9 bra $L__BB0_29;

    ld.shared.f32 %f405, [%r19+2576];
    add.ftz.f32 %f406, %f452, %f405;
    ld.shared.f32 %f407, [%r19+2580];
    add.ftz.f32 %f408, %f406, %f407;
    ld.shared.f32 %f409, [%r19+2584];
    add.ftz.f32 %f392, %f408, %f409;
    mov.u32 %r169, 1;
    mov.u32 %r182, 31;
    mov.u32 %r183, -1;
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f392, %r169, %r182, %r183;
      @p add.f32 r0, r0, %f392;
      mov.f32 %f390, r0;}
    // end inline asm
    mov.u32 %r172, 2;
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f390, %r172, %r182, %r183;
      @p add.f32 r0, r0, %f390;
      mov.f32 %f393, r0;}
    // end inline asm
    mov.u32 %r175, 4;
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f393, %r175, %r182, %r183;
      @p add.f32 r0, r0, %f393;
      mov.f32 %f396, r0;}
    // end inline asm
    mov.u32 %r178, 8;
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f396, %r178, %r182, %r183;
      @p add.f32 r0, r0, %f396;
      mov.f32 %f399, r0;}
    // end inline asm
    mov.u32 %r181, 16;
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f399, %r181, %r182, %r183;
      @p add.f32 r0, r0, %f399;
      mov.f32 %f452, r0;}
    // end inline asm

$L__BB0_29:
    @%p8 bra $L__BB0_31;

    add.s32 %r214, %r102, -112;
    st.shared.f32 [%r214+3200], %f453;

$L__BB0_31:
    bar.sync 0;
    @%p9 bra $L__BB0_33;

    ld.shared.f32 %f425, [%r19+3216];
    add.ftz.f32 %f426, %f453, %f425;
    ld.shared.f32 %f427, [%r19+3220];
    add.ftz.f32 %f428, %f426, %f427;
    ld.shared.f32 %f429, [%r19+3224];
    add.ftz.f32 %f412, %f428, %f429;
    mov.u32 %r185, 1;
    mov.u32 %r198, 31;
    mov.u32 %r199, -1;
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f412, %r185, %r198, %r199;
      @p add.f32 r0, r0, %f412;
      mov.f32 %f410, r0;}
    // end inline asm
    mov.u32 %r188, 2;
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f410, %r188, %r198, %r199;
      @p add.f32 r0, r0, %f410;
      mov.f32 %f413, r0;}
    // end inline asm
    mov.u32 %r191, 4;
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f413, %r191, %r198, %r199;
      @p add.f32 r0, r0, %f413;
      mov.f32 %f416, r0;}
    // end inline asm
    mov.u32 %r194, 8;
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f416, %r194, %r198, %r199;
      @p add.f32 r0, r0, %f416;
      mov.f32 %f419, r0;}
    // end inline asm
    mov.u32 %r197, 16;
    // begin inline asm
    { .reg .f32 r0;
      .reg .pred p;
      shfl.sync.down.b32 r0|p, %f419, %r197, %r198, %r199;
      @p add.f32 r0, r0, %f419;
      mov.f32 %f453, r0;}
    // end inline asm

// epilogue: thread (0,0) scales each reduced sum by %f73 and, when the pointer
// at param+8 is non-null, adds %f74 times the vector it points to
$L__BB0_33:
    mov.u32 %r215, %tid.y;
    or.b32 %r200, %r4, %r215;
    setp.ne.s32 %p20, %r200, 0;
    @%p20 bra $L__BB0_47;

    ld.param.u64 %rd80, [_Z27dequant_gemv_group32_batch623DequantGemvKernelParams_param_0+8];
    mov.u32 %r216, %ctaid.x;
    setp.eq.s64 %p21, %rd80, 0;
    mul.ftz.f32 %f466, %f73, %f448;
    cvt.s64.s32 %rd8, %r216;
    @%p21 bra $L__BB0_36;

    shl.b64 %rd47, %rd8, 1;
    add.s64 %rd48, %rd2, %rd47;
    ld.global.u16 %rs230, [%rd48];
    // begin inline asm
    { cvt.f32.f16 %f430, %rs230;}
    // end inline asm
    fma.rn.ftz.f32 %f466, %f74, %f430, %f466;

$L__BB0_36:
    ld.param.u64 %rd81, [_Z27dequant_gemv_group32_batch623DequantGemvKernelParams_param_0];
    mov.u32 %r217, %ctaid.x;
    // begin inline asm
    { cvt.rn.f16.f32 %rs231, %f466;}
    // end inline asm
    // begin inline asm
    createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd49, 1.0;
    // end inline asm
    shl.b64 %rd52, %rd8, 1;
    add.s64 %rd50, %rd81, %rd52;
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd50], %rs231, %rd49;
    // end inline asm
    mul.ftz.f32 %f467, %f73, %f449;
    add.s32 %r201, %r27, %r217;
    cvt.s64.s32 %rd11, %r201;
    @%p21 bra $L__BB0_38;

    shl.b64 %rd53, %rd11, 1;
    add.s64 %rd54, %rd2, %rd53;
    ld.global.u16 %rs233, [%rd54];
    // begin inline asm
    { cvt.f32.f16 %f432, %rs233;}
    // end inline asm
    fma.rn.ftz.f32 %f467, %f74, %f432, %f467;

$L__BB0_38:
    cvt.s64.s32 %rd12, %r27;
    mul.wide.s32 %rd58, %r27, 2;
    add.s64 %rd56, %rd50, %rd58;
    // begin inline asm
    { cvt.rn.f16.f32 %rs234, %f467;}
    // end inline asm
    // begin inline asm
    createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd55, 1.0;
    // end inline asm
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd56], %rs234, %rd55;
    // end inline asm
    mul.ftz.f32 %f468, %f73, %f450;
    cvt.u32.u64 %r202, %rd11;
    add.s32 %r203, %r202, %r27;
    cvt.s64.s32 %rd13, %r203;
    @%p21 bra $L__BB0_40;

    shl.b64 %rd59, %rd13, 1;
    add.s64 %rd60, %rd2, %rd59;
    ld.global.u16 %rs236, [%rd60];
    // begin inline asm
    { cvt.f32.f16 %f434, %rs236;}
    // end inline asm
    fma.rn.ftz.f32 %f468, %f74, %f434, %f468;

$L__BB0_40:
    ld.param.u64 %rd82, [_Z27dequant_gemv_group32_batch623DequantGemvKernelParams_param_0];
    shl.b64 %rd64, %rd13, 1;
    add.s64 %rd62, %rd82, %rd64;
    // begin inline asm
    { cvt.rn.f16.f32 %rs237, %f468;}
    // end inline asm
    // begin inline asm
    createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd61, 1.0;
    // end inline asm
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd62], %rs237, %rd61;
    // end inline asm
    mul.ftz.f32 %f469, %f73, %f451;
    cvt.u32.u64 %r204, %rd13;
    add.s32 %r205, %r204, %r27;
    cvt.s64.s32 %rd15, %r205;
    @%p21 bra $L__BB0_42;

    shl.b64 %rd65, %rd15, 1;
    add.s64 %rd66, %rd2, %rd65;
    ld.global.u16 %rs239, [%rd66];
    // begin inline asm
    { cvt.f32.f16 %f436, %rs239;}
    // end inline asm
    fma.rn.ftz.f32 %f469, %f74, %f436, %f469;

$L__BB0_42:
    // begin inline asm
    { cvt.rn.f16.f32 %rs240, %f469;}
    // end inline asm
    // begin inline asm
    createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd67, 1.0;
    // end inline asm
    shl.b64 %rd16, %rd12, 1;
    add.s64 %rd68, %rd62, %rd16;
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd68], %rs240, %rd67;
    // end inline asm
    mul.ftz.f32 %f470, %f73, %f452;
    cvt.u32.u64 %r206, %rd15;
    add.s32 %r207, %r206, %r27;
    cvt.s64.s32 %rd18, %r207;
    @%p21 bra $L__BB0_44;

    shl.b64 %rd70, %rd18, 1;
    add.s64 %rd71, %rd2, %rd70;
    ld.global.u16 %rs242, [%rd71];
    // begin inline asm
    { cvt.f32.f16 %f438, %rs242;}
    // end inline asm
    fma.rn.ftz.f32 %f470, %f74, %f438, %f470;

$L__BB0_44:
    // begin inline asm
    { cvt.rn.f16.f32 %rs243, %f470;}
    // end inline asm
    // begin inline asm
    createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd72, 1.0;
    // end inline asm
    add.s64 %rd73, %rd68, %rd16;
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd73], %rs243, %rd72;
    // end inline asm
    mul.ftz.f32 %f471, %f73, %f453;
    cvt.u32.u64 %r208, %rd18;
    add.s32 %r21, %r208, %r27;
    @%p21 bra $L__BB0_46;

    mul.wide.s32 %rd75, %r21, 2;
    add.s64 %rd76, %rd2, %rd75;
    ld.global.u16 %rs245, [%rd76];
    // begin inline asm
    { cvt.f32.f16 %f440, %rs245;}
    // end inline asm
    fma.rn.ftz.f32 %f471, %f74, %f440, %f471;

$L__BB0_46:
    // begin inline asm
    { cvt.rn.f16.f32 %rs246, %f471;}
    // end inline asm
    // begin inline asm
    createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd77, 1.0;
    // end inline asm
    add.s64 %rd78, %rd73, %rd16;
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd78], %rs246, %rd77;
    // end inline asm

$L__BB0_47:
    ret;

}
    // .globl _ZN3cub11EmptyKernelIvEEvv
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
    ret;

}
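// ---------------------------------------------------------------------------
// Reader's note: the commented-out CUDA below is a minimal, hedged sketch of
// the kind of source kernel that plausibly lowers to the PTX above. It is an
// illustration, not the original gemv_int4<4,32,6> source: every name, the
// struct layout, and the indexing are assumptions; the six batched
// accumulators and the signed (sign-extended) dequant path are omitted; and
// blockDim.x == 32 is assumed so the shuffle reduction spans a full warp.
/*
#include <cuda_fp16.h>

struct GemvInt4Params {                 // hypothetical layout for illustration
    __half*              out;           // [rows] result vector
    const __half*        x;             // [cols] fp16 input vector
    const unsigned*      qweight;       // packed int4 weights, 8 values per u32
    const __half*        scales;        // one fp16 scale per 32-element group
    const unsigned char* zeros;         // packed 4-bit zero-points, 2 per byte; nullable
    int   rows, cols;                   // cols assumed divisible by 32
    float alpha;
};

__global__ void gemv_int4_sketch(GemvInt4Params p) {
    int row = blockIdx.x * blockDim.y + threadIdx.y;   // one warp per output row
    if (row >= p.rows) return;

    float acc = 0.0f;
    int words_per_row = p.cols / 8;
    for (int w = threadIdx.x; w < words_per_row; w += 32) {
        unsigned packed = p.qweight[row * words_per_row + w];
        int   gidx  = row * (p.cols / 32) + (w * 8) / 32;
        float scale = __half2float(p.scales[gidx]);
        float zp    = 8.0f;                            // default zero-point
        if (p.zeros) {                                 // select one 4-bit zero
            unsigned char zb = p.zeros[gidx / 2];
            zp = (float)((gidx & 1) ? (zb >> 4) : (zb & 0xF));
        }
        #pragma unroll
        for (int i = 0; i < 8; ++i) {                  // unpack 8 nibbles per u32
            int   q  = (packed >> (4 * i)) & 0xF;
            float wq = scale * ((float)q - zp);        // dequantized weight
            acc += wq * __half2float(p.x[w * 8 + i]);
        }
    }
    // Warp tree reduction, mirroring the shfl.sync.down chain in the PTX.
    for (int off = 16; off > 0; off >>= 1)
        acc += __shfl_down_sync(0xFFFFFFFFu, acc, off);
    if (threadIdx.x == 0)
        p.out[row] = __float2half(p.alpha * acc);
}
*/
// ---------------------------------------------------------------------------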