gemv_cu_02b20ef46thrust12placeholders2_3E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_02b20ef46thrust12placeholders2_4E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_02b20ef46thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_02b20ef46thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_02b20ef46thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_02b20ef46thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_02b20ef46thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_02b20ef46thrust12placeholders3_10E[1];

.visible .entry _Z27dequant_gemv_group32_batch323DequantGemvKernelParams(
.param .align 8 .b8 _Z27dequant_gemv_group32_batch323DequantGemvKernelParams_param_0[80]
)
{
.reg .pred %p<18>;
.reg .b16 %rs<168>;
.reg .f32 %f<265>;
.reg .b32 %r<145>;
.reg .b64 %rd<63>;
// demoted variable
.shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi32ELi3EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[1920];
ld.param.v2.u32 {%r25, %r26}, [_Z27dequant_gemv_group32_batch323DequantGemvKernelParams_param_0+48];
ld.param.v2.u32 {%r27, %r28}, [_Z27dequant_gemv_group32_batch323DequantGemvKernelParams_param_0+56];
ld.param.v2.f32 {%f40, %f41}, [_Z27dequant_gemv_group32_batch323DequantGemvKernelParams_param_0+64];
ld.param.v4.u8 {%rs52, %rs53, %rs54, %rs55}, [_Z27dequant_gemv_group32_batch323DequantGemvKernelParams_param_0+72];
ld.param.u64 %rd17, [_Z27dequant_gemv_group32_batch323DequantGemvKernelParams_param_0+40];
ld.param.u64 %rd16, [_Z27dequant_gemv_group32_batch323DequantGemvKernelParams_param_0+32];
ld.param.u64 %rd15, [_Z27dequant_gemv_group32_batch323DequantGemvKernelParams_param_0+24];
ld.param.u64 %rd14, [_Z27dequant_gemv_group32_batch323DequantGemvKernelParams_param_0+16];
mov.u32 %r1, %ctaid.x;
mov.u32 %r144, %tid.y;
shl.b32 %r29, %r144, 5;
mov.u32 %r3, %tid.x;
add.s32 %r143, %r29, %r3;
setp.ge.u32 %p1, %r143, %r27;
mov.f32 %f253, 0f00000000;
mov.f32 %f254, %f253;
mov.f32 %f255, %f253;
@%p1 bra $L__BB0_9;
cvta.to.global.u64 %rd3, %rd14;
mul.lo.s32 %r7, %r27, %r1;
shr.u32 %r8, %r3, 2;
mul.lo.s32 %r9, %r28, %r1;
shl.b16 %rs2, %rs52, 3;
$L__BB0_2:
add.s32 %r31, %r143, %r7;
mul.wide.u32 %rd24, %r31, 4;
add.s64 %rd19, %rd15, %rd24;
// begin inline asm
createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd18, 1.0;
// end inline asm
// begin inline asm
ld.global.L1::no_allocate.L2::cache_hint.u32 %r30, [%rd19], %rd18;
// end inline asm
shl.b32 %r32, %r144, 3;
add.s32 %r13, %r32, %r8;
add.s32 %r14, %r13, %r9;
mul.wide.s32 %rd25, %r14, 2;
add.s64 %rd22, %rd17, %rd25;
// begin inline asm
createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd21, 1.0;
// end inline asm
// begin inline asm
ld.global.L1::evict_last.L2::cache_hint.u16 %rs60, [%rd22], %rd21;
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f45, %rs60;}
// end inline asm
setp.eq.s64 %p2, %rd16, 0;
mov.u16 %rs167, %rs2;
@%p2 bra $L__BB0_4;
shr.u32 %r33, %r14, 31;
add.s32 %r34, %r14, %r33;
shr.s32 %r35, %r34, 1;
cvt.s64.s32 %rd29, %r35;
add.s64 %rd27, %rd16, %rd29;
// begin inline asm
createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd26, 1.0;
// end inline asm
// begin inline asm
ld.global.L1::evict_last.L2::cache_hint.u8 %rs62, [%rd27], %rd26;
// end inline asm
cvt.u32.u16 %r36, %rs62;
and.b32 %r37, %r36, 255;
shl.b32 %r38, %r13, 2;
and.b32 %r39, %r38, 4;
shr.u32 %r40, %r37, %r39;
cvt.u16.u32 %rs63, %r40;
and.b16 %rs167, %rs63, 15;
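// Note (added comment): the block below ($L__BB0_4 onward) unpacks the eight 4-bit
// fields of %r30, dequantizes each one as (value - zero_point) * scale, using the
// fp16 scale already converted into %f45 and the zero point in %f5, and folds the
// results into the running sums %f253, %f254 and %f255 with fused multiply-adds
// against three vectors of fp16 values. The branch on %p4 selects unsigned
// (fall-through) or signed ($L__BB0_7) decoding of the 4-bit fields.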
$L__BB0_4:
shl.b32 %r15, %r143, 3;
setp.ge.s32 %p3, %r15, %r25;
@%p3 bra $L__BB0_8;
setp.eq.s16 %p4, %rs52, 0;
mul.wide.s32 %rd30, %r15, 2;
add.s64 %rd31, %rd3, %rd30;
ld.global.v4.u32 {%r41, %r42, %r43, %r44}, [%rd31];
mul.wide.s32 %rd32, %r25, 2;
add.s64 %rd33, %rd31, %rd32;
ld.global.v4.u32 {%r49, %r50, %r51, %r52}, [%rd33];
add.s32 %r57, %r25, %r15;
add.s32 %r58, %r57, %r25;
mul.wide.s32 %rd34, %r58, 2;
add.s64 %rd35, %rd3, %rd34;
ld.global.v4.u32 {%r59, %r60, %r61, %r62}, [%rd35];
shr.u16 %rs65, %rs167, 3;
and.b16 %rs66, %rs65, 1;
setp.eq.b16 %p5, %rs66, 1;
and.pred %p6, %p4, %p5;
selp.b16 %rs67, -16, 0, %p6;
or.b16 %rs68, %rs67, %rs167;
cvt.s16.s8 %rs69, %rs68;
cvt.rn.f32.s16 %f5, %rs69;
cvt.u16.u32 %rs5, %r30;
and.b16 %rs6, %rs5, 15;
mov.b32 {%rs7, %rs12}, %r41;
mov.b32 {%rs8, %rs13}, %r49;
mov.b32 {%rs9, %rs14}, %r59;
shr.u32 %r67, %r30, 4;
cvt.u16.u32 %rs10, %r67;
and.b16 %rs11, %rs10, 15;
shr.u32 %r68, %r30, 8;
cvt.u16.u32 %rs15, %r68;
and.b16 %rs16, %rs15, 15;
mov.b32 {%rs17, %rs22}, %r42;
mov.b32 {%rs18, %rs23}, %r50;
mov.b32 {%rs19, %rs24}, %r60;
shr.u32 %r69, %r30, 12;
cvt.u16.u32 %rs20, %r69;
and.b16 %rs21, %rs20, 15;
shr.u32 %r70, %r30, 16;
cvt.u16.u32 %rs25, %r70;
and.b16 %rs26, %rs25, 15;
mov.b32 {%rs27, %rs32}, %r43;
mov.b32 {%rs28, %rs33}, %r51;
mov.b32 {%rs29, %rs34}, %r61;
shr.u32 %r71, %r30, 20;
cvt.u16.u32 %rs30, %r71;
and.b16 %rs31, %rs30, 15;
shr.u32 %r72, %r30, 24;
cvt.u16.u32 %rs35, %r72;
and.b16 %rs36, %rs35, 15;
mov.b32 {%rs37, %rs41}, %r44;
mov.b32 {%rs38, %rs42}, %r52;
mov.b32 {%rs39, %rs43}, %r62;
shr.u32 %r73, %r30, 28;
cvt.u16.u32 %rs40, %r73;
@%p4 bra $L__BB0_7;
cvt.rn.f32.s16 %f70, %rs6;
sub.ftz.f32 %f71, %f70, %f5;
mul.ftz.f32 %f72, %f45, %f71;
// begin inline asm
{ cvt.f32.f16 %f46, %rs7;}
// end inline asm
fma.rn.ftz.f32 %f73, %f72, %f46, %f253;
// begin inline asm
{ cvt.f32.f16 %f47, %rs8;}
// end inline asm
fma.rn.ftz.f32 %f74, %f72, %f47, %f254;
// begin inline asm
{ cvt.f32.f16 %f48, %rs9;}
// end inline asm
fma.rn.ftz.f32 %f75, %f72, %f48, %f255;
cvt.rn.f32.s16 %f76, %rs11;
sub.ftz.f32 %f77, %f76, %f5;
mul.ftz.f32 %f78, %f45, %f77;
// begin inline asm
{ cvt.f32.f16 %f49, %rs12;}
// end inline asm
fma.rn.ftz.f32 %f79, %f78, %f49, %f73;
// begin inline asm
{ cvt.f32.f16 %f50, %rs13;}
// end inline asm
fma.rn.ftz.f32 %f80, %f78, %f50, %f74;
// begin inline asm
{ cvt.f32.f16 %f51, %rs14;}
// end inline asm
fma.rn.ftz.f32 %f81, %f78, %f51, %f75;
cvt.rn.f32.s16 %f82, %rs16;
sub.ftz.f32 %f83, %f82, %f5;
mul.ftz.f32 %f84, %f45, %f83;
// begin inline asm
{ cvt.f32.f16 %f52, %rs17;}
// end inline asm
fma.rn.ftz.f32 %f85, %f84, %f52, %f79;
// begin inline asm
{ cvt.f32.f16 %f53, %rs18;}
// end inline asm
fma.rn.ftz.f32 %f86, %f84, %f53, %f80;
// begin inline asm
{ cvt.f32.f16 %f54, %rs19;}
// end inline asm
fma.rn.ftz.f32 %f87, %f84, %f54, %f81;
cvt.rn.f32.s16 %f88, %rs21;
sub.ftz.f32 %f89, %f88, %f5;
mul.ftz.f32 %f90, %f45, %f89;
// begin inline asm
{ cvt.f32.f16 %f55, %rs22;}
// end inline asm
fma.rn.ftz.f32 %f91, %f90, %f55, %f85;
// begin inline asm
{ cvt.f32.f16 %f56, %rs23;}
// end inline asm
fma.rn.ftz.f32 %f92, %f90, %f56, %f86;
// begin inline asm
{ cvt.f32.f16 %f57, %rs24;}
// end inline asm
fma.rn.ftz.f32 %f93, %f90, %f57, %f87;
cvt.rn.f32.s16 %f94, %rs26;
sub.ftz.f32 %f95, %f94, %f5;
mul.ftz.f32 %f96, %f45, %f95;
// begin inline asm
{ cvt.f32.f16 %f58, %rs27;}
// end inline asm
fma.rn.ftz.f32 %f97, %f96, %f58, %f91;
// begin inline asm
{ cvt.f32.f16 %f59, %rs28;}
// end inline asm
fma.rn.ftz.f32 %f98, %f96, %f59, %f92;
// begin inline asm
{ cvt.f32.f16 %f60, %rs29;}
// end inline asm
fma.rn.ftz.f32 %f99, %f96, %f60, %f93;
cvt.rn.f32.s16 %f100, %rs31;
sub.ftz.f32 %f101, %f100, %f5;
mul.ftz.f32 %f102, %f45, %f101;
// begin inline asm
{ cvt.f32.f16 %f61, %rs32;}
// end inline asm
fma.rn.ftz.f32 %f103, %f102, %f61, %f97;
// begin inline asm
{ cvt.f32.f16 %f62, %rs33;}
// end inline asm
fma.rn.ftz.f32 %f104, %f102, %f62, %f98;
// begin inline asm
{ cvt.f32.f16 %f63, %rs34;}
// end inline asm
fma.rn.ftz.f32 %f105, %f102, %f63, %f99;
cvt.rn.f32.s16 %f106, %rs36;
sub.ftz.f32 %f107, %f106, %f5;
mul.ftz.f32 %f108, %f45, %f107;
// begin inline asm
{ cvt.f32.f16 %f64, %rs37;}
// end inline asm
fma.rn.ftz.f32 %f109, %f108, %f64, %f103;
// begin inline asm
{ cvt.f32.f16 %f65, %rs38;}
// end inline asm
fma.rn.ftz.f32 %f110, %f108, %f65, %f104;
// begin inline asm
{ cvt.f32.f16 %f66, %rs39;}
// end inline asm
fma.rn.ftz.f32 %f111, %f108, %f66, %f105;
cvt.rn.f32.s16 %f112, %rs40;
sub.ftz.f32 %f113, %f112, %f5;
mul.ftz.f32 %f114, %f45, %f113;
// begin inline asm
{ cvt.f32.f16 %f67, %rs41;}
// end inline asm
fma.rn.ftz.f32 %f253, %f114, %f67, %f109;
// begin inline asm
{ cvt.f32.f16 %f68, %rs42;}
// end inline asm
fma.rn.ftz.f32 %f254, %f114, %f68, %f110;
// begin inline asm
{ cvt.f32.f16 %f69, %rs43;}
// end inline asm
fma.rn.ftz.f32 %f255, %f114, %f69, %f111;
bra.uni $L__BB0_8;
$L__BB0_7:
shl.b16 %rs118, %rs5, 4;
cvt.s16.s8 %rs119, %rs118;
shr.s16 %rs120, %rs119, 7;
and.b16 %rs121, %rs120, -16;
or.b16 %rs122, %rs121, %rs6;
cvt.rn.f32.s16 %f139, %rs122;
sub.ftz.f32 %f140, %f139, %f5;
mul.ftz.f32 %f141, %f45, %f140;
// begin inline asm
{ cvt.f32.f16 %f115, %rs7;}
// end inline asm
fma.rn.ftz.f32 %f142, %f141, %f115, %f253;
// begin inline asm
{ cvt.f32.f16 %f116, %rs8;}
// end inline asm
fma.rn.ftz.f32 %f143, %f141, %f116, %f254;
// begin inline asm
{ cvt.f32.f16 %f117, %rs9;}
// end inline asm
fma.rn.ftz.f32 %f144, %f141, %f117, %f255;
shl.b16 %rs123, %rs10, 4;
cvt.s16.s8 %rs124, %rs123;
shr.s16 %rs125, %rs124, 7;
and.b16 %rs126, %rs125, -16;
or.b16 %rs127, %rs126, %rs11;
cvt.rn.f32.s16 %f145, %rs127;
sub.ftz.f32 %f146, %f145, %f5;
mul.ftz.f32 %f147, %f45, %f146;
// begin inline asm
{ cvt.f32.f16 %f118, %rs12;}
// end inline asm
fma.rn.ftz.f32 %f148, %f147, %f118, %f142;
// begin inline asm
{ cvt.f32.f16 %f119, %rs13;}
// end inline asm
fma.rn.ftz.f32 %f149, %f147, %f119, %f143;
// begin inline asm
{ cvt.f32.f16 %f120, %rs14;}
// end inline asm
fma.rn.ftz.f32 %f150, %f147, %f120, %f144;
shl.b16 %rs128, %rs15, 4;
cvt.s16.s8 %rs129, %rs128;
shr.s16 %rs130, %rs129, 7;
and.b16 %rs131, %rs130, -16;
or.b16 %rs132, %rs131, %rs16;
cvt.rn.f32.s16 %f151, %rs132;
sub.ftz.f32 %f152, %f151, %f5;
mul.ftz.f32 %f153, %f45, %f152;
// begin inline asm
{ cvt.f32.f16 %f121, %rs17;}
// end inline asm
fma.rn.ftz.f32 %f154, %f153, %f121, %f148;
// begin inline asm
{ cvt.f32.f16 %f122, %rs18;}
// end inline asm
fma.rn.ftz.f32 %f155, %f153, %f122, %f149;
// begin inline asm
{ cvt.f32.f16 %f123, %rs19;}
// end inline asm
fma.rn.ftz.f32 %f156, %f153, %f123, %f150;
shl.b16 %rs133, %rs20, 4;
cvt.s16.s8 %rs134, %rs133;
shr.s16 %rs135, %rs134, 7;
and.b16 %rs136, %rs135, -16;
or.b16 %rs137, %rs136, %rs21;
cvt.rn.f32.s16 %f157, %rs137;
sub.ftz.f32 %f158, %f157, %f5;
mul.ftz.f32 %f159, %f45, %f158;
// begin inline asm
{ cvt.f32.f16 %f124, %rs22;}
// end inline asm
fma.rn.ftz.f32 %f160, %f159, %f124, %f154;
// begin inline asm
{ cvt.f32.f16 %f125, %rs23;}
// end inline asm
fma.rn.ftz.f32 %f161, %f159, %f125, %f155;
// begin inline asm
{ cvt.f32.f16 %f126, %rs24;}
// end inline asm
fma.rn.ftz.f32 %f162, %f159, %f126, %f156;
shl.b16 %rs138, %rs25, 4;
cvt.s16.s8 %rs139, %rs138;
shr.s16 %rs140, %rs139, 7;
and.b16 %rs141, %rs140, -16;
or.b16 %rs142, %rs141, %rs26;
cvt.rn.f32.s16 %f163, %rs142;
sub.ftz.f32 %f164, %f163, %f5;
mul.ftz.f32 %f165, %f45, %f164;
// begin inline asm
{ cvt.f32.f16 %f127, %rs27;}
// end inline asm
fma.rn.ftz.f32 %f166, %f165, %f127, %f160;
// begin inline asm
{ cvt.f32.f16 %f128, %rs28;}
// end inline asm
fma.rn.ftz.f32 %f167, %f165, %f128, %f161;
// begin inline asm
{ cvt.f32.f16 %f129, %rs29;}
// end inline asm
fma.rn.ftz.f32 %f168, %f165, %f129, %f162;
shl.b16 %rs143, %rs30, 4;
cvt.s16.s8 %rs144, %rs143;
shr.s16 %rs145, %rs144, 7;
and.b16 %rs146, %rs145, -16;
or.b16 %rs147, %rs146, %rs31;
cvt.rn.f32.s16 %f169, %rs147;
sub.ftz.f32 %f170, %f169, %f5;
mul.ftz.f32 %f171, %f45, %f170;
// begin inline asm
{ cvt.f32.f16 %f130, %rs32;}
// end inline asm
fma.rn.ftz.f32 %f172, %f171, %f130, %f166;
// begin inline asm
{ cvt.f32.f16 %f131, %rs33;}
// end inline asm
fma.rn.ftz.f32 %f173, %f171, %f131, %f167;
// begin inline asm
{ cvt.f32.f16 %f132, %rs34;}
// end inline asm
fma.rn.ftz.f32 %f174, %f171, %f132, %f168;
shl.b16 %rs148, %rs35, 4;
cvt.s16.s8 %rs149, %rs148;
shr.s16 %rs150, %rs149, 7;
and.b16 %rs151, %rs150, -16;
or.b16 %rs152, %rs151, %rs36;
cvt.rn.f32.s16 %f175, %rs152;
sub.ftz.f32 %f176, %f175, %f5;
mul.ftz.f32 %f177, %f45, %f176;
// begin inline asm
{ cvt.f32.f16 %f133, %rs37;}
// end inline asm
fma.rn.ftz.f32 %f178, %f177, %f133, %f172;
// begin inline asm
{ cvt.f32.f16 %f134, %rs38;}
// end inline asm
fma.rn.ftz.f32 %f179, %f177, %f134, %f173;
// begin inline asm
{ cvt.f32.f16 %f135, %rs39;}
// end inline asm
fma.rn.ftz.f32 %f180, %f177, %f135, %f174;
shl.b16 %rs153, %rs40, 4;
cvt.s16.s8 %rs154, %rs153;
shr.s16 %rs155, %rs154, 7;
and.b16 %rs156, %rs155, -16;
or.b16 %rs157, %rs156, %rs40;
cvt.rn.f32.s16 %f181, %rs157;
sub.ftz.f32 %f182, %f181, %f5;
mul.ftz.f32 %f183, %f45, %f182;
// begin inline asm
{ cvt.f32.f16 %f136, %rs41;}
// end inline asm
fma.rn.ftz.f32 %f253, %f183, %f136, %f178;
// begin inline asm
{ cvt.f32.f16 %f137, %rs42;}
// end inline asm
fma.rn.ftz.f32 %f254, %f183, %f137, %f179;
// begin inline asm
{ cvt.f32.f16 %f138, %rs43;}
// end inline asm
fma.rn.ftz.f32 %f255, %f183, %f138, %f180;
$L__BB0_8:
add.s32 %r144, %r144, 4;
shl.b32 %r74, %r144, 5;
add.s32 %r143, %r74, %r3;
setp.lt.u32 %p7, %r143, %r27;
@%p7 bra $L__BB0_2;
$L__BB0_9:
mov.u32 %r139, %tid.y;
shl.b32 %r138, %r139, 5;
add.s32 %r137, %r138, %r3;
shl.b32 %r75, %r137, 2;
mov.u32 %r76, _ZZ9gemv_int4ILi4ELi32ELi3EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage;
add.s32 %r77, %r76, %r75;
setp.lt.u32 %p8, %r137, 32;
@%p8 bra $L__BB0_11;
add.s32 %r131, %r77, -112;
st.shared.f32 [%r131], %f253;
$L__BB0_11:
mov.u32 %r142, %tid.y;
shl.b32 %r141, %r142, 5;
add.s32 %r140, %r141, %r3;
setp.gt.u32 %p9, %r140, 31;
bar.sync 0;
mad.lo.s32 %r19, %r140, 12, %r76;
@%p9 bra $L__BB0_13;
mov.u32 %r92, 16;
ld.shared.f32 %f199, [%r19+16];
add.ftz.f32 %f200, %f253, %f199;
ld.shared.f32 %f201, [%r19+20];
add.ftz.f32 %f202, %f200, %f201;
ld.shared.f32 %f203, [%r19+24];
add.ftz.f32 %f186, %f202, %f203;
mov.u32 %r80, 1;
mov.u32 %r93, 31;
mov.u32 %r94, -1;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f186, %r80, %r93, %r94; @p add.f32 r0, r0, %f186; mov.f32 %f184, r0;}
// end inline asm
mov.u32 %r83, 2;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f184, %r83, %r93, %r94; @p add.f32 r0, r0, %f184; mov.f32 %f187, r0;}
// end inline asm
mov.u32 %r86, 4;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f187, %r86, %r93, %r94; @p add.f32 r0, r0, %f187; mov.f32 %f190, r0;}
// end inline asm
mov.u32 %r89, 8;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f190, %r89, %r93, %r94; @p add.f32 r0, r0, %f190; mov.f32 %f193, r0;}
// end inline asm
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f193, %r92, %r93, %r94; @p add.f32 r0, r0, %f193; mov.f32 %f253, r0;}
// end inline asm
$L__BB0_13:
@%p8 bra $L__BB0_15;
add.s32 %r132, %r77, -112;
st.shared.f32 [%r132+640], %f254;
$L__BB0_15:
bar.sync 0;
@%p9 bra $L__BB0_17;
ld.shared.f32 %f219, [%r19+656];
add.ftz.f32 %f220, %f254, %f219;
ld.shared.f32 %f221, [%r19+660];
add.ftz.f32 %f222, %f220, %f221;
ld.shared.f32 %f223, [%r19+664];
add.ftz.f32 %f206, %f222, %f223;
mov.u32 %r96, 1;
mov.u32 %r109, 31;
mov.u32 %r110, -1;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f206, %r96, %r109, %r110; @p add.f32 r0, r0, %f206; mov.f32 %f204, r0;}
// end inline asm
mov.u32 %r99, 2;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f204, %r99, %r109, %r110; @p add.f32 r0, r0, %f204; mov.f32 %f207, r0;}
// end inline asm
mov.u32 %r102, 4;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f207, %r102, %r109, %r110; @p add.f32 r0, r0, %f207; mov.f32 %f210, r0;}
// end inline asm
mov.u32 %r105, 8;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f210, %r105, %r109, %r110; @p add.f32 r0, r0, %f210; mov.f32 %f213, r0;}
// end inline asm
mov.u32 %r108, 16;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f213, %r108, %r109, %r110; @p add.f32 r0, r0, %f213; mov.f32 %f254, r0;}
// end inline asm
$L__BB0_17:
@%p8 bra $L__BB0_19;
add.s32 %r133, %r77, -112;
st.shared.f32 [%r133+1280], %f255;
$L__BB0_19:
bar.sync 0;
@%p9 bra $L__BB0_21;
ld.shared.f32 %f239, [%r19+1296];
add.ftz.f32 %f240, %f255, %f239;
ld.shared.f32 %f241, [%r19+1300];
add.ftz.f32 %f242, %f240, %f241;
ld.shared.f32 %f243, [%r19+1304];
add.ftz.f32 %f226, %f242, %f243;
mov.u32 %r112, 1;
mov.u32 %r125, 31;
mov.u32 %r126, -1;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f226, %r112, %r125, %r126; @p add.f32 r0, r0, %f226; mov.f32 %f224, r0;}
// end inline asm
mov.u32 %r115, 2;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f224, %r115, %r125, %r126; @p add.f32 r0, r0, %f224; mov.f32 %f227, r0;}
// end inline asm
mov.u32 %r118, 4;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f227, %r118, %r125, %r126; @p add.f32 r0, r0, %f227; mov.f32 %f230, r0;}
// end inline asm
mov.u32 %r121, 8;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f230, %r121, %r125, %r126; @p add.f32 r0, r0, %f230; mov.f32 %f233, r0;}
// end inline asm
mov.u32 %r124, 16;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f233, %r124, %r125, %r126; @p add.f32 r0, r0, %f233; mov.f32 %f255, r0;}
// end inline asm
$L__BB0_21:
mov.u32 %r134, %tid.y;
or.b32 %r127, %r3, %r134;
setp.ne.s32 %p14, %r127, 0;
@%p14 bra $L__BB0_29;
ld.param.u64 %rd54, [_Z27dequant_gemv_group32_batch323DequantGemvKernelParams_param_0+8];
mov.u32 %r135, %ctaid.x;
setp.eq.s64 %p15, %rd54, 0;
mul.ftz.f32 %f262, %f40, %f253;
cvt.s64.s32 %rd7, %r135;
@%p15 bra $L__BB0_24;
ld.param.u64 %rd58, [_Z27dequant_gemv_group32_batch323DequantGemvKernelParams_param_0+8];
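// Note (added comment): the pointer at parameter offset +8 (apparently an optional
// fp16 addend) is non-null here, so the value at index %ctaid.x is loaded and folded
// into the first result as %f262 += %f41 * addend before the store below.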
cvta.to.global.u64 %rd57, %rd58;
shl.b64 %rd36, %rd7, 1;
add.s64 %rd37, %rd57, %rd36;
ld.global.u16 %rs158, [%rd37];
// begin inline asm
{ cvt.f32.f16 %f244, %rs158;}
// end inline asm
fma.rn.ftz.f32 %f262, %f41, %f244, %f262;
$L__BB0_24:
ld.param.u64 %rd55, [_Z27dequant_gemv_group32_batch323DequantGemvKernelParams_param_0];
mov.u32 %r136, %ctaid.x;
// begin inline asm
{ cvt.rn.f16.f32 %rs159, %f262;}
// end inline asm
// begin inline asm
createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd38, 1.0;
// end inline asm
shl.b64 %rd41, %rd7, 1;
add.s64 %rd39, %rd55, %rd41;
// begin inline asm
st.global.L1::evict_normal.L2::cache_hint.u16 [%rd39], %rs159, %rd38;
// end inline asm
mul.ftz.f32 %f263, %f40, %f254;
add.s32 %r128, %r26, %r136;
cvt.s64.s32 %rd10, %r128;
@%p15 bra $L__BB0_26;
ld.param.u64 %rd60, [_Z27dequant_gemv_group32_batch323DequantGemvKernelParams_param_0+8];
cvta.to.global.u64 %rd59, %rd60;
shl.b64 %rd42, %rd10, 1;
add.s64 %rd43, %rd59, %rd42;
ld.global.u16 %rs161, [%rd43];
// begin inline asm
{ cvt.f32.f16 %f246, %rs161;}
// end inline asm
fma.rn.ftz.f32 %f263, %f41, %f246, %f263;
$L__BB0_26:
mul.wide.s32 %rd47, %r26, 2;
add.s64 %rd45, %rd39, %rd47;
// begin inline asm
{ cvt.rn.f16.f32 %rs162, %f263;}
// end inline asm
// begin inline asm
createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd44, 1.0;
// end inline asm
// begin inline asm
st.global.L1::evict_normal.L2::cache_hint.u16 [%rd45], %rs162, %rd44;
// end inline asm
mul.ftz.f32 %f264, %f40, %f255;
cvt.u32.u64 %r129, %rd10;
add.s32 %r130, %r129, %r26;
cvt.s64.s32 %rd11, %r130;
@%p15 bra $L__BB0_28;
ld.param.u64 %rd62, [_Z27dequant_gemv_group32_batch323DequantGemvKernelParams_param_0+8];
cvta.to.global.u64 %rd61, %rd62;
shl.b64 %rd48, %rd11, 1;
add.s64 %rd49, %rd61, %rd48;
ld.global.u16 %rs164, [%rd49];
// begin inline asm
{ cvt.f32.f16 %f248, %rs164;}
// end inline asm
fma.rn.ftz.f32 %f264, %f41, %f248, %f264;
$L__BB0_28:
ld.param.u64 %rd56, [_Z27dequant_gemv_group32_batch323DequantGemvKernelParams_param_0];
// begin inline asm
{ cvt.rn.f16.f32 %rs165, %f264;}
// end inline asm
// begin inline asm
createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd50, 1.0;
// end inline asm
shl.b64 %rd53, %rd11, 1;
add.s64 %rd51, %rd56, %rd53;
// begin inline asm
st.global.L1::evict_normal.L2::cache_hint.u16 [%rd51], %rs165, %rd50;
// end inline asm
$L__BB0_29:
ret;
}

// .globl _ZN3cub11EmptyKernelIvEEvv
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
ret;
}
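The PTX above is easier to follow against a source-level sketch. The CUDA below is a rough reconstruction inferred only from how the kernel uses its 80-byte parameter block: every identifier (DequantGemvParamsSketch, its field names, dequant_gemv_group32_batch32_sketch) is invented for illustration, the launch shape is assumed to be blockDim = (32, 4), and the block-wide reduction is written as a plain shared-memory tree rather than the cub-style temp_storage plus shfl.sync.down sequence the compiler actually emitted. It is a sketch of the technique, not the original source.

#include <cuda_fp16.h>
#include <stdint.h>

// Hypothetical 80-byte parameter block; field meanings are inferred from the
// ld.param offsets used in the PTX (+0, +8, +16, ..., +72).
struct DequantGemvParamsSketch {
    __half*         out;             // +0  : fp16 results
    const __half*   addend;          // +8  : optional fp16 addend / bias (may be null)
    const __half*   x;               // +16 : three fp16 vectors of length num_elems each
    const uint32_t* qweight;         // +24 : packed 4-bit values, 8 per uint32
    const uint8_t*  qzeros;          // +32 : packed 4-bit zero points (may be null)
    const __half*   scales;          // +40 : one fp16 scale per group of 32 values
    int             num_elems;       // +48 : length of each fp16 vector
    int             out_stride;      // +52 : spacing between the three outputs
    int             packed_per_row;  // +56 : uint32 words processed per block
    int             groups_per_row;  // +60 : scale/zero entries per block
    float           alpha;           // +64 : applied to the dot product
    float           beta;            // +68 : applied to the addend
    bool            unsigned_quant;  // +72 : nibble decoding mode flag
    uint8_t         reserved[3];     // +73 : unused on this code path
};

// Assumes a (32, 4) thread block, matching the tid.x/tid.y arithmetic in the PTX.
__global__ void dequant_gemv_group32_batch32_sketch(DequantGemvParamsSketch p) {
    const int row = blockIdx.x;
    const int tid = threadIdx.y * 32 + threadIdx.x;
    float acc[3] = {0.f, 0.f, 0.f};

    for (int idx = tid; idx < p.packed_per_row; idx += 128) {
        uint32_t packed = p.qweight[row * p.packed_per_row + idx];
        int g = idx / 4;                                      // 4 words x 8 nibbles = one group of 32
        int gidx = row * p.groups_per_row + g;
        float scale = __half2float(p.scales[gidx]);

        int zp = p.unsigned_quant ? 8 : 0;                    // default zero point
        if (p.qzeros) zp = (p.qzeros[gidx >> 1] >> (4 * (g & 1))) & 0xF;
        if (!p.unsigned_quant) zp = (zp ^ 8) - 8;             // sign-extend 4-bit value

        int base = idx * 8;
        if (base >= p.num_elems) continue;
        for (int j = 0; j < 8; ++j) {                         // eight nibbles per uint32
            int w = (packed >> (4 * j)) & 0xF;
            if (!p.unsigned_quant) w = (w ^ 8) - 8;           // signed int4 path
            float dq = scale * (float)(w - zp);
            for (int c = 0; c < 3; ++c)
                acc[c] += dq * __half2float(p.x[c * p.num_elems + base + j]);
        }
    }

    // The real kernel reduces through a cub-style 1920-byte temp_storage plus
    // shfl.sync.down; a plain shared-memory tree gives the same block-wide sums.
    __shared__ float red[3][128];
    for (int c = 0; c < 3; ++c) red[c][tid] = acc[c];
    __syncthreads();
    for (int s = 64; s > 0; s >>= 1) {
        if (tid < s)
            for (int c = 0; c < 3; ++c) red[c][tid] += red[c][tid + s];
        __syncthreads();
    }

    if (tid == 0) {
        for (int c = 0; c < 3; ++c) {
            int oidx = row + c * p.out_stride;
            float y = p.alpha * red[c][0];
            if (p.addend) y += p.beta * __half2float(p.addend[oidx]);
            p.out[oidx] = __float2half(y);
        }
    }
}

Under this reading, the "group32" in the kernel name appears to refer to the quantization group size: 32 consecutive 4-bit values (four adjacent uint32 words) share one fp16 scale and one packed 4-bit zero point.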