// _ZZ9gemv_int4ILi4ELi32ELi3EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage has been demoted
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_02b20ef46thrust6system6detail10sequential3seqE[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_02b20ef46thrust12placeholders2_1E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_02b20ef46thrust12placeholders2_2E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_02b20ef46thrust12placeholders2_3E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_02b20ef46thrust12placeholders2_4E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_02b20ef46thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_02b20ef46thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_02b20ef46thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_02b20ef46thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_02b20ef46thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_02b20ef46thrust12placeholders3_10E[1];

.visible .entry _Z27dequant_gemv_group32_batch323DequantGemvKernelParams(
    .param .align 8 .b8 _Z27dequant_gemv_group32_batch323DequantGemvKernelParams_param_0[80]
)
{
    .reg .pred %p<16>;
    .reg .b16 %rs<166>;
    .reg .f32 %f<257>;
    .reg .b32 %r<140>;
    .reg .b64 %rd<45>;
    // demoted variable
    .shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi32ELi3EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[1920];

    ld.param.v2.u32 {%r24, %r25}, [_Z27dequant_gemv_group32_batch323DequantGemvKernelParams_param_0+48];
    ld.param.v2.u32 {%r26, %r27}, [_Z27dequant_gemv_group32_batch323DequantGemvKernelParams_param_0+56];
    ld.param.v2.f32 {%f32, %f33}, [_Z27dequant_gemv_group32_batch323DequantGemvKernelParams_param_0+64];
    ld.param.v4.u8 {%rs52, %rs53, %rs54, %rs55}, [_Z27dequant_gemv_group32_batch323DequantGemvKernelParams_param_0+72];
    ld.param.u64 %rd18, [_Z27dequant_gemv_group32_batch323DequantGemvKernelParams_param_0+40];
    ld.param.u64 %rd17, [_Z27dequant_gemv_group32_batch323DequantGemvKernelParams_param_0+32];
    ld.param.u64 %rd16, [_Z27dequant_gemv_group32_batch323DequantGemvKernelParams_param_0+24];
    ld.param.u64 %rd15, [_Z27dequant_gemv_group32_batch323DequantGemvKernelParams_param_0+16];
    mov.u32 %r1, %ctaid.x;
    mov.u32 %r139, %tid.y;
    shl.b32 %r28, %r139, 5;
    mov.u32 %r3, %tid.x;
    add.s32 %r138, %r28, %r3;
    setp.ge.u32 %p1, %r138, %r26;
    mov.f32 %f248, 0f00000000;
    mov.f32 %f249, %f248;
    mov.f32 %f250, %f248;
    @%p1 bra $L__BB0_9;

    cvta.to.global.u64 %rd1, %rd15;
    mul.lo.s32 %r7, %r26, %r1;
    shr.u32 %r8, %r3, 2;
    mul.lo.s32 %r9, %r27, %r1;
    shl.b16 %rs2, %rs52, 3;
    cvta.to.global.u64 %rd3, %rd17;
    cvta.to.global.u64 %rd4, %rd16;
    cvta.to.global.u64 %rd5, %rd18;

$L__BB0_2:
    add.s32 %r29, %r138, %r7;
    mul.wide.u32 %rd19, %r29, 4;
    add.s64 %rd20, %rd4, %rd19;
    ld.global.u32 %r12, [%rd20];
    shl.b32 %r30, %r139, 3;
    add.s32 %r13, %r30, %r8;
    add.s32 %r14, %r13, %r9;
    mul.wide.s32 %rd21, %r14, 2;
    add.s64 %rd22, %rd5, %rd21;
    ld.global.u16 %rs60, [%rd22];
    // begin inline asm
    { cvt.f32.f16 %f37, %rs60;}
    // end inline asm
    setp.eq.s64 %p2, %rd17, 0;
    mov.u16 %rs165, %rs2;
    @%p2 bra $L__BB0_4;

    shr.u32 %r31, %r14, 31;
    add.s32 %r32, %r14, %r31;
    shr.s32 %r33, %r32, 1;
    cvt.s64.s32 %rd23, %r33;
    add.s64 %rd24, %rd3, %rd23;
    ld.global.u8 %r34, [%rd24];
    shl.b32 %r35, %r13, 2;
    and.b32 %r36, %r35, 4;
    shr.u32 %r37, %r34, %r36;
    cvt.u16.u32 %rs61, %r37;
    and.b16 %rs165, %rs61, 15;

$L__BB0_4:
    shl.b32 %r15, %r138, 3;
    setp.ge.s32 %p3, %r15, %r24;
    @%p3 bra $L__BB0_8;
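    // Fall-through of the bounds check above: the block below loads eight fp16 values
    // from each of three rows of the operand at %rd1 (row stride %r24 elements),
    // unpacks the eight 4-bit fields of the packed word %r12, and accumulates along
    // either the unsigned path that follows or the sign-extending path at $L__BB0_7,
    // selected by the byte flag %rs52.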
    setp.eq.s16 %p4, %rs52, 0;
    mul.wide.s32 %rd25, %r15, 2;
    add.s64 %rd26, %rd1, %rd25;
    ld.global.v4.u32 {%r38, %r39, %r40, %r41}, [%rd26];
    mul.wide.s32 %rd27, %r24, 2;
    add.s64 %rd28, %rd26, %rd27;
    ld.global.v4.u32 {%r46, %r47, %r48, %r49}, [%rd28];
    add.s32 %r54, %r24, %r15;
    add.s32 %r55, %r54, %r24;
    mul.wide.s32 %rd29, %r55, 2;
    add.s64 %rd30, %rd1, %rd29;
    ld.global.v4.u32 {%r56, %r57, %r58, %r59}, [%rd30];
    shr.u16 %rs63, %rs165, 3;
    and.b16 %rs64, %rs63, 1;
    setp.eq.b16 %p5, %rs64, 1;
    and.pred %p6, %p4, %p5;
    selp.b16 %rs65, -16, 0, %p6;
    or.b16 %rs66, %rs65, %rs165;
    cvt.s16.s8 %rs67, %rs66;
    cvt.rn.f32.s16 %f5, %rs67;
    cvt.u16.u32 %rs5, %r12;
    and.b16 %rs6, %rs5, 15;
    mov.b32 {%rs7, %rs12}, %r38;
    mov.b32 {%rs8, %rs13}, %r46;
    mov.b32 {%rs9, %rs14}, %r56;
    shr.u32 %r64, %r12, 4;
    cvt.u16.u32 %rs10, %r64;
    and.b16 %rs11, %rs10, 15;
    shr.u32 %r65, %r12, 8;
    cvt.u16.u32 %rs15, %r65;
    and.b16 %rs16, %rs15, 15;
    mov.b32 {%rs17, %rs22}, %r39;
    mov.b32 {%rs18, %rs23}, %r47;
    mov.b32 {%rs19, %rs24}, %r57;
    shr.u32 %r66, %r12, 12;
    cvt.u16.u32 %rs20, %r66;
    and.b16 %rs21, %rs20, 15;
    shr.u32 %r67, %r12, 16;
    cvt.u16.u32 %rs25, %r67;
    and.b16 %rs26, %rs25, 15;
    mov.b32 {%rs27, %rs32}, %r40;
    mov.b32 {%rs28, %rs33}, %r48;
    mov.b32 {%rs29, %rs34}, %r58;
    shr.u32 %r68, %r12, 20;
    cvt.u16.u32 %rs30, %r68;
    and.b16 %rs31, %rs30, 15;
    shr.u32 %r69, %r12, 24;
    cvt.u16.u32 %rs35, %r69;
    and.b16 %rs36, %rs35, 15;
    mov.b32 {%rs37, %rs41}, %r41;
    mov.b32 {%rs38, %rs42}, %r49;
    mov.b32 {%rs39, %rs43}, %r59;
    shr.u32 %r70, %r12, 28;
    cvt.u16.u32 %rs40, %r70;
    @%p4 bra $L__BB0_7;

    cvt.rn.f32.s16 %f62, %rs6;
    sub.ftz.f32 %f63, %f62, %f5;
    mul.ftz.f32 %f64, %f37, %f63;
    // begin inline asm
    { cvt.f32.f16 %f38, %rs7;}
    // end inline asm
    fma.rn.ftz.f32 %f65, %f64, %f38, %f248;
    // begin inline asm
    { cvt.f32.f16 %f39, %rs8;}
    // end inline asm
    fma.rn.ftz.f32 %f66, %f64, %f39, %f249;
    // begin inline asm
    { cvt.f32.f16 %f40, %rs9;}
    // end inline asm
    fma.rn.ftz.f32 %f67, %f64, %f40, %f250;
    cvt.rn.f32.s16 %f68, %rs11;
    sub.ftz.f32 %f69, %f68, %f5;
    mul.ftz.f32 %f70, %f37, %f69;
    // begin inline asm
    { cvt.f32.f16 %f41, %rs12;}
    // end inline asm
    fma.rn.ftz.f32 %f71, %f70, %f41, %f65;
    // begin inline asm
    { cvt.f32.f16 %f42, %rs13;}
    // end inline asm
    fma.rn.ftz.f32 %f72, %f70, %f42, %f66;
    // begin inline asm
    { cvt.f32.f16 %f43, %rs14;}
    // end inline asm
    fma.rn.ftz.f32 %f73, %f70, %f43, %f67;
    cvt.rn.f32.s16 %f74, %rs16;
    sub.ftz.f32 %f75, %f74, %f5;
    mul.ftz.f32 %f76, %f37, %f75;
    // begin inline asm
    { cvt.f32.f16 %f44, %rs17;}
    // end inline asm
    fma.rn.ftz.f32 %f77, %f76, %f44, %f71;
    // begin inline asm
    { cvt.f32.f16 %f45, %rs18;}
    // end inline asm
    fma.rn.ftz.f32 %f78, %f76, %f45, %f72;
    // begin inline asm
    { cvt.f32.f16 %f46, %rs19;}
    // end inline asm
    fma.rn.ftz.f32 %f79, %f76, %f46, %f73;
    cvt.rn.f32.s16 %f80, %rs21;
    sub.ftz.f32 %f81, %f80, %f5;
    mul.ftz.f32 %f82, %f37, %f81;
    // begin inline asm
    { cvt.f32.f16 %f47, %rs22;}
    // end inline asm
    fma.rn.ftz.f32 %f83, %f82, %f47, %f77;
    // begin inline asm
    { cvt.f32.f16 %f48, %rs23;}
    // end inline asm
    fma.rn.ftz.f32 %f84, %f82, %f48, %f78;
    // begin inline asm
    { cvt.f32.f16 %f49, %rs24;}
    // end inline asm
    fma.rn.ftz.f32 %f85, %f82, %f49, %f79;
    cvt.rn.f32.s16 %f86, %rs26;
    sub.ftz.f32 %f87, %f86, %f5;
    mul.ftz.f32 %f88, %f37, %f87;
    // begin inline asm
    { cvt.f32.f16 %f50, %rs27;}
    // end inline asm
    fma.rn.ftz.f32 %f89, %f88, %f50, %f83;
    // begin inline asm
    { cvt.f32.f16 %f51, %rs28;}
    // end inline asm
    fma.rn.ftz.f32 %f90, %f88, %f51, %f84;
    // begin inline asm
    { cvt.f32.f16 %f52, %rs29;}
    // end inline asm
    fma.rn.ftz.f32 %f91, %f88, %f52, %f85;
    cvt.rn.f32.s16 %f92, %rs31;
    sub.ftz.f32 %f93, %f92, %f5;
    mul.ftz.f32 %f94, %f37, %f93;
    // begin inline asm
    { cvt.f32.f16 %f53, %rs32;}
    // end inline asm
    fma.rn.ftz.f32 %f95, %f94, %f53, %f89;
    // begin inline asm
    { cvt.f32.f16 %f54, %rs33;}
    // end inline asm
    fma.rn.ftz.f32 %f96, %f94, %f54, %f90;
    // begin inline asm
    { cvt.f32.f16 %f55, %rs34;}
    // end inline asm
    fma.rn.ftz.f32 %f97, %f94, %f55, %f91;
    cvt.rn.f32.s16 %f98, %rs36;
    sub.ftz.f32 %f99, %f98, %f5;
    mul.ftz.f32 %f100, %f37, %f99;
    // begin inline asm
    { cvt.f32.f16 %f56, %rs37;}
    // end inline asm
    fma.rn.ftz.f32 %f101, %f100, %f56, %f95;
    // begin inline asm
    { cvt.f32.f16 %f57, %rs38;}
    // end inline asm
    fma.rn.ftz.f32 %f102, %f100, %f57, %f96;
    // begin inline asm
    { cvt.f32.f16 %f58, %rs39;}
    // end inline asm
    fma.rn.ftz.f32 %f103, %f100, %f58, %f97;
    cvt.rn.f32.s16 %f104, %rs40;
    sub.ftz.f32 %f105, %f104, %f5;
    mul.ftz.f32 %f106, %f37, %f105;
    // begin inline asm
    { cvt.f32.f16 %f59, %rs41;}
    // end inline asm
    fma.rn.ftz.f32 %f248, %f106, %f59, %f101;
    // begin inline asm
    { cvt.f32.f16 %f60, %rs42;}
    // end inline asm
    fma.rn.ftz.f32 %f249, %f106, %f60, %f102;
    // begin inline asm
    { cvt.f32.f16 %f61, %rs43;}
    // end inline asm
    fma.rn.ftz.f32 %f250, %f106, %f61, %f103;
    bra.uni $L__BB0_8;

$L__BB0_7:
    shl.b16 %rs116, %rs5, 4;
    cvt.s16.s8 %rs117, %rs116;
    shr.s16 %rs118, %rs117, 7;
    and.b16 %rs119, %rs118, -16;
    or.b16 %rs120, %rs119, %rs6;
    cvt.rn.f32.s16 %f131, %rs120;
    sub.ftz.f32 %f132, %f131, %f5;
    mul.ftz.f32 %f133, %f37, %f132;
    // begin inline asm
    { cvt.f32.f16 %f107, %rs7;}
    // end inline asm
    fma.rn.ftz.f32 %f134, %f133, %f107, %f248;
    // begin inline asm
    { cvt.f32.f16 %f108, %rs8;}
    // end inline asm
    fma.rn.ftz.f32 %f135, %f133, %f108, %f249;
    // begin inline asm
    { cvt.f32.f16 %f109, %rs9;}
    // end inline asm
    fma.rn.ftz.f32 %f136, %f133, %f109, %f250;
    shl.b16 %rs121, %rs10, 4;
    cvt.s16.s8 %rs122, %rs121;
    shr.s16 %rs123, %rs122, 7;
    and.b16 %rs124, %rs123, -16;
    or.b16 %rs125, %rs124, %rs11;
    cvt.rn.f32.s16 %f137, %rs125;
    sub.ftz.f32 %f138, %f137, %f5;
    mul.ftz.f32 %f139, %f37, %f138;
    // begin inline asm
    { cvt.f32.f16 %f110, %rs12;}
    // end inline asm
    fma.rn.ftz.f32 %f140, %f139, %f110, %f134;
    // begin inline asm
    { cvt.f32.f16 %f111, %rs13;}
    // end inline asm
    fma.rn.ftz.f32 %f141, %f139, %f111, %f135;
    // begin inline asm
    { cvt.f32.f16 %f112, %rs14;}
    // end inline asm
    fma.rn.ftz.f32 %f142, %f139, %f112, %f136;
    shl.b16 %rs126, %rs15, 4;
    cvt.s16.s8 %rs127, %rs126;
    shr.s16 %rs128, %rs127, 7;
    and.b16 %rs129, %rs128, -16;
    or.b16 %rs130, %rs129, %rs16;
    cvt.rn.f32.s16 %f143, %rs130;
    sub.ftz.f32 %f144, %f143, %f5;
    mul.ftz.f32 %f145, %f37, %f144;
    // begin inline asm
    { cvt.f32.f16 %f113, %rs17;}
    // end inline asm
    fma.rn.ftz.f32 %f146, %f145, %f113, %f140;
    // begin inline asm
    { cvt.f32.f16 %f114, %rs18;}
    // end inline asm
    fma.rn.ftz.f32 %f147, %f145, %f114, %f141;
    // begin inline asm
    { cvt.f32.f16 %f115, %rs19;}
    // end inline asm
    fma.rn.ftz.f32 %f148, %f145, %f115, %f142;
    shl.b16 %rs131, %rs20, 4;
    cvt.s16.s8 %rs132, %rs131;
    shr.s16 %rs133, %rs132, 7;
    and.b16 %rs134, %rs133, -16;
    or.b16 %rs135, %rs134, %rs21;
    cvt.rn.f32.s16 %f149, %rs135;
    sub.ftz.f32 %f150, %f149, %f5;
    mul.ftz.f32 %f151, %f37, %f150;
    // begin inline asm
    { cvt.f32.f16 %f116, %rs22;}
    // end inline asm
    fma.rn.ftz.f32 %f152, %f151, %f116, %f146;
    // begin inline asm
    { cvt.f32.f16 %f117, %rs23;}
    // end inline asm
    fma.rn.ftz.f32 %f153, %f151, %f117, %f147;
    // begin inline asm
    { cvt.f32.f16 %f118, %rs24;}
    // end inline asm
    fma.rn.ftz.f32 %f154, %f151, %f118, %f148;
    shl.b16 %rs136, %rs25, 4;
    cvt.s16.s8 %rs137, %rs136;
    shr.s16 %rs138, %rs137, 7;
    and.b16 %rs139, %rs138, -16;
    or.b16 %rs140, %rs139, %rs26;
    cvt.rn.f32.s16 %f155, %rs140;
    sub.ftz.f32 %f156, %f155, %f5;
    mul.ftz.f32 %f157, %f37, %f156;
    // begin inline asm
    { cvt.f32.f16 %f119, %rs27;}
    // end inline asm
    fma.rn.ftz.f32 %f158, %f157, %f119, %f152;
    // begin inline asm
    { cvt.f32.f16 %f120, %rs28;}
    // end inline asm
    fma.rn.ftz.f32 %f159, %f157, %f120, %f153;
    // begin inline asm
    { cvt.f32.f16 %f121, %rs29;}
    // end inline asm
    fma.rn.ftz.f32 %f160, %f157, %f121, %f154;
    shl.b16 %rs141, %rs30, 4;
    cvt.s16.s8 %rs142, %rs141;
    shr.s16 %rs143, %rs142, 7;
    and.b16 %rs144, %rs143, -16;
    or.b16 %rs145, %rs144, %rs31;
    cvt.rn.f32.s16 %f161, %rs145;
    sub.ftz.f32 %f162, %f161, %f5;
    mul.ftz.f32 %f163, %f37, %f162;
    // begin inline asm
    { cvt.f32.f16 %f122, %rs32;}
    // end inline asm
    fma.rn.ftz.f32 %f164, %f163, %f122, %f158;
    // begin inline asm
    { cvt.f32.f16 %f123, %rs33;}
    // end inline asm
    fma.rn.ftz.f32 %f165, %f163, %f123, %f159;
    // begin inline asm
    { cvt.f32.f16 %f124, %rs34;}
    // end inline asm
    fma.rn.ftz.f32 %f166, %f163, %f124, %f160;
    shl.b16 %rs146, %rs35, 4;
    cvt.s16.s8 %rs147, %rs146;
    shr.s16 %rs148, %rs147, 7;
    and.b16 %rs149, %rs148, -16;
    or.b16 %rs150, %rs149, %rs36;
    cvt.rn.f32.s16 %f167, %rs150;
    sub.ftz.f32 %f168, %f167, %f5;
    mul.ftz.f32 %f169, %f37, %f168;
    // begin inline asm
    { cvt.f32.f16 %f125, %rs37;}
    // end inline asm
    fma.rn.ftz.f32 %f170, %f169, %f125, %f164;
    // begin inline asm
    { cvt.f32.f16 %f126, %rs38;}
    // end inline asm
    fma.rn.ftz.f32 %f171, %f169, %f126, %f165;
    // begin inline asm
    { cvt.f32.f16 %f127, %rs39;}
    // end inline asm
    fma.rn.ftz.f32 %f172, %f169, %f127, %f166;
    shl.b16 %rs151, %rs40, 4;
    cvt.s16.s8 %rs152, %rs151;
    shr.s16 %rs153, %rs152, 7;
    and.b16 %rs154, %rs153, -16;
    or.b16 %rs155, %rs154, %rs40;
    cvt.rn.f32.s16 %f173, %rs155;
    sub.ftz.f32 %f174, %f173, %f5;
    mul.ftz.f32 %f175, %f37, %f174;
    // begin inline asm
    { cvt.f32.f16 %f128, %rs41;}
    // end inline asm
    fma.rn.ftz.f32 %f248, %f175, %f128, %f170;
    // begin inline asm
    { cvt.f32.f16 %f129, %rs42;}
    // end inline asm
    fma.rn.ftz.f32 %f249, %f175, %f129, %f171;
    // begin inline asm
    { cvt.f32.f16 %f130, %rs43;}
    // end inline asm
    fma.rn.ftz.f32 %f250, %f175, %f130, %f172;

$L__BB0_8:
    add.s32 %r139, %r139, 4;
    shl.b32 %r71, %r139, 5;
    add.s32 %r138, %r71, %r3;
    setp.lt.u32 %p7, %r138, %r26;
    @%p7 bra $L__BB0_2;

$L__BB0_9:
    mov.u32 %r134, %tid.y;
    shl.b32 %r133, %r134, 5;
    add.s32 %r132, %r133, %r3;
    shl.b32 %r72, %r132, 2;
    mov.u32 %r73, _ZZ9gemv_int4ILi4ELi32ELi3EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage;
    add.s32 %r74, %r73, %r72;
    setp.lt.u32 %p8, %r132, 32;
    @%p8 bra $L__BB0_11;

    add.s32 %r127, %r74, -112;
    st.shared.f32 [%r127], %f248;

$L__BB0_11:
    mov.u32 %r137, %tid.y;
    shl.b32 %r136, %r137, 5;
    add.s32 %r135, %r136, %r3;
    setp.gt.u32 %p9, %r135, 31;
    bar.sync 0;
    mad.lo.s32 %r19, %r135, 12, %r73;
    @%p9 bra $L__BB0_13;

    mov.u32 %r89, 16;
    ld.shared.f32 %f191, [%r19+16];
    add.ftz.f32 %f192, %f248, %f191;
    ld.shared.f32 %f193, [%r19+20];
    add.ftz.f32 %f194, %f192, %f193;
    ld.shared.f32 %f195, [%r19+24];
    add.ftz.f32 %f178, %f194, %f195;
    mov.u32 %r77, 1;
    mov.u32 %r90, 31;
    mov.u32 %r91, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f178, %r77, %r90, %r91; @p add.f32 r0, r0, %f178; mov.f32 %f176, r0;}
    // end inline asm
    mov.u32 %r80, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f176, %r80, %r90, %r91; @p add.f32 r0, r0, %f176; mov.f32 %f179, r0;}
    // end inline asm
    mov.u32 %r83, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f179, %r83, %r90, %r91; @p add.f32 r0, r0, %f179; mov.f32 %f182, r0;}
    // end inline asm
    mov.u32 %r86, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f182, %r86, %r90, %r91; @p add.f32 r0, r0, %f182; mov.f32 %f185, r0;}
    // end inline asm
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f185, %r89, %r90, %r91; @p add.f32 r0, r0, %f185; mov.f32 %f248, r0;}
    // end inline asm

$L__BB0_13:
    @%p8 bra $L__BB0_15;

    add.s32 %r128, %r74, -112;
    st.shared.f32 [%r128+640], %f249;

$L__BB0_15:
    bar.sync 0;
    @%p9 bra $L__BB0_17;

    ld.shared.f32 %f211, [%r19+656];
    add.ftz.f32 %f212, %f249, %f211;
    ld.shared.f32 %f213, [%r19+660];
    add.ftz.f32 %f214, %f212, %f213;
    ld.shared.f32 %f215, [%r19+664];
    add.ftz.f32 %f198, %f214, %f215;
    mov.u32 %r93, 1;
    mov.u32 %r106, 31;
    mov.u32 %r107, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f198, %r93, %r106, %r107; @p add.f32 r0, r0, %f198; mov.f32 %f196, r0;}
    // end inline asm
    mov.u32 %r96, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f196, %r96, %r106, %r107; @p add.f32 r0, r0, %f196; mov.f32 %f199, r0;}
    // end inline asm
    mov.u32 %r99, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f199, %r99, %r106, %r107; @p add.f32 r0, r0, %f199; mov.f32 %f202, r0;}
    // end inline asm
    mov.u32 %r102, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f202, %r102, %r106, %r107; @p add.f32 r0, r0, %f202; mov.f32 %f205, r0;}
    // end inline asm
    mov.u32 %r105, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f205, %r105, %r106, %r107; @p add.f32 r0, r0, %f205; mov.f32 %f249, r0;}
    // end inline asm

$L__BB0_17:
    @%p8 bra $L__BB0_19;

    add.s32 %r129, %r74, -112;
    st.shared.f32 [%r129+1280], %f250;

$L__BB0_19:
    bar.sync 0;
    @%p9 bra $L__BB0_21;

    ld.shared.f32 %f231, [%r19+1296];
    add.ftz.f32 %f232, %f250, %f231;
    ld.shared.f32 %f233, [%r19+1300];
    add.ftz.f32 %f234, %f232, %f233;
    ld.shared.f32 %f235, [%r19+1304];
    add.ftz.f32 %f218, %f234, %f235;
    mov.u32 %r109, 1;
    mov.u32 %r122, 31;
    mov.u32 %r123, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f218, %r109, %r122, %r123; @p add.f32 r0, r0, %f218; mov.f32 %f216, r0;}
    // end inline asm
    mov.u32 %r112, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f216, %r112, %r122, %r123; @p add.f32 r0, r0, %f216; mov.f32 %f219, r0;}
    // end inline asm
    mov.u32 %r115, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f219, %r115, %r122, %r123; @p add.f32 r0, r0, %f219; mov.f32 %f222, r0;}
    // end inline asm
    mov.u32 %r118, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f222, %r118, %r122, %r123; @p add.f32 r0, r0, %f222; mov.f32 %f225, r0;}
    // end inline asm
    mov.u32 %r121, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f225, %r121, %r122, %r123; @p add.f32 r0, r0, %f225; mov.f32 %f250, r0;}
    // end inline asm

$L__BB0_21:
    mov.u32 %r130, %tid.y;
    or.b32 %r124, %r3, %r130;
    setp.ne.s32 %p14, %r124, 0;
    @%p14 bra $L__BB0_25;

    ld.param.u64 %rd43, [_Z27dequant_gemv_group32_batch323DequantGemvKernelParams_param_0+8];
    ld.param.u64 %rd42, [_Z27dequant_gemv_group32_batch323DequantGemvKernelParams_param_0];
    mov.u32 %r131, %ctaid.x;
    cvta.to.global.u64 %rd31, %rd42;
    setp.eq.s64 %p15, %rd43, 0;
    mul.ftz.f32 %f24, %f32, %f248;
    cvt.s64.s32 %rd7, %r131;
    mul.wide.s32 %rd32, %r131, 2;
    add.s64 %rd8, %rd31, %rd32;
    mul.ftz.f32 %f25, %f32, %f249;
    cvt.s64.s32 %rd9, %r25;
    mul.wide.s32 %rd33, %r25, 2;
    add.s64 %rd10, %rd8, %rd33;
    mul.ftz.f32 %f26, %f32, %f250;
    add.s32 %r125, %r25, %r131;
    add.s32 %r126, %r125, %r25;
    cvt.s64.s32 %rd11, %r126;
    mul.wide.s32 %rd34, %r126, 2;
    add.s64 %rd12, %rd31, %rd34;
    @%p15 bra $L__BB0_24;

    ld.param.u64 %rd44, [_Z27dequant_gemv_group32_batch323DequantGemvKernelParams_param_0+8];
    cvta.to.global.u64 %rd35, %rd44;
    shl.b64 %rd36, %rd7, 1;
    add.s64 %rd37, %rd35, %rd36;
    ld.global.u16 %rs156, [%rd37];
    // begin inline asm
    { cvt.f32.f16 %f236, %rs156;}
    // end inline asm
    fma.rn.ftz.f32 %f237, %f33, %f236, %f24;
    // begin inline asm
    { cvt.rn.f16.f32 %rs157, %f237;}
    // end inline asm
    st.global.u16 [%rd8], %rs157;
    shl.b64 %rd38, %rd9, 1;
    add.s64 %rd39, %rd37, %rd38;
    ld.global.u16 %rs158, [%rd39];
    // begin inline asm
    { cvt.f32.f16 %f238, %rs158;}
    // end inline asm
    fma.rn.ftz.f32 %f239, %f33, %f238, %f25;
    // begin inline asm
    { cvt.rn.f16.f32 %rs159, %f239;}
    // end inline asm
    st.global.u16 [%rd10], %rs159;
    shl.b64 %rd40, %rd11, 1;
    add.s64 %rd41, %rd35, %rd40;
    ld.global.u16 %rs160, [%rd41];
    // begin inline asm
    { cvt.f32.f16 %f240, %rs160;}
    // end inline asm
    fma.rn.ftz.f32 %f241, %f33, %f240, %f26;
    // begin inline asm
    { cvt.rn.f16.f32 %rs161, %f241;}
    // end inline asm
    st.global.u16 [%rd12], %rs161;
    bra.uni $L__BB0_25;

$L__BB0_24:
    // begin inline asm
    { cvt.rn.f16.f32 %rs162, %f24;}
    // end inline asm
    st.global.u16 [%rd8], %rs162;
    // begin inline asm
    { cvt.rn.f16.f32 %rs163, %f25;}
    // end inline asm
    st.global.u16 [%rd10], %rs163;
    // begin inline asm
    { cvt.rn.f16.f32 %rs164, %f26;}
    // end inline asm
    st.global.u16 [%rd12], %rs164;

$L__BB0_25:
    ret;
}

// .globl _ZN3cub11EmptyKernelIvEEvv
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
    ret;
}
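For readers who do not want to trace the register-level detail: the arithmetic in the main loop above is the usual group-quantized int4 dequantization, w = scale * (q - zero), accumulated against fp16 values for three rows at once and finished with a shfl.sync.down warp reduction. The CUDA C++ below is a rough sketch reconstructed from this PTX, not the original gemv.cu; the function names, the signed/unsigned handling of the zero point, and the indexing are assumptions made for illustration only.

#include <cstdint>
#include <cuda_fp16.h>

// Hypothetical helper: extract one 4-bit field from a packed word and subtract the
// (already signed or unsigned) zero point, mirroring the nibble shifts/masks and the
// optional sign extension seen in the two PTX paths.
__device__ float dequant_i4(uint32_t packed, int idx, float zero, bool is_signed) {
    int q = (packed >> (4 * idx)) & 0xF;   // one of eight 4-bit fields
    if (is_signed && q >= 8) q -= 16;      // sign-extend the nibble (the $L__BB0_7 path)
    return static_cast<float>(q) - zero;   // subtract the group zero point
}

// Warp-level sum corresponding to the shfl.sync.down ladder (offsets 1, 2, 4, 8, 16);
// only lane 0's result is meaningful here.
__device__ float warp_reduce_sum(float v) {
    for (int offset = 1; offset <= 16; offset <<= 1)
        v += __shfl_down_sync(0xFFFFFFFFu, v, offset);
    return v;
}

// Accumulation for one packed word of eight weights against three fp16 rows,
// matching the mul/fma pattern on %f248/%f249/%f250 above.
__device__ void accumulate_word(uint32_t packed, const __half* x0, const __half* x1,
                                const __half* x2, float scale, float zero, bool is_signed,
                                float& acc0, float& acc1, float& acc2) {
    for (int i = 0; i < 8; ++i) {
        float w = scale * dequant_i4(packed, i, zero, is_signed);  // scale * (q - zero)
        acc0 = fmaf(w, __half2float(x0[i]), acc0);
        acc1 = fmaf(w, __half2float(x1[i]), acc1);
        acc2 = fmaf(w, __half2float(x2[i]), acc2);
    }
}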