RNAL_36609967_7_gemv_cu_17f8459b6thrust12placeholders2_3E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_17f8459b6thrust12placeholders2_4E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_17f8459b6thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_17f8459b6thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_17f8459b6thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_17f8459b6thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_17f8459b6thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_17f8459b6thrust12placeholders3_10E[1];

.visible .entry _Z27dequant_gemv_group32_batch123DequantGemvKernelParams(
    .param .align 8 .b8 _Z27dequant_gemv_group32_batch123DequantGemvKernelParams_param_0[80]
)
{
    .reg .pred %p<15>;
    .reg .b16 %rs<190>;
    .reg .f32 %f<213>;
    .reg .b32 %r<104>;
    .reg .b64 %rd<34>;
    // demoted variable
    .shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi32ELi1EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[640];

    ld.param.v2.u32 {%r27, %r28}, [_Z27dequant_gemv_group32_batch123DequantGemvKernelParams_param_0+48];
    ld.param.v2.u32 {%r29, %r30}, [_Z27dequant_gemv_group32_batch123DequantGemvKernelParams_param_0+56];
    ld.param.v2.f32 {%f22, %f23}, [_Z27dequant_gemv_group32_batch123DequantGemvKernelParams_param_0+64];
    ld.param.v4.u8 {%rs56, %rs57, %rs58, %rs59}, [_Z27dequant_gemv_group32_batch123DequantGemvKernelParams_param_0+72];
    ld.param.u64 %rd13, [_Z27dequant_gemv_group32_batch123DequantGemvKernelParams_param_0+40];
    ld.param.u64 %rd12, [_Z27dequant_gemv_group32_batch123DequantGemvKernelParams_param_0+32];
    ld.param.u64 %rd11, [_Z27dequant_gemv_group32_batch123DequantGemvKernelParams_param_0+24];
    ld.param.u64 %rd10, [_Z27dequant_gemv_group32_batch123DequantGemvKernelParams_param_0+16];
    ld.param.u64 %rd9, [_Z27dequant_gemv_group32_batch123DequantGemvKernelParams_param_0+8];
    ld.param.u64 %rd8, [_Z27dequant_gemv_group32_batch123DequantGemvKernelParams_param_0];
    cvta.to.global.u64 %rd1, %rd10;
    cvta.to.global.u64 %rd2, %rd11;
    cvta.to.global.u64 %rd3, %rd13;
    mov.u32 %r1, %ctaid.x;
    mov.u32 %r2, %tid.y;
    shl.b32 %r31, %r2, 5;
    mov.u32 %r3, %tid.x;
    add.s32 %r4, %r31, %r3;
    setp.ge.u32 %p1, %r4, %r29;
    mov.f32 %f207, 0f00000000;
    @%p1 bra $L__BB0_14;

    mul.lo.s32 %r7, %r29, %r1;
    shr.u32 %r8, %r3, 2;
    mul.lo.s32 %r9, %r30, %r1;
    setp.eq.s64 %p2, %rd12, 0;
    @%p2 bra $L__BB0_8;

    cvta.to.global.u64 %rd5, %rd12;
    mov.f32 %f207, 0f00000000;
    mov.u32 %r100, %r4;
    mov.u32 %r101, %r2;

$L__BB0_3:
    add.s32 %r32, %r100, %r7;
    mul.wide.u32 %rd14, %r32, 4;
    add.s64 %rd15, %rd2, %rd14;
    ld.global.u32 %r12, [%rd15];
    shl.b32 %r33, %r101, 3;
    add.s32 %r34, %r33, %r8;
    add.s32 %r35, %r34, %r9;
    mul.wide.s32 %rd16, %r35, 2;
    add.s64 %rd17, %rd3, %rd16;
    ld.global.u16 %rs64, [%rd17];
    // begin inline asm
    { cvt.f32.f16 %f25, %rs64;}
    // end inline asm
    shr.u32 %r36, %r35, 31;
    add.s32 %r37, %r35, %r36;
    shr.s32 %r38, %r37, 1;
    cvt.s64.s32 %rd18, %r38;
    add.s64 %rd19, %rd5, %rd18;
    ld.global.u8 %r39, [%rd19];
    shl.b32 %r40, %r34, 2;
    and.b32 %r41, %r40, 4;
    shr.u32 %r13, %r39, %r41;
    shl.b32 %r14, %r100, 3;
    setp.ge.s32 %p3, %r14, %r27;
    @%p3 bra $L__BB0_7;
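    // [annotation added for readability; not compiler output] The block below loads eight
    // consecutive __half activations (one ld.global.v4.u32 = 8 x f16) and splits the packed
    // u32 weight word in %r12 into eight 4-bit fields. %p4 tests the trailing bool kernel
    // parameter (%rs56) against zero and selects between the unsigned fall-through FMA chain
    // and the sign-extending chain at $L__BB0_6; %rs71/%f3 is the per-group zero point
    // unpacked from the zero-point byte read above.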
    setp.eq.s16 %p4, %rs56, 0;
    mul.wide.s32 %rd20, %r14, 2;
    add.s64 %rd21, %rd1, %rd20;
    ld.global.v4.u32 {%r42, %r43, %r44, %r45}, [%rd21];
    cvt.u16.u32 %rs66, %r13;
    shr.u16 %rs67, %rs66, 3;
    and.b16 %rs68, %rs67, 1;
    setp.eq.b16 %p5, %rs68, 1;
    and.pred %p6, %p4, %p5;
    selp.b16 %rs69, -16, 0, %p6;
    and.b16 %rs70, %rs66, 15;
    or.b16 %rs71, %rs69, %rs70;
    cvt.rn.f32.s16 %f3, %rs71;
    cvt.u16.u32 %rs2, %r12;
    and.b16 %rs3, %rs2, 15;
    mov.b32 {%rs4, %rs7}, %r42;
    shr.u32 %r50, %r12, 4;
    cvt.u16.u32 %rs5, %r50;
    and.b16 %rs6, %rs5, 15;
    shr.u32 %r51, %r12, 8;
    cvt.u16.u32 %rs8, %r51;
    and.b16 %rs9, %rs8, 15;
    mov.b32 {%rs10, %rs13}, %r43;
    shr.u32 %r52, %r12, 12;
    cvt.u16.u32 %rs11, %r52;
    and.b16 %rs12, %rs11, 15;
    shr.u32 %r53, %r12, 16;
    cvt.u16.u32 %rs14, %r53;
    and.b16 %rs15, %rs14, 15;
    mov.b32 {%rs16, %rs19}, %r44;
    shr.u32 %r54, %r12, 20;
    cvt.u16.u32 %rs17, %r54;
    and.b16 %rs18, %rs17, 15;
    shr.u32 %r55, %r12, 24;
    cvt.u16.u32 %rs20, %r55;
    and.b16 %rs21, %rs20, 15;
    mov.b32 {%rs22, %rs24}, %r45;
    shr.u32 %r56, %r12, 28;
    cvt.u16.u32 %rs23, %r56;
    @%p4 bra $L__BB0_6;

    cvt.rn.f32.s16 %f34, %rs3;
    sub.ftz.f32 %f35, %f34, %f3;
    mul.ftz.f32 %f36, %f25, %f35;
    // begin inline asm
    { cvt.f32.f16 %f26, %rs4;}
    // end inline asm
    fma.rn.ftz.f32 %f37, %f36, %f26, %f207;
    cvt.rn.f32.s16 %f38, %rs6;
    sub.ftz.f32 %f39, %f38, %f3;
    mul.ftz.f32 %f40, %f25, %f39;
    // begin inline asm
    { cvt.f32.f16 %f27, %rs7;}
    // end inline asm
    fma.rn.ftz.f32 %f41, %f40, %f27, %f37;
    cvt.rn.f32.s16 %f42, %rs9;
    sub.ftz.f32 %f43, %f42, %f3;
    mul.ftz.f32 %f44, %f25, %f43;
    // begin inline asm
    { cvt.f32.f16 %f28, %rs10;}
    // end inline asm
    fma.rn.ftz.f32 %f45, %f44, %f28, %f41;
    cvt.rn.f32.s16 %f46, %rs12;
    sub.ftz.f32 %f47, %f46, %f3;
    mul.ftz.f32 %f48, %f25, %f47;
    // begin inline asm
    { cvt.f32.f16 %f29, %rs13;}
    // end inline asm
    fma.rn.ftz.f32 %f49, %f48, %f29, %f45;
    cvt.rn.f32.s16 %f50, %rs15;
    sub.ftz.f32 %f51, %f50, %f3;
    mul.ftz.f32 %f52, %f25, %f51;
    // begin inline asm
    { cvt.f32.f16 %f30, %rs16;}
    // end inline asm
    fma.rn.ftz.f32 %f53, %f52, %f30, %f49;
    cvt.rn.f32.s16 %f54, %rs18;
    sub.ftz.f32 %f55, %f54, %f3;
    mul.ftz.f32 %f56, %f25, %f55;
    // begin inline asm
    { cvt.f32.f16 %f31, %rs19;}
    // end inline asm
    fma.rn.ftz.f32 %f57, %f56, %f31, %f53;
    cvt.rn.f32.s16 %f58, %rs21;
    sub.ftz.f32 %f59, %f58, %f3;
    mul.ftz.f32 %f60, %f25, %f59;
    // begin inline asm
    { cvt.f32.f16 %f32, %rs22;}
    // end inline asm
    fma.rn.ftz.f32 %f61, %f60, %f32, %f57;
    cvt.rn.f32.s16 %f62, %rs23;
    sub.ftz.f32 %f63, %f62, %f3;
    mul.ftz.f32 %f64, %f25, %f63;
    // begin inline asm
    { cvt.f32.f16 %f33, %rs24;}
    // end inline asm
    fma.rn.ftz.f32 %f207, %f64, %f33, %f61;
    bra.uni $L__BB0_7;
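    // [annotation] $L__BB0_6 is the same eight-step multiply-accumulate chain, but each 4-bit
    // weight is first sign-extended (shl.b16 by 4 / cvt.s16.s8 / shr.s16 by 7 builds a 0 or -16
    // mask that is OR'd onto the low nibble) before the zero point %f3 is subtracted and the
    // product is scaled by %f25 and accumulated into %f207.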
$L__BB0_6:
    shl.b16 %rs88, %rs2, 4;
    cvt.s16.s8 %rs89, %rs88;
    shr.s16 %rs90, %rs89, 7;
    and.b16 %rs91, %rs90, -16;
    or.b16 %rs92, %rs91, %rs3;
    cvt.rn.f32.s16 %f73, %rs92;
    sub.ftz.f32 %f74, %f73, %f3;
    mul.ftz.f32 %f75, %f25, %f74;
    // begin inline asm
    { cvt.f32.f16 %f65, %rs4;}
    // end inline asm
    fma.rn.ftz.f32 %f76, %f75, %f65, %f207;
    shl.b16 %rs93, %rs5, 4;
    cvt.s16.s8 %rs94, %rs93;
    shr.s16 %rs95, %rs94, 7;
    and.b16 %rs96, %rs95, -16;
    or.b16 %rs97, %rs96, %rs6;
    cvt.rn.f32.s16 %f77, %rs97;
    sub.ftz.f32 %f78, %f77, %f3;
    mul.ftz.f32 %f79, %f25, %f78;
    // begin inline asm
    { cvt.f32.f16 %f66, %rs7;}
    // end inline asm
    fma.rn.ftz.f32 %f80, %f79, %f66, %f76;
    shl.b16 %rs98, %rs8, 4;
    cvt.s16.s8 %rs99, %rs98;
    shr.s16 %rs100, %rs99, 7;
    and.b16 %rs101, %rs100, -16;
    or.b16 %rs102, %rs101, %rs9;
    cvt.rn.f32.s16 %f81, %rs102;
    sub.ftz.f32 %f82, %f81, %f3;
    mul.ftz.f32 %f83, %f25, %f82;
    // begin inline asm
    { cvt.f32.f16 %f67, %rs10;}
    // end inline asm
    fma.rn.ftz.f32 %f84, %f83, %f67, %f80;
    shl.b16 %rs103, %rs11, 4;
    cvt.s16.s8 %rs104, %rs103;
    shr.s16 %rs105, %rs104, 7;
    and.b16 %rs106, %rs105, -16;
    or.b16 %rs107, %rs106, %rs12;
    cvt.rn.f32.s16 %f85, %rs107;
    sub.ftz.f32 %f86, %f85, %f3;
    mul.ftz.f32 %f87, %f25, %f86;
    // begin inline asm
    { cvt.f32.f16 %f68, %rs13;}
    // end inline asm
    fma.rn.ftz.f32 %f88, %f87, %f68, %f84;
    shl.b16 %rs108, %rs14, 4;
    cvt.s16.s8 %rs109, %rs108;
    shr.s16 %rs110, %rs109, 7;
    and.b16 %rs111, %rs110, -16;
    or.b16 %rs112, %rs111, %rs15;
    cvt.rn.f32.s16 %f89, %rs112;
    sub.ftz.f32 %f90, %f89, %f3;
    mul.ftz.f32 %f91, %f25, %f90;
    // begin inline asm
    { cvt.f32.f16 %f69, %rs16;}
    // end inline asm
    fma.rn.ftz.f32 %f92, %f91, %f69, %f88;
    shl.b16 %rs113, %rs17, 4;
    cvt.s16.s8 %rs114, %rs113;
    shr.s16 %rs115, %rs114, 7;
    and.b16 %rs116, %rs115, -16;
    or.b16 %rs117, %rs116, %rs18;
    cvt.rn.f32.s16 %f93, %rs117;
    sub.ftz.f32 %f94, %f93, %f3;
    mul.ftz.f32 %f95, %f25, %f94;
    // begin inline asm
    { cvt.f32.f16 %f70, %rs19;}
    // end inline asm
    fma.rn.ftz.f32 %f96, %f95, %f70, %f92;
    shl.b16 %rs118, %rs20, 4;
    cvt.s16.s8 %rs119, %rs118;
    shr.s16 %rs120, %rs119, 7;
    and.b16 %rs121, %rs120, -16;
    or.b16 %rs122, %rs121, %rs21;
    cvt.rn.f32.s16 %f97, %rs122;
    sub.ftz.f32 %f98, %f97, %f3;
    mul.ftz.f32 %f99, %f25, %f98;
    // begin inline asm
    { cvt.f32.f16 %f71, %rs22;}
    // end inline asm
    fma.rn.ftz.f32 %f100, %f99, %f71, %f96;
    shl.b16 %rs123, %rs23, 4;
    cvt.s16.s8 %rs124, %rs123;
    shr.s16 %rs125, %rs124, 7;
    and.b16 %rs126, %rs125, -16;
    or.b16 %rs127, %rs126, %rs23;
    cvt.rn.f32.s16 %f101, %rs127;
    sub.ftz.f32 %f102, %f101, %f3;
    mul.ftz.f32 %f103, %f25, %f102;
    // begin inline asm
    { cvt.f32.f16 %f72, %rs24;}
    // end inline asm
    fma.rn.ftz.f32 %f207, %f103, %f72, %f100;

$L__BB0_7:
    add.s32 %r101, %r101, 4;
    shl.b32 %r57, %r101, 5;
    add.s32 %r100, %r57, %r3;
    setp.lt.u32 %p7, %r100, %r29;
    @%p7 bra $L__BB0_3;
    bra.uni $L__BB0_14;
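    // [annotation] $L__BB0_8 / $L__BB0_9: variant of the dot-product loop taken when the
    // zero-point pointer (%rd12) is null. The zero point %f7 is computed once from the bool
    // parameter (%rs56 << 3, sign-extended from s8) instead of being read per group from
    // global memory; the per-group loop structure otherwise mirrors $L__BB0_3.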
$L__BB0_8:
    shl.b16 %rs128, %rs56, 3;
    cvt.s16.s8 %rs129, %rs128;
    cvt.rn.f32.s16 %f7, %rs129;
    mov.f32 %f207, 0f00000000;
    mov.u32 %r102, %r4;
    mov.u32 %r103, %r2;

$L__BB0_9:
    add.s32 %r58, %r102, %r7;
    mul.wide.u32 %rd22, %r58, 4;
    add.s64 %rd23, %rd2, %rd22;
    ld.global.u32 %r19, [%rd23];
    shl.b32 %r59, %r103, 3;
    add.s32 %r60, %r59, %r8;
    add.s32 %r61, %r60, %r9;
    mul.wide.s32 %rd24, %r61, 2;
    add.s64 %rd25, %rd3, %rd24;
    ld.global.u16 %rs130, [%rd25];
    // begin inline asm
    { cvt.f32.f16 %f105, %rs130;}
    // end inline asm
    shl.b32 %r20, %r102, 3;
    setp.ge.s32 %p8, %r20, %r27;
    @%p8 bra $L__BB0_13;

    setp.eq.s16 %p9, %rs56, 0;
    mul.wide.s32 %rd26, %r20, 2;
    add.s64 %rd27, %rd1, %rd26;
    ld.global.v4.u32 {%r62, %r63, %r64, %r65}, [%rd27];
    cvt.u16.u32 %rs25, %r19;
    and.b16 %rs26, %rs25, 15;
    mov.b32 {%rs27, %rs30}, %r62;
    shr.u32 %r70, %r19, 4;
    cvt.u16.u32 %rs28, %r70;
    and.b16 %rs29, %rs28, 15;
    shr.u32 %r71, %r19, 8;
    cvt.u16.u32 %rs31, %r71;
    and.b16 %rs32, %rs31, 15;
    mov.b32 {%rs33, %rs36}, %r63;
    shr.u32 %r72, %r19, 12;
    cvt.u16.u32 %rs34, %r72;
    and.b16 %rs35, %rs34, 15;
    shr.u32 %r73, %r19, 16;
    cvt.u16.u32 %rs37, %r73;
    and.b16 %rs38, %rs37, 15;
    mov.b32 {%rs39, %rs42}, %r64;
    shr.u32 %r74, %r19, 20;
    cvt.u16.u32 %rs40, %r74;
    and.b16 %rs41, %rs40, 15;
    shr.u32 %r75, %r19, 24;
    cvt.u16.u32 %rs43, %r75;
    and.b16 %rs44, %rs43, 15;
    mov.b32 {%rs45, %rs47}, %r65;
    shr.u32 %r76, %r19, 28;
    cvt.u16.u32 %rs46, %r76;
    @%p9 bra $L__BB0_12;

    cvt.rn.f32.s16 %f114, %rs26;
    sub.ftz.f32 %f115, %f114, %f7;
    mul.ftz.f32 %f116, %f105, %f115;
    // begin inline asm
    { cvt.f32.f16 %f106, %rs27;}
    // end inline asm
    fma.rn.ftz.f32 %f117, %f116, %f106, %f207;
    cvt.rn.f32.s16 %f118, %rs29;
    sub.ftz.f32 %f119, %f118, %f7;
    mul.ftz.f32 %f120, %f105, %f119;
    // begin inline asm
    { cvt.f32.f16 %f107, %rs30;}
    // end inline asm
    fma.rn.ftz.f32 %f121, %f120, %f107, %f117;
    cvt.rn.f32.s16 %f122, %rs32;
    sub.ftz.f32 %f123, %f122, %f7;
    mul.ftz.f32 %f124, %f105, %f123;
    // begin inline asm
    { cvt.f32.f16 %f108, %rs33;}
    // end inline asm
    fma.rn.ftz.f32 %f125, %f124, %f108, %f121;
    cvt.rn.f32.s16 %f126, %rs35;
    sub.ftz.f32 %f127, %f126, %f7;
    mul.ftz.f32 %f128, %f105, %f127;
    // begin inline asm
    { cvt.f32.f16 %f109, %rs36;}
    // end inline asm
    fma.rn.ftz.f32 %f129, %f128, %f109, %f125;
    cvt.rn.f32.s16 %f130, %rs38;
    sub.ftz.f32 %f131, %f130, %f7;
    mul.ftz.f32 %f132, %f105, %f131;
    // begin inline asm
    { cvt.f32.f16 %f110, %rs39;}
    // end inline asm
    fma.rn.ftz.f32 %f133, %f132, %f110, %f129;
    cvt.rn.f32.s16 %f134, %rs41;
    sub.ftz.f32 %f135, %f134, %f7;
    mul.ftz.f32 %f136, %f105, %f135;
    // begin inline asm
    { cvt.f32.f16 %f111, %rs42;}
    // end inline asm
    fma.rn.ftz.f32 %f137, %f136, %f111, %f133;
    cvt.rn.f32.s16 %f138, %rs44;
    sub.ftz.f32 %f139, %f138, %f7;
    mul.ftz.f32 %f140, %f105, %f139;
    // begin inline asm
    { cvt.f32.f16 %f112, %rs45;}
    // end inline asm
    fma.rn.ftz.f32 %f141, %f140, %f112, %f137;
    cvt.rn.f32.s16 %f142, %rs46;
    sub.ftz.f32 %f143, %f142, %f7;
    mul.ftz.f32 %f144, %f105, %f143;
    // begin inline asm
    { cvt.f32.f16 %f113, %rs47;}
    // end inline asm
    fma.rn.ftz.f32 %f207, %f144, %f113, %f141;
    bra.uni $L__BB0_13;

$L__BB0_12:
    shl.b16 %rs148, %rs25, 4;
    cvt.s16.s8 %rs149, %rs148;
    shr.s16 %rs150, %rs149, 7;
    and.b16 %rs151, %rs150, -16;
    or.b16 %rs152, %rs151, %rs26;
    cvt.rn.f32.s16 %f153, %rs152;
    sub.ftz.f32 %f154, %f153, %f7;
    mul.ftz.f32 %f155, %f105, %f154;
    // begin inline asm
    { cvt.f32.f16 %f145, %rs27;}
    // end inline asm
    fma.rn.ftz.f32 %f156, %f155, %f145, %f207;
    shl.b16 %rs153, %rs28, 4;
    cvt.s16.s8 %rs154, %rs153;
    shr.s16 %rs155, %rs154, 7;
    and.b16 %rs156, %rs155, -16;
    or.b16 %rs157, %rs156, %rs29;
    cvt.rn.f32.s16 %f157, %rs157;
    sub.ftz.f32 %f158, %f157, %f7;
    mul.ftz.f32 %f159, %f105, %f158;
    // begin inline asm
    { cvt.f32.f16 %f146, %rs30;}
    // end inline asm
    fma.rn.ftz.f32 %f160, %f159, %f146, %f156;
    shl.b16 %rs158, %rs31, 4;
    cvt.s16.s8 %rs159, %rs158;
    shr.s16 %rs160, %rs159, 7;
    and.b16 %rs161, %rs160, -16;
    or.b16 %rs162, %rs161, %rs32;
    cvt.rn.f32.s16 %f161, %rs162;
    sub.ftz.f32 %f162, %f161, %f7;
    mul.ftz.f32 %f163, %f105, %f162;
    // begin inline asm
    { cvt.f32.f16 %f147, %rs33;}
    // end inline asm
    fma.rn.ftz.f32 %f164, %f163, %f147, %f160;
    shl.b16 %rs163, %rs34, 4;
    cvt.s16.s8 %rs164, %rs163;
    shr.s16 %rs165, %rs164, 7;
    and.b16 %rs166, %rs165, -16;
    or.b16 %rs167, %rs166, %rs35;
    cvt.rn.f32.s16 %f165, %rs167;
    sub.ftz.f32 %f166, %f165, %f7;
    mul.ftz.f32 %f167, %f105, %f166;
    // begin inline asm
    { cvt.f32.f16 %f148, %rs36;}
    // end inline asm
    fma.rn.ftz.f32 %f168, %f167, %f148, %f164;
    shl.b16 %rs168, %rs37, 4;
    cvt.s16.s8 %rs169, %rs168;
    shr.s16 %rs170, %rs169, 7;
    and.b16 %rs171, %rs170, -16;
    or.b16 %rs172, %rs171, %rs38;
    cvt.rn.f32.s16 %f169, %rs172;
    sub.ftz.f32 %f170, %f169, %f7;
    mul.ftz.f32 %f171, %f105, %f170;
    // begin inline asm
    { cvt.f32.f16 %f149, %rs39;}
    // end inline asm
    fma.rn.ftz.f32 %f172, %f171, %f149, %f168;
    shl.b16 %rs173, %rs40, 4;
    cvt.s16.s8 %rs174, %rs173;
    shr.s16 %rs175, %rs174, 7;
    and.b16 %rs176, %rs175, -16;
    or.b16 %rs177, %rs176, %rs41;
    cvt.rn.f32.s16 %f173, %rs177;
    sub.ftz.f32 %f174, %f173, %f7;
    mul.ftz.f32 %f175, %f105, %f174;
    // begin inline asm
    { cvt.f32.f16 %f150, %rs42;}
    // end inline asm
    fma.rn.ftz.f32 %f176, %f175, %f150, %f172;
    shl.b16 %rs178, %rs43, 4;
    cvt.s16.s8 %rs179, %rs178;
    shr.s16 %rs180, %rs179, 7;
    and.b16 %rs181, %rs180, -16;
    or.b16 %rs182, %rs181, %rs44;
    cvt.rn.f32.s16 %f177, %rs182;
    sub.ftz.f32 %f178, %f177, %f7;
    mul.ftz.f32 %f179, %f105, %f178;
    // begin inline asm
    { cvt.f32.f16 %f151, %rs45;}
    // end inline asm
    fma.rn.ftz.f32 %f180, %f179, %f151, %f176;
    shl.b16 %rs183, %rs46, 4;
    cvt.s16.s8 %rs184, %rs183;
    shr.s16 %rs185, %rs184, 7;
    and.b16 %rs186, %rs185, -16;
    or.b16 %rs187, %rs186, %rs46;
    cvt.rn.f32.s16 %f181, %rs187;
    sub.ftz.f32 %f182, %f181, %f7;
    mul.ftz.f32 %f183, %f105, %f182;
    // begin inline asm
    { cvt.f32.f16 %f152, %rs47;}
    // end inline asm
    fma.rn.ftz.f32 %f207, %f183, %f152, %f180;

$L__BB0_13:
    add.s32 %r103, %r103, 4;
    shl.b32 %r77, %r103, 5;
    add.s32 %r102, %r77, %r3;
    setp.lt.u32 %p10, %r102, %r29;
    @%p10 bra $L__BB0_9;
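    // [annotation] $L__BB0_14 onward: threads with linear index %r4 >= 32 stage their partial
    // sums in the shared temp_storage buffer; the first warp folds those values in and finishes
    // the reduction with the shfl.sync.down inline-asm sequence. Thread (tid.x, tid.y) == (0, 0)
    // then multiplies by the first float parameter, optionally adds the second float parameter
    // times a __half value read from the second pointer parameter when it is non-null, and
    // stores the final __half result.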
$L__BB0_14:
    setp.lt.u32 %p11, %r4, 32;
    @%p11 bra $L__BB0_16;

    shl.b32 %r78, %r4, 2;
    mov.u32 %r79, _ZZ9gemv_int4ILi4ELi32ELi1EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage;
    add.s32 %r80, %r79, %r78;
    st.shared.f32 [%r80+-112], %f207;

$L__BB0_16:
    bar.sync 0;
    setp.gt.u32 %p12, %r4, 31;
    @%p12 bra $L__BB0_18;

    mov.u32 %r97, _ZZ9gemv_int4ILi4ELi32ELi1EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage;
    mad.lo.s32 %r98, %r4, 12, %r97;
    mov.u32 %r85, 2;
    mov.u32 %r94, 16;
    ld.shared.f32 %f199, [%r98+16];
    add.ftz.f32 %f200, %f207, %f199;
    ld.shared.f32 %f201, [%r98+20];
    add.ftz.f32 %f202, %f200, %f201;
    ld.shared.f32 %f203, [%r98+24];
    add.ftz.f32 %f186, %f202, %f203;
    mov.u32 %r82, 1;
    mov.u32 %r95, 31;
    mov.u32 %r96, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f186, %r82, %r95, %r96; @p add.f32 r0, r0, %f186; mov.f32 %f184, r0;}
    // end inline asm
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f184, %r85, %r95, %r96; @p add.f32 r0, r0, %f184; mov.f32 %f187, r0;}
    // end inline asm
    mov.u32 %r88, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f187, %r88, %r95, %r96; @p add.f32 r0, r0, %f187; mov.f32 %f190, r0;}
    // end inline asm
    mov.u32 %r91, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f190, %r91, %r95, %r96; @p add.f32 r0, r0, %f190; mov.f32 %f193, r0;}
    // end inline asm
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f193, %r94, %r95, %r96; @p add.f32 r0, r0, %f193; mov.f32 %f207, r0;}
    // end inline asm

$L__BB0_18:
    or.b32 %r99, %r3, %r2;
    setp.ne.s32 %p13, %r99, 0;
    @%p13 bra $L__BB0_22;

    mul.ftz.f32 %f212, %f22, %f207;
    cvt.s64.s32 %rd7, %r1;
    setp.eq.s64 %p14, %rd9, 0;
    @%p14 bra $L__BB0_21;

    cvta.to.global.u64 %rd28, %rd9;
    shl.b64 %rd29, %rd7, 1;
    add.s64 %rd30, %rd28, %rd29;
    ld.global.u16 %rs188, [%rd30];
    // begin inline asm
    { cvt.f32.f16 %f204, %rs188;}
    // end inline asm
    fma.rn.ftz.f32 %f212, %f23, %f204, %f212;

$L__BB0_21:
    // begin inline asm
    { cvt.rn.f16.f32 %rs189, %f212;}
    // end inline asm
    cvta.to.global.u64 %rd31, %rd8;
    shl.b64 %rd32, %rd7, 1;
    add.s64 %rd33, %rd31, %rd32;
    st.global.u16 [%rd33], %rs189;

$L__BB0_22:
    ret;
}

// .globl _ZN3cub11EmptyKernelIvEEvv
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
    ret;
}