.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_17f8459b6thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_17f8459b6thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_17f8459b6thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_17f8459b6thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_17f8459b6thrust12placeholders3_10E[1];

.visible .entry _Z27dequant_gemv_group32_batch123DequantGemvKernelParams(
    .param .align 8 .b8 _Z27dequant_gemv_group32_batch123DequantGemvKernelParams_param_0[80]
)
{
    .reg .pred %p<12>;
    .reg .b16 %rs<114>;
    .reg .f32 %f<125>;
    .reg .b32 %r<78>;
    .reg .b64 %rd<34>;
    // demoted variable
    .shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi32ELi1EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[640];

    ld.param.v2.u32 {%r22, %r23}, [_Z27dequant_gemv_group32_batch123DequantGemvKernelParams_param_0+48];
    ld.param.v2.u32 {%r24, %r25}, [_Z27dequant_gemv_group32_batch123DequantGemvKernelParams_param_0+56];
    ld.param.v2.f32 {%f16, %f17}, [_Z27dequant_gemv_group32_batch123DequantGemvKernelParams_param_0+64];
    ld.param.v4.u8 {%rs36, %rs37, %rs38, %rs39}, [_Z27dequant_gemv_group32_batch123DequantGemvKernelParams_param_0+72];
    ld.param.u64 %rd12, [_Z27dequant_gemv_group32_batch123DequantGemvKernelParams_param_0+40];
    ld.param.u64 %rd11, [_Z27dequant_gemv_group32_batch123DequantGemvKernelParams_param_0+32];
    ld.param.u64 %rd10, [_Z27dequant_gemv_group32_batch123DequantGemvKernelParams_param_0+24];
    ld.param.u64 %rd9, [_Z27dequant_gemv_group32_batch123DequantGemvKernelParams_param_0+16];
    ld.param.u64 %rd8, [_Z27dequant_gemv_group32_batch123DequantGemvKernelParams_param_0+8];
    ld.param.u64 %rd7, [_Z27dequant_gemv_group32_batch123DequantGemvKernelParams_param_0];
    mov.u32 %r1, %ctaid.x;
    mov.u32 %r2, %tid.y;
    shl.b32 %r26, %r2, 5;
    mov.u32 %r3, %tid.x;
    add.s32 %r4, %r26, %r3;
    setp.ge.u32 %p1, %r4, %r24;
    mov.f32 %f121, 0f00000000;
    @%p1 bra $L__BB0_9;

    mul.lo.s32 %r6, %r24, %r1;
    shr.u32 %r7, %r3, 2;
    mul.lo.s32 %r8, %r25, %r1;
    shl.b16 %rs2, %rs36, 3;
    cvta.to.global.u64 %rd4, %rd9;
    mov.u32 %r76, %r4;
    mov.u32 %r77, %r2;

$L__BB0_2:
    add.s32 %r28, %r76, %r6;
    mul.wide.u32 %rd19, %r28, 4;
    add.s64 %rd14, %rd10, %rd19;
    // begin inline asm
    createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd13, 1.0;
    // end inline asm
    // begin inline asm
    ld.global.L1::no_allocate.L2::cache_hint.u32 %r27, [%rd14], %rd13;
    // end inline asm
    shl.b32 %r29, %r77, 3;
    add.s32 %r13, %r29, %r7;
    add.s32 %r14, %r13, %r8;
    mul.wide.s32 %rd20, %r14, 2;
    add.s64 %rd17, %rd12, %rd20;
    // begin inline asm
    createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd16, 1.0;
    // end inline asm
    // begin inline asm
    ld.global.L1::evict_last.L2::cache_hint.u16 %rs44, [%rd17], %rd16;
    // end inline asm
    // begin inline asm
    { cvt.f32.f16 %f19, %rs44;}
    // end inline asm
    setp.eq.s64 %p2, %rd11, 0;
    mov.u16 %rs113, %rs2;
    @%p2 bra $L__BB0_4;

    shr.u32 %r30, %r14, 31;
    add.s32 %r31, %r14, %r30;
    shr.s32 %r32, %r31, 1;
    cvt.s64.s32 %rd24, %r32;
    add.s64 %rd22, %rd11, %rd24;
    // begin inline asm
    createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd21, 1.0;
    // end inline asm
    // begin inline asm
    ld.global.L1::evict_last.L2::cache_hint.u8 %rs46, [%rd22], %rd21;
    // end inline asm
    cvt.u32.u16 %r33, %rs46;
    and.b32 %r34, %r33, 255;
    shl.b32 %r35, %r13, 2;
    and.b32 %r36, %r35, 4;
    shr.u32 %r37, %r34, %r36;
    cvt.u16.u32 %rs47, %r37;
    and.b16 %rs113, %rs47, 15;

$L__BB0_4:
    shl.b32 %r15, %r76, 3;
    setp.ge.s32 %p3, %r15, %r22;
    @%p3 bra $L__BB0_8;
    setp.eq.s16 %p4, %rs36, 0;
    mul.wide.s32 %rd25, %r15, 2;
    add.s64 %rd26, %rd4, %rd25;
    ld.global.v4.u32 {%r38, %r39, %r40, %r41}, [%rd26];
    shr.u16 %rs49, %rs113, 3;
    and.b16 %rs50, %rs49, 1;
    setp.eq.b16 %p5, %rs50, 1;
    and.pred %p6, %p4, %p5;
    selp.b16 %rs51, -16, 0, %p6;
    or.b16 %rs52, %rs51, %rs113;
    cvt.s16.s8 %rs53, %rs52;
    cvt.rn.f32.s16 %f3, %rs53;
    cvt.u16.u32 %rs5, %r27;
    and.b16 %rs6, %rs5, 15;
    mov.b32 {%rs7, %rs10}, %r38;
    shr.u32 %r46, %r27, 4;
    cvt.u16.u32 %rs8, %r46;
    and.b16 %rs9, %rs8, 15;
    shr.u32 %r47, %r27, 8;
    cvt.u16.u32 %rs11, %r47;
    and.b16 %rs12, %rs11, 15;
    mov.b32 {%rs13, %rs16}, %r39;
    shr.u32 %r48, %r27, 12;
    cvt.u16.u32 %rs14, %r48;
    and.b16 %rs15, %rs14, 15;
    shr.u32 %r49, %r27, 16;
    cvt.u16.u32 %rs17, %r49;
    and.b16 %rs18, %rs17, 15;
    mov.b32 {%rs19, %rs22}, %r40;
    shr.u32 %r50, %r27, 20;
    cvt.u16.u32 %rs20, %r50;
    and.b16 %rs21, %rs20, 15;
    shr.u32 %r51, %r27, 24;
    cvt.u16.u32 %rs23, %r51;
    and.b16 %rs24, %rs23, 15;
    mov.b32 {%rs25, %rs27}, %r41;
    shr.u32 %r52, %r27, 28;
    cvt.u16.u32 %rs26, %r52;
    @%p4 bra $L__BB0_7;

    cvt.rn.f32.s16 %f28, %rs6;
    sub.ftz.f32 %f29, %f28, %f3;
    mul.ftz.f32 %f30, %f19, %f29;
    // begin inline asm
    { cvt.f32.f16 %f20, %rs7;}
    // end inline asm
    fma.rn.ftz.f32 %f31, %f30, %f20, %f121;
    cvt.rn.f32.s16 %f32, %rs9;
    sub.ftz.f32 %f33, %f32, %f3;
    mul.ftz.f32 %f34, %f19, %f33;
    // begin inline asm
    { cvt.f32.f16 %f21, %rs10;}
    // end inline asm
    fma.rn.ftz.f32 %f35, %f34, %f21, %f31;
    cvt.rn.f32.s16 %f36, %rs12;
    sub.ftz.f32 %f37, %f36, %f3;
    mul.ftz.f32 %f38, %f19, %f37;
    // begin inline asm
    { cvt.f32.f16 %f22, %rs13;}
    // end inline asm
    fma.rn.ftz.f32 %f39, %f38, %f22, %f35;
    cvt.rn.f32.s16 %f40, %rs15;
    sub.ftz.f32 %f41, %f40, %f3;
    mul.ftz.f32 %f42, %f19, %f41;
    // begin inline asm
    { cvt.f32.f16 %f23, %rs16;}
    // end inline asm
    fma.rn.ftz.f32 %f43, %f42, %f23, %f39;
    cvt.rn.f32.s16 %f44, %rs18;
    sub.ftz.f32 %f45, %f44, %f3;
    mul.ftz.f32 %f46, %f19, %f45;
    // begin inline asm
    { cvt.f32.f16 %f24, %rs19;}
    // end inline asm
    fma.rn.ftz.f32 %f47, %f46, %f24, %f43;
    cvt.rn.f32.s16 %f48, %rs21;
    sub.ftz.f32 %f49, %f48, %f3;
    mul.ftz.f32 %f50, %f19, %f49;
    // begin inline asm
    { cvt.f32.f16 %f25, %rs22;}
    // end inline asm
    fma.rn.ftz.f32 %f51, %f50, %f25, %f47;
    cvt.rn.f32.s16 %f52, %rs24;
    sub.ftz.f32 %f53, %f52, %f3;
    mul.ftz.f32 %f54, %f19, %f53;
    // begin inline asm
    { cvt.f32.f16 %f26, %rs25;}
    // end inline asm
    fma.rn.ftz.f32 %f55, %f54, %f26, %f51;
    cvt.rn.f32.s16 %f56, %rs26;
    sub.ftz.f32 %f57, %f56, %f3;
    mul.ftz.f32 %f58, %f19, %f57;
    // begin inline asm
    { cvt.f32.f16 %f27, %rs27;}
    // end inline asm
    fma.rn.ftz.f32 %f121, %f58, %f27, %f55;
    bra.uni $L__BB0_8;

$L__BB0_7:
    shl.b16 %rs70, %rs5, 4;
    cvt.s16.s8 %rs71, %rs70;
    shr.s16 %rs72, %rs71, 7;
    and.b16 %rs73, %rs72, -16;
    or.b16 %rs74, %rs73, %rs6;
    cvt.rn.f32.s16 %f67, %rs74;
    sub.ftz.f32 %f68, %f67, %f3;
    mul.ftz.f32 %f69, %f19, %f68;
    // begin inline asm
    { cvt.f32.f16 %f59, %rs7;}
    // end inline asm
    fma.rn.ftz.f32 %f70, %f69, %f59, %f121;
    shl.b16 %rs75, %rs8, 4;
    cvt.s16.s8 %rs76, %rs75;
    shr.s16 %rs77, %rs76, 7;
    and.b16 %rs78, %rs77, -16;
    or.b16 %rs79, %rs78, %rs9;
    cvt.rn.f32.s16 %f71, %rs79;
    sub.ftz.f32 %f72, %f71, %f3;
    mul.ftz.f32 %f73, %f19, %f72;
    // begin inline asm
    { cvt.f32.f16 %f60, %rs10;}
    // end inline asm
    fma.rn.ftz.f32 %f74, %f73, %f60, %f70;
    shl.b16 %rs80, %rs11, 4;
    cvt.s16.s8 %rs81, %rs80;
    shr.s16 %rs82, %rs81, 7;
    and.b16 %rs83, %rs82, -16;
    or.b16 %rs84, %rs83, %rs12;
    cvt.rn.f32.s16 %f75, %rs84;
    sub.ftz.f32 %f76, %f75, %f3;
    mul.ftz.f32 %f77, %f19, %f76;
    // begin inline asm
    { cvt.f32.f16 %f61, %rs13;}
    // end inline asm
    fma.rn.ftz.f32 %f78, %f77, %f61, %f74;
    shl.b16 %rs85, %rs14, 4;
    cvt.s16.s8 %rs86, %rs85;
    shr.s16 %rs87, %rs86, 7;
    and.b16 %rs88, %rs87, -16;
    or.b16 %rs89, %rs88, %rs15;
    cvt.rn.f32.s16 %f79, %rs89;
    sub.ftz.f32 %f80, %f79, %f3;
    mul.ftz.f32 %f81, %f19, %f80;
    // begin inline asm
    { cvt.f32.f16 %f62, %rs16;}
    // end inline asm
    fma.rn.ftz.f32 %f82, %f81, %f62, %f78;
    shl.b16 %rs90, %rs17, 4;
    cvt.s16.s8 %rs91, %rs90;
    shr.s16 %rs92, %rs91, 7;
    and.b16 %rs93, %rs92, -16;
    or.b16 %rs94, %rs93, %rs18;
    cvt.rn.f32.s16 %f83, %rs94;
    sub.ftz.f32 %f84, %f83, %f3;
    mul.ftz.f32 %f85, %f19, %f84;
    // begin inline asm
    { cvt.f32.f16 %f63, %rs19;}
    // end inline asm
    fma.rn.ftz.f32 %f86, %f85, %f63, %f82;
    shl.b16 %rs95, %rs20, 4;
    cvt.s16.s8 %rs96, %rs95;
    shr.s16 %rs97, %rs96, 7;
    and.b16 %rs98, %rs97, -16;
    or.b16 %rs99, %rs98, %rs21;
    cvt.rn.f32.s16 %f87, %rs99;
    sub.ftz.f32 %f88, %f87, %f3;
    mul.ftz.f32 %f89, %f19, %f88;
    // begin inline asm
    { cvt.f32.f16 %f64, %rs22;}
    // end inline asm
    fma.rn.ftz.f32 %f90, %f89, %f64, %f86;
    shl.b16 %rs100, %rs23, 4;
    cvt.s16.s8 %rs101, %rs100;
    shr.s16 %rs102, %rs101, 7;
    and.b16 %rs103, %rs102, -16;
    or.b16 %rs104, %rs103, %rs24;
    cvt.rn.f32.s16 %f91, %rs104;
    sub.ftz.f32 %f92, %f91, %f3;
    mul.ftz.f32 %f93, %f19, %f92;
    // begin inline asm
    { cvt.f32.f16 %f65, %rs25;}
    // end inline asm
    fma.rn.ftz.f32 %f94, %f93, %f65, %f90;
    shl.b16 %rs105, %rs26, 4;
    cvt.s16.s8 %rs106, %rs105;
    shr.s16 %rs107, %rs106, 7;
    and.b16 %rs108, %rs107, -16;
    or.b16 %rs109, %rs108, %rs26;
    cvt.rn.f32.s16 %f95, %rs109;
    sub.ftz.f32 %f96, %f95, %f3;
    mul.ftz.f32 %f97, %f19, %f96;
    // begin inline asm
    { cvt.f32.f16 %f66, %rs27;}
    // end inline asm
    fma.rn.ftz.f32 %f121, %f97, %f66, %f94;

$L__BB0_8:
    add.s32 %r77, %r77, 4;
    shl.b32 %r53, %r77, 5;
    add.s32 %r76, %r53, %r3;
    setp.lt.u32 %p7, %r76, %r24;
    @%p7 bra $L__BB0_2;

$L__BB0_9:
    setp.lt.u32 %p8, %r4, 32;
    @%p8 bra $L__BB0_11;

    shl.b32 %r54, %r4, 2;
    mov.u32 %r55, _ZZ9gemv_int4ILi4ELi32ELi1EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage;
    add.s32 %r56, %r55, %r54;
    st.shared.f32 [%r56+-112], %f121;

$L__BB0_11:
    bar.sync 0;
    setp.gt.u32 %p9, %r4, 31;
    @%p9 bra $L__BB0_13;

    mov.u32 %r73, _ZZ9gemv_int4ILi4ELi32ELi1EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage;
    mad.lo.s32 %r74, %r4, 12, %r73;
    mov.u32 %r61, 2;
    mov.u32 %r70, 16;
    ld.shared.f32 %f113, [%r74+16];
    add.ftz.f32 %f114, %f121, %f113;
    ld.shared.f32 %f115, [%r74+20];
    add.ftz.f32 %f116, %f114, %f115;
    ld.shared.f32 %f117, [%r74+24];
    add.ftz.f32 %f100, %f116, %f117;
    mov.u32 %r58, 1;
    mov.u32 %r71, 31;
    mov.u32 %r72, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f100, %r58, %r71, %r72; @p add.f32 r0, r0, %f100; mov.f32 %f98, r0;}
    // end inline asm
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f98, %r61, %r71, %r72; @p add.f32 r0, r0, %f98; mov.f32 %f101, r0;}
    // end inline asm
    mov.u32 %r64, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f101, %r64, %r71, %r72; @p add.f32 r0, r0, %f101; mov.f32 %f104, r0;}
    // end inline asm
    mov.u32 %r67, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f104, %r67, %r71, %r72; @p add.f32 r0, r0, %f104; mov.f32 %f107, r0;}
    // end inline asm
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f107, %r70, %r71, %r72; @p add.f32 r0, r0, %f107; mov.f32 %f121, r0;}
    // end inline asm

$L__BB0_13:
    or.b32 %r75, %r3, %r2;
    setp.ne.s32 %p10, %r75, 0;
    @%p10 bra $L__BB0_17;

    mul.ftz.f32 %f124, %f16, %f121;
    cvt.s64.s32 %rd6, %r1;
    setp.eq.s64 %p11, %rd8, 0;
    @%p11 bra $L__BB0_16;

    cvta.to.global.u64 %rd27, %rd8;
    shl.b64 %rd28, %rd6, 1;
    add.s64 %rd29, %rd27, %rd28;
    ld.global.u16 %rs110, [%rd29];
    // begin inline asm
    { cvt.f32.f16 %f118, %rs110;}
    // end inline asm
    fma.rn.ftz.f32 %f124, %f17, %f118, %f124;

$L__BB0_16:
    // begin inline asm
    { cvt.rn.f16.f32 %rs111, %f124;}
    // end inline asm
    // begin inline asm
    createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd30, 1.0;
    // end inline asm
    shl.b64 %rd33, %rd6, 1;
    add.s64 %rd31, %rd7, %rd33;
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd31], %rs111, %rd30;
    // end inline asm

$L__BB0_17:
    ret;
}

// .globl _ZN3cub11EmptyKernelIvEEvv
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
    ret;
}
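For orientation, the listing above appears to be the nvcc-generated PTX for the dequant_gemv_group32_batch1 / gemv_int4<4, 32, 1> kernel from gemv.cu. In the unrolled body between $L__BB0_4 and $L__BB0_8, each iteration unpacks eight 4-bit weights from one u32, either leaves them unsigned or sign-extends them depending on the flag loaded from the parameter block, subtracts the (optional) 4-bit zero point, scales by the per-group half scale, and accumulates against eight half activations in fp32. The CUDA fragment below is a rough reconstruction of that step for readability only; the function name, parameter names, and signature are invented and do not come from the original source.

#include <cstdint>
#include <cuda_fp16.h>

// Hypothetical reconstruction of the inner dequantize-and-accumulate step seen
// in the PTX above. Names and signature are invented for illustration only.
__device__ float dequant_dot8(uint32_t packed_w,   // eight 4-bit weights in one word
                              const __half *x8,    // eight half activations
                              __half scale_h,      // per-group scale (half)
                              int zp_nibble,       // 4-bit zero point, already extracted
                              bool is_signed,      // signed int4 vs. uint4 encoding
                              float acc)           // running fp32 accumulator
{
    float scale = __half2float(scale_h);
    // The zero point is sign-extended only on the signed path, matching the
    // selp/or/cvt.s16.s8 sequence ahead of $L__BB0_7.
    float zp = (is_signed && (zp_nibble & 0x8)) ? float(zp_nibble - 16)
                                                : float(zp_nibble);
    #pragma unroll
    for (int i = 0; i < 8; ++i) {
        int q = (int)((packed_w >> (4 * i)) & 0xF);
        if (is_signed && (q & 0x8))
            q -= 16;                               // sign-extend the 4-bit value
        // mul.ftz.f32 + fma.rn.ftz.f32 pair: acc += scale * (q - zp) * x[i]
        acc = fmaf(scale * (float(q) - zp), __half2float(x8[i]), acc);
    }
    return acc;
}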
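After the loop, the partial sums are combined across the block: threads with linear id >= 32 stage their partials in the temp_storage shared array, each first-warp thread adds three staged values to its own (the ld.shared.f32/add.ftz.f32 run after $L__BB0_11), and the shfl.sync.down.b32 inline-asm sequence finishes the sum inside that warp before thread (0, 0) scales the result, optionally folds in a bias/residual term, and stores one half per block. A minimal CUDA sketch of the shuffle reduction and epilogue, again with invented names, is:

#include <cuda_fp16.h>

// Warp-sum sketch mirroring the shfl.sync.down.b32 sequence in the PTX
// (offsets 1, 2, 4, 8, 16 over the full 0xffffffff mask). Only lane 0 is
// guaranteed to hold the complete sum, which is all the epilogue needs.
__device__ float warp_reduce_sum(float v)
{
    #pragma unroll
    for (int offset = 1; offset <= 16; offset <<= 1)
        v += __shfl_down_sync(0xffffffffu, v, offset);
    return v;
}

// Epilogue pattern from $L__BB0_13 onward, with invented names: the first
// thread scales the block sum, optionally adds a bias/residual half value,
// and stores one rounded half result per block.
__device__ void store_result(__half *out, const __half *bias, int row,
                             float sum, float alpha, float beta)
{
    if (threadIdx.x == 0 && threadIdx.y == 0) {
        float r = alpha * sum;                        // mul.ftz.f32 %f124, %f16, %f121
        if (bias != nullptr)                          // setp.eq.s64 %p11, %rd8, 0
            r = fmaf(beta, __half2float(bias[row]), r);
        out[row] = __float2half_rn(r);                // cvt.rn.f16.f32 + st.global
    }
}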