lders2_3E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_e5afa8636thrust12placeholders2_4E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_e5afa8636thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_e5afa8636thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_e5afa8636thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_e5afa8636thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_e5afa8636thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_e5afa8636thrust12placeholders3_10E[1];

.visible .entry _Z27dequant_gemv_group32_batch223DequantGemvKernelParams(
	.param .align 8 .b8 _Z27dequant_gemv_group32_batch223DequantGemvKernelParams_param_0[80]
)
{
	.reg .pred %p<14>;
	.reg .b16 %rs<139>;
	.reg .f32 %f<190>;
	.reg .b32 %r<105>;
	.reg .b64 %rd<35>;
	// demoted variable
	.shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi32ELi2EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[1280];

	ld.param.v2.u32 {%r24, %r25}, [_Z27dequant_gemv_group32_batch223DequantGemvKernelParams_param_0+48];
	ld.param.v2.u32 {%r26, %r27}, [_Z27dequant_gemv_group32_batch223DequantGemvKernelParams_param_0+56];
	ld.param.v2.f32 {%f23, %f24}, [_Z27dequant_gemv_group32_batch223DequantGemvKernelParams_param_0+64];
	ld.param.v4.u8 {%rs44, %rs45, %rs46, %rs47}, [_Z27dequant_gemv_group32_batch223DequantGemvKernelParams_param_0+72];
	ld.param.u64 %rd16, [_Z27dequant_gemv_group32_batch223DequantGemvKernelParams_param_0+40];
	ld.param.u64 %rd15, [_Z27dequant_gemv_group32_batch223DequantGemvKernelParams_param_0+32];
	ld.param.u64 %rd14, [_Z27dequant_gemv_group32_batch223DequantGemvKernelParams_param_0+24];
	ld.param.u64 %rd13, [_Z27dequant_gemv_group32_batch223DequantGemvKernelParams_param_0+16];
	ld.param.u64 %rd12, [_Z27dequant_gemv_group32_batch223DequantGemvKernelParams_param_0+8];
	ld.param.u64 %rd11, [_Z27dequant_gemv_group32_batch223DequantGemvKernelParams_param_0];
	mov.u32 %r1, %ctaid.x;
	mov.u32 %r104, %tid.y;
	shl.b32 %r28, %r104, 5;
	mov.u32 %r3, %tid.x;
	add.s32 %r4, %r28, %r3;
	setp.ge.u32 %p1, %r4, %r26;
	mov.f32 %f184, 0f00000000;
	mov.f32 %f185, %f184;
	@%p1 bra $L__BB0_9;

	mul.lo.s32 %r7, %r26, %r1;
	shr.u32 %r8, %r3, 2;
	mul.lo.s32 %r9, %r27, %r1;
	shl.b16 %rs2, %rs44, 3;
	cvta.to.global.u64 %rd2, %rd16;
	cvta.to.global.u64 %rd3, %rd15;
	cvta.to.global.u64 %rd4, %rd14;
	cvta.to.global.u64 %rd5, %rd13;
	mov.u32 %r103, %r4;

$L__BB0_2:
	add.s32 %r29, %r103, %r7;
	mul.wide.u32 %rd17, %r29, 4;
	add.s64 %rd18, %rd4, %rd17;
	ld.global.u32 %r12, [%rd18];
	shl.b32 %r30, %r104, 3;
	add.s32 %r13, %r30, %r8;
	add.s32 %r14, %r13, %r9;
	mul.wide.s32 %rd19, %r14, 2;
	add.s64 %rd20, %rd2, %rd19;
	ld.global.u16 %rs52, [%rd20];
	// begin inline asm
	{ cvt.f32.f16 %f27, %rs52;}
	// end inline asm
	setp.eq.s64 %p2, %rd15, 0;
	mov.u16 %rs138, %rs2;
	@%p2 bra $L__BB0_4;

	shr.u32 %r31, %r14, 31;
	add.s32 %r32, %r14, %r31;
	shr.s32 %r33, %r32, 1;
	cvt.s64.s32 %rd21, %r33;
	add.s64 %rd22, %rd3, %rd21;
	ld.global.u8 %r34, [%rd22];
	shl.b32 %r35, %r13, 2;
	and.b32 %r36, %r35, 4;
	shr.u32 %r37, %r34, %r36;
	cvt.u16.u32 %rs53, %r37;
	and.b16 %rs138, %rs53, 15;

$L__BB0_4:
	shl.b32 %r15, %r103, 3;
	setp.ge.s32 %p3, %r15, %r24;
	@%p3 bra $L__BB0_8;

	setp.eq.s16 %p4, %rs44, 0;
	mul.wide.s32 %rd23, %r15, 2;
	add.s64 %rd24, %rd5, %rd23;
	ld.global.v4.u32 {%r38, %r39, %r40, %r41}, [%rd24];
	mul.wide.s32 %rd25, %r24, 2;
	add.s64 %rd26, %rd24, %rd25;
	ld.global.v4.u32 {%r46, %r47, %r48, %r49}, [%rd26];
	shr.u16 %rs55, %rs138, 3;
	and.b16 %rs56, %rs55, 1;
	setp.eq.b16 %p5, %rs56, 1;
	and.pred %p6, %p4, %p5;
	selp.b16 %rs57, -16, 0, %p6;
	or.b16 %rs58, %rs57, %rs138;
	cvt.s16.s8 %rs59, %rs58;
	cvt.rn.f32.s16 %f4, %rs59;
	cvt.u16.u32 %rs5, %r12;
	and.b16 %rs6, %rs5, 15;
	mov.b32 {%rs7, %rs11}, %r38;
	mov.b32 {%rs8, %rs12}, %r46;
	shr.u32 %r54, %r12, 4;
	cvt.u16.u32 %rs9, %r54;
	and.b16 %rs10, %rs9, 15;
	shr.u32 %r55, %r12, 8;
	cvt.u16.u32 %rs13, %r55;
	and.b16 %rs14, %rs13, 15;
	mov.b32 {%rs15, %rs19}, %r39;
	mov.b32 {%rs16, %rs20}, %r47;
	shr.u32 %r56, %r12, 12;
	cvt.u16.u32 %rs17, %r56;
	and.b16 %rs18, %rs17, 15;
	shr.u32 %r57, %r12, 16;
	cvt.u16.u32 %rs21, %r57;
	and.b16 %rs22, %rs21, 15;
	mov.b32 {%rs23, %rs27}, %r40;
	mov.b32 {%rs24, %rs28}, %r48;
	shr.u32 %r58, %r12, 20;
	cvt.u16.u32 %rs25, %r58;
	and.b16 %rs26, %rs25, 15;
	shr.u32 %r59, %r12, 24;
	cvt.u16.u32 %rs29, %r59;
	and.b16 %rs30, %rs29, 15;
	mov.b32 {%rs31, %rs34}, %r41;
	mov.b32 {%rs32, %rs35}, %r49;
	shr.u32 %r60, %r12, 28;
	cvt.u16.u32 %rs33, %r60;
	@%p4 bra $L__BB0_7;

	cvt.rn.f32.s16 %f44, %rs6;
	sub.ftz.f32 %f45, %f44, %f4;
	mul.ftz.f32 %f46, %f27, %f45;
	// begin inline asm
	{ cvt.f32.f16 %f28, %rs7;}
	// end inline asm
	fma.rn.ftz.f32 %f47, %f46, %f28, %f184;
	// begin inline asm
	{ cvt.f32.f16 %f29, %rs8;}
	// end inline asm
	fma.rn.ftz.f32 %f48, %f46, %f29, %f185;
	cvt.rn.f32.s16 %f49, %rs10;
	sub.ftz.f32 %f50, %f49, %f4;
	mul.ftz.f32 %f51, %f27, %f50;
	// begin inline asm
	{ cvt.f32.f16 %f30, %rs11;}
	// end inline asm
	fma.rn.ftz.f32 %f52, %f51, %f30, %f47;
	// begin inline asm
	{ cvt.f32.f16 %f31, %rs12;}
	// end inline asm
	fma.rn.ftz.f32 %f53, %f51, %f31, %f48;
	cvt.rn.f32.s16 %f54, %rs14;
	sub.ftz.f32 %f55, %f54, %f4;
	mul.ftz.f32 %f56, %f27, %f55;
	// begin inline asm
	{ cvt.f32.f16 %f32, %rs15;}
	// end inline asm
	fma.rn.ftz.f32 %f57, %f56, %f32, %f52;
	// begin inline asm
	{ cvt.f32.f16 %f33, %rs16;}
	// end inline asm
	fma.rn.ftz.f32 %f58, %f56, %f33, %f53;
	cvt.rn.f32.s16 %f59, %rs18;
	sub.ftz.f32 %f60, %f59, %f4;
	mul.ftz.f32 %f61, %f27, %f60;
	// begin inline asm
	{ cvt.f32.f16 %f34, %rs19;}
	// end inline asm
	fma.rn.ftz.f32 %f62, %f61, %f34, %f57;
	// begin inline asm
	{ cvt.f32.f16 %f35, %rs20;}
	// end inline asm
	fma.rn.ftz.f32 %f63, %f61, %f35, %f58;
	cvt.rn.f32.s16 %f64, %rs22;
	sub.ftz.f32 %f65, %f64, %f4;
	mul.ftz.f32 %f66, %f27, %f65;
	// begin inline asm
	{ cvt.f32.f16 %f36, %rs23;}
	// end inline asm
	fma.rn.ftz.f32 %f67, %f66, %f36, %f62;
	// begin inline asm
	{ cvt.f32.f16 %f37, %rs24;}
	// end inline asm
	fma.rn.ftz.f32 %f68, %f66, %f37, %f63;
	cvt.rn.f32.s16 %f69, %rs26;
	sub.ftz.f32 %f70, %f69, %f4;
	mul.ftz.f32 %f71, %f27, %f70;
	// begin inline asm
	{ cvt.f32.f16 %f38, %rs27;}
	// end inline asm
	fma.rn.ftz.f32 %f72, %f71, %f38, %f67;
	// begin inline asm
	{ cvt.f32.f16 %f39, %rs28;}
	// end inline asm
	fma.rn.ftz.f32 %f73, %f71, %f39, %f68;
	cvt.rn.f32.s16 %f74, %rs30;
	sub.ftz.f32 %f75, %f74, %f4;
	mul.ftz.f32 %f76, %f27, %f75;
	// begin inline asm
	{ cvt.f32.f16 %f40, %rs31;}
	// end inline asm
	fma.rn.ftz.f32 %f77, %f76, %f40, %f72;
	// begin inline asm
	{ cvt.f32.f16 %f41, %rs32;}
	// end inline asm
	fma.rn.ftz.f32 %f78, %f76, %f41, %f73;
	cvt.rn.f32.s16 %f79, %rs33;
	sub.ftz.f32 %f80, %f79, %f4;
	mul.ftz.f32 %f81, %f27, %f80;
	// begin inline asm
	{ cvt.f32.f16 %f42, %rs34;}
	// end inline asm
	fma.rn.ftz.f32 %f184, %f81, %f42, %f77;
	// begin inline asm
	{ cvt.f32.f16 %f43, %rs35;}
	// end inline asm
	fma.rn.ftz.f32 %f185, %f81, %f43, %f78;
	bra.uni $L__BB0_8;

$L__BB0_7:
	shl.b16 %rs92, %rs5, 4;
	cvt.s16.s8 %rs93, %rs92;
	shr.s16 %rs94, %rs93, 7;
	and.b16 %rs95, %rs94, -16;
	or.b16 %rs96, %rs95, %rs6;
	cvt.rn.f32.s16 %f98, %rs96;
	sub.ftz.f32 %f99, %f98, %f4;
	mul.ftz.f32 %f100, %f27, %f99;
	// begin inline asm
	{ cvt.f32.f16 %f82, %rs7;}
	// end inline asm
	fma.rn.ftz.f32 %f101, %f100, %f82, %f184;
	// begin inline asm
	{ cvt.f32.f16 %f83, %rs8;}
	// end inline asm
	fma.rn.ftz.f32 %f102, %f100, %f83, %f185;
	shl.b16 %rs97, %rs9, 4;
	cvt.s16.s8 %rs98, %rs97;
	shr.s16 %rs99, %rs98, 7;
	and.b16 %rs100, %rs99, -16;
	or.b16 %rs101, %rs100, %rs10;
	cvt.rn.f32.s16 %f103, %rs101;
	sub.ftz.f32 %f104, %f103, %f4;
	mul.ftz.f32 %f105, %f27, %f104;
	// begin inline asm
	{ cvt.f32.f16 %f84, %rs11;}
	// end inline asm
	fma.rn.ftz.f32 %f106, %f105, %f84, %f101;
	// begin inline asm
	{ cvt.f32.f16 %f85, %rs12;}
	// end inline asm
	fma.rn.ftz.f32 %f107, %f105, %f85, %f102;
	shl.b16 %rs102, %rs13, 4;
	cvt.s16.s8 %rs103, %rs102;
	shr.s16 %rs104, %rs103, 7;
	and.b16 %rs105, %rs104, -16;
	or.b16 %rs106, %rs105, %rs14;
	cvt.rn.f32.s16 %f108, %rs106;
	sub.ftz.f32 %f109, %f108, %f4;
	mul.ftz.f32 %f110, %f27, %f109;
	// begin inline asm
	{ cvt.f32.f16 %f86, %rs15;}
	// end inline asm
	fma.rn.ftz.f32 %f111, %f110, %f86, %f106;
	// begin inline asm
	{ cvt.f32.f16 %f87, %rs16;}
	// end inline asm
	fma.rn.ftz.f32 %f112, %f110, %f87, %f107;
	shl.b16 %rs107, %rs17, 4;
	cvt.s16.s8 %rs108, %rs107;
	shr.s16 %rs109, %rs108, 7;
	and.b16 %rs110, %rs109, -16;
	or.b16 %rs111, %rs110, %rs18;
	cvt.rn.f32.s16 %f113, %rs111;
	sub.ftz.f32 %f114, %f113, %f4;
	mul.ftz.f32 %f115, %f27, %f114;
	// begin inline asm
	{ cvt.f32.f16 %f88, %rs19;}
	// end inline asm
	fma.rn.ftz.f32 %f116, %f115, %f88, %f111;
	// begin inline asm
	{ cvt.f32.f16 %f89, %rs20;}
	// end inline asm
	fma.rn.ftz.f32 %f117, %f115, %f89, %f112;
	shl.b16 %rs112, %rs21, 4;
	cvt.s16.s8 %rs113, %rs112;
	shr.s16 %rs114, %rs113, 7;
	and.b16 %rs115, %rs114, -16;
	or.b16 %rs116, %rs115, %rs22;
	cvt.rn.f32.s16 %f118, %rs116;
	sub.ftz.f32 %f119, %f118, %f4;
	mul.ftz.f32 %f120, %f27, %f119;
	// begin inline asm
	{ cvt.f32.f16 %f90, %rs23;}
	// end inline asm
	fma.rn.ftz.f32 %f121, %f120, %f90, %f116;
	// begin inline asm
	{ cvt.f32.f16 %f91, %rs24;}
	// end inline asm
	fma.rn.ftz.f32 %f122, %f120, %f91, %f117;
	shl.b16 %rs117, %rs25, 4;
	cvt.s16.s8 %rs118, %rs117;
	shr.s16 %rs119, %rs118, 7;
	and.b16 %rs120, %rs119, -16;
	or.b16 %rs121, %rs120, %rs26;
	cvt.rn.f32.s16 %f123, %rs121;
	sub.ftz.f32 %f124, %f123, %f4;
	mul.ftz.f32 %f125, %f27, %f124;
	// begin inline asm
	{ cvt.f32.f16 %f92, %rs27;}
	// end inline asm
	fma.rn.ftz.f32 %f126, %f125, %f92, %f121;
	// begin inline asm
	{ cvt.f32.f16 %f93, %rs28;}
	// end inline asm
	fma.rn.ftz.f32 %f127, %f125, %f93, %f122;
	shl.b16 %rs122, %rs29, 4;
	cvt.s16.s8 %rs123, %rs122;
	shr.s16 %rs124, %rs123, 7;
	and.b16 %rs125, %rs124, -16;
	or.b16 %rs126, %rs125, %rs30;
	cvt.rn.f32.s16 %f128, %rs126;
	sub.ftz.f32 %f129, %f128, %f4;
	mul.ftz.f32 %f130, %f27, %f129;
	// begin inline asm
	{ cvt.f32.f16 %f94, %rs31;}
	// end inline asm
	fma.rn.ftz.f32 %f131, %f130, %f94, %f126;
	// begin inline asm
	{ cvt.f32.f16 %f95, %rs32;}
	// end inline asm
	fma.rn.ftz.f32 %f132, %f130, %f95, %f127;
	shl.b16 %rs127, %rs33, 4;
	cvt.s16.s8 %rs128, %rs127;
	shr.s16 %rs129, %rs128, 7;
	and.b16 %rs130, %rs129, -16;
	or.b16 %rs131, %rs130, %rs33;
	cvt.rn.f32.s16 %f133, %rs131;
	sub.ftz.f32 %f134, %f133, %f4;
	mul.ftz.f32 %f135, %f27, %f134;
	// begin inline asm
	{ cvt.f32.f16 %f96, %rs34;}
	// end inline asm
	fma.rn.ftz.f32 %f184, %f135, %f96, %f131;
	// begin inline asm
	{ cvt.f32.f16 %f97, %rs35;}
	// end inline asm
	fma.rn.ftz.f32 %f185, %f135, %f97, %f132;

$L__BB0_8:
	add.s32 %r104, %r104, 4;
	shl.b32 %r61, %r104, 5;
	add.s32 %r103, %r61, %r3;
	setp.lt.u32 %p7, %r103, %r26;
	@%p7 bra $L__BB0_2;

$L__BB0_9:
	shl.b32 %r62, %r4, 2;
	mov.u32 %r63, _ZZ9gemv_int4ILi4ELi32ELi2EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage;
	add.s32 %r64, %r63, %r62;
	setp.lt.u32 %p8, %r4, 32;
	@%p8 bra $L__BB0_11;

	add.s32 %r99, %r64, -112;
	st.shared.f32 [%r99], %f184;

$L__BB0_11:
	setp.gt.u32 %p9, %r4, 31;
	bar.sync 0;
	mad.lo.s32 %r19, %r4, 12, %r63;
	@%p9 bra $L__BB0_13;

	mov.u32 %r79, 16;
	ld.shared.f32 %f151, [%r19+16];
	add.ftz.f32 %f152, %f184, %f151;
	ld.shared.f32 %f153, [%r19+20];
	add.ftz.f32 %f154, %f152, %f153;
	ld.shared.f32 %f155, [%r19+24];
	add.ftz.f32 %f138, %f154, %f155;
	mov.u32 %r67, 1;
	mov.u32 %r80, 31;
	mov.u32 %r81, -1;
	// begin inline asm
	{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f138, %r67, %r80, %r81; @p add.f32 r0, r0, %f138; mov.f32 %f136, r0;}
	// end inline asm
	mov.u32 %r70, 2;
	// begin inline asm
	{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f136, %r70, %r80, %r81; @p add.f32 r0, r0, %f136; mov.f32 %f139, r0;}
	// end inline asm
	mov.u32 %r73, 4;
	// begin inline asm
	{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f139, %r73, %r80, %r81; @p add.f32 r0, r0, %f139; mov.f32 %f142, r0;}
	// end inline asm
	mov.u32 %r76, 8;
	// begin inline asm
	{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f142, %r76, %r80, %r81; @p add.f32 r0, r0, %f142; mov.f32 %f145, r0;}
	// end inline asm
	// begin inline asm
	{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f145, %r79, %r80, %r81; @p add.f32 r0, r0, %f145; mov.f32 %f184, r0;}
	// end inline asm

$L__BB0_13:
	@%p8 bra $L__BB0_15;

	add.s32 %r100, %r64, -112;
	st.shared.f32 [%r100+640], %f185;

$L__BB0_15:
	bar.sync 0;
	@%p9 bra $L__BB0_17;

	ld.shared.f32 %f171, [%r19+656];
	add.ftz.f32 %f172, %f185, %f171;
	ld.shared.f32 %f173, [%r19+660];
	add.ftz.f32 %f174, %f172, %f173;
	ld.shared.f32 %f175, [%r19+664];
	add.ftz.f32 %f158, %f174, %f175;
	mov.u32 %r83, 1;
	mov.u32 %r96, 31;
	mov.u32 %r97, -1;
	// begin inline asm
	{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f158, %r83, %r96, %r97; @p add.f32 r0, r0, %f158; mov.f32 %f156, r0;}
	// end inline asm
	mov.u32 %r86, 2;
	// begin inline asm
	{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f156, %r86, %r96, %r97; @p add.f32 r0, r0, %f156; mov.f32 %f159, r0;}
	// end inline asm
	mov.u32 %r89, 4;
	// begin inline asm
	{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f159, %r89, %r96, %r97; @p add.f32 r0, r0, %f159; mov.f32 %f162, r0;}
	// end inline asm
	mov.u32 %r92, 8;
	// begin inline asm
	{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f162, %r92, %r96, %r97; @p add.f32 r0, r0, %f162; mov.f32 %f165, r0;}
	// end inline asm
	mov.u32 %r95, 16;
	// begin inline asm
	{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f165, %r95, %r96, %r97; @p add.f32 r0, r0, %f165; mov.f32 %f185, r0;}
	// end inline asm

$L__BB0_17:
	mov.u32 %r101, %tid.y;
	or.b32 %r98, %r3, %r101;
	setp.ne.s32 %p12, %r98, 0;
	@%p12 bra $L__BB0_21;

	mov.u32 %r102, %ctaid.x;
	setp.eq.s64 %p13, %rd12, 0;
	mul.ftz.f32 %f17, %f23, %f184;
	cvt.s64.s32 %rd7, %r102;
	cvta.to.global.u64 %rd27, %rd11;
	mul.wide.s32 %rd28, %r102, 2;
	add.s64 %rd8, %rd27, %rd28;
	mul.ftz.f32 %f18, %f23, %f185;
	cvt.s64.s32 %rd9, %r25;
	mul.wide.s32 %rd29, %r25, 2;
	add.s64 %rd10, %rd8, %rd29;
	@%p13 bra $L__BB0_20;

	cvta.to.global.u64 %rd30, %rd12;
	shl.b64 %rd31, %rd7, 1;
	add.s64 %rd32, %rd30, %rd31;
	ld.global.u16 %rs132, [%rd32];
	// begin inline asm
	{ cvt.f32.f16 %f176, %rs132;}
	// end inline asm
	fma.rn.ftz.f32 %f177, %f24, %f176, %f17;
	// begin inline asm
	{ cvt.rn.f16.f32 %rs133, %f177;}
	// end inline asm
	st.global.u16 [%rd8], %rs133;
	shl.b64 %rd33, %rd9, 1;
	add.s64 %rd34, %rd32, %rd33;
	ld.global.u16 %rs134, [%rd34];
	// begin inline asm
	{ cvt.f32.f16 %f178, %rs134;}
	// end inline asm
	fma.rn.ftz.f32 %f179, %f24, %f178, %f18;
	// begin inline asm
	{ cvt.rn.f16.f32 %rs135, %f179;}
	// end inline asm
	st.global.u16 [%rd10], %rs135;
	bra.uni $L__BB0_21;

$L__BB0_20:
	// begin inline asm
	{ cvt.rn.f16.f32 %rs136, %f17;}
	// end inline asm
	st.global.u16 [%rd8], %rs136;
	// begin inline asm
	{ cvt.rn.f16.f32 %rs137, %f18;}
	// end inline asm
	st.global.u16 [%rd10], %rs137;

$L__BB0_21:
	ret;

}
	// .globl	_ZN3cub11EmptyKernelIvEEvv
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
	ret;
}
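For readability, here is a rough CUDA-level sketch of the algorithm the PTX above implements: a half-precision GEMV over int4 weights quantized in groups of 32, accumulating two output columns (batch = 2) per block of four warps, followed by a shared-memory plus warp-shuffle block reduction. It is an illustration only, not a reconstruction of the original source: the struct layout, field names, indexing scheme, and the simplified reduction are all assumptions, and the signed-weight path and the beta/bias epilogue visible in the PTX are omitted.

// Hypothetical sketch of a dequantizing int4 GEMV (group size 32, batch 2).
#include <cuda_fp16.h>

struct DequantGemvParamsSketch {          // assumed layout, not the real 80-byte struct
    half*                out;             // [2, rows] output, one column per batch
    const half*          x;               // [2, cols] input vectors
    const unsigned int*  qweight;         // 8 int4 weights packed per uint32, row-major
    const unsigned char* qzeros;          // 2 int4 zero points per byte, or nullptr
    const half*          scales;          // one half scale per 32-element group
    int   rows, cols;
    float alpha;
};

// One block per output row; 4 warps (128 threads) cooperate on the dot product.
__global__ void dequant_gemv_group32_batch2_sketch(DequantGemvParamsSketch p) {
    const int row    = blockIdx.x;
    const int lane   = threadIdx.x;            // 0..31
    const int warp   = threadIdx.y;            // 0..3
    const int thread = warp * 32 + lane;

    float acc0 = 0.f, acc1 = 0.f;              // accumulators for the two batch columns

    // Each iteration a thread owns one packed uint32, i.e. 8 consecutive int4 weights.
    for (int g = thread; g * 8 < p.cols; g += 128) {
        unsigned int packed = p.qweight[row * (p.cols / 8) + g];
        int   group = g / 4;                                   // 4 uint32s per group of 32
        float scale = __half2float(p.scales[row * (p.cols / 32) + group]);
        float zero  = 8.f;                                     // default zero point
        if (p.qzeros) {
            unsigned char zbyte = p.qzeros[(row * (p.cols / 32) + group) / 2];
            zero = (float)((zbyte >> (4 * (group & 1))) & 0xF);
        }
        for (int k = 0; k < 8; ++k) {
            float q = (float)((packed >> (4 * k)) & 0xF);
            float w = scale * (q - zero);                      // dequantized weight
            acc0 += w * __half2float(p.x[g * 8 + k]);
            acc1 += w * __half2float(p.x[p.cols + g * 8 + k]);
        }
    }

    // Block reduction: partial sums through shared memory, then warp shuffles
    // (the real kernel routes this through cub::BlockReduce temp storage).
    __shared__ float smem[2][4][32];
    smem[0][warp][lane] = acc0;
    smem[1][warp][lane] = acc1;
    __syncthreads();
    if (warp == 0) {
        acc0 = smem[0][0][lane] + smem[0][1][lane] + smem[0][2][lane] + smem[0][3][lane];
        acc1 = smem[1][0][lane] + smem[1][1][lane] + smem[1][2][lane] + smem[1][3][lane];
        for (int off = 16; off > 0; off >>= 1) {
            acc0 += __shfl_down_sync(0xffffffffu, acc0, off);
            acc1 += __shfl_down_sync(0xffffffffu, acc1, off);
        }
        if (lane == 0) {
            p.out[row]          = __float2half(p.alpha * acc0);
            p.out[p.rows + row] = __float2half(p.alpha * acc1);
        }
    }
}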