// Cuda compilation tools, release 11.8, V11.8.85
// Based on NVVM 7.0.1
//

.version 7.8
.target sm_80
.address_size 64

// .globl _Z27dequant_gemv_group32_batch823DequantGemvKernelParams
// _ZZ9gemv_int4ILi4ELi32ELi8EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage has been demoted

.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a5ccceb06thrust6system6detail10sequential3seqE[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a5ccceb06thrust12placeholders2_1E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a5ccceb06thrust12placeholders2_2E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a5ccceb06thrust12placeholders2_3E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a5ccceb06thrust12placeholders2_4E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a5ccceb06thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a5ccceb06thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a5ccceb06thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a5ccceb06thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a5ccceb06thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a5ccceb06thrust12placeholders3_10E[1];

.visible .entry _Z27dequant_gemv_group32_batch823DequantGemvKernelParams(
.param .align 8 .b8 _Z27dequant_gemv_group32_batch823DequantGemvKernelParams_param_0[80]
)
{
.reg .pred %p<33>;
.reg .b16 %rs<292>;
.reg .f32 %f<610>;
.reg .b32 %r<285>;
.reg .b64 %rd<98>;
// demoted variable
.shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi32ELi8EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[5120];

ld.param.v2.u32 {%r54, %r55}, [_Z27dequant_gemv_group32_batch823DequantGemvKernelParams_param_0+48];
ld.param.v2.u32 {%r56, %r57}, [_Z27dequant_gemv_group32_batch823DequantGemvKernelParams_param_0+56];
ld.param.v2.f32 {%f95, %f96}, [_Z27dequant_gemv_group32_batch823DequantGemvKernelParams_param_0+64];
ld.param.v4.u8 {%rs69, %rs70, %rs71, %rs72}, [_Z27dequant_gemv_group32_batch823DequantGemvKernelParams_param_0+72];
ld.param.u64 %rd28, [_Z27dequant_gemv_group32_batch823DequantGemvKernelParams_param_0+40];
ld.param.u64 %rd27, [_Z27dequant_gemv_group32_batch823DequantGemvKernelParams_param_0+32];
ld.param.u64 %rd26, [_Z27dequant_gemv_group32_batch823DequantGemvKernelParams_param_0+24];
ld.param.u64 %rd25, [_Z27dequant_gemv_group32_batch823DequantGemvKernelParams_param_0+16];
ld.param.u64 %rd24, [_Z27dequant_gemv_group32_batch823DequantGemvKernelParams_param_0+8];
cvta.to.global.u64 %rd2, %rd24;
mov.u32 %r284, %tid.y;
shl.b32 %r58, %r284, 5;
mov.u32 %r59, %tid.x;
add.s32 %r283, %r58, %r59;
setp.ge.u32 %p1, %r283, %r56;
mov.f32 %f578, 0f00000000;
mov.f32 %f579, %f578;
mov.f32 %f580, %f578;
mov.f32 %f581, %f578;
mov.f32 %f582, %f578;
mov.f32 %f583, %f578;
mov.f32 %f584, %f578;
mov.f32 %f585, %f578;
@%p1 bra $L__BB0_9;

cvta.to.global.u64 %rd3, %rd25;
mov.u32 %r60, %ctaid.x;
mul.lo.s32 %r5, %r57, %r60;

$L__BB0_2:
mad.lo.s32 %r63, %r56, %r60, %r283;
mul.wide.u32 %rd35, %r63, 4;
add.s64 %rd30, %rd26, %rd35;
// begin inline asm
createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd29, 1.0;
// end inline asm
// begin inline asm
ld.global.L1::no_allocate.L2::cache_hint.u32 %r61, [%rd30], %rd29;
// end inline asm
shr.u32 %r65, %r59, 2;
shl.b32 %r66, %r284, 3;
add.s32 %r9, %r66, %r65;
add.s32 %r10, %r9, %r5;
mul.wide.s32 %rd36, %r10, 2;
add.s64 %rd33, %rd28, %rd36;
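// %r61 now holds one packed word of eight int4 weights for this thread's
// group; %rd33 points at the matching fp16 group scale, fetched next with
// an L1::evict_last / L2 cache-hint load.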
// begin inline asm
createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd32, 1.0;
// end inline asm
// begin inline asm
ld.global.L1::evict_last.L2::cache_hint.u16 %rs77, [%rd33], %rd32;
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f105, %rs77;}
// end inline asm
shl.b16 %rs291, %rs69, 3;
setp.eq.s64 %p2, %rd27, 0;
@%p2 bra $L__BB0_4;

shr.u32 %r67, %r10, 31;
add.s32 %r68, %r10, %r67;
shr.s32 %r69, %r68, 1;
cvt.s64.s32 %rd40, %r69;
add.s64 %rd38, %rd27, %rd40;
// begin inline asm
createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd37, 1.0;
// end inline asm
// begin inline asm
ld.global.L1::evict_last.L2::cache_hint.u8 %rs79, [%rd38], %rd37;
// end inline asm
cvt.u32.u16 %r70, %rs79;
and.b32 %r71, %r70, 255;
shl.b32 %r72, %r9, 2;
and.b32 %r73, %r72, 4;
shr.u32 %r74, %r71, %r73;
cvt.u16.u32 %rs80, %r74;
and.b16 %rs291, %rs80, 15;

$L__BB0_4:
shl.b32 %r11, %r283, 3;
setp.ge.s32 %p3, %r11, %r54;
@%p3 bra $L__BB0_8;

setp.eq.s16 %p4, %rs69, 0;
mul.wide.s32 %rd41, %r11, 2;
add.s64 %rd42, %rd3, %rd41;
ld.global.v4.u32 {%r75, %r76, %r77, %r78}, [%rd42];
mul.wide.s32 %rd43, %r54, 2;
add.s64 %rd44, %rd42, %rd43;
ld.global.v4.u32 {%r79, %r80, %r81, %r82}, [%rd44];
add.s32 %r83, %r54, %r11;
add.s32 %r84, %r83, %r54;
mul.wide.s32 %rd45, %r84, 2;
add.s64 %rd46, %rd3, %rd45;
ld.global.v4.u32 {%r85, %r86, %r87, %r88}, [%rd46];
add.s64 %rd47, %rd46, %rd43;
ld.global.v4.u32 {%r89, %r90, %r91, %r92}, [%rd47];
add.s64 %rd48, %rd47, %rd43;
ld.global.v4.u32 {%r93, %r94, %r95, %r96}, [%rd48];
add.s64 %rd49, %rd48, %rd43;
ld.global.v4.u32 {%r97, %r98, %r99, %r100}, [%rd49];
add.s64 %rd50, %rd49, %rd43;
ld.global.v4.u32 {%r101, %r102, %r103, %r104}, [%rd50];
add.s64 %rd51, %rd50, %rd43;
ld.global.v4.u32 {%r105, %r106, %r107, %r108}, [%rd51];
shr.u16 %rs82, %rs291, 3;
and.b16 %rs83, %rs82, 1;
setp.eq.b16 %p5, %rs83, 1;
and.pred %p6, %p4, %p5;
selp.b16 %rs84, -16, 0, %p6;
or.b16 %rs85, %rs84, %rs291;
cvt.s16.s8 %rs86, %rs85;
cvt.rn.f32.s16 %f10, %rs86;
cvt.u16.u32 %rs5, %r61;
and.b16 %rs6, %rs5, 15;
cvt.u16.u32 %rs7, %r75;
mov.b32 {%rs10, %rs9}, %r79;
mov.b32 {%rs13, %rs12}, %r85;
mov.b32 {%rs16, %rs15}, %r89;
mov.b32 {%rs19, %rs18}, %r93;
mov.b32 {%rs22, %rs21}, %r97;
mov.b32 {%rs25, %rs24}, %r101;
mov.b32 {%rs28, %rs27}, %r105;
shr.u32 %r109, %r61, 4;
cvt.u16.u32 %rs29, %r109;
and.b16 %rs30, %rs29, 15;
shr.u32 %r110, %r61, 8;
cvt.u16.u32 %rs31, %r110;
and.b16 %rs32, %rs31, 15;
shr.u32 %r111, %r61, 12;
cvt.u16.u32 %rs33, %r111;
and.b16 %rs34, %rs33, 15;
mov.b32 {%rs87, %rs35}, %r98;
mov.b32 {%rs88, %rs36}, %r102;
mov.b32 {%rs89, %rs37}, %r106;
shr.u32 %r112, %r61, 16;
cvt.u16.u32 %rs38, %r112;
and.b16 %rs39, %rs38, 15;
mov.b32 {%rs40, %rs50}, %r77;
mov.b32 {%rs41, %rs51}, %r81;
mov.b32 {%rs42, %rs52}, %r87;
mov.b32 {%rs43, %rs53}, %r91;
mov.b32 {%rs44, %rs54}, %r95;
mov.b32 {%rs45, %rs55}, %r99;
mov.b32 {%rs46, %rs56}, %r103;
mov.b32 {%rs47, %rs57}, %r107;
shr.u32 %r113, %r61, 20;
cvt.u16.u32 %rs48, %r113;
and.b16 %rs49, %rs48, 15;
shr.u32 %r114, %r61, 24;
cvt.u16.u32 %rs58, %r114;
and.b16 %rs59, %rs58, 15;
shr.u32 %r115, %r61, 28;
cvt.u16.u32 %rs60, %r115;
@%p4 bra $L__BB0_7;

cvt.rn.f32.s16 %f170, %rs6;
sub.ftz.f32 %f171, %f170, %f10;
mul.ftz.f32 %f172, %f105, %f171;
// begin inline asm
{ cvt.f32.f16 %f106, %rs7;}
// end inline asm
fma.rn.ftz.f32 %f173, %f172, %f106, %f578;
// begin inline asm
{ cvt.f32.f16 %f107, %rs10;}
// end inline asm
fma.rn.ftz.f32 %f174, %f172, %f107, %f579;
// begin inline asm
{ cvt.f32.f16 %f108, %rs13;}
// end inline asm
fma.rn.ftz.f32 %f175, %f172, %f108, %f580;
// begin inline asm
{ cvt.f32.f16 %f109, %rs16;}
// end inline asm
fma.rn.ftz.f32 %f176, %f172, %f109, %f581;
// begin inline asm
{ cvt.f32.f16 %f110, %rs19;}
// end inline asm
fma.rn.ftz.f32 %f177, %f172, %f110, %f582;
// begin inline asm
{ cvt.f32.f16 %f111, %rs22;}
// end inline asm
fma.rn.ftz.f32 %f178, %f172, %f111, %f583;
// begin inline asm
{ cvt.f32.f16 %f112, %rs25;}
// end inline asm
fma.rn.ftz.f32 %f179, %f172, %f112, %f584;
// begin inline asm
{ cvt.f32.f16 %f113, %rs28;}
// end inline asm
fma.rn.ftz.f32 %f180, %f172, %f113, %f585;
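// Nibble 0 of %r61 has been dequantized ((w - zero) * scale) and FMA'd into
// all eight batch accumulators; nibbles 1-7 below repeat the same pattern
// against the remaining activation words.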
cvt.rn.f32.s16 %f181, %rs30;
sub.ftz.f32 %f182, %f181, %f10;
mul.ftz.f32 %f183, %f105, %f182;
mov.b32 {%rs154, %rs98}, %r75;
// begin inline asm
{ cvt.f32.f16 %f114, %rs98;}
// end inline asm
fma.rn.ftz.f32 %f184, %f183, %f114, %f173;
// begin inline asm
{ cvt.f32.f16 %f115, %rs9;}
// end inline asm
fma.rn.ftz.f32 %f185, %f183, %f115, %f174;
// begin inline asm
{ cvt.f32.f16 %f116, %rs12;}
// end inline asm
fma.rn.ftz.f32 %f186, %f183, %f116, %f175;
// begin inline asm
{ cvt.f32.f16 %f117, %rs15;}
// end inline asm
fma.rn.ftz.f32 %f187, %f183, %f117, %f176;
// begin inline asm
{ cvt.f32.f16 %f118, %rs18;}
// end inline asm
fma.rn.ftz.f32 %f188, %f183, %f118, %f177;
// begin inline asm
{ cvt.f32.f16 %f119, %rs21;}
// end inline asm
fma.rn.ftz.f32 %f189, %f183, %f119, %f178;
// begin inline asm
{ cvt.f32.f16 %f120, %rs24;}
// end inline asm
fma.rn.ftz.f32 %f190, %f183, %f120, %f179;
// begin inline asm
{ cvt.f32.f16 %f121, %rs27;}
// end inline asm
fma.rn.ftz.f32 %f191, %f183, %f121, %f180;
cvt.rn.f32.s16 %f192, %rs32;
sub.ftz.f32 %f193, %f192, %f10;
mul.ftz.f32 %f194, %f105, %f193;
mov.b32 {%rs106, %rs114}, %r76;
// begin inline asm
{ cvt.f32.f16 %f122, %rs106;}
// end inline asm
fma.rn.ftz.f32 %f195, %f194, %f122, %f184;
mov.b32 {%rs107, %rs115}, %r80;
// begin inline asm
{ cvt.f32.f16 %f123, %rs107;}
// end inline asm
fma.rn.ftz.f32 %f196, %f194, %f123, %f185;
mov.b32 {%rs108, %rs116}, %r86;
// begin inline asm
{ cvt.f32.f16 %f124, %rs108;}
// end inline asm
fma.rn.ftz.f32 %f197, %f194, %f124, %f186;
mov.b32 {%rs109, %rs117}, %r90;
// begin inline asm
{ cvt.f32.f16 %f125, %rs109;}
// end inline asm
fma.rn.ftz.f32 %f198, %f194, %f125, %f187;
mov.b32 {%rs110, %rs118}, %r94;
// begin inline asm
{ cvt.f32.f16 %f126, %rs110;}
// end inline asm
fma.rn.ftz.f32 %f199, %f194, %f126, %f188;
cvt.u16.u32 %rs111, %r98;
// begin inline asm
{ cvt.f32.f16 %f127, %rs111;}
// end inline asm
fma.rn.ftz.f32 %f200, %f194, %f127, %f189;
cvt.u16.u32 %rs112, %r102;
// begin inline asm
{ cvt.f32.f16 %f128, %rs112;}
// end inline asm
fma.rn.ftz.f32 %f201, %f194, %f128, %f190;
cvt.u16.u32 %rs113, %r106;
// begin inline asm
{ cvt.f32.f16 %f129, %rs113;}
// end inline asm
fma.rn.ftz.f32 %f202, %f194, %f129, %f191;
cvt.rn.f32.s16 %f203, %rs34;
sub.ftz.f32 %f204, %f203, %f10;
mul.ftz.f32 %f205, %f105, %f204;
// begin inline asm
{ cvt.f32.f16 %f130, %rs114;}
// end inline asm
fma.rn.ftz.f32 %f206, %f205, %f130, %f195;
// begin inline asm
{ cvt.f32.f16 %f131, %rs115;}
// end inline asm
fma.rn.ftz.f32 %f207, %f205, %f131, %f196;
// begin inline asm
{ cvt.f32.f16 %f132, %rs116;}
// end inline asm
fma.rn.ftz.f32 %f208, %f205, %f132, %f197;
// begin inline asm
{ cvt.f32.f16 %f133, %rs117;}
// end inline asm
fma.rn.ftz.f32 %f209, %f205, %f133, %f198;
// begin inline asm
{ cvt.f32.f16 %f134, %rs118;}
// end inline asm
fma.rn.ftz.f32 %f210, %f205, %f134, %f199;
// begin inline asm
{ cvt.f32.f16 %f135, %rs35;}
// end inline asm
fma.rn.ftz.f32 %f211, %f205, %f135, %f200;
// begin inline asm
{ cvt.f32.f16 %f136, %rs36;}
// end inline asm
fma.rn.ftz.f32 %f212, %f205, %f136, %f201;
// begin inline asm
{ cvt.f32.f16 %f137, %rs37;}
// end inline asm
fma.rn.ftz.f32 %f213, %f205, %f137, %f202;
cvt.rn.f32.s16 %f214, %rs39;
sub.ftz.f32 %f215, %f214, %f10;
mul.ftz.f32 %f216, %f105, %f215;
// begin inline asm
{ cvt.f32.f16 %f138, %rs40;}
// end inline asm
fma.rn.ftz.f32 %f217, %f216, %f138, %f206;
// begin inline asm
{ cvt.f32.f16 %f139, %rs41;}
// end inline asm
fma.rn.ftz.f32 %f218, %f216, %f139, %f207;
// begin inline asm
{ cvt.f32.f16 %f140, %rs42;}
// end inline asm
fma.rn.ftz.f32 %f219, %f216, %f140, %f208;
// begin inline asm
{ cvt.f32.f16 %f141, %rs43;}
// end inline asm
fma.rn.ftz.f32 %f220, %f216, %f141, %f209;
// begin inline asm
{ cvt.f32.f16 %f142, %rs44;}
// end inline asm
fma.rn.ftz.f32 %f221, %f216, %f142, %f210;
// begin inline asm
{ cvt.f32.f16 %f143, %rs45;}
// end inline asm
fma.rn.ftz.f32 %f222, %f216, %f143, %f211;
// begin inline asm
{ cvt.f32.f16 %f144, %rs46;}
// end inline asm
fma.rn.ftz.f32 %f223, %f216, %f144, %f212;
// begin inline asm
{ cvt.f32.f16 %f145, %rs47;}
// end inline asm
fma.rn.ftz.f32 %f224, %f216, %f145, %f213;
cvt.rn.f32.s16 %f225, %rs49;
sub.ftz.f32 %f226, %f225, %f10;
mul.ftz.f32 %f227, %f105, %f226;
// begin inline asm
{ cvt.f32.f16 %f146, %rs50;}
// end inline asm
fma.rn.ftz.f32 %f228, %f227, %f146, %f217;
// begin inline asm
{ cvt.f32.f16 %f147, %rs51;}
// end inline asm
fma.rn.ftz.f32 %f229, %f227, %f147, %f218;
// begin inline asm
{ cvt.f32.f16 %f148, %rs52;}
// end inline asm
fma.rn.ftz.f32 %f230, %f227, %f148, %f219;
// begin inline asm
{ cvt.f32.f16 %f149, %rs53;}
// end inline asm
fma.rn.ftz.f32 %f231, %f227, %f149, %f220;
// begin inline asm
{ cvt.f32.f16 %f150, %rs54;}
// end inline asm
fma.rn.ftz.f32 %f232, %f227, %f150, %f221;
// begin inline asm
{ cvt.f32.f16 %f151, %rs55;}
// end inline asm
fma.rn.ftz.f32 %f233, %f227, %f151, %f222;
// begin inline asm
{ cvt.f32.f16 %f152, %rs56;}
// end inline asm
fma.rn.ftz.f32 %f234, %f227, %f152, %f223;
// begin inline asm
{ cvt.f32.f16 %f153, %rs57;}
// end inline asm
fma.rn.ftz.f32 %f235, %f227, %f153, %f224;
cvt.rn.f32.s16 %f236, %rs59;
sub.ftz.f32 %f237, %f236, %f10;
mul.ftz.f32 %f238, %f105, %f237;
mov.b32 {%rs138, %rs146}, %r78;
// begin inline asm
{ cvt.f32.f16 %f154, %rs138;}
// end inline asm
fma.rn.ftz.f32 %f239, %f238, %f154, %f228;
mov.b32 {%rs139, %rs147}, %r82;
// begin inline asm
{ cvt.f32.f16 %f155, %rs139;}
// end inline asm
fma.rn.ftz.f32 %f240, %f238, %f155, %f229;
mov.b32 {%rs140, %rs148}, %r88;
// begin inline asm
{ cvt.f32.f16 %f156, %rs140;}
// end inline asm
fma.rn.ftz.f32 %f241, %f238, %f156, %f230;
mov.b32 {%rs141, %rs149}, %r92;
// begin inline asm
{ cvt.f32.f16 %f157, %rs141;}
// end inline asm
fma.rn.ftz.f32 %f242, %f238, %f157, %f231;
mov.b32 {%rs142, %rs150}, %r96;
// begin inline asm
{ cvt.f32.f16 %f158, %rs142;}
// end inline asm
fma.rn.ftz.f32 %f243, %f238, %f158, %f232;
mov.b32 {%rs143, %rs151}, %r100;
// begin inline asm
{ cvt.f32.f16 %f159, %rs143;}
// end inline asm
fma.rn.ftz.f32 %f244, %f238, %f159, %f233;
mov.b32 {%rs144, %rs152}, %r104;
// begin inline asm
{ cvt.f32.f16 %f160, %rs144;}
// end inline asm
fma.rn.ftz.f32 %f245, %f238, %f160, %f234;
mov.b32 {%rs145, %rs153}, %r108;
// begin inline asm
{ cvt.f32.f16 %f161, %rs145;}
// end inline asm
fma.rn.ftz.f32 %f246, %f238, %f161, %f235;
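// Last nibble (bits 28-31): its FMAs write the loop-carried accumulators
// %f578-%f585 directly, then control rejoins the group loop at $L__BB0_8.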
cvt.rn.f32.s16 %f247, %rs60;
sub.ftz.f32 %f248, %f247, %f10;
mul.ftz.f32 %f249, %f105, %f248;
// begin inline asm
{ cvt.f32.f16 %f162, %rs146;}
// end inline asm
fma.rn.ftz.f32 %f578, %f249, %f162, %f239;
// begin inline asm
{ cvt.f32.f16 %f163, %rs147;}
// end inline asm
fma.rn.ftz.f32 %f579, %f249, %f163, %f240;
// begin inline asm
{ cvt.f32.f16 %f164, %rs148;}
// end inline asm
fma.rn.ftz.f32 %f580, %f249, %f164, %f241;
// begin inline asm
{ cvt.f32.f16 %f165, %rs149;}
// end inline asm
fma.rn.ftz.f32 %f581, %f249, %f165, %f242;
// begin inline asm
{ cvt.f32.f16 %f166, %rs150;}
// end inline asm
fma.rn.ftz.f32 %f582, %f249, %f166, %f243;
// begin inline asm
{ cvt.f32.f16 %f167, %rs151;}
// end inline asm
fma.rn.ftz.f32 %f583, %f249, %f167, %f244;
// begin inline asm
{ cvt.f32.f16 %f168, %rs152;}
// end inline asm
fma.rn.ftz.f32 %f584, %f249, %f168, %f245;
// begin inline asm
{ cvt.f32.f16 %f169, %rs153;}
// end inline asm
fma.rn.ftz.f32 %f585, %f249, %f169, %f246;
bra.uni $L__BB0_8;
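// Signed-weight path: same unrolled FMA cascade, except each 4-bit field is
// sign-extended in registers (shl.b16 4, cvt.s16.s8, shr.s16 7, or.b16)
// before the (w - zero) * scale multiply.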
$L__BB0_7:
shl.b16 %rs219, %rs5, 4;
cvt.s16.s8 %rs220, %rs219;
shr.s16 %rs221, %rs220, 7;
and.b16 %rs222, %rs221, -16;
or.b16 %rs223, %rs222, %rs6;
cvt.rn.f32.s16 %f314, %rs223;
sub.ftz.f32 %f315, %f314, %f10;
mul.ftz.f32 %f316, %f105, %f315;
// begin inline asm
{ cvt.f32.f16 %f250, %rs7;}
// end inline asm
fma.rn.ftz.f32 %f317, %f316, %f250, %f578;
// begin inline asm
{ cvt.f32.f16 %f251, %rs10;}
// end inline asm
fma.rn.ftz.f32 %f318, %f316, %f251, %f579;
// begin inline asm
{ cvt.f32.f16 %f252, %rs13;}
// end inline asm
fma.rn.ftz.f32 %f319, %f316, %f252, %f580;
// begin inline asm
{ cvt.f32.f16 %f253, %rs16;}
// end inline asm
fma.rn.ftz.f32 %f320, %f316, %f253, %f581;
// begin inline asm
{ cvt.f32.f16 %f254, %rs19;}
// end inline asm
fma.rn.ftz.f32 %f321, %f316, %f254, %f582;
// begin inline asm
{ cvt.f32.f16 %f255, %rs22;}
// end inline asm
fma.rn.ftz.f32 %f322, %f316, %f255, %f583;
// begin inline asm
{ cvt.f32.f16 %f256, %rs25;}
// end inline asm
fma.rn.ftz.f32 %f323, %f316, %f256, %f584;
// begin inline asm
{ cvt.f32.f16 %f257, %rs28;}
// end inline asm
fma.rn.ftz.f32 %f324, %f316, %f257, %f585;
shl.b16 %rs224, %rs29, 4;
cvt.s16.s8 %rs225, %rs224;
shr.s16 %rs226, %rs225, 7;
and.b16 %rs227, %rs226, -16;
or.b16 %rs228, %rs227, %rs30;
cvt.rn.f32.s16 %f325, %rs228;
sub.ftz.f32 %f326, %f325, %f10;
mul.ftz.f32 %f327, %f105, %f326;
mov.b32 {%rs229, %rs163}, %r75;
// begin inline asm
{ cvt.f32.f16 %f258, %rs163;}
// end inline asm
fma.rn.ftz.f32 %f328, %f327, %f258, %f317;
// begin inline asm
{ cvt.f32.f16 %f259, %rs9;}
// end inline asm
fma.rn.ftz.f32 %f329, %f327, %f259, %f318;
// begin inline asm
{ cvt.f32.f16 %f260, %rs12;}
// end inline asm
fma.rn.ftz.f32 %f330, %f327, %f260, %f319;
// begin inline asm
{ cvt.f32.f16 %f261, %rs15;}
// end inline asm
fma.rn.ftz.f32 %f331, %f327, %f261, %f320;
// begin inline asm
{ cvt.f32.f16 %f262, %rs18;}
// end inline asm
fma.rn.ftz.f32 %f332, %f327, %f262, %f321;
// begin inline asm
{ cvt.f32.f16 %f263, %rs21;}
// end inline asm
fma.rn.ftz.f32 %f333, %f327, %f263, %f322;
// begin inline asm
{ cvt.f32.f16 %f264, %rs24;}
// end inline asm
fma.rn.ftz.f32 %f334, %f327, %f264, %f323;
// begin inline asm
{ cvt.f32.f16 %f265, %rs27;}
// end inline asm
fma.rn.ftz.f32 %f335, %f327, %f265, %f324;
shl.b16 %rs237, %rs31, 4;
cvt.s16.s8 %rs238, %rs237;
shr.s16 %rs239, %rs238, 7;
and.b16 %rs240, %rs239, -16;
or.b16 %rs241, %rs240, %rs32;
cvt.rn.f32.s16 %f336, %rs241;
sub.ftz.f32 %f337, %f336, %f10;
mul.ftz.f32 %f338, %f105, %f337;
mov.b32 {%rs171, %rs179}, %r76;
// begin inline asm
{ cvt.f32.f16 %f266, %rs171;}
// end inline asm
fma.rn.ftz.f32 %f339, %f338, %f266, %f328;
mov.b32 {%rs172, %rs180}, %r80;
// begin inline asm
{ cvt.f32.f16 %f267, %rs172;}
// end inline asm
fma.rn.ftz.f32 %f340, %f338, %f267, %f329;
mov.b32 {%rs173, %rs181}, %r86;
// begin inline asm
{ cvt.f32.f16 %f268, %rs173;}
// end inline asm
fma.rn.ftz.f32 %f341, %f338, %f268, %f330;
mov.b32 {%rs174, %rs182}, %r90;
// begin inline asm
{ cvt.f32.f16 %f269, %rs174;}
// end inline asm
fma.rn.ftz.f32 %f342, %f338, %f269, %f331;
mov.b32 {%rs175, %rs183}, %r94;
// begin inline asm
{ cvt.f32.f16 %f270, %rs175;}
// end inline asm
fma.rn.ftz.f32 %f343, %f338, %f270, %f332;
cvt.u16.u32 %rs176, %r98;
// begin inline asm
{ cvt.f32.f16 %f271, %rs176;}
// end inline asm
fma.rn.ftz.f32 %f344, %f338, %f271, %f333;
cvt.u16.u32 %rs177, %r102;
// begin inline asm
{ cvt.f32.f16 %f272, %rs177;}
// end inline asm
fma.rn.ftz.f32 %f345, %f338, %f272, %f334;
cvt.u16.u32 %rs178, %r106;
// begin inline asm
{ cvt.f32.f16 %f273, %rs178;}
// end inline asm
fma.rn.ftz.f32 %f346, %f338, %f273, %f335;
shl.b16 %rs242, %rs33, 4;
cvt.s16.s8 %rs243, %rs242;
shr.s16 %rs244, %rs243, 7;
and.b16 %rs245, %rs244, -16;
or.b16 %rs246, %rs245, %rs34;
cvt.rn.f32.s16 %f347, %rs246;
sub.ftz.f32 %f348, %f347, %f10;
mul.ftz.f32 %f349, %f105, %f348;
// begin inline asm
{ cvt.f32.f16 %f274, %rs179;}
// end inline asm
fma.rn.ftz.f32 %f350, %f349, %f274, %f339;
// begin inline asm
{ cvt.f32.f16 %f275, %rs180;}
// end inline asm
fma.rn.ftz.f32 %f351, %f349, %f275, %f340;
// begin inline asm
{ cvt.f32.f16 %f276, %rs181;}
// end inline asm
fma.rn.ftz.f32 %f352, %f349, %f276, %f341;
// begin inline asm
{ cvt.f32.f16 %f277, %rs182;}
// end inline asm
fma.rn.ftz.f32 %f353, %f349, %f277, %f342;
// begin inline asm
{ cvt.f32.f16 %f278, %rs183;}
// end inline asm
fma.rn.ftz.f32 %f354, %f349, %f278, %f343;
// begin inline asm
{ cvt.f32.f16 %f279, %rs35;}
// end inline asm
fma.rn.ftz.f32 %f355, %f349, %f279, %f344;
// begin inline asm
{ cvt.f32.f16 %f280, %rs36;}
// end inline asm
fma.rn.ftz.f32 %f356, %f349, %f280, %f345;
// begin inline asm
{ cvt.f32.f16 %f281, %rs37;}
// end inline asm
fma.rn.ftz.f32 %f357, %f349, %f281, %f346;
shl.b16 %rs247, %rs38, 4;
cvt.s16.s8 %rs248, %rs247;
shr.s16 %rs249, %rs248, 7;
and.b16 %rs250, %rs249, -16;
or.b16 %rs251, %rs250, %rs39;
cvt.rn.f32.s16 %f358, %rs251;
sub.ftz.f32 %f359, %f358, %f10;
mul.ftz.f32 %f360, %f105, %f359;
// begin inline asm
{ cvt.f32.f16 %f282, %rs40;}
// end inline asm
fma.rn.ftz.f32 %f361, %f360, %f282, %f350;
// begin inline asm
{ cvt.f32.f16 %f283, %rs41;}
// end inline asm
fma.rn.ftz.f32 %f362, %f360, %f283, %f351;
// begin inline asm
{ cvt.f32.f16 %f284, %rs42;}
// end inline asm
fma.rn.ftz.f32 %f363, %f360, %f284, %f352;
// begin inline asm
{ cvt.f32.f16 %f285, %rs43;}
// end inline asm
fma.rn.ftz.f32 %f364, %f360, %f285, %f353;
// begin inline asm
{ cvt.f32.f16 %f286, %rs44;}
// end inline asm
fma.rn.ftz.f32 %f365, %f360, %f286, %f354;
// begin inline asm
{ cvt.f32.f16 %f287, %rs45;}
// end inline asm
fma.rn.ftz.f32 %f366, %f360, %f287, %f355;
// begin inline asm
{ cvt.f32.f16 %f288, %rs46;}
// end inline asm
fma.rn.ftz.f32 %f367, %f360, %f288, %f356;
// begin inline asm
{ cvt.f32.f16 %f289, %rs47;}
// end inline asm
fma.rn.ftz.f32 %f368, %f360, %f289, %f357;
shl.b16 %rs252, %rs48, 4;
cvt.s16.s8 %rs253, %rs252;
shr.s16 %rs254, %rs253, 7;
and.b16 %rs255, %rs254, -16;
or.b16 %rs256, %rs255, %rs49;
cvt.rn.f32.s16 %f369, %rs256;
sub.ftz.f32 %f370, %f369, %f10;
mul.ftz.f32 %f371, %f105, %f370;
// begin inline asm
{ cvt.f32.f16 %f290, %rs50;}
// end inline asm
fma.rn.ftz.f32 %f372, %f371, %f290, %f361;
// begin inline asm
{ cvt.f32.f16 %f291, %rs51;}
// end inline asm
fma.rn.ftz.f32 %f373, %f371, %f291, %f362;
// begin inline asm
{ cvt.f32.f16 %f292, %rs52;}
// end inline asm
fma.rn.ftz.f32 %f374, %f371, %f292, %f363;
// begin inline asm
{ cvt.f32.f16 %f293, %rs53;}
// end inline asm
fma.rn.ftz.f32 %f375, %f371, %f293, %f364;
// begin inline asm
{ cvt.f32.f16 %f294, %rs54;}
// end inline asm
fma.rn.ftz.f32 %f376, %f371, %f294, %f365;
// begin inline asm
{ cvt.f32.f16 %f295, %rs55;}
// end inline asm
fma.rn.ftz.f32 %f377, %f371, %f295, %f366;
// begin inline asm
{ cvt.f32.f16 %f296, %rs56;}
// end inline asm
fma.rn.ftz.f32 %f378, %f371, %f296, %f367;
// begin inline asm
{ cvt.f32.f16 %f297, %rs57;}
// end inline asm
fma.rn.ftz.f32 %f379, %f371, %f297, %f368;
shl.b16 %rs257, %rs58, 4;
cvt.s16.s8 %rs258, %rs257;
shr.s16 %rs259, %rs258, 7;
and.b16 %rs260, %rs259, -16;
or.b16 %rs261, %rs260, %rs59;
cvt.rn.f32.s16 %f380, %rs261;
sub.ftz.f32 %f381, %f380, %f10;
mul.ftz.f32 %f382, %f105, %f381;
mov.b32 {%rs203, %rs211}, %r78;
// begin inline asm
{ cvt.f32.f16 %f298, %rs203;}
// end inline asm
fma.rn.ftz.f32 %f383, %f382, %f298, %f372;
mov.b32 {%rs204, %rs212}, %r82;
// begin inline asm
{ cvt.f32.f16 %f299, %rs204;}
// end inline asm
fma.rn.ftz.f32 %f384, %f382, %f299, %f373;
mov.b32 {%rs205, %rs213}, %r88;
// begin inline asm
{ cvt.f32.f16 %f300, %rs205;}
// end inline asm
fma.rn.ftz.f32 %f385, %f382, %f300, %f374;
mov.b32 {%rs206, %rs214}, %r92;
// begin inline asm
{ cvt.f32.f16 %f301, %rs206;}
// end inline asm
fma.rn.ftz.f32 %f386, %f382, %f301, %f375;
mov.b32 {%rs207, %rs215}, %r96;
// begin inline asm
{ cvt.f32.f16 %f302, %rs207;}
// end inline asm
fma.rn.ftz.f32 %f387, %f382, %f302, %f376;
mov.b32 {%rs208, %rs216}, %r100;
// begin inline asm
{ cvt.f32.f16 %f303, %rs208;}
// end inline asm
fma.rn.ftz.f32 %f388, %f382, %f303, %f377;
mov.b32 {%rs209, %rs217}, %r104;
// begin inline asm
{ cvt.f32.f16 %f304, %rs209;}
// end inline asm
fma.rn.ftz.f32 %f389, %f382, %f304, %f378;
mov.b32 {%rs210, %rs218}, %r108;
// begin inline asm
{ cvt.f32.f16 %f305, %rs210;}
// end inline asm
fma.rn.ftz.f32 %f390, %f382, %f305, %f379;
shl.b16 %rs262, %rs60, 4;
cvt.s16.s8 %rs263, %rs262;
shr.s16 %rs264, %rs263, 7;
and.b16 %rs265, %rs264, -16;
or.b16 %rs266, %rs265, %rs60;
cvt.rn.f32.s16 %f391, %rs266;
sub.ftz.f32 %f392, %f391, %f10;
mul.ftz.f32 %f393, %f105, %f392;
// begin inline asm
{ cvt.f32.f16 %f306, %rs211;}
// end inline asm
fma.rn.ftz.f32 %f578, %f393, %f306, %f383;
// begin inline asm
{ cvt.f32.f16 %f307, %rs212;}
// end inline asm
fma.rn.ftz.f32 %f579, %f393, %f307, %f384;
// begin inline asm
{ cvt.f32.f16 %f308, %rs213;}
// end inline asm
fma.rn.ftz.f32 %f580, %f393, %f308, %f385;
// begin inline asm
{ cvt.f32.f16 %f309, %rs214;}
// end inline asm
fma.rn.ftz.f32 %f581, %f393, %f309, %f386;
// begin inline asm
{ cvt.f32.f16 %f310, %rs215;}
// end inline asm
fma.rn.ftz.f32 %f582, %f393, %f310, %f387;
// begin inline asm
{ cvt.f32.f16 %f311, %rs216;}
// end inline asm
fma.rn.ftz.f32 %f583, %f393, %f311, %f388;
// begin inline asm
{ cvt.f32.f16 %f312, %rs217;}
// end inline asm
fma.rn.ftz.f32 %f584, %f393, %f312, %f389;
// begin inline asm
{ cvt.f32.f16 %f313, %rs218;}
// end inline asm
fma.rn.ftz.f32 %f585, %f393, %f313, %f390;

$L__BB0_8:
add.s32 %r284, %r284, 4;
shl.b32 %r116, %r284, 5;
add.s32 %r283, %r116, %r59;
setp.lt.u32 %p7, %r283, %r56;
@%p7 bra $L__BB0_2;

$L__BB0_9:
mov.u32 %r278, %tid.y;
mov.u32 %r277, %tid.x;
shl.b32 %r276, %r278, 5;
add.s32 %r275, %r276, %r277;
shl.b32 %r118, %r275, 2;
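// Block-wide reduction, repeated once per batch accumulator: lanes >= 32
// stage their partials in the 5120-byte shared temp_storage (the demoted
// symbol suggests a cub::BlockReduce), then warp 0 rakes three staged
// partials per lane and finishes with a shfl.sync.down tree.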
mov.u32 %r119, _ZZ9gemv_int4ILi4ELi32ELi8EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage;
add.s32 %r120, %r119, %r118;
setp.lt.u32 %p8, %r275, 32;
@%p8 bra $L__BB0_11;

add.s32 %r267, %r120, -112;
st.shared.f32 [%r267], %f578;

$L__BB0_11:
mov.u32 %r282, %tid.y;
mov.u32 %r281, %tid.x;
shl.b32 %r280, %r282, 5;
add.s32 %r279, %r280, %r281;
setp.gt.u32 %p9, %r279, 31;
bar.sync 0;
mad.lo.s32 %r47, %r279, 12, %r119;
@%p9 bra $L__BB0_13;

mov.u32 %r135, 16;
ld.shared.f32 %f409, [%r47+16];
add.ftz.f32 %f410, %f578, %f409;
ld.shared.f32 %f411, [%r47+20];
add.ftz.f32 %f412, %f410, %f411;
ld.shared.f32 %f413, [%r47+24];
add.ftz.f32 %f396, %f412, %f413;
mov.u32 %r123, 1;
mov.u32 %r136, 31;
mov.u32 %r137, -1;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f396, %r123, %r136, %r137; @p add.f32 r0, r0, %f396; mov.f32 %f394, r0;}
// end inline asm
mov.u32 %r126, 2;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f394, %r126, %r136, %r137; @p add.f32 r0, r0, %f394; mov.f32 %f397, r0;}
// end inline asm
mov.u32 %r129, 4;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f397, %r129, %r136, %r137; @p add.f32 r0, r0, %f397; mov.f32 %f400, r0;}
// end inline asm
mov.u32 %r132, 8;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f400, %r132, %r136, %r137; @p add.f32 r0, r0, %f400; mov.f32 %f403, r0;}
// end inline asm
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f403, %r135, %r136, %r137; @p add.f32 r0, r0, %f403; mov.f32 %f578, r0;}
// end inline asm

$L__BB0_13:
@%p8 bra $L__BB0_15;

add.s32 %r268, %r120, -112;
st.shared.f32 [%r268+640], %f579;

$L__BB0_15:
bar.sync 0;
@%p9 bra $L__BB0_17;

ld.shared.f32 %f429, [%r47+656];
add.ftz.f32 %f430, %f579, %f429;
ld.shared.f32 %f431, [%r47+660];
add.ftz.f32 %f432, %f430, %f431;
ld.shared.f32 %f433, [%r47+664];
add.ftz.f32 %f416, %f432, %f433;
mov.u32 %r139, 1;
mov.u32 %r152, 31;
mov.u32 %r153, -1;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f416, %r139, %r152, %r153; @p add.f32 r0, r0, %f416; mov.f32 %f414, r0;}
// end inline asm
mov.u32 %r142, 2;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f414, %r142, %r152, %r153; @p add.f32 r0, r0, %f414; mov.f32 %f417, r0;}
// end inline asm
mov.u32 %r145, 4;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f417, %r145, %r152, %r153; @p add.f32 r0, r0, %f417; mov.f32 %f420, r0;}
// end inline asm
mov.u32 %r148, 8;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f420, %r148, %r152, %r153; @p add.f32 r0, r0, %f420; mov.f32 %f423, r0;}
// end inline asm
mov.u32 %r151, 16;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f423, %r151, %r152, %r153; @p add.f32 r0, r0, %f423; mov.f32 %f579, r0;}
// end inline asm

$L__BB0_17:
@%p8 bra $L__BB0_19;

add.s32 %r269, %r120, -112;
st.shared.f32 [%r269+1280], %f580;

$L__BB0_19:
bar.sync 0;
@%p9 bra $L__BB0_21;

ld.shared.f32 %f449, [%r47+1296];
add.ftz.f32 %f450, %f580, %f449;
ld.shared.f32 %f451, [%r47+1300];
add.ftz.f32 %f452, %f450, %f451;
ld.shared.f32 %f453, [%r47+1304];
add.ftz.f32 %f436, %f452, %f453;
mov.u32 %r155, 1;
mov.u32 %r168, 31;
mov.u32 %r169, -1;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f436, %r155, %r168, %r169; @p add.f32 r0, r0, %f436; mov.f32 %f434, r0;}
// end inline asm
mov.u32 %r158, 2;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f434, %r158, %r168, %r169; @p add.f32 r0, r0, %f434; mov.f32 %f437, r0;}
// end inline asm
mov.u32 %r161, 4;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f437, %r161, %r168, %r169; @p add.f32 r0, r0, %f437; mov.f32 %f440, r0;}
// end inline asm
mov.u32 %r164, 8;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f440, %r164, %r168, %r169; @p add.f32 r0, r0, %f440; mov.f32 %f443, r0;}
// end inline asm
mov.u32 %r167, 16;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f443, %r167, %r168, %r169; @p add.f32 r0, r0, %f443; mov.f32 %f580, r0;}
// end inline asm

$L__BB0_21:
@%p8 bra $L__BB0_23;

add.s32 %r270, %r120, -112;
st.shared.f32 [%r270+1920], %f581;

$L__BB0_23:
bar.sync 0;
@%p9 bra $L__BB0_25;

ld.shared.f32 %f469, [%r47+1936];
add.ftz.f32 %f470, %f581, %f469;
ld.shared.f32 %f471, [%r47+1940];
add.ftz.f32 %f472, %f470, %f471;
ld.shared.f32 %f473, [%r47+1944];
add.ftz.f32 %f456, %f472, %f473;
mov.u32 %r171, 1;
mov.u32 %r184, 31;
mov.u32 %r185, -1;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f456, %r171, %r184, %r185; @p add.f32 r0, r0, %f456; mov.f32 %f454, r0;}
// end inline asm
mov.u32 %r174, 2;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f454, %r174, %r184, %r185; @p add.f32 r0, r0, %f454; mov.f32 %f457, r0;}
// end inline asm
mov.u32 %r177, 4;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f457, %r177, %r184, %r185; @p add.f32 r0, r0, %f457; mov.f32 %f460, r0;}
// end inline asm
mov.u32 %r180, 8;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f460, %r180, %r184, %r185; @p add.f32 r0, r0, %f460; mov.f32 %f463, r0;}
// end inline asm
mov.u32 %r183, 16;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f463, %r183, %r184, %r185; @p add.f32 r0, r0, %f463; mov.f32 %f581, r0;}
// end inline asm

$L__BB0_25:
@%p8 bra $L__BB0_27;

add.s32 %r271, %r120, -112;
st.shared.f32 [%r271+2560], %f582;

$L__BB0_27:
bar.sync 0;
@%p9 bra $L__BB0_29;

ld.shared.f32 %f489, [%r47+2576];
add.ftz.f32 %f490, %f582, %f489;
ld.shared.f32 %f491, [%r47+2580];
add.ftz.f32 %f492, %f490, %f491;
ld.shared.f32 %f493, [%r47+2584];
add.ftz.f32 %f476, %f492, %f493;
mov.u32 %r187, 1;
mov.u32 %r200, 31;
mov.u32 %r201, -1;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f476, %r187, %r200, %r201; @p add.f32 r0, r0, %f476; mov.f32 %f474, r0;}
// end inline asm
mov.u32 %r190, 2;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f474, %r190, %r200, %r201; @p add.f32 r0, r0, %f474; mov.f32 %f477, r0;}
// end inline asm
mov.u32 %r193, 4;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f477, %r193, %r200, %r201; @p add.f32 r0, r0, %f477; mov.f32 %f480, r0;}
// end inline asm
mov.u32 %r196, 8;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f480, %r196, %r200, %r201; @p add.f32 r0, r0, %f480; mov.f32 %f483, r0;}
// end inline asm
mov.u32 %r199, 16;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f483, %r199, %r200, %r201; @p add.f32 r0, r0, %f483; mov.f32 %f582, r0;}
// end inline asm

$L__BB0_29:
@%p8 bra $L__BB0_31;

add.s32 %r272, %r120, -112;
st.shared.f32 [%r272+3200], %f583;

$L__BB0_31:
bar.sync 0;
@%p9 bra $L__BB0_33;

ld.shared.f32 %f509, [%r47+3216];
add.ftz.f32 %f510, %f583, %f509;
ld.shared.f32 %f511, [%r47+3220];
add.ftz.f32 %f512, %f510, %f511;
ld.shared.f32 %f513, [%r47+3224];
add.ftz.f32 %f496, %f512, %f513;
mov.u32 %r203, 1;
mov.u32 %r216, 31;
mov.u32 %r217, -1;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f496, %r203, %r216, %r217; @p add.f32 r0, r0, %f496; mov.f32 %f494, r0;}
// end inline asm
mov.u32 %r206, 2;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f494, %r206, %r216, %r217; @p add.f32 r0, r0, %f494; mov.f32 %f497, r0;}
// end inline asm
mov.u32 %r209, 4;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f497, %r209, %r216, %r217; @p add.f32 r0, r0, %f497; mov.f32 %f500, r0;}
// end inline asm
mov.u32 %r212, 8;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f500, %r212, %r216, %r217; @p add.f32 r0, r0, %f500; mov.f32 %f503, r0;}
// end inline asm
mov.u32 %r215, 16;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f503, %r215, %r216, %r217; @p add.f32 r0, r0, %f503; mov.f32 %f583, r0;}
// end inline asm

$L__BB0_33:
@%p8 bra $L__BB0_35;

add.s32 %r273, %r120, -112;
st.shared.f32 [%r273+3840], %f584;

$L__BB0_35:
bar.sync 0;
@%p9 bra $L__BB0_37;

ld.shared.f32 %f529, [%r47+3856];
add.ftz.f32 %f530, %f584, %f529;
ld.shared.f32 %f531, [%r47+3860];
add.ftz.f32 %f532, %f530, %f531;
ld.shared.f32 %f533, [%r47+3864];
add.ftz.f32 %f516, %f532, %f533;
mov.u32 %r219, 1;
mov.u32 %r232, 31;
mov.u32 %r233, -1;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f516, %r219, %r232, %r233; @p add.f32 r0, r0, %f516; mov.f32 %f514, r0;}
// end inline asm
mov.u32 %r222, 2;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f514, %r222, %r232, %r233; @p add.f32 r0, r0, %f514; mov.f32 %f517, r0;}
// end inline asm
mov.u32 %r225, 4;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f517, %r225, %r232, %r233; @p add.f32 r0, r0, %f517; mov.f32 %f520, r0;}
// end inline asm
mov.u32 %r228, 8;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f520, %r228, %r232, %r233; @p add.f32 r0, r0, %f520; mov.f32 %f523, r0;}
// end inline asm
mov.u32 %r231, 16;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f523, %r231, %r232, %r233; @p add.f32 r0, r0, %f523; mov.f32 %f584, r0;}
// end inline asm

$L__BB0_37:
@%p8 bra $L__BB0_39;

add.s32 %r274, %r120, -112;
st.shared.f32 [%r274+4480], %f585;

$L__BB0_39:
bar.sync 0;
@%p9 bra $L__BB0_41;

ld.shared.f32 %f549, [%r47+4496];
add.ftz.f32 %f550, %f585, %f549;
ld.shared.f32 %f551, [%r47+4500];
add.ftz.f32 %f552, %f550, %f551;
ld.shared.f32 %f553, [%r47+4504];
add.ftz.f32 %f536, %f552, %f553;
mov.u32 %r235, 1;
mov.u32 %r248, 31;
mov.u32 %r249, -1;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f536, %r235, %r248, %r249; @p add.f32 r0, r0, %f536; mov.f32 %f534, r0;}
// end inline asm
mov.u32 %r238, 2;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f534, %r238, %r248, %r249; @p add.f32 r0, r0, %f534; mov.f32 %f537, r0;}
// end inline asm
mov.u32 %r241, 4;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f537, %r241, %r248, %r249; @p add.f32 r0, r0, %f537; mov.f32 %f540, r0;}
// end inline asm
mov.u32 %r244, 8;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f540, %r244, %r248, %r249; @p add.f32 r0, r0, %f540; mov.f32 %f543, r0;}
// end inline asm
mov.u32 %r247, 16;
// begin inline asm
{ .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f543, %r247, %r248, %r249; @p add.f32 r0, r0, %f543; mov.f32 %f585, r0;}
// end inline asm
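// Epilogue, thread (0, 0) only: each of the eight batch results is scaled by
// %f95, a bias element loaded through param+8 is FMA'd in via %f96 when that
// pointer is non-null, and the fp16 result is stored with an
// evict_last/evict_unchanged L2 policy at stride %r55 between batches.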
$L__BB0_41:
mov.u32 %r250, %tid.y;
or.b32 %r252, %r59, %r250;
setp.ne.s32 %p24, %r252, 0;
@%p24 bra $L__BB0_59;

ld.param.u64 %rd95, [_Z27dequant_gemv_group32_batch823DequantGemvKernelParams_param_0+8];
setp.eq.s64 %p25, %rd95, 0;
mul.ftz.f32 %f602, %f95, %f578;
mov.u32 %r253, %ctaid.x;
cvt.s64.s32 %rd7, %r253;
@%p25 bra $L__BB0_44;

shl.b64 %rd52, %rd7, 1;
add.s64 %rd53, %rd2, %rd52;
ld.global.u16 %rs267, [%rd53];
// begin inline asm
{ cvt.f32.f16 %f554, %rs267;}
// end inline asm
fma.rn.ftz.f32 %f602, %f96, %f554, %f602;

$L__BB0_44:
ld.param.u64 %rd96, [_Z27dequant_gemv_group32_batch823DequantGemvKernelParams_param_0];
// begin inline asm
{ cvt.rn.f16.f32 %rs268, %f602;}
// end inline asm
// begin inline asm
createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd54, 1.0;
// end inline asm
shl.b64 %rd57, %rd7, 1;
add.s64 %rd55, %rd96, %rd57;
// begin inline asm
st.global.L1::evict_normal.L2::cache_hint.u16 [%rd55], %rs268, %rd54;
// end inline asm
mul.ftz.f32 %f603, %f95, %f579;
add.s32 %r255, %r55, %r253;
cvt.s64.s32 %rd10, %r255;
@%p25 bra $L__BB0_46;

shl.b64 %rd58, %rd10, 1;
add.s64 %rd59, %rd2, %rd58;
ld.global.u16 %rs270, [%rd59];
// begin inline asm
{ cvt.f32.f16 %f556, %rs270;}
// end inline asm
fma.rn.ftz.f32 %f603, %f96, %f556, %f603;

$L__BB0_46:
cvt.s64.s32 %rd11, %r55;
mul.wide.s32 %rd63, %r55, 2;
add.s64 %rd61, %rd55, %rd63;
// begin inline asm
{ cvt.rn.f16.f32 %rs271, %f603;}
// end inline asm
// begin inline asm
createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd60, 1.0;
// end inline asm
// begin inline asm
st.global.L1::evict_normal.L2::cache_hint.u16 [%rd61], %rs271, %rd60;
// end inline asm
mul.ftz.f32 %f604, %f95, %f580;
cvt.u32.u64 %r256, %rd10;
add.s32 %r257, %r256, %r55;
cvt.s64.s32 %rd12, %r257;
@%p25 bra $L__BB0_48;

shl.b64 %rd64, %rd12, 1;
add.s64 %rd65, %rd2, %rd64;
ld.global.u16 %rs273, [%rd65];
// begin inline asm
{ cvt.f32.f16 %f558, %rs273;}
// end inline asm
fma.rn.ftz.f32 %f604, %f96, %f558, %f604;

$L__BB0_48:
ld.param.u64 %rd97, [_Z27dequant_gemv_group32_batch823DequantGemvKernelParams_param_0];
shl.b64 %rd69, %rd12, 1;
add.s64 %rd67, %rd97, %rd69;
// begin inline asm
{ cvt.rn.f16.f32 %rs274, %f604;}
// end inline asm
// begin inline asm
createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd66, 1.0;
// end inline asm
// begin inline asm
st.global.L1::evict_normal.L2::cache_hint.u16 [%rd67], %rs274, %rd66;
// end inline asm
mul.ftz.f32 %f605, %f95, %f581;
cvt.u32.u64 %r258, %rd12;
add.s32 %r259, %r258, %r55;
cvt.s64.s32 %rd14, %r259;
@%p25 bra $L__BB0_50;

shl.b64 %rd70, %rd14, 1;
add.s64 %rd71, %rd2, %rd70;
ld.global.u16 %rs276, [%rd71];
// begin inline asm
{ cvt.f32.f16 %f560, %rs276;}
// end inline asm
fma.rn.ftz.f32 %f605, %f96, %f560, %f605;

$L__BB0_50:
// begin inline asm
{ cvt.rn.f16.f32 %rs277, %f605;}
// end inline asm
// begin inline asm
createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd72, 1.0;
// end inline asm
shl.b64 %rd15, %rd11, 1;
add.s64 %rd73, %rd67, %rd15;
// begin inline asm
st.global.L1::evict_normal.L2::cache_hint.u16 [%rd73], %rs277, %rd72;
// end inline asm
mul.ftz.f32 %f606, %f95, %f582;
cvt.u32.u64 %r260, %rd14;
add.s32 %r261, %r260, %r55;
cvt.s64.s32 %rd17, %r261;
@%p25 bra $L__BB0_52;

shl.b64 %rd75, %rd17, 1;
add.s64 %rd76, %rd2, %rd75;
ld.global.u16 %rs279, [%rd76];
// begin inline asm
{ cvt.f32.f16 %f562, %rs279;}
// end inline asm
fma.rn.ftz.f32 %f606, %f96, %f562, %f606;
$L__BB0_52:
// begin inline asm
{ cvt.rn.f16.f32 %rs280, %f606;}
// end inline asm
// begin inline asm
createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd77, 1.0;
// end inline asm
add.s64 %rd78, %rd73, %rd15;
// begin inline asm
st.global.L1::evict_normal.L2::cache_hint.u16 [%rd78], %rs280, %rd77;
// end inline asm
mul.ftz.f32 %f607, %f95, %f583;
cvt.u32.u64 %r262, %rd17;
add.s32 %r263, %r262, %r55;
cvt.s64.s32 %rd19, %r263;
@%p25 bra $L__BB0_54;

shl.b64 %rd80, %rd19, 1;
add.s64 %rd81, %rd2, %rd80;
ld.global.u16 %rs282, [%rd81];
// begin inline asm
{ cvt.f32.f16 %f564, %rs282;}
// end inline asm
fma.rn.ftz.f32 %f607, %f96, %f564, %f607;

$L__BB0_54:
// begin inline asm
{ cvt.rn.f16.f32 %rs283, %f607;}
// end inline asm
// begin inline asm
createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd82, 1.0;
// end inline asm
add.s64 %rd83, %rd78, %rd15;
// begin inline asm
st.global.L1::evict_normal.L2::cache_hint.u16 [%rd83], %rs283, %rd82;
// end inline asm
mul.ftz.f32 %f608, %f95, %f584;
cvt.u32.u64 %r264, %rd19;
add.s32 %r265, %r264, %r55;
cvt.s64.s32 %rd21, %r265;
@%p25 bra $L__BB0_56;

shl.b64 %rd85, %rd21, 1;
add.s64 %rd86, %rd2, %rd85;
ld.global.u16 %rs285, [%rd86];
// begin inline asm
{ cvt.f32.f16 %f566, %rs285;}
// end inline asm
fma.rn.ftz.f32 %f608, %f96, %f566, %f608;

$L__BB0_56:
// begin inline asm
{ cvt.rn.f16.f32 %rs286, %f608;}
// end inline asm
// begin inline asm
createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd87, 1.0;
// end inline asm
add.s64 %rd88, %rd83, %rd15;
// begin inline asm
st.global.L1::evict_normal.L2::cache_hint.u16 [%rd88], %rs286, %rd87;
// end inline asm
mul.ftz.f32 %f609, %f95, %f585;
cvt.u32.u64 %r266, %rd21;
add.s32 %r49, %r266, %r55;
@%p25 bra $L__BB0_58;

mul.wide.s32 %rd90, %r49, 2;
add.s64 %rd91, %rd2, %rd90;
ld.global.u16 %rs288, [%rd91];
// begin inline asm
{ cvt.f32.f16 %f568, %rs288;}
// end inline asm
fma.rn.ftz.f32 %f609, %f96, %f568, %f609;

$L__BB0_58:
// begin inline asm
{ cvt.rn.f16.f32 %rs289, %f609;}
// end inline asm
// begin inline asm
createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd92, 1.0;
// end inline asm
add.s64 %rd93, %rd88, %rd15;
// begin inline asm
st.global.L1::evict_normal.L2::cache_hint.u16 [%rd93], %rs289, %rd92;
// end inline asm

$L__BB0_59:
ret;
}

// .globl _ZN3cub11EmptyKernelIvEEvv
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
ret;
}
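
// ----------------------------------------------------------------------------
// Reconstruction sketch (assumption, not part of the compiler output): a
// CUDA-level reading of the entry above, inferred from the mangled name
// gemv_int4<4, 32, 8> (4 warps, group size 32, batch 8) and the instruction
// stream. All field and variable names below are hypothetical.
//
//   struct DequantGemvKernelParams {            // 80-byte by-value struct
//       half*           out;                    // +0   fp16 outputs
//       const half*     bias;                   // +8   (%rd24) may be null
//       const half*     act;                    // +16  (%rd25) fp16 activations
//       const uint32_t* qweight;                // +24  (%rd26) packed int4 weights
//       const uint8_t*  zeros;                  // +32  (%rd27) packed zero points, may be null
//       const half*     scales;                 // +40  (%rd28) fp16 group scales
//       int             k, n;                   // +48  (%r54, %r55)
//       int             groups, scale_stride;   // +56  (%r56, %r57)
//       float           alpha, beta;            // +64  (%f95, %f96)
//       uint8_t         flags[4];               // +72  (%rs69..%rs72); %rs69 picks
//   };                                          //      zero-point vs. signed dequant
//
//   Each 128-thread block owns output row blockIdx.x. Thread t = 32*tid.y +
//   tid.x walks groups g = t, t+128, ... and, per group (fully unrolled above):
//       uint32_t q = qweight[row * groups + g];          // 8 nibbles
//       float    s = scales[...];                        // one per 32 weights
//       int      z = zeros ? nibble(zeros, ...) : (%rs69 ? 8 : 0);
//       for (int i = 0; i < 8; ++i)                      // nibble i of q
//           for (int b = 0; b < 8; ++b)                  // batch column b
//               acc[b] += s * (w(q, i) - z) * (float)act[b * k + 8 * t + i];
//   with w(q, i) sign-extended on the signed path. The eight acc[b] are then
//   block-reduced (temp_storage + shfl.sync.down) and thread (0, 0) writes
//   out[b * n + row] = alpha * acc[b] + beta * bias[b * n + row] as fp16.
// ----------------------------------------------------------------------------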