.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_3dc8a4816thrust12placeholders2_3E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_3dc8a4816thrust12placeholders2_4E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_3dc8a4816thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_3dc8a4816thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_3dc8a4816thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_3dc8a4816thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_3dc8a4816thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_3dc8a4816thrust12placeholders3_10E[1];
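//
// dequant_gemv_group64_batch5(DequantGemvKernelParams): int4-dequantizing GEMV,
// group size 64, five output columns (instantiated from gemv_int4<4, 64, 5>,
// per the mangled shared-memory name below). Per loop iteration each thread
// loads two packed 32-bit words (16 int4 weights), the group's fp16 scale, and
// an optional packed 4-bit zero-point, then FMA-accumulates the dequantized
// weights against five fp16 activation columns in %f591..%f595.
//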
.visible .entry _Z27dequant_gemv_group64_batch523DequantGemvKernelParams(
    .param .align 8 .b8 _Z27dequant_gemv_group64_batch523DequantGemvKernelParams_param_0[80]
)
{
    .reg .pred %p<24>;
    .reg .b16 %rs<354>;
    .reg .f32 %f<611>;
    .reg .b32 %r<331>;
    .reg .b64 %rd<84>;
    // demoted variable
    .shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi64ELi5EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[3200];

    ld.param.v2.u32 {%r46, %r47}, [_Z27dequant_gemv_group64_batch523DequantGemvKernelParams_param_0+48];
    ld.param.v2.u32 {%r48, %r49}, [_Z27dequant_gemv_group64_batch523DequantGemvKernelParams_param_0+56];
    ld.param.v2.f32 {%f62, %f63}, [_Z27dequant_gemv_group64_batch523DequantGemvKernelParams_param_0+64];
    ld.param.v4.u8 {%rs79, %rs80, %rs81, %rs82}, [_Z27dequant_gemv_group64_batch523DequantGemvKernelParams_param_0+72];
    ld.param.u64 %rd23, [_Z27dequant_gemv_group64_batch523DequantGemvKernelParams_param_0+40];
    ld.param.u64 %rd22, [_Z27dequant_gemv_group64_batch523DequantGemvKernelParams_param_0+32];
    ld.param.u64 %rd21, [_Z27dequant_gemv_group64_batch523DequantGemvKernelParams_param_0+24];
    ld.param.u64 %rd20, [_Z27dequant_gemv_group64_batch523DequantGemvKernelParams_param_0+16];
    ld.param.u64 %rd19, [_Z27dequant_gemv_group64_batch523DequantGemvKernelParams_param_0+8];
    cvta.to.global.u64 %rd2, %rd19;
    mov.u32 %r330, %tid.y;
    shl.b32 %r50, %r330, 5;
    mov.u32 %r51, %tid.x;
    add.s32 %r329, %r50, %r51;
    shl.b32 %r328, %r329, 1;
    setp.ge.u32 %p1, %r328, %r48;
    mov.f32 %f591, 0f00000000;
    mov.f32 %f592, %f591;
    mov.f32 %f593, %f591;
    mov.f32 %f594, %f591;
    mov.f32 %f595, %f591;
    @%p1 bra $L__BB0_9;

    cvta.to.global.u64 %rd3, %rd20;
    mov.u32 %r52, %ctaid.x;
    mul.lo.s32 %r6, %r49, %r52;

$L__BB0_2:
    mad.lo.s32 %r56, %r48, %r52, %r328;
    mul.wide.u32 %rd30, %r56, 4;
    add.s64 %rd25, %rd21, %rd30;
    // begin inline asm
    createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd24, 1.0;
    // end inline asm
    // begin inline asm
    ld.global.L1::no_allocate.L2::cache_hint.v2.u32 {%r53,%r54}, [%rd25], %rd24;
    // end inline asm
    shr.u32 %r58, %r51, 2;
    shl.b32 %r59, %r330, 3;
    add.s32 %r12, %r59, %r58;
    add.s32 %r13, %r12, %r6;
    mul.wide.s32 %rd31, %r13, 2;
    add.s64 %rd28, %rd23, %rd31;
    // begin inline asm
    createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd27, 1.0;
    // end inline asm
    // begin inline asm
    ld.global.L1::evict_last.L2::cache_hint.u16 %rs87, [%rd28], %rd27;
    // end inline asm
    // begin inline asm
    { cvt.f32.f16 %f69, %rs87;}
    // end inline asm
    shl.b16 %rs353, %rs79, 3;
    setp.eq.s64 %p2, %rd22, 0;
    @%p2 bra $L__BB0_4;

    shr.u32 %r60, %r13, 31;
    add.s32 %r61, %r13, %r60;
    shr.s32 %r62, %r61, 1;
    cvt.s64.s32 %rd35, %r62;
    add.s64 %rd33, %rd22, %rd35;
    // begin inline asm
    createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd32, 1.0;
    // end inline asm
    // begin inline asm
    ld.global.L1::evict_last.L2::cache_hint.u8 %rs89, [%rd33], %rd32;
    // end inline asm
    cvt.u32.u16 %r63, %rs89;
    and.b32 %r64, %r63, 255;
    shl.b32 %r65, %r12, 2;
    and.b32 %r66, %r65, 4;
    shr.u32 %r67, %r64, %r66;
    cvt.u16.u32 %rs90, %r67;
    and.b16 %rs353, %rs90, 15;

$L__BB0_4:
    shl.b32 %r14, %r329, 4;
    setp.ge.s32 %p3, %r14, %r46;
    @%p3 bra $L__BB0_8;

    setp.eq.s16 %p4, %rs79, 0;
    shr.u16 %rs92, %rs353, 3;
    and.b16 %rs93, %rs92, 1;
    setp.eq.b16 %p5, %rs93, 1;
    and.pred %p6, %p4, %p5;
    selp.b16 %rs94, -16, 0, %p6;
    or.b16 %rs95, %rs94, %rs353;
    cvt.s16.s8 %rs96, %rs95;
    cvt.rn.f32.s16 %f7, %rs96;
    mul.wide.s32 %rd36, %r14, 2;
    add.s64 %rd7, %rd3, %rd36;
    ld.global.v4.u32 {%r68, %r69, %r70, %r71}, [%rd7];
    mul.wide.s32 %rd9, %r46, 2;
    add.s64 %rd8, %rd7, %rd9;
    ld.global.v4.u32 {%r72, %r73, %r74, %r75}, [%rd8];
    add.s32 %r76, %r14, %r46;
    add.s32 %r77, %r76, %r46;
    shl.b32 %r78, %r46, 1;
    mul.wide.s32 %rd37, %r78, 2;
    add.s64 %rd38, %rd7, %rd37;
    ld.global.v4.u32 {%r79, %r80, %r81, %r82}, [%rd38];
    add.s32 %r83, %r77, %r46;
    mul.wide.s32 %rd39, %r83, 2;
    add.s64 %rd40, %rd3, %rd39;
    ld.global.v4.u32 {%r84, %r85, %r86, %r87}, [%rd40];
    add.s64 %rd41, %rd38, %rd37;
    ld.global.v4.u32 {%r88, %r89, %r90, %r91}, [%rd41];
    cvt.u16.u32 %rs5, %r53;
    and.b16 %rs6, %rs5, 15;
    mov.b32 {%rs9, %rs8}, %r68;
    mov.b32 {%rs10, %rs16}, %r72;
    mov.b32 {%rs11, %rs17}, %r79;
    mov.b32 {%rs12, %rs18}, %r84;
    mov.b32 {%rs13, %rs19}, %r88;
    shr.u32 %r92, %r53, 4;
    cvt.u16.u32 %rs14, %r92;
    and.b16 %rs15, %rs14, 15;
    shr.u32 %r93, %r53, 8;
    cvt.u16.u32 %rs20, %r93;
    and.b16 %rs21, %rs20, 15;
    mov.b32 {%rs22, %rs29}, %r69;
    mov.b32 {%rs23, %rs30}, %r73;
    mov.b32 {%rs24, %rs31}, %r80;
    mov.b32 {%rs25, %rs32}, %r85;
    mov.b32 {%rs26, %rs33}, %r89;
    shr.u32 %r94, %r53, 12;
    cvt.u16.u32 %rs27, %r94;
    and.b16 %rs28, %rs27, 15;
    shr.u32 %r95, %r53, 16;
    cvt.u16.u32 %rs34, %r95;
    and.b16 %rs35, %rs34, 15;
    mov.b32 {%rs36, %rs48}, %r70;
    mov.b32 {%rs38, %rs49}, %r74;
    mov.b32 {%rs40, %rs50}, %r81;
    mov.b32 {%rs42, %rs51}, %r86;
    mov.b32 {%rs44, %rs52}, %r90;
    shr.u32 %r96, %r53, 20;
    cvt.u16.u32 %rs46, %r96;
    and.b16 %rs47, %rs46, 15;
    shr.u32 %r97, %r53, 24;
    cvt.u16.u32 %rs53, %r97;
    and.b16 %rs54, %rs53, 15;
    shr.u32 %r98, %r53, 28;
    cvt.u16.u32 %rs55, %r98;
    cvt.u16.u32 %rs56, %r54;
    and.b16 %rs57, %rs56, 15;
    shr.u32 %r99, %r54, 4;
    cvt.u16.u32 %rs58, %r99;
    and.b16 %rs59, %rs58, 15;
    shr.u32 %r100, %r54, 8;
    cvt.u16.u32 %rs60, %r100;
    and.b16 %rs61, %rs60, 15;
    shr.u32 %r101, %r54, 12;
    cvt.u16.u32 %rs62, %r101;
    and.b16 %rs63, %rs62, 15;
    shr.u32 %r102, %r54, 16;
    cvt.u16.u32 %rs64, %r102;
    and.b16 %rs65, %rs64, 15;
    shr.u32 %r103, %r54, 20;
    cvt.u16.u32 %rs66, %r103;
    and.b16 %rs67, %rs66, 15;
    shr.u32 %r104, %r54, 24;
    cvt.u16.u32 %rs68, %r104;
    and.b16 %rs69, %rs68, 15;
    shr.u32 %r105, %r54, 28;
    cvt.u16.u32 %rs70, %r105;
    add.s64 %rd42, %rd8, %rd9;
    add.s64 %rd10, %rd42, 16;
    @%p4 bra $L__BB0_7;

    add.s64 %rd43, %rd10, %rd9;
    cvt.rn.f32.s16 %f150, %rs6;
    sub.ftz.f32 %f151, %f150, %f7;
    mul.ftz.f32 %f152, %f69, %f151;
    // begin inline asm
    { cvt.f32.f16 %f70, %rs9;}
    // end inline asm
    fma.rn.ftz.f32 %f153, %f152, %f70, %f595;
    // begin inline asm
    { cvt.f32.f16 %f71, %rs10;}
    // end inline asm
    fma.rn.ftz.f32 %f154, %f152, %f71, %f594;
    // begin inline asm
    { cvt.f32.f16 %f72, %rs11;}
    // end inline asm
    fma.rn.ftz.f32 %f155, %f152, %f72, %f593;
    // begin inline asm
    { cvt.f32.f16 %f73, %rs12;}
    // end inline asm
    fma.rn.ftz.f32 %f156, %f152, %f73, %f592;
    // begin inline asm
    { cvt.f32.f16 %f74, %rs13;}
    // end inline asm
    fma.rn.ftz.f32 %f157, %f152, %f74, %f591;
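    // Nibbles 1..15 of the two packed words repeat this pattern: dequantize
    // scale * (q - zero), then one FMA per batch-column accumulator.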
    cvt.rn.f32.s16 %f158, %rs15;
    sub.ftz.f32 %f159, %f158, %f7;
    mul.ftz.f32 %f160, %f69, %f159;
    // begin inline asm
    { cvt.f32.f16 %f75, %rs8;}
    // end inline asm
    fma.rn.ftz.f32 %f161, %f160, %f75, %f153;
    // begin inline asm
    { cvt.f32.f16 %f76, %rs16;}
    // end inline asm
    fma.rn.ftz.f32 %f162, %f160, %f76, %f154;
    // begin inline asm
    { cvt.f32.f16 %f77, %rs17;}
    // end inline asm
    fma.rn.ftz.f32 %f163, %f160, %f77, %f155;
    // begin inline asm
    { cvt.f32.f16 %f78, %rs18;}
    // end inline asm
    fma.rn.ftz.f32 %f164, %f160, %f78, %f156;
    // begin inline asm
    { cvt.f32.f16 %f79, %rs19;}
    // end inline asm
    fma.rn.ftz.f32 %f165, %f160, %f79, %f157;
    cvt.rn.f32.s16 %f166, %rs21;
    sub.ftz.f32 %f167, %f166, %f7;
    mul.ftz.f32 %f168, %f69, %f167;
    // begin inline asm
    { cvt.f32.f16 %f80, %rs22;}
    // end inline asm
    fma.rn.ftz.f32 %f169, %f168, %f80, %f161;
    // begin inline asm
    { cvt.f32.f16 %f81, %rs23;}
    // end inline asm
    fma.rn.ftz.f32 %f170, %f168, %f81, %f162;
    // begin inline asm
    { cvt.f32.f16 %f82, %rs24;}
    // end inline asm
    fma.rn.ftz.f32 %f171, %f168, %f82, %f163;
    // begin inline asm
    { cvt.f32.f16 %f83, %rs25;}
    // end inline asm
    fma.rn.ftz.f32 %f172, %f168, %f83, %f164;
    // begin inline asm
    { cvt.f32.f16 %f84, %rs26;}
    // end inline asm
    fma.rn.ftz.f32 %f173, %f168, %f84, %f165;
    cvt.rn.f32.s16 %f174, %rs28;
    sub.ftz.f32 %f175, %f174, %f7;
    mul.ftz.f32 %f176, %f69, %f175;
    // begin inline asm
    { cvt.f32.f16 %f85, %rs29;}
    // end inline asm
    fma.rn.ftz.f32 %f177, %f176, %f85, %f169;
    // begin inline asm
    { cvt.f32.f16 %f86, %rs30;}
    // end inline asm
    fma.rn.ftz.f32 %f178, %f176, %f86, %f170;
    // begin inline asm
    { cvt.f32.f16 %f87, %rs31;}
    // end inline asm
    fma.rn.ftz.f32 %f179, %f176, %f87, %f171;
    // begin inline asm
    { cvt.f32.f16 %f88, %rs32;}
    // end inline asm
    fma.rn.ftz.f32 %f180, %f176, %f88, %f172;
    // begin inline asm
    { cvt.f32.f16 %f89, %rs33;}
    // end inline asm
    fma.rn.ftz.f32 %f181, %f176, %f89, %f173;
    cvt.rn.f32.s16 %f182, %rs35;
    sub.ftz.f32 %f183, %f182, %f7;
    mul.ftz.f32 %f184, %f69, %f183;
    // begin inline asm
    { cvt.f32.f16 %f90, %rs36;}
    // end inline asm
    fma.rn.ftz.f32 %f185, %f184, %f90, %f177;
    // begin inline asm
    { cvt.f32.f16 %f91, %rs38;}
    // end inline asm
    fma.rn.ftz.f32 %f186, %f184, %f91, %f178;
    // begin inline asm
    { cvt.f32.f16 %f92, %rs40;}
    // end inline asm
    fma.rn.ftz.f32 %f187, %f184, %f92, %f179;
    // begin inline asm
    { cvt.f32.f16 %f93, %rs42;}
    // end inline asm
    fma.rn.ftz.f32 %f188, %f184, %f93, %f180;
    // begin inline asm
    { cvt.f32.f16 %f94, %rs44;}
    // end inline asm
    fma.rn.ftz.f32 %f189, %f184, %f94, %f181;
    cvt.rn.f32.s16 %f190, %rs47;
    sub.ftz.f32 %f191, %f190, %f7;
    mul.ftz.f32 %f192, %f69, %f191;
    // begin inline asm
    { cvt.f32.f16 %f95, %rs48;}
    // end inline asm
    fma.rn.ftz.f32 %f193, %f192, %f95, %f185;
    // begin inline asm
    { cvt.f32.f16 %f96, %rs49;}
    // end inline asm
    fma.rn.ftz.f32 %f194, %f192, %f96, %f186;
    // begin inline asm
    { cvt.f32.f16 %f97, %rs50;}
    // end inline asm
    fma.rn.ftz.f32 %f195, %f192, %f97, %f187;
    // begin inline asm
    { cvt.f32.f16 %f98, %rs51;}
    // end inline asm
    fma.rn.ftz.f32 %f196, %f192, %f98, %f188;
    // begin inline asm
    { cvt.f32.f16 %f99, %rs52;}
    // end inline asm
    fma.rn.ftz.f32 %f197, %f192, %f99, %f189;
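    // Nibbles 6..7 take the high fp16 halves of %r71/%r75/%r82/%r87/%r91;
    // the second packed word (%r54) then covers activation elements 8..15.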
    cvt.rn.f32.s16 %f198, %rs54;
    sub.ftz.f32 %f199, %f198, %f7;
    mul.ftz.f32 %f200, %f69, %f199;
    mov.b32 {%rs127, %rs132}, %r71;
    // begin inline asm
    { cvt.f32.f16 %f100, %rs127;}
    // end inline asm
    fma.rn.ftz.f32 %f201, %f200, %f100, %f193;
    mov.b32 {%rs128, %rs133}, %r75;
    // begin inline asm
    { cvt.f32.f16 %f101, %rs128;}
    // end inline asm
    fma.rn.ftz.f32 %f202, %f200, %f101, %f194;
    mov.b32 {%rs129, %rs134}, %r82;
    // begin inline asm
    { cvt.f32.f16 %f102, %rs129;}
    // end inline asm
    fma.rn.ftz.f32 %f203, %f200, %f102, %f195;
    mov.b32 {%rs130, %rs135}, %r87;
    // begin inline asm
    { cvt.f32.f16 %f103, %rs130;}
    // end inline asm
    fma.rn.ftz.f32 %f204, %f200, %f103, %f196;
    mov.b32 {%rs131, %rs136}, %r91;
    // begin inline asm
    { cvt.f32.f16 %f104, %rs131;}
    // end inline asm
    fma.rn.ftz.f32 %f205, %f200, %f104, %f197;
    cvt.rn.f32.s16 %f206, %rs55;
    sub.ftz.f32 %f207, %f206, %f7;
    mul.ftz.f32 %f208, %f69, %f207;
    // begin inline asm
    { cvt.f32.f16 %f105, %rs132;}
    // end inline asm
    fma.rn.ftz.f32 %f209, %f208, %f105, %f201;
    // begin inline asm
    { cvt.f32.f16 %f106, %rs133;}
    // end inline asm
    fma.rn.ftz.f32 %f210, %f208, %f106, %f202;
    // begin inline asm
    { cvt.f32.f16 %f107, %rs134;}
    // end inline asm
    fma.rn.ftz.f32 %f211, %f208, %f107, %f203;
    // begin inline asm
    { cvt.f32.f16 %f108, %rs135;}
    // end inline asm
    fma.rn.ftz.f32 %f212, %f208, %f108, %f204;
    // begin inline asm
    { cvt.f32.f16 %f109, %rs136;}
    // end inline asm
    fma.rn.ftz.f32 %f213, %f208, %f109, %f205;
    ld.global.v4.u32 {%r106, %r107, %r108, %r109}, [%rd7+16];
    ld.global.v4.u32 {%r114, %r115, %r116, %r117}, [%rd8+16];
    ld.global.v4.u32 {%r122, %r123, %r124, %r125}, [%rd10];
    ld.global.v4.u32 {%r130, %r131, %r132, %r133}, [%rd43];
    add.s64 %rd45, %rd43, %rd9;
    ld.global.v4.u32 {%r138, %r139, %r140, %r141}, [%rd45];
    cvt.rn.f32.s16 %f214, %rs57;
    sub.ftz.f32 %f215, %f214, %f7;
    mul.ftz.f32 %f216, %f69, %f215;
    mov.b32 {%rs137, %rs142}, %r106;
    // begin inline asm
    { cvt.f32.f16 %f110, %rs137;}
    // end inline asm
    fma.rn.ftz.f32 %f217, %f216, %f110, %f209;
    mov.b32 {%rs138, %rs143}, %r114;
    // begin inline asm
    { cvt.f32.f16 %f111, %rs138;}
    // end inline asm
    fma.rn.ftz.f32 %f218, %f216, %f111, %f210;
    mov.b32 {%rs139, %rs144}, %r122;
    // begin inline asm
    { cvt.f32.f16 %f112, %rs139;}
    // end inline asm
    fma.rn.ftz.f32 %f219, %f216, %f112, %f211;
    mov.b32 {%rs140, %rs145}, %r130;
    // begin inline asm
    { cvt.f32.f16 %f113, %rs140;}
    // end inline asm
    fma.rn.ftz.f32 %f220, %f216, %f113, %f212;
    mov.b32 {%rs141, %rs146}, %r138;
    // begin inline asm
    { cvt.f32.f16 %f114, %rs141;}
    // end inline asm
    fma.rn.ftz.f32 %f221, %f216, %f114, %f213;
    cvt.rn.f32.s16 %f222, %rs59;
    sub.ftz.f32 %f223, %f222, %f7;
    mul.ftz.f32 %f224, %f69, %f223;
    // begin inline asm
    { cvt.f32.f16 %f115, %rs142;}
    // end inline asm
    fma.rn.ftz.f32 %f225, %f224, %f115, %f217;
    // begin inline asm
    { cvt.f32.f16 %f116, %rs143;}
    // end inline asm
    fma.rn.ftz.f32 %f226, %f224, %f116, %f218;
    // begin inline asm
    { cvt.f32.f16 %f117, %rs144;}
    // end inline asm
    fma.rn.ftz.f32 %f227, %f224, %f117, %f219;
    // begin inline asm
    { cvt.f32.f16 %f118, %rs145;}
    // end inline asm
    fma.rn.ftz.f32 %f228, %f224, %f118, %f220;
    // begin inline asm
    { cvt.f32.f16 %f119, %rs146;}
    // end inline asm
    fma.rn.ftz.f32 %f229, %f224, %f119, %f221;
    cvt.rn.f32.s16 %f230, %rs61;
    sub.ftz.f32 %f231, %f230, %f7;
    mul.ftz.f32 %f232, %f69, %f231;
    mov.b32 {%rs147, %rs152}, %r107;
    // begin inline asm
    { cvt.f32.f16 %f120, %rs147;}
    // end inline asm
    fma.rn.ftz.f32 %f233, %f232, %f120, %f225;
    mov.b32 {%rs148, %rs153}, %r115;
    // begin inline asm
    { cvt.f32.f16 %f121, %rs148;}
    // end inline asm
    fma.rn.ftz.f32 %f234, %f232, %f121, %f226;
    mov.b32 {%rs149, %rs154}, %r123;
    // begin inline asm
    { cvt.f32.f16 %f122, %rs149;}
    // end inline asm
    fma.rn.ftz.f32 %f235, %f232, %f122, %f227;
    mov.b32 {%rs150, %rs155}, %r131;
    // begin inline asm
    { cvt.f32.f16 %f123, %rs150;}
    // end inline asm
    fma.rn.ftz.f32 %f236, %f232, %f123, %f228;
    mov.b32 {%rs151, %rs156}, %r139;
    // begin inline asm
    { cvt.f32.f16 %f124, %rs151;}
    // end inline asm
    fma.rn.ftz.f32 %f237, %f232, %f124, %f229;
    cvt.rn.f32.s16 %f238, %rs63;
    sub.ftz.f32 %f239, %f238, %f7;
    mul.ftz.f32 %f240, %f69, %f239;
    // begin inline asm
    { cvt.f32.f16 %f125, %rs152;}
    // end inline asm
    fma.rn.ftz.f32 %f241, %f240, %f125, %f233;
    // begin inline asm
    { cvt.f32.f16 %f126, %rs153;}
    // end inline asm
    fma.rn.ftz.f32 %f242, %f240, %f126, %f234;
    // begin inline asm
    { cvt.f32.f16 %f127, %rs154;}
    // end inline asm
    fma.rn.ftz.f32 %f243, %f240, %f127, %f235;
    // begin inline asm
    { cvt.f32.f16 %f128, %rs155;}
    // end inline asm
    fma.rn.ftz.f32 %f244, %f240, %f128, %f236;
    // begin inline asm
    { cvt.f32.f16 %f129, %rs156;}
    // end inline asm
    fma.rn.ftz.f32 %f245, %f240, %f129, %f237;
    cvt.rn.f32.s16 %f246, %rs65;
    sub.ftz.f32 %f247, %f246, %f7;
    mul.ftz.f32 %f248, %f69, %f247;
    mov.b32 {%rs157, %rs162}, %r108;
    // begin inline asm
    { cvt.f32.f16 %f130, %rs157;}
    // end inline asm
    fma.rn.ftz.f32 %f249, %f248, %f130, %f241;
    mov.b32 {%rs158, %rs163}, %r116;
    // begin inline asm
    { cvt.f32.f16 %f131, %rs158;}
    // end inline asm
    fma.rn.ftz.f32 %f250, %f248, %f131, %f242;
    mov.b32 {%rs159, %rs164}, %r124;
    // begin inline asm
    { cvt.f32.f16 %f132, %rs159;}
    // end inline asm
    fma.rn.ftz.f32 %f251, %f248, %f132, %f243;
    mov.b32 {%rs160, %rs165}, %r132;
    // begin inline asm
    { cvt.f32.f16 %f133, %rs160;}
    // end inline asm
    fma.rn.ftz.f32 %f252, %f248, %f133, %f244;
    mov.b32 {%rs161, %rs166}, %r140;
    // begin inline asm
    { cvt.f32.f16 %f134, %rs161;}
    // end inline asm
    fma.rn.ftz.f32 %f253, %f248, %f134, %f245;
    cvt.rn.f32.s16 %f254, %rs67;
    sub.ftz.f32 %f255, %f254, %f7;
    mul.ftz.f32 %f256, %f69, %f255;
    // begin inline asm
    { cvt.f32.f16 %f135, %rs162;}
    // end inline asm
    fma.rn.ftz.f32 %f257, %f256, %f135, %f249;
    // begin inline asm
    { cvt.f32.f16 %f136, %rs163;}
    // end inline asm
    fma.rn.ftz.f32 %f258, %f256, %f136, %f250;
    // begin inline asm
    { cvt.f32.f16 %f137, %rs164;}
    // end inline asm
    fma.rn.ftz.f32 %f259, %f256, %f137, %f251;
    // begin inline asm
    { cvt.f32.f16 %f138, %rs165;}
    // end inline asm
    fma.rn.ftz.f32 %f260, %f256, %f138, %f252;
    // begin inline asm
    { cvt.f32.f16 %f139, %rs166;}
    // end inline asm
    fma.rn.ftz.f32 %f261, %f256, %f139, %f253;
    cvt.rn.f32.s16 %f262, %rs69;
    sub.ftz.f32 %f263, %f262, %f7;
    mul.ftz.f32 %f264, %f69, %f263;
    mov.b32 {%rs167, %rs172}, %r109;
    // begin inline asm
    { cvt.f32.f16 %f140, %rs167;}
    // end inline asm
    fma.rn.ftz.f32 %f265, %f264, %f140, %f257;
    mov.b32 {%rs168, %rs173}, %r117;
    // begin inline asm
    { cvt.f32.f16 %f141, %rs168;}
    // end inline asm
    fma.rn.ftz.f32 %f266, %f264, %f141, %f258;
    mov.b32 {%rs169, %rs174}, %r125;
    // begin inline asm
    { cvt.f32.f16 %f142, %rs169;}
    // end inline asm
    fma.rn.ftz.f32 %f267, %f264, %f142, %f259;
    mov.b32 {%rs170, %rs175}, %r133;
    // begin inline asm
    { cvt.f32.f16 %f143, %rs170;}
    // end inline asm
    fma.rn.ftz.f32 %f268, %f264, %f143, %f260;
    mov.b32 {%rs171, %rs176}, %r141;
    // begin inline asm
    { cvt.f32.f16 %f144, %rs171;}
    // end inline asm
    fma.rn.ftz.f32 %f269, %f264, %f144, %f261;
    cvt.rn.f32.s16 %f270, %rs70;
    sub.ftz.f32 %f271, %f270, %f7;
    mul.ftz.f32 %f272, %f69, %f271;
    // begin inline asm
    { cvt.f32.f16 %f145, %rs172;}
    // end inline asm
    fma.rn.ftz.f32 %f595, %f272, %f145, %f265;
    // begin inline asm
    { cvt.f32.f16 %f146, %rs173;}
    // end inline asm
    fma.rn.ftz.f32 %f594, %f272, %f146, %f266;
    // begin inline asm
    { cvt.f32.f16 %f147, %rs174;}
    // end inline asm
    fma.rn.ftz.f32 %f593, %f272, %f147, %f267;
    // begin inline asm
    { cvt.f32.f16 %f148, %rs175;}
    // end inline asm
    fma.rn.ftz.f32 %f592, %f272, %f148, %f268;
    // begin inline asm
    { cvt.f32.f16 %f149, %rs176;}
    // end inline asm
    fma.rn.ftz.f32 %f591, %f272, %f149, %f269;
    bra.uni $L__BB0_8;
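    // $L__BB0_7: signed-weight path (flag byte %rs79 == 0). Each nibble is
    // sign-extended to [-8, 7]: (s8)(q << 4) >> 7 yields 0xFFF0 for negative
    // values and is OR'd back onto q before scale * (q - zero).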
$L__BB0_7:
    shl.b16 %rs257, %rs5, 4;
    cvt.s16.s8 %rs258, %rs257;
    shr.s16 %rs259, %rs258, 7;
    and.b16 %rs260, %rs259, -16;
    or.b16 %rs261, %rs260, %rs6;
    cvt.rn.f32.s16 %f353, %rs261;
    sub.ftz.f32 %f354, %f353, %f7;
    mul.ftz.f32 %f355, %f69, %f354;
    // begin inline asm
    { cvt.f32.f16 %f273, %rs9;}
    // end inline asm
    fma.rn.ftz.f32 %f356, %f355, %f273, %f595;
    // begin inline asm
    { cvt.f32.f16 %f274, %rs10;}
    // end inline asm
    fma.rn.ftz.f32 %f357, %f355, %f274, %f594;
    // begin inline asm
    { cvt.f32.f16 %f275, %rs11;}
    // end inline asm
    fma.rn.ftz.f32 %f358, %f355, %f275, %f593;
    // begin inline asm
    { cvt.f32.f16 %f276, %rs12;}
    // end inline asm
    fma.rn.ftz.f32 %f359, %f355, %f276, %f592;
    // begin inline asm
    { cvt.f32.f16 %f277, %rs13;}
    // end inline asm
    fma.rn.ftz.f32 %f360, %f355, %f277, %f591;
    shl.b16 %rs262, %rs14, 4;
    cvt.s16.s8 %rs263, %rs262;
    shr.s16 %rs264, %rs263, 7;
    and.b16 %rs265, %rs264, -16;
    or.b16 %rs266, %rs265, %rs15;
    cvt.rn.f32.s16 %f361, %rs266;
    sub.ftz.f32 %f362, %f361, %f7;
    mul.ftz.f32 %f363, %f69, %f362;
    // begin inline asm
    { cvt.f32.f16 %f278, %rs8;}
    // end inline asm
    fma.rn.ftz.f32 %f364, %f363, %f278, %f356;
    // begin inline asm
    { cvt.f32.f16 %f279, %rs16;}
    // end inline asm
    fma.rn.ftz.f32 %f365, %f363, %f279, %f357;
    // begin inline asm
    { cvt.f32.f16 %f280, %rs17;}
    // end inline asm
    fma.rn.ftz.f32 %f366, %f363, %f280, %f358;
    // begin inline asm
    { cvt.f32.f16 %f281, %rs18;}
    // end inline asm
    fma.rn.ftz.f32 %f367, %f363, %f281, %f359;
    // begin inline asm
    { cvt.f32.f16 %f282, %rs19;}
    // end inline asm
    fma.rn.ftz.f32 %f368, %f363, %f282, %f360;
    shl.b16 %rs268, %rs20, 4;
    cvt.s16.s8 %rs269, %rs268;
    shr.s16 %rs270, %rs269, 7;
    and.b16 %rs271, %rs270, -16;
    or.b16 %rs272, %rs271, %rs21;
    cvt.rn.f32.s16 %f369, %rs272;
    sub.ftz.f32 %f370, %f369, %f7;
    mul.ftz.f32 %f371, %f69, %f370;
    // begin inline asm
    { cvt.f32.f16 %f283, %rs22;}
    // end inline asm
    fma.rn.ftz.f32 %f372, %f371, %f283, %f364;
    // begin inline asm
    { cvt.f32.f16 %f284, %rs23;}
    // end inline asm
    fma.rn.ftz.f32 %f373, %f371, %f284, %f365;
    // begin inline asm
    { cvt.f32.f16 %f285, %rs24;}
    // end inline asm
    fma.rn.ftz.f32 %f374, %f371, %f285, %f366;
    // begin inline asm
    { cvt.f32.f16 %f286, %rs25;}
    // end inline asm
    fma.rn.ftz.f32 %f375, %f371, %f286, %f367;
    // begin inline asm
    { cvt.f32.f16 %f287, %rs26;}
    // end inline asm
    fma.rn.ftz.f32 %f376, %f371, %f287, %f368;
    shl.b16 %rs273, %rs27, 4;
    cvt.s16.s8 %rs274, %rs273;
    shr.s16 %rs275, %rs274, 7;
    and.b16 %rs276, %rs275, -16;
    or.b16 %rs277, %rs276, %rs28;
    cvt.rn.f32.s16 %f377, %rs277;
    sub.ftz.f32 %f378, %f377, %f7;
    mul.ftz.f32 %f379, %f69, %f378;
    // begin inline asm
    { cvt.f32.f16 %f288, %rs29;}
    // end inline asm
    fma.rn.ftz.f32 %f380, %f379, %f288, %f372;
    // begin inline asm
    { cvt.f32.f16 %f289, %rs30;}
    // end inline asm
    fma.rn.ftz.f32 %f381, %f379, %f289, %f373;
    // begin inline asm
    { cvt.f32.f16 %f290, %rs31;}
    // end inline asm
    fma.rn.ftz.f32 %f382, %f379, %f290, %f374;
    // begin inline asm
    { cvt.f32.f16 %f291, %rs32;}
    // end inline asm
    fma.rn.ftz.f32 %f383, %f379, %f291, %f375;
    // begin inline asm
    { cvt.f32.f16 %f292, %rs33;}
    // end inline asm
    fma.rn.ftz.f32 %f384, %f379, %f292, %f376;
    shl.b16 %rs278, %rs34, 4;
    cvt.s16.s8 %rs279, %rs278;
    shr.s16 %rs280, %rs279, 7;
    and.b16 %rs281, %rs280, -16;
    or.b16 %rs282, %rs281, %rs35;
    cvt.rn.f32.s16 %f385, %rs282;
    sub.ftz.f32 %f386, %f385, %f7;
    mul.ftz.f32 %f387, %f69, %f386;
    // begin inline asm
    { cvt.f32.f16 %f293, %rs36;}
    // end inline asm
    fma.rn.ftz.f32 %f388, %f387, %f293, %f380;
    // begin inline asm
    { cvt.f32.f16 %f294, %rs38;}
    // end inline asm
    fma.rn.ftz.f32 %f389, %f387, %f294, %f381;
    // begin inline asm
    { cvt.f32.f16 %f295, %rs40;}
    // end inline asm
    fma.rn.ftz.f32 %f390, %f387, %f295, %f382;
    // begin inline asm
    { cvt.f32.f16 %f296, %rs42;}
    // end inline asm
    fma.rn.ftz.f32 %f391, %f387, %f296, %f383;
    // begin inline asm
    { cvt.f32.f16 %f297, %rs44;}
    // end inline asm
    fma.rn.ftz.f32 %f392, %f387, %f297, %f384;
    shl.b16 %rs283, %rs46, 4;
    cvt.s16.s8 %rs284, %rs283;
    shr.s16 %rs285, %rs284, 7;
    and.b16 %rs286, %rs285, -16;
    or.b16 %rs287, %rs286, %rs47;
    cvt.rn.f32.s16 %f393, %rs287;
    sub.ftz.f32 %f394, %f393, %f7;
    mul.ftz.f32 %f395, %f69, %f394;
    // begin inline asm
    { cvt.f32.f16 %f298, %rs48;}
    // end inline asm
    fma.rn.ftz.f32 %f396, %f395, %f298, %f388;
    // begin inline asm
    { cvt.f32.f16 %f299, %rs49;}
    // end inline asm
    fma.rn.ftz.f32 %f397, %f395, %f299, %f389;
    // begin inline asm
    { cvt.f32.f16 %f300, %rs50;}
    // end inline asm
    fma.rn.ftz.f32 %f398, %f395, %f300, %f390;
    // begin inline asm
    { cvt.f32.f16 %f301, %rs51;}
    // end inline asm
    fma.rn.ftz.f32 %f399, %f395, %f301, %f391;
    // begin inline asm
    { cvt.f32.f16 %f302, %rs52;}
    // end inline asm
    fma.rn.ftz.f32 %f400, %f395, %f302, %f392;
    shl.b16 %rs288, %rs53, 4;
    cvt.s16.s8 %rs289, %rs288;
    shr.s16 %rs290, %rs289, 7;
    and.b16 %rs291, %rs290, -16;
    or.b16 %rs292, %rs291, %rs54;
    cvt.rn.f32.s16 %f401, %rs292;
    sub.ftz.f32 %f402, %f401, %f7;
    mul.ftz.f32 %f403, %f69, %f402;
    mov.b32 {%rs207, %rs212}, %r71;
    // begin inline asm
    { cvt.f32.f16 %f303, %rs207;}
    // end inline asm
    fma.rn.ftz.f32 %f404, %f403, %f303, %f396;
    mov.b32 {%rs208, %rs213}, %r75;
    // begin inline asm
    { cvt.f32.f16 %f304, %rs208;}
    // end inline asm
    fma.rn.ftz.f32 %f405, %f403, %f304, %f397;
    mov.b32 {%rs209, %rs214}, %r82;
    // begin inline asm
    { cvt.f32.f16 %f305, %rs209;}
    // end inline asm
    fma.rn.ftz.f32 %f406, %f403, %f305, %f398;
    mov.b32 {%rs210, %rs215}, %r87;
    // begin inline asm
    { cvt.f32.f16 %f306, %rs210;}
    // end inline asm
    fma.rn.ftz.f32 %f407, %f403, %f306, %f399;
    mov.b32 {%rs211, %rs216}, %r91;
    // begin inline asm
    { cvt.f32.f16 %f307, %rs211;}
    // end inline asm
    fma.rn.ftz.f32 %f408, %f403, %f307, %f400;
    shl.b16 %rs293, %rs55, 4;
    cvt.s16.s8 %rs294, %rs293;
    shr.s16 %rs295, %rs294, 7;
    and.b16 %rs296, %rs295, -16;
    or.b16 %rs297, %rs296, %rs55;
    cvt.rn.f32.s16 %f409, %rs297;
    sub.ftz.f32 %f410, %f409, %f7;
    mul.ftz.f32 %f411, %f69, %f410;
    // begin inline asm
    { cvt.f32.f16 %f308, %rs212;}
    // end inline asm
    fma.rn.ftz.f32 %f412, %f411, %f308, %f404;
    // begin inline asm
    { cvt.f32.f16 %f309, %rs213;}
    // end inline asm
    fma.rn.ftz.f32 %f413, %f411, %f309, %f405;
    // begin inline asm
    { cvt.f32.f16 %f310, %rs214;}
    // end inline asm
    fma.rn.ftz.f32 %f414, %f411, %f310, %f406;
    // begin inline asm
    { cvt.f32.f16 %f311, %rs215;}
    // end inline asm
    fma.rn.ftz.f32 %f415, %f411, %f311, %f407;
    // begin inline asm
    { cvt.f32.f16 %f312, %rs216;}
    // end inline asm
    fma.rn.ftz.f32 %f416, %f411, %f312, %f408;
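    // Signed path, second half: load activation elements 8..15 for each of
    // the five columns and process nibbles 8..15 of the packed pair.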
    ld.global.v4.u32 {%r146, %r147, %r148, %r149}, [%rd7+16];
    ld.global.v4.u32 {%r154, %r155, %r156, %r157}, [%rd8+16];
    ld.global.v4.u32 {%r162, %r163, %r164, %r165}, [%rd10];
    add.s64 %rd47, %rd10, %rd9;
    ld.global.v4.u32 {%r170, %r171, %r172, %r173}, [%rd47];
    add.s64 %rd48, %rd47, %rd9;
    ld.global.v4.u32 {%r178, %r179, %r180, %r181}, [%rd48];
    shl.b16 %rs298, %rs56, 4;
    cvt.s16.s8 %rs299, %rs298;
    shr.s16 %rs300, %rs299, 7;
    and.b16 %rs301, %rs300, -16;
    or.b16 %rs302, %rs301, %rs57;
    cvt.rn.f32.s16 %f417, %rs302;
    sub.ftz.f32 %f418, %f417, %f7;
    mul.ftz.f32 %f419, %f69, %f418;
    mov.b32 {%rs217, %rs222}, %r146;
    // begin inline asm
    { cvt.f32.f16 %f313, %rs217;}
    // end inline asm
    fma.rn.ftz.f32 %f420, %f419, %f313, %f412;
    mov.b32 {%rs218, %rs223}, %r154;
    // begin inline asm
    { cvt.f32.f16 %f314, %rs218;}
    // end inline asm
    fma.rn.ftz.f32 %f421, %f419, %f314, %f413;
    mov.b32 {%rs219, %rs224}, %r162;
    // begin inline asm
    { cvt.f32.f16 %f315, %rs219;}
    // end inline asm
    fma.rn.ftz.f32 %f422, %f419, %f315, %f414;
    mov.b32 {%rs220, %rs225}, %r170;
    // begin inline asm
    { cvt.f32.f16 %f316, %rs220;}
    // end inline asm
    fma.rn.ftz.f32 %f423, %f419, %f316, %f415;
    mov.b32 {%rs221, %rs226}, %r178;
    // begin inline asm
    { cvt.f32.f16 %f317, %rs221;}
    // end inline asm
    fma.rn.ftz.f32 %f424, %f419, %f317, %f416;
    shl.b16 %rs303, %rs58, 4;
    cvt.s16.s8 %rs304, %rs303;
    shr.s16 %rs305, %rs304, 7;
    and.b16 %rs306, %rs305, -16;
    or.b16 %rs307, %rs306, %rs59;
    cvt.rn.f32.s16 %f425, %rs307;
    sub.ftz.f32 %f426, %f425, %f7;
    mul.ftz.f32 %f427, %f69, %f426;
    // begin inline asm
    { cvt.f32.f16 %f318, %rs222;}
    // end inline asm
    fma.rn.ftz.f32 %f428, %f427, %f318, %f420;
    // begin inline asm
    { cvt.f32.f16 %f319, %rs223;}
    // end inline asm
    fma.rn.ftz.f32 %f429, %f427, %f319, %f421;
    // begin inline asm
    { cvt.f32.f16 %f320, %rs224;}
    // end inline asm
    fma.rn.ftz.f32 %f430, %f427, %f320, %f422;
    // begin inline asm
    { cvt.f32.f16 %f321, %rs225;}
    // end inline asm
    fma.rn.ftz.f32 %f431, %f427, %f321, %f423;
    // begin inline asm
    { cvt.f32.f16 %f322, %rs226;}
    // end inline asm
    fma.rn.ftz.f32 %f432, %f427, %f322, %f424;
    shl.b16 %rs308, %rs60, 4;
    cvt.s16.s8 %rs309, %rs308;
    shr.s16 %rs310, %rs309, 7;
    and.b16 %rs311, %rs310, -16;
    or.b16 %rs312, %rs311, %rs61;
    cvt.rn.f32.s16 %f433, %rs312;
    sub.ftz.f32 %f434, %f433, %f7;
    mul.ftz.f32 %f435, %f69, %f434;
    mov.b32 {%rs227, %rs232}, %r147;
    // begin inline asm
    { cvt.f32.f16 %f323, %rs227;}
    // end inline asm
    fma.rn.ftz.f32 %f436, %f435, %f323, %f428;
    mov.b32 {%rs228, %rs233}, %r155;
    // begin inline asm
    { cvt.f32.f16 %f324, %rs228;}
    // end inline asm
    fma.rn.ftz.f32 %f437, %f435, %f324, %f429;
    mov.b32 {%rs229, %rs234}, %r163;
    // begin inline asm
    { cvt.f32.f16 %f325, %rs229;}
    // end inline asm
    fma.rn.ftz.f32 %f438, %f435, %f325, %f430;
    mov.b32 {%rs230, %rs235}, %r171;
    // begin inline asm
    { cvt.f32.f16 %f326, %rs230;}
    // end inline asm
    fma.rn.ftz.f32 %f439, %f435, %f326, %f431;
    mov.b32 {%rs231, %rs236}, %r179;
    // begin inline asm
    { cvt.f32.f16 %f327, %rs231;}
    // end inline asm
    fma.rn.ftz.f32 %f440, %f435, %f327, %f432;
    shl.b16 %rs313, %rs62, 4;
    cvt.s16.s8 %rs314, %rs313;
    shr.s16 %rs315, %rs314, 7;
    and.b16 %rs316, %rs315, -16;
    or.b16 %rs317, %rs316, %rs63;
    cvt.rn.f32.s16 %f441, %rs317;
    sub.ftz.f32 %f442, %f441, %f7;
    mul.ftz.f32 %f443, %f69, %f442;
    // begin inline asm
    { cvt.f32.f16 %f328, %rs232;}
    // end inline asm
    fma.rn.ftz.f32 %f444, %f443, %f328, %f436;
    // begin inline asm
    { cvt.f32.f16 %f329, %rs233;}
    // end inline asm
    fma.rn.ftz.f32 %f445, %f443, %f329, %f437;
    // begin inline asm
    { cvt.f32.f16 %f330, %rs234;}
    // end inline asm
    fma.rn.ftz.f32 %f446, %f443, %f330, %f438;
    // begin inline asm
    { cvt.f32.f16 %f331, %rs235;}
    // end inline asm
    fma.rn.ftz.f32 %f447, %f443, %f331, %f439;
    // begin inline asm
    { cvt.f32.f16 %f332, %rs236;}
    // end inline asm
    fma.rn.ftz.f32 %f448, %f443, %f332, %f440;
    shl.b16 %rs318, %rs64, 4;
    cvt.s16.s8 %rs319, %rs318;
    shr.s16 %rs320, %rs319, 7;
    and.b16 %rs321, %rs320, -16;
    or.b16 %rs322, %rs321, %rs65;
    cvt.rn.f32.s16 %f449, %rs322;
    sub.ftz.f32 %f450, %f449, %f7;
    mul.ftz.f32 %f451, %f69, %f450;
    mov.b32 {%rs237, %rs242}, %r148;
    // begin inline asm
    { cvt.f32.f16 %f333, %rs237;}
    // end inline asm
    fma.rn.ftz.f32 %f452, %f451, %f333, %f444;
    mov.b32 {%rs238, %rs243}, %r156;
    // begin inline asm
    { cvt.f32.f16 %f334, %rs238;}
    // end inline asm
    fma.rn.ftz.f32 %f453, %f451, %f334, %f445;
    mov.b32 {%rs239, %rs244}, %r164;
    // begin inline asm
    { cvt.f32.f16 %f335, %rs239;}
    // end inline asm
    fma.rn.ftz.f32 %f454, %f451, %f335, %f446;
    mov.b32 {%rs240, %rs245}, %r172;
    // begin inline asm
    { cvt.f32.f16 %f336, %rs240;}
    // end inline asm
    fma.rn.ftz.f32 %f455, %f451, %f336, %f447;
    mov.b32 {%rs241, %rs246}, %r180;
    // begin inline asm
    { cvt.f32.f16 %f337, %rs241;}
    // end inline asm
    fma.rn.ftz.f32 %f456, %f451, %f337, %f448;
    shl.b16 %rs323, %rs66, 4;
    cvt.s16.s8 %rs324, %rs323;
    shr.s16 %rs325, %rs324, 7;
    and.b16 %rs326, %rs325, -16;
    or.b16 %rs327, %rs326, %rs67;
    cvt.rn.f32.s16 %f457, %rs327;
    sub.ftz.f32 %f458, %f457, %f7;
    mul.ftz.f32 %f459, %f69, %f458;
    // begin inline asm
    { cvt.f32.f16 %f338, %rs242;}
    // end inline asm
    fma.rn.ftz.f32 %f460, %f459, %f338, %f452;
    // begin inline asm
    { cvt.f32.f16 %f339, %rs243;}
    // end inline asm
    fma.rn.ftz.f32 %f461, %f459, %f339, %f453;
    // begin inline asm
    { cvt.f32.f16 %f340, %rs244;}
    // end inline asm
    fma.rn.ftz.f32 %f462, %f459, %f340, %f454;
    // begin inline asm
    { cvt.f32.f16 %f341, %rs245;}
    // end inline asm
    fma.rn.ftz.f32 %f463, %f459, %f341, %f455;
    // begin inline asm
    { cvt.f32.f16 %f342, %rs246;}
    // end inline asm
    fma.rn.ftz.f32 %f464, %f459, %f342, %f456;
    shl.b16 %rs328, %rs68, 4;
    cvt.s16.s8 %rs329, %rs328;
    shr.s16 %rs330, %rs329, 7;
    and.b16 %rs331, %rs330, -16;
    or.b16 %rs332, %rs331, %rs69;
    cvt.rn.f32.s16 %f465, %rs332;
    sub.ftz.f32 %f466, %f465, %f7;
    mul.ftz.f32 %f467, %f69, %f466;
    mov.b32 {%rs247, %rs252}, %r149;
    // begin inline asm
    { cvt.f32.f16 %f343, %rs247;}
    // end inline asm
    fma.rn.ftz.f32 %f468, %f467, %f343, %f460;
    mov.b32 {%rs248, %rs253}, %r157;
    // begin inline asm
    { cvt.f32.f16 %f344, %rs248;}
    // end inline asm
    fma.rn.ftz.f32 %f469, %f467, %f344, %f461;
    mov.b32 {%rs249, %rs254}, %r165;
    // begin inline asm
    { cvt.f32.f16 %f345, %rs249;}
    // end inline asm
    fma.rn.ftz.f32 %f470, %f467, %f345, %f462;
    mov.b32 {%rs250, %rs255}, %r173;
    // begin inline asm
    { cvt.f32.f16 %f346, %rs250;}
    // end inline asm
    fma.rn.ftz.f32 %f471, %f467, %f346, %f463;
    mov.b32 {%rs251, %rs256}, %r181;
    // begin inline asm
    { cvt.f32.f16 %f347, %rs251;}
    // end inline asm
    fma.rn.ftz.f32 %f472, %f467, %f347, %f464;
    shl.b16 %rs333, %rs70, 4;
    cvt.s16.s8 %rs334, %rs333;
    shr.s16 %rs335, %rs334, 7;
    and.b16 %rs336, %rs335, -16;
    or.b16 %rs337, %rs336, %rs70;
    cvt.rn.f32.s16 %f473, %rs337;
    sub.ftz.f32 %f474, %f473, %f7;
    mul.ftz.f32 %f475, %f69, %f474;
    // begin inline asm
    { cvt.f32.f16 %f348, %rs252;}
    // end inline asm
    fma.rn.ftz.f32 %f595, %f475, %f348, %f468;
    // begin inline asm
    { cvt.f32.f16 %f349, %rs253;}
    // end inline asm
    fma.rn.ftz.f32 %f594, %f475, %f349, %f469;
    // begin inline asm
    { cvt.f32.f16 %f350, %rs254;}
    // end inline asm
    fma.rn.ftz.f32 %f593, %f475, %f350, %f470;
    // begin inline asm
    { cvt.f32.f16 %f351, %rs255;}
    // end inline asm
    fma.rn.ftz.f32 %f592, %f475, %f351, %f471;
    // begin inline asm
    { cvt.f32.f16 %f352, %rs256;}
    // end inline asm
    fma.rn.ftz.f32 %f591, %f475, %f352, %f472;

$L__BB0_8:
    add.s32 %r330, %r330, 4;
    shl.b32 %r186, %r330, 5;
    add.s32 %r329, %r186, %r51;
    shl.b32 %r328, %r329, 1;
    setp.lt.u32 %p7, %r328, %r48;
    @%p7 bra $L__BB0_2;

$L__BB0_9:
    mov.u32 %r188, %tid.y;
    shl.b32 %r189, %r188, 5;
    add.s32 %r38, %r189, %r51;
    setp.lt.u32 %p8, %r38, 32;
    shl.b32 %r191, %r38, 2;
    mov.u32 %r192, _ZZ9gemv_int4ILi4ELi64ELi5EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage;
    add.s32 %r193, %r192, %r191;
    @%p8 bra $L__BB0_11;

    add.s32 %r323, %r193, -112;
    st.shared.f32 [%r323], %f595;
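    // Block reduction, accumulator 0 of 5: warps 1..3 just staged their
    // partials in temp_storage; warp 0 sums three staged values per lane,
    // then finishes with a shfl.sync.down tree (offsets 1, 2, 4, 8, 16).
    // The same sequence repeats for the other four accumulators, each using
    // a 640-byte slice of temp_storage.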
$L__BB0_11:
    setp.gt.u32 %p9, %r38, 31;
    bar.sync 0;
    mad.lo.s32 %r40, %r38, 12, %r192;
    @%p9 bra $L__BB0_13;

    mov.u32 %r212, 16;
    ld.shared.f32 %f491, [%r40+16];
    add.ftz.f32 %f492, %f595, %f491;
    ld.shared.f32 %f493, [%r40+20];
    add.ftz.f32 %f494, %f492, %f493;
    ld.shared.f32 %f495, [%r40+24];
    add.ftz.f32 %f478, %f494, %f495;
    mov.u32 %r200, 1;
    mov.u32 %r213, 31;
    mov.u32 %r214, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f478, %r200, %r213, %r214; @p add.f32 r0, r0, %f478; mov.f32 %f476, r0;}
    // end inline asm
    mov.u32 %r203, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f476, %r203, %r213, %r214; @p add.f32 r0, r0, %f476; mov.f32 %f479, r0;}
    // end inline asm
    mov.u32 %r206, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f479, %r206, %r213, %r214; @p add.f32 r0, r0, %f479; mov.f32 %f482, r0;}
    // end inline asm
    mov.u32 %r209, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f482, %r209, %r213, %r214; @p add.f32 r0, r0, %f482; mov.f32 %f485, r0;}
    // end inline asm
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f485, %r212, %r213, %r214; @p add.f32 r0, r0, %f485; mov.f32 %f595, r0;}
    // end inline asm

$L__BB0_13:
    @%p8 bra $L__BB0_15;

    add.s32 %r324, %r193, -112;
    st.shared.f32 [%r324+640], %f594;

$L__BB0_15:
    bar.sync 0;
    @%p9 bra $L__BB0_17;

    ld.shared.f32 %f511, [%r40+656];
    add.ftz.f32 %f512, %f594, %f511;
    ld.shared.f32 %f513, [%r40+660];
    add.ftz.f32 %f514, %f512, %f513;
    ld.shared.f32 %f515, [%r40+664];
    add.ftz.f32 %f498, %f514, %f515;
    mov.u32 %r224, 1;
    mov.u32 %r237, 31;
    mov.u32 %r238, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f498, %r224, %r237, %r238; @p add.f32 r0, r0, %f498; mov.f32 %f496, r0;}
    // end inline asm
    mov.u32 %r227, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f496, %r227, %r237, %r238; @p add.f32 r0, r0, %f496; mov.f32 %f499, r0;}
    // end inline asm
    mov.u32 %r230, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f499, %r230, %r237, %r238; @p add.f32 r0, r0, %f499; mov.f32 %f502, r0;}
    // end inline asm
    mov.u32 %r233, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f502, %r233, %r237, %r238; @p add.f32 r0, r0, %f502; mov.f32 %f505, r0;}
    // end inline asm
    mov.u32 %r236, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f505, %r236, %r237, %r238; @p add.f32 r0, r0, %f505; mov.f32 %f594, r0;}
    // end inline asm

$L__BB0_17:
    @%p8 bra $L__BB0_19;

    add.s32 %r325, %r193, -112;
    st.shared.f32 [%r325+1280], %f593;

$L__BB0_19:
    bar.sync 0;
    @%p9 bra $L__BB0_21;

    ld.shared.f32 %f531, [%r40+1296];
    add.ftz.f32 %f532, %f593, %f531;
    ld.shared.f32 %f533, [%r40+1300];
    add.ftz.f32 %f534, %f532, %f533;
    ld.shared.f32 %f535, [%r40+1304];
    add.ftz.f32 %f518, %f534, %f535;
    mov.u32 %r248, 1;
    mov.u32 %r261, 31;
    mov.u32 %r262, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f518, %r248, %r261, %r262; @p add.f32 r0, r0, %f518; mov.f32 %f516, r0;}
    // end inline asm
    mov.u32 %r251, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f516, %r251, %r261, %r262; @p add.f32 r0, r0, %f516; mov.f32 %f519, r0;}
    // end inline asm
    mov.u32 %r254, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f519, %r254, %r261, %r262; @p add.f32 r0, r0, %f519; mov.f32 %f522, r0;}
    // end inline asm
    mov.u32 %r257, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f522, %r257, %r261, %r262; @p add.f32 r0, r0, %f522; mov.f32 %f525, r0;}
    // end inline asm
    mov.u32 %r260, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f525, %r260, %r261, %r262; @p add.f32 r0, r0, %f525; mov.f32 %f593, r0;}
    // end inline asm

$L__BB0_21:
    @%p8 bra $L__BB0_23;

    add.s32 %r326, %r193, -112;
    st.shared.f32 [%r326+1920], %f592;

$L__BB0_23:
    bar.sync 0;
    @%p9 bra $L__BB0_25;

    ld.shared.f32 %f551, [%r40+1936];
    add.ftz.f32 %f552, %f592, %f551;
    ld.shared.f32 %f553, [%r40+1940];
    add.ftz.f32 %f554, %f552, %f553;
    ld.shared.f32 %f555, [%r40+1944];
    add.ftz.f32 %f538, %f554, %f555;
    mov.u32 %r272, 1;
    mov.u32 %r285, 31;
    mov.u32 %r286, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f538, %r272, %r285, %r286; @p add.f32 r0, r0, %f538; mov.f32 %f536, r0;}
    // end inline asm
    mov.u32 %r275, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f536, %r275, %r285, %r286; @p add.f32 r0, r0, %f536; mov.f32 %f539, r0;}
    // end inline asm
    mov.u32 %r278, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f539, %r278, %r285, %r286; @p add.f32 r0, r0, %f539; mov.f32 %f542, r0;}
    // end inline asm
    mov.u32 %r281, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f542, %r281, %r285, %r286; @p add.f32 r0, r0, %f542; mov.f32 %f545, r0;}
    // end inline asm
    mov.u32 %r284, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f545, %r284, %r285, %r286; @p add.f32 r0, r0, %f545; mov.f32 %f592, r0;}
    // end inline asm

$L__BB0_25:
    @%p8 bra $L__BB0_27;

    add.s32 %r327, %r193, -112;
    st.shared.f32 [%r327+2560], %f591;

$L__BB0_27:
    bar.sync 0;
    @%p9 bra $L__BB0_29;

    ld.shared.f32 %f571, [%r40+2576];
    add.ftz.f32 %f572, %f591, %f571;
    ld.shared.f32 %f573, [%r40+2580];
    add.ftz.f32 %f574, %f572, %f573;
    ld.shared.f32 %f575, [%r40+2584];
    add.ftz.f32 %f558, %f574, %f575;
    mov.u32 %r296, 1;
    mov.u32 %r309, 31;
    mov.u32 %r310, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f558, %r296, %r309, %r310; @p add.f32 r0, r0, %f558; mov.f32 %f556, r0;}
    // end inline asm
    mov.u32 %r299, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f556, %r299, %r309, %r310; @p add.f32 r0, r0, %f556; mov.f32 %f559, r0;}
    // end inline asm
    mov.u32 %r302, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f559, %r302, %r309, %r310; @p add.f32 r0, r0, %f559; mov.f32 %f562, r0;}
    // end inline asm
    mov.u32 %r305, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f562, %r305, %r309, %r310; @p add.f32 r0, r0, %f562; mov.f32 %f565, r0;}
    // end inline asm
    mov.u32 %r308, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f565, %r308, %r309, %r310; @p add.f32 r0, r0, %f565; mov.f32 %f591, r0;}
    // end inline asm

$L__BB0_29:
    or.b32 %r313, %r51, %r188;
    setp.ne.s32 %p18, %r313, 0;
    @%p18 bra $L__BB0_41;
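    // Epilogue, thread (0,0) only: each of the five reduced results is
    // scaled by %f62 (first float param, plausibly alpha) and written as fp16
    // at out[%ctaid.x + k * %r47]; when the pointer at param+8 is non-null,
    // %f63 (second float param, plausibly beta) times its fp16 value is added
    // first. Stores use evict_last/evict_unchanged L2 cache-hint policies.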
    ld.param.u64 %rd79, [_Z27dequant_gemv_group64_batch523DequantGemvKernelParams_param_0+8];
    setp.eq.s64 %p19, %rd79, 0;
    mul.ftz.f32 %f606, %f62, %f595;
    mov.u32 %r314, %ctaid.x;
    cvt.s64.s32 %rd11, %r314;
    @%p19 bra $L__BB0_32;

    shl.b64 %rd49, %rd11, 1;
    add.s64 %rd50, %rd2, %rd49;
    ld.global.u16 %rs338, [%rd50];
    // begin inline asm
    { cvt.f32.f16 %f576, %rs338;}
    // end inline asm
    fma.rn.ftz.f32 %f606, %f63, %f576, %f606;

$L__BB0_32:
    ld.param.u64 %rd80, [_Z27dequant_gemv_group64_batch523DequantGemvKernelParams_param_0];
    // begin inline asm
    { cvt.rn.f16.f32 %rs339, %f606;}
    // end inline asm
    // begin inline asm
    createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd51, 1.0;
    // end inline asm
    shl.b64 %rd54, %rd11, 1;
    add.s64 %rd52, %rd80, %rd54;
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd52], %rs339, %rd51;
    // end inline asm
    mul.ftz.f32 %f607, %f62, %f594;
    add.s32 %r316, %r47, %r314;
    cvt.s64.s32 %rd14, %r316;
    @%p19 bra $L__BB0_34;

    shl.b64 %rd55, %rd14, 1;
    add.s64 %rd56, %rd2, %rd55;
    ld.global.u16 %rs341, [%rd56];
    // begin inline asm
    { cvt.f32.f16 %f578, %rs341;}
    // end inline asm
    fma.rn.ftz.f32 %f607, %f63, %f578, %f607;

$L__BB0_34:
    mul.wide.s32 %rd60, %r47, 2;
    add.s64 %rd58, %rd52, %rd60;
    // begin inline asm
    { cvt.rn.f16.f32 %rs342, %f607;}
    // end inline asm
    // begin inline asm
    createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd57, 1.0;
    // end inline asm
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd58], %rs342, %rd57;
    // end inline asm
    mul.ftz.f32 %f608, %f62, %f593;
    cvt.u32.u64 %r317, %rd14;
    add.s32 %r318, %r317, %r47;
    cvt.s64.s32 %rd15, %r318;
    @%p19 bra $L__BB0_36;

    shl.b64 %rd61, %rd15, 1;
    add.s64 %rd62, %rd2, %rd61;
    ld.global.u16 %rs344, [%rd62];
    // begin inline asm
    { cvt.f32.f16 %f580, %rs344;}
    // end inline asm
    fma.rn.ftz.f32 %f608, %f63, %f580, %f608;

$L__BB0_36:
    ld.param.u64 %rd81, [_Z27dequant_gemv_group64_batch523DequantGemvKernelParams_param_0];
    // begin inline asm
    { cvt.rn.f16.f32 %rs345, %f608;}
    // end inline asm
    // begin inline asm
    createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd63, 1.0;
    // end inline asm
    shl.b64 %rd66, %rd15, 1;
    add.s64 %rd64, %rd81, %rd66;
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd64], %rs345, %rd63;
    // end inline asm
    mul.ftz.f32 %f609, %f62, %f592;
    cvt.u32.u64 %r319, %rd15;
    add.s32 %r320, %r319, %r47;
    cvt.s64.s32 %rd16, %r320;
    @%p19 bra $L__BB0_38;

    shl.b64 %rd67, %rd16, 1;
    add.s64 %rd68, %rd2, %rd67;
    ld.global.u16 %rs347, [%rd68];
    // begin inline asm
    { cvt.f32.f16 %f582, %rs347;}
    // end inline asm
    fma.rn.ftz.f32 %f609, %f63, %f582, %f609;

$L__BB0_38:
    ld.param.u64 %rd82, [_Z27dequant_gemv_group64_batch523DequantGemvKernelParams_param_0];
    // begin inline asm
    { cvt.rn.f16.f32 %rs348, %f609;}
    // end inline asm
    // begin inline asm
    createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd69, 1.0;
    // end inline asm
    shl.b64 %rd72, %rd16, 1;
    add.s64 %rd70, %rd82, %rd72;
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd70], %rs348, %rd69;
    // end inline asm
    mul.ftz.f32 %f610, %f62, %f591;
    cvt.u32.u64 %r321, %rd16;
    add.s32 %r322, %r321, %r47;
    cvt.s64.s32 %rd17, %r322;
    @%p19 bra $L__BB0_40;

    shl.b64 %rd73, %rd17, 1;
    add.s64 %rd74, %rd2, %rd73;
    ld.global.u16 %rs350, [%rd74];
    // begin inline asm
    { cvt.f32.f16 %f584, %rs350;}
    // end inline asm
    fma.rn.ftz.f32 %f610, %f63, %f584, %f610;

$L__BB0_40:
    ld.param.u64 %rd83, [_Z27dequant_gemv_group64_batch523DequantGemvKernelParams_param_0];
    // begin inline asm
    { cvt.rn.f16.f32 %rs351, %f610;}
    // end inline asm
    // begin inline asm
    createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd75, 1.0;
    // end inline asm
    shl.b64 %rd78, %rd17, 1;
    add.s64 %rd76, %rd83, %rd78;
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd76], %rs351, %rd75;
    // end inline asm

$L__BB0_41:
    ret;
}

// .globl _ZN3cub11EmptyKernelIvEEvv
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
    ret;
}