44];
ld.param.u32 %r39, [gemvKernel_vectorized_fp16_param_0+40];
ld.param.u64 %rd9, [gemvKernel_vectorized_fp16_param_0+24];
ld.param.u64 %rd8, [gemvKernel_vectorized_fp16_param_0+16];
ld.param.u64 %rd7, [gemvKernel_vectorized_fp16_param_0+8];
ld.param.u64 %rd6, [gemvKernel_vectorized_fp16_param_0];
cvta.to.global.u64 %rd1, %rd6;
cvta.to.global.u64 %rd2, %rd7;
mov.u32 %r42, %ntid.y;
mov.u32 %r43, %ctaid.x;
mov.u32 %r44, %tid.y;
mad.lo.s32 %r1, %r43, %r42, %r44;
setp.ge.s32 %p2, %r1, %r40;
@%p2 bra $L__BB0_55;
setp.lt.s32 %p3, %r39, 1;
mov.f32 %f119, 0f00000000;
mov.u32 %r3, %tid.x;
@%p3 bra $L__BB0_51;
mov.u32 %r5, WARP_SZ;
mul.lo.s32 %r6, %r41, %r1;
and.b32 %r7, %r39, 1;
setp.eq.s32 %p4, %r39, 1;
mov.f32 %f119, 0f00000000;
mov.u32 %r121, 0;
@%p4 bra $L__BB0_35;
sub.s32 %r120, %r39, %r7;
$L__BB0_4:
mad.lo.s32 %r11, %r121, %r5, %r3;
shl.b32 %r12, %r11, 3;
setp.ge.s32 %p5, %r12, %r41;
@%p5 bra $L__BB0_19;
add.s32 %r47, %r12, %r6;
mul.wide.s32 %rd10, %r47, 2;
add.s64 %rd11, %rd1, %rd10;
ld.global.v4.u32 {%r48, %r49, %r50, %r51}, [%rd11];
mul.wide.s32 %rd12, %r12, 2;
add.s64 %rd13, %rd2, %rd12;
ld.global.v4.u32 {%r53, %r54, %r55, %r56}, [%rd13];
mov.b32 {%rs21, %rs2}, %r48;
// begin inline asm
{ cvt.f32.f16 %f57, %rs21;}
// end inline asm
mov.b32 {%rs22, %rs4}, %r53;
// begin inline asm
{ cvt.f32.f16 %f58, %rs22;}
// end inline asm
fma.rn.ftz.f32 %f119, %f57, %f58, %f119;
add.s32 %r58, %r12, 1;
setp.ge.s32 %p6, %r58, %r41;
@%p6 bra $L__BB0_7;
// begin inline asm
{ cvt.f32.f16 %f59, %rs2;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f60, %rs4;}
// end inline asm
fma.rn.ftz.f32 %f119, %f59, %f60, %f119;
$L__BB0_7:
add.s32 %r59, %r12, 2;
setp.ge.s32 %p7, %r59, %r41;
@%p7 bra $L__BB0_9;
cvt.u16.u32 %rs25, %r49;
// begin inline asm
{ cvt.f32.f16 %f61, %rs25;}
// end inline asm
cvt.u16.u32 %rs26, %r54;
// begin inline asm
{ cvt.f32.f16 %f62, %rs26;}
// end inline asm
fma.rn.ftz.f32 %f119, %f61, %f62, %f119;
$L__BB0_9:
add.s32 %r60, %r12, 3;
setp.ge.s32 %p8, %r60, %r41;
@%p8 bra $L__BB0_11;
mov.b32 {%rs29, %rs27}, %r49;
// begin inline asm
{ cvt.f32.f16 %f63, %rs27;}
// end inline asm
mov.b32 {%rs30, %rs28}, %r54;
// begin inline asm
{ cvt.f32.f16 %f64, %rs28;}
// end inline asm
fma.rn.ftz.f32 %f119, %f63, %f64, %f119;
$L__BB0_11:
add.s32 %r61, %r12, 4;
setp.ge.s32 %p9, %r61, %r41;
@%p9 bra $L__BB0_13;
cvt.u16.u32 %rs31, %r50;
// begin inline asm
{ cvt.f32.f16 %f65, %rs31;}
// end inline asm
cvt.u16.u32 %rs32, %r55;
// begin inline asm
{ cvt.f32.f16 %f66, %rs32;}
// end inline asm
fma.rn.ftz.f32 %f119, %f65, %f66, %f119;
$L__BB0_13:
add.s32 %r62, %r12, 5;
setp.ge.s32 %p10, %r62, %r41;
@%p10 bra $L__BB0_15;
mov.b32 {%rs35, %rs33}, %r50;
// begin inline asm
{ cvt.f32.f16 %f67, %rs33;}
// end inline asm
mov.b32 {%rs36, %rs34}, %r55;
// begin inline asm
{ cvt.f32.f16 %f68, %rs34;}
// end inline asm
fma.rn.ftz.f32 %f119, %f67, %f68, %f119;
$L__BB0_15:
add.s32 %r63, %r12, 6;
setp.ge.s32 %p11, %r63, %r41;
@%p11 bra $L__BB0_17;
cvt.u16.u32 %rs37, %r51;
// begin inline asm
{ cvt.f32.f16 %f69, %rs37;}
// end inline asm
cvt.u16.u32 %rs38, %r56;
// begin inline asm
{ cvt.f32.f16 %f70, %rs38;}
// end inline asm
fma.rn.ftz.f32 %f119, %f69, %f70, %f119;
$L__BB0_17:
add.s32 %r64, %r12, 7;
setp.ge.s32 %p12, %r64, %r41;
@%p12 bra $L__BB0_19;
mov.b32 {%rs41, %rs39}, %r51;
// begin inline asm
{ cvt.f32.f16 %f71, %rs39;}
// end inline asm
mov.b32 {%rs42, %rs40}, %r56;
// begin inline asm
{ cvt.f32.f16 %f72, %rs40;}
// end inline asm
fma.rn.ftz.f32 %f119, %f71, %f72, %f119;
$L__BB0_19:
add.s32 %r65, %r11, %r5;
shl.b32 %r19, %r65, 3;
setp.ge.s32 %p13, %r19, %r41;
@%p13 bra $L__BB0_34;
add.s32 %r66, %r19, %r6;
mul.wide.s32 %rd14, %r66, 2;
add.s64 %rd15, %rd1, %rd14;
ld.global.v4.u32 {%r67, %r68, %r69, %r70}, [%rd15];
mul.wide.s32 %rd16, %r19, 2;
add.s64 %rd17, %rd2, %rd16;
ld.global.v4.u32 {%r72, %r73, %r74, %r75}, [%rd17];
mov.b32 {%rs43, %rs6}, %r67;
// begin inline asm
{ cvt.f32.f16 %f73, %rs43;}
// end inline asm
mov.b32 {%rs44, %rs8}, %r72;
// begin inline asm
{ cvt.f32.f16 %f74, %rs44;}
// end inline asm
fma.rn.ftz.f32 %f119, %f73, %f74, %f119;
add.s32 %r77, %r19, 1;
setp.ge.s32 %p14, %r77, %r41;
@%p14 bra $L__BB0_22;
// begin inline asm
{ cvt.f32.f16 %f75, %rs6;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f76, %rs8;}
// end inline asm
fma.rn.ftz.f32 %f119, %f75, %f76, %f119;
$L__BB0_22:
add.s32 %r78, %r19, 2;
setp.ge.s32 %p15, %r78, %r41;
@%p15 bra $L__BB0_24;
cvt.u16.u32 %rs47, %r68;
// begin inline asm
{ cvt.f32.f16 %f77, %rs47;}
// end inline asm
cvt.u16.u32 %rs48, %r73;
// begin inline asm
{ cvt.f32.f16 %f78, %rs48;}
// end inline asm
fma.rn.ftz.f32 %f119, %f77, %f78, %f119;
$L__BB0_24:
add.s32 %r79, %r19, 3;
setp.ge.s32 %p16, %r79, %r41;
@%p16 bra $L__BB0_26;
mov.b32 {%rs51, %rs49}, %r68;
// begin inline asm
{ cvt.f32.f16 %f79, %rs49;}
// end inline asm
mov.b32 {%rs52, %rs50}, %r73;
// begin inline asm
{ cvt.f32.f16 %f80, %rs50;}
// end inline asm
fma.rn.ftz.f32 %f119, %f79, %f80, %f119;
$L__BB0_26:
add.s32 %r80, %r19, 4;
setp.ge.s32 %p17, %r80, %r41;
@%p17 bra $L__BB0_28;
cvt.u16.u32 %rs53, %r69;
// begin inline asm
{ cvt.f32.f16 %f81, %rs53;}
// end inline asm
cvt.u16.u32 %rs54, %r74;
// begin inline asm
{ cvt.f32.f16 %f82, %rs54;}
// end inline asm
fma.rn.ftz.f32 %f119, %f81, %f82, %f119;
$L__BB0_28:
add.s32 %r81, %r19, 5;
setp.ge.s32 %p18, %r81, %r41;
@%p18 bra $L__BB0_30;
mov.b32 {%rs57, %rs55}, %r69;
// begin inline asm
{ cvt.f32.f16 %f83, %rs55;}
// end inline asm
mov.b32 {%rs58, %rs56}, %r74;
// begin inline asm
{ cvt.f32.f16 %f84, %rs56;}
// end inline asm
fma.rn.ftz.f32 %f119, %f83, %f84, %f119;
$L__BB0_30:
add.s32 %r82, %r19, 6;
setp.ge.s32 %p19, %r82, %r41;
@%p19 bra $L__BB0_32;
cvt.u16.u32 %rs59, %r70;
// begin inline asm
{ cvt.f32.f16 %f85, %rs59;}
// end inline asm
cvt.u16.u32 %rs60, %r75;
// begin inline asm
{ cvt.f32.f16 %f86, %rs60;}
// end inline asm
fma.rn.ftz.f32 %f119, %f85, %f86, %f119;
$L__BB0_32:
add.s32 %r83, %r19, 7;
setp.ge.s32 %p20, %r83, %r41;
@%p20 bra $L__BB0_34;
mov.b32 {%rs63, %rs61}, %r70;
// begin inline asm
{ cvt.f32.f16 %f87, %rs61;}
// end inline asm
mov.b32 {%rs64, %rs62}, %r75;
// begin inline asm
{ cvt.f32.f16 %f88, %rs62;}
// end inline asm
fma.rn.ftz.f32 %f119, %f87, %f88, %f119;
$L__BB0_34:
add.s32 %r121, %r121, 2;
add.s32 %r120, %r120, -2;
setp.ne.s32 %p21, %r120, 0;
@%p21 bra $L__BB0_4;
$L__BB0_35:
setp.eq.s32 %p22, %r7, 0;
@%p22 bra $L__BB0_51;
mad.lo.s32 %r84, %r121, %r5, %r3;
shl.b32 %r29, %r84, 3;
setp.ge.s32 %p23, %r29, %r41;
@%p23 bra $L__BB0_51;
add.s32 %r85, %r29, %r6;
mul.wide.s32 %rd18, %r85, 2;
add.s64 %rd19, %rd1, %rd18;
ld.global.v4.u32 {%r86, %r87, %r88, %r89}, [%rd19];
mul.wide.s32 %rd20, %r29, 2;
add.s64 %rd21, %rd2, %rd20;
ld.global.v4.u32 {%r91, %r92, %r93, %r94}, [%rd21];
mov.b32 {%rs65, %rs10}, %r86;
// begin inline asm
{ cvt.f32.f16 %f89, %rs65;}
// end inline asm
mov.b32 {%rs66, %rs12}, %r91;
// begin inline asm
{ cvt.f32.f16 %f90, %rs66;}
// end inline asm
fma.rn.ftz.f32 %f119, %f89, %f90, %f119;
add.s32 %r96, %r29, 1;
setp.ge.s32 %p24, %r96, %r41;
@%p24 bra $L__BB0_39;
// begin inline asm
{ cvt.f32.f16 %f91, %rs10;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f92, %rs12;}
// end inline asm
fma.rn.ftz.f32 %f119, %f91, %f92, %f119;
$L__BB0_39:
add.s32 %r97, %r29, 2;
setp.ge.s32 %p25, %r97, %r41;
@%p25 bra $L__BB0_41;
cvt.u16.u32 %rs69, %r87;
// begin inline asm
{ cvt.f32.f16 %f93, %rs69;}
// end inline asm
cvt.u16.u32 %rs70, %r92;
// begin inline asm
{ cvt.f32.f16 %f94, %rs70;}
// end inline asm
fma.rn.ftz.f32 %f119, %f93, %f94, %f119;
$L__BB0_41:
add.s32 %r98, %r29, 3;
setp.ge.s32 %p26, %r98, %r41;
@%p26 bra $L__BB0_43;
mov.b32 {%rs73, %rs71}, %r87;
// begin inline asm
{ cvt.f32.f16 %f95, %rs71;}
// end inline asm
mov.b32 {%rs74, %rs72}, %r92;
// begin inline asm
{ cvt.f32.f16 %f96, %rs72;}
// end inline asm
fma.rn.ftz.f32 %f119, %f95, %f96, %f119;
$L__BB0_43:
add.s32 %r99, %r29, 4;
setp.ge.s32 %p27, %r99, %r41;
@%p27 bra $L__BB0_45;
cvt.u16.u32 %rs75, %r88;
// begin inline asm
{ cvt.f32.f16 %f97, %rs75;}
// end inline asm
cvt.u16.u32 %rs76, %r93;
// begin inline asm
{ cvt.f32.f16 %f98, %rs76;}
// end inline asm
fma.rn.ftz.f32 %f119, %f97, %f98, %f119;
$L__BB0_45:
add.s32 %r100, %r29, 5;
setp.ge.s32 %p28, %r100, %r41;
@%p28 bra $L__BB0_47;
mov.b32 {%rs79, %rs77}, %r88;
// begin inline asm
{ cvt.f32.f16 %f99, %rs77;}
// end inline asm
mov.b32 {%rs80, %rs78}, %r93;
// begin inline asm
{ cvt.f32.f16 %f100, %rs78;}
// end inline asm
fma.rn.ftz.f32 %f119, %f99, %f100, %f119;
$L__BB0_47:
add.s32 %r101, %r29, 6;
setp.ge.s32 %p29, %r101, %r41;
@%p29 bra $L__BB0_49;
cvt.u16.u32 %rs81, %r89;
// begin inline asm
{ cvt.f32.f16 %f101, %rs81;}
// end inline asm
cvt.u16.u32 %rs82, %r94;
// begin inline asm
{ cvt.f32.f16 %f102, %rs82;}
// end inline asm
fma.rn.ftz.f32 %f119, %f101, %f102, %f119;
$L__BB0_49:
add.s32 %r102, %r29, 7;
setp.ge.s32 %p30, %r102, %r41;
@%p30 bra $L__BB0_51;
mov.b32 {%rs85, %rs83}, %r89;
// begin inline asm
{ cvt.f32.f16 %f103, %rs83;}
// end inline asm
mov.b32 {%rs86, %rs84}, %r94;
// begin inline asm
{ cvt.f32.f16 %f104, %rs84;}
// end inline asm
fma.rn.ftz.f32 %f119, %f103, %f104, %f119;
$L__BB0_51:
mov.b32 %r103, %f119;
mov.u32 %r104, 31;
mov.u32 %r105, 16;
mov.u32 %r106, -1;
shfl.sync.bfly.b32 %r107|%p31, %r103, %r105, %r104, %r106;
mov.b32 %f105, %r107;
add.ftz.f32 %f106, %f119, %f105;
mov.b32 %r108, %f106;
mov.u32 %r109, 8;
shfl.sync.bfly.b32 %r110|%p32, %r108, %r109, %r104, %r106;
mov.b32 %f107, %r110;
add.ftz.f32 %f108, %f106, %f107;
mov.b32 %r111, %f108;
mov.u32 %r112, 4;
shfl.sync.bfly.b32 %r113|%p33, %r111, %r112, %r104, %r106;
mov.b32 %f109, %r113;
add.ftz.f32 %f110, %f108, %f109;
mov.b32 %r114, %f110;
mov.u32 %r115, 2;
shfl.sync.bfly.b32 %r116|%p34, %r114, %r115, %r104, %r106;
mov.b32 %f111, %r116;
add.ftz.f32 %f49, %f110, %f111;
mov.b32 %r117, %f49;
mov.u32 %r118, 1;
shfl.sync.bfly.b32 %r36|%p1, %r117, %r118, %r104, %r106;
setp.ne.s32 %p35, %r3, 0;
@%p35 bra $L__BB0_55;
setp.eq.s64 %p36, %rd8, 0;
mov.b32 %f112, %r36;
add.ftz.f32 %f113, %f49, %f112;
mul.ftz.f32 %f50, %f51, %f113;
cvt.s64.s32 %rd4, %r1;
cvta.to.global.u64 %rd22, %rd9;
mul.wide.s32 %rd23, %r1, 2;
add.s64 %rd5, %rd22, %rd23;
@%p36 bra $L__BB0_54;
cvta.to.global.u64 %rd24, %rd8;
shl.b64 %rd25, %rd4, 1;
add.s64 %rd26, %rd24, %rd25;
ld.global.u16 %rs87, [%rd26];
// begin inline asm
{ cvt.f32.f16 %f114, %rs87;}
// end inline asm
fma.rn.ftz.f32 %f115, %f52, %f114, %f50;
// begin inline asm
{ cvt.rn.f16.f32 %rs88, %f115;}
// end inline asm
st.global.u16 [%rd5], %rs88;
bra.uni $L__BB0_55;
$L__BB0_54:
// begin inline asm
{ cvt.rn.f16.f32 %rs89, %f50;}
// end inline asm
st.global.u16 [%rd5], %rs89;
$L__BB0_55:
ret;
}
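For readability, here is a minimal CUDA-level sketch of what this PTX appears to compute: one warp per output row, eight fp16 elements per thread per step via 16-byte vector loads, fp32 accumulation, a warp butterfly reduction, and an optional beta term taken only when the third pointer parameter is non-null. The kernel name suffix, the parameter names (A, x, yIn, yOut, alpha, beta, rows, iters, cols), and their exact order are assumptions inferred from the parameter loads and control flow above, not the verified source of this kernel.

// Hedged reconstruction, not the original source of gemvKernel_vectorized_fp16.
#include <cuda_fp16.h>

__global__ void gemvKernel_vectorized_fp16_sketch(
    const half* __restrict__ A,    // param_0+0  : rows x cols, row-major (assumed)
    const half* __restrict__ x,    // param_0+8  : length cols (assumed)
    const half* __restrict__ yIn,  // param_0+16 : optional input for the beta term, may be nullptr
    half* __restrict__ yOut,       // param_0+24 : length rows (assumed)
    float alpha, float beta,       // assumed to correspond to %f51 / %f52, loaded before this excerpt
    int rows, int iters, int cols) // rows ~ %r40, iters ~ %r39, cols ~ %r41 (assumed)
{
    // One warp per output row: blockDim.x == warpSize, blockDim.y rows per block.
    int row = blockIdx.x * blockDim.y + threadIdx.y;
    if (row >= rows) return;

    float sum = 0.0f;
    for (int it = 0; it < iters; ++it) {
        int col = (it * warpSize + threadIdx.x) * 8;   // 8 halves per thread per step
        if (col >= cols) continue;

        // 16-byte loads (ld.global.v4.u32 in the PTX) covering 8 consecutive halves.
        uint4 aVec = *reinterpret_cast<const uint4*>(A + row * cols + col);
        uint4 xVec = *reinterpret_cast<const uint4*>(x + col);
        const half* a8 = reinterpret_cast<const half*>(&aVec);
        const half* x8 = reinterpret_cast<const half*>(&xVec);

        #pragma unroll
        for (int k = 0; k < 8; ++k)                    // per-element tail guards, as in the PTX
            if (col + k < cols)
                sum = fmaf(__half2float(a8[k]), __half2float(x8[k]), sum);
    }

    // Warp butterfly reduction (shfl.sync.bfly with offsets 16, 8, 4, 2, 1).
    #pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1)
        sum += __shfl_xor_sync(0xffffffffu, sum, offset);

    if (threadIdx.x == 0) {
        float out = alpha * sum;
        if (yIn != nullptr)                            // beta path taken only when param_0+16 is non-null
            out = fmaf(beta, __half2float(yIn[row]), out);
        yOut[row] = __float2half(out);
    }
}

The __shfl_xor_sync loop mirrors the shfl.sync.bfly sequence after $L__BB0_51, and the per-element guards inside the unrolled inner loop correspond to the predicated branches between $L__BB0_7 and $L__BB0_19 (and their copies in the second unrolled half and the remainder block).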