.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_351eb83f6thrust12placeholders2_4E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_351eb83f6thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_351eb83f6thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_351eb83f6thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_351eb83f6thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_351eb83f6thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_351eb83f6thrust12placeholders3_10E[1];

.visible .entry _Z28dequant_gemv_group128_batch123DequantGemvKernelParams(
    .param .align 8 .b8 _Z28dequant_gemv_group128_batch123DequantGemvKernelParams_param_0[80]
)
{
    .reg .pred %p<15>;
    .reg .b16 %rs<616>;
    .reg .f32 %f<693>;
    .reg .b32 %r<265>;
    .reg .b64 %rd<37>;
    // demoted variable
    .shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi128ELi1EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[640];

    // unpack the 80-byte kernel-parameter struct
    ld.param.v2.u32 {%r39, %r40}, [_Z28dequant_gemv_group128_batch123DequantGemvKernelParams_param_0+48];
    ld.param.v2.u32 {%r41, %r42}, [_Z28dequant_gemv_group128_batch123DequantGemvKernelParams_param_0+56];
    ld.param.v2.f32 {%f22, %f23}, [_Z28dequant_gemv_group128_batch123DequantGemvKernelParams_param_0+64];
    ld.param.v4.u8 {%rs146, %rs147, %rs148, %rs149}, [_Z28dequant_gemv_group128_batch123DequantGemvKernelParams_param_0+72];
    ld.param.u64 %rd15, [_Z28dequant_gemv_group128_batch123DequantGemvKernelParams_param_0+40];
    ld.param.u64 %rd14, [_Z28dequant_gemv_group128_batch123DequantGemvKernelParams_param_0+32];
    ld.param.u64 %rd13, [_Z28dequant_gemv_group128_batch123DequantGemvKernelParams_param_0+24];
    ld.param.u64 %rd12, [_Z28dequant_gemv_group128_batch123DequantGemvKernelParams_param_0+16];
    cvta.to.global.u64 %rd1, %rd12;
    cvta.to.global.u64 %rd2, %rd13;
    cvta.to.global.u64 %rd3, %rd15;
    mov.u32 %r1, %ctaid.x;
    mov.u32 %r261, %tid.y;
    shl.b32 %r43, %r261, 5;
    mov.u32 %r3, %tid.x;
    add.s32 %r4, %r43, %r3;
    shl.b32 %r259, %r4, 2;
    setp.ge.u32 %p1, %r259, %r41;
    mov.f32 %f687, 0f00000000;
    @%p1 bra $L__BB0_14;

    mul.lo.s32 %r8, %r41, %r1;
    shr.u32 %r9, %r3, 2;
    mul.lo.s32 %r10, %r42, %r1;
    setp.eq.s64 %p2, %rd14, 0;
    @%p2 bra $L__BB0_8;

    cvta.to.global.u64 %rd5, %rd14;
    mov.f32 %f687, 0f00000000;
    mov.u32 %r260, %r4;

// main dequantize-and-accumulate loop (variant with a zero-point array)
$L__BB0_3:
    add.s32 %r44, %r259, %r8;
    mul.wide.u32 %rd16, %r44, 4;
    add.s64 %rd17, %rd2, %rd16;
    // one 16-byte load of packed int4 values (32 nibbles)
    ld.global.v4.u32 {%r45, %r46, %r47, %r48}, [%rd17];
    shl.b32 %r49, %r261, 3;
    add.s32 %r18, %r49, %r9;
    add.s32 %r19, %r18, %r10;
    mul.wide.s32 %rd18, %r19, 2;
    add.s64 %rd19, %rd3, %rd18;
    // per-group fp16 scale
    ld.global.u16 %rs154, [%rd19];
    // begin inline asm
    { cvt.f32.f16 %f25, %rs154;}
    // end inline asm
    shl.b32 %r20, %r260, 5;
    setp.ge.s32 %p3, %r20, %r39;
    @%p3 bra $L__BB0_7;

    // load and unpack the 4-bit zero-point (two packed per byte);
    // it is sign-extended when %rs146 == 0 (signed-int4 mode)
    shr.u32 %r50, %r19, 31;
    add.s32 %r51, %r19, %r50;
    shr.s32 %r52, %r51, 1;
    cvt.s64.s32 %rd20, %r52;
    add.s64 %rd21, %rd5, %rd20;
    ld.global.u8 %r53, [%rd21];
    shl.b32 %r54, %r18, 2;
    and.b32 %r55, %r54, 4;
    shr.u32 %r56, %r53, %r55;
    setp.eq.s16 %p4, %rs146, 0;
    cvt.u16.u32 %rs156, %r56;
    shr.u16 %rs157, %rs156, 3;
    and.b16 %rs158, %rs157, 1;
    setp.eq.b16 %p5, %rs158, 1;
    and.pred %p6, %p4, %p5;
    selp.b16 %rs159, -16, 0, %p6;
    and.b16 %rs160, %rs156, 15;
    or.b16 %rs161, %rs159, %rs160;
    cvt.rn.f32.s16 %f3, %rs161;
    mul.wide.s32 %rd22, %r20, 2;
    add.s64 %rd6, %rd1, %rd22;
    // 16 bytes of fp16 input values
    ld.global.v4.u32 {%r57, %r58, %r59, %r60}, [%rd6];
    cvt.u16.u32 %rs2, %r45;
    and.b16 %rs3, %rs2, 15;
    mov.b32 {%rs4, %rs7}, %r57;
    shr.u32 %r65, %r45, 4;
    cvt.u16.u32 %rs5, %r65;
    and.b16 %rs6, %rs5, 15;
    shr.u32 %r66, %r45, 8;
    cvt.u16.u32 %rs8, %r66;
    and.b16
%rs9, %rs8, 15; mov.b32 {%rs10, %rs13}, %r58; shr.u32 %r67, %r45, 12; cvt.u16.u32 %rs11, %r67; and.b16 %rs12, %rs11, 15; shr.u32 %r68, %r45, 16; cvt.u16.u32 %rs14, %r68; and.b16 %rs15, %rs14, 15; mov.b32 {%rs16, %rs19}, %r59; shr.u32 %r69, %r45, 20; cvt.u16.u32 %rs17, %r69; and.b16 %rs18, %rs17, 15; shr.u32 %r70, %r45, 24; cvt.u16.u32 %rs20, %r70; and.b16 %rs21, %rs20, 15; mov.b32 {%rs22, %rs24}, %r60; shr.u32 %r71, %r45, 28; cvt.u16.u32 %rs23, %r71; cvt.u16.u32 %rs25, %r46; and.b16 %rs26, %rs25, 15; shr.u32 %r72, %r46, 4; cvt.u16.u32 %rs27, %r72; and.b16 %rs28, %rs27, 15; shr.u32 %r73, %r46, 8; cvt.u16.u32 %rs29, %r73; and.b16 %rs30, %rs29, 15; shr.u32 %r74, %r46, 12; cvt.u16.u32 %rs31, %r74; and.b16 %rs32, %rs31, 15; shr.u32 %r75, %r46, 16; cvt.u16.u32 %rs33, %r75; and.b16 %rs34, %rs33, 15; shr.u32 %r76, %r46, 20; cvt.u16.u32 %rs35, %r76; and.b16 %rs36, %rs35, 15; shr.u32 %r77, %r46, 24; cvt.u16.u32 %rs37, %r77; and.b16 %rs38, %rs37, 15; shr.u32 %r78, %r46, 28; cvt.u16.u32 %rs39, %r78; cvt.u16.u32 %rs40, %r47; and.b16 %rs41, %rs40, 15; shr.u32 %r79, %r47, 4; cvt.u16.u32 %rs42, %r79; and.b16 %rs43, %rs42, 15; shr.u32 %r80, %r47, 8; cvt.u16.u32 %rs44, %r80; and.b16 %rs45, %rs44, 15; shr.u32 %r81, %r47, 12; cvt.u16.u32 %rs46, %r81; and.b16 %rs47, %rs46, 15; shr.u32 %r82, %r47, 16; cvt.u16.u32 %rs48, %r82; and.b16 %rs49, %rs48, 15; shr.u32 %r83, %r47, 20; cvt.u16.u32 %rs50, %r83; and.b16 %rs51, %rs50, 15; shr.u32 %r84, %r47, 24; cvt.u16.u32 %rs52, %r84; and.b16 %rs53, %rs52, 15; shr.u32 %r85, %r47, 28; cvt.u16.u32 %rs54, %r85; cvt.u16.u32 %rs55, %r48; and.b16 %rs56, %rs55, 15; shr.u32 %r86, %r48, 4; cvt.u16.u32 %rs57, %r86; and.b16 %rs58, %rs57, 15; shr.u32 %r87, %r48, 8; cvt.u16.u32 %rs59, %r87; and.b16 %rs60, %rs59, 15; shr.u32 %r88, %r48, 12; cvt.u16.u32 %rs61, %r88; and.b16 %rs62, %rs61, 15; shr.u32 %r89, %r48, 16; cvt.u16.u32 %rs63, %r89; and.b16 %rs64, %rs63, 15; shr.u32 %r90, %r48, 20; cvt.u16.u32 %rs65, %r90; and.b16 %rs66, %rs65, 15; shr.u32 %r91, %r48, 24; cvt.u16.u32 %rs67, %r91; and.b16 %rs68, %rs67, 15; shr.u32 %r92, %r48, 28; cvt.u16.u32 %rs69, %r92; @%p4 bra $L__BB0_6; cvt.rn.f32.s16 %f58, %rs3; sub.ftz.f32 %f59, %f58, %f3; mul.ftz.f32 %f60, %f25, %f59; // begin inline asm { cvt.f32.f16 %f26, %rs4;} // end inline asm fma.rn.ftz.f32 %f61, %f60, %f26, %f687; cvt.rn.f32.s16 %f62, %rs6; sub.ftz.f32 %f63, %f62, %f3; mul.ftz.f32 %f64, %f25, %f63; // begin inline asm { cvt.f32.f16 %f27, %rs7;} // end inline asm fma.rn.ftz.f32 %f65, %f64, %f27, %f61; cvt.rn.f32.s16 %f66, %rs9; sub.ftz.f32 %f67, %f66, %f3; mul.ftz.f32 %f68, %f25, %f67; // begin inline asm { cvt.f32.f16 %f28, %rs10;} // end inline asm fma.rn.ftz.f32 %f69, %f68, %f28, %f65; cvt.rn.f32.s16 %f70, %rs12; sub.ftz.f32 %f71, %f70, %f3; mul.ftz.f32 %f72, %f25, %f71; // begin inline asm { cvt.f32.f16 %f29, %rs13;} // end inline asm fma.rn.ftz.f32 %f73, %f72, %f29, %f69; cvt.rn.f32.s16 %f74, %rs15; sub.ftz.f32 %f75, %f74, %f3; mul.ftz.f32 %f76, %f25, %f75; // begin inline asm { cvt.f32.f16 %f30, %rs16;} // end inline asm fma.rn.ftz.f32 %f77, %f76, %f30, %f73; cvt.rn.f32.s16 %f78, %rs18; sub.ftz.f32 %f79, %f78, %f3; mul.ftz.f32 %f80, %f25, %f79; // begin inline asm { cvt.f32.f16 %f31, %rs19;} // end inline asm fma.rn.ftz.f32 %f81, %f80, %f31, %f77; cvt.rn.f32.s16 %f82, %rs21; sub.ftz.f32 %f83, %f82, %f3; mul.ftz.f32 %f84, %f25, %f83; // begin inline asm { cvt.f32.f16 %f32, %rs22;} // end inline asm fma.rn.ftz.f32 %f85, %f84, %f32, %f81; cvt.rn.f32.s16 %f86, %rs23; sub.ftz.f32 %f87, %f86, %f3; mul.ftz.f32 %f88, %f25, %f87; // begin 
inline asm { cvt.f32.f16 %f33, %rs24;} // end inline asm fma.rn.ftz.f32 %f89, %f88, %f33, %f85; ld.global.v4.u32 {%r93, %r94, %r95, %r96}, [%rd6+16]; cvt.rn.f32.s16 %f90, %rs26; sub.ftz.f32 %f91, %f90, %f3; mul.ftz.f32 %f92, %f25, %f91; mov.b32 {%rs170, %rs171}, %r93; // begin inline asm { cvt.f32.f16 %f34, %rs170;} // end inline asm fma.rn.ftz.f32 %f93, %f92, %f34, %f89; cvt.rn.f32.s16 %f94, %rs28; sub.ftz.f32 %f95, %f94, %f3; mul.ftz.f32 %f96, %f25, %f95; // begin inline asm { cvt.f32.f16 %f35, %rs171;} // end inline asm fma.rn.ftz.f32 %f97, %f96, %f35, %f93; cvt.rn.f32.s16 %f98, %rs30; sub.ftz.f32 %f99, %f98, %f3; mul.ftz.f32 %f100, %f25, %f99; mov.b32 {%rs172, %rs173}, %r94; // begin inline asm { cvt.f32.f16 %f36, %rs172;} // end inline asm fma.rn.ftz.f32 %f101, %f100, %f36, %f97; cvt.rn.f32.s16 %f102, %rs32; sub.ftz.f32 %f103, %f102, %f3; mul.ftz.f32 %f104, %f25, %f103; // begin inline asm { cvt.f32.f16 %f37, %rs173;} // end inline asm fma.rn.ftz.f32 %f105, %f104, %f37, %f101; cvt.rn.f32.s16 %f106, %rs34; sub.ftz.f32 %f107, %f106, %f3; mul.ftz.f32 %f108, %f25, %f107; mov.b32 {%rs174, %rs175}, %r95; // begin inline asm { cvt.f32.f16 %f38, %rs174;} // end inline asm fma.rn.ftz.f32 %f109, %f108, %f38, %f105; cvt.rn.f32.s16 %f110, %rs36; sub.ftz.f32 %f111, %f110, %f3; mul.ftz.f32 %f112, %f25, %f111; // begin inline asm { cvt.f32.f16 %f39, %rs175;} // end inline asm fma.rn.ftz.f32 %f113, %f112, %f39, %f109; cvt.rn.f32.s16 %f114, %rs38; sub.ftz.f32 %f115, %f114, %f3; mul.ftz.f32 %f116, %f25, %f115; mov.b32 {%rs176, %rs177}, %r96; // begin inline asm { cvt.f32.f16 %f40, %rs176;} // end inline asm fma.rn.ftz.f32 %f117, %f116, %f40, %f113; cvt.rn.f32.s16 %f118, %rs39; sub.ftz.f32 %f119, %f118, %f3; mul.ftz.f32 %f120, %f25, %f119; // begin inline asm { cvt.f32.f16 %f41, %rs177;} // end inline asm fma.rn.ftz.f32 %f121, %f120, %f41, %f117; ld.global.v4.u32 {%r101, %r102, %r103, %r104}, [%rd6+32]; cvt.rn.f32.s16 %f122, %rs41; sub.ftz.f32 %f123, %f122, %f3; mul.ftz.f32 %f124, %f25, %f123; mov.b32 {%rs178, %rs179}, %r101; // begin inline asm { cvt.f32.f16 %f42, %rs178;} // end inline asm fma.rn.ftz.f32 %f125, %f124, %f42, %f121; cvt.rn.f32.s16 %f126, %rs43; sub.ftz.f32 %f127, %f126, %f3; mul.ftz.f32 %f128, %f25, %f127; // begin inline asm { cvt.f32.f16 %f43, %rs179;} // end inline asm fma.rn.ftz.f32 %f129, %f128, %f43, %f125; cvt.rn.f32.s16 %f130, %rs45; sub.ftz.f32 %f131, %f130, %f3; mul.ftz.f32 %f132, %f25, %f131; mov.b32 {%rs180, %rs181}, %r102; // begin inline asm { cvt.f32.f16 %f44, %rs180;} // end inline asm fma.rn.ftz.f32 %f133, %f132, %f44, %f129; cvt.rn.f32.s16 %f134, %rs47; sub.ftz.f32 %f135, %f134, %f3; mul.ftz.f32 %f136, %f25, %f135; // begin inline asm { cvt.f32.f16 %f45, %rs181;} // end inline asm fma.rn.ftz.f32 %f137, %f136, %f45, %f133; cvt.rn.f32.s16 %f138, %rs49; sub.ftz.f32 %f139, %f138, %f3; mul.ftz.f32 %f140, %f25, %f139; mov.b32 {%rs182, %rs183}, %r103; // begin inline asm { cvt.f32.f16 %f46, %rs182;} // end inline asm fma.rn.ftz.f32 %f141, %f140, %f46, %f137; cvt.rn.f32.s16 %f142, %rs51; sub.ftz.f32 %f143, %f142, %f3; mul.ftz.f32 %f144, %f25, %f143; // begin inline asm { cvt.f32.f16 %f47, %rs183;} // end inline asm fma.rn.ftz.f32 %f145, %f144, %f47, %f141; cvt.rn.f32.s16 %f146, %rs53; sub.ftz.f32 %f147, %f146, %f3; mul.ftz.f32 %f148, %f25, %f147; mov.b32 {%rs184, %rs185}, %r104; // begin inline asm { cvt.f32.f16 %f48, %rs184;} // end inline asm fma.rn.ftz.f32 %f149, %f148, %f48, %f145; cvt.rn.f32.s16 %f150, %rs54; sub.ftz.f32 %f151, %f150, %f3; mul.ftz.f32 %f152, %f25, %f151; 
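// The fully unrolled pattern above repeats once per nibble of the 16-byte
// weight load: convert the unpacked 4-bit value to f32 (cvt.rn.f32.s16),
// subtract the group zero-point (%f3), multiply by the group scale (%f25),
// convert the matching fp16 input value (cvt.f32.f16), and accumulate into
// %f687 with fma.rn.ftz.f32.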
// begin inline asm { cvt.f32.f16 %f49, %rs185;} // end inline asm fma.rn.ftz.f32 %f153, %f152, %f49, %f149; ld.global.v4.u32 {%r109, %r110, %r111, %r112}, [%rd6+48]; cvt.rn.f32.s16 %f154, %rs56; sub.ftz.f32 %f155, %f154, %f3; mul.ftz.f32 %f156, %f25, %f155; mov.b32 {%rs186, %rs187}, %r109; // begin inline asm { cvt.f32.f16 %f50, %rs186;} // end inline asm fma.rn.ftz.f32 %f157, %f156, %f50, %f153; cvt.rn.f32.s16 %f158, %rs58; sub.ftz.f32 %f159, %f158, %f3; mul.ftz.f32 %f160, %f25, %f159; // begin inline asm { cvt.f32.f16 %f51, %rs187;} // end inline asm fma.rn.ftz.f32 %f161, %f160, %f51, %f157; cvt.rn.f32.s16 %f162, %rs60; sub.ftz.f32 %f163, %f162, %f3; mul.ftz.f32 %f164, %f25, %f163; mov.b32 {%rs188, %rs189}, %r110; // begin inline asm { cvt.f32.f16 %f52, %rs188;} // end inline asm fma.rn.ftz.f32 %f165, %f164, %f52, %f161; cvt.rn.f32.s16 %f166, %rs62; sub.ftz.f32 %f167, %f166, %f3; mul.ftz.f32 %f168, %f25, %f167; // begin inline asm { cvt.f32.f16 %f53, %rs189;} // end inline asm fma.rn.ftz.f32 %f169, %f168, %f53, %f165; cvt.rn.f32.s16 %f170, %rs64; sub.ftz.f32 %f171, %f170, %f3; mul.ftz.f32 %f172, %f25, %f171; mov.b32 {%rs190, %rs191}, %r111; // begin inline asm { cvt.f32.f16 %f54, %rs190;} // end inline asm fma.rn.ftz.f32 %f173, %f172, %f54, %f169; cvt.rn.f32.s16 %f174, %rs66; sub.ftz.f32 %f175, %f174, %f3; mul.ftz.f32 %f176, %f25, %f175; // begin inline asm { cvt.f32.f16 %f55, %rs191;} // end inline asm fma.rn.ftz.f32 %f177, %f176, %f55, %f173; cvt.rn.f32.s16 %f178, %rs68; sub.ftz.f32 %f179, %f178, %f3; mul.ftz.f32 %f180, %f25, %f179; mov.b32 {%rs192, %rs193}, %r112; // begin inline asm { cvt.f32.f16 %f56, %rs192;} // end inline asm fma.rn.ftz.f32 %f181, %f180, %f56, %f177; cvt.rn.f32.s16 %f182, %rs69; sub.ftz.f32 %f183, %f182, %f3; mul.ftz.f32 %f184, %f25, %f183; // begin inline asm { cvt.f32.f16 %f57, %rs193;} // end inline asm fma.rn.ftz.f32 %f687, %f184, %f57, %f181; bra.uni $L__BB0_7; $L__BB0_6: shl.b16 %rs226, %rs2, 4; cvt.s16.s8 %rs227, %rs226; shr.s16 %rs228, %rs227, 7; and.b16 %rs229, %rs228, -16; or.b16 %rs230, %rs229, %rs3; cvt.rn.f32.s16 %f217, %rs230; sub.ftz.f32 %f218, %f217, %f3; mul.ftz.f32 %f219, %f25, %f218; // begin inline asm { cvt.f32.f16 %f185, %rs4;} // end inline asm fma.rn.ftz.f32 %f220, %f219, %f185, %f687; shl.b16 %rs231, %rs5, 4; cvt.s16.s8 %rs232, %rs231; shr.s16 %rs233, %rs232, 7; and.b16 %rs234, %rs233, -16; or.b16 %rs235, %rs234, %rs6; cvt.rn.f32.s16 %f221, %rs235; sub.ftz.f32 %f222, %f221, %f3; mul.ftz.f32 %f223, %f25, %f222; // begin inline asm { cvt.f32.f16 %f186, %rs7;} // end inline asm fma.rn.ftz.f32 %f224, %f223, %f186, %f220; shl.b16 %rs236, %rs8, 4; cvt.s16.s8 %rs237, %rs236; shr.s16 %rs238, %rs237, 7; and.b16 %rs239, %rs238, -16; or.b16 %rs240, %rs239, %rs9; cvt.rn.f32.s16 %f225, %rs240; sub.ftz.f32 %f226, %f225, %f3; mul.ftz.f32 %f227, %f25, %f226; // begin inline asm { cvt.f32.f16 %f187, %rs10;} // end inline asm fma.rn.ftz.f32 %f228, %f227, %f187, %f224; shl.b16 %rs241, %rs11, 4; cvt.s16.s8 %rs242, %rs241; shr.s16 %rs243, %rs242, 7; and.b16 %rs244, %rs243, -16; or.b16 %rs245, %rs244, %rs12; cvt.rn.f32.s16 %f229, %rs245; sub.ftz.f32 %f230, %f229, %f3; mul.ftz.f32 %f231, %f25, %f230; // begin inline asm { cvt.f32.f16 %f188, %rs13;} // end inline asm fma.rn.ftz.f32 %f232, %f231, %f188, %f228; shl.b16 %rs246, %rs14, 4; cvt.s16.s8 %rs247, %rs246; shr.s16 %rs248, %rs247, 7; and.b16 %rs249, %rs248, -16; or.b16 %rs250, %rs249, %rs15; cvt.rn.f32.s16 %f233, %rs250; sub.ftz.f32 %f234, %f233, %f3; mul.ftz.f32 %f235, %f25, %f234; // begin inline asm { 
cvt.f32.f16 %f189, %rs16;} // end inline asm fma.rn.ftz.f32 %f236, %f235, %f189, %f232; shl.b16 %rs251, %rs17, 4; cvt.s16.s8 %rs252, %rs251; shr.s16 %rs253, %rs252, 7; and.b16 %rs254, %rs253, -16; or.b16 %rs255, %rs254, %rs18; cvt.rn.f32.s16 %f237, %rs255; sub.ftz.f32 %f238, %f237, %f3; mul.ftz.f32 %f239, %f25, %f238; // begin inline asm { cvt.f32.f16 %f190, %rs19;} // end inline asm fma.rn.ftz.f32 %f240, %f239, %f190, %f236; shl.b16 %rs256, %rs20, 4; cvt.s16.s8 %rs257, %rs256; shr.s16 %rs258, %rs257, 7; and.b16 %rs259, %rs258, -16; or.b16 %rs260, %rs259, %rs21; cvt.rn.f32.s16 %f241, %rs260; sub.ftz.f32 %f242, %f241, %f3; mul.ftz.f32 %f243, %f25, %f242; // begin inline asm { cvt.f32.f16 %f191, %rs22;} // end inline asm fma.rn.ftz.f32 %f244, %f243, %f191, %f240; shl.b16 %rs261, %rs23, 4; cvt.s16.s8 %rs262, %rs261; shr.s16 %rs263, %rs262, 7; and.b16 %rs264, %rs263, -16; or.b16 %rs265, %rs264, %rs23; cvt.rn.f32.s16 %f245, %rs265; sub.ftz.f32 %f246, %f245, %f3; mul.ftz.f32 %f247, %f25, %f246; // begin inline asm { cvt.f32.f16 %f192, %rs24;} // end inline asm fma.rn.ftz.f32 %f248, %f247, %f192, %f244; ld.global.v4.u32 {%r117, %r118, %r119, %r120}, [%rd6+16]; shl.b16 %rs266, %rs25, 4; cvt.s16.s8 %rs267, %rs266; shr.s16 %rs268, %rs267, 7; and.b16 %rs269, %rs268, -16; or.b16 %rs270, %rs269, %rs26; cvt.rn.f32.s16 %f249, %rs270; sub.ftz.f32 %f250, %f249, %f3; mul.ftz.f32 %f251, %f25, %f250; mov.b32 {%rs202, %rs203}, %r117; // begin inline asm { cvt.f32.f16 %f193, %rs202;} // end inline asm fma.rn.ftz.f32 %f252, %f251, %f193, %f248; shl.b16 %rs271, %rs27, 4; cvt.s16.s8 %rs272, %rs271; shr.s16 %rs273, %rs272, 7; and.b16 %rs274, %rs273, -16; or.b16 %rs275, %rs274, %rs28; cvt.rn.f32.s16 %f253, %rs275; sub.ftz.f32 %f254, %f253, %f3; mul.ftz.f32 %f255, %f25, %f254; // begin inline asm { cvt.f32.f16 %f194, %rs203;} // end inline asm fma.rn.ftz.f32 %f256, %f255, %f194, %f252; shl.b16 %rs276, %rs29, 4; cvt.s16.s8 %rs277, %rs276; shr.s16 %rs278, %rs277, 7; and.b16 %rs279, %rs278, -16; or.b16 %rs280, %rs279, %rs30; cvt.rn.f32.s16 %f257, %rs280; sub.ftz.f32 %f258, %f257, %f3; mul.ftz.f32 %f259, %f25, %f258; mov.b32 {%rs204, %rs205}, %r118; // begin inline asm { cvt.f32.f16 %f195, %rs204;} // end inline asm fma.rn.ftz.f32 %f260, %f259, %f195, %f256; shl.b16 %rs281, %rs31, 4; cvt.s16.s8 %rs282, %rs281; shr.s16 %rs283, %rs282, 7; and.b16 %rs284, %rs283, -16; or.b16 %rs285, %rs284, %rs32; cvt.rn.f32.s16 %f261, %rs285; sub.ftz.f32 %f262, %f261, %f3; mul.ftz.f32 %f263, %f25, %f262; // begin inline asm { cvt.f32.f16 %f196, %rs205;} // end inline asm fma.rn.ftz.f32 %f264, %f263, %f196, %f260; shl.b16 %rs286, %rs33, 4; cvt.s16.s8 %rs287, %rs286; shr.s16 %rs288, %rs287, 7; and.b16 %rs289, %rs288, -16; or.b16 %rs290, %rs289, %rs34; cvt.rn.f32.s16 %f265, %rs290; sub.ftz.f32 %f266, %f265, %f3; mul.ftz.f32 %f267, %f25, %f266; mov.b32 {%rs206, %rs207}, %r119; // begin inline asm { cvt.f32.f16 %f197, %rs206;} // end inline asm fma.rn.ftz.f32 %f268, %f267, %f197, %f264; shl.b16 %rs291, %rs35, 4; cvt.s16.s8 %rs292, %rs291; shr.s16 %rs293, %rs292, 7; and.b16 %rs294, %rs293, -16; or.b16 %rs295, %rs294, %rs36; cvt.rn.f32.s16 %f269, %rs295; sub.ftz.f32 %f270, %f269, %f3; mul.ftz.f32 %f271, %f25, %f270; // begin inline asm { cvt.f32.f16 %f198, %rs207;} // end inline asm fma.rn.ftz.f32 %f272, %f271, %f198, %f268; shl.b16 %rs296, %rs37, 4; cvt.s16.s8 %rs297, %rs296; shr.s16 %rs298, %rs297, 7; and.b16 %rs299, %rs298, -16; or.b16 %rs300, %rs299, %rs38; cvt.rn.f32.s16 %f273, %rs300; sub.ftz.f32 %f274, %f273, %f3; mul.ftz.f32 %f275, %f25, 
%f274; mov.b32 {%rs208, %rs209}, %r120; // begin inline asm { cvt.f32.f16 %f199, %rs208;} // end inline asm fma.rn.ftz.f32 %f276, %f275, %f199, %f272; shl.b16 %rs301, %rs39, 4; cvt.s16.s8 %rs302, %rs301; shr.s16 %rs303, %rs302, 7; and.b16 %rs304, %rs303, -16; or.b16 %rs305, %rs304, %rs39; cvt.rn.f32.s16 %f277, %rs305; sub.ftz.f32 %f278, %f277, %f3; mul.ftz.f32 %f279, %f25, %f278; // begin inline asm { cvt.f32.f16 %f200, %rs209;} // end inline asm fma.rn.ftz.f32 %f280, %f279, %f200, %f276; ld.global.v4.u32 {%r125, %r126, %r127, %r128}, [%rd6+32]; shl.b16 %rs306, %rs40, 4; cvt.s16.s8 %rs307, %rs306; shr.s16 %rs308, %rs307, 7; and.b16 %rs309, %rs308, -16; or.b16 %rs310, %rs309, %rs41; cvt.rn.f32.s16 %f281, %rs310; sub.ftz.f32 %f282, %f281, %f3; mul.ftz.f32 %f283, %f25, %f282; mov.b32 {%rs210, %rs211}, %r125; // begin inline asm { cvt.f32.f16 %f201, %rs210;} // end inline asm fma.rn.ftz.f32 %f284, %f283, %f201, %f280; shl.b16 %rs311, %rs42, 4; cvt.s16.s8 %rs312, %rs311; shr.s16 %rs313, %rs312, 7; and.b16 %rs314, %rs313, -16; or.b16 %rs315, %rs314, %rs43; cvt.rn.f32.s16 %f285, %rs315; sub.ftz.f32 %f286, %f285, %f3; mul.ftz.f32 %f287, %f25, %f286; // begin inline asm { cvt.f32.f16 %f202, %rs211;} // end inline asm fma.rn.ftz.f32 %f288, %f287, %f202, %f284; shl.b16 %rs316, %rs44, 4; cvt.s16.s8 %rs317, %rs316; shr.s16 %rs318, %rs317, 7; and.b16 %rs319, %rs318, -16; or.b16 %rs320, %rs319, %rs45; cvt.rn.f32.s16 %f289, %rs320; sub.ftz.f32 %f290, %f289, %f3; mul.ftz.f32 %f291, %f25, %f290; mov.b32 {%rs212, %rs213}, %r126; // begin inline asm { cvt.f32.f16 %f203, %rs212;} // end inline asm fma.rn.ftz.f32 %f292, %f291, %f203, %f288; shl.b16 %rs321, %rs46, 4; cvt.s16.s8 %rs322, %rs321; shr.s16 %rs323, %rs322, 7; and.b16 %rs324, %rs323, -16; or.b16 %rs325, %rs324, %rs47; cvt.rn.f32.s16 %f293, %rs325; sub.ftz.f32 %f294, %f293, %f3; mul.ftz.f32 %f295, %f25, %f294; // begin inline asm { cvt.f32.f16 %f204, %rs213;} // end inline asm fma.rn.ftz.f32 %f296, %f295, %f204, %f292; shl.b16 %rs326, %rs48, 4; cvt.s16.s8 %rs327, %rs326; shr.s16 %rs328, %rs327, 7; and.b16 %rs329, %rs328, -16; or.b16 %rs330, %rs329, %rs49; cvt.rn.f32.s16 %f297, %rs330; sub.ftz.f32 %f298, %f297, %f3; mul.ftz.f32 %f299, %f25, %f298; mov.b32 {%rs214, %rs215}, %r127; // begin inline asm { cvt.f32.f16 %f205, %rs214;} // end inline asm fma.rn.ftz.f32 %f300, %f299, %f205, %f296; shl.b16 %rs331, %rs50, 4; cvt.s16.s8 %rs332, %rs331; shr.s16 %rs333, %rs332, 7; and.b16 %rs334, %rs333, -16; or.b16 %rs335, %rs334, %rs51; cvt.rn.f32.s16 %f301, %rs335; sub.ftz.f32 %f302, %f301, %f3; mul.ftz.f32 %f303, %f25, %f302; // begin inline asm { cvt.f32.f16 %f206, %rs215;} // end inline asm fma.rn.ftz.f32 %f304, %f303, %f206, %f300; shl.b16 %rs336, %rs52, 4; cvt.s16.s8 %rs337, %rs336; shr.s16 %rs338, %rs337, 7; and.b16 %rs339, %rs338, -16; or.b16 %rs340, %rs339, %rs53; cvt.rn.f32.s16 %f305, %rs340; sub.ftz.f32 %f306, %f305, %f3; mul.ftz.f32 %f307, %f25, %f306; mov.b32 {%rs216, %rs217}, %r128; // begin inline asm { cvt.f32.f16 %f207, %rs216;} // end inline asm fma.rn.ftz.f32 %f308, %f307, %f207, %f304; shl.b16 %rs341, %rs54, 4; cvt.s16.s8 %rs342, %rs341; shr.s16 %rs343, %rs342, 7; and.b16 %rs344, %rs343, -16; or.b16 %rs345, %rs344, %rs54; cvt.rn.f32.s16 %f309, %rs345; sub.ftz.f32 %f310, %f309, %f3; mul.ftz.f32 %f311, %f25, %f310; // begin inline asm { cvt.f32.f16 %f208, %rs217;} // end inline asm fma.rn.ftz.f32 %f312, %f311, %f208, %f308; ld.global.v4.u32 {%r133, %r134, %r135, %r136}, [%rd6+48]; shl.b16 %rs346, %rs55, 4; cvt.s16.s8 %rs347, %rs346; shr.s16 %rs348, 
%rs347, 7; and.b16 %rs349, %rs348, -16; or.b16 %rs350, %rs349, %rs56; cvt.rn.f32.s16 %f313, %rs350; sub.ftz.f32 %f314, %f313, %f3; mul.ftz.f32 %f315, %f25, %f314; mov.b32 {%rs218, %rs219}, %r133; // begin inline asm { cvt.f32.f16 %f209, %rs218;} // end inline asm fma.rn.ftz.f32 %f316, %f315, %f209, %f312; shl.b16 %rs351, %rs57, 4; cvt.s16.s8 %rs352, %rs351; shr.s16 %rs353, %rs352, 7; and.b16 %rs354, %rs353, -16; or.b16 %rs355, %rs354, %rs58; cvt.rn.f32.s16 %f317, %rs355; sub.ftz.f32 %f318, %f317, %f3; mul.ftz.f32 %f319, %f25, %f318; // begin inline asm { cvt.f32.f16 %f210, %rs219;} // end inline asm fma.rn.ftz.f32 %f320, %f319, %f210, %f316; shl.b16 %rs356, %rs59, 4; cvt.s16.s8 %rs357, %rs356; shr.s16 %rs358, %rs357, 7; and.b16 %rs359, %rs358, -16; or.b16 %rs360, %rs359, %rs60; cvt.rn.f32.s16 %f321, %rs360; sub.ftz.f32 %f322, %f321, %f3; mul.ftz.f32 %f323, %f25, %f322; mov.b32 {%rs220, %rs221}, %r134; // begin inline asm { cvt.f32.f16 %f211, %rs220;} // end inline asm fma.rn.ftz.f32 %f324, %f323, %f211, %f320; shl.b16 %rs361, %rs61, 4; cvt.s16.s8 %rs362, %rs361; shr.s16 %rs363, %rs362, 7; and.b16 %rs364, %rs363, -16; or.b16 %rs365, %rs364, %rs62; cvt.rn.f32.s16 %f325, %rs365; sub.ftz.f32 %f326, %f325, %f3; mul.ftz.f32 %f327, %f25, %f326; // begin inline asm { cvt.f32.f16 %f212, %rs221;} // end inline asm fma.rn.ftz.f32 %f328, %f327, %f212, %f324; shl.b16 %rs366, %rs63, 4; cvt.s16.s8 %rs367, %rs366; shr.s16 %rs368, %rs367, 7; and.b16 %rs369, %rs368, -16; or.b16 %rs370, %rs369, %rs64; cvt.rn.f32.s16 %f329, %rs370; sub.ftz.f32 %f330, %f329, %f3; mul.ftz.f32 %f331, %f25, %f330; mov.b32 {%rs222, %rs223}, %r135; // begin inline asm { cvt.f32.f16 %f213, %rs222;} // end inline asm fma.rn.ftz.f32 %f332, %f331, %f213, %f328; shl.b16 %rs371, %rs65, 4; cvt.s16.s8 %rs372, %rs371; shr.s16 %rs373, %rs372, 7; and.b16 %rs374, %rs373, -16; or.b16 %rs375, %rs374, %rs66; cvt.rn.f32.s16 %f333, %rs375; sub.ftz.f32 %f334, %f333, %f3; mul.ftz.f32 %f335, %f25, %f334; // begin inline asm { cvt.f32.f16 %f214, %rs223;} // end inline asm fma.rn.ftz.f32 %f336, %f335, %f214, %f332; shl.b16 %rs376, %rs67, 4; cvt.s16.s8 %rs377, %rs376; shr.s16 %rs378, %rs377, 7; and.b16 %rs379, %rs378, -16; or.b16 %rs380, %rs379, %rs68; cvt.rn.f32.s16 %f337, %rs380; sub.ftz.f32 %f338, %f337, %f3; mul.ftz.f32 %f339, %f25, %f338; mov.b32 {%rs224, %rs225}, %r136; // begin inline asm { cvt.f32.f16 %f215, %rs224;} // end inline asm fma.rn.ftz.f32 %f340, %f339, %f215, %f336; shl.b16 %rs381, %rs69, 4; cvt.s16.s8 %rs382, %rs381; shr.s16 %rs383, %rs382, 7; and.b16 %rs384, %rs383, -16; or.b16 %rs385, %rs384, %rs69; cvt.rn.f32.s16 %f341, %rs385; sub.ftz.f32 %f342, %f341, %f3; mul.ftz.f32 %f343, %f25, %f342; // begin inline asm { cvt.f32.f16 %f216, %rs225;} // end inline asm fma.rn.ftz.f32 %f687, %f343, %f216, %f340; $L__BB0_7: add.s32 %r261, %r261, 4; shl.b32 %r141, %r261, 5; add.s32 %r260, %r141, %r3; shl.b32 %r259, %r260, 2; setp.lt.u32 %p7, %r259, %r41; @%p7 bra $L__BB0_3; bra.uni $L__BB0_14; $L__BB0_8: shl.b16 %rs386, %rs146, 3; cvt.s16.s8 %rs387, %rs386; cvt.rn.f32.s16 %f7, %rs387; mov.f32 %f687, 0f00000000; mov.u32 %r263, %r4; $L__BB0_9: add.s32 %r142, %r259, %r8; mul.wide.u32 %rd23, %r142, 4; add.s64 %rd24, %rd2, %rd23; ld.global.v4.u32 {%r143, %r144, %r145, %r146}, [%rd24]; shl.b32 %r147, %r261, 3; add.s32 %r148, %r147, %r9; add.s32 %r149, %r148, %r10; mul.wide.s32 %rd25, %r149, 2; add.s64 %rd26, %rd3, %rd25; ld.global.u16 %rs388, [%rd26]; // begin inline asm { cvt.f32.f16 %f345, %rs388;} // end inline asm shl.b32 %r31, %r263, 5; setp.ge.s32 
%p8, %r31, %r39; @%p8 bra $L__BB0_13; setp.eq.s16 %p9, %rs146, 0; mul.wide.s32 %rd27, %r31, 2; add.s64 %rd7, %rd1, %rd27; ld.global.v4.u32 {%r150, %r151, %r152, %r153}, [%rd7]; cvt.u16.u32 %rs70, %r143; and.b16 %rs71, %rs70, 15; mov.b32 {%rs72, %rs75}, %r150; shr.u32 %r158, %r143, 4; cvt.u16.u32 %rs73, %r158; and.b16 %rs74, %rs73, 15; shr.u32 %r159, %r143, 8; cvt.u16.u32 %rs76, %r159; and.b16 %rs77, %rs76, 15; mov.b32 {%rs78, %rs81}, %r151; shr.u32 %r160, %r143, 12; cvt.u16.u32 %rs79, %r160; and.b16 %rs80, %rs79, 15; shr.u32 %r161, %r143, 16; cvt.u16.u32 %rs82, %r161; and.b16 %rs83, %rs82, 15; mov.b32 {%rs84, %rs87}, %r152; shr.u32 %r162, %r143, 20; cvt.u16.u32 %rs85, %r162; and.b16 %rs86, %rs85, 15; shr.u32 %r163, %r143, 24; cvt.u16.u32 %rs88, %r163; and.b16 %rs89, %rs88, 15; mov.b32 {%rs90, %rs92}, %r153; shr.u32 %r164, %r143, 28; cvt.u16.u32 %rs91, %r164; cvt.u16.u32 %rs93, %r144; and.b16 %rs94, %rs93, 15; shr.u32 %r165, %r144, 4; cvt.u16.u32 %rs95, %r165; and.b16 %rs96, %rs95, 15; shr.u32 %r166, %r144, 8; cvt.u16.u32 %rs97, %r166; and.b16 %rs98, %rs97, 15; shr.u32 %r167, %r144, 12; cvt.u16.u32 %rs99, %r167; and.b16 %rs100, %rs99, 15; shr.u32 %r168, %r144, 16; cvt.u16.u32 %rs101, %r168; and.b16 %rs102, %rs101, 15; shr.u32 %r169, %r144, 20; cvt.u16.u32 %rs103, %r169; and.b16 %rs104, %rs103, 15; shr.u32 %r170, %r144, 24; cvt.u16.u32 %rs105, %r170; and.b16 %rs106, %rs105, 15; shr.u32 %r171, %r144, 28; cvt.u16.u32 %rs107, %r171; cvt.u16.u32 %rs108, %r145; and.b16 %rs109, %rs108, 15; shr.u32 %r172, %r145, 4; cvt.u16.u32 %rs110, %r172; and.b16 %rs111, %rs110, 15; shr.u32 %r173, %r145, 8; cvt.u16.u32 %rs112, %r173; and.b16 %rs113, %rs112, 15; shr.u32 %r174, %r145, 12; cvt.u16.u32 %rs114, %r174; and.b16 %rs115, %rs114, 15; shr.u32 %r175, %r145, 16; cvt.u16.u32 %rs116, %r175; and.b16 %rs117, %rs116, 15; shr.u32 %r176, %r145, 20; cvt.u16.u32 %rs118, %r176; and.b16 %rs119, %rs118, 15; shr.u32 %r177, %r145, 24; cvt.u16.u32 %rs120, %r177; and.b16 %rs121, %rs120, 15; shr.u32 %r178, %r145, 28; cvt.u16.u32 %rs122, %r178; cvt.u16.u32 %rs123, %r146; and.b16 %rs124, %rs123, 15; shr.u32 %r179, %r146, 4; cvt.u16.u32 %rs125, %r179; and.b16 %rs126, %rs125, 15; shr.u32 %r180, %r146, 8; cvt.u16.u32 %rs127, %r180; and.b16 %rs128, %rs127, 15; shr.u32 %r181, %r146, 12; cvt.u16.u32 %rs129, %r181; and.b16 %rs130, %rs129, 15; shr.u32 %r182, %r146, 16; cvt.u16.u32 %rs131, %r182; and.b16 %rs132, %rs131, 15; shr.u32 %r183, %r146, 20; cvt.u16.u32 %rs133, %r183; and.b16 %rs134, %rs133, 15; shr.u32 %r184, %r146, 24; cvt.u16.u32 %rs135, %r184; and.b16 %rs136, %rs135, 15; shr.u32 %r185, %r146, 28; cvt.u16.u32 %rs137, %r185; @%p9 bra $L__BB0_12; cvt.rn.f32.s16 %f378, %rs71; sub.ftz.f32 %f379, %f378, %f7; mul.ftz.f32 %f380, %f345, %f379; // begin inline asm { cvt.f32.f16 %f346, %rs72;} // end inline asm fma.rn.ftz.f32 %f381, %f380, %f346, %f687; cvt.rn.f32.s16 %f382, %rs74; sub.ftz.f32 %f383, %f382, %f7; mul.ftz.f32 %f384, %f345, %f383; // begin inline asm { cvt.f32.f16 %f347, %rs75;} // end inline asm fma.rn.ftz.f32 %f385, %f384, %f347, %f381; cvt.rn.f32.s16 %f386, %rs77; sub.ftz.f32 %f387, %f386, %f7; mul.ftz.f32 %f388, %f345, %f387; // begin inline asm { cvt.f32.f16 %f348, %rs78;} // end inline asm fma.rn.ftz.f32 %f389, %f388, %f348, %f385; cvt.rn.f32.s16 %f390, %rs80; sub.ftz.f32 %f391, %f390, %f7; mul.ftz.f32 %f392, %f345, %f391; // begin inline asm { cvt.f32.f16 %f349, %rs81;} // end inline asm fma.rn.ftz.f32 %f393, %f392, %f349, %f389; cvt.rn.f32.s16 %f394, %rs83; sub.ftz.f32 %f395, %f394, %f7; mul.ftz.f32 %f396, %f345, 
%f395; // begin inline asm { cvt.f32.f16 %f350, %rs84;} // end inline asm fma.rn.ftz.f32 %f397, %f396, %f350, %f393; cvt.rn.f32.s16 %f398, %rs86; sub.ftz.f32 %f399, %f398, %f7; mul.ftz.f32 %f400, %f345, %f399; // begin inline asm { cvt.f32.f16 %f351, %rs87;} // end inline asm fma.rn.ftz.f32 %f401, %f400, %f351, %f397; cvt.rn.f32.s16 %f402, %rs89; sub.ftz.f32 %f403, %f402, %f7; mul.ftz.f32 %f404, %f345, %f403; // begin inline asm { cvt.f32.f16 %f352, %rs90;} // end inline asm fma.rn.ftz.f32 %f405, %f404, %f352, %f401; cvt.rn.f32.s16 %f406, %rs91; sub.ftz.f32 %f407, %f406, %f7; mul.ftz.f32 %f408, %f345, %f407; // begin inline asm { cvt.f32.f16 %f353, %rs92;} // end inline asm fma.rn.ftz.f32 %f409, %f408, %f353, %f405; ld.global.v4.u32 {%r186, %r187, %r188, %r189}, [%rd7+16]; cvt.rn.f32.s16 %f410, %rs94; sub.ftz.f32 %f411, %f410, %f7; mul.ftz.f32 %f412, %f345, %f411; mov.b32 {%rs398, %rs399}, %r186; // begin inline asm { cvt.f32.f16 %f354, %rs398;} // end inline asm fma.rn.ftz.f32 %f413, %f412, %f354, %f409; cvt.rn.f32.s16 %f414, %rs96; sub.ftz.f32 %f415, %f414, %f7; mul.ftz.f32 %f416, %f345, %f415; // begin inline asm { cvt.f32.f16 %f355, %rs399;} // end inline asm fma.rn.ftz.f32 %f417, %f416, %f355, %f413; cvt.rn.f32.s16 %f418, %rs98; sub.ftz.f32 %f419, %f418, %f7; mul.ftz.f32 %f420, %f345, %f419; mov.b32 {%rs400, %rs401}, %r187; // begin inline asm { cvt.f32.f16 %f356, %rs400;} // end inline asm fma.rn.ftz.f32 %f421, %f420, %f356, %f417; cvt.rn.f32.s16 %f422, %rs100; sub.ftz.f32 %f423, %f422, %f7; mul.ftz.f32 %f424, %f345, %f423; // begin inline asm { cvt.f32.f16 %f357, %rs401;} // end inline asm fma.rn.ftz.f32 %f425, %f424, %f357, %f421; cvt.rn.f32.s16 %f426, %rs102; sub.ftz.f32 %f427, %f426, %f7; mul.ftz.f32 %f428, %f345, %f427; mov.b32 {%rs402, %rs403}, %r188; // begin inline asm { cvt.f32.f16 %f358, %rs402;} // end inline asm fma.rn.ftz.f32 %f429, %f428, %f358, %f425; cvt.rn.f32.s16 %f430, %rs104; sub.ftz.f32 %f431, %f430, %f7; mul.ftz.f32 %f432, %f345, %f431; // begin inline asm { cvt.f32.f16 %f359, %rs403;} // end inline asm fma.rn.ftz.f32 %f433, %f432, %f359, %f429; cvt.rn.f32.s16 %f434, %rs106; sub.ftz.f32 %f435, %f434, %f7; mul.ftz.f32 %f436, %f345, %f435; mov.b32 {%rs404, %rs405}, %r189; // begin inline asm { cvt.f32.f16 %f360, %rs404;} // end inline asm fma.rn.ftz.f32 %f437, %f436, %f360, %f433; cvt.rn.f32.s16 %f438, %rs107; sub.ftz.f32 %f439, %f438, %f7; mul.ftz.f32 %f440, %f345, %f439; // begin inline asm { cvt.f32.f16 %f361, %rs405;} // end inline asm fma.rn.ftz.f32 %f441, %f440, %f361, %f437; ld.global.v4.u32 {%r194, %r195, %r196, %r197}, [%rd7+32]; cvt.rn.f32.s16 %f442, %rs109; sub.ftz.f32 %f443, %f442, %f7; mul.ftz.f32 %f444, %f345, %f443; mov.b32 {%rs406, %rs407}, %r194; // begin inline asm { cvt.f32.f16 %f362, %rs406;} // end inline asm fma.rn.ftz.f32 %f445, %f444, %f362, %f441; cvt.rn.f32.s16 %f446, %rs111; sub.ftz.f32 %f447, %f446, %f7; mul.ftz.f32 %f448, %f345, %f447; // begin inline asm { cvt.f32.f16 %f363, %rs407;} // end inline asm fma.rn.ftz.f32 %f449, %f448, %f363, %f445; cvt.rn.f32.s16 %f450, %rs113; sub.ftz.f32 %f451, %f450, %f7; mul.ftz.f32 %f452, %f345, %f451; mov.b32 {%rs408, %rs409}, %r195; // begin inline asm { cvt.f32.f16 %f364, %rs408;} // end inline asm fma.rn.ftz.f32 %f453, %f452, %f364, %f449; cvt.rn.f32.s16 %f454, %rs115; sub.ftz.f32 %f455, %f454, %f7; mul.ftz.f32 %f456, %f345, %f455; // begin inline asm { cvt.f32.f16 %f365, %rs409;} // end inline asm fma.rn.ftz.f32 %f457, %f456, %f365, %f453; cvt.rn.f32.s16 %f458, %rs117; sub.ftz.f32 %f459, %f458, 
%f7; mul.ftz.f32 %f460, %f345, %f459; mov.b32 {%rs410, %rs411}, %r196; // begin inline asm { cvt.f32.f16 %f366, %rs410;} // end inline asm fma.rn.ftz.f32 %f461, %f460, %f366, %f457; cvt.rn.f32.s16 %f462, %rs119; sub.ftz.f32 %f463, %f462, %f7; mul.ftz.f32 %f464, %f345, %f463; // begin inline asm { cvt.f32.f16 %f367, %rs411;} // end inline asm fma.rn.ftz.f32 %f465, %f464, %f367, %f461; cvt.rn.f32.s16 %f466, %rs121; sub.ftz.f32 %f467, %f466, %f7; mul.ftz.f32 %f468, %f345, %f467; mov.b32 {%rs412, %rs413}, %r197; // begin inline asm { cvt.f32.f16 %f368, %rs412;} // end inline asm fma.rn.ftz.f32 %f469, %f468, %f368, %f465; cvt.rn.f32.s16 %f470, %rs122; sub.ftz.f32 %f471, %f470, %f7; mul.ftz.f32 %f472, %f345, %f471; // begin inline asm { cvt.f32.f16 %f369, %rs413;} // end inline asm fma.rn.ftz.f32 %f473, %f472, %f369, %f469; ld.global.v4.u32 {%r202, %r203, %r204, %r205}, [%rd7+48]; cvt.rn.f32.s16 %f474, %rs124; sub.ftz.f32 %f475, %f474, %f7; mul.ftz.f32 %f476, %f345, %f475; mov.b32 {%rs414, %rs415}, %r202; // begin inline asm { cvt.f32.f16 %f370, %rs414;} // end inline asm fma.rn.ftz.f32 %f477, %f476, %f370, %f473; cvt.rn.f32.s16 %f478, %rs126; sub.ftz.f32 %f479, %f478, %f7; mul.ftz.f32 %f480, %f345, %f479; // begin inline asm { cvt.f32.f16 %f371, %rs415;} // end inline asm fma.rn.ftz.f32 %f481, %f480, %f371, %f477; cvt.rn.f32.s16 %f482, %rs128; sub.ftz.f32 %f483, %f482, %f7; mul.ftz.f32 %f484, %f345, %f483; mov.b32 {%rs416, %rs417}, %r203; // begin inline asm { cvt.f32.f16 %f372, %rs416;} // end inline asm fma.rn.ftz.f32 %f485, %f484, %f372, %f481; cvt.rn.f32.s16 %f486, %rs130; sub.ftz.f32 %f487, %f486, %f7; mul.ftz.f32 %f488, %f345, %f487; // begin inline asm { cvt.f32.f16 %f373, %rs417;} // end inline asm fma.rn.ftz.f32 %f489, %f488, %f373, %f485; cvt.rn.f32.s16 %f490, %rs132; sub.ftz.f32 %f491, %f490, %f7; mul.ftz.f32 %f492, %f345, %f491; mov.b32 {%rs418, %rs419}, %r204; // begin inline asm { cvt.f32.f16 %f374, %rs418;} // end inline asm fma.rn.ftz.f32 %f493, %f492, %f374, %f489; cvt.rn.f32.s16 %f494, %rs134; sub.ftz.f32 %f495, %f494, %f7; mul.ftz.f32 %f496, %f345, %f495; // begin inline asm { cvt.f32.f16 %f375, %rs419;} // end inline asm fma.rn.ftz.f32 %f497, %f496, %f375, %f493; cvt.rn.f32.s16 %f498, %rs136; sub.ftz.f32 %f499, %f498, %f7; mul.ftz.f32 %f500, %f345, %f499; mov.b32 {%rs420, %rs421}, %r205; // begin inline asm { cvt.f32.f16 %f376, %rs420;} // end inline asm fma.rn.ftz.f32 %f501, %f500, %f376, %f497; cvt.rn.f32.s16 %f502, %rs137; sub.ftz.f32 %f503, %f502, %f7; mul.ftz.f32 %f504, %f345, %f503; // begin inline asm { cvt.f32.f16 %f377, %rs421;} // end inline asm fma.rn.ftz.f32 %f687, %f504, %f377, %f501; bra.uni $L__BB0_13; $L__BB0_12: shl.b16 %rs454, %rs70, 4; cvt.s16.s8 %rs455, %rs454; shr.s16 %rs456, %rs455, 7; and.b16 %rs457, %rs456, -16; or.b16 %rs458, %rs457, %rs71; cvt.rn.f32.s16 %f537, %rs458; sub.ftz.f32 %f538, %f537, %f7; mul.ftz.f32 %f539, %f345, %f538; // begin inline asm { cvt.f32.f16 %f505, %rs72;} // end inline asm fma.rn.ftz.f32 %f540, %f539, %f505, %f687; shl.b16 %rs459, %rs73, 4; cvt.s16.s8 %rs460, %rs459; shr.s16 %rs461, %rs460, 7; and.b16 %rs462, %rs461, -16; or.b16 %rs463, %rs462, %rs74; cvt.rn.f32.s16 %f541, %rs463; sub.ftz.f32 %f542, %f541, %f7; mul.ftz.f32 %f543, %f345, %f542; // begin inline asm { cvt.f32.f16 %f506, %rs75;} // end inline asm fma.rn.ftz.f32 %f544, %f543, %f506, %f540; shl.b16 %rs464, %rs76, 4; cvt.s16.s8 %rs465, %rs464; shr.s16 %rs466, %rs465, 7; and.b16 %rs467, %rs466, -16; or.b16 %rs468, %rs467, %rs77; cvt.rn.f32.s16 %f545, %rs468; 
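// As in $L__BB0_6 above, this branch handles signed int4 (%rs146 == 0): each
// nibble is sign-extended with the shl.b16 4 / cvt.s16.s8 / shr.s16 7 /
// and.b16 -16 / or.b16 idiom, which replicates bit 3 of the nibble into the
// upper bits before the same subtract/scale/fma sequence.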
sub.ftz.f32 %f546, %f545, %f7; mul.ftz.f32 %f547, %f345, %f546; // begin inline asm { cvt.f32.f16 %f507, %rs78;} // end inline asm fma.rn.ftz.f32 %f548, %f547, %f507, %f544; shl.b16 %rs469, %rs79, 4; cvt.s16.s8 %rs470, %rs469; shr.s16 %rs471, %rs470, 7; and.b16 %rs472, %rs471, -16; or.b16 %rs473, %rs472, %rs80; cvt.rn.f32.s16 %f549, %rs473; sub.ftz.f32 %f550, %f549, %f7; mul.ftz.f32 %f551, %f345, %f550; // begin inline asm { cvt.f32.f16 %f508, %rs81;} // end inline asm fma.rn.ftz.f32 %f552, %f551, %f508, %f548; shl.b16 %rs474, %rs82, 4; cvt.s16.s8 %rs475, %rs474; shr.s16 %rs476, %rs475, 7; and.b16 %rs477, %rs476, -16; or.b16 %rs478, %rs477, %rs83; cvt.rn.f32.s16 %f553, %rs478; sub.ftz.f32 %f554, %f553, %f7; mul.ftz.f32 %f555, %f345, %f554; // begin inline asm { cvt.f32.f16 %f509, %rs84;} // end inline asm fma.rn.ftz.f32 %f556, %f555, %f509, %f552; shl.b16 %rs479, %rs85, 4; cvt.s16.s8 %rs480, %rs479; shr.s16 %rs481, %rs480, 7; and.b16 %rs482, %rs481, -16; or.b16 %rs483, %rs482, %rs86; cvt.rn.f32.s16 %f557, %rs483; sub.ftz.f32 %f558, %f557, %f7; mul.ftz.f32 %f559, %f345, %f558; // begin inline asm { cvt.f32.f16 %f510, %rs87;} // end inline asm fma.rn.ftz.f32 %f560, %f559, %f510, %f556; shl.b16 %rs484, %rs88, 4; cvt.s16.s8 %rs485, %rs484; shr.s16 %rs486, %rs485, 7; and.b16 %rs487, %rs486, -16; or.b16 %rs488, %rs487, %rs89; cvt.rn.f32.s16 %f561, %rs488; sub.ftz.f32 %f562, %f561, %f7; mul.ftz.f32 %f563, %f345, %f562; // begin inline asm { cvt.f32.f16 %f511, %rs90;} // end inline asm fma.rn.ftz.f32 %f564, %f563, %f511, %f560; shl.b16 %rs489, %rs91, 4; cvt.s16.s8 %rs490, %rs489; shr.s16 %rs491, %rs490, 7; and.b16 %rs492, %rs491, -16; or.b16 %rs493, %rs492, %rs91; cvt.rn.f32.s16 %f565, %rs493; sub.ftz.f32 %f566, %f565, %f7; mul.ftz.f32 %f567, %f345, %f566; // begin inline asm { cvt.f32.f16 %f512, %rs92;} // end inline asm fma.rn.ftz.f32 %f568, %f567, %f512, %f564; ld.global.v4.u32 {%r210, %r211, %r212, %r213}, [%rd7+16]; shl.b16 %rs494, %rs93, 4; cvt.s16.s8 %rs495, %rs494; shr.s16 %rs496, %rs495, 7; and.b16 %rs497, %rs496, -16; or.b16 %rs498, %rs497, %rs94; cvt.rn.f32.s16 %f569, %rs498; sub.ftz.f32 %f570, %f569, %f7; mul.ftz.f32 %f571, %f345, %f570; mov.b32 {%rs430, %rs431}, %r210; // begin inline asm { cvt.f32.f16 %f513, %rs430;} // end inline asm fma.rn.ftz.f32 %f572, %f571, %f513, %f568; shl.b16 %rs499, %rs95, 4; cvt.s16.s8 %rs500, %rs499; shr.s16 %rs501, %rs500, 7; and.b16 %rs502, %rs501, -16; or.b16 %rs503, %rs502, %rs96; cvt.rn.f32.s16 %f573, %rs503; sub.ftz.f32 %f574, %f573, %f7; mul.ftz.f32 %f575, %f345, %f574; // begin inline asm { cvt.f32.f16 %f514, %rs431;} // end inline asm fma.rn.ftz.f32 %f576, %f575, %f514, %f572; shl.b16 %rs504, %rs97, 4; cvt.s16.s8 %rs505, %rs504; shr.s16 %rs506, %rs505, 7; and.b16 %rs507, %rs506, -16; or.b16 %rs508, %rs507, %rs98; cvt.rn.f32.s16 %f577, %rs508; sub.ftz.f32 %f578, %f577, %f7; mul.ftz.f32 %f579, %f345, %f578; mov.b32 {%rs432, %rs433}, %r211; // begin inline asm { cvt.f32.f16 %f515, %rs432;} // end inline asm fma.rn.ftz.f32 %f580, %f579, %f515, %f576; shl.b16 %rs509, %rs99, 4; cvt.s16.s8 %rs510, %rs509; shr.s16 %rs511, %rs510, 7; and.b16 %rs512, %rs511, -16; or.b16 %rs513, %rs512, %rs100; cvt.rn.f32.s16 %f581, %rs513; sub.ftz.f32 %f582, %f581, %f7; mul.ftz.f32 %f583, %f345, %f582; // begin inline asm { cvt.f32.f16 %f516, %rs433;} // end inline asm fma.rn.ftz.f32 %f584, %f583, %f516, %f580; shl.b16 %rs514, %rs101, 4; cvt.s16.s8 %rs515, %rs514; shr.s16 %rs516, %rs515, 7; and.b16 %rs517, %rs516, -16; or.b16 %rs518, %rs517, %rs102; cvt.rn.f32.s16 %f585, 
%rs518; sub.ftz.f32 %f586, %f585, %f7; mul.ftz.f32 %f587, %f345, %f586; mov.b32 {%rs434, %rs435}, %r212; // begin inline asm { cvt.f32.f16 %f517, %rs434;} // end inline asm fma.rn.ftz.f32 %f588, %f587, %f517, %f584; shl.b16 %rs519, %rs103, 4; cvt.s16.s8 %rs520, %rs519; shr.s16 %rs521, %rs520, 7; and.b16 %rs522, %rs521, -16; or.b16 %rs523, %rs522, %rs104; cvt.rn.f32.s16 %f589, %rs523; sub.ftz.f32 %f590, %f589, %f7; mul.ftz.f32 %f591, %f345, %f590; // begin inline asm { cvt.f32.f16 %f518, %rs435;} // end inline asm fma.rn.ftz.f32 %f592, %f591, %f518, %f588; shl.b16 %rs524, %rs105, 4; cvt.s16.s8 %rs525, %rs524; shr.s16 %rs526, %rs525, 7; and.b16 %rs527, %rs526, -16; or.b16 %rs528, %rs527, %rs106; cvt.rn.f32.s16 %f593, %rs528; sub.ftz.f32 %f594, %f593, %f7; mul.ftz.f32 %f595, %f345, %f594; mov.b32 {%rs436, %rs437}, %r213; // begin inline asm { cvt.f32.f16 %f519, %rs436;} // end inline asm fma.rn.ftz.f32 %f596, %f595, %f519, %f592; shl.b16 %rs529, %rs107, 4; cvt.s16.s8 %rs530, %rs529; shr.s16 %rs531, %rs530, 7; and.b16 %rs532, %rs531, -16; or.b16 %rs533, %rs532, %rs107; cvt.rn.f32.s16 %f597, %rs533; sub.ftz.f32 %f598, %f597, %f7; mul.ftz.f32 %f599, %f345, %f598; // begin inline asm { cvt.f32.f16 %f520, %rs437;} // end inline asm fma.rn.ftz.f32 %f600, %f599, %f520, %f596; ld.global.v4.u32 {%r218, %r219, %r220, %r221}, [%rd7+32]; shl.b16 %rs534, %rs108, 4; cvt.s16.s8 %rs535, %rs534; shr.s16 %rs536, %rs535, 7; and.b16 %rs537, %rs536, -16; or.b16 %rs538, %rs537, %rs109; cvt.rn.f32.s16 %f601, %rs538; sub.ftz.f32 %f602, %f601, %f7; mul.ftz.f32 %f603, %f345, %f602; mov.b32 {%rs438, %rs439}, %r218; // begin inline asm { cvt.f32.f16 %f521, %rs438;} // end inline asm fma.rn.ftz.f32 %f604, %f603, %f521, %f600; shl.b16 %rs539, %rs110, 4; cvt.s16.s8 %rs540, %rs539; shr.s16 %rs541, %rs540, 7; and.b16 %rs542, %rs541, -16; or.b16 %rs543, %rs542, %rs111; cvt.rn.f32.s16 %f605, %rs543; sub.ftz.f32 %f606, %f605, %f7; mul.ftz.f32 %f607, %f345, %f606; // begin inline asm { cvt.f32.f16 %f522, %rs439;} // end inline asm fma.rn.ftz.f32 %f608, %f607, %f522, %f604; shl.b16 %rs544, %rs112, 4; cvt.s16.s8 %rs545, %rs544; shr.s16 %rs546, %rs545, 7; and.b16 %rs547, %rs546, -16; or.b16 %rs548, %rs547, %rs113; cvt.rn.f32.s16 %f609, %rs548; sub.ftz.f32 %f610, %f609, %f7; mul.ftz.f32 %f611, %f345, %f610; mov.b32 {%rs440, %rs441}, %r219; // begin inline asm { cvt.f32.f16 %f523, %rs440;} // end inline asm fma.rn.ftz.f32 %f612, %f611, %f523, %f608; shl.b16 %rs549, %rs114, 4; cvt.s16.s8 %rs550, %rs549; shr.s16 %rs551, %rs550, 7; and.b16 %rs552, %rs551, -16; or.b16 %rs553, %rs552, %rs115; cvt.rn.f32.s16 %f613, %rs553; sub.ftz.f32 %f614, %f613, %f7; mul.ftz.f32 %f615, %f345, %f614; // begin inline asm { cvt.f32.f16 %f524, %rs441;} // end inline asm fma.rn.ftz.f32 %f616, %f615, %f524, %f612; shl.b16 %rs554, %rs116, 4; cvt.s16.s8 %rs555, %rs554; shr.s16 %rs556, %rs555, 7; and.b16 %rs557, %rs556, -16; or.b16 %rs558, %rs557, %rs117; cvt.rn.f32.s16 %f617, %rs558; sub.ftz.f32 %f618, %f617, %f7; mul.ftz.f32 %f619, %f345, %f618; mov.b32 {%rs442, %rs443}, %r220; // begin inline asm { cvt.f32.f16 %f525, %rs442;} // end inline asm fma.rn.ftz.f32 %f620, %f619, %f525, %f616; shl.b16 %rs559, %rs118, 4; cvt.s16.s8 %rs560, %rs559; shr.s16 %rs561, %rs560, 7; and.b16 %rs562, %rs561, -16; or.b16 %rs563, %rs562, %rs119; cvt.rn.f32.s16 %f621, %rs563; sub.ftz.f32 %f622, %f621, %f7; mul.ftz.f32 %f623, %f345, %f622; // begin inline asm { cvt.f32.f16 %f526, %rs443;} // end inline asm fma.rn.ftz.f32 %f624, %f623, %f526, %f620; shl.b16 %rs564, %rs120, 4; 
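// Note the access pattern: the fp16 inputs are fetched 16 bytes at a time
// (ld.global.v4.u32 at [%rd7], +16, +32, +48), so each loop iteration
// consumes 64 bytes of fp16 input against one 16-byte load of packed int4
// weights.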
cvt.s16.s8 %rs565, %rs564; shr.s16 %rs566, %rs565, 7; and.b16 %rs567, %rs566, -16; or.b16 %rs568, %rs567, %rs121; cvt.rn.f32.s16 %f625, %rs568; sub.ftz.f32 %f626, %f625, %f7; mul.ftz.f32 %f627, %f345, %f626; mov.b32 {%rs444, %rs445}, %r221; // begin inline asm { cvt.f32.f16 %f527, %rs444;} // end inline asm fma.rn.ftz.f32 %f628, %f627, %f527, %f624; shl.b16 %rs569, %rs122, 4; cvt.s16.s8 %rs570, %rs569; shr.s16 %rs571, %rs570, 7; and.b16 %rs572, %rs571, -16; or.b16 %rs573, %rs572, %rs122; cvt.rn.f32.s16 %f629, %rs573; sub.ftz.f32 %f630, %f629, %f7; mul.ftz.f32 %f631, %f345, %f630; // begin inline asm { cvt.f32.f16 %f528, %rs445;} // end inline asm fma.rn.ftz.f32 %f632, %f631, %f528, %f628; ld.global.v4.u32 {%r226, %r227, %r228, %r229}, [%rd7+48]; shl.b16 %rs574, %rs123, 4; cvt.s16.s8 %rs575, %rs574; shr.s16 %rs576, %rs575, 7; and.b16 %rs577, %rs576, -16; or.b16 %rs578, %rs577, %rs124; cvt.rn.f32.s16 %f633, %rs578; sub.ftz.f32 %f634, %f633, %f7; mul.ftz.f32 %f635, %f345, %f634; mov.b32 {%rs446, %rs447}, %r226; // begin inline asm { cvt.f32.f16 %f529, %rs446;} // end inline asm fma.rn.ftz.f32 %f636, %f635, %f529, %f632; shl.b16 %rs579, %rs125, 4; cvt.s16.s8 %rs580, %rs579; shr.s16 %rs581, %rs580, 7; and.b16 %rs582, %rs581, -16; or.b16 %rs583, %rs582, %rs126; cvt.rn.f32.s16 %f637, %rs583; sub.ftz.f32 %f638, %f637, %f7; mul.ftz.f32 %f639, %f345, %f638; // begin inline asm { cvt.f32.f16 %f530, %rs447;} // end inline asm fma.rn.ftz.f32 %f640, %f639, %f530, %f636; shl.b16 %rs584, %rs127, 4; cvt.s16.s8 %rs585, %rs584; shr.s16 %rs586, %rs585, 7; and.b16 %rs587, %rs586, -16; or.b16 %rs588, %rs587, %rs128; cvt.rn.f32.s16 %f641, %rs588; sub.ftz.f32 %f642, %f641, %f7; mul.ftz.f32 %f643, %f345, %f642; mov.b32 {%rs448, %rs449}, %r227; // begin inline asm { cvt.f32.f16 %f531, %rs448;} // end inline asm fma.rn.ftz.f32 %f644, %f643, %f531, %f640; shl.b16 %rs589, %rs129, 4; cvt.s16.s8 %rs590, %rs589; shr.s16 %rs591, %rs590, 7; and.b16 %rs592, %rs591, -16; or.b16 %rs593, %rs592, %rs130; cvt.rn.f32.s16 %f645, %rs593; sub.ftz.f32 %f646, %f645, %f7; mul.ftz.f32 %f647, %f345, %f646; // begin inline asm { cvt.f32.f16 %f532, %rs449;} // end inline asm fma.rn.ftz.f32 %f648, %f647, %f532, %f644; shl.b16 %rs594, %rs131, 4; cvt.s16.s8 %rs595, %rs594; shr.s16 %rs596, %rs595, 7; and.b16 %rs597, %rs596, -16; or.b16 %rs598, %rs597, %rs132; cvt.rn.f32.s16 %f649, %rs598; sub.ftz.f32 %f650, %f649, %f7; mul.ftz.f32 %f651, %f345, %f650; mov.b32 {%rs450, %rs451}, %r228; // begin inline asm { cvt.f32.f16 %f533, %rs450;} // end inline asm fma.rn.ftz.f32 %f652, %f651, %f533, %f648; shl.b16 %rs599, %rs133, 4; cvt.s16.s8 %rs600, %rs599; shr.s16 %rs601, %rs600, 7; and.b16 %rs602, %rs601, -16; or.b16 %rs603, %rs602, %rs134; cvt.rn.f32.s16 %f653, %rs603; sub.ftz.f32 %f654, %f653, %f7; mul.ftz.f32 %f655, %f345, %f654; // begin inline asm { cvt.f32.f16 %f534, %rs451;} // end inline asm fma.rn.ftz.f32 %f656, %f655, %f534, %f652; shl.b16 %rs604, %rs135, 4; cvt.s16.s8 %rs605, %rs604; shr.s16 %rs606, %rs605, 7; and.b16 %rs607, %rs606, -16; or.b16 %rs608, %rs607, %rs136; cvt.rn.f32.s16 %f657, %rs608; sub.ftz.f32 %f658, %f657, %f7; mul.ftz.f32 %f659, %f345, %f658; mov.b32 {%rs452, %rs453}, %r229; // begin inline asm { cvt.f32.f16 %f535, %rs452;} // end inline asm fma.rn.ftz.f32 %f660, %f659, %f535, %f656; shl.b16 %rs609, %rs137, 4; cvt.s16.s8 %rs610, %rs609; shr.s16 %rs611, %rs610, 7; and.b16 %rs612, %rs611, -16; or.b16 %rs613, %rs612, %rs137; cvt.rn.f32.s16 %f661, %rs613; sub.ftz.f32 %f662, %f661, %f7; mul.ftz.f32 %f663, %f345, %f662; // begin 
{ cvt.f32.f16 %f536, %rs453;}
    // end inline asm
    fma.rn.ftz.f32 %f687, %f663, %f536, %f660;

$L__BB0_13:
    add.s32 %r261, %r261, 4;
    shl.b32 %r234, %r261, 5;
    add.s32 %r263, %r234, %r3;
    shl.b32 %r259, %r263, 2;
    setp.lt.u32 %p10, %r259, %r41;
    @%p10 bra $L__BB0_9;

$L__BB0_14:
    // stage per-thread partial sums: threads with linear id >= 32 write
    // theirs to shared memory
    setp.lt.u32 %p11, %r4, 32;
    @%p11 bra $L__BB0_16;

    shl.b32 %r235, %r4, 2;
    mov.u32 %r236, _ZZ9gemv_int4ILi4ELi128ELi1EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage;
    add.s32 %r237, %r236, %r235;
    st.shared.f32 [%r237+-112], %f687;

$L__BB0_16:
    bar.sync 0;
    setp.gt.u32 %p12, %r4, 31;
    @%p12 bra $L__BB0_18;

    // the first warp rakes three staged partials per lane, then finishes with
    // a shfl.sync.down tree (offsets 1, 2, 4, 8, 16)
    mov.u32 %r254, _ZZ9gemv_int4ILi4ELi128ELi1EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage;
    mad.lo.s32 %r255, %r4, 12, %r254;
    mov.u32 %r242, 2;
    mov.u32 %r251, 16;
    ld.shared.f32 %f679, [%r255+16];
    add.ftz.f32 %f680, %f687, %f679;
    ld.shared.f32 %f681, [%r255+20];
    add.ftz.f32 %f682, %f680, %f681;
    ld.shared.f32 %f683, [%r255+24];
    add.ftz.f32 %f666, %f682, %f683;
    mov.u32 %r239, 1;
    mov.u32 %r252, 31;
    mov.u32 %r253, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f666, %r239, %r252, %r253; @p add.f32 r0, r0, %f666; mov.f32 %f664, r0;}
    // end inline asm
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f664, %r242, %r252, %r253; @p add.f32 r0, r0, %f664; mov.f32 %f667, r0;}
    // end inline asm
    mov.u32 %r245, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f667, %r245, %r252, %r253; @p add.f32 r0, r0, %f667; mov.f32 %f670, r0;}
    // end inline asm
    mov.u32 %r248, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f670, %r248, %r252, %r253; @p add.f32 r0, r0, %f670; mov.f32 %f673, r0;}
    // end inline asm
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f673, %r251, %r252, %r253; @p add.f32 r0, r0, %f673; mov.f32 %f687, r0;}
    // end inline asm

$L__BB0_18:
    // thread (0,0) scales the block sum by %f22, optionally adds
    // %f23 * bias[%ctaid.x], and stores the fp16 result
    mov.u32 %r257, %tid.y;
    or.b32 %r256, %r3, %r257;
    setp.ne.s32 %p13, %r256, 0;
    @%p13 bra $L__BB0_22;

    ld.param.u64 %rd34, [_Z28dequant_gemv_group128_batch123DequantGemvKernelParams_param_0+8];
    mov.u32 %r258, %ctaid.x;
    mul.ftz.f32 %f692, %f22, %f687;
    cvt.s64.s32 %rd9, %r258;
    setp.eq.s64 %p14, %rd34, 0;
    @%p14 bra $L__BB0_21;

    ld.param.u64 %rd35, [_Z28dequant_gemv_group128_batch123DequantGemvKernelParams_param_0+8];
    cvta.to.global.u64 %rd28, %rd35;
    shl.b64 %rd29, %rd9, 1;
    add.s64 %rd30, %rd28, %rd29;
    ld.global.u16 %rs614, [%rd30];
    // begin inline asm
    { cvt.f32.f16 %f684, %rs614;}
    // end inline asm
    fma.rn.ftz.f32 %f692, %f23, %f684, %f692;

$L__BB0_21:
    ld.param.u64 %rd36, [_Z28dequant_gemv_group128_batch123DequantGemvKernelParams_param_0];
    // begin inline asm
    { cvt.rn.f16.f32 %rs615, %f692;}
    // end inline asm
    cvta.to.global.u64 %rd31, %rd36;
    shl.b64 %rd32, %rd9, 1;
    add.s64 %rd33, %rd31, %rd32;
    st.global.u16 [%rd33], %rs615;

$L__BB0_22:
    ret;
}

// .globl _ZN3cub11EmptyKernelIvEEvv
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
    ret;
}
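// For orientation, the kernel above corresponds to roughly the following
// CUDA-level pseudocode. This is a reconstruction from the PTX, not the
// original source; all names (packed_weights, zeros, scales, x, y, bias,
// alpha, beta, dequant, block_reduce) are assumed:
//
//   // one block per output element (row = blockIdx.x)
//   float acc = 0.f;
//   for (each 32-element slice this thread owns along K) {
//     uint4 w  = packed_weights[...];            // 32 int4 values
//     half  s  = scales[...];                    // per 128-wide group
//     float zp = zeros ? unpack_nibble(zeros, ...)   // two per byte
//                      : (is_unsigned ? 8.f : 0.f);  // %rs146 flag
//     for (each nibble q in w)                   // signed mode sign-extends q
//       acc += __half2float(s) * (dequant(q) - zp) * __half2float(x[k]);
//   }
//   float sum = block_reduce(acc, temp_storage); // shared staging + shfl tree
//   if (threadIdx.x == 0 && threadIdx.y == 0) {
//     float out = alpha * sum;                                  // %f22
//     if (bias) out += beta * __half2float(bias[blockIdx.x]);   // %f23
//     y[blockIdx.x] = __float2half(out);
//   }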