n 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_c74955c76thrust12placeholders2_3E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_c74955c76thrust12placeholders2_4E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_c74955c76thrust12placeholders2_5E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_c74955c76thrust12placeholders2_6E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_c74955c76thrust12placeholders2_7E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_c74955c76thrust12placeholders2_8E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_c74955c76thrust12placeholders2_9E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_c74955c76thrust12placeholders3_10E[1]; .visible .entry _Z28dequant_gemv_group128_batch223DequantGemvKernelParams( .param .align 8 .b8 _Z28dequant_gemv_group128_batch223DequantGemvKernelParams_param_0[80] ) { .reg .pred %p<14>; .reg .b16 %rs<398>; .reg .f32 %f<526>; .reg .b32 %r<250>; .reg .b64 %rd<38>; // demoted variable .shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi128ELi2EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[1280]; ld.param.v2.u32 {%r35, %r36}, [_Z28dequant_gemv_group128_batch223DequantGemvKernelParams_param_0+48]; ld.param.v2.u32 {%r37, %r38}, [_Z28dequant_gemv_group128_batch223DequantGemvKernelParams_param_0+56]; ld.param.v2.f32 {%f23, %f24}, [_Z28dequant_gemv_group128_batch223DequantGemvKernelParams_param_0+64]; ld.param.v4.u8 {%rs80, %rs81, %rs82, %rs83}, [_Z28dequant_gemv_group128_batch223DequantGemvKernelParams_param_0+72]; ld.param.u64 %rd17, [_Z28dequant_gemv_group128_batch223DequantGemvKernelParams_param_0+40]; ld.param.u64 %rd16, [_Z28dequant_gemv_group128_batch223DequantGemvKernelParams_param_0+32]; ld.param.u64 %rd15, [_Z28dequant_gemv_group128_batch223DequantGemvKernelParams_param_0+24]; ld.param.u64 %rd14, [_Z28dequant_gemv_group128_batch223DequantGemvKernelParams_param_0+16]; mov.u32 %r249, %tid.y; shl.b32 %r39, %r249, 5; mov.u32 %r40, %tid.x; add.s32 %r248, 
%r39, %r40; shl.b32 %r247, %r248, 2; setp.ge.u32 %p1, %r247, %r37; mov.f32 %f520, 0f00000000; mov.f32 %f521, %f520; @%p1 bra $L__BB0_9; mov.u32 %r41, %ctaid.x; mul.lo.s32 %r6, %r38, %r41; shl.b16 %rs2, %rs80, 3; cvta.to.global.u64 %rd2, %rd17; cvta.to.global.u64 %rd3, %rd15; cvta.to.global.u64 %rd4, %rd14; $L__BB0_2: mad.lo.s32 %r43, %r37, %r41, %r247; mul.wide.u32 %rd18, %r43, 4; add.s64 %rd19, %rd3, %rd18; ld.global.v4.u32 {%r44, %r45, %r46, %r47}, [%rd19]; shr.u32 %r49, %r40, 2; shl.b32 %r50, %r249, 3; add.s32 %r14, %r50, %r49; add.s32 %r15, %r14, %r6; mul.wide.s32 %rd20, %r15, 2; add.s64 %rd21, %rd2, %rd20; ld.global.u16 %rs88, [%rd21]; // begin inline asm { cvt.f32.f16 %f27, %rs88;} // end inline asm setp.eq.s64 %p2, %rd16, 0; mov.u16 %rs397, %rs2; @%p2 bra $L__BB0_4; shr.u32 %r51, %r15, 31; add.s32 %r52, %r15, %r51; shr.s32 %r53, %r52, 1; cvt.s64.s32 %rd22, %r53; cvta.to.global.u64 %rd23, %rd16; add.s64 %rd24, %rd23, %rd22; ld.global.u8 %r54, [%rd24]; shl.b32 %r55, %r14, 2; and.b32 %r56, %r55, 4; shr.u32 %r57, %r54, %r56; cvt.u16.u32 %rs89, %r57; and.b16 %rs397, %rs89, 15; $L__BB0_4: shl.b32 %r16, %r248, 5; setp.ge.s32 %p3, %r16, %r35; @%p3 bra $L__BB0_8; setp.eq.s16 %p4, %rs80, 0; shr.u16 %rs91, %rs397, 3; and.b16 %rs92, %rs91, 1; setp.eq.b16 %p5, %rs92, 1; and.pred %p6, %p4, %p5; selp.b16 %rs93, -16, 0, %p6; or.b16 %rs94, %rs93, %rs397; cvt.s16.s8 %rs95, %rs94; cvt.rn.f32.s16 %f4, %rs95; mul.wide.s32 %rd25, %r16, 2; add.s64 %rd5, %rd4, %rd25; ld.global.v4.u32 {%r58, %r59, %r60, %r61}, [%rd5]; mul.wide.s32 %rd26, %r35, 2; add.s64 %rd6, %rd5, %rd26; ld.global.v4.u32 {%r62, %r63, %r64, %r65}, [%rd6]; cvt.u16.u32 %rs96, %r44; and.b16 %rs5, %rs96, 15; mov.b32 {%rs6, %rs9}, %r58; mov.b32 {%rs7, %rs10}, %r62; shr.u16 %rs97, %rs96, 4; and.b16 %rs8, %rs97, 15; shr.u16 %rs98, %rs96, 8; and.b16 %rs11, %rs98, 15; mov.b32 {%rs12, %rs15}, %r59; mov.b32 {%rs13, %rs16}, %r63; shr.u16 %rs14, %rs96, 12; shr.u32 %r66, %r44, 16; cvt.u16.u32 %rs99, %r66; and.b16 %rs17, %rs99, 
15; mov.b32 {%rs18, %rs24}, %r60; mov.b32 {%rs20, %rs25}, %r64; shr.u32 %r67, %r44, 20; cvt.u16.u32 %rs22, %r67; and.b16 %rs23, %rs22, 15; shr.u32 %r68, %r44, 24; cvt.u16.u32 %rs26, %r68; and.b16 %rs27, %rs26, 15; shr.u32 %r69, %r44, 28; cvt.u16.u32 %rs28, %r69; cvt.u16.u32 %rs100, %r45; and.b16 %rs29, %rs100, 15; shr.u32 %r70, %r45, 4; cvt.u16.u32 %rs30, %r70; and.b16 %rs31, %rs30, 15; shr.u32 %r71, %r45, 8; cvt.u16.u32 %rs32, %r71; and.b16 %rs33, %rs32, 15; shr.u32 %r72, %r45, 12; cvt.u16.u32 %rs34, %r72; and.b16 %rs35, %rs34, 15; shr.u32 %r73, %r45, 16; cvt.u16.u32 %rs36, %r73; and.b16 %rs37, %rs36, 15; shr.u32 %r74, %r45, 20; cvt.u16.u32 %rs38, %r74; and.b16 %rs39, %rs38, 15; shr.u32 %r75, %r45, 24; cvt.u16.u32 %rs40, %r75; and.b16 %rs41, %rs40, 15; shr.u32 %r76, %r45, 28; cvt.u16.u32 %rs42, %r76; cvt.u16.u32 %rs101, %r46; and.b16 %rs43, %rs101, 15; shr.u32 %r77, %r46, 4; cvt.u16.u32 %rs44, %r77; and.b16 %rs45, %rs44, 15; shr.u32 %r78, %r46, 8; cvt.u16.u32 %rs46, %r78; and.b16 %rs47, %rs46, 15; shr.u32 %r79, %r46, 12; cvt.u16.u32 %rs48, %r79; and.b16 %rs49, %rs48, 15; shr.u32 %r80, %r46, 16; cvt.u16.u32 %rs50, %r80; and.b16 %rs51, %rs50, 15; shr.u32 %r81, %r46, 20; cvt.u16.u32 %rs52, %r81; and.b16 %rs53, %rs52, 15; shr.u32 %r82, %r46, 24; cvt.u16.u32 %rs54, %r82; and.b16 %rs55, %rs54, 15; shr.u32 %r83, %r46, 28; cvt.u16.u32 %rs56, %r83; cvt.u16.u32 %rs57, %r47; and.b16 %rs58, %rs57, 15; shr.u32 %r84, %r47, 4; cvt.u16.u32 %rs59, %r84; and.b16 %rs60, %rs59, 15; shr.u32 %r85, %r47, 8; cvt.u16.u32 %rs61, %r85; and.b16 %rs62, %rs61, 15; shr.u32 %r86, %r47, 12; cvt.u16.u32 %rs63, %r86; and.b16 %rs64, %rs63, 15; shr.u32 %r87, %r47, 16; cvt.u16.u32 %rs65, %r87; and.b16 %rs66, %rs65, 15; shr.u32 %r88, %r47, 20; cvt.u16.u32 %rs67, %r88; and.b16 %rs68, %rs67, 15; shr.u32 %r89, %r47, 24; cvt.u16.u32 %rs69, %r89; and.b16 %rs70, %rs69, 15; shr.u32 %r90, %r47, 28; cvt.u16.u32 %rs71, %r90; @%p4 bra $L__BB0_7; cvt.rn.f32.s16 %f92, %rs5; sub.ftz.f32 %f93, %f92, %f4; mul.ftz.f32 
%f94, %f27, %f93; // begin inline asm { cvt.f32.f16 %f28, %rs6;} // end inline asm fma.rn.ftz.f32 %f95, %f94, %f28, %f521; // begin inline asm { cvt.f32.f16 %f29, %rs7;} // end inline asm fma.rn.ftz.f32 %f96, %f94, %f29, %f520; cvt.rn.f32.s16 %f97, %rs8; sub.ftz.f32 %f98, %f97, %f4; mul.ftz.f32 %f99, %f27, %f98; // begin inline asm { cvt.f32.f16 %f30, %rs9;} // end inline asm fma.rn.ftz.f32 %f100, %f99, %f30, %f95; // begin inline asm { cvt.f32.f16 %f31, %rs10;} // end inline asm fma.rn.ftz.f32 %f101, %f99, %f31, %f96; cvt.rn.f32.s16 %f102, %rs11; sub.ftz.f32 %f103, %f102, %f4; mul.ftz.f32 %f104, %f27, %f103; // begin inline asm { cvt.f32.f16 %f32, %rs12;} // end inline asm fma.rn.ftz.f32 %f105, %f104, %f32, %f100; // begin inline asm { cvt.f32.f16 %f33, %rs13;} // end inline asm fma.rn.ftz.f32 %f106, %f104, %f33, %f101; cvt.rn.f32.s16 %f107, %rs14; sub.ftz.f32 %f108, %f107, %f4; mul.ftz.f32 %f109, %f27, %f108; // begin inline asm { cvt.f32.f16 %f34, %rs15;} // end inline asm fma.rn.ftz.f32 %f110, %f109, %f34, %f105; // begin inline asm { cvt.f32.f16 %f35, %rs16;} // end inline asm fma.rn.ftz.f32 %f111, %f109, %f35, %f106; cvt.rn.f32.s16 %f112, %rs17; sub.ftz.f32 %f113, %f112, %f4; mul.ftz.f32 %f114, %f27, %f113; // begin inline asm { cvt.f32.f16 %f36, %rs18;} // end inline asm fma.rn.ftz.f32 %f115, %f114, %f36, %f110; // begin inline asm { cvt.f32.f16 %f37, %rs20;} // end inline asm fma.rn.ftz.f32 %f116, %f114, %f37, %f111; cvt.rn.f32.s16 %f117, %rs23; sub.ftz.f32 %f118, %f117, %f4; mul.ftz.f32 %f119, %f27, %f118; // begin inline asm { cvt.f32.f16 %f38, %rs24;} // end inline asm fma.rn.ftz.f32 %f120, %f119, %f38, %f115; // begin inline asm { cvt.f32.f16 %f39, %rs25;} // end inline asm fma.rn.ftz.f32 %f121, %f119, %f39, %f116; cvt.rn.f32.s16 %f122, %rs27; sub.ftz.f32 %f123, %f122, %f4; mul.ftz.f32 %f124, %f27, %f123; mov.b32 {%rs114, %rs116}, %r61; // begin inline asm { cvt.f32.f16 %f40, %rs114;} // end inline asm fma.rn.ftz.f32 %f125, %f124, %f40, %f120; mov.b32 
{%rs115, %rs117}, %r65; // begin inline asm { cvt.f32.f16 %f41, %rs115;} // end inline asm fma.rn.ftz.f32 %f126, %f124, %f41, %f121; cvt.rn.f32.s16 %f127, %rs28; sub.ftz.f32 %f128, %f127, %f4; mul.ftz.f32 %f129, %f27, %f128; // begin inline asm { cvt.f32.f16 %f42, %rs116;} // end inline asm fma.rn.ftz.f32 %f130, %f129, %f42, %f125; // begin inline asm { cvt.f32.f16 %f43, %rs117;} // end inline asm fma.rn.ftz.f32 %f131, %f129, %f43, %f126; ld.global.v4.u32 {%r91, %r92, %r93, %r94}, [%rd5+16]; ld.global.v4.u32 {%r99, %r100, %r101, %r102}, [%rd6+16]; cvt.rn.f32.s16 %f132, %rs29; sub.ftz.f32 %f133, %f132, %f4; mul.ftz.f32 %f134, %f27, %f133; mov.b32 {%rs118, %rs120}, %r91; // begin inline asm { cvt.f32.f16 %f44, %rs118;} // end inline asm fma.rn.ftz.f32 %f135, %f134, %f44, %f130; mov.b32 {%rs119, %rs121}, %r99; // begin inline asm { cvt.f32.f16 %f45, %rs119;} // end inline asm fma.rn.ftz.f32 %f136, %f134, %f45, %f131; cvt.rn.f32.s16 %f137, %rs31; sub.ftz.f32 %f138, %f137, %f4; mul.ftz.f32 %f139, %f27, %f138; // begin inline asm { cvt.f32.f16 %f46, %rs120;} // end inline asm fma.rn.ftz.f32 %f140, %f139, %f46, %f135; // begin inline asm { cvt.f32.f16 %f47, %rs121;} // end inline asm fma.rn.ftz.f32 %f141, %f139, %f47, %f136; cvt.rn.f32.s16 %f142, %rs33; sub.ftz.f32 %f143, %f142, %f4; mul.ftz.f32 %f144, %f27, %f143; mov.b32 {%rs122, %rs124}, %r92; // begin inline asm { cvt.f32.f16 %f48, %rs122;} // end inline asm fma.rn.ftz.f32 %f145, %f144, %f48, %f140; mov.b32 {%rs123, %rs125}, %r100; // begin inline asm { cvt.f32.f16 %f49, %rs123;} // end inline asm fma.rn.ftz.f32 %f146, %f144, %f49, %f141; cvt.rn.f32.s16 %f147, %rs35; sub.ftz.f32 %f148, %f147, %f4; mul.ftz.f32 %f149, %f27, %f148; // begin inline asm { cvt.f32.f16 %f50, %rs124;} // end inline asm fma.rn.ftz.f32 %f150, %f149, %f50, %f145; // begin inline asm { cvt.f32.f16 %f51, %rs125;} // end inline asm fma.rn.ftz.f32 %f151, %f149, %f51, %f146; cvt.rn.f32.s16 %f152, %rs37; sub.ftz.f32 %f153, %f152, %f4; mul.ftz.f32 
%f154, %f27, %f153; mov.b32 {%rs126, %rs128}, %r93; // begin inline asm { cvt.f32.f16 %f52, %rs126;} // end inline asm fma.rn.ftz.f32 %f155, %f154, %f52, %f150; mov.b32 {%rs127, %rs129}, %r101; // begin inline asm { cvt.f32.f16 %f53, %rs127;} // end inline asm fma.rn.ftz.f32 %f156, %f154, %f53, %f151; cvt.rn.f32.s16 %f157, %rs39; sub.ftz.f32 %f158, %f157, %f4; mul.ftz.f32 %f159, %f27, %f158; // begin inline asm { cvt.f32.f16 %f54, %rs128;} // end inline asm fma.rn.ftz.f32 %f160, %f159, %f54, %f155; // begin inline asm { cvt.f32.f16 %f55, %rs129;} // end inline asm fma.rn.ftz.f32 %f161, %f159, %f55, %f156; cvt.rn.f32.s16 %f162, %rs41; sub.ftz.f32 %f163, %f162, %f4; mul.ftz.f32 %f164, %f27, %f163; mov.b32 {%rs130, %rs132}, %r94; // begin inline asm { cvt.f32.f16 %f56, %rs130;} // end inline asm fma.rn.ftz.f32 %f165, %f164, %f56, %f160; mov.b32 {%rs131, %rs133}, %r102; // begin inline asm { cvt.f32.f16 %f57, %rs131;} // end inline asm fma.rn.ftz.f32 %f166, %f164, %f57, %f161; cvt.rn.f32.s16 %f167, %rs42; sub.ftz.f32 %f168, %f167, %f4; mul.ftz.f32 %f169, %f27, %f168; // begin inline asm { cvt.f32.f16 %f58, %rs132;} // end inline asm fma.rn.ftz.f32 %f170, %f169, %f58, %f165; // begin inline asm { cvt.f32.f16 %f59, %rs133;} // end inline asm fma.rn.ftz.f32 %f171, %f169, %f59, %f166; ld.global.v4.u32 {%r107, %r108, %r109, %r110}, [%rd5+32]; ld.global.v4.u32 {%r115, %r116, %r117, %r118}, [%rd6+32]; cvt.rn.f32.s16 %f172, %rs43; sub.ftz.f32 %f173, %f172, %f4; mul.ftz.f32 %f174, %f27, %f173; mov.b32 {%rs134, %rs136}, %r107; // begin inline asm { cvt.f32.f16 %f60, %rs134;} // end inline asm fma.rn.ftz.f32 %f175, %f174, %f60, %f170; mov.b32 {%rs135, %rs137}, %r115; // begin inline asm { cvt.f32.f16 %f61, %rs135;} // end inline asm fma.rn.ftz.f32 %f176, %f174, %f61, %f171; cvt.rn.f32.s16 %f177, %rs45; sub.ftz.f32 %f178, %f177, %f4; mul.ftz.f32 %f179, %f27, %f178; // begin inline asm { cvt.f32.f16 %f62, %rs136;} // end inline asm fma.rn.ftz.f32 %f180, %f179, %f62, %f175; // begin 
inline asm { cvt.f32.f16 %f63, %rs137;} // end inline asm fma.rn.ftz.f32 %f181, %f179, %f63, %f176; cvt.rn.f32.s16 %f182, %rs47; sub.ftz.f32 %f183, %f182, %f4; mul.ftz.f32 %f184, %f27, %f183; mov.b32 {%rs138, %rs140}, %r108; // begin inline asm { cvt.f32.f16 %f64, %rs138;} // end inline asm fma.rn.ftz.f32 %f185, %f184, %f64, %f180; mov.b32 {%rs139, %rs141}, %r116; // begin inline asm { cvt.f32.f16 %f65, %rs139;} // end inline asm fma.rn.ftz.f32 %f186, %f184, %f65, %f181; cvt.rn.f32.s16 %f187, %rs49; sub.ftz.f32 %f188, %f187, %f4; mul.ftz.f32 %f189, %f27, %f188; // begin inline asm { cvt.f32.f16 %f66, %rs140;} // end inline asm fma.rn.ftz.f32 %f190, %f189, %f66, %f185; // begin inline asm { cvt.f32.f16 %f67, %rs141;} // end inline asm fma.rn.ftz.f32 %f191, %f189, %f67, %f186; cvt.rn.f32.s16 %f192, %rs51; sub.ftz.f32 %f193, %f192, %f4; mul.ftz.f32 %f194, %f27, %f193; mov.b32 {%rs142, %rs144}, %r109; // begin inline asm { cvt.f32.f16 %f68, %rs142;} // end inline asm fma.rn.ftz.f32 %f195, %f194, %f68, %f190; mov.b32 {%rs143, %rs145}, %r117; // begin inline asm { cvt.f32.f16 %f69, %rs143;} // end inline asm fma.rn.ftz.f32 %f196, %f194, %f69, %f191; cvt.rn.f32.s16 %f197, %rs53; sub.ftz.f32 %f198, %f197, %f4; mul.ftz.f32 %f199, %f27, %f198; // begin inline asm { cvt.f32.f16 %f70, %rs144;} // end inline asm fma.rn.ftz.f32 %f200, %f199, %f70, %f195; // begin inline asm { cvt.f32.f16 %f71, %rs145;} // end inline asm fma.rn.ftz.f32 %f201, %f199, %f71, %f196; cvt.rn.f32.s16 %f202, %rs55; sub.ftz.f32 %f203, %f202, %f4; mul.ftz.f32 %f204, %f27, %f203; mov.b32 {%rs146, %rs148}, %r110; // begin inline asm { cvt.f32.f16 %f72, %rs146;} // end inline asm fma.rn.ftz.f32 %f205, %f204, %f72, %f200; mov.b32 {%rs147, %rs149}, %r118; // begin inline asm { cvt.f32.f16 %f73, %rs147;} // end inline asm fma.rn.ftz.f32 %f206, %f204, %f73, %f201; cvt.rn.f32.s16 %f207, %rs56; sub.ftz.f32 %f208, %f207, %f4; mul.ftz.f32 %f209, %f27, %f208; // begin inline asm { cvt.f32.f16 %f74, %rs148;} // end 
inline asm fma.rn.ftz.f32 %f210, %f209, %f74, %f205; // begin inline asm { cvt.f32.f16 %f75, %rs149;} // end inline asm fma.rn.ftz.f32 %f211, %f209, %f75, %f206; ld.global.v4.u32 {%r123, %r124, %r125, %r126}, [%rd5+48]; ld.global.v4.u32 {%r131, %r132, %r133, %r134}, [%rd6+48]; cvt.rn.f32.s16 %f212, %rs58; sub.ftz.f32 %f213, %f212, %f4; mul.ftz.f32 %f214, %f27, %f213; mov.b32 {%rs150, %rs152}, %r123; // begin inline asm { cvt.f32.f16 %f76, %rs150;} // end inline asm fma.rn.ftz.f32 %f215, %f214, %f76, %f210; mov.b32 {%rs151, %rs153}, %r131; // begin inline asm { cvt.f32.f16 %f77, %rs151;} // end inline asm fma.rn.ftz.f32 %f216, %f214, %f77, %f211; cvt.rn.f32.s16 %f217, %rs60; sub.ftz.f32 %f218, %f217, %f4; mul.ftz.f32 %f219, %f27, %f218; // begin inline asm { cvt.f32.f16 %f78, %rs152;} // end inline asm fma.rn.ftz.f32 %f220, %f219, %f78, %f215; // begin inline asm { cvt.f32.f16 %f79, %rs153;} // end inline asm fma.rn.ftz.f32 %f221, %f219, %f79, %f216; cvt.rn.f32.s16 %f222, %rs62; sub.ftz.f32 %f223, %f222, %f4; mul.ftz.f32 %f224, %f27, %f223; mov.b32 {%rs154, %rs156}, %r124; // begin inline asm { cvt.f32.f16 %f80, %rs154;} // end inline asm fma.rn.ftz.f32 %f225, %f224, %f80, %f220; mov.b32 {%rs155, %rs157}, %r132; // begin inline asm { cvt.f32.f16 %f81, %rs155;} // end inline asm fma.rn.ftz.f32 %f226, %f224, %f81, %f221; cvt.rn.f32.s16 %f227, %rs64; sub.ftz.f32 %f228, %f227, %f4; mul.ftz.f32 %f229, %f27, %f228; // begin inline asm { cvt.f32.f16 %f82, %rs156;} // end inline asm fma.rn.ftz.f32 %f230, %f229, %f82, %f225; // begin inline asm { cvt.f32.f16 %f83, %rs157;} // end inline asm fma.rn.ftz.f32 %f231, %f229, %f83, %f226; cvt.rn.f32.s16 %f232, %rs66; sub.ftz.f32 %f233, %f232, %f4; mul.ftz.f32 %f234, %f27, %f233; mov.b32 {%rs158, %rs160}, %r125; // begin inline asm { cvt.f32.f16 %f84, %rs158;} // end inline asm fma.rn.ftz.f32 %f235, %f234, %f84, %f230; mov.b32 {%rs159, %rs161}, %r133; // begin inline asm { cvt.f32.f16 %f85, %rs159;} // end inline asm fma.rn.ftz.f32 
%f236, %f234, %f85, %f231; cvt.rn.f32.s16 %f237, %rs68; sub.ftz.f32 %f238, %f237, %f4; mul.ftz.f32 %f239, %f27, %f238; // begin inline asm { cvt.f32.f16 %f86, %rs160;} // end inline asm fma.rn.ftz.f32 %f240, %f239, %f86, %f235; // begin inline asm { cvt.f32.f16 %f87, %rs161;} // end inline asm fma.rn.ftz.f32 %f241, %f239, %f87, %f236; cvt.rn.f32.s16 %f242, %rs70; sub.ftz.f32 %f243, %f242, %f4; mul.ftz.f32 %f244, %f27, %f243; mov.b32 {%rs162, %rs164}, %r126; // begin inline asm { cvt.f32.f16 %f88, %rs162;} // end inline asm fma.rn.ftz.f32 %f245, %f244, %f88, %f240; mov.b32 {%rs163, %rs165}, %r134; // begin inline asm { cvt.f32.f16 %f89, %rs163;} // end inline asm fma.rn.ftz.f32 %f246, %f244, %f89, %f241; cvt.rn.f32.s16 %f247, %rs71; sub.ftz.f32 %f248, %f247, %f4; mul.ftz.f32 %f249, %f27, %f248; // begin inline asm { cvt.f32.f16 %f90, %rs164;} // end inline asm fma.rn.ftz.f32 %f521, %f249, %f90, %f245; // begin inline asm { cvt.f32.f16 %f91, %rs165;} // end inline asm fma.rn.ftz.f32 %f520, %f249, %f91, %f246; bra.uni $L__BB0_8; $L__BB0_7: shl.b16 %rs231, %rs96, 4; cvt.s16.s8 %rs232, %rs231; shr.s16 %rs233, %rs232, 7; and.b16 %rs234, %rs233, -16; or.b16 %rs235, %rs234, %rs5; cvt.rn.f32.s16 %f314, %rs235; sub.ftz.f32 %f315, %f314, %f4; mul.ftz.f32 %f316, %f27, %f315; // begin inline asm { cvt.f32.f16 %f250, %rs6;} // end inline asm fma.rn.ftz.f32 %f317, %f316, %f250, %f521; // begin inline asm { cvt.f32.f16 %f251, %rs7;} // end inline asm fma.rn.ftz.f32 %f318, %f316, %f251, %f520; cvt.s16.s8 %rs236, %rs96; shr.s16 %rs237, %rs236, 7; and.b16 %rs238, %rs237, -16; or.b16 %rs239, %rs238, %rs8; cvt.rn.f32.s16 %f319, %rs239; sub.ftz.f32 %f320, %f319, %f4; mul.ftz.f32 %f321, %f27, %f320; // begin inline asm { cvt.f32.f16 %f252, %rs9;} // end inline asm fma.rn.ftz.f32 %f322, %f321, %f252, %f317; // begin inline asm { cvt.f32.f16 %f253, %rs10;} // end inline asm fma.rn.ftz.f32 %f323, %f321, %f253, %f318; cvt.s16.s8 %rs241, %rs97; shr.s16 %rs242, %rs241, 7; and.b16 %rs243, 
%rs242, -16; or.b16 %rs244, %rs243, %rs11; cvt.rn.f32.s16 %f324, %rs244; sub.ftz.f32 %f325, %f324, %f4; mul.ftz.f32 %f326, %f27, %f325; // begin inline asm { cvt.f32.f16 %f254, %rs12;} // end inline asm fma.rn.ftz.f32 %f327, %f326, %f254, %f322; // begin inline asm { cvt.f32.f16 %f255, %rs13;} // end inline asm fma.rn.ftz.f32 %f328, %f326, %f255, %f323; shr.s16 %rs245, %rs96, 15; and.b16 %rs246, %rs245, -16; or.b16 %rs247, %rs246, %rs14; cvt.rn.f32.s16 %f329, %rs247; sub.ftz.f32 %f330, %f329, %f4; mul.ftz.f32 %f331, %f27, %f330; // begin inline asm { cvt.f32.f16 %f256, %rs15;} // end inline asm fma.rn.ftz.f32 %f332, %f331, %f256, %f327; // begin inline asm { cvt.f32.f16 %f257, %rs16;} // end inline asm fma.rn.ftz.f32 %f333, %f331, %f257, %f328; shl.b16 %rs249, %rs99, 4; cvt.s16.s8 %rs250, %rs249; shr.s16 %rs251, %rs250, 7; and.b16 %rs252, %rs251, -16; or.b16 %rs253, %rs252, %rs17; cvt.rn.f32.s16 %f334, %rs253; sub.ftz.f32 %f335, %f334, %f4; mul.ftz.f32 %f336, %f27, %f335; // begin inline asm { cvt.f32.f16 %f258, %rs18;} // end inline asm fma.rn.ftz.f32 %f337, %f336, %f258, %f332; // begin inline asm { cvt.f32.f16 %f259, %rs20;} // end inline asm fma.rn.ftz.f32 %f338, %f336, %f259, %f333; shl.b16 %rs254, %rs22, 4; cvt.s16.s8 %rs255, %rs254; shr.s16 %rs256, %rs255, 7; and.b16 %rs257, %rs256, -16; or.b16 %rs258, %rs257, %rs23; cvt.rn.f32.s16 %f339, %rs258; sub.ftz.f32 %f340, %f339, %f4; mul.ftz.f32 %f341, %f27, %f340; // begin inline asm { cvt.f32.f16 %f260, %rs24;} // end inline asm fma.rn.ftz.f32 %f342, %f341, %f260, %f337; // begin inline asm { cvt.f32.f16 %f261, %rs25;} // end inline asm fma.rn.ftz.f32 %f343, %f341, %f261, %f338; shl.b16 %rs259, %rs26, 4; cvt.s16.s8 %rs260, %rs259; shr.s16 %rs261, %rs260, 7; and.b16 %rs262, %rs261, -16; or.b16 %rs263, %rs262, %rs27; cvt.rn.f32.s16 %f344, %rs263; sub.ftz.f32 %f345, %f344, %f4; mul.ftz.f32 %f346, %f27, %f345; mov.b32 {%rs178, %rs180}, %r61; // begin inline asm { cvt.f32.f16 %f262, %rs178;} // end inline asm 
fma.rn.ftz.f32 %f347, %f346, %f262, %f342; mov.b32 {%rs179, %rs181}, %r65; // begin inline asm { cvt.f32.f16 %f263, %rs179;} // end inline asm fma.rn.ftz.f32 %f348, %f346, %f263, %f343; shl.b16 %rs264, %rs28, 4; cvt.s16.s8 %rs265, %rs264; shr.s16 %rs266, %rs265, 7; and.b16 %rs267, %rs266, -16; or.b16 %rs268, %rs267, %rs28; cvt.rn.f32.s16 %f349, %rs268; sub.ftz.f32 %f350, %f349, %f4; mul.ftz.f32 %f351, %f27, %f350; // begin inline asm { cvt.f32.f16 %f264, %rs180;} // end inline asm fma.rn.ftz.f32 %f352, %f351, %f264, %f347; // begin inline asm { cvt.f32.f16 %f265, %rs181;} // end inline asm fma.rn.ftz.f32 %f353, %f351, %f265, %f348; ld.global.v4.u32 {%r140, %r141, %r142, %r143}, [%rd5+16]; ld.global.v4.u32 {%r148, %r149, %r150, %r151}, [%rd6+16]; shl.b16 %rs270, %rs100, 4; cvt.s16.s8 %rs271, %rs270; shr.s16 %rs272, %rs271, 7; and.b16 %rs273, %rs272, -16; or.b16 %rs274, %rs273, %rs29; cvt.rn.f32.s16 %f354, %rs274; sub.ftz.f32 %f355, %f354, %f4; mul.ftz.f32 %f356, %f27, %f355; mov.b32 {%rs182, %rs184}, %r140; // begin inline asm { cvt.f32.f16 %f266, %rs182;} // end inline asm fma.rn.ftz.f32 %f357, %f356, %f266, %f352; mov.b32 {%rs183, %rs185}, %r148; // begin inline asm { cvt.f32.f16 %f267, %rs183;} // end inline asm fma.rn.ftz.f32 %f358, %f356, %f267, %f353; shl.b16 %rs275, %rs30, 4; cvt.s16.s8 %rs276, %rs275; shr.s16 %rs277, %rs276, 7; and.b16 %rs278, %rs277, -16; or.b16 %rs279, %rs278, %rs31; cvt.rn.f32.s16 %f359, %rs279; sub.ftz.f32 %f360, %f359, %f4; mul.ftz.f32 %f361, %f27, %f360; // begin inline asm { cvt.f32.f16 %f268, %rs184;} // end inline asm fma.rn.ftz.f32 %f362, %f361, %f268, %f357; // begin inline asm { cvt.f32.f16 %f269, %rs185;} // end inline asm fma.rn.ftz.f32 %f363, %f361, %f269, %f358; shl.b16 %rs280, %rs32, 4; cvt.s16.s8 %rs281, %rs280; shr.s16 %rs282, %rs281, 7; and.b16 %rs283, %rs282, -16; or.b16 %rs284, %rs283, %rs33; cvt.rn.f32.s16 %f364, %rs284; sub.ftz.f32 %f365, %f364, %f4; mul.ftz.f32 %f366, %f27, %f365; mov.b32 {%rs186, %rs188}, %r141; // 
begin inline asm { cvt.f32.f16 %f270, %rs186;} // end inline asm fma.rn.ftz.f32 %f367, %f366, %f270, %f362; mov.b32 {%rs187, %rs189}, %r149; // begin inline asm { cvt.f32.f16 %f271, %rs187;} // end inline asm fma.rn.ftz.f32 %f368, %f366, %f271, %f363; shl.b16 %rs285, %rs34, 4; cvt.s16.s8 %rs286, %rs285; shr.s16 %rs287, %rs286, 7; and.b16 %rs288, %rs287, -16; or.b16 %rs289, %rs288, %rs35; cvt.rn.f32.s16 %f369, %rs289; sub.ftz.f32 %f370, %f369, %f4; mul.ftz.f32 %f371, %f27, %f370; // begin inline asm { cvt.f32.f16 %f272, %rs188;} // end inline asm fma.rn.ftz.f32 %f372, %f371, %f272, %f367; // begin inline asm { cvt.f32.f16 %f273, %rs189;} // end inline asm fma.rn.ftz.f32 %f373, %f371, %f273, %f368; shl.b16 %rs290, %rs36, 4; cvt.s16.s8 %rs291, %rs290; shr.s16 %rs292, %rs291, 7; and.b16 %rs293, %rs292, -16; or.b16 %rs294, %rs293, %rs37; cvt.rn.f32.s16 %f374, %rs294; sub.ftz.f32 %f375, %f374, %f4; mul.ftz.f32 %f376, %f27, %f375; mov.b32 {%rs190, %rs192}, %r142; // begin inline asm { cvt.f32.f16 %f274, %rs190;} // end inline asm fma.rn.ftz.f32 %f377, %f376, %f274, %f372; mov.b32 {%rs191, %rs193}, %r150; // begin inline asm { cvt.f32.f16 %f275, %rs191;} // end inline asm fma.rn.ftz.f32 %f378, %f376, %f275, %f373; shl.b16 %rs295, %rs38, 4; cvt.s16.s8 %rs296, %rs295; shr.s16 %rs297, %rs296, 7; and.b16 %rs298, %rs297, -16; or.b16 %rs299, %rs298, %rs39; cvt.rn.f32.s16 %f379, %rs299; sub.ftz.f32 %f380, %f379, %f4; mul.ftz.f32 %f381, %f27, %f380; // begin inline asm { cvt.f32.f16 %f276, %rs192;} // end inline asm fma.rn.ftz.f32 %f382, %f381, %f276, %f377; // begin inline asm { cvt.f32.f16 %f277, %rs193;} // end inline asm fma.rn.ftz.f32 %f383, %f381, %f277, %f378; shl.b16 %rs300, %rs40, 4; cvt.s16.s8 %rs301, %rs300; shr.s16 %rs302, %rs301, 7; and.b16 %rs303, %rs302, -16; or.b16 %rs304, %rs303, %rs41; cvt.rn.f32.s16 %f384, %rs304; sub.ftz.f32 %f385, %f384, %f4; mul.ftz.f32 %f386, %f27, %f385; mov.b32 {%rs194, %rs196}, %r143; // begin inline asm { cvt.f32.f16 %f278, %rs194;} // 
end inline asm fma.rn.ftz.f32 %f387, %f386, %f278, %f382; mov.b32 {%rs195, %rs197}, %r151; // begin inline asm { cvt.f32.f16 %f279, %rs195;} // end inline asm fma.rn.ftz.f32 %f388, %f386, %f279, %f383; shl.b16 %rs305, %rs42, 4; cvt.s16.s8 %rs306, %rs305; shr.s16 %rs307, %rs306, 7; and.b16 %rs308, %rs307, -16; or.b16 %rs309, %rs308, %rs42; cvt.rn.f32.s16 %f389, %rs309; sub.ftz.f32 %f390, %f389, %f4; mul.ftz.f32 %f391, %f27, %f390; // begin inline asm { cvt.f32.f16 %f280, %rs196;} // end inline asm fma.rn.ftz.f32 %f392, %f391, %f280, %f387; // begin inline asm { cvt.f32.f16 %f281, %rs197;} // end inline asm fma.rn.ftz.f32 %f393, %f391, %f281, %f388; ld.global.v4.u32 {%r156, %r157, %r158, %r159}, [%rd5+32]; ld.global.v4.u32 {%r164, %r165, %r166, %r167}, [%rd6+32]; shl.b16 %rs311, %rs101, 4; cvt.s16.s8 %rs312, %rs311; shr.s16 %rs313, %rs312, 7; and.b16 %rs314, %rs313, -16; or.b16 %rs315, %rs314, %rs43; cvt.rn.f32.s16 %f394, %rs315; sub.ftz.f32 %f395, %f394, %f4; mul.ftz.f32 %f396, %f27, %f395; mov.b32 {%rs198, %rs200}, %r156; // begin inline asm { cvt.f32.f16 %f282, %rs198;} // end inline asm fma.rn.ftz.f32 %f397, %f396, %f282, %f392; mov.b32 {%rs199, %rs201}, %r164; // begin inline asm { cvt.f32.f16 %f283, %rs199;} // end inline asm fma.rn.ftz.f32 %f398, %f396, %f283, %f393; shl.b16 %rs316, %rs44, 4; cvt.s16.s8 %rs317, %rs316; shr.s16 %rs318, %rs317, 7; and.b16 %rs319, %rs318, -16; or.b16 %rs320, %rs319, %rs45; cvt.rn.f32.s16 %f399, %rs320; sub.ftz.f32 %f400, %f399, %f4; mul.ftz.f32 %f401, %f27, %f400; // begin inline asm { cvt.f32.f16 %f284, %rs200;} // end inline asm fma.rn.ftz.f32 %f402, %f401, %f284, %f397; // begin inline asm { cvt.f32.f16 %f285, %rs201;} // end inline asm fma.rn.ftz.f32 %f403, %f401, %f285, %f398; shl.b16 %rs321, %rs46, 4; cvt.s16.s8 %rs322, %rs321; shr.s16 %rs323, %rs322, 7; and.b16 %rs324, %rs323, -16; or.b16 %rs325, %rs324, %rs47; cvt.rn.f32.s16 %f404, %rs325; sub.ftz.f32 %f405, %f404, %f4; mul.ftz.f32 %f406, %f27, %f405; mov.b32 {%rs202, 
%rs204}, %r157; // begin inline asm { cvt.f32.f16 %f286, %rs202;} // end inline asm fma.rn.ftz.f32 %f407, %f406, %f286, %f402; mov.b32 {%rs203, %rs205}, %r165; // begin inline asm { cvt.f32.f16 %f287, %rs203;} // end inline asm fma.rn.ftz.f32 %f408, %f406, %f287, %f403; shl.b16 %rs326, %rs48, 4; cvt.s16.s8 %rs327, %rs326; shr.s16 %rs328, %rs327, 7; and.b16 %rs329, %rs328, -16; or.b16 %rs330, %rs329, %rs49; cvt.rn.f32.s16 %f409, %rs330; sub.ftz.f32 %f410, %f409, %f4; mul.ftz.f32 %f411, %f27, %f410; // begin inline asm { cvt.f32.f16 %f288, %rs204;} // end inline asm fma.rn.ftz.f32 %f412, %f411, %f288, %f407; // begin inline asm { cvt.f32.f16 %f289, %rs205;} // end inline asm fma.rn.ftz.f32 %f413, %f411, %f289, %f408; shl.b16 %rs331, %rs50, 4; cvt.s16.s8 %rs332, %rs331; shr.s16 %rs333, %rs332, 7; and.b16 %rs334, %rs333, -16; or.b16 %rs335, %rs334, %rs51; cvt.rn.f32.s16 %f414, %rs335; sub.ftz.f32 %f415, %f414, %f4; mul.ftz.f32 %f416, %f27, %f415; mov.b32 {%rs206, %rs208}, %r158; // begin inline asm { cvt.f32.f16 %f290, %rs206;} // end inline asm fma.rn.ftz.f32 %f417, %f416, %f290, %f412; mov.b32 {%rs207, %rs209}, %r166; // begin inline asm { cvt.f32.f16 %f291, %rs207;} // end inline asm fma.rn.ftz.f32 %f418, %f416, %f291, %f413; shl.b16 %rs336, %rs52, 4; cvt.s16.s8 %rs337, %rs336; shr.s16 %rs338, %rs337, 7; and.b16 %rs339, %rs338, -16; or.b16 %rs340, %rs339, %rs53; cvt.rn.f32.s16 %f419, %rs340; sub.ftz.f32 %f420, %f419, %f4; mul.ftz.f32 %f421, %f27, %f420; // begin inline asm { cvt.f32.f16 %f292, %rs208;} // end inline asm fma.rn.ftz.f32 %f422, %f421, %f292, %f417; // begin inline asm { cvt.f32.f16 %f293, %rs209;} // end inline asm fma.rn.ftz.f32 %f423, %f421, %f293, %f418; shl.b16 %rs341, %rs54, 4; cvt.s16.s8 %rs342, %rs341; shr.s16 %rs343, %rs342, 7; and.b16 %rs344, %rs343, -16; or.b16 %rs345, %rs344, %rs55; cvt.rn.f32.s16 %f424, %rs345; sub.ftz.f32 %f425, %f424, %f4; mul.ftz.f32 %f426, %f27, %f425; mov.b32 {%rs210, %rs212}, %r159; // begin inline asm { cvt.f32.f16 
%f294, %rs210;} // end inline asm fma.rn.ftz.f32 %f427, %f426, %f294, %f422; mov.b32 {%rs211, %rs213}, %r167; // begin inline asm { cvt.f32.f16 %f295, %rs211;} // end inline asm fma.rn.ftz.f32 %f428, %f426, %f295, %f423; shl.b16 %rs346, %rs56, 4; cvt.s16.s8 %rs347, %rs346; shr.s16 %rs348, %rs347, 7; and.b16 %rs349, %rs348, -16; or.b16 %rs350, %rs349, %rs56; cvt.rn.f32.s16 %f429, %rs350; sub.ftz.f32 %f430, %f429, %f4; mul.ftz.f32 %f431, %f27, %f430; // begin inline asm { cvt.f32.f16 %f296, %rs212;} // end inline asm fma.rn.ftz.f32 %f432, %f431, %f296, %f427; // begin inline asm { cvt.f32.f16 %f297, %rs213;} // end inline asm fma.rn.ftz.f32 %f433, %f431, %f297, %f428; ld.global.v4.u32 {%r172, %r173, %r174, %r175}, [%rd5+48]; ld.global.v4.u32 {%r180, %r181, %r182, %r183}, [%rd6+48]; shl.b16 %rs351, %rs57, 4; cvt.s16.s8 %rs352, %rs351; shr.s16 %rs353, %rs352, 7; and.b16 %rs354, %rs353, -16; or.b16 %rs355, %rs354, %rs58; cvt.rn.f32.s16 %f434, %rs355; sub.ftz.f32 %f435, %f434, %f4; mul.ftz.f32 %f436, %f27, %f435; mov.b32 {%rs214, %rs216}, %r172; // begin inline asm { cvt.f32.f16 %f298, %rs214;} // end inline asm fma.rn.ftz.f32 %f437, %f436, %f298, %f432; mov.b32 {%rs215, %rs217}, %r180; // begin inline asm { cvt.f32.f16 %f299, %rs215;} // end inline asm fma.rn.ftz.f32 %f438, %f436, %f299, %f433; shl.b16 %rs356, %rs59, 4; cvt.s16.s8 %rs357, %rs356; shr.s16 %rs358, %rs357, 7; and.b16 %rs359, %rs358, -16; or.b16 %rs360, %rs359, %rs60; cvt.rn.f32.s16 %f439, %rs360; sub.ftz.f32 %f440, %f439, %f4; mul.ftz.f32 %f441, %f27, %f440; // begin inline asm { cvt.f32.f16 %f300, %rs216;} // end inline asm fma.rn.ftz.f32 %f442, %f441, %f300, %f437; // begin inline asm { cvt.f32.f16 %f301, %rs217;} // end inline asm fma.rn.ftz.f32 %f443, %f441, %f301, %f438; shl.b16 %rs361, %rs61, 4; cvt.s16.s8 %rs362, %rs361; shr.s16 %rs363, %rs362, 7; and.b16 %rs364, %rs363, -16; or.b16 %rs365, %rs364, %rs62; cvt.rn.f32.s16 %f444, %rs365; sub.ftz.f32 %f445, %f444, %f4; mul.ftz.f32 %f446, %f27, %f445; 
mov.b32 {%rs218, %rs220}, %r173; // begin inline asm { cvt.f32.f16 %f302, %rs218;} // end inline asm fma.rn.ftz.f32 %f447, %f446, %f302, %f442; mov.b32 {%rs219, %rs221}, %r181; // begin inline asm { cvt.f32.f16 %f303, %rs219;} // end inline asm fma.rn.ftz.f32 %f448, %f446, %f303, %f443; shl.b16 %rs366, %rs63, 4; cvt.s16.s8 %rs367, %rs366; shr.s16 %rs368, %rs367, 7; and.b16 %rs369, %rs368, -16; or.b16 %rs370, %rs369, %rs64; cvt.rn.f32.s16 %f449, %rs370; sub.ftz.f32 %f450, %f449, %f4; mul.ftz.f32 %f451, %f27, %f450; // begin inline asm { cvt.f32.f16 %f304, %rs220;} // end inline asm fma.rn.ftz.f32 %f452, %f451, %f304, %f447; // begin inline asm { cvt.f32.f16 %f305, %rs221;} // end inline asm fma.rn.ftz.f32 %f453, %f451, %f305, %f448; shl.b16 %rs371, %rs65, 4; cvt.s16.s8 %rs372, %rs371; shr.s16 %rs373, %rs372, 7; and.b16 %rs374, %rs373, -16; or.b16 %rs375, %rs374, %rs66; cvt.rn.f32.s16 %f454, %rs375; sub.ftz.f32 %f455, %f454, %f4; mul.ftz.f32 %f456, %f27, %f455; mov.b32 {%rs222, %rs224}, %r174; // begin inline asm { cvt.f32.f16 %f306, %rs222;} // end inline asm fma.rn.ftz.f32 %f457, %f456, %f306, %f452; mov.b32 {%rs223, %rs225}, %r182; // begin inline asm { cvt.f32.f16 %f307, %rs223;} // end inline asm fma.rn.ftz.f32 %f458, %f456, %f307, %f453; shl.b16 %rs376, %rs67, 4; cvt.s16.s8 %rs377, %rs376; shr.s16 %rs378, %rs377, 7; and.b16 %rs379, %rs378, -16; or.b16 %rs380, %rs379, %rs68; cvt.rn.f32.s16 %f459, %rs380; sub.ftz.f32 %f460, %f459, %f4; mul.ftz.f32 %f461, %f27, %f460; // begin inline asm { cvt.f32.f16 %f308, %rs224;} // end inline asm fma.rn.ftz.f32 %f462, %f461, %f308, %f457; // begin inline asm { cvt.f32.f16 %f309, %rs225;} // end inline asm fma.rn.ftz.f32 %f463, %f461, %f309, %f458; shl.b16 %rs381, %rs69, 4; cvt.s16.s8 %rs382, %rs381; shr.s16 %rs383, %rs382, 7; and.b16 %rs384, %rs383, -16; or.b16 %rs385, %rs384, %rs70; cvt.rn.f32.s16 %f464, %rs385; sub.ftz.f32 %f465, %f464, %f4; mul.ftz.f32 %f466, %f27, %f465; mov.b32 {%rs226, %rs228}, %r175; // begin inline 
asm { cvt.f32.f16 %f310, %rs226;} // end inline asm fma.rn.ftz.f32 %f467, %f466, %f310, %f462; mov.b32 {%rs227, %rs229}, %r183; // begin inline asm { cvt.f32.f16 %f311, %rs227;} // end inline asm fma.rn.ftz.f32 %f468, %f466, %f311, %f463; shl.b16 %rs386, %rs71, 4; cvt.s16.s8 %rs387, %rs386; shr.s16 %rs388, %rs387, 7; and.b16 %rs389, %rs388, -16; or.b16 %rs390, %rs389, %rs71; cvt.rn.f32.s16 %f469, %rs390; sub.ftz.f32 %f470, %f469, %f4; mul.ftz.f32 %f471, %f27, %f470; // begin inline asm { cvt.f32.f16 %f312, %rs228;} // end inline asm fma.rn.ftz.f32 %f521, %f471, %f312, %f467; // begin inline asm { cvt.f32.f16 %f313, %rs229;} // end inline asm fma.rn.ftz.f32 %f520, %f471, %f313, %f468; $L__BB0_8: add.s32 %r249, %r249, 4; shl.b32 %r188, %r249, 5; add.s32 %r248, %r188, %r40; shl.b32 %r247, %r248, 2; setp.lt.u32 %p7, %r247, %r37; @%p7 bra $L__BB0_2; $L__BB0_9: mov.u32 %r190, %tid.y; shl.b32 %r191, %r190, 5; add.s32 %r28, %r191, %r40; setp.lt.u32 %p8, %r28, 32; shl.b32 %r193, %r28, 2; mov.u32 %r194, _ZZ9gemv_int4ILi4ELi128ELi2EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage; add.s32 %r195, %r194, %r193; @%p8 bra $L__BB0_11; add.s32 %r245, %r195, -112; st.shared.f32 [%r245], %f521; $L__BB0_11: setp.gt.u32 %p9, %r28, 31; bar.sync 0; mad.lo.s32 %r30, %r28, 12, %r194; @%p9 bra $L__BB0_13; mov.u32 %r214, 16; ld.shared.f32 %f487, [%r30+16]; add.ftz.f32 %f488, %f521, %f487; ld.shared.f32 %f489, [%r30+20]; add.ftz.f32 %f490, %f488, %f489; ld.shared.f32 %f491, [%r30+24]; add.ftz.f32 %f474, %f490, %f491; mov.u32 %r202, 1; mov.u32 %r215, 31; mov.u32 %r216, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f474, %r202, %r215, %r216; @p add.f32 r0, r0, %f474; mov.f32 %f472, r0;} // end inline asm mov.u32 %r205, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f472, %r205, %r215, %r216; @p add.f32 r0, r0, %f472; mov.f32 %f475, r0;} // end inline asm mov.u32 %r208, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; 
shfl.sync.down.b32 r0|p, %f475, %r208, %r215, %r216; @p add.f32 r0, r0, %f475; mov.f32 %f478, r0;} // end inline asm mov.u32 %r211, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f478, %r211, %r215, %r216; @p add.f32 r0, r0, %f478; mov.f32 %f481, r0;} // end inline asm // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f481, %r214, %r215, %r216; @p add.f32 r0, r0, %f481; mov.f32 %f521, r0;} // end inline asm $L__BB0_13: @%p8 bra $L__BB0_15; add.s32 %r246, %r195, -112; st.shared.f32 [%r246+640], %f520; $L__BB0_15: bar.sync 0; @%p9 bra $L__BB0_17; ld.shared.f32 %f507, [%r30+656]; add.ftz.f32 %f508, %f520, %f507; ld.shared.f32 %f509, [%r30+660]; add.ftz.f32 %f510, %f508, %f509; ld.shared.f32 %f511, [%r30+664]; add.ftz.f32 %f494, %f510, %f511; mov.u32 %r226, 1; mov.u32 %r239, 31; mov.u32 %r240, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f494, %r226, %r239, %r240; @p add.f32 r0, r0, %f494; mov.f32 %f492, r0;} // end inline asm mov.u32 %r229, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f492, %r229, %r239, %r240; @p add.f32 r0, r0, %f492; mov.f32 %f495, r0;} // end inline asm mov.u32 %r232, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f495, %r232, %r239, %r240; @p add.f32 r0, r0, %f495; mov.f32 %f498, r0;} // end inline asm mov.u32 %r235, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f498, %r235, %r239, %r240; @p add.f32 r0, r0, %f498; mov.f32 %f501, r0;} // end inline asm mov.u32 %r238, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f501, %r238, %r239, %r240; @p add.f32 r0, r0, %f501; mov.f32 %f520, r0;} // end inline asm $L__BB0_17: or.b32 %r243, %r40, %r190; setp.ne.s32 %p12, %r243, 0; @%p12 bra $L__BB0_21; ld.param.u64 %rd36, [_Z28dequant_gemv_group128_batch223DequantGemvKernelParams_param_0+8]; ld.param.u64 %rd35, 
[_Z28dequant_gemv_group128_batch223DequantGemvKernelParams_param_0];    // address operand of the `ld.param.u64 %rd35,` begun just before: parameter bytes 0..7

    // ------------------------------------------------------------------
    // Epilogue of _Z28dequant_gemv_group128_batch223DequantGemvKernelParams.
    //
    // Layout note: a PTX `//` comment runs to the end of the physical
    // line, so every statement below is kept on its own line.
    //
    // Only the thread with tid.x == 0 and tid.y == 0 executes this part;
    // all other threads were branched to $L__BB0_21 immediately before.
    //
    // Inputs live at this point:
    //   %f521, %f520  the two reduced accumulators
    //   %f23,  %f24   the float pair read from parameter offset 64
    //   %r36          second u32 of the pair read from parameter offset 48
    //                 (used here as the element distance between the two
    //                 outputs; presumably the output row length, TODO confirm)
    //   %rd35         pointer at parameter offset 0 (store destination)
    //   %rd36         pointer at parameter offset 8 (optional fp16 input;
    //                 presumably a bias/residual vector, TODO confirm)
    // ------------------------------------------------------------------
    setp.eq.s64             %p13, %rd36, 0;             // p13 = pointer at param+8 is null
    mul.ftz.f32             %f17, %f23, %f521;          // scaled result 0
    mov.u32                 %r244, %ctaid.x;
    cvt.s64.s32             %rd8, %r244;                // block index widened to 64 bits
    cvta.to.global.u64      %rd27, %rd35;               // destination pointer as a global address
    mul.wide.s32            %rd28, %r244, 2;            // 2 bytes per 16-bit element
    add.s64                 %rd9, %rd27, %rd28;         // &dst[ctaid.x]
    mul.ftz.f32             %f18, %f23, %f520;          // scaled result 1
    cvt.s64.s32             %rd10, %r36;
    mul.wide.s32            %rd29, %r36, 2;
    add.s64                 %rd11, %rd9, %rd29;         // &dst[ctaid.x + %r36]
    @%p13 bra               $L__BB0_20;                 // null second pointer: store the scaled results directly

    // Non-null second pointer: dst = %f24 * src + scaled result, for both outputs.
    ld.param.u64            %rd37, [_Z28dequant_gemv_group128_batch223DequantGemvKernelParams_param_0+8];   // same parameter as %rd36, re-read
    cvta.to.global.u64      %rd30, %rd37;
    shl.b64                 %rd31, %rd8, 1;             // ctaid.x * 2 bytes
    add.s64                 %rd32, %rd30, %rd31;        // &src[ctaid.x]
    ld.global.u16           %rs391, [%rd32];
    // begin inline asm
    { cvt.f32.f16 %f512, %rs391;}
    // end inline asm
    fma.rn.ftz.f32          %f513, %f24, %f512, %f17;
    // begin inline asm
    { cvt.rn.f16.f32 %rs392, %f513;}
    // end inline asm
    st.global.u16           [%rd9], %rs392;
    shl.b64                 %rd33, %rd10, 1;            // %r36 * 2 bytes
    add.s64                 %rd34, %rd32, %rd33;        // &src[ctaid.x + %r36]
    ld.global.u16           %rs393, [%rd34];
    // begin inline asm
    { cvt.f32.f16 %f514, %rs393;}
    // end inline asm
    fma.rn.ftz.f32          %f515, %f24, %f514, %f18;
    // begin inline asm
    { cvt.rn.f16.f32 %rs394, %f515;}
    // end inline asm
    st.global.u16           [%rd11], %rs394;
    bra.uni                 $L__BB0_21;

$L__BB0_20:
    // Null second pointer: round the two scaled results to fp16 and store them.
    // begin inline asm
    { cvt.rn.f16.f32 %rs395, %f17;}
    // end inline asm
    st.global.u16           [%rd9], %rs395;
    // begin inline asm
    { cvt.rn.f16.f32 %rs396, %f18;}
    // end inline asm
    st.global.u16           [%rd11], %rs396;

$L__BB0_21:
    ret;

}   // end of _Z28dequant_gemv_group128_batch223DequantGemvKernelParams

// ----------------------------------------------------------------------
// _ZN3cub11EmptyKernelIvEEvv  (demangled: void cub::EmptyKernel<void>())
//
// Entry point that takes no parameters and returns immediately; it reads
// and writes nothing.
// NOTE(review): presumably a CUB utility stub pulled in by the CUB headers;
// confirm against the CUB sources before removing or renaming it.
// ----------------------------------------------------------------------
    // .globl _ZN3cub11EmptyKernelIvEEvv
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
    ret;
}