.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_c74955c76thrust12placeholders2_3E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_c74955c76thrust12placeholders2_4E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_c74955c76thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_c74955c76thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_c74955c76thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_c74955c76thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_c74955c76thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_c74955c76thrust12placeholders3_10E[1];

// dequant_gemv_group128_batch2(DequantGemvKernelParams): one 80-byte params
// struct passed by value. See the CUDA reconstruction sketches after the listing.
.visible .entry _Z28dequant_gemv_group128_batch223DequantGemvKernelParams(
    .param .align 8 .b8 _Z28dequant_gemv_group128_batch223DequantGemvKernelParams_param_0[80]
)
{
    .reg .pred %p<15>;
    .reg .b16 %rs<400>;
    .reg .f32 %f<532>;
    .reg .b32 %r<256>;
    .reg .b64 %rd<49>;
    // demoted variable
    .shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi128ELi2EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[1280];

    // unpack the params struct (offsets +16..+75)
    ld.param.v2.u32 {%r36, %r37}, [_Z28dequant_gemv_group128_batch223DequantGemvKernelParams_param_0+48];
    ld.param.v2.u32 {%r38, %r39}, [_Z28dequant_gemv_group128_batch223DequantGemvKernelParams_param_0+56];
    ld.param.v2.f32 {%f29, %f30}, [_Z28dequant_gemv_group128_batch223DequantGemvKernelParams_param_0+64];
    ld.param.v4.u8 {%rs84, %rs85, %rs86, %rs87}, [_Z28dequant_gemv_group128_batch223DequantGemvKernelParams_param_0+72];
    ld.param.u64 %rd16, [_Z28dequant_gemv_group128_batch223DequantGemvKernelParams_param_0+40];
    ld.param.u64 %rd15, [_Z28dequant_gemv_group128_batch223DequantGemvKernelParams_param_0+32];
    ld.param.u64 %rd14, [_Z28dequant_gemv_group128_batch223DequantGemvKernelParams_param_0+24];
    ld.param.u64 %rd13, [_Z28dequant_gemv_group128_batch223DequantGemvKernelParams_param_0+16];
    mov.u32 %r255, %tid.y;
    shl.b32 %r40, %r255, 5;
    mov.u32 %r41, %tid.x;
    add.s32 %r254, %r40, %r41;
    shl.b32 %r253, %r254, 2;
    setp.ge.u32 %p1, %r253, %r38;
    mov.f32 %f524, 0f00000000;
    mov.f32 %f525, %f524;
    @%p1 bra $L__BB0_9;

    mov.u32 %r42, %ctaid.x;
    mul.lo.s32 %r6, %r39, %r42;
    cvta.to.global.u64 %rd6, %rd13;

$L__BB0_2:
    // stream 4 u32 of packed int4 weights (L1 bypass, L2 evict_first policy)
    mad.lo.s32 %r48, %r38, %r42, %r253;
    mul.wide.u32 %rd23, %r48, 4;
    add.s64 %rd18, %rd14, %rd23;
    // begin inline asm
    createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd17, 1.0;
    // end inline asm
    // begin inline asm
    ld.global.L1::no_allocate.L2::cache_hint.v4.u32 {%r43,%r44,%r45,%r46}, [%rd18], %rd17;
    // end inline asm
    // fetch the fp16 scale for this 128-weight group
    shr.u32 %r50, %r41, 2;
    shl.b32 %r51, %r255, 3;
    add.s32 %r14, %r51, %r50;
    add.s32 %r15, %r14, %r6;
    mul.wide.s32 %rd24, %r15, 2;
    add.s64 %rd21, %rd16, %rd24;
    // begin inline asm
    createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd20, 1.0;
    // end inline asm
    // begin inline asm
    ld.global.L1::evict_last.L2::cache_hint.u16 %rs92, [%rd21], %rd20;
    // end inline asm
    // begin inline asm
    { cvt.f32.f16 %f33, %rs92;}
    // end inline asm
    shl.b16 %rs399, %rs84, 3;
    setp.eq.s64 %p2, %rd15, 0;
    @%p2 bra $L__BB0_4;

    // optional zero points: two 4-bit values per byte
    shr.u32 %r52, %r15, 31;
    add.s32 %r53, %r15, %r52;
    shr.s32 %r54, %r53, 1;
    cvt.s64.s32 %rd28, %r54;
    add.s64 %rd26, %rd15, %rd28;
    // begin inline asm
    createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd25, 1.0;
    // end inline asm
    // begin inline asm
    ld.global.L1::evict_last.L2::cache_hint.u8 %rs94, [%rd26], %rd25;
    // end inline asm
    cvt.u32.u16 %r55, %rs94;
    and.b32 %r56, %r55, 255;
    shl.b32 %r57, %r14, 2;
    and.b32 %r58, %r57, 4;
    shr.u32 %r59, %r56, %r58;
    cvt.u16.u32 %rs95, %r59;
    and.b16
%rs399, %rs95, 15; $L__BB0_4: shl.b32 %r16, %r254, 5; setp.ge.s32 %p3, %r16, %r36; @%p3 bra $L__BB0_8; setp.eq.s16 %p4, %rs84, 0; shr.u16 %rs97, %rs399, 3; and.b16 %rs98, %rs97, 1; setp.eq.b16 %p5, %rs98, 1; and.pred %p6, %p4, %p5; selp.b16 %rs99, -16, 0, %p6; or.b16 %rs100, %rs99, %rs399; cvt.s16.s8 %rs101, %rs100; cvt.rn.f32.s16 %f4, %rs101; mul.wide.s32 %rd29, %r16, 2; add.s64 %rd7, %rd6, %rd29; ld.global.v4.u32 {%r60, %r61, %r62, %r63}, [%rd7]; mul.wide.s32 %rd30, %r36, 2; add.s64 %rd8, %rd7, %rd30; ld.global.v4.u32 {%r64, %r65, %r66, %r67}, [%rd8]; cvt.u16.u32 %rs5, %r43; and.b16 %rs6, %rs5, 15; mov.b32 {%rs9, %rs8}, %r60; mov.b32 {%rs12, %rs11}, %r64; shr.u32 %r68, %r43, 4; cvt.u16.u32 %rs13, %r68; and.b16 %rs14, %rs13, 15; shr.u32 %r69, %r43, 8; cvt.u16.u32 %rs15, %r69; and.b16 %rs16, %rs15, 15; shr.u32 %r70, %r43, 12; cvt.u16.u32 %rs17, %r70; and.b16 %rs18, %rs17, 15; mov.b32 {%rs102, %rs19}, %r65; shr.u32 %r71, %r43, 16; cvt.u16.u32 %rs20, %r71; and.b16 %rs21, %rs20, 15; mov.b32 {%rs22, %rs26}, %r62; mov.b32 {%rs23, %rs27}, %r66; shr.u32 %r72, %r43, 20; cvt.u16.u32 %rs24, %r72; and.b16 %rs25, %rs24, 15; shr.u32 %r73, %r43, 24; cvt.u16.u32 %rs28, %r73; and.b16 %rs29, %rs28, 15; shr.u32 %r74, %r43, 28; cvt.u16.u32 %rs30, %r74; cvt.u16.u32 %rs31, %r44; and.b16 %rs32, %rs31, 15; shr.u32 %r75, %r44, 4; cvt.u16.u32 %rs33, %r75; and.b16 %rs34, %rs33, 15; shr.u32 %r76, %r44, 8; cvt.u16.u32 %rs35, %r76; and.b16 %rs36, %rs35, 15; shr.u32 %r77, %r44, 12; cvt.u16.u32 %rs37, %r77; and.b16 %rs38, %rs37, 15; shr.u32 %r78, %r44, 16; cvt.u16.u32 %rs39, %r78; and.b16 %rs40, %rs39, 15; shr.u32 %r79, %r44, 20; cvt.u16.u32 %rs41, %r79; and.b16 %rs42, %rs41, 15; shr.u32 %r80, %r44, 24; cvt.u16.u32 %rs43, %r80; and.b16 %rs44, %rs43, 15; shr.u32 %r81, %r44, 28; cvt.u16.u32 %rs45, %r81; cvt.u16.u32 %rs46, %r45; and.b16 %rs47, %rs46, 15; shr.u32 %r82, %r45, 4; cvt.u16.u32 %rs48, %r82; and.b16 %rs49, %rs48, 15; shr.u32 %r83, %r45, 8; cvt.u16.u32 %rs50, %r83; and.b16 %rs51, %rs50, 15; shr.u32 %r84, %r45, 12; cvt.u16.u32 %rs52, %r84; and.b16 %rs53, %rs52, 15; shr.u32 %r85, %r45, 16; cvt.u16.u32 %rs54, %r85; and.b16 %rs55, %rs54, 15; shr.u32 %r86, %r45, 20; cvt.u16.u32 %rs56, %r86; and.b16 %rs57, %rs56, 15; shr.u32 %r87, %r45, 24; cvt.u16.u32 %rs58, %r87; and.b16 %rs59, %rs58, 15; shr.u32 %r88, %r45, 28; cvt.u16.u32 %rs60, %r88; cvt.u16.u32 %rs61, %r46; and.b16 %rs62, %rs61, 15; shr.u32 %r89, %r46, 4; cvt.u16.u32 %rs63, %r89; and.b16 %rs64, %rs63, 15; shr.u32 %r90, %r46, 8; cvt.u16.u32 %rs65, %r90; and.b16 %rs66, %rs65, 15; shr.u32 %r91, %r46, 12; cvt.u16.u32 %rs67, %r91; and.b16 %rs68, %rs67, 15; shr.u32 %r92, %r46, 16; cvt.u16.u32 %rs69, %r92; and.b16 %rs70, %rs69, 15; shr.u32 %r93, %r46, 20; cvt.u16.u32 %rs71, %r93; and.b16 %rs72, %rs71, 15; shr.u32 %r94, %r46, 24; cvt.u16.u32 %rs73, %r94; and.b16 %rs74, %rs73, 15; shr.u32 %r95, %r46, 28; cvt.u16.u32 %rs75, %r95; @%p4 bra $L__BB0_7; cvt.rn.f32.s16 %f98, %rs6; sub.ftz.f32 %f99, %f98, %f4; mul.ftz.f32 %f100, %f33, %f99; // begin inline asm { cvt.f32.f16 %f34, %rs9;} // end inline asm fma.rn.ftz.f32 %f101, %f100, %f34, %f525; // begin inline asm { cvt.f32.f16 %f35, %rs12;} // end inline asm fma.rn.ftz.f32 %f102, %f100, %f35, %f524; cvt.rn.f32.s16 %f103, %rs14; sub.ftz.f32 %f104, %f103, %f4; mul.ftz.f32 %f105, %f33, %f104; // begin inline asm { cvt.f32.f16 %f36, %rs8;} // end inline asm fma.rn.ftz.f32 %f106, %f105, %f36, %f101; // begin inline asm { cvt.f32.f16 %f37, %rs11;} // end inline asm fma.rn.ftz.f32 %f107, %f105, %f37, %f102; cvt.rn.f32.s16 %f108, 
%rs16; sub.ftz.f32 %f109, %f108, %f4; mul.ftz.f32 %f110, %f33, %f109; mov.b32 {%rs107, %rs109}, %r61; // begin inline asm { cvt.f32.f16 %f38, %rs107;} // end inline asm fma.rn.ftz.f32 %f111, %f110, %f38, %f106; cvt.u16.u32 %rs108, %r65; // begin inline asm { cvt.f32.f16 %f39, %rs108;} // end inline asm fma.rn.ftz.f32 %f112, %f110, %f39, %f107; cvt.rn.f32.s16 %f113, %rs18; sub.ftz.f32 %f114, %f113, %f4; mul.ftz.f32 %f115, %f33, %f114; // begin inline asm { cvt.f32.f16 %f40, %rs109;} // end inline asm fma.rn.ftz.f32 %f116, %f115, %f40, %f111; // begin inline asm { cvt.f32.f16 %f41, %rs19;} // end inline asm fma.rn.ftz.f32 %f117, %f115, %f41, %f112; cvt.rn.f32.s16 %f118, %rs21; sub.ftz.f32 %f119, %f118, %f4; mul.ftz.f32 %f120, %f33, %f119; // begin inline asm { cvt.f32.f16 %f42, %rs22;} // end inline asm fma.rn.ftz.f32 %f121, %f120, %f42, %f116; // begin inline asm { cvt.f32.f16 %f43, %rs23;} // end inline asm fma.rn.ftz.f32 %f122, %f120, %f43, %f117; cvt.rn.f32.s16 %f123, %rs25; sub.ftz.f32 %f124, %f123, %f4; mul.ftz.f32 %f125, %f33, %f124; // begin inline asm { cvt.f32.f16 %f44, %rs26;} // end inline asm fma.rn.ftz.f32 %f126, %f125, %f44, %f121; // begin inline asm { cvt.f32.f16 %f45, %rs27;} // end inline asm fma.rn.ftz.f32 %f127, %f125, %f45, %f122; cvt.rn.f32.s16 %f128, %rs29; sub.ftz.f32 %f129, %f128, %f4; mul.ftz.f32 %f130, %f33, %f129; mov.b32 {%rs115, %rs117}, %r63; // begin inline asm { cvt.f32.f16 %f46, %rs115;} // end inline asm fma.rn.ftz.f32 %f131, %f130, %f46, %f126; mov.b32 {%rs116, %rs118}, %r67; // begin inline asm { cvt.f32.f16 %f47, %rs116;} // end inline asm fma.rn.ftz.f32 %f132, %f130, %f47, %f127; cvt.rn.f32.s16 %f133, %rs30; sub.ftz.f32 %f134, %f133, %f4; mul.ftz.f32 %f135, %f33, %f134; // begin inline asm { cvt.f32.f16 %f48, %rs117;} // end inline asm fma.rn.ftz.f32 %f136, %f135, %f48, %f131; // begin inline asm { cvt.f32.f16 %f49, %rs118;} // end inline asm fma.rn.ftz.f32 %f137, %f135, %f49, %f132; ld.global.v4.u32 {%r96, %r97, %r98, %r99}, [%rd7+16]; ld.global.v4.u32 {%r104, %r105, %r106, %r107}, [%rd8+16]; cvt.rn.f32.s16 %f138, %rs32; sub.ftz.f32 %f139, %f138, %f4; mul.ftz.f32 %f140, %f33, %f139; mov.b32 {%rs119, %rs121}, %r96; // begin inline asm { cvt.f32.f16 %f50, %rs119;} // end inline asm fma.rn.ftz.f32 %f141, %f140, %f50, %f136; mov.b32 {%rs120, %rs122}, %r104; // begin inline asm { cvt.f32.f16 %f51, %rs120;} // end inline asm fma.rn.ftz.f32 %f142, %f140, %f51, %f137; cvt.rn.f32.s16 %f143, %rs34; sub.ftz.f32 %f144, %f143, %f4; mul.ftz.f32 %f145, %f33, %f144; // begin inline asm { cvt.f32.f16 %f52, %rs121;} // end inline asm fma.rn.ftz.f32 %f146, %f145, %f52, %f141; // begin inline asm { cvt.f32.f16 %f53, %rs122;} // end inline asm fma.rn.ftz.f32 %f147, %f145, %f53, %f142; cvt.rn.f32.s16 %f148, %rs36; sub.ftz.f32 %f149, %f148, %f4; mul.ftz.f32 %f150, %f33, %f149; mov.b32 {%rs123, %rs125}, %r97; // begin inline asm { cvt.f32.f16 %f54, %rs123;} // end inline asm fma.rn.ftz.f32 %f151, %f150, %f54, %f146; mov.b32 {%rs124, %rs126}, %r105; // begin inline asm { cvt.f32.f16 %f55, %rs124;} // end inline asm fma.rn.ftz.f32 %f152, %f150, %f55, %f147; cvt.rn.f32.s16 %f153, %rs38; sub.ftz.f32 %f154, %f153, %f4; mul.ftz.f32 %f155, %f33, %f154; // begin inline asm { cvt.f32.f16 %f56, %rs125;} // end inline asm fma.rn.ftz.f32 %f156, %f155, %f56, %f151; // begin inline asm { cvt.f32.f16 %f57, %rs126;} // end inline asm fma.rn.ftz.f32 %f157, %f155, %f57, %f152; cvt.rn.f32.s16 %f158, %rs40; sub.ftz.f32 %f159, %f158, %f4; mul.ftz.f32 %f160, %f33, %f159; mov.b32 {%rs127, %rs129}, 
%r98; // begin inline asm { cvt.f32.f16 %f58, %rs127;} // end inline asm fma.rn.ftz.f32 %f161, %f160, %f58, %f156; mov.b32 {%rs128, %rs130}, %r106; // begin inline asm { cvt.f32.f16 %f59, %rs128;} // end inline asm fma.rn.ftz.f32 %f162, %f160, %f59, %f157; cvt.rn.f32.s16 %f163, %rs42; sub.ftz.f32 %f164, %f163, %f4; mul.ftz.f32 %f165, %f33, %f164; // begin inline asm { cvt.f32.f16 %f60, %rs129;} // end inline asm fma.rn.ftz.f32 %f166, %f165, %f60, %f161; // begin inline asm { cvt.f32.f16 %f61, %rs130;} // end inline asm fma.rn.ftz.f32 %f167, %f165, %f61, %f162; cvt.rn.f32.s16 %f168, %rs44; sub.ftz.f32 %f169, %f168, %f4; mul.ftz.f32 %f170, %f33, %f169; mov.b32 {%rs131, %rs133}, %r99; // begin inline asm { cvt.f32.f16 %f62, %rs131;} // end inline asm fma.rn.ftz.f32 %f171, %f170, %f62, %f166; mov.b32 {%rs132, %rs134}, %r107; // begin inline asm { cvt.f32.f16 %f63, %rs132;} // end inline asm fma.rn.ftz.f32 %f172, %f170, %f63, %f167; cvt.rn.f32.s16 %f173, %rs45; sub.ftz.f32 %f174, %f173, %f4; mul.ftz.f32 %f175, %f33, %f174; // begin inline asm { cvt.f32.f16 %f64, %rs133;} // end inline asm fma.rn.ftz.f32 %f176, %f175, %f64, %f171; // begin inline asm { cvt.f32.f16 %f65, %rs134;} // end inline asm fma.rn.ftz.f32 %f177, %f175, %f65, %f172; ld.global.v4.u32 {%r112, %r113, %r114, %r115}, [%rd7+32]; ld.global.v4.u32 {%r120, %r121, %r122, %r123}, [%rd8+32]; cvt.rn.f32.s16 %f178, %rs47; sub.ftz.f32 %f179, %f178, %f4; mul.ftz.f32 %f180, %f33, %f179; mov.b32 {%rs135, %rs137}, %r112; // begin inline asm { cvt.f32.f16 %f66, %rs135;} // end inline asm fma.rn.ftz.f32 %f181, %f180, %f66, %f176; mov.b32 {%rs136, %rs138}, %r120; // begin inline asm { cvt.f32.f16 %f67, %rs136;} // end inline asm fma.rn.ftz.f32 %f182, %f180, %f67, %f177; cvt.rn.f32.s16 %f183, %rs49; sub.ftz.f32 %f184, %f183, %f4; mul.ftz.f32 %f185, %f33, %f184; // begin inline asm { cvt.f32.f16 %f68, %rs137;} // end inline asm fma.rn.ftz.f32 %f186, %f185, %f68, %f181; // begin inline asm { cvt.f32.f16 %f69, %rs138;} // end inline asm fma.rn.ftz.f32 %f187, %f185, %f69, %f182; cvt.rn.f32.s16 %f188, %rs51; sub.ftz.f32 %f189, %f188, %f4; mul.ftz.f32 %f190, %f33, %f189; mov.b32 {%rs139, %rs141}, %r113; // begin inline asm { cvt.f32.f16 %f70, %rs139;} // end inline asm fma.rn.ftz.f32 %f191, %f190, %f70, %f186; mov.b32 {%rs140, %rs142}, %r121; // begin inline asm { cvt.f32.f16 %f71, %rs140;} // end inline asm fma.rn.ftz.f32 %f192, %f190, %f71, %f187; cvt.rn.f32.s16 %f193, %rs53; sub.ftz.f32 %f194, %f193, %f4; mul.ftz.f32 %f195, %f33, %f194; // begin inline asm { cvt.f32.f16 %f72, %rs141;} // end inline asm fma.rn.ftz.f32 %f196, %f195, %f72, %f191; // begin inline asm { cvt.f32.f16 %f73, %rs142;} // end inline asm fma.rn.ftz.f32 %f197, %f195, %f73, %f192; cvt.rn.f32.s16 %f198, %rs55; sub.ftz.f32 %f199, %f198, %f4; mul.ftz.f32 %f200, %f33, %f199; mov.b32 {%rs143, %rs145}, %r114; // begin inline asm { cvt.f32.f16 %f74, %rs143;} // end inline asm fma.rn.ftz.f32 %f201, %f200, %f74, %f196; mov.b32 {%rs144, %rs146}, %r122; // begin inline asm { cvt.f32.f16 %f75, %rs144;} // end inline asm fma.rn.ftz.f32 %f202, %f200, %f75, %f197; cvt.rn.f32.s16 %f203, %rs57; sub.ftz.f32 %f204, %f203, %f4; mul.ftz.f32 %f205, %f33, %f204; // begin inline asm { cvt.f32.f16 %f76, %rs145;} // end inline asm fma.rn.ftz.f32 %f206, %f205, %f76, %f201; // begin inline asm { cvt.f32.f16 %f77, %rs146;} // end inline asm fma.rn.ftz.f32 %f207, %f205, %f77, %f202; cvt.rn.f32.s16 %f208, %rs59; sub.ftz.f32 %f209, %f208, %f4; mul.ftz.f32 %f210, %f33, %f209; mov.b32 {%rs147, %rs149}, %r115; // 
begin inline asm { cvt.f32.f16 %f78, %rs147;} // end inline asm fma.rn.ftz.f32 %f211, %f210, %f78, %f206; mov.b32 {%rs148, %rs150}, %r123; // begin inline asm { cvt.f32.f16 %f79, %rs148;} // end inline asm fma.rn.ftz.f32 %f212, %f210, %f79, %f207; cvt.rn.f32.s16 %f213, %rs60; sub.ftz.f32 %f214, %f213, %f4; mul.ftz.f32 %f215, %f33, %f214; // begin inline asm { cvt.f32.f16 %f80, %rs149;} // end inline asm fma.rn.ftz.f32 %f216, %f215, %f80, %f211; // begin inline asm { cvt.f32.f16 %f81, %rs150;} // end inline asm fma.rn.ftz.f32 %f217, %f215, %f81, %f212; ld.global.v4.u32 {%r128, %r129, %r130, %r131}, [%rd7+48]; ld.global.v4.u32 {%r136, %r137, %r138, %r139}, [%rd8+48]; cvt.rn.f32.s16 %f218, %rs62; sub.ftz.f32 %f219, %f218, %f4; mul.ftz.f32 %f220, %f33, %f219; mov.b32 {%rs151, %rs153}, %r128; // begin inline asm { cvt.f32.f16 %f82, %rs151;} // end inline asm fma.rn.ftz.f32 %f221, %f220, %f82, %f216; mov.b32 {%rs152, %rs154}, %r136; // begin inline asm { cvt.f32.f16 %f83, %rs152;} // end inline asm fma.rn.ftz.f32 %f222, %f220, %f83, %f217; cvt.rn.f32.s16 %f223, %rs64; sub.ftz.f32 %f224, %f223, %f4; mul.ftz.f32 %f225, %f33, %f224; // begin inline asm { cvt.f32.f16 %f84, %rs153;} // end inline asm fma.rn.ftz.f32 %f226, %f225, %f84, %f221; // begin inline asm { cvt.f32.f16 %f85, %rs154;} // end inline asm fma.rn.ftz.f32 %f227, %f225, %f85, %f222; cvt.rn.f32.s16 %f228, %rs66; sub.ftz.f32 %f229, %f228, %f4; mul.ftz.f32 %f230, %f33, %f229; mov.b32 {%rs155, %rs157}, %r129; // begin inline asm { cvt.f32.f16 %f86, %rs155;} // end inline asm fma.rn.ftz.f32 %f231, %f230, %f86, %f226; mov.b32 {%rs156, %rs158}, %r137; // begin inline asm { cvt.f32.f16 %f87, %rs156;} // end inline asm fma.rn.ftz.f32 %f232, %f230, %f87, %f227; cvt.rn.f32.s16 %f233, %rs68; sub.ftz.f32 %f234, %f233, %f4; mul.ftz.f32 %f235, %f33, %f234; // begin inline asm { cvt.f32.f16 %f88, %rs157;} // end inline asm fma.rn.ftz.f32 %f236, %f235, %f88, %f231; // begin inline asm { cvt.f32.f16 %f89, %rs158;} // end inline asm fma.rn.ftz.f32 %f237, %f235, %f89, %f232; cvt.rn.f32.s16 %f238, %rs70; sub.ftz.f32 %f239, %f238, %f4; mul.ftz.f32 %f240, %f33, %f239; mov.b32 {%rs159, %rs161}, %r130; // begin inline asm { cvt.f32.f16 %f90, %rs159;} // end inline asm fma.rn.ftz.f32 %f241, %f240, %f90, %f236; mov.b32 {%rs160, %rs162}, %r138; // begin inline asm { cvt.f32.f16 %f91, %rs160;} // end inline asm fma.rn.ftz.f32 %f242, %f240, %f91, %f237; cvt.rn.f32.s16 %f243, %rs72; sub.ftz.f32 %f244, %f243, %f4; mul.ftz.f32 %f245, %f33, %f244; // begin inline asm { cvt.f32.f16 %f92, %rs161;} // end inline asm fma.rn.ftz.f32 %f246, %f245, %f92, %f241; // begin inline asm { cvt.f32.f16 %f93, %rs162;} // end inline asm fma.rn.ftz.f32 %f247, %f245, %f93, %f242; cvt.rn.f32.s16 %f248, %rs74; sub.ftz.f32 %f249, %f248, %f4; mul.ftz.f32 %f250, %f33, %f249; mov.b32 {%rs163, %rs165}, %r131; // begin inline asm { cvt.f32.f16 %f94, %rs163;} // end inline asm fma.rn.ftz.f32 %f251, %f250, %f94, %f246; mov.b32 {%rs164, %rs166}, %r139; // begin inline asm { cvt.f32.f16 %f95, %rs164;} // end inline asm fma.rn.ftz.f32 %f252, %f250, %f95, %f247; cvt.rn.f32.s16 %f253, %rs75; sub.ftz.f32 %f254, %f253, %f4; mul.ftz.f32 %f255, %f33, %f254; // begin inline asm { cvt.f32.f16 %f96, %rs165;} // end inline asm fma.rn.ftz.f32 %f525, %f255, %f96, %f251; // begin inline asm { cvt.f32.f16 %f97, %rs166;} // end inline asm fma.rn.ftz.f32 %f524, %f255, %f97, %f252; bra.uni $L__BB0_8; $L__BB0_7: shl.b16 %rs231, %rs5, 4; cvt.s16.s8 %rs232, %rs231; shr.s16 %rs233, %rs232, 7; and.b16 %rs234, %rs233, 
-16; or.b16 %rs235, %rs234, %rs6; cvt.rn.f32.s16 %f320, %rs235; sub.ftz.f32 %f321, %f320, %f4; mul.ftz.f32 %f322, %f33, %f321; // begin inline asm { cvt.f32.f16 %f256, %rs9;} // end inline asm fma.rn.ftz.f32 %f323, %f322, %f256, %f525; // begin inline asm { cvt.f32.f16 %f257, %rs12;} // end inline asm fma.rn.ftz.f32 %f324, %f322, %f257, %f524; shl.b16 %rs236, %rs13, 4; cvt.s16.s8 %rs237, %rs236; shr.s16 %rs238, %rs237, 7; and.b16 %rs239, %rs238, -16; or.b16 %rs240, %rs239, %rs14; cvt.rn.f32.s16 %f325, %rs240; sub.ftz.f32 %f326, %f325, %f4; mul.ftz.f32 %f327, %f33, %f326; // begin inline asm { cvt.f32.f16 %f258, %rs8;} // end inline asm fma.rn.ftz.f32 %f328, %f327, %f258, %f323; // begin inline asm { cvt.f32.f16 %f259, %rs11;} // end inline asm fma.rn.ftz.f32 %f329, %f327, %f259, %f324; shl.b16 %rs243, %rs15, 4; cvt.s16.s8 %rs244, %rs243; shr.s16 %rs245, %rs244, 7; and.b16 %rs246, %rs245, -16; or.b16 %rs247, %rs246, %rs16; cvt.rn.f32.s16 %f330, %rs247; sub.ftz.f32 %f331, %f330, %f4; mul.ftz.f32 %f332, %f33, %f331; mov.b32 {%rs171, %rs173}, %r61; // begin inline asm { cvt.f32.f16 %f260, %rs171;} // end inline asm fma.rn.ftz.f32 %f333, %f332, %f260, %f328; cvt.u16.u32 %rs172, %r65; // begin inline asm { cvt.f32.f16 %f261, %rs172;} // end inline asm fma.rn.ftz.f32 %f334, %f332, %f261, %f329; shl.b16 %rs248, %rs17, 4; cvt.s16.s8 %rs249, %rs248; shr.s16 %rs250, %rs249, 7; and.b16 %rs251, %rs250, -16; or.b16 %rs252, %rs251, %rs18; cvt.rn.f32.s16 %f335, %rs252; sub.ftz.f32 %f336, %f335, %f4; mul.ftz.f32 %f337, %f33, %f336; // begin inline asm { cvt.f32.f16 %f262, %rs173;} // end inline asm fma.rn.ftz.f32 %f338, %f337, %f262, %f333; // begin inline asm { cvt.f32.f16 %f263, %rs19;} // end inline asm fma.rn.ftz.f32 %f339, %f337, %f263, %f334; shl.b16 %rs253, %rs20, 4; cvt.s16.s8 %rs254, %rs253; shr.s16 %rs255, %rs254, 7; and.b16 %rs256, %rs255, -16; or.b16 %rs257, %rs256, %rs21; cvt.rn.f32.s16 %f340, %rs257; sub.ftz.f32 %f341, %f340, %f4; mul.ftz.f32 %f342, %f33, %f341; // begin inline asm { cvt.f32.f16 %f264, %rs22;} // end inline asm fma.rn.ftz.f32 %f343, %f342, %f264, %f338; // begin inline asm { cvt.f32.f16 %f265, %rs23;} // end inline asm fma.rn.ftz.f32 %f344, %f342, %f265, %f339; shl.b16 %rs258, %rs24, 4; cvt.s16.s8 %rs259, %rs258; shr.s16 %rs260, %rs259, 7; and.b16 %rs261, %rs260, -16; or.b16 %rs262, %rs261, %rs25; cvt.rn.f32.s16 %f345, %rs262; sub.ftz.f32 %f346, %f345, %f4; mul.ftz.f32 %f347, %f33, %f346; // begin inline asm { cvt.f32.f16 %f266, %rs26;} // end inline asm fma.rn.ftz.f32 %f348, %f347, %f266, %f343; // begin inline asm { cvt.f32.f16 %f267, %rs27;} // end inline asm fma.rn.ftz.f32 %f349, %f347, %f267, %f344; shl.b16 %rs263, %rs28, 4; cvt.s16.s8 %rs264, %rs263; shr.s16 %rs265, %rs264, 7; and.b16 %rs266, %rs265, -16; or.b16 %rs267, %rs266, %rs29; cvt.rn.f32.s16 %f350, %rs267; sub.ftz.f32 %f351, %f350, %f4; mul.ftz.f32 %f352, %f33, %f351; mov.b32 {%rs179, %rs181}, %r63; // begin inline asm { cvt.f32.f16 %f268, %rs179;} // end inline asm fma.rn.ftz.f32 %f353, %f352, %f268, %f348; mov.b32 {%rs180, %rs182}, %r67; // begin inline asm { cvt.f32.f16 %f269, %rs180;} // end inline asm fma.rn.ftz.f32 %f354, %f352, %f269, %f349; shl.b16 %rs268, %rs30, 4; cvt.s16.s8 %rs269, %rs268; shr.s16 %rs270, %rs269, 7; and.b16 %rs271, %rs270, -16; or.b16 %rs272, %rs271, %rs30; cvt.rn.f32.s16 %f355, %rs272; sub.ftz.f32 %f356, %f355, %f4; mul.ftz.f32 %f357, %f33, %f356; // begin inline asm { cvt.f32.f16 %f270, %rs181;} // end inline asm fma.rn.ftz.f32 %f358, %f357, %f270, %f353; // begin inline asm { 
cvt.f32.f16 %f271, %rs182;} // end inline asm fma.rn.ftz.f32 %f359, %f357, %f271, %f354; ld.global.v4.u32 {%r144, %r145, %r146, %r147}, [%rd7+16]; ld.global.v4.u32 {%r152, %r153, %r154, %r155}, [%rd8+16]; shl.b16 %rs273, %rs31, 4; cvt.s16.s8 %rs274, %rs273; shr.s16 %rs275, %rs274, 7; and.b16 %rs276, %rs275, -16; or.b16 %rs277, %rs276, %rs32; cvt.rn.f32.s16 %f360, %rs277; sub.ftz.f32 %f361, %f360, %f4; mul.ftz.f32 %f362, %f33, %f361; mov.b32 {%rs183, %rs185}, %r144; // begin inline asm { cvt.f32.f16 %f272, %rs183;} // end inline asm fma.rn.ftz.f32 %f363, %f362, %f272, %f358; mov.b32 {%rs184, %rs186}, %r152; // begin inline asm { cvt.f32.f16 %f273, %rs184;} // end inline asm fma.rn.ftz.f32 %f364, %f362, %f273, %f359; shl.b16 %rs278, %rs33, 4; cvt.s16.s8 %rs279, %rs278; shr.s16 %rs280, %rs279, 7; and.b16 %rs281, %rs280, -16; or.b16 %rs282, %rs281, %rs34; cvt.rn.f32.s16 %f365, %rs282; sub.ftz.f32 %f366, %f365, %f4; mul.ftz.f32 %f367, %f33, %f366; // begin inline asm { cvt.f32.f16 %f274, %rs185;} // end inline asm fma.rn.ftz.f32 %f368, %f367, %f274, %f363; // begin inline asm { cvt.f32.f16 %f275, %rs186;} // end inline asm fma.rn.ftz.f32 %f369, %f367, %f275, %f364; shl.b16 %rs283, %rs35, 4; cvt.s16.s8 %rs284, %rs283; shr.s16 %rs285, %rs284, 7; and.b16 %rs286, %rs285, -16; or.b16 %rs287, %rs286, %rs36; cvt.rn.f32.s16 %f370, %rs287; sub.ftz.f32 %f371, %f370, %f4; mul.ftz.f32 %f372, %f33, %f371; mov.b32 {%rs187, %rs189}, %r145; // begin inline asm { cvt.f32.f16 %f276, %rs187;} // end inline asm fma.rn.ftz.f32 %f373, %f372, %f276, %f368; mov.b32 {%rs188, %rs190}, %r153; // begin inline asm { cvt.f32.f16 %f277, %rs188;} // end inline asm fma.rn.ftz.f32 %f374, %f372, %f277, %f369; shl.b16 %rs288, %rs37, 4; cvt.s16.s8 %rs289, %rs288; shr.s16 %rs290, %rs289, 7; and.b16 %rs291, %rs290, -16; or.b16 %rs292, %rs291, %rs38; cvt.rn.f32.s16 %f375, %rs292; sub.ftz.f32 %f376, %f375, %f4; mul.ftz.f32 %f377, %f33, %f376; // begin inline asm { cvt.f32.f16 %f278, %rs189;} // end inline asm fma.rn.ftz.f32 %f378, %f377, %f278, %f373; // begin inline asm { cvt.f32.f16 %f279, %rs190;} // end inline asm fma.rn.ftz.f32 %f379, %f377, %f279, %f374; shl.b16 %rs293, %rs39, 4; cvt.s16.s8 %rs294, %rs293; shr.s16 %rs295, %rs294, 7; and.b16 %rs296, %rs295, -16; or.b16 %rs297, %rs296, %rs40; cvt.rn.f32.s16 %f380, %rs297; sub.ftz.f32 %f381, %f380, %f4; mul.ftz.f32 %f382, %f33, %f381; mov.b32 {%rs191, %rs193}, %r146; // begin inline asm { cvt.f32.f16 %f280, %rs191;} // end inline asm fma.rn.ftz.f32 %f383, %f382, %f280, %f378; mov.b32 {%rs192, %rs194}, %r154; // begin inline asm { cvt.f32.f16 %f281, %rs192;} // end inline asm fma.rn.ftz.f32 %f384, %f382, %f281, %f379; shl.b16 %rs298, %rs41, 4; cvt.s16.s8 %rs299, %rs298; shr.s16 %rs300, %rs299, 7; and.b16 %rs301, %rs300, -16; or.b16 %rs302, %rs301, %rs42; cvt.rn.f32.s16 %f385, %rs302; sub.ftz.f32 %f386, %f385, %f4; mul.ftz.f32 %f387, %f33, %f386; // begin inline asm { cvt.f32.f16 %f282, %rs193;} // end inline asm fma.rn.ftz.f32 %f388, %f387, %f282, %f383; // begin inline asm { cvt.f32.f16 %f283, %rs194;} // end inline asm fma.rn.ftz.f32 %f389, %f387, %f283, %f384; shl.b16 %rs303, %rs43, 4; cvt.s16.s8 %rs304, %rs303; shr.s16 %rs305, %rs304, 7; and.b16 %rs306, %rs305, -16; or.b16 %rs307, %rs306, %rs44; cvt.rn.f32.s16 %f390, %rs307; sub.ftz.f32 %f391, %f390, %f4; mul.ftz.f32 %f392, %f33, %f391; mov.b32 {%rs195, %rs197}, %r147; // begin inline asm { cvt.f32.f16 %f284, %rs195;} // end inline asm fma.rn.ftz.f32 %f393, %f392, %f284, %f388; mov.b32 {%rs196, %rs198}, %r155; // begin inline 
asm { cvt.f32.f16 %f285, %rs196;} // end inline asm fma.rn.ftz.f32 %f394, %f392, %f285, %f389; shl.b16 %rs308, %rs45, 4; cvt.s16.s8 %rs309, %rs308; shr.s16 %rs310, %rs309, 7; and.b16 %rs311, %rs310, -16; or.b16 %rs312, %rs311, %rs45; cvt.rn.f32.s16 %f395, %rs312; sub.ftz.f32 %f396, %f395, %f4; mul.ftz.f32 %f397, %f33, %f396; // begin inline asm { cvt.f32.f16 %f286, %rs197;} // end inline asm fma.rn.ftz.f32 %f398, %f397, %f286, %f393; // begin inline asm { cvt.f32.f16 %f287, %rs198;} // end inline asm fma.rn.ftz.f32 %f399, %f397, %f287, %f394; ld.global.v4.u32 {%r160, %r161, %r162, %r163}, [%rd7+32]; ld.global.v4.u32 {%r168, %r169, %r170, %r171}, [%rd8+32]; shl.b16 %rs313, %rs46, 4; cvt.s16.s8 %rs314, %rs313; shr.s16 %rs315, %rs314, 7; and.b16 %rs316, %rs315, -16; or.b16 %rs317, %rs316, %rs47; cvt.rn.f32.s16 %f400, %rs317; sub.ftz.f32 %f401, %f400, %f4; mul.ftz.f32 %f402, %f33, %f401; mov.b32 {%rs199, %rs201}, %r160; // begin inline asm { cvt.f32.f16 %f288, %rs199;} // end inline asm fma.rn.ftz.f32 %f403, %f402, %f288, %f398; mov.b32 {%rs200, %rs202}, %r168; // begin inline asm { cvt.f32.f16 %f289, %rs200;} // end inline asm fma.rn.ftz.f32 %f404, %f402, %f289, %f399; shl.b16 %rs318, %rs48, 4; cvt.s16.s8 %rs319, %rs318; shr.s16 %rs320, %rs319, 7; and.b16 %rs321, %rs320, -16; or.b16 %rs322, %rs321, %rs49; cvt.rn.f32.s16 %f405, %rs322; sub.ftz.f32 %f406, %f405, %f4; mul.ftz.f32 %f407, %f33, %f406; // begin inline asm { cvt.f32.f16 %f290, %rs201;} // end inline asm fma.rn.ftz.f32 %f408, %f407, %f290, %f403; // begin inline asm { cvt.f32.f16 %f291, %rs202;} // end inline asm fma.rn.ftz.f32 %f409, %f407, %f291, %f404; shl.b16 %rs323, %rs50, 4; cvt.s16.s8 %rs324, %rs323; shr.s16 %rs325, %rs324, 7; and.b16 %rs326, %rs325, -16; or.b16 %rs327, %rs326, %rs51; cvt.rn.f32.s16 %f410, %rs327; sub.ftz.f32 %f411, %f410, %f4; mul.ftz.f32 %f412, %f33, %f411; mov.b32 {%rs203, %rs205}, %r161; // begin inline asm { cvt.f32.f16 %f292, %rs203;} // end inline asm fma.rn.ftz.f32 %f413, %f412, %f292, %f408; mov.b32 {%rs204, %rs206}, %r169; // begin inline asm { cvt.f32.f16 %f293, %rs204;} // end inline asm fma.rn.ftz.f32 %f414, %f412, %f293, %f409; shl.b16 %rs328, %rs52, 4; cvt.s16.s8 %rs329, %rs328; shr.s16 %rs330, %rs329, 7; and.b16 %rs331, %rs330, -16; or.b16 %rs332, %rs331, %rs53; cvt.rn.f32.s16 %f415, %rs332; sub.ftz.f32 %f416, %f415, %f4; mul.ftz.f32 %f417, %f33, %f416; // begin inline asm { cvt.f32.f16 %f294, %rs205;} // end inline asm fma.rn.ftz.f32 %f418, %f417, %f294, %f413; // begin inline asm { cvt.f32.f16 %f295, %rs206;} // end inline asm fma.rn.ftz.f32 %f419, %f417, %f295, %f414; shl.b16 %rs333, %rs54, 4; cvt.s16.s8 %rs334, %rs333; shr.s16 %rs335, %rs334, 7; and.b16 %rs336, %rs335, -16; or.b16 %rs337, %rs336, %rs55; cvt.rn.f32.s16 %f420, %rs337; sub.ftz.f32 %f421, %f420, %f4; mul.ftz.f32 %f422, %f33, %f421; mov.b32 {%rs207, %rs209}, %r162; // begin inline asm { cvt.f32.f16 %f296, %rs207;} // end inline asm fma.rn.ftz.f32 %f423, %f422, %f296, %f418; mov.b32 {%rs208, %rs210}, %r170; // begin inline asm { cvt.f32.f16 %f297, %rs208;} // end inline asm fma.rn.ftz.f32 %f424, %f422, %f297, %f419; shl.b16 %rs338, %rs56, 4; cvt.s16.s8 %rs339, %rs338; shr.s16 %rs340, %rs339, 7; and.b16 %rs341, %rs340, -16; or.b16 %rs342, %rs341, %rs57; cvt.rn.f32.s16 %f425, %rs342; sub.ftz.f32 %f426, %f425, %f4; mul.ftz.f32 %f427, %f33, %f426; // begin inline asm { cvt.f32.f16 %f298, %rs209;} // end inline asm fma.rn.ftz.f32 %f428, %f427, %f298, %f423; // begin inline asm { cvt.f32.f16 %f299, %rs210;} // end inline asm 
fma.rn.ftz.f32 %f429, %f427, %f299, %f424; shl.b16 %rs343, %rs58, 4; cvt.s16.s8 %rs344, %rs343; shr.s16 %rs345, %rs344, 7; and.b16 %rs346, %rs345, -16; or.b16 %rs347, %rs346, %rs59; cvt.rn.f32.s16 %f430, %rs347; sub.ftz.f32 %f431, %f430, %f4; mul.ftz.f32 %f432, %f33, %f431; mov.b32 {%rs211, %rs213}, %r163; // begin inline asm { cvt.f32.f16 %f300, %rs211;} // end inline asm fma.rn.ftz.f32 %f433, %f432, %f300, %f428; mov.b32 {%rs212, %rs214}, %r171; // begin inline asm { cvt.f32.f16 %f301, %rs212;} // end inline asm fma.rn.ftz.f32 %f434, %f432, %f301, %f429; shl.b16 %rs348, %rs60, 4; cvt.s16.s8 %rs349, %rs348; shr.s16 %rs350, %rs349, 7; and.b16 %rs351, %rs350, -16; or.b16 %rs352, %rs351, %rs60; cvt.rn.f32.s16 %f435, %rs352; sub.ftz.f32 %f436, %f435, %f4; mul.ftz.f32 %f437, %f33, %f436; // begin inline asm { cvt.f32.f16 %f302, %rs213;} // end inline asm fma.rn.ftz.f32 %f438, %f437, %f302, %f433; // begin inline asm { cvt.f32.f16 %f303, %rs214;} // end inline asm fma.rn.ftz.f32 %f439, %f437, %f303, %f434; ld.global.v4.u32 {%r176, %r177, %r178, %r179}, [%rd7+48]; ld.global.v4.u32 {%r184, %r185, %r186, %r187}, [%rd8+48]; shl.b16 %rs353, %rs61, 4; cvt.s16.s8 %rs354, %rs353; shr.s16 %rs355, %rs354, 7; and.b16 %rs356, %rs355, -16; or.b16 %rs357, %rs356, %rs62; cvt.rn.f32.s16 %f440, %rs357; sub.ftz.f32 %f441, %f440, %f4; mul.ftz.f32 %f442, %f33, %f441; mov.b32 {%rs215, %rs217}, %r176; // begin inline asm { cvt.f32.f16 %f304, %rs215;} // end inline asm fma.rn.ftz.f32 %f443, %f442, %f304, %f438; mov.b32 {%rs216, %rs218}, %r184; // begin inline asm { cvt.f32.f16 %f305, %rs216;} // end inline asm fma.rn.ftz.f32 %f444, %f442, %f305, %f439; shl.b16 %rs358, %rs63, 4; cvt.s16.s8 %rs359, %rs358; shr.s16 %rs360, %rs359, 7; and.b16 %rs361, %rs360, -16; or.b16 %rs362, %rs361, %rs64; cvt.rn.f32.s16 %f445, %rs362; sub.ftz.f32 %f446, %f445, %f4; mul.ftz.f32 %f447, %f33, %f446; // begin inline asm { cvt.f32.f16 %f306, %rs217;} // end inline asm fma.rn.ftz.f32 %f448, %f447, %f306, %f443; // begin inline asm { cvt.f32.f16 %f307, %rs218;} // end inline asm fma.rn.ftz.f32 %f449, %f447, %f307, %f444; shl.b16 %rs363, %rs65, 4; cvt.s16.s8 %rs364, %rs363; shr.s16 %rs365, %rs364, 7; and.b16 %rs366, %rs365, -16; or.b16 %rs367, %rs366, %rs66; cvt.rn.f32.s16 %f450, %rs367; sub.ftz.f32 %f451, %f450, %f4; mul.ftz.f32 %f452, %f33, %f451; mov.b32 {%rs219, %rs221}, %r177; // begin inline asm { cvt.f32.f16 %f308, %rs219;} // end inline asm fma.rn.ftz.f32 %f453, %f452, %f308, %f448; mov.b32 {%rs220, %rs222}, %r185; // begin inline asm { cvt.f32.f16 %f309, %rs220;} // end inline asm fma.rn.ftz.f32 %f454, %f452, %f309, %f449; shl.b16 %rs368, %rs67, 4; cvt.s16.s8 %rs369, %rs368; shr.s16 %rs370, %rs369, 7; and.b16 %rs371, %rs370, -16; or.b16 %rs372, %rs371, %rs68; cvt.rn.f32.s16 %f455, %rs372; sub.ftz.f32 %f456, %f455, %f4; mul.ftz.f32 %f457, %f33, %f456; // begin inline asm { cvt.f32.f16 %f310, %rs221;} // end inline asm fma.rn.ftz.f32 %f458, %f457, %f310, %f453; // begin inline asm { cvt.f32.f16 %f311, %rs222;} // end inline asm fma.rn.ftz.f32 %f459, %f457, %f311, %f454; shl.b16 %rs373, %rs69, 4; cvt.s16.s8 %rs374, %rs373; shr.s16 %rs375, %rs374, 7; and.b16 %rs376, %rs375, -16; or.b16 %rs377, %rs376, %rs70; cvt.rn.f32.s16 %f460, %rs377; sub.ftz.f32 %f461, %f460, %f4; mul.ftz.f32 %f462, %f33, %f461; mov.b32 {%rs223, %rs225}, %r178; // begin inline asm { cvt.f32.f16 %f312, %rs223;} // end inline asm fma.rn.ftz.f32 %f463, %f462, %f312, %f458; mov.b32 {%rs224, %rs226}, %r186; // begin inline asm { cvt.f32.f16 %f313, %rs224;} // end inline 
asm fma.rn.ftz.f32 %f464, %f462, %f313, %f459; shl.b16 %rs378, %rs71, 4; cvt.s16.s8 %rs379, %rs378; shr.s16 %rs380, %rs379, 7; and.b16 %rs381, %rs380, -16; or.b16 %rs382, %rs381, %rs72; cvt.rn.f32.s16 %f465, %rs382; sub.ftz.f32 %f466, %f465, %f4; mul.ftz.f32 %f467, %f33, %f466; // begin inline asm { cvt.f32.f16 %f314, %rs225;} // end inline asm fma.rn.ftz.f32 %f468, %f467, %f314, %f463; // begin inline asm { cvt.f32.f16 %f315, %rs226;} // end inline asm fma.rn.ftz.f32 %f469, %f467, %f315, %f464; shl.b16 %rs383, %rs73, 4; cvt.s16.s8 %rs384, %rs383; shr.s16 %rs385, %rs384, 7; and.b16 %rs386, %rs385, -16; or.b16 %rs387, %rs386, %rs74; cvt.rn.f32.s16 %f470, %rs387; sub.ftz.f32 %f471, %f470, %f4; mul.ftz.f32 %f472, %f33, %f471; mov.b32 {%rs227, %rs229}, %r179; // begin inline asm { cvt.f32.f16 %f316, %rs227;} // end inline asm fma.rn.ftz.f32 %f473, %f472, %f316, %f468; mov.b32 {%rs228, %rs230}, %r187; // begin inline asm { cvt.f32.f16 %f317, %rs228;} // end inline asm fma.rn.ftz.f32 %f474, %f472, %f317, %f469; shl.b16 %rs388, %rs75, 4; cvt.s16.s8 %rs389, %rs388; shr.s16 %rs390, %rs389, 7; and.b16 %rs391, %rs390, -16; or.b16 %rs392, %rs391, %rs75; cvt.rn.f32.s16 %f475, %rs392; sub.ftz.f32 %f476, %f475, %f4; mul.ftz.f32 %f477, %f33, %f476; // begin inline asm { cvt.f32.f16 %f318, %rs229;} // end inline asm fma.rn.ftz.f32 %f525, %f477, %f318, %f473; // begin inline asm { cvt.f32.f16 %f319, %rs230;} // end inline asm fma.rn.ftz.f32 %f524, %f477, %f319, %f474; $L__BB0_8: add.s32 %r255, %r255, 4; shl.b32 %r192, %r255, 5; add.s32 %r254, %r192, %r41; shl.b32 %r253, %r254, 2; setp.lt.u32 %p7, %r253, %r38; @%p7 bra $L__BB0_2; $L__BB0_9: mov.u32 %r194, %tid.y; shl.b32 %r195, %r194, 5; add.s32 %r28, %r195, %r41; setp.lt.u32 %p8, %r28, 32; shl.b32 %r197, %r28, 2; mov.u32 %r198, _ZZ9gemv_int4ILi4ELi128ELi2EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage; add.s32 %r199, %r198, %r197; @%p8 bra $L__BB0_11; add.s32 %r251, %r199, -112; st.shared.f32 [%r251], %f525; $L__BB0_11: setp.gt.u32 %p9, %r28, 31; bar.sync 0; mad.lo.s32 %r30, %r28, 12, %r198; @%p9 bra $L__BB0_13; mov.u32 %r218, 16; ld.shared.f32 %f493, [%r30+16]; add.ftz.f32 %f494, %f525, %f493; ld.shared.f32 %f495, [%r30+20]; add.ftz.f32 %f496, %f494, %f495; ld.shared.f32 %f497, [%r30+24]; add.ftz.f32 %f480, %f496, %f497; mov.u32 %r206, 1; mov.u32 %r219, 31; mov.u32 %r220, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f480, %r206, %r219, %r220; @p add.f32 r0, r0, %f480; mov.f32 %f478, r0;} // end inline asm mov.u32 %r209, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f478, %r209, %r219, %r220; @p add.f32 r0, r0, %f478; mov.f32 %f481, r0;} // end inline asm mov.u32 %r212, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f481, %r212, %r219, %r220; @p add.f32 r0, r0, %f481; mov.f32 %f484, r0;} // end inline asm mov.u32 %r215, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f484, %r215, %r219, %r220; @p add.f32 r0, r0, %f484; mov.f32 %f487, r0;} // end inline asm // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f487, %r218, %r219, %r220; @p add.f32 r0, r0, %f487; mov.f32 %f525, r0;} // end inline asm $L__BB0_13: @%p8 bra $L__BB0_15; add.s32 %r252, %r199, -112; st.shared.f32 [%r252+640], %f524; $L__BB0_15: bar.sync 0; @%p9 bra $L__BB0_17; ld.shared.f32 %f513, [%r30+656]; add.ftz.f32 %f514, %f524, %f513; ld.shared.f32 %f515, [%r30+660]; add.ftz.f32 %f516, %f514, %f515; ld.shared.f32 %f517, [%r30+664]; 
    add.ftz.f32 %f500, %f516, %f517;
    // warp-level tail of the second block reduction (batch-column-1 partials)
    mov.u32 %r230, 1;
    mov.u32 %r243, 31;
    mov.u32 %r244, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f500, %r230, %r243, %r244; @p add.f32 r0, r0, %f500; mov.f32 %f498, r0;}
    // end inline asm
    mov.u32 %r233, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f498, %r233, %r243, %r244; @p add.f32 r0, r0, %f498; mov.f32 %f501, r0;}
    // end inline asm
    mov.u32 %r236, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f501, %r236, %r243, %r244; @p add.f32 r0, r0, %f501; mov.f32 %f504, r0;}
    // end inline asm
    mov.u32 %r239, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f504, %r239, %r243, %r244; @p add.f32 r0, r0, %f504; mov.f32 %f507, r0;}
    // end inline asm
    mov.u32 %r242, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f507, %r242, %r243, %r244; @p add.f32 r0, r0, %f507; mov.f32 %f524, r0;}
    // end inline asm

$L__BB0_17:
    // thread (0,0) only: scale the two sums by %f29, optionally add %f30 * bias,
    // convert to fp16, and store both outputs with cache hints
    or.b32 %r247, %r41, %r194;
    setp.ne.s32 %p12, %r247, 0;
    @%p12 bra $L__BB0_23;

    ld.param.u64 %rd43, [_Z28dequant_gemv_group128_batch223DequantGemvKernelParams_param_0+8];
    setp.eq.s64 %p13, %rd43, 0;
    mul.ftz.f32 %f530, %f29, %f525;
    mov.u32 %r248, %ctaid.x;
    cvt.s64.s32 %rd9, %r248;
    @%p13 bra $L__BB0_20;

    ld.param.u64 %rd46, [_Z28dequant_gemv_group128_batch223DequantGemvKernelParams_param_0+8];
    cvta.to.global.u64 %rd45, %rd46;
    shl.b64 %rd31, %rd9, 1;
    add.s64 %rd32, %rd45, %rd31;
    ld.global.u16 %rs393, [%rd32];
    // begin inline asm
    { cvt.f32.f16 %f518, %rs393;}
    // end inline asm
    fma.rn.ftz.f32 %f530, %f30, %f518, %f530;

$L__BB0_20:
    ld.param.u64 %rd44, [_Z28dequant_gemv_group128_batch223DequantGemvKernelParams_param_0];
    // begin inline asm
    { cvt.rn.f16.f32 %rs394, %f530;}
    // end inline asm
    // begin inline asm
    createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd33, 1.0;
    // end inline asm
    shl.b64 %rd36, %rd9, 1;
    add.s64 %rd34, %rd44, %rd36;
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd34], %rs394, %rd33;
    // end inline asm
    mul.ftz.f32 %f531, %f29, %f524;
    @%p13 bra $L__BB0_22;

    ld.param.u64 %rd48, [_Z28dequant_gemv_group128_batch223DequantGemvKernelParams_param_0+8];
    cvta.to.global.u64 %rd47, %rd48;
    add.s32 %r250, %r37, %r248;
    mul.wide.s32 %rd37, %r250, 2;
    add.s64 %rd38, %rd47, %rd37;
    ld.global.u16 %rs396, [%rd38];
    // begin inline asm
    { cvt.f32.f16 %f520, %rs396;}
    // end inline asm
    fma.rn.ftz.f32 %f531, %f30, %f520, %f531;

$L__BB0_22:
    mul.wide.s32 %rd42, %r37, 2;
    add.s64 %rd40, %rd34, %rd42;
    // begin inline asm
    { cvt.rn.f16.f32 %rs397, %f531;}
    // end inline asm
    // begin inline asm
    createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd39, 1.0;
    // end inline asm
    // begin inline asm
    st.global.L1::evict_normal.L2::cache_hint.u16 [%rd40], %rs397, %rd39;
    // end inline asm

$L__BB0_23:
    ret;
}

// .globl _ZN3cub11EmptyKernelIvEEvv
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
    ret;
}
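What follows are reconstruction sketches, not the original source. The mangled names pin down the entry points: _Z28dequant_gemv_group128_batch223DequantGemvKernelParams demangles to dequant_gemv_group128_batch2(DequantGemvKernelParams), an 80-byte struct passed by value, and the shared-memory symbol demangles to the instantiation gemv_int4<4, 128, 2>(__half*, const __half*, const __half*, const unsigned int*, const unsigned char*, const __half*, int, int, int, int, float, float, bool), i.e. 4-bit weights, quantization group size 128, batch size 2. Matching the ld.param offsets at the top of the listing against that signature gives a plausible struct layout; every field name below is an assumption:

// Hypothetical reconstruction of the 80-byte, 8-byte-aligned params struct.
// Offsets come from the ld.param instructions above; the names are guesses.
struct DequantGemvKernelParams {
    __half*         out;            // +0   fp16 outputs, written by thread (0,0)
    const __half*   bias;           // +8   optional; the kernel tests it against null
    const __half*   x;              // +16  activations, two batch columns
    const uint32_t* qweight;        // +24  packed int4 weights, 8 per word
    const uint8_t*  zeros;          // +32  optional packed 4-bit zero points
    const __half*   scales;         // +40  one fp16 scale per 128-weight group
    int   in_features;              // +48  (%r36) column stride of x, in elements
    int   out_stride;               // +52  (%r37) distance to the batch-1 output
    int   words_per_row;            // +56  (%r38) u32 words of qweight per row
    int   groups_per_row;           // +60  (%r39) scale groups per row
    float alpha;                    // +64  (%f29) scales the dot product
    float beta;                     // +68  (%f30) scales the bias term
    bool  unsigned_quant;           // +72  (%rs84) picks the 0..15 vs -8..7 decode
};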
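The per-nibble decode is where the kernel's two paths differ. The unsigned branch just masks each 4-bit field (shr.u32 by 4*j, then and.b16 ...,15). The signed branch ($L__BB0_7) sign-extends each nibble without branching: shl.b16 q,4 parks the nibble in bits 4..7 of the low byte, cvt.s16.s8 sign-extends that byte, shr.s16 ...,7 broadcasts what was bit 3 of the nibble, and.b16 ...,-16 turns that into a 0xFFF0 mask, and or.b16 merges it back with the masked nibble. A C-level equivalent, with a hypothetical helper name used by the kernel sketch further down:

// Decode nibble j (0..31) of a uint4 of packed int4 weights.
// Mirrors the PTX: unsigned path = (word >> 4*j) & 0xF; signed path ORs in a
// broadcast sign bit exactly like the shl / cvt.s16.s8 / shr / and(-16) / or run.
__device__ inline int decode_nibble(uint4 w, int j, bool unsigned_quant) {
    uint32_t word = (&w.x)[j / 8];                  // which of the four u32 words
    uint16_t q    = (word >> (4 * (j % 8))) & 0xF;  // the raw 4-bit field
    if (unsigned_quant) return q;                   // 0..15
    int16_t sign = (int16_t)(int8_t)(q << 4) >> 7;  // 0xFFFF if bit 3 set, else 0
    return (int16_t)((sign & -16) | q);             // sign-extended: -8..7
}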
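Every global access in the listing is wrapped in a // begin/end inline asm pair: createpolicy.fractional.L2::evict_first builds an L2 eviction-priority descriptor, then ld.global.L1::no_allocate.L2::cache_hint streams the 16-byte weight vector past L1 under that policy. Scales and zero points instead use L1::evict_last (they are re-read across iterations), and the final stores use evict_normal/evict_unchanged. These instructions require sm_80+ and a recent PTX ISA. A wrapper in the shape those inline-asm blocks imply; the name is made up:

// Hypothetical wrapper: stream a uint4 of packed weights past L1 with an
// L2 evict_first policy, matching the ld.global in the loop body above.
__device__ inline uint4 load_streaming(const uint32_t* p) {
    uint4 v; uint64_t pol;
    asm volatile("createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %0, 1.0;"
                 : "=l"(pol));
    asm volatile("ld.global.L1::no_allocate.L2::cache_hint.v4.u32 {%0,%1,%2,%3}, [%4], %5;"
                 : "=r"(v.x), "=r"(v.y), "=r"(v.z), "=r"(v.w)
                 : "l"(p), "l"(pol));
    return v;
}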
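Putting the pieces together: the main loop ($L__BB0_2) walks one output row per block with blockDim = (32, 4); %r253 starts at (tid.y*32 + tid.x)*4 and advances by 512 words per trip, so each thread loads 4 consecutive u32 (32 int4 weights), one scale, and optionally one zero nibble per 128-weight group, and FMAs the dequantized weights against 32 activations from each batch column. A minimal CUDA sketch using the struct and helpers above. The 1280-byte temp_storage (2 x 640) and the 12-byte raking stride in the reduction are consistent with cub::BlockReduce in its raking form, but that, like the index arithmetic below, is an inference from the PTX, not the author's code:

#include <cstdint>
#include <cuda_fp16.h>
#include <cub/cub.cuh>

__global__ void dequant_gemv_group128_batch2(DequantGemvKernelParams p) {
    // gemv_int4<4,128,2> is fully inlined into this entry in the real module.
    using BlockReduce = cub::BlockReduce<float, 32, cub::BLOCK_REDUCE_RAKING, 4>;
    __shared__ typename BlockReduce::TempStorage temp_storage[2];
    const int row = blockIdx.x;
    float acc0 = 0.f, acc1 = 0.f;
    for (int k = (threadIdx.y * 32 + threadIdx.x) * 4;   // index in u32 words
         k < p.words_per_row;
         k += 4 * 32 * 4) {                              // 128 threads x 4 words
        uint4 w = load_streaming(&p.qweight[row * p.words_per_row + k]);
        int   g = k / 16;                                // 16 words = one 128-weight group
        float s = __half2float(p.scales[row * p.groups_per_row + g]);
        float z = p.unsigned_quant ? 8.0f : 0.0f;        // default zero point (%rs84 << 3)
        if (p.zeros) {                                   // two 4-bit zeros per byte
            int zq = (p.zeros[(row * p.groups_per_row + g) / 2] >> ((g & 1) * 4)) & 0xF;
            z = p.unsigned_quant ? (float)zq : (float)((zq ^ 8) - 8);
        }
        #pragma unroll
        for (int j = 0; j < 32; ++j) {                   // the fully unrolled FMA chain
            float dw = s * ((float)decode_nibble(w, j, p.unsigned_quant) - z);
            acc0 += dw * __half2float(p.x[8 * k + j]);                    // column 0
            acc1 += dw * __half2float(p.x[p.in_features + 8 * k + j]);    // column 1
        }
    }
    float sum0 = BlockReduce(temp_storage[0]).Sum(acc0);
    float sum1 = BlockReduce(temp_storage[1]).Sum(acc1);
    if (threadIdx.x == 0 && threadIdx.y == 0) {          // $L__BB0_17 onwards
        float r0 = p.alpha * sum0, r1 = p.alpha * sum1;
        if (p.bias) {
            r0 += p.beta * __half2float(p.bias[row]);
            r1 += p.beta * __half2float(p.bias[p.out_stride + row]);
        }
        p.out[row]                = __float2half(r0);
        p.out[p.out_stride + row] = __float2half(r1);
    }
}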
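Finally, the reduction tail repeats one inline-asm shape five times with offsets 1, 2, 4, 8, 16: shfl.sync.down.b32 r0|p, v, off, 31, -1 followed by a predicate-guarded @p add.f32, which is how CUB emits a warp sum so that out-of-range lanes contribute nothing. With the plain CUDA intrinsic the same sum is usually written with descending offsets; lane 0 ends up holding the 32-lane total either way:

// Equivalent of the PTX warp phase using __shfl_down_sync instead of the
// predicate form CUB emits; only lane 0 holds the final sum.
__device__ inline float warp_sum(float v) {
    for (int offset = 16; offset > 0; offset >>= 1)
        v += __shfl_down_sync(0xffffffffu, v, offset);
    return v;
}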