p_storage has been demoted .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_2054f3506thrust6system6detail10sequential3seqE[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_2054f3506thrust12placeholders2_1E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_2054f3506thrust12placeholders2_2E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_2054f3506thrust12placeholders2_3E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_2054f3506thrust12placeholders2_4E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_2054f3506thrust12placeholders2_5E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_2054f3506thrust12placeholders2_6E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_2054f3506thrust12placeholders2_7E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_2054f3506thrust12placeholders2_8E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_2054f3506thrust12placeholders2_9E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_2054f3506thrust12placeholders3_10E[1]; .visible .entry _Z28dequant_gemv_group128_batch323DequantGemvKernelParams( .param .align 8 .b8 _Z28dequant_gemv_group128_batch323DequantGemvKernelParams_param_0[80] ) { .reg .pred %p<16>; .reg .b16 %rs<463>; .reg .f32 %f<689>; .reg .b32 %r<340>; .reg .b64 %rd<47>; // demoted variable .shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi128ELi3EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[1920]; ld.param.v2.u32 {%r40, %r41}, [_Z28dequant_gemv_group128_batch323DequantGemvKernelParams_param_0+48]; ld.param.v2.u32 {%r42, %r43}, [_Z28dequant_gemv_group128_batch323DequantGemvKernelParams_param_0+56]; ld.param.v2.f32 {%f32, %f33}, [_Z28dequant_gemv_group128_batch323DequantGemvKernelParams_param_0+64]; ld.param.v4.u8 {%rs67, %rs68, %rs69, %rs70}, [_Z28dequant_gemv_group128_batch323DequantGemvKernelParams_param_0+72]; ld.param.u64 %rd20, [_Z28dequant_gemv_group128_batch323DequantGemvKernelParams_param_0+40]; ld.param.u64 %rd19, 
[_Z28dequant_gemv_group128_batch323DequantGemvKernelParams_param_0+32]; ld.param.u64 %rd18, [_Z28dequant_gemv_group128_batch323DequantGemvKernelParams_param_0+24]; ld.param.u64 %rd17, [_Z28dequant_gemv_group128_batch323DequantGemvKernelParams_param_0+16]; mov.u32 %r339, %tid.y; shl.b32 %r44, %r339, 5; mov.u32 %r45, %tid.x; add.s32 %r338, %r44, %r45; shl.b32 %r337, %r338, 2; setp.ge.u32 %p1, %r337, %r42; mov.f32 %f680, 0f00000000; mov.f32 %f681, %f680; mov.f32 %f682, %f680; @%p1 bra $L__BB0_9; cvta.to.global.u64 %rd2, %rd17; mov.u32 %r46, %ctaid.x; mul.lo.s32 %r6, %r43, %r46; shl.b16 %rs2, %rs67, 3; cvta.to.global.u64 %rd3, %rd18; cvta.to.global.u64 %rd4, %rd20; $L__BB0_2: mad.lo.s32 %r48, %r42, %r46, %r337; mul.wide.u32 %rd21, %r48, 4; add.s64 %rd22, %rd3, %rd21; ld.global.v4.u32 {%r49, %r50, %r51, %r52}, [%rd22]; shr.u32 %r54, %r45, 2; shl.b32 %r55, %r339, 3; add.s32 %r14, %r55, %r54; add.s32 %r15, %r14, %r6; mul.wide.s32 %rd23, %r15, 2; add.s64 %rd24, %rd4, %rd23; ld.global.u16 %rs75, [%rd24]; // begin inline asm { cvt.f32.f16 %f37, %rs75;} // end inline asm setp.eq.s64 %p2, %rd19, 0; mov.u16 %rs462, %rs2; @%p2 bra $L__BB0_4; shr.u32 %r56, %r15, 31; add.s32 %r57, %r15, %r56; shr.s32 %r58, %r57, 1; cvt.s64.s32 %rd25, %r58; cvta.to.global.u64 %rd26, %rd19; add.s64 %rd27, %rd26, %rd25; ld.global.u8 %r59, [%rd27]; shl.b32 %r60, %r14, 2; and.b32 %r61, %r60, 4; shr.u32 %r62, %r59, %r61; cvt.u16.u32 %rs76, %r62; and.b16 %rs462, %rs76, 15; $L__BB0_4: shl.b32 %r16, %r338, 5; setp.ge.s32 %p3, %r16, %r40; @%p3 bra $L__BB0_8; setp.eq.s16 %p4, %rs67, 0; shr.u16 %rs78, %rs462, 3; and.b16 %rs79, %rs78, 1; setp.eq.b16 %p5, %rs79, 1; and.pred %p6, %p4, %p5; selp.b16 %rs80, -16, 0, %p6; or.b16 %rs81, %rs80, %rs462; cvt.s16.s8 %rs82, %rs81; cvt.rn.f32.s16 %f5, %rs82; mul.wide.s32 %rd28, %r16, 2; add.s64 %rd5, %rd2, %rd28; ld.global.v4.u32 {%r63, %r64, %r65, %r66}, [%rd5]; mul.wide.s32 %rd29, %r40, 2; add.s64 %rd30, %rd5, %rd29; ld.global.v4.u32 {%r67, %r68, %r69, %r70}, [%rd30]; 
add.s32 %r71, %r16, %r40; add.s32 %r72, %r71, %r40; mul.wide.s32 %rd31, %r72, 2; add.s64 %rd6, %rd2, %rd31; ld.global.v4.u32 {%r73, %r74, %r75, %r76}, [%rd6]; mov.b32 {%rs7, %rs6}, %r73; shr.u32 %r77, %r49, 4; cvt.u16.u32 %rs8, %r77; and.b16 %rs9, %rs8, 15; mov.b32 {%rs83, %rs10}, %r63; cvt.u16.u32 %rs84, %r49; shr.u16 %rs85, %rs84, 8; and.b16 %rs11, %rs85, 15; shr.u16 %rs12, %rs84, 12; shr.u32 %r78, %r49, 16; cvt.u16.u32 %rs86, %r78; and.b16 %rs13, %rs86, 15; shr.u32 %r79, %r49, 20; cvt.u16.u32 %rs87, %r79; and.b16 %rs14, %rs87, 15; shr.u32 %r80, %r49, 24; cvt.u16.u32 %rs88, %r80; and.b16 %rs15, %rs88, 15; shr.u32 %r81, %r49, 28; cvt.u16.u32 %rs16, %r81; add.s32 %r82, %r71, 8; mul.wide.s32 %rd32, %r82, 2; add.s64 %rd7, %rd2, %rd32; cvt.u16.u32 %rs89, %r50; and.b16 %rs17, %rs89, 15; shr.u32 %r83, %r50, 4; cvt.u16.u32 %rs18, %r83; and.b16 %rs19, %rs18, 15; shr.u32 %r84, %r50, 8; cvt.u16.u32 %rs20, %r84; and.b16 %rs21, %rs20, 15; shr.u32 %r85, %r50, 12; cvt.u16.u32 %rs22, %r85; and.b16 %rs23, %rs22, 15; shr.u32 %r86, %r50, 16; cvt.u16.u32 %rs24, %r86; and.b16 %rs25, %rs24, 15; shr.u32 %r87, %r50, 20; cvt.u16.u32 %rs26, %r87; and.b16 %rs27, %rs26, 15; shr.u32 %r88, %r50, 24; cvt.u16.u32 %rs28, %r88; and.b16 %rs29, %rs28, 15; shr.u32 %r89, %r50, 28; cvt.u16.u32 %rs30, %r89; cvt.u16.u32 %rs90, %r51; and.b16 %rs31, %rs90, 15; shr.u32 %r90, %r51, 4; cvt.u16.u32 %rs32, %r90; and.b16 %rs33, %rs32, 15; shr.u32 %r91, %r51, 8; cvt.u16.u32 %rs34, %r91; and.b16 %rs35, %rs34, 15; shr.u32 %r92, %r51, 12; cvt.u16.u32 %rs36, %r92; and.b16 %rs37, %rs36, 15; shr.u32 %r93, %r51, 16; cvt.u16.u32 %rs38, %r93; and.b16 %rs39, %rs38, 15; shr.u32 %r94, %r51, 20; cvt.u16.u32 %rs40, %r94; and.b16 %rs41, %rs40, 15; shr.u32 %r95, %r51, 24; cvt.u16.u32 %rs42, %r95; and.b16 %rs43, %rs42, 15; shr.u32 %r96, %r51, 28; cvt.u16.u32 %rs44, %r96; cvt.u16.u32 %rs91, %r52; and.b16 %rs45, %rs91, 15; shr.u32 %r97, %r52, 4; cvt.u16.u32 %rs46, %r97; and.b16 %rs47, %rs46, 15; shr.u32 %r98, %r52, 8; cvt.u16.u32 
%rs48, %r98; and.b16 %rs49, %rs48, 15; shr.u32 %r99, %r52, 12; cvt.u16.u32 %rs50, %r99; and.b16 %rs51, %rs50, 15; shr.u32 %r100, %r52, 16; cvt.u16.u32 %rs52, %r100; and.b16 %rs53, %rs52, 15; shr.u32 %r101, %r52, 20; cvt.u16.u32 %rs54, %r101; and.b16 %rs55, %rs54, 15; shr.u32 %r102, %r52, 24; cvt.u16.u32 %rs56, %r102; and.b16 %rs57, %rs56, 15; shr.u32 %r103, %r52, 28; cvt.u16.u32 %rs58, %r103; @%p4 bra $L__BB0_7; and.b16 %rs189, %rs84, 15; cvt.rn.f32.s16 %f134, %rs189; sub.ftz.f32 %f135, %f134, %f5; mul.ftz.f32 %f136, %f37, %f135; cvt.u16.u32 %rs92, %r63; // begin inline asm { cvt.f32.f16 %f38, %rs92;} // end inline asm fma.rn.ftz.f32 %f137, %f136, %f38, %f682; mov.b32 {%rs93, %rs96}, %r67; // begin inline asm { cvt.f32.f16 %f39, %rs93;} // end inline asm fma.rn.ftz.f32 %f138, %f136, %f39, %f681; // begin inline asm { cvt.f32.f16 %f40, %rs7;} // end inline asm fma.rn.ftz.f32 %f139, %f136, %f40, %f680; cvt.rn.f32.s16 %f140, %rs9; sub.ftz.f32 %f141, %f140, %f5; mul.ftz.f32 %f142, %f37, %f141; // begin inline asm { cvt.f32.f16 %f41, %rs10;} // end inline asm fma.rn.ftz.f32 %f143, %f142, %f41, %f137; // begin inline asm { cvt.f32.f16 %f42, %rs96;} // end inline asm fma.rn.ftz.f32 %f144, %f142, %f42, %f138; // begin inline asm { cvt.f32.f16 %f43, %rs6;} // end inline asm fma.rn.ftz.f32 %f145, %f142, %f43, %f139; cvt.rn.f32.s16 %f146, %rs11; sub.ftz.f32 %f147, %f146, %f5; mul.ftz.f32 %f148, %f37, %f147; mov.b32 {%rs98, %rs101}, %r64; // begin inline asm { cvt.f32.f16 %f44, %rs98;} // end inline asm fma.rn.ftz.f32 %f149, %f148, %f44, %f143; mov.b32 {%rs99, %rs102}, %r68; // begin inline asm { cvt.f32.f16 %f45, %rs99;} // end inline asm fma.rn.ftz.f32 %f150, %f148, %f45, %f144; mov.b32 {%rs100, %rs103}, %r74; // begin inline asm { cvt.f32.f16 %f46, %rs100;} // end inline asm fma.rn.ftz.f32 %f151, %f148, %f46, %f145; cvt.rn.f32.s16 %f152, %rs12; sub.ftz.f32 %f153, %f152, %f5; mul.ftz.f32 %f154, %f37, %f153; // begin inline asm { cvt.f32.f16 %f47, %rs101;} // end inline asm 
fma.rn.ftz.f32 %f155, %f154, %f47, %f149; // begin inline asm { cvt.f32.f16 %f48, %rs102;} // end inline asm fma.rn.ftz.f32 %f156, %f154, %f48, %f150; // begin inline asm { cvt.f32.f16 %f49, %rs103;} // end inline asm fma.rn.ftz.f32 %f157, %f154, %f49, %f151; cvt.rn.f32.s16 %f158, %rs13; sub.ftz.f32 %f159, %f158, %f5; mul.ftz.f32 %f160, %f37, %f159; mov.b32 {%rs104, %rs107}, %r65; // begin inline asm { cvt.f32.f16 %f50, %rs104;} // end inline asm fma.rn.ftz.f32 %f161, %f160, %f50, %f155; mov.b32 {%rs105, %rs108}, %r69; // begin inline asm { cvt.f32.f16 %f51, %rs105;} // end inline asm fma.rn.ftz.f32 %f162, %f160, %f51, %f156; mov.b32 {%rs106, %rs109}, %r75; // begin inline asm { cvt.f32.f16 %f52, %rs106;} // end inline asm fma.rn.ftz.f32 %f163, %f160, %f52, %f157; cvt.rn.f32.s16 %f164, %rs14; sub.ftz.f32 %f165, %f164, %f5; mul.ftz.f32 %f166, %f37, %f165; // begin inline asm { cvt.f32.f16 %f53, %rs107;} // end inline asm fma.rn.ftz.f32 %f167, %f166, %f53, %f161; // begin inline asm { cvt.f32.f16 %f54, %rs108;} // end inline asm fma.rn.ftz.f32 %f168, %f166, %f54, %f162; // begin inline asm { cvt.f32.f16 %f55, %rs109;} // end inline asm fma.rn.ftz.f32 %f169, %f166, %f55, %f163; cvt.rn.f32.s16 %f170, %rs15; sub.ftz.f32 %f171, %f170, %f5; mul.ftz.f32 %f172, %f37, %f171; mov.b32 {%rs110, %rs113}, %r66; // begin inline asm { cvt.f32.f16 %f56, %rs110;} // end inline asm fma.rn.ftz.f32 %f173, %f172, %f56, %f167; mov.b32 {%rs111, %rs114}, %r70; // begin inline asm { cvt.f32.f16 %f57, %rs111;} // end inline asm fma.rn.ftz.f32 %f174, %f172, %f57, %f168; mov.b32 {%rs112, %rs115}, %r76; // begin inline asm { cvt.f32.f16 %f58, %rs112;} // end inline asm fma.rn.ftz.f32 %f175, %f172, %f58, %f169; cvt.rn.f32.s16 %f176, %rs16; sub.ftz.f32 %f177, %f176, %f5; mul.ftz.f32 %f178, %f37, %f177; // begin inline asm { cvt.f32.f16 %f59, %rs113;} // end inline asm fma.rn.ftz.f32 %f179, %f178, %f59, %f173; // begin inline asm { cvt.f32.f16 %f60, %rs114;} // end inline asm fma.rn.ftz.f32 %f180, 
%f178, %f60, %f174; // begin inline asm { cvt.f32.f16 %f61, %rs115;} // end inline asm fma.rn.ftz.f32 %f181, %f178, %f61, %f175; ld.global.v4.u32 {%r104, %r105, %r106, %r107}, [%rd5+16]; ld.global.v4.u32 {%r112, %r113, %r114, %r115}, [%rd7]; ld.global.v4.u32 {%r120, %r121, %r122, %r123}, [%rd6+16]; cvt.rn.f32.s16 %f182, %rs17; sub.ftz.f32 %f183, %f182, %f5; mul.ftz.f32 %f184, %f37, %f183; mov.b32 {%rs116, %rs119}, %r104; // begin inline asm { cvt.f32.f16 %f62, %rs116;} // end inline asm fma.rn.ftz.f32 %f185, %f184, %f62, %f179; mov.b32 {%rs117, %rs120}, %r112; // begin inline asm { cvt.f32.f16 %f63, %rs117;} // end inline asm fma.rn.ftz.f32 %f186, %f184, %f63, %f180; mov.b32 {%rs118, %rs121}, %r120; // begin inline asm { cvt.f32.f16 %f64, %rs118;} // end inline asm fma.rn.ftz.f32 %f187, %f184, %f64, %f181; cvt.rn.f32.s16 %f188, %rs19; sub.ftz.f32 %f189, %f188, %f5; mul.ftz.f32 %f190, %f37, %f189; // begin inline asm { cvt.f32.f16 %f65, %rs119;} // end inline asm fma.rn.ftz.f32 %f191, %f190, %f65, %f185; // begin inline asm { cvt.f32.f16 %f66, %rs120;} // end inline asm fma.rn.ftz.f32 %f192, %f190, %f66, %f186; // begin inline asm { cvt.f32.f16 %f67, %rs121;} // end inline asm fma.rn.ftz.f32 %f193, %f190, %f67, %f187; cvt.rn.f32.s16 %f194, %rs21; sub.ftz.f32 %f195, %f194, %f5; mul.ftz.f32 %f196, %f37, %f195; mov.b32 {%rs122, %rs125}, %r105; // begin inline asm { cvt.f32.f16 %f68, %rs122;} // end inline asm fma.rn.ftz.f32 %f197, %f196, %f68, %f191; mov.b32 {%rs123, %rs126}, %r113; // begin inline asm { cvt.f32.f16 %f69, %rs123;} // end inline asm fma.rn.ftz.f32 %f198, %f196, %f69, %f192; mov.b32 {%rs124, %rs127}, %r121; // begin inline asm { cvt.f32.f16 %f70, %rs124;} // end inline asm fma.rn.ftz.f32 %f199, %f196, %f70, %f193; cvt.rn.f32.s16 %f200, %rs23; sub.ftz.f32 %f201, %f200, %f5; mul.ftz.f32 %f202, %f37, %f201; // begin inline asm { cvt.f32.f16 %f71, %rs125;} // end inline asm fma.rn.ftz.f32 %f203, %f202, %f71, %f197; // begin inline asm { cvt.f32.f16 %f72, 
%rs126;} // end inline asm fma.rn.ftz.f32 %f204, %f202, %f72, %f198; // begin inline asm { cvt.f32.f16 %f73, %rs127;} // end inline asm fma.rn.ftz.f32 %f205, %f202, %f73, %f199; cvt.rn.f32.s16 %f206, %rs25; sub.ftz.f32 %f207, %f206, %f5; mul.ftz.f32 %f208, %f37, %f207; mov.b32 {%rs128, %rs131}, %r106; // begin inline asm { cvt.f32.f16 %f74, %rs128;} // end inline asm fma.rn.ftz.f32 %f209, %f208, %f74, %f203; mov.b32 {%rs129, %rs132}, %r114; // begin inline asm { cvt.f32.f16 %f75, %rs129;} // end inline asm fma.rn.ftz.f32 %f210, %f208, %f75, %f204; mov.b32 {%rs130, %rs133}, %r122; // begin inline asm { cvt.f32.f16 %f76, %rs130;} // end inline asm fma.rn.ftz.f32 %f211, %f208, %f76, %f205; cvt.rn.f32.s16 %f212, %rs27; sub.ftz.f32 %f213, %f212, %f5; mul.ftz.f32 %f214, %f37, %f213; // begin inline asm { cvt.f32.f16 %f77, %rs131;} // end inline asm fma.rn.ftz.f32 %f215, %f214, %f77, %f209; // begin inline asm { cvt.f32.f16 %f78, %rs132;} // end inline asm fma.rn.ftz.f32 %f216, %f214, %f78, %f210; // begin inline asm { cvt.f32.f16 %f79, %rs133;} // end inline asm fma.rn.ftz.f32 %f217, %f214, %f79, %f211; cvt.rn.f32.s16 %f218, %rs29; sub.ftz.f32 %f219, %f218, %f5; mul.ftz.f32 %f220, %f37, %f219; mov.b32 {%rs134, %rs137}, %r107; // begin inline asm { cvt.f32.f16 %f80, %rs134;} // end inline asm fma.rn.ftz.f32 %f221, %f220, %f80, %f215; mov.b32 {%rs135, %rs138}, %r115; // begin inline asm { cvt.f32.f16 %f81, %rs135;} // end inline asm fma.rn.ftz.f32 %f222, %f220, %f81, %f216; mov.b32 {%rs136, %rs139}, %r123; // begin inline asm { cvt.f32.f16 %f82, %rs136;} // end inline asm fma.rn.ftz.f32 %f223, %f220, %f82, %f217; cvt.rn.f32.s16 %f224, %rs30; sub.ftz.f32 %f225, %f224, %f5; mul.ftz.f32 %f226, %f37, %f225; // begin inline asm { cvt.f32.f16 %f83, %rs137;} // end inline asm fma.rn.ftz.f32 %f227, %f226, %f83, %f221; // begin inline asm { cvt.f32.f16 %f84, %rs138;} // end inline asm fma.rn.ftz.f32 %f228, %f226, %f84, %f222; // begin inline asm { cvt.f32.f16 %f85, %rs139;} // end 
inline asm fma.rn.ftz.f32 %f229, %f226, %f85, %f223; ld.global.v4.u32 {%r128, %r129, %r130, %r131}, [%rd5+32]; ld.global.v4.u32 {%r136, %r137, %r138, %r139}, [%rd7+16]; ld.global.v4.u32 {%r144, %r145, %r146, %r147}, [%rd6+32]; cvt.rn.f32.s16 %f230, %rs31; sub.ftz.f32 %f231, %f230, %f5; mul.ftz.f32 %f232, %f37, %f231; mov.b32 {%rs140, %rs143}, %r128; // begin inline asm { cvt.f32.f16 %f86, %rs140;} // end inline asm fma.rn.ftz.f32 %f233, %f232, %f86, %f227; mov.b32 {%rs141, %rs144}, %r136; // begin inline asm { cvt.f32.f16 %f87, %rs141;} // end inline asm fma.rn.ftz.f32 %f234, %f232, %f87, %f228; mov.b32 {%rs142, %rs145}, %r144; // begin inline asm { cvt.f32.f16 %f88, %rs142;} // end inline asm fma.rn.ftz.f32 %f235, %f232, %f88, %f229; cvt.rn.f32.s16 %f236, %rs33; sub.ftz.f32 %f237, %f236, %f5; mul.ftz.f32 %f238, %f37, %f237; // begin inline asm { cvt.f32.f16 %f89, %rs143;} // end inline asm fma.rn.ftz.f32 %f239, %f238, %f89, %f233; // begin inline asm { cvt.f32.f16 %f90, %rs144;} // end inline asm fma.rn.ftz.f32 %f240, %f238, %f90, %f234; // begin inline asm { cvt.f32.f16 %f91, %rs145;} // end inline asm fma.rn.ftz.f32 %f241, %f238, %f91, %f235; cvt.rn.f32.s16 %f242, %rs35; sub.ftz.f32 %f243, %f242, %f5; mul.ftz.f32 %f244, %f37, %f243; mov.b32 {%rs146, %rs149}, %r129; // begin inline asm { cvt.f32.f16 %f92, %rs146;} // end inline asm fma.rn.ftz.f32 %f245, %f244, %f92, %f239; mov.b32 {%rs147, %rs150}, %r137; // begin inline asm { cvt.f32.f16 %f93, %rs147;} // end inline asm fma.rn.ftz.f32 %f246, %f244, %f93, %f240; mov.b32 {%rs148, %rs151}, %r145; // begin inline asm { cvt.f32.f16 %f94, %rs148;} // end inline asm fma.rn.ftz.f32 %f247, %f244, %f94, %f241; cvt.rn.f32.s16 %f248, %rs37; sub.ftz.f32 %f249, %f248, %f5; mul.ftz.f32 %f250, %f37, %f249; // begin inline asm { cvt.f32.f16 %f95, %rs149;} // end inline asm fma.rn.ftz.f32 %f251, %f250, %f95, %f245; // begin inline asm { cvt.f32.f16 %f96, %rs150;} // end inline asm fma.rn.ftz.f32 %f252, %f250, %f96, %f246; // 
begin inline asm { cvt.f32.f16 %f97, %rs151;} // end inline asm fma.rn.ftz.f32 %f253, %f250, %f97, %f247; cvt.rn.f32.s16 %f254, %rs39; sub.ftz.f32 %f255, %f254, %f5; mul.ftz.f32 %f256, %f37, %f255; mov.b32 {%rs152, %rs155}, %r130; // begin inline asm { cvt.f32.f16 %f98, %rs152;} // end inline asm fma.rn.ftz.f32 %f257, %f256, %f98, %f251; mov.b32 {%rs153, %rs156}, %r138; // begin inline asm { cvt.f32.f16 %f99, %rs153;} // end inline asm fma.rn.ftz.f32 %f258, %f256, %f99, %f252; mov.b32 {%rs154, %rs157}, %r146; // begin inline asm { cvt.f32.f16 %f100, %rs154;} // end inline asm fma.rn.ftz.f32 %f259, %f256, %f100, %f253; cvt.rn.f32.s16 %f260, %rs41; sub.ftz.f32 %f261, %f260, %f5; mul.ftz.f32 %f262, %f37, %f261; // begin inline asm { cvt.f32.f16 %f101, %rs155;} // end inline asm fma.rn.ftz.f32 %f263, %f262, %f101, %f257; // begin inline asm { cvt.f32.f16 %f102, %rs156;} // end inline asm fma.rn.ftz.f32 %f264, %f262, %f102, %f258; // begin inline asm { cvt.f32.f16 %f103, %rs157;} // end inline asm fma.rn.ftz.f32 %f265, %f262, %f103, %f259; cvt.rn.f32.s16 %f266, %rs43; sub.ftz.f32 %f267, %f266, %f5; mul.ftz.f32 %f268, %f37, %f267; mov.b32 {%rs158, %rs161}, %r131; // begin inline asm { cvt.f32.f16 %f104, %rs158;} // end inline asm fma.rn.ftz.f32 %f269, %f268, %f104, %f263; mov.b32 {%rs159, %rs162}, %r139; // begin inline asm { cvt.f32.f16 %f105, %rs159;} // end inline asm fma.rn.ftz.f32 %f270, %f268, %f105, %f264; mov.b32 {%rs160, %rs163}, %r147; // begin inline asm { cvt.f32.f16 %f106, %rs160;} // end inline asm fma.rn.ftz.f32 %f271, %f268, %f106, %f265; cvt.rn.f32.s16 %f272, %rs44; sub.ftz.f32 %f273, %f272, %f5; mul.ftz.f32 %f274, %f37, %f273; // begin inline asm { cvt.f32.f16 %f107, %rs161;} // end inline asm fma.rn.ftz.f32 %f275, %f274, %f107, %f269; // begin inline asm { cvt.f32.f16 %f108, %rs162;} // end inline asm fma.rn.ftz.f32 %f276, %f274, %f108, %f270; // begin inline asm { cvt.f32.f16 %f109, %rs163;} // end inline asm fma.rn.ftz.f32 %f277, %f274, %f109, %f271; 
ld.global.v4.u32 {%r152, %r153, %r154, %r155}, [%rd5+48]; ld.global.v4.u32 {%r160, %r161, %r162, %r163}, [%rd7+32]; ld.global.v4.u32 {%r168, %r169, %r170, %r171}, [%rd6+48]; cvt.rn.f32.s16 %f278, %rs45; sub.ftz.f32 %f279, %f278, %f5; mul.ftz.f32 %f280, %f37, %f279; mov.b32 {%rs164, %rs167}, %r152; // begin inline asm { cvt.f32.f16 %f110, %rs164;} // end inline asm fma.rn.ftz.f32 %f281, %f280, %f110, %f275; mov.b32 {%rs165, %rs168}, %r160; // begin inline asm { cvt.f32.f16 %f111, %rs165;} // end inline asm fma.rn.ftz.f32 %f282, %f280, %f111, %f276; mov.b32 {%rs166, %rs169}, %r168; // begin inline asm { cvt.f32.f16 %f112, %rs166;} // end inline asm fma.rn.ftz.f32 %f283, %f280, %f112, %f277; cvt.rn.f32.s16 %f284, %rs47; sub.ftz.f32 %f285, %f284, %f5; mul.ftz.f32 %f286, %f37, %f285; // begin inline asm { cvt.f32.f16 %f113, %rs167;} // end inline asm fma.rn.ftz.f32 %f287, %f286, %f113, %f281; // begin inline asm { cvt.f32.f16 %f114, %rs168;} // end inline asm fma.rn.ftz.f32 %f288, %f286, %f114, %f282; // begin inline asm { cvt.f32.f16 %f115, %rs169;} // end inline asm fma.rn.ftz.f32 %f289, %f286, %f115, %f283; cvt.rn.f32.s16 %f290, %rs49; sub.ftz.f32 %f291, %f290, %f5; mul.ftz.f32 %f292, %f37, %f291; mov.b32 {%rs170, %rs173}, %r153; // begin inline asm { cvt.f32.f16 %f116, %rs170;} // end inline asm fma.rn.ftz.f32 %f293, %f292, %f116, %f287; mov.b32 {%rs171, %rs174}, %r161; // begin inline asm { cvt.f32.f16 %f117, %rs171;} // end inline asm fma.rn.ftz.f32 %f294, %f292, %f117, %f288; mov.b32 {%rs172, %rs175}, %r169; // begin inline asm { cvt.f32.f16 %f118, %rs172;} // end inline asm fma.rn.ftz.f32 %f295, %f292, %f118, %f289; cvt.rn.f32.s16 %f296, %rs51; sub.ftz.f32 %f297, %f296, %f5; mul.ftz.f32 %f298, %f37, %f297; // begin inline asm { cvt.f32.f16 %f119, %rs173;} // end inline asm fma.rn.ftz.f32 %f299, %f298, %f119, %f293; // begin inline asm { cvt.f32.f16 %f120, %rs174;} // end inline asm fma.rn.ftz.f32 %f300, %f298, %f120, %f294; // begin inline asm { cvt.f32.f16 
%f121, %rs175;} // end inline asm fma.rn.ftz.f32 %f301, %f298, %f121, %f295; cvt.rn.f32.s16 %f302, %rs53; sub.ftz.f32 %f303, %f302, %f5; mul.ftz.f32 %f304, %f37, %f303; mov.b32 {%rs176, %rs179}, %r154; // begin inline asm { cvt.f32.f16 %f122, %rs176;} // end inline asm fma.rn.ftz.f32 %f305, %f304, %f122, %f299; mov.b32 {%rs177, %rs180}, %r162; // begin inline asm { cvt.f32.f16 %f123, %rs177;} // end inline asm fma.rn.ftz.f32 %f306, %f304, %f123, %f300; mov.b32 {%rs178, %rs181}, %r170; // begin inline asm { cvt.f32.f16 %f124, %rs178;} // end inline asm fma.rn.ftz.f32 %f307, %f304, %f124, %f301; cvt.rn.f32.s16 %f308, %rs55; sub.ftz.f32 %f309, %f308, %f5; mul.ftz.f32 %f310, %f37, %f309; // begin inline asm { cvt.f32.f16 %f125, %rs179;} // end inline asm fma.rn.ftz.f32 %f311, %f310, %f125, %f305; // begin inline asm { cvt.f32.f16 %f126, %rs180;} // end inline asm fma.rn.ftz.f32 %f312, %f310, %f126, %f306; // begin inline asm { cvt.f32.f16 %f127, %rs181;} // end inline asm fma.rn.ftz.f32 %f313, %f310, %f127, %f307; cvt.rn.f32.s16 %f314, %rs57; sub.ftz.f32 %f315, %f314, %f5; mul.ftz.f32 %f316, %f37, %f315; mov.b32 {%rs182, %rs185}, %r155; // begin inline asm { cvt.f32.f16 %f128, %rs182;} // end inline asm fma.rn.ftz.f32 %f317, %f316, %f128, %f311; mov.b32 {%rs183, %rs186}, %r163; // begin inline asm { cvt.f32.f16 %f129, %rs183;} // end inline asm fma.rn.ftz.f32 %f318, %f316, %f129, %f312; mov.b32 {%rs184, %rs187}, %r171; // begin inline asm { cvt.f32.f16 %f130, %rs184;} // end inline asm fma.rn.ftz.f32 %f319, %f316, %f130, %f313; cvt.rn.f32.s16 %f320, %rs58; sub.ftz.f32 %f321, %f320, %f5; mul.ftz.f32 %f322, %f37, %f321; // begin inline asm { cvt.f32.f16 %f131, %rs185;} // end inline asm fma.rn.ftz.f32 %f682, %f322, %f131, %f317; // begin inline asm { cvt.f32.f16 %f132, %rs186;} // end inline asm fma.rn.ftz.f32 %f681, %f322, %f132, %f318; // begin inline asm { cvt.f32.f16 %f133, %rs187;} // end inline asm fma.rn.ftz.f32 %f680, %f322, %f133, %f319; bra.uni $L__BB0_8; 
$L__BB0_7: shl.b16 %rs287, %rs84, 4; cvt.s16.s8 %rs288, %rs287; shr.s16 %rs289, %rs288, 7; and.b16 %rs290, %rs289, -16; and.b16 %rs291, %rs84, 15; or.b16 %rs292, %rs290, %rs291; cvt.rn.f32.s16 %f419, %rs292; sub.ftz.f32 %f420, %f419, %f5; mul.ftz.f32 %f421, %f37, %f420; cvt.u16.u32 %rs190, %r63; // begin inline asm { cvt.f32.f16 %f323, %rs190;} // end inline asm fma.rn.ftz.f32 %f422, %f421, %f323, %f682; mov.b32 {%rs191, %rs194}, %r67; // begin inline asm { cvt.f32.f16 %f324, %rs191;} // end inline asm fma.rn.ftz.f32 %f423, %f421, %f324, %f681; // begin inline asm { cvt.f32.f16 %f325, %rs7;} // end inline asm fma.rn.ftz.f32 %f424, %f421, %f325, %f680; shl.b16 %rs293, %rs8, 4; cvt.s16.s8 %rs294, %rs293; shr.s16 %rs295, %rs294, 7; and.b16 %rs296, %rs295, -16; or.b16 %rs297, %rs296, %rs9; cvt.rn.f32.s16 %f425, %rs297; sub.ftz.f32 %f426, %f425, %f5; mul.ftz.f32 %f427, %f37, %f426; // begin inline asm { cvt.f32.f16 %f326, %rs10;} // end inline asm fma.rn.ftz.f32 %f428, %f427, %f326, %f422; // begin inline asm { cvt.f32.f16 %f327, %rs194;} // end inline asm fma.rn.ftz.f32 %f429, %f427, %f327, %f423; // begin inline asm { cvt.f32.f16 %f328, %rs6;} // end inline asm fma.rn.ftz.f32 %f430, %f427, %f328, %f424; shr.u16 %rs299, %rs84, 4; cvt.s16.s8 %rs300, %rs299; shr.s16 %rs301, %rs300, 7; and.b16 %rs302, %rs301, -16; or.b16 %rs303, %rs302, %rs11; cvt.rn.f32.s16 %f431, %rs303; sub.ftz.f32 %f432, %f431, %f5; mul.ftz.f32 %f433, %f37, %f432; mov.b32 {%rs196, %rs199}, %r64; // begin inline asm { cvt.f32.f16 %f329, %rs196;} // end inline asm fma.rn.ftz.f32 %f434, %f433, %f329, %f428; mov.b32 {%rs197, %rs200}, %r68; // begin inline asm { cvt.f32.f16 %f330, %rs197;} // end inline asm fma.rn.ftz.f32 %f435, %f433, %f330, %f429; mov.b32 {%rs198, %rs201}, %r74; // begin inline asm { cvt.f32.f16 %f331, %rs198;} // end inline asm fma.rn.ftz.f32 %f436, %f433, %f331, %f430; shr.s16 %rs304, %rs84, 15; and.b16 %rs305, %rs304, -16; or.b16 %rs306, %rs305, %rs12; cvt.rn.f32.s16 %f437, %rs306; 
sub.ftz.f32 %f438, %f437, %f5; mul.ftz.f32 %f439, %f37, %f438; // begin inline asm { cvt.f32.f16 %f332, %rs199;} // end inline asm fma.rn.ftz.f32 %f440, %f439, %f332, %f434; // begin inline asm { cvt.f32.f16 %f333, %rs200;} // end inline asm fma.rn.ftz.f32 %f441, %f439, %f333, %f435; // begin inline asm { cvt.f32.f16 %f334, %rs201;} // end inline asm fma.rn.ftz.f32 %f442, %f439, %f334, %f436; shl.b16 %rs308, %rs86, 4; cvt.s16.s8 %rs309, %rs308; shr.s16 %rs310, %rs309, 7; and.b16 %rs311, %rs310, -16; or.b16 %rs312, %rs311, %rs13; cvt.rn.f32.s16 %f443, %rs312; sub.ftz.f32 %f444, %f443, %f5; mul.ftz.f32 %f445, %f37, %f444; mov.b32 {%rs202, %rs205}, %r65; // begin inline asm { cvt.f32.f16 %f335, %rs202;} // end inline asm fma.rn.ftz.f32 %f446, %f445, %f335, %f440; mov.b32 {%rs203, %rs206}, %r69; // begin inline asm { cvt.f32.f16 %f336, %rs203;} // end inline asm fma.rn.ftz.f32 %f447, %f445, %f336, %f441; mov.b32 {%rs204, %rs207}, %r75; // begin inline asm { cvt.f32.f16 %f337, %rs204;} // end inline asm fma.rn.ftz.f32 %f448, %f445, %f337, %f442; shl.b16 %rs314, %rs87, 4; cvt.s16.s8 %rs315, %rs314; shr.s16 %rs316, %rs315, 7; and.b16 %rs317, %rs316, -16; or.b16 %rs318, %rs317, %rs14; cvt.rn.f32.s16 %f449, %rs318; sub.ftz.f32 %f450, %f449, %f5; mul.ftz.f32 %f451, %f37, %f450; // begin inline asm { cvt.f32.f16 %f338, %rs205;} // end inline asm fma.rn.ftz.f32 %f452, %f451, %f338, %f446; // begin inline asm { cvt.f32.f16 %f339, %rs206;} // end inline asm fma.rn.ftz.f32 %f453, %f451, %f339, %f447; // begin inline asm { cvt.f32.f16 %f340, %rs207;} // end inline asm fma.rn.ftz.f32 %f454, %f451, %f340, %f448; shl.b16 %rs320, %rs88, 4; cvt.s16.s8 %rs321, %rs320; shr.s16 %rs322, %rs321, 7; and.b16 %rs323, %rs322, -16; or.b16 %rs324, %rs323, %rs15; cvt.rn.f32.s16 %f455, %rs324; sub.ftz.f32 %f456, %f455, %f5; mul.ftz.f32 %f457, %f37, %f456; mov.b32 {%rs208, %rs211}, %r66; // begin inline asm { cvt.f32.f16 %f341, %rs208;} // end inline asm fma.rn.ftz.f32 %f458, %f457, %f341, %f452; 
mov.b32 {%rs209, %rs212}, %r70; // begin inline asm { cvt.f32.f16 %f342, %rs209;} // end inline asm fma.rn.ftz.f32 %f459, %f457, %f342, %f453; mov.b32 {%rs210, %rs213}, %r76; // begin inline asm { cvt.f32.f16 %f343, %rs210;} // end inline asm fma.rn.ftz.f32 %f460, %f457, %f343, %f454; shl.b16 %rs325, %rs16, 4; cvt.s16.s8 %rs326, %rs325; shr.s16 %rs327, %rs326, 7; and.b16 %rs328, %rs327, -16; or.b16 %rs329, %rs328, %rs16; cvt.rn.f32.s16 %f461, %rs329; sub.ftz.f32 %f462, %f461, %f5; mul.ftz.f32 %f463, %f37, %f462; // begin inline asm { cvt.f32.f16 %f344, %rs211;} // end inline asm fma.rn.ftz.f32 %f464, %f463, %f344, %f458; // begin inline asm { cvt.f32.f16 %f345, %rs212;} // end inline asm fma.rn.ftz.f32 %f465, %f463, %f345, %f459; // begin inline asm { cvt.f32.f16 %f346, %rs213;} // end inline asm fma.rn.ftz.f32 %f466, %f463, %f346, %f460; ld.global.v4.u32 {%r179, %r180, %r181, %r182}, [%rd5+16]; ld.global.v4.u32 {%r187, %r188, %r189, %r190}, [%rd7]; ld.global.v4.u32 {%r195, %r196, %r197, %r198}, [%rd6+16]; shl.b16 %rs331, %rs89, 4; cvt.s16.s8 %rs332, %rs331; shr.s16 %rs333, %rs332, 7; and.b16 %rs334, %rs333, -16; or.b16 %rs335, %rs334, %rs17; cvt.rn.f32.s16 %f467, %rs335; sub.ftz.f32 %f468, %f467, %f5; mul.ftz.f32 %f469, %f37, %f468; mov.b32 {%rs214, %rs217}, %r179; // begin inline asm { cvt.f32.f16 %f347, %rs214;} // end inline asm fma.rn.ftz.f32 %f470, %f469, %f347, %f464; mov.b32 {%rs215, %rs218}, %r187; // begin inline asm { cvt.f32.f16 %f348, %rs215;} // end inline asm fma.rn.ftz.f32 %f471, %f469, %f348, %f465; mov.b32 {%rs216, %rs219}, %r195; // begin inline asm { cvt.f32.f16 %f349, %rs216;} // end inline asm fma.rn.ftz.f32 %f472, %f469, %f349, %f466; shl.b16 %rs336, %rs18, 4; cvt.s16.s8 %rs337, %rs336; shr.s16 %rs338, %rs337, 7; and.b16 %rs339, %rs338, -16; or.b16 %rs340, %rs339, %rs19; cvt.rn.f32.s16 %f473, %rs340; sub.ftz.f32 %f474, %f473, %f5; mul.ftz.f32 %f475, %f37, %f474; // begin inline asm { cvt.f32.f16 %f350, %rs217;} // end inline asm 
fma.rn.ftz.f32 %f476, %f475, %f350, %f470; // begin inline asm { cvt.f32.f16 %f351, %rs218;} // end inline asm fma.rn.ftz.f32 %f477, %f475, %f351, %f471; // begin inline asm { cvt.f32.f16 %f352, %rs219;} // end inline asm fma.rn.ftz.f32 %f478, %f475, %f352, %f472; shl.b16 %rs341, %rs20, 4; cvt.s16.s8 %rs342, %rs341; shr.s16 %rs343, %rs342, 7; and.b16 %rs344, %rs343, -16; or.b16 %rs345, %rs344, %rs21; cvt.rn.f32.s16 %f479, %rs345; sub.ftz.f32 %f480, %f479, %f5; mul.ftz.f32 %f481, %f37, %f480; mov.b32 {%rs220, %rs223}, %r180; // begin inline asm { cvt.f32.f16 %f353, %rs220;} // end inline asm fma.rn.ftz.f32 %f482, %f481, %f353, %f476; mov.b32 {%rs221, %rs224}, %r188; // begin inline asm { cvt.f32.f16 %f354, %rs221;} // end inline asm fma.rn.ftz.f32 %f483, %f481, %f354, %f477; mov.b32 {%rs222, %rs225}, %r196; // begin inline asm { cvt.f32.f16 %f355, %rs222;} // end inline asm fma.rn.ftz.f32 %f484, %f481, %f355, %f478; shl.b16 %rs346, %rs22, 4; cvt.s16.s8 %rs347, %rs346; shr.s16 %rs348, %rs347, 7; and.b16 %rs349, %rs348, -16; or.b16 %rs350, %rs349, %rs23; cvt.rn.f32.s16 %f485, %rs350; sub.ftz.f32 %f486, %f485, %f5; mul.ftz.f32 %f487, %f37, %f486; // begin inline asm { cvt.f32.f16 %f356, %rs223;} // end inline asm fma.rn.ftz.f32 %f488, %f487, %f356, %f482; // begin inline asm { cvt.f32.f16 %f357, %rs224;} // end inline asm fma.rn.ftz.f32 %f489, %f487, %f357, %f483; // begin inline asm { cvt.f32.f16 %f358, %rs225;} // end inline asm fma.rn.ftz.f32 %f490, %f487, %f358, %f484; shl.b16 %rs351, %rs24, 4; cvt.s16.s8 %rs352, %rs351; shr.s16 %rs353, %rs352, 7; and.b16 %rs354, %rs353, -16; or.b16 %rs355, %rs354, %rs25; cvt.rn.f32.s16 %f491, %rs355; sub.ftz.f32 %f492, %f491, %f5; mul.ftz.f32 %f493, %f37, %f492; mov.b32 {%rs226, %rs229}, %r181; // begin inline asm { cvt.f32.f16 %f359, %rs226;} // end inline asm fma.rn.ftz.f32 %f494, %f493, %f359, %f488; mov.b32 {%rs227, %rs230}, %r189; // begin inline asm { cvt.f32.f16 %f360, %rs227;} // end inline asm fma.rn.ftz.f32 %f495, %f493, 
%f360, %f489; mov.b32 {%rs228, %rs231}, %r197; // begin inline asm { cvt.f32.f16 %f361, %rs228;} // end inline asm fma.rn.ftz.f32 %f496, %f493, %f361, %f490; shl.b16 %rs356, %rs26, 4; cvt.s16.s8 %rs357, %rs356; shr.s16 %rs358, %rs357, 7; and.b16 %rs359, %rs358, -16; or.b16 %rs360, %rs359, %rs27; cvt.rn.f32.s16 %f497, %rs360; sub.ftz.f32 %f498, %f497, %f5; mul.ftz.f32 %f499, %f37, %f498; // begin inline asm { cvt.f32.f16 %f362, %rs229;} // end inline asm fma.rn.ftz.f32 %f500, %f499, %f362, %f494; // begin inline asm { cvt.f32.f16 %f363, %rs230;} // end inline asm fma.rn.ftz.f32 %f501, %f499, %f363, %f495; // begin inline asm { cvt.f32.f16 %f364, %rs231;} // end inline asm fma.rn.ftz.f32 %f502, %f499, %f364, %f496; shl.b16 %rs361, %rs28, 4; cvt.s16.s8 %rs362, %rs361; shr.s16 %rs363, %rs362, 7; and.b16 %rs364, %rs363, -16; or.b16 %rs365, %rs364, %rs29; cvt.rn.f32.s16 %f503, %rs365; sub.ftz.f32 %f504, %f503, %f5; mul.ftz.f32 %f505, %f37, %f504; mov.b32 {%rs232, %rs235}, %r182; // begin inline asm { cvt.f32.f16 %f365, %rs232;} // end inline asm fma.rn.ftz.f32 %f506, %f505, %f365, %f500; mov.b32 {%rs233, %rs236}, %r190; // begin inline asm { cvt.f32.f16 %f366, %rs233;} // end inline asm fma.rn.ftz.f32 %f507, %f505, %f366, %f501; mov.b32 {%rs234, %rs237}, %r198; // begin inline asm { cvt.f32.f16 %f367, %rs234;} // end inline asm fma.rn.ftz.f32 %f508, %f505, %f367, %f502; shl.b16 %rs366, %rs30, 4; cvt.s16.s8 %rs367, %rs366; shr.s16 %rs368, %rs367, 7; and.b16 %rs369, %rs368, -16; or.b16 %rs370, %rs369, %rs30; cvt.rn.f32.s16 %f509, %rs370; sub.ftz.f32 %f510, %f509, %f5; mul.ftz.f32 %f511, %f37, %f510; // begin inline asm { cvt.f32.f16 %f368, %rs235;} // end inline asm fma.rn.ftz.f32 %f512, %f511, %f368, %f506; // begin inline asm { cvt.f32.f16 %f369, %rs236;} // end inline asm fma.rn.ftz.f32 %f513, %f511, %f369, %f507; // begin inline asm { cvt.f32.f16 %f370, %rs237;} // end inline asm fma.rn.ftz.f32 %f514, %f511, %f370, %f508; ld.global.v4.u32 {%r203, %r204, %r205, %r206}, 
[%rd5+32]; ld.global.v4.u32 {%r211, %r212, %r213, %r214}, [%rd7+16]; ld.global.v4.u32 {%r219, %r220, %r221, %r222}, [%rd6+32]; shl.b16 %rs372, %rs90, 4; cvt.s16.s8 %rs373, %rs372; shr.s16 %rs374, %rs373, 7; and.b16 %rs375, %rs374, -16; or.b16 %rs376, %rs375, %rs31; cvt.rn.f32.s16 %f515, %rs376; sub.ftz.f32 %f516, %f515, %f5; mul.ftz.f32 %f517, %f37, %f516; mov.b32 {%rs238, %rs241}, %r203; // begin inline asm { cvt.f32.f16 %f371, %rs238;} // end inline asm fma.rn.ftz.f32 %f518, %f517, %f371, %f512; mov.b32 {%rs239, %rs242}, %r211; // begin inline asm { cvt.f32.f16 %f372, %rs239;} // end inline asm fma.rn.ftz.f32 %f519, %f517, %f372, %f513; mov.b32 {%rs240, %rs243}, %r219; // begin inline asm { cvt.f32.f16 %f373, %rs240;} // end inline asm fma.rn.ftz.f32 %f520, %f517, %f373, %f514; shl.b16 %rs377, %rs32, 4; cvt.s16.s8 %rs378, %rs377; shr.s16 %rs379, %rs378, 7; and.b16 %rs380, %rs379, -16; or.b16 %rs381, %rs380, %rs33; cvt.rn.f32.s16 %f521, %rs381; sub.ftz.f32 %f522, %f521, %f5; mul.ftz.f32 %f523, %f37, %f522; // begin inline asm { cvt.f32.f16 %f374, %rs241;} // end inline asm fma.rn.ftz.f32 %f524, %f523, %f374, %f518; // begin inline asm { cvt.f32.f16 %f375, %rs242;} // end inline asm fma.rn.ftz.f32 %f525, %f523, %f375, %f519; // begin inline asm { cvt.f32.f16 %f376, %rs243;} // end inline asm fma.rn.ftz.f32 %f526, %f523, %f376, %f520; shl.b16 %rs382, %rs34, 4; cvt.s16.s8 %rs383, %rs382; shr.s16 %rs384, %rs383, 7; and.b16 %rs385, %rs384, -16; or.b16 %rs386, %rs385, %rs35; cvt.rn.f32.s16 %f527, %rs386; sub.ftz.f32 %f528, %f527, %f5; mul.ftz.f32 %f529, %f37, %f528; mov.b32 {%rs244, %rs247}, %r204; // begin inline asm { cvt.f32.f16 %f377, %rs244;} // end inline asm fma.rn.ftz.f32 %f530, %f529, %f377, %f524; mov.b32 {%rs245, %rs248}, %r212; // begin inline asm { cvt.f32.f16 %f378, %rs245;} // end inline asm fma.rn.ftz.f32 %f531, %f529, %f378, %f525; mov.b32 {%rs246, %rs249}, %r220; // begin inline asm { cvt.f32.f16 %f379, %rs246;} // end inline asm fma.rn.ftz.f32 %f532, 
%f529, %f379, %f526; shl.b16 %rs387, %rs36, 4; cvt.s16.s8 %rs388, %rs387; shr.s16 %rs389, %rs388, 7; and.b16 %rs390, %rs389, -16; or.b16 %rs391, %rs390, %rs37; cvt.rn.f32.s16 %f533, %rs391; sub.ftz.f32 %f534, %f533, %f5; mul.ftz.f32 %f535, %f37, %f534; // begin inline asm { cvt.f32.f16 %f380, %rs247;} // end inline asm fma.rn.ftz.f32 %f536, %f535, %f380, %f530; // begin inline asm { cvt.f32.f16 %f381, %rs248;} // end inline asm fma.rn.ftz.f32 %f537, %f535, %f381, %f531; // begin inline asm { cvt.f32.f16 %f382, %rs249;} // end inline asm fma.rn.ftz.f32 %f538, %f535, %f382, %f532; shl.b16 %rs392, %rs38, 4; cvt.s16.s8 %rs393, %rs392; shr.s16 %rs394, %rs393, 7; and.b16 %rs395, %rs394, -16; or.b16 %rs396, %rs395, %rs39; cvt.rn.f32.s16 %f539, %rs396; sub.ftz.f32 %f540, %f539, %f5; mul.ftz.f32 %f541, %f37, %f540; mov.b32 {%rs250, %rs253}, %r205; // begin inline asm { cvt.f32.f16 %f383, %rs250;} // end inline asm fma.rn.ftz.f32 %f542, %f541, %f383, %f536; mov.b32 {%rs251, %rs254}, %r213; // begin inline asm { cvt.f32.f16 %f384, %rs251;} // end inline asm fma.rn.ftz.f32 %f543, %f541, %f384, %f537; mov.b32 {%rs252, %rs255}, %r221; // begin inline asm { cvt.f32.f16 %f385, %rs252;} // end inline asm fma.rn.ftz.f32 %f544, %f541, %f385, %f538; shl.b16 %rs397, %rs40, 4; cvt.s16.s8 %rs398, %rs397; shr.s16 %rs399, %rs398, 7; and.b16 %rs400, %rs399, -16; or.b16 %rs401, %rs400, %rs41; cvt.rn.f32.s16 %f545, %rs401; sub.ftz.f32 %f546, %f545, %f5; mul.ftz.f32 %f547, %f37, %f546; // begin inline asm { cvt.f32.f16 %f386, %rs253;} // end inline asm fma.rn.ftz.f32 %f548, %f547, %f386, %f542; // begin inline asm { cvt.f32.f16 %f387, %rs254;} // end inline asm fma.rn.ftz.f32 %f549, %f547, %f387, %f543; // begin inline asm { cvt.f32.f16 %f388, %rs255;} // end inline asm fma.rn.ftz.f32 %f550, %f547, %f388, %f544; shl.b16 %rs402, %rs42, 4; cvt.s16.s8 %rs403, %rs402; shr.s16 %rs404, %rs403, 7; and.b16 %rs405, %rs404, -16; or.b16 %rs406, %rs405, %rs43; cvt.rn.f32.s16 %f551, %rs406; sub.ftz.f32 
%f552, %f551, %f5; mul.ftz.f32 %f553, %f37, %f552; mov.b32 {%rs256, %rs259}, %r206; // begin inline asm { cvt.f32.f16 %f389, %rs256;} // end inline asm fma.rn.ftz.f32 %f554, %f553, %f389, %f548; mov.b32 {%rs257, %rs260}, %r214; // begin inline asm { cvt.f32.f16 %f390, %rs257;} // end inline asm fma.rn.ftz.f32 %f555, %f553, %f390, %f549; mov.b32 {%rs258, %rs261}, %r222; // begin inline asm { cvt.f32.f16 %f391, %rs258;} // end inline asm fma.rn.ftz.f32 %f556, %f553, %f391, %f550; shl.b16 %rs407, %rs44, 4; cvt.s16.s8 %rs408, %rs407; shr.s16 %rs409, %rs408, 7; and.b16 %rs410, %rs409, -16; or.b16 %rs411, %rs410, %rs44; cvt.rn.f32.s16 %f557, %rs411; sub.ftz.f32 %f558, %f557, %f5; mul.ftz.f32 %f559, %f37, %f558; // begin inline asm { cvt.f32.f16 %f392, %rs259;} // end inline asm fma.rn.ftz.f32 %f560, %f559, %f392, %f554; // begin inline asm { cvt.f32.f16 %f393, %rs260;} // end inline asm fma.rn.ftz.f32 %f561, %f559, %f393, %f555; // begin inline asm { cvt.f32.f16 %f394, %rs261;} // end inline asm fma.rn.ftz.f32 %f562, %f559, %f394, %f556; ld.global.v4.u32 {%r227, %r228, %r229, %r230}, [%rd5+48]; ld.global.v4.u32 {%r235, %r236, %r237, %r238}, [%rd7+32]; ld.global.v4.u32 {%r243, %r244, %r245, %r246}, [%rd6+48]; shl.b16 %rs413, %rs91, 4; cvt.s16.s8 %rs414, %rs413; shr.s16 %rs415, %rs414, 7; and.b16 %rs416, %rs415, -16; or.b16 %rs417, %rs416, %rs45; cvt.rn.f32.s16 %f563, %rs417; sub.ftz.f32 %f564, %f563, %f5; mul.ftz.f32 %f565, %f37, %f564; mov.b32 {%rs262, %rs265}, %r227; // begin inline asm { cvt.f32.f16 %f395, %rs262;} // end inline asm fma.rn.ftz.f32 %f566, %f565, %f395, %f560; mov.b32 {%rs263, %rs266}, %r235; // begin inline asm { cvt.f32.f16 %f396, %rs263;} // end inline asm fma.rn.ftz.f32 %f567, %f565, %f396, %f561; mov.b32 {%rs264, %rs267}, %r243; // begin inline asm { cvt.f32.f16 %f397, %rs264;} // end inline asm fma.rn.ftz.f32 %f568, %f565, %f397, %f562; shl.b16 %rs418, %rs46, 4; cvt.s16.s8 %rs419, %rs418; shr.s16 %rs420, %rs419, 7; and.b16 %rs421, %rs420, -16; 
or.b16 %rs422, %rs421, %rs47; cvt.rn.f32.s16 %f569, %rs422; sub.ftz.f32 %f570, %f569, %f5; mul.ftz.f32 %f571, %f37, %f570; // begin inline asm { cvt.f32.f16 %f398, %rs265;} // end inline asm fma.rn.ftz.f32 %f572, %f571, %f398, %f566; // begin inline asm { cvt.f32.f16 %f399, %rs266;} // end inline asm fma.rn.ftz.f32 %f573, %f571, %f399, %f567; // begin inline asm { cvt.f32.f16 %f400, %rs267;} // end inline asm fma.rn.ftz.f32 %f574, %f571, %f400, %f568; shl.b16 %rs423, %rs48, 4; cvt.s16.s8 %rs424, %rs423; shr.s16 %rs425, %rs424, 7; and.b16 %rs426, %rs425, -16; or.b16 %rs427, %rs426, %rs49; cvt.rn.f32.s16 %f575, %rs427; sub.ftz.f32 %f576, %f575, %f5; mul.ftz.f32 %f577, %f37, %f576; mov.b32 {%rs268, %rs271}, %r228; // begin inline asm { cvt.f32.f16 %f401, %rs268;} // end inline asm fma.rn.ftz.f32 %f578, %f577, %f401, %f572; mov.b32 {%rs269, %rs272}, %r236; // begin inline asm { cvt.f32.f16 %f402, %rs269;} // end inline asm fma.rn.ftz.f32 %f579, %f577, %f402, %f573; mov.b32 {%rs270, %rs273}, %r244; // begin inline asm { cvt.f32.f16 %f403, %rs270;} // end inline asm fma.rn.ftz.f32 %f580, %f577, %f403, %f574; shl.b16 %rs428, %rs50, 4; cvt.s16.s8 %rs429, %rs428; shr.s16 %rs430, %rs429, 7; and.b16 %rs431, %rs430, -16; or.b16 %rs432, %rs431, %rs51; cvt.rn.f32.s16 %f581, %rs432; sub.ftz.f32 %f582, %f581, %f5; mul.ftz.f32 %f583, %f37, %f582; // begin inline asm { cvt.f32.f16 %f404, %rs271;} // end inline asm fma.rn.ftz.f32 %f584, %f583, %f404, %f578; // begin inline asm { cvt.f32.f16 %f405, %rs272;} // end inline asm fma.rn.ftz.f32 %f585, %f583, %f405, %f579; // begin inline asm { cvt.f32.f16 %f406, %rs273;} // end inline asm fma.rn.ftz.f32 %f586, %f583, %f406, %f580; shl.b16 %rs433, %rs52, 4; cvt.s16.s8 %rs434, %rs433; shr.s16 %rs435, %rs434, 7; and.b16 %rs436, %rs435, -16; or.b16 %rs437, %rs436, %rs53; cvt.rn.f32.s16 %f587, %rs437; sub.ftz.f32 %f588, %f587, %f5; mul.ftz.f32 %f589, %f37, %f588; mov.b32 {%rs274, %rs277}, %r229; // begin inline asm { cvt.f32.f16 %f407, %rs274;} 
// end inline asm fma.rn.ftz.f32 %f590, %f589, %f407, %f584; mov.b32 {%rs275, %rs278}, %r237; // begin inline asm { cvt.f32.f16 %f408, %rs275;} // end inline asm fma.rn.ftz.f32 %f591, %f589, %f408, %f585; mov.b32 {%rs276, %rs279}, %r245; // begin inline asm { cvt.f32.f16 %f409, %rs276;} // end inline asm fma.rn.ftz.f32 %f592, %f589, %f409, %f586; shl.b16 %rs438, %rs54, 4; cvt.s16.s8 %rs439, %rs438; shr.s16 %rs440, %rs439, 7; and.b16 %rs441, %rs440, -16; or.b16 %rs442, %rs441, %rs55; cvt.rn.f32.s16 %f593, %rs442; sub.ftz.f32 %f594, %f593, %f5; mul.ftz.f32 %f595, %f37, %f594; // begin inline asm { cvt.f32.f16 %f410, %rs277;} // end inline asm fma.rn.ftz.f32 %f596, %f595, %f410, %f590; // begin inline asm { cvt.f32.f16 %f411, %rs278;} // end inline asm fma.rn.ftz.f32 %f597, %f595, %f411, %f591; // begin inline asm { cvt.f32.f16 %f412, %rs279;} // end inline asm fma.rn.ftz.f32 %f598, %f595, %f412, %f592; shl.b16 %rs443, %rs56, 4; cvt.s16.s8 %rs444, %rs443; shr.s16 %rs445, %rs444, 7; and.b16 %rs446, %rs445, -16; or.b16 %rs447, %rs446, %rs57; cvt.rn.f32.s16 %f599, %rs447; sub.ftz.f32 %f600, %f599, %f5; mul.ftz.f32 %f601, %f37, %f600; mov.b32 {%rs280, %rs283}, %r230; // begin inline asm { cvt.f32.f16 %f413, %rs280;} // end inline asm fma.rn.ftz.f32 %f602, %f601, %f413, %f596; mov.b32 {%rs281, %rs284}, %r238; // begin inline asm { cvt.f32.f16 %f414, %rs281;} // end inline asm fma.rn.ftz.f32 %f603, %f601, %f414, %f597; mov.b32 {%rs282, %rs285}, %r246; // begin inline asm { cvt.f32.f16 %f415, %rs282;} // end inline asm fma.rn.ftz.f32 %f604, %f601, %f415, %f598; shl.b16 %rs448, %rs58, 4; cvt.s16.s8 %rs449, %rs448; shr.s16 %rs450, %rs449, 7; and.b16 %rs451, %rs450, -16; or.b16 %rs452, %rs451, %rs58; cvt.rn.f32.s16 %f605, %rs452; sub.ftz.f32 %f606, %f605, %f5; mul.ftz.f32 %f607, %f37, %f606; // begin inline asm { cvt.f32.f16 %f416, %rs283;} // end inline asm fma.rn.ftz.f32 %f682, %f607, %f416, %f602; // begin inline asm { cvt.f32.f16 %f417, %rs284;} // end inline asm 
fma.rn.ftz.f32 %f681, %f607, %f417, %f603; // begin inline asm { cvt.f32.f16 %f418, %rs285;} // end inline asm fma.rn.ftz.f32 %f680, %f607, %f418, %f604; $L__BB0_8: add.s32 %r339, %r339, 4; shl.b32 %r251, %r339, 5; add.s32 %r338, %r251, %r45; shl.b32 %r337, %r338, 2; setp.lt.u32 %p7, %r337, %r42; @%p7 bra $L__BB0_2; $L__BB0_9: mov.u32 %r253, %tid.y; shl.b32 %r254, %r253, 5; add.s32 %r33, %r254, %r45; setp.lt.u32 %p8, %r33, 32; shl.b32 %r256, %r33, 2; mov.u32 %r257, _ZZ9gemv_int4ILi4ELi128ELi3EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage; add.s32 %r258, %r257, %r256; @%p8 bra $L__BB0_11; add.s32 %r334, %r258, -112; st.shared.f32 [%r334], %f682; $L__BB0_11: setp.gt.u32 %p9, %r33, 31; bar.sync 0; mad.lo.s32 %r35, %r33, 12, %r257; @%p9 bra $L__BB0_13; mov.u32 %r277, 16; ld.shared.f32 %f623, [%r35+16]; add.ftz.f32 %f624, %f682, %f623; ld.shared.f32 %f625, [%r35+20]; add.ftz.f32 %f626, %f624, %f625; ld.shared.f32 %f627, [%r35+24]; add.ftz.f32 %f610, %f626, %f627; mov.u32 %r265, 1; mov.u32 %r278, 31; mov.u32 %r279, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f610, %r265, %r278, %r279; @p add.f32 r0, r0, %f610; mov.f32 %f608, r0;} // end inline asm mov.u32 %r268, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f608, %r268, %r278, %r279; @p add.f32 r0, r0, %f608; mov.f32 %f611, r0;} // end inline asm mov.u32 %r271, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f611, %r271, %r278, %r279; @p add.f32 r0, r0, %f611; mov.f32 %f614, r0;} // end inline asm mov.u32 %r274, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f614, %r274, %r278, %r279; @p add.f32 r0, r0, %f614; mov.f32 %f617, r0;} // end inline asm // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f617, %r277, %r278, %r279; @p add.f32 r0, r0, %f617; mov.f32 %f682, r0;} // end inline asm $L__BB0_13: @%p8 bra $L__BB0_15; add.s32 %r335, %r258, -112; st.shared.f32 
[%r335+640], %f681; $L__BB0_15: bar.sync 0; @%p9 bra $L__BB0_17; ld.shared.f32 %f643, [%r35+656]; add.ftz.f32 %f644, %f681, %f643; ld.shared.f32 %f645, [%r35+660]; add.ftz.f32 %f646, %f644, %f645; ld.shared.f32 %f647, [%r35+664]; add.ftz.f32 %f630, %f646, %f647; mov.u32 %r289, 1; mov.u32 %r302, 31; mov.u32 %r303, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f630, %r289, %r302, %r303; @p add.f32 r0, r0, %f630; mov.f32 %f628, r0;} // end inline asm mov.u32 %r292, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f628, %r292, %r302, %r303; @p add.f32 r0, r0, %f628; mov.f32 %f631, r0;} // end inline asm mov.u32 %r295, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f631, %r295, %r302, %r303; @p add.f32 r0, r0, %f631; mov.f32 %f634, r0;} // end inline asm mov.u32 %r298, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f634, %r298, %r302, %r303; @p add.f32 r0, r0, %f634; mov.f32 %f637, r0;} // end inline asm mov.u32 %r301, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f637, %r301, %r302, %r303; @p add.f32 r0, r0, %f637; mov.f32 %f681, r0;} // end inline asm $L__BB0_17: @%p8 bra $L__BB0_19; add.s32 %r336, %r258, -112; st.shared.f32 [%r336+1280], %f680; $L__BB0_19: bar.sync 0; @%p9 bra $L__BB0_21; ld.shared.f32 %f663, [%r35+1296]; add.ftz.f32 %f664, %f680, %f663; ld.shared.f32 %f665, [%r35+1300]; add.ftz.f32 %f666, %f664, %f665; ld.shared.f32 %f667, [%r35+1304]; add.ftz.f32 %f650, %f666, %f667; mov.u32 %r313, 1; mov.u32 %r326, 31; mov.u32 %r327, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f650, %r313, %r326, %r327; @p add.f32 r0, r0, %f650; mov.f32 %f648, r0;} // end inline asm mov.u32 %r316, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f648, %r316, %r326, %r327; @p add.f32 r0, r0, %f648; mov.f32 %f651, r0;} // end inline asm mov.u32 %r319, 4; // 
begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f651, %r319, %r326, %r327; @p add.f32 r0, r0, %f651; mov.f32 %f654, r0;} // end inline asm mov.u32 %r322, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f654, %r322, %r326, %r327; @p add.f32 r0, r0, %f654; mov.f32 %f657, r0;} // end inline asm mov.u32 %r325, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f657, %r325, %r326, %r327; @p add.f32 r0, r0, %f657; mov.f32 %f680, r0;} // end inline asm $L__BB0_21: or.b32 %r330, %r45, %r253; setp.ne.s32 %p14, %r330, 0; @%p14 bra $L__BB0_25; ld.param.u64 %rd45, [_Z28dequant_gemv_group128_batch323DequantGemvKernelParams_param_0+8]; ld.param.u64 %rd44, [_Z28dequant_gemv_group128_batch323DequantGemvKernelParams_param_0]; cvta.to.global.u64 %rd33, %rd44; setp.eq.s64 %p15, %rd45, 0; mul.ftz.f32 %f24, %f32, %f682; mov.u32 %r331, %ctaid.x; cvt.s64.s32 %rd9, %r331; mul.wide.s32 %rd34, %r331, 2; add.s64 %rd10, %rd33, %rd34; mul.ftz.f32 %f25, %f32, %f681; add.s32 %r332, %r41, %r331; cvt.s64.s32 %rd11, %r41; mul.wide.s32 %rd35, %r41, 2; add.s64 %rd12, %rd10, %rd35; mul.ftz.f32 %f26, %f32, %f680; add.s32 %r333, %r332, %r41; cvt.s64.s32 %rd13, %r333; mul.wide.s32 %rd36, %r333, 2; add.s64 %rd14, %rd33, %rd36; @%p15 bra $L__BB0_24; ld.param.u64 %rd46, [_Z28dequant_gemv_group128_batch323DequantGemvKernelParams_param_0+8]; cvta.to.global.u64 %rd37, %rd46; shl.b64 %rd38, %rd9, 1; add.s64 %rd39, %rd37, %rd38; ld.global.u16 %rs453, [%rd39]; // begin inline asm { cvt.f32.f16 %f668, %rs453;} // end inline asm fma.rn.ftz.f32 %f669, %f33, %f668, %f24; // begin inline asm { cvt.rn.f16.f32 %rs454, %f669;} // end inline asm st.global.u16 [%rd10], %rs454; shl.b64 %rd40, %rd11, 1; add.s64 %rd41, %rd39, %rd40; ld.global.u16 %rs455, [%rd41]; // begin inline asm { cvt.f32.f16 %f670, %rs455;} // end inline asm fma.rn.ftz.f32 %f671, %f33, %f670, %f25; // begin inline asm { cvt.rn.f16.f32 %rs456, %f671;} // end inline asm 
st.global.u16 [%rd12], %rs456;            // store 2nd output element (half) at out + ldc
shl.b64 %rd42, %rd13, 1;                  // byte offset for 3rd row: index * sizeof(half)
add.s64 %rd43, %rd37, %rd42;
ld.global.u16 %rs457, [%rd43];            // load half from param+8 buffer (presumably bias/residual — confirm against host code)
// begin inline asm
{ cvt.f32.f16 %f672, %rs457;}
// end inline asm
fma.rn.ftz.f32 %f673, %f33, %f672, %f26;  // f33 (2nd float param, likely beta) * loaded value + f26 (scaled accumulator)
// begin inline asm
{ cvt.rn.f16.f32 %rs458, %f673;}
// end inline asm
st.global.u16 [%rd14], %rs458;            // store 3rd output element
bra.uni $L__BB0_25;                       // skip the no-bias path below

$L__BB0_24:                               // taken when param+8 pointer == null (%p15): store scaled accumulators only
// begin inline asm
{ cvt.rn.f16.f32 %rs459, %f24;}
// end inline asm
st.global.u16 [%rd10], %rs459;            // out[row 0]
// begin inline asm
{ cvt.rn.f16.f32 %rs460, %f25;}
// end inline asm
st.global.u16 [%rd12], %rs460;            // out[row 1]
// begin inline asm
{ cvt.rn.f16.f32 %rs461, %f26;}
// end inline asm
st.global.u16 [%rd14], %rs461;            // out[row 2]

$L__BB0_25:
ret;
}                                         // end _Z28dequant_gemv_group128_batch323DequantGemvKernelParams

// .globl _ZN3cub11EmptyKernelIvEEvv
// cub::EmptyKernel<void>: a deliberately empty kernel emitted by the CUB
// library; its body is a single ret and it performs no work.
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
ret;
}