emv_int4ILi4ELi64ELi6EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage has been demoted .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf9f49796thrust6system6detail10sequential3seqE[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf9f49796thrust12placeholders2_1E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf9f49796thrust12placeholders2_2E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf9f49796thrust12placeholders2_3E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf9f49796thrust12placeholders2_4E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf9f49796thrust12placeholders2_5E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf9f49796thrust12placeholders2_6E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf9f49796thrust12placeholders2_7E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf9f49796thrust12placeholders2_8E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf9f49796thrust12placeholders2_9E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_cf9f49796thrust12placeholders3_10E[1]; .visible .entry _Z27dequant_gemv_group64_batch623DequantGemvKernelParams( .param .align 8 .b8 _Z27dequant_gemv_group64_batch623DequantGemvKernelParams_param_0[80] ) { .reg .pred %p<22>; .reg .b16 %rs<385>; .reg .f32 %f<698>; .reg .b32 %r<328>; .reg .b64 %rd<64>; // demoted variable .shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi64ELi6EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[3840]; ld.param.v2.u32 {%r48, %r49}, [_Z27dequant_gemv_group64_batch623DequantGemvKernelParams_param_0+48]; ld.param.v2.u32 {%r50, %r51}, [_Z27dequant_gemv_group64_batch623DequantGemvKernelParams_param_0+56]; ld.param.v2.f32 {%f59, %f60}, [_Z27dequant_gemv_group64_batch623DequantGemvKernelParams_param_0+64]; ld.param.v4.u8 {%rs65, %rs66, %rs67, %rs68}, [_Z27dequant_gemv_group64_batch623DequantGemvKernelParams_param_0+72]; ld.param.u64 %rd24, 
[_Z27dequant_gemv_group64_batch623DequantGemvKernelParams_param_0+40]; ld.param.u64 %rd23, [_Z27dequant_gemv_group64_batch623DequantGemvKernelParams_param_0+32]; ld.param.u64 %rd22, [_Z27dequant_gemv_group64_batch623DequantGemvKernelParams_param_0+24]; ld.param.u64 %rd21, [_Z27dequant_gemv_group64_batch623DequantGemvKernelParams_param_0+16]; mov.u32 %r327, %tid.y; shl.b32 %r52, %r327, 5; mov.u32 %r53, %tid.x; add.s32 %r326, %r52, %r53; shl.b32 %r325, %r326, 1; setp.ge.u32 %p1, %r325, %r50; mov.f32 %f680, 0f00000000; mov.f32 %f681, %f680; mov.f32 %f682, %f680; mov.f32 %f683, %f680; mov.f32 %f684, %f680; mov.f32 %f685, %f680; @%p1 bra $L__BB0_9; cvta.to.global.u64 %rd2, %rd21; mov.u32 %r54, %ctaid.x; mul.lo.s32 %r6, %r51, %r54; shl.b16 %rs2, %rs65, 3; cvta.to.global.u64 %rd3, %rd22; cvta.to.global.u64 %rd4, %rd24; $L__BB0_2: mad.lo.s32 %r56, %r50, %r54, %r325; mul.wide.u32 %rd25, %r56, 4; add.s64 %rd26, %rd3, %rd25; ld.global.v2.u32 {%r57, %r58}, [%rd26]; shr.u32 %r60, %r53, 2; shl.b32 %r61, %r327, 3; add.s32 %r12, %r61, %r60; add.s32 %r13, %r12, %r6; mul.wide.s32 %rd27, %r13, 2; add.s64 %rd28, %rd4, %rd27; ld.global.u16 %rs73, [%rd28]; // begin inline asm { cvt.f32.f16 %f67, %rs73;} // end inline asm setp.eq.s64 %p2, %rd23, 0; mov.u16 %rs384, %rs2; @%p2 bra $L__BB0_4; shr.u32 %r62, %r13, 31; add.s32 %r63, %r13, %r62; shr.s32 %r64, %r63, 1; cvt.s64.s32 %rd29, %r64; cvta.to.global.u64 %rd30, %rd23; add.s64 %rd31, %rd30, %rd29; ld.global.u8 %r65, [%rd31]; shl.b32 %r66, %r12, 2; and.b32 %r67, %r66, 4; shr.u32 %r68, %r65, %r67; cvt.u16.u32 %rs74, %r68; and.b16 %rs384, %rs74, 15; $L__BB0_4: shl.b32 %r14, %r326, 4; setp.ge.s32 %p3, %r14, %r48; @%p3 bra $L__BB0_8; setp.eq.s16 %p4, %rs65, 0; shr.u16 %rs76, %rs384, 3; and.b16 %rs77, %rs76, 1; setp.eq.b16 %p5, %rs77, 1; and.pred %p6, %p4, %p5; selp.b16 %rs78, -16, 0, %p6; or.b16 %rs79, %rs78, %rs384; cvt.s16.s8 %rs80, %rs79; cvt.rn.f32.s16 %f8, %rs80; mul.wide.s32 %rd32, %r14, 2; add.s64 %rd5, %rd2, %rd32; ld.global.v4.u32 
{%r69, %r70, %r71, %r72}, [%rd5]; mul.wide.s32 %rd33, %r48, 2; add.s64 %rd6, %rd5, %rd33; ld.global.v4.u32 {%r73, %r74, %r75, %r76}, [%rd6]; add.s32 %r77, %r14, %r48; add.s32 %r78, %r77, %r48; mul.wide.s32 %rd34, %r78, 2; add.s64 %rd35, %rd2, %rd34; ld.global.v4.u32 {%r79, %r80, %r81, %r82}, [%rd35]; add.s64 %rd36, %rd35, %rd33; ld.global.v4.u32 {%r83, %r84, %r85, %r86}, [%rd36]; add.s64 %rd37, %rd36, %rd33; ld.global.v4.u32 {%r87, %r88, %r89, %r90}, [%rd37]; add.s64 %rd38, %rd37, %rd33; ld.global.v4.u32 {%r91, %r92, %r93, %r94}, [%rd38]; cvt.u16.u32 %rs81, %r57; and.b16 %rs5, %rs81, 15; mov.b32 {%rs8, %rs7}, %r69; mov.b32 {%rs11, %rs10}, %r73; mov.b32 {%rs14, %rs13}, %r79; mov.b32 {%rs17, %rs16}, %r83; mov.b32 {%rs20, %rs19}, %r87; cvt.u16.u32 %rs21, %r91; shr.u16 %rs82, %rs81, 4; and.b16 %rs22, %rs82, 15; shr.u16 %rs83, %rs81, 8; and.b16 %rs23, %rs83, 15; shr.u16 %rs24, %rs81, 12; shr.u32 %r95, %r57, 16; cvt.u16.u32 %rs84, %r95; and.b16 %rs25, %rs84, 15; mov.b32 {%rs26, %rs36}, %r85; mov.b32 {%rs28, %rs37}, %r89; mov.b32 {%rs30, %rs38}, %r93; shr.u32 %r96, %r57, 20; cvt.u16.u32 %rs31, %r96; and.b16 %rs32, %rs31, 15; mov.b32 {%rs85, %rs33}, %r71; mov.b32 {%rs86, %rs34}, %r75; mov.b32 {%rs87, %rs35}, %r81; shr.u32 %r97, %r57, 24; cvt.u16.u32 %rs39, %r97; and.b16 %rs40, %rs39, 15; shr.u32 %r98, %r57, 28; cvt.u16.u32 %rs41, %r98; cvt.u16.u32 %rs42, %r58; and.b16 %rs43, %rs42, 15; shr.u32 %r99, %r58, 4; cvt.u16.u32 %rs44, %r99; and.b16 %rs45, %rs44, 15; shr.u32 %r100, %r58, 8; cvt.u16.u32 %rs46, %r100; and.b16 %rs47, %rs46, 15; shr.u32 %r101, %r58, 12; cvt.u16.u32 %rs48, %r101; and.b16 %rs49, %rs48, 15; shr.u32 %r102, %r58, 16; cvt.u16.u32 %rs50, %r102; and.b16 %rs51, %rs50, 15; shr.u32 %r103, %r58, 20; cvt.u16.u32 %rs52, %r103; and.b16 %rs53, %rs52, 15; shr.u32 %r104, %r58, 24; cvt.u16.u32 %rs54, %r104; and.b16 %rs55, %rs54, 15; shr.u32 %r105, %r58, 28; cvt.u16.u32 %rs56, %r105; add.s64 %rd39, %rd6, %rd33; add.s64 %rd7, %rd39, 16; add.s64 %rd8, %rd7, %rd33; add.s64 
%rd9, %rd8, %rd33; add.s64 %rd10, %rd9, %rd33; @%p4 bra $L__BB0_7; cvt.rn.f32.s16 %f164, %rs5; sub.ftz.f32 %f165, %f164, %f8; mul.ftz.f32 %f166, %f67, %f165; // begin inline asm { cvt.f32.f16 %f68, %rs8;} // end inline asm fma.rn.ftz.f32 %f167, %f166, %f68, %f685; // begin inline asm { cvt.f32.f16 %f69, %rs11;} // end inline asm fma.rn.ftz.f32 %f168, %f166, %f69, %f684; // begin inline asm { cvt.f32.f16 %f70, %rs14;} // end inline asm fma.rn.ftz.f32 %f169, %f166, %f70, %f683; // begin inline asm { cvt.f32.f16 %f71, %rs17;} // end inline asm fma.rn.ftz.f32 %f170, %f166, %f71, %f682; // begin inline asm { cvt.f32.f16 %f72, %rs20;} // end inline asm fma.rn.ftz.f32 %f171, %f166, %f72, %f681; // begin inline asm { cvt.f32.f16 %f73, %rs21;} // end inline asm fma.rn.ftz.f32 %f172, %f166, %f73, %f680; cvt.rn.f32.s16 %f173, %rs22; sub.ftz.f32 %f174, %f173, %f8; mul.ftz.f32 %f175, %f67, %f174; // begin inline asm { cvt.f32.f16 %f74, %rs7;} // end inline asm fma.rn.ftz.f32 %f176, %f175, %f74, %f167; // begin inline asm { cvt.f32.f16 %f75, %rs10;} // end inline asm fma.rn.ftz.f32 %f177, %f175, %f75, %f168; // begin inline asm { cvt.f32.f16 %f76, %rs13;} // end inline asm fma.rn.ftz.f32 %f178, %f175, %f76, %f169; // begin inline asm { cvt.f32.f16 %f77, %rs16;} // end inline asm fma.rn.ftz.f32 %f179, %f175, %f77, %f170; // begin inline asm { cvt.f32.f16 %f78, %rs19;} // end inline asm fma.rn.ftz.f32 %f180, %f175, %f78, %f171; mov.b32 {%rs184, %rs99}, %r91; // begin inline asm { cvt.f32.f16 %f79, %rs99;} // end inline asm fma.rn.ftz.f32 %f181, %f175, %f79, %f172; cvt.rn.f32.s16 %f182, %rs23; sub.ftz.f32 %f183, %f182, %f8; mul.ftz.f32 %f184, %f67, %f183; mov.b32 {%rs100, %rs106}, %r70; // begin inline asm { cvt.f32.f16 %f80, %rs100;} // end inline asm fma.rn.ftz.f32 %f185, %f184, %f80, %f176; mov.b32 {%rs101, %rs107}, %r74; // begin inline asm { cvt.f32.f16 %f81, %rs101;} // end inline asm fma.rn.ftz.f32 %f186, %f184, %f81, %f177; mov.b32 {%rs102, %rs108}, %r80; // begin inline 
asm { cvt.f32.f16 %f82, %rs102;} // end inline asm fma.rn.ftz.f32 %f187, %f184, %f82, %f178; mov.b32 {%rs103, %rs109}, %r84; // begin inline asm { cvt.f32.f16 %f83, %rs103;} // end inline asm fma.rn.ftz.f32 %f188, %f184, %f83, %f179; mov.b32 {%rs104, %rs110}, %r88; // begin inline asm { cvt.f32.f16 %f84, %rs104;} // end inline asm fma.rn.ftz.f32 %f189, %f184, %f84, %f180; mov.b32 {%rs105, %rs111}, %r92; // begin inline asm { cvt.f32.f16 %f85, %rs105;} // end inline asm fma.rn.ftz.f32 %f190, %f184, %f85, %f181; cvt.rn.f32.s16 %f191, %rs24; sub.ftz.f32 %f192, %f191, %f8; mul.ftz.f32 %f193, %f67, %f192; // begin inline asm { cvt.f32.f16 %f86, %rs106;} // end inline asm fma.rn.ftz.f32 %f194, %f193, %f86, %f185; // begin inline asm { cvt.f32.f16 %f87, %rs107;} // end inline asm fma.rn.ftz.f32 %f195, %f193, %f87, %f186; // begin inline asm { cvt.f32.f16 %f88, %rs108;} // end inline asm fma.rn.ftz.f32 %f196, %f193, %f88, %f187; // begin inline asm { cvt.f32.f16 %f89, %rs109;} // end inline asm fma.rn.ftz.f32 %f197, %f193, %f89, %f188; // begin inline asm { cvt.f32.f16 %f90, %rs110;} // end inline asm fma.rn.ftz.f32 %f198, %f193, %f90, %f189; // begin inline asm { cvt.f32.f16 %f91, %rs111;} // end inline asm fma.rn.ftz.f32 %f199, %f193, %f91, %f190; cvt.rn.f32.s16 %f200, %rs25; sub.ftz.f32 %f201, %f200, %f8; mul.ftz.f32 %f202, %f67, %f201; cvt.u16.u32 %rs112, %r71; // begin inline asm { cvt.f32.f16 %f92, %rs112;} // end inline asm fma.rn.ftz.f32 %f203, %f202, %f92, %f194; cvt.u16.u32 %rs113, %r75; // begin inline asm { cvt.f32.f16 %f93, %rs113;} // end inline asm fma.rn.ftz.f32 %f204, %f202, %f93, %f195; cvt.u16.u32 %rs114, %r81; // begin inline asm { cvt.f32.f16 %f94, %rs114;} // end inline asm fma.rn.ftz.f32 %f205, %f202, %f94, %f196; // begin inline asm { cvt.f32.f16 %f95, %rs26;} // end inline asm fma.rn.ftz.f32 %f206, %f202, %f95, %f197; // begin inline asm { cvt.f32.f16 %f96, %rs28;} // end inline asm fma.rn.ftz.f32 %f207, %f202, %f96, %f198; // begin inline asm { 
cvt.f32.f16 %f97, %rs30;} // end inline asm fma.rn.ftz.f32 %f208, %f202, %f97, %f199; cvt.rn.f32.s16 %f209, %rs32; sub.ftz.f32 %f210, %f209, %f8; mul.ftz.f32 %f211, %f67, %f210; // begin inline asm { cvt.f32.f16 %f98, %rs33;} // end inline asm fma.rn.ftz.f32 %f212, %f211, %f98, %f203; // begin inline asm { cvt.f32.f16 %f99, %rs34;} // end inline asm fma.rn.ftz.f32 %f213, %f211, %f99, %f204; // begin inline asm { cvt.f32.f16 %f100, %rs35;} // end inline asm fma.rn.ftz.f32 %f214, %f211, %f100, %f205; // begin inline asm { cvt.f32.f16 %f101, %rs36;} // end inline asm fma.rn.ftz.f32 %f215, %f211, %f101, %f206; // begin inline asm { cvt.f32.f16 %f102, %rs37;} // end inline asm fma.rn.ftz.f32 %f216, %f211, %f102, %f207; // begin inline asm { cvt.f32.f16 %f103, %rs38;} // end inline asm fma.rn.ftz.f32 %f217, %f211, %f103, %f208; cvt.rn.f32.s16 %f218, %rs40; sub.ftz.f32 %f219, %f218, %f8; mul.ftz.f32 %f220, %f67, %f219; mov.b32 {%rs124, %rs130}, %r72; // begin inline asm { cvt.f32.f16 %f104, %rs124;} // end inline asm fma.rn.ftz.f32 %f221, %f220, %f104, %f212; mov.b32 {%rs125, %rs131}, %r76; // begin inline asm { cvt.f32.f16 %f105, %rs125;} // end inline asm fma.rn.ftz.f32 %f222, %f220, %f105, %f213; mov.b32 {%rs126, %rs132}, %r82; // begin inline asm { cvt.f32.f16 %f106, %rs126;} // end inline asm fma.rn.ftz.f32 %f223, %f220, %f106, %f214; mov.b32 {%rs127, %rs133}, %r86; // begin inline asm { cvt.f32.f16 %f107, %rs127;} // end inline asm fma.rn.ftz.f32 %f224, %f220, %f107, %f215; mov.b32 {%rs128, %rs134}, %r90; // begin inline asm { cvt.f32.f16 %f108, %rs128;} // end inline asm fma.rn.ftz.f32 %f225, %f220, %f108, %f216; mov.b32 {%rs129, %rs135}, %r94; // begin inline asm { cvt.f32.f16 %f109, %rs129;} // end inline asm fma.rn.ftz.f32 %f226, %f220, %f109, %f217; cvt.rn.f32.s16 %f227, %rs41; sub.ftz.f32 %f228, %f227, %f8; mul.ftz.f32 %f229, %f67, %f228; // begin inline asm { cvt.f32.f16 %f110, %rs130;} // end inline asm fma.rn.ftz.f32 %f230, %f229, %f110, %f221; // begin 
inline asm { cvt.f32.f16 %f111, %rs131;} // end inline asm fma.rn.ftz.f32 %f231, %f229, %f111, %f222; // begin inline asm { cvt.f32.f16 %f112, %rs132;} // end inline asm fma.rn.ftz.f32 %f232, %f229, %f112, %f223; // begin inline asm { cvt.f32.f16 %f113, %rs133;} // end inline asm fma.rn.ftz.f32 %f233, %f229, %f113, %f224; // begin inline asm { cvt.f32.f16 %f114, %rs134;} // end inline asm fma.rn.ftz.f32 %f234, %f229, %f114, %f225; // begin inline asm { cvt.f32.f16 %f115, %rs135;} // end inline asm fma.rn.ftz.f32 %f235, %f229, %f115, %f226; ld.global.v4.u32 {%r106, %r107, %r108, %r109}, [%rd5+16]; ld.global.v4.u32 {%r114, %r115, %r116, %r117}, [%rd6+16]; ld.global.v4.u32 {%r122, %r123, %r124, %r125}, [%rd7]; ld.global.v4.u32 {%r130, %r131, %r132, %r133}, [%rd8]; ld.global.v4.u32 {%r138, %r139, %r140, %r141}, [%rd9]; ld.global.v4.u32 {%r146, %r147, %r148, %r149}, [%rd10]; cvt.rn.f32.s16 %f236, %rs43; sub.ftz.f32 %f237, %f236, %f8; mul.ftz.f32 %f238, %f67, %f237; mov.b32 {%rs136, %rs142}, %r106; // begin inline asm { cvt.f32.f16 %f116, %rs136;} // end inline asm fma.rn.ftz.f32 %f239, %f238, %f116, %f230; mov.b32 {%rs137, %rs143}, %r114; // begin inline asm { cvt.f32.f16 %f117, %rs137;} // end inline asm fma.rn.ftz.f32 %f240, %f238, %f117, %f231; mov.b32 {%rs138, %rs144}, %r122; // begin inline asm { cvt.f32.f16 %f118, %rs138;} // end inline asm fma.rn.ftz.f32 %f241, %f238, %f118, %f232; mov.b32 {%rs139, %rs145}, %r130; // begin inline asm { cvt.f32.f16 %f119, %rs139;} // end inline asm fma.rn.ftz.f32 %f242, %f238, %f119, %f233; mov.b32 {%rs140, %rs146}, %r138; // begin inline asm { cvt.f32.f16 %f120, %rs140;} // end inline asm fma.rn.ftz.f32 %f243, %f238, %f120, %f234; mov.b32 {%rs141, %rs147}, %r146; // begin inline asm { cvt.f32.f16 %f121, %rs141;} // end inline asm fma.rn.ftz.f32 %f244, %f238, %f121, %f235; cvt.rn.f32.s16 %f245, %rs45; sub.ftz.f32 %f246, %f245, %f8; mul.ftz.f32 %f247, %f67, %f246; // begin inline asm { cvt.f32.f16 %f122, %rs142;} // end inline asm 
fma.rn.ftz.f32 %f248, %f247, %f122, %f239; // begin inline asm { cvt.f32.f16 %f123, %rs143;} // end inline asm fma.rn.ftz.f32 %f249, %f247, %f123, %f240; // begin inline asm { cvt.f32.f16 %f124, %rs144;} // end inline asm fma.rn.ftz.f32 %f250, %f247, %f124, %f241; // begin inline asm { cvt.f32.f16 %f125, %rs145;} // end inline asm fma.rn.ftz.f32 %f251, %f247, %f125, %f242; // begin inline asm { cvt.f32.f16 %f126, %rs146;} // end inline asm fma.rn.ftz.f32 %f252, %f247, %f126, %f243; // begin inline asm { cvt.f32.f16 %f127, %rs147;} // end inline asm fma.rn.ftz.f32 %f253, %f247, %f127, %f244; cvt.rn.f32.s16 %f254, %rs47; sub.ftz.f32 %f255, %f254, %f8; mul.ftz.f32 %f256, %f67, %f255; mov.b32 {%rs148, %rs154}, %r107; // begin inline asm { cvt.f32.f16 %f128, %rs148;} // end inline asm fma.rn.ftz.f32 %f257, %f256, %f128, %f248; mov.b32 {%rs149, %rs155}, %r115; // begin inline asm { cvt.f32.f16 %f129, %rs149;} // end inline asm fma.rn.ftz.f32 %f258, %f256, %f129, %f249; mov.b32 {%rs150, %rs156}, %r123; // begin inline asm { cvt.f32.f16 %f130, %rs150;} // end inline asm fma.rn.ftz.f32 %f259, %f256, %f130, %f250; mov.b32 {%rs151, %rs157}, %r131; // begin inline asm { cvt.f32.f16 %f131, %rs151;} // end inline asm fma.rn.ftz.f32 %f260, %f256, %f131, %f251; mov.b32 {%rs152, %rs158}, %r139; // begin inline asm { cvt.f32.f16 %f132, %rs152;} // end inline asm fma.rn.ftz.f32 %f261, %f256, %f132, %f252; mov.b32 {%rs153, %rs159}, %r147; // begin inline asm { cvt.f32.f16 %f133, %rs153;} // end inline asm fma.rn.ftz.f32 %f262, %f256, %f133, %f253; cvt.rn.f32.s16 %f263, %rs49; sub.ftz.f32 %f264, %f263, %f8; mul.ftz.f32 %f265, %f67, %f264; // begin inline asm { cvt.f32.f16 %f134, %rs154;} // end inline asm fma.rn.ftz.f32 %f266, %f265, %f134, %f257; // begin inline asm { cvt.f32.f16 %f135, %rs155;} // end inline asm fma.rn.ftz.f32 %f267, %f265, %f135, %f258; // begin inline asm { cvt.f32.f16 %f136, %rs156;} // end inline asm fma.rn.ftz.f32 %f268, %f265, %f136, %f259; // begin inline asm 
{ cvt.f32.f16 %f137, %rs157;} // end inline asm fma.rn.ftz.f32 %f269, %f265, %f137, %f260; // begin inline asm { cvt.f32.f16 %f138, %rs158;} // end inline asm fma.rn.ftz.f32 %f270, %f265, %f138, %f261; // begin inline asm { cvt.f32.f16 %f139, %rs159;} // end inline asm fma.rn.ftz.f32 %f271, %f265, %f139, %f262; cvt.rn.f32.s16 %f272, %rs51; sub.ftz.f32 %f273, %f272, %f8; mul.ftz.f32 %f274, %f67, %f273; mov.b32 {%rs160, %rs166}, %r108; // begin inline asm { cvt.f32.f16 %f140, %rs160;} // end inline asm fma.rn.ftz.f32 %f275, %f274, %f140, %f266; mov.b32 {%rs161, %rs167}, %r116; // begin inline asm { cvt.f32.f16 %f141, %rs161;} // end inline asm fma.rn.ftz.f32 %f276, %f274, %f141, %f267; mov.b32 {%rs162, %rs168}, %r124; // begin inline asm { cvt.f32.f16 %f142, %rs162;} // end inline asm fma.rn.ftz.f32 %f277, %f274, %f142, %f268; mov.b32 {%rs163, %rs169}, %r132; // begin inline asm { cvt.f32.f16 %f143, %rs163;} // end inline asm fma.rn.ftz.f32 %f278, %f274, %f143, %f269; mov.b32 {%rs164, %rs170}, %r140; // begin inline asm { cvt.f32.f16 %f144, %rs164;} // end inline asm fma.rn.ftz.f32 %f279, %f274, %f144, %f270; mov.b32 {%rs165, %rs171}, %r148; // begin inline asm { cvt.f32.f16 %f145, %rs165;} // end inline asm fma.rn.ftz.f32 %f280, %f274, %f145, %f271; cvt.rn.f32.s16 %f281, %rs53; sub.ftz.f32 %f282, %f281, %f8; mul.ftz.f32 %f283, %f67, %f282; // begin inline asm { cvt.f32.f16 %f146, %rs166;} // end inline asm fma.rn.ftz.f32 %f284, %f283, %f146, %f275; // begin inline asm { cvt.f32.f16 %f147, %rs167;} // end inline asm fma.rn.ftz.f32 %f285, %f283, %f147, %f276; // begin inline asm { cvt.f32.f16 %f148, %rs168;} // end inline asm fma.rn.ftz.f32 %f286, %f283, %f148, %f277; // begin inline asm { cvt.f32.f16 %f149, %rs169;} // end inline asm fma.rn.ftz.f32 %f287, %f283, %f149, %f278; // begin inline asm { cvt.f32.f16 %f150, %rs170;} // end inline asm fma.rn.ftz.f32 %f288, %f283, %f150, %f279; // begin inline asm { cvt.f32.f16 %f151, %rs171;} // end inline asm fma.rn.ftz.f32 
%f289, %f283, %f151, %f280; cvt.rn.f32.s16 %f290, %rs55; sub.ftz.f32 %f291, %f290, %f8; mul.ftz.f32 %f292, %f67, %f291; mov.b32 {%rs172, %rs178}, %r109; // begin inline asm { cvt.f32.f16 %f152, %rs172;} // end inline asm fma.rn.ftz.f32 %f293, %f292, %f152, %f284; mov.b32 {%rs173, %rs179}, %r117; // begin inline asm { cvt.f32.f16 %f153, %rs173;} // end inline asm fma.rn.ftz.f32 %f294, %f292, %f153, %f285; mov.b32 {%rs174, %rs180}, %r125; // begin inline asm { cvt.f32.f16 %f154, %rs174;} // end inline asm fma.rn.ftz.f32 %f295, %f292, %f154, %f286; mov.b32 {%rs175, %rs181}, %r133; // begin inline asm { cvt.f32.f16 %f155, %rs175;} // end inline asm fma.rn.ftz.f32 %f296, %f292, %f155, %f287; mov.b32 {%rs176, %rs182}, %r141; // begin inline asm { cvt.f32.f16 %f156, %rs176;} // end inline asm fma.rn.ftz.f32 %f297, %f292, %f156, %f288; mov.b32 {%rs177, %rs183}, %r149; // begin inline asm { cvt.f32.f16 %f157, %rs177;} // end inline asm fma.rn.ftz.f32 %f298, %f292, %f157, %f289; cvt.rn.f32.s16 %f299, %rs56; sub.ftz.f32 %f300, %f299, %f8; mul.ftz.f32 %f301, %f67, %f300; // begin inline asm { cvt.f32.f16 %f158, %rs178;} // end inline asm fma.rn.ftz.f32 %f685, %f301, %f158, %f293; // begin inline asm { cvt.f32.f16 %f159, %rs179;} // end inline asm fma.rn.ftz.f32 %f684, %f301, %f159, %f294; // begin inline asm { cvt.f32.f16 %f160, %rs180;} // end inline asm fma.rn.ftz.f32 %f683, %f301, %f160, %f295; // begin inline asm { cvt.f32.f16 %f161, %rs181;} // end inline asm fma.rn.ftz.f32 %f682, %f301, %f161, %f296; // begin inline asm { cvt.f32.f16 %f162, %rs182;} // end inline asm fma.rn.ftz.f32 %f681, %f301, %f162, %f297; // begin inline asm { cvt.f32.f16 %f163, %rs183;} // end inline asm fma.rn.ftz.f32 %f680, %f301, %f163, %f298; bra.uni $L__BB0_8; $L__BB0_7: shl.b16 %rs282, %rs81, 4; cvt.s16.s8 %rs283, %rs282; shr.s16 %rs284, %rs283, 7; and.b16 %rs285, %rs284, -16; or.b16 %rs286, %rs285, %rs5; cvt.rn.f32.s16 %f398, %rs286; sub.ftz.f32 %f399, %f398, %f8; mul.ftz.f32 %f400, %f67, 
%f399; // begin inline asm { cvt.f32.f16 %f302, %rs8;} // end inline asm fma.rn.ftz.f32 %f401, %f400, %f302, %f685; // begin inline asm { cvt.f32.f16 %f303, %rs11;} // end inline asm fma.rn.ftz.f32 %f402, %f400, %f303, %f684; // begin inline asm { cvt.f32.f16 %f304, %rs14;} // end inline asm fma.rn.ftz.f32 %f403, %f400, %f304, %f683; // begin inline asm { cvt.f32.f16 %f305, %rs17;} // end inline asm fma.rn.ftz.f32 %f404, %f400, %f305, %f682; // begin inline asm { cvt.f32.f16 %f306, %rs20;} // end inline asm fma.rn.ftz.f32 %f405, %f400, %f306, %f681; // begin inline asm { cvt.f32.f16 %f307, %rs21;} // end inline asm fma.rn.ftz.f32 %f406, %f400, %f307, %f680; cvt.s16.s8 %rs287, %rs81; shr.s16 %rs288, %rs287, 7; and.b16 %rs289, %rs288, -16; or.b16 %rs290, %rs289, %rs22; cvt.rn.f32.s16 %f407, %rs290; sub.ftz.f32 %f408, %f407, %f8; mul.ftz.f32 %f409, %f67, %f408; // begin inline asm { cvt.f32.f16 %f308, %rs7;} // end inline asm fma.rn.ftz.f32 %f410, %f409, %f308, %f401; // begin inline asm { cvt.f32.f16 %f309, %rs10;} // end inline asm fma.rn.ftz.f32 %f411, %f409, %f309, %f402; // begin inline asm { cvt.f32.f16 %f310, %rs13;} // end inline asm fma.rn.ftz.f32 %f412, %f409, %f310, %f403; // begin inline asm { cvt.f32.f16 %f311, %rs16;} // end inline asm fma.rn.ftz.f32 %f413, %f409, %f311, %f404; // begin inline asm { cvt.f32.f16 %f312, %rs19;} // end inline asm fma.rn.ftz.f32 %f414, %f409, %f312, %f405; mov.b32 {%rs296, %rs196}, %r91; // begin inline asm { cvt.f32.f16 %f313, %rs196;} // end inline asm fma.rn.ftz.f32 %f415, %f409, %f313, %f406; cvt.s16.s8 %rs298, %rs82; shr.s16 %rs299, %rs298, 7; and.b16 %rs300, %rs299, -16; or.b16 %rs301, %rs300, %rs23; cvt.rn.f32.s16 %f416, %rs301; sub.ftz.f32 %f417, %f416, %f8; mul.ftz.f32 %f418, %f67, %f417; mov.b32 {%rs197, %rs203}, %r70; // begin inline asm { cvt.f32.f16 %f314, %rs197;} // end inline asm fma.rn.ftz.f32 %f419, %f418, %f314, %f410; mov.b32 {%rs198, %rs204}, %r74; // begin inline asm { cvt.f32.f16 %f315, %rs198;} // end 
inline asm fma.rn.ftz.f32 %f420, %f418, %f315, %f411; mov.b32 {%rs199, %rs205}, %r80; // begin inline asm { cvt.f32.f16 %f316, %rs199;} // end inline asm fma.rn.ftz.f32 %f421, %f418, %f316, %f412; mov.b32 {%rs200, %rs206}, %r84; // begin inline asm { cvt.f32.f16 %f317, %rs200;} // end inline asm fma.rn.ftz.f32 %f422, %f418, %f317, %f413; mov.b32 {%rs201, %rs207}, %r88; // begin inline asm { cvt.f32.f16 %f318, %rs201;} // end inline asm fma.rn.ftz.f32 %f423, %f418, %f318, %f414; mov.b32 {%rs202, %rs208}, %r92; // begin inline asm { cvt.f32.f16 %f319, %rs202;} // end inline asm fma.rn.ftz.f32 %f424, %f418, %f319, %f415; shr.s16 %rs302, %rs81, 15; and.b16 %rs303, %rs302, -16; or.b16 %rs304, %rs303, %rs24; cvt.rn.f32.s16 %f425, %rs304; sub.ftz.f32 %f426, %f425, %f8; mul.ftz.f32 %f427, %f67, %f426; // begin inline asm { cvt.f32.f16 %f320, %rs203;} // end inline asm fma.rn.ftz.f32 %f428, %f427, %f320, %f419; // begin inline asm { cvt.f32.f16 %f321, %rs204;} // end inline asm fma.rn.ftz.f32 %f429, %f427, %f321, %f420; // begin inline asm { cvt.f32.f16 %f322, %rs205;} // end inline asm fma.rn.ftz.f32 %f430, %f427, %f322, %f421; // begin inline asm { cvt.f32.f16 %f323, %rs206;} // end inline asm fma.rn.ftz.f32 %f431, %f427, %f323, %f422; // begin inline asm { cvt.f32.f16 %f324, %rs207;} // end inline asm fma.rn.ftz.f32 %f432, %f427, %f324, %f423; // begin inline asm { cvt.f32.f16 %f325, %rs208;} // end inline asm fma.rn.ftz.f32 %f433, %f427, %f325, %f424; shl.b16 %rs306, %rs84, 4; cvt.s16.s8 %rs307, %rs306; shr.s16 %rs308, %rs307, 7; and.b16 %rs309, %rs308, -16; or.b16 %rs310, %rs309, %rs25; cvt.rn.f32.s16 %f434, %rs310; sub.ftz.f32 %f435, %f434, %f8; mul.ftz.f32 %f436, %f67, %f435; cvt.u16.u32 %rs209, %r71; // begin inline asm { cvt.f32.f16 %f326, %rs209;} // end inline asm fma.rn.ftz.f32 %f437, %f436, %f326, %f428; cvt.u16.u32 %rs210, %r75; // begin inline asm { cvt.f32.f16 %f327, %rs210;} // end inline asm fma.rn.ftz.f32 %f438, %f436, %f327, %f429; cvt.u16.u32 %rs211, 
%r81; // begin inline asm { cvt.f32.f16 %f328, %rs211;} // end inline asm fma.rn.ftz.f32 %f439, %f436, %f328, %f430; cvt.u16.u32 %rs212, %r85; // begin inline asm { cvt.f32.f16 %f329, %rs212;} // end inline asm fma.rn.ftz.f32 %f440, %f436, %f329, %f431; cvt.u16.u32 %rs213, %r89; // begin inline asm { cvt.f32.f16 %f330, %rs213;} // end inline asm fma.rn.ftz.f32 %f441, %f436, %f330, %f432; // begin inline asm { cvt.f32.f16 %f331, %rs30;} // end inline asm fma.rn.ftz.f32 %f442, %f436, %f331, %f433; shl.b16 %rs311, %rs31, 4; cvt.s16.s8 %rs312, %rs311; shr.s16 %rs313, %rs312, 7; and.b16 %rs314, %rs313, -16; or.b16 %rs315, %rs314, %rs32; cvt.rn.f32.s16 %f443, %rs315; sub.ftz.f32 %f444, %f443, %f8; mul.ftz.f32 %f445, %f67, %f444; // begin inline asm { cvt.f32.f16 %f332, %rs33;} // end inline asm fma.rn.ftz.f32 %f446, %f445, %f332, %f437; // begin inline asm { cvt.f32.f16 %f333, %rs34;} // end inline asm fma.rn.ftz.f32 %f447, %f445, %f333, %f438; // begin inline asm { cvt.f32.f16 %f334, %rs35;} // end inline asm fma.rn.ftz.f32 %f448, %f445, %f334, %f439; // begin inline asm { cvt.f32.f16 %f335, %rs36;} // end inline asm fma.rn.ftz.f32 %f449, %f445, %f335, %f440; // begin inline asm { cvt.f32.f16 %f336, %rs37;} // end inline asm fma.rn.ftz.f32 %f450, %f445, %f336, %f441; // begin inline asm { cvt.f32.f16 %f337, %rs38;} // end inline asm fma.rn.ftz.f32 %f451, %f445, %f337, %f442; shl.b16 %rs316, %rs39, 4; cvt.s16.s8 %rs317, %rs316; shr.s16 %rs318, %rs317, 7; and.b16 %rs319, %rs318, -16; or.b16 %rs320, %rs319, %rs40; cvt.rn.f32.s16 %f452, %rs320; sub.ftz.f32 %f453, %f452, %f8; mul.ftz.f32 %f454, %f67, %f453; mov.b32 {%rs221, %rs227}, %r72; // begin inline asm { cvt.f32.f16 %f338, %rs221;} // end inline asm fma.rn.ftz.f32 %f455, %f454, %f338, %f446; mov.b32 {%rs222, %rs228}, %r76; // begin inline asm { cvt.f32.f16 %f339, %rs222;} // end inline asm fma.rn.ftz.f32 %f456, %f454, %f339, %f447; mov.b32 {%rs223, %rs229}, %r82; // begin inline asm { cvt.f32.f16 %f340, %rs223;} // end 
inline asm fma.rn.ftz.f32 %f457, %f454, %f340, %f448; mov.b32 {%rs224, %rs230}, %r86; // begin inline asm { cvt.f32.f16 %f341, %rs224;} // end inline asm fma.rn.ftz.f32 %f458, %f454, %f341, %f449; mov.b32 {%rs225, %rs231}, %r90; // begin inline asm { cvt.f32.f16 %f342, %rs225;} // end inline asm fma.rn.ftz.f32 %f459, %f454, %f342, %f450; mov.b32 {%rs226, %rs232}, %r94; // begin inline asm { cvt.f32.f16 %f343, %rs226;} // end inline asm fma.rn.ftz.f32 %f460, %f454, %f343, %f451; shl.b16 %rs321, %rs41, 4; cvt.s16.s8 %rs322, %rs321; shr.s16 %rs323, %rs322, 7; and.b16 %rs324, %rs323, -16; or.b16 %rs325, %rs324, %rs41; cvt.rn.f32.s16 %f461, %rs325; sub.ftz.f32 %f462, %f461, %f8; mul.ftz.f32 %f463, %f67, %f462; // begin inline asm { cvt.f32.f16 %f344, %rs227;} // end inline asm fma.rn.ftz.f32 %f464, %f463, %f344, %f455; // begin inline asm { cvt.f32.f16 %f345, %rs228;} // end inline asm fma.rn.ftz.f32 %f465, %f463, %f345, %f456; // begin inline asm { cvt.f32.f16 %f346, %rs229;} // end inline asm fma.rn.ftz.f32 %f466, %f463, %f346, %f457; // begin inline asm { cvt.f32.f16 %f347, %rs230;} // end inline asm fma.rn.ftz.f32 %f467, %f463, %f347, %f458; // begin inline asm { cvt.f32.f16 %f348, %rs231;} // end inline asm fma.rn.ftz.f32 %f468, %f463, %f348, %f459; // begin inline asm { cvt.f32.f16 %f349, %rs232;} // end inline asm fma.rn.ftz.f32 %f469, %f463, %f349, %f460; ld.global.v4.u32 {%r155, %r156, %r157, %r158}, [%rd5+16]; ld.global.v4.u32 {%r163, %r164, %r165, %r166}, [%rd6+16]; ld.global.v4.u32 {%r171, %r172, %r173, %r174}, [%rd7]; ld.global.v4.u32 {%r179, %r180, %r181, %r182}, [%rd8]; ld.global.v4.u32 {%r187, %r188, %r189, %r190}, [%rd9]; ld.global.v4.u32 {%r195, %r196, %r197, %r198}, [%rd10]; shl.b16 %rs326, %rs42, 4; cvt.s16.s8 %rs327, %rs326; shr.s16 %rs328, %rs327, 7; and.b16 %rs329, %rs328, -16; or.b16 %rs330, %rs329, %rs43; cvt.rn.f32.s16 %f470, %rs330; sub.ftz.f32 %f471, %f470, %f8; mul.ftz.f32 %f472, %f67, %f471; mov.b32 {%rs233, %rs239}, %r155; // begin inline 
asm { cvt.f32.f16 %f350, %rs233;} // end inline asm fma.rn.ftz.f32 %f473, %f472, %f350, %f464; mov.b32 {%rs234, %rs240}, %r163; // begin inline asm { cvt.f32.f16 %f351, %rs234;} // end inline asm fma.rn.ftz.f32 %f474, %f472, %f351, %f465; mov.b32 {%rs235, %rs241}, %r171; // begin inline asm { cvt.f32.f16 %f352, %rs235;} // end inline asm fma.rn.ftz.f32 %f475, %f472, %f352, %f466; mov.b32 {%rs236, %rs242}, %r179; // begin inline asm { cvt.f32.f16 %f353, %rs236;} // end inline asm fma.rn.ftz.f32 %f476, %f472, %f353, %f467; mov.b32 {%rs237, %rs243}, %r187; // begin inline asm { cvt.f32.f16 %f354, %rs237;} // end inline asm fma.rn.ftz.f32 %f477, %f472, %f354, %f468; mov.b32 {%rs238, %rs244}, %r195; // begin inline asm { cvt.f32.f16 %f355, %rs238;} // end inline asm fma.rn.ftz.f32 %f478, %f472, %f355, %f469; shl.b16 %rs331, %rs44, 4; cvt.s16.s8 %rs332, %rs331; shr.s16 %rs333, %rs332, 7; and.b16 %rs334, %rs333, -16; or.b16 %rs335, %rs334, %rs45; cvt.rn.f32.s16 %f479, %rs335; sub.ftz.f32 %f480, %f479, %f8; mul.ftz.f32 %f481, %f67, %f480; // begin inline asm { cvt.f32.f16 %f356, %rs239;} // end inline asm fma.rn.ftz.f32 %f482, %f481, %f356, %f473; // begin inline asm { cvt.f32.f16 %f357, %rs240;} // end inline asm fma.rn.ftz.f32 %f483, %f481, %f357, %f474; // begin inline asm { cvt.f32.f16 %f358, %rs241;} // end inline asm fma.rn.ftz.f32 %f484, %f481, %f358, %f475; // begin inline asm { cvt.f32.f16 %f359, %rs242;} // end inline asm fma.rn.ftz.f32 %f485, %f481, %f359, %f476; // begin inline asm { cvt.f32.f16 %f360, %rs243;} // end inline asm fma.rn.ftz.f32 %f486, %f481, %f360, %f477; // begin inline asm { cvt.f32.f16 %f361, %rs244;} // end inline asm fma.rn.ftz.f32 %f487, %f481, %f361, %f478; shl.b16 %rs336, %rs46, 4; cvt.s16.s8 %rs337, %rs336; shr.s16 %rs338, %rs337, 7; and.b16 %rs339, %rs338, -16; or.b16 %rs340, %rs339, %rs47; cvt.rn.f32.s16 %f488, %rs340; sub.ftz.f32 %f489, %f488, %f8; mul.ftz.f32 %f490, %f67, %f489; mov.b32 {%rs245, %rs251}, %r156; // begin inline asm { 
cvt.f32.f16 %f362, %rs245;} // end inline asm fma.rn.ftz.f32 %f491, %f490, %f362, %f482; mov.b32 {%rs246, %rs252}, %r164; // begin inline asm { cvt.f32.f16 %f363, %rs246;} // end inline asm fma.rn.ftz.f32 %f492, %f490, %f363, %f483; mov.b32 {%rs247, %rs253}, %r172; // begin inline asm { cvt.f32.f16 %f364, %rs247;} // end inline asm fma.rn.ftz.f32 %f493, %f490, %f364, %f484; mov.b32 {%rs248, %rs254}, %r180; // begin inline asm { cvt.f32.f16 %f365, %rs248;} // end inline asm fma.rn.ftz.f32 %f494, %f490, %f365, %f485; mov.b32 {%rs249, %rs255}, %r188; // begin inline asm { cvt.f32.f16 %f366, %rs249;} // end inline asm fma.rn.ftz.f32 %f495, %f490, %f366, %f486; mov.b32 {%rs250, %rs256}, %r196; // begin inline asm { cvt.f32.f16 %f367, %rs250;} // end inline asm fma.rn.ftz.f32 %f496, %f490, %f367, %f487; shl.b16 %rs341, %rs48, 4; cvt.s16.s8 %rs342, %rs341; shr.s16 %rs343, %rs342, 7; and.b16 %rs344, %rs343, -16; or.b16 %rs345, %rs344, %rs49; cvt.rn.f32.s16 %f497, %rs345; sub.ftz.f32 %f498, %f497, %f8; mul.ftz.f32 %f499, %f67, %f498; // begin inline asm { cvt.f32.f16 %f368, %rs251;} // end inline asm fma.rn.ftz.f32 %f500, %f499, %f368, %f491; // begin inline asm { cvt.f32.f16 %f369, %rs252;} // end inline asm fma.rn.ftz.f32 %f501, %f499, %f369, %f492; // begin inline asm { cvt.f32.f16 %f370, %rs253;} // end inline asm fma.rn.ftz.f32 %f502, %f499, %f370, %f493; // begin inline asm { cvt.f32.f16 %f371, %rs254;} // end inline asm fma.rn.ftz.f32 %f503, %f499, %f371, %f494; // begin inline asm { cvt.f32.f16 %f372, %rs255;} // end inline asm fma.rn.ftz.f32 %f504, %f499, %f372, %f495; // begin inline asm { cvt.f32.f16 %f373, %rs256;} // end inline asm fma.rn.ftz.f32 %f505, %f499, %f373, %f496; shl.b16 %rs346, %rs50, 4; cvt.s16.s8 %rs347, %rs346; shr.s16 %rs348, %rs347, 7; and.b16 %rs349, %rs348, -16; or.b16 %rs350, %rs349, %rs51; cvt.rn.f32.s16 %f506, %rs350; sub.ftz.f32 %f507, %f506, %f8; mul.ftz.f32 %f508, %f67, %f507; mov.b32 {%rs257, %rs263}, %r157; // begin inline asm { 
cvt.f32.f16 %f374, %rs257;} // end inline asm fma.rn.ftz.f32 %f509, %f508, %f374, %f500; mov.b32 {%rs258, %rs264}, %r165; // begin inline asm { cvt.f32.f16 %f375, %rs258;} // end inline asm fma.rn.ftz.f32 %f510, %f508, %f375, %f501; mov.b32 {%rs259, %rs265}, %r173; // begin inline asm { cvt.f32.f16 %f376, %rs259;} // end inline asm fma.rn.ftz.f32 %f511, %f508, %f376, %f502; mov.b32 {%rs260, %rs266}, %r181; // begin inline asm { cvt.f32.f16 %f377, %rs260;} // end inline asm fma.rn.ftz.f32 %f512, %f508, %f377, %f503; mov.b32 {%rs261, %rs267}, %r189; // begin inline asm { cvt.f32.f16 %f378, %rs261;} // end inline asm fma.rn.ftz.f32 %f513, %f508, %f378, %f504; mov.b32 {%rs262, %rs268}, %r197; // begin inline asm { cvt.f32.f16 %f379, %rs262;} // end inline asm fma.rn.ftz.f32 %f514, %f508, %f379, %f505; shl.b16 %rs351, %rs52, 4; cvt.s16.s8 %rs352, %rs351; shr.s16 %rs353, %rs352, 7; and.b16 %rs354, %rs353, -16; or.b16 %rs355, %rs354, %rs53; cvt.rn.f32.s16 %f515, %rs355; sub.ftz.f32 %f516, %f515, %f8; mul.ftz.f32 %f517, %f67, %f516; // begin inline asm { cvt.f32.f16 %f380, %rs263;} // end inline asm fma.rn.ftz.f32 %f518, %f517, %f380, %f509; // begin inline asm { cvt.f32.f16 %f381, %rs264;} // end inline asm fma.rn.ftz.f32 %f519, %f517, %f381, %f510; // begin inline asm { cvt.f32.f16 %f382, %rs265;} // end inline asm fma.rn.ftz.f32 %f520, %f517, %f382, %f511; // begin inline asm { cvt.f32.f16 %f383, %rs266;} // end inline asm fma.rn.ftz.f32 %f521, %f517, %f383, %f512; // begin inline asm { cvt.f32.f16 %f384, %rs267;} // end inline asm fma.rn.ftz.f32 %f522, %f517, %f384, %f513; // begin inline asm { cvt.f32.f16 %f385, %rs268;} // end inline asm fma.rn.ftz.f32 %f523, %f517, %f385, %f514; shl.b16 %rs356, %rs54, 4; cvt.s16.s8 %rs357, %rs356; shr.s16 %rs358, %rs357, 7; and.b16 %rs359, %rs358, -16; or.b16 %rs360, %rs359, %rs55; cvt.rn.f32.s16 %f524, %rs360; sub.ftz.f32 %f525, %f524, %f8; mul.ftz.f32 %f526, %f67, %f525; mov.b32 {%rs269, %rs275}, %r158; // begin inline asm { 
cvt.f32.f16 %f386, %rs269;} // end inline asm fma.rn.ftz.f32 %f527, %f526, %f386, %f518; mov.b32 {%rs270, %rs276}, %r166; // begin inline asm { cvt.f32.f16 %f387, %rs270;} // end inline asm fma.rn.ftz.f32 %f528, %f526, %f387, %f519; mov.b32 {%rs271, %rs277}, %r174; // begin inline asm { cvt.f32.f16 %f388, %rs271;} // end inline asm fma.rn.ftz.f32 %f529, %f526, %f388, %f520; mov.b32 {%rs272, %rs278}, %r182; // begin inline asm { cvt.f32.f16 %f389, %rs272;} // end inline asm fma.rn.ftz.f32 %f530, %f526, %f389, %f521; mov.b32 {%rs273, %rs279}, %r190; // begin inline asm { cvt.f32.f16 %f390, %rs273;} // end inline asm fma.rn.ftz.f32 %f531, %f526, %f390, %f522; mov.b32 {%rs274, %rs280}, %r198; // begin inline asm { cvt.f32.f16 %f391, %rs274;} // end inline asm fma.rn.ftz.f32 %f532, %f526, %f391, %f523; shl.b16 %rs361, %rs56, 4; cvt.s16.s8 %rs362, %rs361; shr.s16 %rs363, %rs362, 7; and.b16 %rs364, %rs363, -16; or.b16 %rs365, %rs364, %rs56; cvt.rn.f32.s16 %f533, %rs365; sub.ftz.f32 %f534, %f533, %f8; mul.ftz.f32 %f535, %f67, %f534; // begin inline asm { cvt.f32.f16 %f392, %rs275;} // end inline asm fma.rn.ftz.f32 %f685, %f535, %f392, %f527; // begin inline asm { cvt.f32.f16 %f393, %rs276;} // end inline asm fma.rn.ftz.f32 %f684, %f535, %f393, %f528; // begin inline asm { cvt.f32.f16 %f394, %rs277;} // end inline asm fma.rn.ftz.f32 %f683, %f535, %f394, %f529; // begin inline asm { cvt.f32.f16 %f395, %rs278;} // end inline asm fma.rn.ftz.f32 %f682, %f535, %f395, %f530; // begin inline asm { cvt.f32.f16 %f396, %rs279;} // end inline asm fma.rn.ftz.f32 %f681, %f535, %f396, %f531; // begin inline asm { cvt.f32.f16 %f397, %rs280;} // end inline asm fma.rn.ftz.f32 %f680, %f535, %f397, %f532; $L__BB0_8: add.s32 %r327, %r327, 4; shl.b32 %r203, %r327, 5; add.s32 %r326, %r203, %r53; shl.b32 %r325, %r326, 1; setp.lt.u32 %p7, %r325, %r50; @%p7 bra $L__BB0_2; $L__BB0_9: mov.u32 %r320, %tid.y; mov.u32 %r319, %tid.x; shl.b32 %r318, %r320, 5; add.s32 %r317, %r318, %r319; shl.b32 %r205, 
%r317, 2; mov.u32 %r206, _ZZ9gemv_int4ILi4ELi64ELi6EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage; add.s32 %r207, %r206, %r205; setp.lt.u32 %p8, %r317, 32; @%p8 bra $L__BB0_11; add.s32 %r311, %r207, -112; st.shared.f32 [%r311], %f685; $L__BB0_11: mov.u32 %r324, %tid.y; mov.u32 %r323, %tid.x; shl.b32 %r322, %r324, 5; add.s32 %r321, %r322, %r323; setp.gt.u32 %p9, %r321, 31; bar.sync 0; mad.lo.s32 %r43, %r321, 12, %r206; @%p9 bra $L__BB0_13; mov.u32 %r222, 16; ld.shared.f32 %f551, [%r43+16]; add.ftz.f32 %f552, %f685, %f551; ld.shared.f32 %f553, [%r43+20]; add.ftz.f32 %f554, %f552, %f553; ld.shared.f32 %f555, [%r43+24]; add.ftz.f32 %f538, %f554, %f555; mov.u32 %r210, 1; mov.u32 %r223, 31; mov.u32 %r224, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f538, %r210, %r223, %r224; @p add.f32 r0, r0, %f538; mov.f32 %f536, r0;} // end inline asm mov.u32 %r213, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f536, %r213, %r223, %r224; @p add.f32 r0, r0, %f536; mov.f32 %f539, r0;} // end inline asm mov.u32 %r216, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f539, %r216, %r223, %r224; @p add.f32 r0, r0, %f539; mov.f32 %f542, r0;} // end inline asm mov.u32 %r219, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f542, %r219, %r223, %r224; @p add.f32 r0, r0, %f542; mov.f32 %f545, r0;} // end inline asm // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f545, %r222, %r223, %r224; @p add.f32 r0, r0, %f545; mov.f32 %f685, r0;} // end inline asm $L__BB0_13: @%p8 bra $L__BB0_15; add.s32 %r312, %r207, -112; st.shared.f32 [%r312+640], %f684; $L__BB0_15: bar.sync 0; @%p9 bra $L__BB0_17; ld.shared.f32 %f571, [%r43+656]; add.ftz.f32 %f572, %f684, %f571; ld.shared.f32 %f573, [%r43+660]; add.ftz.f32 %f574, %f572, %f573; ld.shared.f32 %f575, [%r43+664]; add.ftz.f32 %f558, %f574, %f575; mov.u32 %r226, 1; mov.u32 %r239, 31; mov.u32 %r240, -1; 
// begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f558, %r226, %r239, %r240; @p add.f32 r0, r0, %f558; mov.f32 %f556, r0;} // end inline asm mov.u32 %r229, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f556, %r229, %r239, %r240; @p add.f32 r0, r0, %f556; mov.f32 %f559, r0;} // end inline asm mov.u32 %r232, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f559, %r232, %r239, %r240; @p add.f32 r0, r0, %f559; mov.f32 %f562, r0;} // end inline asm mov.u32 %r235, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f562, %r235, %r239, %r240; @p add.f32 r0, r0, %f562; mov.f32 %f565, r0;} // end inline asm mov.u32 %r238, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f565, %r238, %r239, %r240; @p add.f32 r0, r0, %f565; mov.f32 %f684, r0;} // end inline asm $L__BB0_17: @%p8 bra $L__BB0_19; add.s32 %r313, %r207, -112; st.shared.f32 [%r313+1280], %f683; $L__BB0_19: bar.sync 0; @%p9 bra $L__BB0_21; ld.shared.f32 %f591, [%r43+1296]; add.ftz.f32 %f592, %f683, %f591; ld.shared.f32 %f593, [%r43+1300]; add.ftz.f32 %f594, %f592, %f593; ld.shared.f32 %f595, [%r43+1304]; add.ftz.f32 %f578, %f594, %f595; mov.u32 %r242, 1; mov.u32 %r255, 31; mov.u32 %r256, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f578, %r242, %r255, %r256; @p add.f32 r0, r0, %f578; mov.f32 %f576, r0;} // end inline asm mov.u32 %r245, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f576, %r245, %r255, %r256; @p add.f32 r0, r0, %f576; mov.f32 %f579, r0;} // end inline asm mov.u32 %r248, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f579, %r248, %r255, %r256; @p add.f32 r0, r0, %f579; mov.f32 %f582, r0;} // end inline asm mov.u32 %r251, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f582, %r251, %r255, %r256; @p add.f32 r0, r0, %f582; mov.f32 
%f585, r0;} // end inline asm mov.u32 %r254, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f585, %r254, %r255, %r256; @p add.f32 r0, r0, %f585; mov.f32 %f683, r0;} // end inline asm $L__BB0_21: @%p8 bra $L__BB0_23; add.s32 %r314, %r207, -112; st.shared.f32 [%r314+1920], %f682; $L__BB0_23: bar.sync 0; @%p9 bra $L__BB0_25; ld.shared.f32 %f611, [%r43+1936]; add.ftz.f32 %f612, %f682, %f611; ld.shared.f32 %f613, [%r43+1940]; add.ftz.f32 %f614, %f612, %f613; ld.shared.f32 %f615, [%r43+1944]; add.ftz.f32 %f598, %f614, %f615; mov.u32 %r258, 1; mov.u32 %r271, 31; mov.u32 %r272, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f598, %r258, %r271, %r272; @p add.f32 r0, r0, %f598; mov.f32 %f596, r0;} // end inline asm mov.u32 %r261, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f596, %r261, %r271, %r272; @p add.f32 r0, r0, %f596; mov.f32 %f599, r0;} // end inline asm mov.u32 %r264, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f599, %r264, %r271, %r272; @p add.f32 r0, r0, %f599; mov.f32 %f602, r0;} // end inline asm mov.u32 %r267, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f602, %r267, %r271, %r272; @p add.f32 r0, r0, %f602; mov.f32 %f605, r0;} // end inline asm mov.u32 %r270, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f605, %r270, %r271, %r272; @p add.f32 r0, r0, %f605; mov.f32 %f682, r0;} // end inline asm $L__BB0_25: @%p8 bra $L__BB0_27; add.s32 %r315, %r207, -112; st.shared.f32 [%r315+2560], %f681; $L__BB0_27: bar.sync 0; @%p9 bra $L__BB0_29; ld.shared.f32 %f631, [%r43+2576]; add.ftz.f32 %f632, %f681, %f631; ld.shared.f32 %f633, [%r43+2580]; add.ftz.f32 %f634, %f632, %f633; ld.shared.f32 %f635, [%r43+2584]; add.ftz.f32 %f618, %f634, %f635; mov.u32 %r274, 1; mov.u32 %r287, 31; mov.u32 %r288, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, 
%f618, %r274, %r287, %r288; @p add.f32 r0, r0, %f618; mov.f32 %f616, r0;} // end inline asm mov.u32 %r277, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f616, %r277, %r287, %r288; @p add.f32 r0, r0, %f616; mov.f32 %f619, r0;} // end inline asm mov.u32 %r280, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f619, %r280, %r287, %r288; @p add.f32 r0, r0, %f619; mov.f32 %f622, r0;} // end inline asm mov.u32 %r283, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f622, %r283, %r287, %r288; @p add.f32 r0, r0, %f622; mov.f32 %f625, r0;} // end inline asm mov.u32 %r286, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f625, %r286, %r287, %r288; @p add.f32 r0, r0, %f625; mov.f32 %f681, r0;} // end inline asm $L__BB0_29: @%p8 bra $L__BB0_31; add.s32 %r316, %r207, -112; st.shared.f32 [%r316+3200], %f680; $L__BB0_31: bar.sync 0; @%p9 bra $L__BB0_33; ld.shared.f32 %f651, [%r43+3216]; add.ftz.f32 %f652, %f680, %f651; ld.shared.f32 %f653, [%r43+3220]; add.ftz.f32 %f654, %f652, %f653; ld.shared.f32 %f655, [%r43+3224]; add.ftz.f32 %f638, %f654, %f655; mov.u32 %r290, 1; mov.u32 %r303, 31; mov.u32 %r304, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f638, %r290, %r303, %r304; @p add.f32 r0, r0, %f638; mov.f32 %f636, r0;} // end inline asm mov.u32 %r293, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f636, %r293, %r303, %r304; @p add.f32 r0, r0, %f636; mov.f32 %f639, r0;} // end inline asm mov.u32 %r296, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f639, %r296, %r303, %r304; @p add.f32 r0, r0, %f639; mov.f32 %f642, r0;} // end inline asm mov.u32 %r299, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f642, %r299, %r303, %r304; @p add.f32 r0, r0, %f642; mov.f32 %f645, r0;} // end inline asm mov.u32 %r302, 16; // begin inline asm { .reg 
.f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f645, %r302, %r303, %r304; @p add.f32 r0, r0, %f645; mov.f32 %f680, r0;} // end inline asm $L__BB0_33: mov.u32 %r305, %tid.y; or.b32 %r307, %r53, %r305; setp.ne.s32 %p20, %r307, 0; @%p20 bra $L__BB0_37; ld.param.u64 %rd62, [_Z27dequant_gemv_group64_batch623DequantGemvKernelParams_param_0+8]; ld.param.u64 %rd61, [_Z27dequant_gemv_group64_batch623DequantGemvKernelParams_param_0]; cvta.to.global.u64 %rd40, %rd61; setp.eq.s64 %p21, %rd62, 0; mul.ftz.f32 %f45, %f59, %f685; mov.u32 %r308, %ctaid.x; cvt.s64.s32 %rd12, %r308; mul.wide.s32 %rd41, %r308, 2; add.s64 %rd13, %rd40, %rd41; mul.ftz.f32 %f46, %f59, %f684; add.s32 %r309, %r49, %r308; cvt.s64.s32 %rd14, %r49; mul.wide.s32 %rd42, %r49, 2; add.s64 %rd15, %rd13, %rd42; mul.ftz.f32 %f47, %f59, %f683; add.s32 %r310, %r309, %r49; cvt.s64.s32 %rd16, %r310; mul.wide.s32 %rd43, %r310, 2; add.s64 %rd18, %rd40, %rd43; mul.ftz.f32 %f48, %f59, %f682; mul.ftz.f32 %f49, %f59, %f681; mul.ftz.f32 %f50, %f59, %f680; @%p21 bra $L__BB0_36; ld.param.u64 %rd63, [_Z27dequant_gemv_group64_batch623DequantGemvKernelParams_param_0+8]; cvta.to.global.u64 %rd44, %rd63; shl.b64 %rd45, %rd12, 1; add.s64 %rd46, %rd44, %rd45; ld.global.u16 %rs366, [%rd46]; // begin inline asm { cvt.f32.f16 %f656, %rs366;} // end inline asm fma.rn.ftz.f32 %f657, %f60, %f656, %f45; // begin inline asm { cvt.rn.f16.f32 %rs367, %f657;} // end inline asm st.global.u16 [%rd13], %rs367; shl.b64 %rd47, %rd14, 1; add.s64 %rd48, %rd46, %rd47; ld.global.u16 %rs368, [%rd48]; // begin inline asm { cvt.f32.f16 %f658, %rs368;} // end inline asm fma.rn.ftz.f32 %f659, %f60, %f658, %f46; // begin inline asm { cvt.rn.f16.f32 %rs369, %f659;} // end inline asm st.global.u16 [%rd15], %rs369; shl.b64 %rd49, %rd16, 1; add.s64 %rd50, %rd44, %rd49; ld.global.u16 %rs370, [%rd50]; // begin inline asm { cvt.f32.f16 %f660, %rs370;} // end inline asm fma.rn.ftz.f32 %f661, %f60, %f660, %f47; // begin inline asm { cvt.rn.f16.f32 %rs371, %f661;} // 
// end inline asm
	// --- tail of _Z27dequant_gemv_group64_batch62... (entry/prologue precede this chunk) ---
	// Fused-bias output path: for each of the remaining accumulators (%f48..%f50),
	// load an f16 value from the secondary output-stride walk (%rd47 = row stride in
	// bytes), widen to f32, fma with scale %f60 into the alpha-scaled dot product,
	// round back to f16 and store. NOTE(review): %rd47-stepped pointers presumably
	// walk successive output rows — confirm against the kernel prologue in HEAD.
	st.global.u16 	[%rd18], %rs371;
	add.s64 	%rd51, %rd50, %rd47;
	ld.global.u16 	%rs372, [%rd51];
	// begin inline asm
	{ cvt.f32.f16 %f662, %rs372;}
	// end inline asm
	fma.rn.ftz.f32 	%f663, %f60, %f662, %f48;
	// begin inline asm
	{ cvt.rn.f16.f32 %rs373, %f663;}
	// end inline asm
	add.s64 	%rd52, %rd18, %rd47;
	st.global.u16 	[%rd52], %rs373;
	add.s64 	%rd53, %rd51, %rd47;
	ld.global.u16 	%rs374, [%rd53];
	// begin inline asm
	{ cvt.f32.f16 %f664, %rs374;}
	// end inline asm
	fma.rn.ftz.f32 	%f665, %f60, %f664, %f49;
	// begin inline asm
	{ cvt.rn.f16.f32 %rs375, %f665;}
	// end inline asm
	add.s64 	%rd54, %rd52, %rd47;
	st.global.u16 	[%rd54], %rs375;
	add.s64 	%rd55, %rd53, %rd47;
	ld.global.u16 	%rs376, [%rd55];
	// begin inline asm
	{ cvt.f32.f16 %f666, %rs376;}
	// end inline asm
	fma.rn.ftz.f32 	%f667, %f60, %f666, %f50;
	// begin inline asm
	{ cvt.rn.f16.f32 %rs377, %f667;}
	// end inline asm
	add.s64 	%rd56, %rd54, %rd47;
	st.global.u16 	[%rd56], %rs377;
	bra.uni 	$L__BB0_37;
$L__BB0_36:
	// No-bias output path (taken when the param_0+8 pointer was null, %p21):
	// convert the six f32 results %f45..%f50 straight to f16 and store them,
	// stepping by %rd57 = 2 * %rd14 bytes between the last four stores.
	// begin inline asm
	{ cvt.rn.f16.f32 %rs378, %f45;}
	// end inline asm
	st.global.u16 	[%rd13], %rs378;
	// begin inline asm
	{ cvt.rn.f16.f32 %rs379, %f46;}
	// end inline asm
	st.global.u16 	[%rd15], %rs379;
	// begin inline asm
	{ cvt.rn.f16.f32 %rs380, %f47;}
	// end inline asm
	st.global.u16 	[%rd18], %rs380;
	// begin inline asm
	{ cvt.rn.f16.f32 %rs381, %f48;}
	// end inline asm
	shl.b64 	%rd57, %rd14, 1;
	add.s64 	%rd58, %rd18, %rd57;
	st.global.u16 	[%rd58], %rs381;
	// begin inline asm
	{ cvt.rn.f16.f32 %rs382, %f49;}
	// end inline asm
	add.s64 	%rd59, %rd58, %rd57;
	st.global.u16 	[%rd59], %rs382;
	// begin inline asm
	{ cvt.rn.f16.f32 %rs383, %f50;}
	// end inline asm
	add.s64 	%rd60, %rd59, %rd57;
	st.global.u16 	[%rd60], %rs383;
$L__BB0_37:
	ret;
	// end of _Z27dequant_gemv_group64_batch62... kernel body
}
	// .globl	_ZN3cub11EmptyKernelIvEEvv
// cub::EmptyKernel<void> — a deliberately empty kernel (body is a single ret).
// Emitted by the CUB headers compiled into this translation unit; it performs
// no memory accesses and touches no registers beyond the implicit return.
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
	ret;
}