tion tools, release 11.8, V11.8.85 // Based on NVVM 7.0.1 // .version 7.8 .target sm_86 .address_size 64 // .globl fmha_mhca_fp16_128_64_sm86_kernel_nl .extern .shared .align 16 .b8 _ZN25fused_multihead_attention5smem_E[]; .visible .entry fmha_mhca_fp16_128_64_sm86_kernel_nl( .param .align 8 .b8 fmha_mhca_fp16_128_64_sm86_kernel_nl_param_0[240] ) { .reg .pred %p<173>; .reg .b16 %rs<129>; .reg .f32 %f<664>; .reg .b32 %r<2517>; .reg .b64 %rd<65>; mov.b64 %rd7, fmha_mhca_fp16_128_64_sm86_kernel_nl_param_0; mov.u64 %rd1, %rd7; mov.u32 %r1, %tid.x; ld.param.u32 %r2, [fmha_mhca_fp16_128_64_sm86_kernel_nl_param_0+200]; mov.u32 %r79, %ctaid.z; shl.b32 %r3, %r79, 6; setp.le.s32 %p17, %r2, %r3; @%p17 bra $L__BB0_40; mov.u32 %r1176, %ctaid.y; ld.param.u32 %r1177, [%rd1+192]; mov.u32 %r1178, %ctaid.x; mul.lo.s32 %r1179, %r2, %r1176; mad.lo.s32 %r1180, %r1179, %r1177, %r1178; ld.param.u32 %r1181, [%rd1+232]; ld.param.u32 %r1182, [%rd1+224]; shr.s32 %r1183, %r1, 31; shr.u32 %r1184, %r1183, 27; add.s32 %r1185, %r1, %r1184; and.b32 %r1186, %r1185, -32; sub.s32 %r1187, %r1, %r1186; ld.param.u64 %rd28, [%rd1+184]; ld.param.u64 %rd29, [%rd1+176]; ld.param.u32 %r1188, [%rd1+196]; shr.s32 %r1189, %r1188, 31; shr.u32 %r1190, %r1189, 29; add.s32 %r1191, %r1188, %r1190; shr.s32 %r1192, %r1191, 3; shr.u32 %r1193, %r1183, 29; add.s32 %r1194, %r1, %r1193; and.b32 %r1195, %r1194, -8; sub.s32 %r4, %r1, %r1195; setp.lt.s32 %p18, %r4, %r1192; shr.s32 %r5, %r1194, 3; add.s32 %r1196, %r5, %r3; cvt.s64.s32 %rd2, %r1196; shl.b32 %r1197, %r4, 4; cvt.s64.s32 %rd3, %r1197; shr.s32 %r1198, %r1194, 31; shr.u32 %r1199, %r1198, 29; add.s32 %r1200, %r5, %r1199; and.b32 %r1201, %r1200, 268435448; sub.s32 %r1202, %r5, %r1201; xor.b32 %r1203, %r1202, %r4; shl.b32 %r6, %r1203, 4; mov.u32 %r1204, 31; mov.u32 %r1205, 0; mov.u32 %r1206, -1; shfl.sync.idx.b32 %r1207|%p19, %r1205, %r1205, %r1204, %r1206; shfl.sync.idx.b32 %r1208|%p20, %r1205, %r1205, %r1204, %r1206; and.b32 %r1209, %r1, 7; shl.b32 %r1210, %r1, 4; and.b32 %r1211, %r1210, 112; and.b32 %r1212, %r1, 16; xor.b32 %r1213, %r1211, %r1212; ld.param.u32 %r1214, [%rd1+228]; shl.b32 %r1215, %r1214, 1; mov.u32 %r1216, 1; shr.s32 %r1217, %r1214, 31; shr.u32 %r1218, %r1217, 29; add.s32 %r1219, %r1214, %r1218; shr.s32 %r1220, %r1219, 3; setp.lt.s32 %p21, %r4, %r1220; cvt.s64.s32 %rd30, %r5; ld.param.u64 %rd31, [%rd1+216]; mul.lo.s64 %rd32, %rd31, %rd30; mul.lo.s32 %r1221, %r1181, %r1176; mad.lo.s32 %r1222, %r1221, %r1182, %r1178; shl.b32 %r1223, %r1222, 1; mul.wide.s32 %rd33, %r1215, %r1223; add.s64 %rd34, %rd33, %rd3; add.s64 %rd35, %rd34, %rd32; ld.param.u64 %rd36, [%rd1+208]; add.s64 %rd12, %rd36, %rd35; shfl.sync.idx.b32 %r1224|%p22, %r1205, %r1205, %r1204, %r1206; shfl.sync.idx.b32 %r1225|%p23, %r1205, %r1205, %r1204, %r1206; and.b32 %r1226, %r1, 96; shr.u32 %r1227, %r1226, 1; or.b32 %r1228, %r1227, %r1209; shr.u32 %r1229, %r1212, 1; or.b32 %r1230, %r1228, %r1229; and.b32 %r1231, %r1, 8; shr.u32 %r1232, %r1231, 3; xor.b32 %r1233, %r1232, %r1209; or.b32 %r1234, %r1223, 1; mul.wide.s32 %rd37, %r1215, %r1234; add.s64 %rd38, %rd37, %rd3; add.s64 %rd39, %rd38, %rd32; add.s64 %rd20, %rd36, %rd39; shfl.sync.idx.b32 %r1235|%p24, %r1205, %r1205, %r1204, %r1206; shfl.sync.idx.b32 %r1236|%p25, %r1205, %r1205, %r1204, %r1206; and.b32 %r1237, %r1, 224; shr.u32 %r1238, %r1237, 1; and.b32 %r1239, %r1, 15; or.b32 %r1240, %r1238, %r1239; ld.param.u64 %rd4, [%rd1+32]; ld.param.u64 %rd40, [%rd1+8]; cvta.to.global.u64 %rd5, %rd40; ld.param.u32 %r1241, [%rd1+60]; mul.lo.s32 %r1242, %r1180, %r1241; shl.b32 %r7, %r1242, 1; sub.s32 %r1243, %r2, %r3; min.s32 %r1244, %r1243, 64; shl.b32 %r1248, %r1240, 7; shl.b32 %r1249, %r1233, 4; shl.b32 %r1250, %r1230, 7; shl.b32 %r1251, %r1, 7; and.b32 %r1252, %r1251, 1920; shl.b32 %r1253, %r5, 7; shl.b32 %r1254, %r1188, 1; shr.s32 %r1255, %r1187, 31; shr.u32 %r1256, %r1255, 30; add.s32 %r1257, %r1187, %r1256; and.b32 %r1258, %r1257, 2147483644; sub.s32 %r1259, %r1187, %r1258; setp.lt.s32 %p26, %r5, %r1244; and.pred %p27, %p18, %p26; add.s32 %r1260, %r5, 16; setp.lt.s32 %p28, %r1260, %r1244; and.pred %p29, %p18, %p28; add.s32 %r1261, %r5, 32; setp.lt.s32 %p30, %r1261, %r1244; and.pred %p31, %p18, %p30; add.s32 %r1262, %r5, 48; setp.lt.s32 %p32, %r1262, %r1244; and.pred %p33, %p18, %p32; mul.lo.s64 %rd41, %rd28, %rd2; mul.wide.s32 %rd42, %r1254, %r1180; add.s32 %r1263, %r6, %r1253; or.b32 %r1264, %r1213, %r1252; or.b32 %r1265, %r1250, %r1249; or.b32 %r1266, %r1248, %r1213; shr.s32 %r1267, %r1185, 5; shl.b32 %r1268, %r1267, 4; shl.b32 %r1269, %r1259, 1; add.s64 %rd43, %rd41, %rd3; add.s64 %rd44, %rd43, %rd42; shl.b64 %rd45, %rd28, 4; add.s64 %rd8, %rd29, %rd44; mov.u32 %r1270, _ZN25fused_multihead_attention5smem_E; add.s32 %r1271, %r1263, %r1270; add.s32 %r80, %r1271, %r1208; add.s32 %r82, %r80, 2048; add.s32 %r84, %r80, 4096; add.s32 %r86, %r80, 6144; selp.b32 %r81, 16, 0, %p27; // begin inline asm cp.async.cg.shared.global [%r80], [%rd8], 16, %r81; // end inline asm selp.b32 %r83, 16, 0, %p29; add.s64 %rd9, %rd8, %rd45; // begin inline asm cp.async.cg.shared.global [%r82], [%rd9], 16, %r83; // end inline asm selp.b32 %r85, 16, 0, %p31; add.s64 %rd10, %rd9, %rd45; // begin inline asm cp.async.cg.shared.global [%r84], [%rd10], 16, %r85; // end inline asm selp.b32 %r87, 16, 0, %p33; add.s64 %rd11, %rd10, %rd45; // begin inline asm cp.async.cg.shared.global [%r86], [%rd11], 16, %r87; // end inline asm min.s32 %r1272, %r1181, 128; setp.lt.s32 %p34, %r5, %r1272; and.pred %p35, %p21, %p34; setp.lt.s32 %p36, %r1260, %r1272; and.pred %p37, %p21, %p36; setp.lt.s32 %p38, %r1261, %r1272; and.pred %p39, %p21, %p38; setp.lt.s32 %p40, %r1262, %r1272; and.pred %p41, %p21, %p40; add.s32 %r1273, %r5, 64; setp.lt.s32 %p42, %r1273, %r1272; and.pred %p43, %p21, %p42; add.s32 %r1274, %r5, 80; setp.lt.s32 %p44, %r1274, %r1272; and.pred %p45, %p21, %p44; add.s32 %r1275, %r5, 96; setp.lt.s32 %p46, %r1275, %r1272; and.pred %p47, %p21, %p46; add.s32 %r1276, %r5, 112; setp.lt.s32 %p48, %r1276, %r1272; and.pred %p49, %p21, %p48; shl.b64 %rd46, %rd31, 4; selp.b32 %r115, 16, 0, %p45; add.s32 %r1277, %r1270, 16384; add.s32 %r1278, %r1263, %r1277; add.s32 %r88, %r1278, %r1225; add.s32 %r90, %r88, 2048; add.s32 %r92, %r88, 4096; add.s32 %r94, %r88, 6144; add.s32 %r96, %r88, 8192; add.s32 %r98, %r88, 10240; add.s32 %r100, %r88, 12288; add.s32 %r102, %r88, 14336; selp.b32 %r105, 16, 0, %p35; // begin inline asm cp.async.cg.shared.global [%r88], [%rd12], 16, %r105; // end inline asm selp.b32 %r107, 16, 0, %p37; add.s64 %rd13, %rd12, %rd46; // begin inline asm cp.async.cg.shared.global [%r90], [%rd13], 16, %r107; // end inline asm selp.b32 %r109, 16, 0, %p39; add.s64 %rd14, %rd13, %rd46; // begin inline asm cp.async.cg.shared.global [%r92], [%rd14], 16, %r109; // end inline asm selp.b32 %r111, 16, 0, %p41; add.s64 %rd15, %rd14, %rd46; // begin inline asm cp.async.cg.shared.global [%r94], [%rd15], 16, %r111; // end inline asm selp.b32 %r113, 16, 0, %p43; add.s64 %rd16, %rd15, %rd46; // begin inline asm cp.async.cg.shared.global [%r96], [%rd16], 16, %r113; // end inline asm add.s64 %rd17, %rd16, %rd46; // begin inline asm cp.async.cg.shared.global [%r98], [%rd17], 16, %r115; // end inline asm selp.b32 %r117, 16, 0, %p47; add.s64 %rd18, %rd17, %rd46; // begin inline asm cp.async.cg.shared.global [%r100], [%rd18], 16, %r117; // end inline asm selp.b32 %r119, 16, 0, %p49; add.s64 %rd19, %rd18, %rd46; // begin inline asm cp.async.cg.shared.global [%r102], [%rd19], 16, %r119; // end inline asm add.s32 %r1279, %r1270, 32768; add.s32 %r1280, %r1263, %r1279; add.s32 %r104, %r1280, %r1236; add.s32 %r106, %r104, 2048; add.s32 %r108, %r104, 4096; add.s32 %r110, %r104, 6144; add.s32 %r112, %r104, 8192; add.s32 %r114, %r104, 10240; add.s32 %r116, %r104, 12288; add.s32 %r118, %r104, 14336; // begin inline asm cp.async.cg.shared.global [%r104], [%rd20], 16, %r105; // end inline asm add.s64 %rd21, %rd20, %rd46; // begin inline asm cp.async.cg.shared.global [%r106], [%rd21], 16, %r107; // end inline asm add.s64 %rd22, %rd21, %rd46; // begin inline asm cp.async.cg.shared.global [%r108], [%rd22], 16, %r109; // end inline asm add.s64 %rd23, %rd22, %rd46; // begin inline asm cp.async.cg.shared.global [%r110], [%rd23], 16, %r111; // end inline asm add.s64 %rd24, %rd23, %rd46; // begin inline asm cp.async.cg.shared.global [%r112], [%rd24], 16, %r113; // end inline asm add.s64 %rd25, %rd24, %rd46; // begin inline asm cp.async.cg.shared.global [%r114], [%rd25], 16, %r115; // end inline asm add.s64 %rd26, %rd25, %rd46; // begin inline asm cp.async.cg.shared.global [%r116], [%rd26], 16, %r117; // end inline asm add.s64 %rd27, %rd26, %rd46; // begin inline asm cp.async.cg.shared.global [%r118], [%rd27], 16, %r119; // end inline asm // begin inline asm cp.async.commit_group; // end inline asm // begin inline asm cp.async.wait_group 0; // end inline asm bar.sync 0; add.s32 %r1281, %r1264, %r1270; add.s32 %r124, %r1281, %r1207; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r120, %r121, %r122, %r123}, [%r124]; // end inline asm add.s32 %r129, %r124, 2048; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r125, %r126, %r127, %r128}, [%r129]; // end inline asm add.s32 %r134, %r124, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r130, %r131, %r132, %r133}, [%r134]; // end inline asm add.s32 %r139, %r124, 6144; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r135, %r136, %r137, %r138}, [%r139]; // end inline asm add.s32 %r1282, %r1224, %r1277; add.s32 %r144, %r1282, %r1265; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r140, %r141, %r142, %r143}, [%r144]; // end inline asm add.s32 %r149, %r144, 8192; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r145, %r146, %r147, %r148}, [%r149]; // end inline asm xor.b32 %r1283, %r1265, 32; add.s32 %r154, %r1282, %r1283; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r150, %r151, %r152, %r153}, [%r154]; // end inline asm add.s32 %r159, %r154, 8192; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r155, %r156, %r157, %r158}, [%r159]; // end inline asm xor.b32 %r1284, %r1265, 64; add.s32 %r164, %r1282, %r1284; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r160, %r161, %r162, %r163}, [%r164]; // end inline asm add.s32 %r169, %r164, 8192; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r165, %r166, %r167, %r168}, [%r169]; // end inline asm xor.b32 %r1285, %r1265, 96; add.s32 %r174, %r1282, %r1285; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r170, %r171, %r172, %r173}, [%r174]; // end inline asm add.s32 %r179, %r174, 8192; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r175, %r176, %r177, %r178}, [%r179]; // end inline asm xor.b32 %r1286, %r1264, 32; add.s32 %r184, %r1266, %r1279; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r180, %r181, %r182, %r183}, [%r184]; // end inline asm xor.b32 %r1287, %r1266, 32; add.s32 %r189, %r1287, %r1279; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r185, %r186, %r187, %r188}, [%r189]; // end inline asm xor.b32 %r1288, %r1266, 64; add.s32 %r194, %r1288, %r1279; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r190, %r191, %r192, %r193}, [%r194]; // end inline asm xor.b32 %r1289, %r1266, 96; add.s32 %r199, %r1289, %r1279; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r195, %r196, %r197, %r198}, [%r199]; // end inline asm add.s32 %r1290, %r1270, 40960; add.s32 %r204, %r1266, %r1290; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r200, %r201, %r202, %r203}, [%r204]; // end inline asm add.s32 %r209, %r1287, %r1290; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r205, %r206, %r207, %r208}, [%r209]; // end inline asm add.s32 %r214, %r1288, %r1290; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r210, %r211, %r212, %r213}, [%r214]; // end inline asm add.s32 %r219, %r1289, %r1290; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r215, %r216, %r217, %r218}, [%r219]; // end inline asm shr.s32 %r1291, %r1185, 31; shr.u32 %r1292, %r1291, 30; add.s32 %r1293, %r1267, %r1292; and.b32 %r1294, %r1293, 1073741820; sub.s32 %r1295, %r1267, %r1294; shr.u32 %r1296, %r1183, 25; add.s32 %r1297, %r1, %r1296; shr.u32 %r1298, %r1297, 1; and.b32 %r1299, %r1298, 268435392; shr.s32 %r41, %r1257, 2; add.s32 %r1300, %r1299, %r41; shl.b32 %r1301, %r1300, 2; mov.u32 %r1302, 2; add.s32 %r1303, %r1301, %r1295; shl.b32 %r1304, %r1303, 2; add.s32 %r42, %r1277, %r1304; ld.param.u32 %r954, [%rd1+64]; // begin inline asm mov.u32 %r452, 0; // end inline asm // begin inline asm mov.u32 %r453, 0; // end inline asm // begin inline asm mov.u32 %r462, 0; // end inline asm // begin inline asm mov.u32 %r463, 0; // end inline asm // begin inline asm mov.u32 %r472, 0; // end inline asm // begin inline asm mov.u32 %r473, 0; // end inline asm // begin inline asm mov.u32 %r482, 0; // end inline asm // begin inline asm mov.u32 %r483, 0; // end inline asm // begin inline asm mov.u32 %r492, 0; // end inline asm // begin inline asm mov.u32 %r493, 0; // end inline asm // begin inline asm mov.u32 %r502, 0; // end inline asm // begin inline asm mov.u32 %r503, 0; // end inline asm // begin inline asm mov.u32 %r512, 0; // end inline asm // begin inline asm mov.u32 %r513, 0; // end inline asm // begin inline asm mov.u32 %r522, 0; // end inline asm // begin inline asm mov.u32 %r523, 0; // end inline asm // begin inline asm mov.u32 %r532, 0; // end inline asm // begin inline asm mov.u32 %r533, 0; // end inline asm // begin inline asm mov.u32 %r542, 0; // end inline asm // begin inline asm mov.u32 %r543, 0; // end inline asm // begin inline asm mov.u32 %r552, 0; // end inline asm // begin inline asm mov.u32 %r553, 0; // end inline asm // begin inline asm mov.u32 %r562, 0; // end inline asm // begin inline asm mov.u32 %r563, 0; // end inline asm // begin inline asm mov.u32 %r572, 0; // end inline asm // begin inline asm mov.u32 %r573, 0; // end inline asm // begin inline asm mov.u32 %r582, 0; // end inline asm // begin inline asm mov.u32 %r583, 0; // end inline asm // begin inline asm mov.u32 %r592, 0; // end inline asm // begin inline asm mov.u32 %r593, 0; // end inline asm // begin inline asm mov.u32 %r602, 0; // end inline asm // begin inline asm mov.u32 %r603, 0; // end inline asm add.s32 %r1305, %r1207, %r1270; add.s32 %r256, %r1305, %r1286; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r252, %r253, %r254, %r255}, [%r256]; // end inline asm add.s32 %r261, %r256, 2048; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r257, %r258, %r259, %r260}, [%r261]; // end inline asm add.s32 %r266, %r256, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r262, %r263, %r264, %r265}, [%r266]; // end inline asm add.s32 %r271, %r256, 6144; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r267, %r268, %r269, %r270}, [%r271]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r452, %r453}, {%r120, %r121, %r122, %r123}, {%r140, %r141}, {%r452, %r453}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r462, %r463}, {%r120, %r121, %r122, %r123}, {%r142, %r143}, {%r462, %r463}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r472, %r473}, {%r120, %r121, %r122, %r123}, {%r145, %r146}, {%r472, %r473}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r482, %r483}, {%r120, %r121, %r122, %r123}, {%r147, %r148}, {%r482, %r483}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r492, %r493}, {%r125, %r126, %r127, %r128}, {%r140, %r141}, {%r492, %r493}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r502, %r503}, {%r125, %r126, %r127, %r128}, {%r142, %r143}, {%r502, %r503}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r512, %r513}, {%r125, %r126, %r127, %r128}, {%r145, %r146}, {%r512, %r513}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r522, %r523}, {%r125, %r126, %r127, %r128}, {%r147, %r148}, {%r522, %r523}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r532, %r533}, {%r130, %r131, %r132, %r133}, {%r140, %r141}, {%r532, %r533}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r542, %r543}, {%r130, %r131, %r132, %r133}, {%r142, %r143}, {%r542, %r543}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r552, %r553}, {%r130, %r131, %r132, %r133}, {%r145, %r146}, {%r552, %r553}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r562, %r563}, {%r130, %r131, %r132, %r133}, {%r147, %r148}, {%r562, %r563}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r572, %r573}, {%r135, %r136, %r137, %r138}, {%r140, %r141}, {%r572, %r573}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r582, %r583}, {%r135, %r136, %r137, %r138}, {%r142, %r143}, {%r582, %r583}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r592, %r593}, {%r135, %r136, %r137, %r138}, {%r145, %r146}, {%r592, %r593}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r602, %r603}, {%r135, %r136, %r137, %r138}, {%r147, %r148}, {%r602, %r603}; // end inline asm xor.b32 %r1306, %r1264, 64; add.s32 %r436, %r1305, %r1306; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r432, %r433, %r434, %r435}, [%r436]; // end inline asm add.s32 %r441, %r436, 2048; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r437, %r438, %r439, %r440}, [%r441]; // end inline asm add.s32 %r446, %r436, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r442, %r443, %r444, %r445}, [%r446]; // end inline asm add.s32 %r451, %r436, 6144; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r447, %r448, %r449, %r450}, [%r451]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r452, %r453}, {%r252, %r253, %r254, %r255}, {%r150, %r151}, {%r452, %r453}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r462, %r463}, {%r252, %r253, %r254, %r255}, {%r152, %r153}, {%r462, %r463}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r472, %r473}, {%r252, %r253, %r254, %r255}, {%r155, %r156}, {%r472, %r473}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r482, %r483}, {%r252, %r253, %r254, %r255}, {%r157, %r158}, {%r482, %r483}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r492, %r493}, {%r257, %r258, %r259, %r260}, {%r150, %r151}, {%r492, %r493}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r502, %r503}, {%r257, %r258, %r259, %r260}, {%r152, %r153}, {%r502, %r503}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r512, %r513}, {%r257, %r258, %r259, %r260}, {%r155, %r156}, {%r512, %r513}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r522, %r523}, {%r257, %r258, %r259, %r260}, {%r157, %r158}, {%r522, %r523}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r532, %r533}, {%r262, %r263, %r264, %r265}, {%r150, %r151}, {%r532, %r533}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r542, %r543}, {%r262, %r263, %r264, %r265}, {%r152, %r153}, {%r542, %r543}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r552, %r553}, {%r262, %r263, %r264, %r265}, {%r155, %r156}, {%r552, %r553}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r562, %r563}, {%r262, %r263, %r264, %r265}, {%r157, %r158}, {%r562, %r563}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r572, %r573}, {%r267, %r268, %r269, %r270}, {%r150, %r151}, {%r572, %r573}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r582, %r583}, {%r267, %r268, %r269, %r270}, {%r152, %r153}, {%r582, %r583}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r592, %r593}, {%r267, %r268, %r269, %r270}, {%r155, %r156}, {%r592, %r593}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r602, %r603}, {%r267, %r268, %r269, %r270}, {%r157, %r158}, {%r602, %r603}; // end inline asm xor.b32 %r1307, %r1264, 96; add.s32 %r616, %r1305, %r1307; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r612, %r613, %r614, %r615}, [%r616]; // end inline asm add.s32 %r621, %r616, 2048; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r617, %r618, %r619, %r620}, [%r621]; // end inline asm add.s32 %r626, %r616, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r622, %r623, %r624, %r625}, [%r626]; // end inline asm add.s32 %r631, %r616, 6144; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r627, %r628, %r629, %r630}, [%r631]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r452, %r453}, {%r432, %r433, %r434, %r435}, {%r160, %r161}, {%r452, %r453}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r462, %r463}, {%r432, %r433, %r434, %r435}, {%r162, %r163}, {%r462, %r463}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r472, %r473}, {%r432, %r433, %r434, %r435}, {%r165, %r166}, {%r472, %r473}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r482, %r483}, {%r432, %r433, %r434, %r435}, {%r167, %r168}, {%r482, %r483}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r492, %r493}, {%r437, %r438, %r439, %r440}, {%r160, %r161}, {%r492, %r493}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r502, %r503}, {%r437, %r438, %r439, %r440}, {%r162, %r163}, {%r502, %r503}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r512, %r513}, {%r437, %r438, %r439, %r440}, {%r165, %r166}, {%r512, %r513}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r522, %r523}, {%r437, %r438, %r439, %r440}, {%r167, %r168}, {%r522, %r523}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r532, %r533}, {%r442, %r443, %r444, %r445}, {%r160, %r161}, {%r532, %r533}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r542, %r543}, {%r442, %r443, %r444, %r445}, {%r162, %r163}, {%r542, %r543}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r552, %r553}, {%r442, %r443, %r444, %r445}, {%r165, %r166}, {%r552, %r553}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r562, %r563}, {%r442, %r443, %r444, %r445}, {%r167, %r168}, {%r562, %r563}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r572, %r573}, {%r447, %r448, %r449, %r450}, {%r160, %r161}, {%r572, %r573}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r582, %r583}, {%r447, %r448, %r449, %r450}, {%r162, %r163}, {%r582, %r583}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r592, %r593}, {%r447, %r448, %r449, %r450}, {%r165, %r166}, {%r592, %r593}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r602, %r603}, {%r447, %r448, %r449, %r450}, {%r167, %r168}, {%r602, %r603}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r452, %r453}, {%r612, %r613, %r614, %r615}, {%r170, %r171}, {%r452, %r453}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r462, %r463}, {%r612, %r613, %r614, %r615}, {%r172, %r173}, {%r462, %r463}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r472, %r473}, {%r612, %r613, %r614, %r615}, {%r175, %r176}, {%r472, %r473}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r482, %r483}, {%r612, %r613, %r614, %r615}, {%r177, %r178}, {%r482, %r483}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r492, %r493}, {%r617, %r618, %r619, %r620}, {%r170, %r171}, {%r492, %r493}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r502, %r503}, {%r617, %r618, %r619, %r620}, {%r172, %r173}, {%r502, %r503}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r512, %r513}, {%r617, %r618, %r619, %r620}, {%r175, %r176}, {%r512, %r513}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r522, %r523}, {%r617, %r618, %r619, %r620}, {%r177, %r178}, {%r522, %r523}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r532, %r533}, {%r622, %r623, %r624, %r625}, {%r170, %r171}, {%r532, %r533}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r542, %r543}, {%r622, %r623, %r624, %r625}, {%r172, %r173}, {%r542, %r543}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r552, %r553}, {%r622, %r623, %r624, %r625}, {%r175, %r176}, {%r552, %r553}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r562, %r563}, {%r622, %r623, %r624, %r625}, {%r177, %r178}, {%r562, %r563}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r572, %r573}, {%r627, %r628, %r629, %r630}, {%r170, %r171}, {%r572, %r573}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r582, %r583}, {%r627, %r628, %r629, %r630}, {%r172, %r173}, {%r582, %r583}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r592, %r593}, {%r627, %r628, %r629, %r630}, {%r175, %r176}, {%r592, %r593}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r602, %r603}, {%r627, %r628, %r629, %r630}, {%r177, %r178}, {%r602, %r603}; // end inline asm // begin inline asm mul.f16x2 %r952, %r452, %r954; // end inline asm mov.u32 %r1171, 2080340991; // begin inline asm min.xorsign.abs.f16x2 %r955, %r952, %r1171; // end inline asm // begin inline asm mul.f16x2 %r958, %r453, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r961, %r958, %r1171; // end inline asm // begin inline asm mul.f16x2 %r964, %r462, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r967, %r964, %r1171; // end inline asm // begin inline asm mul.f16x2 %r970, %r463, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r973, %r970, %r1171; // end inline asm // begin inline asm mov.b32 {%rs1, %rs2}, %r955; // end inline asm // begin inline asm cvt.f32.f16 %f187, %rs1; // end inline asm // begin inline asm cvt.f32.f16 %f188, %rs2; // end inline asm // begin inline asm mov.b32 {%rs5, %rs6}, %r961; // end inline asm // begin inline asm cvt.f32.f16 %f189, %rs5; // end inline asm // begin inline asm cvt.f32.f16 %f190, %rs6; // end inline asm // begin inline asm mov.b32 {%rs9, %rs10}, %r967; // end inline asm // begin inline asm cvt.f32.f16 %f191, %rs9; // end inline asm // begin inline asm cvt.f32.f16 %f192, %rs10; // end inline asm // begin inline asm mov.b32 {%rs13, %rs14}, %r973; // end inline asm // begin inline asm cvt.f32.f16 %f193, %rs13; // end inline asm // begin inline asm cvt.f32.f16 %f194, %rs14; // end inline asm // begin inline asm mul.f16x2 %r980, %r472, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r983, %r980, %r1171; // end inline asm // begin inline asm mul.f16x2 %r986, %r473, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r989, %r986, %r1171; // end inline asm // begin inline asm mul.f16x2 %r992, %r482, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r995, %r992, %r1171; // end inline asm // begin inline asm mul.f16x2 %r998, %r483, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1001, %r998, %r1171; // end inline asm // begin inline asm mov.b32 {%rs17, %rs18}, %r983; // end inline asm // begin inline asm cvt.f32.f16 %f195, %rs17; // end inline asm // begin inline asm cvt.f32.f16 %f196, %rs18; // end inline asm // begin inline asm mov.b32 {%rs21, %rs22}, %r989; // end inline asm // begin inline asm cvt.f32.f16 %f197, %rs21; // end inline asm // begin inline asm cvt.f32.f16 %f198, %rs22; // end inline asm // begin inline asm mov.b32 {%rs25, %rs26}, %r995; // end inline asm // begin inline asm cvt.f32.f16 %f199, %rs25; // end inline asm // begin inline asm cvt.f32.f16 %f200, %rs26; // end inline asm // begin inline asm mov.b32 {%rs29, %rs30}, %r1001; // end inline asm // begin inline asm cvt.f32.f16 %f201, %rs29; // end inline asm // begin inline asm cvt.f32.f16 %f202, %rs30; // end inline asm // begin inline asm mul.f16x2 %r1008, %r492, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1011, %r1008, %r1171; // end inline asm // begin inline asm mul.f16x2 %r1014, %r493, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1017, %r1014, %r1171; // end inline asm // begin inline asm mul.f16x2 %r1020, %r502, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1023, %r1020, %r1171; // end inline asm // begin inline asm mul.f16x2 %r1026, %r503, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1029, %r1026, %r1171; // end inline asm // begin inline asm mov.b32 {%rs33, %rs34}, %r1011; // end inline asm // begin inline asm cvt.f32.f16 %f203, %rs33; // end inline asm // begin inline asm cvt.f32.f16 %f204, %rs34; // end inline asm // begin inline asm mov.b32 {%rs37, %rs38}, %r1017; // end inline asm // begin inline asm cvt.f32.f16 %f205, %rs37; // end inline asm // begin inline asm cvt.f32.f16 %f206, %rs38; // end inline asm // begin inline asm mov.b32 {%rs41, %rs42}, %r1023; // end inline asm // begin inline asm cvt.f32.f16 %f207, %rs41; // end inline asm // begin inline asm cvt.f32.f16 %f208, %rs42; // end inline asm // begin inline asm mov.b32 {%rs45, %rs46}, %r1029; // end inline asm // begin inline asm cvt.f32.f16 %f209, %rs45; // end inline asm // begin inline asm cvt.f32.f16 %f210, %rs46; // end inline asm // begin inline asm mul.f16x2 %r1036, %r512, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1039, %r1036, %r1171; // end inline asm // begin inline asm mul.f16x2 %r1042, %r513, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1045, %r1042, %r1171; // end inline asm // begin inline asm mul.f16x2 %r1048, %r522, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1051, %r1048, %r1171; // end inline asm // begin inline asm mul.f16x2 %r1054, %r523, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1057, %r1054, %r1171; // end inline asm // begin inline asm mov.b32 {%rs49, %rs50}, %r1039; // end inline asm // begin inline asm cvt.f32.f16 %f211, %rs49; // end inline asm // begin inline asm cvt.f32.f16 %f212, %rs50; // end inline asm // begin inline asm mov.b32 {%rs53, %rs54}, %r1045; // end inline asm // begin inline asm cvt.f32.f16 %f213, %rs53; // end inline asm // begin inline asm cvt.f32.f16 %f214, %rs54; // end inline asm // begin inline asm mov.b32 {%rs57, %rs58}, %r1051; // end inline asm // begin inline asm cvt.f32.f16 %f215, %rs57; // end inline asm // begin inline asm cvt.f32.f16 %f216, %rs58; // end inline asm // begin inline asm mov.b32 {%rs61, %rs62}, %r1057; // end inline asm // begin inline asm cvt.f32.f16 %f217, %rs61; // end inline asm // begin inline asm cvt.f32.f16 %f218, %rs62; // end inline asm // begin inline asm mul.f16x2 %r1064, %r532, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1067, %r1064, %r1171; // end inline asm // begin inline asm mul.f16x2 %r1070, %r533, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1073, %r1070, %r1171; // end inline asm // begin inline asm mul.f16x2 %r1076, %r542, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1079, %r1076, %r1171; // end inline asm // begin inline asm mul.f16x2 %r1082, %r543, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1085, %r1082, %r1171; // end inline asm // begin inline asm mov.b32 {%rs65, %rs66}, %r1067; // end inline asm // begin inline asm cvt.f32.f16 %f219, %rs65; // end inline asm // begin inline asm cvt.f32.f16 %f220, %rs66; // end inline asm // begin inline asm mov.b32 {%rs69, %rs70}, %r1073; // end inline asm // begin inline asm cvt.f32.f16 %f221, %rs69; // end inline asm // begin inline asm cvt.f32.f16 %f222, %rs70; // end inline asm // begin inline asm mov.b32 {%rs73, %rs74}, %r1079; // end inline asm // begin inline asm cvt.f32.f16 %f223, %rs73; // end inline asm // begin inline asm cvt.f32.f16 %f224, %rs74; // end inline asm // begin inline asm mov.b32 {%rs77, %rs78}, %r1085; // end inline asm // begin inline asm cvt.f32.f16 %f225, %rs77; // end inline asm // begin inline asm cvt.f32.f16 %f226, %rs78; // end inline asm // begin inline asm mul.f16x2 %r1092, %r552, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1095, %r1092, %r1171; // end inline asm // begin inline asm mul.f16x2 %r1098, %r553, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1101, %r1098, %r1171; // end inline asm // begin inline asm mul.f16x2 %r1104, %r562, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1107, %r1104, %r1171; // end inline asm // begin inline asm mul.f16x2 %r1110, %r563, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1113, %r1110, %r1171; // end inline asm // begin inline asm mov.b32 {%rs81, %rs82}, %r1095; // end inline asm // begin inline asm cvt.f32.f16 %f227, %rs81; // end inline asm // begin inline asm cvt.f32.f16 %f228, %rs82; // end inline asm // begin inline asm mov.b32 {%rs85, %rs86}, %r1101; // end inline asm // begin inline asm cvt.f32.f16 %f229, %rs85; // end inline asm // begin inline asm cvt.f32.f16 %f230, %rs86; // end inline asm // begin inline asm mov.b32 {%rs89, %rs90}, %r1107; // end inline asm // begin inline asm cvt.f32.f16 %f231, %rs89; // end inline asm // begin inline asm cvt.f32.f16 %f232, %rs90; // end inline asm // begin inline asm mov.b32 {%rs93, %rs94}, %r1113; // end inline asm // begin inline asm cvt.f32.f16 %f233, %rs93; // end inline asm // begin inline asm cvt.f32.f16 %f234, %rs94; // end inline asm // begin inline asm mul.f16x2 %r1120, %r572, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1123, %r1120, %r1171; // end inline asm // begin inline asm mul.f16x2 %r1126, %r573, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1129, %r1126, %r1171; // end inline asm // begin inline asm mul.f16x2 %r1132, %r582, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1135, %r1132, %r1171; // end inline asm // begin inline asm mul.f16x2 %r1138, %r583, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1141, %r1138, %r1171; // end inline asm // begin inline asm mov.b32 {%rs97, %rs98}, %r1123; // end inline asm // begin inline asm cvt.f32.f16 %f235, %rs97; // end inline asm // begin inline asm cvt.f32.f16 %f236, %rs98; // end inline asm // begin inline asm mov.b32 {%rs101, %rs102}, %r1129; // end inline asm // begin inline asm cvt.f32.f16 %f237, %rs101; // end inline asm // begin inline asm cvt.f32.f16 %f238, %rs102; // end inline asm // begin inline asm mov.b32 {%rs105, %rs106}, %r1135; // end inline asm // begin inline asm cvt.f32.f16 %f239, %rs105; // end inline asm // begin inline asm cvt.f32.f16 %f240, %rs106; // end inline asm // begin inline asm mov.b32 {%rs109, %rs110}, %r1141; // end inline asm // begin inline asm cvt.f32.f16 %f241, %rs109; // end inline asm // begin inline asm cvt.f32.f16 %f242, %rs110; // end inline asm // begin inline asm mul.f16x2 %r1148, %r592, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1151, %r1148, %r1171; // end inline asm // begin inline asm mul.f16x2 %r1154, %r593, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1157, %r1154, %r1171; // end inline asm // begin inline asm mul.f16x2 %r1160, %r602, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1163, %r1160, %r1171; // end inline asm // begin inline asm mul.f16x2 %r1166, %r603, %r954; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1169, %r1166, %r1171; // end inline asm // begin inline asm mov.b32 {%rs113, %rs114}, %r1151; // end inline asm // begin inline asm cvt.f32.f16 %f243, %rs113; // end inline asm // begin inline asm cvt.f32.f16 %f244, %rs114; // end inline asm // begin inline asm mov.b32 {%rs117, %rs118}, %r1157; // end inline asm // begin inline asm cvt.f32.f16 %f245, %rs117; // end inline asm // begin inline asm cvt.f32.f16 %f246, %rs118; // end inline asm // begin inline asm mov.b32 {%rs121, %rs122}, %r1163; // end inline asm // begin inline asm cvt.f32.f16 %f247, %rs121; // end inline asm // begin inline asm cvt.f32.f16 %f248, %rs122; // end inline asm // begin inline asm mov.b32 {%rs125, %rs126}, %r1169; // end inline asm // begin inline asm cvt.f32.f16 %f249, %rs125; // end inline asm // begin inline asm cvt.f32.f16 %f250, %rs126; // end inline asm add.s32 %r1308, %r1269, %r1268; setp.lt.s32 %p50, %r1308, %r1181; selp.f32 %f1, %f187, 0fFF800000, %p50; or.b32 %r1309, %r1308, 1; setp.lt.s32 %p51, %r1309, %r1181; selp.f32 %f2, %f188, 0fFF800000, %p51; add.s32 %r1310, %r1308, 8; setp.lt.s32 %p52, %r1310, %r1181; selp.f32 %f3, %f191, 0fFF800000, %p52; add.s32 %r1311, %r1308, 9; setp.lt.s32 %p53, %r1311, %r1181; selp.f32 %f4, %f192, 0fFF800000, %p53; add.s32 %r1312, %r1308, 64; setp.lt.s32 %p54, %r1312, %r1181; selp.f32 %f5, %f195, 0fFF800000, %p54; add.s32 %r1313, %r1308, 65; setp.lt.s32 %p55, %r1313, %r1181; selp.f32 %f6, %f196, 0fFF800000, %p55; add.s32 %r1314, %r1308, 72; setp.lt.s32 %p56, %r1314, %r1181; selp.f32 %f7, %f199, 0fFF800000, %p56; add.s32 %r1315, %r1308, 73; setp.lt.s32 %p57, %r1315, %r1181; selp.f32 %f8, %f200, 0fFF800000, %p57; selp.f32 %f9, %f189, 0fFF800000, %p50; selp.f32 %f10, %f190, 0fFF800000, %p51; selp.f32 %f11, %f193, 0fFF800000, %p52; selp.f32 %f12, %f194, 0fFF800000, %p53; selp.f32 %f13, %f197, 0fFF800000, %p54; selp.f32 %f14, %f198, 0fFF800000, %p55; selp.f32 %f15, %f201, 0fFF800000, %p56; selp.f32 %f16, %f202, 0fFF800000, %p57; selp.f32 %f17, %f203, 0fFF800000, %p50; selp.f32 %f18, %f204, 0fFF800000, %p51; selp.f32 %f19, %f207, 0fFF800000, %p52; selp.f32 %f20, %f208, 0fFF800000, %p53; selp.f32 %f21, %f211, 0fFF800000, %p54; selp.f32 %f22, %f212, 0fFF800000, %p55; selp.f32 %f23, %f215, 0fFF800000, %p56; selp.f32 %f24, %f216, 0fFF800000, %p57; selp.f32 %f25, %f205, 0fFF800000, %p50; selp.f32 %f26, %f206, 0fFF800000, %p51; selp.f32 %f27, %f209, 0fFF800000, %p52; selp.f32 %f28, %f210, 0fFF800000, %p53; selp.f32 %f29, %f213, 0fFF800000, %p54; selp.f32 %f30, %f214, 0fFF800000, %p55; selp.f32 %f31, %f217, 0fFF800000, %p56; selp.f32 %f32, %f218, 0fFF800000, %p57; selp.f32 %f33, %f219, 0fFF800000, %p50; selp.f32 %f34, %f220, 0fFF800000, %p51; selp.f32 %f35, %f223, 0fFF800000, %p52; selp.f32 %f36, %f224, 0fFF800000, %p53; selp.f32 %f37, %f227, 0fFF800000, %p54; selp.f32 %f38, %f228, 0fFF800000, %p55; selp.f32 %f39, %f231, 0fFF800000, %p56; selp.f32 %f40, %f232, 0fFF800000, %p57; selp.f32 %f41, %f221, 0fFF800000, %p50; selp.f32 %f42, %f222, 0fFF800000, %p51; selp.f32 %f43, %f225, 0fFF800000, %p52; selp.f32 %f44, %f226, 0fFF800000, %p53; selp.f32 %f45, %f229, 0fFF800000, %p54; selp.f32 %f46, %f230, 0fFF800000, %p55; selp.f32 %f47, %f233, 0fFF800000, %p56; selp.f32 %f48, %f234, 0fFF800000, %p57; selp.f32 %f49, %f235, 0fFF800000, %p50; selp.f32 %f50, %f236, 0fFF800000, %p51; selp.f32 %f51, %f239, 0fFF800000, %p52; selp.f32 %f52, %f240, 0fFF800000, %p53; selp.f32 %f53, %f243, 0fFF800000, %p54; selp.f32 %f54, %f244, 0fFF800000, %p55; selp.f32 %f55, %f247, 0fFF800000, %p56; selp.f32 %f56, %f248, 0fFF800000, %p57; selp.f32 %f57, %f237, 0fFF800000, %p50; selp.f32 %f58, %f238, 0fFF800000, %p51; selp.f32 %f59, %f241, 0fFF800000, %p52; selp.f32 %f60, %f242, 0fFF800000, %p53; selp.f32 %f61, %f245, 0fFF800000, %p54; selp.f32 %f62, %f246, 0fFF800000, %p55; selp.f32 %f63, %f249, 0fFF800000, %p56; selp.f32 %f64, %f250, 0fFF800000, %p57; setp.gt.ftz.f32 %p58, %f1, %f2; selp.f32 %f251, %f1, %f2, %p58; setp.gt.ftz.f32 %p59, %f251, %f3; selp.f32 %f252, %f251, %f3, %p59; setp.gt.ftz.f32 %p60, %f252, %f4; selp.f32 %f253, %f252, %f4, %p60; setp.gt.ftz.f32 %p61, %f253, %f5; selp.f32 %f254, %f253, %f5, %p61; setp.gt.ftz.f32 %p62, %f254, %f6; selp.f32 %f255, %f254, %f6, %p62; setp.gt.ftz.f32 %p63, %f255, %f7; selp.f32 %f256, %f255, %f7, %p63; setp.gt.ftz.f32 %p64, %f256, %f8; selp.f32 %f257, %f256, %f8, %p64; setp.gt.ftz.f32 %p65, %f9, %f10; selp.f32 %f258, %f9, %f10, %p65; setp.gt.ftz.f32 %p66, %f258, %f11; selp.f32 %f259, %f258, %f11, %p66; setp.gt.ftz.f32 %p67, %f259, %f12; selp.f32 %f260, %f259, %f12, %p67; setp.gt.ftz.f32 %p68, %f260, %f13; selp.f32 %f261, %f260, %f13, %p68; setp.gt.ftz.f32 %p69, %f261, %f14; selp.f32 %f262, %f261, %f14, %p69; setp.gt.ftz.f32 %p70, %f262, %f15; selp.f32 %f263, %f262, %f15, %p70; setp.gt.ftz.f32 %p71, %f263, %f16; selp.f32 %f264, %f263, %f16, %p71; setp.gt.ftz.f32 %p72, %f17, %f18; selp.f32 %f265, %f17, %f18, %p72; setp.gt.ftz.f32 %p73, %f265, %f19; selp.f32 %f266, %f265, %f19, %p73; setp.gt.ftz.f32 %p74, %f266, %f20; selp.f32 %f267, %f266, %f20, %p74; setp.gt.ftz.f32 %p75, %f267, %f21; selp.f32 %f268, %f267, %f21, %p75; setp.gt.ftz.f32 %p76, %f268, %f22; selp.f32 %f269, %f268, %f22, %p76; setp.gt.ftz.f32 %p77, %f269, %f23; selp.f32 %f270, %f269, %f23, %p77; setp.gt.ftz.f32 %p78, %f270, %f24; selp.f32 %f271, %f270, %f24, %p78; setp.gt.ftz.f32 %p79, %f25, %f26; selp.f32 %f272, %f25, %f26, %p79; setp.gt.ftz.f32 %p80, %f272, %f27; selp.f32 %f273, %f272, %f27, %p80; setp.gt.ftz.f32 %p81, %f273, %f28; selp.f32 %f274, %f273, %f28, %p81; setp.gt.ftz.f32 %p82, %f274, %f29; selp.f32 %f275, %f274, %f29, %p82; setp.gt.ftz.f32 %p83, %f275, %f30; selp.f32 %f276, %f275, %f30, %p83; setp.gt.ftz.f32 %p84, %f276, %f31; selp.f32 %f277, %f276, %f31, %p84; setp.gt.ftz.f32 %p85, %f277, %f32; selp.f32 %f278, %f277, %f32, %p85; setp.gt.ftz.f32 %p86, %f33, %f34; selp.f32 %f279, %f33, %f34, %p86; setp.gt.ftz.f32 %p87, %f279, %f35; selp.f32 %f280, %f279, %f35, %p87; setp.gt.ftz.f32 %p88, %f280, %f36; selp.f32 %f281, %f280, %f36, %p88; setp.gt.ftz.f32 %p89, %f281, %f37; selp.f32 %f282, %f281, %f37, %p89; setp.gt.ftz.f32 %p90, %f282, %f38; selp.f32 %f283, %f282, %f38, %p90; setp.gt.ftz.f32 %p91, %f283, %f39; selp.f32 %f284, %f283, %f39, %p91; setp.gt.ftz.f32 %p92, %f284, %f40; selp.f32 %f285, %f284, %f40, %p92; setp.gt.ftz.f32 %p93, %f41, %f42; selp.f32 %f286, %f41, %f42, %p93; setp.gt.ftz.f32 %p94, %f286, %f43; selp.f32 %f287, %f286, %f43, %p94; setp.gt.ftz.f32 %p95, %f287, %f44; selp.f32 %f288, %f287, %f44, %p95; setp.gt.ftz.f32 %p96, %f288, %f45; selp.f32 %f289, %f288, %f45, %p96; setp.gt.ftz.f32 %p97, %f289, %f46; selp.f32 %f290, %f289, %f46, %p97; setp.gt.ftz.f32 %p98, %f290, %f47; selp.f32 %f291, %f290, %f47, %p98; setp.gt.ftz.f32 %p99, %f291, %f48; selp.f32 %f292, %f291, %f48, %p99; setp.gt.ftz.f32 %p100, %f49, %f50; selp.f32 %f293, %f49, %f50, %p100; setp.gt.ftz.f32 %p101, %f293, %f51; selp.f32 %f294, %f293, %f51, %p101; setp.gt.ftz.f32 %p102, %f294, %f52; selp.f32 %f295, %f294, %f52, %p102; setp.gt.ftz.f32 %p103, %f295, %f53; selp.f32 %f296, %f295, %f53, %p103; setp.gt.ftz.f32 %p104, %f296, %f54; selp.f32 %f297, %f296, %f54, %p104; setp.gt.ftz.f32 %p105, %f297, %f55; selp.f32 %f298, %f297, %f55, %p105; setp.gt.ftz.f32 %p106, %f298, %f56; selp.f32 %f299, %f298, %f56, %p106; setp.gt.ftz.f32 %p107, %f57, %f58; selp.f32 %f300, %f57, %f58, %p107; setp.gt.ftz.f32 %p108, %f300, %f59; selp.f32 %f301, %f300, %f59, %p108; setp.gt.ftz.f32 %p109, %f301, %f60; selp.f32 %f302, %f301, %f60, %p109; setp.gt.ftz.f32 %p110, %f302, %f61; selp.f32 %f303, %f302, %f61, %p110; setp.gt.ftz.f32 %p111, %f303, %f62; selp.f32 %f304, %f303, %f62, %p111; setp.gt.ftz.f32 %p112, %f304, %f63; selp.f32 %f305, %f304, %f63, %p112; setp.gt.ftz.f32 %p113, %f305, %f64; selp.f32 %f306, %f305, %f64, %p113; mov.b32 %r1316, %f257; shfl.sync.bfly.b32 %r1317|%p114, %r1316, %r1216, %r1204, %r1206; mov.b32 %f307, %r1317; setp.gt.ftz.f32 %p115, %f257, %f307; selp.f32 %f65, %f257, %f307, %p115; mov.b32 %r1318, %f65; shfl.sync.bfly.b32 %r43|%p1, %r1318, %r1302, %r1204, %r1206; mov.b32 %r1319, %f264; shfl.sync.bfly.b32 %r1320|%p116, %r1319, %r1216, %r1204, %r1206; mov.b32 %f308, %r1320; setp.gt.ftz.f32 %p117, %f264, %f308; selp.f32 %f66, %f264, %f308, %p117; mov.b32 %r1321, %f66; shfl.sync.bfly.b32 %r44|%p2, %r1321, %r1302, %r1204, %r1206; mov.b32 %r1322, %f271; shfl.sync.bfly.b32 %r1323|%p118, %r1322, %r1216, %r1204, %r1206; mov.b32 %f309, %r1323; setp.gt.ftz.f32 %p119, %f271, %f309; selp.f32 %f67, %f271, %f309, %p119; mov.b32 %r1324, %f67; shfl.sync.bfly.b32 %r45|%p3, %r1324, %r1302, %r1204, %r1206; mov.b32 %r1325, %f278; shfl.sync.bfly.b32 %r1326|%p120, %r1325, %r1216, %r1204, %r1206; mov.b32 %f310, %r1326; setp.gt.ftz.f32 %p121, %f278, %f310; selp.f32 %f68, %f278, %f310, %p121; mov.b32 %r1327, %f68; shfl.sync.bfly.b32 %r46|%p4, %r1327, %r1302, %r1204, %r1206; mov.b32 %r1328, %f285; shfl.sync.bfly.b32 %r1329|%p122, %r1328, %r1216, %r1204, %r1206; mov.b32 %f311, %r1329; setp.gt.ftz.f32 %p123, %f285, %f311; selp.f32 %f69, %f285, %f311, %p123; mov.b32 %r1330, %f69; shfl.sync.bfly.b32 %r47|%p5, %r1330, %r1302, %r1204, %r1206; mov.b32 %r1331, %f292; shfl.sync.bfly.b32 %r1332|%p124, %r1331, %r1216, %r1204, %r1206; mov.b32 %f312, %r1332; setp.gt.ftz.f32 %p125, %f292, %f312; selp.f32 %f70, %f292, %f312, %p125; mov.b32 %r1333, %f70; shfl.sync.bfly.b32 %r48|%p6, %r1333, %r1302, %r1204, %r1206; mov.b32 %r1334, %f299; shfl.sync.bfly.b32 %r1335|%p126, %r1334, %r1216, %r1204, %r1206; mov.b32 %f313, %r1335; setp.gt.ftz.f32 %p127, %f299, %f313; selp.f32 %f71, %f299, %f313, %p127; mov.b32 %r1336, %f71; shfl.sync.bfly.b32 %r49|%p7, %r1336, %r1302, %r1204, %r1206; mov.b32 %r1337, %f306; shfl.sync.bfly.b32 %r1338|%p128, %r1337, %r1216, %r1204, %r1206; mov.b32 %f314, %r1338; setp.gt.ftz.f32 %p129, %f306, %f314; selp.f32 %f72, %f306, %f314, %p129; mov.b32 %r1339, %f72; shfl.sync.bfly.b32 %r50|%p8, %r1339, %r1302, %r1204, %r1206; and.b32 %r51, %r1, 3; setp.ne.s32 %p130, %r51, 0; @%p130 bra $L__BB0_3; mov.b32 %f315, %r43; mov.b32 %f316, %r44; mov.b32 %f317, %r45; mov.b32 %f318, %r46; mov.b32 %f319, %r47; mov.b32 %f320, %r48; mov.b32 %f321, %r49; mov.b32 %f322, %r50; setp.gt.ftz.f32 %p131, %f65, %f315; selp.f32 %f323, %f65, %f315, %p131; st.shared.f32 [%r42], %f323; setp.gt.ftz.f32 %p132, %f66, %f316; selp.f32 %f324, %f66, %f316, %p132; st.shared.f32 [%r42+128], %f324; setp.gt.ftz.f32 %p133, %f67, %f317; selp.f32 %f325, %f67, %f317, %p133; st.shared.f32 [%r42+256], %f325; setp.gt.ftz.f32 %p134, %f68, %f318; selp.f32 %f326, %f68, %f318, %p134; st.shared.f32 [%r42+384], %f326; setp.gt.ftz.f32 %p135, %f69, %f319; selp.f32 %f327, %f69, %f319, %p135; st.shared.f32 [%r42+512], %f327; setp.gt.ftz.f32 %p136, %f70, %f320; selp.f32 %f328, %f70, %f320, %p136; st.shared.f32 [%r42+640], %f328; setp.gt.ftz.f32 %p137, %f71, %f321; selp.f32 %f329, %f71, %f321, %p137; st.shared.f32 [%r42+768], %f329; setp.gt.ftz.f32 %p138, %f72, %f322; selp.f32 %f330, %f72, %f322, %p138; st.shared.f32 [%r42+896], %f330; $L__BB0_3: mov.u32 %r2488, _ZN25fused_multihead_attention5smem_E; mov.u32 %r2487, %tid.x; bar.sync 0; add.s32 %r52, %r2488, %r1210; setp.gt.s32 %p139, %r2487, 63; @%p139 bra $L__BB0_5; ld.shared.v4.f32 {%f648, %f653, %f650, %f655}, [%r52+16384]; $L__BB0_5: mov.u32 %r2490, _ZN25fused_multihead_attention5smem_E; mov.u32 %r2489, %tid.x; setp.gt.ftz.f32 %p140, %f648, %f653; selp.f32 %f336, %f648, %f653, %p140; setp.gt.ftz.f32 %p141, %f650, %f655; selp.f32 %f654, %f650, %f655, %p141; setp.gt.ftz.f32 %p142, %f336, %f654; selp.f32 %f652, %f336, %f654, %p142; bar.sync 0; shl.b32 %r1342, %r2489, 2; add.s32 %r53, %r2490, %r1342; @%p139 bra $L__BB0_7; st.shared.f32 [%r53+16384], %f652; $L__BB0_7: mov.u32 %r2493, %tid.x; and.b32 %r2492, %r2493, 3; setp.ne.s32 %p172, %r2492, 0; mov.u32 %r2491, _ZN25fused_multihead_attention5smem_E; shl.b32 %r1344, %r41, 2; add.s32 %r1346, %r2491, %r1344; mov.u32 %r1347, 2; bar.sync 0; ld.shared.f32 %f337, [%r1346+16384]; ld.shared.f32 %f338, [%r1346+16416]; ld.shared.f32 %f339, [%r1346+16448]; ld.shared.f32 %f340, [%r1346+16480]; ld.shared.f32 %f341, [%r1346+16512]; ld.shared.f32 %f342, [%r1346+16544]; ld.shared.f32 %f343, [%r1346+16576]; ld.shared.f32 %f344, [%r1346+16608]; bar.sync 0; bar.sync 0; sub.ftz.f32 %f345, %f1, %f337; mul.ftz.f32 %f346, %f345, 0f3FB8AA3B; ex2.approx.ftz.f32 %f83, %f346; sub.ftz.f32 %f347, %f2, %f337; mul.ftz.f32 %f348, %f347, 0f3FB8AA3B; ex2.approx.ftz.f32 %f84, %f348; sub.ftz.f32 %f349, %f3, %f337; mul.ftz.f32 %f350, %f349, 0f3FB8AA3B; ex2.approx.ftz.f32 %f85, %f350; sub.ftz.f32 %f351, %f4, %f337; mul.ftz.f32 %f352, %f351, 0f3FB8AA3B; ex2.approx.ftz.f32 %f86, %f352; sub.ftz.f32 %f353, %f5, %f337; mul.ftz.f32 %f354, %f353, 0f3FB8AA3B; ex2.approx.ftz.f32 %f87, %f354; sub.ftz.f32 %f355, %f6, %f337; mul.ftz.f32 %f356, %f355, 0f3FB8AA3B; ex2.approx.ftz.f32 %f88, %f356; sub.ftz.f32 %f357, %f7, %f337; mul.ftz.f32 %f358, %f357, 0f3FB8AA3B; ex2.approx.ftz.f32 %f89, %f358; sub.ftz.f32 %f359, %f8, %f337; mul.ftz.f32 %f360, %f359, 0f3FB8AA3B; ex2.approx.ftz.f32 %f90, %f360; sub.ftz.f32 %f361, %f9, %f338; mul.ftz.f32 %f362, %f361, 0f3FB8AA3B; ex2.approx.ftz.f32 %f91, %f362; sub.ftz.f32 %f363, %f10, %f338; mul.ftz.f32 %f364, %f363, 0f3FB8AA3B; ex2.approx.ftz.f32 %f92, %f364; sub.ftz.f32 %f365, %f11, %f338; mul.ftz.f32 %f366, %f365, 0f3FB8AA3B; ex2.approx.ftz.f32 %f93, %f366; sub.ftz.f32 %f367, %f12, %f338; mul.ftz.f32 %f368, %f367, 0f3FB8AA3B; ex2.approx.ftz.f32 %f94, %f368; sub.ftz.f32 %f369, %f13, %f338; mul.ftz.f32 %f370, %f369, 0f3FB8AA3B; ex2.approx.ftz.f32 %f95, %f370; sub.ftz.f32 %f371, %f14, %f338; mul.ftz.f32 %f372, %f371, 0f3FB8AA3B; ex2.approx.ftz.f32 %f96, %f372; sub.ftz.f32 %f373, %f15, %f338; mul.ftz.f32 %f374, %f373, 0f3FB8AA3B; ex2.approx.ftz.f32 %f97, %f374; sub.ftz.f32 %f375, %f16, %f338; mul.ftz.f32 %f376, %f375, 0f3FB8AA3B; ex2.approx.ftz.f32 %f98, %f376; sub.ftz.f32 %f377, %f17, %f339; mul.ftz.f32 %f378, %f377, 0f3FB8AA3B; ex2.approx.ftz.f32 %f99, %f378; sub.ftz.f32 %f379, %f18, %f339; mul.ftz.f32 %f380, %f379, 0f3FB8AA3B; ex2.approx.ftz.f32 %f100, %f380; sub.ftz.f32 %f381, %f19, %f339; mul.ftz.f32 %f382, %f381, 0f3FB8AA3B; ex2.approx.ftz.f32 %f101, %f382; sub.ftz.f32 %f383, %f20, %f339; mul.ftz.f32 %f384, %f383, 0f3FB8AA3B; ex2.approx.ftz.f32 %f102, %f384; sub.ftz.f32 %f385, %f21, %f339; mul.ftz.f32 %f386, %f385, 0f3FB8AA3B; ex2.approx.ftz.f32 %f103, %f386; sub.ftz.f32 %f387, %f22, %f339; mul.ftz.f32 %f388, %f387, 0f3FB8AA3B; ex2.approx.ftz.f32 %f104, %f388; sub.ftz.f32 %f389, %f23, %f339; mul.ftz.f32 %f390, %f389, 0f3FB8AA3B; ex2.approx.ftz.f32 %f105, %f390; sub.ftz.f32 %f391, %f24, %f339; mul.ftz.f32 %f392, %f391, 0f3FB8AA3B; ex2.approx.ftz.f32 %f106, %f392; sub.ftz.f32 %f393, %f25, %f340; mul.ftz.f32 %f394, %f393, 0f3FB8AA3B; ex2.approx.ftz.f32 %f107, %f394; sub.ftz.f32 %f395, %f26, %f340; mul.ftz.f32 %f396, %f395, 0f3FB8AA3B; ex2.approx.ftz.f32 %f108, %f396; sub.ftz.f32 %f397, %f27, %f340; mul.ftz.f32 %f398, %f397, 0f3FB8AA3B; ex2.approx.ftz.f32 %f109, %f398; sub.ftz.f32 %f399, %f28, %f340; mul.ftz.f32 %f400, %f399, 0f3FB8AA3B; ex2.approx.ftz.f32 %f110, %f400; sub.ftz.f32 %f401, %f29, %f340; mul.ftz.f32 %f402, %f401, 0f3FB8AA3B; ex2.approx.ftz.f32 %f111, %f402; sub.ftz.f32 %f403, %f30, %f340; mul.ftz.f32 %f404, %f403, 0f3FB8AA3B; ex2.approx.ftz.f32 %f112, %f404; sub.ftz.f32 %f405, %f31, %f340; mul.ftz.f32 %f406, %f405, 0f3FB8AA3B; ex2.approx.ftz.f32 %f113, %f406; sub.ftz.f32 %f407, %f32, %f340; mul.ftz.f32 %f408, %f407, 0f3FB8AA3B; ex2.approx.ftz.f32 %f114, %f408; sub.ftz.f32 %f409, %f33, %f341; mul.ftz.f32 %f410, %f409, 0f3FB8AA3B; ex2.approx.ftz.f32 %f115, %f410; sub.ftz.f32 %f411, %f34, %f341; mul.ftz.f32 %f412, %f411, 0f3FB8AA3B; ex2.approx.ftz.f32 %f116, %f412; sub.ftz.f32 %f413, %f35, %f341; mul.ftz.f32 %f414, %f413, 0f3FB8AA3B; ex2.approx.ftz.f32 %f117, %f414; sub.ftz.f32 %f415, %f36, %f341; mul.ftz.f32 %f416, %f415, 0f3FB8AA3B; ex2.approx.ftz.f32 %f118, %f416; sub.ftz.f32 %f417, %f37, %f341; mul.ftz.f32 %f418, %f417, 0f3FB8AA3B; ex2.approx.ftz.f32 %f119, %f418; sub.ftz.f32 %f419, %f38, %f341; mul.ftz.f32 %f420, %f419, 0f3FB8AA3B; ex2.approx.ftz.f32 %f120, %f420; sub.ftz.f32 %f421, %f39, %f341; mul.ftz.f32 %f422, %f421, 0f3FB8AA3B; ex2.approx.ftz.f32 %f121, %f422; sub.ftz.f32 %f423, %f40, %f341; mul.ftz.f32 %f424, %f423, 0f3FB8AA3B; ex2.approx.ftz.f32 %f122, %f424; sub.ftz.f32 %f425, %f41, %f342; mul.ftz.f32 %f426, %f425, 0f3FB8AA3B; ex2.approx.ftz.f32 %f123, %f426; sub.ftz.f32 %f427, %f42, %f342; mul.ftz.f32 %f428, %f427, 0f3FB8AA3B; ex2.approx.ftz.f32 %f124, %f428; sub.ftz.f32 %f429, %f43, %f342; mul.ftz.f32 %f430, %f429, 0f3FB8AA3B; ex2.approx.ftz.f32 %f125, %f430; sub.ftz.f32 %f431, %f44, %f342; mul.ftz.f32 %f432, %f431, 0f3FB8AA3B; ex2.approx.ftz.f32 %f126, %f432; sub.ftz.f32 %f433, %f45, %f342; mul.ftz.f32 %f434, %f433, 0f3FB8AA3B; ex2.approx.ftz.f32 %f127, %f434; sub.ftz.f32 %f435, %f46, %f342; mul.ftz.f32 %f436, %f435, 0f3FB8AA3B; ex2.approx.ftz.f32 %f128, %f436; sub.ftz.f32 %f437, %f47, %f342; mul.ftz.f32 %f438, %f437, 0f3FB8AA3B; ex2.approx.ftz.f32 %f129, %f438; sub.ftz.f32 %f439, %f48, %f342; mul.ftz.f32 %f440, %f439, 0f3FB8AA3B; ex2.approx.ftz.f32 %f130, %f440; sub.ftz.f32 %f441, %f49, %f343; mul.ftz.f32 %f442, %f441, 0f3FB8AA3B; ex2.approx.ftz.f32 %f131, %f442; sub.ftz.f32 %f443, %f50, %f343; mul.ftz.f32 %f444, %f443, 0f3FB8AA3B; ex2.approx.ftz.f32 %f132, %f444; sub.ftz.f32 %f445, %f51, %f343; mul.ftz.f32 %f446, %f445, 0f3FB8AA3B; ex2.approx.ftz.f32 %f133, %f446; sub.ftz.f32 %f447, %f52, %f343; mul.ftz.f32 %f448, %f447, 0f3FB8AA3B; ex2.approx.ftz.f32 %f134, %f448; sub.ftz.f32 %f449, %f53, %f343; mul.ftz.f32 %f450, %f449, 0f3FB8AA3B; ex2.approx.ftz.f32 %f135, %f450; sub.ftz.f32 %f451, %f54, %f343; mul.ftz.f32 %f452, %f451, 0f3FB8AA3B; ex2.approx.ftz.f32 %f136, %f452; sub.ftz.f32 %f453, %f55, %f343; mul.ftz.f32 %f454, %f453, 0f3FB8AA3B; ex2.approx.ftz.f32 %f137, %f454; sub.ftz.f32 %f455, %f56, %f343; mul.ftz.f32 %f456, %f455, 0f3FB8AA3B; ex2.approx.ftz.f32 %f138, %f456; sub.ftz.f32 %f457, %f57, %f344; mul.ftz.f32 %f458, %f457, 0f3FB8AA3B; ex2.approx.ftz.f32 %f139, %f458; sub.ftz.f32 %f459, %f58, %f344; mul.ftz.f32 %f460, %f459, 0f3FB8AA3B; ex2.approx.ftz.f32 %f140, %f460; sub.ftz.f32 %f461, %f59, %f344; mul.ftz.f32 %f462, %f461, 0f3FB8AA3B; ex2.approx.ftz.f32 %f141, %f462; sub.ftz.f32 %f463, %f60, %f344; mul.ftz.f32 %f464, %f463, 0f3FB8AA3B; ex2.approx.ftz.f32 %f142, %f464; sub.ftz.f32 %f465, %f61, %f344; mul.ftz.f32 %f466, %f465, 0f3FB8AA3B; ex2.approx.ftz.f32 %f143, %f466; sub.ftz.f32 %f467, %f62, %f344; mul.ftz.f32 %f468, %f467, 0f3FB8AA3B; ex2.approx.ftz.f32 %f144, %f468; sub.ftz.f32 %f469, %f63, %f344; mul.ftz.f32 %f470, %f469, 0f3FB8AA3B; ex2.approx.ftz.f32 %f145, %f470; sub.ftz.f32 %f471, %f64, %f344; mul.ftz.f32 %f472, %f471, 0f3FB8AA3B; ex2.approx.ftz.f32 %f146, %f472; add.ftz.f32 %f473, %f83, %f84; add.ftz.f32 %f474, %f473, 0f00000000; add.ftz.f32 %f475, %f85, %f86; add.ftz.f32 %f476, %f475, 0f00000000; add.ftz.f32 %f477, %f87, %f88; add.ftz.f32 %f478, %f474, %f477; add.ftz.f32 %f479, %f89, %f90; add.ftz.f32 %f480, %f476, %f479; add.ftz.f32 %f481, %f478, %f480; add.ftz.f32 %f482, %f91, %f92; add.ftz.f32 %f483, %f482, 0f00000000; add.ftz.f32 %f484, %f93, %f94; add.ftz.f32 %f485, %f484, 0f00000000; add.ftz.f32 %f486, %f95, %f96; add.ftz.f32 %f487, %f483, %f486; add.ftz.f32 %f488, %f97, %f98; add.ftz.f32 %f489, %f485, %f488; add.ftz.f32 %f490, %f487, %f489; add.ftz.f32 %f491, %f99, %f100; add.ftz.f32 %f492, %f491, 0f00000000; add.ftz.f32 %f493, %f101, %f102; add.ftz.f32 %f494, %f493, 0f00000000; add.ftz.f32 %f495, %f103, %f104; add.ftz.f32 %f496, %f492, %f495; add.ftz.f32 %f497, %f105, %f106; add.ftz.f32 %f498, %f494, %f497; add.ftz.f32 %f499, %f496, %f498; add.ftz.f32 %f500, %f107, %f108; add.ftz.f32 %f501, %f500, 0f00000000; add.ftz.f32 %f502, %f109, %f110; add.ftz.f32 %f503, %f502, 0f00000000; add.ftz.f32 %f504, %f111, %f112; add.ftz.f32 %f505, %f501, %f504; add.ftz.f32 %f506, %f113, %f114; add.ftz.f32 %f507, %f503, %f506; add.ftz.f32 %f508, %f505, %f507; add.ftz.f32 %f509, %f115, %f116; add.ftz.f32 %f510, %f509, 0f00000000; add.ftz.f32 %f511, %f117, %f118; add.ftz.f32 %f512, %f511, 0f00000000; add.ftz.f32 %f513, %f119, %f120; add.ftz.f32 %f514, %f510, %f513; add.ftz.f32 %f515, %f121, %f122; add.ftz.f32 %f516, %f512, %f515; add.ftz.f32 %f517, %f514, %f516; add.ftz.f32 %f518, %f123, %f124; add.ftz.f32 %f519, %f518, 0f00000000; add.ftz.f32 %f520, %f125, %f126; add.ftz.f32 %f521, %f520, 0f00000000; add.ftz.f32 %f522, %f127, %f128; add.ftz.f32 %f523, %f519, %f522; add.ftz.f32 %f524, %f129, %f130; add.ftz.f32 %f525, %f521, %f524; add.ftz.f32 %f526, %f523, %f525; add.ftz.f32 %f527, %f131, %f132; add.ftz.f32 %f528, %f527, 0f00000000; add.ftz.f32 %f529, %f133, %f134; add.ftz.f32 %f530, %f529, 0f00000000; add.ftz.f32 %f531, %f135, %f136; add.ftz.f32 %f532, %f528, %f531; add.ftz.f32 %f533, %f137, %f138; add.ftz.f32 %f534, %f530, %f533; add.ftz.f32 %f535, %f532, %f534; add.ftz.f32 %f536, %f139, %f140; add.ftz.f32 %f537, %f536, 0f00000000; add.ftz.f32 %f538, %f141, %f142; add.ftz.f32 %f539, %f538, 0f00000000; add.ftz.f32 %f540, %f143, %f144; add.ftz.f32 %f541, %f537, %f540; add.ftz.f32 %f542, %f145, %f146; add.ftz.f32 %f543, %f539, %f542; add.ftz.f32 %f544, %f541, %f543; mov.b32 %r1348, %f481; mov.u32 %r1349, 31; mov.u32 %r1350, 1; mov.u32 %r1351, -1; shfl.sync.bfly.b32 %r1352|%p145, %r1348, %r1350, %r1349, %r1351; mov.b32 %f545, %r1352; add.ftz.f32 %f147, %f481, %f545; mov.b32 %r1353, %f147; shfl.sync.bfly.b32 %r55|%p9, %r1353, %r1347, %r1349, %r1351; mov.b32 %r1354, %f490; shfl.sync.bfly.b32 %r1355|%p146, %r1354, %r1350, %r1349, %r1351; mov.b32 %f546, %r1355; add.ftz.f32 %f148, %f490, %f546; mov.b32 %r1356, %f148; shfl.sync.bfly.b32 %r56|%p10, %r1356, %r1347, %r1349, %r1351; mov.b32 %r1357, %f499; shfl.sync.bfly.b32 %r1358|%p147, %r1357, %r1350, %r1349, %r1351; mov.b32 %f547, %r1358; add.ftz.f32 %f149, %f499, %f547; mov.b32 %r1359, %f149; shfl.sync.bfly.b32 %r57|%p11, %r1359, %r1347, %r1349, %r1351; mov.b32 %r1360, %f508; shfl.sync.bfly.b32 %r1361|%p148, %r1360, %r1350, %r1349, %r1351; mov.b32 %f548, %r1361; add.ftz.f32 %f150, %f508, %f548; mov.b32 %r1362, %f150; shfl.sync.bfly.b32 %r58|%p12, %r1362, %r1347, %r1349, %r1351; mov.b32 %r1363, %f517; shfl.sync.bfly.b32 %r1364|%p149, %r1363, %r1350, %r1349, %r1351; mov.b32 %f549, %r1364; add.ftz.f32 %f151, %f517, %f549; mov.b32 %r1365, %f151; shfl.sync.bfly.b32 %r59|%p13, %r1365, %r1347, %r1349, %r1351; mov.b32 %r1366, %f526; shfl.sync.bfly.b32 %r1367|%p150, %r1366, %r1350, %r1349, %r1351; mov.b32 %f550, %r1367; add.ftz.f32 %f152, %f526, %f550; mov.b32 %r1368, %f152; shfl.sync.bfly.b32 %r60|%p14, %r1368, %r1347, %r1349, %r1351; mov.b32 %r1369, %f535; shfl.sync.bfly.b32 %r1370|%p151, %r1369, %r1350, %r1349, %r1351; mov.b32 %f551, %r1370; add.ftz.f32 %f153, %f535, %f551; mov.b32 %r1371, %f153; shfl.sync.bfly.b32 %r61|%p15, %r1371, %r1347, %r1349, %r1351; mov.b32 %r1372, %f544; shfl.sync.bfly.b32 %r1373|%p152, %r1372, %r1350, %r1349, %r1351; mov.b32 %f552, %r1373; add.ftz.f32 %f154, %f544, %f552; mov.b32 %r1374, %f154; shfl.sync.bfly.b32 %r62|%p16, %r1374, %r1347, %r1349, %r1351; @%p172 bra $L__BB0_9; mov.b32 %f553, %r55; add.ftz.f32 %f554, %f147, %f553; st.shared.f32 [%r42], %f554; mov.b32 %f555, %r56; add.ftz.f32 %f556, %f148, %f555; st.shared.f32 [%r42+128], %f556; mov.b32 %f557, %r57; add.ftz.f32 %f558, %f149, %f557; st.shared.f32 [%r42+256], %f558; mov.b32 %f559, %r58; add.ftz.f32 %f560, %f150, %f559; st.shared.f32 [%r42+384], %f560; mov.b32 %f561, %r59; add.ftz.f32 %f562, %f151, %f561; st.shared.f32 [%r42+512], %f562; mov.b32 %f563, %r60; add.ftz.f32 %f564, %f152, %f563; st.shared.f32 [%r42+640], %f564; mov.b32 %f565, %r61; add.ftz.f32 %f566, %f153, %f565; st.shared.f32 [%r42+768], %f566; mov.b32 %f567, %r62; add.ftz.f32 %f568, %f154, %f567; st.shared.f32 [%r42+896], %f568; $L__BB0_9: bar.sync 0; @%p139 bra $L__BB0_11; ld.shared.v4.f32 {%f652, %f653, %f654, %f655}, [%r52+16384]; $L__BB0_11: bar.sync 0; @%p139 bra $L__BB0_13; add.ftz.f32 %f573, %f652, %f653; add.ftz.f32 %f574, %f654, %f655; add.ftz.f32 %f575, %f573, %f574; st.shared.f32 [%r53+16384], %f575; $L__BB0_13: bar.sync 0; add.s32 %r2479, %r1346, 16384; ld.shared.f32 %f163, [%r2479]; add.s32 %r2480, %r1346, 16384; ld.shared.f32 %f164, [%r2480+32]; add.s32 %r2481, %r1346, 16384; ld.shared.f32 %f165, [%r2481+64]; add.s32 %r2482, %r1346, 16384; ld.shared.f32 %f166, [%r2482+96]; add.s32 %r2483, %r1346, 16384; ld.shared.f32 %f167, [%r2483+128]; add.s32 %r2484, %r1346, 16384; ld.shared.f32 %f168, [%r2484+160]; add.s32 %r2485, %r1346, 16384; ld.shared.f32 %f169, [%r2485+192]; add.s32 %r2486, %r1346, 16384; ld.shared.f32 %f170, [%r2486+224]; bar.sync 0; setp.equ.ftz.f32 %p155, %f163, 0f00000000; mov.f32 %f657, 0f3F800000; mov.f32 %f656, %f657; @%p155 bra $L__BB0_15; rcp.approx.ftz.f32 %f656, %f163; $L__BB0_15: setp.equ.ftz.f32 %p156, %f164, 0f00000000; @%p156 bra $L__BB0_17; rcp.approx.ftz.f32 %f657, %f164; $L__BB0_17: setp.equ.ftz.f32 %p157, %f165, 0f00000000; mov.f32 %f659, 0f3F800000; mov.f32 %f658, %f659; @%p157 bra $L__BB0_19; rcp.approx.ftz.f32 %f658, %f165; $L__BB0_19: setp.equ.ftz.f32 %p158, %f166, 0f00000000; @%p158 bra $L__BB0_21; rcp.approx.ftz.f32 %f659, %f166; $L__BB0_21: setp.equ.ftz.f32 %p159, %f167, 0f00000000; mov.f32 %f661, 0f3F800000; mov.f32 %f660, %f661; @%p159 bra $L__BB0_23; rcp.approx.ftz.f32 %f660, %f167; $L__BB0_23: setp.equ.ftz.f32 %p160, %f168, 0f00000000; @%p160 bra $L__BB0_25; rcp.approx.ftz.f32 %f661, %f168; $L__BB0_25: setp.equ.ftz.f32 %p161, %f169, 0f00000000; mov.f32 %f663, 0f3F800000; mov.f32 %f662, %f663; @%p161 bra $L__BB0_27; rcp.approx.ftz.f32 %f662, %f169; $L__BB0_27: setp.equ.ftz.f32 %p162, %f170, 0f00000000; @%p162 bra $L__BB0_29; rcp.approx.ftz.f32 %f663, %f170; $L__BB0_29: mov.u32 %r2502, %tid.x; shr.s32 %r2501, %r2502, 31; shr.u32 %r2500, %r2501, 29; add.s32 %r2499, %r2502, %r2500; shr.s32 %r2498, %r2499, 3; mov.u32 %r2497, _ZN25fused_multihead_attention5smem_E; add.s32 %r2496, %r2497, 16384; shl.b32 %r2495, %r2502, 7; ld.param.u32 %r2494, [fmha_mhca_fp16_128_64_sm86_kernel_nl_param_0+200]; and.b32 %r2464, %r2495, 3584; add.s32 %r2467, %r2464, %r2496; and.b32 %r2469, %r1342, 1020; add.s32 %r2111, %r2467, %r2469; mul.ftz.f32 %f584, %f656, %f84; mul.ftz.f32 %f585, %f656, %f83; // begin inline asm cvt.rn.f16x2.f32 %r1375, %f584, %f585; // end inline asm mul.ftz.f32 %f586, %f657, %f92; mul.ftz.f32 %f587, %f657, %f91; // begin inline asm cvt.rn.f16x2.f32 %r1376, %f586, %f587; // end inline asm mul.ftz.f32 %f588, %f656, %f86; mul.ftz.f32 %f589, %f656, %f85; // begin inline asm cvt.rn.f16x2.f32 %r1377, %f588, %f589; // end inline asm mul.ftz.f32 %f590, %f657, %f94; mul.ftz.f32 %f591, %f657, %f93; // begin inline asm cvt.rn.f16x2.f32 %r1378, %f590, %f591; // end inline asm mul.ftz.f32 %f592, %f656, %f88; mul.ftz.f32 %f593, %f656, %f87; // begin inline asm cvt.rn.f16x2.f32 %r1379, %f592, %f593; // end inline asm mul.ftz.f32 %f594, %f657, %f96; mul.ftz.f32 %f595, %f657, %f95; // begin inline asm cvt.rn.f16x2.f32 %r1380, %f594, %f595; // end inline asm mul.ftz.f32 %f596, %f656, %f90; mul.ftz.f32 %f597, %f656, %f89; // begin inline asm cvt.rn.f16x2.f32 %r1381, %f596, %f597; // end inline asm mul.ftz.f32 %f598, %f657, %f98; mul.ftz.f32 %f599, %f657, %f97; // begin inline asm cvt.rn.f16x2.f32 %r1382, %f598, %f599; // end inline asm mul.ftz.f32 %f600, %f658, %f100; mul.ftz.f32 %f601, %f658, %f99; // begin inline asm cvt.rn.f16x2.f32 %r1383, %f600, %f601; // end inline asm mul.ftz.f32 %f602, %f659, %f108; mul.ftz.f32 %f603, %f659, %f107; // begin inline asm cvt.rn.f16x2.f32 %r1384, %f602, %f603; // end inline asm mul.ftz.f32 %f604, %f658, %f102; mul.ftz.f32 %f605, %f658, %f101; // begin inline asm cvt.rn.f16x2.f32 %r1385, %f604, %f605; // end inline asm mul.ftz.f32 %f606, %f659, %f110; mul.ftz.f32 %f607, %f659, %f109; // begin inline asm cvt.rn.f16x2.f32 %r1386, %f606, %f607; // end inline asm mul.ftz.f32 %f608, %f658, %f104; mul.ftz.f32 %f609, %f658, %f103; // begin inline asm cvt.rn.f16x2.f32 %r1387, %f608, %f609; // end inline asm mul.ftz.f32 %f610, %f659, %f112; mul.ftz.f32 %f611, %f659, %f111; // begin inline asm cvt.rn.f16x2.f32 %r1388, %f610, %f611; // end inline asm mul.ftz.f32 %f612, %f658, %f106; mul.ftz.f32 %f613, %f658, %f105; // begin inline asm cvt.rn.f16x2.f32 %r1389, %f612, %f613; // end inline asm mul.ftz.f32 %f614, %f659, %f114; mul.ftz.f32 %f615, %f659, %f113; // begin inline asm cvt.rn.f16x2.f32 %r1390, %f614, %f615; // end inline asm mul.ftz.f32 %f616, %f660, %f116; mul.ftz.f32 %f617, %f660, %f115; // begin inline asm cvt.rn.f16x2.f32 %r1391, %f616, %f617; // end inline asm mul.ftz.f32 %f618, %f661, %f124; mul.ftz.f32 %f619, %f661, %f123; // begin inline asm cvt.rn.f16x2.f32 %r1392, %f618, %f619; // end inline asm mul.ftz.f32 %f620, %f660, %f118; mul.ftz.f32 %f621, %f660, %f117; // begin inline asm cvt.rn.f16x2.f32 %r1393, %f620, %f621; // end inline asm mul.ftz.f32 %f622, %f661, %f126; mul.ftz.f32 %f623, %f661, %f125; // begin inline asm cvt.rn.f16x2.f32 %r1394, %f622, %f623; // end inline asm mul.ftz.f32 %f624, %f660, %f120; mul.ftz.f32 %f625, %f660, %f119; // begin inline asm cvt.rn.f16x2.f32 %r1395, %f624, %f625; // end inline asm mul.ftz.f32 %f626, %f661, %f128; mul.ftz.f32 %f627, %f661, %f127; // begin inline asm cvt.rn.f16x2.f32 %r1396, %f626, %f627; // end inline asm mul.ftz.f32 %f628, %f660, %f122; mul.ftz.f32 %f629, %f660, %f121; // begin inline asm cvt.rn.f16x2.f32 %r1397, %f628, %f629; // end inline asm mul.ftz.f32 %f630, %f661, %f130; mul.ftz.f32 %f631, %f661, %f129; // begin inline asm cvt.rn.f16x2.f32 %r1398, %f630, %f631; // end inline asm mul.ftz.f32 %f632, %f662, %f132; mul.ftz.f32 %f633, %f662, %f131; // begin inline asm cvt.rn.f16x2.f32 %r1399, %f632, %f633; // end inline asm mul.ftz.f32 %f634, %f663, %f140; mul.ftz.f32 %f635, %f663, %f139; // begin inline asm cvt.rn.f16x2.f32 %r1400, %f634, %f635; // end inline asm mul.ftz.f32 %f636, %f662, %f134; mul.ftz.f32 %f637, %f662, %f133; // begin inline asm cvt.rn.f16x2.f32 %r1401, %f636, %f637; // end inline asm mul.ftz.f32 %f638, %f663, %f142; mul.ftz.f32 %f639, %f663, %f141; // begin inline asm cvt.rn.f16x2.f32 %r1402, %f638, %f639; // end inline asm mul.ftz.f32 %f640, %f662, %f136; mul.ftz.f32 %f641, %f662, %f135; // begin inline asm cvt.rn.f16x2.f32 %r1403, %f640, %f641; // end inline asm mul.ftz.f32 %f642, %f663, %f144; mul.ftz.f32 %f643, %f663, %f143; // begin inline asm cvt.rn.f16x2.f32 %r1404, %f642, %f643; // end inline asm mul.ftz.f32 %f644, %f662, %f138; mul.ftz.f32 %f645, %f662, %f137; // begin inline asm cvt.rn.f16x2.f32 %r1405, %f644, %f645; // end inline asm mul.ftz.f32 %f646, %f663, %f146; mul.ftz.f32 %f647, %f663, %f145; // begin inline asm cvt.rn.f16x2.f32 %r1406, %f646, %f647; // end inline asm // begin inline asm mov.u32 %r1791, 0; // end inline asm // begin inline asm mov.u32 %r1792, 0; // end inline asm // begin inline asm mov.u32 %r1801, 0; // end inline asm // begin inline asm mov.u32 %r1802, 0; // end inline asm // begin inline asm mov.u32 %r1811, 0; // end inline asm // begin inline asm mov.u32 %r1812, 0; // end inline asm // begin inline asm mov.u32 %r1821, 0; // end inline asm // begin inline asm mov.u32 %r1822, 0; // end inline asm // begin inline asm mov.u32 %r1831, 0; // end inline asm // begin inline asm mov.u32 %r1832, 0; // end inline asm // begin inline asm mov.u32 %r1841, 0; // end inline asm // begin inline asm mov.u32 %r1842, 0; // end inline asm // begin inline asm mov.u32 %r1851, 0; // end inline asm // begin inline asm mov.u32 %r1852, 0; // end inline asm // begin inline asm mov.u32 %r1861, 0; // end inline asm // begin inline asm mov.u32 %r1862, 0; // end inline asm // begin inline asm mov.u32 %r1871, 0; // end inline asm // begin inline asm mov.u32 %r1872, 0; // end inline asm // begin inline asm mov.u32 %r1881, 0; // end inline asm // begin inline asm mov.u32 %r1882, 0; // end inline asm // begin inline asm mov.u32 %r1891, 0; // end inline asm // begin inline asm mov.u32 %r1892, 0; // end inline asm // begin inline asm mov.u32 %r1901, 0; // end inline asm // begin inline asm mov.u32 %r1902, 0; // end inline asm // begin inline asm mov.u32 %r1911, 0; // end inline asm // begin inline asm mov.u32 %r1912, 0; // end inline asm // begin inline asm mov.u32 %r1921, 0; // end inline asm // begin inline asm mov.u32 %r1922, 0; // end inline asm // begin inline asm mov.u32 %r1931, 0; // end inline asm // begin inline asm mov.u32 %r1932, 0; // end inline asm // begin inline asm mov.u32 %r1941, 0; // end inline asm // begin inline asm mov.u32 %r1942, 0; // end inline asm // begin inline asm mov.u32 %r1951, 0; // end inline asm // begin inline asm mov.u32 %r1952, 0; // end inline asm // begin inline asm mov.u32 %r1961, 0; // end inline asm // begin inline asm mov.u32 %r1962, 0; // end inline asm // begin inline asm mov.u32 %r1971, 0; // end inline asm // begin inline asm mov.u32 %r1972, 0; // end inline asm // begin inline asm mov.u32 %r1981, 0; // end inline asm // begin inline asm mov.u32 %r1982, 0; // end inline asm // begin inline asm mov.u32 %r1991, 0; // end inline asm // begin inline asm mov.u32 %r1992, 0; // end inline asm // begin inline asm mov.u32 %r2001, 0; // end inline asm // begin inline asm mov.u32 %r2002, 0; // end inline asm // begin inline asm mov.u32 %r2011, 0; // end inline asm // begin inline asm mov.u32 %r2012, 0; // end inline asm // begin inline asm mov.u32 %r2021, 0; // end inline asm // begin inline asm mov.u32 %r2022, 0; // end inline asm // begin inline asm mov.u32 %r2031, 0; // end inline asm // begin inline asm mov.u32 %r2032, 0; // end inline asm // begin inline asm mov.u32 %r2041, 0; // end inline asm // begin inline asm mov.u32 %r2042, 0; // end inline asm // begin inline asm mov.u32 %r2051, 0; // end inline asm // begin inline asm mov.u32 %r2052, 0; // end inline asm // begin inline asm mov.u32 %r2061, 0; // end inline asm // begin inline asm mov.u32 %r2062, 0; // end inline asm // begin inline asm mov.u32 %r2071, 0; // end inline asm // begin inline asm mov.u32 %r2072, 0; // end inline asm // begin inline asm mov.u32 %r2081, 0; // end inline asm // begin inline asm mov.u32 %r2082, 0; // end inline asm // begin inline asm mov.u32 %r2091, 0; // end inline asm // begin inline asm mov.u32 %r2092, 0; // end inline asm // begin inline asm mov.u32 %r2101, 0; // end inline asm // begin inline asm mov.u32 %r2102, 0; // end inline asm // begin inline asm cp.async.commit_group; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1791, %r1792}, {%r1375, %r1376, %r1377, %r1378}, {%r180, %r181}, {%r1791, %r1792}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1801, %r1802}, {%r1375, %r1376, %r1377, %r1378}, {%r182, %r183}, {%r1801, %r1802}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1811, %r1812}, {%r1375, %r1376, %r1377, %r1378}, {%r185, %r186}, {%r1811, %r1812}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1821, %r1822}, {%r1375, %r1376, %r1377, %r1378}, {%r187, %r188}, {%r1821, %r1822}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1831, %r1832}, {%r1375, %r1376, %r1377, %r1378}, {%r190, %r191}, {%r1831, %r1832}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1841, %r1842}, {%r1375, %r1376, %r1377, %r1378}, {%r192, %r193}, {%r1841, %r1842}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1851, %r1852}, {%r1375, %r1376, %r1377, %r1378}, {%r195, %r196}, {%r1851, %r1852}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1861, %r1862}, {%r1375, %r1376, %r1377, %r1378}, {%r197, %r198}, {%r1861, %r1862}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1871, %r1872}, {%r1383, %r1384, %r1385, %r1386}, {%r180, %r181}, {%r1871, %r1872}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1881, %r1882}, {%r1383, %r1384, %r1385, %r1386}, {%r182, %r183}, {%r1881, %r1882}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1891, %r1892}, {%r1383, %r1384, %r1385, %r1386}, {%r185, %r186}, {%r1891, %r1892}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1901, %r1902}, {%r1383, %r1384, %r1385, %r1386}, {%r187, %r188}, {%r1901, %r1902}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1911, %r1912}, {%r1383, %r1384, %r1385, %r1386}, {%r190, %r191}, {%r1911, %r1912}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1921, %r1922}, {%r1383, %r1384, %r1385, %r1386}, {%r192, %r193}, {%r1921, %r1922}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1931, %r1932}, {%r1383, %r1384, %r1385, %r1386}, {%r195, %r196}, {%r1931, %r1932}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1941, %r1942}, {%r1383, %r1384, %r1385, %r1386}, {%r197, %r198}, {%r1941, %r1942}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1951, %r1952}, {%r1391, %r1392, %r1393, %r1394}, {%r180, %r181}, {%r1951, %r1952}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1961, %r1962}, {%r1391, %r1392, %r1393, %r1394}, {%r182, %r183}, {%r1961, %r1962}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1971, %r1972}, {%r1391, %r1392, %r1393, %r1394}, {%r185, %r186}, {%r1971, %r1972}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1981, %r1982}, {%r1391, %r1392, %r1393, %r1394}, {%r187, %r188}, {%r1981, %r1982}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1991, %r1992}, {%r1391, %r1392, %r1393, %r1394}, {%r190, %r191}, {%r1991, %r1992}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r2001, %r2002}, {%r1391, %r1392, %r1393, %r1394}, {%r192, %r193}, {%r2001, %r2002}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r2011, %r2012}, {%r1391, %r1392, %r1393, %r1394}, {%r195, %r196}, {%r2011, %r2012}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r2021, %r2022}, {%r1391, %r1392, %r1393, %r1394}, {%r197, %r198}, {%r2021, %r2022}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r2031, %r2032}, {%r1399, %r1400, %r1401, %r1402}, {%r180, %r181}, {%r2031, %r2032}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r2041, %r2042}, {%r1399, %r1400, %r1401, %r1402}, {%r182, %r183}, {%r2041, %r2042}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r2051, %r2052}, {%r1399, %r1400, %r1401, %r1402}, {%r185, %r186}, {%r2051, %r2052}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r2061, %r2062}, {%r1399, %r1400, %r1401, %r1402}, {%r187, %r188}, {%r2061, %r2062}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r2071, %r2072}, {%r1399, %r1400, %r1401, %r1402}, {%r190, %r191}, {%r2071, %r2072}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r2081, %r2082}, {%r1399, %r1400, %r1401, %r1402}, {%r192, %r193}, {%r2081, %r2082}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r2091, %r2092}, {%r1399, %r1400, %r1401, %r1402}, {%r195, %r196}, {%r2091, %r2092}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r2101, %r2102}, {%r1399, %r1400, %r1401, %r1402}, {%r197, %r198}, {%r2101, %r2102}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1791, %r1792}, {%r1379, %r1380, %r1381, %r1382}, {%r200, %r201}, {%r1791, %r1792}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1801, %r1802}, {%r1379, %r1380, %r1381, %r1382}, {%r202, %r203}, {%r1801, %r1802}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1811, %r1812}, {%r1379, %r1380, %r1381, %r1382}, {%r205, %r206}, {%r1811, %r1812}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1821, %r1822}, {%r1379, %r1380, %r1381, %r1382}, {%r207, %r208}, {%r1821, %r1822}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1831, %r1832}, {%r1379, %r1380, %r1381, %r1382}, {%r210, %r211}, {%r1831, %r1832}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1841, %r1842}, {%r1379, %r1380, %r1381, %r1382}, {%r212, %r213}, {%r1841, %r1842}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1851, %r1852}, {%r1379, %r1380, %r1381, %r1382}, {%r215, %r216}, {%r1851, %r1852}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1861, %r1862}, {%r1379, %r1380, %r1381, %r1382}, {%r217, %r218}, {%r1861, %r1862}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1871, %r1872}, {%r1387, %r1388, %r1389, %r1390}, {%r200, %r201}, {%r1871, %r1872}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1881, %r1882}, {%r1387, %r1388, %r1389, %r1390}, {%r202, %r203}, {%r1881, %r1882}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1891, %r1892}, {%r1387, %r1388, %r1389, %r1390}, {%r205, %r206}, {%r1891, %r1892}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1901, %r1902}, {%r1387, %r1388, %r1389, %r1390}, {%r207, %r208}, {%r1901, %r1902}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1911, %r1912}, {%r1387, %r1388, %r1389, %r1390}, {%r210, %r211}, {%r1911, %r1912}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1921, %r1922}, {%r1387, %r1388, %r1389, %r1390}, {%r212, %r213}, {%r1921, %r1922}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1931, %r1932}, {%r1387, %r1388, %r1389, %r1390}, {%r215, %r216}, {%r1931, %r1932}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1941, %r1942}, {%r1387, %r1388, %r1389, %r1390}, {%r217, %r218}, {%r1941, %r1942}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1951, %r1952}, {%r1395, %r1396, %r1397, %r1398}, {%r200, %r201}, {%r1951, %r1952}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1961, %r1962}, {%r1395, %r1396, %r1397, %r1398}, {%r202, %r203}, {%r1961, %r1962}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1971, %r1972}, {%r1395, %r1396, %r1397, %r1398}, {%r205, %r206}, {%r1971, %r1972}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1981, %r1982}, {%r1395, %r1396, %r1397, %r1398}, {%r207, %r208}, {%r1981, %r1982}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1991, %r1992}, {%r1395, %r1396, %r1397, %r1398}, {%r210, %r211}, {%r1991, %r1992}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r2001, %r2002}, {%r1395, %r1396, %r1397, %r1398}, {%r212, %r213}, {%r2001, %r2002}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r2011, %r2012}, {%r1395, %r1396, %r1397, %r1398}, {%r215, %r216}, {%r2011, %r2012}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r2021, %r2022}, {%r1395, %r1396, %r1397, %r1398}, {%r217, %r218}, {%r2021, %r2022}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r2031, %r2032}, {%r1403, %r1404, %r1405, %r1406}, {%r200, %r201}, {%r2031, %r2032}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r2041, %r2042}, {%r1403, %r1404, %r1405, %r1406}, {%r202, %r203}, {%r2041, %r2042}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r2051, %r2052}, {%r1403, %r1404, %r1405, %r1406}, {%r205, %r206}, {%r2051, %r2052}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r2061, %r2062}, {%r1403, %r1404, %r1405, %r1406}, {%r207, %r208}, {%r2061, %r2062}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r2071, %r2072}, {%r1403, %r1404, %r1405, %r1406}, {%r210, %r211}, {%r2071, %r2072}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r2081, %r2082}, {%r1403, %r1404, %r1405, %r1406}, {%r212, %r213}, {%r2081, %r2082}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r2091, %r2092}, {%r1403, %r1404, %r1405, %r1406}, {%r215, %r216}, {%r2091, %r2092}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r2101, %r2102}, {%r1403, %r1404, %r1405, %r1406}, {%r217, %r218}, {%r2101, %r2102}; // end inline asm // begin inline asm st.shared.b32 [%r2111], %r1791; // end inline asm add.s32 %r2113, %r2111, 4096; // begin inline asm st.shared.b32 [%r2113], %r1792; // end inline asm add.s32 %r2115, %r2111, 8192; // begin inline asm st.shared.b32 [%r2115], %r1871; // end inline asm add.s32 %r2117, %r2111, 12288; // begin inline asm st.shared.b32 [%r2117], %r1872; // end inline asm add.s32 %r2119, %r2111, 16384; // begin inline asm st.shared.b32 [%r2119], %r1951; // end inline asm add.s32 %r2121, %r2111, 20480; // begin inline asm st.shared.b32 [%r2121], %r1952; // end inline asm add.s32 %r2123, %r2111, 24576; // begin inline asm st.shared.b32 [%r2123], %r2031; // end inline asm add.s32 %r2125, %r2111, 28672; // begin inline asm st.shared.b32 [%r2125], %r2032; // end inline asm xor.b32 %r2127, %r2111, 16; add.s32 %r2129, %r2127, 4096; // begin inline asm st.shared.b32 [%r2127], %r1801; // end inline asm // begin inline asm st.shared.b32 [%r2129], %r1802; // end inline asm add.s32 %r2131, %r2127, 8192; // begin inline asm st.shared.b32 [%r2131], %r1881; // end inline asm add.s32 %r2133, %r2127, 12288; // begin inline asm st.shared.b32 [%r2133], %r1882; // end inline asm add.s32 %r2135, %r2127, 16384; // begin inline asm st.shared.b32 [%r2135], %r1961; // end inline asm add.s32 %r2137, %r2127, 20480; // begin inline asm st.shared.b32 [%r2137], %r1962; // end inline asm add.s32 %r2139, %r2127, 24576; // begin inline asm st.shared.b32 [%r2139], %r2041; // end inline asm add.s32 %r2141, %r2127, 28672; // begin inline asm st.shared.b32 [%r2141], %r2042; // end inline asm xor.b32 %r2143, %r2111, 32; add.s32 %r2145, %r2143, 4096; // begin inline asm st.shared.b32 [%r2143], %r1811; // end inline asm // begin inline asm st.shared.b32 [%r2145], %r1812; // end inline asm add.s32 %r2147, %r2143, 8192; // begin inline asm st.shared.b32 [%r2147], %r1891; // end inline asm add.s32 %r2149, %r2143, 12288; // begin inline asm st.shared.b32 [%r2149], %r1892; // end inline asm add.s32 %r2151, %r2143, 16384; // begin inline asm st.shared.b32 [%r2151], %r1971; // end inline asm add.s32 %r2153, %r2143, 20480; // begin inline asm st.shared.b32 [%r2153], %r1972; // end inline asm add.s32 %r2155, %r2143, 24576; // begin inline asm st.shared.b32 [%r2155], %r2051; // end inline asm add.s32 %r2157, %r2143, 28672; // begin inline asm st.shared.b32 [%r2157], %r2052; // end inline asm xor.b32 %r2159, %r2111, 48; add.s32 %r2161, %r2159, 4096; // begin inline asm st.shared.b32 [%r2159], %r1821; // end inline asm // begin inline asm st.shared.b32 [%r2161], %r1822; // end inline asm add.s32 %r2163, %r2159, 8192; // begin inline asm st.shared.b32 [%r2163], %r1901; // end inline asm add.s32 %r2165, %r2159, 12288; // begin inline asm st.shared.b32 [%r2165], %r1902; // end inline asm add.s32 %r2167, %r2159, 16384; // begin inline asm st.shared.b32 [%r2167], %r1981; // end inline asm add.s32 %r2169, %r2159, 20480; // begin inline asm st.shared.b32 [%r2169], %r1982; // end inline asm add.s32 %r2171, %r2159, 24576; // begin inline asm st.shared.b32 [%r2171], %r2061; // end inline asm add.s32 %r2173, %r2159, 28672; // begin inline asm st.shared.b32 [%r2173], %r2062; // end inline asm xor.b32 %r2175, %r2111, 64; add.s32 %r2177, %r2175, 4096; // begin inline asm st.shared.b32 [%r2175], %r1831; // end inline asm // begin inline asm st.shared.b32 [%r2177], %r1832; // end inline asm add.s32 %r2179, %r2175, 8192; // begin inline asm st.shared.b32 [%r2179], %r1911; // end inline asm add.s32 %r2181, %r2175, 12288; // begin inline asm st.shared.b32 [%r2181], %r1912; // end inline asm add.s32 %r2183, %r2175, 16384; // begin inline asm st.shared.b32 [%r2183], %r1991; // end inline asm add.s32 %r2185, %r2175, 20480; // begin inline asm st.shared.b32 [%r2185], %r1992; // end inline asm add.s32 %r2187, %r2175, 24576; // begin inline asm st.shared.b32 [%r2187], %r2071; // end inline asm add.s32 %r2189, %r2175, 28672; // begin inline asm st.shared.b32 [%r2189], %r2072; // end inline asm xor.b32 %r2191, %r2111, 80; add.s32 %r2193, %r2191, 4096; // begin inline asm st.shared.b32 [%r2191], %r1841; // end inline asm // begin inline asm st.shared.b32 [%r2193], %r1842; // end inline asm add.s32 %r2195, %r2191, 8192; // begin inline asm st.shared.b32 [%r2195], %r1921; // end inline asm add.s32 %r2197, %r2191, 12288; // begin inline asm st.shared.b32 [%r2197], %r1922; // end inline asm add.s32 %r2199, %r2191, 16384; // begin inline asm st.shared.b32 [%r2199], %r2001; // end inline asm add.s32 %r2201, %r2191, 20480; // begin inline asm st.shared.b32 [%r2201], %r2002; // end inline asm add.s32 %r2203, %r2191, 24576; // begin inline asm st.shared.b32 [%r2203], %r2081; // end inline asm add.s32 %r2205, %r2191, 28672; // begin inline asm st.shared.b32 [%r2205], %r2082; // end inline asm xor.b32 %r2207, %r2111, 96; add.s32 %r2209, %r2207, 4096; // begin inline asm st.shared.b32 [%r2207], %r1851; // end inline asm // begin inline asm st.shared.b32 [%r2209], %r1852; // end inline asm add.s32 %r2211, %r2207, 8192; // begin inline asm st.shared.b32 [%r2211], %r1931; // end inline asm add.s32 %r2213, %r2207, 12288; // begin inline asm st.shared.b32 [%r2213], %r1932; // end inline asm add.s32 %r2215, %r2207, 16384; // begin inline asm st.shared.b32 [%r2215], %r2011; // end inline asm add.s32 %r2217, %r2207, 20480; // begin inline asm st.shared.b32 [%r2217], %r2012; // end inline asm add.s32 %r2219, %r2207, 24576; // begin inline asm st.shared.b32 [%r2219], %r2091; // end inline asm add.s32 %r2221, %r2207, 28672; // begin inline asm st.shared.b32 [%r2221], %r2092; // end inline asm xor.b32 %r2223, %r2111, 112; add.s32 %r2225, %r2223, 4096; // begin inline asm st.shared.b32 [%r2223], %r1861; // end inline asm // begin inline asm st.shared.b32 [%r2225], %r1862; // end inline asm add.s32 %r2227, %r2223, 8192; // begin inline asm st.shared.b32 [%r2227], %r1941; // end inline asm add.s32 %r2229, %r2223, 12288; // begin inline asm st.shared.b32 [%r2229], %r1942; // end inline asm add.s32 %r2231, %r2223, 16384; // begin inline asm st.shared.b32 [%r2231], %r2021; // end inline asm add.s32 %r2233, %r2223, 20480; // begin inline asm st.shared.b32 [%r2233], %r2022; // end inline asm add.s32 %r2235, %r2223, 24576; // begin inline asm st.shared.b32 [%r2235], %r2101; // end inline asm add.s32 %r2237, %r2223, 28672; // begin inline asm st.shared.b32 [%r2237], %r2102; // end inline asm bar.sync 0; shl.b32 %r2470, %r2498, 9; add.s32 %r2471, %r2470, %r2496; add.s32 %r2243, %r2471, %r6; // begin inline asm ld.shared.v4.b32 {%r2239, %r2240, %r2241, %r2242}, [%r2243]; // end inline asm add.s32 %r2248, %r2243, 128; // begin inline asm ld.shared.v4.b32 {%r2244, %r2245, %r2246, %r2247}, [%r2248]; // end inline asm add.s32 %r2253, %r2243, 256; // begin inline asm ld.shared.v4.b32 {%r2249, %r2250, %r2251, %r2252}, [%r2253]; // end inline asm add.s32 %r2258, %r2243, 384; // begin inline asm ld.shared.v4.b32 {%r2254, %r2255, %r2256, %r2257}, [%r2258]; // end inline asm // begin inline asm add.f16x2 %r2259, %r2239, %r2244; // end inline asm // begin inline asm add.f16x2 %r2262, %r2240, %r2245; // end inline asm // begin inline asm add.f16x2 %r2265, %r2241, %r2246; // end inline asm // begin inline asm add.f16x2 %r2268, %r2242, %r2247; // end inline asm // begin inline asm add.f16x2 %r2271, %r2259, %r2249; // end inline asm // begin inline asm add.f16x2 %r2274, %r2262, %r2250; // end inline asm // begin inline asm add.f16x2 %r2277, %r2265, %r2251; // end inline asm // begin inline asm add.f16x2 %r2280, %r2268, %r2252; // end inline asm // begin inline asm add.f16x2 %r2283, %r2271, %r2254; // end inline asm // begin inline asm add.f16x2 %r2286, %r2274, %r2255; // end inline asm // begin inline asm add.f16x2 %r2289, %r2277, %r2256; // end inline asm // begin inline asm add.f16x2 %r2292, %r2280, %r2257; // end inline asm add.s32 %r2299, %r2243, 8192; // begin inline asm ld.shared.v4.b32 {%r2295, %r2296, %r2297, %r2298}, [%r2299]; // end inline asm add.s32 %r2304, %r2243, 8320; // begin inline asm ld.shared.v4.b32 {%r2300, %r2301, %r2302, %r2303}, [%r2304]; // end inline asm add.s32 %r2309, %r2243, 8448; // begin inline asm ld.shared.v4.b32 {%r2305, %r2306, %r2307, %r2308}, [%r2309]; // end inline asm add.s32 %r2314, %r2243, 8576; // begin inline asm ld.shared.v4.b32 {%r2310, %r2311, %r2312, %r2313}, [%r2314]; // end inline asm // begin inline asm add.f16x2 %r2315, %r2295, %r2300; // end inline asm // begin inline asm add.f16x2 %r2318, %r2296, %r2301; // end inline asm // begin inline asm add.f16x2 %r2321, %r2297, %r2302; // end inline asm // begin inline asm add.f16x2 %r2324, %r2298, %r2303; // end inline asm // begin inline asm add.f16x2 %r2327, %r2315, %r2305; // end inline asm // begin inline asm add.f16x2 %r2330, %r2318, %r2306; // end inline asm // begin inline asm add.f16x2 %r2333, %r2321, %r2307; // end inline asm // begin inline asm add.f16x2 %r2336, %r2324, %r2308; // end inline asm // begin inline asm add.f16x2 %r2339, %r2327, %r2310; // end inline asm // begin inline asm add.f16x2 %r2342, %r2330, %r2311; // end inline asm // begin inline asm add.f16x2 %r2345, %r2333, %r2312; // end inline asm // begin inline asm add.f16x2 %r2348, %r2336, %r2313; // end inline asm add.s32 %r2355, %r2243, 16384; // begin inline asm ld.shared.v4.b32 {%r2351, %r2352, %r2353, %r2354}, [%r2355]; // end inline asm add.s32 %r2360, %r2243, 16512; // begin inline asm ld.shared.v4.b32 {%r2356, %r2357, %r2358, %r2359}, [%r2360]; // end inline asm add.s32 %r2365, %r2243, 16640; // begin inline asm ld.shared.v4.b32 {%r2361, %r2362, %r2363, %r2364}, [%r2365]; // end inline asm add.s32 %r2370, %r2243, 16768; // begin inline asm ld.shared.v4.b32 {%r2366, %r2367, %r2368, %r2369}, [%r2370]; // end inline asm // begin inline asm add.f16x2 %r2371, %r2351, %r2356; // end inline asm // begin inline asm add.f16x2 %r2374, %r2352, %r2357; // end inline asm // begin inline asm add.f16x2 %r2377, %r2353, %r2358; // end inline asm // begin inline asm add.f16x2 %r2380, %r2354, %r2359; // end inline asm // begin inline asm add.f16x2 %r2383, %r2371, %r2361; // end inline asm // begin inline asm add.f16x2 %r2386, %r2374, %r2362; // end inline asm // begin inline asm add.f16x2 %r2389, %r2377, %r2363; // end inline asm // begin inline asm add.f16x2 %r2392, %r2380, %r2364; // end inline asm // begin inline asm add.f16x2 %r2395, %r2383, %r2366; // end inline asm // begin inline asm add.f16x2 %r2398, %r2386, %r2367; // end inline asm // begin inline asm add.f16x2 %r2401, %r2389, %r2368; // end inline asm // begin inline asm add.f16x2 %r2404, %r2392, %r2369; // end inline asm add.s32 %r2411, %r2243, 24576; // begin inline asm ld.shared.v4.b32 {%r2407, %r2408, %r2409, %r2410}, [%r2411]; // end inline asm add.s32 %r2416, %r2243, 24704; // begin inline asm ld.shared.v4.b32 {%r2412, %r2413, %r2414, %r2415}, [%r2416]; // end inline asm add.s32 %r2421, %r2243, 24832; // begin inline asm ld.shared.v4.b32 {%r2417, %r2418, %r2419, %r2420}, [%r2421]; // end inline asm add.s32 %r2426, %r2243, 24960; // begin inline asm ld.shared.v4.b32 {%r2422, %r2423, %r2424, %r2425}, [%r2426]; // end inline asm // begin inline asm add.f16x2 %r2427, %r2407, %r2412; // end inline asm // begin inline asm add.f16x2 %r2430, %r2408, %r2413; // end inline asm // begin inline asm add.f16x2 %r2433, %r2409, %r2414; // end inline asm // begin inline asm add.f16x2 %r2436, %r2410, %r2415; // end inline asm // begin inline asm add.f16x2 %r2439, %r2427, %r2417; // end inline asm // begin inline asm add.f16x2 %r2442, %r2430, %r2418; // end inline asm // begin inline asm add.f16x2 %r2445, %r2433, %r2419; // end inline asm // begin inline asm add.f16x2 %r2448, %r2436, %r2420; // end inline asm // begin inline asm add.f16x2 %r2451, %r2439, %r2422; // end inline asm // begin inline asm add.f16x2 %r2454, %r2442, %r2423; // end inline asm // begin inline asm add.f16x2 %r2457, %r2445, %r2424; // end inline asm // begin inline asm add.f16x2 %r2460, %r2448, %r2425; // end inline asm cvt.s64.s32 %rd47, %r7; add.s64 %rd6, %rd47, %rd3; cvt.u32.u64 %r2472, %rd2; setp.ge.s32 %p163, %r2472, %r2494; @%p163 bra $L__BB0_40; mov.b64 %rd64, fmha_mhca_fp16_128_64_sm86_kernel_nl_param_0; mov.u64 %rd63, %rd64; ld.param.u32 %r2513, [%rd63+60]; mov.u32 %r2512, %tid.x; shr.s32 %r2511, %r2513, 31; shr.u32 %r2510, %r2511, 29; add.s32 %r2509, %r2513, %r2510; shr.s32 %r2508, %r2509, 3; shr.s32 %r2507, %r2512, 31; shr.u32 %r2506, %r2507, 29; add.s32 %r2505, %r2512, %r2506; and.b32 %r2504, %r2505, -8; sub.s32 %r2503, %r2512, %r2504; setp.ge.s32 %p164, %r2503, %r2508; @%p164 bra $L__BB0_32; mul.lo.s64 %rd48, %rd4, %rd2; add.s64 %rd49, %rd6, %rd48; add.s64 %rd50, %rd5, %rd49; st.global.v4.u32 [%rd50], {%r2283, %r2286, %r2289, %r2292}; $L__BB0_32: ld.param.u32 %r2514, [fmha_mhca_fp16_128_64_sm86_kernel_nl_param_0+200]; add.s32 %r2474, %r2472, 16; setp.ge.s32 %p165, %r2474, %r2514; @%p165 bra $L__BB0_40; @%p164 bra $L__BB0_35; add.s64 %rd51, %rd2, 16; mul.lo.s64 %rd52, %rd51, %rd4; add.s64 %rd53, %rd6, %rd52; add.s64 %rd54, %rd5, %rd53; st.global.v4.u32 [%rd54], {%r2339, %r2342, %r2345, %r2348}; $L__BB0_35: ld.param.u32 %r2515, [fmha_mhca_fp16_128_64_sm86_kernel_nl_param_0+200]; add.s32 %r2476, %r2472, 32; setp.ge.s32 %p167, %r2476, %r2515; @%p167 bra $L__BB0_40; @%p164 bra $L__BB0_38; add.s64 %rd55, %rd2, 32; mul.lo.s64 %rd56, %rd55, %rd4; add.s64 %rd57, %rd6, %rd56; add.s64 %rd58, %rd5, %rd57; st.global.v4.u32 [%rd58], {%r2395, %r2398, %r2401, %r2404}; $L__BB0_38: ld.param.u32 %r2516, [fmha_mhca_fp16_128_64_sm86_kernel_nl_param_0+200]; add.s32 %r2478, %r2472, 48; setp.ge.s32 %p169, %r2478, %r2516; or.pred %p171, %p169, %p164; @%p171 bra $L__BB0_40; add.s64 %rd59, %rd2, 48; mul.lo.s64 %rd60, %rd59, %rd4; add.s64 %rd61, %rd6, %rd60; add.s64 %rd62, %rd5, %rd61; st.global.v4.u32 [%rd62], {%r2451, %r2454, %r2457, %r2460}; $L__BB0_40: ret; }