_S_128_sm86_kernel_nl_param_0+52]; mov.u32 %r1071, %ctaid.z; shl.b32 %r3, %r1071, 7; setp.le.s32 %p1, %r1, %r3; @%p1 bra $L__BB0_99; mov.u32 %r1400, %tid.x; mov.u32 %r1401, %ctaid.y; mov.u32 %r1402, %ctaid.x; mul.lo.s32 %r1403, %r1, %r1401; mad.lo.s32 %r1404, %r1403, %r2, %r1402; shr.s32 %r1405, %r1400, 31; shr.u32 %r1406, %r1405, 27; add.s32 %r1407, %r1400, %r1406; and.b32 %r1408, %r1407, -32; sub.s32 %r1409, %r1400, %r1408; shr.u32 %r1410, %r1405, 25; add.s32 %r1411, %r1400, %r1410; shr.s32 %r1412, %r1411, 7; shl.b32 %r1413, %r1412, 4; shr.s32 %r1414, %r1409, 31; shr.u32 %r1415, %r1414, 30; add.s32 %r1416, %r1409, %r1415; and.b32 %r1417, %r1416, 2147483644; sub.s32 %r1418, %r1409, %r1417; shl.b32 %r1419, %r1418, 1; add.s32 %r2820, %r1419, %r1413; ld.param.u64 %rd3, [%rd1+16]; ld.param.u64 %rd4, [%rd1+40]; shr.s32 %r1420, %r1407, 5; shr.s32 %r1421, %r1407, 31; shr.u32 %r1422, %r1421, 30; add.s32 %r1423, %r1420, %r1422; and.b32 %r1424, %r1423, 268435452; sub.s32 %r1425, %r1420, %r1424; shl.b32 %r1426, %r1425, 4; shr.s32 %r1427, %r1416, 2; add.s32 %r5, %r1426, %r1427; shr.u32 %r1428, %r1405, 28; add.s32 %r1429, %r1400, %r1428; shr.s32 %r6, %r1429, 4; add.s32 %r1430, %r6, %r3; cvt.s64.s32 %rd5, %r1430; ld.param.u64 %rd6, [%rd1+24]; mul.lo.s64 %rd55, %rd6, %rd5; mul.lo.s32 %r1431, %r1404, 3; mul.wide.s32 %rd56, %r1431, 256; and.b32 %r1432, %r1429, -16; sub.s32 %r7, %r1400, %r1432; shl.b32 %r1433, %r7, 4; cvt.s64.s32 %rd57, %r1433; add.s64 %rd58, %rd56, %rd57; add.s64 %rd59, %rd58, %rd55; ld.param.u64 %rd60, [%rd1]; add.s64 %rd35, %rd60, %rd59; shr.s32 %r1434, %r1429, 31; shr.u32 %r1435, %r1434, 29; add.s32 %r1436, %r6, %r1435; and.b32 %r1437, %r1436, 268435448; sub.s32 %r1438, %r6, %r1437; xor.b32 %r1439, %r1438, %r7; shl.b32 %r1440, %r6, 8; shl.b32 %r1441, %r1439, 4; mov.u32 %r1442, 31; mov.u32 %r2819, 0; mov.u32 %r1444, -1; shfl.sync.idx.b32 %r1445|%p2, %r2819, %r2819, %r1442, %r1444; shfl.sync.idx.b32 %r1446|%p3, %r2819, %r2819, %r1442, %r1444; and.b32 %r1447, %r1400, 96; shr.u32 %r1448, %r1447, 1; and.b32 %r1449, %r1400, 15; or.b32 %r1450, %r1448, %r1449; and.b32 %r1451, %r1400, 7; shl.b32 %r1452, %r1400, 4; and.b32 %r1453, %r1452, 112; and.b32 %r1454, %r1400, 16; xor.b32 %r1455, %r1453, %r1454; cvt.s64.s32 %rd61, %r6; mul.lo.s64 %rd62, %rd6, %rd61; add.s32 %r1456, %r1431, 1; mul.wide.s32 %rd63, %r1456, 256; add.s64 %rd64, %rd63, %rd57; add.s64 %rd65, %rd64, %rd62; add.s64 %rd238, %rd60, %rd65; shfl.sync.idx.b32 %r1457|%p4, %r2819, %r2819, %r1442, %r1444; shfl.sync.idx.b32 %r1458|%p5, %r2819, %r2819, %r1442, %r1444; shr.u32 %r1459, %r1454, 1; or.b32 %r1460, %r1459, %r1451; and.b32 %r1461, %r1400, 8; shr.u32 %r1462, %r1461, 3; xor.b32 %r1463, %r1462, %r1451; add.s32 %r1464, %r1431, 2; mul.wide.s32 %rd66, %r1464, 256; add.s64 %rd67, %rd66, %rd57; add.s64 %rd68, %rd67, %rd62; add.s64 %rd237, %rd60, %rd68; shfl.sync.idx.b32 %r1465|%p6, %r2819, %r2819, %r1442, %r1444; shfl.sync.idx.b32 %r1466|%p7, %r2819, %r2819, %r1442, %r1444; ld.param.u64 %rd10, [%rd1+32]; ld.param.u64 %rd11, [%rd1+8]; ld.param.u32 %r8, [%rd1+60]; sub.s32 %r1467, %r1, %r3; min.s32 %r1468, %r1467, 128; shr.s32 %r1469, %r8, 31; shr.u32 %r1470, %r1469, 29; add.s32 %r1471, %r8, %r1470; shr.s32 %r9, %r1471, 3; shl.b32 %r1472, %r1400, 8; and.b32 %r1473, %r1472, 3840; shl.b32 %r1474, %r1463, 4; shl.b32 %r1475, %r1460, 8; shl.b32 %r1476, %r1450, 8; setp.lt.s32 %p8, %r6, %r1468; add.s32 %r1477, %r6, 8; setp.lt.s32 %p9, %r1477, %r1468; add.s32 %r1478, %r6, 16; setp.lt.s32 %p10, %r1478, %r1468; add.s32 %r1479, %r6, 24; setp.lt.s32 %p11, %r1479, %r1468; add.s32 %r1480, %r6, 32; setp.lt.s32 %p12, %r1480, %r1468; add.s32 %r1481, %r6, 40; setp.lt.s32 %p13, %r1481, %r1468; add.s32 %r1482, %r6, 48; setp.lt.s32 %p14, %r1482, %r1468; add.s32 %r1483, %r6, 56; setp.lt.s32 %p15, %r1483, %r1468; add.s32 %r1484, %r6, 64; setp.lt.s32 %p16, %r1484, %r1468; add.s32 %r1485, %r6, 72; setp.lt.s32 %p17, %r1485, %r1468; add.s32 %r1486, %r6, 80; setp.lt.s32 %p18, %r1486, %r1468; add.s32 %r1487, %r6, 88; setp.lt.s32 %p19, %r1487, %r1468; add.s32 %r1488, %r6, 96; setp.lt.s32 %p20, %r1488, %r1468; add.s32 %r1489, %r6, 104; setp.lt.s32 %p21, %r1489, %r1468; add.s32 %r1490, %r6, 112; setp.lt.s32 %p22, %r1490, %r1468; add.s32 %r1491, %r6, 120; setp.lt.s32 %p23, %r1491, %r1468; add.s32 %r1492, %r1441, %r1440; or.b32 %r1493, %r1476, %r1455; or.b32 %r1494, %r1475, %r1474; or.b32 %r1495, %r1455, %r1473; mov.u32 %r1496, _ZN25fused_multihead_attention5smem_E; add.s32 %r1497, %r1496, 32768; add.s32 %r10, %r1492, %r1497; shl.b64 %rd69, %rd6, 3; selp.b32 %r1083, 16, 0, %p13; selp.b32 %r1085, 16, 0, %p14; selp.b32 %r1087, 16, 0, %p15; selp.b32 %r1089, 16, 0, %p16; selp.b32 %r1091, 16, 0, %p17; selp.b32 %r1093, 16, 0, %p18; selp.b32 %r1095, 16, 0, %p19; selp.b32 %r1097, 16, 0, %p20; selp.b32 %r1099, 16, 0, %p21; add.s32 %r1498, %r1492, %r1496; add.s32 %r1072, %r1498, %r1446; add.s32 %r1074, %r1072, 2048; add.s32 %r1076, %r1072, 4096; add.s32 %r1078, %r1072, 6144; add.s32 %r1080, %r1072, 8192; add.s32 %r1082, %r1072, 10240; add.s32 %r1084, %r1072, 12288; add.s32 %r1086, %r1072, 14336; add.s32 %r1088, %r1072, 16384; add.s32 %r1090, %r1072, 18432; add.s32 %r1092, %r1072, 20480; add.s32 %r1094, %r1072, 22528; add.s32 %r1096, %r1072, 24576; add.s32 %r1098, %r1072, 26624; add.s32 %r1100, %r1072, 28672; add.s32 %r1102, %r1072, 30720; selp.b32 %r1073, 16, 0, %p8; // begin inline asm cp.async.cg.shared.global [%r1072], [%rd35], 16, %r1073; // end inline asm selp.b32 %r1075, 16, 0, %p9; add.s64 %rd36, %rd35, %rd69; // begin inline asm cp.async.cg.shared.global [%r1074], [%rd36], 16, %r1075; // end inline asm selp.b32 %r1077, 16, 0, %p10; add.s64 %rd37, %rd36, %rd69; // begin inline asm cp.async.cg.shared.global [%r1076], [%rd37], 16, %r1077; // end inline asm selp.b32 %r1079, 16, 0, %p11; add.s64 %rd38, %rd37, %rd69; // begin inline asm cp.async.cg.shared.global [%r1078], [%rd38], 16, %r1079; // end inline asm selp.b32 %r1081, 16, 0, %p12; add.s64 %rd39, %rd38, %rd69; // begin inline asm cp.async.cg.shared.global [%r1080], [%rd39], 16, %r1081; // end inline asm add.s64 %rd40, %rd39, %rd69; // begin inline asm cp.async.cg.shared.global [%r1082], [%rd40], 16, %r1083; // end inline asm add.s64 %rd41, %rd40, %rd69; // begin inline asm cp.async.cg.shared.global [%r1084], [%rd41], 16, %r1085; // end inline asm add.s64 %rd42, %rd41, %rd69; // begin inline asm cp.async.cg.shared.global [%r1086], [%rd42], 16, %r1087; // end inline asm add.s64 %rd43, %rd42, %rd69; // begin inline asm cp.async.cg.shared.global [%r1088], [%rd43], 16, %r1089; // end inline asm add.s64 %rd44, %rd43, %rd69; // begin inline asm cp.async.cg.shared.global [%r1090], [%rd44], 16, %r1091; // end inline asm add.s64 %rd45, %rd44, %rd69; // begin inline asm cp.async.cg.shared.global [%r1092], [%rd45], 16, %r1093; // end inline asm add.s64 %rd46, %rd45, %rd69; // begin inline asm cp.async.cg.shared.global [%r1094], [%rd46], 16, %r1095; // end inline asm add.s64 %rd47, %rd46, %rd69; // begin inline asm cp.async.cg.shared.global [%r1096], [%rd47], 16, %r1097; // end inline asm add.s64 %rd48, %rd47, %rd69; // begin inline asm cp.async.cg.shared.global [%r1098], [%rd48], 16, %r1099; // end inline asm selp.b32 %r1101, 16, 0, %p22; add.s64 %rd49, %rd48, %rd69; // begin inline asm cp.async.cg.shared.global [%r1100], [%rd49], 16, %r1101; // end inline asm selp.b32 %r1103, 16, 0, %p23; add.s64 %rd50, %rd49, %rd69; // begin inline asm cp.async.cg.shared.global [%r1102], [%rd50], 16, %r1103; // end inline asm min.s32 %r1499, %r1, 16; setp.lt.s32 %p24, %r6, %r1499; setp.lt.s32 %p25, %r1477, %r1499; add.s64 %rd52, %rd238, %rd69; add.s32 %r1104, %r10, %r1458; add.s32 %r1106, %r1104, 2048; selp.b32 %r1109, 16, 0, %p24; // begin inline asm cp.async.cg.shared.global [%r1104], [%rd238], 16, %r1109; // end inline asm selp.b32 %r1111, 16, 0, %p25; // begin inline asm cp.async.cg.shared.global [%r1106], [%rd52], 16, %r1111; // end inline asm add.s64 %rd54, %rd237, %rd69; add.s32 %r1500, %r1496, 36864; add.s32 %r1501, %r1492, %r1500; add.s32 %r1108, %r1501, %r1466; add.s32 %r1110, %r1108, 2048; // begin inline asm cp.async.cg.shared.global [%r1108], [%rd237], 16, %r1109; // end inline asm // begin inline asm cp.async.cg.shared.global [%r1110], [%rd54], 16, %r1111; // end inline asm // begin inline asm cp.async.commit_group; // end inline asm // begin inline asm cp.async.wait_group 0; // end inline asm bar.sync 0; add.s32 %r1502, %r1445, %r1496; add.s32 %r1116, %r1502, %r1493; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1112, %r1113, %r1114, %r1115}, [%r1116]; // end inline asm add.s32 %r1121, %r1116, 16384; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1117, %r1118, %r1119, %r1120}, [%r1121]; // end inline asm xor.b32 %r1503, %r1493, 32; add.s32 %r1126, %r1502, %r1503; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1122, %r1123, %r1124, %r1125}, [%r1126]; // end inline asm add.s32 %r1131, %r1126, 16384; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1127, %r1128, %r1129, %r1130}, [%r1131]; // end inline asm xor.b32 %r1504, %r1493, 64; add.s32 %r1136, %r1502, %r1504; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1132, %r1133, %r1134, %r1135}, [%r1136]; // end inline asm add.s32 %r1141, %r1136, 16384; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1137, %r1138, %r1139, %r1140}, [%r1141]; // end inline asm xor.b32 %r1505, %r1493, 96; add.s32 %r1146, %r1502, %r1505; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1142, %r1143, %r1144, %r1145}, [%r1146]; // end inline asm add.s32 %r1151, %r1146, 16384; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1147, %r1148, %r1149, %r1150}, [%r1151]; // end inline asm or.b32 %r1506, %r1493, 128; add.s32 %r1156, %r1502, %r1506; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1152, %r1153, %r1154, %r1155}, [%r1156]; // end inline asm add.s32 %r1161, %r1156, 16384; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1157, %r1158, %r1159, %r1160}, [%r1161]; // end inline asm xor.b32 %r1507, %r1493, 160; add.s32 %r1166, %r1502, %r1507; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1162, %r1163, %r1164, %r1165}, [%r1166]; // end inline asm add.s32 %r1171, %r1166, 16384; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1167, %r1168, %r1169, %r1170}, [%r1171]; // end inline asm xor.b32 %r1508, %r1493, 192; add.s32 %r1176, %r1502, %r1508; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1172, %r1173, %r1174, %r1175}, [%r1176]; // end inline asm add.s32 %r1181, %r1176, 16384; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1177, %r1178, %r1179, %r1180}, [%r1181]; // end inline asm xor.b32 %r1509, %r1493, 224; add.s32 %r1186, %r1502, %r1509; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1182, %r1183, %r1184, %r1185}, [%r1186]; // end inline asm add.s32 %r1191, %r1186, 16384; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1187, %r1188, %r1189, %r1190}, [%r1191]; // end inline asm add.s32 %r79, %r1457, %r1497; add.s32 %r1196, %r79, %r1494; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2854, %r2853, %r2852, %r2851}, [%r1196]; // end inline asm xor.b32 %r1510, %r1494, 32; add.s32 %r1201, %r79, %r1510; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2850, %r2849, %r2848, %r2847}, [%r1201]; // end inline asm xor.b32 %r1511, %r1494, 64; add.s32 %r1206, %r79, %r1511; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2846, %r2845, %r2844, %r2843}, [%r1206]; // end inline asm xor.b32 %r1512, %r1494, 96; add.s32 %r1211, %r79, %r1512; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2842, %r2841, %r2840, %r2839}, [%r1211]; // end inline asm or.b32 %r1513, %r1494, 128; add.s32 %r1216, %r79, %r1513; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2838, %r2837, %r2836, %r2835}, [%r1216]; // end inline asm xor.b32 %r1514, %r1494, 160; add.s32 %r1221, %r79, %r1514; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2834, %r2833, %r2832, %r2831}, [%r1221]; // end inline asm xor.b32 %r1515, %r1494, 192; add.s32 %r1226, %r79, %r1515; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2830, %r2829, %r2828, %r2827}, [%r1226]; // end inline asm xor.b32 %r1516, %r1494, 224; add.s32 %r1231, %r79, %r1516; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2826, %r2825, %r2824, %r2823}, [%r1231]; // end inline asm add.s32 %r1236, %r1495, %r1500; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2886, %r2885, %r2884, %r2883}, [%r1236]; // end inline asm xor.b32 %r1517, %r1495, 32; add.s32 %r1241, %r1517, %r1500; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2882, %r2881, %r2880, %r2879}, [%r1241]; // end inline asm xor.b32 %r1518, %r1495, 64; add.s32 %r1246, %r1518, %r1500; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2878, %r2877, %r2876, %r2875}, [%r1246]; // end inline asm xor.b32 %r1519, %r1495, 96; add.s32 %r1251, %r1519, %r1500; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2874, %r2873, %r2872, %r2871}, [%r1251]; // end inline asm or.b32 %r1520, %r1495, 128; add.s32 %r1256, %r1520, %r1500; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2870, %r2869, %r2868, %r2867}, [%r1256]; // end inline asm xor.b32 %r1521, %r1495, 160; add.s32 %r1261, %r1521, %r1500; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2866, %r2865, %r2864, %r2863}, [%r1261]; // end inline asm xor.b32 %r1522, %r1495, 192; add.s32 %r1266, %r1522, %r1500; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2862, %r2861, %r2860, %r2859}, [%r1266]; // end inline asm xor.b32 %r1523, %r1495, 224; add.s32 %r1271, %r1523, %r1500; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2858, %r2857, %r2856, %r2855}, [%r1271]; // end inline asm ld.param.f32 %f1, [%rd1+64]; // begin inline asm mov.u32 %r3014, 0; // end inline asm // begin inline asm mov.u32 %r3013, 0; // end inline asm // begin inline asm mov.u32 %r3012, 0; // end inline asm // begin inline asm mov.u32 %r3011, 0; // end inline asm // begin inline asm mov.u32 %r3010, 0; // end inline asm // begin inline asm mov.u32 %r3009, 0; // end inline asm // begin inline asm mov.u32 %r3008, 0; // end inline asm // begin inline asm mov.u32 %r3007, 0; // end inline asm // begin inline asm mov.u32 %r3006, 0; // end inline asm // begin inline asm mov.u32 %r3005, 0; // end inline asm // begin inline asm mov.u32 %r3004, 0; // end inline asm // begin inline asm mov.u32 %r3003, 0; // end inline asm // begin inline asm mov.u32 %r3002, 0; // end inline asm // begin inline asm mov.u32 %r3001, 0; // end inline asm // begin inline asm mov.u32 %r3000, 0; // end inline asm // begin inline asm mov.u32 %r2999, 0; // end inline asm // begin inline asm mov.u32 %r2998, 0; // end inline asm // begin inline asm mov.u32 %r2997, 0; // end inline asm // begin inline asm mov.u32 %r2996, 0; // end inline asm // begin inline asm mov.u32 %r2995, 0; // end inline asm // begin inline asm mov.u32 %r2994, 0; // end inline asm // begin inline asm mov.u32 %r2993, 0; // end inline asm // begin inline asm mov.u32 %r2992, 0; // end inline asm // begin inline asm mov.u32 %r2991, 0; // end inline asm // begin inline asm mov.u32 %r2990, 0; // end inline asm // begin inline asm mov.u32 %r2989, 0; // end inline asm // begin inline asm mov.u32 %r2988, 0; // end inline asm // begin inline asm mov.u32 %r2987, 0; // end inline asm // begin inline asm mov.u32 %r2986, 0; // end inline asm // begin inline asm mov.u32 %r2985, 0; // end inline asm // begin inline asm mov.u32 %r2984, 0; // end inline asm // begin inline asm mov.u32 %r2983, 0; // end inline asm // begin inline asm mov.u32 %r2982, 0; // end inline asm // begin inline asm mov.u32 %r2981, 0; // end inline asm // begin inline asm mov.u32 %r2980, 0; // end inline asm // begin inline asm mov.u32 %r2979, 0; // end inline asm // begin inline asm mov.u32 %r2978, 0; // end inline asm // begin inline asm mov.u32 %r2977, 0; // end inline asm // begin inline asm mov.u32 %r2976, 0; // end inline asm // begin inline asm mov.u32 %r2975, 0; // end inline asm // begin inline asm mov.u32 %r2974, 0; // end inline asm // begin inline asm mov.u32 %r2973, 0; // end inline asm // begin inline asm mov.u32 %r2972, 0; // end inline asm // begin inline asm mov.u32 %r2971, 0; // end inline asm // begin inline asm mov.u32 %r2970, 0; // end inline asm // begin inline asm mov.u32 %r2969, 0; // end inline asm // begin inline asm mov.u32 %r2968, 0; // end inline asm // begin inline asm mov.u32 %r2967, 0; // end inline asm // begin inline asm mov.u32 %r2966, 0; // end inline asm // begin inline asm mov.u32 %r2965, 0; // end inline asm // begin inline asm mov.u32 %r2964, 0; // end inline asm // begin inline asm mov.u32 %r2963, 0; // end inline asm // begin inline asm mov.u32 %r2962, 0; // end inline asm // begin inline asm mov.u32 %r2961, 0; // end inline asm // begin inline asm mov.u32 %r2960, 0; // end inline asm // begin inline asm mov.u32 %r2959, 0; // end inline asm // begin inline asm mov.u32 %r2958, 0; // end inline asm // begin inline asm mov.u32 %r2957, 0; // end inline asm // begin inline asm mov.u32 %r2956, 0; // end inline asm // begin inline asm mov.u32 %r2955, 0; // end inline asm // begin inline asm mov.u32 %r2954, 0; // end inline asm // begin inline asm mov.u32 %r2953, 0; // end inline asm // begin inline asm mov.u32 %r2952, 0; // end inline asm // begin inline asm mov.u32 %r2951, 0; // end inline asm // begin inline asm mov.u32 %r2950, 0; // end inline asm // begin inline asm mov.u32 %r2949, 0; // end inline asm // begin inline asm mov.u32 %r2948, 0; // end inline asm // begin inline asm mov.u32 %r2947, 0; // end inline asm // begin inline asm mov.u32 %r2946, 0; // end inline asm // begin inline asm mov.u32 %r2945, 0; // end inline asm // begin inline asm mov.u32 %r2944, 0; // end inline asm // begin inline asm mov.u32 %r2943, 0; // end inline asm // begin inline asm mov.u32 %r2942, 0; // end inline asm // begin inline asm mov.u32 %r2941, 0; // end inline asm // begin inline asm mov.u32 %r2940, 0; // end inline asm // begin inline asm mov.u32 %r2939, 0; // end inline asm // begin inline asm mov.u32 %r2938, 0; // end inline asm // begin inline asm mov.u32 %r2937, 0; // end inline asm // begin inline asm mov.u32 %r2936, 0; // end inline asm // begin inline asm mov.u32 %r2935, 0; // end inline asm // begin inline asm mov.u32 %r2934, 0; // end inline asm // begin inline asm mov.u32 %r2933, 0; // end inline asm // begin inline asm mov.u32 %r2932, 0; // end inline asm // begin inline asm mov.u32 %r2931, 0; // end inline asm // begin inline asm mov.u32 %r2930, 0; // end inline asm // begin inline asm mov.u32 %r2929, 0; // end inline asm // begin inline asm mov.u32 %r2928, 0; // end inline asm // begin inline asm mov.u32 %r2927, 0; // end inline asm // begin inline asm mov.u32 %r2926, 0; // end inline asm // begin inline asm mov.u32 %r2925, 0; // end inline asm // begin inline asm mov.u32 %r2924, 0; // end inline asm // begin inline asm mov.u32 %r2923, 0; // end inline asm // begin inline asm mov.u32 %r2922, 0; // end inline asm // begin inline asm mov.u32 %r2921, 0; // end inline asm // begin inline asm mov.u32 %r2920, 0; // end inline asm // begin inline asm mov.u32 %r2919, 0; // end inline asm // begin inline asm mov.u32 %r2918, 0; // end inline asm // begin inline asm mov.u32 %r2917, 0; // end inline asm // begin inline asm mov.u32 %r2916, 0; // end inline asm // begin inline asm mov.u32 %r2915, 0; // end inline asm // begin inline asm mov.u32 %r2914, 0; // end inline asm // begin inline asm mov.u32 %r2913, 0; // end inline asm // begin inline asm mov.u32 %r2912, 0; // end inline asm // begin inline asm mov.u32 %r2911, 0; // end inline asm // begin inline asm mov.u32 %r2910, 0; // end inline asm // begin inline asm mov.u32 %r2909, 0; // end inline asm // begin inline asm mov.u32 %r2908, 0; // end inline asm // begin inline asm mov.u32 %r2907, 0; // end inline asm // begin inline asm mov.u32 %r2906, 0; // end inline asm // begin inline asm mov.u32 %r2905, 0; // end inline asm // begin inline asm mov.u32 %r2904, 0; // end inline asm // begin inline asm mov.u32 %r2903, 0; // end inline asm // begin inline asm mov.u32 %r2902, 0; // end inline asm // begin inline asm mov.u32 %r2901, 0; // end inline asm // begin inline asm mov.u32 %r2900, 0; // end inline asm // begin inline asm mov.u32 %r2899, 0; // end inline asm // begin inline asm mov.u32 %r2898, 0; // end inline asm // begin inline asm mov.u32 %r2897, 0; // end inline asm // begin inline asm mov.u32 %r2896, 0; // end inline asm // begin inline asm mov.u32 %r2895, 0; // end inline asm // begin inline asm mov.u32 %r2894, 0; // end inline asm // begin inline asm mov.u32 %r2893, 0; // end inline asm // begin inline asm mov.u32 %r2892, 0; // end inline asm // begin inline asm mov.u32 %r2891, 0; // end inline asm // begin inline asm mov.u32 %r2890, 0; // end inline asm // begin inline asm mov.u32 %r2889, 0; // end inline asm // begin inline asm mov.u32 %r2888, 0; // end inline asm // begin inline asm mov.u32 %r2887, 0; // end inline asm add.s32 %r1524, %r1, 15; shr.s32 %r1525, %r1524, 31; shr.u32 %r1526, %r1525, 28; add.s32 %r1527, %r1524, %r1526; and.b32 %r272, %r1527, -16; setp.lt.s32 %p26, %r1, 1; @%p26 bra $L__BB0_53; ld.param.u8 %rs1, [%rd1+160]; cvt.s64.s32 %rd12, %r5; cvt.s64.s32 %rd13, %r2820; add.s32 %r1529, %r2820, 1; cvt.s64.s32 %rd14, %r1529; add.s32 %r1530, %r2820, 8; cvt.s64.s32 %rd15, %r1530; add.s32 %r1531, %r2820, 9; cvt.s64.s32 %rd16, %r1531; add.s32 %r401, %r5, 8; add.s32 %r402, %r5, 64; add.s32 %r403, %r5, 72; mov.f32 %f1418, 0fFF800000; mov.f32 %f1414, 0f00000000; mov.u32 %r2821, %r1; mov.u32 %r2822, %r1; mov.f32 %f1415, %f1414; mov.f32 %f1416, %f1414; mov.f32 %f1417, %f1414; mov.f32 %f1419, %f1418; mov.f32 %f1420, %f1418; mov.f32 %f1421, %f1418; $L__BB0_3: add.s32 %r1532, %r2819, 16; setp.ge.s32 %p27, %r1532, %r272; @%p27 bra $L__BB0_5; bar.sync 0; shl.b64 %rd74, %rd6, 4; add.s64 %rd70, %rd238, %rd74; add.s32 %r2822, %r2822, -16; min.s32 %r1541, %r2822, 16; setp.lt.s32 %p28, %r6, %r1541; setp.lt.s32 %p29, %r1477, %r1541; mul.lo.s64 %rd75, %rd6, 24; add.s64 %rd71, %rd238, %rd75; selp.b32 %r1534, 16, 0, %p28; // begin inline asm cp.async.cg.shared.global [%r1104], [%rd70], 16, %r1534; // end inline asm selp.b32 %r1536, 16, 0, %p29; // begin inline asm cp.async.cg.shared.global [%r1106], [%rd71], 16, %r1536; // end inline asm add.s64 %rd72, %rd237, %rd74; add.s32 %r2821, %r2821, -16; min.s32 %r1543, %r2821, 16; setp.lt.s32 %p30, %r6, %r1543; setp.lt.s32 %p31, %r1477, %r1543; add.s64 %rd73, %rd237, %rd75; selp.b32 %r1538, 16, 0, %p30; // begin inline asm cp.async.cg.shared.global [%r1108], [%rd72], 16, %r1538; // end inline asm selp.b32 %r1540, 16, 0, %p31; // begin inline asm cp.async.cg.shared.global [%r1110], [%rd73], 16, %r1540; // end inline asm // begin inline asm cp.async.commit_group; // end inline asm mov.u64 %rd237, %rd72; mov.u64 %rd238, %rd70; $L__BB0_5: setp.eq.s16 %p32, %rs1, 0; @%p32 bra $L__BB0_38; cvt.s64.s32 %rd78, %r3; add.s64 %rd79, %rd12, %rd78; setp.ge.u64 %p33, %rd79, %rd2; mul.lo.s32 %r1546, %r1, %r3; cvt.s64.s32 %rd80, %r1546; cvt.u64.u32 %rd23, %r2819; add.s64 %rd81, %rd80, %rd23; add.s64 %rd24, %rd81, %rd13; mul.lo.s64 %rd82, %rd12, %rd2; add.s64 %rd83, %rd24, %rd82; add.s64 %rd25, %rd13, %rd23; setp.ge.u64 %p34, %rd25, %rd2; shl.b64 %rd84, %rd83, 1; mad.lo.s32 %r1549, %r2, %r1401, %r1402; cvt.s64.s32 %rd85, %r1549; mul.lo.s64 %rd86, %rd4, %rd85; add.s64 %rd87, %rd86, %rd84; cvta.to.global.u64 %rd88, %rd3; add.s64 %rd26, %rd88, %rd87; mov.u16 %rs132, 0; or.pred %p35, %p34, %p33; mov.u16 %rs131, %rs132; @%p35 bra $L__BB0_8; ld.global.u16 %rs131, [%rd26]; $L__BB0_8: add.s64 %rd27, %rd14, %rd23; setp.ge.u64 %p37, %rd27, %rd2; or.pred %p38, %p37, %p33; @%p38 bra $L__BB0_10; ld.global.u16 %rs132, [%rd26+2]; $L__BB0_10: add.s64 %rd28, %rd15, %rd23; setp.ge.u64 %p40, %rd28, %rd2; mov.u16 %rs134, 0; or.pred %p41, %p40, %p33; mov.u16 %rs133, %rs134; @%p41 bra $L__BB0_12; ld.global.u16 %rs133, [%rd26+16]; $L__BB0_12: add.s64 %rd29, %rd16, %rd23; setp.ge.u64 %p43, %rd29, %rd2; or.pred %p44, %p43, %p33; @%p44 bra $L__BB0_14; ld.global.u16 %rs134, [%rd26+18]; $L__BB0_14: cvt.s64.s32 %rd96, %r401; add.s64 %rd97, %rd96, %rd78; setp.ge.u64 %p45, %rd97, %rd2; mul.lo.s64 %rd98, %rd96, %rd2; add.s64 %rd99, %rd24, %rd98; shl.b64 %rd100, %rd99, 1; add.s64 %rd103, %rd86, %rd100; add.s64 %rd30, %rd88, %rd103; mov.u16 %rs136, 0; or.pred %p47, %p34, %p45; mov.u16 %rs135, %rs136; @%p47 bra $L__BB0_16; ld.global.u16 %rs135, [%rd30]; $L__BB0_16: or.pred %p50, %p37, %p45; @%p50 bra $L__BB0_18; ld.global.u16 %rs136, [%rd30+2]; $L__BB0_18: mov.u16 %rs138, 0; or.pred %p53, %p40, %p45; mov.u16 %rs137, %rs138; @%p53 bra $L__BB0_20; ld.global.u16 %rs137, [%rd30+16]; $L__BB0_20: or.pred %p56, %p43, %p45; @%p56 bra $L__BB0_22; ld.global.u16 %rs138, [%rd30+18]; $L__BB0_22: cvt.s64.s32 %rd115, %r402; add.s64 %rd116, %rd115, %rd78; setp.ge.u64 %p57, %rd116, %rd2; mul.lo.s64 %rd117, %rd115, %rd2; add.s64 %rd118, %rd24, %rd117; shl.b64 %rd119, %rd118, 1; add.s64 %rd122, %rd86, %rd119; add.s64 %rd31, %rd88, %rd122; mov.u16 %rs140, 0; or.pred %p59, %p34, %p57; mov.u16 %rs139, %rs140; @%p59 bra $L__BB0_24; ld.global.u16 %rs139, [%rd31]; $L__BB0_24: or.pred %p62, %p37, %p57; @%p62 bra $L__BB0_26; ld.global.u16 %rs140, [%rd31+2]; $L__BB0_26: mov.u16 %rs142, 0; or.pred %p65, %p40, %p57; mov.u16 %rs141, %rs142; @%p65 bra $L__BB0_28; ld.global.u16 %rs141, [%rd31+16]; $L__BB0_28: or.pred %p68, %p43, %p57; @%p68 bra $L__BB0_30; ld.global.u16 %rs142, [%rd31+18]; $L__BB0_30: cvt.s64.s32 %rd134, %r403; add.s64 %rd135, %rd134, %rd78; setp.ge.u64 %p69, %rd135, %rd2; mul.lo.s64 %rd136, %rd134, %rd2; add.s64 %rd137, %rd24, %rd136; shl.b64 %rd138, %rd137, 1; add.s64 %rd141, %rd86, %rd138; add.s64 %rd32, %rd88, %rd141; mov.u16 %rs144, 0; or.pred %p71, %p34, %p69; mov.u16 %rs143, %rs144; @%p71 bra $L__BB0_32; ld.global.u16 %rs143, [%rd32]; $L__BB0_32: or.pred %p74, %p37, %p69; @%p74 bra $L__BB0_34; ld.global.u16 %rs144, [%rd32+2]; $L__BB0_34: mov.u16 %rs146, 0; or.pred %p77, %p40, %p69; mov.u16 %rs145, %rs146; @%p77 bra $L__BB0_36; ld.global.u16 %rs145, [%rd32+16]; $L__BB0_36: or.pred %p80, %p43, %p69; @%p80 bra $L__BB0_38; ld.global.u16 %rs146, [%rd32+18]; $L__BB0_38: // begin inline asm mov.u32 %r1589, 0; // end inline asm // begin inline asm mov.u32 %r1590, 0; // end inline asm // begin inline asm mov.u32 %r1591, 0; // end inline asm // begin inline asm mov.u32 %r1592, 0; // end inline asm // begin inline asm mov.u32 %r1593, 0; // end inline asm // begin inline asm mov.u32 %r1594, 0; // end inline asm // begin inline asm mov.u32 %r1595, 0; // end inline asm // begin inline asm mov.u32 %r1596, 0; // end inline asm // begin inline asm mov.u32 %r1597, 0; // end inline asm // begin inline asm mov.u32 %r1598, 0; // end inline asm // begin inline asm mov.u32 %r1599, 0; // end inline asm // begin inline asm mov.u32 %r1600, 0; // end inline asm // begin inline asm mov.u32 %r1601, 0; // end inline asm // begin inline asm mov.u32 %r1602, 0; // end inline asm // begin inline asm mov.u32 %r1603, 0; // end inline asm // begin inline asm mov.u32 %r1604, 0; // end inline asm mov.b32 %f262, %r1589; mov.b32 %f263, %r1590; mov.b32 %f264, %r1591; mov.b32 %f265, %r1592; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f262, %f263, %f264, %f265}, {%r1112, %r1113, %r1114, %r1115}, {%r2854, %r2853}, {%f262, %f263, %f264, %f265}; // end inline asm mov.b32 %f270, %r1593; mov.b32 %f271, %r1594; mov.b32 %f272, %r1595; mov.b32 %f273, %r1596; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f270, %f271, %f272, %f273}, {%r1112, %r1113, %r1114, %r1115}, {%r2852, %r2851}, {%f270, %f271, %f272, %f273}; // end inline asm mov.b32 %f278, %r1597; mov.b32 %f279, %r1598; mov.b32 %f280, %r1599; mov.b32 %f281, %r1600; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f278, %f279, %f280, %f281}, {%r1117, %r1118, %r1119, %r1120}, {%r2854, %r2853}, {%f278, %f279, %f280, %f281}; // end inline asm mov.b32 %f286, %r1601; mov.b32 %f287, %r1602; mov.b32 %f288, %r1603; mov.b32 %f289, %r1604; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f286, %f287, %f288, %f289}, {%r1117, %r1118, %r1119, %r1120}, {%r2852, %r2851}, {%f286, %f287, %f288, %f289}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f262, %f263, %f264, %f265}, {%r1122, %r1123, %r1124, %r1125}, {%r2850, %r2849}, {%f262, %f263, %f264, %f265}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f270, %f271, %f272, %f273}, {%r1122, %r1123, %r1124, %r1125}, {%r2848, %r2847}, {%f270, %f271, %f272, %f273}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f278, %f279, %f280, %f281}, {%r1127, %r1128, %r1129, %r1130}, {%r2850, %r2849}, {%f278, %f279, %f280, %f281}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f286, %f287, %f288, %f289}, {%r1127, %r1128, %r1129, %r1130}, {%r2848, %r2847}, {%f286, %f287, %f288, %f289}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f262, %f263, %f264, %f265}, {%r1132, %r1133, %r1134, %r1135}, {%r2846, %r2845}, {%f262, %f263, %f264, %f265}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f270, %f271, %f272, %f273}, {%r1132, %r1133, %r1134, %r1135}, {%r2844, %r2843}, {%f270, %f271, %f272, %f273}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f278, %f279, %f280, %f281}, {%r1137, %r1138, %r1139, %r1140}, {%r2846, %r2845}, {%f278, %f279, %f280, %f281}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f286, %f287, %f288, %f289}, {%r1137, %r1138, %r1139, %r1140}, {%r2844, %r2843}, {%f286, %f287, %f288, %f289}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f262, %f263, %f264, %f265}, {%r1142, %r1143, %r1144, %r1145}, {%r2842, %r2841}, {%f262, %f263, %f264, %f265}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f270, %f271, %f272, %f273}, {%r1142, %r1143, %r1144, %r1145}, {%r2840, %r2839}, {%f270, %f271, %f272, %f273}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f278, %f279, %f280, %f281}, {%r1147, %r1148, %r1149, %r1150}, {%r2842, %r2841}, {%f278, %f279, %f280, %f281}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f286, %f287, %f288, %f289}, {%r1147, %r1148, %r1149, %r1150}, {%r2840, %r2839}, {%f286, %f287, %f288, %f289}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f262, %f263, %f264, %f265}, {%r1152, %r1153, %r1154, %r1155}, {%r2838, %r2837}, {%f262, %f263, %f264, %f265}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f270, %f271, %f272, %f273}, {%r1152, %r1153, %r1154, %r1155}, {%r2836, %r2835}, {%f270, %f271, %f272, %f273}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f278, %f279, %f280, %f281}, {%r1157, %r1158, %r1159, %r1160}, {%r2838, %r2837}, {%f278, %f279, %f280, %f281}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f286, %f287, %f288, %f289}, {%r1157, %r1158, %r1159, %r1160}, {%r2836, %r2835}, {%f286, %f287, %f288, %f289}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f262, %f263, %f264, %f265}, {%r1162, %r1163, %r1164, %r1165}, {%r2834, %r2833}, {%f262, %f263, %f264, %f265}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f270, %f271, %f272, %f273}, {%r1162, %r1163, %r1164, %r1165}, {%r2832, %r2831}, {%f270, %f271, %f272, %f273}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f278, %f279, %f280, %f281}, {%r1167, %r1168, %r1169, %r1170}, {%r2834, %r2833}, {%f278, %f279, %f280, %f281}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f286, %f287, %f288, %f289}, {%r1167, %r1168, %r1169, %r1170}, {%r2832, %r2831}, {%f286, %f287, %f288, %f289}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f262, %f263, %f264, %f265}, {%r1172, %r1173, %r1174, %r1175}, {%r2830, %r2829}, {%f262, %f263, %f264, %f265}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f270, %f271, %f272, %f273}, {%r1172, %r1173, %r1174, %r1175}, {%r2828, %r2827}, {%f270, %f271, %f272, %f273}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f278, %f279, %f280, %f281}, {%r1177, %r1178, %r1179, %r1180}, {%r2830, %r2829}, {%f278, %f279, %f280, %f281}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f286, %f287, %f288, %f289}, {%r1177, %r1178, %r1179, %r1180}, {%r2828, %r2827}, {%f286, %f287, %f288, %f289}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f262, %f263, %f264, %f265}, {%r1182, %r1183, %r1184, %r1185}, {%r2826, %r2825}, {%f262, %f263, %f264, %f265}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f270, %f271, %f272, %f273}, {%r1182, %r1183, %r1184, %r1185}, {%r2824, %r2823}, {%f270, %f271, %f272, %f273}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f278, %f279, %f280, %f281}, {%r1187, %r1188, %r1189, %r1190}, {%r2826, %r2825}, {%f278, %f279, %f280, %f281}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f286, %f287, %f288, %f289}, {%r1187, %r1188, %r1189, %r1190}, {%r2824, %r2823}, {%f286, %f287, %f288, %f289}; // end inline asm mul.ftz.f32 %f486, %f1, %f262; mul.ftz.f32 %f487, %f1, %f263; mul.ftz.f32 %f488, %f1, %f270; mul.ftz.f32 %f489, %f1, %f271; mul.ftz.f32 %f490, %f1, %f264; mul.ftz.f32 %f491, %f1, %f265; mul.ftz.f32 %f492, %f1, %f272; mul.ftz.f32 %f493, %f1, %f273; mul.ftz.f32 %f494, %f1, %f278; mul.ftz.f32 %f495, %f1, %f279; mul.ftz.f32 %f496, %f1, %f286; mul.ftz.f32 %f497, %f1, %f287; mul.ftz.f32 %f498, %f1, %f280; mul.ftz.f32 %f499, %f1, %f281; mul.ftz.f32 %f500, %f1, %f288; mul.ftz.f32 %f501, %f1, %f289; setp.lt.s32 %p81, %r2820, %r1; selp.f32 %f1437, %f486, 0fFF800000, %p81; add.s32 %r1797, %r2820, 1; setp.lt.s32 %p82, %r1797, %r1; selp.f32 %f1436, %f487, 0fFF800000, %p82; add.s32 %r1798, %r2820, 8; setp.lt.s32 %p83, %r1798, %r1; selp.f32 %f1435, %f488, 0fFF800000, %p83; add.s32 %r1799, %r2820, 9; setp.lt.s32 %p84, %r1799, %r1; selp.f32 %f1434, %f489, 0fFF800000, %p84; selp.f32 %f1433, %f490, 0fFF800000, %p81; selp.f32 %f1432, %f491, 0fFF800000, %p82; selp.f32 %f1431, %f492, 0fFF800000, %p83; selp.f32 %f1430, %f493, 0fFF800000, %p84; selp.f32 %f1429, %f494, 0fFF800000, %p81; selp.f32 %f1428, %f495, 0fFF800000, %p82; selp.f32 %f1427, %f496, 0fFF800000, %p83; selp.f32 %f1426, %f497, 0fFF800000, %p84; selp.f32 %f1425, %f498, 0fFF800000, %p81; selp.f32 %f1424, %f499, 0fFF800000, %p82; selp.f32 %f1423, %f500, 0fFF800000, %p83; selp.f32 %f1422, %f501, 0fFF800000, %p84; @%p32 bra $L__BB0_40; // begin inline asm cvt.f32.f16 %f502, %rs131; // end inline asm add.ftz.f32 %f1437, %f502, %f1437; // begin inline asm cvt.f32.f16 %f503, %rs132; // end inline asm add.ftz.f32 %f1436, %f503, %f1436; // begin inline asm cvt.f32.f16 %f504, %rs133; // end inline asm add.ftz.f32 %f1435, %f504, %f1435; // begin inline asm cvt.f32.f16 %f505, %rs134; // end inline asm add.ftz.f32 %f1434, %f505, %f1434; // begin inline asm cvt.f32.f16 %f506, %rs135; // end inline asm add.ftz.f32 %f1433, %f506, %f1433; // begin inline asm cvt.f32.f16 %f507, %rs136; // end inline asm add.ftz.f32 %f1432, %f507, %f1432; // begin inline asm cvt.f32.f16 %f508, %rs137; // end inline asm add.ftz.f32 %f1431, %f508, %f1431; // begin inline asm cvt.f32.f16 %f509, %rs138; // end inline asm add.ftz.f32 %f1430, %f509, %f1430; // begin inline asm cvt.f32.f16 %f510, %rs139; // end inline asm add.ftz.f32 %f1429, %f510, %f1429; // begin inline asm cvt.f32.f16 %f511, %rs140; // end inline asm add.ftz.f32 %f1428, %f511, %f1428; // begin inline asm cvt.f32.f16 %f512, %rs141; // end inline asm add.ftz.f32 %f1427, %f512, %f1427; // begin inline asm cvt.f32.f16 %f513, %rs142; // end inline asm add.ftz.f32 %f1426, %f513, %f1426; // begin inline asm cvt.f32.f16 %f514, %rs143; // end inline asm add.ftz.f32 %f1425, %f514, %f1425; // begin inline asm cvt.f32.f16 %f515, %rs144; // end inline asm add.ftz.f32 %f1424, %f515, %f1424; // begin inline asm cvt.f32.f16 %f516, %rs145; // end inline asm add.ftz.f32 %f1423, %f516, %f1423; // begin inline asm cvt.f32.f16 %f517, %rs146; // end inline asm add.ftz.f32 %f1422, %f517, %f1422; $L__BB0_40: add.s32 %r2624, %r2819, 16; setp.ge.s32 %p164, %r2624, %r272; setp.gt.ftz.f32 %p87, %f1437, %f1436; selp.f32 %f518, %f1437, %f1436, %p87; setp.gt.ftz.f32 %p88, %f518, %f1435; selp.f32 %f519, %f518, %f1435, %p88; setp.gt.ftz.f32 %p89, %f519, %f1434; selp.f32 %f520, %f519, %f1434, %p89; setp.gt.ftz.f32 %p90, %f1433, %f1432; selp.f32 %f521, %f1433, %f1432, %p90; setp.gt.ftz.f32 %p91, %f521, %f1431; selp.f32 %f522, %f521, %f1431, %p91; setp.gt.ftz.f32 %p92, %f522, %f1430; selp.f32 %f523, %f522, %f1430, %p92; setp.gt.ftz.f32 %p93, %f1429, %f1428; selp.f32 %f524, %f1429, %f1428, %p93; setp.gt.ftz.f32 %p94, %f524, %f1427; selp.f32 %f525, %f524, %f1427, %p94; setp.gt.ftz.f32 %p95, %f525, %f1426; selp.f32 %f526, %f525, %f1426, %p95; setp.gt.ftz.f32 %p96, %f1425, %f1424; selp.f32 %f527, %f1425, %f1424, %p96; setp.gt.ftz.f32 %p97, %f527, %f1423; selp.f32 %f528, %f527, %f1423, %p97; setp.gt.ftz.f32 %p98, %f528, %f1422; selp.f32 %f529, %f528, %f1422, %p98; mov.b32 %r1801, %f520; mov.u32 %r1802, 31; mov.u32 %r1803, 1; mov.u32 %r1804, -1; shfl.sync.bfly.b32 %r1805|%p99, %r1801, %r1803, %r1802, %r1804; mov.b32 %f530, %r1805; setp.gt.ftz.f32 %p100, %f520, %f530; selp.f32 %f531, %f520, %f530, %p100; mov.b32 %r1806, %f531; mov.u32 %r1807, 2; shfl.sync.bfly.b32 %r1808|%p101, %r1806, %r1807, %r1802, %r1804; mov.b32 %f532, %r1808; setp.gt.ftz.f32 %p102, %f531, %f532; selp.f32 %f533, %f531, %f532, %p102; mov.b32 %r1809, %f523; shfl.sync.bfly.b32 %r1810|%p103, %r1809, %r1803, %r1802, %r1804; mov.b32 %f534, %r1810; setp.gt.ftz.f32 %p104, %f523, %f534; selp.f32 %f535, %f523, %f534, %p104; mov.b32 %r1811, %f535; shfl.sync.bfly.b32 %r1812|%p105, %r1811, %r1807, %r1802, %r1804; mov.b32 %f536, %r1812; setp.gt.ftz.f32 %p106, %f535, %f536; selp.f32 %f537, %f535, %f536, %p106; mov.b32 %r1813, %f526; shfl.sync.bfly.b32 %r1814|%p107, %r1813, %r1803, %r1802, %r1804; mov.b32 %f538, %r1814; setp.gt.ftz.f32 %p108, %f526, %f538; selp.f32 %f539, %f526, %f538, %p108; mov.b32 %r1815, %f539; shfl.sync.bfly.b32 %r1816|%p109, %r1815, %r1807, %r1802, %r1804; mov.b32 %f540, %r1816; setp.gt.ftz.f32 %p110, %f539, %f540; selp.f32 %f541, %f539, %f540, %p110; mov.b32 %r1817, %f529; shfl.sync.bfly.b32 %r1818|%p111, %r1817, %r1803, %r1802, %r1804; mov.b32 %f542, %r1818; setp.gt.ftz.f32 %p112, %f529, %f542; selp.f32 %f543, %f529, %f542, %p112; mov.b32 %r1819, %f543; shfl.sync.bfly.b32 %r1820|%p113, %r1819, %r1807, %r1802, %r1804; mov.b32 %f544, %r1820; setp.gt.ftz.f32 %p114, %f543, %f544; selp.f32 %f545, %f543, %f544, %p114; max.ftz.f32 %f58, %f533, %f1421; max.ftz.f32 %f59, %f537, %f1420; max.ftz.f32 %f60, %f541, %f1419; max.ftz.f32 %f61, %f545, %f1418; sub.ftz.f32 %f546, %f1437, %f58; mul.ftz.f32 %f547, %f546, 0f3FB8AA3B; ex2.approx.ftz.f32 %f62, %f547; sub.ftz.f32 %f548, %f1436, %f58; mul.ftz.f32 %f549, %f548, 0f3FB8AA3B; ex2.approx.ftz.f32 %f63, %f549; sub.ftz.f32 %f550, %f1435, %f58; mul.ftz.f32 %f551, %f550, 0f3FB8AA3B; ex2.approx.ftz.f32 %f64, %f551; sub.ftz.f32 %f552, %f1434, %f58; mul.ftz.f32 %f553, %f552, 0f3FB8AA3B; ex2.approx.ftz.f32 %f65, %f553; sub.ftz.f32 %f554, %f1433, %f59; mul.ftz.f32 %f555, %f554, 0f3FB8AA3B; ex2.approx.ftz.f32 %f66, %f555; sub.ftz.f32 %f556, %f1432, %f59; mul.ftz.f32 %f557, %f556, 0f3FB8AA3B; ex2.approx.ftz.f32 %f67, %f557; sub.ftz.f32 %f558, %f1431, %f59; mul.ftz.f32 %f559, %f558, 0f3FB8AA3B; ex2.approx.ftz.f32 %f68, %f559; sub.ftz.f32 %f560, %f1430, %f59; mul.ftz.f32 %f561, %f560, 0f3FB8AA3B; ex2.approx.ftz.f32 %f69, %f561; sub.ftz.f32 %f562, %f1429, %f60; mul.ftz.f32 %f563, %f562, 0f3FB8AA3B; ex2.approx.ftz.f32 %f70, %f563; sub.ftz.f32 %f564, %f1428, %f60; mul.ftz.f32 %f565, %f564, 0f3FB8AA3B; ex2.approx.ftz.f32 %f71, %f565; sub.ftz.f32 %f566, %f1427, %f60; mul.ftz.f32 %f567, %f566, 0f3FB8AA3B; ex2.approx.ftz.f32 %f72, %f567; sub.ftz.f32 %f568, %f1426, %f60; mul.ftz.f32 %f569, %f568, 0f3FB8AA3B; ex2.approx.ftz.f32 %f73, %f569; sub.ftz.f32 %f570, %f1425, %f61; mul.ftz.f32 %f571, %f570, 0f3FB8AA3B; ex2.approx.ftz.f32 %f74, %f571; sub.ftz.f32 %f572, %f1424, %f61; mul.ftz.f32 %f573, %f572, 0f3FB8AA3B; ex2.approx.ftz.f32 %f75, %f573; sub.ftz.f32 %f574, %f1423, %f61; mul.ftz.f32 %f575, %f574, 0f3FB8AA3B; ex2.approx.ftz.f32 %f76, %f575; sub.ftz.f32 %f576, %f1422, %f61; mul.ftz.f32 %f577, %f576, 0f3FB8AA3B; ex2.approx.ftz.f32 %f77, %f577; add.ftz.f32 %f578, %f62, %f63; add.ftz.f32 %f579, %f578, 0f00000000; add.ftz.f32 %f580, %f64, %f65; add.ftz.f32 %f581, %f580, 0f00000000; add.ftz.f32 %f582, %f579, %f581; add.ftz.f32 %f583, %f66, %f67; add.ftz.f32 %f584, %f583, 0f00000000; add.ftz.f32 %f585, %f68, %f69; add.ftz.f32 %f586, %f585, 0f00000000; add.ftz.f32 %f587, %f584, %f586; add.ftz.f32 %f588, %f70, %f71; add.ftz.f32 %f589, %f588, 0f00000000; add.ftz.f32 %f590, %f72, %f73; add.ftz.f32 %f591, %f590, 0f00000000; add.ftz.f32 %f592, %f589, %f591; add.ftz.f32 %f593, %f74, %f75; add.ftz.f32 %f594, %f593, 0f00000000; add.ftz.f32 %f595, %f76, %f77; add.ftz.f32 %f596, %f595, 0f00000000; add.ftz.f32 %f597, %f594, %f596; mov.b32 %r1821, %f582; shfl.sync.bfly.b32 %r1822|%p115, %r1821, %r1803, %r1802, %r1804; mov.b32 %f598, %r1822; add.ftz.f32 %f599, %f582, %f598; mov.b32 %r1823, %f599; shfl.sync.bfly.b32 %r1824|%p116, %r1823, %r1807, %r1802, %r1804; mov.b32 %f600, %r1824; add.ftz.f32 %f601, %f599, %f600; mov.b32 %r1825, %f587; shfl.sync.bfly.b32 %r1826|%p117, %r1825, %r1803, %r1802, %r1804; mov.b32 %f602, %r1826; add.ftz.f32 %f603, %f587, %f602; mov.b32 %r1827, %f603; shfl.sync.bfly.b32 %r1828|%p118, %r1827, %r1807, %r1802, %r1804; mov.b32 %f604, %r1828; add.ftz.f32 %f605, %f603, %f604; mov.b32 %r1829, %f592; shfl.sync.bfly.b32 %r1830|%p119, %r1829, %r1803, %r1802, %r1804; mov.b32 %f606, %r1830; add.ftz.f32 %f607, %f592, %f606; mov.b32 %r1831, %f607; shfl.sync.bfly.b32 %r1832|%p120, %r1831, %r1807, %r1802, %r1804; mov.b32 %f608, %r1832; add.ftz.f32 %f609, %f607, %f608; mov.b32 %r1833, %f597; shfl.sync.bfly.b32 %r1834|%p121, %r1833, %r1803, %r1802, %r1804; mov.b32 %f610, %r1834; add.ftz.f32 %f611, %f597, %f610; mov.b32 %r1835, %f611; shfl.sync.bfly.b32 %r1836|%p122, %r1835, %r1807, %r1802, %r1804; mov.b32 %f612, %r1836; add.ftz.f32 %f613, %f611, %f612; sub.ftz.f32 %f614, %f1421, %f58; mul.ftz.f32 %f615, %f614, 0f3FB8AA3B; ex2.approx.ftz.f32 %f616, %f615; mul.ftz.f32 %f78, %f616, %f1417; add.ftz.f32 %f1417, %f78, %f601; sub.ftz.f32 %f617, %f1420, %f59; mul.ftz.f32 %f618, %f617, 0f3FB8AA3B; ex2.approx.ftz.f32 %f619, %f618; mul.ftz.f32 %f80, %f619, %f1416; add.ftz.f32 %f1416, %f80, %f605; sub.ftz.f32 %f620, %f1419, %f60; mul.ftz.f32 %f621, %f620, 0f3FB8AA3B; ex2.approx.ftz.f32 %f622, %f621; mul.ftz.f32 %f82, %f622, %f1415; add.ftz.f32 %f1415, %f82, %f609; sub.ftz.f32 %f623, %f1418, %f61; mul.ftz.f32 %f624, %f623, 0f3FB8AA3B; ex2.approx.ftz.f32 %f625, %f624; mul.ftz.f32 %f84, %f625, %f1414; add.ftz.f32 %f1414, %f84, %f613; @%p164 bra $L__BB0_42; // begin inline asm cp.async.wait_group 0; // end inline asm bar.sync 0; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2854, %r2853, %r2852, %r2851}, [%r1196]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2850, %r2849, %r2848, %r2847}, [%r1201]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2846, %r2845, %r2844, %r2843}, [%r1206]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2842, %r2841, %r2840, %r2839}, [%r1211]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2838, %r2837, %r2836, %r2835}, [%r1216]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2834, %r2833, %r2832, %r2831}, [%r1221]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2830, %r2829, %r2828, %r2827}, [%r1226]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2826, %r2825, %r2824, %r2823}, [%r1231]; // end inline asm $L__BB0_42: // begin inline asm cvt.rn.f16x2.f32 %r1895, %f63, %f62; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1896, %f67, %f66; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1897, %f65, %f64; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1898, %f69, %f68; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1899, %f71, %f70; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1900, %f75, %f74; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1901, %f73, %f72; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1902, %f77, %f76; // end inline asm // begin inline asm mov.u32 %r1903, 0; // end inline asm // begin inline asm mov.u32 %r1904, 0; // end inline asm // begin inline asm mov.u32 %r1905, 0; // end inline asm // begin inline asm mov.u32 %r1906, 0; // end inline asm // begin inline asm mov.u32 %r1907, 0; // end inline asm // begin inline asm mov.u32 %r1908, 0; // end inline asm // begin inline asm mov.u32 %r1909, 0; // end inline asm // begin inline asm mov.u32 %r1910, 0; // end inline asm // begin inline asm mov.u32 %r1911, 0; // end inline asm // begin inline asm mov.u32 %r1912, 0; // end inline asm // begin inline asm mov.u32 %r1913, 0; // end inline asm // begin inline asm mov.u32 %r1914, 0; // end inline asm // begin inline asm mov.u32 %r1915, 0; // end inline asm // begin inline asm mov.u32 %r1916, 0; // end inline asm // begin inline asm mov.u32 %r1917, 0; // end inline asm // begin inline asm mov.u32 %r1918, 0; // end inline asm // begin inline asm mov.u32 %r1919, 0; // end inline asm // begin inline asm mov.u32 %r1920, 0; // end inline asm // begin inline asm mov.u32 %r1921, 0; // end inline asm // begin inline asm mov.u32 %r1922, 0; // end inline asm // begin inline asm mov.u32 %r1923, 0; // end inline asm // begin inline asm mov.u32 %r1924, 0; // end inline asm // begin inline asm mov.u32 %r1925, 0; // end inline asm // begin inline asm mov.u32 %r1926, 0; // end inline asm // begin inline asm mov.u32 %r1927, 0; // end inline asm // begin inline asm mov.u32 %r1928, 0; // end inline asm // begin inline asm mov.u32 %r1929, 0; // end inline asm // begin inline asm mov.u32 %r1930, 0; // end inline asm // begin inline asm mov.u32 %r1931, 0; // end inline asm // begin inline asm mov.u32 %r1932, 0; // end inline asm // begin inline asm mov.u32 %r1933, 0; // end inline asm // begin inline asm mov.u32 %r1934, 0; // end inline asm // begin inline asm mov.u32 %r1935, 0; // end inline asm // begin inline asm mov.u32 %r1936, 0; // end inline asm // begin inline asm mov.u32 %r1937, 0; // end inline asm // begin inline asm mov.u32 %r1938, 0; // end inline asm // begin inline asm mov.u32 %r1939, 0; // end inline asm // begin inline asm mov.u32 %r1940, 0; // end inline asm // begin inline asm mov.u32 %r1941, 0; // end inline asm // begin inline asm mov.u32 %r1942, 0; // end inline asm // begin inline asm mov.u32 %r1943, 0; // end inline asm // begin inline asm mov.u32 %r1944, 0; // end inline asm // begin inline asm mov.u32 %r1945, 0; // end inline asm // begin inline asm mov.u32 %r1946, 0; // end inline asm // begin inline asm mov.u32 %r1947, 0; // end inline asm // begin inline asm mov.u32 %r1948, 0; // end inline asm // begin inline asm mov.u32 %r1949, 0; // end inline asm // begin inline asm mov.u32 %r1950, 0; // end inline asm // begin inline asm mov.u32 %r1951, 0; // end inline asm // begin inline asm mov.u32 %r1952, 0; // end inline asm // begin inline asm mov.u32 %r1953, 0; // end inline asm // begin inline asm mov.u32 %r1954, 0; // end inline asm // begin inline asm mov.u32 %r1955, 0; // end inline asm // begin inline asm mov.u32 %r1956, 0; // end inline asm // begin inline asm mov.u32 %r1957, 0; // end inline asm // begin inline asm mov.u32 %r1958, 0; // end inline asm // begin inline asm mov.u32 %r1959, 0; // end inline asm // begin inline asm mov.u32 %r1960, 0; // end inline asm // begin inline asm mov.u32 %r1961, 0; // end inline asm // begin inline asm mov.u32 %r1962, 0; // end inline asm // begin inline asm mov.u32 %r1963, 0; // end inline asm // begin inline asm mov.u32 %r1964, 0; // end inline asm // begin inline asm mov.u32 %r1965, 0; // end inline asm // begin inline asm mov.u32 %r1966, 0; // end inline asm // begin inline asm mov.u32 %r1967, 0; // end inline asm // begin inline asm mov.u32 %r1968, 0; // end inline asm // begin inline asm mov.u32 %r1969, 0; // end inline asm // begin inline asm mov.u32 %r1970, 0; // end inline asm // begin inline asm mov.u32 %r1971, 0; // end inline asm // begin inline asm mov.u32 %r1972, 0; // end inline asm // begin inline asm mov.u32 %r1973, 0; // end inline asm // begin inline asm mov.u32 %r1974, 0; // end inline asm // begin inline asm mov.u32 %r1975, 0; // end inline asm // begin inline asm mov.u32 %r1976, 0; // end inline asm // begin inline asm mov.u32 %r1977, 0; // end inline asm // begin inline asm mov.u32 %r1978, 0; // end inline asm // begin inline asm mov.u32 %r1979, 0; // end inline asm // begin inline asm mov.u32 %r1980, 0; // end inline asm // begin inline asm mov.u32 %r1981, 0; // end inline asm // begin inline asm mov.u32 %r1982, 0; // end inline asm // begin inline asm mov.u32 %r1983, 0; // end inline asm // begin inline asm mov.u32 %r1984, 0; // end inline asm // begin inline asm mov.u32 %r1985, 0; // end inline asm // begin inline asm mov.u32 %r1986, 0; // end inline asm // begin inline asm mov.u32 %r1987, 0; // end inline asm // begin inline asm mov.u32 %r1988, 0; // end inline asm // begin inline asm mov.u32 %r1989, 0; // end inline asm // begin inline asm mov.u32 %r1990, 0; // end inline asm // begin inline asm mov.u32 %r1991, 0; // end inline asm // begin inline asm mov.u32 %r1992, 0; // end inline asm // begin inline asm mov.u32 %r1993, 0; // end inline asm // begin inline asm mov.u32 %r1994, 0; // end inline asm // begin inline asm mov.u32 %r1995, 0; // end inline asm // begin inline asm mov.u32 %r1996, 0; // end inline asm // begin inline asm mov.u32 %r1997, 0; // end inline asm // begin inline asm mov.u32 %r1998, 0; // end inline asm // begin inline asm mov.u32 %r1999, 0; // end inline asm // begin inline asm mov.u32 %r2000, 0; // end inline asm // begin inline asm mov.u32 %r2001, 0; // end inline asm // begin inline asm mov.u32 %r2002, 0; // end inline asm // begin inline asm mov.u32 %r2003, 0; // end inline asm // begin inline asm mov.u32 %r2004, 0; // end inline asm // begin inline asm mov.u32 %r2005, 0; // end inline asm // begin inline asm mov.u32 %r2006, 0; // end inline asm // begin inline asm mov.u32 %r2007, 0; // end inline asm // begin inline asm mov.u32 %r2008, 0; // end inline asm // begin inline asm mov.u32 %r2009, 0; // end inline asm // begin inline asm mov.u32 %r2010, 0; // end inline asm // begin inline asm mov.u32 %r2011, 0; // end inline asm // begin inline asm mov.u32 %r2012, 0; // end inline asm // begin inline asm mov.u32 %r2013, 0; // end inline asm // begin inline asm mov.u32 %r2014, 0; // end inline asm // begin inline asm mov.u32 %r2015, 0; // end inline asm // begin inline asm mov.u32 %r2016, 0; // end inline asm // begin inline asm mov.u32 %r2017, 0; // end inline asm // begin inline asm mov.u32 %r2018, 0; // end inline asm // begin inline asm mov.u32 %r2019, 0; // end inline asm // begin inline asm mov.u32 %r2020, 0; // end inline asm // begin inline asm mov.u32 %r2021, 0; // end inline asm // begin inline asm mov.u32 %r2022, 0; // end inline asm // begin inline asm mov.u32 %r2023, 0; // end inline asm // begin inline asm mov.u32 %r2024, 0; // end inline asm // begin inline asm mov.u32 %r2025, 0; // end inline asm // begin inline asm mov.u32 %r2026, 0; // end inline asm // begin inline asm mov.u32 %r2027, 0; // end inline asm // begin inline asm mov.u32 %r2028, 0; // end inline asm // begin inline asm mov.u32 %r2029, 0; // end inline asm // begin inline asm mov.u32 %r2030, 0; // end inline asm mov.b32 %f642, %r1903; mov.b32 %f643, %r1904; mov.b32 %f644, %r1905; mov.b32 %f645, %r1906; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f642, %f643, %f644, %f645}, {%r1895, %r1896, %r1897, %r1898}, {%r2886, %r2885}, {%f642, %f643, %f644, %f645}; // end inline asm mov.b32 %f650, %r1907; mov.b32 %f651, %r1908; mov.b32 %f652, %r1909; mov.b32 %f653, %r1910; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f650, %f651, %f652, %f653}, {%r1895, %r1896, %r1897, %r1898}, {%r2884, %r2883}, {%f650, %f651, %f652, %f653}; // end inline asm mov.b32 %f658, %r1911; mov.b32 %f659, %r1912; mov.b32 %f660, %r1913; mov.b32 %f661, %r1914; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f658, %f659, %f660, %f661}, {%r1895, %r1896, %r1897, %r1898}, {%r2882, %r2881}, {%f658, %f659, %f660, %f661}; // end inline asm mov.b32 %f666, %r1915; mov.b32 %f667, %r1916; mov.b32 %f668, %r1917; mov.b32 %f669, %r1918; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f666, %f667, %f668, %f669}, {%r1895, %r1896, %r1897, %r1898}, {%r2880, %r2879}, {%f666, %f667, %f668, %f669}; // end inline asm mov.b32 %f674, %r1919; mov.b32 %f675, %r1920; mov.b32 %f676, %r1921; mov.b32 %f677, %r1922; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f674, %f675, %f676, %f677}, {%r1895, %r1896, %r1897, %r1898}, {%r2878, %r2877}, {%f674, %f675, %f676, %f677}; // end inline asm mov.b32 %f682, %r1923; mov.b32 %f683, %r1924; mov.b32 %f684, %r1925; mov.b32 %f685, %r1926; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f682, %f683, %f684, %f685}, {%r1895, %r1896, %r1897, %r1898}, {%r2876, %r2875}, {%f682, %f683, %f684, %f685}; // end inline asm mov.b32 %f690, %r1927; mov.b32 %f691, %r1928; mov.b32 %f692, %r1929; mov.b32 %f693, %r1930; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f690, %f691, %f692, %f693}, {%r1895, %r1896, %r1897, %r1898}, {%r2874, %r2873}, {%f690, %f691, %f692, %f693}; // end inline asm mov.b32 %f698, %r1931; mov.b32 %f699, %r1932; mov.b32 %f700, %r1933; mov.b32 %f701, %r1934; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f698, %f699, %f700, %f701}, {%r1895, %r1896, %r1897, %r1898}, {%r2872, %r2871}, {%f698, %f699, %f700, %f701}; // end inline asm mov.b32 %f706, %r1935; mov.b32 %f707, %r1936; mov.b32 %f708, %r1937; mov.b32 %f709, %r1938; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f706, %f707, %f708, %f709}, {%r1895, %r1896, %r1897, %r1898}, {%r2870, %r2869}, {%f706, %f707, %f708, %f709}; // end inline asm mov.b32 %f714, %r1939; mov.b32 %f715, %r1940; mov.b32 %f716, %r1941; mov.b32 %f717, %r1942; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f714, %f715, %f716, %f717}, {%r1895, %r1896, %r1897, %r1898}, {%r2868, %r2867}, {%f714, %f715, %f716, %f717}; // end inline asm mov.b32 %f722, %r1943; mov.b32 %f723, %r1944; mov.b32 %f724, %r1945; mov.b32 %f725, %r1946; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f722, %f723, %f724, %f725}, {%r1895, %r1896, %r1897, %r1898}, {%r2866, %r2865}, {%f722, %f723, %f724, %f725}; // end inline asm mov.b32 %f730, %r1947; mov.b32 %f731, %r1948; mov.b32 %f732, %r1949; mov.b32 %f733, %r1950; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f730, %f731, %f732, %f733}, {%r1895, %r1896, %r1897, %r1898}, {%r2864, %r2863}, {%f730, %f731, %f732, %f733}; // end inline asm mov.b32 %f738, %r1951; mov.b32 %f739, %r1952; mov.b32 %f740, %r1953; mov.b32 %f741, %r1954; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f738, %f739, %f740, %f741}, {%r1895, %r1896, %r1897, %r1898}, {%r2862, %r2861}, {%f738, %f739, %f740, %f741}; // end inline asm mov.b32 %f746, %r1955; mov.b32 %f747, %r1956; mov.b32 %f748, %r1957; mov.b32 %f749, %r1958; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f746, %f747, %f748, %f749}, {%r1895, %r1896, %r1897, %r1898}, {%r2860, %r2859}, {%f746, %f747, %f748, %f749}; // end inline asm mov.b32 %f754, %r1959; mov.b32 %f755, %r1960; mov.b32 %f756, %r1961; mov.b32 %f757, %r1962; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f754, %f755, %f756, %f757}, {%r1895, %r1896, %r1897, %r1898}, {%r2858, %r2857}, {%f754, %f755, %f756, %f757}; // end inline asm mov.b32 %f762, %r1963; mov.b32 %f763, %r1964; mov.b32 %f764, %r1965; mov.b32 %f765, %r1966; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f762, %f763, %f764, %f765}, {%r1895, %r1896, %r1897, %r1898}, {%r2856, %r2855}, {%f762, %f763, %f764, %f765}; // end inline asm mov.b32 %f770, %r1967; mov.b32 %f771, %r1968; mov.b32 %f772, %r1969; mov.b32 %f773, %r1970; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f770, %f771, %f772, %f773}, {%r1899, %r1900, %r1901, %r1902}, {%r2886, %r2885}, {%f770, %f771, %f772, %f773}; // end inline asm mov.b32 %f778, %r1971; mov.b32 %f779, %r1972; mov.b32 %f780, %r1973; mov.b32 %f781, %r1974; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f778, %f779, %f780, %f781}, {%r1899, %r1900, %r1901, %r1902}, {%r2884, %r2883}, {%f778, %f779, %f780, %f781}; // end inline asm mov.b32 %f786, %r1975; mov.b32 %f787, %r1976; mov.b32 %f788, %r1977; mov.b32 %f789, %r1978; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f786, %f787, %f788, %f789}, {%r1899, %r1900, %r1901, %r1902}, {%r2882, %r2881}, {%f786, %f787, %f788, %f789}; // end inline asm mov.b32 %f794, %r1979; mov.b32 %f795, %r1980; mov.b32 %f796, %r1981; mov.b32 %f797, %r1982; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f794, %f795, %f796, %f797}, {%r1899, %r1900, %r1901, %r1902}, {%r2880, %r2879}, {%f794, %f795, %f796, %f797}; // end inline asm mov.b32 %f802, %r1983; mov.b32 %f803, %r1984; mov.b32 %f804, %r1985; mov.b32 %f805, %r1986; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f802, %f803, %f804, %f805}, {%r1899, %r1900, %r1901, %r1902}, {%r2878, %r2877}, {%f802, %f803, %f804, %f805}; // end inline asm mov.b32 %f810, %r1987; mov.b32 %f811, %r1988; mov.b32 %f812, %r1989; mov.b32 %f813, %r1990; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f810, %f811, %f812, %f813}, {%r1899, %r1900, %r1901, %r1902}, {%r2876, %r2875}, {%f810, %f811, %f812, %f813}; // end inline asm mov.b32 %f818, %r1991; mov.b32 %f819, %r1992; mov.b32 %f820, %r1993; mov.b32 %f821, %r1994; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f818, %f819, %f820, %f821}, {%r1899, %r1900, %r1901, %r1902}, {%r2874, %r2873}, {%f818, %f819, %f820, %f821}; // end inline asm mov.b32 %f826, %r1995; mov.b32 %f827, %r1996; mov.b32 %f828, %r1997; mov.b32 %f829, %r1998; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f826, %f827, %f828, %f829}, {%r1899, %r1900, %r1901, %r1902}, {%r2872, %r2871}, {%f826, %f827, %f828, %f829}; // end inline asm mov.b32 %f834, %r1999; mov.b32 %f835, %r2000; mov.b32 %f836, %r2001; mov.b32 %f837, %r2002; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f834, %f835, %f836, %f837}, {%r1899, %r1900, %r1901, %r1902}, {%r2870, %r2869}, {%f834, %f835, %f836, %f837}; // end inline asm mov.b32 %f842, %r2003; mov.b32 %f843, %r2004; mov.b32 %f844, %r2005; mov.b32 %f845, %r2006; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f842, %f843, %f844, %f845}, {%r1899, %r1900, %r1901, %r1902}, {%r2868, %r2867}, {%f842, %f843, %f844, %f845}; // end inline asm mov.b32 %f850, %r2007; mov.b32 %f851, %r2008; mov.b32 %f852, %r2009; mov.b32 %f853, %r2010; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f850, %f851, %f852, %f853}, {%r1899, %r1900, %r1901, %r1902}, {%r2866, %r2865}, {%f850, %f851, %f852, %f853}; // end inline asm mov.b32 %f858, %r2011; mov.b32 %f859, %r2012; mov.b32 %f860, %r2013; mov.b32 %f861, %r2014; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f858, %f859, %f860, %f861}, {%r1899, %r1900, %r1901, %r1902}, {%r2864, %r2863}, {%f858, %f859, %f860, %f861}; // end inline asm mov.b32 %f866, %r2015; mov.b32 %f867, %r2016; mov.b32 %f868, %r2017; mov.b32 %f869, %r2018; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f866, %f867, %f868, %f869}, {%r1899, %r1900, %r1901, %r1902}, {%r2862, %r2861}, {%f866, %f867, %f868, %f869}; // end inline asm mov.b32 %f874, %r2019; mov.b32 %f875, %r2020; mov.b32 %f876, %r2021; mov.b32 %f877, %r2022; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f874, %f875, %f876, %f877}, {%r1899, %r1900, %r1901, %r1902}, {%r2860, %r2859}, {%f874, %f875, %f876, %f877}; // end inline asm mov.b32 %f882, %r2023; mov.b32 %f883, %r2024; mov.b32 %f884, %r2025; mov.b32 %f885, %r2026; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f882, %f883, %f884, %f885}, {%r1899, %r1900, %r1901, %r1902}, {%r2858, %r2857}, {%f882, %f883, %f884, %f885}; // end inline asm mov.b32 %f890, %r2027; mov.b32 %f891, %r2028; mov.b32 %f892, %r2029; mov.b32 %f893, %r2030; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f890, %f891, %f892, %f893}, {%r1899, %r1900, %r1901, %r1902}, {%r2856, %r2855}, {%f890, %f891, %f892, %f893}; // end inline asm setp.equ.ftz.f32 %p123, %f1417, 0f00000000; mov.f32 %f1439, 0f3F800000; mov.f32 %f1438, %f1439; @%p123 bra $L__BB0_44; rcp.approx.ftz.f32 %f1438, %f1417; $L__BB0_44: setp.equ.ftz.f32 %p124, %f1416, 0f00000000; @%p124 bra $L__BB0_46; rcp.approx.ftz.f32 %f1439, %f1416; $L__BB0_46: mov.b32 %f901, %r3014; fma.rn.ftz.f32 %f902, %f78, %f901, %f642; mul.ftz.f32 %f903, %f1438, %f902; mov.b32 %r3014, %f903; mov.b32 %f904, %r3013; fma.rn.ftz.f32 %f905, %f78, %f904, %f643; mul.ftz.f32 %f906, %f1438, %f905; mov.b32 %r3013, %f906; mov.b32 %f907, %r3012; fma.rn.ftz.f32 %f908, %f80, %f907, %f644; mul.ftz.f32 %f909, %f1439, %f908; mov.b32 %r3012, %f909; mov.b32 %f910, %r3011; fma.rn.ftz.f32 %f911, %f80, %f910, %f645; mul.ftz.f32 %f912, %f1439, %f911; mov.b32 %r3011, %f912; mov.b32 %f913, %r3010; fma.rn.ftz.f32 %f914, %f78, %f913, %f650; mul.ftz.f32 %f915, %f1438, %f914; mov.b32 %r3010, %f915; mov.b32 %f916, %r3009; fma.rn.ftz.f32 %f917, %f78, %f916, %f651; mul.ftz.f32 %f918, %f1438, %f917; mov.b32 %r3009, %f918; mov.b32 %f919, %r3008; fma.rn.ftz.f32 %f920, %f80, %f919, %f652; mul.ftz.f32 %f921, %f1439, %f920; mov.b32 %r3008, %f921; mov.b32 %f922, %r3007; fma.rn.ftz.f32 %f923, %f80, %f922, %f653; mul.ftz.f32 %f924, %f1439, %f923; mov.b32 %r3007, %f924; mov.b32 %f925, %r3006; fma.rn.ftz.f32 %f926, %f78, %f925, %f658; mul.ftz.f32 %f927, %f1438, %f926; mov.b32 %r3006, %f927; mov.b32 %f928, %r3005; fma.rn.ftz.f32 %f929, %f78, %f928, %f659; mul.ftz.f32 %f930, %f1438, %f929; mov.b32 %r3005, %f930; mov.b32 %f931, %r3004; fma.rn.ftz.f32 %f932, %f80, %f931, %f660; mul.ftz.f32 %f933, %f1439, %f932; mov.b32 %r3004, %f933; mov.b32 %f934, %r3003; fma.rn.ftz.f32 %f935, %f80, %f934, %f661; mul.ftz.f32 %f936, %f1439, %f935; mov.b32 %r3003, %f936; mov.b32 %f937, %r3002; fma.rn.ftz.f32 %f938, %f78, %f937, %f666; mul.ftz.f32 %f939, %f1438, %f938; mov.b32 %r3002, %f939; mov.b32 %f940, %r3001; fma.rn.ftz.f32 %f941, %f78, %f940, %f667; mul.ftz.f32 %f942, %f1438, %f941; mov.b32 %r3001, %f942; mov.b32 %f943, %r3000; fma.rn.ftz.f32 %f944, %f80, %f943, %f668; mul.ftz.f32 %f945, %f1439, %f944; mov.b32 %r3000, %f945; mov.b32 %f946, %r2999; fma.rn.ftz.f32 %f947, %f80, %f946, %f669; mul.ftz.f32 %f948, %f1439, %f947; mov.b32 %r2999, %f948; mov.b32 %f949, %r2998; fma.rn.ftz.f32 %f950, %f78, %f949, %f674; mul.ftz.f32 %f951, %f1438, %f950; mov.b32 %r2998, %f951; mov.b32 %f952, %r2997; fma.rn.ftz.f32 %f953, %f78, %f952, %f675; mul.ftz.f32 %f954, %f1438, %f953; mov.b32 %r2997, %f954; mov.b32 %f955, %r2996; fma.rn.ftz.f32 %f956, %f80, %f955, %f676; mul.ftz.f32 %f957, %f1439, %f956; mov.b32 %r2996, %f957; mov.b32 %f958, %r2995; fma.rn.ftz.f32 %f959, %f80, %f958, %f677; mul.ftz.f32 %f960, %f1439, %f959; mov.b32 %r2995, %f960; mov.b32 %f961, %r2994; fma.rn.ftz.f32 %f962, %f78, %f961, %f682; mul.ftz.f32 %f963, %f1438, %f962; mov.b32 %r2994, %f963; mov.b32 %f964, %r2993; fma.rn.ftz.f32 %f965, %f78, %f964, %f683; mul.ftz.f32 %f966, %f1438, %f965; mov.b32 %r2993, %f966; mov.b32 %f967, %r2992; fma.rn.ftz.f32 %f968, %f80, %f967, %f684; mul.ftz.f32 %f969, %f1439, %f968; mov.b32 %r2992, %f969; mov.b32 %f970, %r2991; fma.rn.ftz.f32 %f971, %f80, %f970, %f685; mul.ftz.f32 %f972, %f1439, %f971; mov.b32 %r2991, %f972; mov.b32 %f973, %r2990; fma.rn.ftz.f32 %f974, %f78, %f973, %f690; mul.ftz.f32 %f975, %f1438, %f974; mov.b32 %r2990, %f975; mov.b32 %f976, %r2989; fma.rn.ftz.f32 %f977, %f78, %f976, %f691; mul.ftz.f32 %f978, %f1438, %f977; mov.b32 %r2989, %f978; mov.b32 %f979, %r2988; fma.rn.ftz.f32 %f980, %f80, %f979, %f692; mul.ftz.f32 %f981, %f1439, %f980; mov.b32 %r2988, %f981; mov.b32 %f982, %r2987; fma.rn.ftz.f32 %f983, %f80, %f982, %f693; mul.ftz.f32 %f984, %f1439, %f983; mov.b32 %r2987, %f984; mov.b32 %f985, %r2986; fma.rn.ftz.f32 %f986, %f78, %f985, %f698; mul.ftz.f32 %f987, %f1438, %f986; mov.b32 %r2986, %f987; mov.b32 %f988, %r2985; fma.rn.ftz.f32 %f989, %f78, %f988, %f699; mul.ftz.f32 %f990, %f1438, %f989; mov.b32 %r2985, %f990; mov.b32 %f991, %r2984; fma.rn.ftz.f32 %f992, %f80, %f991, %f700; mul.ftz.f32 %f993, %f1439, %f992; mov.b32 %r2984, %f993; mov.b32 %f994, %r2983; fma.rn.ftz.f32 %f995, %f80, %f994, %f701; mul.ftz.f32 %f996, %f1439, %f995; mov.b32 %r2983, %f996; mov.b32 %f997, %r2982; fma.rn.ftz.f32 %f998, %f78, %f997, %f706; mul.ftz.f32 %f999, %f1438, %f998; mov.b32 %r2982, %f999; mov.b32 %f1000, %r2981; fma.rn.ftz.f32 %f1001, %f78, %f1000, %f707; mul.ftz.f32 %f1002, %f1438, %f1001; mov.b32 %r2981, %f1002; mov.b32 %f1003, %r2980; fma.rn.ftz.f32 %f1004, %f80, %f1003, %f708; mul.ftz.f32 %f1005, %f1439, %f1004; mov.b32 %r2980, %f1005; mov.b32 %f1006, %r2979; fma.rn.ftz.f32 %f1007, %f80, %f1006, %f709; mul.ftz.f32 %f1008, %f1439, %f1007; mov.b32 %r2979, %f1008; mov.b32 %f1009, %r2978; fma.rn.ftz.f32 %f1010, %f78, %f1009, %f714; mul.ftz.f32 %f1011, %f1438, %f1010; mov.b32 %r2978, %f1011; mov.b32 %f1012, %r2977; fma.rn.ftz.f32 %f1013, %f78, %f1012, %f715; mul.ftz.f32 %f1014, %f1438, %f1013; mov.b32 %r2977, %f1014; mov.b32 %f1015, %r2976; fma.rn.ftz.f32 %f1016, %f80, %f1015, %f716; mul.ftz.f32 %f1017, %f1439, %f1016; mov.b32 %r2976, %f1017; mov.b32 %f1018, %r2975; fma.rn.ftz.f32 %f1019, %f80, %f1018, %f717; mul.ftz.f32 %f1020, %f1439, %f1019; mov.b32 %r2975, %f1020; mov.b32 %f1021, %r2974; fma.rn.ftz.f32 %f1022, %f78, %f1021, %f722; mul.ftz.f32 %f1023, %f1438, %f1022; mov.b32 %r2974, %f1023; mov.b32 %f1024, %r2973; fma.rn.ftz.f32 %f1025, %f78, %f1024, %f723; mul.ftz.f32 %f1026, %f1438, %f1025; mov.b32 %r2973, %f1026; mov.b32 %f1027, %r2972; fma.rn.ftz.f32 %f1028, %f80, %f1027, %f724; mul.ftz.f32 %f1029, %f1439, %f1028; mov.b32 %r2972, %f1029; mov.b32 %f1030, %r2971; fma.rn.ftz.f32 %f1031, %f80, %f1030, %f725; mul.ftz.f32 %f1032, %f1439, %f1031; mov.b32 %r2971, %f1032; mov.b32 %f1033, %r2970; fma.rn.ftz.f32 %f1034, %f78, %f1033, %f730; mul.ftz.f32 %f1035, %f1438, %f1034; mov.b32 %r2970, %f1035; mov.b32 %f1036, %r2969; fma.rn.ftz.f32 %f1037, %f78, %f1036, %f731; mul.ftz.f32 %f1038, %f1438, %f1037; mov.b32 %r2969, %f1038; mov.b32 %f1039, %r2968; fma.rn.ftz.f32 %f1040, %f80, %f1039, %f732; mul.ftz.f32 %f1041, %f1439, %f1040; mov.b32 %r2968, %f1041; mov.b32 %f1042, %r2967; fma.rn.ftz.f32 %f1043, %f80, %f1042, %f733; mul.ftz.f32 %f1044, %f1439, %f1043; mov.b32 %r2967, %f1044; mov.b32 %f1045, %r2966; fma.rn.ftz.f32 %f1046, %f78, %f1045, %f738; mul.ftz.f32 %f1047, %f1438, %f1046; mov.b32 %r2966, %f1047; mov.b32 %f1048, %r2965; fma.rn.ftz.f32 %f1049, %f78, %f1048, %f739; mul.ftz.f32 %f1050, %f1438, %f1049; mov.b32 %r2965, %f1050; mov.b32 %f1051, %r2964; fma.rn.ftz.f32 %f1052, %f80, %f1051, %f740; mul.ftz.f32 %f1053, %f1439, %f1052; mov.b32 %r2964, %f1053; mov.b32 %f1054, %r2963; fma.rn.ftz.f32 %f1055, %f80, %f1054, %f741; mul.ftz.f32 %f1056, %f1439, %f1055; mov.b32 %r2963, %f1056; mov.b32 %f1057, %r2962; fma.rn.ftz.f32 %f1058, %f78, %f1057, %f746; mul.ftz.f32 %f1059, %f1438, %f1058; mov.b32 %r2962, %f1059; mov.b32 %f1060, %r2961; fma.rn.ftz.f32 %f1061, %f78, %f1060, %f747; mul.ftz.f32 %f1062, %f1438, %f1061; mov.b32 %r2961, %f1062; mov.b32 %f1063, %r2960; fma.rn.ftz.f32 %f1064, %f80, %f1063, %f748; mul.ftz.f32 %f1065, %f1439, %f1064; mov.b32 %r2960, %f1065; mov.b32 %f1066, %r2959; fma.rn.ftz.f32 %f1067, %f80, %f1066, %f749; mul.ftz.f32 %f1068, %f1439, %f1067; mov.b32 %r2959, %f1068; mov.b32 %f1069, %r2958; fma.rn.ftz.f32 %f1070, %f78, %f1069, %f754; mul.ftz.f32 %f1071, %f1438, %f1070; mov.b32 %r2958, %f1071; mov.b32 %f1072, %r2957; fma.rn.ftz.f32 %f1073, %f78, %f1072, %f755; mul.ftz.f32 %f1074, %f1438, %f1073; mov.b32 %r2957, %f1074; mov.b32 %f1075, %r2956; fma.rn.ftz.f32 %f1076, %f80, %f1075, %f756; mul.ftz.f32 %f1077, %f1439, %f1076; mov.b32 %r2956, %f1077; mov.b32 %f1078, %r2955; fma.rn.ftz.f32 %f1079, %f80, %f1078, %f757; mul.ftz.f32 %f1080, %f1439, %f1079; mov.b32 %r2955, %f1080; mov.b32 %f1081, %r2954; fma.rn.ftz.f32 %f1082, %f78, %f1081, %f762; mul.ftz.f32 %f1083, %f1438, %f1082; mov.b32 %r2954, %f1083; mov.b32 %f1084, %r2953; fma.rn.ftz.f32 %f1085, %f78, %f1084, %f763; mul.ftz.f32 %f1086, %f1438, %f1085; mov.b32 %r2953, %f1086; mov.b32 %f1087, %r2952; fma.rn.ftz.f32 %f1088, %f80, %f1087, %f764; mul.ftz.f32 %f1089, %f1439, %f1088; mov.b32 %r2952, %f1089; mov.b32 %f1090, %r2951; fma.rn.ftz.f32 %f1091, %f80, %f1090, %f765; mul.ftz.f32 %f1092, %f1439, %f1091; mov.b32 %r2951, %f1092; setp.equ.ftz.f32 %p125, %f1415, 0f00000000; mov.f32 %f1441, 0f3F800000; mov.f32 %f1440, %f1441; @%p125 bra $L__BB0_48; rcp.approx.ftz.f32 %f1440, %f1415; $L__BB0_48: setp.equ.ftz.f32 %p126, %f1414, 0f00000000; @%p126 bra $L__BB0_50; rcp.approx.ftz.f32 %f1441, %f1414; $L__BB0_50: add.s32 %r2615, %r2819, 16; setp.ge.s32 %p163, %r2615, %r272; mov.b32 %f1094, %r2950; fma.rn.ftz.f32 %f1095, %f82, %f1094, %f770; mul.ftz.f32 %f1096, %f1440, %f1095; mov.b32 %r2950, %f1096; mov.b32 %f1097, %r2949; fma.rn.ftz.f32 %f1098, %f82, %f1097, %f771; mul.ftz.f32 %f1099, %f1440, %f1098; mov.b32 %r2949, %f1099; mov.b32 %f1100, %r2948; fma.rn.ftz.f32 %f1101, %f84, %f1100, %f772; mul.ftz.f32 %f1102, %f1441, %f1101; mov.b32 %r2948, %f1102; mov.b32 %f1103, %r2947; fma.rn.ftz.f32 %f1104, %f84, %f1103, %f773; mul.ftz.f32 %f1105, %f1441, %f1104; mov.b32 %r2947, %f1105; mov.b32 %f1106, %r2946; fma.rn.ftz.f32 %f1107, %f82, %f1106, %f778; mul.ftz.f32 %f1108, %f1440, %f1107; mov.b32 %r2946, %f1108; mov.b32 %f1109, %r2945; fma.rn.ftz.f32 %f1110, %f82, %f1109, %f779; mul.ftz.f32 %f1111, %f1440, %f1110; mov.b32 %r2945, %f1111; mov.b32 %f1112, %r2944; fma.rn.ftz.f32 %f1113, %f84, %f1112, %f780; mul.ftz.f32 %f1114, %f1441, %f1113; mov.b32 %r2944, %f1114; mov.b32 %f1115, %r2943; fma.rn.ftz.f32 %f1116, %f84, %f1115, %f781; mul.ftz.f32 %f1117, %f1441, %f1116; mov.b32 %r2943, %f1117; mov.b32 %f1118, %r2942; fma.rn.ftz.f32 %f1119, %f82, %f1118, %f786; mul.ftz.f32 %f1120, %f1440, %f1119; mov.b32 %r2942, %f1120; mov.b32 %f1121, %r2941; fma.rn.ftz.f32 %f1122, %f82, %f1121, %f787; mul.ftz.f32 %f1123, %f1440, %f1122; mov.b32 %r2941, %f1123; mov.b32 %f1124, %r2940; fma.rn.ftz.f32 %f1125, %f84, %f1124, %f788; mul.ftz.f32 %f1126, %f1441, %f1125; mov.b32 %r2940, %f1126; mov.b32 %f1127, %r2939; fma.rn.ftz.f32 %f1128, %f84, %f1127, %f789; mul.ftz.f32 %f1129, %f1441, %f1128; mov.b32 %r2939, %f1129; mov.b32 %f1130, %r2938; fma.rn.ftz.f32 %f1131, %f82, %f1130, %f794; mul.ftz.f32 %f1132, %f1440, %f1131; mov.b32 %r2938, %f1132; mov.b32 %f1133, %r2937; fma.rn.ftz.f32 %f1134, %f82, %f1133, %f795; mul.ftz.f32 %f1135, %f1440, %f1134; mov.b32 %r2937, %f1135; mov.b32 %f1136, %r2936; fma.rn.ftz.f32 %f1137, %f84, %f1136, %f796; mul.ftz.f32 %f1138, %f1441, %f1137; mov.b32 %r2936, %f1138; mov.b32 %f1139, %r2935; fma.rn.ftz.f32 %f1140, %f84, %f1139, %f797; mul.ftz.f32 %f1141, %f1441, %f1140; mov.b32 %r2935, %f1141; mov.b32 %f1142, %r2934; fma.rn.ftz.f32 %f1143, %f82, %f1142, %f802; mul.ftz.f32 %f1144, %f1440, %f1143; mov.b32 %r2934, %f1144; mov.b32 %f1145, %r2933; fma.rn.ftz.f32 %f1146, %f82, %f1145, %f803; mul.ftz.f32 %f1147, %f1440, %f1146; mov.b32 %r2933, %f1147; mov.b32 %f1148, %r2932; fma.rn.ftz.f32 %f1149, %f84, %f1148, %f804; mul.ftz.f32 %f1150, %f1441, %f1149; mov.b32 %r2932, %f1150; mov.b32 %f1151, %r2931; fma.rn.ftz.f32 %f1152, %f84, %f1151, %f805; mul.ftz.f32 %f1153, %f1441, %f1152; mov.b32 %r2931, %f1153; mov.b32 %f1154, %r2930; fma.rn.ftz.f32 %f1155, %f82, %f1154, %f810; mul.ftz.f32 %f1156, %f1440, %f1155; mov.b32 %r2930, %f1156; mov.b32 %f1157, %r2929; fma.rn.ftz.f32 %f1158, %f82, %f1157, %f811; mul.ftz.f32 %f1159, %f1440, %f1158; mov.b32 %r2929, %f1159; mov.b32 %f1160, %r2928; fma.rn.ftz.f32 %f1161, %f84, %f1160, %f812; mul.ftz.f32 %f1162, %f1441, %f1161; mov.b32 %r2928, %f1162; mov.b32 %f1163, %r2927; fma.rn.ftz.f32 %f1164, %f84, %f1163, %f813; mul.ftz.f32 %f1165, %f1441, %f1164; mov.b32 %r2927, %f1165; mov.b32 %f1166, %r2926; fma.rn.ftz.f32 %f1167, %f82, %f1166, %f818; mul.ftz.f32 %f1168, %f1440, %f1167; mov.b32 %r2926, %f1168; mov.b32 %f1169, %r2925; fma.rn.ftz.f32 %f1170, %f82, %f1169, %f819; mul.ftz.f32 %f1171, %f1440, %f1170; mov.b32 %r2925, %f1171; mov.b32 %f1172, %r2924; fma.rn.ftz.f32 %f1173, %f84, %f1172, %f820; mul.ftz.f32 %f1174, %f1441, %f1173; mov.b32 %r2924, %f1174; mov.b32 %f1175, %r2923; fma.rn.ftz.f32 %f1176, %f84, %f1175, %f821; mul.ftz.f32 %f1177, %f1441, %f1176; mov.b32 %r2923, %f1177; mov.b32 %f1178, %r2922; fma.rn.ftz.f32 %f1179, %f82, %f1178, %f826; mul.ftz.f32 %f1180, %f1440, %f1179; mov.b32 %r2922, %f1180; mov.b32 %f1181, %r2921; fma.rn.ftz.f32 %f1182, %f82, %f1181, %f827; mul.ftz.f32 %f1183, %f1440, %f1182; mov.b32 %r2921, %f1183; mov.b32 %f1184, %r2920; fma.rn.ftz.f32 %f1185, %f84, %f1184, %f828; mul.ftz.f32 %f1186, %f1441, %f1185; mov.b32 %r2920, %f1186; mov.b32 %f1187, %r2919; fma.rn.ftz.f32 %f1188, %f84, %f1187, %f829; mul.ftz.f32 %f1189, %f1441, %f1188; mov.b32 %r2919, %f1189; mov.b32 %f1190, %r2918; fma.rn.ftz.f32 %f1191, %f82, %f1190, %f834; mul.ftz.f32 %f1192, %f1440, %f1191; mov.b32 %r2918, %f1192; mov.b32 %f1193, %r2917; fma.rn.ftz.f32 %f1194, %f82, %f1193, %f835; mul.ftz.f32 %f1195, %f1440, %f1194; mov.b32 %r2917, %f1195; mov.b32 %f1196, %r2916; fma.rn.ftz.f32 %f1197, %f84, %f1196, %f836; mul.ftz.f32 %f1198, %f1441, %f1197; mov.b32 %r2916, %f1198; mov.b32 %f1199, %r2915; fma.rn.ftz.f32 %f1200, %f84, %f1199, %f837; mul.ftz.f32 %f1201, %f1441, %f1200; mov.b32 %r2915, %f1201; mov.b32 %f1202, %r2914; fma.rn.ftz.f32 %f1203, %f82, %f1202, %f842; mul.ftz.f32 %f1204, %f1440, %f1203; mov.b32 %r2914, %f1204; mov.b32 %f1205, %r2913; fma.rn.ftz.f32 %f1206, %f82, %f1205, %f843; mul.ftz.f32 %f1207, %f1440, %f1206; mov.b32 %r2913, %f1207; mov.b32 %f1208, %r2912; fma.rn.ftz.f32 %f1209, %f84, %f1208, %f844; mul.ftz.f32 %f1210, %f1441, %f1209; mov.b32 %r2912, %f1210; mov.b32 %f1211, %r2911; fma.rn.ftz.f32 %f1212, %f84, %f1211, %f845; mul.ftz.f32 %f1213, %f1441, %f1212; mov.b32 %r2911, %f1213; mov.b32 %f1214, %r2910; fma.rn.ftz.f32 %f1215, %f82, %f1214, %f850; mul.ftz.f32 %f1216, %f1440, %f1215; mov.b32 %r2910, %f1216; mov.b32 %f1217, %r2909; fma.rn.ftz.f32 %f1218, %f82, %f1217, %f851; mul.ftz.f32 %f1219, %f1440, %f1218; mov.b32 %r2909, %f1219; mov.b32 %f1220, %r2908; fma.rn.ftz.f32 %f1221, %f84, %f1220, %f852; mul.ftz.f32 %f1222, %f1441, %f1221; mov.b32 %r2908, %f1222; mov.b32 %f1223, %r2907; fma.rn.ftz.f32 %f1224, %f84, %f1223, %f853; mul.ftz.f32 %f1225, %f1441, %f1224; mov.b32 %r2907, %f1225; mov.b32 %f1226, %r2906; fma.rn.ftz.f32 %f1227, %f82, %f1226, %f858; mul.ftz.f32 %f1228, %f1440, %f1227; mov.b32 %r2906, %f1228; mov.b32 %f1229, %r2905; fma.rn.ftz.f32 %f1230, %f82, %f1229, %f859; mul.ftz.f32 %f1231, %f1440, %f1230; mov.b32 %r2905, %f1231; mov.b32 %f1232, %r2904; fma.rn.ftz.f32 %f1233, %f84, %f1232, %f860; mul.ftz.f32 %f1234, %f1441, %f1233; mov.b32 %r2904, %f1234; mov.b32 %f1235, %r2903; fma.rn.ftz.f32 %f1236, %f84, %f1235, %f861; mul.ftz.f32 %f1237, %f1441, %f1236; mov.b32 %r2903, %f1237; mov.b32 %f1238, %r2902; fma.rn.ftz.f32 %f1239, %f82, %f1238, %f866; mul.ftz.f32 %f1240, %f1440, %f1239; mov.b32 %r2902, %f1240; mov.b32 %f1241, %r2901; fma.rn.ftz.f32 %f1242, %f82, %f1241, %f867; mul.ftz.f32 %f1243, %f1440, %f1242; mov.b32 %r2901, %f1243; mov.b32 %f1244, %r2900; fma.rn.ftz.f32 %f1245, %f84, %f1244, %f868; mul.ftz.f32 %f1246, %f1441, %f1245; mov.b32 %r2900, %f1246; mov.b32 %f1247, %r2899; fma.rn.ftz.f32 %f1248, %f84, %f1247, %f869; mul.ftz.f32 %f1249, %f1441, %f1248; mov.b32 %r2899, %f1249; mov.b32 %f1250, %r2898; fma.rn.ftz.f32 %f1251, %f82, %f1250, %f874; mul.ftz.f32 %f1252, %f1440, %f1251; mov.b32 %r2898, %f1252; mov.b32 %f1253, %r2897; fma.rn.ftz.f32 %f1254, %f82, %f1253, %f875; mul.ftz.f32 %f1255, %f1440, %f1254; mov.b32 %r2897, %f1255; mov.b32 %f1256, %r2896; fma.rn.ftz.f32 %f1257, %f84, %f1256, %f876; mul.ftz.f32 %f1258, %f1441, %f1257; mov.b32 %r2896, %f1258; mov.b32 %f1259, %r2895; fma.rn.ftz.f32 %f1260, %f84, %f1259, %f877; mul.ftz.f32 %f1261, %f1441, %f1260; mov.b32 %r2895, %f1261; mov.b32 %f1262, %r2894; fma.rn.ftz.f32 %f1263, %f82, %f1262, %f882; mul.ftz.f32 %f1264, %f1440, %f1263; mov.b32 %r2894, %f1264; mov.b32 %f1265, %r2893; fma.rn.ftz.f32 %f1266, %f82, %f1265, %f883; mul.ftz.f32 %f1267, %f1440, %f1266; mov.b32 %r2893, %f1267; mov.b32 %f1268, %r2892; fma.rn.ftz.f32 %f1269, %f84, %f1268, %f884; mul.ftz.f32 %f1270, %f1441, %f1269; mov.b32 %r2892, %f1270; mov.b32 %f1271, %r2891; fma.rn.ftz.f32 %f1272, %f84, %f1271, %f885; mul.ftz.f32 %f1273, %f1441, %f1272; mov.b32 %r2891, %f1273; mov.b32 %f1274, %r2890; fma.rn.ftz.f32 %f1275, %f82, %f1274, %f890; mul.ftz.f32 %f1276, %f1440, %f1275; mov.b32 %r2890, %f1276; mov.b32 %f1277, %r2889; fma.rn.ftz.f32 %f1278, %f82, %f1277, %f891; mul.ftz.f32 %f1279, %f1440, %f1278; mov.b32 %r2889, %f1279; mov.b32 %f1280, %r2888; fma.rn.ftz.f32 %f1281, %f84, %f1280, %f892; mul.ftz.f32 %f1282, %f1441, %f1281; mov.b32 %r2888, %f1282; mov.b32 %f1283, %r2887; fma.rn.ftz.f32 %f1284, %f84, %f1283, %f893; mul.ftz.f32 %f1285, %f1441, %f1284; mov.b32 %r2887, %f1285; @%p163 bra $L__BB0_52; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2886, %r2885, %r2884, %r2883}, [%r1236]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2882, %r2881, %r2880, %r2879}, [%r1241]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2878, %r2877, %r2876, %r2875}, [%r1246]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2874, %r2873, %r2872, %r2871}, [%r1251]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2870, %r2869, %r2868, %r2867}, [%r1256]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2866, %r2865, %r2864, %r2863}, [%r1261]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2862, %r2861, %r2860, %r2859}, [%r1266]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2858, %r2857, %r2856, %r2855}, [%r1271]; // end inline asm $L__BB0_52: add.s32 %r2819, %r2819, 16; setp.lt.s32 %p128, %r2819, %r272; add.s32 %r2820, %r2820, 16; mov.f32 %f1418, %f61; mov.f32 %f1419, %f60; mov.f32 %f1420, %f59; mov.f32 %f1421, %f58; @%p128 bra $L__BB0_3; $L__BB0_53: mov.u32 %r2623, %tid.x; and.b32 %r2622, %r2623, 96; shr.u32 %r2621, %r2622, 1; mov.u32 %r2620, _ZN25fused_multihead_attention5smem_E; mov.b64 %rd234, fmha_v2_flash_attention_fp16_128_16_S_128_sm86_kernel_nl_param_0; mov.u64 %rd233, %rd234; ld.param.u32 %r2619, [%rd233+60]; mul.lo.s32 %r2618, %r1, %r1401; mad.lo.s32 %r2617, %r2618, %r2, %r1402; bar.sync 0; mul.lo.s32 %r2421, %r2617, %r2619; shl.b32 %r2422, %r2421, 1; cvt.s64.s32 %rd152, %r2422; add.s64 %rd33, %rd152, %rd57; mov.b32 %f1286, %r3013; mov.b32 %f1287, %r3014; // begin inline asm cvt.rn.f16x2.f32 %r2281, %f1286, %f1287; // end inline asm mov.b32 %f1288, %r3011; mov.b32 %f1289, %r3012; // begin inline asm cvt.rn.f16x2.f32 %r2282, %f1288, %f1289; // end inline asm shl.b32 %r2425, %r2623, 2; and.b32 %r2426, %r2425, 124; add.s32 %r2428, %r2426, %r2620; and.b32 %r2431, %r2623, 28; shr.u32 %r2432, %r2431, 2; or.b32 %r2433, %r2621, %r2432; shl.b32 %r2434, %r2433, 8; add.s32 %r2435, %r2428, %r2434; add.s32 %r2283, %r2435, 32768; // begin inline asm st.shared.b32 [%r2283], %r2281; // end inline asm add.s32 %r990, %r2435, 34816; // begin inline asm st.shared.b32 [%r990], %r2282; // end inline asm xor.b32 %r2289, %r2283, 16; mov.b32 %f1290, %r3009; mov.b32 %f1291, %r3010; // begin inline asm cvt.rn.f16x2.f32 %r2287, %f1290, %f1291; // end inline asm mov.b32 %f1292, %r3007; mov.b32 %f1293, %r3008; // begin inline asm cvt.rn.f16x2.f32 %r2288, %f1292, %f1293; // end inline asm // begin inline asm st.shared.b32 [%r2289], %r2287; // end inline asm add.s32 %r2291, %r2289, 2048; // begin inline asm st.shared.b32 [%r2291], %r2288; // end inline asm xor.b32 %r2295, %r2283, 32; mov.b32 %f1294, %r3005; mov.b32 %f1295, %r3006; // begin inline asm cvt.rn.f16x2.f32 %r2293, %f1294, %f1295; // end inline asm mov.b32 %f1296, %r3003; mov.b32 %f1297, %r3004; // begin inline asm cvt.rn.f16x2.f32 %r2294, %f1296, %f1297; // end inline asm // begin inline asm st.shared.b32 [%r2295], %r2293; // end inline asm add.s32 %r2297, %r2295, 2048; // begin inline asm st.shared.b32 [%r2297], %r2294; // end inline asm xor.b32 %r2301, %r2283, 48; mov.b32 %f1298, %r3001; mov.b32 %f1299, %r3002; // begin inline asm cvt.rn.f16x2.f32 %r2299, %f1298, %f1299; // end inline asm mov.b32 %f1300, %r2999; mov.b32 %f1301, %r3000; // begin inline asm cvt.rn.f16x2.f32 %r2300, %f1300, %f1301; // end inline asm // begin inline asm st.shared.b32 [%r2301], %r2299; // end inline asm add.s32 %r2303, %r2301, 2048; // begin inline asm st.shared.b32 [%r2303], %r2300; // end inline asm xor.b32 %r2307, %r2283, 64; mov.b32 %f1302, %r2997; mov.b32 %f1303, %r2998; // begin inline asm cvt.rn.f16x2.f32 %r2305, %f1302, %f1303; // end inline asm mov.b32 %f1304, %r2995; mov.b32 %f1305, %r2996; // begin inline asm cvt.rn.f16x2.f32 %r2306, %f1304, %f1305; // end inline asm // begin inline asm st.shared.b32 [%r2307], %r2305; // end inline asm add.s32 %r2309, %r2307, 2048; // begin inline asm st.shared.b32 [%r2309], %r2306; // end inline asm xor.b32 %r2313, %r2283, 80; mov.b32 %f1306, %r2993; mov.b32 %f1307, %r2994; // begin inline asm cvt.rn.f16x2.f32 %r2311, %f1306, %f1307; // end inline asm mov.b32 %f1308, %r2991; mov.b32 %f1309, %r2992; // begin inline asm cvt.rn.f16x2.f32 %r2312, %f1308, %f1309; // end inline asm // begin inline asm st.shared.b32 [%r2313], %r2311; // end inline asm add.s32 %r2315, %r2313, 2048; // begin inline asm st.shared.b32 [%r2315], %r2312; // end inline asm xor.b32 %r2319, %r2283, 96; mov.b32 %f1310, %r2989; mov.b32 %f1311, %r2990; // begin inline asm cvt.rn.f16x2.f32 %r2317, %f1310, %f1311; // end inline asm mov.b32 %f1312, %r2987; mov.b32 %f1313, %r2988; // begin inline asm cvt.rn.f16x2.f32 %r2318, %f1312, %f1313; // end inline asm // begin inline asm st.shared.b32 [%r2319], %r2317; // end inline asm add.s32 %r2321, %r2319, 2048; // begin inline asm st.shared.b32 [%r2321], %r2318; // end inline asm xor.b32 %r2325, %r2283, 112; mov.b32 %f1314, %r2985; mov.b32 %f1315, %r2986; // begin inline asm cvt.rn.f16x2.f32 %r2323, %f1314, %f1315; // end inline asm mov.b32 %f1316, %r2983; mov.b32 %f1317, %r2984; // begin inline asm cvt.rn.f16x2.f32 %r2324, %f1316, %f1317; // end inline asm // begin inline asm st.shared.b32 [%r2325], %r2323; // end inline asm add.s32 %r2327, %r2325, 2048; // begin inline asm st.shared.b32 [%r2327], %r2324; // end inline asm xor.b32 %r2331, %r2283, 128; mov.b32 %f1318, %r2981; mov.b32 %f1319, %r2982; // begin inline asm cvt.rn.f16x2.f32 %r2329, %f1318, %f1319; // end inline asm mov.b32 %f1320, %r2979; mov.b32 %f1321, %r2980; // begin inline asm cvt.rn.f16x2.f32 %r2330, %f1320, %f1321; // end inline asm // begin inline asm st.shared.b32 [%r2331], %r2329; // end inline asm add.s32 %r2333, %r2331, 2048; // begin inline asm st.shared.b32 [%r2333], %r2330; // end inline asm xor.b32 %r2337, %r2283, 144; mov.b32 %f1322, %r2977; mov.b32 %f1323, %r2978; // begin inline asm cvt.rn.f16x2.f32 %r2335, %f1322, %f1323; // end inline asm mov.b32 %f1324, %r2975; mov.b32 %f1325, %r2976; // begin inline asm cvt.rn.f16x2.f32 %r2336, %f1324, %f1325; // end inline asm // begin inline asm st.shared.b32 [%r2337], %r2335; // end inline asm add.s32 %r2339, %r2337, 2048; // begin inline asm st.shared.b32 [%r2339], %r2336; // end inline asm xor.b32 %r2343, %r2283, 160; mov.b32 %f1326, %r2973; mov.b32 %f1327, %r2974; // begin inline asm cvt.rn.f16x2.f32 %r2341, %f1326, %f1327; // end inline asm mov.b32 %f1328, %r2971; mov.b32 %f1329, %r2972; // begin inline asm cvt.rn.f16x2.f32 %r2342, %f1328, %f1329; // end inline asm // begin inline asm st.shared.b32 [%r2343], %r2341; // end inline asm add.s32 %r2345, %r2343, 2048; // begin inline asm st.shared.b32 [%r2345], %r2342; // end inline asm xor.b32 %r2349, %r2283, 176; mov.b32 %f1330, %r2969; mov.b32 %f1331, %r2970; // begin inline asm cvt.rn.f16x2.f32 %r2347, %f1330, %f1331; // end inline asm mov.b32 %f1332, %r2967; mov.b32 %f1333, %r2968; // begin inline asm cvt.rn.f16x2.f32 %r2348, %f1332, %f1333; // end inline asm // begin inline asm st.shared.b32 [%r2349], %r2347; // end inline asm add.s32 %r2351, %r2349, 2048; // begin inline asm st.shared.b32 [%r2351], %r2348; // end inline asm xor.b32 %r2355, %r2283, 192; mov.b32 %f1334, %r2965; mov.b32 %f1335, %r2966; // begin inline asm cvt.rn.f16x2.f32 %r2353, %f1334, %f1335; // end inline asm mov.b32 %f1336, %r2963; mov.b32 %f1337, %r2964; // begin inline asm cvt.rn.f16x2.f32 %r2354, %f1336, %f1337; // end inline asm // begin inline asm st.shared.b32 [%r2355], %r2353; // end inline asm add.s32 %r2357, %r2355, 2048; // begin inline asm st.shared.b32 [%r2357], %r2354; // end inline asm xor.b32 %r2361, %r2283, 208; mov.b32 %f1338, %r2961; mov.b32 %f1339, %r2962; // begin inline asm cvt.rn.f16x2.f32 %r2359, %f1338, %f1339; // end inline asm mov.b32 %f1340, %r2959; mov.b32 %f1341, %r2960; // begin inline asm cvt.rn.f16x2.f32 %r2360, %f1340, %f1341; // end inline asm // begin inline asm st.shared.b32 [%r2361], %r2359; // end inline asm add.s32 %r2363, %r2361, 2048; // begin inline asm st.shared.b32 [%r2363], %r2360; // end inline asm xor.b32 %r2367, %r2283, 224; mov.b32 %f1342, %r2957; mov.b32 %f1343, %r2958; // begin inline asm cvt.rn.f16x2.f32 %r2365, %f1342, %f1343; // end inline asm mov.b32 %f1344, %r2955; mov.b32 %f1345, %r2956; // begin inline asm cvt.rn.f16x2.f32 %r2366, %f1344, %f1345; // end inline asm // begin inline asm st.shared.b32 [%r2367], %r2365; // end inline asm add.s32 %r2369, %r2367, 2048; // begin inline asm st.shared.b32 [%r2369], %r2366; // end inline asm xor.b32 %r2373, %r2283, 240; mov.b32 %f1346, %r2953; mov.b32 %f1347, %r2954; // begin inline asm cvt.rn.f16x2.f32 %r2371, %f1346, %f1347; // end inline asm mov.b32 %f1348, %r2951; mov.b32 %f1349, %r2952; // begin inline asm cvt.rn.f16x2.f32 %r2372, %f1348, %f1349; // end inline asm // begin inline asm st.shared.b32 [%r2373], %r2371; // end inline asm add.s32 %r2375, %r2373, 2048; // begin inline asm st.shared.b32 [%r2375], %r2372; // end inline asm bar.sync 0; // begin inline asm ld.shared.v4.b32 {%r2377, %r2378, %r2379, %r2380}, [%r10]; // end inline asm add.s32 %r2386, %r10, 2048; // begin inline asm ld.shared.v4.b32 {%r2382, %r2383, %r2384, %r2385}, [%r2386]; // end inline asm add.s32 %r2391, %r10, 4096; // begin inline asm ld.shared.v4.b32 {%r2387, %r2388, %r2389, %r2390}, [%r2391]; // end inline asm add.s32 %r2396, %r10, 6144; // begin inline asm ld.shared.v4.b32 {%r2392, %r2393, %r2394, %r2395}, [%r2396]; // end inline asm add.s32 %r2401, %r10, 8192; // begin inline asm ld.shared.v4.b32 {%r2397, %r2398, %r2399, %r2400}, [%r2401]; // end inline asm add.s32 %r2406, %r10, 10240; // begin inline asm ld.shared.v4.b32 {%r2402, %r2403, %r2404, %r2405}, [%r2406]; // end inline asm add.s32 %r2411, %r10, 12288; // begin inline asm ld.shared.v4.b32 {%r2407, %r2408, %r2409, %r2410}, [%r2411]; // end inline asm add.s32 %r2416, %r10, 14336; // begin inline asm ld.shared.v4.b32 {%r2412, %r2413, %r2414, %r2415}, [%r2416]; // end inline asm bar.sync 0; cvt.u32.u64 %r2436, %rd5; setp.ge.s32 %p129, %r2436, %r1; @%p129 bra $L__BB0_76; setp.ge.s32 %p130, %r7, %r9; @%p130 bra $L__BB0_56; mul.lo.s64 %rd154, %rd10, %rd5; add.s64 %rd155, %rd33, %rd154; cvta.to.global.u64 %rd156, %rd11; add.s64 %rd157, %rd156, %rd155; st.global.v4.u32 [%rd157], {%r2377, %r2378, %r2379, %r2380}; $L__BB0_56: add.s32 %r2438, %r2436, 8; setp.ge.s32 %p131, %r2438, %r1; @%p131 bra $L__BB0_76; @%p130 bra $L__BB0_59; add.s64 %rd158, %rd5, 8; mul.lo.s64 %rd159, %rd158, %rd10; add.s64 %rd160, %rd33, %rd159; cvta.to.global.u64 %rd161, %rd11; add.s64 %rd162, %rd161, %rd160; st.global.v4.u32 [%rd162], {%r2382, %r2383, %r2384, %r2385}; $L__BB0_59: add.s32 %r2440, %r2436, 16; setp.ge.s32 %p133, %r2440, %r1; @%p133 bra $L__BB0_76; @%p130 bra $L__BB0_62; add.s64 %rd163, %rd5, 16; mul.lo.s64 %rd164, %rd163, %rd10; add.s64 %rd165, %rd33, %rd164; cvta.to.global.u64 %rd166, %rd11; add.s64 %rd167, %rd166, %rd165; st.global.v4.u32 [%rd167], {%r2387, %r2388, %r2389, %r2390}; $L__BB0_62: add.s32 %r2442, %r2436, 24; setp.ge.s32 %p135, %r2442, %r1; @%p135 bra $L__BB0_76; @%p130 bra $L__BB0_65; add.s64 %rd168, %rd5, 24; mul.lo.s64 %rd169, %rd168, %rd10; add.s64 %rd170, %rd33, %rd169; cvta.to.global.u64 %rd171, %rd11; add.s64 %rd172, %rd171, %rd170; st.global.v4.u32 [%rd172], {%r2392, %r2393, %r2394, %r2395}; $L__BB0_65: add.s32 %r2444, %r2436, 32; setp.ge.s32 %p137, %r2444, %r1; @%p137 bra $L__BB0_76; @%p130 bra $L__BB0_68; add.s64 %rd173, %rd5, 32; mul.lo.s64 %rd174, %rd173, %rd10; add.s64 %rd175, %rd33, %rd174; cvta.to.global.u64 %rd176, %rd11; add.s64 %rd177, %rd176, %rd175; st.global.v4.u32 [%rd177], {%r2397, %r2398, %r2399, %r2400}; $L__BB0_68: add.s32 %r2446, %r2436, 40; setp.ge.s32 %p139, %r2446, %r1; @%p139 bra $L__BB0_76; @%p130 bra $L__BB0_71; add.s64 %rd178, %rd5, 40; mul.lo.s64 %rd179, %rd178, %rd10; add.s64 %rd180, %rd33, %rd179; cvta.to.global.u64 %rd181, %rd11; add.s64 %rd182, %rd181, %rd180; st.global.v4.u32 [%rd182], {%r2402, %r2403, %r2404, %r2405}; $L__BB0_71: add.s32 %r2448, %r2436, 48; setp.ge.s32 %p141, %r2448, %r1; @%p141 bra $L__BB0_76; @%p130 bra $L__BB0_74; add.s64 %rd183, %rd5, 48; mul.lo.s64 %rd184, %rd183, %rd10; add.s64 %rd185, %rd33, %rd184; cvta.to.global.u64 %rd186, %rd11; add.s64 %rd187, %rd186, %rd185; st.global.v4.u32 [%rd187], {%r2407, %r2408, %r2409, %r2410}; $L__BB0_74: add.s32 %r2450, %r2436, 56; setp.ge.s32 %p143, %r2450, %r1; or.pred %p145, %p143, %p130; @%p145 bra $L__BB0_76; add.s64 %rd188, %rd5, 56; mul.lo.s64 %rd189, %rd188, %rd10; add.s64 %rd190, %rd33, %rd189; cvta.to.global.u64 %rd191, %rd11; add.s64 %rd192, %rd191, %rd190; st.global.v4.u32 [%rd192], {%r2412, %r2413, %r2414, %r2415}; $L__BB0_76: mov.b32 %f1350, %r2949; mov.b32 %f1351, %r2950; // begin inline asm cvt.rn.f16x2.f32 %r2451, %f1350, %f1351; // end inline asm mov.b32 %f1352, %r2947; mov.b32 %f1353, %r2948; // begin inline asm cvt.rn.f16x2.f32 %r2452, %f1352, %f1353; // end inline asm // begin inline asm st.shared.b32 [%r2283], %r2451; // end inline asm // begin inline asm st.shared.b32 [%r990], %r2452; // end inline asm mov.b32 %f1354, %r2945; mov.b32 %f1355, %r2946; // begin inline asm cvt.rn.f16x2.f32 %r2457, %f1354, %f1355; // end inline asm mov.b32 %f1356, %r2943; mov.b32 %f1357, %r2944; // begin inline asm cvt.rn.f16x2.f32 %r2458, %f1356, %f1357; // end inline asm // begin inline asm st.shared.b32 [%r2289], %r2457; // end inline asm // begin inline asm st.shared.b32 [%r2291], %r2458; // end inline asm mov.b32 %f1358, %r2941; mov.b32 %f1359, %r2942; // begin inline asm cvt.rn.f16x2.f32 %r2463, %f1358, %f1359; // end inline asm mov.b32 %f1360, %r2939; mov.b32 %f1361, %r2940; // begin inline asm cvt.rn.f16x2.f32 %r2464, %f1360, %f1361; // end inline asm // begin inline asm st.shared.b32 [%r2295], %r2463; // end inline asm // begin inline asm st.shared.b32 [%r2297], %r2464; // end inline asm mov.b32 %f1362, %r2937; mov.b32 %f1363, %r2938; // begin inline asm cvt.rn.f16x2.f32 %r2469, %f1362, %f1363; // end inline asm mov.b32 %f1364, %r2935; mov.b32 %f1365, %r2936; // begin inline asm cvt.rn.f16x2.f32 %r2470, %f1364, %f1365; // end inline asm // begin inline asm st.shared.b32 [%r2301], %r2469; // end inline asm // begin inline asm st.shared.b32 [%r2303], %r2470; // end inline asm mov.b32 %f1366, %r2933; mov.b32 %f1367, %r2934; // begin inline asm cvt.rn.f16x2.f32 %r2475, %f1366, %f1367; // end inline asm mov.b32 %f1368, %r2931; mov.b32 %f1369, %r2932; // begin inline asm cvt.rn.f16x2.f32 %r2476, %f1368, %f1369; // end inline asm // begin inline asm st.shared.b32 [%r2307], %r2475; // end inline asm // begin inline asm st.shared.b32 [%r2309], %r2476; // end inline asm mov.b32 %f1370, %r2929; mov.b32 %f1371, %r2930; // begin inline asm cvt.rn.f16x2.f32 %r2481, %f1370, %f1371; // end inline asm mov.b32 %f1372, %r2927; mov.b32 %f1373, %r2928; // begin inline asm cvt.rn.f16x2.f32 %r2482, %f1372, %f1373; // end inline asm // begin inline asm st.shared.b32 [%r2313], %r2481; // end inline asm // begin inline asm st.shared.b32 [%r2315], %r2482; // end inline asm mov.b32 %f1374, %r2925; mov.b32 %f1375, %r2926; // begin inline asm cvt.rn.f16x2.f32 %r2487, %f1374, %f1375; // end inline asm mov.b32 %f1376, %r2923; mov.b32 %f1377, %r2924; // begin inline asm cvt.rn.f16x2.f32 %r2488, %f1376, %f1377; // end inline asm // begin inline asm st.shared.b32 [%r2319], %r2487; // end inline asm // begin inline asm st.shared.b32 [%r2321], %r2488; // end inline asm mov.b32 %f1378, %r2921; mov.b32 %f1379, %r2922; // begin inline asm cvt.rn.f16x2.f32 %r2493, %f1378, %f1379; // end inline asm mov.b32 %f1380, %r2919; mov.b32 %f1381, %r2920; // begin inline asm cvt.rn.f16x2.f32 %r2494, %f1380, %f1381; // end inline asm // begin inline asm st.shared.b32 [%r2325], %r2493; // end inline asm // begin inline asm st.shared.b32 [%r2327], %r2494; // end inline asm mov.b32 %f1382, %r2917; mov.b32 %f1383, %r2918; // begin inline asm cvt.rn.f16x2.f32 %r2499, %f1382, %f1383; // end inline asm mov.b32 %f1384, %r2915; mov.b32 %f1385, %r2916; // begin inline asm cvt.rn.f16x2.f32 %r2500, %f1384, %f1385; // end inline asm // begin inline asm st.shared.b32 [%r2331], %r2499; // end inline asm // begin inline asm st.shared.b32 [%r2333], %r2500; // end inline asm mov.b32 %f1386, %r2913; mov.b32 %f1387, %r2914; // begin inline asm cvt.rn.f16x2.f32 %r2505, %f1386, %f1387; // end inline asm mov.b32 %f1388, %r2911; mov.b32 %f1389, %r2912; // begin inline asm cvt.rn.f16x2.f32 %r2506, %f1388, %f1389; // end inline asm // begin inline asm st.shared.b32 [%r2337], %r2505; // end inline asm // begin inline asm st.shared.b32 [%r2339], %r2506; // end inline asm mov.b32 %f1390, %r2909; mov.b32 %f1391, %r2910; // begin inline asm cvt.rn.f16x2.f32 %r2511, %f1390, %f1391; // end inline asm mov.b32 %f1392, %r2907; mov.b32 %f1393, %r2908; // begin inline asm cvt.rn.f16x2.f32 %r2512, %f1392, %f1393; // end inline asm // begin inline asm st.shared.b32 [%r2343], %r2511; // end inline asm // begin inline asm st.shared.b32 [%r2345], %r2512; // end inline asm mov.b32 %f1394, %r2905; mov.b32 %f1395, %r2906; // begin inline asm cvt.rn.f16x2.f32 %r2517, %f1394, %f1395; // end inline asm mov.b32 %f1396, %r2903; mov.b32 %f1397, %r2904; // begin inline asm cvt.rn.f16x2.f32 %r2518, %f1396, %f1397; // end inline asm // begin inline asm st.shared.b32 [%r2349], %r2517; // end inline asm // begin inline asm st.shared.b32 [%r2351], %r2518; // end inline asm mov.b32 %f1398, %r2901; mov.b32 %f1399, %r2902; // begin inline asm cvt.rn.f16x2.f32 %r2523, %f1398, %f1399; // end inline asm mov.b32 %f1400, %r2899; mov.b32 %f1401, %r2900; // begin inline asm cvt.rn.f16x2.f32 %r2524, %f1400, %f1401; // end inline asm // begin inline asm st.shared.b32 [%r2355], %r2523; // end inline asm // begin inline asm st.shared.b32 [%r2357], %r2524; // end inline asm mov.b32 %f1402, %r2897; mov.b32 %f1403, %r2898; // begin inline asm cvt.rn.f16x2.f32 %r2529, %f1402, %f1403; // end inline asm mov.b32 %f1404, %r2895; mov.b32 %f1405, %r2896; // begin inline asm cvt.rn.f16x2.f32 %r2530, %f1404, %f1405; // end inline asm // begin inline asm st.shared.b32 [%r2361], %r2529; // end inline asm // begin inline asm st.shared.b32 [%r2363], %r2530; // end inline asm mov.b32 %f1406, %r2893; mov.b32 %f1407, %r2894; // begin inline asm cvt.rn.f16x2.f32 %r2535, %f1406, %f1407; // end inline asm mov.b32 %f1408, %r2891; mov.b32 %f1409, %r2892; // begin inline asm cvt.rn.f16x2.f32 %r2536, %f1408, %f1409; // end inline asm // begin inline asm st.shared.b32 [%r2367], %r2535; // end inline asm // begin inline asm st.shared.b32 [%r2369], %r2536; // end inline asm mov.b32 %f1410, %r2889; mov.b32 %f1411, %r2890; // begin inline asm cvt.rn.f16x2.f32 %r2541, %f1410, %f1411; // end inline asm mov.b32 %f1412, %r2887; mov.b32 %f1413, %r2888; // begin inline asm cvt.rn.f16x2.f32 %r2542, %f1412, %f1413; // end inline asm // begin inline asm st.shared.b32 [%r2373], %r2541; // end inline asm // begin inline asm st.shared.b32 [%r2375], %r2542; // end inline asm bar.sync 0; // begin inline asm ld.shared.v4.b32 {%r2547, %r2548, %r2549, %r2550}, [%r10]; // end inline asm // begin inline asm ld.shared.v4.b32 {%r2552, %r2553, %r2554, %r2555}, [%r2386]; // end inline asm add.s32 %r2561, %r2386, 2048; // begin inline asm ld.shared.v4.b32 {%r2557, %r2558, %r2559, %r2560}, [%r2561]; // end inline asm add.s32 %r2566, %r2386, 4096; // begin inline asm ld.shared.v4.b32 {%r2562, %r2563, %r2564, %r2565}, [%r2566]; // end inline asm add.s32 %r2571, %r2386, 6144; // begin inline asm ld.shared.v4.b32 {%r2567, %r2568, %r2569, %r2570}, [%r2571]; // end inline asm add.s32 %r2576, %r2386, 8192; // begin inline asm ld.shared.v4.b32 {%r2572, %r2573, %r2574, %r2575}, [%r2576]; // end inline asm add.s32 %r2581, %r2386, 10240; // begin inline asm ld.shared.v4.b32 {%r2577, %r2578, %r2579, %r2580}, [%r2581]; // end inline asm add.s32 %r2586, %r2386, 12288; // begin inline asm ld.shared.v4.b32 {%r2582, %r2583, %r2584, %r2585}, [%r2586]; // end inline asm add.s32 %r2600, %r2436, 64; setp.ge.s32 %p146, %r2600, %r1; @%p146 bra $L__BB0_99; setp.ge.s32 %p147, %r7, %r9; @%p147 bra $L__BB0_79; add.s64 %rd193, %rd5, 64; mul.lo.s64 %rd194, %rd193, %rd10; add.s64 %rd195, %rd33, %rd194; cvta.to.global.u64 %rd196, %rd11; add.s64 %rd197, %rd196, %rd195; st.global.v4.u32 [%rd197], {%r2547, %r2548, %r2549, %r2550}; $L__BB0_79: add.s32 %r2602, %r2436, 72; setp.ge.s32 %p148, %r2602, %r1; @%p148 bra $L__BB0_99; @%p147 bra $L__BB0_82; add.s64 %rd198, %rd5, 72; mul.lo.s64 %rd199, %rd198, %rd10; add.s64 %rd200, %rd33, %rd199; cvta.to.global.u64 %rd201, %rd11; add.s64 %rd202, %rd201, %rd200; st.global.v4.u32 [%rd202], {%r2552, %r2553, %r2554, %r2555}; $L__BB0_82: add.s32 %r2604, %r2436, 80; setp.ge.s32 %p150, %r2604, %r1; @%p150 bra $L__BB0_99; @%p147 bra $L__BB0_85; add.s64 %rd203, %rd5, 80; mul.lo.s64 %rd204, %rd203, %rd10; add.s64 %rd205, %rd33, %rd204; cvta.to.global.u64 %rd206, %rd11; add.s64 %rd207, %rd206, %rd205; st.global.v4.u32 [%rd207], {%r2557, %r2558, %r2559, %r2560}; $L__BB0_85: add.s32 %r2606, %r2436, 88; setp.ge.s32 %p152, %r2606, %r1; @%p152 bra $L__BB0_99; @%p147 bra $L__BB0_88; add.s64 %rd208, %rd5, 88; mul.lo.s64 %rd209, %rd208, %rd10; add.s64 %rd210, %rd33, %rd209; cvta.to.global.u64 %rd211, %rd11; add.s64 %rd212, %rd211, %rd210; st.global.v4.u32 [%rd212], {%r2562, %r2563, %r2564, %r2565}; $L__BB0_88: add.s32 %r2608, %r2436, 96; setp.ge.s32 %p154, %r2608, %r1; @%p154 bra $L__BB0_99; @%p147 bra $L__BB0_91; add.s64 %rd213, %rd5, 96; mul.lo.s64 %rd214, %rd213, %rd10; add.s64 %rd215, %rd33, %rd214; cvta.to.global.u64 %rd216, %rd11; add.s64 %rd217, %rd216, %rd215; st.global.v4.u32 [%rd217], {%r2567, %r2568, %r2569, %r2570}; $L__BB0_91: add.s32 %r2610, %r2436, 104; setp.ge.s32 %p156, %r2610, %r1; @%p156 bra $L__BB0_99; @%p147 bra $L__BB0_94; add.s64 %rd218, %rd5, 104; mul.lo.s64 %rd219, %rd218, %rd10; add.s64 %rd220, %rd33, %rd219; cvta.to.global.u64 %rd221, %rd11; add.s64 %rd222, %rd221, %rd220; st.global.v4.u32 [%rd222], {%r2572, %r2573, %r2574, %r2575}; $L__BB0_94: add.s32 %r2612, %r2436, 112; setp.ge.s32 %p158, %r2612, %r1; @%p158 bra $L__BB0_99; @%p147 bra $L__BB0_97; add.s64 %rd223, %rd5, 112; mul.lo.s64 %rd224, %rd223, %rd10; add.s64 %rd225, %rd33, %rd224; cvta.to.global.u64 %rd226, %rd11; add.s64 %rd227, %rd226, %rd225; st.global.v4.u32 [%rd227], {%r2577, %r2578, %r2579, %r2580}; $L__BB0_97: add.s32 %r2614, %r2436, 120; setp.ge.s32 %p160, %r2614, %r1; or.pred %p162, %p160, %p147; @%p162 bra $L__BB0_99; add.s64 %rd228, %rd5, 120; mul.lo.s64 %rd229, %rd228, %rd10; add.s64 %rd230, %rd33, %rd229; cvta.to.global.u64 %rd231, %rd11; add.s64 %rd232, %rd231, %rd230; st.global.v4.u32 [%rd232], {%r2582, %r2583, %r2584, %r2585}; $L__BB0_99: ret; }