_kernel_nl_param_0+56]; cvt.s64.s32 %rd2, %r1; ld.param.u32 %r2, [fmha_v2_flash_attention_fp16_64_32_S_128_sm86_kernel_nl_param_0+52]; mov.u32 %r1024, %ctaid.z; shl.b32 %r3, %r1024, 6; setp.le.s32 %p1, %r1, %r3; @%p1 bra $L__BB0_72; mov.u32 %r1321, %tid.x; mov.u32 %r1322, %ctaid.y; mov.u32 %r1323, %ctaid.x; mul.lo.s32 %r1324, %r1, %r1322; mad.lo.s32 %r1325, %r1324, %r2, %r1323; shr.s32 %r1326, %r1321, 31; shr.u32 %r1327, %r1326, 27; add.s32 %r1328, %r1321, %r1327; and.b32 %r1329, %r1328, -32; sub.s32 %r1330, %r1321, %r1329; shr.u32 %r1331, %r1326, 25; add.s32 %r1332, %r1321, %r1331; shr.s32 %r1333, %r1332, 7; shl.b32 %r1334, %r1333, 4; shr.s32 %r1335, %r1330, 31; shr.u32 %r1336, %r1335, 30; add.s32 %r1337, %r1330, %r1336; and.b32 %r1338, %r1337, 2147483644; sub.s32 %r1339, %r1330, %r1338; shl.b32 %r1340, %r1339, 1; add.s32 %r2594, %r1340, %r1334; ld.param.u64 %rd3, [%rd1+16]; ld.param.u64 %rd4, [%rd1+40]; shr.s32 %r1341, %r1328, 5; shr.s32 %r1342, %r1328, 31; shr.u32 %r1343, %r1342, 30; add.s32 %r1344, %r1341, %r1343; and.b32 %r1345, %r1344, 268435452; sub.s32 %r1346, %r1341, %r1345; shl.b32 %r1347, %r1346, 4; shr.s32 %r1348, %r1337, 2; add.s32 %r5, %r1347, %r1348; shr.u32 %r1349, %r1326, 28; add.s32 %r1350, %r1321, %r1349; shr.s32 %r6, %r1350, 4; add.s32 %r1351, %r6, %r3; cvt.s64.s32 %rd5, %r1351; ld.param.u64 %rd6, [%rd1+24]; mul.lo.s64 %rd57, %rd6, %rd5; mul.lo.s32 %r1352, %r1325, 3; mul.wide.s32 %rd58, %r1352, 256; and.b32 %r1353, %r1350, -16; sub.s32 %r7, %r1321, %r1353; shl.b32 %r1354, %r7, 4; cvt.s64.s32 %rd59, %r1354; add.s64 %rd60, %rd58, %rd59; add.s64 %rd61, %rd60, %rd57; ld.param.u64 %rd62, [%rd1]; add.s64 %rd41, %rd62, %rd61; shr.s32 %r1355, %r1350, 31; shr.u32 %r1356, %r1355, 29; add.s32 %r1357, %r6, %r1356; and.b32 %r1358, %r1357, 268435448; sub.s32 %r1359, %r6, %r1358; xor.b32 %r1360, %r1359, %r7; shl.b32 %r1361, %r6, 8; shl.b32 %r1362, %r1360, 4; mov.u32 %r1363, 31; mov.u32 %r2593, 0; mov.u32 %r1365, -1; shfl.sync.idx.b32 %r1366|%p2, %r2593, %r2593, %r1363, %r1365; shfl.sync.idx.b32 %r1367|%p3, %r2593, %r2593, %r1363, %r1365; and.b32 %r1368, %r1321, 96; shr.u32 %r1369, %r1368, 1; and.b32 %r1370, %r1321, 15; or.b32 %r1371, %r1369, %r1370; and.b32 %r1372, %r1321, 7; shl.b32 %r1373, %r1321, 4; and.b32 %r1374, %r1373, 112; and.b32 %r1375, %r1321, 16; xor.b32 %r1376, %r1374, %r1375; cvt.s64.s32 %rd63, %r6; mul.lo.s64 %rd64, %rd6, %rd63; add.s32 %r1377, %r1352, 1; mul.wide.s32 %rd65, %r1377, 256; add.s64 %rd66, %rd65, %rd59; add.s64 %rd67, %rd66, %rd64; add.s64 %rd189, %rd62, %rd67; shfl.sync.idx.b32 %r1378|%p4, %r2593, %r2593, %r1363, %r1365; shfl.sync.idx.b32 %r1379|%p5, %r2593, %r2593, %r1363, %r1365; shr.u32 %r1380, %r1375, 1; or.b32 %r1381, %r1380, %r1372; and.b32 %r1382, %r1321, 8; shr.u32 %r1383, %r1382, 3; xor.b32 %r1384, %r1383, %r1372; add.s32 %r1385, %r1352, 2; mul.wide.s32 %rd68, %r1385, 256; add.s64 %rd69, %rd68, %rd59; add.s64 %rd70, %rd69, %rd64; add.s64 %rd188, %rd62, %rd70; shfl.sync.idx.b32 %r1386|%p6, %r2593, %r2593, %r1363, %r1365; shfl.sync.idx.b32 %r1387|%p7, %r2593, %r2593, %r1363, %r1365; ld.param.u64 %rd10, [%rd1+32]; ld.param.u64 %rd11, [%rd1+8]; sub.s32 %r1388, %r1, %r3; min.s32 %r1389, %r1388, 64; shl.b32 %r1393, %r1321, 8; and.b32 %r1394, %r1393, 3840; shl.b32 %r1395, %r1384, 4; shl.b32 %r1396, %r1381, 8; shl.b32 %r1397, %r1371, 8; setp.lt.s32 %p8, %r6, %r1389; add.s32 %r1398, %r6, 8; setp.lt.s32 %p9, %r1398, %r1389; add.s32 %r1399, %r6, 16; setp.lt.s32 %p10, %r1399, %r1389; add.s32 %r1400, %r6, 24; setp.lt.s32 %p11, %r1400, %r1389; add.s32 %r1401, 
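// Prologue index math ends here. The value loaded from param_0+56 is compared against ctaid.z*64,
// so it appears to be the sequence length; the x3 stride on the row index suggests a packed QKV
// input layout. Each thread now holds its swizzled shared-memory offsets, global Q/K/V pointers,
// and the bounds predicates %p8-%p15 for the eight 16B row chunks it copies next.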
%r6, 32; setp.lt.s32 %p12, %r1401, %r1389; add.s32 %r1402, %r6, 40; setp.lt.s32 %p13, %r1402, %r1389; add.s32 %r1403, %r6, 48; setp.lt.s32 %p14, %r1403, %r1389; add.s32 %r1404, %r6, 56; setp.lt.s32 %p15, %r1404, %r1389; add.s32 %r1405, %r1362, %r1361; or.b32 %r1406, %r1397, %r1376; or.b32 %r1407, %r1396, %r1395; or.b32 %r1408, %r1376, %r1394; mov.u32 %r1409, _ZN25fused_multihead_attention5smem_E; add.s32 %r1410, %r1409, 16384; add.s32 %r10, %r1405, %r1410; shl.b64 %rd71, %rd6, 3; selp.b32 %r1036, 16, 0, %p13; add.s32 %r1411, %r1405, %r1409; add.s32 %r1025, %r1411, %r1367; add.s32 %r1027, %r1025, 2048; add.s32 %r1029, %r1025, 4096; add.s32 %r1031, %r1025, 6144; add.s32 %r1033, %r1025, 8192; add.s32 %r1035, %r1025, 10240; add.s32 %r1037, %r1025, 12288; add.s32 %r1039, %r1025, 14336; selp.b32 %r1026, 16, 0, %p8; // begin inline asm cp.async.cg.shared.global [%r1025], [%rd41], 16, %r1026; // end inline asm selp.b32 %r1028, 16, 0, %p9; add.s64 %rd42, %rd41, %rd71; // begin inline asm cp.async.cg.shared.global [%r1027], [%rd42], 16, %r1028; // end inline asm selp.b32 %r1030, 16, 0, %p10; add.s64 %rd43, %rd42, %rd71; // begin inline asm cp.async.cg.shared.global [%r1029], [%rd43], 16, %r1030; // end inline asm selp.b32 %r1032, 16, 0, %p11; add.s64 %rd44, %rd43, %rd71; // begin inline asm cp.async.cg.shared.global [%r1031], [%rd44], 16, %r1032; // end inline asm selp.b32 %r1034, 16, 0, %p12; add.s64 %rd45, %rd44, %rd71; // begin inline asm cp.async.cg.shared.global [%r1033], [%rd45], 16, %r1034; // end inline asm add.s64 %rd46, %rd45, %rd71; // begin inline asm cp.async.cg.shared.global [%r1035], [%rd46], 16, %r1036; // end inline asm selp.b32 %r1038, 16, 0, %p14; add.s64 %rd47, %rd46, %rd71; // begin inline asm cp.async.cg.shared.global [%r1037], [%rd47], 16, %r1038; // end inline asm selp.b32 %r1040, 16, 0, %p15; add.s64 %rd48, %rd47, %rd71; // begin inline asm cp.async.cg.shared.global [%r1039], [%rd48], 16, %r1040; // end inline asm min.s32 %r1412, %r1, 32; setp.lt.s32 %p16, %r6, %r1412; setp.lt.s32 %p17, %r1398, %r1412; setp.lt.s32 %p18, %r1399, %r1412; setp.lt.s32 %p19, %r1400, %r1412; add.s32 %r1041, %r10, %r1379; add.s32 %r1043, %r1041, 2048; add.s32 %r1045, %r1041, 4096; add.s32 %r1047, %r1041, 6144; selp.b32 %r1050, 16, 0, %p16; // begin inline asm cp.async.cg.shared.global [%r1041], [%rd189], 16, %r1050; // end inline asm selp.b32 %r1052, 16, 0, %p17; add.s64 %rd50, %rd189, %rd71; // begin inline asm cp.async.cg.shared.global [%r1043], [%rd50], 16, %r1052; // end inline asm selp.b32 %r1054, 16, 0, %p18; add.s64 %rd51, %rd50, %rd71; // begin inline asm cp.async.cg.shared.global [%r1045], [%rd51], 16, %r1054; // end inline asm selp.b32 %r1056, 16, 0, %p19; add.s64 %rd52, %rd51, %rd71; // begin inline asm cp.async.cg.shared.global [%r1047], [%rd52], 16, %r1056; // end inline asm add.s32 %r1413, %r1409, 24576; add.s32 %r1414, %r1405, %r1413; add.s32 %r1049, %r1414, %r1387; add.s32 %r1051, %r1049, 2048; add.s32 %r1053, %r1049, 4096; add.s32 %r1055, %r1049, 6144; // begin inline asm cp.async.cg.shared.global [%r1049], [%rd188], 16, %r1050; // end inline asm add.s64 %rd54, %rd188, %rd71; // begin inline asm cp.async.cg.shared.global [%r1051], [%rd54], 16, %r1052; // end inline asm add.s64 %rd55, %rd54, %rd71; // begin inline asm cp.async.cg.shared.global [%r1053], [%rd55], 16, %r1054; // end inline asm add.s64 %rd56, %rd55, %rd71; // begin inline asm cp.async.cg.shared.global [%r1055], [%rd56], 16, %r1056; // end inline asm // begin inline asm cp.async.commit_group; // end inline asm // begin 
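// Staging done: Q occupies smem[0,16K), K sits at smem+16384, V at smem+24576, all copied with
// predicated cp.async.cg (a 0-byte source size zero-fills rows past min(seqlen, tile)); a single
// commit_group covers all three tiles, and the wait_group/bar.sync just below makes them visible.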
inline asm cp.async.wait_group 0; // end inline asm bar.sync 0; add.s32 %r1415, %r1366, %r1409; add.s32 %r1061, %r1415, %r1406; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1057, %r1058, %r1059, %r1060}, [%r1061]; // end inline asm xor.b32 %r1416, %r1406, 32; add.s32 %r1066, %r1415, %r1416; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1062, %r1063, %r1064, %r1065}, [%r1066]; // end inline asm xor.b32 %r1417, %r1406, 64; add.s32 %r1071, %r1415, %r1417; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1067, %r1068, %r1069, %r1070}, [%r1071]; // end inline asm xor.b32 %r1418, %r1406, 96; add.s32 %r1076, %r1415, %r1418; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1072, %r1073, %r1074, %r1075}, [%r1076]; // end inline asm or.b32 %r1419, %r1406, 128; add.s32 %r1081, %r1415, %r1419; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1077, %r1078, %r1079, %r1080}, [%r1081]; // end inline asm xor.b32 %r1420, %r1406, 160; add.s32 %r1086, %r1415, %r1420; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1082, %r1083, %r1084, %r1085}, [%r1086]; // end inline asm xor.b32 %r1421, %r1406, 192; add.s32 %r1091, %r1415, %r1421; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1087, %r1088, %r1089, %r1090}, [%r1091]; // end inline asm xor.b32 %r1422, %r1406, 224; add.s32 %r1096, %r1415, %r1422; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1092, %r1093, %r1094, %r1095}, [%r1096]; // end inline asm add.s32 %r51, %r1378, %r1410; add.s32 %r1101, %r51, %r1407; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2660, %r2659, %r2658, %r2657}, [%r1101]; // end inline asm add.s32 %r1106, %r1101, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2656, %r2655, %r2654, %r2653}, [%r1106]; // end inline asm xor.b32 %r1423, %r1407, 32; add.s32 %r1111, %r51, %r1423; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2652, %r2651, %r2650, %r2649}, [%r1111]; // end inline asm add.s32 %r1116, %r1111, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2648, %r2647, %r2646, %r2645}, [%r1116]; // end inline asm xor.b32 %r1424, %r1407, 64; add.s32 %r1121, %r51, %r1424; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2644, %r2643, %r2642, %r2641}, [%r1121]; // end inline asm add.s32 %r1126, %r1121, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2640, %r2639, %r2638, %r2637}, [%r1126]; // end inline asm xor.b32 %r1425, %r1407, 96; add.s32 %r1131, %r51, %r1425; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2636, %r2635, %r2634, %r2633}, [%r1131]; // end inline asm add.s32 %r1136, %r1131, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2632, %r2631, %r2630, %r2629}, [%r1136]; // end inline asm or.b32 %r1426, %r1407, 128; add.s32 %r1141, %r51, %r1426; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2628, %r2627, %r2626, %r2625}, [%r1141]; // end inline asm add.s32 %r1146, %r1141, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2624, %r2623, %r2622, %r2621}, [%r1146]; // end inline asm xor.b32 %r1427, %r1407, 160; add.s32 %r1151, %r51, %r1427; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2620, %r2619, %r2618, %r2617}, [%r1151]; // end inline asm add.s32 %r1156, %r1151, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2616, %r2615, %r2614, %r2613}, [%r1156]; // end inline asm xor.b32 
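// After the barrier, ldmatrix.x4 pulls the MMA operand fragments out of shared memory: eight
// loads fill the Q fragments (%r1057-%r1096) and sixteen fill the K fragments (%r2597-%r2660);
// the XOR'd address offsets walk the swizzle pattern that keeps these loads bank-conflict free.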
%r1428, %r1407, 192; add.s32 %r1161, %r51, %r1428; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2612, %r2611, %r2610, %r2609}, [%r1161]; // end inline asm add.s32 %r1166, %r1161, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2608, %r2607, %r2606, %r2605}, [%r1166]; // end inline asm xor.b32 %r1429, %r1407, 224; add.s32 %r1171, %r51, %r1429; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2604, %r2603, %r2602, %r2601}, [%r1171]; // end inline asm add.s32 %r1176, %r1171, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2600, %r2599, %r2598, %r2597}, [%r1176]; // end inline asm add.s32 %r1181, %r1408, %r1413; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2724, %r2723, %r2722, %r2721}, [%r1181]; // end inline asm xor.b32 %r1430, %r1408, 32; add.s32 %r1186, %r1430, %r1413; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2720, %r2719, %r2718, %r2717}, [%r1186]; // end inline asm xor.b32 %r1431, %r1408, 64; add.s32 %r1191, %r1431, %r1413; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2716, %r2715, %r2714, %r2713}, [%r1191]; // end inline asm xor.b32 %r1432, %r1408, 96; add.s32 %r1196, %r1432, %r1413; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2712, %r2711, %r2710, %r2709}, [%r1196]; // end inline asm or.b32 %r1433, %r1408, 128; add.s32 %r1201, %r1433, %r1413; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2708, %r2707, %r2706, %r2705}, [%r1201]; // end inline asm xor.b32 %r1434, %r1408, 160; add.s32 %r1206, %r1434, %r1413; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2704, %r2703, %r2702, %r2701}, [%r1206]; // end inline asm xor.b32 %r1435, %r1408, 192; add.s32 %r1211, %r1435, %r1413; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2700, %r2699, %r2698, %r2697}, [%r1211]; // end inline asm xor.b32 %r1436, %r1408, 224; add.s32 %r1216, %r1436, %r1413; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2696, %r2695, %r2694, %r2693}, [%r1216]; // end inline asm add.s32 %r1437, %r1409, 28672; add.s32 %r1221, %r1408, %r1437; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2692, %r2691, %r2690, %r2689}, [%r1221]; // end inline asm add.s32 %r1226, %r1430, %r1437; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2688, %r2687, %r2686, %r2685}, [%r1226]; // end inline asm add.s32 %r1231, %r1431, %r1437; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2684, %r2683, %r2682, %r2681}, [%r1231]; // end inline asm add.s32 %r1236, %r1432, %r1437; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2680, %r2679, %r2678, %r2677}, [%r1236]; // end inline asm add.s32 %r1241, %r1433, %r1437; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2676, %r2675, %r2674, %r2673}, [%r1241]; // end inline asm add.s32 %r1246, %r1434, %r1437; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2672, %r2671, %r2670, %r2669}, [%r1246]; // end inline asm add.s32 %r1251, %r1435, %r1437; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2668, %r2667, %r2666, %r2665}, [%r1251]; // end inline asm add.s32 %r1256, %r1436, %r1437; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2664, %r2663, %r2662, %r2661}, [%r1256]; // end inline asm ld.param.f32 %f1, [%rd1+64]; // begin inline asm mov.u32 %r2788, 0; // end inline asm 
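// Sixteen ldmatrix.trans loads above fetch the V fragments (%r2661-%r2724) from smem+24576 and
// smem+28672 (rows 0-15 and 16-31 of the V tile), and %f1 is read from param_0+64; it multiplies
// the Q.K^T results later, so it is presumably the softmax scale 1/sqrt(d). The long run of
// mov.u32 ...,0 that follows zeroes the 64 fp32 output accumulators (%r2725-%r2788).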
// begin inline asm mov.u32 %r2787, 0; // end inline asm // begin inline asm mov.u32 %r2786, 0; // end inline asm // begin inline asm mov.u32 %r2785, 0; // end inline asm // begin inline asm mov.u32 %r2784, 0; // end inline asm // begin inline asm mov.u32 %r2783, 0; // end inline asm // begin inline asm mov.u32 %r2782, 0; // end inline asm // begin inline asm mov.u32 %r2781, 0; // end inline asm // begin inline asm mov.u32 %r2780, 0; // end inline asm // begin inline asm mov.u32 %r2779, 0; // end inline asm // begin inline asm mov.u32 %r2778, 0; // end inline asm // begin inline asm mov.u32 %r2777, 0; // end inline asm // begin inline asm mov.u32 %r2776, 0; // end inline asm // begin inline asm mov.u32 %r2775, 0; // end inline asm // begin inline asm mov.u32 %r2774, 0; // end inline asm // begin inline asm mov.u32 %r2773, 0; // end inline asm // begin inline asm mov.u32 %r2772, 0; // end inline asm // begin inline asm mov.u32 %r2771, 0; // end inline asm // begin inline asm mov.u32 %r2770, 0; // end inline asm // begin inline asm mov.u32 %r2769, 0; // end inline asm // begin inline asm mov.u32 %r2768, 0; // end inline asm // begin inline asm mov.u32 %r2767, 0; // end inline asm // begin inline asm mov.u32 %r2766, 0; // end inline asm // begin inline asm mov.u32 %r2765, 0; // end inline asm // begin inline asm mov.u32 %r2764, 0; // end inline asm // begin inline asm mov.u32 %r2763, 0; // end inline asm // begin inline asm mov.u32 %r2762, 0; // end inline asm // begin inline asm mov.u32 %r2761, 0; // end inline asm // begin inline asm mov.u32 %r2760, 0; // end inline asm // begin inline asm mov.u32 %r2759, 0; // end inline asm // begin inline asm mov.u32 %r2758, 0; // end inline asm // begin inline asm mov.u32 %r2757, 0; // end inline asm // begin inline asm mov.u32 %r2756, 0; // end inline asm // begin inline asm mov.u32 %r2755, 0; // end inline asm // begin inline asm mov.u32 %r2754, 0; // end inline asm // begin inline asm mov.u32 %r2753, 0; // end inline asm // begin inline asm mov.u32 %r2752, 0; // end inline asm // begin inline asm mov.u32 %r2751, 0; // end inline asm // begin inline asm mov.u32 %r2750, 0; // end inline asm // begin inline asm mov.u32 %r2749, 0; // end inline asm // begin inline asm mov.u32 %r2748, 0; // end inline asm // begin inline asm mov.u32 %r2747, 0; // end inline asm // begin inline asm mov.u32 %r2746, 0; // end inline asm // begin inline asm mov.u32 %r2745, 0; // end inline asm // begin inline asm mov.u32 %r2744, 0; // end inline asm // begin inline asm mov.u32 %r2743, 0; // end inline asm // begin inline asm mov.u32 %r2742, 0; // end inline asm // begin inline asm mov.u32 %r2741, 0; // end inline asm // begin inline asm mov.u32 %r2740, 0; // end inline asm // begin inline asm mov.u32 %r2739, 0; // end inline asm // begin inline asm mov.u32 %r2738, 0; // end inline asm // begin inline asm mov.u32 %r2737, 0; // end inline asm // begin inline asm mov.u32 %r2736, 0; // end inline asm // begin inline asm mov.u32 %r2735, 0; // end inline asm // begin inline asm mov.u32 %r2734, 0; // end inline asm // begin inline asm mov.u32 %r2733, 0; // end inline asm // begin inline asm mov.u32 %r2732, 0; // end inline asm // begin inline asm mov.u32 %r2731, 0; // end inline asm // begin inline asm mov.u32 %r2730, 0; // end inline asm // begin inline asm mov.u32 %r2729, 0; // end inline asm // begin inline asm mov.u32 %r2728, 0; // end inline asm // begin inline asm mov.u32 %r2727, 0; // end inline asm // begin inline asm mov.u32 %r2726, 0; // end inline asm // begin inline asm 
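// The initializer run ends with %r2725 just below; all 64 output accumulators now start at zero,
// each cleared through a single-instruction inline-asm mov, apparently to pin them as MMA
// fragment registers.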
mov.u32 %r2725, 0; // end inline asm add.s32 %r1438, %r1, 31; shr.s32 %r1439, %r1438, 31; shr.u32 %r1440, %r1439, 27; add.s32 %r1441, %r1438, %r1440; and.b32 %r244, %r1441, -32; setp.lt.s32 %p20, %r1, 1; @%p20 bra $L__BB0_49; ld.param.u8 %rs1, [%rd1+160]; cvt.s64.s32 %rd12, %r5; cvt.s64.s32 %rd13, %r2594; add.s32 %r1443, %r2594, 1; cvt.s64.s32 %rd14, %r1443; add.s32 %r1444, %r2594, 8; cvt.s64.s32 %rd15, %r1444; add.s32 %r1445, %r2594, 9; cvt.s64.s32 %rd16, %r1445; add.s32 %r1446, %r2594, 16; cvt.s64.s32 %rd17, %r1446; add.s32 %r1447, %r2594, 17; cvt.s64.s32 %rd18, %r1447; add.s32 %r1448, %r2594, 24; cvt.s64.s32 %rd19, %r1448; add.s32 %r1449, %r2594, 25; cvt.s64.s32 %rd20, %r1449; add.s32 %r405, %r5, 8; mov.f32 %f1054, 0fFF800000; mov.f32 %f1052, 0f00000000; mov.u32 %r2595, %r1; mov.f32 %f1053, %f1052; mov.f32 %f1055, %f1054; mov.u32 %r2596, %r1; $L__BB0_3: add.s32 %r1450, %r2593, 32; setp.ge.s32 %p21, %r1450, %r244; @%p21 bra $L__BB0_5; bar.sync 0; shl.b64 %rd80, %rd6, 5; add.s64 %rd189, %rd189, %rd80; add.s32 %r2596, %r2596, -32; min.s32 %r1467, %r2596, 32; setp.lt.s32 %p22, %r6, %r1467; setp.lt.s32 %p23, %r1398, %r1467; setp.lt.s32 %p24, %r1399, %r1467; setp.lt.s32 %p25, %r1400, %r1467; selp.b32 %r1452, 16, 0, %p22; // begin inline asm cp.async.cg.shared.global [%r1041], [%rd189], 16, %r1452; // end inline asm selp.b32 %r1454, 16, 0, %p23; add.s64 %rd73, %rd189, %rd71; // begin inline asm cp.async.cg.shared.global [%r1043], [%rd73], 16, %r1454; // end inline asm selp.b32 %r1456, 16, 0, %p24; add.s64 %rd74, %rd73, %rd71; // begin inline asm cp.async.cg.shared.global [%r1045], [%rd74], 16, %r1456; // end inline asm selp.b32 %r1458, 16, 0, %p25; add.s64 %rd75, %rd74, %rd71; // begin inline asm cp.async.cg.shared.global [%r1047], [%rd75], 16, %r1458; // end inline asm add.s64 %rd188, %rd188, %rd80; add.s32 %r2595, %r2595, -32; min.s32 %r1471, %r2595, 32; setp.lt.s32 %p26, %r6, %r1471; setp.lt.s32 %p27, %r1398, %r1471; setp.lt.s32 %p28, %r1399, %r1471; setp.lt.s32 %p29, %r1400, %r1471; selp.b32 %r1460, 16, 0, %p26; // begin inline asm cp.async.cg.shared.global [%r1049], [%rd188], 16, %r1460; // end inline asm selp.b32 %r1462, 16, 0, %p27; add.s64 %rd77, %rd188, %rd71; // begin inline asm cp.async.cg.shared.global [%r1051], [%rd77], 16, %r1462; // end inline asm selp.b32 %r1464, 16, 0, %p28; add.s64 %rd78, %rd77, %rd71; // begin inline asm cp.async.cg.shared.global [%r1053], [%rd78], 16, %r1464; // end inline asm selp.b32 %r1466, 16, 0, %p29; add.s64 %rd79, %rd78, %rd71; // begin inline asm cp.async.cg.shared.global [%r1055], [%rd79], 16, %r1466; // end inline asm // begin inline asm cp.async.commit_group; // end inline asm $L__BB0_5: setp.eq.s16 %p30, %rs1, 0; @%p30 bra $L__BB0_38; cvt.s64.s32 %rd84, %r3; add.s64 %rd85, %rd12, %rd84; setp.ge.u64 %p31, %rd85, %rd2; mul.lo.s32 %r1474, %r1, %r3; cvt.s64.s32 %rd86, %r1474; cvt.u64.u32 %rd27, %r2593; add.s64 %rd28, %rd86, %rd27; mul.lo.s64 %rd87, %rd2, %rd12; add.s64 %rd88, %rd28, %rd87; add.s64 %rd89, %rd88, %rd13; add.s64 %rd29, %rd13, %rd27; setp.ge.u64 %p32, %rd29, %rd2; shl.b64 %rd90, %rd89, 1; mad.lo.s32 %r1477, %r2, %r1322, %r1323; cvt.s64.s32 %rd91, %r1477; mul.lo.s64 %rd92, %rd4, %rd91; add.s64 %rd93, %rd92, %rd90; cvta.to.global.u64 %rd94, %rd3; add.s64 %rd30, %rd94, %rd93; mov.u16 %rs132, 0; or.pred %p33, %p32, %p31; mov.u16 %rs131, %rs132; @%p33 bra $L__BB0_8; ld.global.u16 %rs131, [%rd30]; $L__BB0_8: add.s64 %rd31, %rd14, %rd27; setp.ge.u64 %p35, %rd31, %rd2; or.pred %p36, %p35, %p31; @%p36 bra $L__BB0_10; ld.global.u16 %rs132, 
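// Main-loop setup: %r244 is the sequence length rounded up to a multiple of 32 (the K/V step),
// the running row-max is seeded with -inf (0fFF800000) and the running sum with 0, and the flag
// byte at param_0+160 (%rs1) gates an optional per-element path. Each trip through $L__BB0_3
// first prefetches the next 32-row K and V tiles with predicated cp.async, then, when %rs1 is
// set, issues guarded ld.global.u16 loads of 16 fp16 values that are later added to the logits,
// so this appears to be an attention bias or packed-mask tensor.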
[%rd30+2]; $L__BB0_10: add.s64 %rd32, %rd15, %rd27; setp.ge.u64 %p38, %rd32, %rd2; mov.u16 %rs134, 0; or.pred %p39, %p38, %p31; mov.u16 %rs133, %rs134; @%p39 bra $L__BB0_12; ld.global.u16 %rs133, [%rd30+16]; $L__BB0_12: add.s64 %rd33, %rd16, %rd27; setp.ge.u64 %p41, %rd33, %rd2; or.pred %p42, %p41, %p31; @%p42 bra $L__BB0_14; ld.global.u16 %rs134, [%rd30+18]; $L__BB0_14: add.s64 %rd34, %rd17, %rd27; setp.ge.u64 %p44, %rd34, %rd2; mov.u16 %rs136, 0; or.pred %p45, %p44, %p31; mov.u16 %rs135, %rs136; @%p45 bra $L__BB0_16; ld.global.u16 %rs135, [%rd30+32]; $L__BB0_16: add.s64 %rd35, %rd18, %rd27; setp.ge.u64 %p47, %rd35, %rd2; or.pred %p48, %p47, %p31; @%p48 bra $L__BB0_18; ld.global.u16 %rs136, [%rd30+34]; $L__BB0_18: add.s64 %rd36, %rd19, %rd27; setp.ge.u64 %p50, %rd36, %rd2; mov.u16 %rs138, 0; or.pred %p51, %p50, %p31; mov.u16 %rs137, %rs138; @%p51 bra $L__BB0_20; ld.global.u16 %rs137, [%rd30+48]; $L__BB0_20: add.s64 %rd37, %rd20, %rd27; setp.ge.u64 %p53, %rd37, %rd2; or.pred %p54, %p53, %p31; @%p54 bra $L__BB0_22; ld.global.u16 %rs138, [%rd30+50]; $L__BB0_22: cvt.s64.s32 %rd110, %r405; add.s64 %rd111, %rd110, %rd84; setp.ge.u64 %p55, %rd111, %rd2; mul.lo.s64 %rd112, %rd2, %rd110; add.s64 %rd113, %rd28, %rd112; add.s64 %rd114, %rd113, %rd13; shl.b64 %rd115, %rd114, 1; add.s64 %rd118, %rd92, %rd115; add.s64 %rd38, %rd94, %rd118; mov.u16 %rs140, 0; or.pred %p57, %p32, %p55; mov.u16 %rs139, %rs140; @%p57 bra $L__BB0_24; ld.global.u16 %rs139, [%rd38]; $L__BB0_24: or.pred %p60, %p35, %p55; @%p60 bra $L__BB0_26; ld.global.u16 %rs140, [%rd38+2]; $L__BB0_26: mov.u16 %rs142, 0; or.pred %p63, %p38, %p55; mov.u16 %rs141, %rs142; @%p63 bra $L__BB0_28; ld.global.u16 %rs141, [%rd38+16]; $L__BB0_28: or.pred %p66, %p41, %p55; @%p66 bra $L__BB0_30; ld.global.u16 %rs142, [%rd38+18]; $L__BB0_30: mov.u16 %rs144, 0; or.pred %p69, %p44, %p55; mov.u16 %rs143, %rs144; @%p69 bra $L__BB0_32; ld.global.u16 %rs143, [%rd38+32]; $L__BB0_32: or.pred %p72, %p47, %p55; @%p72 bra $L__BB0_34; ld.global.u16 %rs144, [%rd38+34]; $L__BB0_34: mov.u16 %rs146, 0; or.pred %p75, %p50, %p55; mov.u16 %rs145, %rs146; @%p75 bra $L__BB0_36; ld.global.u16 %rs145, [%rd38+48]; $L__BB0_36: or.pred %p78, %p53, %p55; @%p78 bra $L__BB0_38; ld.global.u16 %rs146, [%rd38+50]; $L__BB0_38: // begin inline asm mov.u32 %r1511, 0; // end inline asm // begin inline asm mov.u32 %r1512, 0; // end inline asm // begin inline asm mov.u32 %r1513, 0; // end inline asm // begin inline asm mov.u32 %r1514, 0; // end inline asm // begin inline asm mov.u32 %r1515, 0; // end inline asm // begin inline asm mov.u32 %r1516, 0; // end inline asm // begin inline asm mov.u32 %r1517, 0; // end inline asm // begin inline asm mov.u32 %r1518, 0; // end inline asm // begin inline asm mov.u32 %r1519, 0; // end inline asm // begin inline asm mov.u32 %r1520, 0; // end inline asm // begin inline asm mov.u32 %r1521, 0; // end inline asm // begin inline asm mov.u32 %r1522, 0; // end inline asm // begin inline asm mov.u32 %r1523, 0; // end inline asm // begin inline asm mov.u32 %r1524, 0; // end inline asm // begin inline asm mov.u32 %r1525, 0; // end inline asm // begin inline asm mov.u32 %r1526, 0; // end inline asm mov.b32 %f180, %r1511; mov.b32 %f181, %r1512; mov.b32 %f182, %r1513; mov.b32 %f183, %r1514; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f180, %f181, %f182, %f183}, {%r1057, %r1058, %r1059, %r1060}, {%r2660, %r2659}, {%f180, %f181, %f182, %f183}; // end inline asm mov.b32 %f188, %r1515; mov.b32 %f189, %r1516; mov.b32 %f190, %r1517; mov.b32 
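// The guarded loads (%rs131-%rs146, defaulting to 0 when out of bounds) are complete; the 16
// P-tile accumulators are zeroed and BMM1 begins: mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32
// accumulating Q.K^T, four 8-column n-tiles wide per k-step.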
%f191, %r1518; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f188, %f189, %f190, %f191}, {%r1057, %r1058, %r1059, %r1060}, {%r2658, %r2657}, {%f188, %f189, %f190, %f191}; // end inline asm mov.b32 %f196, %r1519; mov.b32 %f197, %r1520; mov.b32 %f198, %r1521; mov.b32 %f199, %r1522; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f196, %f197, %f198, %f199}, {%r1057, %r1058, %r1059, %r1060}, {%r2656, %r2655}, {%f196, %f197, %f198, %f199}; // end inline asm mov.b32 %f204, %r1523; mov.b32 %f205, %r1524; mov.b32 %f206, %r1525; mov.b32 %f207, %r1526; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f204, %f205, %f206, %f207}, {%r1057, %r1058, %r1059, %r1060}, {%r2654, %r2653}, {%f204, %f205, %f206, %f207}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f180, %f181, %f182, %f183}, {%r1062, %r1063, %r1064, %r1065}, {%r2652, %r2651}, {%f180, %f181, %f182, %f183}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f188, %f189, %f190, %f191}, {%r1062, %r1063, %r1064, %r1065}, {%r2650, %r2649}, {%f188, %f189, %f190, %f191}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f196, %f197, %f198, %f199}, {%r1062, %r1063, %r1064, %r1065}, {%r2648, %r2647}, {%f196, %f197, %f198, %f199}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f204, %f205, %f206, %f207}, {%r1062, %r1063, %r1064, %r1065}, {%r2646, %r2645}, {%f204, %f205, %f206, %f207}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f180, %f181, %f182, %f183}, {%r1067, %r1068, %r1069, %r1070}, {%r2644, %r2643}, {%f180, %f181, %f182, %f183}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f188, %f189, %f190, %f191}, {%r1067, %r1068, %r1069, %r1070}, {%r2642, %r2641}, {%f188, %f189, %f190, %f191}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f196, %f197, %f198, %f199}, {%r1067, %r1068, %r1069, %r1070}, {%r2640, %r2639}, {%f196, %f197, %f198, %f199}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f204, %f205, %f206, %f207}, {%r1067, %r1068, %r1069, %r1070}, {%r2638, %r2637}, {%f204, %f205, %f206, %f207}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f180, %f181, %f182, %f183}, {%r1072, %r1073, %r1074, %r1075}, {%r2636, %r2635}, {%f180, %f181, %f182, %f183}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f188, %f189, %f190, %f191}, {%r1072, %r1073, %r1074, %r1075}, {%r2634, %r2633}, {%f188, %f189, %f190, %f191}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f196, %f197, %f198, %f199}, {%r1072, %r1073, %r1074, %r1075}, {%r2632, %r2631}, {%f196, %f197, %f198, %f199}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f204, %f205, %f206, %f207}, {%r1072, %r1073, %r1074, %r1075}, {%r2630, %r2629}, {%f204, %f205, %f206, %f207}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f180, %f181, %f182, %f183}, {%r1077, %r1078, %r1079, %r1080}, {%r2628, %r2627}, {%f180, %f181, %f182, %f183}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f188, %f189, %f190, %f191}, {%r1077, %r1078, %r1079, %r1080}, {%r2626, %r2625}, 
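// BMM1 proceeds through all eight k-steps (8 x 16 = the 128-wide head dimension) against the
// four K n-tiles (4 x 8 = 32 key rows), reusing the same four f32 accumulator quads throughout.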
{%f188, %f189, %f190, %f191}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f196, %f197, %f198, %f199}, {%r1077, %r1078, %r1079, %r1080}, {%r2624, %r2623}, {%f196, %f197, %f198, %f199}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f204, %f205, %f206, %f207}, {%r1077, %r1078, %r1079, %r1080}, {%r2622, %r2621}, {%f204, %f205, %f206, %f207}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f180, %f181, %f182, %f183}, {%r1082, %r1083, %r1084, %r1085}, {%r2620, %r2619}, {%f180, %f181, %f182, %f183}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f188, %f189, %f190, %f191}, {%r1082, %r1083, %r1084, %r1085}, {%r2618, %r2617}, {%f188, %f189, %f190, %f191}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f196, %f197, %f198, %f199}, {%r1082, %r1083, %r1084, %r1085}, {%r2616, %r2615}, {%f196, %f197, %f198, %f199}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f204, %f205, %f206, %f207}, {%r1082, %r1083, %r1084, %r1085}, {%r2614, %r2613}, {%f204, %f205, %f206, %f207}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f180, %f181, %f182, %f183}, {%r1087, %r1088, %r1089, %r1090}, {%r2612, %r2611}, {%f180, %f181, %f182, %f183}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f188, %f189, %f190, %f191}, {%r1087, %r1088, %r1089, %r1090}, {%r2610, %r2609}, {%f188, %f189, %f190, %f191}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f196, %f197, %f198, %f199}, {%r1087, %r1088, %r1089, %r1090}, {%r2608, %r2607}, {%f196, %f197, %f198, %f199}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f204, %f205, %f206, %f207}, {%r1087, %r1088, %r1089, %r1090}, {%r2606, %r2605}, {%f204, %f205, %f206, %f207}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f180, %f181, %f182, %f183}, {%r1092, %r1093, %r1094, %r1095}, {%r2604, %r2603}, {%f180, %f181, %f182, %f183}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f188, %f189, %f190, %f191}, {%r1092, %r1093, %r1094, %r1095}, {%r2602, %r2601}, {%f188, %f189, %f190, %f191}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f196, %f197, %f198, %f199}, {%r1092, %r1093, %r1094, %r1095}, {%r2600, %r2599}, {%f196, %f197, %f198, %f199}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f204, %f205, %f206, %f207}, {%r1092, %r1093, %r1094, %r1095}, {%r2598, %r2597}, {%f204, %f205, %f206, %f207}; // end inline asm mul.ftz.f32 %f404, %f1, %f180; mul.ftz.f32 %f405, %f1, %f181; mul.ftz.f32 %f406, %f1, %f188; mul.ftz.f32 %f407, %f1, %f189; mul.ftz.f32 %f408, %f1, %f182; mul.ftz.f32 %f409, %f1, %f183; mul.ftz.f32 %f410, %f1, %f190; mul.ftz.f32 %f411, %f1, %f191; mul.ftz.f32 %f412, %f1, %f196; mul.ftz.f32 %f413, %f1, %f197; mul.ftz.f32 %f414, %f1, %f204; mul.ftz.f32 %f415, %f1, %f205; mul.ftz.f32 %f416, %f1, %f198; mul.ftz.f32 %f417, %f1, %f199; mul.ftz.f32 %f418, %f1, %f206; mul.ftz.f32 %f419, %f1, %f207; setp.lt.s32 %p79, %r2594, %r1; selp.f32 %f1071, %f404, 0fFF800000, %p79; add.s32 %r1719, %r2594, 1; setp.lt.s32 %p80, %r1719, %r1; selp.f32 %f1070, %f405, 0fFF800000, %p80; add.s32 %r1720, %r2594, 8; 
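// The raw Q.K^T logits are scaled by %f1 (mul.ftz), then selp forces any column at or beyond the
// sequence length to -inf (0fFF800000) so it drops out of the softmax.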
setp.lt.s32 %p81, %r1720, %r1; selp.f32 %f1069, %f406, 0fFF800000, %p81; add.s32 %r1721, %r2594, 9; setp.lt.s32 %p82, %r1721, %r1; selp.f32 %f1068, %f407, 0fFF800000, %p82; add.s32 %r1722, %r2594, 16; setp.lt.s32 %p83, %r1722, %r1; selp.f32 %f1067, %f412, 0fFF800000, %p83; add.s32 %r1723, %r2594, 17; setp.lt.s32 %p84, %r1723, %r1; selp.f32 %f1066, %f413, 0fFF800000, %p84; add.s32 %r1724, %r2594, 24; setp.lt.s32 %p85, %r1724, %r1; selp.f32 %f1065, %f414, 0fFF800000, %p85; add.s32 %r1725, %r2594, 25; setp.lt.s32 %p86, %r1725, %r1; selp.f32 %f1064, %f415, 0fFF800000, %p86; selp.f32 %f1063, %f408, 0fFF800000, %p79; selp.f32 %f1062, %f409, 0fFF800000, %p80; selp.f32 %f1061, %f410, 0fFF800000, %p81; selp.f32 %f1060, %f411, 0fFF800000, %p82; selp.f32 %f1059, %f416, 0fFF800000, %p83; selp.f32 %f1058, %f417, 0fFF800000, %p84; selp.f32 %f1057, %f418, 0fFF800000, %p85; selp.f32 %f1056, %f419, 0fFF800000, %p86; @%p30 bra $L__BB0_40; // begin inline asm cvt.f32.f16 %f420, %rs131; // end inline asm add.ftz.f32 %f1071, %f420, %f1071; // begin inline asm cvt.f32.f16 %f421, %rs132; // end inline asm add.ftz.f32 %f1070, %f421, %f1070; // begin inline asm cvt.f32.f16 %f422, %rs133; // end inline asm add.ftz.f32 %f1069, %f422, %f1069; // begin inline asm cvt.f32.f16 %f423, %rs134; // end inline asm add.ftz.f32 %f1068, %f423, %f1068; // begin inline asm cvt.f32.f16 %f424, %rs135; // end inline asm add.ftz.f32 %f1067, %f424, %f1067; // begin inline asm cvt.f32.f16 %f425, %rs136; // end inline asm add.ftz.f32 %f1066, %f425, %f1066; // begin inline asm cvt.f32.f16 %f426, %rs137; // end inline asm add.ftz.f32 %f1065, %f426, %f1065; // begin inline asm cvt.f32.f16 %f427, %rs138; // end inline asm add.ftz.f32 %f1064, %f427, %f1064; // begin inline asm cvt.f32.f16 %f428, %rs139; // end inline asm add.ftz.f32 %f1063, %f428, %f1063; // begin inline asm cvt.f32.f16 %f429, %rs140; // end inline asm add.ftz.f32 %f1062, %f429, %f1062; // begin inline asm cvt.f32.f16 %f430, %rs141; // end inline asm add.ftz.f32 %f1061, %f430, %f1061; // begin inline asm cvt.f32.f16 %f431, %rs142; // end inline asm add.ftz.f32 %f1060, %f431, %f1060; // begin inline asm cvt.f32.f16 %f432, %rs143; // end inline asm add.ftz.f32 %f1059, %f432, %f1059; // begin inline asm cvt.f32.f16 %f433, %rs144; // end inline asm add.ftz.f32 %f1058, %f433, %f1058; // begin inline asm cvt.f32.f16 %f434, %rs145; // end inline asm add.ftz.f32 %f1057, %f434, %f1057; // begin inline asm cvt.f32.f16 %f435, %rs146; // end inline asm add.ftz.f32 %f1056, %f435, %f1056; $L__BB0_40: add.s32 %r2378, %r2593, 32; setp.ge.s32 %p136, %r2378, %r244; setp.gt.ftz.f32 %p89, %f1071, %f1070; selp.f32 %f436, %f1071, %f1070, %p89; setp.gt.ftz.f32 %p90, %f436, %f1069; selp.f32 %f437, %f436, %f1069, %p90; setp.gt.ftz.f32 %p91, %f437, %f1068; selp.f32 %f438, %f437, %f1068, %p91; setp.gt.ftz.f32 %p92, %f438, %f1067; selp.f32 %f439, %f438, %f1067, %p92; setp.gt.ftz.f32 %p93, %f439, %f1066; selp.f32 %f440, %f439, %f1066, %p93; setp.gt.ftz.f32 %p94, %f440, %f1065; selp.f32 %f441, %f440, %f1065, %p94; setp.gt.ftz.f32 %p95, %f441, %f1064; selp.f32 %f442, %f441, %f1064, %p95; setp.gt.ftz.f32 %p96, %f1063, %f1062; selp.f32 %f443, %f1063, %f1062, %p96; setp.gt.ftz.f32 %p97, %f443, %f1061; selp.f32 %f444, %f443, %f1061, %p97; setp.gt.ftz.f32 %p98, %f444, %f1060; selp.f32 %f445, %f444, %f1060, %p98; setp.gt.ftz.f32 %p99, %f445, %f1059; selp.f32 %f446, %f445, %f1059, %p99; setp.gt.ftz.f32 %p100, %f446, %f1058; selp.f32 %f447, %f446, %f1058, %p100; setp.gt.ftz.f32 %p101, %f447, %f1057; selp.f32 
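// Masking finishes; when %rs1 is set, the fp16 side values are widened (cvt.f32.f16) and added to
// the masked logits. The setp.gt/selp chain that follows reduces the 16 logits held per thread
// down to per-row maxima.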
%f448, %f447, %f1057, %p101; setp.gt.ftz.f32 %p102, %f448, %f1056; selp.f32 %f449, %f448, %f1056, %p102; mov.b32 %r1727, %f442; mov.u32 %r1728, 31; mov.u32 %r1729, 1; mov.u32 %r1730, -1; shfl.sync.bfly.b32 %r1731|%p103, %r1727, %r1729, %r1728, %r1730; mov.b32 %f450, %r1731; setp.gt.ftz.f32 %p104, %f442, %f450; selp.f32 %f451, %f442, %f450, %p104; mov.b32 %r1732, %f451; mov.u32 %r1733, 2; shfl.sync.bfly.b32 %r1734|%p105, %r1732, %r1733, %r1728, %r1730; mov.b32 %f452, %r1734; setp.gt.ftz.f32 %p106, %f451, %f452; selp.f32 %f453, %f451, %f452, %p106; mov.b32 %r1735, %f449; shfl.sync.bfly.b32 %r1736|%p107, %r1735, %r1729, %r1728, %r1730; mov.b32 %f454, %r1736; setp.gt.ftz.f32 %p108, %f449, %f454; selp.f32 %f455, %f449, %f454, %p108; mov.b32 %r1737, %f455; shfl.sync.bfly.b32 %r1738|%p109, %r1737, %r1733, %r1728, %r1730; mov.b32 %f456, %r1738; setp.gt.ftz.f32 %p110, %f455, %f456; selp.f32 %f457, %f455, %f456, %p110; max.ftz.f32 %f54, %f453, %f1055; max.ftz.f32 %f55, %f457, %f1054; sub.ftz.f32 %f458, %f1071, %f54; mul.ftz.f32 %f459, %f458, 0f3FB8AA3B; ex2.approx.ftz.f32 %f56, %f459; sub.ftz.f32 %f460, %f1070, %f54; mul.ftz.f32 %f461, %f460, 0f3FB8AA3B; ex2.approx.ftz.f32 %f57, %f461; sub.ftz.f32 %f462, %f1069, %f54; mul.ftz.f32 %f463, %f462, 0f3FB8AA3B; ex2.approx.ftz.f32 %f58, %f463; sub.ftz.f32 %f464, %f1068, %f54; mul.ftz.f32 %f465, %f464, 0f3FB8AA3B; ex2.approx.ftz.f32 %f59, %f465; sub.ftz.f32 %f466, %f1067, %f54; mul.ftz.f32 %f467, %f466, 0f3FB8AA3B; ex2.approx.ftz.f32 %f60, %f467; sub.ftz.f32 %f468, %f1066, %f54; mul.ftz.f32 %f469, %f468, 0f3FB8AA3B; ex2.approx.ftz.f32 %f61, %f469; sub.ftz.f32 %f470, %f1065, %f54; mul.ftz.f32 %f471, %f470, 0f3FB8AA3B; ex2.approx.ftz.f32 %f62, %f471; sub.ftz.f32 %f472, %f1064, %f54; mul.ftz.f32 %f473, %f472, 0f3FB8AA3B; ex2.approx.ftz.f32 %f63, %f473; sub.ftz.f32 %f474, %f1063, %f55; mul.ftz.f32 %f475, %f474, 0f3FB8AA3B; ex2.approx.ftz.f32 %f64, %f475; sub.ftz.f32 %f476, %f1062, %f55; mul.ftz.f32 %f477, %f476, 0f3FB8AA3B; ex2.approx.ftz.f32 %f65, %f477; sub.ftz.f32 %f478, %f1061, %f55; mul.ftz.f32 %f479, %f478, 0f3FB8AA3B; ex2.approx.ftz.f32 %f66, %f479; sub.ftz.f32 %f480, %f1060, %f55; mul.ftz.f32 %f481, %f480, 0f3FB8AA3B; ex2.approx.ftz.f32 %f67, %f481; sub.ftz.f32 %f482, %f1059, %f55; mul.ftz.f32 %f483, %f482, 0f3FB8AA3B; ex2.approx.ftz.f32 %f68, %f483; sub.ftz.f32 %f484, %f1058, %f55; mul.ftz.f32 %f485, %f484, 0f3FB8AA3B; ex2.approx.ftz.f32 %f69, %f485; sub.ftz.f32 %f486, %f1057, %f55; mul.ftz.f32 %f487, %f486, 0f3FB8AA3B; ex2.approx.ftz.f32 %f70, %f487; sub.ftz.f32 %f488, %f1056, %f55; mul.ftz.f32 %f489, %f488, 0f3FB8AA3B; ex2.approx.ftz.f32 %f71, %f489; add.ftz.f32 %f490, %f56, %f57; add.ftz.f32 %f491, %f490, 0f00000000; add.ftz.f32 %f492, %f58, %f59; add.ftz.f32 %f493, %f492, 0f00000000; add.ftz.f32 %f494, %f60, %f61; add.ftz.f32 %f495, %f491, %f494; add.ftz.f32 %f496, %f62, %f63; add.ftz.f32 %f497, %f493, %f496; add.ftz.f32 %f498, %f495, %f497; add.ftz.f32 %f499, %f64, %f65; add.ftz.f32 %f500, %f499, 0f00000000; add.ftz.f32 %f501, %f66, %f67; add.ftz.f32 %f502, %f501, 0f00000000; add.ftz.f32 %f503, %f68, %f69; add.ftz.f32 %f504, %f500, %f503; add.ftz.f32 %f505, %f70, %f71; add.ftz.f32 %f506, %f502, %f505; add.ftz.f32 %f507, %f504, %f506; mov.b32 %r1739, %f498; shfl.sync.bfly.b32 %r1740|%p111, %r1739, %r1729, %r1728, %r1730; mov.b32 %f508, %r1740; add.ftz.f32 %f509, %f498, %f508; mov.b32 %r1741, %f509; shfl.sync.bfly.b32 %r1742|%p112, %r1741, %r1733, %r1728, %r1730; mov.b32 %f510, %r1742; add.ftz.f32 %f511, %f509, %f510; mov.b32 %r1743, %f507; 
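// shfl.sync.bfly over lane offsets 1 and 2 completes the row max across the quad; the running max
// %f54/%f55 is updated, and each logit is exponentiated as exp(x) = 2^(x * log2 e) via
// ex2.approx.ftz with the constant 0f3FB8AA3B (= log2 e). Partial row sums accumulate next.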
shfl.sync.bfly.b32 %r1744|%p113, %r1743, %r1729, %r1728, %r1730; mov.b32 %f512, %r1744; add.ftz.f32 %f513, %f507, %f512; mov.b32 %r1745, %f513; shfl.sync.bfly.b32 %r1746|%p114, %r1745, %r1733, %r1728, %r1730; mov.b32 %f514, %r1746; add.ftz.f32 %f515, %f513, %f514; sub.ftz.f32 %f516, %f1055, %f54; mul.ftz.f32 %f517, %f516, 0f3FB8AA3B; ex2.approx.ftz.f32 %f518, %f517; mul.ftz.f32 %f72, %f518, %f1053; add.ftz.f32 %f1053, %f72, %f511; sub.ftz.f32 %f519, %f1054, %f55; mul.ftz.f32 %f520, %f519, 0f3FB8AA3B; ex2.approx.ftz.f32 %f521, %f520; mul.ftz.f32 %f74, %f521, %f1052; add.ftz.f32 %f1052, %f74, %f515; @%p136 bra $L__BB0_42; // begin inline asm cp.async.wait_group 0; // end inline asm bar.sync 0; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2660, %r2659, %r2658, %r2657}, [%r1101]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2656, %r2655, %r2654, %r2653}, [%r1106]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2652, %r2651, %r2650, %r2649}, [%r1111]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2648, %r2647, %r2646, %r2645}, [%r1116]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2644, %r2643, %r2642, %r2641}, [%r1121]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2640, %r2639, %r2638, %r2637}, [%r1126]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2636, %r2635, %r2634, %r2633}, [%r1131]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2632, %r2631, %r2630, %r2629}, [%r1136]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2628, %r2627, %r2626, %r2625}, [%r1141]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2624, %r2623, %r2622, %r2621}, [%r1146]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2620, %r2619, %r2618, %r2617}, [%r1151]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2616, %r2615, %r2614, %r2613}, [%r1156]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2612, %r2611, %r2610, %r2609}, [%r1161]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2608, %r2607, %r2606, %r2605}, [%r1166]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2604, %r2603, %r2602, %r2601}, [%r1171]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2600, %r2599, %r2598, %r2597}, [%r1176]; // end inline asm $L__BB0_42: // begin inline asm cvt.rn.f16x2.f32 %r1845, %f57, %f56; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1846, %f65, %f64; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1847, %f59, %f58; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1848, %f67, %f66; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1849, %f61, %f60; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1850, %f69, %f68; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1851, %f63, %f62; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1852, %f71, %f70; // end inline asm // begin inline asm mov.u32 %r1853, 0; // end inline asm // begin inline asm mov.u32 %r1854, 0; // end inline asm // begin inline asm mov.u32 %r1855, 0; // end inline asm // begin inline asm mov.u32 %r1856, 0; // end inline asm // begin inline asm mov.u32 %r1857, 0; // end 
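// The row sums are reduced with the same butterfly pattern. %f72/%f74 = exp(m_old - m_new) * l_old
// is the online-softmax correction: it rescales the previous running sum before the new tile's
// sums are added. If another K/V tile remains, the prefetch is waited on and the next K fragments
// are ldmatrix'd; the tile's probabilities are then packed to fp16 pairs (cvt.rn.f16x2.f32) to
// serve as the A operand of BMM2.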
inline asm // begin inline asm mov.u32 %r1858, 0; // end inline asm // begin inline asm mov.u32 %r1859, 0; // end inline asm // begin inline asm mov.u32 %r1860, 0; // end inline asm // begin inline asm mov.u32 %r1861, 0; // end inline asm // begin inline asm mov.u32 %r1862, 0; // end inline asm // begin inline asm mov.u32 %r1863, 0; // end inline asm // begin inline asm mov.u32 %r1864, 0; // end inline asm // begin inline asm mov.u32 %r1865, 0; // end inline asm // begin inline asm mov.u32 %r1866, 0; // end inline asm // begin inline asm mov.u32 %r1867, 0; // end inline asm // begin inline asm mov.u32 %r1868, 0; // end inline asm // begin inline asm mov.u32 %r1869, 0; // end inline asm // begin inline asm mov.u32 %r1870, 0; // end inline asm // begin inline asm mov.u32 %r1871, 0; // end inline asm // begin inline asm mov.u32 %r1872, 0; // end inline asm // begin inline asm mov.u32 %r1873, 0; // end inline asm // begin inline asm mov.u32 %r1874, 0; // end inline asm // begin inline asm mov.u32 %r1875, 0; // end inline asm // begin inline asm mov.u32 %r1876, 0; // end inline asm // begin inline asm mov.u32 %r1877, 0; // end inline asm // begin inline asm mov.u32 %r1878, 0; // end inline asm // begin inline asm mov.u32 %r1879, 0; // end inline asm // begin inline asm mov.u32 %r1880, 0; // end inline asm // begin inline asm mov.u32 %r1881, 0; // end inline asm // begin inline asm mov.u32 %r1882, 0; // end inline asm // begin inline asm mov.u32 %r1883, 0; // end inline asm // begin inline asm mov.u32 %r1884, 0; // end inline asm // begin inline asm mov.u32 %r1885, 0; // end inline asm // begin inline asm mov.u32 %r1886, 0; // end inline asm // begin inline asm mov.u32 %r1887, 0; // end inline asm // begin inline asm mov.u32 %r1888, 0; // end inline asm // begin inline asm mov.u32 %r1889, 0; // end inline asm // begin inline asm mov.u32 %r1890, 0; // end inline asm // begin inline asm mov.u32 %r1891, 0; // end inline asm // begin inline asm mov.u32 %r1892, 0; // end inline asm // begin inline asm mov.u32 %r1893, 0; // end inline asm // begin inline asm mov.u32 %r1894, 0; // end inline asm // begin inline asm mov.u32 %r1895, 0; // end inline asm // begin inline asm mov.u32 %r1896, 0; // end inline asm // begin inline asm mov.u32 %r1897, 0; // end inline asm // begin inline asm mov.u32 %r1898, 0; // end inline asm // begin inline asm mov.u32 %r1899, 0; // end inline asm // begin inline asm mov.u32 %r1900, 0; // end inline asm // begin inline asm mov.u32 %r1901, 0; // end inline asm // begin inline asm mov.u32 %r1902, 0; // end inline asm // begin inline asm mov.u32 %r1903, 0; // end inline asm // begin inline asm mov.u32 %r1904, 0; // end inline asm // begin inline asm mov.u32 %r1905, 0; // end inline asm // begin inline asm mov.u32 %r1906, 0; // end inline asm // begin inline asm mov.u32 %r1907, 0; // end inline asm // begin inline asm mov.u32 %r1908, 0; // end inline asm // begin inline asm mov.u32 %r1909, 0; // end inline asm // begin inline asm mov.u32 %r1910, 0; // end inline asm // begin inline asm mov.u32 %r1911, 0; // end inline asm // begin inline asm mov.u32 %r1912, 0; // end inline asm // begin inline asm mov.u32 %r1913, 0; // end inline asm // begin inline asm mov.u32 %r1914, 0; // end inline asm // begin inline asm mov.u32 %r1915, 0; // end inline asm // begin inline asm mov.u32 %r1916, 0; // end inline asm mov.b32 %f666, %r1853; mov.b32 %f667, %r1854; mov.b32 %f668, %r1855; mov.b32 %f669, %r1856; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f666, %f667, 
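// Sixty-four fresh accumulators are zeroed and BMM2 starts: the fp16 probability fragments are
// multiplied against the transposed V fragments, again as m16n8k16 MMAs.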
%f668, %f669}, {%r1845, %r1846, %r1847, %r1848}, {%r2724, %r2723}, {%f666, %f667, %f668, %f669}; // end inline asm mov.b32 %f674, %r1857; mov.b32 %f675, %r1858; mov.b32 %f676, %r1859; mov.b32 %f677, %r1860; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f674, %f675, %f676, %f677}, {%r1845, %r1846, %r1847, %r1848}, {%r2722, %r2721}, {%f674, %f675, %f676, %f677}; // end inline asm mov.b32 %f682, %r1861; mov.b32 %f683, %r1862; mov.b32 %f684, %r1863; mov.b32 %f685, %r1864; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f682, %f683, %f684, %f685}, {%r1845, %r1846, %r1847, %r1848}, {%r2720, %r2719}, {%f682, %f683, %f684, %f685}; // end inline asm mov.b32 %f690, %r1865; mov.b32 %f691, %r1866; mov.b32 %f692, %r1867; mov.b32 %f693, %r1868; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f690, %f691, %f692, %f693}, {%r1845, %r1846, %r1847, %r1848}, {%r2718, %r2717}, {%f690, %f691, %f692, %f693}; // end inline asm mov.b32 %f698, %r1869; mov.b32 %f699, %r1870; mov.b32 %f700, %r1871; mov.b32 %f701, %r1872; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f698, %f699, %f700, %f701}, {%r1845, %r1846, %r1847, %r1848}, {%r2716, %r2715}, {%f698, %f699, %f700, %f701}; // end inline asm mov.b32 %f706, %r1873; mov.b32 %f707, %r1874; mov.b32 %f708, %r1875; mov.b32 %f709, %r1876; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f706, %f707, %f708, %f709}, {%r1845, %r1846, %r1847, %r1848}, {%r2714, %r2713}, {%f706, %f707, %f708, %f709}; // end inline asm mov.b32 %f714, %r1877; mov.b32 %f715, %r1878; mov.b32 %f716, %r1879; mov.b32 %f717, %r1880; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f714, %f715, %f716, %f717}, {%r1845, %r1846, %r1847, %r1848}, {%r2712, %r2711}, {%f714, %f715, %f716, %f717}; // end inline asm mov.b32 %f722, %r1881; mov.b32 %f723, %r1882; mov.b32 %f724, %r1883; mov.b32 %f725, %r1884; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f722, %f723, %f724, %f725}, {%r1845, %r1846, %r1847, %r1848}, {%r2710, %r2709}, {%f722, %f723, %f724, %f725}; // end inline asm mov.b32 %f730, %r1885; mov.b32 %f731, %r1886; mov.b32 %f732, %r1887; mov.b32 %f733, %r1888; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f730, %f731, %f732, %f733}, {%r1845, %r1846, %r1847, %r1848}, {%r2708, %r2707}, {%f730, %f731, %f732, %f733}; // end inline asm mov.b32 %f738, %r1889; mov.b32 %f739, %r1890; mov.b32 %f740, %r1891; mov.b32 %f741, %r1892; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f738, %f739, %f740, %f741}, {%r1845, %r1846, %r1847, %r1848}, {%r2706, %r2705}, {%f738, %f739, %f740, %f741}; // end inline asm mov.b32 %f746, %r1893; mov.b32 %f747, %r1894; mov.b32 %f748, %r1895; mov.b32 %f749, %r1896; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f746, %f747, %f748, %f749}, {%r1845, %r1846, %r1847, %r1848}, {%r2704, %r2703}, {%f746, %f747, %f748, %f749}; // end inline asm mov.b32 %f754, %r1897; mov.b32 %f755, %r1898; mov.b32 %f756, %r1899; mov.b32 %f757, %r1900; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f754, %f755, %f756, %f757}, {%r1845, %r1846, %r1847, %r1848}, {%r2702, %r2701}, {%f754, %f755, %f756, %f757}; // end inline asm mov.b32 %f762, %r1901; mov.b32 %f763, %r1902; mov.b32 %f764, %r1903; mov.b32 %f765, %r1904; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f762, %f763, %f764, %f765}, {%r1845, %r1846, %r1847, %r1848}, 
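// The first BMM2 k-step (probability columns 0-15, fragments %r1845-%r1848) sweeps all sixteen
// 8-wide n-tiles, covering the 128-column output head dimension.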
{%r2700, %r2699}, {%f762, %f763, %f764, %f765}; // end inline asm mov.b32 %f770, %r1905; mov.b32 %f771, %r1906; mov.b32 %f772, %r1907; mov.b32 %f773, %r1908; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f770, %f771, %f772, %f773}, {%r1845, %r1846, %r1847, %r1848}, {%r2698, %r2697}, {%f770, %f771, %f772, %f773}; // end inline asm mov.b32 %f778, %r1909; mov.b32 %f779, %r1910; mov.b32 %f780, %r1911; mov.b32 %f781, %r1912; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f778, %f779, %f780, %f781}, {%r1845, %r1846, %r1847, %r1848}, {%r2696, %r2695}, {%f778, %f779, %f780, %f781}; // end inline asm mov.b32 %f786, %r1913; mov.b32 %f787, %r1914; mov.b32 %f788, %r1915; mov.b32 %f789, %r1916; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f786, %f787, %f788, %f789}, {%r1845, %r1846, %r1847, %r1848}, {%r2694, %r2693}, {%f786, %f787, %f788, %f789}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f666, %f667, %f668, %f669}, {%r1849, %r1850, %r1851, %r1852}, {%r2692, %r2691}, {%f666, %f667, %f668, %f669}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f674, %f675, %f676, %f677}, {%r1849, %r1850, %r1851, %r1852}, {%r2690, %r2689}, {%f674, %f675, %f676, %f677}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f682, %f683, %f684, %f685}, {%r1849, %r1850, %r1851, %r1852}, {%r2688, %r2687}, {%f682, %f683, %f684, %f685}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f690, %f691, %f692, %f693}, {%r1849, %r1850, %r1851, %r1852}, {%r2686, %r2685}, {%f690, %f691, %f692, %f693}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f698, %f699, %f700, %f701}, {%r1849, %r1850, %r1851, %r1852}, {%r2684, %r2683}, {%f698, %f699, %f700, %f701}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f706, %f707, %f708, %f709}, {%r1849, %r1850, %r1851, %r1852}, {%r2682, %r2681}, {%f706, %f707, %f708, %f709}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f714, %f715, %f716, %f717}, {%r1849, %r1850, %r1851, %r1852}, {%r2680, %r2679}, {%f714, %f715, %f716, %f717}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f722, %f723, %f724, %f725}, {%r1849, %r1850, %r1851, %r1852}, {%r2678, %r2677}, {%f722, %f723, %f724, %f725}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f730, %f731, %f732, %f733}, {%r1849, %r1850, %r1851, %r1852}, {%r2676, %r2675}, {%f730, %f731, %f732, %f733}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f738, %f739, %f740, %f741}, {%r1849, %r1850, %r1851, %r1852}, {%r2674, %r2673}, {%f738, %f739, %f740, %f741}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f746, %f747, %f748, %f749}, {%r1849, %r1850, %r1851, %r1852}, {%r2672, %r2671}, {%f746, %f747, %f748, %f749}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f754, %f755, %f756, %f757}, {%r1849, %r1850, %r1851, %r1852}, {%r2670, %r2669}, {%f754, %f755, %f756, %f757}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f762, %f763, %f764, %f765}, {%r1849, %r1850, %r1851, %r1852}, {%r2668, %r2667}, {%f762, %f763, %f764, %f765}; // end inline asm // begin 
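// Second BMM2 k-step: fragments %r1849-%r1852 (probability columns 16-31) against the second half
// of the V fragments.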
inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f770, %f771, %f772, %f773}, {%r1849, %r1850, %r1851, %r1852}, {%r2666, %r2665}, {%f770, %f771, %f772, %f773}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f778, %f779, %f780, %f781}, {%r1849, %r1850, %r1851, %r1852}, {%r2664, %r2663}, {%f778, %f779, %f780, %f781}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f786, %f787, %f788, %f789}, {%r1849, %r1850, %r1851, %r1852}, {%r2662, %r2661}, {%f786, %f787, %f788, %f789}; // end inline asm setp.equ.ftz.f32 %p115, %f1053, 0f00000000; mov.f32 %f1073, 0f3F800000; mov.f32 %f1072, %f1073; @%p115 bra $L__BB0_44; rcp.approx.ftz.f32 %f1072, %f1053; $L__BB0_44: setp.equ.ftz.f32 %p116, %f1052, 0f00000000; @%p116 bra $L__BB0_46; rcp.approx.ftz.f32 %f1073, %f1052; $L__BB0_46: add.s32 %r2379, %r2593, 32; setp.ge.s32 %p137, %r2379, %r244; mov.b32 %f796, %r2788; fma.rn.ftz.f32 %f797, %f72, %f796, %f666; mul.ftz.f32 %f798, %f1072, %f797; mov.b32 %r2788, %f798; mov.b32 %f799, %r2787; fma.rn.ftz.f32 %f800, %f72, %f799, %f667; mul.ftz.f32 %f801, %f1072, %f800; mov.b32 %r2787, %f801; mov.b32 %f802, %r2786; fma.rn.ftz.f32 %f803, %f74, %f802, %f668; mul.ftz.f32 %f804, %f1073, %f803; mov.b32 %r2786, %f804; mov.b32 %f805, %r2785; fma.rn.ftz.f32 %f806, %f74, %f805, %f669; mul.ftz.f32 %f807, %f1073, %f806; mov.b32 %r2785, %f807; mov.b32 %f808, %r2784; fma.rn.ftz.f32 %f809, %f72, %f808, %f674; mul.ftz.f32 %f810, %f1072, %f809; mov.b32 %r2784, %f810; mov.b32 %f811, %r2783; fma.rn.ftz.f32 %f812, %f72, %f811, %f675; mul.ftz.f32 %f813, %f1072, %f812; mov.b32 %r2783, %f813; mov.b32 %f814, %r2782; fma.rn.ftz.f32 %f815, %f74, %f814, %f676; mul.ftz.f32 %f816, %f1073, %f815; mov.b32 %r2782, %f816; mov.b32 %f817, %r2781; fma.rn.ftz.f32 %f818, %f74, %f817, %f677; mul.ftz.f32 %f819, %f1073, %f818; mov.b32 %r2781, %f819; mov.b32 %f820, %r2780; fma.rn.ftz.f32 %f821, %f72, %f820, %f682; mul.ftz.f32 %f822, %f1072, %f821; mov.b32 %r2780, %f822; mov.b32 %f823, %r2779; fma.rn.ftz.f32 %f824, %f72, %f823, %f683; mul.ftz.f32 %f825, %f1072, %f824; mov.b32 %r2779, %f825; mov.b32 %f826, %r2778; fma.rn.ftz.f32 %f827, %f74, %f826, %f684; mul.ftz.f32 %f828, %f1073, %f827; mov.b32 %r2778, %f828; mov.b32 %f829, %r2777; fma.rn.ftz.f32 %f830, %f74, %f829, %f685; mul.ftz.f32 %f831, %f1073, %f830; mov.b32 %r2777, %f831; mov.b32 %f832, %r2776; fma.rn.ftz.f32 %f833, %f72, %f832, %f690; mul.ftz.f32 %f834, %f1072, %f833; mov.b32 %r2776, %f834; mov.b32 %f835, %r2775; fma.rn.ftz.f32 %f836, %f72, %f835, %f691; mul.ftz.f32 %f837, %f1072, %f836; mov.b32 %r2775, %f837; mov.b32 %f838, %r2774; fma.rn.ftz.f32 %f839, %f74, %f838, %f692; mul.ftz.f32 %f840, %f1073, %f839; mov.b32 %r2774, %f840; mov.b32 %f841, %r2773; fma.rn.ftz.f32 %f842, %f74, %f841, %f693; mul.ftz.f32 %f843, %f1073, %f842; mov.b32 %r2773, %f843; mov.b32 %f844, %r2772; fma.rn.ftz.f32 %f845, %f72, %f844, %f698; mul.ftz.f32 %f846, %f1072, %f845; mov.b32 %r2772, %f846; mov.b32 %f847, %r2771; fma.rn.ftz.f32 %f848, %f72, %f847, %f699; mul.ftz.f32 %f849, %f1072, %f848; mov.b32 %r2771, %f849; mov.b32 %f850, %r2770; fma.rn.ftz.f32 %f851, %f74, %f850, %f700; mul.ftz.f32 %f852, %f1073, %f851; mov.b32 %r2770, %f852; mov.b32 %f853, %r2769; fma.rn.ftz.f32 %f854, %f74, %f853, %f701; mul.ftz.f32 %f855, %f1073, %f854; mov.b32 %r2769, %f855; mov.b32 %f856, %r2768; fma.rn.ftz.f32 %f857, %f72, %f856, %f706; mul.ftz.f32 %f858, %f1072, %f857; mov.b32 %r2768, %f858; mov.b32 %f859, %r2767; fma.rn.ftz.f32 %f860, %f72, %f859, 
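// BMM2 done. rcp.approx.ftz computes 1/l for the two running sums (guarded: the reciprocal stays
// 1.0 when the sum is exactly zero), and the output update begins: each accumulator becomes
// (acc_old * %f72 + pv) * (1/l_new), where the %f72/%f74 factor undoes the previous iteration's
// normalization and applies the exp(m_old - m_new) rescale in a single multiply.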
%f707; mul.ftz.f32 %f861, %f1072, %f860; mov.b32 %r2767, %f861; mov.b32 %f862, %r2766; fma.rn.ftz.f32 %f863, %f74, %f862, %f708; mul.ftz.f32 %f864, %f1073, %f863; mov.b32 %r2766, %f864; mov.b32 %f865, %r2765; fma.rn.ftz.f32 %f866, %f74, %f865, %f709; mul.ftz.f32 %f867, %f1073, %f866; mov.b32 %r2765, %f867; mov.b32 %f868, %r2764; fma.rn.ftz.f32 %f869, %f72, %f868, %f714; mul.ftz.f32 %f870, %f1072, %f869; mov.b32 %r2764, %f870; mov.b32 %f871, %r2763; fma.rn.ftz.f32 %f872, %f72, %f871, %f715; mul.ftz.f32 %f873, %f1072, %f872; mov.b32 %r2763, %f873; mov.b32 %f874, %r2762; fma.rn.ftz.f32 %f875, %f74, %f874, %f716; mul.ftz.f32 %f876, %f1073, %f875; mov.b32 %r2762, %f876; mov.b32 %f877, %r2761; fma.rn.ftz.f32 %f878, %f74, %f877, %f717; mul.ftz.f32 %f879, %f1073, %f878; mov.b32 %r2761, %f879; mov.b32 %f880, %r2760; fma.rn.ftz.f32 %f881, %f72, %f880, %f722; mul.ftz.f32 %f882, %f1072, %f881; mov.b32 %r2760, %f882; mov.b32 %f883, %r2759; fma.rn.ftz.f32 %f884, %f72, %f883, %f723; mul.ftz.f32 %f885, %f1072, %f884; mov.b32 %r2759, %f885; mov.b32 %f886, %r2758; fma.rn.ftz.f32 %f887, %f74, %f886, %f724; mul.ftz.f32 %f888, %f1073, %f887; mov.b32 %r2758, %f888; mov.b32 %f889, %r2757; fma.rn.ftz.f32 %f890, %f74, %f889, %f725; mul.ftz.f32 %f891, %f1073, %f890; mov.b32 %r2757, %f891; mov.b32 %f892, %r2756; fma.rn.ftz.f32 %f893, %f72, %f892, %f730; mul.ftz.f32 %f894, %f1072, %f893; mov.b32 %r2756, %f894; mov.b32 %f895, %r2755; fma.rn.ftz.f32 %f896, %f72, %f895, %f731; mul.ftz.f32 %f897, %f1072, %f896; mov.b32 %r2755, %f897; mov.b32 %f898, %r2754; fma.rn.ftz.f32 %f899, %f74, %f898, %f732; mul.ftz.f32 %f900, %f1073, %f899; mov.b32 %r2754, %f900; mov.b32 %f901, %r2753; fma.rn.ftz.f32 %f902, %f74, %f901, %f733; mul.ftz.f32 %f903, %f1073, %f902; mov.b32 %r2753, %f903; mov.b32 %f904, %r2752; fma.rn.ftz.f32 %f905, %f72, %f904, %f738; mul.ftz.f32 %f906, %f1072, %f905; mov.b32 %r2752, %f906; mov.b32 %f907, %r2751; fma.rn.ftz.f32 %f908, %f72, %f907, %f739; mul.ftz.f32 %f909, %f1072, %f908; mov.b32 %r2751, %f909; mov.b32 %f910, %r2750; fma.rn.ftz.f32 %f911, %f74, %f910, %f740; mul.ftz.f32 %f912, %f1073, %f911; mov.b32 %r2750, %f912; mov.b32 %f913, %r2749; fma.rn.ftz.f32 %f914, %f74, %f913, %f741; mul.ftz.f32 %f915, %f1073, %f914; mov.b32 %r2749, %f915; mov.b32 %f916, %r2748; fma.rn.ftz.f32 %f917, %f72, %f916, %f746; mul.ftz.f32 %f918, %f1072, %f917; mov.b32 %r2748, %f918; mov.b32 %f919, %r2747; fma.rn.ftz.f32 %f920, %f72, %f919, %f747; mul.ftz.f32 %f921, %f1072, %f920; mov.b32 %r2747, %f921; mov.b32 %f922, %r2746; fma.rn.ftz.f32 %f923, %f74, %f922, %f748; mul.ftz.f32 %f924, %f1073, %f923; mov.b32 %r2746, %f924; mov.b32 %f925, %r2745; fma.rn.ftz.f32 %f926, %f74, %f925, %f749; mul.ftz.f32 %f927, %f1073, %f926; mov.b32 %r2745, %f927; mov.b32 %f928, %r2744; fma.rn.ftz.f32 %f929, %f72, %f928, %f754; mul.ftz.f32 %f930, %f1072, %f929; mov.b32 %r2744, %f930; mov.b32 %f931, %r2743; fma.rn.ftz.f32 %f932, %f72, %f931, %f755; mul.ftz.f32 %f933, %f1072, %f932; mov.b32 %r2743, %f933; mov.b32 %f934, %r2742; fma.rn.ftz.f32 %f935, %f74, %f934, %f756; mul.ftz.f32 %f936, %f1073, %f935; mov.b32 %r2742, %f936; mov.b32 %f937, %r2741; fma.rn.ftz.f32 %f938, %f74, %f937, %f757; mul.ftz.f32 %f939, %f1073, %f938; mov.b32 %r2741, %f939; mov.b32 %f940, %r2740; fma.rn.ftz.f32 %f941, %f72, %f940, %f762; mul.ftz.f32 %f942, %f1072, %f941; mov.b32 %r2740, %f942; mov.b32 %f943, %r2739; fma.rn.ftz.f32 %f944, %f72, %f943, %f763; mul.ftz.f32 %f945, %f1072, %f944; mov.b32 %r2739, %f945; mov.b32 %f946, %r2738; fma.rn.ftz.f32 %f947, %f74, %f946, %f764; 
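// The fma/mul update walks the remaining accumulator registers (%r2788 down to %r2725), keeping
// the output tile normalized after every processed K/V tile.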
mov.b32 %f949, %r2737; fma.rn.ftz.f32 %f950, %f74, %f949, %f765; mul.ftz.f32 %f951, %f1073, %f950; mov.b32 %r2737, %f951;
mov.b32 %f952, %r2736; fma.rn.ftz.f32 %f953, %f72, %f952, %f770; mul.ftz.f32 %f954, %f1072, %f953; mov.b32 %r2736, %f954;
mov.b32 %f955, %r2735; fma.rn.ftz.f32 %f956, %f72, %f955, %f771; mul.ftz.f32 %f957, %f1072, %f956; mov.b32 %r2735, %f957;
mov.b32 %f958, %r2734; fma.rn.ftz.f32 %f959, %f74, %f958, %f772; mul.ftz.f32 %f960, %f1073, %f959; mov.b32 %r2734, %f960;
mov.b32 %f961, %r2733; fma.rn.ftz.f32 %f962, %f74, %f961, %f773; mul.ftz.f32 %f963, %f1073, %f962; mov.b32 %r2733, %f963;
mov.b32 %f964, %r2732; fma.rn.ftz.f32 %f965, %f72, %f964, %f778; mul.ftz.f32 %f966, %f1072, %f965; mov.b32 %r2732, %f966;
mov.b32 %f967, %r2731; fma.rn.ftz.f32 %f968, %f72, %f967, %f779; mul.ftz.f32 %f969, %f1072, %f968; mov.b32 %r2731, %f969;
mov.b32 %f970, %r2730; fma.rn.ftz.f32 %f971, %f74, %f970, %f780; mul.ftz.f32 %f972, %f1073, %f971; mov.b32 %r2730, %f972;
mov.b32 %f973, %r2729; fma.rn.ftz.f32 %f974, %f74, %f973, %f781; mul.ftz.f32 %f975, %f1073, %f974; mov.b32 %r2729, %f975;
mov.b32 %f976, %r2728; fma.rn.ftz.f32 %f977, %f72, %f976, %f786; mul.ftz.f32 %f978, %f1072, %f977; mov.b32 %r2728, %f978;
mov.b32 %f979, %r2727; fma.rn.ftz.f32 %f980, %f72, %f979, %f787; mul.ftz.f32 %f981, %f1072, %f980; mov.b32 %r2727, %f981;
mov.b32 %f982, %r2726; fma.rn.ftz.f32 %f983, %f74, %f982, %f788; mul.ftz.f32 %f984, %f1073, %f983; mov.b32 %r2726, %f984;
mov.b32 %f985, %r2725; fma.rn.ftz.f32 %f986, %f74, %f985, %f789; mul.ftz.f32 %f987, %f1073, %f986; mov.b32 %r2725, %f987;
@%p137 bra $L__BB0_48;
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2724, %r2723, %r2722, %r2721}, [%r1181];
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2720, %r2719, %r2718, %r2717}, [%r1186];
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2716, %r2715, %r2714, %r2713}, [%r1191];
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2712, %r2711, %r2710, %r2709}, [%r1196];
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2708, %r2707, %r2706, %r2705}, [%r1201];
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2704, %r2703, %r2702, %r2701}, [%r1206];
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2700, %r2699, %r2698, %r2697}, [%r1211];
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2696, %r2695, %r2694, %r2693}, [%r1216];
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2692, %r2691, %r2690, %r2689}, [%r1221];
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2688, %r2687, %r2686, %r2685}, [%r1226];
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2684, %r2683, %r2682, %r2681}, [%r1231];
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2680, %r2679, %r2678, %r2677}, [%r1236];
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2676, %r2675, %r2674, %r2673}, [%r1241];
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2672, %r2671, %r2670, %r2669}, [%r1246];
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2668, %r2667, %r2666, %r2665}, [%r1251];
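//
// The ldmatrix.sync.aligned.m8n8.x4.trans batch around this point reloads
// the next tile's operand fragments from shared memory into mma registers
// (the .trans form transposes each 8x8 half-precision matrix on the way
// out). It is skipped via %p137 once the last K/V tile has been consumed.
// A hedged CUDA sketch of how one such load is typically emitted via
// inline PTX; frag and smem_addr are illustrative names:
//
//   uint32_t frag[4];
//   // 32-bit shared-state-space address, e.g. obtained with
//   // __cvta_generic_to_shared(ptr) on the host-language side.
//   uint32_t smem_addr = ...;
//   asm volatile(
//       "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 "
//       "{%0, %1, %2, %3}, [%4];\n"
//       : "=r"(frag[0]), "=r"(frag[1]), "=r"(frag[2]), "=r"(frag[3])
//       : "r"(smem_addr));
//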
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2664, %r2663, %r2662, %r2661}, [%r1256];
$L__BB0_48:
add.s32 %r2593, %r2593, 32;
setp.lt.s32 %p118, %r2593, %r244;
add.s32 %r2594, %r2594, 32;
mov.f32 %f1054, %f55;
mov.f32 %f1055, %f54;
@%p118 bra $L__BB0_3;
$L__BB0_49:
mov.u32 %r2387, %tid.x;
mov.b64 %rd183, fmha_v2_flash_attention_fp16_64_32_S_128_sm86_kernel_nl_param_0;
mov.u64 %rd182, %rd183;
ld.param.u32 %r2386, [%rd182+60];
mul.lo.s32 %r2385, %r1, %r1322;
mad.lo.s32 %r2384, %r2385, %r2, %r1323;
and.b32 %r2383, %r2387, 96;
shr.u32 %r2382, %r2383, 1;
mov.u32 %r2381, _ZN25fused_multihead_attention5smem_E;
bar.sync 0;
mov.b32 %f988, %r2787; mov.b32 %f989, %r2788; cvt.rn.f16x2.f32 %r2208, %f988, %f989;
mov.b32 %f990, %r2785; mov.b32 %f991, %r2786; cvt.rn.f16x2.f32 %r2209, %f990, %f991;
shl.b32 %r2345, %r2387, 2;
and.b32 %r2346, %r2345, 124;
add.s32 %r2348, %r2346, %r2381;
and.b32 %r2351, %r2387, 28;
shr.u32 %r2352, %r2351, 2;
or.b32 %r2353, %r2382, %r2352;
shl.b32 %r2354, %r2353, 8;
add.s32 %r2355, %r2348, %r2354;
add.s32 %r2210, %r2355, 16384;
st.shared.b32 [%r2210], %r2208;
add.s32 %r2212, %r2355, 18432;
st.shared.b32 [%r2212], %r2209;
xor.b32 %r2216, %r2210, 16;
mov.b32 %f992, %r2783; mov.b32 %f993, %r2784; cvt.rn.f16x2.f32 %r2214, %f992, %f993;
mov.b32 %f994, %r2781; mov.b32 %f995, %r2782; cvt.rn.f16x2.f32 %r2215, %f994, %f995;
st.shared.b32 [%r2216], %r2214; add.s32 %r2218, %r2216, 2048; st.shared.b32 [%r2218], %r2215;
xor.b32 %r2222, %r2210, 32;
mov.b32 %f996, %r2779; mov.b32 %f997, %r2780; cvt.rn.f16x2.f32 %r2220, %f996, %f997;
mov.b32 %f998, %r2777; mov.b32 %f999, %r2778; cvt.rn.f16x2.f32 %r2221, %f998, %f999;
st.shared.b32 [%r2222], %r2220; add.s32 %r2224, %r2222, 2048; st.shared.b32 [%r2224], %r2221;
xor.b32 %r2228, %r2210, 48;
mov.b32 %f1000, %r2775; mov.b32 %f1001, %r2776; cvt.rn.f16x2.f32 %r2226, %f1000, %f1001;
mov.b32 %f1002, %r2773; mov.b32 %f1003, %r2774; cvt.rn.f16x2.f32 %r2227, %f1002, %f1003;
st.shared.b32 [%r2228], %r2226; add.s32 %r2230, %r2228, 2048; st.shared.b32 [%r2230], %r2227;
xor.b32 %r2234, %r2210, 64;
mov.b32 %f1004, %r2771; mov.b32 %f1005, %r2772; cvt.rn.f16x2.f32 %r2232, %f1004, %f1005;
mov.b32 %f1006, %r2769; mov.b32 %f1007, %r2770; cvt.rn.f16x2.f32 %r2233, %f1006, %f1007;
st.shared.b32 [%r2234], %r2232; add.s32 %r2236, %r2234, 2048; st.shared.b32 [%r2236], %r2233;
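//
// Each xor/cvt/st.shared group here packs two f32 accumulator lanes into
// one f16x2 word (cvt.rn.f16x2.f32) and stages it to shared memory at an
// XOR-swizzled offset so the 16-byte vector reads that follow are free of
// bank conflicts. A rough CUDA equivalent of one group, with illustrative
// names:
//
//   #include <cuda_fp16.h>
//   __device__ void stage_pair(float lo, float hi,
//                              char* smem_base, uint32_t swizzled_off) {
//       // cvt.rn.f16x2.f32: round-to-nearest pack of two floats
//       __half2 h = __float22half2_rn(make_float2(lo, hi));
//       *reinterpret_cast<__half2*>(smem_base + swizzled_off) = h;
//   }
//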
xor.b32 %r2240, %r2210, 80;
mov.b32 %f1008, %r2767; mov.b32 %f1009, %r2768; cvt.rn.f16x2.f32 %r2238, %f1008, %f1009;
mov.b32 %f1010, %r2765; mov.b32 %f1011, %r2766; cvt.rn.f16x2.f32 %r2239, %f1010, %f1011;
st.shared.b32 [%r2240], %r2238; add.s32 %r2242, %r2240, 2048; st.shared.b32 [%r2242], %r2239;
xor.b32 %r2246, %r2210, 96;
mov.b32 %f1012, %r2763; mov.b32 %f1013, %r2764; cvt.rn.f16x2.f32 %r2244, %f1012, %f1013;
mov.b32 %f1014, %r2761; mov.b32 %f1015, %r2762; cvt.rn.f16x2.f32 %r2245, %f1014, %f1015;
st.shared.b32 [%r2246], %r2244; add.s32 %r2248, %r2246, 2048; st.shared.b32 [%r2248], %r2245;
xor.b32 %r2252, %r2210, 112;
mov.b32 %f1016, %r2759; mov.b32 %f1017, %r2760; cvt.rn.f16x2.f32 %r2250, %f1016, %f1017;
mov.b32 %f1018, %r2757; mov.b32 %f1019, %r2758; cvt.rn.f16x2.f32 %r2251, %f1018, %f1019;
st.shared.b32 [%r2252], %r2250; add.s32 %r2254, %r2252, 2048; st.shared.b32 [%r2254], %r2251;
xor.b32 %r2258, %r2210, 128;
mov.b32 %f1020, %r2755; mov.b32 %f1021, %r2756; cvt.rn.f16x2.f32 %r2256, %f1020, %f1021;
mov.b32 %f1022, %r2753; mov.b32 %f1023, %r2754; cvt.rn.f16x2.f32 %r2257, %f1022, %f1023;
st.shared.b32 [%r2258], %r2256; add.s32 %r2260, %r2258, 2048; st.shared.b32 [%r2260], %r2257;
xor.b32 %r2264, %r2210, 144;
mov.b32 %f1024, %r2751; mov.b32 %f1025, %r2752; cvt.rn.f16x2.f32 %r2262, %f1024, %f1025;
mov.b32 %f1026, %r2749; mov.b32 %f1027, %r2750; cvt.rn.f16x2.f32 %r2263, %f1026, %f1027;
st.shared.b32 [%r2264], %r2262; add.s32 %r2266, %r2264, 2048; st.shared.b32 [%r2266], %r2263;
xor.b32 %r2270, %r2210, 160;
mov.b32 %f1028, %r2747; mov.b32 %f1029, %r2748; cvt.rn.f16x2.f32 %r2268, %f1028, %f1029;
mov.b32 %f1030, %r2745; mov.b32 %f1031, %r2746; cvt.rn.f16x2.f32 %r2269, %f1030, %f1031;
st.shared.b32 [%r2270], %r2268; add.s32 %r2272, %r2270, 2048; st.shared.b32 [%r2272], %r2269;
xor.b32 %r2276, %r2210, 176;
mov.b32 %f1032, %r2743; mov.b32 %f1033, %r2744; cvt.rn.f16x2.f32 %r2274, %f1032, %f1033;
mov.b32 %f1034, %r2741; mov.b32 %f1035, %r2742; cvt.rn.f16x2.f32 %r2275, %f1034, %f1035;
st.shared.b32 [%r2276], %r2274; add.s32 %r2278, %r2276, 2048; st.shared.b32 [%r2278], %r2275;
xor.b32 %r2282, %r2210, 192;
mov.b32 %f1036, %r2739; mov.b32 %f1037, %r2740; cvt.rn.f16x2.f32 %r2280, %f1036, %f1037;
mov.b32 %f1038, %r2737; mov.b32 %f1039, %r2738; cvt.rn.f16x2.f32 %r2281, %f1038, %f1039;
st.shared.b32 [%r2282], %r2280; add.s32 %r2284, %r2282, 2048; st.shared.b32 [%r2284], %r2281;
xor.b32 %r2288, %r2210, 208;
mov.b32 %f1040, %r2735; mov.b32 %f1041, %r2736; cvt.rn.f16x2.f32 %r2286, %f1040, %f1041;
mov.b32 %f1042, %r2733; mov.b32 %f1043, %r2734; cvt.rn.f16x2.f32 %r2287, %f1042, %f1043;
st.shared.b32 [%r2288], %r2286; add.s32 %r2290, %r2288, 2048; st.shared.b32 [%r2290], %r2287;
xor.b32 %r2294, %r2210, 224;
mov.b32 %f1044, %r2731; mov.b32 %f1045, %r2732; cvt.rn.f16x2.f32 %r2292, %f1044, %f1045;
mov.b32 %f1046, %r2729; mov.b32 %f1047, %r2730; cvt.rn.f16x2.f32 %r2293, %f1046, %f1047;
st.shared.b32 [%r2294], %r2292; add.s32 %r2296, %r2294, 2048; st.shared.b32 [%r2296], %r2293;
xor.b32 %r2300, %r2210, 240;
mov.b32 %f1048, %r2727; mov.b32 %f1049, %r2728; cvt.rn.f16x2.f32 %r2298, %f1048, %f1049;
mov.b32 %f1050, %r2725; mov.b32 %f1051, %r2726; cvt.rn.f16x2.f32 %r2299, %f1050, %f1051;
st.shared.b32 [%r2300], %r2298; add.s32 %r2302, %r2300, 2048; st.shared.b32 [%r2302], %r2299;
bar.sync 0;
ld.shared.v4.b32 {%r2304, %r2305, %r2306, %r2307}, [%r10];
add.s32 %r2313, %r10, 2048;  ld.shared.v4.b32 {%r2309, %r2310, %r2311, %r2312}, [%r2313];
add.s32 %r2318, %r10, 4096;  ld.shared.v4.b32 {%r2314, %r2315, %r2316, %r2317}, [%r2318];
add.s32 %r2323, %r10, 6144;  ld.shared.v4.b32 {%r2319, %r2320, %r2321, %r2322}, [%r2323];
add.s32 %r2328, %r10, 8192;  ld.shared.v4.b32 {%r2324, %r2325, %r2326, %r2327}, [%r2328];
add.s32 %r2333, %r10, 10240; ld.shared.v4.b32 {%r2329, %r2330, %r2331, %r2332}, [%r2333];
add.s32 %r2338, %r10, 12288; ld.shared.v4.b32 {%r2334, %r2335, %r2336, %r2337}, [%r2338];
add.s32 %r2343, %r10, 14336; ld.shared.v4.b32 {%r2339, %r2340, %r2341, %r2342}, [%r2343];
mul.lo.s32 %r2360, %r2384, %r2386;
shl.b32 %r2361, %r2360, 1;
cvt.s64.s32 %rd141, %r2361;
add.s64 %rd39, %rd141, %rd59;
cvt.u32.u64 %r2363, %rd5;
setp.ge.s32 %p119, %r2363, %r1;
@%p119 bra $L__BB0_72;
mov.b64 %rd185, fmha_v2_flash_attention_fp16_64_32_S_128_sm86_kernel_nl_param_0;
mov.u64 %rd184, %rd185;
ld.param.u32 %r2398, [%rd184+60];
mov.u32 %r2397, %tid.x;
shr.s32 %r2396, %r2398, 31;
shr.u32 %r2395, %r2396, 29;
add.s32 %r2394, %r2398, %r2395;
shr.s32 %r2393, %r2394, 3;
shr.s32 %r2392, %r2397, 31;
shr.u32 %r2391, %r2392, 28;
add.s32 %r2390, %r2397, %r2391;
and.b32 %r2389, %r2390, -16;
sub.s32 %r2388, %r2397, %r2389;
setp.ge.s32 %p120, %r2388, %r2393;
@%p120 bra $L__BB0_52;
mul.lo.s64 %rd143, %rd10, %rd5;
add.s64 %rd144, %rd39, %rd143;
cvta.to.global.u64 %rd145, %rd11;
add.s64 %rd146, %rd145, %rd144;
st.global.v4.u32 [%rd146], {%r2304, %r2305, %r2306, %r2307};
$L__BB0_52:
add.s32 %r2365, %r2363, 8;
setp.ge.s32 %p121, %r2365, %r1;
@%p121 bra $L__BB0_72;
@%p120 bra $L__BB0_55;
add.s64 %rd147, %rd5, 8;
mul.lo.s64 %rd148, %rd147, %rd10;
add.s64 %rd149, %rd39, %rd148;
cvta.to.global.u64 %rd150, %rd11;
add.s64 %rd151, %rd150, %rd149;
st.global.v4.u32 [%rd151], {%r2309, %r2310, %r2311, %r2312};
$L__BB0_55:
add.s32 %r2367, %r2363, 16;
setp.ge.s32 %p123, %r2367, %r1;
@%p123 bra $L__BB0_72;
@%p120 bra $L__BB0_58;
add.s64 %rd152, %rd5, 16;
mul.lo.s64 %rd153, %rd152, %rd10;
add.s64 %rd154, %rd39, %rd153;
cvta.to.global.u64 %rd155, %rd11;
add.s64 %rd156, %rd155, %rd154;
st.global.v4.u32 [%rd156], {%r2314, %r2315, %r2316, %r2317};
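//
// The guarded store blocks ($L__BB0_52 through $L__BB0_70) each retire one
// 16-byte st.global.v4.u32 per 8-row step of the output tile, predicated
// on a row bound against the sequence length (%r1) and on the per-thread
// column predicate %p120. A hedged CUDA sketch of the pattern, with
// illustrative names:
//
//   __device__ void store_rows(char* out, const int4* vals, int row0,
//                              int seq_len, long row_stride_bytes,
//                              bool col_ok) {
//       #pragma unroll
//       for (int i = 0; i < 8; ++i) {
//           int row = row0 + 8 * i;
//           if (row >= seq_len) return;   // setp.ge.s32 + bra $L__BB0_72
//           if (col_ok)                   // inverse of the %p120 early-out
//               *reinterpret_cast<int4*>(
//                   out + (long)row * row_stride_bytes) = vals[i];
//       }
//   }
//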
$L__BB0_58:
add.s32 %r2369, %r2363, 24;
setp.ge.s32 %p125, %r2369, %r1;
@%p125 bra $L__BB0_72;
@%p120 bra $L__BB0_61;
add.s64 %rd157, %rd5, 24;
mul.lo.s64 %rd158, %rd157, %rd10;
add.s64 %rd159, %rd39, %rd158;
cvta.to.global.u64 %rd160, %rd11;
add.s64 %rd161, %rd160, %rd159;
st.global.v4.u32 [%rd161], {%r2319, %r2320, %r2321, %r2322};
$L__BB0_61:
add.s32 %r2371, %r2363, 32;
setp.ge.s32 %p127, %r2371, %r1;
@%p127 bra $L__BB0_72;
@%p120 bra $L__BB0_64;
add.s64 %rd162, %rd5, 32;
mul.lo.s64 %rd163, %rd162, %rd10;
add.s64 %rd164, %rd39, %rd163;
cvta.to.global.u64 %rd165, %rd11;
add.s64 %rd166, %rd165, %rd164;
st.global.v4.u32 [%rd166], {%r2324, %r2325, %r2326, %r2327};
$L__BB0_64:
add.s32 %r2373, %r2363, 40;
setp.ge.s32 %p129, %r2373, %r1;
@%p129 bra $L__BB0_72;
@%p120 bra $L__BB0_67;
add.s64 %rd167, %rd5, 40;
mul.lo.s64 %rd168, %rd167, %rd10;
add.s64 %rd169, %rd39, %rd168;
cvta.to.global.u64 %rd170, %rd11;
add.s64 %rd171, %rd170, %rd169;
st.global.v4.u32 [%rd171], {%r2329, %r2330, %r2331, %r2332};
$L__BB0_67:
add.s32 %r2375, %r2363, 48;
setp.ge.s32 %p131, %r2375, %r1;
@%p131 bra $L__BB0_72;
@%p120 bra $L__BB0_70;
add.s64 %rd172, %rd5, 48;
mul.lo.s64 %rd173, %rd172, %rd10;
add.s64 %rd174, %rd39, %rd173;
cvta.to.global.u64 %rd175, %rd11;
add.s64 %rd176, %rd175, %rd174;
st.global.v4.u32 [%rd176], {%r2334, %r2335, %r2336, %r2337};
$L__BB0_70:
add.s32 %r2377, %r2363, 56;
setp.ge.s32 %p133, %r2377, %r1;
or.pred %p135, %p133, %p120;
@%p135 bra $L__BB0_72;
add.s64 %rd177, %rd5, 56;
mul.lo.s64 %rd178, %rd177, %rd10;
add.s64 %rd179, %rd39, %rd178;
cvta.to.global.u64 %rd180, %rd11;
add.s64 %rd181, %rd180, %rd179;
st.global.v4.u32 [%rd181], {%r2339, %r2340, %r2341, %r2342};
$L__BB0_72:
ret;
}
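//
// Taken together, this listing is one CTA's pass of flash attention: the
// main loop (back-edge at $L__BB0_48, target $L__BB0_3) walks the K/V
// sequence 32 rows at a time, and the state carried across iterations
// (running row max and row sum, plus the normalized output accumulators
// rescaled above) follows the standard online-softmax recurrence. A
// minimal CUDA sketch of that recurrence under that assumption; names are
// illustrative, and the real kernel keeps this state entirely in
// registers:
//
//   struct RowState { float m; float s; };   // running max, running sum
//   __device__ float online_softmax_step(RowState& st, float tile_max,
//                                        float tile_sum) {
//       float m_new = fmaxf(st.m, tile_max);
//       float corr  = __expf(st.m - m_new);   // rescale factor for the
//                                             // output accumulators
//       st.s = st.s * corr + tile_sum * __expf(tile_max - m_new);
//       st.m = m_new;
//       return corr;   // feeds the fma quartets in the rescale pass
//   }
//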