%rd40;
// Prologue: load the sequence length (param+56) and grid factor (param+52);
// each CTA along z owns one 64-row Q tile and exits early if it starts past
// the end of the sequence.
ld.param.u32 %r1, [fmha_v2_flash_attention_fp16_64_32_S_64_sm86_kernel_nl_param_0+56];
cvt.s64.s32 %rd2, %r1;
ld.param.u32 %r2, [fmha_v2_flash_attention_fp16_64_32_S_64_sm86_kernel_nl_param_0+52];
mov.u32 %r524, %ctaid.z;
shl.b32 %r3, %r524, 6;
setp.le.s32 %p1, %r1, %r3;
@%p1 bra $L__BB0_60;
// Decompose %tid.x into warp/lane coordinates, derive the XOR-swizzled
// shared-memory offsets, and build the per-thread global pointers for
// Q (%rd41), K (%rd158) and V (%rd157).
mov.u32 %r673, %tid.x;
mov.u32 %r674, %ctaid.y;
mov.u32 %r675, %ctaid.x;
mul.lo.s32 %r676, %r1, %r674;
mad.lo.s32 %r677, %r676, %r2, %r675;
shr.s32 %r678, %r673, 31;
shr.u32 %r679, %r678, 27;
add.s32 %r680, %r673, %r679;
and.b32 %r681, %r680, -32;
sub.s32 %r682, %r673, %r681;
shr.u32 %r683, %r678, 25;
add.s32 %r684, %r673, %r683;
shr.s32 %r685, %r684, 7;
shl.b32 %r686, %r685, 4;
shr.s32 %r687, %r682, 31;
shr.u32 %r688, %r687, 30;
add.s32 %r689, %r682, %r688;
and.b32 %r690, %r689, 2147483644;
sub.s32 %r691, %r682, %r690;
shl.b32 %r692, %r691, 1;
add.s32 %r1462, %r692, %r686;
ld.param.u64 %rd3, [%rd1+16];
ld.param.u64 %rd4, [%rd1+40];
shr.s32 %r693, %r680, 5;
shr.s32 %r694, %r680, 31;
shr.u32 %r695, %r694, 30;
add.s32 %r696, %r693, %r695;
and.b32 %r697, %r696, 268435452;
sub.s32 %r698, %r693, %r697;
shl.b32 %r699, %r698, 4;
shr.s32 %r700, %r689, 2;
add.s32 %r5, %r699, %r700;
shr.u32 %r701, %r678, 29;
add.s32 %r702, %r673, %r701;
shr.s32 %r6, %r702, 3;
add.s32 %r703, %r6, %r3;
cvt.s64.s32 %rd5, %r703;
ld.param.u64 %rd6, [%rd1+24];
mul.lo.s64 %rd49, %rd6, %rd5;
mul.lo.s32 %r704, %r677, 3;
mul.wide.s32 %rd50, %r704, 128;
and.b32 %r705, %r702, -8;
sub.s32 %r7, %r673, %r705;
shl.b32 %r706, %r7, 4;
cvt.s64.s32 %rd51, %r706;
add.s64 %rd52, %rd50, %rd51;
add.s64 %rd53, %rd52, %rd49;
ld.param.u64 %rd54, [%rd1];
add.s64 %rd41, %rd54, %rd53;
shr.s32 %r707, %r702, 31;
shr.u32 %r708, %r707, 29;
add.s32 %r709, %r6, %r708;
and.b32 %r710, %r709, 268435448;
sub.s32 %r711, %r6, %r710;
xor.b32 %r712, %r711, %r7;
shl.b32 %r713, %r6, 7;
shl.b32 %r714, %r712, 4;
mov.u32 %r715, 31;
mov.u32 %r1461, 0;
mov.u32 %r717, -1;
shfl.sync.idx.b32 %r718|%p2, %r1461, %r1461, %r715, %r717;
shfl.sync.idx.b32 %r719|%p3, %r1461, %r1461, %r715, %r717;
and.b32 %r720, %r673, 96;
shr.u32 %r721, %r720, 1;
and.b32 %r722, %r673, 15;
or.b32 %r723, %r721, %r722;
and.b32 %r724, %r673, 7;
shl.b32 %r725, %r673, 4;
and.b32 %r726, %r725, 112;
and.b32 %r727, %r673, 16;
xor.b32 %r728, %r726, %r727;
cvt.s64.s32 %rd55, %r6;
mul.lo.s64 %rd56, %rd6, %rd55;
add.s32 %r729, %r704, 1;
mul.wide.s32 %rd57, %r729, 128;
add.s64 %rd58, %rd57, %rd51;
add.s64 %rd59, %rd58, %rd56;
add.s64 %rd158, %rd54, %rd59;
shfl.sync.idx.b32 %r730|%p4, %r1461, %r1461, %r715, %r717;
shfl.sync.idx.b32 %r731|%p5, %r1461, %r1461, %r715, %r717;
shr.u32 %r732, %r727, 1;
or.b32 %r733, %r732, %r724;
and.b32 %r734, %r673, 8;
shr.u32 %r735, %r734, 3;
xor.b32 %r736, %r735, %r724;
add.s32 %r737, %r704, 2;
mul.wide.s32 %rd60, %r737, 128;
add.s64 %rd61, %rd60, %rd51;
add.s64 %rd62, %rd61, %rd56;
add.s64 %rd157, %rd54, %rd62;
shfl.sync.idx.b32 %r738|%p6, %r1461, %r1461, %r715, %r717;
shfl.sync.idx.b32 %r739|%p7, %r1461, %r1461, %r715, %r717;
ld.param.u64 %rd10, [%rd1+32];
ld.param.u64 %rd11, [%rd1+8];
// Row-validity predicates for the four 16-row sub-tiles of this CTA's Q block.
sub.s32 %r740, %r1, %r3;
min.s32 %r741, %r740, 64;
shl.b32 %r745, %r673, 7;
and.b32 %r746, %r745, 1920;
shl.b32 %r747, %r736, 4;
shl.b32 %r748, %r733, 7;
shl.b32 %r749, %r723, 7;
setp.lt.s32 %p8, %r6, %r741;
add.s32 %r750, %r6, 16;
setp.lt.s32 %p9, %r750, %r741;
add.s32 %r751, %r6, 32;
setp.lt.s32 %p10, %r751, %r741;
add.s32 %r752, %r6, 48;
setp.lt.s32 %p11, %r752, %r741;
add.s32 %r753, %r714, %r713;
or.b32 %r754, %r749, %r728;
or.b32 %r755, %r748, %r747;
or.b32 %r756, %r728, %r746;
mov.u32 %r757, _ZN25fused_multihead_attention5smem_E;
add.s32 %r758, %r757, 8192;
add.s32 %r10, %r753, %r758;
shl.b64 %rd63, %rd6, 4;
add.s32 %r759, %r753, %r757;
add.s32 %r525, %r759, %r719;
add.s32 %r527, %r525, 2048;
add.s32 %r529, %r525, 4096;
add.s32 %r531, %r525, 6144;
// Stage the 64x64 Q tile into shared memory (smem+0): four 16-byte cp.async
// copies per thread; a src-size of 0 (selp) zero-fills rows past the end.
selp.b32 %r526, 16, 0, %p8;
// begin inline asm
cp.async.cg.shared.global [%r525], [%rd41], 16, %r526;
// end inline asm
selp.b32 %r528, 16, 0, %p9;
add.s64 %rd42, %rd41, %rd63;
// begin inline asm
cp.async.cg.shared.global [%r527], [%rd42], 16, %r528;
// end inline asm
selp.b32 %r530, 16, 0, %p10;
add.s64 %rd43, %rd42, %rd63;
// begin inline asm
cp.async.cg.shared.global [%r529], [%rd43], 16, %r530;
// end inline asm
selp.b32 %r532, 16, 0, %p11;
add.s64 %rd44, %rd43, %rd63;
// begin inline asm
cp.async.cg.shared.global [%r531], [%rd44], 16, %r532;
// end inline asm
// Stage the first 32-row K tile (smem+8192) and V tile (smem+12288),
// then commit the copy group and wait for it.
min.s32 %r760, %r1, 32;
setp.lt.s32 %p12, %r6, %r760;
setp.lt.s32 %p13, %r750, %r760;
add.s64 %rd46, %rd158, %rd63;
add.s32 %r533, %r10, %r731;
add.s32 %r535, %r533, 2048;
selp.b32 %r538, 16, 0, %p12;
// begin inline asm
cp.async.cg.shared.global [%r533], [%rd158], 16, %r538;
// end inline asm
selp.b32 %r540, 16, 0, %p13;
// begin inline asm
cp.async.cg.shared.global [%r535], [%rd46], 16, %r540;
// end inline asm
add.s64 %rd48, %rd157, %rd63;
add.s32 %r761, %r757, 12288;
add.s32 %r762, %r753, %r761;
add.s32 %r537, %r762, %r739;
add.s32 %r539, %r537, 2048;
// begin inline asm
cp.async.cg.shared.global [%r537], [%rd157], 16, %r538;
// end inline asm
// begin inline asm
cp.async.cg.shared.global [%r539], [%rd48], 16, %r540;
// end inline asm
// begin inline asm
cp.async.commit_group;
// end inline asm
// begin inline asm
cp.async.wait_group 0;
// end inline asm
bar.sync 0;
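A minimal CUDA-level sketch of the staging idiom above (illustrative names, assuming sm_80+; this is not the kernel's actual source). The detail worth noting is the predication: the cp-size stays 16 while the src-size operand (the selp-computed %r526/%r528/... values) is either 16 or 0, and a src-size of 0 makes cp.async write 16 bytes of zeros, so rows past the sequence end need no separate masked load.

#include <cstdint>

// Issue one 16-byte global->shared copy; zero-fill when out of bounds.
__device__ void cp_async_cg_16B(uint32_t smem_dst,     // from __cvta_generic_to_shared
                                const void* gmem_src,
                                bool in_bounds) {
    const int src_size = in_bounds ? 16 : 0;           // 0 => 16 bytes of zeros
    asm volatile("cp.async.cg.shared.global [%0], [%1], 16, %2;\n"
                 : : "r"(smem_dst), "l"(gmem_src), "r"(src_size) : "memory");
}

// Close the batch and wait for every outstanding copy, as the kernel does
// before its first bar.sync.
__device__ void cp_async_commit_and_wait_all() {
    asm volatile("cp.async.commit_group;\n" : : : "memory");
    asm volatile("cp.async.wait_group 0;\n" : : : "memory");
}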
// Pull the Q operand fragments for the full head dimension into registers
// (4 x ldmatrix.x4).
add.s32 %r763, %r718, %r757;
add.s32 %r545, %r763, %r754;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r541, %r542, %r543, %r544}, [%r545];
// end inline asm
xor.b32 %r764, %r754, 32;
add.s32 %r550, %r763, %r764;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r546, %r547, %r548, %r549}, [%r550];
// end inline asm
xor.b32 %r765, %r754, 64;
add.s32 %r555, %r763, %r765;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r551, %r552, %r553, %r554}, [%r555];
// end inline asm
xor.b32 %r766, %r754, 96;
add.s32 %r560, %r763, %r766;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r556, %r557, %r558, %r559}, [%r560];
// end inline asm
// K fragments for the first 32 key columns (8 x ldmatrix.x4).
add.s32 %r31, %r730, %r758;
add.s32 %r565, %r31, %r755;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1493, %r1492, %r1491, %r1490}, [%r565];
// end inline asm
add.s32 %r570, %r565, 2048;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1489, %r1488, %r1487, %r1486}, [%r570];
// end inline asm
xor.b32 %r767, %r755, 32;
add.s32 %r575, %r31, %r767;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1485, %r1484, %r1483, %r1482}, [%r575];
// end inline asm
add.s32 %r580, %r575, 2048;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1481, %r1480, %r1479, %r1478}, [%r580];
// end inline asm
xor.b32 %r768, %r755, 64;
add.s32 %r585, %r31, %r768;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1477, %r1476, %r1475, %r1474}, [%r585];
// end inline asm
add.s32 %r590, %r585, 2048;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1473, %r1472, %r1471, %r1470}, [%r590];
// end inline asm
xor.b32 %r769, %r755, 96;
add.s32 %r595, %r31, %r769;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1469, %r1468, %r1467, %r1466}, [%r595];
// end inline asm
add.s32 %r600, %r595, 2048;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1465, %r1494, %r1495, %r1496}, [%r600];
// end inline asm
// V fragments, transposed on load (8 x ldmatrix.x4.trans) so row-major V can
// feed the column-major B operand of the second GEMM.
add.s32 %r605, %r756, %r761;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1528, %r1527, %r1526, %r1525}, [%r605];
// end inline asm
xor.b32 %r770, %r756, 32;
add.s32 %r610, %r770, %r761;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1524, %r1523, %r1522, %r1521}, [%r610];
// end inline asm
xor.b32 %r771, %r756, 64;
add.s32 %r615, %r771, %r761;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1520, %r1519, %r1518, %r1517}, [%r615];
// end inline asm
xor.b32 %r772, %r756, 96;
add.s32 %r620, %r772, %r761;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1516, %r1515, %r1514, %r1513}, [%r620];
// end inline asm
add.s32 %r773, %r757, 14336;
add.s32 %r625, %r756, %r773;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1512, %r1511, %r1510, %r1509}, [%r625];
// end inline asm
add.s32 %r630, %r770, %r773;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1508, %r1507, %r1506, %r1505}, [%r630];
// end inline asm
add.s32 %r635, %r771, %r773;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1504, %r1503, %r1502, %r1501}, [%r635];
// end inline asm
add.s32 %r640, %r772, %r773;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1500, %r1499, %r1498, %r1497}, [%r640];
// end inline asm
// Softmax scale (param+64) and zero-init of the 32 fp32 O accumulators.
ld.param.f32 %f1, [%rd1+64];
// begin inline asm
mov.u32 %r1560, 0;
// end inline asm
// begin inline asm
mov.u32 %r1559, 0;
// end inline asm
// begin inline asm
mov.u32 %r1558, 0;
// end inline asm
// begin inline asm
mov.u32 %r1557, 0;
// end inline asm
// begin inline asm
mov.u32 %r1556, 0;
// end inline asm
// begin inline asm
mov.u32 %r1555, 0;
// end inline asm
// begin inline asm
mov.u32 %r1554, 0;
// end inline asm
// begin inline asm
mov.u32 %r1553, 0;
// end inline asm
// begin inline asm
mov.u32 %r1552, 0;
// end inline asm
// begin inline asm
mov.u32 %r1551, 0;
// end inline asm
// begin inline asm
mov.u32 %r1550, 0;
// end inline asm
// begin inline asm
mov.u32 %r1549, 0;
// end inline asm
// begin inline asm
mov.u32 %r1548, 0;
// end inline asm
// begin inline asm
mov.u32 %r1547, 0;
// end inline asm
// begin inline asm
mov.u32 %r1546, 0;
// end inline asm
// begin inline asm
mov.u32 %r1545, 0;
// end inline asm
// begin inline asm
mov.u32 %r1544, 0;
// end inline asm
// begin inline asm
mov.u32 %r1543, 0;
// end inline asm
// begin inline asm
mov.u32 %r1542, 0;
// end inline asm
// begin inline asm
mov.u32 %r1541, 0;
// end inline asm
// begin inline asm
mov.u32 %r1540, 0;
// end inline asm
// begin inline asm
mov.u32 %r1539, 0;
// end inline asm
// begin inline asm
mov.u32 %r1538, 0;
// end inline asm
// begin inline asm
mov.u32 %r1537, 0;
// end inline asm
// begin inline asm
mov.u32 %r1536, 0;
// end inline asm
// begin inline asm
mov.u32 %r1535, 0;
// end inline asm
// begin inline asm
mov.u32 %r1534, 0;
// end inline asm
// begin inline asm
mov.u32 %r1533, 0;
// end inline asm
// begin inline asm
mov.u32 %r1532, 0;
// end inline asm
// begin inline asm
mov.u32 %r1531, 0;
// end inline asm
// begin inline asm
mov.u32 %r1530, 0;
// end inline asm
// begin inline asm
mov.u32 %r1529, 0;
// end inline asm
// Round the sequence length up to whole 32-column key tiles; skip the main
// loop entirely for an empty sequence.
add.s32 %r774, %r1, 31;
shr.s32 %r775, %r774, 31;
shr.u32 %r776, %r775, 27;
add.s32 %r777, %r774, %r776;
and.b32 %r128, %r777, -32;
setp.lt.s32 %p14, %r1, 1;
@%p14 bra $L__BB0_49;
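The fragment loads above are ldmatrix.m8n8.x4: each call pulls four 8x8 fp16 tiles out of shared memory, one 32-bit register per tile, already laid out as mma.sync operand fragments. The XOR-ed offsets (%r754 ^ 32/64/96 and friends) walk the swizzled layout that keeps the loads bank-conflict-free, and the .trans variant transposes each 8x8 tile on the way out. Hypothetical wrappers (sm_75+; names are not from the kernel):

#include <cstdint>

__device__ void ldmatrix_x4(uint32_t (&frag)[4], uint32_t smem_addr) {
    asm volatile(
        "ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0, %1, %2, %3}, [%4];\n"
        : "=r"(frag[0]), "=r"(frag[1]), "=r"(frag[2]), "=r"(frag[3])
        : "r"(smem_addr));
}

// Transposing variant, as used for the V tiles above.
__device__ void ldmatrix_x4_trans(uint32_t (&frag)[4], uint32_t smem_addr) {
    asm volatile(
        "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%0, %1, %2, %3}, [%4];\n"
        : "=r"(frag[0]), "=r"(frag[1]), "=r"(frag[2]), "=r"(frag[3])
        : "r"(smem_addr));
}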
// Main-loop setup: bias flag (param+160), per-thread row/column indices, and
// the softmax running state (%f638/%f639 = running max, init -inf;
// %f636/%f637 = running sum, init 0).
ld.param.u8 %rs1, [%rd1+160];
cvt.s64.s32 %rd12, %r5;
cvt.s64.s32 %rd13, %r1462;
add.s32 %r779, %r1462, 1;
cvt.s64.s32 %rd14, %r779;
add.s32 %r780, %r1462, 8;
cvt.s64.s32 %rd15, %r780;
add.s32 %r781, %r1462, 9;
cvt.s64.s32 %rd16, %r781;
add.s32 %r782, %r1462, 16;
cvt.s64.s32 %rd17, %r782;
add.s32 %r783, %r1462, 17;
cvt.s64.s32 %rd18, %r783;
add.s32 %r784, %r1462, 24;
cvt.s64.s32 %rd19, %r784;
add.s32 %r785, %r1462, 25;
cvt.s64.s32 %rd20, %r785;
add.s32 %r209, %r5, 8;
mov.f32 %f638, 0fFF800000;
mov.f32 %f636, 0f00000000;
mov.u32 %r1463, %r1;
mov.f32 %f637, %f636;
mov.f32 %f639, %f638;
mov.u32 %r1464, %r1;
$L__BB0_3:
// If another 32-column key tile follows, prefetch the next K and V tiles with
// cp.async while the current tiles (already in registers) are consumed below.
add.s32 %r786, %r1461, 32;
setp.ge.s32 %p15, %r786, %r128;
@%p15 bra $L__BB0_5;
bar.sync 0;
shl.b64 %rd68, %rd6, 5;
add.s64 %rd64, %rd158, %rd68;
add.s32 %r1464, %r1464, -32;
min.s32 %r795, %r1464, 32;
setp.lt.s32 %p16, %r6, %r795;
setp.lt.s32 %p17, %r750, %r795;
mul.lo.s64 %rd69, %rd6, 48;
add.s64 %rd65, %rd158, %rd69;
selp.b32 %r788, 16, 0, %p16;
// begin inline asm
cp.async.cg.shared.global [%r533], [%rd64], 16, %r788;
// end inline asm
selp.b32 %r790, 16, 0, %p17;
// begin inline asm
cp.async.cg.shared.global [%r535], [%rd65], 16, %r790;
// end inline asm
add.s64 %rd66, %rd157, %rd68;
add.s32 %r1463, %r1463, -32;
min.s32 %r797, %r1463, 32;
setp.lt.s32 %p18, %r6, %r797;
setp.lt.s32 %p19, %r750, %r797;
add.s64 %rd67, %rd157, %rd69;
selp.b32 %r792, 16, 0, %p18;
// begin inline asm
cp.async.cg.shared.global [%r537], [%rd66], 16, %r792;
// end inline asm
selp.b32 %r794, 16, 0, %p19;
// begin inline asm
cp.async.cg.shared.global [%r539], [%rd67], 16, %r794;
// end inline asm
// begin inline asm
cp.async.commit_group;
// end inline asm
mov.u64 %rd157, %rd66;
mov.u64 %rd158, %rd64;
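The block above is the loop's software pipeline: the K/V tiles for iteration i+1 are staged and committed while iteration i computes out of registers (the kernel single-buffers shared memory and effectively double-buffers in the ldmatrix destination registers; the cp.async.wait_group 0 plus bar.sync before each reload closes the handoff). Below is a self-contained toy showing the same commit/wait pattern, written as an explicit two-buffer variant; the shapes, names, and 256-thread launch are assumptions for illustration, not taken from the kernel.

#include <cstdint>
#include <cuda_fp16.h>

// Toy pipelined reduction over ntiles tiles of 2048 halves each.
// Launch with 256 threads; g must be 16-byte aligned. Requires sm_80+.
__global__ void pipelined_sum(const __half* g, int ntiles, float* out) {
    __align__(16) __shared__ __half buf[2][2048];
    float acc = 0.f;
    // Preload tile 0 (each thread copies its own 16 bytes = 8 halves).
    uint32_t s0 = (uint32_t)__cvta_generic_to_shared(&buf[0][threadIdx.x * 8]);
    asm volatile("cp.async.cg.shared.global [%0], [%1], 16;\n"
                 : : "r"(s0), "l"(g + threadIdx.x * 8) : "memory");
    asm volatile("cp.async.commit_group;\n" : : : "memory");
    for (int i = 0; i < ntiles; ++i) {
        if (i + 1 < ntiles) {  // stage the next tile into the other buffer
            uint32_t s = (uint32_t)__cvta_generic_to_shared(
                &buf[(i + 1) & 1][threadIdx.x * 8]);
            asm volatile("cp.async.cg.shared.global [%0], [%1], 16;\n"
                         : : "r"(s),
                             "l"(g + (size_t)(i + 1) * 2048 + threadIdx.x * 8)
                         : "memory");
            asm volatile("cp.async.commit_group;\n" : : : "memory");
            asm volatile("cp.async.wait_group 1;\n" : : : "memory"); // tile i ready
        } else {
            asm volatile("cp.async.wait_group 0;\n" : : : "memory"); // last tile
        }
        __syncthreads();
        for (int e = 0; e < 8; ++e)          // consume this thread's 8 halves
            acc += __half2float(buf[i & 1][threadIdx.x * 8 + e]);
        __syncthreads();                     // all reads done before buffer reuse
    }
    atomicAdd(out, acc);
}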
$L__BB0_5:
// Optional fp16 attention-bias path (flag %rs1): gather 16 bias values for
// this thread's score entries, bounds-checked against the sequence length;
// out-of-range entries stay zero.
setp.eq.s16 %p20, %rs1, 0;
@%p20 bra $L__BB0_38;
mov.u32 %r1362, %ctaid.x;
mov.u32 %r1361, %ctaid.y;
mov.u32 %r1360, %ctaid.z;
shl.b32 %r1359, %r1360, 6;
ld.param.u32 %r1358, [fmha_v2_flash_attention_fp16_64_32_S_64_sm86_kernel_nl_param_0+52];
cvt.s64.s32 %rd72, %r1359;
add.s64 %rd73, %rd12, %rd72;
setp.ge.u64 %p21, %rd73, %rd2;
mul.lo.s32 %r800, %r1, %r1359;
cvt.s64.s32 %rd74, %r800;
cvt.u64.u32 %rd27, %r1461;
add.s64 %rd28, %rd74, %rd27;
mul.lo.s64 %rd75, %rd2, %rd12;
add.s64 %rd76, %rd28, %rd75;
add.s64 %rd77, %rd76, %rd13;
add.s64 %rd29, %rd13, %rd27;
setp.ge.u64 %p22, %rd29, %rd2;
shl.b64 %rd78, %rd77, 1;
mad.lo.s32 %r803, %r1358, %r1361, %r1362;
cvt.s64.s32 %rd79, %r803;
mul.lo.s64 %rd80, %rd4, %rd79;
add.s64 %rd81, %rd80, %rd78;
cvta.to.global.u64 %rd82, %rd3;
add.s64 %rd30, %rd82, %rd81;
mov.u16 %rs132, 0;
or.pred %p23, %p22, %p21;
mov.u16 %rs131, %rs132;
@%p23 bra $L__BB0_8;
ld.global.u16 %rs131, [%rd30];
$L__BB0_8:
add.s64 %rd31, %rd14, %rd27;
setp.ge.u64 %p25, %rd31, %rd2;
or.pred %p26, %p25, %p21;
@%p26 bra $L__BB0_10;
ld.global.u16 %rs132, [%rd30+2];
$L__BB0_10:
add.s64 %rd32, %rd15, %rd27;
setp.ge.u64 %p28, %rd32, %rd2;
mov.u16 %rs134, 0;
or.pred %p29, %p28, %p21;
mov.u16 %rs133, %rs134;
@%p29 bra $L__BB0_12;
ld.global.u16 %rs133, [%rd30+16];
$L__BB0_12:
add.s64 %rd33, %rd16, %rd27;
setp.ge.u64 %p31, %rd33, %rd2;
or.pred %p32, %p31, %p21;
@%p32 bra $L__BB0_14;
ld.global.u16 %rs134, [%rd30+18];
$L__BB0_14:
add.s64 %rd34, %rd17, %rd27;
setp.ge.u64 %p34, %rd34, %rd2;
mov.u16 %rs136, 0;
or.pred %p35, %p34, %p21;
mov.u16 %rs135, %rs136;
@%p35 bra $L__BB0_16;
ld.global.u16 %rs135, [%rd30+32];
$L__BB0_16:
add.s64 %rd35, %rd18, %rd27;
setp.ge.u64 %p37, %rd35, %rd2;
or.pred %p38, %p37, %p21;
@%p38 bra $L__BB0_18;
ld.global.u16 %rs136, [%rd30+34];
$L__BB0_18:
add.s64 %rd36, %rd19, %rd27;
setp.ge.u64 %p40, %rd36, %rd2;
mov.u16 %rs138, 0;
or.pred %p41, %p40, %p21;
mov.u16 %rs137, %rs138;
@%p41 bra $L__BB0_20;
ld.global.u16 %rs137, [%rd30+48];
$L__BB0_20:
add.s64 %rd37, %rd20, %rd27;
setp.ge.u64 %p43, %rd37, %rd2;
or.pred %p44, %p43, %p21;
@%p44 bra $L__BB0_22;
ld.global.u16 %rs138, [%rd30+50];
$L__BB0_22:
cvt.s64.s32 %rd98, %r209;
add.s64 %rd99, %rd98, %rd72;
setp.ge.u64 %p45, %rd99, %rd2;
mul.lo.s64 %rd100, %rd2, %rd98;
add.s64 %rd101, %rd28, %rd100;
add.s64 %rd102, %rd101, %rd13;
shl.b64 %rd103, %rd102, 1;
add.s64 %rd106, %rd80, %rd103;
add.s64 %rd38, %rd82, %rd106;
mov.u16 %rs140, 0;
or.pred %p47, %p22, %p45;
mov.u16 %rs139, %rs140;
@%p47 bra $L__BB0_24;
ld.global.u16 %rs139, [%rd38];
$L__BB0_24:
or.pred %p50, %p25, %p45;
@%p50 bra $L__BB0_26;
ld.global.u16 %rs140, [%rd38+2];
$L__BB0_26:
mov.u16 %rs142, 0;
or.pred %p53, %p28, %p45;
mov.u16 %rs141, %rs142;
@%p53 bra $L__BB0_28;
ld.global.u16 %rs141, [%rd38+16];
$L__BB0_28:
or.pred %p56, %p31, %p45;
@%p56 bra $L__BB0_30;
ld.global.u16 %rs142, [%rd38+18];
$L__BB0_30:
mov.u16 %rs144, 0;
or.pred %p59, %p34, %p45;
mov.u16 %rs143, %rs144;
@%p59 bra $L__BB0_32;
ld.global.u16 %rs143, [%rd38+32];
$L__BB0_32:
or.pred %p62, %p37, %p45;
@%p62 bra $L__BB0_34;
ld.global.u16 %rs144, [%rd38+34];
$L__BB0_34:
mov.u16 %rs146, 0;
or.pred %p65, %p40, %p45;
mov.u16 %rs145, %rs146;
@%p65 bra $L__BB0_36;
ld.global.u16 %rs145, [%rd38+48];
$L__BB0_36:
or.pred %p68, %p43, %p45;
@%p68 bra $L__BB0_38;
ld.global.u16 %rs146, [%rd38+50];
$L__BB0_38:
// S = Q.K^T for this 32-key tile: 16 m16n8k16 MMAs (4 k-steps x 4 column
// tiles) into freshly zeroed fp32 accumulators, then the softmax scale %f1.
// begin inline asm
mov.u32 %r837, 0;
// end inline asm
// begin inline asm
mov.u32 %r838, 0;
// end inline asm
// begin inline asm
mov.u32 %r839, 0;
// end inline asm
// begin inline asm
mov.u32 %r840, 0;
// end inline asm
// begin inline asm
mov.u32 %r841, 0;
// end inline asm
// begin inline asm
mov.u32 %r842, 0;
// end inline asm
// begin inline asm
mov.u32 %r843, 0;
// end inline asm
// begin inline asm
mov.u32 %r844, 0;
// end inline asm
// begin inline asm
mov.u32 %r845, 0;
// end inline asm
// begin inline asm
mov.u32 %r846, 0;
// end inline asm
// begin inline asm
mov.u32 %r847, 0;
// end inline asm
// begin inline asm
mov.u32 %r848, 0;
// end inline asm
// begin inline asm
mov.u32 %r849, 0;
// end inline asm
// begin inline asm
mov.u32 %r850, 0;
// end inline asm
// begin inline asm
mov.u32 %r851, 0;
// end inline asm
// begin inline asm
mov.u32 %r852, 0;
// end inline asm
mov.b32 %f148, %r837;
mov.b32 %f149, %r838;
mov.b32 %f150, %r839;
mov.b32 %f151, %r840;
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f148, %f149, %f150, %f151}, {%r541, %r542, %r543, %r544}, {%r1493, %r1492}, {%f148, %f149, %f150, %f151};
// end inline asm
mov.b32 %f156, %r841;
mov.b32 %f157, %r842;
mov.b32 %f158, %r843;
mov.b32 %f159, %r844;
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f156, %f157, %f158, %f159}, {%r541, %r542, %r543, %r544}, {%r1491, %r1490}, {%f156, %f157, %f158, %f159};
// end inline asm
mov.b32 %f164, %r845;
mov.b32 %f165, %r846;
mov.b32 %f166, %r847;
mov.b32 %f167, %r848;
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f164, %f165, %f166, %f167}, {%r541, %r542, %r543, %r544}, {%r1489, %r1488}, {%f164, %f165, %f166, %f167};
// end inline asm
mov.b32 %f172, %r849;
mov.b32 %f173, %r850;
mov.b32 %f174, %r851;
mov.b32 %f175, %r852;
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f172, %f173, %f174, %f175}, {%r541, %r542, %r543, %r544}, {%r1487, %r1486}, {%f172, %f173, %f174, %f175};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f148, %f149, %f150, %f151}, {%r546, %r547, %r548, %r549}, {%r1485, %r1484}, {%f148, %f149, %f150, %f151};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f156, %f157, %f158, %f159}, {%r546, %r547, %r548, %r549}, {%r1483, %r1482}, {%f156, %f157, %f158, %f159};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f164, %f165, %f166, %f167}, {%r546, %r547, %r548, %r549}, {%r1481, %r1480}, {%f164, %f165, %f166, %f167};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f172, %f173, %f174, %f175}, {%r546, %r547, %r548, %r549}, {%r1479, %r1478}, {%f172, %f173, %f174, %f175};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f148, %f149, %f150, %f151}, {%r551, %r552, %r553, %r554}, {%r1477, %r1476}, {%f148, %f149, %f150, %f151};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f156, %f157, %f158, %f159}, {%r551, %r552, %r553, %r554}, {%r1475, %r1474}, {%f156, %f157, %f158, %f159};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f164, %f165, %f166, %f167}, {%r551, %r552, %r553, %r554}, {%r1473, %r1472}, {%f164, %f165, %f166, %f167};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f172, %f173, %f174, %f175}, {%r551, %r552, %r553, %r554}, {%r1471, %r1470}, {%f172, %f173, %f174, %f175};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f148, %f149, %f150, %f151}, {%r556, %r557, %r558, %r559}, {%r1469, %r1468}, {%f148, %f149, %f150, %f151};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f156, %f157, %f158, %f159}, {%r556, %r557, %r558, %r559}, {%r1467, %r1466}, {%f156, %f157, %f158, %f159};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f164, %f165, %f166, %f167}, {%r556, %r557, %r558, %r559}, {%r1465, %r1494}, {%f164, %f165, %f166, %f167};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f172, %f173, %f174, %f175}, {%r556, %r557, %r558, %r559}, {%r1495, %r1496}, {%f172, %f173, %f174, %f175};
// end inline asm
mul.ftz.f32 %f244, %f1, %f148;
mul.ftz.f32 %f245, %f1, %f149;
mul.ftz.f32 %f246, %f1, %f156;
mul.ftz.f32 %f247, %f1, %f157;
mul.ftz.f32 %f248, %f1, %f150;
mul.ftz.f32 %f249, %f1, %f151;
mul.ftz.f32 %f250, %f1, %f158;
mul.ftz.f32 %f251, %f1, %f159;
mul.ftz.f32 %f252, %f1, %f164;
mul.ftz.f32 %f253, %f1, %f165;
mul.ftz.f32 %f254, %f1, %f172;
mul.ftz.f32 %f255, %f1, %f173;
mul.ftz.f32 %f256, %f1, %f166;
mul.ftz.f32 %f257, %f1, %f167;
mul.ftz.f32 %f258, %f1, %f174;
mul.ftz.f32 %f259, %f1, %f175;
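Each mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 above multiplies a 16x16 fp16 A fragment (4 registers) by a 16x8 fp16 B fragment (2 registers) and accumulates 4 fp32 values per thread; the 16 calls tile one warp's 16x32 block of S = Q.K^T across the 64-element head dimension before the %f1 softmax scale is applied. A hypothetical single-call wrapper (sm_80+):

#include <cstdint>

__device__ void mma_m16n8k16_f16f32(float (&d)[4],
                                    const uint32_t (&a)[4],  // 8 halves of A
                                    const uint32_t (&b)[2],  // 4 halves of B
                                    const float (&c)[4]) {   // accumulator in
    asm volatile(
        "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
        "{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%10, %11, %12, %13};\n"
        : "=f"(d[0]), "=f"(d[1]), "=f"(d[2]), "=f"(d[3])
        : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]),
          "r"(b[0]), "r"(b[1]),
          "f"(c[0]), "f"(c[1]), "f"(c[2]), "f"(c[3]));
}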
// Mask key columns past the sequence end to -inf before the softmax.
setp.lt.s32 %p69, %r1462, %r1;
selp.f32 %f655, %f244, 0fFF800000, %p69;
add.s32 %r949, %r1462, 1;
setp.lt.s32 %p70, %r949, %r1;
selp.f32 %f654, %f245, 0fFF800000, %p70;
add.s32 %r950, %r1462, 8;
setp.lt.s32 %p71, %r950, %r1;
selp.f32 %f653, %f246, 0fFF800000, %p71;
add.s32 %r951, %r1462, 9;
setp.lt.s32 %p72, %r951, %r1;
selp.f32 %f652, %f247, 0fFF800000, %p72;
add.s32 %r952, %r1462, 16;
setp.lt.s32 %p73, %r952, %r1;
selp.f32 %f651, %f252, 0fFF800000, %p73;
add.s32 %r953, %r1462, 17;
setp.lt.s32 %p74, %r953, %r1;
selp.f32 %f650, %f253, 0fFF800000, %p74;
add.s32 %r954, %r1462, 24;
setp.lt.s32 %p75, %r954, %r1;
selp.f32 %f649, %f254, 0fFF800000, %p75;
add.s32 %r955, %r1462, 25;
setp.lt.s32 %p76, %r955, %r1;
selp.f32 %f648, %f255, 0fFF800000, %p76;
selp.f32 %f647, %f248, 0fFF800000, %p69;
selp.f32 %f646, %f249, 0fFF800000, %p70;
selp.f32 %f645, %f250, 0fFF800000, %p71;
selp.f32 %f644, %f251, 0fFF800000, %p72;
selp.f32 %f643, %f256, 0fFF800000, %p73;
selp.f32 %f642, %f257, 0fFF800000, %p74;
selp.f32 %f641, %f258, 0fFF800000, %p75;
selp.f32 %f640, %f259, 0fFF800000, %p76;
@%p20 bra $L__BB0_40;
// Add the fp16 attention bias (converted to fp32) to the scaled scores.
// begin inline asm
cvt.f32.f16 %f260, %rs131;
// end inline asm
add.ftz.f32 %f655, %f260, %f655;
// begin inline asm
cvt.f32.f16 %f261, %rs132;
// end inline asm
add.ftz.f32 %f654, %f261, %f654;
// begin inline asm
cvt.f32.f16 %f262, %rs133;
// end inline asm
add.ftz.f32 %f653, %f262, %f653;
// begin inline asm
cvt.f32.f16 %f263, %rs134;
// end inline asm
add.ftz.f32 %f652, %f263, %f652;
// begin inline asm
cvt.f32.f16 %f264, %rs135;
// end inline asm
add.ftz.f32 %f651, %f264, %f651;
// begin inline asm
cvt.f32.f16 %f265, %rs136;
// end inline asm
add.ftz.f32 %f650, %f265, %f650;
// begin inline asm
cvt.f32.f16 %f266, %rs137;
// end inline asm
add.ftz.f32 %f649, %f266, %f649;
// begin inline asm
cvt.f32.f16 %f267, %rs138;
// end inline asm
add.ftz.f32 %f648, %f267, %f648;
// begin inline asm
cvt.f32.f16 %f268, %rs139;
// end inline asm
add.ftz.f32 %f647, %f268, %f647;
// begin inline asm
cvt.f32.f16 %f269, %rs140;
// end inline asm
add.ftz.f32 %f646, %f269, %f646;
// begin inline asm
cvt.f32.f16 %f270, %rs141;
// end inline asm
add.ftz.f32 %f645, %f270, %f645;
// begin inline asm
cvt.f32.f16 %f271, %rs142;
// end inline asm
add.ftz.f32 %f644, %f271, %f644;
// begin inline asm
cvt.f32.f16 %f272, %rs143;
// end inline asm
add.ftz.f32 %f643, %f272, %f643;
// begin inline asm
cvt.f32.f16 %f273, %rs144;
// end inline asm
add.ftz.f32 %f642, %f273, %f642;
// begin inline asm
cvt.f32.f16 %f274, %rs145;
// end inline asm
add.ftz.f32 %f641, %f274, %f641;
// begin inline asm
cvt.f32.f16 %f275, %rs146;
// end inline asm
add.ftz.f32 %f640, %f275, %f640;
$L__BB0_40:
// Online softmax for this tile: per-thread max over its 8 entries per row
// half, butterfly-shuffle max across the 4 lanes sharing a row,
// exp2((x - max) * log2(e)), row sums via the same shuffles, then fold the
// tile into the running max/sum.
add.s32 %r1316, %r1461, 32;
setp.ge.s32 %p118, %r1316, %r128;
setp.gt.ftz.f32 %p79, %f655, %f654;
selp.f32 %f276, %f655, %f654, %p79;
setp.gt.ftz.f32 %p80, %f276, %f653;
selp.f32 %f277, %f276, %f653, %p80;
setp.gt.ftz.f32 %p81, %f277, %f652;
selp.f32 %f278, %f277, %f652, %p81;
setp.gt.ftz.f32 %p82, %f278, %f651;
selp.f32 %f279, %f278, %f651, %p82;
setp.gt.ftz.f32 %p83, %f279, %f650;
selp.f32 %f280, %f279, %f650, %p83;
setp.gt.ftz.f32 %p84, %f280, %f649;
selp.f32 %f281, %f280, %f649, %p84;
setp.gt.ftz.f32 %p85, %f281, %f648;
selp.f32 %f282, %f281, %f648, %p85;
setp.gt.ftz.f32 %p86, %f647, %f646;
selp.f32 %f283, %f647, %f646, %p86;
setp.gt.ftz.f32 %p87, %f283, %f645;
selp.f32 %f284, %f283, %f645, %p87;
setp.gt.ftz.f32 %p88, %f284, %f644;
selp.f32 %f285, %f284, %f644, %p88;
setp.gt.ftz.f32 %p89, %f285, %f643;
selp.f32 %f286, %f285, %f643, %p89;
setp.gt.ftz.f32 %p90, %f286, %f642;
selp.f32 %f287, %f286, %f642, %p90;
setp.gt.ftz.f32 %p91, %f287, %f641;
selp.f32 %f288, %f287, %f641, %p91;
setp.gt.ftz.f32 %p92, %f288, %f640;
selp.f32 %f289, %f288, %f640, %p92;
mov.b32 %r957, %f282;
mov.u32 %r958, 31;
mov.u32 %r959, 1;
mov.u32 %r960, -1;
shfl.sync.bfly.b32 %r961|%p93, %r957, %r959, %r958, %r960;
mov.b32 %f290, %r961;
setp.gt.ftz.f32 %p94, %f282, %f290;
selp.f32 %f291, %f282, %f290, %p94;
mov.b32 %r962, %f291;
mov.u32 %r963, 2;
shfl.sync.bfly.b32 %r964|%p95, %r962, %r963, %r958, %r960;
mov.b32 %f292, %r964;
setp.gt.ftz.f32 %p96, %f291, %f292;
selp.f32 %f293, %f291, %f292, %p96;
mov.b32 %r965, %f289;
shfl.sync.bfly.b32 %r966|%p97, %r965, %r959, %r958, %r960;
mov.b32 %f294, %r966;
setp.gt.ftz.f32 %p98, %f289, %f294;
selp.f32 %f295, %f289, %f294, %p98;
mov.b32 %r967, %f295;
shfl.sync.bfly.b32 %r968|%p99, %r967, %r963, %r958, %r960;
mov.b32 %f296, %r968;
setp.gt.ftz.f32 %p100, %f295, %f296;
selp.f32 %f297, %f295, %f296, %p100;
max.ftz.f32 %f54, %f293, %f639;
max.ftz.f32 %f55, %f297, %f638;
sub.ftz.f32 %f298, %f655, %f54;
mul.ftz.f32 %f299, %f298, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f56, %f299;
sub.ftz.f32 %f300, %f654, %f54;
mul.ftz.f32 %f301, %f300, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f57, %f301;
sub.ftz.f32 %f302, %f653, %f54;
mul.ftz.f32 %f303, %f302, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f58, %f303;
sub.ftz.f32 %f304, %f652, %f54;
mul.ftz.f32 %f305, %f304, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f59, %f305;
sub.ftz.f32 %f306, %f651, %f54;
mul.ftz.f32 %f307, %f306, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f60, %f307;
sub.ftz.f32 %f308, %f650, %f54;
mul.ftz.f32 %f309, %f308, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f61, %f309;
sub.ftz.f32 %f310, %f649, %f54;
mul.ftz.f32 %f311, %f310, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f62, %f311;
sub.ftz.f32 %f312, %f648, %f54;
mul.ftz.f32 %f313, %f312, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f63, %f313;
sub.ftz.f32 %f314, %f647, %f55;
mul.ftz.f32 %f315, %f314, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f64, %f315;
sub.ftz.f32 %f316, %f646, %f55;
mul.ftz.f32 %f317, %f316, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f65, %f317;
sub.ftz.f32 %f318, %f645, %f55;
mul.ftz.f32 %f319, %f318, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f66, %f319;
sub.ftz.f32 %f320, %f644, %f55;
mul.ftz.f32 %f321, %f320, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f67, %f321;
sub.ftz.f32 %f322, %f643, %f55;
mul.ftz.f32 %f323, %f322, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f68, %f323;
sub.ftz.f32 %f324, %f642, %f55;
mul.ftz.f32 %f325, %f324, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f69, %f325;
sub.ftz.f32 %f326, %f641, %f55;
mul.ftz.f32 %f327, %f326, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f70, %f327;
sub.ftz.f32 %f328, %f640, %f55;
mul.ftz.f32 %f329, %f328, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f71, %f329;
add.ftz.f32 %f330, %f56, %f57;
add.ftz.f32 %f331, %f330, 0f00000000;
add.ftz.f32 %f332, %f58, %f59;
add.ftz.f32 %f333, %f332, 0f00000000;
add.ftz.f32 %f334, %f60, %f61;
add.ftz.f32 %f335, %f331, %f334;
add.ftz.f32 %f336, %f62, %f63;
add.ftz.f32 %f337, %f333, %f336;
add.ftz.f32 %f338, %f335, %f337;
add.ftz.f32 %f339, %f64, %f65;
add.ftz.f32 %f340, %f339, 0f00000000;
add.ftz.f32 %f341, %f66, %f67;
add.ftz.f32 %f342, %f341, 0f00000000;
add.ftz.f32 %f343, %f68, %f69;
add.ftz.f32 %f344, %f340, %f343;
add.ftz.f32 %f345, %f70, %f71;
add.ftz.f32 %f346, %f342, %f345;
add.ftz.f32 %f347, %f344, %f346;
mov.b32 %r969, %f338;
shfl.sync.bfly.b32 %r970|%p101, %r969, %r959, %r958, %r960;
mov.b32 %f348, %r970;
add.ftz.f32 %f349, %f338, %f348;
mov.b32 %r971, %f349;
shfl.sync.bfly.b32 %r972|%p102, %r971, %r963, %r958, %r960;
mov.b32 %f350, %r972;
add.ftz.f32 %f351, %f349, %f350;
mov.b32 %r973, %f347;
shfl.sync.bfly.b32 %r974|%p103, %r973, %r959, %r958, %r960;
mov.b32 %f352, %r974;
add.ftz.f32 %f353, %f347, %f352;
mov.b32 %r975, %f353;
shfl.sync.bfly.b32 %r976|%p104, %r975, %r963, %r958, %r960;
mov.b32 %f354, %r976;
add.ftz.f32 %f355, %f353, %f354;
// Rescale the previous running sums by exp(old_max - new_max) and add the
// tile sums.
sub.ftz.f32 %f356, %f639, %f54;
mul.ftz.f32 %f357, %f356, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f358, %f357;
mul.ftz.f32 %f72, %f358, %f637;
add.ftz.f32 %f637, %f72, %f351;
sub.ftz.f32 %f359, %f638, %f55;
mul.ftz.f32 %f360, %f359, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f361, %f360;
mul.ftz.f32 %f74, %f361, %f636;
add.ftz.f32 %f636, %f74, %f355;
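The selp/shfl/ex2 sequence above is the online-softmax step of flash attention: columns masked to 0fFF800000 (-inf) contribute exp(-inf) = 0; the per-row max is reduced across the four lanes sharing a row with two butterfly shuffles; exp is computed as ex2(x * log2(e)), which is what the 0f3FB8AA3B constant (log2 e) is for; and the previous running sum is rescaled by exp(old_max - new_max) before the tile's sum is added. In illustrative CUDA (hypothetical names; the per-thread count n stands in for the 8 entries per row half above):

__device__ float quad_max(float v) {            // shfl.sync.bfly over lanes 1, 2
    v = fmaxf(v, __shfl_xor_sync(0xffffffffu, v, 1));
    v = fmaxf(v, __shfl_xor_sync(0xffffffffu, v, 2));
    return v;
}

__device__ void online_softmax_step(const float* s, int n,  // this tile's scores
                                    float& run_max, float& run_sum) {
    const float LOG2E = 1.4426950408889634f;    // 0f3FB8AA3B above
    float m = run_max;
    for (int i = 0; i < n; ++i) m = fmaxf(m, s[i]);
    m = quad_max(m);                            // row max across the quad
    float sum = 0.f;
    for (int i = 0; i < n; ++i)
        sum += exp2f((s[i] - m) * LOG2E);       // == expf(s[i] - m)
    // (the kernel butterfly-reduces this partial sum across the quad too)
    run_sum = run_sum * exp2f((run_max - m) * LOG2E) + sum;
    run_max = m;
}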
// If another tile remains, wait for the prefetched copies and reload the K
// fragments for the next iteration.
@%p118 bra $L__BB0_42;
// begin inline asm
cp.async.wait_group 0;
// end inline asm
bar.sync 0;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1493, %r1492, %r1491, %r1490}, [%r565];
// end inline asm
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1489, %r1488, %r1487, %r1486}, [%r570];
// end inline asm
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1485, %r1484, %r1483, %r1482}, [%r575];
// end inline asm
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1481, %r1480, %r1479, %r1478}, [%r580];
// end inline asm
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1477, %r1476, %r1475, %r1474}, [%r585];
// end inline asm
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1473, %r1472, %r1471, %r1470}, [%r590];
// end inline asm
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1469, %r1468, %r1467, %r1466}, [%r595];
// end inline asm
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1465, %r1494, %r1495, %r1496}, [%r600];
// end inline asm
$L__BB0_42:
// Convert this tile's probabilities to fp16 pairs for the P.V GEMM, then
// zero the 32 partial accumulators.
// begin inline asm
cvt.rn.f16x2.f32 %r1031, %f57, %f56;
// end inline asm
// begin inline asm
cvt.rn.f16x2.f32 %r1032, %f65, %f64;
// end inline asm
// begin inline asm
cvt.rn.f16x2.f32 %r1033, %f59, %f58;
// end inline asm
// begin inline asm
cvt.rn.f16x2.f32 %r1034, %f67, %f66;
// end inline asm
// begin inline asm
cvt.rn.f16x2.f32 %r1035, %f61, %f60;
// end inline asm
// begin inline asm
cvt.rn.f16x2.f32 %r1036, %f69, %f68;
// end inline asm
// begin inline asm
cvt.rn.f16x2.f32 %r1037, %f63, %f62;
// end inline asm
// begin inline asm
cvt.rn.f16x2.f32 %r1038, %f71, %f70;
// end inline asm
// begin inline asm
mov.u32 %r1039, 0;
// end inline asm
// begin inline asm
mov.u32 %r1040, 0;
// end inline asm
// begin inline asm
mov.u32 %r1041, 0;
// end inline asm
// begin inline asm
mov.u32 %r1042, 0;
// end inline asm
// begin inline asm
mov.u32 %r1043, 0;
// end inline asm
// begin inline asm
mov.u32 %r1044, 0;
// end inline asm
// begin inline asm
mov.u32 %r1045, 0;
// end inline asm
// begin inline asm
mov.u32 %r1046, 0;
// end inline asm
// begin inline asm
mov.u32 %r1047, 0;
// end inline asm
// begin inline asm
mov.u32 %r1048, 0;
// end inline asm
// begin inline asm
mov.u32 %r1049, 0;
// end inline asm
// begin inline asm
mov.u32 %r1050, 0;
// end inline asm
// begin inline asm
mov.u32 %r1051, 0;
// end inline asm
// begin inline asm
mov.u32 %r1052, 0;
// end inline asm
// begin inline asm
mov.u32 %r1053, 0;
// end inline asm
// begin inline asm
mov.u32 %r1054, 0;
// end inline asm
// begin inline asm
mov.u32 %r1055, 0;
// end inline asm
// begin inline asm
mov.u32 %r1056, 0;
// end inline asm
// begin inline asm
mov.u32 %r1057, 0;
// end inline asm
// begin inline asm
mov.u32 %r1058, 0;
// end inline asm
// begin inline asm
mov.u32 %r1059, 0;
// end inline asm
// begin inline asm
mov.u32 %r1060, 0;
// end inline asm
// begin inline asm
mov.u32 %r1061, 0;
// end inline asm
// begin inline asm
mov.u32 %r1062, 0;
// end inline asm
// begin inline asm
mov.u32 %r1063, 0;
// end inline asm
// begin inline asm
mov.u32 %r1064, 0;
// end inline asm
// begin inline asm
mov.u32 %r1065, 0;
// end inline asm
// begin inline asm
mov.u32 %r1066, 0;
// end inline asm
// begin inline asm
mov.u32 %r1067, 0;
// end inline asm
// begin inline asm
mov.u32 %r1068, 0;
// end inline asm
// begin inline asm
mov.u32 %r1069, 0;
// end inline asm
// begin inline asm
mov.u32 %r1070, 0;
// end inline asm
mov.b32 %f442, %r1039;
mov.b32 %f443, %r1040;
mov.b32 %f444, %r1041;
mov.b32 %f445, %r1042;
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f442, %f443, %f444, %f445}, {%r1031, %r1032, %r1033, %r1034}, {%r1528, %r1527}, {%f442, %f443, %f444, %f445};
// end inline asm
mov.b32 %f450, %r1043;
mov.b32 %f451, %r1044;
mov.b32 %f452, %r1045;
mov.b32 %f453, %r1046;
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f450, %f451, %f452, %f453}, {%r1031, %r1032, %r1033, %r1034}, {%r1526, %r1525}, {%f450, %f451, %f452, %f453};
// end inline asm
mov.b32 %f458, %r1047;
mov.b32 %f459, %r1048;
mov.b32 %f460, %r1049;
mov.b32 %f461, %r1050;
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f458, %f459, %f460, %f461}, {%r1031, %r1032, %r1033, %r1034}, {%r1524, %r1523}, {%f458, %f459, %f460, %f461};
// end inline asm
mov.b32 %f466, %r1051;
mov.b32 %f467, %r1052;
mov.b32 %f468, %r1053;
mov.b32 %f469, %r1054;
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f466, %f467, %f468, %f469}, {%r1031, %r1032, %r1033, %r1034}, {%r1522, %r1521}, {%f466, %f467, %f468, %f469};
// end inline asm
mov.b32 %f474, %r1055;
mov.b32 %f475, %r1056;
mov.b32 %f476, %r1057;
mov.b32 %f477, %r1058;
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f474, %f475, %f476, %f477}, {%r1031, %r1032, %r1033, %r1034}, {%r1520, %r1519}, {%f474, %f475, %f476, %f477};
// end inline asm
mov.b32 %f482, %r1059;
mov.b32 %f483, %r1060;
mov.b32 %f484, %r1061;
mov.b32 %f485, %r1062;
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f482, %f483, %f484, %f485}, {%r1031, %r1032, %r1033, %r1034}, {%r1518, %r1517}, {%f482, %f483, %f484, %f485};
// end inline asm
mov.b32 %f490, %r1063;
mov.b32 %f491, %r1064;
mov.b32 %f492, %r1065;
mov.b32 %f493, %r1066;
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f490, %f491, %f492, %f493}, {%r1031, %r1032, %r1033, %r1034}, {%r1516, %r1515}, {%f490, %f491, %f492, %f493};
// end inline asm
mov.b32 %f498, %r1067;
mov.b32 %f499, %r1068;
mov.b32 %f500, %r1069;
mov.b32 %f501, %r1070;
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f498, %f499, %f500, %f501}, {%r1031, %r1032, %r1033, %r1034}, {%r1514, %r1513}, {%f498, %f499, %f500, %f501};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f442, %f443, %f444, %f445}, {%r1035, %r1036, %r1037, %r1038}, {%r1512, %r1511}, {%f442, %f443, %f444, %f445};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f450, %f451, %f452, %f453}, {%r1035, %r1036, %r1037, %r1038}, {%r1510, %r1509}, {%f450, %f451, %f452, %f453};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f458, %f459, %f460, %f461}, {%r1035, %r1036, %r1037, %r1038}, {%r1508, %r1507}, {%f458, %f459, %f460, %f461};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f466, %f467, %f468, %f469}, {%r1035, %r1036, %r1037, %r1038}, {%r1506, %r1505}, {%f466, %f467, %f468, %f469};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f474, %f475, %f476, %f477}, {%r1035, %r1036, %r1037, %r1038}, {%r1504, %r1503}, {%f474, %f475, %f476, %f477};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f482, %f483, %f484, %f485}, {%r1035, %r1036, %r1037, %r1038}, {%r1502, %r1501}, {%f482, %f483, %f484, %f485};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f490, %f491, %f492, %f493}, {%r1035, %r1036, %r1037, %r1038}, {%r1500, %r1499}, {%f490, %f491, %f492, %f493};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f498, %f499, %f500, %f501}, {%r1035, %r1036, %r1037, %r1038}, {%r1498, %r1497}, {%f498, %f499, %f500, %f501};
// end inline asm
// Reciprocals of the running sums, guarded so an all-masked row divides by 1.
setp.equ.ftz.f32 %p105, %f637, 0f00000000;
mov.f32 %f657, 0f3F800000;
mov.f32 %f656, %f657;
@%p105 bra $L__BB0_44;
rcp.approx.ftz.f32 %f656, %f637;
$L__BB0_44:
setp.equ.ftz.f32 %p106, %f636, 0f00000000;
@%p106 bra $L__BB0_46;
rcp.approx.ftz.f32 %f657, %f636;
$L__BB0_46:
add.s32 %r1317, %r1461, 32;
setp.ge.s32 %p119, %r1317, %r128;
// Fold the new P.V partials into the running O accumulators: rescale the old
// value by the correction %f72/%f74, add, and renormalize by 1/sum.
mov.b32 %f508, %r1560;
fma.rn.ftz.f32 %f509, %f72, %f508, %f442;
mul.ftz.f32 %f510, %f656, %f509;
mov.b32 %r1560, %f510;
mov.b32 %f511, %r1559;
fma.rn.ftz.f32 %f512, %f72, %f511, %f443;
mul.ftz.f32 %f513, %f656, %f512;
mov.b32 %r1559, %f513;
mov.b32 %f514, %r1558;
fma.rn.ftz.f32 %f515, %f74, %f514, %f444;
mul.ftz.f32 %f516, %f657, %f515;
mov.b32 %r1558, %f516;
mov.b32 %f517, %r1557;
fma.rn.ftz.f32 %f518, %f74, %f517, %f445;
mul.ftz.f32 %f519, %f657, %f518;
mov.b32 %r1557, %f519;
mov.b32 %f520, %r1556;
fma.rn.ftz.f32 %f521, %f72, %f520, %f450;
mul.ftz.f32 %f522, %f656, %f521;
mov.b32 %r1556, %f522;
mov.b32 %f523, %r1555;
fma.rn.ftz.f32 %f524, %f72, %f523, %f451;
mul.ftz.f32 %f525, %f656, %f524;
mov.b32 %r1555, %f525;
mov.b32 %f526, %r1554;
fma.rn.ftz.f32 %f527, %f74, %f526, %f452;
mul.ftz.f32 %f528, %f657, %f527;
mov.b32 %r1554, %f528;
mov.b32 %f529, %r1553;
fma.rn.ftz.f32 %f530, %f74, %f529, %f453;
mul.ftz.f32 %f531, %f657, %f530;
mov.b32 %r1553, %f531;
mov.b32 %f532, %r1552;
fma.rn.ftz.f32 %f533, %f72, %f532, %f458;
mul.ftz.f32 %f534, %f656, %f533;
mov.b32 %r1552, %f534;
mov.b32 %f535, %r1551;
fma.rn.ftz.f32 %f536, %f72, %f535, %f459;
mul.ftz.f32 %f537, %f656, %f536;
mov.b32 %r1551, %f537;
mov.b32 %f538, %r1550;
fma.rn.ftz.f32 %f539, %f74, %f538, %f460;
mul.ftz.f32 %f540, %f657, %f539;
mov.b32 %r1550, %f540;
mov.b32 %f541, %r1549;
fma.rn.ftz.f32 %f542, %f74, %f541, %f461;
mul.ftz.f32 %f543, %f657, %f542;
mov.b32 %r1549, %f543;
mov.b32 %f544, %r1548;
fma.rn.ftz.f32 %f545, %f72, %f544, %f466;
mul.ftz.f32 %f546, %f656, %f545;
mov.b32 %r1548, %f546;
mov.b32 %f547, %r1547;
fma.rn.ftz.f32 %f548, %f72, %f547, %f467;
mul.ftz.f32 %f549, %f656, %f548;
mov.b32 %r1547, %f549;
mov.b32 %f550, %r1546;
fma.rn.ftz.f32 %f551, %f74, %f550, %f468;
mul.ftz.f32 %f552, %f657, %f551;
mov.b32 %r1546, %f552;
mov.b32 %f553, %r1545;
fma.rn.ftz.f32 %f554, %f74, %f553, %f469;
mul.ftz.f32 %f555, %f657, %f554;
mov.b32 %r1545, %f555;
mov.b32 %f556, %r1544;
fma.rn.ftz.f32 %f557, %f72, %f556, %f474;
mul.ftz.f32 %f558, %f656, %f557;
mov.b32 %r1544, %f558;
mov.b32 %f559, %r1543;
fma.rn.ftz.f32 %f560, %f72, %f559, %f475;
mul.ftz.f32 %f561, %f656, %f560;
mov.b32 %r1543, %f561;
mov.b32 %f562, %r1542;
fma.rn.ftz.f32 %f563, %f74, %f562, %f476;
mul.ftz.f32 %f564, %f657, %f563;
mov.b32 %r1542, %f564;
mov.b32 %f565, %r1541;
fma.rn.ftz.f32 %f566, %f74, %f565, %f477;
mul.ftz.f32 %f567, %f657, %f566;
mov.b32 %r1541, %f567;
mov.b32 %f568, %r1540;
fma.rn.ftz.f32 %f569, %f72, %f568, %f482;
mul.ftz.f32 %f570, %f656, %f569;
mov.b32 %r1540, %f570;
mov.b32 %f571, %r1539;
fma.rn.ftz.f32 %f572, %f72, %f571, %f483;
mul.ftz.f32 %f573, %f656, %f572;
mov.b32 %r1539, %f573;
mov.b32 %f574, %r1538;
fma.rn.ftz.f32 %f575, %f74, %f574, %f484;
mul.ftz.f32 %f576, %f657, %f575;
mov.b32 %r1538, %f576;
mov.b32 %f577, %r1537;
fma.rn.ftz.f32 %f578, %f74, %f577, %f485;
mul.ftz.f32 %f579, %f657, %f578;
mov.b32 %r1537, %f579;
mov.b32 %f580, %r1536;
fma.rn.ftz.f32 %f581, %f72, %f580, %f490;
mul.ftz.f32 %f582, %f656, %f581;
mov.b32 %r1536, %f582;
mov.b32 %f583, %r1535;
fma.rn.ftz.f32 %f584, %f72, %f583, %f491;
mul.ftz.f32 %f585, %f656, %f584;
mov.b32 %r1535, %f585;
mov.b32 %f586, %r1534;
fma.rn.ftz.f32 %f587, %f74, %f586, %f492;
mul.ftz.f32 %f588, %f657, %f587;
mov.b32 %r1534, %f588;
mov.b32 %f589, %r1533;
fma.rn.ftz.f32 %f590, %f74, %f589, %f493;
mul.ftz.f32 %f591, %f657, %f590;
mov.b32 %r1533, %f591;
mov.b32 %f592, %r1532;
fma.rn.ftz.f32 %f593, %f72, %f592, %f498;
mul.ftz.f32 %f594, %f656, %f593;
mov.b32 %r1532, %f594;
mov.b32 %f595, %r1531;
fma.rn.ftz.f32 %f596, %f72, %f595, %f499;
mul.ftz.f32 %f597, %f656, %f596;
mov.b32 %r1531, %f597;
mov.b32 %f598, %r1530;
fma.rn.ftz.f32 %f599, %f74, %f598, %f500;
mul.ftz.f32 %f600, %f657, %f599;
mov.b32 %r1530, %f600;
mov.b32 %f601, %r1529;
fma.rn.ftz.f32 %f602, %f74, %f601, %f501;
mul.ftz.f32 %f603, %f657, %f602;
mov.b32 %r1529, %f603;
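The fma/mul chain above keeps O normalized after every tile: the previous (already-normalized) accumulator is multiplied by the correction %f72/%f74 = old_sum * exp(old_max - new_max), the fresh P.V partial is added, and the result is scaled by the guarded reciprocal of the new running sum (%f656/%f657). Hedged into scalar form per element:

__device__ float o_update(float o_prev,   // previous normalized O entry
                          float pv,       // this tile's P.V partial
                          float corr,     // old_sum * exp(old_max - new_max)
                          float inv_sum)  // 1 / new running sum (rcp.approx)
{
    return (corr * o_prev + pv) * inv_sum;  // the fma.rn.ftz + mul.ftz pair
}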
// If another tile remains, reload the V fragments for the next iteration.
@%p119 bra $L__BB0_48;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1528, %r1527, %r1526, %r1525}, [%r605];
// end inline asm
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1524, %r1523, %r1522, %r1521}, [%r610];
// end inline asm
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1520, %r1519, %r1518, %r1517}, [%r615];
// end inline asm
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1516, %r1515, %r1514, %r1513}, [%r620];
// end inline asm
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1512, %r1511, %r1510, %r1509}, [%r625];
// end inline asm
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1508, %r1507, %r1506, %r1505}, [%r630];
// end inline asm
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1504, %r1503, %r1502, %r1501}, [%r635];
// end inline asm
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1500, %r1499, %r1498, %r1497}, [%r640];
// end inline asm
$L__BB0_48:
// Advance to the next 32-column key tile; carry the running max forward.
add.s32 %r1461, %r1461, 32;
setp.lt.s32 %p108, %r1461, %r128;
add.s32 %r1462, %r1462, 32;
mov.f32 %f638, %f55;
mov.f32 %f639, %f54;
@%p108 bra $L__BB0_3;
$L__BB0_49:
// Epilogue: recompute per-thread offsets and the output indices.
mov.u32 %r1346, %tid.x;
mov.u32 %r1345, _ZN25fused_multihead_attention5smem_E;
shr.s32 %r1344, %r1346, 31;
shr.u32 %r1343, %r1344, 29;
add.s32 %r1342, %r1346, %r1343;
shr.s32 %r1341, %r1342, 3;
and.b32 %r1340, %r1342, -8;
sub.s32 %r1339, %r1346, %r1340;
shl.b32 %r1338, %r1339, 4;
cvt.s64.s32 %rd152, %r1338;
mov.b64 %rd151, fmha_v2_flash_attention_fp16_64_32_S_64_sm86_kernel_nl_param_0;
mov.u64 %rd150, %rd151;
ld.param.u32 %r1337, [%rd150+60];
mov.u32 %r1336, %ctaid.y;
mov.u32 %r1335, %ctaid.x;
ld.param.u32 %r1334, [fmha_v2_flash_attention_fp16_64_32_S_64_sm86_kernel_nl_param_0+52];
mul.lo.s32 %r1333, %r1, %r1336;
mad.lo.s32 %r1332, %r1333, %r1334, %r1335;
shr.s32 %r1331, %r1342, 31;
shr.u32 %r1330, %r1331, 29;
add.s32 %r1329, %r1341, %r1330;
and.b32 %r1328, %r1329, 268435448;
sub.s32 %r1327, %r1341, %r1328;
xor.b32 %r1326, %r1327, %r1339;
shl.b32 %r1325, %r1341, 7;
shl.b32 %r1324, %r1326, 4;
add.s32 %r1323, %r1345, 8192;
add.s32 %r1322, %r1324, %r1325;
add.s32 %r1321, %r1322, %r1323;
and.b32 %r1320, %r1346, 96;
shr.u32 %r1319, %r1320, 1;
bar.sync 0;
// Convert the normalized O accumulators to f16x2 and stage them through
// shared memory in a swizzled layout (XOR 16/32/.../112) so each thread can
// read back contiguous bytes of one output row.
mov.b32 %f604, %r1559;
mov.b32 %f605, %r1560;
// begin inline asm
cvt.rn.f16x2.f32 %r1222, %f604, %f605;
// end inline asm
mov.b32 %f606, %r1557;
mov.b32 %f607, %r1558;
// begin inline asm
cvt.rn.f16x2.f32 %r1223, %f606, %f607;
// end inline asm
shl.b32 %r1291, %r1346, 2;
and.b32 %r1292, %r1291, 124;
add.s32 %r1294, %r1292, %r1345;
and.b32 %r1297, %r1346, 28;
shr.u32 %r1298, %r1297, 2;
or.b32 %r1299, %r1319, %r1298;
shl.b32 %r1300, %r1299, 7;
add.s32 %r1301, %r1294, %r1300;
add.s32 %r1224, %r1301, 8192;
// begin inline asm
st.shared.b32 [%r1224], %r1222;
// end inline asm
add.s32 %r1226, %r1301, 9216;
// begin inline asm
st.shared.b32 [%r1226], %r1223;
// end inline asm
xor.b32 %r1230, %r1224, 16;
mov.b32 %f608, %r1555;
mov.b32 %f609, %r1556;
// begin inline asm
cvt.rn.f16x2.f32 %r1228, %f608, %f609;
// end inline asm
mov.b32 %f610, %r1553;
mov.b32 %f611, %r1554;
// begin inline asm
cvt.rn.f16x2.f32 %r1229, %f610, %f611;
// end inline asm
// begin inline asm
st.shared.b32 [%r1230], %r1228;
// end inline asm
add.s32 %r1232, %r1230, 1024;
// begin inline asm
st.shared.b32 [%r1232], %r1229;
// end inline asm
xor.b32 %r1236, %r1224, 32;
mov.b32 %f612, %r1551;
mov.b32 %f613, %r1552;
// begin inline asm
cvt.rn.f16x2.f32 %r1234, %f612, %f613;
// end inline asm
mov.b32 %f614, %r1549;
mov.b32 %f615, %r1550;
// begin inline asm
cvt.rn.f16x2.f32 %r1235, %f614, %f615;
// end inline asm
// begin inline asm
st.shared.b32 [%r1236], %r1234;
// end inline asm
add.s32 %r1238, %r1236, 1024;
// begin inline asm
st.shared.b32 [%r1238], %r1235;
// end inline asm
xor.b32 %r1242, %r1224, 48;
mov.b32 %f616, %r1547;
mov.b32 %f617, %r1548;
// begin inline asm
cvt.rn.f16x2.f32 %r1240, %f616, %f617;
// end inline asm
mov.b32 %f618, %r1545;
mov.b32 %f619, %r1546;
// begin inline asm
cvt.rn.f16x2.f32 %r1241, %f618, %f619;
// end inline asm
// begin inline asm
st.shared.b32 [%r1242], %r1240;
// end inline asm
add.s32 %r1244, %r1242, 1024;
// begin inline asm
st.shared.b32 [%r1244], %r1241;
// end inline asm
xor.b32 %r1248, %r1224, 64;
mov.b32 %f620, %r1543;
mov.b32 %f621, %r1544;
// begin inline asm
cvt.rn.f16x2.f32 %r1246, %f620, %f621;
// end inline asm
mov.b32 %f622, %r1541;
mov.b32 %f623, %r1542;
// begin inline asm
cvt.rn.f16x2.f32 %r1247, %f622, %f623;
// end inline asm
// begin inline asm
st.shared.b32 [%r1248], %r1246;
// end inline asm
add.s32 %r1250, %r1248, 1024;
// begin inline asm
st.shared.b32 [%r1250], %r1247;
// end inline asm
xor.b32 %r1254, %r1224, 80;
mov.b32 %f624, %r1539;
mov.b32 %f625, %r1540;
// begin inline asm
cvt.rn.f16x2.f32 %r1252, %f624, %f625;
// end inline asm
mov.b32 %f626, %r1537;
mov.b32 %f627, %r1538;
// begin inline asm
cvt.rn.f16x2.f32 %r1253, %f626, %f627;
// end inline asm
// begin inline asm
st.shared.b32 [%r1254], %r1252;
// end inline asm
add.s32 %r1256, %r1254, 1024;
// begin inline asm
st.shared.b32 [%r1256], %r1253;
// end inline asm
xor.b32 %r1260, %r1224, 96;
mov.b32 %f628, %r1535;
mov.b32 %f629, %r1536;
// begin inline asm
cvt.rn.f16x2.f32 %r1258, %f628, %f629;
// end inline asm
mov.b32 %f630, %r1533;
mov.b32 %f631, %r1534;
// begin inline asm
cvt.rn.f16x2.f32 %r1259, %f630, %f631;
// end inline asm
// begin inline asm
st.shared.b32 [%r1260], %r1258;
// end inline asm
add.s32 %r1262, %r1260, 1024;
// begin inline asm
st.shared.b32 [%r1262], %r1259;
// end inline asm
xor.b32 %r1266, %r1224, 112;
mov.b32 %f632, %r1531;
mov.b32 %f633, %r1532;
// begin inline asm
cvt.rn.f16x2.f32 %r1264, %f632, %f633;
// end inline asm
mov.b32 %f634, %r1529;
mov.b32 %f635, %r1530;
// begin inline asm
cvt.rn.f16x2.f32 %r1265, %f634, %f635;
// end inline asm
// begin inline asm
st.shared.b32 [%r1266], %r1264;
// end inline asm
add.s32 %r1268, %r1266, 1024;
// begin inline asm
st.shared.b32 [%r1268], %r1265;
// end inline asm
bar.sync 0;
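The shared-memory round trip above exists because each thread's MMA accumulators cover scattered two-column strips of O: the f16x2 pairs are first stored to a swizzled layout, and after the barrier each thread reads back one 16-byte vector per 16-row slice with ld.shared.v4.b32, which is what makes the single st.global.v4 per slice below possible. A hypothetical wrapper for the vector read:

#include <cstdint>

__device__ void ld_shared_v4(uint32_t (&d)[4], uint32_t smem_addr) {
    asm volatile("ld.shared.v4.b32 {%0, %1, %2, %3}, [%4];\n"
                 : "=r"(d[0]), "=r"(d[1]), "=r"(d[2]), "=r"(d[3])
                 : "r"(smem_addr));
}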
// Read back four 16-byte row chunks and write them to the output tensor,
// one bounds-checked st.global.v4 per 16-row slice.
// begin inline asm
ld.shared.v4.b32 {%r1270, %r1271, %r1272, %r1273}, [%r1321];
// end inline asm
add.s32 %r1279, %r1321, 2048;
// begin inline asm
ld.shared.v4.b32 {%r1275, %r1276, %r1277, %r1278}, [%r1279];
// end inline asm
add.s32 %r1284, %r1321, 4096;
// begin inline asm
ld.shared.v4.b32 {%r1280, %r1281, %r1282, %r1283}, [%r1284];
// end inline asm
add.s32 %r1289, %r1321, 6144;
// begin inline asm
ld.shared.v4.b32 {%r1285, %r1286, %r1287, %r1288}, [%r1289];
// end inline asm
mul.lo.s32 %r1306, %r1332, %r1337;
shl.b32 %r1307, %r1306, 1;
cvt.s64.s32 %rd129, %r1307;
add.s64 %rd39, %rd129, %rd152;
cvt.u32.u64 %r1309, %rd5;
setp.ge.s32 %p109, %r1309, %r1;
@%p109 bra $L__BB0_60;
mov.b64 %rd154, fmha_v2_flash_attention_fp16_64_32_S_64_sm86_kernel_nl_param_0;
mov.u64 %rd153, %rd154;
ld.param.u32 %r1357, [%rd153+60];
mov.u32 %r1356, %tid.x;
shr.s32 %r1355, %r1357, 31;
shr.u32 %r1354, %r1355, 29;
add.s32 %r1353, %r1357, %r1354;
shr.s32 %r1352, %r1353, 3;
shr.s32 %r1351, %r1356, 31;
shr.u32 %r1350, %r1351, 29;
add.s32 %r1349, %r1356, %r1350;
and.b32 %r1348, %r1349, -8;
sub.s32 %r1347, %r1356, %r1348;
setp.ge.s32 %p110, %r1347, %r1352;
@%p110 bra $L__BB0_52;
mul.lo.s64 %rd131, %rd10, %rd5;
add.s64 %rd132, %rd39, %rd131;
cvta.to.global.u64 %rd133, %rd11;
add.s64 %rd134, %rd133, %rd132;
st.global.v4.u32 [%rd134], {%r1270, %r1271, %r1272, %r1273};
$L__BB0_52:
add.s32 %r1311, %r1309, 16;
setp.ge.s32 %p111, %r1311, %r1;
@%p111 bra $L__BB0_60;
@%p110 bra $L__BB0_55;
add.s64 %rd135, %rd5, 16;
mul.lo.s64 %rd136, %rd135, %rd10;
add.s64 %rd137, %rd39, %rd136;
cvta.to.global.u64 %rd138, %rd11;
add.s64 %rd139, %rd138, %rd137;
st.global.v4.u32 [%rd139], {%r1275, %r1276, %r1277, %r1278};
$L__BB0_55:
add.s32 %r1313, %r1309, 32;
setp.ge.s32 %p113, %r1313, %r1;
@%p113 bra $L__BB0_60;
@%p110 bra $L__BB0_58;
add.s64 %rd140, %rd5, 32;
mul.lo.s64 %rd141, %rd140, %rd10;
add.s64 %rd142, %rd39, %rd141;
cvta.to.global.u64 %rd143, %rd11;
add.s64 %rd144, %rd143, %rd142;
st.global.v4.u32 [%rd144], {%r1280, %r1281, %r1282, %r1283};
$L__BB0_58:
add.s32 %r1315, %r1309, 48;
setp.ge.s32 %p115, %r1315, %r1;
or.pred %p117, %p115, %p110;
@%p117 bra $L__BB0_60;
add.s64 %rd145, %rd5, 48;
mul.lo.s64 %rd146, %rd145, %rd10;
add.s64 %rd147, %rd39, %rd146;
cvta.to.global.u64 %rd148, %rd11;
add.s64 %rd149, %rd148, %rd147;
st.global.v4.u32 [%rd149], {%r1285, %r1286, %r1287, %r1288};
$L__BB0_60:
ret;
}