%rd1, %rd38; ld.param.u32 %r1, [fmha_v2_flash_attention_fp16_256_16_S_32_sm86_kernel_nl_param_0+56]; cvt.s64.s32 %rd2, %r1; ld.param.u32 %r2, [fmha_v2_flash_attention_fp16_256_16_S_32_sm86_kernel_nl_param_0+52]; mov.u32 %r469, %ctaid.z; shl.b32 %r3, %r469, 8; setp.le.s32 %p1, %r1, %r3; @%p1 bra $L__BB0_113; mov.u32 %r614, %tid.x; mov.u32 %r615, %ctaid.y; mov.u32 %r616, %ctaid.x; mul.lo.s32 %r617, %r1, %r615; mad.lo.s32 %r618, %r617, %r2, %r616; shr.s32 %r619, %r614, 31; shr.u32 %r620, %r619, 27; add.s32 %r621, %r614, %r620; and.b32 %r622, %r621, -32; sub.s32 %r623, %r614, %r622; shr.u32 %r624, %r619, 25; add.s32 %r625, %r614, %r624; shr.s32 %r626, %r625, 7; shl.b32 %r627, %r626, 4; shr.s32 %r628, %r623, 31; shr.u32 %r629, %r628, 30; add.s32 %r630, %r623, %r629; and.b32 %r631, %r630, 2147483644; sub.s32 %r632, %r623, %r631; shl.b32 %r633, %r632, 1; add.s32 %r1579, %r633, %r627; ld.param.u64 %rd3, [%rd1+16]; ld.param.u64 %rd4, [%rd1+40]; shr.s32 %r634, %r621, 5; shr.s32 %r635, %r621, 31; shr.u32 %r636, %r635, 30; add.s32 %r637, %r634, %r636; and.b32 %r638, %r637, 268435452; sub.s32 %r639, %r634, %r638; shl.b32 %r640, %r639, 4; shr.s32 %r641, %r630, 2; add.s32 %r5, %r640, %r641; shr.u32 %r642, %r619, 30; add.s32 %r643, %r614, %r642; shr.s32 %r6, %r643, 2; add.s32 %r644, %r6, %r3; cvt.s64.s32 %rd5, %r644; ld.param.u64 %rd6, [%rd1+24]; mul.lo.s64 %rd49, %rd6, %rd5; mul.lo.s32 %r645, %r618, 3; mul.wide.s32 %rd50, %r645, 64; and.b32 %r646, %r643, -4; sub.s32 %r7, %r614, %r646; shl.b32 %r647, %r7, 4; cvt.s64.s32 %rd51, %r647; add.s64 %rd52, %rd50, %rd51; add.s64 %rd53, %rd52, %rd49; ld.param.u64 %rd54, [%rd1]; add.s64 %rd39, %rd54, %rd53; shr.u32 %r648, %r619, 29; add.s32 %r649, %r614, %r648; shr.s32 %r650, %r649, 3; shr.s32 %r651, %r649, 31; shr.u32 %r652, %r651, 30; add.s32 %r653, %r650, %r652; and.b32 %r654, %r653, 268435452; sub.s32 %r655, %r650, %r654; and.b32 %r656, %r649, 268435448; sub.s32 %r657, %r614, %r656; xor.b32 %r658, %r655, %r657; shl.b32 %r659, %r650, 7; shl.b32 %r660, %r658, 4; mov.u32 %r661, 31; mov.u32 %r1578, 0; mov.u32 %r663, -1; shfl.sync.idx.b32 %r664|%p2, %r1578, %r1578, %r661, %r663; shfl.sync.idx.b32 %r665|%p3, %r1578, %r1578, %r661, %r663; and.b32 %r666, %r614, 96; shr.u32 %r667, %r666, 2; and.b32 %r668, %r614, 14; shr.u32 %r669, %r668, 1; or.b32 %r670, %r667, %r669; and.b32 %r671, %r614, 6; shr.u32 %r672, %r671, 1; shl.b32 %r673, %r614, 2; and.b32 %r674, %r673, 4; or.b32 %r675, %r672, %r674; shl.b32 %r676, %r675, 4; and.b32 %r677, %r614, 16; xor.b32 %r678, %r676, %r677; cvt.s64.s32 %rd55, %r6; mul.lo.s64 %rd56, %rd6, %rd55; add.s32 %r679, %r645, 1; mul.wide.s32 %rd57, %r679, 64; add.s64 %rd58, %rd57, %rd51; add.s64 %rd59, %rd58, %rd56; shfl.sync.idx.b32 %r680|%p4, %r1578, %r1578, %r661, %r663; shfl.sync.idx.b32 %r681|%p5, %r1578, %r1578, %r661, %r663; shr.u32 %r682, %r677, 2; or.b32 %r683, %r672, %r682; and.b32 %r684, %r614, 8; shr.u32 %r685, %r684, 3; xor.b32 %r686, %r675, %r685; add.s32 %r687, %r645, 2; mul.wide.s32 %rd60, %r687, 64; add.s64 %rd61, %rd60, %rd51; add.s64 %rd62, %rd61, %rd56; shfl.sync.idx.b32 %r688|%p6, %r1578, %r1578, %r661, %r663; shfl.sync.idx.b32 %r689|%p7, %r1578, %r1578, %r661, %r663; ld.param.u64 %rd8, [%rd1+32]; ld.param.u64 %rd9, [%rd1+8]; ld.param.u32 %r8, [%rd1+60]; sub.s32 %r690, %r1, %r3; min.s32 %r691, %r690, 256; shr.s32 %r692, %r8, 31; shr.u32 %r693, %r692, 29; add.s32 %r694, %r8, %r693; shr.s32 %r9, %r694, 3; shl.b32 %r695, %r614, 6; and.b32 %r696, %r695, 896; shl.b32 %r697, %r686, 4; shl.b32 %r698, %r683, 7; shl.b32 %r699, %r670, 7; 
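//
// [annotation, not compiler output] The prologue above appears to decode
// tile coordinates and build swizzled shared-memory addresses: %ctaid.z << 8
// selects this CTA's 256-row query tile (with an early exit to $L__BB0_113
// when the sequence length %r1 is <= the tile's starting row), and the
// *3 / *64 address scaling suggests a packed QKV layout with 64-byte rows
// (head size 32, fp16). The xor/shift arithmetic on %tid.x likely produces
// bank-conflict-avoiding offsets for the predicated cp.async tile loads and
// ldmatrix fragment loads that follow.
//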
setp.lt.s32 %p8, %r6, %r691; add.s32 %r700, %r6, 32; setp.lt.s32 %p9, %r700, %r691; add.s32 %r701, %r6, 64; setp.lt.s32 %p10, %r701, %r691; add.s32 %r702, %r6, 96; setp.lt.s32 %p11, %r702, %r691; add.s32 %r703, %r6, 128; setp.lt.s32 %p12, %r703, %r691; add.s32 %r704, %r6, 160; setp.lt.s32 %p13, %r704, %r691; add.s32 %r705, %r6, 192; setp.lt.s32 %p14, %r705, %r691; add.s32 %r706, %r6, 224; setp.lt.s32 %p15, %r706, %r691; add.s32 %r707, %r660, %r659; or.b32 %r708, %r678, %r699; add.s64 %rd264, %rd54, %rd59; or.b32 %r709, %r697, %r698; add.s64 %rd265, %rd54, %rd62; or.b32 %r710, %r678, %r696; mov.u32 %r711, _ZN25fused_multihead_attention5smem_E; add.s32 %r712, %r711, 16384; add.s32 %r10, %r707, %r712; shl.b64 %rd63, %rd6, 5; selp.b32 %r481, 16, 0, %p13; add.s32 %r713, %r707, %r711; add.s32 %r470, %r713, %r665; add.s32 %r472, %r470, 2048; add.s32 %r474, %r470, 4096; add.s32 %r476, %r470, 6144; add.s32 %r478, %r470, 8192; add.s32 %r480, %r470, 10240; add.s32 %r482, %r470, 12288; add.s32 %r484, %r470, 14336; selp.b32 %r471, 16, 0, %p8; // begin inline asm cp.async.cg.shared.global [%r470], [%rd39], 16, %r471; // end inline asm selp.b32 %r473, 16, 0, %p9; add.s64 %rd40, %rd39, %rd63; // begin inline asm cp.async.cg.shared.global [%r472], [%rd40], 16, %r473; // end inline asm selp.b32 %r475, 16, 0, %p10; add.s64 %rd41, %rd40, %rd63; // begin inline asm cp.async.cg.shared.global [%r474], [%rd41], 16, %r475; // end inline asm selp.b32 %r477, 16, 0, %p11; add.s64 %rd42, %rd41, %rd63; // begin inline asm cp.async.cg.shared.global [%r476], [%rd42], 16, %r477; // end inline asm selp.b32 %r479, 16, 0, %p12; add.s64 %rd43, %rd42, %rd63; // begin inline asm cp.async.cg.shared.global [%r478], [%rd43], 16, %r479; // end inline asm add.s64 %rd44, %rd43, %rd63; // begin inline asm cp.async.cg.shared.global [%r480], [%rd44], 16, %r481; // end inline asm selp.b32 %r483, 16, 0, %p14; add.s64 %rd45, %rd44, %rd63; // begin inline asm cp.async.cg.shared.global [%r482], [%rd45], 16, %r483; // end inline asm selp.b32 %r485, 16, 0, %p15; add.s64 %rd46, %rd45, %rd63; // begin inline asm cp.async.cg.shared.global [%r484], [%rd46], 16, %r485; // end inline asm min.s32 %r714, %r1, 16; setp.lt.s32 %p16, %r6, %r714; add.s32 %r486, %r10, %r681; selp.b32 %r489, 16, 0, %p16; // begin inline asm cp.async.cg.shared.global [%r486], [%rd264], 16, %r489; // end inline asm add.s32 %r715, %r711, 18432; add.s32 %r716, %r707, %r715; add.s32 %r488, %r716, %r689; // begin inline asm cp.async.cg.shared.global [%r488], [%rd265], 16, %r489; // end inline asm // begin inline asm cp.async.commit_group; // end inline asm // begin inline asm cp.async.wait_group 0; // end inline asm bar.sync 0; add.s32 %r717, %r664, %r711; add.s32 %r494, %r717, %r708; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r490, %r491, %r492, %r493}, [%r494]; // end inline asm add.s32 %r499, %r494, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r495, %r496, %r497, %r498}, [%r499]; // end inline asm add.s32 %r504, %r494, 8192; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r500, %r501, %r502, %r503}, [%r504]; // end inline asm add.s32 %r509, %r494, 12288; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r505, %r506, %r507, %r508}, [%r509]; // end inline asm xor.b32 %r718, %r708, 32; add.s32 %r514, %r717, %r718; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r510, %r511, %r512, %r513}, [%r514]; // end inline asm add.s32 %r519, %r514, 4096; // begin inline asm 
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r515, %r516, %r517, %r518}, [%r519]; // end inline asm add.s32 %r524, %r514, 8192; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r520, %r521, %r522, %r523}, [%r524]; // end inline asm add.s32 %r529, %r514, 12288; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r525, %r526, %r527, %r528}, [%r529]; // end inline asm add.s32 %r45, %r680, %r712; add.s32 %r534, %r45, %r709; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1584, %r1585, %r1586, %r1587}, [%r534]; // end inline asm xor.b32 %r719, %r709, 32; add.s32 %r539, %r45, %r719; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1588, %r1589, %r1590, %r1591}, [%r539]; // end inline asm add.s32 %r544, %r710, %r715; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1592, %r1593, %r1594, %r1595}, [%r544]; // end inline asm xor.b32 %r720, %r710, 32; add.s32 %r549, %r720, %r715; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1596, %r1597, %r1598, %r1599}, [%r549]; // end inline asm ld.param.f32 %f1, [%rd1+64]; // begin inline asm mov.u32 %r1663, 0; // end inline asm // begin inline asm mov.u32 %r1662, 0; // end inline asm // begin inline asm mov.u32 %r1661, 0; // end inline asm // begin inline asm mov.u32 %r1660, 0; // end inline asm // begin inline asm mov.u32 %r1659, 0; // end inline asm // begin inline asm mov.u32 %r1658, 0; // end inline asm // begin inline asm mov.u32 %r1657, 0; // end inline asm // begin inline asm mov.u32 %r1656, 0; // end inline asm // begin inline asm mov.u32 %r1655, 0; // end inline asm // begin inline asm mov.u32 %r1654, 0; // end inline asm // begin inline asm mov.u32 %r1653, 0; // end inline asm // begin inline asm mov.u32 %r1652, 0; // end inline asm // begin inline asm mov.u32 %r1651, 0; // end inline asm // begin inline asm mov.u32 %r1650, 0; // end inline asm // begin inline asm mov.u32 %r1649, 0; // end inline asm // begin inline asm mov.u32 %r1648, 0; // end inline asm // begin inline asm mov.u32 %r1647, 0; // end inline asm // begin inline asm mov.u32 %r1646, 0; // end inline asm // begin inline asm mov.u32 %r1645, 0; // end inline asm // begin inline asm mov.u32 %r1644, 0; // end inline asm // begin inline asm mov.u32 %r1643, 0; // end inline asm // begin inline asm mov.u32 %r1642, 0; // end inline asm // begin inline asm mov.u32 %r1641, 0; // end inline asm // begin inline asm mov.u32 %r1640, 0; // end inline asm // begin inline asm mov.u32 %r1639, 0; // end inline asm // begin inline asm mov.u32 %r1638, 0; // end inline asm // begin inline asm mov.u32 %r1637, 0; // end inline asm // begin inline asm mov.u32 %r1636, 0; // end inline asm // begin inline asm mov.u32 %r1635, 0; // end inline asm // begin inline asm mov.u32 %r1634, 0; // end inline asm // begin inline asm mov.u32 %r1633, 0; // end inline asm // begin inline asm mov.u32 %r1632, 0; // end inline asm // begin inline asm mov.u32 %r1631, 0; // end inline asm // begin inline asm mov.u32 %r1630, 0; // end inline asm // begin inline asm mov.u32 %r1629, 0; // end inline asm // begin inline asm mov.u32 %r1628, 0; // end inline asm // begin inline asm mov.u32 %r1627, 0; // end inline asm // begin inline asm mov.u32 %r1626, 0; // end inline asm // begin inline asm mov.u32 %r1625, 0; // end inline asm // begin inline asm mov.u32 %r1624, 0; // end inline asm // begin inline asm mov.u32 %r1623, 0; // end inline asm // begin inline asm mov.u32 %r1622, 0; // end inline asm // begin inline asm mov.u32 %r1621, 0; // end 
inline asm // begin inline asm mov.u32 %r1620, 0; // end inline asm // begin inline asm mov.u32 %r1619, 0; // end inline asm // begin inline asm mov.u32 %r1618, 0; // end inline asm // begin inline asm mov.u32 %r1617, 0; // end inline asm // begin inline asm mov.u32 %r1616, 0; // end inline asm // begin inline asm mov.u32 %r1615, 0; // end inline asm // begin inline asm mov.u32 %r1614, 0; // end inline asm // begin inline asm mov.u32 %r1613, 0; // end inline asm // begin inline asm mov.u32 %r1612, 0; // end inline asm // begin inline asm mov.u32 %r1611, 0; // end inline asm // begin inline asm mov.u32 %r1610, 0; // end inline asm // begin inline asm mov.u32 %r1609, 0; // end inline asm // begin inline asm mov.u32 %r1608, 0; // end inline asm // begin inline asm mov.u32 %r1607, 0; // end inline asm // begin inline asm mov.u32 %r1606, 0; // end inline asm // begin inline asm mov.u32 %r1605, 0; // end inline asm // begin inline asm mov.u32 %r1604, 0; // end inline asm // begin inline asm mov.u32 %r1603, 0; // end inline asm // begin inline asm mov.u32 %r1602, 0; // end inline asm // begin inline asm mov.u32 %r1601, 0; // end inline asm // begin inline asm mov.u32 %r1600, 0; // end inline asm add.s32 %r721, %r1, 15; shr.s32 %r722, %r721, 31; shr.u32 %r723, %r722, 28; add.s32 %r724, %r721, %r723; and.b32 %r126, %r724, -16; setp.lt.s32 %p17, %r1, 1; @%p17 bra $L__BB0_93; ld.param.u8 %rs1, [%rd1+160]; cvt.s64.s32 %rd12, %r5; cvt.s64.s32 %rd13, %r1579; add.s32 %r726, %r1579, 1; cvt.s64.s32 %rd14, %r726; add.s32 %r727, %r1579, 8; cvt.s64.s32 %rd15, %r727; add.s32 %r728, %r1579, 9; cvt.s64.s32 %rd16, %r728; add.s32 %r175, %r5, 8; add.s32 %r176, %r5, 64; add.s32 %r177, %r5, 72; add.s32 %r178, %r5, 128; add.s32 %r179, %r5, 136; add.s32 %r180, %r5, 192; add.s32 %r181, %r5, 200; mov.f32 %f1106, 0fFF800000; mov.f32 %f1098, 0f00000000; mov.f32 %f1099, %f1098; mov.f32 %f1100, %f1098; mov.f32 %f1101, %f1098; mov.f32 %f1102, %f1098; mov.f32 %f1103, %f1098; mov.f32 %f1104, %f1098; mov.f32 %f1105, %f1098; mov.f32 %f1107, %f1106; mov.f32 %f1108, %f1106; mov.f32 %f1109, %f1106; mov.f32 %f1110, %f1106; mov.f32 %f1111, %f1106; mov.f32 %f1112, %f1106; mov.f32 %f1113, %f1106; mov.u32 %r1583, %r1; mov.u32 %r1582, %r1; $L__BB0_3: add.s32 %r729, %r1578, 16; setp.ge.s32 %p18, %r729, %r126; @%p18 bra $L__BB0_5; bar.sync 0; shl.b64 %rd66, %rd6, 4; add.s64 %rd264, %rd264, %rd66; add.s32 %r1583, %r1583, -16; min.s32 %r734, %r1583, 16; setp.lt.s32 %p19, %r6, %r734; selp.b32 %r731, 16, 0, %p19; // begin inline asm cp.async.cg.shared.global [%r486], [%rd264], 16, %r731; // end inline asm add.s64 %rd265, %rd265, %rd66; add.s32 %r1582, %r1582, -16; min.s32 %r735, %r1582, 16; setp.lt.s32 %p20, %r6, %r735; selp.b32 %r733, 16, 0, %p20; // begin inline asm cp.async.cg.shared.global [%r488], [%rd265], 16, %r733; // end inline asm // begin inline asm cp.async.commit_group; // end inline asm $L__BB0_5: setp.eq.s16 %p21, %rs1, 0; @%p21 bra $L__BB0_70; mov.u32 %r1496, %ctaid.x; mov.u32 %r1495, %ctaid.y; mov.u32 %r1494, %ctaid.z; shl.b32 %r1493, %r1494, 8; ld.param.u32 %r1492, [fmha_v2_flash_attention_fp16_256_16_S_32_sm86_kernel_nl_param_0+52]; cvt.s64.s32 %rd68, %r1493; add.s64 %rd69, %rd12, %rd68; setp.ge.u64 %p22, %rd69, %rd2; mul.lo.s32 %r738, %r1, %r1493; cvt.s64.s32 %rd70, %r738; cvt.u64.u32 %rd23, %r1578; add.s64 %rd71, %rd70, %rd23; add.s64 %rd24, %rd71, %rd13; mul.lo.s64 %rd72, %rd12, %rd2; add.s64 %rd73, %rd24, %rd72; add.s64 %rd25, %rd13, %rd23; setp.ge.u64 %p23, %rd25, %rd2; shl.b64 %rd74, %rd73, 1; mad.lo.s32 %r741, %r1492, 
%r1495, %r1496; cvt.s64.s32 %rd75, %r741; mul.lo.s64 %rd76, %rd4, %rd75; add.s64 %rd77, %rd76, %rd74; cvta.to.global.u64 %rd78, %rd3; add.s64 %rd26, %rd78, %rd77; mov.u16 %rs260, 0; or.pred %p24, %p23, %p22; mov.u16 %rs259, %rs260; @%p24 bra $L__BB0_8; ld.global.u16 %rs259, [%rd26]; $L__BB0_8: add.s64 %rd27, %rd14, %rd23; setp.ge.u64 %p26, %rd27, %rd2; or.pred %p27, %p26, %p22; @%p27 bra $L__BB0_10; ld.global.u16 %rs260, [%rd26+2]; $L__BB0_10: add.s64 %rd28, %rd15, %rd23; setp.ge.u64 %p29, %rd28, %rd2; mov.u16 %rs262, 0; or.pred %p30, %p29, %p22; mov.u16 %rs261, %rs262; @%p30 bra $L__BB0_12; ld.global.u16 %rs261, [%rd26+16]; $L__BB0_12: add.s64 %rd29, %rd16, %rd23; setp.ge.u64 %p32, %rd29, %rd2; or.pred %p33, %p32, %p22; @%p33 bra $L__BB0_14; ld.global.u16 %rs262, [%rd26+18]; $L__BB0_14: cvt.s64.s32 %rd86, %r175; add.s64 %rd87, %rd86, %rd68; setp.ge.u64 %p34, %rd87, %rd2; mul.lo.s64 %rd88, %rd86, %rd2; add.s64 %rd89, %rd24, %rd88; shl.b64 %rd90, %rd89, 1; add.s64 %rd93, %rd76, %rd90; add.s64 %rd30, %rd78, %rd93; mov.u16 %rs264, 0; or.pred %p36, %p23, %p34; mov.u16 %rs263, %rs264; @%p36 bra $L__BB0_16; ld.global.u16 %rs263, [%rd30]; $L__BB0_16: or.pred %p39, %p26, %p34; @%p39 bra $L__BB0_18; ld.global.u16 %rs264, [%rd30+2]; $L__BB0_18: mov.u16 %rs266, 0; or.pred %p42, %p29, %p34; mov.u16 %rs265, %rs266; @%p42 bra $L__BB0_20; ld.global.u16 %rs265, [%rd30+16]; $L__BB0_20: or.pred %p45, %p32, %p34; @%p45 bra $L__BB0_22; ld.global.u16 %rs266, [%rd30+18]; $L__BB0_22: cvt.s64.s32 %rd105, %r176; add.s64 %rd106, %rd105, %rd68; setp.ge.u64 %p46, %rd106, %rd2; mul.lo.s64 %rd107, %rd105, %rd2; add.s64 %rd108, %rd24, %rd107; shl.b64 %rd109, %rd108, 1; add.s64 %rd112, %rd76, %rd109; add.s64 %rd31, %rd78, %rd112; mov.u16 %rs268, 0; or.pred %p48, %p23, %p46; mov.u16 %rs267, %rs268; @%p48 bra $L__BB0_24; ld.global.u16 %rs267, [%rd31]; $L__BB0_24: or.pred %p51, %p26, %p46; @%p51 bra $L__BB0_26; ld.global.u16 %rs268, [%rd31+2]; $L__BB0_26: mov.u16 %rs270, 0; or.pred %p54, %p29, %p46; mov.u16 %rs269, %rs270; @%p54 bra $L__BB0_28; ld.global.u16 %rs269, [%rd31+16]; $L__BB0_28: or.pred %p57, %p32, %p46; @%p57 bra $L__BB0_30; ld.global.u16 %rs270, [%rd31+18]; $L__BB0_30: cvt.s64.s32 %rd124, %r177; add.s64 %rd125, %rd124, %rd68; setp.ge.u64 %p58, %rd125, %rd2; mul.lo.s64 %rd126, %rd124, %rd2; add.s64 %rd127, %rd24, %rd126; shl.b64 %rd128, %rd127, 1; add.s64 %rd131, %rd76, %rd128; add.s64 %rd32, %rd78, %rd131; mov.u16 %rs272, 0; or.pred %p60, %p23, %p58; mov.u16 %rs271, %rs272; @%p60 bra $L__BB0_32; ld.global.u16 %rs271, [%rd32]; $L__BB0_32: or.pred %p63, %p26, %p58; @%p63 bra $L__BB0_34; ld.global.u16 %rs272, [%rd32+2]; $L__BB0_34: mov.u16 %rs274, 0; or.pred %p66, %p29, %p58; mov.u16 %rs273, %rs274; @%p66 bra $L__BB0_36; ld.global.u16 %rs273, [%rd32+16]; $L__BB0_36: or.pred %p69, %p32, %p58; @%p69 bra $L__BB0_38; ld.global.u16 %rs274, [%rd32+18]; $L__BB0_38: cvt.s64.s32 %rd143, %r178; add.s64 %rd144, %rd143, %rd68; setp.ge.u64 %p70, %rd144, %rd2; mul.lo.s64 %rd145, %rd143, %rd2; add.s64 %rd146, %rd24, %rd145; shl.b64 %rd147, %rd146, 1; add.s64 %rd150, %rd76, %rd147; add.s64 %rd33, %rd78, %rd150; mov.u16 %rs276, 0; or.pred %p72, %p23, %p70; mov.u16 %rs275, %rs276; @%p72 bra $L__BB0_40; ld.global.u16 %rs275, [%rd33]; $L__BB0_40: or.pred %p75, %p26, %p70; @%p75 bra $L__BB0_42; ld.global.u16 %rs276, [%rd33+2]; $L__BB0_42: mov.u16 %rs278, 0; or.pred %p78, %p29, %p70; mov.u16 %rs277, %rs278; @%p78 bra $L__BB0_44; ld.global.u16 %rs277, [%rd33+16]; $L__BB0_44: or.pred %p81, %p32, %p70; @%p81 bra $L__BB0_46; ld.global.u16 
%rs278, [%rd33+18]; $L__BB0_46: cvt.s64.s32 %rd162, %r179; add.s64 %rd163, %rd162, %rd68; setp.ge.u64 %p82, %rd163, %rd2; mul.lo.s64 %rd164, %rd162, %rd2; add.s64 %rd165, %rd24, %rd164; shl.b64 %rd166, %rd165, 1; add.s64 %rd169, %rd76, %rd166; add.s64 %rd34, %rd78, %rd169; mov.u16 %rs280, 0; or.pred %p84, %p23, %p82; mov.u16 %rs279, %rs280; @%p84 bra $L__BB0_48; ld.global.u16 %rs279, [%rd34]; $L__BB0_48: or.pred %p87, %p26, %p82; @%p87 bra $L__BB0_50; ld.global.u16 %rs280, [%rd34+2]; $L__BB0_50: mov.u16 %rs282, 0; or.pred %p90, %p29, %p82; mov.u16 %rs281, %rs282; @%p90 bra $L__BB0_52; ld.global.u16 %rs281, [%rd34+16]; $L__BB0_52: or.pred %p93, %p32, %p82; @%p93 bra $L__BB0_54; ld.global.u16 %rs282, [%rd34+18]; $L__BB0_54: cvt.s64.s32 %rd181, %r180; add.s64 %rd182, %rd181, %rd68; setp.ge.u64 %p94, %rd182, %rd2; mul.lo.s64 %rd183, %rd181, %rd2; add.s64 %rd184, %rd24, %rd183; shl.b64 %rd185, %rd184, 1; add.s64 %rd188, %rd76, %rd185; add.s64 %rd35, %rd78, %rd188; mov.u16 %rs284, 0; or.pred %p96, %p23, %p94; mov.u16 %rs283, %rs284; @%p96 bra $L__BB0_56; ld.global.u16 %rs283, [%rd35]; $L__BB0_56: or.pred %p99, %p26, %p94; @%p99 bra $L__BB0_58; ld.global.u16 %rs284, [%rd35+2]; $L__BB0_58: mov.u16 %rs286, 0; or.pred %p102, %p29, %p94; mov.u16 %rs285, %rs286; @%p102 bra $L__BB0_60; ld.global.u16 %rs285, [%rd35+16]; $L__BB0_60: or.pred %p105, %p32, %p94; @%p105 bra $L__BB0_62; ld.global.u16 %rs286, [%rd35+18]; $L__BB0_62: cvt.s64.s32 %rd200, %r181; add.s64 %rd201, %rd200, %rd68; setp.ge.u64 %p106, %rd201, %rd2; mul.lo.s64 %rd202, %rd200, %rd2; add.s64 %rd203, %rd24, %rd202; shl.b64 %rd204, %rd203, 1; add.s64 %rd207, %rd76, %rd204; add.s64 %rd36, %rd78, %rd207; mov.u16 %rs288, 0; or.pred %p108, %p23, %p106; mov.u16 %rs287, %rs288; @%p108 bra $L__BB0_64; ld.global.u16 %rs287, [%rd36]; $L__BB0_64: or.pred %p111, %p26, %p106; @%p111 bra $L__BB0_66; ld.global.u16 %rs288, [%rd36+2]; $L__BB0_66: mov.u16 %rs290, 0; or.pred %p114, %p29, %p106; mov.u16 %rs289, %rs290; @%p114 bra $L__BB0_68; ld.global.u16 %rs289, [%rd36+16]; $L__BB0_68: or.pred %p117, %p32, %p106; @%p117 bra $L__BB0_70; ld.global.u16 %rs290, [%rd36+18]; $L__BB0_70: // begin inline asm mov.u32 %r825, 0; // end inline asm // begin inline asm mov.u32 %r826, 0; // end inline asm // begin inline asm mov.u32 %r827, 0; // end inline asm // begin inline asm mov.u32 %r828, 0; // end inline asm // begin inline asm mov.u32 %r829, 0; // end inline asm // begin inline asm mov.u32 %r830, 0; // end inline asm // begin inline asm mov.u32 %r831, 0; // end inline asm // begin inline asm mov.u32 %r832, 0; // end inline asm // begin inline asm mov.u32 %r833, 0; // end inline asm // begin inline asm mov.u32 %r834, 0; // end inline asm // begin inline asm mov.u32 %r835, 0; // end inline asm // begin inline asm mov.u32 %r836, 0; // end inline asm // begin inline asm mov.u32 %r837, 0; // end inline asm // begin inline asm mov.u32 %r838, 0; // end inline asm // begin inline asm mov.u32 %r839, 0; // end inline asm // begin inline asm mov.u32 %r840, 0; // end inline asm // begin inline asm mov.u32 %r841, 0; // end inline asm // begin inline asm mov.u32 %r842, 0; // end inline asm // begin inline asm mov.u32 %r843, 0; // end inline asm // begin inline asm mov.u32 %r844, 0; // end inline asm // begin inline asm mov.u32 %r845, 0; // end inline asm // begin inline asm mov.u32 %r846, 0; // end inline asm // begin inline asm mov.u32 %r847, 0; // end inline asm // begin inline asm mov.u32 %r848, 0; // end inline asm // begin inline asm mov.u32 %r849, 0; // end inline asm // 
begin inline asm mov.u32 %r850, 0; // end inline asm // begin inline asm mov.u32 %r851, 0; // end inline asm // begin inline asm mov.u32 %r852, 0; // end inline asm // begin inline asm mov.u32 %r853, 0; // end inline asm // begin inline asm mov.u32 %r854, 0; // end inline asm // begin inline asm mov.u32 %r855, 0; // end inline asm // begin inline asm mov.u32 %r856, 0; // end inline asm mov.b32 %f330, %r825; mov.b32 %f331, %r826; mov.b32 %f332, %r827; mov.b32 %f333, %r828; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f330, %f331, %f332, %f333}, {%r490, %r491, %r492, %r493}, {%r1584, %r1585}, {%f330, %f331, %f332, %f333}; // end inline asm mov.b32 %f338, %r829; mov.b32 %f339, %r830; mov.b32 %f340, %r831; mov.b32 %f341, %r832; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f338, %f339, %f340, %f341}, {%r490, %r491, %r492, %r493}, {%r1586, %r1587}, {%f338, %f339, %f340, %f341}; // end inline asm mov.b32 %f346, %r833; mov.b32 %f347, %r834; mov.b32 %f348, %r835; mov.b32 %f349, %r836; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f346, %f347, %f348, %f349}, {%r495, %r496, %r497, %r498}, {%r1584, %r1585}, {%f346, %f347, %f348, %f349}; // end inline asm mov.b32 %f354, %r837; mov.b32 %f355, %r838; mov.b32 %f356, %r839; mov.b32 %f357, %r840; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f354, %f355, %f356, %f357}, {%r495, %r496, %r497, %r498}, {%r1586, %r1587}, {%f354, %f355, %f356, %f357}; // end inline asm mov.b32 %f362, %r841; mov.b32 %f363, %r842; mov.b32 %f364, %r843; mov.b32 %f365, %r844; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f362, %f363, %f364, %f365}, {%r500, %r501, %r502, %r503}, {%r1584, %r1585}, {%f362, %f363, %f364, %f365}; // end inline asm mov.b32 %f370, %r845; mov.b32 %f371, %r846; mov.b32 %f372, %r847; mov.b32 %f373, %r848; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f370, %f371, %f372, %f373}, {%r500, %r501, %r502, %r503}, {%r1586, %r1587}, {%f370, %f371, %f372, %f373}; // end inline asm mov.b32 %f378, %r849; mov.b32 %f379, %r850; mov.b32 %f380, %r851; mov.b32 %f381, %r852; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f378, %f379, %f380, %f381}, {%r505, %r506, %r507, %r508}, {%r1584, %r1585}, {%f378, %f379, %f380, %f381}; // end inline asm mov.b32 %f386, %r853; mov.b32 %f387, %r854; mov.b32 %f388, %r855; mov.b32 %f389, %r856; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f386, %f387, %f388, %f389}, {%r505, %r506, %r507, %r508}, {%r1586, %r1587}, {%f386, %f387, %f388, %f389}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f330, %f331, %f332, %f333}, {%r510, %r511, %r512, %r513}, {%r1588, %r1589}, {%f330, %f331, %f332, %f333}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f338, %f339, %f340, %f341}, {%r510, %r511, %r512, %r513}, {%r1590, %r1591}, {%f338, %f339, %f340, %f341}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f346, %f347, %f348, %f349}, {%r515, %r516, %r517, %r518}, {%r1588, %r1589}, {%f346, %f347, %f348, %f349}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f354, %f355, %f356, %f357}, {%r515, %r516, %r517, %r518}, {%r1590, %r1591}, {%f354, %f355, %f356, %f357}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f362, %f363, %f364, %f365}, {%r520, 
%r521, %r522, %r523}, {%r1588, %r1589}, {%f362, %f363, %f364, %f365}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f370, %f371, %f372, %f373}, {%r520, %r521, %r522, %r523}, {%r1590, %r1591}, {%f370, %f371, %f372, %f373}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f378, %f379, %f380, %f381}, {%r525, %r526, %r527, %r528}, {%r1588, %r1589}, {%f378, %f379, %f380, %f381}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f386, %f387, %f388, %f389}, {%r525, %r526, %r527, %r528}, {%r1590, %r1591}, {%f386, %f387, %f388, %f389}; // end inline asm mul.ftz.f32 %f394, %f1, %f330; mul.ftz.f32 %f395, %f1, %f331; mul.ftz.f32 %f396, %f1, %f338; mul.ftz.f32 %f397, %f1, %f339; mul.ftz.f32 %f398, %f1, %f332; mul.ftz.f32 %f399, %f1, %f333; mul.ftz.f32 %f400, %f1, %f340; mul.ftz.f32 %f401, %f1, %f341; mul.ftz.f32 %f402, %f1, %f346; mul.ftz.f32 %f403, %f1, %f347; mul.ftz.f32 %f404, %f1, %f354; mul.ftz.f32 %f405, %f1, %f355; mul.ftz.f32 %f406, %f1, %f348; mul.ftz.f32 %f407, %f1, %f349; mul.ftz.f32 %f408, %f1, %f356; mul.ftz.f32 %f409, %f1, %f357; mul.ftz.f32 %f410, %f1, %f362; mul.ftz.f32 %f411, %f1, %f363; mul.ftz.f32 %f412, %f1, %f370; mul.ftz.f32 %f413, %f1, %f371; mul.ftz.f32 %f414, %f1, %f364; mul.ftz.f32 %f415, %f1, %f365; mul.ftz.f32 %f416, %f1, %f372; mul.ftz.f32 %f417, %f1, %f373; mul.ftz.f32 %f418, %f1, %f378; mul.ftz.f32 %f419, %f1, %f379; mul.ftz.f32 %f420, %f1, %f386; mul.ftz.f32 %f421, %f1, %f387; mul.ftz.f32 %f422, %f1, %f380; mul.ftz.f32 %f423, %f1, %f381; mul.ftz.f32 %f424, %f1, %f388; mul.ftz.f32 %f425, %f1, %f389; setp.lt.s32 %p118, %r1579, %r1; selp.f32 %f1145, %f394, 0fFF800000, %p118; add.s32 %r953, %r1579, 1; setp.lt.s32 %p119, %r953, %r1; selp.f32 %f1144, %f395, 0fFF800000, %p119; add.s32 %r954, %r1579, 8; setp.lt.s32 %p120, %r954, %r1; selp.f32 %f1143, %f396, 0fFF800000, %p120; add.s32 %r955, %r1579, 9; setp.lt.s32 %p121, %r955, %r1; selp.f32 %f1142, %f397, 0fFF800000, %p121; selp.f32 %f1141, %f398, 0fFF800000, %p118; selp.f32 %f1140, %f399, 0fFF800000, %p119; selp.f32 %f1139, %f400, 0fFF800000, %p120; selp.f32 %f1138, %f401, 0fFF800000, %p121; selp.f32 %f1137, %f402, 0fFF800000, %p118; selp.f32 %f1136, %f403, 0fFF800000, %p119; selp.f32 %f1135, %f404, 0fFF800000, %p120; selp.f32 %f1134, %f405, 0fFF800000, %p121; selp.f32 %f1133, %f406, 0fFF800000, %p118; selp.f32 %f1132, %f407, 0fFF800000, %p119; selp.f32 %f1131, %f408, 0fFF800000, %p120; selp.f32 %f1130, %f409, 0fFF800000, %p121; selp.f32 %f1129, %f410, 0fFF800000, %p118; selp.f32 %f1128, %f411, 0fFF800000, %p119; selp.f32 %f1127, %f412, 0fFF800000, %p120; selp.f32 %f1126, %f413, 0fFF800000, %p121; selp.f32 %f1125, %f414, 0fFF800000, %p118; selp.f32 %f1124, %f415, 0fFF800000, %p119; selp.f32 %f1123, %f416, 0fFF800000, %p120; selp.f32 %f1122, %f417, 0fFF800000, %p121; selp.f32 %f1121, %f418, 0fFF800000, %p118; selp.f32 %f1120, %f419, 0fFF800000, %p119; selp.f32 %f1119, %f420, 0fFF800000, %p120; selp.f32 %f1118, %f421, 0fFF800000, %p121; selp.f32 %f1117, %f422, 0fFF800000, %p118; selp.f32 %f1116, %f423, 0fFF800000, %p119; selp.f32 %f1115, %f424, 0fFF800000, %p120; selp.f32 %f1114, %f425, 0fFF800000, %p121; @%p21 bra $L__BB0_72; // begin inline asm cvt.f32.f16 %f426, %rs259; // end inline asm add.ftz.f32 %f1145, %f426, %f1145; // begin inline asm cvt.f32.f16 %f427, %rs260; // end inline asm add.ftz.f32 %f1144, %f427, %f1144; // begin inline asm cvt.f32.f16 %f428, %rs261; // end inline asm add.ftz.f32 %f1143, 
%f428, %f1143; // begin inline asm cvt.f32.f16 %f429, %rs262; // end inline asm add.ftz.f32 %f1142, %f429, %f1142; // begin inline asm cvt.f32.f16 %f430, %rs263; // end inline asm add.ftz.f32 %f1141, %f430, %f1141; // begin inline asm cvt.f32.f16 %f431, %rs264; // end inline asm add.ftz.f32 %f1140, %f431, %f1140; // begin inline asm cvt.f32.f16 %f432, %rs265; // end inline asm add.ftz.f32 %f1139, %f432, %f1139; // begin inline asm cvt.f32.f16 %f433, %rs266; // end inline asm add.ftz.f32 %f1138, %f433, %f1138; // begin inline asm cvt.f32.f16 %f434, %rs267; // end inline asm add.ftz.f32 %f1137, %f434, %f1137; // begin inline asm cvt.f32.f16 %f435, %rs268; // end inline asm add.ftz.f32 %f1136, %f435, %f1136; // begin inline asm cvt.f32.f16 %f436, %rs269; // end inline asm add.ftz.f32 %f1135, %f436, %f1135; // begin inline asm cvt.f32.f16 %f437, %rs270; // end inline asm add.ftz.f32 %f1134, %f437, %f1134; // begin inline asm cvt.f32.f16 %f438, %rs271; // end inline asm add.ftz.f32 %f1133, %f438, %f1133; // begin inline asm cvt.f32.f16 %f439, %rs272; // end inline asm add.ftz.f32 %f1132, %f439, %f1132; // begin inline asm cvt.f32.f16 %f440, %rs273; // end inline asm add.ftz.f32 %f1131, %f440, %f1131; // begin inline asm cvt.f32.f16 %f441, %rs274; // end inline asm add.ftz.f32 %f1130, %f441, %f1130; // begin inline asm cvt.f32.f16 %f442, %rs275; // end inline asm add.ftz.f32 %f1129, %f442, %f1129; // begin inline asm cvt.f32.f16 %f443, %rs276; // end inline asm add.ftz.f32 %f1128, %f443, %f1128; // begin inline asm cvt.f32.f16 %f444, %rs277; // end inline asm add.ftz.f32 %f1127, %f444, %f1127; // begin inline asm cvt.f32.f16 %f445, %rs278; // end inline asm add.ftz.f32 %f1126, %f445, %f1126; // begin inline asm cvt.f32.f16 %f446, %rs279; // end inline asm add.ftz.f32 %f1125, %f446, %f1125; // begin inline asm cvt.f32.f16 %f447, %rs280; // end inline asm add.ftz.f32 %f1124, %f447, %f1124; // begin inline asm cvt.f32.f16 %f448, %rs281; // end inline asm add.ftz.f32 %f1123, %f448, %f1123; // begin inline asm cvt.f32.f16 %f449, %rs282; // end inline asm add.ftz.f32 %f1122, %f449, %f1122; // begin inline asm cvt.f32.f16 %f450, %rs283; // end inline asm add.ftz.f32 %f1121, %f450, %f1121; // begin inline asm cvt.f32.f16 %f451, %rs284; // end inline asm add.ftz.f32 %f1120, %f451, %f1120; // begin inline asm cvt.f32.f16 %f452, %rs285; // end inline asm add.ftz.f32 %f1119, %f452, %f1119; // begin inline asm cvt.f32.f16 %f453, %rs286; // end inline asm add.ftz.f32 %f1118, %f453, %f1118; // begin inline asm cvt.f32.f16 %f454, %rs287; // end inline asm add.ftz.f32 %f1117, %f454, %f1117; // begin inline asm cvt.f32.f16 %f455, %rs288; // end inline asm add.ftz.f32 %f1116, %f455, %f1116; // begin inline asm cvt.f32.f16 %f456, %rs289; // end inline asm add.ftz.f32 %f1115, %f456, %f1115; // begin inline asm cvt.f32.f16 %f457, %rs290; // end inline asm add.ftz.f32 %f1114, %f457, %f1114; $L__BB0_72: add.s32 %r1497, %r1578, 16; setp.ge.s32 %p227, %r1497, %r126; setp.gt.ftz.f32 %p124, %f1145, %f1144; selp.f32 %f458, %f1145, %f1144, %p124; setp.gt.ftz.f32 %p125, %f458, %f1143; selp.f32 %f459, %f458, %f1143, %p125; setp.gt.ftz.f32 %p126, %f459, %f1142; selp.f32 %f460, %f459, %f1142, %p126; setp.gt.ftz.f32 %p127, %f1141, %f1140; selp.f32 %f461, %f1141, %f1140, %p127; setp.gt.ftz.f32 %p128, %f461, %f1139; selp.f32 %f462, %f461, %f1139, %p128; setp.gt.ftz.f32 %p129, %f462, %f1138; selp.f32 %f463, %f462, %f1138, %p129; setp.gt.ftz.f32 %p130, %f1137, %f1136; selp.f32 %f464, %f1137, %f1136, %p130; setp.gt.ftz.f32 %p131, 
%f464, %f1135; selp.f32 %f465, %f464, %f1135, %p131; setp.gt.ftz.f32 %p132, %f465, %f1134; selp.f32 %f466, %f465, %f1134, %p132; setp.gt.ftz.f32 %p133, %f1133, %f1132; selp.f32 %f467, %f1133, %f1132, %p133; setp.gt.ftz.f32 %p134, %f467, %f1131; selp.f32 %f468, %f467, %f1131, %p134; setp.gt.ftz.f32 %p135, %f468, %f1130; selp.f32 %f469, %f468, %f1130, %p135; setp.gt.ftz.f32 %p136, %f1129, %f1128; selp.f32 %f470, %f1129, %f1128, %p136; setp.gt.ftz.f32 %p137, %f470, %f1127; selp.f32 %f471, %f470, %f1127, %p137; setp.gt.ftz.f32 %p138, %f471, %f1126; selp.f32 %f472, %f471, %f1126, %p138; setp.gt.ftz.f32 %p139, %f1125, %f1124; selp.f32 %f473, %f1125, %f1124, %p139; setp.gt.ftz.f32 %p140, %f473, %f1123; selp.f32 %f474, %f473, %f1123, %p140; setp.gt.ftz.f32 %p141, %f474, %f1122; selp.f32 %f475, %f474, %f1122, %p141; setp.gt.ftz.f32 %p142, %f1121, %f1120; selp.f32 %f476, %f1121, %f1120, %p142; setp.gt.ftz.f32 %p143, %f476, %f1119; selp.f32 %f477, %f476, %f1119, %p143; setp.gt.ftz.f32 %p144, %f477, %f1118; selp.f32 %f478, %f477, %f1118, %p144; setp.gt.ftz.f32 %p145, %f1117, %f1116; selp.f32 %f479, %f1117, %f1116, %p145; setp.gt.ftz.f32 %p146, %f479, %f1115; selp.f32 %f480, %f479, %f1115, %p146; setp.gt.ftz.f32 %p147, %f480, %f1114; selp.f32 %f481, %f480, %f1114, %p147; mov.b32 %r957, %f460; mov.u32 %r958, 31; mov.u32 %r959, 1; mov.u32 %r960, -1; shfl.sync.bfly.b32 %r961|%p148, %r957, %r959, %r958, %r960; mov.b32 %f482, %r961; setp.gt.ftz.f32 %p149, %f460, %f482; selp.f32 %f483, %f460, %f482, %p149; mov.b32 %r962, %f483; mov.u32 %r963, 2; shfl.sync.bfly.b32 %r964|%p150, %r962, %r963, %r958, %r960; mov.b32 %f484, %r964; setp.gt.ftz.f32 %p151, %f483, %f484; selp.f32 %f485, %f483, %f484, %p151; mov.b32 %r965, %f463; shfl.sync.bfly.b32 %r966|%p152, %r965, %r959, %r958, %r960; mov.b32 %f486, %r966; setp.gt.ftz.f32 %p153, %f463, %f486; selp.f32 %f487, %f463, %f486, %p153; mov.b32 %r967, %f487; shfl.sync.bfly.b32 %r968|%p154, %r967, %r963, %r958, %r960; mov.b32 %f488, %r968; setp.gt.ftz.f32 %p155, %f487, %f488; selp.f32 %f489, %f487, %f488, %p155; mov.b32 %r969, %f466; shfl.sync.bfly.b32 %r970|%p156, %r969, %r959, %r958, %r960; mov.b32 %f490, %r970; setp.gt.ftz.f32 %p157, %f466, %f490; selp.f32 %f491, %f466, %f490, %p157; mov.b32 %r971, %f491; shfl.sync.bfly.b32 %r972|%p158, %r971, %r963, %r958, %r960; mov.b32 %f492, %r972; setp.gt.ftz.f32 %p159, %f491, %f492; selp.f32 %f493, %f491, %f492, %p159; mov.b32 %r973, %f469; shfl.sync.bfly.b32 %r974|%p160, %r973, %r959, %r958, %r960; mov.b32 %f494, %r974; setp.gt.ftz.f32 %p161, %f469, %f494; selp.f32 %f495, %f469, %f494, %p161; mov.b32 %r975, %f495; shfl.sync.bfly.b32 %r976|%p162, %r975, %r963, %r958, %r960; mov.b32 %f496, %r976; setp.gt.ftz.f32 %p163, %f495, %f496; selp.f32 %f497, %f495, %f496, %p163; mov.b32 %r977, %f472; shfl.sync.bfly.b32 %r978|%p164, %r977, %r959, %r958, %r960; mov.b32 %f498, %r978; setp.gt.ftz.f32 %p165, %f472, %f498; selp.f32 %f499, %f472, %f498, %p165; mov.b32 %r979, %f499; shfl.sync.bfly.b32 %r980|%p166, %r979, %r963, %r958, %r960; mov.b32 %f500, %r980; setp.gt.ftz.f32 %p167, %f499, %f500; selp.f32 %f501, %f499, %f500, %p167; mov.b32 %r981, %f475; shfl.sync.bfly.b32 %r982|%p168, %r981, %r959, %r958, %r960; mov.b32 %f502, %r982; setp.gt.ftz.f32 %p169, %f475, %f502; selp.f32 %f503, %f475, %f502, %p169; mov.b32 %r983, %f503; shfl.sync.bfly.b32 %r984|%p170, %r983, %r963, %r958, %r960; mov.b32 %f504, %r984; setp.gt.ftz.f32 %p171, %f503, %f504; selp.f32 %f505, %f503, %f504, %p171; mov.b32 %r985, %f478; shfl.sync.bfly.b32 %r986|%p172, %r985, 
%r959, %r958, %r960; mov.b32 %f506, %r986; setp.gt.ftz.f32 %p173, %f478, %f506; selp.f32 %f507, %f478, %f506, %p173; mov.b32 %r987, %f507; shfl.sync.bfly.b32 %r988|%p174, %r987, %r963, %r958, %r960; mov.b32 %f508, %r988; setp.gt.ftz.f32 %p175, %f507, %f508; selp.f32 %f509, %f507, %f508, %p175; mov.b32 %r989, %f481; shfl.sync.bfly.b32 %r990|%p176, %r989, %r959, %r958, %r960; mov.b32 %f510, %r990; setp.gt.ftz.f32 %p177, %f481, %f510; selp.f32 %f511, %f481, %f510, %p177; mov.b32 %r991, %f511; shfl.sync.bfly.b32 %r992|%p178, %r991, %r963, %r958, %r960; mov.b32 %f512, %r992; setp.gt.ftz.f32 %p179, %f511, %f512; selp.f32 %f513, %f511, %f512, %p179; max.ftz.f32 %f114, %f485, %f1113; max.ftz.f32 %f115, %f489, %f1112; max.ftz.f32 %f116, %f493, %f1111; max.ftz.f32 %f117, %f497, %f1110; max.ftz.f32 %f118, %f501, %f1109; max.ftz.f32 %f119, %f505, %f1108; max.ftz.f32 %f120, %f509, %f1107; max.ftz.f32 %f121, %f513, %f1106; sub.ftz.f32 %f514, %f1145, %f114; mul.ftz.f32 %f515, %f514, 0f3FB8AA3B; ex2.approx.ftz.f32 %f122, %f515; sub.ftz.f32 %f516, %f1144, %f114; mul.ftz.f32 %f517, %f516, 0f3FB8AA3B; ex2.approx.ftz.f32 %f123, %f517; sub.ftz.f32 %f518, %f1143, %f114; mul.ftz.f32 %f519, %f518, 0f3FB8AA3B; ex2.approx.ftz.f32 %f124, %f519; sub.ftz.f32 %f520, %f1142, %f114; mul.ftz.f32 %f521, %f520, 0f3FB8AA3B; ex2.approx.ftz.f32 %f125, %f521; sub.ftz.f32 %f522, %f1141, %f115; mul.ftz.f32 %f523, %f522, 0f3FB8AA3B; ex2.approx.ftz.f32 %f126, %f523; sub.ftz.f32 %f524, %f1140, %f115; mul.ftz.f32 %f525, %f524, 0f3FB8AA3B; ex2.approx.ftz.f32 %f127, %f525; sub.ftz.f32 %f526, %f1139, %f115; mul.ftz.f32 %f527, %f526, 0f3FB8AA3B; ex2.approx.ftz.f32 %f128, %f527; sub.ftz.f32 %f528, %f1138, %f115; mul.ftz.f32 %f529, %f528, 0f3FB8AA3B; ex2.approx.ftz.f32 %f129, %f529; sub.ftz.f32 %f530, %f1137, %f116; mul.ftz.f32 %f531, %f530, 0f3FB8AA3B; ex2.approx.ftz.f32 %f130, %f531; sub.ftz.f32 %f532, %f1136, %f116; mul.ftz.f32 %f533, %f532, 0f3FB8AA3B; ex2.approx.ftz.f32 %f131, %f533; sub.ftz.f32 %f534, %f1135, %f116; mul.ftz.f32 %f535, %f534, 0f3FB8AA3B; ex2.approx.ftz.f32 %f132, %f535; sub.ftz.f32 %f536, %f1134, %f116; mul.ftz.f32 %f537, %f536, 0f3FB8AA3B; ex2.approx.ftz.f32 %f133, %f537; sub.ftz.f32 %f538, %f1133, %f117; mul.ftz.f32 %f539, %f538, 0f3FB8AA3B; ex2.approx.ftz.f32 %f134, %f539; sub.ftz.f32 %f540, %f1132, %f117; mul.ftz.f32 %f541, %f540, 0f3FB8AA3B; ex2.approx.ftz.f32 %f135, %f541; sub.ftz.f32 %f542, %f1131, %f117; mul.ftz.f32 %f543, %f542, 0f3FB8AA3B; ex2.approx.ftz.f32 %f136, %f543; sub.ftz.f32 %f544, %f1130, %f117; mul.ftz.f32 %f545, %f544, 0f3FB8AA3B; ex2.approx.ftz.f32 %f137, %f545; sub.ftz.f32 %f546, %f1129, %f118; mul.ftz.f32 %f547, %f546, 0f3FB8AA3B; ex2.approx.ftz.f32 %f138, %f547; sub.ftz.f32 %f548, %f1128, %f118; mul.ftz.f32 %f549, %f548, 0f3FB8AA3B; ex2.approx.ftz.f32 %f139, %f549; sub.ftz.f32 %f550, %f1127, %f118; mul.ftz.f32 %f551, %f550, 0f3FB8AA3B; ex2.approx.ftz.f32 %f140, %f551; sub.ftz.f32 %f552, %f1126, %f118; mul.ftz.f32 %f553, %f552, 0f3FB8AA3B; ex2.approx.ftz.f32 %f141, %f553; sub.ftz.f32 %f554, %f1125, %f119; mul.ftz.f32 %f555, %f554, 0f3FB8AA3B; ex2.approx.ftz.f32 %f142, %f555; sub.ftz.f32 %f556, %f1124, %f119; mul.ftz.f32 %f557, %f556, 0f3FB8AA3B; ex2.approx.ftz.f32 %f143, %f557; sub.ftz.f32 %f558, %f1123, %f119; mul.ftz.f32 %f559, %f558, 0f3FB8AA3B; ex2.approx.ftz.f32 %f144, %f559; sub.ftz.f32 %f560, %f1122, %f119; mul.ftz.f32 %f561, %f560, 0f3FB8AA3B; ex2.approx.ftz.f32 %f145, %f561; sub.ftz.f32 %f562, %f1121, %f120; mul.ftz.f32 %f563, %f562, 0f3FB8AA3B; ex2.approx.ftz.f32 %f146, %f563; 
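//
// [annotation, not compiler output] 0f3FB8AA3B is log2(e) ~= 1.4426950: each
// mul.ftz.f32 / ex2.approx.ftz.f32 pair computes exp(x - row_max) as
// 2^((x - row_max) * log2(e)), the usual fast-exp idiom for softmax. row_max
// comes from the shfl.sync.bfly butterfly max reduction above, merged via
// max.ftz.f32 with the running maxima carried across k-tiles, i.e. a
// flash-attention-style online softmax.
//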
sub.ftz.f32 %f564, %f1120, %f120; mul.ftz.f32 %f565, %f564, 0f3FB8AA3B; ex2.approx.ftz.f32 %f147, %f565; sub.ftz.f32 %f566, %f1119, %f120; mul.ftz.f32 %f567, %f566, 0f3FB8AA3B; ex2.approx.ftz.f32 %f148, %f567; sub.ftz.f32 %f568, %f1118, %f120; mul.ftz.f32 %f569, %f568, 0f3FB8AA3B; ex2.approx.ftz.f32 %f149, %f569; sub.ftz.f32 %f570, %f1117, %f121; mul.ftz.f32 %f571, %f570, 0f3FB8AA3B; ex2.approx.ftz.f32 %f150, %f571; sub.ftz.f32 %f572, %f1116, %f121; mul.ftz.f32 %f573, %f572, 0f3FB8AA3B; ex2.approx.ftz.f32 %f151, %f573; sub.ftz.f32 %f574, %f1115, %f121; mul.ftz.f32 %f575, %f574, 0f3FB8AA3B; ex2.approx.ftz.f32 %f152, %f575; sub.ftz.f32 %f576, %f1114, %f121; mul.ftz.f32 %f577, %f576, 0f3FB8AA3B; ex2.approx.ftz.f32 %f153, %f577; add.ftz.f32 %f578, %f122, %f123; add.ftz.f32 %f579, %f578, 0f00000000; add.ftz.f32 %f580, %f124, %f125; add.ftz.f32 %f581, %f580, 0f00000000; add.ftz.f32 %f582, %f579, %f581; add.ftz.f32 %f583, %f126, %f127; add.ftz.f32 %f584, %f583, 0f00000000; add.ftz.f32 %f585, %f128, %f129; add.ftz.f32 %f586, %f585, 0f00000000; add.ftz.f32 %f587, %f584, %f586; add.ftz.f32 %f588, %f130, %f131; add.ftz.f32 %f589, %f588, 0f00000000; add.ftz.f32 %f590, %f132, %f133; add.ftz.f32 %f591, %f590, 0f00000000; add.ftz.f32 %f592, %f589, %f591; add.ftz.f32 %f593, %f134, %f135; add.ftz.f32 %f594, %f593, 0f00000000; add.ftz.f32 %f595, %f136, %f137; add.ftz.f32 %f596, %f595, 0f00000000; add.ftz.f32 %f597, %f594, %f596; add.ftz.f32 %f598, %f138, %f139; add.ftz.f32 %f599, %f598, 0f00000000; add.ftz.f32 %f600, %f140, %f141; add.ftz.f32 %f601, %f600, 0f00000000; add.ftz.f32 %f602, %f599, %f601; add.ftz.f32 %f603, %f142, %f143; add.ftz.f32 %f604, %f603, 0f00000000; add.ftz.f32 %f605, %f144, %f145; add.ftz.f32 %f606, %f605, 0f00000000; add.ftz.f32 %f607, %f604, %f606; add.ftz.f32 %f608, %f146, %f147; add.ftz.f32 %f609, %f608, 0f00000000; add.ftz.f32 %f610, %f148, %f149; add.ftz.f32 %f611, %f610, 0f00000000; add.ftz.f32 %f612, %f609, %f611; add.ftz.f32 %f613, %f150, %f151; add.ftz.f32 %f614, %f613, 0f00000000; add.ftz.f32 %f615, %f152, %f153; add.ftz.f32 %f616, %f615, 0f00000000; add.ftz.f32 %f617, %f614, %f616; mov.b32 %r993, %f582; shfl.sync.bfly.b32 %r994|%p180, %r993, %r959, %r958, %r960; mov.b32 %f618, %r994; add.ftz.f32 %f619, %f582, %f618; mov.b32 %r995, %f619; shfl.sync.bfly.b32 %r996|%p181, %r995, %r963, %r958, %r960; mov.b32 %f620, %r996; add.ftz.f32 %f621, %f619, %f620; mov.b32 %r997, %f587; shfl.sync.bfly.b32 %r998|%p182, %r997, %r959, %r958, %r960; mov.b32 %f622, %r998; add.ftz.f32 %f623, %f587, %f622; mov.b32 %r999, %f623; shfl.sync.bfly.b32 %r1000|%p183, %r999, %r963, %r958, %r960; mov.b32 %f624, %r1000; add.ftz.f32 %f625, %f623, %f624; mov.b32 %r1001, %f592; shfl.sync.bfly.b32 %r1002|%p184, %r1001, %r959, %r958, %r960; mov.b32 %f626, %r1002; add.ftz.f32 %f627, %f592, %f626; mov.b32 %r1003, %f627; shfl.sync.bfly.b32 %r1004|%p185, %r1003, %r963, %r958, %r960; mov.b32 %f628, %r1004; add.ftz.f32 %f629, %f627, %f628; mov.b32 %r1005, %f597; shfl.sync.bfly.b32 %r1006|%p186, %r1005, %r959, %r958, %r960; mov.b32 %f630, %r1006; add.ftz.f32 %f631, %f597, %f630; mov.b32 %r1007, %f631; shfl.sync.bfly.b32 %r1008|%p187, %r1007, %r963, %r958, %r960; mov.b32 %f632, %r1008; add.ftz.f32 %f633, %f631, %f632; mov.b32 %r1009, %f602; shfl.sync.bfly.b32 %r1010|%p188, %r1009, %r959, %r958, %r960; mov.b32 %f634, %r1010; add.ftz.f32 %f635, %f602, %f634; mov.b32 %r1011, %f635; shfl.sync.bfly.b32 %r1012|%p189, %r1011, %r963, %r958, %r960; mov.b32 %f636, %r1012; add.ftz.f32 %f637, %f635, %f636; mov.b32 %r1013, %f607; 
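//
// [annotation, not compiler output] The shfl.sync.bfly exchanges with
// offsets 1 and 2 around this point appear to reduce the per-thread partial
// sums of exponentials into per-row softmax denominators. Just below, the
// running denominator from earlier k-tiles is rescaled by exp(m_old - m_new)
// (another mul/ex2 pair) before the new partial sum is added in: the
// online-softmax correction step.
//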
shfl.sync.bfly.b32 %r1014|%p190, %r1013, %r959, %r958, %r960; mov.b32 %f638, %r1014; add.ftz.f32 %f639, %f607, %f638; mov.b32 %r1015, %f639; shfl.sync.bfly.b32 %r1016|%p191, %r1015, %r963, %r958, %r960; mov.b32 %f640, %r1016; add.ftz.f32 %f641, %f639, %f640; mov.b32 %r1017, %f612; shfl.sync.bfly.b32 %r1018|%p192, %r1017, %r959, %r958, %r960; mov.b32 %f642, %r1018; add.ftz.f32 %f643, %f612, %f642; mov.b32 %r1019, %f643; shfl.sync.bfly.b32 %r1020|%p193, %r1019, %r963, %r958, %r960; mov.b32 %f644, %r1020; add.ftz.f32 %f645, %f643, %f644; mov.b32 %r1021, %f617; shfl.sync.bfly.b32 %r1022|%p194, %r1021, %r959, %r958, %r960; mov.b32 %f646, %r1022; add.ftz.f32 %f647, %f617, %f646; mov.b32 %r1023, %f647; shfl.sync.bfly.b32 %r1024|%p195, %r1023, %r963, %r958, %r960; mov.b32 %f648, %r1024; add.ftz.f32 %f649, %f647, %f648; sub.ftz.f32 %f650, %f1113, %f114; mul.ftz.f32 %f651, %f650, 0f3FB8AA3B; ex2.approx.ftz.f32 %f652, %f651; mul.ftz.f32 %f154, %f652, %f1105; add.ftz.f32 %f1105, %f154, %f621; sub.ftz.f32 %f653, %f1112, %f115; mul.ftz.f32 %f654, %f653, 0f3FB8AA3B; ex2.approx.ftz.f32 %f655, %f654; mul.ftz.f32 %f156, %f655, %f1104; add.ftz.f32 %f1104, %f156, %f625; sub.ftz.f32 %f656, %f1111, %f116; mul.ftz.f32 %f657, %f656, 0f3FB8AA3B; ex2.approx.ftz.f32 %f658, %f657; mul.ftz.f32 %f158, %f658, %f1103; add.ftz.f32 %f1103, %f158, %f629; sub.ftz.f32 %f659, %f1110, %f117; mul.ftz.f32 %f660, %f659, 0f3FB8AA3B; ex2.approx.ftz.f32 %f661, %f660; mul.ftz.f32 %f160, %f661, %f1102; add.ftz.f32 %f1102, %f160, %f633; sub.ftz.f32 %f662, %f1109, %f118; mul.ftz.f32 %f663, %f662, 0f3FB8AA3B; ex2.approx.ftz.f32 %f664, %f663; mul.ftz.f32 %f162, %f664, %f1101; add.ftz.f32 %f1101, %f162, %f637; sub.ftz.f32 %f665, %f1108, %f119; mul.ftz.f32 %f666, %f665, 0f3FB8AA3B; ex2.approx.ftz.f32 %f667, %f666; mul.ftz.f32 %f164, %f667, %f1100; add.ftz.f32 %f1100, %f164, %f641; sub.ftz.f32 %f668, %f1107, %f120; mul.ftz.f32 %f669, %f668, 0f3FB8AA3B; ex2.approx.ftz.f32 %f670, %f669; mul.ftz.f32 %f166, %f670, %f1099; add.ftz.f32 %f1099, %f166, %f645; sub.ftz.f32 %f671, %f1106, %f121; mul.ftz.f32 %f672, %f671, 0f3FB8AA3B; ex2.approx.ftz.f32 %f673, %f672; mul.ftz.f32 %f168, %f673, %f1098; add.ftz.f32 %f1098, %f168, %f649; @%p227 bra $L__BB0_74; // begin inline asm cp.async.wait_group 0; // end inline asm bar.sync 0; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1584, %r1585, %r1586, %r1587}, [%r534]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1588, %r1589, %r1590, %r1591}, [%r539]; // end inline asm $L__BB0_74: // begin inline asm cvt.rn.f16x2.f32 %r1051, %f123, %f122; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1052, %f127, %f126; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1053, %f125, %f124; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1054, %f129, %f128; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1055, %f131, %f130; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1056, %f135, %f134; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1057, %f133, %f132; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1058, %f137, %f136; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1059, %f139, %f138; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1060, %f143, %f142; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1061, %f141, %f140; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1062, %f145, %f144; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1063, %f147, %f146; // end 
inline asm // begin inline asm cvt.rn.f16x2.f32 %r1064, %f151, %f150; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1065, %f149, %f148; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1066, %f153, %f152; // end inline asm // begin inline asm mov.u32 %r1067, 0; // end inline asm // begin inline asm mov.u32 %r1068, 0; // end inline asm // begin inline asm mov.u32 %r1069, 0; // end inline asm // begin inline asm mov.u32 %r1070, 0; // end inline asm // begin inline asm mov.u32 %r1071, 0; // end inline asm // begin inline asm mov.u32 %r1072, 0; // end inline asm // begin inline asm mov.u32 %r1073, 0; // end inline asm // begin inline asm mov.u32 %r1074, 0; // end inline asm // begin inline asm mov.u32 %r1075, 0; // end inline asm // begin inline asm mov.u32 %r1076, 0; // end inline asm // begin inline asm mov.u32 %r1077, 0; // end inline asm // begin inline asm mov.u32 %r1078, 0; // end inline asm // begin inline asm mov.u32 %r1079, 0; // end inline asm // begin inline asm mov.u32 %r1080, 0; // end inline asm // begin inline asm mov.u32 %r1081, 0; // end inline asm // begin inline asm mov.u32 %r1082, 0; // end inline asm // begin inline asm mov.u32 %r1083, 0; // end inline asm // begin inline asm mov.u32 %r1084, 0; // end inline asm // begin inline asm mov.u32 %r1085, 0; // end inline asm // begin inline asm mov.u32 %r1086, 0; // end inline asm // begin inline asm mov.u32 %r1087, 0; // end inline asm // begin inline asm mov.u32 %r1088, 0; // end inline asm // begin inline asm mov.u32 %r1089, 0; // end inline asm // begin inline asm mov.u32 %r1090, 0; // end inline asm // begin inline asm mov.u32 %r1091, 0; // end inline asm // begin inline asm mov.u32 %r1092, 0; // end inline asm // begin inline asm mov.u32 %r1093, 0; // end inline asm // begin inline asm mov.u32 %r1094, 0; // end inline asm // begin inline asm mov.u32 %r1095, 0; // end inline asm // begin inline asm mov.u32 %r1096, 0; // end inline asm // begin inline asm mov.u32 %r1097, 0; // end inline asm // begin inline asm mov.u32 %r1098, 0; // end inline asm // begin inline asm mov.u32 %r1099, 0; // end inline asm // begin inline asm mov.u32 %r1100, 0; // end inline asm // begin inline asm mov.u32 %r1101, 0; // end inline asm // begin inline asm mov.u32 %r1102, 0; // end inline asm // begin inline asm mov.u32 %r1103, 0; // end inline asm // begin inline asm mov.u32 %r1104, 0; // end inline asm // begin inline asm mov.u32 %r1105, 0; // end inline asm // begin inline asm mov.u32 %r1106, 0; // end inline asm // begin inline asm mov.u32 %r1107, 0; // end inline asm // begin inline asm mov.u32 %r1108, 0; // end inline asm // begin inline asm mov.u32 %r1109, 0; // end inline asm // begin inline asm mov.u32 %r1110, 0; // end inline asm // begin inline asm mov.u32 %r1111, 0; // end inline asm // begin inline asm mov.u32 %r1112, 0; // end inline asm // begin inline asm mov.u32 %r1113, 0; // end inline asm // begin inline asm mov.u32 %r1114, 0; // end inline asm // begin inline asm mov.u32 %r1115, 0; // end inline asm // begin inline asm mov.u32 %r1116, 0; // end inline asm // begin inline asm mov.u32 %r1117, 0; // end inline asm // begin inline asm mov.u32 %r1118, 0; // end inline asm // begin inline asm mov.u32 %r1119, 0; // end inline asm // begin inline asm mov.u32 %r1120, 0; // end inline asm // begin inline asm mov.u32 %r1121, 0; // end inline asm // begin inline asm mov.u32 %r1122, 0; // end inline asm // begin inline asm mov.u32 %r1123, 0; // end inline asm // begin inline asm mov.u32 %r1124, 0; // end inline asm // 
begin inline asm mov.u32 %r1125, 0; // end inline asm // begin inline asm mov.u32 %r1126, 0; // end inline asm // begin inline asm mov.u32 %r1127, 0; // end inline asm // begin inline asm mov.u32 %r1128, 0; // end inline asm // begin inline asm mov.u32 %r1129, 0; // end inline asm // begin inline asm mov.u32 %r1130, 0; // end inline asm mov.b32 %f706, %r1067; mov.b32 %f707, %r1068; mov.b32 %f708, %r1069; mov.b32 %f709, %r1070; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f706, %f707, %f708, %f709}, {%r1051, %r1052, %r1053, %r1054}, {%r1592, %r1593}, {%f706, %f707, %f708, %f709}; // end inline asm mov.b32 %f714, %r1071; mov.b32 %f715, %r1072; mov.b32 %f716, %r1073; mov.b32 %f717, %r1074; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f714, %f715, %f716, %f717}, {%r1051, %r1052, %r1053, %r1054}, {%r1594, %r1595}, {%f714, %f715, %f716, %f717}; // end inline asm mov.b32 %f722, %r1075; mov.b32 %f723, %r1076; mov.b32 %f724, %r1077; mov.b32 %f725, %r1078; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f722, %f723, %f724, %f725}, {%r1051, %r1052, %r1053, %r1054}, {%r1596, %r1597}, {%f722, %f723, %f724, %f725}; // end inline asm mov.b32 %f730, %r1079; mov.b32 %f731, %r1080; mov.b32 %f732, %r1081; mov.b32 %f733, %r1082; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f730, %f731, %f732, %f733}, {%r1051, %r1052, %r1053, %r1054}, {%r1598, %r1599}, {%f730, %f731, %f732, %f733}; // end inline asm mov.b32 %f738, %r1083; mov.b32 %f739, %r1084; mov.b32 %f740, %r1085; mov.b32 %f741, %r1086; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f738, %f739, %f740, %f741}, {%r1055, %r1056, %r1057, %r1058}, {%r1592, %r1593}, {%f738, %f739, %f740, %f741}; // end inline asm mov.b32 %f746, %r1087; mov.b32 %f747, %r1088; mov.b32 %f748, %r1089; mov.b32 %f749, %r1090; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f746, %f747, %f748, %f749}, {%r1055, %r1056, %r1057, %r1058}, {%r1594, %r1595}, {%f746, %f747, %f748, %f749}; // end inline asm mov.b32 %f754, %r1091; mov.b32 %f755, %r1092; mov.b32 %f756, %r1093; mov.b32 %f757, %r1094; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f754, %f755, %f756, %f757}, {%r1055, %r1056, %r1057, %r1058}, {%r1596, %r1597}, {%f754, %f755, %f756, %f757}; // end inline asm mov.b32 %f762, %r1095; mov.b32 %f763, %r1096; mov.b32 %f764, %r1097; mov.b32 %f765, %r1098; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f762, %f763, %f764, %f765}, {%r1055, %r1056, %r1057, %r1058}, {%r1598, %r1599}, {%f762, %f763, %f764, %f765}; // end inline asm mov.b32 %f770, %r1099; mov.b32 %f771, %r1100; mov.b32 %f772, %r1101; mov.b32 %f773, %r1102; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f770, %f771, %f772, %f773}, {%r1059, %r1060, %r1061, %r1062}, {%r1592, %r1593}, {%f770, %f771, %f772, %f773}; // end inline asm mov.b32 %f778, %r1103; mov.b32 %f779, %r1104; mov.b32 %f780, %r1105; mov.b32 %f781, %r1106; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f778, %f779, %f780, %f781}, {%r1059, %r1060, %r1061, %r1062}, {%r1594, %r1595}, {%f778, %f779, %f780, %f781}; // end inline asm mov.b32 %f786, %r1107; mov.b32 %f787, %r1108; mov.b32 %f788, %r1109; mov.b32 %f789, %r1110; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f786, %f787, %f788, %f789}, {%r1059, %r1060, %r1061, %r1062}, {%r1596, %r1597}, {%f786, %f787, %f788, %f789}; // end inline asm 
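//
// [annotation, not compiler output] Second GEMM of the attention step: the
// probabilities packed into fp16 pairs by cvt.rn.f16x2.f32 above are
// multiplied against the V tiles (loaded earlier with ldmatrix ... .trans)
// using m16n8k16 mma.sync, accumulating in fp32. Below, the code forms
// 1/denominator with rcp.approx.ftz.f32 (guarded by a compare against
// 0f00000000, falling back to 0f3F800000, i.e. 1.0) and folds the previous
// accumulator back in via fma.rn.ftz.f32 before scaling.
//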
mov.b32 %f794, %r1111; mov.b32 %f795, %r1112; mov.b32 %f796, %r1113; mov.b32 %f797, %r1114; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f794, %f795, %f796, %f797}, {%r1059, %r1060, %r1061, %r1062}, {%r1598, %r1599}, {%f794, %f795, %f796, %f797}; // end inline asm mov.b32 %f802, %r1115; mov.b32 %f803, %r1116; mov.b32 %f804, %r1117; mov.b32 %f805, %r1118; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f802, %f803, %f804, %f805}, {%r1063, %r1064, %r1065, %r1066}, {%r1592, %r1593}, {%f802, %f803, %f804, %f805}; // end inline asm mov.b32 %f810, %r1119; mov.b32 %f811, %r1120; mov.b32 %f812, %r1121; mov.b32 %f813, %r1122; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f810, %f811, %f812, %f813}, {%r1063, %r1064, %r1065, %r1066}, {%r1594, %r1595}, {%f810, %f811, %f812, %f813}; // end inline asm mov.b32 %f818, %r1123; mov.b32 %f819, %r1124; mov.b32 %f820, %r1125; mov.b32 %f821, %r1126; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f818, %f819, %f820, %f821}, {%r1063, %r1064, %r1065, %r1066}, {%r1596, %r1597}, {%f818, %f819, %f820, %f821}; // end inline asm mov.b32 %f826, %r1127; mov.b32 %f827, %r1128; mov.b32 %f828, %r1129; mov.b32 %f829, %r1130; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f826, %f827, %f828, %f829}, {%r1063, %r1064, %r1065, %r1066}, {%r1598, %r1599}, {%f826, %f827, %f828, %f829}; // end inline asm setp.equ.ftz.f32 %p196, %f1105, 0f00000000; mov.f32 %f1147, 0f3F800000; mov.f32 %f1146, %f1147; @%p196 bra $L__BB0_76; rcp.approx.ftz.f32 %f1146, %f1105; $L__BB0_76: setp.equ.ftz.f32 %p197, %f1104, 0f00000000; @%p197 bra $L__BB0_78; rcp.approx.ftz.f32 %f1147, %f1104; $L__BB0_78: mov.b32 %f837, %r1663; fma.rn.ftz.f32 %f838, %f154, %f837, %f706; mul.ftz.f32 %f839, %f1146, %f838; mov.b32 %r1663, %f839; mov.b32 %f840, %r1662; fma.rn.ftz.f32 %f841, %f154, %f840, %f707; mul.ftz.f32 %f842, %f1146, %f841; mov.b32 %r1662, %f842; mov.b32 %f843, %r1661; fma.rn.ftz.f32 %f844, %f156, %f843, %f708; mul.ftz.f32 %f845, %f1147, %f844; mov.b32 %r1661, %f845; mov.b32 %f846, %r1660; fma.rn.ftz.f32 %f847, %f156, %f846, %f709; mul.ftz.f32 %f848, %f1147, %f847; mov.b32 %r1660, %f848; mov.b32 %f849, %r1659; fma.rn.ftz.f32 %f850, %f154, %f849, %f714; mul.ftz.f32 %f851, %f1146, %f850; mov.b32 %r1659, %f851; mov.b32 %f852, %r1658; fma.rn.ftz.f32 %f853, %f154, %f852, %f715; mul.ftz.f32 %f854, %f1146, %f853; mov.b32 %r1658, %f854; mov.b32 %f855, %r1657; fma.rn.ftz.f32 %f856, %f156, %f855, %f716; mul.ftz.f32 %f857, %f1147, %f856; mov.b32 %r1657, %f857; mov.b32 %f858, %r1656; fma.rn.ftz.f32 %f859, %f156, %f858, %f717; mul.ftz.f32 %f860, %f1147, %f859; mov.b32 %r1656, %f860; mov.b32 %f861, %r1655; fma.rn.ftz.f32 %f862, %f154, %f861, %f722; mul.ftz.f32 %f863, %f1146, %f862; mov.b32 %r1655, %f863; mov.b32 %f864, %r1654; fma.rn.ftz.f32 %f865, %f154, %f864, %f723; mul.ftz.f32 %f866, %f1146, %f865; mov.b32 %r1654, %f866; mov.b32 %f867, %r1653; fma.rn.ftz.f32 %f868, %f156, %f867, %f724; mul.ftz.f32 %f869, %f1147, %f868; mov.b32 %r1653, %f869; mov.b32 %f870, %r1652; fma.rn.ftz.f32 %f871, %f156, %f870, %f725; mul.ftz.f32 %f872, %f1147, %f871; mov.b32 %r1652, %f872; mov.b32 %f873, %r1651; fma.rn.ftz.f32 %f874, %f154, %f873, %f730; mul.ftz.f32 %f875, %f1146, %f874; mov.b32 %r1651, %f875; mov.b32 %f876, %r1650; fma.rn.ftz.f32 %f877, %f154, %f876, %f731; mul.ftz.f32 %f878, %f1146, %f877; mov.b32 %r1650, %f878; mov.b32 %f879, %r1649; fma.rn.ftz.f32 %f880, %f156, %f879, %f732; mul.ftz.f32 %f881, %f1147, 
%f880; mov.b32 %r1649, %f881; mov.b32 %f882, %r1648; fma.rn.ftz.f32 %f883, %f156, %f882, %f733; mul.ftz.f32 %f884, %f1147, %f883; mov.b32 %r1648, %f884; setp.equ.ftz.f32 %p198, %f1103, 0f00000000; mov.f32 %f1149, 0f3F800000; mov.f32 %f1148, %f1149; @%p198 bra $L__BB0_80; rcp.approx.ftz.f32 %f1148, %f1103; $L__BB0_80: setp.equ.ftz.f32 %p199, %f1102, 0f00000000; @%p199 bra $L__BB0_82; rcp.approx.ftz.f32 %f1149, %f1102; $L__BB0_82: mov.b32 %f887, %r1647; fma.rn.ftz.f32 %f888, %f158, %f887, %f738; mul.ftz.f32 %f889, %f1148, %f888; mov.b32 %r1647, %f889; mov.b32 %f890, %r1646; fma.rn.ftz.f32 %f891, %f158, %f890, %f739; mul.ftz.f32 %f892, %f1148, %f891; mov.b32 %r1646, %f892; mov.b32 %f893, %r1645; fma.rn.ftz.f32 %f894, %f160, %f893, %f740; mul.ftz.f32 %f895, %f1149, %f894; mov.b32 %r1645, %f895; mov.b32 %f896, %r1644; fma.rn.ftz.f32 %f897, %f160, %f896, %f741; mul.ftz.f32 %f898, %f1149, %f897; mov.b32 %r1644, %f898; mov.b32 %f899, %r1643; fma.rn.ftz.f32 %f900, %f158, %f899, %f746; mul.ftz.f32 %f901, %f1148, %f900; mov.b32 %r1643, %f901; mov.b32 %f902, %r1642; fma.rn.ftz.f32 %f903, %f158, %f902, %f747; mul.ftz.f32 %f904, %f1148, %f903; mov.b32 %r1642, %f904; mov.b32 %f905, %r1641; fma.rn.ftz.f32 %f906, %f160, %f905, %f748; mul.ftz.f32 %f907, %f1149, %f906; mov.b32 %r1641, %f907; mov.b32 %f908, %r1640; fma.rn.ftz.f32 %f909, %f160, %f908, %f749; mul.ftz.f32 %f910, %f1149, %f909; mov.b32 %r1640, %f910; mov.b32 %f911, %r1639; fma.rn.ftz.f32 %f912, %f158, %f911, %f754; mul.ftz.f32 %f913, %f1148, %f912; mov.b32 %r1639, %f913; mov.b32 %f914, %r1638; fma.rn.ftz.f32 %f915, %f158, %f914, %f755; mul.ftz.f32 %f916, %f1148, %f915; mov.b32 %r1638, %f916; mov.b32 %f917, %r1637; fma.rn.ftz.f32 %f918, %f160, %f917, %f756; mul.ftz.f32 %f919, %f1149, %f918; mov.b32 %r1637, %f919; mov.b32 %f920, %r1636; fma.rn.ftz.f32 %f921, %f160, %f920, %f757; mul.ftz.f32 %f922, %f1149, %f921; mov.b32 %r1636, %f922; mov.b32 %f923, %r1635; fma.rn.ftz.f32 %f924, %f158, %f923, %f762; mul.ftz.f32 %f925, %f1148, %f924; mov.b32 %r1635, %f925; mov.b32 %f926, %r1634; fma.rn.ftz.f32 %f927, %f158, %f926, %f763; mul.ftz.f32 %f928, %f1148, %f927; mov.b32 %r1634, %f928; mov.b32 %f929, %r1633; fma.rn.ftz.f32 %f930, %f160, %f929, %f764; mul.ftz.f32 %f931, %f1149, %f930; mov.b32 %r1633, %f931; mov.b32 %f932, %r1632; fma.rn.ftz.f32 %f933, %f160, %f932, %f765; mul.ftz.f32 %f934, %f1149, %f933; mov.b32 %r1632, %f934; setp.equ.ftz.f32 %p200, %f1101, 0f00000000; mov.f32 %f1151, 0f3F800000; mov.f32 %f1150, %f1151; @%p200 bra $L__BB0_84; rcp.approx.ftz.f32 %f1150, %f1101; $L__BB0_84: setp.equ.ftz.f32 %p201, %f1100, 0f00000000; @%p201 bra $L__BB0_86; rcp.approx.ftz.f32 %f1151, %f1100; $L__BB0_86: mov.b32 %f937, %r1631; fma.rn.ftz.f32 %f938, %f162, %f937, %f770; mul.ftz.f32 %f939, %f1150, %f938; mov.b32 %r1631, %f939; mov.b32 %f940, %r1630; fma.rn.ftz.f32 %f941, %f162, %f940, %f771; mul.ftz.f32 %f942, %f1150, %f941; mov.b32 %r1630, %f942; mov.b32 %f943, %r1629; fma.rn.ftz.f32 %f944, %f164, %f943, %f772; mul.ftz.f32 %f945, %f1151, %f944; mov.b32 %r1629, %f945; mov.b32 %f946, %r1628; fma.rn.ftz.f32 %f947, %f164, %f946, %f773; mul.ftz.f32 %f948, %f1151, %f947; mov.b32 %r1628, %f948; mov.b32 %f949, %r1627; fma.rn.ftz.f32 %f950, %f162, %f949, %f778; mul.ftz.f32 %f951, %f1150, %f950; mov.b32 %r1627, %f951; mov.b32 %f952, %r1626; fma.rn.ftz.f32 %f953, %f162, %f952, %f779; mul.ftz.f32 %f954, %f1150, %f953; mov.b32 %r1626, %f954; mov.b32 %f955, %r1625; fma.rn.ftz.f32 %f956, %f164, %f955, %f780; mul.ftz.f32 %f957, %f1151, %f956; mov.b32 %r1625, %f957; mov.b32 
mov.b32 %f958, %r1624; fma.rn.ftz.f32 %f959, %f164, %f958, %f781; mul.ftz.f32 %f960, %f1151, %f959; mov.b32 %r1624, %f960;
mov.b32 %f961, %r1623; fma.rn.ftz.f32 %f962, %f162, %f961, %f786; mul.ftz.f32 %f963, %f1150, %f962; mov.b32 %r1623, %f963;
mov.b32 %f964, %r1622; fma.rn.ftz.f32 %f965, %f162, %f964, %f787; mul.ftz.f32 %f966, %f1150, %f965; mov.b32 %r1622, %f966;
mov.b32 %f967, %r1621; fma.rn.ftz.f32 %f968, %f164, %f967, %f788; mul.ftz.f32 %f969, %f1151, %f968; mov.b32 %r1621, %f969;
mov.b32 %f970, %r1620; fma.rn.ftz.f32 %f971, %f164, %f970, %f789; mul.ftz.f32 %f972, %f1151, %f971; mov.b32 %r1620, %f972;
mov.b32 %f973, %r1619; fma.rn.ftz.f32 %f974, %f162, %f973, %f794; mul.ftz.f32 %f975, %f1150, %f974; mov.b32 %r1619, %f975;
mov.b32 %f976, %r1618; fma.rn.ftz.f32 %f977, %f162, %f976, %f795; mul.ftz.f32 %f978, %f1150, %f977; mov.b32 %r1618, %f978;
mov.b32 %f979, %r1617; fma.rn.ftz.f32 %f980, %f164, %f979, %f796; mul.ftz.f32 %f981, %f1151, %f980; mov.b32 %r1617, %f981;
mov.b32 %f982, %r1616; fma.rn.ftz.f32 %f983, %f164, %f982, %f797; mul.ftz.f32 %f984, %f1151, %f983; mov.b32 %r1616, %f984;
setp.equ.ftz.f32 %p202, %f1099, 0f00000000;
mov.f32 %f1153, 0f3F800000;
mov.f32 %f1152, %f1153;
@%p202 bra $L__BB0_88;
rcp.approx.ftz.f32 %f1152, %f1099;
$L__BB0_88:
setp.equ.ftz.f32 %p203, %f1098, 0f00000000;
@%p203 bra $L__BB0_90;
rcp.approx.ftz.f32 %f1153, %f1098;
$L__BB0_90:
add.s32 %r1475, %r1578, 16;
setp.ge.s32 %p226, %r1475, %r126;
mov.b32 %f986, %r1615; fma.rn.ftz.f32 %f987, %f166, %f986, %f802; mul.ftz.f32 %f988, %f1152, %f987; mov.b32 %r1615, %f988;
mov.b32 %f989, %r1614; fma.rn.ftz.f32 %f990, %f166, %f989, %f803; mul.ftz.f32 %f991, %f1152, %f990; mov.b32 %r1614, %f991;
mov.b32 %f992, %r1613; fma.rn.ftz.f32 %f993, %f168, %f992, %f804; mul.ftz.f32 %f994, %f1153, %f993; mov.b32 %r1613, %f994;
mov.b32 %f995, %r1612; fma.rn.ftz.f32 %f996, %f168, %f995, %f805; mul.ftz.f32 %f997, %f1153, %f996; mov.b32 %r1612, %f997;
mov.b32 %f998, %r1611; fma.rn.ftz.f32 %f999, %f166, %f998, %f810; mul.ftz.f32 %f1000, %f1152, %f999; mov.b32 %r1611, %f1000;
mov.b32 %f1001, %r1610; fma.rn.ftz.f32 %f1002, %f166, %f1001, %f811; mul.ftz.f32 %f1003, %f1152, %f1002; mov.b32 %r1610, %f1003;
mov.b32 %f1004, %r1609; fma.rn.ftz.f32 %f1005, %f168, %f1004, %f812; mul.ftz.f32 %f1006, %f1153, %f1005; mov.b32 %r1609, %f1006;
mov.b32 %f1007, %r1608; fma.rn.ftz.f32 %f1008, %f168, %f1007, %f813; mul.ftz.f32 %f1009, %f1153, %f1008; mov.b32 %r1608, %f1009;
mov.b32 %f1010, %r1607; fma.rn.ftz.f32 %f1011, %f166, %f1010, %f818; mul.ftz.f32 %f1012, %f1152, %f1011; mov.b32 %r1607, %f1012;
mov.b32 %f1013, %r1606; fma.rn.ftz.f32 %f1014, %f166, %f1013, %f819; mul.ftz.f32 %f1015, %f1152, %f1014; mov.b32 %r1606, %f1015;
mov.b32 %f1016, %r1605; fma.rn.ftz.f32 %f1017, %f168, %f1016, %f820; mul.ftz.f32 %f1018, %f1153, %f1017; mov.b32 %r1605, %f1018;
mov.b32 %f1019, %r1604; fma.rn.ftz.f32 %f1020, %f168, %f1019, %f821; mul.ftz.f32 %f1021, %f1153, %f1020; mov.b32 %r1604, %f1021;
mov.b32 %f1022, %r1603; fma.rn.ftz.f32 %f1023, %f166, %f1022, %f826; mul.ftz.f32 %f1024, %f1152, %f1023; mov.b32 %r1603, %f1024;
mov.b32 %f1025, %r1602; fma.rn.ftz.f32 %f1026, %f166, %f1025, %f827; mul.ftz.f32 %f1027, %f1152, %f1026; mov.b32 %r1602, %f1027;
mov.b32 %f1028, %r1601; fma.rn.ftz.f32 %f1029, %f168, %f1028, %f828; mul.ftz.f32 %f1030, %f1153, %f1029; mov.b32 %r1601, %f1030;
mov.b32 %f1031, %r1600; fma.rn.ftz.f32 %f1032, %f168, %f1031, %f829; mul.ftz.f32 %f1033, %f1153, %f1032; mov.b32 %r1600, %f1033;
@%p226 bra $L__BB0_92;
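//
// %p226 tests whether another 16-row K/V step remains (%r1578 + 16 >= %r126 means
// this was the last one). When more work is left, the branch falls through and
// the two ldmatrix ops below preload the next step's B-operand fragments from
// shared memory while the rescale results are still in flight. A hypothetical
// CUDA inline-asm sketch of such a load (smem_addr assumed to be a 32-bit
// shared-space address, e.g. obtained via __cvta_generic_to_shared):
//
//   uint32_t b[4];
//   asm volatile(
//       "ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%0,%1,%2,%3}, [%4];"
//       : "=r"(b[0]), "=r"(b[1]), "=r"(b[2]), "=r"(b[3]) : "r"(smem_addr));
//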
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1592, %r1593, %r1594, %r1595}, [%r544];
// end inline asm
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1596, %r1597, %r1598, %r1599}, [%r549];
// end inline asm
$L__BB0_92:
add.s32 %r1578, %r1578, 16;
setp.lt.s32 %p205, %r1578, %r126;
add.s32 %r1579, %r1579, 16;
mov.f32 %f1106, %f121; mov.f32 %f1107, %f120; mov.f32 %f1108, %f119; mov.f32 %f1109, %f118;
mov.f32 %f1110, %f117; mov.f32 %f1111, %f116; mov.f32 %f1112, %f115; mov.f32 %f1113, %f114;
@%p205 bra $L__BB0_3;
$L__BB0_93:
mov.u32 %r1491, %tid.x;
mov.u32 %r1490, _ZN25fused_multihead_attention5smem_E;
mov.u32 %r1489, %tid.x;
shr.s32 %r1488, %r1489, 31;
shr.u32 %r1487, %r1488, 30;
add.s32 %r1486, %r1489, %r1487;
and.b32 %r1485, %r1486, -4;
sub.s32 %r1484, %r1489, %r1485;
shl.b32 %r1483, %r1484, 4;
cvt.s64.s32 %rd261, %r1483;
mov.b64 %rd260, fmha_v2_flash_attention_fp16_256_16_S_32_sm86_kernel_nl_param_0;
mov.u64 %rd259, %rd260;
ld.param.u32 %r1482, [%rd259+60];
mov.u32 %r1481, %ctaid.y;
mov.u32 %r1480, %ctaid.x;
ld.param.u32 %r1479, [fmha_v2_flash_attention_fp16_256_16_S_32_sm86_kernel_nl_param_0+52];
mul.lo.s32 %r1478, %r1, %r1481;
mad.lo.s32 %r1477, %r1478, %r1479, %r1480;
bar.sync 0;
mul.lo.s32 %r1291, %r1477, %r1482;
shl.b32 %r1292, %r1291, 1;
cvt.s64.s32 %rd218, %r1292;
add.s64 %rd37, %rd218, %rd261;
mov.b32 %f1034, %r1662; mov.b32 %f1035, %r1663; // begin inline asm
cvt.rn.f16x2.f32 %r1253, %f1034, %f1035; // end inline asm
mov.b32 %f1036, %r1660; mov.b32 %f1037, %r1661; // begin inline asm
cvt.rn.f16x2.f32 %r1254, %f1036, %f1037; // end inline asm
and.b32 %r1295, %r1489, 224;
shr.u32 %r1296, %r1295, 2;
and.b32 %r1297, %r1489, 24;
shr.u32 %r1298, %r1297, 3;
or.b32 %r1299, %r1296, %r1298;
shl.b32 %r1300, %r1299, 7;
add.s32 %r1302, %r1300, %r1490;
and.b32 %r1303, %r1489, 4;
and.b32 %r1304, %r1489, 3;
bfi.b32 %r1305, %r1303, %r1304, 2, 30;
shr.u32 %r1306, %r1297, 1;
or.b32 %r1307, %r1305, %r1306;
shl.b32 %r1308, %r1307, 2;
add.s32 %r1309, %r1302, %r1308;
add.s32 %r1255, %r1309, 16384; // begin inline asm
st.shared.b32 [%r1255], %r1253; // end inline asm
add.s32 %r432, %r1309, 16896; // begin inline asm
st.shared.b32 [%r432], %r1254; // end inline asm
xor.b32 %r1261, %r1255, 16;
mov.b32 %f1038, %r1658; mov.b32 %f1039, %r1659; // begin inline asm
cvt.rn.f16x2.f32 %r1259, %f1038, %f1039; // end inline asm
mov.b32 %f1040, %r1656; mov.b32 %f1041, %r1657; // begin inline asm
cvt.rn.f16x2.f32 %r1260, %f1040, %f1041; // end inline asm // begin inline asm
st.shared.b32 [%r1261], %r1259; // end inline asm
add.s32 %r1263, %r1261, 512; // begin inline asm
st.shared.b32 [%r1263], %r1260; // end inline asm
xor.b32 %r1267, %r1255, 32;
mov.b32 %f1042, %r1654; mov.b32 %f1043, %r1655; // begin inline asm
cvt.rn.f16x2.f32 %r1265, %f1042, %f1043; // end inline asm
mov.b32 %f1044, %r1652; mov.b32 %f1045, %r1653; // begin inline asm
cvt.rn.f16x2.f32 %r1266, %f1044, %f1045; // end inline asm // begin inline asm
st.shared.b32 [%r1267], %r1265; // end inline asm
add.s32 %r1269, %r1267, 512; // begin inline asm
st.shared.b32 [%r1269], %r1266; // end inline asm
xor.b32 %r1273, %r1255, 48;
mov.b32 %f1046, %r1650; mov.b32 %f1047, %r1651; // begin inline asm
cvt.rn.f16x2.f32 %r1271, %f1046, %f1047; // end inline asm
mov.b32 %f1048, %r1648; mov.b32 %f1049, %r1649; // begin inline asm
cvt.rn.f16x2.f32 %r1272, %f1048, %f1049; // end inline asm // begin inline asm
st.shared.b32 [%r1273], %r1271; // end inline asm
add.s32 %r1275, %r1273, 512; // begin inline asm
st.shared.b32 [%r1275], %r1272; // end inline asm
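//
// While %p205 holds, control loops back to $L__BB0_3 for the next K/V step; the
// code from $L__BB0_93 on is the epilogue. Each pair of f32 accumulator lanes is
// packed into one f16x2 word (cvt.rn.f16x2.f32) and staged to shared memory; the
// xor.b32 address toggles (16/32/48) appear to implement a swizzled layout that
// keeps the subsequent vectorized reads free of bank conflicts. A CUDA-level
// sketch of the packing step (assumed equivalent, not the kernel's source):
//
//   #include <cuda_fp16.h>
//   __half2 h2 = __floats2half2_rn(lo, hi);       // lowers to cvt.rn.f16x2.f32
//   *reinterpret_cast<__half2*>(smem_ptr) = h2;   // st.shared.b32
//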
bar.sync 0; // begin inline asm
ld.shared.v4.b32 {%r1277, %r1278, %r1279, %r1280}, [%r10]; // end inline asm
add.s32 %r1286, %r10, 2048; // begin inline asm
ld.shared.v4.b32 {%r1282, %r1283, %r1284, %r1285}, [%r1286]; // end inline asm
bar.sync 0;
cvt.u32.u64 %r1310, %rd5;
setp.ge.s32 %p206, %r1310, %r1;
@%p206 bra $L__BB0_98;
setp.ge.s32 %p207, %r7, %r9;
@%p207 bra $L__BB0_96;
mul.lo.s64 %rd220, %rd8, %rd5;
add.s64 %rd221, %rd37, %rd220;
cvta.to.global.u64 %rd222, %rd9;
add.s64 %rd223, %rd222, %rd221;
st.global.v4.u32 [%rd223], {%r1277, %r1278, %r1279, %r1280};
$L__BB0_96:
add.s32 %r1312, %r1310, 32;
setp.ge.s32 %p208, %r1312, %r1;
or.pred %p210, %p208, %p207;
@%p210 bra $L__BB0_98;
add.s64 %rd224, %rd5, 32;
mul.lo.s64 %rd225, %rd224, %rd8;
add.s64 %rd226, %rd37, %rd225;
cvta.to.global.u64 %rd227, %rd9;
add.s64 %rd228, %rd227, %rd226;
st.global.v4.u32 [%rd228], {%r1282, %r1283, %r1284, %r1285};
$L__BB0_98:
mov.b32 %f1050, %r1646; mov.b32 %f1051, %r1647; // begin inline asm
cvt.rn.f16x2.f32 %r1313, %f1050, %f1051; // end inline asm
mov.b32 %f1052, %r1644; mov.b32 %f1053, %r1645; // begin inline asm
cvt.rn.f16x2.f32 %r1314, %f1052, %f1053; // end inline asm // begin inline asm
st.shared.b32 [%r1255], %r1313; // end inline asm // begin inline asm
st.shared.b32 [%r432], %r1314; // end inline asm
mov.b32 %f1054, %r1642; mov.b32 %f1055, %r1643; // begin inline asm
cvt.rn.f16x2.f32 %r1319, %f1054, %f1055; // end inline asm
mov.b32 %f1056, %r1640; mov.b32 %f1057, %r1641; // begin inline asm
cvt.rn.f16x2.f32 %r1320, %f1056, %f1057; // end inline asm // begin inline asm
st.shared.b32 [%r1261], %r1319; // end inline asm // begin inline asm
st.shared.b32 [%r1263], %r1320; // end inline asm
mov.b32 %f1058, %r1638; mov.b32 %f1059, %r1639; // begin inline asm
cvt.rn.f16x2.f32 %r1325, %f1058, %f1059; // end inline asm
mov.b32 %f1060, %r1636; mov.b32 %f1061, %r1637; // begin inline asm
cvt.rn.f16x2.f32 %r1326, %f1060, %f1061; // end inline asm // begin inline asm
st.shared.b32 [%r1267], %r1325; // end inline asm // begin inline asm
st.shared.b32 [%r1269], %r1326; // end inline asm
mov.b32 %f1062, %r1634; mov.b32 %f1063, %r1635; // begin inline asm
cvt.rn.f16x2.f32 %r1331, %f1062, %f1063; // end inline asm
mov.b32 %f1064, %r1632; mov.b32 %f1065, %r1633; // begin inline asm
cvt.rn.f16x2.f32 %r1332, %f1064, %f1065; // end inline asm // begin inline asm
st.shared.b32 [%r1273], %r1331; // end inline asm // begin inline asm
st.shared.b32 [%r1275], %r1332; // end inline asm
bar.sync 0; // begin inline asm
ld.shared.v4.b32 {%r1337, %r1338, %r1339, %r1340}, [%r10]; // end inline asm // begin inline asm
ld.shared.v4.b32 {%r1342, %r1343, %r1344, %r1345}, [%r1286]; // end inline asm
bar.sync 0;
add.s32 %r1364, %r1310, 64;
setp.ge.s32 %p211, %r1364, %r1;
@%p211 bra $L__BB0_103;
setp.ge.s32 %p212, %r7, %r9;
@%p212 bra $L__BB0_101;
add.s64 %rd229, %rd5, 64;
mul.lo.s64 %rd230, %rd229, %rd8;
add.s64 %rd231, %rd37, %rd230;
cvta.to.global.u64 %rd232, %rd9;
add.s64 %rd233, %rd232, %rd231;
st.global.v4.u32 [%rd233], {%r1337, %r1338, %r1339, %r1340};
$L__BB0_101:
add.s32 %r1366, %r1310, 96;
setp.ge.s32 %p213, %r1366, %r1;
or.pred %p215, %p213, %p212;
@%p215 bra $L__BB0_103;
add.s64 %rd234, %rd5, 96;
mul.lo.s64 %rd235, %rd234, %rd8;
add.s64 %rd236, %rd37, %rd235;
cvta.to.global.u64 %rd237, %rd9;
add.s64 %rd238, %rd237, %rd236;
st.global.v4.u32 [%rd238], {%r1342, %r1343, %r1344, %r1345};
$L__BB0_103:
mov.b32 %f1066, %r1630; mov.b32 %f1067, %r1631; // begin inline asm
cvt.rn.f16x2.f32 %r1367, %f1066, %f1067; // end inline asm
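//
// Each 32-row slice of the O-tile is flushed with one 128-bit store per thread
// (st.global.v4.u32), guarded twice: the row index must lie inside the sequence
// length (%r1) and the thread's 16-byte column chunk inside the valid head width
// (%r7 < %r9). Rows 0/32 and 64/96 were written above; rows 128/160 follow the
// same pattern below. A hedged CUDA sketch of one guarded store (all names
// hypothetical):
//
//   if (row < seqlen && chunk < valid_chunks)
//       *reinterpret_cast<uint4*>(out + row * row_stride + off) = v;  // 16 bytes
//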
mov.b32 %f1068, %r1628; mov.b32 %f1069, %r1629; // begin inline asm
cvt.rn.f16x2.f32 %r1368, %f1068, %f1069; // end inline asm // begin inline asm
st.shared.b32 [%r1255], %r1367; // end inline asm // begin inline asm
st.shared.b32 [%r432], %r1368; // end inline asm
mov.b32 %f1070, %r1626; mov.b32 %f1071, %r1627; // begin inline asm
cvt.rn.f16x2.f32 %r1373, %f1070, %f1071; // end inline asm
mov.b32 %f1072, %r1624; mov.b32 %f1073, %r1625; // begin inline asm
cvt.rn.f16x2.f32 %r1374, %f1072, %f1073; // end inline asm // begin inline asm
st.shared.b32 [%r1261], %r1373; // end inline asm // begin inline asm
st.shared.b32 [%r1263], %r1374; // end inline asm
mov.b32 %f1074, %r1622; mov.b32 %f1075, %r1623; // begin inline asm
cvt.rn.f16x2.f32 %r1379, %f1074, %f1075; // end inline asm
mov.b32 %f1076, %r1620; mov.b32 %f1077, %r1621; // begin inline asm
cvt.rn.f16x2.f32 %r1380, %f1076, %f1077; // end inline asm // begin inline asm
st.shared.b32 [%r1267], %r1379; // end inline asm // begin inline asm
st.shared.b32 [%r1269], %r1380; // end inline asm
mov.b32 %f1078, %r1618; mov.b32 %f1079, %r1619; // begin inline asm
cvt.rn.f16x2.f32 %r1385, %f1078, %f1079; // end inline asm
mov.b32 %f1080, %r1616; mov.b32 %f1081, %r1617; // begin inline asm
cvt.rn.f16x2.f32 %r1386, %f1080, %f1081; // end inline asm // begin inline asm
st.shared.b32 [%r1273], %r1385; // end inline asm // begin inline asm
st.shared.b32 [%r1275], %r1386; // end inline asm
bar.sync 0; // begin inline asm
ld.shared.v4.b32 {%r1391, %r1392, %r1393, %r1394}, [%r10]; // end inline asm // begin inline asm
ld.shared.v4.b32 {%r1396, %r1397, %r1398, %r1399}, [%r1286]; // end inline asm
bar.sync 0;
add.s32 %r1418, %r1310, 128;
setp.ge.s32 %p216, %r1418, %r1;
@%p216 bra $L__BB0_108;
setp.ge.s32 %p217, %r7, %r9;
@%p217 bra $L__BB0_106;
add.s64 %rd239, %rd5, 128;
mul.lo.s64 %rd240, %rd239, %rd8;
add.s64 %rd241, %rd37, %rd240;
cvta.to.global.u64 %rd242, %rd9;
add.s64 %rd243, %rd242, %rd241;
st.global.v4.u32 [%rd243], {%r1391, %r1392, %r1393, %r1394};
$L__BB0_106:
add.s32 %r1420, %r1310, 160;
setp.ge.s32 %p218, %r1420, %r1;
or.pred %p220, %p218, %p217;
@%p220 bra $L__BB0_108;
add.s64 %rd244, %rd5, 160;
mul.lo.s64 %rd245, %rd244, %rd8;
add.s64 %rd246, %rd37, %rd245;
cvta.to.global.u64 %rd247, %rd9;
add.s64 %rd248, %rd247, %rd246;
st.global.v4.u32 [%rd248], {%r1396, %r1397, %r1398, %r1399};
$L__BB0_108:
mov.b32 %f1082, %r1614; mov.b32 %f1083, %r1615; // begin inline asm
cvt.rn.f16x2.f32 %r1421, %f1082, %f1083; // end inline asm
mov.b32 %f1084, %r1612; mov.b32 %f1085, %r1613; // begin inline asm
cvt.rn.f16x2.f32 %r1422, %f1084, %f1085; // end inline asm // begin inline asm
st.shared.b32 [%r1255], %r1421; // end inline asm // begin inline asm
st.shared.b32 [%r432], %r1422; // end inline asm
mov.b32 %f1086, %r1610; mov.b32 %f1087, %r1611; // begin inline asm
cvt.rn.f16x2.f32 %r1427, %f1086, %f1087; // end inline asm
mov.b32 %f1088, %r1608; mov.b32 %f1089, %r1609; // begin inline asm
cvt.rn.f16x2.f32 %r1428, %f1088, %f1089; // end inline asm // begin inline asm
st.shared.b32 [%r1261], %r1427; // end inline asm // begin inline asm
st.shared.b32 [%r1263], %r1428; // end inline asm
mov.b32 %f1090, %r1606; mov.b32 %f1091, %r1607; // begin inline asm
cvt.rn.f16x2.f32 %r1433, %f1090, %f1091; // end inline asm
mov.b32 %f1092, %r1604; mov.b32 %f1093, %r1605; // begin inline asm
cvt.rn.f16x2.f32 %r1434, %f1092, %f1093; // end inline asm // begin inline asm
st.shared.b32 [%r1267], %r1433; // end inline asm // begin inline asm
st.shared.b32 [%r1269], %r1434; // end inline asm
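//
// Rows 128 and 160 have now been flushed and the final staging round is
// underway; the remaining two f16x2 pairs land in the [%r1273]/[%r1275] slots
// below, after which a last ld.shared/st.global pass writes rows 192 and 224.
// Throughout, the destination address follows the same arithmetic (hypothetical
// naming):
//   dst = out_base + (row0 + offset) * row_stride + tile_offset,
// with offset stepping through {0, 32, 64, ..., 224} across the eight slices.
//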
mov.b32 %f1094, %r1602; mov.b32 %f1095, %r1603; // begin inline asm
cvt.rn.f16x2.f32 %r1439, %f1094, %f1095; // end inline asm
mov.b32 %f1096, %r1600; mov.b32 %f1097, %r1601; // begin inline asm
cvt.rn.f16x2.f32 %r1440, %f1096, %f1097; // end inline asm // begin inline asm
st.shared.b32 [%r1273], %r1439; // end inline asm // begin inline asm
st.shared.b32 [%r1275], %r1440; // end inline asm
bar.sync 0; // begin inline asm
ld.shared.v4.b32 {%r1445, %r1446, %r1447, %r1448}, [%r10]; // end inline asm // begin inline asm
ld.shared.v4.b32 {%r1450, %r1451, %r1452, %r1453}, [%r1286]; // end inline asm
add.s32 %r1472, %r1310, 192;
setp.ge.s32 %p221, %r1472, %r1;
@%p221 bra $L__BB0_113;
setp.ge.s32 %p222, %r7, %r9;
@%p222 bra $L__BB0_111;
add.s64 %rd249, %rd5, 192;
mul.lo.s64 %rd250, %rd249, %rd8;
add.s64 %rd251, %rd37, %rd250;
cvta.to.global.u64 %rd252, %rd9;
add.s64 %rd253, %rd252, %rd251;
st.global.v4.u32 [%rd253], {%r1445, %r1446, %r1447, %r1448};
$L__BB0_111:
add.s32 %r1474, %r1310, 224;
setp.ge.s32 %p223, %r1474, %r1;
or.pred %p225, %p223, %p222;
@%p225 bra $L__BB0_113;
add.s64 %rd254, %rd5, 224;
mul.lo.s64 %rd255, %rd254, %rd8;
add.s64 %rd256, %rd37, %rd255;
cvta.to.global.u64 %rd257, %rd9;
add.s64 %rd258, %rd257, %rd256;
st.global.v4.u32 [%rd258], {%r1450, %r1451, %r1452, %r1453};
$L__BB0_113:
ret;
}
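//
// End of kernel: $L__BB0_113 is the shared exit for the early-out at the top of
// the kernel and for all out-of-range row guards. The whole epilogue reduces to
// the following hedged CUDA-level sketch (all names assumed; a reconstruction of
// the control flow, not the kernel's actual source):
//
//   for (int r = 0; r < 4; ++r) {                // four staging rounds
//       stage_f16x2_to_smem(acc, r);             // cvt.rn.f16x2.f32 + st.shared.b32
//       __syncthreads();                         // bar.sync 0
//       uint4 v0 = load_chunk(0);                // ld.shared.v4.b32 [%r10]
//       uint4 v1 = load_chunk(2048);             // ld.shared.v4.b32 [%r10 + 2048]
//       __syncthreads();
//       int row0 = row + 64 * r;                 // slices 0/32, 64/96, 128/160, 192/224
//       if (row0 < seqlen && chunk_ok)      store16B(out, row0,      v0);
//       if (row0 + 32 < seqlen && chunk_ok) store16B(out, row0 + 32, v1);
//   }
//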