ion5smem_E[]; .visible .entry fmha_v2_flash_attention_fp16_128_16_S_64_sm86_kernel_nl( .param .align 8 .b8 fmha_v2_flash_attention_fp16_128_16_S_64_sm86_kernel_nl_param_0[168] ) { .reg .pred %p<138>; .reg .b16 %rs<147>; .reg .f32 %f<866>; .reg .b32 %r<1656>; .reg .b64 %rd<185>; mov.b64 %rd34, fmha_v2_flash_attention_fp16_128_16_S_64_sm86_kernel_nl_param_0; mov.u64 %rd1, %rd34; ld.param.u32 %r1, [fmha_v2_flash_attention_fp16_128_16_S_64_sm86_kernel_nl_param_0+56]; cvt.s64.s32 %rd2, %r1; ld.param.u32 %r2, [fmha_v2_flash_attention_fp16_128_16_S_64_sm86_kernel_nl_param_0+52]; mov.u32 %r549, %ctaid.z; shl.b32 %r3, %r549, 7; setp.le.s32 %p1, %r1, %r3; @%p1 bra $L__BB0_75; mov.u32 %r714, %tid.x; mov.u32 %r715, %ctaid.y; mov.u32 %r716, %ctaid.x; mul.lo.s32 %r717, %r1, %r715; mad.lo.s32 %r718, %r717, %r2, %r716; shr.s32 %r719, %r714, 31; shr.u32 %r720, %r719, 27; add.s32 %r721, %r714, %r720; and.b32 %r722, %r721, -32; sub.s32 %r723, %r714, %r722; shr.u32 %r724, %r719, 25; add.s32 %r725, %r714, %r724; shr.s32 %r726, %r725, 7; shl.b32 %r727, %r726, 4; shr.s32 %r728, %r723, 31; shr.u32 %r729, %r728, 30; add.s32 %r730, %r723, %r729; and.b32 %r731, %r730, 2147483644; sub.s32 %r732, %r723, %r731; shl.b32 %r733, %r732, 1; add.s32 %r1555, %r733, %r727; ld.param.u64 %rd3, [%rd1+16]; ld.param.u64 %rd4, [%rd1+40]; shr.s32 %r734, %r721, 5; shr.s32 %r735, %r721, 31; shr.u32 %r736, %r735, 30; add.s32 %r737, %r734, %r736; and.b32 %r738, %r737, 268435452; sub.s32 %r739, %r734, %r738; shl.b32 %r740, %r739, 4; shr.s32 %r741, %r730, 2; add.s32 %r5, %r740, %r741; shr.u32 %r742, %r719, 29; add.s32 %r743, %r714, %r742; shr.s32 %r6, %r743, 3; add.s32 %r744, %r6, %r3; cvt.s64.s32 %rd5, %r744; ld.param.u64 %rd6, [%rd1+24]; mul.lo.s64 %rd45, %rd6, %rd5; mul.lo.s32 %r745, %r718, 3; mul.wide.s32 %rd46, %r745, 128; and.b32 %r746, %r743, -8; sub.s32 %r7, %r714, %r746; shl.b32 %r747, %r7, 4; cvt.s64.s32 %rd47, %r747; add.s64 %rd48, %rd46, %rd47; add.s64 %rd49, %rd48, %rd45; ld.param.u64 %rd50, [%rd1]; add.s64 %rd35, %rd50, %rd49; shr.s32 %r748, %r743, 31; shr.u32 %r749, %r748, 29; add.s32 %r750, %r6, %r749; and.b32 %r751, %r750, 268435448; sub.s32 %r752, %r6, %r751; xor.b32 %r753, %r752, %r7; shl.b32 %r754, %r6, 7; shl.b32 %r755, %r753, 4; mov.u32 %r756, 31; mov.u32 %r1554, 0; mov.u32 %r758, -1; shfl.sync.idx.b32 %r759|%p2, %r1554, %r1554, %r756, %r758; shfl.sync.idx.b32 %r760|%p3, %r1554, %r1554, %r756, %r758; and.b32 %r761, %r714, 96; shr.u32 %r762, %r761, 1; and.b32 %r763, %r714, 15; or.b32 %r764, %r762, %r763; and.b32 %r765, %r714, 7; shl.b32 %r766, %r714, 4; and.b32 %r767, %r766, 112; and.b32 %r768, %r714, 16; xor.b32 %r769, %r767, %r768; cvt.s64.s32 %rd51, %r6; mul.lo.s64 %rd52, %rd6, %rd51; add.s32 %r770, %r745, 1; mul.wide.s32 %rd53, %r770, 128; add.s64 %rd54, %rd53, %rd47; add.s64 %rd55, %rd54, %rd52; shfl.sync.idx.b32 %r771|%p4, %r1554, %r1554, %r756, %r758; shfl.sync.idx.b32 %r772|%p5, %r1554, %r1554, %r756, %r758; shr.u32 %r773, %r768, 1; or.b32 %r774, %r773, %r765; and.b32 %r775, %r714, 8; shr.u32 %r776, %r775, 3; xor.b32 %r777, %r776, %r765; add.s32 %r778, %r745, 2; mul.wide.s32 %rd56, %r778, 128; add.s64 %rd57, %rd56, %rd47; add.s64 %rd58, %rd57, %rd52; shfl.sync.idx.b32 %r779|%p6, %r1554, %r1554, %r756, %r758; shfl.sync.idx.b32 %r780|%p7, %r1554, %r1554, %r756, %r758; ld.param.u64 %rd8, [%rd1+32]; ld.param.u64 %rd9, [%rd1+8]; ld.param.u32 %r8, [%rd1+60]; sub.s32 %r781, %r1, %r3; min.s32 %r782, %r781, 128; shr.s32 %r783, %r8, 31; shr.u32 %r784, %r783, 29; add.s32 %r785, %r8, %r784; shr.s32 %r9, %r785, 3; shl.b32 
%r786, %r714, 7; and.b32 %r787, %r786, 1920; shl.b32 %r788, %r777, 4; shl.b32 %r789, %r774, 7; shl.b32 %r790, %r764, 7; setp.lt.s32 %p8, %r6, %r782; add.s32 %r791, %r6, 16; setp.lt.s32 %p9, %r791, %r782; add.s32 %r792, %r6, 32; setp.lt.s32 %p10, %r792, %r782; add.s32 %r793, %r6, 48; setp.lt.s32 %p11, %r793, %r782; add.s32 %r794, %r6, 64; setp.lt.s32 %p12, %r794, %r782; add.s32 %r795, %r6, 80; setp.lt.s32 %p13, %r795, %r782; add.s32 %r796, %r6, 96; setp.lt.s32 %p14, %r796, %r782; add.s32 %r797, %r6, 112; setp.lt.s32 %p15, %r797, %r782; add.s32 %r798, %r755, %r754; or.b32 %r799, %r790, %r769; add.s64 %rd183, %rd50, %rd55; or.b32 %r800, %r789, %r788; add.s64 %rd184, %rd50, %rd58; or.b32 %r801, %r769, %r787; mov.u32 %r802, _ZN25fused_multihead_attention5smem_E; add.s32 %r803, %r802, 16384; add.s32 %r10, %r798, %r803; shl.b64 %rd59, %rd6, 4; selp.b32 %r561, 16, 0, %p13; add.s32 %r804, %r798, %r802; add.s32 %r550, %r804, %r760; add.s32 %r552, %r550, 2048; add.s32 %r554, %r550, 4096; add.s32 %r556, %r550, 6144; add.s32 %r558, %r550, 8192; add.s32 %r560, %r550, 10240; add.s32 %r562, %r550, 12288; add.s32 %r564, %r550, 14336; selp.b32 %r551, 16, 0, %p8; // begin inline asm cp.async.cg.shared.global [%r550], [%rd35], 16, %r551; // end inline asm selp.b32 %r553, 16, 0, %p9; add.s64 %rd36, %rd35, %rd59; // begin inline asm cp.async.cg.shared.global [%r552], [%rd36], 16, %r553; // end inline asm selp.b32 %r555, 16, 0, %p10; add.s64 %rd37, %rd36, %rd59; // begin inline asm cp.async.cg.shared.global [%r554], [%rd37], 16, %r555; // end inline asm selp.b32 %r557, 16, 0, %p11; add.s64 %rd38, %rd37, %rd59; // begin inline asm cp.async.cg.shared.global [%r556], [%rd38], 16, %r557; // end inline asm selp.b32 %r559, 16, 0, %p12; add.s64 %rd39, %rd38, %rd59; // begin inline asm cp.async.cg.shared.global [%r558], [%rd39], 16, %r559; // end inline asm add.s64 %rd40, %rd39, %rd59; // begin inline asm cp.async.cg.shared.global [%r560], [%rd40], 16, %r561; // end inline asm selp.b32 %r563, 16, 0, %p14; add.s64 %rd41, %rd40, %rd59; // begin inline asm cp.async.cg.shared.global [%r562], [%rd41], 16, %r563; // end inline asm selp.b32 %r565, 16, 0, %p15; add.s64 %rd42, %rd41, %rd59; // begin inline asm cp.async.cg.shared.global [%r564], [%rd42], 16, %r565; // end inline asm min.s32 %r805, %r1, 16; setp.lt.s32 %p16, %r6, %r805; add.s32 %r566, %r10, %r772; selp.b32 %r569, 16, 0, %p16; // begin inline asm cp.async.cg.shared.global [%r566], [%rd183], 16, %r569; // end inline asm add.s32 %r806, %r802, 18432; add.s32 %r807, %r798, %r806; add.s32 %r568, %r807, %r780; // begin inline asm cp.async.cg.shared.global [%r568], [%rd184], 16, %r569; // end inline asm // begin inline asm cp.async.commit_group; // end inline asm // begin inline asm cp.async.wait_group 0; // end inline asm bar.sync 0; add.s32 %r808, %r759, %r802; add.s32 %r574, %r808, %r799; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r570, %r571, %r572, %r573}, [%r574]; // end inline asm add.s32 %r579, %r574, 8192; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r575, %r576, %r577, %r578}, [%r579]; // end inline asm xor.b32 %r809, %r799, 32; add.s32 %r584, %r808, %r809; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r580, %r581, %r582, %r583}, [%r584]; // end inline asm add.s32 %r589, %r584, 8192; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r585, %r586, %r587, %r588}, [%r589]; // end inline asm xor.b32 %r810, %r799, 64; add.s32 %r594, %r808, %r810; // begin inline asm 
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r590, %r591, %r592, %r593}, [%r594]; // end inline asm add.s32 %r599, %r594, 8192; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r595, %r596, %r597, %r598}, [%r599]; // end inline asm xor.b32 %r811, %r799, 96; add.s32 %r604, %r808, %r811; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r600, %r601, %r602, %r603}, [%r604]; // end inline asm add.s32 %r609, %r604, 8192; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r605, %r606, %r607, %r608}, [%r609]; // end inline asm add.s32 %r45, %r771, %r803; add.s32 %r614, %r45, %r800; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1564, %r1563, %r1562, %r1561}, [%r614]; // end inline asm xor.b32 %r812, %r800, 32; add.s32 %r619, %r45, %r812; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1560, %r1565, %r1566, %r1567}, [%r619]; // end inline asm xor.b32 %r813, %r800, 64; add.s32 %r624, %r45, %r813; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1568, %r1569, %r1570, %r1571}, [%r624]; // end inline asm xor.b32 %r814, %r800, 96; add.s32 %r629, %r45, %r814; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1572, %r1573, %r1574, %r1575}, [%r629]; // end inline asm add.s32 %r634, %r801, %r806; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1591, %r1590, %r1589, %r1588}, [%r634]; // end inline asm xor.b32 %r815, %r801, 32; add.s32 %r639, %r815, %r806; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1587, %r1586, %r1585, %r1584}, [%r639]; // end inline asm xor.b32 %r816, %r801, 64; add.s32 %r644, %r816, %r806; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1583, %r1582, %r1581, %r1580}, [%r644]; // end inline asm xor.b32 %r817, %r801, 96; add.s32 %r649, %r817, %r806; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1579, %r1578, %r1577, %r1576}, [%r649]; // end inline asm ld.param.f32 %f1, [%rd1+64]; // begin inline asm mov.u32 %r1655, 0; // end inline asm // begin inline asm mov.u32 %r1654, 0; // end inline asm // begin inline asm mov.u32 %r1653, 0; // end inline asm // begin inline asm mov.u32 %r1652, 0; // end inline asm // begin inline asm mov.u32 %r1651, 0; // end inline asm // begin inline asm mov.u32 %r1650, 0; // end inline asm // begin inline asm mov.u32 %r1649, 0; // end inline asm // begin inline asm mov.u32 %r1648, 0; // end inline asm // begin inline asm mov.u32 %r1647, 0; // end inline asm // begin inline asm mov.u32 %r1646, 0; // end inline asm // begin inline asm mov.u32 %r1645, 0; // end inline asm // begin inline asm mov.u32 %r1644, 0; // end inline asm // begin inline asm mov.u32 %r1643, 0; // end inline asm // begin inline asm mov.u32 %r1642, 0; // end inline asm // begin inline asm mov.u32 %r1641, 0; // end inline asm // begin inline asm mov.u32 %r1640, 0; // end inline asm // begin inline asm mov.u32 %r1639, 0; // end inline asm // begin inline asm mov.u32 %r1638, 0; // end inline asm // begin inline asm mov.u32 %r1637, 0; // end inline asm // begin inline asm mov.u32 %r1636, 0; // end inline asm // begin inline asm mov.u32 %r1635, 0; // end inline asm // begin inline asm mov.u32 %r1634, 0; // end inline asm // begin inline asm mov.u32 %r1633, 0; // end inline asm // begin inline asm mov.u32 %r1632, 0; // end inline asm // begin inline asm mov.u32 %r1631, 0; // end inline asm // begin inline asm mov.u32 %r1630, 0; // end inline asm // begin inline asm mov.u32 %r1629, 0; // end inline asm // begin 
inline asm mov.u32 %r1628, 0; // end inline asm // begin inline asm mov.u32 %r1627, 0; // end inline asm // begin inline asm mov.u32 %r1626, 0; // end inline asm // begin inline asm mov.u32 %r1625, 0; // end inline asm // begin inline asm mov.u32 %r1624, 0; // end inline asm // begin inline asm mov.u32 %r1623, 0; // end inline asm // begin inline asm mov.u32 %r1622, 0; // end inline asm // begin inline asm mov.u32 %r1621, 0; // end inline asm // begin inline asm mov.u32 %r1620, 0; // end inline asm // begin inline asm mov.u32 %r1619, 0; // end inline asm // begin inline asm mov.u32 %r1618, 0; // end inline asm // begin inline asm mov.u32 %r1617, 0; // end inline asm // begin inline asm mov.u32 %r1616, 0; // end inline asm // begin inline asm mov.u32 %r1615, 0; // end inline asm // begin inline asm mov.u32 %r1614, 0; // end inline asm // begin inline asm mov.u32 %r1613, 0; // end inline asm // begin inline asm mov.u32 %r1612, 0; // end inline asm // begin inline asm mov.u32 %r1611, 0; // end inline asm // begin inline asm mov.u32 %r1610, 0; // end inline asm // begin inline asm mov.u32 %r1609, 0; // end inline asm // begin inline asm mov.u32 %r1608, 0; // end inline asm // begin inline asm mov.u32 %r1607, 0; // end inline asm // begin inline asm mov.u32 %r1606, 0; // end inline asm // begin inline asm mov.u32 %r1605, 0; // end inline asm // begin inline asm mov.u32 %r1604, 0; // end inline asm // begin inline asm mov.u32 %r1603, 0; // end inline asm // begin inline asm mov.u32 %r1602, 0; // end inline asm // begin inline asm mov.u32 %r1601, 0; // end inline asm // begin inline asm mov.u32 %r1600, 0; // end inline asm // begin inline asm mov.u32 %r1599, 0; // end inline asm // begin inline asm mov.u32 %r1598, 0; // end inline asm // begin inline asm mov.u32 %r1597, 0; // end inline asm // begin inline asm mov.u32 %r1596, 0; // end inline asm // begin inline asm mov.u32 %r1595, 0; // end inline asm // begin inline asm mov.u32 %r1594, 0; // end inline asm // begin inline asm mov.u32 %r1593, 0; // end inline asm // begin inline asm mov.u32 %r1592, 0; // end inline asm add.s32 %r818, %r1, 15; shr.s32 %r819, %r818, 31; shr.u32 %r820, %r819, 28; add.s32 %r821, %r818, %r820; and.b32 %r142, %r821, -16; setp.lt.s32 %p17, %r1, 1; @%p17 bra $L__BB0_53; ld.param.u8 %rs1, [%rd1+160]; cvt.s64.s32 %rd12, %r5; cvt.s64.s32 %rd13, %r1555; add.s32 %r823, %r1555, 1; cvt.s64.s32 %rd14, %r823; add.s32 %r824, %r1555, 8; cvt.s64.s32 %rd15, %r824; add.s32 %r825, %r1555, 9; cvt.s64.s32 %rd16, %r825; add.s32 %r207, %r5, 8; add.s32 %r208, %r5, 64; add.s32 %r209, %r5, 72; mov.f32 %f842, 0fFF800000; mov.f32 %f838, 0f00000000; mov.f32 %f839, %f838; mov.f32 %f840, %f838; mov.f32 %f841, %f838; mov.f32 %f843, %f842; mov.f32 %f844, %f842; mov.f32 %f845, %f842; mov.u32 %r1559, %r1; mov.u32 %r1558, %r1; $L__BB0_3: add.s32 %r826, %r1554, 16; setp.ge.s32 %p18, %r826, %r142; @%p18 bra $L__BB0_5; bar.sync 0; add.s64 %rd183, %rd183, %rd59; add.s32 %r1559, %r1559, -16; min.s32 %r831, %r1559, 16; setp.lt.s32 %p19, %r6, %r831; selp.b32 %r828, 16, 0, %p19; // begin inline asm cp.async.cg.shared.global [%r566], [%rd183], 16, %r828; // end inline asm add.s64 %rd184, %rd184, %rd59; add.s32 %r1558, %r1558, -16; min.s32 %r832, %r1558, 16; setp.lt.s32 %p20, %r6, %r832; selp.b32 %r830, 16, 0, %p20; // begin inline asm cp.async.cg.shared.global [%r568], [%rd184], 16, %r830; // end inline asm // begin inline asm cp.async.commit_group; // end inline asm $L__BB0_5: setp.eq.s16 %p21, %rs1, 0; @%p21 bra $L__BB0_38; cvt.s64.s32 %rd64, %r3; add.s64 
%rd65, %rd12, %rd64; setp.ge.u64 %p22, %rd65, %rd2; mul.lo.s32 %r835, %r1, %r3; cvt.s64.s32 %rd66, %r835; cvt.u64.u32 %rd23, %r1554; add.s64 %rd67, %rd66, %rd23; add.s64 %rd24, %rd67, %rd13; mul.lo.s64 %rd68, %rd12, %rd2; add.s64 %rd69, %rd24, %rd68; add.s64 %rd25, %rd13, %rd23; setp.ge.u64 %p23, %rd25, %rd2; shl.b64 %rd70, %rd69, 1; mad.lo.s32 %r838, %r2, %r715, %r716; cvt.s64.s32 %rd71, %r838; mul.lo.s64 %rd72, %rd4, %rd71; add.s64 %rd73, %rd72, %rd70; cvta.to.global.u64 %rd74, %rd3; add.s64 %rd26, %rd74, %rd73; mov.u16 %rs132, 0; or.pred %p24, %p23, %p22; mov.u16 %rs131, %rs132; @%p24 bra $L__BB0_8; ld.global.u16 %rs131, [%rd26]; $L__BB0_8: add.s64 %rd27, %rd14, %rd23; setp.ge.u64 %p26, %rd27, %rd2; or.pred %p27, %p26, %p22; @%p27 bra $L__BB0_10; ld.global.u16 %rs132, [%rd26+2]; $L__BB0_10: add.s64 %rd28, %rd15, %rd23; setp.ge.u64 %p29, %rd28, %rd2; mov.u16 %rs134, 0; or.pred %p30, %p29, %p22; mov.u16 %rs133, %rs134; @%p30 bra $L__BB0_12; ld.global.u16 %rs133, [%rd26+16]; $L__BB0_12: add.s64 %rd29, %rd16, %rd23; setp.ge.u64 %p32, %rd29, %rd2; or.pred %p33, %p32, %p22; @%p33 bra $L__BB0_14; ld.global.u16 %rs134, [%rd26+18]; $L__BB0_14: cvt.s64.s32 %rd82, %r207; add.s64 %rd83, %rd82, %rd64; setp.ge.u64 %p34, %rd83, %rd2; mul.lo.s64 %rd84, %rd82, %rd2; add.s64 %rd85, %rd24, %rd84; shl.b64 %rd86, %rd85, 1; add.s64 %rd89, %rd72, %rd86; add.s64 %rd30, %rd74, %rd89; mov.u16 %rs136, 0; or.pred %p36, %p23, %p34; mov.u16 %rs135, %rs136; @%p36 bra $L__BB0_16; ld.global.u16 %rs135, [%rd30]; $L__BB0_16: or.pred %p39, %p26, %p34; @%p39 bra $L__BB0_18; ld.global.u16 %rs136, [%rd30+2]; $L__BB0_18: mov.u16 %rs138, 0; or.pred %p42, %p29, %p34; mov.u16 %rs137, %rs138; @%p42 bra $L__BB0_20; ld.global.u16 %rs137, [%rd30+16]; $L__BB0_20: or.pred %p45, %p32, %p34; @%p45 bra $L__BB0_22; ld.global.u16 %rs138, [%rd30+18]; $L__BB0_22: cvt.s64.s32 %rd101, %r208; add.s64 %rd102, %rd101, %rd64; setp.ge.u64 %p46, %rd102, %rd2; mul.lo.s64 %rd103, %rd101, %rd2; add.s64 %rd104, %rd24, %rd103; shl.b64 %rd105, %rd104, 1; add.s64 %rd108, %rd72, %rd105; add.s64 %rd31, %rd74, %rd108; mov.u16 %rs140, 0; or.pred %p48, %p23, %p46; mov.u16 %rs139, %rs140; @%p48 bra $L__BB0_24; ld.global.u16 %rs139, [%rd31]; $L__BB0_24: or.pred %p51, %p26, %p46; @%p51 bra $L__BB0_26; ld.global.u16 %rs140, [%rd31+2]; $L__BB0_26: mov.u16 %rs142, 0; or.pred %p54, %p29, %p46; mov.u16 %rs141, %rs142; @%p54 bra $L__BB0_28; ld.global.u16 %rs141, [%rd31+16]; $L__BB0_28: or.pred %p57, %p32, %p46; @%p57 bra $L__BB0_30; ld.global.u16 %rs142, [%rd31+18]; $L__BB0_30: cvt.s64.s32 %rd120, %r209; add.s64 %rd121, %rd120, %rd64; setp.ge.u64 %p58, %rd121, %rd2; mul.lo.s64 %rd122, %rd120, %rd2; add.s64 %rd123, %rd24, %rd122; shl.b64 %rd124, %rd123, 1; add.s64 %rd127, %rd72, %rd124; add.s64 %rd32, %rd74, %rd127; mov.u16 %rs144, 0; or.pred %p60, %p23, %p58; mov.u16 %rs143, %rs144; @%p60 bra $L__BB0_32; ld.global.u16 %rs143, [%rd32]; $L__BB0_32: or.pred %p63, %p26, %p58; @%p63 bra $L__BB0_34; ld.global.u16 %rs144, [%rd32+2]; $L__BB0_34: mov.u16 %rs146, 0; or.pred %p66, %p29, %p58; mov.u16 %rs145, %rs146; @%p66 bra $L__BB0_36; ld.global.u16 %rs145, [%rd32+16]; $L__BB0_36: or.pred %p69, %p32, %p58; @%p69 bra $L__BB0_38; ld.global.u16 %rs146, [%rd32+18]; $L__BB0_38: // begin inline asm mov.u32 %r878, 0; // end inline asm // begin inline asm mov.u32 %r879, 0; // end inline asm // begin inline asm mov.u32 %r880, 0; // end inline asm // begin inline asm mov.u32 %r881, 0; // end inline asm // begin inline asm mov.u32 %r882, 0; // end inline asm // begin inline asm mov.u32 
%r883, 0; // end inline asm // begin inline asm mov.u32 %r884, 0; // end inline asm // begin inline asm mov.u32 %r885, 0; // end inline asm // begin inline asm mov.u32 %r886, 0; // end inline asm // begin inline asm mov.u32 %r887, 0; // end inline asm // begin inline asm mov.u32 %r888, 0; // end inline asm // begin inline asm mov.u32 %r889, 0; // end inline asm // begin inline asm mov.u32 %r890, 0; // end inline asm // begin inline asm mov.u32 %r891, 0; // end inline asm // begin inline asm mov.u32 %r892, 0; // end inline asm // begin inline asm mov.u32 %r893, 0; // end inline asm mov.b32 %f198, %r878; mov.b32 %f199, %r879; mov.b32 %f200, %r880; mov.b32 %f201, %r881; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f198, %f199, %f200, %f201}, {%r570, %r571, %r572, %r573}, {%r1564, %r1563}, {%f198, %f199, %f200, %f201}; // end inline asm mov.b32 %f206, %r882; mov.b32 %f207, %r883; mov.b32 %f208, %r884; mov.b32 %f209, %r885; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f206, %f207, %f208, %f209}, {%r570, %r571, %r572, %r573}, {%r1562, %r1561}, {%f206, %f207, %f208, %f209}; // end inline asm mov.b32 %f214, %r886; mov.b32 %f215, %r887; mov.b32 %f216, %r888; mov.b32 %f217, %r889; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f214, %f215, %f216, %f217}, {%r575, %r576, %r577, %r578}, {%r1564, %r1563}, {%f214, %f215, %f216, %f217}; // end inline asm mov.b32 %f222, %r890; mov.b32 %f223, %r891; mov.b32 %f224, %r892; mov.b32 %f225, %r893; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f222, %f223, %f224, %f225}, {%r575, %r576, %r577, %r578}, {%r1562, %r1561}, {%f222, %f223, %f224, %f225}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f198, %f199, %f200, %f201}, {%r580, %r581, %r582, %r583}, {%r1560, %r1565}, {%f198, %f199, %f200, %f201}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f206, %f207, %f208, %f209}, {%r580, %r581, %r582, %r583}, {%r1566, %r1567}, {%f206, %f207, %f208, %f209}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f214, %f215, %f216, %f217}, {%r585, %r586, %r587, %r588}, {%r1560, %r1565}, {%f214, %f215, %f216, %f217}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f222, %f223, %f224, %f225}, {%r585, %r586, %r587, %r588}, {%r1566, %r1567}, {%f222, %f223, %f224, %f225}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f198, %f199, %f200, %f201}, {%r590, %r591, %r592, %r593}, {%r1568, %r1569}, {%f198, %f199, %f200, %f201}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f206, %f207, %f208, %f209}, {%r590, %r591, %r592, %r593}, {%r1570, %r1571}, {%f206, %f207, %f208, %f209}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f214, %f215, %f216, %f217}, {%r595, %r596, %r597, %r598}, {%r1568, %r1569}, {%f214, %f215, %f216, %f217}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f222, %f223, %f224, %f225}, {%r595, %r596, %r597, %r598}, {%r1570, %r1571}, {%f222, %f223, %f224, %f225}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f198, %f199, %f200, %f201}, {%r600, %r601, %r602, %r603}, {%r1572, %r1573}, {%f198, %f199, %f200, %f201}; // end inline asm // begin inline asm 
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f206, %f207, %f208, %f209}, {%r600, %r601, %r602, %r603}, {%r1574, %r1575}, {%f206, %f207, %f208, %f209}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f214, %f215, %f216, %f217}, {%r605, %r606, %r607, %r608}, {%r1572, %r1573}, {%f214, %f215, %f216, %f217}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f222, %f223, %f224, %f225}, {%r605, %r606, %r607, %r608}, {%r1574, %r1575}, {%f222, %f223, %f224, %f225}; // end inline asm mul.ftz.f32 %f294, %f1, %f198; mul.ftz.f32 %f295, %f1, %f199; mul.ftz.f32 %f296, %f1, %f206; mul.ftz.f32 %f297, %f1, %f207; mul.ftz.f32 %f298, %f1, %f200; mul.ftz.f32 %f299, %f1, %f201; mul.ftz.f32 %f300, %f1, %f208; mul.ftz.f32 %f301, %f1, %f209; mul.ftz.f32 %f302, %f1, %f214; mul.ftz.f32 %f303, %f1, %f215; mul.ftz.f32 %f304, %f1, %f222; mul.ftz.f32 %f305, %f1, %f223; mul.ftz.f32 %f306, %f1, %f216; mul.ftz.f32 %f307, %f1, %f217; mul.ftz.f32 %f308, %f1, %f224; mul.ftz.f32 %f309, %f1, %f225; setp.lt.s32 %p70, %r1555, %r1; selp.f32 %f861, %f294, 0fFF800000, %p70; add.s32 %r990, %r1555, 1; setp.lt.s32 %p71, %r990, %r1; selp.f32 %f860, %f295, 0fFF800000, %p71; add.s32 %r991, %r1555, 8; setp.lt.s32 %p72, %r991, %r1; selp.f32 %f859, %f296, 0fFF800000, %p72; add.s32 %r992, %r1555, 9; setp.lt.s32 %p73, %r992, %r1; selp.f32 %f858, %f297, 0fFF800000, %p73; selp.f32 %f857, %f298, 0fFF800000, %p70; selp.f32 %f856, %f299, 0fFF800000, %p71; selp.f32 %f855, %f300, 0fFF800000, %p72; selp.f32 %f854, %f301, 0fFF800000, %p73; selp.f32 %f853, %f302, 0fFF800000, %p70; selp.f32 %f852, %f303, 0fFF800000, %p71; selp.f32 %f851, %f304, 0fFF800000, %p72; selp.f32 %f850, %f305, 0fFF800000, %p73; selp.f32 %f849, %f306, 0fFF800000, %p70; selp.f32 %f848, %f307, 0fFF800000, %p71; selp.f32 %f847, %f308, 0fFF800000, %p72; selp.f32 %f846, %f309, 0fFF800000, %p73; @%p21 bra $L__BB0_40; // begin inline asm cvt.f32.f16 %f310, %rs131; // end inline asm add.ftz.f32 %f861, %f310, %f861; // begin inline asm cvt.f32.f16 %f311, %rs132; // end inline asm add.ftz.f32 %f860, %f311, %f860; // begin inline asm cvt.f32.f16 %f312, %rs133; // end inline asm add.ftz.f32 %f859, %f312, %f859; // begin inline asm cvt.f32.f16 %f313, %rs134; // end inline asm add.ftz.f32 %f858, %f313, %f858; // begin inline asm cvt.f32.f16 %f314, %rs135; // end inline asm add.ftz.f32 %f857, %f314, %f857; // begin inline asm cvt.f32.f16 %f315, %rs136; // end inline asm add.ftz.f32 %f856, %f315, %f856; // begin inline asm cvt.f32.f16 %f316, %rs137; // end inline asm add.ftz.f32 %f855, %f316, %f855; // begin inline asm cvt.f32.f16 %f317, %rs138; // end inline asm add.ftz.f32 %f854, %f317, %f854; // begin inline asm cvt.f32.f16 %f318, %rs139; // end inline asm add.ftz.f32 %f853, %f318, %f853; // begin inline asm cvt.f32.f16 %f319, %rs140; // end inline asm add.ftz.f32 %f852, %f319, %f852; // begin inline asm cvt.f32.f16 %f320, %rs141; // end inline asm add.ftz.f32 %f851, %f320, %f851; // begin inline asm cvt.f32.f16 %f321, %rs142; // end inline asm add.ftz.f32 %f850, %f321, %f850; // begin inline asm cvt.f32.f16 %f322, %rs143; // end inline asm add.ftz.f32 %f849, %f322, %f849; // begin inline asm cvt.f32.f16 %f323, %rs144; // end inline asm add.ftz.f32 %f848, %f323, %f848; // begin inline asm cvt.f32.f16 %f324, %rs145; // end inline asm add.ftz.f32 %f847, %f324, %f847; // begin inline asm cvt.f32.f16 %f325, %rs146; // end inline asm add.ftz.f32 %f846, %f325, %f846; $L__BB0_40: add.s32 %r1457, %r1554, 16; 
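// ---------------------------------------------------------------------------
// Annotation (not part of the compiler output): softmax stage of the
// flash-attention loop. The setp.gt.ftz/selp chains below reduce each
// thread's sixteen masked S = Q*K^T fragments to per-row-group maxima, and
// the shfl.sync.bfly pairs (offsets 1 and 2) finish the max across the four
// lanes that share a row. Exponentials go through ex2.approx.ftz on
// x*log2(e), with 0f3FB8AA3B = 1.442695... = log2(e). The running maxima
// (%f845..%f842, seeded with 0fFF800000 = -inf) and running row sums
// (%f841..%f838) are then rescaled by exp(m_old - m_new), i.e. the online
// softmax update. %p137 flags the final 16-column step, on which the
// cp.async wait and the ldmatrix prefetch of the next K tile are skipped.
// ---------------------------------------------------------------------------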
setp.ge.s32 %p137, %r1457, %r142; setp.gt.ftz.f32 %p76, %f861, %f860; selp.f32 %f326, %f861, %f860, %p76; setp.gt.ftz.f32 %p77, %f326, %f859; selp.f32 %f327, %f326, %f859, %p77; setp.gt.ftz.f32 %p78, %f327, %f858; selp.f32 %f328, %f327, %f858, %p78; setp.gt.ftz.f32 %p79, %f857, %f856; selp.f32 %f329, %f857, %f856, %p79; setp.gt.ftz.f32 %p80, %f329, %f855; selp.f32 %f330, %f329, %f855, %p80; setp.gt.ftz.f32 %p81, %f330, %f854; selp.f32 %f331, %f330, %f854, %p81; setp.gt.ftz.f32 %p82, %f853, %f852; selp.f32 %f332, %f853, %f852, %p82; setp.gt.ftz.f32 %p83, %f332, %f851; selp.f32 %f333, %f332, %f851, %p83; setp.gt.ftz.f32 %p84, %f333, %f850; selp.f32 %f334, %f333, %f850, %p84; setp.gt.ftz.f32 %p85, %f849, %f848; selp.f32 %f335, %f849, %f848, %p85; setp.gt.ftz.f32 %p86, %f335, %f847; selp.f32 %f336, %f335, %f847, %p86; setp.gt.ftz.f32 %p87, %f336, %f846; selp.f32 %f337, %f336, %f846, %p87; mov.b32 %r994, %f328; mov.u32 %r995, 31; mov.u32 %r996, 1; mov.u32 %r997, -1; shfl.sync.bfly.b32 %r998|%p88, %r994, %r996, %r995, %r997; mov.b32 %f338, %r998; setp.gt.ftz.f32 %p89, %f328, %f338; selp.f32 %f339, %f328, %f338, %p89; mov.b32 %r999, %f339; mov.u32 %r1000, 2; shfl.sync.bfly.b32 %r1001|%p90, %r999, %r1000, %r995, %r997; mov.b32 %f340, %r1001; setp.gt.ftz.f32 %p91, %f339, %f340; selp.f32 %f341, %f339, %f340, %p91; mov.b32 %r1002, %f331; shfl.sync.bfly.b32 %r1003|%p92, %r1002, %r996, %r995, %r997; mov.b32 %f342, %r1003; setp.gt.ftz.f32 %p93, %f331, %f342; selp.f32 %f343, %f331, %f342, %p93; mov.b32 %r1004, %f343; shfl.sync.bfly.b32 %r1005|%p94, %r1004, %r1000, %r995, %r997; mov.b32 %f344, %r1005; setp.gt.ftz.f32 %p95, %f343, %f344; selp.f32 %f345, %f343, %f344, %p95; mov.b32 %r1006, %f334; shfl.sync.bfly.b32 %r1007|%p96, %r1006, %r996, %r995, %r997; mov.b32 %f346, %r1007; setp.gt.ftz.f32 %p97, %f334, %f346; selp.f32 %f347, %f334, %f346, %p97; mov.b32 %r1008, %f347; shfl.sync.bfly.b32 %r1009|%p98, %r1008, %r1000, %r995, %r997; mov.b32 %f348, %r1009; setp.gt.ftz.f32 %p99, %f347, %f348; selp.f32 %f349, %f347, %f348, %p99; mov.b32 %r1010, %f337; shfl.sync.bfly.b32 %r1011|%p100, %r1010, %r996, %r995, %r997; mov.b32 %f350, %r1011; setp.gt.ftz.f32 %p101, %f337, %f350; selp.f32 %f351, %f337, %f350, %p101; mov.b32 %r1012, %f351; shfl.sync.bfly.b32 %r1013|%p102, %r1012, %r1000, %r995, %r997; mov.b32 %f352, %r1013; setp.gt.ftz.f32 %p103, %f351, %f352; selp.f32 %f353, %f351, %f352, %p103; max.ftz.f32 %f58, %f341, %f845; max.ftz.f32 %f59, %f345, %f844; max.ftz.f32 %f60, %f349, %f843; max.ftz.f32 %f61, %f353, %f842; sub.ftz.f32 %f354, %f861, %f58; mul.ftz.f32 %f355, %f354, 0f3FB8AA3B; ex2.approx.ftz.f32 %f62, %f355; sub.ftz.f32 %f356, %f860, %f58; mul.ftz.f32 %f357, %f356, 0f3FB8AA3B; ex2.approx.ftz.f32 %f63, %f357; sub.ftz.f32 %f358, %f859, %f58; mul.ftz.f32 %f359, %f358, 0f3FB8AA3B; ex2.approx.ftz.f32 %f64, %f359; sub.ftz.f32 %f360, %f858, %f58; mul.ftz.f32 %f361, %f360, 0f3FB8AA3B; ex2.approx.ftz.f32 %f65, %f361; sub.ftz.f32 %f362, %f857, %f59; mul.ftz.f32 %f363, %f362, 0f3FB8AA3B; ex2.approx.ftz.f32 %f66, %f363; sub.ftz.f32 %f364, %f856, %f59; mul.ftz.f32 %f365, %f364, 0f3FB8AA3B; ex2.approx.ftz.f32 %f67, %f365; sub.ftz.f32 %f366, %f855, %f59; mul.ftz.f32 %f367, %f366, 0f3FB8AA3B; ex2.approx.ftz.f32 %f68, %f367; sub.ftz.f32 %f368, %f854, %f59; mul.ftz.f32 %f369, %f368, 0f3FB8AA3B; ex2.approx.ftz.f32 %f69, %f369; sub.ftz.f32 %f370, %f853, %f60; mul.ftz.f32 %f371, %f370, 0f3FB8AA3B; ex2.approx.ftz.f32 %f70, %f371; sub.ftz.f32 %f372, %f852, %f60; mul.ftz.f32 %f373, %f372, 0f3FB8AA3B; ex2.approx.ftz.f32 %f71, 
%f373; sub.ftz.f32 %f374, %f851, %f60; mul.ftz.f32 %f375, %f374, 0f3FB8AA3B; ex2.approx.ftz.f32 %f72, %f375; sub.ftz.f32 %f376, %f850, %f60; mul.ftz.f32 %f377, %f376, 0f3FB8AA3B; ex2.approx.ftz.f32 %f73, %f377; sub.ftz.f32 %f378, %f849, %f61; mul.ftz.f32 %f379, %f378, 0f3FB8AA3B; ex2.approx.ftz.f32 %f74, %f379; sub.ftz.f32 %f380, %f848, %f61; mul.ftz.f32 %f381, %f380, 0f3FB8AA3B; ex2.approx.ftz.f32 %f75, %f381; sub.ftz.f32 %f382, %f847, %f61; mul.ftz.f32 %f383, %f382, 0f3FB8AA3B; ex2.approx.ftz.f32 %f76, %f383; sub.ftz.f32 %f384, %f846, %f61; mul.ftz.f32 %f385, %f384, 0f3FB8AA3B; ex2.approx.ftz.f32 %f77, %f385; add.ftz.f32 %f386, %f62, %f63; add.ftz.f32 %f387, %f386, 0f00000000; add.ftz.f32 %f388, %f64, %f65; add.ftz.f32 %f389, %f388, 0f00000000; add.ftz.f32 %f390, %f387, %f389; add.ftz.f32 %f391, %f66, %f67; add.ftz.f32 %f392, %f391, 0f00000000; add.ftz.f32 %f393, %f68, %f69; add.ftz.f32 %f394, %f393, 0f00000000; add.ftz.f32 %f395, %f392, %f394; add.ftz.f32 %f396, %f70, %f71; add.ftz.f32 %f397, %f396, 0f00000000; add.ftz.f32 %f398, %f72, %f73; add.ftz.f32 %f399, %f398, 0f00000000; add.ftz.f32 %f400, %f397, %f399; add.ftz.f32 %f401, %f74, %f75; add.ftz.f32 %f402, %f401, 0f00000000; add.ftz.f32 %f403, %f76, %f77; add.ftz.f32 %f404, %f403, 0f00000000; add.ftz.f32 %f405, %f402, %f404; mov.b32 %r1014, %f390; shfl.sync.bfly.b32 %r1015|%p104, %r1014, %r996, %r995, %r997; mov.b32 %f406, %r1015; add.ftz.f32 %f407, %f390, %f406; mov.b32 %r1016, %f407; shfl.sync.bfly.b32 %r1017|%p105, %r1016, %r1000, %r995, %r997; mov.b32 %f408, %r1017; add.ftz.f32 %f409, %f407, %f408; mov.b32 %r1018, %f395; shfl.sync.bfly.b32 %r1019|%p106, %r1018, %r996, %r995, %r997; mov.b32 %f410, %r1019; add.ftz.f32 %f411, %f395, %f410; mov.b32 %r1020, %f411; shfl.sync.bfly.b32 %r1021|%p107, %r1020, %r1000, %r995, %r997; mov.b32 %f412, %r1021; add.ftz.f32 %f413, %f411, %f412; mov.b32 %r1022, %f400; shfl.sync.bfly.b32 %r1023|%p108, %r1022, %r996, %r995, %r997; mov.b32 %f414, %r1023; add.ftz.f32 %f415, %f400, %f414; mov.b32 %r1024, %f415; shfl.sync.bfly.b32 %r1025|%p109, %r1024, %r1000, %r995, %r997; mov.b32 %f416, %r1025; add.ftz.f32 %f417, %f415, %f416; mov.b32 %r1026, %f405; shfl.sync.bfly.b32 %r1027|%p110, %r1026, %r996, %r995, %r997; mov.b32 %f418, %r1027; add.ftz.f32 %f419, %f405, %f418; mov.b32 %r1028, %f419; shfl.sync.bfly.b32 %r1029|%p111, %r1028, %r1000, %r995, %r997; mov.b32 %f420, %r1029; add.ftz.f32 %f421, %f419, %f420; sub.ftz.f32 %f422, %f845, %f58; mul.ftz.f32 %f423, %f422, 0f3FB8AA3B; ex2.approx.ftz.f32 %f424, %f423; mul.ftz.f32 %f78, %f424, %f841; add.ftz.f32 %f841, %f78, %f409; sub.ftz.f32 %f425, %f844, %f59; mul.ftz.f32 %f426, %f425, 0f3FB8AA3B; ex2.approx.ftz.f32 %f427, %f426; mul.ftz.f32 %f80, %f427, %f840; add.ftz.f32 %f840, %f80, %f413; sub.ftz.f32 %f428, %f843, %f60; mul.ftz.f32 %f429, %f428, 0f3FB8AA3B; ex2.approx.ftz.f32 %f430, %f429; mul.ftz.f32 %f82, %f430, %f839; add.ftz.f32 %f839, %f82, %f417; sub.ftz.f32 %f431, %f842, %f61; mul.ftz.f32 %f432, %f431, 0f3FB8AA3B; ex2.approx.ftz.f32 %f433, %f432; mul.ftz.f32 %f84, %f433, %f838; add.ftz.f32 %f838, %f84, %f421; @%p137 bra $L__BB0_42; // begin inline asm cp.async.wait_group 0; // end inline asm bar.sync 0; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1564, %r1563, %r1562, %r1561}, [%r614]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1560, %r1565, %r1566, %r1567}, [%r619]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1568, %r1569, %r1570, %r1571}, [%r624]; // end 
inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1572, %r1573, %r1574, %r1575}, [%r629]; // end inline asm $L__BB0_42: // begin inline asm cvt.rn.f16x2.f32 %r1064, %f63, %f62; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1065, %f67, %f66; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1066, %f65, %f64; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1067, %f69, %f68; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1068, %f71, %f70; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1069, %f75, %f74; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1070, %f73, %f72; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1071, %f77, %f76; // end inline asm // begin inline asm mov.u32 %r1072, 0; // end inline asm // begin inline asm mov.u32 %r1073, 0; // end inline asm // begin inline asm mov.u32 %r1074, 0; // end inline asm // begin inline asm mov.u32 %r1075, 0; // end inline asm // begin inline asm mov.u32 %r1076, 0; // end inline asm // begin inline asm mov.u32 %r1077, 0; // end inline asm // begin inline asm mov.u32 %r1078, 0; // end inline asm // begin inline asm mov.u32 %r1079, 0; // end inline asm // begin inline asm mov.u32 %r1080, 0; // end inline asm // begin inline asm mov.u32 %r1081, 0; // end inline asm // begin inline asm mov.u32 %r1082, 0; // end inline asm // begin inline asm mov.u32 %r1083, 0; // end inline asm // begin inline asm mov.u32 %r1084, 0; // end inline asm // begin inline asm mov.u32 %r1085, 0; // end inline asm // begin inline asm mov.u32 %r1086, 0; // end inline asm // begin inline asm mov.u32 %r1087, 0; // end inline asm // begin inline asm mov.u32 %r1088, 0; // end inline asm // begin inline asm mov.u32 %r1089, 0; // end inline asm // begin inline asm mov.u32 %r1090, 0; // end inline asm // begin inline asm mov.u32 %r1091, 0; // end inline asm // begin inline asm mov.u32 %r1092, 0; // end inline asm // begin inline asm mov.u32 %r1093, 0; // end inline asm // begin inline asm mov.u32 %r1094, 0; // end inline asm // begin inline asm mov.u32 %r1095, 0; // end inline asm // begin inline asm mov.u32 %r1096, 0; // end inline asm // begin inline asm mov.u32 %r1097, 0; // end inline asm // begin inline asm mov.u32 %r1098, 0; // end inline asm // begin inline asm mov.u32 %r1099, 0; // end inline asm // begin inline asm mov.u32 %r1100, 0; // end inline asm // begin inline asm mov.u32 %r1101, 0; // end inline asm // begin inline asm mov.u32 %r1102, 0; // end inline asm // begin inline asm mov.u32 %r1103, 0; // end inline asm // begin inline asm mov.u32 %r1104, 0; // end inline asm // begin inline asm mov.u32 %r1105, 0; // end inline asm // begin inline asm mov.u32 %r1106, 0; // end inline asm // begin inline asm mov.u32 %r1107, 0; // end inline asm // begin inline asm mov.u32 %r1108, 0; // end inline asm // begin inline asm mov.u32 %r1109, 0; // end inline asm // begin inline asm mov.u32 %r1110, 0; // end inline asm // begin inline asm mov.u32 %r1111, 0; // end inline asm // begin inline asm mov.u32 %r1112, 0; // end inline asm // begin inline asm mov.u32 %r1113, 0; // end inline asm // begin inline asm mov.u32 %r1114, 0; // end inline asm // begin inline asm mov.u32 %r1115, 0; // end inline asm // begin inline asm mov.u32 %r1116, 0; // end inline asm // begin inline asm mov.u32 %r1117, 0; // end inline asm // begin inline asm mov.u32 %r1118, 0; // end inline asm // begin inline asm mov.u32 %r1119, 0; // end inline asm // begin inline asm mov.u32 %r1120, 0; // end inline asm // begin inline 
asm mov.u32 %r1121, 0; // end inline asm // begin inline asm mov.u32 %r1122, 0; // end inline asm // begin inline asm mov.u32 %r1123, 0; // end inline asm // begin inline asm mov.u32 %r1124, 0; // end inline asm // begin inline asm mov.u32 %r1125, 0; // end inline asm // begin inline asm mov.u32 %r1126, 0; // end inline asm // begin inline asm mov.u32 %r1127, 0; // end inline asm // begin inline asm mov.u32 %r1128, 0; // end inline asm // begin inline asm mov.u32 %r1129, 0; // end inline asm // begin inline asm mov.u32 %r1130, 0; // end inline asm // begin inline asm mov.u32 %r1131, 0; // end inline asm // begin inline asm mov.u32 %r1132, 0; // end inline asm // begin inline asm mov.u32 %r1133, 0; // end inline asm // begin inline asm mov.u32 %r1134, 0; // end inline asm // begin inline asm mov.u32 %r1135, 0; // end inline asm mov.b32 %f450, %r1072; mov.b32 %f451, %r1073; mov.b32 %f452, %r1074; mov.b32 %f453, %r1075; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f450, %f451, %f452, %f453}, {%r1064, %r1065, %r1066, %r1067}, {%r1591, %r1590}, {%f450, %f451, %f452, %f453}; // end inline asm mov.b32 %f458, %r1076; mov.b32 %f459, %r1077; mov.b32 %f460, %r1078; mov.b32 %f461, %r1079; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f458, %f459, %f460, %f461}, {%r1064, %r1065, %r1066, %r1067}, {%r1589, %r1588}, {%f458, %f459, %f460, %f461}; // end inline asm mov.b32 %f466, %r1080; mov.b32 %f467, %r1081; mov.b32 %f468, %r1082; mov.b32 %f469, %r1083; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f466, %f467, %f468, %f469}, {%r1064, %r1065, %r1066, %r1067}, {%r1587, %r1586}, {%f466, %f467, %f468, %f469}; // end inline asm mov.b32 %f474, %r1084; mov.b32 %f475, %r1085; mov.b32 %f476, %r1086; mov.b32 %f477, %r1087; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f474, %f475, %f476, %f477}, {%r1064, %r1065, %r1066, %r1067}, {%r1585, %r1584}, {%f474, %f475, %f476, %f477}; // end inline asm mov.b32 %f482, %r1088; mov.b32 %f483, %r1089; mov.b32 %f484, %r1090; mov.b32 %f485, %r1091; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f482, %f483, %f484, %f485}, {%r1064, %r1065, %r1066, %r1067}, {%r1583, %r1582}, {%f482, %f483, %f484, %f485}; // end inline asm mov.b32 %f490, %r1092; mov.b32 %f491, %r1093; mov.b32 %f492, %r1094; mov.b32 %f493, %r1095; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f490, %f491, %f492, %f493}, {%r1064, %r1065, %r1066, %r1067}, {%r1581, %r1580}, {%f490, %f491, %f492, %f493}; // end inline asm mov.b32 %f498, %r1096; mov.b32 %f499, %r1097; mov.b32 %f500, %r1098; mov.b32 %f501, %r1099; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f498, %f499, %f500, %f501}, {%r1064, %r1065, %r1066, %r1067}, {%r1579, %r1578}, {%f498, %f499, %f500, %f501}; // end inline asm mov.b32 %f506, %r1100; mov.b32 %f507, %r1101; mov.b32 %f508, %r1102; mov.b32 %f509, %r1103; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f506, %f507, %f508, %f509}, {%r1064, %r1065, %r1066, %r1067}, {%r1577, %r1576}, {%f506, %f507, %f508, %f509}; // end inline asm mov.b32 %f514, %r1104; mov.b32 %f515, %r1105; mov.b32 %f516, %r1106; mov.b32 %f517, %r1107; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f514, %f515, %f516, %f517}, {%r1068, %r1069, %r1070, %r1071}, {%r1591, %r1590}, {%f514, %f515, %f516, %f517}; // end inline asm mov.b32 %f522, %r1108; mov.b32 %f523, %r1109; mov.b32 %f524, %r1110; mov.b32 %f525, 
%r1111; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f522, %f523, %f524, %f525}, {%r1068, %r1069, %r1070, %r1071}, {%r1589, %r1588}, {%f522, %f523, %f524, %f525}; // end inline asm mov.b32 %f530, %r1112; mov.b32 %f531, %r1113; mov.b32 %f532, %r1114; mov.b32 %f533, %r1115; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f530, %f531, %f532, %f533}, {%r1068, %r1069, %r1070, %r1071}, {%r1587, %r1586}, {%f530, %f531, %f532, %f533}; // end inline asm mov.b32 %f538, %r1116; mov.b32 %f539, %r1117; mov.b32 %f540, %r1118; mov.b32 %f541, %r1119; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f538, %f539, %f540, %f541}, {%r1068, %r1069, %r1070, %r1071}, {%r1585, %r1584}, {%f538, %f539, %f540, %f541}; // end inline asm mov.b32 %f546, %r1120; mov.b32 %f547, %r1121; mov.b32 %f548, %r1122; mov.b32 %f549, %r1123; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f546, %f547, %f548, %f549}, {%r1068, %r1069, %r1070, %r1071}, {%r1583, %r1582}, {%f546, %f547, %f548, %f549}; // end inline asm mov.b32 %f554, %r1124; mov.b32 %f555, %r1125; mov.b32 %f556, %r1126; mov.b32 %f557, %r1127; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f554, %f555, %f556, %f557}, {%r1068, %r1069, %r1070, %r1071}, {%r1581, %r1580}, {%f554, %f555, %f556, %f557}; // end inline asm mov.b32 %f562, %r1128; mov.b32 %f563, %r1129; mov.b32 %f564, %r1130; mov.b32 %f565, %r1131; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f562, %f563, %f564, %f565}, {%r1068, %r1069, %r1070, %r1071}, {%r1579, %r1578}, {%f562, %f563, %f564, %f565}; // end inline asm mov.b32 %f570, %r1132; mov.b32 %f571, %r1133; mov.b32 %f572, %r1134; mov.b32 %f573, %r1135; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f570, %f571, %f572, %f573}, {%r1068, %r1069, %r1070, %r1071}, {%r1577, %r1576}, {%f570, %f571, %f572, %f573}; // end inline asm setp.equ.ftz.f32 %p112, %f841, 0f00000000; mov.f32 %f863, 0f3F800000; mov.f32 %f862, %f863; @%p112 bra $L__BB0_44; rcp.approx.ftz.f32 %f862, %f841; $L__BB0_44: setp.equ.ftz.f32 %p113, %f840, 0f00000000; @%p113 bra $L__BB0_46; rcp.approx.ftz.f32 %f863, %f840; $L__BB0_46: mov.b32 %f581, %r1655; fma.rn.ftz.f32 %f582, %f78, %f581, %f450; mul.ftz.f32 %f583, %f862, %f582; mov.b32 %r1655, %f583; mov.b32 %f584, %r1654; fma.rn.ftz.f32 %f585, %f78, %f584, %f451; mul.ftz.f32 %f586, %f862, %f585; mov.b32 %r1654, %f586; mov.b32 %f587, %r1653; fma.rn.ftz.f32 %f588, %f80, %f587, %f452; mul.ftz.f32 %f589, %f863, %f588; mov.b32 %r1653, %f589; mov.b32 %f590, %r1652; fma.rn.ftz.f32 %f591, %f80, %f590, %f453; mul.ftz.f32 %f592, %f863, %f591; mov.b32 %r1652, %f592; mov.b32 %f593, %r1651; fma.rn.ftz.f32 %f594, %f78, %f593, %f458; mul.ftz.f32 %f595, %f862, %f594; mov.b32 %r1651, %f595; mov.b32 %f596, %r1650; fma.rn.ftz.f32 %f597, %f78, %f596, %f459; mul.ftz.f32 %f598, %f862, %f597; mov.b32 %r1650, %f598; mov.b32 %f599, %r1649; fma.rn.ftz.f32 %f600, %f80, %f599, %f460; mul.ftz.f32 %f601, %f863, %f600; mov.b32 %r1649, %f601; mov.b32 %f602, %r1648; fma.rn.ftz.f32 %f603, %f80, %f602, %f461; mul.ftz.f32 %f604, %f863, %f603; mov.b32 %r1648, %f604; mov.b32 %f605, %r1647; fma.rn.ftz.f32 %f606, %f78, %f605, %f466; mul.ftz.f32 %f607, %f862, %f606; mov.b32 %r1647, %f607; mov.b32 %f608, %r1646; fma.rn.ftz.f32 %f609, %f78, %f608, %f467; mul.ftz.f32 %f610, %f862, %f609; mov.b32 %r1646, %f610; mov.b32 %f611, %r1645; fma.rn.ftz.f32 %f612, %f80, %f611, %f468; mul.ftz.f32 %f613, %f863, %f612; mov.b32 %r1645, 
%f613; mov.b32 %f614, %r1644; fma.rn.ftz.f32 %f615, %f80, %f614, %f469; mul.ftz.f32 %f616, %f863, %f615; mov.b32 %r1644, %f616; mov.b32 %f617, %r1643; fma.rn.ftz.f32 %f618, %f78, %f617, %f474; mul.ftz.f32 %f619, %f862, %f618; mov.b32 %r1643, %f619; mov.b32 %f620, %r1642; fma.rn.ftz.f32 %f621, %f78, %f620, %f475; mul.ftz.f32 %f622, %f862, %f621; mov.b32 %r1642, %f622; mov.b32 %f623, %r1641; fma.rn.ftz.f32 %f624, %f80, %f623, %f476; mul.ftz.f32 %f625, %f863, %f624; mov.b32 %r1641, %f625; mov.b32 %f626, %r1640; fma.rn.ftz.f32 %f627, %f80, %f626, %f477; mul.ftz.f32 %f628, %f863, %f627; mov.b32 %r1640, %f628; mov.b32 %f629, %r1639; fma.rn.ftz.f32 %f630, %f78, %f629, %f482; mul.ftz.f32 %f631, %f862, %f630; mov.b32 %r1639, %f631; mov.b32 %f632, %r1638; fma.rn.ftz.f32 %f633, %f78, %f632, %f483; mul.ftz.f32 %f634, %f862, %f633; mov.b32 %r1638, %f634; mov.b32 %f635, %r1637; fma.rn.ftz.f32 %f636, %f80, %f635, %f484; mul.ftz.f32 %f637, %f863, %f636; mov.b32 %r1637, %f637; mov.b32 %f638, %r1636; fma.rn.ftz.f32 %f639, %f80, %f638, %f485; mul.ftz.f32 %f640, %f863, %f639; mov.b32 %r1636, %f640; mov.b32 %f641, %r1635; fma.rn.ftz.f32 %f642, %f78, %f641, %f490; mul.ftz.f32 %f643, %f862, %f642; mov.b32 %r1635, %f643; mov.b32 %f644, %r1634; fma.rn.ftz.f32 %f645, %f78, %f644, %f491; mul.ftz.f32 %f646, %f862, %f645; mov.b32 %r1634, %f646; mov.b32 %f647, %r1633; fma.rn.ftz.f32 %f648, %f80, %f647, %f492; mul.ftz.f32 %f649, %f863, %f648; mov.b32 %r1633, %f649; mov.b32 %f650, %r1632; fma.rn.ftz.f32 %f651, %f80, %f650, %f493; mul.ftz.f32 %f652, %f863, %f651; mov.b32 %r1632, %f652; mov.b32 %f653, %r1631; fma.rn.ftz.f32 %f654, %f78, %f653, %f498; mul.ftz.f32 %f655, %f862, %f654; mov.b32 %r1631, %f655; mov.b32 %f656, %r1630; fma.rn.ftz.f32 %f657, %f78, %f656, %f499; mul.ftz.f32 %f658, %f862, %f657; mov.b32 %r1630, %f658; mov.b32 %f659, %r1629; fma.rn.ftz.f32 %f660, %f80, %f659, %f500; mul.ftz.f32 %f661, %f863, %f660; mov.b32 %r1629, %f661; mov.b32 %f662, %r1628; fma.rn.ftz.f32 %f663, %f80, %f662, %f501; mul.ftz.f32 %f664, %f863, %f663; mov.b32 %r1628, %f664; mov.b32 %f665, %r1627; fma.rn.ftz.f32 %f666, %f78, %f665, %f506; mul.ftz.f32 %f667, %f862, %f666; mov.b32 %r1627, %f667; mov.b32 %f668, %r1626; fma.rn.ftz.f32 %f669, %f78, %f668, %f507; mul.ftz.f32 %f670, %f862, %f669; mov.b32 %r1626, %f670; mov.b32 %f671, %r1625; fma.rn.ftz.f32 %f672, %f80, %f671, %f508; mul.ftz.f32 %f673, %f863, %f672; mov.b32 %r1625, %f673; mov.b32 %f674, %r1624; fma.rn.ftz.f32 %f675, %f80, %f674, %f509; mul.ftz.f32 %f676, %f863, %f675; mov.b32 %r1624, %f676; setp.equ.ftz.f32 %p114, %f839, 0f00000000; mov.f32 %f865, 0f3F800000; mov.f32 %f864, %f865; @%p114 bra $L__BB0_48; rcp.approx.ftz.f32 %f864, %f839; $L__BB0_48: setp.equ.ftz.f32 %p115, %f838, 0f00000000; @%p115 bra $L__BB0_50; rcp.approx.ftz.f32 %f865, %f838; $L__BB0_50: add.s32 %r1448, %r1554, 16; setp.ge.s32 %p136, %r1448, %r142; mov.b32 %f678, %r1623; fma.rn.ftz.f32 %f679, %f82, %f678, %f514; mul.ftz.f32 %f680, %f864, %f679; mov.b32 %r1623, %f680; mov.b32 %f681, %r1622; fma.rn.ftz.f32 %f682, %f82, %f681, %f515; mul.ftz.f32 %f683, %f864, %f682; mov.b32 %r1622, %f683; mov.b32 %f684, %r1621; fma.rn.ftz.f32 %f685, %f84, %f684, %f516; mul.ftz.f32 %f686, %f865, %f685; mov.b32 %r1621, %f686; mov.b32 %f687, %r1620; fma.rn.ftz.f32 %f688, %f84, %f687, %f517; mul.ftz.f32 %f689, %f865, %f688; mov.b32 %r1620, %f689; mov.b32 %f690, %r1619; fma.rn.ftz.f32 %f691, %f82, %f690, %f522; mul.ftz.f32 %f692, %f864, %f691; mov.b32 %r1619, %f692; mov.b32 %f693, %r1618; fma.rn.ftz.f32 %f694, %f82, %f693, %f523; 
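// Annotation (not part of the compiler output): continuation of the output
// accumulator update for the second pair of row fragments. Each lane appears
// to compute acc_new = (alpha*l_old*acc_old + P*V) / l_new, where %f82/%f84
// carry alpha*l_old (alpha = exp2((m_old - m_new)*log2(e))) and %f864/%f865
// hold 1/l_new from rcp.approx.ftz, forced to 1.0 by the setp.equ.ftz guards
// while the row sum is still zero; the accumulator is thus kept normalized
// on every iteration rather than only once at the end.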
mul.ftz.f32 %f695, %f864, %f694; mov.b32 %r1618, %f695; mov.b32 %f696, %r1617; fma.rn.ftz.f32 %f697, %f84, %f696, %f524; mul.ftz.f32 %f698, %f865, %f697; mov.b32 %r1617, %f698; mov.b32 %f699, %r1616; fma.rn.ftz.f32 %f700, %f84, %f699, %f525; mul.ftz.f32 %f701, %f865, %f700; mov.b32 %r1616, %f701; mov.b32 %f702, %r1615; fma.rn.ftz.f32 %f703, %f82, %f702, %f530; mul.ftz.f32 %f704, %f864, %f703; mov.b32 %r1615, %f704; mov.b32 %f705, %r1614; fma.rn.ftz.f32 %f706, %f82, %f705, %f531; mul.ftz.f32 %f707, %f864, %f706; mov.b32 %r1614, %f707; mov.b32 %f708, %r1613; fma.rn.ftz.f32 %f709, %f84, %f708, %f532; mul.ftz.f32 %f710, %f865, %f709; mov.b32 %r1613, %f710; mov.b32 %f711, %r1612; fma.rn.ftz.f32 %f712, %f84, %f711, %f533; mul.ftz.f32 %f713, %f865, %f712; mov.b32 %r1612, %f713; mov.b32 %f714, %r1611; fma.rn.ftz.f32 %f715, %f82, %f714, %f538; mul.ftz.f32 %f716, %f864, %f715; mov.b32 %r1611, %f716; mov.b32 %f717, %r1610; fma.rn.ftz.f32 %f718, %f82, %f717, %f539; mul.ftz.f32 %f719, %f864, %f718; mov.b32 %r1610, %f719; mov.b32 %f720, %r1609; fma.rn.ftz.f32 %f721, %f84, %f720, %f540; mul.ftz.f32 %f722, %f865, %f721; mov.b32 %r1609, %f722; mov.b32 %f723, %r1608; fma.rn.ftz.f32 %f724, %f84, %f723, %f541; mul.ftz.f32 %f725, %f865, %f724; mov.b32 %r1608, %f725; mov.b32 %f726, %r1607; fma.rn.ftz.f32 %f727, %f82, %f726, %f546; mul.ftz.f32 %f728, %f864, %f727; mov.b32 %r1607, %f728; mov.b32 %f729, %r1606; fma.rn.ftz.f32 %f730, %f82, %f729, %f547; mul.ftz.f32 %f731, %f864, %f730; mov.b32 %r1606, %f731; mov.b32 %f732, %r1605; fma.rn.ftz.f32 %f733, %f84, %f732, %f548; mul.ftz.f32 %f734, %f865, %f733; mov.b32 %r1605, %f734; mov.b32 %f735, %r1604; fma.rn.ftz.f32 %f736, %f84, %f735, %f549; mul.ftz.f32 %f737, %f865, %f736; mov.b32 %r1604, %f737; mov.b32 %f738, %r1603; fma.rn.ftz.f32 %f739, %f82, %f738, %f554; mul.ftz.f32 %f740, %f864, %f739; mov.b32 %r1603, %f740; mov.b32 %f741, %r1602; fma.rn.ftz.f32 %f742, %f82, %f741, %f555; mul.ftz.f32 %f743, %f864, %f742; mov.b32 %r1602, %f743; mov.b32 %f744, %r1601; fma.rn.ftz.f32 %f745, %f84, %f744, %f556; mul.ftz.f32 %f746, %f865, %f745; mov.b32 %r1601, %f746; mov.b32 %f747, %r1600; fma.rn.ftz.f32 %f748, %f84, %f747, %f557; mul.ftz.f32 %f749, %f865, %f748; mov.b32 %r1600, %f749; mov.b32 %f750, %r1599; fma.rn.ftz.f32 %f751, %f82, %f750, %f562; mul.ftz.f32 %f752, %f864, %f751; mov.b32 %r1599, %f752; mov.b32 %f753, %r1598; fma.rn.ftz.f32 %f754, %f82, %f753, %f563; mul.ftz.f32 %f755, %f864, %f754; mov.b32 %r1598, %f755; mov.b32 %f756, %r1597; fma.rn.ftz.f32 %f757, %f84, %f756, %f564; mul.ftz.f32 %f758, %f865, %f757; mov.b32 %r1597, %f758; mov.b32 %f759, %r1596; fma.rn.ftz.f32 %f760, %f84, %f759, %f565; mul.ftz.f32 %f761, %f865, %f760; mov.b32 %r1596, %f761; mov.b32 %f762, %r1595; fma.rn.ftz.f32 %f763, %f82, %f762, %f570; mul.ftz.f32 %f764, %f864, %f763; mov.b32 %r1595, %f764; mov.b32 %f765, %r1594; fma.rn.ftz.f32 %f766, %f82, %f765, %f571; mul.ftz.f32 %f767, %f864, %f766; mov.b32 %r1594, %f767; mov.b32 %f768, %r1593; fma.rn.ftz.f32 %f769, %f84, %f768, %f572; mul.ftz.f32 %f770, %f865, %f769; mov.b32 %r1593, %f770; mov.b32 %f771, %r1592; fma.rn.ftz.f32 %f772, %f84, %f771, %f573; mul.ftz.f32 %f773, %f865, %f772; mov.b32 %r1592, %f773; @%p136 bra $L__BB0_52; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1591, %r1590, %r1589, %r1588}, [%r634]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1587, %r1586, %r1585, %r1584}, [%r639]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 
{%r1583, %r1582, %r1581, %r1580}, [%r644]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1579, %r1578, %r1577, %r1576}, [%r649]; // end inline asm $L__BB0_52: add.s32 %r1554, %r1554, 16; setp.lt.s32 %p117, %r1554, %r142; add.s32 %r1555, %r1555, 16; mov.f32 %f842, %f61; mov.f32 %f843, %f60; mov.f32 %f844, %f59; mov.f32 %f845, %f58; @%p117 bra $L__BB0_3; $L__BB0_53: mov.u32 %r1456, %tid.x; and.b32 %r1455, %r1456, 96; shr.u32 %r1454, %r1455, 1; mov.u32 %r1453, _ZN25fused_multihead_attention5smem_E; mov.b64 %rd180, fmha_v2_flash_attention_fp16_128_16_S_64_sm86_kernel_nl_param_0; mov.u64 %rd179, %rd180; ld.param.u32 %r1452, [%rd179+60]; mul.lo.s32 %r1451, %r1, %r715; mad.lo.s32 %r1450, %r1451, %r2, %r716; bar.sync 0; mul.lo.s32 %r1338, %r1450, %r1452; shl.b32 %r1339, %r1338, 1; cvt.s64.s32 %rd138, %r1339; add.s64 %rd33, %rd138, %rd47; mov.b32 %f774, %r1654; mov.b32 %f775, %r1655; // begin inline asm cvt.rn.f16x2.f32 %r1266, %f774, %f775; // end inline asm mov.b32 %f776, %r1652; mov.b32 %f777, %r1653; // begin inline asm cvt.rn.f16x2.f32 %r1267, %f776, %f777; // end inline asm shl.b32 %r1342, %r1456, 2; and.b32 %r1343, %r1342, 124; add.s32 %r1345, %r1343, %r1453; and.b32 %r1348, %r1456, 28; shr.u32 %r1349, %r1348, 2; or.b32 %r1350, %r1454, %r1349; shl.b32 %r1351, %r1350, 7; add.s32 %r1352, %r1345, %r1351; add.s32 %r1268, %r1352, 16384; // begin inline asm st.shared.b32 [%r1268], %r1266; // end inline asm add.s32 %r508, %r1352, 17408; // begin inline asm st.shared.b32 [%r508], %r1267; // end inline asm xor.b32 %r1274, %r1268, 16; mov.b32 %f778, %r1650; mov.b32 %f779, %r1651; // begin inline asm cvt.rn.f16x2.f32 %r1272, %f778, %f779; // end inline asm mov.b32 %f780, %r1648; mov.b32 %f781, %r1649; // begin inline asm cvt.rn.f16x2.f32 %r1273, %f780, %f781; // end inline asm // begin inline asm st.shared.b32 [%r1274], %r1272; // end inline asm add.s32 %r1276, %r1274, 1024; // begin inline asm st.shared.b32 [%r1276], %r1273; // end inline asm xor.b32 %r1280, %r1268, 32; mov.b32 %f782, %r1646; mov.b32 %f783, %r1647; // begin inline asm cvt.rn.f16x2.f32 %r1278, %f782, %f783; // end inline asm mov.b32 %f784, %r1644; mov.b32 %f785, %r1645; // begin inline asm cvt.rn.f16x2.f32 %r1279, %f784, %f785; // end inline asm // begin inline asm st.shared.b32 [%r1280], %r1278; // end inline asm add.s32 %r1282, %r1280, 1024; // begin inline asm st.shared.b32 [%r1282], %r1279; // end inline asm xor.b32 %r1286, %r1268, 48; mov.b32 %f786, %r1642; mov.b32 %f787, %r1643; // begin inline asm cvt.rn.f16x2.f32 %r1284, %f786, %f787; // end inline asm mov.b32 %f788, %r1640; mov.b32 %f789, %r1641; // begin inline asm cvt.rn.f16x2.f32 %r1285, %f788, %f789; // end inline asm // begin inline asm st.shared.b32 [%r1286], %r1284; // end inline asm add.s32 %r1288, %r1286, 1024; // begin inline asm st.shared.b32 [%r1288], %r1285; // end inline asm xor.b32 %r1292, %r1268, 64; mov.b32 %f790, %r1638; mov.b32 %f791, %r1639; // begin inline asm cvt.rn.f16x2.f32 %r1290, %f790, %f791; // end inline asm mov.b32 %f792, %r1636; mov.b32 %f793, %r1637; // begin inline asm cvt.rn.f16x2.f32 %r1291, %f792, %f793; // end inline asm // begin inline asm st.shared.b32 [%r1292], %r1290; // end inline asm add.s32 %r1294, %r1292, 1024; // begin inline asm st.shared.b32 [%r1294], %r1291; // end inline asm xor.b32 %r1298, %r1268, 80; mov.b32 %f794, %r1634; mov.b32 %f795, %r1635; // begin inline asm cvt.rn.f16x2.f32 %r1296, %f794, %f795; // end inline asm mov.b32 %f796, %r1632; mov.b32 %f797, %r1633; // begin inline 
asm cvt.rn.f16x2.f32 %r1297, %f796, %f797; // end inline asm // begin inline asm st.shared.b32 [%r1298], %r1296; // end inline asm add.s32 %r1300, %r1298, 1024; // begin inline asm st.shared.b32 [%r1300], %r1297; // end inline asm xor.b32 %r1304, %r1268, 96; mov.b32 %f798, %r1630; mov.b32 %f799, %r1631; // begin inline asm cvt.rn.f16x2.f32 %r1302, %f798, %f799; // end inline asm mov.b32 %f800, %r1628; mov.b32 %f801, %r1629; // begin inline asm cvt.rn.f16x2.f32 %r1303, %f800, %f801; // end inline asm // begin inline asm st.shared.b32 [%r1304], %r1302; // end inline asm add.s32 %r1306, %r1304, 1024; // begin inline asm st.shared.b32 [%r1306], %r1303; // end inline asm xor.b32 %r1310, %r1268, 112; mov.b32 %f802, %r1626; mov.b32 %f803, %r1627; // begin inline asm cvt.rn.f16x2.f32 %r1308, %f802, %f803; // end inline asm mov.b32 %f804, %r1624; mov.b32 %f805, %r1625; // begin inline asm cvt.rn.f16x2.f32 %r1309, %f804, %f805; // end inline asm // begin inline asm st.shared.b32 [%r1310], %r1308; // end inline asm add.s32 %r1312, %r1310, 1024; // begin inline asm st.shared.b32 [%r1312], %r1309; // end inline asm bar.sync 0; // begin inline asm ld.shared.v4.b32 {%r1314, %r1315, %r1316, %r1317}, [%r10]; // end inline asm add.s32 %r1323, %r10, 2048; // begin inline asm ld.shared.v4.b32 {%r1319, %r1320, %r1321, %r1322}, [%r1323]; // end inline asm add.s32 %r1328, %r10, 4096; // begin inline asm ld.shared.v4.b32 {%r1324, %r1325, %r1326, %r1327}, [%r1328]; // end inline asm add.s32 %r1333, %r10, 6144; // begin inline asm ld.shared.v4.b32 {%r1329, %r1330, %r1331, %r1332}, [%r1333]; // end inline asm bar.sync 0; cvt.u32.u64 %r1353, %rd5; setp.ge.s32 %p118, %r1353, %r1; @%p118 bra $L__BB0_64; setp.ge.s32 %p119, %r7, %r9; @%p119 bra $L__BB0_56; mul.lo.s64 %rd140, %rd8, %rd5; add.s64 %rd141, %rd33, %rd140; cvta.to.global.u64 %rd142, %rd9; add.s64 %rd143, %rd142, %rd141; st.global.v4.u32 [%rd143], {%r1314, %r1315, %r1316, %r1317}; $L__BB0_56: add.s32 %r1355, %r1353, 16; setp.ge.s32 %p120, %r1355, %r1; @%p120 bra $L__BB0_64; @%p119 bra $L__BB0_59; add.s64 %rd144, %rd5, 16; mul.lo.s64 %rd145, %rd144, %rd8; add.s64 %rd146, %rd33, %rd145; cvta.to.global.u64 %rd147, %rd9; add.s64 %rd148, %rd147, %rd146; st.global.v4.u32 [%rd148], {%r1319, %r1320, %r1321, %r1322}; $L__BB0_59: add.s32 %r1357, %r1353, 32; setp.ge.s32 %p122, %r1357, %r1; @%p122 bra $L__BB0_64; @%p119 bra $L__BB0_62; add.s64 %rd149, %rd5, 32; mul.lo.s64 %rd150, %rd149, %rd8; add.s64 %rd151, %rd33, %rd150; cvta.to.global.u64 %rd152, %rd9; add.s64 %rd153, %rd152, %rd151; st.global.v4.u32 [%rd153], {%r1324, %r1325, %r1326, %r1327}; $L__BB0_62: add.s32 %r1359, %r1353, 48; setp.ge.s32 %p124, %r1359, %r1; or.pred %p126, %p124, %p119; @%p126 bra $L__BB0_64; add.s64 %rd154, %rd5, 48; mul.lo.s64 %rd155, %rd154, %rd8; add.s64 %rd156, %rd33, %rd155; cvta.to.global.u64 %rd157, %rd9; add.s64 %rd158, %rd157, %rd156; st.global.v4.u32 [%rd158], {%r1329, %r1330, %r1331, %r1332}; $L__BB0_64: mov.b32 %f806, %r1622; mov.b32 %f807, %r1623; // begin inline asm cvt.rn.f16x2.f32 %r1360, %f806, %f807; // end inline asm mov.b32 %f808, %r1620; mov.b32 %f809, %r1621; // begin inline asm cvt.rn.f16x2.f32 %r1361, %f808, %f809; // end inline asm // begin inline asm st.shared.b32 [%r1268], %r1360; // end inline asm // begin inline asm st.shared.b32 [%r508], %r1361; // end inline asm mov.b32 %f810, %r1618; mov.b32 %f811, %r1619; // begin inline asm cvt.rn.f16x2.f32 %r1366, %f810, %f811; // end inline asm mov.b32 %f812, %r1616; mov.b32 %f813, %r1617; // begin inline asm cvt.rn.f16x2.f32 
%r1367, %f812, %f813; // end inline asm // begin inline asm st.shared.b32 [%r1274], %r1366; // end inline asm // begin inline asm st.shared.b32 [%r1276], %r1367; // end inline asm mov.b32 %f814, %r1614; mov.b32 %f815, %r1615; // begin inline asm cvt.rn.f16x2.f32 %r1372, %f814, %f815; // end inline asm mov.b32 %f816, %r1612; mov.b32 %f817, %r1613; // begin inline asm cvt.rn.f16x2.f32 %r1373, %f816, %f817; // end inline asm // begin inline asm st.shared.b32 [%r1280], %r1372; // end inline asm // begin inline asm st.shared.b32 [%r1282], %r1373; // end inline asm mov.b32 %f818, %r1610; mov.b32 %f819, %r1611; // begin inline asm cvt.rn.f16x2.f32 %r1378, %f818, %f819; // end inline asm mov.b32 %f820, %r1608; mov.b32 %f821, %r1609; // begin inline asm cvt.rn.f16x2.f32 %r1379, %f820, %f821; // end inline asm // begin inline asm st.shared.b32 [%r1286], %r1378; // end inline asm // begin inline asm st.shared.b32 [%r1288], %r1379; // end inline asm mov.b32 %f822, %r1606; mov.b32 %f823, %r1607; // begin inline asm cvt.rn.f16x2.f32 %r1384, %f822, %f823; // end inline asm mov.b32 %f824, %r1604; mov.b32 %f825, %r1605; // begin inline asm cvt.rn.f16x2.f32 %r1385, %f824, %f825; // end inline asm // begin inline asm st.shared.b32 [%r1292], %r1384; // end inline asm // begin inline asm st.shared.b32 [%r1294], %r1385; // end inline asm mov.b32 %f826, %r1602; mov.b32 %f827, %r1603; // begin inline asm cvt.rn.f16x2.f32 %r1390, %f826, %f827; // end inline asm mov.b32 %f828, %r1600; mov.b32 %f829, %r1601; // begin inline asm cvt.rn.f16x2.f32 %r1391, %f828, %f829; // end inline asm // begin inline asm st.shared.b32 [%r1298], %r1390; // end inline asm // begin inline asm st.shared.b32 [%r1300], %r1391; // end inline asm mov.b32 %f830, %r1598; mov.b32 %f831, %r1599; // begin inline asm cvt.rn.f16x2.f32 %r1396, %f830, %f831; // end inline asm mov.b32 %f832, %r1596; mov.b32 %f833, %r1597; // begin inline asm cvt.rn.f16x2.f32 %r1397, %f832, %f833; // end inline asm // begin inline asm st.shared.b32 [%r1304], %r1396; // end inline asm // begin inline asm st.shared.b32 [%r1306], %r1397; // end inline asm mov.b32 %f834, %r1594; mov.b32 %f835, %r1595; // begin inline asm cvt.rn.f16x2.f32 %r1402, %f834, %f835; // end inline asm mov.b32 %f836, %r1592; mov.b32 %f837, %r1593; // begin inline asm cvt.rn.f16x2.f32 %r1403, %f836, %f837; // end inline asm // begin inline asm st.shared.b32 [%r1310], %r1402; // end inline asm // begin inline asm st.shared.b32 [%r1312], %r1403; // end inline asm bar.sync 0; // begin inline asm ld.shared.v4.b32 {%r1408, %r1409, %r1410, %r1411}, [%r10]; // end inline asm // begin inline asm ld.shared.v4.b32 {%r1413, %r1414, %r1415, %r1416}, [%r1323]; // end inline asm add.s32 %r1422, %r1323, 2048; // begin inline asm ld.shared.v4.b32 {%r1418, %r1419, %r1420, %r1421}, [%r1422]; // end inline asm add.s32 %r1427, %r1323, 4096; // begin inline asm ld.shared.v4.b32 {%r1423, %r1424, %r1425, %r1426}, [%r1427]; // end inline asm add.s32 %r1441, %r1353, 64; setp.ge.s32 %p127, %r1441, %r1; @%p127 bra $L__BB0_75; setp.ge.s32 %p128, %r7, %r9; @%p128 bra $L__BB0_67; add.s64 %rd159, %rd5, 64; mul.lo.s64 %rd160, %rd159, %rd8; add.s64 %rd161, %rd33, %rd160; cvta.to.global.u64 %rd162, %rd9; add.s64 %rd163, %rd162, %rd161; st.global.v4.u32 [%rd163], {%r1408, %r1409, %r1410, %r1411}; $L__BB0_67: add.s32 %r1443, %r1353, 80; setp.ge.s32 %p129, %r1443, %r1; @%p129 bra $L__BB0_75; @%p128 bra $L__BB0_70; add.s64 %rd164, %rd5, 80; mul.lo.s64 %rd165, %rd164, %rd8; add.s64 %rd166, %rd33, %rd165; cvta.to.global.u64 %rd167, %rd9; 
add.s64 %rd168, %rd167, %rd166; st.global.v4.u32 [%rd168], {%r1413, %r1414, %r1415, %r1416}; $L__BB0_70: add.s32 %r1445, %r1353, 96; setp.ge.s32 %p131, %r1445, %r1; @%p131 bra $L__BB0_75; @%p128 bra $L__BB0_73; add.s64 %rd169, %rd5, 96; mul.lo.s64 %rd170, %rd169, %rd8; add.s64 %rd171, %rd33, %rd170; cvta.to.global.u64 %rd172, %rd9; add.s64 %rd173, %rd172, %rd171; st.global.v4.u32 [%rd173], {%r1418, %r1419, %r1420, %r1421}; $L__BB0_73: add.s32 %r1447, %r1353, 112; setp.ge.s32 %p133, %r1447, %r1; or.pred %p135, %p133, %p128; @%p135 bra $L__BB0_75; add.s64 %rd174, %rd5, 112; mul.lo.s64 %rd175, %rd174, %rd8; add.s64 %rd176, %rd33, %rd175; cvta.to.global.u64 %rd177, %rd9; add.s64 %rd178, %rd177, %rd176; st.global.v4.u32 [%rd178], {%r1423, %r1424, %r1425, %r1426}; $L__BB0_75: ret; }
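// ---------------------------------------------------------------------------
// Annotation (not part of the compiler output). Per its name, this kernel is
// the fp16 fused multi-head (flash) attention variant for head size S = 64,
// 128-row Q tiles and 16-column K/V steps on sm_86. As a reading aid, a
// minimal C-style sketch of the per-row recurrence the main loop appears to
// implement; every name below is hypothetical:
//
//   m = -INFINITY; l = 0.0f; float acc[64] = {0.0f};
//   for (each 16-column tile of K and V) {
//       s[16] = scale * dot(q_row, k_tile);      // QK^T mma; scale is the
//                                                // f32 at param offset +64
//       s[j]  = (j < seqlen) ? s[j] : -INFINITY; // bounds mask via selp
//       if (has_bias) s += fp16_bias_tile;       // optional additive path,
//                                                // flag byte at param +160
//       m_new = max(m, max(s));                  // shfl.bfly max reduction
//       p[16] = exp2f((s - m_new) * M_LOG2E);    // ex2.approx.ftz
//       alpha = exp2f((m - m_new) * M_LOG2E);
//       l_new = alpha * l + sum(p);              // shfl.bfly sum reduction
//       acc   = (alpha * l * acc + dot(p, v_tile)) / l_new;  // PV mma
//       m = m_new; l = l_new;
//   }
//   // epilogue: round acc to fp16, swizzle through shared memory, and store
//   // the 128 x 64 output tile with per-row sequence-length guards.
// ---------------------------------------------------------------------------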