.extern .shared .align 16 .b8 _ZN25fused_multihead_attention5smem_E[]; .visible .entry fmha_v2_flash_attention_fp16_64_32_S_80_sm86_kernel_nl( .param .align 8 .b8 fmha_v2_flash_attention_fp16_64_32_S_80_sm86_kernel_nl_param_0[168] ) { .reg .pred %p<149>; .reg .b16 %rs<147>; .reg .f32 %f<762>; .reg .b32 %r<2068>; .reg .b64 %rd<191>; mov.b64 %rd42, fmha_v2_flash_attention_fp16_64_32_S_80_sm86_kernel_nl_param_0; mov.u64 %rd1, %rd42; ld.param.u32 %r1, [fmha_v2_flash_attention_fp16_64_32_S_80_sm86_kernel_nl_param_0+56]; cvt.s64.s32 %rd2, %r1; ld.param.u32 %r2, [fmha_v2_flash_attention_fp16_64_32_S_80_sm86_kernel_nl_param_0+52]; mov.u32 %r680, %ctaid.z; shl.b32 %r3, %r680, 6; setp.le.s32 %p5, %r1, %r3; @%p5 bra $L__BB0_85; mov.u32 %r681, %tid.x; mov.u32 %r682, %ctaid.y; mov.u32 %r683, %ctaid.x; mul.lo.s32 %r684, %r1, %r682; mad.lo.s32 %r685, %r684, %r2, %r683; shr.s32 %r686, %r681, 31; shr.u32 %r687, %r686, 27; add.s32 %r688, %r681, %r687; and.b32 %r689, %r688, -32; sub.s32 %r690, %r681, %r689; shr.u32 %r691, %r686, 25; add.s32 %r692, %r681, %r691; shr.s32 %r693, %r692, 7; shl.b32 %r694, %r693, 4; shr.s32 %r695, %r690, 31; shr.u32 %r696, %r695, 30; add.s32 %r697, %r690, %r696; and.b32 %r698, %r697, 2147483644; sub.s32 %r699, %r690, %r698; shl.b32 %r700, %r699, 1; add.s32 %r1945, %r700, %r694; ld.param.u64 %rd3, [%rd1+16]; ld.param.u64 %rd4, [%rd1+40]; shr.s32 %r701, %r688, 5; shr.s32 %r702, %r688, 31; shr.u32 %r703, %r702, 30; add.s32 %r704, %r701, %r703; and.b32 %r705, %r704, 268435452; sub.s32 %r706, %r701, %r705; shl.b32 %r707, %r706, 4; shr.s32 %r708, %r697, 2; add.s32 %r5, %r707, %r708; shr.u32 %r709, %r686, 28; add.s32 %r710, %r681, %r709; and.b32 %r711, %r710, -16; sub.s32 %r6, %r681, %r711; setp.gt.s32 %p6, %r6, 9; shr.s32 %r7, %r710, 4; add.s32 %r712, %r7, %r3; cvt.s64.s32 %rd5, %r712; ld.param.u64 %rd6, [%rd1+24]; mul.lo.s64 %rd43, %rd6, %rd5; mul.lo.s32 %r713, %r685, 3; mul.wide.s32 %rd44, %r713, 160; shl.b32 %r714, %r6, 4; cvt.s64.s32 %rd45, %r714; add.s64 %rd46, %rd44, %rd45; add.s64 %rd47, %rd46, %rd43; ld.param.u64 %rd48, [%rd1]; add.s64 %rd7, %rd48, %rd47; shr.s32 %r715, %r710, 31; shr.u32 %r716, %r715, 29; add.s32 %r717, %r7, %r716; and.b32 %r718, %r717, 268435448; sub.s32 %r719, %r7, %r718; xor.b32 %r720, %r719, %r6; shl.b32 %r721, %r7, 8; shl.b32 %r722, %r720, 4; mov.u32 %r723, 31; mov.u32 %r724, 0; mov.u32 %r725, -1; shfl.sync.idx.b32 %r8|%p1, %r724, %r724, %r723, %r725; shfl.sync.idx.b32 %r23|%p7, %r724, %r724, %r723, %r725; and.b32 %r726, %r681, 96; shr.u32 %r727, %r726, 1; and.b32 %r728, %r681, 15; or.b32 %r729, %r727, %r728; and.b32 %r730, %r681, 7; shl.b32 %r731, %r681, 4; and.b32 %r732, %r731, 112; and.b32 %r733, %r681, 16; xor.b32 %r734, %r732, %r733; cvt.s64.s32 %rd49, %r7; mul.lo.s64 %rd50, %rd6, %rd49; add.s64 %rd51, %rd46, %rd50; add.s64 %rd52, %rd48, %rd51; add.s64 %rd190, %rd52, 160; shfl.sync.idx.b32 %r9|%p2, %r724, %r724, %r723, %r725; shfl.sync.idx.b32 %r10|%p3, %r724, %r724, %r723, %r725; shr.u32 %r735, %r733, 1; or.b32 %r736, %r735, %r730; and.b32 %r737, %r681, 8; shr.u32 %r738, %r737, 3; xor.b32 %r739, %r738, %r730; add.s64 %rd189, %rd52, 320; shfl.sync.idx.b32 %r740|%p8, %r724, %r724, %r723, %r725; shfl.sync.idx.b32 %r11|%p4, %r724, %r724, %r723, %r725; ld.param.u64 %rd12, [%rd1+32]; ld.param.u64 %rd13, [%rd1+8]; sub.s32 %r741, %r1, %r3; min.s32 %r13, %r741, 64; shl.b32 %r745, %r681, 8; and.b32 %r746, %r745, 3840; shl.b32 %r747, %r739, 4; shl.b32 %r748, %r736, 8; shl.b32 %r749, %r729, 8; add.s32 %r15, %r7, 8; add.s32 %r16, %r7, 16; add.s32 %r17, %r7,
24; add.s32 %r18, %r722, %r721; or.b32 %r19, %r749, %r734; or.b32 %r20, %r748, %r747; or.b32 %r21, %r734, %r746; mov.u32 %r750, _ZN25fused_multihead_attention5smem_E; add.s32 %r751, %r18, %r750; add.s32 %r22, %r751, 16384; @%p6 bra $L__BB0_3; shl.b64 %rd61, %rd6, 3; add.s32 %r768, %r7, 56; setp.lt.s32 %p9, %r768, %r13; add.s32 %r769, %r7, 48; setp.lt.s32 %p10, %r769, %r13; add.s32 %r770, %r7, 40; setp.lt.s32 %p11, %r770, %r13; add.s32 %r771, %r7, 32; setp.lt.s32 %p12, %r771, %r13; selp.b32 %r763, 16, 0, %p11; add.s32 %r752, %r751, %r23; add.s32 %r754, %r752, 2048; add.s32 %r756, %r752, 4096; add.s32 %r758, %r752, 6144; add.s32 %r760, %r752, 8192; add.s32 %r762, %r752, 10240; add.s32 %r764, %r752, 12288; add.s32 %r766, %r752, 14336; setp.lt.s32 %p13, %r7, %r13; selp.b32 %r753, 16, 0, %p13; // begin inline asm cp.async.cg.shared.global [%r752], [%rd7], 16, %r753; // end inline asm setp.lt.s32 %p14, %r15, %r13; selp.b32 %r755, 16, 0, %p14; add.s64 %rd54, %rd7, %rd61; // begin inline asm cp.async.cg.shared.global [%r754], [%rd54], 16, %r755; // end inline asm setp.lt.s32 %p15, %r16, %r13; selp.b32 %r757, 16, 0, %p15; add.s64 %rd55, %rd54, %rd61; // begin inline asm cp.async.cg.shared.global [%r756], [%rd55], 16, %r757; // end inline asm setp.lt.s32 %p16, %r17, %r13; selp.b32 %r759, 16, 0, %p16; add.s64 %rd56, %rd55, %rd61; // begin inline asm cp.async.cg.shared.global [%r758], [%rd56], 16, %r759; // end inline asm selp.b32 %r761, 16, 0, %p12; add.s64 %rd57, %rd56, %rd61; // begin inline asm cp.async.cg.shared.global [%r760], [%rd57], 16, %r761; // end inline asm add.s64 %rd58, %rd57, %rd61; // begin inline asm cp.async.cg.shared.global [%r762], [%rd58], 16, %r763; // end inline asm selp.b32 %r765, 16, 0, %p10; add.s64 %rd59, %rd58, %rd61; // begin inline asm cp.async.cg.shared.global [%r764], [%rd59], 16, %r765; // end inline asm selp.b32 %r767, 16, 0, %p9; add.s64 %rd60, %rd59, %rd61; // begin inline asm cp.async.cg.shared.global [%r766], [%rd60], 16, %r767; // end inline asm $L__BB0_3: min.s32 %r25, %r1, 32; @%p6 bra $L__BB0_5; shl.b64 %rd66, %rd6, 3; setp.lt.s32 %p18, %r17, %r25; add.s32 %r774, %r22, %r10; add.s32 %r776, %r774, 2048; add.s32 %r778, %r774, 4096; add.s32 %r780, %r774, 6144; setp.lt.s32 %p19, %r7, %r25; selp.b32 %r775, 16, 0, %p19; // begin inline asm cp.async.cg.shared.global [%r774], [%rd190], 16, %r775; // end inline asm setp.lt.s32 %p20, %r15, %r25; selp.b32 %r777, 16, 0, %p20; add.s64 %rd63, %rd190, %rd66; // begin inline asm cp.async.cg.shared.global [%r776], [%rd63], 16, %r777; // end inline asm setp.lt.s32 %p21, %r16, %r25; selp.b32 %r779, 16, 0, %p21; add.s64 %rd64, %rd63, %rd66; // begin inline asm cp.async.cg.shared.global [%r778], [%rd64], 16, %r779; // end inline asm selp.b32 %r781, 16, 0, %p18; add.s64 %rd65, %rd64, %rd66; // begin inline asm cp.async.cg.shared.global [%r780], [%rd65], 16, %r781; // end inline asm $L__BB0_5: @%p6 bra $L__BB0_7; shl.b64 %rd71, %rd6, 3; setp.lt.s32 %p23, %r17, %r25; add.s32 %r792, %r751, %r11; add.s32 %r782, %r792, 24576; add.s32 %r784, %r792, 26624; add.s32 %r786, %r792, 28672; add.s32 %r788, %r792, 30720; setp.lt.s32 %p24, %r7, %r25; selp.b32 %r783, 16, 0, %p24; // begin inline asm cp.async.cg.shared.global [%r782], [%rd189], 16, %r783; // end inline asm setp.lt.s32 %p25, %r15, %r25; selp.b32 %r785, 16, 0, %p25; add.s64 %rd68, %rd189, %rd71; // begin inline asm cp.async.cg.shared.global [%r784], [%rd68], 16, %r785; // end inline asm setp.lt.s32 %p26, %r16, %r25; selp.b32 %r787, 16, 0, %p26; add.s64 %rd69, %rd68, %rd71; // begin 
inline asm cp.async.cg.shared.global [%r786], [%rd69], 16, %r787; // end inline asm selp.b32 %r789, 16, 0, %p23; add.s64 %rd70, %rd69, %rd71; // begin inline asm cp.async.cg.shared.global [%r788], [%rd70], 16, %r789; // end inline asm $L__BB0_7: setp.lt.s32 %p27, %r6, 10; // begin inline asm cp.async.commit_group; // end inline asm @%p27 bra $L__BB0_9; add.s32 %r793, %r751, %r23; add.s32 %r798, %r793, 2048; add.s32 %r803, %r793, 4096; add.s32 %r808, %r793, 6144; add.s32 %r813, %r793, 8192; add.s32 %r818, %r793, 10240; add.s32 %r823, %r793, 12288; add.s32 %r828, %r793, 14336; mov.u32 %r872, 0; // begin inline asm st.shared.v4.b32 [%r793], {%r872, %r872, %r872, %r872}; // end inline asm // begin inline asm st.shared.v4.b32 [%r798], {%r872, %r872, %r872, %r872}; // end inline asm // begin inline asm st.shared.v4.b32 [%r803], {%r872, %r872, %r872, %r872}; // end inline asm // begin inline asm st.shared.v4.b32 [%r808], {%r872, %r872, %r872, %r872}; // end inline asm // begin inline asm st.shared.v4.b32 [%r813], {%r872, %r872, %r872, %r872}; // end inline asm // begin inline asm st.shared.v4.b32 [%r818], {%r872, %r872, %r872, %r872}; // end inline asm // begin inline asm st.shared.v4.b32 [%r823], {%r872, %r872, %r872, %r872}; // end inline asm // begin inline asm st.shared.v4.b32 [%r828], {%r872, %r872, %r872, %r872}; // end inline asm add.s32 %r833, %r22, %r10; add.s32 %r838, %r833, 2048; add.s32 %r843, %r833, 4096; add.s32 %r848, %r833, 6144; // begin inline asm st.shared.v4.b32 [%r833], {%r872, %r872, %r872, %r872}; // end inline asm // begin inline asm st.shared.v4.b32 [%r838], {%r872, %r872, %r872, %r872}; // end inline asm // begin inline asm st.shared.v4.b32 [%r843], {%r872, %r872, %r872, %r872}; // end inline asm // begin inline asm st.shared.v4.b32 [%r848], {%r872, %r872, %r872, %r872}; // end inline asm add.s32 %r875, %r751, %r11; add.s32 %r853, %r875, 24576; add.s32 %r858, %r875, 26624; add.s32 %r863, %r875, 28672; add.s32 %r868, %r875, 30720; // begin inline asm st.shared.v4.b32 [%r853], {%r872, %r872, %r872, %r872}; // end inline asm // begin inline asm st.shared.v4.b32 [%r858], {%r872, %r872, %r872, %r872}; // end inline asm // begin inline asm st.shared.v4.b32 [%r863], {%r872, %r872, %r872, %r872}; // end inline asm // begin inline asm st.shared.v4.b32 [%r868], {%r872, %r872, %r872, %r872}; // end inline asm $L__BB0_9: // begin inline asm cp.async.wait_group 0; // end inline asm bar.sync 0; add.s32 %r1042, %r8, %r750; add.s32 %r880, %r1042, %r19; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r876, %r877, %r878, %r879}, [%r880]; // end inline asm xor.b32 %r1043, %r19, 32; add.s32 %r885, %r1042, %r1043; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r881, %r882, %r883, %r884}, [%r885]; // end inline asm xor.b32 %r1044, %r19, 64; add.s32 %r890, %r1042, %r1044; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r886, %r887, %r888, %r889}, [%r890]; // end inline asm xor.b32 %r1045, %r19, 96; add.s32 %r895, %r1042, %r1045; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r891, %r892, %r893, %r894}, [%r895]; // end inline asm or.b32 %r1046, %r19, 128; add.s32 %r900, %r1042, %r1046; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r896, %r897, %r898, %r899}, [%r900]; // end inline asm add.s32 %r1047, %r9, %r750; add.s32 %r47, %r1047, 16384; add.s32 %r905, %r47, %r20; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1987, %r1986, %r1985, %r1984}, [%r905]; // end inline asm add.s32 %r910, %r905, 4096; // 
begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1983, %r1982, %r1981, %r1980}, [%r910]; // end inline asm xor.b32 %r1048, %r20, 32; add.s32 %r915, %r47, %r1048; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1979, %r1978, %r1977, %r1976}, [%r915]; // end inline asm add.s32 %r920, %r915, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1975, %r1974, %r1973, %r1972}, [%r920]; // end inline asm xor.b32 %r1049, %r20, 64; add.s32 %r925, %r47, %r1049; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1971, %r1970, %r1969, %r1968}, [%r925]; // end inline asm add.s32 %r930, %r925, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1967, %r1966, %r1965, %r1964}, [%r930]; // end inline asm xor.b32 %r1050, %r20, 96; add.s32 %r935, %r47, %r1050; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1963, %r1962, %r1961, %r1960}, [%r935]; // end inline asm add.s32 %r940, %r935, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1959, %r1958, %r1957, %r1956}, [%r940]; // end inline asm or.b32 %r1051, %r20, 128; add.s32 %r945, %r47, %r1051; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1955, %r1954, %r1953, %r1952}, [%r945]; // end inline asm add.s32 %r950, %r945, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1951, %r1950, %r1949, %r1948}, [%r950]; // end inline asm add.s32 %r1052, %r750, 24576; add.s32 %r955, %r21, %r1052; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2027, %r2026, %r2025, %r2024}, [%r955]; // end inline asm xor.b32 %r1053, %r21, 32; add.s32 %r960, %r1053, %r1052; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2023, %r2022, %r2021, %r2020}, [%r960]; // end inline asm xor.b32 %r1054, %r21, 64; add.s32 %r965, %r1054, %r1052; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2019, %r2018, %r2017, %r2016}, [%r965]; // end inline asm xor.b32 %r1055, %r21, 96; add.s32 %r970, %r1055, %r1052; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2015, %r2014, %r2013, %r2012}, [%r970]; // end inline asm or.b32 %r1056, %r21, 128; add.s32 %r975, %r1056, %r1052; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2011, %r2010, %r2009, %r2008}, [%r975]; // end inline asm add.s32 %r1057, %r750, 28672; add.s32 %r980, %r21, %r1057; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2007, %r2006, %r2005, %r2004}, [%r980]; // end inline asm add.s32 %r985, %r1053, %r1057; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2003, %r2002, %r2001, %r2000}, [%r985]; // end inline asm add.s32 %r990, %r1054, %r1057; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1999, %r1998, %r1997, %r1996}, [%r990]; // end inline asm add.s32 %r995, %r1055, %r1057; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1995, %r1994, %r1993, %r1992}, [%r995]; // end inline asm add.s32 %r1000, %r1056, %r1057; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1991, %r1990, %r1989, %r1988}, [%r1000]; // end inline asm ld.param.f32 %f1, [%rd1+64]; // begin inline asm mov.u32 %r2067, 0; // end inline asm // begin inline asm mov.u32 %r2066, 0; // end inline asm // begin inline asm mov.u32 %r2065, 0; // end inline asm // begin inline asm mov.u32 %r2064, 0; // end inline asm // begin inline asm mov.u32 %r2063, 0; // end inline asm // begin inline asm mov.u32 %r2062, 0; // end inline asm // 
begin inline asm mov.u32 %r2061, 0; // end inline asm // begin inline asm mov.u32 %r2060, 0; // end inline asm // begin inline asm mov.u32 %r2059, 0; // end inline asm // begin inline asm mov.u32 %r2058, 0; // end inline asm // begin inline asm mov.u32 %r2057, 0; // end inline asm // begin inline asm mov.u32 %r2056, 0; // end inline asm // begin inline asm mov.u32 %r2055, 0; // end inline asm // begin inline asm mov.u32 %r2054, 0; // end inline asm // begin inline asm mov.u32 %r2053, 0; // end inline asm // begin inline asm mov.u32 %r2052, 0; // end inline asm // begin inline asm mov.u32 %r2051, 0; // end inline asm // begin inline asm mov.u32 %r2050, 0; // end inline asm // begin inline asm mov.u32 %r2049, 0; // end inline asm // begin inline asm mov.u32 %r2048, 0; // end inline asm // begin inline asm mov.u32 %r2047, 0; // end inline asm // begin inline asm mov.u32 %r2046, 0; // end inline asm // begin inline asm mov.u32 %r2045, 0; // end inline asm // begin inline asm mov.u32 %r2044, 0; // end inline asm // begin inline asm mov.u32 %r2043, 0; // end inline asm // begin inline asm mov.u32 %r2042, 0; // end inline asm // begin inline asm mov.u32 %r2041, 0; // end inline asm // begin inline asm mov.u32 %r2040, 0; // end inline asm // begin inline asm mov.u32 %r2039, 0; // end inline asm // begin inline asm mov.u32 %r2038, 0; // end inline asm // begin inline asm mov.u32 %r2037, 0; // end inline asm // begin inline asm mov.u32 %r2036, 0; // end inline asm // begin inline asm mov.u32 %r2035, 0; // end inline asm // begin inline asm mov.u32 %r2034, 0; // end inline asm // begin inline asm mov.u32 %r2033, 0; // end inline asm // begin inline asm mov.u32 %r2032, 0; // end inline asm // begin inline asm mov.u32 %r2031, 0; // end inline asm // begin inline asm mov.u32 %r2030, 0; // end inline asm // begin inline asm mov.u32 %r2029, 0; // end inline asm // begin inline asm mov.u32 %r2028, 0; // end inline asm add.s32 %r1058, %r1, 31; shr.s32 %r1059, %r1058, 31; shr.u32 %r1060, %r1059, 27; add.s32 %r1061, %r1058, %r1060; and.b32 %r168, %r1061, -32; setp.lt.s32 %p28, %r1, 1; @%p28 bra $L__BB0_62; ld.param.u8 %rs1, [%rd1+160]; add.s32 %r1065, %r751, %r11; add.s32 %r269, %r1065, 24576; add.s32 %r270, %r22, %r10; add.s32 %r271, %r270, 2048; add.s32 %r272, %r270, 4096; add.s32 %r273, %r270, 6144; cvt.s64.s32 %rd14, %r5; cvt.s64.s32 %rd15, %r1945; add.s32 %r274, %r1065, 26624; add.s32 %r275, %r1065, 28672; add.s32 %r276, %r1065, 30720; add.s32 %r1066, %r1945, 1; cvt.s64.s32 %rd16, %r1066; add.s32 %r1067, %r1945, 8; cvt.s64.s32 %rd17, %r1067; add.s32 %r1068, %r1945, 9; cvt.s64.s32 %rd18, %r1068; add.s32 %r1069, %r1945, 16; cvt.s64.s32 %rd19, %r1069; add.s32 %r1070, %r1945, 17; cvt.s64.s32 %rd20, %r1070; add.s32 %r1071, %r1945, 24; cvt.s64.s32 %rd21, %r1071; add.s32 %r1072, %r1945, 25; cvt.s64.s32 %rd22, %r1072; add.s32 %r277, %r5, 8; mov.u32 %r1944, 0; mov.f32 %f742, 0fFF800000; mov.f32 %f740, 0f00000000; mov.u32 %r1946, %r1; mov.f32 %f741, %f740; mov.f32 %f743, %f742; mov.u32 %r1947, %r1; $L__BB0_11: add.s32 %r1073, %r1944, 32; setp.ge.s32 %p29, %r1073, %r168; @%p29 bra $L__BB0_18; bar.sync 0; shl.b64 %rd72, %rd6, 5; add.s64 %rd190, %rd190, %rd72; add.s32 %r1947, %r1947, -32; @%p6 bra $L__BB0_14; min.s32 %r1082, %r1947, 32; setp.lt.s32 %p31, %r17, %r1082; setp.lt.s32 %p32, %r16, %r1082; setp.lt.s32 %p33, %r15, %r1082; setp.lt.s32 %p34, %r7, %r1082; selp.b32 %r1075, 16, 0, %p34; // begin inline asm cp.async.cg.shared.global [%r270], [%rd190], 16, %r1075; // end inline asm selp.b32 %r1077, 16, 0, %p33; 
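// (Surrounding region, $L__BB0_11..$L__BB0_17: inside the main tile loop, the next
// 32-row K tile (via %rd190) and V tile (via %rd189) are prefetched into shared
// memory with cp.async.cg; selp zeroes the 16-byte copy size for rows past the
// remaining sequence length, and the st.shared.v4 stores afterwards appear to
// zero-fill the shared-memory columns that the 160-byte (80 x fp16) rows leave
// uncovered.)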
shl.b64 %rd77, %rd6, 3; add.s64 %rd74, %rd190, %rd77; // begin inline asm cp.async.cg.shared.global [%r271], [%rd74], 16, %r1077; // end inline asm selp.b32 %r1079, 16, 0, %p32; add.s64 %rd75, %rd74, %rd77; // begin inline asm cp.async.cg.shared.global [%r272], [%rd75], 16, %r1079; // end inline asm selp.b32 %r1081, 16, 0, %p31; add.s64 %rd76, %rd75, %rd77; // begin inline asm cp.async.cg.shared.global [%r273], [%rd76], 16, %r1081; // end inline asm $L__BB0_14: add.s64 %rd189, %rd189, %rd72; add.s32 %r1946, %r1946, -32; @%p6 bra $L__BB0_16; min.s32 %r1094, %r1946, 32; setp.lt.s32 %p36, %r17, %r1094; setp.lt.s32 %p37, %r16, %r1094; setp.lt.s32 %p38, %r15, %r1094; setp.lt.s32 %p39, %r7, %r1094; selp.b32 %r1087, 16, 0, %p39; // begin inline asm cp.async.cg.shared.global [%r269], [%rd189], 16, %r1087; // end inline asm selp.b32 %r1089, 16, 0, %p38; shl.b64 %rd83, %rd6, 3; add.s64 %rd80, %rd189, %rd83; // begin inline asm cp.async.cg.shared.global [%r274], [%rd80], 16, %r1089; // end inline asm selp.b32 %r1091, 16, 0, %p37; add.s64 %rd81, %rd80, %rd83; // begin inline asm cp.async.cg.shared.global [%r275], [%rd81], 16, %r1091; // end inline asm selp.b32 %r1093, 16, 0, %p36; add.s64 %rd82, %rd81, %rd83; // begin inline asm cp.async.cg.shared.global [%r276], [%rd82], 16, %r1093; // end inline asm $L__BB0_16: // begin inline asm cp.async.commit_group; // end inline asm @%p27 bra $L__BB0_18; mov.u32 %r1137, 0; // begin inline asm st.shared.v4.b32 [%r270], {%r1137, %r1137, %r1137, %r1137}; // end inline asm // begin inline asm st.shared.v4.b32 [%r271], {%r1137, %r1137, %r1137, %r1137}; // end inline asm // begin inline asm st.shared.v4.b32 [%r272], {%r1137, %r1137, %r1137, %r1137}; // end inline asm // begin inline asm st.shared.v4.b32 [%r273], {%r1137, %r1137, %r1137, %r1137}; // end inline asm // begin inline asm st.shared.v4.b32 [%r269], {%r1137, %r1137, %r1137, %r1137}; // end inline asm // begin inline asm st.shared.v4.b32 [%r274], {%r1137, %r1137, %r1137, %r1137}; // end inline asm // begin inline asm st.shared.v4.b32 [%r275], {%r1137, %r1137, %r1137, %r1137}; // end inline asm // begin inline asm st.shared.v4.b32 [%r276], {%r1137, %r1137, %r1137, %r1137}; // end inline asm $L__BB0_18: setp.eq.s16 %p41, %rs1, 0; @%p41 bra $L__BB0_51; mov.u32 %r1821, %ctaid.x; mov.u32 %r1820, %ctaid.y; mov.u32 %r1819, %ctaid.z; shl.b32 %r1818, %r1819, 6; ld.param.u32 %r1817, [fmha_v2_flash_attention_fp16_64_32_S_80_sm86_kernel_nl_param_0+52]; cvt.s64.s32 %rd84, %r1818; add.s64 %rd85, %rd14, %rd84; setp.ge.u64 %p42, %rd85, %rd2; mul.lo.s32 %r1140, %r1, %r1818; cvt.s64.s32 %rd86, %r1140; cvt.u64.u32 %rd29, %r1944; add.s64 %rd30, %rd86, %rd29; mul.lo.s64 %rd87, %rd2, %rd14; add.s64 %rd88, %rd30, %rd87; add.s64 %rd89, %rd88, %rd15; add.s64 %rd31, %rd15, %rd29; setp.ge.u64 %p43, %rd31, %rd2; shl.b64 %rd90, %rd89, 1; mad.lo.s32 %r1143, %r1817, %r1820, %r1821; cvt.s64.s32 %rd91, %r1143; mul.lo.s64 %rd92, %rd4, %rd91; add.s64 %rd93, %rd92, %rd90; cvta.to.global.u64 %rd94, %rd3; add.s64 %rd32, %rd94, %rd93; mov.u16 %rs132, 0; or.pred %p44, %p43, %p42; mov.u16 %rs131, %rs132; @%p44 bra $L__BB0_21; ld.global.u16 %rs131, [%rd32]; $L__BB0_21: add.s64 %rd33, %rd16, %rd29; setp.ge.u64 %p46, %rd33, %rd2; or.pred %p47, %p46, %p42; @%p47 bra $L__BB0_23; ld.global.u16 %rs132, [%rd32+2]; $L__BB0_23: add.s64 %rd34, %rd17, %rd29; setp.ge.u64 %p49, %rd34, %rd2; mov.u16 %rs134, 0; or.pred %p50, %p49, %p42; mov.u16 %rs133, %rs134; @%p50 bra $L__BB0_25; ld.global.u16 %rs133, [%rd32+16]; $L__BB0_25: add.s64 %rd35, %rd18, %rd29; 
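// (Surrounding region, $L__BB0_19..$L__BB0_51: optional additive attention-mask
// path, taken when the byte flag at param+160 is non-zero. Sixteen fp16 values,
// %rs131..%rs146, are fetched with individually predicated ld.global.u16, one per
// element of the two 16x8 score fragments; elements whose row or column lies
// outside the s x s mask keep the value 0.)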
setp.ge.u64 %p52, %rd35, %rd2; or.pred %p53, %p52, %p42; @%p53 bra $L__BB0_27; ld.global.u16 %rs134, [%rd32+18]; $L__BB0_27: add.s64 %rd36, %rd19, %rd29; setp.ge.u64 %p55, %rd36, %rd2; mov.u16 %rs136, 0; or.pred %p56, %p55, %p42; mov.u16 %rs135, %rs136; @%p56 bra $L__BB0_29; ld.global.u16 %rs135, [%rd32+32]; $L__BB0_29: add.s64 %rd37, %rd20, %rd29; setp.ge.u64 %p58, %rd37, %rd2; or.pred %p59, %p58, %p42; @%p59 bra $L__BB0_31; ld.global.u16 %rs136, [%rd32+34]; $L__BB0_31: add.s64 %rd38, %rd21, %rd29; setp.ge.u64 %p61, %rd38, %rd2; mov.u16 %rs138, 0; or.pred %p62, %p61, %p42; mov.u16 %rs137, %rs138; @%p62 bra $L__BB0_33; ld.global.u16 %rs137, [%rd32+48]; $L__BB0_33: add.s64 %rd39, %rd22, %rd29; setp.ge.u64 %p64, %rd39, %rd2; or.pred %p65, %p64, %p42; @%p65 bra $L__BB0_35; ld.global.u16 %rs138, [%rd32+50]; $L__BB0_35: cvt.s64.s32 %rd110, %r277; add.s64 %rd111, %rd110, %rd84; setp.ge.u64 %p66, %rd111, %rd2; mul.lo.s64 %rd112, %rd2, %rd110; add.s64 %rd113, %rd30, %rd112; add.s64 %rd114, %rd113, %rd15; shl.b64 %rd115, %rd114, 1; add.s64 %rd118, %rd92, %rd115; add.s64 %rd40, %rd94, %rd118; mov.u16 %rs140, 0; or.pred %p68, %p43, %p66; mov.u16 %rs139, %rs140; @%p68 bra $L__BB0_37; ld.global.u16 %rs139, [%rd40]; $L__BB0_37: or.pred %p71, %p46, %p66; @%p71 bra $L__BB0_39; ld.global.u16 %rs140, [%rd40+2]; $L__BB0_39: mov.u16 %rs142, 0; or.pred %p74, %p49, %p66; mov.u16 %rs141, %rs142; @%p74 bra $L__BB0_41; ld.global.u16 %rs141, [%rd40+16]; $L__BB0_41: or.pred %p77, %p52, %p66; @%p77 bra $L__BB0_43; ld.global.u16 %rs142, [%rd40+18]; $L__BB0_43: mov.u16 %rs144, 0; or.pred %p80, %p55, %p66; mov.u16 %rs143, %rs144; @%p80 bra $L__BB0_45; ld.global.u16 %rs143, [%rd40+32]; $L__BB0_45: or.pred %p83, %p58, %p66; @%p83 bra $L__BB0_47; ld.global.u16 %rs144, [%rd40+34]; $L__BB0_47: mov.u16 %rs146, 0; or.pred %p86, %p61, %p66; mov.u16 %rs145, %rs146; @%p86 bra $L__BB0_49; ld.global.u16 %rs145, [%rd40+48]; $L__BB0_49: or.pred %p89, %p64, %p66; @%p89 bra $L__BB0_51; ld.global.u16 %rs146, [%rd40+50]; $L__BB0_51: // begin inline asm mov.u32 %r1177, 0; // end inline asm // begin inline asm mov.u32 %r1178, 0; // end inline asm // begin inline asm mov.u32 %r1179, 0; // end inline asm // begin inline asm mov.u32 %r1180, 0; // end inline asm // begin inline asm mov.u32 %r1181, 0; // end inline asm // begin inline asm mov.u32 %r1182, 0; // end inline asm // begin inline asm mov.u32 %r1183, 0; // end inline asm // begin inline asm mov.u32 %r1184, 0; // end inline asm // begin inline asm mov.u32 %r1185, 0; // end inline asm // begin inline asm mov.u32 %r1186, 0; // end inline asm // begin inline asm mov.u32 %r1187, 0; // end inline asm // begin inline asm mov.u32 %r1188, 0; // end inline asm // begin inline asm mov.u32 %r1189, 0; // end inline asm // begin inline asm mov.u32 %r1190, 0; // end inline asm // begin inline asm mov.u32 %r1191, 0; // end inline asm // begin inline asm mov.u32 %r1192, 0; // end inline asm mov.b32 %f156, %r1177; mov.b32 %f157, %r1178; mov.b32 %f158, %r1179; mov.b32 %f159, %r1180; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f156, %f157, %f158, %f159}, {%r876, %r877, %r878, %r879}, {%r1987, %r1986}, {%f156, %f157, %f158, %f159}; // end inline asm mov.b32 %f164, %r1181; mov.b32 %f165, %r1182; mov.b32 %f166, %r1183; mov.b32 %f167, %r1184; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f164, %f165, %f166, %f167}, {%r876, %r877, %r878, %r879}, {%r1985, %r1984}, {%f164, %f165, %f166, %f167}; // end inline asm mov.b32 %f172, %r1185; mov.b32 %f173, %r1186; 
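// (Surrounding region, from $L__BB0_51: S = Q * K^T. Five K-dimension slices
// (head size 80 = 5 x 16) times four N-slices of
// mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 multiply the Q fragments
// %r876..%r899 by the K fragments, accumulating one 16x32 fp32 score tile per
// warp on top of the zero-initialized accumulators above.)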
mov.b32 %f174, %r1187; mov.b32 %f175, %r1188; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f172, %f173, %f174, %f175}, {%r876, %r877, %r878, %r879}, {%r1983, %r1982}, {%f172, %f173, %f174, %f175}; // end inline asm mov.b32 %f180, %r1189; mov.b32 %f181, %r1190; mov.b32 %f182, %r1191; mov.b32 %f183, %r1192; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f180, %f181, %f182, %f183}, {%r876, %r877, %r878, %r879}, {%r1981, %r1980}, {%f180, %f181, %f182, %f183}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f156, %f157, %f158, %f159}, {%r881, %r882, %r883, %r884}, {%r1979, %r1978}, {%f156, %f157, %f158, %f159}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f164, %f165, %f166, %f167}, {%r881, %r882, %r883, %r884}, {%r1977, %r1976}, {%f164, %f165, %f166, %f167}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f172, %f173, %f174, %f175}, {%r881, %r882, %r883, %r884}, {%r1975, %r1974}, {%f172, %f173, %f174, %f175}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f180, %f181, %f182, %f183}, {%r881, %r882, %r883, %r884}, {%r1973, %r1972}, {%f180, %f181, %f182, %f183}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f156, %f157, %f158, %f159}, {%r886, %r887, %r888, %r889}, {%r1971, %r1970}, {%f156, %f157, %f158, %f159}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f164, %f165, %f166, %f167}, {%r886, %r887, %r888, %r889}, {%r1969, %r1968}, {%f164, %f165, %f166, %f167}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f172, %f173, %f174, %f175}, {%r886, %r887, %r888, %r889}, {%r1967, %r1966}, {%f172, %f173, %f174, %f175}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f180, %f181, %f182, %f183}, {%r886, %r887, %r888, %r889}, {%r1965, %r1964}, {%f180, %f181, %f182, %f183}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f156, %f157, %f158, %f159}, {%r891, %r892, %r893, %r894}, {%r1963, %r1962}, {%f156, %f157, %f158, %f159}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f164, %f165, %f166, %f167}, {%r891, %r892, %r893, %r894}, {%r1961, %r1960}, {%f164, %f165, %f166, %f167}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f172, %f173, %f174, %f175}, {%r891, %r892, %r893, %r894}, {%r1959, %r1958}, {%f172, %f173, %f174, %f175}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f180, %f181, %f182, %f183}, {%r891, %r892, %r893, %r894}, {%r1957, %r1956}, {%f180, %f181, %f182, %f183}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f156, %f157, %f158, %f159}, {%r896, %r897, %r898, %r899}, {%r1955, %r1954}, {%f156, %f157, %f158, %f159}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f164, %f165, %f166, %f167}, {%r896, %r897, %r898, %r899}, {%r1953, %r1952}, {%f164, %f165, %f166, %f167}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f172, %f173, %f174, %f175}, {%r896, %r897, %r898, %r899}, {%r1951, %r1950}, {%f172, %f173, %f174, %f175}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f180, %f181, 
%f182, %f183}, {%r896, %r897, %r898, %r899}, {%r1949, %r1948}, {%f180, %f181, %f182, %f183}; // end inline asm mul.ftz.f32 %f284, %f1, %f156; mul.ftz.f32 %f285, %f1, %f157; mul.ftz.f32 %f286, %f1, %f164; mul.ftz.f32 %f287, %f1, %f165; mul.ftz.f32 %f288, %f1, %f158; mul.ftz.f32 %f289, %f1, %f159; mul.ftz.f32 %f290, %f1, %f166; mul.ftz.f32 %f291, %f1, %f167; mul.ftz.f32 %f292, %f1, %f172; mul.ftz.f32 %f293, %f1, %f173; mul.ftz.f32 %f294, %f1, %f180; mul.ftz.f32 %f295, %f1, %f181; mul.ftz.f32 %f296, %f1, %f174; mul.ftz.f32 %f297, %f1, %f175; mul.ftz.f32 %f298, %f1, %f182; mul.ftz.f32 %f299, %f1, %f183; setp.lt.s32 %p90, %r1945, %r1; selp.f32 %f759, %f284, 0fFF800000, %p90; add.s32 %r1313, %r1945, 1; setp.lt.s32 %p91, %r1313, %r1; selp.f32 %f758, %f285, 0fFF800000, %p91; add.s32 %r1314, %r1945, 8; setp.lt.s32 %p92, %r1314, %r1; selp.f32 %f757, %f286, 0fFF800000, %p92; add.s32 %r1315, %r1945, 9; setp.lt.s32 %p93, %r1315, %r1; selp.f32 %f756, %f287, 0fFF800000, %p93; add.s32 %r1316, %r1945, 16; setp.lt.s32 %p94, %r1316, %r1; selp.f32 %f755, %f292, 0fFF800000, %p94; add.s32 %r1317, %r1945, 17; setp.lt.s32 %p95, %r1317, %r1; selp.f32 %f754, %f293, 0fFF800000, %p95; add.s32 %r1318, %r1945, 24; setp.lt.s32 %p96, %r1318, %r1; selp.f32 %f753, %f294, 0fFF800000, %p96; add.s32 %r1319, %r1945, 25; setp.lt.s32 %p97, %r1319, %r1; selp.f32 %f752, %f295, 0fFF800000, %p97; selp.f32 %f751, %f288, 0fFF800000, %p90; selp.f32 %f750, %f289, 0fFF800000, %p91; selp.f32 %f749, %f290, 0fFF800000, %p92; selp.f32 %f748, %f291, 0fFF800000, %p93; selp.f32 %f747, %f296, 0fFF800000, %p94; selp.f32 %f746, %f297, 0fFF800000, %p95; selp.f32 %f745, %f298, 0fFF800000, %p96; selp.f32 %f744, %f299, 0fFF800000, %p97; @%p41 bra $L__BB0_53; // begin inline asm cvt.f32.f16 %f300, %rs131; // end inline asm add.ftz.f32 %f759, %f300, %f759; // begin inline asm cvt.f32.f16 %f301, %rs132; // end inline asm add.ftz.f32 %f758, %f301, %f758; // begin inline asm cvt.f32.f16 %f302, %rs133; // end inline asm add.ftz.f32 %f757, %f302, %f757; // begin inline asm cvt.f32.f16 %f303, %rs134; // end inline asm add.ftz.f32 %f756, %f303, %f756; // begin inline asm cvt.f32.f16 %f304, %rs135; // end inline asm add.ftz.f32 %f755, %f304, %f755; // begin inline asm cvt.f32.f16 %f305, %rs136; // end inline asm add.ftz.f32 %f754, %f305, %f754; // begin inline asm cvt.f32.f16 %f306, %rs137; // end inline asm add.ftz.f32 %f753, %f306, %f753; // begin inline asm cvt.f32.f16 %f307, %rs138; // end inline asm add.ftz.f32 %f752, %f307, %f752; // begin inline asm cvt.f32.f16 %f308, %rs139; // end inline asm add.ftz.f32 %f751, %f308, %f751; // begin inline asm cvt.f32.f16 %f309, %rs140; // end inline asm add.ftz.f32 %f750, %f309, %f750; // begin inline asm cvt.f32.f16 %f310, %rs141; // end inline asm add.ftz.f32 %f749, %f310, %f749; // begin inline asm cvt.f32.f16 %f311, %rs142; // end inline asm add.ftz.f32 %f748, %f311, %f748; // begin inline asm cvt.f32.f16 %f312, %rs143; // end inline asm add.ftz.f32 %f747, %f312, %f747; // begin inline asm cvt.f32.f16 %f313, %rs144; // end inline asm add.ftz.f32 %f746, %f313, %f746; // begin inline asm cvt.f32.f16 %f314, %rs145; // end inline asm add.ftz.f32 %f745, %f314, %f745; // begin inline asm cvt.f32.f16 %f315, %rs146; // end inline asm add.ftz.f32 %f744, %f315, %f744; $L__BB0_53: add.s32 %r1774, %r1944, 32; setp.ge.s32 %p147, %r1774, %r168; setp.gt.ftz.f32 %p100, %f759, %f758; selp.f32 %f316, %f759, %f758, %p100; setp.gt.ftz.f32 %p101, %f316, %f757; selp.f32 %f317, %f316, %f757, %p101; setp.gt.ftz.f32 %p102, %f317, %f756; 
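// (Surrounding region: online-softmax row maximum. The scores were scaled by %f1,
// masked to -inf (0fFF800000) beyond the sequence length, and optionally offset by
// the fp16 mask values above; a selp tree now reduces them to two per-thread
// maxima, shfl.sync.bfly with offsets 1 and 2 combines those across the quad, and
// the running maxima %f743/%f742 from earlier tiles are folded in with max.ftz.)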
selp.f32 %f318, %f317, %f756, %p102; setp.gt.ftz.f32 %p103, %f318, %f755; selp.f32 %f319, %f318, %f755, %p103; setp.gt.ftz.f32 %p104, %f319, %f754; selp.f32 %f320, %f319, %f754, %p104; setp.gt.ftz.f32 %p105, %f320, %f753; selp.f32 %f321, %f320, %f753, %p105; setp.gt.ftz.f32 %p106, %f321, %f752; selp.f32 %f322, %f321, %f752, %p106; setp.gt.ftz.f32 %p107, %f751, %f750; selp.f32 %f323, %f751, %f750, %p107; setp.gt.ftz.f32 %p108, %f323, %f749; selp.f32 %f324, %f323, %f749, %p108; setp.gt.ftz.f32 %p109, %f324, %f748; selp.f32 %f325, %f324, %f748, %p109; setp.gt.ftz.f32 %p110, %f325, %f747; selp.f32 %f326, %f325, %f747, %p110; setp.gt.ftz.f32 %p111, %f326, %f746; selp.f32 %f327, %f326, %f746, %p111; setp.gt.ftz.f32 %p112, %f327, %f745; selp.f32 %f328, %f327, %f745, %p112; setp.gt.ftz.f32 %p113, %f328, %f744; selp.f32 %f329, %f328, %f744, %p113; mov.b32 %r1321, %f322; mov.u32 %r1322, 31; mov.u32 %r1323, 1; mov.u32 %r1324, -1; shfl.sync.bfly.b32 %r1325|%p114, %r1321, %r1323, %r1322, %r1324; mov.b32 %f330, %r1325; setp.gt.ftz.f32 %p115, %f322, %f330; selp.f32 %f331, %f322, %f330, %p115; mov.b32 %r1326, %f331; mov.u32 %r1327, 2; shfl.sync.bfly.b32 %r1328|%p116, %r1326, %r1327, %r1322, %r1324; mov.b32 %f332, %r1328; setp.gt.ftz.f32 %p117, %f331, %f332; selp.f32 %f333, %f331, %f332, %p117; mov.b32 %r1329, %f329; shfl.sync.bfly.b32 %r1330|%p118, %r1329, %r1323, %r1322, %r1324; mov.b32 %f334, %r1330; setp.gt.ftz.f32 %p119, %f329, %f334; selp.f32 %f335, %f329, %f334, %p119; mov.b32 %r1331, %f335; shfl.sync.bfly.b32 %r1332|%p120, %r1331, %r1327, %r1322, %r1324; mov.b32 %f336, %r1332; setp.gt.ftz.f32 %p121, %f335, %f336; selp.f32 %f337, %f335, %f336, %p121; max.ftz.f32 %f54, %f333, %f743; max.ftz.f32 %f55, %f337, %f742; sub.ftz.f32 %f338, %f759, %f54; mul.ftz.f32 %f339, %f338, 0f3FB8AA3B; ex2.approx.ftz.f32 %f56, %f339; sub.ftz.f32 %f340, %f758, %f54; mul.ftz.f32 %f341, %f340, 0f3FB8AA3B; ex2.approx.ftz.f32 %f57, %f341; sub.ftz.f32 %f342, %f757, %f54; mul.ftz.f32 %f343, %f342, 0f3FB8AA3B; ex2.approx.ftz.f32 %f58, %f343; sub.ftz.f32 %f344, %f756, %f54; mul.ftz.f32 %f345, %f344, 0f3FB8AA3B; ex2.approx.ftz.f32 %f59, %f345; sub.ftz.f32 %f346, %f755, %f54; mul.ftz.f32 %f347, %f346, 0f3FB8AA3B; ex2.approx.ftz.f32 %f60, %f347; sub.ftz.f32 %f348, %f754, %f54; mul.ftz.f32 %f349, %f348, 0f3FB8AA3B; ex2.approx.ftz.f32 %f61, %f349; sub.ftz.f32 %f350, %f753, %f54; mul.ftz.f32 %f351, %f350, 0f3FB8AA3B; ex2.approx.ftz.f32 %f62, %f351; sub.ftz.f32 %f352, %f752, %f54; mul.ftz.f32 %f353, %f352, 0f3FB8AA3B; ex2.approx.ftz.f32 %f63, %f353; sub.ftz.f32 %f354, %f751, %f55; mul.ftz.f32 %f355, %f354, 0f3FB8AA3B; ex2.approx.ftz.f32 %f64, %f355; sub.ftz.f32 %f356, %f750, %f55; mul.ftz.f32 %f357, %f356, 0f3FB8AA3B; ex2.approx.ftz.f32 %f65, %f357; sub.ftz.f32 %f358, %f749, %f55; mul.ftz.f32 %f359, %f358, 0f3FB8AA3B; ex2.approx.ftz.f32 %f66, %f359; sub.ftz.f32 %f360, %f748, %f55; mul.ftz.f32 %f361, %f360, 0f3FB8AA3B; ex2.approx.ftz.f32 %f67, %f361; sub.ftz.f32 %f362, %f747, %f55; mul.ftz.f32 %f363, %f362, 0f3FB8AA3B; ex2.approx.ftz.f32 %f68, %f363; sub.ftz.f32 %f364, %f746, %f55; mul.ftz.f32 %f365, %f364, 0f3FB8AA3B; ex2.approx.ftz.f32 %f69, %f365; sub.ftz.f32 %f366, %f745, %f55; mul.ftz.f32 %f367, %f366, 0f3FB8AA3B; ex2.approx.ftz.f32 %f70, %f367; sub.ftz.f32 %f368, %f744, %f55; mul.ftz.f32 %f369, %f368, 0f3FB8AA3B; ex2.approx.ftz.f32 %f71, %f369; add.ftz.f32 %f370, %f56, %f57; add.ftz.f32 %f371, %f370, 0f00000000; add.ftz.f32 %f372, %f58, %f59; add.ftz.f32 %f373, %f372, 0f00000000; add.ftz.f32 %f374, %f60, %f61; add.ftz.f32 %f375, 
%f371, %f374; add.ftz.f32 %f376, %f62, %f63; add.ftz.f32 %f377, %f373, %f376; add.ftz.f32 %f378, %f375, %f377; add.ftz.f32 %f379, %f64, %f65; add.ftz.f32 %f380, %f379, 0f00000000; add.ftz.f32 %f381, %f66, %f67; add.ftz.f32 %f382, %f381, 0f00000000; add.ftz.f32 %f383, %f68, %f69; add.ftz.f32 %f384, %f380, %f383; add.ftz.f32 %f385, %f70, %f71; add.ftz.f32 %f386, %f382, %f385; add.ftz.f32 %f387, %f384, %f386; mov.b32 %r1333, %f378; shfl.sync.bfly.b32 %r1334|%p122, %r1333, %r1323, %r1322, %r1324; mov.b32 %f388, %r1334; add.ftz.f32 %f389, %f378, %f388; mov.b32 %r1335, %f389; shfl.sync.bfly.b32 %r1336|%p123, %r1335, %r1327, %r1322, %r1324; mov.b32 %f390, %r1336; add.ftz.f32 %f391, %f389, %f390; mov.b32 %r1337, %f387; shfl.sync.bfly.b32 %r1338|%p124, %r1337, %r1323, %r1322, %r1324; mov.b32 %f392, %r1338; add.ftz.f32 %f393, %f387, %f392; mov.b32 %r1339, %f393; shfl.sync.bfly.b32 %r1340|%p125, %r1339, %r1327, %r1322, %r1324; mov.b32 %f394, %r1340; add.ftz.f32 %f395, %f393, %f394; sub.ftz.f32 %f396, %f743, %f54; mul.ftz.f32 %f397, %f396, 0f3FB8AA3B; ex2.approx.ftz.f32 %f398, %f397; mul.ftz.f32 %f72, %f398, %f741; add.ftz.f32 %f741, %f72, %f391; sub.ftz.f32 %f399, %f742, %f55; mul.ftz.f32 %f400, %f399, 0f3FB8AA3B; ex2.approx.ftz.f32 %f401, %f400; mul.ftz.f32 %f74, %f401, %f740; add.ftz.f32 %f740, %f74, %f395; @%p147 bra $L__BB0_55; // begin inline asm cp.async.wait_group 0; // end inline asm bar.sync 0; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1987, %r1986, %r1985, %r1984}, [%r905]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1983, %r1982, %r1981, %r1980}, [%r910]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1979, %r1978, %r1977, %r1976}, [%r915]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1975, %r1974, %r1973, %r1972}, [%r920]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1971, %r1970, %r1969, %r1968}, [%r925]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1967, %r1966, %r1965, %r1964}, [%r930]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1963, %r1962, %r1961, %r1960}, [%r935]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1959, %r1958, %r1957, %r1956}, [%r940]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1955, %r1954, %r1953, %r1952}, [%r945]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1951, %r1950, %r1949, %r1948}, [%r950]; // end inline asm $L__BB0_55: // begin inline asm cvt.rn.f16x2.f32 %r1406, %f57, %f56; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1407, %f65, %f64; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1408, %f59, %f58; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1409, %f67, %f66; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1410, %f61, %f60; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1411, %f69, %f68; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1412, %f63, %f62; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1413, %f71, %f70; // end inline asm // begin inline asm mov.u32 %r1414, 0; // end inline asm // begin inline asm mov.u32 %r1415, 0; // end inline asm // begin inline asm mov.u32 %r1416, 0; // end inline asm // begin inline asm mov.u32 %r1417, 0; // end inline asm // begin inline asm mov.u32 %r1418, 0; // end inline asm // begin inline asm 
mov.u32 %r1419, 0; // end inline asm // begin inline asm mov.u32 %r1420, 0; // end inline asm // begin inline asm mov.u32 %r1421, 0; // end inline asm // begin inline asm mov.u32 %r1422, 0; // end inline asm // begin inline asm mov.u32 %r1423, 0; // end inline asm // begin inline asm mov.u32 %r1424, 0; // end inline asm // begin inline asm mov.u32 %r1425, 0; // end inline asm // begin inline asm mov.u32 %r1426, 0; // end inline asm // begin inline asm mov.u32 %r1427, 0; // end inline asm // begin inline asm mov.u32 %r1428, 0; // end inline asm // begin inline asm mov.u32 %r1429, 0; // end inline asm // begin inline asm mov.u32 %r1430, 0; // end inline asm // begin inline asm mov.u32 %r1431, 0; // end inline asm // begin inline asm mov.u32 %r1432, 0; // end inline asm // begin inline asm mov.u32 %r1433, 0; // end inline asm // begin inline asm mov.u32 %r1434, 0; // end inline asm // begin inline asm mov.u32 %r1435, 0; // end inline asm // begin inline asm mov.u32 %r1436, 0; // end inline asm // begin inline asm mov.u32 %r1437, 0; // end inline asm // begin inline asm mov.u32 %r1438, 0; // end inline asm // begin inline asm mov.u32 %r1439, 0; // end inline asm // begin inline asm mov.u32 %r1440, 0; // end inline asm // begin inline asm mov.u32 %r1441, 0; // end inline asm // begin inline asm mov.u32 %r1442, 0; // end inline asm // begin inline asm mov.u32 %r1443, 0; // end inline asm // begin inline asm mov.u32 %r1444, 0; // end inline asm // begin inline asm mov.u32 %r1445, 0; // end inline asm // begin inline asm mov.u32 %r1446, 0; // end inline asm // begin inline asm mov.u32 %r1447, 0; // end inline asm // begin inline asm mov.u32 %r1448, 0; // end inline asm // begin inline asm mov.u32 %r1449, 0; // end inline asm // begin inline asm mov.u32 %r1450, 0; // end inline asm // begin inline asm mov.u32 %r1451, 0; // end inline asm // begin inline asm mov.u32 %r1452, 0; // end inline asm // begin inline asm mov.u32 %r1453, 0; // end inline asm mov.b32 %f498, %r1414; mov.b32 %f499, %r1415; mov.b32 %f500, %r1416; mov.b32 %f501, %r1417; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f498, %f499, %f500, %f501}, {%r1406, %r1407, %r1408, %r1409}, {%r2027, %r2026}, {%f498, %f499, %f500, %f501}; // end inline asm mov.b32 %f506, %r1418; mov.b32 %f507, %r1419; mov.b32 %f508, %r1420; mov.b32 %f509, %r1421; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f506, %f507, %f508, %f509}, {%r1406, %r1407, %r1408, %r1409}, {%r2025, %r2024}, {%f506, %f507, %f508, %f509}; // end inline asm mov.b32 %f514, %r1422; mov.b32 %f515, %r1423; mov.b32 %f516, %r1424; mov.b32 %f517, %r1425; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f514, %f515, %f516, %f517}, {%r1406, %r1407, %r1408, %r1409}, {%r2023, %r2022}, {%f514, %f515, %f516, %f517}; // end inline asm mov.b32 %f522, %r1426; mov.b32 %f523, %r1427; mov.b32 %f524, %r1428; mov.b32 %f525, %r1429; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f522, %f523, %f524, %f525}, {%r1406, %r1407, %r1408, %r1409}, {%r2021, %r2020}, {%f522, %f523, %f524, %f525}; // end inline asm mov.b32 %f530, %r1430; mov.b32 %f531, %r1431; mov.b32 %f532, %r1432; mov.b32 %f533, %r1433; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f530, %f531, %f532, %f533}, {%r1406, %r1407, %r1408, %r1409}, {%r2019, %r2018}, {%f530, %f531, %f532, %f533}; // end inline asm mov.b32 %f538, %r1434; mov.b32 %f539, %r1435; mov.b32 %f540, %r1436; mov.b32 %f541, %r1437; // begin inline asm 
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f538, %f539, %f540, %f541}, {%r1406, %r1407, %r1408, %r1409}, {%r2017, %r2016}, {%f538, %f539, %f540, %f541}; // end inline asm mov.b32 %f546, %r1438; mov.b32 %f547, %r1439; mov.b32 %f548, %r1440; mov.b32 %f549, %r1441; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f546, %f547, %f548, %f549}, {%r1406, %r1407, %r1408, %r1409}, {%r2015, %r2014}, {%f546, %f547, %f548, %f549}; // end inline asm mov.b32 %f554, %r1442; mov.b32 %f555, %r1443; mov.b32 %f556, %r1444; mov.b32 %f557, %r1445; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f554, %f555, %f556, %f557}, {%r1406, %r1407, %r1408, %r1409}, {%r2013, %r2012}, {%f554, %f555, %f556, %f557}; // end inline asm mov.b32 %f562, %r1446; mov.b32 %f563, %r1447; mov.b32 %f564, %r1448; mov.b32 %f565, %r1449; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f562, %f563, %f564, %f565}, {%r1406, %r1407, %r1408, %r1409}, {%r2011, %r2010}, {%f562, %f563, %f564, %f565}; // end inline asm mov.b32 %f570, %r1450; mov.b32 %f571, %r1451; mov.b32 %f572, %r1452; mov.b32 %f573, %r1453; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f570, %f571, %f572, %f573}, {%r1406, %r1407, %r1408, %r1409}, {%r2009, %r2008}, {%f570, %f571, %f572, %f573}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f498, %f499, %f500, %f501}, {%r1410, %r1411, %r1412, %r1413}, {%r2007, %r2006}, {%f498, %f499, %f500, %f501}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f506, %f507, %f508, %f509}, {%r1410, %r1411, %r1412, %r1413}, {%r2005, %r2004}, {%f506, %f507, %f508, %f509}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f514, %f515, %f516, %f517}, {%r1410, %r1411, %r1412, %r1413}, {%r2003, %r2002}, {%f514, %f515, %f516, %f517}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f522, %f523, %f524, %f525}, {%r1410, %r1411, %r1412, %r1413}, {%r2001, %r2000}, {%f522, %f523, %f524, %f525}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f530, %f531, %f532, %f533}, {%r1410, %r1411, %r1412, %r1413}, {%r1999, %r1998}, {%f530, %f531, %f532, %f533}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f538, %f539, %f540, %f541}, {%r1410, %r1411, %r1412, %r1413}, {%r1997, %r1996}, {%f538, %f539, %f540, %f541}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f546, %f547, %f548, %f549}, {%r1410, %r1411, %r1412, %r1413}, {%r1995, %r1994}, {%f546, %f547, %f548, %f549}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f554, %f555, %f556, %f557}, {%r1410, %r1411, %r1412, %r1413}, {%r1993, %r1992}, {%f554, %f555, %f556, %f557}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f562, %f563, %f564, %f565}, {%r1410, %r1411, %r1412, %r1413}, {%r1991, %r1990}, {%f562, %f563, %f564, %f565}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f570, %f571, %f572, %f573}, {%r1410, %r1411, %r1412, %r1413}, {%r1989, %r1988}, {%f570, %f571, %f572, %f573}; // end inline asm setp.equ.ftz.f32 %p126, %f741, 0f00000000; mov.f32 %f761, 0f3F800000; mov.f32 %f760, %f761; @%p126 bra $L__BB0_57; rcp.approx.ftz.f32 %f760, %f741; $L__BB0_57: setp.equ.ftz.f32 %p127, %f740, 0f00000000; 
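// Reciprocal row sums for normalization: rcp.approx.ftz.f32 of the running sums
// %f741/%f740, with branches that substitute 1.0 when a row's sum is exactly zero
// (a fully masked row), avoiding a division by zero.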
@%p127 bra $L__BB0_59; rcp.approx.ftz.f32 %f761, %f740; $L__BB0_59: add.s32 %r1775, %r1944, 32; setp.ge.s32 %p148, %r1775, %r168; mov.b32 %f580, %r2067; fma.rn.ftz.f32 %f581, %f72, %f580, %f498; mul.ftz.f32 %f582, %f760, %f581; mov.b32 %r2067, %f582; mov.b32 %f583, %r2066; fma.rn.ftz.f32 %f584, %f72, %f583, %f499; mul.ftz.f32 %f585, %f760, %f584; mov.b32 %r2066, %f585; mov.b32 %f586, %r2065; fma.rn.ftz.f32 %f587, %f74, %f586, %f500; mul.ftz.f32 %f588, %f761, %f587; mov.b32 %r2065, %f588; mov.b32 %f589, %r2064; fma.rn.ftz.f32 %f590, %f74, %f589, %f501; mul.ftz.f32 %f591, %f761, %f590; mov.b32 %r2064, %f591; mov.b32 %f592, %r2063; fma.rn.ftz.f32 %f593, %f72, %f592, %f506; mul.ftz.f32 %f594, %f760, %f593; mov.b32 %r2063, %f594; mov.b32 %f595, %r2062; fma.rn.ftz.f32 %f596, %f72, %f595, %f507; mul.ftz.f32 %f597, %f760, %f596; mov.b32 %r2062, %f597; mov.b32 %f598, %r2061; fma.rn.ftz.f32 %f599, %f74, %f598, %f508; mul.ftz.f32 %f600, %f761, %f599; mov.b32 %r2061, %f600; mov.b32 %f601, %r2060; fma.rn.ftz.f32 %f602, %f74, %f601, %f509; mul.ftz.f32 %f603, %f761, %f602; mov.b32 %r2060, %f603; mov.b32 %f604, %r2059; fma.rn.ftz.f32 %f605, %f72, %f604, %f514; mul.ftz.f32 %f606, %f760, %f605; mov.b32 %r2059, %f606; mov.b32 %f607, %r2058; fma.rn.ftz.f32 %f608, %f72, %f607, %f515; mul.ftz.f32 %f609, %f760, %f608; mov.b32 %r2058, %f609; mov.b32 %f610, %r2057; fma.rn.ftz.f32 %f611, %f74, %f610, %f516; mul.ftz.f32 %f612, %f761, %f611; mov.b32 %r2057, %f612; mov.b32 %f613, %r2056; fma.rn.ftz.f32 %f614, %f74, %f613, %f517; mul.ftz.f32 %f615, %f761, %f614; mov.b32 %r2056, %f615; mov.b32 %f616, %r2055; fma.rn.ftz.f32 %f617, %f72, %f616, %f522; mul.ftz.f32 %f618, %f760, %f617; mov.b32 %r2055, %f618; mov.b32 %f619, %r2054; fma.rn.ftz.f32 %f620, %f72, %f619, %f523; mul.ftz.f32 %f621, %f760, %f620; mov.b32 %r2054, %f621; mov.b32 %f622, %r2053; fma.rn.ftz.f32 %f623, %f74, %f622, %f524; mul.ftz.f32 %f624, %f761, %f623; mov.b32 %r2053, %f624; mov.b32 %f625, %r2052; fma.rn.ftz.f32 %f626, %f74, %f625, %f525; mul.ftz.f32 %f627, %f761, %f626; mov.b32 %r2052, %f627; mov.b32 %f628, %r2051; fma.rn.ftz.f32 %f629, %f72, %f628, %f530; mul.ftz.f32 %f630, %f760, %f629; mov.b32 %r2051, %f630; mov.b32 %f631, %r2050; fma.rn.ftz.f32 %f632, %f72, %f631, %f531; mul.ftz.f32 %f633, %f760, %f632; mov.b32 %r2050, %f633; mov.b32 %f634, %r2049; fma.rn.ftz.f32 %f635, %f74, %f634, %f532; mul.ftz.f32 %f636, %f761, %f635; mov.b32 %r2049, %f636; mov.b32 %f637, %r2048; fma.rn.ftz.f32 %f638, %f74, %f637, %f533; mul.ftz.f32 %f639, %f761, %f638; mov.b32 %r2048, %f639; mov.b32 %f640, %r2047; fma.rn.ftz.f32 %f641, %f72, %f640, %f538; mul.ftz.f32 %f642, %f760, %f641; mov.b32 %r2047, %f642; mov.b32 %f643, %r2046; fma.rn.ftz.f32 %f644, %f72, %f643, %f539; mul.ftz.f32 %f645, %f760, %f644; mov.b32 %r2046, %f645; mov.b32 %f646, %r2045; fma.rn.ftz.f32 %f647, %f74, %f646, %f540; mul.ftz.f32 %f648, %f761, %f647; mov.b32 %r2045, %f648; mov.b32 %f649, %r2044; fma.rn.ftz.f32 %f650, %f74, %f649, %f541; mul.ftz.f32 %f651, %f761, %f650; mov.b32 %r2044, %f651; mov.b32 %f652, %r2043; fma.rn.ftz.f32 %f653, %f72, %f652, %f546; mul.ftz.f32 %f654, %f760, %f653; mov.b32 %r2043, %f654; mov.b32 %f655, %r2042; fma.rn.ftz.f32 %f656, %f72, %f655, %f547; mul.ftz.f32 %f657, %f760, %f656; mov.b32 %r2042, %f657; mov.b32 %f658, %r2041; fma.rn.ftz.f32 %f659, %f74, %f658, %f548; mul.ftz.f32 %f660, %f761, %f659; mov.b32 %r2041, %f660; mov.b32 %f661, %r2040; fma.rn.ftz.f32 %f662, %f74, %f661, %f549; mul.ftz.f32 %f663, %f761, %f662; mov.b32 %r2040, %f663; mov.b32 %f664, %r2039; 
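// (Surrounding region, $L__BB0_59: flash-attention output update. Each O fragment
// is recomputed as correction * O_prev + (P*V) via fma, then scaled by a reciprocal
// sum %f760/%f761; the corrections %f72/%f74 equal exp2((m_prev - m_new) * log2(e))
// times the previous row sum, which un-normalizes the previous output before the
// new tile's contribution is added and the row is renormalized.)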
fma.rn.ftz.f32 %f665, %f72, %f664, %f554; mul.ftz.f32 %f666, %f760, %f665; mov.b32 %r2039, %f666; mov.b32 %f667, %r2038; fma.rn.ftz.f32 %f668, %f72, %f667, %f555; mul.ftz.f32 %f669, %f760, %f668; mov.b32 %r2038, %f669; mov.b32 %f670, %r2037; fma.rn.ftz.f32 %f671, %f74, %f670, %f556; mul.ftz.f32 %f672, %f761, %f671; mov.b32 %r2037, %f672; mov.b32 %f673, %r2036; fma.rn.ftz.f32 %f674, %f74, %f673, %f557; mul.ftz.f32 %f675, %f761, %f674; mov.b32 %r2036, %f675; mov.b32 %f676, %r2035; fma.rn.ftz.f32 %f677, %f72, %f676, %f562; mul.ftz.f32 %f678, %f760, %f677; mov.b32 %r2035, %f678; mov.b32 %f679, %r2034; fma.rn.ftz.f32 %f680, %f72, %f679, %f563; mul.ftz.f32 %f681, %f760, %f680; mov.b32 %r2034, %f681; mov.b32 %f682, %r2033; fma.rn.ftz.f32 %f683, %f74, %f682, %f564; mul.ftz.f32 %f684, %f761, %f683; mov.b32 %r2033, %f684; mov.b32 %f685, %r2032; fma.rn.ftz.f32 %f686, %f74, %f685, %f565; mul.ftz.f32 %f687, %f761, %f686; mov.b32 %r2032, %f687; mov.b32 %f688, %r2031; fma.rn.ftz.f32 %f689, %f72, %f688, %f570; mul.ftz.f32 %f690, %f760, %f689; mov.b32 %r2031, %f690; mov.b32 %f691, %r2030; fma.rn.ftz.f32 %f692, %f72, %f691, %f571; mul.ftz.f32 %f693, %f760, %f692; mov.b32 %r2030, %f693; mov.b32 %f694, %r2029; fma.rn.ftz.f32 %f695, %f74, %f694, %f572; mul.ftz.f32 %f696, %f761, %f695; mov.b32 %r2029, %f696; mov.b32 %f697, %r2028; fma.rn.ftz.f32 %f698, %f74, %f697, %f573; mul.ftz.f32 %f699, %f761, %f698; mov.b32 %r2028, %f699; @%p148 bra $L__BB0_61; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2027, %r2026, %r2025, %r2024}, [%r955]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2023, %r2022, %r2021, %r2020}, [%r960]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2019, %r2018, %r2017, %r2016}, [%r965]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2015, %r2014, %r2013, %r2012}, [%r970]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2011, %r2010, %r2009, %r2008}, [%r975]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2007, %r2006, %r2005, %r2004}, [%r980]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2003, %r2002, %r2001, %r2000}, [%r985]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1999, %r1998, %r1997, %r1996}, [%r990]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1995, %r1994, %r1993, %r1992}, [%r995]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1991, %r1990, %r1989, %r1988}, [%r1000]; // end inline asm $L__BB0_61: add.s32 %r1944, %r1944, 32; setp.lt.s32 %p129, %r1944, %r168; add.s32 %r1945, %r1945, 32; mov.f32 %f742, %f55; mov.f32 %f743, %f54; @%p129 bra $L__BB0_11; $L__BB0_62: mov.u32 %r1805, %tid.x; mov.u32 %r1804, %tid.x; shr.s32 %r1803, %r1804, 31; shr.u32 %r1802, %r1803, 28; add.s32 %r1801, %r1804, %r1802; and.b32 %r1800, %r1801, -16; sub.s32 %r1799, %r1804, %r1800; shl.b32 %r1798, %r1799, 4; cvt.s64.s32 %rd184, %r1798; mov.b64 %rd183, fmha_v2_flash_attention_fp16_64_32_S_80_sm86_kernel_nl_param_0; mov.u64 %rd182, %rd183; ld.param.u32 %r1797, [%rd182+60]; mov.u32 %r1796, %ctaid.y; mov.u32 %r1795, %ctaid.x; ld.param.u32 %r1794, [fmha_v2_flash_attention_fp16_64_32_S_80_sm86_kernel_nl_param_0+52]; mul.lo.s32 %r1793, %r1, %r1796; mad.lo.s32 %r1792, %r1793, %r1794, %r1795; and.b32 %r1791, %r1804, 96; shr.u32 
%r1790, %r1791, 1; mov.u32 %r1789, _ZN25fused_multihead_attention5smem_E; shr.s32 %r1788, %r1801, 31; shr.u32 %r1787, %r1788, 29; add.s32 %r1786, %r7, %r1787; and.b32 %r1785, %r1786, 268435448; sub.s32 %r1784, %r7, %r1785; xor.b32 %r1783, %r1784, %r1799; shl.b32 %r1782, %r7, 8; shl.b32 %r1781, %r1783, 4; mov.u32 %r1780, _ZN25fused_multihead_attention5smem_E; add.s32 %r1779, %r1781, %r1782; add.s32 %r1778, %r1779, %r1780; add.s32 %r1777, %r1778, 16384; bar.sync 0; mov.b32 %f700, %r2066; mov.b32 %f701, %r2067; // begin inline asm cvt.rn.f16x2.f32 %r1640, %f700, %f701; // end inline asm mov.b32 %f702, %r2064; mov.b32 %f703, %r2065; // begin inline asm cvt.rn.f16x2.f32 %r1641, %f702, %f703; // end inline asm shl.b32 %r1741, %r1804, 2; and.b32 %r1742, %r1741, 124; add.s32 %r1744, %r1742, %r1780; and.b32 %r1747, %r1804, 28; shr.u32 %r1748, %r1747, 2; or.b32 %r1749, %r1790, %r1748; shl.b32 %r1750, %r1749, 8; add.s32 %r1751, %r1744, %r1750; add.s32 %r1642, %r1751, 16384; // begin inline asm st.shared.b32 [%r1642], %r1640; // end inline asm add.s32 %r1644, %r1751, 18432; // begin inline asm st.shared.b32 [%r1644], %r1641; // end inline asm xor.b32 %r1648, %r1642, 16; mov.b32 %f704, %r2062; mov.b32 %f705, %r2063; // begin inline asm cvt.rn.f16x2.f32 %r1646, %f704, %f705; // end inline asm mov.b32 %f706, %r2060; mov.b32 %f707, %r2061; // begin inline asm cvt.rn.f16x2.f32 %r1647, %f706, %f707; // end inline asm // begin inline asm st.shared.b32 [%r1648], %r1646; // end inline asm add.s32 %r1650, %r1648, 2048; // begin inline asm st.shared.b32 [%r1650], %r1647; // end inline asm xor.b32 %r1654, %r1642, 32; mov.b32 %f708, %r2058; mov.b32 %f709, %r2059; // begin inline asm cvt.rn.f16x2.f32 %r1652, %f708, %f709; // end inline asm mov.b32 %f710, %r2056; mov.b32 %f711, %r2057; // begin inline asm cvt.rn.f16x2.f32 %r1653, %f710, %f711; // end inline asm // begin inline asm st.shared.b32 [%r1654], %r1652; // end inline asm add.s32 %r1656, %r1654, 2048; // begin inline asm st.shared.b32 [%r1656], %r1653; // end inline asm xor.b32 %r1660, %r1642, 48; mov.b32 %f712, %r2054; mov.b32 %f713, %r2055; // begin inline asm cvt.rn.f16x2.f32 %r1658, %f712, %f713; // end inline asm mov.b32 %f714, %r2052; mov.b32 %f715, %r2053; // begin inline asm cvt.rn.f16x2.f32 %r1659, %f714, %f715; // end inline asm // begin inline asm st.shared.b32 [%r1660], %r1658; // end inline asm add.s32 %r1662, %r1660, 2048; // begin inline asm st.shared.b32 [%r1662], %r1659; // end inline asm xor.b32 %r1666, %r1642, 64; mov.b32 %f716, %r2050; mov.b32 %f717, %r2051; // begin inline asm cvt.rn.f16x2.f32 %r1664, %f716, %f717; // end inline asm mov.b32 %f718, %r2048; mov.b32 %f719, %r2049; // begin inline asm cvt.rn.f16x2.f32 %r1665, %f718, %f719; // end inline asm // begin inline asm st.shared.b32 [%r1666], %r1664; // end inline asm add.s32 %r1668, %r1666, 2048; // begin inline asm st.shared.b32 [%r1668], %r1665; // end inline asm xor.b32 %r1672, %r1642, 80; mov.b32 %f720, %r2046; mov.b32 %f721, %r2047; // begin inline asm cvt.rn.f16x2.f32 %r1670, %f720, %f721; // end inline asm mov.b32 %f722, %r2044; mov.b32 %f723, %r2045; // begin inline asm cvt.rn.f16x2.f32 %r1671, %f722, %f723; // end inline asm // begin inline asm st.shared.b32 [%r1672], %r1670; // end inline asm add.s32 %r1674, %r1672, 2048; // begin inline asm st.shared.b32 [%r1674], %r1671; // end inline asm xor.b32 %r1678, %r1642, 96; mov.b32 %f724, %r2042; mov.b32 %f725, %r2043; // begin inline asm cvt.rn.f16x2.f32 %r1676, %f724, %f725; // end inline asm mov.b32 %f726, %r2040; mov.b32 
%f727, %r2041; // begin inline asm cvt.rn.f16x2.f32 %r1677, %f726, %f727; // end inline asm // begin inline asm st.shared.b32 [%r1678], %r1676; // end inline asm add.s32 %r1680, %r1678, 2048; // begin inline asm st.shared.b32 [%r1680], %r1677; // end inline asm xor.b32 %r1684, %r1642, 112; mov.b32 %f728, %r2038; mov.b32 %f729, %r2039; // begin inline asm cvt.rn.f16x2.f32 %r1682, %f728, %f729; // end inline asm mov.b32 %f730, %r2036; mov.b32 %f731, %r2037; // begin inline asm cvt.rn.f16x2.f32 %r1683, %f730, %f731; // end inline asm // begin inline asm st.shared.b32 [%r1684], %r1682; // end inline asm add.s32 %r1686, %r1684, 2048; // begin inline asm st.shared.b32 [%r1686], %r1683; // end inline asm xor.b32 %r1690, %r1642, 128; mov.b32 %f732, %r2034; mov.b32 %f733, %r2035; // begin inline asm cvt.rn.f16x2.f32 %r1688, %f732, %f733; // end inline asm mov.b32 %f734, %r2032; mov.b32 %f735, %r2033; // begin inline asm cvt.rn.f16x2.f32 %r1689, %f734, %f735; // end inline asm // begin inline asm st.shared.b32 [%r1690], %r1688; // end inline asm add.s32 %r1692, %r1690, 2048; // begin inline asm st.shared.b32 [%r1692], %r1689; // end inline asm xor.b32 %r1696, %r1642, 144; mov.b32 %f736, %r2030; mov.b32 %f737, %r2031; // begin inline asm cvt.rn.f16x2.f32 %r1694, %f736, %f737; // end inline asm mov.b32 %f738, %r2028; mov.b32 %f739, %r2029; // begin inline asm cvt.rn.f16x2.f32 %r1695, %f738, %f739; // end inline asm // begin inline asm st.shared.b32 [%r1696], %r1694; // end inline asm add.s32 %r1698, %r1696, 2048; // begin inline asm st.shared.b32 [%r1698], %r1695; // end inline asm bar.sync 0; // begin inline asm ld.shared.v4.b32 {%r1700, %r1701, %r1702, %r1703}, [%r1777]; // end inline asm add.s32 %r1709, %r1777, 2048; // begin inline asm ld.shared.v4.b32 {%r1705, %r1706, %r1707, %r1708}, [%r1709]; // end inline asm add.s32 %r1714, %r1777, 4096; // begin inline asm ld.shared.v4.b32 {%r1710, %r1711, %r1712, %r1713}, [%r1714]; // end inline asm add.s32 %r1719, %r1777, 6144; // begin inline asm ld.shared.v4.b32 {%r1715, %r1716, %r1717, %r1718}, [%r1719]; // end inline asm add.s32 %r1724, %r1777, 8192; // begin inline asm ld.shared.v4.b32 {%r1720, %r1721, %r1722, %r1723}, [%r1724]; // end inline asm add.s32 %r1729, %r1777, 10240; // begin inline asm ld.shared.v4.b32 {%r1725, %r1726, %r1727, %r1728}, [%r1729]; // end inline asm add.s32 %r1734, %r1777, 12288; // begin inline asm ld.shared.v4.b32 {%r1730, %r1731, %r1732, %r1733}, [%r1734]; // end inline asm add.s32 %r1739, %r1777, 14336; // begin inline asm ld.shared.v4.b32 {%r1735, %r1736, %r1737, %r1738}, [%r1739]; // end inline asm mul.lo.s32 %r1756, %r1792, %r1797; shl.b32 %r1757, %r1756, 1; cvt.s64.s32 %rd141, %r1757; add.s64 %rd41, %rd141, %rd184; cvt.u32.u64 %r1759, %rd5; setp.ge.s32 %p130, %r1759, %r1; @%p130 bra $L__BB0_85; mov.b64 %rd186, fmha_v2_flash_attention_fp16_64_32_S_80_sm86_kernel_nl_param_0; mov.u64 %rd185, %rd186; ld.param.u32 %r1816, [%rd185+60]; mov.u32 %r1815, %tid.x; shr.s32 %r1814, %r1816, 31; shr.u32 %r1813, %r1814, 29; add.s32 %r1812, %r1816, %r1813; shr.s32 %r1811, %r1812, 3; shr.s32 %r1810, %r1815, 31; shr.u32 %r1809, %r1810, 28; add.s32 %r1808, %r1815, %r1809; and.b32 %r1807, %r1808, -16; sub.s32 %r1806, %r1815, %r1807; setp.ge.s32 %p131, %r1806, %r1811; @%p131 bra $L__BB0_65; mul.lo.s64 %rd143, %rd12, %rd5; add.s64 %rd144, %rd41, %rd143; cvta.to.global.u64 %rd145, %rd13; add.s64 %rd146, %rd145, %rd144; st.global.v4.u32 [%rd146], {%r1700, %r1701, %r1702, %r1703}; $L__BB0_65: add.s32 %r1761, %r1759, 8; setp.ge.s32 %p132, 
%r1761, %r1; @%p132 bra $L__BB0_85; @%p131 bra $L__BB0_68; add.s64 %rd147, %rd5, 8; mul.lo.s64 %rd148, %rd147, %rd12; add.s64 %rd149, %rd41, %rd148; cvta.to.global.u64 %rd150, %rd13; add.s64 %rd151, %rd150, %rd149; st.global.v4.u32 [%rd151], {%r1705, %r1706, %r1707, %r1708}; $L__BB0_68: add.s32 %r1763, %r1759, 16; setp.ge.s32 %p134, %r1763, %r1; @%p134 bra $L__BB0_85; @%p131 bra $L__BB0_71; add.s64 %rd152, %rd5, 16; mul.lo.s64 %rd153, %rd152, %rd12; add.s64 %rd154, %rd41, %rd153; cvta.to.global.u64 %rd155, %rd13; add.s64 %rd156, %rd155, %rd154; st.global.v4.u32 [%rd156], {%r1710, %r1711, %r1712, %r1713}; $L__BB0_71: add.s32 %r1765, %r1759, 24; setp.ge.s32 %p136, %r1765, %r1; @%p136 bra $L__BB0_85; @%p131 bra $L__BB0_74; add.s64 %rd157, %rd5, 24; mul.lo.s64 %rd158, %rd157, %rd12; add.s64 %rd159, %rd41, %rd158; cvta.to.global.u64 %rd160, %rd13; add.s64 %rd161, %rd160, %rd159; st.global.v4.u32 [%rd161], {%r1715, %r1716, %r1717, %r1718}; $L__BB0_74: add.s32 %r1767, %r1759, 32; setp.ge.s32 %p138, %r1767, %r1; @%p138 bra $L__BB0_85; @%p131 bra $L__BB0_77; add.s64 %rd162, %rd5, 32; mul.lo.s64 %rd163, %rd162, %rd12; add.s64 %rd164, %rd41, %rd163; cvta.to.global.u64 %rd165, %rd13; add.s64 %rd166, %rd165, %rd164; st.global.v4.u32 [%rd166], {%r1720, %r1721, %r1722, %r1723}; $L__BB0_77: add.s32 %r1769, %r1759, 40; setp.ge.s32 %p140, %r1769, %r1; @%p140 bra $L__BB0_85; @%p131 bra $L__BB0_80; add.s64 %rd167, %rd5, 40; mul.lo.s64 %rd168, %rd167, %rd12; add.s64 %rd169, %rd41, %rd168; cvta.to.global.u64 %rd170, %rd13; add.s64 %rd171, %rd170, %rd169; st.global.v4.u32 [%rd171], {%r1725, %r1726, %r1727, %r1728}; $L__BB0_80: add.s32 %r1771, %r1759, 48; setp.ge.s32 %p142, %r1771, %r1; @%p142 bra $L__BB0_85; @%p131 bra $L__BB0_83; add.s64 %rd172, %rd5, 48; mul.lo.s64 %rd173, %rd172, %rd12; add.s64 %rd174, %rd41, %rd173; cvta.to.global.u64 %rd175, %rd13; add.s64 %rd176, %rd175, %rd174; st.global.v4.u32 [%rd176], {%r1730, %r1731, %r1732, %r1733}; $L__BB0_83: add.s32 %r1773, %r1759, 56; setp.ge.s32 %p144, %r1773, %r1; or.pred %p146, %p144, %p131; @%p146 bra $L__BB0_85; add.s64 %rd177, %rd5, 56; mul.lo.s64 %rd178, %rd177, %rd12; add.s64 %rd179, %rd41, %rd178; cvta.to.global.u64 %rd180, %rd13; add.s64 %rd181, %rd180, %rd179; st.global.v4.u32 [%rd181], {%r1735, %r1736, %r1737, %r1738}; $L__BB0_85: ret; }