am .align 8 .b8 fmha_v2_flash_attention_fp16_128_16_S_80_sm86_kernel_nl_param_0[168] ) { .reg .pred %p<174>; .reg .b16 %rs<147>; .reg .f32 %f<1010>; .reg .b32 %r<2220>; .reg .b64 %rd<239>; mov.b64 %rd36, fmha_v2_flash_attention_fp16_128_16_S_80_sm86_kernel_nl_param_0; mov.u64 %rd1, %rd36; ld.param.u32 %r1, [fmha_v2_flash_attention_fp16_128_16_S_80_sm86_kernel_nl_param_0+56]; cvt.s64.s32 %rd2, %r1; ld.param.u32 %r2, [fmha_v2_flash_attention_fp16_128_16_S_80_sm86_kernel_nl_param_0+52]; mov.u32 %r719, %ctaid.z; shl.b32 %r3, %r719, 7; setp.le.s32 %p5, %r1, %r3; @%p5 bra $L__BB0_112; mov.u32 %r720, %tid.x; mov.u32 %r721, %ctaid.y; mov.u32 %r722, %ctaid.x; mul.lo.s32 %r723, %r1, %r721; mad.lo.s32 %r724, %r723, %r2, %r722; shr.s32 %r725, %r720, 31; shr.u32 %r726, %r725, 27; add.s32 %r727, %r720, %r726; and.b32 %r728, %r727, -32; sub.s32 %r729, %r720, %r728; shr.u32 %r730, %r725, 25; add.s32 %r731, %r720, %r730; shr.s32 %r732, %r731, 7; shl.b32 %r733, %r732, 4; shr.s32 %r734, %r729, 31; shr.u32 %r735, %r734, 30; add.s32 %r736, %r729, %r735; and.b32 %r737, %r736, 2147483644; sub.s32 %r738, %r729, %r737; shl.b32 %r739, %r738, 1; add.s32 %r2097, %r739, %r733; ld.param.u64 %rd3, [%rd1+16]; ld.param.u64 %rd4, [%rd1+40]; shr.s32 %r740, %r727, 5; shr.s32 %r741, %r727, 31; shr.u32 %r742, %r741, 30; add.s32 %r743, %r740, %r742; and.b32 %r744, %r743, 268435452; sub.s32 %r745, %r740, %r744; shl.b32 %r746, %r745, 4; shr.s32 %r747, %r736, 2; add.s32 %r5, %r746, %r747; shr.u32 %r748, %r725, 28; add.s32 %r749, %r720, %r748; and.b32 %r750, %r749, -16; sub.s32 %r6, %r720, %r750; setp.gt.s32 %p6, %r6, 9; shr.s32 %r7, %r749, 4; add.s32 %r751, %r7, %r3; cvt.s64.s32 %rd5, %r751; ld.param.u64 %rd6, [%rd1+24]; mul.lo.s64 %rd37, %rd6, %rd5; mul.lo.s32 %r752, %r724, 3; mul.wide.s32 %rd38, %r752, 160; shl.b32 %r753, %r6, 4; cvt.s64.s32 %rd39, %r753; add.s64 %rd40, %rd38, %rd39; add.s64 %rd41, %rd40, %rd37; ld.param.u64 %rd42, [%rd1]; add.s64 %rd7, %rd42, %rd41; shr.s32 %r754, %r749, 31; shr.u32 %r755, %r754, 29; add.s32 %r756, %r7, %r755; and.b32 %r757, %r756, 268435448; sub.s32 %r758, %r7, %r757; xor.b32 %r759, %r758, %r6; shl.b32 %r760, %r7, 8; shl.b32 %r761, %r759, 4; mov.u32 %r762, 31; mov.u32 %r763, 0; mov.u32 %r764, -1; shfl.sync.idx.b32 %r8|%p1, %r763, %r763, %r762, %r764; shfl.sync.idx.b32 %r21|%p7, %r763, %r763, %r762, %r764; and.b32 %r765, %r720, 96; shr.u32 %r766, %r765, 1; and.b32 %r767, %r720, 15; or.b32 %r768, %r766, %r767; and.b32 %r769, %r720, 7; shl.b32 %r770, %r720, 4; and.b32 %r771, %r770, 112; and.b32 %r772, %r720, 16; xor.b32 %r773, %r771, %r772; cvt.s64.s32 %rd43, %r7; mul.lo.s64 %rd44, %rd6, %rd43; add.s64 %rd45, %rd40, %rd44; add.s64 %rd46, %rd42, %rd45; add.s64 %rd238, %rd46, 160; shfl.sync.idx.b32 %r9|%p2, %r763, %r763, %r762, %r764; shfl.sync.idx.b32 %r10|%p3, %r763, %r763, %r762, %r764; shr.u32 %r774, %r772, 1; or.b32 %r775, %r774, %r769; and.b32 %r776, %r720, 8; shr.u32 %r777, %r776, 3; xor.b32 %r778, %r777, %r769; add.s64 %rd237, %rd46, 320; shfl.sync.idx.b32 %r779|%p8, %r763, %r763, %r762, %r764; shfl.sync.idx.b32 %r11|%p4, %r763, %r763, %r762, %r764; ld.param.u64 %rd12, [%rd1+32]; ld.param.u64 %rd13, [%rd1+8]; ld.param.u32 %r12, [%rd1+60]; sub.s32 %r780, %r1, %r3; min.s32 %r13, %r780, 128; shr.s32 %r781, %r12, 31; shr.u32 %r782, %r781, 29; add.s32 %r783, %r12, %r782; shr.s32 %r14, %r783, 3; shl.b32 %r784, %r720, 8; and.b32 %r785, %r784, 3840; shl.b32 %r786, %r778, 4; shl.b32 %r787, %r775, 8; shl.b32 %r788, %r768, 8; add.s32 %r15, %r7, 8; add.s32 %r16, %r761, %r760; or.b32 %r17, %r788, %r773; or.b32 %r18, %r787, %r786; or.b32 %r19, %r773, %r785; mov.u32 %r789, _ZN25fused_multihead_attention5smem_E; add.s32 %r790, %r16, %r789; add.s32 %r20, %r790, 32768; @%p6 bra $L__BB0_3; shl.b64 %rd63, %rd6, 3; add.s32 %r823, %r7, 120; setp.lt.s32 %p9, %r823, %r13; add.s32 %r824, %r7, 112; setp.lt.s32 %p10, %r824, %r13; add.s32 %r825, %r7, 104; setp.lt.s32 %p11, %r825, %r13; add.s32 %r826, %r7, 96; setp.lt.s32 %p12, %r826, %r13; add.s32 %r827, %r7, 88; setp.lt.s32 %p13, %r827, %r13; add.s32 %r828, %r7, 80; setp.lt.s32 %p14, %r828, %r13; add.s32 %r829, %r7, 72; setp.lt.s32 %p15, %r829, %r13; add.s32 %r830, %r7, 64; setp.lt.s32 %p16, %r830, %r13; add.s32 %r831, %r7, 56; setp.lt.s32 %p17, %r831, %r13; add.s32 %r832, %r7, 48; setp.lt.s32 %p18, %r832, %r13; add.s32 %r833, %r7, 40; setp.lt.s32 %p19, %r833, %r13; add.s32 %r834, %r7, 32; setp.lt.s32 %p20, %r834, %r13; add.s32 %r835, %r7, 24; setp.lt.s32 %p21, %r835, %r13; add.s32 %r836, %r7, 16; setp.lt.s32 %p22, %r836, %r13; selp.b32 %r802, 16, 0, %p19; selp.b32 %r804, 16, 0, %p18; selp.b32 %r806, 16, 0, %p17; selp.b32 %r808, 16, 0, %p16; selp.b32 %r810, 16, 0, %p15; selp.b32 %r812, 16, 0, %p14; selp.b32 %r814, 16, 0, %p13; selp.b32 %r816, 16, 0, %p12; selp.b32 %r818, 16, 0, %p11; add.s32 %r791, %r790, %r21; add.s32 %r793, %r791, 2048; add.s32 %r795, %r791, 4096; add.s32 %r797, %r791, 6144; add.s32 %r799, %r791, 8192; add.s32 %r801, %r791, 10240; add.s32 %r803, %r791, 12288; add.s32 %r805, %r791, 14336; add.s32 %r807, %r791, 16384; add.s32 %r809, %r791, 18432; add.s32 %r811, %r791, 20480; add.s32 %r813, %r791, 22528; add.s32 %r815, %r791, 24576; add.s32 %r817, %r791, 26624; add.s32 %r819, %r791, 28672; add.s32 %r821, %r791, 30720; setp.lt.s32 %p23, %r7, %r13; selp.b32 %r792, 16, 0, %p23; // begin inline asm cp.async.cg.shared.global [%r791], [%rd7], 16, %r792; // end inline asm setp.lt.s32 %p24, %r15, %r13; selp.b32 %r794, 16, 0, %p24; add.s64 %rd48, %rd7, %rd63; // begin inline asm cp.async.cg.shared.global [%r793], [%rd48], 16, %r794; // end inline asm selp.b32 %r796, 16, 0, %p22; add.s64 %rd49, %rd48, %rd63; // begin inline asm cp.async.cg.shared.global [%r795], [%rd49], 16, %r796; // end inline asm selp.b32 %r798, 16, 0, %p21; add.s64 %rd50, %rd49, %rd63; // begin inline asm cp.async.cg.shared.global [%r797], [%rd50], 16, %r798; // end inline asm selp.b32 %r800, 16, 0, %p20; add.s64 %rd51, %rd50, %rd63; // begin inline asm cp.async.cg.shared.global [%r799], [%rd51], 16, %r800; // end inline asm add.s64 %rd52, %rd51, %rd63; // begin inline asm cp.async.cg.shared.global [%r801], [%rd52], 16, %r802; // end inline asm add.s64 %rd53, %rd52, %rd63; // begin inline asm cp.async.cg.shared.global [%r803], [%rd53], 16, %r804; // end inline asm add.s64 %rd54, %rd53, %rd63; // begin inline asm cp.async.cg.shared.global [%r805], [%rd54], 16, %r806; // end inline asm add.s64 %rd55, %rd54, %rd63; // begin inline asm cp.async.cg.shared.global [%r807], [%rd55], 16, %r808; // end inline asm add.s64 %rd56, %rd55, %rd63; // begin inline asm cp.async.cg.shared.global [%r809], [%rd56], 16, %r810; // end inline asm add.s64 %rd57, %rd56, %rd63; // begin inline asm cp.async.cg.shared.global [%r811], [%rd57], 16, %r812; // end inline asm add.s64 %rd58, %rd57, %rd63; // begin inline asm cp.async.cg.shared.global [%r813], [%rd58], 16, %r814; // end inline asm add.s64 %rd59, %rd58, %rd63; // begin inline asm cp.async.cg.shared.global [%r815], [%rd59], 16, %r816; // end inline asm add.s64 %rd60, %rd59, %rd63; // begin inline asm cp.async.cg.shared.global [%r817], [%rd60], 16, %r818; // end inline asm selp.b32 %r820, 16, 0, %p10; add.s64 %rd61, %rd60, %rd63; // begin inline asm cp.async.cg.shared.global [%r819], [%rd61], 16, %r820; // end inline asm selp.b32 %r822, 16, 0, %p9; add.s64 %rd62, %rd61, %rd63; // begin inline asm cp.async.cg.shared.global [%r821], [%rd62], 16, %r822; // end inline asm $L__BB0_3: min.s32 %r23, %r1, 16; @%p6 bra $L__BB0_5; setp.lt.s32 %p26, %r15, %r23; add.s32 %r839, %r20, %r10; add.s32 %r841, %r839, 2048; setp.lt.s32 %p27, %r7, %r23; selp.b32 %r840, 16, 0, %p27; // begin inline asm cp.async.cg.shared.global [%r839], [%rd238], 16, %r840; // end inline asm selp.b32 %r842, 16, 0, %p26; shl.b64 %rd66, %rd6, 3; add.s64 %rd65, %rd238, %rd66; // begin inline asm cp.async.cg.shared.global [%r841], [%rd65], 16, %r842; // end inline asm $L__BB0_5: @%p6 bra $L__BB0_7; setp.lt.s32 %p29, %r15, %r23; add.s32 %r849, %r790, %r11; add.s32 %r843, %r849, 36864; add.s32 %r845, %r849, 38912; setp.lt.s32 %p30, %r7, %r23; selp.b32 %r844, 16, 0, %p30; // begin inline asm cp.async.cg.shared.global [%r843], [%rd237], 16, %r844; // end inline asm selp.b32 %r846, 16, 0, %p29; shl.b64 %rd69, %rd6, 3; add.s64 %rd68, %rd237, %rd69; // begin inline asm cp.async.cg.shared.global [%r845], [%rd68], 16, %r846; // end inline asm $L__BB0_7: setp.lt.s32 %p31, %r6, 10; // begin inline asm cp.async.commit_group; // end inline asm @%p31 bra $L__BB0_9; add.s32 %r850, %r790, %r21; add.s32 %r855, %r850, 2048; add.s32 %r860, %r850, 4096; add.s32 %r865, %r850, 6144; add.s32 %r870, %r850, 8192; add.s32 %r875, %r850, 10240; add.s32 %r880, %r850, 12288; add.s32 %r885, %r850, 14336; add.s32 %r890, %r850, 16384; add.s32 %r895, %r850, 18432; add.s32 %r900, %r850, 20480; add.s32 %r905, %r850, 22528; add.s32 %r910, %r850, 24576; add.s32 %r915, %r850, 26624; add.s32 %r920, %r850, 28672; add.s32 %r925, %r850, 30720; mov.u32 %r949, 0; // begin inline asm st.shared.v4.b32 [%r850], {%r949, %r949, %r949, %r949}; // end inline asm // begin inline asm st.shared.v4.b32 [%r855], {%r949, %r949, %r949, %r949}; // end inline asm // begin inline asm st.shared.v4.b32 [%r860], {%r949, %r949, %r949, %r949}; // end inline asm // begin inline asm st.shared.v4.b32 [%r865], {%r949, %r949, %r949, %r949}; // end inline asm // begin inline asm st.shared.v4.b32 [%r870], {%r949, %r949, %r949, %r949}; // end inline asm // begin inline asm st.shared.v4.b32 [%r875], {%r949, %r949, %r949, %r949}; // end inline asm // begin inline asm st.shared.v4.b32 [%r880], {%r949, %r949, %r949, %r949}; // end inline asm // begin inline asm st.shared.v4.b32 [%r885], {%r949, %r949, %r949, %r949}; // end inline asm // begin inline asm st.shared.v4.b32 [%r890], {%r949, %r949, %r949, %r949}; // end inline asm // begin inline asm st.shared.v4.b32 [%r895], {%r949, %r949, %r949, %r949}; // end inline asm // begin inline asm st.shared.v4.b32 [%r900], {%r949, %r949, %r949, %r949}; // end inline asm // begin inline asm st.shared.v4.b32 [%r905], {%r949, %r949, %r949, %r949}; // end inline asm // begin inline asm st.shared.v4.b32 [%r910], {%r949, %r949, %r949, %r949}; // end inline asm // begin inline asm st.shared.v4.b32 [%r915], {%r949, %r949, %r949, %r949}; // end inline asm // begin inline asm st.shared.v4.b32 [%r920], {%r949, %r949, %r949, %r949}; // end inline asm // begin inline asm st.shared.v4.b32 [%r925], {%r949, %r949, %r949, %r949}; // end inline asm add.s32 %r930, %r20, %r10; add.s32 %r935, %r930, 2048; // begin inline asm st.shared.v4.b32 [%r930], {%r949, %r949, %r949, %r949}; // end inline asm // begin inline asm st.shared.v4.b32 [%r935], {%r949, %r949, %r949, %r949}; // end inline asm add.s32 %r952, %r790, %r11; add.s32 %r940, %r952, 36864; add.s32 %r945, %r952, 38912; // begin inline asm st.shared.v4.b32 [%r940], {%r949, %r949, %r949, %r949}; // end inline asm // begin inline asm st.shared.v4.b32 [%r945], {%r949, %r949, %r949, %r949}; // end inline asm $L__BB0_9: // begin inline asm cp.async.wait_group 0; // end inline asm bar.sync 0; add.s32 %r1134, %r8, %r789; add.s32 %r957, %r1134, %r17; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r953, %r954, %r955, %r956}, [%r957]; // end inline asm add.s32 %r962, %r957, 16384; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r958, %r959, %r960, %r961}, [%r962]; // end inline asm xor.b32 %r1135, %r17, 32; add.s32 %r967, %r1134, %r1135; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r963, %r964, %r965, %r966}, [%r967]; // end inline asm add.s32 %r972, %r967, 16384; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r968, %r969, %r970, %r971}, [%r972]; // end inline asm xor.b32 %r1136, %r17, 64; add.s32 %r977, %r1134, %r1136; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r973, %r974, %r975, %r976}, [%r977]; // end inline asm add.s32 %r982, %r977, 16384; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r978, %r979, %r980, %r981}, [%r982]; // end inline asm xor.b32 %r1137, %r17, 96; add.s32 %r987, %r1134, %r1137; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r983, %r984, %r985, %r986}, [%r987]; // end inline asm add.s32 %r992, %r987, 16384; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r988, %r989, %r990, %r991}, [%r992]; // end inline asm or.b32 %r1138, %r17, 128; add.s32 %r997, %r1134, %r1138; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r993, %r994, %r995, %r996}, [%r997]; // end inline asm add.s32 %r1002, %r997, 16384; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r998, %r999, %r1000, %r1001}, [%r1002]; // end inline asm add.s32 %r1139, %r9, %r789; add.s32 %r65, %r1139, 32768; add.s32 %r1007, %r65, %r18; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2119, %r2118, %r2117, %r2116}, [%r1007]; // end inline asm xor.b32 %r1140, %r18, 32; add.s32 %r1012, %r65, %r1140; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2115, %r2114, %r2113, %r2112}, [%r1012]; // end inline asm xor.b32 %r1141, %r18, 64; add.s32 %r1017, %r65, %r1141; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2111, %r2110, %r2109, %r2108}, [%r1017]; // end inline asm xor.b32 %r1142, %r18, 96; add.s32 %r1022, %r65, %r1142; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2107, %r2106, %r2105, %r2104}, [%r1022]; // end inline asm or.b32 %r1143, %r18, 128; add.s32 %r1027, %r65, %r1143; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2103, %r2102, %r2101, %r2100}, [%r1027]; // end inline asm add.s32 %r1144, %r789, 36864; add.s32 %r1032, %r19, %r1144; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2139, %r2138, %r2137, %r2136}, [%r1032]; // end inline asm xor.b32 %r1145, %r19, 32; add.s32 %r1037, %r1145, %r1144; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2135, %r2134, %r2133, %r2132}, [%r1037]; // end inline asm xor.b32 %r1146, %r19, 64; add.s32 %r1042, %r1146, %r1144; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2131, %r2130, %r2129, %r2128}, [%r1042]; // end inline asm xor.b32 %r1147, %r19, 96; add.s32 %r1047, %r1147, %r1144; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2127, %r2126, %r2125, %r2124}, [%r1047]; // end inline asm or.b32 %r1148, %r19, 128; add.s32 %r1052, %r1148, %r1144; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2123, %r2122, %r2121, %r2120}, [%r1052]; // end inline asm ld.param.f32 %f1, [%rd1+64]; // begin inline asm mov.u32 %r2219, 0; // end inline asm // begin inline asm mov.u32 %r2218, 0; // end inline asm // begin inline asm mov.u32 %r2217, 0; // end inline asm // begin inline asm mov.u32 %r2216, 0; // end inline asm // begin inline asm mov.u32 %r2215, 0; // end inline asm // begin inline asm mov.u32 %r2214, 0; // end inline asm // begin inline asm mov.u32 %r2213, 0; // end inline asm // begin inline asm mov.u32 %r2212, 0; // end inline asm // begin inline asm mov.u32 %r2211, 0; // end inline asm // begin inline asm mov.u32 %r2210, 0; // end inline asm // begin inline asm mov.u32 %r2209, 0; // end inline asm // begin inline asm mov.u32 %r2208, 0; // end inline asm // begin inline asm mov.u32 %r2207, 0; // end inline asm // begin inline asm mov.u32 %r2206, 0; // end inline asm // begin inline asm mov.u32 %r2205, 0; // end inline asm // begin inline asm mov.u32 %r2204, 0; // end inline asm // begin inline asm mov.u32 %r2203, 0; // end inline asm // begin inline asm mov.u32 %r2202, 0; // end inline asm // begin inline asm mov.u32 %r2201, 0; // end inline asm // begin inline asm mov.u32 %r2200, 0; // end inline asm // begin inline asm mov.u32 %r2199, 0; // end inline asm // begin inline asm mov.u32 %r2198, 0; // end inline asm // begin inline asm mov.u32 %r2197, 0; // end inline asm // begin inline asm mov.u32 %r2196, 0; // end inline asm // begin inline asm mov.u32 %r2195, 0; // end inline asm // begin inline asm mov.u32 %r2194, 0; // end inline asm // begin inline asm mov.u32 %r2193, 0; // end inline asm // begin inline asm mov.u32 %r2192, 0; // end inline asm // begin inline asm mov.u32 %r2191, 0; // end inline asm // begin inline asm mov.u32 %r2190, 0; // end inline asm // begin inline asm mov.u32 %r2189, 0; // end inline asm // begin inline asm mov.u32 %r2188, 0; // end inline asm // begin inline asm mov.u32 %r2187, 0; // end inline asm // begin inline asm mov.u32 %r2186, 0; // end inline asm // begin inline asm mov.u32 %r2185, 0; // end inline asm // begin inline asm mov.u32 %r2184, 0; // end inline asm // begin inline asm mov.u32 %r2183, 0; // end inline asm // begin inline asm mov.u32 %r2182, 0; // end inline asm // begin inline asm mov.u32 %r2181, 0; // end inline asm // begin inline asm mov.u32 %r2180, 0; // end inline asm // begin inline asm mov.u32 %r2179, 0; // end inline asm // begin inline asm mov.u32 %r2178, 0; // end inline asm // begin inline asm mov.u32 %r2177, 0; // end inline asm // begin inline asm mov.u32 %r2176, 0; // end inline asm // begin inline asm mov.u32 %r2175, 0; // end inline asm // begin inline asm mov.u32 %r2174, 0; // end inline asm // begin inline asm mov.u32 %r2173, 0; // end inline asm // begin inline asm mov.u32 %r2172, 0; // end inline asm // begin inline asm mov.u32 %r2171, 0; // end inline asm // begin inline asm mov.u32 %r2170, 0; // end inline asm // begin inline asm mov.u32 %r2169, 0; // end inline asm // begin inline asm mov.u32 %r2168, 0; // end inline asm // begin inline asm mov.u32 %r2167, 0; // end inline asm // begin inline asm mov.u32 %r2166, 0; // end inline asm // begin inline asm mov.u32 %r2165, 0; // end inline asm // begin inline asm mov.u32 %r2164, 0; // end inline asm // begin inline asm mov.u32 %r2163, 0; // end inline asm // begin inline asm mov.u32 %r2162, 0; // end inline asm // begin inline asm mov.u32 %r2161, 0; // end inline asm // begin inline asm mov.u32 %r2160, 0; // end inline asm // begin inline asm mov.u32 %r2159, 0; // end inline asm // begin inline asm mov.u32 %r2158, 0; // end inline asm // begin inline asm mov.u32 %r2157, 0; // end inline asm // begin inline asm mov.u32 %r2156, 0; // end inline asm // begin inline asm mov.u32 %r2155, 0; // end inline asm // begin inline asm mov.u32 %r2154, 0; // end inline asm // begin inline asm mov.u32 %r2153, 0; // end inline asm // begin inline asm mov.u32 %r2152, 0; // end inline asm // begin inline asm mov.u32 %r2151, 0; // end inline asm // begin inline asm mov.u32 %r2150, 0; // end inline asm // begin inline asm mov.u32 %r2149, 0; // end inline asm // begin inline asm mov.u32 %r2148, 0; // end inline asm // begin inline asm mov.u32 %r2147, 0; // end inline asm // begin inline asm mov.u32 %r2146, 0; // end inline asm // begin inline asm mov.u32 %r2145, 0; // end inline asm // begin inline asm mov.u32 %r2144, 0; // end inline asm // begin inline asm mov.u32 %r2143, 0; // end inline asm // begin inline asm mov.u32 %r2142, 0; // end inline asm // begin inline asm mov.u32 %r2141, 0; // end inline asm // begin inline asm mov.u32 %r2140, 0; // end inline asm add.s32 %r1149, %r1, 15; shr.s32 %r1150, %r1149, 31; shr.u32 %r1151, %r1150, 28; add.s32 %r1152, %r1149, %r1151; and.b32 %r186, %r1152, -16; setp.lt.s32 %p32, %r1, 1; @%p32 bra $L__BB0_66; ld.param.u8 %rs1, [%rd1+160]; add.s32 %r1156, %r790, %r11; add.s32 %r267, %r1156, 36864; add.s32 %r268, %r20, %r10; add.s32 %r269, %r268, 2048; cvt.s64.s32 %rd14, %r5; cvt.s64.s32 %rd15, %r2097; add.s32 %r270, %r1156, 38912; add.s32 %r1157, %r2097, 1; cvt.s64.s32 %rd16, %r1157; add.s32 %r1158, %r2097, 8; cvt.s64.s32 %rd17, %r1158; add.s32 %r1159, %r2097, 9; cvt.s64.s32 %rd18, %r1159; add.s32 %r271, %r5, 8; add.s32 %r272, %r5, 64; add.s32 %r273, %r5, 72; mov.u32 %r2096, 0; mov.f32 %f986, 0fFF800000; mov.f32 %f982, 0f00000000; mov.u32 %r2098, %r1; mov.u32 %r2099, %r1; mov.f32 %f983, %f982; mov.f32 %f984, %f982; mov.f32 %f985, %f982; mov.f32 %f987, %f986; mov.f32 %f988, %f986; mov.f32 %f989, %f986; $L__BB0_11: add.s32 %r1160, %r2096, 16; setp.ge.s32 %p33, %r1160, %r186; @%p33 bra $L__BB0_18; bar.sync 0; shl.b64 %rd70, %rd6, 4; add.s64 %rd21, %rd238, %rd70; add.s32 %r2099, %r2099, -16; @%p6 bra $L__BB0_14; min.s32 %r1165, %r2099, 16; setp.lt.s32 %p35, %r15, %r1165; setp.lt.s32 %p36, %r7, %r1165; selp.b32 %r1162, 16, 0, %p36; // begin inline asm cp.async.cg.shared.global [%r268], [%rd21], 16, %r1162; // end inline asm selp.b32 %r1164, 16, 0, %p35; mul.lo.s64 %rd73, %rd6, 24; add.s64 %rd72, %rd238, %rd73; // begin inline asm cp.async.cg.shared.global [%r269], [%rd72], 16, %r1164; // end inline asm $L__BB0_14: add.s64 %rd22, %rd237, %rd70; add.s32 %r2098, %r2098, -16; @%p6 bra $L__BB0_16; min.s32 %r1171, %r2098, 16; setp.lt.s32 %p38, %r15, %r1171; setp.lt.s32 %p39, %r7, %r1171; selp.b32 %r1168, 16, 0, %p39; // begin inline asm cp.async.cg.shared.global [%r267], [%rd22], 16, %r1168; // end inline asm selp.b32 %r1170, 16, 0, %p38; mul.lo.s64 %rd77, %rd6, 24; add.s64 %rd76, %rd237, %rd77; // begin inline asm cp.async.cg.shared.global [%r270], [%rd76], 16, %r1170; // end inline asm $L__BB0_16: // begin inline asm cp.async.commit_group; // end inline asm mov.u64 %rd237, %rd22; mov.u64 %rd238, %rd21; @%p31 bra $L__BB0_18; mov.u32 %r1192, 0; // begin inline asm st.shared.v4.b32 [%r268], {%r1192, %r1192, %r1192, %r1192}; // end inline asm // begin inline asm st.shared.v4.b32 [%r269], {%r1192, %r1192, %r1192, %r1192}; // end inline asm // begin inline asm st.shared.v4.b32 [%r267], {%r1192, %r1192, %r1192, %r1192}; // end inline asm // begin inline asm st.shared.v4.b32 [%r270], {%r1192, %r1192, %r1192, %r1192}; // end inline asm mov.u64 %rd237, %rd22; mov.u64 %rd238, %rd21; $L__BB0_18: setp.eq.s16 %p41, %rs1, 0; @%p41 bra $L__BB0_51; cvt.s64.s32 %rd78, %r3; add.s64 %rd79, %rd14, %rd78; setp.ge.u64 %p42, %rd79, %rd2; mul.lo.s32 %r1195, %r1, %r3; cvt.s64.s32 %rd80, %r1195; cvt.u64.u32 %rd25, %r2096; add.s64 %rd81, %rd80, %rd25; add.s64 %rd26, %rd81, %rd15; mul.lo.s64 %rd82, %rd14, %rd2; add.s64 %rd83, %rd26, %rd82; add.s64 %rd27, %rd15, %rd25; setp.ge.u64 %p43, %rd27, %rd2; shl.b64 %rd84, %rd83, 1; mad.lo.s32 %r1198, %r2, %r721, %r722; cvt.s64.s32 %rd85, %r1198; mul.lo.s64 %rd86, %rd4, %rd85; add.s64 %rd87, %rd86, %rd84; cvta.to.global.u64 %rd88, %rd3; add.s64 %rd28, %rd88, %rd87; mov.u16 %rs132, 0; or.pred %p44, %p43, %p42; mov.u16 %rs131, %rs132; @%p44 bra $L__BB0_21; ld.global.u16 %rs131, [%rd28]; $L__BB0_21: add.s64 %rd29, %rd16, %rd25; setp.ge.u64 %p46, %rd29, %rd2; or.pred %p47, %p46, %p42; @%p47 bra $L__BB0_23; ld.global.u16 %rs132, [%rd28+2]; $L__BB0_23: add.s64 %rd30, %rd17, %rd25; setp.ge.u64 %p49, %rd30, %rd2; mov.u16 %rs134, 0; or.pred %p50, %p49, %p42; mov.u16 %rs133, %rs134; @%p50 bra $L__BB0_25; ld.global.u16 %rs133, [%rd28+16]; $L__BB0_25: add.s64 %rd31, %rd18, %rd25; setp.ge.u64 %p52, %rd31, %rd2; or.pred %p53, %p52, %p42; @%p53 bra $L__BB0_27; ld.global.u16 %rs134, [%rd28+18]; $L__BB0_27: cvt.s64.s32 %rd96, %r271; add.s64 %rd97, %rd96, %rd78; setp.ge.u64 %p54, %rd97, %rd2; mul.lo.s64 %rd98, %rd96, %rd2; add.s64 %rd99, %rd26, %rd98; shl.b64 %rd100, %rd99, 1; add.s64 %rd103, %rd86, %rd100; add.s64 %rd32, %rd88, %rd103; mov.u16 %rs136, 0; or.pred %p56, %p43, %p54; mov.u16 %rs135, %rs136; @%p56 bra $L__BB0_29; ld.global.u16 %rs135, [%rd32]; $L__BB0_29: or.pred %p59, %p46, %p54; @%p59 bra $L__BB0_31; ld.global.u16 %rs136, [%rd32+2]; $L__BB0_31: mov.u16 %rs138, 0; or.pred %p62, %p49, %p54; mov.u16 %rs137, %rs138; @%p62 bra $L__BB0_33; ld.global.u16 %rs137, [%rd32+16]; $L__BB0_33: or.pred %p65, %p52, %p54; @%p65 bra $L__BB0_35; ld.global.u16 %rs138, [%rd32+18]; $L__BB0_35: cvt.s64.s32 %rd115, %r272; add.s64 %rd116, %rd115, %rd78; setp.ge.u64 %p66, %rd116, %rd2; mul.lo.s64 %rd117, %rd115, %rd2; add.s64 %rd118, %rd26, %rd117; shl.b64 %rd119, %rd118, 1; add.s64 %rd122, %rd86, %rd119; add.s64 %rd33, %rd88, %rd122; mov.u16 %rs140, 0; or.pred %p68, %p43, %p66; mov.u16 %rs139, %rs140; @%p68 bra $L__BB0_37; ld.global.u16 %rs139, [%rd33]; $L__BB0_37: or.pred %p71, %p46, %p66; @%p71 bra $L__BB0_39; ld.global.u16 %rs140, [%rd33+2]; $L__BB0_39: mov.u16 %rs142, 0; or.pred %p74, %p49, %p66; mov.u16 %rs141, %rs142; @%p74 bra $L__BB0_41; ld.global.u16 %rs141, [%rd33+16]; $L__BB0_41: or.pred %p77, %p52, %p66; @%p77 bra $L__BB0_43; ld.global.u16 %rs142, [%rd33+18]; $L__BB0_43: cvt.s64.s32 %rd134, %r273; add.s64 %rd135, %rd134, %rd78; setp.ge.u64 %p78, %rd135, %rd2; mul.lo.s64 %rd136, %rd134, %rd2; add.s64 %rd137, %rd26, %rd136; shl.b64 %rd138, %rd137, 1; add.s64 %rd141, %rd86, %rd138; add.s64 %rd34, %rd88, %rd141; mov.u16 %rs144, 0; or.pred %p80, %p43, %p78; mov.u16 %rs143, %rs144; @%p80 bra $L__BB0_45; ld.global.u16 %rs143, [%rd34]; $L__BB0_45: or.pred %p83, %p46, %p78; @%p83 bra $L__BB0_47; ld.global.u16 %rs144, [%rd34+2]; $L__BB0_47: mov.u16 %rs146, 0; or.pred %p86, %p49, %p78; mov.u16 %rs145, %rs146; @%p86 bra $L__BB0_49; ld.global.u16 %rs145, [%rd34+16]; $L__BB0_49: or.pred %p89, %p52, %p78; @%p89 bra $L__BB0_51; ld.global.u16 %rs146, [%rd34+18]; $L__BB0_51: // begin inline asm mov.u32 %r1238, 0; // end inline asm // begin inline asm mov.u32 %r1239, 0; // end inline asm // begin inline asm mov.u32 %r1240, 0; // end inline asm // begin inline asm mov.u32 %r1241, 0; // end inline asm // begin inline asm mov.u32 %r1242, 0; // end inline asm // begin inline asm mov.u32 %r1243, 0; // end inline asm // begin inline asm mov.u32 %r1244, 0; // end inline asm // begin inline asm mov.u32 %r1245, 0; // end inline asm // begin inline asm mov.u32 %r1246, 0; // end inline asm // begin inline asm mov.u32 %r1247, 0; // end inline asm // begin inline asm mov.u32 %r1248, 0; // end inline asm // begin inline asm mov.u32 %r1249, 0; // end inline asm // begin inline asm mov.u32 %r1250, 0; // end inline asm // begin inline asm mov.u32 %r1251, 0; // end inline asm // begin inline asm mov.u32 %r1252, 0; // end inline asm // begin inline asm mov.u32 %r1253, 0; // end inline asm mov.b32 %f214, %r1238; mov.b32 %f215, %r1239; mov.b32 %f216, %r1240; mov.b32 %f217, %r1241; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f214, %f215, %f216, %f217}, {%r953, %r954, %r955, %r956}, {%r2119, %r2118}, {%f214, %f215, %f216, %f217}; // end inline asm mov.b32 %f222, %r1242; mov.b32 %f223, %r1243; mov.b32 %f224, %r1244; mov.b32 %f225, %r1245; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f222, %f223, %f224, %f225}, {%r953, %r954, %r955, %r956}, {%r2117, %r2116}, {%f222, %f223, %f224, %f225}; // end inline asm mov.b32 %f230, %r1246; mov.b32 %f231, %r1247; mov.b32 %f232, %r1248; mov.b32 %f233, %r1249; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f230, %f231, %f232, %f233}, {%r958, %r959, %r960, %r961}, {%r2119, %r2118}, {%f230, %f231, %f232, %f233}; // end inline asm mov.b32 %f238, %r1250; mov.b32 %f239, %r1251; mov.b32 %f240, %r1252; mov.b32 %f241, %r1253; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f238, %f239, %f240, %f241}, {%r958, %r959, %r960, %r961}, {%r2117, %r2116}, {%f238, %f239, %f240, %f241}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f214, %f215, %f216, %f217}, {%r963, %r964, %r965, %r966}, {%r2115, %r2114}, {%f214, %f215, %f216, %f217}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f222, %f223, %f224, %f225}, {%r963, %r964, %r965, %r966}, {%r2113, %r2112}, {%f222, %f223, %f224, %f225}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f230, %f231, %f232, %f233}, {%r968, %r969, %r970, %r971}, {%r2115, %r2114}, {%f230, %f231, %f232, %f233}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f238, %f239, %f240, %f241}, {%r968, %r969, %r970, %r971}, {%r2113, %r2112}, {%f238, %f239, %f240, %f241}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f214, %f215, %f216, %f217}, {%r973, %r974, %r975, %r976}, {%r2111, %r2110}, {%f214, %f215, %f216, %f217}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f222, %f223, %f224, %f225}, {%r973, %r974, %r975, %r976}, {%r2109, %r2108}, {%f222, %f223, %f224, %f225}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f230, %f231, %f232, %f233}, {%r978, %r979, %r980, %r981}, {%r2111, %r2110}, {%f230, %f231, %f232, %f233}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f238, %f239, %f240, %f241}, {%r978, %r979, %r980, %r981}, {%r2109, %r2108}, {%f238, %f239, %f240, %f241}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f214, %f215, %f216, %f217}, {%r983, %r984, %r985, %r986}, {%r2107, %r2106}, {%f214, %f215, %f216, %f217}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f222, %f223, %f224, %f225}, {%r983, %r984, %r985, %r986}, {%r2105, %r2104}, {%f222, %f223, %f224, %f225}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f230, %f231, %f232, %f233}, {%r988, %r989, %r990, %r991}, {%r2107, %r2106}, {%f230, %f231, %f232, %f233}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f238, %f239, %f240, %f241}, {%r988, %r989, %r990, %r991}, {%r2105, %r2104}, {%f238, %f239, %f240, %f241}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f214, %f215, %f216, %f217}, {%r993, %r994, %r995, %r996}, {%r2103, %r2102}, {%f214, %f215, %f216, %f217}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f222, %f223, %f224, %f225}, {%r993, %r994, %r995, %r996}, {%r2101, %r2100}, {%f222, %f223, %f224, %f225}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f230, %f231, %f232, %f233}, {%r998, %r999, %r1000, %r1001}, {%r2103, %r2102}, {%f230, %f231, %f232, %f233}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f238, %f239, %f240, %f241}, {%r998, %r999, %r1000, %r1001}, {%r2101, %r2100}, {%f238, %f239, %f240, %f241}; // end inline asm mul.ftz.f32 %f342, %f1, %f214; mul.ftz.f32 %f343, %f1, %f215; mul.ftz.f32 %f344, %f1, %f222; mul.ftz.f32 %f345, %f1, %f223; mul.ftz.f32 %f346, %f1, %f216; mul.ftz.f32 %f347, %f1, %f217; mul.ftz.f32 %f348, %f1, %f224; mul.ftz.f32 %f349, %f1, %f225; mul.ftz.f32 %f350, %f1, %f230; mul.ftz.f32 %f351, %f1, %f231; mul.ftz.f32 %f352, %f1, %f238; mul.ftz.f32 %f353, %f1, %f239; mul.ftz.f32 %f354, %f1, %f232; mul.ftz.f32 %f355, %f1, %f233; mul.ftz.f32 %f356, %f1, %f240; mul.ftz.f32 %f357, %f1, %f241; setp.lt.s32 %p90, %r2097, %r1; selp.f32 %f1005, %f342, 0fFF800000, %p90; add.s32 %r1374, %r2097, 1; setp.lt.s32 %p91, %r1374, %r1; selp.f32 %f1004, %f343, 0fFF800000, %p91; add.s32 %r1375, %r2097, 8; setp.lt.s32 %p92, %r1375, %r1; selp.f32 %f1003, %f344, 0fFF800000, %p92; add.s32 %r1376, %r2097, 9; setp.lt.s32 %p93, %r1376, %r1; selp.f32 %f1002, %f345, 0fFF800000, %p93; selp.f32 %f1001, %f346, 0fFF800000, %p90; selp.f32 %f1000, %f347, 0fFF800000, %p91; selp.f32 %f999, %f348, 0fFF800000, %p92; selp.f32 %f998, %f349, 0fFF800000, %p93; selp.f32 %f997, %f350, 0fFF800000, %p90; selp.f32 %f996, %f351, 0fFF800000, %p91; selp.f32 %f995, %f352, 0fFF800000, %p92; selp.f32 %f994, %f353, 0fFF800000, %p93; selp.f32 %f993, %f354, 0fFF800000, %p90; selp.f32 %f992, %f355, 0fFF800000, %p91; selp.f32 %f991, %f356, 0fFF800000, %p92; selp.f32 %f990, %f357, 0fFF800000, %p93; @%p41 bra $L__BB0_53; // begin inline asm cvt.f32.f16 %f358, %rs131; // end inline asm add.ftz.f32 %f1005, %f358, %f1005; // begin inline asm cvt.f32.f16 %f359, %rs132; // end inline asm add.ftz.f32 %f1004, %f359, %f1004; // begin inline asm cvt.f32.f16 %f360, %rs133; // end inline asm add.ftz.f32 %f1003, %f360, %f1003; // begin inline asm cvt.f32.f16 %f361, %rs134; // end inline asm add.ftz.f32 %f1002, %f361, %f1002; // begin inline asm cvt.f32.f16 %f362, %rs135; // end inline asm add.ftz.f32 %f1001, %f362, %f1001; // begin inline asm cvt.f32.f16 %f363, %rs136; // end inline asm add.ftz.f32 %f1000, %f363, %f1000; // begin inline asm cvt.f32.f16 %f364, %rs137; // end inline asm add.ftz.f32 %f999, %f364, %f999; // begin inline asm cvt.f32.f16 %f365, %rs138; // end inline asm add.ftz.f32 %f998, %f365, %f998; // begin inline asm cvt.f32.f16 %f366, %rs139; // end inline asm add.ftz.f32 %f997, %f366, %f997; // begin inline asm cvt.f32.f16 %f367, %rs140; // end inline asm add.ftz.f32 %f996, %f367, %f996; // begin inline asm cvt.f32.f16 %f368, %rs141; // end inline asm add.ftz.f32 %f995, %f368, %f995; // begin inline asm cvt.f32.f16 %f369, %rs142; // end inline asm add.ftz.f32 %f994, %f369, %f994; // begin inline asm cvt.f32.f16 %f370, %rs143; // end inline asm add.ftz.f32 %f993, %f370, %f993; // begin inline asm cvt.f32.f16 %f371, %rs144; // end inline asm add.ftz.f32 %f992, %f371, %f992; // begin inline asm cvt.f32.f16 %f372, %rs145; // end inline asm add.ftz.f32 %f991, %f372, %f991; // begin inline asm cvt.f32.f16 %f373, %rs146; // end inline asm add.ftz.f32 %f990, %f373, %f990; $L__BB0_53: add.s32 %r1973, %r2096, 16; setp.ge.s32 %p173, %r1973, %r186; setp.gt.ftz.f32 %p96, %f1005, %f1004; selp.f32 %f374, %f1005, %f1004, %p96; setp.gt.ftz.f32 %p97, %f374, %f1003; selp.f32 %f375, %f374, %f1003, %p97; setp.gt.ftz.f32 %p98, %f375, %f1002; selp.f32 %f376, %f375, %f1002, %p98; setp.gt.ftz.f32 %p99, %f1001, %f1000; selp.f32 %f377, %f1001, %f1000, %p99; setp.gt.ftz.f32 %p100, %f377, %f999; selp.f32 %f378, %f377, %f999, %p100; setp.gt.ftz.f32 %p101, %f378, %f998; selp.f32 %f379, %f378, %f998, %p101; setp.gt.ftz.f32 %p102, %f997, %f996; selp.f32 %f380, %f997, %f996, %p102; setp.gt.ftz.f32 %p103, %f380, %f995; selp.f32 %f381, %f380, %f995, %p103; setp.gt.ftz.f32 %p104, %f381, %f994; selp.f32 %f382, %f381, %f994, %p104; setp.gt.ftz.f32 %p105, %f993, %f992; selp.f32 %f383, %f993, %f992, %p105; setp.gt.ftz.f32 %p106, %f383, %f991; selp.f32 %f384, %f383, %f991, %p106; setp.gt.ftz.f32 %p107, %f384, %f990; selp.f32 %f385, %f384, %f990, %p107; mov.b32 %r1378, %f376; mov.u32 %r1379, 31; mov.u32 %r1380, 1; mov.u32 %r1381, -1; shfl.sync.bfly.b32 %r1382|%p108, %r1378, %r1380, %r1379, %r1381; mov.b32 %f386, %r1382; setp.gt.ftz.f32 %p109, %f376, %f386; selp.f32 %f387, %f376, %f386, %p109; mov.b32 %r1383, %f387; mov.u32 %r1384, 2; shfl.sync.bfly.b32 %r1385|%p110, %r1383, %r1384, %r1379, %r1381; mov.b32 %f388, %r1385; setp.gt.ftz.f32 %p111, %f387, %f388; selp.f32 %f389, %f387, %f388, %p111; mov.b32 %r1386, %f379; shfl.sync.bfly.b32 %r1387|%p112, %r1386, %r1380, %r1379, %r1381; mov.b32 %f390, %r1387; setp.gt.ftz.f32 %p113, %f379, %f390; selp.f32 %f391, %f379, %f390, %p113; mov.b32 %r1388, %f391; shfl.sync.bfly.b32 %r1389|%p114, %r1388, %r1384, %r1379, %r1381; mov.b32 %f392, %r1389; setp.gt.ftz.f32 %p115, %f391, %f392; selp.f32 %f393, %f391, %f392, %p115; mov.b32 %r1390, %f382; shfl.sync.bfly.b32 %r1391|%p116, %r1390, %r1380, %r1379, %r1381; mov.b32 %f394, %r1391; setp.gt.ftz.f32 %p117, %f382, %f394; selp.f32 %f395, %f382, %f394, %p117; mov.b32 %r1392, %f395; shfl.sync.bfly.b32 %r1393|%p118, %r1392, %r1384, %r1379, %r1381; mov.b32 %f396, %r1393; setp.gt.ftz.f32 %p119, %f395, %f396; selp.f32 %f397, %f395, %f396, %p119; mov.b32 %r1394, %f385; shfl.sync.bfly.b32 %r1395|%p120, %r1394, %r1380, %r1379, %r1381; mov.b32 %f398, %r1395; setp.gt.ftz.f32 %p121, %f385, %f398; selp.f32 %f399, %f385, %f398, %p121; mov.b32 %r1396, %f399; shfl.sync.bfly.b32 %r1397|%p122, %r1396, %r1384, %r1379, %r1381; mov.b32 %f400, %r1397; setp.gt.ftz.f32 %p123, %f399, %f400; selp.f32 %f401, %f399, %f400, %p123; max.ftz.f32 %f58, %f389, %f989; max.ftz.f32 %f59, %f393, %f988; max.ftz.f32 %f60, %f397, %f987; max.ftz.f32 %f61, %f401, %f986; sub.ftz.f32 %f402, %f1005, %f58; mul.ftz.f32 %f403, %f402, 0f3FB8AA3B; ex2.approx.ftz.f32 %f62, %f403; sub.ftz.f32 %f404, %f1004, %f58; mul.ftz.f32 %f405, %f404, 0f3FB8AA3B; ex2.approx.ftz.f32 %f63, %f405; sub.ftz.f32 %f406, %f1003, %f58; mul.ftz.f32 %f407, %f406, 0f3FB8AA3B; ex2.approx.ftz.f32 %f64, %f407; sub.ftz.f32 %f408, %f1002, %f58; mul.ftz.f32 %f409, %f408, 0f3FB8AA3B; ex2.approx.ftz.f32 %f65, %f409; sub.ftz.f32 %f410, %f1001, %f59; mul.ftz.f32 %f411, %f410, 0f3FB8AA3B; ex2.approx.ftz.f32 %f66, %f411; sub.ftz.f32 %f412, %f1000, %f59; mul.ftz.f32 %f413, %f412, 0f3FB8AA3B; ex2.approx.ftz.f32 %f67, %f413; sub.ftz.f32 %f414, %f999, %f59; mul.ftz.f32 %f415, %f414, 0f3FB8AA3B; ex2.approx.ftz.f32 %f68, %f415; sub.ftz.f32 %f416, %f998, %f59; mul.ftz.f32 %f417, %f416, 0f3FB8AA3B; ex2.approx.ftz.f32 %f69, %f417; sub.ftz.f32 %f418, %f997, %f60; mul.ftz.f32 %f419, %f418, 0f3FB8AA3B; ex2.approx.ftz.f32 %f70, %f419; sub.ftz.f32 %f420, %f996, %f60; mul.ftz.f32 %f421, %f420, 0f3FB8AA3B; ex2.approx.ftz.f32 %f71, %f421; sub.ftz.f32 %f422, %f995, %f60; mul.ftz.f32 %f423, %f422, 0f3FB8AA3B; ex2.approx.ftz.f32 %f72, %f423; sub.ftz.f32 %f424, %f994, %f60; mul.ftz.f32 %f425, %f424, 0f3FB8AA3B; ex2.approx.ftz.f32 %f73, %f425; sub.ftz.f32 %f426, %f993, %f61; mul.ftz.f32 %f427, %f426, 0f3FB8AA3B; ex2.approx.ftz.f32 %f74, %f427; sub.ftz.f32 %f428, %f992, %f61; mul.ftz.f32 %f429, %f428, 0f3FB8AA3B; ex2.approx.ftz.f32 %f75, %f429; sub.ftz.f32 %f430, %f991, %f61; mul.ftz.f32 %f431, %f430, 0f3FB8AA3B; ex2.approx.ftz.f32 %f76, %f431; sub.ftz.f32 %f432, %f990, %f61; mul.ftz.f32 %f433, %f432, 0f3FB8AA3B; ex2.approx.ftz.f32 %f77, %f433; add.ftz.f32 %f434, %f62, %f63; add.ftz.f32 %f435, %f434, 0f00000000; add.ftz.f32 %f436, %f64, %f65; add.ftz.f32 %f437, %f436, 0f00000000; add.ftz.f32 %f438, %f435, %f437; add.ftz.f32 %f439, %f66, %f67; add.ftz.f32 %f440, %f439, 0f00000000; add.ftz.f32 %f441, %f68, %f69; add.ftz.f32 %f442, %f441, 0f00000000; add.ftz.f32 %f443, %f440, %f442; add.ftz.f32 %f444, %f70, %f71; add.ftz.f32 %f445, %f444, 0f00000000; add.ftz.f32 %f446, %f72, %f73; add.ftz.f32 %f447, %f446, 0f00000000; add.ftz.f32 %f448, %f445, %f447; add.ftz.f32 %f449, %f74, %f75; add.ftz.f32 %f450, %f449, 0f00000000; add.ftz.f32 %f451, %f76, %f77; add.ftz.f32 %f452, %f451, 0f00000000; add.ftz.f32 %f453, %f450, %f452; mov.b32 %r1398, %f438; shfl.sync.bfly.b32 %r1399|%p124, %r1398, %r1380, %r1379, %r1381; mov.b32 %f454, %r1399; add.ftz.f32 %f455, %f438, %f454; mov.b32 %r1400, %f455; shfl.sync.bfly.b32 %r1401|%p125, %r1400, %r1384, %r1379, %r1381; mov.b32 %f456, %r1401; add.ftz.f32 %f457, %f455, %f456; mov.b32 %r1402, %f443; shfl.sync.bfly.b32 %r1403|%p126, %r1402, %r1380, %r1379, %r1381; mov.b32 %f458, %r1403; add.ftz.f32 %f459, %f443, %f458; mov.b32 %r1404, %f459; shfl.sync.bfly.b32 %r1405|%p127, %r1404, %r1384, %r1379, %r1381; mov.b32 %f460, %r1405; add.ftz.f32 %f461, %f459, %f460; mov.b32 %r1406, %f448; shfl.sync.bfly.b32 %r1407|%p128, %r1406, %r1380, %r1379, %r1381; mov.b32 %f462, %r1407; add.ftz.f32 %f463, %f448, %f462; mov.b32 %r1408, %f463; shfl.sync.bfly.b32 %r1409|%p129, %r1408, %r1384, %r1379, %r1381; mov.b32 %f464, %r1409; add.ftz.f32 %f465, %f463, %f464; mov.b32 %r1410, %f453; shfl.sync.bfly.b32 %r1411|%p130, %r1410, %r1380, %r1379, %r1381; mov.b32 %f466, %r1411; add.ftz.f32 %f467, %f453, %f466; mov.b32 %r1412, %f467; shfl.sync.bfly.b32 %r1413|%p131, %r1412, %r1384, %r1379, %r1381; mov.b32 %f468, %r1413; add.ftz.f32 %f469, %f467, %f468; sub.ftz.f32 %f470, %f989, %f58; mul.ftz.f32 %f471, %f470, 0f3FB8AA3B; ex2.approx.ftz.f32 %f472, %f471; mul.ftz.f32 %f78, %f472, %f985; add.ftz.f32 %f985, %f78, %f457; sub.ftz.f32 %f473, %f988, %f59; mul.ftz.f32 %f474, %f473, 0f3FB8AA3B; ex2.approx.ftz.f32 %f475, %f474; mul.ftz.f32 %f80, %f475, %f984; add.ftz.f32 %f984, %f80, %f461; sub.ftz.f32 %f476, %f987, %f60; mul.ftz.f32 %f477, %f476, 0f3FB8AA3B; ex2.approx.ftz.f32 %f478, %f477; mul.ftz.f32 %f82, %f478, %f983; add.ftz.f32 %f983, %f82, %f465; sub.ftz.f32 %f479, %f986, %f61; mul.ftz.f32 %f480, %f479, 0f3FB8AA3B; ex2.approx.ftz.f32 %f481, %f480; mul.ftz.f32 %f84, %f481, %f982; add.ftz.f32 %f982, %f84, %f469; @%p173 bra $L__BB0_55; // begin inline asm cp.async.wait_group 0; // end inline asm bar.sync 0; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2119, %r2118, %r2117, %r2116}, [%r1007]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2115, %r2114, %r2113, %r2112}, [%r1012]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2111, %r2110, %r2109, %r2108}, [%r1017]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2107, %r2106, %r2105, %r2104}, [%r1022]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2103, %r2102, %r2101, %r2100}, [%r1027]; // end inline asm $L__BB0_55: // begin inline asm cvt.rn.f16x2.f32 %r1454, %f63, %f62; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1455, %f67, %f66; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1456, %f65, %f64; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1457, %f69, %f68; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1458, %f71, %f70; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1459, %f75, %f74; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1460, %f73, %f72; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1461, %f77, %f76; // end inline asm // begin inline asm mov.u32 %r1462, 0; // end inline asm // begin inline asm mov.u32 %r1463, 0; // end inline asm // begin inline asm mov.u32 %r1464, 0; // end inline asm // begin inline asm mov.u32 %r1465, 0; // end inline asm // begin inline asm mov.u32 %r1466, 0; // end inline asm // begin inline asm mov.u32 %r1467, 0; // end inline asm // begin inline asm mov.u32 %r1468, 0; // end inline asm // begin inline asm mov.u32 %r1469, 0; // end inline asm // begin inline asm mov.u32 %r1470, 0; // end inline asm // begin inline asm mov.u32 %r1471, 0; // end inline asm // begin inline asm mov.u32 %r1472, 0; // end inline asm // begin inline asm mov.u32 %r1473, 0; // end inline asm // begin inline asm mov.u32 %r1474, 0; // end inline asm // begin inline asm mov.u32 %r1475, 0; // end inline asm // begin inline asm mov.u32 %r1476, 0; // end inline asm // begin inline asm mov.u32 %r1477, 0; // end inline asm // begin inline asm mov.u32 %r1478, 0; // end inline asm // begin inline asm mov.u32 %r1479, 0; // end inline asm // begin inline asm mov.u32 %r1480, 0; // end inline asm // begin inline asm mov.u32 %r1481, 0; // end inline asm // begin inline asm mov.u32 %r1482, 0; // end inline asm // begin inline asm mov.u32 %r1483, 0; // end inline asm // begin inline asm mov.u32 %r1484, 0; // end inline asm // begin inline asm mov.u32 %r1485, 0; // end inline asm // begin inline asm mov.u32 %r1486, 0; // end inline asm // begin inline asm mov.u32 %r1487, 0; // end inline asm // begin inline asm mov.u32 %r1488, 0; // end inline asm // begin inline asm mov.u32 %r1489, 0; // end inline asm // begin inline asm mov.u32 %r1490, 0; // end inline asm // begin inline asm mov.u32 %r1491, 0; // end inline asm // begin inline asm mov.u32 %r1492, 0; // end inline asm // begin inline asm mov.u32 %r1493, 0; // end inline asm // begin inline asm mov.u32 %r1494, 0; // end inline asm // begin inline asm mov.u32 %r1495, 0; // end inline asm // begin inline asm mov.u32 %r1496, 0; // end inline asm // begin inline asm mov.u32 %r1497, 0; // end inline asm // begin inline asm mov.u32 %r1498, 0; // end inline asm // begin inline asm mov.u32 %r1499, 0; // end inline asm // begin inline asm mov.u32 %r1500, 0; // end inline asm // begin inline asm mov.u32 %r1501, 0; // end inline asm // begin inline asm mov.u32 %r1502, 0; // end inline asm // begin inline asm mov.u32 %r1503, 0; // end inline asm // begin inline asm mov.u32 %r1504, 0; // end inline asm // begin inline asm mov.u32 %r1505, 0; // end inline asm // begin inline asm mov.u32 %r1506, 0; // end inline asm // begin inline asm mov.u32 %r1507, 0; // end inline asm // begin inline asm mov.u32 %r1508, 0; // end inline asm // begin inline asm mov.u32 %r1509, 0; // end inline asm // begin inline asm mov.u32 %r1510, 0; // end inline asm // begin inline asm mov.u32 %r1511, 0; // end inline asm // begin inline asm mov.u32 %r1512, 0; // end inline asm // begin inline asm mov.u32 %r1513, 0; // end inline asm // begin inline asm mov.u32 %r1514, 0; // end inline asm // begin inline asm mov.u32 %r1515, 0; // end inline asm // begin inline asm mov.u32 %r1516, 0; // end inline asm // begin inline asm mov.u32 %r1517, 0; // end inline asm // begin inline asm mov.u32 %r1518, 0; // end inline asm // begin inline asm mov.u32 %r1519, 0; // end inline asm // begin inline asm mov.u32 %r1520, 0; // end inline asm // begin inline asm mov.u32 %r1521, 0; // end inline asm // begin inline asm mov.u32 %r1522, 0; // end inline asm // begin inline asm mov.u32 %r1523, 0; // end inline asm // begin inline asm mov.u32 %r1524, 0; // end inline asm // begin inline asm mov.u32 %r1525, 0; // end inline asm // begin inline asm mov.u32 %r1526, 0; // end inline asm // begin inline asm mov.u32 %r1527, 0; // end inline asm // begin inline asm mov.u32 %r1528, 0; // end inline asm // begin inline asm mov.u32 %r1529, 0; // end inline asm // begin inline asm mov.u32 %r1530, 0; // end inline asm // begin inline asm mov.u32 %r1531, 0; // end inline asm // begin inline asm mov.u32 %r1532, 0; // end inline asm // begin inline asm mov.u32 %r1533, 0; // end inline asm // begin inline asm mov.u32 %r1534, 0; // end inline asm // begin inline asm mov.u32 %r1535, 0; // end inline asm // begin inline asm mov.u32 %r1536, 0; // end inline asm // begin inline asm mov.u32 %r1537, 0; // end inline asm // begin inline asm mov.u32 %r1538, 0; // end inline asm // begin inline asm mov.u32 %r1539, 0; // end inline asm // begin inline asm mov.u32 %r1540, 0; // end inline asm // begin inline asm mov.u32 %r1541, 0; // end inline asm mov.b32 %f498, %r1462; mov.b32 %f499, %r1463; mov.b32 %f500, %r1464; mov.b32 %f501, %r1465; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f498, %f499, %f500, %f501}, {%r1454, %r1455, %r1456, %r1457}, {%r2139, %r2138}, {%f498, %f499, %f500, %f501}; // end inline asm mov.b32 %f506, %r1466; mov.b32 %f507, %r1467; mov.b32 %f508, %r1468; mov.b32 %f509, %r1469; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f506, %f507, %f508, %f509}, {%r1454, %r1455, %r1456, %r1457}, {%r2137, %r2136}, {%f506, %f507, %f508, %f509}; // end inline asm mov.b32 %f514, %r1470; mov.b32 %f515, %r1471; mov.b32 %f516, %r1472; mov.b32 %f517, %r1473; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f514, %f515, %f516, %f517}, {%r1454, %r1455, %r1456, %r1457}, {%r2135, %r2134}, {%f514, %f515, %f516, %f517}; // end inline asm mov.b32 %f522, %r1474; mov.b32 %f523, %r1475; mov.b32 %f524, %r1476; mov.b32 %f525, %r1477; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f522, %f523, %f524, %f525}, {%r1454, %r1455, %r1456, %r1457}, {%r2133, %r2132}, {%f522, %f523, %f524, %f525}; // end inline asm mov.b32 %f530, %r1478; mov.b32 %f531, %r1479; mov.b32 %f532, %r1480; mov.b32 %f533, %r1481; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f530, %f531, %f532, %f533}, {%r1454, %r1455, %r1456, %r1457}, {%r2131, %r2130}, {%f530, %f531, %f532, %f533}; // end inline asm mov.b32 %f538, %r1482; mov.b32 %f539, %r1483; mov.b32 %f540, %r1484; mov.b32 %f541, %r1485; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f538, %f539, %f540, %f541}, {%r1454, %r1455, %r1456, %r1457}, {%r2129, %r2128}, {%f538, %f539, %f540, %f541}; // end inline asm mov.b32 %f546, %r1486; mov.b32 %f547, %r1487; mov.b32 %f548, %r1488; mov.b32 %f549, %r1489; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f546, %f547, %f548, %f549}, {%r1454, %r1455, %r1456, %r1457}, {%r2127, %r2126}, {%f546, %f547, %f548, %f549}; // end inline asm mov.b32 %f554, %r1490; mov.b32 %f555, %r1491; mov.b32 %f556, %r1492; mov.b32 %f557, %r1493; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f554, %f555, %f556, %f557}, {%r1454, %r1455, %r1456, %r1457}, {%r2125, %r2124}, {%f554, %f555, %f556, %f557}; // end inline asm mov.b32 %f562, %r1494; mov.b32 %f563, %r1495; mov.b32 %f564, %r1496; mov.b32 %f565, %r1497; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f562, %f563, %f564, %f565}, {%r1454, %r1455, %r1456, %r1457}, {%r2123, %r2122}, {%f562, %f563, %f564, %f565}; // end inline asm mov.b32 %f570, %r1498; mov.b32 %f571, %r1499; mov.b32 %f572, %r1500; mov.b32 %f573, %r1501; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f570, %f571, %f572, %f573}, {%r1454, %r1455, %r1456, %r1457}, {%r2121, %r2120}, {%f570, %f571, %f572, %f573}; // end inline asm mov.b32 %f578, %r1502; mov.b32 %f579, %r1503; mov.b32 %f580, %r1504; mov.b32 %f581, %r1505; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f578, %f579, %f580, %f581}, {%r1458, %r1459, %r1460, %r1461}, {%r2139, %r2138}, {%f578, %f579, %f580, %f581}; // end inline asm mov.b32 %f586, %r1506; mov.b32 %f587, %r1507; mov.b32 %f588, %r1508; mov.b32 %f589, %r1509; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f586, %f587, %f588, %f589}, {%r1458, %r1459, %r1460, %r1461}, {%r2137, %r2136}, {%f586, %f587, %f588, %f589}; // end inline asm mov.b32 %f594, %r1510; mov.b32 %f595, %r1511; mov.b32 %f596, %r1512; mov.b32 %f597, %r1513; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f594, %f595, %f596, %f597}, {%r1458, %r1459, %r1460, %r1461}, {%r2135, %r2134}, {%f594, %f595, %f596, %f597}; // end inline asm mov.b32 %f602, %r1514; mov.b32 %f603, %r1515; mov.b32 %f604, %r1516; mov.b32 %f605, %r1517; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f602, %f603, %f604, %f605}, {%r1458, %r1459, %r1460, %r1461}, {%r2133, %r2132}, {%f602, %f603, %f604, %f605}; // end inline asm mov.b32 %f610, %r1518; mov.b32 %f611, %r1519; mov.b32 %f612, %r1520; mov.b32 %f613, %r1521; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f610, %f611, %f612, %f613}, {%r1458, %r1459, %r1460, %r1461}, {%r2131, %r2130}, {%f610, %f611, %f612, %f613}; // end inline asm mov.b32 %f618, %r1522; mov.b32 %f619, %r1523; mov.b32 %f620, %r1524; mov.b32 %f621, %r1525; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f618, %f619, %f620, %f621}, {%r1458, %r1459, %r1460, %r1461}, {%r2129, %r2128}, {%f618, %f619, %f620, %f621}; // end inline asm mov.b32 %f626, %r1526; mov.b32 %f627, %r1527; mov.b32 %f628, %r1528; mov.b32 %f629, %r1529; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f626, %f627, %f628, %f629}, {%r1458, %r1459, %r1460, %r1461}, {%r2127, %r2126}, {%f626, %f627, %f628, %f629}; // end inline asm mov.b32 %f634, %r1530; mov.b32 %f635, %r1531; mov.b32 %f636, %r1532; mov.b32 %f637, %r1533; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f634, %f635, %f636, %f637}, {%r1458, %r1459, %r1460, %r1461}, {%r2125, %r2124}, {%f634, %f635, %f636, %f637}; // end inline asm mov.b32 %f642, %r1534; mov.b32 %f643, %r1535; mov.b32 %f644, %r1536; mov.b32 %f645, %r1537; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f642, %f643, %f644, %f645}, {%r1458, %r1459, %r1460, %r1461}, {%r2123, %r2122}, {%f642, %f643, %f644, %f645}; // end inline asm mov.b32 %f650, %r1538; mov.b32 %f651, %r1539; mov.b32 %f652, %r1540; mov.b32 %f653, %r1541; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f650, %f651, %f652, %f653}, {%r1458, %r1459, %r1460, %r1461}, {%r2121, %r2120}, {%f650, %f651, %f652, %f653}; // end inline asm setp.equ.ftz.f32 %p132, %f985, 0f00000000; mov.f32 %f1007, 0f3F800000; mov.f32 %f1006, %f1007; @%p132 bra $L__BB0_57; rcp.approx.ftz.f32 %f1006, %f985; $L__BB0_57: setp.equ.ftz.f32 %p133, %f984, 0f00000000; @%p133 bra $L__BB0_59; rcp.approx.ftz.f32 %f1007, %f984; $L__BB0_59: mov.b32 %f661, %r2219; fma.rn.ftz.f32 %f662, %f78, %f661, %f498; mul.ftz.f32 %f663, %f1006, %f662; mov.b32 %r2219, %f663; mov.b32 %f664, %r2218; fma.rn.ftz.f32 %f665, %f78, %f664, %f499; mul.ftz.f32 %f666, %f1006, %f665; mov.b32 %r2218, %f666; mov.b32 %f667, %r2217; fma.rn.ftz.f32 %f668, %f80, %f667, %f500; mul.ftz.f32 %f669, %f1007, %f668; mov.b32 %r2217, %f669; mov.b32 %f670, %r2216; fma.rn.ftz.f32 %f671, %f80, %f670, %f501; mul.ftz.f32 %f672, %f1007, %f671; mov.b32 %r2216, %f672; mov.b32 %f673, %r2215; fma.rn.ftz.f32 %f674, %f78, %f673, %f506; mul.ftz.f32 %f675, %f1006, %f674; mov.b32 %r2215, %f675; mov.b32 %f676, %r2214; fma.rn.ftz.f32 %f677, %f78, %f676, %f507; mul.ftz.f32 %f678, %f1006, %f677; mov.b32 %r2214, %f678; mov.b32 %f679, %r2213; fma.rn.ftz.f32 %f680, %f80, %f679, %f508; mul.ftz.f32 %f681, %f1007, %f680; mov.b32 %r2213, %f681; mov.b32 %f682, %r2212; fma.rn.ftz.f32 %f683, %f80, %f682, %f509; mul.ftz.f32 %f684, %f1007, %f683; mov.b32 %r2212, %f684; mov.b32 %f685, %r2211; fma.rn.ftz.f32 %f686, %f78, %f685, %f514; mul.ftz.f32 %f687, %f1006, %f686; mov.b32 %r2211, %f687; mov.b32 %f688, %r2210; fma.rn.ftz.f32 %f689, %f78, %f688, %f515; mul.ftz.f32 %f690, %f1006, %f689; mov.b32 %r2210, %f690; mov.b32 %f691, %r2209; fma.rn.ftz.f32 %f692, %f80, %f691, %f516; mul.ftz.f32 %f693, %f1007, %f692; mov.b32 %r2209, %f693; mov.b32 %f694, %r2208; fma.rn.ftz.f32 %f695, %f80, %f694, %f517; mul.ftz.f32 %f696, %f1007, %f695; mov.b32 %r2208, %f696; mov.b32 %f697, %r2207; fma.rn.ftz.f32 %f698, %f78, %f697, %f522; mul.ftz.f32 %f699, %f1006, %f698; mov.b32 %r2207, %f699; mov.b32 %f700, %r2206; fma.rn.ftz.f32 %f701, %f78, %f700, %f523; mul.ftz.f32 %f702, %f1006, %f701; mov.b32 %r2206, %f702; mov.b32 %f703, %r2205; fma.rn.ftz.f32 %f704, %f80, %f703, %f524; mul.ftz.f32 %f705, %f1007, %f704; mov.b32 %r2205, %f705; mov.b32 %f706, %r2204; fma.rn.ftz.f32 %f707, %f80, %f706, %f525; mul.ftz.f32 %f708, %f1007, %f707; mov.b32 %r2204, %f708; mov.b32 %f709, %r2203; fma.rn.ftz.f32 %f710, %f78, %f709, %f530; mul.ftz.f32 %f711, %f1006, %f710; mov.b32 %r2203, %f711; mov.b32 %f712, %r2202; fma.rn.ftz.f32 %f713, %f78, %f712, %f531; mul.ftz.f32 %f714, %f1006, %f713; mov.b32 %r2202, %f714; mov.b32 %f715, %r2201; fma.rn.ftz.f32 %f716, %f80, %f715, %f532; mul.ftz.f32 %f717, %f1007, %f716; mov.b32 %r2201, %f717; mov.b32 %f718, %r2200; fma.rn.ftz.f32 %f719, %f80, %f718, %f533; mul.ftz.f32 %f720, %f1007, %f719; mov.b32 %r2200, %f720; mov.b32 %f721, %r2199; fma.rn.ftz.f32 %f722, %f78, %f721, %f538; mul.ftz.f32 %f723, %f1006, %f722; mov.b32 %r2199, %f723; mov.b32 %f724, %r2198; fma.rn.ftz.f32 %f725, %f78, %f724, %f539; mul.ftz.f32 %f726, %f1006, %f725; mov.b32 %r2198, %f726; mov.b32 %f727, %r2197; fma.rn.ftz.f32 %f728, %f80, %f727, %f540; mul.ftz.f32 %f729, %f1007, %f728; mov.b32 %r2197, %f729; mov.b32 %f730, %r2196; fma.rn.ftz.f32 %f731, %f80, %f730, %f541; mul.ftz.f32 %f732, %f1007, %f731; mov.b32 %r2196, %f732; mov.b32 %f733, %r2195; fma.rn.ftz.f32 %f734, %f78, %f733, %f546; mul.ftz.f32 %f735, %f1006, %f734; mov.b32 %r2195, %f735; mov.b32 %f736, %r2194; fma.rn.ftz.f32 %f737, %f78, %f736, %f547; mul.ftz.f32 %f738, %f1006, %f737; mov.b32 %r2194, %f738; mov.b32 %f739, %r2193; fma.rn.ftz.f32 %f740, %f80, %f739, %f548; mul.ftz.f32 %f741, %f1007, %f740; mov.b32 %r2193, %f741; mov.b32 %f742, %r2192; fma.rn.ftz.f32 %f743, %f80, %f742, %f549; mul.ftz.f32 %f744, %f1007, %f743; mov.b32 %r2192, %f744; mov.b32 %f745, %r2191; fma.rn.ftz.f32 %f746, %f78, %f745, %f554; mul.ftz.f32 %f747, %f1006, %f746; mov.b32 %r2191, %f747; mov.b32 %f748, %r2190; fma.rn.ftz.f32 %f749, %f78, %f748, %f555; mul.ftz.f32 %f750, %f1006, %f749; mov.b32 %r2190, %f750; mov.b32 %f751, %r2189; fma.rn.ftz.f32 %f752, %f80, %f751, %f556; mul.ftz.f32 %f753, %f1007, %f752; mov.b32 %r2189, %f753; mov.b32 %f754, %r2188; fma.rn.ftz.f32 %f755, %f80, %f754, %f557; mul.ftz.f32 %f756, %f1007, %f755; mov.b32 %r2188, %f756; mov.b32 %f757, %r2187; fma.rn.ftz.f32 %f758, %f78, %f757, %f562; mul.ftz.f32 %f759, %f1006, %f758; mov.b32 %r2187, %f759; mov.b32 %f760, %r2186; fma.rn.ftz.f32 %f761, %f78, %f760, %f563; mul.ftz.f32 %f762, %f1006, %f761; mov.b32 %r2186, %f762; mov.b32 %f763, %r2185; fma.rn.ftz.f32 %f764, %f80, %f763, %f564; mul.ftz.f32 %f765, %f1007, %f764; mov.b32 %r2185, %f765; mov.b32 %f766, %r2184; fma.rn.ftz.f32 %f767, %f80, %f766, %f565; mul.ftz.f32 %f768, %f1007, %f767; mov.b32 %r2184, %f768; mov.b32 %f769, %r2183; fma.rn.ftz.f32 %f770, %f78, %f769, %f570; mul.ftz.f32 %f771, %f1006, %f770; mov.b32 %r2183, %f771; mov.b32 %f772, %r2182; fma.rn.ftz.f32 %f773, %f78, %f772, %f571; mul.ftz.f32 %f774, %f1006, %f773; mov.b32 %r2182, %f774; mov.b32 %f775, %r2181; fma.rn.ftz.f32 %f776, %f80, %f775, %f572; mul.ftz.f32 %f777, %f1007, %f776; mov.b32 %r2181, %f777; mov.b32 %f778, %r2180; fma.rn.ftz.f32 %f779, %f80, %f778, %f573; mul.ftz.f32 %f780, %f1007, %f779; mov.b32 %r2180, %f780; setp.equ.ftz.f32 %p134, %f983, 0f00000000; mov.f32 %f1009, 0f3F800000; mov.f32 %f1008, %f1009; @%p134 bra $L__BB0_61; rcp.approx.ftz.f32 %f1008, %f983; $L__BB0_61: setp.equ.ftz.f32 %p135, %f982, 0f00000000; @%p135 bra $L__BB0_63; rcp.approx.ftz.f32 %f1009, %f982; $L__BB0_63: add.s32 %r1964, %r2096, 16; setp.ge.s32 %p172, %r1964, %r186; mov.b32 %f782, %r2179; fma.rn.ftz.f32 %f783, %f82, %f782, %f578; mul.ftz.f32 %f784, %f1008, %f783; mov.b32 %r2179, %f784; mov.b32 %f785, %r2178; fma.rn.ftz.f32 %f786, %f82, %f785, %f579; mul.ftz.f32 %f787, %f1008, %f786; mov.b32 %r2178, %f787; mov.b32 %f788, %r2177; fma.rn.ftz.f32 %f789, %f84, %f788, %f580; mul.ftz.f32 %f790, %f1009, %f789; mov.b32 %r2177, %f790; mov.b32 %f791, %r2176; fma.rn.ftz.f32 %f792, %f84, %f791, %f581; mul.ftz.f32 %f793, %f1009, %f792; mov.b32 %r2176, %f793; mov.b32 %f794, %r2175; fma.rn.ftz.f32 %f795, %f82, %f794, %f586; mul.ftz.f32 %f796, %f1008, %f795; mov.b32 %r2175, %f796; mov.b32 %f797, %r2174; fma.rn.ftz.f32 %f798, %f82, %f797, %f587; mul.ftz.f32 %f799, %f1008, %f798; mov.b32 %r2174, %f799; mov.b32 %f800, %r2173; fma.rn.ftz.f32 %f801, %f84, %f800, %f588; mul.ftz.f32 %f802, %f1009, %f801; mov.b32 %r2173, %f802; mov.b32 %f803, %r2172; fma.rn.ftz.f32 %f804, %f84, %f803, %f589; mul.ftz.f32 %f805, %f1009, %f804; mov.b32 %r2172, %f805; mov.b32 %f806, %r2171; fma.rn.ftz.f32 %f807, %f82, %f806, %f594; mul.ftz.f32 %f808, %f1008, %f807; mov.b32 %r2171, %f808; mov.b32 %f809, %r2170; fma.rn.ftz.f32 %f810, %f82, %f809, %f595; mul.ftz.f32 %f811, %f1008, %f810; mov.b32 %r2170, %f811; mov.b32 %f812, %r2169; fma.rn.ftz.f32 %f813, %f84, %f812, %f596; mul.ftz.f32 %f814, %f1009, %f813; mov.b32 %r2169, %f814; mov.b32 %f815, %r2168; fma.rn.ftz.f32 %f816, %f84, %f815, %f597; mul.ftz.f32 %f817, %f1009, %f816; mov.b32 %r2168, %f817; mov.b32 %f818, %r2167; fma.rn.ftz.f32 %f819, %f82, %f818, %f602; mul.ftz.f32 %f820, %f1008, %f819; mov.b32 %r2167, %f820; mov.b32 %f821, %r2166; fma.rn.ftz.f32 %f822, %f82, %f821, %f603; mul.ftz.f32 %f823, %f1008, %f822; mov.b32 %r2166, %f823; mov.b32 %f824, %r2165; fma.rn.ftz.f32 %f825, %f84, %f824, %f604; mul.ftz.f32 %f826, %f1009, %f825; mov.b32 %r2165, %f826; mov.b32 %f827, %r2164; fma.rn.ftz.f32 %f828, %f84, %f827, %f605; mul.ftz.f32 %f829, %f1009, %f828; mov.b32 %r2164, %f829; mov.b32 %f830, %r2163; fma.rn.ftz.f32 %f831, %f82, %f830, %f610; mul.ftz.f32 %f832, %f1008, %f831; mov.b32 %r2163, %f832; mov.b32 %f833, %r2162; fma.rn.ftz.f32 %f834, %f82, %f833, %f611; mul.ftz.f32 %f835, %f1008, %f834; mov.b32 %r2162, %f835; mov.b32 %f836, %r2161; fma.rn.ftz.f32 %f837, %f84, %f836, %f612; mul.ftz.f32 %f838, %f1009, %f837; mov.b32 %r2161, %f838; mov.b32 %f839, %r2160; fma.rn.ftz.f32 %f840, %f84, %f839, %f613; mul.ftz.f32 %f841, %f1009, %f840; mov.b32 %r2160, %f841; mov.b32 %f842, %r2159; fma.rn.ftz.f32 %f843, %f82, %f842, %f618; mul.ftz.f32 %f844, %f1008, %f843; mov.b32 %r2159, %f844; mov.b32 %f845, %r2158; fma.rn.ftz.f32 %f846, %f82, %f845, %f619; mul.ftz.f32 %f847, %f1008, %f846; mov.b32 %r2158, %f847; mov.b32 %f848, %r2157; fma.rn.ftz.f32 %f849, %f84, %f848, %f620; mul.ftz.f32 %f850, %f1009, %f849; mov.b32 %r2157, %f850; mov.b32 %f851, %r2156; fma.rn.ftz.f32 %f852, %f84, %f851, %f621; mul.ftz.f32 %f853, %f1009, %f852; mov.b32 %r2156, %f853; mov.b32 %f854, %r2155; fma.rn.ftz.f32 %f855, %f82, %f854, %f626; mul.ftz.f32 %f856, %f1008, %f855; mov.b32 %r2155, %f856; mov.b32 %f857, %r2154; fma.rn.ftz.f32 %f858, %f82, %f857, %f627; mul.ftz.f32 %f859, %f1008, %f858; mov.b32 %r2154, %f859; mov.b32 %f860, %r2153; fma.rn.ftz.f32 %f861, %f84, %f860, %f628; mul.ftz.f32 %f862, %f1009, %f861; mov.b32 %r2153, %f862; mov.b32 %f863, %r2152; fma.rn.ftz.f32 %f864, %f84, %f863, %f629; mul.ftz.f32 %f865, %f1009, %f864; mov.b32 %r2152, %f865; mov.b32 %f866, %r2151; fma.rn.ftz.f32 %f867, %f82, %f866, %f634; mul.ftz.f32 %f868, %f1008, %f867; mov.b32 %r2151, %f868; mov.b32 %f869, %r2150; fma.rn.ftz.f32 %f870, %f82, %f869, %f635; mul.ftz.f32 %f871, %f1008, %f870; mov.b32 %r2150, %f871; mov.b32 %f872, %r2149; fma.rn.ftz.f32 %f873, %f84, %f872, %f636; mul.ftz.f32 %f874, %f1009, %f873; mov.b32 %r2149, %f874; mov.b32 %f875, %r2148; fma.rn.ftz.f32 %f876, %f84, %f875, %f637; mul.ftz.f32 %f877, %f1009, %f876; mov.b32 %r2148, %f877; mov.b32 %f878, %r2147; fma.rn.ftz.f32 %f879, %f82, %f878, %f642; mul.ftz.f32 %f880, %f1008, %f879; mov.b32 %r2147, %f880; mov.b32 %f881, %r2146; fma.rn.ftz.f32 %f882, %f82, %f881, %f643; mul.ftz.f32 %f883, %f1008, %f882; mov.b32 %r2146, %f883; mov.b32 %f884, %r2145; fma.rn.ftz.f32 %f885, %f84, %f884, %f644; mul.ftz.f32 %f886, %f1009, %f885; mov.b32 %r2145, %f886; mov.b32 %f887, %r2144; fma.rn.ftz.f32 %f888, %f84, %f887, %f645; mul.ftz.f32 %f889, %f1009, %f888; mov.b32 %r2144, %f889; mov.b32 %f890, %r2143; fma.rn.ftz.f32 %f891, %f82, %f890, %f650; mul.ftz.f32 %f892, %f1008, %f891; mov.b32 %r2143, %f892; mov.b32 %f893, %r2142; fma.rn.ftz.f32 %f894, %f82, %f893, %f651; mul.ftz.f32 %f895, %f1008, %f894; mov.b32 %r2142, %f895; mov.b32 %f896, %r2141; fma.rn.ftz.f32 %f897, %f84, %f896, %f652; mul.ftz.f32 %f898, %f1009, %f897; mov.b32 %r2141, %f898; mov.b32 %f899, %r2140; fma.rn.ftz.f32 %f900, %f84, %f899, %f653; mul.ftz.f32 %f901, %f1009, %f900; mov.b32 %r2140, %f901; @%p172 bra $L__BB0_65; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2139, %r2138, %r2137, %r2136}, [%r1032]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2135, %r2134, %r2133, %r2132}, [%r1037]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2131, %r2130, %r2129, %r2128}, [%r1042]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2127, %r2126, %r2125, %r2124}, [%r1047]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2123, %r2122, %r2121, %r2120}, [%r1052]; // end inline asm $L__BB0_65: add.s32 %r2096, %r2096, 16; setp.lt.s32 %p137, %r2096, %r186; add.s32 %r2097, %r2097, 16; mov.f32 %f986, %f61; mov.f32 %f987, %f60; mov.f32 %f988, %f59; mov.f32 %f989, %f58; @%p137 bra $L__BB0_11; $L__BB0_66: mov.u32 %r1972, %tid.x; and.b32 %r1971, %r1972, 96; shr.u32 %r1970, %r1971, 1; mov.b64 %rd234, fmha_v2_flash_attention_fp16_128_16_S_80_sm86_kernel_nl_param_0; mov.u64 %rd233, %rd234; ld.param.u32 %r1969, [%rd233+60]; mul.lo.s32 %r1968, %r1, %r721; mad.lo.s32 %r1967, %r1968, %r2, %r722; mov.u32 %r1966, _ZN25fused_multihead_attention5smem_E; bar.sync 0; mul.lo.s32 %r1806, %r1967, %r1969; shl.b32 %r1807, %r1806, 1; cvt.s64.s32 %rd152, %r1807; add.s64 %rd35, %rd152, %rd39; mov.b32 %f902, %r2218; mov.b32 %f903, %r2219; // begin inline asm cvt.rn.f16x2.f32 %r1702, %f902, %f903; // end inline asm mov.b32 %f904, %r2216; mov.b32 %f905, %r2217; // begin inline asm cvt.rn.f16x2.f32 %r1703, %f904, %f905; // end inline asm shl.b32 %r1810, %r1972, 2; and.b32 %r1811, %r1810, 124; add.s32 %r1813, %r1811, %r1966; and.b32 %r1816, %r1972, 28; shr.u32 %r1817, %r1816, 2; or.b32 %r1818, %r1970, %r1817; shl.b32 %r1819, %r1818, 8; add.s32 %r1820, %r1813, %r1819; add.s32 %r1704, %r1820, 32768; // begin inline asm st.shared.b32 [%r1704], %r1702; // end inline asm add.s32 %r644, %r1820, 34816; // begin inline asm st.shared.b32 [%r644], %r1703; // end inline asm xor.b32 %r1710, %r1704, 16; mov.b32 %f906, %r2214; mov.b32 %f907, %r2215; // begin inline asm cvt.rn.f16x2.f32 %r1708, %f906, %f907; // end inline asm mov.b32 %f908, %r2212; mov.b32 %f909, %r2213; // begin inline asm cvt.rn.f16x2.f32 %r1709, %f908, %f909; // end inline asm // begin inline asm st.shared.b32 [%r1710], %r1708; // end inline asm add.s32 %r1712, %r1710, 2048; // begin inline asm st.shared.b32 [%r1712], %r1709; // end inline asm xor.b32 %r1716, %r1704, 32; mov.b32 %f910, %r2210; mov.b32 %f911, %r2211; // begin inline asm cvt.rn.f16x2.f32 %r1714, %f910, %f911; // end inline asm mov.b32 %f912, %r2208; mov.b32 %f913, %r2209; // begin inline asm cvt.rn.f16x2.f32 %r1715, %f912, %f913; // end inline asm // begin inline asm st.shared.b32 [%r1716], %r1714; // end inline asm add.s32 %r1718, %r1716, 2048; // begin inline asm st.shared.b32 [%r1718], %r1715; // end inline asm xor.b32 %r1722, %r1704, 48; mov.b32 %f914, %r2206; mov.b32 %f915, %r2207; // begin inline asm cvt.rn.f16x2.f32 %r1720, %f914, %f915; // end inline asm mov.b32 %f916, %r2204; mov.b32 %f917, %r2205; // begin inline asm cvt.rn.f16x2.f32 %r1721, %f916, %f917; // end inline asm // begin inline asm st.shared.b32 [%r1722], %r1720; // end inline asm add.s32 %r1724, %r1722, 2048; // begin inline asm st.shared.b32 [%r1724], %r1721; // end inline asm xor.b32 %r1728, %r1704, 64; mov.b32 %f918, %r2202; mov.b32 %f919, %r2203; // begin inline asm cvt.rn.f16x2.f32 %r1726, %f918, %f919; // end inline asm mov.b32 %f920, %r2200; mov.b32 %f921, %r2201; // begin inline asm cvt.rn.f16x2.f32 %r1727, %f920, %f921; // end inline asm // begin inline asm st.shared.b32 [%r1728], %r1726; // end inline asm add.s32 %r1730, %r1728, 2048; // begin inline asm st.shared.b32 [%r1730], %r1727; // end inline asm xor.b32 %r1734, %r1704, 80; mov.b32 %f922, %r2198; mov.b32 %f923, %r2199; // begin inline asm cvt.rn.f16x2.f32 %r1732, %f922, %f923; // end inline asm mov.b32 %f924, %r2196; mov.b32 %f925, %r2197; // begin inline asm cvt.rn.f16x2.f32 %r1733, %f924, %f925; // end inline asm // begin inline asm st.shared.b32 [%r1734], %r1732; // end inline asm add.s32 %r1736, %r1734, 2048; // begin inline asm st.shared.b32 [%r1736], %r1733; // end inline asm xor.b32 %r1740, %r1704, 96; mov.b32 %f926, %r2194; mov.b32 %f927, %r2195; // begin inline asm cvt.rn.f16x2.f32 %r1738, %f926, %f927; // end inline asm mov.b32 %f928, %r2192; mov.b32 %f929, %r2193; // begin inline asm cvt.rn.f16x2.f32 %r1739, %f928, %f929; // end inline asm // begin inline asm st.shared.b32 [%r1740], %r1738; // end inline asm add.s32 %r1742, %r1740, 2048; // begin inline asm st.shared.b32 [%r1742], %r1739; // end inline asm xor.b32 %r1746, %r1704, 112; mov.b32 %f930, %r2190; mov.b32 %f931, %r2191; // begin inline asm cvt.rn.f16x2.f32 %r1744, %f930, %f931; // end inline asm mov.b32 %f932, %r2188; mov.b32 %f933, %r2189; // begin inline asm cvt.rn.f16x2.f32 %r1745, %f932, %f933; // end inline asm // begin inline asm st.shared.b32 [%r1746], %r1744; // end inline asm add.s32 %r1748, %r1746, 2048; // begin inline asm st.shared.b32 [%r1748], %r1745; // end inline asm xor.b32 %r1752, %r1704, 128; mov.b32 %f934, %r2186; mov.b32 %f935, %r2187; // begin inline asm cvt.rn.f16x2.f32 %r1750, %f934, %f935; // end inline asm mov.b32 %f936, %r2184; mov.b32 %f937, %r2185; // begin inline asm cvt.rn.f16x2.f32 %r1751, %f936, %f937; // end inline asm // begin inline asm st.shared.b32 [%r1752], %r1750; // end inline asm add.s32 %r1754, %r1752, 2048; // begin inline asm st.shared.b32 [%r1754], %r1751; // end inline asm xor.b32 %r1758, %r1704, 144; mov.b32 %f938, %r2182; mov.b32 %f939, %r2183; // begin inline asm cvt.rn.f16x2.f32 %r1756, %f938, %f939; // end inline asm mov.b32 %f940, %r2180; mov.b32 %f941, %r2181; // begin inline asm cvt.rn.f16x2.f32 %r1757, %f940, %f941; // end inline asm // begin inline asm st.shared.b32 [%r1758], %r1756; // end inline asm add.s32 %r1760, %r1758, 2048; // begin inline asm st.shared.b32 [%r1760], %r1757; // end inline asm bar.sync 0; // begin inline asm ld.shared.v4.b32 {%r1762, %r1763, %r1764, %r1765}, [%r20]; // end inline asm add.s32 %r1771, %r20, 2048; // begin inline asm ld.shared.v4.b32 {%r1767, %r1768, %r1769, %r1770}, [%r1771]; // end inline asm add.s32 %r1776, %r20, 4096; // begin inline asm ld.shared.v4.b32 {%r1772, %r1773, %r1774, %r1775}, [%r1776]; // end inline asm add.s32 %r1781, %r20, 6144; // begin inline asm ld.shared.v4.b32 {%r1777, %r1778, %r1779, %r1780}, [%r1781]; // end inline asm add.s32 %r1786, %r20, 8192; // begin inline asm ld.shared.v4.b32 {%r1782, %r1783, %r1784, %r1785}, [%r1786]; // end inline asm add.s32 %r1791, %r20, 10240; // begin inline asm ld.shared.v4.b32 {%r1787, %r1788, %r1789, %r1790}, [%r1791]; // end inline asm add.s32 %r1796, %r20, 12288; // begin inline asm ld.shared.v4.b32 {%r1792, %r1793, %r1794, %r1795}, [%r1796]; // end inline asm add.s32 %r1801, %r20, 14336; // begin inline asm ld.shared.v4.b32 {%r1797, %r1798, %r1799, %r1800}, [%r1801]; // end inline asm bar.sync 0; cvt.u32.u64 %r1821, %rd5; setp.ge.s32 %p138, %r1821, %r1; @%p138 bra $L__BB0_89; setp.ge.s32 %p139, %r6, %r14; @%p139 bra $L__BB0_69; mul.lo.s64 %rd154, %rd12, %rd5; add.s64 %rd155, %rd35, %rd154; cvta.to.global.u64 %rd156, %rd13; add.s64 %rd157, %rd156, %rd155; st.global.v4.u32 [%rd157], {%r1762, %r1763, %r1764, %r1765}; $L__BB0_69: add.s32 %r1823, %r1821, 8; setp.ge.s32 %p140, %r1823, %r1; @%p140 bra $L__BB0_89; @%p139 bra $L__BB0_72; add.s64 %rd158, %rd5, 8; mul.lo.s64 %rd159, %rd158, %rd12; add.s64 %rd160, %rd35, %rd159; cvta.to.global.u64 %rd161, %rd13; add.s64 %rd162, %rd161, %rd160; st.global.v4.u32 [%rd162], {%r1767, %r1768, %r1769, %r1770}; $L__BB0_72: add.s32 %r1825, %r1821, 16; setp.ge.s32 %p142, %r1825, %r1; @%p142 bra $L__BB0_89; @%p139 bra $L__BB0_75; add.s64 %rd163, %rd5, 16; mul.lo.s64 %rd164, %rd163, %rd12; add.s64 %rd165, %rd35, %rd164; cvta.to.global.u64 %rd166, %rd13; add.s64 %rd167, %rd166, %rd165; st.global.v4.u32 [%rd167], {%r1772, %r1773, %r1774, %r1775}; $L__BB0_75: add.s32 %r1827, %r1821, 24; setp.ge.s32 %p144, %r1827, %r1; @%p144 bra $L__BB0_89; @%p139 bra $L__BB0_78; add.s64 %rd168, %rd5, 24; mul.lo.s64 %rd169, %rd168, %rd12; add.s64 %rd170, %rd35, %rd169; cvta.to.global.u64 %rd171, %rd13; add.s64 %rd172, %rd171, %rd170; st.global.v4.u32 [%rd172], {%r1777, %r1778, %r1779, %r1780}; $L__BB0_78: add.s32 %r1829, %r1821, 32; setp.ge.s32 %p146, %r1829, %r1; @%p146 bra $L__BB0_89; @%p139 bra $L__BB0_81; add.s64 %rd173, %rd5, 32; mul.lo.s64 %rd174, %rd173, %rd12; add.s64 %rd175, %rd35, %rd174; cvta.to.global.u64 %rd176, %rd13; add.s64 %rd177, %rd176, %rd175; st.global.v4.u32 [%rd177], {%r1782, %r1783, %r1784, %r1785}; $L__BB0_81: add.s32 %r1831, %r1821, 40; setp.ge.s32 %p148, %r1831, %r1; @%p148 bra $L__BB0_89; @%p139 bra $L__BB0_84; add.s64 %rd178, %rd5, 40; mul.lo.s64 %rd179, %rd178, %rd12; add.s64 %rd180, %rd35, %rd179; cvta.to.global.u64 %rd181, %rd13; add.s64 %rd182, %rd181, %rd180; st.global.v4.u32 [%rd182], {%r1787, %r1788, %r1789, %r1790}; $L__BB0_84: add.s32 %r1833, %r1821, 48; setp.ge.s32 %p150, %r1833, %r1; @%p150 bra $L__BB0_89; @%p139 bra $L__BB0_87; add.s64 %rd183, %rd5, 48; mul.lo.s64 %rd184, %rd183, %rd12; add.s64 %rd185, %rd35, %rd184; cvta.to.global.u64 %rd186, %rd13; add.s64 %rd187, %rd186, %rd185; st.global.v4.u32 [%rd187], {%r1792, %r1793, %r1794, %r1795}; $L__BB0_87: add.s32 %r1835, %r1821, 56; setp.ge.s32 %p152, %r1835, %r1; or.pred %p154, %p152, %p139; @%p154 bra $L__BB0_89; add.s64 %rd188, %rd5, 56; mul.lo.s64 %rd189, %rd188, %rd12; add.s64 %rd190, %rd35, %rd189; cvta.to.global.u64 %rd191, %rd13; add.s64 %rd192, %rd191, %rd190; st.global.v4.u32 [%rd192], {%r1797, %r1798, %r1799, %r1800}; $L__BB0_89: mov.b32 %f942, %r2178; mov.b32 %f943, %r2179; // begin inline asm cvt.rn.f16x2.f32 %r1836, %f942, %f943; // end inline asm mov.b32 %f944, %r2176; mov.b32 %f945, %r2177; // begin inline asm cvt.rn.f16x2.f32 %r1837, %f944, %f945; // end inline asm // begin inline asm st.shared.b32 [%r1704], %r1836; // end inline asm // begin inline asm st.shared.b32 [%r644], %r1837; // end inline asm mov.b32 %f946, %r2174; mov.b32 %f947, %r2175; // begin inline asm cvt.rn.f16x2.f32 %r1842, %f946, %f947; // end inline asm mov.b32 %f948, %r2172; mov.b32 %f949, %r2173; // begin inline asm cvt.rn.f16x2.f32 %r1843, %f948, %f949; // end inline asm // begin inline asm st.shared.b32 [%r1710], %r1842; // end inline asm // begin inline asm st.shared.b32 [%r1712], %r1843; // end inline asm mov.b32 %f950, %r2170; mov.b32 %f951, %r2171; // begin inline asm cvt.rn.f16x2.f32 %r1848, %f950, %f951; // end inline asm mov.b32 %f952, %r2168; mov.b32 %f953, %r2169; // begin inline asm cvt.rn.f16x2.f32 %r1849, %f952, %f953; // end inline asm // begin inline asm st.shared.b32 [%r1716], %r1848; // end inline asm // begin inline asm st.shared.b32 [%r1718], %r1849; // end inline asm mov.b32 %f954, %r2166; mov.b32 %f955, %r2167; // begin inline asm cvt.rn.f16x2.f32 %r1854, %f954, %f955; // end inline asm mov.b32 %f956, %r2164; mov.b32 %f957, %r2165; // begin inline asm cvt.rn.f16x2.f32 %r1855, %f956, %f957; // end inline asm // begin inline asm st.shared.b32 [%r1722], %r1854; // end inline asm // begin inline asm st.shared.b32 [%r1724], %r1855; // end inline asm mov.b32 %f958, %r2162; mov.b32 %f959, %r2163; // begin inline asm cvt.rn.f16x2.f32 %r1860, %f958, %f959; // end inline asm mov.b32 %f960, %r2160; mov.b32 %f961, %r2161; // begin inline asm cvt.rn.f16x2.f32 %r1861, %f960, %f961; // end inline asm // begin inline asm st.shared.b32 [%r1728], %r1860; // end inline asm // begin inline asm st.shared.b32 [%r1730], %r1861; // end inline asm mov.b32 %f962, %r2158; mov.b32 %f963, %r2159; // begin inline asm cvt.rn.f16x2.f32 %r1866, %f962, %f963; // end inline asm mov.b32 %f964, %r2156; mov.b32 %f965, %r2157; // begin inline asm cvt.rn.f16x2.f32 %r1867, %f964, %f965; // end inline asm // begin inline asm st.shared.b32 [%r1734], %r1866; // end inline asm // begin inline asm st.shared.b32 [%r1736], %r1867; // end inline asm mov.b32 %f966, %r2154; mov.b32 %f967, %r2155; // begin inline asm cvt.rn.f16x2.f32 %r1872, %f966, %f967; // end inline asm mov.b32 %f968, %r2152; mov.b32 %f969, %r2153; // begin inline asm cvt.rn.f16x2.f32 %r1873, %f968, %f969; // end inline asm // begin inline asm st.shared.b32 [%r1740], %r1872; // end inline asm // begin inline asm st.shared.b32 [%r1742], %r1873; // end inline asm mov.b32 %f970, %r2150; mov.b32 %f971, %r2151; // begin inline asm cvt.rn.f16x2.f32 %r1878, %f970, %f971; // end inline asm mov.b32 %f972, %r2148; mov.b32 %f973, %r2149; // begin inline asm cvt.rn.f16x2.f32 %r1879, %f972, %f973; // end inline asm // begin inline asm st.shared.b32 [%r1746], %r1878; // end inline asm // begin inline asm st.shared.b32 [%r1748], %r1879; // end inline asm mov.b32 %f974, %r2146; mov.b32 %f975, %r2147; // begin inline asm cvt.rn.f16x2.f32 %r1884, %f974, %f975; // end inline asm mov.b32 %f976, %r2144; mov.b32 %f977, %r2145; // begin inline asm cvt.rn.f16x2.f32 %r1885, %f976, %f977; // end inline asm // begin inline asm st.shared.b32 [%r1752], %r1884; // end inline asm // begin inline asm st.shared.b32 [%r1754], %r1885; // end inline asm mov.b32 %f978, %r2142; mov.b32 %f979, %r2143; // begin inline asm cvt.rn.f16x2.f32 %r1890, %f978, %f979; // end inline asm mov.b32 %f980, %r2140; mov.b32 %f981, %r2141; // begin inline asm cvt.rn.f16x2.f32 %r1891, %f980, %f981; // end inline asm // begin inline asm st.shared.b32 [%r1758], %r1890; // end inline asm // begin inline asm st.shared.b32 [%r1760], %r1891; // end inline asm bar.sync 0; // begin inline asm ld.shared.v4.b32 {%r1896, %r1897, %r1898, %r1899}, [%r20]; // end inline asm // begin inline asm ld.shared.v4.b32 {%r1901, %r1902, %r1903, %r1904}, [%r1771]; // end inline asm add.s32 %r1910, %r1771, 2048; // begin inline asm ld.shared.v4.b32 {%r1906, %r1907, %r1908, %r1909}, [%r1910]; // end inline asm add.s32 %r1915, %r1771, 4096; // begin inline asm ld.shared.v4.b32 {%r1911, %r1912, %r1913, %r1914}, [%r1915]; // end inline asm add.s32 %r1920, %r1771, 6144; // begin inline asm ld.shared.v4.b32 {%r1916, %r1917, %r1918, %r1919}, [%r1920]; // end inline asm add.s32 %r1925, %r1771, 8192; // begin inline asm ld.shared.v4.b32 {%r1921, %r1922, %r1923, %r1924}, [%r1925]; // end inline asm add.s32 %r1930, %r1771, 10240; // begin inline asm ld.shared.v4.b32 {%r1926, %r1927, %r1928, %r1929}, [%r1930]; // end inline asm add.s32 %r1935, %r1771, 12288; // begin inline asm ld.shared.v4.b32 {%r1931, %r1932, %r1933, %r1934}, [%r1935]; // end inline asm add.s32 %r1949, %r1821, 64; setp.ge.s32 %p155, %r1949, %r1; @%p155 bra $L__BB0_112; setp.ge.s32 %p156, %r6, %r14; @%p156 bra $L__BB0_92; add.s64 %rd193, %rd5, 64; mul.lo.s64 %rd194, %rd193, %rd12; add.s64 %rd195, %rd35, %rd194; cvta.to.global.u64 %rd196, %rd13; add.s64 %rd197, %rd196, %rd195; st.global.v4.u32 [%rd197], {%r1896, %r1897, %r1898, %r1899}; $L__BB0_92: add.s32 %r1951, %r1821, 72; setp.ge.s32 %p157, %r1951, %r1; @%p157 bra $L__BB0_112; @%p156 bra $L__BB0_95; add.s64 %rd198, %rd5, 72; mul.lo.s64 %rd199, %rd198, %rd12; add.s64 %rd200, %rd35, %rd199; cvta.to.global.u64 %rd201, %rd13; add.s64 %rd202, %rd201, %rd200; st.global.v4.u32 [%rd202], {%r1901, %r1902, %r1903, %r1904}; $L__BB0_95: add.s32 %r1953, %r1821, 80; setp.ge.s32 %p159, %r1953, %r1; @%p159 bra $L__BB0_112; @%p156 bra $L__BB0_98; add.s64 %rd203, %rd5, 80; mul.lo.s64 %rd204, %rd203, %rd12; add.s64 %rd205, %rd35, %rd204; cvta.to.global.u64 %rd206, %rd13; add.s64 %rd207, %rd206, %rd205; st.global.v4.u32 [%rd207], {%r1906, %r1907, %r1908, %r1909}; $L__BB0_98: add.s32 %r1955, %r1821, 88; setp.ge.s32 %p161, %r1955, %r1; @%p161 bra $L__BB0_112; @%p156 bra $L__BB0_101; add.s64 %rd208, %rd5, 88; mul.lo.s64 %rd209, %rd208, %rd12; add.s64 %rd210, %rd35, %rd209; cvta.to.global.u64 %rd211, %rd13; add.s64 %rd212, %rd211, %rd210; st.global.v4.u32 [%rd212], {%r1911, %r1912, %r1913, %r1914}; $L__BB0_101: add.s32 %r1957, %r1821, 96; setp.ge.s32 %p163, %r1957, %r1; @%p163 bra $L__BB0_112; @%p156 bra $L__BB0_104; add.s64 %rd213, %rd5, 96; mul.lo.s64 %rd214, %rd213, %rd12; add.s64 %rd215, %rd35, %rd214; cvta.to.global.u64 %rd216, %rd13; add.s64 %rd217, %rd216, %rd215; st.global.v4.u32 [%rd217], {%r1916, %r1917, %r1918, %r1919}; $L__BB0_104: add.s32 %r1959, %r1821, 104; setp.ge.s32 %p165, %r1959, %r1; @%p165 bra $L__BB0_112; @%p156 bra $L__BB0_107; add.s64 %rd218, %rd5, 104; mul.lo.s64 %rd219, %rd218, %rd12; add.s64 %rd220, %rd35, %rd219; cvta.to.global.u64 %rd221, %rd13; add.s64 %rd222, %rd221, %rd220; st.global.v4.u32 [%rd222], {%r1921, %r1922, %r1923, %r1924}; $L__BB0_107: add.s32 %r1961, %r1821, 112; setp.ge.s32 %p167, %r1961, %r1; @%p167 bra $L__BB0_112; @%p156 bra $L__BB0_110; add.s64 %rd223, %rd5, 112; mul.lo.s64 %rd224, %rd223, %rd12; add.s64 %rd225, %rd35, %rd224; cvta.to.global.u64 %rd226, %rd13; add.s64 %rd227, %rd226, %rd225; st.global.v4.u32 [%rd227], {%r1926, %r1927, %r1928, %r1929}; $L__BB0_110: add.s32 %r1963, %r1821, 120; setp.ge.s32 %p169, %r1963, %r1; or.pred %p171, %p169, %p156; @%p171 bra $L__BB0_112; add.s64 %rd228, %rd5, 120; mul.lo.s64 %rd229, %rd228, %rd12; add.s64 %rd230, %rd35, %rd229; cvta.to.global.u64 %rd231, %rd13; add.s64 %rd232, %rd231, %rd230; st.global.v4.u32 [%rd232], {%r1931, %r1932, %r1933, %r1934}; $L__BB0_112: ret; }