.visible .entry fmha_v2_flash_attention_fp16_64_16_S_160_sm86_kernel_nl(
    // NOTE: the entry directive above is reconstructed from the parameter name;
    // this dump begins mid-".param". The declaration below is as captured.
    .param .align 8 .b8 fmha_v2_flash_attention_fp16_64_16_S_160_sm86_kernel_nl_param_0[168]
)
{
    .reg .pred %p<137>;
    .reg .b16 %rs<75>;
    .reg .f32 %f<866>;
    .reg .b32 %r<2671>;
    .reg .b64 %rd<216>;

    mov.b64 %rd33, fmha_v2_flash_attention_fp16_64_16_S_160_sm86_kernel_nl_param_0;
    mov.u64 %rd1, %rd33;
    ld.param.u32 %r1, [fmha_v2_flash_attention_fp16_64_16_S_160_sm86_kernel_nl_param_0+56];
    ld.param.u32 %r2, [fmha_v2_flash_attention_fp16_64_16_S_160_sm86_kernel_nl_param_0+52];
    // Early exit: %r3 = ctaid.z * 64 is this CTA's first query row; if the
    // sequence length (%r1) does not reach it, skip to the kernel end.
    mov.u32 %r911, %ctaid.z;
    shl.b32 %r3, %r911, 6;
    setp.le.s32 %p5, %r1, %r3;
    @%p5 bra $L__BB0_93;
    mov.u32 %r912, %tid.x;
    mov.u32 %r913, %ctaid.y;
    mov.u32 %r914, %ctaid.x;
    mul.lo.s32 %r915, %r1, %r913;
    mad.lo.s32 %r916, %r915, %r2, %r914;
    // Lane/warp decomposition: %r4 = lane id within the warp, %r6 = warp id.
    shr.s32 %r917, %r912, 31;
    shr.u32 %r918, %r917, 27;
    add.s32 %r919, %r912, %r918;
    and.b32 %r920, %r919, -32;
    sub.s32 %r4, %r912, %r920;
    shr.u32 %r921, %r917, 25;
    add.s32 %r922, %r912, %r921;
    shr.s32 %r923, %r922, 7;
    shl.b32 %r924, %r923, 4;
    shr.s32 %r925, %r4, 31;
    shr.u32 %r926, %r925, 30;
    add.s32 %r927, %r4, %r926;
    and.b32 %r928, %r927, 2147483644;
    sub.s32 %r929, %r4, %r928;
    shl.b32 %r930, %r929, 1;
    add.s32 %r2508, %r930, %r924;
    ld.param.u64 %rd2, [%rd1+16];
    ld.param.u64 %rd3, [%rd1+40];
    shr.s32 %r6, %r919, 5;
    shr.s32 %r931, %r919, 31;
    shr.u32 %r932, %r931, 30;
    add.s32 %r933, %r6, %r932;
    and.b32 %r934, %r933, 268435452;
    sub.s32 %r935, %r6, %r934;
    shl.b32 %r936, %r935, 4;
    shr.s32 %r937, %r927, 2;
    add.s32 %r7, %r936, %r937;
    // In the copy loops below, lanes 0-19 each move 16 bytes per row:
    // 20 lanes x 16 B = 320 B, i.e. one full 160-element fp16 head row.
    // Lanes >= 20 (predicate %p6) sit out.
    setp.gt.s32 %p6, %r4, 19;
    add.s32 %r938, %r6, %r3;
    cvt.s64.s32 %rd4, %r938;
    ld.param.u64 %rd5, [%rd1+24];
    mul.lo.s64 %rd34, %rd5, %rd4;
    // The factor 3 and the +320/+640 offsets further down suggest a packed
    // QKV layout: three interleaved 320-byte (160 x fp16) vectors per token.
    mul.lo.s32 %r939, %r916, 3;
    mul.wide.s32 %rd35, %r939, 320;
    shl.b32 %r940, %r4, 4;
    cvt.s64.s32 %rd36, %r940;
    add.s64 %rd37, %rd35, %rd36;
    add.s64 %rd38, %rd37, %rd34;
    ld.param.u64 %rd39, [%rd1];
    add.s64 %rd6, %rd39, %rd38;    // Q load pointer (offset by the CTA's query tile)
    // Shared-memory write offsets below use an XOR swizzle to avoid bank conflicts.
    shr.u32 %r941, %r931, 29;
    add.s32 %r942, %r6, %r941;
    and.b32 %r943, %r942, 268435448;
    sub.s32 %r944, %r6, %r943;
    xor.b32 %r945, %r944, %r4;
    shl.b32 %r946, %r6, 9;
    shl.b32 %r947, %r945, 4;
    mov.u32 %r948, 31;
    mov.u32 %r949, 0;
    mov.u32 %r950, -1;
    shfl.sync.idx.b32 %r8|%p1, %r949, %r949, %r948, %r950;
    shfl.sync.idx.b32 %r22|%p7, %r949, %r949, %r948, %r950;
    and.b32 %r951, %r912, 96;
    shr.u32 %r952, %r951, 1;
    and.b32 %r953, %r912, 15;
    or.b32 %r954, %r952, %r953;
    and.b32 %r955, %r912, 7;
    shl.b32 %r956, %r912, 4;
    and.b32 %r957, %r956, 112;
    and.b32 %r958, %r912, 16;
    xor.b32 %r959, %r957, %r958;
    cvt.s64.s32 %rd40, %r6;
    mul.lo.s64 %rd41, %rd5, %rd40;
    add.s64 %rd42, %rd37, %rd41;
    add.s64 %rd43, %rd39, %rd42;
    add.s64 %rd215, %rd43, 320;    // K load pointer (token base + 320 B)
    shfl.sync.idx.b32 %r9|%p2, %r949, %r949, %r948, %r950;
    shfl.sync.idx.b32 %r10|%p3, %r949, %r949, %r948, %r950;
    shr.u32 %r960, %r958, 1;
    or.b32 %r961, %r960, %r955;
    and.b32 %r962, %r912, 8;
    shr.u32 %r963, %r962, 3;
    xor.b32 %r964, %r963, %r955;
    add.s64 %rd214, %rd43, 640;    // V load pointer (token base + 640 B)
    shfl.sync.idx.b32 %r965|%p8, %r949, %r949, %r948, %r950;
    shfl.sync.idx.b32 %r11|%p4, %r949, %r949, %r948, %r950;
    ld.param.u64 %rd11, [%rd1+32];
    ld.param.u64 %rd12, [%rd1+8];
    // %r13 = number of valid query rows in this CTA's 64-row tile.
    sub.s32 %r966, %r1, %r3;
    min.s32 %r13, %r966, 64;
    shl.b32 %r970, %r912, 9;
    and.b32 %r971, %r970, 7680;
    shl.b32 %r972, %r964, 4;
    shl.b32 %r973, %r961, 9;
    shl.b32 %r974, %r954, 9;
    add.s32 %r15, %r6, 4;
    add.s32 %r16, %r6, 8;
    add.s32 %r17, %r6, 12;
    add.s32 %r18, %r947, %r946;
    or.b32 %r19, %r974, %r959;
    or.b32 %r20, %r973, %r972;
    or.b32 %r21, %r959, %r971;
    @%p6 bra $L__BB0_3;
    // cp.async staging of the Q tile: the selp 16/0 values computed from
    // %p9-%p20 feed cp.async's src-size operand, so rows beyond the valid
    // count %r13 are zero-filled instead of read.
    shl.b64 %rd60, %rd5, 2;
    add.s32 %r1007, %r6, 60;
    setp.lt.s32 %p9, %r1007, %r13;
    add.s32 %r1008, %r6, 56;
    setp.lt.s32 %p10, %r1008, %r13;
    add.s32 %r1009, %r6, 52;
    setp.lt.s32 %p11, %r1009, %r13;
    add.s32 %r1010,
%r6, 48; setp.lt.s32 %p12, %r1010, %r13; add.s32 %r1011, %r6, 44; setp.lt.s32 %p13, %r1011, %r13; add.s32 %r1012, %r6, 40; setp.lt.s32 %p14, %r1012, %r13; add.s32 %r1013, %r6, 36; setp.lt.s32 %p15, %r1013, %r13; add.s32 %r1014, %r6, 32; setp.lt.s32 %p16, %r1014, %r13; add.s32 %r1015, %r6, 28; setp.lt.s32 %p17, %r1015, %r13; add.s32 %r1016, %r6, 24; setp.lt.s32 %p18, %r1016, %r13; add.s32 %r1017, %r6, 20; setp.lt.s32 %p19, %r1017, %r13; add.s32 %r1018, %r6, 16; setp.lt.s32 %p20, %r1018, %r13; selp.b32 %r986, 16, 0, %p19; selp.b32 %r988, 16, 0, %p18; selp.b32 %r990, 16, 0, %p17; selp.b32 %r992, 16, 0, %p16; selp.b32 %r994, 16, 0, %p15; selp.b32 %r996, 16, 0, %p14; selp.b32 %r998, 16, 0, %p13; selp.b32 %r1000, 16, 0, %p12; selp.b32 %r1002, 16, 0, %p11; mov.u32 %r1019, _ZN25fused_multihead_attention5smem_E; add.s32 %r1020, %r22, %r1019; add.s32 %r975, %r1020, %r18; add.s32 %r1021, %r18, 2048; xor.b32 %r1022, %r1021, 64; add.s32 %r977, %r1020, %r1022; add.s32 %r979, %r975, 4096; add.s32 %r1023, %r18, 6144; xor.b32 %r1024, %r1023, 64; add.s32 %r981, %r1020, %r1024; add.s32 %r983, %r975, 8192; add.s32 %r1025, %r18, 10240; xor.b32 %r1026, %r1025, 64; add.s32 %r985, %r1020, %r1026; add.s32 %r987, %r975, 12288; add.s32 %r1027, %r18, 14336; xor.b32 %r1028, %r1027, 64; add.s32 %r989, %r1020, %r1028; add.s32 %r991, %r975, 16384; add.s32 %r1029, %r18, 18432; xor.b32 %r1030, %r1029, 64; add.s32 %r993, %r1020, %r1030; add.s32 %r995, %r975, 20480; add.s32 %r1031, %r18, 22528; xor.b32 %r1032, %r1031, 64; add.s32 %r997, %r1020, %r1032; add.s32 %r999, %r975, 24576; add.s32 %r1033, %r18, 26624; xor.b32 %r1034, %r1033, 64; add.s32 %r1001, %r1020, %r1034; add.s32 %r1003, %r975, 28672; add.s32 %r1035, %r18, 30720; xor.b32 %r1036, %r1035, 64; add.s32 %r1005, %r1020, %r1036; setp.lt.s32 %p21, %r6, %r13; selp.b32 %r976, 16, 0, %p21; // begin inline asm cp.async.cg.shared.global [%r975], [%rd6], 16, %r976; // end inline asm setp.lt.s32 %p22, %r15, %r13; selp.b32 %r978, 16, 0, %p22; add.s64 %rd45, %rd6, %rd60; // begin inline asm cp.async.cg.shared.global [%r977], [%rd45], 16, %r978; // end inline asm setp.lt.s32 %p23, %r16, %r13; selp.b32 %r980, 16, 0, %p23; add.s64 %rd46, %rd45, %rd60; // begin inline asm cp.async.cg.shared.global [%r979], [%rd46], 16, %r980; // end inline asm setp.lt.s32 %p24, %r17, %r13; selp.b32 %r982, 16, 0, %p24; add.s64 %rd47, %rd46, %rd60; // begin inline asm cp.async.cg.shared.global [%r981], [%rd47], 16, %r982; // end inline asm selp.b32 %r984, 16, 0, %p20; add.s64 %rd48, %rd47, %rd60; // begin inline asm cp.async.cg.shared.global [%r983], [%rd48], 16, %r984; // end inline asm add.s64 %rd49, %rd48, %rd60; // begin inline asm cp.async.cg.shared.global [%r985], [%rd49], 16, %r986; // end inline asm add.s64 %rd50, %rd49, %rd60; // begin inline asm cp.async.cg.shared.global [%r987], [%rd50], 16, %r988; // end inline asm add.s64 %rd51, %rd50, %rd60; // begin inline asm cp.async.cg.shared.global [%r989], [%rd51], 16, %r990; // end inline asm add.s64 %rd52, %rd51, %rd60; // begin inline asm cp.async.cg.shared.global [%r991], [%rd52], 16, %r992; // end inline asm add.s64 %rd53, %rd52, %rd60; // begin inline asm cp.async.cg.shared.global [%r993], [%rd53], 16, %r994; // end inline asm add.s64 %rd54, %rd53, %rd60; // begin inline asm cp.async.cg.shared.global [%r995], [%rd54], 16, %r996; // end inline asm add.s64 %rd55, %rd54, %rd60; // begin inline asm cp.async.cg.shared.global [%r997], [%rd55], 16, %r998; // end inline asm add.s64 %rd56, %rd55, %rd60; // begin inline asm cp.async.cg.shared.global 
[%r999], [%rd56], 16, %r1000; // end inline asm add.s64 %rd57, %rd56, %rd60; // begin inline asm cp.async.cg.shared.global [%r1001], [%rd57], 16, %r1002; // end inline asm selp.b32 %r1004, 16, 0, %p10; add.s64 %rd58, %rd57, %rd60; // begin inline asm cp.async.cg.shared.global [%r1003], [%rd58], 16, %r1004; // end inline asm selp.b32 %r1006, 16, 0, %p9; add.s64 %rd59, %rd58, %rd60; // begin inline asm cp.async.cg.shared.global [%r1005], [%rd59], 16, %r1006; // end inline asm $L__BB0_3: min.s32 %r24, %r1, 16; @%p6 bra $L__BB0_5; shl.b64 %rd65, %rd5, 2; setp.lt.s32 %p26, %r17, %r24; mov.u32 %r1045, _ZN25fused_multihead_attention5smem_E; add.s32 %r1046, %r10, %r1045; add.s32 %r1047, %r1046, 32768; add.s32 %r1037, %r1047, %r18; add.s32 %r1048, %r18, 2048; xor.b32 %r1049, %r1048, 64; add.s32 %r1039, %r1047, %r1049; add.s32 %r1041, %r1037, 4096; add.s32 %r1050, %r18, 6144; xor.b32 %r1051, %r1050, 64; add.s32 %r1043, %r1047, %r1051; setp.lt.s32 %p27, %r6, %r24; selp.b32 %r1038, 16, 0, %p27; // begin inline asm cp.async.cg.shared.global [%r1037], [%rd215], 16, %r1038; // end inline asm setp.lt.s32 %p28, %r15, %r24; selp.b32 %r1040, 16, 0, %p28; add.s64 %rd62, %rd215, %rd65; // begin inline asm cp.async.cg.shared.global [%r1039], [%rd62], 16, %r1040; // end inline asm setp.lt.s32 %p29, %r16, %r24; selp.b32 %r1042, 16, 0, %p29; add.s64 %rd63, %rd62, %rd65; // begin inline asm cp.async.cg.shared.global [%r1041], [%rd63], 16, %r1042; // end inline asm selp.b32 %r1044, 16, 0, %p26; add.s64 %rd64, %rd63, %rd65; // begin inline asm cp.async.cg.shared.global [%r1043], [%rd64], 16, %r1044; // end inline asm $L__BB0_5: @%p6 bra $L__BB0_7; shl.b64 %rd70, %rd5, 2; setp.lt.s32 %p31, %r17, %r24; mov.u32 %r1060, _ZN25fused_multihead_attention5smem_E; add.s32 %r1061, %r11, %r1060; add.s32 %r1062, %r1061, 40960; add.s32 %r1052, %r1062, %r18; add.s32 %r1063, %r18, 2048; xor.b32 %r1064, %r1063, 64; add.s32 %r1054, %r1062, %r1064; add.s32 %r1056, %r1052, 4096; add.s32 %r1065, %r18, 6144; xor.b32 %r1066, %r1065, 64; add.s32 %r1058, %r1062, %r1066; setp.lt.s32 %p32, %r6, %r24; selp.b32 %r1053, 16, 0, %p32; // begin inline asm cp.async.cg.shared.global [%r1052], [%rd214], 16, %r1053; // end inline asm setp.lt.s32 %p33, %r15, %r24; selp.b32 %r1055, 16, 0, %p33; add.s64 %rd67, %rd214, %rd70; // begin inline asm cp.async.cg.shared.global [%r1054], [%rd67], 16, %r1055; // end inline asm setp.lt.s32 %p34, %r16, %r24; selp.b32 %r1057, 16, 0, %p34; add.s64 %rd68, %rd67, %rd70; // begin inline asm cp.async.cg.shared.global [%r1056], [%rd68], 16, %r1057; // end inline asm selp.b32 %r1059, 16, 0, %p31; add.s64 %rd69, %rd68, %rd70; // begin inline asm cp.async.cg.shared.global [%r1058], [%rd69], 16, %r1059; // end inline asm $L__BB0_7: setp.lt.s32 %p35, %r4, 20; // begin inline asm cp.async.commit_group; // end inline asm @%p35 bra $L__BB0_9; mov.u32 %r1187, _ZN25fused_multihead_attention5smem_E; add.s32 %r1188, %r22, %r1187; add.s32 %r1067, %r1188, %r18; add.s32 %r1189, %r18, 2048; xor.b32 %r1190, %r1189, 64; add.s32 %r1072, %r1188, %r1190; add.s32 %r1191, %r18, 4096; add.s32 %r1077, %r1188, %r1191; add.s32 %r1192, %r18, 6144; xor.b32 %r1193, %r1192, 64; add.s32 %r1082, %r1188, %r1193; add.s32 %r1087, %r1067, 8192; add.s32 %r1194, %r18, 10240; xor.b32 %r1195, %r1194, 64; add.s32 %r1092, %r1188, %r1195; add.s32 %r1097, %r1067, 12288; add.s32 %r1196, %r18, 14336; xor.b32 %r1197, %r1196, 64; add.s32 %r1102, %r1188, %r1197; add.s32 %r1107, %r1067, 16384; add.s32 %r1198, %r18, 18432; xor.b32 %r1199, %r1198, 64; add.s32 %r1112, %r1188, 
%r1199; add.s32 %r1117, %r1067, 20480; add.s32 %r1200, %r18, 22528; xor.b32 %r1201, %r1200, 64; add.s32 %r1122, %r1188, %r1201; add.s32 %r1127, %r1067, 24576; add.s32 %r1202, %r18, 26624; xor.b32 %r1203, %r1202, 64; add.s32 %r1132, %r1188, %r1203; add.s32 %r1137, %r1067, 28672; add.s32 %r1204, %r18, 30720; xor.b32 %r1205, %r1204, 64; add.s32 %r1142, %r1188, %r1205; mov.u32 %r1186, 0; // begin inline asm st.shared.v4.b32 [%r1067], {%r1186, %r1186, %r1186, %r1186}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1072], {%r1186, %r1186, %r1186, %r1186}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1077], {%r1186, %r1186, %r1186, %r1186}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1082], {%r1186, %r1186, %r1186, %r1186}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1087], {%r1186, %r1186, %r1186, %r1186}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1092], {%r1186, %r1186, %r1186, %r1186}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1097], {%r1186, %r1186, %r1186, %r1186}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1102], {%r1186, %r1186, %r1186, %r1186}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1107], {%r1186, %r1186, %r1186, %r1186}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1112], {%r1186, %r1186, %r1186, %r1186}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1117], {%r1186, %r1186, %r1186, %r1186}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1122], {%r1186, %r1186, %r1186, %r1186}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1127], {%r1186, %r1186, %r1186, %r1186}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1132], {%r1186, %r1186, %r1186, %r1186}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1137], {%r1186, %r1186, %r1186, %r1186}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1142], {%r1186, %r1186, %r1186, %r1186}; // end inline asm add.s32 %r1206, %r10, %r1187; add.s32 %r1207, %r1206, 32768; add.s32 %r1147, %r1207, %r18; add.s32 %r1152, %r1207, %r1190; add.s32 %r1157, %r1207, %r1191; add.s32 %r1162, %r1207, %r1193; // begin inline asm st.shared.v4.b32 [%r1147], {%r1186, %r1186, %r1186, %r1186}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1152], {%r1186, %r1186, %r1186, %r1186}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1157], {%r1186, %r1186, %r1186, %r1186}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1162], {%r1186, %r1186, %r1186, %r1186}; // end inline asm add.s32 %r1208, %r11, %r1187; add.s32 %r1209, %r1208, 40960; add.s32 %r1167, %r1209, %r18; add.s32 %r1172, %r1209, %r1190; add.s32 %r1177, %r1209, %r1191; add.s32 %r1182, %r1209, %r1193; // begin inline asm st.shared.v4.b32 [%r1167], {%r1186, %r1186, %r1186, %r1186}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1172], {%r1186, %r1186, %r1186, %r1186}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1177], {%r1186, %r1186, %r1186, %r1186}; // end inline asm // begin inline asm st.shared.v4.b32 [%r1182], {%r1186, %r1186, %r1186, %r1186}; // end inline asm $L__BB0_9: // begin inline asm cp.async.wait_group 0; // end inline asm bar.sync 0; mov.u32 %r1440, _ZN25fused_multihead_attention5smem_E; add.s32 %r1441, %r8, %r1440; add.s32 %r1214, %r1441, %r19; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1210, %r1211, %r1212, %r1213}, [%r1214]; // end inline asm xor.b32 %r1442, %r19, 32; add.s32 %r1219, %r1441, %r1442; // begin inline asm 
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1215, %r1216, %r1217, %r1218}, [%r1219]; // end inline asm xor.b32 %r1443, %r19, 64; add.s32 %r1224, %r1441, %r1443; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1220, %r1221, %r1222, %r1223}, [%r1224]; // end inline asm xor.b32 %r1444, %r19, 96; add.s32 %r1229, %r1441, %r1444; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1225, %r1226, %r1227, %r1228}, [%r1229]; // end inline asm or.b32 %r1445, %r19, 128; add.s32 %r1234, %r1441, %r1445; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1230, %r1231, %r1232, %r1233}, [%r1234]; // end inline asm xor.b32 %r1446, %r19, 160; add.s32 %r1239, %r1441, %r1446; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1235, %r1236, %r1237, %r1238}, [%r1239]; // end inline asm xor.b32 %r1447, %r19, 192; add.s32 %r1244, %r1441, %r1447; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1240, %r1241, %r1242, %r1243}, [%r1244]; // end inline asm xor.b32 %r1448, %r19, 224; add.s32 %r1249, %r1441, %r1448; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1245, %r1246, %r1247, %r1248}, [%r1249]; // end inline asm or.b32 %r1449, %r19, 256; add.s32 %r1254, %r1441, %r1449; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1250, %r1251, %r1252, %r1253}, [%r1254]; // end inline asm xor.b32 %r1450, %r19, 288; add.s32 %r1259, %r1441, %r1450; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1255, %r1256, %r1257, %r1258}, [%r1259]; // end inline asm add.s32 %r1451, %r9, %r1440; add.s32 %r66, %r1451, 32768; add.s32 %r1264, %r66, %r20; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2539, %r2538, %r2537, %r2536}, [%r1264]; // end inline asm xor.b32 %r1452, %r20, 32; add.s32 %r1269, %r66, %r1452; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2535, %r2534, %r2533, %r2532}, [%r1269]; // end inline asm xor.b32 %r1453, %r20, 64; add.s32 %r1274, %r66, %r1453; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2531, %r2530, %r2529, %r2528}, [%r1274]; // end inline asm xor.b32 %r1454, %r20, 96; add.s32 %r1279, %r66, %r1454; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2527, %r2526, %r2525, %r2524}, [%r1279]; // end inline asm or.b32 %r1455, %r20, 128; add.s32 %r1284, %r66, %r1455; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2523, %r2522, %r2521, %r2520}, [%r1284]; // end inline asm xor.b32 %r1456, %r20, 160; add.s32 %r1289, %r66, %r1456; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2519, %r2518, %r2517, %r2516}, [%r1289]; // end inline asm xor.b32 %r1457, %r20, 192; add.s32 %r1294, %r66, %r1457; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2515, %r2514, %r2513, %r2512}, [%r1294]; // end inline asm xor.b32 %r1458, %r20, 224; add.s32 %r1299, %r66, %r1458; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2511, %r2540, %r2541, %r2542}, [%r1299]; // end inline asm or.b32 %r1459, %r20, 256; add.s32 %r1304, %r66, %r1459; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2543, %r2544, %r2545, %r2546}, [%r1304]; // end inline asm xor.b32 %r1460, %r20, 288; add.s32 %r1309, %r66, %r1460; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2547, %r2548, %r2549, %r2550}, [%r1309]; // end inline asm add.s32 %r1461, %r1440, 40960; add.s32 %r1314, %r21, %r1461; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2590, %r2589, %r2588, %r2587}, [%r1314]; // 
end inline asm xor.b32 %r1462, %r21, 32; add.s32 %r1319, %r1462, %r1461; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2586, %r2585, %r2584, %r2583}, [%r1319]; // end inline asm xor.b32 %r1463, %r21, 64; add.s32 %r1324, %r1463, %r1461; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2582, %r2581, %r2580, %r2579}, [%r1324]; // end inline asm xor.b32 %r1464, %r21, 96; add.s32 %r1329, %r1464, %r1461; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2578, %r2577, %r2576, %r2575}, [%r1329]; // end inline asm or.b32 %r1465, %r21, 128; add.s32 %r1334, %r1465, %r1461; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2574, %r2573, %r2572, %r2571}, [%r1334]; // end inline asm xor.b32 %r1466, %r21, 160; add.s32 %r1339, %r1466, %r1461; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2570, %r2569, %r2568, %r2567}, [%r1339]; // end inline asm xor.b32 %r1467, %r21, 192; add.s32 %r1344, %r1467, %r1461; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2566, %r2565, %r2564, %r2563}, [%r1344]; // end inline asm xor.b32 %r1468, %r21, 224; add.s32 %r1349, %r1468, %r1461; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2562, %r2561, %r2560, %r2559}, [%r1349]; // end inline asm or.b32 %r1469, %r21, 256; add.s32 %r1354, %r1469, %r1461; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2558, %r2557, %r2556, %r2555}, [%r1354]; // end inline asm xor.b32 %r1470, %r21, 288; add.s32 %r1359, %r1470, %r1461; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2554, %r2553, %r2552, %r2551}, [%r1359]; // end inline asm ld.param.f32 %f1, [%rd1+64]; // begin inline asm mov.u32 %r2670, 0; // end inline asm // begin inline asm mov.u32 %r2669, 0; // end inline asm // begin inline asm mov.u32 %r2668, 0; // end inline asm // begin inline asm mov.u32 %r2667, 0; // end inline asm // begin inline asm mov.u32 %r2666, 0; // end inline asm // begin inline asm mov.u32 %r2665, 0; // end inline asm // begin inline asm mov.u32 %r2664, 0; // end inline asm // begin inline asm mov.u32 %r2663, 0; // end inline asm // begin inline asm mov.u32 %r2662, 0; // end inline asm // begin inline asm mov.u32 %r2661, 0; // end inline asm // begin inline asm mov.u32 %r2660, 0; // end inline asm // begin inline asm mov.u32 %r2659, 0; // end inline asm // begin inline asm mov.u32 %r2658, 0; // end inline asm // begin inline asm mov.u32 %r2657, 0; // end inline asm // begin inline asm mov.u32 %r2656, 0; // end inline asm // begin inline asm mov.u32 %r2655, 0; // end inline asm // begin inline asm mov.u32 %r2654, 0; // end inline asm // begin inline asm mov.u32 %r2653, 0; // end inline asm // begin inline asm mov.u32 %r2652, 0; // end inline asm // begin inline asm mov.u32 %r2651, 0; // end inline asm // begin inline asm mov.u32 %r2650, 0; // end inline asm // begin inline asm mov.u32 %r2649, 0; // end inline asm // begin inline asm mov.u32 %r2648, 0; // end inline asm // begin inline asm mov.u32 %r2647, 0; // end inline asm // begin inline asm mov.u32 %r2646, 0; // end inline asm // begin inline asm mov.u32 %r2645, 0; // end inline asm // begin inline asm mov.u32 %r2644, 0; // end inline asm // begin inline asm mov.u32 %r2643, 0; // end inline asm // begin inline asm mov.u32 %r2642, 0; // end inline asm // begin inline asm mov.u32 %r2641, 0; // end inline asm // begin inline asm mov.u32 %r2640, 0; // end inline asm // begin inline asm mov.u32 %r2639, 0; // end inline asm 
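//
// Accumulator initialization: the long run of `mov.u32 %rNNNN, 0;` statements
// around this point (registers %r2670 down through %r2591) zeroes the 80 b32
// registers that appear to hold the fp32 output accumulator tiles of
// O = softmax(scale * Q*K^T [+ bias]) * V; they are read back as f32 via
// mov.b32 when the accumulator is rescaled in $L__BB0_43. %f1, loaded from
// param+64 above, is likely that softmax scale (commonly 1/sqrt(head_dim)).
//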
// begin inline asm mov.u32 %r2638, 0; // end inline asm // begin inline asm mov.u32 %r2637, 0; // end inline asm // begin inline asm mov.u32 %r2636, 0; // end inline asm // begin inline asm mov.u32 %r2635, 0; // end inline asm // begin inline asm mov.u32 %r2634, 0; // end inline asm // begin inline asm mov.u32 %r2633, 0; // end inline asm // begin inline asm mov.u32 %r2632, 0; // end inline asm // begin inline asm mov.u32 %r2631, 0; // end inline asm // begin inline asm mov.u32 %r2630, 0; // end inline asm // begin inline asm mov.u32 %r2629, 0; // end inline asm // begin inline asm mov.u32 %r2628, 0; // end inline asm // begin inline asm mov.u32 %r2627, 0; // end inline asm // begin inline asm mov.u32 %r2626, 0; // end inline asm // begin inline asm mov.u32 %r2625, 0; // end inline asm // begin inline asm mov.u32 %r2624, 0; // end inline asm // begin inline asm mov.u32 %r2623, 0; // end inline asm // begin inline asm mov.u32 %r2622, 0; // end inline asm // begin inline asm mov.u32 %r2621, 0; // end inline asm // begin inline asm mov.u32 %r2620, 0; // end inline asm // begin inline asm mov.u32 %r2619, 0; // end inline asm // begin inline asm mov.u32 %r2618, 0; // end inline asm // begin inline asm mov.u32 %r2617, 0; // end inline asm // begin inline asm mov.u32 %r2616, 0; // end inline asm // begin inline asm mov.u32 %r2615, 0; // end inline asm // begin inline asm mov.u32 %r2614, 0; // end inline asm // begin inline asm mov.u32 %r2613, 0; // end inline asm // begin inline asm mov.u32 %r2612, 0; // end inline asm // begin inline asm mov.u32 %r2611, 0; // end inline asm // begin inline asm mov.u32 %r2610, 0; // end inline asm // begin inline asm mov.u32 %r2609, 0; // end inline asm // begin inline asm mov.u32 %r2608, 0; // end inline asm // begin inline asm mov.u32 %r2607, 0; // end inline asm // begin inline asm mov.u32 %r2606, 0; // end inline asm // begin inline asm mov.u32 %r2605, 0; // end inline asm // begin inline asm mov.u32 %r2604, 0; // end inline asm // begin inline asm mov.u32 %r2603, 0; // end inline asm // begin inline asm mov.u32 %r2602, 0; // end inline asm // begin inline asm mov.u32 %r2601, 0; // end inline asm // begin inline asm mov.u32 %r2600, 0; // end inline asm // begin inline asm mov.u32 %r2599, 0; // end inline asm // begin inline asm mov.u32 %r2598, 0; // end inline asm // begin inline asm mov.u32 %r2597, 0; // end inline asm // begin inline asm mov.u32 %r2596, 0; // end inline asm // begin inline asm mov.u32 %r2595, 0; // end inline asm // begin inline asm mov.u32 %r2594, 0; // end inline asm // begin inline asm mov.u32 %r2593, 0; // end inline asm // begin inline asm mov.u32 %r2592, 0; // end inline asm // begin inline asm mov.u32 %r2591, 0; // end inline asm add.s32 %r1471, %r1, 15; shr.s32 %r1472, %r1471, 31; shr.u32 %r1473, %r1472, 28; add.s32 %r1474, %r1471, %r1473; and.b32 %r227, %r1474, -16; setp.lt.s32 %p36, %r1, 1; @%p36 bra $L__BB0_46; ld.param.u8 %rs1, [%rd1+160]; add.s32 %r1477, %r10, %r1440; add.s32 %r1478, %r1477, 32768; add.s32 %r348, %r1478, %r18; add.s32 %r1479, %r18, 2048; xor.b32 %r1480, %r1479, 64; add.s32 %r349, %r1478, %r1480; add.s32 %r1481, %r18, 4096; add.s32 %r350, %r1478, %r1481; add.s32 %r1482, %r18, 6144; xor.b32 %r1483, %r1482, 64; add.s32 %r351, %r1478, %r1483; cvt.s64.s32 %rd13, %r7; cvt.s64.s32 %rd14, %r2508; add.s32 %r1484, %r11, %r1440; add.s32 %r1485, %r1484, 40960; add.s32 %r352, %r1485, %r18; add.s32 %r353, %r1485, %r1480; add.s32 %r354, %r1485, %r1481; add.s32 %r355, %r1485, %r1483; add.s32 %r1486, %r2508, 1; cvt.s64.s32 
%rd15, %r1486; add.s32 %r1487, %r2508, 8; cvt.s64.s32 %rd16, %r1487; add.s32 %r1488, %r2508, 9; cvt.s64.s32 %rd17, %r1488; add.s32 %r356, %r7, 8; mov.u32 %r2507, 0; mov.f32 %f854, 0fFF800000; mov.f32 %f852, 0f00000000; mov.f32 %f853, %f852; mov.f32 %f855, %f854; mov.u32 %r2509, %r1; mov.u32 %r2510, %r1; $L__BB0_11: add.s32 %r1489, %r2507, 16; setp.ge.s32 %p37, %r1489, %r227; @%p37 bra $L__BB0_18; bar.sync 0; shl.b64 %rd71, %rd5, 4; add.s64 %rd215, %rd215, %rd71; add.s32 %r2510, %r2510, -16; @%p6 bra $L__BB0_14; min.s32 %r1498, %r2510, 16; setp.lt.s32 %p39, %r17, %r1498; setp.lt.s32 %p40, %r16, %r1498; setp.lt.s32 %p41, %r15, %r1498; setp.lt.s32 %p42, %r6, %r1498; selp.b32 %r1491, 16, 0, %p42; // begin inline asm cp.async.cg.shared.global [%r348], [%rd215], 16, %r1491; // end inline asm selp.b32 %r1493, 16, 0, %p41; shl.b64 %rd76, %rd5, 2; add.s64 %rd73, %rd215, %rd76; // begin inline asm cp.async.cg.shared.global [%r349], [%rd73], 16, %r1493; // end inline asm selp.b32 %r1495, 16, 0, %p40; add.s64 %rd74, %rd73, %rd76; // begin inline asm cp.async.cg.shared.global [%r350], [%rd74], 16, %r1495; // end inline asm selp.b32 %r1497, 16, 0, %p39; add.s64 %rd75, %rd74, %rd76; // begin inline asm cp.async.cg.shared.global [%r351], [%rd75], 16, %r1497; // end inline asm $L__BB0_14: add.s64 %rd214, %rd214, %rd71; add.s32 %r2509, %r2509, -16; @%p6 bra $L__BB0_16; min.s32 %r1510, %r2509, 16; setp.lt.s32 %p44, %r17, %r1510; setp.lt.s32 %p45, %r16, %r1510; setp.lt.s32 %p46, %r15, %r1510; setp.lt.s32 %p47, %r6, %r1510; selp.b32 %r1503, 16, 0, %p47; // begin inline asm cp.async.cg.shared.global [%r352], [%rd214], 16, %r1503; // end inline asm selp.b32 %r1505, 16, 0, %p46; shl.b64 %rd82, %rd5, 2; add.s64 %rd79, %rd214, %rd82; // begin inline asm cp.async.cg.shared.global [%r353], [%rd79], 16, %r1505; // end inline asm selp.b32 %r1507, 16, 0, %p45; add.s64 %rd80, %rd79, %rd82; // begin inline asm cp.async.cg.shared.global [%r354], [%rd80], 16, %r1507; // end inline asm selp.b32 %r1509, 16, 0, %p44; add.s64 %rd81, %rd80, %rd82; // begin inline asm cp.async.cg.shared.global [%r355], [%rd81], 16, %r1509; // end inline asm $L__BB0_16: // begin inline asm cp.async.commit_group; // end inline asm @%p35 bra $L__BB0_18; mov.u32 %r1553, 0; // begin inline asm st.shared.v4.b32 [%r348], {%r1553, %r1553, %r1553, %r1553}; // end inline asm // begin inline asm st.shared.v4.b32 [%r349], {%r1553, %r1553, %r1553, %r1553}; // end inline asm // begin inline asm st.shared.v4.b32 [%r350], {%r1553, %r1553, %r1553, %r1553}; // end inline asm // begin inline asm st.shared.v4.b32 [%r351], {%r1553, %r1553, %r1553, %r1553}; // end inline asm // begin inline asm st.shared.v4.b32 [%r352], {%r1553, %r1553, %r1553, %r1553}; // end inline asm // begin inline asm st.shared.v4.b32 [%r353], {%r1553, %r1553, %r1553, %r1553}; // end inline asm // begin inline asm st.shared.v4.b32 [%r354], {%r1553, %r1553, %r1553, %r1553}; // end inline asm // begin inline asm st.shared.v4.b32 [%r355], {%r1553, %r1553, %r1553, %r1553}; // end inline asm $L__BB0_18: setp.eq.s16 %p49, %rs1, 0; @%p49 bra $L__BB0_35; cvt.s64.s32 %rd83, %r3; add.s64 %rd84, %rd13, %rd83; cvt.s64.s32 %rd85, %r1; setp.ge.u64 %p50, %rd84, %rd85; mul.lo.s32 %r1556, %r1, %r3; cvt.s64.s32 %rd86, %r1556; cvt.u64.u32 %rd24, %r2507; add.s64 %rd87, %rd86, %rd24; add.s64 %rd25, %rd87, %rd14; mul.lo.s64 %rd88, %rd85, %rd13; add.s64 %rd89, %rd25, %rd88; add.s64 %rd26, %rd14, %rd24; setp.ge.u64 %p51, %rd26, %rd85; shl.b64 %rd90, %rd89, 1; mad.lo.s32 %r1559, %r2, %r913, %r914; cvt.s64.s32 %rd91, 
%r1559; mul.lo.s64 %rd92, %rd3, %rd91; add.s64 %rd93, %rd92, %rd90; cvta.to.global.u64 %rd94, %rd2; add.s64 %rd27, %rd94, %rd93; mov.u16 %rs68, 0; or.pred %p52, %p51, %p50; mov.u16 %rs67, %rs68; @%p52 bra $L__BB0_21; ld.global.u16 %rs67, [%rd27]; $L__BB0_21: add.s64 %rd28, %rd15, %rd24; setp.ge.u64 %p54, %rd28, %rd85; or.pred %p55, %p54, %p50; @%p55 bra $L__BB0_23; ld.global.u16 %rs68, [%rd27+2]; $L__BB0_23: add.s64 %rd29, %rd16, %rd24; setp.ge.u64 %p57, %rd29, %rd85; mov.u16 %rs70, 0; or.pred %p58, %p57, %p50; mov.u16 %rs69, %rs70; @%p58 bra $L__BB0_25; ld.global.u16 %rs69, [%rd27+16]; $L__BB0_25: add.s64 %rd30, %rd17, %rd24; setp.ge.u64 %p60, %rd30, %rd85; or.pred %p61, %p60, %p50; @%p61 bra $L__BB0_27; ld.global.u16 %rs70, [%rd27+18]; $L__BB0_27: cvt.s64.s32 %rd106, %r356; add.s64 %rd107, %rd106, %rd83; setp.ge.u64 %p63, %rd107, %rd85; mul.wide.s32 %rd108, %r1, %r356; add.s64 %rd109, %rd25, %rd108; shl.b64 %rd110, %rd109, 1; add.s64 %rd113, %rd92, %rd110; add.s64 %rd31, %rd94, %rd113; mov.u16 %rs72, 0; or.pred %p64, %p51, %p63; mov.u16 %rs71, %rs72; @%p64 bra $L__BB0_29; ld.global.u16 %rs71, [%rd31]; $L__BB0_29: or.pred %p67, %p54, %p63; @%p67 bra $L__BB0_31; ld.global.u16 %rs72, [%rd31+2]; $L__BB0_31: mov.u16 %rs74, 0; or.pred %p70, %p57, %p63; mov.u16 %rs73, %rs74; @%p70 bra $L__BB0_33; ld.global.u16 %rs73, [%rd31+16]; $L__BB0_33: or.pred %p73, %p60, %p63; @%p73 bra $L__BB0_35; ld.global.u16 %rs74, [%rd31+18]; $L__BB0_35: // begin inline asm mov.u32 %r1577, 0; // end inline asm // begin inline asm mov.u32 %r1578, 0; // end inline asm // begin inline asm mov.u32 %r1579, 0; // end inline asm // begin inline asm mov.u32 %r1580, 0; // end inline asm // begin inline asm mov.u32 %r1581, 0; // end inline asm // begin inline asm mov.u32 %r1582, 0; // end inline asm // begin inline asm mov.u32 %r1583, 0; // end inline asm // begin inline asm mov.u32 %r1584, 0; // end inline asm mov.b32 %f148, %r1577; mov.b32 %f149, %r1578; mov.b32 %f150, %r1579; mov.b32 %f151, %r1580; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f148, %f149, %f150, %f151}, {%r1210, %r1211, %r1212, %r1213}, {%r2539, %r2538}, {%f148, %f149, %f150, %f151}; // end inline asm mov.b32 %f156, %r1581; mov.b32 %f157, %r1582; mov.b32 %f158, %r1583; mov.b32 %f159, %r1584; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f156, %f157, %f158, %f159}, {%r1210, %r1211, %r1212, %r1213}, {%r2537, %r2536}, {%f156, %f157, %f158, %f159}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f148, %f149, %f150, %f151}, {%r1215, %r1216, %r1217, %r1218}, {%r2535, %r2534}, {%f148, %f149, %f150, %f151}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f156, %f157, %f158, %f159}, {%r1215, %r1216, %r1217, %r1218}, {%r2533, %r2532}, {%f156, %f157, %f158, %f159}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f148, %f149, %f150, %f151}, {%r1220, %r1221, %r1222, %r1223}, {%r2531, %r2530}, {%f148, %f149, %f150, %f151}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f156, %f157, %f158, %f159}, {%r1220, %r1221, %r1222, %r1223}, {%r2529, %r2528}, {%f156, %f157, %f158, %f159}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f148, %f149, %f150, %f151}, {%r1225, %r1226, %r1227, %r1228}, {%r2527, %r2526}, {%f148, %f149, %f150, %f151}; // end inline asm // begin inline asm 
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f156, %f157, %f158, %f159}, {%r1225, %r1226, %r1227, %r1228}, {%r2525, %r2524}, {%f156, %f157, %f158, %f159}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f148, %f149, %f150, %f151}, {%r1230, %r1231, %r1232, %r1233}, {%r2523, %r2522}, {%f148, %f149, %f150, %f151}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f156, %f157, %f158, %f159}, {%r1230, %r1231, %r1232, %r1233}, {%r2521, %r2520}, {%f156, %f157, %f158, %f159}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f148, %f149, %f150, %f151}, {%r1235, %r1236, %r1237, %r1238}, {%r2519, %r2518}, {%f148, %f149, %f150, %f151}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f156, %f157, %f158, %f159}, {%r1235, %r1236, %r1237, %r1238}, {%r2517, %r2516}, {%f156, %f157, %f158, %f159}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f148, %f149, %f150, %f151}, {%r1240, %r1241, %r1242, %r1243}, {%r2515, %r2514}, {%f148, %f149, %f150, %f151}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f156, %f157, %f158, %f159}, {%r1240, %r1241, %r1242, %r1243}, {%r2513, %r2512}, {%f156, %f157, %f158, %f159}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f148, %f149, %f150, %f151}, {%r1245, %r1246, %r1247, %r1248}, {%r2511, %r2540}, {%f148, %f149, %f150, %f151}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f156, %f157, %f158, %f159}, {%r1245, %r1246, %r1247, %r1248}, {%r2541, %r2542}, {%f156, %f157, %f158, %f159}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f148, %f149, %f150, %f151}, {%r1250, %r1251, %r1252, %r1253}, {%r2543, %r2544}, {%f148, %f149, %f150, %f151}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f156, %f157, %f158, %f159}, {%r1250, %r1251, %r1252, %r1253}, {%r2545, %r2546}, {%f156, %f157, %f158, %f159}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f148, %f149, %f150, %f151}, {%r1255, %r1256, %r1257, %r1258}, {%r2547, %r2548}, {%f148, %f149, %f150, %f151}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f156, %f157, %f158, %f159}, {%r1255, %r1256, %r1257, %r1258}, {%r2549, %r2550}, {%f156, %f157, %f158, %f159}; // end inline asm mul.ftz.f32 %f292, %f1, %f148; mul.ftz.f32 %f293, %f1, %f149; mul.ftz.f32 %f294, %f1, %f156; mul.ftz.f32 %f295, %f1, %f157; mul.ftz.f32 %f296, %f1, %f150; mul.ftz.f32 %f297, %f1, %f151; mul.ftz.f32 %f298, %f1, %f158; mul.ftz.f32 %f299, %f1, %f159; setp.lt.s32 %p74, %r2508, %r1; selp.f32 %f863, %f292, 0fFF800000, %p74; add.s32 %r1705, %r2508, 1; setp.lt.s32 %p75, %r1705, %r1; selp.f32 %f862, %f293, 0fFF800000, %p75; add.s32 %r1706, %r2508, 8; setp.lt.s32 %p76, %r1706, %r1; selp.f32 %f861, %f294, 0fFF800000, %p76; add.s32 %r1707, %r2508, 9; setp.lt.s32 %p77, %r1707, %r1; selp.f32 %f860, %f295, 0fFF800000, %p77; selp.f32 %f859, %f296, 0fFF800000, %p74; selp.f32 %f858, %f297, 0fFF800000, %p75; selp.f32 %f857, %f298, 0fFF800000, %p76; selp.f32 %f856, %f299, 0fFF800000, %p77; @%p49 bra $L__BB0_37; // begin inline asm cvt.f32.f16 %f300, %rs67; // end inline asm add.ftz.f32 %f863, %f300, %f863; // begin inline asm cvt.f32.f16 %f301, %rs68; // end inline asm 
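// (Optional attention-bias add, continued: %rs67-%rs74 hold fp16 bias values
// fetched earlier only when the flag byte at param+160 (%rs1) is nonzero;
// each is widened to f32 and added to the corresponding scaled logit.
// Out-of-range logits were already forced to -inf (0fFF800000) by the
// selp.f32 guards above.)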
add.ftz.f32 %f862, %f301, %f862; // begin inline asm cvt.f32.f16 %f302, %rs69; // end inline asm add.ftz.f32 %f861, %f302, %f861; // begin inline asm cvt.f32.f16 %f303, %rs70; // end inline asm add.ftz.f32 %f860, %f303, %f860; // begin inline asm cvt.f32.f16 %f304, %rs71; // end inline asm add.ftz.f32 %f859, %f304, %f859; // begin inline asm cvt.f32.f16 %f305, %rs72; // end inline asm add.ftz.f32 %f858, %f305, %f858; // begin inline asm cvt.f32.f16 %f306, %rs73; // end inline asm add.ftz.f32 %f857, %f306, %f857; // begin inline asm cvt.f32.f16 %f307, %rs74; // end inline asm add.ftz.f32 %f856, %f307, %f856; $L__BB0_37: add.s32 %r2324, %r2507, 16; setp.ge.s32 %p135, %r2324, %r227; setp.gt.ftz.f32 %p80, %f863, %f862; selp.f32 %f308, %f863, %f862, %p80; setp.gt.ftz.f32 %p81, %f308, %f861; selp.f32 %f309, %f308, %f861, %p81; setp.gt.ftz.f32 %p82, %f309, %f860; selp.f32 %f310, %f309, %f860, %p82; setp.gt.ftz.f32 %p83, %f859, %f858; selp.f32 %f311, %f859, %f858, %p83; setp.gt.ftz.f32 %p84, %f311, %f857; selp.f32 %f312, %f311, %f857, %p84; setp.gt.ftz.f32 %p85, %f312, %f856; selp.f32 %f313, %f312, %f856, %p85; mov.b32 %r1709, %f310; mov.u32 %r1710, 31; mov.u32 %r1711, 1; mov.u32 %r1712, -1; shfl.sync.bfly.b32 %r1713|%p86, %r1709, %r1711, %r1710, %r1712; mov.b32 %f314, %r1713; setp.gt.ftz.f32 %p87, %f310, %f314; selp.f32 %f315, %f310, %f314, %p87; mov.b32 %r1714, %f315; mov.u32 %r1715, 2; shfl.sync.bfly.b32 %r1716|%p88, %r1714, %r1715, %r1710, %r1712; mov.b32 %f316, %r1716; setp.gt.ftz.f32 %p89, %f315, %f316; selp.f32 %f317, %f315, %f316, %p89; mov.b32 %r1717, %f313; shfl.sync.bfly.b32 %r1718|%p90, %r1717, %r1711, %r1710, %r1712; mov.b32 %f318, %r1718; setp.gt.ftz.f32 %p91, %f313, %f318; selp.f32 %f319, %f313, %f318, %p91; mov.b32 %r1719, %f319; shfl.sync.bfly.b32 %r1720|%p92, %r1719, %r1715, %r1710, %r1712; mov.b32 %f320, %r1720; setp.gt.ftz.f32 %p93, %f319, %f320; selp.f32 %f321, %f319, %f320, %p93; max.ftz.f32 %f30, %f317, %f855; max.ftz.f32 %f31, %f321, %f854; sub.ftz.f32 %f322, %f863, %f30; mul.ftz.f32 %f323, %f322, 0f3FB8AA3B; ex2.approx.ftz.f32 %f32, %f323; sub.ftz.f32 %f324, %f862, %f30; mul.ftz.f32 %f325, %f324, 0f3FB8AA3B; ex2.approx.ftz.f32 %f33, %f325; sub.ftz.f32 %f326, %f861, %f30; mul.ftz.f32 %f327, %f326, 0f3FB8AA3B; ex2.approx.ftz.f32 %f34, %f327; sub.ftz.f32 %f328, %f860, %f30; mul.ftz.f32 %f329, %f328, 0f3FB8AA3B; ex2.approx.ftz.f32 %f35, %f329; sub.ftz.f32 %f330, %f859, %f31; mul.ftz.f32 %f331, %f330, 0f3FB8AA3B; ex2.approx.ftz.f32 %f36, %f331; sub.ftz.f32 %f332, %f858, %f31; mul.ftz.f32 %f333, %f332, 0f3FB8AA3B; ex2.approx.ftz.f32 %f37, %f333; sub.ftz.f32 %f334, %f857, %f31; mul.ftz.f32 %f335, %f334, 0f3FB8AA3B; ex2.approx.ftz.f32 %f38, %f335; sub.ftz.f32 %f336, %f856, %f31; mul.ftz.f32 %f337, %f336, 0f3FB8AA3B; ex2.approx.ftz.f32 %f39, %f337; add.ftz.f32 %f338, %f32, %f33; add.ftz.f32 %f339, %f338, 0f00000000; add.ftz.f32 %f340, %f34, %f35; add.ftz.f32 %f341, %f340, 0f00000000; add.ftz.f32 %f342, %f339, %f341; add.ftz.f32 %f343, %f36, %f37; add.ftz.f32 %f344, %f343, 0f00000000; add.ftz.f32 %f345, %f38, %f39; add.ftz.f32 %f346, %f345, 0f00000000; add.ftz.f32 %f347, %f344, %f346; mov.b32 %r1721, %f342; shfl.sync.bfly.b32 %r1722|%p94, %r1721, %r1711, %r1710, %r1712; mov.b32 %f348, %r1722; add.ftz.f32 %f349, %f342, %f348; mov.b32 %r1723, %f349; shfl.sync.bfly.b32 %r1724|%p95, %r1723, %r1715, %r1710, %r1712; mov.b32 %f350, %r1724; add.ftz.f32 %f351, %f349, %f350; mov.b32 %r1725, %f347; shfl.sync.bfly.b32 %r1726|%p96, %r1725, %r1711, %r1710, %r1712; mov.b32 %f352, %r1726; 
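// Online-softmax bookkeeping: 0f3FB8AA3B is log2(e), so the mul.ftz +
// ex2.approx.ftz pairs above compute exp(x - row_max). The shfl.sync.bfly
// exchanges (lane masks 1, then 2) reduce the row maximum and, here, the row
// sum across the four lanes sharing an MMA-tile row; %f30/%f31 carry the
// running maxima across key-tile iterations. The probabilities are then
// repacked to fp16 (cvt.rn.f16x2.f32 in $L__BB0_39) and used as the A
// operand of the P*V m16n8k16 MMAs against the V tiles that were loaded
// with ldmatrix ... .trans.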
add.ftz.f32 %f353, %f347, %f352; mov.b32 %r1727, %f353; shfl.sync.bfly.b32 %r1728|%p97, %r1727, %r1715, %r1710, %r1712; mov.b32 %f354, %r1728; add.ftz.f32 %f355, %f353, %f354; sub.ftz.f32 %f356, %f855, %f30; mul.ftz.f32 %f357, %f356, 0f3FB8AA3B; ex2.approx.ftz.f32 %f358, %f357; mul.ftz.f32 %f40, %f358, %f853; add.ftz.f32 %f853, %f40, %f351; sub.ftz.f32 %f359, %f854, %f31; mul.ftz.f32 %f360, %f359, 0f3FB8AA3B; ex2.approx.ftz.f32 %f361, %f360; mul.ftz.f32 %f42, %f361, %f852; add.ftz.f32 %f852, %f42, %f355; @%p135 bra $L__BB0_39; // begin inline asm cp.async.wait_group 0; // end inline asm bar.sync 0; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2539, %r2538, %r2537, %r2536}, [%r1264]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2535, %r2534, %r2533, %r2532}, [%r1269]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2531, %r2530, %r2529, %r2528}, [%r1274]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2527, %r2526, %r2525, %r2524}, [%r1279]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2523, %r2522, %r2521, %r2520}, [%r1284]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2519, %r2518, %r2517, %r2516}, [%r1289]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2515, %r2514, %r2513, %r2512}, [%r1294]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2511, %r2540, %r2541, %r2542}, [%r1299]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2543, %r2544, %r2545, %r2546}, [%r1304]; // end inline asm // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r2547, %r2548, %r2549, %r2550}, [%r1309]; // end inline asm $L__BB0_39: // begin inline asm cvt.rn.f16x2.f32 %r1799, %f33, %f32; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1800, %f37, %f36; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1801, %f35, %f34; // end inline asm // begin inline asm cvt.rn.f16x2.f32 %r1802, %f39, %f38; // end inline asm // begin inline asm mov.u32 %r1803, 0; // end inline asm // begin inline asm mov.u32 %r1804, 0; // end inline asm // begin inline asm mov.u32 %r1805, 0; // end inline asm // begin inline asm mov.u32 %r1806, 0; // end inline asm // begin inline asm mov.u32 %r1807, 0; // end inline asm // begin inline asm mov.u32 %r1808, 0; // end inline asm // begin inline asm mov.u32 %r1809, 0; // end inline asm // begin inline asm mov.u32 %r1810, 0; // end inline asm // begin inline asm mov.u32 %r1811, 0; // end inline asm // begin inline asm mov.u32 %r1812, 0; // end inline asm // begin inline asm mov.u32 %r1813, 0; // end inline asm // begin inline asm mov.u32 %r1814, 0; // end inline asm // begin inline asm mov.u32 %r1815, 0; // end inline asm // begin inline asm mov.u32 %r1816, 0; // end inline asm // begin inline asm mov.u32 %r1817, 0; // end inline asm // begin inline asm mov.u32 %r1818, 0; // end inline asm // begin inline asm mov.u32 %r1819, 0; // end inline asm // begin inline asm mov.u32 %r1820, 0; // end inline asm // begin inline asm mov.u32 %r1821, 0; // end inline asm // begin inline asm mov.u32 %r1822, 0; // end inline asm // begin inline asm mov.u32 %r1823, 0; // end inline asm // begin inline asm mov.u32 %r1824, 0; // end inline asm // begin inline asm mov.u32 %r1825, 0; // end inline asm // begin inline asm mov.u32 %r1826, 0; // end inline asm // begin inline asm mov.u32 %r1827, 0; 
// end inline asm // begin inline asm mov.u32 %r1828, 0; // end inline asm // begin inline asm mov.u32 %r1829, 0; // end inline asm // begin inline asm mov.u32 %r1830, 0; // end inline asm // begin inline asm mov.u32 %r1831, 0; // end inline asm // begin inline asm mov.u32 %r1832, 0; // end inline asm // begin inline asm mov.u32 %r1833, 0; // end inline asm // begin inline asm mov.u32 %r1834, 0; // end inline asm // begin inline asm mov.u32 %r1835, 0; // end inline asm // begin inline asm mov.u32 %r1836, 0; // end inline asm // begin inline asm mov.u32 %r1837, 0; // end inline asm // begin inline asm mov.u32 %r1838, 0; // end inline asm // begin inline asm mov.u32 %r1839, 0; // end inline asm // begin inline asm mov.u32 %r1840, 0; // end inline asm // begin inline asm mov.u32 %r1841, 0; // end inline asm // begin inline asm mov.u32 %r1842, 0; // end inline asm // begin inline asm mov.u32 %r1843, 0; // end inline asm // begin inline asm mov.u32 %r1844, 0; // end inline asm // begin inline asm mov.u32 %r1845, 0; // end inline asm // begin inline asm mov.u32 %r1846, 0; // end inline asm // begin inline asm mov.u32 %r1847, 0; // end inline asm // begin inline asm mov.u32 %r1848, 0; // end inline asm // begin inline asm mov.u32 %r1849, 0; // end inline asm // begin inline asm mov.u32 %r1850, 0; // end inline asm // begin inline asm mov.u32 %r1851, 0; // end inline asm // begin inline asm mov.u32 %r1852, 0; // end inline asm // begin inline asm mov.u32 %r1853, 0; // end inline asm // begin inline asm mov.u32 %r1854, 0; // end inline asm // begin inline asm mov.u32 %r1855, 0; // end inline asm // begin inline asm mov.u32 %r1856, 0; // end inline asm // begin inline asm mov.u32 %r1857, 0; // end inline asm // begin inline asm mov.u32 %r1858, 0; // end inline asm // begin inline asm mov.u32 %r1859, 0; // end inline asm // begin inline asm mov.u32 %r1860, 0; // end inline asm // begin inline asm mov.u32 %r1861, 0; // end inline asm // begin inline asm mov.u32 %r1862, 0; // end inline asm // begin inline asm mov.u32 %r1863, 0; // end inline asm // begin inline asm mov.u32 %r1864, 0; // end inline asm // begin inline asm mov.u32 %r1865, 0; // end inline asm // begin inline asm mov.u32 %r1866, 0; // end inline asm // begin inline asm mov.u32 %r1867, 0; // end inline asm // begin inline asm mov.u32 %r1868, 0; // end inline asm // begin inline asm mov.u32 %r1869, 0; // end inline asm // begin inline asm mov.u32 %r1870, 0; // end inline asm // begin inline asm mov.u32 %r1871, 0; // end inline asm // begin inline asm mov.u32 %r1872, 0; // end inline asm // begin inline asm mov.u32 %r1873, 0; // end inline asm // begin inline asm mov.u32 %r1874, 0; // end inline asm // begin inline asm mov.u32 %r1875, 0; // end inline asm // begin inline asm mov.u32 %r1876, 0; // end inline asm // begin inline asm mov.u32 %r1877, 0; // end inline asm // begin inline asm mov.u32 %r1878, 0; // end inline asm // begin inline asm mov.u32 %r1879, 0; // end inline asm // begin inline asm mov.u32 %r1880, 0; // end inline asm // begin inline asm mov.u32 %r1881, 0; // end inline asm // begin inline asm mov.u32 %r1882, 0; // end inline asm mov.b32 %f370, %r1803; mov.b32 %f371, %r1804; mov.b32 %f372, %r1805; mov.b32 %f373, %r1806; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f370, %f371, %f372, %f373}, {%r1799, %r1800, %r1801, %r1802}, {%r2590, %r2589}, {%f370, %f371, %f372, %f373}; // end inline asm mov.b32 %f378, %r1807; mov.b32 %f379, %r1808; mov.b32 %f380, %r1809; mov.b32 %f381, %r1810; // begin inline 
asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f378, %f379, %f380, %f381}, {%r1799, %r1800, %r1801, %r1802}, {%r2588, %r2587}, {%f378, %f379, %f380, %f381}; // end inline asm mov.b32 %f386, %r1811; mov.b32 %f387, %r1812; mov.b32 %f388, %r1813; mov.b32 %f389, %r1814; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f386, %f387, %f388, %f389}, {%r1799, %r1800, %r1801, %r1802}, {%r2586, %r2585}, {%f386, %f387, %f388, %f389}; // end inline asm mov.b32 %f394, %r1815; mov.b32 %f395, %r1816; mov.b32 %f396, %r1817; mov.b32 %f397, %r1818; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f394, %f395, %f396, %f397}, {%r1799, %r1800, %r1801, %r1802}, {%r2584, %r2583}, {%f394, %f395, %f396, %f397}; // end inline asm mov.b32 %f402, %r1819; mov.b32 %f403, %r1820; mov.b32 %f404, %r1821; mov.b32 %f405, %r1822; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f402, %f403, %f404, %f405}, {%r1799, %r1800, %r1801, %r1802}, {%r2582, %r2581}, {%f402, %f403, %f404, %f405}; // end inline asm mov.b32 %f410, %r1823; mov.b32 %f411, %r1824; mov.b32 %f412, %r1825; mov.b32 %f413, %r1826; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f410, %f411, %f412, %f413}, {%r1799, %r1800, %r1801, %r1802}, {%r2580, %r2579}, {%f410, %f411, %f412, %f413}; // end inline asm mov.b32 %f418, %r1827; mov.b32 %f419, %r1828; mov.b32 %f420, %r1829; mov.b32 %f421, %r1830; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f418, %f419, %f420, %f421}, {%r1799, %r1800, %r1801, %r1802}, {%r2578, %r2577}, {%f418, %f419, %f420, %f421}; // end inline asm mov.b32 %f426, %r1831; mov.b32 %f427, %r1832; mov.b32 %f428, %r1833; mov.b32 %f429, %r1834; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f426, %f427, %f428, %f429}, {%r1799, %r1800, %r1801, %r1802}, {%r2576, %r2575}, {%f426, %f427, %f428, %f429}; // end inline asm mov.b32 %f434, %r1835; mov.b32 %f435, %r1836; mov.b32 %f436, %r1837; mov.b32 %f437, %r1838; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f434, %f435, %f436, %f437}, {%r1799, %r1800, %r1801, %r1802}, {%r2574, %r2573}, {%f434, %f435, %f436, %f437}; // end inline asm mov.b32 %f442, %r1839; mov.b32 %f443, %r1840; mov.b32 %f444, %r1841; mov.b32 %f445, %r1842; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f442, %f443, %f444, %f445}, {%r1799, %r1800, %r1801, %r1802}, {%r2572, %r2571}, {%f442, %f443, %f444, %f445}; // end inline asm mov.b32 %f450, %r1843; mov.b32 %f451, %r1844; mov.b32 %f452, %r1845; mov.b32 %f453, %r1846; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f450, %f451, %f452, %f453}, {%r1799, %r1800, %r1801, %r1802}, {%r2570, %r2569}, {%f450, %f451, %f452, %f453}; // end inline asm mov.b32 %f458, %r1847; mov.b32 %f459, %r1848; mov.b32 %f460, %r1849; mov.b32 %f461, %r1850; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f458, %f459, %f460, %f461}, {%r1799, %r1800, %r1801, %r1802}, {%r2568, %r2567}, {%f458, %f459, %f460, %f461}; // end inline asm mov.b32 %f466, %r1851; mov.b32 %f467, %r1852; mov.b32 %f468, %r1853; mov.b32 %f469, %r1854; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f466, %f467, %f468, %f469}, {%r1799, %r1800, %r1801, %r1802}, {%r2566, %r2565}, {%f466, %f467, %f468, %f469}; // end inline asm mov.b32 %f474, %r1855; mov.b32 %f475, %r1856; mov.b32 %f476, %r1857; mov.b32 %f477, %r1858; // begin inline asm 
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f474, %f475, %f476, %f477}, {%r1799, %r1800, %r1801, %r1802}, {%r2564, %r2563}, {%f474, %f475, %f476, %f477}; // end inline asm mov.b32 %f482, %r1859; mov.b32 %f483, %r1860; mov.b32 %f484, %r1861; mov.b32 %f485, %r1862; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f482, %f483, %f484, %f485}, {%r1799, %r1800, %r1801, %r1802}, {%r2562, %r2561}, {%f482, %f483, %f484, %f485}; // end inline asm mov.b32 %f490, %r1863; mov.b32 %f491, %r1864; mov.b32 %f492, %r1865; mov.b32 %f493, %r1866; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f490, %f491, %f492, %f493}, {%r1799, %r1800, %r1801, %r1802}, {%r2560, %r2559}, {%f490, %f491, %f492, %f493}; // end inline asm mov.b32 %f498, %r1867; mov.b32 %f499, %r1868; mov.b32 %f500, %r1869; mov.b32 %f501, %r1870; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f498, %f499, %f500, %f501}, {%r1799, %r1800, %r1801, %r1802}, {%r2558, %r2557}, {%f498, %f499, %f500, %f501}; // end inline asm mov.b32 %f506, %r1871; mov.b32 %f507, %r1872; mov.b32 %f508, %r1873; mov.b32 %f509, %r1874; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f506, %f507, %f508, %f509}, {%r1799, %r1800, %r1801, %r1802}, {%r2556, %r2555}, {%f506, %f507, %f508, %f509}; // end inline asm mov.b32 %f514, %r1875; mov.b32 %f515, %r1876; mov.b32 %f516, %r1877; mov.b32 %f517, %r1878; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f514, %f515, %f516, %f517}, {%r1799, %r1800, %r1801, %r1802}, {%r2554, %r2553}, {%f514, %f515, %f516, %f517}; // end inline asm mov.b32 %f522, %r1879; mov.b32 %f523, %r1880; mov.b32 %f524, %r1881; mov.b32 %f525, %r1882; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%f522, %f523, %f524, %f525}, {%r1799, %r1800, %r1801, %r1802}, {%r2552, %r2551}, {%f522, %f523, %f524, %f525}; // end inline asm setp.equ.ftz.f32 %p98, %f853, 0f00000000; mov.f32 %f865, 0f3F800000; mov.f32 %f864, %f865; @%p98 bra $L__BB0_41; rcp.approx.ftz.f32 %f864, %f853; $L__BB0_41: setp.equ.ftz.f32 %p99, %f852, 0f00000000; @%p99 bra $L__BB0_43; rcp.approx.ftz.f32 %f865, %f852; $L__BB0_43: add.s32 %r2325, %r2507, 16; setp.ge.s32 %p136, %r2325, %r227; mov.b32 %f532, %r2670; fma.rn.ftz.f32 %f533, %f40, %f532, %f370; mul.ftz.f32 %f534, %f864, %f533; mov.b32 %r2670, %f534; mov.b32 %f535, %r2669; fma.rn.ftz.f32 %f536, %f40, %f535, %f371; mul.ftz.f32 %f537, %f864, %f536; mov.b32 %r2669, %f537; mov.b32 %f538, %r2668; fma.rn.ftz.f32 %f539, %f42, %f538, %f372; mul.ftz.f32 %f540, %f865, %f539; mov.b32 %r2668, %f540; mov.b32 %f541, %r2667; fma.rn.ftz.f32 %f542, %f42, %f541, %f373; mul.ftz.f32 %f543, %f865, %f542; mov.b32 %r2667, %f543; mov.b32 %f544, %r2666; fma.rn.ftz.f32 %f545, %f40, %f544, %f378; mul.ftz.f32 %f546, %f864, %f545; mov.b32 %r2666, %f546; mov.b32 %f547, %r2665; fma.rn.ftz.f32 %f548, %f40, %f547, %f379; mul.ftz.f32 %f549, %f864, %f548; mov.b32 %r2665, %f549; mov.b32 %f550, %r2664; fma.rn.ftz.f32 %f551, %f42, %f550, %f380; mul.ftz.f32 %f552, %f865, %f551; mov.b32 %r2664, %f552; mov.b32 %f553, %r2663; fma.rn.ftz.f32 %f554, %f42, %f553, %f381; mul.ftz.f32 %f555, %f865, %f554; mov.b32 %r2663, %f555; mov.b32 %f556, %r2662; fma.rn.ftz.f32 %f557, %f40, %f556, %f386; mul.ftz.f32 %f558, %f864, %f557; mov.b32 %r2662, %f558; mov.b32 %f559, %r2661; fma.rn.ftz.f32 %f560, %f40, %f559, %f387; mul.ftz.f32 %f561, %f864, %f560; mov.b32 %r2661, %f561; mov.b32 %f562, %r2660; fma.rn.ftz.f32 %f563, %f42, %f562, %f388; mul.ftz.f32 %f564, 
%f865, %f563; mov.b32 %r2660, %f564; mov.b32 %f565, %r2659; fma.rn.ftz.f32 %f566, %f42, %f565, %f389; mul.ftz.f32 %f567, %f865, %f566; mov.b32 %r2659, %f567; mov.b32 %f568, %r2658; fma.rn.ftz.f32 %f569, %f40, %f568, %f394; mul.ftz.f32 %f570, %f864, %f569; mov.b32 %r2658, %f570; mov.b32 %f571, %r2657; fma.rn.ftz.f32 %f572, %f40, %f571, %f395; mul.ftz.f32 %f573, %f864, %f572; mov.b32 %r2657, %f573; mov.b32 %f574, %r2656; fma.rn.ftz.f32 %f575, %f42, %f574, %f396; mul.ftz.f32 %f576, %f865, %f575; mov.b32 %r2656, %f576; mov.b32 %f577, %r2655; fma.rn.ftz.f32 %f578, %f42, %f577, %f397; mul.ftz.f32 %f579, %f865, %f578; mov.b32 %r2655, %f579; mov.b32 %f580, %r2654; fma.rn.ftz.f32 %f581, %f40, %f580, %f402; mul.ftz.f32 %f582, %f864, %f581; mov.b32 %r2654, %f582; mov.b32 %f583, %r2653; fma.rn.ftz.f32 %f584, %f40, %f583, %f403; mul.ftz.f32 %f585, %f864, %f584; mov.b32 %r2653, %f585; mov.b32 %f586, %r2652; fma.rn.ftz.f32 %f587, %f42, %f586, %f404; mul.ftz.f32 %f588, %f865, %f587; mov.b32 %r2652, %f588; mov.b32 %f589, %r2651; fma.rn.ftz.f32 %f590, %f42, %f589, %f405; mul.ftz.f32 %f591, %f865, %f590; mov.b32 %r2651, %f591; mov.b32 %f592, %r2650; fma.rn.ftz.f32 %f593, %f40, %f592, %f410; mul.ftz.f32 %f594, %f864, %f593; mov.b32 %r2650, %f594; mov.b32 %f595, %r2649; fma.rn.ftz.f32 %f596, %f40, %f595, %f411; mul.ftz.f32 %f597, %f864, %f596; mov.b32 %r2649, %f597; mov.b32 %f598, %r2648; fma.rn.ftz.f32 %f599, %f42, %f598, %f412; mul.ftz.f32 %f600, %f865, %f599; mov.b32 %r2648, %f600; mov.b32 %f601, %r2647; fma.rn.ftz.f32 %f602, %f42, %f601, %f413; mul.ftz.f32 %f603, %f865, %f602; mov.b32 %r2647, %f603; mov.b32 %f604, %r2646; fma.rn.ftz.f32 %f605, %f40, %f604, %f418; mul.ftz.f32 %f606, %f864, %f605; mov.b32 %r2646, %f606; mov.b32 %f607, %r2645; fma.rn.ftz.f32 %f608, %f40, %f607, %f419; mul.ftz.f32 %f609, %f864, %f608; mov.b32 %r2645, %f609; mov.b32 %f610, %r2644; fma.rn.ftz.f32 %f611, %f42, %f610, %f420; mul.ftz.f32 %f612, %f865, %f611; mov.b32 %r2644, %f612; mov.b32 %f613, %r2643; fma.rn.ftz.f32 %f614, %f42, %f613, %f421; mul.ftz.f32 %f615, %f865, %f614; mov.b32 %r2643, %f615; mov.b32 %f616, %r2642; fma.rn.ftz.f32 %f617, %f40, %f616, %f426; mul.ftz.f32 %f618, %f864, %f617; mov.b32 %r2642, %f618; mov.b32 %f619, %r2641; fma.rn.ftz.f32 %f620, %f40, %f619, %f427; mul.ftz.f32 %f621, %f864, %f620; mov.b32 %r2641, %f621; mov.b32 %f622, %r2640; fma.rn.ftz.f32 %f623, %f42, %f622, %f428; mul.ftz.f32 %f624, %f865, %f623; mov.b32 %r2640, %f624; mov.b32 %f625, %r2639; fma.rn.ftz.f32 %f626, %f42, %f625, %f429; mul.ftz.f32 %f627, %f865, %f626; mov.b32 %r2639, %f627; mov.b32 %f628, %r2638; fma.rn.ftz.f32 %f629, %f40, %f628, %f434; mul.ftz.f32 %f630, %f864, %f629; mov.b32 %r2638, %f630; mov.b32 %f631, %r2637; fma.rn.ftz.f32 %f632, %f40, %f631, %f435; mul.ftz.f32 %f633, %f864, %f632; mov.b32 %r2637, %f633; mov.b32 %f634, %r2636; fma.rn.ftz.f32 %f635, %f42, %f634, %f436; mul.ftz.f32 %f636, %f865, %f635; mov.b32 %r2636, %f636; mov.b32 %f637, %r2635; fma.rn.ftz.f32 %f638, %f42, %f637, %f437; mul.ftz.f32 %f639, %f865, %f638; mov.b32 %r2635, %f639; mov.b32 %f640, %r2634; fma.rn.ftz.f32 %f641, %f40, %f640, %f442; mul.ftz.f32 %f642, %f864, %f641; mov.b32 %r2634, %f642; mov.b32 %f643, %r2633; fma.rn.ftz.f32 %f644, %f40, %f643, %f443; mul.ftz.f32 %f645, %f864, %f644; mov.b32 %r2633, %f645; mov.b32 %f646, %r2632; fma.rn.ftz.f32 %f647, %f42, %f646, %f444; mul.ftz.f32 %f648, %f865, %f647; mov.b32 %r2632, %f648; mov.b32 %f649, %r2631; fma.rn.ftz.f32 %f650, %f42, %f649, %f445; mul.ftz.f32 %f651, %f865, %f650; mov.b32 %r2631, %f651; mov.b32 
%f652, %r2630; fma.rn.ftz.f32 %f653, %f40, %f652, %f450; mul.ftz.f32 %f654, %f864, %f653; mov.b32 %r2630, %f654; mov.b32 %f655, %r2629; fma.rn.ftz.f32 %f656, %f40, %f655, %f451; mul.ftz.f32 %f657, %f864, %f656; mov.b32 %r2629, %f657; mov.b32 %f658, %r2628; fma.rn.ftz.f32 %f659, %f42, %f658, %f452; mul.ftz.f32 %f660, %f865, %f659; mov.b32 %r2628, %f660; mov.b32 %f661, %r2627; fma.rn.ftz.f32 %f662, %f42, %f661, %f453; mul.ftz.f32 %f663, %f865, %f662; mov.b32 %r2627, %f663; mov.b32 %f664, %r2626; fma.rn.ftz.f32 %f665, %f40, %f664, %f458; mul.ftz.f32 %f666, %f864, %f665; mov.b32 %r2626, %f666; mov.b32 %f667, %r2625; fma.rn.ftz.f32 %f668, %f40, %f667, %f459; mul.ftz.f32 %f669, %f864, %f668; mov.b32 %r2625, %f669; mov.b32 %f670, %r2624; fma.rn.ftz.f32 %f671, %f42, %f670, %f460; mul.ftz.f32 %f672, %f865, %f671; mov.b32 %r2624, %f672; mov.b32 %f673, %r2623; fma.rn.ftz.f32 %f674, %f42, %f673, %f461; mul.ftz.f32 %f675, %f865, %f674; mov.b32 %r2623, %f675; mov.b32 %f676, %r2622; fma.rn.ftz.f32 %f677, %f40, %f676, %f466; mul.ftz.f32 %f678, %f864, %f677; mov.b32 %r2622, %f678; mov.b32 %f679, %r2621; fma.rn.ftz.f32 %f680, %f40, %f679, %f467; mul.ftz.f32 %f681, %f864, %f680; mov.b32 %r2621, %f681; mov.b32 %f682, %r2620; fma.rn.ftz.f32 %f683, %f42, %f682, %f468; mul.ftz.f32 %f684, %f865, %f683; mov.b32 %r2620, %f684; mov.b32 %f685, %r2619; fma.rn.ftz.f32 %f686, %f42, %f685, %f469; mul.ftz.f32 %f687, %f865, %f686; mov.b32 %r2619, %f687; mov.b32 %f688, %r2618; fma.rn.ftz.f32 %f689, %f40, %f688, %f474; mul.ftz.f32 %f690, %f864, %f689; mov.b32 %r2618, %f690; mov.b32 %f691, %r2617; fma.rn.ftz.f32 %f692, %f40, %f691, %f475; mul.ftz.f32 %f693, %f864, %f692; mov.b32 %r2617, %f693; mov.b32 %f694, %r2616; fma.rn.ftz.f32 %f695, %f42, %f694, %f476; mul.ftz.f32 %f696, %f865, %f695; mov.b32 %r2616, %f696; mov.b32 %f697, %r2615; fma.rn.ftz.f32 %f698, %f42, %f697, %f477; mul.ftz.f32 %f699, %f865, %f698; mov.b32 %r2615, %f699; mov.b32 %f700, %r2614; fma.rn.ftz.f32 %f701, %f40, %f700, %f482; mul.ftz.f32 %f702, %f864, %f701; mov.b32 %r2614, %f702; mov.b32 %f703, %r2613; fma.rn.ftz.f32 %f704, %f40, %f703, %f483; mul.ftz.f32 %f705, %f864, %f704; mov.b32 %r2613, %f705; mov.b32 %f706, %r2612; fma.rn.ftz.f32 %f707, %f42, %f706, %f484; mul.ftz.f32 %f708, %f865, %f707; mov.b32 %r2612, %f708; mov.b32 %f709, %r2611; fma.rn.ftz.f32 %f710, %f42, %f709, %f485; mul.ftz.f32 %f711, %f865, %f710; mov.b32 %r2611, %f711; mov.b32 %f712, %r2610; fma.rn.ftz.f32 %f713, %f40, %f712, %f490; mul.ftz.f32 %f714, %f864, %f713; mov.b32 %r2610, %f714; mov.b32 %f715, %r2609; fma.rn.ftz.f32 %f716, %f40, %f715, %f491; mul.ftz.f32 %f717, %f864, %f716; mov.b32 %r2609, %f717; mov.b32 %f718, %r2608; fma.rn.ftz.f32 %f719, %f42, %f718, %f492; mul.ftz.f32 %f720, %f865, %f719; mov.b32 %r2608, %f720; mov.b32 %f721, %r2607; fma.rn.ftz.f32 %f722, %f42, %f721, %f493; mul.ftz.f32 %f723, %f865, %f722; mov.b32 %r2607, %f723; mov.b32 %f724, %r2606; fma.rn.ftz.f32 %f725, %f40, %f724, %f498; mul.ftz.f32 %f726, %f864, %f725; mov.b32 %r2606, %f726; mov.b32 %f727, %r2605; fma.rn.ftz.f32 %f728, %f40, %f727, %f499; mul.ftz.f32 %f729, %f864, %f728; mov.b32 %r2605, %f729; mov.b32 %f730, %r2604; fma.rn.ftz.f32 %f731, %f42, %f730, %f500; mul.ftz.f32 %f732, %f865, %f731; mov.b32 %r2604, %f732; mov.b32 %f733, %r2603; fma.rn.ftz.f32 %f734, %f42, %f733, %f501; mul.ftz.f32 %f735, %f865, %f734; mov.b32 %r2603, %f735; mov.b32 %f736, %r2602; fma.rn.ftz.f32 %f737, %f40, %f736, %f506; mul.ftz.f32 %f738, %f864, %f737; mov.b32 %r2602, %f738; mov.b32 %f739, %r2601; fma.rn.ftz.f32 %f740, %f40, 
%f739, %f507; mul.ftz.f32 %f741, %f864, %f740; mov.b32 %r2601, %f741; mov.b32 %f742, %r2600; fma.rn.ftz.f32 %f743, %f42, %f742, %f508; mul.ftz.f32 %f744, %f865, %f743; mov.b32 %r2600, %f744; mov.b32 %f745, %r2599; fma.rn.ftz.f32 %f746, %f42, %f745, %f509; mul.ftz.f32 %f747, %f865, %f746; mov.b32 %r2599, %f747; mov.b32 %f748, %r2598; fma.rn.ftz.f32 %f749, %f40, %f748, %f514; mul.ftz.f32 %f750, %f864, %f749; mov.b32 %r2598, %f750; mov.b32 %f751, %r2597; fma.rn.ftz.f32 %f752, %f40, %f751, %f515; mul.ftz.f32 %f753, %f864, %f752; mov.b32 %r2597, %f753; mov.b32 %f754, %r2596; fma.rn.ftz.f32 %f755, %f42, %f754, %f516; mul.ftz.f32 %f756, %f865, %f755; mov.b32 %r2596, %f756; mov.b32 %f757, %r2595; fma.rn.ftz.f32 %f758, %f42, %f757, %f517; mul.ftz.f32 %f759, %f865, %f758; mov.b32 %r2595, %f759; mov.b32 %f760, %r2594; fma.rn.ftz.f32 %f761, %f40, %f760, %f522; mul.ftz.f32 %f762, %f864, %f761; mov.b32 %r2594, %f762; mov.b32 %f763, %r2593; fma.rn.ftz.f32 %f764, %f40, %f763, %f523; mul.ftz.f32 %f765, %f864, %f764; mov.b32 %r2593, %f765; mov.b32 %f766, %r2592; fma.rn.ftz.f32 %f767, %f42, %f766, %f524; mul.ftz.f32 %f768, %f865, %f767; mov.b32 %r2592, %f768; mov.b32 %f769, %r2591; fma.rn.ftz.f32 %f770, %f42, %f769, %f525; mul.ftz.f32 %f771, %f865, %f770; mov.b32 %r2591, %f771;
// Skip the shared-memory prefetch below when %p136 is set (presumably on the last kv tile).
@%p136 bra $L__BB0_45;
// Prefetch the next tile's transposed fp16 fragments from shared memory.
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2590, %r2589, %r2588, %r2587}, [%r1314];
// end inline asm
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2586, %r2585, %r2584, %r2583}, [%r1319];
// end inline asm
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2582, %r2581, %r2580, %r2579}, [%r1324];
// end inline asm
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2578, %r2577, %r2576, %r2575}, [%r1329];
// end inline asm
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2574, %r2573, %r2572, %r2571}, [%r1334];
// end inline asm
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2570, %r2569, %r2568, %r2567}, [%r1339];
// end inline asm
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2566, %r2565, %r2564, %r2563}, [%r1344];
// end inline asm
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2562, %r2561, %r2560, %r2559}, [%r1349];
// end inline asm
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2558, %r2557, %r2556, %r2555}, [%r1354];
// end inline asm
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r2554, %r2553, %r2552, %r2551}, [%r1359];
// end inline asm
$L__BB0_45:
// Main-loop tail: advance the kv-tile counters by 16 and iterate while %r2507 < %r227.
add.s32 %r2507, %r2507, 16; setp.lt.s32 %p101, %r2507, %r227; add.s32 %r2508, %r2508, 16; mov.f32 %f854, %f31; mov.f32 %f855, %f30; @%p101 bra $L__BB0_11;
$L__BB0_46:
// Store epilogue: re-read kernel params, form the shared-memory staging base
// (_ZN25fused_multihead_attention5smem_E + 32768), sync, and start packing the
// fp32 accumulators into fp16x2 words.
mov.u32 %r2333, %tid.x; mov.b64 %rd209, fmha_v2_flash_attention_fp16_64_16_S_160_sm86_kernel_nl_param_0; mov.u64 %rd208, %rd209; ld.param.u32 %r2332, [%rd208+60]; mul.lo.s32 %r2331, %r1, %r913; mad.lo.s32 %r2330, %r2331, %r2, %r914; and.b32 %r2329, %r2333, 96; shr.u32 %r2328, %r2329, 1; mov.u32 %r2327, _ZN25fused_multihead_attention5smem_E; add.s32 %r2274, %r2327, 32768; add.s32 %r2197, %r18, %r2274; bar.sync 0; mov.b32 %f772, %r2669; mov.b32 %f773, %r2670;
// begin inline asm
cvt.rn.f16x2.f32 %r2073, %f772, %f773;
// end inline asm
mov.b32 %f774, %r2667; mov.b32 %f775, %r2668;
// begin inline asm
cvt.rn.f16x2.f32 %r2074, %f774, %f775;
// end inline asm
shl.b32 %r2276, %r2333, 2; and.b32 %r2277, %r2276, 124; add.s32 %r2278, %r2277, %r2274; and.b32 %r2281, %r2333, 28;
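// The long mov.b32 / fma.rn.ftz.f32 / mul.ftz.f32 chain above (spanning the
// preceding lines) updates the 80 fp32 output accumulators kept bitwise in
// %r2591..%r2670. A minimal CUDA-level sketch of one step, with assumed names
// acc_b32, corr, pv, scale; reading %f40/%f42 and %f864/%f865 as per-row
// correction/normalization factors of the running softmax (one pair per
// accumulator row) is an inference, not stated by the source:
//   float a = __int_as_float(acc_b32[i]);    // mov.b32 %f, %r
//   a = fmaf(corr, a, pv[i]);                // fma.rn.ftz.f32
//   acc_b32[i] = __float_as_int(a * scale);  // mul.ftz.f32 + mov.b32
// The address arithmetic in progress here derives each thread's swizzled
// shared-memory staging offset for the epilogue stores below.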
shr.u32 %r2282, %r2281, 2; or.b32 %r2283, %r2328, %r2282; shl.b32 %r2284, %r2283, 9; add.s32 %r2075, %r2278, %r2284; // begin inline asm st.shared.b32 [%r2075], %r2073; // end inline asm add.s32 %r2077, %r2075, 4096; // begin inline asm st.shared.b32 [%r2077], %r2074; // end inline asm xor.b32 %r2081, %r2075, 16; mov.b32 %f776, %r2665; mov.b32 %f777, %r2666; // begin inline asm cvt.rn.f16x2.f32 %r2079, %f776, %f777; // end inline asm mov.b32 %f778, %r2663; mov.b32 %f779, %r2664; // begin inline asm cvt.rn.f16x2.f32 %r2080, %f778, %f779; // end inline asm // begin inline asm st.shared.b32 [%r2081], %r2079; // end inline asm add.s32 %r2083, %r2081, 4096; // begin inline asm st.shared.b32 [%r2083], %r2080; // end inline asm xor.b32 %r2087, %r2075, 32; mov.b32 %f780, %r2661; mov.b32 %f781, %r2662; // begin inline asm cvt.rn.f16x2.f32 %r2085, %f780, %f781; // end inline asm mov.b32 %f782, %r2659; mov.b32 %f783, %r2660; // begin inline asm cvt.rn.f16x2.f32 %r2086, %f782, %f783; // end inline asm // begin inline asm st.shared.b32 [%r2087], %r2085; // end inline asm add.s32 %r2089, %r2087, 4096; // begin inline asm st.shared.b32 [%r2089], %r2086; // end inline asm xor.b32 %r2093, %r2075, 48; mov.b32 %f784, %r2657; mov.b32 %f785, %r2658; // begin inline asm cvt.rn.f16x2.f32 %r2091, %f784, %f785; // end inline asm mov.b32 %f786, %r2655; mov.b32 %f787, %r2656; // begin inline asm cvt.rn.f16x2.f32 %r2092, %f786, %f787; // end inline asm // begin inline asm st.shared.b32 [%r2093], %r2091; // end inline asm add.s32 %r2095, %r2093, 4096; // begin inline asm st.shared.b32 [%r2095], %r2092; // end inline asm xor.b32 %r2099, %r2075, 64; mov.b32 %f788, %r2653; mov.b32 %f789, %r2654; // begin inline asm cvt.rn.f16x2.f32 %r2097, %f788, %f789; // end inline asm mov.b32 %f790, %r2651; mov.b32 %f791, %r2652; // begin inline asm cvt.rn.f16x2.f32 %r2098, %f790, %f791; // end inline asm // begin inline asm st.shared.b32 [%r2099], %r2097; // end inline asm add.s32 %r2101, %r2099, 4096; // begin inline asm st.shared.b32 [%r2101], %r2098; // end inline asm xor.b32 %r2105, %r2075, 80; mov.b32 %f792, %r2649; mov.b32 %f793, %r2650; // begin inline asm cvt.rn.f16x2.f32 %r2103, %f792, %f793; // end inline asm mov.b32 %f794, %r2647; mov.b32 %f795, %r2648; // begin inline asm cvt.rn.f16x2.f32 %r2104, %f794, %f795; // end inline asm // begin inline asm st.shared.b32 [%r2105], %r2103; // end inline asm add.s32 %r2107, %r2105, 4096; // begin inline asm st.shared.b32 [%r2107], %r2104; // end inline asm xor.b32 %r2111, %r2075, 96; mov.b32 %f796, %r2645; mov.b32 %f797, %r2646; // begin inline asm cvt.rn.f16x2.f32 %r2109, %f796, %f797; // end inline asm mov.b32 %f798, %r2643; mov.b32 %f799, %r2644; // begin inline asm cvt.rn.f16x2.f32 %r2110, %f798, %f799; // end inline asm // begin inline asm st.shared.b32 [%r2111], %r2109; // end inline asm add.s32 %r2113, %r2111, 4096; // begin inline asm st.shared.b32 [%r2113], %r2110; // end inline asm xor.b32 %r2117, %r2075, 112; mov.b32 %f800, %r2641; mov.b32 %f801, %r2642; // begin inline asm cvt.rn.f16x2.f32 %r2115, %f800, %f801; // end inline asm mov.b32 %f802, %r2639; mov.b32 %f803, %r2640; // begin inline asm cvt.rn.f16x2.f32 %r2116, %f802, %f803; // end inline asm // begin inline asm st.shared.b32 [%r2117], %r2115; // end inline asm add.s32 %r2119, %r2117, 4096; // begin inline asm st.shared.b32 [%r2119], %r2116; // end inline asm xor.b32 %r2123, %r2075, 128; mov.b32 %f804, %r2637; mov.b32 %f805, %r2638; // begin inline asm cvt.rn.f16x2.f32 %r2121, %f804, %f805; // end inline asm 
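// The cvt/st.shared pattern above repeats for all 20 column groups: each pair
// of fp32 accumulators is packed into one fp16x2 word and staged into two
// shared-memory halves 4096 bytes apart, at offsets XOR-swizzled by
// 16, 32, ..., 304 so the 16-byte vector reloads that follow stay
// bank-conflict free. A sketch of one step, with assumed names smem, off, k,
// f0..f3 (not from the source):
//   __half2 lo = __floats2half2_rn(f0, f1);                          // cvt.rn.f16x2.f32 -> %r2073
//   __half2 hi = __floats2half2_rn(f2, f3);                          // cvt.rn.f16x2.f32 -> %r2074
//   *(unsigned *)(smem + (off ^ (16 * k)))        = *(unsigned *)&lo; // st.shared.b32
//   *(unsigned *)(smem + (off ^ (16 * k)) + 4096) = *(unsigned *)&hi; // st.shared.b32
// After bar.sync 0, each thread gathers its output rows back as 16-byte
// vectors with ld.shared.v4.b32.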
mov.b32 %f806, %r2635; mov.b32 %f807, %r2636; // begin inline asm cvt.rn.f16x2.f32 %r2122, %f806, %f807; // end inline asm // begin inline asm st.shared.b32 [%r2123], %r2121; // end inline asm add.s32 %r2125, %r2123, 4096; // begin inline asm st.shared.b32 [%r2125], %r2122; // end inline asm xor.b32 %r2129, %r2075, 144; mov.b32 %f808, %r2633; mov.b32 %f809, %r2634; // begin inline asm cvt.rn.f16x2.f32 %r2127, %f808, %f809; // end inline asm mov.b32 %f810, %r2631; mov.b32 %f811, %r2632; // begin inline asm cvt.rn.f16x2.f32 %r2128, %f810, %f811; // end inline asm // begin inline asm st.shared.b32 [%r2129], %r2127; // end inline asm add.s32 %r2131, %r2129, 4096; // begin inline asm st.shared.b32 [%r2131], %r2128; // end inline asm xor.b32 %r2135, %r2075, 160; mov.b32 %f812, %r2629; mov.b32 %f813, %r2630; // begin inline asm cvt.rn.f16x2.f32 %r2133, %f812, %f813; // end inline asm mov.b32 %f814, %r2627; mov.b32 %f815, %r2628; // begin inline asm cvt.rn.f16x2.f32 %r2134, %f814, %f815; // end inline asm // begin inline asm st.shared.b32 [%r2135], %r2133; // end inline asm add.s32 %r2137, %r2135, 4096; // begin inline asm st.shared.b32 [%r2137], %r2134; // end inline asm xor.b32 %r2141, %r2075, 176; mov.b32 %f816, %r2625; mov.b32 %f817, %r2626; // begin inline asm cvt.rn.f16x2.f32 %r2139, %f816, %f817; // end inline asm mov.b32 %f818, %r2623; mov.b32 %f819, %r2624; // begin inline asm cvt.rn.f16x2.f32 %r2140, %f818, %f819; // end inline asm // begin inline asm st.shared.b32 [%r2141], %r2139; // end inline asm add.s32 %r2143, %r2141, 4096; // begin inline asm st.shared.b32 [%r2143], %r2140; // end inline asm xor.b32 %r2147, %r2075, 192; mov.b32 %f820, %r2621; mov.b32 %f821, %r2622; // begin inline asm cvt.rn.f16x2.f32 %r2145, %f820, %f821; // end inline asm mov.b32 %f822, %r2619; mov.b32 %f823, %r2620; // begin inline asm cvt.rn.f16x2.f32 %r2146, %f822, %f823; // end inline asm // begin inline asm st.shared.b32 [%r2147], %r2145; // end inline asm add.s32 %r2149, %r2147, 4096; // begin inline asm st.shared.b32 [%r2149], %r2146; // end inline asm xor.b32 %r2153, %r2075, 208; mov.b32 %f824, %r2617; mov.b32 %f825, %r2618; // begin inline asm cvt.rn.f16x2.f32 %r2151, %f824, %f825; // end inline asm mov.b32 %f826, %r2615; mov.b32 %f827, %r2616; // begin inline asm cvt.rn.f16x2.f32 %r2152, %f826, %f827; // end inline asm // begin inline asm st.shared.b32 [%r2153], %r2151; // end inline asm add.s32 %r2155, %r2153, 4096; // begin inline asm st.shared.b32 [%r2155], %r2152; // end inline asm xor.b32 %r2159, %r2075, 224; mov.b32 %f828, %r2613; mov.b32 %f829, %r2614; // begin inline asm cvt.rn.f16x2.f32 %r2157, %f828, %f829; // end inline asm mov.b32 %f830, %r2611; mov.b32 %f831, %r2612; // begin inline asm cvt.rn.f16x2.f32 %r2158, %f830, %f831; // end inline asm // begin inline asm st.shared.b32 [%r2159], %r2157; // end inline asm add.s32 %r2161, %r2159, 4096; // begin inline asm st.shared.b32 [%r2161], %r2158; // end inline asm xor.b32 %r2165, %r2075, 240; mov.b32 %f832, %r2609; mov.b32 %f833, %r2610; // begin inline asm cvt.rn.f16x2.f32 %r2163, %f832, %f833; // end inline asm mov.b32 %f834, %r2607; mov.b32 %f835, %r2608; // begin inline asm cvt.rn.f16x2.f32 %r2164, %f834, %f835; // end inline asm // begin inline asm st.shared.b32 [%r2165], %r2163; // end inline asm add.s32 %r2167, %r2165, 4096; // begin inline asm st.shared.b32 [%r2167], %r2164; // end inline asm xor.b32 %r2171, %r2075, 256; mov.b32 %f836, %r2605; mov.b32 %f837, %r2606; // begin inline asm cvt.rn.f16x2.f32 %r2169, %f836, %f837; // end 
inline asm mov.b32 %f838, %r2603; mov.b32 %f839, %r2604; // begin inline asm cvt.rn.f16x2.f32 %r2170, %f838, %f839; // end inline asm // begin inline asm st.shared.b32 [%r2171], %r2169; // end inline asm add.s32 %r2173, %r2171, 4096; // begin inline asm st.shared.b32 [%r2173], %r2170; // end inline asm xor.b32 %r2177, %r2075, 272; mov.b32 %f840, %r2601; mov.b32 %f841, %r2602; // begin inline asm cvt.rn.f16x2.f32 %r2175, %f840, %f841; // end inline asm mov.b32 %f842, %r2599; mov.b32 %f843, %r2600; // begin inline asm cvt.rn.f16x2.f32 %r2176, %f842, %f843; // end inline asm // begin inline asm st.shared.b32 [%r2177], %r2175; // end inline asm add.s32 %r2179, %r2177, 4096; // begin inline asm st.shared.b32 [%r2179], %r2176; // end inline asm xor.b32 %r2183, %r2075, 288; mov.b32 %f844, %r2597; mov.b32 %f845, %r2598; // begin inline asm cvt.rn.f16x2.f32 %r2181, %f844, %f845; // end inline asm mov.b32 %f846, %r2595; mov.b32 %f847, %r2596; // begin inline asm cvt.rn.f16x2.f32 %r2182, %f846, %f847; // end inline asm // begin inline asm st.shared.b32 [%r2183], %r2181; // end inline asm add.s32 %r2185, %r2183, 4096; // begin inline asm st.shared.b32 [%r2185], %r2182; // end inline asm xor.b32 %r2189, %r2075, 304; mov.b32 %f848, %r2593; mov.b32 %f849, %r2594; // begin inline asm cvt.rn.f16x2.f32 %r2187, %f848, %f849; // end inline asm mov.b32 %f850, %r2591; mov.b32 %f851, %r2592; // begin inline asm cvt.rn.f16x2.f32 %r2188, %f850, %f851; // end inline asm // begin inline asm st.shared.b32 [%r2189], %r2187; // end inline asm add.s32 %r2191, %r2189, 4096; // begin inline asm st.shared.b32 [%r2191], %r2188; // end inline asm bar.sync 0; // begin inline asm ld.shared.v4.b32 {%r2193, %r2194, %r2195, %r2196}, [%r2197]; // end inline asm xor.b32 %r2285, %r2197, 64; add.s32 %r2202, %r2285, 2048; // begin inline asm ld.shared.v4.b32 {%r2198, %r2199, %r2200, %r2201}, [%r2202]; // end inline asm add.s32 %r2207, %r2197, 4096; // begin inline asm ld.shared.v4.b32 {%r2203, %r2204, %r2205, %r2206}, [%r2207]; // end inline asm add.s32 %r2212, %r2285, 6144; // begin inline asm ld.shared.v4.b32 {%r2208, %r2209, %r2210, %r2211}, [%r2212]; // end inline asm add.s32 %r2217, %r2197, 8192; // begin inline asm ld.shared.v4.b32 {%r2213, %r2214, %r2215, %r2216}, [%r2217]; // end inline asm add.s32 %r2222, %r2285, 10240; // begin inline asm ld.shared.v4.b32 {%r2218, %r2219, %r2220, %r2221}, [%r2222]; // end inline asm add.s32 %r2227, %r2197, 12288; // begin inline asm ld.shared.v4.b32 {%r2223, %r2224, %r2225, %r2226}, [%r2227]; // end inline asm add.s32 %r2232, %r2285, 14336; // begin inline asm ld.shared.v4.b32 {%r2228, %r2229, %r2230, %r2231}, [%r2232]; // end inline asm add.s32 %r2237, %r2197, 16384; // begin inline asm ld.shared.v4.b32 {%r2233, %r2234, %r2235, %r2236}, [%r2237]; // end inline asm add.s32 %r2242, %r2285, 18432; // begin inline asm ld.shared.v4.b32 {%r2238, %r2239, %r2240, %r2241}, [%r2242]; // end inline asm add.s32 %r2247, %r2197, 20480; // begin inline asm ld.shared.v4.b32 {%r2243, %r2244, %r2245, %r2246}, [%r2247]; // end inline asm add.s32 %r2252, %r2285, 22528; // begin inline asm ld.shared.v4.b32 {%r2248, %r2249, %r2250, %r2251}, [%r2252]; // end inline asm add.s32 %r2257, %r2197, 24576; // begin inline asm ld.shared.v4.b32 {%r2253, %r2254, %r2255, %r2256}, [%r2257]; // end inline asm add.s32 %r2262, %r2285, 26624; // begin inline asm ld.shared.v4.b32 {%r2258, %r2259, %r2260, %r2261}, [%r2262]; // end inline asm add.s32 %r2267, %r2197, 28672; // begin inline asm ld.shared.v4.b32 {%r2263, %r2264, 
%r2265, %r2266}, [%r2267]; // end inline asm add.s32 %r2272, %r2285, 30720; // begin inline asm ld.shared.v4.b32 {%r2268, %r2269, %r2270, %r2271}, [%r2272]; // end inline asm mul.lo.s32 %r2290, %r2330, %r2332; shl.b32 %r2291, %r2290, 1; cvt.s64.s32 %rd127, %r2291; add.s64 %rd32, %rd127, %rd36; cvt.u32.u64 %r2293, %rd4; setp.ge.s32 %p102, %r2293, %r1; @%p102 bra $L__BB0_93; mov.b64 %rd211, fmha_v2_flash_attention_fp16_64_16_S_160_sm86_kernel_nl_param_0; mov.u64 %rd210, %rd211; ld.param.u32 %r2344, [%rd210+60]; mov.u32 %r2343, %tid.x; shr.s32 %r2342, %r2344, 31; shr.u32 %r2341, %r2342, 29; add.s32 %r2340, %r2344, %r2341; shr.s32 %r2339, %r2340, 3; shr.s32 %r2338, %r2343, 31; shr.u32 %r2337, %r2338, 27; add.s32 %r2336, %r2343, %r2337; and.b32 %r2335, %r2336, -32; sub.s32 %r2334, %r2343, %r2335; setp.ge.s32 %p103, %r2334, %r2339; @%p103 bra $L__BB0_49; mul.lo.s64 %rd129, %rd11, %rd4; add.s64 %rd130, %rd32, %rd129; cvta.to.global.u64 %rd131, %rd12; add.s64 %rd132, %rd131, %rd130; st.global.v4.u32 [%rd132], {%r2193, %r2194, %r2195, %r2196}; $L__BB0_49: add.s32 %r2295, %r2293, 4; setp.ge.s32 %p104, %r2295, %r1; @%p104 bra $L__BB0_93; @%p103 bra $L__BB0_52; add.s64 %rd133, %rd4, 4; mul.lo.s64 %rd134, %rd133, %rd11; add.s64 %rd135, %rd32, %rd134; cvta.to.global.u64 %rd136, %rd12; add.s64 %rd137, %rd136, %rd135; st.global.v4.u32 [%rd137], {%r2198, %r2199, %r2200, %r2201}; $L__BB0_52: add.s32 %r2297, %r2293, 8; setp.ge.s32 %p106, %r2297, %r1; @%p106 bra $L__BB0_93; @%p103 bra $L__BB0_55; add.s64 %rd138, %rd4, 8; mul.lo.s64 %rd139, %rd138, %rd11; add.s64 %rd140, %rd32, %rd139; cvta.to.global.u64 %rd141, %rd12; add.s64 %rd142, %rd141, %rd140; st.global.v4.u32 [%rd142], {%r2203, %r2204, %r2205, %r2206}; $L__BB0_55: add.s32 %r2299, %r2293, 12; setp.ge.s32 %p108, %r2299, %r1; @%p108 bra $L__BB0_93; @%p103 bra $L__BB0_58; add.s64 %rd143, %rd4, 12; mul.lo.s64 %rd144, %rd143, %rd11; add.s64 %rd145, %rd32, %rd144; cvta.to.global.u64 %rd146, %rd12; add.s64 %rd147, %rd146, %rd145; st.global.v4.u32 [%rd147], {%r2208, %r2209, %r2210, %r2211}; $L__BB0_58: add.s32 %r2301, %r2293, 16; setp.ge.s32 %p110, %r2301, %r1; @%p110 bra $L__BB0_93; @%p103 bra $L__BB0_61; add.s64 %rd148, %rd4, 16; mul.lo.s64 %rd149, %rd148, %rd11; add.s64 %rd150, %rd32, %rd149; cvta.to.global.u64 %rd151, %rd12; add.s64 %rd152, %rd151, %rd150; st.global.v4.u32 [%rd152], {%r2213, %r2214, %r2215, %r2216}; $L__BB0_61: add.s32 %r2303, %r2293, 20; setp.ge.s32 %p112, %r2303, %r1; @%p112 bra $L__BB0_93; @%p103 bra $L__BB0_64; add.s64 %rd153, %rd4, 20; mul.lo.s64 %rd154, %rd153, %rd11; add.s64 %rd155, %rd32, %rd154; cvta.to.global.u64 %rd156, %rd12; add.s64 %rd157, %rd156, %rd155; st.global.v4.u32 [%rd157], {%r2218, %r2219, %r2220, %r2221}; $L__BB0_64: add.s32 %r2305, %r2293, 24; setp.ge.s32 %p114, %r2305, %r1; @%p114 bra $L__BB0_93; @%p103 bra $L__BB0_67; add.s64 %rd158, %rd4, 24; mul.lo.s64 %rd159, %rd158, %rd11; add.s64 %rd160, %rd32, %rd159; cvta.to.global.u64 %rd161, %rd12; add.s64 %rd162, %rd161, %rd160; st.global.v4.u32 [%rd162], {%r2223, %r2224, %r2225, %r2226}; $L__BB0_67: add.s32 %r2307, %r2293, 28; setp.ge.s32 %p116, %r2307, %r1; @%p116 bra $L__BB0_93; @%p103 bra $L__BB0_70; add.s64 %rd163, %rd4, 28; mul.lo.s64 %rd164, %rd163, %rd11; add.s64 %rd165, %rd32, %rd164; cvta.to.global.u64 %rd166, %rd12; add.s64 %rd167, %rd166, %rd165; st.global.v4.u32 [%rd167], {%r2228, %r2229, %r2230, %r2231}; $L__BB0_70: add.s32 %r2309, %r2293, 32; setp.ge.s32 %p118, %r2309, %r1; @%p118 bra $L__BB0_93; @%p103 bra $L__BB0_73; add.s64 %rd168, %rd4, 32; mul.lo.s64 
%rd169, %rd168, %rd11; add.s64 %rd170, %rd32, %rd169; cvta.to.global.u64 %rd171, %rd12; add.s64 %rd172, %rd171, %rd170; st.global.v4.u32 [%rd172], {%r2233, %r2234, %r2235, %r2236}; $L__BB0_73: add.s32 %r2311, %r2293, 36; setp.ge.s32 %p120, %r2311, %r1; @%p120 bra $L__BB0_93; @%p103 bra $L__BB0_76; add.s64 %rd173, %rd4, 36; mul.lo.s64 %rd174, %rd173, %rd11; add.s64 %rd175, %rd32, %rd174; cvta.to.global.u64 %rd176, %rd12; add.s64 %rd177, %rd176, %rd175; st.global.v4.u32 [%rd177], {%r2238, %r2239, %r2240, %r2241}; $L__BB0_76: add.s32 %r2313, %r2293, 40; setp.ge.s32 %p122, %r2313, %r1; @%p122 bra $L__BB0_93; @%p103 bra $L__BB0_79; add.s64 %rd178, %rd4, 40; mul.lo.s64 %rd179, %rd178, %rd11; add.s64 %rd180, %rd32, %rd179; cvta.to.global.u64 %rd181, %rd12; add.s64 %rd182, %rd181, %rd180; st.global.v4.u32 [%rd182], {%r2243, %r2244, %r2245, %r2246}; $L__BB0_79: add.s32 %r2315, %r2293, 44; setp.ge.s32 %p124, %r2315, %r1; @%p124 bra $L__BB0_93; @%p103 bra $L__BB0_82; add.s64 %rd183, %rd4, 44; mul.lo.s64 %rd184, %rd183, %rd11; add.s64 %rd185, %rd32, %rd184; cvta.to.global.u64 %rd186, %rd12; add.s64 %rd187, %rd186, %rd185; st.global.v4.u32 [%rd187], {%r2248, %r2249, %r2250, %r2251}; $L__BB0_82: add.s32 %r2317, %r2293, 48; setp.ge.s32 %p126, %r2317, %r1; @%p126 bra $L__BB0_93; @%p103 bra $L__BB0_85; add.s64 %rd188, %rd4, 48; mul.lo.s64 %rd189, %rd188, %rd11; add.s64 %rd190, %rd32, %rd189; cvta.to.global.u64 %rd191, %rd12; add.s64 %rd192, %rd191, %rd190; st.global.v4.u32 [%rd192], {%r2253, %r2254, %r2255, %r2256}; $L__BB0_85: add.s32 %r2319, %r2293, 52; setp.ge.s32 %p128, %r2319, %r1; @%p128 bra $L__BB0_93; @%p103 bra $L__BB0_88; add.s64 %rd193, %rd4, 52; mul.lo.s64 %rd194, %rd193, %rd11; add.s64 %rd195, %rd32, %rd194; cvta.to.global.u64 %rd196, %rd12; add.s64 %rd197, %rd196, %rd195; st.global.v4.u32 [%rd197], {%r2258, %r2259, %r2260, %r2261}; $L__BB0_88: add.s32 %r2321, %r2293, 56; setp.ge.s32 %p130, %r2321, %r1; @%p130 bra $L__BB0_93; @%p103 bra $L__BB0_91; add.s64 %rd198, %rd4, 56; mul.lo.s64 %rd199, %rd198, %rd11; add.s64 %rd200, %rd32, %rd199; cvta.to.global.u64 %rd201, %rd12; add.s64 %rd202, %rd201, %rd200; st.global.v4.u32 [%rd202], {%r2263, %r2264, %r2265, %r2266}; $L__BB0_91: add.s32 %r2323, %r2293, 60; setp.ge.s32 %p132, %r2323, %r1; or.pred %p134, %p132, %p103; @%p134 bra $L__BB0_93; add.s64 %rd203, %rd4, 60; mul.lo.s64 %rd204, %rd203, %rd11; add.s64 %rd205, %rd32, %rd204; cvta.to.global.u64 %rd206, %rd12; add.s64 %rd207, %rd206, %rd205; st.global.v4.u32 [%rd207], {%r2268, %r2269, %r2270, %r2271}; $L__BB0_93: ret; }
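// Retrospective summary of the store ladder above ($L__BB0_49 .. $L__BB0_91):
// %r2293 holds this thread's first output row, %rd12 (param+8) the output
// pointer, %rd11 (param+32) the row stride in bytes, and %rd32 the per-lane
// byte column offset. Each step bounds-checks row + 4k against the sequence
// length %r1 and exits through $L__BB0_93 once past the end; lanes with %p103
// set (lane id >= param[60] >> 3, which with the 160-wide head leaves lanes
// 0..19 storing 16 bytes each) never store. A CUDA-level sketch with assumed
// names row, seqlen, lane_ok, o, o_stride, col, frag:
//   for (int k = 0; k < 16; ++k) {
//     if (row + 4 * k >= seqlen) return;     // setp.ge.s32 + bra $L__BB0_93
//     if (lane_ok)                           // !%p103
//       *(uint4 *)(o + (size_t)(row + 4 * k) * o_stride + col) = frag[k];  // st.global.v4.u32
//   }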