5, [fmha_mhca_fp16_128_256_sm86_kernel_nl_param_0+232];
mul.lo.s32 %r362, %r5, %r359;
ld.param.u32 %r6, [fmha_mhca_fp16_128_256_sm86_kernel_nl_param_0+224];
mad.lo.s32 %r7, %r362, %r6, %r361;
mov.u32 %r363, %ctaid.z;
shl.b32 %r8, %r363, 4;
setp.le.s32 %p19, %r2, %r8;
@%p19 bra $L__BB0_86;
shr.s32 %r372, %r1, 31;
shr.u32 %r373, %r372, 27;
add.s32 %r374, %r1, %r373;
shr.s32 %r9, %r374, 5;
and.b32 %r375, %r374, -32;
sub.s32 %r10, %r1, %r375;
ld.param.u64 %rd43, [%rd1+176];
ld.param.u32 %r381, [%rd1+196];
shl.b32 %r382, %r381, 1;
shr.s32 %r383, %r381, 31;
shr.u32 %r384, %r383, 29;
add.s32 %r385, %r381, %r384;
shr.s32 %r386, %r385, 3;
setp.lt.s32 %p20, %r10, %r386;
add.s32 %r387, %r9, %r8;
cvt.s64.s32 %rd2, %r387;
ld.param.u64 %rd44, [%rd1+184];
mul.lo.s64 %rd45, %rd44, %rd2;
mul.wide.s32 %rd46, %r382, %r4;
shl.b32 %r388, %r10, 4;
cvt.s64.s32 %rd47, %r388;
shr.s32 %r389, %r374, 31;
shr.u32 %r390, %r389, 29;
add.s32 %r391, %r9, %r390;
and.b32 %r392, %r391, 268435448;
sub.s32 %r393, %r9, %r392;
xor.b32 %r394, %r393, %r10;
shl.b32 %r395, %r9, 9;
shl.b32 %r12, %r394, 4;
add.s32 %r396, %r12, %r395;
mov.u32 %r397, 31;
mov.u32 %r2478, 0;
mov.u32 %r398, -1;
shfl.sync.idx.b32 %r13|%p21, %r2478, %r2478, %r397, %r398;
shfl.sync.idx.b32 %r399|%p22, %r2478, %r2478, %r397, %r398;
ld.param.u64 %rd5, [%rd1+208];
ld.param.u32 %r16, [%rd1+228];
shl.b32 %r400, %r16, 1;
shr.s32 %r401, %r16, 31;
shr.u32 %r402, %r401, 29;
add.s32 %r403, %r16, %r402;
shr.s32 %r17, %r403, 3;
setp.lt.s32 %p1, %r10, %r17;
setp.ge.s32 %p23, %r10, %r17;
cvt.s64.s32 %rd48, %r9;
ld.param.u64 %rd3, [%rd1+216];
mul.lo.s64 %rd49, %rd3, %rd48;
shl.b32 %r404, %r7, 1;
mul.wide.s32 %rd50, %r400, %r404;
add.s64 %rd51, %rd50, %rd47;
add.s64 %rd52, %rd51, %rd49;
cvta.to.global.u64 %rd53, %rd5;
add.s64 %rd4, %rd53, %rd52;
shfl.sync.idx.b32 %r14|%p24, %r2478, %r2478, %r397, %r398;
shfl.sync.idx.b32 %r15|%p25, %r2478, %r2478, %r397, %r398;
or.b32 %r405, %r404, 1;
mul.wide.s32 %rd54, %r400, %r405;
add.s64 %rd55, %rd54, %rd47;
add.s64 %rd56, %rd55, %rd49;
add.s64 %rd7, %rd53, %rd56;
shfl.sync.idx.b32 %r406|%p26, %r2478, %r2478, %r397, %r398;
shfl.sync.idx.b32 %r18|%p27, %r2478, %r2478, %r397, %r398;
sub.s32 %r410, %r2, %r8;
min.s32 %r411, %r410, 16;
setp.lt.s32 %p28, %r9, %r411;
and.pred %p29, %p20, %p28;
add.s32 %r21, %r9, 8;
setp.lt.s32 %p30, %r21, %r411;
and.pred %p31, %p20, %p30;
add.s64 %rd57, %rd45, %rd47;
add.s64 %rd58, %rd57, %rd46;
shl.b64 %rd59, %rd44, 3;
add.s64 %rd41, %rd43, %rd58;
add.s64 %rd42, %rd41, %rd59;
mov.u32 %r412, _ZN25fused_multihead_attention5smem_E;
add.s32 %r413, %r396, %r412;
add.s32 %r364, %r413, %r399;
add.s32 %r366, %r364, 4096;
selp.b32 %r365, 16, 0, %p29;
// begin inline asm
cp.async.cg.shared.global [%r364], [%rd41], 16, %r365;
// end inline asm
selp.b32 %r367, 16, 0, %p31;
// begin inline asm
cp.async.cg.shared.global [%r366], [%rd42], 16, %r367;
// end inline asm
min.s32 %r22, %r5, 128;
setp.ge.s32 %p32, %r9, %r22;
or.pred %p33, %p23, %p32;
mov.u32 %r2474, %r2478;
mov.u32 %r2475, %r2478;
mov.u32 %r2476, %r2478;
mov.u32 %r2477, %r2478;
@%p33 bra $L__BB0_3;
ld.global.v4.u32 {%r2474, %r2475, %r2476, %r2477}, [%rd4];
$L__BB0_3:
shl.b64 %rd10, %rd3, 3;
add.s64 %rd11, %rd4, %rd10;
setp.ge.s32 %p34, %r21, %r22;
not.pred %p35, %p1;
or.pred %p36, %p35, %p34;
mov.u32 %r2479, %r2478;
mov.u32 %r2480, %r2478;
mov.u32 %r2481, %r2478;
@%p36 bra $L__BB0_5;
ld.global.v4.u32 {%r2478, %r2479, %r2480, %r2481}, [%rd11];
$L__BB0_5:
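// [annotation; not part of the compiled kernel] The two
// cp.async.cg.shared.global instructions above prefetch 16B per guarded row
// into shared memory. The copy size is a constant 16 while the source size is
// chosen by selp (%r365/%r367): 16 when the row predicate holds, 0 otherwise,
// so out-of-bounds rows are zero-filled rather than branched around. A minimal
// sketch of the idiom, with placeholder operand names:
//
//   selp.b32 %sz, 16, 0, %p_in_bounds;
//   cp.async.cg.shared.global [%smem], [%gmem], 16, %sz;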
add.s64 %rd12, %rd11, %rd10;
add.s32 %r430, %r9, 16;
setp.ge.s32 %p37, %r430, %r22;
mov.u32 %r2486, 0;
or.pred %p39, %p35, %p37;
mov.u32 %r2482, %r2486;
mov.u32 %r2483, %r2486;
mov.u32 %r2484, %r2486;
mov.u32 %r2485, %r2486;
@%p39 bra $L__BB0_7;
ld.global.v4.u32 {%r2482, %r2483, %r2484, %r2485}, [%rd12];
$L__BB0_7:
add.s64 %rd13, %rd12, %rd10;
add.s32 %r439, %r9, 24;
setp.ge.s32 %p40, %r439, %r22;
or.pred %p42, %p35, %p40;
mov.u32 %r2487, %r2486;
mov.u32 %r2488, %r2486;
mov.u32 %r2489, %r2486;
@%p42 bra $L__BB0_9;
ld.global.v4.u32 {%r2486, %r2487, %r2488, %r2489}, [%rd13];
$L__BB0_9:
add.s64 %rd14, %rd13, %rd10;
add.s32 %r448, %r9, 32;
setp.ge.s32 %p43, %r448, %r22;
mov.u32 %r2494, 0;
or.pred %p45, %p35, %p43;
mov.u32 %r2490, %r2494;
mov.u32 %r2491, %r2494;
mov.u32 %r2492, %r2494;
mov.u32 %r2493, %r2494;
@%p45 bra $L__BB0_11;
ld.global.v4.u32 {%r2490, %r2491, %r2492, %r2493}, [%rd14];
$L__BB0_11:
add.s64 %rd15, %rd14, %rd10;
add.s32 %r457, %r9, 40;
setp.ge.s32 %p46, %r457, %r22;
or.pred %p48, %p35, %p46;
mov.u32 %r2495, %r2494;
mov.u32 %r2496, %r2494;
mov.u32 %r2497, %r2494;
@%p48 bra $L__BB0_13;
ld.global.v4.u32 {%r2494, %r2495, %r2496, %r2497}, [%rd15];
$L__BB0_13:
add.s64 %rd16, %rd15, %rd10;
add.s32 %r466, %r9, 48;
setp.ge.s32 %p49, %r466, %r22;
mov.u32 %r2502, 0;
or.pred %p51, %p35, %p49;
mov.u32 %r2498, %r2502;
mov.u32 %r2499, %r2502;
mov.u32 %r2500, %r2502;
mov.u32 %r2501, %r2502;
@%p51 bra $L__BB0_15;
ld.global.v4.u32 {%r2498, %r2499, %r2500, %r2501}, [%rd16];
$L__BB0_15:
add.s64 %rd17, %rd16, %rd10;
add.s32 %r475, %r9, 56;
setp.ge.s32 %p52, %r475, %r22;
or.pred %p54, %p35, %p52;
mov.u32 %r2503, %r2502;
mov.u32 %r2504, %r2502;
mov.u32 %r2505, %r2502;
@%p54 bra $L__BB0_17;
ld.global.v4.u32 {%r2502, %r2503, %r2504, %r2505}, [%rd17];
$L__BB0_17:
add.s64 %rd18, %rd17, %rd10;
add.s32 %r484, %r9, 64;
setp.ge.s32 %p55, %r484, %r22;
mov.u32 %r2510, 0;
or.pred %p57, %p35, %p55;
mov.u32 %r2506, %r2510;
mov.u32 %r2507, %r2510;
mov.u32 %r2508, %r2510;
mov.u32 %r2509, %r2510;
@%p57 bra $L__BB0_19;
ld.global.v4.u32 {%r2506, %r2507, %r2508, %r2509}, [%rd18];
$L__BB0_19:
add.s64 %rd19, %rd18, %rd10;
add.s32 %r493, %r9, 72;
setp.ge.s32 %p58, %r493, %r22;
or.pred %p60, %p35, %p58;
mov.u32 %r2511, %r2510;
mov.u32 %r2512, %r2510;
mov.u32 %r2513, %r2510;
@%p60 bra $L__BB0_21;
ld.global.v4.u32 {%r2510, %r2511, %r2512, %r2513}, [%rd19];
$L__BB0_21:
add.s64 %rd20, %rd19, %rd10;
add.s32 %r502, %r9, 80;
setp.ge.s32 %p61, %r502, %r22;
mov.u32 %r2518, 0;
or.pred %p63, %p35, %p61;
mov.u32 %r2514, %r2518;
mov.u32 %r2515, %r2518;
mov.u32 %r2516, %r2518;
mov.u32 %r2517, %r2518;
@%p63 bra $L__BB0_23;
ld.global.v4.u32 {%r2514, %r2515, %r2516, %r2517}, [%rd20];
$L__BB0_23:
add.s64 %rd21, %rd20, %rd10;
add.s32 %r511, %r9, 88;
setp.ge.s32 %p64, %r511, %r22;
or.pred %p66, %p35, %p64;
mov.u32 %r2519, %r2518;
mov.u32 %r2520, %r2518;
mov.u32 %r2521, %r2518;
@%p66 bra $L__BB0_25;
ld.global.v4.u32 {%r2518, %r2519, %r2520, %r2521}, [%rd21];
$L__BB0_25:
add.s64 %rd22, %rd21, %rd10;
add.s32 %r520, %r9, 96;
setp.ge.s32 %p67, %r520, %r22;
mov.u32 %r2526, 0;
or.pred %p69, %p35, %p67;
mov.u32 %r2522, %r2526;
mov.u32 %r2523, %r2526;
mov.u32 %r2524, %r2526;
mov.u32 %r2525, %r2526;
@%p69 bra $L__BB0_27;
ld.global.v4.u32 {%r2522, %r2523, %r2524, %r2525}, [%rd22];
$L__BB0_27:
add.s64 %rd23, %rd22, %rd10;
add.s32 %r529, %r9, 104;
setp.ge.s32 %p70, %r529, %r22;
or.pred %p72, %p35, %p70;
mov.u32 %r2527, %r2526;
mov.u32 %r2528, %r2526;
mov.u32 %r2529, %r2526;
@%p72 bra $L__BB0_29;
ld.global.v4.u32 {%r2526, %r2527, %r2528, %r2529}, [%rd23];
$L__BB0_29:
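// [annotation] The repeating unit above and below -- pointer bump by %rd10,
// bounds test, or.pred with the head predicate, branch around a 128-bit
// vector load, fall-through label -- is a fully unrolled, guarded
// gmem-to-register copy. The destination registers were pre-zeroed by the
// mov chains, so a skipped load leaves zeros behind. Sketch with placeholder
// names:
//
//   add.s64 %ptr, %ptr, %row_stride;
//   setp.ge.s32 %p_oob, %row, %rows_valid;
//   or.pred %p_skip, %p_head_oob, %p_oob;
//   @%p_skip bra SKIP;
//   ld.global.v4.u32 {%d0, %d1, %d2, %d3}, [%ptr];
// SKIP: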
add.s64 %rd24, %rd23, %rd10;
add.s32 %r538, %r9, 112;
setp.ge.s32 %p73, %r538, %r22;
mov.u32 %r2534, 0;
or.pred %p75, %p35, %p73;
mov.u32 %r2530, %r2534;
mov.u32 %r2531, %r2534;
mov.u32 %r2532, %r2534;
mov.u32 %r2533, %r2534;
@%p75 bra $L__BB0_31;
ld.global.v4.u32 {%r2530, %r2531, %r2532, %r2533}, [%rd24];
$L__BB0_31:
add.s64 %rd25, %rd24, %rd10;
add.s32 %r547, %r9, 120;
setp.ge.s32 %p76, %r547, %r22;
or.pred %p78, %p35, %p76;
mov.u32 %r2535, %r2534;
mov.u32 %r2536, %r2534;
mov.u32 %r2537, %r2534;
@%p78 bra $L__BB0_33;
ld.global.v4.u32 {%r2534, %r2535, %r2536, %r2537}, [%rd25];
$L__BB0_33:
add.s64 %rd26, %rd7, %rd10;
add.s64 %rd27, %rd26, %rd10;
add.s64 %rd28, %rd27, %rd10;
add.s64 %rd29, %rd28, %rd10;
add.s64 %rd30, %rd29, %rd10;
add.s64 %rd31, %rd30, %rd10;
add.s64 %rd32, %rd31, %rd10;
add.s64 %rd33, %rd32, %rd10;
add.s64 %rd34, %rd33, %rd10;
add.s64 %rd35, %rd34, %rd10;
add.s64 %rd36, %rd35, %rd10;
add.s64 %rd37, %rd36, %rd10;
add.s64 %rd38, %rd37, %rd10;
mov.u32 %r2542, 0;
mov.u32 %r2538, %r2542;
mov.u32 %r2539, %r2542;
mov.u32 %r2540, %r2542;
mov.u32 %r2541, %r2542;
@%p33 bra $L__BB0_35;
ld.global.v4.u32 {%r2538, %r2539, %r2540, %r2541}, [%rd7];
$L__BB0_35:
mov.u32 %r2448, %tid.x;
shr.s32 %r2447, %r2448, 31;
shr.u32 %r2446, %r2447, 27;
add.s32 %r2445, %r2448, %r2446;
shr.s32 %r2444, %r2445, 5;
add.s32 %r2443, %r2444, 16;
add.s32 %r2442, %r2444, 24;
add.s32 %r2441, %r2444, 32;
add.s32 %r2440, %r2444, 40;
add.s32 %r2439, %r2444, 48;
add.s32 %r2438, %r2444, 56;
add.s32 %r2437, %r2444, 64;
add.s32 %r2436, %r2444, 72;
add.s32 %r2435, %r2444, 80;
add.s32 %r2434, %r2444, 88;
add.s32 %r2433, %r2444, 96;
add.s32 %r2432, %r2444, 104;
add.s32 %r2431, %r2444, 112;
add.s32 %r2430, %r2444, 120;
ld.param.u32 %r2429, [fmha_mhca_fp16_128_256_sm86_kernel_nl_param_0+232];
min.s32 %r2428, %r2429, 128;
setp.lt.s32 %p82, %r2430, %r2428;
and.pred %p3, %p1, %p82;
setp.lt.s32 %p84, %r2431, %r2428;
and.pred %p4, %p1, %p84;
setp.lt.s32 %p85, %r2432, %r2428;
and.pred %p5, %p1, %p85;
setp.lt.s32 %p86, %r2433, %r2428;
and.pred %p6, %p1, %p86;
setp.lt.s32 %p87, %r2434, %r2428;
and.pred %p7, %p1, %p87;
setp.lt.s32 %p88, %r2435, %r2428;
and.pred %p8, %p1, %p88;
setp.lt.s32 %p89, %r2436, %r2428;
and.pred %p9, %p1, %p89;
setp.lt.s32 %p90, %r2437, %r2428;
and.pred %p10, %p1, %p90;
setp.lt.s32 %p91, %r2438, %r2428;
and.pred %p11, %p1, %p91;
setp.lt.s32 %p92, %r2439, %r2428;
and.pred %p12, %p1, %p92;
setp.lt.s32 %p93, %r2440, %r2428;
and.pred %p13, %p1, %p93;
setp.lt.s32 %p94, %r2441, %r2428;
and.pred %p14, %p1, %p94;
setp.lt.s32 %p95, %r2442, %r2428;
and.pred %p15, %p1, %p95;
setp.lt.s32 %p96, %r2443, %r2428;
and.pred %p16, %p1, %p96;
mov.u32 %r2543, %r2542;
mov.u32 %r2544, %r2542;
mov.u32 %r2545, %r2542;
@%p36 bra $L__BB0_37;
mov.b64 %rd171, fmha_mhca_fp16_128_256_sm86_kernel_nl_param_0;
mov.u64 %rd170, %rd171;
ld.param.u64 %rd169, [%rd170+216];
shl.b64 %rd168, %rd169, 3;
add.s64 %rd167, %rd7, %rd168;
ld.global.v4.u32 {%r2542, %r2543, %r2544, %r2545}, [%rd167];
$L__BB0_37:
mov.u32 %r2550, 0;
not.pred %p99, %p16;
mov.u32 %r2546, %r2550;
mov.u32 %r2547, %r2550;
mov.u32 %r2548, %r2550;
mov.u32 %r2549, %r2550;
@%p99 bra $L__BB0_39;
mov.b64 %rd166, fmha_mhca_fp16_128_256_sm86_kernel_nl_param_0;
mov.u64 %rd165, %rd166;
ld.param.u64 %rd164, [%rd165+216];
shl.b64 %rd163, %rd164, 3;
add.s64 %rd162, %rd7, %rd163;
add.s64 %rd161, %rd162, %rd163;
ld.global.v4.u32 {%r2546, %r2547, %r2548, %r2549}, [%rd161];
$L__BB0_39:
not.pred %p100, %p15;
mov.u32 %r2551, %r2550;
mov.u32 %r2552, %r2550;
mov.u32 %r2553, %r2550;
@%p100 bra $L__BB0_41;
ld.global.v4.u32 {%r2550, %r2551, %r2552, %r2553}, [%rd28];
$L__BB0_41:
mov.u32 %r2558, 0;
not.pred %p101, %p14;
mov.u32 %r2554, %r2558;
mov.u32 %r2555, %r2558;
mov.u32 %r2556, %r2558;
mov.u32 %r2557, %r2558;
@%p101 bra $L__BB0_43;
ld.global.v4.u32 {%r2554, %r2555, %r2556, %r2557}, [%rd29];
$L__BB0_43:
not.pred %p102, %p13;
mov.u32 %r2559, %r2558;
mov.u32 %r2560, %r2558;
mov.u32 %r2561, %r2558;
@%p102 bra $L__BB0_45;
mov.b64 %rd160, fmha_mhca_fp16_128_256_sm86_kernel_nl_param_0;
mov.u64 %rd159, %rd160;
ld.param.u64 %rd158, [%rd159+216];
shl.b64 %rd157, %rd158, 3;
add.s64 %rd156, %rd29, %rd157;
ld.global.v4.u32 {%r2558, %r2559, %r2560, %r2561}, [%rd156];
$L__BB0_45:
mov.u32 %r2566, 0;
not.pred %p103, %p12;
mov.u32 %r2562, %r2566;
mov.u32 %r2563, %r2566;
mov.u32 %r2564, %r2566;
mov.u32 %r2565, %r2566;
@%p103 bra $L__BB0_47;
mov.b64 %rd155, fmha_mhca_fp16_128_256_sm86_kernel_nl_param_0;
mov.u64 %rd154, %rd155;
ld.param.u64 %rd153, [%rd154+216];
shl.b64 %rd152, %rd153, 3;
add.s64 %rd151, %rd29, %rd152;
add.s64 %rd150, %rd151, %rd152;
ld.global.v4.u32 {%r2562, %r2563, %r2564, %r2565}, [%rd150];
$L__BB0_47:
not.pred %p104, %p11;
mov.u32 %r2567, %r2566;
mov.u32 %r2568, %r2566;
mov.u32 %r2569, %r2566;
@%p104 bra $L__BB0_49;
ld.global.v4.u32 {%r2566, %r2567, %r2568, %r2569}, [%rd32];
$L__BB0_49:
mov.u32 %r2574, 0;
not.pred %p105, %p10;
mov.u32 %r2570, %r2574;
mov.u32 %r2571, %r2574;
mov.u32 %r2572, %r2574;
mov.u32 %r2573, %r2574;
@%p105 bra $L__BB0_51;
ld.global.v4.u32 {%r2570, %r2571, %r2572, %r2573}, [%rd33];
$L__BB0_51:
not.pred %p106, %p9;
mov.u32 %r2575, %r2574;
mov.u32 %r2576, %r2574;
mov.u32 %r2577, %r2574;
@%p106 bra $L__BB0_53;
mov.b64 %rd149, fmha_mhca_fp16_128_256_sm86_kernel_nl_param_0;
mov.u64 %rd148, %rd149;
ld.param.u64 %rd147, [%rd148+216];
shl.b64 %rd146, %rd147, 3;
add.s64 %rd145, %rd33, %rd146;
ld.global.v4.u32 {%r2574, %r2575, %r2576, %r2577}, [%rd145];
$L__BB0_53:
mov.u32 %r2582, 0;
not.pred %p107, %p8;
mov.u32 %r2578, %r2582;
mov.u32 %r2579, %r2582;
mov.u32 %r2580, %r2582;
mov.u32 %r2581, %r2582;
@%p107 bra $L__BB0_55;
mov.b64 %rd144, fmha_mhca_fp16_128_256_sm86_kernel_nl_param_0;
mov.u64 %rd143, %rd144;
ld.param.u64 %rd142, [%rd143+216];
shl.b64 %rd141, %rd142, 3;
add.s64 %rd140, %rd33, %rd141;
add.s64 %rd139, %rd140, %rd141;
ld.global.v4.u32 {%r2578, %r2579, %r2580, %r2581}, [%rd139];
$L__BB0_55:
not.pred %p108, %p7;
mov.u32 %r2583, %r2582;
mov.u32 %r2584, %r2582;
mov.u32 %r2585, %r2582;
@%p108 bra $L__BB0_57;
ld.global.v4.u32 {%r2582, %r2583, %r2584, %r2585}, [%rd36];
$L__BB0_57:
mov.u32 %r2590, 0;
not.pred %p109, %p6;
mov.u32 %r2586, %r2590;
mov.u32 %r2587, %r2590;
mov.u32 %r2588, %r2590;
mov.u32 %r2589, %r2590;
@%p109 bra $L__BB0_59;
ld.global.v4.u32 {%r2586, %r2587, %r2588, %r2589}, [%rd37];
$L__BB0_59:
not.pred %p110, %p5;
mov.u32 %r2591, %r2590;
mov.u32 %r2592, %r2590;
mov.u32 %r2593, %r2590;
@%p110 bra $L__BB0_61;
ld.global.v4.u32 {%r2590, %r2591, %r2592, %r2593}, [%rd38];
$L__BB0_61:
mov.u32 %r2598, 0;
not.pred %p111, %p4;
mov.u32 %r2594, %r2598;
mov.u32 %r2595, %r2598;
mov.u32 %r2596, %r2598;
mov.u32 %r2597, %r2598;
@%p111 bra $L__BB0_63;
add.s64 %rd83, %rd38, %rd10;
ld.global.v4.u32 {%r2594, %r2595, %r2596, %r2597}, [%rd83];
$L__BB0_63:
not.pred %p112, %p3;
mov.u32 %r2599, %r2598;
mov.u32 %r2600, %r2598;
mov.u32 %r2601, %r2598;
@%p112 bra $L__BB0_65;
add.s64 %rd106, %rd38, %rd10;
add.s64 %rd107, %rd106, %rd10;
ld.global.v4.u32 {%r2598, %r2599, %r2600, %r2601}, [%rd107];
$L__BB0_65:
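// [annotation] The %tid.x arithmetic that follows appears to recompute, per
// thread, the warp index (%tid.x >> 5), the lane index (%tid.x & 31), and an
// XOR-swizzled shared-memory byte offset of the form
//   offset = (warp << 9) + (((warp % 8) ^ lane) << 4)
// i.e. 512B rows of 16B elements whose column is permuted by the row index,
// the usual bank-conflict-avoiding layout for ldmatrix. The same pattern
// appears near the top of the kernel (%r394/%r396).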
mov.u32 %r2473, %tid.x;
shr.s32 %r2472, %r2473, 31;
shr.s32 %r2471, %r2473, 31;
shr.u32 %r2470, %r2471, 27;
add.s32 %r2469, %r2473, %r2470;
and.b32 %r2468, %r2469, -32;
sub.s32 %r2467, %r2473, %r2468;
shr.s32 %r2466, %r2467, 31;
shr.u32 %r2465, %r2466, 30;
add.s32 %r2464, %r2467, %r2465;
shr.s32 %r2463, %r2469, 5;
and.b32 %r2462, %r2464, 2147483644;
sub.s32 %r2461, %r2467, %r2462;
shl.b32 %r2460, %r2461, 1;
shr.s32 %r2459, %r2469, 31;
shr.u32 %r2458, %r2459, 29;
add.s32 %r2457, %r2463, %r2458;
and.b32 %r2456, %r2457, 268435448;
sub.s32 %r2455, %r2463, %r2456;
xor.b32 %r2454, %r2455, %r2467;
shl.b32 %r2453, %r2463, 9;
shl.b32 %r2452, %r2454, 4;
add.s32 %r2451, %r2452, %r2453;
mov.u32 %r2450, _ZN25fused_multihead_attention5smem_E;
ld.param.u32 %r2449, [fmha_mhca_fp16_128_256_sm86_kernel_nl_param_0+232];
mov.b64 %rd138, fmha_mhca_fp16_128_256_sm86_kernel_nl_param_0;
mov.u64 %rd137, %rd138;
// begin inline asm
cp.async.commit_group;
// end inline asm
add.s32 %r1466, %r2450, 16384;
add.s32 %r1467, %r2451, %r1466;
add.s32 %r711, %r1467, %r15;
add.s32 %r716, %r711, 4096;
add.s32 %r721, %r711, 8192;
add.s32 %r726, %r711, 12288;
add.s32 %r731, %r711, 16384;
add.s32 %r736, %r711, 20480;
add.s32 %r741, %r711, 24576;
add.s32 %r746, %r711, 28672;
add.s32 %r751, %r711, 32768;
add.s32 %r756, %r711, 36864;
add.s32 %r761, %r711, 40960;
add.s32 %r766, %r711, 45056;
add.s32 %r771, %r711, 49152;
add.s32 %r776, %r711, 53248;
add.s32 %r781, %r711, 57344;
add.s32 %r786, %r711, 61440;
// begin inline asm
st.shared.v4.b32 [%r711], {%r2474, %r2475, %r2476, %r2477};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r716], {%r2478, %r2479, %r2480, %r2481};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r721], {%r2482, %r2483, %r2484, %r2485};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r726], {%r2486, %r2487, %r2488, %r2489};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r731], {%r2490, %r2491, %r2492, %r2493};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r736], {%r2494, %r2495, %r2496, %r2497};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r741], {%r2498, %r2499, %r2500, %r2501};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r746], {%r2502, %r2503, %r2504, %r2505};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r751], {%r2506, %r2507, %r2508, %r2509};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r756], {%r2510, %r2511, %r2512, %r2513};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r761], {%r2514, %r2515, %r2516, %r2517};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r766], {%r2518, %r2519, %r2520, %r2521};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r771], {%r2522, %r2523, %r2524, %r2525};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r776], {%r2526, %r2527, %r2528, %r2529};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r781], {%r2530, %r2531, %r2532, %r2533};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r786], {%r2534, %r2535, %r2536, %r2537};
// end inline asm
// begin inline asm
cp.async.wait_group 0;
// end inline asm
bar.sync 0;
and.b32 %r1469, %r2473, 15;
shl.b32 %r1470, %r2473, 9;
and.b32 %r1471, %r1470, 7680;
and.b32 %r1472, %r2473, 7;
shl.b32 %r1473, %r2473, 4;
and.b32 %r1474, %r1473, 112;
and.b32 %r1475, %r2473, 16;
xor.b32 %r1476, %r1474, %r1475;
or.b32 %r1477, %r1476, %r1471;
add.s32 %r1478, %r1477, %r2450;
add.s32 %r795, %r1478, %r13;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r791, %r792, %r793, %r794}, [%r795];
// end inline asm
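// [annotation] cp.async.commit_group above closes the batch of async copies
// issued earlier; the st.shared.v4 block then stages the register-resident
// tile, and cp.async.wait_group 0 / bar.sync 0 drain the outstanding copies
// and publish both tiles CTA-wide for the ldmatrix reads that begin above
// and continue below. Sketch of the ordering idiom:
//
//   cp.async.commit_group;     // close the in-flight batch
//   ...                        // independent work overlaps the copies
//   cp.async.wait_group 0;     // wait until no groups remain pending
//   bar.sync 0;                // make the tile visible to all warps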
and.b32 %r1479, %r2473, 224;
shr.u32 %r1480, %r1479, 1;
mov.u32 %r1481, 1;
or.b32 %r1482, %r1480, %r1472;
shr.u32 %r1483, %r1475, 1;
or.b32 %r1484, %r1482, %r1483;
shl.b32 %r1485, %r1484, 9;
and.b32 %r1486, %r2473, 8;
shr.u32 %r1487, %r1486, 3;
xor.b32 %r1488, %r1487, %r1472;
shl.b32 %r1489, %r1488, 4;
or.b32 %r1490, %r1485, %r1489;
add.s32 %r1491, %r14, %r1466;
add.s32 %r800, %r1491, %r1490;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r796, %r797, %r798, %r799}, [%r800];
// end inline asm
xor.b32 %r1492, %r1490, 32;
add.s32 %r805, %r1491, %r1492;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r801, %r802, %r803, %r804}, [%r805];
// end inline asm
xor.b32 %r1493, %r1490, 64;
add.s32 %r810, %r1491, %r1493;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r806, %r807, %r808, %r809}, [%r810];
// end inline asm
xor.b32 %r1494, %r1490, 96;
add.s32 %r815, %r1491, %r1494;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r811, %r812, %r813, %r814}, [%r815];
// end inline asm
or.b32 %r1495, %r1490, 128;
add.s32 %r820, %r1491, %r1495;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r816, %r817, %r818, %r819}, [%r820];
// end inline asm
xor.b32 %r1496, %r1490, 160;
add.s32 %r825, %r1491, %r1496;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r821, %r822, %r823, %r824}, [%r825];
// end inline asm
xor.b32 %r1497, %r1490, 192;
add.s32 %r830, %r1491, %r1497;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r826, %r827, %r828, %r829}, [%r830];
// end inline asm
xor.b32 %r1498, %r1490, 224;
add.s32 %r835, %r1491, %r1498;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r831, %r832, %r833, %r834}, [%r835];
// end inline asm
or.b32 %r1499, %r1490, 256;
add.s32 %r840, %r1491, %r1499;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r836, %r837, %r838, %r839}, [%r840];
// end inline asm
xor.b32 %r1500, %r1490, 288;
add.s32 %r845, %r1491, %r1500;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r841, %r842, %r843, %r844}, [%r845];
// end inline asm
xor.b32 %r1501, %r1490, 320;
add.s32 %r850, %r1491, %r1501;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r846, %r847, %r848, %r849}, [%r850];
// end inline asm
xor.b32 %r1502, %r1490, 352;
add.s32 %r855, %r1491, %r1502;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r851, %r852, %r853, %r854}, [%r855];
// end inline asm
or.b32 %r1503, %r1490, 384;
add.s32 %r860, %r1491, %r1503;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r856, %r857, %r858, %r859}, [%r860];
// end inline asm
xor.b32 %r1504, %r1490, 416;
add.s32 %r865, %r1491, %r1504;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r861, %r862, %r863, %r864}, [%r865];
// end inline asm
xor.b32 %r1505, %r1490, 448;
add.s32 %r870, %r1491, %r1505;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r866, %r867, %r868, %r869}, [%r870];
// end inline asm
xor.b32 %r1506, %r1490, 480;
add.s32 %r875, %r1491, %r1506;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r871, %r872, %r873, %r874}, [%r875];
// end inline asm
xor.b32 %r1507, %r1477, 32;
bar.sync 0;
add.s32 %r876, %r1467, %r18;
add.s32 %r881, %r876, 4096;
add.s32 %r886, %r876, 8192;
add.s32 %r891, %r876, 12288;
add.s32 %r896, %r876, 16384;
add.s32 %r901, %r876, 20480;
add.s32 %r906, %r876, 24576;
add.s32 %r911, %r876, 28672;
add.s32 %r916, %r876, 32768;
add.s32 %r921, %r876, 36864;
add.s32 %r926, %r876, 40960;
add.s32 %r931, %r876, 45056;
add.s32 %r936, %r876, 49152;
add.s32 %r941, %r876, 53248;
add.s32 %r946, %r876, 57344;
add.s32 %r951, %r876, 61440;
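// [annotation] Each ldmatrix.sync.aligned.m8n8.x4 above loads four 8x8 f16
// tiles (one 16x16 fragment) per warp from shared memory. The addresses step
// through the tile by OR/XOR-ing the constants 32..480 into the swizzled
// base, which walks the permuted columns without recomputing the swizzle.
// The bar.sync 0 just above guards the shared buffer before the second tile
// is stored over it below; the .trans ldmatrix variants that follow load
// that tile with rows and columns exchanged, as mma.sync expects a
// column-major B fragment.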
// begin inline asm
st.shared.v4.b32 [%r876], {%r2538, %r2539, %r2540, %r2541};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r881], {%r2542, %r2543, %r2544, %r2545};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r886], {%r2546, %r2547, %r2548, %r2549};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r891], {%r2550, %r2551, %r2552, %r2553};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r896], {%r2554, %r2555, %r2556, %r2557};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r901], {%r2558, %r2559, %r2560, %r2561};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r906], {%r2562, %r2563, %r2564, %r2565};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r911], {%r2566, %r2567, %r2568, %r2569};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r916], {%r2570, %r2571, %r2572, %r2573};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r921], {%r2574, %r2575, %r2576, %r2577};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r926], {%r2578, %r2579, %r2580, %r2581};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r931], {%r2582, %r2583, %r2584, %r2585};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r936], {%r2586, %r2587, %r2588, %r2589};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r941], {%r2590, %r2591, %r2592, %r2593};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r946], {%r2594, %r2595, %r2596, %r2597};
// end inline asm
// begin inline asm
st.shared.v4.b32 [%r951], {%r2598, %r2599, %r2600, %r2601};
// end inline asm
bar.sync 0;
or.b32 %r1508, %r1480, %r1469;
shl.b32 %r1509, %r1508, 9;
or.b32 %r1510, %r1509, %r1476;
add.s32 %r960, %r1510, %r1466;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r956, %r957, %r958, %r959}, [%r960];
// end inline asm
xor.b32 %r1511, %r1510, 32;
add.s32 %r965, %r1511, %r1466;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r961, %r962, %r963, %r964}, [%r965];
// end inline asm
xor.b32 %r1512, %r1510, 64;
add.s32 %r970, %r1512, %r1466;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r966, %r967, %r968, %r969}, [%r970];
// end inline asm
xor.b32 %r1513, %r1510, 96;
add.s32 %r975, %r1513, %r1466;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r971, %r972, %r973, %r974}, [%r975];
// end inline asm
or.b32 %r1514, %r1510, 128;
add.s32 %r980, %r1514, %r1466;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r976, %r977, %r978, %r979}, [%r980];
// end inline asm
xor.b32 %r1515, %r1510, 160;
add.s32 %r985, %r1515, %r1466;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r981, %r982, %r983, %r984}, [%r985];
// end inline asm
xor.b32 %r1516, %r1510, 192;
add.s32 %r990, %r1516, %r1466;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r986, %r987, %r988, %r989}, [%r990];
// end inline asm
xor.b32 %r1517, %r1510, 224;
add.s32 %r995, %r1517, %r1466;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r991, %r992, %r993, %r994}, [%r995];
// end inline asm
or.b32 %r1518, %r1510, 256;
add.s32 %r1000, %r1518, %r1466;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r996, %r997, %r998, %r999}, [%r1000];
// end inline asm
xor.b32 %r1519, %r1510, 288;
add.s32 %r1005, %r1519, %r1466;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1001, %r1002, %r1003, %r1004}, [%r1005];
// end inline asm
xor.b32 %r1520, %r1510, 320;
add.s32 %r1010, %r1520, %r1466;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1006, %r1007, %r1008, %r1009}, [%r1010];
// end inline asm
xor.b32 %r1521, %r1510, 352;
add.s32 %r1015, %r1521, %r1466;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1011, %r1012, %r1013, %r1014}, [%r1015];
// end inline asm
or.b32 %r1522, %r1510, 384;
add.s32 %r1020, %r1522, %r1466;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1016, %r1017, %r1018, %r1019}, [%r1020];
// end inline asm
xor.b32 %r1523, %r1510, 416;
add.s32 %r1025, %r1523, %r1466;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1021, %r1022, %r1023, %r1024}, [%r1025];
// end inline asm
xor.b32 %r1524, %r1510, 448;
add.s32 %r1030, %r1524, %r1466;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1026, %r1027, %r1028, %r1029}, [%r1030];
// end inline asm
xor.b32 %r1525, %r1510, 480;
add.s32 %r1035, %r1525, %r1466;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r1031, %r1032, %r1033, %r1034}, [%r1035];
// end inline asm
shr.u32 %r1527, %r2471, 25;
add.s32 %r1528, %r2473, %r1527;
shr.u32 %r1529, %r1528, 3;
and.b32 %r1530, %r1529, 536870896;
shr.s32 %r343, %r2464, 2;
add.s32 %r1534, %r1530, %r343;
shl.b32 %r1535, %r1534, 2;
mov.u32 %r1536, 2;
shr.s32 %r1537, %r2463, 31;
shr.u32 %r1538, %r1537, 30;
add.s32 %r1539, %r2463, %r1538;
and.b32 %r1540, %r1539, 1073741820;
sub.s32 %r1541, %r2463, %r1540;
add.s32 %r1542, %r1535, %r1541;
shl.b32 %r1543, %r1542, 2;
add.s32 %r344, %r1466, %r1543;
ld.param.u32 %r1437, [%rd137+64];
// begin inline asm
mov.u32 %r1070, 0;
// end inline asm
// begin inline asm
mov.u32 %r1071, 0;
// end inline asm
// begin inline asm
mov.u32 %r1080, 0;
// end inline asm
// begin inline asm
mov.u32 %r1081, 0;
// end inline asm
add.s32 %r1544, %r13, %r2450;
add.s32 %r1044, %r1544, %r1507;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1040, %r1041, %r1042, %r1043}, [%r1044];
// end inline asm
xor.b32 %r1545, %r1477, 64;
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1070, %r1071}, {%r791, %r792, %r793, %r794}, {%r796, %r797}, {%r1070, %r1071};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1080, %r1081}, {%r791, %r792, %r793, %r794}, {%r798, %r799}, {%r1080, %r1081};
// end inline asm
add.s32 %r1069, %r1544, %r1545;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1065, %r1066, %r1067, %r1068}, [%r1069];
// end inline asm
xor.b32 %r1546, %r1477, 96;
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1070, %r1071}, {%r1040, %r1041, %r1042, %r1043}, {%r801, %r802}, {%r1070, %r1071};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1080, %r1081}, {%r1040, %r1041, %r1042, %r1043}, {%r803, %r804}, {%r1080, %r1081};
// end inline asm
add.s32 %r1094, %r1544, %r1546;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1090, %r1091, %r1092, %r1093}, [%r1094];
// end inline asm
or.b32 %r1547, %r1477, 128;
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1070, %r1071}, {%r1065, %r1066, %r1067, %r1068}, {%r806, %r807}, {%r1070, %r1071};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1080, %r1081}, {%r1065, %r1066, %r1067, %r1068}, {%r808, %r809}, {%r1080, %r1081};
// end inline asm
add.s32 %r1119, %r1544, %r1547;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1115, %r1116, %r1117, %r1118}, [%r1119];
// end inline asm
xor.b32 %r1548, %r1477, 160;
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1070, %r1071}, {%r1090, %r1091, %r1092, %r1093}, {%r811, %r812}, {%r1070, %r1071};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1080, %r1081}, {%r1090, %r1091, %r1092, %r1093}, {%r813, %r814}, {%r1080, %r1081};
// end inline asm
add.s32 %r1144, %r1544, %r1548;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1140, %r1141, %r1142, %r1143}, [%r1144];
// end inline asm
xor.b32 %r1549, %r1477, 192;
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1070, %r1071}, {%r1115, %r1116, %r1117, %r1118}, {%r816, %r817}, {%r1070, %r1071};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1080, %r1081}, {%r1115, %r1116, %r1117, %r1118}, {%r818, %r819}, {%r1080, %r1081};
// end inline asm
add.s32 %r1169, %r1544, %r1549;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1165, %r1166, %r1167, %r1168}, [%r1169];
// end inline asm
xor.b32 %r1550, %r1477, 224;
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1070, %r1071}, {%r1140, %r1141, %r1142, %r1143}, {%r821, %r822}, {%r1070, %r1071};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1080, %r1081}, {%r1140, %r1141, %r1142, %r1143}, {%r823, %r824}, {%r1080, %r1081};
// end inline asm
add.s32 %r1194, %r1544, %r1550;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1190, %r1191, %r1192, %r1193}, [%r1194];
// end inline asm
or.b32 %r1551, %r1477, 256;
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1070, %r1071}, {%r1165, %r1166, %r1167, %r1168}, {%r826, %r827}, {%r1070, %r1071};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1080, %r1081}, {%r1165, %r1166, %r1167, %r1168}, {%r828, %r829}, {%r1080, %r1081};
// end inline asm
add.s32 %r1219, %r1544, %r1551;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1215, %r1216, %r1217, %r1218}, [%r1219];
// end inline asm
xor.b32 %r1552, %r1477, 288;
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1070, %r1071}, {%r1190, %r1191, %r1192, %r1193}, {%r831, %r832}, {%r1070, %r1071};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1080, %r1081}, {%r1190, %r1191, %r1192, %r1193}, {%r833, %r834}, {%r1080, %r1081};
// end inline asm
add.s32 %r1244, %r1544, %r1552;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1240, %r1241, %r1242, %r1243}, [%r1244];
// end inline asm
xor.b32 %r1553, %r1477, 320;
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1070, %r1071}, {%r1215, %r1216, %r1217, %r1218}, {%r836, %r837}, {%r1070, %r1071};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1080, %r1081}, {%r1215, %r1216, %r1217, %r1218}, {%r838, %r839}, {%r1080, %r1081};
// end inline asm
add.s32 %r1269, %r1544, %r1553;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1265, %r1266, %r1267, %r1268}, [%r1269];
// end inline asm
xor.b32 %r1554, %r1477, 352;
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1070, %r1071}, {%r1240, %r1241, %r1242, %r1243}, {%r841, %r842}, {%r1070, %r1071};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1080, %r1081}, {%r1240, %r1241, %r1242, %r1243}, {%r843, %r844}, {%r1080, %r1081};
// end inline asm
add.s32 %r1294, %r1544, %r1554;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1290, %r1291, %r1292, %r1293}, [%r1294];
// end inline asm
or.b32 %r1555, %r1477, 384;
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1070, %r1071}, {%r1265, %r1266, %r1267, %r1268}, {%r846, %r847}, {%r1070, %r1071};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1080, %r1081}, {%r1265, %r1266, %r1267, %r1268}, {%r848, %r849}, {%r1080, %r1081};
// end inline asm
add.s32 %r1319, %r1544, %r1555;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1315, %r1316, %r1317, %r1318}, [%r1319];
// end inline asm
xor.b32 %r1556, %r1477, 416;
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1070, %r1071}, {%r1290, %r1291, %r1292, %r1293}, {%r851, %r852}, {%r1070, %r1071};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1080, %r1081}, {%r1290, %r1291, %r1292, %r1293}, {%r853, %r854}, {%r1080, %r1081};
// end inline asm
add.s32 %r1344, %r1544, %r1556;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1340, %r1341, %r1342, %r1343}, [%r1344];
// end inline asm
xor.b32 %r1557, %r1477, 448;
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1070, %r1071}, {%r1315, %r1316, %r1317, %r1318}, {%r856, %r857}, {%r1070, %r1071};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1080, %r1081}, {%r1315, %r1316, %r1317, %r1318}, {%r858, %r859}, {%r1080, %r1081};
// end inline asm
add.s32 %r1369, %r1544, %r1557;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1365, %r1366, %r1367, %r1368}, [%r1369];
// end inline asm
xor.b32 %r1558, %r1477, 480;
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1070, %r1071}, {%r1340, %r1341, %r1342, %r1343}, {%r861, %r862}, {%r1070, %r1071};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1080, %r1081}, {%r1340, %r1341, %r1342, %r1343}, {%r863, %r864}, {%r1080, %r1081};
// end inline asm
add.s32 %r1394, %r1544, %r1558;
// begin inline asm
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1390, %r1391, %r1392, %r1393}, [%r1394];
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1070, %r1071}, {%r1365, %r1366, %r1367, %r1368}, {%r866, %r867}, {%r1070, %r1071};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1080, %r1081}, {%r1365, %r1366, %r1367, %r1368}, {%r868, %r869}, {%r1080, %r1081};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1070, %r1071}, {%r1390, %r1391, %r1392, %r1393}, {%r871, %r872}, {%r1070, %r1071};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1080, %r1081}, {%r1390, %r1391, %r1392, %r1393}, {%r873, %r874}, {%r1080, %r1081};
// end inline asm
// begin inline asm
mul.f16x2 %r1435, %r1070, %r1437;
// end inline asm
mov.u32 %r1458, 2080340991;
// begin inline asm
min.xorsign.abs.f16x2 %r1438, %r1435, %r1458;
// end inline asm
// begin inline asm
mul.f16x2 %r1441, %r1071, %r1437;
// end inline asm
// begin inline asm
min.xorsign.abs.f16x2 %r1444, %r1441, %r1458;
// end inline asm
// begin inline asm
mul.f16x2 %r1447, %r1080, %r1437;
// end inline asm
// begin inline asm
min.xorsign.abs.f16x2 %r1450, %r1447, %r1458;
// end inline asm
// begin inline asm
mul.f16x2 %r1453, %r1081, %r1437;
// end inline asm
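// [annotation] The interleaved ldmatrix/mma.sync chain above computes the
// first GEMM of the attention block (plausibly Q * K^T), accumulating a
// 16x16 tile per warp over a 256-deep inner dimension with m16n8k16
// tensor-core ops. The result is then scaled: mul.f16x2 multiplies by a
// packed f16x2 kernel parameter (loaded from param+64, presumably the
// softmax scale), and min.xorsign.abs.f16x2 against 0x7BFF7BFF (+/-65504,
// the largest finite f16) clamps the magnitude so the scaled logits cannot
// overflow to infinity. Sketch with placeholder names:
//
//   mul.f16x2 %scaled, %acc, %scale2;
//   mov.u32 %lim, 2080340991;              // 0x7BFF7BFF = +/-65504
//   min.xorsign.abs.f16x2 %clamped, %scaled, %lim;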
// begin inline asm
min.xorsign.abs.f16x2 %r1456, %r1453, %r1458;
// end inline asm
// begin inline asm
mov.b32 {%rs1, %rs2}, %r1438;
// end inline asm
// begin inline asm
cvt.f32.f16 %f64, %rs1;
// end inline asm
// begin inline asm
cvt.f32.f16 %f65, %rs2;
// end inline asm
// begin inline asm
mov.b32 {%rs5, %rs6}, %r1444;
// end inline asm
// begin inline asm
cvt.f32.f16 %f66, %rs5;
// end inline asm
// begin inline asm
cvt.f32.f16 %f67, %rs6;
// end inline asm
// begin inline asm
mov.b32 {%rs9, %rs10}, %r1450;
// end inline asm
// begin inline asm
cvt.f32.f16 %f68, %rs9;
// end inline asm
// begin inline asm
cvt.f32.f16 %f69, %rs10;
// end inline asm
// begin inline asm
mov.b32 {%rs13, %rs14}, %r1456;
// end inline asm
// begin inline asm
cvt.f32.f16 %f70, %rs13;
// end inline asm
// begin inline asm
cvt.f32.f16 %f71, %rs14;
// end inline asm
shl.b32 %r1559, %r2463, 4;
add.s32 %r1560, %r2460, %r1559;
setp.lt.s32 %p113, %r1560, %r2449;
selp.f32 %f1, %f64, 0fFF800000, %p113;
add.s32 %r1561, %r1560, 1;
setp.lt.s32 %p114, %r1561, %r2449;
selp.f32 %f2, %f65, 0fFF800000, %p114;
add.s32 %r1562, %r1560, 8;
setp.lt.s32 %p115, %r1562, %r2449;
selp.f32 %f3, %f68, 0fFF800000, %p115;
add.s32 %r1563, %r1560, 9;
setp.lt.s32 %p116, %r1563, %r2449;
selp.f32 %f4, %f69, 0fFF800000, %p116;
selp.f32 %f5, %f66, 0fFF800000, %p113;
selp.f32 %f6, %f67, 0fFF800000, %p114;
selp.f32 %f7, %f70, 0fFF800000, %p115;
selp.f32 %f8, %f71, 0fFF800000, %p116;
bar.sync 0;
setp.gt.ftz.f32 %p117, %f1, %f2;
selp.f32 %f72, %f1, %f2, %p117;
setp.gt.ftz.f32 %p118, %f72, %f3;
selp.f32 %f73, %f72, %f3, %p118;
setp.gt.ftz.f32 %p119, %f73, %f4;
selp.f32 %f74, %f73, %f4, %p119;
setp.gt.ftz.f32 %p120, %f5, %f6;
selp.f32 %f75, %f5, %f6, %p120;
setp.gt.ftz.f32 %p121, %f75, %f7;
selp.f32 %f76, %f75, %f7, %p121;
setp.gt.ftz.f32 %p122, %f76, %f8;
selp.f32 %f77, %f76, %f8, %p122;
mov.b32 %r1564, %f74;
mov.u32 %r1565, 31;
mov.u32 %r1566, -1;
shfl.sync.bfly.b32 %r1567|%p123, %r1564, %r1481, %r1565, %r1566;
mov.b32 %f78, %r1567;
setp.gt.ftz.f32 %p124, %f74, %f78;
selp.f32 %f9, %f74, %f78, %p124;
mov.b32 %r1568, %f9;
shfl.sync.bfly.b32 %r345|%p17, %r1568, %r1536, %r1565, %r1566;
mov.b32 %r1569, %f77;
shfl.sync.bfly.b32 %r1570|%p125, %r1569, %r1481, %r1565, %r1566;
mov.b32 %f79, %r1570;
setp.gt.ftz.f32 %p126, %f77, %f79;
selp.f32 %f10, %f77, %f79, %p126;
mov.b32 %r1571, %f10;
shfl.sync.bfly.b32 %r346|%p18, %r1571, %r1536, %r1565, %r1566;
and.b32 %r347, %r2473, 3;
setp.ne.s32 %p127, %r347, 0;
@%p127 bra $L__BB0_67;
mov.b32 %f80, %r345;
mov.b32 %f81, %r346;
setp.gt.ftz.f32 %p128, %f9, %f80;
selp.f32 %f82, %f9, %f80, %p128;
st.shared.f32 [%r344], %f82;
setp.gt.ftz.f32 %p129, %f10, %f81;
selp.f32 %f83, %f10, %f81, %p129;
st.shared.f32 [%r344+128], %f83;
$L__BB0_67:
mov.u32 %r2385, _ZN25fused_multihead_attention5smem_E;
mov.u32 %r2384, %tid.x;
bar.sync 0;
setp.gt.s32 %p130, %r2384, 15;
add.s32 %r348, %r2385, %r1473;
@%p130 bra $L__BB0_69;
ld.shared.v4.f32 {%f154, %f163, %f156, %f165}, [%r348+16384];
ld.shared.v4.f32 {%f158, %f167, %f160, %f169}, [%r348+16640];
$L__BB0_69:
mov.u32 %r2387, _ZN25fused_multihead_attention5smem_E;
mov.u32 %r2386, %tid.x;
setp.gt.ftz.f32 %p132, %f154, %f163;
selp.f32 %f93, %f154, %f163, %p132;
setp.gt.ftz.f32 %p133, %f156, %f165;
selp.f32 %f164, %f156, %f165, %p133;
setp.gt.ftz.f32 %p134, %f158, %f167;
selp.f32 %f94, %f158, %f167, %p134;
setp.gt.ftz.f32 %p135, %f160, %f169;
selp.f32 %f168, %f160, %f169, %p135;
setp.gt.ftz.f32 %p136, %f93, %f164;
selp.f32 %f95, %f93, %f164, %p136;
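// [annotation] Above, selp.f32 replaces logits whose column index is past
// the valid sequence length (%r2449) with 0fFF800000 (-inf), so they vanish
// after exponentiation, and the running row maximum is combined across lanes
// with butterfly shuffles (xor 1 via %r1481, then xor 2 via %r1536) over the
// 4-lane quad that owns a row; values move between %f and %r views with
// mov.b32. Sketch of one butterfly max step, placeholder names:
//
//   shfl.sync.bfly.b32 %peer|%p, %val_b, 1, 31, %membermask;
//   mov.b32 %peer_f, %peer;
//   setp.gt.ftz.f32 %q, %val_f, %peer_f;
//   selp.f32 %val_f, %val_f, %peer_f, %q;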
setp.gt.ftz.f32 %p137, %f94, %f168;
selp.f32 %f166, %f94, %f168, %p137;
setp.gt.ftz.f32 %p138, %f95, %f166;
selp.f32 %f162, %f95, %f166, %p138;
bar.sync 0;
shl.b32 %r1576, %r2386, 2;
add.s32 %r349, %r2387, %r1576;
@%p130 bra $L__BB0_71;
st.shared.f32 [%r349+16384], %f162;
$L__BB0_71:
mov.u32 %r2390, %tid.x;
and.b32 %r2389, %r2390, 3;
setp.ne.s32 %p153, %r2389, 0;
mov.u32 %r2388, _ZN25fused_multihead_attention5smem_E;
shl.b32 %r1578, %r343, 2;
add.s32 %r1580, %r2388, %r1578;
mov.u32 %r1581, 2;
bar.sync 0;
ld.shared.f32 %f96, [%r1580+16384];
ld.shared.f32 %f97, [%r1580+16416];
bar.sync 0;
bar.sync 0;
sub.ftz.f32 %f98, %f1, %f96;
mul.ftz.f32 %f99, %f98, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f31, %f99;
sub.ftz.f32 %f100, %f2, %f96;
mul.ftz.f32 %f101, %f100, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f32, %f101;
sub.ftz.f32 %f102, %f3, %f96;
mul.ftz.f32 %f103, %f102, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f33, %f103;
sub.ftz.f32 %f104, %f4, %f96;
mul.ftz.f32 %f105, %f104, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f34, %f105;
sub.ftz.f32 %f106, %f5, %f97;
mul.ftz.f32 %f107, %f106, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f35, %f107;
sub.ftz.f32 %f108, %f6, %f97;
mul.ftz.f32 %f109, %f108, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f36, %f109;
sub.ftz.f32 %f110, %f7, %f97;
mul.ftz.f32 %f111, %f110, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f37, %f111;
sub.ftz.f32 %f112, %f8, %f97;
mul.ftz.f32 %f113, %f112, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f38, %f113;
add.ftz.f32 %f114, %f31, 0f00000000;
add.ftz.f32 %f115, %f114, %f32;
add.ftz.f32 %f116, %f33, 0f00000000;
add.ftz.f32 %f117, %f116, %f34;
add.ftz.f32 %f118, %f117, %f115;
add.ftz.f32 %f119, %f35, 0f00000000;
add.ftz.f32 %f120, %f119, %f36;
add.ftz.f32 %f121, %f37, 0f00000000;
add.ftz.f32 %f122, %f121, %f38;
add.ftz.f32 %f123, %f122, %f120;
mov.b32 %r1582, %f118;
mov.u32 %r1583, 31;
mov.u32 %r1584, 1;
mov.u32 %r1585, -1;
shfl.sync.bfly.b32 %r1586|%p140, %r1582, %r1584, %r1583, %r1585;
mov.b32 %f124, %r1586;
add.ftz.f32 %f125, %f118, %f124;
mov.b32 %r1587, %f125;
shfl.sync.bfly.b32 %r1588|%p141, %r1587, %r1581, %r1583, %r1585;
mov.b32 %f126, %r1588;
add.ftz.f32 %f39, %f125, %f126;
mov.b32 %r1589, %f123;
shfl.sync.bfly.b32 %r1590|%p142, %r1589, %r1584, %r1583, %r1585;
mov.b32 %f127, %r1590;
add.ftz.f32 %f128, %f123, %f127;
mov.b32 %r1591, %f128;
shfl.sync.bfly.b32 %r1592|%p143, %r1591, %r1581, %r1583, %r1585;
mov.b32 %f129, %r1592;
add.ftz.f32 %f40, %f128, %f129;
@%p153 bra $L__BB0_73;
st.shared.f32 [%r344], %f39;
st.shared.f32 [%r344+128], %f40;
$L__BB0_73:
bar.sync 0;
@%p130 bra $L__BB0_75;
ld.shared.v4.f32 {%f162, %f163, %f164, %f165}, [%r348+16384];
ld.shared.v4.f32 {%f166, %f167, %f168, %f169}, [%r348+16640];
$L__BB0_75:
add.ftz.f32 %f138, %f164, %f165;
add.ftz.f32 %f139, %f162, %f163;
add.ftz.f32 %f140, %f139, %f138;
add.ftz.f32 %f141, %f168, %f169;
add.ftz.f32 %f142, %f166, %f167;
add.ftz.f32 %f143, %f142, %f141;
add.ftz.f32 %f57, %f140, %f143;
bar.sync 0;
@%p130 bra $L__BB0_77;
st.shared.f32 [%r349+16384], %f57;
$L__BB0_77:
bar.sync 0;
add.s32 %r2382, %r1580, 16384;
ld.shared.f32 %f58, [%r2382];
add.s32 %r2383, %r1580, 16384;
ld.shared.f32 %f59, [%r2383+32];
bar.sync 0;
setp.equ.ftz.f32 %p146, %f58, 0f00000000;
mov.f32 %f171, 0f3F800000;
mov.f32 %f170, %f171;
@%p146 bra $L__BB0_79;
rcp.approx.ftz.f32 %f170, %f58;
$L__BB0_79:
setp.equ.ftz.f32 %p147, %f59, 0f00000000;
@%p147 bra $L__BB0_81;
rcp.approx.ftz.f32 %f171, %f59;
$L__BB0_81:
mov.u32 %r2415, %tid.x;
shr.s32 %r2414, %r2415, 31;
shr.u32 %r2413, %r2414, 27;
add.s32 %r2412, %r2415, %r2413;
and.b32 %r2411, %r2412, -32;
sub.s32 %r2410, %r2415, %r2411;
shr.s32 %r2409, %r2412, 5;
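// [annotation] The softmax exponentials above use the identity
// exp(x - max) = 2^((x - max) * log2(e)): the constant 0f3FB8AA3B is
// log2(e) ~= 1.4426951, and ex2.approx.ftz.f32 supplies the power of two.
// Row sums are then combined with the same butterfly shuffles, reduced
// across warps through shared memory, and inverted with rcp.approx.ftz.f32
// (guarded against a zero sum, which falls back to 1.0). Sketch with
// placeholder names:
//
//   sub.ftz.f32 %t, %x, %rowmax;
//   mul.ftz.f32 %t, %t, 0f3FB8AA3B;   // (x - rowmax) * log2(e)
//   ex2.approx.ftz.f32 %e, %t;        // 2^t == exp(x - rowmax)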
shl.b32 %r2408, %r2410, 4;
cvt.s64.s32 %rd122, %r2408;
mov.b64 %rd121, fmha_mhca_fp16_128_256_sm86_kernel_nl_param_0;
mov.u64 %rd120, %rd121;
ld.param.u32 %r2407, [%rd120+60];
shr.s32 %r2406, %r2412, 31;
shr.u32 %r2405, %r2406, 29;
add.s32 %r2404, %r2409, %r2405;
and.b32 %r2403, %r2404, 268435448;
sub.s32 %r2402, %r2409, %r2403;
xor.b32 %r2401, %r2402, %r2410;
shl.b32 %r2400, %r2401, 4;
mov.u32 %r2399, _ZN25fused_multihead_attention5smem_E;
add.s32 %r2398, %r2399, 16384;
ld.param.u32 %r2397, [fmha_mhca_fp16_128_256_sm86_kernel_nl_param_0+200];
mov.u32 %r2396, %ctaid.y;
ld.param.u32 %r2395, [fmha_mhca_fp16_128_256_sm86_kernel_nl_param_0+200];
mov.u32 %r2394, %ctaid.x;
ld.param.u32 %r2393, [fmha_mhca_fp16_128_256_sm86_kernel_nl_param_0+192];
mul.lo.s32 %r2392, %r2395, %r2396;
mad.lo.s32 %r2391, %r2392, %r2393, %r2394;
mul.ftz.f32 %f146, %f170, %f32;
mul.ftz.f32 %f147, %f170, %f31;
// begin inline asm
cvt.rn.f16x2.f32 %r1595, %f146, %f147;
// end inline asm
mul.ftz.f32 %f148, %f171, %f36;
mul.ftz.f32 %f149, %f171, %f35;
// begin inline asm
cvt.rn.f16x2.f32 %r1596, %f148, %f149;
// end inline asm
mul.ftz.f32 %f150, %f170, %f34;
mul.ftz.f32 %f151, %f170, %f33;
// begin inline asm
cvt.rn.f16x2.f32 %r1597, %f150, %f151;
// end inline asm
mul.ftz.f32 %f152, %f171, %f38;
mul.ftz.f32 %f153, %f171, %f37;
// begin inline asm
cvt.rn.f16x2.f32 %r1598, %f152, %f153;
// end inline asm
// begin inline asm
mov.u32 %r1663, 0;
// end inline asm
// begin inline asm
mov.u32 %r1664, 0;
// end inline asm
// begin inline asm
mov.u32 %r1673, 0;
// end inline asm
// begin inline asm
mov.u32 %r1674, 0;
// end inline asm
// begin inline asm
mov.u32 %r1683, 0;
// end inline asm
// begin inline asm
mov.u32 %r1684, 0;
// end inline asm
// begin inline asm
mov.u32 %r1693, 0;
// end inline asm
// begin inline asm
mov.u32 %r1694, 0;
// end inline asm
// begin inline asm
mov.u32 %r1703, 0;
// end inline asm
// begin inline asm
mov.u32 %r1704, 0;
// end inline asm
// begin inline asm
mov.u32 %r1713, 0;
// end inline asm
// begin inline asm
mov.u32 %r1714, 0;
// end inline asm
// begin inline asm
mov.u32 %r1723, 0;
// end inline asm
// begin inline asm
mov.u32 %r1724, 0;
// end inline asm
// begin inline asm
mov.u32 %r1733, 0;
// end inline asm
// begin inline asm
mov.u32 %r1734, 0;
// end inline asm
// begin inline asm
mov.u32 %r1743, 0;
// end inline asm
// begin inline asm
mov.u32 %r1744, 0;
// end inline asm
// begin inline asm
mov.u32 %r1753, 0;
// end inline asm
// begin inline asm
mov.u32 %r1754, 0;
// end inline asm
// begin inline asm
mov.u32 %r1763, 0;
// end inline asm
// begin inline asm
mov.u32 %r1764, 0;
// end inline asm
// begin inline asm
mov.u32 %r1773, 0;
// end inline asm
// begin inline asm
mov.u32 %r1774, 0;
// end inline asm
// begin inline asm
mov.u32 %r1783, 0;
// end inline asm
// begin inline asm
mov.u32 %r1784, 0;
// end inline asm
// begin inline asm
mov.u32 %r1793, 0;
// end inline asm
// begin inline asm
mov.u32 %r1794, 0;
// end inline asm
// begin inline asm
mov.u32 %r1803, 0;
// end inline asm
// begin inline asm
mov.u32 %r1804, 0;
// end inline asm
// begin inline asm
mov.u32 %r1813, 0;
// end inline asm
// begin inline asm
mov.u32 %r1814, 0;
// end inline asm
// begin inline asm
mov.u32 %r1823, 0;
// end inline asm
// begin inline asm
mov.u32 %r1824, 0;
// end inline asm
// begin inline asm
mov.u32 %r1833, 0;
// end inline asm
// begin inline asm
mov.u32 %r1834, 0;
// end inline asm
// begin inline asm
mov.u32 %r1843, 0;
// end inline asm
// begin inline asm
mov.u32 %r1844, 0;
// end inline asm
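// [annotation] cvt.rn.f16x2.f32 above packs each pair of normalized f32
// probabilities back into one f16x2 register (%r1595..%r1598), forming the A
// fragment for the second tensor-core GEMM, and the surrounding run of
// mov.u32 ..., 0 trios zeroes the 64 accumulator halves (32 f16x2 pairs)
// that GEMM will update. Sketch of the packing step, placeholder names:
//
//   mul.ftz.f32 %p0, %inv_sum, %e0;
//   mul.ftz.f32 %p1, %inv_sum, %e1;
//   cvt.rn.f16x2.f32 %packed, %p0, %p1;   // hi half = %p0, lo half = %p1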
// begin inline asm
mov.u32 %r1853, 0;
// end inline asm
// begin inline asm
mov.u32 %r1854, 0;
// end inline asm
// begin inline asm
mov.u32 %r1863, 0;
// end inline asm
// begin inline asm
mov.u32 %r1864, 0;
// end inline asm
// begin inline asm
mov.u32 %r1873, 0;
// end inline asm
// begin inline asm
mov.u32 %r1874, 0;
// end inline asm
// begin inline asm
mov.u32 %r1883, 0;
// end inline asm
// begin inline asm
mov.u32 %r1884, 0;
// end inline asm
// begin inline asm
mov.u32 %r1893, 0;
// end inline asm
// begin inline asm
mov.u32 %r1894, 0;
// end inline asm
// begin inline asm
mov.u32 %r1903, 0;
// end inline asm
// begin inline asm
mov.u32 %r1904, 0;
// end inline asm
// begin inline asm
mov.u32 %r1913, 0;
// end inline asm
// begin inline asm
mov.u32 %r1914, 0;
// end inline asm
// begin inline asm
mov.u32 %r1923, 0;
// end inline asm
// begin inline asm
mov.u32 %r1924, 0;
// end inline asm
// begin inline asm
mov.u32 %r1933, 0;
// end inline asm
// begin inline asm
mov.u32 %r1934, 0;
// end inline asm
// begin inline asm
mov.u32 %r1943, 0;
// end inline asm
// begin inline asm
mov.u32 %r1944, 0;
// end inline asm
// begin inline asm
mov.u32 %r1953, 0;
// end inline asm
// begin inline asm
mov.u32 %r1954, 0;
// end inline asm
// begin inline asm
mov.u32 %r1963, 0;
// end inline asm
// begin inline asm
mov.u32 %r1964, 0;
// end inline asm
// begin inline asm
mov.u32 %r1973, 0;
// end inline asm
// begin inline asm
mov.u32 %r1974, 0;
// end inline asm
// begin inline asm
cp.async.commit_group;
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1663, %r1664}, {%r1595, %r1596, %r1597, %r1598}, {%r956, %r957}, {%r1663, %r1664};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1673, %r1674}, {%r1595, %r1596, %r1597, %r1598}, {%r958, %r959}, {%r1673, %r1674};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1683, %r1684}, {%r1595, %r1596, %r1597, %r1598}, {%r961, %r962}, {%r1683, %r1684};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1693, %r1694}, {%r1595, %r1596, %r1597, %r1598}, {%r963, %r964}, {%r1693, %r1694};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1703, %r1704}, {%r1595, %r1596, %r1597, %r1598}, {%r966, %r967}, {%r1703, %r1704};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1713, %r1714}, {%r1595, %r1596, %r1597, %r1598}, {%r968, %r969}, {%r1713, %r1714};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1723, %r1724}, {%r1595, %r1596, %r1597, %r1598}, {%r971, %r972}, {%r1723, %r1724};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1733, %r1734}, {%r1595, %r1596, %r1597, %r1598}, {%r973, %r974}, {%r1733, %r1734};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1743, %r1744}, {%r1595, %r1596, %r1597, %r1598}, {%r976, %r977}, {%r1743, %r1744};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1753, %r1754}, {%r1595, %r1596, %r1597, %r1598}, {%r978, %r979}, {%r1753, %r1754};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1763, %r1764}, {%r1595, %r1596, %r1597, %r1598}, {%r981, %r982}, {%r1763, %r1764};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1773, %r1774}, {%r1595, %r1596, %r1597, %r1598}, {%r983, %r984}, {%r1773, %r1774};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1783, %r1784}, {%r1595, %r1596, %r1597, %r1598}, {%r986, %r987}, {%r1783, %r1784};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1793, %r1794}, {%r1595, %r1596, %r1597, %r1598}, {%r988, %r989}, {%r1793, %r1794};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1803, %r1804}, {%r1595, %r1596, %r1597, %r1598}, {%r991, %r992}, {%r1803, %r1804};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1813, %r1814}, {%r1595, %r1596, %r1597, %r1598}, {%r993, %r994}, {%r1813, %r1814};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1823, %r1824}, {%r1595, %r1596, %r1597, %r1598}, {%r996, %r997}, {%r1823, %r1824};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1833, %r1834}, {%r1595, %r1596, %r1597, %r1598}, {%r998, %r999}, {%r1833, %r1834};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1843, %r1844}, {%r1595, %r1596, %r1597, %r1598}, {%r1001, %r1002}, {%r1843, %r1844};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1853, %r1854}, {%r1595, %r1596, %r1597, %r1598}, {%r1003, %r1004}, {%r1853, %r1854};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1863, %r1864}, {%r1595, %r1596, %r1597, %r1598}, {%r1006, %r1007}, {%r1863, %r1864};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1873, %r1874}, {%r1595, %r1596, %r1597, %r1598}, {%r1008, %r1009}, {%r1873, %r1874};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1883, %r1884}, {%r1595, %r1596, %r1597, %r1598}, {%r1011, %r1012}, {%r1883, %r1884};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1893, %r1894}, {%r1595, %r1596, %r1597, %r1598}, {%r1013, %r1014}, {%r1893, %r1894};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1903, %r1904}, {%r1595, %r1596, %r1597, %r1598}, {%r1016, %r1017}, {%r1903, %r1904};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1913, %r1914}, {%r1595, %r1596, %r1597, %r1598}, {%r1018, %r1019}, {%r1913, %r1914};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1923, %r1924}, {%r1595, %r1596, %r1597, %r1598}, {%r1021, %r1022}, {%r1923, %r1924};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1933, %r1934}, {%r1595, %r1596, %r1597, %r1598}, {%r1023, %r1024}, {%r1933, %r1934};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1943, %r1944}, {%r1595, %r1596, %r1597, %r1598}, {%r1026, %r1027}, {%r1943, %r1944};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1953, %r1954}, {%r1595, %r1596, %r1597, %r1598}, {%r1028, %r1029}, {%r1953, %r1954};
// end inline asm
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1963, %r1964}, {%r1595, %r1596, %r1597, %r1598}, {%r1031, %r1032}, {%r1963, %r1964};
// end inline asm
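// [annotation] This mma.sync.m16n8k16 chain (completed just below) runs 32
// ops that all share one A fragment -- the packed probabilities in
// %r1595..%r1598 -- against the 32 B pairs that ldmatrix.trans staged
// earlier, accumulating the second GEMM of the block (plausibly P * V)
// entirely in f16. Each op contributes one 16x8 slice of a 16x256 output
// row strip.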
// begin inline asm
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r1973, %r1974}, {%r1595, %r1596, %r1597, %r1598}, {%r1033, %r1034}, {%r1973, %r1974};
// end inline asm
shl.b32 %r2360, %r2415, 10;
and.b32 %r2361, %r2360, 28672;
add.s32 %r2364, %r2361, %r2398;
and.b32 %r2366, %r1576, 896;
and.b32 %r2367, %r2415, 31;
or.b32 %r2368, %r2366, %r2367;
shl.b32 %r2369, %r2368, 2;
add.s32 %r1983, %r2364, %r2369;
// begin inline asm
st.shared.b32 [%r1983], %r1663;
// end inline asm
add.s32 %r1985, %r1983, 32768;
// begin inline asm
st.shared.b32 [%r1985], %r1664;
// end inline asm
xor.b32 %r1987, %r1983, 16;
// begin inline asm
st.shared.b32 [%r1987], %r1673;
// end inline asm
add.s32 %r1989, %r1987, 32768;
// begin inline asm
st.shared.b32 [%r1989], %r1674;
// end inline asm
xor.b32 %r1991, %r1983, 32;
// begin inline asm
st.shared.b32 [%r1991], %r1683;
// end inline asm
add.s32 %r1993, %r1991, 32768;
// begin inline asm
st.shared.b32 [%r1993], %r1684;
// end inline asm
xor.b32 %r1995, %r1983, 48;
// begin inline asm
st.shared.b32 [%r1995], %r1693;
// end inline asm
add.s32 %r1997, %r1995, 32768;
// begin inline asm
st.shared.b32 [%r1997], %r1694;
// end inline asm
xor.b32 %r1999, %r1983, 64;
// begin inline asm
st.shared.b32 [%r1999], %r1703;
// end inline asm
add.s32 %r2001, %r1999, 32768;
// begin inline asm
st.shared.b32 [%r2001], %r1704;
// end inline asm
xor.b32 %r2003, %r1983, 80;
// begin inline asm
st.shared.b32 [%r2003], %r1713;
// end inline asm
add.s32 %r2005, %r2003, 32768;
// begin inline asm
st.shared.b32 [%r2005], %r1714;
// end inline asm
xor.b32 %r2007, %r1983, 96;
// begin inline asm
st.shared.b32 [%r2007], %r1723;
// end inline asm
add.s32 %r2009, %r2007, 32768;
// begin inline asm
st.shared.b32 [%r2009], %r1724;
// end inline asm
xor.b32 %r2011, %r1983, 112;
// begin inline asm
st.shared.b32 [%r2011], %r1733;
// end inline asm
add.s32 %r2013, %r2011, 32768;
// begin inline asm
st.shared.b32 [%r2013], %r1734;
// end inline asm
xor.b32 %r2015, %r1983, 128;
// begin inline asm
st.shared.b32 [%r2015], %r1743;
// end inline asm
add.s32 %r2017, %r2015, 32768;
// begin inline asm
st.shared.b32 [%r2017], %r1744;
// end inline asm
xor.b32 %r2019, %r1983, 144;
// begin inline asm
st.shared.b32 [%r2019], %r1753;
// end inline asm
add.s32 %r2021, %r2019, 32768;
// begin inline asm
st.shared.b32 [%r2021], %r1754;
// end inline asm
xor.b32 %r2023, %r1983, 160;
// begin inline asm
st.shared.b32 [%r2023], %r1763;
// end inline asm
add.s32 %r2025, %r2023, 32768;
// begin inline asm
st.shared.b32 [%r2025], %r1764;
// end inline asm
xor.b32 %r2027, %r1983, 176;
// begin inline asm
st.shared.b32 [%r2027], %r1773;
// end inline asm
add.s32 %r2029, %r2027, 32768;
// begin inline asm
st.shared.b32 [%r2029], %r1774;
// end inline asm
xor.b32 %r2031, %r1983, 192;
// begin inline asm
st.shared.b32 [%r2031], %r1783;
// end inline asm
add.s32 %r2033, %r2031, 32768;
// begin inline asm
st.shared.b32 [%r2033], %r1784;
// end inline asm
xor.b32 %r2035, %r1983, 208;
// begin inline asm
st.shared.b32 [%r2035], %r1793;
// end inline asm
add.s32 %r2037, %r2035, 32768;
// begin inline asm
st.shared.b32 [%r2037], %r1794;
// end inline asm
xor.b32 %r2039, %r1983, 224;
// begin inline asm
st.shared.b32 [%r2039], %r1803;
// end inline asm
add.s32 %r2041, %r2039, 32768;
// begin inline asm
st.shared.b32 [%r2041], %r1804;
// end inline asm
xor.b32 %r2043, %r1983, 240;
// begin inline asm
st.shared.b32 [%r2043], %r1813;
// end inline asm
add.s32 %r2045, %r2043, 32768;
// begin inline asm
st.shared.b32 [%r2045], %r1814;
// end inline asm
xor.b32 %r2047, %r1983, 256;
// begin inline asm
st.shared.b32 [%r2047], %r1823;
// end inline asm
add.s32 %r2049, %r2047, 32768;
// begin inline asm
st.shared.b32 [%r2049], %r1824;
// end inline asm
xor.b32 %r2051, %r1983, 272;
// begin inline asm
st.shared.b32 [%r2051], %r1833;
// end inline asm
add.s32 %r2053, %r2051, 32768;
// begin inline asm
st.shared.b32 [%r2053], %r1834;
// end inline asm
xor.b32 %r2055, %r1983, 288;
// begin inline asm
st.shared.b32 [%r2055], %r1843;
// end inline asm
add.s32 %r2057, %r2055, 32768;
// begin inline asm
st.shared.b32 [%r2057], %r1844;
// end inline asm
xor.b32 %r2059, %r1983, 304;
// begin inline asm
st.shared.b32 [%r2059], %r1853;
// end inline asm
add.s32 %r2061, %r2059, 32768;
// begin inline asm
st.shared.b32 [%r2061], %r1854;
// end inline asm
xor.b32 %r2063, %r1983, 320;
// begin inline asm
st.shared.b32 [%r2063], %r1863;
// end inline asm
add.s32 %r2065, %r2063, 32768;
// begin inline asm
st.shared.b32 [%r2065], %r1864;
// end inline asm
xor.b32 %r2067, %r1983, 336;
// begin inline asm
st.shared.b32 [%r2067], %r1873;
// end inline asm
add.s32 %r2069, %r2067, 32768;
// begin inline asm
st.shared.b32 [%r2069], %r1874;
// end inline asm
xor.b32 %r2071, %r1983, 352;
// begin inline asm
st.shared.b32 [%r2071], %r1883;
// end inline asm
add.s32 %r2073, %r2071, 32768;
// begin inline asm
st.shared.b32 [%r2073], %r1884;
// end inline asm
xor.b32 %r2075, %r1983, 368;
// begin inline asm
st.shared.b32 [%r2075], %r1893;
// end inline asm
add.s32 %r2077, %r2075, 32768;
// begin inline asm
st.shared.b32 [%r2077], %r1894;
// end inline asm
xor.b32 %r2079, %r1983, 384;
// begin inline asm
st.shared.b32 [%r2079], %r1903;
// end inline asm
add.s32 %r2081, %r2079, 32768;
// begin inline asm
st.shared.b32 [%r2081], %r1904;
// end inline asm
xor.b32 %r2083, %r1983, 400;
// begin inline asm
st.shared.b32 [%r2083], %r1913;
// end inline asm
add.s32 %r2085, %r2083, 32768;
// begin inline asm
st.shared.b32 [%r2085], %r1914;
// end inline asm
xor.b32 %r2087, %r1983, 416;
// begin inline asm
st.shared.b32 [%r2087], %r1923;
// end inline asm
add.s32 %r2089, %r2087, 32768;
// begin inline asm
st.shared.b32 [%r2089], %r1924;
// end inline asm
xor.b32 %r2091, %r1983, 432;
// begin inline asm
st.shared.b32 [%r2091], %r1933;
// end inline asm
add.s32 %r2093, %r2091, 32768;
// begin inline asm
st.shared.b32 [%r2093], %r1934;
// end inline asm
xor.b32 %r2095, %r1983, 448;
// begin inline asm
st.shared.b32 [%r2095], %r1943;
// end inline asm
add.s32 %r2097, %r2095, 32768;
// begin inline asm
st.shared.b32 [%r2097], %r1944;
// end inline asm
xor.b32 %r2099, %r1983, 464;
// begin inline asm
st.shared.b32 [%r2099], %r1953;
// end inline asm
add.s32 %r2101, %r2099, 32768;
// begin inline asm
st.shared.b32 [%r2101], %r1954;
// end inline asm
xor.b32 %r2103, %r1983, 480;
// begin inline asm
st.shared.b32 [%r2103], %r1963;
// end inline asm
add.s32 %r2105, %r2103, 32768;
// begin inline asm
st.shared.b32 [%r2105], %r1964;
// end inline asm
xor.b32 %r2107, %r1983, 496;
// begin inline asm
st.shared.b32 [%r2107], %r1973;
// end inline asm
add.s32 %r2109, %r2107, 32768;
// begin inline asm
st.shared.b32 [%r2109], %r1974;
// end inline asm
bar.sync 0;
shl.b32 %r2370, %r2409, 12;
add.s32 %r2371, %r2370, %r2398;
add.s32 %r2115, %r2371, %r2400;
// begin inline asm
ld.shared.v4.b32 {%r2111, %r2112, %r2113, %r2114}, [%r2115];
// end inline asm
add.s32 %r2120, %r2115, 512;
// begin inline asm
ld.shared.v4.b32 {%r2116, %r2117, %r2118, %r2119}, [%r2120];
// end inline asm
add.s32 %r2125, %r2115, 1024;
// begin inline asm
ld.shared.v4.b32 {%r2121, %r2122, %r2123, %r2124}, [%r2125];
// end inline asm
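// [annotation] The st.shared.b32 scatter above appears to park each
// accumulator pair in two shared-memory regions 32768 bytes apart (the two
// eight-row halves of each m16n8 accumulator tile), with XOR-stepped column
// offsets 16..496 undoing the fragment interleaving, so that the reduction
// beginning here can read the partial results back with plain vectorized
// loads.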
// --- reload and reduce the lower 32 KB half ---
// each thread reads eight 16-byte slices spaced 512 bytes apart
shl.b32 %r2370, %r2409, 12;
add.s32 %r2371, %r2370, %r2398;
add.s32 %r2115, %r2371, %r2400;
ld.shared.v4.b32 {%r2111, %r2112, %r2113, %r2114}, [%r2115];
add.s32 %r2120, %r2115, 512;
ld.shared.v4.b32 {%r2116, %r2117, %r2118, %r2119}, [%r2120];
add.s32 %r2125, %r2115, 1024;
ld.shared.v4.b32 {%r2121, %r2122, %r2123, %r2124}, [%r2125];
add.s32 %r2130, %r2115, 1536;
ld.shared.v4.b32 {%r2126, %r2127, %r2128, %r2129}, [%r2130];
add.s32 %r2135, %r2115, 2048;
ld.shared.v4.b32 {%r2131, %r2132, %r2133, %r2134}, [%r2135];
add.s32 %r2140, %r2115, 2560;
ld.shared.v4.b32 {%r2136, %r2137, %r2138, %r2139}, [%r2140];
add.s32 %r2145, %r2115, 3072;
ld.shared.v4.b32 {%r2141, %r2142, %r2143, %r2144}, [%r2145];
add.s32 %r2150, %r2115, 3584;
ld.shared.v4.b32 {%r2146, %r2147, %r2148, %r2149}, [%r2150];
// fold the eight slices together as packed f16x2;
// %r2223, %r2226, %r2229, %r2232 hold the final sums
add.f16x2 %r2151, %r2111, %r2116;
add.f16x2 %r2154, %r2112, %r2117;
add.f16x2 %r2157, %r2113, %r2118;
add.f16x2 %r2160, %r2114, %r2119;
add.f16x2 %r2163, %r2151, %r2121;
add.f16x2 %r2166, %r2154, %r2122;
add.f16x2 %r2169, %r2157, %r2123;
add.f16x2 %r2172, %r2160, %r2124;
add.f16x2 %r2175, %r2163, %r2126;
add.f16x2 %r2178, %r2166, %r2127;
add.f16x2 %r2181, %r2169, %r2128;
add.f16x2 %r2184, %r2172, %r2129;
add.f16x2 %r2187, %r2175, %r2131;
add.f16x2 %r2190, %r2178, %r2132;
add.f16x2 %r2193, %r2181, %r2133;
add.f16x2 %r2196, %r2184, %r2134;
add.f16x2 %r2199, %r2187, %r2136;
add.f16x2 %r2202, %r2190, %r2137;
add.f16x2 %r2205, %r2193, %r2138;
add.f16x2 %r2208, %r2196, %r2139;
add.f16x2 %r2211, %r2199, %r2141;
add.f16x2 %r2214, %r2202, %r2142;
add.f16x2 %r2217, %r2205, %r2143;
add.f16x2 %r2220, %r2208, %r2144;
add.f16x2 %r2223, %r2211, %r2146;
add.f16x2 %r2226, %r2214, %r2147;
add.f16x2 %r2229, %r2217, %r2148;
add.f16x2 %r2232, %r2220, %r2149;
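After the barrier, each thread reloads eight 16-byte slices, 512 bytes apart, and folds them together with packed-half adds (add.f16x2). A CUDA sketch of that reduction is below, reusing the assumed shared-memory layout of the previous sketch; the function name and the `base` parameter are illustrative, not the kernel's own.

#include <cuda_fp16.h>

// Hypothetical sketch of the f16x2 reduction above: sum eight 16-byte
// slices, spaced 512 bytes apart in shared memory, as packed __half2 lanes.
__device__ void reduce_slices_sketch(const __half2* smem, int base,
                                     __half2 out[4])
{
    const int kStride = 512 / sizeof(__half2);  // 512-byte slice stride

    // First slice initializes the four packed accumulators.
    out[0] = smem[base + 0];
    out[1] = smem[base + 1];
    out[2] = smem[base + 2];
    out[3] = smem[base + 3];

    // Fold in the remaining seven slices, mirroring the add.f16x2 chain.
    for (int s = 1; s < 8; ++s) {
        const __half2* slice = smem + base + s * kStride;
        out[0] = __hadd2(out[0], slice[0]);
        out[1] = __hadd2(out[1], slice[1]);
        out[2] = __hadd2(out[2], slice[2]);
        out[3] = __hadd2(out[3], slice[3]);
    }
}

Keeping the sums in packed __half2 form matches the PTX, which never unpacks to scalar halves during the reduction.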
// --- same reduction over the upper 32 KB half ---
// %r2347, %r2350, %r2353, %r2356 hold the final sums
add.s32 %r2239, %r2115, 32768;
ld.shared.v4.b32 {%r2235, %r2236, %r2237, %r2238}, [%r2239];
add.s32 %r2244, %r2115, 33280;
ld.shared.v4.b32 {%r2240, %r2241, %r2242, %r2243}, [%r2244];
add.s32 %r2249, %r2115, 33792;
ld.shared.v4.b32 {%r2245, %r2246, %r2247, %r2248}, [%r2249];
add.s32 %r2254, %r2115, 34304;
ld.shared.v4.b32 {%r2250, %r2251, %r2252, %r2253}, [%r2254];
add.s32 %r2259, %r2115, 34816;
ld.shared.v4.b32 {%r2255, %r2256, %r2257, %r2258}, [%r2259];
add.s32 %r2264, %r2115, 35328;
ld.shared.v4.b32 {%r2260, %r2261, %r2262, %r2263}, [%r2264];
add.s32 %r2269, %r2115, 35840;
ld.shared.v4.b32 {%r2265, %r2266, %r2267, %r2268}, [%r2269];
add.s32 %r2274, %r2115, 36352;
ld.shared.v4.b32 {%r2270, %r2271, %r2272, %r2273}, [%r2274];
add.f16x2 %r2275, %r2235, %r2240;
add.f16x2 %r2278, %r2236, %r2241;
add.f16x2 %r2281, %r2237, %r2242;
add.f16x2 %r2284, %r2238, %r2243;
add.f16x2 %r2287, %r2275, %r2245;
add.f16x2 %r2290, %r2278, %r2246;
add.f16x2 %r2293, %r2281, %r2247;
add.f16x2 %r2296, %r2284, %r2248;
add.f16x2 %r2299, %r2287, %r2250;
add.f16x2 %r2302, %r2290, %r2251;
add.f16x2 %r2305, %r2293, %r2252;
add.f16x2 %r2308, %r2296, %r2253;
add.f16x2 %r2311, %r2299, %r2255;
add.f16x2 %r2314, %r2302, %r2256;
add.f16x2 %r2317, %r2305, %r2257;
add.f16x2 %r2320, %r2308, %r2258;
add.f16x2 %r2323, %r2311, %r2260;
add.f16x2 %r2326, %r2314, %r2261;
add.f16x2 %r2329, %r2317, %r2262;
add.f16x2 %r2332, %r2320, %r2263;
add.f16x2 %r2335, %r2323, %r2265;
add.f16x2 %r2338, %r2326, %r2266;
add.f16x2 %r2341, %r2329, %r2267;
add.f16x2 %r2344, %r2332, %r2268;
add.f16x2 %r2347, %r2335, %r2270;
add.f16x2 %r2350, %r2338, %r2271;
add.f16x2 %r2353, %r2341, %r2272;
add.f16x2 %r2356, %r2344, %r2273;
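The epilogue that follows guards each of the two 16-byte global stores separately: the first covers the thread's row, the second the row eight steps further, and either is skipped when the row index or the lane exceeds the bounds read back from the kernel parameters. A CUDA sketch of that guarded, vectorized store is below; all parameter names and the shared `rows` bound are assumptions (the PTX actually reads two different bounds from the parameter block).

#include <cuda_fp16.h>

// Hypothetical sketch of the guarded epilogue: write two 16-byte rows of
// packed-half results, each behind its own bounds check, like the two
// st.global.v4.u32 ops below. Names and bound sources are assumptions.
__device__ void store_rows_sketch(__half2* out, long long row,
                                  long long row_stride_elems, long long col,
                                  int rows, int valid_lanes, int lane,
                                  const __half2 lo[4], const __half2 hi[4])
{
    if (lane >= valid_lanes) return;             // column guard (@%p149)

    if (row < rows) {                            // first row guard (@%p148)
        __half2* dst = out + row * row_stride_elems + col;
        *reinterpret_cast<uint4*>(dst) =
            *reinterpret_cast<const uint4*>(lo); // one st.global.v4.u32
    }
    if (row + 8 < rows) {                        // second row guard (@%p150)
        __half2* dst = out + (row + 8) * row_stride_elems + col;
        *reinterpret_cast<uint4*>(dst) =
            *reinterpret_cast<const uint4*>(hi);
    }
}

Predicating each row independently lets a partially valid tile still write the rows that are in range, which is what the branch structure below implements.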
// --- epilogue: write the two reduced 16-byte rows to global memory ---
mul.lo.s32 %r2376, %r2391, %r2407;
shl.b32 %r2377, %r2376, 1;          // x2: byte offset for fp16 elements
cvt.s64.s32 %rd109, %r2377;
add.s64 %rd39, %rd109, %rd122;
cvt.u32.u64 %r2379, %rd2;
setp.ge.s32 %p148, %r2379, %r2395;
@%p148 bra $L__BB0_86;              // row out of range: skip both stores
// recompute the per-thread column guard from the kernel parameters
mov.b64 %rd124, fmha_mhca_fp16_128_256_sm86_kernel_nl_param_0;
mov.u64 %rd123, %rd124;
ld.param.u32 %r2426, [%rd123+60];
mov.u32 %r2425, %tid.x;
shr.s32 %r2424, %r2426, 31;
shr.u32 %r2423, %r2424, 29;
add.s32 %r2422, %r2426, %r2423;
shr.s32 %r2421, %r2422, 3;          // signed divide by 8, rounding toward zero
shr.s32 %r2420, %r2425, 31;
shr.u32 %r2419, %r2420, 27;
add.s32 %r2418, %r2425, %r2419;
and.b32 %r2417, %r2418, -32;
sub.s32 %r2416, %r2425, %r2417;     // tid.x % 32 (lane id)
setp.ge.s32 %p149, %r2416, %r2421;
@%p149 bra $L__BB0_84;              // lane past the valid column range
// first store: lower-half sums for row %rd2
mov.b64 %rd136, fmha_mhca_fp16_128_256_sm86_kernel_nl_param_0;
mov.u64 %rd135, %rd136;
ld.param.u64 %rd134, [%rd135+8];    // output base pointer
mov.b64 %rd130, fmha_mhca_fp16_128_256_sm86_kernel_nl_param_0;
mov.u64 %rd129, %rd130;
ld.param.u64 %rd128, [%rd129+32];   // per-row stride, multiplied by the row index below
mul.lo.s64 %rd111, %rd128, %rd2;
add.s64 %rd112, %rd39, %rd111;
cvta.to.global.u64 %rd113, %rd134;
add.s64 %rd114, %rd113, %rd112;
st.global.v4.u32 [%rd114], {%r2223, %r2226, %r2229, %r2232};
$L__BB0_84:
// second store: upper-half sums for row %rd2 + 8, with its own bounds check
ld.param.u32 %r2427, [fmha_mhca_fp16_128_256_sm86_kernel_nl_param_0+200];
add.s32 %r2381, %r2379, 8;
setp.ge.s32 %p150, %r2381, %r2427;
or.pred %p152, %p150, %p149;
@%p152 bra $L__BB0_86;
mov.b64 %rd133, fmha_mhca_fp16_128_256_sm86_kernel_nl_param_0;
mov.u64 %rd132, %rd133;
ld.param.u64 %rd131, [%rd132+8];
mov.b64 %rd127, fmha_mhca_fp16_128_256_sm86_kernel_nl_param_0;
mov.u64 %rd126, %rd127;
ld.param.u64 %rd125, [%rd126+32];
add.s64 %rd115, %rd2, 8;
mul.lo.s64 %rd116, %rd115, %rd125;
add.s64 %rd117, %rd39, %rd116;
cvta.to.global.u64 %rd118, %rd131;
add.s64 %rd119, %rd118, %rd117;
st.global.v4.u32 [%rd119], {%r2347, %r2350, %r2353, %r2356};
$L__BB0_86:
ret;
}