// .globl fmha_mhca_fp16_128_128_sm86_kernel_nl .extern .shared .align 16 .b8 _ZN25fused_multihead_attention5smem_E[]; .visible .entry fmha_mhca_fp16_128_128_sm86_kernel_nl( .param .align 8 .b8 fmha_mhca_fp16_128_128_sm86_kernel_nl_param_0[240] ) { .reg .pred %p<205>; .reg .b16 %rs<129>; .reg .f32 %f<664>; .reg .b32 %r<4564>; .reg .b64 %rd<119>; mov.b64 %rd5, fmha_mhca_fp16_128_128_sm86_kernel_nl_param_0; mov.u64 %rd1, %rd5; ld.param.u32 %r1, [fmha_mhca_fp16_128_128_sm86_kernel_nl_param_0+200]; ld.param.u32 %r2, [fmha_mhca_fp16_128_128_sm86_kernel_nl_param_0+192]; mov.u32 %r126, %ctaid.z; shl.b32 %r3, %r126, 6; setp.le.s32 %p17, %r1, %r3; @%p17 bra $L__BB0_52; mov.u32 %r2063, %tid.x; mov.u32 %r2064, %ctaid.y; mov.u32 %r2065, %ctaid.x; mul.lo.s32 %r2066, %r1, %r2064; mad.lo.s32 %r2067, %r2066, %r2, %r2065; ld.param.u32 %r2068, [%rd1+232]; ld.param.u32 %r2069, [%rd1+224]; shr.s32 %r2070, %r2063, 31; shr.u32 %r2071, %r2070, 27; add.s32 %r2072, %r2063, %r2071; and.b32 %r2073, %r2072, -32; sub.s32 %r2074, %r2063, %r2073; ld.param.u32 %r2075, [%rd1+196]; shl.b32 %r2076, %r2075, 1; mov.u32 %r2077, 1; shr.s32 %r2078, %r2075, 31; shr.u32 %r2079, %r2078, 29; add.s32 %r2080, %r2075, %r2079; shr.s32 %r2081, %r2080, 3; shr.u32 %r2082, %r2070, 28; add.s32 %r2083, %r2063, %r2082; and.b32 %r2084, %r2083, -16; sub.s32 %r4, %r2063, %r2084; setp.lt.s32 %p18, %r4, %r2081; shr.s32 %r5, %r2083, 4; add.s32 %r2085, %r5, %r3; cvt.s64.s32 %rd46, %r2085; ld.param.u64 %rd47, [%rd1+184]; mul.lo.s64 %rd48, %rd47, %rd46; mul.wide.s32 %rd49, %r2076, %r2067; shl.b32 %r2086, %r4, 4; cvt.s64.s32 %rd50, %r2086; add.s64 %rd51, %rd49, %rd50; add.s64 %rd52, %rd51, %rd48; ld.param.u64 %rd53, [%rd1+176]; add.s64 %rd6, %rd53, %rd52; shr.s32 %r2087, %r2083, 31; shr.u32 %r2088, %r2087, 29; add.s32 %r2089, %r5, %r2088; and.b32 %r2090, %r2089, 268435448; sub.s32 %r2091, %r5, %r2090; xor.b32 %r2092, %r2091, %r4; shl.b32 %r6, %r2092, 4; mov.u32 %r2093, 31; mov.u32 %r2094, 0; mov.u32 %r2095, -1; shfl.sync.idx.b32 
%r2096|%p19, %r2094, %r2094, %r2093, %r2095; shfl.sync.idx.b32 %r2097|%p20, %r2094, %r2094, %r2093, %r2095; and.b32 %r2098, %r2063, 7; shl.b32 %r2099, %r2063, 4; and.b32 %r2100, %r2099, 112; and.b32 %r2101, %r2063, 16; xor.b32 %r2102, %r2100, %r2101; ld.param.u32 %r2103, [%rd1+228]; shl.b32 %r2104, %r2103, 1; shr.s32 %r2105, %r2103, 31; shr.u32 %r2106, %r2105, 29; add.s32 %r2107, %r2103, %r2106; shr.s32 %r2108, %r2107, 3; setp.lt.s32 %p21, %r4, %r2108; cvt.s64.s32 %rd54, %r5; ld.param.u64 %rd55, [%rd1+216]; mul.lo.s64 %rd56, %rd55, %rd54; mul.lo.s32 %r2109, %r2068, %r2064; mad.lo.s32 %r2110, %r2109, %r2069, %r2065; shl.b32 %r2111, %r2110, 1; mul.wide.s32 %rd57, %r2104, %r2111; add.s64 %rd58, %rd57, %rd50; add.s64 %rd59, %rd58, %rd56; ld.param.u64 %rd60, [%rd1+208]; add.s64 %rd14, %rd60, %rd59; shfl.sync.idx.b32 %r2112|%p22, %r2094, %r2094, %r2093, %r2095; shfl.sync.idx.b32 %r2113|%p23, %r2094, %r2094, %r2093, %r2095; and.b32 %r2114, %r2063, 96; shr.u32 %r2115, %r2114, 1; or.b32 %r2116, %r2115, %r2098; shr.u32 %r2117, %r2101, 1; or.b32 %r2118, %r2116, %r2117; and.b32 %r2119, %r2063, 8; shr.u32 %r2120, %r2119, 3; xor.b32 %r2121, %r2120, %r2098; or.b32 %r2122, %r2111, 1; mul.wide.s32 %rd61, %r2104, %r2122; add.s64 %rd62, %rd61, %rd50; add.s64 %rd63, %rd62, %rd56; add.s64 %rd30, %rd60, %rd63; shfl.sync.idx.b32 %r2123|%p24, %r2094, %r2094, %r2093, %r2095; shfl.sync.idx.b32 %r2124|%p25, %r2094, %r2094, %r2093, %r2095; and.b32 %r2125, %r2063, 224; shr.u32 %r2126, %r2125, 1; and.b32 %r2127, %r2063, 15; or.b32 %r2128, %r2126, %r2127; ld.param.u64 %rd2, [%rd1+32]; ld.param.u64 %rd3, [%rd1+8]; sub.s32 %r2129, %r1, %r3; min.s32 %r2130, %r2129, 64; shl.b32 %r2134, %r2128, 8; shl.b32 %r2135, %r2121, 4; shl.b32 %r2136, %r2118, 8; shl.b32 %r2137, %r2063, 8; and.b32 %r2138, %r2137, 3840; shl.b32 %r2139, %r5, 8; shr.s32 %r2140, %r2074, 31; shr.u32 %r2141, %r2140, 30; add.s32 %r2142, %r2074, %r2141; and.b32 %r2143, %r2142, 2147483644; sub.s32 %r2144, %r2074, %r2143; setp.lt.s32 %p26, 
%r5, %r2130; and.pred %p27, %p18, %p26; add.s32 %r2145, %r5, 8; setp.lt.s32 %p28, %r2145, %r2130; and.pred %p29, %p18, %p28; add.s32 %r2146, %r5, 16; setp.lt.s32 %p30, %r2146, %r2130; and.pred %p31, %p18, %p30; add.s32 %r2147, %r5, 24; setp.lt.s32 %p32, %r2147, %r2130; and.pred %p33, %p18, %p32; add.s32 %r2148, %r5, 32; setp.lt.s32 %p34, %r2148, %r2130; and.pred %p35, %p18, %p34; add.s32 %r2149, %r5, 40; setp.lt.s32 %p36, %r2149, %r2130; and.pred %p37, %p18, %p36; add.s32 %r2150, %r5, 48; setp.lt.s32 %p38, %r2150, %r2130; and.pred %p39, %p18, %p38; add.s32 %r2151, %r5, 56; setp.lt.s32 %p40, %r2151, %r2130; and.pred %p41, %p18, %p40; add.s32 %r2152, %r6, %r2139; or.b32 %r2153, %r2102, %r2138; or.b32 %r2154, %r2136, %r2135; or.b32 %r2155, %r2134, %r2102; shr.s32 %r2156, %r2072, 5; shl.b32 %r2157, %r2156, 4; shl.b32 %r2158, %r2144, 1; shl.b64 %rd64, %rd47, 3; selp.b32 %r138, 16, 0, %p37; mov.u32 %r2159, _ZN25fused_multihead_attention5smem_E; add.s32 %r2160, %r2152, %r2159; add.s32 %r127, %r2160, %r2097; add.s32 %r129, %r127, 2048; add.s32 %r131, %r127, 4096; add.s32 %r133, %r127, 6144; add.s32 %r135, %r127, 8192; add.s32 %r137, %r127, 10240; add.s32 %r139, %r127, 12288; add.s32 %r141, %r127, 14336; selp.b32 %r128, 16, 0, %p27; // begin inline asm cp.async.cg.shared.global [%r127], [%rd6], 16, %r128; // end inline asm selp.b32 %r130, 16, 0, %p29; add.s64 %rd7, %rd6, %rd64; // begin inline asm cp.async.cg.shared.global [%r129], [%rd7], 16, %r130; // end inline asm selp.b32 %r132, 16, 0, %p31; add.s64 %rd8, %rd7, %rd64; // begin inline asm cp.async.cg.shared.global [%r131], [%rd8], 16, %r132; // end inline asm selp.b32 %r134, 16, 0, %p33; add.s64 %rd9, %rd8, %rd64; // begin inline asm cp.async.cg.shared.global [%r133], [%rd9], 16, %r134; // end inline asm selp.b32 %r136, 16, 0, %p35; add.s64 %rd10, %rd9, %rd64; // begin inline asm cp.async.cg.shared.global [%r135], [%rd10], 16, %r136; // end inline asm add.s64 %rd11, %rd10, %rd64; // begin inline asm 
cp.async.cg.shared.global [%r137], [%rd11], 16, %r138; // end inline asm selp.b32 %r140, 16, 0, %p39; add.s64 %rd12, %rd11, %rd64; // begin inline asm cp.async.cg.shared.global [%r139], [%rd12], 16, %r140; // end inline asm selp.b32 %r142, 16, 0, %p41; add.s64 %rd13, %rd12, %rd64; // begin inline asm cp.async.cg.shared.global [%r141], [%rd13], 16, %r142; // end inline asm min.s32 %r2161, %r2068, 128; setp.lt.s32 %p42, %r5, %r2161; and.pred %p43, %p21, %p42; setp.lt.s32 %p44, %r2145, %r2161; and.pred %p45, %p21, %p44; setp.lt.s32 %p46, %r2146, %r2161; and.pred %p47, %p21, %p46; setp.lt.s32 %p48, %r2147, %r2161; and.pred %p49, %p21, %p48; setp.lt.s32 %p50, %r2148, %r2161; and.pred %p51, %p21, %p50; setp.lt.s32 %p52, %r2149, %r2161; and.pred %p53, %p21, %p52; setp.lt.s32 %p54, %r2150, %r2161; and.pred %p55, %p21, %p54; setp.lt.s32 %p56, %r2151, %r2161; and.pred %p57, %p21, %p56; add.s32 %r2162, %r5, 64; setp.lt.s32 %p58, %r2162, %r2161; and.pred %p59, %p21, %p58; add.s32 %r2163, %r5, 72; setp.lt.s32 %p60, %r2163, %r2161; and.pred %p61, %p21, %p60; add.s32 %r2164, %r5, 80; setp.lt.s32 %p62, %r2164, %r2161; and.pred %p63, %p21, %p62; add.s32 %r2165, %r5, 88; setp.lt.s32 %p64, %r2165, %r2161; and.pred %p65, %p21, %p64; add.s32 %r2166, %r5, 96; setp.lt.s32 %p66, %r2166, %r2161; and.pred %p67, %p21, %p66; add.s32 %r2167, %r5, 104; setp.lt.s32 %p68, %r2167, %r2161; and.pred %p69, %p21, %p68; add.s32 %r2168, %r5, 112; setp.lt.s32 %p70, %r2168, %r2161; and.pred %p71, %p21, %p70; add.s32 %r2169, %r5, 120; setp.lt.s32 %p72, %r2169, %r2161; and.pred %p73, %p21, %p72; shl.b64 %rd65, %rd55, 3; selp.b32 %r186, 16, 0, %p53; selp.b32 %r188, 16, 0, %p55; selp.b32 %r190, 16, 0, %p57; selp.b32 %r192, 16, 0, %p59; selp.b32 %r194, 16, 0, %p61; selp.b32 %r196, 16, 0, %p63; selp.b32 %r198, 16, 0, %p65; selp.b32 %r200, 16, 0, %p67; selp.b32 %r202, 16, 0, %p69; add.s32 %r2170, %r2159, 32768; add.s32 %r2171, %r2152, %r2170; add.s32 %r143, %r2171, %r2113; add.s32 %r145, %r143, 2048; add.s32 
%r147, %r143, 4096; add.s32 %r149, %r143, 6144; add.s32 %r151, %r143, 8192; add.s32 %r153, %r143, 10240; add.s32 %r155, %r143, 12288; add.s32 %r157, %r143, 14336; add.s32 %r159, %r143, 16384; add.s32 %r161, %r143, 18432; add.s32 %r163, %r143, 20480; add.s32 %r165, %r143, 22528; add.s32 %r167, %r143, 24576; add.s32 %r169, %r143, 26624; add.s32 %r171, %r143, 28672; add.s32 %r173, %r143, 30720; selp.b32 %r176, 16, 0, %p43; // begin inline asm cp.async.cg.shared.global [%r143], [%rd14], 16, %r176; // end inline asm selp.b32 %r178, 16, 0, %p45; add.s64 %rd15, %rd14, %rd65; // begin inline asm cp.async.cg.shared.global [%r145], [%rd15], 16, %r178; // end inline asm selp.b32 %r180, 16, 0, %p47; add.s64 %rd16, %rd15, %rd65; // begin inline asm cp.async.cg.shared.global [%r147], [%rd16], 16, %r180; // end inline asm selp.b32 %r182, 16, 0, %p49; add.s64 %rd17, %rd16, %rd65; // begin inline asm cp.async.cg.shared.global [%r149], [%rd17], 16, %r182; // end inline asm selp.b32 %r184, 16, 0, %p51; add.s64 %rd18, %rd17, %rd65; // begin inline asm cp.async.cg.shared.global [%r151], [%rd18], 16, %r184; // end inline asm add.s64 %rd19, %rd18, %rd65; // begin inline asm cp.async.cg.shared.global [%r153], [%rd19], 16, %r186; // end inline asm add.s64 %rd20, %rd19, %rd65; // begin inline asm cp.async.cg.shared.global [%r155], [%rd20], 16, %r188; // end inline asm add.s64 %rd21, %rd20, %rd65; // begin inline asm cp.async.cg.shared.global [%r157], [%rd21], 16, %r190; // end inline asm add.s64 %rd22, %rd21, %rd65; // begin inline asm cp.async.cg.shared.global [%r159], [%rd22], 16, %r192; // end inline asm add.s64 %rd23, %rd22, %rd65; // begin inline asm cp.async.cg.shared.global [%r161], [%rd23], 16, %r194; // end inline asm add.s64 %rd24, %rd23, %rd65; // begin inline asm cp.async.cg.shared.global [%r163], [%rd24], 16, %r196; // end inline asm add.s64 %rd25, %rd24, %rd65; // begin inline asm cp.async.cg.shared.global [%r165], [%rd25], 16, %r198; // end inline asm add.s64 %rd26, %rd25, 
%rd65; // begin inline asm cp.async.cg.shared.global [%r167], [%rd26], 16, %r200; // end inline asm add.s64 %rd27, %rd26, %rd65; // begin inline asm cp.async.cg.shared.global [%r169], [%rd27], 16, %r202; // end inline asm selp.b32 %r204, 16, 0, %p71; add.s64 %rd28, %rd27, %rd65; // begin inline asm cp.async.cg.shared.global [%r171], [%rd28], 16, %r204; // end inline asm selp.b32 %r206, 16, 0, %p73; add.s64 %rd29, %rd28, %rd65; // begin inline asm cp.async.cg.shared.global [%r173], [%rd29], 16, %r206; // end inline asm add.s32 %r2172, %r2159, 65536; add.s32 %r2173, %r2152, %r2172; add.s32 %r175, %r2173, %r2124; add.s32 %r177, %r175, 2048; add.s32 %r179, %r175, 4096; add.s32 %r181, %r175, 6144; add.s32 %r183, %r175, 8192; add.s32 %r185, %r175, 10240; add.s32 %r187, %r175, 12288; add.s32 %r189, %r175, 14336; add.s32 %r191, %r175, 16384; add.s32 %r193, %r175, 18432; add.s32 %r195, %r175, 20480; add.s32 %r197, %r175, 22528; add.s32 %r199, %r175, 24576; add.s32 %r201, %r175, 26624; add.s32 %r203, %r175, 28672; add.s32 %r205, %r175, 30720; // begin inline asm cp.async.cg.shared.global [%r175], [%rd30], 16, %r176; // end inline asm add.s64 %rd31, %rd30, %rd65; // begin inline asm cp.async.cg.shared.global [%r177], [%rd31], 16, %r178; // end inline asm add.s64 %rd32, %rd31, %rd65; // begin inline asm cp.async.cg.shared.global [%r179], [%rd32], 16, %r180; // end inline asm add.s64 %rd33, %rd32, %rd65; // begin inline asm cp.async.cg.shared.global [%r181], [%rd33], 16, %r182; // end inline asm add.s64 %rd34, %rd33, %rd65; // begin inline asm cp.async.cg.shared.global [%r183], [%rd34], 16, %r184; // end inline asm add.s64 %rd35, %rd34, %rd65; // begin inline asm cp.async.cg.shared.global [%r185], [%rd35], 16, %r186; // end inline asm add.s64 %rd36, %rd35, %rd65; // begin inline asm cp.async.cg.shared.global [%r187], [%rd36], 16, %r188; // end inline asm add.s64 %rd37, %rd36, %rd65; // begin inline asm cp.async.cg.shared.global [%r189], [%rd37], 16, %r190; // end inline asm 
add.s64 %rd38, %rd37, %rd65; // begin inline asm cp.async.cg.shared.global [%r191], [%rd38], 16, %r192; // end inline asm add.s64 %rd39, %rd38, %rd65; // begin inline asm cp.async.cg.shared.global [%r193], [%rd39], 16, %r194; // end inline asm add.s64 %rd40, %rd39, %rd65; // begin inline asm cp.async.cg.shared.global [%r195], [%rd40], 16, %r196; // end inline asm add.s64 %rd41, %rd40, %rd65; // begin inline asm cp.async.cg.shared.global [%r197], [%rd41], 16, %r198; // end inline asm add.s64 %rd42, %rd41, %rd65; // begin inline asm cp.async.cg.shared.global [%r199], [%rd42], 16, %r200; // end inline asm add.s64 %rd43, %rd42, %rd65; // begin inline asm cp.async.cg.shared.global [%r201], [%rd43], 16, %r202; // end inline asm add.s64 %rd44, %rd43, %rd65; // begin inline asm cp.async.cg.shared.global [%r203], [%rd44], 16, %r204; // end inline asm add.s64 %rd45, %rd44, %rd65; // begin inline asm cp.async.cg.shared.global [%r205], [%rd45], 16, %r206; // end inline asm // begin inline asm cp.async.commit_group; // end inline asm // begin inline asm cp.async.wait_group 0; // end inline asm bar.sync 0; add.s32 %r2174, %r2153, %r2159; add.s32 %r211, %r2174, %r2096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r207, %r208, %r209, %r210}, [%r211]; // end inline asm add.s32 %r216, %r211, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r212, %r213, %r214, %r215}, [%r216]; // end inline asm add.s32 %r221, %r211, 8192; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r217, %r218, %r219, %r220}, [%r221]; // end inline asm add.s32 %r226, %r211, 12288; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r222, %r223, %r224, %r225}, [%r226]; // end inline asm add.s32 %r2175, %r2112, %r2170; add.s32 %r231, %r2175, %r2154; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r227, %r228, %r229, %r230}, [%r231]; // end inline asm add.s32 %r236, %r231, 16384; // begin inline asm 
ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r232, %r233, %r234, %r235}, [%r236]; // end inline asm xor.b32 %r2176, %r2154, 32; add.s32 %r241, %r2175, %r2176; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r237, %r238, %r239, %r240}, [%r241]; // end inline asm add.s32 %r246, %r241, 16384; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r242, %r243, %r244, %r245}, [%r246]; // end inline asm xor.b32 %r2177, %r2154, 64; add.s32 %r251, %r2175, %r2177; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r247, %r248, %r249, %r250}, [%r251]; // end inline asm add.s32 %r256, %r251, 16384; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r252, %r253, %r254, %r255}, [%r256]; // end inline asm xor.b32 %r2178, %r2154, 96; add.s32 %r261, %r2175, %r2178; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r257, %r258, %r259, %r260}, [%r261]; // end inline asm add.s32 %r266, %r261, 16384; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r262, %r263, %r264, %r265}, [%r266]; // end inline asm or.b32 %r2179, %r2154, 128; add.s32 %r271, %r2175, %r2179; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r267, %r268, %r269, %r270}, [%r271]; // end inline asm add.s32 %r276, %r271, 16384; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r272, %r273, %r274, %r275}, [%r276]; // end inline asm xor.b32 %r2180, %r2154, 160; add.s32 %r281, %r2175, %r2180; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r277, %r278, %r279, %r280}, [%r281]; // end inline asm add.s32 %r286, %r281, 16384; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r282, %r283, %r284, %r285}, [%r286]; // end inline asm xor.b32 %r2181, %r2154, 192; add.s32 %r291, %r2175, %r2181; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r287, %r288, %r289, %r290}, [%r291]; // end inline asm add.s32 %r296, %r291, 16384; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r292, 
%r293, %r294, %r295}, [%r296]; // end inline asm xor.b32 %r2182, %r2154, 224; add.s32 %r301, %r2175, %r2182; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r297, %r298, %r299, %r300}, [%r301]; // end inline asm add.s32 %r306, %r301, 16384; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r302, %r303, %r304, %r305}, [%r306]; // end inline asm xor.b32 %r2183, %r2153, 32; add.s32 %r311, %r2155, %r2172; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r307, %r308, %r309, %r310}, [%r311]; // end inline asm xor.b32 %r2184, %r2155, 32; add.s32 %r316, %r2184, %r2172; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r312, %r313, %r314, %r315}, [%r316]; // end inline asm xor.b32 %r2185, %r2155, 64; add.s32 %r321, %r2185, %r2172; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r317, %r318, %r319, %r320}, [%r321]; // end inline asm xor.b32 %r2186, %r2155, 96; add.s32 %r326, %r2186, %r2172; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r322, %r323, %r324, %r325}, [%r326]; // end inline asm or.b32 %r2187, %r2155, 128; add.s32 %r331, %r2187, %r2172; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r327, %r328, %r329, %r330}, [%r331]; // end inline asm xor.b32 %r2188, %r2155, 160; add.s32 %r336, %r2188, %r2172; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r332, %r333, %r334, %r335}, [%r336]; // end inline asm xor.b32 %r2189, %r2155, 192; add.s32 %r341, %r2189, %r2172; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r337, %r338, %r339, %r340}, [%r341]; // end inline asm xor.b32 %r2190, %r2155, 224; add.s32 %r346, %r2190, %r2172; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r342, %r343, %r344, %r345}, [%r346]; // end inline asm add.s32 %r2191, %r2159, 81920; add.s32 %r351, %r2155, %r2191; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r347, %r348, %r349, %r350}, 
[%r351]; // end inline asm add.s32 %r356, %r2184, %r2191; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r352, %r353, %r354, %r355}, [%r356]; // end inline asm add.s32 %r361, %r2185, %r2191; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r357, %r358, %r359, %r360}, [%r361]; // end inline asm add.s32 %r366, %r2186, %r2191; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r362, %r363, %r364, %r365}, [%r366]; // end inline asm add.s32 %r371, %r2187, %r2191; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r367, %r368, %r369, %r370}, [%r371]; // end inline asm add.s32 %r376, %r2188, %r2191; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r372, %r373, %r374, %r375}, [%r376]; // end inline asm add.s32 %r381, %r2189, %r2191; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r377, %r378, %r379, %r380}, [%r381]; // end inline asm add.s32 %r386, %r2190, %r2191; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%r382, %r383, %r384, %r385}, [%r386]; // end inline asm shr.s32 %r2192, %r2072, 31; shr.u32 %r2193, %r2192, 30; add.s32 %r2194, %r2156, %r2193; and.b32 %r2195, %r2194, 1073741820; sub.s32 %r2196, %r2156, %r2195; shr.u32 %r2197, %r2070, 25; add.s32 %r2198, %r2063, %r2197; shr.u32 %r2199, %r2198, 1; and.b32 %r2200, %r2199, 268435392; shr.s32 %r73, %r2142, 2; add.s32 %r2201, %r2200, %r73; shl.b32 %r2202, %r2201, 2; mov.u32 %r2203, 2; add.s32 %r2204, %r2202, %r2196; shl.b32 %r2205, %r2204, 2; add.s32 %r74, %r2170, %r2205; ld.param.u32 %r1841, [%rd1+64]; // begin inline asm mov.u32 %r619, 0; // end inline asm // begin inline asm mov.u32 %r620, 0; // end inline asm // begin inline asm mov.u32 %r629, 0; // end inline asm // begin inline asm mov.u32 %r630, 0; // end inline asm // begin inline asm mov.u32 %r639, 0; // end inline asm // begin inline asm mov.u32 %r640, 0; // end inline asm // begin inline asm mov.u32 %r649, 0; // end inline 
asm // begin inline asm mov.u32 %r650, 0; // end inline asm // begin inline asm mov.u32 %r659, 0; // end inline asm // begin inline asm mov.u32 %r660, 0; // end inline asm // begin inline asm mov.u32 %r669, 0; // end inline asm // begin inline asm mov.u32 %r670, 0; // end inline asm // begin inline asm mov.u32 %r679, 0; // end inline asm // begin inline asm mov.u32 %r680, 0; // end inline asm // begin inline asm mov.u32 %r689, 0; // end inline asm // begin inline asm mov.u32 %r690, 0; // end inline asm // begin inline asm mov.u32 %r699, 0; // end inline asm // begin inline asm mov.u32 %r700, 0; // end inline asm // begin inline asm mov.u32 %r709, 0; // end inline asm // begin inline asm mov.u32 %r710, 0; // end inline asm // begin inline asm mov.u32 %r719, 0; // end inline asm // begin inline asm mov.u32 %r720, 0; // end inline asm // begin inline asm mov.u32 %r729, 0; // end inline asm // begin inline asm mov.u32 %r730, 0; // end inline asm // begin inline asm mov.u32 %r739, 0; // end inline asm // begin inline asm mov.u32 %r740, 0; // end inline asm // begin inline asm mov.u32 %r749, 0; // end inline asm // begin inline asm mov.u32 %r750, 0; // end inline asm // begin inline asm mov.u32 %r759, 0; // end inline asm // begin inline asm mov.u32 %r760, 0; // end inline asm // begin inline asm mov.u32 %r769, 0; // end inline asm // begin inline asm mov.u32 %r770, 0; // end inline asm add.s32 %r2206, %r2096, %r2159; add.s32 %r423, %r2206, %r2183; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r419, %r420, %r421, %r422}, [%r423]; // end inline asm add.s32 %r428, %r423, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r424, %r425, %r426, %r427}, [%r428]; // end inline asm add.s32 %r433, %r423, 8192; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r429, %r430, %r431, %r432}, [%r433]; // end inline asm add.s32 %r438, %r423, 12288; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r434, %r435, %r436, %r437}, 
[%r438]; // end inline asm xor.b32 %r2207, %r2153, 64; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r619, %r620}, {%r207, %r208, %r209, %r210}, {%r227, %r228}, {%r619, %r620}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r629, %r630}, {%r207, %r208, %r209, %r210}, {%r229, %r230}, {%r629, %r630}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r639, %r640}, {%r207, %r208, %r209, %r210}, {%r232, %r233}, {%r639, %r640}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r649, %r650}, {%r207, %r208, %r209, %r210}, {%r234, %r235}, {%r649, %r650}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r659, %r660}, {%r212, %r213, %r214, %r215}, {%r227, %r228}, {%r659, %r660}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r669, %r670}, {%r212, %r213, %r214, %r215}, {%r229, %r230}, {%r669, %r670}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r679, %r680}, {%r212, %r213, %r214, %r215}, {%r232, %r233}, {%r679, %r680}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r689, %r690}, {%r212, %r213, %r214, %r215}, {%r234, %r235}, {%r689, %r690}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r699, %r700}, {%r217, %r218, %r219, %r220}, {%r227, %r228}, {%r699, %r700}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r709, %r710}, {%r217, %r218, %r219, %r220}, {%r229, %r230}, {%r709, %r710}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r719, %r720}, {%r217, %r218, %r219, %r220}, {%r232, %r233}, {%r719, %r720}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r729, %r730}, {%r217, %r218, %r219, %r220}, 
{%r234, %r235}, {%r729, %r730}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r739, %r740}, {%r222, %r223, %r224, %r225}, {%r227, %r228}, {%r739, %r740}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r749, %r750}, {%r222, %r223, %r224, %r225}, {%r229, %r230}, {%r749, %r750}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r759, %r760}, {%r222, %r223, %r224, %r225}, {%r232, %r233}, {%r759, %r760}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r769, %r770}, {%r222, %r223, %r224, %r225}, {%r234, %r235}, {%r769, %r770}; // end inline asm add.s32 %r603, %r2206, %r2207; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r599, %r600, %r601, %r602}, [%r603]; // end inline asm add.s32 %r608, %r603, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r604, %r605, %r606, %r607}, [%r608]; // end inline asm add.s32 %r613, %r603, 8192; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r609, %r610, %r611, %r612}, [%r613]; // end inline asm add.s32 %r618, %r603, 12288; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r614, %r615, %r616, %r617}, [%r618]; // end inline asm xor.b32 %r2208, %r2153, 96; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r619, %r620}, {%r419, %r420, %r421, %r422}, {%r237, %r238}, {%r619, %r620}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r629, %r630}, {%r419, %r420, %r421, %r422}, {%r239, %r240}, {%r629, %r630}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r639, %r640}, {%r419, %r420, %r421, %r422}, {%r242, %r243}, {%r639, %r640}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r649, %r650}, {%r419, %r420, %r421, %r422}, {%r244, %r245}, {%r649, %r650}; // end inline asm // 
begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r659, %r660}, {%r424, %r425, %r426, %r427}, {%r237, %r238}, {%r659, %r660}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r669, %r670}, {%r424, %r425, %r426, %r427}, {%r239, %r240}, {%r669, %r670}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r679, %r680}, {%r424, %r425, %r426, %r427}, {%r242, %r243}, {%r679, %r680}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r689, %r690}, {%r424, %r425, %r426, %r427}, {%r244, %r245}, {%r689, %r690}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r699, %r700}, {%r429, %r430, %r431, %r432}, {%r237, %r238}, {%r699, %r700}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r709, %r710}, {%r429, %r430, %r431, %r432}, {%r239, %r240}, {%r709, %r710}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r719, %r720}, {%r429, %r430, %r431, %r432}, {%r242, %r243}, {%r719, %r720}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r729, %r730}, {%r429, %r430, %r431, %r432}, {%r244, %r245}, {%r729, %r730}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r739, %r740}, {%r434, %r435, %r436, %r437}, {%r237, %r238}, {%r739, %r740}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r749, %r750}, {%r434, %r435, %r436, %r437}, {%r239, %r240}, {%r749, %r750}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r759, %r760}, {%r434, %r435, %r436, %r437}, {%r242, %r243}, {%r759, %r760}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r769, %r770}, {%r434, %r435, %r436, %r437}, {%r244, %r245}, {%r769, %r770}; // end inline asm add.s32 
%r783, %r2206, %r2208; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r779, %r780, %r781, %r782}, [%r783]; // end inline asm add.s32 %r788, %r783, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r784, %r785, %r786, %r787}, [%r788]; // end inline asm add.s32 %r793, %r783, 8192; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r789, %r790, %r791, %r792}, [%r793]; // end inline asm add.s32 %r798, %r783, 12288; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r794, %r795, %r796, %r797}, [%r798]; // end inline asm or.b32 %r2209, %r2153, 128; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r619, %r620}, {%r599, %r600, %r601, %r602}, {%r247, %r248}, {%r619, %r620}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r629, %r630}, {%r599, %r600, %r601, %r602}, {%r249, %r250}, {%r629, %r630}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r639, %r640}, {%r599, %r600, %r601, %r602}, {%r252, %r253}, {%r639, %r640}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r649, %r650}, {%r599, %r600, %r601, %r602}, {%r254, %r255}, {%r649, %r650}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r659, %r660}, {%r604, %r605, %r606, %r607}, {%r247, %r248}, {%r659, %r660}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r669, %r670}, {%r604, %r605, %r606, %r607}, {%r249, %r250}, {%r669, %r670}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r679, %r680}, {%r604, %r605, %r606, %r607}, {%r252, %r253}, {%r679, %r680}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r689, %r690}, {%r604, %r605, %r606, %r607}, {%r254, %r255}, {%r689, %r690}; // end inline asm // begin inline asm 
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r699, %r700}, {%r609, %r610, %r611, %r612}, {%r247, %r248}, {%r699, %r700}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r709, %r710}, {%r609, %r610, %r611, %r612}, {%r249, %r250}, {%r709, %r710}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r719, %r720}, {%r609, %r610, %r611, %r612}, {%r252, %r253}, {%r719, %r720}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r729, %r730}, {%r609, %r610, %r611, %r612}, {%r254, %r255}, {%r729, %r730}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r739, %r740}, {%r614, %r615, %r616, %r617}, {%r247, %r248}, {%r739, %r740}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r749, %r750}, {%r614, %r615, %r616, %r617}, {%r249, %r250}, {%r749, %r750}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r759, %r760}, {%r614, %r615, %r616, %r617}, {%r252, %r253}, {%r759, %r760}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r769, %r770}, {%r614, %r615, %r616, %r617}, {%r254, %r255}, {%r769, %r770}; // end inline asm add.s32 %r963, %r2206, %r2209; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r959, %r960, %r961, %r962}, [%r963]; // end inline asm add.s32 %r968, %r963, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r964, %r965, %r966, %r967}, [%r968]; // end inline asm add.s32 %r973, %r963, 8192; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r969, %r970, %r971, %r972}, [%r973]; // end inline asm add.s32 %r978, %r963, 12288; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r974, %r975, %r976, %r977}, [%r978]; // end inline asm xor.b32 %r2210, %r2153, 160; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 
{%r619, %r620}, {%r779, %r780, %r781, %r782}, {%r257, %r258}, {%r619, %r620}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r629, %r630}, {%r779, %r780, %r781, %r782}, {%r259, %r260}, {%r629, %r630}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r639, %r640}, {%r779, %r780, %r781, %r782}, {%r262, %r263}, {%r639, %r640}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r649, %r650}, {%r779, %r780, %r781, %r782}, {%r264, %r265}, {%r649, %r650}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r659, %r660}, {%r784, %r785, %r786, %r787}, {%r257, %r258}, {%r659, %r660}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r669, %r670}, {%r784, %r785, %r786, %r787}, {%r259, %r260}, {%r669, %r670}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r679, %r680}, {%r784, %r785, %r786, %r787}, {%r262, %r263}, {%r679, %r680}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r689, %r690}, {%r784, %r785, %r786, %r787}, {%r264, %r265}, {%r689, %r690}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r699, %r700}, {%r789, %r790, %r791, %r792}, {%r257, %r258}, {%r699, %r700}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r709, %r710}, {%r789, %r790, %r791, %r792}, {%r259, %r260}, {%r709, %r710}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r719, %r720}, {%r789, %r790, %r791, %r792}, {%r262, %r263}, {%r719, %r720}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r729, %r730}, {%r789, %r790, %r791, %r792}, {%r264, %r265}, {%r729, %r730}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r739, 
%r740}, {%r794, %r795, %r796, %r797}, {%r257, %r258}, {%r739, %r740}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r749, %r750}, {%r794, %r795, %r796, %r797}, {%r259, %r260}, {%r749, %r750}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r759, %r760}, {%r794, %r795, %r796, %r797}, {%r262, %r263}, {%r759, %r760}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r769, %r770}, {%r794, %r795, %r796, %r797}, {%r264, %r265}, {%r769, %r770}; // end inline asm add.s32 %r1143, %r2206, %r2210; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1139, %r1140, %r1141, %r1142}, [%r1143]; // end inline asm add.s32 %r1148, %r1143, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1144, %r1145, %r1146, %r1147}, [%r1148]; // end inline asm add.s32 %r1153, %r1143, 8192; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1149, %r1150, %r1151, %r1152}, [%r1153]; // end inline asm add.s32 %r1158, %r1143, 12288; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1154, %r1155, %r1156, %r1157}, [%r1158]; // end inline asm xor.b32 %r2211, %r2153, 192; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r619, %r620}, {%r959, %r960, %r961, %r962}, {%r267, %r268}, {%r619, %r620}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r629, %r630}, {%r959, %r960, %r961, %r962}, {%r269, %r270}, {%r629, %r630}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r639, %r640}, {%r959, %r960, %r961, %r962}, {%r272, %r273}, {%r639, %r640}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r649, %r650}, {%r959, %r960, %r961, %r962}, {%r274, %r275}, {%r649, %r650}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r659, %r660}, {%r964, %r965, 
%r966, %r967}, {%r267, %r268}, {%r659, %r660}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r669, %r670}, {%r964, %r965, %r966, %r967}, {%r269, %r270}, {%r669, %r670}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r679, %r680}, {%r964, %r965, %r966, %r967}, {%r272, %r273}, {%r679, %r680}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r689, %r690}, {%r964, %r965, %r966, %r967}, {%r274, %r275}, {%r689, %r690}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r699, %r700}, {%r969, %r970, %r971, %r972}, {%r267, %r268}, {%r699, %r700}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r709, %r710}, {%r969, %r970, %r971, %r972}, {%r269, %r270}, {%r709, %r710}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r719, %r720}, {%r969, %r970, %r971, %r972}, {%r272, %r273}, {%r719, %r720}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r729, %r730}, {%r969, %r970, %r971, %r972}, {%r274, %r275}, {%r729, %r730}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r739, %r740}, {%r974, %r975, %r976, %r977}, {%r267, %r268}, {%r739, %r740}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r749, %r750}, {%r974, %r975, %r976, %r977}, {%r269, %r270}, {%r749, %r750}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r759, %r760}, {%r974, %r975, %r976, %r977}, {%r272, %r273}, {%r759, %r760}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r769, %r770}, {%r974, %r975, %r976, %r977}, {%r274, %r275}, {%r769, %r770}; // end inline asm add.s32 %r1323, %r2206, %r2211; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1319, 
%r1320, %r1321, %r1322}, [%r1323]; // end inline asm add.s32 %r1328, %r1323, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1324, %r1325, %r1326, %r1327}, [%r1328]; // end inline asm add.s32 %r1333, %r1323, 8192; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1329, %r1330, %r1331, %r1332}, [%r1333]; // end inline asm add.s32 %r1338, %r1323, 12288; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1334, %r1335, %r1336, %r1337}, [%r1338]; // end inline asm xor.b32 %r2212, %r2153, 224; // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r619, %r620}, {%r1139, %r1140, %r1141, %r1142}, {%r277, %r278}, {%r619, %r620}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r629, %r630}, {%r1139, %r1140, %r1141, %r1142}, {%r279, %r280}, {%r629, %r630}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r639, %r640}, {%r1139, %r1140, %r1141, %r1142}, {%r282, %r283}, {%r639, %r640}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r649, %r650}, {%r1139, %r1140, %r1141, %r1142}, {%r284, %r285}, {%r649, %r650}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r659, %r660}, {%r1144, %r1145, %r1146, %r1147}, {%r277, %r278}, {%r659, %r660}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r669, %r670}, {%r1144, %r1145, %r1146, %r1147}, {%r279, %r280}, {%r669, %r670}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r679, %r680}, {%r1144, %r1145, %r1146, %r1147}, {%r282, %r283}, {%r679, %r680}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r689, %r690}, {%r1144, %r1145, %r1146, %r1147}, {%r284, %r285}, {%r689, %r690}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r699, %r700}, {%r1149, 
%r1150, %r1151, %r1152}, {%r277, %r278}, {%r699, %r700}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r709, %r710}, {%r1149, %r1150, %r1151, %r1152}, {%r279, %r280}, {%r709, %r710}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r719, %r720}, {%r1149, %r1150, %r1151, %r1152}, {%r282, %r283}, {%r719, %r720}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r729, %r730}, {%r1149, %r1150, %r1151, %r1152}, {%r284, %r285}, {%r729, %r730}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r739, %r740}, {%r1154, %r1155, %r1156, %r1157}, {%r277, %r278}, {%r739, %r740}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r749, %r750}, {%r1154, %r1155, %r1156, %r1157}, {%r279, %r280}, {%r749, %r750}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r759, %r760}, {%r1154, %r1155, %r1156, %r1157}, {%r282, %r283}, {%r759, %r760}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r769, %r770}, {%r1154, %r1155, %r1156, %r1157}, {%r284, %r285}, {%r769, %r770}; // end inline asm add.s32 %r1503, %r2206, %r2212; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1499, %r1500, %r1501, %r1502}, [%r1503]; // end inline asm add.s32 %r1508, %r1503, 4096; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1504, %r1505, %r1506, %r1507}, [%r1508]; // end inline asm add.s32 %r1513, %r1503, 8192; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1509, %r1510, %r1511, %r1512}, [%r1513]; // end inline asm add.s32 %r1518, %r1503, 12288; // begin inline asm ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%r1514, %r1515, %r1516, %r1517}, [%r1518]; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r619, %r620}, {%r1319, %r1320, %r1321, %r1322}, 
{%r287, %r288}, {%r619, %r620}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r629, %r630}, {%r1319, %r1320, %r1321, %r1322}, {%r289, %r290}, {%r629, %r630}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r639, %r640}, {%r1319, %r1320, %r1321, %r1322}, {%r292, %r293}, {%r639, %r640}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r649, %r650}, {%r1319, %r1320, %r1321, %r1322}, {%r294, %r295}, {%r649, %r650}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r659, %r660}, {%r1324, %r1325, %r1326, %r1327}, {%r287, %r288}, {%r659, %r660}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r669, %r670}, {%r1324, %r1325, %r1326, %r1327}, {%r289, %r290}, {%r669, %r670}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r679, %r680}, {%r1324, %r1325, %r1326, %r1327}, {%r292, %r293}, {%r679, %r680}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r689, %r690}, {%r1324, %r1325, %r1326, %r1327}, {%r294, %r295}, {%r689, %r690}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r699, %r700}, {%r1329, %r1330, %r1331, %r1332}, {%r287, %r288}, {%r699, %r700}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r709, %r710}, {%r1329, %r1330, %r1331, %r1332}, {%r289, %r290}, {%r709, %r710}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r719, %r720}, {%r1329, %r1330, %r1331, %r1332}, {%r292, %r293}, {%r719, %r720}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r729, %r730}, {%r1329, %r1330, %r1331, %r1332}, {%r294, %r295}, {%r729, %r730}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r739, 
%r740}, {%r1334, %r1335, %r1336, %r1337}, {%r287, %r288}, {%r739, %r740}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r749, %r750}, {%r1334, %r1335, %r1336, %r1337}, {%r289, %r290}, {%r749, %r750}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r759, %r760}, {%r1334, %r1335, %r1336, %r1337}, {%r292, %r293}, {%r759, %r760}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r769, %r770}, {%r1334, %r1335, %r1336, %r1337}, {%r294, %r295}, {%r769, %r770}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r619, %r620}, {%r1499, %r1500, %r1501, %r1502}, {%r297, %r298}, {%r619, %r620}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r629, %r630}, {%r1499, %r1500, %r1501, %r1502}, {%r299, %r300}, {%r629, %r630}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r639, %r640}, {%r1499, %r1500, %r1501, %r1502}, {%r302, %r303}, {%r639, %r640}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r649, %r650}, {%r1499, %r1500, %r1501, %r1502}, {%r304, %r305}, {%r649, %r650}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r659, %r660}, {%r1504, %r1505, %r1506, %r1507}, {%r297, %r298}, {%r659, %r660}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r669, %r670}, {%r1504, %r1505, %r1506, %r1507}, {%r299, %r300}, {%r669, %r670}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r679, %r680}, {%r1504, %r1505, %r1506, %r1507}, {%r302, %r303}, {%r679, %r680}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r689, %r690}, {%r1504, %r1505, %r1506, %r1507}, {%r304, %r305}, {%r689, %r690}; // end inline asm // begin inline asm 
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r699, %r700}, {%r1509, %r1510, %r1511, %r1512}, {%r297, %r298}, {%r699, %r700}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r709, %r710}, {%r1509, %r1510, %r1511, %r1512}, {%r299, %r300}, {%r709, %r710}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r719, %r720}, {%r1509, %r1510, %r1511, %r1512}, {%r302, %r303}, {%r719, %r720}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r729, %r730}, {%r1509, %r1510, %r1511, %r1512}, {%r304, %r305}, {%r729, %r730}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r739, %r740}, {%r1514, %r1515, %r1516, %r1517}, {%r297, %r298}, {%r739, %r740}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r749, %r750}, {%r1514, %r1515, %r1516, %r1517}, {%r299, %r300}, {%r749, %r750}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r759, %r760}, {%r1514, %r1515, %r1516, %r1517}, {%r302, %r303}, {%r759, %r760}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r769, %r770}, {%r1514, %r1515, %r1516, %r1517}, {%r304, %r305}, {%r769, %r770}; // end inline asm // begin inline asm mul.f16x2 %r1839, %r619, %r1841; // end inline asm mov.u32 %r2058, 2080340991; // begin inline asm min.xorsign.abs.f16x2 %r1842, %r1839, %r2058; // end inline asm // begin inline asm mul.f16x2 %r1845, %r620, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1848, %r1845, %r2058; // end inline asm // begin inline asm mul.f16x2 %r1851, %r629, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1854, %r1851, %r2058; // end inline asm // begin inline asm mul.f16x2 %r1857, %r630, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1860, %r1857, %r2058; // end inline asm // begin 
inline asm mov.b32 {%rs1, %rs2}, %r1842; // end inline asm // begin inline asm cvt.f32.f16 %f187, %rs1; // end inline asm // begin inline asm cvt.f32.f16 %f188, %rs2; // end inline asm // begin inline asm mov.b32 {%rs5, %rs6}, %r1848; // end inline asm // begin inline asm cvt.f32.f16 %f189, %rs5; // end inline asm // begin inline asm cvt.f32.f16 %f190, %rs6; // end inline asm // begin inline asm mov.b32 {%rs9, %rs10}, %r1854; // end inline asm // begin inline asm cvt.f32.f16 %f191, %rs9; // end inline asm // begin inline asm cvt.f32.f16 %f192, %rs10; // end inline asm // begin inline asm mov.b32 {%rs13, %rs14}, %r1860; // end inline asm // begin inline asm cvt.f32.f16 %f193, %rs13; // end inline asm // begin inline asm cvt.f32.f16 %f194, %rs14; // end inline asm // begin inline asm mul.f16x2 %r1867, %r639, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1870, %r1867, %r2058; // end inline asm // begin inline asm mul.f16x2 %r1873, %r640, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1876, %r1873, %r2058; // end inline asm // begin inline asm mul.f16x2 %r1879, %r649, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1882, %r1879, %r2058; // end inline asm // begin inline asm mul.f16x2 %r1885, %r650, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1888, %r1885, %r2058; // end inline asm // begin inline asm mov.b32 {%rs17, %rs18}, %r1870; // end inline asm // begin inline asm cvt.f32.f16 %f195, %rs17; // end inline asm // begin inline asm cvt.f32.f16 %f196, %rs18; // end inline asm // begin inline asm mov.b32 {%rs21, %rs22}, %r1876; // end inline asm // begin inline asm cvt.f32.f16 %f197, %rs21; // end inline asm // begin inline asm cvt.f32.f16 %f198, %rs22; // end inline asm // begin inline asm mov.b32 {%rs25, %rs26}, %r1882; // end inline asm // begin inline asm cvt.f32.f16 %f199, %rs25; // end inline asm // begin inline asm cvt.f32.f16 %f200, %rs26; // end inline asm // begin 
inline asm mov.b32 {%rs29, %rs30}, %r1888; // end inline asm // begin inline asm cvt.f32.f16 %f201, %rs29; // end inline asm // begin inline asm cvt.f32.f16 %f202, %rs30; // end inline asm // begin inline asm mul.f16x2 %r1895, %r659, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1898, %r1895, %r2058; // end inline asm // begin inline asm mul.f16x2 %r1901, %r660, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1904, %r1901, %r2058; // end inline asm // begin inline asm mul.f16x2 %r1907, %r669, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1910, %r1907, %r2058; // end inline asm // begin inline asm mul.f16x2 %r1913, %r670, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1916, %r1913, %r2058; // end inline asm // begin inline asm mov.b32 {%rs33, %rs34}, %r1898; // end inline asm // begin inline asm cvt.f32.f16 %f203, %rs33; // end inline asm // begin inline asm cvt.f32.f16 %f204, %rs34; // end inline asm // begin inline asm mov.b32 {%rs37, %rs38}, %r1904; // end inline asm // begin inline asm cvt.f32.f16 %f205, %rs37; // end inline asm // begin inline asm cvt.f32.f16 %f206, %rs38; // end inline asm // begin inline asm mov.b32 {%rs41, %rs42}, %r1910; // end inline asm // begin inline asm cvt.f32.f16 %f207, %rs41; // end inline asm // begin inline asm cvt.f32.f16 %f208, %rs42; // end inline asm // begin inline asm mov.b32 {%rs45, %rs46}, %r1916; // end inline asm // begin inline asm cvt.f32.f16 %f209, %rs45; // end inline asm // begin inline asm cvt.f32.f16 %f210, %rs46; // end inline asm // begin inline asm mul.f16x2 %r1923, %r679, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1926, %r1923, %r2058; // end inline asm // begin inline asm mul.f16x2 %r1929, %r680, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1932, %r1929, %r2058; // end inline asm // begin inline asm mul.f16x2 %r1935, %r689, %r1841; // end inline asm // begin inline 
asm min.xorsign.abs.f16x2 %r1938, %r1935, %r2058; // end inline asm // begin inline asm mul.f16x2 %r1941, %r690, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1944, %r1941, %r2058; // end inline asm // begin inline asm mov.b32 {%rs49, %rs50}, %r1926; // end inline asm // begin inline asm cvt.f32.f16 %f211, %rs49; // end inline asm // begin inline asm cvt.f32.f16 %f212, %rs50; // end inline asm // begin inline asm mov.b32 {%rs53, %rs54}, %r1932; // end inline asm // begin inline asm cvt.f32.f16 %f213, %rs53; // end inline asm // begin inline asm cvt.f32.f16 %f214, %rs54; // end inline asm // begin inline asm mov.b32 {%rs57, %rs58}, %r1938; // end inline asm // begin inline asm cvt.f32.f16 %f215, %rs57; // end inline asm // begin inline asm cvt.f32.f16 %f216, %rs58; // end inline asm // begin inline asm mov.b32 {%rs61, %rs62}, %r1944; // end inline asm // begin inline asm cvt.f32.f16 %f217, %rs61; // end inline asm // begin inline asm cvt.f32.f16 %f218, %rs62; // end inline asm // begin inline asm mul.f16x2 %r1951, %r699, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1954, %r1951, %r2058; // end inline asm // begin inline asm mul.f16x2 %r1957, %r700, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1960, %r1957, %r2058; // end inline asm // begin inline asm mul.f16x2 %r1963, %r709, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1966, %r1963, %r2058; // end inline asm // begin inline asm mul.f16x2 %r1969, %r710, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1972, %r1969, %r2058; // end inline asm // begin inline asm mov.b32 {%rs65, %rs66}, %r1954; // end inline asm // begin inline asm cvt.f32.f16 %f219, %rs65; // end inline asm // begin inline asm cvt.f32.f16 %f220, %rs66; // end inline asm // begin inline asm mov.b32 {%rs69, %rs70}, %r1960; // end inline asm // begin inline asm cvt.f32.f16 %f221, %rs69; // end inline asm // begin inline asm cvt.f32.f16 
%f222, %rs70; // end inline asm // begin inline asm mov.b32 {%rs73, %rs74}, %r1966; // end inline asm // begin inline asm cvt.f32.f16 %f223, %rs73; // end inline asm // begin inline asm cvt.f32.f16 %f224, %rs74; // end inline asm // begin inline asm mov.b32 {%rs77, %rs78}, %r1972; // end inline asm // begin inline asm cvt.f32.f16 %f225, %rs77; // end inline asm // begin inline asm cvt.f32.f16 %f226, %rs78; // end inline asm // begin inline asm mul.f16x2 %r1979, %r719, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1982, %r1979, %r2058; // end inline asm // begin inline asm mul.f16x2 %r1985, %r720, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1988, %r1985, %r2058; // end inline asm // begin inline asm mul.f16x2 %r1991, %r729, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r1994, %r1991, %r2058; // end inline asm // begin inline asm mul.f16x2 %r1997, %r730, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r2000, %r1997, %r2058; // end inline asm // begin inline asm mov.b32 {%rs81, %rs82}, %r1982; // end inline asm // begin inline asm cvt.f32.f16 %f227, %rs81; // end inline asm // begin inline asm cvt.f32.f16 %f228, %rs82; // end inline asm // begin inline asm mov.b32 {%rs85, %rs86}, %r1988; // end inline asm // begin inline asm cvt.f32.f16 %f229, %rs85; // end inline asm // begin inline asm cvt.f32.f16 %f230, %rs86; // end inline asm // begin inline asm mov.b32 {%rs89, %rs90}, %r1994; // end inline asm // begin inline asm cvt.f32.f16 %f231, %rs89; // end inline asm // begin inline asm cvt.f32.f16 %f232, %rs90; // end inline asm // begin inline asm mov.b32 {%rs93, %rs94}, %r2000; // end inline asm // begin inline asm cvt.f32.f16 %f233, %rs93; // end inline asm // begin inline asm cvt.f32.f16 %f234, %rs94; // end inline asm // begin inline asm mul.f16x2 %r2007, %r739, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r2010, %r2007, %r2058; // end inline asm // 
begin inline asm mul.f16x2 %r2013, %r740, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r2016, %r2013, %r2058; // end inline asm // begin inline asm mul.f16x2 %r2019, %r749, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r2022, %r2019, %r2058; // end inline asm // begin inline asm mul.f16x2 %r2025, %r750, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r2028, %r2025, %r2058; // end inline asm // begin inline asm mov.b32 {%rs97, %rs98}, %r2010; // end inline asm // begin inline asm cvt.f32.f16 %f235, %rs97; // end inline asm // begin inline asm cvt.f32.f16 %f236, %rs98; // end inline asm // begin inline asm mov.b32 {%rs101, %rs102}, %r2016; // end inline asm // begin inline asm cvt.f32.f16 %f237, %rs101; // end inline asm // begin inline asm cvt.f32.f16 %f238, %rs102; // end inline asm // begin inline asm mov.b32 {%rs105, %rs106}, %r2022; // end inline asm // begin inline asm cvt.f32.f16 %f239, %rs105; // end inline asm // begin inline asm cvt.f32.f16 %f240, %rs106; // end inline asm // begin inline asm mov.b32 {%rs109, %rs110}, %r2028; // end inline asm // begin inline asm cvt.f32.f16 %f241, %rs109; // end inline asm // begin inline asm cvt.f32.f16 %f242, %rs110; // end inline asm // begin inline asm mul.f16x2 %r2035, %r759, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r2038, %r2035, %r2058; // end inline asm // begin inline asm mul.f16x2 %r2041, %r760, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r2044, %r2041, %r2058; // end inline asm // begin inline asm mul.f16x2 %r2047, %r769, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r2050, %r2047, %r2058; // end inline asm // begin inline asm mul.f16x2 %r2053, %r770, %r1841; // end inline asm // begin inline asm min.xorsign.abs.f16x2 %r2056, %r2053, %r2058; // end inline asm // begin inline asm mov.b32 {%rs113, %rs114}, %r2038; // end inline asm // begin inline asm cvt.f32.f16 %f243, 
%rs113; // end inline asm // begin inline asm cvt.f32.f16 %f244, %rs114; // end inline asm // begin inline asm mov.b32 {%rs117, %rs118}, %r2044; // end inline asm // begin inline asm cvt.f32.f16 %f245, %rs117; // end inline asm // begin inline asm cvt.f32.f16 %f246, %rs118; // end inline asm // begin inline asm mov.b32 {%rs121, %rs122}, %r2050; // end inline asm // begin inline asm cvt.f32.f16 %f247, %rs121; // end inline asm // begin inline asm cvt.f32.f16 %f248, %rs122; // end inline asm // begin inline asm mov.b32 {%rs125, %rs126}, %r2056; // end inline asm // begin inline asm cvt.f32.f16 %f249, %rs125; // end inline asm // begin inline asm cvt.f32.f16 %f250, %rs126; // end inline asm add.s32 %r2213, %r2158, %r2157; setp.lt.s32 %p74, %r2213, %r2068; selp.f32 %f1, %f187, 0fFF800000, %p74; or.b32 %r2214, %r2213, 1; setp.lt.s32 %p75, %r2214, %r2068; selp.f32 %f2, %f188, 0fFF800000, %p75; add.s32 %r2215, %r2213, 8; setp.lt.s32 %p76, %r2215, %r2068; selp.f32 %f3, %f191, 0fFF800000, %p76; add.s32 %r2216, %r2213, 9; setp.lt.s32 %p77, %r2216, %r2068; selp.f32 %f4, %f192, 0fFF800000, %p77; add.s32 %r2217, %r2213, 64; setp.lt.s32 %p78, %r2217, %r2068; selp.f32 %f5, %f195, 0fFF800000, %p78; add.s32 %r2218, %r2213, 65; setp.lt.s32 %p79, %r2218, %r2068; selp.f32 %f6, %f196, 0fFF800000, %p79; add.s32 %r2219, %r2213, 72; setp.lt.s32 %p80, %r2219, %r2068; selp.f32 %f7, %f199, 0fFF800000, %p80; add.s32 %r2220, %r2213, 73; setp.lt.s32 %p81, %r2220, %r2068; selp.f32 %f8, %f200, 0fFF800000, %p81; selp.f32 %f9, %f189, 0fFF800000, %p74; selp.f32 %f10, %f190, 0fFF800000, %p75; selp.f32 %f11, %f193, 0fFF800000, %p76; selp.f32 %f12, %f194, 0fFF800000, %p77; selp.f32 %f13, %f197, 0fFF800000, %p78; selp.f32 %f14, %f198, 0fFF800000, %p79; selp.f32 %f15, %f201, 0fFF800000, %p80; selp.f32 %f16, %f202, 0fFF800000, %p81; selp.f32 %f17, %f203, 0fFF800000, %p74; selp.f32 %f18, %f204, 0fFF800000, %p75; selp.f32 %f19, %f207, 0fFF800000, %p76; selp.f32 %f20, %f208, 0fFF800000, %p77; selp.f32 %f21, 
%f211, 0fFF800000, %p78; selp.f32 %f22, %f212, 0fFF800000, %p79; selp.f32 %f23, %f215, 0fFF800000, %p80; selp.f32 %f24, %f216, 0fFF800000, %p81; selp.f32 %f25, %f205, 0fFF800000, %p74; selp.f32 %f26, %f206, 0fFF800000, %p75; selp.f32 %f27, %f209, 0fFF800000, %p76; selp.f32 %f28, %f210, 0fFF800000, %p77; selp.f32 %f29, %f213, 0fFF800000, %p78; selp.f32 %f30, %f214, 0fFF800000, %p79; selp.f32 %f31, %f217, 0fFF800000, %p80; selp.f32 %f32, %f218, 0fFF800000, %p81; selp.f32 %f33, %f219, 0fFF800000, %p74; selp.f32 %f34, %f220, 0fFF800000, %p75; selp.f32 %f35, %f223, 0fFF800000, %p76; selp.f32 %f36, %f224, 0fFF800000, %p77; selp.f32 %f37, %f227, 0fFF800000, %p78; selp.f32 %f38, %f228, 0fFF800000, %p79; selp.f32 %f39, %f231, 0fFF800000, %p80; selp.f32 %f40, %f232, 0fFF800000, %p81; selp.f32 %f41, %f221, 0fFF800000, %p74; selp.f32 %f42, %f222, 0fFF800000, %p75; selp.f32 %f43, %f225, 0fFF800000, %p76; selp.f32 %f44, %f226, 0fFF800000, %p77; selp.f32 %f45, %f229, 0fFF800000, %p78; selp.f32 %f46, %f230, 0fFF800000, %p79; selp.f32 %f47, %f233, 0fFF800000, %p80; selp.f32 %f48, %f234, 0fFF800000, %p81; selp.f32 %f49, %f235, 0fFF800000, %p74; selp.f32 %f50, %f236, 0fFF800000, %p75; selp.f32 %f51, %f239, 0fFF800000, %p76; selp.f32 %f52, %f240, 0fFF800000, %p77; selp.f32 %f53, %f243, 0fFF800000, %p78; selp.f32 %f54, %f244, 0fFF800000, %p79; selp.f32 %f55, %f247, 0fFF800000, %p80; selp.f32 %f56, %f248, 0fFF800000, %p81; selp.f32 %f57, %f237, 0fFF800000, %p74; selp.f32 %f58, %f238, 0fFF800000, %p75; selp.f32 %f59, %f241, 0fFF800000, %p76; selp.f32 %f60, %f242, 0fFF800000, %p77; selp.f32 %f61, %f245, 0fFF800000, %p78; selp.f32 %f62, %f246, 0fFF800000, %p79; selp.f32 %f63, %f249, 0fFF800000, %p80; selp.f32 %f64, %f250, 0fFF800000, %p81; setp.gt.ftz.f32 %p82, %f1, %f2; selp.f32 %f251, %f1, %f2, %p82; setp.gt.ftz.f32 %p83, %f251, %f3; selp.f32 %f252, %f251, %f3, %p83; setp.gt.ftz.f32 %p84, %f252, %f4; selp.f32 %f253, %f252, %f4, %p84; setp.gt.ftz.f32 %p85, %f253, %f5; selp.f32 %f254, 
%f253, %f5, %p85; setp.gt.ftz.f32 %p86, %f254, %f6; selp.f32 %f255, %f254, %f6, %p86; setp.gt.ftz.f32 %p87, %f255, %f7; selp.f32 %f256, %f255, %f7, %p87; setp.gt.ftz.f32 %p88, %f256, %f8; selp.f32 %f257, %f256, %f8, %p88; setp.gt.ftz.f32 %p89, %f9, %f10; selp.f32 %f258, %f9, %f10, %p89; setp.gt.ftz.f32 %p90, %f258, %f11; selp.f32 %f259, %f258, %f11, %p90; setp.gt.ftz.f32 %p91, %f259, %f12; selp.f32 %f260, %f259, %f12, %p91; setp.gt.ftz.f32 %p92, %f260, %f13; selp.f32 %f261, %f260, %f13, %p92; setp.gt.ftz.f32 %p93, %f261, %f14; selp.f32 %f262, %f261, %f14, %p93; setp.gt.ftz.f32 %p94, %f262, %f15; selp.f32 %f263, %f262, %f15, %p94; setp.gt.ftz.f32 %p95, %f263, %f16; selp.f32 %f264, %f263, %f16, %p95; setp.gt.ftz.f32 %p96, %f17, %f18; selp.f32 %f265, %f17, %f18, %p96; setp.gt.ftz.f32 %p97, %f265, %f19; selp.f32 %f266, %f265, %f19, %p97; setp.gt.ftz.f32 %p98, %f266, %f20; selp.f32 %f267, %f266, %f20, %p98; setp.gt.ftz.f32 %p99, %f267, %f21; selp.f32 %f268, %f267, %f21, %p99; setp.gt.ftz.f32 %p100, %f268, %f22; selp.f32 %f269, %f268, %f22, %p100; setp.gt.ftz.f32 %p101, %f269, %f23; selp.f32 %f270, %f269, %f23, %p101; setp.gt.ftz.f32 %p102, %f270, %f24; selp.f32 %f271, %f270, %f24, %p102; setp.gt.ftz.f32 %p103, %f25, %f26; selp.f32 %f272, %f25, %f26, %p103; setp.gt.ftz.f32 %p104, %f272, %f27; selp.f32 %f273, %f272, %f27, %p104; setp.gt.ftz.f32 %p105, %f273, %f28; selp.f32 %f274, %f273, %f28, %p105; setp.gt.ftz.f32 %p106, %f274, %f29; selp.f32 %f275, %f274, %f29, %p106; setp.gt.ftz.f32 %p107, %f275, %f30; selp.f32 %f276, %f275, %f30, %p107; setp.gt.ftz.f32 %p108, %f276, %f31; selp.f32 %f277, %f276, %f31, %p108; setp.gt.ftz.f32 %p109, %f277, %f32; selp.f32 %f278, %f277, %f32, %p109; setp.gt.ftz.f32 %p110, %f33, %f34; selp.f32 %f279, %f33, %f34, %p110; setp.gt.ftz.f32 %p111, %f279, %f35; selp.f32 %f280, %f279, %f35, %p111; setp.gt.ftz.f32 %p112, %f280, %f36; selp.f32 %f281, %f280, %f36, %p112; setp.gt.ftz.f32 %p113, %f281, %f37; selp.f32 %f282, %f281, %f37, %p113; 
setp.gt.ftz.f32 %p114, %f282, %f38; selp.f32 %f283, %f282, %f38, %p114; setp.gt.ftz.f32 %p115, %f283, %f39; selp.f32 %f284, %f283, %f39, %p115; setp.gt.ftz.f32 %p116, %f284, %f40; selp.f32 %f285, %f284, %f40, %p116; setp.gt.ftz.f32 %p117, %f41, %f42; selp.f32 %f286, %f41, %f42, %p117; setp.gt.ftz.f32 %p118, %f286, %f43; selp.f32 %f287, %f286, %f43, %p118; setp.gt.ftz.f32 %p119, %f287, %f44; selp.f32 %f288, %f287, %f44, %p119; setp.gt.ftz.f32 %p120, %f288, %f45; selp.f32 %f289, %f288, %f45, %p120; setp.gt.ftz.f32 %p121, %f289, %f46; selp.f32 %f290, %f289, %f46, %p121; setp.gt.ftz.f32 %p122, %f290, %f47; selp.f32 %f291, %f290, %f47, %p122; setp.gt.ftz.f32 %p123, %f291, %f48; selp.f32 %f292, %f291, %f48, %p123; setp.gt.ftz.f32 %p124, %f49, %f50; selp.f32 %f293, %f49, %f50, %p124; setp.gt.ftz.f32 %p125, %f293, %f51; selp.f32 %f294, %f293, %f51, %p125; setp.gt.ftz.f32 %p126, %f294, %f52; selp.f32 %f295, %f294, %f52, %p126; setp.gt.ftz.f32 %p127, %f295, %f53; selp.f32 %f296, %f295, %f53, %p127; setp.gt.ftz.f32 %p128, %f296, %f54; selp.f32 %f297, %f296, %f54, %p128; setp.gt.ftz.f32 %p129, %f297, %f55; selp.f32 %f298, %f297, %f55, %p129; setp.gt.ftz.f32 %p130, %f298, %f56; selp.f32 %f299, %f298, %f56, %p130; setp.gt.ftz.f32 %p131, %f57, %f58; selp.f32 %f300, %f57, %f58, %p131; setp.gt.ftz.f32 %p132, %f300, %f59; selp.f32 %f301, %f300, %f59, %p132; setp.gt.ftz.f32 %p133, %f301, %f60; selp.f32 %f302, %f301, %f60, %p133; setp.gt.ftz.f32 %p134, %f302, %f61; selp.f32 %f303, %f302, %f61, %p134; setp.gt.ftz.f32 %p135, %f303, %f62; selp.f32 %f304, %f303, %f62, %p135; setp.gt.ftz.f32 %p136, %f304, %f63; selp.f32 %f305, %f304, %f63, %p136; setp.gt.ftz.f32 %p137, %f305, %f64; selp.f32 %f306, %f305, %f64, %p137; mov.b32 %r2221, %f257; shfl.sync.bfly.b32 %r2222|%p138, %r2221, %r2077, %r2093, %r2095; mov.b32 %f307, %r2222; setp.gt.ftz.f32 %p139, %f257, %f307; selp.f32 %f65, %f257, %f307, %p139; mov.b32 %r2223, %f65; shfl.sync.bfly.b32 %r75|%p1, %r2223, %r2203, %r2093, %r2095; mov.b32 
%r2224, %f264; shfl.sync.bfly.b32 %r2225|%p140, %r2224, %r2077, %r2093, %r2095; mov.b32 %f308, %r2225; setp.gt.ftz.f32 %p141, %f264, %f308; selp.f32 %f66, %f264, %f308, %p141; mov.b32 %r2226, %f66; shfl.sync.bfly.b32 %r76|%p2, %r2226, %r2203, %r2093, %r2095; mov.b32 %r2227, %f271; shfl.sync.bfly.b32 %r2228|%p142, %r2227, %r2077, %r2093, %r2095; mov.b32 %f309, %r2228; setp.gt.ftz.f32 %p143, %f271, %f309; selp.f32 %f67, %f271, %f309, %p143; mov.b32 %r2229, %f67; shfl.sync.bfly.b32 %r77|%p3, %r2229, %r2203, %r2093, %r2095; mov.b32 %r2230, %f278; shfl.sync.bfly.b32 %r2231|%p144, %r2230, %r2077, %r2093, %r2095; mov.b32 %f310, %r2231; setp.gt.ftz.f32 %p145, %f278, %f310; selp.f32 %f68, %f278, %f310, %p145; mov.b32 %r2232, %f68; shfl.sync.bfly.b32 %r78|%p4, %r2232, %r2203, %r2093, %r2095; mov.b32 %r2233, %f285; shfl.sync.bfly.b32 %r2234|%p146, %r2233, %r2077, %r2093, %r2095; mov.b32 %f311, %r2234; setp.gt.ftz.f32 %p147, %f285, %f311; selp.f32 %f69, %f285, %f311, %p147; mov.b32 %r2235, %f69; shfl.sync.bfly.b32 %r79|%p5, %r2235, %r2203, %r2093, %r2095; mov.b32 %r2236, %f292; shfl.sync.bfly.b32 %r2237|%p148, %r2236, %r2077, %r2093, %r2095; mov.b32 %f312, %r2237; setp.gt.ftz.f32 %p149, %f292, %f312; selp.f32 %f70, %f292, %f312, %p149; mov.b32 %r2238, %f70; shfl.sync.bfly.b32 %r80|%p6, %r2238, %r2203, %r2093, %r2095; mov.b32 %r2239, %f299; shfl.sync.bfly.b32 %r2240|%p150, %r2239, %r2077, %r2093, %r2095; mov.b32 %f313, %r2240; setp.gt.ftz.f32 %p151, %f299, %f313; selp.f32 %f71, %f299, %f313, %p151; mov.b32 %r2241, %f71; shfl.sync.bfly.b32 %r81|%p7, %r2241, %r2203, %r2093, %r2095; mov.b32 %r2242, %f306; shfl.sync.bfly.b32 %r2243|%p152, %r2242, %r2077, %r2093, %r2095; mov.b32 %f314, %r2243; setp.gt.ftz.f32 %p153, %f306, %f314; selp.f32 %f72, %f306, %f314, %p153; mov.b32 %r2244, %f72; shfl.sync.bfly.b32 %r82|%p8, %r2244, %r2203, %r2093, %r2095; and.b32 %r2245, %r2063, 3; setp.ne.s32 %p154, %r2245, 0; @%p154 bra $L__BB0_3; mov.b32 %f315, %r75; mov.b32 %f316, %r76; mov.b32 %f317, 
%r77; mov.b32 %f318, %r78; mov.b32 %f319, %r79; mov.b32 %f320, %r80; mov.b32 %f321, %r81; mov.b32 %f322, %r82; setp.gt.ftz.f32 %p155, %f65, %f315; selp.f32 %f323, %f65, %f315, %p155; st.shared.f32 [%r74], %f323; setp.gt.ftz.f32 %p156, %f66, %f316; selp.f32 %f324, %f66, %f316, %p156; st.shared.f32 [%r74+128], %f324; setp.gt.ftz.f32 %p157, %f67, %f317; selp.f32 %f325, %f67, %f317, %p157; st.shared.f32 [%r74+256], %f325; setp.gt.ftz.f32 %p158, %f68, %f318; selp.f32 %f326, %f68, %f318, %p158; st.shared.f32 [%r74+384], %f326; setp.gt.ftz.f32 %p159, %f69, %f319; selp.f32 %f327, %f69, %f319, %p159; st.shared.f32 [%r74+512], %f327; setp.gt.ftz.f32 %p160, %f70, %f320; selp.f32 %f328, %f70, %f320, %p160; st.shared.f32 [%r74+640], %f328; setp.gt.ftz.f32 %p161, %f71, %f321; selp.f32 %f329, %f71, %f321, %p161; st.shared.f32 [%r74+768], %f329; setp.gt.ftz.f32 %p162, %f72, %f322; selp.f32 %f330, %f72, %f322, %p162; st.shared.f32 [%r74+896], %f330; $L__BB0_3: mov.u32 %r4522, %tid.x; bar.sync 0; setp.gt.s32 %p163, %r4522, 63; @%p163 bra $L__BB0_5; mov.u32 %r4563, %tid.x; shl.b32 %r4562, %r4563, 4; mov.u32 %r4561, _ZN25fused_multihead_attention5smem_E; add.s32 %r2248, %r4561, %r4562; ld.shared.v4.f32 {%f648, %f653, %f650, %f655}, [%r2248+32768]; $L__BB0_5: setp.gt.ftz.f32 %p165, %f648, %f653; selp.f32 %f336, %f648, %f653, %p165; setp.gt.ftz.f32 %p166, %f650, %f655; selp.f32 %f654, %f650, %f655, %p166; setp.gt.ftz.f32 %p167, %f336, %f654; selp.f32 %f652, %f336, %f654, %p167; bar.sync 0; @%p163 bra $L__BB0_7; mov.u32 %r4560, _ZN25fused_multihead_attention5smem_E; mov.u32 %r4559, %tid.x; shl.b32 %r2249, %r4559, 2; add.s32 %r2251, %r4560, %r2249; st.shared.f32 [%r2251+32768], %f652; $L__BB0_7: mov.u32 %r4525, %tid.x; and.b32 %r4524, %r4525, 3; setp.ne.s32 %p204, %r4524, 0; mov.u32 %r4523, _ZN25fused_multihead_attention5smem_E; shl.b32 %r2252, %r73, 2; add.s32 %r2254, %r4523, %r2252; mov.u32 %r2255, 2; bar.sync 0; ld.shared.f32 %f337, [%r2254+32768]; ld.shared.f32 %f338, [%r2254+32800]; 
ld.shared.f32 %f339, [%r2254+32832]; ld.shared.f32 %f340, [%r2254+32864]; ld.shared.f32 %f341, [%r2254+32896]; ld.shared.f32 %f342, [%r2254+32928]; ld.shared.f32 %f343, [%r2254+32960]; ld.shared.f32 %f344, [%r2254+32992]; bar.sync 0; bar.sync 0; sub.ftz.f32 %f345, %f1, %f337; mul.ftz.f32 %f346, %f345, 0f3FB8AA3B; ex2.approx.ftz.f32 %f83, %f346; sub.ftz.f32 %f347, %f2, %f337; mul.ftz.f32 %f348, %f347, 0f3FB8AA3B; ex2.approx.ftz.f32 %f84, %f348; sub.ftz.f32 %f349, %f3, %f337; mul.ftz.f32 %f350, %f349, 0f3FB8AA3B; ex2.approx.ftz.f32 %f85, %f350; sub.ftz.f32 %f351, %f4, %f337; mul.ftz.f32 %f352, %f351, 0f3FB8AA3B; ex2.approx.ftz.f32 %f86, %f352; sub.ftz.f32 %f353, %f5, %f337; mul.ftz.f32 %f354, %f353, 0f3FB8AA3B; ex2.approx.ftz.f32 %f87, %f354; sub.ftz.f32 %f355, %f6, %f337; mul.ftz.f32 %f356, %f355, 0f3FB8AA3B; ex2.approx.ftz.f32 %f88, %f356; sub.ftz.f32 %f357, %f7, %f337; mul.ftz.f32 %f358, %f357, 0f3FB8AA3B; ex2.approx.ftz.f32 %f89, %f358; sub.ftz.f32 %f359, %f8, %f337; mul.ftz.f32 %f360, %f359, 0f3FB8AA3B; ex2.approx.ftz.f32 %f90, %f360; sub.ftz.f32 %f361, %f9, %f338; mul.ftz.f32 %f362, %f361, 0f3FB8AA3B; ex2.approx.ftz.f32 %f91, %f362; sub.ftz.f32 %f363, %f10, %f338; mul.ftz.f32 %f364, %f363, 0f3FB8AA3B; ex2.approx.ftz.f32 %f92, %f364; sub.ftz.f32 %f365, %f11, %f338; mul.ftz.f32 %f366, %f365, 0f3FB8AA3B; ex2.approx.ftz.f32 %f93, %f366; sub.ftz.f32 %f367, %f12, %f338; mul.ftz.f32 %f368, %f367, 0f3FB8AA3B; ex2.approx.ftz.f32 %f94, %f368; sub.ftz.f32 %f369, %f13, %f338; mul.ftz.f32 %f370, %f369, 0f3FB8AA3B; ex2.approx.ftz.f32 %f95, %f370; sub.ftz.f32 %f371, %f14, %f338; mul.ftz.f32 %f372, %f371, 0f3FB8AA3B; ex2.approx.ftz.f32 %f96, %f372; sub.ftz.f32 %f373, %f15, %f338; mul.ftz.f32 %f374, %f373, 0f3FB8AA3B; ex2.approx.ftz.f32 %f97, %f374; sub.ftz.f32 %f375, %f16, %f338; mul.ftz.f32 %f376, %f375, 0f3FB8AA3B; ex2.approx.ftz.f32 %f98, %f376; sub.ftz.f32 %f377, %f17, %f339; mul.ftz.f32 %f378, %f377, 0f3FB8AA3B; ex2.approx.ftz.f32 %f99, %f378; sub.ftz.f32 %f379, %f18, 
%f339; mul.ftz.f32 %f380, %f379, 0f3FB8AA3B; ex2.approx.ftz.f32 %f100, %f380; sub.ftz.f32 %f381, %f19, %f339; mul.ftz.f32 %f382, %f381, 0f3FB8AA3B; ex2.approx.ftz.f32 %f101, %f382; sub.ftz.f32 %f383, %f20, %f339; mul.ftz.f32 %f384, %f383, 0f3FB8AA3B; ex2.approx.ftz.f32 %f102, %f384; sub.ftz.f32 %f385, %f21, %f339; mul.ftz.f32 %f386, %f385, 0f3FB8AA3B; ex2.approx.ftz.f32 %f103, %f386; sub.ftz.f32 %f387, %f22, %f339; mul.ftz.f32 %f388, %f387, 0f3FB8AA3B; ex2.approx.ftz.f32 %f104, %f388; sub.ftz.f32 %f389, %f23, %f339; mul.ftz.f32 %f390, %f389, 0f3FB8AA3B; ex2.approx.ftz.f32 %f105, %f390; sub.ftz.f32 %f391, %f24, %f339; mul.ftz.f32 %f392, %f391, 0f3FB8AA3B; ex2.approx.ftz.f32 %f106, %f392; sub.ftz.f32 %f393, %f25, %f340; mul.ftz.f32 %f394, %f393, 0f3FB8AA3B; ex2.approx.ftz.f32 %f107, %f394; sub.ftz.f32 %f395, %f26, %f340; mul.ftz.f32 %f396, %f395, 0f3FB8AA3B; ex2.approx.ftz.f32 %f108, %f396; sub.ftz.f32 %f397, %f27, %f340; mul.ftz.f32 %f398, %f397, 0f3FB8AA3B; ex2.approx.ftz.f32 %f109, %f398; sub.ftz.f32 %f399, %f28, %f340; mul.ftz.f32 %f400, %f399, 0f3FB8AA3B; ex2.approx.ftz.f32 %f110, %f400; sub.ftz.f32 %f401, %f29, %f340; mul.ftz.f32 %f402, %f401, 0f3FB8AA3B; ex2.approx.ftz.f32 %f111, %f402; sub.ftz.f32 %f403, %f30, %f340; mul.ftz.f32 %f404, %f403, 0f3FB8AA3B; ex2.approx.ftz.f32 %f112, %f404; sub.ftz.f32 %f405, %f31, %f340; mul.ftz.f32 %f406, %f405, 0f3FB8AA3B; ex2.approx.ftz.f32 %f113, %f406; sub.ftz.f32 %f407, %f32, %f340; mul.ftz.f32 %f408, %f407, 0f3FB8AA3B; ex2.approx.ftz.f32 %f114, %f408; sub.ftz.f32 %f409, %f33, %f341; mul.ftz.f32 %f410, %f409, 0f3FB8AA3B; ex2.approx.ftz.f32 %f115, %f410; sub.ftz.f32 %f411, %f34, %f341; mul.ftz.f32 %f412, %f411, 0f3FB8AA3B; ex2.approx.ftz.f32 %f116, %f412; sub.ftz.f32 %f413, %f35, %f341; mul.ftz.f32 %f414, %f413, 0f3FB8AA3B; ex2.approx.ftz.f32 %f117, %f414; sub.ftz.f32 %f415, %f36, %f341; mul.ftz.f32 %f416, %f415, 0f3FB8AA3B; ex2.approx.ftz.f32 %f118, %f416; sub.ftz.f32 %f417, %f37, %f341; mul.ftz.f32 %f418, %f417, 
0f3FB8AA3B; ex2.approx.ftz.f32 %f119, %f418; sub.ftz.f32 %f419, %f38, %f341; mul.ftz.f32 %f420, %f419, 0f3FB8AA3B; ex2.approx.ftz.f32 %f120, %f420; sub.ftz.f32 %f421, %f39, %f341; mul.ftz.f32 %f422, %f421, 0f3FB8AA3B; ex2.approx.ftz.f32 %f121, %f422; sub.ftz.f32 %f423, %f40, %f341; mul.ftz.f32 %f424, %f423, 0f3FB8AA3B; ex2.approx.ftz.f32 %f122, %f424; sub.ftz.f32 %f425, %f41, %f342; mul.ftz.f32 %f426, %f425, 0f3FB8AA3B; ex2.approx.ftz.f32 %f123, %f426; sub.ftz.f32 %f427, %f42, %f342; mul.ftz.f32 %f428, %f427, 0f3FB8AA3B; ex2.approx.ftz.f32 %f124, %f428; sub.ftz.f32 %f429, %f43, %f342; mul.ftz.f32 %f430, %f429, 0f3FB8AA3B; ex2.approx.ftz.f32 %f125, %f430; sub.ftz.f32 %f431, %f44, %f342; mul.ftz.f32 %f432, %f431, 0f3FB8AA3B; ex2.approx.ftz.f32 %f126, %f432; sub.ftz.f32 %f433, %f45, %f342; mul.ftz.f32 %f434, %f433, 0f3FB8AA3B; ex2.approx.ftz.f32 %f127, %f434; sub.ftz.f32 %f435, %f46, %f342; mul.ftz.f32 %f436, %f435, 0f3FB8AA3B; ex2.approx.ftz.f32 %f128, %f436; sub.ftz.f32 %f437, %f47, %f342; mul.ftz.f32 %f438, %f437, 0f3FB8AA3B; ex2.approx.ftz.f32 %f129, %f438; sub.ftz.f32 %f439, %f48, %f342; mul.ftz.f32 %f440, %f439, 0f3FB8AA3B; ex2.approx.ftz.f32 %f130, %f440; sub.ftz.f32 %f441, %f49, %f343; mul.ftz.f32 %f442, %f441, 0f3FB8AA3B; ex2.approx.ftz.f32 %f131, %f442; sub.ftz.f32 %f443, %f50, %f343; mul.ftz.f32 %f444, %f443, 0f3FB8AA3B; ex2.approx.ftz.f32 %f132, %f444; sub.ftz.f32 %f445, %f51, %f343; mul.ftz.f32 %f446, %f445, 0f3FB8AA3B; ex2.approx.ftz.f32 %f133, %f446; sub.ftz.f32 %f447, %f52, %f343; mul.ftz.f32 %f448, %f447, 0f3FB8AA3B; ex2.approx.ftz.f32 %f134, %f448; sub.ftz.f32 %f449, %f53, %f343; mul.ftz.f32 %f450, %f449, 0f3FB8AA3B; ex2.approx.ftz.f32 %f135, %f450; sub.ftz.f32 %f451, %f54, %f343; mul.ftz.f32 %f452, %f451, 0f3FB8AA3B; ex2.approx.ftz.f32 %f136, %f452; sub.ftz.f32 %f453, %f55, %f343; mul.ftz.f32 %f454, %f453, 0f3FB8AA3B; ex2.approx.ftz.f32 %f137, %f454; sub.ftz.f32 %f455, %f56, %f343; mul.ftz.f32 %f456, %f455, 0f3FB8AA3B; ex2.approx.ftz.f32 %f138, 
%f456; sub.ftz.f32 %f457, %f57, %f344; mul.ftz.f32 %f458, %f457, 0f3FB8AA3B; ex2.approx.ftz.f32 %f139, %f458; sub.ftz.f32 %f459, %f58, %f344; mul.ftz.f32 %f460, %f459, 0f3FB8AA3B; ex2.approx.ftz.f32 %f140, %f460; sub.ftz.f32 %f461, %f59, %f344; mul.ftz.f32 %f462, %f461, 0f3FB8AA3B; ex2.approx.ftz.f32 %f141, %f462; sub.ftz.f32 %f463, %f60, %f344; mul.ftz.f32 %f464, %f463, 0f3FB8AA3B; ex2.approx.ftz.f32 %f142, %f464; sub.ftz.f32 %f465, %f61, %f344; mul.ftz.f32 %f466, %f465, 0f3FB8AA3B; ex2.approx.ftz.f32 %f143, %f466; sub.ftz.f32 %f467, %f62, %f344; mul.ftz.f32 %f468, %f467, 0f3FB8AA3B; ex2.approx.ftz.f32 %f144, %f468; sub.ftz.f32 %f469, %f63, %f344; mul.ftz.f32 %f470, %f469, 0f3FB8AA3B; ex2.approx.ftz.f32 %f145, %f470; sub.ftz.f32 %f471, %f64, %f344; mul.ftz.f32 %f472, %f471, 0f3FB8AA3B; ex2.approx.ftz.f32 %f146, %f472; add.ftz.f32 %f473, %f83, %f84; add.ftz.f32 %f474, %f473, 0f00000000; add.ftz.f32 %f475, %f85, %f86; add.ftz.f32 %f476, %f475, 0f00000000; add.ftz.f32 %f477, %f87, %f88; add.ftz.f32 %f478, %f474, %f477; add.ftz.f32 %f479, %f89, %f90; add.ftz.f32 %f480, %f476, %f479; add.ftz.f32 %f481, %f478, %f480; add.ftz.f32 %f482, %f91, %f92; add.ftz.f32 %f483, %f482, 0f00000000; add.ftz.f32 %f484, %f93, %f94; add.ftz.f32 %f485, %f484, 0f00000000; add.ftz.f32 %f486, %f95, %f96; add.ftz.f32 %f487, %f483, %f486; add.ftz.f32 %f488, %f97, %f98; add.ftz.f32 %f489, %f485, %f488; add.ftz.f32 %f490, %f487, %f489; add.ftz.f32 %f491, %f99, %f100; add.ftz.f32 %f492, %f491, 0f00000000; add.ftz.f32 %f493, %f101, %f102; add.ftz.f32 %f494, %f493, 0f00000000; add.ftz.f32 %f495, %f103, %f104; add.ftz.f32 %f496, %f492, %f495; add.ftz.f32 %f497, %f105, %f106; add.ftz.f32 %f498, %f494, %f497; add.ftz.f32 %f499, %f496, %f498; add.ftz.f32 %f500, %f107, %f108; add.ftz.f32 %f501, %f500, 0f00000000; add.ftz.f32 %f502, %f109, %f110; add.ftz.f32 %f503, %f502, 0f00000000; add.ftz.f32 %f504, %f111, %f112; add.ftz.f32 %f505, %f501, %f504; add.ftz.f32 %f506, %f113, %f114; add.ftz.f32 %f507, 
%f503, %f506; add.ftz.f32 %f508, %f505, %f507; add.ftz.f32 %f509, %f115, %f116; add.ftz.f32 %f510, %f509, 0f00000000; add.ftz.f32 %f511, %f117, %f118; add.ftz.f32 %f512, %f511, 0f00000000; add.ftz.f32 %f513, %f119, %f120; add.ftz.f32 %f514, %f510, %f513; add.ftz.f32 %f515, %f121, %f122; add.ftz.f32 %f516, %f512, %f515; add.ftz.f32 %f517, %f514, %f516; add.ftz.f32 %f518, %f123, %f124; add.ftz.f32 %f519, %f518, 0f00000000; add.ftz.f32 %f520, %f125, %f126; add.ftz.f32 %f521, %f520, 0f00000000; add.ftz.f32 %f522, %f127, %f128; add.ftz.f32 %f523, %f519, %f522; add.ftz.f32 %f524, %f129, %f130; add.ftz.f32 %f525, %f521, %f524; add.ftz.f32 %f526, %f523, %f525; add.ftz.f32 %f527, %f131, %f132; add.ftz.f32 %f528, %f527, 0f00000000; add.ftz.f32 %f529, %f133, %f134; add.ftz.f32 %f530, %f529, 0f00000000; add.ftz.f32 %f531, %f135, %f136; add.ftz.f32 %f532, %f528, %f531; add.ftz.f32 %f533, %f137, %f138; add.ftz.f32 %f534, %f530, %f533; add.ftz.f32 %f535, %f532, %f534; add.ftz.f32 %f536, %f139, %f140; add.ftz.f32 %f537, %f536, 0f00000000; add.ftz.f32 %f538, %f141, %f142; add.ftz.f32 %f539, %f538, 0f00000000; add.ftz.f32 %f540, %f143, %f144; add.ftz.f32 %f541, %f537, %f540; add.ftz.f32 %f542, %f145, %f146; add.ftz.f32 %f543, %f539, %f542; add.ftz.f32 %f544, %f541, %f543; mov.b32 %r2258, %f481; mov.u32 %r2259, 31; mov.u32 %r2260, 1; mov.u32 %r2261, -1; shfl.sync.bfly.b32 %r2262|%p169, %r2258, %r2260, %r2259, %r2261; mov.b32 %f545, %r2262; add.ftz.f32 %f147, %f481, %f545; mov.b32 %r2263, %f147; shfl.sync.bfly.b32 %r86|%p9, %r2263, %r2255, %r2259, %r2261; mov.b32 %r2264, %f490; shfl.sync.bfly.b32 %r2265|%p170, %r2264, %r2260, %r2259, %r2261; mov.b32 %f546, %r2265; add.ftz.f32 %f148, %f490, %f546; mov.b32 %r2266, %f148; shfl.sync.bfly.b32 %r87|%p10, %r2266, %r2255, %r2259, %r2261; mov.b32 %r2267, %f499; shfl.sync.bfly.b32 %r2268|%p171, %r2267, %r2260, %r2259, %r2261; mov.b32 %f547, %r2268; add.ftz.f32 %f149, %f499, %f547; mov.b32 %r2269, %f149; shfl.sync.bfly.b32 %r88|%p11, %r2269, 
%r2255, %r2259, %r2261; mov.b32 %r2270, %f508; shfl.sync.bfly.b32 %r2271|%p172, %r2270, %r2260, %r2259, %r2261; mov.b32 %f548, %r2271; add.ftz.f32 %f150, %f508, %f548; mov.b32 %r2272, %f150; shfl.sync.bfly.b32 %r89|%p12, %r2272, %r2255, %r2259, %r2261; mov.b32 %r2273, %f517; shfl.sync.bfly.b32 %r2274|%p173, %r2273, %r2260, %r2259, %r2261; mov.b32 %f549, %r2274; add.ftz.f32 %f151, %f517, %f549; mov.b32 %r2275, %f151; shfl.sync.bfly.b32 %r90|%p13, %r2275, %r2255, %r2259, %r2261; mov.b32 %r2276, %f526; shfl.sync.bfly.b32 %r2277|%p174, %r2276, %r2260, %r2259, %r2261; mov.b32 %f550, %r2277; add.ftz.f32 %f152, %f526, %f550; mov.b32 %r2278, %f152; shfl.sync.bfly.b32 %r91|%p14, %r2278, %r2255, %r2259, %r2261; mov.b32 %r2279, %f535; shfl.sync.bfly.b32 %r2280|%p175, %r2279, %r2260, %r2259, %r2261; mov.b32 %f551, %r2280; add.ftz.f32 %f153, %f535, %f551; mov.b32 %r2281, %f153; shfl.sync.bfly.b32 %r92|%p15, %r2281, %r2255, %r2259, %r2261; mov.b32 %r2282, %f544; shfl.sync.bfly.b32 %r2283|%p176, %r2282, %r2260, %r2259, %r2261; mov.b32 %f552, %r2283; add.ftz.f32 %f154, %f544, %f552; mov.b32 %r2284, %f154; shfl.sync.bfly.b32 %r93|%p16, %r2284, %r2255, %r2259, %r2261; @%p204 bra $L__BB0_9; mov.b32 %f553, %r86; add.ftz.f32 %f554, %f147, %f553; st.shared.f32 [%r74], %f554; mov.b32 %f555, %r87; add.ftz.f32 %f556, %f148, %f555; st.shared.f32 [%r74+128], %f556; mov.b32 %f557, %r88; add.ftz.f32 %f558, %f149, %f557; st.shared.f32 [%r74+256], %f558; mov.b32 %f559, %r89; add.ftz.f32 %f560, %f150, %f559; st.shared.f32 [%r74+384], %f560; mov.b32 %f561, %r90; add.ftz.f32 %f562, %f151, %f561; st.shared.f32 [%r74+512], %f562; mov.b32 %f563, %r91; add.ftz.f32 %f564, %f152, %f563; st.shared.f32 [%r74+640], %f564; mov.b32 %f565, %r92; add.ftz.f32 %f566, %f153, %f565; st.shared.f32 [%r74+768], %f566; mov.b32 %f567, %r93; add.ftz.f32 %f568, %f154, %f567; st.shared.f32 [%r74+896], %f568; $L__BB0_9: bar.sync 0; @%p163 bra $L__BB0_11; mov.u32 %r4558, %tid.x; shl.b32 %r4557, %r4558, 4; mov.u32 %r4556, 
_ZN25fused_multihead_attention5smem_E; add.s32 %r2289, %r4556, %r4557; ld.shared.v4.f32 {%f652, %f653, %f654, %f655}, [%r2289+32768]; $L__BB0_11: bar.sync 0; @%p163 bra $L__BB0_13; mov.u32 %r4555, _ZN25fused_multihead_attention5smem_E; mov.u32 %r4554, %tid.x; shl.b32 %r2292, %r4554, 2; add.s32 %r2294, %r4555, %r2292; add.ftz.f32 %f573, %f652, %f653; add.ftz.f32 %f574, %f654, %f655; add.ftz.f32 %f575, %f573, %f574; st.shared.f32 [%r2294+32768], %f575; $L__BB0_13: bar.sync 0; add.s32 %r4514, %r2254, 32768; ld.shared.f32 %f163, [%r4514]; add.s32 %r4515, %r2254, 32768; ld.shared.f32 %f164, [%r4515+32]; add.s32 %r4516, %r2254, 32768; ld.shared.f32 %f165, [%r4516+64]; add.s32 %r4517, %r2254, 32768; ld.shared.f32 %f166, [%r4517+96]; add.s32 %r4518, %r2254, 32768; ld.shared.f32 %f167, [%r4518+128]; add.s32 %r4519, %r2254, 32768; ld.shared.f32 %f168, [%r4519+160]; add.s32 %r4520, %r2254, 32768; ld.shared.f32 %f169, [%r4520+192]; add.s32 %r4521, %r2254, 32768; ld.shared.f32 %f170, [%r4521+224]; bar.sync 0; setp.equ.ftz.f32 %p179, %f163, 0f00000000; mov.f32 %f657, 0f3F800000; mov.f32 %f656, %f657; @%p179 bra $L__BB0_15; rcp.approx.ftz.f32 %f656, %f163; $L__BB0_15: setp.equ.ftz.f32 %p180, %f164, 0f00000000; @%p180 bra $L__BB0_17; rcp.approx.ftz.f32 %f657, %f164; $L__BB0_17: setp.equ.ftz.f32 %p181, %f165, 0f00000000; mov.f32 %f659, 0f3F800000; mov.f32 %f658, %f659; @%p181 bra $L__BB0_19; rcp.approx.ftz.f32 %f658, %f165; $L__BB0_19: setp.equ.ftz.f32 %p182, %f166, 0f00000000; @%p182 bra $L__BB0_21; rcp.approx.ftz.f32 %f659, %f166; $L__BB0_21: setp.equ.ftz.f32 %p183, %f167, 0f00000000; mov.f32 %f661, 0f3F800000; mov.f32 %f660, %f661; @%p183 bra $L__BB0_23; rcp.approx.ftz.f32 %f660, %f167; $L__BB0_23: setp.equ.ftz.f32 %p184, %f168, 0f00000000; @%p184 bra $L__BB0_25; rcp.approx.ftz.f32 %f661, %f168; $L__BB0_25: setp.equ.ftz.f32 %p185, %f169, 0f00000000; mov.f32 %f663, 0f3F800000; mov.f32 %f662, %f663; @%p185 bra $L__BB0_27; rcp.approx.ftz.f32 %f662, %f169; $L__BB0_27: 
setp.equ.ftz.f32 %p186, %f170, 0f00000000; @%p186 bra $L__BB0_29; rcp.approx.ftz.f32 %f663, %f170; $L__BB0_29: mov.u32 %r4535, %tid.x; mov.b64 %rd116, fmha_mhca_fp16_128_128_sm86_kernel_nl_param_0; mov.u64 %rd115, %rd116; ld.param.u32 %r4534, [%rd115+60]; shr.s32 %r4533, %r4535, 31; shr.u32 %r4532, %r4533, 28; add.s32 %r4531, %r4535, %r4532; shr.s32 %r4530, %r4531, 4; mov.u32 %r4529, _ZN25fused_multihead_attention5smem_E; add.s32 %r4528, %r4529, 32768; shl.b32 %r4527, %r4535, 8; ld.param.u32 %r4526, [fmha_mhca_fp16_128_128_sm86_kernel_nl_param_0+200]; mul.ftz.f32 %f584, %f656, %f84; mul.ftz.f32 %f585, %f656, %f83; // begin inline asm cvt.rn.f16x2.f32 %r2295, %f584, %f585; // end inline asm mul.ftz.f32 %f586, %f657, %f92; mul.ftz.f32 %f587, %f657, %f91; // begin inline asm cvt.rn.f16x2.f32 %r2296, %f586, %f587; // end inline asm mul.ftz.f32 %f588, %f656, %f86; mul.ftz.f32 %f589, %f656, %f85; // begin inline asm cvt.rn.f16x2.f32 %r2297, %f588, %f589; // end inline asm mul.ftz.f32 %f590, %f657, %f94; mul.ftz.f32 %f591, %f657, %f93; // begin inline asm cvt.rn.f16x2.f32 %r2298, %f590, %f591; // end inline asm mul.ftz.f32 %f592, %f656, %f88; mul.ftz.f32 %f593, %f656, %f87; // begin inline asm cvt.rn.f16x2.f32 %r2299, %f592, %f593; // end inline asm mul.ftz.f32 %f594, %f657, %f96; mul.ftz.f32 %f595, %f657, %f95; // begin inline asm cvt.rn.f16x2.f32 %r2300, %f594, %f595; // end inline asm mul.ftz.f32 %f596, %f656, %f90; mul.ftz.f32 %f597, %f656, %f89; // begin inline asm cvt.rn.f16x2.f32 %r2301, %f596, %f597; // end inline asm mul.ftz.f32 %f598, %f657, %f98; mul.ftz.f32 %f599, %f657, %f97; // begin inline asm cvt.rn.f16x2.f32 %r2302, %f598, %f599; // end inline asm mul.ftz.f32 %f600, %f658, %f100; mul.ftz.f32 %f601, %f658, %f99; // begin inline asm cvt.rn.f16x2.f32 %r2303, %f600, %f601; // end inline asm mul.ftz.f32 %f602, %f659, %f108; mul.ftz.f32 %f603, %f659, %f107; // begin inline asm cvt.rn.f16x2.f32 %r2304, %f602, %f603; // end inline asm mul.ftz.f32 %f604, %f658, 
%f102; mul.ftz.f32 %f605, %f658, %f101; // begin inline asm cvt.rn.f16x2.f32 %r2305, %f604, %f605; // end inline asm mul.ftz.f32 %f606, %f659, %f110; mul.ftz.f32 %f607, %f659, %f109; // begin inline asm cvt.rn.f16x2.f32 %r2306, %f606, %f607; // end inline asm mul.ftz.f32 %f608, %f658, %f104; mul.ftz.f32 %f609, %f658, %f103; // begin inline asm cvt.rn.f16x2.f32 %r2307, %f608, %f609; // end inline asm mul.ftz.f32 %f610, %f659, %f112; mul.ftz.f32 %f611, %f659, %f111; // begin inline asm cvt.rn.f16x2.f32 %r2308, %f610, %f611; // end inline asm mul.ftz.f32 %f612, %f658, %f106; mul.ftz.f32 %f613, %f658, %f105; // begin inline asm cvt.rn.f16x2.f32 %r2309, %f612, %f613; // end inline asm mul.ftz.f32 %f614, %f659, %f114; mul.ftz.f32 %f615, %f659, %f113; // begin inline asm cvt.rn.f16x2.f32 %r2310, %f614, %f615; // end inline asm mul.ftz.f32 %f616, %f660, %f116; mul.ftz.f32 %f617, %f660, %f115; // begin inline asm cvt.rn.f16x2.f32 %r2311, %f616, %f617; // end inline asm mul.ftz.f32 %f618, %f661, %f124; mul.ftz.f32 %f619, %f661, %f123; // begin inline asm cvt.rn.f16x2.f32 %r2312, %f618, %f619; // end inline asm mul.ftz.f32 %f620, %f660, %f118; mul.ftz.f32 %f621, %f660, %f117; // begin inline asm cvt.rn.f16x2.f32 %r2313, %f620, %f621; // end inline asm mul.ftz.f32 %f622, %f661, %f126; mul.ftz.f32 %f623, %f661, %f125; // begin inline asm cvt.rn.f16x2.f32 %r2314, %f622, %f623; // end inline asm mul.ftz.f32 %f624, %f660, %f120; mul.ftz.f32 %f625, %f660, %f119; // begin inline asm cvt.rn.f16x2.f32 %r2315, %f624, %f625; // end inline asm mul.ftz.f32 %f626, %f661, %f128; mul.ftz.f32 %f627, %f661, %f127; // begin inline asm cvt.rn.f16x2.f32 %r2316, %f626, %f627; // end inline asm mul.ftz.f32 %f628, %f660, %f122; mul.ftz.f32 %f629, %f660, %f121; // begin inline asm cvt.rn.f16x2.f32 %r2317, %f628, %f629; // end inline asm mul.ftz.f32 %f630, %f661, %f130; mul.ftz.f32 %f631, %f661, %f129; // begin inline asm cvt.rn.f16x2.f32 %r2318, %f630, %f631; // end inline asm mul.ftz.f32 %f632, 
%f662, %f132; mul.ftz.f32 %f633, %f662, %f131; // begin inline asm cvt.rn.f16x2.f32 %r2319, %f632, %f633; // end inline asm mul.ftz.f32 %f634, %f663, %f140; mul.ftz.f32 %f635, %f663, %f139; // begin inline asm cvt.rn.f16x2.f32 %r2320, %f634, %f635; // end inline asm mul.ftz.f32 %f636, %f662, %f134; mul.ftz.f32 %f637, %f662, %f133; // begin inline asm cvt.rn.f16x2.f32 %r2321, %f636, %f637; // end inline asm mul.ftz.f32 %f638, %f663, %f142; mul.ftz.f32 %f639, %f663, %f141; // begin inline asm cvt.rn.f16x2.f32 %r2322, %f638, %f639; // end inline asm mul.ftz.f32 %f640, %f662, %f136; mul.ftz.f32 %f641, %f662, %f135; // begin inline asm cvt.rn.f16x2.f32 %r2323, %f640, %f641; // end inline asm mul.ftz.f32 %f642, %f663, %f144; mul.ftz.f32 %f643, %f663, %f143; // begin inline asm cvt.rn.f16x2.f32 %r2324, %f642, %f643; // end inline asm mul.ftz.f32 %f644, %f662, %f138; mul.ftz.f32 %f645, %f662, %f137; // begin inline asm cvt.rn.f16x2.f32 %r2325, %f644, %f645; // end inline asm mul.ftz.f32 %f646, %f663, %f146; mul.ftz.f32 %f647, %f663, %f145; // begin inline asm cvt.rn.f16x2.f32 %r2326, %f646, %f647; // end inline asm // begin inline asm mov.u32 %r3095, 0; // end inline asm // begin inline asm mov.u32 %r3096, 0; // end inline asm // begin inline asm mov.u32 %r3105, 0; // end inline asm // begin inline asm mov.u32 %r3106, 0; // end inline asm // begin inline asm mov.u32 %r3115, 0; // end inline asm // begin inline asm mov.u32 %r3116, 0; // end inline asm // begin inline asm mov.u32 %r3125, 0; // end inline asm // begin inline asm mov.u32 %r3126, 0; // end inline asm // begin inline asm mov.u32 %r3135, 0; // end inline asm // begin inline asm mov.u32 %r3136, 0; // end inline asm // begin inline asm mov.u32 %r3145, 0; // end inline asm // begin inline asm mov.u32 %r3146, 0; // end inline asm // begin inline asm mov.u32 %r3155, 0; // end inline asm // begin inline asm mov.u32 %r3156, 0; // end inline asm // begin inline asm mov.u32 %r3165, 0; // end inline asm // begin inline asm 
mov.u32 %r3166, 0; // end inline asm // begin inline asm mov.u32 %r3175, 0; // end inline asm // begin inline asm mov.u32 %r3176, 0; // end inline asm // begin inline asm mov.u32 %r3185, 0; // end inline asm // begin inline asm mov.u32 %r3186, 0; // end inline asm // begin inline asm mov.u32 %r3195, 0; // end inline asm // begin inline asm mov.u32 %r3196, 0; // end inline asm // begin inline asm mov.u32 %r3205, 0; // end inline asm // begin inline asm mov.u32 %r3206, 0; // end inline asm // begin inline asm mov.u32 %r3215, 0; // end inline asm // begin inline asm mov.u32 %r3216, 0; // end inline asm // begin inline asm mov.u32 %r3225, 0; // end inline asm // begin inline asm mov.u32 %r3226, 0; // end inline asm // begin inline asm mov.u32 %r3235, 0; // end inline asm // begin inline asm mov.u32 %r3236, 0; // end inline asm // begin inline asm mov.u32 %r3245, 0; // end inline asm // begin inline asm mov.u32 %r3246, 0; // end inline asm // begin inline asm mov.u32 %r3255, 0; // end inline asm // begin inline asm mov.u32 %r3256, 0; // end inline asm // begin inline asm mov.u32 %r3265, 0; // end inline asm // begin inline asm mov.u32 %r3266, 0; // end inline asm // begin inline asm mov.u32 %r3275, 0; // end inline asm // begin inline asm mov.u32 %r3276, 0; // end inline asm // begin inline asm mov.u32 %r3285, 0; // end inline asm // begin inline asm mov.u32 %r3286, 0; // end inline asm // begin inline asm mov.u32 %r3295, 0; // end inline asm // begin inline asm mov.u32 %r3296, 0; // end inline asm // begin inline asm mov.u32 %r3305, 0; // end inline asm // begin inline asm mov.u32 %r3306, 0; // end inline asm // begin inline asm mov.u32 %r3315, 0; // end inline asm // begin inline asm mov.u32 %r3316, 0; // end inline asm // begin inline asm mov.u32 %r3325, 0; // end inline asm // begin inline asm mov.u32 %r3326, 0; // end inline asm // begin inline asm mov.u32 %r3335, 0; // end inline asm // begin inline asm mov.u32 %r3336, 0; // end inline asm // begin inline asm 
mov.u32 %r3345, 0; // end inline asm // begin inline asm mov.u32 %r3346, 0; // end inline asm // begin inline asm mov.u32 %r3355, 0; // end inline asm // begin inline asm mov.u32 %r3356, 0; // end inline asm // begin inline asm mov.u32 %r3365, 0; // end inline asm // begin inline asm mov.u32 %r3366, 0; // end inline asm // begin inline asm mov.u32 %r3375, 0; // end inline asm // begin inline asm mov.u32 %r3376, 0; // end inline asm // begin inline asm mov.u32 %r3385, 0; // end inline asm // begin inline asm mov.u32 %r3386, 0; // end inline asm // begin inline asm mov.u32 %r3395, 0; // end inline asm // begin inline asm mov.u32 %r3396, 0; // end inline asm // begin inline asm mov.u32 %r3405, 0; // end inline asm // begin inline asm mov.u32 %r3406, 0; // end inline asm // begin inline asm mov.u32 %r3415, 0; // end inline asm // begin inline asm mov.u32 %r3416, 0; // end inline asm // begin inline asm mov.u32 %r3425, 0; // end inline asm // begin inline asm mov.u32 %r3426, 0; // end inline asm // begin inline asm mov.u32 %r3435, 0; // end inline asm // begin inline asm mov.u32 %r3436, 0; // end inline asm // begin inline asm mov.u32 %r3445, 0; // end inline asm // begin inline asm mov.u32 %r3446, 0; // end inline asm // begin inline asm mov.u32 %r3455, 0; // end inline asm // begin inline asm mov.u32 %r3456, 0; // end inline asm // begin inline asm mov.u32 %r3465, 0; // end inline asm // begin inline asm mov.u32 %r3466, 0; // end inline asm // begin inline asm mov.u32 %r3475, 0; // end inline asm // begin inline asm mov.u32 %r3476, 0; // end inline asm // begin inline asm mov.u32 %r3485, 0; // end inline asm // begin inline asm mov.u32 %r3486, 0; // end inline asm // begin inline asm mov.u32 %r3495, 0; // end inline asm // begin inline asm mov.u32 %r3496, 0; // end inline asm // begin inline asm mov.u32 %r3505, 0; // end inline asm // begin inline asm mov.u32 %r3506, 0; // end inline asm // begin inline asm mov.u32 %r3515, 0; // end inline asm // begin inline asm 
mov.u32 %r3516, 0; // end inline asm // begin inline asm mov.u32 %r3525, 0; // end inline asm // begin inline asm mov.u32 %r3526, 0; // end inline asm // begin inline asm mov.u32 %r3535, 0; // end inline asm // begin inline asm mov.u32 %r3536, 0; // end inline asm // begin inline asm mov.u32 %r3545, 0; // end inline asm // begin inline asm mov.u32 %r3546, 0; // end inline asm // begin inline asm mov.u32 %r3555, 0; // end inline asm // begin inline asm mov.u32 %r3556, 0; // end inline asm // begin inline asm mov.u32 %r3565, 0; // end inline asm // begin inline asm mov.u32 %r3566, 0; // end inline asm // begin inline asm mov.u32 %r3575, 0; // end inline asm // begin inline asm mov.u32 %r3576, 0; // end inline asm // begin inline asm mov.u32 %r3585, 0; // end inline asm // begin inline asm mov.u32 %r3586, 0; // end inline asm // begin inline asm mov.u32 %r3595, 0; // end inline asm // begin inline asm mov.u32 %r3596, 0; // end inline asm // begin inline asm mov.u32 %r3605, 0; // end inline asm // begin inline asm mov.u32 %r3606, 0; // end inline asm // begin inline asm mov.u32 %r3615, 0; // end inline asm // begin inline asm mov.u32 %r3616, 0; // end inline asm // begin inline asm mov.u32 %r3625, 0; // end inline asm // begin inline asm mov.u32 %r3626, 0; // end inline asm // begin inline asm mov.u32 %r3635, 0; // end inline asm // begin inline asm mov.u32 %r3636, 0; // end inline asm // begin inline asm mov.u32 %r3645, 0; // end inline asm // begin inline asm mov.u32 %r3646, 0; // end inline asm // begin inline asm mov.u32 %r3655, 0; // end inline asm // begin inline asm mov.u32 %r3656, 0; // end inline asm // begin inline asm mov.u32 %r3665, 0; // end inline asm // begin inline asm mov.u32 %r3666, 0; // end inline asm // begin inline asm mov.u32 %r3675, 0; // end inline asm // begin inline asm mov.u32 %r3676, 0; // end inline asm // begin inline asm mov.u32 %r3685, 0; // end inline asm // begin inline asm mov.u32 %r3686, 0; // end inline asm // begin inline asm 
mov.u32 %r3695, 0; // end inline asm // begin inline asm mov.u32 %r3696, 0; // end inline asm // begin inline asm mov.u32 %r3705, 0; // end inline asm // begin inline asm mov.u32 %r3706, 0; // end inline asm // begin inline asm mov.u32 %r3715, 0; // end inline asm // begin inline asm mov.u32 %r3716, 0; // end inline asm // begin inline asm mov.u32 %r3725, 0; // end inline asm // begin inline asm mov.u32 %r3726, 0; // end inline asm // begin inline asm cp.async.commit_group; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3095, %r3096}, {%r2295, %r2296, %r2297, %r2298}, {%r307, %r308}, {%r3095, %r3096}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3105, %r3106}, {%r2295, %r2296, %r2297, %r2298}, {%r309, %r310}, {%r3105, %r3106}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3115, %r3116}, {%r2295, %r2296, %r2297, %r2298}, {%r312, %r313}, {%r3115, %r3116}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3125, %r3126}, {%r2295, %r2296, %r2297, %r2298}, {%r314, %r315}, {%r3125, %r3126}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3135, %r3136}, {%r2295, %r2296, %r2297, %r2298}, {%r317, %r318}, {%r3135, %r3136}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3145, %r3146}, {%r2295, %r2296, %r2297, %r2298}, {%r319, %r320}, {%r3145, %r3146}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3155, %r3156}, {%r2295, %r2296, %r2297, %r2298}, {%r322, %r323}, {%r3155, %r3156}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3165, %r3166}, {%r2295, %r2296, %r2297, %r2298}, {%r324, %r325}, {%r3165, %r3166}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3175, %r3176}, {%r2295, %r2296, 
%r2297, %r2298}, {%r327, %r328}, {%r3175, %r3176}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3185, %r3186}, {%r2295, %r2296, %r2297, %r2298}, {%r329, %r330}, {%r3185, %r3186}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3195, %r3196}, {%r2295, %r2296, %r2297, %r2298}, {%r332, %r333}, {%r3195, %r3196}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3205, %r3206}, {%r2295, %r2296, %r2297, %r2298}, {%r334, %r335}, {%r3205, %r3206}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3215, %r3216}, {%r2295, %r2296, %r2297, %r2298}, {%r337, %r338}, {%r3215, %r3216}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3225, %r3226}, {%r2295, %r2296, %r2297, %r2298}, {%r339, %r340}, {%r3225, %r3226}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3235, %r3236}, {%r2295, %r2296, %r2297, %r2298}, {%r342, %r343}, {%r3235, %r3236}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3245, %r3246}, {%r2295, %r2296, %r2297, %r2298}, {%r344, %r345}, {%r3245, %r3246}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3255, %r3256}, {%r2303, %r2304, %r2305, %r2306}, {%r307, %r308}, {%r3255, %r3256}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3265, %r3266}, {%r2303, %r2304, %r2305, %r2306}, {%r309, %r310}, {%r3265, %r3266}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3275, %r3276}, {%r2303, %r2304, %r2305, %r2306}, {%r312, %r313}, {%r3275, %r3276}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3285, %r3286}, {%r2303, %r2304, %r2305, %r2306}, {%r314, %r315}, {%r3285, %r3286}; // end inline asm // begin inline 
asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3295, %r3296}, {%r2303, %r2304, %r2305, %r2306}, {%r317, %r318}, {%r3295, %r3296}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3305, %r3306}, {%r2303, %r2304, %r2305, %r2306}, {%r319, %r320}, {%r3305, %r3306}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3315, %r3316}, {%r2303, %r2304, %r2305, %r2306}, {%r322, %r323}, {%r3315, %r3316}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3325, %r3326}, {%r2303, %r2304, %r2305, %r2306}, {%r324, %r325}, {%r3325, %r3326}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3335, %r3336}, {%r2303, %r2304, %r2305, %r2306}, {%r327, %r328}, {%r3335, %r3336}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3345, %r3346}, {%r2303, %r2304, %r2305, %r2306}, {%r329, %r330}, {%r3345, %r3346}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3355, %r3356}, {%r2303, %r2304, %r2305, %r2306}, {%r332, %r333}, {%r3355, %r3356}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3365, %r3366}, {%r2303, %r2304, %r2305, %r2306}, {%r334, %r335}, {%r3365, %r3366}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3375, %r3376}, {%r2303, %r2304, %r2305, %r2306}, {%r337, %r338}, {%r3375, %r3376}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3385, %r3386}, {%r2303, %r2304, %r2305, %r2306}, {%r339, %r340}, {%r3385, %r3386}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3395, %r3396}, {%r2303, %r2304, %r2305, %r2306}, {%r342, %r343}, {%r3395, %r3396}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3405, %r3406}, {%r2303, 
%r2304, %r2305, %r2306}, {%r344, %r345}, {%r3405, %r3406}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3415, %r3416}, {%r2311, %r2312, %r2313, %r2314}, {%r307, %r308}, {%r3415, %r3416}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3425, %r3426}, {%r2311, %r2312, %r2313, %r2314}, {%r309, %r310}, {%r3425, %r3426}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3435, %r3436}, {%r2311, %r2312, %r2313, %r2314}, {%r312, %r313}, {%r3435, %r3436}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3445, %r3446}, {%r2311, %r2312, %r2313, %r2314}, {%r314, %r315}, {%r3445, %r3446}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3455, %r3456}, {%r2311, %r2312, %r2313, %r2314}, {%r317, %r318}, {%r3455, %r3456}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3465, %r3466}, {%r2311, %r2312, %r2313, %r2314}, {%r319, %r320}, {%r3465, %r3466}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3475, %r3476}, {%r2311, %r2312, %r2313, %r2314}, {%r322, %r323}, {%r3475, %r3476}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3485, %r3486}, {%r2311, %r2312, %r2313, %r2314}, {%r324, %r325}, {%r3485, %r3486}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3495, %r3496}, {%r2311, %r2312, %r2313, %r2314}, {%r327, %r328}, {%r3495, %r3496}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3505, %r3506}, {%r2311, %r2312, %r2313, %r2314}, {%r329, %r330}, {%r3505, %r3506}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3515, %r3516}, {%r2311, %r2312, %r2313, %r2314}, {%r332, %r333}, {%r3515, %r3516}; // end inline asm // begin 
inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3525, %r3526}, {%r2311, %r2312, %r2313, %r2314}, {%r334, %r335}, {%r3525, %r3526}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3535, %r3536}, {%r2311, %r2312, %r2313, %r2314}, {%r337, %r338}, {%r3535, %r3536}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3545, %r3546}, {%r2311, %r2312, %r2313, %r2314}, {%r339, %r340}, {%r3545, %r3546}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3555, %r3556}, {%r2311, %r2312, %r2313, %r2314}, {%r342, %r343}, {%r3555, %r3556}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3565, %r3566}, {%r2311, %r2312, %r2313, %r2314}, {%r344, %r345}, {%r3565, %r3566}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3575, %r3576}, {%r2319, %r2320, %r2321, %r2322}, {%r307, %r308}, {%r3575, %r3576}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3585, %r3586}, {%r2319, %r2320, %r2321, %r2322}, {%r309, %r310}, {%r3585, %r3586}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3595, %r3596}, {%r2319, %r2320, %r2321, %r2322}, {%r312, %r313}, {%r3595, %r3596}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3605, %r3606}, {%r2319, %r2320, %r2321, %r2322}, {%r314, %r315}, {%r3605, %r3606}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3615, %r3616}, {%r2319, %r2320, %r2321, %r2322}, {%r317, %r318}, {%r3615, %r3616}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3625, %r3626}, {%r2319, %r2320, %r2321, %r2322}, {%r319, %r320}, {%r3625, %r3626}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3635, %r3636}, 
{%r2319, %r2320, %r2321, %r2322}, {%r322, %r323}, {%r3635, %r3636}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3645, %r3646}, {%r2319, %r2320, %r2321, %r2322}, {%r324, %r325}, {%r3645, %r3646}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3655, %r3656}, {%r2319, %r2320, %r2321, %r2322}, {%r327, %r328}, {%r3655, %r3656}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3665, %r3666}, {%r2319, %r2320, %r2321, %r2322}, {%r329, %r330}, {%r3665, %r3666}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3675, %r3676}, {%r2319, %r2320, %r2321, %r2322}, {%r332, %r333}, {%r3675, %r3676}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3685, %r3686}, {%r2319, %r2320, %r2321, %r2322}, {%r334, %r335}, {%r3685, %r3686}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3695, %r3696}, {%r2319, %r2320, %r2321, %r2322}, {%r337, %r338}, {%r3695, %r3696}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3705, %r3706}, {%r2319, %r2320, %r2321, %r2322}, {%r339, %r340}, {%r3705, %r3706}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3715, %r3716}, {%r2319, %r2320, %r2321, %r2322}, {%r342, %r343}, {%r3715, %r3716}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3725, %r3726}, {%r2319, %r2320, %r2321, %r2322}, {%r344, %r345}, {%r3725, %r3726}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3095, %r3096}, {%r2299, %r2300, %r2301, %r2302}, {%r347, %r348}, {%r3095, %r3096}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3105, %r3106}, {%r2299, %r2300, %r2301, %r2302}, {%r349, %r350}, {%r3105, %r3106}; // end inline asm 
// begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3115, %r3116}, {%r2299, %r2300, %r2301, %r2302}, {%r352, %r353}, {%r3115, %r3116}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3125, %r3126}, {%r2299, %r2300, %r2301, %r2302}, {%r354, %r355}, {%r3125, %r3126}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3135, %r3136}, {%r2299, %r2300, %r2301, %r2302}, {%r357, %r358}, {%r3135, %r3136}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3145, %r3146}, {%r2299, %r2300, %r2301, %r2302}, {%r359, %r360}, {%r3145, %r3146}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3155, %r3156}, {%r2299, %r2300, %r2301, %r2302}, {%r362, %r363}, {%r3155, %r3156}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3165, %r3166}, {%r2299, %r2300, %r2301, %r2302}, {%r364, %r365}, {%r3165, %r3166}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3175, %r3176}, {%r2299, %r2300, %r2301, %r2302}, {%r367, %r368}, {%r3175, %r3176}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3185, %r3186}, {%r2299, %r2300, %r2301, %r2302}, {%r369, %r370}, {%r3185, %r3186}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3195, %r3196}, {%r2299, %r2300, %r2301, %r2302}, {%r372, %r373}, {%r3195, %r3196}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3205, %r3206}, {%r2299, %r2300, %r2301, %r2302}, {%r374, %r375}, {%r3205, %r3206}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3215, %r3216}, {%r2299, %r2300, %r2301, %r2302}, {%r377, %r378}, {%r3215, %r3216}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3225, 
%r3226}, {%r2299, %r2300, %r2301, %r2302}, {%r379, %r380}, {%r3225, %r3226}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3235, %r3236}, {%r2299, %r2300, %r2301, %r2302}, {%r382, %r383}, {%r3235, %r3236}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3245, %r3246}, {%r2299, %r2300, %r2301, %r2302}, {%r384, %r385}, {%r3245, %r3246}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3255, %r3256}, {%r2307, %r2308, %r2309, %r2310}, {%r347, %r348}, {%r3255, %r3256}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3265, %r3266}, {%r2307, %r2308, %r2309, %r2310}, {%r349, %r350}, {%r3265, %r3266}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3275, %r3276}, {%r2307, %r2308, %r2309, %r2310}, {%r352, %r353}, {%r3275, %r3276}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3285, %r3286}, {%r2307, %r2308, %r2309, %r2310}, {%r354, %r355}, {%r3285, %r3286}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3295, %r3296}, {%r2307, %r2308, %r2309, %r2310}, {%r357, %r358}, {%r3295, %r3296}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3305, %r3306}, {%r2307, %r2308, %r2309, %r2310}, {%r359, %r360}, {%r3305, %r3306}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3315, %r3316}, {%r2307, %r2308, %r2309, %r2310}, {%r362, %r363}, {%r3315, %r3316}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3325, %r3326}, {%r2307, %r2308, %r2309, %r2310}, {%r364, %r365}, {%r3325, %r3326}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3335, %r3336}, {%r2307, %r2308, %r2309, %r2310}, {%r367, %r368}, {%r3335, %r3336}; // end 
inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3345, %r3346}, {%r2307, %r2308, %r2309, %r2310}, {%r369, %r370}, {%r3345, %r3346}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3355, %r3356}, {%r2307, %r2308, %r2309, %r2310}, {%r372, %r373}, {%r3355, %r3356}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3365, %r3366}, {%r2307, %r2308, %r2309, %r2310}, {%r374, %r375}, {%r3365, %r3366}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3375, %r3376}, {%r2307, %r2308, %r2309, %r2310}, {%r377, %r378}, {%r3375, %r3376}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3385, %r3386}, {%r2307, %r2308, %r2309, %r2310}, {%r379, %r380}, {%r3385, %r3386}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3395, %r3396}, {%r2307, %r2308, %r2309, %r2310}, {%r382, %r383}, {%r3395, %r3396}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3405, %r3406}, {%r2307, %r2308, %r2309, %r2310}, {%r384, %r385}, {%r3405, %r3406}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3415, %r3416}, {%r2315, %r2316, %r2317, %r2318}, {%r347, %r348}, {%r3415, %r3416}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3425, %r3426}, {%r2315, %r2316, %r2317, %r2318}, {%r349, %r350}, {%r3425, %r3426}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3435, %r3436}, {%r2315, %r2316, %r2317, %r2318}, {%r352, %r353}, {%r3435, %r3436}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3445, %r3446}, {%r2315, %r2316, %r2317, %r2318}, {%r354, %r355}, {%r3445, %r3446}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 
{%r3455, %r3456}, {%r2315, %r2316, %r2317, %r2318}, {%r357, %r358}, {%r3455, %r3456}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3465, %r3466}, {%r2315, %r2316, %r2317, %r2318}, {%r359, %r360}, {%r3465, %r3466}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3475, %r3476}, {%r2315, %r2316, %r2317, %r2318}, {%r362, %r363}, {%r3475, %r3476}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3485, %r3486}, {%r2315, %r2316, %r2317, %r2318}, {%r364, %r365}, {%r3485, %r3486}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3495, %r3496}, {%r2315, %r2316, %r2317, %r2318}, {%r367, %r368}, {%r3495, %r3496}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3505, %r3506}, {%r2315, %r2316, %r2317, %r2318}, {%r369, %r370}, {%r3505, %r3506}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3515, %r3516}, {%r2315, %r2316, %r2317, %r2318}, {%r372, %r373}, {%r3515, %r3516}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3525, %r3526}, {%r2315, %r2316, %r2317, %r2318}, {%r374, %r375}, {%r3525, %r3526}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3535, %r3536}, {%r2315, %r2316, %r2317, %r2318}, {%r377, %r378}, {%r3535, %r3536}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3545, %r3546}, {%r2315, %r2316, %r2317, %r2318}, {%r379, %r380}, {%r3545, %r3546}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3555, %r3556}, {%r2315, %r2316, %r2317, %r2318}, {%r382, %r383}, {%r3555, %r3556}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3565, %r3566}, {%r2315, %r2316, %r2317, %r2318}, {%r384, %r385}, {%r3565, %r3566}; 
// end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3575, %r3576}, {%r2323, %r2324, %r2325, %r2326}, {%r347, %r348}, {%r3575, %r3576}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3585, %r3586}, {%r2323, %r2324, %r2325, %r2326}, {%r349, %r350}, {%r3585, %r3586}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3595, %r3596}, {%r2323, %r2324, %r2325, %r2326}, {%r352, %r353}, {%r3595, %r3596}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3605, %r3606}, {%r2323, %r2324, %r2325, %r2326}, {%r354, %r355}, {%r3605, %r3606}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3615, %r3616}, {%r2323, %r2324, %r2325, %r2326}, {%r357, %r358}, {%r3615, %r3616}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3625, %r3626}, {%r2323, %r2324, %r2325, %r2326}, {%r359, %r360}, {%r3625, %r3626}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3635, %r3636}, {%r2323, %r2324, %r2325, %r2326}, {%r362, %r363}, {%r3635, %r3636}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3645, %r3646}, {%r2323, %r2324, %r2325, %r2326}, {%r364, %r365}, {%r3645, %r3646}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3655, %r3656}, {%r2323, %r2324, %r2325, %r2326}, {%r367, %r368}, {%r3655, %r3656}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3665, %r3666}, {%r2323, %r2324, %r2325, %r2326}, {%r369, %r370}, {%r3665, %r3666}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3675, %r3676}, {%r2323, %r2324, %r2325, %r2326}, {%r372, %r373}, {%r3675, %r3676}; // end inline asm // begin inline asm 
mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3685, %r3686}, {%r2323, %r2324, %r2325, %r2326}, {%r374, %r375}, {%r3685, %r3686}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3695, %r3696}, {%r2323, %r2324, %r2325, %r2326}, {%r377, %r378}, {%r3695, %r3696}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3705, %r3706}, {%r2323, %r2324, %r2325, %r2326}, {%r379, %r380}, {%r3705, %r3706}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3715, %r3716}, {%r2323, %r2324, %r2325, %r2326}, {%r382, %r383}, {%r3715, %r3716}; // end inline asm // begin inline asm mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%r3725, %r3726}, {%r2323, %r2324, %r2325, %r2326}, {%r384, %r385}, {%r3725, %r3726}; // end inline asm and.b32 %r4444, %r4527, 7168; add.s32 %r4447, %r4444, %r4528; shl.b32 %r4448, %r4535, 1; and.b32 %r4449, %r4448, 448; and.b32 %r4450, %r4535, 31; or.b32 %r4451, %r4449, %r4450; shl.b32 %r4452, %r4451, 2; add.s32 %r3735, %r4447, %r4452; add.s32 %r3737, %r3735, 8192; // begin inline asm st.shared.b32 [%r3735], %r3095; // end inline asm // begin inline asm st.shared.b32 [%r3737], %r3096; // end inline asm add.s32 %r3739, %r3735, 16384; // begin inline asm st.shared.b32 [%r3739], %r3255; // end inline asm add.s32 %r3741, %r3735, 24576; // begin inline asm st.shared.b32 [%r3741], %r3256; // end inline asm add.s32 %r3743, %r3735, 32768; // begin inline asm st.shared.b32 [%r3743], %r3415; // end inline asm add.s32 %r3745, %r3735, 40960; // begin inline asm st.shared.b32 [%r3745], %r3416; // end inline asm add.s32 %r3747, %r3735, 49152; // begin inline asm st.shared.b32 [%r3747], %r3575; // end inline asm add.s32 %r3749, %r3735, 57344; // begin inline asm st.shared.b32 [%r3749], %r3576; // end inline asm xor.b32 %r3751, %r3735, 16; add.s32 %r3753, %r3751, 8192; // begin inline asm st.shared.b32 [%r3751], %r3105; // end inline asm // begin 
inline asm st.shared.b32 [%r3753], %r3106; // end inline asm add.s32 %r3755, %r3751, 16384; // begin inline asm st.shared.b32 [%r3755], %r3265; // end inline asm add.s32 %r3757, %r3751, 24576; // begin inline asm st.shared.b32 [%r3757], %r3266; // end inline asm add.s32 %r3759, %r3751, 32768; // begin inline asm st.shared.b32 [%r3759], %r3425; // end inline asm add.s32 %r3761, %r3751, 40960; // begin inline asm st.shared.b32 [%r3761], %r3426; // end inline asm add.s32 %r3763, %r3751, 49152; // begin inline asm st.shared.b32 [%r3763], %r3585; // end inline asm add.s32 %r3765, %r3751, 57344; // begin inline asm st.shared.b32 [%r3765], %r3586; // end inline asm xor.b32 %r3767, %r3735, 32; add.s32 %r3769, %r3767, 8192; // begin inline asm st.shared.b32 [%r3767], %r3115; // end inline asm // begin inline asm st.shared.b32 [%r3769], %r3116; // end inline asm add.s32 %r3771, %r3767, 16384; // begin inline asm st.shared.b32 [%r3771], %r3275; // end inline asm add.s32 %r3773, %r3767, 24576; // begin inline asm st.shared.b32 [%r3773], %r3276; // end inline asm add.s32 %r3775, %r3767, 32768; // begin inline asm st.shared.b32 [%r3775], %r3435; // end inline asm add.s32 %r3777, %r3767, 40960; // begin inline asm st.shared.b32 [%r3777], %r3436; // end inline asm add.s32 %r3779, %r3767, 49152; // begin inline asm st.shared.b32 [%r3779], %r3595; // end inline asm add.s32 %r3781, %r3767, 57344; // begin inline asm st.shared.b32 [%r3781], %r3596; // end inline asm xor.b32 %r3783, %r3735, 48; add.s32 %r3785, %r3783, 8192; // begin inline asm st.shared.b32 [%r3783], %r3125; // end inline asm // begin inline asm st.shared.b32 [%r3785], %r3126; // end inline asm add.s32 %r3787, %r3783, 16384; // begin inline asm st.shared.b32 [%r3787], %r3285; // end inline asm add.s32 %r3789, %r3783, 24576; // begin inline asm st.shared.b32 [%r3789], %r3286; // end inline asm add.s32 %r3791, %r3783, 32768; // begin inline asm st.shared.b32 [%r3791], %r3445; // end inline asm add.s32 %r3793, %r3783, 
40960; // begin inline asm st.shared.b32 [%r3793], %r3446; // end inline asm add.s32 %r3795, %r3783, 49152; // begin inline asm st.shared.b32 [%r3795], %r3605; // end inline asm add.s32 %r3797, %r3783, 57344; // begin inline asm st.shared.b32 [%r3797], %r3606; // end inline asm xor.b32 %r3799, %r3735, 64; add.s32 %r3801, %r3799, 8192; // begin inline asm st.shared.b32 [%r3799], %r3135; // end inline asm // begin inline asm st.shared.b32 [%r3801], %r3136; // end inline asm add.s32 %r3803, %r3799, 16384; // begin inline asm st.shared.b32 [%r3803], %r3295; // end inline asm add.s32 %r3805, %r3799, 24576; // begin inline asm st.shared.b32 [%r3805], %r3296; // end inline asm add.s32 %r3807, %r3799, 32768; // begin inline asm st.shared.b32 [%r3807], %r3455; // end inline asm add.s32 %r3809, %r3799, 40960; // begin inline asm st.shared.b32 [%r3809], %r3456; // end inline asm add.s32 %r3811, %r3799, 49152; // begin inline asm st.shared.b32 [%r3811], %r3615; // end inline asm add.s32 %r3813, %r3799, 57344; // begin inline asm st.shared.b32 [%r3813], %r3616; // end inline asm xor.b32 %r3815, %r3735, 80; add.s32 %r3817, %r3815, 8192; // begin inline asm st.shared.b32 [%r3815], %r3145; // end inline asm // begin inline asm st.shared.b32 [%r3817], %r3146; // end inline asm add.s32 %r3819, %r3815, 16384; // begin inline asm st.shared.b32 [%r3819], %r3305; // end inline asm add.s32 %r3821, %r3815, 24576; // begin inline asm st.shared.b32 [%r3821], %r3306; // end inline asm add.s32 %r3823, %r3815, 32768; // begin inline asm st.shared.b32 [%r3823], %r3465; // end inline asm add.s32 %r3825, %r3815, 40960; // begin inline asm st.shared.b32 [%r3825], %r3466; // end inline asm add.s32 %r3827, %r3815, 49152; // begin inline asm st.shared.b32 [%r3827], %r3625; // end inline asm add.s32 %r3829, %r3815, 57344; // begin inline asm st.shared.b32 [%r3829], %r3626; // end inline asm xor.b32 %r3831, %r3735, 96; add.s32 %r3833, %r3831, 8192; // begin inline asm st.shared.b32 [%r3831], %r3155; // 
end inline asm // begin inline asm st.shared.b32 [%r3833], %r3156; // end inline asm add.s32 %r3835, %r3831, 16384; // begin inline asm st.shared.b32 [%r3835], %r3315; // end inline asm add.s32 %r3837, %r3831, 24576; // begin inline asm st.shared.b32 [%r3837], %r3316; // end inline asm add.s32 %r3839, %r3831, 32768; // begin inline asm st.shared.b32 [%r3839], %r3475; // end inline asm add.s32 %r3841, %r3831, 40960; // begin inline asm st.shared.b32 [%r3841], %r3476; // end inline asm add.s32 %r3843, %r3831, 49152; // begin inline asm st.shared.b32 [%r3843], %r3635; // end inline asm add.s32 %r3845, %r3831, 57344; // begin inline asm st.shared.b32 [%r3845], %r3636; // end inline asm xor.b32 %r3847, %r3735, 112; add.s32 %r3849, %r3847, 8192; // begin inline asm st.shared.b32 [%r3847], %r3165; // end inline asm // begin inline asm st.shared.b32 [%r3849], %r3166; // end inline asm add.s32 %r3851, %r3847, 16384; // begin inline asm st.shared.b32 [%r3851], %r3325; // end inline asm add.s32 %r3853, %r3847, 24576; // begin inline asm st.shared.b32 [%r3853], %r3326; // end inline asm add.s32 %r3855, %r3847, 32768; // begin inline asm st.shared.b32 [%r3855], %r3485; // end inline asm add.s32 %r3857, %r3847, 40960; // begin inline asm st.shared.b32 [%r3857], %r3486; // end inline asm add.s32 %r3859, %r3847, 49152; // begin inline asm st.shared.b32 [%r3859], %r3645; // end inline asm add.s32 %r3861, %r3847, 57344; // begin inline asm st.shared.b32 [%r3861], %r3646; // end inline asm xor.b32 %r3863, %r3735, 128; add.s32 %r3865, %r3863, 8192; // begin inline asm st.shared.b32 [%r3863], %r3175; // end inline asm // begin inline asm st.shared.b32 [%r3865], %r3176; // end inline asm add.s32 %r3867, %r3863, 16384; // begin inline asm st.shared.b32 [%r3867], %r3335; // end inline asm add.s32 %r3869, %r3863, 24576; // begin inline asm st.shared.b32 [%r3869], %r3336; // end inline asm add.s32 %r3871, %r3863, 32768; // begin inline asm st.shared.b32 [%r3871], %r3495; // end inline asm 
add.s32 %r3873, %r3863, 40960; // begin inline asm st.shared.b32 [%r3873], %r3496; // end inline asm add.s32 %r3875, %r3863, 49152; // begin inline asm st.shared.b32 [%r3875], %r3655; // end inline asm add.s32 %r3877, %r3863, 57344; // begin inline asm st.shared.b32 [%r3877], %r3656; // end inline asm xor.b32 %r3879, %r3735, 144; add.s32 %r3881, %r3879, 8192; // begin inline asm st.shared.b32 [%r3879], %r3185; // end inline asm // begin inline asm st.shared.b32 [%r3881], %r3186; // end inline asm add.s32 %r3883, %r3879, 16384; // begin inline asm st.shared.b32 [%r3883], %r3345; // end inline asm add.s32 %r3885, %r3879, 24576; // begin inline asm st.shared.b32 [%r3885], %r3346; // end inline asm add.s32 %r3887, %r3879, 32768; // begin inline asm st.shared.b32 [%r3887], %r3505; // end inline asm add.s32 %r3889, %r3879, 40960; // begin inline asm st.shared.b32 [%r3889], %r3506; // end inline asm add.s32 %r3891, %r3879, 49152; // begin inline asm st.shared.b32 [%r3891], %r3665; // end inline asm add.s32 %r3893, %r3879, 57344; // begin inline asm st.shared.b32 [%r3893], %r3666; // end inline asm xor.b32 %r3895, %r3735, 160; add.s32 %r3897, %r3895, 8192; // begin inline asm st.shared.b32 [%r3895], %r3195; // end inline asm // begin inline asm st.shared.b32 [%r3897], %r3196; // end inline asm add.s32 %r3899, %r3895, 16384; // begin inline asm st.shared.b32 [%r3899], %r3355; // end inline asm add.s32 %r3901, %r3895, 24576; // begin inline asm st.shared.b32 [%r3901], %r3356; // end inline asm add.s32 %r3903, %r3895, 32768; // begin inline asm st.shared.b32 [%r3903], %r3515; // end inline asm add.s32 %r3905, %r3895, 40960; // begin inline asm st.shared.b32 [%r3905], %r3516; // end inline asm add.s32 %r3907, %r3895, 49152; // begin inline asm st.shared.b32 [%r3907], %r3675; // end inline asm add.s32 %r3909, %r3895, 57344; // begin inline asm st.shared.b32 [%r3909], %r3676; // end inline asm xor.b32 %r3911, %r3735, 176; add.s32 %r3913, %r3911, 8192; // begin inline asm 
st.shared.b32 [%r3911], %r3205; // end inline asm // begin inline asm st.shared.b32 [%r3913], %r3206; // end inline asm add.s32 %r3915, %r3911, 16384; // begin inline asm st.shared.b32 [%r3915], %r3365; // end inline asm add.s32 %r3917, %r3911, 24576; // begin inline asm st.shared.b32 [%r3917], %r3366; // end inline asm add.s32 %r3919, %r3911, 32768; // begin inline asm st.shared.b32 [%r3919], %r3525; // end inline asm add.s32 %r3921, %r3911, 40960; // begin inline asm st.shared.b32 [%r3921], %r3526; // end inline asm add.s32 %r3923, %r3911, 49152; // begin inline asm st.shared.b32 [%r3923], %r3685; // end inline asm add.s32 %r3925, %r3911, 57344; // begin inline asm st.shared.b32 [%r3925], %r3686; // end inline asm xor.b32 %r3927, %r3735, 192; add.s32 %r3929, %r3927, 8192; // begin inline asm st.shared.b32 [%r3927], %r3215; // end inline asm // begin inline asm st.shared.b32 [%r3929], %r3216; // end inline asm add.s32 %r3931, %r3927, 16384; // begin inline asm st.shared.b32 [%r3931], %r3375; // end inline asm add.s32 %r3933, %r3927, 24576; // begin inline asm st.shared.b32 [%r3933], %r3376; // end inline asm add.s32 %r3935, %r3927, 32768; // begin inline asm st.shared.b32 [%r3935], %r3535; // end inline asm add.s32 %r3937, %r3927, 40960; // begin inline asm st.shared.b32 [%r3937], %r3536; // end inline asm add.s32 %r3939, %r3927, 49152; // begin inline asm st.shared.b32 [%r3939], %r3695; // end inline asm add.s32 %r3941, %r3927, 57344; // begin inline asm st.shared.b32 [%r3941], %r3696; // end inline asm xor.b32 %r3943, %r3735, 208; add.s32 %r3945, %r3943, 8192; // begin inline asm st.shared.b32 [%r3943], %r3225; // end inline asm // begin inline asm st.shared.b32 [%r3945], %r3226; // end inline asm add.s32 %r3947, %r3943, 16384; // begin inline asm st.shared.b32 [%r3947], %r3385; // end inline asm add.s32 %r3949, %r3943, 24576; // begin inline asm st.shared.b32 [%r3949], %r3386; // end inline asm add.s32 %r3951, %r3943, 32768; // begin inline asm st.shared.b32 
[%r3951], %r3545; // end inline asm add.s32 %r3953, %r3943, 40960; // begin inline asm st.shared.b32 [%r3953], %r3546; // end inline asm add.s32 %r3955, %r3943, 49152; // begin inline asm st.shared.b32 [%r3955], %r3705; // end inline asm add.s32 %r3957, %r3943, 57344; // begin inline asm st.shared.b32 [%r3957], %r3706; // end inline asm xor.b32 %r3959, %r3735, 224; add.s32 %r3961, %r3959, 8192; // begin inline asm st.shared.b32 [%r3959], %r3235; // end inline asm // begin inline asm st.shared.b32 [%r3961], %r3236; // end inline asm add.s32 %r3963, %r3959, 16384; // begin inline asm st.shared.b32 [%r3963], %r3395; // end inline asm add.s32 %r3965, %r3959, 24576; // begin inline asm st.shared.b32 [%r3965], %r3396; // end inline asm add.s32 %r3967, %r3959, 32768; // begin inline asm st.shared.b32 [%r3967], %r3555; // end inline asm add.s32 %r3969, %r3959, 40960; // begin inline asm st.shared.b32 [%r3969], %r3556; // end inline asm add.s32 %r3971, %r3959, 49152; // begin inline asm st.shared.b32 [%r3971], %r3715; // end inline asm add.s32 %r3973, %r3959, 57344; // begin inline asm st.shared.b32 [%r3973], %r3716; // end inline asm xor.b32 %r3975, %r3735, 240; add.s32 %r3977, %r3975, 8192; // begin inline asm st.shared.b32 [%r3975], %r3245; // end inline asm // begin inline asm st.shared.b32 [%r3977], %r3246; // end inline asm add.s32 %r3979, %r3975, 16384; // begin inline asm st.shared.b32 [%r3979], %r3405; // end inline asm add.s32 %r3981, %r3975, 24576; // begin inline asm st.shared.b32 [%r3981], %r3406; // end inline asm add.s32 %r3983, %r3975, 32768; // begin inline asm st.shared.b32 [%r3983], %r3565; // end inline asm add.s32 %r3985, %r3975, 40960; // begin inline asm st.shared.b32 [%r3985], %r3566; // end inline asm add.s32 %r3987, %r3975, 49152; // begin inline asm st.shared.b32 [%r3987], %r3725; // end inline asm add.s32 %r3989, %r3975, 57344; // begin inline asm st.shared.b32 [%r3989], %r3726; // end inline asm bar.sync 0; shl.b32 %r4453, %r4530, 10; add.s32 
// ---------------------------------------------------------------------------
// Epilogue of fmha_mhca_fp16_128_128_sm86_kernel_nl (compiler-generated PTX).
// NOTE(review): this chunk begins mid-instruction -- the first line below
// carries trailing operands of an instruction started on a preceding line
// that is not visible here.
// Visible pattern: for each of 8 output slices (shared-memory base offsets
// 0, 8192, 16384, ..., 57344 bytes), load four 16-byte vectors from smem at
// sub-offsets +0/+256/+512/+768 and reduce them 4 -> 1 with packed-fp16
// (add.f16x2) adds; then conditionally store each reduced vec4 to global
// memory at rows +0, +8, ..., +56, every store guarded by a bounds test
// against kernel-param word +200 (presumably the valid row / sequence
// count -- it is the same word the kernel tests at entry; confirm against
// the host-side launcher).
// ---------------------------------------------------------------------------
%r4454, %r4453, %r4528;            // trailing operands of an op begun on the previous (unseen) line
add.s32 %r3995, %r4454, %r6;       // %r3995 = this thread's smem byte offset (%r6 is the xor-swizzled lane offset from the prologue)

// Slice 0: four vec4 smem loads at base+{0,256,512,768}, tree-reduced.
ld.shared.v4.b32 {%r3991, %r3992, %r3993, %r3994}, [%r3995];
add.s32 %r4000, %r3995, 256;
ld.shared.v4.b32 {%r3996, %r3997, %r3998, %r3999}, [%r4000];
add.s32 %r4005, %r3995, 512;
ld.shared.v4.b32 {%r4001, %r4002, %r4003, %r4004}, [%r4005];
add.s32 %r4010, %r3995, 768;
ld.shared.v4.b32 {%r4006, %r4007, %r4008, %r4009}, [%r4010];
add.f16x2 %r4011, %r3991, %r3996;  // add.f16x2 = two packed fp16 adds per register
add.f16x2 %r4014, %r3992, %r3997;
add.f16x2 %r4017, %r3993, %r3998;
add.f16x2 %r4020, %r3994, %r3999;
add.f16x2 %r4023, %r4011, %r4001;
add.f16x2 %r4026, %r4014, %r4002;
add.f16x2 %r4029, %r4017, %r4003;
add.f16x2 %r4032, %r4020, %r4004;
add.f16x2 %r4035, %r4023, %r4006;  // %r4035..%r4044: reduced vec4 for slice 0 (stored at row +0 below)
add.f16x2 %r4038, %r4026, %r4007;
add.f16x2 %r4041, %r4029, %r4008;
add.f16x2 %r4044, %r4032, %r4009;

// Slice 1: same reduction at smem base+8192.
add.s32 %r4051, %r3995, 8192;
ld.shared.v4.b32 {%r4047, %r4048, %r4049, %r4050}, [%r4051];
add.s32 %r4056, %r3995, 8448;
ld.shared.v4.b32 {%r4052, %r4053, %r4054, %r4055}, [%r4056];
add.s32 %r4061, %r3995, 8704;
ld.shared.v4.b32 {%r4057, %r4058, %r4059, %r4060}, [%r4061];
add.s32 %r4066, %r3995, 8960;
ld.shared.v4.b32 {%r4062, %r4063, %r4064, %r4065}, [%r4066];
add.f16x2 %r4067, %r4047, %r4052;
add.f16x2 %r4070, %r4048, %r4053;
add.f16x2 %r4073, %r4049, %r4054;
add.f16x2 %r4076, %r4050, %r4055;
add.f16x2 %r4079, %r4067, %r4057;
add.f16x2 %r4082, %r4070, %r4058;
add.f16x2 %r4085, %r4073, %r4059;
add.f16x2 %r4088, %r4076, %r4060;
add.f16x2 %r4091, %r4079, %r4062;  // %r4091..%r4100: reduced vec4 for slice 1 (row +8)
add.f16x2 %r4094, %r4082, %r4063;
add.f16x2 %r4097, %r4085, %r4064;
add.f16x2 %r4100, %r4088, %r4065;

// Slice 2: smem base+16384.
add.s32 %r4107, %r3995, 16384;
ld.shared.v4.b32 {%r4103, %r4104, %r4105, %r4106}, [%r4107];
add.s32 %r4112, %r3995, 16640;
ld.shared.v4.b32 {%r4108, %r4109, %r4110, %r4111}, [%r4112];
add.s32 %r4117, %r3995, 16896;
ld.shared.v4.b32 {%r4113, %r4114, %r4115, %r4116}, [%r4117];
add.s32 %r4122, %r3995, 17152;
ld.shared.v4.b32 {%r4118, %r4119, %r4120, %r4121}, [%r4122];
add.f16x2 %r4123, %r4103, %r4108;
add.f16x2 %r4126, %r4104, %r4109;
add.f16x2 %r4129, %r4105, %r4110;
add.f16x2 %r4132, %r4106, %r4111;
add.f16x2 %r4135, %r4123, %r4113;
add.f16x2 %r4138, %r4126, %r4114;
add.f16x2 %r4141, %r4129, %r4115;
add.f16x2 %r4144, %r4132, %r4116;
add.f16x2 %r4147, %r4135, %r4118;  // %r4147..%r4156: reduced vec4 for slice 2 (row +16)
add.f16x2 %r4150, %r4138, %r4119;
add.f16x2 %r4153, %r4141, %r4120;
add.f16x2 %r4156, %r4144, %r4121;

// Slice 3: smem base+24576.
add.s32 %r4163, %r3995, 24576;
ld.shared.v4.b32 {%r4159, %r4160, %r4161, %r4162}, [%r4163];
add.s32 %r4168, %r3995, 24832;
ld.shared.v4.b32 {%r4164, %r4165, %r4166, %r4167}, [%r4168];
add.s32 %r4173, %r3995, 25088;
ld.shared.v4.b32 {%r4169, %r4170, %r4171, %r4172}, [%r4173];
add.s32 %r4178, %r3995, 25344;
ld.shared.v4.b32 {%r4174, %r4175, %r4176, %r4177}, [%r4178];
add.f16x2 %r4179, %r4159, %r4164;
add.f16x2 %r4182, %r4160, %r4165;
add.f16x2 %r4185, %r4161, %r4166;
add.f16x2 %r4188, %r4162, %r4167;
add.f16x2 %r4191, %r4179, %r4169;
add.f16x2 %r4194, %r4182, %r4170;
add.f16x2 %r4197, %r4185, %r4171;
add.f16x2 %r4200, %r4188, %r4172;
add.f16x2 %r4203, %r4191, %r4174;  // %r4203..%r4212: reduced vec4 for slice 3 (row +24)
add.f16x2 %r4206, %r4194, %r4175;
add.f16x2 %r4209, %r4197, %r4176;
add.f16x2 %r4212, %r4200, %r4177;

// Slice 4: smem base+32768.
add.s32 %r4219, %r3995, 32768;
ld.shared.v4.b32 {%r4215, %r4216, %r4217, %r4218}, [%r4219];
add.s32 %r4224, %r3995, 33024;
ld.shared.v4.b32 {%r4220, %r4221, %r4222, %r4223}, [%r4224];
add.s32 %r4229, %r3995, 33280;
ld.shared.v4.b32 {%r4225, %r4226, %r4227, %r4228}, [%r4229];
add.s32 %r4234, %r3995, 33536;
ld.shared.v4.b32 {%r4230, %r4231, %r4232, %r4233}, [%r4234];
add.f16x2 %r4235, %r4215, %r4220;
add.f16x2 %r4238, %r4216, %r4221;
add.f16x2 %r4241, %r4217, %r4222;
add.f16x2 %r4244, %r4218, %r4223;
add.f16x2 %r4247, %r4235, %r4225;
add.f16x2 %r4250, %r4238, %r4226;
add.f16x2 %r4253, %r4241, %r4227;
add.f16x2 %r4256, %r4244, %r4228;
add.f16x2 %r4259, %r4247, %r4230;  // %r4259..%r4268: reduced vec4 for slice 4 (row +32)
add.f16x2 %r4262, %r4250, %r4231;
add.f16x2 %r4265, %r4253, %r4232;
add.f16x2 %r4268, %r4256, %r4233;

// Slice 5: smem base+40960.
add.s32 %r4275, %r3995, 40960;
ld.shared.v4.b32 {%r4271, %r4272, %r4273, %r4274}, [%r4275];
add.s32 %r4280, %r3995, 41216;
ld.shared.v4.b32 {%r4276, %r4277, %r4278, %r4279}, [%r4280];
add.s32 %r4285, %r3995, 41472;
ld.shared.v4.b32 {%r4281, %r4282, %r4283, %r4284}, [%r4285];
add.s32 %r4290, %r3995, 41728;
ld.shared.v4.b32 {%r4286, %r4287, %r4288, %r4289}, [%r4290];
add.f16x2 %r4291, %r4271, %r4276;
add.f16x2 %r4294, %r4272, %r4277;
add.f16x2 %r4297, %r4273, %r4278;
add.f16x2 %r4300, %r4274, %r4279;
add.f16x2 %r4303, %r4291, %r4281;
add.f16x2 %r4306, %r4294, %r4282;
add.f16x2 %r4309, %r4297, %r4283;
add.f16x2 %r4312, %r4300, %r4284;
add.f16x2 %r4315, %r4303, %r4286;  // %r4315..%r4324: reduced vec4 for slice 5 (row +40)
add.f16x2 %r4318, %r4306, %r4287;
add.f16x2 %r4321, %r4309, %r4288;
add.f16x2 %r4324, %r4312, %r4289;

// Slice 6: smem base+49152.
add.s32 %r4331, %r3995, 49152;
ld.shared.v4.b32 {%r4327, %r4328, %r4329, %r4330}, [%r4331];
add.s32 %r4336, %r3995, 49408;
ld.shared.v4.b32 {%r4332, %r4333, %r4334, %r4335}, [%r4336];
add.s32 %r4341, %r3995, 49664;
ld.shared.v4.b32 {%r4337, %r4338, %r4339, %r4340}, [%r4341];
add.s32 %r4346, %r3995, 49920;
ld.shared.v4.b32 {%r4342, %r4343, %r4344, %r4345}, [%r4346];
add.f16x2 %r4347, %r4327, %r4332;
add.f16x2 %r4350, %r4328, %r4333;
add.f16x2 %r4353, %r4329, %r4334;
add.f16x2 %r4356, %r4330, %r4335;
add.f16x2 %r4359, %r4347, %r4337;
add.f16x2 %r4362, %r4350, %r4338;
add.f16x2 %r4365, %r4353, %r4339;
add.f16x2 %r4368, %r4356, %r4340;
add.f16x2 %r4371, %r4359, %r4342;  // %r4371..%r4380: reduced vec4 for slice 6 (row +48)
add.f16x2 %r4374, %r4362, %r4343;
add.f16x2 %r4377, %r4365, %r4344;
add.f16x2 %r4380, %r4368, %r4345;

// Slice 7: smem base+57344.
add.s32 %r4387, %r3995, 57344;
ld.shared.v4.b32 {%r4383, %r4384, %r4385, %r4386}, [%r4387];
add.s32 %r4392, %r3995, 57600;
ld.shared.v4.b32 {%r4388, %r4389, %r4390, %r4391}, [%r4392];
add.s32 %r4397, %r3995, 57856;
ld.shared.v4.b32 {%r4393, %r4394, %r4395, %r4396}, [%r4397];
add.s32 %r4402, %r3995, 58112;
ld.shared.v4.b32 {%r4398, %r4399, %r4400, %r4401}, [%r4402];
add.f16x2 %r4403, %r4383, %r4388;
add.f16x2 %r4406, %r4384, %r4389;
add.f16x2 %r4409, %r4385, %r4390;
add.f16x2 %r4412, %r4386, %r4391;
add.f16x2 %r4415, %r4403, %r4393;
add.f16x2 %r4418, %r4406, %r4394;
add.f16x2 %r4421, %r4409, %r4395;
add.f16x2 %r4424, %r4412, %r4396;
add.f16x2 %r4427, %r4415, %r4398;  // %r4427..%r4436: reduced vec4 for slice 7 (row +56)
add.f16x2 %r4430, %r4418, %r4399;
add.f16x2 %r4433, %r4421, %r4400;
add.f16x2 %r4436, %r4424, %r4401;

// Output addressing: %rd4 = 2*(%r2067 * %r4534) + column byte offset %rd50.
// (%r4534 and %rd50 are defined in the unseen middle/prologue; the <<1 is
// consistent with 2-byte fp16 elements.)
mul.lo.s32 %r4459, %r2067, %r4534;
shl.b32 %r4460, %r4459, 1;
cvt.s64.s32 %rd66, %r4460;
add.s64 %rd4, %rd66, %rd50;
setp.ge.s32 %p187, %r2085, %r4526;   // base row %r2085 out of range (%r4526 set earlier, not visible)? skip everything
@%p187 bra $L__BB0_52;

// Column guard %p188: lane writes only if (tid % 16) < (param[+60] / 8).
// NOTE(review): the shift sequences below are the standard rounding-correct
// signed divide-by-8 and modulo-16 expansions; param word +60's meaning is
// not visible here -- confirm against the kernel-param struct.
mov.b64 %rd118, fmha_mhca_fp16_128_128_sm86_kernel_nl_param_0;
mov.u64 %rd117, %rd118;
ld.param.u32 %r4546, [%rd117+60];
mov.u32 %r4545, %tid.x;
shr.s32 %r4544, %r4546, 31;
shr.u32 %r4543, %r4544, 29;
add.s32 %r4542, %r4546, %r4543;
shr.s32 %r4541, %r4542, 3;           // %r4541 = param[+60] / 8 (signed)
shr.s32 %r4540, %r4545, 31;
shr.u32 %r4539, %r4540, 28;
add.s32 %r4538, %r4545, %r4539;
and.b32 %r4537, %r4538, -16;
sub.s32 %r4536, %r4545, %r4537;      // %r4536 = tid % 16 (signed)
setp.ge.s32 %p188, %r4536, %r4541;   // true => this lane stores nothing
@%p188 bra $L__BB0_32;

// Row +0 store. %rd2 scales the row index and %rd3 is converted to a global
// pointer; both are defined before this chunk (presumably the output row
// stride in bytes and the output base pointer -- confirm upstream).
mul.lo.s64 %rd69, %rd2, %rd46;
add.s64 %rd70, %rd4, %rd69;
cvta.to.global.u64 %rd71, %rd3;
add.s64 %rd72, %rd71, %rd70;
st.global.v4.u32 [%rd72], {%r4035, %r4038, %r4041, %r4044};
$L__BB0_32:
// Row +8 store, guarded by the rows-remaining test against param[+200].
ld.param.u32 %r4547, [fmha_mhca_fp16_128_128_sm86_kernel_nl_param_0+200];
add.s32 %r4468, %r2085, 8;
setp.ge.s32 %p189, %r4468, %r4547;
@%p189 bra $L__BB0_52;
@%p188 bra $L__BB0_35;
add.s64 %rd74, %rd46, 8;
mul.lo.s64 %rd75, %rd74, %rd2;
add.s64 %rd76, %rd4, %rd75;
cvta.to.global.u64 %rd77, %rd3;
add.s64 %rd78, %rd77, %rd76;
st.global.v4.u32 [%rd78], {%r4091, %r4094, %r4097, %r4100};
$L__BB0_35:
// Row +16 store.
ld.param.u32 %r4548, [fmha_mhca_fp16_128_128_sm86_kernel_nl_param_0+200];
add.s32 %r4475, %r2085, 16;
setp.ge.s32 %p191, %r4475, %r4548;
@%p191 bra $L__BB0_52;
@%p188 bra $L__BB0_38;
add.s64 %rd80, %rd46, 16;
mul.lo.s64 %rd81, %rd80, %rd2;
add.s64 %rd82, %rd4, %rd81;
cvta.to.global.u64 %rd83, %rd3;
add.s64 %rd84, %rd83, %rd82;
st.global.v4.u32 [%rd84], {%r4147, %r4150, %r4153, %r4156};
$L__BB0_38:
// Row +24 store.
ld.param.u32 %r4549, [fmha_mhca_fp16_128_128_sm86_kernel_nl_param_0+200];
add.s32 %r4482, %r2085, 24;
setp.ge.s32 %p193, %r4482, %r4549;
@%p193 bra $L__BB0_52;
@%p188 bra $L__BB0_41;
add.s64 %rd86, %rd46, 24;
mul.lo.s64 %rd87, %rd86, %rd2;
add.s64 %rd88, %rd4, %rd87;
cvta.to.global.u64 %rd89, %rd3;
add.s64 %rd90, %rd89, %rd88;
st.global.v4.u32 [%rd90], {%r4203, %r4206, %r4209, %r4212};
$L__BB0_41:
// Row +32 store.
ld.param.u32 %r4550, [fmha_mhca_fp16_128_128_sm86_kernel_nl_param_0+200];
add.s32 %r4489, %r2085, 32;
setp.ge.s32 %p195, %r4489, %r4550;
@%p195 bra $L__BB0_52;
@%p188 bra $L__BB0_44;
add.s64 %rd92, %rd46, 32;
mul.lo.s64 %rd93, %rd92, %rd2;
add.s64 %rd94, %rd4, %rd93;
cvta.to.global.u64 %rd95, %rd3;
add.s64 %rd96, %rd95, %rd94;
st.global.v4.u32 [%rd96], {%r4259, %r4262, %r4265, %r4268};
$L__BB0_44:
// Row +40 store.
ld.param.u32 %r4551, [fmha_mhca_fp16_128_128_sm86_kernel_nl_param_0+200];
add.s32 %r4496, %r2085, 40;
setp.ge.s32 %p197, %r4496, %r4551;
@%p197 bra $L__BB0_52;
@%p188 bra $L__BB0_47;
add.s64 %rd98, %rd46, 40;
mul.lo.s64 %rd99, %rd98, %rd2;
add.s64 %rd100, %rd4, %rd99;
cvta.to.global.u64 %rd101, %rd3;
add.s64 %rd102, %rd101, %rd100;
st.global.v4.u32 [%rd102], {%r4315, %r4318, %r4321, %r4324};
$L__BB0_47:
// Row +48 store.
ld.param.u32 %r4552, [fmha_mhca_fp16_128_128_sm86_kernel_nl_param_0+200];
add.s32 %r4503, %r2085, 48;
setp.ge.s32 %p199, %r4503, %r4552;
@%p199 bra $L__BB0_52;
@%p188 bra $L__BB0_50;
add.s64 %rd104, %rd46, 48;
mul.lo.s64 %rd105, %rd104, %rd2;
add.s64 %rd106, %rd4, %rd105;
cvta.to.global.u64 %rd107, %rd3;
add.s64 %rd108, %rd107, %rd106;
st.global.v4.u32 [%rd108], {%r4371, %r4374, %r4377, %r4380};
$L__BB0_50:
// Row +56 store: last one folds the row test and column guard into one predicate.
ld.param.u32 %r4553, [fmha_mhca_fp16_128_128_sm86_kernel_nl_param_0+200];
add.s32 %r4510, %r2085, 56;
setp.ge.s32 %p201, %r4510, %r4553;
or.pred %p203, %p201, %p188;
@%p203 bra $L__BB0_52;
add.s64 %rd110, %rd46, 56;
mul.lo.s64 %rd111, %rd110, %rd2;
add.s64 %rd112, %rd4, %rd111;
cvta.to.global.u64 %rd113, %rd3;
add.s64 %rd114, %rd113, %rd112;
st.global.v4.u32 [%rd114], {%r4427, %r4430, %r4433, %r4436};
$L__BB0_52:
ret;
}