//
// PTX body fragment of kernel MVNKernel_fp32_large, compiled from the C++
// template LayerNormLargeKernel<float, float>(dxml_mvn::MVNKernelParams)
// (names recovered from the mangled .shared symbols and the param-space name).
// The .visible .entry header, .reg declarations, and the shared per-warp
// scratch array "temp" are declared before this fragment and are not visible.
//
// Shape of the kernel (one thread block per row of %r116 elements):
//   phase 1: optional mean pass      -> shared scalar "mu"     (skipped when %rs24 == 0)
//   phase 2: optional variance pass  -> shared scalar "rsigma" (skipped when %rs25 == 0)
//   phase 3: out = scale * (x - mu) * rsigma + bias, with optional clamp of
//            negative results to zero (%rs18), stored to the output buffer.
// All element loops stride by a hard-coded 1024 threads, so this presumably
// assumes blockDim.x == 1024 -- TODO confirm against the launch configuration.
// Flag/field roles below are inferred from how each value is used here;
// verify against the MVNKernelParams struct definition.
//
// demoted variable
.shared .align 4 .f32 _ZZ20LayerNormLargeKernelIffEvN8dxml_mvn15MVNKernelParamsEE2mu;
// demoted variable
.shared .align 4 .f32 _ZZ20LayerNormLargeKernelIffEvN8dxml_mvn15MVNKernelParamsEE6rsigma;

// ---- unpack MVNKernelParams (byte offsets into the single struct param) ----
ld.param.v2.u8 {%rs18, %rs19}, [MVNKernel_fp32_large_param_0+48];   // %rs18 gates the clamp-negative-to-zero select; %rs19 gates the block-index term in scale/bias indexing
ld.param.v2.u32 {%r125, %r126}, [MVNKernel_fp32_large_param_0+56];  // dims used only by the multi-dim scale/bias index decomposition
ld.param.v2.u32 {%r127, %r128}, [MVNKernel_fp32_large_param_0+64];
ld.param.v2.u32 {%r129, %r130}, [MVNKernel_fp32_large_param_0+72];
ld.param.v4.u8 {%rs20, %rs21, %rs22, %rs23}, [MVNKernel_fp32_large_param_0+84]; // layout-order flags (%rs22/%rs23 are not read in this fragment)
ld.param.v4.u8 {%rs24, %rs25, %rs26, %rs27}, [MVNKernel_fp32_large_param_0+44]; // compute-mean / compute-rsigma / has-bias / has-scale (inferred from use)
ld.param.u32 %r124, [MVNKernel_fp32_large_param_0+80];  // innermost stride of the scale/bias index
ld.param.u8 %rs13, [MVNKernel_fp32_large_param_0+50];   // 0 => linear scale/bias indexing (fast path), else multi-dim decomposition
ld.param.f32 %f71, [MVNKernel_fp32_large_param_0+40];   // added to the variance before rsqrt -- presumably epsilon
ld.param.f32 %f70, [MVNKernel_fp32_large_param_0+36];   // multiplies both reduced sums -- presumably 1/count
ld.param.u32 %r116, [MVNKernel_fp32_large_param_0+32];  // element count per block-row
ld.param.u64 %rd43, [MVNKernel_fp32_large_param_0+24];  // output pointer
ld.param.u64 %rd42, [MVNKernel_fp32_large_param_0+16];  // bias pointer
ld.param.u64 %rd41, [MVNKernel_fp32_large_param_0+8];   // scale pointer
ld.param.u64 %rd40, [MVNKernel_fp32_large_param_0];     // input pointer
cvta.to.global.u64 %rd1, %rd40;   // %rd1 = input  (global address space)
cvta.to.global.u64 %rd2, %rd41;   // %rd2 = scale
cvta.to.global.u64 %rd3, %rd42;   // %rd3 = bias
cvta.to.global.u64 %rd4, %rd43;   // %rd4 = output
mov.u32 %r1, %ctaid.x;
mul.lo.s32 %r3, %r116, %r1;       // %r3 = first element index of this block's row
setp.eq.s16 %p1, %rs24, 0;
mov.f32 %f177, 0f00000000;        // mu defaults to 0.0 if the mean pass is skipped
mov.u32 %r256, %tid.x;
@%p1 bra $L__BB0_22;              // %rs24 == 0: no mean subtraction requested

// ---- phase 1: per-thread partial sum over input[row], stride 1024 ----
setp.ge.s32 %p2, %r256, %r116;
mov.f32 %f171, 0f00000000;
@%p2 bra $L__BB0_8;               // this thread owns no elements
not.b32 %r131, %r256;
add.s32 %r5, %r116, %r131;        // %r5 = count - tid - 1 (trip count material, reused below)
shr.u32 %r132, %r5, 10;
add.s32 %r133, %r132, 1;          // total stride-1024 iterations for this thread
and.b32 %r228, %r133, 3;          // iterations % 4 -> rolled prologue before the 4x-unrolled loop
setp.eq.s32 %p3, %r228, 0;
mov.f32 %f171, 0f00000000;
mov.u32 %r229, %r256;
@%p3 bra $L__BB0_5;
add.s32 %r134, %r256, %r3;
mul.wide.s32 %rd44, %r134, 4;     // byte offset = 4 * (row base + tid)
add.s64 %rd89, %rd1, %rd44;
mov.u32 %r229, %r256;

$L__BB0_4:                        // remainder loop (1..3 iterations, deliberately not unrolled)
.pragma "nounroll";
ld.global.f32 %f77, [%rd89];
add.ftz.f32 %f171, %f171, %f77;
add.s32 %r229, %r229, 1024;
add.s64 %rd89, %rd89, 4096;       // 1024 floats * 4 bytes
add.s32 %r228, %r228, -1;
setp.ne.s32 %p4, %r228, 0;
@%p4 bra $L__BB0_4;

$L__BB0_5:
setp.lt.u32 %p5, %r5, 3072;
@%p5 bra $L__BB0_8;               // fewer than 4 iterations remain: skip the unrolled loop
add.s32 %r135, %r229, %r3;
mul.wide.s32 %rd45, %r135, 4;
add.s64 %rd46, %rd1, %rd45;
add.s64 %rd90, %rd46, 8192;       // bias the pointer so offsets -8192..+4096 cover 4 loads

$L__BB0_7:                        // main sum loop, unrolled 4x (4 * 1024 elements per trip)
ld.global.f32 %f78, [%rd90+-8192];
add.ftz.f32 %f79, %f171, %f78;
ld.global.f32 %f80, [%rd90+-4096];
add.ftz.f32 %f81, %f79, %f80;
ld.global.f32 %f82, [%rd90];
add.ftz.f32 %f83, %f81, %f82;
ld.global.f32 %f84, [%rd90+4096];
add.ftz.f32 %f171, %f83, %f84;
add.s64 %rd90, %rd90, 16384;
add.s32 %r229, %r229, 4096;
setp.lt.s32 %p6, %r229, %r116;
@%p6 bra $L__BB0_7;

// ---- phase 1b: block-wide reduction of partial sums -> mu ----
$L__BB0_8:
mul.ftz.f32 %f176, %f70, %f171;   // pre-scale each partial by %f70; the sum of scaled partials equals the scaled total
mov.u32 %r232, WARP_SZ;
setp.lt.s32 %p7, %r232, 2;
@%p7 bra $L__BB0_11;
mov.u32 %r231, %r232;

$L__BB0_10:                       // intra-warp tree reduction via shfl.sync.down
mov.b32 %r136, %f176;
shr.u32 %r137, %r231, 31;
add.s32 %r138, %r231, %r137;
shr.s32 %r16, %r138, 1;           // halve the active width each step (signed rounding toward zero)
mov.u32 %r139, 31;
mov.u32 %r140, -1;
shfl.sync.down.b32 %r141|%p8, %r136, %r16, %r139, %r140;
mov.b32 %f85, %r141;
add.ftz.f32 %f176, %f176, %f85;
setp.gt.s32 %p9, %r231, 3;
mov.u32 %r231, %r16;
@%p9 bra $L__BB0_10;

$L__BB0_11:
rem.u32 %r17, %r256, %r232;       // lane id within the warp
setp.ne.s32 %p10, %r17, 0;
@%p10 bra $L__BB0_13;
div.u32 %r142, %r256, %r232;      // warp id
shl.b32 %r143, %r142, 2;
mov.u32 %r144, _ZZ20LayerNormLargeKernelIffEvN8dxml_mvn15MVNKernelParamsEE4temp;
add.s32 %r145, %r144, %r143;
st.shared.f32 [%r145], %f176;     // lane 0 publishes its warp's sum to temp[warp]

$L__BB0_13:
bar.sync 0;
setp.le.u32 %p11, %r232, %r256;
@%p11 bra $L__BB0_19;             // only threads of the first warp reduce the per-warp sums
mov.u32 %r146, %ntid.x;
div.u32 %r147, %r146, %r232;      // number of warps in the block
setp.ge.s32 %p12, %r17, %r147;
mov.f32 %f176, 0f00000000;        // lanes beyond the warp count contribute 0
@%p12 bra $L__BB0_16;
shl.b32 %r148, %r17, 2;
mov.u32 %r149, _ZZ20LayerNormLargeKernelIffEvN8dxml_mvn15MVNKernelParamsEE4temp;
add.s32 %r150, %r149, %r148;
ld.shared.f32 %f176, [%r150];

$L__BB0_16:
@%p7 bra $L__BB0_19;

$L__BB0_18:                       // final shfl.sync.down reduction inside warp 0
mov.b32 %r151, %f176;
shr.u32 %r152, %r232, 31;
add.s32 %r153, %r232, %r152;
shr.s32 %r19, %r153, 1;
mov.u32 %r154, 31;
mov.u32 %r155, -1;
shfl.sync.down.b32 %r156|%p14, %r151, %r19, %r154, %r155;
mov.b32 %f87, %r156;
add.ftz.f32 %f176, %f176, %f87;
setp.gt.s32 %p15, %r232, 3;
mov.u32 %r232, %r19;
@%p15 bra $L__BB0_18;

$L__BB0_19:
bar.sync 0;
setp.ne.s32 %p16, %r256, 0;
@%p16 bra $L__BB0_21;
st.shared.f32 [_ZZ20LayerNormLargeKernelIffEvN8dxml_mvn15MVNKernelParamsEE2mu], %f176; // thread 0 publishes the mean
$L__BB0_21:
bar.sync 0;
ld.shared.f32 %f177, [_ZZ20LayerNormLargeKernelIffEvN8dxml_mvn15MVNKernelParamsEE2mu]; // every thread reads mu

// ---- phase 2: variance pass -> rsigma (skipped when %rs25 == 0) ----
$L__BB0_22:
setp.eq.s16 %p17, %rs25, 0;
mov.f32 %f188, 0f3F800000;        // rsigma defaults to 1.0 if the pass is skipped
@%p17 bra $L__BB0_44;
setp.ge.s32 %p18, %r256, %r116;
mov.f32 %f182, 0f00000000;
@%p18 bra $L__BB0_30;
not.b32 %r157, %r256;
add.s32 %r20, %r116, %r157;       // same count - tid - 1 trip-count computation as phase 1
shr.u32 %r158, %r20, 10;
add.s32 %r159, %r158, 1;
and.b32 %r234, %r159, 3;
setp.eq.s32 %p19, %r234, 0;
mov.f32 %f182, 0f00000000;
mov.u32 %r235, %r256;
@%p19 bra $L__BB0_27;
add.s32 %r160, %r256, %r3;
mul.wide.s32 %rd47, %r160, 4;
add.s64 %rd91, %rd1, %rd47;
mov.u32 %r235, %r256;

$L__BB0_26:                       // remainder loop: accumulate (x - mu)^2 via fma
.pragma "nounroll";
ld.global.f32 %f93, [%rd91];
sub.ftz.f32 %f94, %f93, %f177;
fma.rn.ftz.f32 %f182, %f94, %f94, %f182;
add.s32 %r235, %r235, 1024;
add.s64 %rd91, %rd91, 4096;
add.s32 %r234, %r234, -1;
setp.ne.s32 %p20, %r234, 0;
@%p20 bra $L__BB0_26;

$L__BB0_27:
setp.lt.u32 %p21, %r20, 3072;
@%p21 bra $L__BB0_30;
add.s32 %r161, %r235, %r3;
mul.wide.s32 %rd48, %r161, 4;
add.s64 %rd49, %rd1, %rd48;
add.s64 %rd92, %rd49, 8192;

$L__BB0_29:                       // main variance loop, unrolled 4x
ld.global.f32 %f95, [%rd92+-8192];
sub.ftz.f32 %f96, %f95, %f177;
fma.rn.ftz.f32 %f97, %f96, %f96, %f182;
ld.global.f32 %f98, [%rd92+-4096];
sub.ftz.f32 %f99, %f98, %f177;
fma.rn.ftz.f32 %f100, %f99, %f99, %f97;
ld.global.f32 %f101, [%rd92];
sub.ftz.f32 %f102, %f101, %f177;
fma.rn.ftz.f32 %f103, %f102, %f102, %f100;
ld.global.f32 %f104, [%rd92+4096];
sub.ftz.f32 %f105, %f104, %f177;
fma.rn.ftz.f32 %f182, %f105, %f105, %f103;
add.s64 %rd92, %rd92, 16384;
add.s32 %r235, %r235, 4096;
setp.lt.s32 %p22, %r235, %r116;
@%p22 bra $L__BB0_29;

// ---- phase 2b: block-wide reduction (same pattern as the mean reduction) ----
$L__BB0_30:
mul.ftz.f32 %f187, %f70, %f182;   // scaled sum of squared deviations
mov.u32 %r238, WARP_SZ;
setp.lt.s32 %p23, %r238, 2;
@%p23 bra $L__BB0_33;
mov.u32 %r237, %r238;

$L__BB0_32:
mov.b32 %r162, %f187;
shr.u32 %r163, %r237, 31;
add.s32 %r164, %r237, %r163;
shr.s32 %r31, %r164, 1;
mov.u32 %r165, 31;
mov.u32 %r166, -1;
shfl.sync.down.b32 %r167|%p24, %r162, %r31, %r165, %r166;
mov.b32 %f106, %r167;
add.ftz.f32 %f187, %f187, %f106;
setp.gt.s32 %p25, %r237, 3;
mov.u32 %r237, %r31;
@%p25 bra $L__BB0_32;

$L__BB0_33:
rem.u32 %r32, %r256, %r238;
setp.ne.s32 %p26, %r32, 0;
@%p26 bra $L__BB0_35;
div.u32 %r168, %r256, %r238;
shl.b32 %r169, %r168, 2;
mov.u32 %r170, _ZZ20LayerNormLargeKernelIffEvN8dxml_mvn15MVNKernelParamsEE4temp;
add.s32 %r171, %r170, %r169;
st.shared.f32 [%r171], %f187;

$L__BB0_35:
bar.sync 0;
setp.le.u32 %p27, %r238, %r256;
@%p27 bra $L__BB0_41;
mov.u32 %r172, %ntid.x;
div.u32 %r173, %r172, %r238;
setp.ge.s32 %p28, %r32, %r173;
mov.f32 %f187, 0f00000000;
@%p28 bra $L__BB0_38;
shl.b32 %r174, %r32, 2;
mov.u32 %r175, _ZZ20LayerNormLargeKernelIffEvN8dxml_mvn15MVNKernelParamsEE4temp;
add.s32 %r176, %r175, %r174;
ld.shared.f32 %f187, [%r176];

$L__BB0_38:
@%p23 bra $L__BB0_41;

$L__BB0_40:
mov.b32 %r177, %f187;
shr.u32 %r178, %r238, 31;
add.s32 %r179, %r238, %r178;
shr.s32 %r34, %r179, 1;
mov.u32 %r180, 31;
mov.u32 %r181, -1;
shfl.sync.down.b32 %r182|%p30, %r177, %r34, %r180, %r181;
mov.b32 %f108, %r182;
add.ftz.f32 %f187, %f187, %f108;
setp.gt.s32 %p31, %r238, 3;
mov.u32 %r238, %r34;
@%p31 bra $L__BB0_40;

$L__BB0_41:
bar.sync 0;
setp.ne.s32 %p32, %r256, 0;
@%p32 bra $L__BB0_43;
add.ftz.f32 %f109, %f71, %f187;   // variance + %f71 (presumably epsilon)
rsqrt.approx.ftz.f32 %f110, %f109;
st.shared.f32 [_ZZ20LayerNormLargeKernelIffEvN8dxml_mvn15MVNKernelParamsEE6rsigma], %f110; // thread 0 publishes rsigma = 1/sqrt(var + eps)
$L__BB0_43:
bar.sync 0;
ld.shared.f32 %f188, [_ZZ20LayerNormLargeKernelIffEvN8dxml_mvn15MVNKernelParamsEE6rsigma];

// ---- phase 3: out = scale * (x - mu) * rsigma + bias (+ optional clamp) ----
$L__BB0_44:
setp.ge.s32 %p33, %r256, %r116;
@%p33 bra $L__BB0_104;
setp.eq.s16 %p34, %rs13, 0;
setp.ne.s16 %p35, %rs19, 0;
selp.b32 %r37, %r1, 0, %p35;      // %rs19 decides whether the block index enters the scale/bias index
mul.lo.s32 %r38, %r127, %r125;
mul.lo.s32 %r39, %r126, %r127;    // dim products used to decompose a linear index into coordinates
not.b32 %r183, %r256;
add.s32 %r40, %r116, %r183;       // count - tid - 1, reused by both output paths
@%p34 bra $L__BB0_78;             // %rs13 == 0: linear scale/bias indexing (fast path)
// NOTE(review): the next statement appears malformed -- a three-operand
// and.b32 with no destination. %r184 is tested immediately below and is never
// written elsewhere in this fragment, so the original was almost certainly
// "and.b32 %r184, %r40, 1024;" and the destination operand was lost when the
// file was reflowed. Left byte-identical here; restore from the original PTX.
and.b32 %r40, 1024;
setp.ne.s32 %p36, %r184, 0;
@%p36 bra $L__BB0_57;             // odd leftover iteration: peel one element first
// strided path, first (peeled) element: decompose the index per layout flags
setp.eq.s16 %p37, %rs21, 0;
@%p37 bra $L__BB0_51;
setp.eq.s16 %p38, %rs20, 0;
@%p38 bra $L__BB0_50;
div.u32 %r240, %r256, %r39;       // coordinate split: tid / (%r126 * %r127), remainder / %r127
mul.lo.s32 %r185, %r240, %r39;
sub.s32 %r186, %r256, %r185;
div.u32 %r241, %r186, %r127;
mul.lo.s32 %r187, %r241, %r127;
sub.s32 %r242, %r186, %r187;
mov.u32 %r239, %r37;
bra.uni $L__BB0_52;

// ---- linear path (%rs13 == 0): scale/bias indexed directly by element ----
$L__BB0_78:
shr.u32 %r221, %r40, 10;
add.s32 %r222, %r221, 1;
and.b32 %r255, %r222, 3;          // iterations % 4, as in the reduction loops
setp.eq.s32 %p60, %r255, 0;
@%p60 bra $L__BB0_85;
add.s32 %r253, %r256, %r3;
mul.wide.s32 %rd67, %r253, 4;
add.s64 %rd96, %rd4, %rd67;       // output cursor
add.s64 %rd95, %rd1, %rd67;       // input cursor

$L__BB0_80:                       // remainder loop: one normalized element per trip
.pragma "nounroll";
selp.b32 %r99, %r253, %r256, %p35; // scale/bias index: row-global or row-local per %rs19
setp.eq.s16 %p62, %rs27, 0;
mov.f32 %f195, 0f3F800000;        // scale defaults to 1.0 when absent
@%p62 bra $L__BB0_82;
mul.wide.u32 %rd68, %r99, 4;
add.s64 %rd69, %rd2, %rd68;
ld.global.f32 %f195, [%rd69];
$L__BB0_82:
setp.eq.s16 %p63, %rs26, 0;
mov.f32 %f196, 0f00000000;        // bias defaults to 0.0 when absent
@%p63 bra $L__BB0_84;
mul.wide.u32 %rd70, %r99, 4;
add.s64 %rd71, %rd3, %rd70;
ld.global.f32 %f196, [%rd71];
$L__BB0_84:
ld.global.f32 %f134, [%rd95];
sub.ftz.f32 %f135, %f134, %f177;
mul.ftz.f32 %f136, %f188, %f135;  // (x - mu) * rsigma
fma.rn.ftz.f32 %f137, %f195, %f136, %f196; // scale * t + bias
setp.lt.ftz.f32 %p64, %f137, 0f00000000;
setp.ne.s16 %p65, %rs18, 0;
and.pred %p66, %p65, %p64;
selp.f32 %f138, 0f00000000, %f137, %p66;   // %rs18 != 0: clamp negative results to 0
st.global.f32 [%rd96], %f138;
add.s32 %r256, %r256, 1024;
add.s64 %rd96, %rd96, 4096;
add.s64 %rd95, %rd95, 4096;
add.s32 %r253, %r253, 1024;
add.s32 %r255, %r255, -1;
setp.ne.s32 %p67, %r255, 0;
@%p67 bra $L__BB0_80;

$L__BB0_85:
setp.lt.u32 %p68, %r40, 3072;
@%p68 bra $L__BB0_104;
add.s32 %r223, %r256, %r3;
add.s32 %r257, %r223, 3072;       // row-global index of the 4th element of each trip
mul.wide.s32 %rd72, %r223, 4;
add.s64 %rd97, %rd1, %rd72;
add.s64 %rd98, %rd4, %rd72;

$L__BB0_87:                       // main linear loop, unrolled 4x; element 1 of 4
add.s32 %r224, %r3, %r256;
selp.b32 %r107, %r224, %r256, %p35;
setp.eq.s16 %p70, %rs27, 0;
mov.f32 %f197, 0f3F800000;
@%p70 bra $L__BB0_89;
mul.wide.u32 %rd73, %r107, 4;
add.s64 %rd74, %rd2, %rd73;
ld.global.f32 %f197, [%rd74];
$L__BB0_89:
setp.eq.s16 %p71, %rs26, 0;
mov.f32 %f198, 0f00000000;
@%p71 bra $L__BB0_91;
mul.wide.u32 %rd75, %r107, 4;
add.s64 %rd76, %rd3, %rd75;
ld.global.f32 %f198, [%rd76];
$L__BB0_91:
ld.global.f32 %f142, [%rd97];
sub.ftz.f32 %f143, %f142, %f177;
mul.ftz.f32 %f144, %f188, %f143;
fma.rn.ftz.f32 %f145, %f197, %f144, %f198;
setp.lt.ftz.f32 %p72, %f145, 0f00000000;
setp.ne.s16 %p73, %rs18, 0;
and.pred %p74, %p73, %p72;
selp.f32 %f146, 0f00000000, %f145, %p74;
st.global.f32 [%rd98], %f146;
// element 2 of 4
add.s32 %r108, %r256, 1024;
add.s32 %r225, %r257, -2048;
selp.b32 %r109, %r225, %r108, %p35;
mov.f32 %f199, 0f3F800000;
@%p70 bra $L__BB0_93;
mul.wide.u32 %rd77, %r109, 4;
add.s64 %rd78, %rd2, %rd77;
ld.global.f32 %f199, [%rd78];
$L__BB0_93:
mov.f32 %f200, 0f00000000;
@%p71 bra $L__BB0_95;
mul.wide.u32 %rd79, %r109, 4;
add.s64 %rd80, %rd3, %rd79;
ld.global.f32 %f200, [%rd80];
$L__BB0_95:
ld.global.f32 %f149, [%rd97+4096];
sub.ftz.f32 %f150, %f149, %f177;
mul.ftz.f32 %f151, %f188, %f150;
fma.rn.ftz.f32 %f152, %f199, %f151, %f200;
setp.lt.ftz.f32 %p78, %f152, 0f00000000;
and.pred %p80, %p73, %p78;
selp.f32 %f153, 0f00000000, %f152, %p80;
st.global.f32 [%rd98+4096], %f153;
// element 3 of 4
add.s32 %r110, %r108, 1024;
add.s32 %r226, %r257, -1024;
selp.b32 %r111, %r226, %r110, %p35;
mov.f32 %f201, 0f3F800000;
@%p70 bra $L__BB0_97;
mul.wide.u32 %rd81, %r111, 4;
add.s64 %rd82, %rd2, %rd81;
ld.global.f32 %f201, [%rd82];
$L__BB0_97:
mov.f32 %f202, 0f00000000;
@%p71 bra $L__BB0_99;
mul.wide.u32 %rd83, %r111, 4;
add.s64 %rd84, %rd3, %rd83;
ld.global.f32 %f202, [%rd84];
$L__BB0_99:
ld.global.f32 %f156, [%rd97+8192];
sub.ftz.f32 %f157, %f156, %f177;
mul.ftz.f32 %f158, %f188, %f157;
fma.rn.ftz.f32 %f159, %f201, %f158, %f202;
setp.lt.ftz.f32 %p84, %f159, 0f00000000;
and.pred %p86, %p73, %p84;
selp.f32 %f160, 0f00000000, %f159, %p86;
st.global.f32 [%rd98+8192], %f160;
// element 4 of 4
add.s32 %r112, %r110, 1024;
selp.b32 %r113, %r257, %r112, %p35;
mov.f32 %f203, 0f3F800000;
@%p70 bra $L__BB0_101;
mul.wide.u32 %rd85, %r113, 4;
add.s64 %rd86, %rd2, %rd85;
ld.global.f32 %f203, [%rd86];
$L__BB0_101:
mov.f32 %f204, 0f00000000;
@%p71 bra $L__BB0_103;
mul.wide.u32 %rd87, %r113, 4;
add.s64 %rd88, %rd3, %rd87;
ld.global.f32 %f204, [%rd88];
$L__BB0_103:
add.s64 %rd38, %rd97, 16384;
ld.global.f32 %f162, [%rd97+12288];
sub.ftz.f32 %f163, %f162, %f177;
mul.ftz.f32 %f164, %f188, %f163;
fma.rn.ftz.f32 %f165, %f203, %f164, %f204;
setp.lt.ftz.f32 %p90, %f165, 0f00000000;
and.pred %p92, %p73, %p90;
selp.f32 %f166, 0f00000000, %f165, %p92;
add.s64 %rd39, %rd98, 16384;
st.global.f32 [%rd98+12288], %f166;
add.s32 %r257, %r257, 4096;
add.s32 %r256, %r112, 1024;
setp.lt.s32 %p93, %r256, %r116;
mov.u64 %rd97, %rd38;
mov.u64 %rd98, %rd39;
@%p93 bra $L__BB0_87;
bra.uni $L__BB0_104;

// ---- strided path: remaining coordinate-decomposition variants for the peeled element ----
$L__BB0_51:                       // variant: %rs21 == 0
div.u32 %r239, %r37, %r125;
mul.lo.s32 %r191, %r239, %r125;
sub.s32 %r240, %r37, %r191;
div.u32 %r241, %r256, %r127;
mul.lo.s32 %r192, %r241, %r127;
sub.s32 %r242, %r256, %r192;
bra.uni $L__BB0_52;

$L__BB0_50:                       // variant: %rs21 != 0, %rs20 == 0
div.u32 %r241, %r256, %r38;
mul.lo.s32 %r188, %r241, %r38;
sub.s32 %r189, %r256, %r188;
div.u32 %r242, %r189, %r125;
mul.lo.s32 %r190, %r242, %r125;
sub.s32 %r240, %r189, %r190;
mov.u32 %r239, %r37;

$L__BB0_52:                       // scale/bias index = dot(coords, strides %r128..%r130, %r124)
mul.lo.s32 %r193, %r239, %r128;
mad.lo.s32 %r194, %r240, %r129, %r193;
mad.lo.s32 %r195, %r241, %r130, %r194;
mad.lo.s32 %r196, %r242, %r124, %r195;
cvt.u64.u32 %rd17, %r196;
setp.eq.s16 %p39, %rs27, 0;
mov.f32 %f189, 0f3F800000;
@%p39 bra $L__BB0_54;
shl.b64 %rd50, %rd17, 2;
add.s64 %rd51, %rd2, %rd50;
ld.global.f32 %f189, [%rd51];
$L__BB0_54:
setp.eq.s16 %p40, %rs26, 0;
mov.f32 %f190, 0f00000000;
@%p40 bra $L__BB0_56;
shl.b64 %rd52, %rd17, 2;
add.s64 %rd53, %rd3, %rd52;
ld.global.f32 %f190, [%rd53];
$L__BB0_56:                       // normalize and store the peeled element
add.s32 %r197, %r256, %r3;
mul.wide.s32 %rd54, %r197, 4;
add.s64 %rd55, %rd1, %rd54;
ld.global.f32 %f113, [%rd55];
sub.ftz.f32 %f114, %f113, %f177;
mul.ftz.f32 %f115, %f188, %f114;
fma.rn.ftz.f32 %f116, %f189, %f115, %f190;
setp.lt.ftz.f32 %p41, %f116, 0f00000000;
setp.ne.s16 %p42, %rs18, 0;
and.pred %p43, %p42, %p41;
selp.f32 %f117, 0f00000000, %f116, %p43;
add.s64 %rd56, %rd4, %rd54;
st.global.f32 [%rd56], %f117;
add.s32 %r256, %r256, 1024;

$L__BB0_57:                       // strided-path main loop entry, 2 elements per trip
setp.lt.u32 %p44, %r40, 1024;
@%p44 bra $L__BB0_104;
add.s32 %r198, %r256, %r3;
mul.wide.s32 %rd57, %r198, 4;
add.s64 %rd58, %rd57, 4096;       // cursors biased one stride ahead (loads use -4096 / +4096)
add.s64 %rd94, %rd4, %rd58;
add.s64 %rd93, %rd1, %rd58;

$L__BB0_59:                       // element 1 of 2: recompute coordinates per layout flags
setp.eq.s16 %p45, %rs21, 0;
@%p45 bra $L__BB0_63;
setp.eq.s16 %p46, %rs20, 0;
@%p46 bra $L__BB0_62;
div.u32 %r246, %r256, %r39;
mul.lo.s32 %r199, %r246, %r39;
sub.s32 %r200, %r256, %r199;
div.u32 %r247, %r200, %r127;
mul.lo.s32 %r201, %r247, %r127;
sub.s32 %r248, %r200, %r201;
mov.u32 %r245, %r37;
bra.uni $L__BB0_64;
$L__BB0_63:
div.u32 %r245, %r37, %r125;
mul.lo.s32 %r205, %r245, %r125;
sub.s32 %r246, %r37, %r205;
div.u32 %r247, %r256, %r127;
mul.lo.s32 %r206, %r247, %r127;
sub.s32 %r248, %r256, %r206;
bra.uni $L__BB0_64;
$L__BB0_62:
div.u32 %r247, %r256, %r38;
mul.lo.s32 %r202, %r247, %r38;
sub.s32 %r203, %r256, %r202;
div.u32 %r248, %r203, %r125;
mul.lo.s32 %r204, %r248, %r125;
sub.s32 %r246, %r203, %r204;
mov.u32 %r245, %r37;
$L__BB0_64:
mul.lo.s32 %r207, %r245, %r128;
mad.lo.s32 %r208, %r246, %r129, %r207;
mad.lo.s32 %r209, %r247, %r130, %r208;
mad.lo.s32 %r76, %r248, %r124, %r209;
setp.eq.s16 %p47, %rs27, 0;
mov.f32 %f191, 0f3F800000;
@%p47 bra $L__BB0_66;
mul.wide.u32 %rd59, %r76, 4;
add.s64 %rd60, %rd2, %rd59;
ld.global.f32 %f191, [%rd60];
$L__BB0_66:
setp.eq.s16 %p48, %rs26, 0;
mov.f32 %f192, 0f00000000;
@%p48 bra $L__BB0_68;
mul.wide.u32 %rd61, %r76, 4;
add.s64 %rd62, %rd3, %rd61;
ld.global.f32 %f192, [%rd62];
$L__BB0_68:
add.s64 %rd22, %rd93, -4096;
ld.global.f32 %f120, [%rd93+-4096];
sub.ftz.f32 %f121, %f120, %f177;
mul.ftz.f32 %f122, %f188, %f121;
fma.rn.ftz.f32 %f123, %f191, %f122, %f192;
setp.lt.ftz.f32 %p49, %f123, 0f00000000;
setp.ne.s16 %p50, %rs18, 0;
and.pred %p51, %p50, %p49;
selp.f32 %f124, 0f00000000, %f123, %p51;
add.s64 %rd23, %rd94, -4096;
st.global.f32 [%rd94+-4096], %f124;
// element 2 of 2: same decomposition for index tid + 1024
add.s32 %r77, %r256, 1024;
@%p45 bra $L__BB0_72;
setp.eq.s16 %p53, %rs20, 0;
@%p53 bra $L__BB0_71;
div.u32 %r250, %r77, %r39;
mul.lo.s32 %r210, %r250, %r39;
sub.s32 %r211, %r77, %r210;
div.u32 %r251, %r211, %r127;
mul.lo.s32 %r212, %r251, %r127;
sub.s32 %r252, %r211, %r212;
mov.u32 %r249, %r37;
bra.uni $L__BB0_73;
$L__BB0_72:
div.u32 %r249, %r37, %r125;
mul.lo.s32 %r216, %r249, %r125;
sub.s32 %r250, %r37, %r216;
div.u32 %r251, %r77, %r127;
mul.lo.s32 %r217, %r251, %r127;
sub.s32 %r252, %r77, %r217;
bra.uni $L__BB0_73;
$L__BB0_71:
div.u32 %r251, %r77, %r38;
mul.lo.s32 %r213, %r251, %r38;
sub.s32 %r214, %r77, %r213;
div.u32 %r252, %r214, %r125;
mul.lo.s32 %r215, %r252, %r125;
sub.s32 %r250, %r214, %r215;
mov.u32 %r249, %r37;
$L__BB0_73:
mul.lo.s32 %r218, %r249, %r128;
mad.lo.s32 %r219, %r250, %r129, %r218;
mad.lo.s32 %r220, %r251, %r130, %r219;
mad.lo.s32 %r92, %r252, %r124, %r220;
mov.f32 %f193, 0f3F800000;
@%p47 bra $L__BB0_75;
mul.wide.u32 %rd63, %r92, 4;
add.s64 %rd64, %rd2, %rd63;
ld.global.f32 %f193, [%rd64];
$L__BB0_75:
mov.f32 %f194, 0f00000000;
@%p48 bra $L__BB0_77;
mul.wide.u32 %rd65, %r92, 4;
add.s64 %rd66, %rd3, %rd65;
ld.global.f32 %f194, [%rd66];
$L__BB0_77:
ld.global.f32 %f127, [%rd22+4096];
sub.ftz.f32 %f128, %f127, %f177;
mul.ftz.f32 %f129, %f188, %f128;
fma.rn.ftz.f32 %f130, %f193, %f129, %f194;
setp.lt.ftz.f32 %p56, %f130, 0f00000000;
and.pred %p58, %p50, %p56;
selp.f32 %f131, 0f00000000, %f130, %p58;
st.global.f32 [%rd23+4096], %f131;
add.s64 %rd94, %rd94, 8192;
add.s64 %rd93, %rd93, 8192;
add.s32 %r256, %r256, 2048;       // advance two stride-1024 steps per trip
setp.lt.s32 %p59, %r256, %r116;
@%p59 bra $L__BB0_59;

$L__BB0_104:
ret;
}