// =====================================================================================
// depthwise_conv_fp32_nchw_kernel -- compiler-generated PTX (.version 7.8, .target sm_60,
// 64-bit addresses). Nothing below this comment block has been edited by hand.
//
// NOTE(review): this copy appears to have had its newlines collapsed: the `//` sequences
// on the first code line sit in front of `.version`, `.target` and `.globl`, which looks
// like several original comment/directive lines fused together. Presumably the text has
// to be re-split before it can be assembled -- TODO confirm against the original output.
//
// INTERFACE
//   One by-value parameter block, param_0, 152 bytes, 8-byte aligned. Fields read here
//   (names in parentheses are inferred from usage and the kernel name -- presumably
//   correct, verify against the host-side struct):
//     +0    u64  pointer converted to global; destination of every st.global (output)
//     +8    u64  pointer -> %rd3; first factor of each fma                  (input)
//     +16   u64  pointer -> %rd4; second factor of each fma                 (weights)
//     +24   u64  pointer -> %rd2; if zero the add at $L__BB0_76 is skipped  (bias)
//     +32   s32  trip count of the outermost loop, %r320                    (batch)
//     +36   s32  upper bound for the eight z indices                        (channels)
//     +40   s32  upper bound for the y thread index                         (out height)
//     +44   s32  upper bound for the x thread index                         (out width)
//     +48   s32  upper bound for the computed source row                    (in height)
//     +52   s32  upper bound for the computed source column                 (in width)
//     +56   s32  trip count of the tap-row loop                             (kernel h)
//     +60   s32  trip count of the tap-column loop                          (kernel w)
//     +64/+68/+72   s32  destination strides for outer index / z index / y
//     +80/+84/+88   s32  first-factor strides for outer index / z index / row
//     +96/+100      s32  second-factor strides for z index / tap row
//     +104/+108     s32  y/x multipliers in the $L__BB0_53 path, divisors elsewhere (stride h/w)
//     +112/+116     s32  multipliers applied to the tap row/column index    (dilation h/w)
//     +120  s32  offset combined with x                                     (pad, w)
//     +132  s32  offset combined with y                                     (pad, h)
//     +136  f32  %f183: comparison threshold used at $L__BB0_93
//     +140  f32  %f184: multiplier used at $L__BB0_93
//     +144  u8   %rs6: zero selects $L__BB0_53, non-zero selects $L__BB0_28 / $L__BB0_5
//     +145  u8   %rs7 (also loaded as %rs5): when zero, tap indices are reversed (count-1-k)
//     +146  u8   %rs8: together with %rs7 chooses between $L__BB0_28 and $L__BB0_5
//     +147  u8   %rs9: loaded, not read anywhere in this text
//   Several fields are loaded twice under different register names (e.g. +56 as %r113
//   and %r12, +60 as %r114 and %r24); both names always hold the same value.
//
// THREAD MAPPING
//   %r1 = ctaid.x*ntid.x + tid.x, %r2 = ctaid.y*ntid.y + tid.y,
//   %r3 = (ctaid.z*ntid.z + tid.z) << 3.  A thread returns immediately when
//   %r1 >= [+44], %r2 >= [+40] or [+32] < 1. Otherwise it handles the eight consecutive
//   z indices %r3 .. %r3+7 (%r35,%r38,%r41,%r44,%r47,%r50,%r53 = %r3+1 .. %r3+7), each
//   individually guarded against [+36]. Accumulators: %f453 <-> %r3 ... %f446 <-> %r3+7.
//
// CONTROL FLOW (per iteration of the outer loop at $L__BB0_3)
//   $L__BB0_53  taken when %rs6 == 0. Source row = tap_row*[+112] + [+104]*y - [+132],
//               source column = tap_col*[+116] + [+108]*x - [+120]; a tap is skipped when
//               either coordinate is negative or not below [+48] / [+52].
//   $L__BB0_28  taken when %rs6 != 0 and (%rs8 != 0 or %rs7 == 0). Uses
//               (y + [+132] - tap_row*[+112]) and (x + [+120] - tap_col*[+116]); a tap
//               contributes only if both are exactly divisible by [+104] / [+108] and the
//               quotients pass the same range checks.
//   $L__BB0_5   taken when %rs6 != 0, %rs8 == 0 and %rs7 != 0. Tap indices start at
//               (y + [+132]) rem [+104] and (x + [+120]) rem [+108] and advance by
//               [+104] / [+108]; the [+112]/[+116] fields are not used in this path.
//   $L__BB0_76  if the +24 pointer is non-zero, adds element (z index) of that array.
//   $L__BB0_93  per accumulator v: result = (v > %f183 or unordered) ? v : %f184 * v.
//   $L__BB0_95..$L__BB0_107  guarded stores to
//               [+0] + 4*([+64]*outer + x + [+68]*z + [+72]*y).
//   $L__BB0_109 increments the outer index and loops while it is below [+32].
//   All floating-point arithmetic is emitted with the .ftz modifier.
// =====================================================================================
// ---- physical line 1: header remnants, directives, entry signature, register
// ---- declarations, thread-index computation, early exits, first parameter loads.
Compiler Build ID: CL-31678015 // Cuda compilation tools, release 11.8, V11.8.85 // Based on NVVM 7.0.1 // .version 7.8 .target sm_60 .address_size 64 // .globl depthwise_conv_fp32_nchw_kernel .visible .entry depthwise_conv_fp32_nchw_kernel( .param .align 8 .b8 depthwise_conv_fp32_nchw_kernel_param_0[152] ) { .reg .pred %p<94>; .reg .b16 %rs<17>; .reg .f32 %f<469>; .reg .b32 %r<327>; .reg .b64 %rd<132>; mov.b64 %rd7, depthwise_conv_fp32_nchw_kernel_param_0; mov.u64 %rd1, %rd7; mov.u32 %r97, %ntid.x; mov.u32 %r98, %ctaid.x; mov.u32 %r99, %tid.x; mad.lo.s32 %r1, %r98, %r97, %r99; mov.u32 %r100, %ntid.y; mov.u32 %r101, %ctaid.y; mov.u32 %r102, %tid.y; mad.lo.s32 %r2, %r101, %r100, %r102; mov.u32 %r103, %ntid.z; mov.u32 %r104, %ctaid.z; mov.u32 %r105, %tid.z; mad.lo.s32 %r106, %r104, %r103, %r105; shl.b32 %r3, %r106, 3; ld.param.v2.u32 {%r107, %r108}, [depthwise_conv_fp32_nchw_kernel_param_0+40]; setp.ge.s32 %p1, %r1, %r108; setp.ge.s32 %p2, %r2, %r107; or.pred %p3, %p1, %p2; @%p3 bra $L__BB0_110; ld.param.u32 %r111, [%rd1+32]; setp.lt.s32 %p4, %r111, 1; @%p4 bra $L__BB0_110; ld.param.v4.u8 {%rs6, %rs7, %rs8, %rs9}, [%rd1+144]; ld.param.v2.u32 {%r113, %r114}, [%rd1+56]; mov.u32 %r320, 0; ld.param.u64 %rd2, [%rd1+24]; ld.param.u32 %r115, [%rd1+132]; add.s32 %r6, %r115, %r2; ld.param.u32 %r116, [%rd1+120]; add.s32 %r7, %r116, %r1; ld.param.v2.u32 {%r117, %r118}, [%rd1+104]; mul.lo.s32 %r119, %r118, %r1; sub.s32 %r10, %r119, %r116; ld.param.u32 %r120, [%rd1+104]; mul.lo.s32 %r121, %r120, %r2; sub.s32 %r11, %r121, %r115; ld.param.u8 %rs5, [%rd1+145]; ld.param.u32 %r12, [%rd1+56]; ld.param.v2.u32 {%r122, %r123}, [%rd1+112]; ld.param.u32 %r16, [%rd1+52]; ld.param.v2.u32 {%r124, %r125}, [%rd1+48]; ld.param.u64 %rd8, [%rd1+8]; cvta.to.global.u64 %rd3, %rd8; ld.param.u64 %rd9, [%rd1+16]; cvta.to.global.u64 %rd4, %rd9; ld.param.u32 %r19, [%rd1+36]; ld.param.v2.u32 {%r126, %r127}, [%rd1+80]; ld.param.u32 %r22, [%rd1+88]; ld.param.v2.u32 {%r128, %r129}, [%rd1+96]; ld.param.u32 
// ---- physical line 2 (continues the ld.param above): remaining parameter loads;
// ---- per-z-index offsets hoisted out of the loops (%r33,%r36..%r54 = [+84]*(z+k),
// ---- %r34,%r37..%r55 = [+96]*(z+k), %r136..%r139,%r56..%r58 = [+68]*(z+k) for k>=1),
// ---- %rd6 = global([+24]) + 4*z; $L__BB0_3 dispatch; head of the $L__BB0_28 path.
%r24, [%rd1+60]; ld.param.v2.f32 {%f183, %f184}, [%rd1+136]; cvta.to.global.u64 %rd10, %rd2; ld.param.u32 %r25, [%rd1+112]; ld.param.u32 %r26, [%rd1+48]; ld.param.u64 %rd5, [%rd1]; ld.param.v2.u32 {%r131, %r132}, [%rd1+32]; ld.param.v2.u32 {%r133, %r134}, [%rd1+64]; ld.param.u32 %r135, [%rd1+72]; mul.lo.s32 %r32, %r135, %r2; mul.lo.s32 %r33, %r127, %r3; mul.lo.s32 %r34, %r128, %r3; add.s32 %r35, %r3, 1; add.s32 %r36, %r33, %r127; add.s32 %r37, %r34, %r128; add.s32 %r38, %r3, 2; add.s32 %r39, %r36, %r127; add.s32 %r40, %r37, %r128; add.s32 %r41, %r3, 3; add.s32 %r42, %r39, %r127; add.s32 %r43, %r40, %r128; add.s32 %r44, %r3, 4; add.s32 %r45, %r42, %r127; add.s32 %r46, %r43, %r128; add.s32 %r47, %r3, 5; add.s32 %r48, %r45, %r127; add.s32 %r49, %r46, %r128; add.s32 %r50, %r3, 6; add.s32 %r51, %r48, %r127; add.s32 %r52, %r49, %r128; add.s32 %r53, %r3, 7; add.s32 %r54, %r51, %r127; add.s32 %r55, %r52, %r128; mul.wide.s32 %rd11, %r3, 4; add.s64 %rd6, %rd10, %rd11; mad.lo.s32 %r136, %r134, %r3, %r134; add.s32 %r137, %r136, %r134; add.s32 %r138, %r137, %r134; add.s32 %r139, %r138, %r134; add.s32 %r56, %r139, %r134; add.s32 %r57, %r56, %r134; add.s32 %r58, %r57, %r134; $L__BB0_3: setp.eq.s16 %p5, %rs6, 0; @%p5 bra $L__BB0_53; setp.ne.s16 %p6, %rs8, 0; setp.eq.s16 %p7, %rs7, 0; mov.f32 %f453, 0f00000000; mov.f32 %f452, 0f00000000; mov.f32 %f451, 0f00000000; mov.f32 %f450, 0f00000000; mov.f32 %f449, 0f00000000; mov.f32 %f448, 0f00000000; mov.f32 %f447, 0f00000000; mov.f32 %f446, 0f00000000; or.pred %p8, %p6, %p7; @%p8 bra $L__BB0_28; bra.uni $L__BB0_5; $L__BB0_28: setp.lt.s32 %p26, %r113, 1; @%p26 bra $L__BB0_76; mov.u32 %r323, 0; mov.f32 %f446, 0f00000000; mov.f32 %f447, %f446; mov.f32 %f448, %f446; mov.f32 %f449, %f446; mov.f32 %f450, %f446; mov.f32 %f451, %f446; mov.f32 %f452, %f446; mov.f32 %f453, %f446; $L__BB0_30: setp.lt.s32 %p27, %r114, 1; @%p27 bra $L__BB0_52; setp.eq.s16 %p28, %rs5, 0; not.b32 %r177, %r323; add.s32 %r178, %r113, %r177; mul.lo.s32 %r74, %r129, %r323; 
// ---- physical line 3: $L__BB0_28 path, tap-column loop $L__BB0_32 (%r324): divisibility
// ---- and range checks, then guarded fma for z+0 .. z+3 ($L__BB0_37/39/41).
selp.b32 %r179, %r178, %r323, %p28; mul.lo.s32 %r180, %r179, %r25; sub.s32 %r75, %r6, %r180; rem.s32 %r76, %r75, %r117; mov.u32 %r324, 0; $L__BB0_32: setp.eq.s16 %p92, %rs5, 0; not.b32 %r181, %r324; add.s32 %r182, %r114, %r181; selp.b32 %r183, %r182, %r324, %p92; mul.lo.s32 %r184, %r183, %r123; sub.s32 %r78, %r7, %r184; setp.ne.s32 %p30, %r76, 0; @%p30 bra $L__BB0_51; div.s32 %r79, %r75, %r117; rem.s32 %r185, %r78, %r118; setp.ne.s32 %p31, %r185, 0; @%p31 bra $L__BB0_51; div.s32 %r80, %r78, %r118; setp.le.s32 %p32, %r125, %r80; or.b32 %r186, %r80, %r79; setp.lt.s32 %p33, %r186, 0; or.pred %p34, %p33, %p32; setp.le.s32 %p35, %r124, %r79; or.pred %p36, %p34, %p35; @%p36 bra $L__BB0_51; mul.lo.s32 %r317, %r126, %r320; setp.ge.s32 %p37, %r3, %r19; mul.lo.s32 %r81, %r22, %r79; add.s32 %r82, %r317, %r80; @%p37 bra $L__BB0_37; add.s32 %r187, %r82, %r33; add.s32 %r188, %r187, %r81; mul.wide.s32 %rd44, %r188, 4; add.s64 %rd45, %rd3, %rd44; add.s32 %r189, %r34, %r324; add.s32 %r190, %r189, %r74; mul.wide.s32 %rd46, %r190, 4; add.s64 %rd47, %rd4, %rd46; ld.global.f32 %f233, [%rd47]; ld.global.f32 %f234, [%rd45]; fma.rn.ftz.f32 %f453, %f234, %f233, %f453; $L__BB0_37: setp.ge.s32 %p38, %r35, %r19; @%p38 bra $L__BB0_39; add.s32 %r191, %r82, %r36; add.s32 %r192, %r191, %r81; mul.wide.s32 %rd48, %r192, 4; add.s64 %rd49, %rd3, %rd48; add.s32 %r193, %r37, %r324; add.s32 %r194, %r193, %r74; mul.wide.s32 %rd50, %r194, 4; add.s64 %rd51, %rd4, %rd50; ld.global.f32 %f235, [%rd51]; ld.global.f32 %f236, [%rd49]; fma.rn.ftz.f32 %f452, %f236, %f235, %f452; $L__BB0_39: setp.ge.s32 %p39, %r38, %r19; @%p39 bra $L__BB0_41; add.s32 %r195, %r82, %r39; add.s32 %r196, %r195, %r81; mul.wide.s32 %rd52, %r196, 4; add.s64 %rd53, %rd3, %rd52; add.s32 %r197, %r40, %r324; add.s32 %r198, %r197, %r74; mul.wide.s32 %rd54, %r198, 4; add.s64 %rd55, %rd4, %rd54; ld.global.f32 %f237, [%rd55]; ld.global.f32 %f238, [%rd53]; fma.rn.ftz.f32 %f451, %f238, %f237, %f451; $L__BB0_41: setp.ge.s32 %p40, %r41, %r19; @%p40 
// ---- physical line 4 (continues the predicated branch above): $L__BB0_28 path, guarded
// ---- fma for z+3 .. z+7 ($L__BB0_43/45/47/49), tap-column loop latch $L__BB0_51.
bra $L__BB0_43; add.s32 %r199, %r82, %r42; add.s32 %r200, %r199, %r81; mul.wide.s32 %rd56, %r200, 4; add.s64 %rd57, %rd3, %rd56; add.s32 %r201, %r43, %r324; add.s32 %r202, %r201, %r74; mul.wide.s32 %rd58, %r202, 4; add.s64 %rd59, %rd4, %rd58; ld.global.f32 %f239, [%rd59]; ld.global.f32 %f240, [%rd57]; fma.rn.ftz.f32 %f450, %f240, %f239, %f450; $L__BB0_43: setp.ge.s32 %p41, %r44, %r19; @%p41 bra $L__BB0_45; add.s32 %r203, %r82, %r45; add.s32 %r204, %r203, %r81; mul.wide.s32 %rd60, %r204, 4; add.s64 %rd61, %rd3, %rd60; add.s32 %r205, %r46, %r324; add.s32 %r206, %r205, %r74; mul.wide.s32 %rd62, %r206, 4; add.s64 %rd63, %rd4, %rd62; ld.global.f32 %f241, [%rd63]; ld.global.f32 %f242, [%rd61]; fma.rn.ftz.f32 %f449, %f242, %f241, %f449; $L__BB0_45: setp.ge.s32 %p42, %r47, %r19; @%p42 bra $L__BB0_47; add.s32 %r207, %r82, %r48; add.s32 %r208, %r207, %r81; mul.wide.s32 %rd64, %r208, 4; add.s64 %rd65, %rd3, %rd64; add.s32 %r209, %r49, %r324; add.s32 %r210, %r209, %r74; mul.wide.s32 %rd66, %r210, 4; add.s64 %rd67, %rd4, %rd66; ld.global.f32 %f243, [%rd67]; ld.global.f32 %f244, [%rd65]; fma.rn.ftz.f32 %f448, %f244, %f243, %f448; $L__BB0_47: setp.ge.s32 %p43, %r50, %r19; @%p43 bra $L__BB0_49; add.s32 %r211, %r82, %r51; add.s32 %r212, %r211, %r81; mul.wide.s32 %rd68, %r212, 4; add.s64 %rd69, %rd3, %rd68; add.s32 %r213, %r52, %r324; add.s32 %r214, %r213, %r74; mul.wide.s32 %rd70, %r214, 4; add.s64 %rd71, %rd4, %rd70; ld.global.f32 %f245, [%rd71]; ld.global.f32 %f246, [%rd69]; fma.rn.ftz.f32 %f447, %f246, %f245, %f447; $L__BB0_49: setp.ge.s32 %p44, %r53, %r19; @%p44 bra $L__BB0_51; add.s32 %r215, %r82, %r54; add.s32 %r216, %r215, %r81; mul.wide.s32 %rd72, %r216, 4; add.s64 %rd73, %rd3, %rd72; add.s32 %r217, %r55, %r324; add.s32 %r218, %r217, %r74; mul.wide.s32 %rd74, %r218, 4; add.s64 %rd75, %rd4, %rd74; ld.global.f32 %f247, [%rd75]; ld.global.f32 %f248, [%rd73]; fma.rn.ftz.f32 %f446, %f248, %f247, %f446; $L__BB0_51: add.s32 %r324, %r324, 1; setp.lt.s32 %p45, %r324, %r114; @%p45 
// ---- physical line 5 (continues the predicated branch above): tap-row latch $L__BB0_52;
// ---- $L__BB0_53 path (%rs6 == 0): tap-row loop $L__BB0_55 (%r325), tap-column loop
// ---- $L__BB0_57 (%r326), range checks, guarded fma for z+0 and z+1.
bra $L__BB0_32; $L__BB0_52: add.s32 %r323, %r323, 1; setp.lt.s32 %p46, %r323, %r113; @%p46 bra $L__BB0_30; bra.uni $L__BB0_76; $L__BB0_53: mov.f32 %f453, 0f00000000; mov.f32 %f452, 0f00000000; mov.f32 %f451, 0f00000000; mov.f32 %f450, 0f00000000; mov.f32 %f449, 0f00000000; mov.f32 %f448, 0f00000000; mov.f32 %f447, 0f00000000; mov.f32 %f446, 0f00000000; setp.lt.s32 %p47, %r113, 1; @%p47 bra $L__BB0_76; mov.u32 %r325, 0; mov.f32 %f446, 0f00000000; mov.f32 %f447, %f446; mov.f32 %f448, %f446; mov.f32 %f449, %f446; mov.f32 %f450, %f446; mov.f32 %f451, %f446; mov.f32 %f452, %f446; mov.f32 %f453, %f446; $L__BB0_55: setp.lt.s32 %p48, %r114, 1; @%p48 bra $L__BB0_75; setp.eq.s16 %p49, %rs5, 0; not.b32 %r221, %r325; add.s32 %r222, %r12, %r221; mov.u32 %r326, 0; mul.lo.s32 %r223, %r222, %r122; mul.lo.s32 %r224, %r122, %r325; mul.lo.s32 %r87, %r129, %r325; selp.b32 %r225, %r223, %r224, %p49; add.s32 %r88, %r225, %r11; mul.lo.s32 %r89, %r22, %r88; $L__BB0_57: setp.eq.s16 %p93, %rs5, 0; not.b32 %r226, %r326; add.s32 %r227, %r114, %r226; selp.b32 %r228, %r227, %r326, %p93; mad.lo.s32 %r91, %r123, %r228, %r10; setp.le.s32 %p51, %r16, %r91; or.b32 %r229, %r91, %r88; setp.lt.s32 %p52, %r229, 0; or.pred %p53, %p51, %p52; setp.le.s32 %p54, %r124, %r88; or.pred %p55, %p54, %p53; @%p55 bra $L__BB0_74; mul.lo.s32 %r319, %r126, %r320; setp.ge.s32 %p56, %r3, %r19; add.s32 %r92, %r319, %r91; @%p56 bra $L__BB0_60; add.s32 %r230, %r92, %r33; add.s32 %r231, %r230, %r89; mul.wide.s32 %rd76, %r231, 4; add.s64 %rd77, %rd3, %rd76; add.s32 %r232, %r34, %r326; add.s32 %r233, %r232, %r87; mul.wide.s32 %rd78, %r233, 4; add.s64 %rd79, %rd4, %rd78; ld.global.f32 %f265, [%rd79]; ld.global.f32 %f266, [%rd77]; fma.rn.ftz.f32 %f453, %f266, %f265, %f453; $L__BB0_60: setp.ge.s32 %p57, %r35, %r19; @%p57 bra $L__BB0_62; add.s32 %r234, %r92, %r36; add.s32 %r235, %r234, %r89; mul.wide.s32 %rd80, %r235, 4; add.s64 %rd81, %rd3, %rd80; add.s32 %r236, %r37, %r326; add.s32 %r237, %r236, %r87; mul.wide.s32 %rd82, %r237, 
// ---- physical line 6 (continues the mul.wide above): $L__BB0_53 path, guarded fma for
// ---- z+1 .. z+6 ($L__BB0_62/64/66/68/70).
4; add.s64 %rd83, %rd4, %rd82; ld.global.f32 %f267, [%rd83]; ld.global.f32 %f268, [%rd81]; fma.rn.ftz.f32 %f452, %f268, %f267, %f452; $L__BB0_62: setp.ge.s32 %p58, %r38, %r19; @%p58 bra $L__BB0_64; add.s32 %r238, %r92, %r39; add.s32 %r239, %r238, %r89; mul.wide.s32 %rd84, %r239, 4; add.s64 %rd85, %rd3, %rd84; add.s32 %r240, %r40, %r326; add.s32 %r241, %r240, %r87; mul.wide.s32 %rd86, %r241, 4; add.s64 %rd87, %rd4, %rd86; ld.global.f32 %f269, [%rd87]; ld.global.f32 %f270, [%rd85]; fma.rn.ftz.f32 %f451, %f270, %f269, %f451; $L__BB0_64: setp.ge.s32 %p59, %r41, %r19; @%p59 bra $L__BB0_66; add.s32 %r242, %r92, %r42; add.s32 %r243, %r242, %r89; mul.wide.s32 %rd88, %r243, 4; add.s64 %rd89, %rd3, %rd88; add.s32 %r244, %r43, %r326; add.s32 %r245, %r244, %r87; mul.wide.s32 %rd90, %r245, 4; add.s64 %rd91, %rd4, %rd90; ld.global.f32 %f271, [%rd91]; ld.global.f32 %f272, [%rd89]; fma.rn.ftz.f32 %f450, %f272, %f271, %f450; $L__BB0_66: setp.ge.s32 %p60, %r44, %r19; @%p60 bra $L__BB0_68; add.s32 %r246, %r92, %r45; add.s32 %r247, %r246, %r89; mul.wide.s32 %rd92, %r247, 4; add.s64 %rd93, %rd3, %rd92; add.s32 %r248, %r46, %r326; add.s32 %r249, %r248, %r87; mul.wide.s32 %rd94, %r249, 4; add.s64 %rd95, %rd4, %rd94; ld.global.f32 %f273, [%rd95]; ld.global.f32 %f274, [%rd93]; fma.rn.ftz.f32 %f449, %f274, %f273, %f449; $L__BB0_68: setp.ge.s32 %p61, %r47, %r19; @%p61 bra $L__BB0_70; add.s32 %r250, %r92, %r48; add.s32 %r251, %r250, %r89; mul.wide.s32 %rd96, %r251, 4; add.s64 %rd97, %rd3, %rd96; add.s32 %r252, %r49, %r326; add.s32 %r253, %r252, %r87; mul.wide.s32 %rd98, %r253, 4; add.s64 %rd99, %rd4, %rd98; ld.global.f32 %f275, [%rd99]; ld.global.f32 %f276, [%rd97]; fma.rn.ftz.f32 %f448, %f276, %f275, %f448; $L__BB0_70: setp.ge.s32 %p62, %r50, %r19; @%p62 bra $L__BB0_72; add.s32 %r254, %r92, %r51; add.s32 %r255, %r254, %r89; mul.wide.s32 %rd100, %r255, 4; add.s64 %rd101, %rd3, %rd100; add.s32 %r256, %r52, %r326; add.s32 %r257, %r256, %r87; mul.wide.s32 %rd102, %r257, 4; add.s64 %rd103, %rd4, 
// ---- physical line 7 (continues the add.s64 above): $L__BB0_53 path z+6/z+7 and loop
// ---- latches $L__BB0_74/$L__BB0_75; $L__BB0_5 path: tap-row loop $L__BB0_7 (%r321),
// ---- tap-column loop $L__BB0_9 (%r322), range checks, guarded fma for z+0.
%rd102; ld.global.f32 %f277, [%rd103]; ld.global.f32 %f278, [%rd101]; fma.rn.ftz.f32 %f447, %f278, %f277, %f447; $L__BB0_72: setp.ge.s32 %p63, %r53, %r19; @%p63 bra $L__BB0_74; add.s32 %r258, %r92, %r54; add.s32 %r259, %r258, %r89; mul.wide.s32 %rd104, %r259, 4; add.s64 %rd105, %rd3, %rd104; add.s32 %r260, %r55, %r326; add.s32 %r261, %r260, %r87; mul.wide.s32 %rd106, %r261, 4; add.s64 %rd107, %rd4, %rd106; ld.global.f32 %f279, [%rd107]; ld.global.f32 %f280, [%rd105]; fma.rn.ftz.f32 %f446, %f280, %f279, %f446; $L__BB0_74: add.s32 %r326, %r326, 1; setp.lt.s32 %p64, %r326, %r24; @%p64 bra $L__BB0_57; $L__BB0_75: add.s32 %r325, %r325, 1; setp.lt.s32 %p65, %r325, %r113; @%p65 bra $L__BB0_55; bra.uni $L__BB0_76; $L__BB0_5: rem.s32 %r60, %r7, %r118; rem.s32 %r321, %r6, %r117; setp.ge.s32 %p9, %r321, %r113; @%p9 bra $L__BB0_76; mov.f32 %f446, 0f00000000; mov.f32 %f447, %f446; mov.f32 %f448, %f446; mov.f32 %f449, %f446; mov.f32 %f450, %f446; mov.f32 %f451, %f446; mov.f32 %f452, %f446; mov.f32 %f453, %f446; $L__BB0_7: setp.ge.s32 %p10, %r60, %r24; @%p10 bra $L__BB0_27; sub.s32 %r140, %r6, %r321; div.s32 %r64, %r140, %r117; mul.lo.s32 %r65, %r22, %r64; mul.lo.s32 %r66, %r129, %r321; mov.u32 %r322, %r60; $L__BB0_9: sub.s32 %r141, %r7, %r322; div.s32 %r68, %r141, %r118; setp.le.s32 %p11, %r125, %r68; or.b32 %r142, %r68, %r64; setp.lt.s32 %p12, %r142, 0; or.pred %p13, %p12, %p11; setp.le.s32 %p14, %r26, %r64; or.pred %p15, %p13, %p14; @%p15 bra $L__BB0_26; mul.lo.s32 %r318, %r126, %r320; setp.ge.s32 %p16, %r3, %r19; add.s32 %r69, %r318, %r68; @%p16 bra $L__BB0_12; add.s32 %r143, %r69, %r33; add.s32 %r144, %r143, %r65; mul.wide.s32 %rd12, %r144, 4; add.s64 %rd13, %rd3, %rd12; add.s32 %r145, %r34, %r322; add.s32 %r146, %r145, %r66; mul.wide.s32 %rd14, %r146, 4; add.s64 %rd15, %rd4, %rd14; ld.global.f32 %f201, [%rd15]; ld.global.f32 %f202, [%rd13]; fma.rn.ftz.f32 %f453, %f202, %f201, %f453; $L__BB0_12: setp.ge.s32 %p17, %r35, %r19; @%p17 bra $L__BB0_14; add.s32 %r147, %r69, %r36; 
// ---- physical line 8: $L__BB0_5 path, guarded fma for z+1 .. z+6
// ---- ($L__BB0_14/16/18/20/22).
add.s32 %r148, %r147, %r65; mul.wide.s32 %rd16, %r148, 4; add.s64 %rd17, %rd3, %rd16; add.s32 %r149, %r37, %r322; add.s32 %r150, %r149, %r66; mul.wide.s32 %rd18, %r150, 4; add.s64 %rd19, %rd4, %rd18; ld.global.f32 %f203, [%rd19]; ld.global.f32 %f204, [%rd17]; fma.rn.ftz.f32 %f452, %f204, %f203, %f452; $L__BB0_14: setp.ge.s32 %p18, %r38, %r19; @%p18 bra $L__BB0_16; add.s32 %r151, %r69, %r39; add.s32 %r152, %r151, %r65; mul.wide.s32 %rd20, %r152, 4; add.s64 %rd21, %rd3, %rd20; add.s32 %r153, %r40, %r322; add.s32 %r154, %r153, %r66; mul.wide.s32 %rd22, %r154, 4; add.s64 %rd23, %rd4, %rd22; ld.global.f32 %f205, [%rd23]; ld.global.f32 %f206, [%rd21]; fma.rn.ftz.f32 %f451, %f206, %f205, %f451; $L__BB0_16: setp.ge.s32 %p19, %r41, %r19; @%p19 bra $L__BB0_18; add.s32 %r155, %r69, %r42; add.s32 %r156, %r155, %r65; mul.wide.s32 %rd24, %r156, 4; add.s64 %rd25, %rd3, %rd24; add.s32 %r157, %r43, %r322; add.s32 %r158, %r157, %r66; mul.wide.s32 %rd26, %r158, 4; add.s64 %rd27, %rd4, %rd26; ld.global.f32 %f207, [%rd27]; ld.global.f32 %f208, [%rd25]; fma.rn.ftz.f32 %f450, %f208, %f207, %f450; $L__BB0_18: setp.ge.s32 %p20, %r44, %r19; @%p20 bra $L__BB0_20; add.s32 %r159, %r69, %r45; add.s32 %r160, %r159, %r65; mul.wide.s32 %rd28, %r160, 4; add.s64 %rd29, %rd3, %rd28; add.s32 %r161, %r46, %r322; add.s32 %r162, %r161, %r66; mul.wide.s32 %rd30, %r162, 4; add.s64 %rd31, %rd4, %rd30; ld.global.f32 %f209, [%rd31]; ld.global.f32 %f210, [%rd29]; fma.rn.ftz.f32 %f449, %f210, %f209, %f449; $L__BB0_20: setp.ge.s32 %p21, %r47, %r19; @%p21 bra $L__BB0_22; add.s32 %r163, %r69, %r48; add.s32 %r164, %r163, %r65; mul.wide.s32 %rd32, %r164, 4; add.s64 %rd33, %rd3, %rd32; add.s32 %r165, %r49, %r322; add.s32 %r166, %r165, %r66; mul.wide.s32 %rd34, %r166, 4; add.s64 %rd35, %rd4, %rd34; ld.global.f32 %f211, [%rd35]; ld.global.f32 %f212, [%rd33]; fma.rn.ftz.f32 %f448, %f212, %f211, %f448; $L__BB0_22: setp.ge.s32 %p22, %r50, %r19; @%p22 bra $L__BB0_24; add.s32 %r167, %r69, %r51; add.s32 %r168, %r167, %r65; 
// ---- physical line 9: $L__BB0_5 path z+6/z+7 and loop latches $L__BB0_26/$L__BB0_27;
// ---- $L__BB0_76: optional per-z add from the +24 array ($L__BB0_79..$L__BB0_91);
// ---- $L__BB0_93: start of the threshold/multiply select.
mul.wide.s32 %rd36, %r168, 4; add.s64 %rd37, %rd3, %rd36; add.s32 %r169, %r52, %r322; add.s32 %r170, %r169, %r66; mul.wide.s32 %rd38, %r170, 4; add.s64 %rd39, %rd4, %rd38; ld.global.f32 %f213, [%rd39]; ld.global.f32 %f214, [%rd37]; fma.rn.ftz.f32 %f447, %f214, %f213, %f447; $L__BB0_24: setp.ge.s32 %p23, %r53, %r19; @%p23 bra $L__BB0_26; add.s32 %r171, %r69, %r54; add.s32 %r172, %r171, %r65; mul.wide.s32 %rd40, %r172, 4; add.s64 %rd41, %rd3, %rd40; add.s32 %r173, %r55, %r322; add.s32 %r174, %r173, %r66; mul.wide.s32 %rd42, %r174, 4; add.s64 %rd43, %rd4, %rd42; ld.global.f32 %f215, [%rd43]; ld.global.f32 %f216, [%rd41]; fma.rn.ftz.f32 %f446, %f216, %f215, %f446; $L__BB0_26: add.s32 %r322, %r322, %r118; setp.lt.s32 %p24, %r322, %r114; @%p24 bra $L__BB0_9; $L__BB0_27: add.s32 %r321, %r321, %r117; setp.lt.s32 %p25, %r321, %r113; @%p25 bra $L__BB0_7; $L__BB0_76: setp.eq.s64 %p66, %rd2, 0; @%p66 bra $L__BB0_93; setp.ge.s32 %p67, %r3, %r19; @%p67 bra $L__BB0_79; ld.global.f32 %f281, [%rd6]; add.ftz.f32 %f453, %f281, %f453; $L__BB0_79: setp.ge.s32 %p68, %r35, %r19; @%p68 bra $L__BB0_81; ld.global.f32 %f282, [%rd6+4]; add.ftz.f32 %f452, %f282, %f452; $L__BB0_81: setp.ge.s32 %p69, %r38, %r19; @%p69 bra $L__BB0_83; ld.global.f32 %f283, [%rd6+8]; add.ftz.f32 %f451, %f283, %f451; $L__BB0_83: setp.ge.s32 %p70, %r41, %r19; @%p70 bra $L__BB0_85; ld.global.f32 %f284, [%rd6+12]; add.ftz.f32 %f450, %f284, %f450; $L__BB0_85: setp.ge.s32 %p71, %r44, %r19; @%p71 bra $L__BB0_87; ld.global.f32 %f285, [%rd6+16]; add.ftz.f32 %f449, %f285, %f449; $L__BB0_87: setp.ge.s32 %p72, %r47, %r19; @%p72 bra $L__BB0_89; ld.global.f32 %f286, [%rd6+20]; add.ftz.f32 %f448, %f286, %f448; $L__BB0_89: setp.ge.s32 %p73, %r50, %r19; @%p73 bra $L__BB0_91; ld.global.f32 %f287, [%rd6+24]; add.ftz.f32 %f447, %f287, %f447; $L__BB0_91: setp.ge.s32 %p74, %r53, %r19; @%p74 bra $L__BB0_93; ld.global.f32 %f288, [%rd6+28]; add.ftz.f32 %f446, %f288, %f446; $L__BB0_93: setp.gtu.ftz.f32 %p75, %f453, %f183; mul.ftz.f32 %f289, 
// ---- physical line 10 (continues the mul.ftz above): select for all eight accumulators
// ---- (%f175..%f182), destination base %r95 = [+64]*outer + x, guarded stores for
// ---- z+0 .. z+4 ($L__BB0_95/97/99/101).
%f184, %f453; selp.f32 %f175, %f453, %f289, %p75; setp.gtu.ftz.f32 %p76, %f452, %f183; mul.ftz.f32 %f290, %f184, %f452; selp.f32 %f176, %f452, %f290, %p76; setp.gtu.ftz.f32 %p77, %f451, %f183; mul.ftz.f32 %f291, %f184, %f451; selp.f32 %f177, %f451, %f291, %p77; setp.gtu.ftz.f32 %p78, %f450, %f183; mul.ftz.f32 %f292, %f184, %f450; selp.f32 %f178, %f450, %f292, %p78; setp.gtu.ftz.f32 %p79, %f449, %f183; mul.ftz.f32 %f293, %f184, %f449; selp.f32 %f179, %f449, %f293, %p79; setp.gtu.ftz.f32 %p80, %f448, %f183; mul.ftz.f32 %f294, %f184, %f448; selp.f32 %f180, %f448, %f294, %p80; setp.gtu.ftz.f32 %p81, %f447, %f183; mul.ftz.f32 %f295, %f184, %f447; selp.f32 %f181, %f447, %f295, %p81; setp.gtu.ftz.f32 %p82, %f446, %f183; mul.ftz.f32 %f296, %f184, %f446; selp.f32 %f182, %f446, %f296, %p82; mad.lo.s32 %r95, %r133, %r320, %r1; setp.ge.s32 %p83, %r3, %r132; @%p83 bra $L__BB0_95; mad.lo.s32 %r271, %r134, %r3, %r95; add.s32 %r272, %r271, %r32; cvta.to.global.u64 %rd108, %rd5; mul.wide.s32 %rd109, %r272, 4; add.s64 %rd110, %rd108, %rd109; st.global.f32 [%rd110], %f175; $L__BB0_95: setp.ge.s32 %p84, %r35, %r132; @%p84 bra $L__BB0_97; add.s32 %r279, %r95, %r136; add.s32 %r280, %r279, %r32; cvta.to.global.u64 %rd111, %rd5; mul.wide.s32 %rd112, %r280, 4; add.s64 %rd113, %rd111, %rd112; st.global.f32 [%rd113], %f176; $L__BB0_97: setp.ge.s32 %p85, %r38, %r132; @%p85 bra $L__BB0_99; add.s32 %r288, %r95, %r137; add.s32 %r289, %r288, %r32; cvta.to.global.u64 %rd114, %rd5; mul.wide.s32 %rd115, %r289, 4; add.s64 %rd116, %rd114, %rd115; st.global.f32 [%rd116], %f177; $L__BB0_99: setp.ge.s32 %p86, %r41, %r132; @%p86 bra $L__BB0_101; add.s32 %r298, %r95, %r138; add.s32 %r299, %r298, %r32; cvta.to.global.u64 %rd117, %rd5; mul.wide.s32 %rd118, %r299, 4; add.s64 %rd119, %rd117, %rd118; st.global.f32 [%rd119], %f178; $L__BB0_101: setp.ge.s32 %p87, %r44, %r132; @%p87 bra $L__BB0_103; add.s32 %r309, %r95, %r139; add.s32 %r310, %r309, %r32; cvta.to.global.u64 %rd120, %rd5; mul.wide.s32 %rd121, %r310, 
// ---- physical line 11 (continues the mul.wide above): guarded stores for z+4 .. z+7
// ---- ($L__BB0_103/105/107), outer-loop latch $L__BB0_109, common exit $L__BB0_110.
4; add.s64 %rd122, %rd120, %rd121; st.global.f32 [%rd122], %f179; $L__BB0_103: setp.ge.s32 %p88, %r47, %r132; @%p88 bra $L__BB0_105; add.s32 %r311, %r95, %r56; add.s32 %r312, %r311, %r32; cvta.to.global.u64 %rd123, %rd5; mul.wide.s32 %rd124, %r312, 4; add.s64 %rd125, %rd123, %rd124; st.global.f32 [%rd125], %f180; $L__BB0_105: setp.ge.s32 %p89, %r50, %r132; @%p89 bra $L__BB0_107; add.s32 %r313, %r95, %r57; add.s32 %r314, %r313, %r32; cvta.to.global.u64 %rd126, %rd5; mul.wide.s32 %rd127, %r314, 4; add.s64 %rd128, %rd126, %rd127; st.global.f32 [%rd128], %f181; $L__BB0_107: setp.ge.s32 %p90, %r53, %r132; @%p90 bra $L__BB0_109; add.s32 %r315, %r95, %r58; add.s32 %r316, %r315, %r32; cvta.to.global.u64 %rd129, %rd5; mul.wide.s32 %rd130, %r316, 4; add.s64 %rd131, %rd129, %rd130; st.global.f32 [%rd131], %f182; $L__BB0_109: add.s32 %r320, %r320, 1; setp.lt.s32 %p91, %r320, %r131; @%p91 bra $L__BB0_3; $L__BB0_110: ret; }