// _ZZ9gemv_int4ILi4ELi32ELi8EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage has been demoted
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a5ccceb06thrust6system6detail10sequential3seqE[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a5ccceb06thrust12placeholders2_1E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a5ccceb06thrust12placeholders2_2E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a5ccceb06thrust12placeholders2_3E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a5ccceb06thrust12placeholders2_4E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a5ccceb06thrust12placeholders2_5E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a5ccceb06thrust12placeholders2_6E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a5ccceb06thrust12placeholders2_7E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a5ccceb06thrust12placeholders2_8E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a5ccceb06thrust12placeholders2_9E[1];
.global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a5ccceb06thrust12placeholders3_10E[1];

.visible .entry _Z27dequant_gemv_group32_batch823DequantGemvKernelParams(
    .param .align 8 .b8 _Z27dequant_gemv_group32_batch823DequantGemvKernelParams_param_0[80]
)
{
    .reg .pred %p<26>;
    .reg .b16 %rs<288>;
    .reg .f32 %f<592>;
    .reg .b32 %r<270>;
    .reg .b64 %rd<67>;
    // demoted variable
    .shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi32ELi8EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[5120];

    ld.param.v2.u32 {%r52, %r53}, [_Z27dequant_gemv_group32_batch823DequantGemvKernelParams_param_0+48];
    ld.param.v2.u32 {%r54, %r55}, [_Z27dequant_gemv_group32_batch823DequantGemvKernelParams_param_0+56];
    ld.param.v2.f32 {%f77, %f78}, [_Z27dequant_gemv_group32_batch823DequantGemvKernelParams_param_0+64];
    ld.param.v4.u8 {%rs68, %rs69, %rs70, %rs71}, [_Z27dequant_gemv_group32_batch823DequantGemvKernelParams_param_0+72];
    ld.param.u64 %rd18, [_Z27dequant_gemv_group32_batch823DequantGemvKernelParams_param_0+40];
    ld.param.u64 %rd17, [_Z27dequant_gemv_group32_batch823DequantGemvKernelParams_param_0+32];
    ld.param.u64 %rd16, [_Z27dequant_gemv_group32_batch823DequantGemvKernelParams_param_0+24];
    ld.param.u64 %rd15, [_Z27dequant_gemv_group32_batch823DequantGemvKernelParams_param_0+16];
    mov.u32 %r269, %tid.y;
    shl.b32 %r56, %r269, 5;
    mov.u32 %r57, %tid.x;
    add.s32 %r268, %r56, %r57;
    setp.ge.u32 %p1, %r268, %r54;
    mov.f32 %f568, 0f00000000;
    mov.f32 %f569, %f568;
    mov.f32 %f570, %f568;
    mov.f32 %f571, %f568;
    mov.f32 %f572, %f568;
    mov.f32 %f573, %f568;
    mov.f32 %f574, %f568;
    mov.f32 %f575, %f568;
    @%p1 bra $L__BB0_9;
    cvta.to.global.u64 %rd2, %rd15;
    mov.u32 %r58, %ctaid.x;
    mul.lo.s32 %r5, %r55, %r58;
    shl.b16 %rs2, %rs68, 3;
    cvta.to.global.u64 %rd3, %rd16;
    cvta.to.global.u64 %rd4, %rd18;
$L__BB0_2:
    mad.lo.s32 %r60, %r54, %r58, %r268;
    mul.wide.u32 %rd19, %r60, 4;
    add.s64 %rd20, %rd3, %rd19;
    ld.global.u32 %r8, [%rd20];
    shr.u32 %r62, %r57, 2;
    shl.b32 %r63, %r269, 3;
    add.s32 %r9, %r63, %r62;
    add.s32 %r10, %r9, %r5;
    mul.wide.s32 %rd21, %r10, 2;
    add.s64 %rd22, %rd4, %rd21;
    ld.global.u16 %rs76, [%rd22];
    // begin inline asm
    { cvt.f32.f16 %f87, %rs76;}
    // end inline asm
    setp.eq.s64 %p2, %rd17, 0;
    mov.u16 %rs287, %rs2;
    @%p2 bra $L__BB0_4;
    shr.u32 %r64, %r10, 31;
    add.s32 %r65, %r10, %r64;
    shr.s32 %r66, %r65, 1;
    cvt.s64.s32 %rd23, %r66;
    cvta.to.global.u64 %rd24, %rd17;
    add.s64 %rd25, %rd24, %rd23;
    ld.global.u8 %r67, [%rd25];
    shl.b32 %r68, %r9, 2;
    and.b32 %r69, %r68, 4;
    shr.u32 %r70, %r67, %r69;
    cvt.u16.u32 %rs77, %r70;
    and.b16 %rs287, %rs77, 15;
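// Each trip through $L__BB0_2 above fetches one packed 32-bit word of
// eight 4-bit quantized weights (%r8), the fp16 scale for the current
// group-of-32 (%f87), and, when the zero-point tensor (param +32) is
// non-null, the matching packed 4-bit zero point (%rs287, two per byte).
// With a null zero-point tensor the default is %rs2 = flag << 3, i.e.
// 8 in unsigned mode and 0 in signed mode. $L__BB0_4 below appears to
// apply the standard group-wise dequantization, w = scale * (q - zero),
// before the dot-product FMAs.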
$L__BB0_4:
    shl.b32 %r11, %r268, 3;
    setp.ge.s32 %p3, %r11, %r52;
    @%p3 bra $L__BB0_8;
    setp.eq.s16 %p4, %rs68, 0;
    mul.wide.s32 %rd26, %r11, 2;
    add.s64 %rd27, %rd2, %rd26;
    ld.global.v4.u32 {%r71, %r72, %r73, %r74}, [%rd27];
    mul.wide.s32 %rd28, %r52, 2;
    add.s64 %rd29, %rd27, %rd28;
    ld.global.v4.u32 {%r75, %r76, %r77, %r78}, [%rd29];
    add.s32 %r79, %r52, %r11;
    add.s32 %r80, %r79, %r52;
    mul.wide.s32 %rd30, %r80, 2;
    add.s64 %rd31, %rd2, %rd30;
    ld.global.v4.u32 {%r81, %r82, %r83, %r84}, [%rd31];
    add.s64 %rd32, %rd31, %rd28;
    ld.global.v4.u32 {%r85, %r86, %r87, %r88}, [%rd32];
    add.s64 %rd33, %rd32, %rd28;
    ld.global.v4.u32 {%r89, %r90, %r91, %r92}, [%rd33];
    add.s64 %rd34, %rd33, %rd28;
    ld.global.v4.u32 {%r93, %r94, %r95, %r96}, [%rd34];
    add.s64 %rd35, %rd34, %rd28;
    ld.global.v4.u32 {%r97, %r98, %r99, %r100}, [%rd35];
    add.s64 %rd36, %rd35, %rd28;
    ld.global.v4.u32 {%r101, %r102, %r103, %r104}, [%rd36];
    shr.u16 %rs79, %rs287, 3;
    and.b16 %rs80, %rs79, 1;
    setp.eq.b16 %p5, %rs80, 1;
    and.pred %p6, %p4, %p5;
    selp.b16 %rs81, -16, 0, %p6;
    or.b16 %rs82, %rs81, %rs287;
    cvt.s16.s8 %rs83, %rs82;
    cvt.rn.f32.s16 %f10, %rs83;
    cvt.u16.u32 %rs5, %r8;
    and.b16 %rs6, %rs5, 15;
    cvt.u16.u32 %rs7, %r71;
    mov.b32 {%rs10, %rs9}, %r75;
    mov.b32 {%rs13, %rs12}, %r81;
    mov.b32 {%rs16, %rs15}, %r85;
    mov.b32 {%rs19, %rs18}, %r89;
    mov.b32 {%rs22, %rs21}, %r93;
    mov.b32 {%rs25, %rs24}, %r97;
    mov.b32 {%rs28, %rs27}, %r101;
    shr.u32 %r105, %r8, 4;
    cvt.u16.u32 %rs29, %r105;
    and.b16 %rs30, %rs29, 15;
    shr.u32 %r106, %r8, 8;
    cvt.u16.u32 %rs31, %r106;
    and.b16 %rs32, %rs31, 15;
    shr.u32 %r107, %r8, 12;
    cvt.u16.u32 %rs33, %r107;
    and.b16 %rs34, %rs33, 15;
    mov.b32 {%rs84, %rs35}, %r98;
    mov.b32 {%rs85, %rs36}, %r102;
    shr.u32 %r108, %r8, 16;
    cvt.u16.u32 %rs37, %r108;
    and.b16 %rs38, %rs37, 15;
    mov.b32 {%rs39, %rs49}, %r73;
    mov.b32 {%rs40, %rs50}, %r77;
    mov.b32 {%rs41, %rs51}, %r83;
    mov.b32 {%rs42, %rs52}, %r87;
    mov.b32 {%rs43, %rs53}, %r91;
    mov.b32 {%rs44, %rs54}, %r95;
    mov.b32 {%rs45, %rs55}, %r99;
    mov.b32 {%rs46, %rs56}, %r103;
    shr.u32 %r109, %r8, 20;
    cvt.u16.u32 %rs47, %r109;
    and.b16 %rs48, %rs47, 15;
    shr.u32 %r110, %r8, 24;
    cvt.u16.u32 %rs57, %r110;
    and.b16 %rs58, %rs57, 15;
    shr.u32 %r111, %r8, 28;
    cvt.u16.u32 %rs59, %r111;
    @%p4 bra $L__BB0_7;
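// Fall-through path (flag %rs68 != 0): the eight nibbles of %r8 are
// taken as unsigned 4-bit values. For each nibble the code forms
// %f87 * (nibble - %f10) once and FMAs it against the matching fp16
// element of all eight input vectors, keeping one f32 accumulator per
// vector (%f568-%f575).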
    cvt.rn.f32.s16 %f152, %rs6;
    sub.ftz.f32 %f153, %f152, %f10;
    mul.ftz.f32 %f154, %f87, %f153;
    // begin inline asm
    { cvt.f32.f16 %f88, %rs7;}
    // end inline asm
    fma.rn.ftz.f32 %f155, %f154, %f88, %f568;
    // begin inline asm
    { cvt.f32.f16 %f89, %rs10;}
    // end inline asm
    fma.rn.ftz.f32 %f156, %f154, %f89, %f569;
    // begin inline asm
    { cvt.f32.f16 %f90, %rs13;}
    // end inline asm
    fma.rn.ftz.f32 %f157, %f154, %f90, %f570;
    // begin inline asm
    { cvt.f32.f16 %f91, %rs16;}
    // end inline asm
    fma.rn.ftz.f32 %f158, %f154, %f91, %f571;
    // begin inline asm
    { cvt.f32.f16 %f92, %rs19;}
    // end inline asm
    fma.rn.ftz.f32 %f159, %f154, %f92, %f572;
    // begin inline asm
    { cvt.f32.f16 %f93, %rs22;}
    // end inline asm
    fma.rn.ftz.f32 %f160, %f154, %f93, %f573;
    // begin inline asm
    { cvt.f32.f16 %f94, %rs25;}
    // end inline asm
    fma.rn.ftz.f32 %f161, %f154, %f94, %f574;
    // begin inline asm
    { cvt.f32.f16 %f95, %rs28;}
    // end inline asm
    fma.rn.ftz.f32 %f162, %f154, %f95, %f575;
    cvt.rn.f32.s16 %f163, %rs30;
    sub.ftz.f32 %f164, %f163, %f10;
    mul.ftz.f32 %f165, %f87, %f164;
    mov.b32 {%rs150, %rs94}, %r71;
    // begin inline asm
    { cvt.f32.f16 %f96, %rs94;}
    // end inline asm
    fma.rn.ftz.f32 %f166, %f165, %f96, %f155;
    // begin inline asm
    { cvt.f32.f16 %f97, %rs9;}
    // end inline asm
    fma.rn.ftz.f32 %f167, %f165, %f97, %f156;
    // begin inline asm
    { cvt.f32.f16 %f98, %rs12;}
    // end inline asm
    fma.rn.ftz.f32 %f168, %f165, %f98, %f157;
    // begin inline asm
    { cvt.f32.f16 %f99, %rs15;}
    // end inline asm
    fma.rn.ftz.f32 %f169, %f165, %f99, %f158;
    // begin inline asm
    { cvt.f32.f16 %f100, %rs18;}
    // end inline asm
    fma.rn.ftz.f32 %f170, %f165, %f100, %f159;
    // begin inline asm
    { cvt.f32.f16 %f101, %rs21;}
    // end inline asm
    fma.rn.ftz.f32 %f171, %f165, %f101, %f160;
    // begin inline asm
    { cvt.f32.f16 %f102, %rs24;}
    // end inline asm
    fma.rn.ftz.f32 %f172, %f165, %f102, %f161;
    // begin inline asm
    { cvt.f32.f16 %f103, %rs27;}
    // end inline asm
    fma.rn.ftz.f32 %f173, %f165, %f103, %f162;
    cvt.rn.f32.s16 %f174, %rs32;
    sub.ftz.f32 %f175, %f174, %f10;
    mul.ftz.f32 %f176, %f87, %f175;
    mov.b32 {%rs102, %rs110}, %r72;
    // begin inline asm
    { cvt.f32.f16 %f104, %rs102;}
    // end inline asm
    fma.rn.ftz.f32 %f177, %f176, %f104, %f166;
    mov.b32 {%rs103, %rs111}, %r76;
    // begin inline asm
    { cvt.f32.f16 %f105, %rs103;}
    // end inline asm
    fma.rn.ftz.f32 %f178, %f176, %f105, %f167;
    mov.b32 {%rs104, %rs112}, %r82;
    // begin inline asm
    { cvt.f32.f16 %f106, %rs104;}
    // end inline asm
    fma.rn.ftz.f32 %f179, %f176, %f106, %f168;
    mov.b32 {%rs105, %rs113}, %r86;
    // begin inline asm
    { cvt.f32.f16 %f107, %rs105;}
    // end inline asm
    fma.rn.ftz.f32 %f180, %f176, %f107, %f169;
    mov.b32 {%rs106, %rs114}, %r90;
    // begin inline asm
    { cvt.f32.f16 %f108, %rs106;}
    // end inline asm
    fma.rn.ftz.f32 %f181, %f176, %f108, %f170;
    mov.b32 {%rs107, %rs115}, %r94;
    // begin inline asm
    { cvt.f32.f16 %f109, %rs107;}
    // end inline asm
    fma.rn.ftz.f32 %f182, %f176, %f109, %f171;
    cvt.u16.u32 %rs108, %r98;
    // begin inline asm
    { cvt.f32.f16 %f110, %rs108;}
    // end inline asm
    fma.rn.ftz.f32 %f183, %f176, %f110, %f172;
    cvt.u16.u32 %rs109, %r102;
    // begin inline asm
    { cvt.f32.f16 %f111, %rs109;}
    // end inline asm
    fma.rn.ftz.f32 %f184, %f176, %f111, %f173;
    cvt.rn.f32.s16 %f185, %rs34;
    sub.ftz.f32 %f186, %f185, %f10;
    mul.ftz.f32 %f187, %f87, %f186;
    // begin inline asm
    { cvt.f32.f16 %f112, %rs110;}
    // end inline asm
    fma.rn.ftz.f32 %f188, %f187, %f112, %f177;
    // begin inline asm
    { cvt.f32.f16 %f113, %rs111;}
    // end inline asm
    fma.rn.ftz.f32 %f189, %f187, %f113, %f178;
    // begin inline asm
    { cvt.f32.f16 %f114, %rs112;}
    // end inline asm
    fma.rn.ftz.f32 %f190, %f187, %f114, %f179;
    // begin inline asm
    { cvt.f32.f16 %f115, %rs113;}
    // end inline asm
    fma.rn.ftz.f32 %f191, %f187, %f115, %f180;
    // begin inline asm
    { cvt.f32.f16 %f116, %rs114;}
    // end inline asm
    fma.rn.ftz.f32 %f192, %f187, %f116, %f181;
    // begin inline asm
    { cvt.f32.f16 %f117, %rs115;}
    // end inline asm
    fma.rn.ftz.f32 %f193, %f187, %f117, %f182;
    // begin inline asm
    { cvt.f32.f16 %f118, %rs35;}
    // end inline asm
    fma.rn.ftz.f32 %f194, %f187, %f118, %f183;
    // begin inline asm
    { cvt.f32.f16 %f119, %rs36;}
    // end inline asm
    fma.rn.ftz.f32 %f195, %f187, %f119, %f184;
    cvt.rn.f32.s16 %f196, %rs38;
    sub.ftz.f32 %f197, %f196, %f10;
    mul.ftz.f32 %f198, %f87, %f197;
    // begin inline asm
    { cvt.f32.f16 %f120, %rs39;}
    // end inline asm
    fma.rn.ftz.f32 %f199, %f198, %f120, %f188;
    // begin inline asm
    { cvt.f32.f16 %f121, %rs40;}
    // end inline asm
    fma.rn.ftz.f32 %f200, %f198, %f121, %f189;
    // begin inline asm
    { cvt.f32.f16 %f122, %rs41;}
    // end inline asm
    fma.rn.ftz.f32 %f201, %f198, %f122, %f190;
    // begin inline asm
    { cvt.f32.f16 %f123, %rs42;}
    // end inline asm
    fma.rn.ftz.f32 %f202, %f198, %f123, %f191;
    // begin inline asm
    { cvt.f32.f16 %f124, %rs43;}
    // end inline asm
    fma.rn.ftz.f32 %f203, %f198, %f124, %f192;
    // begin inline asm
    { cvt.f32.f16 %f125, %rs44;}
    // end inline asm
    fma.rn.ftz.f32 %f204, %f198, %f125, %f193;
    // begin inline asm
    { cvt.f32.f16 %f126, %rs45;}
    // end inline asm
    fma.rn.ftz.f32 %f205, %f198, %f126, %f194;
    // begin inline asm
    { cvt.f32.f16 %f127, %rs46;}
    // end inline asm
    fma.rn.ftz.f32 %f206, %f198, %f127, %f195;
    cvt.rn.f32.s16 %f207, %rs48;
    sub.ftz.f32 %f208, %f207, %f10;
    mul.ftz.f32 %f209, %f87, %f208;
    // begin inline asm
    { cvt.f32.f16 %f128, %rs49;}
    // end inline asm
    fma.rn.ftz.f32 %f210, %f209, %f128, %f199;
    // begin inline asm
    { cvt.f32.f16 %f129, %rs50;}
    // end inline asm
    fma.rn.ftz.f32 %f211, %f209, %f129, %f200;
    // begin inline asm
    { cvt.f32.f16 %f130, %rs51;}
    // end inline asm
    fma.rn.ftz.f32 %f212, %f209, %f130, %f201;
    // begin inline asm
    { cvt.f32.f16 %f131, %rs52;}
    // end inline asm
    fma.rn.ftz.f32 %f213, %f209, %f131, %f202;
    // begin inline asm
    { cvt.f32.f16 %f132, %rs53;}
    // end inline asm
    fma.rn.ftz.f32 %f214, %f209, %f132, %f203;
    // begin inline asm
    { cvt.f32.f16 %f133, %rs54;}
    // end inline asm
    fma.rn.ftz.f32 %f215, %f209, %f133, %f204;
    // begin inline asm
    { cvt.f32.f16 %f134, %rs55;}
    // end inline asm
    fma.rn.ftz.f32 %f216, %f209, %f134, %f205;
    // begin inline asm
    { cvt.f32.f16 %f135, %rs56;}
    // end inline asm
    fma.rn.ftz.f32 %f217, %f209, %f135, %f206;
    cvt.rn.f32.s16 %f218, %rs58;
    sub.ftz.f32 %f219, %f218, %f10;
    mul.ftz.f32 %f220, %f87, %f219;
    mov.b32 {%rs134, %rs142}, %r74;
    // begin inline asm
    { cvt.f32.f16 %f136, %rs134;}
    // end inline asm
    fma.rn.ftz.f32 %f221, %f220, %f136, %f210;
    mov.b32 {%rs135, %rs143}, %r78;
    // begin inline asm
    { cvt.f32.f16 %f137, %rs135;}
    // end inline asm
    fma.rn.ftz.f32 %f222, %f220, %f137, %f211;
    mov.b32 {%rs136, %rs144}, %r84;
    // begin inline asm
    { cvt.f32.f16 %f138, %rs136;}
    // end inline asm
    fma.rn.ftz.f32 %f223, %f220, %f138, %f212;
    mov.b32 {%rs137, %rs145}, %r88;
    // begin inline asm
    { cvt.f32.f16 %f139, %rs137;}
    // end inline asm
    fma.rn.ftz.f32 %f224, %f220, %f139, %f213;
    mov.b32 {%rs138, %rs146}, %r92;
    // begin inline asm
    { cvt.f32.f16 %f140, %rs138;}
    // end inline asm
    fma.rn.ftz.f32 %f225, %f220, %f140, %f214;
    mov.b32 {%rs139, %rs147}, %r96;
    // begin inline asm
    { cvt.f32.f16 %f141, %rs139;}
    // end inline asm
    fma.rn.ftz.f32 %f226, %f220, %f141, %f215;
    mov.b32 {%rs140, %rs148}, %r100;
    // begin inline asm
    { cvt.f32.f16 %f142, %rs140;}
    // end inline asm
    fma.rn.ftz.f32 %f227, %f220, %f142, %f216;
    mov.b32 {%rs141, %rs149}, %r104;
    // begin inline asm
    { cvt.f32.f16 %f143, %rs141;}
    // end inline asm
    fma.rn.ftz.f32 %f228, %f220, %f143, %f217;
    cvt.rn.f32.s16 %f229, %rs59;
    sub.ftz.f32 %f230, %f229, %f10;
    mul.ftz.f32 %f231, %f87, %f230;
    // begin inline asm
    { cvt.f32.f16 %f144, %rs142;}
    // end inline asm
    fma.rn.ftz.f32 %f568, %f231, %f144, %f221;
    // begin inline asm
    { cvt.f32.f16 %f145, %rs143;}
    // end inline asm
    fma.rn.ftz.f32 %f569, %f231, %f145, %f222;
    // begin inline asm
    { cvt.f32.f16 %f146, %rs144;}
    // end inline asm
    fma.rn.ftz.f32 %f570, %f231, %f146, %f223;
    // begin inline asm
    { cvt.f32.f16 %f147, %rs145;}
    // end inline asm
    fma.rn.ftz.f32 %f571, %f231, %f147, %f224;
    // begin inline asm
    { cvt.f32.f16 %f148, %rs146;}
    // end inline asm
    fma.rn.ftz.f32 %f572, %f231, %f148, %f225;
    // begin inline asm
    { cvt.f32.f16 %f149, %rs147;}
    // end inline asm
    fma.rn.ftz.f32 %f573, %f231, %f149, %f226;
    // begin inline asm
    { cvt.f32.f16 %f150, %rs148;}
    // end inline asm
    fma.rn.ftz.f32 %f574, %f231, %f150, %f227;
    // begin inline asm
    { cvt.f32.f16 %f151, %rs149;}
    // end inline asm
    fma.rn.ftz.f32 %f575, %f231, %f151, %f228;
    bra.uni $L__BB0_8;
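// $L__BB0_7 below is the signed-int4 variant of the same unrolled body:
// each nibble is sign-extended first (shl.b16 4, cvt.s16.s8, shr.s16 7,
// and -16, or with the nibble) before the identical scale * (q - zero)
// FMA chain.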
$L__BB0_7:
    shl.b16 %rs215, %rs5, 4;
    cvt.s16.s8 %rs216, %rs215;
    shr.s16 %rs217, %rs216, 7;
    and.b16 %rs218, %rs217, -16;
    or.b16 %rs219, %rs218, %rs6;
    cvt.rn.f32.s16 %f296, %rs219;
    sub.ftz.f32 %f297, %f296, %f10;
    mul.ftz.f32 %f298, %f87, %f297;
    // begin inline asm
    { cvt.f32.f16 %f232, %rs7;}
    // end inline asm
    fma.rn.ftz.f32 %f299, %f298, %f232, %f568;
    // begin inline asm
    { cvt.f32.f16 %f233, %rs10;}
    // end inline asm
    fma.rn.ftz.f32 %f300, %f298, %f233, %f569;
    // begin inline asm
    { cvt.f32.f16 %f234, %rs13;}
    // end inline asm
    fma.rn.ftz.f32 %f301, %f298, %f234, %f570;
    // begin inline asm
    { cvt.f32.f16 %f235, %rs16;}
    // end inline asm
    fma.rn.ftz.f32 %f302, %f298, %f235, %f571;
    // begin inline asm
    { cvt.f32.f16 %f236, %rs19;}
    // end inline asm
    fma.rn.ftz.f32 %f303, %f298, %f236, %f572;
    // begin inline asm
    { cvt.f32.f16 %f237, %rs22;}
    // end inline asm
    fma.rn.ftz.f32 %f304, %f298, %f237, %f573;
    // begin inline asm
    { cvt.f32.f16 %f238, %rs25;}
    // end inline asm
    fma.rn.ftz.f32 %f305, %f298, %f238, %f574;
    // begin inline asm
    { cvt.f32.f16 %f239, %rs28;}
    // end inline asm
    fma.rn.ftz.f32 %f306, %f298, %f239, %f575;
    shl.b16 %rs220, %rs29, 4;
    cvt.s16.s8 %rs221, %rs220;
    shr.s16 %rs222, %rs221, 7;
    and.b16 %rs223, %rs222, -16;
    or.b16 %rs224, %rs223, %rs30;
    cvt.rn.f32.s16 %f307, %rs224;
    sub.ftz.f32 %f308, %f307, %f10;
    mul.ftz.f32 %f309, %f87, %f308;
    mov.b32 {%rs225, %rs159}, %r71;
    // begin inline asm
    { cvt.f32.f16 %f240, %rs159;}
    // end inline asm
    fma.rn.ftz.f32 %f310, %f309, %f240, %f299;
    // begin inline asm
    { cvt.f32.f16 %f241, %rs9;}
    // end inline asm
    fma.rn.ftz.f32 %f311, %f309, %f241, %f300;
    // begin inline asm
    { cvt.f32.f16 %f242, %rs12;}
    // end inline asm
    fma.rn.ftz.f32 %f312, %f309, %f242, %f301;
    // begin inline asm
    { cvt.f32.f16 %f243, %rs15;}
    // end inline asm
    fma.rn.ftz.f32 %f313, %f309, %f243, %f302;
    // begin inline asm
    { cvt.f32.f16 %f244, %rs18;}
    // end inline asm
    fma.rn.ftz.f32 %f314, %f309, %f244, %f303;
    // begin inline asm
    { cvt.f32.f16 %f245, %rs21;}
    // end inline asm
    fma.rn.ftz.f32 %f315, %f309, %f245, %f304;
    // begin inline asm
    { cvt.f32.f16 %f246, %rs24;}
    // end inline asm
    fma.rn.ftz.f32 %f316, %f309, %f246, %f305;
    // begin inline asm
    { cvt.f32.f16 %f247, %rs27;}
    // end inline asm
    fma.rn.ftz.f32 %f317, %f309, %f247, %f306;
    shl.b16 %rs233, %rs31, 4;
    cvt.s16.s8 %rs234, %rs233;
    shr.s16 %rs235, %rs234, 7;
    and.b16 %rs236, %rs235, -16;
    or.b16 %rs237, %rs236, %rs32;
    cvt.rn.f32.s16 %f318, %rs237;
    sub.ftz.f32 %f319, %f318, %f10;
    mul.ftz.f32 %f320, %f87, %f319;
    mov.b32 {%rs167, %rs175}, %r72;
    // begin inline asm
    { cvt.f32.f16 %f248, %rs167;}
    // end inline asm
    fma.rn.ftz.f32 %f321, %f320, %f248, %f310;
    mov.b32 {%rs168, %rs176}, %r76;
    // begin inline asm
    { cvt.f32.f16 %f249, %rs168;}
    // end inline asm
    fma.rn.ftz.f32 %f322, %f320, %f249, %f311;
    mov.b32 {%rs169, %rs177}, %r82;
    // begin inline asm
    { cvt.f32.f16 %f250, %rs169;}
    // end inline asm
    fma.rn.ftz.f32 %f323, %f320, %f250, %f312;
    mov.b32 {%rs170, %rs178}, %r86;
    // begin inline asm
    { cvt.f32.f16 %f251, %rs170;}
    // end inline asm
    fma.rn.ftz.f32 %f324, %f320, %f251, %f313;
    mov.b32 {%rs171, %rs179}, %r90;
    // begin inline asm
    { cvt.f32.f16 %f252, %rs171;}
    // end inline asm
    fma.rn.ftz.f32 %f325, %f320, %f252, %f314;
    mov.b32 {%rs172, %rs180}, %r94;
    // begin inline asm
    { cvt.f32.f16 %f253, %rs172;}
    // end inline asm
    fma.rn.ftz.f32 %f326, %f320, %f253, %f315;
    cvt.u16.u32 %rs173, %r98;
    // begin inline asm
    { cvt.f32.f16 %f254, %rs173;}
    // end inline asm
    fma.rn.ftz.f32 %f327, %f320, %f254, %f316;
    cvt.u16.u32 %rs174, %r102;
    // begin inline asm
    { cvt.f32.f16 %f255, %rs174;}
    // end inline asm
    fma.rn.ftz.f32 %f328, %f320, %f255, %f317;
    shl.b16 %rs238, %rs33, 4;
    cvt.s16.s8 %rs239, %rs238;
    shr.s16 %rs240, %rs239, 7;
    and.b16 %rs241, %rs240, -16;
    or.b16 %rs242, %rs241, %rs34;
    cvt.rn.f32.s16 %f329, %rs242;
    sub.ftz.f32 %f330, %f329, %f10;
    mul.ftz.f32 %f331, %f87, %f330;
    // begin inline asm
    { cvt.f32.f16 %f256, %rs175;}
    // end inline asm
    fma.rn.ftz.f32 %f332, %f331, %f256, %f321;
    // begin inline asm
    { cvt.f32.f16 %f257, %rs176;}
    // end inline asm
    fma.rn.ftz.f32 %f333, %f331, %f257, %f322;
    // begin inline asm
    { cvt.f32.f16 %f258, %rs177;}
    // end inline asm
    fma.rn.ftz.f32 %f334, %f331, %f258, %f323;
    // begin inline asm
    { cvt.f32.f16 %f259, %rs178;}
    // end inline asm
    fma.rn.ftz.f32 %f335, %f331, %f259, %f324;
    // begin inline asm
    { cvt.f32.f16 %f260, %rs179;}
    // end inline asm
    fma.rn.ftz.f32 %f336, %f331, %f260, %f325;
    // begin inline asm
    { cvt.f32.f16 %f261, %rs180;}
    // end inline asm
    fma.rn.ftz.f32 %f337, %f331, %f261, %f326;
    // begin inline asm
    { cvt.f32.f16 %f262, %rs35;}
    // end inline asm
    fma.rn.ftz.f32 %f338, %f331, %f262, %f327;
    // begin inline asm
    { cvt.f32.f16 %f263, %rs36;}
    // end inline asm
    fma.rn.ftz.f32 %f339, %f331, %f263, %f328;
    shl.b16 %rs243, %rs37, 4;
    cvt.s16.s8 %rs244, %rs243;
    shr.s16 %rs245, %rs244, 7;
    and.b16 %rs246, %rs245, -16;
    or.b16 %rs247, %rs246, %rs38;
    cvt.rn.f32.s16 %f340, %rs247;
    sub.ftz.f32 %f341, %f340, %f10;
    mul.ftz.f32 %f342, %f87, %f341;
    // begin inline asm
    { cvt.f32.f16 %f264, %rs39;}
    // end inline asm
    fma.rn.ftz.f32 %f343, %f342, %f264, %f332;
    // begin inline asm
    { cvt.f32.f16 %f265, %rs40;}
    // end inline asm
    fma.rn.ftz.f32 %f344, %f342, %f265, %f333;
    // begin inline asm
    { cvt.f32.f16 %f266, %rs41;}
    // end inline asm
    fma.rn.ftz.f32 %f345, %f342, %f266, %f334;
    // begin inline asm
    { cvt.f32.f16 %f267, %rs42;}
    // end inline asm
    fma.rn.ftz.f32 %f346, %f342, %f267, %f335;
    // begin inline asm
    { cvt.f32.f16 %f268, %rs43;}
    // end inline asm
    fma.rn.ftz.f32 %f347, %f342, %f268, %f336;
    // begin inline asm
    { cvt.f32.f16 %f269, %rs44;}
    // end inline asm
    fma.rn.ftz.f32 %f348, %f342, %f269, %f337;
    // begin inline asm
    { cvt.f32.f16 %f270, %rs45;}
    // end inline asm
    fma.rn.ftz.f32 %f349, %f342, %f270, %f338;
    // begin inline asm
    { cvt.f32.f16 %f271, %rs46;}
    // end inline asm
    fma.rn.ftz.f32 %f350, %f342, %f271, %f339;
    shl.b16 %rs248, %rs47, 4;
    cvt.s16.s8 %rs249, %rs248;
    shr.s16 %rs250, %rs249, 7;
    and.b16 %rs251, %rs250, -16;
    or.b16 %rs252, %rs251, %rs48;
    cvt.rn.f32.s16 %f351, %rs252;
    sub.ftz.f32 %f352, %f351, %f10;
    mul.ftz.f32 %f353, %f87, %f352;
    // begin inline asm
    { cvt.f32.f16 %f272, %rs49;}
    // end inline asm
    fma.rn.ftz.f32 %f354, %f353, %f272, %f343;
    // begin inline asm
    { cvt.f32.f16 %f273, %rs50;}
    // end inline asm
    fma.rn.ftz.f32 %f355, %f353, %f273, %f344;
    // begin inline asm
    { cvt.f32.f16 %f274, %rs51;}
    // end inline asm
    fma.rn.ftz.f32 %f356, %f353, %f274, %f345;
    // begin inline asm
    { cvt.f32.f16 %f275, %rs52;}
    // end inline asm
    fma.rn.ftz.f32 %f357, %f353, %f275, %f346;
    // begin inline asm
    { cvt.f32.f16 %f276, %rs53;}
    // end inline asm
    fma.rn.ftz.f32 %f358, %f353, %f276, %f347;
    // begin inline asm
    { cvt.f32.f16 %f277, %rs54;}
    // end inline asm
    fma.rn.ftz.f32 %f359, %f353, %f277, %f348;
    // begin inline asm
    { cvt.f32.f16 %f278, %rs55;}
    // end inline asm
    fma.rn.ftz.f32 %f360, %f353, %f278, %f349;
    // begin inline asm
    { cvt.f32.f16 %f279, %rs56;}
    // end inline asm
    fma.rn.ftz.f32 %f361, %f353, %f279, %f350;
    shl.b16 %rs253, %rs57, 4;
    cvt.s16.s8 %rs254, %rs253;
    shr.s16 %rs255, %rs254, 7;
    and.b16 %rs256, %rs255, -16;
    or.b16 %rs257, %rs256, %rs58;
    cvt.rn.f32.s16 %f362, %rs257;
    sub.ftz.f32 %f363, %f362, %f10;
    mul.ftz.f32 %f364, %f87, %f363;
    mov.b32 {%rs199, %rs207}, %r74;
    // begin inline asm
    { cvt.f32.f16 %f280, %rs199;}
    // end inline asm
    fma.rn.ftz.f32 %f365, %f364, %f280, %f354;
    mov.b32 {%rs200, %rs208}, %r78;
    // begin inline asm
    { cvt.f32.f16 %f281, %rs200;}
    // end inline asm
    fma.rn.ftz.f32 %f366, %f364, %f281, %f355;
    mov.b32 {%rs201, %rs209}, %r84;
    // begin inline asm
    { cvt.f32.f16 %f282, %rs201;}
    // end inline asm
    fma.rn.ftz.f32 %f367, %f364, %f282, %f356;
    mov.b32 {%rs202, %rs210}, %r88;
    // begin inline asm
    { cvt.f32.f16 %f283, %rs202;}
    // end inline asm
    fma.rn.ftz.f32 %f368, %f364, %f283, %f357;
    mov.b32 {%rs203, %rs211}, %r92;
    // begin inline asm
    { cvt.f32.f16 %f284, %rs203;}
    // end inline asm
    fma.rn.ftz.f32 %f369, %f364, %f284, %f358;
    mov.b32 {%rs204, %rs212}, %r96;
    // begin inline asm
    { cvt.f32.f16 %f285, %rs204;}
    // end inline asm
    fma.rn.ftz.f32 %f370, %f364, %f285, %f359;
    mov.b32 {%rs205, %rs213}, %r100;
    // begin inline asm
    { cvt.f32.f16 %f286, %rs205;}
    // end inline asm
    fma.rn.ftz.f32 %f371, %f364, %f286, %f360;
    mov.b32 {%rs206, %rs214}, %r104;
    // begin inline asm
    { cvt.f32.f16 %f287, %rs206;}
    // end inline asm
    fma.rn.ftz.f32 %f372, %f364, %f287, %f361;
    shl.b16 %rs258, %rs59, 4;
    cvt.s16.s8 %rs259, %rs258;
    shr.s16 %rs260, %rs259, 7;
    and.b16 %rs261, %rs260, -16;
    or.b16 %rs262, %rs261, %rs59;
    cvt.rn.f32.s16 %f373, %rs262;
    sub.ftz.f32 %f374, %f373, %f10;
    mul.ftz.f32 %f375, %f87, %f374;
    // begin inline asm
    { cvt.f32.f16 %f288, %rs207;}
    // end inline asm
    fma.rn.ftz.f32 %f568, %f375, %f288, %f365;
    // begin inline asm
    { cvt.f32.f16 %f289, %rs208;}
    // end inline asm
    fma.rn.ftz.f32 %f569, %f375, %f289, %f366;
    // begin inline asm
    { cvt.f32.f16 %f290, %rs209;}
    // end inline asm
    fma.rn.ftz.f32 %f570, %f375, %f290, %f367;
    // begin inline asm
    { cvt.f32.f16 %f291, %rs210;}
    // end inline asm
    fma.rn.ftz.f32 %f571, %f375, %f291, %f368;
    // begin inline asm
    { cvt.f32.f16 %f292, %rs211;}
    // end inline asm
    fma.rn.ftz.f32 %f572, %f375, %f292, %f369;
    // begin inline asm
    { cvt.f32.f16 %f293, %rs212;}
    // end inline asm
    fma.rn.ftz.f32 %f573, %f375, %f293, %f370;
    // begin inline asm
    { cvt.f32.f16 %f294, %rs213;}
    // end inline asm
    fma.rn.ftz.f32 %f574, %f375, %f294, %f371;
    // begin inline asm
    { cvt.f32.f16 %f295, %rs214;}
    // end inline asm
    fma.rn.ftz.f32 %f575, %f375, %f295, %f372;
$L__BB0_8:
    add.s32 %r269, %r269, 4;
    shl.b32 %r112, %r269, 5;
    add.s32 %r268, %r112, %r57;
    setp.lt.u32 %p7, %r268, %r54;
    @%p7 bra $L__BB0_2;
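// $L__BB0_9 onward reduces the eight per-thread partial sums across the
// block. The 5120-byte temp_storage is carved into eight 640-byte
// slices, one per accumulator: threads with linear id >= 32 spill their
// partial, then after bar.sync the first warp rakes three spilled
// values each back in and finishes with a shfl.sync.down tree (offsets
// 1, 2, 4, 8, 16) that leaves the block total in lane 0. This looks
// like CUB's BlockReduce raking pattern, which would explain the
// demoted temp_storage declared at the top.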
$L__BB0_9:
    mov.u32 %r263, %tid.y;
    mov.u32 %r262, %tid.x;
    shl.b32 %r261, %r263, 5;
    add.s32 %r260, %r261, %r262;
    shl.b32 %r114, %r260, 2;
    mov.u32 %r115, _ZZ9gemv_int4ILi4ELi32ELi8EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage;
    add.s32 %r116, %r115, %r114;
    setp.lt.u32 %p8, %r260, 32;
    @%p8 bra $L__BB0_11;
    add.s32 %r252, %r116, -112;
    st.shared.f32 [%r252], %f568;
$L__BB0_11:
    mov.u32 %r267, %tid.y;
    mov.u32 %r266, %tid.x;
    shl.b32 %r265, %r267, 5;
    add.s32 %r264, %r265, %r266;
    setp.gt.u32 %p9, %r264, 31;
    bar.sync 0;
    mad.lo.s32 %r47, %r264, 12, %r115;
    @%p9 bra $L__BB0_13;
    mov.u32 %r131, 16;
    ld.shared.f32 %f391, [%r47+16];
    add.ftz.f32 %f392, %f568, %f391;
    ld.shared.f32 %f393, [%r47+20];
    add.ftz.f32 %f394, %f392, %f393;
    ld.shared.f32 %f395, [%r47+24];
    add.ftz.f32 %f378, %f394, %f395;
    mov.u32 %r119, 1;
    mov.u32 %r132, 31;
    mov.u32 %r133, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f378, %r119, %r132, %r133;
      @p add.f32 r0, r0, %f378;
      mov.f32 %f376, r0;}
    // end inline asm
    mov.u32 %r122, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f376, %r122, %r132, %r133;
      @p add.f32 r0, r0, %f376;
      mov.f32 %f379, r0;}
    // end inline asm
    mov.u32 %r125, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f379, %r125, %r132, %r133;
      @p add.f32 r0, r0, %f379;
      mov.f32 %f382, r0;}
    // end inline asm
    mov.u32 %r128, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f382, %r128, %r132, %r133;
      @p add.f32 r0, r0, %f382;
      mov.f32 %f385, r0;}
    // end inline asm
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f385, %r131, %r132, %r133;
      @p add.f32 r0, r0, %f385;
      mov.f32 %f568, r0;}
    // end inline asm
$L__BB0_13:
    @%p8 bra $L__BB0_15;
    add.s32 %r253, %r116, -112;
    st.shared.f32 [%r253+640], %f569;
$L__BB0_15:
    bar.sync 0;
    @%p9 bra $L__BB0_17;
    ld.shared.f32 %f411, [%r47+656];
    add.ftz.f32 %f412, %f569, %f411;
    ld.shared.f32 %f413, [%r47+660];
    add.ftz.f32 %f414, %f412, %f413;
    ld.shared.f32 %f415, [%r47+664];
    add.ftz.f32 %f398, %f414, %f415;
    mov.u32 %r135, 1;
    mov.u32 %r148, 31;
    mov.u32 %r149, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f398, %r135, %r148, %r149;
      @p add.f32 r0, r0, %f398;
      mov.f32 %f396, r0;}
    // end inline asm
    mov.u32 %r138, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f396, %r138, %r148, %r149;
      @p add.f32 r0, r0, %f396;
      mov.f32 %f399, r0;}
    // end inline asm
    mov.u32 %r141, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f399, %r141, %r148, %r149;
      @p add.f32 r0, r0, %f399;
      mov.f32 %f402, r0;}
    // end inline asm
    mov.u32 %r144, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f402, %r144, %r148, %r149;
      @p add.f32 r0, r0, %f402;
      mov.f32 %f405, r0;}
    // end inline asm
    mov.u32 %r147, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f405, %r147, %r148, %r149;
      @p add.f32 r0, r0, %f405;
      mov.f32 %f569, r0;}
    // end inline asm
$L__BB0_17:
    @%p8 bra $L__BB0_19;
    add.s32 %r254, %r116, -112;
    st.shared.f32 [%r254+1280], %f570;
$L__BB0_19:
    bar.sync 0;
    @%p9 bra $L__BB0_21;
    ld.shared.f32 %f431, [%r47+1296];
    add.ftz.f32 %f432, %f570, %f431;
    ld.shared.f32 %f433, [%r47+1300];
    add.ftz.f32 %f434, %f432, %f433;
    ld.shared.f32 %f435, [%r47+1304];
    add.ftz.f32 %f418, %f434, %f435;
    mov.u32 %r151, 1;
    mov.u32 %r164, 31;
    mov.u32 %r165, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f418, %r151, %r164, %r165;
      @p add.f32 r0, r0, %f418;
      mov.f32 %f416, r0;}
    // end inline asm
    mov.u32 %r154, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f416, %r154, %r164, %r165;
      @p add.f32 r0, r0, %f416;
      mov.f32 %f419, r0;}
    // end inline asm
    mov.u32 %r157, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f419, %r157, %r164, %r165;
      @p add.f32 r0, r0, %f419;
      mov.f32 %f422, r0;}
    // end inline asm
    mov.u32 %r160, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f422, %r160, %r164, %r165;
      @p add.f32 r0, r0, %f422;
      mov.f32 %f425, r0;}
    // end inline asm
    mov.u32 %r163, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f425, %r163, %r164, %r165;
      @p add.f32 r0, r0, %f425;
      mov.f32 %f570, r0;}
    // end inline asm
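// The spill / bar.sync / rake / warp-shuffle sequence above now repeats
// for the remaining accumulators %f571-%f575, stepping the shared-memory
// slice by 640 bytes each time.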
$L__BB0_21:
    @%p8 bra $L__BB0_23;
    add.s32 %r255, %r116, -112;
    st.shared.f32 [%r255+1920], %f571;
$L__BB0_23:
    bar.sync 0;
    @%p9 bra $L__BB0_25;
    ld.shared.f32 %f451, [%r47+1936];
    add.ftz.f32 %f452, %f571, %f451;
    ld.shared.f32 %f453, [%r47+1940];
    add.ftz.f32 %f454, %f452, %f453;
    ld.shared.f32 %f455, [%r47+1944];
    add.ftz.f32 %f438, %f454, %f455;
    mov.u32 %r167, 1;
    mov.u32 %r180, 31;
    mov.u32 %r181, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f438, %r167, %r180, %r181;
      @p add.f32 r0, r0, %f438;
      mov.f32 %f436, r0;}
    // end inline asm
    mov.u32 %r170, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f436, %r170, %r180, %r181;
      @p add.f32 r0, r0, %f436;
      mov.f32 %f439, r0;}
    // end inline asm
    mov.u32 %r173, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f439, %r173, %r180, %r181;
      @p add.f32 r0, r0, %f439;
      mov.f32 %f442, r0;}
    // end inline asm
    mov.u32 %r176, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f442, %r176, %r180, %r181;
      @p add.f32 r0, r0, %f442;
      mov.f32 %f445, r0;}
    // end inline asm
    mov.u32 %r179, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f445, %r179, %r180, %r181;
      @p add.f32 r0, r0, %f445;
      mov.f32 %f571, r0;}
    // end inline asm
$L__BB0_25:
    @%p8 bra $L__BB0_27;
    add.s32 %r256, %r116, -112;
    st.shared.f32 [%r256+2560], %f572;
$L__BB0_27:
    bar.sync 0;
    @%p9 bra $L__BB0_29;
    ld.shared.f32 %f471, [%r47+2576];
    add.ftz.f32 %f472, %f572, %f471;
    ld.shared.f32 %f473, [%r47+2580];
    add.ftz.f32 %f474, %f472, %f473;
    ld.shared.f32 %f475, [%r47+2584];
    add.ftz.f32 %f458, %f474, %f475;
    mov.u32 %r183, 1;
    mov.u32 %r196, 31;
    mov.u32 %r197, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f458, %r183, %r196, %r197;
      @p add.f32 r0, r0, %f458;
      mov.f32 %f456, r0;}
    // end inline asm
    mov.u32 %r186, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f456, %r186, %r196, %r197;
      @p add.f32 r0, r0, %f456;
      mov.f32 %f459, r0;}
    // end inline asm
    mov.u32 %r189, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f459, %r189, %r196, %r197;
      @p add.f32 r0, r0, %f459;
      mov.f32 %f462, r0;}
    // end inline asm
    mov.u32 %r192, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f462, %r192, %r196, %r197;
      @p add.f32 r0, r0, %f462;
      mov.f32 %f465, r0;}
    // end inline asm
    mov.u32 %r195, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f465, %r195, %r196, %r197;
      @p add.f32 r0, r0, %f465;
      mov.f32 %f572, r0;}
    // end inline asm
$L__BB0_29:
    @%p8 bra $L__BB0_31;
    add.s32 %r257, %r116, -112;
    st.shared.f32 [%r257+3200], %f573;
$L__BB0_31:
    bar.sync 0;
    @%p9 bra $L__BB0_33;
    ld.shared.f32 %f491, [%r47+3216];
    add.ftz.f32 %f492, %f573, %f491;
    ld.shared.f32 %f493, [%r47+3220];
    add.ftz.f32 %f494, %f492, %f493;
    ld.shared.f32 %f495, [%r47+3224];
    add.ftz.f32 %f478, %f494, %f495;
    mov.u32 %r199, 1;
    mov.u32 %r212, 31;
    mov.u32 %r213, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f478, %r199, %r212, %r213;
      @p add.f32 r0, r0, %f478;
      mov.f32 %f476, r0;}
    // end inline asm
    mov.u32 %r202, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f476, %r202, %r212, %r213;
      @p add.f32 r0, r0, %f476;
      mov.f32 %f479, r0;}
    // end inline asm
    mov.u32 %r205, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f479, %r205, %r212, %r213;
      @p add.f32 r0, r0, %f479;
      mov.f32 %f482, r0;}
    // end inline asm
    mov.u32 %r208, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f482, %r208, %r212, %r213;
      @p add.f32 r0, r0, %f482;
      mov.f32 %f485, r0;}
    // end inline asm
    mov.u32 %r211, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f485, %r211, %r212, %r213;
      @p add.f32 r0, r0, %f485;
      mov.f32 %f573, r0;}
    // end inline asm
$L__BB0_33:
    @%p8 bra $L__BB0_35;
    add.s32 %r258, %r116, -112;
    st.shared.f32 [%r258+3840], %f574;
$L__BB0_35:
    bar.sync 0;
    @%p9 bra $L__BB0_37;
    ld.shared.f32 %f511, [%r47+3856];
    add.ftz.f32 %f512, %f574, %f511;
    ld.shared.f32 %f513, [%r47+3860];
    add.ftz.f32 %f514, %f512, %f513;
    ld.shared.f32 %f515, [%r47+3864];
    add.ftz.f32 %f498, %f514, %f515;
    mov.u32 %r215, 1;
    mov.u32 %r228, 31;
    mov.u32 %r229, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f498, %r215, %r228, %r229;
      @p add.f32 r0, r0, %f498;
      mov.f32 %f496, r0;}
    // end inline asm
    mov.u32 %r218, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f496, %r218, %r228, %r229;
      @p add.f32 r0, r0, %f496;
      mov.f32 %f499, r0;}
    // end inline asm
    mov.u32 %r221, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f499, %r221, %r228, %r229;
      @p add.f32 r0, r0, %f499;
      mov.f32 %f502, r0;}
    // end inline asm
    mov.u32 %r224, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f502, %r224, %r228, %r229;
      @p add.f32 r0, r0, %f502;
      mov.f32 %f505, r0;}
    // end inline asm
    mov.u32 %r227, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f505, %r227, %r228, %r229;
      @p add.f32 r0, r0, %f505;
      mov.f32 %f574, r0;}
    // end inline asm
$L__BB0_37:
    @%p8 bra $L__BB0_39;
    add.s32 %r259, %r116, -112;
    st.shared.f32 [%r259+4480], %f575;
$L__BB0_39:
    bar.sync 0;
    @%p9 bra $L__BB0_41;
    ld.shared.f32 %f531, [%r47+4496];
    add.ftz.f32 %f532, %f575, %f531;
    ld.shared.f32 %f533, [%r47+4500];
    add.ftz.f32 %f534, %f532, %f533;
    ld.shared.f32 %f535, [%r47+4504];
    add.ftz.f32 %f518, %f534, %f535;
    mov.u32 %r231, 1;
    mov.u32 %r244, 31;
    mov.u32 %r245, -1;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f518, %r231, %r244, %r245;
      @p add.f32 r0, r0, %f518;
      mov.f32 %f516, r0;}
    // end inline asm
    mov.u32 %r234, 2;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f516, %r234, %r244, %r245;
      @p add.f32 r0, r0, %f516;
      mov.f32 %f519, r0;}
    // end inline asm
    mov.u32 %r237, 4;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f519, %r237, %r244, %r245;
      @p add.f32 r0, r0, %f519;
      mov.f32 %f522, r0;}
    // end inline asm
    mov.u32 %r240, 8;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f522, %r240, %r244, %r245;
      @p add.f32 r0, r0, %f522;
      mov.f32 %f525, r0;}
    // end inline asm
    mov.u32 %r243, 16;
    // begin inline asm
    { .reg .f32 r0; .reg .pred p;
      shfl.sync.down.b32 r0|p, %f525, %r243, %r244, %r245;
      @p add.f32 r0, r0, %f525;
      mov.f32 %f575, r0;}
    // end inline asm
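// Epilogue, executed only by thread (0,0) of the block: each reduced
// sum is scaled by alpha (%f77) and stored as fp16 at
// out[ctaid.x + k * %r53] for k = 0..7, so %r53 looks like the stride
// between the eight result vectors. When the pointer at param +8 is
// non-null the existing value is folded in as y = alpha * acc +
// beta * y_in (beta in %f78); the $L__BB0_44 path below handles the
// null-pointer case, storing alpha * acc directly.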
$L__BB0_41:
    mov.u32 %r246, %tid.y;
    or.b32 %r248, %r57, %r246;
    setp.ne.s32 %p24, %r248, 0;
    @%p24 bra $L__BB0_45;
    ld.param.u64 %rd65, [_Z27dequant_gemv_group32_batch823DequantGemvKernelParams_param_0+8];
    ld.param.u64 %rd64, [_Z27dequant_gemv_group32_batch823DequantGemvKernelParams_param_0];
    cvta.to.global.u64 %rd37, %rd64;
    setp.eq.s64 %p25, %rd65, 0;
    mul.ftz.f32 %f59, %f77, %f568;
    mov.u32 %r249, %ctaid.x;
    cvt.s64.s32 %rd6, %r249;
    mul.wide.s32 %rd38, %r249, 2;
    add.s64 %rd7, %rd37, %rd38;
    mul.ftz.f32 %f60, %f77, %f569;
    add.s32 %r250, %r53, %r249;
    cvt.s64.s32 %rd8, %r53;
    mul.wide.s32 %rd39, %r53, 2;
    add.s64 %rd9, %rd7, %rd39;
    mul.ftz.f32 %f61, %f77, %f570;
    add.s32 %r251, %r250, %r53;
    cvt.s64.s32 %rd10, %r251;
    mul.wide.s32 %rd40, %r251, 2;
    add.s64 %rd12, %rd37, %rd40;
    mul.ftz.f32 %f62, %f77, %f571;
    mul.ftz.f32 %f63, %f77, %f572;
    mul.ftz.f32 %f64, %f77, %f573;
    mul.ftz.f32 %f65, %f77, %f574;
    mul.ftz.f32 %f66, %f77, %f575;
    @%p25 bra $L__BB0_44;
    ld.param.u64 %rd66, [_Z27dequant_gemv_group32_batch823DequantGemvKernelParams_param_0+8];
    cvta.to.global.u64 %rd41, %rd66;
    shl.b64 %rd42, %rd6, 1;
    add.s64 %rd43, %rd41, %rd42;
    ld.global.u16 %rs263, [%rd43];
    // begin inline asm
    { cvt.f32.f16 %f536, %rs263;}
    // end inline asm
    fma.rn.ftz.f32 %f537, %f78, %f536, %f59;
    // begin inline asm
    { cvt.rn.f16.f32 %rs264, %f537;}
    // end inline asm
    st.global.u16 [%rd7], %rs264;
    shl.b64 %rd44, %rd8, 1;
    add.s64 %rd45, %rd43, %rd44;
    ld.global.u16 %rs265, [%rd45];
    // begin inline asm
    { cvt.f32.f16 %f538, %rs265;}
    // end inline asm
    fma.rn.ftz.f32 %f539, %f78, %f538, %f60;
    // begin inline asm
    { cvt.rn.f16.f32 %rs266, %f539;}
    // end inline asm
    st.global.u16 [%rd9], %rs266;
    shl.b64 %rd46, %rd10, 1;
    add.s64 %rd47, %rd41, %rd46;
    ld.global.u16 %rs267, [%rd47];
    // begin inline asm
    { cvt.f32.f16 %f540, %rs267;}
    // end inline asm
    fma.rn.ftz.f32 %f541, %f78, %f540, %f61;
    // begin inline asm
    { cvt.rn.f16.f32 %rs268, %f541;}
    // end inline asm
    st.global.u16 [%rd12], %rs268;
    add.s64 %rd48, %rd47, %rd44;
    ld.global.u16 %rs269, [%rd48];
    // begin inline asm
    { cvt.f32.f16 %f542, %rs269;}
    // end inline asm
    fma.rn.ftz.f32 %f543, %f78, %f542, %f62;
    // begin inline asm
    { cvt.rn.f16.f32 %rs270, %f543;}
    // end inline asm
    add.s64 %rd49, %rd12, %rd44;
    st.global.u16 [%rd49], %rs270;
    add.s64 %rd50, %rd48, %rd44;
    ld.global.u16 %rs271, [%rd50];
    // begin inline asm
    { cvt.f32.f16 %f544, %rs271;}
    // end inline asm
    fma.rn.ftz.f32 %f545, %f78, %f544, %f63;
    // begin inline asm
    { cvt.rn.f16.f32 %rs272, %f545;}
    // end inline asm
    add.s64 %rd51, %rd49, %rd44;
    st.global.u16 [%rd51], %rs272;
    add.s64 %rd52, %rd50, %rd44;
    ld.global.u16 %rs273, [%rd52];
    // begin inline asm
    { cvt.f32.f16 %f546, %rs273;}
    // end inline asm
    fma.rn.ftz.f32 %f547, %f78, %f546, %f64;
    // begin inline asm
    { cvt.rn.f16.f32 %rs274, %f547;}
    // end inline asm
    add.s64 %rd53, %rd51, %rd44;
    st.global.u16 [%rd53], %rs274;
    add.s64 %rd54, %rd52, %rd44;
    ld.global.u16 %rs275, [%rd54];
    // begin inline asm
    { cvt.f32.f16 %f548, %rs275;}
    // end inline asm
    fma.rn.ftz.f32 %f549, %f78, %f548, %f65;
    // begin inline asm
    { cvt.rn.f16.f32 %rs276, %f549;}
    // end inline asm
    add.s64 %rd55, %rd53, %rd44;
    st.global.u16 [%rd55], %rs276;
    add.s64 %rd56, %rd54, %rd44;
    ld.global.u16 %rs277, [%rd56];
    // begin inline asm
    { cvt.f32.f16 %f550, %rs277;}
    // end inline asm
    fma.rn.ftz.f32 %f551, %f78, %f550, %f66;
    // begin inline asm
    { cvt.rn.f16.f32 %rs278, %f551;}
    // end inline asm
    add.s64 %rd57, %rd55, %rd44;
    st.global.u16 [%rd57], %rs278;
    bra.uni $L__BB0_45;
$L__BB0_44:
    // begin inline asm
    { cvt.rn.f16.f32 %rs279, %f59;}
    // end inline asm
    st.global.u16 [%rd7], %rs279;
    // begin inline asm
    { cvt.rn.f16.f32 %rs280, %f60;}
    // end inline asm
    st.global.u16 [%rd9], %rs280;
    // begin inline asm
    { cvt.rn.f16.f32 %rs281, %f61;}
    // end inline asm
    st.global.u16 [%rd12], %rs281;
    // begin inline asm
    { cvt.rn.f16.f32 %rs282, %f62;}
    // end inline asm
    shl.b64 %rd58, %rd8, 1;
    add.s64 %rd59, %rd12, %rd58;
    st.global.u16 [%rd59], %rs282;
    // begin inline asm
    { cvt.rn.f16.f32 %rs283, %f63;}
    // end inline asm
    add.s64 %rd60, %rd59, %rd58;
    st.global.u16 [%rd60], %rs283;
    // begin inline asm
    { cvt.rn.f16.f32 %rs284, %f64;}
    // end inline asm
    add.s64 %rd61, %rd60, %rd58;
    st.global.u16 [%rd61], %rs284;
    // begin inline asm
    { cvt.rn.f16.f32 %rs285, %f65;}
    // end inline asm
    add.s64 %rd62, %rd61, %rd58;
    st.global.u16 [%rd62], %rs285;
    // begin inline asm
    { cvt.rn.f16.f32 %rs286, %f66;}
    // end inline asm
    add.s64 %rd63, %rd62, %rd58;
    st.global.u16 [%rd63], %rs286;
$L__BB0_45:
    ret;
}
// .globl _ZN3cub11EmptyKernelIvEEvv
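// The entry below appears to be cub::EmptyKernel<void>, the stub that
// CUB/Thrust emits into translation units that include it; it is not
// part of the GEMV itself.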
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
    ret;
}