RNAL_36609967_7_gemv_cu_a568b9746thrust12placeholders2_3E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a568b9746thrust12placeholders2_4E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a568b9746thrust12placeholders2_5E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a568b9746thrust12placeholders2_6E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a568b9746thrust12placeholders2_7E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a568b9746thrust12placeholders2_8E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a568b9746thrust12placeholders2_9E[1]; .global .align 1 .b8 _ZN37_INTERNAL_36609967_7_gemv_cu_a568b9746thrust12placeholders3_10E[1]; .visible .entry _Z27dequant_gemv_group64_batch823DequantGemvKernelParams( .param .align 8 .b8 _Z27dequant_gemv_group64_batch823DequantGemvKernelParams_param_0[80] ) { .reg .pred %p<33>; .reg .b16 %rs<422>; .reg .f32 %f<914>; .reg .b32 %r<482>; .reg .b64 %rd<109>; // demoted variable .shared .align 16 .b8 _ZZ9gemv_int4ILi4ELi64ELi8EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage[5120]; ld.param.v2.u32 {%r59, %r60}, [_Z27dequant_gemv_group64_batch823DequantGemvKernelParams_param_0+48]; ld.param.v2.u32 {%r61, %r62}, [_Z27dequant_gemv_group64_batch823DequantGemvKernelParams_param_0+56]; ld.param.v2.f32 {%f95, %f96}, [_Z27dequant_gemv_group64_batch823DequantGemvKernelParams_param_0+64]; ld.param.v4.u8 {%rs43, %rs44, %rs45, %rs46}, [_Z27dequant_gemv_group64_batch823DequantGemvKernelParams_param_0+72]; ld.param.u64 %rd36, [_Z27dequant_gemv_group64_batch823DequantGemvKernelParams_param_0+40]; ld.param.u64 %rd35, [_Z27dequant_gemv_group64_batch823DequantGemvKernelParams_param_0+32]; ld.param.u64 %rd34, [_Z27dequant_gemv_group64_batch823DequantGemvKernelParams_param_0+24]; ld.param.u64 %rd33, [_Z27dequant_gemv_group64_batch823DequantGemvKernelParams_param_0+16]; ld.param.u64 %rd32, [_Z27dequant_gemv_group64_batch823DequantGemvKernelParams_param_0+8]; cvta.to.global.u64 
%rd2, %rd32; mov.u32 %r481, %tid.y; shl.b32 %r63, %r481, 5; mov.u32 %r64, %tid.x; add.s32 %r480, %r63, %r64; shl.b32 %r479, %r480, 1; setp.ge.u32 %p1, %r479, %r61; mov.f32 %f882, 0f00000000; mov.f32 %f883, %f882; mov.f32 %f884, %f882; mov.f32 %f885, %f882; mov.f32 %f886, %f882; mov.f32 %f887, %f882; mov.f32 %f888, %f882; mov.f32 %f889, %f882; @%p1 bra $L__BB0_9; cvta.to.global.u64 %rd3, %rd33; mov.u32 %r65, %ctaid.x; mul.lo.s32 %r6, %r62, %r65; $L__BB0_2: mad.lo.s32 %r69, %r61, %r65, %r479; mul.wide.u32 %rd43, %r69, 4; add.s64 %rd38, %rd34, %rd43; // begin inline asm createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd37, 1.0; // end inline asm // begin inline asm ld.global.L1::no_allocate.L2::cache_hint.v2.u32 {%r66,%r67}, [%rd38], %rd37; // end inline asm shr.u32 %r71, %r64, 2; shl.b32 %r72, %r481, 3; add.s32 %r12, %r72, %r71; add.s32 %r13, %r12, %r6; mul.wide.s32 %rd44, %r13, 2; add.s64 %rd41, %rd36, %rd44; // begin inline asm createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd40, 1.0; // end inline asm // begin inline asm ld.global.L1::evict_last.L2::cache_hint.u16 %rs51, [%rd41], %rd40; // end inline asm // begin inline asm { cvt.f32.f16 %f105, %rs51;} // end inline asm shl.b16 %rs421, %rs43, 3; setp.eq.s64 %p2, %rd35, 0; @%p2 bra $L__BB0_4; shr.u32 %r73, %r13, 31; add.s32 %r74, %r13, %r73; shr.s32 %r75, %r74, 1; cvt.s64.s32 %rd48, %r75; add.s64 %rd46, %rd35, %rd48; // begin inline asm createpolicy.fractional.L2::evict_first.L2::evict_first.b64 %rd45, 1.0; // end inline asm // begin inline asm ld.global.L1::evict_last.L2::cache_hint.u8 %rs53, [%rd46], %rd45; // end inline asm cvt.u32.u16 %r76, %rs53; and.b32 %r77, %r76, 255; shl.b32 %r78, %r12, 2; and.b32 %r79, %r78, 4; shr.u32 %r80, %r77, %r79; cvt.u16.u32 %rs54, %r80; and.b16 %rs421, %rs54, 15; $L__BB0_4: shl.b32 %r14, %r480, 4; setp.ge.s32 %p3, %r14, %r59; @%p3 bra $L__BB0_8; setp.eq.s16 %p4, %rs43, 0; shr.u16 %rs56, %rs421, 3; and.b16 %rs57, %rs56, 1; setp.eq.b16 %p5, %rs57, 1; 
and.pred %p6, %p4, %p5; selp.b16 %rs58, -16, 0, %p6; or.b16 %rs59, %rs58, %rs421; cvt.s16.s8 %rs60, %rs59; cvt.rn.f32.s16 %f10, %rs60; mul.wide.s32 %rd49, %r14, 2; add.s64 %rd7, %rd3, %rd49; ld.global.v4.u32 {%r81, %r82, %r83, %r84}, [%rd7]; mul.wide.s32 %rd9, %r59, 2; add.s64 %rd8, %rd7, %rd9; ld.global.v4.u32 {%r85, %r86, %r87, %r88}, [%rd8]; add.s32 %r89, %r14, %r59; add.s32 %r90, %r89, %r59; mul.wide.s32 %rd50, %r90, 2; add.s64 %rd51, %rd3, %rd50; ld.global.v4.u32 {%r91, %r92, %r93, %r94}, [%rd51]; add.s64 %rd52, %rd51, %rd9; ld.global.v4.u32 {%r95, %r96, %r97, %r98}, [%rd52]; add.s64 %rd53, %rd52, %rd9; ld.global.v4.u32 {%r99, %r100, %r101, %r102}, [%rd53]; add.s64 %rd54, %rd53, %rd9; ld.global.v4.u32 {%r103, %r104, %r105, %r106}, [%rd54]; add.s64 %rd55, %rd54, %rd9; ld.global.v4.u32 {%r107, %r108, %r109, %r110}, [%rd55]; add.s64 %rd56, %rd55, %rd9; ld.global.v4.u32 {%r111, %r112, %r113, %r114}, [%rd56]; cvt.u16.u32 %rs5, %r66; and.b16 %rs6, %rs5, 15; shr.u32 %r115, %r66, 4; cvt.u16.u32 %rs7, %r115; and.b16 %rs8, %rs7, 15; shr.u32 %r116, %r66, 8; cvt.u16.u32 %rs9, %r116; and.b16 %rs10, %rs9, 15; shr.u32 %r117, %r66, 12; cvt.u16.u32 %rs11, %r117; and.b16 %rs12, %rs11, 15; shr.u32 %r118, %r66, 16; cvt.u16.u32 %rs13, %r118; and.b16 %rs14, %rs13, 15; shr.u32 %r119, %r66, 20; cvt.u16.u32 %rs15, %r119; and.b16 %rs16, %rs15, 15; shr.u32 %r120, %r66, 24; cvt.u16.u32 %rs17, %r120; and.b16 %rs18, %rs17, 15; shr.u32 %r121, %r66, 28; cvt.u16.u32 %rs19, %r121; cvt.u16.u32 %rs20, %r67; and.b16 %rs21, %rs20, 15; shr.u32 %r122, %r67, 4; cvt.u16.u32 %rs22, %r122; and.b16 %rs23, %rs22, 15; shr.u32 %r123, %r67, 8; cvt.u16.u32 %rs24, %r123; and.b16 %rs25, %rs24, 15; shr.u32 %r124, %r67, 12; cvt.u16.u32 %rs26, %r124; and.b16 %rs27, %rs26, 15; shr.u32 %r125, %r67, 16; cvt.u16.u32 %rs28, %r125; and.b16 %rs29, %rs28, 15; shr.u32 %r126, %r67, 20; cvt.u16.u32 %rs30, %r126; and.b16 %rs31, %rs30, 15; shr.u32 %r127, %r67, 24; cvt.u16.u32 %rs32, %r127; and.b16 %rs33, %rs32, 15; shr.u32 
%r128, %r67, 28; cvt.u16.u32 %rs34, %r128; add.s64 %rd57, %rd8, %rd9; add.s64 %rd10, %rd57, 16; add.s64 %rd11, %rd10, %rd9; add.s64 %rd12, %rd11, %rd9; add.s64 %rd13, %rd12, %rd9; add.s64 %rd14, %rd13, %rd9; @%p4 bra $L__BB0_7; cvt.rn.f32.s16 %f234, %rs6; sub.ftz.f32 %f235, %f234, %f10; mul.ftz.f32 %f236, %f105, %f235; mov.b32 {%rs61, %rs69}, %r81; // begin inline asm { cvt.f32.f16 %f106, %rs61;} // end inline asm fma.rn.ftz.f32 %f237, %f236, %f106, %f889; mov.b32 {%rs62, %rs70}, %r85; // begin inline asm { cvt.f32.f16 %f107, %rs62;} // end inline asm fma.rn.ftz.f32 %f238, %f236, %f107, %f888; mov.b32 {%rs63, %rs71}, %r91; // begin inline asm { cvt.f32.f16 %f108, %rs63;} // end inline asm fma.rn.ftz.f32 %f239, %f236, %f108, %f887; mov.b32 {%rs64, %rs72}, %r95; // begin inline asm { cvt.f32.f16 %f109, %rs64;} // end inline asm fma.rn.ftz.f32 %f240, %f236, %f109, %f886; mov.b32 {%rs65, %rs73}, %r99; // begin inline asm { cvt.f32.f16 %f110, %rs65;} // end inline asm fma.rn.ftz.f32 %f241, %f236, %f110, %f885; mov.b32 {%rs66, %rs74}, %r103; // begin inline asm { cvt.f32.f16 %f111, %rs66;} // end inline asm fma.rn.ftz.f32 %f242, %f236, %f111, %f884; mov.b32 {%rs67, %rs75}, %r107; // begin inline asm { cvt.f32.f16 %f112, %rs67;} // end inline asm fma.rn.ftz.f32 %f243, %f236, %f112, %f883; mov.b32 {%rs68, %rs76}, %r111; // begin inline asm { cvt.f32.f16 %f113, %rs68;} // end inline asm fma.rn.ftz.f32 %f244, %f236, %f113, %f882; cvt.rn.f32.s16 %f245, %rs8; sub.ftz.f32 %f246, %f245, %f10; mul.ftz.f32 %f247, %f105, %f246; // begin inline asm { cvt.f32.f16 %f114, %rs69;} // end inline asm fma.rn.ftz.f32 %f248, %f247, %f114, %f237; // begin inline asm { cvt.f32.f16 %f115, %rs70;} // end inline asm fma.rn.ftz.f32 %f249, %f247, %f115, %f238; // begin inline asm { cvt.f32.f16 %f116, %rs71;} // end inline asm fma.rn.ftz.f32 %f250, %f247, %f116, %f239; // begin inline asm { cvt.f32.f16 %f117, %rs72;} // end inline asm fma.rn.ftz.f32 %f251, %f247, %f117, %f240; // begin inline asm { 
cvt.f32.f16 %f118, %rs73;} // end inline asm fma.rn.ftz.f32 %f252, %f247, %f118, %f241; // begin inline asm { cvt.f32.f16 %f119, %rs74;} // end inline asm fma.rn.ftz.f32 %f253, %f247, %f119, %f242; // begin inline asm { cvt.f32.f16 %f120, %rs75;} // end inline asm fma.rn.ftz.f32 %f254, %f247, %f120, %f243; // begin inline asm { cvt.f32.f16 %f121, %rs76;} // end inline asm fma.rn.ftz.f32 %f255, %f247, %f121, %f244; cvt.rn.f32.s16 %f256, %rs10; sub.ftz.f32 %f257, %f256, %f10; mul.ftz.f32 %f258, %f105, %f257; mov.b32 {%rs77, %rs85}, %r82; // begin inline asm { cvt.f32.f16 %f122, %rs77;} // end inline asm fma.rn.ftz.f32 %f259, %f258, %f122, %f248; mov.b32 {%rs78, %rs86}, %r86; // begin inline asm { cvt.f32.f16 %f123, %rs78;} // end inline asm fma.rn.ftz.f32 %f260, %f258, %f123, %f249; mov.b32 {%rs79, %rs87}, %r92; // begin inline asm { cvt.f32.f16 %f124, %rs79;} // end inline asm fma.rn.ftz.f32 %f261, %f258, %f124, %f250; mov.b32 {%rs80, %rs88}, %r96; // begin inline asm { cvt.f32.f16 %f125, %rs80;} // end inline asm fma.rn.ftz.f32 %f262, %f258, %f125, %f251; mov.b32 {%rs81, %rs89}, %r100; // begin inline asm { cvt.f32.f16 %f126, %rs81;} // end inline asm fma.rn.ftz.f32 %f263, %f258, %f126, %f252; mov.b32 {%rs82, %rs90}, %r104; // begin inline asm { cvt.f32.f16 %f127, %rs82;} // end inline asm fma.rn.ftz.f32 %f264, %f258, %f127, %f253; mov.b32 {%rs83, %rs91}, %r108; // begin inline asm { cvt.f32.f16 %f128, %rs83;} // end inline asm fma.rn.ftz.f32 %f265, %f258, %f128, %f254; mov.b32 {%rs84, %rs92}, %r112; // begin inline asm { cvt.f32.f16 %f129, %rs84;} // end inline asm fma.rn.ftz.f32 %f266, %f258, %f129, %f255; cvt.rn.f32.s16 %f267, %rs12; sub.ftz.f32 %f268, %f267, %f10; mul.ftz.f32 %f269, %f105, %f268; // begin inline asm { cvt.f32.f16 %f130, %rs85;} // end inline asm fma.rn.ftz.f32 %f270, %f269, %f130, %f259; // begin inline asm { cvt.f32.f16 %f131, %rs86;} // end inline asm fma.rn.ftz.f32 %f271, %f269, %f131, %f260; // begin inline asm { cvt.f32.f16 %f132, %rs87;} 
// end inline asm fma.rn.ftz.f32 %f272, %f269, %f132, %f261; // begin inline asm { cvt.f32.f16 %f133, %rs88;} // end inline asm fma.rn.ftz.f32 %f273, %f269, %f133, %f262; // begin inline asm { cvt.f32.f16 %f134, %rs89;} // end inline asm fma.rn.ftz.f32 %f274, %f269, %f134, %f263; // begin inline asm { cvt.f32.f16 %f135, %rs90;} // end inline asm fma.rn.ftz.f32 %f275, %f269, %f135, %f264; // begin inline asm { cvt.f32.f16 %f136, %rs91;} // end inline asm fma.rn.ftz.f32 %f276, %f269, %f136, %f265; // begin inline asm { cvt.f32.f16 %f137, %rs92;} // end inline asm fma.rn.ftz.f32 %f277, %f269, %f137, %f266; cvt.rn.f32.s16 %f278, %rs14; sub.ftz.f32 %f279, %f278, %f10; mul.ftz.f32 %f280, %f105, %f279; mov.b32 {%rs93, %rs101}, %r83; // begin inline asm { cvt.f32.f16 %f138, %rs93;} // end inline asm fma.rn.ftz.f32 %f281, %f280, %f138, %f270; mov.b32 {%rs94, %rs102}, %r87; // begin inline asm { cvt.f32.f16 %f139, %rs94;} // end inline asm fma.rn.ftz.f32 %f282, %f280, %f139, %f271; mov.b32 {%rs95, %rs103}, %r93; // begin inline asm { cvt.f32.f16 %f140, %rs95;} // end inline asm fma.rn.ftz.f32 %f283, %f280, %f140, %f272; mov.b32 {%rs96, %rs104}, %r97; // begin inline asm { cvt.f32.f16 %f141, %rs96;} // end inline asm fma.rn.ftz.f32 %f284, %f280, %f141, %f273; mov.b32 {%rs97, %rs105}, %r101; // begin inline asm { cvt.f32.f16 %f142, %rs97;} // end inline asm fma.rn.ftz.f32 %f285, %f280, %f142, %f274; mov.b32 {%rs98, %rs106}, %r105; // begin inline asm { cvt.f32.f16 %f143, %rs98;} // end inline asm fma.rn.ftz.f32 %f286, %f280, %f143, %f275; mov.b32 {%rs99, %rs107}, %r109; // begin inline asm { cvt.f32.f16 %f144, %rs99;} // end inline asm fma.rn.ftz.f32 %f287, %f280, %f144, %f276; mov.b32 {%rs100, %rs108}, %r113; // begin inline asm { cvt.f32.f16 %f145, %rs100;} // end inline asm fma.rn.ftz.f32 %f288, %f280, %f145, %f277; cvt.rn.f32.s16 %f289, %rs16; sub.ftz.f32 %f290, %f289, %f10; mul.ftz.f32 %f291, %f105, %f290; // begin inline asm { cvt.f32.f16 %f146, %rs101;} // end inline 
asm fma.rn.ftz.f32 %f292, %f291, %f146, %f281; // begin inline asm { cvt.f32.f16 %f147, %rs102;} // end inline asm fma.rn.ftz.f32 %f293, %f291, %f147, %f282; // begin inline asm { cvt.f32.f16 %f148, %rs103;} // end inline asm fma.rn.ftz.f32 %f294, %f291, %f148, %f283; // begin inline asm { cvt.f32.f16 %f149, %rs104;} // end inline asm fma.rn.ftz.f32 %f295, %f291, %f149, %f284; // begin inline asm { cvt.f32.f16 %f150, %rs105;} // end inline asm fma.rn.ftz.f32 %f296, %f291, %f150, %f285; // begin inline asm { cvt.f32.f16 %f151, %rs106;} // end inline asm fma.rn.ftz.f32 %f297, %f291, %f151, %f286; // begin inline asm { cvt.f32.f16 %f152, %rs107;} // end inline asm fma.rn.ftz.f32 %f298, %f291, %f152, %f287; // begin inline asm { cvt.f32.f16 %f153, %rs108;} // end inline asm fma.rn.ftz.f32 %f299, %f291, %f153, %f288; cvt.rn.f32.s16 %f300, %rs18; sub.ftz.f32 %f301, %f300, %f10; mul.ftz.f32 %f302, %f105, %f301; mov.b32 {%rs109, %rs117}, %r84; // begin inline asm { cvt.f32.f16 %f154, %rs109;} // end inline asm fma.rn.ftz.f32 %f303, %f302, %f154, %f292; mov.b32 {%rs110, %rs118}, %r88; // begin inline asm { cvt.f32.f16 %f155, %rs110;} // end inline asm fma.rn.ftz.f32 %f304, %f302, %f155, %f293; mov.b32 {%rs111, %rs119}, %r94; // begin inline asm { cvt.f32.f16 %f156, %rs111;} // end inline asm fma.rn.ftz.f32 %f305, %f302, %f156, %f294; mov.b32 {%rs112, %rs120}, %r98; // begin inline asm { cvt.f32.f16 %f157, %rs112;} // end inline asm fma.rn.ftz.f32 %f306, %f302, %f157, %f295; mov.b32 {%rs113, %rs121}, %r102; // begin inline asm { cvt.f32.f16 %f158, %rs113;} // end inline asm fma.rn.ftz.f32 %f307, %f302, %f158, %f296; mov.b32 {%rs114, %rs122}, %r106; // begin inline asm { cvt.f32.f16 %f159, %rs114;} // end inline asm fma.rn.ftz.f32 %f308, %f302, %f159, %f297; mov.b32 {%rs115, %rs123}, %r110; // begin inline asm { cvt.f32.f16 %f160, %rs115;} // end inline asm fma.rn.ftz.f32 %f309, %f302, %f160, %f298; mov.b32 {%rs116, %rs124}, %r114; // begin inline asm { cvt.f32.f16 %f161, 
%rs116;} // end inline asm fma.rn.ftz.f32 %f310, %f302, %f161, %f299; cvt.rn.f32.s16 %f311, %rs19; sub.ftz.f32 %f312, %f311, %f10; mul.ftz.f32 %f313, %f105, %f312; // begin inline asm { cvt.f32.f16 %f162, %rs117;} // end inline asm fma.rn.ftz.f32 %f314, %f313, %f162, %f303; // begin inline asm { cvt.f32.f16 %f163, %rs118;} // end inline asm fma.rn.ftz.f32 %f315, %f313, %f163, %f304; // begin inline asm { cvt.f32.f16 %f164, %rs119;} // end inline asm fma.rn.ftz.f32 %f316, %f313, %f164, %f305; // begin inline asm { cvt.f32.f16 %f165, %rs120;} // end inline asm fma.rn.ftz.f32 %f317, %f313, %f165, %f306; // begin inline asm { cvt.f32.f16 %f166, %rs121;} // end inline asm fma.rn.ftz.f32 %f318, %f313, %f166, %f307; // begin inline asm { cvt.f32.f16 %f167, %rs122;} // end inline asm fma.rn.ftz.f32 %f319, %f313, %f167, %f308; // begin inline asm { cvt.f32.f16 %f168, %rs123;} // end inline asm fma.rn.ftz.f32 %f320, %f313, %f168, %f309; // begin inline asm { cvt.f32.f16 %f169, %rs124;} // end inline asm fma.rn.ftz.f32 %f321, %f313, %f169, %f310; ld.global.v4.u32 {%r129, %r130, %r131, %r132}, [%rd7+16]; ld.global.v4.u32 {%r137, %r138, %r139, %r140}, [%rd8+16]; ld.global.v4.u32 {%r145, %r146, %r147, %r148}, [%rd10]; ld.global.v4.u32 {%r153, %r154, %r155, %r156}, [%rd11]; ld.global.v4.u32 {%r161, %r162, %r163, %r164}, [%rd12]; ld.global.v4.u32 {%r169, %r170, %r171, %r172}, [%rd13]; ld.global.v4.u32 {%r177, %r178, %r179, %r180}, [%rd14]; add.s64 %rd58, %rd14, %rd9; ld.global.v4.u32 {%r185, %r186, %r187, %r188}, [%rd58]; cvt.rn.f32.s16 %f322, %rs21; sub.ftz.f32 %f323, %f322, %f10; mul.ftz.f32 %f324, %f105, %f323; mov.b32 {%rs125, %rs133}, %r129; // begin inline asm { cvt.f32.f16 %f170, %rs125;} // end inline asm fma.rn.ftz.f32 %f325, %f324, %f170, %f314; mov.b32 {%rs126, %rs134}, %r137; // begin inline asm { cvt.f32.f16 %f171, %rs126;} // end inline asm fma.rn.ftz.f32 %f326, %f324, %f171, %f315; mov.b32 {%rs127, %rs135}, %r145; // begin inline asm { cvt.f32.f16 %f172, %rs127;} // 
end inline asm fma.rn.ftz.f32 %f327, %f324, %f172, %f316; mov.b32 {%rs128, %rs136}, %r153; // begin inline asm { cvt.f32.f16 %f173, %rs128;} // end inline asm fma.rn.ftz.f32 %f328, %f324, %f173, %f317; mov.b32 {%rs129, %rs137}, %r161; // begin inline asm { cvt.f32.f16 %f174, %rs129;} // end inline asm fma.rn.ftz.f32 %f329, %f324, %f174, %f318; mov.b32 {%rs130, %rs138}, %r169; // begin inline asm { cvt.f32.f16 %f175, %rs130;} // end inline asm fma.rn.ftz.f32 %f330, %f324, %f175, %f319; mov.b32 {%rs131, %rs139}, %r177; // begin inline asm { cvt.f32.f16 %f176, %rs131;} // end inline asm fma.rn.ftz.f32 %f331, %f324, %f176, %f320; mov.b32 {%rs132, %rs140}, %r185; // begin inline asm { cvt.f32.f16 %f177, %rs132;} // end inline asm fma.rn.ftz.f32 %f332, %f324, %f177, %f321; cvt.rn.f32.s16 %f333, %rs23; sub.ftz.f32 %f334, %f333, %f10; mul.ftz.f32 %f335, %f105, %f334; // begin inline asm { cvt.f32.f16 %f178, %rs133;} // end inline asm fma.rn.ftz.f32 %f336, %f335, %f178, %f325; // begin inline asm { cvt.f32.f16 %f179, %rs134;} // end inline asm fma.rn.ftz.f32 %f337, %f335, %f179, %f326; // begin inline asm { cvt.f32.f16 %f180, %rs135;} // end inline asm fma.rn.ftz.f32 %f338, %f335, %f180, %f327; // begin inline asm { cvt.f32.f16 %f181, %rs136;} // end inline asm fma.rn.ftz.f32 %f339, %f335, %f181, %f328; // begin inline asm { cvt.f32.f16 %f182, %rs137;} // end inline asm fma.rn.ftz.f32 %f340, %f335, %f182, %f329; // begin inline asm { cvt.f32.f16 %f183, %rs138;} // end inline asm fma.rn.ftz.f32 %f341, %f335, %f183, %f330; // begin inline asm { cvt.f32.f16 %f184, %rs139;} // end inline asm fma.rn.ftz.f32 %f342, %f335, %f184, %f331; // begin inline asm { cvt.f32.f16 %f185, %rs140;} // end inline asm fma.rn.ftz.f32 %f343, %f335, %f185, %f332; cvt.rn.f32.s16 %f344, %rs25; sub.ftz.f32 %f345, %f344, %f10; mul.ftz.f32 %f346, %f105, %f345; mov.b32 {%rs141, %rs149}, %r130; // begin inline asm { cvt.f32.f16 %f186, %rs141;} // end inline asm fma.rn.ftz.f32 %f347, %f346, %f186, %f336; 
mov.b32 {%rs142, %rs150}, %r138; // begin inline asm { cvt.f32.f16 %f187, %rs142;} // end inline asm fma.rn.ftz.f32 %f348, %f346, %f187, %f337; mov.b32 {%rs143, %rs151}, %r146; // begin inline asm { cvt.f32.f16 %f188, %rs143;} // end inline asm fma.rn.ftz.f32 %f349, %f346, %f188, %f338; mov.b32 {%rs144, %rs152}, %r154; // begin inline asm { cvt.f32.f16 %f189, %rs144;} // end inline asm fma.rn.ftz.f32 %f350, %f346, %f189, %f339; mov.b32 {%rs145, %rs153}, %r162; // begin inline asm { cvt.f32.f16 %f190, %rs145;} // end inline asm fma.rn.ftz.f32 %f351, %f346, %f190, %f340; mov.b32 {%rs146, %rs154}, %r170; // begin inline asm { cvt.f32.f16 %f191, %rs146;} // end inline asm fma.rn.ftz.f32 %f352, %f346, %f191, %f341; mov.b32 {%rs147, %rs155}, %r178; // begin inline asm { cvt.f32.f16 %f192, %rs147;} // end inline asm fma.rn.ftz.f32 %f353, %f346, %f192, %f342; mov.b32 {%rs148, %rs156}, %r186; // begin inline asm { cvt.f32.f16 %f193, %rs148;} // end inline asm fma.rn.ftz.f32 %f354, %f346, %f193, %f343; cvt.rn.f32.s16 %f355, %rs27; sub.ftz.f32 %f356, %f355, %f10; mul.ftz.f32 %f357, %f105, %f356; // begin inline asm { cvt.f32.f16 %f194, %rs149;} // end inline asm fma.rn.ftz.f32 %f358, %f357, %f194, %f347; // begin inline asm { cvt.f32.f16 %f195, %rs150;} // end inline asm fma.rn.ftz.f32 %f359, %f357, %f195, %f348; // begin inline asm { cvt.f32.f16 %f196, %rs151;} // end inline asm fma.rn.ftz.f32 %f360, %f357, %f196, %f349; // begin inline asm { cvt.f32.f16 %f197, %rs152;} // end inline asm fma.rn.ftz.f32 %f361, %f357, %f197, %f350; // begin inline asm { cvt.f32.f16 %f198, %rs153;} // end inline asm fma.rn.ftz.f32 %f362, %f357, %f198, %f351; // begin inline asm { cvt.f32.f16 %f199, %rs154;} // end inline asm fma.rn.ftz.f32 %f363, %f357, %f199, %f352; // begin inline asm { cvt.f32.f16 %f200, %rs155;} // end inline asm fma.rn.ftz.f32 %f364, %f357, %f200, %f353; // begin inline asm { cvt.f32.f16 %f201, %rs156;} // end inline asm fma.rn.ftz.f32 %f365, %f357, %f201, %f354; 
cvt.rn.f32.s16 %f366, %rs29; sub.ftz.f32 %f367, %f366, %f10; mul.ftz.f32 %f368, %f105, %f367; mov.b32 {%rs157, %rs165}, %r131; // begin inline asm { cvt.f32.f16 %f202, %rs157;} // end inline asm fma.rn.ftz.f32 %f369, %f368, %f202, %f358; mov.b32 {%rs158, %rs166}, %r139; // begin inline asm { cvt.f32.f16 %f203, %rs158;} // end inline asm fma.rn.ftz.f32 %f370, %f368, %f203, %f359; mov.b32 {%rs159, %rs167}, %r147; // begin inline asm { cvt.f32.f16 %f204, %rs159;} // end inline asm fma.rn.ftz.f32 %f371, %f368, %f204, %f360; mov.b32 {%rs160, %rs168}, %r155; // begin inline asm { cvt.f32.f16 %f205, %rs160;} // end inline asm fma.rn.ftz.f32 %f372, %f368, %f205, %f361; mov.b32 {%rs161, %rs169}, %r163; // begin inline asm { cvt.f32.f16 %f206, %rs161;} // end inline asm fma.rn.ftz.f32 %f373, %f368, %f206, %f362; mov.b32 {%rs162, %rs170}, %r171; // begin inline asm { cvt.f32.f16 %f207, %rs162;} // end inline asm fma.rn.ftz.f32 %f374, %f368, %f207, %f363; mov.b32 {%rs163, %rs171}, %r179; // begin inline asm { cvt.f32.f16 %f208, %rs163;} // end inline asm fma.rn.ftz.f32 %f375, %f368, %f208, %f364; mov.b32 {%rs164, %rs172}, %r187; // begin inline asm { cvt.f32.f16 %f209, %rs164;} // end inline asm fma.rn.ftz.f32 %f376, %f368, %f209, %f365; cvt.rn.f32.s16 %f377, %rs31; sub.ftz.f32 %f378, %f377, %f10; mul.ftz.f32 %f379, %f105, %f378; // begin inline asm { cvt.f32.f16 %f210, %rs165;} // end inline asm fma.rn.ftz.f32 %f380, %f379, %f210, %f369; // begin inline asm { cvt.f32.f16 %f211, %rs166;} // end inline asm fma.rn.ftz.f32 %f381, %f379, %f211, %f370; // begin inline asm { cvt.f32.f16 %f212, %rs167;} // end inline asm fma.rn.ftz.f32 %f382, %f379, %f212, %f371; // begin inline asm { cvt.f32.f16 %f213, %rs168;} // end inline asm fma.rn.ftz.f32 %f383, %f379, %f213, %f372; // begin inline asm { cvt.f32.f16 %f214, %rs169;} // end inline asm fma.rn.ftz.f32 %f384, %f379, %f214, %f373; // begin inline asm { cvt.f32.f16 %f215, %rs170;} // end inline asm fma.rn.ftz.f32 %f385, %f379, %f215, 
%f374; // begin inline asm { cvt.f32.f16 %f216, %rs171;} // end inline asm fma.rn.ftz.f32 %f386, %f379, %f216, %f375; // begin inline asm { cvt.f32.f16 %f217, %rs172;} // end inline asm fma.rn.ftz.f32 %f387, %f379, %f217, %f376; cvt.rn.f32.s16 %f388, %rs33; sub.ftz.f32 %f389, %f388, %f10; mul.ftz.f32 %f390, %f105, %f389; mov.b32 {%rs173, %rs181}, %r132; // begin inline asm { cvt.f32.f16 %f218, %rs173;} // end inline asm fma.rn.ftz.f32 %f391, %f390, %f218, %f380; mov.b32 {%rs174, %rs182}, %r140; // begin inline asm { cvt.f32.f16 %f219, %rs174;} // end inline asm fma.rn.ftz.f32 %f392, %f390, %f219, %f381; mov.b32 {%rs175, %rs183}, %r148; // begin inline asm { cvt.f32.f16 %f220, %rs175;} // end inline asm fma.rn.ftz.f32 %f393, %f390, %f220, %f382; mov.b32 {%rs176, %rs184}, %r156; // begin inline asm { cvt.f32.f16 %f221, %rs176;} // end inline asm fma.rn.ftz.f32 %f394, %f390, %f221, %f383; mov.b32 {%rs177, %rs185}, %r164; // begin inline asm { cvt.f32.f16 %f222, %rs177;} // end inline asm fma.rn.ftz.f32 %f395, %f390, %f222, %f384; mov.b32 {%rs178, %rs186}, %r172; // begin inline asm { cvt.f32.f16 %f223, %rs178;} // end inline asm fma.rn.ftz.f32 %f396, %f390, %f223, %f385; mov.b32 {%rs179, %rs187}, %r180; // begin inline asm { cvt.f32.f16 %f224, %rs179;} // end inline asm fma.rn.ftz.f32 %f397, %f390, %f224, %f386; mov.b32 {%rs180, %rs188}, %r188; // begin inline asm { cvt.f32.f16 %f225, %rs180;} // end inline asm fma.rn.ftz.f32 %f398, %f390, %f225, %f387; cvt.rn.f32.s16 %f399, %rs34; sub.ftz.f32 %f400, %f399, %f10; mul.ftz.f32 %f401, %f105, %f400; // begin inline asm { cvt.f32.f16 %f226, %rs181;} // end inline asm fma.rn.ftz.f32 %f889, %f401, %f226, %f391; // begin inline asm { cvt.f32.f16 %f227, %rs182;} // end inline asm fma.rn.ftz.f32 %f888, %f401, %f227, %f392; // begin inline asm { cvt.f32.f16 %f228, %rs183;} // end inline asm fma.rn.ftz.f32 %f887, %f401, %f228, %f393; // begin inline asm { cvt.f32.f16 %f229, %rs184;} // end inline asm fma.rn.ftz.f32 %f886, %f401, 
%f229, %f394; // begin inline asm { cvt.f32.f16 %f230, %rs185;} // end inline asm fma.rn.ftz.f32 %f885, %f401, %f230, %f395; // begin inline asm { cvt.f32.f16 %f231, %rs186;} // end inline asm fma.rn.ftz.f32 %f884, %f401, %f231, %f396; // begin inline asm { cvt.f32.f16 %f232, %rs187;} // end inline asm fma.rn.ftz.f32 %f883, %f401, %f232, %f397; // begin inline asm { cvt.f32.f16 %f233, %rs188;} // end inline asm fma.rn.ftz.f32 %f882, %f401, %f233, %f398; bra.uni $L__BB0_8; $L__BB0_7: shl.b16 %rs317, %rs5, 4; cvt.s16.s8 %rs318, %rs317; shr.s16 %rs319, %rs318, 7; and.b16 %rs320, %rs319, -16; or.b16 %rs321, %rs320, %rs6; cvt.rn.f32.s16 %f530, %rs321; sub.ftz.f32 %f531, %f530, %f10; mul.ftz.f32 %f532, %f105, %f531; mov.b32 {%rs189, %rs197}, %r81; // begin inline asm { cvt.f32.f16 %f402, %rs189;} // end inline asm fma.rn.ftz.f32 %f533, %f532, %f402, %f889; mov.b32 {%rs190, %rs198}, %r85; // begin inline asm { cvt.f32.f16 %f403, %rs190;} // end inline asm fma.rn.ftz.f32 %f534, %f532, %f403, %f888; mov.b32 {%rs191, %rs199}, %r91; // begin inline asm { cvt.f32.f16 %f404, %rs191;} // end inline asm fma.rn.ftz.f32 %f535, %f532, %f404, %f887; mov.b32 {%rs192, %rs200}, %r95; // begin inline asm { cvt.f32.f16 %f405, %rs192;} // end inline asm fma.rn.ftz.f32 %f536, %f532, %f405, %f886; mov.b32 {%rs193, %rs201}, %r99; // begin inline asm { cvt.f32.f16 %f406, %rs193;} // end inline asm fma.rn.ftz.f32 %f537, %f532, %f406, %f885; mov.b32 {%rs194, %rs202}, %r103; // begin inline asm { cvt.f32.f16 %f407, %rs194;} // end inline asm fma.rn.ftz.f32 %f538, %f532, %f407, %f884; mov.b32 {%rs195, %rs203}, %r107; // begin inline asm { cvt.f32.f16 %f408, %rs195;} // end inline asm fma.rn.ftz.f32 %f539, %f532, %f408, %f883; mov.b32 {%rs196, %rs204}, %r111; // begin inline asm { cvt.f32.f16 %f409, %rs196;} // end inline asm fma.rn.ftz.f32 %f540, %f532, %f409, %f882; shl.b16 %rs322, %rs7, 4; cvt.s16.s8 %rs323, %rs322; shr.s16 %rs324, %rs323, 7; and.b16 %rs325, %rs324, -16; or.b16 %rs326, %rs325, 
%rs8; cvt.rn.f32.s16 %f541, %rs326; sub.ftz.f32 %f542, %f541, %f10; mul.ftz.f32 %f543, %f105, %f542; // begin inline asm { cvt.f32.f16 %f410, %rs197;} // end inline asm fma.rn.ftz.f32 %f544, %f543, %f410, %f533; // begin inline asm { cvt.f32.f16 %f411, %rs198;} // end inline asm fma.rn.ftz.f32 %f545, %f543, %f411, %f534; // begin inline asm { cvt.f32.f16 %f412, %rs199;} // end inline asm fma.rn.ftz.f32 %f546, %f543, %f412, %f535; // begin inline asm { cvt.f32.f16 %f413, %rs200;} // end inline asm fma.rn.ftz.f32 %f547, %f543, %f413, %f536; // begin inline asm { cvt.f32.f16 %f414, %rs201;} // end inline asm fma.rn.ftz.f32 %f548, %f543, %f414, %f537; // begin inline asm { cvt.f32.f16 %f415, %rs202;} // end inline asm fma.rn.ftz.f32 %f549, %f543, %f415, %f538; // begin inline asm { cvt.f32.f16 %f416, %rs203;} // end inline asm fma.rn.ftz.f32 %f550, %f543, %f416, %f539; // begin inline asm { cvt.f32.f16 %f417, %rs204;} // end inline asm fma.rn.ftz.f32 %f551, %f543, %f417, %f540; shl.b16 %rs327, %rs9, 4; cvt.s16.s8 %rs328, %rs327; shr.s16 %rs329, %rs328, 7; and.b16 %rs330, %rs329, -16; or.b16 %rs331, %rs330, %rs10; cvt.rn.f32.s16 %f552, %rs331; sub.ftz.f32 %f553, %f552, %f10; mul.ftz.f32 %f554, %f105, %f553; mov.b32 {%rs205, %rs213}, %r82; // begin inline asm { cvt.f32.f16 %f418, %rs205;} // end inline asm fma.rn.ftz.f32 %f555, %f554, %f418, %f544; mov.b32 {%rs206, %rs214}, %r86; // begin inline asm { cvt.f32.f16 %f419, %rs206;} // end inline asm fma.rn.ftz.f32 %f556, %f554, %f419, %f545; mov.b32 {%rs207, %rs215}, %r92; // begin inline asm { cvt.f32.f16 %f420, %rs207;} // end inline asm fma.rn.ftz.f32 %f557, %f554, %f420, %f546; mov.b32 {%rs208, %rs216}, %r96; // begin inline asm { cvt.f32.f16 %f421, %rs208;} // end inline asm fma.rn.ftz.f32 %f558, %f554, %f421, %f547; mov.b32 {%rs209, %rs217}, %r100; // begin inline asm { cvt.f32.f16 %f422, %rs209;} // end inline asm fma.rn.ftz.f32 %f559, %f554, %f422, %f548; mov.b32 {%rs210, %rs218}, %r104; // begin inline asm { 
cvt.f32.f16 %f423, %rs210;} // end inline asm fma.rn.ftz.f32 %f560, %f554, %f423, %f549; mov.b32 {%rs211, %rs219}, %r108; // begin inline asm { cvt.f32.f16 %f424, %rs211;} // end inline asm fma.rn.ftz.f32 %f561, %f554, %f424, %f550; mov.b32 {%rs212, %rs220}, %r112; // begin inline asm { cvt.f32.f16 %f425, %rs212;} // end inline asm fma.rn.ftz.f32 %f562, %f554, %f425, %f551; shl.b16 %rs332, %rs11, 4; cvt.s16.s8 %rs333, %rs332; shr.s16 %rs334, %rs333, 7; and.b16 %rs335, %rs334, -16; or.b16 %rs336, %rs335, %rs12; cvt.rn.f32.s16 %f563, %rs336; sub.ftz.f32 %f564, %f563, %f10; mul.ftz.f32 %f565, %f105, %f564; // begin inline asm { cvt.f32.f16 %f426, %rs213;} // end inline asm fma.rn.ftz.f32 %f566, %f565, %f426, %f555; // begin inline asm { cvt.f32.f16 %f427, %rs214;} // end inline asm fma.rn.ftz.f32 %f567, %f565, %f427, %f556; // begin inline asm { cvt.f32.f16 %f428, %rs215;} // end inline asm fma.rn.ftz.f32 %f568, %f565, %f428, %f557; // begin inline asm { cvt.f32.f16 %f429, %rs216;} // end inline asm fma.rn.ftz.f32 %f569, %f565, %f429, %f558; // begin inline asm { cvt.f32.f16 %f430, %rs217;} // end inline asm fma.rn.ftz.f32 %f570, %f565, %f430, %f559; // begin inline asm { cvt.f32.f16 %f431, %rs218;} // end inline asm fma.rn.ftz.f32 %f571, %f565, %f431, %f560; // begin inline asm { cvt.f32.f16 %f432, %rs219;} // end inline asm fma.rn.ftz.f32 %f572, %f565, %f432, %f561; // begin inline asm { cvt.f32.f16 %f433, %rs220;} // end inline asm fma.rn.ftz.f32 %f573, %f565, %f433, %f562; shl.b16 %rs337, %rs13, 4; cvt.s16.s8 %rs338, %rs337; shr.s16 %rs339, %rs338, 7; and.b16 %rs340, %rs339, -16; or.b16 %rs341, %rs340, %rs14; cvt.rn.f32.s16 %f574, %rs341; sub.ftz.f32 %f575, %f574, %f10; mul.ftz.f32 %f576, %f105, %f575; mov.b32 {%rs221, %rs229}, %r83; // begin inline asm { cvt.f32.f16 %f434, %rs221;} // end inline asm fma.rn.ftz.f32 %f577, %f576, %f434, %f566; mov.b32 {%rs222, %rs230}, %r87; // begin inline asm { cvt.f32.f16 %f435, %rs222;} // end inline asm fma.rn.ftz.f32 %f578, 
%f576, %f435, %f567; mov.b32 {%rs223, %rs231}, %r93; // begin inline asm { cvt.f32.f16 %f436, %rs223;} // end inline asm fma.rn.ftz.f32 %f579, %f576, %f436, %f568; mov.b32 {%rs224, %rs232}, %r97; // begin inline asm { cvt.f32.f16 %f437, %rs224;} // end inline asm fma.rn.ftz.f32 %f580, %f576, %f437, %f569; mov.b32 {%rs225, %rs233}, %r101; // begin inline asm { cvt.f32.f16 %f438, %rs225;} // end inline asm fma.rn.ftz.f32 %f581, %f576, %f438, %f570; mov.b32 {%rs226, %rs234}, %r105; // begin inline asm { cvt.f32.f16 %f439, %rs226;} // end inline asm fma.rn.ftz.f32 %f582, %f576, %f439, %f571; mov.b32 {%rs227, %rs235}, %r109; // begin inline asm { cvt.f32.f16 %f440, %rs227;} // end inline asm fma.rn.ftz.f32 %f583, %f576, %f440, %f572; mov.b32 {%rs228, %rs236}, %r113; // begin inline asm { cvt.f32.f16 %f441, %rs228;} // end inline asm fma.rn.ftz.f32 %f584, %f576, %f441, %f573; shl.b16 %rs342, %rs15, 4; cvt.s16.s8 %rs343, %rs342; shr.s16 %rs344, %rs343, 7; and.b16 %rs345, %rs344, -16; or.b16 %rs346, %rs345, %rs16; cvt.rn.f32.s16 %f585, %rs346; sub.ftz.f32 %f586, %f585, %f10; mul.ftz.f32 %f587, %f105, %f586; // begin inline asm { cvt.f32.f16 %f442, %rs229;} // end inline asm fma.rn.ftz.f32 %f588, %f587, %f442, %f577; // begin inline asm { cvt.f32.f16 %f443, %rs230;} // end inline asm fma.rn.ftz.f32 %f589, %f587, %f443, %f578; // begin inline asm { cvt.f32.f16 %f444, %rs231;} // end inline asm fma.rn.ftz.f32 %f590, %f587, %f444, %f579; // begin inline asm { cvt.f32.f16 %f445, %rs232;} // end inline asm fma.rn.ftz.f32 %f591, %f587, %f445, %f580; // begin inline asm { cvt.f32.f16 %f446, %rs233;} // end inline asm fma.rn.ftz.f32 %f592, %f587, %f446, %f581; // begin inline asm { cvt.f32.f16 %f447, %rs234;} // end inline asm fma.rn.ftz.f32 %f593, %f587, %f447, %f582; // begin inline asm { cvt.f32.f16 %f448, %rs235;} // end inline asm fma.rn.ftz.f32 %f594, %f587, %f448, %f583; // begin inline asm { cvt.f32.f16 %f449, %rs236;} // end inline asm fma.rn.ftz.f32 %f595, %f587, %f449, 
%f584; shl.b16 %rs347, %rs17, 4; cvt.s16.s8 %rs348, %rs347; shr.s16 %rs349, %rs348, 7; and.b16 %rs350, %rs349, -16; or.b16 %rs351, %rs350, %rs18; cvt.rn.f32.s16 %f596, %rs351; sub.ftz.f32 %f597, %f596, %f10; mul.ftz.f32 %f598, %f105, %f597; mov.b32 {%rs237, %rs245}, %r84; // begin inline asm { cvt.f32.f16 %f450, %rs237;} // end inline asm fma.rn.ftz.f32 %f599, %f598, %f450, %f588; mov.b32 {%rs238, %rs246}, %r88; // begin inline asm { cvt.f32.f16 %f451, %rs238;} // end inline asm fma.rn.ftz.f32 %f600, %f598, %f451, %f589; mov.b32 {%rs239, %rs247}, %r94; // begin inline asm { cvt.f32.f16 %f452, %rs239;} // end inline asm fma.rn.ftz.f32 %f601, %f598, %f452, %f590; mov.b32 {%rs240, %rs248}, %r98; // begin inline asm { cvt.f32.f16 %f453, %rs240;} // end inline asm fma.rn.ftz.f32 %f602, %f598, %f453, %f591; mov.b32 {%rs241, %rs249}, %r102; // begin inline asm { cvt.f32.f16 %f454, %rs241;} // end inline asm fma.rn.ftz.f32 %f603, %f598, %f454, %f592; mov.b32 {%rs242, %rs250}, %r106; // begin inline asm { cvt.f32.f16 %f455, %rs242;} // end inline asm fma.rn.ftz.f32 %f604, %f598, %f455, %f593; mov.b32 {%rs243, %rs251}, %r110; // begin inline asm { cvt.f32.f16 %f456, %rs243;} // end inline asm fma.rn.ftz.f32 %f605, %f598, %f456, %f594; mov.b32 {%rs244, %rs252}, %r114; // begin inline asm { cvt.f32.f16 %f457, %rs244;} // end inline asm fma.rn.ftz.f32 %f606, %f598, %f457, %f595; shl.b16 %rs352, %rs19, 4; cvt.s16.s8 %rs353, %rs352; shr.s16 %rs354, %rs353, 7; and.b16 %rs355, %rs354, -16; or.b16 %rs356, %rs355, %rs19; cvt.rn.f32.s16 %f607, %rs356; sub.ftz.f32 %f608, %f607, %f10; mul.ftz.f32 %f609, %f105, %f608; // begin inline asm { cvt.f32.f16 %f458, %rs245;} // end inline asm fma.rn.ftz.f32 %f610, %f609, %f458, %f599; // begin inline asm { cvt.f32.f16 %f459, %rs246;} // end inline asm fma.rn.ftz.f32 %f611, %f609, %f459, %f600; // begin inline asm { cvt.f32.f16 %f460, %rs247;} // end inline asm fma.rn.ftz.f32 %f612, %f609, %f460, %f601; // begin inline asm { cvt.f32.f16 %f461, 
%rs248;} // end inline asm fma.rn.ftz.f32 %f613, %f609, %f461, %f602; // begin inline asm { cvt.f32.f16 %f462, %rs249;} // end inline asm fma.rn.ftz.f32 %f614, %f609, %f462, %f603; // begin inline asm { cvt.f32.f16 %f463, %rs250;} // end inline asm fma.rn.ftz.f32 %f615, %f609, %f463, %f604; // begin inline asm { cvt.f32.f16 %f464, %rs251;} // end inline asm fma.rn.ftz.f32 %f616, %f609, %f464, %f605; // begin inline asm { cvt.f32.f16 %f465, %rs252;} // end inline asm fma.rn.ftz.f32 %f617, %f609, %f465, %f606; ld.global.v4.u32 {%r193, %r194, %r195, %r196}, [%rd7+16]; ld.global.v4.u32 {%r201, %r202, %r203, %r204}, [%rd8+16]; ld.global.v4.u32 {%r209, %r210, %r211, %r212}, [%rd10]; ld.global.v4.u32 {%r217, %r218, %r219, %r220}, [%rd11]; ld.global.v4.u32 {%r225, %r226, %r227, %r228}, [%rd12]; ld.global.v4.u32 {%r233, %r234, %r235, %r236}, [%rd13]; ld.global.v4.u32 {%r241, %r242, %r243, %r244}, [%rd14]; add.s64 %rd62, %rd14, %rd9; ld.global.v4.u32 {%r249, %r250, %r251, %r252}, [%rd62]; shl.b16 %rs357, %rs20, 4; cvt.s16.s8 %rs358, %rs357; shr.s16 %rs359, %rs358, 7; and.b16 %rs360, %rs359, -16; or.b16 %rs361, %rs360, %rs21; cvt.rn.f32.s16 %f618, %rs361; sub.ftz.f32 %f619, %f618, %f10; mul.ftz.f32 %f620, %f105, %f619; mov.b32 {%rs253, %rs261}, %r193; // begin inline asm { cvt.f32.f16 %f466, %rs253;} // end inline asm fma.rn.ftz.f32 %f621, %f620, %f466, %f610; mov.b32 {%rs254, %rs262}, %r201; // begin inline asm { cvt.f32.f16 %f467, %rs254;} // end inline asm fma.rn.ftz.f32 %f622, %f620, %f467, %f611; mov.b32 {%rs255, %rs263}, %r209; // begin inline asm { cvt.f32.f16 %f468, %rs255;} // end inline asm fma.rn.ftz.f32 %f623, %f620, %f468, %f612; mov.b32 {%rs256, %rs264}, %r217; // begin inline asm { cvt.f32.f16 %f469, %rs256;} // end inline asm fma.rn.ftz.f32 %f624, %f620, %f469, %f613; mov.b32 {%rs257, %rs265}, %r225; // begin inline asm { cvt.f32.f16 %f470, %rs257;} // end inline asm fma.rn.ftz.f32 %f625, %f620, %f470, %f614; mov.b32 {%rs258, %rs266}, %r233; // begin inline 
asm { cvt.f32.f16 %f471, %rs258;} // end inline asm fma.rn.ftz.f32 %f626, %f620, %f471, %f615; mov.b32 {%rs259, %rs267}, %r241; // begin inline asm { cvt.f32.f16 %f472, %rs259;} // end inline asm fma.rn.ftz.f32 %f627, %f620, %f472, %f616; mov.b32 {%rs260, %rs268}, %r249; // begin inline asm { cvt.f32.f16 %f473, %rs260;} // end inline asm fma.rn.ftz.f32 %f628, %f620, %f473, %f617; shl.b16 %rs362, %rs22, 4; cvt.s16.s8 %rs363, %rs362; shr.s16 %rs364, %rs363, 7; and.b16 %rs365, %rs364, -16; or.b16 %rs366, %rs365, %rs23; cvt.rn.f32.s16 %f629, %rs366; sub.ftz.f32 %f630, %f629, %f10; mul.ftz.f32 %f631, %f105, %f630; // begin inline asm { cvt.f32.f16 %f474, %rs261;} // end inline asm fma.rn.ftz.f32 %f632, %f631, %f474, %f621; // begin inline asm { cvt.f32.f16 %f475, %rs262;} // end inline asm fma.rn.ftz.f32 %f633, %f631, %f475, %f622; // begin inline asm { cvt.f32.f16 %f476, %rs263;} // end inline asm fma.rn.ftz.f32 %f634, %f631, %f476, %f623; // begin inline asm { cvt.f32.f16 %f477, %rs264;} // end inline asm fma.rn.ftz.f32 %f635, %f631, %f477, %f624; // begin inline asm { cvt.f32.f16 %f478, %rs265;} // end inline asm fma.rn.ftz.f32 %f636, %f631, %f478, %f625; // begin inline asm { cvt.f32.f16 %f479, %rs266;} // end inline asm fma.rn.ftz.f32 %f637, %f631, %f479, %f626; // begin inline asm { cvt.f32.f16 %f480, %rs267;} // end inline asm fma.rn.ftz.f32 %f638, %f631, %f480, %f627; // begin inline asm { cvt.f32.f16 %f481, %rs268;} // end inline asm fma.rn.ftz.f32 %f639, %f631, %f481, %f628; shl.b16 %rs367, %rs24, 4; cvt.s16.s8 %rs368, %rs367; shr.s16 %rs369, %rs368, 7; and.b16 %rs370, %rs369, -16; or.b16 %rs371, %rs370, %rs25; cvt.rn.f32.s16 %f640, %rs371; sub.ftz.f32 %f641, %f640, %f10; mul.ftz.f32 %f642, %f105, %f641; mov.b32 {%rs269, %rs277}, %r194; // begin inline asm { cvt.f32.f16 %f482, %rs269;} // end inline asm fma.rn.ftz.f32 %f643, %f642, %f482, %f632; mov.b32 {%rs270, %rs278}, %r202; // begin inline asm { cvt.f32.f16 %f483, %rs270;} // end inline asm fma.rn.ftz.f32 
%f644, %f642, %f483, %f633; mov.b32 {%rs271, %rs279}, %r210; // begin inline asm { cvt.f32.f16 %f484, %rs271;} // end inline asm fma.rn.ftz.f32 %f645, %f642, %f484, %f634; mov.b32 {%rs272, %rs280}, %r218; // begin inline asm { cvt.f32.f16 %f485, %rs272;} // end inline asm fma.rn.ftz.f32 %f646, %f642, %f485, %f635; mov.b32 {%rs273, %rs281}, %r226; // begin inline asm { cvt.f32.f16 %f486, %rs273;} // end inline asm fma.rn.ftz.f32 %f647, %f642, %f486, %f636; mov.b32 {%rs274, %rs282}, %r234; // begin inline asm { cvt.f32.f16 %f487, %rs274;} // end inline asm fma.rn.ftz.f32 %f648, %f642, %f487, %f637; mov.b32 {%rs275, %rs283}, %r242; // begin inline asm { cvt.f32.f16 %f488, %rs275;} // end inline asm fma.rn.ftz.f32 %f649, %f642, %f488, %f638; mov.b32 {%rs276, %rs284}, %r250; // begin inline asm { cvt.f32.f16 %f489, %rs276;} // end inline asm fma.rn.ftz.f32 %f650, %f642, %f489, %f639; shl.b16 %rs372, %rs26, 4; cvt.s16.s8 %rs373, %rs372; shr.s16 %rs374, %rs373, 7; and.b16 %rs375, %rs374, -16; or.b16 %rs376, %rs375, %rs27; cvt.rn.f32.s16 %f651, %rs376; sub.ftz.f32 %f652, %f651, %f10; mul.ftz.f32 %f653, %f105, %f652; // begin inline asm { cvt.f32.f16 %f490, %rs277;} // end inline asm fma.rn.ftz.f32 %f654, %f653, %f490, %f643; // begin inline asm { cvt.f32.f16 %f491, %rs278;} // end inline asm fma.rn.ftz.f32 %f655, %f653, %f491, %f644; // begin inline asm { cvt.f32.f16 %f492, %rs279;} // end inline asm fma.rn.ftz.f32 %f656, %f653, %f492, %f645; // begin inline asm { cvt.f32.f16 %f493, %rs280;} // end inline asm fma.rn.ftz.f32 %f657, %f653, %f493, %f646; // begin inline asm { cvt.f32.f16 %f494, %rs281;} // end inline asm fma.rn.ftz.f32 %f658, %f653, %f494, %f647; // begin inline asm { cvt.f32.f16 %f495, %rs282;} // end inline asm fma.rn.ftz.f32 %f659, %f653, %f495, %f648; // begin inline asm { cvt.f32.f16 %f496, %rs283;} // end inline asm fma.rn.ftz.f32 %f660, %f653, %f496, %f649; // begin inline asm { cvt.f32.f16 %f497, %rs284;} // end inline asm fma.rn.ftz.f32 %f661, %f653, 
%f497, %f650; shl.b16 %rs377, %rs28, 4; cvt.s16.s8 %rs378, %rs377; shr.s16 %rs379, %rs378, 7; and.b16 %rs380, %rs379, -16; or.b16 %rs381, %rs380, %rs29; cvt.rn.f32.s16 %f662, %rs381; sub.ftz.f32 %f663, %f662, %f10; mul.ftz.f32 %f664, %f105, %f663; mov.b32 {%rs285, %rs293}, %r195; // begin inline asm { cvt.f32.f16 %f498, %rs285;} // end inline asm fma.rn.ftz.f32 %f665, %f664, %f498, %f654; mov.b32 {%rs286, %rs294}, %r203; // begin inline asm { cvt.f32.f16 %f499, %rs286;} // end inline asm fma.rn.ftz.f32 %f666, %f664, %f499, %f655; mov.b32 {%rs287, %rs295}, %r211; // begin inline asm { cvt.f32.f16 %f500, %rs287;} // end inline asm fma.rn.ftz.f32 %f667, %f664, %f500, %f656; mov.b32 {%rs288, %rs296}, %r219; // begin inline asm { cvt.f32.f16 %f501, %rs288;} // end inline asm fma.rn.ftz.f32 %f668, %f664, %f501, %f657; mov.b32 {%rs289, %rs297}, %r227; // begin inline asm { cvt.f32.f16 %f502, %rs289;} // end inline asm fma.rn.ftz.f32 %f669, %f664, %f502, %f658; mov.b32 {%rs290, %rs298}, %r235; // begin inline asm { cvt.f32.f16 %f503, %rs290;} // end inline asm fma.rn.ftz.f32 %f670, %f664, %f503, %f659; mov.b32 {%rs291, %rs299}, %r243; // begin inline asm { cvt.f32.f16 %f504, %rs291;} // end inline asm fma.rn.ftz.f32 %f671, %f664, %f504, %f660; mov.b32 {%rs292, %rs300}, %r251; // begin inline asm { cvt.f32.f16 %f505, %rs292;} // end inline asm fma.rn.ftz.f32 %f672, %f664, %f505, %f661; shl.b16 %rs382, %rs30, 4; cvt.s16.s8 %rs383, %rs382; shr.s16 %rs384, %rs383, 7; and.b16 %rs385, %rs384, -16; or.b16 %rs386, %rs385, %rs31; cvt.rn.f32.s16 %f673, %rs386; sub.ftz.f32 %f674, %f673, %f10; mul.ftz.f32 %f675, %f105, %f674; // begin inline asm { cvt.f32.f16 %f506, %rs293;} // end inline asm fma.rn.ftz.f32 %f676, %f675, %f506, %f665; // begin inline asm { cvt.f32.f16 %f507, %rs294;} // end inline asm fma.rn.ftz.f32 %f677, %f675, %f507, %f666; // begin inline asm { cvt.f32.f16 %f508, %rs295;} // end inline asm fma.rn.ftz.f32 %f678, %f675, %f508, %f667; // begin inline asm { 
cvt.f32.f16 %f509, %rs296;} // end inline asm fma.rn.ftz.f32 %f679, %f675, %f509, %f668; // begin inline asm { cvt.f32.f16 %f510, %rs297;} // end inline asm fma.rn.ftz.f32 %f680, %f675, %f510, %f669; // begin inline asm { cvt.f32.f16 %f511, %rs298;} // end inline asm fma.rn.ftz.f32 %f681, %f675, %f511, %f670; // begin inline asm { cvt.f32.f16 %f512, %rs299;} // end inline asm fma.rn.ftz.f32 %f682, %f675, %f512, %f671; // begin inline asm { cvt.f32.f16 %f513, %rs300;} // end inline asm fma.rn.ftz.f32 %f683, %f675, %f513, %f672; shl.b16 %rs387, %rs32, 4; cvt.s16.s8 %rs388, %rs387; shr.s16 %rs389, %rs388, 7; and.b16 %rs390, %rs389, -16; or.b16 %rs391, %rs390, %rs33; cvt.rn.f32.s16 %f684, %rs391; sub.ftz.f32 %f685, %f684, %f10; mul.ftz.f32 %f686, %f105, %f685; mov.b32 {%rs301, %rs309}, %r196; // begin inline asm { cvt.f32.f16 %f514, %rs301;} // end inline asm fma.rn.ftz.f32 %f687, %f686, %f514, %f676; mov.b32 {%rs302, %rs310}, %r204; // begin inline asm { cvt.f32.f16 %f515, %rs302;} // end inline asm fma.rn.ftz.f32 %f688, %f686, %f515, %f677; mov.b32 {%rs303, %rs311}, %r212; // begin inline asm { cvt.f32.f16 %f516, %rs303;} // end inline asm fma.rn.ftz.f32 %f689, %f686, %f516, %f678; mov.b32 {%rs304, %rs312}, %r220; // begin inline asm { cvt.f32.f16 %f517, %rs304;} // end inline asm fma.rn.ftz.f32 %f690, %f686, %f517, %f679; mov.b32 {%rs305, %rs313}, %r228; // begin inline asm { cvt.f32.f16 %f518, %rs305;} // end inline asm fma.rn.ftz.f32 %f691, %f686, %f518, %f680; mov.b32 {%rs306, %rs314}, %r236; // begin inline asm { cvt.f32.f16 %f519, %rs306;} // end inline asm fma.rn.ftz.f32 %f692, %f686, %f519, %f681; mov.b32 {%rs307, %rs315}, %r244; // begin inline asm { cvt.f32.f16 %f520, %rs307;} // end inline asm fma.rn.ftz.f32 %f693, %f686, %f520, %f682; mov.b32 {%rs308, %rs316}, %r252; // begin inline asm { cvt.f32.f16 %f521, %rs308;} // end inline asm fma.rn.ftz.f32 %f694, %f686, %f521, %f683; shl.b16 %rs392, %rs34, 4; cvt.s16.s8 %rs393, %rs392; shr.s16 %rs394, %rs393, 7; 
and.b16 %rs395, %rs394, -16; or.b16 %rs396, %rs395, %rs34; cvt.rn.f32.s16 %f695, %rs396; sub.ftz.f32 %f696, %f695, %f10; mul.ftz.f32 %f697, %f105, %f696; // begin inline asm { cvt.f32.f16 %f522, %rs309;} // end inline asm fma.rn.ftz.f32 %f889, %f697, %f522, %f687; // begin inline asm { cvt.f32.f16 %f523, %rs310;} // end inline asm fma.rn.ftz.f32 %f888, %f697, %f523, %f688; // begin inline asm { cvt.f32.f16 %f524, %rs311;} // end inline asm fma.rn.ftz.f32 %f887, %f697, %f524, %f689; // begin inline asm { cvt.f32.f16 %f525, %rs312;} // end inline asm fma.rn.ftz.f32 %f886, %f697, %f525, %f690; // begin inline asm { cvt.f32.f16 %f526, %rs313;} // end inline asm fma.rn.ftz.f32 %f885, %f697, %f526, %f691; // begin inline asm { cvt.f32.f16 %f527, %rs314;} // end inline asm fma.rn.ftz.f32 %f884, %f697, %f527, %f692; // begin inline asm { cvt.f32.f16 %f528, %rs315;} // end inline asm fma.rn.ftz.f32 %f883, %f697, %f528, %f693; // begin inline asm { cvt.f32.f16 %f529, %rs316;} // end inline asm fma.rn.ftz.f32 %f882, %f697, %f529, %f694; $L__BB0_8: add.s32 %r481, %r481, 4; shl.b32 %r257, %r481, 5; add.s32 %r480, %r257, %r64; shl.b32 %r479, %r480, 1; setp.lt.u32 %p7, %r479, %r61; @%p7 bra $L__BB0_2; $L__BB0_9: mov.u32 %r259, %tid.y; shl.b32 %r260, %r259, 5; add.s32 %r50, %r260, %r64; setp.lt.u32 %p8, %r50, 32; shl.b32 %r262, %r50, 2; mov.u32 %r263, _ZZ9gemv_int4ILi4ELi64ELi8EEvP6__halfPKS0_S3_PKjPKhS3_iiiiffbE12temp_storage; add.s32 %r264, %r263, %r262; @%p8 bra $L__BB0_11; add.s32 %r471, %r264, -112; st.shared.f32 [%r471], %f889; $L__BB0_11: setp.gt.u32 %p9, %r50, 31; bar.sync 0; mad.lo.s32 %r52, %r50, 12, %r263; @%p9 bra $L__BB0_13; mov.u32 %r283, 16; ld.shared.f32 %f713, [%r52+16]; add.ftz.f32 %f714, %f889, %f713; ld.shared.f32 %f715, [%r52+20]; add.ftz.f32 %f716, %f714, %f715; ld.shared.f32 %f717, [%r52+24]; add.ftz.f32 %f700, %f716, %f717; mov.u32 %r271, 1; mov.u32 %r284, 31; mov.u32 %r285, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, 
%f700, %r271, %r284, %r285; @p add.f32 r0, r0, %f700; mov.f32 %f698, r0;} // end inline asm mov.u32 %r274, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f698, %r274, %r284, %r285; @p add.f32 r0, r0, %f698; mov.f32 %f701, r0;} // end inline asm mov.u32 %r277, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f701, %r277, %r284, %r285; @p add.f32 r0, r0, %f701; mov.f32 %f704, r0;} // end inline asm mov.u32 %r280, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f704, %r280, %r284, %r285; @p add.f32 r0, r0, %f704; mov.f32 %f707, r0;} // end inline asm // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f707, %r283, %r284, %r285; @p add.f32 r0, r0, %f707; mov.f32 %f889, r0;} // end inline asm $L__BB0_13: @%p8 bra $L__BB0_15; add.s32 %r472, %r264, -112; st.shared.f32 [%r472+640], %f888; $L__BB0_15: bar.sync 0; @%p9 bra $L__BB0_17; ld.shared.f32 %f733, [%r52+656]; add.ftz.f32 %f734, %f888, %f733; ld.shared.f32 %f735, [%r52+660]; add.ftz.f32 %f736, %f734, %f735; ld.shared.f32 %f737, [%r52+664]; add.ftz.f32 %f720, %f736, %f737; mov.u32 %r295, 1; mov.u32 %r308, 31; mov.u32 %r309, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f720, %r295, %r308, %r309; @p add.f32 r0, r0, %f720; mov.f32 %f718, r0;} // end inline asm mov.u32 %r298, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f718, %r298, %r308, %r309; @p add.f32 r0, r0, %f718; mov.f32 %f721, r0;} // end inline asm mov.u32 %r301, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f721, %r301, %r308, %r309; @p add.f32 r0, r0, %f721; mov.f32 %f724, r0;} // end inline asm mov.u32 %r304, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f724, %r304, %r308, %r309; @p add.f32 r0, r0, %f724; mov.f32 %f727, r0;} // end inline asm mov.u32 %r307, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; 
shfl.sync.down.b32 r0|p, %f727, %r307, %r308, %r309; @p add.f32 r0, r0, %f727; mov.f32 %f888, r0;} // end inline asm $L__BB0_17: @%p8 bra $L__BB0_19; add.s32 %r473, %r264, -112; st.shared.f32 [%r473+1280], %f887; $L__BB0_19: bar.sync 0; @%p9 bra $L__BB0_21; ld.shared.f32 %f753, [%r52+1296]; add.ftz.f32 %f754, %f887, %f753; ld.shared.f32 %f755, [%r52+1300]; add.ftz.f32 %f756, %f754, %f755; ld.shared.f32 %f757, [%r52+1304]; add.ftz.f32 %f740, %f756, %f757; mov.u32 %r319, 1; mov.u32 %r332, 31; mov.u32 %r333, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f740, %r319, %r332, %r333; @p add.f32 r0, r0, %f740; mov.f32 %f738, r0;} // end inline asm mov.u32 %r322, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f738, %r322, %r332, %r333; @p add.f32 r0, r0, %f738; mov.f32 %f741, r0;} // end inline asm mov.u32 %r325, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f741, %r325, %r332, %r333; @p add.f32 r0, r0, %f741; mov.f32 %f744, r0;} // end inline asm mov.u32 %r328, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f744, %r328, %r332, %r333; @p add.f32 r0, r0, %f744; mov.f32 %f747, r0;} // end inline asm mov.u32 %r331, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f747, %r331, %r332, %r333; @p add.f32 r0, r0, %f747; mov.f32 %f887, r0;} // end inline asm $L__BB0_21: @%p8 bra $L__BB0_23; add.s32 %r474, %r264, -112; st.shared.f32 [%r474+1920], %f886; $L__BB0_23: bar.sync 0; @%p9 bra $L__BB0_25; ld.shared.f32 %f773, [%r52+1936]; add.ftz.f32 %f774, %f886, %f773; ld.shared.f32 %f775, [%r52+1940]; add.ftz.f32 %f776, %f774, %f775; ld.shared.f32 %f777, [%r52+1944]; add.ftz.f32 %f760, %f776, %f777; mov.u32 %r343, 1; mov.u32 %r356, 31; mov.u32 %r357, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f760, %r343, %r356, %r357; @p add.f32 r0, r0, %f760; mov.f32 %f758, r0;} // end inline asm mov.u32 
%r346, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f758, %r346, %r356, %r357; @p add.f32 r0, r0, %f758; mov.f32 %f761, r0;} // end inline asm mov.u32 %r349, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f761, %r349, %r356, %r357; @p add.f32 r0, r0, %f761; mov.f32 %f764, r0;} // end inline asm mov.u32 %r352, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f764, %r352, %r356, %r357; @p add.f32 r0, r0, %f764; mov.f32 %f767, r0;} // end inline asm mov.u32 %r355, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f767, %r355, %r356, %r357; @p add.f32 r0, r0, %f767; mov.f32 %f886, r0;} // end inline asm $L__BB0_25: @%p8 bra $L__BB0_27; add.s32 %r475, %r264, -112; st.shared.f32 [%r475+2560], %f885; $L__BB0_27: bar.sync 0; @%p9 bra $L__BB0_29; ld.shared.f32 %f793, [%r52+2576]; add.ftz.f32 %f794, %f885, %f793; ld.shared.f32 %f795, [%r52+2580]; add.ftz.f32 %f796, %f794, %f795; ld.shared.f32 %f797, [%r52+2584]; add.ftz.f32 %f780, %f796, %f797; mov.u32 %r367, 1; mov.u32 %r380, 31; mov.u32 %r381, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f780, %r367, %r380, %r381; @p add.f32 r0, r0, %f780; mov.f32 %f778, r0;} // end inline asm mov.u32 %r370, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f778, %r370, %r380, %r381; @p add.f32 r0, r0, %f778; mov.f32 %f781, r0;} // end inline asm mov.u32 %r373, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f781, %r373, %r380, %r381; @p add.f32 r0, r0, %f781; mov.f32 %f784, r0;} // end inline asm mov.u32 %r376, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f784, %r376, %r380, %r381; @p add.f32 r0, r0, %f784; mov.f32 %f787, r0;} // end inline asm mov.u32 %r379, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f787, %r379, %r380, %r381; @p add.f32 r0, r0, 
%f787; mov.f32 %f885, r0;} // end inline asm $L__BB0_29: @%p8 bra $L__BB0_31; add.s32 %r476, %r264, -112; st.shared.f32 [%r476+3200], %f884; $L__BB0_31: bar.sync 0; @%p9 bra $L__BB0_33; ld.shared.f32 %f813, [%r52+3216]; add.ftz.f32 %f814, %f884, %f813; ld.shared.f32 %f815, [%r52+3220]; add.ftz.f32 %f816, %f814, %f815; ld.shared.f32 %f817, [%r52+3224]; add.ftz.f32 %f800, %f816, %f817; mov.u32 %r391, 1; mov.u32 %r404, 31; mov.u32 %r405, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f800, %r391, %r404, %r405; @p add.f32 r0, r0, %f800; mov.f32 %f798, r0;} // end inline asm mov.u32 %r394, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f798, %r394, %r404, %r405; @p add.f32 r0, r0, %f798; mov.f32 %f801, r0;} // end inline asm mov.u32 %r397, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f801, %r397, %r404, %r405; @p add.f32 r0, r0, %f801; mov.f32 %f804, r0;} // end inline asm mov.u32 %r400, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f804, %r400, %r404, %r405; @p add.f32 r0, r0, %f804; mov.f32 %f807, r0;} // end inline asm mov.u32 %r403, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f807, %r403, %r404, %r405; @p add.f32 r0, r0, %f807; mov.f32 %f884, r0;} // end inline asm $L__BB0_33: @%p8 bra $L__BB0_35; add.s32 %r477, %r264, -112; st.shared.f32 [%r477+3840], %f883; $L__BB0_35: bar.sync 0; @%p9 bra $L__BB0_37; ld.shared.f32 %f833, [%r52+3856]; add.ftz.f32 %f834, %f883, %f833; ld.shared.f32 %f835, [%r52+3860]; add.ftz.f32 %f836, %f834, %f835; ld.shared.f32 %f837, [%r52+3864]; add.ftz.f32 %f820, %f836, %f837; mov.u32 %r415, 1; mov.u32 %r428, 31; mov.u32 %r429, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f820, %r415, %r428, %r429; @p add.f32 r0, r0, %f820; mov.f32 %f818, r0;} // end inline asm mov.u32 %r418, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; 
shfl.sync.down.b32 r0|p, %f818, %r418, %r428, %r429; @p add.f32 r0, r0, %f818; mov.f32 %f821, r0;} // end inline asm mov.u32 %r421, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f821, %r421, %r428, %r429; @p add.f32 r0, r0, %f821; mov.f32 %f824, r0;} // end inline asm mov.u32 %r424, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f824, %r424, %r428, %r429; @p add.f32 r0, r0, %f824; mov.f32 %f827, r0;} // end inline asm mov.u32 %r427, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f827, %r427, %r428, %r429; @p add.f32 r0, r0, %f827; mov.f32 %f883, r0;} // end inline asm $L__BB0_37: @%p8 bra $L__BB0_39; add.s32 %r478, %r264, -112; st.shared.f32 [%r478+4480], %f882; $L__BB0_39: bar.sync 0; @%p9 bra $L__BB0_41; ld.shared.f32 %f853, [%r52+4496]; add.ftz.f32 %f854, %f882, %f853; ld.shared.f32 %f855, [%r52+4500]; add.ftz.f32 %f856, %f854, %f855; ld.shared.f32 %f857, [%r52+4504]; add.ftz.f32 %f840, %f856, %f857; mov.u32 %r439, 1; mov.u32 %r452, 31; mov.u32 %r453, -1; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f840, %r439, %r452, %r453; @p add.f32 r0, r0, %f840; mov.f32 %f838, r0;} // end inline asm mov.u32 %r442, 2; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f838, %r442, %r452, %r453; @p add.f32 r0, r0, %f838; mov.f32 %f841, r0;} // end inline asm mov.u32 %r445, 4; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f841, %r445, %r452, %r453; @p add.f32 r0, r0, %f841; mov.f32 %f844, r0;} // end inline asm mov.u32 %r448, 8; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f844, %r448, %r452, %r453; @p add.f32 r0, r0, %f844; mov.f32 %f847, r0;} // end inline asm mov.u32 %r451, 16; // begin inline asm { .reg .f32 r0; .reg .pred p; shfl.sync.down.b32 r0|p, %f847, %r451, %r452, %r453; @p add.f32 r0, r0, %f847; mov.f32 %f882, r0;} // end inline asm $L__BB0_41: or.b32 
%r456, %r64, %r259; setp.ne.s32 %p24, %r456, 0; @%p24 bra $L__BB0_59; ld.param.u64 %rd106, [_Z27dequant_gemv_group64_batch823DequantGemvKernelParams_param_0+8]; setp.eq.s64 %p25, %rd106, 0; mul.ftz.f32 %f906, %f95, %f889; mov.u32 %r457, %ctaid.x; cvt.s64.s32 %rd15, %r457; @%p25 bra $L__BB0_44; shl.b64 %rd63, %rd15, 1; add.s64 %rd64, %rd2, %rd63; ld.global.u16 %rs397, [%rd64]; // begin inline asm { cvt.f32.f16 %f858, %rs397;} // end inline asm fma.rn.ftz.f32 %f906, %f96, %f858, %f906; $L__BB0_44: ld.param.u64 %rd107, [_Z27dequant_gemv_group64_batch823DequantGemvKernelParams_param_0]; // begin inline asm { cvt.rn.f16.f32 %rs398, %f906;} // end inline asm // begin inline asm createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd65, 1.0; // end inline asm shl.b64 %rd68, %rd15, 1; add.s64 %rd66, %rd107, %rd68; // begin inline asm st.global.L1::evict_normal.L2::cache_hint.u16 [%rd66], %rs398, %rd65; // end inline asm mul.ftz.f32 %f907, %f95, %f888; add.s32 %r459, %r60, %r457; cvt.s64.s32 %rd18, %r459; @%p25 bra $L__BB0_46; shl.b64 %rd69, %rd18, 1; add.s64 %rd70, %rd2, %rd69; ld.global.u16 %rs400, [%rd70]; // begin inline asm { cvt.f32.f16 %f860, %rs400;} // end inline asm fma.rn.ftz.f32 %f907, %f96, %f860, %f907; $L__BB0_46: cvt.s64.s32 %rd19, %r60; mul.wide.s32 %rd74, %r60, 2; add.s64 %rd72, %rd66, %rd74; // begin inline asm { cvt.rn.f16.f32 %rs401, %f907;} // end inline asm // begin inline asm createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd71, 1.0; // end inline asm // begin inline asm st.global.L1::evict_normal.L2::cache_hint.u16 [%rd72], %rs401, %rd71; // end inline asm mul.ftz.f32 %f908, %f95, %f887; cvt.u32.u64 %r460, %rd18; add.s32 %r461, %r460, %r60; cvt.s64.s32 %rd20, %r461; @%p25 bra $L__BB0_48; shl.b64 %rd75, %rd20, 1; add.s64 %rd76, %rd2, %rd75; ld.global.u16 %rs403, [%rd76]; // begin inline asm { cvt.f32.f16 %f862, %rs403;} // end inline asm fma.rn.ftz.f32 %f908, %f96, %f862, %f908; $L__BB0_48: ld.param.u64 %rd108, 
[_Z27dequant_gemv_group64_batch823DequantGemvKernelParams_param_0]; shl.b64 %rd80, %rd20, 1; add.s64 %rd78, %rd108, %rd80; // begin inline asm { cvt.rn.f16.f32 %rs404, %f908;} // end inline asm // begin inline asm createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd77, 1.0; // end inline asm // begin inline asm st.global.L1::evict_normal.L2::cache_hint.u16 [%rd78], %rs404, %rd77; // end inline asm mul.ftz.f32 %f909, %f95, %f886; cvt.u32.u64 %r462, %rd20; add.s32 %r463, %r462, %r60; cvt.s64.s32 %rd22, %r463; @%p25 bra $L__BB0_50; shl.b64 %rd81, %rd22, 1; add.s64 %rd82, %rd2, %rd81; ld.global.u16 %rs406, [%rd82]; // begin inline asm { cvt.f32.f16 %f864, %rs406;} // end inline asm fma.rn.ftz.f32 %f909, %f96, %f864, %f909; $L__BB0_50: // begin inline asm { cvt.rn.f16.f32 %rs407, %f909;} // end inline asm // begin inline asm createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd83, 1.0; // end inline asm shl.b64 %rd23, %rd19, 1; add.s64 %rd84, %rd78, %rd23; // begin inline asm st.global.L1::evict_normal.L2::cache_hint.u16 [%rd84], %rs407, %rd83; // end inline asm mul.ftz.f32 %f910, %f95, %f885; cvt.u32.u64 %r464, %rd22; add.s32 %r465, %r464, %r60; cvt.s64.s32 %rd25, %r465; @%p25 bra $L__BB0_52; shl.b64 %rd86, %rd25, 1; add.s64 %rd87, %rd2, %rd86; ld.global.u16 %rs409, [%rd87]; // begin inline asm { cvt.f32.f16 %f866, %rs409;} // end inline asm fma.rn.ftz.f32 %f910, %f96, %f866, %f910; $L__BB0_52: // begin inline asm { cvt.rn.f16.f32 %rs410, %f910;} // end inline asm // begin inline asm createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd88, 1.0; // end inline asm add.s64 %rd89, %rd84, %rd23; // begin inline asm st.global.L1::evict_normal.L2::cache_hint.u16 [%rd89], %rs410, %rd88; // end inline asm mul.ftz.f32 %f911, %f95, %f884; cvt.u32.u64 %r466, %rd25; add.s32 %r467, %r466, %r60; cvt.s64.s32 %rd27, %r467; @%p25 bra $L__BB0_54; shl.b64 %rd91, %rd27, 1; add.s64 %rd92, %rd2, %rd91; ld.global.u16 %rs412, [%rd92]; // begin inline asm 
{ cvt.f32.f16 %f868, %rs412;}
// end inline asm
// Fuse the optional f16 input (scaled by %f96) into the %f95-scaled accumulator.
// %p25 (set earlier from the null test on the pointer loaded at param_0+8) guards
// each of these fused loads: when that pointer is null the fma blocks are skipped.
fma.rn.ftz.f32 %f911, %f96, %f868, %f911;
$L__BB0_54:
// Convert the finished f32 result to f16 and store it with an L2 cache-hint
// policy (evict_last / evict_unchanged); this store pattern repeats once per
// output element, each time advancing the destination by %rd23 bytes.
// begin inline asm
{ cvt.rn.f16.f32 %rs413, %f911;}
// end inline asm
// begin inline asm
createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd93, 1.0;
// end inline asm
add.s64 %rd94, %rd89, %rd23;
// begin inline asm
st.global.L1::evict_normal.L2::cache_hint.u16 [%rd94], %rs413, %rd93;
// end inline asm
// Next output element: scale accumulator %f883 by %f95.
mul.ftz.f32 %f912, %f95, %f883;
cvt.u32.u64 %r468, %rd27;
add.s32 %r469, %r468, %r60;
cvt.s64.s32 %rd29, %r469;
@%p25 bra $L__BB0_56;
// Optional fused f16 load at element index %rd29 (2-byte elements).
shl.b64 %rd96, %rd29, 1;
add.s64 %rd97, %rd2, %rd96;
ld.global.u16 %rs415, [%rd97];
// begin inline asm
{ cvt.f32.f16 %f870, %rs415;}
// end inline asm
fma.rn.ftz.f32 %f912, %f96, %f870, %f912;
$L__BB0_56:
// begin inline asm
{ cvt.rn.f16.f32 %rs416, %f912;}
// end inline asm
// begin inline asm
createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd98, 1.0;
// end inline asm
add.s64 %rd99, %rd94, %rd23;
// begin inline asm
st.global.L1::evict_normal.L2::cache_hint.u16 [%rd99], %rs416, %rd98;
// end inline asm
// Final output element of the batch: accumulator %f882 scaled by %f95.
mul.ftz.f32 %f913, %f95, %f882;
cvt.u32.u64 %r470, %rd29;
add.s32 %r54, %r470, %r60;
@%p25 bra $L__BB0_58;
mul.wide.s32 %rd101, %r54, 2;
add.s64 %rd102, %rd2, %rd101;
ld.global.u16 %rs418, [%rd102];
// begin inline asm
{ cvt.f32.f16 %f872, %rs418;}
// end inline asm
fma.rn.ftz.f32 %f913, %f96, %f872, %f913;
$L__BB0_58:
// begin inline asm
{ cvt.rn.f16.f32 %rs419, %f913;}
// end inline asm
// begin inline asm
createpolicy.fractional.L2::evict_last.L2::evict_unchanged.b64 %rd103, 1.0;
// end inline asm
add.s64 %rd104, %rd99, %rd23;
// begin inline asm
st.global.L1::evict_normal.L2::cache_hint.u16 [%rd104], %rs419, %rd103;
// end inline asm
$L__BB0_59:
// End of the dequant GEMV kernel (entry/header appears earlier in the file).
ret;
}
// .globl _ZN3cub11EmptyKernelIvEEvv
// cub::EmptyKernel<void>: a no-op kernel — its body is a single ret.
.visible .entry _ZN3cub11EmptyKernelIvEEvv()
{
ret;
}