From 92b26317d444fc63c8b229dfabd2cddd838b9fe4 Mon Sep 17 00:00:00 2001 From: Sidraya Jayagond Date: Mon, 18 Nov 2024 06:51:11 +0000 Subject: [PATCH] 8327652: S390x: Implements SLP support Reviewed-by: amitkumar, lucy, mdoerr --- src/hotspot/cpu/s390/assembler_s390.hpp | 106 ++- .../cpu/s390/assembler_s390.inline.hpp | 49 +- .../cpu/s390/c2_MacroAssembler_s390.cpp | 6 + src/hotspot/cpu/s390/c2_globals_s390.hpp | 2 +- src/hotspot/cpu/s390/globals_s390.hpp | 5 + src/hotspot/cpu/s390/registerSaver_s390.hpp | 9 +- src/hotspot/cpu/s390/register_s390.cpp | 13 +- src/hotspot/cpu/s390/register_s390.hpp | 24 +- src/hotspot/cpu/s390/s390.ad | 866 +++++++++++++++++- src/hotspot/cpu/s390/sharedRuntime_s390.cpp | 89 +- src/hotspot/cpu/s390/vm_version_s390.cpp | 18 +- src/hotspot/cpu/s390/vmreg_s390.cpp | 10 + src/hotspot/cpu/s390/vmreg_s390.hpp | 16 +- src/hotspot/cpu/s390/vmreg_s390.inline.hpp | 12 +- src/hotspot/share/adlc/output_c.cpp | 3 + src/hotspot/share/opto/machnode.hpp | 8 + src/hotspot/share/opto/type.cpp | 2 +- 17 files changed, 1148 insertions(+), 90 deletions(-) diff --git a/src/hotspot/cpu/s390/assembler_s390.hpp b/src/hotspot/cpu/s390/assembler_s390.hpp index c98c100a06842..60e347a2d92ca 100644 --- a/src/hotspot/cpu/s390/assembler_s390.hpp +++ b/src/hotspot/cpu/s390/assembler_s390.hpp @@ -1236,6 +1236,9 @@ class Assembler : public AbstractAssembler { // NOR #define VNO_ZOPC (unsigned long)(0xe7L << 40 | 0x6bL << 0) // V1 := !(V2 | V3), element size = 2**m + //NOT-XOR +#define VNX_ZOPC (unsigned long)(0xe7L << 40 | 0x6cL << 0) // V1 := !(V2 ^ V3), element size = 2**m + // OR #define VO_ZOPC (unsigned long)(0xe7L << 40 | 0x6aL << 0) // V1 := V2 | V3, element size = 2**m @@ -1287,6 +1290,13 @@ class Assembler : public AbstractAssembler { #define VSTRC_ZOPC (unsigned long)(0xe7L << 40 | 0x8aL << 0) // String range compare #define VISTR_ZOPC (unsigned long)(0xe7L << 40 | 0x5cL << 0) // Isolate String +#define VFA_ZOPC (unsigned long)(0xe7L << 40 | 0xE3L << 0) // V1 := 
V2 + V3, element size = 2**m +#define VFS_ZOPC (unsigned long)(0xe7L << 40 | 0xE2L << 0) // V1 := V2 - V3, element size = 2**m +#define VFM_ZOPC (unsigned long)(0xe7L << 40 | 0xE7L << 0) // V1 := V2 * V3, element size = 2**m +#define VFD_ZOPC (unsigned long)(0xe7L << 40 | 0xE5L << 0) // V1 := V2 / V3, element size = 2**m +#define VFSQ_ZOPC (unsigned long)(0xe7L << 40 | 0xCEL << 0) // V1 := sqrt of V2, element size = 2**m +#define VFLR_ZOPC (unsigned long)(0xe7L << 40 | 0xC5L << 0) // vector fp load rounded, element size = 2**m + //-------------------------------- //-- Miscellaneous Operations -- @@ -2322,22 +2332,22 @@ class Assembler : public AbstractAssembler { inline void z_xilf(Register r1, int64_t i2); // xor r1 = r1 ^ i2_imm32 ; or only for bits 32-63 // shift - inline void z_sla( Register r1, int64_t d2, Register b2=Z_R0); // shift left r1 = r1 << ((d2+b2)&0x3f) ; int32, only 31 bits shifted, sign preserved! - inline void z_slak(Register r1, Register r3, int64_t d2, Register b2=Z_R0); // shift left r1 = r3 << ((d2+b2)&0x3f) ; int32, only 31 bits shifted, sign preserved! - inline void z_slag(Register r1, Register r3, int64_t d2, Register b2=Z_R0); // shift left r1 = r3 << ((d2+b2)&0x3f) ; int64, only 63 bits shifted, sign preserved! 
- inline void z_sra( Register r1, int64_t d2, Register b2=Z_R0); // shift right r1 = r1 >> ((d2+b2)&0x3f) ; int32, sign extended - inline void z_srak(Register r1, Register r3, int64_t d2, Register b2=Z_R0); // shift right r1 = r3 >> ((d2+b2)&0x3f) ; int32, sign extended - inline void z_srag(Register r1, Register r3, int64_t d2, Register b2=Z_R0); // shift right r1 = r3 >> ((d2+b2)&0x3f) ; int64, sign extended - inline void z_sll( Register r1, int64_t d2, Register b2=Z_R0); // shift left r1 = r1 << ((d2+b2)&0x3f) ; int32, zeros added - inline void z_sllk(Register r1, Register r3, int64_t d2, Register b2=Z_R0); // shift left r1 = r3 << ((d2+b2)&0x3f) ; int32, zeros added - inline void z_sllg(Register r1, Register r3, int64_t d2, Register b2=Z_R0); // shift left r1 = r3 << ((d2+b2)&0x3f) ; int64, zeros added - inline void z_srl( Register r1, int64_t d2, Register b2=Z_R0); // shift right r1 = r1 >> ((d2+b2)&0x3f) ; int32, zero extended - inline void z_srlk(Register r1, Register r3, int64_t d2, Register b2=Z_R0); // shift right r1 = r3 >> ((d2+b2)&0x3f) ; int32, zero extended - inline void z_srlg(Register r1, Register r3, int64_t d2, Register b2=Z_R0); // shift right r1 = r3 >> ((d2+b2)&0x3f) ; int64, zero extended + inline void z_sla( Register r1, int64_t d2, Register b2 = Z_R0); // shift left r1 = r1 << ((d2+b2)&0x3f) ; int32, only 31 bits shifted, sign preserved! + inline void z_slak(Register r1, Register r3, int64_t d2, Register b2 = Z_R0); // shift left r1 = r3 << ((d2+b2)&0x3f) ; int32, only 31 bits shifted, sign preserved! + inline void z_slag(Register r1, Register r3, int64_t d2, Register b2 = Z_R0); // shift left r1 = r3 << ((d2+b2)&0x3f) ; int64, only 63 bits shifted, sign preserved! 
+ inline void z_sra( Register r1, int64_t d2, Register b2 = Z_R0); // shift right r1 = r1 >> ((d2+b2)&0x3f) ; int32, sign extended + inline void z_srak(Register r1, Register r3, int64_t d2, Register b2 = Z_R0); // shift right r1 = r3 >> ((d2+b2)&0x3f) ; int32, sign extended + inline void z_srag(Register r1, Register r3, int64_t d2, Register b2 = Z_R0); // shift right r1 = r3 >> ((d2+b2)&0x3f) ; int64, sign extended + inline void z_sll( Register r1, int64_t d2, Register b2 = Z_R0); // shift left r1 = r1 << ((d2+b2)&0x3f) ; int32, zeros added + inline void z_sllk(Register r1, Register r3, int64_t d2, Register b2 = Z_R0); // shift left r1 = r3 << ((d2+b2)&0x3f) ; int32, zeros added + inline void z_sllg(Register r1, Register r3, int64_t d2, Register b2 = Z_R0); // shift left r1 = r3 << ((d2+b2)&0x3f) ; int64, zeros added + inline void z_srl( Register r1, int64_t d2, Register b2 = Z_R0); // shift right r1 = r1 >> ((d2+b2)&0x3f) ; int32, zero extended + inline void z_srlk(Register r1, Register r3, int64_t d2, Register b2 = Z_R0); // shift right r1 = r3 >> ((d2+b2)&0x3f) ; int32, zero extended + inline void z_srlg(Register r1, Register r3, int64_t d2, Register b2 = Z_R0); // shift right r1 = r3 >> ((d2+b2)&0x3f) ; int64, zero extended // rotate - inline void z_rll( Register r1, Register r3, int64_t d2, Register b2=Z_R0); // rot r1 = r3 << (d2+b2 & 0x3f) ; int32 -- z10 - inline void z_rllg(Register r1, Register r3, int64_t d2, Register b2=Z_R0); // rot r1 = r3 << (d2+b2 & 0x3f) ; int64 -- z10 + inline void z_rll( Register r1, Register r3, int64_t d2, Register b2 = Z_R0); // rot r1 = r3 << (d2+b2 & 0x3f) ; int32 -- z10 + inline void z_rllg(Register r1, Register r3, int64_t d2, Register b2 = Z_R0); // rot r1 = r3 << (d2+b2 & 0x3f) ; int64 -- z10 // rotate the AND/XOR/OR/insert inline void z_rnsbg( Register r1, Register r2, int64_t spos3, int64_t epos4, int64_t nrot5, bool test_only = false); // rotate then AND selected bits -- z196 @@ -2459,7 +2469,7 @@ class Assembler : 
public AbstractAssembler { inline void z_mvc(const Address& d, const Address& s, int64_t l); // move l bytes inline void z_mvc(int64_t d1, int64_t l, Register b1, int64_t d2, Register b2); // move l+1 bytes inline void z_mvcin(int64_t d1, int64_t l, Register b1, int64_t d2, Register b2); // move l+1 bytes - inline void z_mvcle(Register r1, Register r3, int64_t d2, Register b2=Z_R0); // move region of memory + inline void z_mvcle(Register r1, Register r3, int64_t d2, Register b2 = Z_R0); // move region of memory inline void z_stfle(int64_t d2, Register b2); // store facility list extended @@ -2491,6 +2501,7 @@ class Assembler : public AbstractAssembler { // Load (transfer from memory) inline void z_vlm( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2); inline void z_vl( VectorRegister v1, int64_t d2, Register x2, Register b2); + inline void z_vl( VectorRegister v1, const Address& a); inline void z_vleb( VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t m3); inline void z_vleh( VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t m3); inline void z_vlef( VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t m3); @@ -2529,10 +2540,10 @@ class Assembler : public AbstractAssembler { inline void z_vlgvg( Register r1, VectorRegister v3, int64_t d2, Register b2); inline void z_vlvg( VectorRegister v1, Register r3, int64_t d2, Register b2, int64_t m4); - inline void z_vlvgb( VectorRegister v1, Register r3, int64_t d2, Register b2); - inline void z_vlvgh( VectorRegister v1, Register r3, int64_t d2, Register b2); - inline void z_vlvgf( VectorRegister v1, Register r3, int64_t d2, Register b2); - inline void z_vlvgg( VectorRegister v1, Register r3, int64_t d2, Register b2); + inline void z_vlvgb( VectorRegister v1, Register r3, int64_t d2, Register b2 = Z_R0); + inline void z_vlvgh( VectorRegister v1, Register r3, int64_t d2, Register b2 = Z_R0); + inline void z_vlvgf( VectorRegister v1, Register r3, int64_t d2, Register b2 = 
Z_R0); + inline void z_vlvgg( VectorRegister v1, Register r3, int64_t d2, Register b2 = Z_R0); inline void z_vlvgp( VectorRegister v1, Register r2, Register r3); @@ -2619,6 +2630,7 @@ class Assembler : public AbstractAssembler { // Store inline void z_vstm( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2); inline void z_vst( VectorRegister v1, int64_t d2, Register x2, Register b2); + inline void z_vst( VectorRegister v1, const Address& a); inline void z_vsteb( VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t m3); inline void z_vsteh( VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t m3); inline void z_vstef( VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t m3); @@ -2679,13 +2691,16 @@ class Assembler : public AbstractAssembler { inline void z_vscbiq( VectorRegister v1, VectorRegister v2, VectorRegister v3); // MULTIPLY - inline void z_vml( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4); - inline void z_vmh( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4); - inline void z_vmlh( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4); - inline void z_vme( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4); - inline void z_vmle( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4); - inline void z_vmo( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4); - inline void z_vmlo( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4); + inline void z_vml( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4); + inline void z_vmlb( VectorRegister v1, VectorRegister v2, VectorRegister v3); + inline void z_vmlhw(VectorRegister v1, VectorRegister v2, VectorRegister v3); + inline void z_vmlf( VectorRegister v1, VectorRegister v2, VectorRegister v3); + inline void z_vmh( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4); + inline void z_vmlh( VectorRegister v1, 
VectorRegister v2, VectorRegister v3, int64_t m4); + inline void z_vme( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4); + inline void z_vmle( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4); + inline void z_vmo( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4); + inline void z_vmlo( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4); // MULTIPLY & ADD inline void z_vmal( VectorRegister v1, VectorRegister v2, VectorRegister v3, VectorRegister v4, int64_t m5); @@ -2744,6 +2759,9 @@ class Assembler : public AbstractAssembler { // NOR inline void z_vno( VectorRegister v1, VectorRegister v2, VectorRegister v3); + //NOT-XOR + inline void z_vnx( VectorRegister v1, VectorRegister v2, VectorRegister v3); + // OR inline void z_vo( VectorRegister v1, VectorRegister v2, VectorRegister v3); @@ -2810,6 +2828,10 @@ class Assembler : public AbstractAssembler { inline void z_vctzf( VectorRegister v1, VectorRegister v2); inline void z_vctzg( VectorRegister v1, VectorRegister v2); inline void z_vpopct( VectorRegister v1, VectorRegister v2, int64_t m3); + inline void z_vpopctb(VectorRegister v1, VectorRegister v2); + inline void z_vpopcth(VectorRegister v1, VectorRegister v2); + inline void z_vpopctf(VectorRegister v1, VectorRegister v2); + inline void z_vpopctg(VectorRegister v1, VectorRegister v2); // Rotate/Shift inline void z_verllv( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4); @@ -2898,9 +2920,39 @@ class Assembler : public AbstractAssembler { inline void z_vistrfs(VectorRegister v1, VectorRegister v2); - // Floatingpoint instructions + // Vector Floatingpoint instructions // ========================== + // Add + inline void z_vfa( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4); + inline void z_vfasb(VectorRegister v1, VectorRegister v2, VectorRegister v3); + inline void z_vfadb(VectorRegister v1, VectorRegister v2, VectorRegister v3); + + //SUB + inline 
void z_vfs( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4); + inline void z_vfssb(VectorRegister v1, VectorRegister v2, VectorRegister v3); + inline void z_vfsdb(VectorRegister v1, VectorRegister v2, VectorRegister v3); + + //MUL + inline void z_vfm( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4); + inline void z_vfmsb(VectorRegister v1, VectorRegister v2, VectorRegister v3); + inline void z_vfmdb(VectorRegister v1, VectorRegister v2, VectorRegister v3); + + //DIV + inline void z_vfd( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4); + inline void z_vfdsb(VectorRegister v1, VectorRegister v2, VectorRegister v3); + inline void z_vfddb(VectorRegister v1, VectorRegister v2, VectorRegister v3); + + //square root + inline void z_vfsq( VectorRegister v1, VectorRegister v2, int64_t m3); + inline void z_vfsqsb(VectorRegister v1, VectorRegister v2); + inline void z_vfsqdb(VectorRegister v1, VectorRegister v2); + + //vector fp load rounded + inline void z_vflr( VectorRegister v1, VectorRegister v2, int64_t m5, int64_t m3); + inline void z_vflrd( VectorRegister v1, VectorRegister v2, int64_t m5); + // Floatingpoint instructions + // ========================== // compare instructions inline void z_cebr(FloatRegister r1, FloatRegister r2); // compare (r1, r2) ; float inline void z_ceb(FloatRegister r1, int64_t d2, Register x2, Register b2); // compare (r1, *(d2_imm12+x2+b2)) ; float diff --git a/src/hotspot/cpu/s390/assembler_s390.inline.hpp b/src/hotspot/cpu/s390/assembler_s390.inline.hpp index 78ce87ddeb767..e9277b8bb6f12 100644 --- a/src/hotspot/cpu/s390/assembler_s390.inline.hpp +++ b/src/hotspot/cpu/s390/assembler_s390.inline.hpp @@ -787,6 +787,7 @@ inline void Assembler::z_vleb( VectorRegister v1, int64_t d2, Register x2, Reg inline void Assembler::z_vleh( VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t ix3){emit_48(VLEH_ZOPC | vreg(v1, 8) | rxmask_48(d2, x2, b2) | uimm4(ix3, 32, 48)); } 
inline void Assembler::z_vlef( VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t ix3){emit_48(VLEF_ZOPC | vreg(v1, 8) | rxmask_48(d2, x2, b2) | uimm4(ix3, 32, 48)); } inline void Assembler::z_vleg( VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t ix3){emit_48(VLEG_ZOPC | vreg(v1, 8) | rxmask_48(d2, x2, b2) | uimm4(ix3, 32, 48)); } +inline void Assembler::z_vl(VectorRegister v1, const Address& a) { z_vl(v1, a.disp(), a.indexOrR0(), a.baseOrR0()); } // Gather/Scatter inline void Assembler::z_vgef( VectorRegister v1, int64_t d2, VectorRegister vx2, Register b2, int64_t ix3) {emit_48(VGEF_ZOPC | vreg(v1, 8) | rvmask_48(d2, vx2, b2) | uimm4(ix3, 32, 48)); } @@ -820,7 +821,7 @@ inline void Assembler::z_vlgvh( Register r1, VectorRegister v3, int64_t d2, Reg inline void Assembler::z_vlgvf( Register r1, VectorRegister v3, int64_t d2, Register b2) {z_vlgv(r1, v3, d2, b2, VRET_FW); } // load FW from VR element (index d2(b2)) into GR (logical) inline void Assembler::z_vlgvg( Register r1, VectorRegister v3, int64_t d2, Register b2) {z_vlgv(r1, v3, d2, b2, VRET_DW); } // load DW from VR element (index d2(b2)) into GR. 
-inline void Assembler::z_vlvg( VectorRegister v1, Register r3, int64_t d2, Register b2, int64_t m4) {emit_48(VLVG_ZOPC | vreg(v1, 8) | reg(r3, 12, 48) | rsmask_48(d2, b2) | vesc_mask(m4, VRET_BYTE, VRET_DW, 32)); } +inline void Assembler::z_vlvg( VectorRegister v1, Register r3, int64_t d2, Register b2, int64_t m4) {emit_48(VLVG_ZOPC | vreg(v1, 8) | reg(r3, 12, 48) | rsmaskt_48(d2, b2) | vesc_mask(m4, VRET_BYTE, VRET_DW, 32)); } inline void Assembler::z_vlvgb( VectorRegister v1, Register r3, int64_t d2, Register b2) {z_vlvg(v1, r3, d2, b2, VRET_BYTE); } inline void Assembler::z_vlvgh( VectorRegister v1, Register r3, int64_t d2, Register b2) {z_vlvg(v1, r3, d2, b2, VRET_HW); } inline void Assembler::z_vlvgf( VectorRegister v1, Register r3, int64_t d2, Register b2) {z_vlvg(v1, r3, d2, b2, VRET_FW); } @@ -916,6 +917,7 @@ inline void Assembler::z_vsteh( VectorRegister v1, int64_t d2, Register x2, Reg inline void Assembler::z_vstef( VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t ix3){emit_48(VSTEF_ZOPC | vreg(v1, 8) | rxmask_48(d2, x2, b2) | uimm4(ix3, 32, 48)); } inline void Assembler::z_vsteg( VectorRegister v1, int64_t d2, Register x2, Register b2, int64_t ix3){emit_48(VSTEG_ZOPC | vreg(v1, 8) | rxmask_48(d2, x2, b2) | uimm4(ix3, 32, 48)); } inline void Assembler::z_vstl( VectorRegister v1, Register r3, int64_t d2, Register b2) {emit_48(VSTL_ZOPC | vreg(v1, 8) | reg(r3, 12, 48) | rsmask_48(d2, b2)); } +inline void Assembler::z_vst(VectorRegister v1, const Address& a) { z_vst(v1, a.disp(), a.indexOrR0(), a.baseOrR0()); } // Misc inline void Assembler::z_vgm( VectorRegister v1, int64_t imm2, int64_t imm3, int64_t m4) {emit_48(VGM_ZOPC | vreg(v1, 8) | uimm8( imm2, 16, 48) | uimm8(imm3, 24, 48) | vesc_mask(m4, VRET_BYTE, VRET_DW, 32)); } @@ -973,6 +975,9 @@ inline void Assembler::z_vscbiq( VectorRegister v1, VectorRegister v2, VectorReg // MULTIPLY inline void Assembler::z_vml( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) 
{emit_48(VML_ZOPC | vreg(v1, 8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_BYTE, VRET_FW, 32)); } inline void Assembler::z_vmh( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VMH_ZOPC | vreg(v1, 8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_BYTE, VRET_FW, 32)); } +inline void Assembler::z_vmlb( VectorRegister v1, VectorRegister v2, VectorRegister v3) {z_vml(v1, v2, v3, VRET_BYTE);} // vector element type 'B' +inline void Assembler::z_vmlhw( VectorRegister v1, VectorRegister v2, VectorRegister v3) {z_vml(v1, v2, v3, VRET_HW);} // vector element type 'H' +inline void Assembler::z_vmlf( VectorRegister v1, VectorRegister v2, VectorRegister v3) {z_vml(v1, v2, v3, VRET_FW);} // vector element type 'F' inline void Assembler::z_vmlh( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VMLH_ZOPC | vreg(v1, 8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_BYTE, VRET_FW, 32)); } inline void Assembler::z_vme( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VME_ZOPC | vreg(v1, 8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_BYTE, VRET_FW, 32)); } inline void Assembler::z_vmle( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VMLE_ZOPC | vreg(v1, 8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_BYTE, VRET_FW, 32)); } @@ -1035,6 +1040,9 @@ inline void Assembler::z_vx( VectorRegister v1, VectorRegister v2, VectorReg // NOR inline void Assembler::z_vno( VectorRegister v1, VectorRegister v2, VectorRegister v3) {emit_48(VNO_ZOPC | vreg(v1, 8) | vreg(v2, 12) | vreg(v3, 16)); } +//NOT-XOR +inline void Assembler::z_vnx( VectorRegister v1, VectorRegister v2, VectorRegister v3) {emit_48(VNX_ZOPC | vreg(v1, 8) | vreg(v2, 12) | vreg(v3, 16)); } + // OR inline void Assembler::z_vo( VectorRegister v1, VectorRegister v2, VectorRegister v3) {emit_48(VO_ZOPC | vreg(v1, 8) | vreg(v2, 12) | vreg(v3, 16)); } @@ -1101,6 +1109,10 @@ inline void 
Assembler::z_vctzh( VectorRegister v1, VectorRegister v2) inline void Assembler::z_vctzf( VectorRegister v1, VectorRegister v2) {z_vctz(v1, v2, VRET_FW); } // vector element type 'F' inline void Assembler::z_vctzg( VectorRegister v1, VectorRegister v2) {z_vctz(v1, v2, VRET_DW); } // vector element type 'G' inline void Assembler::z_vpopct( VectorRegister v1, VectorRegister v2, int64_t m3) {emit_48(VPOPCT_ZOPC| vreg(v1, 8) | vreg(v2, 12) | vesc_mask(m3, VRET_BYTE, VRET_DW, 32)); } +inline void Assembler::z_vpopctb( VectorRegister v1, VectorRegister v2) {z_vpopct(v1, v2, VRET_BYTE); } +inline void Assembler::z_vpopcth( VectorRegister v1, VectorRegister v2) {z_vpopct(v1, v2, VRET_HW); } +inline void Assembler::z_vpopctf( VectorRegister v1, VectorRegister v2) {z_vpopct(v1, v2, VRET_FW); } +inline void Assembler::z_vpopctg( VectorRegister v1, VectorRegister v2) {z_vpopct(v1, v2, VRET_DW); } // Rotate/Shift inline void Assembler::z_verllv( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VERLLV_ZOPC| vreg(v1, 8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_BYTE, VRET_DW, 32)); } @@ -1108,7 +1120,7 @@ inline void Assembler::z_verllvb(VectorRegister v1, VectorRegister v2, VectorReg inline void Assembler::z_verllvh(VectorRegister v1, VectorRegister v2, VectorRegister v3) {z_verllv(v1, v2, v3, VRET_HW); } // vector element type 'H' inline void Assembler::z_verllvf(VectorRegister v1, VectorRegister v2, VectorRegister v3) {z_verllv(v1, v2, v3, VRET_FW); } // vector element type 'F' inline void Assembler::z_verllvg(VectorRegister v1, VectorRegister v2, VectorRegister v3) {z_verllv(v1, v2, v3, VRET_DW); } // vector element type 'G' -inline void Assembler::z_verll( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2, int64_t m4) {emit_48(VERLL_ZOPC | vreg(v1, 8) | vreg(v3, 12) | rsmask_48(d2, b2) | vesc_mask(m4, VRET_BYTE, VRET_DW, 32)); } +inline void Assembler::z_verll( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2, 
int64_t m4) {emit_48(VERLL_ZOPC | vreg(v1, 8) | vreg(v3, 12) | rsmask_48(d2, b2) | vesc_mask(m4, VRET_BYTE, VRET_DW, 32)); } inline void Assembler::z_verllb( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2) {z_verll(v1, v3, d2, b2, VRET_BYTE);}// vector element type 'B' inline void Assembler::z_verllh( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2) {z_verll(v1, v3, d2, b2, VRET_HW);} // vector element type 'H' inline void Assembler::z_verllf( VectorRegister v1, VectorRegister v3, int64_t d2, Register b2) {z_verll(v1, v3, d2, b2, VRET_FW);} // vector element type 'F' @@ -1188,12 +1200,41 @@ inline void Assembler::z_vistrbs(VectorRegister v1, VectorRegister v2) inline void Assembler::z_vistrhs(VectorRegister v1, VectorRegister v2) {z_vistr(v1, v2, VRET_HW, VOPRC_CCSET); } inline void Assembler::z_vistrfs(VectorRegister v1, VectorRegister v2) {z_vistr(v1, v2, VRET_FW, VOPRC_CCSET); } +//------------------------------- +// Vector FLOAT INSTRUCTIONS +//------------------------------- +// ADD +inline void Assembler::z_vfa( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VFA_ZOPC | vreg(v1, 8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_FW, VRET_QW, 32)); } +inline void Assembler::z_vfasb( VectorRegister v1, VectorRegister v2, VectorRegister v3) {z_vfa(v1, v2, v3, VRET_FW); } // vector element type 'F' +inline void Assembler::z_vfadb( VectorRegister v1, VectorRegister v2, VectorRegister v3) {z_vfa(v1, v2, v3, VRET_DW); } // vector element type 'G' + +// SUB +inline void Assembler::z_vfs( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VFS_ZOPC | vreg(v1, 8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_FW, VRET_QW, 32)); } +inline void Assembler::z_vfssb( VectorRegister v1, VectorRegister v2, VectorRegister v3) {z_vfs(v1, v2, v3, VRET_FW); } // vector element type 'F' +inline void Assembler::z_vfsdb( VectorRegister v1, VectorRegister v2, VectorRegister v3) {z_vfs(v1, v2, 
v3, VRET_DW); } // vector element type 'G' + +// MUL +inline void Assembler::z_vfm( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VFM_ZOPC | vreg(v1, 8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_FW, VRET_QW, 32)); } +inline void Assembler::z_vfmsb( VectorRegister v1, VectorRegister v2, VectorRegister v3) {z_vfm(v1, v2, v3, VRET_FW); } // vector element type 'F' +inline void Assembler::z_vfmdb( VectorRegister v1, VectorRegister v2, VectorRegister v3) {z_vfm(v1, v2, v3, VRET_DW); } // vector element type 'G' + +// DIV +inline void Assembler::z_vfd( VectorRegister v1, VectorRegister v2, VectorRegister v3, int64_t m4) {emit_48(VFD_ZOPC | vreg(v1, 8) | vreg(v2, 12) | vreg(v3, 16) | vesc_mask(m4, VRET_FW, VRET_QW, 32)); } +inline void Assembler::z_vfdsb( VectorRegister v1, VectorRegister v2, VectorRegister v3) {z_vfd(v1, v2, v3, VRET_FW); } // vector element type 'F' +inline void Assembler::z_vfddb( VectorRegister v1, VectorRegister v2, VectorRegister v3) {z_vfd(v1, v2, v3, VRET_DW); } // vector element type 'G' + +// square root +inline void Assembler::z_vfsq( VectorRegister v1, VectorRegister v2, int64_t m3) {emit_48(VFSQ_ZOPC | vreg(v1, 8) | vreg(v2, 12) | vesc_mask(m3, VRET_FW, VRET_QW, 32)); } +inline void Assembler::z_vfsqsb( VectorRegister v1, VectorRegister v2) {z_vfsq(v1, v2, VRET_FW); } +inline void Assembler::z_vfsqdb( VectorRegister v1, VectorRegister v2) {z_vfsq(v1, v2, VRET_DW); } + +// vector fp load rounded +inline void Assembler::z_vflr( VectorRegister v1, VectorRegister v2, int64_t m5, int64_t m3) {emit_48(VFLR_ZOPC | vreg(v1, 8) | vreg(v2, 12) | vesc_mask(m5, VRET_FW, 7, 24) | vesc_mask(m3, VRET_FW, VRET_QW, 32)); } +inline void Assembler::z_vflrd( VectorRegister v1, VectorRegister v2, int64_t m5) {z_vflr(v1, v2, m5, VRET_DW); } //------------------------------- // FLOAT INSTRUCTIONS //------------------------------- - -//---------------- // LOAD //---------------- inline void Assembler::z_ler( FloatRegister r1, 
FloatRegister r2) { emit_16( LER_ZOPC | fregt(r1,8,16) | freg(r2,12,16)); } diff --git a/src/hotspot/cpu/s390/c2_MacroAssembler_s390.cpp b/src/hotspot/cpu/s390/c2_MacroAssembler_s390.cpp index 378d5e4cfe1b9..c8393fe0e6097 100644 --- a/src/hotspot/cpu/s390/c2_MacroAssembler_s390.cpp +++ b/src/hotspot/cpu/s390/c2_MacroAssembler_s390.cpp @@ -169,6 +169,7 @@ unsigned int C2_MacroAssembler::string_compress(Register result, Register src, R #endif clear_reg(Z_R0); // make sure register is properly initialized. +#if 0 if (VM_Version::has_VectorFacility()) { const int min_vcnt = 32; // Minimum #characters required to use vector instructions. // Otherwise just do nothing in vector mode. @@ -223,6 +224,7 @@ unsigned int C2_MacroAssembler::string_compress(Register result, Register src, R bind(VectorDone); } +#endif { const int min_cnt = 8; // Minimum #characters required to use unrolled loop. @@ -461,6 +463,7 @@ unsigned int C2_MacroAssembler::string_inflate(Register src, Register dst, Regis #endif clear_reg(Z_R0); // make sure register is properly initialized. +#if 0 if (VM_Version::has_VectorFacility()) { const int min_vcnt = 32; // Minimum #characters required to use vector instructions. // Otherwise just do nothing in vector mode. @@ -489,6 +492,7 @@ unsigned int C2_MacroAssembler::string_inflate(Register src, Register dst, Regis bind(VectorDone); } +#endif const int min_cnt = 8; // Minimum #characters required to use unrolled scalar loop. // Otherwise just do nothing in unrolled scalar mode. @@ -623,6 +627,7 @@ unsigned int C2_MacroAssembler::string_inflate_const(Register src, Register dst, bool restore_inputs = false; bool workreg_clear = false; +#if 0 if ((len >= 32) && VM_Version::has_VectorFacility()) { const int min_vcnt = 32; // Minimum #characters required to use vector instructions. // Otherwise just do nothing in vector mode. 
@@ -678,6 +683,7 @@ unsigned int C2_MacroAssembler::string_inflate_const(Register src, Register dst, src_off += min_vcnt; dst_off += min_vcnt*2; } +#endif if ((len-nprocessed) > 8) { const int min_cnt = 8; // Minimum #characters required to use unrolled scalar loop. diff --git a/src/hotspot/cpu/s390/c2_globals_s390.hpp b/src/hotspot/cpu/s390/c2_globals_s390.hpp index 0192cb716baab..1de38f100f627 100644 --- a/src/hotspot/cpu/s390/c2_globals_s390.hpp +++ b/src/hotspot/cpu/s390/c2_globals_s390.hpp @@ -60,7 +60,7 @@ define_pd_global(bool, UseCISCSpill, true); define_pd_global(bool, OptoBundling, false); define_pd_global(bool, OptoScheduling, false); define_pd_global(bool, OptoRegScheduling, false); -define_pd_global(bool, SuperWordLoopUnrollAnalysis, false); +define_pd_global(bool, SuperWordLoopUnrollAnalysis, true); // On s390x, we can clear the array with a single instruction, // so don't idealize it. define_pd_global(bool, IdealizeClearArrayNode, false); diff --git a/src/hotspot/cpu/s390/globals_s390.hpp b/src/hotspot/cpu/s390/globals_s390.hpp index fb5892ba62f0c..cf4be20397cfd 100644 --- a/src/hotspot/cpu/s390/globals_s390.hpp +++ b/src/hotspot/cpu/s390/globals_s390.hpp @@ -107,6 +107,11 @@ define_pd_global(intx, InitArrayShortSize, 1*BytesPerLong); /* Seems to pay off with 2 pages already. 
*/ \ product(size_t, MVCLEThreshold, +2*(4*K), DIAGNOSTIC, \ "Threshold above which page-aligned MVCLE copy/init is used.") \ + /* special instructions */ \ + product(bool, SuperwordUseVX, false, \ + "Use Z15 Vector instructions for superword optimization.") \ + product(bool, UseSFPV, false, DIAGNOSTIC, \ + "Use SFPV Vector instructions for superword optimization.") \ \ product(bool, PreferLAoverADD, false, DIAGNOSTIC, \ "Use LA/LAY instructions over ADD instructions (z/Architecture).") \ diff --git a/src/hotspot/cpu/s390/registerSaver_s390.hpp b/src/hotspot/cpu/s390/registerSaver_s390.hpp index 97883685384ca..13674f1562daf 100644 --- a/src/hotspot/cpu/s390/registerSaver_s390.hpp +++ b/src/hotspot/cpu/s390/registerSaver_s390.hpp @@ -47,10 +47,11 @@ class RegisterSaver { // Boolean flags to force only argument registers to be saved. static int live_reg_save_size(RegisterSet reg_set); - static int live_reg_frame_size(RegisterSet reg_set); + static int live_reg_frame_size(RegisterSet reg_set, bool save_vectors = false); + static int calculate_vregstosave_num(); // Specify the register that should be stored as the return pc in the current frame. - static OopMap* save_live_registers(MacroAssembler* masm, RegisterSet reg_set, Register return_pc = Z_R14); - static void restore_live_registers(MacroAssembler* masm, RegisterSet reg_set); + static OopMap* save_live_registers(MacroAssembler* masm, RegisterSet reg_set, Register return_pc = Z_R14, bool save_vectors = false); + static void restore_live_registers(MacroAssembler* masm, RegisterSet reg_set, bool save_vectors = false); // Generate the OopMap (again, regs where saved before). static OopMap* generate_oop_map(MacroAssembler* masm, RegisterSet reg_set); @@ -65,11 +66,13 @@ class RegisterSaver { int_reg = 0, float_reg = 1, excluded_reg = 2, // Not saved/restored. 
+ v_reg = 3 } RegisterType; typedef enum { reg_size = 8, half_reg_size = reg_size / 2, + v_reg_size = 16 } RegisterConstants; // Remember type, number, and VMReg. diff --git a/src/hotspot/cpu/s390/register_s390.cpp b/src/hotspot/cpu/s390/register_s390.cpp index f055a1c013441..c0840add5d6e4 100644 --- a/src/hotspot/cpu/s390/register_s390.cpp +++ b/src/hotspot/cpu/s390/register_s390.cpp @@ -26,11 +26,6 @@ #include "precompiled.hpp" #include "register_s390.hpp" - -const int ConcreteRegisterImpl::max_gpr = Register::number_of_registers * 2; -const int ConcreteRegisterImpl::max_fpr = ConcreteRegisterImpl::max_gpr + - FloatRegister::number_of_registers * 2; - const char* Register::name() const { const char* names[number_of_registers] = { "Z_R0", "Z_R1", "Z_R2", "Z_R3", "Z_R4", "Z_R5", "Z_R6", "Z_R7", @@ -54,5 +49,11 @@ const char* VectorRegister::name() const { "Z_V16", "Z_V17", "Z_V18", "Z_V19", "Z_V20", "Z_V21", "Z_V22", "Z_V23", "Z_V24", "Z_V25", "Z_V26", "Z_V27", "Z_V28", "Z_V29", "Z_V30", "Z_V31" }; - return is_valid() ? names[encoding()] : "fnoreg"; + return is_valid() ? names[encoding()] : "vnoreg"; +} + +// Convert a FloatRegister to the VectorRegister with the same encoding (fnoreg maps to vnoreg). +VectorRegister FloatRegister::to_vr() const { + if (*this == fnoreg) { return vnoreg; } + return as_VectorRegister(encoding()); } diff --git a/src/hotspot/cpu/s390/register_s390.hpp b/src/hotspot/cpu/s390/register_s390.hpp index 18af232e56970..6fcba746cd3b9 100644 --- a/src/hotspot/cpu/s390/register_s390.hpp +++ b/src/hotspot/cpu/s390/register_s390.hpp @@ -64,6 +64,7 @@ class Register { public: enum { number_of_registers = 16, + max_slots_per_register = 2, number_of_arg_registers = 5 }; @@ -164,12 +165,13 @@ constexpr ConditionRegister Z_CR = as_ConditionRegister(0); //========================= // The implementation of float registers for the z/Architecture. 
- +class VectorRegister; class FloatRegister { int _encoding; public: enum { number_of_registers = 16, + max_slots_per_register = 2, number_of_arg_registers = 4 }; @@ -192,6 +194,8 @@ class FloatRegister { constexpr bool is_nonvolatile() const { return (8 <= _encoding && _encoding <= 15); } const char* name() const; + // convert to VR + VectorRegister to_vr() const; }; inline constexpr FloatRegister as_FloatRegister(int encoding) { @@ -285,6 +289,7 @@ class VectorRegister { public: enum { number_of_registers = 32, + max_slots_per_register = 4, number_of_arg_registers = 0 }; @@ -379,21 +384,20 @@ constexpr VectorRegister Z_V31 = as_VectorRegister(31); // Need to know the total number of registers of all sorts for SharedInfo. // Define a class that exports it. - class ConcreteRegisterImpl : public AbstractRegisterImpl { public: enum { - number_of_registers = - (Register::number_of_registers + - FloatRegister::number_of_registers) - * 2 // register halves - + 1 // condition code register + max_gpr = Register::number_of_registers * Register::max_slots_per_register, + max_fpr = max_gpr + FloatRegister::number_of_registers * FloatRegister::max_slots_per_register, + max_vr = max_fpr + VectorRegister::number_of_registers * VectorRegister::max_slots_per_register, + // A big enough number for C2: all the registers plus flags + // This number must be large enough to cover REG_COUNT (defined by c2) registers. + // There is no requirement that any ordering here matches any ordering c2 gives + // its optoregs. + number_of_registers = max_vr + 1 // gpr/fpr/vr + flags }; - static const int max_gpr; - static const int max_fpr; }; - // Common register declarations used in assembler code. constexpr Register Z_EXC_OOP = Z_R2; constexpr Register Z_EXC_PC = Z_R3; diff --git a/src/hotspot/cpu/s390/s390.ad b/src/hotspot/cpu/s390/s390.ad index 63e150c9e9c78..e1a98139992f8 100644 --- a/src/hotspot/cpu/s390/s390.ad +++ b/src/hotspot/cpu/s390/s390.ad @@ -97,8 +97,9 @@ register %{ // e.g. 
Z_R3_H, which is needed by the allocator, but is not used // for stores, loads, etc. - // Integer/Long Registers - // ---------------------------- +// ---------------------------- +// Integer/Long Registers +// ---------------------------- // z/Architecture has 16 64-bit integer registers. @@ -136,7 +137,9 @@ register %{ reg_def Z_R15 (NS, NS, Op_RegI, 15, Z_R15->as_VMReg()); // s SP reg_def Z_R15_H(NS, NS, Op_RegI, 99, Z_R15->as_VMReg()->next()); - // Float/Double Registers +// ---------------------------- +// Float/Double Registers +// ---------------------------- // The rules of ADL require that double registers be defined in pairs. // Each pair must be two 32-bit values, but not necessarily a pair of @@ -182,7 +185,169 @@ register %{ reg_def Z_F15 (SOC, SOE, Op_RegF, 15, Z_F15->as_VMReg()); reg_def Z_F15_H(SOC, SOE, Op_RegF, 99, Z_F15->as_VMReg()->next()); - +// ---------------------------- +// Vector Registers +// ---------------------------- + // 1st 16 VRs are aliases for the FPRs which are already defined above. 
+ reg_def Z_VR0 ( SOC, SOC, Op_RegF, 0, VMRegImpl::Bad()); + reg_def Z_VR0_H ( SOC, SOC, Op_RegF, 0, VMRegImpl::Bad()); + reg_def Z_VR0_J ( SOC, SOC, Op_RegF, 0, VMRegImpl::Bad()); + reg_def Z_VR0_K ( SOC, SOC, Op_RegF, 0, VMRegImpl::Bad()); + + reg_def Z_VR1 ( SOC, SOC, Op_RegF, 1, VMRegImpl::Bad()); + reg_def Z_VR1_H ( SOC, SOC, Op_RegF, 1, VMRegImpl::Bad()); + reg_def Z_VR1_J ( SOC, SOC, Op_RegF, 1, VMRegImpl::Bad()); + reg_def Z_VR1_K ( SOC, SOC, Op_RegF, 1, VMRegImpl::Bad()); + + reg_def Z_VR2 ( SOC, SOC, Op_RegF, 2, VMRegImpl::Bad()); + reg_def Z_VR2_H ( SOC, SOC, Op_RegF, 2, VMRegImpl::Bad()); + reg_def Z_VR2_J ( SOC, SOC, Op_RegF, 2, VMRegImpl::Bad()); + reg_def Z_VR2_K ( SOC, SOC, Op_RegF, 2, VMRegImpl::Bad()); + + reg_def Z_VR3 ( SOC, SOC, Op_RegF, 3, VMRegImpl::Bad()); + reg_def Z_VR3_H ( SOC, SOC, Op_RegF, 3, VMRegImpl::Bad()); + reg_def Z_VR3_J ( SOC, SOC, Op_RegF, 3, VMRegImpl::Bad()); + reg_def Z_VR3_K ( SOC, SOC, Op_RegF, 3, VMRegImpl::Bad()); + + reg_def Z_VR4 ( SOC, SOC, Op_RegF, 4, VMRegImpl::Bad()); + reg_def Z_VR4_H ( SOC, SOC, Op_RegF, 4, VMRegImpl::Bad()); + reg_def Z_VR4_J ( SOC, SOC, Op_RegF, 4, VMRegImpl::Bad()); + reg_def Z_VR4_K ( SOC, SOC, Op_RegF, 4, VMRegImpl::Bad()); + + reg_def Z_VR5 ( SOC, SOC, Op_RegF, 5, VMRegImpl::Bad()); + reg_def Z_VR5_H ( SOC, SOC, Op_RegF, 5, VMRegImpl::Bad()); + reg_def Z_VR5_J ( SOC, SOC, Op_RegF, 5, VMRegImpl::Bad()); + reg_def Z_VR5_K ( SOC, SOC, Op_RegF, 5, VMRegImpl::Bad()); + + reg_def Z_VR6 ( SOC, SOC, Op_RegF, 6, VMRegImpl::Bad()); + reg_def Z_VR6_H ( SOC, SOC, Op_RegF, 6, VMRegImpl::Bad()); + reg_def Z_VR6_J ( SOC, SOC, Op_RegF, 6, VMRegImpl::Bad()); + reg_def Z_VR6_K ( SOC, SOC, Op_RegF, 6, VMRegImpl::Bad()); + + reg_def Z_VR7 ( SOC, SOC, Op_RegF, 7, VMRegImpl::Bad()); + reg_def Z_VR7_H ( SOC, SOC, Op_RegF, 7, VMRegImpl::Bad()); + reg_def Z_VR7_J ( SOC, SOC, Op_RegF, 7, VMRegImpl::Bad()); + reg_def Z_VR7_K ( SOC, SOC, Op_RegF, 7, VMRegImpl::Bad()); + + reg_def Z_VR8 ( SOC, SOC, Op_RegF, 8, 
VMRegImpl::Bad()); + reg_def Z_VR8_H ( SOC, SOC, Op_RegF, 8, VMRegImpl::Bad()); + reg_def Z_VR8_J ( SOC, SOC, Op_RegF, 8, VMRegImpl::Bad()); + reg_def Z_VR8_K ( SOC, SOC, Op_RegF, 8, VMRegImpl::Bad()); + + reg_def Z_VR9 ( SOC, SOC, Op_RegF, 9, VMRegImpl::Bad()); + reg_def Z_VR9_H ( SOC, SOC, Op_RegF, 9, VMRegImpl::Bad()); + reg_def Z_VR9_J ( SOC, SOC, Op_RegF, 9, VMRegImpl::Bad()); + reg_def Z_VR9_K ( SOC, SOC, Op_RegF, 9, VMRegImpl::Bad()); + + reg_def Z_VR10 ( SOC, SOC, Op_RegF, 10, VMRegImpl::Bad()); + reg_def Z_VR10_H ( SOC, SOC, Op_RegF, 10, VMRegImpl::Bad()); + reg_def Z_VR10_J ( SOC, SOC, Op_RegF, 10, VMRegImpl::Bad()); + reg_def Z_VR10_K ( SOC, SOC, Op_RegF, 10, VMRegImpl::Bad()); + + reg_def Z_VR11 ( SOC, SOC, Op_RegF, 11, VMRegImpl::Bad()); + reg_def Z_VR11_H ( SOC, SOC, Op_RegF, 11, VMRegImpl::Bad()); + reg_def Z_VR11_J ( SOC, SOC, Op_RegF, 11, VMRegImpl::Bad()); + reg_def Z_VR11_K ( SOC, SOC, Op_RegF, 11, VMRegImpl::Bad()); + + reg_def Z_VR12 ( SOC, SOC, Op_RegF, 12, VMRegImpl::Bad()); + reg_def Z_VR12_H ( SOC, SOC, Op_RegF, 12, VMRegImpl::Bad()); + reg_def Z_VR12_J ( SOC, SOC, Op_RegF, 12, VMRegImpl::Bad()); + reg_def Z_VR12_K ( SOC, SOC, Op_RegF, 12, VMRegImpl::Bad()); + + reg_def Z_VR13 ( SOC, SOC, Op_RegF, 13, VMRegImpl::Bad()); + reg_def Z_VR13_H ( SOC, SOC, Op_RegF, 13, VMRegImpl::Bad()); + reg_def Z_VR13_J ( SOC, SOC, Op_RegF, 13, VMRegImpl::Bad()); + reg_def Z_VR13_K ( SOC, SOC, Op_RegF, 13, VMRegImpl::Bad()); + + reg_def Z_VR14 ( SOC, SOC, Op_RegF, 14, VMRegImpl::Bad()); + reg_def Z_VR14_H ( SOC, SOC, Op_RegF, 14, VMRegImpl::Bad()); + reg_def Z_VR14_J ( SOC, SOC, Op_RegF, 14, VMRegImpl::Bad()); + reg_def Z_VR14_K ( SOC, SOC, Op_RegF, 14, VMRegImpl::Bad()); + + reg_def Z_VR15 ( SOC, SOC, Op_RegF, 15, VMRegImpl::Bad()); + reg_def Z_VR15_H ( SOC, SOC, Op_RegF, 15, VMRegImpl::Bad()); + reg_def Z_VR15_J ( SOC, SOC, Op_RegF, 15, VMRegImpl::Bad()); + reg_def Z_VR15_K ( SOC, SOC, Op_RegF, 15, VMRegImpl::Bad()); + + reg_def Z_VR16 ( SOC, SOC, Op_RegF, 
16, Z_V16->as_VMReg() ); + reg_def Z_VR16_H ( SOC, SOC, Op_RegF, 16, Z_V16->as_VMReg()->next() ); + reg_def Z_VR16_J ( SOC, SOC, Op_RegF, 16, Z_V16->as_VMReg()->next(2) ); + reg_def Z_VR16_K ( SOC, SOC, Op_RegF, 16, Z_V16->as_VMReg()->next(3) ); + + reg_def Z_VR17 ( SOC, SOC, Op_RegF, 17, Z_V17->as_VMReg() ); + reg_def Z_VR17_H ( SOC, SOC, Op_RegF, 17, Z_V17->as_VMReg()->next() ); + reg_def Z_VR17_J ( SOC, SOC, Op_RegF, 17, Z_V17->as_VMReg()->next(2) ); + reg_def Z_VR17_K ( SOC, SOC, Op_RegF, 17, Z_V17->as_VMReg()->next(3) ); + + reg_def Z_VR18 ( SOC, SOC, Op_RegF, 18, Z_V18->as_VMReg() ); + reg_def Z_VR18_H ( SOC, SOC, Op_RegF, 18, Z_V18->as_VMReg()->next() ); + reg_def Z_VR18_J ( SOC, SOC, Op_RegF, 18, Z_V18->as_VMReg()->next(2) ); + reg_def Z_VR18_K ( SOC, SOC, Op_RegF, 18, Z_V18->as_VMReg()->next(3) ); + + reg_def Z_VR19 ( SOC, SOC, Op_RegF, 19, Z_V19->as_VMReg() ); + reg_def Z_VR19_H ( SOC, SOC, Op_RegF, 19, Z_V19->as_VMReg()->next() ); + reg_def Z_VR19_J ( SOC, SOC, Op_RegF, 19, Z_V19->as_VMReg()->next(2) ); + reg_def Z_VR19_K ( SOC, SOC, Op_RegF, 19, Z_V19->as_VMReg()->next(3) ); + + reg_def Z_VR20 ( SOC, SOC, Op_RegF, 20, Z_V20->as_VMReg() ); + reg_def Z_VR20_H ( SOC, SOC, Op_RegF, 20, Z_V20->as_VMReg()->next() ); + reg_def Z_VR20_J ( SOC, SOC, Op_RegF, 20, Z_V20->as_VMReg()->next(2) ); + reg_def Z_VR20_K ( SOC, SOC, Op_RegF, 20, Z_V20->as_VMReg()->next(3) ); + + reg_def Z_VR21 ( SOC, SOC, Op_RegF, 21, Z_V21->as_VMReg() ); + reg_def Z_VR21_H ( SOC, SOC, Op_RegF, 21, Z_V21->as_VMReg()->next() ); + reg_def Z_VR21_J ( SOC, SOC, Op_RegF, 21, Z_V21->as_VMReg()->next(2) ); + reg_def Z_VR21_K ( SOC, SOC, Op_RegF, 21, Z_V21->as_VMReg()->next(3) ); + + reg_def Z_VR22 ( SOC, SOC, Op_RegF, 22, Z_V22->as_VMReg() ); + reg_def Z_VR22_H ( SOC, SOC, Op_RegF, 22, Z_V22->as_VMReg()->next() ); + reg_def Z_VR22_J ( SOC, SOC, Op_RegF, 22, Z_V22->as_VMReg()->next(2) ); + reg_def Z_VR22_K ( SOC, SOC, Op_RegF, 22, Z_V22->as_VMReg()->next(3) ); + + reg_def Z_VR23 ( SOC, SOC, 
Op_RegF, 23, Z_V23->as_VMReg() ); + reg_def Z_VR23_H ( SOC, SOC, Op_RegF, 23, Z_V23->as_VMReg()->next() ); + reg_def Z_VR23_J ( SOC, SOC, Op_RegF, 23, Z_V23->as_VMReg()->next(2) ); + reg_def Z_VR23_K ( SOC, SOC, Op_RegF, 23, Z_V23->as_VMReg()->next(3) ); + + reg_def Z_VR24 ( SOC, SOC, Op_RegF, 24, Z_V24->as_VMReg() ); + reg_def Z_VR24_H ( SOC, SOC, Op_RegF, 24, Z_V24->as_VMReg()->next() ); + reg_def Z_VR24_J ( SOC, SOC, Op_RegF, 24, Z_V24->as_VMReg()->next(2) ); + reg_def Z_VR24_K ( SOC, SOC, Op_RegF, 24, Z_V24->as_VMReg()->next(3) ); + + reg_def Z_VR25 ( SOC, SOC, Op_RegF, 25, Z_V25->as_VMReg() ); + reg_def Z_VR25_H ( SOC, SOC, Op_RegF, 25, Z_V25->as_VMReg()->next() ); + reg_def Z_VR25_J ( SOC, SOC, Op_RegF, 25, Z_V25->as_VMReg()->next(2) ); + reg_def Z_VR25_K ( SOC, SOC, Op_RegF, 25, Z_V25->as_VMReg()->next(3) ); + + reg_def Z_VR26 ( SOC, SOC, Op_RegF, 26, Z_V26->as_VMReg() ); + reg_def Z_VR26_H ( SOC, SOC, Op_RegF, 26, Z_V26->as_VMReg()->next() ); + reg_def Z_VR26_J ( SOC, SOC, Op_RegF, 26, Z_V26->as_VMReg()->next(2) ); + reg_def Z_VR26_K ( SOC, SOC, Op_RegF, 26, Z_V26->as_VMReg()->next(3) ); + + reg_def Z_VR27 ( SOC, SOC, Op_RegF, 27, Z_V27->as_VMReg() ); + reg_def Z_VR27_H ( SOC, SOC, Op_RegF, 27, Z_V27->as_VMReg()->next() ); + reg_def Z_VR27_J ( SOC, SOC, Op_RegF, 27, Z_V27->as_VMReg()->next(2) ); + reg_def Z_VR27_K ( SOC, SOC, Op_RegF, 27, Z_V27->as_VMReg()->next(3) ); + + reg_def Z_VR28 ( SOC, SOC, Op_RegF, 28, Z_V28->as_VMReg() ); + reg_def Z_VR28_H ( SOC, SOC, Op_RegF, 28, Z_V28->as_VMReg()->next() ); + reg_def Z_VR28_J ( SOC, SOC, Op_RegF, 28, Z_V28->as_VMReg()->next(2) ); + reg_def Z_VR28_K ( SOC, SOC, Op_RegF, 28, Z_V28->as_VMReg()->next(3) ); + + reg_def Z_VR29 ( SOC, SOC, Op_RegF, 29, Z_V29->as_VMReg() ); + reg_def Z_VR29_H ( SOC, SOC, Op_RegF, 29, Z_V29->as_VMReg()->next() ); + reg_def Z_VR29_J ( SOC, SOC, Op_RegF, 29, Z_V29->as_VMReg()->next(2) ); + reg_def Z_VR29_K ( SOC, SOC, Op_RegF, 29, Z_V29->as_VMReg()->next(3) ); + + reg_def Z_VR30 ( SOC, 
SOC, Op_RegF, 30, Z_V30->as_VMReg() ); + reg_def Z_VR30_H ( SOC, SOC, Op_RegF, 30, Z_V30->as_VMReg()->next() ); + reg_def Z_VR30_J ( SOC, SOC, Op_RegF, 30, Z_V30->as_VMReg()->next(2) ); + reg_def Z_VR30_K ( SOC, SOC, Op_RegF, 30, Z_V30->as_VMReg()->next(3) ); + + reg_def Z_VR31 ( SOC, SOC, Op_RegF, 31, Z_V31->as_VMReg() ); + reg_def Z_VR31_H ( SOC, SOC, Op_RegF, 31, Z_V31->as_VMReg()->next() ); + reg_def Z_VR31_J ( SOC, SOC, Op_RegF, 31, Z_V31->as_VMReg()->next(2) ); + reg_def Z_VR31_K ( SOC, SOC, Op_RegF, 31, Z_V31->as_VMReg()->next(3) ); // Special Registers // Condition Codes Flag Registers @@ -194,7 +359,6 @@ register %{ reg_def Z_CR(SOC, SOC, Op_RegFlags, 0, Z_CR->as_VMReg()); // volatile - // Specify priority of register selection within phases of register // allocation. Highest priority is first. A useful heuristic is to // give registers a low priority when they are required by machine @@ -268,6 +432,41 @@ alloc_class chunk1( ); alloc_class chunk2( + Z_VR0, Z_VR0_H, Z_VR0_J, Z_VR0_K, + Z_VR1, Z_VR1_H, Z_VR1_J, Z_VR1_K, + Z_VR2, Z_VR2_H, Z_VR2_J, Z_VR2_K, + Z_VR3, Z_VR3_H, Z_VR3_J, Z_VR3_K, + Z_VR4, Z_VR4_H, Z_VR4_J, Z_VR4_K, + Z_VR5, Z_VR5_H, Z_VR5_J, Z_VR5_K, + Z_VR6, Z_VR6_H, Z_VR6_J, Z_VR6_K, + Z_VR7, Z_VR7_H, Z_VR7_J, Z_VR7_K, + Z_VR8, Z_VR8_H, Z_VR8_J, Z_VR8_K, + Z_VR9, Z_VR9_H, Z_VR9_J, Z_VR9_K, + Z_VR10, Z_VR10_H, Z_VR10_J, Z_VR10_K, + Z_VR11, Z_VR11_H, Z_VR11_J, Z_VR11_K, + Z_VR12, Z_VR12_H, Z_VR12_J, Z_VR12_K, + Z_VR13, Z_VR13_H, Z_VR13_J, Z_VR13_K, + Z_VR14, Z_VR14_H, Z_VR14_J, Z_VR14_K, + Z_VR15, Z_VR15_H, Z_VR15_J, Z_VR15_K, + Z_VR16, Z_VR16_H, Z_VR16_J, Z_VR16_K, + Z_VR17, Z_VR17_H, Z_VR17_J, Z_VR17_K, + Z_VR18, Z_VR18_H, Z_VR18_J, Z_VR18_K, + Z_VR19, Z_VR19_H, Z_VR19_J, Z_VR19_K, + Z_VR20, Z_VR20_H, Z_VR20_J, Z_VR20_K, + Z_VR21, Z_VR21_H, Z_VR21_J, Z_VR21_K, + Z_VR22, Z_VR22_H, Z_VR22_J, Z_VR22_K, + Z_VR23, Z_VR23_H, Z_VR23_J, Z_VR23_K, + Z_VR24, Z_VR24_H, Z_VR24_J, Z_VR24_K, + Z_VR25, Z_VR25_H, Z_VR25_J, Z_VR25_K, + Z_VR26, Z_VR26_H, 
Z_VR26_J, Z_VR26_K, + Z_VR27, Z_VR27_H, Z_VR27_J, Z_VR27_K, + Z_VR28, Z_VR28_H, Z_VR28_J, Z_VR28_K, + Z_VR29, Z_VR29_H, Z_VR29_J, Z_VR29_K, + Z_VR30, Z_VR30_H, Z_VR30_J, Z_VR30_K, + Z_VR31, Z_VR31_H, Z_VR31_J, Z_VR31_K +); + +alloc_class chunk3( Z_CR ); @@ -542,6 +741,27 @@ reg_class z_dbl_reg( ); reg_class z_rscratch1_dbl_reg(Z_F1,Z_F1_H); +reg_class z_v_reg( + // Attention: Only these ones are saved & restored at safepoint by RegisterSaver. + //1st 16 VRs overlaps with 1st 16 FPRs. + Z_VR16, Z_VR16_H, Z_VR16_J, Z_VR16_K, + Z_VR17, Z_VR17_H, Z_VR17_J, Z_VR17_K, + Z_VR18, Z_VR18_H, Z_VR18_J, Z_VR18_K, + Z_VR19, Z_VR19_H, Z_VR19_J, Z_VR19_K, + Z_VR20, Z_VR20_H, Z_VR20_J, Z_VR20_K, + Z_VR21, Z_VR21_H, Z_VR21_J, Z_VR21_K, + Z_VR22, Z_VR22_H, Z_VR22_J, Z_VR22_K, + Z_VR23, Z_VR23_H, Z_VR23_J, Z_VR23_K, + Z_VR24, Z_VR24_H, Z_VR24_J, Z_VR24_K, + Z_VR25, Z_VR25_H, Z_VR25_J, Z_VR25_K, + Z_VR26, Z_VR26_H, Z_VR26_J, Z_VR26_K, + Z_VR27, Z_VR27_H, Z_VR27_J, Z_VR27_K, + Z_VR28, Z_VR28_H, Z_VR28_J, Z_VR28_K, + Z_VR29, Z_VR29_H, Z_VR29_J, Z_VR29_K, + Z_VR30, Z_VR30_H, Z_VR30_J, Z_VR30_K, + Z_VR31, Z_VR31_H, Z_VR31_J, Z_VR31_K +); + %} //----------DEFINITION BLOCK--------------------------------------------------- @@ -953,8 +1173,8 @@ const Pipeline * MachEpilogNode::pipeline() const { //============================================================================= -// Figure out which register class each belongs in: rc_int, rc_float, rc_stack. -enum RC { rc_bad, rc_int, rc_float, rc_stack }; +// Figure out which register class each belongs in: rc_int, rc_float, rc_vector, rc_stack. +enum RC { rc_bad, rc_int, rc_float, rc_vector, rc_stack }; static enum RC rc_class(OptoReg::Name reg) { // Return the register class for the given register. 
The given register @@ -975,8 +1195,13 @@ static enum RC rc_class(OptoReg::Name reg) { return rc_float; } + // we have 128 vector register slots (32 registers x 4 slots each) starting at index 64 + if (reg < 32+32+128) { + return rc_vector; + } + // Between float regs & stack are the flags regs. - assert(reg >= OptoReg::stack0(), "blow up if spilling flags"); + assert(OptoReg::is_stack(reg) || reg < 32+32+128, "blow up if spilling flags"); return rc_stack; } @@ -1035,7 +1260,7 @@ uint MachSpillCopyNode::implementation(C2_MacroAssembler *masm, PhaseRegAlloc *r "expected aligned-adjacent pairs"); // Generate spill code! - + int size = 0; if (src_lo == dst_lo && src_hi == dst_hi) { return 0; // Self copy, no move. } @@ -1049,6 +1274,37 @@ uint MachSpillCopyNode::implementation(C2_MacroAssembler *masm, PhaseRegAlloc *r const char *mnemo = nullptr; unsigned long opc = 0; + if (bottom_type()->isa_vect() != nullptr && ideal_reg() == Op_VecX) { + if (src_lo_rc == rc_stack && dst_lo_rc == rc_stack) { + if (masm != nullptr) { + __ z_mvc(Address(Z_SP, 0, dst_offset), Address(Z_SP, 0, src_offset), 16); + } + size += 6; + } else if (src_lo_rc == rc_vector && dst_lo_rc == rc_stack) { + VectorRegister Rsrc = as_VectorRegister(Matcher::_regEncode[src_lo]); + if (masm != nullptr) { + __ z_vst(Rsrc, Address(Z_SP, 0, dst_offset)); + } + size += 6; + } else if (src_lo_rc == rc_stack && dst_lo_rc == rc_vector) { + VectorRegister Rdst = as_VectorRegister(Matcher::_regEncode[dst_lo]); + if (masm != nullptr) { + __ z_vl(Rdst, Address(Z_SP, 0, src_offset)); + } + size += 6; + } else if (src_lo_rc == rc_vector && dst_lo_rc == rc_vector) { + VectorRegister Rsrc = as_VectorRegister(Matcher::_regEncode[src_lo]); + VectorRegister Rdst = as_VectorRegister(Matcher::_regEncode[dst_lo]); + if (masm != nullptr) { + __ z_vlr(Rdst, Rsrc); + } + size += 6; + } else { + ShouldNotReachHere(); + } + return size; + } + // Memory->Memory Spill. Use Z_R0 to hold the value. 
if (src_lo_rc == rc_stack && dst_lo_rc == rc_stack) { @@ -1283,7 +1539,7 @@ source_hpp %{ // // To keep related declarations/definitions/uses close together, // we switch between source %{ }% and source_hpp %{ }% freely as needed. - +#include "opto/convertnode.hpp" #include "oops/klass.inline.hpp" //-------------------------------------------------------------- @@ -1446,6 +1702,32 @@ bool Matcher::match_rule_supported(int opcode) { case Op_PopCountL: // PopCount supported by H/W from z/Architecture G5 (z196) on. return (UsePopCountInstruction && VM_Version::has_PopCount()); + case Op_AddVB: + case Op_AddVS: + case Op_AddVI: + case Op_AddVL: + case Op_AddVD: + case Op_SubVB: + case Op_SubVS: + case Op_SubVI: + case Op_SubVL: + case Op_SubVD: + case Op_MulVB: + case Op_MulVS: + case Op_MulVI: + case Op_MulVD: + case Op_DivVD: + case Op_SqrtVD: + case Op_RoundDoubleModeV: + return SuperwordUseVX; + case Op_AddVF: + case Op_SubVF: + case Op_MulVF: + case Op_DivVF: + case Op_SqrtVF: + //PopCountVI supported by z14 onwards. + case Op_PopCountVI: + return (SuperwordUseVX && UseSFPV); case Op_FmaF: case Op_FmaD: return UseFMA; @@ -1491,14 +1773,24 @@ OptoRegPair Matcher::vector_return_value(uint ideal_reg) { // Vector width in bytes. int Matcher::vector_width_in_bytes(BasicType bt) { - assert(MaxVectorSize == 8, ""); - return 8; + if (SuperwordUseVX) { + assert(MaxVectorSize == 16, ""); + return 16; + } else { + assert(MaxVectorSize == 8, ""); + return 8; + } } // Vector ideal reg. uint Matcher::vector_ideal_reg(int size) { - assert(MaxVectorSize == 8 && size == 8, ""); - return Op_RegL; + if (SuperwordUseVX) { + assert(MaxVectorSize == 16 && size == 16, ""); + return Op_VecX; + } else { + assert(MaxVectorSize == 8 && size == 8, ""); + return Op_RegL; + } } // Limits on vector size (number of elements) loaded into vector. 
@@ -2391,6 +2683,14 @@ ins_attrib ins_should_rematerialize(false); // Immediate Operands // Please note: // Formats are generated automatically for constants and base registers. +operand vecX() %{ + constraint(ALLOC_IN_RC(z_v_reg)); + match(VecX); + + format %{ %} + interface(REG_INTER); +%} + //---------------------------------------------- // SIGNED (shorter than INT) immediate operands @@ -10534,6 +10834,45 @@ instruct Repl4S_immm1(iRegL dst, immS_minus1 src) %{ ins_pipe(pipe_class_dummy); %} +instruct repl8S_reg_Ex(vecX dst, iRegI src) %{ + match(Set dst (Replicate src)); + predicate(n->as_Vector()->length() == 8 && + Matcher::vector_element_basic_type(n) == T_SHORT); + + size(12); + ins_encode %{ + __ z_vlvgh($dst$$VectorRegister, $src$$Register, 0); + __ z_vreph($dst$$VectorRegister, $dst$$VectorRegister, 0); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct repl8S_immIminus1(vecX dst, immI_minus1 src) %{ + match(Set dst (Replicate src)); + predicate(n->as_Vector()->length() == 8 && + Matcher::vector_element_basic_type(n) == T_SHORT); + + format %{ "VONE $dst, $src \t// replicate8S" %} + size(6); + ins_encode %{ + __ z_vone($dst$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct repl8S_immI0(vecX dst, immI_0 zero) %{ + match(Set dst (Replicate zero)); + predicate(n->as_Vector()->length() == 8 && + Matcher::vector_element_basic_type(n) == T_SHORT); + + format %{ "VZERO $dst, $zero \t// replicate8S" %} + size(6); + ins_encode %{ + __ z_vzero($dst$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + // Exploit rotate_then_insert, if available. // Replicate scalar int to packed int values (8 Bytes). 
instruct Repl2I_reg_risbg(iRegL dst, iRegI src, flagsReg cr) %{ @@ -10586,7 +10925,44 @@ instruct Repl2I_immm1(iRegL dst, immI_minus1 src) %{ ins_pipe(pipe_class_dummy); %} -// +instruct repl4I_reg_Ex(vecX dst, iRegI src) %{ + match(Set dst (Replicate src)); + predicate(n->as_Vector()->length() == 4 && + Matcher::vector_element_basic_type(n) == T_INT); + + size(12); + ins_encode %{ + __ z_vlvgf($dst$$VectorRegister, $src$$Register, 0); + __ z_vrepf($dst$$VectorRegister, $dst$$VectorRegister, 0); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct repl4I_immI0(vecX dst, immI_0 zero) %{ + match(Set dst (Replicate zero)); + predicate(n->as_Vector()->length() == 4 && + Matcher::vector_element_basic_type(n) == T_INT); + + format %{ "VZERO $dst, $zero \t// replicate4I" %} + size(6); + ins_encode %{ + __ z_vzero($dst$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct repl4I_immIminus1(vecX dst, immI_minus1 src) %{ + match(Set dst (Replicate src)); + predicate(n->as_Vector()->length() == 4 && + Matcher::vector_element_basic_type(n) == T_INT); + + format %{ "VONE $dst, $dst, $dst \t// replicate4I" %} + size(6); + ins_encode %{ + __ z_vone($dst$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} instruct Repl2F_reg_indirect(iRegL dst, regF src, flagsReg cr) %{ match(Set dst (Replicate src)); @@ -10650,6 +11026,139 @@ instruct Repl2F_imm0(iRegL dst, immFp0 src) %{ ins_pipe(pipe_class_dummy); %} +instruct repl4F_reg_Ex(vecX dst, regF src) %{ + match(Set dst (Replicate src)); + predicate(n->as_Vector()->length() == 4 && + Matcher::vector_element_basic_type(n) == T_FLOAT); + + format %{ "VREP $dst, $src \t// replicate4F" %} + size(6); + + ins_encode %{ + __ z_vrepf($dst$$VectorRegister, $src$$FloatRegister->to_vr(), 0); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct repl4F_immF0(vecX dst, immFp0 zero) %{ + match(Set dst (Replicate zero)); + predicate(n->as_Vector()->length() == 4 && + Matcher::vector_element_basic_type(n) == T_FLOAT); + + format %{ 
"VZERO $dst, $zero \t// replicate4F" %} + size(6); + ins_encode %{ + __ z_vzero($dst$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct repl2D_reg_Ex(vecX dst, regD src) %{ + match(Set dst (Replicate src)); + predicate(n->as_Vector()->length() == 2 && + Matcher::vector_element_basic_type(n) == T_DOUBLE); + + format %{ "VREP $dst, $src \t// replicate2D" %} + size(6); + + ins_encode %{ + __ z_vrepg($dst$$VectorRegister, $src$$FloatRegister->to_vr(), 0); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct repl2D_immD0(vecX dst, immDp0 zero) %{ + match(Set dst (Replicate zero)); + predicate(n->as_Vector()->length() == 2 && + Matcher::vector_element_basic_type(n) == T_DOUBLE); + + format %{ "VZERO $dst, $zero \t// replicate2D" %} + size(6); + ins_encode %{ + __ z_vzero($dst$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct repl16B_reg_Ex(vecX dst, iRegI src) %{ + match(Set dst (Replicate src)); + predicate(n->as_Vector()->length() == 16 && + Matcher::vector_element_basic_type(n) == T_BYTE); + + size(12); + ins_encode %{ + __ z_vlvgb($dst$$VectorRegister, $src$$Register, 0); + __ z_vrepb($dst$$VectorRegister, $dst$$VectorRegister, 0); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct repl16B_immIminus1(vecX dst, immI_minus1 src) %{ + match(Set dst (Replicate src)); + predicate(n->as_Vector()->length() == 16 && + Matcher::vector_element_basic_type(n) == T_BYTE); + + format %{ "VONE $dst, $src \t// replicate16B" %} + size(6); + ins_encode %{ + __ z_vone($dst$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct repl16B_immI0(vecX dst, immI_0 zero) %{ + match(Set dst (Replicate zero)); + predicate(n->as_Vector()->length() == 16 && + Matcher::vector_element_basic_type(n) == T_BYTE); + + format %{ "VZERO $dst, $zero \t// replicate16B" %} + size(6); + ins_encode %{ + __ z_vzero($dst$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct repl2L_reg_Ex(vecX dst, iRegL src) %{ + match(Set dst (Replicate src)); + 
predicate(n->as_Vector()->length() == 2 && + Matcher::vector_element_basic_type(n) == T_LONG); + + size(12); + ins_encode %{ + __ z_vlvgg($dst$$VectorRegister, $src$$Register, 0); + __ z_vrepg($dst$$VectorRegister, $dst$$VectorRegister, 0); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct repl2L_immIminus1(vecX dst, immI_minus1 src) %{ + match(Set dst (Replicate src)); + predicate(n->as_Vector()->length() == 2 && + Matcher::vector_element_basic_type(n) == T_LONG); + + format %{ "VONE $dst, $src \t// replicate2L" %} + size(6); + ins_encode %{ + __ z_vone($dst$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct repl2L_immI0(vecX dst, immI_0 zero) %{ + match(Set dst (Replicate zero)); + predicate(n->as_Vector()->length() == 2 && + Matcher::vector_element_basic_type(n) == T_LONG); + + format %{ "VZERO $dst, $zero \t// replicate16B" %} + size(6); + ins_encode %{ + __ z_vzero($dst$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + + // Load/Store vector // Store Aligned Packed Byte register to memory (8 Bytes). 
@@ -10664,6 +11173,21 @@ instruct storeA8B(memory mem, iRegL src) %{ ins_pipe(pipe_class_dummy); %} +// Store a 16-byte vector register (vecX) to memory with a single VST. +instruct storeV16(memoryRX mem, vecX src) %{ + predicate(n->as_StoreVector()->memory_size() == 16); + match(Set mem (StoreVector mem src)); + ins_cost(MEMORY_REF_COST); + + format %{ "VST $mem, $src \t// store 16-byte Vector" %} + size(6); + ins_encode %{ + __ z_vst($src$$VectorRegister, + Address(reg_to_register_object($mem$$base), $mem$$index$$Register, $mem$$disp)); + %} + ins_pipe(pipe_class_dummy); +%} + instruct loadV8(iRegL dst, memory mem) %{ match(Set dst (LoadVector mem)); predicate(n->as_LoadVector()->memory_size() == 8); @@ -10675,6 +11199,21 @@ instruct loadV8(iRegL dst, memory mem) %{ ins_pipe(pipe_class_dummy); %} +// Load a 16-byte vector from memory into a vector register (vecX) with a single VL. +instruct loadV16(vecX dst, memoryRX mem) %{ + predicate(n->as_LoadVector()->memory_size() == 16); + match(Set dst (LoadVector mem)); + ins_cost(MEMORY_REF_COST); + + format %{ "VL $dst, $mem \t// load 16-byte Vector" %} + size(6); + ins_encode %{ + __ z_vl($dst$$VectorRegister, + Address(reg_to_register_object($mem$$base), $mem$$index$$Register, $mem$$disp)); + %} + ins_pipe(pipe_class_dummy); +%} + // Reinterpret: only one vector size used instruct reinterpret(iRegL dst) %{ match(Set dst (VectorReinterpret dst)); @@ -10684,6 +11223,303 @@ instruct reinterpret(iRegL dst) %{ ins_pipe(pipe_class_dummy); %} +instruct reinterpretX(vecX dst) %{ /* VectorReinterpret on a vecX: zero cost and an empty encoding, so no code is emitted. */ + match(Set dst (VectorReinterpret dst)); + ins_cost(0); + format %{ "reinterpret $dst" %} + ins_encode( /*empty*/ ); + ins_pipe(pipe_class_dummy); +%} + +//----------Vector Arithmetic Instructions-------------------------------------- + +// Vector Add, Multiply, Subtract and Divide Instructions (integer and floating point) + +instruct vadd16B_reg(vecX dst, vecX src1, vecX src2) %{ + match(Set dst (AddVB src1 src2)); + predicate(n->as_Vector()->length() == 16); + format %{ "VAB $dst,$src1,$src2\t// add packed16B" %} + size(6); + ins_encode %{ + __ z_vab($dst$$VectorRegister, 
$src1$$VectorRegister, $src2$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct vadd8S_reg(vecX dst, vecX src1, vecX src2) %{ /* AddVS on a vector of 8 shorts: VAH. */ + match(Set dst (AddVS src1 src2)); + predicate(n->as_Vector()->length() == 8); + format %{ "VAH $dst,$src1,$src2\t// add packed8S" %} + size(6); + ins_encode %{ + __ z_vah($dst$$VectorRegister, $src1$$VectorRegister, $src2$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{ /* AddVI on a vector of 4 ints: VAF. */ + match(Set dst (AddVI src1 src2)); + predicate(n->as_Vector()->length() == 4); + format %{ "VAF $dst,$src1,$src2\t// add packed4I" %} + size(6); + ins_encode %{ + __ z_vaf($dst$$VectorRegister, $src1$$VectorRegister, $src2$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{ /* AddVL on a vector of 2 longs: VAG. */ + match(Set dst (AddVL src1 src2)); + predicate(n->as_Vector()->length() == 2); + format %{ "VAG $dst,$src1,$src2\t// add packed2L" %} + size(6); + ins_encode %{ + __ z_vag($dst$$VectorRegister, $src1$$VectorRegister, $src2$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct vmul16B_reg(vecX dst, vecX src1, vecX src2) %{ /* MulVB on a vector of 16 bytes: VMLB. */ + match(Set dst (MulVB src1 src2)); + predicate(n->as_Vector()->length() == 16); + format %{ "VMLB $dst,$src1,$src2\t// mul packed16B" %} + size(6); + ins_encode %{ + __ z_vmlb($dst$$VectorRegister, $src1$$VectorRegister, $src2$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct vmul8S_reg(vecX dst, vecX src1, vecX src2) %{ /* MulVS on a vector of 8 shorts: VMLHW. */ + match(Set dst (MulVS src1 src2)); + predicate(n->as_Vector()->length() == 8); + format %{ "VMLHW $dst,$src1,$src2\t// mul packed8S" %} + size(6); + ins_encode %{ + __ z_vmlhw($dst$$VectorRegister, $src1$$VectorRegister, $src2$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{ /* MulVI on a vector of 4 ints: VMLF. No MulVL rule follows in this group. */ + match(Set dst (MulVI src1 src2)); + predicate(n->as_Vector()->length() == 4); + format %{ "VMLF $dst,$src1,$src2\t// mul packed4I" %} + size(6); + 
ins_encode %{ + __ z_vmlf($dst$$VectorRegister, $src1$$VectorRegister, $src2$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct vsub16B_reg(vecX dst, vecX src1, vecX src2) %{ /* SubVB on a vector of 16 bytes: VSB computes src1 - src2. */ + match(Set dst (SubVB src1 src2)); + predicate(n->as_Vector()->length() == 16); + format %{ "VSB $dst,$src1,$src2\t// sub packed16B" %} + size(6); + ins_encode %{ + __ z_vsb($dst$$VectorRegister, $src1$$VectorRegister, $src2$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct vsub8S_reg(vecX dst, vecX src1, vecX src2) %{ /* SubVS on a vector of 8 shorts: VSH. */ + match(Set dst (SubVS src1 src2)); + predicate(n->as_Vector()->length() == 8); + format %{ "VSH $dst,$src1,$src2\t// sub packed8S" %} + size(6); + ins_encode %{ + __ z_vsh($dst$$VectorRegister, $src1$$VectorRegister, $src2$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{ /* SubVI on a vector of 4 ints: VSF. */ + match(Set dst (SubVI src1 src2)); + predicate(n->as_Vector()->length() == 4); + format %{ "VSF $dst,$src1,$src2\t// sub packed4I" %} + size(6); + ins_encode %{ + __ z_vsf($dst$$VectorRegister, $src1$$VectorRegister, $src2$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{ /* SubVL on a vector of 2 longs: VSG. */ + match(Set dst (SubVL src1 src2)); + predicate(n->as_Vector()->length() == 2); + format %{ "VSG $dst,$src1,$src2\t// sub packed2L" %} + size(6); + ins_encode %{ + __ z_vsg($dst$$VectorRegister, $src1$$VectorRegister, $src2$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct vadd4F_reg(vecX dst, vecX src1, vecX src2) %{ /* AddVF on a vector of 4 floats: VFASB. */ + match(Set dst (AddVF src1 src2)); + predicate(n->as_Vector()->length() == 4); + format %{ "VFASB $dst,$src1,$src2\t// add packed4F" %} + size(6); + ins_encode %{ + __ z_vfasb($dst$$VectorRegister, $src1$$VectorRegister, $src2$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct vadd2D_reg(vecX dst, vecX src1, vecX src2) %{ /* AddVD on a vector of 2 doubles: VFADB. */ + match(Set dst (AddVD src1 src2)); + predicate(n->as_Vector()->length() == 2); + format %{ "VFADB 
$dst,$src1,$src2\t// add packed2D" %} + size(6); + ins_encode %{ + __ z_vfadb($dst$$VectorRegister, $src1$$VectorRegister, $src2$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct vsub4F_reg(vecX dst, vecX src1, vecX src2) %{ /* SubVF on a vector of 4 floats: VFSSB. */ + match(Set dst (SubVF src1 src2)); + predicate(n->as_Vector()->length() == 4); + format %{ "VFSSB $dst,$src1,$src2\t// sub packed4F" %} + size(6); + ins_encode %{ + __ z_vfssb($dst$$VectorRegister, $src1$$VectorRegister, $src2$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct vsub2D_reg(vecX dst, vecX src1, vecX src2) %{ /* SubVD on a vector of 2 doubles: VFSDB. */ + match(Set dst (SubVD src1 src2)); + predicate(n->as_Vector()->length() == 2); + format %{ "VFSDB $dst,$src1,$src2\t// sub packed2D" %} + size(6); + ins_encode %{ + __ z_vfsdb($dst$$VectorRegister, $src1$$VectorRegister, $src2$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct vmul4F_reg(vecX dst, vecX src1, vecX src2) %{ /* MulVF on a vector of 4 floats: VFMSB. */ + match(Set dst (MulVF src1 src2)); + predicate(n->as_Vector()->length() == 4); + format %{ "VFMSB $dst,$src1,$src2\t// mul packed4F" %} + size(6); + ins_encode %{ + __ z_vfmsb($dst$$VectorRegister, $src1$$VectorRegister, $src2$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct vmul2D_reg(vecX dst, vecX src1, vecX src2) %{ /* MulVD on a vector of 2 doubles: VFMDB. */ + match(Set dst (MulVD src1 src2)); + predicate(n->as_Vector()->length() == 2); + format %{ "VFMDB $dst,$src1,$src2\t// mul packed2D" %} + size(6); + ins_encode %{ + __ z_vfmdb($dst$$VectorRegister, $src1$$VectorRegister, $src2$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct vdiv4F_reg(vecX dst, vecX src1, vecX src2) %{ /* DivVF on a vector of 4 floats: VFDSB. Only floating-point division rules are defined in this group. */ + match(Set dst (DivVF src1 src2)); + predicate(n->as_Vector()->length() == 4); + format %{ "VFDSB $dst,$src1,$src2\t// div packed4F" %} + size(6); + ins_encode %{ + __ z_vfdsb($dst$$VectorRegister, $src1$$VectorRegister, $src2$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct vdiv2D_reg(vecX dst, vecX src1, vecX src2) %{ /* DivVD on a vector of 2 doubles: VFDDB. */ + match(Set dst (DivVD src1 src2)); + 
predicate(n->as_Vector()->length() == 2); + format %{ "VFDDB $dst,$src1,$src2\t// div packed2D" %} + size(6); + ins_encode %{ + __ z_vfddb($dst$$VectorRegister, $src1$$VectorRegister, $src2$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + +// Vector Square Root Instructions (single operand: dst := sqrt(src) per element) + +instruct vsqrt4F_reg(vecX dst, vecX src) %{ + match(Set dst (SqrtVF src)); + predicate(n->as_Vector()->length() == 4); + format %{ "VFSQSB $dst,$src\t// sqrt packed4F" %} + size(6); + ins_encode %{ + __ z_vfsqsb($dst$$VectorRegister, $src$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + +instruct vsqrt2D_reg(vecX dst, vecX src) %{ + match(Set dst (SqrtVD src)); + predicate(n->as_Vector()->length() == 2); + format %{ "VFSQDB $dst,$src\t// sqrt packed2D" %} + size(6); + ins_encode %{ + __ z_vfsqdb($dst$$VectorRegister, $src$$VectorRegister); + %} + ins_pipe(pipe_class_dummy); +%} + +// Vector Population Count Instructions (one rule, no length predicate; the element width is selected at emit time from the node's vector element type) + +instruct vpopcnt_reg(vecX dst, vecX src) %{ + match(Set dst (PopCountVI src)); + format %{ "VPOPCT $dst,$src\t// pop count packed" %} + size(6); + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + switch (bt) { + case T_BYTE: + __ z_vpopctb($dst$$VectorRegister, $src$$VectorRegister); + break; + case T_SHORT: + __ z_vpopcth($dst$$VectorRegister, $src$$VectorRegister); + break; + case T_INT: + __ z_vpopctf($dst$$VectorRegister, $src$$VectorRegister); + break; + case T_LONG: + __ z_vpopctg($dst$$VectorRegister, $src$$VectorRegister); + break; + default: + ShouldNotReachHere(); + } + %} + ins_pipe(pipe_class_dummy); +%} + +// Vector Round Instructions (RoundDoubleModeV on 2 doubles; handles rmode_rint, rmode_floor and rmode_ceil, any other mode hits ShouldNotReachHere) +instruct vround2D_reg(vecX dst, vecX src, immI8 rmode) %{ + match(Set dst (RoundDoubleModeV src rmode)); + predicate(n->as_Vector()->length() == 2); + format %{ "RoundDoubleModeV $src,$rmode" %} + size(6); + ins_encode %{ + switch ($rmode$$constant) { + case RoundDoubleModeNode::rmode_rint: + __ z_vflrd($dst$$VectorRegister, $src$$VectorRegister, 0); + break; + case 
RoundDoubleModeNode::rmode_floor: + __ z_vflrd($dst$$VectorRegister, $src$$VectorRegister, 7); /* NOTE(review): 7 is presumably the rounding-method code for round toward -infinity - confirm against z_vflrd */ + break; + case RoundDoubleModeNode::rmode_ceil: + __ z_vflrd($dst$$VectorRegister, $src$$VectorRegister, 6); /* NOTE(review): 6 is presumably the rounding-method code for round toward +infinity - confirm against z_vflrd */ + break; + default: + ShouldNotReachHere(); + } + %} + ins_pipe(pipe_class_dummy); +%} + //----------POPULATION COUNT RULES-------------------------------------------- // Byte reverse diff --git a/src/hotspot/cpu/s390/sharedRuntime_s390.cpp b/src/hotspot/cpu/s390/sharedRuntime_s390.cpp index c60f6ef32957b..1238f887b872d 100644 --- a/src/hotspot/cpu/s390/sharedRuntime_s390.cpp +++ b/src/hotspot/cpu/s390/sharedRuntime_s390.cpp @@ -81,6 +81,9 @@ #define RegisterSaver_ExcludedFloatReg(regname) \ { RegisterSaver::excluded_reg, regname->encoding(), regname->as_VMReg() } +#define RegisterSaver_LiveVReg(regname) /* LiveRegType initializer for a vector register: type tag v_reg, encoding, VMReg */ \ + { RegisterSaver::v_reg, regname->encoding(), regname->as_VMReg() } + static const RegisterSaver::LiveRegType RegisterSaver_LiveRegs[] = { // Live registers which get spilled to the stack. Register positions // in this array correspond directly to the stack layout. 
@@ -258,6 +261,26 @@ static const RegisterSaver::LiveRegType RegisterSaver_LiveVolatileRegs[] = { // RegisterSaver_ExcludedIntReg(Z_R15) // stack pointer }; +static const RegisterSaver::LiveRegType RegisterSaver_LiveVRegs[] = { + // Live vector registers Z_V16..Z_V31 (optional: saved only when save_vectors is requested; only these are used by C2): + RegisterSaver_LiveVReg( Z_V16 ), + RegisterSaver_LiveVReg( Z_V17 ), + RegisterSaver_LiveVReg( Z_V18 ), + RegisterSaver_LiveVReg( Z_V19 ), + RegisterSaver_LiveVReg( Z_V20 ), + RegisterSaver_LiveVReg( Z_V21 ), + RegisterSaver_LiveVReg( Z_V22 ), + RegisterSaver_LiveVReg( Z_V23 ), + RegisterSaver_LiveVReg( Z_V24 ), + RegisterSaver_LiveVReg( Z_V25 ), + RegisterSaver_LiveVReg( Z_V26 ), + RegisterSaver_LiveVReg( Z_V27 ), + RegisterSaver_LiveVReg( Z_V28 ), + RegisterSaver_LiveVReg( Z_V29 ), + RegisterSaver_LiveVReg( Z_V30 ), + RegisterSaver_LiveVReg( Z_V31 ) +}; + int RegisterSaver::live_reg_save_size(RegisterSet reg_set) { int reg_space = -1; switch (reg_set) { @@ -271,23 +294,28 @@ int RegisterSaver::live_reg_save_size(RegisterSet reg_set) { return (reg_space / sizeof(RegisterSaver::LiveRegType)) * reg_size; } +int RegisterSaver::calculate_vregstosave_num() { /* number of entries in RegisterSaver_LiveVRegs */ + return (sizeof(RegisterSaver_LiveVRegs) / sizeof(RegisterSaver::LiveRegType)); +} -int RegisterSaver::live_reg_frame_size(RegisterSet reg_set) { - return live_reg_save_size(reg_set) + frame::z_abi_160_size; +int RegisterSaver::live_reg_frame_size(RegisterSet reg_set, bool save_vectors) { /* frame size in bytes: save area for reg_set, plus v_reg_size per vector register when save_vectors is true, plus z_abi_160_size */ + const int vregstosave_num = save_vectors ? calculate_vregstosave_num() : 0; + return live_reg_save_size(reg_set) + vregstosave_num * v_reg_size + frame::z_abi_160_size; } // return_pc: Specify the register that should be stored as the return pc in the current frame. 
-OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, RegisterSet reg_set, Register return_pc) { +OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, RegisterSet reg_set, Register return_pc, bool save_vectors) { // Record volatile registers as callee-save values in an OopMap so // their save locations will be propagated to the caller frame's // RegisterMap during StackFrameStream construction (needed for // deoptimization; see compiledVFrame::create_stack_value). // Calculate frame size. - const int frame_size_in_bytes = live_reg_frame_size(reg_set); + const int frame_size_in_bytes = live_reg_frame_size(reg_set, save_vectors); /* includes the vector save area when save_vectors is true */ const int frame_size_in_slots = frame_size_in_bytes / sizeof(jint); - const int register_save_offset = frame_size_in_bytes - live_reg_save_size(reg_set); + const int vregstosave_num = save_vectors ? calculate_vregstosave_num() : 0; /* 0 when vector registers are not saved */ + const int register_save_offset = frame_size_in_bytes - (live_reg_save_size(reg_set) + vregstosave_num * v_reg_size); /* start of the save area: the reg_set part plus the vector part end exactly at frame_size_in_bytes */ // OopMap frame size is in c2 stack slots (sizeof(jint)) not bytes or words. 
 OopMap* map = new OopMap(frame_size_in_slots, 0); @@ -382,6 +410,23 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, RegisterSet reg assert(first != noreg, "Should spill at least one int reg."); __ z_stmg(first, last, first_offset, Z_SP); + for (int i = 0; i < vregstosave_num; i++, offset += v_reg_size) { /* store each vector register with one VST and record its four VMReg parts (vmreg, next(), next(2), next(3)) in the OopMap; offset>>2 converts a byte offset into a 4-byte stack slot index */ + int reg_num = RegisterSaver_LiveVRegs[i].reg_num; + + __ z_vst(as_VectorRegister(reg_num), Address(Z_SP, offset)); + + map->set_callee_saved(VMRegImpl::stack2reg(offset>>2), + RegisterSaver_LiveVRegs[i].vmreg); + map->set_callee_saved(VMRegImpl::stack2reg((offset + half_reg_size ) >> 2), + RegisterSaver_LiveVRegs[i].vmreg->next()); + map->set_callee_saved(VMRegImpl::stack2reg((offset + (half_reg_size * 2)) >> 2), + RegisterSaver_LiveVRegs[i].vmreg->next(2)); + map->set_callee_saved(VMRegImpl::stack2reg((offset + (half_reg_size * 3)) >> 2), + RegisterSaver_LiveVRegs[i].vmreg->next(3)); + } + + assert(offset == frame_size_in_bytes, "consistency check"); /* the save area must end exactly at the top of the frame */ + // And we're done. return map; } @@ -433,14 +478,18 @@ OopMap* RegisterSaver::generate_oop_map(MacroAssembler* masm, RegisterSet reg_se } offset += reg_size; } +#ifdef ASSERT + assert(offset == frame_size_in_bytes, "consistency check"); +#endif return map; } // Pop the current frame and restore all the registers that we saved. -void RegisterSaver::restore_live_registers(MacroAssembler* masm, RegisterSet reg_set) { +void RegisterSaver::restore_live_registers(MacroAssembler* masm, RegisterSet reg_set, bool save_vectors) { int offset; - const int register_save_offset = live_reg_frame_size(reg_set) - live_reg_save_size(reg_set); + const int vregstosave_num = save_vectors ? 
calculate_vregstosave_num() : 0; + const int register_save_offset = live_reg_frame_size(reg_set, save_vectors) - (live_reg_save_size(reg_set) + vregstosave_num * v_reg_size); /* same save-area start as computed in save_live_registers */ Register first = noreg; Register last = noreg; @@ -517,6 +566,12 @@ void RegisterSaver::restore_live_registers(MacroAssembler* masm, RegisterSet reg assert(first != noreg, "Should spill at least one int reg."); __ z_lmg(first, last, first_offset, Z_SP); + for (int i = 0; i < vregstosave_num; i++, offset += v_reg_size) { /* reload the vector registers with VL, walking RegisterSaver_LiveVRegs in the same order as the save loop */ + int reg_num = RegisterSaver_LiveVRegs[i].reg_num; + + __ z_vl(as_VectorRegister(reg_num), Address(Z_SP, offset)); + } + // Pop the frame. __ pop_frame(); @@ -527,14 +582,12 @@ void RegisterSaver::restore_live_registers(MacroAssembler* masm, RegisterSet reg // Pop the current frame and restore the registers that might be holding a result. void RegisterSaver::restore_result_registers(MacroAssembler* masm) { - int i; - int offset; const int regstosave_num = sizeof(RegisterSaver_LiveRegs) / sizeof(RegisterSaver::LiveRegType); const int register_save_offset = live_reg_frame_size(all_registers) - live_reg_save_size(all_registers); // Restore all result registers (ints and floats). - offset = register_save_offset; + int offset = register_save_offset; for (int i = 0; i < regstosave_num; i++, offset += reg_size) { int reg_num = RegisterSaver_LiveRegs[i].reg_num; int reg_type = RegisterSaver_LiveRegs[i].reg_type; @@ -557,6 +610,7 @@ void RegisterSaver::restore_result_registers(MacroAssembler* masm) { ShouldNotReachHere(); } } + assert(offset == live_reg_frame_size(all_registers), "consistency check"); /* the loop must have walked the whole all_registers save area (frame size computed without a vector area) */ } // --------------------------------------------------------------------------- @@ -980,8 +1034,8 @@ static void gen_special_dispatch(MacroAssembler *masm, // Is the size of a vector size (in bytes) bigger than a size saved by default? // 8 bytes registers are saved by default on z/Architecture. 
 bool SharedRuntime::is_wide_vector(int size) { - // Note, MaxVectorSize == 8 on this platform. - assert(size <= 8, "%d bytes vectors are not supported", size); + // Note, on this platform MaxVectorSize is 16 when SuperwordUseVX is set and 8 otherwise. + assert(size <= (SuperwordUseVX ? 16 : 8), "%d bytes vectors are not supported", size); return size > 8; } @@ -2865,8 +2919,9 @@ SafepointBlob* SharedRuntime::generate_handler_blob(SharedStubId id, address cal __ z_lg(Z_R14, Address(Z_thread, JavaThread::saved_exception_pc_offset())); } + bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP); /* only the vector-loop poll handler saves and restores the vector registers */ // Save registers, fpu state, and flags - map = RegisterSaver::save_live_registers(masm, RegisterSaver::all_registers); + map = RegisterSaver::save_live_registers(masm, RegisterSaver::all_registers, Z_R14, save_vectors); /* return_pc register is now passed explicitly so that save_vectors can follow it */ if (!cause_return) { // Keep a copy of the return pc to detect if it gets modified. @@ -2898,7 +2953,7 @@ SafepointBlob* SharedRuntime::generate_handler_blob(SharedStubId id, address cal // Pending exception case, used (sporadically) by // api/java_lang/Thread.State/index#ThreadState et al. - RegisterSaver::restore_live_registers(masm, RegisterSaver::all_registers); + RegisterSaver::restore_live_registers(masm, RegisterSaver::all_registers, save_vectors); // Jump to forward_exception_entry, with the issuing PC in Z_R14 // so it looks like the original nmethod called forward_exception_entry. 
@@ -2911,7 +2966,7 @@ SafepointBlob* SharedRuntime::generate_handler_blob(SharedStubId id, address cal if (!cause_return) { Label no_adjust; // If our stashed return pc was modified by the runtime we avoid touching it - const int offset_of_return_pc = _z_common_abi(return_pc) + RegisterSaver::live_reg_frame_size(RegisterSaver::all_registers); + const int offset_of_return_pc = _z_common_abi(return_pc) + RegisterSaver::live_reg_frame_size(RegisterSaver::all_registers, save_vectors); /* must use the same save_vectors value as the frame that was pushed */ __ z_cg(Z_R6, offset_of_return_pc, Z_SP); __ z_brne(no_adjust); @@ -2924,7 +2979,7 @@ SafepointBlob* SharedRuntime::generate_handler_blob(SharedStubId id, address cal } // Normal exit, restore registers and exit. - RegisterSaver::restore_live_registers(masm, RegisterSaver::all_registers); + RegisterSaver::restore_live_registers(masm, RegisterSaver::all_registers, save_vectors); __ z_br(Z_R14); @@ -2932,7 +2987,7 @@ SafepointBlob* SharedRuntime::generate_handler_blob(SharedStubId id, address cal masm->flush(); // Fill-out other meta info - return SafepointBlob::create(&buffer, oop_maps, RegisterSaver::live_reg_frame_size(RegisterSaver::all_registers)/wordSize); + return SafepointBlob::create(&buffer, oop_maps, RegisterSaver::live_reg_frame_size(RegisterSaver::all_registers, save_vectors)/wordSize); } diff --git a/src/hotspot/cpu/s390/vm_version_s390.cpp b/src/hotspot/cpu/s390/vm_version_s390.cpp index 4b17ff4594ccf..8ab5affd309e0 100644 --- a/src/hotspot/cpu/s390/vm_version_s390.cpp +++ b/src/hotspot/cpu/s390/vm_version_s390.cpp @@ -97,7 +97,23 @@ void VM_Version::initialize() { intx cache_line_size = Dcache_lineSize(0); #ifdef COMPILER2 - MaxVectorSize = 8; + int model_ix = get_model_index(); /* NOTE(review): judging by the warning texts, index 7 appears to be z13 and indices above 7 z14 or newer - confirm against get_model_index */ + + if ( model_ix >= 7 ) { + if (FLAG_IS_DEFAULT(SuperwordUseVX)) { + FLAG_SET_ERGO(SuperwordUseVX, true); + } + if (model_ix > 7 && FLAG_IS_DEFAULT(UseSFPV) && SuperwordUseVX) { + FLAG_SET_ERGO(UseSFPV, true); + } else if (model_ix == 7 && UseSFPV) { + warning("UseSFPV specified, but needs at least 
Z14."); + FLAG_SET_DEFAULT(UseSFPV, false); + } + } else if (SuperwordUseVX) { + warning("SuperwordUseVX specified, but needs at least Z13."); + FLAG_SET_DEFAULT(SuperwordUseVX, false); + } + MaxVectorSize = SuperwordUseVX ? 16 : 8; /* 16-byte vectors only when SuperwordUseVX is still set after the checks in this block */ #endif if (has_PrefetchRaw()) { diff --git a/src/hotspot/cpu/s390/vmreg_s390.cpp b/src/hotspot/cpu/s390/vmreg_s390.cpp index 239b68513b96c..5bec8313a48c4 100644 --- a/src/hotspot/cpu/s390/vmreg_s390.cpp +++ b/src/hotspot/cpu/s390/vmreg_s390.cpp @@ -43,6 +43,16 @@ void VMRegImpl::set_regName() { regName[i++] = freg->name(); freg = freg->successor(); } + + VectorRegister vreg = ::as_VectorRegister(0); + for (; i < ConcreteRegisterImpl::max_vr;) { /* each vector register fills four consecutive regName entries with the same name; presumably one per VMReg slot of the register - confirm against max_slots_per_register */ + regName[i++] = vreg->name(); + regName[i++] = vreg->name(); + regName[i++] = vreg->name(); + regName[i++] = vreg->name(); + vreg = vreg->successor(); + } + for (; i < ConcreteRegisterImpl::number_of_registers; i ++) { regName[i] = "NON-GPR-XMM"; } diff --git a/src/hotspot/cpu/s390/vmreg_s390.hpp b/src/hotspot/cpu/s390/vmreg_s390.hpp index 3dd1bd9a16cbd..eb601f693abc6 100644 --- a/src/hotspot/cpu/s390/vmreg_s390.hpp +++ b/src/hotspot/cpu/s390/vmreg_s390.hpp @@ -35,14 +35,26 @@ inline bool is_FloatRegister() { value() < ConcreteRegisterImpl::max_fpr; } +inline bool is_VectorRegister() { /* true for VMReg values from max_fpr (inclusive) up to max_vr (exclusive) */ + return value() >= ConcreteRegisterImpl::max_fpr && + value() < ConcreteRegisterImpl::max_vr; +} + inline Register as_Register() { assert(is_Register() && is_even(value()), "even-aligned GPR name"); - return ::as_Register(value() >> 1); + return ::as_Register(value() / Register::max_slots_per_register); } inline FloatRegister as_FloatRegister() { assert(is_FloatRegister() && is_even(value()), "must be"); - return ::as_FloatRegister((value() - ConcreteRegisterImpl::max_gpr) >> 1); + return ::as_FloatRegister((value() - ConcreteRegisterImpl::max_gpr) / + FloatRegister::max_slots_per_register); +} + +inline VectorRegister as_VectorRegister() { /* unlike as_Register and as_FloatRegister, no slot-alignment check is asserted here; the division truncates to the containing register */ + assert(is_VectorRegister(), "must be"); + return 
::as_VectorRegister((value() - ConcreteRegisterImpl::max_fpr) / + VectorRegister::max_slots_per_register); } inline bool is_concrete() { diff --git a/src/hotspot/cpu/s390/vmreg_s390.inline.hpp b/src/hotspot/cpu/s390/vmreg_s390.inline.hpp index 593a0d480454e..b03a66b3086e0 100644 --- a/src/hotspot/cpu/s390/vmreg_s390.inline.hpp +++ b/src/hotspot/cpu/s390/vmreg_s390.inline.hpp @@ -27,15 +27,21 @@ #define CPU_S390_VMREG_S390_INLINE_HPP inline VMReg Register::as_VMReg() const { - return VMRegImpl::as_VMReg(encoding() << 1); + return VMRegImpl::as_VMReg(encoding() * Register::max_slots_per_register); /* replaces the hard-coded shift by one with the per-class slot count */ } inline VMReg FloatRegister::as_VMReg() const { - return VMRegImpl::as_VMReg((encoding() << 1) + ConcreteRegisterImpl::max_gpr); + return VMRegImpl::as_VMReg((encoding() * FloatRegister::max_slots_per_register) + + ConcreteRegisterImpl::max_gpr); +} + +inline VMReg VectorRegister::as_VMReg() const { /* vector register slots start at max_fpr, with max_slots_per_register slots per register */ + return VMRegImpl::as_VMReg((encoding() * VectorRegister::max_slots_per_register) + + ConcreteRegisterImpl::max_fpr); } inline VMReg ConditionRegister::as_VMReg() const { - return VMRegImpl::as_VMReg((encoding() << 1) + ConcreteRegisterImpl::max_fpr); + return VMRegImpl::as_VMReg(encoding() + ConcreteRegisterImpl::max_vr); /* condition register numbering is based at max_vr, behind the vector register range, with one slot per encoding */ } #endif // CPU_S390_VMREG_S390_INLINE_HPP diff --git a/src/hotspot/share/adlc/output_c.cpp b/src/hotspot/share/adlc/output_c.cpp index 804e8f1a4e6c3..cc6ed278b4901 100644 --- a/src/hotspot/share/adlc/output_c.cpp +++ b/src/hotspot/share/adlc/output_c.cpp @@ -2358,6 +2358,9 @@ class DefineEmitState { if (strcmp(rep_var,"$VectorRegister") == 0) return "as_VectorRegister"; if (strcmp(rep_var,"$VectorSRegister") == 0) return "as_VectorSRegister"; #endif +#if defined(S390) /* on S390 the VectorRegister replacement variable maps to the as_VectorRegister accessor */ + if (strcmp(rep_var,"$VectorRegister") == 0) return "as_VectorRegister"; +#endif #if defined(AARCH64) if (strcmp(rep_var,"$PRegister") == 0) return "as_PRegister"; #endif diff --git a/src/hotspot/share/opto/machnode.hpp b/src/hotspot/share/opto/machnode.hpp index 6fcbabdab90f3..4ac91175f78ed 
100644 --- a/src/hotspot/share/opto/machnode.hpp +++ b/src/hotspot/share/opto/machnode.hpp @@ -134,6 +134,14 @@ class MachOper : public ResourceObj { return ::as_VectorSRegister(reg(ra_, node, idx)); } #endif +#if defined(S390) /* S390 accessors that turn this operand's register number (reg(...)) into a VectorRegister */ + VectorRegister as_VectorRegister(PhaseRegAlloc *ra_, const Node *node) const { + return ::as_VectorRegister(reg(ra_, node)); + } + VectorRegister as_VectorRegister(PhaseRegAlloc *ra_, const Node *node, int idx) const { /* same, using the reg() overload that takes idx */ + return ::as_VectorRegister(reg(ra_, node, idx)); + } +#endif #if defined(AARCH64) PRegister as_PRegister(PhaseRegAlloc* ra_, const Node* node) const { return ::as_PRegister(reg(ra_, node)); diff --git a/src/hotspot/share/opto/type.cpp b/src/hotspot/share/opto/type.cpp index 70cd46c900dc2..407a4a20a9bda 100644 --- a/src/hotspot/share/opto/type.cpp +++ b/src/hotspot/share/opto/type.cpp @@ -77,7 +77,7 @@ const Type::TypeInfo Type::_type_info[Type::lastype] = { { Bad, T_ILLEGAL, "vectora:", false, Op_VecA, relocInfo::none }, // VectorA. { Bad, T_ILLEGAL, "vectors:", false, 0, relocInfo::none }, // VectorS { Bad, T_ILLEGAL, "vectord:", false, Op_RegL, relocInfo::none }, // VectorD - { Bad, T_ILLEGAL, "vectorx:", false, 0, relocInfo::none }, // VectorX + { Bad, T_ILLEGAL, "vectorx:", false, Op_VecX, relocInfo::none }, // VectorX (Op_VecX instead of 0 in this table variant) { Bad, T_ILLEGAL, "vectory:", false, 0, relocInfo::none }, // VectorY { Bad, T_ILLEGAL, "vectorz:", false, 0, relocInfo::none }, // VectorZ #else // all other