From 33d73fdf1d5c4a8ea230acc778eb8b7cf9e9d87d Mon Sep 17 00:00:00 2001 From: Daniel Barry Date: Mon, 4 Nov 2024 13:43:49 -0800 Subject: [PATCH] cat: updates in vector-FLOPs benchmarks Include kernels that perform scalar floating-point operations. These changes have been tested on the Intel Sapphire Rapids and IBM POWER10 architectures. --- src/counter_analysis_toolkit/Makefile | 58 +-- src/counter_analysis_toolkit/cat_arch.h | 45 +- src/counter_analysis_toolkit/vec.c | 410 ++++++++++++------ src/counter_analysis_toolkit/vec_fma_dp.c | 12 +- src/counter_analysis_toolkit/vec_fma_hp.c | 33 +- src/counter_analysis_toolkit/vec_fma_sp.c | 14 +- src/counter_analysis_toolkit/vec_nonfma_dp.c | 12 +- src/counter_analysis_toolkit/vec_nonfma_hp.c | 33 +- src/counter_analysis_toolkit/vec_nonfma_sp.c | 12 +- .../vec_scalar_verify.c | 338 +++++++++++++-- .../vec_scalar_verify.h | 50 ++- 11 files changed, 692 insertions(+), 325 deletions(-) diff --git a/src/counter_analysis_toolkit/Makefile b/src/counter_analysis_toolkit/Makefile index 45ec4ae58..31f1a4405 100644 --- a/src/counter_analysis_toolkit/Makefile +++ b/src/counter_analysis_toolkit/Makefile @@ -49,9 +49,9 @@ endif ifeq ($(ARCH),POWER) FLOP+=-maltivec -DPOWER VECSRC=vec_fma_hp.o vec_fma_sp.o vec_fma_dp.o vec_nonfma_hp.o vec_nonfma_sp.o vec_nonfma_dp.o - VEC=-maltivec -O0 -DPOWER - VEC_FMA=-maltivec -O0 -DPOWER - VEC_ALL=$(VEC) -O0 -DPOWER + VEC=-maltivec -DPOWER + VEC_FMA=-maltivec -DPOWER + VEC_ALL=$(VEC) -DPOWER endif ifeq ($(ARCH),ARM) FLOP+=-march=armv8.2-a+fp16 -DARM @@ -109,58 +109,58 @@ weak_symbols.o: weak_symbols.c vec.h -$(CC) -c $(CFLAGS) weak_symbols.c vec.o: vec.c vec.h - -$(CC) -c $(CFLAGS) $(INCFLAGS) -D$(ARCH) $(VEC_META) vec.c + -$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) -D$(ARCH) $(VEC_META) vec.c vec_scalar_verify.o: vec_scalar_verify.c vec_scalar_verify.h cat_arch.h - -$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC_ALL) vec_scalar_verify.c + -$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC_ALL) vec_scalar_verify.c vec_fma_hp.o: vec_fma_hp.c vec_scalar_verify.h - -$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC_FMA) vec_fma_hp.c + -$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC_FMA) vec_fma_hp.c vec_fma_hp: vec_fma_hp.c vec_scalar_verify.h - -$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC128_FMA) vec_fma_hp.c -o vec_fma_hp-128B.o - -$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC256_FMA) vec_fma_hp.c -o vec_fma_hp-256B.o - -$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC512_FMA) vec_fma_hp.c -o vec_fma_hp-512B.o + -$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC128_FMA) vec_fma_hp.c -o vec_fma_hp-128B.o + -$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC256_FMA) vec_fma_hp.c -o vec_fma_hp-256B.o + -$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC512_FMA) vec_fma_hp.c -o vec_fma_hp-512B.o vec_fma_sp.o: vec_fma_sp.c vec_scalar_verify.h - -$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC_FMA) vec_fma_sp.c + -$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC_FMA) vec_fma_sp.c vec_fma_sp: vec_fma_sp.c vec_scalar_verify.h - -$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC128_FMA) vec_fma_sp.c -o vec_fma_sp-128B.o - -$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC256_FMA) vec_fma_sp.c -o vec_fma_sp-256B.o - -$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC512_FMA) vec_fma_sp.c -o vec_fma_sp-512B.o + -$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC128_FMA) vec_fma_sp.c -o vec_fma_sp-128B.o + -$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC256_FMA) vec_fma_sp.c -o vec_fma_sp-256B.o + -$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC512_FMA) vec_fma_sp.c -o vec_fma_sp-512B.o vec_fma_dp.o: vec_fma_dp.c vec_scalar_verify.h - -$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC_FMA) vec_fma_dp.c + -$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC_FMA) vec_fma_dp.c vec_fma_dp: vec_fma_dp.c vec_scalar_verify.h - -$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC128_FMA) vec_fma_dp.c -o vec_fma_dp-128B.o - -$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC256_FMA) vec_fma_dp.c -o vec_fma_dp-256B.o - -$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC512_FMA) vec_fma_dp.c -o vec_fma_dp-512B.o + -$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC128_FMA) vec_fma_dp.c -o vec_fma_dp-128B.o + -$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC256_FMA) vec_fma_dp.c -o vec_fma_dp-256B.o + -$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC512_FMA) vec_fma_dp.c -o vec_fma_dp-512B.o vec_nonfma_hp.o: vec_nonfma_hp.c vec_scalar_verify.h - -$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC) vec_nonfma_hp.c + -$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC) vec_nonfma_hp.c vec_nonfma_hp: vec_nonfma_hp.c vec_scalar_verify.h - -$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC128) vec_nonfma_hp.c -o vec_nonfma_hp-128B.o - -$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC256) vec_nonfma_hp.c -o vec_nonfma_hp-256B.o - -$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC512) vec_nonfma_hp.c -o vec_nonfma_hp-512B.o + -$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC128) vec_nonfma_hp.c -o vec_nonfma_hp-128B.o + -$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC256) vec_nonfma_hp.c -o vec_nonfma_hp-256B.o + -$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC512) vec_nonfma_hp.c -o vec_nonfma_hp-512B.o vec_nonfma_sp.o: vec_nonfma_sp.c vec_scalar_verify.h - -$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC) vec_nonfma_sp.c + -$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC) vec_nonfma_sp.c vec_nonfma_sp: vec_nonfma_sp.c vec_scalar_verify.h - -$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC128) vec_nonfma_sp.c -o vec_nonfma_sp-128B.o - -$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC256) vec_nonfma_sp.c -o vec_nonfma_sp-256B.o - -$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC512) vec_nonfma_sp.c -o vec_nonfma_sp-512B.o + -$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC128) vec_nonfma_sp.c -o vec_nonfma_sp-128B.o + -$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC256) vec_nonfma_sp.c -o vec_nonfma_sp-256B.o + -$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC512) vec_nonfma_sp.c -o vec_nonfma_sp-512B.o vec_nonfma_dp.o: vec_nonfma_dp.c vec_scalar_verify.h - -$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC) vec_nonfma_dp.c + -$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC) vec_nonfma_dp.c vec_nonfma_dp: vec_nonfma_dp.c vec_scalar_verify.h - -$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC128) vec_nonfma_dp.c -o vec_nonfma_dp-128B.o - -$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC256) vec_nonfma_dp.c -o vec_nonfma_dp-256B.o - -$(CC) -c $(CFLAGS) $(INCFLAGS) $(VEC512) vec_nonfma_dp.c -o vec_nonfma_dp-512B.o + -$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC128) vec_nonfma_dp.c -o vec_nonfma_dp-128B.o + -$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC256) vec_nonfma_dp.c -o vec_nonfma_dp-256B.o + -$(CC) -c $(CFLAGS) $(OPT1) $(INCFLAGS) $(VEC512) vec_nonfma_dp.c -o vec_nonfma_dp-512B.o cat_collect: $(CC) $(CFLAGS) -fopenmp $(INCFLAGS) main.c $(wildcard *.o) -o cat_collect $(LDFLAGS) diff --git a/src/counter_analysis_toolkit/cat_arch.h b/src/counter_analysis_toolkit/cat_arch.h index b42273f26..db6b62e1e 100644 --- a/src/counter_analysis_toolkit/cat_arch.h +++ b/src/counter_analysis_toolkit/cat_arch.h @@ -123,38 +123,17 @@ typedef float64x2_t DP_VEC_TYPE; #define ADD_VEC_SH(_I_,_J_) vaddh_f16( _I_ , _J_ ); #define MUL_VEC_SH(_I_,_J_) vmulh_f16( _I_ , _J_ ); #define SQRT_VEC_SH(_I_) vsqrth_f16( _I_ ); -#define FMA_VEC_SH(_out_,_I_,_J_,_K_) {\ - HP_VEC_TYPE arg1 = SET_VEC_PH(_I_);\ - HP_VEC_TYPE arg2 = SET_VEC_PH(_J_);\ - HP_VEC_TYPE arg3 = SET_VEC_PH(_K_);\ - HP_VEC_TYPE argTmp;\ - argTmp = FMA_VEC_PH( arg1 , arg2 , arg3 );\ - _out_ = ((half*)&(argTmp))[0];\ -} +#define FMA_VEC_SH(_out_,_I_,_J_,_K_) _out_ = _I_ * _J_ + _K_; #define SET_VEC_SS(_I_) _I_ ; #define ADD_VEC_SS(_I_,_J_) _I_ + _J_ ; #define MUL_VEC_SS(_I_,_J_) _I_ * _J_ ; -#define FMA_VEC_SS(_out_,_I_,_J_,_K_) {\ - SP_VEC_TYPE arg1 = SET_VEC_PS(_I_);\ - SP_VEC_TYPE arg2 = SET_VEC_PS(_J_);\ - SP_VEC_TYPE arg3 = SET_VEC_PS(_K_);\ - SP_VEC_TYPE argTmp;\ - argTmp = FMA_VEC_PS( arg1 , arg2 , arg3 );\ - _out_ = ((SP_SCALAR_TYPE*)&(argTmp))[0];\ -} +#define FMA_VEC_SS(_out_,_I_,_J_,_K_) _out_ = _I_ * _J_ + _K_; #define SET_VEC_SD(_I_) _I_ ; #define ADD_VEC_SD(_I_,_J_) _I_ + _J_ ; #define MUL_VEC_SD(_I_,_J_) _I_ * _J_ ; -#define FMA_VEC_SD(_out_,_I_,_J_,_K_) {\ - DP_VEC_TYPE arg1 = SET_VEC_PD(_I_);\ - DP_VEC_TYPE arg2 = SET_VEC_PD(_J_);\ - DP_VEC_TYPE arg3 = SET_VEC_PD(_K_);\ - DP_VEC_TYPE argTmp;\ - argTmp = FMA_VEC_PD( arg1 , arg2 , arg3 );\ - _out_ = ((DP_SCALAR_TYPE*)&(argTmp))[0];\ -} +#define FMA_VEC_SD(_out_,_I_,_J_,_K_) _out_ = _I_ * _J_ + _K_; #elif defined(POWER) void test_hp_power_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp ); @@ -187,25 +166,11 @@ typedef __vector double DP_VEC_TYPE; #define SET_VEC_SS(_I_) _I_ ; #define ADD_VEC_SS(_I_,_J_) _I_ + _J_ ; #define MUL_VEC_SS(_I_,_J_) _I_ * _J_ ; -#define FMA_VEC_SS(_out_,_I_,_J_,_K_) {\ - SP_VEC_TYPE arg1 = SET_VEC_PS(_I_);\ - SP_VEC_TYPE arg2 = SET_VEC_PS(_J_);\ - SP_VEC_TYPE arg3 = SET_VEC_PS(_K_);\ - SP_VEC_TYPE argTmp;\ - argTmp = FMA_VEC_PS( arg1 , arg2 , arg3 );\ - _out_ = ((SP_SCALAR_TYPE*)&(argTmp))[0];\ -} +#define FMA_VEC_SS(_out_,_I_,_J_,_K_) _out_ = _I_ * _J_ + _K_; #define SET_VEC_SD(_I_) _I_ ; #define ADD_VEC_SD(_I_,_J_) _I_ + _J_ ; #define MUL_VEC_SD(_I_,_J_) _I_ * _J_ ; -#define FMA_VEC_SD(_out_,_I_,_J_,_K_) {\ - DP_VEC_TYPE arg1 = SET_VEC_PD(_I_);\ - DP_VEC_TYPE arg2 = SET_VEC_PD(_J_);\ - DP_VEC_TYPE arg3 = SET_VEC_PD(_K_);\ - DP_VEC_TYPE argTmp;\ - argTmp = FMA_VEC_PD( arg1 , arg2 , arg3 );\ - _out_ = ((DP_SCALAR_TYPE*)&(argTmp))[0];\ -} +#define FMA_VEC_SD(_out_,_I_,_J_,_K_) _out_ = _I_ * _J_ + _K_; #endif diff --git a/src/counter_analysis_toolkit/vec.c b/src/counter_analysis_toolkit/vec.c index 830e4eed5..e33f148b1 100644 --- a/src/counter_analysis_toolkit/vec.c +++ b/src/counter_analysis_toolkit/vec.c @@ -5,6 +5,7 @@ #include #include "vec.h" #include "cat_arch.h" +#include "vec_scalar_verify.h" void vec_driver(char* papi_event_name, hw_desc_t *hw_desc, char* outdir) { @@ -38,107 +39,162 @@ void vec_driver(char* papi_event_name, hw_desc_t *hw_desc, char* outdir) goto error1; } + // Header to label the columns in the output file. + fprintf(ofp_papi, "# ExpectedInstrs EventCount\n"); + #if defined(X86) #if defined(AVX128_AVAIL) - // Non-FMA instruction trials. - test_hp_x86_128B_VEC( 24, 1000, EventSet, ofp_papi ); - test_hp_x86_128B_VEC( 48, 1000, EventSet, ofp_papi ); - test_hp_x86_128B_VEC( 96, 1000, EventSet, ofp_papi ); - -#if defined(AVX256_AVAIL) - test_hp_x86_256B_VEC( 24, 1000, EventSet, ofp_papi ); - test_hp_x86_256B_VEC( 48, 1000, EventSet, ofp_papi ); - test_hp_x86_256B_VEC( 96, 1000, EventSet, ofp_papi ); - -#if defined(AVX512_AVAIL) - test_hp_x86_512B_VEC( 24, 1000, EventSet, ofp_papi ); - test_hp_x86_512B_VEC( 48, 1000, EventSet, ofp_papi ); - test_hp_x86_512B_VEC( 96, 1000, EventSet, ofp_papi ); -#endif -#endif - - test_sp_x86_128B_VEC( 24, 1000, EventSet, ofp_papi ); - test_sp_x86_128B_VEC( 48, 1000, EventSet, ofp_papi ); - test_sp_x86_128B_VEC( 96, 1000, EventSet, ofp_papi ); - -#if defined(AVX256_AVAIL) - test_sp_x86_256B_VEC( 24, 1000, EventSet, ofp_papi ); - test_sp_x86_256B_VEC( 48, 1000, EventSet, ofp_papi ); - test_sp_x86_256B_VEC( 96, 1000, EventSet, ofp_papi ); - -#if defined(AVX512_AVAIL) - test_sp_x86_512B_VEC( 24, 1000, EventSet, ofp_papi ); - test_sp_x86_512B_VEC( 48, 1000, EventSet, ofp_papi ); - test_sp_x86_512B_VEC( 96, 1000, EventSet, ofp_papi ); -#endif -#endif - - test_dp_x86_128B_VEC( 24, 1000, EventSet, ofp_papi ); - test_dp_x86_128B_VEC( 48, 1000, EventSet, ofp_papi ); - test_dp_x86_128B_VEC( 96, 1000, EventSet, ofp_papi ); - -#if defined(AVX256_AVAIL) - test_dp_x86_256B_VEC( 24, 1000, EventSet, ofp_papi ); - test_dp_x86_256B_VEC( 48, 1000, EventSet, ofp_papi ); - test_dp_x86_256B_VEC( 96, 1000, EventSet, ofp_papi ); - -#if defined(AVX512_AVAIL) - test_dp_x86_512B_VEC( 24, 1000, EventSet, ofp_papi ); - test_dp_x86_512B_VEC( 48, 1000, EventSet, ofp_papi ); - test_dp_x86_512B_VEC( 96, 1000, EventSet, ofp_papi ); -#endif -#endif - - // FMA instruction trials. - test_hp_x86_128B_VEC_FMA( 12, 1000, EventSet, ofp_papi ); - test_hp_x86_128B_VEC_FMA( 24, 1000, EventSet, ofp_papi ); - test_hp_x86_128B_VEC_FMA( 48, 1000, EventSet, ofp_papi ); - -#if defined(AVX256_AVAIL) - test_hp_x86_256B_VEC_FMA( 12, 1000, EventSet, ofp_papi ); - test_hp_x86_256B_VEC_FMA( 24, 1000, EventSet, ofp_papi ); - test_hp_x86_256B_VEC_FMA( 48, 1000, EventSet, ofp_papi ); - -#if defined(AVX512_AVAIL) - test_hp_x86_512B_VEC_FMA( 12, 1000, EventSet, ofp_papi ); - test_hp_x86_512B_VEC_FMA( 24, 1000, EventSet, ofp_papi ); - test_hp_x86_512B_VEC_FMA( 48, 1000, EventSet, ofp_papi ); -#endif -#endif - - test_sp_x86_128B_VEC_FMA( 12, 1000, EventSet, ofp_papi ); - test_sp_x86_128B_VEC_FMA( 24, 1000, EventSet, ofp_papi ); - test_sp_x86_128B_VEC_FMA( 48, 1000, EventSet, ofp_papi ); - -#if defined(AVX256_AVAIL) - test_sp_x86_256B_VEC_FMA( 12, 1000, EventSet, ofp_papi ); - test_sp_x86_256B_VEC_FMA( 24, 1000, EventSet, ofp_papi ); - test_sp_x86_256B_VEC_FMA( 48, 1000, EventSet, ofp_papi ); - -#if defined(AVX512_AVAIL) - test_sp_x86_512B_VEC_FMA( 12, 1000, EventSet, ofp_papi ); - test_sp_x86_512B_VEC_FMA( 24, 1000, EventSet, ofp_papi ); - test_sp_x86_512B_VEC_FMA( 48, 1000, EventSet, ofp_papi ); -#endif -#endif - - test_dp_x86_128B_VEC_FMA( 12, 1000, EventSet, ofp_papi ); - test_dp_x86_128B_VEC_FMA( 24, 1000, EventSet, ofp_papi ); - test_dp_x86_128B_VEC_FMA( 48, 1000, EventSet, ofp_papi ); - -#if defined(AVX256_AVAIL) - test_dp_x86_256B_VEC_FMA( 12, 1000, EventSet, ofp_papi ); - test_dp_x86_256B_VEC_FMA( 24, 1000, EventSet, ofp_papi ); - test_dp_x86_256B_VEC_FMA( 48, 1000, EventSet, ofp_papi ); - -#if defined(AVX512_AVAIL) - test_dp_x86_512B_VEC_FMA( 12, 1000, EventSet, ofp_papi ); - test_dp_x86_512B_VEC_FMA( 24, 1000, EventSet, ofp_papi ); - test_dp_x86_512B_VEC_FMA( 48, 1000, EventSet, ofp_papi ); -#endif -#endif + // HP Non-FMA instruction trials. + fprintf(ofp_papi, "# HP Non-FMA Scalar\n"); + test_hp_scalar_VEC_24( ITER, EventSet, ofp_papi ); + test_hp_scalar_VEC_48( ITER, EventSet, ofp_papi ); + test_hp_scalar_VEC_96( ITER, EventSet, ofp_papi ); + + fprintf(ofp_papi, "# HP Non-FMA Vector AVX128\n"); + test_hp_x86_128B_VEC( 24, ITER, EventSet, ofp_papi ); + test_hp_x86_128B_VEC( 48, ITER, EventSet, ofp_papi ); + test_hp_x86_128B_VEC( 96, ITER, EventSet, ofp_papi ); + + #if defined(AVX256_AVAIL) + fprintf(ofp_papi, "# HP Non-FMA Vector AVX256\n"); + test_hp_x86_256B_VEC( 24, ITER, EventSet, ofp_papi ); + test_hp_x86_256B_VEC( 48, ITER, EventSet, ofp_papi ); + test_hp_x86_256B_VEC( 96, ITER, EventSet, ofp_papi ); + + #if defined(AVX512_AVAIL) + fprintf(ofp_papi, "# HP Non-FMA Vector AVX512\n"); + test_hp_x86_512B_VEC( 24, ITER, EventSet, ofp_papi ); + test_hp_x86_512B_VEC( 48, ITER, EventSet, ofp_papi ); + test_hp_x86_512B_VEC( 96, ITER, EventSet, ofp_papi ); + #endif + #endif + + // SP Non-FMA instruction trials. + fprintf(ofp_papi, "# SP Non-FMA Scalar\n"); + test_sp_scalar_VEC_24( ITER, EventSet, ofp_papi ); + test_sp_scalar_VEC_48( ITER, EventSet, ofp_papi ); + test_sp_scalar_VEC_96( ITER, EventSet, ofp_papi ); + + fprintf(ofp_papi, "# SP Non-FMA Vector AVX128\n"); + test_sp_x86_128B_VEC( 24, ITER, EventSet, ofp_papi ); + test_sp_x86_128B_VEC( 48, ITER, EventSet, ofp_papi ); + test_sp_x86_128B_VEC( 96, ITER, EventSet, ofp_papi ); + + #if defined(AVX256_AVAIL) + fprintf(ofp_papi, "# SP Non-FMA Vector AVX256\n"); + test_sp_x86_256B_VEC( 24, ITER, EventSet, ofp_papi ); + test_sp_x86_256B_VEC( 48, ITER, EventSet, ofp_papi ); + test_sp_x86_256B_VEC( 96, ITER, EventSet, ofp_papi ); + + #if defined(AVX512_AVAIL) + fprintf(ofp_papi, "# SP Non-FMA Vector AVX512\n"); + test_sp_x86_512B_VEC( 24, ITER, EventSet, ofp_papi ); + test_sp_x86_512B_VEC( 48, ITER, EventSet, ofp_papi ); + test_sp_x86_512B_VEC( 96, ITER, EventSet, ofp_papi ); + #endif + #endif + + // DP Non-FMA instruction trials. + fprintf(ofp_papi, "# DP Non-FMA Scalar\n"); + test_dp_scalar_VEC_24( ITER, EventSet, ofp_papi ); + test_dp_scalar_VEC_48( ITER, EventSet, ofp_papi ); + test_dp_scalar_VEC_96( ITER, EventSet, ofp_papi ); + + fprintf(ofp_papi, "# DP Non-FMA Vector AVX128\n"); + test_dp_x86_128B_VEC( 24, ITER, EventSet, ofp_papi ); + test_dp_x86_128B_VEC( 48, ITER, EventSet, ofp_papi ); + test_dp_x86_128B_VEC( 96, ITER, EventSet, ofp_papi ); + + #if defined(AVX256_AVAIL) + fprintf(ofp_papi, "# DP Non-FMA Vector AVX256\n"); + test_dp_x86_256B_VEC( 24, ITER, EventSet, ofp_papi ); + test_dp_x86_256B_VEC( 48, ITER, EventSet, ofp_papi ); + test_dp_x86_256B_VEC( 96, ITER, EventSet, ofp_papi ); + + #if defined(AVX512_AVAIL) + fprintf(ofp_papi, "# DP Non-FMA Vector AVX512\n"); + test_dp_x86_512B_VEC( 24, ITER, EventSet, ofp_papi ); + test_dp_x86_512B_VEC( 48, ITER, EventSet, ofp_papi ); + test_dp_x86_512B_VEC( 96, ITER, EventSet, ofp_papi ); + #endif + #endif + + // HP FMA instruction trials. + fprintf(ofp_papi, "# HP FMA Scalar\n"); + test_hp_scalar_VEC_FMA_12( ITER, EventSet, ofp_papi ); + test_hp_scalar_VEC_FMA_24( ITER, EventSet, ofp_papi ); + test_hp_scalar_VEC_FMA_48( ITER, EventSet, ofp_papi ); + + fprintf(ofp_papi, "# HP FMA Vector AVX128\n"); + test_hp_x86_128B_VEC_FMA( 12, ITER, EventSet, ofp_papi ); + test_hp_x86_128B_VEC_FMA( 24, ITER, EventSet, ofp_papi ); + test_hp_x86_128B_VEC_FMA( 48, ITER, EventSet, ofp_papi ); + + #if defined(AVX256_AVAIL) + fprintf(ofp_papi, "# HP FMA Vector AVX256\n"); + test_hp_x86_256B_VEC_FMA( 12, ITER, EventSet, ofp_papi ); + test_hp_x86_256B_VEC_FMA( 24, ITER, EventSet, ofp_papi ); + test_hp_x86_256B_VEC_FMA( 48, ITER, EventSet, ofp_papi ); + + #if defined(AVX512_AVAIL) + fprintf(ofp_papi, "# HP FMA Vector AVX512\n"); + test_hp_x86_512B_VEC_FMA( 12, ITER, EventSet, ofp_papi ); + test_hp_x86_512B_VEC_FMA( 24, ITER, EventSet, ofp_papi ); + test_hp_x86_512B_VEC_FMA( 48, ITER, EventSet, ofp_papi ); + #endif + #endif + + // SP FMA instruction trials. + fprintf(ofp_papi, "# SP FMA Scalar\n"); + test_sp_scalar_VEC_FMA_12( ITER, EventSet, ofp_papi ); + test_sp_scalar_VEC_FMA_24( ITER, EventSet, ofp_papi ); + test_sp_scalar_VEC_FMA_48( ITER, EventSet, ofp_papi ); + + fprintf(ofp_papi, "# SP FMA Vector AVX128\n"); + test_sp_x86_128B_VEC_FMA( 12, ITER, EventSet, ofp_papi ); + test_sp_x86_128B_VEC_FMA( 24, ITER, EventSet, ofp_papi ); + test_sp_x86_128B_VEC_FMA( 48, ITER, EventSet, ofp_papi ); + + #if defined(AVX256_AVAIL) + fprintf(ofp_papi, "# SP FMA Vector AVX256\n"); + test_sp_x86_256B_VEC_FMA( 12, ITER, EventSet, ofp_papi ); + test_sp_x86_256B_VEC_FMA( 24, ITER, EventSet, ofp_papi ); + test_sp_x86_256B_VEC_FMA( 48, ITER, EventSet, ofp_papi ); + + #if defined(AVX512_AVAIL) + fprintf(ofp_papi, "# SP FMA Vector AVX512\n"); + test_sp_x86_512B_VEC_FMA( 12, ITER, EventSet, ofp_papi ); + test_sp_x86_512B_VEC_FMA( 24, ITER, EventSet, ofp_papi ); + test_sp_x86_512B_VEC_FMA( 48, ITER, EventSet, ofp_papi ); + #endif + #endif + + // DP FMA instruction trials. + fprintf(ofp_papi, "# DP FMA Scalar\n"); + test_dp_scalar_VEC_FMA_12( ITER, EventSet, ofp_papi ); + test_dp_scalar_VEC_FMA_24( ITER, EventSet, ofp_papi ); + test_dp_scalar_VEC_FMA_48( ITER, EventSet, ofp_papi ); + + fprintf(ofp_papi, "# DP FMA Vector AVX128\n"); + test_dp_x86_128B_VEC_FMA( 12, ITER, EventSet, ofp_papi ); + test_dp_x86_128B_VEC_FMA( 24, ITER, EventSet, ofp_papi ); + test_dp_x86_128B_VEC_FMA( 48, ITER, EventSet, ofp_papi ); + + #if defined(AVX256_AVAIL) + fprintf(ofp_papi, "# DP FMA Vector AVX256\n"); + test_dp_x86_256B_VEC_FMA( 12, ITER, EventSet, ofp_papi ); + test_dp_x86_256B_VEC_FMA( 24, ITER, EventSet, ofp_papi ); + test_dp_x86_256B_VEC_FMA( 48, ITER, EventSet, ofp_papi ); + + #if defined(AVX512_AVAIL) + fprintf(ofp_papi, "# DP FMA Vector AVX512\n"); + test_dp_x86_512B_VEC_FMA( 12, ITER, EventSet, ofp_papi ); + test_dp_x86_512B_VEC_FMA( 24, ITER, EventSet, ofp_papi ); + test_dp_x86_512B_VEC_FMA( 48, ITER, EventSet, ofp_papi ); + #endif + #endif #else fprintf(stderr, "Vector FLOP benchmark is not supported on this architecture: AVX unavailable!\n"); @@ -147,58 +203,130 @@ void vec_driver(char* papi_event_name, hw_desc_t *hw_desc, char* outdir) #elif defined(ARM) // Non-FMA instruction trials. - test_hp_arm_VEC( 24, 1000, EventSet, ofp_papi ); - test_hp_arm_VEC( 48, 1000, EventSet, ofp_papi ); - test_hp_arm_VEC( 96, 1000, EventSet, ofp_papi ); - - test_sp_arm_VEC( 24, 1000, EventSet, ofp_papi ); - test_sp_arm_VEC( 48, 1000, EventSet, ofp_papi ); - test_sp_arm_VEC( 96, 1000, EventSet, ofp_papi ); - - test_dp_arm_VEC( 24, 1000, EventSet, ofp_papi ); - test_dp_arm_VEC( 48, 1000, EventSet, ofp_papi ); - test_dp_arm_VEC( 96, 1000, EventSet, ofp_papi ); + fprintf(ofp_papi, "# HP Non-FMA Scalar\n"); + test_hp_scalar_VEC_24( ITER, EventSet, ofp_papi ); + test_hp_scalar_VEC_48( ITER, EventSet, ofp_papi ); + test_hp_scalar_VEC_96( ITER, EventSet, ofp_papi ); + + fprintf(ofp_papi, "# HP Non-FMA Vector\n"); + test_hp_arm_VEC( 24, ITER, EventSet, ofp_papi ); + test_hp_arm_VEC( 48, ITER, EventSet, ofp_papi ); + test_hp_arm_VEC( 96, ITER, EventSet, ofp_papi ); + + fprintf(ofp_papi, "# SP Non-FMA Scalar\n"); + test_sp_scalar_VEC_24( ITER, EventSet, ofp_papi ); + test_sp_scalar_VEC_48( ITER, EventSet, ofp_papi ); + test_sp_scalar_VEC_96( ITER, EventSet, ofp_papi ); + + fprintf(ofp_papi, "# SP Non-FMA Vector\n"); + test_sp_arm_VEC( 24, ITER, EventSet, ofp_papi ); + test_sp_arm_VEC( 48, ITER, EventSet, ofp_papi ); + test_sp_arm_VEC( 96, ITER, EventSet, ofp_papi ); + + fprintf(ofp_papi, "# DP Non-FMA Scalar\n"); + test_dp_scalar_VEC_24( ITER, EventSet, ofp_papi ); + test_dp_scalar_VEC_48( ITER, EventSet, ofp_papi ); + test_dp_scalar_VEC_96( ITER, EventSet, ofp_papi ); + + fprintf(ofp_papi, "# DP Non-FMA Vector\n"); + test_dp_arm_VEC( 24, ITER, EventSet, ofp_papi ); + test_dp_arm_VEC( 48, ITER, EventSet, ofp_papi ); + test_dp_arm_VEC( 96, ITER, EventSet, ofp_papi ); // FMA instruction trials. - test_hp_arm_VEC_FMA( 12, 1000, EventSet, ofp_papi ); - test_hp_arm_VEC_FMA( 24, 1000, EventSet, ofp_papi ); - test_hp_arm_VEC_FMA( 48, 1000, EventSet, ofp_papi ); - - test_sp_arm_VEC_FMA( 12, 1000, EventSet, ofp_papi ); - test_sp_arm_VEC_FMA( 24, 1000, EventSet, ofp_papi ); - test_sp_arm_VEC_FMA( 48, 1000, EventSet, ofp_papi ); - - test_dp_arm_VEC_FMA( 12, 1000, EventSet, ofp_papi ); - test_dp_arm_VEC_FMA( 24, 1000, EventSet, ofp_papi ); - test_dp_arm_VEC_FMA( 48, 1000, EventSet, ofp_papi ); + fprintf(ofp_papi, "# HP FMA Scalar\n"); + test_hp_scalar_VEC_FMA_12( ITER, EventSet, ofp_papi ); + test_hp_scalar_VEC_FMA_24( ITER, EventSet, ofp_papi ); + test_hp_scalar_VEC_FMA_48( ITER, EventSet, ofp_papi ); + + fprintf(ofp_papi, "# HP FMA Vector\n"); + test_hp_arm_VEC_FMA( 12, ITER, EventSet, ofp_papi ); + test_hp_arm_VEC_FMA( 24, ITER, EventSet, ofp_papi ); + test_hp_arm_VEC_FMA( 48, ITER, EventSet, ofp_papi ); + + fprintf(ofp_papi, "# SP FMA Scalar\n"); + test_sp_scalar_VEC_FMA_12( ITER, EventSet, ofp_papi ); + test_sp_scalar_VEC_FMA_24( ITER, EventSet, ofp_papi ); + test_sp_scalar_VEC_FMA_48( ITER, EventSet, ofp_papi ); + + fprintf(ofp_papi, "# SP FMA Vector\n"); + test_sp_arm_VEC_FMA( 12, ITER, EventSet, ofp_papi ); + test_sp_arm_VEC_FMA( 24, ITER, EventSet, ofp_papi ); + test_sp_arm_VEC_FMA( 48, ITER, EventSet, ofp_papi ); + + fprintf(ofp_papi, "# DP FMA Scalar\n"); + test_dp_scalar_VEC_FMA_12( ITER, EventSet, ofp_papi ); + test_dp_scalar_VEC_FMA_24( ITER, EventSet, ofp_papi ); + test_dp_scalar_VEC_FMA_48( ITER, EventSet, ofp_papi ); + + fprintf(ofp_papi, "# DP FMA Vector\n"); + test_dp_arm_VEC_FMA( 12, ITER, EventSet, ofp_papi ); + test_dp_arm_VEC_FMA( 24, ITER, EventSet, ofp_papi ); + test_dp_arm_VEC_FMA( 48, ITER, EventSet, ofp_papi ); #elif defined(POWER) // Non-FMA instruction trials. - test_hp_power_VEC( 24, 1000, EventSet, ofp_papi ); - test_hp_power_VEC( 48, 1000, EventSet, ofp_papi ); - test_hp_power_VEC( 96, 1000, EventSet, ofp_papi ); - - test_sp_power_VEC( 24, 1000, EventSet, ofp_papi ); - test_sp_power_VEC( 48, 1000, EventSet, ofp_papi ); - test_sp_power_VEC( 96, 1000, EventSet, ofp_papi ); - - test_dp_power_VEC( 24, 1000, EventSet, ofp_papi ); - test_dp_power_VEC( 48, 1000, EventSet, ofp_papi ); - test_dp_power_VEC( 96, 1000, EventSet, ofp_papi ); + fprintf(ofp_papi, "# HP Non-FMA Scalar\n"); + test_hp_scalar_VEC_24( ITER, EventSet, ofp_papi ); + test_hp_scalar_VEC_48( ITER, EventSet, ofp_papi ); + test_hp_scalar_VEC_96( ITER, EventSet, ofp_papi ); + + fprintf(ofp_papi, "# HP Non-FMA Vector\n"); + test_hp_power_VEC( 24, ITER, EventSet, ofp_papi ); + test_hp_power_VEC( 48, ITER, EventSet, ofp_papi ); + test_hp_power_VEC( 96, ITER, EventSet, ofp_papi ); + + fprintf(ofp_papi, "# SP Non-FMA Scalar\n"); + test_sp_scalar_VEC_24( ITER, EventSet, ofp_papi ); + test_sp_scalar_VEC_48( ITER, EventSet, ofp_papi ); + test_sp_scalar_VEC_96( ITER, EventSet, ofp_papi ); + + fprintf(ofp_papi, "# SP Non-FMA Vector\n"); + test_sp_power_VEC( 24, ITER, EventSet, ofp_papi ); + test_sp_power_VEC( 48, ITER, EventSet, ofp_papi ); + test_sp_power_VEC( 96, ITER, EventSet, ofp_papi ); + + fprintf(ofp_papi, "# DP Non-FMA Scalar\n"); + test_dp_scalar_VEC_24( ITER, EventSet, ofp_papi ); + test_dp_scalar_VEC_48( ITER, EventSet, ofp_papi ); + test_dp_scalar_VEC_96( ITER, EventSet, ofp_papi ); + + fprintf(ofp_papi, "# DP Non-FMA Vector\n"); + test_dp_power_VEC( 24, ITER, EventSet, ofp_papi ); + test_dp_power_VEC( 48, ITER, EventSet, ofp_papi ); + test_dp_power_VEC( 96, ITER, EventSet, ofp_papi ); // FMA instruction trials. - test_hp_power_VEC_FMA( 12, 1000, EventSet, ofp_papi ); - test_hp_power_VEC_FMA( 24, 1000, EventSet, ofp_papi ); - test_hp_power_VEC_FMA( 48, 1000, EventSet, ofp_papi ); - - test_sp_power_VEC_FMA( 12, 1000, EventSet, ofp_papi ); - test_sp_power_VEC_FMA( 24, 1000, EventSet, ofp_papi ); - test_sp_power_VEC_FMA( 48, 1000, EventSet, ofp_papi ); - - test_dp_power_VEC_FMA( 12, 1000, EventSet, ofp_papi ); - test_dp_power_VEC_FMA( 24, 1000, EventSet, ofp_papi ); - test_dp_power_VEC_FMA( 48, 1000, EventSet, ofp_papi ); + fprintf(ofp_papi, "# HP FMA Scalar\n"); + test_hp_scalar_VEC_FMA_12( ITER, EventSet, ofp_papi ); + test_hp_scalar_VEC_FMA_24( ITER, EventSet, ofp_papi ); + test_hp_scalar_VEC_FMA_48( ITER, EventSet, ofp_papi ); + + fprintf(ofp_papi, "# HP FMA Vector\n"); + test_hp_power_VEC_FMA( 12, ITER, EventSet, ofp_papi ); + test_hp_power_VEC_FMA( 24, ITER, EventSet, ofp_papi ); + test_hp_power_VEC_FMA( 48, ITER, EventSet, ofp_papi ); + + fprintf(ofp_papi, "# SP FMA Scalar\n"); + test_sp_scalar_VEC_FMA_12( ITER, EventSet, ofp_papi ); + test_sp_scalar_VEC_FMA_24( ITER, EventSet, ofp_papi ); + test_sp_scalar_VEC_FMA_48( ITER, EventSet, ofp_papi ); + + fprintf(ofp_papi, "# SP FMA Vector\n"); + test_sp_power_VEC_FMA( 12, ITER, EventSet, ofp_papi ); + test_sp_power_VEC_FMA( 24, ITER, EventSet, ofp_papi ); + test_sp_power_VEC_FMA( 48, ITER, EventSet, ofp_papi ); + + fprintf(ofp_papi, "# DP FMA Scalar\n"); + test_dp_scalar_VEC_FMA_12( ITER, EventSet, ofp_papi ); + test_dp_scalar_VEC_FMA_24( ITER, EventSet, ofp_papi ); + test_dp_scalar_VEC_FMA_48( ITER, EventSet, ofp_papi ); + + fprintf(ofp_papi, "# DP FMA Vector\n"); + test_dp_power_VEC_FMA( 12, ITER, EventSet, ofp_papi ); + test_dp_power_VEC_FMA( 24, ITER, EventSet, ofp_papi ); + test_dp_power_VEC_FMA( 48, ITER, EventSet, ofp_papi ); #endif diff --git a/src/counter_analysis_toolkit/vec_fma_dp.c b/src/counter_analysis_toolkit/vec_fma_dp.c index 8ae703aee..0f8e4ede3 100644 --- a/src/counter_analysis_toolkit/vec_fma_dp.c +++ b/src/counter_analysis_toolkit/vec_fma_dp.c @@ -61,7 +61,7 @@ double test_dp_mac_VEC_FMA_12( uint64 iterations, int EventSet, FILE *fp ){ uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ r0 = FMA_VEC_PD(r0,r7,r9); @@ -137,7 +137,7 @@ double test_dp_mac_VEC_FMA_24( uint64 iterations, int EventSet, FILE *fp ){ uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ r0 = FMA_VEC_PD(r0,r7,r9); @@ -227,7 +227,7 @@ double test_dp_mac_VEC_FMA_48( uint64 iterations, int EventSet, FILE *fp ){ uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ r0 = FMA_VEC_PD(r0,r7,r9); @@ -320,15 +320,15 @@ void test_dp_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE if ( instr_per_loop == 12 ) { sum += test_dp_mac_VEC_FMA_12( iterations, EventSet, fp ); - scalar_sum += test_dp_scalar_VEC_FMA_12( iterations ); + scalar_sum += test_dp_scalar_VEC_FMA_12( iterations, EventSet, NULL ); } else if ( instr_per_loop == 24 ) { sum += test_dp_mac_VEC_FMA_24( iterations, EventSet, fp ); - scalar_sum += test_dp_scalar_VEC_FMA_24( iterations ); + scalar_sum += test_dp_scalar_VEC_FMA_24( iterations, EventSet, NULL ); } else if ( instr_per_loop == 48 ) { sum += test_dp_mac_VEC_FMA_48( iterations, EventSet, fp ); - scalar_sum += test_dp_scalar_VEC_FMA_48( iterations ); + scalar_sum += test_dp_scalar_VEC_FMA_48( iterations, EventSet, NULL ); } if( sum/2.0 != scalar_sum ) { diff --git a/src/counter_analysis_toolkit/vec_fma_hp.c b/src/counter_analysis_toolkit/vec_fma_hp.c index f65b5a4a0..7eebd08fa 100644 --- a/src/counter_analysis_toolkit/vec_fma_hp.c +++ b/src/counter_analysis_toolkit/vec_fma_hp.c @@ -68,7 +68,7 @@ half test_hp_mac_VEC_FMA_12( uint64 iterations, int EventSet, FILE *fp ){ uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ r0 = FMA_VEC_PH(r0,r7,r9); @@ -146,7 +146,7 @@ half test_hp_mac_VEC_FMA_24( uint64 iterations, int EventSet, FILE *fp ){ uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ r0 = FMA_VEC_PH(r0,r7,r9); @@ -238,7 +238,7 @@ half test_hp_mac_VEC_FMA_48( uint64 iterations, int EventSet, FILE *fp ){ uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ r0 = FMA_VEC_PH(r0,r7,r9); @@ -333,15 +333,15 @@ void test_hp_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE if ( instr_per_loop == 12 ) { sum = vaddh_f16(sum,test_hp_mac_VEC_FMA_12( iterations, EventSet, fp )); - scalar_sum = vaddh_f16(scalar_sum,test_hp_scalar_VEC_FMA_12( iterations )); + scalar_sum = vaddh_f16(scalar_sum,test_hp_scalar_VEC_FMA_12( iterations, EventSet, NULL )); } else if ( instr_per_loop == 24 ) { sum = vaddh_f16(sum,test_hp_mac_VEC_FMA_24( iterations, EventSet, fp )); - scalar_sum = vaddh_f16(scalar_sum,test_hp_scalar_VEC_FMA_24( iterations )); + scalar_sum = vaddh_f16(scalar_sum,test_hp_scalar_VEC_FMA_24( iterations, EventSet, NULL )); } else if ( instr_per_loop == 48 ) { sum = vaddh_f16(sum,test_hp_mac_VEC_FMA_48( iterations, EventSet, fp )); - scalar_sum = vaddh_f16(scalar_sum,test_hp_scalar_VEC_FMA_48( iterations )); + scalar_sum = vaddh_f16(scalar_sum,test_hp_scalar_VEC_FMA_48( iterations, EventSet, NULL )); } if( vdivh_f16(sum,4.0) != scalar_sum ) { @@ -355,7 +355,10 @@ float test_hp_mac_VEC_FMA_12( uint64 iterations, int EventSet, FILE *fp ){ (void)iterations; (void)EventSet; - papi_stop_and_print_placeholder(12, fp); + + if ( NULL != fp ) { + papi_stop_and_print_placeholder(12, fp); + } return 0.0; } @@ -365,7 +368,10 @@ float test_hp_mac_VEC_FMA_24( uint64 iterations, int EventSet, FILE *fp ){ (void)iterations; (void)EventSet; - papi_stop_and_print_placeholder(24, fp); + + if ( NULL != fp ) { + papi_stop_and_print_placeholder(24, fp); + } return 0.0; } @@ -375,7 +381,10 @@ float test_hp_mac_VEC_FMA_48( uint64 iterations, int EventSet, FILE *fp ){ (void)iterations; (void)EventSet; - papi_stop_and_print_placeholder(48, fp); + + if ( NULL != fp ) { + papi_stop_and_print_placeholder(48, fp); + } return 0.0; } @@ -388,15 +397,15 @@ void test_hp_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE if ( instr_per_loop == 12 ) { sum += test_hp_mac_VEC_FMA_12( iterations, EventSet, fp ); - scalar_sum += test_hp_scalar_VEC_FMA_12( iterations ); + scalar_sum += test_hp_scalar_VEC_FMA_12( iterations, EventSet, NULL ); } else if ( instr_per_loop == 24 ) { sum += test_hp_mac_VEC_FMA_24( iterations, EventSet, fp ); - scalar_sum += test_hp_scalar_VEC_FMA_24( iterations ); + scalar_sum += test_hp_scalar_VEC_FMA_24( iterations, EventSet, NULL ); } else if ( instr_per_loop == 48 ) { sum += test_hp_mac_VEC_FMA_48( iterations, EventSet, fp ); - scalar_sum += test_hp_scalar_VEC_FMA_48( iterations ); + scalar_sum += test_hp_scalar_VEC_FMA_48( iterations, EventSet, NULL ); } if( sum/4.0 != scalar_sum ) { diff --git a/src/counter_analysis_toolkit/vec_fma_sp.c b/src/counter_analysis_toolkit/vec_fma_sp.c index 57366fe42..2cb6d84c3 100644 --- a/src/counter_analysis_toolkit/vec_fma_sp.c +++ b/src/counter_analysis_toolkit/vec_fma_sp.c @@ -61,7 +61,7 @@ float test_sp_mac_VEC_FMA_12( uint64 iterations, int EventSet, FILE *fp ){ uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ r0 = FMA_VEC_PS(r0,r7,r9); @@ -139,7 +139,7 @@ float test_sp_mac_VEC_FMA_24( uint64 iterations, int EventSet, FILE *fp ){ uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ r0 = FMA_VEC_PS(r0,r7,r9); @@ -231,7 +231,7 @@ float test_sp_mac_VEC_FMA_48( uint64 iterations, int EventSet, FILE *fp ){ uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ r0 = FMA_VEC_PS(r0,r7,r9); @@ -326,18 +326,18 @@ void test_sp_VEC_FMA( int instr_per_loop, uint64 iterations, int EventSet, FILE if ( instr_per_loop == 12 ) { sum += test_sp_mac_VEC_FMA_12( iterations, EventSet, fp ); - scalar_sum += test_sp_scalar_VEC_FMA_12( iterations ); + scalar_sum += test_sp_scalar_VEC_FMA_12( iterations, EventSet, NULL ); } else if ( instr_per_loop == 24 ) { sum += test_sp_mac_VEC_FMA_24( iterations, EventSet, fp ); - scalar_sum += test_sp_scalar_VEC_FMA_24( iterations ); + scalar_sum += test_sp_scalar_VEC_FMA_24( iterations, EventSet, NULL ); } else if ( instr_per_loop == 48 ) { sum += test_sp_mac_VEC_FMA_48( iterations, EventSet, fp ); - scalar_sum += test_sp_scalar_VEC_FMA_48( iterations ); + scalar_sum += test_sp_scalar_VEC_FMA_48( iterations, EventSet, NULL ); } if( sum/4.0 != scalar_sum ) { - fprintf(stderr, "FMA: Inconsistent FLOP results detected!\n"); + fprintf(stderr, "FMA: Inconsistent FLOP results detected! %f vs %f\n", sum/4.0, scalar_sum); } } diff --git a/src/counter_analysis_toolkit/vec_nonfma_dp.c b/src/counter_analysis_toolkit/vec_nonfma_dp.c index 5911ec330..b723cde0f 100644 --- a/src/counter_analysis_toolkit/vec_nonfma_dp.c +++ b/src/counter_analysis_toolkit/vec_nonfma_dp.c @@ -61,7 +61,7 @@ double test_dp_mac_VEC_24( uint64 iterations, int EventSet, FILE *fp ){ uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ r0 = MUL_VEC_PD(r0,rC); @@ -154,7 +154,7 @@ double test_dp_mac_VEC_48( uint64 iterations, int EventSet, FILE *fp ){ uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ r0 = MUL_VEC_PD(r0,rC); @@ -273,7 +273,7 @@ double test_dp_mac_VEC_96( uint64 iterations, int EventSet, FILE *fp ){ uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ r0 = MUL_VEC_PD(r0,rC); @@ -419,15 +419,15 @@ void test_dp_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp if ( instr_per_loop == 24 ) { sum += test_dp_mac_VEC_24( iterations, EventSet, fp ); - scalar_sum += test_dp_scalar_VEC_24( iterations ); + scalar_sum += test_dp_scalar_VEC_24( iterations, EventSet, NULL ); } else if ( instr_per_loop == 48 ) { sum += test_dp_mac_VEC_48( iterations, EventSet, fp ); - scalar_sum += test_dp_scalar_VEC_48( iterations ); + scalar_sum += test_dp_scalar_VEC_48( iterations, EventSet, NULL ); } else if ( instr_per_loop == 96 ) { sum += test_dp_mac_VEC_96( iterations, EventSet, fp ); - scalar_sum += test_dp_scalar_VEC_96( iterations ); + scalar_sum += test_dp_scalar_VEC_96( iterations, EventSet, NULL ); } if( sum/2.0 != scalar_sum ) { diff --git a/src/counter_analysis_toolkit/vec_nonfma_hp.c b/src/counter_analysis_toolkit/vec_nonfma_hp.c index f8b08c298..867c9cc45 100644 --- a/src/counter_analysis_toolkit/vec_nonfma_hp.c +++ b/src/counter_analysis_toolkit/vec_nonfma_hp.c @@ -68,7 +68,7 @@ half test_hp_mac_VEC_24( uint64 iterations, int EventSet, FILE *fp ){ uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ r0 = MUL_VEC_PH(r0,rC); @@ -163,7 +163,7 @@ half test_hp_mac_VEC_48( uint64 iterations, int EventSet, FILE *fp ){ uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ r0 = MUL_VEC_PH(r0,rC); @@ -284,7 +284,7 @@ half test_hp_mac_VEC_96( uint64 iterations, int EventSet, FILE *fp ){ uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ r0 = MUL_VEC_PH(r0,rC); @@ -432,15 +432,15 @@ void test_hp_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp if ( instr_per_loop == 24 ) { sum = vaddh_f16(sum,test_hp_mac_VEC_24( iterations, EventSet, fp )); - scalar_sum = vaddh_f16(scalar_sum,test_hp_scalar_VEC_24( iterations )); + scalar_sum = vaddh_f16(scalar_sum,test_hp_scalar_VEC_24( iterations, EventSet, NULL )); } else if ( instr_per_loop == 48 ) { sum = vaddh_f16(sum,test_hp_mac_VEC_48( iterations, EventSet, fp )); - scalar_sum = vaddh_f16(scalar_sum,test_hp_scalar_VEC_48( iterations )); + scalar_sum = vaddh_f16(scalar_sum,test_hp_scalar_VEC_48( iterations, EventSet, NULL )); } else if ( instr_per_loop == 96 ) { sum = vaddh_f16(sum,test_hp_mac_VEC_96( iterations, EventSet, fp )); - scalar_sum = vaddh_f16(scalar_sum,test_hp_scalar_VEC_96( iterations )); + scalar_sum = vaddh_f16(scalar_sum,test_hp_scalar_VEC_96( iterations, EventSet, NULL )); } if( vdivh_f16(sum,4.0) != scalar_sum ) { @@ -454,7 +454,10 @@ float test_hp_mac_VEC_24( uint64 iterations, int EventSet, FILE *fp ){ (void)iterations; (void)EventSet; - papi_stop_and_print_placeholder(24, fp); + + if ( NULL != fp ) { + papi_stop_and_print_placeholder(24, fp); + } return 0.0; } @@ -464,7 +467,10 @@ float test_hp_mac_VEC_48( uint64 iterations, int EventSet, FILE *fp ){ (void)iterations; (void)EventSet; - papi_stop_and_print_placeholder(48, fp); + + if ( NULL != fp ) { + papi_stop_and_print_placeholder(48, fp); + } return 0.0; } @@ -474,7 +480,10 @@ float test_hp_mac_VEC_96( uint64 iterations, int EventSet, FILE *fp ){ (void)iterations; (void)EventSet; - papi_stop_and_print_placeholder(96, fp); + + if ( NULL != fp ) { + papi_stop_and_print_placeholder(96, fp); + } return 0.0; } @@ -487,15 +496,15 @@ void test_hp_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp if ( instr_per_loop == 24 ) { sum += test_hp_mac_VEC_24( iterations, EventSet, fp ); - scalar_sum += test_hp_scalar_VEC_24( iterations ); + scalar_sum += test_hp_scalar_VEC_24( iterations, EventSet, NULL ); } else if ( instr_per_loop == 48 ) { sum += test_hp_mac_VEC_48( iterations, EventSet, fp ); - scalar_sum += test_hp_scalar_VEC_48( iterations ); + scalar_sum += test_hp_scalar_VEC_48( iterations, EventSet, NULL ); } else if ( instr_per_loop == 96 ) { sum += test_hp_mac_VEC_96( iterations, EventSet, fp ); - scalar_sum += test_hp_scalar_VEC_96( iterations ); + scalar_sum += test_hp_scalar_VEC_96( iterations, EventSet, NULL ); } if( sum/4.0 != scalar_sum ) { diff --git a/src/counter_analysis_toolkit/vec_nonfma_sp.c b/src/counter_analysis_toolkit/vec_nonfma_sp.c index e0f179712..f5e25d46b 100644 --- a/src/counter_analysis_toolkit/vec_nonfma_sp.c +++ b/src/counter_analysis_toolkit/vec_nonfma_sp.c @@ -61,7 +61,7 @@ float test_sp_mac_VEC_24( uint64 iterations, int EventSet, FILE *fp ){ uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ r0 = MUL_VEC_PS(r0,rC); @@ -156,7 +156,7 @@ float test_sp_mac_VEC_48( uint64 iterations, int EventSet, FILE *fp ){ uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ r0 = MUL_VEC_PS(r0,rC); @@ -277,7 +277,7 @@ float test_sp_mac_VEC_96( uint64 iterations, int EventSet, FILE *fp ){ uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ r0 = MUL_VEC_PS(r0,rC); @@ -425,15 +425,15 @@ void test_sp_VEC( int instr_per_loop, uint64 iterations, int EventSet, FILE *fp if ( instr_per_loop == 24 ) { sum += test_sp_mac_VEC_24( iterations, EventSet, fp ); - scalar_sum += test_sp_scalar_VEC_24( iterations ); + scalar_sum += test_sp_scalar_VEC_24( iterations, EventSet, NULL ); } else if ( instr_per_loop == 48 ) { sum += test_sp_mac_VEC_48( iterations, EventSet, fp ); - scalar_sum += test_sp_scalar_VEC_48( iterations ); + scalar_sum += test_sp_scalar_VEC_48( iterations, EventSet, NULL ); } else if ( instr_per_loop == 96 ) { sum += test_sp_mac_VEC_96( iterations, EventSet, fp ); - scalar_sum += test_sp_scalar_VEC_96( iterations ); + scalar_sum += test_sp_scalar_VEC_96( iterations, EventSet, NULL ); } if( sum/4.0 != scalar_sum ) { diff --git a/src/counter_analysis_toolkit/vec_scalar_verify.c b/src/counter_analysis_toolkit/vec_scalar_verify.c index 065bccc68..9d4a48683 100644 --- a/src/counter_analysis_toolkit/vec_scalar_verify.c +++ b/src/counter_analysis_toolkit/vec_scalar_verify.c @@ -19,7 +19,7 @@ void papi_stop_and_print(long long theory, int EventSet, FILE *fp) } #if defined(ARM) -half test_hp_scalar_VEC_24( uint64 iterations ){ +half test_hp_scalar_VEC_24( uint64 iterations, int EventSet, FILE *fp ){ register half r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF; /* Generate starting data */ @@ -40,10 +40,17 @@ half test_hp_scalar_VEC_24( uint64 iterations ){ rE = SET_VEC_SH(0.15); rF = SET_VEC_SH(0.16); + /* Start PAPI counters */ + if ( NULL != fp ) { + if ( PAPI_start( EventSet ) != PAPI_OK ) { + return -1; + } + } + uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ r0 = MUL_VEC_SH(r0,rC); @@ -77,6 +84,11 @@ half test_hp_scalar_VEC_24( uint64 iterations ){ c++; } + /* Stop PAPI counters */ + if ( NULL != fp ) { + papi_stop_and_print(24, EventSet, fp); + } + /* Use data so that compiler does not eliminate it when using -O2 */ r0 = ADD_VEC_SH(r0,r1); r2 = ADD_VEC_SH(r2,r3); @@ -99,7 +111,7 @@ half test_hp_scalar_VEC_24( uint64 iterations ){ return out; } -half test_hp_scalar_VEC_48( uint64 iterations ){ +half test_hp_scalar_VEC_48( uint64 iterations, int EventSet, FILE *fp ){ register half r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF; /* Generate starting data */ @@ -120,10 +132,17 @@ half test_hp_scalar_VEC_48( uint64 iterations ){ rE = SET_VEC_SH(0.15); rF = SET_VEC_SH(0.16); + /* Start PAPI counters */ + if ( NULL != fp ) { + if ( PAPI_start( EventSet ) != PAPI_OK ) { + return -1; + } + } + uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ r0 = MUL_VEC_SH(r0,rC); @@ -183,6 +202,11 @@ half test_hp_scalar_VEC_48( uint64 iterations ){ c++; } + /* Stop PAPI counters */ + if ( NULL != fp ) { + papi_stop_and_print(48, EventSet, fp); + } + /* Use data so that compiler does not eliminate it when using -O2 */ r0 = ADD_VEC_SH(r0,r1); r2 = ADD_VEC_SH(r2,r3); @@ -205,7 +229,7 @@ half test_hp_scalar_VEC_48( uint64 iterations ){ return out; } -half test_hp_scalar_VEC_96( uint64 iterations ){ +half test_hp_scalar_VEC_96( uint64 iterations, int EventSet, FILE *fp ){ register half r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF; /* Generate starting data */ @@ -226,10 +250,17 @@ half test_hp_scalar_VEC_96( uint64 iterations ){ rE = SET_VEC_SH(0.15); rF = SET_VEC_SH(0.16); + /* Start PAPI counters */ + if ( NULL != fp ) { + if ( PAPI_start( EventSet ) != PAPI_OK ) { + return -1; + } + } + uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ r0 = MUL_VEC_SH(r0,rC); @@ -341,6 +372,11 @@ half test_hp_scalar_VEC_96( uint64 iterations ){ c++; } + /* Stop PAPI counters */ + if ( NULL != fp ) { + papi_stop_and_print(96, EventSet, fp); + } + /* Use data so that compiler does not eliminate it when using -O2 */ r0 = ADD_VEC_SH(r0,r1); r2 = ADD_VEC_SH(r2,r3); @@ -364,21 +400,39 @@ half test_hp_scalar_VEC_96( uint64 iterations ){ } #else -float test_hp_scalar_VEC_24( uint64 iterations ){ +float test_hp_scalar_VEC_24( uint64 iterations, int EventSet, FILE *fp ){ (void)iterations; + (void)EventSet; + + if ( NULL != fp ) { + papi_stop_and_print_placeholder(24, fp); + } + return 0.0; } -float test_hp_scalar_VEC_48( uint64 iterations ){ +float test_hp_scalar_VEC_48( uint64 iterations, int EventSet, FILE *fp ){ (void)iterations; + (void)EventSet; + + if ( NULL != fp ) { + papi_stop_and_print_placeholder(48, fp); + } + return 0.0; } -float test_hp_scalar_VEC_96( uint64 iterations ){ +float test_hp_scalar_VEC_96( uint64 iterations, int EventSet, FILE *fp ){ (void)iterations; + (void)EventSet; + + if ( NULL != fp ) { + papi_stop_and_print_placeholder(96, fp); + } + return 0.0; } #endif @@ -386,7 +440,7 @@ float test_hp_scalar_VEC_96( uint64 iterations ){ /************************************/ /* Loop unrolling: 24 instructions */ /************************************/ -float test_sp_scalar_VEC_24( uint64 iterations ){ +float test_sp_scalar_VEC_24( uint64 iterations, int EventSet, FILE *fp ){ register SP_SCALAR_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF; /* Generate starting data */ @@ -407,10 +461,17 @@ float test_sp_scalar_VEC_24( uint64 iterations ){ rE = SET_VEC_SS(0.15); rF = SET_VEC_SS(0.16); + /* Start PAPI counters */ + if ( NULL != fp ) { + if ( PAPI_start( EventSet ) != PAPI_OK ) { + return -1; + } + } + uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ r0 = MUL_VEC_SS(r0,rC); @@ -444,6 +505,11 @@ float test_sp_scalar_VEC_24( uint64 iterations ){ c++; } + /* Stop PAPI counters */ + if ( NULL != fp ) { + papi_stop_and_print(24, EventSet, fp); + } + /* Use data so that compiler does not eliminate it when using -O2 */ r0 = ADD_VEC_SS(r0,r1); r2 = ADD_VEC_SS(r2,r3); @@ -469,7 +535,7 @@ float test_sp_scalar_VEC_24( uint64 iterations ){ /************************************/ /* Loop unrolling: 48 instructions */ /************************************/ -float test_sp_scalar_VEC_48( uint64 iterations ){ +float test_sp_scalar_VEC_48( uint64 iterations, int EventSet, FILE *fp ){ register SP_SCALAR_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF; /* Generate starting data */ @@ -490,10 +556,17 @@ float test_sp_scalar_VEC_48( uint64 iterations ){ rE = SET_VEC_SS(0.15); rF = SET_VEC_SS(0.16); + /* Start PAPI counters */ + if ( NULL != fp ) { + if ( PAPI_start( EventSet ) != PAPI_OK ) { + return -1; + } + } + uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ r0 = MUL_VEC_SS(r0,rC); @@ -553,6 +626,11 @@ float test_sp_scalar_VEC_48( uint64 iterations ){ c++; } + /* Stop PAPI counters */ + if ( NULL != fp ) { + papi_stop_and_print(48, EventSet, fp); + } + /* Use data so that compiler does not eliminate it when using -O2 */ r0 = ADD_VEC_SS(r0,r1); r2 = ADD_VEC_SS(r2,r3); @@ -578,7 +656,7 @@ float test_sp_scalar_VEC_48( uint64 iterations ){ /************************************/ /* Loop unrolling: 96 instructions */ /************************************/ -float test_sp_scalar_VEC_96( uint64 iterations ){ +float test_sp_scalar_VEC_96( uint64 iterations, int EventSet, FILE *fp ){ register SP_SCALAR_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF; /* Generate starting data */ @@ -599,10 +677,17 @@ float test_sp_scalar_VEC_96( uint64 iterations ){ rE = SET_VEC_SS(0.15); rF = SET_VEC_SS(0.16); + /* Start PAPI counters */ + if ( NULL != fp ) { + if ( PAPI_start( EventSet ) != PAPI_OK ) { + return -1; + } + } + uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ r0 = MUL_VEC_SS(r0,rC); @@ -714,6 +799,11 @@ float test_sp_scalar_VEC_96( uint64 iterations ){ c++; } + /* Stop PAPI counters */ + if ( NULL != fp ) { + papi_stop_and_print(96, EventSet, fp); + } + /* Use data so that compiler does not eliminate it when using -O2 */ r0 = ADD_VEC_SS(r0,r1); r2 = ADD_VEC_SS(r2,r3); @@ -739,7 +829,7 @@ float test_sp_scalar_VEC_96( uint64 iterations ){ /************************************/ /* Loop unrolling: 24 instructions */ /************************************/ -double test_dp_scalar_VEC_24( uint64 iterations ){ +double test_dp_scalar_VEC_24( uint64 iterations, int EventSet, FILE *fp ){ register DP_SCALAR_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF; /* Generate starting data */ @@ -760,10 +850,17 @@ double test_dp_scalar_VEC_24( uint64 iterations ){ rE = SET_VEC_SD(0.15); rF = SET_VEC_SD(0.16); + /* Start PAPI counters */ + if ( NULL != fp ) { + if ( PAPI_start( EventSet ) != PAPI_OK ) { + return -1; + } + } + uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ r0 = MUL_VEC_SD(r0,rC); @@ -797,6 +894,11 @@ double test_dp_scalar_VEC_24( uint64 iterations ){ c++; } + /* Stop PAPI counters */ + if ( NULL != fp ) { + papi_stop_and_print(24, EventSet, fp); + } + /* Use data so that compiler does not eliminate it when using -O2 */ r0 = ADD_VEC_SD(r0,r1); r2 = ADD_VEC_SD(r2,r3); @@ -822,7 +924,7 @@ double test_dp_scalar_VEC_24( uint64 iterations ){ /************************************/ /* Loop unrolling: 48 instructions */ /************************************/ -double test_dp_scalar_VEC_48( uint64 iterations ){ +double test_dp_scalar_VEC_48( uint64 iterations, int EventSet, FILE *fp ){ register DP_SCALAR_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF; /* Generate starting data */ @@ -843,10 +945,17 @@ double test_dp_scalar_VEC_48( uint64 iterations ){ rE = SET_VEC_SD(0.15); rF = SET_VEC_SD(0.16); + /* Start PAPI counters */ + if ( NULL != fp ) { + if ( PAPI_start( EventSet ) != PAPI_OK ) { + return -1; + } + } + uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ r0 = MUL_VEC_SD(r0,rC); @@ -906,6 +1015,11 @@ double test_dp_scalar_VEC_48( uint64 iterations ){ c++; } + /* Stop PAPI counters */ + if ( NULL != fp ) { + papi_stop_and_print(48, EventSet, fp); + } + /* Use data so that compiler does not eliminate it when using -O2 */ r0 = ADD_VEC_SD(r0,r1); r2 = ADD_VEC_SD(r2,r3); @@ -931,7 +1045,7 @@ double test_dp_scalar_VEC_48( uint64 iterations ){ /************************************/ /* Loop unrolling: 96 instructions */ /************************************/ -double test_dp_scalar_VEC_96( uint64 iterations ){ +double test_dp_scalar_VEC_96( uint64 iterations, int EventSet, FILE *fp ){ register DP_SCALAR_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF; /* Generate starting data */ @@ -952,10 +1066,17 @@ double test_dp_scalar_VEC_96( uint64 iterations ){ rE = SET_VEC_SD(0.15); rF = SET_VEC_SD(0.16); + /* Start PAPI counters */ + if ( NULL != fp ) { + if ( PAPI_start( EventSet ) != PAPI_OK ) { + return -1; + } + } + uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ r0 = MUL_VEC_SD(r0,rC); @@ -1067,6 +1188,11 @@ double test_dp_scalar_VEC_96( uint64 iterations ){ c++; } + /* Stop PAPI counters */ + if ( NULL != fp ) { + papi_stop_and_print(96, EventSet, fp); + } + /* Use data so that compiler does not eliminate it when using -O2 */ r0 = ADD_VEC_SD(r0,r1); r2 = ADD_VEC_SD(r2,r3); @@ -1090,7 +1216,7 @@ double test_dp_scalar_VEC_96( uint64 iterations ){ } #if defined(ARM) -half test_hp_scalar_VEC_FMA_12( uint64 iterations ){ +half test_hp_scalar_VEC_FMA_12( uint64 iterations, int EventSet, FILE *fp ){ register half r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF; /* Generate starting data */ @@ -1111,10 +1237,17 @@ half test_hp_scalar_VEC_FMA_12( uint64 iterations ){ rE = SET_VEC_SH(0.15); rF = SET_VEC_SH(0.16); + /* Start PAPI counters */ + if ( NULL != fp ) { + if ( PAPI_start( EventSet ) != PAPI_OK ) { + return -1; + } + } + uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ FMA_VEC_SH(r0,r0,r7,r9); @@ -1136,6 +1269,11 @@ half test_hp_scalar_VEC_FMA_12( uint64 iterations ){ c++; } + /* Stop PAPI counters */ + if ( NULL != fp ) { + papi_stop_and_print(12, EventSet, fp); + } + /* Use data so that compiler does not eliminate it when using -O2 */ r0 = ADD_VEC_SH(r0,r1); r2 = ADD_VEC_SH(r2,r3); @@ -1153,7 +1291,7 @@ half test_hp_scalar_VEC_FMA_12( uint64 iterations ){ return out; } -half test_hp_scalar_VEC_FMA_24( uint64 iterations ){ +half test_hp_scalar_VEC_FMA_24( uint64 iterations, int EventSet, FILE *fp ){ register half r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF; /* Generate starting data */ @@ -1174,10 +1312,17 @@ half test_hp_scalar_VEC_FMA_24( uint64 iterations ){ rE = SET_VEC_SH(0.15); rF = SET_VEC_SH(0.16); + /* Start PAPI counters */ + if ( NULL != fp ) { + if ( PAPI_start( EventSet ) != PAPI_OK ) { + return -1; + } + } + uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ FMA_VEC_SH(r0,r0,r7,r9); @@ -1213,6 +1358,11 @@ half test_hp_scalar_VEC_FMA_24( uint64 iterations ){ c++; } + /* Stop PAPI counters */ + if ( NULL != fp ) { + papi_stop_and_print(24, EventSet, fp); + } + /* Use data so that compiler does not eliminate it when using -O2 */ r0 = ADD_VEC_SH(r0,r1); r2 = ADD_VEC_SH(r2,r3); @@ -1230,7 +1380,7 @@ half test_hp_scalar_VEC_FMA_24( uint64 iterations ){ return out; } -half test_hp_scalar_VEC_FMA_48( uint64 iterations ){ +half test_hp_scalar_VEC_FMA_48( uint64 iterations, int EventSet, FILE *fp ){ register half r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF; /* Generate starting data */ @@ -1251,10 +1401,17 @@ half test_hp_scalar_VEC_FMA_48( uint64 iterations ){ rE = SET_VEC_SH(0.15); rF = SET_VEC_SH(0.16); + /* Start PAPI counters */ + if ( NULL != fp ) { + if ( PAPI_start( EventSet ) != PAPI_OK ) { + return -1; + } + } + uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ FMA_VEC_SH(r0,r0,r7,r9); @@ -1318,6 +1475,11 @@ half test_hp_scalar_VEC_FMA_48( uint64 iterations ){ c++; } + /* Stop PAPI counters */ + if ( NULL != fp ) { + papi_stop_and_print(48, EventSet, fp); + } + /* Use data so that compiler does not eliminate it when using -O2 */ r0 = ADD_VEC_SH(r0,r1); r2 = ADD_VEC_SH(r2,r3); @@ -1336,21 +1498,39 @@ half test_hp_scalar_VEC_FMA_48( uint64 iterations ){ } #else -float test_hp_scalar_VEC_FMA_12( uint64 iterations ){ +float test_hp_scalar_VEC_FMA_12( uint64 iterations, int EventSet, FILE *fp ){ (void)iterations; + (void)EventSet; + + if ( NULL != fp ) { + papi_stop_and_print_placeholder(12, fp); + } + return 0.0; } -float test_hp_scalar_VEC_FMA_24( uint64 iterations ){ +float test_hp_scalar_VEC_FMA_24( uint64 iterations, int EventSet, FILE *fp ){ (void)iterations; + (void)EventSet; + + if ( NULL != fp ) { + papi_stop_and_print_placeholder(24, fp); + } + return 0.0; } -float test_hp_scalar_VEC_FMA_48( uint64 iterations ){ +float test_hp_scalar_VEC_FMA_48( uint64 iterations, int EventSet, FILE *fp ){ (void)iterations; + (void)EventSet; + + if ( NULL != fp ) { + papi_stop_and_print_placeholder(48, fp); + } + return 0.0; } #endif @@ -1358,7 +1538,8 @@ float test_hp_scalar_VEC_FMA_48( uint64 iterations ){ /************************************/ /* Loop unrolling: 12 instructions */ /************************************/ -float test_sp_scalar_VEC_FMA_12( uint64 iterations ){ +#pragma GCC optimize ("O2") +float test_sp_scalar_VEC_FMA_12( uint64 iterations, int EventSet, FILE *fp ){ register SP_SCALAR_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF; /* Generate starting data */ @@ -1379,10 +1560,17 @@ float test_sp_scalar_VEC_FMA_12( uint64 iterations ){ rE = SET_VEC_SS(0.15); rF = SET_VEC_SS(0.16); + /* Start PAPI counters */ + if ( NULL != fp ) { + if ( PAPI_start( EventSet ) != PAPI_OK ) { + return -1; + } + } + uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ FMA_VEC_SS(r0,r0,r7,r9); @@ -1404,6 +1592,11 @@ float test_sp_scalar_VEC_FMA_12( uint64 iterations ){ c++; } + /* Stop PAPI counters */ + if ( NULL != fp ) { + papi_stop_and_print(12, EventSet, fp); + } + /* Use data so that compiler does not eliminate it when using -O2 */ r0 = ADD_VEC_SS(r0,r1); r2 = ADD_VEC_SS(r2,r3); @@ -1424,7 +1617,7 @@ float test_sp_scalar_VEC_FMA_12( uint64 iterations ){ /************************************/ /* Loop unrolling: 24 instructions */ /************************************/ -float test_sp_scalar_VEC_FMA_24( uint64 iterations ){ +float test_sp_scalar_VEC_FMA_24( uint64 iterations, int EventSet, FILE *fp ){ register SP_SCALAR_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF; /* Generate starting data */ @@ -1445,10 +1638,17 @@ float test_sp_scalar_VEC_FMA_24( uint64 iterations ){ rE = SET_VEC_SS(0.15); rF = SET_VEC_SS(0.16); + /* Start PAPI counters */ + if ( NULL != fp ) { + if ( PAPI_start( EventSet ) != PAPI_OK ) { + return -1; + } + } + uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ FMA_VEC_SS(r0,r0,r7,r9); @@ -1484,6 +1684,11 @@ float test_sp_scalar_VEC_FMA_24( uint64 iterations ){ c++; } + /* Stop PAPI counters */ + if ( NULL != fp ) { + papi_stop_and_print(24, EventSet, fp); + } + /* Use data so that compiler does not eliminate it when using -O2 */ r0 = ADD_VEC_SS(r0,r1); r2 = ADD_VEC_SS(r2,r3); @@ -1504,7 +1709,7 @@ float test_sp_scalar_VEC_FMA_24( uint64 iterations ){ /************************************/ /* Loop unrolling: 48 instructions */ /************************************/ -float test_sp_scalar_VEC_FMA_48( uint64 iterations ){ +float test_sp_scalar_VEC_FMA_48( uint64 iterations, int EventSet, FILE *fp ){ register SP_SCALAR_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF; /* Generate starting data */ @@ -1525,10 +1730,17 @@ float test_sp_scalar_VEC_FMA_48( uint64 iterations ){ rE = SET_VEC_SS(0.15); rF = SET_VEC_SS(0.16); + /* Start PAPI counters */ + if ( NULL != fp ) { + if ( PAPI_start( EventSet ) != PAPI_OK ) { + return -1; + } + } + uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ FMA_VEC_SS(r0,r0,r7,r9); @@ -1592,6 +1804,11 @@ float test_sp_scalar_VEC_FMA_48( uint64 iterations ){ c++; } + /* Stop PAPI counters */ + if ( NULL != fp ) { + papi_stop_and_print(48, EventSet, fp); + } + /* Use data so that compiler does not eliminate it when using -O2 */ r0 = ADD_VEC_SS(r0,r1); r2 = ADD_VEC_SS(r2,r3); @@ -1612,7 +1829,7 @@ float test_sp_scalar_VEC_FMA_48( uint64 iterations ){ /************************************/ /* Loop unrolling: 12 instructions */ /************************************/ -double test_dp_scalar_VEC_FMA_12( uint64 iterations ){ +double test_dp_scalar_VEC_FMA_12( uint64 iterations, int EventSet, FILE *fp ){ register DP_SCALAR_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF; /* Generate starting data */ @@ -1633,10 +1850,17 @@ double test_dp_scalar_VEC_FMA_12( uint64 iterations ){ rE = SET_VEC_SD(0.15); rF = SET_VEC_SD(0.16); + /* Start PAPI counters */ + if ( NULL != fp ) { + if ( PAPI_start( EventSet ) != PAPI_OK ) { + return -1; + } + } + uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ FMA_VEC_SD(r0,r0,r7,r9); @@ -1658,6 +1882,11 @@ double test_dp_scalar_VEC_FMA_12( uint64 iterations ){ c++; } + /* Stop PAPI counters */ + if ( NULL != fp ) { + papi_stop_and_print(12, EventSet, fp); + } + /* Use data so that compiler does not eliminate it when using -O2 */ r0 = ADD_VEC_SD(r0,r1); r2 = ADD_VEC_SD(r2,r3); @@ -1678,7 +1907,7 @@ double test_dp_scalar_VEC_FMA_12( uint64 iterations ){ /************************************/ /* Loop unrolling: 24 instructions */ /************************************/ -double test_dp_scalar_VEC_FMA_24( uint64 iterations ){ +double test_dp_scalar_VEC_FMA_24( uint64 iterations, int EventSet, FILE *fp ){ register DP_SCALAR_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF; /* Generate starting data */ @@ -1699,10 +1928,17 @@ double test_dp_scalar_VEC_FMA_24( uint64 iterations ){ rE = SET_VEC_SD(0.15); rF = SET_VEC_SD(0.16); + /* Start PAPI counters */ + if ( NULL != fp ) { + if ( PAPI_start( EventSet ) != PAPI_OK ) { + return -1; + } + } + uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ FMA_VEC_SD(r0,r0,r7,r9); @@ -1738,6 +1974,11 @@ double test_dp_scalar_VEC_FMA_24( uint64 iterations ){ c++; } + /* Stop PAPI counters */ + if ( NULL != fp ) { + papi_stop_and_print(24, EventSet, fp); + } + /* Use data so that compiler does not eliminate it when using -O2 */ r0 = ADD_VEC_SD(r0,r1); r2 = ADD_VEC_SD(r2,r3); @@ -1758,7 +1999,7 @@ double test_dp_scalar_VEC_FMA_24( uint64 iterations ){ /************************************/ /* Loop unrolling: 48 instructions */ /************************************/ -double test_dp_scalar_VEC_FMA_48( uint64 iterations ){ +double test_dp_scalar_VEC_FMA_48( uint64 iterations, int EventSet, FILE *fp ){ register DP_SCALAR_TYPE r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,rA,rB,rC,rD,rE,rF; /* Generate starting data */ @@ -1779,10 +2020,17 @@ double test_dp_scalar_VEC_FMA_48( uint64 iterations ){ rE = SET_VEC_SD(0.15); rF = SET_VEC_SD(0.16); + /* Start PAPI counters */ + if ( NULL != fp ) { + if ( PAPI_start( EventSet ) != PAPI_OK ) { + return -1; + } + } + uint64 c = 0; while (c < iterations){ size_t i = 0; - while (i < 1000){ + while (i < ITER){ /* The performance critical part */ FMA_VEC_SD(r0,r0,r7,r9); @@ -1846,6 +2094,11 @@ double test_dp_scalar_VEC_FMA_48( uint64 iterations ){ c++; } + /* Stop PAPI counters */ + if ( NULL != fp ) { + papi_stop_and_print(48, EventSet, fp); + } + /* Use data so that compiler does not eliminate it when using -O2 */ r0 = ADD_VEC_SD(r0,r1); r2 = ADD_VEC_SD(r2,r3); @@ -1862,3 +2115,4 @@ double test_dp_scalar_VEC_FMA_48( uint64 iterations ){ return out; } +// End of pragma. diff --git a/src/counter_analysis_toolkit/vec_scalar_verify.h b/src/counter_analysis_toolkit/vec_scalar_verify.h index 133dfd3b9..3335511ce 100644 --- a/src/counter_analysis_toolkit/vec_scalar_verify.h +++ b/src/counter_analysis_toolkit/vec_scalar_verify.h @@ -3,44 +3,46 @@ #include #include "cat_arch.h" +#define ITER 1 + void papi_stop_and_print_placeholder(long long theory, FILE *fp); void papi_stop_and_print(long long theory, int EventSet, FILE *fp); // Non-FMA-like computations. #if defined(ARM) -half test_hp_scalar_VEC_24( uint64 iterations ); -half test_hp_scalar_VEC_48( uint64 iterations ); -half test_hp_scalar_VEC_96( uint64 iterations ); +half test_hp_scalar_VEC_24( uint64 iterations, int EventSet, FILE *fp ); +half test_hp_scalar_VEC_48( uint64 iterations, int EventSet, FILE *fp ); +half test_hp_scalar_VEC_96( uint64 iterations, int EventSet, FILE *fp ); #else -float test_hp_scalar_VEC_24( uint64 iterations ); -float test_hp_scalar_VEC_48( uint64 iterations ); -float test_hp_scalar_VEC_96( uint64 iterations ); +float test_hp_scalar_VEC_24( uint64 iterations, int EventSet, FILE *fp ); +float test_hp_scalar_VEC_48( uint64 iterations, int EventSet, FILE *fp ); +float test_hp_scalar_VEC_96( uint64 iterations, int EventSet, FILE *fp ); #endif -float test_sp_scalar_VEC_24( uint64 iterations ); -float test_sp_scalar_VEC_48( uint64 iterations ); -float test_sp_scalar_VEC_96( uint64 iterations ); +float test_sp_scalar_VEC_24( uint64 iterations, int EventSet, FILE *fp ); +float test_sp_scalar_VEC_48( uint64 iterations, int EventSet, FILE *fp ); +float test_sp_scalar_VEC_96( uint64 iterations, int EventSet, FILE *fp ); -double test_dp_scalar_VEC_24( uint64 iterations ); -double test_dp_scalar_VEC_48( uint64 iterations ); -double test_dp_scalar_VEC_96( uint64 iterations ); +double test_dp_scalar_VEC_24( uint64 iterations, int EventSet, FILE *fp ); +double test_dp_scalar_VEC_48( uint64 iterations, int EventSet, FILE *fp ); +double test_dp_scalar_VEC_96( uint64 iterations, int EventSet, FILE *fp ); // Functions to emulate FMA. #if defined(ARM) -half test_hp_scalar_VEC_FMA_12( uint64 iterations ); -half test_hp_scalar_VEC_FMA_24( uint64 iterations ); -half test_hp_scalar_VEC_FMA_48( uint64 iterations ); +half test_hp_scalar_VEC_FMA_12( uint64 iterations, int EventSet, FILE *fp ); +half test_hp_scalar_VEC_FMA_24( uint64 iterations, int EventSet, FILE *fp ); +half test_hp_scalar_VEC_FMA_48( uint64 iterations, int EventSet, FILE *fp ); #else -float test_hp_scalar_VEC_FMA_12( uint64 iterations ); -float test_hp_scalar_VEC_FMA_24( uint64 iterations ); -float test_hp_scalar_VEC_FMA_48( uint64 iterations ); +float test_hp_scalar_VEC_FMA_12( uint64 iterations, int EventSet, FILE *fp ); +float test_hp_scalar_VEC_FMA_24( uint64 iterations, int EventSet, FILE *fp ); +float test_hp_scalar_VEC_FMA_48( uint64 iterations, int EventSet, FILE *fp ); #endif -float test_sp_scalar_VEC_FMA_12( uint64 iterations ); -float test_sp_scalar_VEC_FMA_24( uint64 iterations ); -float test_sp_scalar_VEC_FMA_48( uint64 iterations ); +float test_sp_scalar_VEC_FMA_12( uint64 iterations, int EventSet, FILE *fp ); +float test_sp_scalar_VEC_FMA_24( uint64 iterations, int EventSet, FILE *fp ); +float test_sp_scalar_VEC_FMA_48( uint64 iterations, int EventSet, FILE *fp ); -double test_dp_scalar_VEC_FMA_12( uint64 iterations ); -double test_dp_scalar_VEC_FMA_24( uint64 iterations ); -double test_dp_scalar_VEC_FMA_48( uint64 iterations ); +double test_dp_scalar_VEC_FMA_12( uint64 iterations, int EventSet, FILE *fp ); +double test_dp_scalar_VEC_FMA_24( uint64 iterations, int EventSet, FILE *fp ); +double test_dp_scalar_VEC_FMA_48( uint64 iterations, int EventSet, FILE *fp );