c++ - SIMD vs OMP in vector multiplication -
in project have several vector multiplications, done on either double *a
-vectors or float *a
-vectors. in order accelerate that, wanted use either simd-operations or omp
. getting fastest result, wrote benchmark program:
#include <iostream> #include <memory> #include <vector> #include <omp.h> #include <immintrin.h> #include <stdlib.h> #include <chrono> #define size 32768 #define rounds 1e5 void multiply_singular(float *a, float *b, float *d) { for(int = 0; < size; i++) d[i] = a[i]*b[i]; } void multiply_omp(float *a, float *b, float *d) { #pragma omp parallel for(int = 0; < size; i++) d[i] = a[i]*b[i]; } void multiply_avx(float *a, float *b, float *d) { __m256 a_a, b_a, c_a; for(int = 0; < size/8; i++) { a_a = _mm256_loadu_ps(a+8*i); b_a = _mm256_loadu_ps(b+8*i); c_a = _mm256_mul_ps(a_a, b_a); _mm256_storeu_ps (d+i*8, c_a); } } void multiply_avx_omp(float *a, float *b, float *d) { __m256 a_a, b_a, c_a; #pragma omp for(int = 0; < size/8; i++) { a_a = _mm256_loadu_ps(a+8*i); b_a = _mm256_loadu_ps(b+8*i); c_a = _mm256_mul_ps(a_a, b_a); _mm256_storeu_ps (d+i*8, c_a); } } void multiply_singular_double(double *a, double *b, double *d) { for(int = 0; < size; i++) d[i] = a[i]*b[i]; } void multiply_omp_double(double *a, double *b, double *d) { #pragma omp parallel for(int = 0; < size; i++) d[i] = a[i]*b[i]; } void multiply_avx_double(double *a, double *b, double *d) { __m256d a_a, b_a, c_a; for(int = 0; < size/4; i++) { a_a = _mm256_loadu_pd(a+4*i); b_a = _mm256_loadu_pd(b+4*i); c_a = _mm256_mul_pd(a_a, b_a); _mm256_storeu_pd (d+i*4, c_a); } } void multiply_avx_double_omp(double *a, double *b, double *d) { __m256d a_a, b_a, c_a; #pragma omp parallel for(int = 0; < size/4; i++) { a_a = _mm256_loadu_pd(a+4*i); b_a = _mm256_loadu_pd(b+4*i); c_a = _mm256_mul_pd(a_a, b_a); _mm256_storeu_pd (d+i*4, c_a); } } int main() { float *a, *b, *c, *d, *e, *f; double *a_d, *b_d, *c_d, *d_d, *e_d, *f_d; = new float[size] {0}; b = new float[size] {0}; c = new float[size] {0}; d = new float[size] {0}; e = new float[size] {0}; f = new float[size] {0}; a_d = new double[size] {0}; b_d = new double[size] {0}; c_d = new double[size] {0}; d_d = new double[size] {0}; e_d = new double[size] {0}; f_d = new double[size] {0}; for(int = 0; < size; i++) { a[i] = i; b[i] = i; a_d[i] = i; b_d[i] = i; }; std::cout << "now doing single float rounds!\n"; std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now(); for(int = 0; < rounds; i++) { multiply_singular(a, b, c); } std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now(); auto duration_ss = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count(); std::cout << "now doing omp float rounds!\n"; t1 = std::chrono::high_resolution_clock::now(); for(int = 0; < rounds*10; i++) { multiply_omp(a, b, d); }; t2 = std::chrono::high_resolution_clock::now(); auto duration_so = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count(); std::cout << "now doing avx float rounds!\n"; t1 = std::chrono::high_resolution_clock::now(); for(int = 0; < rounds*10; i++) { multiply_avx(a, b, e); }; t2 = std::chrono::high_resolution_clock::now(); auto duration_sa = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count(); std::cout << "now doing avx omp float rounds!\n"; t1 = std::chrono::high_resolution_clock::now(); for(int = 0; < rounds*10; i++) { multiply_avx_omp(a, b, e); }; t2 = std::chrono::high_resolution_clock::now(); auto duration_sao = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count(); std::cout << "now doing single double rounds!\n"; t1 = std::chrono::high_resolution_clock::now(); for(int = 0; < rounds; i++) { multiply_singular_double(a_d, b_d, c_d); }; t2 = std::chrono::high_resolution_clock::now(); auto duration_ds = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count(); std::cout << "now doing omp double rounds!\n"; t1 = std::chrono::high_resolution_clock::now(); for(int = 0; < rounds*10; i++) { multiply_omp_double(a_d, b_d, d_d); }; t2 = std::chrono::high_resolution_clock::now(); auto duration_do = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count(); std::cout << "now doing avx double rounds!\n"; t1 = std::chrono::high_resolution_clock::now(); for(int = 0; < rounds*10; i++) { multiply_avx_double(a_d, b_d, e_d); }; t2 = std::chrono::high_resolution_clock::now(); auto duration_da = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count(); std::cout << "now doing avx omp double rounds!\n"; t1 = std::chrono::high_resolution_clock::now(); for(int = 0; < rounds*10; i++) { multiply_avx_double_omp(a_d, b_d, f_d); }; t2 = std::chrono::high_resolution_clock::now(); auto duration_dao = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count(); std::cout << "finished\n"; std::cout << "elapsed time functions:\n"; std::cout << "function\ttime[ms]\n"; std::cout << "singular float:\t" << duration_ss/rounds << '\n'; std::cout << "omp float:\t" << duration_so/(rounds*10) << '\n'; std::cout << "avx float avx:\t" << duration_sa/(rounds*10) << '\n'; std::cout << "omp avx float avx omp:\t" << duration_sao/(rounds*10) << '\n'; std::cout << "singular double:\t" << duration_ds/rounds << '\n'; std::cout << "omp double:\t" << duration_do/(rounds*10) << '\n'; std::cout << "avx double:\t" << duration_da/(rounds*10) << '\n'; std::cout << "omp avx double:\t" << duration_dao/(rounds*10) << '\n'; delete[] a; delete[] b; delete[] c; delete[] d; delete[] e; delete[] f; delete[] a_d; delete[] b_d; delete[] c_d; delete[] d_d; delete[] e_d; delete[] f_d; return 0; }
when compiling g++-5 -fopenmp -std=c++14 -march=native test_new.cpp -o test -lgomp
,
elapsed time functions: function time[ms] singular float: 117.979 omp float: 40.5385 avx float avx: 60.2964 omp avx float avx omp: 61.4206 singular double: 129.59 omp double: 200.745 avx double: 136.715 omp avx double: 122.176
or in second run
elapsed time functions: function time[ms] singular float: 113.932 omp float: 39.2581 avx float avx: 58.3029 omp avx float avx omp: 60.0023 singular double: 123.575 omp double: 66.0327 avx double: 124.293 omp avx double: 318.038
here pure omp
-function faster other functions, avx function. when adding -o3
-switch compiling line, following result:
elapsed time functions: function time[ms] singular float: 12.7361 omp float: 4.82436 avx float avx: 14.7514 omp avx float avx omp: 14.7225 singular double: 27.9976 omp double: 8.50957 avx double: 32.5175 omp avx double: 257.219
here again omp
faster else, while avx slowest, slower linear approach. why that? avx function implementation crappy, or there other problems?
executed on ubuntu 14.04.1, i7 sandy bridge, gcc version 5.3.0.
edit: found 1 mistake: should move declarations of temporary variables in avx
-functions inside for-loop, gets me omp
-level (and delivers correct results).
edit 2: when disabling -o3
-switch, omp
-avx
-instructions faster omp
-functions, switch on par.
edit 3: when filling arrays random data every time before executing next loop, (with -o3
):
elapsed time functions: function time[ms] singular float: 30.742 cilk float: 24.0769 omp float: 17.2415 avx float avx: 33.0217 omp avx float avx omp: 10.1934 singular double: 60.412 cilk double: 34.6458 omp double: 19.0739 avx double: 66.8676 omp avx double: 22.3586
and without:
elapsed time functions: function time[ms] singular float: 274.402 cilk float: 88.258 omp float: 66.2124 avx float avx: 117.066 omp avx float avx omp: 35.0313 singular double: 238.652 cilk double: 91.1667 omp double: 127.621 avx double: 249.516 omp avx double: 116.24
(and added cilk_for()-loop comparison, too).
update: added (as suggested in answer) function using #pragma omp parallel simd
. resulted in:
elapsed time functions: function time[ms] singular float: 106.081 cilk float: 33.2761 omp float: 17.0651 avx float avx: 65.1129 omp avx float: 19.1496 simd omp float: 2.6095 aligned avx omp float: 18.1165 singular double: 118.939 cilk double: 53.1102 omp double: 35.652 avx double: 131.24 omp avx double: 39.4377 simd omp double: 7.0748 aligned avx omp double: 38.4474
with compilers supporting openmp4.x may want start this:
void multiply_singular_omp_for_simd(float *a, float *b, float *d) { #pragma omp parallel simd schedule (static,16) for(int = 0; < size; i++) d[i] = a[i]*b[i]; }
it give both simd , thread parallelism. parallel decomposition done automatically, first having parallel tasks/chunks spread across threads/cores, secondly every task/chunk spreading individual iterations "across" simd "lanes".
read given couple articles if feel concerned: threading , simd in openmp4, icc documentation.
formally way phrased question ambiguous, because staring 4.0, omp loop simd, threading or simd+threading parallel. it's not omp vs. simd anymore. instead it's omp simd vs. omp threading.
not sure how given gcc implementation is, icc/ifort can deal omp parallel simd relatively long time now. gcc should support starting 5.x (#pragma omp simd supported gcc time, it's not necessary case #pragma omp parallel simd).
for optimal compiler-driven implementation may ideally prefer cache blocking , manually split iteration space have outer loop driven omp parallel for, while innermost loop driven omp simd. maybe out of scope of original question.
Comments
Post a Comment