4040
4141#define MKL_INT_MAX ((npy_intp) (~((MKL_UINT) 0 ) >> 1 ))
4242
43+ #if defined(__ICC) || defined(__INTEL_COMPILER)
44+ #define DIST_PRAGMA_VECTOR _Pragma (" vector" )
45+ #define DIST_PRAGMA_NOVECTOR _Pragma (" novector" )
46+ #define DIST_ASSUME_ALIGNED (p, b ) __assume_aligned((p), (b));
47+ #else
48+ #define DIST_PRAGMA_VECTOR _Pragma (" GCC ivdep" )
49+ #define DIST_PRAGMA_NOVECTOR
50+ #define DIST_ASSUME_ALIGNED (p, b )
51+ #endif
52+
53+
4354void
4455irk_double_vec (irk_state *state, npy_intp len, double *res)
4556{
@@ -413,7 +424,7 @@ irk_pareto_vec(irk_state *state, npy_intp len, double *res, const double alp)
413424 /* res[i] = pow(res[i], neg_rec_alp) */
414425 vmdPowx (len, res, neg_rec_alp, res, VML_HA);
415426
416- # pragma ivdep
427+ DIST_PRAGMA_VECTOR
417428 for (i=0 ; i < len; i++) res[i] -= 1.0 ;
418429
419430}
@@ -491,7 +502,7 @@ irk_rayleigh_vec(irk_state *state, npy_intp len, double *res, const double scale
491502
492503 vmdSqrt (len, res, res, VML_HA);
493504
494- # pragma ivdep
505+ DIST_PRAGMA_VECTOR
495506 for (i=0 ; i < len; i++) res[i] *= scale;
496507
497508}
@@ -618,7 +629,7 @@ irk_noncentral_chisquare_vec(irk_state *state, npy_intp len, double *res, const
618629 idx = (int *) mkl_malloc (len * sizeof (int ), 64 );
619630 assert ( idx != NULL );
620631
621- # pragma ivdep
632+ DIST_PRAGMA_VECTOR
622633 for (i=0 ; i <len; i++) idx[i] = i;
623634
624635 std::sort (idx, idx + len, [pvec](int i1, int i2){ return pvec[i1] < pvec[i2]; } );
@@ -631,15 +642,15 @@ irk_noncentral_chisquare_vec(irk_state *state, npy_intp len, double *res, const
631642 for (i = 0 ; i < len; ) {
632643 int k, j, cv = pvec[idx[i]];
633644
634- # pragma ivdep
645+ DIST_PRAGMA_VECTOR
635646 for (j=i+1 ; (j < len) && (pvec[idx[j]] == cv); j++) {}
636647
637648 assert (j > i);
638649 err = vdRngGamma (VSL_RNG_METHOD_GAMMA_GNORM_ACCURATE, state->stream , j - i, tmp,
639650 shape + cv, d_zero, d_two);
640651 assert (err == VSL_STATUS_OK);
641652
642- # pragma ivdep
653+ DIST_PRAGMA_VECTOR
643654 for (k = i; k < j; k++) res[idx[k]] = tmp[k - i];
644655
645656 i = j;
@@ -733,10 +744,10 @@ irk_logistic_vec(irk_state *state, npy_intp len, double *res, const double loc,
733744 assert (err == VSL_STATUS_OK);
734745
735746 /* can MKL optimize computation of the logit function p \mapsto \ln(p/(1-p)) */
736- # pragma ivdep
747+ DIST_PRAGMA_VECTOR
737748 for (i=0 ; i<len; i++) res[i] = log (res[i]/(1.0 - res[i]));
738749
739- # pragma ivdep
750+ DIST_PRAGMA_VECTOR
740751 for (i=0 ; i<len; i++) res[i] = loc + scale*res[i];
741752}
742753
@@ -809,7 +820,7 @@ irk_wald_vec(irk_state *state, npy_intp len, double *res, const double mean, con
809820 /* Y = mean/(4 scale) * Z^2 */
810821 vmdSqr (len, res, res, VML_HA);
811822
812- # pragma ivdep
823+ DIST_PRAGMA_VECTOR
813824 for (i = 0 ; i < len; i++) {
814825 if (res[i] <= 1.0 ) {
815826 res[i] = 1.0 + res[i] - sqrt ( res[i] * (res[i] + 2.0 ));
@@ -824,7 +835,7 @@ irk_wald_vec(irk_state *state, npy_intp len, double *res, const double mean, con
824835 err = vdRngUniform (VSL_RNG_METHOD_UNIFORM_STD_ACCURATE, state->stream , len, uvec, d_zero, d_one);
825836 assert (err == VSL_STATUS_OK);
826837
827- # pragma ivdep
838+ DIST_PRAGMA_VECTOR
828839 for (i=0 ; i<len; i++) {
829840 if (uvec[i]*(1.0 + res[i]) <= 1.0 )
830841 res[i] = mean*res[i];
@@ -895,7 +906,7 @@ irk_vonmises_vec_small_kappa(irk_state *state, npy_intp len, double *res, const
895906 err = vsRngUniform (VSL_RNG_METHOD_UNIFORM_STD, state->stream , len, VFvec, (float ) d_zero, (float ) d_one);
896907 assert (err == VSL_STATUS_OK);
897908
898- # pragma ivdep
909+ DIST_PRAGMA_VECTOR
899910 for (i = 0 ; i < len; i++) {
900911 double mod, resi;
901912
@@ -943,7 +954,7 @@ irk_vonmises_vec_large_kappa(irk_state *state, npy_intp len, double *res, const
943954 err = vdRngUniform (VSL_RNG_METHOD_UNIFORM_STD_ACCURATE, state->stream , size, Vvec, d_zero, d_one);
944955 assert (err == VSL_STATUS_OK);
945956
946- # pragma ivdep
957+ DIST_PRAGMA_VECTOR
947958 for (i = 0 ; i < size; i++ ) {
948959 double sn, cn, sn2, cn2;
949960 double neg_W_minus_one, V, Y;
@@ -973,7 +984,7 @@ irk_vonmises_vec_large_kappa(irk_state *state, npy_intp len, double *res, const
973984 err = vsRngUniform (VSL_RNG_METHOD_UNIFORM_STD, state->stream , len, VFvec, (float ) d_zero, (float ) d_one);
974985 assert (err == VSL_STATUS_OK);
975986
976- # pragma ivdep
987+ DIST_PRAGMA_VECTOR
977988 for (i = 0 ; i < len; i++) {
978989 double mod, resi;
979990
@@ -1041,7 +1052,7 @@ irk_noncentral_f_vec(irk_state *state, npy_intp len, double *res, const double d
10411052 mkl_free (den);
10421053 fctr = df_den/df_num;
10431054
1044- # pragma ivdep
1055+ DIST_PRAGMA_VECTOR
10451056 for (i = 0 ; i < len; i++) res[i] *= fctr;
10461057
10471058}
@@ -1082,18 +1093,18 @@ irk_triangular_vec(irk_state *state, npy_intp len, double *res, const double x_m
10821093 assert ( 0 <= ratio && ratio <= 1 );
10831094
10841095 if (ratio <= 0 ) {
1085- # pragma ivdep
1096+ DIST_PRAGMA_VECTOR
10861097 for (i = 0 ; i < len; i++) {
10871098 /* U and 1 - U are equal in distribution */
10881099 res[i] = x_max - sqrt (res[i] * rpr);
10891100 }
10901101 } else if (ratio >= 1 ) {
1091- # pragma ivdep
1102+ DIST_PRAGMA_VECTOR
10921103 for (i = 0 ; i < len; i++) {
10931104 res[i] = x_min + sqrt (res[i]*lpr);
10941105 }
10951106 } else {
1096- # pragma ivdep
1107+ DIST_PRAGMA_VECTOR
10971108 for (i = 0 ; i < len; i++) {
10981109 double ui = res[i];
10991110 res[i] = (ui > ratio) ? x_max - sqrt ((1.0 - ui) * rpr) : x_min + sqrt (ui*lpr);
@@ -1324,7 +1335,7 @@ irk_zipf_long_vec(irk_state *state, npy_intp len, long *res, const double a)
13241335 err = vdRngUniform (VSL_RNG_METHOD_UNIFORM_STD, state->stream , batch_size, Vvec, d_zero, d_one);
13251336 assert (err == VSL_STATUS_OK);
13261337
1327- # pragma ivdep
1338+ DIST_PRAGMA_VECTOR
13281339 for (i = 0 ; i < batch_size; i++) {
13291340 U = d_one - Uvec[i]; V = Vvec[i];
13301341 X = (long )floor (pow (U, (-1.0 )/am1));
@@ -1379,7 +1390,7 @@ irk_logseries_vec(irk_state *state, npy_intp len, int *res, const double theta)
13791390 err = vdRngUniform (VSL_RNG_METHOD_UNIFORM_STD_ACCURATE, state->stream , batch_size, Vvec, d_zero, d_one);
13801391 assert (err == VSL_STATUS_OK);
13811392
1382- # pragma ivdep
1393+ DIST_PRAGMA_VECTOR
13831394 for (i = 0 ; i < batch_size; i++) {
13841395 V = Vvec[i];
13851396 if (V >= theta) {
@@ -1454,7 +1465,7 @@ irk_discrete_uniform_long_vec(irk_state *state, npy_intp len, long *res, const l
14541465
14551466 max = ((unsigned long ) high) - ((unsigned long ) low) - 1UL ;
14561467 if (max == 0 ) {
1457- # pragma ivdep
1468+ DIST_PRAGMA_VECTOR
14581469 for (i=0 ; i < len; i++) res[i] = low;
14591470
14601471 return ;
@@ -1467,7 +1478,7 @@ irk_discrete_uniform_long_vec(irk_state *state, npy_intp len, long *res, const l
14671478 err = viRngUniform (VSL_RNG_METHOD_UNIFORM_STD, state->stream , len, buf, -1 , (const int ) max);
14681479 assert (err == VSL_STATUS_OK);
14691480
1470- # pragma ivdep
1481+ DIST_PRAGMA_VECTOR
14711482 for (i=0 ; i < len; i++) res[i] = low + ((long ) buf[i]) + 1L ;
14721483
14731484 mkl_free (buf);
@@ -1542,7 +1553,7 @@ irk_long_vec(irk_state *state, npy_intp len, long *res)
15421553
15431554 irk_ulong_vec (state, len, ulptr);
15441555
1545- # pragma ivdep
1556+ DIST_PRAGMA_VECTOR
15461557 for (i=0 ; i<len; i++)
15471558 res[i] = (long ) (ulptr[i] >> 1 );
15481559
@@ -1565,7 +1576,7 @@ irk_rand_bool_vec(irk_state *state, npy_intp len, npy_bool *res, const npy_bool
15651576 }
15661577
15671578 if (lo == hi) {
1568- # pragma ivdep
1579+ DIST_PRAGMA_VECTOR
15691580 for (i = 0 ; i < len; i++) res[i] = lo;
15701581
15711582 return ;
@@ -1578,7 +1589,7 @@ irk_rand_bool_vec(irk_state *state, npy_intp len, npy_bool *res, const npy_bool
15781589 err = viRngUniform (VSL_RNG_METHOD_UNIFORM_STD, state->stream , len, buf, (const int ) lo, (const int ) hi + 1 );
15791590 assert (err == VSL_STATUS_OK);
15801591
1581- # pragma ivdep
1592+ DIST_PRAGMA_VECTOR
15821593 for (i = 0 ; i < len; i++) res[i] = (npy_bool) buf[i];
15831594
15841595 mkl_free (buf);
@@ -1601,7 +1612,7 @@ irk_rand_uint8_vec(irk_state *state, npy_intp len, npy_uint8 *res, const npy_uin
16011612 }
16021613
16031614 if (lo == hi) {
1604- # pragma ivdep
1615+ DIST_PRAGMA_VECTOR
16051616 for (i = 0 ; i < len; i++) res[i] = lo;
16061617
16071618 return ;
@@ -1614,7 +1625,7 @@ irk_rand_uint8_vec(irk_state *state, npy_intp len, npy_uint8 *res, const npy_uin
16141625 err = viRngUniform (VSL_RNG_METHOD_UNIFORM_STD, state->stream , len, buf, (const int ) lo, (const int ) hi + 1 );
16151626 assert (err == VSL_STATUS_OK);
16161627
1617- # pragma ivdep
1628+ DIST_PRAGMA_VECTOR
16181629 for (i = 0 ; i < len; i++) res[i] = (npy_uint8) buf[i];
16191630
16201631 mkl_free (buf);
@@ -1638,7 +1649,7 @@ irk_rand_int8_vec(irk_state *state, npy_intp len, npy_int8 *res, const npy_int8
16381649 }
16391650
16401651 if (lo == hi) {
1641- # pragma ivdep
1652+ DIST_PRAGMA_VECTOR
16421653 for (i = 0 ; i < len; i++) res[i] = lo;
16431654
16441655 return ;
@@ -1651,7 +1662,7 @@ irk_rand_int8_vec(irk_state *state, npy_intp len, npy_int8 *res, const npy_int8
16511662 err = viRngUniform (VSL_RNG_METHOD_UNIFORM_STD, state->stream , len, buf, (const int ) lo, (const int ) hi + 1 );
16521663 assert (err == VSL_STATUS_OK);
16531664
1654- # pragma ivdep
1665+ DIST_PRAGMA_VECTOR
16551666 for (i = 0 ; i < len; i++) res[i] = (npy_int8) buf[i];
16561667
16571668 mkl_free (buf);
@@ -1674,7 +1685,7 @@ irk_rand_uint16_vec(irk_state *state, npy_intp len, npy_uint16 *res, const npy_u
16741685 }
16751686
16761687 if (lo == hi) {
1677- # pragma ivdep
1688+ DIST_PRAGMA_VECTOR
16781689 for (i = 0 ; i < len; i++) res[i] = lo;
16791690
16801691 return ;
@@ -1687,7 +1698,7 @@ irk_rand_uint16_vec(irk_state *state, npy_intp len, npy_uint16 *res, const npy_u
16871698 err = viRngUniform (VSL_RNG_METHOD_UNIFORM_STD, state->stream , len, buf, (const int ) lo, (const int ) hi + 1 );
16881699 assert (err == VSL_STATUS_OK);
16891700
1690- # pragma ivdep
1701+ DIST_PRAGMA_VECTOR
16911702 for (i = 0 ; i < len; i++) res[i] = (npy_uint16) buf[i];
16921703
16931704 mkl_free (buf);
@@ -1710,7 +1721,7 @@ irk_rand_int16_vec(irk_state *state, npy_intp len, npy_int16 *res, const npy_int
17101721 }
17111722
17121723 if (lo == hi) {
1713- # pragma ivdep
1724+ DIST_PRAGMA_VECTOR
17141725 for (i = 0 ; i < len; i++) res[i] = lo;
17151726
17161727 return ;
@@ -1723,7 +1734,7 @@ irk_rand_int16_vec(irk_state *state, npy_intp len, npy_int16 *res, const npy_int
17231734 err = viRngUniform (VSL_RNG_METHOD_UNIFORM_STD, state->stream , len, buf, (const int ) lo, (const int ) hi + 1 );
17241735 assert (err == VSL_STATUS_OK);
17251736
1726- # pragma ivdep
1737+ DIST_PRAGMA_VECTOR
17271738 for (i = 0 ; i < len; i++) res[i] = (npy_int16) buf[i];
17281739
17291740 mkl_free (buf);
@@ -1764,7 +1775,7 @@ irk_rand_uint32_vec(irk_state *state, npy_intp len, npy_uint32 *res, const npy_u
17641775 err = viRngUniform (VSL_RNG_METHOD_UNIFORM_STD, state->stream , len, (int *) res, (const int ) (lo - shft), (const int ) (hi - shft + 1U ));
17651776 assert (err == VSL_STATUS_OK);
17661777
1767- # pragma ivdep
1778+ DIST_PRAGMA_VECTOR
17681779 for (i=0 ; i < len; i++) res[i] += shft;
17691780
17701781 } else {
@@ -1794,7 +1805,7 @@ irk_rand_int32_vec(irk_state *state, npy_intp len, npy_int32 *res, const npy_int
17941805
17951806 irk_rand_uint32_vec (state, len, (npy_uint32 *) res, 0U , (npy_uint32) (hi - lo));
17961807
1797- # pragma ivdep
1808+ DIST_PRAGMA_VECTOR
17981809 for (i=0 ; i < len; i++) res[i] += lo;
17991810
18001811 } else {
@@ -1829,7 +1840,7 @@ irk_rand_uint64_vec(irk_state *state, npy_intp len, npy_uint64 *res, const npy_u
18291840
18301841 rng = hi - lo;
18311842 if (!rng) {
1832- # pragma ivdep
1843+ DIST_PRAGMA_VECTOR
18331844 for (i = 0 ; i < len; i++) res[i] = lo;
18341845
18351846 return ;
@@ -1844,7 +1855,7 @@ irk_rand_uint64_vec(irk_state *state, npy_intp len, npy_uint64 *res, const npy_u
18441855 err = viRngUniform (VSL_RNG_METHOD_UNIFORM_STD, state->stream , len, buf, 0 , (const int ) rng);
18451856 assert (err == VSL_STATUS_OK);
18461857
1847- # pragma ivdep
1858+ DIST_PRAGMA_VECTOR
18481859 for (i=0 ; i < len; i++) res[i] = lo + ((npy_uint64) buf[i]);
18491860
18501861 mkl_free (buf);
@@ -1896,7 +1907,7 @@ irk_rand_int64_vec(irk_state *state, npy_intp len, npy_int64 *res, const npy_int
18961907
18971908 irk_rand_uint64_vec (state, len, (npy_uint64 *) res, 0 , rng);
18981909
1899- # pragma ivdep
1910+ DIST_PRAGMA_VECTOR
19001911 for (i = 0 ; i < len; i++)
19011912 res[i] = res[i] + lo;
19021913
@@ -1950,7 +1961,7 @@ irk_multinormal_vec_BM2(irk_state *state, npy_intp len, double *res, const int d
19501961 int err;
19511962 const MKL_INT storage_mode = cholesky_storage_flags[storage_flag];
19521963
1953- if (len<1 )
1964+ if (len<1 )
19541965 return ;
19551966
19561967 while (len > MKL_INT_MAX) {
0 commit comments