Skip to content

Commit f255c5f

Browse files
Replaced explicit use of #pragma ivdep
Use _Pragma("vector") hidden behind compiler-aware preprocessor variable
1 parent 8c44ef6 commit f255c5f

File tree

1 file changed

+48
-37
lines changed

1 file changed

+48
-37
lines changed

mkl_random/src/mkl_distributions.cpp

Lines changed: 48 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,17 @@
4040

4141
#define MKL_INT_MAX ((npy_intp) (~((MKL_UINT) 0) >> 1))
4242

43+
#if defined(__ICC) || defined(__INTEL_COMPILER)
44+
#define DIST_PRAGMA_VECTOR _Pragma("vector")
45+
#define DIST_PRAGMA_NOVECTOR _Pragma("novector")
46+
#define DIST_ASSUME_ALIGNED(p, b) __assume_aligned((p), (b));
47+
#else
48+
#define DIST_PRAGMA_VECTOR _Pragma("GCC ivdep")
49+
#define DIST_PRAGMA_NOVECTOR
50+
#define DIST_ASSUME_ALIGNED(p, b)
51+
#endif
52+
53+
4354
void
4455
irk_double_vec(irk_state *state, npy_intp len, double *res)
4556
{
@@ -413,7 +424,7 @@ irk_pareto_vec(irk_state *state, npy_intp len, double *res, const double alp)
413424
/* res[i] = pow(res[i], neg_rec_alp) */
414425
vmdPowx(len, res, neg_rec_alp, res, VML_HA);
415426

416-
#pragma ivdep
427+
DIST_PRAGMA_VECTOR
417428
for(i=0; i < len; i++) res[i] -= 1.0;
418429

419430
}
@@ -491,7 +502,7 @@ irk_rayleigh_vec(irk_state *state, npy_intp len, double *res, const double scale
491502

492503
vmdSqrt(len, res, res, VML_HA);
493504

494-
#pragma ivdep
505+
DIST_PRAGMA_VECTOR
495506
for(i=0; i < len; i++) res[i] *= scale;
496507

497508
}
@@ -618,7 +629,7 @@ irk_noncentral_chisquare_vec(irk_state *state, npy_intp len, double *res, const
618629
idx = (int *) mkl_malloc(len * sizeof(int), 64);
619630
assert( idx != NULL );
620631

621-
#pragma ivdep
632+
DIST_PRAGMA_VECTOR
622633
for(i=0; i <len; i++) idx[i] = i;
623634

624635
std::sort(idx, idx + len, [pvec](int i1, int i2){ return pvec[i1] < pvec[i2]; } );
@@ -631,15 +642,15 @@ irk_noncentral_chisquare_vec(irk_state *state, npy_intp len, double *res, const
631642
for(i = 0; i < len; ) {
632643
int k, j, cv = pvec[idx[i]];
633644

634-
#pragma ivdep
645+
DIST_PRAGMA_VECTOR
635646
for(j=i+1; (j < len) && (pvec[idx[j]] == cv); j++) {}
636647

637648
assert(j > i);
638649
err = vdRngGamma(VSL_RNG_METHOD_GAMMA_GNORM_ACCURATE, state->stream, j - i, tmp,
639650
shape + cv, d_zero, d_two);
640651
assert(err == VSL_STATUS_OK);
641652

642-
#pragma ivdep
653+
DIST_PRAGMA_VECTOR
643654
for(k = i; k < j; k++) res[idx[k]] = tmp[k - i];
644655

645656
i = j;
@@ -733,10 +744,10 @@ irk_logistic_vec(irk_state *state, npy_intp len, double *res, const double loc,
733744
assert(err == VSL_STATUS_OK);
734745

735746
/* can MKL optimize computation of the logit function p \mapsto \ln(p/(1-p)) */
736-
#pragma ivdep
747+
DIST_PRAGMA_VECTOR
737748
for(i=0; i<len; i++) res[i] = log(res[i]/(1.0 - res[i]));
738749

739-
#pragma ivdep
750+
DIST_PRAGMA_VECTOR
740751
for(i=0; i<len; i++) res[i] = loc + scale*res[i];
741752
}
742753

@@ -809,7 +820,7 @@ irk_wald_vec(irk_state *state, npy_intp len, double *res, const double mean, con
809820
/* Y = mean/(4 scale) * Z^2 */
810821
vmdSqr(len, res, res, VML_HA);
811822

812-
#pragma ivdep
823+
DIST_PRAGMA_VECTOR
813824
for(i = 0; i < len; i++) {
814825
if(res[i] <= 1.0) {
815826
res[i] = 1.0 + res[i] - sqrt( res[i] * (res[i] + 2.0));
@@ -824,7 +835,7 @@ irk_wald_vec(irk_state *state, npy_intp len, double *res, const double mean, con
824835
err = vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD_ACCURATE, state->stream, len, uvec, d_zero, d_one);
825836
assert(err == VSL_STATUS_OK);
826837

827-
#pragma ivdep
838+
DIST_PRAGMA_VECTOR
828839
for(i=0; i<len; i++) {
829840
if (uvec[i]*(1.0 + res[i]) <= 1.0)
830841
res[i] = mean*res[i];
@@ -895,7 +906,7 @@ irk_vonmises_vec_small_kappa(irk_state *state, npy_intp len, double *res, const
895906
err = vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, state->stream, len, VFvec, (float) d_zero, (float) d_one);
896907
assert(err == VSL_STATUS_OK);
897908

898-
#pragma ivdep
909+
DIST_PRAGMA_VECTOR
899910
for(i = 0; i < len; i++) {
900911
double mod, resi;
901912

@@ -943,7 +954,7 @@ irk_vonmises_vec_large_kappa(irk_state *state, npy_intp len, double *res, const
943954
err = vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD_ACCURATE, state->stream, size, Vvec, d_zero, d_one);
944955
assert(err == VSL_STATUS_OK);
945956

946-
#pragma ivdep
957+
DIST_PRAGMA_VECTOR
947958
for(i = 0; i < size; i++ ) {
948959
double sn, cn, sn2, cn2;
949960
double neg_W_minus_one, V, Y;
@@ -973,7 +984,7 @@ irk_vonmises_vec_large_kappa(irk_state *state, npy_intp len, double *res, const
973984
err = vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, state->stream, len, VFvec, (float) d_zero, (float) d_one);
974985
assert(err == VSL_STATUS_OK);
975986

976-
#pragma ivdep
987+
DIST_PRAGMA_VECTOR
977988
for(i = 0; i < len; i++) {
978989
double mod, resi;
979990

@@ -1041,7 +1052,7 @@ irk_noncentral_f_vec(irk_state *state, npy_intp len, double *res, const double d
10411052
mkl_free(den);
10421053
fctr = df_den/df_num;
10431054

1044-
#pragma ivdep
1055+
DIST_PRAGMA_VECTOR
10451056
for(i = 0; i < len; i++) res[i] *= fctr;
10461057

10471058
}
@@ -1082,18 +1093,18 @@ irk_triangular_vec(irk_state *state, npy_intp len, double *res, const double x_m
10821093
assert( 0 <= ratio && ratio <= 1);
10831094

10841095
if (ratio <= 0) {
1085-
#pragma ivdep
1096+
DIST_PRAGMA_VECTOR
10861097
for(i = 0; i < len; i++) {
10871098
/* U and 1 - U are equal in distribution */
10881099
res[i] = x_max - sqrt(res[i] * rpr);
10891100
}
10901101
} else if (ratio >= 1) {
1091-
#pragma ivdep
1102+
DIST_PRAGMA_VECTOR
10921103
for(i = 0; i < len; i++) {
10931104
res[i] = x_min + sqrt(res[i]*lpr);
10941105
}
10951106
} else {
1096-
#pragma ivdep
1107+
DIST_PRAGMA_VECTOR
10971108
for(i = 0; i < len; i++) {
10981109
double ui = res[i];
10991110
res[i] = (ui > ratio) ? x_max - sqrt((1.0 - ui) * rpr) : x_min + sqrt(ui*lpr);
@@ -1324,7 +1335,7 @@ irk_zipf_long_vec(irk_state *state, npy_intp len, long *res, const double a)
13241335
err = vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD, state->stream, batch_size, Vvec, d_zero, d_one);
13251336
assert(err == VSL_STATUS_OK);
13261337

1327-
#pragma ivdep
1338+
DIST_PRAGMA_VECTOR
13281339
for(i = 0; i < batch_size; i++) {
13291340
U = d_one - Uvec[i]; V = Vvec[i];
13301341
X = (long)floor(pow(U, (-1.0)/am1));
@@ -1379,7 +1390,7 @@ irk_logseries_vec(irk_state *state, npy_intp len, int *res, const double theta)
13791390
err = vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD_ACCURATE, state->stream, batch_size, Vvec, d_zero, d_one);
13801391
assert(err == VSL_STATUS_OK);
13811392

1382-
#pragma ivdep
1393+
DIST_PRAGMA_VECTOR
13831394
for(i = 0; i < batch_size; i++) {
13841395
V = Vvec[i];
13851396
if (V >= theta) {
@@ -1454,7 +1465,7 @@ irk_discrete_uniform_long_vec(irk_state *state, npy_intp len, long *res, const l
14541465

14551466
max = ((unsigned long) high) - ((unsigned long) low) - 1UL;
14561467
if(max == 0) {
1457-
#pragma ivdep
1468+
DIST_PRAGMA_VECTOR
14581469
for(i=0; i < len; i++) res[i] = low;
14591470

14601471
return;
@@ -1467,7 +1478,7 @@ irk_discrete_uniform_long_vec(irk_state *state, npy_intp len, long *res, const l
14671478
err = viRngUniform(VSL_RNG_METHOD_UNIFORM_STD, state->stream, len, buf, -1, (const int) max);
14681479
assert(err == VSL_STATUS_OK);
14691480

1470-
#pragma ivdep
1481+
DIST_PRAGMA_VECTOR
14711482
for(i=0; i < len; i++) res[i] = low + ((long) buf[i]) + 1L;
14721483

14731484
mkl_free(buf);
@@ -1542,7 +1553,7 @@ irk_long_vec(irk_state *state, npy_intp len, long *res)
15421553

15431554
irk_ulong_vec(state, len, ulptr);
15441555

1545-
#pragma ivdep
1556+
DIST_PRAGMA_VECTOR
15461557
for(i=0; i<len; i++)
15471558
res[i] = (long) (ulptr[i] >> 1);
15481559

@@ -1565,7 +1576,7 @@ irk_rand_bool_vec(irk_state *state, npy_intp len, npy_bool *res, const npy_bool
15651576
}
15661577

15671578
if (lo == hi) {
1568-
#pragma ivdep
1579+
DIST_PRAGMA_VECTOR
15691580
for(i = 0; i < len; i++) res[i] = lo;
15701581

15711582
return;
@@ -1578,7 +1589,7 @@ irk_rand_bool_vec(irk_state *state, npy_intp len, npy_bool *res, const npy_bool
15781589
err = viRngUniform(VSL_RNG_METHOD_UNIFORM_STD, state->stream, len, buf, (const int) lo, (const int) hi + 1);
15791590
assert(err == VSL_STATUS_OK);
15801591

1581-
#pragma ivdep
1592+
DIST_PRAGMA_VECTOR
15821593
for(i = 0; i < len; i++) res[i] = (npy_bool) buf[i];
15831594

15841595
mkl_free(buf);
@@ -1601,7 +1612,7 @@ irk_rand_uint8_vec(irk_state *state, npy_intp len, npy_uint8 *res, const npy_uin
16011612
}
16021613

16031614
if (lo == hi) {
1604-
#pragma ivdep
1615+
DIST_PRAGMA_VECTOR
16051616
for(i = 0; i < len; i++) res[i] = lo;
16061617

16071618
return;
@@ -1614,7 +1625,7 @@ irk_rand_uint8_vec(irk_state *state, npy_intp len, npy_uint8 *res, const npy_uin
16141625
err = viRngUniform(VSL_RNG_METHOD_UNIFORM_STD, state->stream, len, buf, (const int) lo, (const int) hi + 1);
16151626
assert(err == VSL_STATUS_OK);
16161627

1617-
#pragma ivdep
1628+
DIST_PRAGMA_VECTOR
16181629
for(i = 0; i < len; i++) res[i] = (npy_uint8) buf[i];
16191630

16201631
mkl_free(buf);
@@ -1638,7 +1649,7 @@ irk_rand_int8_vec(irk_state *state, npy_intp len, npy_int8 *res, const npy_int8
16381649
}
16391650

16401651
if (lo == hi) {
1641-
#pragma ivdep
1652+
DIST_PRAGMA_VECTOR
16421653
for(i = 0; i < len; i++) res[i] = lo;
16431654

16441655
return;
@@ -1651,7 +1662,7 @@ irk_rand_int8_vec(irk_state *state, npy_intp len, npy_int8 *res, const npy_int8
16511662
err = viRngUniform(VSL_RNG_METHOD_UNIFORM_STD, state->stream, len, buf, (const int) lo, (const int) hi + 1);
16521663
assert(err == VSL_STATUS_OK);
16531664

1654-
#pragma ivdep
1665+
DIST_PRAGMA_VECTOR
16551666
for(i = 0; i < len; i++) res[i] = (npy_int8) buf[i];
16561667

16571668
mkl_free(buf);
@@ -1674,7 +1685,7 @@ irk_rand_uint16_vec(irk_state *state, npy_intp len, npy_uint16 *res, const npy_u
16741685
}
16751686

16761687
if (lo == hi) {
1677-
#pragma ivdep
1688+
DIST_PRAGMA_VECTOR
16781689
for(i = 0; i < len; i++) res[i] = lo;
16791690

16801691
return;
@@ -1687,7 +1698,7 @@ irk_rand_uint16_vec(irk_state *state, npy_intp len, npy_uint16 *res, const npy_u
16871698
err = viRngUniform(VSL_RNG_METHOD_UNIFORM_STD, state->stream, len, buf, (const int) lo, (const int) hi + 1);
16881699
assert(err == VSL_STATUS_OK);
16891700

1690-
#pragma ivdep
1701+
DIST_PRAGMA_VECTOR
16911702
for(i = 0; i < len; i++) res[i] = (npy_uint16) buf[i];
16921703

16931704
mkl_free(buf);
@@ -1710,7 +1721,7 @@ irk_rand_int16_vec(irk_state *state, npy_intp len, npy_int16 *res, const npy_int
17101721
}
17111722

17121723
if (lo == hi) {
1713-
#pragma ivdep
1724+
DIST_PRAGMA_VECTOR
17141725
for(i = 0; i < len; i++) res[i] = lo;
17151726

17161727
return;
@@ -1723,7 +1734,7 @@ irk_rand_int16_vec(irk_state *state, npy_intp len, npy_int16 *res, const npy_int
17231734
err = viRngUniform(VSL_RNG_METHOD_UNIFORM_STD, state->stream, len, buf, (const int) lo, (const int) hi + 1);
17241735
assert(err == VSL_STATUS_OK);
17251736

1726-
#pragma ivdep
1737+
DIST_PRAGMA_VECTOR
17271738
for(i = 0; i < len; i++) res[i] = (npy_int16) buf[i];
17281739

17291740
mkl_free(buf);
@@ -1764,7 +1775,7 @@ irk_rand_uint32_vec(irk_state *state, npy_intp len, npy_uint32 *res, const npy_u
17641775
err = viRngUniform(VSL_RNG_METHOD_UNIFORM_STD, state->stream, len, (int *) res, (const int) (lo - shft), (const int) (hi - shft + 1U));
17651776
assert(err == VSL_STATUS_OK);
17661777

1767-
#pragma ivdep
1778+
DIST_PRAGMA_VECTOR
17681779
for(i=0; i < len; i++) res[i] += shft;
17691780

17701781
} else {
@@ -1794,7 +1805,7 @@ irk_rand_int32_vec(irk_state *state, npy_intp len, npy_int32 *res, const npy_int
17941805

17951806
irk_rand_uint32_vec(state, len, (npy_uint32 *) res, 0U, (npy_uint32) (hi - lo));
17961807

1797-
#pragma ivdep
1808+
DIST_PRAGMA_VECTOR
17981809
for(i=0; i < len; i++) res[i] += lo;
17991810

18001811
} else {
@@ -1829,7 +1840,7 @@ irk_rand_uint64_vec(irk_state *state, npy_intp len, npy_uint64 *res, const npy_u
18291840

18301841
rng = hi - lo;
18311842
if(!rng) {
1832-
#pragma ivdep
1843+
DIST_PRAGMA_VECTOR
18331844
for(i = 0; i < len; i++) res[i] = lo;
18341845

18351846
return;
@@ -1844,7 +1855,7 @@ irk_rand_uint64_vec(irk_state *state, npy_intp len, npy_uint64 *res, const npy_u
18441855
err = viRngUniform(VSL_RNG_METHOD_UNIFORM_STD, state->stream, len, buf, 0, (const int) rng);
18451856
assert(err == VSL_STATUS_OK);
18461857

1847-
#pragma ivdep
1858+
DIST_PRAGMA_VECTOR
18481859
for(i=0; i < len; i++) res[i] = lo + ((npy_uint64) buf[i]);
18491860

18501861
mkl_free(buf);
@@ -1896,7 +1907,7 @@ irk_rand_int64_vec(irk_state *state, npy_intp len, npy_int64 *res, const npy_int
18961907

18971908
irk_rand_uint64_vec(state, len, (npy_uint64 *) res, 0, rng);
18981909

1899-
#pragma ivdep
1910+
DIST_PRAGMA_VECTOR
19001911
for(i = 0; i < len; i++)
19011912
res[i] = res[i] + lo;
19021913

@@ -1950,7 +1961,7 @@ irk_multinormal_vec_BM2(irk_state *state, npy_intp len, double *res, const int d
19501961
int err;
19511962
const MKL_INT storage_mode = cholesky_storage_flags[storage_flag];
19521963

1953-
if (len<1)
1964+
if (len<1)
19541965
return;
19551966

19561967
while (len > MKL_INT_MAX) {

0 commit comments

Comments
 (0)