11#ifndef roofit_batchcompute_faddeeva_impl_h
12#define roofit_batchcompute_faddeeva_impl_h
// Fragments of an in-place complex exponential acting on the pair (re, im).
// NOTE(review): only part of the routine is visible in this listing; the
// enclosing function header and the #else / #endif lines are not shown.
//
// The condition below selects the portable <cmath> path for CUDA builds,
// compilers other than GCC/Clang, non-unix or non-x86_64 targets, builds
// without __OPTIMIZE__ or with __OPTIMIZE_SIZE__, and for the Intel, Open64
// and PathScale compilers.
33#if defined(__CUDACC__) || !(defined(__GNUC__) || defined(__clang__)) || \
34 !defined(__unix__) || !defined(__x86_64__) || \
35 !defined(__OPTIMIZE__) || defined(__OPTIMIZE_SIZE__) || \
36 defined(__INTEL_COMPILER) || \
37 defined(__OPEN64__) || defined(__PATHSCALE__)
// Portable path: (re, im) <- exp(re) * (cos(im), sin(im)).
38 const double e = std::exp(re);
// re is overwritten first; im still holds the input value when sin(im) is
// evaluated afterwards, so the statement order matters.
39 re =
e * std::cos(im);
40 im =
e * std::sin(im);
// Excerpts of an x87 inline-assembly block; most of its instructions are not
// visible in this listing.
53 "fsubr %%st,%%st(1)\n\t"
// NOTE(review): the testl instructions check single bits of eax (0x200 and
// 0x400); presumably eax holds an FPU status word at this point, but the
// instruction that loads it is not visible here - confirm.
62 "testl $0x200, %%eax\n\t"
70 "testl $0x400,%%eax\n\t"
78 "testl $0x400,%%eax\n\t"
// Operands: with GCC's x86 constraint letters, "t" is the top of the x87
// stack and "u" the entry below it. The results are therefore read back as
// im from st(0) and re from st(1), while the matching constraints "0"/"1"
// load the inputs re into st(0) and im into st(1).
86 :
"=t" (im),
"=u" (re):
"0" (re),
"1" (im) :
// Tail of the clobber list (earlier entries are not visible here).
92 ,
"st(5)",
"st(6)",
"st(7)"
// Generic kernel taking z = zre + i*zim and returning a std::complex<T>.
// NOTE(review): this listing omits several original lines (the function-name
// line, braces, some branches and calls), so the comments below describe only
// what is visible.
//
// Template parameters:
//   T        floating-point type used for all arithmetic
//   N        number of entries in a[] and npi[] (terms of the series loop)
//   NTAYLOR  number of complex Taylor coefficients per expansion point
//   NCF      number of continued-fraction iterations
// Parameters:
//   zre, zim   real and imaginary part of the argument
//   tm         scale parameter: the series works on tm*z, and |z|^2 > tm^2
//              selects the continued-fraction branch
//   a          N series coefficients
//   npi        N grid points; npi[1] is used as the grid spacing
//   taylorarr  N blocks of NTAYLOR (re, im) coefficient pairs
105 template <
class T,
unsigned N,
unsigned NTAYLOR,
unsigned NCF>
107 T zre, T zim,
const T tm,
108 const T (&
a)[
N],
const T (&npi)[
N],
109 const T (&taylorarr)[
N * NTAYLOR * 2])
// --- Taylor branch: only considered when Im(z)^2 < 9e-6 ---
115 const T zim2 = zim * zim;
116 const T maxnorm = T(9) / T(1000000);
117 if (zim2 < maxnorm) {
// Position of tm*Re(z) measured in units of npi[1].
120 const T dnsing = tm * zre / npi[1];
// Bound (N - 1/2)^2: guarantees the rounded index below stays < N.
121 const T dnsingmax2 = (T(
N) - T(1) / T(2)) * (T(
N) - T(1) / T(2));
122 if (dnsing * dnsing < dnsingmax2) {
// Remember the sign of Re(z); the expansion itself works with |Re(z)|.
126 const bool negrez = zre < T(0);
// Nearest grid index: round |dnsing| by adding 1/2 and truncating.
128 const int nsing =
int(std::abs(dnsing) + T(1) / T(2));
// Offset from the chosen grid point and its squared distance.
// NOTE(review): nsing is derived from tm*zre, but the offset subtracts
// npi[nsing] from |zre| without a tm factor - confirm this is intended.
130 const T zmnpire = std::abs(zre) - npi[nsing];
131 const T zmnpinorm = zmnpire * zmnpire + zim2;
133 if (zmnpinorm < maxnorm) {
// Coefficient block for this index: NTAYLOR (re, im) pairs.
134 const T* coeffs = &taylorarr[nsing * NTAYLOR * 2];
// Horner scheme in complex arithmetic:
//   sum <- sum * (zmnpire + i*zim) + coeffs[i]
// coeffs[0..1] is the leading coefficient; the last pair is the constant term.
139 T sumre = coeffs[0], sumim = coeffs[1];
140 for (
unsigned i = 1; i < NTAYLOR; ++i) {
141 const T re = sumre * zmnpire - sumim * zim;
142 const T im = sumim * zmnpire + sumre * zim;
143 sumre = re + coeffs[2 * i + 0];
144 sumim = im + coeffs[2 * i + 1];
// For Re(z) < 0 the result is complex-conjugated.
147 if (negrez)
return std::complex<T>(sumre, -sumim);
148 else return std::complex<T>(sumre, sumim);
// Flag for Im(z) < 0; the statements acting on it are not visible here.
154 const bool negimz = zim < T(0);
// --- Continued-fraction branch: taken when |z|^2 > tm^2 ---
159 const T znorm = zre * zre + zim2;
160 if (znorm > tm * tm) {
// 1/sqrt(pi)
162 const T isqrtpi = 5.64189583547756287e-01;
// z^2 = (zre^2 - zim^2) + i * (2 * zre * zim)
163 const T z2re = (zre + zim) * (zre - zim);
164 const T z2im = T(2) * zre * zim;
// cf is iterated from k = NCF down to 1; cfnorm caches |cf|^2.
165 T cfre = T(1), cfim = T(0), cfnorm = T(1);
166 for (
unsigned k =
NCF; k; --k) {
// cf <- (k/2) * conj(cf) / |cf|^2, i.e. (k/2) / cf
167 cfre = +(T(k) / T(2)) * cfre / cfnorm;
168 cfim = -(T(k) / T(2)) * cfim / cfnorm;
// Odd k: subtract z^2 (any handling of even k is not visible in this listing).
169 if (k & 1) cfre -= z2re, cfim -= z2im;
171 cfnorm = cfre * cfre + cfim * cfim;
// sum = -i * z / cf * isqrtpi, written out in real arithmetic.
173 T sumre = (zim * cfre - zre * cfim) * isqrtpi / cfnorm;
174 T sumim = -(zre * cfre + zim * cfim) * isqrtpi / cfnorm;
// ez2 starts as -z^2; whatever transforms it before the return below is not
// visible in this listing.
178 T ez2re = -z2re, ez2im = -z2im;
180 return std::complex<T>(T(2) * ez2re - sumre,
181 T(2) * ez2im - sumim);
183 return std::complex<T>(sumre, sumim);
// --- Series branch ---
// 2*sqrt(pi)
186 const T twosqrtpi = 3.54490770181103205e+00;
187 const T tmzre = tm * zre, tmzim = tm * zim;
// eitmz is initialised to i * tm * z; later processing is not visible here.
189 T eitmzre = -tmzim, eitmzim = tmzre;
// Two complex numerators stored as (re, im) pairs: 1 - eitmz and 1 + eitmz.
192 const T numerarr[4] = {
193 T(1) - eitmzre, -eitmzim, T(1) + eitmzre, +eitmzim
// The same numerators multiplied by tm*z (complex products).
196 const T numertmz[4] = {
197 tmzre * numerarr[0] - tmzim * numerarr[1],
198 tmzre * numerarr[1] + tmzim * numerarr[0],
199 tmzre * numerarr[2] - tmzim * numerarr[3],
200 tmzre * numerarr[3] + tmzim * numerarr[2]
// Loop-invariant pieces of the denominators npi[i]^2 - (tm*z)^2:
//   imaginary part = -2 * Re(tm z) * Im(tm z), plus Im(tm z)^2 and the
//   square of that imaginary part.
203 const T reimtmzm2 = T(-2) * tmzre * tmzim;
204 const T imtmz2 = tmzim * tmzim;
205 const T reimtmzm22 = reimtmzm2 * reimtmzm2;
// Selects the straightforward single-loop summation for CUDA, non-x86_64,
// non-optimising or size-optimising builds, for the Intel, Clang, Open64 and
// PathScale compilers, and for non-GNU compilers.
213#if (defined(__CUDACC__) || !defined(__x86_64__)) || !defined(__OPTIMIZE__) || \
214 defined(__OPTIMIZE_SIZE__) || defined(__INTEL_COMPILER) || \
215 defined(__clang__) || defined(__OPEN64__) || \
216 defined(__PATHSCALE__) || !defined(__GNUC__)
// Initial value: -a[0] * numerarr[0..1] / z (conj(z) / |z|^2 written out).
217 T sumre = (-
a[0] / znorm) * (numerarr[0] * zre + numerarr[1] * zim);
218 T sumim = (-
a[0] / znorm) * (numerarr[1] * zre - numerarr[0] * zim);
219 for (
unsigned i = 0; i <
N; ++i) {
// j = 0 for even i, 2 for odd i: picks which numerator pair to use.
220 const unsigned j = (i << 1) & 2;
// wk = Re(npi[i]^2 - (tm*z)^2); norm = |npi[i]^2 - (tm*z)^2|^2.
222 const T wk = imtmz2 + (npi[i] + tmzre) * (npi[i] - tmzre);
224 const T norm = wk * wk + reimtmzm22;
225 const T
f = T(2) * tm *
a[i] / norm;
// sum -= 2 * tm * a[i] * numertmz[j..j+1] / (npi[i]^2 - (tm*z)^2)
227 sumre -=
f * (numertmz[j] * wk + numertmz[j + 1] * reimtmzm2);
228 sumim -=
f * (numertmz[j + 1] * wk - numertmz[j] * reimtmzm2);
// Alternative summation (the preprocessor line separating it from the loop
// above is not visible): terms are staged in tmp[] and accumulated afterwards.
// Pass 1: per term, store the scale factor in tmp[2*i + 1]; the store into
// tmp[2*i + 0] is not visible here, but pass 2 reads that slot back as wk.
233 for (
unsigned i = 0; i <
N; ++i) {
234 const T wk = imtmz2 + (npi[i] + tmzre) * (npi[i] - tmzre);
236 tmp[2 * i + 1] = T(2) * tm *
a[i] / (wk * wk + reimtmzm22);
// Pass 2: handle terms in pairs (even index with numertmz[0..1], odd index
// with numertmz[2..3]), overwriting tmp[] in place with the negated
// (re, im) contribution of each term.
238 for (
unsigned i = 0; i <
N / 2; ++i) {
239 T wk = tmp[4 * i + 0],
f = tmp[4 * i + 1];
240 tmp[4 * i + 0] = -
f * (numertmz[0] * wk + numertmz[1] * reimtmzm2);
241 tmp[4 * i + 1] = -
f * (numertmz[1] * wk - numertmz[0] * reimtmzm2);
242 wk = tmp[4 * i + 2],
f = tmp[4 * i + 3];
243 tmp[4 * i + 2] = -
f * (numertmz[2] * wk + numertmz[3] * reimtmzm2);
244 tmp[4 * i + 3] = -
f * (numertmz[3] * wk - numertmz[2] * reimtmzm2);
// Last term (index N - 1) processed on its own with the even-index numerator.
// NOTE(review): presumably guarded by an odd-N check that is not visible here.
249 const T wk = tmp[2 *
N - 2],
f = tmp[2 *
N - 1];
250 tmp[2 * (
N - 1) + 0] = -
f * (numertmz[0] * wk + numertmz[1] * reimtmzm2);
251 tmp[2 * (
N - 1) + 1] = -
f * (numertmz[1] * wk - numertmz[0] * reimtmzm2);
// Pass 3: same initial value as in the single-loop variant, then add all
// staged contributions.
253 T sumre = (-
a[0] / znorm) * (numerarr[0] * zre + numerarr[1] * zim);
254 T sumim = (-
a[0] / znorm) * (numerarr[1] * zre - numerarr[0] * zim);
255 for (
unsigned i = 0; i <
N; ++i) {
256 sumre += tmp[2 * i + 0];
257 sumim += tmp[2 * i + 1];
// Here z2 holds -z^2 (both components negated).
265 const T z2im = -T(2) * zre * zim;
266 const T z2re = -(zre + zim) * (zre - zim);
// ez2 starts as -z^2; whatever transforms it before the return below is not
// visible in this listing.
267 T ez2re = z2re, ez2im = z2im;
// Result: 2 * ez2 - i * sum / twosqrtpi
269 return std::complex<T>(T(2) * ez2re + sumim / twosqrtpi,
270 T(2) * ez2im - sumre / twosqrtpi);
// Result: i * sum / twosqrtpi
272 return std::complex<T>(-sumim / twosqrtpi, sumre / twosqrtpi);
// 24 values n * pi for n = 0 .. 23.
// NOTE(review): the declaration line is not visible in this listing; the size
// matches an array of 24 doubles, presumably npi24.
277 0.00000000000000000e+00, 3.14159265358979324e+00, 6.28318530717958648e+00,
278 9.42477796076937972e+00, 1.25663706143591730e+01, 1.57079632679489662e+01,
279 1.88495559215387594e+01, 2.19911485751285527e+01, 2.51327412287183459e+01,
280 2.82743338823081391e+01, 3.14159265358979324e+01, 3.45575191894877256e+01,
281 3.76991118430775189e+01, 4.08407044966673121e+01, 4.39822971502571053e+01,
282 4.71238898038468986e+01, 5.02654824574366918e+01, 5.34070751110264851e+01,
283 5.65486677646162783e+01, 5.96902604182060715e+01, 6.28318530717958648e+01,
284 6.59734457253856580e+01, 6.91150383789754512e+01, 7.22566310325652445e+01,
// 24 positive, monotonically decreasing coefficients.
// NOTE(review): the declaration line is not visible in this listing; the size
// matches an array of 24 doubles, presumably a24 (the `a` argument of the
// N = 24 kernel) - confirm.
287 2.95408975150919338e-01, 2.75840233292177084e-01, 2.24573955224615866e-01,
288 1.59414938273911723e-01, 9.86657664154541891e-02, 5.32441407876394120e-02,
289 2.50521500053936484e-02, 1.02774656705395362e-02, 3.67616433284484706e-03,
290 1.14649364124223317e-03, 3.11757015046197600e-04, 7.39143342960301488e-05,
291 1.52794934280083635e-05, 2.75395660822107093e-06, 4.32785878190124505e-07,
292 5.93003040874588103e-08, 7.08449030774820423e-09, 7.37952063581678038e-10,
293 6.70217160600200763e-11, 5.30726516347079017e-12, 3.66432411346763916e-13,
294 2.20589494494103134e-14, 1.15782686262855879e-15, 5.29871142946730482e-17,
// 288 values = 24 blocks of 6 (re, im) pairs, two values per line.
// NOTE(review): the declaration line is not visible in this listing; the size
// matches a 24 * 12 array, presumably taylorarr24 - confirm. If so, block n is
// what the kernel addresses as &taylorarr[n * NTAYLOR * 2] with NTAYLOR = 6,
// where the first pair is the leading coefficient and the last pair is the
// constant term of the Horner evaluation.
// block 0
299 0.00000000000000000e-00, 3.00901111225470020e-01,
300 5.00000000000000000e-01, 0.00000000000000000e-00,
301 0.00000000000000000e-00, -7.52252778063675049e-01,
302 -1.00000000000000000e-00, 0.00000000000000000e-00,
303 0.00000000000000000e-00, 1.12837916709551257e+00,
304 1.00000000000000000e-00, 0.00000000000000000e-00,
// block 1
306 -2.22423508493755319e-01, 1.87966717746229718e-01,
307 3.41805419240637628e-01, 3.42752593807919263e-01,
308 4.66574321730757753e-01, -5.59649213591058097e-01,
309 -8.05759710273191021e-01, -5.38989366115424093e-01,
310 -4.88914083733395200e-01, 9.80580906465856792e-01,
311 9.33757118080975970e-01, 2.82273885115127769e-01,
// block 2
313 -2.60522586513312894e-01, -4.26259455096092786e-02,
314 1.36549702008863349e-03, 4.39243227763478846e-01,
315 6.50591493715480700e-01, -1.23422352472779046e-01,
316 -3.43379903564271318e-01, -8.13862662890748911e-01,
317 -7.96093943501906645e-01, 6.11271022503935772e-01,
318 7.60213717643090957e-01, 4.93801903948967945e-01,
// block 3
320 -1.18249853727020186e-01, -1.90471659765411376e-01,
321 -2.59044664869706839e-01, 2.69333898502392004e-01,
322 4.99077838344125714e-01, 2.64644800189075006e-01,
323 1.26114512111568737e-01, -7.46519337025968199e-01,
324 -8.47666863706379907e-01, 1.89347715957263646e-01,
325 5.39641485816297176e-01, 5.97805988669631615e-01,
// block 4
327 4.94825297066481491e-02, -1.71428212158876197e-01,
328 -2.97766677111471585e-01, 1.60773286596649656e-02,
329 1.88114210832460682e-01, 4.11734391195006462e-01,
330 3.98540613293909842e-01, -4.63321903522162715e-01,
331 -6.99522070542463639e-01, -1.32412024008354582e-01,
332 3.33997185986131785e-01, 6.01983450812696742e-01,
// block 5
334 1.18367078448232332e-01, -6.09533063579086850e-02,
335 -1.74762998833038991e-01, -1.39098099222000187e-01,
336 -6.71534655984154549e-02, 3.34462251996496680e-01,
337 4.37429678577360024e-01, -1.59613865629038012e-01,
338 -4.71863911886034656e-01, -2.92759316465055762e-01,
339 1.80238737704018306e-01, 5.42834914744283253e-01,
// block 6
341 8.87698096005701290e-02, 2.84339354980994902e-02,
342 -3.18943083830766399e-02, -1.53946887977045862e-01,
343 -1.71825061547624858e-01, 1.70734367410600348e-01,
344 3.33690792296469441e-01, 3.97048587678703930e-02,
345 -2.66422678503135697e-01, -3.18469797424381480e-01,
346 8.48049724711137773e-02, 4.60546329221462864e-01,
// block 7
348 2.99767046276705077e-02, 5.34659695701718247e-02,
349 4.53131030251822568e-02, -9.37915401977138648e-02,
350 -1.57982359988083777e-01, 3.82170507060760740e-02,
351 1.98891589845251706e-01, 1.17546677047049354e-01,
352 -1.27514335237079297e-01, -2.72741112680307074e-01,
353 3.47906344595283767e-02, 3.82277517244493224e-01,
// block 8
355 -7.35922494437203395e-03, 3.72011290318534610e-02,
356 5.66783220847204687e-02, -3.21015398169199501e-02,
357 -1.00308737825172555e-01, -2.57695148077963515e-02,
358 9.67294850588435368e-02, 1.18174625238337507e-01,
359 -5.21266530264988508e-02, -2.08850084114630861e-01,
360 1.24443217440050976e-02, 3.19239968065752286e-01,
// block 9
362 -1.66126772808035320e-02, 1.46180329587665321e-02,
363 3.85927576915247303e-02, 1.18910471133003227e-03,
364 -4.94003498320899806e-02, -3.93468443660139110e-02,
365 3.92113167048952835e-02, 9.03306084789976219e-02,
366 -1.82889636251263500e-02, -1.53816215444915245e-01,
367 3.88103861995563741e-03, 2.72090310854550347e-01,
// block 10
369 -1.21245068916826880e-02, 1.59080224420074489e-03,
370 1.91116222508366035e-02, 1.05879549199053302e-02,
371 -1.97228428219695318e-02, -3.16962067712639397e-02,
372 1.34110372628315158e-02, 6.18045654429108837e-02,
373 -5.52574921865441838e-03, -1.14259663804569455e-01,
374 1.05534036292203489e-03, 2.37326534898818288e-01,
// block 11
376 -5.96835002183177493e-03, -2.42594931567031205e-03,
377 7.44753817476594184e-03, 9.33450807578394386e-03,
378 -6.52649522783026481e-03, -2.08165802069352019e-02,
379 3.89988065678848650e-03, 4.12784313451549132e-02,
380 -1.44110721106127920e-03, -8.76484782997757425e-02,
381 2.50210184908121337e-04, 2.11131066219336647e-01,
// block 12
383 -2.24505212235034193e-03, -2.38114524227619446e-03,
384 2.36375918970809340e-03, 5.97324040603806266e-03,
385 -1.81333819936645381e-03, -1.28126250720444051e-02,
386 9.69251586187208358e-04, 2.83055679874589732e-02,
387 -3.24986363596307374e-04, -6.97056268370209313e-02,
388 5.17231862038123061e-05, 1.90681117197597520e-01,
// block 13
390 -6.76887607549779069e-04, -1.48589685249767064e-03,
391 6.22548369472046953e-04, 3.43871156746448680e-03,
392 -4.26557147166379929e-04, -7.98854145009655400e-03,
393 2.06644460919535524e-04, 2.03107152586353217e-02,
394 -6.34563929410856987e-05, -5.71425144910115832e-02,
395 9.32252179140502456e-06, 1.74167663785025829e-01,
// block 14
397 -1.67596437777156162e-04, -8.05384193869903178e-04,
398 1.37627277777023791e-04, 1.97652692602724093e-03,
399 -8.54392244879459717e-05, -5.23088906415977167e-03,
400 3.78965577556493513e-05, 1.52191559129376333e-02,
401 -1.07393019498185646e-05, -4.79347862153366295e-02,
402 1.46503970628861795e-06, 1.60471011683477685e-01,
// block 15
404 -3.45715760630978778e-05, -4.31089554210205493e-04,
405 2.57350138106549737e-05, 1.19449262097417514e-03,
406 -1.46322227517372253e-05, -3.61303766799909378e-03,
407 5.99057675687392260e-06, 1.17993805017130890e-02,
408 -1.57660578509526722e-06, -4.09165023743669707e-02,
409 2.00739683204152177e-07, 1.48879348585662670e-01,
// block 16
411 -5.99735188857573424e-06, -2.42949218855805052e-04,
412 4.09249090936269722e-06, 7.67400152727128171e-04,
413 -2.14920611287648034e-06, -2.60710519575546230e-03,
414 8.17591694958640978e-07, 9.38581640137393053e-03,
415 -2.00910914042737743e-07, -3.54045580123653803e-02,
416 2.39819738182594508e-08, 1.38916449405613711e-01,
// block 17
418 -8.80708505155966658e-07, -1.46479474515521504e-04,
419 5.55693207391871904e-07, 5.19165587844615415e-04,
420 -2.71391142598826750e-07, -1.94439427580099576e-03,
421 9.64641799864928425e-08, 7.61536975207357980e-03,
422 -2.22357616069432967e-08, -3.09762939485679078e-02,
423 2.49806920458212581e-09, 1.30247401712293206e-01,
// block 18
425 -1.10007111030476390e-07, -9.35886150886691786e-05,
426 6.46244096997824390e-08, 3.65267193418479043e-04,
427 -2.95175785569292542e-08, -1.48730955943961081e-03,
428 9.84949251974795537e-09, 6.27824679148707177e-03,
429 -2.13827217704781576e-09, -2.73545766571797965e-02,
430 2.26877724435352177e-10, 1.22627158810895267e-01,
// block 19
432 -1.17302439957657553e-08, -6.24890956722053332e-05,
433 6.45231881609786173e-09, 2.64799907072561543e-04,
434 -2.76943921343331654e-09, -1.16094187847598385e-03,
435 8.71074689656480749e-10, 5.24514377390761210e-03,
436 -1.78730768958639407e-10, -2.43489203319091538e-02,
437 1.79658223341365988e-11, 1.15870972518909888e-01,
// block 20
439 -1.07084502471985403e-09, -4.31515421260633319e-05,
440 5.54152563270547927e-10, 1.96606443937168357e-04,
441 -2.24423474431542338e-10, -9.21550077887211094e-04,
442 6.67734377376211580e-11, 4.43201203646827019e-03,
443 -1.29896907717633162e-11, -2.18236356404862774e-02,
444 1.24042409733678516e-12, 1.09836276968151848e-01,
// block 21
446 -8.38816525569060600e-11, -3.06091807093959821e-05,
447 4.10033961556230842e-11, 1.48895624771753491e-04,
448 -1.57238128435253905e-11, -7.42073499862065649e-04,
449 4.43938379112418832e-12, 3.78197089773957382e-03,
450 -8.21067867869285873e-13, -1.96793607299577220e-02,
451 7.46725770201828754e-14, 1.04410965521273064e-01,
// block 22
453 -5.64848922712870507e-12, -2.22021942382507691e-05,
454 2.61729537775838587e-12, 1.14683068921649992e-04,
455 -9.53316139085394895e-13, -6.05021573565916914e-04,
456 2.56116039498542220e-13, 3.25530796858307225e-03,
457 -4.51482829896525004e-14, -1.78416955716514289e-02,
458 3.91940313268087086e-15, 9.95054815464739996e-02,
// block 23
460 -3.27482357793897640e-13, -1.64138890390689871e-05,
461 1.44278798346454523e-13, 8.96362542918265398e-05,
462 -5.00524303437266481e-14, -4.98699756861136127e-04,
463 1.28274026095767213e-14, 2.82359118537843949e-03,
464 -2.16009593993917109e-15, -1.62538825704327487e-02,
465 1.79368667683853708e-16, 9.50473084594884184e-02
// 11 values n * pi for n = 0 .. 10.
// NOTE(review): the declaration line is not visible in this listing; the size
// matches an array of 11 doubles, presumably npi11.
469 0.00000000000000000e+00, 3.14159265358979324e+00, 6.28318530717958648e+00,
470 9.42477796076937972e+00, 1.25663706143591730e+01, 1.57079632679489662e+01,
471 1.88495559215387594e+01, 2.19911485751285527e+01, 2.51327412287183459e+01,
472 2.82743338823081391e+01, 3.14159265358979324e+01
// 11 positive, monotonically decreasing coefficients.
// NOTE(review): the declaration line is not visible in this listing; the size
// matches an array of 11 doubles, presumably a11 (the `a` argument of the
// N = 11 kernel) - confirm.
475 4.43113462726379007e-01, 3.79788034073635143e-01, 2.39122407410867584e-01,
476 1.10599187402169792e-01, 3.75782250080904725e-02, 9.37936104296856288e-03,
477 1.71974046186334976e-03, 2.31635559000523461e-04, 2.29192401420125452e-05,
478 1.66589592139340077e-06, 8.89504561311882155e-08
// 66 values = 11 blocks of 3 (re, im) pairs, two values per line.
// NOTE(review): the declaration line is not visible in this listing; the size
// matches an 11 * 6 array, presumably taylorarr11 - confirm. If so, block n is
// what the kernel addresses as &taylorarr[n * NTAYLOR * 2] with NTAYLOR = 3,
// where the first pair is the leading coefficient and the last pair is the
// constant term of the Horner evaluation.
// block 0
483 -1.00000000000000000e+00, 0.00000000000000000e+00,
484 0.00000000000000000e-01, 1.12837916709551257e+00,
485 1.00000000000000000e+00, 0.00000000000000000e+00,
// block 1
487 -5.92741768247463996e-01, -7.19914991991294310e-01,
488 -6.73156763521649944e-01, 8.14025039279059577e-01,
489 8.57089811121701143e-01, 4.00248106586639754e-01,
// block 2
491 1.26114512111568737e-01, -7.46519337025968199e-01,
492 -8.47666863706379907e-01, 1.89347715957263646e-01,
493 5.39641485816297176e-01, 5.97805988669631615e-01,
// block 3
495 4.43238482668529408e-01, -3.03563167310638372e-01,
496 -5.88095866853990048e-01, -2.32638360700858412e-01,
497 2.49595637924601714e-01, 5.77633779156009340e-01,
// block 4
499 3.33690792296469441e-01, 3.97048587678703930e-02,
500 -2.66422678503135697e-01, -3.18469797424381480e-01,
501 8.48049724711137773e-02, 4.60546329221462864e-01,
// block 5
503 1.42043544696751869e-01, 1.24094227867032671e-01,
504 -8.31224229982140323e-02, -2.40766729258442100e-01,
505 2.11669512031059302e-02, 3.48650139549945097e-01,
// block 6
507 3.92113167048952835e-02, 9.03306084789976219e-02,
508 -1.82889636251263500e-02, -1.53816215444915245e-01,
509 3.88103861995563741e-03, 2.72090310854550347e-01,
// block 7
511 7.37741897722738503e-03, 5.04625223970221539e-02,
512 -2.87394336989990770e-03, -9.96122819257496929e-02,
513 5.22745478269428248e-04, 2.23361039070072101e-01,
// block 8
515 9.69251586187208358e-04, 2.83055679874589732e-02,
516 -3.24986363596307374e-04, -6.97056268370209313e-02,
517 5.17231862038123061e-05, 1.90681117197597520e-01,
// block 9
519 9.01625563468897100e-05, 1.74961124275657019e-02,
520 -2.65745127697337342e-05, -5.22070356354932341e-02,
521 3.75952450449939411e-06, 1.67018782142871146e-01,
// block 10
523 5.99057675687392260e-06, 1.17993805017130890e-02,
524 -1.57660578509526722e-06, -4.09165023743669707e-02,
525 2.00739683204152177e-07, 1.48879348585662670e-01
// Delegates to the generic kernel with T = double, N = 24, NTAYLOR = 6 and
// NCF = 9; the argument list continues on lines not visible in this listing.
// NOTE(review): presumably the body of faddeeva() - confirm.
530 return faddeeva_impl::faddeeva_smabmq_impl<double, 24, 6, 9>(
// Delegates to the generic kernel with T = double, N = 11, NTAYLOR = 3 and
// NCF = 3; the argument list continues on lines not visible in this listing.
// NOTE(review): presumably the body of faddeeva_fast() - confirm.
537 return faddeeva_impl::faddeeva_smabmq_impl<double, 11, 3, 3>(
static __roodevice__ const double a24[24]
__roodevice__ __roohost__ std::complex< double > faddeeva(std::complex< double > z)
__roodevice__ __roohost__ std::complex< double > faddeeva_fast(std::complex< double > z)
__roodevice__ const double taylorarr11[11 *6]
static __roodevice__ const double taylorarr24[24 *12]
static __roodevice__ const double npi24[24]
__roodevice__ static __roohost__ void cexp(double &re, double &im)
__roodevice__ const double a11[11]
__roodevice__ const double npi11[11]
__roodevice__ static __roohost__ std::complex< T > faddeeva_smabmq_impl(T zre, T zim, const T tm, const T(&a)[N], const T(&npi)[N], const T(&taylorarr)[N *NTAYLOR *2])