diff options
| author | Even Rouault <even.rouault@spatialys.com> | 2020-03-29 18:43:37 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-03-29 18:43:37 +0200 |
| commit | b84c9d0cb61f3bd561da6092e15e294ae7e410e0 (patch) | |
| tree | 8957109c3555165c26d14c729f8f40c216a07c87 /src | |
| parent | 455e6304d095f870672cff8ae3026168f931dbe3 (diff) | |
| parent | d9b1db0ff166aceb8c74517bcbe056678a048554 (diff) | |
| download | PROJ-b84c9d0cb61f3bd561da6092e15e294ae7e410e0.tar.gz PROJ-b84c9d0cb61f3bd561da6092e15e294ae7e410e0.zip | |
Merge pull request #2039 from rouault/speedup_snyder_tmerc
Approximate tmerc (Snyder): speed optimizations
Diffstat (limited to 'src')
| -rw-r--r-- | src/projections/tmerc.cpp | 98 |
1 files changed, 92 insertions, 6 deletions
diff --git a/src/projections/tmerc.cpp b/src/projections/tmerc.cpp index db0fa908..75a77ccd 100644 --- a/src/projections/tmerc.cpp +++ b/src/projections/tmerc.cpp @@ -56,13 +56,34 @@ struct pj_opaque_exact { /* Constant for "exact" transverse mercator */ #define PROJ_ETMERC_ORDER 6 +// Determine if we should try to provide optimized versions for the Fused Multiply Addition +// Intel instruction set. We use GCC 6 __attribute__((target_clones("fma","default"))) +// mechanism for that, where the compiler builds a default version, and one that +// uses FMA. And at runtimes it figures out automatically which version can be used +// by the current CPU. This allows to create general purpose binaries. +#if defined(__GNUC__) && __GNUC__ >= 6 && defined(__x86_64__) && !defined(__FMA__) +#define BUILD_FMA_OPTIMIZED_VERSION +#endif + /*****************************************************************************/ // // Approximate Transverse Mercator functions // /*****************************************************************************/ -static PJ_XY approx_e_fwd (PJ_LP lp, PJ *P) { + +inline static double inline_pj_mlfn(double phi, double sphi, double cphi, double *en) { + cphi *= sphi; + sphi *= sphi; + return(en[0] * phi - cphi * (en[1] + sphi*(en[2] + + sphi*(en[3] + sphi*en[4])))); +} + +#ifdef BUILD_FMA_OPTIMIZED_VERSION +__attribute__((target_clones("fma","default"))) +#endif +inline static PJ_XY approx_e_fwd_internal (PJ_LP lp, PJ *P) +{ PJ_XY xy = {0.0, 0.0}; struct pj_opaque_approx *Q = static_cast<struct pj_opaque_approx*>(P->opaque); double al, als, n, cosphi, sinphi, t; @@ -94,7 +115,7 @@ static PJ_XY approx_e_fwd (PJ_LP lp, PJ *P) { FC5 * als * (5. + t * (t - 18.) + n * (14. - 58. * t) + FC7 * als * (61. + t * ( t * (179. - t) - 479. ) ) ))); - xy.y = P->k0 * (pj_mlfn(lp.phi, sinphi, cosphi, Q->en) - Q->ml0 + + xy.y = P->k0 * (inline_pj_mlfn(lp.phi, sinphi, cosphi, Q->en) - Q->ml0 + sinphi * al * lp.lam * FC2 * ( 1. + FC4 * als * (5. - t + n * (9. + 4. * n) + FC6 * als * (61. + t * (t - 58.) + n * (270. - 330 * t) @@ -103,6 +124,10 @@ static PJ_XY approx_e_fwd (PJ_LP lp, PJ *P) { return (xy); } +static PJ_XY approx_e_fwd (PJ_LP lp, PJ *P) +{ + return approx_e_fwd_internal(lp, P); +} static PJ_XY approx_s_fwd (PJ_LP lp, PJ *P) { PJ_XY xy = {0.0,0.0}; @@ -148,18 +173,76 @@ static PJ_XY approx_s_fwd (PJ_LP lp, PJ *P) { return xy; } +inline static double +inline_pj_inv_mlfn(projCtx ctx, double arg, double es, double *en, + double* sinphi, double* cosphi) { + double phi, k = 1./(1.-es); + int i; +#define EPS 1e-11 +#define MAX_ITER 10 + phi = arg; + double s = sin(phi); + double c = cos(phi); + for (i = MAX_ITER; i ; --i) { /* rarely goes over 2 iterations */ + double t = 1. - es * s * s; + t = (inline_pj_mlfn(phi, s, c, en) - arg) * (t * sqrt(t)) * k; + phi -= t; + if (fabs(t) < EPS) + { + // Instead of recomputing sin(phi), cos(phi) from scratch, + // use sin(phi-t) and cos(phi-t) approximate formulas with + // 1-term approximation of sin(t) and cos(t) + *sinphi = s - c * t; + *cosphi = c + s * t; + return phi; + } + if (fabs(t) < 1e-3) + { + // 2-term approximation of sin(t) and cos(t) + // Max relative error is 4e-14 on cos(t), and 8e-15 on sin(t) + const double t2 = t * t; + const double cos_t = 1 - 0.5 * t2; + const double sin_t = t * (1 - (1. / 6) * t2); + const double s_new = s * cos_t - c * sin_t; + c = c * cos_t + s * sin_t; + s = s_new; + } + else if (fabs(t) < 1e-2) + { + // 3-term approximation of sin(t) and cos(t) + // Max relative error is 2e-15 on cos(t), and 2e-16 on sin(t) + const double t2 = t * t; + const double cos_t = 1 - 0.5 * t2 * (1 - (1. / 12) * t2); + const double sin_t = t * (1 - (1. / 6) * t2 * (1 - (1. / 20) * t2)); + const double s_new = s * cos_t - c * sin_t; + c = c * cos_t + s * sin_t; + s = s_new; + } + else + { + s = sin(phi); + c = cos(phi); + } + } + *sinphi = s; + *cosphi = c; + pj_ctx_set_errno( ctx, PJD_ERR_NON_CONV_INV_MERI_DIST ); + return phi; +} -static PJ_LP approx_e_inv (PJ_XY xy, PJ *P) { +#ifdef BUILD_FMA_OPTIMIZED_VERSION +__attribute__((target_clones("fma","default"))) +#endif +inline static PJ_LP approx_e_inv_internal (PJ_XY xy, PJ *P) { PJ_LP lp = {0.0,0.0}; struct pj_opaque_approx *Q = static_cast<struct pj_opaque_approx*>(P->opaque); - lp.phi = pj_inv_mlfn(P->ctx, Q->ml0 + xy.y / P->k0, P->es, Q->en); + double sinphi, cosphi; + lp.phi = inline_pj_inv_mlfn(P->ctx, Q->ml0 + xy.y / P->k0, P->es, Q->en, &sinphi, &cosphi); if (fabs(lp.phi) >= M_HALFPI) { lp.phi = xy.y < 0. ? -M_HALFPI : M_HALFPI; lp.lam = 0.; } else { - const double sinphi = sin(lp.phi); - const double cosphi = cos(lp.phi); double t = fabs (cosphi) > 1e-10 ? sinphi/cosphi : 0.; const double n = Q->esp * cosphi * cosphi; double con = 1. - P->es * sinphi * sinphi; @@ -182,6 +265,9 @@ static PJ_LP approx_e_inv (PJ_XY xy, PJ *P) { return lp; } +static PJ_LP approx_e_inv (PJ_XY xy, PJ *P) { + return approx_e_inv_internal(xy, P); +} static PJ_LP approx_s_inv (PJ_XY xy, PJ *P) { PJ_LP lp = {0.0, 0.0}; |
