aboutsummaryrefslogtreecommitdiff
path: root/src/projections/tmerc.cpp
diff options
context:
space:
mode:
authorEven Rouault <even.rouault@spatialys.com>2020-03-09 22:33:40 +0100
committerEven Rouault <even.rouault@spatialys.com>2020-03-09 22:51:55 +0100
commitd9b1db0ff166aceb8c74517bcbe056678a048554 (patch)
tree87528369f1216d0ed5f2019d1295ff206a664859 /src/projections/tmerc.cpp
parent13782b19974ed8c99a8176692e688512c96e8481 (diff)
downloadPROJ-d9b1db0ff166aceb8c74517bcbe056678a048554.tar.gz
PROJ-d9b1db0ff166aceb8c74517bcbe056678a048554.zip
Approximate tmerc (Snyder): speed optimizations
fwd: 7% faster on Core-i7@2.6GHz (with FMA triggered), 22% faster on GCE Xeon@2GHz (with FMA) inv: 31% faster on Core-i7@2.6GHz (with FMA triggered), 60% faster on GCE Xeon@2GHz (with FMA) The optimizations consists in different things: - optionaly use the FMA (Fused Multiply Addition) instruction set with gcc >= 6. Binaries are generated with the standard instruction set (SSE/SSE2), and with one variant with FMA, and the appropriate version is selected automatically at runtime. This gives a modest speedup, but measurable. The speedup is more obvious on lower clocked CPU. - inline mlfn and inv_mlfn - for inv_mlfn avoid recomputation of sin()/cos() at each iteration stage, by observing that the argument changes in modest way at each iteration, and using approximation of sin()/cos(). The differences due to that approximation are way below the 1e-11 tolerance threshold. Different in results are neglectable (only found in areas where the approximations of the Snyder formulas are already no longer valid) Before: $ echo 8e5 9e6 | src/proj -d 12 +proj=utm +zone=31 -I +approx | src/proj -d 12 +proj=utm +zone=31 +approx 799997.896522093331 8999999.520601103082 $ echo 8e5 5e6 | src/proj -d 12 +proj=utm +zone=31 -I +approx | src/proj -d 12 +proj=utm +zone=31 +approx 800000.000007762224 4999999.999971268699 $ echo 18e5 9e6 | src/proj -d 12 +proj=utm +zone=31 -I +approx | src/proj -d 12 +proj=utm +zone=31 +approx 1079182.990696100984 8661150.574729491025 $ echo 18e5 5e6 | src/proj -d 12 +proj=utm +zone=31 -I +approx | src/proj -d 12 +proj=utm +zone=31 +approx 1799997.510861013783 4999999.567328464240 After: $ echo 8e5 9e6 | src/proj -d 12 +proj=utm +zone=31 -I +approx | src/proj -d 12 +proj=utm +zone=31 +approx 799997.896522093331 8999999.520601103082 $ echo 8e5 5e6 | src/proj -d 12 +proj=utm +zone=31 -I +approx | src/proj -d 12 +proj=utm +zone=31 +approx 800000.000007762224 4999999.999971268699 $ echo 18e5 9e6 | src/proj -d 12 +proj=utm +zone=31 -I +approx | src/proj -d 12 +proj=utm +zone=31 +approx 1079182.990696124267 8661150.574729502201 $ echo 18e5 5e6 | src/proj -d 12 +proj=utm +zone=31 -I +approx | src/proj -d 12 +proj=utm +zone=31 +approx 1799997.510861013783 4999999.567328464240
Diffstat (limited to 'src/projections/tmerc.cpp')
-rw-r--r--src/projections/tmerc.cpp98
1 files changed, 92 insertions, 6 deletions
diff --git a/src/projections/tmerc.cpp b/src/projections/tmerc.cpp
index 4b2a96f0..1414e542 100644
--- a/src/projections/tmerc.cpp
+++ b/src/projections/tmerc.cpp
@@ -56,13 +56,34 @@ struct pj_opaque_exact {
/* Constant for "exact" transverse mercator */
#define PROJ_ETMERC_ORDER 6
+// Determine if we should try to provide optimized versions for the Fused Multiply Addition
+// Intel instruction set. We use GCC 6 __attribute__((target_clones("fma","default")))
+// mechanism for that, where the compiler builds a default version, and one that
+// uses FMA. And at runtimes it figures out automatically which version can be used
+// by the current CPU. This allows to create general purpose binaries.
+#if defined(__GNUC__) && __GNUC__ >= 6 && defined(__x86_64__) && !defined(__FMA__)
+#define BUILD_FMA_OPTIMIZED_VERSION
+#endif
+
/*****************************************************************************/
//
// Approximate Transverse Mercator functions
//
/*****************************************************************************/
-static PJ_XY approx_e_fwd (PJ_LP lp, PJ *P) {
+
+inline static double inline_pj_mlfn(double phi, double sphi, double cphi, double *en) {
+ cphi *= sphi;
+ sphi *= sphi;
+ return(en[0] * phi - cphi * (en[1] + sphi*(en[2]
+ + sphi*(en[3] + sphi*en[4]))));
+}
+
+#ifdef BUILD_FMA_OPTIMIZED_VERSION
+__attribute__((target_clones("fma","default")))
+#endif
+inline static PJ_XY approx_e_fwd_internal (PJ_LP lp, PJ *P)
+{
PJ_XY xy = {0.0, 0.0};
struct pj_opaque_approx *Q = static_cast<struct pj_opaque_approx*>(P->opaque);
double al, als, n, cosphi, sinphi, t;
@@ -94,7 +115,7 @@ static PJ_XY approx_e_fwd (PJ_LP lp, PJ *P) {
FC5 * als * (5. + t * (t - 18.) + n * (14. - 58. * t)
+ FC7 * als * (61. + t * ( t * (179. - t) - 479. ) )
)));
- xy.y = P->k0 * (pj_mlfn(lp.phi, sinphi, cosphi, Q->en) - Q->ml0 +
+ xy.y = P->k0 * (inline_pj_mlfn(lp.phi, sinphi, cosphi, Q->en) - Q->ml0 +
sinphi * al * lp.lam * FC2 * ( 1. +
FC4 * als * (5. - t + n * (9. + 4. * n) +
FC6 * als * (61. + t * (t - 58.) + n * (270. - 330 * t)
@@ -103,6 +124,10 @@ static PJ_XY approx_e_fwd (PJ_LP lp, PJ *P) {
return (xy);
}
+static PJ_XY approx_e_fwd (PJ_LP lp, PJ *P)
+{
+ return approx_e_fwd_internal(lp, P);
+}
static PJ_XY approx_s_fwd (PJ_LP lp, PJ *P) {
PJ_XY xy = {0.0,0.0};
@@ -148,18 +173,76 @@ static PJ_XY approx_s_fwd (PJ_LP lp, PJ *P) {
return xy;
}
+inline static double
+inline_pj_inv_mlfn(projCtx ctx, double arg, double es, double *en,
+ double* sinphi, double* cosphi) {
+ double phi, k = 1./(1.-es);
+ int i;
+#define EPS 1e-11
+#define MAX_ITER 10
+ phi = arg;
+ double s = sin(phi);
+ double c = cos(phi);
+ for (i = MAX_ITER; i ; --i) { /* rarely goes over 2 iterations */
+ double t = 1. - es * s * s;
+ t = (inline_pj_mlfn(phi, s, c, en) - arg) * (t * sqrt(t)) * k;
+ phi -= t;
+ if (fabs(t) < EPS)
+ {
+ // Instead of recomputing sin(phi), cos(phi) from scratch,
+ // use sin(phi-t) and cos(phi-t) approximate formulas with
+ // 1-term approximation of sin(t) and cos(t)
+ *sinphi = s - c * t;
+ *cosphi = c + s * t;
+ return phi;
+ }
+ if (fabs(t) < 1e-3)
+ {
+ // 2-term approximation of sin(t) and cos(t)
+ // Max relative error is 4e-14 on cos(t), and 8e-15 on sin(t)
+ const double t2 = t * t;
+ const double cos_t = 1 - 0.5 * t2;
+ const double sin_t = t * (1 - (1. / 6) * t2);
+ const double s_new = s * cos_t - c * sin_t;
+ c = c * cos_t + s * sin_t;
+ s = s_new;
+ }
+ else if (fabs(t) < 1e-2)
+ {
+ // 3-term approximation of sin(t) and cos(t)
+ // Max relative error is 2e-15 on cos(t), and 2e-16 on sin(t)
+ const double t2 = t * t;
+ const double cos_t = 1 - 0.5 * t2 * (1 - (1. / 12) * t2);
+ const double sin_t = t * (1 - (1. / 6) * t2 * (1 - (1. / 20) * t2));
+ const double s_new = s * cos_t - c * sin_t;
+ c = c * cos_t + s * sin_t;
+ s = s_new;
+ }
+ else
+ {
+ s = sin(phi);
+ c = cos(phi);
+ }
+ }
+ *sinphi = s;
+ *cosphi = c;
+ pj_ctx_set_errno( ctx, PJD_ERR_NON_CONV_INV_MERI_DIST );
+ return phi;
+}
-static PJ_LP approx_e_inv (PJ_XY xy, PJ *P) {
+#ifdef BUILD_FMA_OPTIMIZED_VERSION
+__attribute__((target_clones("fma","default")))
+#endif
+inline static PJ_LP approx_e_inv_internal (PJ_XY xy, PJ *P) {
PJ_LP lp = {0.0,0.0};
struct pj_opaque_approx *Q = static_cast<struct pj_opaque_approx*>(P->opaque);
- lp.phi = pj_inv_mlfn(P->ctx, Q->ml0 + xy.y / P->k0, P->es, Q->en);
+ double sinphi, cosphi;
+ lp.phi = inline_pj_inv_mlfn(P->ctx, Q->ml0 + xy.y / P->k0, P->es, Q->en, &sinphi, &cosphi);
if (fabs(lp.phi) >= M_HALFPI) {
lp.phi = xy.y < 0. ? -M_HALFPI : M_HALFPI;
lp.lam = 0.;
} else {
- const double sinphi = sin(lp.phi);
- const double cosphi = cos(lp.phi);
double t = fabs (cosphi) > 1e-10 ? sinphi/cosphi : 0.;
const double n = Q->esp * cosphi * cosphi;
double con = 1. - P->es * sinphi * sinphi;
@@ -182,6 +265,9 @@ static PJ_LP approx_e_inv (PJ_XY xy, PJ *P) {
return lp;
}
+static PJ_LP approx_e_inv (PJ_XY xy, PJ *P) {
+ return approx_e_inv_internal(xy, P);
+}
static PJ_LP approx_s_inv (PJ_XY xy, PJ *P) {
PJ_LP lp = {0.0, 0.0};