From 0b9a5e0aabda1f2606e4907c5c03e05bd6fe4078 Mon Sep 17 00:00:00 2001 From: Pasukhin Dmitry Date: Thu, 25 Dec 2025 22:56:22 +0000 Subject: [PATCH] Foundation Classes - Optimize PLib polynomial evaluation (#953) - Replaced `memcpy`/`memset` calls with direct initialization loops to enable register allocation for small dimensions (1-15) - Introduced local stack arrays that reduce memory round-trips by writing to output only at the end of computation - Fused derivative and value update loops to minimize redundant memory reads --- src/FoundationClasses/TKMath/PLib/PLib.cxx | 515 ++++++++++----------- 1 file changed, 246 insertions(+), 269 deletions(-) diff --git a/src/FoundationClasses/TKMath/PLib/PLib.cxx b/src/FoundationClasses/TKMath/PLib/PLib.cxx index da9d8df4a1..671d45f556 100644 --- a/src/FoundationClasses/TKMath/PLib/PLib.cxx +++ b/src/FoundationClasses/TKMath/PLib/PLib.cxx @@ -24,6 +24,8 @@ #include #include +#include + // To convert points array into Real .. // ********************************* //================================================================================================= @@ -701,100 +703,240 @@ void PLib::RationalDerivatives(const Standard_Integer DerivativeRequest, //======================================================================= // Auxiliary template functions used for optimized evaluation of polynome -// and its derivatives for smaller dimensions of the polynome +// and its derivatives for smaller dimensions of the polynome. +// Uses local arrays to enable register allocation and avoids memcpy/memset. //======================================================================= namespace { -// recursive template for evaluating value or first derivative +// Maximum dimension for optimized template dispatch +constexpr int THE_MAX_OPT_DIM = 15; + +// Evaluation of only value using local array (register-friendly). +// Writes to output only at the end. template -inline void eval_step1(double* poly, double par, const double* coef) +inline void eval_poly0(double* theResult, const double* theCoeffs, int theDegree, double thePar) { - eval_step1(poly, par, coef); - poly[dim] = poly[dim] * par + coef[dim]; -} + std::array aLocal; + for (int i = 0; i < dim; ++i) + { + aLocal[i] = theCoeffs[i]; + } -// recursion end -template <> -inline void eval_step1<-1>(double*, double, const double*) -{ -} - -// recursive template for evaluating second derivative -template -inline void eval_step2(double* poly, double par, const double* coef) -{ - eval_step2(poly, par, coef); - poly[dim] = poly[dim] * par + coef[dim] * 2.; -} - -// recursion end -template <> -inline void eval_step2<-1>(double*, double, const double*) -{ -} - -// evaluation of only value -template -inline void eval_poly0(double* aRes, const double* theCoeffs, int Degree, double Par) -{ - Standard_Real* aRes0 = aRes; - const Standard_Real* aCoeffs = theCoeffs; - memcpy(aRes0, aCoeffs, sizeof(Standard_Real) * dim); - - for (Standard_Integer aDeg = 0; aDeg < Degree; aDeg++) + const double* aCoeffs = theCoeffs; + for (int aDeg = 0; aDeg < theDegree; ++aDeg) { aCoeffs -= dim; - // Calculating the value of the polynomial - eval_step1(aRes0, Par, aCoeffs); + for (int i = 0; i < dim; ++i) + { + aLocal[i] = aLocal[i] * thePar + aCoeffs[i]; + } + } + + for (int i = 0; i < dim; ++i) + { + theResult[i] = aLocal[i]; } } -// evaluation of value and first derivative +// Evaluation of value and first derivative using local arrays. +// Fuses derivative and value loops to read aLocal0 only once per iteration. template -inline void eval_poly1(double* aRes, const double* theCoeffs, int Degree, double Par) +inline void eval_poly1(double* theResult, const double* theCoeffs, int theDegree, double thePar) { - Standard_Real* aRes0 = aRes; - Standard_Real* aRes1 = aRes + dim; - const Standard_Real* aCoeffs = theCoeffs; + std::array aLocal0; + std::array aLocal1; - memcpy(aRes0, aCoeffs, sizeof(Standard_Real) * dim); - memset(aRes1, 0, sizeof(Standard_Real) * dim); + for (int i = 0; i < dim; ++i) + { + aLocal0[i] = theCoeffs[i]; + aLocal1[i] = 0.0; + } - for (Standard_Integer aDeg = 0; aDeg < Degree; aDeg++) + const double* aCoeffs = theCoeffs; + for (int aDeg = 0; aDeg < theDegree; ++aDeg) { aCoeffs -= dim; - // Calculating derivatives of the polynomial - eval_step1(aRes1, Par, aRes0); - // Calculating the value of the polynomial - eval_step1(aRes0, Par, aCoeffs); + for (int i = 0; i < dim; ++i) + { + const double aVal = aLocal0[i]; + aLocal1[i] = aLocal1[i] * thePar + aVal; + aLocal0[i] = aVal * thePar + aCoeffs[i]; + } + } + + for (int i = 0; i < dim; ++i) + { + theResult[i] = aLocal0[i]; + theResult[dim + i] = aLocal1[i]; } } -// evaluation of value and first and second derivatives +// Evaluation of value and first and second derivatives using local arrays. +// Fuses all three loops for better cache utilization. template -inline void eval_poly2(double* aRes, const double* theCoeffs, int Degree, double Par) +inline void eval_poly2(double* theResult, const double* theCoeffs, int theDegree, double thePar) { - Standard_Real* aRes0 = aRes; - Standard_Real* aRes1 = aRes + dim; - Standard_Real* aRes2 = aRes + 2 * dim; - const Standard_Real* aCoeffs = theCoeffs; + std::array aLocal0; + std::array aLocal1; + std::array aLocal2; - memcpy(aRes0, aCoeffs, sizeof(Standard_Real) * dim); - memset(aRes1, 0, sizeof(Standard_Real) * dim); - memset(aRes2, 0, sizeof(Standard_Real) * dim); + for (int i = 0; i < dim; ++i) + { + aLocal0[i] = theCoeffs[i]; + aLocal1[i] = 0.0; + aLocal2[i] = 0.0; + } - for (Standard_Integer aDeg = 0; aDeg < Degree; aDeg++) + const double* aCoeffs = theCoeffs; + for (int aDeg = 0; aDeg < theDegree; ++aDeg) { aCoeffs -= dim; - // Calculating second derivatives of the polynomial - eval_step2(aRes2, Par, aRes1); - // Calculating derivatives of the polynomial - eval_step1(aRes1, Par, aRes0); - // Calculating the value of the polynomial - eval_step1(aRes0, Par, aCoeffs); + for (int i = 0; i < dim; ++i) + { + const double aD1 = aLocal1[i]; + const double aVal = aLocal0[i]; + aLocal2[i] = aLocal2[i] * thePar + aD1 * 2.0; + aLocal1[i] = aD1 * thePar + aVal; + aLocal0[i] = aVal * thePar + aCoeffs[i]; + } + } + + for (int i = 0; i < dim; ++i) + { + theResult[i] = aLocal0[i]; + theResult[dim + i] = aLocal1[i]; + theResult[2 * dim + i] = aLocal2[i]; } } + +// Runtime evaluation for dimension > THE_MAX_OPT_DIM (value only) +inline void eval_poly0_runtime(double* theResult, + const double* theCoeffs, + int theDegree, + double thePar, + int theDimension) +{ + for (int i = 0; i < theDimension; ++i) + { + theResult[i] = theCoeffs[i]; + } + + const double* aCoeffs = theCoeffs; + for (int aDeg = 0; aDeg < theDegree; ++aDeg) + { + aCoeffs -= theDimension; + for (int i = 0; i < theDimension; ++i) + { + theResult[i] = theResult[i] * thePar + aCoeffs[i]; + } + } +} + +// Runtime evaluation for dimension > THE_MAX_OPT_DIM (value + 1st derivative) +inline void eval_poly1_runtime(double* theResult, + const double* theCoeffs, + int theDegree, + double thePar, + int theDimension) +{ + double* aRes0 = theResult; + double* aRes1 = theResult + theDimension; + + for (int i = 0; i < theDimension; ++i) + { + aRes0[i] = theCoeffs[i]; + aRes1[i] = 0.0; + } + + const double* aCoeffs = theCoeffs; + for (int aDeg = 0; aDeg < theDegree; ++aDeg) + { + aCoeffs -= theDimension; + for (int i = 0; i < theDimension; ++i) + { + const double aVal = aRes0[i]; + aRes1[i] = aRes1[i] * thePar + aVal; + aRes0[i] = aVal * thePar + aCoeffs[i]; + } + } +} + +// Runtime evaluation for dimension > THE_MAX_OPT_DIM (value + 1st + 2nd derivatives) +inline void eval_poly2_runtime(double* theResult, + const double* theCoeffs, + int theDegree, + double thePar, + int theDimension) +{ + double* aRes0 = theResult; + double* aRes1 = theResult + theDimension; + double* aRes2 = theResult + 2 * theDimension; + + for (int i = 0; i < theDimension; ++i) + { + aRes0[i] = theCoeffs[i]; + aRes1[i] = 0.0; + aRes2[i] = 0.0; + } + + const double* aCoeffs = theCoeffs; + for (int aDeg = 0; aDeg < theDegree; ++aDeg) + { + aCoeffs -= theDimension; + for (int i = 0; i < theDimension; ++i) + { + const double aD1 = aRes1[i]; + const double aVal = aRes0[i]; + aRes2[i] = aRes2[i] * thePar + aD1 * 2.0; + aRes1[i] = aD1 * thePar + aVal; + aRes0[i] = aVal * thePar + aCoeffs[i]; + } + } +} + +// Function pointer type for dispatch tables +using EvalPolyFunc = void (*)(double*, const double*, int, double); + +// Helper to generate dispatch tables at compile time +template