Cの高速かつ効率的な最小二乗適合アルゴリズム？

Question

2つのデータ配列に線形最小二乗フィットを実装しようとしています。時間と振幅です。これまでに知っている唯一の手法は、（y = m * x + b）で可能なすべてのmおよびbポイントをテストし、どの組み合わせがデータに最もよく一致するかを調べて、エラーが最小になるようにすることです。ただし、すべてをテストするため、非常に多くの組み合わせを反復しても役に立たないと思う場合があります。私が知らないプロセスをスピードアップするためのテクニックはありますか？ありがとう。

Jeremiah Willcock · Accepted Answer

最小二乗フィッティングには効率的なアルゴリズムがあります。詳細は Wikipedia を参照してください。アルゴリズムを実装するライブラリもあり、単純な実装よりも効率的です。 GNU Scientific Library は1つの例ですが、より緩やかなライセンスの下にあるものもあります。

Mark Lakata · Answer

このコードを試してください。これはy = mx + bを（x、y）データに適合させます。

linregの引数は

linreg(int n, REAL x[], REAL y[], REAL* b, REAL* m, REAL* r) n = number of data points x,y = arrays of data *b = output intercept *m = output slope *r = output correlation coefficient (can be NULL if you don't want it)

戻り値は成功した場合は0、失敗した場合は！= 0です。

これがコードです

#include "linreg.h" #include <stdlib.h> #include <math.h> /* math functions */ //#define REAL float #define REAL double inline static REAL sqr(REAL x) { return x*x; } int linreg(int n, const REAL x[], const REAL y[], REAL* m, REAL* b, REAL* r){ REAL sumx = 0.0; /* sum of x */ REAL sumx2 = 0.0; /* sum of x**2 */ REAL sumxy = 0.0; /* sum of x * y */ REAL sumy = 0.0; /* sum of y */ REAL sumy2 = 0.0; /* sum of y**2 */ for (int i=0;i<n;i++){ sumx += x[i]; sumx2 += sqr(x[i]); sumxy += x[i] * y[i]; sumy += y[i]; sumy2 += sqr(y[i]); } REAL denom = (n * sumx2 - sqr(sumx)); if (denom == 0) { // singular matrix. can't solve the problem. *m = 0; *b = 0; if (r) *r = 0; return 1; } *m = (n * sumxy - sumx * sumy) / denom; *b = (sumy * sumx2 - sumx * sumxy) / denom; if (r!=NULL) { *r = (sumxy - sumx * sumy / n) / /* compute correlation coeff */ sqrt((sumx2 - sqr(sumx)/n) * (sumy2 - sqr(sumy)/n)); } return 0; }

例

この例はオンラインを実行できます。

int main() { int n = 6; REAL x[6]= {1, 2, 4, 5, 10, 20}; REAL y[6]= {4, 6, 12, 15, 34, 68}; REAL m,b,r; linreg(n,x,y,&m,&b,&r); printf("m=%g b=%g r=%g
",m,b,r); return 0; }

これが出力です

m=3.43651 b=-0.888889 r=0.999192

これは、Excelプロットと線形適合（検証用）です。

すべての値は上記のCコードと正確に一致します（Cコードはrを返し、ExcelはR**2を返します）。

Ziezi · Answer

FromNumerical Recipes：The Art of Scientific Computingin（15.2）Fitting Data to a Straight Line：

線形回帰：

N個のデータポイントのセット（x_私、y_私）直線モデルに：

不確実性を仮定：シグマ_私各yに関連付けられています_私そして、x_私’（従属変数の値）は正確にわかっています。モデルがデータとどの程度一致しているかを測定するには、カイ二乗関数を使用します。この場合は次のようになります。

上記の方程式を最小化して、aとbを決定します。これは、aとbに関する上記の方程式の導関数を見つけ、それらをゼロに等しくして、aとbについて解くことによって行われます。次に、aとbの推定値の推定不確実性を推定します。明らかに、データの測定誤差により、これらのパラメーターの決定に不確実性がもたらされるためです。さらに、モデルに対するデータの適合度を推定する必要があります。この推定がなければ、モデル内のパラメーターaとbがまったく意味を持っているというわずかな兆候はありません。

以下の構造体は、前述の計算を実行します。

_struct Fitab { // Object for fitting a straight line y = a + b*x to a set of // points (xi, yi), with or without available // errors sigma i . Call one of the two constructors to calculate the fit. // The answers are then available as the variables: // a, b, siga, sigb, chi2, and either q or sigdat. int ndata; double a, b, siga, sigb, chi2, q, sigdat; // Answers. vector<double> &x, &y, &sig; // Constructor. Fitab(vector<double> &xx, vector<double> &yy, vector<double> &ssig) : ndata(xx.size()), x(xx), y(yy), sig(ssig), chi2(0.), q(1.), sigdat(0.) { // Given a set of data points x[0..ndata-1], y[0..ndata-1] // with individual standard deviations sig[0..ndata-1], // sets a,b and their respective probable uncertainties // siga and sigb, the chi-square: chi2, and the goodness-of-fit // probability: q Gamma gam; int i; double ss=0., sx=0., sy=0., st2=0., t, wt, sxoss; b=0.0; for (i=0;i < ndata; i++) { // Accumulate sums ... wt = 1.0 / SQR(sig[i]); //...with weights ss += wt; sx += x[i]*wt; sy += y[i]*wt; } sxoss = sx/ss; for (i=0; i < ndata; i++) { t = (x[i]-sxoss) / sig[i]; st2 += t*t; b += t*y[i]/sig[i]; } b /= st2; // Solve for a, b, sigma-a, and simga-b. a = (sy-sx*b) / ss; siga = sqrt((1.0+sx*sx/(ss*st2))/ss); sigb = sqrt(1.0/st2); // Calculate chi2. for (i=0;i<ndata;i++) chi2 += SQR((y[i]-a-b*x[i])/sig[i]); if (ndata>2) q=gam.gammq(0.5*(ndata-2),0.5*chi2); // goodness of fit } // Constructor. Fitab(vector<double> &xx, vector<double> &yy) : ndata(xx.size()), x(xx), y(yy), sig(xx), chi2(0.), q(1.), sigdat(0.) { // As above, but without known errors (sig is not used). // The uncertainties siga and sigb are estimated by assuming // equal errors for all points, and that a straight line is // a good fit. q is returned as 1.0, the normalization of chi2 // is to unit standard deviation on all points, and sigdat // is set to the estimated error of each point. int i; double ss,sx=0.,sy=0.,st2=0.,t,sxoss; b=0.0; // Accumulate sums ... for (i=0; i < ndata; i++) { sx += x[i]; // ...without weights. sy += y[i]; } ss = ndata; sxoss = sx/ss; for (i=0;i < ndata; i++) { t = x[i]-sxoss; st2 += t*t; b += t*y[i]; } b /= st2; // Solve for a, b, sigma-a, and sigma-b. a = (sy-sx*b)/ss; siga=sqrt((1.0+sx*sx/(ss*st2))/ss); sigb=sqrt(1.0/st2); // Calculate chi2. for (i=0;i<ndata;i++) chi2 += SQR(y[i]-a-b*x[i]); if (ndata > 2) sigdat=sqrt(chi2/(ndata-2)); // For unweighted data evaluate typical // sig using chi2, and adjust // the standard deviations. siga *= sigdat; sigb *= sigdat; } }; _

ここで_struct Gamma_：

_struct Gamma : Gauleg18 { // Object for incomplete gamma function. // Gauleg18 provides coefficients for Gauss-Legendre quadrature. static const Int ASWITCH=100; When to switch to quadrature method. static const double EPS; // See end of struct for initializations. static const double FPMIN; double gln; double gammp(const double a, const double x) { // Returns the incomplete gamma function P(a,x) if (x < 0.0 || a <= 0.0) throw("bad args in gammp"); if (x == 0.0) return 0.0; else if ((Int)a >= ASWITCH) return gammpapprox(a,x,1); // Quadrature. else if (x < a+1.0) return gser(a,x); // Use the series representation. else return 1.0-gcf(a,x); // Use the continued fraction representation. } double gammq(const double a, const double x) { // Returns the incomplete gamma function Q(a,x) = 1 - P(a,x) if (x < 0.0 || a <= 0.0) throw("bad args in gammq"); if (x == 0.0) return 1.0; else if ((Int)a >= ASWITCH) return gammpapprox(a,x,0); // Quadrature. else if (x < a+1.0) return 1.0-gser(a,x); // Use the series representation. else return gcf(a,x); // Use the continued fraction representation. } double gser(const Doub a, const Doub x) { // Returns the incomplete gamma function P(a,x) evaluated by its series representation. // Also sets ln (gamma) as gln. User should not call directly. double sum,del,ap; gln=gammln(a); ap=a; del=sum=1.0/a; for (;;) { ++ap; del *= x/ap; sum += del; if (fabs(del) < fabs(sum)*EPS) { return sum*exp(-x+a*log(x)-gln); } } } double gcf(const Doub a, const Doub x) { // Returns the incomplete gamma function Q(a, x) evaluated // by its continued fraction representation. // Also sets ln (gamma) as gln. User should not call directly. int i; double an,b,c,d,del,h; gln=gammln(a); b=x+1.0-a; // Set up for evaluating continued fraction // by modified Lentz’s method with with b0 = 0. c=1.0/FPMIN; d=1.0/b; h=d; for (i=1;;i++) { // Iterate to convergence. an = -i*(i-a); b += 2.0; d=an*d+b; if (fabs(d) < FPMIN) d=FPMIN; c=b+an/c; if (fabs(c) < FPMIN) c=FPMIN; d=1.0/d; del=d*c; h *= del; if (fabs(del-1.0) <= EPS) break; } return exp(-x+a*log(x)-gln)*h; Put factors in front. } double gammpapprox(double a, double x, int psig) { // Incomplete gamma by quadrature. Returns P(a,x) or Q(a, x), // when psig is 1 or 0, respectively. User should not call directly. int j; double xu,t,sum,ans; double a1 = a-1.0, lna1 = log(a1), sqrta1 = sqrt(a1); gln = gammln(a); // Set how far to integrate into the tail: if (x > a1) xu = MAX(a1 + 11.5*sqrta1, x + 6.0*sqrta1); else xu = MAX(0.,MIN(a1 - 7.5*sqrta1, x - 5.0*sqrta1)); sum = 0; for (j=0;j<ngau;j++) { // Gauss-Legendre. t = x + (xu-x)*y[j]; sum += w[j]*exp(-(t-a1)+a1*(log(t)-lna1)); } ans = sum*(xu-x)*exp(a1*(lna1-1.)-gln); return (psig?(ans>0.0? 1.0-ans:-ans):(ans>=0.0? ans:1.0+ans)); } double invgammp(Doub p, Doub a); // Inverse function on x of P(a,x) . }; const Doub Gamma::EPS = numeric_limits<Doub>::epsilon(); const Doub Gamma::FPMIN = numeric_limits<Doub>::min()/EPS _

および_stuct Gauleg18_：

_struct Gauleg18 { // Abscissas and weights for Gauss-Legendre quadrature. static const Int ngau = 18; static const Doub y[18]; static const Doub w[18]; }; const Doub Gauleg18::y[18] = {0.0021695375159141994, 0.011413521097787704,0.027972308950302116,0.051727015600492421, 0.082502225484340941, 0.12007019910960293,0.16415283300752470, 0.21442376986779355, 0.27051082840644336, 0.33199876341447887, 0.39843234186401943, 0.46931971407375483, 0.54413605556657973, 0.62232745288031077, 0.70331500465597174, 0.78649910768313447, 0.87126389619061517, 0.95698180152629142}; const Doub Gauleg18::w[18] = {0.0055657196642445571, 0.012915947284065419,0.020181515297735382,0.027298621498568734, 0.034213810770299537,0.040875750923643261,0.047235083490265582, 0.053244713977759692,0.058860144245324798,0.064039797355015485 0.068745323835736408,0.072941885005653087,0.076598410645870640, 0.079687828912071670,0.082187266704339706,0.084078218979661945, 0.085346685739338721,0.085983275670394821}; _

そして、最後に機能Gamma::invgamp()：

_double Gamma::invgammp(double p, double a) { // Returns x such that P(a,x) = p for an argument p between 0 and 1. int j; double x,err,t,u,pp,lna1,afac,a1=a-1; const double EPS=1.e-8; // Accuracy is the square of EPS. gln=gammln(a); if (a <= 0.) throw("a must be pos in invgammap"); if (p >= 1.) return MAX(100.,a + 100.*sqrt(a)); if (p <= 0.) return 0.0; if (a > 1.) { lna1=log(a1); afac = exp(a1*(lna1-1.)-gln); pp = (p < 0.5)? p : 1. - p; t = sqrt(-2.*log(pp)); x = (2.30753+t*0.27061)/(1.+t*(0.99229+t*0.04481)) - t; if (p < 0.5) x = -x; x = MAX(1.e-3,a*pow(1.-1./(9.*a)-x/(3.*sqrt(a)),3)); } else { t = 1.0 - a*(0.253+a*0.12); and (6.2.9). if (p < t) x = pow(p/t,1./a); else x = 1.-log(1.-(p-t)/(1.-t)); } for (j=0;j<12;j++) { if (x <= 0.0) return 0.0; // x too small to compute accurately. err = gammp(a,x) - p; if (a > 1.) t = afac*exp(-(x-a1)+a1*(log(x)-lna1)); else t = exp(-x+a1*log(x)-gln); u = err/t; // Halley’s method. x -= (t = u/(1.-0.5*MIN(1.,u*((a-1.)/x - 1)))); // Halve old value if x tries to go negative. if (x <= 0.) x = 0.5*(x + t); if (fabs(t) < EPS*x ) break; } return x; } _

Jack · Answer

上記のオリジナルの例は、スロープとオフセットでうまく機能しましたが、corr coefで苦労しました。かっこが想定された優先順位と同じように機能していないのでしょうか？とにかく、他のWebページの助けを借りて、ようやくExcelの線形傾向線に一致する値を取得しました。 Mark Lakataの変数名を使用してコードを共有すると思いました。お役に立てれば。

double slope = ((n * sumxy) - (sumx * sumy )) / denom; double intercept = ((sumy * sumx2) - (sumx * sumxy)) / denom; double term1 = ((n * sumxy) - (sumx * sumy)); double term2 = ((n * sumx2) - (sumx * sumx)); double term3 = ((n * sumy2) - (sumy * sumy)); double term23 = (term2 * term3); double r2 = 1.0; if (fabs(term23) > MIN_DOUBLE) // Define MIN_DOUBLE somewhere as 1e-9 or similar r2 = (term1 * term1) / term23;

Throwback1986 · Answer

この論文のセクション1を見てください。このセクションでは、2D線形回帰を行列乗算の演習として表現します。データが適切に動作している限り、この手法を使用すると、最小二乗法による近似をすばやく作成できます。

データのサイズによっては、行列の乗算を代数的に単純な方程式のセットに減らし、それによってmatmult（）関数を記述する必要をなくすことは価値があります。（事前に注意してください。これは、4つまたは5つを超えるデータポイントに対しては完全に非現実的です！）

Default picture · Answer

私が知る限り、最小二乗を解決する最も速くて最も効率的な方法は、パラメーターベクトルから（勾配）/（2次勾配）を引くことです。（2次勾配=つまり、ヘッセ行列の対角線。）

ここに直感があります：

1つのパラメーターに対して最小二乗を最適化するとします。これは、放物線の頂点を見つけることと同じです。次に、ランダムな初期パラメータの場合、x、損失関数の頂点はxにあります -f (1) /f^（2）。これは-f (1) /f (2) to xは常に導関数fをゼロにします^（1）。

補足：Tensorflowでこれを実装すると、ソリューションはwに現れました -f (1) /f (2) /（重みの数）ですが、それがTensorflowによるものなのか、それとも他の何かによるものなのかはわかりません。

nisennenmondai · Answer

割り当てとして、RMSE損失関数を使用して単純な線形回帰をCでコーディングする必要がありました。プログラムは動的であり、独自の値を入力して独自の損失関数を選択できます。現在のところ、これは二乗平均平方根誤差に限定されています。しかし、最初にここに私が使用したアルゴリズムがあります：

今コード...グラフを表示するにはgnuplotが必要です、Sudo apt install gnuplot

#include <stdio.h> #include <stdlib.h> #include <string.h> #include <math.h> #include <sys/types.h> #define BUFFSIZE 64 #define MAXSIZE 100 static double vector_x[MAXSIZE] = {0}; static double vector_y[MAXSIZE] = {0}; static double vector_predict[MAXSIZE] = {0}; static double max_x; static double max_y; static double mean_x; static double mean_y; static double teta_0_intercept; static double teta_1_grad; static double RMSE; static double r_square; static double prediction; static char intercept[BUFFSIZE]; static char grad[BUFFSIZE]; static char xrange[BUFFSIZE]; static char yrange[BUFFSIZE]; static char lossname_RMSE[BUFFSIZE] = "Simple Linear Regression using RMSE'"; static char cmd_gnu_0[BUFFSIZE] = "set title '"; static char cmd_gnu_1[BUFFSIZE] = "intercept = "; static char cmd_gnu_2[BUFFSIZE] = "grad = "; static char cmd_gnu_3[BUFFSIZE] = "set xrange [0:"; static char cmd_gnu_4[BUFFSIZE] = "set yrange [0:"; static char cmd_gnu_5[BUFFSIZE] = "f(x) = (grad * x) + intercept"; static char cmd_gnu_6[BUFFSIZE] = "plot f(x), 'data.temp' with points pointtype 7"; static char const *commands_gnuplot[] = { cmd_gnu_0, cmd_gnu_1, cmd_gnu_2, cmd_gnu_3, cmd_gnu_4, cmd_gnu_5, cmd_gnu_6, }; static size_t size; static void user_input() { printf("Enter x,y vector size, MAX = 100
"); scanf("%lu", &size); if (size > MAXSIZE) { printf("Wrong input size is too big
"); user_input(); } printf("vector's size is %lu
", size); size_t i; for (i = 0; i < size; i++) { printf("Enter vector_x[%ld] values
", i); scanf("%lf", &vector_x[i]); } for (i = 0; i < size; i++) { printf("Enter vector_y[%ld] values
", i); scanf("%lf", &vector_y[i]); } } static void display_vector() { size_t i; for (i = 0; i < size; i++){ printf("vector_x[%lu] = %lf	", i, vector_x[i]); printf("vector_y[%lu] = %lf
", i, vector_y[i]); } } static void concatenate(char p[], char q[]) { int c; int d; c = 0; while (p[c] != '\0') { c++; } d = 0; while (q[d] != '\0') { p[c] = q[d]; d++; c++; } p[c] = '\0'; } static void compute_mean_x_y() { size_t i; double tmp_x = 0.0; double tmp_y = 0.0; for (i = 0; i < size; i++) { tmp_x += vector_x[i]; tmp_y += vector_y[i]; } mean_x = tmp_x / size; mean_y = tmp_y / size; printf("mean_x = %lf
", mean_x); printf("mean_y = %lf
", mean_y); } static void compute_teta_1_grad() { double numerator = 0.0; double denominator = 0.0; double tmp1 = 0.0; double tmp2 = 0.0; size_t i; for (i = 0; i < size; i++) { numerator += (vector_x[i] - mean_x) * (vector_y[i] - mean_y); } for (i = 0; i < size; i++) { tmp1 = vector_x[i] - mean_x; tmp2 = tmp1 * tmp1; denominator += tmp2; } teta_1_grad = numerator / denominator; printf("teta_1_grad = %lf
", teta_1_grad); } static void compute_teta_0_intercept() { teta_0_intercept = mean_y - (teta_1_grad * mean_x); printf("teta_0_intercept = %lf
", teta_0_intercept); } static void compute_prediction() { size_t i; for (i = 0; i < size; i++) { vector_predict[i] = teta_0_intercept + (teta_1_grad * vector_x[i]); printf("y^[%ld] = %lf
", i, vector_predict[i]); } printf("
"); } static void compute_RMSE() { compute_prediction(); double error = 0; size_t i; for (i = 0; i < size; i++) { error = (vector_predict[i] - vector_y[i]) * (vector_predict[i] - vector_y[i]); printf("error y^[%ld] = %lf
", i, error); RMSE += error; } /* mean */ RMSE = RMSE / size; /* square root mean */ RMSE = sqrt(RMSE); printf("
RMSE = %lf
", RMSE); } static void compute_loss_function() { int input = 0; printf("Which loss function do you want to use?
"); printf(" 1 - RMSE
"); scanf("%d", &input); switch(input) { case 1: concatenate(cmd_gnu_0, lossname_RMSE); compute_RMSE(); printf("
"); break; default: printf("Wrong input try again
"); compute_loss_function(size); } } static void compute_r_square(size_t size) { double num_err = 0.0; double den_err = 0.0; size_t i; for (i = 0; i < size; i++) { num_err += (vector_y[i] - vector_predict[i]) * (vector_y[i] - vector_predict[i]); den_err += (vector_y[i] - mean_y) * (vector_y[i] - mean_y); } r_square = 1 - (num_err/den_err); printf("R_square = %lf
", r_square); } static void compute_predict_for_x() { double x = 0.0; printf("Please enter x value
"); scanf("%lf", &x); prediction = teta_0_intercept + (teta_1_grad * x); printf("y^ if x = %lf -> %lf
",x, prediction); } static void compute_max_x_y() { size_t i; double tmp1= 0.0; double tmp2= 0.0; for (i = 0; i < size; i++) { if (vector_x[i] > tmp1) { tmp1 = vector_x[i]; max_x = vector_x[i]; } if (vector_y[i] > tmp2) { tmp2 = vector_y[i]; max_y = vector_y[i]; } } printf("vector_x max value %lf
", max_x); printf("vector_y max value %lf
", max_y); } static void display_model_line() { sprintf(intercept, "%0.7lf", teta_0_intercept); sprintf(grad, "%0.7lf", teta_1_grad); sprintf(xrange, "%0.7lf", max_x + 1); sprintf(yrange, "%0.7lf", max_y + 1); concatenate(cmd_gnu_1, intercept); concatenate(cmd_gnu_2, grad); concatenate(cmd_gnu_3, xrange); concatenate(cmd_gnu_3, "]"); concatenate(cmd_gnu_4, yrange); concatenate(cmd_gnu_4, "]"); printf("grad = %s
", grad); printf("intercept = %s
", intercept); printf("xrange = %s
", xrange); printf("yrange = %s
", yrange); printf("cmd_gnu_0: %s
", cmd_gnu_0); printf("cmd_gnu_1: %s
", cmd_gnu_1); printf("cmd_gnu_2: %s
", cmd_gnu_2); printf("cmd_gnu_3: %s
", cmd_gnu_3); printf("cmd_gnu_4: %s
", cmd_gnu_4); printf("cmd_gnu_5: %s
", cmd_gnu_5); printf("cmd_gnu_6: %s
", cmd_gnu_6); /* print plot */ FILE *gnuplot_pipe = (FILE*)popen("gnuplot -persistent", "w"); FILE *temp = (FILE*)fopen("data.temp", "w"); /* create data.temp */ size_t i; for (i = 0; i < size; i++) { fprintf(temp, "%f %f 
", vector_x[i], vector_y[i]); } /* display gnuplot */ for (i = 0; i < 7; i++) { fprintf(gnuplot_pipe, "%s 
", commands_gnuplot[i]); } } int main(void) { printf("===========================================
"); printf("INPUT DATA
"); printf("===========================================
"); user_input(); display_vector(); printf("
"); printf("===========================================
"); printf("COMPUTE MEAN X:Y, TETA_1 TETA_0
"); printf("===========================================
"); compute_mean_x_y(); compute_max_x_y(); compute_teta_1_grad(); compute_teta_0_intercept(); printf("
"); printf("===========================================
"); printf("COMPUTE LOSS FUNCTION
"); printf("===========================================
"); compute_loss_function(); printf("===========================================
"); printf("COMPUTE R_square
"); printf("===========================================
"); compute_r_square(size); printf("
"); printf("===========================================
"); printf("COMPUTE y^ according to x
"); printf("===========================================
"); compute_predict_for_x(); printf("
"); printf("===========================================
"); printf("DISPLAY LINEAR REGRESSION
"); printf("===========================================
"); display_model_line(); printf("
"); return 0; }