root/trunk/regress.d

Revision 211, 30.2 kB (checked in by dsimcha, 2 weeks ago)

Fix ranges: Add save() and @property stuff.

Line 
1 /**A module for performing linear regression.  This module has an unusual
2  * interface, as it is range-based instead of matrix based. Values for
3  * independent variables are provided as either a tuple or a range of ranges.
4  * This means that one can use, for example, map, to fit high order models and
5  * lazily evaluate certain values.  (For details, see examples below.)
6  *
7  * Author:  David Simcha*/
8  /*
9  * License:
10  * Boost Software License - Version 1.0 - August 17th, 2003
11  *
12  * Permission is hereby granted, free of charge, to any person or organization
13  * obtaining a copy of the software and accompanying documentation covered by
14  * this license (the "Software") to use, reproduce, display, distribute,
15  * execute, and transmit the Software, and to prepare derivative works of the
16  * Software, and to permit third-parties to whom the Software is furnished to
17  * do so, all subject to the following:
18  *
19  * The copyright notices in the Software and this entire statement, including
20  * the above license grant, this restriction and the following disclaimer,
21  * must be included in all copies of the Software, in whole or in part, and
22  * all derivative works of the Software, unless such copies or derivative
23  * works are solely in the form of machine-executable object code generated by
24  * a source language processor.
25  *
26  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
27  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
28  * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
29  * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
30  * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
31  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
32  * DEALINGS IN THE SOFTWARE.
33  */
34
35 module dstats.regress;
36
37 import std.math, std.algorithm, std.traits, std.array, std.traits, std.exception,
38     std.typetuple, std.typecons;
39
40 import dstats.alloc, std.range, std.conv, dstats.distrib, dstats.cor, dstats.base;
41
42 private void enforceConfidence(double conf) {
43     dstatsEnforce(conf >= 0 && conf <= 1,
44         "Confidence intervals must be between 0 and 1.");
45 }
46
47 ///
48 struct PowMap(ExpType, T)
49 if(isForwardRange!(T)) {
50     Unqual!T range;
51     Unqual!ExpType exponent;
52     double cache;
53
54     this(T range, ExpType exponent) {
55         this.range = range;
56         this.exponent = exponent;
57
58         static if(isIntegral!ExpType) {
59             cache = pow(cast(double) range.front, exponent);
60         } else {
61             cache = pow(cast(ExpType) range.front, exponent);
62         }
63     }
64
65     @property double front() const pure nothrow {
66         return cache;
67     }
68
69     void popFront() {
70         range.popFront;
71         if(!range.empty) {
72             cache = pow(cast(double) range.front, exponent);
73         }
74     }
75
76     @property typeof(this) save() {
77         return this;
78     }
79
80     @property bool empty() {
81         return range.empty;
82     }
83 }
84
85 /**Maps a forward range to a power determined at runtime.  ExpType is the type
86  * of the exponent.  Using an int is faster than using a double, but obviously
87  * less flexible.*/
88 PowMap!(ExpType, T) powMap(ExpType, T)(T range, ExpType exponent) {
89     alias PowMap!(ExpType, T) RT;
90     return RT(range, exponent);
91 }
92
93 // Very ad-hoc, does a bunch of matrix ops.  Written specifically to be
94 // efficient in the context used here.
95 private void rangeMatrixMulTrans(U, T...)
96 (out double[] xTy, out double[][] xTx, U vec, T matIn) {
97     static if(isArray!(T[0]) &&
98         isInputRange!(typeof(matIn[0][0])) && matIn.length == 1) {
99         alias typeof(matIn[0].front()) E;
100         typeof(matIn[0]) mat = tempdup(cast(E[]) matIn[0]);
101         scope(exit) TempAlloc.free;
102     } else {
103         alias matIn mat;
104     }
105
106     bool someEmpty() {
107         if(vec.empty) {
108             return true;
109         }
110         foreach(range; mat) {
111             if(range.empty) {
112                 return true;
113             }
114         }
115         return false;
116     }
117
118     void popAll() {
119         foreach(ti, range; mat) {
120             mat[ti].popFront;
121         }
122         vec.popFront;
123     }
124
125     xTy = newStack!double(mat.length);
126     xTy[] = 0;
127
128     xTx = newStack!(double[])(mat.length);
129     foreach(ref elem; xTx) {
130         elem = newStack!double(mat.length * 2);
131     }
132
133     foreach(row; xTx) {
134         row[] = 0;
135     }
136
137     while(!someEmpty) {
138         foreach(i, elem1; mat) {
139             double e1Front = cast(double) elem1.front;
140             xTy[i] += cast(double) elem1.front * cast(double) vec.front;
141             xTx[i][i] += e1Front * e1Front;
142             foreach(jMinusI, elem2; mat[i + 1..$]) {
143                 immutable j = i + 1 + jMinusI;
144                 double num = e1Front * cast(double) elem2.front;
145                 xTx[i][j] += num;
146                 xTx[j][i] += num;
147             }
148         }
149         popAll;
150     }
151 }
152
153 // Uses Gauss-Jordan elim. w/ row pivoting.  Not that efficient, but for the ad-hoc purposes
154 // it was meant for, it should be good enough.
155 void invert(ref double[][] mat) {
156     // Normalize, augment w/ identity.  The matrix is already the right size
157     // from rangeMatrixMulTrans.
158     foreach(i, row; mat) {
159         double absMax = 1.0L / reduce!(max)(map!(abs)(row[0..mat.length]));
160         row[0..mat.length] *= absMax;
161         row[i + mat.length] = absMax;
162     }
163
164     foreach(col; 0..mat.length) {
165         size_t bestRow;
166         double biggest = 0;
167         foreach(row; col..mat.length) {
168             if(abs(mat[row][col]) > biggest) {
169                 bestRow = row;
170                 biggest = abs(mat[row][col]);
171             }
172         }
173         swap(mat[col], mat[bestRow]);
174
175         foreach(row; 0..mat.length) {
176             if(row == col) {
177                 continue;
178             }
179             double ratio = mat[row][col] / mat[col][col];
180             foreach(i, ref elem; mat[row]) {
181                 elem -= mat[col][i] * ratio;
182             }
183         }
184     }
185
186
187     foreach(i; 0..mat.length) {
188         double diagVal = mat[i][i];
189         mat[i][] /= diagVal;
190     }
191
192     foreach(ref row; mat) {
193         row = row[mat.length..$];
194     }
195 }
196
197 /**Struct that holds the results of a linear regression.  It's a plain old
198  * data struct.*/
199 struct RegressRes {
200     /**The coefficients, one for each range in X.  These will be in the order
201      * that the X ranges were passed in.*/
202     double[] betas;
203
204     /**The standard error terms of the X ranges passed in.*/
205     double[] stdErr;
206
207     /**The lower confidence bounds of the beta terms, at the confidence level
208      * specificied.  (Default 0.95).*/
209     double[] lowerBound;
210
211     /**The upper confidence bounds of the beta terms, at the confidence level
212      * specificied.  (Default 0.95).*/
213     double[] upperBound;
214
215     /**The P-value for the alternative that the corresponding beta value is
216      * different from zero against the null that it is equal to zero.*/
217     double[] p;
218
219     /**The coefficient of determination.*/
220     double R2;
221
222     /**The adjusted coefficient of determination.*/
223     double adjustedR2;
224
225     /**The root mean square of the residuals.*/
226     double residualError;
227
228     /**The P-value for the model as a whole.  Based on an F-statistic.  The
229      * null here is that the model has no predictive value, the alternative
230      * is that it does.*/
231     double overallP;
232
233     // Just used internally.
234     private static string arrStr(T)(T arr) {
235         return text(arr)[1..$ - 1];
236     }
237
238     /**Print out the results in the default format.*/
239     string toString() {
240         return "Betas:  " ~ arrStr(betas) ~ "\nLower Conf. Int.:  " ~
241             arrStr(lowerBound) ~ "\nUpper Conf. Int.:  " ~ arrStr(upperBound) ~
242             "\nStd. Err:  " ~ arrStr(stdErr) ~ "\nP Values:  " ~ arrStr(p) ~
243             "\nR^2:  " ~ text(R2) ~
244             "\nAdjusted R^2:  " ~ text(adjustedR2) ~
245             "\nStd. Residual Error:  " ~ text(residualError)
246             ~ "\nOverall P:  " ~ text(overallP);
247     }
248 }
249
250 /**Struct returned by polyFit.*/
251 struct PolyFitRes(T) {
252
253     /**The array of PowMap ranges created by polyFit.*/
254     T X;
255
256     /**The rest of the results.  This is alias this'd.*/
257     RegressRes regressRes;
258     alias regressRes this;
259 }
260
261 /**Forward Range for holding the residuals from a regression analysis.*/
262 struct Residuals(F, U, T...) {
263     static if(T.length == 1 && isForwardRange!(typeof(T[0].front()))) {
264         alias T[0] R;
265     } else {
266         alias T R;
267     }
268
269     Unqual!U Y;
270     staticMap!(Unqual, R) X;
271     F[] betas;
272     double residual;
273     bool _empty;
274
275     void nextResidual() {
276         double sum = 0;
277         size_t i = 0;
278         foreach(elem; X) {
279             double frnt = elem.front;
280             sum += frnt * betas[i];
281             i++;
282         }
283         residual = Y.front - sum;//sum - Y.front;
284     }
285
286     this(F[] betas, U Y, R X) {
287         static if(is(typeof(X.length))) {
288             dstatsEnforce(X.length == betas.length,
289                 "Betas and X must have same length for residuals.");
290         } else {
291             dstatsEnforce(walkLength(X) == betas.length,
292                 "Betas and X must have same length for residuals.");
293         }
294
295         this.X = X;
296         this.Y = Y;
297         this.betas = betas;
298         if(Y.empty) {
299             _empty = true;
300             return;
301         }
302         foreach(elem; X) {
303             if(elem.empty) {
304                 _empty = true;
305                 return;
306             }
307         }
308         nextResidual;
309     }
310
311     @property double front() const pure nothrow {
312         return residual;
313     }
314
315     void popFront() {
316         Y.popFront;
317         if(Y.empty) {
318             _empty = true;
319             return;
320         }
321         foreach(ti, elem; X) {
322             X[ti].popFront;
323             if(X[ti].empty) {
324                 _empty = true;
325                 return;
326             }
327         }
328         nextResidual;
329     }
330
331     @property bool empty() const pure nothrow {
332         return _empty;
333     }
334
335     @property typeof(this) save() {
336         auto ret = this;
337         ret.Y = ret.Y.save;
338         foreach(ti, xElem; ret.X) {
339             ret.X[ti] = ret.X[ti].save;
340         }
341
342         return ret;
343     }
344 }
345
346 /**Given the beta coefficients from a linear regression, and X and Y values,
347  * returns a range that lazily computes the residuals.
348  */
349 Residuals!(F, U, T) residuals(F, U, T...)(F[] betas, U Y, T X)
350 if(isFloatingPoint!F && isForwardRange!U && allSatisfy!(isForwardRange, T)) {
351     alias Residuals!(F, U, T) RT;
352     return RT(betas, Y, X);
353 }
354
355 /**Perform a linear regression and return just the beta values.  The advantages
356  * to just returning the beta values are that it's faster and that each range
357  * needs to be iterated over only once, and thus can be just an input range.
358  * The beta values are returned such that the smallest index corresponds to
359  * the leftmost element of X.  X can be either a tuple or a range of input
360  * ranges.  Y must be an input range.
361  *
362  * Notes:  The X ranges are traversed in lockstep, but the traversal is stopped
363  * at the end of the shortest one.  Therefore, using infinite ranges is safe.
364  * For example, using repeat(1) to get an intercept term works.
365  *
366  * Examples:
367  * ---
368  * int[] nBeers = [8,6,7,5,3,0,9];
369  * int[] nCoffees = [3,6,2,4,3,6,8];
370  * int[] musicVolume = [3,1,4,1,5,9,2];
371  * int[] programmingSkill = [2,7,1,8,2,8,1];
372  * double[] betas = linearRegressBeta(programmingSkill, repeat(1), nBeers, nCoffees,
373  *     musicVolume, map!"a * a"(musicVolume));
374  * ---
375  */
376 double[] linearRegressBeta(U, T...)(U Y, T XIn)
377 if(allSatisfy!(isInputRange, T) && doubleInput!(U)) {
378     double[] dummy;
379     return linearRegressBetaBuf!(U, T)(dummy, Y, XIn);
380 }
381
382 /**Same as linearRegressBeta, but allows the user to specify a buffer for
383  * the beta terms.  If the buffer is too short, a new one is allocated.
384  * Otherwise, the results are returned in the user-provided buffer.
385  */
386 double[] linearRegressBetaBuf(U, T...)(double[] buf, U Y, T XIn)
387 if(allSatisfy!(isInputRange, T) && doubleInput!(U)) {
388     mixin(newFrame);
389     static if(isArray!(T[0]) && isInputRange!(typeof(XIn[0][0])) &&
390         T.length == 1) {
391         alias typeof(XIn[0].front) E;
392         E[] X = tempdup(XIn[0]);
393     } else {
394         alias XIn X;
395     }
396
397     double[][] xTx;
398     double[] xTy;
399     rangeMatrixMulTrans(xTy, xTx, Y, X);
400     invert(xTx);
401
402     double[] ret;
403     if(buf.length < X.length) {
404         ret = new double[X.length];
405     } else {
406         ret = buf[0..X.length];
407     }
408
409     foreach(i; 0..ret.length) {
410         ret[i] = 0;
411         foreach(j; 0..ret.length) {
412             ret[i] += xTx[i][j] * xTy[j];
413         }
414     }
415     return ret;
416 }
417
418 /**Perform a linear regression as in linearRegressBeta, but return a
419  * RegressRes with useful stuff for statistical inference.  If the last element
420  * of input is a real, this is used to specify the confidence intervals to
421  * be calculated.  Otherwise, the default of 0.95 is used.  The rest of input
422  * should be the elements of X.
423  *
424  * When using this function, which provides several useful statistics useful
425  * for inference, each range must be traversed twice.  This means:
426  *
427  * 1.  They have to be forward ranges, not input ranges.
428  *
429  * 2.  If you have a large amount of data and you're mapping it to some
430  *     expensive function, you may want to do this eagerly instead of lazily.
431  *
432  * Notes:  The X ranges are traversed in lockstep, but the traversal is stopped
433  * at the end of the shortest one.  Therefore, using infinite ranges is safe.
434  * For example, using repeat(1) to get an intercept term works.
435  *
436  * Bugs:  The statistical tests performed in this function assume that an
437  * intercept term is included in your regression model.  If no intercept term
438  * is included, the P-values, confidence intervals and adjusted R^2 values
439  * calculated by this function will be wrong.
440  *
441  * Examples:
442  * ---
443  * int[] nBeers = [8,6,7,5,3,0,9];
444  * int[] nCoffees = [3,6,2,4,3,6,8];
445  * int[] musicVolume = [3,1,4,1,5,9,2];
446  * int[] programmingSkill = [2,7,1,8,2,8,1];
447  *
448  * // Using default confidence interval:
449  * auto results = linearRegress(programmingSkill, repeat(1), nBeers, nCoffees,
450  *     musicVolume, map!"a * a"(musicVolume));
451  *
452  * // Using user-specified confidence interval:
453  * auto results = linearRegress(programmingSkill, repeat(1), nBeers, nCoffees,
454  *     musicVolume, map!"a * a"(musicVolume), 0.8675309);
455  * ---
456  */
457 RegressRes linearRegress(U, TC...)(U Y, TC input) {
458     static if(is(TC[$ - 1] : double)) {
459         double confLvl = input[$ - 1];
460         enforceConfidence(confLvl);
461         alias TC[0..$ - 1] T;
462         alias input[0..$ - 1] XIn;
463     } else {
464         double confLvl = 0.95; // Default;
465         alias TC T;
466         alias input XIn;
467     }
468
469     mixin(newFrame);
470     static if(isForwardRange!(T[0]) && isForwardRange!(typeof(XIn[0].front())) &&
471         T.length == 1) {
472
473         enum bool arrayX = true;
474         alias typeof(XIn[0].front) E;
475         E[] X = tempdup(XIn[0]);
476     } else static if(allSatisfy!(isForwardRange, T)) {
477         enum bool arrayX = false;
478         alias XIn X;
479     } else {
480         static assert(0, "Linear regression can only be performed with " ~
481             "tuples of forward ranges or ranges of forward ranges.");
482     }
483
484     double[][] xTx;
485     double[] xTy;
486
487     typeof(X) xSaved;
488     static if(arrayX) {
489         xSaved = X.tempdup;
490         foreach(ref elem; xSaved) {
491             elem = elem.save;
492         }
493     } else {
494         xSaved = saveAll(X).expand;
495     }
496
497     rangeMatrixMulTrans(xTy, xTx, Y.save, X);
498     invert(xTx);
499     double[] betas = new double[X.length];
500     foreach(i; 0..betas.length) {
501         betas[i] = 0;
502         foreach(j; 0..betas.length) {
503             betas[i] += xTx[i][j] * xTy[j];
504         }
505     }
506
507     auto residuals = residuals(betas, Y, X);
508     double S = 0;
509     ulong n = 0;
510     PearsonCor R2Calc;
511     for(; !residuals.empty; residuals.popFront) {
512         double residual = residuals.front;
513         S += residual * residual;
514         double Yfront = residuals.Y.front;
515         double predicted = Yfront - residual;
516         R2Calc.put(predicted, Yfront);
517         n++;
518     }
519     ulong df =  n - X.length;
520     double R2 = R2Calc.cor();
521     R2 *= R2;
522     double adjustedR2 = 1.0L - (1.0L - R2) * ((n - 1.0L) / df);
523
524     double sigma2 = S / (n - X.length);
525
526     double[] stdErr = new double[betas.length];
527     foreach(i, ref elem; stdErr) {
528         elem = sqrt( S * xTx[i][i] / df);
529     }
530
531     double[] lowerBound = new double[betas.length],
532            upperBound = new double[betas.length],
533            p = new double[betas.length];
534     foreach(i, beta; betas) {
535         p[i] = 2 * min(studentsTCDF(beta / stdErr[i], df),
536                        studentsTCDFR(beta / stdErr[i], df));
537         double delta = invStudentsTCDF(0.5 * (1 - confLvl), df) *
538              stdErr[i];
539         upperBound[i] = beta - delta;
540         lowerBound[i] = beta + delta;
541     }
542
543     double F = (R2 / (X.length - 1)) / ((1 - R2) / (n - X.length));
544     double overallP = fisherCDFR(F, X.length - 1, n - X.length);
545
546     return RegressRes(betas, stdErr, lowerBound, upperBound, p, R2,
547         adjustedR2, sqrt(sigma2), overallP);
548 }
549
550 /**Convenience function that takes a forward range X and a forward range Y,
551  * creates an array of PowMap structs for integer powers from 0 through N,
552  * and calls linearRegressBeta.
553  *
554  * Returns:  An array of doubles.  The index of each element corresponds to
555  * the exponent.  For example, the X<sup>2</sup> term will have an index of
556  * 2.
557  */
558 double[] polyFitBeta(T, U)(U Y, T X, uint N) {
559     double[] dummy;
560     return polyFitBetaBuf!(T, U)(dummy, Y, X, N);
561 }
562
563 /**Same as polyFitBeta, but allows the caller to provide an explicit buffer
564  * to return the coefficients in.  If it's too short, a new one will be
565  * allocated.  Otherwise, results will be returned in the user-provided buffer.
566  */
567 double[] polyFitBetaBuf(T, U)(double[] buf, U Y, T X, uint N) {
568     mixin(newFrame);
569     auto pows = newStack!(PowMap!(uint, T))(N + 1);
570     foreach(exponent; 0..N + 1) {
571         pows[exponent] = powMap(X, exponent);
572     }
573     return linearRegressBetaBuf(buf, Y, pows);
574 }
575
576 /**Convenience function that takes a forward range X and a forward range Y,
577  * creates an array of PowMap structs for integer powers 0 through N,
578  * and calls linearRegress.
579  *
580  * Returns:  A PolyFitRes containing the array of PowMap structs created and
581  * a RegressRes.  The PolyFitRes is alias this'd to the RegressRes.*/
582 PolyFitRes!(PowMap!(uint, T)[])
583 polyFit(T, U)(U Y, T X, uint N, double confInt = 0.95) {
584     enforceConfidence(confInt);
585     auto pows = new PowMap!(uint, T)[N + 1];
586     foreach(exponent; 0..N + 1) {
587         pows[exponent] = powMap(X, exponent);
588     }
589     alias PolyFitRes!(typeof(pows)) RT;
590     RT ret;
591     ret.X = pows;
592     ret.regressRes = linearRegress(Y, pows, confInt);
593     return ret;
594 }
595
596 version(unittest) {
597     import std.stdio;
598     void main(){}
599 }
600
601 unittest {
602     // These are a bunch of values gleaned from various examples on the Web.
603     double[] heights = [1.47,1.5,1.52,1.55,1.57,1.60,1.63,1.65,1.68,1.7,1.73,1.75,
604         1.78,1.8,1.83];
605     double[] weights = [52.21,53.12,54.48,55.84,57.2,58.57,59.93,61.29,63.11,64.47,
606         66.28,68.1,69.92,72.19,74.46];
607     float[] diseaseSev = [1.9,3.1,3.3,4.8,5.3,6.1,6.4,7.6,9.8,12.4];
608     ubyte[] temperature = [2,1,5,5,20,20,23,10,30,25];
609
610     // Values from R.
611     auto res1 = polyFit(diseaseSev, temperature, 1);
612     assert(approxEqual(res1.betas[0], 2.6623));
613     assert(approxEqual(res1.betas[1], 0.2417));
614     assert(approxEqual(res1.stdErr[0], 1.1008));
615     assert(approxEqual(res1.stdErr[1], 0.0635));
616     assert(approxEqual(res1.p[0], 0.0419));
617     assert(approxEqual(res1.p[1], 0.0052));
618     assert(approxEqual(res1.R2, 0.644));
619     assert(approxEqual(res1.adjustedR2, 0.6001));
620     assert(approxEqual(res1.residualError, 2.03));
621     assert(approxEqual(res1.overallP, 0.00518));
622
623
624     auto res2 = polyFit(weights, heights, 2);
625     assert(approxEqual(res2.betas[0], 128.813));
626     assert(approxEqual(res2.betas[1], -143.162));
627     assert(approxEqual(res2.betas[2], 61.960));
628
629     assert(approxEqual(res2.stdErr[0], 16.308));
630     assert(approxEqual(res2.stdErr[1], 19.833));
631     assert(approxEqual(res2.stdErr[2], 6.008));
632
633     assert(approxEqual(res2.p[0], 4.28e-6));
634     assert(approxEqual(res2.p[1], 1.06e-5));
635     assert(approxEqual(res2.p[2], 2.57e-7));
636
637     assert(approxEqual(res2.R2, 0.9989, 0.0001));
638     assert(approxEqual(res2.adjustedR2, 0.9987, 0.0001));
639
640     assert(approxEqual(res2.lowerBound[0], 92.9, 0.01));
641     assert(approxEqual(res2.lowerBound[1], -186.8, 0.01));
642     assert(approxEqual(res2.lowerBound[2], 48.7, 0.01));
643     assert(approxEqual(res2.upperBound[0], 164.7, 0.01));
644     assert(approxEqual(res2.upperBound[1], -99.5, 0.01));
645     assert(approxEqual(res2.upperBound[2], 75.2, 0.01));
646
647     auto res3 = linearRegress(weights, repeat(1), heights, map!"a * a"(heights));
648     assert(res2.betas == res3.betas);
649
650     double[2] beta1Buf;
651     auto beta1 = linearRegressBetaBuf
652         (beta1Buf[], diseaseSev, repeat(1), temperature);
653     assert(beta1Buf.ptr == beta1.ptr);
654     assert(beta1Buf[] == beta1[]);
655     assert(beta1 == res1.betas);
656     auto beta2 = polyFitBeta(weights, heights, 2);
657     assert(beta2 == res2.betas);
658
659     auto res4 = linearRegress(weights, repeat(1), heights);
660     assert(approxEqual(res4.p, 3.604e-14));
661     assert(approxEqual(res4.betas, [-39.062, 61.272]));
662     assert(approxEqual(res4.p, [6.05e-9, 3.60e-14]));
663     assert(approxEqual(res4.R2, 0.9892));
664     assert(approxEqual(res4.adjustedR2, 0.9884));
665     assert(approxEqual(res4.residualError, 0.7591));
666     assert(approxEqual(res4.lowerBound, [-45.40912, 57.43554]));
667     assert(approxEqual(res4.upperBound, [-32.71479, 65.10883]));
668
669     // Test residuals.
670     assert(approxEqual(residuals(res4.betas, weights, repeat(1), heights),
671         [1.20184170, 0.27367611,  0.40823237, -0.06993322,  0.06462305,
672          -0.40354255, -0.88170814,  -0.74715188, -0.76531747, -0.63076120,
673          -0.65892680, -0.06437053, -0.08253613,  0.96202014,  1.39385455]));
674 }
675
676 /**Computes a logistic regression using a maximum likelihood estimator
677  * and returns the beta coefficients.  This is a generalized linear model with
678  * the link function f(XB) = 1 / (1 + exp(XB)). This is generally used to model
679  * the probability that a binary Y variable is 1 given a set of X variables.
680  *
681  * For the purpose of this function, Y variables are interpreted as Booleans,
682  * regardless of their type.  X may be either a range of ranges or a tuple of
683  * ranges.  However, note that unlike in linearRegress, they are copied to an
684  * array if they are not random access ranges.  Note that each value is accessed
685  * several times, so if your range is a map to something expensive, you may
686  * want to evaluate it eagerly.
687  *
688  * Also note that, as in linearRegress, repeat(1) can be used for the intercept
689  * term.
690  *
691  * Returns:  The beta coefficients for the regression model.
692  *
693  * TODO:  Add hypothesis testing stuff and generalize to a parametrizable
694  *        generalized linear model function.
695  *
696  * References:
697  * http://en.wikipedia.org/wiki/Logistic_regression
698  * http://socserv.mcmaster.ca/jfox/Courses/UCLA/logistic-regression-notes.pdf
699  */
700 double[] logisticRegressBeta(T, U...)(T yIn, U xIn) {
701     mixin(newFrame);
702
703     static assert(!isInfinite!T, "Can't do regression with infinite # of Y's.");
704     static if(isRandomAccessRange!T) {
705         alias yIn y;
706     } else {
707         auto y = toBools(yIn);
708     }
709
710     static if(U.length == 1 && isRoR!U) {
711         static if(isForwardRange!U) {
712             auto x = toRandomAccessRoR(y.length, xIn);
713         } else {
714             auto x = toRandomAccessRoR(y.length, tempdup(xIn));
715         }
716     } else {
717         auto x = toRandomAccessTuple(xIn).expand;
718     }
719
720     auto beta = new double[x.length];
721     beta[] = 0;
722
723     doMLE(beta, y, x);
724
725     return beta;
726 }
727
728 unittest {
729     // Values from R.
730     alias approxEqual ae;  // Save typing.
731
732     // Start with the basics, with X as a ror.
733     auto y1 =  [1,   0, 0, 0, 1, 0, 0];
734     auto x1 = [[1.0, 1 ,1 ,1 ,1 ,1 ,1],
735               [8.0, 6, 7, 5, 3, 0, 9]];
736     auto res1 = logisticRegressBeta(y1, x1);
737     assert(ae(res1[0], -0.98273));
738     assert(ae(res1[1], 0.01219));
739
740     // Use tuple.
741     auto y2   = [1,0,1,1,0,1,0,0,0,1,0,1];
742     auto x2_1 = [3,1,4,1,5,9,2,6,5,3,5,8];
743     auto x2_2 = [2,7,1,8,2,8,1,8,2,8,4,5];
744     auto res2 = logisticRegressBeta(y2, repeat(1), x2_1, x2_2);
745     assert(ae(res2[0], -1.1875));
746     assert(ae(res2[1], 0.1021));
747     assert(ae(res2[2], 0.1603));
748
749     auto x2Intercept = [1,1,1,1,1,1,1,1,1,1,1,1];
750     auto res2a = logisticRegressBeta(y2,
751         filter!"a.length"([x2Intercept, x2_1, x2_2]));
752     assert(ae(res2a, res2));
753
754     // Use a huge range of values to test numerical stability.
755
756     // The filter is to make y3 a non-random access range.
757     auto y3 = filter!"a < 2"([1,1,1,1,0,0,0,0]);
758     auto x3_1 = filter!"a > 0"([1, 1e10, 2, 2e10, 3, 3e15, 4, 4e7]);
759     auto x3_2 = [1e8, 1e6, 1e7, 1e5, 1e3, 1e0, 1e9, 1e11];
760     auto x3_3 = [-5e12, 5e2, 6e5, 4e3, -999999, -666, -3e10, -2e10];
761     auto res3 = logisticRegressBeta(y3, repeat(1), x3_1, x3_2, x3_3);
762     assert(ae(res3[0], 1.115e0));
763     assert(ae(res3[1], -4.674e-15));
764     assert(ae(res3[2], -7.026e-9));
765     assert(ae(res3[3], -2.109e-12));
766
767     // Test with a just plain huge dataset that R chokes for several minutes
768     // on.  If you think this unittest is slow, try getting the reference
769     // values from R.
770     auto y4 = chain(
771                 take(cycle([0,0,0,0,1]), 500_000),
772                 take(cycle([1,1,1,1,0]), 500_000));
773     auto x4_1 = iota(0, 1_000_000);
774     auto x4_2 = map!exp(map!"a / 1_000_000.0"(x4_1));
775     auto x4_3 = take(cycle([1,2,3,4,5]), 1_000_000);
776     auto x4_4 = take(cycle([8,6,7,5,3,0,9]), 1_000_000);
777     auto res4 = logisticRegressBeta(y4, repeat(1), x4_1, x4_2, x4_3, x4_4);
778     assert(ae(res4[0], -1.574));
779     assert(ae(res4[1], 5.625e-6));
780     assert(ae(res4[2], -7.282e-1));
781     assert(ae(res4[3], -4.381e-6));
782     assert(ae(res4[4], -8.343e-6));
783 }
784
785 /// The inverse logit function used in logistic regression.
786 double inverseLogit(double xb) pure nothrow {
787     return 1.0 / (1 + exp(-xb));
788 }
789
790 private:
791 double doMLE(T, U...)(double[] beta, T y, U xIn) {
792     // This big, disgusting function uses the Newton-Raphson method as outlined
793     // in http://socserv.mcmaster.ca/jfox/Courses/UCLA/logistic-regression-notes.pdf
794     //
795     // The matrix operations are kind of obfuscated because they're written
796     // using very low-level primitives and with as little temp space as
797     // possible used.
798     static if(isRoR!(U[0]) && U.length == 1) {
799         alias xIn[0] x;
800     } else {
801         alias xIn x;
802     }
803
804     mixin(newFrame);
805     immutable N = y.length;
806
807     auto ps = newStack!double(y.length);
808
809     double[] xRow = newStack!double(beta.length);
810     void evalPs() {
811         foreach(i; 0..N) {
812
813             double prodSum = 0;
814             foreach(j, col; x) {
815                 prodSum += col[i] * beta[j];
816             }
817
818             ps[i] = inverseLogit(prodSum);
819             assert(ps[i] >= 0, text(ps[i]));
820             assert(ps[i] <= 1, text(ps[i]));
821         }
822     }
823
824     double logLikelihood() {
825         double sum = 0;
826         size_t i = 0;
827         foreach(yVal; y) {
828             scope(exit) i++;
829             if(yVal) {
830                 sum -= 2 * log(ps[i]);
831             } else {
832                 sum -= 2 * log(1 - ps[i]);
833             }
834         }
835         return sum;
836     }
837
838
839     enum eps = 1e-6;
840     enum maxIter = 1000;
841
842     auto oldLikelihood = double.infinity;
843
844     auto mat = newStack!(double[])(beta.length);
845     foreach(ref row; mat) {
846         // The *2 is for the augmentations scratch space for inversion.
847         row = newStack!double(beta.length * 2);
848     }
849
850     foreach(iter; 0..maxIter) {
851         evalPs();
852         immutable lh = logLikelihood();
853
854         if(oldLikelihood - lh < eps || isNaN(lh)) {
855             return lh;
856         }
857         oldLikelihood = lh;
858
859         foreach(i; 0..beta.length) {
860             mat[i] = mat[i][0..beta.length];
861             mat[i][] = 0;
862         }
863
864         // Calculate X' * W * X in the notation of our reference.  Since
865         // V is a diagonal matrix of ps[] * (1.0 - ps[]), we only have one
866         // dimension representing it.
867         foreach(i, dummy; x) foreach(j, dummy2; x) {
868             foreach(k; 0..ps.length) {
869                 mat[i][j] += (ps[k] * (1 - ps[k])) * x[i][k] * x[j][k];
870             }
871         }
872
873         foreach(i; 0..mat.length) {
874             // We allocated this augmentation area, but it got sliced away by
875             // invert().  Put it back.
876             mat[i] = mat[i].ptr[0..beta.length * 2];
877             mat[i][beta.length..$] = 0;
878         }
879
880         // Invert the intermediate matrix.
881         invert(mat);
882
883         // Now, multiply the resulting matrix by X' * (y - p).
884         foreach(betaIndex, ref b; beta) {
885             double diff = 0;
886
887             foreach(pIndex, p; ps) {
888                 immutable pDiff = (y[pIndex] != 0) ? (1.0 - p) : -1.0 * p;
889                 double sum = 0;
890                 foreach(betaIndex2, dummy; x) {
891                     diff += mat[betaIndex][betaIndex2] *
892                             x[betaIndex2][pIndex] * pDiff;
893                 }
894             }
895
896             b += diff;
897         }
898
899         debug(print) writeln("Iter:  ", iter);
900     }
901
902     return logLikelihood();
903 }
904
905 template isRoR(T) {
906     static if(!isInputRange!T) {
907         enum isRoR = false;
908     } else {
909         enum isRoR = isInputRange!(typeof(T.init.front()));
910     }
911 }
912
913 template isFloatMat(T) {
914     static if(is(T : const(float[][])) ||
915         is(T : const(real[][])) || is(T : const(double[][]))) {
916         enum isFloatMat = true;
917     } else {
918         enum isFloatMat = false;
919     }
920 }
921
922 template NonRandomToArray(T) {
923     static if(isRandomAccessRange!T) {
924         alias T NonRandomToArray;
925     } else {
926         alias Unqual!(ElementType!(T))[] NonRandomToArray;
927     }
928 }
929
930 bool[] toBools(R)(R range) {
931     return tempdup(map!"(a) ? true : false"(range));
932 }
933
934 auto toRandomAccessRoR(T)(uint len, T ror) {
935     static assert(isRoR!T);
936     alias ElementType!T E;
937     static if(isRandomAccessRange!T && isRandomAccessRange!E) {
938         return ror;
939     } else static if(!isRandomAccessRange!T && isRandomAccessRange!E) {
940         return tempdup(ror);
941     } else {
942         auto ret = newStack!(E[])(walkLength(ror.save));
943
944         foreach(ref col; ret) {
945             scope(exit) ror.popFront();
946             col = newStack!E(len);
947
948             size_t i;
949             foreach(elem; ror.front) {
950                 col[i++] = elem;
951             }
952         }
953
954         return ret;
955     }
956 }
957
958 auto toRandomAccessTuple(T...)(T input) {
959     Tuple!(staticMap!(NonRandomToArray, T)) ret;
960
961     foreach(ti, range; input) {
962         static if(isRandomAccessRange!(typeof(range))) {
963             ret.field[ti] = range;
964         } else {
965             ret.field[ti] = tempdup(range);
966         }
967     }
968
969     return ret;
970 }
Note: See TracBrowser for help on using the browser.