| 1 |
/**A module for performing linear regression. This module has an unusual |
|---|
| 2 |
* interface, as it is range-based instead of matrix based. Values for |
|---|
| 3 |
* independent variables are provided as either a tuple or a range of ranges. |
|---|
| 4 |
* This means that one can use, for example, map, to fit high order models and |
|---|
| 5 |
* lazily evaluate certain values. (For details, see examples below.) |
|---|
| 6 |
* |
|---|
| 7 |
* Author: David Simcha*/ |
|---|
| 8 |
/* |
|---|
| 9 |
* License: |
|---|
| 10 |
* Boost Software License - Version 1.0 - August 17th, 2003 |
|---|
| 11 |
* |
|---|
| 12 |
* Permission is hereby granted, free of charge, to any person or organization |
|---|
| 13 |
* obtaining a copy of the software and accompanying documentation covered by |
|---|
| 14 |
* this license (the "Software") to use, reproduce, display, distribute, |
|---|
| 15 |
* execute, and transmit the Software, and to prepare derivative works of the |
|---|
| 16 |
* Software, and to permit third-parties to whom the Software is furnished to |
|---|
| 17 |
* do so, all subject to the following: |
|---|
| 18 |
* |
|---|
| 19 |
* The copyright notices in the Software and this entire statement, including |
|---|
| 20 |
* the above license grant, this restriction and the following disclaimer, |
|---|
| 21 |
* must be included in all copies of the Software, in whole or in part, and |
|---|
| 22 |
* all derivative works of the Software, unless such copies or derivative |
|---|
| 23 |
* works are solely in the form of machine-executable object code generated by |
|---|
| 24 |
* a source language processor. |
|---|
| 25 |
* |
|---|
| 26 |
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
|---|
| 27 |
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
|---|
| 28 |
* FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT |
|---|
| 29 |
* SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE |
|---|
| 30 |
* FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, |
|---|
| 31 |
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
|---|
| 32 |
* DEALINGS IN THE SOFTWARE. |
|---|
| 33 |
*/ |
|---|
| 34 |
|
|---|
| 35 |
module dstats.regress; |
|---|
| 36 |
|
|---|
| 37 |
import std.math, std.algorithm, std.traits, std.array, std.traits, std.exception, |
|---|
| 38 |
std.typetuple, std.typecons; |
|---|
| 39 |
|
|---|
| 40 |
import dstats.alloc, std.range, std.conv, dstats.distrib, dstats.cor, dstats.base; |
|---|
| 41 |
|
|---|
| 42 |
private void enforceConfidence(double conf) { |
|---|
| 43 |
dstatsEnforce(conf >= 0 && conf <= 1, |
|---|
| 44 |
"Confidence intervals must be between 0 and 1."); |
|---|
| 45 |
} |
|---|
| 46 |
|
|---|
| 47 |
/// |
|---|
| 48 |
struct PowMap(ExpType, T) |
|---|
| 49 |
if(isForwardRange!(T)) { |
|---|
| 50 |
Unqual!T range; |
|---|
| 51 |
Unqual!ExpType exponent; |
|---|
| 52 |
double cache; |
|---|
| 53 |
|
|---|
| 54 |
this(T range, ExpType exponent) { |
|---|
| 55 |
this.range = range; |
|---|
| 56 |
this.exponent = exponent; |
|---|
| 57 |
|
|---|
| 58 |
static if(isIntegral!ExpType) { |
|---|
| 59 |
cache = pow(cast(double) range.front, exponent); |
|---|
| 60 |
} else { |
|---|
| 61 |
cache = pow(cast(ExpType) range.front, exponent); |
|---|
| 62 |
} |
|---|
| 63 |
} |
|---|
| 64 |
|
|---|
| 65 |
@property double front() const pure nothrow { |
|---|
| 66 |
return cache; |
|---|
| 67 |
} |
|---|
| 68 |
|
|---|
| 69 |
void popFront() { |
|---|
| 70 |
range.popFront; |
|---|
| 71 |
if(!range.empty) { |
|---|
| 72 |
cache = pow(cast(double) range.front, exponent); |
|---|
| 73 |
} |
|---|
| 74 |
} |
|---|
| 75 |
|
|---|
| 76 |
@property typeof(this) save() { |
|---|
| 77 |
return this; |
|---|
| 78 |
} |
|---|
| 79 |
|
|---|
| 80 |
@property bool empty() { |
|---|
| 81 |
return range.empty; |
|---|
| 82 |
} |
|---|
| 83 |
} |
|---|
| 84 |
|
|---|
| 85 |
/**Maps a forward range to a power determined at runtime. ExpType is the type |
|---|
| 86 |
* of the exponent. Using an int is faster than using a double, but obviously |
|---|
| 87 |
* less flexible.*/ |
|---|
| 88 |
PowMap!(ExpType, T) powMap(ExpType, T)(T range, ExpType exponent) { |
|---|
| 89 |
alias PowMap!(ExpType, T) RT; |
|---|
| 90 |
return RT(range, exponent); |
|---|
| 91 |
} |
|---|
| 92 |
|
|---|
| 93 |
// Very ad-hoc, does a bunch of matrix ops. Written specifically to be |
|---|
| 94 |
// efficient in the context used here. |
|---|
| 95 |
private void rangeMatrixMulTrans(U, T...) |
|---|
| 96 |
(out double[] xTy, out double[][] xTx, U vec, T matIn) { |
|---|
| 97 |
static if(isArray!(T[0]) && |
|---|
| 98 |
isInputRange!(typeof(matIn[0][0])) && matIn.length == 1) { |
|---|
| 99 |
alias typeof(matIn[0].front()) E; |
|---|
| 100 |
typeof(matIn[0]) mat = tempdup(cast(E[]) matIn[0]); |
|---|
| 101 |
scope(exit) TempAlloc.free; |
|---|
| 102 |
} else { |
|---|
| 103 |
alias matIn mat; |
|---|
| 104 |
} |
|---|
| 105 |
|
|---|
| 106 |
bool someEmpty() { |
|---|
| 107 |
if(vec.empty) { |
|---|
| 108 |
return true; |
|---|
| 109 |
} |
|---|
| 110 |
foreach(range; mat) { |
|---|
| 111 |
if(range.empty) { |
|---|
| 112 |
return true; |
|---|
| 113 |
} |
|---|
| 114 |
} |
|---|
| 115 |
return false; |
|---|
| 116 |
} |
|---|
| 117 |
|
|---|
| 118 |
void popAll() { |
|---|
| 119 |
foreach(ti, range; mat) { |
|---|
| 120 |
mat[ti].popFront; |
|---|
| 121 |
} |
|---|
| 122 |
vec.popFront; |
|---|
| 123 |
} |
|---|
| 124 |
|
|---|
| 125 |
xTy = newStack!double(mat.length); |
|---|
| 126 |
xTy[] = 0; |
|---|
| 127 |
|
|---|
| 128 |
xTx = newStack!(double[])(mat.length); |
|---|
| 129 |
foreach(ref elem; xTx) { |
|---|
| 130 |
elem = newStack!double(mat.length * 2); |
|---|
| 131 |
} |
|---|
| 132 |
|
|---|
| 133 |
foreach(row; xTx) { |
|---|
| 134 |
row[] = 0; |
|---|
| 135 |
} |
|---|
| 136 |
|
|---|
| 137 |
while(!someEmpty) { |
|---|
| 138 |
foreach(i, elem1; mat) { |
|---|
| 139 |
double e1Front = cast(double) elem1.front; |
|---|
| 140 |
xTy[i] += cast(double) elem1.front * cast(double) vec.front; |
|---|
| 141 |
xTx[i][i] += e1Front * e1Front; |
|---|
| 142 |
foreach(jMinusI, elem2; mat[i + 1..$]) { |
|---|
| 143 |
immutable j = i + 1 + jMinusI; |
|---|
| 144 |
double num = e1Front * cast(double) elem2.front; |
|---|
| 145 |
xTx[i][j] += num; |
|---|
| 146 |
xTx[j][i] += num; |
|---|
| 147 |
} |
|---|
| 148 |
} |
|---|
| 149 |
popAll; |
|---|
| 150 |
} |
|---|
| 151 |
} |
|---|
| 152 |
|
|---|
| 153 |
// Uses Gauss-Jordan elim. w/ row pivoting. Not that efficient, but for the ad-hoc purposes |
|---|
| 154 |
// it was meant for, it should be good enough. |
|---|
| 155 |
void invert(ref double[][] mat) { |
|---|
| 156 |
// Normalize, augment w/ identity. The matrix is already the right size |
|---|
| 157 |
// from rangeMatrixMulTrans. |
|---|
| 158 |
foreach(i, row; mat) { |
|---|
| 159 |
double absMax = 1.0L / reduce!(max)(map!(abs)(row[0..mat.length])); |
|---|
| 160 |
row[0..mat.length] *= absMax; |
|---|
| 161 |
row[i + mat.length] = absMax; |
|---|
| 162 |
} |
|---|
| 163 |
|
|---|
| 164 |
foreach(col; 0..mat.length) { |
|---|
| 165 |
size_t bestRow; |
|---|
| 166 |
double biggest = 0; |
|---|
| 167 |
foreach(row; col..mat.length) { |
|---|
| 168 |
if(abs(mat[row][col]) > biggest) { |
|---|
| 169 |
bestRow = row; |
|---|
| 170 |
biggest = abs(mat[row][col]); |
|---|
| 171 |
} |
|---|
| 172 |
} |
|---|
| 173 |
swap(mat[col], mat[bestRow]); |
|---|
| 174 |
|
|---|
| 175 |
foreach(row; 0..mat.length) { |
|---|
| 176 |
if(row == col) { |
|---|
| 177 |
continue; |
|---|
| 178 |
} |
|---|
| 179 |
double ratio = mat[row][col] / mat[col][col]; |
|---|
| 180 |
foreach(i, ref elem; mat[row]) { |
|---|
| 181 |
elem -= mat[col][i] * ratio; |
|---|
| 182 |
} |
|---|
| 183 |
} |
|---|
| 184 |
} |
|---|
| 185 |
|
|---|
| 186 |
|
|---|
| 187 |
foreach(i; 0..mat.length) { |
|---|
| 188 |
double diagVal = mat[i][i]; |
|---|
| 189 |
mat[i][] /= diagVal; |
|---|
| 190 |
} |
|---|
| 191 |
|
|---|
| 192 |
foreach(ref row; mat) { |
|---|
| 193 |
row = row[mat.length..$]; |
|---|
| 194 |
} |
|---|
| 195 |
} |
|---|
| 196 |
|
|---|
| 197 |
/**Struct that holds the results of a linear regression. It's a plain old |
|---|
| 198 |
* data struct.*/ |
|---|
| 199 |
struct RegressRes { |
|---|
| 200 |
/**The coefficients, one for each range in X. These will be in the order |
|---|
| 201 |
* that the X ranges were passed in.*/ |
|---|
| 202 |
double[] betas; |
|---|
| 203 |
|
|---|
| 204 |
/**The standard error terms of the X ranges passed in.*/ |
|---|
| 205 |
double[] stdErr; |
|---|
| 206 |
|
|---|
| 207 |
/**The lower confidence bounds of the beta terms, at the confidence level |
|---|
| 208 |
* specificied. (Default 0.95).*/ |
|---|
| 209 |
double[] lowerBound; |
|---|
| 210 |
|
|---|
| 211 |
/**The upper confidence bounds of the beta terms, at the confidence level |
|---|
| 212 |
* specificied. (Default 0.95).*/ |
|---|
| 213 |
double[] upperBound; |
|---|
| 214 |
|
|---|
| 215 |
/**The P-value for the alternative that the corresponding beta value is |
|---|
| 216 |
* different from zero against the null that it is equal to zero.*/ |
|---|
| 217 |
double[] p; |
|---|
| 218 |
|
|---|
| 219 |
/**The coefficient of determination.*/ |
|---|
| 220 |
double R2; |
|---|
| 221 |
|
|---|
| 222 |
/**The adjusted coefficient of determination.*/ |
|---|
| 223 |
double adjustedR2; |
|---|
| 224 |
|
|---|
| 225 |
/**The root mean square of the residuals.*/ |
|---|
| 226 |
double residualError; |
|---|
| 227 |
|
|---|
| 228 |
/**The P-value for the model as a whole. Based on an F-statistic. The |
|---|
| 229 |
* null here is that the model has no predictive value, the alternative |
|---|
| 230 |
* is that it does.*/ |
|---|
| 231 |
double overallP; |
|---|
| 232 |
|
|---|
| 233 |
// Just used internally. |
|---|
| 234 |
private static string arrStr(T)(T arr) { |
|---|
| 235 |
return text(arr)[1..$ - 1]; |
|---|
| 236 |
} |
|---|
| 237 |
|
|---|
| 238 |
/**Print out the results in the default format.*/ |
|---|
| 239 |
string toString() { |
|---|
| 240 |
return "Betas: " ~ arrStr(betas) ~ "\nLower Conf. Int.: " ~ |
|---|
| 241 |
arrStr(lowerBound) ~ "\nUpper Conf. Int.: " ~ arrStr(upperBound) ~ |
|---|
| 242 |
"\nStd. Err: " ~ arrStr(stdErr) ~ "\nP Values: " ~ arrStr(p) ~ |
|---|
| 243 |
"\nR^2: " ~ text(R2) ~ |
|---|
| 244 |
"\nAdjusted R^2: " ~ text(adjustedR2) ~ |
|---|
| 245 |
"\nStd. Residual Error: " ~ text(residualError) |
|---|
| 246 |
~ "\nOverall P: " ~ text(overallP); |
|---|
| 247 |
} |
|---|
| 248 |
} |
|---|
| 249 |
|
|---|
| 250 |
/**Struct returned by polyFit.*/ |
|---|
| 251 |
struct PolyFitRes(T) { |
|---|
| 252 |
|
|---|
| 253 |
/**The array of PowMap ranges created by polyFit.*/ |
|---|
| 254 |
T X; |
|---|
| 255 |
|
|---|
| 256 |
/**The rest of the results. This is alias this'd.*/ |
|---|
| 257 |
RegressRes regressRes; |
|---|
| 258 |
alias regressRes this; |
|---|
| 259 |
} |
|---|
| 260 |
|
|---|
| 261 |
/**Forward Range for holding the residuals from a regression analysis.*/ |
|---|
| 262 |
struct Residuals(F, U, T...) { |
|---|
| 263 |
static if(T.length == 1 && isForwardRange!(typeof(T[0].front()))) { |
|---|
| 264 |
alias T[0] R; |
|---|
| 265 |
} else { |
|---|
| 266 |
alias T R; |
|---|
| 267 |
} |
|---|
| 268 |
|
|---|
| 269 |
Unqual!U Y; |
|---|
| 270 |
staticMap!(Unqual, R) X; |
|---|
| 271 |
F[] betas; |
|---|
| 272 |
double residual; |
|---|
| 273 |
bool _empty; |
|---|
| 274 |
|
|---|
| 275 |
void nextResidual() { |
|---|
| 276 |
double sum = 0; |
|---|
| 277 |
size_t i = 0; |
|---|
| 278 |
foreach(elem; X) { |
|---|
| 279 |
double frnt = elem.front; |
|---|
| 280 |
sum += frnt * betas[i]; |
|---|
| 281 |
i++; |
|---|
| 282 |
} |
|---|
| 283 |
residual = Y.front - sum;//sum - Y.front; |
|---|
| 284 |
} |
|---|
| 285 |
|
|---|
| 286 |
this(F[] betas, U Y, R X) { |
|---|
| 287 |
static if(is(typeof(X.length))) { |
|---|
| 288 |
dstatsEnforce(X.length == betas.length, |
|---|
| 289 |
"Betas and X must have same length for residuals."); |
|---|
| 290 |
} else { |
|---|
| 291 |
dstatsEnforce(walkLength(X) == betas.length, |
|---|
| 292 |
"Betas and X must have same length for residuals."); |
|---|
| 293 |
} |
|---|
| 294 |
|
|---|
| 295 |
this.X = X; |
|---|
| 296 |
this.Y = Y; |
|---|
| 297 |
this.betas = betas; |
|---|
| 298 |
if(Y.empty) { |
|---|
| 299 |
_empty = true; |
|---|
| 300 |
return; |
|---|
| 301 |
} |
|---|
| 302 |
foreach(elem; X) { |
|---|
| 303 |
if(elem.empty) { |
|---|
| 304 |
_empty = true; |
|---|
| 305 |
return; |
|---|
| 306 |
} |
|---|
| 307 |
} |
|---|
| 308 |
nextResidual; |
|---|
| 309 |
} |
|---|
| 310 |
|
|---|
| 311 |
@property double front() const pure nothrow { |
|---|
| 312 |
return residual; |
|---|
| 313 |
} |
|---|
| 314 |
|
|---|
| 315 |
void popFront() { |
|---|
| 316 |
Y.popFront; |
|---|
| 317 |
if(Y.empty) { |
|---|
| 318 |
_empty = true; |
|---|
| 319 |
return; |
|---|
| 320 |
} |
|---|
| 321 |
foreach(ti, elem; X) { |
|---|
| 322 |
X[ti].popFront; |
|---|
| 323 |
if(X[ti].empty) { |
|---|
| 324 |
_empty = true; |
|---|
| 325 |
return; |
|---|
| 326 |
} |
|---|
| 327 |
} |
|---|
| 328 |
nextResidual; |
|---|
| 329 |
} |
|---|
| 330 |
|
|---|
| 331 |
@property bool empty() const pure nothrow { |
|---|
| 332 |
return _empty; |
|---|
| 333 |
} |
|---|
| 334 |
|
|---|
| 335 |
@property typeof(this) save() { |
|---|
| 336 |
auto ret = this; |
|---|
| 337 |
ret.Y = ret.Y.save; |
|---|
| 338 |
foreach(ti, xElem; ret.X) { |
|---|
| 339 |
ret.X[ti] = ret.X[ti].save; |
|---|
| 340 |
} |
|---|
| 341 |
|
|---|
| 342 |
return ret; |
|---|
| 343 |
} |
|---|
| 344 |
} |
|---|
| 345 |
|
|---|
| 346 |
/**Given the beta coefficients from a linear regression, and X and Y values, |
|---|
| 347 |
* returns a range that lazily computes the residuals. |
|---|
| 348 |
*/ |
|---|
| 349 |
Residuals!(F, U, T) residuals(F, U, T...)(F[] betas, U Y, T X) |
|---|
| 350 |
if(isFloatingPoint!F && isForwardRange!U && allSatisfy!(isForwardRange, T)) { |
|---|
| 351 |
alias Residuals!(F, U, T) RT; |
|---|
| 352 |
return RT(betas, Y, X); |
|---|
| 353 |
} |
|---|
| 354 |
|
|---|
| 355 |
/**Perform a linear regression and return just the beta values. The advantages |
|---|
| 356 |
* to just returning the beta values are that it's faster and that each range |
|---|
| 357 |
* needs to be iterated over only once, and thus can be just an input range. |
|---|
| 358 |
* The beta values are returned such that the smallest index corresponds to |
|---|
| 359 |
* the leftmost element of X. X can be either a tuple or a range of input |
|---|
| 360 |
* ranges. Y must be an input range. |
|---|
| 361 |
* |
|---|
| 362 |
* Notes: The X ranges are traversed in lockstep, but the traversal is stopped |
|---|
| 363 |
* at the end of the shortest one. Therefore, using infinite ranges is safe. |
|---|
| 364 |
* For example, using repeat(1) to get an intercept term works. |
|---|
| 365 |
* |
|---|
| 366 |
* Examples: |
|---|
| 367 |
* --- |
|---|
| 368 |
* int[] nBeers = [8,6,7,5,3,0,9]; |
|---|
| 369 |
* int[] nCoffees = [3,6,2,4,3,6,8]; |
|---|
| 370 |
* int[] musicVolume = [3,1,4,1,5,9,2]; |
|---|
| 371 |
* int[] programmingSkill = [2,7,1,8,2,8,1]; |
|---|
| 372 |
* double[] betas = linearRegressBeta(programmingSkill, repeat(1), nBeers, nCoffees, |
|---|
| 373 |
* musicVolume, map!"a * a"(musicVolume)); |
|---|
| 374 |
* --- |
|---|
| 375 |
*/ |
|---|
| 376 |
double[] linearRegressBeta(U, T...)(U Y, T XIn) |
|---|
| 377 |
if(allSatisfy!(isInputRange, T) && doubleInput!(U)) { |
|---|
| 378 |
double[] dummy; |
|---|
| 379 |
return linearRegressBetaBuf!(U, T)(dummy, Y, XIn); |
|---|
| 380 |
} |
|---|
| 381 |
|
|---|
| 382 |
/**Same as linearRegressBeta, but allows the user to specify a buffer for |
|---|
| 383 |
* the beta terms. If the buffer is too short, a new one is allocated. |
|---|
| 384 |
* Otherwise, the results are returned in the user-provided buffer. |
|---|
| 385 |
*/ |
|---|
| 386 |
double[] linearRegressBetaBuf(U, T...)(double[] buf, U Y, T XIn) |
|---|
| 387 |
if(allSatisfy!(isInputRange, T) && doubleInput!(U)) { |
|---|
| 388 |
mixin(newFrame); |
|---|
| 389 |
static if(isArray!(T[0]) && isInputRange!(typeof(XIn[0][0])) && |
|---|
| 390 |
T.length == 1) { |
|---|
| 391 |
alias typeof(XIn[0].front) E; |
|---|
| 392 |
E[] X = tempdup(XIn[0]); |
|---|
| 393 |
} else { |
|---|
| 394 |
alias XIn X; |
|---|
| 395 |
} |
|---|
| 396 |
|
|---|
| 397 |
double[][] xTx; |
|---|
| 398 |
double[] xTy; |
|---|
| 399 |
rangeMatrixMulTrans(xTy, xTx, Y, X); |
|---|
| 400 |
invert(xTx); |
|---|
| 401 |
|
|---|
| 402 |
double[] ret; |
|---|
| 403 |
if(buf.length < X.length) { |
|---|
| 404 |
ret = new double[X.length]; |
|---|
| 405 |
} else { |
|---|
| 406 |
ret = buf[0..X.length]; |
|---|
| 407 |
} |
|---|
| 408 |
|
|---|
| 409 |
foreach(i; 0..ret.length) { |
|---|
| 410 |
ret[i] = 0; |
|---|
| 411 |
foreach(j; 0..ret.length) { |
|---|
| 412 |
ret[i] += xTx[i][j] * xTy[j]; |
|---|
| 413 |
} |
|---|
| 414 |
} |
|---|
| 415 |
return ret; |
|---|
| 416 |
} |
|---|
| 417 |
|
|---|
| 418 |
/**Perform a linear regression as in linearRegressBeta, but return a |
|---|
| 419 |
* RegressRes with useful stuff for statistical inference. If the last element |
|---|
| 420 |
* of input is a real, this is used to specify the confidence intervals to |
|---|
| 421 |
* be calculated. Otherwise, the default of 0.95 is used. The rest of input |
|---|
| 422 |
* should be the elements of X. |
|---|
| 423 |
* |
|---|
| 424 |
* When using this function, which provides several useful statistics useful |
|---|
| 425 |
* for inference, each range must be traversed twice. This means: |
|---|
| 426 |
* |
|---|
| 427 |
* 1. They have to be forward ranges, not input ranges. |
|---|
| 428 |
* |
|---|
| 429 |
* 2. If you have a large amount of data and you're mapping it to some |
|---|
| 430 |
* expensive function, you may want to do this eagerly instead of lazily. |
|---|
| 431 |
* |
|---|
| 432 |
* Notes: The X ranges are traversed in lockstep, but the traversal is stopped |
|---|
| 433 |
* at the end of the shortest one. Therefore, using infinite ranges is safe. |
|---|
| 434 |
* For example, using repeat(1) to get an intercept term works. |
|---|
| 435 |
* |
|---|
| 436 |
* Bugs: The statistical tests performed in this function assume that an |
|---|
| 437 |
* intercept term is included in your regression model. If no intercept term |
|---|
| 438 |
* is included, the P-values, confidence intervals and adjusted R^2 values |
|---|
| 439 |
* calculated by this function will be wrong. |
|---|
| 440 |
* |
|---|
| 441 |
* Examples: |
|---|
| 442 |
* --- |
|---|
| 443 |
* int[] nBeers = [8,6,7,5,3,0,9]; |
|---|
| 444 |
* int[] nCoffees = [3,6,2,4,3,6,8]; |
|---|
| 445 |
* int[] musicVolume = [3,1,4,1,5,9,2]; |
|---|
| 446 |
* int[] programmingSkill = [2,7,1,8,2,8,1]; |
|---|
| 447 |
* |
|---|
| 448 |
* // Using default confidence interval: |
|---|
| 449 |
* auto results = linearRegress(programmingSkill, repeat(1), nBeers, nCoffees, |
|---|
| 450 |
* musicVolume, map!"a * a"(musicVolume)); |
|---|
| 451 |
* |
|---|
| 452 |
* // Using user-specified confidence interval: |
|---|
| 453 |
* auto results = linearRegress(programmingSkill, repeat(1), nBeers, nCoffees, |
|---|
| 454 |
* musicVolume, map!"a * a"(musicVolume), 0.8675309); |
|---|
| 455 |
* --- |
|---|
| 456 |
*/ |
|---|
| 457 |
RegressRes linearRegress(U, TC...)(U Y, TC input) { |
|---|
| 458 |
static if(is(TC[$ - 1] : double)) { |
|---|
| 459 |
double confLvl = input[$ - 1]; |
|---|
| 460 |
enforceConfidence(confLvl); |
|---|
| 461 |
alias TC[0..$ - 1] T; |
|---|
| 462 |
alias input[0..$ - 1] XIn; |
|---|
| 463 |
} else { |
|---|
| 464 |
double confLvl = 0.95; // Default; |
|---|
| 465 |
alias TC T; |
|---|
| 466 |
alias input XIn; |
|---|
| 467 |
} |
|---|
| 468 |
|
|---|
| 469 |
mixin(newFrame); |
|---|
| 470 |
static if(isForwardRange!(T[0]) && isForwardRange!(typeof(XIn[0].front())) && |
|---|
| 471 |
T.length == 1) { |
|---|
| 472 |
|
|---|
| 473 |
enum bool arrayX = true; |
|---|
| 474 |
alias typeof(XIn[0].front) E; |
|---|
| 475 |
E[] X = tempdup(XIn[0]); |
|---|
| 476 |
} else static if(allSatisfy!(isForwardRange, T)) { |
|---|
| 477 |
enum bool arrayX = false; |
|---|
| 478 |
alias XIn X; |
|---|
| 479 |
} else { |
|---|
| 480 |
static assert(0, "Linear regression can only be performed with " ~ |
|---|
| 481 |
"tuples of forward ranges or ranges of forward ranges."); |
|---|
| 482 |
} |
|---|
| 483 |
|
|---|
| 484 |
double[][] xTx; |
|---|
| 485 |
double[] xTy; |
|---|
| 486 |
|
|---|
| 487 |
typeof(X) xSaved; |
|---|
| 488 |
static if(arrayX) { |
|---|
| 489 |
xSaved = X.tempdup; |
|---|
| 490 |
foreach(ref elem; xSaved) { |
|---|
| 491 |
elem = elem.save; |
|---|
| 492 |
} |
|---|
| 493 |
} else { |
|---|
| 494 |
xSaved = saveAll(X).expand; |
|---|
| 495 |
} |
|---|
| 496 |
|
|---|
| 497 |
rangeMatrixMulTrans(xTy, xTx, Y.save, X); |
|---|
| 498 |
invert(xTx); |
|---|
| 499 |
double[] betas = new double[X.length]; |
|---|
| 500 |
foreach(i; 0..betas.length) { |
|---|
| 501 |
betas[i] = 0; |
|---|
| 502 |
foreach(j; 0..betas.length) { |
|---|
| 503 |
betas[i] += xTx[i][j] * xTy[j]; |
|---|
| 504 |
} |
|---|
| 505 |
} |
|---|
| 506 |
|
|---|
| 507 |
auto residuals = residuals(betas, Y, X); |
|---|
| 508 |
double S = 0; |
|---|
| 509 |
ulong n = 0; |
|---|
| 510 |
PearsonCor R2Calc; |
|---|
| 511 |
for(; !residuals.empty; residuals.popFront) { |
|---|
| 512 |
double residual = residuals.front; |
|---|
| 513 |
S += residual * residual; |
|---|
| 514 |
double Yfront = residuals.Y.front; |
|---|
| 515 |
double predicted = Yfront - residual; |
|---|
| 516 |
R2Calc.put(predicted, Yfront); |
|---|
| 517 |
n++; |
|---|
| 518 |
} |
|---|
| 519 |
ulong df = n - X.length; |
|---|
| 520 |
double R2 = R2Calc.cor(); |
|---|
| 521 |
R2 *= R2; |
|---|
| 522 |
double adjustedR2 = 1.0L - (1.0L - R2) * ((n - 1.0L) / df); |
|---|
| 523 |
|
|---|
| 524 |
double sigma2 = S / (n - X.length); |
|---|
| 525 |
|
|---|
| 526 |
double[] stdErr = new double[betas.length]; |
|---|
| 527 |
foreach(i, ref elem; stdErr) { |
|---|
| 528 |
elem = sqrt( S * xTx[i][i] / df); |
|---|
| 529 |
} |
|---|
| 530 |
|
|---|
| 531 |
double[] lowerBound = new double[betas.length], |
|---|
| 532 |
upperBound = new double[betas.length], |
|---|
| 533 |
p = new double[betas.length]; |
|---|
| 534 |
foreach(i, beta; betas) { |
|---|
| 535 |
p[i] = 2 * min(studentsTCDF(beta / stdErr[i], df), |
|---|
| 536 |
studentsTCDFR(beta / stdErr[i], df)); |
|---|
| 537 |
double delta = invStudentsTCDF(0.5 * (1 - confLvl), df) * |
|---|
| 538 |
stdErr[i]; |
|---|
| 539 |
upperBound[i] = beta - delta; |
|---|
| 540 |
lowerBound[i] = beta + delta; |
|---|
| 541 |
} |
|---|
| 542 |
|
|---|
| 543 |
double F = (R2 / (X.length - 1)) / ((1 - R2) / (n - X.length)); |
|---|
| 544 |
double overallP = fisherCDFR(F, X.length - 1, n - X.length); |
|---|
| 545 |
|
|---|
| 546 |
return RegressRes(betas, stdErr, lowerBound, upperBound, p, R2, |
|---|
| 547 |
adjustedR2, sqrt(sigma2), overallP); |
|---|
| 548 |
} |
|---|
| 549 |
|
|---|
| 550 |
/**Convenience function that takes a forward range X and a forward range Y, |
|---|
| 551 |
* creates an array of PowMap structs for integer powers from 0 through N, |
|---|
| 552 |
* and calls linearRegressBeta. |
|---|
| 553 |
* |
|---|
| 554 |
* Returns: An array of doubles. The index of each element corresponds to |
|---|
| 555 |
* the exponent. For example, the X<sup>2</sup> term will have an index of |
|---|
| 556 |
* 2. |
|---|
| 557 |
*/ |
|---|
| 558 |
double[] polyFitBeta(T, U)(U Y, T X, uint N) { |
|---|
| 559 |
double[] dummy; |
|---|
| 560 |
return polyFitBetaBuf!(T, U)(dummy, Y, X, N); |
|---|
| 561 |
} |
|---|
| 562 |
|
|---|
| 563 |
/**Same as polyFitBeta, but allows the caller to provide an explicit buffer |
|---|
| 564 |
* to return the coefficients in. If it's too short, a new one will be |
|---|
| 565 |
* allocated. Otherwise, results will be returned in the user-provided buffer. |
|---|
| 566 |
*/ |
|---|
| 567 |
double[] polyFitBetaBuf(T, U)(double[] buf, U Y, T X, uint N) { |
|---|
| 568 |
mixin(newFrame); |
|---|
| 569 |
auto pows = newStack!(PowMap!(uint, T))(N + 1); |
|---|
| 570 |
foreach(exponent; 0..N + 1) { |
|---|
| 571 |
pows[exponent] = powMap(X, exponent); |
|---|
| 572 |
} |
|---|
| 573 |
return linearRegressBetaBuf(buf, Y, pows); |
|---|
| 574 |
} |
|---|
| 575 |
|
|---|
| 576 |
/**Convenience function that takes a forward range X and a forward range Y, |
|---|
| 577 |
* creates an array of PowMap structs for integer powers 0 through N, |
|---|
| 578 |
* and calls linearRegress. |
|---|
| 579 |
* |
|---|
| 580 |
* Returns: A PolyFitRes containing the array of PowMap structs created and |
|---|
| 581 |
* a RegressRes. The PolyFitRes is alias this'd to the RegressRes.*/ |
|---|
| 582 |
PolyFitRes!(PowMap!(uint, T)[]) |
|---|
| 583 |
polyFit(T, U)(U Y, T X, uint N, double confInt = 0.95) { |
|---|
| 584 |
enforceConfidence(confInt); |
|---|
| 585 |
auto pows = new PowMap!(uint, T)[N + 1]; |
|---|
| 586 |
foreach(exponent; 0..N + 1) { |
|---|
| 587 |
pows[exponent] = powMap(X, exponent); |
|---|
| 588 |
} |
|---|
| 589 |
alias PolyFitRes!(typeof(pows)) RT; |
|---|
| 590 |
RT ret; |
|---|
| 591 |
ret.X = pows; |
|---|
| 592 |
ret.regressRes = linearRegress(Y, pows, confInt); |
|---|
| 593 |
return ret; |
|---|
| 594 |
} |
|---|
| 595 |
|
|---|
| 596 |
version(unittest) { |
|---|
| 597 |
import std.stdio; |
|---|
| 598 |
void main(){} |
|---|
| 599 |
} |
|---|
| 600 |
|
|---|
| 601 |
unittest { |
|---|
| 602 |
// These are a bunch of values gleaned from various examples on the Web. |
|---|
| 603 |
double[] heights = [1.47,1.5,1.52,1.55,1.57,1.60,1.63,1.65,1.68,1.7,1.73,1.75, |
|---|
| 604 |
1.78,1.8,1.83]; |
|---|
| 605 |
double[] weights = [52.21,53.12,54.48,55.84,57.2,58.57,59.93,61.29,63.11,64.47, |
|---|
| 606 |
66.28,68.1,69.92,72.19,74.46]; |
|---|
| 607 |
float[] diseaseSev = [1.9,3.1,3.3,4.8,5.3,6.1,6.4,7.6,9.8,12.4]; |
|---|
| 608 |
ubyte[] temperature = [2,1,5,5,20,20,23,10,30,25]; |
|---|
| 609 |
|
|---|
| 610 |
// Values from R. |
|---|
| 611 |
auto res1 = polyFit(diseaseSev, temperature, 1); |
|---|
| 612 |
assert(approxEqual(res1.betas[0], 2.6623)); |
|---|
| 613 |
assert(approxEqual(res1.betas[1], 0.2417)); |
|---|
| 614 |
assert(approxEqual(res1.stdErr[0], 1.1008)); |
|---|
| 615 |
assert(approxEqual(res1.stdErr[1], 0.0635)); |
|---|
| 616 |
assert(approxEqual(res1.p[0], 0.0419)); |
|---|
| 617 |
assert(approxEqual(res1.p[1], 0.0052)); |
|---|
| 618 |
assert(approxEqual(res1.R2, 0.644)); |
|---|
| 619 |
assert(approxEqual(res1.adjustedR2, 0.6001)); |
|---|
| 620 |
assert(approxEqual(res1.residualError, 2.03)); |
|---|
| 621 |
assert(approxEqual(res1.overallP, 0.00518)); |
|---|
| 622 |
|
|---|
| 623 |
|
|---|
| 624 |
auto res2 = polyFit(weights, heights, 2); |
|---|
| 625 |
assert(approxEqual(res2.betas[0], 128.813)); |
|---|
| 626 |
assert(approxEqual(res2.betas[1], -143.162)); |
|---|
| 627 |
assert(approxEqual(res2.betas[2], 61.960)); |
|---|
| 628 |
|
|---|
| 629 |
assert(approxEqual(res2.stdErr[0], 16.308)); |
|---|
| 630 |
assert(approxEqual(res2.stdErr[1], 19.833)); |
|---|
| 631 |
assert(approxEqual(res2.stdErr[2], 6.008)); |
|---|
| 632 |
|
|---|
| 633 |
assert(approxEqual(res2.p[0], 4.28e-6)); |
|---|
| 634 |
assert(approxEqual(res2.p[1], 1.06e-5)); |
|---|
| 635 |
assert(approxEqual(res2.p[2], 2.57e-7)); |
|---|
| 636 |
|
|---|
| 637 |
assert(approxEqual(res2.R2, 0.9989, 0.0001)); |
|---|
| 638 |
assert(approxEqual(res2.adjustedR2, 0.9987, 0.0001)); |
|---|
| 639 |
|
|---|
| 640 |
assert(approxEqual(res2.lowerBound[0], 92.9, 0.01)); |
|---|
| 641 |
assert(approxEqual(res2.lowerBound[1], -186.8, 0.01)); |
|---|
| 642 |
assert(approxEqual(res2.lowerBound[2], 48.7, 0.01)); |
|---|
| 643 |
assert(approxEqual(res2.upperBound[0], 164.7, 0.01)); |
|---|
| 644 |
assert(approxEqual(res2.upperBound[1], -99.5, 0.01)); |
|---|
| 645 |
assert(approxEqual(res2.upperBound[2], 75.2, 0.01)); |
|---|
| 646 |
|
|---|
| 647 |
auto res3 = linearRegress(weights, repeat(1), heights, map!"a * a"(heights)); |
|---|
| 648 |
assert(res2.betas == res3.betas); |
|---|
| 649 |
|
|---|
| 650 |
double[2] beta1Buf; |
|---|
| 651 |
auto beta1 = linearRegressBetaBuf |
|---|
| 652 |
(beta1Buf[], diseaseSev, repeat(1), temperature); |
|---|
| 653 |
assert(beta1Buf.ptr == beta1.ptr); |
|---|
| 654 |
assert(beta1Buf[] == beta1[]); |
|---|
| 655 |
assert(beta1 == res1.betas); |
|---|
| 656 |
auto beta2 = polyFitBeta(weights, heights, 2); |
|---|
| 657 |
assert(beta2 == res2.betas); |
|---|
| 658 |
|
|---|
| 659 |
auto res4 = linearRegress(weights, repeat(1), heights); |
|---|
| 660 |
assert(approxEqual(res4.p, 3.604e-14)); |
|---|
| 661 |
assert(approxEqual(res4.betas, [-39.062, 61.272])); |
|---|
| 662 |
assert(approxEqual(res4.p, [6.05e-9, 3.60e-14])); |
|---|
| 663 |
assert(approxEqual(res4.R2, 0.9892)); |
|---|
| 664 |
assert(approxEqual(res4.adjustedR2, 0.9884)); |
|---|
| 665 |
assert(approxEqual(res4.residualError, 0.7591)); |
|---|
| 666 |
assert(approxEqual(res4.lowerBound, [-45.40912, 57.43554])); |
|---|
| 667 |
assert(approxEqual(res4.upperBound, [-32.71479, 65.10883])); |
|---|
| 668 |
|
|---|
| 669 |
// Test residuals. |
|---|
| 670 |
assert(approxEqual(residuals(res4.betas, weights, repeat(1), heights), |
|---|
| 671 |
[1.20184170, 0.27367611, 0.40823237, -0.06993322, 0.06462305, |
|---|
| 672 |
-0.40354255, -0.88170814, -0.74715188, -0.76531747, -0.63076120, |
|---|
| 673 |
-0.65892680, -0.06437053, -0.08253613, 0.96202014, 1.39385455])); |
|---|
| 674 |
} |
|---|
| 675 |
|
|---|
| 676 |
/**Computes a logistic regression using a maximum likelihood estimator |
|---|
| 677 |
* and returns the beta coefficients. This is a generalized linear model with |
|---|
| 678 |
* the link function f(XB) = 1 / (1 + exp(XB)). This is generally used to model |
|---|
| 679 |
* the probability that a binary Y variable is 1 given a set of X variables. |
|---|
| 680 |
* |
|---|
| 681 |
* For the purpose of this function, Y variables are interpreted as Booleans, |
|---|
| 682 |
* regardless of their type. X may be either a range of ranges or a tuple of |
|---|
| 683 |
* ranges. However, note that unlike in linearRegress, they are copied to an |
|---|
| 684 |
* array if they are not random access ranges. Note that each value is accessed |
|---|
| 685 |
* several times, so if your range is a map to something expensive, you may |
|---|
| 686 |
* want to evaluate it eagerly. |
|---|
| 687 |
* |
|---|
| 688 |
* Also note that, as in linearRegress, repeat(1) can be used for the intercept |
|---|
| 689 |
* term. |
|---|
| 690 |
* |
|---|
| 691 |
* Returns: The beta coefficients for the regression model. |
|---|
| 692 |
* |
|---|
| 693 |
* TODO: Add hypothesis testing stuff and generalize to a parametrizable |
|---|
| 694 |
* generalized linear model function. |
|---|
| 695 |
* |
|---|
| 696 |
* References: |
|---|
| 697 |
* http://en.wikipedia.org/wiki/Logistic_regression |
|---|
| 698 |
* http://socserv.mcmaster.ca/jfox/Courses/UCLA/logistic-regression-notes.pdf |
|---|
| 699 |
*/ |
|---|
| 700 |
double[] logisticRegressBeta(T, U...)(T yIn, U xIn) { |
|---|
| 701 |
mixin(newFrame); |
|---|
| 702 |
|
|---|
| 703 |
static assert(!isInfinite!T, "Can't do regression with infinite # of Y's."); |
|---|
| 704 |
static if(isRandomAccessRange!T) { |
|---|
| 705 |
alias yIn y; |
|---|
| 706 |
} else { |
|---|
| 707 |
auto y = toBools(yIn); |
|---|
| 708 |
} |
|---|
| 709 |
|
|---|
| 710 |
static if(U.length == 1 && isRoR!U) { |
|---|
| 711 |
static if(isForwardRange!U) { |
|---|
| 712 |
auto x = toRandomAccessRoR(y.length, xIn); |
|---|
| 713 |
} else { |
|---|
| 714 |
auto x = toRandomAccessRoR(y.length, tempdup(xIn)); |
|---|
| 715 |
} |
|---|
| 716 |
} else { |
|---|
| 717 |
auto x = toRandomAccessTuple(xIn).expand; |
|---|
| 718 |
} |
|---|
| 719 |
|
|---|
| 720 |
auto beta = new double[x.length]; |
|---|
| 721 |
beta[] = 0; |
|---|
| 722 |
|
|---|
| 723 |
doMLE(beta, y, x); |
|---|
| 724 |
|
|---|
| 725 |
return beta; |
|---|
| 726 |
} |
|---|
| 727 |
|
|---|
| 728 |
unittest { |
|---|
| 729 |
// Values from R. |
|---|
| 730 |
alias approxEqual ae; // Save typing. |
|---|
| 731 |
|
|---|
| 732 |
// Start with the basics, with X as a ror. |
|---|
| 733 |
auto y1 = [1, 0, 0, 0, 1, 0, 0]; |
|---|
| 734 |
auto x1 = [[1.0, 1 ,1 ,1 ,1 ,1 ,1], |
|---|
| 735 |
[8.0, 6, 7, 5, 3, 0, 9]]; |
|---|
| 736 |
auto res1 = logisticRegressBeta(y1, x1); |
|---|
| 737 |
assert(ae(res1[0], -0.98273)); |
|---|
| 738 |
assert(ae(res1[1], 0.01219)); |
|---|
| 739 |
|
|---|
| 740 |
// Use tuple. |
|---|
| 741 |
auto y2 = [1,0,1,1,0,1,0,0,0,1,0,1]; |
|---|
| 742 |
auto x2_1 = [3,1,4,1,5,9,2,6,5,3,5,8]; |
|---|
| 743 |
auto x2_2 = [2,7,1,8,2,8,1,8,2,8,4,5]; |
|---|
| 744 |
auto res2 = logisticRegressBeta(y2, repeat(1), x2_1, x2_2); |
|---|
| 745 |
assert(ae(res2[0], -1.1875)); |
|---|
| 746 |
assert(ae(res2[1], 0.1021)); |
|---|
| 747 |
assert(ae(res2[2], 0.1603)); |
|---|
| 748 |
|
|---|
| 749 |
auto x2Intercept = [1,1,1,1,1,1,1,1,1,1,1,1]; |
|---|
| 750 |
auto res2a = logisticRegressBeta(y2, |
|---|
| 751 |
filter!"a.length"([x2Intercept, x2_1, x2_2])); |
|---|
| 752 |
assert(ae(res2a, res2)); |
|---|
| 753 |
|
|---|
| 754 |
// Use a huge range of values to test numerical stability. |
|---|
| 755 |
|
|---|
| 756 |
// The filter is to make y3 a non-random access range. |
|---|
| 757 |
auto y3 = filter!"a < 2"([1,1,1,1,0,0,0,0]); |
|---|
| 758 |
auto x3_1 = filter!"a > 0"([1, 1e10, 2, 2e10, 3, 3e15, 4, 4e7]); |
|---|
| 759 |
auto x3_2 = [1e8, 1e6, 1e7, 1e5, 1e3, 1e0, 1e9, 1e11]; |
|---|
| 760 |
auto x3_3 = [-5e12, 5e2, 6e5, 4e3, -999999, -666, -3e10, -2e10]; |
|---|
| 761 |
auto res3 = logisticRegressBeta(y3, repeat(1), x3_1, x3_2, x3_3); |
|---|
| 762 |
assert(ae(res3[0], 1.115e0)); |
|---|
| 763 |
assert(ae(res3[1], -4.674e-15)); |
|---|
| 764 |
assert(ae(res3[2], -7.026e-9)); |
|---|
| 765 |
assert(ae(res3[3], -2.109e-12)); |
|---|
| 766 |
|
|---|
| 767 |
// Test with a just plain huge dataset that R chokes for several minutes |
|---|
| 768 |
// on. If you think this unittest is slow, try getting the reference |
|---|
| 769 |
// values from R. |
|---|
| 770 |
auto y4 = chain( |
|---|
| 771 |
take(cycle([0,0,0,0,1]), 500_000), |
|---|
| 772 |
take(cycle([1,1,1,1,0]), 500_000)); |
|---|
| 773 |
auto x4_1 = iota(0, 1_000_000); |
|---|
| 774 |
auto x4_2 = map!exp(map!"a / 1_000_000.0"(x4_1)); |
|---|
| 775 |
auto x4_3 = take(cycle([1,2,3,4,5]), 1_000_000); |
|---|
| 776 |
auto x4_4 = take(cycle([8,6,7,5,3,0,9]), 1_000_000); |
|---|
| 777 |
auto res4 = logisticRegressBeta(y4, repeat(1), x4_1, x4_2, x4_3, x4_4); |
|---|
| 778 |
assert(ae(res4[0], -1.574)); |
|---|
| 779 |
assert(ae(res4[1], 5.625e-6)); |
|---|
| 780 |
assert(ae(res4[2], -7.282e-1)); |
|---|
| 781 |
assert(ae(res4[3], -4.381e-6)); |
|---|
| 782 |
assert(ae(res4[4], -8.343e-6)); |
|---|
| 783 |
} |
|---|
| 784 |
|
|---|
| 785 |
/// The inverse logit function used in logistic regression. |
|---|
| 786 |
double inverseLogit(double xb) pure nothrow { |
|---|
| 787 |
return 1.0 / (1 + exp(-xb)); |
|---|
| 788 |
} |
|---|
| 789 |
|
|---|
| 790 |
private: |
|---|
| 791 |
double doMLE(T, U...)(double[] beta, T y, U xIn) { |
|---|
| 792 |
// This big, disgusting function uses the Newton-Raphson method as outlined |
|---|
| 793 |
// in http://socserv.mcmaster.ca/jfox/Courses/UCLA/logistic-regression-notes.pdf |
|---|
| 794 |
// |
|---|
| 795 |
// The matrix operations are kind of obfuscated because they're written |
|---|
| 796 |
// using very low-level primitives and with as little temp space as |
|---|
| 797 |
// possible used. |
|---|
| 798 |
static if(isRoR!(U[0]) && U.length == 1) { |
|---|
| 799 |
alias xIn[0] x; |
|---|
| 800 |
} else { |
|---|
| 801 |
alias xIn x; |
|---|
| 802 |
} |
|---|
| 803 |
|
|---|
| 804 |
mixin(newFrame); |
|---|
| 805 |
immutable N = y.length; |
|---|
| 806 |
|
|---|
| 807 |
auto ps = newStack!double(y.length); |
|---|
| 808 |
|
|---|
| 809 |
double[] xRow = newStack!double(beta.length); |
|---|
| 810 |
void evalPs() { |
|---|
| 811 |
foreach(i; 0..N) { |
|---|
| 812 |
|
|---|
| 813 |
double prodSum = 0; |
|---|
| 814 |
foreach(j, col; x) { |
|---|
| 815 |
prodSum += col[i] * beta[j]; |
|---|
| 816 |
} |
|---|
| 817 |
|
|---|
| 818 |
ps[i] = inverseLogit(prodSum); |
|---|
| 819 |
assert(ps[i] >= 0, text(ps[i])); |
|---|
| 820 |
assert(ps[i] <= 1, text(ps[i])); |
|---|
| 821 |
} |
|---|
| 822 |
} |
|---|
| 823 |
|
|---|
| 824 |
double logLikelihood() { |
|---|
| 825 |
double sum = 0; |
|---|
| 826 |
size_t i = 0; |
|---|
| 827 |
foreach(yVal; y) { |
|---|
| 828 |
scope(exit) i++; |
|---|
| 829 |
if(yVal) { |
|---|
| 830 |
sum -= 2 * log(ps[i]); |
|---|
| 831 |
} else { |
|---|
| 832 |
sum -= 2 * log(1 - ps[i]); |
|---|
| 833 |
} |
|---|
| 834 |
} |
|---|
| 835 |
return sum; |
|---|
| 836 |
} |
|---|
| 837 |
|
|---|
| 838 |
|
|---|
| 839 |
enum eps = 1e-6; |
|---|
| 840 |
enum maxIter = 1000; |
|---|
| 841 |
|
|---|
| 842 |
auto oldLikelihood = double.infinity; |
|---|
| 843 |
|
|---|
| 844 |
auto mat = newStack!(double[])(beta.length); |
|---|
| 845 |
foreach(ref row; mat) { |
|---|
| 846 |
// The *2 is for the augmentations scratch space for inversion. |
|---|
| 847 |
row = newStack!double(beta.length * 2); |
|---|
| 848 |
} |
|---|
| 849 |
|
|---|
| 850 |
foreach(iter; 0..maxIter) { |
|---|
| 851 |
evalPs(); |
|---|
| 852 |
immutable lh = logLikelihood(); |
|---|
| 853 |
|
|---|
| 854 |
if(oldLikelihood - lh < eps || isNaN(lh)) { |
|---|
| 855 |
return lh; |
|---|
| 856 |
} |
|---|
| 857 |
oldLikelihood = lh; |
|---|
| 858 |
|
|---|
| 859 |
foreach(i; 0..beta.length) { |
|---|
| 860 |
mat[i] = mat[i][0..beta.length]; |
|---|
| 861 |
mat[i][] = 0; |
|---|
| 862 |
} |
|---|
| 863 |
|
|---|
| 864 |
// Calculate X' * W * X in the notation of our reference. Since |
|---|
| 865 |
// V is a diagonal matrix of ps[] * (1.0 - ps[]), we only have one |
|---|
| 866 |
// dimension representing it. |
|---|
| 867 |
foreach(i, dummy; x) foreach(j, dummy2; x) { |
|---|
| 868 |
foreach(k; 0..ps.length) { |
|---|
| 869 |
mat[i][j] += (ps[k] * (1 - ps[k])) * x[i][k] * x[j][k]; |
|---|
| 870 |
} |
|---|
| 871 |
} |
|---|
| 872 |
|
|---|
| 873 |
foreach(i; 0..mat.length) { |
|---|
| 874 |
// We allocated this augmentation area, but it got sliced away by |
|---|
| 875 |
// invert(). Put it back. |
|---|
| 876 |
mat[i] = mat[i].ptr[0..beta.length * 2]; |
|---|
| 877 |
mat[i][beta.length..$] = 0; |
|---|
| 878 |
} |
|---|
| 879 |
|
|---|
| 880 |
// Invert the intermediate matrix. |
|---|
| 881 |
invert(mat); |
|---|
| 882 |
|
|---|
| 883 |
// Now, multiply the resulting matrix by X' * (y - p). |
|---|
| 884 |
foreach(betaIndex, ref b; beta) { |
|---|
| 885 |
double diff = 0; |
|---|
| 886 |
|
|---|
| 887 |
foreach(pIndex, p; ps) { |
|---|
| 888 |
immutable pDiff = (y[pIndex] != 0) ? (1.0 - p) : -1.0 * p; |
|---|
| 889 |
double sum = 0; |
|---|
| 890 |
foreach(betaIndex2, dummy; x) { |
|---|
| 891 |
diff += mat[betaIndex][betaIndex2] * |
|---|
| 892 |
x[betaIndex2][pIndex] * pDiff; |
|---|
| 893 |
} |
|---|
| 894 |
} |
|---|
| 895 |
|
|---|
| 896 |
b += diff; |
|---|
| 897 |
} |
|---|
| 898 |
|
|---|
| 899 |
debug(print) writeln("Iter: ", iter); |
|---|
| 900 |
} |
|---|
| 901 |
|
|---|
| 902 |
return logLikelihood(); |
|---|
| 903 |
} |
|---|
| 904 |
|
|---|
| 905 |
template isRoR(T) { |
|---|
| 906 |
static if(!isInputRange!T) { |
|---|
| 907 |
enum isRoR = false; |
|---|
| 908 |
} else { |
|---|
| 909 |
enum isRoR = isInputRange!(typeof(T.init.front())); |
|---|
| 910 |
} |
|---|
| 911 |
} |
|---|
| 912 |
|
|---|
| 913 |
template isFloatMat(T) { |
|---|
| 914 |
static if(is(T : const(float[][])) || |
|---|
| 915 |
is(T : const(real[][])) || is(T : const(double[][]))) { |
|---|
| 916 |
enum isFloatMat = true; |
|---|
| 917 |
} else { |
|---|
| 918 |
enum isFloatMat = false; |
|---|
| 919 |
} |
|---|
| 920 |
} |
|---|
| 921 |
|
|---|
| 922 |
template NonRandomToArray(T) { |
|---|
| 923 |
static if(isRandomAccessRange!T) { |
|---|
| 924 |
alias T NonRandomToArray; |
|---|
| 925 |
} else { |
|---|
| 926 |
alias Unqual!(ElementType!(T))[] NonRandomToArray; |
|---|
| 927 |
} |
|---|
| 928 |
} |
|---|
| 929 |
|
|---|
| 930 |
bool[] toBools(R)(R range) { |
|---|
| 931 |
return tempdup(map!"(a) ? true : false"(range)); |
|---|
| 932 |
} |
|---|
| 933 |
|
|---|
| 934 |
auto toRandomAccessRoR(T)(uint len, T ror) { |
|---|
| 935 |
static assert(isRoR!T); |
|---|
| 936 |
alias ElementType!T E; |
|---|
| 937 |
static if(isRandomAccessRange!T && isRandomAccessRange!E) { |
|---|
| 938 |
return ror; |
|---|
| 939 |
} else static if(!isRandomAccessRange!T && isRandomAccessRange!E) { |
|---|
| 940 |
return tempdup(ror); |
|---|
| 941 |
} else { |
|---|
| 942 |
auto ret = newStack!(E[])(walkLength(ror.save)); |
|---|
| 943 |
|
|---|
| 944 |
foreach(ref col; ret) { |
|---|
| 945 |
scope(exit) ror.popFront(); |
|---|
| 946 |
col = newStack!E(len); |
|---|
| 947 |
|
|---|
| 948 |
size_t i; |
|---|
| 949 |
foreach(elem; ror.front) { |
|---|
| 950 |
col[i++] = elem; |
|---|
| 951 |
} |
|---|
| 952 |
} |
|---|
| 953 |
|
|---|
| 954 |
return ret; |
|---|
| 955 |
} |
|---|
| 956 |
} |
|---|
| 957 |
|
|---|
| 958 |
auto toRandomAccessTuple(T...)(T input) { |
|---|
| 959 |
Tuple!(staticMap!(NonRandomToArray, T)) ret; |
|---|
| 960 |
|
|---|
| 961 |
foreach(ti, range; input) { |
|---|
| 962 |
static if(isRandomAccessRange!(typeof(range))) { |
|---|
| 963 |
ret.field[ti] = range; |
|---|
| 964 |
} else { |
|---|
| 965 |
ret.field[ti] = tempdup(range); |
|---|
| 966 |
} |
|---|
| 967 |
} |
|---|
| 968 |
|
|---|
| 969 |
return ret; |
|---|
| 970 |
} |
|---|