[8783660] | 1 | /* |
---|
| 2 | * Linpack 100x100 Benchmark In C/C++ For PCs |
---|
| 3 | * |
---|
| 4 | ******************************************************************** |
---|
| 5 | * |
---|
| 6 | * Original Source from NETLIB |
---|
| 7 | * |
---|
| 8 | * Translated to C by Bonnie Toy 5/88 (modified on 2/25/94 to fix |
---|
| 9 | * a problem with daxpy for unequal increments or equal increments |
---|
| 10 | * not equal to 1. Jack Dongarra) |
---|
| 11 | * |
---|
| 12 | * To obtain rolled source BLAS, add -DROLL to the command lines. |
---|
| 13 | * To obtain unrolled source BLAS, add -DUNROLL to the command lines. |
---|
| 14 | * |
---|
| 15 | * You must specify one of -DSP or -DDP to compile correctly. |
---|
| 16 | * |
---|
| 17 | * You must specify one of -DROLL or -DUNROLL to compile correctly. |
---|
| 18 | * |
---|
| 19 | ******************************************************************** |
---|
| 20 | * |
---|
| 21 | * Changes in this version |
---|
| 22 | * |
---|
| 23 | * 1. Function prototypes are declared and function headers have |
---|
| 24 | * embedded parameter types to produce code for C and C++ |
---|
| 25 | * |
---|
| 26 | * 2. Arrays aa and a are declared as [200*200] and [200*201] to |
---|
| 27 | * allow compilation with prototypes. |
---|
| 28 | * |
---|
| 29 | * 3. Function second changed (compiler dependent). |
---|
| 30 | * |
---|
| 31 | * 4. Timing method changed due to inaccuracy of PC clock (see below). |
---|
| 32 | * |
---|
| 33 | * 5. Additional date function included (compiler dependent). |
---|
| 34 | * |
---|
| 35 | * 6. Additional code used as a standard for a series of benchmarks:- |
---|
| 36 | * Automatic run time calibration rather than fixed parameters |
---|
| 37 | * Initial calibration with display to show linearity |
---|
| 38 | * Results displayed at reasonable rate for viewing (5 seconds) |
---|
| 39 | * Facilities for typing in details of system used etc. |
---|
| 40 | * Compiler details in code in case .exe files used elsewhere |
---|
| 41 | * Results appended to a text file (Linpack.txt) |
---|
| 42 | * |
---|
| 43 | * Roy Longbottom 101323.2241@compuserve.com 14 September 1996 |
---|
| 44 | * |
---|
| 45 | ************************************************************************ |
---|
| 46 | * |
---|
| 47 | * Timing |
---|
| 48 | * |
---|
| 49 | * The PC timer is updated at about 18 times per second or resolution of |
---|
| 50 | * 0.05 to 0.06 seconds which is similar to the time taken by the main |
---|
| 51 | * time consuming function dgefa on a 100 MHz Pentium. Thus there is no |
---|
| 52 | * point in running the dgefa/dges1 combination three times as in the |
---|
| 53 | * original version. Main timing for the latter, in the loop run NTIMES, |
---|
| 54 | * executes matgen/dgefa, summing the time taken by matgen within the |
---|
| 55 | * loop for later deduction from the total time. On a modern PC this sum |
---|
| 56 | * can be based on a random selection of 0 or 0.05/0.06. This version |
---|
| 57 | * executes the single pass once and the main timing loop five times, |
---|
| 58 | * calculating the matgen overhead separately. |
---|
| 59 | * |
---|
| 60 | ************************************************************************* |
---|
| 61 | * |
---|
| 62 | * Example of Output |
---|
| 63 | * |
---|
| 64 | * Rolled Double Precision Linpack Benchmark - PC Version in 'C/C++' |
---|
| 65 | * |
---|
| 66 | * Compiler Watcom C/C++ 10.5 Win 386 |
---|
| 67 | * Optimisation -zp4 -otexan -fp5 -5r -dDP -dROLL |
---|
| 68 | * |
---|
| 69 | * |
---|
| 70 | * norm resid resid machep x[0]-1 x[n-1]-1 |
---|
| 71 | * 0.4 7.41628980e-014 1.00000000e-015 -1.49880108e-014 -1.89848137e-014 |
---|
| 72 | * |
---|
| 73 | * |
---|
| 74 | * Times are reported for matrices of order 100 |
---|
| 75 | * 1 pass times for array with leading dimension of 201 |
---|
| 76 | * |
---|
| 77 | * dgefa dgesl total Mflops unit ratio |
---|
| 78 | * 0.06000 0.00000 0.06000 11.44 0.1748 1.0714 |
---|
| 79 | * |
---|
| 80 | * |
---|
| 81 | * Calculating matgen overhead |
---|
| 82 | * |
---|
| 83 | * 10 times 0.11 seconds |
---|
| 84 | * 20 times 0.22 seconds |
---|
| 85 | * 40 times 0.44 seconds |
---|
| 86 | * 80 times 0.87 seconds |
---|
| 87 | * 160 times 1.76 seconds |
---|
| 88 | * 320 times 3.52 seconds |
---|
| 89 | * 640 times 7.03 seconds |
---|
| 90 | * |
---|
| 91 | * Overhead for 1 matgen 0.01098 seconds |
---|
| 92 | * |
---|
| 93 | * |
---|
| 94 | * Calculating matgen/dgefa passes for 5 seconds |
---|
| 95 | * |
---|
| 96 | * 10 times 0.71 seconds |
---|
| 97 | * 20 times 1.38 seconds |
---|
| 98 | * 40 times 2.80 seconds |
---|
| 99 | * 80 times 5.66 seconds |
---|
| 100 | * |
---|
| 101 | * Passes used 70 |
---|
| 102 | * |
---|
| 103 | * This is followed by output of the normal data for dgefa, dges1, |
---|
| 104 | * total, Mflops, unit and ratio with five sets of results for each. |
---|
| 105 | * |
---|
| 106 | ************************************************************************ |
---|
| 107 | * |
---|
| 108 | * Example from output file Linpack.txt |
---|
| 109 | * |
---|
| 110 | * LINPACK BENCHMARK FOR PCs 'C/C++' n @ 100 |
---|
| 111 | * |
---|
| 112 | * Month run 9/1996 |
---|
| 113 | * PC model Escom |
---|
| 114 | * CPU Pentium |
---|
| 115 | * Clock MHz 100 |
---|
| 116 | * Cache 256K |
---|
| 117 | * Options Neptune chipset |
---|
| 118 | * OS/DOS Windows 95 |
---|
| 119 | * Compiler Watcom C/C++ 10.5 Win 386 |
---|
| 120 | * OptLevel -zp4 -otexan -fp5 -5r -dDP -dROLL |
---|
| 121 | * Run by Roy Longbottom |
---|
| 122 | * From UK |
---|
| 123 | * Mail 101323.2241@compuserve.com |
---|
| 124 | * |
---|
| 125 | * Rolling Rolled |
---|
| 126 | * Precision Double |
---|
| 127 | * norm. resid 0.4 |
---|
| 128 | * resid 7.41628980e-014 |
---|
| 129 | * machep 1.00000000e-015 (8.88178420e-016 NON OPT) |
---|
| 130 | * x[0]-1 -1.49880108e-014 |
---|
| 131 | * x[n-1]-1 -1.89848137e-014 |
---|
| 132 | * matgen 1 seconds 0.01051 |
---|
| 133 | * matgen 2 seconds 0.01050 |
---|
| 134 | * Repetitions 70 |
---|
| 135 | * Leading dimension 201 |
---|
| 136 | * dgefa dgesl total Mflops |
---|
| 137 | * 1 pass seconds 0.06000 0.00000 0.06000 |
---|
| 138 | * Repeat seconds 0.06092 0.00157 0.06249 10.99 |
---|
| 139 | * Repeat seconds 0.06077 0.00157 0.06234 11.01 |
---|
| 140 | * Repeat seconds 0.06092 0.00157 0.06249 10.99 |
---|
| 141 | * Repeat seconds 0.06092 0.00157 0.06249 10.99 |
---|
| 142 | * Repeat seconds 0.06092 0.00157 0.06249 10.99 |
---|
| 143 | * Average 10.99 |
---|
| 144 | * Leading dimension 200 |
---|
| 145 | * Repeat seconds 0.05936 0.00157 0.06093 11.27 |
---|
| 146 | * Repeat seconds 0.05936 0.00157 0.06093 11.27 |
---|
| 147 | * Repeat seconds 0.05864 0.00157 0.06021 11.40 |
---|
| 148 | * Repeat seconds 0.05936 0.00157 0.06093 11.27 |
---|
| 149 | * Repeat seconds 0.05864 0.00157 0.06021 11.40 |
---|
| 150 | * Average 11.32 |
---|
| 151 | * |
---|
| 152 | ************************************************************************ |
---|
| 153 | * |
---|
| 154 | * Examples of Results |
---|
| 155 | * |
---|
| 156 | * Precompiled codes were produced via a Watcom C/C++ 10.5 compiler. |
---|
| 157 | * Versions are available for DOS, Windows 3/95 and NT/Win 95. Both |
---|
| 158 | * non-optimised and optimised programs are available. The latter has |
---|
| 159 | * options as in the above example. Although these options can place |
---|
| 160 | * functions in-line, in this case, daxpy is not in-lined. Optimisation |
---|
| 161 | * reduces 18 instructions in the loop in this function to the following: |
---|
| 162 | * |
---|
| 163 | * L85 fld st(0) |
---|
| 164 | * fmul qword ptr [edx] |
---|
| 165 | * add eax,00000008H |
---|
| 166 | * add edx,00000008H |
---|
| 167 | * fadd qword ptr -8H[eax] |
---|
| 168 | * inc ebx |
---|
| 169 | * fstp qword ptr -8H[eax] |
---|
| 170 | * cmp ebx,esi |
---|
| 171 | * jl L85 |
---|
| 172 | * |
---|
| 173 | * Results produced are not consistent between runs but produce similar |
---|
| 174 | * speeds when executing at a particular dimension (see above). An example |
---|
| 175 | * of other results is 11.4/10.5 Mflops. Most typical double precision |
---|
| 176 | * rolled results are: |
---|
| 177 | * |
---|
| 178 | * Opt No Opt Version/ |
---|
| 179 | * MHz Cache Mflops Mflops Make/Options Via |
---|
| 180 | * |
---|
| 181 | * AM80386DX 40 128K 0.53 0.36 Clone Win/W95 |
---|
| 182 | * 80486DX2 66 128K 2.5 1.9 Escom SIS chipset Win/W95 |
---|
| 183 | * 80486DX2 66 128K 2.3 1.9 Escom SIS chipset NT/W95 |
---|
| 184 | * 80486DX2 66 128K 2.8 2.0 Escom SIS chipset Dos/Dos |
---|
| 185 | * Pentium 100 256K 11 4.2 Escom Neptune chipset Win/W95 |
---|
| 186 | * Pentium 100 256K 11 5.5 Escom Neptune chipset NT/W95 |
---|
| 187 | * Pentium 100 256K 12 4.4 Escom Neptune chipset Dos/Dos |
---|
| 188 | * Pentium Pro 200 256K 48 19 Dell XPS Pro200n NT/NT |
---|
| 189 | * |
---|
| 190 | * The results are as produced when compiled as Linpack.cpp. Compiling as |
---|
| 191 | * Linpack.c gives similar speeds but the code is a little different. |
---|
| 192 | * |
---|
| 193 | *************************************************************************** |
---|
| 194 | */ |
---|
| 195 | |
---|
[0a16d5f] | 196 | #define DP |
---|
| 197 | #define ROLL |
---|
[8783660] | 198 | |
---|
| 199 | #ifdef SP |
---|
| 200 | #define REAL float |
---|
| 201 | #define ZERO 0.0 |
---|
| 202 | #define ONE 1.0 |
---|
| 203 | #define PREC "Single " |
---|
| 204 | #endif |
---|
| 205 | |
---|
| 206 | #ifdef DP |
---|
| 207 | #define REAL double |
---|
| 208 | #define ZERO 0.0e0 |
---|
| 209 | #define ONE 1.0e0 |
---|
| 210 | #define PREC "Double " |
---|
| 211 | #endif |
---|
| 212 | |
---|
| 213 | #ifdef ROLL |
---|
| 214 | #define ROLLING "Rolled " |
---|
| 215 | #endif |
---|
| 216 | #ifdef UNROLL |
---|
| 217 | #define ROLLING "Unrolled " |
---|
| 218 | #endif |
---|
| 219 | |
---|
| 220 | |
---|
[0a16d5f] | 221 | #define NTIMES atoi(argv[1]) |
---|
[8783660] | 222 | |
---|
| 223 | #include <stdio.h> |
---|
| 224 | #include <math.h> |
---|
| 225 | #include <stdlib.h> |
---|
[8c1f4064] | 226 | #ifdef __rtems__ |
---|
| 227 | #include <tmacros.h> |
---|
| 228 | #undef print_time |
---|
| 229 | #define fprintf(f, ...) rtems_printf(&rtems_test_printer, __VA_ARGS__) |
---|
| 230 | #endif /* __rtems__ */ |
---|
[8783660] | 231 | |
---|
| 232 | |
---|
| 233 | static REAL atime[9][15]; |
---|
| 234 | |
---|
| 235 | void print_time (int row); |
---|
| 236 | void matgen (REAL a[], int lda, int n, REAL b[], REAL *norma); |
---|
| 237 | void dgefa (REAL a[], int lda, int n, int ipvt[], int *info); |
---|
| 238 | void dgesl (REAL a[],int lda,int n,int ipvt[],REAL b[],int job); |
---|
| 239 | void dmxpy (int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[]); |
---|
| 240 | void daxpy (int n, REAL da, REAL dx[], int incx, REAL dy[], int incy); |
---|
| 241 | REAL epslon (REAL x); |
---|
| 242 | int idamax (int n, REAL dx[], int incx); |
---|
| 243 | void dscal (int n, REAL da, REAL dx[], int incx); |
---|
| 244 | REAL ddot (int n, REAL dx[], int incx, REAL dy[], int incy); |
---|
| 245 | |
---|
| 246 | /* TIME TIME TIME TIME TIME TIME TIME TIME TIME TIME TIME TIME TIME */ |
---|
[0a16d5f] | 247 | #include <sys/time.h> /* for following time functions only */ |
---|
| 248 | static REAL second(void) |
---|
[8783660] | 249 | { |
---|
[0a16d5f] | 250 | struct timeval tv; |
---|
[8783660] | 251 | |
---|
[0a16d5f] | 252 | gettimeofday(&tv, NULL); |
---|
| 253 | return (double)tv.tv_sec + (double)tv.tv_usec * 1e-6; |
---|
[8783660] | 254 | } |
---|
| 255 | |
---|
[0a16d5f] | 256 | int main (int argc, char **argv) |
---|
[8783660] | 257 | { |
---|
| 258 | static REAL aa[200*200],a[200*201],b[200],x[200]; |
---|
| 259 | REAL cray,ops,total,norma,normx; |
---|
| 260 | REAL resid,residn,eps,t1,tm2,epsn,x1,x2; |
---|
| 261 | REAL mflops; |
---|
| 262 | static int ipvt[200],n,i,j,ntimes,info,lda,ldaa; |
---|
[0a16d5f] | 263 | int pass, loop; |
---|
[8783660] | 264 | REAL overhead1, overhead2, time1, time2; |
---|
[0a16d5f] | 265 | char *compiler, *options; |
---|
[8783660] | 266 | |
---|
| 267 | /************************************************************************ |
---|
| 268 | * Enter details of compiler and options used * |
---|
| 269 | ************************************************************************/ |
---|
| 270 | /*----------------- --------- --------- ---------*/ |
---|
| 271 | compiler = "INSERT COMPILER NAME HERE"; |
---|
| 272 | options = "INSERT OPTIMISATION OPTIONS HERE"; |
---|
| 273 | /* Include -dDP or -dSP and -dROLL or -dUNROLL */ |
---|
| 274 | |
---|
| 275 | lda = 201; |
---|
| 276 | ldaa = 200; |
---|
| 277 | cray = .056; |
---|
| 278 | n = 100; |
---|
| 279 | |
---|
| 280 | fprintf(stdout,ROLLING);fprintf(stdout,PREC); |
---|
| 281 | fprintf(stdout,"Precision Linpack Benchmark - PC Version in 'C/C++'\n\n"); |
---|
| 282 | fprintf(stdout,"Compiler %s\n",compiler); |
---|
| 283 | fprintf(stdout,"Optimisation %s\n\n",options); |
---|
| 284 | |
---|
| 285 | ops = (2.0e0*(n*n*n))/3.0 + 2.0*(n*n); |
---|
| 286 | |
---|
| 287 | matgen(a,lda,n,b,&norma); |
---|
| 288 | t1 = second(); |
---|
| 289 | dgefa(a,lda,n,ipvt,&info); |
---|
| 290 | atime[0][0] = second() - t1; |
---|
| 291 | t1 = second(); |
---|
| 292 | dgesl(a,lda,n,ipvt,b,0); |
---|
| 293 | atime[1][0] = second() - t1; |
---|
| 294 | total = atime[0][0] + atime[1][0]; |
---|
| 295 | |
---|
| 296 | /* compute a residual to verify results. */ |
---|
| 297 | |
---|
| 298 | for (i = 0; i < n; i++) { |
---|
| 299 | x[i] = b[i]; |
---|
| 300 | } |
---|
| 301 | matgen(a,lda,n,b,&norma); |
---|
| 302 | for (i = 0; i < n; i++) { |
---|
| 303 | b[i] = -b[i]; |
---|
| 304 | } |
---|
| 305 | dmxpy(n,b,n,lda,x,a); |
---|
| 306 | resid = 0.0; |
---|
| 307 | normx = 0.0; |
---|
| 308 | for (i = 0; i < n; i++) { |
---|
| 309 | resid = (resid > fabs((double)b[i])) |
---|
| 310 | ? resid : fabs((double)b[i]); |
---|
| 311 | normx = (normx > fabs((double)x[i])) |
---|
| 312 | ? normx : fabs((double)x[i]); |
---|
| 313 | } |
---|
| 314 | eps = epslon(ONE); |
---|
| 315 | residn = resid/( n*norma*normx*eps ); |
---|
| 316 | epsn = eps; |
---|
| 317 | x1 = x[0] - 1; |
---|
| 318 | x2 = x[n-1] - 1; |
---|
| 319 | |
---|
| 320 | printf("norm resid resid machep"); |
---|
| 321 | printf(" x[0]-1 x[n-1]-1\n"); |
---|
| 322 | printf("%6.1f %17.8e%17.8e%17.8e%17.8e\n\n", |
---|
| 323 | (double)residn, (double)resid, (double)epsn, |
---|
| 324 | (double)x1, (double)x2); |
---|
| 325 | |
---|
[0a16d5f] | 326 | fprintf(stdout,"Times are reported for matrices of order %5d\n",n); |
---|
| 327 | fprintf(stdout,"1 pass times for array with leading dimension of%5d\n\n",lda); |
---|
| 328 | fprintf(stdout," dgefa dgesl total Mflops unit"); |
---|
| 329 | fprintf(stdout," ratio\n"); |
---|
[8783660] | 330 | |
---|
| 331 | atime[2][0] = total; |
---|
| 332 | if (total > 0.0) |
---|
| 333 | { |
---|
| 334 | atime[3][0] = ops/(1.0e6*total); |
---|
| 335 | atime[4][0] = 2.0/atime[3][0]; |
---|
| 336 | } |
---|
| 337 | else |
---|
| 338 | { |
---|
| 339 | atime[3][0] = 0.0; |
---|
| 340 | atime[4][0] = 0.0; |
---|
| 341 | } |
---|
| 342 | atime[5][0] = total/cray; |
---|
| 343 | |
---|
| 344 | print_time(0); |
---|
| 345 | |
---|
| 346 | /************************************************************************ |
---|
| 347 | * Calculate overhead of executing matgen procedure * |
---|
| 348 | ************************************************************************/ |
---|
| 349 | |
---|
[0a16d5f] | 350 | fprintf (stdout,"\nCalculating matgen overhead\n"); |
---|
[8783660] | 351 | pass = -20; |
---|
| 352 | loop = NTIMES; |
---|
| 353 | do |
---|
| 354 | { |
---|
| 355 | time1 = second(); |
---|
| 356 | pass = pass + 1; |
---|
| 357 | for ( i = 0 ; i < loop ; i++) |
---|
| 358 | { |
---|
| 359 | matgen(a,lda,n,b,&norma); |
---|
| 360 | } |
---|
| 361 | time2 = second(); |
---|
| 362 | overhead1 = (time2 - time1); |
---|
[0a16d5f] | 363 | fprintf (stdout,"%10d times %6.2f seconds\n", loop, overhead1); |
---|
[8783660] | 364 | if (overhead1 > 5.0) |
---|
| 365 | { |
---|
| 366 | pass = 0; |
---|
| 367 | } |
---|
| 368 | if (pass < 0) |
---|
| 369 | { |
---|
| 370 | if (overhead1 < 0.1) |
---|
| 371 | { |
---|
| 372 | loop = loop * 10; |
---|
| 373 | } |
---|
| 374 | else |
---|
| 375 | { |
---|
| 376 | loop = loop * 2; |
---|
| 377 | } |
---|
| 378 | } |
---|
| 379 | } |
---|
| 380 | while (pass < 0); |
---|
| 381 | |
---|
| 382 | overhead1 = overhead1 / (double)loop; |
---|
| 383 | |
---|
[0a16d5f] | 384 | fprintf (stdout,"Overhead for 1 matgen %12.5f seconds\n\n", overhead1); |
---|
[8783660] | 385 | |
---|
| 386 | /************************************************************************ |
---|
| 387 | * Calculate matgen/dgefa passes for 5 seconds * |
---|
| 388 | ************************************************************************/ |
---|
| 389 | |
---|
[0a16d5f] | 390 | fprintf (stdout,"Calculating matgen/dgefa passes for 5 seconds\n"); |
---|
[8783660] | 391 | pass = -20; |
---|
| 392 | ntimes = NTIMES; |
---|
| 393 | do |
---|
| 394 | { |
---|
| 395 | time1 = second(); |
---|
| 396 | pass = pass + 1; |
---|
| 397 | for ( i = 0 ; i < ntimes ; i++) |
---|
| 398 | { |
---|
| 399 | matgen(a,lda,n,b,&norma); |
---|
| 400 | dgefa(a,lda,n,ipvt,&info ); |
---|
| 401 | } |
---|
| 402 | time2 = second() - time1; |
---|
[0a16d5f] | 403 | fprintf (stdout,"%10d times %6.2f seconds\n", ntimes, time2); |
---|
[8783660] | 404 | if (time2 > 5.0) |
---|
| 405 | { |
---|
| 406 | pass = 0; |
---|
| 407 | } |
---|
| 408 | if (pass < 0) |
---|
| 409 | { |
---|
| 410 | if (time2 < 0.1) |
---|
| 411 | { |
---|
| 412 | ntimes = ntimes * 10; |
---|
| 413 | } |
---|
| 414 | else |
---|
| 415 | { |
---|
| 416 | ntimes = ntimes * 2; |
---|
| 417 | } |
---|
| 418 | } |
---|
| 419 | } |
---|
| 420 | while (pass < 0); |
---|
| 421 | |
---|
| 422 | ntimes = 5.0 * (double)ntimes / time2; |
---|
| 423 | if (ntimes == 0) ntimes = 1; |
---|
| 424 | |
---|
[0a16d5f] | 425 | fprintf (stdout,"Passes used %10d \n\n", ntimes); |
---|
| 426 | fprintf(stdout,"Times for array with leading dimension of%4d\n\n",lda); |
---|
| 427 | fprintf(stdout," dgefa dgesl total Mflops unit"); |
---|
| 428 | fprintf(stdout," ratio\n"); |
---|
[8783660] | 429 | |
---|
| 430 | /************************************************************************ |
---|
| 431 | * Execute 5 passes * |
---|
| 432 | ************************************************************************/ |
---|
| 433 | |
---|
| 434 | tm2 = ntimes * overhead1; |
---|
| 435 | atime[3][6] = 0; |
---|
| 436 | |
---|
| 437 | for (j=1 ; j<6 ; j++) |
---|
| 438 | { |
---|
| 439 | |
---|
| 440 | t1 = second(); |
---|
| 441 | |
---|
| 442 | for (i = 0; i < ntimes; i++) |
---|
| 443 | { |
---|
| 444 | matgen(a,lda,n,b,&norma); |
---|
| 445 | dgefa(a,lda,n,ipvt,&info ); |
---|
| 446 | } |
---|
| 447 | |
---|
| 448 | atime[0][j] = (second() - t1 - tm2)/ntimes; |
---|
| 449 | |
---|
| 450 | t1 = second(); |
---|
| 451 | |
---|
| 452 | for (i = 0; i < ntimes; i++) |
---|
| 453 | { |
---|
| 454 | dgesl(a,lda,n,ipvt,b,0); |
---|
| 455 | } |
---|
| 456 | |
---|
| 457 | atime[1][j] = (second() - t1)/ntimes; |
---|
| 458 | total = atime[0][j] + atime[1][j]; |
---|
| 459 | atime[2][j] = total; |
---|
| 460 | atime[3][j] = ops/(1.0e6*total); |
---|
| 461 | atime[4][j] = 2.0/atime[3][j]; |
---|
| 462 | atime[5][j] = total/cray; |
---|
| 463 | atime[3][6] = atime[3][6] + atime[3][j]; |
---|
| 464 | |
---|
| 465 | print_time(j); |
---|
| 466 | } |
---|
| 467 | atime[3][6] = atime[3][6] / 5.0; |
---|
[0a16d5f] | 468 | fprintf (stdout,"Average %11.2f\n", |
---|
[8783660] | 469 | (double)atime[3][6]); |
---|
| 470 | |
---|
[0a16d5f] | 471 | fprintf (stdout,"\nCalculating matgen2 overhead\n"); |
---|
[8783660] | 472 | |
---|
| 473 | /************************************************************************ |
---|
| 474 | * Calculate overhead of executing matgen procedure * |
---|
| 475 | ************************************************************************/ |
---|
| 476 | |
---|
| 477 | time1 = second(); |
---|
| 478 | for ( i = 0 ; i < loop ; i++) |
---|
| 479 | { |
---|
| 480 | matgen(aa,ldaa,n,b,&norma); |
---|
| 481 | } |
---|
| 482 | time2 = second(); |
---|
| 483 | overhead2 = (time2 - time1); |
---|
| 484 | overhead2 = overhead2 / (double)loop; |
---|
| 485 | |
---|
[0a16d5f] | 486 | fprintf (stdout,"Overhead for 1 matgen %12.5f seconds\n\n", overhead2); |
---|
| 487 | fprintf(stdout,"Times for array with leading dimension of%4d\n\n",ldaa); |
---|
| 488 | fprintf(stdout," dgefa dgesl total Mflops unit"); |
---|
| 489 | fprintf(stdout," ratio\n"); |
---|
[8783660] | 490 | |
---|
| 491 | /************************************************************************ |
---|
| 492 | * Execute 5 passes * |
---|
| 493 | ************************************************************************/ |
---|
| 494 | |
---|
| 495 | tm2 = ntimes * overhead2; |
---|
| 496 | atime[3][12] = 0; |
---|
| 497 | |
---|
| 498 | for (j=7 ; j<12 ; j++) |
---|
| 499 | { |
---|
| 500 | |
---|
| 501 | t1 = second(); |
---|
| 502 | |
---|
| 503 | for (i = 0; i < ntimes; i++) |
---|
| 504 | { |
---|
| 505 | matgen(aa,ldaa,n,b,&norma); |
---|
| 506 | dgefa(aa,ldaa,n,ipvt,&info ); |
---|
| 507 | } |
---|
| 508 | |
---|
| 509 | atime[0][j] = (second() - t1 - tm2)/ntimes; |
---|
| 510 | |
---|
| 511 | t1 = second(); |
---|
| 512 | |
---|
| 513 | for (i = 0; i < ntimes; i++) |
---|
| 514 | { |
---|
| 515 | dgesl(aa,ldaa,n,ipvt,b,0); |
---|
| 516 | } |
---|
| 517 | |
---|
| 518 | atime[1][j] = (second() - t1)/ntimes; |
---|
| 519 | total = atime[0][j] + atime[1][j]; |
---|
| 520 | atime[2][j] = total; |
---|
| 521 | atime[3][j] = ops/(1.0e6*total); |
---|
| 522 | atime[4][j] = 2.0/atime[3][j]; |
---|
| 523 | atime[5][j] = total/cray; |
---|
| 524 | atime[3][12] = atime[3][12] + atime[3][j]; |
---|
| 525 | |
---|
| 526 | print_time(j); |
---|
| 527 | } |
---|
| 528 | atime[3][12] = atime[3][12] / 5.0; |
---|
[0a16d5f] | 529 | fprintf (stdout,"Average %11.2f\n", |
---|
[8783660] | 530 | (double)atime[3][12]); |
---|
| 531 | |
---|
| 532 | /************************************************************************ |
---|
| 533 | * Use minimum average as overall Mflops rating * |
---|
| 534 | ************************************************************************/ |
---|
| 535 | |
---|
| 536 | mflops = atime[3][6]; |
---|
| 537 | if (atime[3][12] < mflops) mflops = atime[3][12]; |
---|
| 538 | |
---|
[0a16d5f] | 539 | fprintf(stdout,"\n"); |
---|
| 540 | fprintf(stdout,ROLLING);fprintf(stdout,PREC); |
---|
| 541 | fprintf(stdout," Precision %11.2f Mflops \n\n",mflops); |
---|
[e4633241] | 542 | #ifdef __rtems__ |
---|
| 543 | return 0; |
---|
| 544 | #endif |
---|
[8783660] | 545 | } |
---|
| 546 | |
---|
| 547 | /*----------------------*/ |
---|
| 548 | void print_time (int row) |
---|
| 549 | |
---|
| 550 | { |
---|
[0a16d5f] | 551 | fprintf(stdout,"%11.5f%11.5f%11.5f%11.2f%11.4f%11.4f\n", (double)atime[0][row], |
---|
[8783660] | 552 | (double)atime[1][row], (double)atime[2][row], (double)atime[3][row], |
---|
| 553 | (double)atime[4][row], (double)atime[5][row]); |
---|
| 554 | return; |
---|
| 555 | } |
---|
| 556 | |
---|
| 557 | /*----------------------*/ |
---|
| 558 | |
---|
| 559 | void matgen (REAL a[], int lda, int n, REAL b[], REAL *norma) |
---|
| 560 | |
---|
| 561 | |
---|
| 562 | /* We would like to declare a[][lda], but c does not allow it. In this |
---|
| 563 | function, references to a[i][j] are written a[lda*i+j]. */ |
---|
| 564 | |
---|
| 565 | { |
---|
| 566 | int init, i, j; |
---|
| 567 | |
---|
| 568 | init = 1325; |
---|
| 569 | *norma = 0.0; |
---|
| 570 | for (j = 0; j < n; j++) { |
---|
| 571 | for (i = 0; i < n; i++) { |
---|
| 572 | init = 3125*init % 65536; |
---|
| 573 | a[lda*j+i] = (init - 32768.0)/16384.0; |
---|
| 574 | *norma = (a[lda*j+i] > *norma) ? a[lda*j+i] : *norma; |
---|
| 575 | |
---|
| 576 | /* alternative for some compilers |
---|
| 577 | if (fabs(a[lda*j+i]) > *norma) *norma = fabs(a[lda*j+i]); |
---|
| 578 | */ |
---|
| 579 | } |
---|
| 580 | } |
---|
| 581 | for (i = 0; i < n; i++) { |
---|
| 582 | b[i] = 0.0; |
---|
| 583 | } |
---|
| 584 | for (j = 0; j < n; j++) { |
---|
| 585 | for (i = 0; i < n; i++) { |
---|
| 586 | b[i] = b[i] + a[lda*j+i]; |
---|
| 587 | } |
---|
| 588 | } |
---|
| 589 | return; |
---|
| 590 | } |
---|
| 591 | |
---|
| 592 | /*----------------------*/ |
---|
| 593 | void dgefa(REAL a[], int lda, int n, int ipvt[], int *info) |
---|
| 594 | |
---|
| 595 | |
---|
| 596 | /* We would like to declare a[][lda], but c does not allow it. In this |
---|
| 597 | function, references to a[i][j] are written a[lda*i+j]. */ |
---|
| 598 | /* |
---|
| 599 | dgefa factors a double precision matrix by gaussian elimination. |
---|
| 600 | |
---|
| 601 | dgefa is usually called by dgeco, but it can be called |
---|
| 602 | directly with a saving in time if rcond is not needed. |
---|
| 603 | (time for dgeco) = (1 + 9/n)*(time for dgefa) . |
---|
| 604 | |
---|
| 605 | on entry |
---|
| 606 | |
---|
| 607 | a REAL precision[n][lda] |
---|
| 608 | the matrix to be factored. |
---|
| 609 | |
---|
| 610 | lda integer |
---|
| 611 | the leading dimension of the array a . |
---|
| 612 | |
---|
| 613 | n integer |
---|
| 614 | the order of the matrix a . |
---|
| 615 | |
---|
| 616 | on return |
---|
| 617 | |
---|
| 618 | a an upper triangular matrix and the multipliers |
---|
| 619 | which were used to obtain it. |
---|
| 620 | the factorization can be written a = l*u where |
---|
| 621 | l is a product of permutation and unit lower |
---|
| 622 | triangular matrices and u is upper triangular. |
---|
| 623 | |
---|
| 624 | ipvt integer[n] |
---|
| 625 | an integer vector of pivot indices. |
---|
| 626 | |
---|
| 627 | info integer |
---|
| 628 | = 0 normal value. |
---|
| 629 | = k if u[k][k] .eq. 0.0 . this is not an error |
---|
| 630 | condition for this subroutine, but it does |
---|
| 631 | indicate that dgesl or dgedi will divide by zero |
---|
| 632 | if called. use rcond in dgeco for a reliable |
---|
| 633 | indication of singularity. |
---|
| 634 | |
---|
| 635 | linpack. this version dated 08/14/78 . |
---|
| 636 | cleve moler, university of new mexico, argonne national lab. |
---|
| 637 | |
---|
| 638 | functions |
---|
| 639 | |
---|
| 640 | blas daxpy,dscal,idamax |
---|
| 641 | */ |
---|
| 642 | |
---|
| 643 | { |
---|
| 644 | /* internal variables */ |
---|
| 645 | |
---|
| 646 | REAL t; |
---|
| 647 | int j,k,kp1,l,nm1; |
---|
| 648 | |
---|
| 649 | |
---|
| 650 | /* gaussian elimination with partial pivoting */ |
---|
| 651 | |
---|
| 652 | *info = 0; |
---|
| 653 | nm1 = n - 1; |
---|
| 654 | if (nm1 >= 0) { |
---|
| 655 | for (k = 0; k < nm1; k++) { |
---|
| 656 | kp1 = k + 1; |
---|
| 657 | |
---|
| 658 | /* find l = pivot index */ |
---|
| 659 | |
---|
| 660 | l = idamax(n-k,&a[lda*k+k],1) + k; |
---|
| 661 | ipvt[k] = l; |
---|
| 662 | |
---|
| 663 | /* zero pivot implies this column already |
---|
| 664 | triangularized */ |
---|
| 665 | |
---|
| 666 | if (a[lda*k+l] != ZERO) { |
---|
| 667 | |
---|
| 668 | /* interchange if necessary */ |
---|
| 669 | |
---|
| 670 | if (l != k) { |
---|
| 671 | t = a[lda*k+l]; |
---|
| 672 | a[lda*k+l] = a[lda*k+k]; |
---|
| 673 | a[lda*k+k] = t; |
---|
| 674 | } |
---|
| 675 | |
---|
| 676 | /* compute multipliers */ |
---|
| 677 | |
---|
| 678 | t = -ONE/a[lda*k+k]; |
---|
| 679 | dscal(n-(k+1),t,&a[lda*k+k+1],1); |
---|
| 680 | |
---|
| 681 | /* row elimination with column indexing */ |
---|
| 682 | |
---|
| 683 | for (j = kp1; j < n; j++) { |
---|
| 684 | t = a[lda*j+l]; |
---|
| 685 | if (l != k) { |
---|
| 686 | a[lda*j+l] = a[lda*j+k]; |
---|
| 687 | a[lda*j+k] = t; |
---|
| 688 | } |
---|
| 689 | daxpy(n-(k+1),t,&a[lda*k+k+1],1, |
---|
| 690 | &a[lda*j+k+1],1); |
---|
| 691 | } |
---|
| 692 | } |
---|
| 693 | else { |
---|
| 694 | *info = k; |
---|
| 695 | } |
---|
| 696 | } |
---|
| 697 | } |
---|
| 698 | ipvt[n-1] = n-1; |
---|
| 699 | if (a[lda*(n-1)+(n-1)] == ZERO) *info = n-1; |
---|
| 700 | return; |
---|
| 701 | } |
---|
| 702 | |
---|
| 703 | /*----------------------*/ |
---|
| 704 | |
---|
| 705 | void dgesl(REAL a[],int lda,int n,int ipvt[],REAL b[],int job ) |
---|
| 706 | |
---|
| 707 | |
---|
| 708 | /* We would like to declare a[][lda], but c does not allow it. In this |
---|
| 709 | function, references to a[i][j] are written a[lda*i+j]. */ |
---|
| 710 | |
---|
| 711 | /* |
---|
| 712 | dgesl solves the double precision system |
---|
| 713 | a * x = b or trans(a) * x = b |
---|
| 714 | using the factors computed by dgeco or dgefa. |
---|
| 715 | |
---|
| 716 | on entry |
---|
| 717 | |
---|
| 718 | a double precision[n][lda] |
---|
| 719 | the output from dgeco or dgefa. |
---|
| 720 | |
---|
| 721 | lda integer |
---|
| 722 | the leading dimension of the array a . |
---|
| 723 | |
---|
| 724 | n integer |
---|
| 725 | the order of the matrix a . |
---|
| 726 | |
---|
| 727 | ipvt integer[n] |
---|
| 728 | the pivot vector from dgeco or dgefa. |
---|
| 729 | |
---|
| 730 | b double precision[n] |
---|
| 731 | the right hand side vector. |
---|
| 732 | |
---|
| 733 | job integer |
---|
| 734 | = 0 to solve a*x = b , |
---|
| 735 | = nonzero to solve trans(a)*x = b where |
---|
| 736 | trans(a) is the transpose. |
---|
| 737 | |
---|
| 738 | on return |
---|
| 739 | |
---|
| 740 | b the solution vector x . |
---|
| 741 | |
---|
| 742 | error condition |
---|
| 743 | |
---|
| 744 | a division by zero will occur if the input factor contains a |
---|
| 745 | zero on the diagonal. technically this indicates singularity |
---|
| 746 | but it is often caused by improper arguments or improper |
---|
| 747 | setting of lda . it will not occur if the subroutines are |
---|
| 748 | called correctly and if dgeco has set rcond .gt. 0.0 |
---|
| 749 | or dgefa has set info .eq. 0 . |
---|
| 750 | |
---|
| 751 | to compute inverse(a) * c where c is a matrix |
---|
| 752 | with p columns |
---|
| 753 | dgeco(a,lda,n,ipvt,rcond,z) |
---|
| 754 | if (!rcond is too small){ |
---|
| 755 | for (j=0,j<p,j++) |
---|
| 756 | dgesl(a,lda,n,ipvt,c[j][0],0); |
---|
| 757 | } |
---|
| 758 | |
---|
| 759 | linpack. this version dated 08/14/78 . |
---|
| 760 | cleve moler, university of new mexico, argonne national lab. |
---|
| 761 | |
---|
| 762 | functions |
---|
| 763 | |
---|
| 764 | blas daxpy,ddot |
---|
| 765 | */ |
---|
| 766 | { |
---|
| 767 | /* internal variables */ |
---|
| 768 | |
---|
| 769 | REAL t; |
---|
| 770 | int k,kb,l,nm1; |
---|
| 771 | |
---|
| 772 | nm1 = n - 1; |
---|
| 773 | if (job == 0) { |
---|
| 774 | |
---|
| 775 | /* job = 0 , solve a * x = b |
---|
| 776 | first solve l*y = b */ |
---|
| 777 | |
---|
| 778 | if (nm1 >= 1) { |
---|
| 779 | for (k = 0; k < nm1; k++) { |
---|
| 780 | l = ipvt[k]; |
---|
| 781 | t = b[l]; |
---|
| 782 | if (l != k){ |
---|
| 783 | b[l] = b[k]; |
---|
| 784 | b[k] = t; |
---|
| 785 | } |
---|
| 786 | daxpy(n-(k+1),t,&a[lda*k+k+1],1,&b[k+1],1 ); |
---|
| 787 | } |
---|
| 788 | } |
---|
| 789 | |
---|
| 790 | /* now solve u*x = y */ |
---|
| 791 | |
---|
| 792 | for (kb = 0; kb < n; kb++) { |
---|
| 793 | k = n - (kb + 1); |
---|
| 794 | b[k] = b[k]/a[lda*k+k]; |
---|
| 795 | t = -b[k]; |
---|
| 796 | daxpy(k,t,&a[lda*k+0],1,&b[0],1 ); |
---|
| 797 | } |
---|
| 798 | } |
---|
| 799 | else { |
---|
| 800 | |
---|
| 801 | /* job = nonzero, solve trans(a) * x = b |
---|
| 802 | first solve trans(u)*y = b */ |
---|
| 803 | |
---|
| 804 | for (k = 0; k < n; k++) { |
---|
| 805 | t = ddot(k,&a[lda*k+0],1,&b[0],1); |
---|
| 806 | b[k] = (b[k] - t)/a[lda*k+k]; |
---|
| 807 | } |
---|
| 808 | |
---|
| 809 | /* now solve trans(l)*x = y */ |
---|
| 810 | |
---|
| 811 | if (nm1 >= 1) { |
---|
| 812 | for (kb = 1; kb < nm1; kb++) { |
---|
| 813 | k = n - (kb+1); |
---|
| 814 | b[k] = b[k] + ddot(n-(k+1),&a[lda*k+k+1],1,&b[k+1],1); |
---|
| 815 | l = ipvt[k]; |
---|
| 816 | if (l != k) { |
---|
| 817 | t = b[l]; |
---|
| 818 | b[l] = b[k]; |
---|
| 819 | b[k] = t; |
---|
| 820 | } |
---|
| 821 | } |
---|
| 822 | } |
---|
| 823 | } |
---|
| 824 | return; |
---|
| 825 | } |
---|
| 826 | |
---|
| 827 | /*----------------------*/ |
---|
| 828 | |
---|
| 829 | void daxpy(int n, REAL da, REAL dx[], int incx, REAL dy[], int incy) |
---|
| 830 | /* |
---|
| 831 | constant times a vector plus a vector. |
---|
| 832 | jack dongarra, linpack, 3/11/78. |
---|
| 833 | */ |
---|
| 834 | |
---|
| 835 | { |
---|
[0a16d5f] | 836 | int i,ix,iy; |
---|
[8783660] | 837 | |
---|
| 838 | if(n <= 0) return; |
---|
| 839 | if (da == ZERO) return; |
---|
| 840 | |
---|
| 841 | if(incx != 1 || incy != 1) { |
---|
| 842 | |
---|
| 843 | /* code for unequal increments or equal increments |
---|
| 844 | not equal to 1 */ |
---|
| 845 | |
---|
| 846 | ix = 0; |
---|
| 847 | iy = 0; |
---|
| 848 | if(incx < 0) ix = (-n+1)*incx; |
---|
| 849 | if(incy < 0)iy = (-n+1)*incy; |
---|
| 850 | for (i = 0;i < n; i++) { |
---|
| 851 | dy[iy] = dy[iy] + da*dx[ix]; |
---|
| 852 | ix = ix + incx; |
---|
| 853 | iy = iy + incy; |
---|
| 854 | |
---|
| 855 | } |
---|
| 856 | return; |
---|
| 857 | } |
---|
| 858 | |
---|
| 859 | /* code for both increments equal to 1 */ |
---|
| 860 | |
---|
| 861 | |
---|
| 862 | #ifdef ROLL |
---|
| 863 | |
---|
| 864 | for (i = 0;i < n; i++) { |
---|
| 865 | dy[i] = dy[i] + da*dx[i]; |
---|
| 866 | } |
---|
| 867 | |
---|
| 868 | |
---|
| 869 | #endif |
---|
| 870 | |
---|
| 871 | #ifdef UNROLL |
---|
| 872 | |
---|
| 873 | m = n % 4; |
---|
| 874 | if ( m != 0) { |
---|
| 875 | for (i = 0; i < m; i++) |
---|
| 876 | dy[i] = dy[i] + da*dx[i]; |
---|
| 877 | |
---|
| 878 | if (n < 4) return; |
---|
| 879 | } |
---|
| 880 | for (i = m; i < n; i = i + 4) { |
---|
| 881 | dy[i] = dy[i] + da*dx[i]; |
---|
| 882 | dy[i+1] = dy[i+1] + da*dx[i+1]; |
---|
| 883 | dy[i+2] = dy[i+2] + da*dx[i+2]; |
---|
| 884 | dy[i+3] = dy[i+3] + da*dx[i+3]; |
---|
| 885 | |
---|
| 886 | } |
---|
| 887 | |
---|
| 888 | #endif |
---|
| 889 | return; |
---|
| 890 | } |
---|
| 891 | |
---|
| 892 | /*----------------------*/ |
---|
| 893 | |
---|
| 894 | REAL ddot(int n, REAL dx[], int incx, REAL dy[], int incy) |
---|
| 895 | /* |
---|
| 896 | forms the dot product of two vectors. |
---|
| 897 | jack dongarra, linpack, 3/11/78. |
---|
| 898 | */ |
---|
| 899 | |
---|
| 900 | { |
---|
| 901 | REAL dtemp; |
---|
[0a16d5f] | 902 | int i,ix,iy; |
---|
[8783660] | 903 | |
---|
| 904 | dtemp = ZERO; |
---|
| 905 | |
---|
| 906 | if(n <= 0) return(ZERO); |
---|
| 907 | |
---|
| 908 | if(incx != 1 || incy != 1) { |
---|
| 909 | |
---|
| 910 | /* code for unequal increments or equal increments |
---|
| 911 | not equal to 1 */ |
---|
| 912 | |
---|
| 913 | ix = 0; |
---|
| 914 | iy = 0; |
---|
| 915 | if (incx < 0) ix = (-n+1)*incx; |
---|
| 916 | if (incy < 0) iy = (-n+1)*incy; |
---|
| 917 | for (i = 0;i < n; i++) { |
---|
| 918 | dtemp = dtemp + dx[ix]*dy[iy]; |
---|
| 919 | ix = ix + incx; |
---|
| 920 | iy = iy + incy; |
---|
| 921 | |
---|
| 922 | } |
---|
| 923 | return(dtemp); |
---|
| 924 | } |
---|
| 925 | |
---|
| 926 | /* code for both increments equal to 1 */ |
---|
| 927 | |
---|
| 928 | |
---|
| 929 | #ifdef ROLL |
---|
| 930 | |
---|
| 931 | for (i=0;i < n; i++) |
---|
| 932 | dtemp = dtemp + dx[i]*dy[i]; |
---|
| 933 | |
---|
| 934 | return(dtemp); |
---|
| 935 | |
---|
| 936 | #endif |
---|
| 937 | |
---|
| 938 | #ifdef UNROLL |
---|
| 939 | |
---|
| 940 | |
---|
| 941 | m = n % 5; |
---|
| 942 | if (m != 0) { |
---|
| 943 | for (i = 0; i < m; i++) |
---|
| 944 | dtemp = dtemp + dx[i]*dy[i]; |
---|
| 945 | if (n < 5) return(dtemp); |
---|
| 946 | } |
---|
| 947 | for (i = m; i < n; i = i + 5) { |
---|
| 948 | dtemp = dtemp + dx[i]*dy[i] + |
---|
| 949 | dx[i+1]*dy[i+1] + dx[i+2]*dy[i+2] + |
---|
| 950 | dx[i+3]*dy[i+3] + dx[i+4]*dy[i+4]; |
---|
| 951 | } |
---|
| 952 | return(dtemp); |
---|
| 953 | |
---|
| 954 | #endif |
---|
| 955 | |
---|
| 956 | } |
---|
| 957 | |
---|
| 958 | /*----------------------*/ |
---|
| 959 | void dscal(int n, REAL da, REAL dx[], int incx) |
---|
| 960 | |
---|
| 961 | /* scales a vector by a constant. |
---|
| 962 | jack dongarra, linpack, 3/11/78. |
---|
| 963 | */ |
---|
| 964 | |
---|
| 965 | { |
---|
[0a16d5f] | 966 | int i,nincx; |
---|
[8783660] | 967 | |
---|
| 968 | if(n <= 0)return; |
---|
| 969 | if(incx != 1) { |
---|
| 970 | |
---|
| 971 | /* code for increment not equal to 1 */ |
---|
| 972 | |
---|
| 973 | nincx = n*incx; |
---|
| 974 | for (i = 0; i < nincx; i = i + incx) |
---|
| 975 | dx[i] = da*dx[i]; |
---|
| 976 | |
---|
| 977 | return; |
---|
| 978 | } |
---|
| 979 | |
---|
| 980 | /* code for increment equal to 1 */ |
---|
| 981 | |
---|
| 982 | |
---|
| 983 | #ifdef ROLL |
---|
| 984 | |
---|
| 985 | for (i = 0; i < n; i++) |
---|
| 986 | dx[i] = da*dx[i]; |
---|
| 987 | |
---|
| 988 | |
---|
| 989 | #endif |
---|
| 990 | |
---|
| 991 | #ifdef UNROLL |
---|
| 992 | |
---|
| 993 | |
---|
| 994 | m = n % 5; |
---|
| 995 | if (m != 0) { |
---|
| 996 | for (i = 0; i < m; i++) |
---|
| 997 | dx[i] = da*dx[i]; |
---|
| 998 | if (n < 5) return; |
---|
| 999 | } |
---|
| 1000 | for (i = m; i < n; i = i + 5){ |
---|
| 1001 | dx[i] = da*dx[i]; |
---|
| 1002 | dx[i+1] = da*dx[i+1]; |
---|
| 1003 | dx[i+2] = da*dx[i+2]; |
---|
| 1004 | dx[i+3] = da*dx[i+3]; |
---|
| 1005 | dx[i+4] = da*dx[i+4]; |
---|
| 1006 | } |
---|
| 1007 | |
---|
| 1008 | #endif |
---|
| 1009 | |
---|
| 1010 | } |
---|
| 1011 | |
---|
| 1012 | /*----------------------*/ |
---|
| 1013 | int idamax(int n, REAL dx[], int incx) |
---|
| 1014 | |
---|
| 1015 | /* |
---|
| 1016 | finds the index of element having max. absolute value. |
---|
| 1017 | jack dongarra, linpack, 3/11/78. |
---|
| 1018 | */ |
---|
| 1019 | |
---|
| 1020 | |
---|
| 1021 | { |
---|
| 1022 | REAL dmax; |
---|
| 1023 | int i, ix, itemp; |
---|
| 1024 | |
---|
| 1025 | if( n < 1 ) return(-1); |
---|
| 1026 | if(n ==1 ) return(0); |
---|
[0a16d5f] | 1027 | itemp = -1; |
---|
[8783660] | 1028 | if(incx != 1) { |
---|
| 1029 | |
---|
| 1030 | /* code for increment not equal to 1 */ |
---|
| 1031 | |
---|
| 1032 | ix = 1; |
---|
| 1033 | dmax = fabs((double)dx[0]); |
---|
| 1034 | ix = ix + incx; |
---|
| 1035 | for (i = 1; i < n; i++) { |
---|
| 1036 | if(fabs((double)dx[ix]) > dmax) { |
---|
| 1037 | itemp = i; |
---|
| 1038 | dmax = fabs((double)dx[ix]); |
---|
| 1039 | } |
---|
| 1040 | ix = ix + incx; |
---|
| 1041 | } |
---|
| 1042 | } |
---|
| 1043 | else { |
---|
| 1044 | |
---|
| 1045 | /* code for increment equal to 1 */ |
---|
| 1046 | |
---|
| 1047 | itemp = 0; |
---|
| 1048 | dmax = fabs((double)dx[0]); |
---|
| 1049 | for (i = 1; i < n; i++) { |
---|
| 1050 | if(fabs((double)dx[i]) > dmax) { |
---|
| 1051 | itemp = i; |
---|
| 1052 | dmax = fabs((double)dx[i]); |
---|
| 1053 | } |
---|
| 1054 | } |
---|
| 1055 | } |
---|
| 1056 | return (itemp); |
---|
| 1057 | } |
---|
| 1058 | |
---|
| 1059 | /*----------------------*/ |
---|
| 1060 | REAL epslon (REAL x) |
---|
| 1061 | |
---|
| 1062 | /* |
---|
| 1063 | estimate unit roundoff in quantities of size x. |
---|
| 1064 | */ |
---|
| 1065 | |
---|
| 1066 | { |
---|
| 1067 | REAL a,b,c,eps; |
---|
| 1068 | /* |
---|
| 1069 | this program should function properly on all systems |
---|
| 1070 | satisfying the following two assumptions, |
---|
| 1071 | 1. the base used in representing dfloating point |
---|
| 1072 | numbers is not a power of three. |
---|
| 1073 | 2. the quantity a in statement 10 is represented to |
---|
| 1074 | the accuracy used in dfloating point variables |
---|
| 1075 | that are stored in memory. |
---|
| 1076 | the statement number 10 and the go to 10 are intended to |
---|
| 1077 | force optimizing compilers to generate code satisfying |
---|
| 1078 | assumption 2. |
---|
| 1079 | under these assumptions, it should be true that, |
---|
| 1080 | a is not exactly equal to four-thirds, |
---|
| 1081 | b has a zero for its last bit or digit, |
---|
| 1082 | c is not exactly equal to one, |
---|
| 1083 | eps measures the separation of 1.0 from |
---|
| 1084 | the next larger dfloating point number. |
---|
| 1085 | the developers of eispack would appreciate being informed |
---|
| 1086 | about any systems where these assumptions do not hold. |
---|
| 1087 | |
---|
| 1088 | ***************************************************************** |
---|
| 1089 | this routine is one of the auxiliary routines used by eispack iii |
---|
| 1090 | to avoid machine dependencies. |
---|
| 1091 | ***************************************************************** |
---|
| 1092 | |
---|
| 1093 | this version dated 4/6/83. |
---|
| 1094 | */ |
---|
| 1095 | |
---|
| 1096 | a = 4.0e0/3.0e0; |
---|
| 1097 | eps = ZERO; |
---|
| 1098 | while (eps == ZERO) { |
---|
| 1099 | b = a - ONE; |
---|
| 1100 | c = b + b + b; |
---|
| 1101 | eps = fabs((double)(c-ONE)); |
---|
| 1102 | } |
---|
| 1103 | return(eps*fabs((double)x)); |
---|
| 1104 | } |
---|
| 1105 | |
---|
| 1106 | /*----------------------*/ |
---|
| 1107 | void dmxpy (int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[]) |
---|
| 1108 | |
---|
| 1109 | |
---|
| 1110 | /* We would like to declare m[][ldm], but c does not allow it. In this |
---|
| 1111 | function, references to m[i][j] are written m[ldm*i+j]. */ |
---|
| 1112 | |
---|
| 1113 | /* |
---|
| 1114 | purpose: |
---|
| 1115 | multiply matrix m times vector x and add the result to vector y. |
---|
| 1116 | |
---|
| 1117 | parameters: |
---|
| 1118 | |
---|
| 1119 | n1 integer, number of elements in vector y, and number of rows in |
---|
| 1120 | matrix m |
---|
| 1121 | |
---|
| 1122 | y double [n1], vector of length n1 to which is added |
---|
| 1123 | the product m*x |
---|
| 1124 | |
---|
| 1125 | n2 integer, number of elements in vector x, and number of columns |
---|
| 1126 | in matrix m |
---|
| 1127 | |
---|
| 1128 | ldm integer, leading dimension of array m |
---|
| 1129 | |
---|
| 1130 | x double [n2], vector of length n2 |
---|
| 1131 | |
---|
| 1132 | m double [ldm][n2], matrix of n1 rows and n2 columns |
---|
| 1133 | |
---|
| 1134 | ---------------------------------------------------------------------- |
---|
| 1135 | */ |
---|
| 1136 | { |
---|
| 1137 | int j,i,jmin; |
---|
| 1138 | /* cleanup odd vector */ |
---|
| 1139 | |
---|
| 1140 | j = n2 % 2; |
---|
| 1141 | if (j >= 1) { |
---|
| 1142 | j = j - 1; |
---|
| 1143 | for (i = 0; i < n1; i++) |
---|
| 1144 | y[i] = (y[i]) + x[j]*m[ldm*j+i]; |
---|
| 1145 | } |
---|
| 1146 | |
---|
| 1147 | /* cleanup odd group of two vectors */ |
---|
| 1148 | |
---|
| 1149 | j = n2 % 4; |
---|
| 1150 | if (j >= 2) { |
---|
| 1151 | j = j - 1; |
---|
| 1152 | for (i = 0; i < n1; i++) |
---|
| 1153 | y[i] = ( (y[i]) |
---|
| 1154 | + x[j-1]*m[ldm*(j-1)+i]) + x[j]*m[ldm*j+i]; |
---|
| 1155 | } |
---|
| 1156 | |
---|
| 1157 | /* cleanup odd group of four vectors */ |
---|
| 1158 | |
---|
| 1159 | j = n2 % 8; |
---|
| 1160 | if (j >= 4) { |
---|
| 1161 | j = j - 1; |
---|
| 1162 | for (i = 0; i < n1; i++) |
---|
| 1163 | y[i] = ((( (y[i]) |
---|
| 1164 | + x[j-3]*m[ldm*(j-3)+i]) |
---|
| 1165 | + x[j-2]*m[ldm*(j-2)+i]) |
---|
| 1166 | + x[j-1]*m[ldm*(j-1)+i]) + x[j]*m[ldm*j+i]; |
---|
| 1167 | } |
---|
| 1168 | |
---|
| 1169 | /* cleanup odd group of eight vectors */ |
---|
| 1170 | |
---|
| 1171 | j = n2 % 16; |
---|
| 1172 | if (j >= 8) { |
---|
| 1173 | j = j - 1; |
---|
| 1174 | for (i = 0; i < n1; i++) |
---|
| 1175 | y[i] = ((((((( (y[i]) |
---|
| 1176 | + x[j-7]*m[ldm*(j-7)+i]) + x[j-6]*m[ldm*(j-6)+i]) |
---|
| 1177 | + x[j-5]*m[ldm*(j-5)+i]) + x[j-4]*m[ldm*(j-4)+i]) |
---|
| 1178 | + x[j-3]*m[ldm*(j-3)+i]) + x[j-2]*m[ldm*(j-2)+i]) |
---|
| 1179 | + x[j-1]*m[ldm*(j-1)+i]) + x[j] *m[ldm*j+i]; |
---|
| 1180 | } |
---|
| 1181 | |
---|
| 1182 | /* main loop - groups of sixteen vectors */ |
---|
| 1183 | |
---|
| 1184 | jmin = (n2%16)+16; |
---|
| 1185 | for (j = jmin-1; j < n2; j = j + 16) { |
---|
| 1186 | for (i = 0; i < n1; i++) |
---|
| 1187 | y[i] = ((((((((((((((( (y[i]) |
---|
| 1188 | + x[j-15]*m[ldm*(j-15)+i]) |
---|
| 1189 | + x[j-14]*m[ldm*(j-14)+i]) |
---|
| 1190 | + x[j-13]*m[ldm*(j-13)+i]) |
---|
| 1191 | + x[j-12]*m[ldm*(j-12)+i]) |
---|
| 1192 | + x[j-11]*m[ldm*(j-11)+i]) |
---|
| 1193 | + x[j-10]*m[ldm*(j-10)+i]) |
---|
| 1194 | + x[j- 9]*m[ldm*(j- 9)+i]) |
---|
| 1195 | + x[j- 8]*m[ldm*(j- 8)+i]) |
---|
| 1196 | + x[j- 7]*m[ldm*(j- 7)+i]) |
---|
| 1197 | + x[j- 6]*m[ldm*(j- 6)+i]) |
---|
| 1198 | + x[j- 5]*m[ldm*(j- 5)+i]) |
---|
| 1199 | + x[j- 4]*m[ldm*(j- 4)+i]) |
---|
| 1200 | + x[j- 3]*m[ldm*(j- 3)+i]) |
---|
| 1201 | + x[j- 2]*m[ldm*(j- 2)+i]) |
---|
| 1202 | + x[j- 1]*m[ldm*(j- 1)+i]) |
---|
| 1203 | + x[j] *m[ldm*j+i]; |
---|
| 1204 | } |
---|
| 1205 | return; |
---|
| 1206 | } |
---|
| 1207 | |
---|
| 1208 | /*----------------------*/ |
---|