1 | /* |
---|
2 | * Linpack 100x100 Benchmark In C/C++ For PCs |
---|
3 | * |
---|
4 | ******************************************************************** |
---|
5 | * |
---|
6 | * Original Source from NETLIB |
---|
7 | * |
---|
8 | * Translated to C by Bonnie Toy 5/88 (modified on 2/25/94 to fix |
---|
9 | * a problem with daxpy for unequal increments or equal increments |
---|
10 | * not equal to 1. Jack Dongarra) |
---|
11 | * |
---|
12 | * To obtain rolled source BLAS, add -DROLL to the command lines. |
---|
13 | * To obtain unrolled source BLAS, add -DUNROLL to the command lines. |
---|
14 | * |
---|
15 | * You must specify one of -DSP or -DDP to compile correctly. |
---|
16 | * |
---|
17 | * You must specify one of -DROLL or -DUNROLL to compile correctly. |
---|
18 | * |
---|
19 | ******************************************************************** |
---|
20 | * |
---|
21 | * Changes in this version |
---|
22 | * |
---|
23 | * 1. Function prototypes are declared and function headers have |
---|
24 | * embedded parameter types to produce code for C and C++ |
---|
25 | * |
---|
26 | * 2. Arrays aa and a are declared as [200*200] and [200*201] to |
---|
27 | * allow compilation with prototypes. |
---|
28 | * |
---|
29 | * 3. Function second changed (compiler dependent). |
---|
30 | * |
---|
31 | * 4. Timing method changed due to inaccuracy of PC clock (see below). |
---|
32 | * |
---|
33 | * 5. Additional date function included (compiler dependent). |
---|
34 | * |
---|
35 | * 6. Additional code used as a standard for a series of benchmarks:- |
---|
36 | * Automatic run time calibration rather than fixed parameters |
---|
37 | * Initial calibration with display to show linearity |
---|
38 | * Results displayed at reasonable rate for viewing (5 seconds) |
---|
39 | * Facilities for typing in details of system used etc. |
---|
40 | * Compiler details in code in case .exe files used elsewhere |
---|
41 | * Results appended to a text file (Linpack.txt) |
---|
42 | * |
---|
43 | * Roy Longbottom 101323.2241@compuserve.com 14 September 1996 |
---|
44 | * |
---|
45 | ************************************************************************ |
---|
46 | * |
---|
47 | * Timing |
---|
48 | * |
---|
49 | * The PC timer is updated at about 18 times per second or resolution of |
---|
50 | * 0.05 to 0.06 seconds which is similar to the time taken by the main |
---|
51 | * time consuming function dgefa on a 100 MHz Pentium. Thus there is no |
---|
52 | * point in running the dgefa/dges1 combination three times as in the |
---|
53 | * original version. Main timing for the latter, in the loop run NTIMES, |
---|
54 | * executes matgen/dgefa, summing the time taken by matgen within the |
---|
55 | * loop for later deduction from the total time. On a modern PC this sum |
---|
56 | * can be based on a random selection of 0 or 0.05/0.06. This version |
---|
57 | * executes the single pass once and the main timing loop five times, |
---|
58 | * calculating the matgen overhead separately. |
---|
59 | * |
---|
60 | ************************************************************************* |
---|
61 | * |
---|
62 | * Example of Output |
---|
63 | * |
---|
64 | * Rolled Double Precision Linpack Benchmark - PC Version in 'C/C++' |
---|
65 | * |
---|
66 | * Compiler Watcom C/C++ 10.5 Win 386 |
---|
67 | * Optimisation -zp4 -otexan -fp5 -5r -dDP -dROLL |
---|
68 | * |
---|
69 | * |
---|
70 | * norm resid resid machep x[0]-1 x[n-1]-1 |
---|
71 | * 0.4 7.41628980e-014 1.00000000e-015 -1.49880108e-014 -1.89848137e-014 |
---|
72 | * |
---|
73 | * |
---|
74 | * Times are reported for matrices of order 100 |
---|
75 | * 1 pass times for array with leading dimension of 201 |
---|
76 | * |
---|
77 | * dgefa dgesl total Mflops unit ratio |
---|
78 | * 0.06000 0.00000 0.06000 11.44 0.1748 1.0714 |
---|
79 | * |
---|
80 | * |
---|
81 | * Calculating matgen overhead |
---|
82 | * |
---|
83 | * 10 times 0.11 seconds |
---|
84 | * 20 times 0.22 seconds |
---|
85 | * 40 times 0.44 seconds |
---|
86 | * 80 times 0.87 seconds |
---|
87 | * 160 times 1.76 seconds |
---|
88 | * 320 times 3.52 seconds |
---|
89 | * 640 times 7.03 seconds |
---|
90 | * |
---|
91 | * Overhead for 1 matgen 0.01098 seconds |
---|
92 | * |
---|
93 | * |
---|
94 | * Calculating matgen/dgefa passes for 5 seconds |
---|
95 | * |
---|
96 | * 10 times 0.71 seconds |
---|
97 | * 20 times 1.38 seconds |
---|
98 | * 40 times 2.80 seconds |
---|
99 | * 80 times 5.66 seconds |
---|
100 | * |
---|
101 | * Passes used 70 |
---|
102 | * |
---|
103 | * This is followed by output of the normal data for dgefa, dges1, |
---|
104 | * total, Mflops, unit and ratio with five sets of results for each. |
---|
105 | * |
---|
106 | ************************************************************************ |
---|
107 | * |
---|
108 | * Example from output file Linpack.txt |
---|
109 | * |
---|
110 | * LINPACK BENCHMARK FOR PCs 'C/C++' n @ 100 |
---|
111 | * |
---|
112 | * Month run 9/1996 |
---|
113 | * PC model Escom |
---|
114 | * CPU Pentium |
---|
115 | * Clock MHz 100 |
---|
116 | * Cache 256K |
---|
117 | * Options Neptune chipset |
---|
118 | * OS/DOS Windows 95 |
---|
119 | * Compiler Watcom C/C++ 10.5 Win 386 |
---|
120 | * OptLevel -zp4 -otexan -fp5 -5r -dDP -dROLL |
---|
121 | * Run by Roy Longbottom |
---|
122 | * From UK |
---|
123 | * Mail 101323.2241@compuserve.com |
---|
124 | * |
---|
125 | * Rolling Rolled |
---|
126 | * Precision Double |
---|
127 | * norm. resid 0.4 |
---|
128 | * resid 7.41628980e-014 |
---|
129 | * machep 1.00000000e-015 (8.88178420e-016 NON OPT) |
---|
130 | * x[0]-1 -1.49880108e-014 |
---|
131 | * x[n-1]-1 -1.89848137e-014 |
---|
132 | * matgen 1 seconds 0.01051 |
---|
133 | * matgen 2 seconds 0.01050 |
---|
134 | * Repetitions 70 |
---|
135 | * Leading dimension 201 |
---|
136 | * dgefa dgesl total Mflops |
---|
137 | * 1 pass seconds 0.06000 0.00000 0.06000 |
---|
138 | * Repeat seconds 0.06092 0.00157 0.06249 10.99 |
---|
139 | * Repeat seconds 0.06077 0.00157 0.06234 11.01 |
---|
140 | * Repeat seconds 0.06092 0.00157 0.06249 10.99 |
---|
141 | * Repeat seconds 0.06092 0.00157 0.06249 10.99 |
---|
142 | * Repeat seconds 0.06092 0.00157 0.06249 10.99 |
---|
143 | * Average 10.99 |
---|
144 | * Leading dimension 200 |
---|
145 | * Repeat seconds 0.05936 0.00157 0.06093 11.27 |
---|
146 | * Repeat seconds 0.05936 0.00157 0.06093 11.27 |
---|
147 | * Repeat seconds 0.05864 0.00157 0.06021 11.40 |
---|
148 | * Repeat seconds 0.05936 0.00157 0.06093 11.27 |
---|
149 | * Repeat seconds 0.05864 0.00157 0.06021 11.40 |
---|
150 | * Average 11.32 |
---|
151 | * |
---|
152 | ************************************************************************ |
---|
153 | * |
---|
154 | * Examples of Results |
---|
155 | * |
---|
156 | * Precompiled codes were produced via a Watcom C/C++ 10.5 compiler. |
---|
157 | * Versions are available for DOS, Windows 3/95 and NT/Win 95. Both |
---|
158 | * non-optimised and optimised programs are available. The latter has |
---|
159 | * options as in the above example. Although these options can place |
---|
160 | * functions in-line, in this case, daxpy is not in-lined. Optimisation |
---|
161 | * reduces 18 instructions in the loop in this function to the following: |
---|
162 | * |
---|
163 | * L85 fld st(0) |
---|
164 | * fmul qword ptr [edx] |
---|
165 | * add eax,00000008H |
---|
166 | * add edx,00000008H |
---|
167 | * fadd qword ptr -8H[eax] |
---|
168 | * inc ebx |
---|
169 | * fstp qword ptr -8H[eax] |
---|
170 | * cmp ebx,esi |
---|
171 | * jl L85 |
---|
172 | * |
---|
173 | * Results produced are not consistent between runs but produce similar |
---|
174 | * speeds when executing at a particular dimension (see above). An example |
---|
175 | * of other results is 11.4/10.5 Mflops. Most typical double precision |
---|
176 | * rolled results are: |
---|
177 | * |
---|
178 | * Opt No Opt Version/ |
---|
179 | * MHz Cache Mflops Mflops Make/Options Via |
---|
180 | * |
---|
181 | * AM80386DX 40 128K 0.53 0.36 Clone Win/W95 |
---|
182 | * 80486DX2 66 128K 2.5 1.9 Escom SIS chipset Win/W95 |
---|
183 | * 80486DX2 66 128K 2.3 1.9 Escom SIS chipset NT/W95 |
---|
184 | * 80486DX2 66 128K 2.8 2.0 Escom SIS chipset Dos/Dos |
---|
185 | * Pentium 100 256K 11 4.2 Escom Neptune chipset Win/W95 |
---|
186 | * Pentium 100 256K 11 5.5 Escom Neptune chipset NT/W95 |
---|
187 | * Pentium 100 256K 12 4.4 Escom Neptune chipset Dos/Dos |
---|
188 | * Pentium Pro 200 256K 48 19 Dell XPS Pro200n NT/NT |
---|
189 | * |
---|
190 | * The results are as produced when compiled as Linpack.cpp. Compiling as |
---|
191 | * Linpack.c gives similar speeds but the code is a little different. |
---|
192 | * |
---|
193 | *************************************************************************** |
---|
194 | */ |
---|
195 | |
---|
196 | |
---|
197 | #ifdef SP |
---|
198 | #define REAL float |
---|
199 | #define ZERO 0.0 |
---|
200 | #define ONE 1.0 |
---|
201 | #define PREC "Single " |
---|
202 | #endif |
---|
203 | |
---|
204 | #ifdef DP |
---|
205 | #define REAL double |
---|
206 | #define ZERO 0.0e0 |
---|
207 | #define ONE 1.0e0 |
---|
208 | #define PREC "Double " |
---|
209 | #endif |
---|
210 | |
---|
211 | #ifdef ROLL |
---|
212 | #define ROLLING "Rolled " |
---|
213 | #endif |
---|
214 | #ifdef UNROLL |
---|
215 | #define ROLLING "Unrolled " |
---|
216 | #endif |
---|
217 | |
---|
218 | |
---|
219 | #define NTIMES 10 |
---|
220 | |
---|
221 | #include <stdio.h> |
---|
222 | #include <math.h> |
---|
223 | #include <conio.h> |
---|
224 | #include <stdlib.h> |
---|
225 | |
---|
226 | |
---|
227 | static REAL atime[9][15]; |
---|
228 | static char this_month; |
---|
229 | static int this_year; |
---|
230 | |
---|
231 | void print_time (int row); |
---|
232 | void matgen (REAL a[], int lda, int n, REAL b[], REAL *norma); |
---|
233 | void dgefa (REAL a[], int lda, int n, int ipvt[], int *info); |
---|
234 | void dgesl (REAL a[],int lda,int n,int ipvt[],REAL b[],int job); |
---|
235 | void dmxpy (int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[]); |
---|
236 | void daxpy (int n, REAL da, REAL dx[], int incx, REAL dy[], int incy); |
---|
237 | REAL epslon (REAL x); |
---|
238 | int idamax (int n, REAL dx[], int incx); |
---|
239 | void dscal (int n, REAL da, REAL dx[], int incx); |
---|
240 | REAL ddot (int n, REAL dx[], int incx, REAL dy[], int incy); |
---|
241 | |
---|
242 | /* TIME TIME TIME TIME TIME TIME TIME TIME TIME TIME TIME TIME TIME */ |
---|
243 | #include <time.h> /* for following time functions only */ |
---|
244 | REAL second() |
---|
245 | { |
---|
246 | REAL secs; |
---|
247 | clock_t Time; |
---|
248 | Time = clock(); |
---|
249 | secs = (REAL)Time / (REAL)CLOCKS_PER_SEC; |
---|
250 | return secs ; |
---|
251 | } |
---|
252 | |
---|
253 | /* DATE DATE DATE DATE DATE DATE DATE DATE DATE DATE DATE DATE DATE */ |
---|
254 | #include <dos.h> /* for following date functions only */ |
---|
255 | void what_date() |
---|
256 | { |
---|
257 | /* Watcom */ |
---|
258 | struct dosdate_t adate; |
---|
259 | _dos_getdate( &adate ); |
---|
260 | this_month = adate.month; |
---|
261 | this_year = adate.year; |
---|
262 | |
---|
263 | /* Borland |
---|
264 | struct date adate; |
---|
265 | getdate( &adate ); |
---|
266 | this_month = adate.da_mon; |
---|
267 | this_year = adate.da_year; |
---|
268 | */ |
---|
269 | return; |
---|
270 | } |
---|
271 | |
---|
272 | |
---|
273 | main () |
---|
274 | { |
---|
275 | static REAL aa[200*200],a[200*201],b[200],x[200]; |
---|
276 | REAL cray,ops,total,norma,normx; |
---|
277 | REAL resid,residn,eps,t1,tm2,epsn,x1,x2; |
---|
278 | REAL mflops; |
---|
279 | static int ipvt[200],n,i,j,ntimes,info,lda,ldaa; |
---|
280 | int Endit, pass, loop; |
---|
281 | REAL overhead1, overhead2, time1, time2; |
---|
282 | FILE *outfile; |
---|
283 | char *compiler, *options, general[9][80] = {" "}; |
---|
284 | |
---|
285 | outfile = fopen("Linpack.txt","a+"); |
---|
286 | if (outfile == NULL) |
---|
287 | { |
---|
288 | printf ("Cannot open results file \n\n"); |
---|
289 | printf("Press any key\n"); |
---|
290 | Endit = getch(); |
---|
291 | exit (0); |
---|
292 | } |
---|
293 | |
---|
294 | /************************************************************************ |
---|
295 | * Enter details of compiler and options used * |
---|
296 | ************************************************************************/ |
---|
297 | /*----------------- --------- --------- ---------*/ |
---|
298 | compiler = "INSERT COMPILER NAME HERE"; |
---|
299 | options = "INSERT OPTIMISATION OPTIONS HERE"; |
---|
300 | /* Include -dDP or -dSP and -dROLL or -dUNROLL */ |
---|
301 | |
---|
302 | lda = 201; |
---|
303 | ldaa = 200; |
---|
304 | cray = .056; |
---|
305 | n = 100; |
---|
306 | |
---|
307 | fprintf(stdout,ROLLING);fprintf(stdout,PREC); |
---|
308 | fprintf(stdout,"Precision Linpack Benchmark - PC Version in 'C/C++'\n\n"); |
---|
309 | fprintf(stdout,"Compiler %s\n",compiler); |
---|
310 | fprintf(stdout,"Optimisation %s\n\n",options); |
---|
311 | |
---|
312 | ops = (2.0e0*(n*n*n))/3.0 + 2.0*(n*n); |
---|
313 | |
---|
314 | matgen(a,lda,n,b,&norma); |
---|
315 | t1 = second(); |
---|
316 | dgefa(a,lda,n,ipvt,&info); |
---|
317 | atime[0][0] = second() - t1; |
---|
318 | t1 = second(); |
---|
319 | dgesl(a,lda,n,ipvt,b,0); |
---|
320 | atime[1][0] = second() - t1; |
---|
321 | total = atime[0][0] + atime[1][0]; |
---|
322 | |
---|
323 | /* compute a residual to verify results. */ |
---|
324 | |
---|
325 | for (i = 0; i < n; i++) { |
---|
326 | x[i] = b[i]; |
---|
327 | } |
---|
328 | matgen(a,lda,n,b,&norma); |
---|
329 | for (i = 0; i < n; i++) { |
---|
330 | b[i] = -b[i]; |
---|
331 | } |
---|
332 | dmxpy(n,b,n,lda,x,a); |
---|
333 | resid = 0.0; |
---|
334 | normx = 0.0; |
---|
335 | for (i = 0; i < n; i++) { |
---|
336 | resid = (resid > fabs((double)b[i])) |
---|
337 | ? resid : fabs((double)b[i]); |
---|
338 | normx = (normx > fabs((double)x[i])) |
---|
339 | ? normx : fabs((double)x[i]); |
---|
340 | } |
---|
341 | eps = epslon(ONE); |
---|
342 | residn = resid/( n*norma*normx*eps ); |
---|
343 | epsn = eps; |
---|
344 | x1 = x[0] - 1; |
---|
345 | x2 = x[n-1] - 1; |
---|
346 | |
---|
347 | printf("norm resid resid machep"); |
---|
348 | printf(" x[0]-1 x[n-1]-1\n"); |
---|
349 | printf("%6.1f %17.8e%17.8e%17.8e%17.8e\n\n", |
---|
350 | (double)residn, (double)resid, (double)epsn, |
---|
351 | (double)x1, (double)x2); |
---|
352 | |
---|
353 | fprintf(stderr,"Times are reported for matrices of order %5d\n",n); |
---|
354 | fprintf(stderr,"1 pass times for array with leading dimension of%5d\n\n",lda); |
---|
355 | fprintf(stderr," dgefa dgesl total Mflops unit"); |
---|
356 | fprintf(stderr," ratio\n"); |
---|
357 | |
---|
358 | atime[2][0] = total; |
---|
359 | if (total > 0.0) |
---|
360 | { |
---|
361 | atime[3][0] = ops/(1.0e6*total); |
---|
362 | atime[4][0] = 2.0/atime[3][0]; |
---|
363 | } |
---|
364 | else |
---|
365 | { |
---|
366 | atime[3][0] = 0.0; |
---|
367 | atime[4][0] = 0.0; |
---|
368 | } |
---|
369 | atime[5][0] = total/cray; |
---|
370 | |
---|
371 | print_time(0); |
---|
372 | |
---|
373 | /************************************************************************ |
---|
374 | * Calculate overhead of executing matgen procedure * |
---|
375 | ************************************************************************/ |
---|
376 | |
---|
377 | fprintf (stderr,"\nCalculating matgen overhead\n"); |
---|
378 | pass = -20; |
---|
379 | loop = NTIMES; |
---|
380 | do |
---|
381 | { |
---|
382 | time1 = second(); |
---|
383 | pass = pass + 1; |
---|
384 | for ( i = 0 ; i < loop ; i++) |
---|
385 | { |
---|
386 | matgen(a,lda,n,b,&norma); |
---|
387 | } |
---|
388 | time2 = second(); |
---|
389 | overhead1 = (time2 - time1); |
---|
390 | fprintf (stderr,"%10d times %6.2f seconds\n", loop, overhead1); |
---|
391 | if (overhead1 > 5.0) |
---|
392 | { |
---|
393 | pass = 0; |
---|
394 | } |
---|
395 | if (pass < 0) |
---|
396 | { |
---|
397 | if (overhead1 < 0.1) |
---|
398 | { |
---|
399 | loop = loop * 10; |
---|
400 | } |
---|
401 | else |
---|
402 | { |
---|
403 | loop = loop * 2; |
---|
404 | } |
---|
405 | } |
---|
406 | } |
---|
407 | while (pass < 0); |
---|
408 | |
---|
409 | overhead1 = overhead1 / (double)loop; |
---|
410 | |
---|
411 | fprintf (stderr,"Overhead for 1 matgen %12.5f seconds\n\n", overhead1); |
---|
412 | |
---|
413 | /************************************************************************ |
---|
414 | * Calculate matgen/dgefa passes for 5 seconds * |
---|
415 | ************************************************************************/ |
---|
416 | |
---|
417 | fprintf (stderr,"Calculating matgen/dgefa passes for 5 seconds\n"); |
---|
418 | pass = -20; |
---|
419 | ntimes = NTIMES; |
---|
420 | do |
---|
421 | { |
---|
422 | time1 = second(); |
---|
423 | pass = pass + 1; |
---|
424 | for ( i = 0 ; i < ntimes ; i++) |
---|
425 | { |
---|
426 | matgen(a,lda,n,b,&norma); |
---|
427 | dgefa(a,lda,n,ipvt,&info ); |
---|
428 | } |
---|
429 | time2 = second() - time1; |
---|
430 | fprintf (stderr,"%10d times %6.2f seconds\n", ntimes, time2); |
---|
431 | if (time2 > 5.0) |
---|
432 | { |
---|
433 | pass = 0; |
---|
434 | } |
---|
435 | if (pass < 0) |
---|
436 | { |
---|
437 | if (time2 < 0.1) |
---|
438 | { |
---|
439 | ntimes = ntimes * 10; |
---|
440 | } |
---|
441 | else |
---|
442 | { |
---|
443 | ntimes = ntimes * 2; |
---|
444 | } |
---|
445 | } |
---|
446 | } |
---|
447 | while (pass < 0); |
---|
448 | |
---|
449 | ntimes = 5.0 * (double)ntimes / time2; |
---|
450 | if (ntimes == 0) ntimes = 1; |
---|
451 | |
---|
452 | fprintf (stderr,"Passes used %10d \n\n", ntimes); |
---|
453 | fprintf(stderr,"Times for array with leading dimension of%4d\n\n",lda); |
---|
454 | fprintf(stderr," dgefa dgesl total Mflops unit"); |
---|
455 | fprintf(stderr," ratio\n"); |
---|
456 | |
---|
457 | /************************************************************************ |
---|
458 | * Execute 5 passes * |
---|
459 | ************************************************************************/ |
---|
460 | |
---|
461 | tm2 = ntimes * overhead1; |
---|
462 | atime[3][6] = 0; |
---|
463 | |
---|
464 | for (j=1 ; j<6 ; j++) |
---|
465 | { |
---|
466 | |
---|
467 | t1 = second(); |
---|
468 | |
---|
469 | for (i = 0; i < ntimes; i++) |
---|
470 | { |
---|
471 | matgen(a,lda,n,b,&norma); |
---|
472 | dgefa(a,lda,n,ipvt,&info ); |
---|
473 | } |
---|
474 | |
---|
475 | atime[0][j] = (second() - t1 - tm2)/ntimes; |
---|
476 | |
---|
477 | t1 = second(); |
---|
478 | |
---|
479 | for (i = 0; i < ntimes; i++) |
---|
480 | { |
---|
481 | dgesl(a,lda,n,ipvt,b,0); |
---|
482 | } |
---|
483 | |
---|
484 | atime[1][j] = (second() - t1)/ntimes; |
---|
485 | total = atime[0][j] + atime[1][j]; |
---|
486 | atime[2][j] = total; |
---|
487 | atime[3][j] = ops/(1.0e6*total); |
---|
488 | atime[4][j] = 2.0/atime[3][j]; |
---|
489 | atime[5][j] = total/cray; |
---|
490 | atime[3][6] = atime[3][6] + atime[3][j]; |
---|
491 | |
---|
492 | print_time(j); |
---|
493 | } |
---|
494 | atime[3][6] = atime[3][6] / 5.0; |
---|
495 | fprintf (stderr,"Average %11.2f\n", |
---|
496 | (double)atime[3][6]); |
---|
497 | |
---|
498 | fprintf (stderr,"\nCalculating matgen2 overhead\n"); |
---|
499 | |
---|
500 | /************************************************************************ |
---|
501 | * Calculate overhead of executing matgen procedure * |
---|
502 | ************************************************************************/ |
---|
503 | |
---|
504 | time1 = second(); |
---|
505 | for ( i = 0 ; i < loop ; i++) |
---|
506 | { |
---|
507 | matgen(aa,ldaa,n,b,&norma); |
---|
508 | } |
---|
509 | time2 = second(); |
---|
510 | overhead2 = (time2 - time1); |
---|
511 | overhead2 = overhead2 / (double)loop; |
---|
512 | |
---|
513 | fprintf (stderr,"Overhead for 1 matgen %12.5f seconds\n\n", overhead2); |
---|
514 | fprintf(stderr,"Times for array with leading dimension of%4d\n\n",ldaa); |
---|
515 | fprintf(stderr," dgefa dgesl total Mflops unit"); |
---|
516 | fprintf(stderr," ratio\n"); |
---|
517 | |
---|
518 | /************************************************************************ |
---|
519 | * Execute 5 passes * |
---|
520 | ************************************************************************/ |
---|
521 | |
---|
522 | tm2 = ntimes * overhead2; |
---|
523 | atime[3][12] = 0; |
---|
524 | |
---|
525 | for (j=7 ; j<12 ; j++) |
---|
526 | { |
---|
527 | |
---|
528 | t1 = second(); |
---|
529 | |
---|
530 | for (i = 0; i < ntimes; i++) |
---|
531 | { |
---|
532 | matgen(aa,ldaa,n,b,&norma); |
---|
533 | dgefa(aa,ldaa,n,ipvt,&info ); |
---|
534 | } |
---|
535 | |
---|
536 | atime[0][j] = (second() - t1 - tm2)/ntimes; |
---|
537 | |
---|
538 | t1 = second(); |
---|
539 | |
---|
540 | for (i = 0; i < ntimes; i++) |
---|
541 | { |
---|
542 | dgesl(aa,ldaa,n,ipvt,b,0); |
---|
543 | } |
---|
544 | |
---|
545 | atime[1][j] = (second() - t1)/ntimes; |
---|
546 | total = atime[0][j] + atime[1][j]; |
---|
547 | atime[2][j] = total; |
---|
548 | atime[3][j] = ops/(1.0e6*total); |
---|
549 | atime[4][j] = 2.0/atime[3][j]; |
---|
550 | atime[5][j] = total/cray; |
---|
551 | atime[3][12] = atime[3][12] + atime[3][j]; |
---|
552 | |
---|
553 | print_time(j); |
---|
554 | } |
---|
555 | atime[3][12] = atime[3][12] / 5.0; |
---|
556 | fprintf (stderr,"Average %11.2f\n", |
---|
557 | (double)atime[3][12]); |
---|
558 | |
---|
559 | /************************************************************************ |
---|
560 | * Use minimum average as overall Mflops rating * |
---|
561 | ************************************************************************/ |
---|
562 | |
---|
563 | mflops = atime[3][6]; |
---|
564 | if (atime[3][12] < mflops) mflops = atime[3][12]; |
---|
565 | |
---|
566 | fprintf(stderr,"\n"); |
---|
567 | fprintf(stderr,ROLLING);fprintf(stderr,PREC); |
---|
568 | fprintf(stderr," Precision %11.2f Mflops \n\n",mflops); |
---|
569 | |
---|
570 | what_date(); |
---|
571 | |
---|
572 | /************************************************************************ |
---|
573 | * Type details of hardware, software etc. * |
---|
574 | ************************************************************************/ |
---|
575 | |
---|
576 | printf ("Enter the following data which will be " |
---|
577 | "appended to file Linpack.txt \n\n"); |
---|
578 | printf ("PC Supplier/model ?\n "); |
---|
579 | scanf ("%[^\n]", general[1]); |
---|
580 | fflush (stdin); |
---|
581 | printf ("CPU ?\n "); |
---|
582 | scanf ("%[^\n]", general[2]); |
---|
583 | fflush (stdin); |
---|
584 | printf ("Clock MHz ?\n "); |
---|
585 | scanf ("%[^\n]", general[3]); |
---|
586 | fflush (stdin); |
---|
587 | printf ("Cache ?\n "); |
---|
588 | scanf ("%[^\n]", general[4]); |
---|
589 | fflush (stdin); |
---|
590 | printf ("Chipset/options ?\n "); |
---|
591 | scanf ("%[^\n]", general[5]); |
---|
592 | fflush (stdin); |
---|
593 | printf ("OS/DOS version ?\n "); |
---|
594 | scanf ("%[^\n]", general[6]); |
---|
595 | fflush (stdin); |
---|
596 | printf ("Your name ?\n "); |
---|
597 | scanf ("%[^\n]", general[7]); |
---|
598 | fflush (stdin); |
---|
599 | printf ("Where from ?\n "); |
---|
600 | scanf ("%[^\n]", general[8]); |
---|
601 | fflush (stdin); |
---|
602 | printf ("Mail address ?\n "); |
---|
603 | scanf ("%[^\n]", general[0]); |
---|
604 | fflush (stdin); |
---|
605 | |
---|
606 | /************************************************************************ |
---|
607 | * Add results to output file LLloops.txt * |
---|
608 | ************************************************************************/ |
---|
609 | |
---|
610 | fprintf (outfile, "----------------- ----------------- --------- " |
---|
611 | "--------- ---------\n"); |
---|
612 | fprintf (outfile, "LINPACK BENCHMARK FOR PCs 'C/C++' n @ 100\n\n"); |
---|
613 | fprintf (outfile, "Month run %d/%d\n", this_month, this_year); |
---|
614 | fprintf (outfile, "PC model %s\n", general[1]); |
---|
615 | fprintf (outfile, "CPU %s\n", general[2]); |
---|
616 | fprintf (outfile, "Clock MHz %s\n", general[3]); |
---|
617 | fprintf (outfile, "Cache %s\n", general[4]); |
---|
618 | fprintf (outfile, "Options %s\n", general[5]); |
---|
619 | fprintf (outfile, "OS/DOS %s\n", general[6]); |
---|
620 | fprintf (outfile, "Compiler %s\n", compiler); |
---|
621 | fprintf (outfile, "OptLevel %s\n", options); |
---|
622 | fprintf (outfile, "Run by %s\n", general[7]); |
---|
623 | fprintf (outfile, "From %s\n", general[8]); |
---|
624 | fprintf (outfile, "Mail %s\n\n", general[0]); |
---|
625 | |
---|
626 | fprintf(outfile, "Rolling %s\n",ROLLING); |
---|
627 | fprintf(outfile, "Precision %s\n",PREC); |
---|
628 | fprintf(outfile, "norm. resid %16.1f\n",(double)residn); |
---|
629 | fprintf(outfile, "resid %16.8e\n",(double)resid); |
---|
630 | fprintf(outfile, "machep %16.8e\n",(double)epsn); |
---|
631 | fprintf(outfile, "x[0]-1 %16.8e\n",(double)x1); |
---|
632 | fprintf(outfile, "x[n-1]-1 %16.8e\n",(double)x2); |
---|
633 | fprintf(outfile, "matgen 1 seconds %16.5f\n",overhead1); |
---|
634 | fprintf(outfile, "matgen 2 seconds %16.5f\n",overhead2); |
---|
635 | fprintf(outfile, "Repetitions %16d\n",ntimes); |
---|
636 | fprintf(outfile, "Leading dimension %16d\n",lda); |
---|
637 | fprintf(outfile, " dgefa dgesl " |
---|
638 | " total Mflops\n"); |
---|
639 | fprintf(outfile, "1 pass seconds %16.5f %9.5f %9.5f\n", |
---|
640 | atime[0][0], atime[1][0], atime[2][0]); |
---|
641 | |
---|
642 | for (i=1 ; i<6 ; i++) |
---|
643 | { |
---|
644 | fprintf(outfile, "Repeat seconds %16.5f %9.5f %9.5f %9.2f\n", |
---|
645 | atime[0][i], atime[1][i], atime[2][i], atime[3][i]); |
---|
646 | } |
---|
647 | fprintf(outfile, "Average %46.2f\n",atime[3][6]); |
---|
648 | |
---|
649 | fprintf(outfile, "Leading dimension %16d\n",ldaa); |
---|
650 | |
---|
651 | for (i=7 ; i<12 ; i++) |
---|
652 | { |
---|
653 | fprintf(outfile, "Repeat seconds %16.5f %9.5f %9.5f %9.2f\n", |
---|
654 | atime[0][i], atime[1][i], atime[2][i], atime[3][i]); |
---|
655 | } |
---|
656 | fprintf(outfile, "Average %46.2f\n\n",atime[3][12]); |
---|
657 | |
---|
658 | fclose (outfile); |
---|
659 | |
---|
660 | printf("\nPress any key\n"); |
---|
661 | Endit = getch(); |
---|
662 | } |
---|
663 | |
---|
664 | /*----------------------*/ |
---|
665 | void print_time (int row) |
---|
666 | |
---|
667 | { |
---|
668 | fprintf(stderr,"%11.5f%11.5f%11.5f%11.2f%11.4f%11.4f\n", (double)atime[0][row], |
---|
669 | (double)atime[1][row], (double)atime[2][row], (double)atime[3][row], |
---|
670 | (double)atime[4][row], (double)atime[5][row]); |
---|
671 | return; |
---|
672 | } |
---|
673 | |
---|
674 | /*----------------------*/ |
---|
675 | |
---|
676 | void matgen (REAL a[], int lda, int n, REAL b[], REAL *norma) |
---|
677 | |
---|
678 | |
---|
679 | /* We would like to declare a[][lda], but c does not allow it. In this |
---|
680 | function, references to a[i][j] are written a[lda*i+j]. */ |
---|
681 | |
---|
682 | { |
---|
683 | int init, i, j; |
---|
684 | |
---|
685 | init = 1325; |
---|
686 | *norma = 0.0; |
---|
687 | for (j = 0; j < n; j++) { |
---|
688 | for (i = 0; i < n; i++) { |
---|
689 | init = 3125*init % 65536; |
---|
690 | a[lda*j+i] = (init - 32768.0)/16384.0; |
---|
691 | *norma = (a[lda*j+i] > *norma) ? a[lda*j+i] : *norma; |
---|
692 | |
---|
693 | /* alternative for some compilers |
---|
694 | if (fabs(a[lda*j+i]) > *norma) *norma = fabs(a[lda*j+i]); |
---|
695 | */ |
---|
696 | } |
---|
697 | } |
---|
698 | for (i = 0; i < n; i++) { |
---|
699 | b[i] = 0.0; |
---|
700 | } |
---|
701 | for (j = 0; j < n; j++) { |
---|
702 | for (i = 0; i < n; i++) { |
---|
703 | b[i] = b[i] + a[lda*j+i]; |
---|
704 | } |
---|
705 | } |
---|
706 | return; |
---|
707 | } |
---|
708 | |
---|
709 | /*----------------------*/ |
---|
710 | void dgefa(REAL a[], int lda, int n, int ipvt[], int *info) |
---|
711 | |
---|
712 | |
---|
713 | /* We would like to declare a[][lda], but c does not allow it. In this |
---|
714 | function, references to a[i][j] are written a[lda*i+j]. */ |
---|
715 | /* |
---|
716 | dgefa factors a double precision matrix by gaussian elimination. |
---|
717 | |
---|
718 | dgefa is usually called by dgeco, but it can be called |
---|
719 | directly with a saving in time if rcond is not needed. |
---|
720 | (time for dgeco) = (1 + 9/n)*(time for dgefa) . |
---|
721 | |
---|
722 | on entry |
---|
723 | |
---|
724 | a REAL precision[n][lda] |
---|
725 | the matrix to be factored. |
---|
726 | |
---|
727 | lda integer |
---|
728 | the leading dimension of the array a . |
---|
729 | |
---|
730 | n integer |
---|
731 | the order of the matrix a . |
---|
732 | |
---|
733 | on return |
---|
734 | |
---|
735 | a an upper triangular matrix and the multipliers |
---|
736 | which were used to obtain it. |
---|
737 | the factorization can be written a = l*u where |
---|
738 | l is a product of permutation and unit lower |
---|
739 | triangular matrices and u is upper triangular. |
---|
740 | |
---|
741 | ipvt integer[n] |
---|
742 | an integer vector of pivot indices. |
---|
743 | |
---|
744 | info integer |
---|
745 | = 0 normal value. |
---|
746 | = k if u[k][k] .eq. 0.0 . this is not an error |
---|
747 | condition for this subroutine, but it does |
---|
748 | indicate that dgesl or dgedi will divide by zero |
---|
749 | if called. use rcond in dgeco for a reliable |
---|
750 | indication of singularity. |
---|
751 | |
---|
752 | linpack. this version dated 08/14/78 . |
---|
753 | cleve moler, university of new mexico, argonne national lab. |
---|
754 | |
---|
755 | functions |
---|
756 | |
---|
757 | blas daxpy,dscal,idamax |
---|
758 | */ |
---|
759 | |
---|
760 | { |
---|
761 | /* internal variables */ |
---|
762 | |
---|
763 | REAL t; |
---|
764 | int j,k,kp1,l,nm1; |
---|
765 | |
---|
766 | |
---|
767 | /* gaussian elimination with partial pivoting */ |
---|
768 | |
---|
769 | *info = 0; |
---|
770 | nm1 = n - 1; |
---|
771 | if (nm1 >= 0) { |
---|
772 | for (k = 0; k < nm1; k++) { |
---|
773 | kp1 = k + 1; |
---|
774 | |
---|
775 | /* find l = pivot index */ |
---|
776 | |
---|
777 | l = idamax(n-k,&a[lda*k+k],1) + k; |
---|
778 | ipvt[k] = l; |
---|
779 | |
---|
780 | /* zero pivot implies this column already |
---|
781 | triangularized */ |
---|
782 | |
---|
783 | if (a[lda*k+l] != ZERO) { |
---|
784 | |
---|
785 | /* interchange if necessary */ |
---|
786 | |
---|
787 | if (l != k) { |
---|
788 | t = a[lda*k+l]; |
---|
789 | a[lda*k+l] = a[lda*k+k]; |
---|
790 | a[lda*k+k] = t; |
---|
791 | } |
---|
792 | |
---|
793 | /* compute multipliers */ |
---|
794 | |
---|
795 | t = -ONE/a[lda*k+k]; |
---|
796 | dscal(n-(k+1),t,&a[lda*k+k+1],1); |
---|
797 | |
---|
798 | /* row elimination with column indexing */ |
---|
799 | |
---|
800 | for (j = kp1; j < n; j++) { |
---|
801 | t = a[lda*j+l]; |
---|
802 | if (l != k) { |
---|
803 | a[lda*j+l] = a[lda*j+k]; |
---|
804 | a[lda*j+k] = t; |
---|
805 | } |
---|
806 | daxpy(n-(k+1),t,&a[lda*k+k+1],1, |
---|
807 | &a[lda*j+k+1],1); |
---|
808 | } |
---|
809 | } |
---|
810 | else { |
---|
811 | *info = k; |
---|
812 | } |
---|
813 | } |
---|
814 | } |
---|
815 | ipvt[n-1] = n-1; |
---|
816 | if (a[lda*(n-1)+(n-1)] == ZERO) *info = n-1; |
---|
817 | return; |
---|
818 | } |
---|
819 | |
---|
820 | /*----------------------*/ |
---|
821 | |
---|
822 | void dgesl(REAL a[],int lda,int n,int ipvt[],REAL b[],int job ) |
---|
823 | |
---|
824 | |
---|
825 | /* We would like to declare a[][lda], but c does not allow it. In this |
---|
826 | function, references to a[i][j] are written a[lda*i+j]. */ |
---|
827 | |
---|
828 | /* |
---|
829 | dgesl solves the double precision system |
---|
830 | a * x = b or trans(a) * x = b |
---|
831 | using the factors computed by dgeco or dgefa. |
---|
832 | |
---|
833 | on entry |
---|
834 | |
---|
835 | a double precision[n][lda] |
---|
836 | the output from dgeco or dgefa. |
---|
837 | |
---|
838 | lda integer |
---|
839 | the leading dimension of the array a . |
---|
840 | |
---|
841 | n integer |
---|
842 | the order of the matrix a . |
---|
843 | |
---|
844 | ipvt integer[n] |
---|
845 | the pivot vector from dgeco or dgefa. |
---|
846 | |
---|
847 | b double precision[n] |
---|
848 | the right hand side vector. |
---|
849 | |
---|
850 | job integer |
---|
851 | = 0 to solve a*x = b , |
---|
852 | = nonzero to solve trans(a)*x = b where |
---|
853 | trans(a) is the transpose. |
---|
854 | |
---|
855 | on return |
---|
856 | |
---|
857 | b the solution vector x . |
---|
858 | |
---|
859 | error condition |
---|
860 | |
---|
861 | a division by zero will occur if the input factor contains a |
---|
862 | zero on the diagonal. technically this indicates singularity |
---|
863 | but it is often caused by improper arguments or improper |
---|
864 | setting of lda . it will not occur if the subroutines are |
---|
865 | called correctly and if dgeco has set rcond .gt. 0.0 |
---|
866 | or dgefa has set info .eq. 0 . |
---|
867 | |
---|
868 | to compute inverse(a) * c where c is a matrix |
---|
869 | with p columns |
---|
870 | dgeco(a,lda,n,ipvt,rcond,z) |
---|
871 | if (!rcond is too small){ |
---|
872 | for (j=0,j<p,j++) |
---|
873 | dgesl(a,lda,n,ipvt,c[j][0],0); |
---|
874 | } |
---|
875 | |
---|
876 | linpack. this version dated 08/14/78 . |
---|
877 | cleve moler, university of new mexico, argonne national lab. |
---|
878 | |
---|
879 | functions |
---|
880 | |
---|
881 | blas daxpy,ddot |
---|
882 | */ |
---|
883 | { |
---|
884 | /* internal variables */ |
---|
885 | |
---|
886 | REAL t; |
---|
887 | int k,kb,l,nm1; |
---|
888 | |
---|
889 | nm1 = n - 1; |
---|
890 | if (job == 0) { |
---|
891 | |
---|
892 | /* job = 0 , solve a * x = b |
---|
893 | first solve l*y = b */ |
---|
894 | |
---|
895 | if (nm1 >= 1) { |
---|
896 | for (k = 0; k < nm1; k++) { |
---|
897 | l = ipvt[k]; |
---|
898 | t = b[l]; |
---|
899 | if (l != k){ |
---|
900 | b[l] = b[k]; |
---|
901 | b[k] = t; |
---|
902 | } |
---|
903 | daxpy(n-(k+1),t,&a[lda*k+k+1],1,&b[k+1],1 ); |
---|
904 | } |
---|
905 | } |
---|
906 | |
---|
907 | /* now solve u*x = y */ |
---|
908 | |
---|
909 | for (kb = 0; kb < n; kb++) { |
---|
910 | k = n - (kb + 1); |
---|
911 | b[k] = b[k]/a[lda*k+k]; |
---|
912 | t = -b[k]; |
---|
913 | daxpy(k,t,&a[lda*k+0],1,&b[0],1 ); |
---|
914 | } |
---|
915 | } |
---|
916 | else { |
---|
917 | |
---|
918 | /* job = nonzero, solve trans(a) * x = b |
---|
919 | first solve trans(u)*y = b */ |
---|
920 | |
---|
921 | for (k = 0; k < n; k++) { |
---|
922 | t = ddot(k,&a[lda*k+0],1,&b[0],1); |
---|
923 | b[k] = (b[k] - t)/a[lda*k+k]; |
---|
924 | } |
---|
925 | |
---|
926 | /* now solve trans(l)*x = y */ |
---|
927 | |
---|
928 | if (nm1 >= 1) { |
---|
929 | for (kb = 1; kb < nm1; kb++) { |
---|
930 | k = n - (kb+1); |
---|
931 | b[k] = b[k] + ddot(n-(k+1),&a[lda*k+k+1],1,&b[k+1],1); |
---|
932 | l = ipvt[k]; |
---|
933 | if (l != k) { |
---|
934 | t = b[l]; |
---|
935 | b[l] = b[k]; |
---|
936 | b[k] = t; |
---|
937 | } |
---|
938 | } |
---|
939 | } |
---|
940 | } |
---|
941 | return; |
---|
942 | } |
---|
943 | |
---|
944 | /*----------------------*/ |
---|
945 | |
---|
946 | void daxpy(int n, REAL da, REAL dx[], int incx, REAL dy[], int incy) |
---|
947 | /* |
---|
948 | constant times a vector plus a vector. |
---|
949 | jack dongarra, linpack, 3/11/78. |
---|
950 | */ |
---|
951 | |
---|
952 | { |
---|
953 | int i,ix,iy,m,mp1; |
---|
954 | |
---|
955 | mp1 = 0; |
---|
956 | m = 0; |
---|
957 | |
---|
958 | if(n <= 0) return; |
---|
959 | if (da == ZERO) return; |
---|
960 | |
---|
961 | if(incx != 1 || incy != 1) { |
---|
962 | |
---|
963 | /* code for unequal increments or equal increments |
---|
964 | not equal to 1 */ |
---|
965 | |
---|
966 | ix = 0; |
---|
967 | iy = 0; |
---|
968 | if(incx < 0) ix = (-n+1)*incx; |
---|
969 | if(incy < 0)iy = (-n+1)*incy; |
---|
970 | for (i = 0;i < n; i++) { |
---|
971 | dy[iy] = dy[iy] + da*dx[ix]; |
---|
972 | ix = ix + incx; |
---|
973 | iy = iy + incy; |
---|
974 | |
---|
975 | } |
---|
976 | return; |
---|
977 | } |
---|
978 | |
---|
979 | /* code for both increments equal to 1 */ |
---|
980 | |
---|
981 | |
---|
982 | #ifdef ROLL |
---|
983 | |
---|
984 | for (i = 0;i < n; i++) { |
---|
985 | dy[i] = dy[i] + da*dx[i]; |
---|
986 | } |
---|
987 | |
---|
988 | |
---|
989 | #endif |
---|
990 | |
---|
991 | #ifdef UNROLL |
---|
992 | |
---|
993 | m = n % 4; |
---|
994 | if ( m != 0) { |
---|
995 | for (i = 0; i < m; i++) |
---|
996 | dy[i] = dy[i] + da*dx[i]; |
---|
997 | |
---|
998 | if (n < 4) return; |
---|
999 | } |
---|
1000 | for (i = m; i < n; i = i + 4) { |
---|
1001 | dy[i] = dy[i] + da*dx[i]; |
---|
1002 | dy[i+1] = dy[i+1] + da*dx[i+1]; |
---|
1003 | dy[i+2] = dy[i+2] + da*dx[i+2]; |
---|
1004 | dy[i+3] = dy[i+3] + da*dx[i+3]; |
---|
1005 | |
---|
1006 | } |
---|
1007 | |
---|
1008 | #endif |
---|
1009 | return; |
---|
1010 | } |
---|
1011 | |
---|
1012 | /*----------------------*/ |
---|
1013 | |
---|
1014 | REAL ddot(int n, REAL dx[], int incx, REAL dy[], int incy) |
---|
1015 | /* |
---|
1016 | forms the dot product of two vectors. |
---|
1017 | jack dongarra, linpack, 3/11/78. |
---|
1018 | */ |
---|
1019 | |
---|
1020 | { |
---|
1021 | REAL dtemp; |
---|
1022 | int i,ix,iy,m,mp1; |
---|
1023 | |
---|
1024 | mp1 = 0; |
---|
1025 | m = 0; |
---|
1026 | |
---|
1027 | dtemp = ZERO; |
---|
1028 | |
---|
1029 | if(n <= 0) return(ZERO); |
---|
1030 | |
---|
1031 | if(incx != 1 || incy != 1) { |
---|
1032 | |
---|
1033 | /* code for unequal increments or equal increments |
---|
1034 | not equal to 1 */ |
---|
1035 | |
---|
1036 | ix = 0; |
---|
1037 | iy = 0; |
---|
1038 | if (incx < 0) ix = (-n+1)*incx; |
---|
1039 | if (incy < 0) iy = (-n+1)*incy; |
---|
1040 | for (i = 0;i < n; i++) { |
---|
1041 | dtemp = dtemp + dx[ix]*dy[iy]; |
---|
1042 | ix = ix + incx; |
---|
1043 | iy = iy + incy; |
---|
1044 | |
---|
1045 | } |
---|
1046 | return(dtemp); |
---|
1047 | } |
---|
1048 | |
---|
1049 | /* code for both increments equal to 1 */ |
---|
1050 | |
---|
1051 | |
---|
1052 | #ifdef ROLL |
---|
1053 | |
---|
1054 | for (i=0;i < n; i++) |
---|
1055 | dtemp = dtemp + dx[i]*dy[i]; |
---|
1056 | |
---|
1057 | return(dtemp); |
---|
1058 | |
---|
1059 | #endif |
---|
1060 | |
---|
1061 | #ifdef UNROLL |
---|
1062 | |
---|
1063 | |
---|
1064 | m = n % 5; |
---|
1065 | if (m != 0) { |
---|
1066 | for (i = 0; i < m; i++) |
---|
1067 | dtemp = dtemp + dx[i]*dy[i]; |
---|
1068 | if (n < 5) return(dtemp); |
---|
1069 | } |
---|
1070 | for (i = m; i < n; i = i + 5) { |
---|
1071 | dtemp = dtemp + dx[i]*dy[i] + |
---|
1072 | dx[i+1]*dy[i+1] + dx[i+2]*dy[i+2] + |
---|
1073 | dx[i+3]*dy[i+3] + dx[i+4]*dy[i+4]; |
---|
1074 | } |
---|
1075 | return(dtemp); |
---|
1076 | |
---|
1077 | #endif |
---|
1078 | |
---|
1079 | } |
---|
1080 | |
---|
1081 | /*----------------------*/ |
---|
1082 | void dscal(int n, REAL da, REAL dx[], int incx) |
---|
1083 | |
---|
1084 | /* scales a vector by a constant. |
---|
1085 | jack dongarra, linpack, 3/11/78. |
---|
1086 | */ |
---|
1087 | |
---|
1088 | { |
---|
1089 | int i,m,mp1,nincx; |
---|
1090 | |
---|
1091 | mp1 = 0; |
---|
1092 | m = 0; |
---|
1093 | |
---|
1094 | if(n <= 0)return; |
---|
1095 | if(incx != 1) { |
---|
1096 | |
---|
1097 | /* code for increment not equal to 1 */ |
---|
1098 | |
---|
1099 | nincx = n*incx; |
---|
1100 | for (i = 0; i < nincx; i = i + incx) |
---|
1101 | dx[i] = da*dx[i]; |
---|
1102 | |
---|
1103 | return; |
---|
1104 | } |
---|
1105 | |
---|
1106 | /* code for increment equal to 1 */ |
---|
1107 | |
---|
1108 | |
---|
1109 | #ifdef ROLL |
---|
1110 | |
---|
1111 | for (i = 0; i < n; i++) |
---|
1112 | dx[i] = da*dx[i]; |
---|
1113 | |
---|
1114 | |
---|
1115 | #endif |
---|
1116 | |
---|
1117 | #ifdef UNROLL |
---|
1118 | |
---|
1119 | |
---|
1120 | m = n % 5; |
---|
1121 | if (m != 0) { |
---|
1122 | for (i = 0; i < m; i++) |
---|
1123 | dx[i] = da*dx[i]; |
---|
1124 | if (n < 5) return; |
---|
1125 | } |
---|
1126 | for (i = m; i < n; i = i + 5){ |
---|
1127 | dx[i] = da*dx[i]; |
---|
1128 | dx[i+1] = da*dx[i+1]; |
---|
1129 | dx[i+2] = da*dx[i+2]; |
---|
1130 | dx[i+3] = da*dx[i+3]; |
---|
1131 | dx[i+4] = da*dx[i+4]; |
---|
1132 | } |
---|
1133 | |
---|
1134 | #endif |
---|
1135 | |
---|
1136 | } |
---|
1137 | |
---|
1138 | /*----------------------*/ |
---|
1139 | int idamax(int n, REAL dx[], int incx) |
---|
1140 | |
---|
1141 | /* |
---|
1142 | finds the index of element having max. absolute value. |
---|
1143 | jack dongarra, linpack, 3/11/78. |
---|
1144 | */ |
---|
1145 | |
---|
1146 | |
---|
1147 | { |
---|
1148 | REAL dmax; |
---|
1149 | int i, ix, itemp; |
---|
1150 | |
---|
1151 | if( n < 1 ) return(-1); |
---|
1152 | if(n ==1 ) return(0); |
---|
1153 | if(incx != 1) { |
---|
1154 | |
---|
1155 | /* code for increment not equal to 1 */ |
---|
1156 | |
---|
1157 | ix = 1; |
---|
1158 | dmax = fabs((double)dx[0]); |
---|
1159 | ix = ix + incx; |
---|
1160 | for (i = 1; i < n; i++) { |
---|
1161 | if(fabs((double)dx[ix]) > dmax) { |
---|
1162 | itemp = i; |
---|
1163 | dmax = fabs((double)dx[ix]); |
---|
1164 | } |
---|
1165 | ix = ix + incx; |
---|
1166 | } |
---|
1167 | } |
---|
1168 | else { |
---|
1169 | |
---|
1170 | /* code for increment equal to 1 */ |
---|
1171 | |
---|
1172 | itemp = 0; |
---|
1173 | dmax = fabs((double)dx[0]); |
---|
1174 | for (i = 1; i < n; i++) { |
---|
1175 | if(fabs((double)dx[i]) > dmax) { |
---|
1176 | itemp = i; |
---|
1177 | dmax = fabs((double)dx[i]); |
---|
1178 | } |
---|
1179 | } |
---|
1180 | } |
---|
1181 | return (itemp); |
---|
1182 | } |
---|
1183 | |
---|
1184 | /*----------------------*/ |
---|
1185 | REAL epslon (REAL x) |
---|
1186 | |
---|
1187 | /* |
---|
1188 | estimate unit roundoff in quantities of size x. |
---|
1189 | */ |
---|
1190 | |
---|
1191 | { |
---|
1192 | REAL a,b,c,eps; |
---|
1193 | /* |
---|
1194 | this program should function properly on all systems |
---|
1195 | satisfying the following two assumptions, |
---|
1196 | 1. the base used in representing dfloating point |
---|
1197 | numbers is not a power of three. |
---|
1198 | 2. the quantity a in statement 10 is represented to |
---|
1199 | the accuracy used in dfloating point variables |
---|
1200 | that are stored in memory. |
---|
1201 | the statement number 10 and the go to 10 are intended to |
---|
1202 | force optimizing compilers to generate code satisfying |
---|
1203 | assumption 2. |
---|
1204 | under these assumptions, it should be true that, |
---|
1205 | a is not exactly equal to four-thirds, |
---|
1206 | b has a zero for its last bit or digit, |
---|
1207 | c is not exactly equal to one, |
---|
1208 | eps measures the separation of 1.0 from |
---|
1209 | the next larger dfloating point number. |
---|
1210 | the developers of eispack would appreciate being informed |
---|
1211 | about any systems where these assumptions do not hold. |
---|
1212 | |
---|
1213 | ***************************************************************** |
---|
1214 | this routine is one of the auxiliary routines used by eispack iii |
---|
1215 | to avoid machine dependencies. |
---|
1216 | ***************************************************************** |
---|
1217 | |
---|
1218 | this version dated 4/6/83. |
---|
1219 | */ |
---|
1220 | |
---|
1221 | a = 4.0e0/3.0e0; |
---|
1222 | eps = ZERO; |
---|
1223 | while (eps == ZERO) { |
---|
1224 | b = a - ONE; |
---|
1225 | c = b + b + b; |
---|
1226 | eps = fabs((double)(c-ONE)); |
---|
1227 | } |
---|
1228 | return(eps*fabs((double)x)); |
---|
1229 | } |
---|
1230 | |
---|
1231 | /*----------------------*/ |
---|
1232 | void dmxpy (int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[]) |
---|
1233 | |
---|
1234 | |
---|
1235 | /* We would like to declare m[][ldm], but c does not allow it. In this |
---|
1236 | function, references to m[i][j] are written m[ldm*i+j]. */ |
---|
1237 | |
---|
1238 | /* |
---|
1239 | purpose: |
---|
1240 | multiply matrix m times vector x and add the result to vector y. |
---|
1241 | |
---|
1242 | parameters: |
---|
1243 | |
---|
1244 | n1 integer, number of elements in vector y, and number of rows in |
---|
1245 | matrix m |
---|
1246 | |
---|
1247 | y double [n1], vector of length n1 to which is added |
---|
1248 | the product m*x |
---|
1249 | |
---|
1250 | n2 integer, number of elements in vector x, and number of columns |
---|
1251 | in matrix m |
---|
1252 | |
---|
1253 | ldm integer, leading dimension of array m |
---|
1254 | |
---|
1255 | x double [n2], vector of length n2 |
---|
1256 | |
---|
1257 | m double [ldm][n2], matrix of n1 rows and n2 columns |
---|
1258 | |
---|
1259 | ---------------------------------------------------------------------- |
---|
1260 | */ |
---|
1261 | { |
---|
1262 | int j,i,jmin; |
---|
1263 | /* cleanup odd vector */ |
---|
1264 | |
---|
1265 | j = n2 % 2; |
---|
1266 | if (j >= 1) { |
---|
1267 | j = j - 1; |
---|
1268 | for (i = 0; i < n1; i++) |
---|
1269 | y[i] = (y[i]) + x[j]*m[ldm*j+i]; |
---|
1270 | } |
---|
1271 | |
---|
1272 | /* cleanup odd group of two vectors */ |
---|
1273 | |
---|
1274 | j = n2 % 4; |
---|
1275 | if (j >= 2) { |
---|
1276 | j = j - 1; |
---|
1277 | for (i = 0; i < n1; i++) |
---|
1278 | y[i] = ( (y[i]) |
---|
1279 | + x[j-1]*m[ldm*(j-1)+i]) + x[j]*m[ldm*j+i]; |
---|
1280 | } |
---|
1281 | |
---|
1282 | /* cleanup odd group of four vectors */ |
---|
1283 | |
---|
1284 | j = n2 % 8; |
---|
1285 | if (j >= 4) { |
---|
1286 | j = j - 1; |
---|
1287 | for (i = 0; i < n1; i++) |
---|
1288 | y[i] = ((( (y[i]) |
---|
1289 | + x[j-3]*m[ldm*(j-3)+i]) |
---|
1290 | + x[j-2]*m[ldm*(j-2)+i]) |
---|
1291 | + x[j-1]*m[ldm*(j-1)+i]) + x[j]*m[ldm*j+i]; |
---|
1292 | } |
---|
1293 | |
---|
1294 | /* cleanup odd group of eight vectors */ |
---|
1295 | |
---|
1296 | j = n2 % 16; |
---|
1297 | if (j >= 8) { |
---|
1298 | j = j - 1; |
---|
1299 | for (i = 0; i < n1; i++) |
---|
1300 | y[i] = ((((((( (y[i]) |
---|
1301 | + x[j-7]*m[ldm*(j-7)+i]) + x[j-6]*m[ldm*(j-6)+i]) |
---|
1302 | + x[j-5]*m[ldm*(j-5)+i]) + x[j-4]*m[ldm*(j-4)+i]) |
---|
1303 | + x[j-3]*m[ldm*(j-3)+i]) + x[j-2]*m[ldm*(j-2)+i]) |
---|
1304 | + x[j-1]*m[ldm*(j-1)+i]) + x[j] *m[ldm*j+i]; |
---|
1305 | } |
---|
1306 | |
---|
1307 | /* main loop - groups of sixteen vectors */ |
---|
1308 | |
---|
1309 | jmin = (n2%16)+16; |
---|
1310 | for (j = jmin-1; j < n2; j = j + 16) { |
---|
1311 | for (i = 0; i < n1; i++) |
---|
1312 | y[i] = ((((((((((((((( (y[i]) |
---|
1313 | + x[j-15]*m[ldm*(j-15)+i]) |
---|
1314 | + x[j-14]*m[ldm*(j-14)+i]) |
---|
1315 | + x[j-13]*m[ldm*(j-13)+i]) |
---|
1316 | + x[j-12]*m[ldm*(j-12)+i]) |
---|
1317 | + x[j-11]*m[ldm*(j-11)+i]) |
---|
1318 | + x[j-10]*m[ldm*(j-10)+i]) |
---|
1319 | + x[j- 9]*m[ldm*(j- 9)+i]) |
---|
1320 | + x[j- 8]*m[ldm*(j- 8)+i]) |
---|
1321 | + x[j- 7]*m[ldm*(j- 7)+i]) |
---|
1322 | + x[j- 6]*m[ldm*(j- 6)+i]) |
---|
1323 | + x[j- 5]*m[ldm*(j- 5)+i]) |
---|
1324 | + x[j- 4]*m[ldm*(j- 4)+i]) |
---|
1325 | + x[j- 3]*m[ldm*(j- 3)+i]) |
---|
1326 | + x[j- 2]*m[ldm*(j- 2)+i]) |
---|
1327 | + x[j- 1]*m[ldm*(j- 1)+i]) |
---|
1328 | + x[j] *m[ldm*j+i]; |
---|
1329 | } |
---|
1330 | return; |
---|
1331 | } |
---|
1332 | |
---|
1333 | /*----------------------*/ |
---|