1 | /* |
---|
2 | * Linpack 100x100 Benchmark In C/C++ For PCs |
---|
3 | * |
---|
4 | ******************************************************************** |
---|
5 | * |
---|
6 | * Original Source from NETLIB |
---|
7 | * |
---|
8 | * Translated to C by Bonnie Toy 5/88 (modified on 2/25/94 to fix |
---|
9 | * a problem with daxpy for unequal increments or equal increments |
---|
10 | * not equal to 1. Jack Dongarra) |
---|
11 | * |
---|
12 | * To obtain rolled source BLAS, add -DROLL to the command lines. |
---|
13 | * To obtain unrolled source BLAS, add -DUNROLL to the command lines. |
---|
14 | * |
---|
15 | * You must specify one of -DSP or -DDP to compile correctly. |
---|
16 | * |
---|
17 | * You must specify one of -DROLL or -DUNROLL to compile correctly. |
---|
18 | * |
---|
19 | ******************************************************************** |
---|
20 | * |
---|
21 | * Changes in this version |
---|
22 | * |
---|
23 | * 1. Function prototypes are declared and function headers have |
---|
24 | * embedded parameter types to produce code for C and C++ |
---|
25 | * |
---|
26 | * 2. Arrays aa and a are declared as [200*200] and [200*201] to |
---|
27 | * allow compilation with prototypes. |
---|
28 | * |
---|
29 | * 3. Function second changed (compiler dependent). |
---|
30 | * |
---|
31 | * 4. Timing method changed due to inaccuracy of PC clock (see below). |
---|
32 | * |
---|
33 | * 5. Additional date function included (compiler dependent). |
---|
34 | * |
---|
35 | * 6. Additional code used as a standard for a series of benchmarks:- |
---|
36 | * Automatic run time calibration rather than fixed parameters |
---|
37 | * Initial calibration with display to show linearity |
---|
38 | * Results displayed at reasonable rate for viewing (5 seconds) |
---|
39 | * Facilities for typing in details of system used etc. |
---|
40 | * Compiler details in code in case .exe files used elsewhere |
---|
41 | * Results appended to a text file (Linpack.txt) |
---|
42 | * |
---|
43 | * Roy Longbottom 101323.2241@compuserve.com 14 September 1996 |
---|
44 | * |
---|
45 | ************************************************************************ |
---|
46 | * |
---|
47 | * Timing |
---|
48 | * |
---|
49 | * The PC timer is updated at about 18 times per second or resolution of |
---|
50 | * 0.05 to 0.06 seconds which is similar to the time taken by the main |
---|
51 | * time consuming function dgefa on a 100 MHz Pentium. Thus there is no |
---|
52 | * point in running the dgefa/dges1 combination three times as in the |
---|
53 | * original version. Main timing for the latter, in the loop run NTIMES, |
---|
54 | * executes matgen/dgefa, summing the time taken by matgen within the |
---|
55 | * loop for later deduction from the total time. On a modern PC this sum |
---|
56 | * can be based on a random selection of 0 or 0.05/0.06. This version |
---|
57 | * executes the single pass once and the main timing loop five times, |
---|
58 | * calculating the matgen overhead separately. |
---|
59 | * |
---|
60 | ************************************************************************* |
---|
61 | * |
---|
62 | * Example of Output |
---|
63 | * |
---|
64 | * Rolled Double Precision Linpack Benchmark - PC Version in 'C/C++' |
---|
65 | * |
---|
66 | * Compiler Watcom C/C++ 10.5 Win 386 |
---|
67 | * Optimisation -zp4 -otexan -fp5 -5r -dDP -dROLL |
---|
68 | * |
---|
69 | * |
---|
70 | * norm resid resid machep x[0]-1 x[n-1]-1 |
---|
71 | * 0.4 7.41628980e-014 1.00000000e-015 -1.49880108e-014 -1.89848137e-014 |
---|
72 | * |
---|
73 | * |
---|
74 | * Times are reported for matrices of order 100 |
---|
75 | * 1 pass times for array with leading dimension of 201 |
---|
76 | * |
---|
77 | * dgefa dgesl total Mflops unit ratio |
---|
78 | * 0.06000 0.00000 0.06000 11.44 0.1748 1.0714 |
---|
79 | * |
---|
80 | * |
---|
81 | * Calculating matgen overhead |
---|
82 | * |
---|
83 | * 10 times 0.11 seconds |
---|
84 | * 20 times 0.22 seconds |
---|
85 | * 40 times 0.44 seconds |
---|
86 | * 80 times 0.87 seconds |
---|
87 | * 160 times 1.76 seconds |
---|
88 | * 320 times 3.52 seconds |
---|
89 | * 640 times 7.03 seconds |
---|
90 | * |
---|
91 | * Overhead for 1 matgen 0.01098 seconds |
---|
92 | * |
---|
93 | * |
---|
94 | * Calculating matgen/dgefa passes for 5 seconds |
---|
95 | * |
---|
96 | * 10 times 0.71 seconds |
---|
97 | * 20 times 1.38 seconds |
---|
98 | * 40 times 2.80 seconds |
---|
99 | * 80 times 5.66 seconds |
---|
100 | * |
---|
101 | * Passes used 70 |
---|
102 | * |
---|
103 | * This is followed by output of the normal data for dgefa, dges1, |
---|
104 | * total, Mflops, unit and ratio with five sets of results for each. |
---|
105 | * |
---|
106 | ************************************************************************ |
---|
107 | * |
---|
108 | * Example from output file Linpack.txt |
---|
109 | * |
---|
110 | * LINPACK BENCHMARK FOR PCs 'C/C++' n @ 100 |
---|
111 | * |
---|
112 | * Month run 9/1996 |
---|
113 | * PC model Escom |
---|
114 | * CPU Pentium |
---|
115 | * Clock MHz 100 |
---|
116 | * Cache 256K |
---|
117 | * Options Neptune chipset |
---|
118 | * OS/DOS Windows 95 |
---|
119 | * Compiler Watcom C/C++ 10.5 Win 386 |
---|
120 | * OptLevel -zp4 -otexan -fp5 -5r -dDP -dROLL |
---|
121 | * Run by Roy Longbottom |
---|
122 | * From UK |
---|
123 | * Mail 101323.2241@compuserve.com |
---|
124 | * |
---|
125 | * Rolling Rolled |
---|
126 | * Precision Double |
---|
127 | * norm. resid 0.4 |
---|
128 | * resid 7.41628980e-014 |
---|
129 | * machep 1.00000000e-015 (8.88178420e-016 NON OPT) |
---|
130 | * x[0]-1 -1.49880108e-014 |
---|
131 | * x[n-1]-1 -1.89848137e-014 |
---|
132 | * matgen 1 seconds 0.01051 |
---|
133 | * matgen 2 seconds 0.01050 |
---|
134 | * Repetitions 70 |
---|
135 | * Leading dimension 201 |
---|
136 | * dgefa dgesl total Mflops |
---|
137 | * 1 pass seconds 0.06000 0.00000 0.06000 |
---|
138 | * Repeat seconds 0.06092 0.00157 0.06249 10.99 |
---|
139 | * Repeat seconds 0.06077 0.00157 0.06234 11.01 |
---|
140 | * Repeat seconds 0.06092 0.00157 0.06249 10.99 |
---|
141 | * Repeat seconds 0.06092 0.00157 0.06249 10.99 |
---|
142 | * Repeat seconds 0.06092 0.00157 0.06249 10.99 |
---|
143 | * Average 10.99 |
---|
144 | * Leading dimension 200 |
---|
145 | * Repeat seconds 0.05936 0.00157 0.06093 11.27 |
---|
146 | * Repeat seconds 0.05936 0.00157 0.06093 11.27 |
---|
147 | * Repeat seconds 0.05864 0.00157 0.06021 11.40 |
---|
148 | * Repeat seconds 0.05936 0.00157 0.06093 11.27 |
---|
149 | * Repeat seconds 0.05864 0.00157 0.06021 11.40 |
---|
150 | * Average 11.32 |
---|
151 | * |
---|
152 | ************************************************************************ |
---|
153 | * |
---|
154 | * Examples of Results |
---|
155 | * |
---|
156 | * Precompiled codes were produced via a Watcom C/C++ 10.5 compiler. |
---|
157 | * Versions are available for DOS, Windows 3/95 and NT/Win 95. Both |
---|
158 | * non-optimised and optimised programs are available. The latter has |
---|
159 | * options as in the above example. Although these options can place |
---|
160 | * functions in-line, in this case, daxpy is not in-lined. Optimisation |
---|
161 | * reduces 18 instructions in the loop in this function to the following: |
---|
162 | * |
---|
163 | * L85 fld st(0) |
---|
164 | * fmul qword ptr [edx] |
---|
165 | * add eax,00000008H |
---|
166 | * add edx,00000008H |
---|
167 | * fadd qword ptr -8H[eax] |
---|
168 | * inc ebx |
---|
169 | * fstp qword ptr -8H[eax] |
---|
170 | * cmp ebx,esi |
---|
171 | * jl L85 |
---|
172 | * |
---|
173 | * Results produced are not consistent between runs but produce similar |
---|
174 | * speeds when executing at a particular dimension (see above). An example |
---|
175 | * of other results is 11.4/10.5 Mflops. Most typical double precision |
---|
176 | * rolled results are: |
---|
177 | * |
---|
178 | * Opt No Opt Version/ |
---|
179 | * MHz Cache Mflops Mflops Make/Options Via |
---|
180 | * |
---|
181 | * AM80386DX 40 128K 0.53 0.36 Clone Win/W95 |
---|
182 | * 80486DX2 66 128K 2.5 1.9 Escom SIS chipset Win/W95 |
---|
183 | * 80486DX2 66 128K 2.3 1.9 Escom SIS chipset NT/W95 |
---|
184 | * 80486DX2 66 128K 2.8 2.0 Escom SIS chipset Dos/Dos |
---|
185 | * Pentium 100 256K 11 4.2 Escom Neptune chipset Win/W95 |
---|
186 | * Pentium 100 256K 11 5.5 Escom Neptune chipset NT/W95 |
---|
187 | * Pentium 100 256K 12 4.4 Escom Neptune chipset Dos/Dos |
---|
188 | * Pentium Pro 200 256K 48 19 Dell XPS Pro200n NT/NT |
---|
189 | * |
---|
190 | * The results are as produced when compiled as Linpack.cpp. Compiling as |
---|
191 | * Linpack.c gives similar speeds but the code is a little different. |
---|
192 | * |
---|
193 | *************************************************************************** |
---|
194 | */ |
---|
195 | |
---|
196 | #define DP |
---|
197 | #define ROLL |
---|
198 | |
---|
199 | #ifdef SP |
---|
200 | #define REAL float |
---|
201 | #define ZERO 0.0 |
---|
202 | #define ONE 1.0 |
---|
203 | #define PREC "Single " |
---|
204 | #endif |
---|
205 | |
---|
206 | #ifdef DP |
---|
207 | #define REAL double |
---|
208 | #define ZERO 0.0e0 |
---|
209 | #define ONE 1.0e0 |
---|
210 | #define PREC "Double " |
---|
211 | #endif |
---|
212 | |
---|
213 | #ifdef ROLL |
---|
214 | #define ROLLING "Rolled " |
---|
215 | #endif |
---|
216 | #ifdef UNROLL |
---|
217 | #define ROLLING "Unrolled " |
---|
218 | #endif |
---|
219 | |
---|
220 | |
---|
221 | #define NTIMES atoi(argv[1]) |
---|
222 | |
---|
223 | #include <stdio.h> |
---|
224 | #include <math.h> |
---|
225 | #include <stdlib.h> |
---|
226 | #ifdef __rtems__ |
---|
227 | #include <tmacros.h> |
---|
228 | #undef print_time |
---|
229 | #define fprintf(f, ...) rtems_printf(&rtems_test_printer, __VA_ARGS__) |
---|
230 | #endif /* __rtems__ */ |
---|
231 | |
---|
232 | |
---|
233 | static REAL atime[9][15]; |
---|
234 | |
---|
235 | void print_time (int row); |
---|
236 | void matgen (REAL a[], int lda, int n, REAL b[], REAL *norma); |
---|
237 | void dgefa (REAL a[], int lda, int n, int ipvt[], int *info); |
---|
238 | void dgesl (REAL a[],int lda,int n,int ipvt[],REAL b[],int job); |
---|
239 | void dmxpy (int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[]); |
---|
240 | void daxpy (int n, REAL da, REAL dx[], int incx, REAL dy[], int incy); |
---|
241 | REAL epslon (REAL x); |
---|
242 | int idamax (int n, REAL dx[], int incx); |
---|
243 | void dscal (int n, REAL da, REAL dx[], int incx); |
---|
244 | REAL ddot (int n, REAL dx[], int incx, REAL dy[], int incy); |
---|
245 | |
---|
246 | /* TIME TIME TIME TIME TIME TIME TIME TIME TIME TIME TIME TIME TIME */ |
---|
247 | #include <sys/time.h> /* for following time functions only */ |
---|
248 | static REAL second(void) |
---|
249 | { |
---|
250 | struct timeval tv; |
---|
251 | |
---|
252 | gettimeofday(&tv, NULL); |
---|
253 | return (double)tv.tv_sec + (double)tv.tv_usec * 1e-6; |
---|
254 | } |
---|
255 | |
---|
256 | int main (int argc, char **argv) |
---|
257 | { |
---|
258 | static REAL aa[200*200],a[200*201],b[200],x[200]; |
---|
259 | REAL cray,ops,total,norma,normx; |
---|
260 | REAL resid,residn,eps,t1,tm2,epsn,x1,x2; |
---|
261 | REAL mflops; |
---|
262 | static int ipvt[200],n,i,j,ntimes,info,lda,ldaa; |
---|
263 | int pass, loop; |
---|
264 | REAL overhead1, overhead2, time1, time2; |
---|
265 | char *compiler, *options; |
---|
266 | |
---|
267 | /************************************************************************ |
---|
268 | * Enter details of compiler and options used * |
---|
269 | ************************************************************************/ |
---|
270 | /*----------------- --------- --------- ---------*/ |
---|
271 | compiler = "INSERT COMPILER NAME HERE"; |
---|
272 | options = "INSERT OPTIMISATION OPTIONS HERE"; |
---|
273 | /* Include -dDP or -dSP and -dROLL or -dUNROLL */ |
---|
274 | |
---|
275 | lda = 201; |
---|
276 | ldaa = 200; |
---|
277 | cray = .056; |
---|
278 | n = 100; |
---|
279 | |
---|
280 | fprintf(stdout,ROLLING);fprintf(stdout,PREC); |
---|
281 | fprintf(stdout,"Precision Linpack Benchmark - PC Version in 'C/C++'\n\n"); |
---|
282 | fprintf(stdout,"Compiler %s\n",compiler); |
---|
283 | fprintf(stdout,"Optimisation %s\n\n",options); |
---|
284 | |
---|
285 | ops = (2.0e0*(n*n*n))/3.0 + 2.0*(n*n); |
---|
286 | |
---|
287 | matgen(a,lda,n,b,&norma); |
---|
288 | t1 = second(); |
---|
289 | dgefa(a,lda,n,ipvt,&info); |
---|
290 | atime[0][0] = second() - t1; |
---|
291 | t1 = second(); |
---|
292 | dgesl(a,lda,n,ipvt,b,0); |
---|
293 | atime[1][0] = second() - t1; |
---|
294 | total = atime[0][0] + atime[1][0]; |
---|
295 | |
---|
296 | /* compute a residual to verify results. */ |
---|
297 | |
---|
298 | for (i = 0; i < n; i++) { |
---|
299 | x[i] = b[i]; |
---|
300 | } |
---|
301 | matgen(a,lda,n,b,&norma); |
---|
302 | for (i = 0; i < n; i++) { |
---|
303 | b[i] = -b[i]; |
---|
304 | } |
---|
305 | dmxpy(n,b,n,lda,x,a); |
---|
306 | resid = 0.0; |
---|
307 | normx = 0.0; |
---|
308 | for (i = 0; i < n; i++) { |
---|
309 | resid = (resid > fabs((double)b[i])) |
---|
310 | ? resid : fabs((double)b[i]); |
---|
311 | normx = (normx > fabs((double)x[i])) |
---|
312 | ? normx : fabs((double)x[i]); |
---|
313 | } |
---|
314 | eps = epslon(ONE); |
---|
315 | residn = resid/( n*norma*normx*eps ); |
---|
316 | epsn = eps; |
---|
317 | x1 = x[0] - 1; |
---|
318 | x2 = x[n-1] - 1; |
---|
319 | |
---|
320 | printf("norm resid resid machep"); |
---|
321 | printf(" x[0]-1 x[n-1]-1\n"); |
---|
322 | printf("%6.1f %17.8e%17.8e%17.8e%17.8e\n\n", |
---|
323 | (double)residn, (double)resid, (double)epsn, |
---|
324 | (double)x1, (double)x2); |
---|
325 | |
---|
326 | fprintf(stdout,"Times are reported for matrices of order %5d\n",n); |
---|
327 | fprintf(stdout,"1 pass times for array with leading dimension of%5d\n\n",lda); |
---|
328 | fprintf(stdout," dgefa dgesl total Mflops unit"); |
---|
329 | fprintf(stdout," ratio\n"); |
---|
330 | |
---|
331 | atime[2][0] = total; |
---|
332 | if (total > 0.0) |
---|
333 | { |
---|
334 | atime[3][0] = ops/(1.0e6*total); |
---|
335 | atime[4][0] = 2.0/atime[3][0]; |
---|
336 | } |
---|
337 | else |
---|
338 | { |
---|
339 | atime[3][0] = 0.0; |
---|
340 | atime[4][0] = 0.0; |
---|
341 | } |
---|
342 | atime[5][0] = total/cray; |
---|
343 | |
---|
344 | print_time(0); |
---|
345 | |
---|
346 | /************************************************************************ |
---|
347 | * Calculate overhead of executing matgen procedure * |
---|
348 | ************************************************************************/ |
---|
349 | |
---|
350 | fprintf (stdout,"\nCalculating matgen overhead\n"); |
---|
351 | pass = -20; |
---|
352 | loop = NTIMES; |
---|
353 | do |
---|
354 | { |
---|
355 | time1 = second(); |
---|
356 | pass = pass + 1; |
---|
357 | for ( i = 0 ; i < loop ; i++) |
---|
358 | { |
---|
359 | matgen(a,lda,n,b,&norma); |
---|
360 | } |
---|
361 | time2 = second(); |
---|
362 | overhead1 = (time2 - time1); |
---|
363 | fprintf (stdout,"%10d times %6.2f seconds\n", loop, overhead1); |
---|
364 | if (overhead1 > 5.0) |
---|
365 | { |
---|
366 | pass = 0; |
---|
367 | } |
---|
368 | if (pass < 0) |
---|
369 | { |
---|
370 | if (overhead1 < 0.1) |
---|
371 | { |
---|
372 | loop = loop * 10; |
---|
373 | } |
---|
374 | else |
---|
375 | { |
---|
376 | loop = loop * 2; |
---|
377 | } |
---|
378 | } |
---|
379 | } |
---|
380 | while (pass < 0); |
---|
381 | |
---|
382 | overhead1 = overhead1 / (double)loop; |
---|
383 | |
---|
384 | fprintf (stdout,"Overhead for 1 matgen %12.5f seconds\n\n", overhead1); |
---|
385 | |
---|
386 | /************************************************************************ |
---|
387 | * Calculate matgen/dgefa passes for 5 seconds * |
---|
388 | ************************************************************************/ |
---|
389 | |
---|
390 | fprintf (stdout,"Calculating matgen/dgefa passes for 5 seconds\n"); |
---|
391 | pass = -20; |
---|
392 | ntimes = NTIMES; |
---|
393 | do |
---|
394 | { |
---|
395 | time1 = second(); |
---|
396 | pass = pass + 1; |
---|
397 | for ( i = 0 ; i < ntimes ; i++) |
---|
398 | { |
---|
399 | matgen(a,lda,n,b,&norma); |
---|
400 | dgefa(a,lda,n,ipvt,&info ); |
---|
401 | } |
---|
402 | time2 = second() - time1; |
---|
403 | fprintf (stdout,"%10d times %6.2f seconds\n", ntimes, time2); |
---|
404 | if (time2 > 5.0) |
---|
405 | { |
---|
406 | pass = 0; |
---|
407 | } |
---|
408 | if (pass < 0) |
---|
409 | { |
---|
410 | if (time2 < 0.1) |
---|
411 | { |
---|
412 | ntimes = ntimes * 10; |
---|
413 | } |
---|
414 | else |
---|
415 | { |
---|
416 | ntimes = ntimes * 2; |
---|
417 | } |
---|
418 | } |
---|
419 | } |
---|
420 | while (pass < 0); |
---|
421 | |
---|
422 | ntimes = 5.0 * (double)ntimes / time2; |
---|
423 | if (ntimes == 0) ntimes = 1; |
---|
424 | |
---|
425 | fprintf (stdout,"Passes used %10d \n\n", ntimes); |
---|
426 | fprintf(stdout,"Times for array with leading dimension of%4d\n\n",lda); |
---|
427 | fprintf(stdout," dgefa dgesl total Mflops unit"); |
---|
428 | fprintf(stdout," ratio\n"); |
---|
429 | |
---|
430 | /************************************************************************ |
---|
431 | * Execute 5 passes * |
---|
432 | ************************************************************************/ |
---|
433 | |
---|
434 | tm2 = ntimes * overhead1; |
---|
435 | atime[3][6] = 0; |
---|
436 | |
---|
437 | for (j=1 ; j<6 ; j++) |
---|
438 | { |
---|
439 | |
---|
440 | t1 = second(); |
---|
441 | |
---|
442 | for (i = 0; i < ntimes; i++) |
---|
443 | { |
---|
444 | matgen(a,lda,n,b,&norma); |
---|
445 | dgefa(a,lda,n,ipvt,&info ); |
---|
446 | } |
---|
447 | |
---|
448 | atime[0][j] = (second() - t1 - tm2)/ntimes; |
---|
449 | |
---|
450 | t1 = second(); |
---|
451 | |
---|
452 | for (i = 0; i < ntimes; i++) |
---|
453 | { |
---|
454 | dgesl(a,lda,n,ipvt,b,0); |
---|
455 | } |
---|
456 | |
---|
457 | atime[1][j] = (second() - t1)/ntimes; |
---|
458 | total = atime[0][j] + atime[1][j]; |
---|
459 | atime[2][j] = total; |
---|
460 | atime[3][j] = ops/(1.0e6*total); |
---|
461 | atime[4][j] = 2.0/atime[3][j]; |
---|
462 | atime[5][j] = total/cray; |
---|
463 | atime[3][6] = atime[3][6] + atime[3][j]; |
---|
464 | |
---|
465 | print_time(j); |
---|
466 | } |
---|
467 | atime[3][6] = atime[3][6] / 5.0; |
---|
468 | fprintf (stdout,"Average %11.2f\n", |
---|
469 | (double)atime[3][6]); |
---|
470 | |
---|
471 | fprintf (stdout,"\nCalculating matgen2 overhead\n"); |
---|
472 | |
---|
473 | /************************************************************************ |
---|
474 | * Calculate overhead of executing matgen procedure * |
---|
475 | ************************************************************************/ |
---|
476 | |
---|
477 | time1 = second(); |
---|
478 | for ( i = 0 ; i < loop ; i++) |
---|
479 | { |
---|
480 | matgen(aa,ldaa,n,b,&norma); |
---|
481 | } |
---|
482 | time2 = second(); |
---|
483 | overhead2 = (time2 - time1); |
---|
484 | overhead2 = overhead2 / (double)loop; |
---|
485 | |
---|
486 | fprintf (stdout,"Overhead for 1 matgen %12.5f seconds\n\n", overhead2); |
---|
487 | fprintf(stdout,"Times for array with leading dimension of%4d\n\n",ldaa); |
---|
488 | fprintf(stdout," dgefa dgesl total Mflops unit"); |
---|
489 | fprintf(stdout," ratio\n"); |
---|
490 | |
---|
491 | /************************************************************************ |
---|
492 | * Execute 5 passes * |
---|
493 | ************************************************************************/ |
---|
494 | |
---|
495 | tm2 = ntimes * overhead2; |
---|
496 | atime[3][12] = 0; |
---|
497 | |
---|
498 | for (j=7 ; j<12 ; j++) |
---|
499 | { |
---|
500 | |
---|
501 | t1 = second(); |
---|
502 | |
---|
503 | for (i = 0; i < ntimes; i++) |
---|
504 | { |
---|
505 | matgen(aa,ldaa,n,b,&norma); |
---|
506 | dgefa(aa,ldaa,n,ipvt,&info ); |
---|
507 | } |
---|
508 | |
---|
509 | atime[0][j] = (second() - t1 - tm2)/ntimes; |
---|
510 | |
---|
511 | t1 = second(); |
---|
512 | |
---|
513 | for (i = 0; i < ntimes; i++) |
---|
514 | { |
---|
515 | dgesl(aa,ldaa,n,ipvt,b,0); |
---|
516 | } |
---|
517 | |
---|
518 | atime[1][j] = (second() - t1)/ntimes; |
---|
519 | total = atime[0][j] + atime[1][j]; |
---|
520 | atime[2][j] = total; |
---|
521 | atime[3][j] = ops/(1.0e6*total); |
---|
522 | atime[4][j] = 2.0/atime[3][j]; |
---|
523 | atime[5][j] = total/cray; |
---|
524 | atime[3][12] = atime[3][12] + atime[3][j]; |
---|
525 | |
---|
526 | print_time(j); |
---|
527 | } |
---|
528 | atime[3][12] = atime[3][12] / 5.0; |
---|
529 | fprintf (stdout,"Average %11.2f\n", |
---|
530 | (double)atime[3][12]); |
---|
531 | |
---|
532 | /************************************************************************ |
---|
533 | * Use minimum average as overall Mflops rating * |
---|
534 | ************************************************************************/ |
---|
535 | |
---|
536 | mflops = atime[3][6]; |
---|
537 | if (atime[3][12] < mflops) mflops = atime[3][12]; |
---|
538 | |
---|
539 | fprintf(stdout,"\n"); |
---|
540 | fprintf(stdout,ROLLING);fprintf(stdout,PREC); |
---|
541 | fprintf(stdout," Precision %11.2f Mflops \n\n",mflops); |
---|
542 | #ifdef __rtems__ |
---|
543 | return 0; |
---|
544 | #endif |
---|
545 | } |
---|
546 | |
---|
547 | /*----------------------*/ |
---|
548 | void print_time (int row) |
---|
549 | |
---|
550 | { |
---|
551 | fprintf(stdout,"%11.5f%11.5f%11.5f%11.2f%11.4f%11.4f\n", (double)atime[0][row], |
---|
552 | (double)atime[1][row], (double)atime[2][row], (double)atime[3][row], |
---|
553 | (double)atime[4][row], (double)atime[5][row]); |
---|
554 | return; |
---|
555 | } |
---|
556 | |
---|
557 | /*----------------------*/ |
---|
558 | |
---|
559 | void matgen (REAL a[], int lda, int n, REAL b[], REAL *norma) |
---|
560 | |
---|
561 | |
---|
562 | /* We would like to declare a[][lda], but c does not allow it. In this |
---|
563 | function, references to a[i][j] are written a[lda*i+j]. */ |
---|
564 | |
---|
565 | { |
---|
566 | int init, i, j; |
---|
567 | |
---|
568 | init = 1325; |
---|
569 | *norma = 0.0; |
---|
570 | for (j = 0; j < n; j++) { |
---|
571 | for (i = 0; i < n; i++) { |
---|
572 | init = 3125*init % 65536; |
---|
573 | a[lda*j+i] = (init - 32768.0)/16384.0; |
---|
574 | *norma = (a[lda*j+i] > *norma) ? a[lda*j+i] : *norma; |
---|
575 | |
---|
576 | /* alternative for some compilers |
---|
577 | if (fabs(a[lda*j+i]) > *norma) *norma = fabs(a[lda*j+i]); |
---|
578 | */ |
---|
579 | } |
---|
580 | } |
---|
581 | for (i = 0; i < n; i++) { |
---|
582 | b[i] = 0.0; |
---|
583 | } |
---|
584 | for (j = 0; j < n; j++) { |
---|
585 | for (i = 0; i < n; i++) { |
---|
586 | b[i] = b[i] + a[lda*j+i]; |
---|
587 | } |
---|
588 | } |
---|
589 | return; |
---|
590 | } |
---|
591 | |
---|
592 | /*----------------------*/ |
---|
593 | void dgefa(REAL a[], int lda, int n, int ipvt[], int *info) |
---|
594 | |
---|
595 | |
---|
596 | /* We would like to declare a[][lda], but c does not allow it. In this |
---|
597 | function, references to a[i][j] are written a[lda*i+j]. */ |
---|
598 | /* |
---|
599 | dgefa factors a double precision matrix by gaussian elimination. |
---|
600 | |
---|
601 | dgefa is usually called by dgeco, but it can be called |
---|
602 | directly with a saving in time if rcond is not needed. |
---|
603 | (time for dgeco) = (1 + 9/n)*(time for dgefa) . |
---|
604 | |
---|
605 | on entry |
---|
606 | |
---|
607 | a REAL precision[n][lda] |
---|
608 | the matrix to be factored. |
---|
609 | |
---|
610 | lda integer |
---|
611 | the leading dimension of the array a . |
---|
612 | |
---|
613 | n integer |
---|
614 | the order of the matrix a . |
---|
615 | |
---|
616 | on return |
---|
617 | |
---|
618 | a an upper triangular matrix and the multipliers |
---|
619 | which were used to obtain it. |
---|
620 | the factorization can be written a = l*u where |
---|
621 | l is a product of permutation and unit lower |
---|
622 | triangular matrices and u is upper triangular. |
---|
623 | |
---|
624 | ipvt integer[n] |
---|
625 | an integer vector of pivot indices. |
---|
626 | |
---|
627 | info integer |
---|
628 | = 0 normal value. |
---|
629 | = k if u[k][k] .eq. 0.0 . this is not an error |
---|
630 | condition for this subroutine, but it does |
---|
631 | indicate that dgesl or dgedi will divide by zero |
---|
632 | if called. use rcond in dgeco for a reliable |
---|
633 | indication of singularity. |
---|
634 | |
---|
635 | linpack. this version dated 08/14/78 . |
---|
636 | cleve moler, university of new mexico, argonne national lab. |
---|
637 | |
---|
638 | functions |
---|
639 | |
---|
640 | blas daxpy,dscal,idamax |
---|
641 | */ |
---|
642 | |
---|
643 | { |
---|
644 | /* internal variables */ |
---|
645 | |
---|
646 | REAL t; |
---|
647 | int j,k,kp1,l,nm1; |
---|
648 | |
---|
649 | |
---|
650 | /* gaussian elimination with partial pivoting */ |
---|
651 | |
---|
652 | *info = 0; |
---|
653 | nm1 = n - 1; |
---|
654 | if (nm1 >= 0) { |
---|
655 | for (k = 0; k < nm1; k++) { |
---|
656 | kp1 = k + 1; |
---|
657 | |
---|
658 | /* find l = pivot index */ |
---|
659 | |
---|
660 | l = idamax(n-k,&a[lda*k+k],1) + k; |
---|
661 | ipvt[k] = l; |
---|
662 | |
---|
663 | /* zero pivot implies this column already |
---|
664 | triangularized */ |
---|
665 | |
---|
666 | if (a[lda*k+l] != ZERO) { |
---|
667 | |
---|
668 | /* interchange if necessary */ |
---|
669 | |
---|
670 | if (l != k) { |
---|
671 | t = a[lda*k+l]; |
---|
672 | a[lda*k+l] = a[lda*k+k]; |
---|
673 | a[lda*k+k] = t; |
---|
674 | } |
---|
675 | |
---|
676 | /* compute multipliers */ |
---|
677 | |
---|
678 | t = -ONE/a[lda*k+k]; |
---|
679 | dscal(n-(k+1),t,&a[lda*k+k+1],1); |
---|
680 | |
---|
681 | /* row elimination with column indexing */ |
---|
682 | |
---|
683 | for (j = kp1; j < n; j++) { |
---|
684 | t = a[lda*j+l]; |
---|
685 | if (l != k) { |
---|
686 | a[lda*j+l] = a[lda*j+k]; |
---|
687 | a[lda*j+k] = t; |
---|
688 | } |
---|
689 | daxpy(n-(k+1),t,&a[lda*k+k+1],1, |
---|
690 | &a[lda*j+k+1],1); |
---|
691 | } |
---|
692 | } |
---|
693 | else { |
---|
694 | *info = k; |
---|
695 | } |
---|
696 | } |
---|
697 | } |
---|
698 | ipvt[n-1] = n-1; |
---|
699 | if (a[lda*(n-1)+(n-1)] == ZERO) *info = n-1; |
---|
700 | return; |
---|
701 | } |
---|
702 | |
---|
703 | /*----------------------*/ |
---|
704 | |
---|
705 | void dgesl(REAL a[],int lda,int n,int ipvt[],REAL b[],int job ) |
---|
706 | |
---|
707 | |
---|
708 | /* We would like to declare a[][lda], but c does not allow it. In this |
---|
709 | function, references to a[i][j] are written a[lda*i+j]. */ |
---|
710 | |
---|
711 | /* |
---|
712 | dgesl solves the double precision system |
---|
713 | a * x = b or trans(a) * x = b |
---|
714 | using the factors computed by dgeco or dgefa. |
---|
715 | |
---|
716 | on entry |
---|
717 | |
---|
718 | a double precision[n][lda] |
---|
719 | the output from dgeco or dgefa. |
---|
720 | |
---|
721 | lda integer |
---|
722 | the leading dimension of the array a . |
---|
723 | |
---|
724 | n integer |
---|
725 | the order of the matrix a . |
---|
726 | |
---|
727 | ipvt integer[n] |
---|
728 | the pivot vector from dgeco or dgefa. |
---|
729 | |
---|
730 | b double precision[n] |
---|
731 | the right hand side vector. |
---|
732 | |
---|
733 | job integer |
---|
734 | = 0 to solve a*x = b , |
---|
735 | = nonzero to solve trans(a)*x = b where |
---|
736 | trans(a) is the transpose. |
---|
737 | |
---|
738 | on return |
---|
739 | |
---|
740 | b the solution vector x . |
---|
741 | |
---|
742 | error condition |
---|
743 | |
---|
744 | a division by zero will occur if the input factor contains a |
---|
745 | zero on the diagonal. technically this indicates singularity |
---|
746 | but it is often caused by improper arguments or improper |
---|
747 | setting of lda . it will not occur if the subroutines are |
---|
748 | called correctly and if dgeco has set rcond .gt. 0.0 |
---|
749 | or dgefa has set info .eq. 0 . |
---|
750 | |
---|
751 | to compute inverse(a) * c where c is a matrix |
---|
752 | with p columns |
---|
753 | dgeco(a,lda,n,ipvt,rcond,z) |
---|
754 | if (!rcond is too small){ |
---|
755 | for (j=0,j<p,j++) |
---|
756 | dgesl(a,lda,n,ipvt,c[j][0],0); |
---|
757 | } |
---|
758 | |
---|
759 | linpack. this version dated 08/14/78 . |
---|
760 | cleve moler, university of new mexico, argonne national lab. |
---|
761 | |
---|
762 | functions |
---|
763 | |
---|
764 | blas daxpy,ddot |
---|
765 | */ |
---|
766 | { |
---|
767 | /* internal variables */ |
---|
768 | |
---|
769 | REAL t; |
---|
770 | int k,kb,l,nm1; |
---|
771 | |
---|
772 | nm1 = n - 1; |
---|
773 | if (job == 0) { |
---|
774 | |
---|
775 | /* job = 0 , solve a * x = b |
---|
776 | first solve l*y = b */ |
---|
777 | |
---|
778 | if (nm1 >= 1) { |
---|
779 | for (k = 0; k < nm1; k++) { |
---|
780 | l = ipvt[k]; |
---|
781 | t = b[l]; |
---|
782 | if (l != k){ |
---|
783 | b[l] = b[k]; |
---|
784 | b[k] = t; |
---|
785 | } |
---|
786 | daxpy(n-(k+1),t,&a[lda*k+k+1],1,&b[k+1],1 ); |
---|
787 | } |
---|
788 | } |
---|
789 | |
---|
790 | /* now solve u*x = y */ |
---|
791 | |
---|
792 | for (kb = 0; kb < n; kb++) { |
---|
793 | k = n - (kb + 1); |
---|
794 | b[k] = b[k]/a[lda*k+k]; |
---|
795 | t = -b[k]; |
---|
796 | daxpy(k,t,&a[lda*k+0],1,&b[0],1 ); |
---|
797 | } |
---|
798 | } |
---|
799 | else { |
---|
800 | |
---|
801 | /* job = nonzero, solve trans(a) * x = b |
---|
802 | first solve trans(u)*y = b */ |
---|
803 | |
---|
804 | for (k = 0; k < n; k++) { |
---|
805 | t = ddot(k,&a[lda*k+0],1,&b[0],1); |
---|
806 | b[k] = (b[k] - t)/a[lda*k+k]; |
---|
807 | } |
---|
808 | |
---|
809 | /* now solve trans(l)*x = y */ |
---|
810 | |
---|
811 | if (nm1 >= 1) { |
---|
812 | for (kb = 1; kb < nm1; kb++) { |
---|
813 | k = n - (kb+1); |
---|
814 | b[k] = b[k] + ddot(n-(k+1),&a[lda*k+k+1],1,&b[k+1],1); |
---|
815 | l = ipvt[k]; |
---|
816 | if (l != k) { |
---|
817 | t = b[l]; |
---|
818 | b[l] = b[k]; |
---|
819 | b[k] = t; |
---|
820 | } |
---|
821 | } |
---|
822 | } |
---|
823 | } |
---|
824 | return; |
---|
825 | } |
---|
826 | |
---|
827 | /*----------------------*/ |
---|
828 | |
---|
829 | void daxpy(int n, REAL da, REAL dx[], int incx, REAL dy[], int incy) |
---|
830 | /* |
---|
831 | constant times a vector plus a vector. |
---|
832 | jack dongarra, linpack, 3/11/78. |
---|
833 | */ |
---|
834 | |
---|
835 | { |
---|
836 | int i,ix,iy; |
---|
837 | |
---|
838 | if(n <= 0) return; |
---|
839 | if (da == ZERO) return; |
---|
840 | |
---|
841 | if(incx != 1 || incy != 1) { |
---|
842 | |
---|
843 | /* code for unequal increments or equal increments |
---|
844 | not equal to 1 */ |
---|
845 | |
---|
846 | ix = 0; |
---|
847 | iy = 0; |
---|
848 | if(incx < 0) ix = (-n+1)*incx; |
---|
849 | if(incy < 0)iy = (-n+1)*incy; |
---|
850 | for (i = 0;i < n; i++) { |
---|
851 | dy[iy] = dy[iy] + da*dx[ix]; |
---|
852 | ix = ix + incx; |
---|
853 | iy = iy + incy; |
---|
854 | |
---|
855 | } |
---|
856 | return; |
---|
857 | } |
---|
858 | |
---|
859 | /* code for both increments equal to 1 */ |
---|
860 | |
---|
861 | |
---|
862 | #ifdef ROLL |
---|
863 | |
---|
864 | for (i = 0;i < n; i++) { |
---|
865 | dy[i] = dy[i] + da*dx[i]; |
---|
866 | } |
---|
867 | |
---|
868 | |
---|
869 | #endif |
---|
870 | |
---|
871 | #ifdef UNROLL |
---|
872 | |
---|
873 | m = n % 4; |
---|
874 | if ( m != 0) { |
---|
875 | for (i = 0; i < m; i++) |
---|
876 | dy[i] = dy[i] + da*dx[i]; |
---|
877 | |
---|
878 | if (n < 4) return; |
---|
879 | } |
---|
880 | for (i = m; i < n; i = i + 4) { |
---|
881 | dy[i] = dy[i] + da*dx[i]; |
---|
882 | dy[i+1] = dy[i+1] + da*dx[i+1]; |
---|
883 | dy[i+2] = dy[i+2] + da*dx[i+2]; |
---|
884 | dy[i+3] = dy[i+3] + da*dx[i+3]; |
---|
885 | |
---|
886 | } |
---|
887 | |
---|
888 | #endif |
---|
889 | return; |
---|
890 | } |
---|
891 | |
---|
892 | /*----------------------*/ |
---|
893 | |
---|
894 | REAL ddot(int n, REAL dx[], int incx, REAL dy[], int incy) |
---|
895 | /* |
---|
896 | forms the dot product of two vectors. |
---|
897 | jack dongarra, linpack, 3/11/78. |
---|
898 | */ |
---|
899 | |
---|
900 | { |
---|
901 | REAL dtemp; |
---|
902 | int i,ix,iy; |
---|
903 | |
---|
904 | dtemp = ZERO; |
---|
905 | |
---|
906 | if(n <= 0) return(ZERO); |
---|
907 | |
---|
908 | if(incx != 1 || incy != 1) { |
---|
909 | |
---|
910 | /* code for unequal increments or equal increments |
---|
911 | not equal to 1 */ |
---|
912 | |
---|
913 | ix = 0; |
---|
914 | iy = 0; |
---|
915 | if (incx < 0) ix = (-n+1)*incx; |
---|
916 | if (incy < 0) iy = (-n+1)*incy; |
---|
917 | for (i = 0;i < n; i++) { |
---|
918 | dtemp = dtemp + dx[ix]*dy[iy]; |
---|
919 | ix = ix + incx; |
---|
920 | iy = iy + incy; |
---|
921 | |
---|
922 | } |
---|
923 | return(dtemp); |
---|
924 | } |
---|
925 | |
---|
926 | /* code for both increments equal to 1 */ |
---|
927 | |
---|
928 | |
---|
929 | #ifdef ROLL |
---|
930 | |
---|
931 | for (i=0;i < n; i++) |
---|
932 | dtemp = dtemp + dx[i]*dy[i]; |
---|
933 | |
---|
934 | return(dtemp); |
---|
935 | |
---|
936 | #endif |
---|
937 | |
---|
938 | #ifdef UNROLL |
---|
939 | |
---|
940 | |
---|
941 | m = n % 5; |
---|
942 | if (m != 0) { |
---|
943 | for (i = 0; i < m; i++) |
---|
944 | dtemp = dtemp + dx[i]*dy[i]; |
---|
945 | if (n < 5) return(dtemp); |
---|
946 | } |
---|
947 | for (i = m; i < n; i = i + 5) { |
---|
948 | dtemp = dtemp + dx[i]*dy[i] + |
---|
949 | dx[i+1]*dy[i+1] + dx[i+2]*dy[i+2] + |
---|
950 | dx[i+3]*dy[i+3] + dx[i+4]*dy[i+4]; |
---|
951 | } |
---|
952 | return(dtemp); |
---|
953 | |
---|
954 | #endif |
---|
955 | |
---|
956 | } |
---|
957 | |
---|
958 | /*----------------------*/ |
---|
959 | void dscal(int n, REAL da, REAL dx[], int incx) |
---|
960 | |
---|
961 | /* scales a vector by a constant. |
---|
962 | jack dongarra, linpack, 3/11/78. |
---|
963 | */ |
---|
964 | |
---|
965 | { |
---|
966 | int i,nincx; |
---|
967 | |
---|
968 | if(n <= 0)return; |
---|
969 | if(incx != 1) { |
---|
970 | |
---|
971 | /* code for increment not equal to 1 */ |
---|
972 | |
---|
973 | nincx = n*incx; |
---|
974 | for (i = 0; i < nincx; i = i + incx) |
---|
975 | dx[i] = da*dx[i]; |
---|
976 | |
---|
977 | return; |
---|
978 | } |
---|
979 | |
---|
980 | /* code for increment equal to 1 */ |
---|
981 | |
---|
982 | |
---|
983 | #ifdef ROLL |
---|
984 | |
---|
985 | for (i = 0; i < n; i++) |
---|
986 | dx[i] = da*dx[i]; |
---|
987 | |
---|
988 | |
---|
989 | #endif |
---|
990 | |
---|
991 | #ifdef UNROLL |
---|
992 | |
---|
993 | |
---|
994 | m = n % 5; |
---|
995 | if (m != 0) { |
---|
996 | for (i = 0; i < m; i++) |
---|
997 | dx[i] = da*dx[i]; |
---|
998 | if (n < 5) return; |
---|
999 | } |
---|
1000 | for (i = m; i < n; i = i + 5){ |
---|
1001 | dx[i] = da*dx[i]; |
---|
1002 | dx[i+1] = da*dx[i+1]; |
---|
1003 | dx[i+2] = da*dx[i+2]; |
---|
1004 | dx[i+3] = da*dx[i+3]; |
---|
1005 | dx[i+4] = da*dx[i+4]; |
---|
1006 | } |
---|
1007 | |
---|
1008 | #endif |
---|
1009 | |
---|
1010 | } |
---|
1011 | |
---|
1012 | /*----------------------*/ |
---|
1013 | int idamax(int n, REAL dx[], int incx) |
---|
1014 | |
---|
1015 | /* |
---|
1016 | finds the index of element having max. absolute value. |
---|
1017 | jack dongarra, linpack, 3/11/78. |
---|
1018 | */ |
---|
1019 | |
---|
1020 | |
---|
1021 | { |
---|
1022 | REAL dmax; |
---|
1023 | int i, ix, itemp; |
---|
1024 | |
---|
1025 | if( n < 1 ) return(-1); |
---|
1026 | if(n ==1 ) return(0); |
---|
1027 | itemp = -1; |
---|
1028 | if(incx != 1) { |
---|
1029 | |
---|
1030 | /* code for increment not equal to 1 */ |
---|
1031 | |
---|
1032 | ix = 1; |
---|
1033 | dmax = fabs((double)dx[0]); |
---|
1034 | ix = ix + incx; |
---|
1035 | for (i = 1; i < n; i++) { |
---|
1036 | if(fabs((double)dx[ix]) > dmax) { |
---|
1037 | itemp = i; |
---|
1038 | dmax = fabs((double)dx[ix]); |
---|
1039 | } |
---|
1040 | ix = ix + incx; |
---|
1041 | } |
---|
1042 | } |
---|
1043 | else { |
---|
1044 | |
---|
1045 | /* code for increment equal to 1 */ |
---|
1046 | |
---|
1047 | itemp = 0; |
---|
1048 | dmax = fabs((double)dx[0]); |
---|
1049 | for (i = 1; i < n; i++) { |
---|
1050 | if(fabs((double)dx[i]) > dmax) { |
---|
1051 | itemp = i; |
---|
1052 | dmax = fabs((double)dx[i]); |
---|
1053 | } |
---|
1054 | } |
---|
1055 | } |
---|
1056 | return (itemp); |
---|
1057 | } |
---|
1058 | |
---|
1059 | /*----------------------*/ |
---|
1060 | REAL epslon (REAL x) |
---|
1061 | |
---|
1062 | /* |
---|
1063 | estimate unit roundoff in quantities of size x. |
---|
1064 | */ |
---|
1065 | |
---|
1066 | { |
---|
1067 | REAL a,b,c,eps; |
---|
1068 | /* |
---|
1069 | this program should function properly on all systems |
---|
1070 | satisfying the following two assumptions, |
---|
1071 | 1. the base used in representing dfloating point |
---|
1072 | numbers is not a power of three. |
---|
1073 | 2. the quantity a in statement 10 is represented to |
---|
1074 | the accuracy used in dfloating point variables |
---|
1075 | that are stored in memory. |
---|
1076 | the statement number 10 and the go to 10 are intended to |
---|
1077 | force optimizing compilers to generate code satisfying |
---|
1078 | assumption 2. |
---|
1079 | under these assumptions, it should be true that, |
---|
1080 | a is not exactly equal to four-thirds, |
---|
1081 | b has a zero for its last bit or digit, |
---|
1082 | c is not exactly equal to one, |
---|
1083 | eps measures the separation of 1.0 from |
---|
1084 | the next larger dfloating point number. |
---|
1085 | the developers of eispack would appreciate being informed |
---|
1086 | about any systems where these assumptions do not hold. |
---|
1087 | |
---|
1088 | ***************************************************************** |
---|
1089 | this routine is one of the auxiliary routines used by eispack iii |
---|
1090 | to avoid machine dependencies. |
---|
1091 | ***************************************************************** |
---|
1092 | |
---|
1093 | this version dated 4/6/83. |
---|
1094 | */ |
---|
1095 | |
---|
1096 | a = 4.0e0/3.0e0; |
---|
1097 | eps = ZERO; |
---|
1098 | while (eps == ZERO) { |
---|
1099 | b = a - ONE; |
---|
1100 | c = b + b + b; |
---|
1101 | eps = fabs((double)(c-ONE)); |
---|
1102 | } |
---|
1103 | return(eps*fabs((double)x)); |
---|
1104 | } |
---|
1105 | |
---|
1106 | /*----------------------*/ |
---|
1107 | void dmxpy (int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[]) |
---|
1108 | |
---|
1109 | |
---|
1110 | /* We would like to declare m[][ldm], but c does not allow it. In this |
---|
1111 | function, references to m[i][j] are written m[ldm*i+j]. */ |
---|
1112 | |
---|
1113 | /* |
---|
1114 | purpose: |
---|
1115 | multiply matrix m times vector x and add the result to vector y. |
---|
1116 | |
---|
1117 | parameters: |
---|
1118 | |
---|
1119 | n1 integer, number of elements in vector y, and number of rows in |
---|
1120 | matrix m |
---|
1121 | |
---|
1122 | y double [n1], vector of length n1 to which is added |
---|
1123 | the product m*x |
---|
1124 | |
---|
1125 | n2 integer, number of elements in vector x, and number of columns |
---|
1126 | in matrix m |
---|
1127 | |
---|
1128 | ldm integer, leading dimension of array m |
---|
1129 | |
---|
1130 | x double [n2], vector of length n2 |
---|
1131 | |
---|
1132 | m double [ldm][n2], matrix of n1 rows and n2 columns |
---|
1133 | |
---|
1134 | ---------------------------------------------------------------------- |
---|
1135 | */ |
---|
1136 | { |
---|
1137 | int j,i,jmin; |
---|
1138 | /* cleanup odd vector */ |
---|
1139 | |
---|
1140 | j = n2 % 2; |
---|
1141 | if (j >= 1) { |
---|
1142 | j = j - 1; |
---|
1143 | for (i = 0; i < n1; i++) |
---|
1144 | y[i] = (y[i]) + x[j]*m[ldm*j+i]; |
---|
1145 | } |
---|
1146 | |
---|
1147 | /* cleanup odd group of two vectors */ |
---|
1148 | |
---|
1149 | j = n2 % 4; |
---|
1150 | if (j >= 2) { |
---|
1151 | j = j - 1; |
---|
1152 | for (i = 0; i < n1; i++) |
---|
1153 | y[i] = ( (y[i]) |
---|
1154 | + x[j-1]*m[ldm*(j-1)+i]) + x[j]*m[ldm*j+i]; |
---|
1155 | } |
---|
1156 | |
---|
1157 | /* cleanup odd group of four vectors */ |
---|
1158 | |
---|
1159 | j = n2 % 8; |
---|
1160 | if (j >= 4) { |
---|
1161 | j = j - 1; |
---|
1162 | for (i = 0; i < n1; i++) |
---|
1163 | y[i] = ((( (y[i]) |
---|
1164 | + x[j-3]*m[ldm*(j-3)+i]) |
---|
1165 | + x[j-2]*m[ldm*(j-2)+i]) |
---|
1166 | + x[j-1]*m[ldm*(j-1)+i]) + x[j]*m[ldm*j+i]; |
---|
1167 | } |
---|
1168 | |
---|
1169 | /* cleanup odd group of eight vectors */ |
---|
1170 | |
---|
1171 | j = n2 % 16; |
---|
1172 | if (j >= 8) { |
---|
1173 | j = j - 1; |
---|
1174 | for (i = 0; i < n1; i++) |
---|
1175 | y[i] = ((((((( (y[i]) |
---|
1176 | + x[j-7]*m[ldm*(j-7)+i]) + x[j-6]*m[ldm*(j-6)+i]) |
---|
1177 | + x[j-5]*m[ldm*(j-5)+i]) + x[j-4]*m[ldm*(j-4)+i]) |
---|
1178 | + x[j-3]*m[ldm*(j-3)+i]) + x[j-2]*m[ldm*(j-2)+i]) |
---|
1179 | + x[j-1]*m[ldm*(j-1)+i]) + x[j] *m[ldm*j+i]; |
---|
1180 | } |
---|
1181 | |
---|
1182 | /* main loop - groups of sixteen vectors */ |
---|
1183 | |
---|
1184 | jmin = (n2%16)+16; |
---|
1185 | for (j = jmin-1; j < n2; j = j + 16) { |
---|
1186 | for (i = 0; i < n1; i++) |
---|
1187 | y[i] = ((((((((((((((( (y[i]) |
---|
1188 | + x[j-15]*m[ldm*(j-15)+i]) |
---|
1189 | + x[j-14]*m[ldm*(j-14)+i]) |
---|
1190 | + x[j-13]*m[ldm*(j-13)+i]) |
---|
1191 | + x[j-12]*m[ldm*(j-12)+i]) |
---|
1192 | + x[j-11]*m[ldm*(j-11)+i]) |
---|
1193 | + x[j-10]*m[ldm*(j-10)+i]) |
---|
1194 | + x[j- 9]*m[ldm*(j- 9)+i]) |
---|
1195 | + x[j- 8]*m[ldm*(j- 8)+i]) |
---|
1196 | + x[j- 7]*m[ldm*(j- 7)+i]) |
---|
1197 | + x[j- 6]*m[ldm*(j- 6)+i]) |
---|
1198 | + x[j- 5]*m[ldm*(j- 5)+i]) |
---|
1199 | + x[j- 4]*m[ldm*(j- 4)+i]) |
---|
1200 | + x[j- 3]*m[ldm*(j- 3)+i]) |
---|
1201 | + x[j- 2]*m[ldm*(j- 2)+i]) |
---|
1202 | + x[j- 1]*m[ldm*(j- 1)+i]) |
---|
1203 | + x[j] *m[ldm*j+i]; |
---|
1204 | } |
---|
1205 | return; |
---|
1206 | } |
---|
1207 | |
---|
1208 | /*----------------------*/ |
---|