source: rtems-graphics-toolkit/jpeg-7/jidctint.c @ 2abf143

base initial
Last change on this file since 2abf143 was 2abf143, checked in by Joel Sherrill <joel.sherrill@…>, on 12/20/09 at 00:59:55

Initial import of jpeg and tiff libraries.

  • Property mode set to 100644
File size: 177.6 KB
Line 
1/*
2 * jidctint.c
3 *
4 * Copyright (C) 1991-1998, Thomas G. Lane.
5 * Modification developed 2002-2009 by Guido Vollbeding.
6 * This file is part of the Independent JPEG Group's software.
7 * For conditions of distribution and use, see the accompanying README file.
8 *
9 * This file contains a slow-but-accurate integer implementation of the
10 * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
11 * must also perform dequantization of the input coefficients.
12 *
13 * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
14 * on each row (or vice versa, but it's more convenient to emit a row at
15 * a time).  Direct algorithms are also available, but they are much more
16 * complex and seem not to be any faster when reduced to code.
17 *
18 * This implementation is based on an algorithm described in
19 *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
20 *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
21 *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
22 * The primary algorithm described there uses 11 multiplies and 29 adds.
23 * We use their alternate method with 12 multiplies and 32 adds.
24 * The advantage of this method is that no data path contains more than one
25 * multiplication; this allows a very simple and accurate implementation in
26 * scaled fixed-point arithmetic, with a minimal number of shifts.
27 *
28 * We also provide IDCT routines with various output sample block sizes for
29 * direct resolution reduction or enlargement and for directly resolving the
30 * common 2x1 and 1x2 subsampling cases without additional resampling: NxN
31 * (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 input DCT block.
32 *
33 * For N<8 we simply take the corresponding low-frequency coefficients of
34 * the 8x8 input DCT block and apply an NxN point IDCT on the sub-block
35 * to yield the downscaled outputs.
36 * This can be seen as direct low-pass downsampling from the DCT domain
37 * point of view rather than the usual spatial domain point of view,
38 * yielding significant computational savings and results at least
39 * as good as common bilinear (averaging) spatial downsampling.
40 *
41 * For N>8 we apply a partial NxN IDCT, treating the 8 input coefficients as
42 * the lower frequencies and assuming the higher frequencies to be zero.
43 * It turns out that the computational effort is similar to the 8x8 IDCT
44 * regarding the output size.
45 * Furthermore, the scaling and descaling is the same for all IDCT sizes.
46 *
47 * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
48 * since there would be too many additional constants to pre-calculate.
49 */
50
51#define JPEG_INTERNALS
52#include "jinclude.h"
53#include "jpeglib.h"
54#include "jdct.h"               /* Private declarations for DCT subsystem */
55
56#ifdef DCT_ISLOW_SUPPORTED
57
58
59/*
60 * This module is specialized to the case DCTSIZE = 8.
61 */
62
63#if DCTSIZE != 8
64  Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
65#endif /* DCTSIZE != 8 */
66
67
68/*
69 * The poop on this scaling stuff is as follows:
70 *
71 * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
72 * larger than the true IDCT outputs.  The final outputs are therefore
73 * a factor of N larger than desired; since N=8 this can be cured by
74 * a simple right shift at the end of the algorithm.  The advantage of
75 * this arrangement is that we save two multiplications per 1-D IDCT,
76 * because the y0 and y4 inputs need not be divided by sqrt(N).
77 *
78 * We have to do addition and subtraction of the integer inputs, which
79 * is no problem, and multiplication by fractional constants, which is
80 * a problem to do in integer arithmetic.  We multiply all the constants
81 * by CONST_SCALE and convert them to integer constants (thus retaining
82 * CONST_BITS bits of precision in the constants).  After doing a
83 * multiplication we have to divide the product by CONST_SCALE, with proper
84 * rounding, to produce the correct output.  This division can be done
85 * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
86 * as long as possible so that partial sums can be added together with
87 * full fractional precision.
88 *
89 * The outputs of the first pass are scaled up by PASS1_BITS bits so that
90 * they are represented to better-than-integral precision.  These outputs
91 * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
92 * with the recommended scaling.  (To scale up 12-bit sample data further, an
93 * intermediate INT32 array would be needed.)
94 *
95 * To avoid overflow of the 32-bit intermediate results in pass 2, we must
96 * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
97 * shows that the values given below are the most effective.
98 */
99
100#if BITS_IN_JSAMPLE == 8
101#define CONST_BITS  13
102#define PASS1_BITS  2
103#else /* BITS_IN_JSAMPLE != 8 */
104#define CONST_BITS  13
105#define PASS1_BITS  1           /* lose a little precision to avoid overflow */
106#endif /* BITS_IN_JSAMPLE == 8 */
107
108/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
109 * causing a lot of useless floating-point operations at run time.
110 * To get around this we use the following pre-calculated constants.
111 * If you change CONST_BITS you may want to add appropriate values.
112 * (With a reasonable C compiler, you can just rely on the FIX() macro...)
113 */
114
115#if CONST_BITS == 13
116#define FIX_0_298631336  ((INT32)  2446)        /* FIX(0.298631336) */
117#define FIX_0_390180644  ((INT32)  3196)        /* FIX(0.390180644) */
118#define FIX_0_541196100  ((INT32)  4433)        /* FIX(0.541196100) */
119#define FIX_0_765366865  ((INT32)  6270)        /* FIX(0.765366865) */
120#define FIX_0_899976223  ((INT32)  7373)        /* FIX(0.899976223) */
121#define FIX_1_175875602  ((INT32)  9633)        /* FIX(1.175875602) */
122#define FIX_1_501321110  ((INT32)  12299)       /* FIX(1.501321110) */
123#define FIX_1_847759065  ((INT32)  15137)       /* FIX(1.847759065) */
124#define FIX_1_961570560  ((INT32)  16069)       /* FIX(1.961570560) */
125#define FIX_2_053119869  ((INT32)  16819)       /* FIX(2.053119869) */
126#define FIX_2_562915447  ((INT32)  20995)       /* FIX(2.562915447) */
127#define FIX_3_072711026  ((INT32)  25172)       /* FIX(3.072711026) */
128#else /* CONST_BITS != 13: no pre-calculated values, fall back to FIX() */
129#define FIX_0_298631336  FIX(0.298631336)
130#define FIX_0_390180644  FIX(0.390180644)
131#define FIX_0_541196100  FIX(0.541196100)
132#define FIX_0_765366865  FIX(0.765366865)
133#define FIX_0_899976223  FIX(0.899976223)
134#define FIX_1_175875602  FIX(1.175875602)
135#define FIX_1_501321110  FIX(1.501321110)
136#define FIX_1_847759065  FIX(1.847759065)
137#define FIX_1_961570560  FIX(1.961570560)
138#define FIX_2_053119869  FIX(2.053119869)
139#define FIX_2_562915447  FIX(2.562915447)
140#define FIX_3_072711026  FIX(3.072711026)
141#endif /* CONST_BITS == 13 */
142
143
144/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
145 * For 8-bit samples with the recommended scaling, all the variable
146 * and constant values involved are no more than 16 bits wide, so a
147 * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
148 * For 12-bit samples, a full 32-bit multiplication will be needed.
149 */
150
151#if BITS_IN_JSAMPLE == 8
152#define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
153#else /* BITS_IN_JSAMPLE != 8: plain multiply of the two operands */
154#define MULTIPLY(var,const)  ((var) * (const))
155#endif /* BITS_IN_JSAMPLE == 8 */
156
157
158/* Dequantize a coefficient by multiplying it by the multiplier-table
159 * entry; produce an int result.  In this module, both inputs and result
160 * are 16 bits or less, so either int or short multiply will work.
161 */
162
163#define DEQUANTIZE(coef,quantval)  (((ISLOW_MULT_TYPE) (coef)) * (quantval))  /* coef is cast to ISLOW_MULT_TYPE before the multiply */
164
165
166/*
167 * Perform dequantization and inverse DCT on one block of coefficients.
168 */
169
170GLOBAL(void)
171jpeg_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
172                 JCOEFPTR coef_block,
173                 JSAMPARRAY output_buf, JDIMENSION output_col)
174{
175  INT32 tmp0, tmp1, tmp2, tmp3;
176  INT32 tmp10, tmp11, tmp12, tmp13;
177  INT32 z1, z2, z3;
178  JCOEFPTR inptr;
179  ISLOW_MULT_TYPE * quantptr;
180  int * wsptr;
181  JSAMPROW outptr;
182  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
183  int ctr;
184  int workspace[DCTSIZE2];      /* buffers data between passes */
185  SHIFT_TEMPS
186
187  /* Pass 1: process columns from input, store into work array. */
188  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
189  /* furthermore, we scale the results by 2**PASS1_BITS. */
190
191  inptr = coef_block;
192  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
193  wsptr = workspace;
194  for (ctr = DCTSIZE; ctr > 0; ctr--) {
195    /* Due to quantization, we will usually find that many of the input
196     * coefficients are zero, especially the AC terms.  We can exploit this
197     * by short-circuiting the IDCT calculation for any column in which all
198     * the AC terms are zero.  In that case each output is equal to the
199     * DC coefficient (with scale factor as needed).
200     * With typical images and quantization tables, half or more of the
201     * column DCT calculations can be simplified this way.
202     */
203
204    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
205        inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
206        inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
207        inptr[DCTSIZE*7] == 0) {
208      /* AC terms all zero */
209      int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
210
211      wsptr[DCTSIZE*0] = dcval;
212      wsptr[DCTSIZE*1] = dcval;
213      wsptr[DCTSIZE*2] = dcval;
214      wsptr[DCTSIZE*3] = dcval;
215      wsptr[DCTSIZE*4] = dcval;
216      wsptr[DCTSIZE*5] = dcval;
217      wsptr[DCTSIZE*6] = dcval;
218      wsptr[DCTSIZE*7] = dcval;
219
220      inptr++;                  /* advance pointers to next column */
221      quantptr++;
222      wsptr++;
223      continue;
224    }
225
226    /* Even part: reverse the even part of the forward DCT. */
227    /* The rotator is sqrt(2)*c(-6). */
228   
229    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
230    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
231
232    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
233    tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
234    tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
235
236    z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
237    z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
238    z2 <<= CONST_BITS;
239    z3 <<= CONST_BITS;
240    /* Add fudge factor here for final descale. */
241    z2 += ONE << (CONST_BITS-PASS1_BITS-1);
242
243    tmp0 = z2 + z3;
244    tmp1 = z2 - z3;
245
246    tmp10 = tmp0 + tmp2;
247    tmp13 = tmp0 - tmp2;
248    tmp11 = tmp1 + tmp3;
249    tmp12 = tmp1 - tmp3;
250
251    /* Odd part per figure 8; the matrix is unitary and hence its
252     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
253     */
254
255    tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
256    tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
257    tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
258    tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
259   
260    z2 = tmp0 + tmp2;
261    z3 = tmp1 + tmp3;
262
263    z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
264    z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
265    z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
266    z2 += z1;
267    z3 += z1;
268
269    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
270    tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
271    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
272    tmp0 += z1 + z2;
273    tmp3 += z1 + z3;
274
275    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
276    tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
277    tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
278    tmp1 += z1 + z3;
279    tmp2 += z1 + z2;
280
281    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
282
283    wsptr[DCTSIZE*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
284    wsptr[DCTSIZE*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
285    wsptr[DCTSIZE*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
286    wsptr[DCTSIZE*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
287    wsptr[DCTSIZE*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
288    wsptr[DCTSIZE*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
289    wsptr[DCTSIZE*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
290    wsptr[DCTSIZE*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
291   
292    inptr++;                    /* advance pointers to next column */
293    quantptr++;
294    wsptr++;
295  }
296
297  /* Pass 2: process rows from work array, store into output array. */
298  /* Note that we must descale the results by a factor of 8 == 2**3, */
299  /* and also undo the PASS1_BITS scaling. */
300
301  wsptr = workspace;
302  for (ctr = 0; ctr < DCTSIZE; ctr++) {
303    outptr = output_buf[ctr] + output_col;
304    /* Rows of zeroes can be exploited in the same way as we did with columns.
305     * However, the column calculation has created many nonzero AC terms, so
306     * the simplification applies less often (typically 5% to 10% of the time).
307     * On machines with very fast multiplication, it's possible that the
308     * test takes more time than it's worth.  In that case this section
309     * may be commented out.
310     */
311
312#ifndef NO_ZERO_ROW_TEST
313    if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
314        wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
315      /* AC terms all zero */
316      JSAMPLE dcval = range_limit[(int) DESCALE((INT32) wsptr[0], PASS1_BITS+3)
317                                  & RANGE_MASK];
318
319      outptr[0] = dcval;
320      outptr[1] = dcval;
321      outptr[2] = dcval;
322      outptr[3] = dcval;
323      outptr[4] = dcval;
324      outptr[5] = dcval;
325      outptr[6] = dcval;
326      outptr[7] = dcval;
327
328      wsptr += DCTSIZE;         /* advance pointer to next row */
329      continue;
330    }
331#endif
332
333    /* Even part: reverse the even part of the forward DCT. */
334    /* The rotator is sqrt(2)*c(-6). */
335   
336    z2 = (INT32) wsptr[2];
337    z3 = (INT32) wsptr[6];
338
339    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
340    tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
341    tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
342
343    /* Add fudge factor here for final descale. */
344    z2 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
345    z3 = (INT32) wsptr[4];
346
347    tmp0 = (z2 + z3) << CONST_BITS;
348    tmp1 = (z2 - z3) << CONST_BITS;
349   
350    tmp10 = tmp0 + tmp2;
351    tmp13 = tmp0 - tmp2;
352    tmp11 = tmp1 + tmp3;
353    tmp12 = tmp1 - tmp3;
354
355    /* Odd part per figure 8; the matrix is unitary and hence its
356     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
357     */
358
359    tmp0 = (INT32) wsptr[7];
360    tmp1 = (INT32) wsptr[5];
361    tmp2 = (INT32) wsptr[3];
362    tmp3 = (INT32) wsptr[1];
363
364    z2 = tmp0 + tmp2;
365    z3 = tmp1 + tmp3;
366
367    z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
368    z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
369    z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
370    z2 += z1;
371    z3 += z1;
372
373    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
374    tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
375    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
376    tmp0 += z1 + z2;
377    tmp3 += z1 + z3;
378
379    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
380    tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
381    tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
382    tmp1 += z1 + z3;
383    tmp2 += z1 + z2;
384
385    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
386
387    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
388                                              CONST_BITS+PASS1_BITS+3)
389                            & RANGE_MASK];
390    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
391                                              CONST_BITS+PASS1_BITS+3)
392                            & RANGE_MASK];
393    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
394                                              CONST_BITS+PASS1_BITS+3)
395                            & RANGE_MASK];
396    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
397                                              CONST_BITS+PASS1_BITS+3)
398                            & RANGE_MASK];
399    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
400                                              CONST_BITS+PASS1_BITS+3)
401                            & RANGE_MASK];
402    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
403                                              CONST_BITS+PASS1_BITS+3)
404                            & RANGE_MASK];
405    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
406                                              CONST_BITS+PASS1_BITS+3)
407                            & RANGE_MASK];
408    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
409                                              CONST_BITS+PASS1_BITS+3)
410                            & RANGE_MASK];
411
412    wsptr += DCTSIZE;           /* advance pointer to next row */
413  }
414}
415
416#ifdef IDCT_SCALING_SUPPORTED
417
418
419/*
420 * Perform dequantization and inverse DCT on one block of coefficients,
421 * producing a 7x7 output block.
422 *
423 * Optimized algorithm with 12 multiplications in the 1-D kernel.
424 * cK represents sqrt(2) * cos(K*pi/14).
425 */
426
427GLOBAL(void)
428jpeg_idct_7x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
429               JCOEFPTR coef_block,
430               JSAMPARRAY output_buf, JDIMENSION output_col)
431{
432  INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13;
433  INT32 z1, z2, z3;
434  JCOEFPTR inptr;
435  ISLOW_MULT_TYPE * quantptr;
436  int * wsptr;
437  JSAMPROW outptr;
438  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
439  int ctr;
440  int workspace[7*7];   /* buffers data between passes */
441  SHIFT_TEMPS
442
443  /* Pass 1: process columns from input, store into work array. */
444
445  inptr = coef_block;
446  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
447  wsptr = workspace;
448  for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
449    /* Even part */
450
451    tmp13 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
452    tmp13 <<= CONST_BITS;
453    /* Add fudge factor here for final descale. */
454    tmp13 += ONE << (CONST_BITS-PASS1_BITS-1);
455
456    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
457    z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
458    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
459
460    tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
461    tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
462    tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
463    tmp0 = z1 + z3;
464    z2 -= tmp0;
465    tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
466    tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
467    tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
468    tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
469
470    /* Odd part */
471
472    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
473    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
474    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
475
476    tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
477    tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
478    tmp0 = tmp1 - tmp2;
479    tmp1 += tmp2;
480    tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
481    tmp1 += tmp2;
482    z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
483    tmp0 += z2;
484    tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
485
486    /* Final output stage */
487
488    wsptr[7*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
489    wsptr[7*6] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
490    wsptr[7*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
491    wsptr[7*5] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
492    wsptr[7*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
493    wsptr[7*4] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
494    wsptr[7*3] = (int) RIGHT_SHIFT(tmp13, CONST_BITS-PASS1_BITS);
495  }
496
497  /* Pass 2: process 7 rows from work array, store into output array. */
498
499  wsptr = workspace;
500  for (ctr = 0; ctr < 7; ctr++) {
501    outptr = output_buf[ctr] + output_col;
502
503    /* Even part */
504
505    /* Add fudge factor here for final descale. */
506    tmp13 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
507    tmp13 <<= CONST_BITS;
508
509    z1 = (INT32) wsptr[2];
510    z2 = (INT32) wsptr[4];
511    z3 = (INT32) wsptr[6];
512
513    tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
514    tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
515    tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
516    tmp0 = z1 + z3;
517    z2 -= tmp0;
518    tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
519    tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
520    tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
521    tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
522
523    /* Odd part */
524
525    z1 = (INT32) wsptr[1];
526    z2 = (INT32) wsptr[3];
527    z3 = (INT32) wsptr[5];
528
529    tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
530    tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
531    tmp0 = tmp1 - tmp2;
532    tmp1 += tmp2;
533    tmp2 = MULTIPLY(z2 + z3, - FIX(1.378756276));    /* -c1 */
534    tmp1 += tmp2;
535    z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
536    tmp0 += z2;
537    tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
538
539    /* Final output stage */
540
541    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
542                                              CONST_BITS+PASS1_BITS+3)
543                            & RANGE_MASK];
544    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
545                                              CONST_BITS+PASS1_BITS+3)
546                            & RANGE_MASK];
547    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
548                                              CONST_BITS+PASS1_BITS+3)
549                            & RANGE_MASK];
550    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
551                                              CONST_BITS+PASS1_BITS+3)
552                            & RANGE_MASK];
553    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
554                                              CONST_BITS+PASS1_BITS+3)
555                            & RANGE_MASK];
556    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
557                                              CONST_BITS+PASS1_BITS+3)
558                            & RANGE_MASK];
559    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13,
560                                              CONST_BITS+PASS1_BITS+3)
561                            & RANGE_MASK];
562
563    wsptr += 7;         /* advance pointer to next row */
564  }
565}
566
567
568/*
569 * Perform dequantization and inverse DCT on one block of coefficients,
570 * producing a reduced-size 6x6 output block.
571 *
572 * Optimized algorithm with 3 multiplications in the 1-D kernel.
573 * cK represents sqrt(2) * cos(K*pi/12).
574 */
575
576GLOBAL(void)
577jpeg_idct_6x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
578               JCOEFPTR coef_block,
579               JSAMPARRAY output_buf, JDIMENSION output_col)
580{
581  INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
582  INT32 z1, z2, z3;
583  JCOEFPTR inptr;
584  ISLOW_MULT_TYPE * quantptr;
585  int * wsptr;
586  JSAMPROW outptr;
587  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
588  int ctr;
589  int workspace[6*6];   /* buffers data between passes */
590  SHIFT_TEMPS
591
592  /* Pass 1: process columns from input, store into work array. */
593
594  inptr = coef_block;
595  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
596  wsptr = workspace;
597  for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
598    /* Even part */
599
600    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
601    tmp0 <<= CONST_BITS;
602    /* Add fudge factor here for final descale. */
603    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
604    tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
605    tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
606    tmp1 = tmp0 + tmp10;
607    tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS);
608    tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
609    tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
610    tmp10 = tmp1 + tmp0;
611    tmp12 = tmp1 - tmp0;
612
613    /* Odd part */
614
615    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
616    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
617    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
618    tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
619    tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
620    tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
621    tmp1 = (z1 - z2 - z3) << PASS1_BITS;
622
623    /* Final output stage */
624
625    wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
626    wsptr[6*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
627    wsptr[6*1] = (int) (tmp11 + tmp1);
628    wsptr[6*4] = (int) (tmp11 - tmp1);
629    wsptr[6*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
630    wsptr[6*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
631  }
632
633  /* Pass 2: process 6 rows from work array, store into output array. */
634
635  wsptr = workspace;
636  for (ctr = 0; ctr < 6; ctr++) {
637    outptr = output_buf[ctr] + output_col;
638
639    /* Even part */
640
641    /* Add fudge factor here for final descale. */
642    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
643    tmp0 <<= CONST_BITS;
644    tmp2 = (INT32) wsptr[4];
645    tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
646    tmp1 = tmp0 + tmp10;
647    tmp11 = tmp0 - tmp10 - tmp10;
648    tmp10 = (INT32) wsptr[2];
649    tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
650    tmp10 = tmp1 + tmp0;
651    tmp12 = tmp1 - tmp0;
652
653    /* Odd part */
654
655    z1 = (INT32) wsptr[1];
656    z2 = (INT32) wsptr[3];
657    z3 = (INT32) wsptr[5];
658    tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
659    tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
660    tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
661    tmp1 = (z1 - z2 - z3) << CONST_BITS;
662
663    /* Final output stage */
664
665    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
666                                              CONST_BITS+PASS1_BITS+3)
667                            & RANGE_MASK];
668    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
669                                              CONST_BITS+PASS1_BITS+3)
670                            & RANGE_MASK];
671    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
672                                              CONST_BITS+PASS1_BITS+3)
673                            & RANGE_MASK];
674    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
675                                              CONST_BITS+PASS1_BITS+3)
676                            & RANGE_MASK];
677    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
678                                              CONST_BITS+PASS1_BITS+3)
679                            & RANGE_MASK];
680    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
681                                              CONST_BITS+PASS1_BITS+3)
682                            & RANGE_MASK];
683
684    wsptr += 6;         /* advance pointer to next row */
685  }
686}
687
688
689/*
690 * Perform dequantization and inverse DCT on one block of coefficients,
691 * producing a reduced-size 5x5 output block.
692 *
693 * Optimized algorithm with 5 multiplications in the 1-D kernel.
694 * cK represents sqrt(2) * cos(K*pi/10).
695 */
696
697GLOBAL(void)
698jpeg_idct_5x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
699               JCOEFPTR coef_block,
700               JSAMPARRAY output_buf, JDIMENSION output_col)
701{
702  INT32 tmp0, tmp1, tmp10, tmp11, tmp12;
703  INT32 z1, z2, z3;
704  JCOEFPTR inptr;
705  ISLOW_MULT_TYPE * quantptr;
706  int * wsptr;
707  JSAMPROW outptr;
708  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
709  int ctr;
710  int workspace[5*5];   /* buffers data between passes */
711  SHIFT_TEMPS
712
713  /* Pass 1: process columns from input, store into work array. */
714
715  inptr = coef_block;
716  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
717  wsptr = workspace;
718  for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
719    /* Even part */
720
721    tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
722    tmp12 <<= CONST_BITS;
723    /* Add fudge factor here for final descale. */
724    tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
725    tmp0 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
726    tmp1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
727    z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
728    z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
729    z3 = tmp12 + z2;
730    tmp10 = z3 + z1;
731    tmp11 = z3 - z1;
732    tmp12 -= z2 << 2;
733
734    /* Odd part */
735
736    z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
737    z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
738
739    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
740    tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
741    tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
742
743    /* Final output stage */
744
745    wsptr[5*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
746    wsptr[5*4] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
747    wsptr[5*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
748    wsptr[5*3] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
749    wsptr[5*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS);
750  }
751
752  /* Pass 2: process 5 rows from work array, store into output array. */
753
754  wsptr = workspace;
755  for (ctr = 0; ctr < 5; ctr++) {
756    outptr = output_buf[ctr] + output_col;
757
758    /* Even part */
759
760    /* Add fudge factor here for final descale. */
761    tmp12 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
762    tmp12 <<= CONST_BITS;
763    tmp0 = (INT32) wsptr[2];
764    tmp1 = (INT32) wsptr[4];
765    z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
766    z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
767    z3 = tmp12 + z2;
768    tmp10 = z3 + z1;
769    tmp11 = z3 - z1;
770    tmp12 -= z2 << 2;
771
772    /* Odd part */
773
774    z2 = (INT32) wsptr[1];
775    z3 = (INT32) wsptr[3];
776
777    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
778    tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
779    tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
780
781    /* Final output stage */
782
783    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
784                                              CONST_BITS+PASS1_BITS+3)
785                            & RANGE_MASK];
786    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
787                                              CONST_BITS+PASS1_BITS+3)
788                            & RANGE_MASK];
789    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
790                                              CONST_BITS+PASS1_BITS+3)
791                            & RANGE_MASK];
792    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
793                                              CONST_BITS+PASS1_BITS+3)
794                            & RANGE_MASK];
795    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
796                                              CONST_BITS+PASS1_BITS+3)
797                            & RANGE_MASK];
798
799    wsptr += 5;         /* advance pointer to next row */
800  }
801}
802
803
804/*
805 * Perform dequantization and inverse DCT on one block of coefficients,
806 * producing a reduced-size 4x4 output block.
807 *
808 * Optimized algorithm with 3 multiplications in the 1-D kernel.
809 * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
810 */
811
812GLOBAL(void)
813jpeg_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
814               JCOEFPTR coef_block,
815               JSAMPARRAY output_buf, JDIMENSION output_col)
816{
817  INT32 tmp0, tmp2, tmp10, tmp12;
818  INT32 z1, z2, z3;
819  JCOEFPTR inptr;
820  ISLOW_MULT_TYPE * quantptr;
821  int * wsptr;
822  JSAMPROW outptr;
823  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
824  int ctr;
825  int workspace[4*4];   /* buffers data between passes */
826  SHIFT_TEMPS
827
828  /* Pass 1: process columns from input, store into work array. */
829
830  inptr = coef_block;
831  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
832  wsptr = workspace;
833  for (ctr = 0; ctr < 4; ctr++, inptr++, quantptr++, wsptr++) {
834    /* Even part */
835
836    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
837    tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
838   
839    tmp10 = (tmp0 + tmp2) << PASS1_BITS;
840    tmp12 = (tmp0 - tmp2) << PASS1_BITS;
841
842    /* Odd part */
843    /* Same rotation as in the even part of the 8x8 LL&M IDCT */
844
845    z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
846    z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
847
848    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);               /* c6 */
849    /* Add fudge factor here for final descale. */
850    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
851    tmp0 = RIGHT_SHIFT(z1 + MULTIPLY(z2, FIX_0_765366865), /* c2-c6 */
852                       CONST_BITS-PASS1_BITS);
853    tmp2 = RIGHT_SHIFT(z1 - MULTIPLY(z3, FIX_1_847759065), /* c2+c6 */
854                       CONST_BITS-PASS1_BITS);
855
856    /* Final output stage */
857
858    wsptr[4*0] = (int) (tmp10 + tmp0);
859    wsptr[4*3] = (int) (tmp10 - tmp0);
860    wsptr[4*1] = (int) (tmp12 + tmp2);
861    wsptr[4*2] = (int) (tmp12 - tmp2);
862  }
863
864  /* Pass 2: process 4 rows from work array, store into output array. */
865
866  wsptr = workspace;
867  for (ctr = 0; ctr < 4; ctr++) {
868    outptr = output_buf[ctr] + output_col;
869
870    /* Even part */
871
872    /* Add fudge factor here for final descale. */
873    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
874    tmp2 = (INT32) wsptr[2];
875
876    tmp10 = (tmp0 + tmp2) << CONST_BITS;
877    tmp12 = (tmp0 - tmp2) << CONST_BITS;
878
879    /* Odd part */
880    /* Same rotation as in the even part of the 8x8 LL&M IDCT */
881
882    z2 = (INT32) wsptr[1];
883    z3 = (INT32) wsptr[3];
884
885    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
886    tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
887    tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
888
889    /* Final output stage */
890
891    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
892                                              CONST_BITS+PASS1_BITS+3)
893                            & RANGE_MASK];
894    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
895                                              CONST_BITS+PASS1_BITS+3)
896                            & RANGE_MASK];
897    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
898                                              CONST_BITS+PASS1_BITS+3)
899                            & RANGE_MASK];
900    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
901                                              CONST_BITS+PASS1_BITS+3)
902                            & RANGE_MASK];
903
904    wsptr += 4;         /* advance pointer to next row */
905  }
906}
907
908
909/*
910 * Perform dequantization and inverse DCT on one block of coefficients,
911 * producing a reduced-size 3x3 output block.
912 *
913 * Optimized algorithm with 2 multiplications in the 1-D kernel.
914 * cK represents sqrt(2) * cos(K*pi/6).
915 */
916
917GLOBAL(void)
918jpeg_idct_3x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
919               JCOEFPTR coef_block,
920               JSAMPARRAY output_buf, JDIMENSION output_col)
921{
922  INT32 tmp0, tmp2, tmp10, tmp12;
923  JCOEFPTR inptr;
924  ISLOW_MULT_TYPE * quantptr;
925  int * wsptr;
926  JSAMPROW outptr;
927  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
928  int ctr;
929  int workspace[3*3];   /* buffers data between passes */
930  SHIFT_TEMPS
931
932  /* Pass 1: process columns from input, store into work array. */
933
934  inptr = coef_block;
935  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
936  wsptr = workspace;
937  for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
938    /* Even part */
939
940    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
941    tmp0 <<= CONST_BITS;
942    /* Add fudge factor here for final descale. */
943    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
944    tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
945    tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
946    tmp10 = tmp0 + tmp12;
947    tmp2 = tmp0 - tmp12 - tmp12;
948
949    /* Odd part */
950
951    tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
952    tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
953
954    /* Final output stage */
955
956    wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
957    wsptr[3*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
958    wsptr[3*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
959  }
960
961  /* Pass 2: process 3 rows from work array, store into output array. */
962
963  wsptr = workspace;
964  for (ctr = 0; ctr < 3; ctr++) {
965    outptr = output_buf[ctr] + output_col;
966
967    /* Even part */
968
969    /* Add fudge factor here for final descale. */
970    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
971    tmp0 <<= CONST_BITS;
972    tmp2 = (INT32) wsptr[2];
973    tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
974    tmp10 = tmp0 + tmp12;
975    tmp2 = tmp0 - tmp12 - tmp12;
976
977    /* Odd part */
978
979    tmp12 = (INT32) wsptr[1];
980    tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
981
982    /* Final output stage */
983
984    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
985                                              CONST_BITS+PASS1_BITS+3)
986                            & RANGE_MASK];
987    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
988                                              CONST_BITS+PASS1_BITS+3)
989                            & RANGE_MASK];
990    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
991                                              CONST_BITS+PASS1_BITS+3)
992                            & RANGE_MASK];
993
994    wsptr += 3;         /* advance pointer to next row */
995  }
996}
997
998
999/*
1000 * Perform dequantization and inverse DCT on one block of coefficients,
1001 * producing a reduced-size 2x2 output block.
1002 *
1003 * Multiplication-less algorithm.
1004 */
1005
1006GLOBAL(void)
1007jpeg_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1008               JCOEFPTR coef_block,
1009               JSAMPARRAY output_buf, JDIMENSION output_col)
1010{
1011  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1012  ISLOW_MULT_TYPE * quantptr;
1013  JSAMPROW outptr;
1014  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1015  SHIFT_TEMPS
1016
1017  /* Pass 1: process columns from input. */
1018
1019  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1020
1021  /* Column 0 */
1022  tmp4 = DEQUANTIZE(coef_block[DCTSIZE*0], quantptr[DCTSIZE*0]);
1023  tmp5 = DEQUANTIZE(coef_block[DCTSIZE*1], quantptr[DCTSIZE*1]);
1024  /* Add fudge factor here for final descale. */
1025  tmp4 += ONE << 2;
1026
1027  tmp0 = tmp4 + tmp5;
1028  tmp2 = tmp4 - tmp5;
1029
1030  /* Column 1 */
1031  tmp4 = DEQUANTIZE(coef_block[DCTSIZE*0+1], quantptr[DCTSIZE*0+1]);
1032  tmp5 = DEQUANTIZE(coef_block[DCTSIZE*1+1], quantptr[DCTSIZE*1+1]);
1033
1034  tmp1 = tmp4 + tmp5;
1035  tmp3 = tmp4 - tmp5;
1036
1037  /* Pass 2: process 2 rows, store into output array. */
1038
1039  /* Row 0 */
1040  outptr = output_buf[0] + output_col;
1041
1042  outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp0 + tmp1, 3) & RANGE_MASK];
1043  outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp0 - tmp1, 3) & RANGE_MASK];
1044
1045  /* Row 1 */
1046  outptr = output_buf[1] + output_col;
1047
1048  outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp2 + tmp3, 3) & RANGE_MASK];
1049  outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2 - tmp3, 3) & RANGE_MASK];
1050}
1051
1052
1053/*
1054 * Perform dequantization and inverse DCT on one block of coefficients,
1055 * producing a reduced-size 1x1 output block.
1056 *
1057 * We hardly need an inverse DCT routine for this: just take the
1058 * average pixel value, which is one-eighth of the DC coefficient.
1059 */
1060
1061GLOBAL(void)
1062jpeg_idct_1x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1063               JCOEFPTR coef_block,
1064               JSAMPARRAY output_buf, JDIMENSION output_col)
1065{
1066  int dcval;
1067  ISLOW_MULT_TYPE * quantptr;
1068  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1069  SHIFT_TEMPS
1070
1071  /* 1x1 is trivial: just take the DC coefficient divided by 8. */
1072  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1073  dcval = DEQUANTIZE(coef_block[0], quantptr[0]);
1074  dcval = (int) DESCALE((INT32) dcval, 3);
1075
1076  output_buf[0][output_col] = range_limit[dcval & RANGE_MASK];
1077}
1078
1079
1080/*
1081 * Perform dequantization and inverse DCT on one block of coefficients,
1082 * producing a 9x9 output block.
1083 *
1084 * Optimized algorithm with 10 multiplications in the 1-D kernel.
1085 * cK represents sqrt(2) * cos(K*pi/18).
1086 */
1087
1088GLOBAL(void)
1089jpeg_idct_9x9 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1090               JCOEFPTR coef_block,
1091               JSAMPARRAY output_buf, JDIMENSION output_col)
1092{
1093  INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14;
1094  INT32 z1, z2, z3, z4;
1095  JCOEFPTR inptr;
1096  ISLOW_MULT_TYPE * quantptr;
1097  int * wsptr;
1098  JSAMPROW outptr;
1099  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1100  int ctr;
1101  int workspace[8*9];   /* buffers data between passes */
1102  SHIFT_TEMPS
1103
1104  /* Pass 1: process columns from input, store into work array. */
1105
1106  inptr = coef_block;
1107  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1108  wsptr = workspace;
1109  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1110    /* Even part */
1111
1112    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1113    tmp0 <<= CONST_BITS;
1114    /* Add fudge factor here for final descale. */
1115    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
1116
1117    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1118    z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1119    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1120
1121    tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
1122    tmp1 = tmp0 + tmp3;
1123    tmp2 = tmp0 - tmp3 - tmp3;
1124
1125    tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
1126    tmp11 = tmp2 + tmp0;
1127    tmp14 = tmp2 - tmp0 - tmp0;
1128
1129    tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
1130    tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
1131    tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
1132
1133    tmp10 = tmp1 + tmp0 - tmp3;
1134    tmp12 = tmp1 - tmp0 + tmp2;
1135    tmp13 = tmp1 - tmp2 + tmp3;
1136
1137    /* Odd part */
1138
1139    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1140    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1141    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1142    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1143
1144    z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
1145
1146    tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
1147    tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
1148    tmp0 = tmp2 + tmp3 - z2;
1149    tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
1150    tmp2 += z2 - tmp1;
1151    tmp3 += z2 + tmp1;
1152    tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
1153
1154    /* Final output stage */
1155
1156    wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
1157    wsptr[8*8] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
1158    wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS-PASS1_BITS);
1159    wsptr[8*7] = (int) RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS-PASS1_BITS);
1160    wsptr[8*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
1161    wsptr[8*6] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
1162    wsptr[8*3] = (int) RIGHT_SHIFT(tmp13 + tmp3, CONST_BITS-PASS1_BITS);
1163    wsptr[8*5] = (int) RIGHT_SHIFT(tmp13 - tmp3, CONST_BITS-PASS1_BITS);
1164    wsptr[8*4] = (int) RIGHT_SHIFT(tmp14, CONST_BITS-PASS1_BITS);
1165  }
1166
1167  /* Pass 2: process 9 rows from work array, store into output array. */
1168
1169  wsptr = workspace;
1170  for (ctr = 0; ctr < 9; ctr++) {
1171    outptr = output_buf[ctr] + output_col;
1172
1173    /* Even part */
1174
1175    /* Add fudge factor here for final descale. */
1176    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
1177    tmp0 <<= CONST_BITS;
1178
1179    z1 = (INT32) wsptr[2];
1180    z2 = (INT32) wsptr[4];
1181    z3 = (INT32) wsptr[6];
1182
1183    tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
1184    tmp1 = tmp0 + tmp3;
1185    tmp2 = tmp0 - tmp3 - tmp3;
1186
1187    tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
1188    tmp11 = tmp2 + tmp0;
1189    tmp14 = tmp2 - tmp0 - tmp0;
1190
1191    tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
1192    tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
1193    tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
1194
1195    tmp10 = tmp1 + tmp0 - tmp3;
1196    tmp12 = tmp1 - tmp0 + tmp2;
1197    tmp13 = tmp1 - tmp2 + tmp3;
1198
1199    /* Odd part */
1200
1201    z1 = (INT32) wsptr[1];
1202    z2 = (INT32) wsptr[3];
1203    z3 = (INT32) wsptr[5];
1204    z4 = (INT32) wsptr[7];
1205
1206    z2 = MULTIPLY(z2, - FIX(1.224744871));           /* -c3 */
1207
1208    tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
1209    tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
1210    tmp0 = tmp2 + tmp3 - z2;
1211    tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
1212    tmp2 += z2 - tmp1;
1213    tmp3 += z2 + tmp1;
1214    tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
1215
1216    /* Final output stage */
1217
1218    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
1219                                              CONST_BITS+PASS1_BITS+3)
1220                            & RANGE_MASK];
1221    outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
1222                                              CONST_BITS+PASS1_BITS+3)
1223                            & RANGE_MASK];
1224    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
1225                                              CONST_BITS+PASS1_BITS+3)
1226                            & RANGE_MASK];
1227    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
1228                                              CONST_BITS+PASS1_BITS+3)
1229                            & RANGE_MASK];
1230    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
1231                                              CONST_BITS+PASS1_BITS+3)
1232                            & RANGE_MASK];
1233    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
1234                                              CONST_BITS+PASS1_BITS+3)
1235                            & RANGE_MASK];
1236    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp3,
1237                                              CONST_BITS+PASS1_BITS+3)
1238                            & RANGE_MASK];
1239    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp3,
1240                                              CONST_BITS+PASS1_BITS+3)
1241                            & RANGE_MASK];
1242    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp14,
1243                                              CONST_BITS+PASS1_BITS+3)
1244                            & RANGE_MASK];
1245
1246    wsptr += 8;         /* advance pointer to next row */
1247  }
1248}
1249
1250
1251/*
1252 * Perform dequantization and inverse DCT on one block of coefficients,
1253 * producing a 10x10 output block.
1254 *
1255 * Optimized algorithm with 12 multiplications in the 1-D kernel.
1256 * cK represents sqrt(2) * cos(K*pi/20).
1257 */
1258
1259GLOBAL(void)
1260jpeg_idct_10x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1261                 JCOEFPTR coef_block,
1262                 JSAMPARRAY output_buf, JDIMENSION output_col)
1263{
1264  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1265  INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
1266  INT32 z1, z2, z3, z4, z5;
1267  JCOEFPTR inptr;
1268  ISLOW_MULT_TYPE * quantptr;
1269  int * wsptr;
1270  JSAMPROW outptr;
1271  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1272  int ctr;
1273  int workspace[8*10];  /* buffers data between passes */
1274  SHIFT_TEMPS
1275
1276  /* Pass 1: process columns from input, store into work array. */
1277
1278  inptr = coef_block;
1279  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1280  wsptr = workspace;
1281  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1282    /* Even part */
1283
1284    z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1285    z3 <<= CONST_BITS;
1286    /* Add fudge factor here for final descale. */
1287    z3 += ONE << (CONST_BITS-PASS1_BITS-1);
1288    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1289    z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
1290    z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
1291    tmp10 = z3 + z1;
1292    tmp11 = z3 - z2;
1293
1294    tmp22 = RIGHT_SHIFT(z3 - ((z1 - z2) << 1),   /* c0 = (c4-c8)*2 */
1295                        CONST_BITS-PASS1_BITS);
1296
1297    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1298    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1299
1300    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
1301    tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
1302    tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
1303
1304    tmp20 = tmp10 + tmp12;
1305    tmp24 = tmp10 - tmp12;
1306    tmp21 = tmp11 + tmp13;
1307    tmp23 = tmp11 - tmp13;
1308
1309    /* Odd part */
1310
1311    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1312    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1313    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1314    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1315
1316    tmp11 = z2 + z4;
1317    tmp13 = z2 - z4;
1318
1319    tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
1320    z5 = z3 << CONST_BITS;
1321
1322    z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
1323    z4 = z5 + tmp12;
1324
1325    tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
1326    tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
1327
1328    z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
1329    z4 = z5 - tmp12 - (tmp13 << (CONST_BITS - 1));
1330
1331    tmp12 = (z1 - tmp13 - z3) << PASS1_BITS;
1332
1333    tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
1334    tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
1335
1336    /* Final output stage */
1337
1338    wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
1339    wsptr[8*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
1340    wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
1341    wsptr[8*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
1342    wsptr[8*2] = (int) (tmp22 + tmp12);
1343    wsptr[8*7] = (int) (tmp22 - tmp12);
1344    wsptr[8*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
1345    wsptr[8*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
1346    wsptr[8*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
1347    wsptr[8*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
1348  }
1349
1350  /* Pass 2: process 10 rows from work array, store into output array. */
1351
1352  wsptr = workspace;
1353  for (ctr = 0; ctr < 10; ctr++) {
1354    outptr = output_buf[ctr] + output_col;
1355
1356    /* Even part */
1357
1358    /* Add fudge factor here for final descale. */
1359    z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
1360    z3 <<= CONST_BITS;
1361    z4 = (INT32) wsptr[4];
1362    z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
1363    z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
1364    tmp10 = z3 + z1;
1365    tmp11 = z3 - z2;
1366
1367    tmp22 = z3 - ((z1 - z2) << 1);               /* c0 = (c4-c8)*2 */
1368
1369    z2 = (INT32) wsptr[2];
1370    z3 = (INT32) wsptr[6];
1371
1372    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
1373    tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
1374    tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
1375
1376    tmp20 = tmp10 + tmp12;
1377    tmp24 = tmp10 - tmp12;
1378    tmp21 = tmp11 + tmp13;
1379    tmp23 = tmp11 - tmp13;
1380
1381    /* Odd part */
1382
1383    z1 = (INT32) wsptr[1];
1384    z2 = (INT32) wsptr[3];
1385    z3 = (INT32) wsptr[5];
1386    z3 <<= CONST_BITS;
1387    z4 = (INT32) wsptr[7];
1388
1389    tmp11 = z2 + z4;
1390    tmp13 = z2 - z4;
1391
1392    tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
1393
1394    z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
1395    z4 = z3 + tmp12;
1396
1397    tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
1398    tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
1399
1400    z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
1401    z4 = z3 - tmp12 - (tmp13 << (CONST_BITS - 1));
1402
1403    tmp12 = ((z1 - tmp13) << CONST_BITS) - z3;
1404
1405    tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
1406    tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
1407
1408    /* Final output stage */
1409
1410    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
1411                                              CONST_BITS+PASS1_BITS+3)
1412                            & RANGE_MASK];
1413    outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
1414                                              CONST_BITS+PASS1_BITS+3)
1415                            & RANGE_MASK];
1416    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
1417                                              CONST_BITS+PASS1_BITS+3)
1418                            & RANGE_MASK];
1419    outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
1420                                              CONST_BITS+PASS1_BITS+3)
1421                            & RANGE_MASK];
1422    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
1423                                              CONST_BITS+PASS1_BITS+3)
1424                            & RANGE_MASK];
1425    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
1426                                              CONST_BITS+PASS1_BITS+3)
1427                            & RANGE_MASK];
1428    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
1429                                              CONST_BITS+PASS1_BITS+3)
1430                            & RANGE_MASK];
1431    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
1432                                              CONST_BITS+PASS1_BITS+3)
1433                            & RANGE_MASK];
1434    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
1435                                              CONST_BITS+PASS1_BITS+3)
1436                            & RANGE_MASK];
1437    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
1438                                              CONST_BITS+PASS1_BITS+3)
1439                            & RANGE_MASK];
1440
1441    wsptr += 8;         /* advance pointer to next row */
1442  }
1443}
1444
1445
1446/*
1447 * Perform dequantization and inverse DCT on one block of coefficients,
1448 * producing a 11x11 output block.
1449 *
1450 * Optimized algorithm with 24 multiplications in the 1-D kernel.
1451 * cK represents sqrt(2) * cos(K*pi/22).
1452 */
1453
1454GLOBAL(void)
1455jpeg_idct_11x11 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1456                 JCOEFPTR coef_block,
1457                 JSAMPARRAY output_buf, JDIMENSION output_col)
1458{
1459  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1460  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
1461  INT32 z1, z2, z3, z4;
1462  JCOEFPTR inptr;
1463  ISLOW_MULT_TYPE * quantptr;
1464  int * wsptr;
1465  JSAMPROW outptr;
1466  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1467  int ctr;
1468  int workspace[8*11];  /* buffers data between passes */
1469  SHIFT_TEMPS
1470
1471  /* Pass 1: process columns from input, store into work array. */
1472
1473  inptr = coef_block;
1474  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1475  wsptr = workspace;
1476  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1477    /* Even part */
1478
1479    tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1480    tmp10 <<= CONST_BITS;
1481    /* Add fudge factor here for final descale. */
1482    tmp10 += ONE << (CONST_BITS-PASS1_BITS-1);
1483
1484    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1485    z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1486    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1487
1488    tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
1489    tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
1490    z4 = z1 + z3;
1491    tmp24 = MULTIPLY(z4, - FIX(1.155664402));        /* -(c2-c10) */
1492    z4 -= z2;
1493    tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
1494    tmp21 = tmp20 + tmp23 + tmp25 -
1495            MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
1496    tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
1497    tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
1498    tmp24 += tmp25;
1499    tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
1500    tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
1501             MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
1502    tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
1503
1504    /* Odd part */
1505
1506    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1507    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1508    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1509    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1510
1511    tmp11 = z1 + z2;
1512    tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
1513    tmp11 = MULTIPLY(tmp11, FIX(0.887983902));           /* c3-c9 */
1514    tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
1515    tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
1516    tmp10 = tmp11 + tmp12 + tmp13 -
1517            MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
1518    z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
1519    tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
1520    tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
1521    z1    = MULTIPLY(z2 + z4, - FIX(1.798248910));       /* -(c1+c9) */
1522    tmp11 += z1;
1523    tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
1524    tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
1525             MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
1526             MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
1527
1528    /* Final output stage */
1529
1530    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
1531    wsptr[8*10] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
1532    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
1533    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
1534    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
1535    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
1536    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
1537    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
1538    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
1539    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
1540    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25, CONST_BITS-PASS1_BITS);
1541  }
1542
1543  /* Pass 2: process 11 rows from work array, store into output array. */
1544
1545  wsptr = workspace;
1546  for (ctr = 0; ctr < 11; ctr++) {
1547    outptr = output_buf[ctr] + output_col;
1548
1549    /* Even part */
1550
1551    /* Add fudge factor here for final descale. */
1552    tmp10 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
1553    tmp10 <<= CONST_BITS;
1554
1555    z1 = (INT32) wsptr[2];
1556    z2 = (INT32) wsptr[4];
1557    z3 = (INT32) wsptr[6];
1558
1559    tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
1560    tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
1561    z4 = z1 + z3;
1562    tmp24 = MULTIPLY(z4, - FIX(1.155664402));        /* -(c2-c10) */
1563    z4 -= z2;
1564    tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
1565    tmp21 = tmp20 + tmp23 + tmp25 -
1566            MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
1567    tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
1568    tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
1569    tmp24 += tmp25;
1570    tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
1571    tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
1572             MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
1573    tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
1574
1575    /* Odd part */
1576
1577    z1 = (INT32) wsptr[1];
1578    z2 = (INT32) wsptr[3];
1579    z3 = (INT32) wsptr[5];
1580    z4 = (INT32) wsptr[7];
1581
1582    tmp11 = z1 + z2;
1583    tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
1584    tmp11 = MULTIPLY(tmp11, FIX(0.887983902));           /* c3-c9 */
1585    tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
1586    tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
1587    tmp10 = tmp11 + tmp12 + tmp13 -
1588            MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
1589    z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
1590    tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
1591    tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
1592    z1    = MULTIPLY(z2 + z4, - FIX(1.798248910));       /* -(c1+c9) */
1593    tmp11 += z1;
1594    tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
1595    tmp14 += MULTIPLY(z2, - FIX(1.467221301)) +          /* -(c5+c9) */
1596             MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
1597             MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
1598
1599    /* Final output stage */
1600
1601    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
1602                                               CONST_BITS+PASS1_BITS+3)
1603                             & RANGE_MASK];
1604    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
1605                                               CONST_BITS+PASS1_BITS+3)
1606                             & RANGE_MASK];
1607    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
1608                                               CONST_BITS+PASS1_BITS+3)
1609                             & RANGE_MASK];
1610    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
1611                                               CONST_BITS+PASS1_BITS+3)
1612                             & RANGE_MASK];
1613    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
1614                                               CONST_BITS+PASS1_BITS+3)
1615                             & RANGE_MASK];
1616    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
1617                                               CONST_BITS+PASS1_BITS+3)
1618                             & RANGE_MASK];
1619    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
1620                                               CONST_BITS+PASS1_BITS+3)
1621                             & RANGE_MASK];
1622    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
1623                                               CONST_BITS+PASS1_BITS+3)
1624                             & RANGE_MASK];
1625    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
1626                                               CONST_BITS+PASS1_BITS+3)
1627                             & RANGE_MASK];
1628    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
1629                                               CONST_BITS+PASS1_BITS+3)
1630                             & RANGE_MASK];
1631    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25,
1632                                               CONST_BITS+PASS1_BITS+3)
1633                             & RANGE_MASK];
1634
1635    wsptr += 8;         /* advance pointer to next row */
1636  }
1637}
1638
1639
1640/*
1641 * Perform dequantization and inverse DCT on one block of coefficients,
1642 * producing a 12x12 output block.
1643 *
1644 * Optimized algorithm with 15 multiplications in the 1-D kernel.
1645 * cK represents sqrt(2) * cos(K*pi/24).
1646 */
1647
1648GLOBAL(void)
1649jpeg_idct_12x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1650                 JCOEFPTR coef_block,
1651                 JSAMPARRAY output_buf, JDIMENSION output_col)
1652{
1653  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1654  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
1655  INT32 z1, z2, z3, z4;
1656  JCOEFPTR inptr;
1657  ISLOW_MULT_TYPE * quantptr;
1658  int * wsptr;
1659  JSAMPROW outptr;
1660  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1661  int ctr;
1662  int workspace[8*12];  /* buffers data between passes */
1663  SHIFT_TEMPS
1664
1665  /* Pass 1: process columns from input, store into work array. */
1666
1667  inptr = coef_block;
1668  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1669  wsptr = workspace;
1670  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1671    /* Even part */
1672
1673    z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1674    z3 <<= CONST_BITS;
1675    /* Add fudge factor here for final descale. */
1676    z3 += ONE << (CONST_BITS-PASS1_BITS-1);
1677
1678    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1679    z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
1680
1681    tmp10 = z3 + z4;
1682    tmp11 = z3 - z4;
1683
1684    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1685    z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
1686    z1 <<= CONST_BITS;
1687    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1688    z2 <<= CONST_BITS;
1689
1690    tmp12 = z1 - z2;
1691
1692    tmp21 = z3 + tmp12;
1693    tmp24 = z3 - tmp12;
1694
1695    tmp12 = z4 + z2;
1696
1697    tmp20 = tmp10 + tmp12;
1698    tmp25 = tmp10 - tmp12;
1699
1700    tmp12 = z4 - z1 - z2;
1701
1702    tmp22 = tmp11 + tmp12;
1703    tmp23 = tmp11 - tmp12;
1704
1705    /* Odd part */
1706
1707    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1708    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1709    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1710    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1711
1712    tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
1713    tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
1714
1715    tmp10 = z1 + z3;
1716    tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
1717    tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
1718    tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
1719    tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
1720    tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
1721    tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
1722    tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
1723             MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
1724
1725    z1 -= z4;
1726    z2 -= z3;
1727    z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
1728    tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
1729    tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
1730
1731    /* Final output stage */
1732
1733    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
1734    wsptr[8*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
1735    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
1736    wsptr[8*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
1737    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
1738    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
1739    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
1740    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
1741    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
1742    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
1743    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
1744    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
1745  }
1746
1747  /* Pass 2: process 12 rows from work array, store into output array. */
1748
1749  wsptr = workspace;
1750  for (ctr = 0; ctr < 12; ctr++) {
1751    outptr = output_buf[ctr] + output_col;
1752
1753    /* Even part */
1754
1755    /* Add fudge factor here for final descale. */
1756    z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
1757    z3 <<= CONST_BITS;
1758
1759    z4 = (INT32) wsptr[4];
1760    z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
1761
1762    tmp10 = z3 + z4;
1763    tmp11 = z3 - z4;
1764
1765    z1 = (INT32) wsptr[2];
1766    z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
1767    z1 <<= CONST_BITS;
1768    z2 = (INT32) wsptr[6];
1769    z2 <<= CONST_BITS;
1770
1771    tmp12 = z1 - z2;
1772
1773    tmp21 = z3 + tmp12;
1774    tmp24 = z3 - tmp12;
1775
1776    tmp12 = z4 + z2;
1777
1778    tmp20 = tmp10 + tmp12;
1779    tmp25 = tmp10 - tmp12;
1780
1781    tmp12 = z4 - z1 - z2;
1782
1783    tmp22 = tmp11 + tmp12;
1784    tmp23 = tmp11 - tmp12;
1785
1786    /* Odd part */
1787
1788    z1 = (INT32) wsptr[1];
1789    z2 = (INT32) wsptr[3];
1790    z3 = (INT32) wsptr[5];
1791    z4 = (INT32) wsptr[7];
1792
1793    tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
1794    tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
1795
1796    tmp10 = z1 + z3;
1797    tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
1798    tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
1799    tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
1800    tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
1801    tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
1802    tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
1803    tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
1804             MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
1805
1806    z1 -= z4;
1807    z2 -= z3;
1808    z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
1809    tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
1810    tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
1811
1812    /* Final output stage */
1813
1814    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
1815                                               CONST_BITS+PASS1_BITS+3)
1816                             & RANGE_MASK];
1817    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
1818                                               CONST_BITS+PASS1_BITS+3)
1819                             & RANGE_MASK];
1820    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
1821                                               CONST_BITS+PASS1_BITS+3)
1822                             & RANGE_MASK];
1823    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
1824                                               CONST_BITS+PASS1_BITS+3)
1825                             & RANGE_MASK];
1826    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
1827                                               CONST_BITS+PASS1_BITS+3)
1828                             & RANGE_MASK];
1829    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
1830                                               CONST_BITS+PASS1_BITS+3)
1831                             & RANGE_MASK];
1832    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
1833                                               CONST_BITS+PASS1_BITS+3)
1834                             & RANGE_MASK];
1835    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
1836                                               CONST_BITS+PASS1_BITS+3)
1837                             & RANGE_MASK];
1838    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
1839                                               CONST_BITS+PASS1_BITS+3)
1840                             & RANGE_MASK];
1841    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
1842                                               CONST_BITS+PASS1_BITS+3)
1843                             & RANGE_MASK];
1844    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
1845                                               CONST_BITS+PASS1_BITS+3)
1846                             & RANGE_MASK];
1847    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
1848                                               CONST_BITS+PASS1_BITS+3)
1849                             & RANGE_MASK];
1850
1851    wsptr += 8;         /* advance pointer to next row */
1852  }
1853}
1854
1855
1856/*
1857 * Perform dequantization and inverse DCT on one block of coefficients,
1858 * producing a 13x13 output block.
1859 *
1860 * Optimized algorithm with 29 multiplications in the 1-D kernel.
1861 * cK represents sqrt(2) * cos(K*pi/26).
1862 */
1863
1864GLOBAL(void)
1865jpeg_idct_13x13 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
1866                 JCOEFPTR coef_block,
1867                 JSAMPARRAY output_buf, JDIMENSION output_col)
1868{
1869  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1870  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
1871  INT32 z1, z2, z3, z4;
1872  JCOEFPTR inptr;
1873  ISLOW_MULT_TYPE * quantptr;
1874  int * wsptr;
1875  JSAMPROW outptr;
1876  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
1877  int ctr;
1878  int workspace[8*13];  /* buffers data between passes */
1879  SHIFT_TEMPS
1880
1881  /* Pass 1: process columns from input, store into work array. */
1882
1883  inptr = coef_block;
1884  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
1885  wsptr = workspace;
1886  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
1887    /* Even part */
1888
1889    z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
1890    z1 <<= CONST_BITS;
1891    /* Add fudge factor here for final descale. */
1892    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
1893
1894    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
1895    z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
1896    z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
1897
1898    tmp10 = z3 + z4;
1899    tmp11 = z3 - z4;
1900
1901    tmp12 = MULTIPLY(tmp10, FIX(1.155388986));                /* (c4+c6)/2 */
1902    tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1;           /* (c4-c6)/2 */
1903
1904    tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13;   /* c2 */
1905    tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13;   /* c10 */
1906
1907    tmp12 = MULTIPLY(tmp10, FIX(0.316450131));                /* (c8-c12)/2 */
1908    tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
1909
1910    tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
1911    tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
1912
1913    tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
1914    tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
1915
1916    tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
1917    tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
1918
1919    tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
1920
1921    /* Odd part */
1922
1923    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
1924    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
1925    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
1926    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
1927
1928    tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
1929    tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
1930    tmp15 = z1 + z4;
1931    tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
1932    tmp10 = tmp11 + tmp12 + tmp13 -
1933            MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
1934    tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
1935    tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
1936    tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
1937    tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945));   /* -c5 */
1938    tmp11 += tmp14;
1939    tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
1940    tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813));   /* -c9 */
1941    tmp12 += tmp14;
1942    tmp13 += tmp14;
1943    tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
1944    tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
1945            MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
1946    z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
1947    tmp14 += z1;
1948    tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
1949             MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
1950
1951    /* Final output stage */
1952
1953    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
1954    wsptr[8*12] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
1955    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
1956    wsptr[8*11] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
1957    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
1958    wsptr[8*10] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
1959    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
1960    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
1961    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
1962    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
1963    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
1964    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
1965    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26, CONST_BITS-PASS1_BITS);
1966  }
1967
1968  /* Pass 2: process 13 rows from work array, store into output array. */
1969
1970  wsptr = workspace;
1971  for (ctr = 0; ctr < 13; ctr++) {
1972    outptr = output_buf[ctr] + output_col;
1973
1974    /* Even part */
1975
1976    /* Add fudge factor here for final descale. */
1977    z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
1978    z1 <<= CONST_BITS;
1979
1980    z2 = (INT32) wsptr[2];
1981    z3 = (INT32) wsptr[4];
1982    z4 = (INT32) wsptr[6];
1983
1984    tmp10 = z3 + z4;
1985    tmp11 = z3 - z4;
1986
1987    tmp12 = MULTIPLY(tmp10, FIX(1.155388986));                /* (c4+c6)/2 */
1988    tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1;           /* (c4-c6)/2 */
1989
1990    tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13;   /* c2 */
1991    tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13;   /* c10 */
1992
1993    tmp12 = MULTIPLY(tmp10, FIX(0.316450131));                /* (c8-c12)/2 */
1994    tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
1995
1996    tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
1997    tmp25 = MULTIPLY(z2, - FIX(1.252223920)) + tmp12 + tmp13; /* c4 */
1998
1999    tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
2000    tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
2001
2002    tmp23 = MULTIPLY(z2, - FIX(0.170464608)) - tmp12 - tmp13; /* c12 */
2003    tmp24 = MULTIPLY(z2, - FIX(0.803364869)) + tmp12 - tmp13; /* c8 */
2004
2005    tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
2006
2007    /* Odd part */
2008
2009    z1 = (INT32) wsptr[1];
2010    z2 = (INT32) wsptr[3];
2011    z3 = (INT32) wsptr[5];
2012    z4 = (INT32) wsptr[7];
2013
2014    tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
2015    tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
2016    tmp15 = z1 + z4;
2017    tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
2018    tmp10 = tmp11 + tmp12 + tmp13 -
2019            MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
2020    tmp14 = MULTIPLY(z2 + z3, - FIX(0.338443458));   /* -c11 */
2021    tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
2022    tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
2023    tmp14 = MULTIPLY(z2 + z4, - FIX(1.163874945));   /* -c5 */
2024    tmp11 += tmp14;
2025    tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
2026    tmp14 = MULTIPLY(z3 + z4, - FIX(0.657217813));   /* -c9 */
2027    tmp12 += tmp14;
2028    tmp13 += tmp14;
2029    tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
2030    tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
2031            MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
2032    z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
2033    tmp14 += z1;
2034    tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
2035             MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
2036
2037    /* Final output stage */
2038
2039    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
2040                                               CONST_BITS+PASS1_BITS+3)
2041                             & RANGE_MASK];
2042    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
2043                                               CONST_BITS+PASS1_BITS+3)
2044                             & RANGE_MASK];
2045    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
2046                                               CONST_BITS+PASS1_BITS+3)
2047                             & RANGE_MASK];
2048    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
2049                                               CONST_BITS+PASS1_BITS+3)
2050                             & RANGE_MASK];
2051    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
2052                                               CONST_BITS+PASS1_BITS+3)
2053                             & RANGE_MASK];
2054    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
2055                                               CONST_BITS+PASS1_BITS+3)
2056                             & RANGE_MASK];
2057    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
2058                                               CONST_BITS+PASS1_BITS+3)
2059                             & RANGE_MASK];
2060    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
2061                                               CONST_BITS+PASS1_BITS+3)
2062                             & RANGE_MASK];
2063    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
2064                                               CONST_BITS+PASS1_BITS+3)
2065                             & RANGE_MASK];
2066    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
2067                                               CONST_BITS+PASS1_BITS+3)
2068                             & RANGE_MASK];
2069    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
2070                                               CONST_BITS+PASS1_BITS+3)
2071                             & RANGE_MASK];
2072    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
2073                                               CONST_BITS+PASS1_BITS+3)
2074                             & RANGE_MASK];
2075    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26,
2076                                               CONST_BITS+PASS1_BITS+3)
2077                             & RANGE_MASK];
2078
2079    wsptr += 8;         /* advance pointer to next row */
2080  }
2081}
2082
2083
2084/*
2085 * Perform dequantization and inverse DCT on one block of coefficients,
2086 * producing a 14x14 output block.
2087 *
2088 * Optimized algorithm with 20 multiplications in the 1-D kernel.
2089 * cK represents sqrt(2) * cos(K*pi/28).
2090 */
2091
2092GLOBAL(void)
2093jpeg_idct_14x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2094                 JCOEFPTR coef_block,
2095                 JSAMPARRAY output_buf, JDIMENSION output_col)
2096{
2097  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
2098  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
2099  INT32 z1, z2, z3, z4;
2100  JCOEFPTR inptr;
2101  ISLOW_MULT_TYPE * quantptr;
2102  int * wsptr;
2103  JSAMPROW outptr;
2104  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
2105  int ctr;
2106  int workspace[8*14];  /* buffers data between passes */
2107  SHIFT_TEMPS
2108
2109  /* Pass 1: process columns from input, store into work array. */
2110
2111  inptr = coef_block;
2112  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
2113  wsptr = workspace;
2114  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
2115    /* Even part */
2116
2117    z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
2118    z1 <<= CONST_BITS;
2119    /* Add fudge factor here for final descale. */
2120    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
2121    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
2122    z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
2123    z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
2124    z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
2125
2126    tmp10 = z1 + z2;
2127    tmp11 = z1 + z3;
2128    tmp12 = z1 - z4;
2129
2130    tmp23 = RIGHT_SHIFT(z1 - ((z2 + z3 - z4) << 1), /* c0 = (c4+c12-c8)*2 */
2131                        CONST_BITS-PASS1_BITS);
2132
2133    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
2134    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
2135
2136    z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
2137
2138    tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
2139    tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
2140    tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
2141            MULTIPLY(z2, FIX(1.378756276));      /* c2 */
2142
2143    tmp20 = tmp10 + tmp13;
2144    tmp26 = tmp10 - tmp13;
2145    tmp21 = tmp11 + tmp14;
2146    tmp25 = tmp11 - tmp14;
2147    tmp22 = tmp12 + tmp15;
2148    tmp24 = tmp12 - tmp15;
2149
2150    /* Odd part */
2151
2152    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
2153    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
2154    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
2155    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
2156    tmp13 = z4 << CONST_BITS;
2157
2158    tmp14 = z1 + z3;
2159    tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
2160    tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
2161    tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
2162    tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
2163    tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
2164    z1    -= z2;
2165    tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13;        /* c11 */
2166    tmp16 += tmp15;
2167    z1    += z4;
2168    z4    = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */
2169    tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948));          /* c3-c9-c13 */
2170    tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773));          /* c3+c5-c13 */
2171    z4    = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
2172    tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
2173    tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567));          /* c1+c11-c5 */
2174
2175    tmp13 = (z1 - z3) << PASS1_BITS;
2176
2177    /* Final output stage */
2178
2179    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
2180    wsptr[8*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
2181    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
2182    wsptr[8*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
2183    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
2184    wsptr[8*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
2185    wsptr[8*3]  = (int) (tmp23 + tmp13);
2186    wsptr[8*10] = (int) (tmp23 - tmp13);
2187    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
2188    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
2189    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
2190    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
2191    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
2192    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
2193  }
2194
2195  /* Pass 2: process 14 rows from work array, store into output array. */
2196
2197  wsptr = workspace;
2198  for (ctr = 0; ctr < 14; ctr++) {
2199    outptr = output_buf[ctr] + output_col;
2200
2201    /* Even part */
2202
2203    /* Add fudge factor here for final descale. */
2204    z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
2205    z1 <<= CONST_BITS;
2206    z4 = (INT32) wsptr[4];
2207    z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
2208    z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
2209    z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
2210
2211    tmp10 = z1 + z2;
2212    tmp11 = z1 + z3;
2213    tmp12 = z1 - z4;
2214
2215    tmp23 = z1 - ((z2 + z3 - z4) << 1);          /* c0 = (c4+c12-c8)*2 */
2216
2217    z1 = (INT32) wsptr[2];
2218    z2 = (INT32) wsptr[6];
2219
2220    z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
2221
2222    tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
2223    tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
2224    tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
2225            MULTIPLY(z2, FIX(1.378756276));      /* c2 */
2226
2227    tmp20 = tmp10 + tmp13;
2228    tmp26 = tmp10 - tmp13;
2229    tmp21 = tmp11 + tmp14;
2230    tmp25 = tmp11 - tmp14;
2231    tmp22 = tmp12 + tmp15;
2232    tmp24 = tmp12 - tmp15;
2233
2234    /* Odd part */
2235
2236    z1 = (INT32) wsptr[1];
2237    z2 = (INT32) wsptr[3];
2238    z3 = (INT32) wsptr[5];
2239    z4 = (INT32) wsptr[7];
2240    z4 <<= CONST_BITS;
2241
2242    tmp14 = z1 + z3;
2243    tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
2244    tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
2245    tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
2246    tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
2247    tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
2248    z1    -= z2;
2249    tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4;           /* c11 */
2250    tmp16 += tmp15;
2251    tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4;    /* -c13 */
2252    tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948));       /* c3-c9-c13 */
2253    tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773));       /* c3+c5-c13 */
2254    tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
2255    tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
2256    tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567));       /* c1+c11-c5 */
2257
2258    tmp13 = ((z1 - z3) << CONST_BITS) + z4;
2259
2260    /* Final output stage */
2261
2262    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
2263                                               CONST_BITS+PASS1_BITS+3)
2264                             & RANGE_MASK];
2265    outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
2266                                               CONST_BITS+PASS1_BITS+3)
2267                             & RANGE_MASK];
2268    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
2269                                               CONST_BITS+PASS1_BITS+3)
2270                             & RANGE_MASK];
2271    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
2272                                               CONST_BITS+PASS1_BITS+3)
2273                             & RANGE_MASK];
2274    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
2275                                               CONST_BITS+PASS1_BITS+3)
2276                             & RANGE_MASK];
2277    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
2278                                               CONST_BITS+PASS1_BITS+3)
2279                             & RANGE_MASK];
2280    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
2281                                               CONST_BITS+PASS1_BITS+3)
2282                             & RANGE_MASK];
2283    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
2284                                               CONST_BITS+PASS1_BITS+3)
2285                             & RANGE_MASK];
2286    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
2287                                               CONST_BITS+PASS1_BITS+3)
2288                             & RANGE_MASK];
2289    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
2290                                               CONST_BITS+PASS1_BITS+3)
2291                             & RANGE_MASK];
2292    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
2293                                               CONST_BITS+PASS1_BITS+3)
2294                             & RANGE_MASK];
2295    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
2296                                               CONST_BITS+PASS1_BITS+3)
2297                             & RANGE_MASK];
2298    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
2299                                               CONST_BITS+PASS1_BITS+3)
2300                             & RANGE_MASK];
2301    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
2302                                               CONST_BITS+PASS1_BITS+3)
2303                             & RANGE_MASK];
2304
2305    wsptr += 8;         /* advance pointer to next row */
2306  }
2307}
2308
2309
2310/*
2311 * Perform dequantization and inverse DCT on one block of coefficients,
2312 * producing a 15x15 output block.
2313 *
2314 * Optimized algorithm with 22 multiplications in the 1-D kernel.
2315 * cK represents sqrt(2) * cos(K*pi/30).
2316 */
2317
2318GLOBAL(void)
2319jpeg_idct_15x15 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2320                 JCOEFPTR coef_block,
2321                 JSAMPARRAY output_buf, JDIMENSION output_col)
2322{
2323  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
2324  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
2325  INT32 z1, z2, z3, z4;
2326  JCOEFPTR inptr;
2327  ISLOW_MULT_TYPE * quantptr;
2328  int * wsptr;
2329  JSAMPROW outptr;
2330  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
2331  int ctr;
2332  int workspace[8*15];  /* buffers data between passes */
2333  SHIFT_TEMPS
2334
2335  /* Pass 1: process columns from input, store into work array. */
2336
2337  inptr = coef_block;
2338  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
2339  wsptr = workspace;
2340  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
2341    /* Even part */
2342
2343    z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
2344    z1 <<= CONST_BITS;
2345    /* Add fudge factor here for final descale. */
2346    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
2347
2348    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
2349    z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
2350    z4 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
2351
2352    tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
2353    tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
2354
2355    tmp12 = z1 - tmp10;
2356    tmp13 = z1 + tmp11;
2357    z1 -= (tmp11 - tmp10) << 1;             /* c0 = (c6-c12)*2 */
2358
2359    z4 = z2 - z3;
2360    z3 += z2;
2361    tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
2362    tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
2363    z2 = MULTIPLY(z2, FIX(1.439773946));    /* c4+c14 */
2364
2365    tmp20 = tmp13 + tmp10 + tmp11;
2366    tmp23 = tmp12 - tmp10 + tmp11 + z2;
2367
2368    tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
2369    tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
2370
2371    tmp25 = tmp13 - tmp10 - tmp11;
2372    tmp26 = tmp12 + tmp10 - tmp11 - z2;
2373
2374    tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
2375    tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
2376
2377    tmp21 = tmp12 + tmp10 + tmp11;
2378    tmp24 = tmp13 - tmp10 + tmp11;
2379    tmp11 += tmp11;
2380    tmp22 = z1 + tmp11;                     /* c10 = c6-c12 */
2381    tmp27 = z1 - tmp11 - tmp11;             /* c0 = (c6-c12)*2 */
2382
2383    /* Odd part */
2384
2385    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
2386    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
2387    z4 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
2388    z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
2389    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
2390
2391    tmp13 = z2 - z4;
2392    tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
2393    tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
2394    tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
2395
2396    tmp13 = MULTIPLY(z2, - FIX(0.831253876));               /* -c9 */
2397    tmp15 = MULTIPLY(z2, - FIX(1.344997024));               /* -c3 */
2398    z2 = z1 - z4;
2399    tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
2400
2401    tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
2402    tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
2403    tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3;            /* c5 */
2404    z2 = MULTIPLY(z1 + z4, FIX(0.575212477));               /* c11 */
2405    tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3;      /* c7-c11 */
2406    tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3;      /* c11+c13 */
2407
2408    /* Final output stage */
2409
2410    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
2411    wsptr[8*14] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
2412    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
2413    wsptr[8*13] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
2414    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
2415    wsptr[8*12] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
2416    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
2417    wsptr[8*11] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
2418    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
2419    wsptr[8*10] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
2420    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
2421    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
2422    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
2423    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
2424    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27, CONST_BITS-PASS1_BITS);
2425  }
2426
2427  /* Pass 2: process 15 rows from work array, store into output array. */
2428
2429  wsptr = workspace;
2430  for (ctr = 0; ctr < 15; ctr++) {
2431    outptr = output_buf[ctr] + output_col;
2432
2433    /* Even part */
2434
2435    /* Add fudge factor here for final descale. */
2436    z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
2437    z1 <<= CONST_BITS;
2438
2439    z2 = (INT32) wsptr[2];
2440    z3 = (INT32) wsptr[4];
2441    z4 = (INT32) wsptr[6];
2442
2443    tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
2444    tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
2445
2446    tmp12 = z1 - tmp10;
2447    tmp13 = z1 + tmp11;
2448    z1 -= (tmp11 - tmp10) << 1;             /* c0 = (c6-c12)*2 */
2449
2450    z4 = z2 - z3;
2451    z3 += z2;
2452    tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
2453    tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
2454    z2 = MULTIPLY(z2, FIX(1.439773946));    /* c4+c14 */
2455
2456    tmp20 = tmp13 + tmp10 + tmp11;
2457    tmp23 = tmp12 - tmp10 + tmp11 + z2;
2458
2459    tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
2460    tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
2461
2462    tmp25 = tmp13 - tmp10 - tmp11;
2463    tmp26 = tmp12 + tmp10 - tmp11 - z2;
2464
2465    tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
2466    tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
2467
2468    tmp21 = tmp12 + tmp10 + tmp11;
2469    tmp24 = tmp13 - tmp10 + tmp11;
2470    tmp11 += tmp11;
2471    tmp22 = z1 + tmp11;                     /* c10 = c6-c12 */
2472    tmp27 = z1 - tmp11 - tmp11;             /* c0 = (c6-c12)*2 */
2473
2474    /* Odd part */
2475
2476    z1 = (INT32) wsptr[1];
2477    z2 = (INT32) wsptr[3];
2478    z4 = (INT32) wsptr[5];
2479    z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
2480    z4 = (INT32) wsptr[7];
2481
2482    tmp13 = z2 - z4;
2483    tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
2484    tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
2485    tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
2486
2487    tmp13 = MULTIPLY(z2, - FIX(0.831253876));               /* -c9 */
2488    tmp15 = MULTIPLY(z2, - FIX(1.344997024));               /* -c3 */
2489    z2 = z1 - z4;
2490    tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
2491
2492    tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
2493    tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
2494    tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3;            /* c5 */
2495    z2 = MULTIPLY(z1 + z4, FIX(0.575212477));               /* c11 */
2496    tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3;      /* c7-c11 */
2497    tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3;      /* c11+c13 */
2498
2499    /* Final output stage */
2500
2501    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
2502                                               CONST_BITS+PASS1_BITS+3)
2503                             & RANGE_MASK];
2504    outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
2505                                               CONST_BITS+PASS1_BITS+3)
2506                             & RANGE_MASK];
2507    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
2508                                               CONST_BITS+PASS1_BITS+3)
2509                             & RANGE_MASK];
2510    outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
2511                                               CONST_BITS+PASS1_BITS+3)
2512                             & RANGE_MASK];
2513    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
2514                                               CONST_BITS+PASS1_BITS+3)
2515                             & RANGE_MASK];
2516    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
2517                                               CONST_BITS+PASS1_BITS+3)
2518                             & RANGE_MASK];
2519    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
2520                                               CONST_BITS+PASS1_BITS+3)
2521                             & RANGE_MASK];
2522    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
2523                                               CONST_BITS+PASS1_BITS+3)
2524                             & RANGE_MASK];
2525    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
2526                                               CONST_BITS+PASS1_BITS+3)
2527                             & RANGE_MASK];
2528    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
2529                                               CONST_BITS+PASS1_BITS+3)
2530                             & RANGE_MASK];
2531    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
2532                                               CONST_BITS+PASS1_BITS+3)
2533                             & RANGE_MASK];
2534    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
2535                                               CONST_BITS+PASS1_BITS+3)
2536                             & RANGE_MASK];
2537    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
2538                                               CONST_BITS+PASS1_BITS+3)
2539                             & RANGE_MASK];
2540    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
2541                                               CONST_BITS+PASS1_BITS+3)
2542                             & RANGE_MASK];
2543    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27,
2544                                               CONST_BITS+PASS1_BITS+3)
2545                             & RANGE_MASK];
2546
2547    wsptr += 8;         /* advance pointer to next row */
2548  }
2549}
2550
2551
2552/*
2553 * Perform dequantization and inverse DCT on one block of coefficients,
2554 * producing a 16x16 output block.
2555 *
2556 * Optimized algorithm with 28 multiplications in the 1-D kernel.
2557 * cK represents sqrt(2) * cos(K*pi/32).
2558 */
2559
2560GLOBAL(void)
2561jpeg_idct_16x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2562                 JCOEFPTR coef_block,
2563                 JSAMPARRAY output_buf, JDIMENSION output_col)
2564{
2565  INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
2566  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
2567  INT32 z1, z2, z3, z4;
2568  JCOEFPTR inptr;
2569  ISLOW_MULT_TYPE * quantptr;
2570  int * wsptr;
2571  JSAMPROW outptr;
2572  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
2573  int ctr;
2574  int workspace[8*16];  /* buffers data between passes */
2575  SHIFT_TEMPS
2576
2577  /* Pass 1: process columns from input, store into work array. */
2578
2579  inptr = coef_block;
2580  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
2581  wsptr = workspace;
2582  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
2583    /* Even part */
2584
2585    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
2586    tmp0 <<= CONST_BITS;
2587    /* Add fudge factor here for final descale. */
2588    tmp0 += 1 << (CONST_BITS-PASS1_BITS-1);
2589
2590    z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
2591    tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
2592    tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
2593
2594    tmp10 = tmp0 + tmp1;
2595    tmp11 = tmp0 - tmp1;
2596    tmp12 = tmp0 + tmp2;
2597    tmp13 = tmp0 - tmp2;
2598
2599    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
2600    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
2601    z3 = z1 - z2;
2602    z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
2603    z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
2604
2605    tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
2606    tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
2607    tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
2608    tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
2609
2610    tmp20 = tmp10 + tmp0;
2611    tmp27 = tmp10 - tmp0;
2612    tmp21 = tmp12 + tmp1;
2613    tmp26 = tmp12 - tmp1;
2614    tmp22 = tmp13 + tmp2;
2615    tmp25 = tmp13 - tmp2;
2616    tmp23 = tmp11 + tmp3;
2617    tmp24 = tmp11 - tmp3;
2618
2619    /* Odd part */
2620
2621    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
2622    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
2623    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
2624    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
2625
2626    tmp11 = z1 + z3;
2627
2628    tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
2629    tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
2630    tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
2631    tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
2632    tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
2633    tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
2634    tmp0  = tmp1 + tmp2 + tmp3 -
2635            MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
2636    tmp13 = tmp10 + tmp11 + tmp12 -
2637            MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
2638    z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
2639    tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
2640    tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
2641    z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
2642    tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
2643    tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
2644    z2    += z4;
2645    z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
2646    tmp1  += z1;
2647    tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
2648    z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
2649    tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
2650    tmp12 += z2;
2651    z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
2652    tmp2  += z2;
2653    tmp3  += z2;
2654    z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
2655    tmp10 += z2;
2656    tmp11 += z2;
2657
2658    /* Final output stage */
2659
2660    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp0,  CONST_BITS-PASS1_BITS);
2661    wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0,  CONST_BITS-PASS1_BITS);
2662    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp1,  CONST_BITS-PASS1_BITS);
2663    wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1,  CONST_BITS-PASS1_BITS);
2664    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp2,  CONST_BITS-PASS1_BITS);
2665    wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2,  CONST_BITS-PASS1_BITS);
2666    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp3,  CONST_BITS-PASS1_BITS);
2667    wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3,  CONST_BITS-PASS1_BITS);
2668    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
2669    wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
2670    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
2671    wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
2672    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
2673    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
2674    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
2675    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
2676  }
2677
2678  /* Pass 2: process 16 rows from work array, store into output array. */
2679
2680  wsptr = workspace;
2681  for (ctr = 0; ctr < 16; ctr++) {
2682    outptr = output_buf[ctr] + output_col;
2683
2684    /* Even part */
2685
2686    /* Add fudge factor here for final descale. */
2687    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
2688    tmp0 <<= CONST_BITS;
2689
2690    z1 = (INT32) wsptr[4];
2691    tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
2692    tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
2693
2694    tmp10 = tmp0 + tmp1;
2695    tmp11 = tmp0 - tmp1;
2696    tmp12 = tmp0 + tmp2;
2697    tmp13 = tmp0 - tmp2;
2698
2699    z1 = (INT32) wsptr[2];
2700    z2 = (INT32) wsptr[6];
2701    z3 = z1 - z2;
2702    z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
2703    z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
2704
2705    tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
2706    tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
2707    tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
2708    tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
2709
2710    tmp20 = tmp10 + tmp0;
2711    tmp27 = tmp10 - tmp0;
2712    tmp21 = tmp12 + tmp1;
2713    tmp26 = tmp12 - tmp1;
2714    tmp22 = tmp13 + tmp2;
2715    tmp25 = tmp13 - tmp2;
2716    tmp23 = tmp11 + tmp3;
2717    tmp24 = tmp11 - tmp3;
2718
2719    /* Odd part */
2720
2721    z1 = (INT32) wsptr[1];
2722    z2 = (INT32) wsptr[3];
2723    z3 = (INT32) wsptr[5];
2724    z4 = (INT32) wsptr[7];
2725
2726    tmp11 = z1 + z3;
2727
2728    tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
2729    tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
2730    tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
2731    tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
2732    tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
2733    tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
2734    tmp0  = tmp1 + tmp2 + tmp3 -
2735            MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
2736    tmp13 = tmp10 + tmp11 + tmp12 -
2737            MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
2738    z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
2739    tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
2740    tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
2741    z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
2742    tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
2743    tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
2744    z2    += z4;
2745    z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
2746    tmp1  += z1;
2747    tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
2748    z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
2749    tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
2750    tmp12 += z2;
2751    z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
2752    tmp2  += z2;
2753    tmp3  += z2;
2754    z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
2755    tmp10 += z2;
2756    tmp11 += z2;
2757
2758    /* Final output stage */
2759
2760    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0,
2761                                               CONST_BITS+PASS1_BITS+3)
2762                             & RANGE_MASK];
2763    outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0,
2764                                               CONST_BITS+PASS1_BITS+3)
2765                             & RANGE_MASK];
2766    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1,
2767                                               CONST_BITS+PASS1_BITS+3)
2768                             & RANGE_MASK];
2769    outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1,
2770                                               CONST_BITS+PASS1_BITS+3)
2771                             & RANGE_MASK];
2772    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2,
2773                                               CONST_BITS+PASS1_BITS+3)
2774                             & RANGE_MASK];
2775    outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2,
2776                                               CONST_BITS+PASS1_BITS+3)
2777                             & RANGE_MASK];
2778    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3,
2779                                               CONST_BITS+PASS1_BITS+3)
2780                             & RANGE_MASK];
2781    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3,
2782                                               CONST_BITS+PASS1_BITS+3)
2783                             & RANGE_MASK];
2784    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10,
2785                                               CONST_BITS+PASS1_BITS+3)
2786                             & RANGE_MASK];
2787    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10,
2788                                               CONST_BITS+PASS1_BITS+3)
2789                             & RANGE_MASK];
2790    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11,
2791                                               CONST_BITS+PASS1_BITS+3)
2792                             & RANGE_MASK];
2793    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11,
2794                                               CONST_BITS+PASS1_BITS+3)
2795                             & RANGE_MASK];
2796    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12,
2797                                               CONST_BITS+PASS1_BITS+3)
2798                             & RANGE_MASK];
2799    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12,
2800                                               CONST_BITS+PASS1_BITS+3)
2801                             & RANGE_MASK];
2802    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13,
2803                                               CONST_BITS+PASS1_BITS+3)
2804                             & RANGE_MASK];
2805    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13,
2806                                               CONST_BITS+PASS1_BITS+3)
2807                             & RANGE_MASK];
2808
2809    wsptr += 8;         /* advance pointer to next row */
2810  }
2811}
2812
2813
2814/*
2815 * Perform dequantization and inverse DCT on one block of coefficients,
2816 * producing a 16x8 output block.
2817 *
2818 * 8-point IDCT in pass 1 (columns), 16-point in pass 2 (rows).
2819 */
2820
2821GLOBAL(void)
2822jpeg_idct_16x8 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
2823                JCOEFPTR coef_block,
2824                JSAMPARRAY output_buf, JDIMENSION output_col)
2825{
2826  INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
2827  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
2828  INT32 z1, z2, z3, z4;
2829  JCOEFPTR inptr;
2830  ISLOW_MULT_TYPE * quantptr;
2831  int * wsptr;
2832  JSAMPROW outptr;
2833  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
2834  int ctr;
2835  int workspace[8*8];   /* buffers data between passes */
2836  SHIFT_TEMPS
2837
2838  /* Pass 1: process columns from input, store into work array. */
2839  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
2840  /* furthermore, we scale the results by 2**PASS1_BITS. */
2841
2842  inptr = coef_block;
2843  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
2844  wsptr = workspace;
2845  for (ctr = DCTSIZE; ctr > 0; ctr--) {
2846    /* Due to quantization, we will usually find that many of the input
2847     * coefficients are zero, especially the AC terms.  We can exploit this
2848     * by short-circuiting the IDCT calculation for any column in which all
2849     * the AC terms are zero.  In that case each output is equal to the
2850     * DC coefficient (with scale factor as needed).
2851     * With typical images and quantization tables, half or more of the
2852     * column DCT calculations can be simplified this way.
2853     */
2854   
2855    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
2856        inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
2857        inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
2858        inptr[DCTSIZE*7] == 0) {
2859      /* AC terms all zero */
2860      int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
2861     
2862      wsptr[DCTSIZE*0] = dcval;
2863      wsptr[DCTSIZE*1] = dcval;
2864      wsptr[DCTSIZE*2] = dcval;
2865      wsptr[DCTSIZE*3] = dcval;
2866      wsptr[DCTSIZE*4] = dcval;
2867      wsptr[DCTSIZE*5] = dcval;
2868      wsptr[DCTSIZE*6] = dcval;
2869      wsptr[DCTSIZE*7] = dcval;
2870     
2871      inptr++;                  /* advance pointers to next column */
2872      quantptr++;
2873      wsptr++;
2874      continue;
2875    }
2876   
2877    /* Even part: reverse the even part of the forward DCT. */
2878    /* The rotator is sqrt(2)*c(-6). */
2879   
2880    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
2881    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
2882   
2883    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
2884    tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
2885    tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
2886   
2887    z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
2888    z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
2889    z2 <<= CONST_BITS;
2890    z3 <<= CONST_BITS;
2891    /* Add fudge factor here for final descale. */
2892    z2 += ONE << (CONST_BITS-PASS1_BITS-1);
2893
2894    tmp0 = z2 + z3;
2895    tmp1 = z2 - z3;
2896   
2897    tmp10 = tmp0 + tmp2;
2898    tmp13 = tmp0 - tmp2;
2899    tmp11 = tmp1 + tmp3;
2900    tmp12 = tmp1 - tmp3;
2901   
2902    /* Odd part per figure 8; the matrix is unitary and hence its
2903     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
2904     */
2905   
2906    tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
2907    tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
2908    tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
2909    tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
2910   
2911    z2 = tmp0 + tmp2;
2912    z3 = tmp1 + tmp3;
2913
2914    z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
2915    z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
2916    z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
2917    z2 += z1;
2918    z3 += z1;
2919
2920    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
2921    tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
2922    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
2923    tmp0 += z1 + z2;
2924    tmp3 += z1 + z3;
2925
2926    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
2927    tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
2928    tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
2929    tmp1 += z1 + z3;
2930    tmp2 += z1 + z2;
2931   
2932    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
2933   
2934    wsptr[DCTSIZE*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
2935    wsptr[DCTSIZE*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
2936    wsptr[DCTSIZE*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
2937    wsptr[DCTSIZE*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
2938    wsptr[DCTSIZE*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
2939    wsptr[DCTSIZE*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
2940    wsptr[DCTSIZE*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
2941    wsptr[DCTSIZE*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
2942   
2943    inptr++;                    /* advance pointers to next column */
2944    quantptr++;
2945    wsptr++;
2946  }
2947
2948  /* Pass 2: process 8 rows from work array, store into output array.
2949   * 16-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
2950   */
2951  wsptr = workspace;
2952  for (ctr = 0; ctr < 8; ctr++) {
2953    outptr = output_buf[ctr] + output_col;
2954
2955    /* Even part */
2956
2957    /* Add fudge factor here for final descale. */
2958    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
2959    tmp0 <<= CONST_BITS;
2960
2961    z1 = (INT32) wsptr[4];
2962    tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
2963    tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
2964
2965    tmp10 = tmp0 + tmp1;
2966    tmp11 = tmp0 - tmp1;
2967    tmp12 = tmp0 + tmp2;
2968    tmp13 = tmp0 - tmp2;
2969
2970    z1 = (INT32) wsptr[2];
2971    z2 = (INT32) wsptr[6];
2972    z3 = z1 - z2;
2973    z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
2974    z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
2975
2976    tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
2977    tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
2978    tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
2979    tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
2980
2981    tmp20 = tmp10 + tmp0;
2982    tmp27 = tmp10 - tmp0;
2983    tmp21 = tmp12 + tmp1;
2984    tmp26 = tmp12 - tmp1;
2985    tmp22 = tmp13 + tmp2;
2986    tmp25 = tmp13 - tmp2;
2987    tmp23 = tmp11 + tmp3;
2988    tmp24 = tmp11 - tmp3;
2989
2990    /* Odd part */
2991
2992    z1 = (INT32) wsptr[1];
2993    z2 = (INT32) wsptr[3];
2994    z3 = (INT32) wsptr[5];
2995    z4 = (INT32) wsptr[7];
2996
2997    tmp11 = z1 + z3;
2998
2999    tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
3000    tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
3001    tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
3002    tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
3003    tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
3004    tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
3005    tmp0  = tmp1 + tmp2 + tmp3 -
3006            MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
3007    tmp13 = tmp10 + tmp11 + tmp12 -
3008            MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
3009    z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
3010    tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
3011    tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
3012    z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
3013    tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
3014    tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
3015    z2    += z4;
3016    z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
3017    tmp1  += z1;
3018    tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
3019    z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
3020    tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
3021    tmp12 += z2;
3022    z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
3023    tmp2  += z2;
3024    tmp3  += z2;
3025    z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
3026    tmp10 += z2;
3027    tmp11 += z2;
3028
3029    /* Final output stage */
3030
3031    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp0,
3032                                               CONST_BITS+PASS1_BITS+3)
3033                             & RANGE_MASK];
3034    outptr[15] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp0,
3035                                               CONST_BITS+PASS1_BITS+3)
3036                             & RANGE_MASK];
3037    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp1,
3038                                               CONST_BITS+PASS1_BITS+3)
3039                             & RANGE_MASK];
3040    outptr[14] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp1,
3041                                               CONST_BITS+PASS1_BITS+3)
3042                             & RANGE_MASK];
3043    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp2,
3044                                               CONST_BITS+PASS1_BITS+3)
3045                             & RANGE_MASK];
3046    outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp2,
3047                                               CONST_BITS+PASS1_BITS+3)
3048                             & RANGE_MASK];
3049    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp3,
3050                                               CONST_BITS+PASS1_BITS+3)
3051                             & RANGE_MASK];
3052    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp3,
3053                                               CONST_BITS+PASS1_BITS+3)
3054                             & RANGE_MASK];
3055    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp10,
3056                                               CONST_BITS+PASS1_BITS+3)
3057                             & RANGE_MASK];
3058    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp10,
3059                                               CONST_BITS+PASS1_BITS+3)
3060                             & RANGE_MASK];
3061    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp11,
3062                                               CONST_BITS+PASS1_BITS+3)
3063                             & RANGE_MASK];
3064    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp11,
3065                                               CONST_BITS+PASS1_BITS+3)
3066                             & RANGE_MASK];
3067    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp12,
3068                                               CONST_BITS+PASS1_BITS+3)
3069                             & RANGE_MASK];
3070    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp12,
3071                                               CONST_BITS+PASS1_BITS+3)
3072                             & RANGE_MASK];
3073    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp27 + tmp13,
3074                                               CONST_BITS+PASS1_BITS+3)
3075                             & RANGE_MASK];
3076    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp27 - tmp13,
3077                                               CONST_BITS+PASS1_BITS+3)
3078                             & RANGE_MASK];
3079
3080    wsptr += 8;         /* advance pointer to next row */
3081  }
3082}
3083
3084
3085/*
3086 * Perform dequantization and inverse DCT on one block of coefficients,
3087 * producing a 14x7 output block.
3088 *
3089 * 7-point IDCT in pass 1 (columns), 14-point in pass 2 (rows).
3090 */
3091
3092GLOBAL(void)
3093jpeg_idct_14x7 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3094                JCOEFPTR coef_block,
3095                JSAMPARRAY output_buf, JDIMENSION output_col)
3096{
3097  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
3098  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
3099  INT32 z1, z2, z3, z4;
3100  JCOEFPTR inptr;
3101  ISLOW_MULT_TYPE * quantptr;
3102  int * wsptr;
3103  JSAMPROW outptr;
3104  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
3105  int ctr;
3106  int workspace[8*7];   /* buffers data between passes */
3107  SHIFT_TEMPS
3108
3109  /* Pass 1: process columns from input, store into work array.
3110   * 7-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/14).
3111   */
3112  inptr = coef_block;
3113  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
3114  wsptr = workspace;
3115  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
3116    /* Even part */
3117
3118    tmp23 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
3119    tmp23 <<= CONST_BITS;
3120    /* Add fudge factor here for final descale. */
3121    tmp23 += ONE << (CONST_BITS-PASS1_BITS-1);
3122
3123    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
3124    z2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
3125    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
3126
3127    tmp20 = MULTIPLY(z2 - z3, FIX(0.881747734));       /* c4 */
3128    tmp22 = MULTIPLY(z1 - z2, FIX(0.314692123));       /* c6 */
3129    tmp21 = tmp20 + tmp22 + tmp23 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
3130    tmp10 = z1 + z3;
3131    z2 -= tmp10;
3132    tmp10 = MULTIPLY(tmp10, FIX(1.274162392)) + tmp23; /* c2 */
3133    tmp20 += tmp10 - MULTIPLY(z3, FIX(0.077722536));   /* c2-c4-c6 */
3134    tmp22 += tmp10 - MULTIPLY(z1, FIX(2.470602249));   /* c2+c4+c6 */
3135    tmp23 += MULTIPLY(z2, FIX(1.414213562));           /* c0 */
3136
3137    /* Odd part */
3138
3139    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
3140    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
3141    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
3142
3143    tmp11 = MULTIPLY(z1 + z2, FIX(0.935414347));       /* (c3+c1-c5)/2 */
3144    tmp12 = MULTIPLY(z1 - z2, FIX(0.170262339));       /* (c3+c5-c1)/2 */
3145    tmp10 = tmp11 - tmp12;
3146    tmp11 += tmp12;
3147    tmp12 = MULTIPLY(z2 + z3, - FIX(1.378756276));     /* -c1 */
3148    tmp11 += tmp12;
3149    z2 = MULTIPLY(z1 + z3, FIX(0.613604268));          /* c5 */
3150    tmp10 += z2;
3151    tmp12 += z2 + MULTIPLY(z3, FIX(1.870828693));      /* c3+c1-c5 */
3152
3153    /* Final output stage */
3154
3155    wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
3156    wsptr[8*6] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
3157    wsptr[8*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
3158    wsptr[8*5] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
3159    wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
3160    wsptr[8*4] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
3161    wsptr[8*3] = (int) RIGHT_SHIFT(tmp23, CONST_BITS-PASS1_BITS);
3162  }
3163
3164  /* Pass 2: process 7 rows from work array, store into output array.
3165   * 14-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/28).
3166   */
3167  wsptr = workspace;
3168  for (ctr = 0; ctr < 7; ctr++) {
3169    outptr = output_buf[ctr] + output_col;
3170
3171    /* Even part */
3172
3173    /* Add fudge factor here for final descale. */
3174    z1 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
3175    z1 <<= CONST_BITS;
3176    z4 = (INT32) wsptr[4];
3177    z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
3178    z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
3179    z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
3180
3181    tmp10 = z1 + z2;
3182    tmp11 = z1 + z3;
3183    tmp12 = z1 - z4;
3184
3185    tmp23 = z1 - ((z2 + z3 - z4) << 1);          /* c0 = (c4+c12-c8)*2 */
3186
3187    z1 = (INT32) wsptr[2];
3188    z2 = (INT32) wsptr[6];
3189
3190    z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
3191
3192    tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
3193    tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
3194    tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
3195            MULTIPLY(z2, FIX(1.378756276));      /* c2 */
3196
3197    tmp20 = tmp10 + tmp13;
3198    tmp26 = tmp10 - tmp13;
3199    tmp21 = tmp11 + tmp14;
3200    tmp25 = tmp11 - tmp14;
3201    tmp22 = tmp12 + tmp15;
3202    tmp24 = tmp12 - tmp15;
3203
3204    /* Odd part */
3205
3206    z1 = (INT32) wsptr[1];
3207    z2 = (INT32) wsptr[3];
3208    z3 = (INT32) wsptr[5];
3209    z4 = (INT32) wsptr[7];
3210    z4 <<= CONST_BITS;
3211
3212    tmp14 = z1 + z3;
3213    tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
3214    tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
3215    tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
3216    tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
3217    tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
3218    z1    -= z2;
3219    tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4;           /* c11 */
3220    tmp16 += tmp15;
3221    tmp13 = MULTIPLY(z2 + z3, - FIX(0.158341681)) - z4;    /* -c13 */
3222    tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948));       /* c3-c9-c13 */
3223    tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773));       /* c3+c5-c13 */
3224    tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
3225    tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
3226    tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567));       /* c1+c11-c5 */
3227
3228    tmp13 = ((z1 - z3) << CONST_BITS) + z4;
3229
3230    /* Final output stage */
3231
3232    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
3233                                               CONST_BITS+PASS1_BITS+3)
3234                             & RANGE_MASK];
3235    outptr[13] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
3236                                               CONST_BITS+PASS1_BITS+3)
3237                             & RANGE_MASK];
3238    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
3239                                               CONST_BITS+PASS1_BITS+3)
3240                             & RANGE_MASK];
3241    outptr[12] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
3242                                               CONST_BITS+PASS1_BITS+3)
3243                             & RANGE_MASK];
3244    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
3245                                               CONST_BITS+PASS1_BITS+3)
3246                             & RANGE_MASK];
3247    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
3248                                               CONST_BITS+PASS1_BITS+3)
3249                             & RANGE_MASK];
3250    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
3251                                               CONST_BITS+PASS1_BITS+3)
3252                             & RANGE_MASK];
3253    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
3254                                               CONST_BITS+PASS1_BITS+3)
3255                             & RANGE_MASK];
3256    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
3257                                               CONST_BITS+PASS1_BITS+3)
3258                             & RANGE_MASK];
3259    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
3260                                               CONST_BITS+PASS1_BITS+3)
3261                             & RANGE_MASK];
3262    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
3263                                               CONST_BITS+PASS1_BITS+3)
3264                             & RANGE_MASK];
3265    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
3266                                               CONST_BITS+PASS1_BITS+3)
3267                             & RANGE_MASK];
3268    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp26 + tmp16,
3269                                               CONST_BITS+PASS1_BITS+3)
3270                             & RANGE_MASK];
3271    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp26 - tmp16,
3272                                               CONST_BITS+PASS1_BITS+3)
3273                             & RANGE_MASK];
3274
3275    wsptr += 8;         /* advance pointer to next row */
3276  }
3277}
3278
3279
3280/*
3281 * Perform dequantization and inverse DCT on one block of coefficients,
3282 * producing a 12x6 output block.
3283 *
3284 * 6-point IDCT in pass 1 (columns), 12-point in pass 2 (rows).
3285 */
3286
3287GLOBAL(void)
3288jpeg_idct_12x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3289                JCOEFPTR coef_block,
3290                JSAMPARRAY output_buf, JDIMENSION output_col)
3291{
3292  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
3293  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
3294  INT32 z1, z2, z3, z4;
3295  JCOEFPTR inptr;
3296  ISLOW_MULT_TYPE * quantptr;
3297  int * wsptr;
3298  JSAMPROW outptr;
3299  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
3300  int ctr;
3301  int workspace[8*6];   /* buffers data between passes */
3302  SHIFT_TEMPS
3303
3304  /* Pass 1: process columns from input, store into work array.
3305   * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
3306   */
3307  inptr = coef_block;
3308  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
3309  wsptr = workspace;
3310  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
3311    /* Even part */
3312
3313    tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
3314    tmp10 <<= CONST_BITS;
3315    /* Add fudge factor here for final descale. */
3316    tmp10 += ONE << (CONST_BITS-PASS1_BITS-1);
3317    tmp12 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
3318    tmp20 = MULTIPLY(tmp12, FIX(0.707106781));   /* c4 */
3319    tmp11 = tmp10 + tmp20;
3320    tmp21 = RIGHT_SHIFT(tmp10 - tmp20 - tmp20, CONST_BITS-PASS1_BITS);
3321    tmp20 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
3322    tmp10 = MULTIPLY(tmp20, FIX(1.224744871));   /* c2 */
3323    tmp20 = tmp11 + tmp10;
3324    tmp22 = tmp11 - tmp10;
3325
3326    /* Odd part */
3327
3328    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
3329    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
3330    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
3331    tmp11 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
3332    tmp10 = tmp11 + ((z1 + z2) << CONST_BITS);
3333    tmp12 = tmp11 + ((z3 - z2) << CONST_BITS);
3334    tmp11 = (z1 - z2 - z3) << PASS1_BITS;
3335
3336    /* Final output stage */
3337
3338    wsptr[8*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
3339    wsptr[8*5] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
3340    wsptr[8*1] = (int) (tmp21 + tmp11);
3341    wsptr[8*4] = (int) (tmp21 - tmp11);
3342    wsptr[8*2] = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
3343    wsptr[8*3] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
3344  }
3345
3346  /* Pass 2: process 6 rows from work array, store into output array.
3347   * 12-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/24).
3348   */
3349  wsptr = workspace;
3350  for (ctr = 0; ctr < 6; ctr++) {
3351    outptr = output_buf[ctr] + output_col;
3352
3353    /* Even part */
3354
3355    /* Add fudge factor here for final descale. */
3356    z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
3357    z3 <<= CONST_BITS;
3358
3359    z4 = (INT32) wsptr[4];
3360    z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
3361
3362    tmp10 = z3 + z4;
3363    tmp11 = z3 - z4;
3364
3365    z1 = (INT32) wsptr[2];
3366    z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
3367    z1 <<= CONST_BITS;
3368    z2 = (INT32) wsptr[6];
3369    z2 <<= CONST_BITS;
3370
3371    tmp12 = z1 - z2;
3372
3373    tmp21 = z3 + tmp12;
3374    tmp24 = z3 - tmp12;
3375
3376    tmp12 = z4 + z2;
3377
3378    tmp20 = tmp10 + tmp12;
3379    tmp25 = tmp10 - tmp12;
3380
3381    tmp12 = z4 - z1 - z2;
3382
3383    tmp22 = tmp11 + tmp12;
3384    tmp23 = tmp11 - tmp12;
3385
3386    /* Odd part */
3387
3388    z1 = (INT32) wsptr[1];
3389    z2 = (INT32) wsptr[3];
3390    z3 = (INT32) wsptr[5];
3391    z4 = (INT32) wsptr[7];
3392
3393    tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
3394    tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
3395
3396    tmp10 = z1 + z3;
3397    tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
3398    tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
3399    tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
3400    tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
3401    tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
3402    tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
3403    tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
3404             MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
3405
3406    z1 -= z4;
3407    z2 -= z3;
3408    z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
3409    tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
3410    tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
3411
3412    /* Final output stage */
3413
3414    outptr[0]  = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
3415                                               CONST_BITS+PASS1_BITS+3)
3416                             & RANGE_MASK];
3417    outptr[11] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
3418                                               CONST_BITS+PASS1_BITS+3)
3419                             & RANGE_MASK];
3420    outptr[1]  = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
3421                                               CONST_BITS+PASS1_BITS+3)
3422                             & RANGE_MASK];
3423    outptr[10] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
3424                                               CONST_BITS+PASS1_BITS+3)
3425                             & RANGE_MASK];
3426    outptr[2]  = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
3427                                               CONST_BITS+PASS1_BITS+3)
3428                             & RANGE_MASK];
3429    outptr[9]  = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
3430                                               CONST_BITS+PASS1_BITS+3)
3431                             & RANGE_MASK];
3432    outptr[3]  = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
3433                                               CONST_BITS+PASS1_BITS+3)
3434                             & RANGE_MASK];
3435    outptr[8]  = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
3436                                               CONST_BITS+PASS1_BITS+3)
3437                             & RANGE_MASK];
3438    outptr[4]  = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
3439                                               CONST_BITS+PASS1_BITS+3)
3440                             & RANGE_MASK];
3441    outptr[7]  = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
3442                                               CONST_BITS+PASS1_BITS+3)
3443                             & RANGE_MASK];
3444    outptr[5]  = range_limit[(int) RIGHT_SHIFT(tmp25 + tmp15,
3445                                               CONST_BITS+PASS1_BITS+3)
3446                             & RANGE_MASK];
3447    outptr[6]  = range_limit[(int) RIGHT_SHIFT(tmp25 - tmp15,
3448                                               CONST_BITS+PASS1_BITS+3)
3449                             & RANGE_MASK];
3450
3451    wsptr += 8;         /* advance pointer to next row */
3452  }
3453}
3454
3455
3456/*
3457 * Perform dequantization and inverse DCT on one block of coefficients,
3458 * producing a 10x5 output block.
3459 *
3460 * 5-point IDCT in pass 1 (columns), 10-point in pass 2 (rows).
3461 */
3462
3463GLOBAL(void)
3464jpeg_idct_10x5 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3465                JCOEFPTR coef_block,
3466                JSAMPARRAY output_buf, JDIMENSION output_col)
3467{
3468  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
3469  INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
3470  INT32 z1, z2, z3, z4;
3471  JCOEFPTR inptr;
3472  ISLOW_MULT_TYPE * quantptr;
3473  int * wsptr;
3474  JSAMPROW outptr;
3475  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
3476  int ctr;
3477  int workspace[8*5];   /* buffers data between passes */
3478  SHIFT_TEMPS
3479
3480  /* Pass 1: process columns from input, store into work array.
3481   * 5-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/10).
3482   */
3483  inptr = coef_block;
3484  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
3485  wsptr = workspace;
3486  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
3487    /* Even part */
3488
3489    tmp12 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
3490    tmp12 <<= CONST_BITS;
3491    /* Add fudge factor here for final descale. */
3492    tmp12 += ONE << (CONST_BITS-PASS1_BITS-1);
3493    tmp13 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
3494    tmp14 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
3495    z1 = MULTIPLY(tmp13 + tmp14, FIX(0.790569415)); /* (c2+c4)/2 */
3496    z2 = MULTIPLY(tmp13 - tmp14, FIX(0.353553391)); /* (c2-c4)/2 */
3497    z3 = tmp12 + z2;
3498    tmp10 = z3 + z1;
3499    tmp11 = z3 - z1;
3500    tmp12 -= z2 << 2;
3501
3502    /* Odd part */
3503
3504    z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
3505    z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
3506
3507    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));       /* c3 */
3508    tmp13 = z1 + MULTIPLY(z2, FIX(0.513743148));    /* c1-c3 */
3509    tmp14 = z1 - MULTIPLY(z3, FIX(2.176250899));    /* c1+c3 */
3510
3511    /* Final output stage */
3512
3513    wsptr[8*0] = (int) RIGHT_SHIFT(tmp10 + tmp13, CONST_BITS-PASS1_BITS);
3514    wsptr[8*4] = (int) RIGHT_SHIFT(tmp10 - tmp13, CONST_BITS-PASS1_BITS);
3515    wsptr[8*1] = (int) RIGHT_SHIFT(tmp11 + tmp14, CONST_BITS-PASS1_BITS);
3516    wsptr[8*3] = (int) RIGHT_SHIFT(tmp11 - tmp14, CONST_BITS-PASS1_BITS);
3517    wsptr[8*2] = (int) RIGHT_SHIFT(tmp12, CONST_BITS-PASS1_BITS);
3518  }
3519
3520  /* Pass 2: process 5 rows from work array, store into output array.
3521   * 10-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/20).
3522   */
3523  wsptr = workspace;
3524  for (ctr = 0; ctr < 5; ctr++) {
3525    outptr = output_buf[ctr] + output_col;
3526
3527    /* Even part */
3528
3529    /* Add fudge factor here for final descale. */
3530    z3 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
3531    z3 <<= CONST_BITS;
3532    z4 = (INT32) wsptr[4];
3533    z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
3534    z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
3535    tmp10 = z3 + z1;
3536    tmp11 = z3 - z2;
3537
3538    tmp22 = z3 - ((z1 - z2) << 1);               /* c0 = (c4-c8)*2 */
3539
3540    z2 = (INT32) wsptr[2];
3541    z3 = (INT32) wsptr[6];
3542
3543    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
3544    tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
3545    tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
3546
3547    tmp20 = tmp10 + tmp12;
3548    tmp24 = tmp10 - tmp12;
3549    tmp21 = tmp11 + tmp13;
3550    tmp23 = tmp11 - tmp13;
3551
3552    /* Odd part */
3553
3554    z1 = (INT32) wsptr[1];
3555    z2 = (INT32) wsptr[3];
3556    z3 = (INT32) wsptr[5];
3557    z3 <<= CONST_BITS;
3558    z4 = (INT32) wsptr[7];
3559
3560    tmp11 = z2 + z4;
3561    tmp13 = z2 - z4;
3562
3563    tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
3564
3565    z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
3566    z4 = z3 + tmp12;
3567
3568    tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
3569    tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
3570
3571    z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
3572    z4 = z3 - tmp12 - (tmp13 << (CONST_BITS - 1));
3573
3574    tmp12 = ((z1 - tmp13) << CONST_BITS) - z3;
3575
3576    tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
3577    tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
3578
3579    /* Final output stage */
3580
3581    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
3582                                              CONST_BITS+PASS1_BITS+3)
3583                            & RANGE_MASK];
3584    outptr[9] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
3585                                              CONST_BITS+PASS1_BITS+3)
3586                            & RANGE_MASK];
3587    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
3588                                              CONST_BITS+PASS1_BITS+3)
3589                            & RANGE_MASK];
3590    outptr[8] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
3591                                              CONST_BITS+PASS1_BITS+3)
3592                            & RANGE_MASK];
3593    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
3594                                              CONST_BITS+PASS1_BITS+3)
3595                            & RANGE_MASK];
3596    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
3597                                              CONST_BITS+PASS1_BITS+3)
3598                            & RANGE_MASK];
3599    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23 + tmp13,
3600                                              CONST_BITS+PASS1_BITS+3)
3601                            & RANGE_MASK];
3602    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp23 - tmp13,
3603                                              CONST_BITS+PASS1_BITS+3)
3604                            & RANGE_MASK];
3605    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp24 + tmp14,
3606                                              CONST_BITS+PASS1_BITS+3)
3607                            & RANGE_MASK];
3608    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp24 - tmp14,
3609                                              CONST_BITS+PASS1_BITS+3)
3610                            & RANGE_MASK];
3611
3612    wsptr += 8;         /* advance pointer to next row */
3613  }
3614}
3615
3616
3617/*
3618 * Perform dequantization and inverse DCT on one block of coefficients,
3619 * producing a 8x4 output block.
3620 *
3621 * 4-point IDCT in pass 1 (columns), 8-point in pass 2 (rows).
3622 */
3623
3624GLOBAL(void)
3625jpeg_idct_8x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3626               JCOEFPTR coef_block,
3627               JSAMPARRAY output_buf, JDIMENSION output_col)
3628{
3629  INT32 tmp0, tmp1, tmp2, tmp3;
3630  INT32 tmp10, tmp11, tmp12, tmp13;
3631  INT32 z1, z2, z3;
3632  JCOEFPTR inptr;
3633  ISLOW_MULT_TYPE * quantptr;
3634  int * wsptr;
3635  JSAMPROW outptr;
3636  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
3637  int ctr;
3638  int workspace[8*4];   /* buffers data between passes */
3639  SHIFT_TEMPS
3640
3641  /* Pass 1: process columns from input, store into work array.
3642   * 4-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
3643   */
3644  inptr = coef_block;
3645  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
3646  wsptr = workspace;
3647  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
3648    /* Even part */
3649
3650    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
3651    tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
3652
3653    tmp10 = (tmp0 + tmp2) << PASS1_BITS;
3654    tmp12 = (tmp0 - tmp2) << PASS1_BITS;
3655
3656    /* Odd part */
3657    /* Same rotation as in the even part of the 8x8 LL&M IDCT */
3658
3659    z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
3660    z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
3661
3662    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);               /* c6 */
3663    /* Add fudge factor here for final descale. */
3664    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
3665    tmp0 = RIGHT_SHIFT(z1 + MULTIPLY(z2, FIX_0_765366865), /* c2-c6 */
3666                       CONST_BITS-PASS1_BITS);
3667    tmp2 = RIGHT_SHIFT(z1 - MULTIPLY(z3, FIX_1_847759065), /* c2+c6 */
3668                       CONST_BITS-PASS1_BITS);
3669
3670    /* Final output stage */
3671
3672    wsptr[8*0] = (int) (tmp10 + tmp0);
3673    wsptr[8*3] = (int) (tmp10 - tmp0);
3674    wsptr[8*1] = (int) (tmp12 + tmp2);
3675    wsptr[8*2] = (int) (tmp12 - tmp2);
3676  }
3677
3678  /* Pass 2: process rows from work array, store into output array. */
3679  /* Note that we must descale the results by a factor of 8 == 2**3, */
3680  /* and also undo the PASS1_BITS scaling. */
3681
3682  wsptr = workspace;
3683  for (ctr = 0; ctr < 4; ctr++) {
3684    outptr = output_buf[ctr] + output_col;
3685
3686    /* Even part: reverse the even part of the forward DCT. */
3687    /* The rotator is sqrt(2)*c(-6). */
3688
3689    z2 = (INT32) wsptr[2];
3690    z3 = (INT32) wsptr[6];
3691   
3692    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
3693    tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
3694    tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
3695   
3696    /* Add fudge factor here for final descale. */
3697    z2 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
3698    z3 = (INT32) wsptr[4];
3699   
3700    tmp0 = (z2 + z3) << CONST_BITS;
3701    tmp1 = (z2 - z3) << CONST_BITS;
3702   
3703    tmp10 = tmp0 + tmp2;
3704    tmp13 = tmp0 - tmp2;
3705    tmp11 = tmp1 + tmp3;
3706    tmp12 = tmp1 - tmp3;
3707
3708    /* Odd part per figure 8; the matrix is unitary and hence its
3709     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
3710     */
3711
3712    tmp0 = (INT32) wsptr[7];
3713    tmp1 = (INT32) wsptr[5];
3714    tmp2 = (INT32) wsptr[3];
3715    tmp3 = (INT32) wsptr[1];
3716
3717    z2 = tmp0 + tmp2;
3718    z3 = tmp1 + tmp3;
3719
3720    z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
3721    z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
3722    z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
3723    z2 += z1;
3724    z3 += z1;
3725
3726    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
3727    tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
3728    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
3729    tmp0 += z1 + z2;
3730    tmp3 += z1 + z3;
3731
3732    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
3733    tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
3734    tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
3735    tmp1 += z1 + z3;
3736    tmp2 += z1 + z2;
3737
3738    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
3739
3740    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
3741                                              CONST_BITS+PASS1_BITS+3)
3742                            & RANGE_MASK];
3743    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
3744                                              CONST_BITS+PASS1_BITS+3)
3745                            & RANGE_MASK];
3746    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
3747                                              CONST_BITS+PASS1_BITS+3)
3748                            & RANGE_MASK];
3749    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
3750                                              CONST_BITS+PASS1_BITS+3)
3751                            & RANGE_MASK];
3752    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
3753                                              CONST_BITS+PASS1_BITS+3)
3754                            & RANGE_MASK];
3755    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
3756                                              CONST_BITS+PASS1_BITS+3)
3757                            & RANGE_MASK];
3758    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
3759                                              CONST_BITS+PASS1_BITS+3)
3760                            & RANGE_MASK];
3761    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
3762                                              CONST_BITS+PASS1_BITS+3)
3763                            & RANGE_MASK];
3764
3765    wsptr += DCTSIZE;           /* advance pointer to next row */
3766  }
3767}
3768
3769
3770/*
3771 * Perform dequantization and inverse DCT on one block of coefficients,
3772 * producing a reduced-size 6x3 output block.
3773 *
3774 * 3-point IDCT in pass 1 (columns), 6-point in pass 2 (rows).
3775 */
3776
3777GLOBAL(void)
3778jpeg_idct_6x3 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3779               JCOEFPTR coef_block,
3780               JSAMPARRAY output_buf, JDIMENSION output_col)
3781{
3782  INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
3783  INT32 z1, z2, z3;
3784  JCOEFPTR inptr;
3785  ISLOW_MULT_TYPE * quantptr;
3786  int * wsptr;
3787  JSAMPROW outptr;
3788  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
3789  int ctr;
3790  int workspace[6*3];   /* buffers data between passes */
3791  SHIFT_TEMPS
3792
3793  /* Pass 1: process columns from input, store into work array.
3794   * 3-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/6).
3795   */
3796  inptr = coef_block;
3797  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
3798  wsptr = workspace;
3799  for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
3800    /* Even part */
3801
3802    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
3803    tmp0 <<= CONST_BITS;
3804    /* Add fudge factor here for final descale. */
3805    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
3806    tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
3807    tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
3808    tmp10 = tmp0 + tmp12;
3809    tmp2 = tmp0 - tmp12 - tmp12;
3810
3811    /* Odd part */
3812
3813    tmp12 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
3814    tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
3815
3816    /* Final output stage */
3817
3818    wsptr[6*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
3819    wsptr[6*2] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
3820    wsptr[6*1] = (int) RIGHT_SHIFT(tmp2, CONST_BITS-PASS1_BITS);
3821  }
3822 
3823  /* Pass 2: process 3 rows from work array, store into output array.
3824   * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
3825   */
3826  wsptr = workspace;
3827  for (ctr = 0; ctr < 3; ctr++) {
3828    outptr = output_buf[ctr] + output_col;
3829
3830    /* Even part */
3831
3832    /* Add fudge factor here for final descale. */
3833    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
3834    tmp0 <<= CONST_BITS;
3835    tmp2 = (INT32) wsptr[4];
3836    tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
3837    tmp1 = tmp0 + tmp10;
3838    tmp11 = tmp0 - tmp10 - tmp10;
3839    tmp10 = (INT32) wsptr[2];
3840    tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
3841    tmp10 = tmp1 + tmp0;
3842    tmp12 = tmp1 - tmp0;
3843
3844    /* Odd part */
3845
3846    z1 = (INT32) wsptr[1];
3847    z2 = (INT32) wsptr[3];
3848    z3 = (INT32) wsptr[5];
3849    tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
3850    tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
3851    tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
3852    tmp1 = (z1 - z2 - z3) << CONST_BITS;
3853
3854    /* Final output stage */
3855
3856    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
3857                                              CONST_BITS+PASS1_BITS+3)
3858                            & RANGE_MASK];
3859    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
3860                                              CONST_BITS+PASS1_BITS+3)
3861                            & RANGE_MASK];
3862    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp1,
3863                                              CONST_BITS+PASS1_BITS+3)
3864                            & RANGE_MASK];
3865    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp1,
3866                                              CONST_BITS+PASS1_BITS+3)
3867                            & RANGE_MASK];
3868    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
3869                                              CONST_BITS+PASS1_BITS+3)
3870                            & RANGE_MASK];
3871    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
3872                                              CONST_BITS+PASS1_BITS+3)
3873                            & RANGE_MASK];
3874
3875    wsptr += 6;         /* advance pointer to next row */
3876  }
3877}
3878
3879
3880/*
3881 * Perform dequantization and inverse DCT on one block of coefficients,
3882 * producing a 4x2 output block.
3883 *
3884 * 2-point IDCT in pass 1 (columns), 4-point in pass 2 (rows).
3885 */
3886
3887GLOBAL(void)
3888jpeg_idct_4x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3889               JCOEFPTR coef_block,
3890               JSAMPARRAY output_buf, JDIMENSION output_col)
3891{
3892  INT32 tmp0, tmp2, tmp10, tmp12;
3893  INT32 z1, z2, z3;
3894  JCOEFPTR inptr;
3895  ISLOW_MULT_TYPE * quantptr;
3896  INT32 * wsptr;
3897  JSAMPROW outptr;
3898  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
3899  int ctr;
3900  INT32 workspace[4*2]; /* buffers data between passes */
3901  SHIFT_TEMPS
3902
3903  /* Pass 1: process columns from input, store into work array. */
3904
3905  inptr = coef_block;
3906  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
3907  wsptr = workspace;
3908  for (ctr = 0; ctr < 4; ctr++, inptr++, quantptr++, wsptr++) {
3909    /* Even part */
3910
3911    tmp10 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
3912
3913    /* Odd part */
3914
3915    tmp0 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
3916
3917    /* Final output stage */
3918
3919    wsptr[4*0] = tmp10 + tmp0;
3920    wsptr[4*1] = tmp10 - tmp0;
3921  }
3922
3923  /* Pass 2: process 2 rows from work array, store into output array.
3924   * 4-point IDCT kernel,
3925   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
3926   */
3927  wsptr = workspace;
3928  for (ctr = 0; ctr < 2; ctr++) {
3929    outptr = output_buf[ctr] + output_col;
3930
3931    /* Even part */
3932
3933    /* Add fudge factor here for final descale. */
3934    tmp0 = wsptr[0] + (ONE << 2);
3935    tmp2 = wsptr[2];
3936
3937    tmp10 = (tmp0 + tmp2) << CONST_BITS;
3938    tmp12 = (tmp0 - tmp2) << CONST_BITS;
3939
3940    /* Odd part */
3941    /* Same rotation as in the even part of the 8x8 LL&M IDCT */
3942
3943    z2 = wsptr[1];
3944    z3 = wsptr[3];
3945
3946    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
3947    tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
3948    tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
3949
3950    /* Final output stage */
3951
3952    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
3953                                              CONST_BITS+3)
3954                            & RANGE_MASK];
3955    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
3956                                              CONST_BITS+3)
3957                            & RANGE_MASK];
3958    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
3959                                              CONST_BITS+3)
3960                            & RANGE_MASK];
3961    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
3962                                              CONST_BITS+3)
3963                            & RANGE_MASK];
3964
3965    wsptr += 4;         /* advance pointer to next row */
3966  }
3967}
3968
3969
3970/*
3971 * Perform dequantization and inverse DCT on one block of coefficients,
3972 * producing a 2x1 output block.
3973 *
3974 * 1-point IDCT in pass 1 (columns), 2-point in pass 2 (rows).
3975 */
3976
3977GLOBAL(void)
3978jpeg_idct_2x1 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
3979               JCOEFPTR coef_block,
3980               JSAMPARRAY output_buf, JDIMENSION output_col)
3981{
3982  INT32 tmp0, tmp10;
3983  ISLOW_MULT_TYPE * quantptr;
3984  JSAMPROW outptr;
3985  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
3986  SHIFT_TEMPS
3987
3988  /* Pass 1: empty. */
3989
3990  /* Pass 2: process 1 row from input, store into output array. */
3991
3992  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
3993  outptr = output_buf[0] + output_col;
3994
3995  /* Even part */
3996
3997  tmp10 = DEQUANTIZE(coef_block[0], quantptr[0]);
3998  /* Add fudge factor here for final descale. */
3999  tmp10 += ONE << 2;
4000
4001  /* Odd part */
4002
4003  tmp0 = DEQUANTIZE(coef_block[1], quantptr[1]);
4004
4005  /* Final output stage */
4006
4007  outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, 3) & RANGE_MASK];
4008  outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, 3) & RANGE_MASK];
4009}
4010
4011
4012/*
4013 * Perform dequantization and inverse DCT on one block of coefficients,
4014 * producing a 8x16 output block.
4015 *
4016 * 16-point IDCT in pass 1 (columns), 8-point in pass 2 (rows).
4017 */
4018
4019GLOBAL(void)
4020jpeg_idct_8x16 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4021                JCOEFPTR coef_block,
4022                JSAMPARRAY output_buf, JDIMENSION output_col)
4023{
4024  INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
4025  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
4026  INT32 z1, z2, z3, z4;
4027  JCOEFPTR inptr;
4028  ISLOW_MULT_TYPE * quantptr;
4029  int * wsptr;
4030  JSAMPROW outptr;
4031  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4032  int ctr;
4033  int workspace[8*16];  /* buffers data between passes */
4034  SHIFT_TEMPS
4035
4036  /* Pass 1: process columns from input, store into work array.
4037   * 16-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
4038   */
4039  inptr = coef_block;
4040  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4041  wsptr = workspace;
4042  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
4043    /* Even part */
4044
4045    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4046    tmp0 <<= CONST_BITS;
4047    /* Add fudge factor here for final descale. */
4048    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
4049
4050    z1 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
4051    tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
4052    tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
4053
4054    tmp10 = tmp0 + tmp1;
4055    tmp11 = tmp0 - tmp1;
4056    tmp12 = tmp0 + tmp2;
4057    tmp13 = tmp0 - tmp2;
4058
4059    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
4060    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
4061    z3 = z1 - z2;
4062    z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
4063    z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
4064
4065    tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
4066    tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
4067    tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
4068    tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
4069
4070    tmp20 = tmp10 + tmp0;
4071    tmp27 = tmp10 - tmp0;
4072    tmp21 = tmp12 + tmp1;
4073    tmp26 = tmp12 - tmp1;
4074    tmp22 = tmp13 + tmp2;
4075    tmp25 = tmp13 - tmp2;
4076    tmp23 = tmp11 + tmp3;
4077    tmp24 = tmp11 - tmp3;
4078
4079    /* Odd part */
4080
4081    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
4082    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
4083    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
4084    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
4085
4086    tmp11 = z1 + z3;
4087
4088    tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
4089    tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
4090    tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
4091    tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
4092    tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
4093    tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
4094    tmp0  = tmp1 + tmp2 + tmp3 -
4095            MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
4096    tmp13 = tmp10 + tmp11 + tmp12 -
4097            MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
4098    z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
4099    tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
4100    tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
4101    z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
4102    tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
4103    tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
4104    z2    += z4;
4105    z1    = MULTIPLY(z2, - FIX(0.666655658));      /* -c11 */
4106    tmp1  += z1;
4107    tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
4108    z2    = MULTIPLY(z2, - FIX(1.247225013));      /* -c5 */
4109    tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
4110    tmp12 += z2;
4111    z2    = MULTIPLY(z3 + z4, - FIX(1.353318001)); /* -c3 */
4112    tmp2  += z2;
4113    tmp3  += z2;
4114    z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
4115    tmp10 += z2;
4116    tmp11 += z2;
4117
4118    /* Final output stage */
4119
4120    wsptr[8*0]  = (int) RIGHT_SHIFT(tmp20 + tmp0,  CONST_BITS-PASS1_BITS);
4121    wsptr[8*15] = (int) RIGHT_SHIFT(tmp20 - tmp0,  CONST_BITS-PASS1_BITS);
4122    wsptr[8*1]  = (int) RIGHT_SHIFT(tmp21 + tmp1,  CONST_BITS-PASS1_BITS);
4123    wsptr[8*14] = (int) RIGHT_SHIFT(tmp21 - tmp1,  CONST_BITS-PASS1_BITS);
4124    wsptr[8*2]  = (int) RIGHT_SHIFT(tmp22 + tmp2,  CONST_BITS-PASS1_BITS);
4125    wsptr[8*13] = (int) RIGHT_SHIFT(tmp22 - tmp2,  CONST_BITS-PASS1_BITS);
4126    wsptr[8*3]  = (int) RIGHT_SHIFT(tmp23 + tmp3,  CONST_BITS-PASS1_BITS);
4127    wsptr[8*12] = (int) RIGHT_SHIFT(tmp23 - tmp3,  CONST_BITS-PASS1_BITS);
4128    wsptr[8*4]  = (int) RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS-PASS1_BITS);
4129    wsptr[8*11] = (int) RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS-PASS1_BITS);
4130    wsptr[8*5]  = (int) RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS-PASS1_BITS);
4131    wsptr[8*10] = (int) RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS-PASS1_BITS);
4132    wsptr[8*6]  = (int) RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS-PASS1_BITS);
4133    wsptr[8*9]  = (int) RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS-PASS1_BITS);
4134    wsptr[8*7]  = (int) RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS-PASS1_BITS);
4135    wsptr[8*8]  = (int) RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS-PASS1_BITS);
4136  }
4137 
4138  /* Pass 2: process rows from work array, store into output array. */
4139  /* Note that we must descale the results by a factor of 8 == 2**3, */
4140  /* and also undo the PASS1_BITS scaling. */
4141
4142  wsptr = workspace;
4143  for (ctr = 0; ctr < 16; ctr++) {
4144    outptr = output_buf[ctr] + output_col;
4145   
4146    /* Even part: reverse the even part of the forward DCT. */
4147    /* The rotator is sqrt(2)*c(-6). */
4148   
4149    z2 = (INT32) wsptr[2];
4150    z3 = (INT32) wsptr[6];
4151   
4152    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
4153    tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
4154    tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
4155   
4156    /* Add fudge factor here for final descale. */
4157    z2 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
4158    z3 = (INT32) wsptr[4];
4159   
4160    tmp0 = (z2 + z3) << CONST_BITS;
4161    tmp1 = (z2 - z3) << CONST_BITS;
4162   
4163    tmp10 = tmp0 + tmp2;
4164    tmp13 = tmp0 - tmp2;
4165    tmp11 = tmp1 + tmp3;
4166    tmp12 = tmp1 - tmp3;
4167   
4168    /* Odd part per figure 8; the matrix is unitary and hence its
4169     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
4170     */
4171   
4172    tmp0 = (INT32) wsptr[7];
4173    tmp1 = (INT32) wsptr[5];
4174    tmp2 = (INT32) wsptr[3];
4175    tmp3 = (INT32) wsptr[1];
4176   
4177    z2 = tmp0 + tmp2;
4178    z3 = tmp1 + tmp3;
4179
4180    z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
4181    z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
4182    z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
4183    z2 += z1;
4184    z3 += z1;
4185
4186    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
4187    tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
4188    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
4189    tmp0 += z1 + z2;
4190    tmp3 += z1 + z3;
4191
4192    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
4193    tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
4194    tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
4195    tmp1 += z1 + z3;
4196    tmp2 += z1 + z2;
4197   
4198    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
4199   
4200    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp3,
4201                                              CONST_BITS+PASS1_BITS+3)
4202                            & RANGE_MASK];
4203    outptr[7] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp3,
4204                                              CONST_BITS+PASS1_BITS+3)
4205                            & RANGE_MASK];
4206    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp2,
4207                                              CONST_BITS+PASS1_BITS+3)
4208                            & RANGE_MASK];
4209    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp2,
4210                                              CONST_BITS+PASS1_BITS+3)
4211                            & RANGE_MASK];
4212    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp1,
4213                                              CONST_BITS+PASS1_BITS+3)
4214                            & RANGE_MASK];
4215    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp1,
4216                                              CONST_BITS+PASS1_BITS+3)
4217                            & RANGE_MASK];
4218    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp13 + tmp0,
4219                                              CONST_BITS+PASS1_BITS+3)
4220                            & RANGE_MASK];
4221    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp13 - tmp0,
4222                                              CONST_BITS+PASS1_BITS+3)
4223                            & RANGE_MASK];
4224   
4225    wsptr += DCTSIZE;           /* advance pointer to next row */
4226  }
4227}
4228
4229
4230/*
4231 * Perform dequantization and inverse DCT on one block of coefficients,
4232 * producing a 7x14 output block.
4233 *
4234 * 14-point IDCT in pass 1 (columns), 7-point in pass 2 (rows).
4235 */
4236
4237GLOBAL(void)
4238jpeg_idct_7x14 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4239                JCOEFPTR coef_block,
4240                JSAMPARRAY output_buf, JDIMENSION output_col)
4241{
4242  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
4243  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
4244  INT32 z1, z2, z3, z4;
4245  JCOEFPTR inptr;
4246  ISLOW_MULT_TYPE * quantptr;
4247  int * wsptr;
4248  JSAMPROW outptr;
4249  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4250  int ctr;
4251  int workspace[7*14];  /* buffers data between passes */
4252  SHIFT_TEMPS
4253
4254  /* Pass 1: process columns from input, store into work array.
4255   * 14-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/28).
4256   */
4257  inptr = coef_block;
4258  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4259  wsptr = workspace;
4260  for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
4261    /* Even part */
4262
4263    z1 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4264    z1 <<= CONST_BITS;
4265    /* Add fudge factor here for final descale. */
4266    z1 += ONE << (CONST_BITS-PASS1_BITS-1);
4267    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
4268    z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
4269    z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
4270    z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
4271
4272    tmp10 = z1 + z2;
4273    tmp11 = z1 + z3;
4274    tmp12 = z1 - z4;
4275
4276    tmp23 = RIGHT_SHIFT(z1 - ((z2 + z3 - z4) << 1), /* c0 = (c4+c12-c8)*2 */
4277                        CONST_BITS-PASS1_BITS);
4278
4279    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
4280    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
4281
4282    z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
4283
4284    tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
4285    tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
4286    tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
4287            MULTIPLY(z2, FIX(1.378756276));      /* c2 */
4288
4289    tmp20 = tmp10 + tmp13;
4290    tmp26 = tmp10 - tmp13;
4291    tmp21 = tmp11 + tmp14;
4292    tmp25 = tmp11 - tmp14;
4293    tmp22 = tmp12 + tmp15;
4294    tmp24 = tmp12 - tmp15;
4295
4296    /* Odd part */
4297
4298    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
4299    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
4300    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
4301    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
4302    tmp13 = z4 << CONST_BITS;
4303
4304    tmp14 = z1 + z3;
4305    tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
4306    tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
4307    tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
4308    tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
4309    tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
4310    z1    -= z2;
4311    tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13;        /* c11 */
4312    tmp16 += tmp15;
4313    z1    += z4;
4314    z4    = MULTIPLY(z2 + z3, - FIX(0.158341681)) - tmp13; /* -c13 */
4315    tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948));          /* c3-c9-c13 */
4316    tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773));          /* c3+c5-c13 */
4317    z4    = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
4318    tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
4319    tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567));          /* c1+c11-c5 */
4320
4321    tmp13 = (z1 - z3) << PASS1_BITS;
4322
4323    /* Final output stage */
4324
4325    wsptr[7*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
4326    wsptr[7*13] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
4327    wsptr[7*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
4328    wsptr[7*12] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
4329    wsptr[7*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
4330    wsptr[7*11] = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
4331    wsptr[7*3]  = (int) (tmp23 + tmp13);
4332    wsptr[7*10] = (int) (tmp23 - tmp13);
4333    wsptr[7*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
4334    wsptr[7*9]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
4335    wsptr[7*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
4336    wsptr[7*8]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
4337    wsptr[7*6]  = (int) RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS-PASS1_BITS);
4338    wsptr[7*7]  = (int) RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS-PASS1_BITS);
4339  }
4340
4341  /* Pass 2: process 14 rows from work array, store into output array.
4342   * 7-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/14).
4343   */
4344  wsptr = workspace;
4345  for (ctr = 0; ctr < 14; ctr++) {
4346    outptr = output_buf[ctr] + output_col;
4347
4348    /* Even part */
4349
4350    /* Add fudge factor here for final descale. */
4351    tmp23 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
4352    tmp23 <<= CONST_BITS;
4353
4354    z1 = (INT32) wsptr[2];
4355    z2 = (INT32) wsptr[4];
4356    z3 = (INT32) wsptr[6];
4357
4358    tmp20 = MULTIPLY(z2 - z3, FIX(0.881747734));       /* c4 */
4359    tmp22 = MULTIPLY(z1 - z2, FIX(0.314692123));       /* c6 */
4360    tmp21 = tmp20 + tmp22 + tmp23 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
4361    tmp10 = z1 + z3;
4362    z2 -= tmp10;
4363    tmp10 = MULTIPLY(tmp10, FIX(1.274162392)) + tmp23; /* c2 */
4364    tmp20 += tmp10 - MULTIPLY(z3, FIX(0.077722536));   /* c2-c4-c6 */
4365    tmp22 += tmp10 - MULTIPLY(z1, FIX(2.470602249));   /* c2+c4+c6 */
4366    tmp23 += MULTIPLY(z2, FIX(1.414213562));           /* c0 */
4367
4368    /* Odd part */
4369
4370    z1 = (INT32) wsptr[1];
4371    z2 = (INT32) wsptr[3];
4372    z3 = (INT32) wsptr[5];
4373
4374    tmp11 = MULTIPLY(z1 + z2, FIX(0.935414347));       /* (c3+c1-c5)/2 */
4375    tmp12 = MULTIPLY(z1 - z2, FIX(0.170262339));       /* (c3+c5-c1)/2 */
4376    tmp10 = tmp11 - tmp12;
4377    tmp11 += tmp12;
4378    tmp12 = MULTIPLY(z2 + z3, - FIX(1.378756276));     /* -c1 */
4379    tmp11 += tmp12;
4380    z2 = MULTIPLY(z1 + z3, FIX(0.613604268));          /* c5 */
4381    tmp10 += z2;
4382    tmp12 += z2 + MULTIPLY(z3, FIX(1.870828693));      /* c3+c1-c5 */
4383
4384    /* Final output stage */
4385
4386    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
4387                                              CONST_BITS+PASS1_BITS+3)
4388                            & RANGE_MASK];
4389    outptr[6] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
4390                                              CONST_BITS+PASS1_BITS+3)
4391                            & RANGE_MASK];
4392    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
4393                                              CONST_BITS+PASS1_BITS+3)
4394                            & RANGE_MASK];
4395    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
4396                                              CONST_BITS+PASS1_BITS+3)
4397                            & RANGE_MASK];
4398    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
4399                                              CONST_BITS+PASS1_BITS+3)
4400                            & RANGE_MASK];
4401    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
4402                                              CONST_BITS+PASS1_BITS+3)
4403                            & RANGE_MASK];
4404    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp23,
4405                                              CONST_BITS+PASS1_BITS+3)
4406                            & RANGE_MASK];
4407
4408    wsptr += 7;         /* advance pointer to next row */
4409  }
4410}
4411
4412
4413/*
4414 * Perform dequantization and inverse DCT on one block of coefficients,
4415 * producing a 6x12 output block.
4416 *
4417 * 12-point IDCT in pass 1 (columns), 6-point in pass 2 (rows).
4418 */
4419
4420GLOBAL(void)
4421jpeg_idct_6x12 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4422                JCOEFPTR coef_block,
4423                JSAMPARRAY output_buf, JDIMENSION output_col)
4424{
4425  INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
4426  INT32 tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
4427  INT32 z1, z2, z3, z4;
4428  JCOEFPTR inptr;
4429  ISLOW_MULT_TYPE * quantptr;
4430  int * wsptr;
4431  JSAMPROW outptr;
4432  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4433  int ctr;
4434  int workspace[6*12];  /* buffers data between passes */
4435  SHIFT_TEMPS
4436
4437  /* Pass 1: process columns from input, store into work array.
4438   * 12-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/24).
4439   */
4440  inptr = coef_block;
4441  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4442  wsptr = workspace;
4443  for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
4444    /* Even part */
4445
4446    z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4447    z3 <<= CONST_BITS;
4448    /* Add fudge factor here for final descale. */
4449    z3 += ONE << (CONST_BITS-PASS1_BITS-1);
4450
4451    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
4452    z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
4453
4454    tmp10 = z3 + z4;
4455    tmp11 = z3 - z4;
4456
4457    z1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
4458    z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
4459    z1 <<= CONST_BITS;
4460    z2 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
4461    z2 <<= CONST_BITS;
4462
4463    tmp12 = z1 - z2;
4464
4465    tmp21 = z3 + tmp12;
4466    tmp24 = z3 - tmp12;
4467
4468    tmp12 = z4 + z2;
4469
4470    tmp20 = tmp10 + tmp12;
4471    tmp25 = tmp10 - tmp12;
4472
4473    tmp12 = z4 - z1 - z2;
4474
4475    tmp22 = tmp11 + tmp12;
4476    tmp23 = tmp11 - tmp12;
4477
4478    /* Odd part */
4479
4480    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
4481    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
4482    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
4483    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
4484
4485    tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
4486    tmp14 = MULTIPLY(z2, - FIX_0_541196100);                 /* -c9 */
4487
4488    tmp10 = z1 + z3;
4489    tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
4490    tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
4491    tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
4492    tmp13 = MULTIPLY(z3 + z4, - FIX(1.045510580));           /* -(c7+c11) */
4493    tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
4494    tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
4495    tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
4496             MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
4497
4498    z1 -= z4;
4499    z2 -= z3;
4500    z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
4501    tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
4502    tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
4503
4504    /* Final output stage */
4505
4506    wsptr[6*0]  = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
4507    wsptr[6*11] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
4508    wsptr[6*1]  = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
4509    wsptr[6*10] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
4510    wsptr[6*2]  = (int) RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS-PASS1_BITS);
4511    wsptr[6*9]  = (int) RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS-PASS1_BITS);
4512    wsptr[6*3]  = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
4513    wsptr[6*8]  = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
4514    wsptr[6*4]  = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
4515    wsptr[6*7]  = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
4516    wsptr[6*5]  = (int) RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS-PASS1_BITS);
4517    wsptr[6*6]  = (int) RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS-PASS1_BITS);
4518  }
4519
4520  /* Pass 2: process 12 rows from work array, store into output array.
4521   * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
4522   */
4523  wsptr = workspace;
4524  for (ctr = 0; ctr < 12; ctr++) {
4525    outptr = output_buf[ctr] + output_col;
4526
4527    /* Even part */
4528
4529    /* Add fudge factor here for final descale. */
4530    tmp10 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
4531    tmp10 <<= CONST_BITS;
4532    tmp12 = (INT32) wsptr[4];
4533    tmp20 = MULTIPLY(tmp12, FIX(0.707106781));   /* c4 */
4534    tmp11 = tmp10 + tmp20;
4535    tmp21 = tmp10 - tmp20 - tmp20;
4536    tmp20 = (INT32) wsptr[2];
4537    tmp10 = MULTIPLY(tmp20, FIX(1.224744871));   /* c2 */
4538    tmp20 = tmp11 + tmp10;
4539    tmp22 = tmp11 - tmp10;
4540
4541    /* Odd part */
4542
4543    z1 = (INT32) wsptr[1];
4544    z2 = (INT32) wsptr[3];
4545    z3 = (INT32) wsptr[5];
4546    tmp11 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
4547    tmp10 = tmp11 + ((z1 + z2) << CONST_BITS);
4548    tmp12 = tmp11 + ((z3 - z2) << CONST_BITS);
4549    tmp11 = (z1 - z2 - z3) << CONST_BITS;
4550
4551    /* Final output stage */
4552
4553    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp20 + tmp10,
4554                                              CONST_BITS+PASS1_BITS+3)
4555                            & RANGE_MASK];
4556    outptr[5] = range_limit[(int) RIGHT_SHIFT(tmp20 - tmp10,
4557                                              CONST_BITS+PASS1_BITS+3)
4558                            & RANGE_MASK];
4559    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp21 + tmp11,
4560                                              CONST_BITS+PASS1_BITS+3)
4561                            & RANGE_MASK];
4562    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp21 - tmp11,
4563                                              CONST_BITS+PASS1_BITS+3)
4564                            & RANGE_MASK];
4565    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp22 + tmp12,
4566                                              CONST_BITS+PASS1_BITS+3)
4567                            & RANGE_MASK];
4568    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp22 - tmp12,
4569                                              CONST_BITS+PASS1_BITS+3)
4570                            & RANGE_MASK];
4571
4572    wsptr += 6;         /* advance pointer to next row */
4573  }
4574}
4575
4576
4577/*
4578 * Perform dequantization and inverse DCT on one block of coefficients,
4579 * producing a 5x10 output block.
4580 *
4581 * 10-point IDCT in pass 1 (columns), 5-point in pass 2 (rows).
4582 */
4583
4584GLOBAL(void)
4585jpeg_idct_5x10 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4586                JCOEFPTR coef_block,
4587                JSAMPARRAY output_buf, JDIMENSION output_col)
4588{
4589  INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
4590  INT32 tmp20, tmp21, tmp22, tmp23, tmp24;
4591  INT32 z1, z2, z3, z4, z5;
4592  JCOEFPTR inptr;
4593  ISLOW_MULT_TYPE * quantptr;
4594  int * wsptr;
4595  JSAMPROW outptr;
4596  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4597  int ctr;
4598  int workspace[5*10];  /* buffers data between passes */
4599  SHIFT_TEMPS
4600
4601  /* Pass 1: process columns from input, store into work array.
4602   * 10-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/20).
4603   */
4604  inptr = coef_block;
4605  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4606  wsptr = workspace;
4607  for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
4608    /* Even part */
4609
4610    z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4611    z3 <<= CONST_BITS;
4612    /* Add fudge factor here for final descale. */
4613    z3 += ONE << (CONST_BITS-PASS1_BITS-1);
4614    z4 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
4615    z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
4616    z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
4617    tmp10 = z3 + z1;
4618    tmp11 = z3 - z2;
4619
4620    tmp22 = RIGHT_SHIFT(z3 - ((z1 - z2) << 1),   /* c0 = (c4-c8)*2 */
4621                        CONST_BITS-PASS1_BITS);
4622
4623    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
4624    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
4625
4626    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
4627    tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
4628    tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
4629
4630    tmp20 = tmp10 + tmp12;
4631    tmp24 = tmp10 - tmp12;
4632    tmp21 = tmp11 + tmp13;
4633    tmp23 = tmp11 - tmp13;
4634
4635    /* Odd part */
4636
4637    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
4638    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
4639    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
4640    z4 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
4641
4642    tmp11 = z2 + z4;
4643    tmp13 = z2 - z4;
4644
4645    tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
4646    z5 = z3 << CONST_BITS;
4647
4648    z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
4649    z4 = z5 + tmp12;
4650
4651    tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
4652    tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
4653
4654    z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
4655    z4 = z5 - tmp12 - (tmp13 << (CONST_BITS - 1));
4656
4657    tmp12 = (z1 - tmp13 - z3) << PASS1_BITS;
4658
4659    tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
4660    tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
4661
4662    /* Final output stage */
4663
4664    wsptr[5*0] = (int) RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS-PASS1_BITS);
4665    wsptr[5*9] = (int) RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS-PASS1_BITS);
4666    wsptr[5*1] = (int) RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS-PASS1_BITS);
4667    wsptr[5*8] = (int) RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS-PASS1_BITS);
4668    wsptr[5*2] = (int) (tmp22 + tmp12);
4669    wsptr[5*7] = (int) (tmp22 - tmp12);
4670    wsptr[5*3] = (int) RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS-PASS1_BITS);
4671    wsptr[5*6] = (int) RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS-PASS1_BITS);
4672    wsptr[5*4] = (int) RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS-PASS1_BITS);
4673    wsptr[5*5] = (int) RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS-PASS1_BITS);
4674  }
4675
4676  /* Pass 2: process 10 rows from work array, store into output array.
4677   * 5-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/10).
4678   */
4679  wsptr = workspace;
4680  for (ctr = 0; ctr < 10; ctr++) {
4681    outptr = output_buf[ctr] + output_col;
4682
4683    /* Even part */
4684
4685    /* Add fudge factor here for final descale. */
4686    tmp12 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
4687    tmp12 <<= CONST_BITS;
4688    tmp13 = (INT32) wsptr[2];
4689    tmp14 = (INT32) wsptr[4];
4690    z1 = MULTIPLY(tmp13 + tmp14, FIX(0.790569415)); /* (c2+c4)/2 */
4691    z2 = MULTIPLY(tmp13 - tmp14, FIX(0.353553391)); /* (c2-c4)/2 */
4692    z3 = tmp12 + z2;
4693    tmp10 = z3 + z1;
4694    tmp11 = z3 - z1;
4695    tmp12 -= z2 << 2;
4696
4697    /* Odd part */
4698
4699    z2 = (INT32) wsptr[1];
4700    z3 = (INT32) wsptr[3];
4701
4702    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));       /* c3 */
4703    tmp13 = z1 + MULTIPLY(z2, FIX(0.513743148));    /* c1-c3 */
4704    tmp14 = z1 - MULTIPLY(z3, FIX(2.176250899));    /* c1+c3 */
4705
4706    /* Final output stage */
4707
4708    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp13,
4709                                              CONST_BITS+PASS1_BITS+3)
4710                            & RANGE_MASK];
4711    outptr[4] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp13,
4712                                              CONST_BITS+PASS1_BITS+3)
4713                            & RANGE_MASK];
4714    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp11 + tmp14,
4715                                              CONST_BITS+PASS1_BITS+3)
4716                            & RANGE_MASK];
4717    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp11 - tmp14,
4718                                              CONST_BITS+PASS1_BITS+3)
4719                            & RANGE_MASK];
4720    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12,
4721                                              CONST_BITS+PASS1_BITS+3)
4722                            & RANGE_MASK];
4723
4724    wsptr += 5;         /* advance pointer to next row */
4725  }
4726}
4727
4728
4729/*
4730 * Perform dequantization and inverse DCT on one block of coefficients,
4731 * producing a 4x8 output block.
4732 *
4733 * 8-point IDCT in pass 1 (columns), 4-point in pass 2 (rows).
4734 */
4735
4736GLOBAL(void)
4737jpeg_idct_4x8 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4738               JCOEFPTR coef_block,
4739               JSAMPARRAY output_buf, JDIMENSION output_col)
4740{
4741  INT32 tmp0, tmp1, tmp2, tmp3;
4742  INT32 tmp10, tmp11, tmp12, tmp13;
4743  INT32 z1, z2, z3;
4744  JCOEFPTR inptr;
4745  ISLOW_MULT_TYPE * quantptr;
4746  int * wsptr;
4747  JSAMPROW outptr;
4748  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4749  int ctr;
4750  int workspace[4*8];   /* buffers data between passes */
4751  SHIFT_TEMPS
4752
4753  /* Pass 1: process columns from input, store into work array. */
4754  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
4755  /* furthermore, we scale the results by 2**PASS1_BITS. */
4756
4757  inptr = coef_block;
4758  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4759  wsptr = workspace;
4760  for (ctr = 4; ctr > 0; ctr--) {
4761    /* Due to quantization, we will usually find that many of the input
4762     * coefficients are zero, especially the AC terms.  We can exploit this
4763     * by short-circuiting the IDCT calculation for any column in which all
4764     * the AC terms are zero.  In that case each output is equal to the
4765     * DC coefficient (with scale factor as needed).
4766     * With typical images and quantization tables, half or more of the
4767     * column DCT calculations can be simplified this way.
4768     */
4769
4770    if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 &&
4771        inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 &&
4772        inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 &&
4773        inptr[DCTSIZE*7] == 0) {
4774      /* AC terms all zero */
4775      int dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) << PASS1_BITS;
4776
4777      wsptr[4*0] = dcval;
4778      wsptr[4*1] = dcval;
4779      wsptr[4*2] = dcval;
4780      wsptr[4*3] = dcval;
4781      wsptr[4*4] = dcval;
4782      wsptr[4*5] = dcval;
4783      wsptr[4*6] = dcval;
4784      wsptr[4*7] = dcval;
4785
4786      inptr++;                  /* advance pointers to next column */
4787      quantptr++;
4788      wsptr++;
4789      continue;
4790    }
4791
4792    /* Even part: reverse the even part of the forward DCT. */
4793    /* The rotator is sqrt(2)*c(-6). */
4794
4795    z2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
4796    z3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]);
4797   
4798    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
4799    tmp2 = z1 + MULTIPLY(z2, FIX_0_765366865);
4800    tmp3 = z1 - MULTIPLY(z3, FIX_1_847759065);
4801   
4802    z2 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4803    z3 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
4804    z2 <<= CONST_BITS;
4805    z3 <<= CONST_BITS;
4806    /* Add fudge factor here for final descale. */
4807    z2 += ONE << (CONST_BITS-PASS1_BITS-1);
4808
4809    tmp0 = z2 + z3;
4810    tmp1 = z2 - z3;
4811   
4812    tmp10 = tmp0 + tmp2;
4813    tmp13 = tmp0 - tmp2;
4814    tmp11 = tmp1 + tmp3;
4815    tmp12 = tmp1 - tmp3;
4816
4817    /* Odd part per figure 8; the matrix is unitary and hence its
4818     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
4819     */
4820
4821    tmp0 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]);
4822    tmp1 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
4823    tmp2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
4824    tmp3 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
4825
4826    z2 = tmp0 + tmp2;
4827    z3 = tmp1 + tmp3;
4828
4829    z1 = MULTIPLY(z2 + z3, FIX_1_175875602); /* sqrt(2) * c3 */
4830    z2 = MULTIPLY(z2, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
4831    z3 = MULTIPLY(z3, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
4832    z2 += z1;
4833    z3 += z1;
4834
4835    z1 = MULTIPLY(tmp0 + tmp3, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
4836    tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
4837    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
4838    tmp0 += z1 + z2;
4839    tmp3 += z1 + z3;
4840
4841    z1 = MULTIPLY(tmp1 + tmp2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
4842    tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
4843    tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
4844    tmp1 += z1 + z3;
4845    tmp2 += z1 + z2;
4846
4847    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
4848
4849    wsptr[4*0] = (int) RIGHT_SHIFT(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
4850    wsptr[4*7] = (int) RIGHT_SHIFT(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
4851    wsptr[4*1] = (int) RIGHT_SHIFT(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
4852    wsptr[4*6] = (int) RIGHT_SHIFT(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
4853    wsptr[4*2] = (int) RIGHT_SHIFT(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
4854    wsptr[4*5] = (int) RIGHT_SHIFT(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
4855    wsptr[4*3] = (int) RIGHT_SHIFT(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
4856    wsptr[4*4] = (int) RIGHT_SHIFT(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
4857
4858    inptr++;                    /* advance pointers to next column */
4859    quantptr++;
4860    wsptr++;
4861  }
4862
4863  /* Pass 2: process 8 rows from work array, store into output array.
4864   * 4-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
4865   */
4866  wsptr = workspace;
4867  for (ctr = 0; ctr < 8; ctr++) {
4868    outptr = output_buf[ctr] + output_col;
4869
4870    /* Even part */
4871
4872    /* Add fudge factor here for final descale. */
4873    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
4874    tmp2 = (INT32) wsptr[2];
4875
4876    tmp10 = (tmp0 + tmp2) << CONST_BITS;
4877    tmp12 = (tmp0 - tmp2) << CONST_BITS;
4878
4879    /* Odd part */
4880    /* Same rotation as in the even part of the 8x8 LL&M IDCT */
4881
4882    z2 = (INT32) wsptr[1];
4883    z3 = (INT32) wsptr[3];
4884
4885    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
4886    tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
4887    tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
4888
4889    /* Final output stage */
4890
4891    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
4892                                              CONST_BITS+PASS1_BITS+3)
4893                            & RANGE_MASK];
4894    outptr[3] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
4895                                              CONST_BITS+PASS1_BITS+3)
4896                            & RANGE_MASK];
4897    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp12 + tmp2,
4898                                              CONST_BITS+PASS1_BITS+3)
4899                            & RANGE_MASK];
4900    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp12 - tmp2,
4901                                              CONST_BITS+PASS1_BITS+3)
4902                            & RANGE_MASK];
4903   
4904    wsptr += 4;         /* advance pointer to next row */
4905  }
4906}
4907
4908
4909/*
4910 * Perform dequantization and inverse DCT on one block of coefficients,
4911 * producing a reduced-size 3x6 output block.
4912 *
4913 * 6-point IDCT in pass 1 (columns), 3-point in pass 2 (rows).
4914 */
4915
4916GLOBAL(void)
4917jpeg_idct_3x6 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
4918               JCOEFPTR coef_block,
4919               JSAMPARRAY output_buf, JDIMENSION output_col)
4920{
4921  INT32 tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
4922  INT32 z1, z2, z3;
4923  JCOEFPTR inptr;
4924  ISLOW_MULT_TYPE * quantptr;
4925  int * wsptr;
4926  JSAMPROW outptr;
4927  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
4928  int ctr;
4929  int workspace[3*6];   /* buffers data between passes */
4930  SHIFT_TEMPS
4931
4932  /* Pass 1: process columns from input, store into work array.
4933   * 6-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/12).
4934   */
4935  inptr = coef_block;
4936  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
4937  wsptr = workspace;
4938  for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
4939    /* Even part */
4940
4941    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
4942    tmp0 <<= CONST_BITS;
4943    /* Add fudge factor here for final descale. */
4944    tmp0 += ONE << (CONST_BITS-PASS1_BITS-1);
4945    tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]);
4946    tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
4947    tmp1 = tmp0 + tmp10;
4948    tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS-PASS1_BITS);
4949    tmp10 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
4950    tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
4951    tmp10 = tmp1 + tmp0;
4952    tmp12 = tmp1 - tmp0;
4953
4954    /* Odd part */
4955
4956    z1 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
4957    z2 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
4958    z3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]);
4959    tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
4960    tmp0 = tmp1 + ((z1 + z2) << CONST_BITS);
4961    tmp2 = tmp1 + ((z3 - z2) << CONST_BITS);
4962    tmp1 = (z1 - z2 - z3) << PASS1_BITS;
4963
4964    /* Final output stage */
4965
4966    wsptr[3*0] = (int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS-PASS1_BITS);
4967    wsptr[3*5] = (int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS-PASS1_BITS);
4968    wsptr[3*1] = (int) (tmp11 + tmp1);
4969    wsptr[3*4] = (int) (tmp11 - tmp1);
4970    wsptr[3*2] = (int) RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS-PASS1_BITS);
4971    wsptr[3*3] = (int) RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS-PASS1_BITS);
4972  }
4973
4974  /* Pass 2: process 6 rows from work array, store into output array.
4975   * 3-point IDCT kernel, cK represents sqrt(2) * cos(K*pi/6).
4976   */
4977  wsptr = workspace;
4978  for (ctr = 0; ctr < 6; ctr++) {
4979    outptr = output_buf[ctr] + output_col;
4980
4981    /* Even part */
4982
4983    /* Add fudge factor here for final descale. */
4984    tmp0 = (INT32) wsptr[0] + (ONE << (PASS1_BITS+2));
4985    tmp0 <<= CONST_BITS;
4986    tmp2 = (INT32) wsptr[2];
4987    tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
4988    tmp10 = tmp0 + tmp12;
4989    tmp2 = tmp0 - tmp12 - tmp12;
4990
4991    /* Odd part */
4992
4993    tmp12 = (INT32) wsptr[1];
4994    tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
4995
4996    /* Final output stage */
4997
4998    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0,
4999                                              CONST_BITS+PASS1_BITS+3)
5000                            & RANGE_MASK];
5001    outptr[2] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0,
5002                                              CONST_BITS+PASS1_BITS+3)
5003                            & RANGE_MASK];
5004    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp2,
5005                                              CONST_BITS+PASS1_BITS+3)
5006                            & RANGE_MASK];
5007
5008    wsptr += 3;         /* advance pointer to next row */
5009  }
5010}
5011
5012
5013/*
5014 * Perform dequantization and inverse DCT on one block of coefficients,
5015 * producing a 2x4 output block.
5016 *
5017 * 4-point IDCT in pass 1 (columns), 2-point in pass 2 (rows).
5018 */
5019
5020GLOBAL(void)
5021jpeg_idct_2x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
5022               JCOEFPTR coef_block,
5023               JSAMPARRAY output_buf, JDIMENSION output_col)
5024{
5025  INT32 tmp0, tmp2, tmp10, tmp12;
5026  INT32 z1, z2, z3;
5027  JCOEFPTR inptr;
5028  ISLOW_MULT_TYPE * quantptr;
5029  INT32 * wsptr;
5030  JSAMPROW outptr;
5031  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
5032  int ctr;
5033  INT32 workspace[2*4]; /* buffers data between passes */
5034  SHIFT_TEMPS
5035
5036  /* Pass 1: process columns from input, store into work array.
5037   * 4-point IDCT kernel,
5038   * cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point IDCT].
5039   */
5040  inptr = coef_block;
5041  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
5042  wsptr = workspace;
5043  for (ctr = 0; ctr < 2; ctr++, inptr++, quantptr++, wsptr++) {
5044    /* Even part */
5045
5046    tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]);
5047    tmp2 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]);
5048
5049    tmp10 = (tmp0 + tmp2) << CONST_BITS;
5050    tmp12 = (tmp0 - tmp2) << CONST_BITS;
5051
5052    /* Odd part */
5053    /* Same rotation as in the even part of the 8x8 LL&M IDCT */
5054
5055    z2 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]);
5056    z3 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]);
5057
5058    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);   /* c6 */
5059    tmp0 = z1 + MULTIPLY(z2, FIX_0_765366865); /* c2-c6 */
5060    tmp2 = z1 - MULTIPLY(z3, FIX_1_847759065); /* c2+c6 */
5061
5062    /* Final output stage */
5063
5064    wsptr[2*0] = tmp10 + tmp0;
5065    wsptr[2*3] = tmp10 - tmp0;
5066    wsptr[2*1] = tmp12 + tmp2;
5067    wsptr[2*2] = tmp12 - tmp2;
5068  }
5069
5070  /* Pass 2: process 4 rows from work array, store into output array. */
5071
5072  wsptr = workspace;
5073  for (ctr = 0; ctr < 4; ctr++) {
5074    outptr = output_buf[ctr] + output_col;
5075
5076    /* Even part */
5077
5078    /* Add fudge factor here for final descale. */
5079    tmp10 = wsptr[0] + (ONE << (CONST_BITS+2));
5080
5081    /* Odd part */
5082
5083    tmp0 = wsptr[1];
5084
5085    /* Final output stage */
5086
5087    outptr[0] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS+3)
5088                            & RANGE_MASK];
5089    outptr[1] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS+3)
5090                            & RANGE_MASK];
5091
5092    wsptr += 2;         /* advance pointer to next row */
5093  }
5094}
5095
5096
5097/*
5098 * Perform dequantization and inverse DCT on one block of coefficients,
5099 * producing a 1x2 output block.
5100 *
5101 * 2-point IDCT in pass 1 (columns), 1-point in pass 2 (rows).
5102 */
5103
5104GLOBAL(void)
5105jpeg_idct_1x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
5106               JCOEFPTR coef_block,
5107               JSAMPARRAY output_buf, JDIMENSION output_col)
5108{
5109  INT32 tmp0, tmp10;
5110  ISLOW_MULT_TYPE * quantptr;
5111  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
5112  SHIFT_TEMPS
5113
5114  /* Process 1 column from input, store into output array. */
5115
5116  quantptr = (ISLOW_MULT_TYPE *) compptr->dct_table;
5117
5118  /* Even part */
5119   
5120  tmp10 = DEQUANTIZE(coef_block[DCTSIZE*0], quantptr[DCTSIZE*0]);
5121  /* Add fudge factor here for final descale. */
5122  tmp10 += ONE << 2;
5123
5124  /* Odd part */
5125
5126  tmp0 = DEQUANTIZE(coef_block[DCTSIZE*1], quantptr[DCTSIZE*1]);
5127
5128  /* Final output stage */
5129
5130  output_buf[0][output_col] = range_limit[(int) RIGHT_SHIFT(tmp10 + tmp0, 3)
5131                                          & RANGE_MASK];
5132  output_buf[1][output_col] = range_limit[(int) RIGHT_SHIFT(tmp10 - tmp0, 3)
5133                                          & RANGE_MASK];
5134}
5135
5136#endif /* IDCT_SCALING_SUPPORTED */
5137#endif /* DCT_ISLOW_SUPPORTED */
Note: See TracBrowser for help on using the repository browser.