Context Navigation

source: rtems-graphics-toolkit/jpeg-8d/jfdctint.c @ 8d99938

Last change on this file since 8d99938 was 86b99f7, checked in by Alexandru-Sever Horin <alex.sever.h@…>, on 08/01/12 at 22:40:32
Added jpeg-8d version. Made modifications to compile for RTEMS, without man or binaries
Property mode set to `100644`
File size: 155.0 KB

Line
1	/*
2	* jfdctint.c
3	*
4	* Copyright (C) 1991-1996, Thomas G. Lane.
5	* Modification developed 2003-2009 by Guido Vollbeding.
6	* This file is part of the Independent JPEG Group's software.
7	* For conditions of distribution and use, see the accompanying README file.
8	*
9	* This file contains a slow-but-accurate integer implementation of the
10	* forward DCT (Discrete Cosine Transform).
11	*
12	* A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
13	* on each column. Direct algorithms are also available, but they are
14	* much more complex and seem not to be any faster when reduced to code.
15	*
16	* This implementation is based on an algorithm described in
17	* C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
18	* Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
19	* Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
20	* The primary algorithm described there uses 11 multiplies and 29 adds.
21	* We use their alternate method with 12 multiplies and 32 adds.
22	* The advantage of this method is that no data path contains more than one
23	* multiplication; this allows a very simple and accurate implementation in
24	* scaled fixed-point arithmetic, with a minimal number of shifts.
25	*
26	* We also provide FDCT routines with various input sample block sizes for
27	* direct resolution reduction or enlargement and for direct resolving the
28	* common 2x1 and 1x2 subsampling cases without additional resampling: NxN
29	* (N=1...16), 2NxN, and Nx2N (N=1...8) pixels for one 8x8 output DCT block.
30	*
31	* For N<8 we fill the remaining block coefficients with zero.
32	* For N>8 we apply a partial N-point FDCT on the input samples, computing
33	* just the lower 8 frequency coefficients and discarding the rest.
34	*
35	* We must scale the output coefficients of the N-point FDCT appropriately
36	* to the standard 8-point FDCT level by 8/N per 1-D pass. This scaling
37	* is folded into the constant multipliers (pass 2) and/or final/initial
38	* shifting.
39	*
40	* CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
41	* since there would be too many additional constants to pre-calculate.
42	*/
43
44	#define JPEG_INTERNALS
45	#include "jinclude.h"
46	#include "jpeglib.h"
47	#include "jdct.h" /* Private declarations for DCT subsystem */
48
49	#ifdef DCT_ISLOW_SUPPORTED
50
51
52	/*
53	* This module is specialized to the case DCTSIZE = 8.
54	*/
55
56	#if DCTSIZE != 8
57	Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
58	#endif
59
60
61	/*
62	* The poop on this scaling stuff is as follows:
63	*
64	* Each 1-D DCT step produces outputs which are a factor of sqrt(N)
65	* larger than the true DCT outputs. The final outputs are therefore
66	* a factor of N larger than desired; since N=8 this can be cured by
67	* a simple right shift at the end of the algorithm. The advantage of
68	* this arrangement is that we save two multiplications per 1-D DCT,
69	* because the y0 and y4 outputs need not be divided by sqrt(N).
70	* In the IJG code, this factor of 8 is removed by the quantization step
71	* (in jcdctmgr.c), NOT in this module.
72	*
73	* We have to do addition and subtraction of the integer inputs, which
74	* is no problem, and multiplication by fractional constants, which is
75	* a problem to do in integer arithmetic. We multiply all the constants
76	* by CONST_SCALE and convert them to integer constants (thus retaining
77	* CONST_BITS bits of precision in the constants). After doing a
78	* multiplication we have to divide the product by CONST_SCALE, with proper
79	* rounding, to produce the correct output. This division can be done
80	* cheaply as a right shift of CONST_BITS bits. We postpone shifting
81	* as long as possible so that partial sums can be added together with
82	* full fractional precision.
83	*
84	* The outputs of the first pass are scaled up by PASS1_BITS bits so that
85	* they are represented to better-than-integral precision. These outputs
86	* require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
87	* with the recommended scaling. (For 12-bit sample data, the intermediate
88	* array is INT32 anyway.)
89	*
90	* To avoid overflow of the 32-bit intermediate results in pass 2, we must
91	* have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis
92	* shows that the values given below are the most effective.
93	*/
94
/*
 * Fixed-point precision settings.
 * CONST_BITS is the number of fractional bits kept in the scaled integer
 * constants; PASS1_BITS is the extra scaling applied to pass 1 outputs.
 * The value of PASS1_BITS depends on the sample precision.
 */
#if BITS_IN_JSAMPLE == 8
#define CONST_BITS  13
#define PASS1_BITS  2
#else
#define CONST_BITS  13
#define PASS1_BITS  1		/* lose a little precision to avoid overflow */
#endif
102
/* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
 * causing a lot of useless floating-point operations at run time.
 * To get around this we use the following pre-calculated constants.
 * If you change CONST_BITS you may want to add appropriate values.
 * (With a reasonable C compiler, you can just rely on the FIX() macro...)
 *
 * Each FIX_a_bbbbbbbbb name encodes the fractional constant a.bbbbbbbbb;
 * the pre-calculated values below apply only when CONST_BITS is 13.
 */

#if CONST_BITS == 13
#define FIX_0_298631336  ((INT32)  2446)	/* FIX(0.298631336) */
#define FIX_0_390180644  ((INT32)  3196)	/* FIX(0.390180644) */
#define FIX_0_541196100  ((INT32)  4433)	/* FIX(0.541196100) */
#define FIX_0_765366865  ((INT32)  6270)	/* FIX(0.765366865) */
#define FIX_0_899976223  ((INT32)  7373)	/* FIX(0.899976223) */
#define FIX_1_175875602  ((INT32)  9633)	/* FIX(1.175875602) */
#define FIX_1_501321110  ((INT32)  12299)	/* FIX(1.501321110) */
#define FIX_1_847759065  ((INT32)  15137)	/* FIX(1.847759065) */
#define FIX_1_961570560  ((INT32)  16069)	/* FIX(1.961570560) */
#define FIX_2_053119869  ((INT32)  16819)	/* FIX(2.053119869) */
#define FIX_2_562915447  ((INT32)  20995)	/* FIX(2.562915447) */
#define FIX_3_072711026  ((INT32)  25172)	/* FIX(3.072711026) */
#else
#define FIX_0_298631336  FIX(0.298631336)
#define FIX_0_390180644  FIX(0.390180644)
#define FIX_0_541196100  FIX(0.541196100)
#define FIX_0_765366865  FIX(0.765366865)
#define FIX_0_899976223  FIX(0.899976223)
#define FIX_1_175875602  FIX(1.175875602)
#define FIX_1_501321110  FIX(1.501321110)
#define FIX_1_847759065  FIX(1.847759065)
#define FIX_1_961570560  FIX(1.961570560)
#define FIX_2_053119869  FIX(2.053119869)
#define FIX_2_562915447  FIX(2.562915447)
#define FIX_3_072711026  FIX(3.072711026)
#endif
137
138
/* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
 * For 8-bit samples with the recommended scaling, all the variable
 * and constant values involved are no more than 16 bits wide, so a
 * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
 * For 12-bit samples, a full 32-bit multiplication will be needed.
 *
 * Both arguments are fully parenthesized in the generic form, so
 * expressions such as MULTIPLY(a + b, - FIX_x) expand safely.
 */

#if BITS_IN_JSAMPLE == 8
#define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
#else
#define MULTIPLY(var,const)  ((var) * (const))
#endif
151
152
153	/*
154	* Perform the forward DCT on one block of samples.
155	*/
156
157	GLOBAL(void)
158	jpeg_fdct_islow (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
159	{
160	INT32 tmp0, tmp1, tmp2, tmp3;
161	INT32 tmp10, tmp11, tmp12, tmp13;
162	INT32 z1;
163	DCTELEM *dataptr;
164	JSAMPROW elemptr;
165	int ctr;
166	SHIFT_TEMPS
167
168	/* Pass 1: process rows. */
169	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
170	/* furthermore, we scale the results by 2*PASS1_BITS. /
171
172	dataptr = data;
173	for (ctr = 0; ctr < DCTSIZE; ctr++) {
174	elemptr = sample_data[ctr] + start_col;
175
176	/* Even part per LL&M figure 1 --- note that published figure is faulty;
177	* rotator "sqrt(2)c1" should be "sqrt(2)c6".
178	*/
179
180	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
181	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
182	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
183	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
184
185	tmp10 = tmp0 + tmp3;
186	tmp12 = tmp0 - tmp3;
187	tmp11 = tmp1 + tmp2;
188	tmp13 = tmp1 - tmp2;
189
190	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
191	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
192	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
193	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
194
195	/* Apply unsigned->signed conversion */
196	dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
197	dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
198
199	z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
200	/* Add fudge factor here for final descale. */
201	z1 += ONE << (CONST_BITS-PASS1_BITS-1);
202	dataptr[2] = (DCTELEM) RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865),
203	CONST_BITS-PASS1_BITS);
204	dataptr[6] = (DCTELEM) RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065),
205	CONST_BITS-PASS1_BITS);
206
207	/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
208	* cK represents sqrt(2) * cos(K*pi/16).
209	* i0..i3 in the paper are tmp0..tmp3 here.
210	*/
211
212	tmp10 = tmp0 + tmp3;
213	tmp11 = tmp1 + tmp2;
214	tmp12 = tmp0 + tmp2;
215	tmp13 = tmp1 + tmp3;
216	z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
217	/* Add fudge factor here for final descale. */
218	z1 += ONE << (CONST_BITS-PASS1_BITS-1);
219
220	tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
221	tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
222	tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
223	tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
224	tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
225	tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
226	tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
227	tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
228
229	tmp12 += z1;
230	tmp13 += z1;
231
232	dataptr[1] = (DCTELEM)
233	RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS);
234	dataptr[3] = (DCTELEM)
235	RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS);
236	dataptr[5] = (DCTELEM)
237	RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS);
238	dataptr[7] = (DCTELEM)
239	RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS);
240
241	dataptr += DCTSIZE; /* advance pointer to next row */
242	}
243
244	/* Pass 2: process columns.
245	* We remove the PASS1_BITS scaling, but leave the results scaled up
246	* by an overall factor of 8.
247	*/
248
249	dataptr = data;
250	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
251	/* Even part per LL&M figure 1 --- note that published figure is faulty;
252	* rotator "sqrt(2)c1" should be "sqrt(2)c6".
253	*/
254
255	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE7];
256	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE6];
257	tmp2 = dataptr[DCTSIZE2] + dataptr[DCTSIZE5];
258	tmp3 = dataptr[DCTSIZE3] + dataptr[DCTSIZE4];
259
260	/* Add fudge factor here for final descale. */
261	tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
262	tmp12 = tmp0 - tmp3;
263	tmp11 = tmp1 + tmp2;
264	tmp13 = tmp1 - tmp2;
265
266	tmp0 = dataptr[DCTSIZE0] - dataptr[DCTSIZE7];
267	tmp1 = dataptr[DCTSIZE1] - dataptr[DCTSIZE6];
268	tmp2 = dataptr[DCTSIZE2] - dataptr[DCTSIZE5];
269	tmp3 = dataptr[DCTSIZE3] - dataptr[DCTSIZE4];
270
271	dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
272	dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
273
274	z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
275	/* Add fudge factor here for final descale. */
276	z1 += ONE << (CONST_BITS+PASS1_BITS-1);
277	dataptr[DCTSIZE*2] = (DCTELEM)
278	RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS+PASS1_BITS);
279	dataptr[DCTSIZE*6] = (DCTELEM)
280	RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), CONST_BITS+PASS1_BITS);
281
282	/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
283	* cK represents sqrt(2) * cos(K*pi/16).
284	* i0..i3 in the paper are tmp0..tmp3 here.
285	*/
286
287	tmp10 = tmp0 + tmp3;
288	tmp11 = tmp1 + tmp2;
289	tmp12 = tmp0 + tmp2;
290	tmp13 = tmp1 + tmp3;
291	z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
292	/* Add fudge factor here for final descale. */
293	z1 += ONE << (CONST_BITS+PASS1_BITS-1);
294
295	tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
296	tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
297	tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
298	tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
299	tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
300	tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
301	tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
302	tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
303
304	tmp12 += z1;
305	tmp13 += z1;
306
307	dataptr[DCTSIZE*1] = (DCTELEM)
308	RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS+PASS1_BITS);
309	dataptr[DCTSIZE*3] = (DCTELEM)
310	RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS+PASS1_BITS);
311	dataptr[DCTSIZE*5] = (DCTELEM)
312	RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS+PASS1_BITS);
313	dataptr[DCTSIZE*7] = (DCTELEM)
314	RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS+PASS1_BITS);
315
316	dataptr++; /* advance pointer to next column */
317	}
318	}
319
320	#ifdef DCT_SCALING_SUPPORTED
321
322
323	/*
324	* Perform the forward DCT on a 7x7 sample block.
325	*/
326
327	GLOBAL(void)
328	jpeg_fdct_7x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
329	{
330	INT32 tmp0, tmp1, tmp2, tmp3;
331	INT32 tmp10, tmp11, tmp12;
332	INT32 z1, z2, z3;
333	DCTELEM *dataptr;
334	JSAMPROW elemptr;
335	int ctr;
336	SHIFT_TEMPS
337
338	/* Pre-zero output coefficient block. */
339	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
340
341	/* Pass 1: process rows. */
342	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
343	/* furthermore, we scale the results by 2*PASS1_BITS. /
344	/* cK represents sqrt(2) * cos(Kpi/14). /
345
346	dataptr = data;
347	for (ctr = 0; ctr < 7; ctr++) {
348	elemptr = sample_data[ctr] + start_col;
349
350	/* Even part */
351
352	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
353	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
354	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
355	tmp3 = GETJSAMPLE(elemptr[3]);
356
357	tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
358	tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
359	tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
360
361	z1 = tmp0 + tmp2;
362	/* Apply unsigned->signed conversion */
363	dataptr[0] = (DCTELEM)
364	((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
365	tmp3 += tmp3;
366	z1 -= tmp3;
367	z1 -= tmp3;
368	z1 = MULTIPLY(z1, FIX(0.353553391)); /* (c2+c6-c4)/2 */
369	z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002)); /* (c2+c4-c6)/2 */
370	z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123)); /* c6 */
371	dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
372	z1 -= z2;
373	z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734)); /* c4 */
374	dataptr[4] = (DCTELEM)
375	DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
376	CONST_BITS-PASS1_BITS);
377	dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
378
379	/* Odd part */
380
381	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347)); /* (c3+c1-c5)/2 */
382	tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339)); /* (c3+c5-c1)/2 */
383	tmp0 = tmp1 - tmp2;
384	tmp1 += tmp2;
385	tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
386	tmp1 += tmp2;
387	tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268)); /* c5 */
388	tmp0 += tmp3;
389	tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693)); /* c3+c1-c5 */
390
391	dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
392	dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
393	dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
394
395	dataptr += DCTSIZE; /* advance pointer to next row */
396	}
397
398	/* Pass 2: process columns.
399	* We remove the PASS1_BITS scaling, but leave the results scaled up
400	* by an overall factor of 8.
401	* We must also scale the output by (8/7)**2 = 64/49, which we fold
402	* into the constant multipliers:
403	* cK now represents sqrt(2) * cos(Kpi/14) 64/49.
404	*/
405
406	dataptr = data;
407	for (ctr = 0; ctr < 7; ctr++) {
408	/* Even part */
409
410	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE6];
411	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE5];
412	tmp2 = dataptr[DCTSIZE2] + dataptr[DCTSIZE4];
413	tmp3 = dataptr[DCTSIZE*3];
414
415	tmp10 = dataptr[DCTSIZE0] - dataptr[DCTSIZE6];
416	tmp11 = dataptr[DCTSIZE1] - dataptr[DCTSIZE5];
417	tmp12 = dataptr[DCTSIZE2] - dataptr[DCTSIZE4];
418
419	z1 = tmp0 + tmp2;
420	dataptr[DCTSIZE*0] = (DCTELEM)
421	DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
422	CONST_BITS+PASS1_BITS);
423	tmp3 += tmp3;
424	z1 -= tmp3;
425	z1 -= tmp3;
426	z1 = MULTIPLY(z1, FIX(0.461784020)); /* (c2+c6-c4)/2 */
427	z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084)); /* (c2+c4-c6)/2 */
428	z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446)); /* c6 */
429	dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS);
430	z1 -= z2;
431	z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509)); /* c4 */
432	dataptr[DCTSIZE*4] = (DCTELEM)
433	DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
434	CONST_BITS+PASS1_BITS);
435	dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS);
436
437	/* Odd part */
438
439	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677)); /* (c3+c1-c5)/2 */
440	tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464)); /* (c3+c5-c1)/2 */
441	tmp0 = tmp1 - tmp2;
442	tmp1 += tmp2;
443	tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
444	tmp1 += tmp2;
445	tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310)); /* c5 */
446	tmp0 += tmp3;
447	tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355)); /* c3+c1-c5 */
448
449	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS);
450	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS);
451	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS);
452
453	dataptr++; /* advance pointer to next column */
454	}
455	}
456
457
458	/*
459	* Perform the forward DCT on a 6x6 sample block.
460	*/
461
462	GLOBAL(void)
463	jpeg_fdct_6x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
464	{
465	INT32 tmp0, tmp1, tmp2;
466	INT32 tmp10, tmp11, tmp12;
467	DCTELEM *dataptr;
468	JSAMPROW elemptr;
469	int ctr;
470	SHIFT_TEMPS
471
472	/* Pre-zero output coefficient block. */
473	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
474
475	/* Pass 1: process rows. */
476	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
477	/* furthermore, we scale the results by 2*PASS1_BITS. /
478	/* cK represents sqrt(2) * cos(Kpi/12). /
479
480	dataptr = data;
481	for (ctr = 0; ctr < 6; ctr++) {
482	elemptr = sample_data[ctr] + start_col;
483
484	/* Even part */
485
486	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
487	tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
488	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
489
490	tmp10 = tmp0 + tmp2;
491	tmp12 = tmp0 - tmp2;
492
493	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
494	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
495	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
496
497	/* Apply unsigned->signed conversion */
498	dataptr[0] = (DCTELEM)
499	((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
500	dataptr[2] = (DCTELEM)
501	DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */
502	CONST_BITS-PASS1_BITS);
503	dataptr[4] = (DCTELEM)
504	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
505	CONST_BITS-PASS1_BITS);
506
507	/* Odd part */
508
509	tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */
510	CONST_BITS-PASS1_BITS);
511
512	dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
513	dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
514	dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
515
516	dataptr += DCTSIZE; /* advance pointer to next row */
517	}
518
519	/* Pass 2: process columns.
520	* We remove the PASS1_BITS scaling, but leave the results scaled up
521	* by an overall factor of 8.
522	* We must also scale the output by (8/6)**2 = 16/9, which we fold
523	* into the constant multipliers:
524	* cK now represents sqrt(2) * cos(Kpi/12) 16/9.
525	*/
526
527	dataptr = data;
528	for (ctr = 0; ctr < 6; ctr++) {
529	/* Even part */
530
531	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE5];
532	tmp11 = dataptr[DCTSIZE1] + dataptr[DCTSIZE4];
533	tmp2 = dataptr[DCTSIZE2] + dataptr[DCTSIZE3];
534
535	tmp10 = tmp0 + tmp2;
536	tmp12 = tmp0 - tmp2;
537
538	tmp0 = dataptr[DCTSIZE0] - dataptr[DCTSIZE5];
539	tmp1 = dataptr[DCTSIZE1] - dataptr[DCTSIZE4];
540	tmp2 = dataptr[DCTSIZE2] - dataptr[DCTSIZE3];
541
542	dataptr[DCTSIZE*0] = (DCTELEM)
543	DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */
544	CONST_BITS+PASS1_BITS);
545	dataptr[DCTSIZE*2] = (DCTELEM)
546	DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */
547	CONST_BITS+PASS1_BITS);
548	dataptr[DCTSIZE*4] = (DCTELEM)
549	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
550	CONST_BITS+PASS1_BITS);
551
552	/* Odd part */
553
554	tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */
555
556	dataptr[DCTSIZE*1] = (DCTELEM)
557	DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
558	CONST_BITS+PASS1_BITS);
559	dataptr[DCTSIZE*3] = (DCTELEM)
560	DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */
561	CONST_BITS+PASS1_BITS);
562	dataptr[DCTSIZE*5] = (DCTELEM)
563	DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */
564	CONST_BITS+PASS1_BITS);
565
566	dataptr++; /* advance pointer to next column */
567	}
568	}
569
570
571	/*
572	* Perform the forward DCT on a 5x5 sample block.
573	*/
574
575	GLOBAL(void)
576	jpeg_fdct_5x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
577	{
578	INT32 tmp0, tmp1, tmp2;
579	INT32 tmp10, tmp11;
580	DCTELEM *dataptr;
581	JSAMPROW elemptr;
582	int ctr;
583	SHIFT_TEMPS
584
585	/* Pre-zero output coefficient block. */
586	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
587
588	/* Pass 1: process rows. */
589	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
590	/* furthermore, we scale the results by 2*PASS1_BITS. /
591	/* We scale the results further by 2 as part of output adaption */
592	/* scaling for different DCT size. */
593	/* cK represents sqrt(2) * cos(Kpi/10). /
594
595	dataptr = data;
596	for (ctr = 0; ctr < 5; ctr++) {
597	elemptr = sample_data[ctr] + start_col;
598
599	/* Even part */
600
601	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
602	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
603	tmp2 = GETJSAMPLE(elemptr[2]);
604
605	tmp10 = tmp0 + tmp1;
606	tmp11 = tmp0 - tmp1;
607
608	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
609	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
610
611	/* Apply unsigned->signed conversion */
612	dataptr[0] = (DCTELEM)
613	((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << (PASS1_BITS+1));
614	tmp11 = MULTIPLY(tmp11, FIX(0.790569415)); /* (c2+c4)/2 */
615	tmp10 -= tmp2 << 2;
616	tmp10 = MULTIPLY(tmp10, FIX(0.353553391)); /* (c2-c4)/2 */
617	dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS-1);
618	dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS-1);
619
620	/* Odd part */
621
622	tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876)); /* c3 */
623
624	dataptr[1] = (DCTELEM)
625	DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
626	CONST_BITS-PASS1_BITS-1);
627	dataptr[3] = (DCTELEM)
628	DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
629	CONST_BITS-PASS1_BITS-1);
630
631	dataptr += DCTSIZE; /* advance pointer to next row */
632	}
633
634	/* Pass 2: process columns.
635	* We remove the PASS1_BITS scaling, but leave the results scaled up
636	* by an overall factor of 8.
637	* We must also scale the output by (8/5)**2 = 64/25, which we partially
638	* fold into the constant multipliers (other part was done in pass 1):
639	* cK now represents sqrt(2) * cos(Kpi/10) 32/25.
640	*/
641
642	dataptr = data;
643	for (ctr = 0; ctr < 5; ctr++) {
644	/* Even part */
645
646	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE4];
647	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE3];
648	tmp2 = dataptr[DCTSIZE*2];
649
650	tmp10 = tmp0 + tmp1;
651	tmp11 = tmp0 - tmp1;
652
653	tmp0 = dataptr[DCTSIZE0] - dataptr[DCTSIZE4];
654	tmp1 = dataptr[DCTSIZE1] - dataptr[DCTSIZE3];
655
656	dataptr[DCTSIZE*0] = (DCTELEM)
657	DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)), /* 32/25 */
658	CONST_BITS+PASS1_BITS);
659	tmp11 = MULTIPLY(tmp11, FIX(1.011928851)); /* (c2+c4)/2 */
660	tmp10 -= tmp2 << 2;
661	tmp10 = MULTIPLY(tmp10, FIX(0.452548340)); /* (c2-c4)/2 */
662	dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS);
663	dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS);
664
665	/* Odd part */
666
667	tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961)); /* c3 */
668
669	dataptr[DCTSIZE*1] = (DCTELEM)
670	DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
671	CONST_BITS+PASS1_BITS);
672	dataptr[DCTSIZE*3] = (DCTELEM)
673	DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
674	CONST_BITS+PASS1_BITS);
675
676	dataptr++; /* advance pointer to next column */
677	}
678	}
679
680
681	/*
682	* Perform the forward DCT on a 4x4 sample block.
683	*/
684
685	GLOBAL(void)
686	jpeg_fdct_4x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
687	{
688	INT32 tmp0, tmp1;
689	INT32 tmp10, tmp11;
690	DCTELEM *dataptr;
691	JSAMPROW elemptr;
692	int ctr;
693	SHIFT_TEMPS
694
695	/* Pre-zero output coefficient block. */
696	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
697
698	/* Pass 1: process rows. */
699	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
700	/* furthermore, we scale the results by 2*PASS1_BITS. /
701	/* We must also scale the output by (8/4)2 = 22, which we add here. */
702	/* cK represents sqrt(2) * cos(Kpi/16) [refers to 8-point FDCT]. /
703
704	dataptr = data;
705	for (ctr = 0; ctr < 4; ctr++) {
706	elemptr = sample_data[ctr] + start_col;
707
708	/* Even part */
709
710	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
711	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
712
713	tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
714	tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
715
716	/* Apply unsigned->signed conversion */
717	dataptr[0] = (DCTELEM)
718	((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+2));
719	dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+2));
720
721	/* Odd part */
722
723	tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
724	/* Add fudge factor here for final descale. */
725	tmp0 += ONE << (CONST_BITS-PASS1_BITS-3);
726
727	dataptr[1] = (DCTELEM)
728	RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
729	CONST_BITS-PASS1_BITS-2);
730	dataptr[3] = (DCTELEM)
731	RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
732	CONST_BITS-PASS1_BITS-2);
733
734	dataptr += DCTSIZE; /* advance pointer to next row */
735	}
736
737	/* Pass 2: process columns.
738	* We remove the PASS1_BITS scaling, but leave the results scaled up
739	* by an overall factor of 8.
740	*/
741
742	dataptr = data;
743	for (ctr = 0; ctr < 4; ctr++) {
744	/* Even part */
745
746	/* Add fudge factor here for final descale. */
747	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE3] + (ONE << (PASS1_BITS-1));
748	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE2];
749
750	tmp10 = dataptr[DCTSIZE0] - dataptr[DCTSIZE3];
751	tmp11 = dataptr[DCTSIZE1] - dataptr[DCTSIZE2];
752
753	dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
754	dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
755
756	/* Odd part */
757
758	tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
759	/* Add fudge factor here for final descale. */
760	tmp0 += ONE << (CONST_BITS+PASS1_BITS-1);
761
762	dataptr[DCTSIZE*1] = (DCTELEM)
763	RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
764	CONST_BITS+PASS1_BITS);
765	dataptr[DCTSIZE*3] = (DCTELEM)
766	RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
767	CONST_BITS+PASS1_BITS);
768
769	dataptr++; /* advance pointer to next column */
770	}
771	}
772
773
774	/*
775	* Perform the forward DCT on a 3x3 sample block.
776	*/
777
778	GLOBAL(void)
779	jpeg_fdct_3x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
780	{
781	INT32 tmp0, tmp1, tmp2;
782	DCTELEM *dataptr;
783	JSAMPROW elemptr;
784	int ctr;
785	SHIFT_TEMPS
786
787	/* Pre-zero output coefficient block. */
788	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
789
790	/* Pass 1: process rows. */
791	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
792	/* furthermore, we scale the results by 2*PASS1_BITS. /
793	/* We scale the results further by 2*2 as part of output adaption /
794	/* scaling for different DCT size. */
795	/* cK represents sqrt(2) * cos(Kpi/6). /
796
797	dataptr = data;
798	for (ctr = 0; ctr < 3; ctr++) {
799	elemptr = sample_data[ctr] + start_col;
800
801	/* Even part */
802
803	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
804	tmp1 = GETJSAMPLE(elemptr[1]);
805
806	tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
807
808	/* Apply unsigned->signed conversion */
809	dataptr[0] = (DCTELEM)
810	((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+2));
811	dataptr[2] = (DCTELEM)
812	DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
813	CONST_BITS-PASS1_BITS-2);
814
815	/* Odd part */
816
817	dataptr[1] = (DCTELEM)
818	DESCALE(MULTIPLY(tmp2, FIX(1.224744871)), /* c1 */
819	CONST_BITS-PASS1_BITS-2);
820
821	dataptr += DCTSIZE; /* advance pointer to next row */
822	}
823
824	/* Pass 2: process columns.
825	* We remove the PASS1_BITS scaling, but leave the results scaled up
826	* by an overall factor of 8.
827	* We must also scale the output by (8/3)**2 = 64/9, which we partially
828	* fold into the constant multipliers (other part was done in pass 1):
829	* cK now represents sqrt(2) * cos(Kpi/6) 16/9.
830	*/
831
832	dataptr = data;
833	for (ctr = 0; ctr < 3; ctr++) {
834	/* Even part */
835
836	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE2];
837	tmp1 = dataptr[DCTSIZE*1];
838
839	tmp2 = dataptr[DCTSIZE0] - dataptr[DCTSIZE2];
840
841	dataptr[DCTSIZE*0] = (DCTELEM)
842	DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
843	CONST_BITS+PASS1_BITS);
844	dataptr[DCTSIZE*2] = (DCTELEM)
845	DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */
846	CONST_BITS+PASS1_BITS);
847
848	/* Odd part */
849
850	dataptr[DCTSIZE*1] = (DCTELEM)
851	DESCALE(MULTIPLY(tmp2, FIX(2.177324216)), /* c1 */
852	CONST_BITS+PASS1_BITS);
853
854	dataptr++; /* advance pointer to next column */
855	}
856	}
857
858
859	/*
860	* Perform the forward DCT on a 2x2 sample block.
861	*/
862
863	GLOBAL(void)
864	jpeg_fdct_2x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
865	{
866	INT32 tmp0, tmp1, tmp2, tmp3;
867	JSAMPROW elemptr;
868
869	/* Pre-zero output coefficient block. */
870	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
871
872	/* Pass 1: process rows. */
873	/* Note results are scaled up by sqrt(8) compared to a true DCT. */
874
875	/* Row 0 */
876	elemptr = sample_data[0] + start_col;
877
878	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
879	tmp1 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
880
881	/* Row 1 */
882	elemptr = sample_data[1] + start_col;
883
884	tmp2 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[1]);
885	tmp3 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[1]);
886
887	/* Pass 2: process columns.
888	* We leave the results scaled up by an overall factor of 8.
889	* We must also scale the output by (8/2)2 = 24.
890	*/
891
892	/* Column 0 */
893	/* Apply unsigned->signed conversion */
894	data[DCTSIZE0] = (DCTELEM) ((tmp0 + tmp2 - 4 CENTERJSAMPLE) << 4);
895	data[DCTSIZE*1] = (DCTELEM) ((tmp0 - tmp2) << 4);
896
897	/* Column 1 */
898	data[DCTSIZE*0+1] = (DCTELEM) ((tmp1 + tmp3) << 4);
899	data[DCTSIZE*1+1] = (DCTELEM) ((tmp1 - tmp3) << 4);
900	}
901
902
903	/*
904	* Perform the forward DCT on a 1x1 sample block.
905	*/
906
907	GLOBAL(void)
908	jpeg_fdct_1x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
909	{
910	/* Pre-zero output coefficient block. */
911	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
912
913	/* We leave the result scaled up by an overall factor of 8. */
914	/* We must also scale the output by (8/1)2 = 26. */
915	/* Apply unsigned->signed conversion */
916	data[0] = (DCTELEM)
917	((GETJSAMPLE(sample_data[0][start_col]) - CENTERJSAMPLE) << 6);
918	}
919
920
921	/*
922	* Perform the forward DCT on a 9x9 sample block.
923	*/
924
925	GLOBAL(void)
926	jpeg_fdct_9x9 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
927	{
928	INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
929	INT32 tmp10, tmp11, tmp12, tmp13;
930	INT32 z1, z2;
931	DCTELEM workspace[8];
932	DCTELEM *dataptr;
933	DCTELEM *wsptr;
934	JSAMPROW elemptr;
935	int ctr;
936	SHIFT_TEMPS
937
938	/* Pass 1: process rows. */
939	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
940	/* we scale the results further by 2 as part of output adaption */
941	/* scaling for different DCT size. */
942	/* cK represents sqrt(2) * cos(Kpi/18). /
943
944	dataptr = data;
945	ctr = 0;
946	for (;;) {
947	elemptr = sample_data[ctr] + start_col;
948
949	/* Even part */
950
951	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[8]);
952	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[7]);
953	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[6]);
954	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[5]);
955	tmp4 = GETJSAMPLE(elemptr[4]);
956
957	tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[8]);
958	tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[7]);
959	tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[6]);
960	tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[5]);
961
962	z1 = tmp0 + tmp2 + tmp3;
963	z2 = tmp1 + tmp4;
964	/* Apply unsigned->signed conversion */
965	dataptr[0] = (DCTELEM) ((z1 + z2 - 9 * CENTERJSAMPLE) << 1);
966	dataptr[6] = (DCTELEM)
967	DESCALE(MULTIPLY(z1 - z2 - z2, FIX(0.707106781)), /* c6 */
968	CONST_BITS-1);
969	z1 = MULTIPLY(tmp0 - tmp2, FIX(1.328926049)); /* c2 */
970	z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(0.707106781)); /* c6 */
971	dataptr[2] = (DCTELEM)
972	DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.083350441)) /* c4 */
973	+ z1 + z2, CONST_BITS-1);
974	dataptr[4] = (DCTELEM)
975	DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.245575608)) /* c8 */
976	+ z1 - z2, CONST_BITS-1);
977
978	/* Odd part */
979
980	dataptr[3] = (DCTELEM)
981	DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.224744871)), /* c3 */
982	CONST_BITS-1);
983
984	tmp11 = MULTIPLY(tmp11, FIX(1.224744871)); /* c3 */
985	tmp0 = MULTIPLY(tmp10 + tmp12, FIX(0.909038955)); /* c5 */
986	tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.483689525)); /* c7 */
987
988	dataptr[1] = (DCTELEM) DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS-1);
989
990	tmp2 = MULTIPLY(tmp12 - tmp13, FIX(1.392728481)); /* c1 */
991
992	dataptr[5] = (DCTELEM) DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS-1);
993	dataptr[7] = (DCTELEM) DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS-1);
994
995	ctr++;
996
997	if (ctr != DCTSIZE) {
998	if (ctr == 9)
999	break; /* Done. */
1000	dataptr += DCTSIZE; /* advance pointer to next row */
1001	} else
1002	dataptr = workspace; /* switch pointer to extended workspace */
1003	}
1004
1005	/* Pass 2: process columns.
1006	* We leave the results scaled up by an overall factor of 8.
1007	* We must also scale the output by (8/9)**2 = 64/81, which we partially
1008	* fold into the constant multipliers and final/initial shifting:
1009	* cK now represents sqrt(2) * cos(Kpi/18) 128/81.
1010	*/
1011
1012	dataptr = data;
1013	wsptr = workspace;
1014	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1015	/* Even part */
1016
1017	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE0];
1018	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE7];
1019	tmp2 = dataptr[DCTSIZE2] + dataptr[DCTSIZE6];
1020	tmp3 = dataptr[DCTSIZE3] + dataptr[DCTSIZE5];
1021	tmp4 = dataptr[DCTSIZE*4];
1022
1023	tmp10 = dataptr[DCTSIZE0] - wsptr[DCTSIZE0];
1024	tmp11 = dataptr[DCTSIZE1] - dataptr[DCTSIZE7];
1025	tmp12 = dataptr[DCTSIZE2] - dataptr[DCTSIZE6];
1026	tmp13 = dataptr[DCTSIZE3] - dataptr[DCTSIZE5];
1027
1028	z1 = tmp0 + tmp2 + tmp3;
1029	z2 = tmp1 + tmp4;
1030	dataptr[DCTSIZE*0] = (DCTELEM)
1031	DESCALE(MULTIPLY(z1 + z2, FIX(1.580246914)), /* 128/81 */
1032	CONST_BITS+2);
1033	dataptr[DCTSIZE*6] = (DCTELEM)
1034	DESCALE(MULTIPLY(z1 - z2 - z2, FIX(1.117403309)), /* c6 */
1035	CONST_BITS+2);
1036	z1 = MULTIPLY(tmp0 - tmp2, FIX(2.100031287)); /* c2 */
1037	z2 = MULTIPLY(tmp1 - tmp4 - tmp4, FIX(1.117403309)); /* c6 */
1038	dataptr[DCTSIZE*2] = (DCTELEM)
1039	DESCALE(MULTIPLY(tmp2 - tmp3, FIX(1.711961190)) /* c4 */
1040	+ z1 + z2, CONST_BITS+2);
1041	dataptr[DCTSIZE*4] = (DCTELEM)
1042	DESCALE(MULTIPLY(tmp3 - tmp0, FIX(0.388070096)) /* c8 */
1043	+ z1 - z2, CONST_BITS+2);
1044
1045	/* Odd part */
1046
1047	dataptr[DCTSIZE*3] = (DCTELEM)
1048	DESCALE(MULTIPLY(tmp10 - tmp12 - tmp13, FIX(1.935399303)), /* c3 */
1049	CONST_BITS+2);
1050
1051	tmp11 = MULTIPLY(tmp11, FIX(1.935399303)); /* c3 */
1052	tmp0 = MULTIPLY(tmp10 + tmp12, FIX(1.436506004)); /* c5 */
1053	tmp1 = MULTIPLY(tmp10 + tmp13, FIX(0.764348879)); /* c7 */
1054
1055	dataptr[DCTSIZE*1] = (DCTELEM)
1056	DESCALE(tmp11 + tmp0 + tmp1, CONST_BITS+2);
1057
1058	tmp2 = MULTIPLY(tmp12 - tmp13, FIX(2.200854883)); /* c1 */
1059
1060	dataptr[DCTSIZE*5] = (DCTELEM)
1061	DESCALE(tmp0 - tmp11 - tmp2, CONST_BITS+2);
1062	dataptr[DCTSIZE*7] = (DCTELEM)
1063	DESCALE(tmp1 - tmp11 + tmp2, CONST_BITS+2);
1064
1065	dataptr++; /* advance pointer to next column */
1066	wsptr++; /* advance pointer to next column */
1067	}
1068	}
1069
1070
1071	/*
1072	* Perform the forward DCT on a 10x10 sample block.
1073	*/
1074
1075	GLOBAL(void)
1076	jpeg_fdct_10x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1077	{
1078	INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
1079	INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1080	DCTELEM workspace[8*2];
1081	DCTELEM *dataptr;
1082	DCTELEM *wsptr;
1083	JSAMPROW elemptr;
1084	int ctr;
1085	SHIFT_TEMPS
1086
1087	/* Pass 1: process rows. */
1088	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
1089	/* we scale the results further by 2 as part of output adaption */
1090	/* scaling for different DCT size. */
1091	/* cK represents sqrt(2) * cos(Kpi/20). /
1092
1093	dataptr = data;
1094	ctr = 0;
1095	for (;;) {
1096	elemptr = sample_data[ctr] + start_col;
1097
1098	/* Even part */
1099
1100	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
1101	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
1102	tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
1103	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
1104	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
1105
1106	tmp10 = tmp0 + tmp4;
1107	tmp13 = tmp0 - tmp4;
1108	tmp11 = tmp1 + tmp3;
1109	tmp14 = tmp1 - tmp3;
1110
1111	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
1112	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
1113	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
1114	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
1115	tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
1116
1117	/* Apply unsigned->signed conversion */
1118	dataptr[0] = (DCTELEM)
1119	((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << 1);
1120	tmp12 += tmp12;
1121	dataptr[4] = (DCTELEM)
1122	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
1123	MULTIPLY(tmp11 - tmp12, FIX(0.437016024)), /* c8 */
1124	CONST_BITS-1);
1125	tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876)); /* c6 */
1126	dataptr[2] = (DCTELEM)
1127	DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)), /* c2-c6 */
1128	CONST_BITS-1);
1129	dataptr[6] = (DCTELEM)
1130	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)), /* c2+c6 */
1131	CONST_BITS-1);
1132
1133	/* Odd part */
1134
1135	tmp10 = tmp0 + tmp4;
1136	tmp11 = tmp1 - tmp3;
1137	dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << 1);
1138	tmp2 <<= CONST_BITS;
1139	dataptr[1] = (DCTELEM)
1140	DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) + /* c1 */
1141	MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 + /* c3 */
1142	MULTIPLY(tmp3, FIX(0.642039522)) + /* c7 */
1143	MULTIPLY(tmp4, FIX(0.221231742)), /* c9 */
1144	CONST_BITS-1);
1145	tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) - /* (c3+c7)/2 */
1146	MULTIPLY(tmp1 + tmp3, FIX(0.587785252)); /* (c1-c9)/2 */
1147	tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) + /* (c3-c7)/2 */
1148	(tmp11 << (CONST_BITS - 1)) - tmp2;
1149	dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-1);
1150	dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-1);
1151
1152	ctr++;
1153
1154	if (ctr != DCTSIZE) {
1155	if (ctr == 10)
1156	break; /* Done. */
1157	dataptr += DCTSIZE; /* advance pointer to next row */
1158	} else
1159	dataptr = workspace; /* switch pointer to extended workspace */
1160	}
1161
1162	/* Pass 2: process columns.
1163	* We leave the results scaled up by an overall factor of 8.
1164	* We must also scale the output by (8/10)**2 = 16/25, which we partially
1165	* fold into the constant multipliers and final/initial shifting:
1166	* cK now represents sqrt(2) * cos(Kpi/20) 32/25.
1167	*/
1168
1169	dataptr = data;
1170	wsptr = workspace;
1171	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1172	/* Even part */
1173
1174	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE1];
1175	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE0];
1176	tmp12 = dataptr[DCTSIZE2] + dataptr[DCTSIZE7];
1177	tmp3 = dataptr[DCTSIZE3] + dataptr[DCTSIZE6];
1178	tmp4 = dataptr[DCTSIZE4] + dataptr[DCTSIZE5];
1179
1180	tmp10 = tmp0 + tmp4;
1181	tmp13 = tmp0 - tmp4;
1182	tmp11 = tmp1 + tmp3;
1183	tmp14 = tmp1 - tmp3;
1184
1185	tmp0 = dataptr[DCTSIZE0] - wsptr[DCTSIZE1];
1186	tmp1 = dataptr[DCTSIZE1] - wsptr[DCTSIZE0];
1187	tmp2 = dataptr[DCTSIZE2] - dataptr[DCTSIZE7];
1188	tmp3 = dataptr[DCTSIZE3] - dataptr[DCTSIZE6];
1189	tmp4 = dataptr[DCTSIZE4] - dataptr[DCTSIZE5];
1190
1191	dataptr[DCTSIZE*0] = (DCTELEM)
1192	DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
1193	CONST_BITS+2);
1194	tmp12 += tmp12;
1195	dataptr[DCTSIZE*4] = (DCTELEM)
1196	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
1197	MULTIPLY(tmp11 - tmp12, FIX(0.559380511)), /* c8 */
1198	CONST_BITS+2);
1199	tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961)); /* c6 */
1200	dataptr[DCTSIZE*2] = (DCTELEM)
1201	DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)), /* c2-c6 */
1202	CONST_BITS+2);
1203	dataptr[DCTSIZE*6] = (DCTELEM)
1204	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)), /* c2+c6 */
1205	CONST_BITS+2);
1206
1207	/* Odd part */
1208
1209	tmp10 = tmp0 + tmp4;
1210	tmp11 = tmp1 - tmp3;
1211	dataptr[DCTSIZE*5] = (DCTELEM)
1212	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)), /* 32/25 */
1213	CONST_BITS+2);
1214	tmp2 = MULTIPLY(tmp2, FIX(1.28)); /* 32/25 */
1215	dataptr[DCTSIZE*1] = (DCTELEM)
1216	DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) + /* c1 */
1217	MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 + /* c3 */
1218	MULTIPLY(tmp3, FIX(0.821810588)) + /* c7 */
1219	MULTIPLY(tmp4, FIX(0.283176630)), /* c9 */
1220	CONST_BITS+2);
1221	tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) - /* (c3+c7)/2 */
1222	MULTIPLY(tmp1 + tmp3, FIX(0.752365123)); /* (c1-c9)/2 */
1223	tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) + /* (c3-c7)/2 */
1224	MULTIPLY(tmp11, FIX(0.64)) - tmp2; /* 16/25 */
1225	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+2);
1226	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+2);
1227
1228	dataptr++; /* advance pointer to next column */
1229	wsptr++; /* advance pointer to next column */
1230	}
1231	}
1232
1233
1234	/*
1235	* Perform the forward DCT on an 11x11 sample block.
1236	*/
1237
1238	GLOBAL(void)
1239	jpeg_fdct_11x11 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1240	{
1241	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1242	INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
1243	INT32 z1, z2, z3;
1244	DCTELEM workspace[8*3];
1245	DCTELEM *dataptr;
1246	DCTELEM *wsptr;
1247	JSAMPROW elemptr;
1248	int ctr;
1249	SHIFT_TEMPS
1250
1251	/* Pass 1: process rows. */
1252	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
1253	/* we scale the results further by 2 as part of output adaption */
1254	/* scaling for different DCT size. */
1255	/* cK represents sqrt(2) * cos(Kpi/22). /
1256
1257	dataptr = data;
1258	ctr = 0;
1259	for (;;) {
1260	elemptr = sample_data[ctr] + start_col;
1261
1262	/* Even part */
1263
1264	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[10]);
1265	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[9]);
1266	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[8]);
1267	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[7]);
1268	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[6]);
1269	tmp5 = GETJSAMPLE(elemptr[5]);
1270
1271	tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[10]);
1272	tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[9]);
1273	tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[8]);
1274	tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[7]);
1275	tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[6]);
1276
1277	/* Apply unsigned->signed conversion */
1278	dataptr[0] = (DCTELEM)
1279	((tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 - 11 * CENTERJSAMPLE) << 1);
1280	tmp5 += tmp5;
1281	tmp0 -= tmp5;
1282	tmp1 -= tmp5;
1283	tmp2 -= tmp5;
1284	tmp3 -= tmp5;
1285	tmp4 -= tmp5;
1286	z1 = MULTIPLY(tmp0 + tmp3, FIX(1.356927976)) + /* c2 */
1287	MULTIPLY(tmp2 + tmp4, FIX(0.201263574)); /* c10 */
1288	z2 = MULTIPLY(tmp1 - tmp3, FIX(0.926112931)); /* c6 */
1289	z3 = MULTIPLY(tmp0 - tmp1, FIX(1.189712156)); /* c4 */
1290	dataptr[2] = (DCTELEM)
1291	DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.018300590)) /* c2+c8-c6 */
1292	- MULTIPLY(tmp4, FIX(1.390975730)), /* c4+c10 */
1293	CONST_BITS-1);
1294	dataptr[4] = (DCTELEM)
1295	DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.062335650)) /* c4-c6-c10 */
1296	- MULTIPLY(tmp2, FIX(1.356927976)) /* c2 */
1297	+ MULTIPLY(tmp4, FIX(0.587485545)), /* c8 */
1298	CONST_BITS-1);
1299	dataptr[6] = (DCTELEM)
1300	DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.620527200)) /* c2+c4-c6 */
1301	- MULTIPLY(tmp2, FIX(0.788749120)), /* c8+c10 */
1302	CONST_BITS-1);
1303
1304	/* Odd part */
1305
1306	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.286413905)); /* c3 */
1307	tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.068791298)); /* c5 */
1308	tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.764581576)); /* c7 */
1309	tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.719967871)) /* c7+c5+c3-c1 */
1310	+ MULTIPLY(tmp14, FIX(0.398430003)); /* c9 */
1311	tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.764581576)); /* -c7 */
1312	tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.399818907)); /* -c1 */
1313	tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.276416582)) /* c9+c7+c1-c3 */
1314	- MULTIPLY(tmp14, FIX(1.068791298)); /* c5 */
1315	tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.398430003)); /* c9 */
1316	tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(1.989053629)) /* c9+c5+c3-c7 */
1317	+ MULTIPLY(tmp14, FIX(1.399818907)); /* c1 */
1318	tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.305598626)) /* c1+c5-c9-c7 */
1319	- MULTIPLY(tmp14, FIX(1.286413905)); /* c3 */
1320
1321	dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-1);
1322	dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-1);
1323	dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-1);
1324	dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS-1);
1325
1326	ctr++;
1327
1328	if (ctr != DCTSIZE) {
1329	if (ctr == 11)
1330	break; /* Done. */
1331	dataptr += DCTSIZE; /* advance pointer to next row */
1332	} else
1333	dataptr = workspace; /* switch pointer to extended workspace */
1334	}
1335
1336	/* Pass 2: process columns.
1337	* We leave the results scaled up by an overall factor of 8.
1338	* We must also scale the output by (8/11)**2 = 64/121, which we partially
1339	* fold into the constant multipliers and final/initial shifting:
1340	* cK now represents sqrt(2) * cos(Kpi/22) 128/121.
1341	*/
1342
1343	dataptr = data;
1344	wsptr = workspace;
1345	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1346	/* Even part */
1347
1348	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE2];
1349	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE1];
1350	tmp2 = dataptr[DCTSIZE2] + wsptr[DCTSIZE0];
1351	tmp3 = dataptr[DCTSIZE3] + dataptr[DCTSIZE7];
1352	tmp4 = dataptr[DCTSIZE4] + dataptr[DCTSIZE6];
1353	tmp5 = dataptr[DCTSIZE*5];
1354
1355	tmp10 = dataptr[DCTSIZE0] - wsptr[DCTSIZE2];
1356	tmp11 = dataptr[DCTSIZE1] - wsptr[DCTSIZE1];
1357	tmp12 = dataptr[DCTSIZE2] - wsptr[DCTSIZE0];
1358	tmp13 = dataptr[DCTSIZE3] - dataptr[DCTSIZE7];
1359	tmp14 = dataptr[DCTSIZE4] - dataptr[DCTSIZE6];
1360
1361	dataptr[DCTSIZE*0] = (DCTELEM)
1362	DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5,
1363	FIX(1.057851240)), /* 128/121 */
1364	CONST_BITS+2);
1365	tmp5 += tmp5;
1366	tmp0 -= tmp5;
1367	tmp1 -= tmp5;
1368	tmp2 -= tmp5;
1369	tmp3 -= tmp5;
1370	tmp4 -= tmp5;
1371	z1 = MULTIPLY(tmp0 + tmp3, FIX(1.435427942)) + /* c2 */
1372	MULTIPLY(tmp2 + tmp4, FIX(0.212906922)); /* c10 */
1373	z2 = MULTIPLY(tmp1 - tmp3, FIX(0.979689713)); /* c6 */
1374	z3 = MULTIPLY(tmp0 - tmp1, FIX(1.258538479)); /* c4 */
1375	dataptr[DCTSIZE*2] = (DCTELEM)
1376	DESCALE(z1 + z2 - MULTIPLY(tmp3, FIX(1.077210542)) /* c2+c8-c6 */
1377	- MULTIPLY(tmp4, FIX(1.471445400)), /* c4+c10 */
1378	CONST_BITS+2);
1379	dataptr[DCTSIZE*4] = (DCTELEM)
1380	DESCALE(z2 + z3 + MULTIPLY(tmp1, FIX(0.065941844)) /* c4-c6-c10 */
1381	- MULTIPLY(tmp2, FIX(1.435427942)) /* c2 */
1382	+ MULTIPLY(tmp4, FIX(0.621472312)), /* c8 */
1383	CONST_BITS+2);
1384	dataptr[DCTSIZE*6] = (DCTELEM)
1385	DESCALE(z1 + z3 - MULTIPLY(tmp0, FIX(1.714276708)) /* c2+c4-c6 */
1386	- MULTIPLY(tmp2, FIX(0.834379234)), /* c8+c10 */
1387	CONST_BITS+2);
1388
1389	/* Odd part */
1390
1391	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.360834544)); /* c3 */
1392	tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.130622199)); /* c5 */
1393	tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.808813568)); /* c7 */
1394	tmp0 = tmp1 + tmp2 + tmp3 - MULTIPLY(tmp10, FIX(1.819470145)) /* c7+c5+c3-c1 */
1395	+ MULTIPLY(tmp14, FIX(0.421479672)); /* c9 */
1396	tmp4 = MULTIPLY(tmp11 + tmp12, - FIX(0.808813568)); /* -c7 */
1397	tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.480800167)); /* -c1 */
1398	tmp1 += tmp4 + tmp5 + MULTIPLY(tmp11, FIX(1.350258864)) /* c9+c7+c1-c3 */
1399	- MULTIPLY(tmp14, FIX(1.130622199)); /* c5 */
1400	tmp10 = MULTIPLY(tmp12 + tmp13, FIX(0.421479672)); /* c9 */
1401	tmp2 += tmp4 + tmp10 - MULTIPLY(tmp12, FIX(2.104122847)) /* c9+c5+c3-c7 */
1402	+ MULTIPLY(tmp14, FIX(1.480800167)); /* c1 */
1403	tmp3 += tmp5 + tmp10 + MULTIPLY(tmp13, FIX(1.381129125)) /* c1+c5-c9-c7 */
1404	- MULTIPLY(tmp14, FIX(1.360834544)); /* c3 */
1405
1406	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2);
1407	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2);
1408	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2);
1409	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2);
1410
1411	dataptr++; /* advance pointer to next column */
1412	wsptr++; /* advance pointer to next column */
1413	}
1414	}
1415
1416
1417	/*
1418	* Perform the forward DCT on a 12x12 sample block.
1419	*/
1420
1421	GLOBAL(void)
1422	jpeg_fdct_12x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1423	{
1424	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1425	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1426	DCTELEM workspace[8*4];
1427	DCTELEM *dataptr;
1428	DCTELEM *wsptr;
1429	JSAMPROW elemptr;
1430	int ctr;
1431	SHIFT_TEMPS
1432
1433	/* Pass 1: process rows. */
1434	/* Note results are scaled up by sqrt(8) compared to a true DCT. */
1435	/* cK represents sqrt(2) * cos(Kpi/24). /
1436
1437	dataptr = data;
1438	ctr = 0;
1439	for (;;) {
1440	elemptr = sample_data[ctr] + start_col;
1441
1442	/* Even part */
1443
1444	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
1445	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
1446	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
1447	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
1448	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
1449	tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
1450
1451	tmp10 = tmp0 + tmp5;
1452	tmp13 = tmp0 - tmp5;
1453	tmp11 = tmp1 + tmp4;
1454	tmp14 = tmp1 - tmp4;
1455	tmp12 = tmp2 + tmp3;
1456	tmp15 = tmp2 - tmp3;
1457
1458	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
1459	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
1460	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
1461	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
1462	tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
1463	tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
1464
1465	/* Apply unsigned->signed conversion */
1466	dataptr[0] = (DCTELEM) (tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE);
1467	dataptr[6] = (DCTELEM) (tmp13 - tmp14 - tmp15);
1468	dataptr[4] = (DCTELEM)
1469	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
1470	CONST_BITS);
1471	dataptr[2] = (DCTELEM)
1472	DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
1473	CONST_BITS);
1474
1475	/* Odd part */
1476
1477	tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100); /* c9 */
1478	tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865); /* c3-c9 */
1479	tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065); /* c3+c9 */
1480	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054)); /* c5 */
1481	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669)); /* c7 */
1482	tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
1483	+ MULTIPLY(tmp5, FIX(0.184591911)); /* c11 */
1484	tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
1485	tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
1486	+ MULTIPLY(tmp5, FIX(0.860918669)); /* c7 */
1487	tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
1488	- MULTIPLY(tmp5, FIX(1.121971054)); /* c5 */
1489	tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
1490	- MULTIPLY(tmp2 + tmp5, FIX_0_541196100); /* c9 */
1491
1492	dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS);
1493	dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS);
1494	dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS);
1495	dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS);
1496
1497	ctr++;
1498
1499	if (ctr != DCTSIZE) {
1500	if (ctr == 12)
1501	break; /* Done. */
1502	dataptr += DCTSIZE; /* advance pointer to next row */
1503	} else
1504	dataptr = workspace; /* switch pointer to extended workspace */
1505	}
1506
1507	/* Pass 2: process columns.
1508	* We leave the results scaled up by an overall factor of 8.
1509	* We must also scale the output by (8/12)**2 = 4/9, which we partially
1510	* fold into the constant multipliers and final shifting:
1511	* cK now represents sqrt(2) * cos(Kpi/24) 8/9.
1512	*/
1513
1514	dataptr = data;
1515	wsptr = workspace;
1516	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1517	/* Even part */
1518
1519	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE3];
1520	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE2];
1521	tmp2 = dataptr[DCTSIZE2] + wsptr[DCTSIZE1];
1522	tmp3 = dataptr[DCTSIZE3] + wsptr[DCTSIZE0];
1523	tmp4 = dataptr[DCTSIZE4] + dataptr[DCTSIZE7];
1524	tmp5 = dataptr[DCTSIZE5] + dataptr[DCTSIZE6];
1525
1526	tmp10 = tmp0 + tmp5;
1527	tmp13 = tmp0 - tmp5;
1528	tmp11 = tmp1 + tmp4;
1529	tmp14 = tmp1 - tmp4;
1530	tmp12 = tmp2 + tmp3;
1531	tmp15 = tmp2 - tmp3;
1532
1533	tmp0 = dataptr[DCTSIZE0] - wsptr[DCTSIZE3];
1534	tmp1 = dataptr[DCTSIZE1] - wsptr[DCTSIZE2];
1535	tmp2 = dataptr[DCTSIZE2] - wsptr[DCTSIZE1];
1536	tmp3 = dataptr[DCTSIZE3] - wsptr[DCTSIZE0];
1537	tmp4 = dataptr[DCTSIZE4] - dataptr[DCTSIZE7];
1538	tmp5 = dataptr[DCTSIZE5] - dataptr[DCTSIZE6];
1539
1540	dataptr[DCTSIZE*0] = (DCTELEM)
1541	DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
1542	CONST_BITS+1);
1543	dataptr[DCTSIZE*6] = (DCTELEM)
1544	DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
1545	CONST_BITS+1);
1546	dataptr[DCTSIZE*4] = (DCTELEM)
1547	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)), /* c4 */
1548	CONST_BITS+1);
1549	dataptr[DCTSIZE*2] = (DCTELEM)
1550	DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) + /* 8/9 */
1551	MULTIPLY(tmp13 + tmp15, FIX(1.214244803)), /* c2 */
1552	CONST_BITS+1);
1553
1554	/* Odd part */
1555
1556	tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200)); /* c9 */
1557	tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102)); /* c3-c9 */
1558	tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502)); /* c3+c9 */
1559	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603)); /* c5 */
1560	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039)); /* c7 */
1561	tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
1562	+ MULTIPLY(tmp5, FIX(0.164081699)); /* c11 */
1563	tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
1564	tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
1565	+ MULTIPLY(tmp5, FIX(0.765261039)); /* c7 */
1566	tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
1567	- MULTIPLY(tmp5, FIX(0.997307603)); /* c5 */
1568	tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
1569	- MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */
1570
1571	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+1);
1572	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+1);
1573	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+1);
1574	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+1);
1575
1576	dataptr++; /* advance pointer to next column */
1577	wsptr++; /* advance pointer to next column */
1578	}
1579	}
1580
1581
1582	/*
1583	* Perform the forward DCT on a 13x13 sample block.
1584	*/
1585
1586	GLOBAL(void)
1587	jpeg_fdct_13x13 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1588	{
1589	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1590	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
1591	INT32 z1, z2;
1592	DCTELEM workspace[8*5];
1593	DCTELEM *dataptr;
1594	DCTELEM *wsptr;
1595	JSAMPROW elemptr;
1596	int ctr;
1597	SHIFT_TEMPS
1598
1599	/* Pass 1: process rows. */
1600	/* Note results are scaled up by sqrt(8) compared to a true DCT. */
1601	/* cK represents sqrt(2) * cos(Kpi/26). /
1602
1603	dataptr = data;
1604	ctr = 0;
1605	for (;;) {
1606	elemptr = sample_data[ctr] + start_col;
1607
1608	/* Even part */
1609
1610	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[12]);
1611	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[11]);
1612	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[10]);
1613	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[9]);
1614	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[8]);
1615	tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[7]);
1616	tmp6 = GETJSAMPLE(elemptr[6]);
1617
1618	tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[12]);
1619	tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[11]);
1620	tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[10]);
1621	tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[9]);
1622	tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[8]);
1623	tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[7]);
1624
1625	/* Apply unsigned->signed conversion */
1626	dataptr[0] = (DCTELEM)
1627	(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6 - 13 * CENTERJSAMPLE);
1628	tmp6 += tmp6;
1629	tmp0 -= tmp6;
1630	tmp1 -= tmp6;
1631	tmp2 -= tmp6;
1632	tmp3 -= tmp6;
1633	tmp4 -= tmp6;
1634	tmp5 -= tmp6;
1635	dataptr[2] = (DCTELEM)
1636	DESCALE(MULTIPLY(tmp0, FIX(1.373119086)) + /* c2 */
1637	MULTIPLY(tmp1, FIX(1.058554052)) + /* c6 */
1638	MULTIPLY(tmp2, FIX(0.501487041)) - /* c10 */
1639	MULTIPLY(tmp3, FIX(0.170464608)) - /* c12 */
1640	MULTIPLY(tmp4, FIX(0.803364869)) - /* c8 */
1641	MULTIPLY(tmp5, FIX(1.252223920)), /* c4 */
1642	CONST_BITS);
1643	z1 = MULTIPLY(tmp0 - tmp2, FIX(1.155388986)) - /* (c4+c6)/2 */
1644	MULTIPLY(tmp3 - tmp4, FIX(0.435816023)) - /* (c2-c10)/2 */
1645	MULTIPLY(tmp1 - tmp5, FIX(0.316450131)); /* (c8-c12)/2 */
1646	z2 = MULTIPLY(tmp0 + tmp2, FIX(0.096834934)) - /* (c4-c6)/2 */
1647	MULTIPLY(tmp3 + tmp4, FIX(0.937303064)) + /* (c2+c10)/2 */
1648	MULTIPLY(tmp1 + tmp5, FIX(0.486914739)); /* (c8+c12)/2 */
1649
1650	dataptr[4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS);
1651	dataptr[6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS);
1652
1653	/* Odd part */
1654
1655	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.322312651)); /* c3 */
1656	tmp2 = MULTIPLY(tmp10 + tmp12, FIX(1.163874945)); /* c5 */
1657	tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.937797057)) + /* c7 */
1658	MULTIPLY(tmp14 + tmp15, FIX(0.338443458)); /* c11 */
1659	tmp0 = tmp1 + tmp2 + tmp3 -
1660	MULTIPLY(tmp10, FIX(2.020082300)) + /* c3+c5+c7-c1 */
1661	MULTIPLY(tmp14, FIX(0.318774355)); /* c9-c11 */
1662	tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.937797057)) - /* c7 */
1663	MULTIPLY(tmp11 + tmp12, FIX(0.338443458)); /* c11 */
1664	tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(1.163874945)); /* -c5 */
1665	tmp1 += tmp4 + tmp5 +
1666	MULTIPLY(tmp11, FIX(0.837223564)) - /* c5+c9+c11-c3 */
1667	MULTIPLY(tmp14, FIX(2.341699410)); /* c1+c7 */
1668	tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.657217813)); /* -c9 */
1669	tmp2 += tmp4 + tmp6 -
1670	MULTIPLY(tmp12, FIX(1.572116027)) + /* c1+c5-c9-c11 */
1671	MULTIPLY(tmp15, FIX(2.260109708)); /* c3+c7 */
1672	tmp3 += tmp5 + tmp6 +
1673	MULTIPLY(tmp13, FIX(2.205608352)) - /* c3+c5+c9-c7 */
1674	MULTIPLY(tmp15, FIX(1.742345811)); /* c1+c11 */
1675
1676	dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
1677	dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
1678	dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
1679	dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
1680
1681	ctr++;
1682
1683	if (ctr != DCTSIZE) {
1684	if (ctr == 13)
1685	break; /* Done. */
1686	dataptr += DCTSIZE; /* advance pointer to next row */
1687	} else
1688	dataptr = workspace; /* switch pointer to extended workspace */
1689	}
1690
1691	/* Pass 2: process columns.
1692	* We leave the results scaled up by an overall factor of 8.
1693	* We must also scale the output by (8/13)**2 = 64/169, which we partially
1694	* fold into the constant multipliers and final shifting:
1695	* cK now represents sqrt(2) * cos(Kpi/26) 128/169.
1696	*/
1697
1698	dataptr = data;
1699	wsptr = workspace;
1700	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1701	/* Even part */
1702
1703	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE4];
1704	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE3];
1705	tmp2 = dataptr[DCTSIZE2] + wsptr[DCTSIZE2];
1706	tmp3 = dataptr[DCTSIZE3] + wsptr[DCTSIZE1];
1707	tmp4 = dataptr[DCTSIZE4] + wsptr[DCTSIZE0];
1708	tmp5 = dataptr[DCTSIZE5] + dataptr[DCTSIZE7];
1709	tmp6 = dataptr[DCTSIZE*6];
1710
1711	tmp10 = dataptr[DCTSIZE0] - wsptr[DCTSIZE4];
1712	tmp11 = dataptr[DCTSIZE1] - wsptr[DCTSIZE3];
1713	tmp12 = dataptr[DCTSIZE2] - wsptr[DCTSIZE2];
1714	tmp13 = dataptr[DCTSIZE3] - wsptr[DCTSIZE1];
1715	tmp14 = dataptr[DCTSIZE4] - wsptr[DCTSIZE0];
1716	tmp15 = dataptr[DCTSIZE5] - dataptr[DCTSIZE7];
1717
1718	dataptr[DCTSIZE*0] = (DCTELEM)
1719	DESCALE(MULTIPLY(tmp0 + tmp1 + tmp2 + tmp3 + tmp4 + tmp5 + tmp6,
1720	FIX(0.757396450)), /* 128/169 */
1721	CONST_BITS+1);
1722	tmp6 += tmp6;
1723	tmp0 -= tmp6;
1724	tmp1 -= tmp6;
1725	tmp2 -= tmp6;
1726	tmp3 -= tmp6;
1727	tmp4 -= tmp6;
1728	tmp5 -= tmp6;
1729	dataptr[DCTSIZE*2] = (DCTELEM)
1730	DESCALE(MULTIPLY(tmp0, FIX(1.039995521)) + /* c2 */
1731	MULTIPLY(tmp1, FIX(0.801745081)) + /* c6 */
1732	MULTIPLY(tmp2, FIX(0.379824504)) - /* c10 */
1733	MULTIPLY(tmp3, FIX(0.129109289)) - /* c12 */
1734	MULTIPLY(tmp4, FIX(0.608465700)) - /* c8 */
1735	MULTIPLY(tmp5, FIX(0.948429952)), /* c4 */
1736	CONST_BITS+1);
1737	z1 = MULTIPLY(tmp0 - tmp2, FIX(0.875087516)) - /* (c4+c6)/2 */
1738	MULTIPLY(tmp3 - tmp4, FIX(0.330085509)) - /* (c2-c10)/2 */
1739	MULTIPLY(tmp1 - tmp5, FIX(0.239678205)); /* (c8-c12)/2 */
1740	z2 = MULTIPLY(tmp0 + tmp2, FIX(0.073342435)) - /* (c4-c6)/2 */
1741	MULTIPLY(tmp3 + tmp4, FIX(0.709910013)) + /* (c2+c10)/2 */
1742	MULTIPLY(tmp1 + tmp5, FIX(0.368787494)); /* (c8+c12)/2 */
1743
1744	dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+1);
1745	dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 - z2, CONST_BITS+1);
1746
1747	/* Odd part */
1748
1749	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.001514908)); /* c3 */
1750	tmp2 = MULTIPLY(tmp10 + tmp12, FIX(0.881514751)); /* c5 */
1751	tmp3 = MULTIPLY(tmp10 + tmp13, FIX(0.710284161)) + /* c7 */
1752	MULTIPLY(tmp14 + tmp15, FIX(0.256335874)); /* c11 */
1753	tmp0 = tmp1 + tmp2 + tmp3 -
1754	MULTIPLY(tmp10, FIX(1.530003162)) + /* c3+c5+c7-c1 */
1755	MULTIPLY(tmp14, FIX(0.241438564)); /* c9-c11 */
1756	tmp4 = MULTIPLY(tmp14 - tmp15, FIX(0.710284161)) - /* c7 */
1757	MULTIPLY(tmp11 + tmp12, FIX(0.256335874)); /* c11 */
1758	tmp5 = MULTIPLY(tmp11 + tmp13, - FIX(0.881514751)); /* -c5 */
1759	tmp1 += tmp4 + tmp5 +
1760	MULTIPLY(tmp11, FIX(0.634110155)) - /* c5+c9+c11-c3 */
1761	MULTIPLY(tmp14, FIX(1.773594819)); /* c1+c7 */
1762	tmp6 = MULTIPLY(tmp12 + tmp13, - FIX(0.497774438)); /* -c9 */
1763	tmp2 += tmp4 + tmp6 -
1764	MULTIPLY(tmp12, FIX(1.190715098)) + /* c1+c5-c9-c11 */
1765	MULTIPLY(tmp15, FIX(1.711799069)); /* c3+c7 */
1766	tmp3 += tmp5 + tmp6 +
1767	MULTIPLY(tmp13, FIX(1.670519935)) - /* c3+c5+c9-c7 */
1768	MULTIPLY(tmp15, FIX(1.319646532)); /* c1+c11 */
1769
1770	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+1);
1771	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+1);
1772	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+1);
1773	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+1);
1774
1775	dataptr++; /* advance pointer to next column */
1776	wsptr++; /* advance pointer to next column */
1777	}
1778	}
1779
1780
1781	/*
1782	* Perform the forward DCT on a 14x14 sample block.
1783	*/
1784
1785	GLOBAL(void)
1786	jpeg_fdct_14x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1787	{
1788	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1789	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
1790	DCTELEM workspace[8*6];
1791	DCTELEM *dataptr;
1792	DCTELEM *wsptr;
1793	JSAMPROW elemptr;
1794	int ctr;
1795	SHIFT_TEMPS
1796
1797	/* Pass 1: process rows. */
1798	/* Note results are scaled up by sqrt(8) compared to a true DCT. */
1799	/* cK represents sqrt(2) * cos(Kpi/28). /
1800
1801	dataptr = data;
1802	ctr = 0;
1803	for (;;) {
1804	elemptr = sample_data[ctr] + start_col;
1805
1806	/* Even part */
1807
1808	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
1809	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
1810	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
1811	tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
1812	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
1813	tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
1814	tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
1815
1816	tmp10 = tmp0 + tmp6;
1817	tmp14 = tmp0 - tmp6;
1818	tmp11 = tmp1 + tmp5;
1819	tmp15 = tmp1 - tmp5;
1820	tmp12 = tmp2 + tmp4;
1821	tmp16 = tmp2 - tmp4;
1822
1823	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
1824	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
1825	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
1826	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
1827	tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
1828	tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
1829	tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
1830
1831	/* Apply unsigned->signed conversion */
1832	dataptr[0] = (DCTELEM)
1833	(tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE);
1834	tmp13 += tmp13;
1835	dataptr[4] = (DCTELEM)
1836	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
1837	MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
1838	MULTIPLY(tmp12 - tmp13, FIX(0.881747734)), /* c8 */
1839	CONST_BITS);
1840
1841	tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686)); /* c6 */
1842
1843	dataptr[2] = (DCTELEM)
1844	DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590)) /* c2-c6 */
1845	+ MULTIPLY(tmp16, FIX(0.613604268)), /* c10 */
1846	CONST_BITS);
1847	dataptr[6] = (DCTELEM)
1848	DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954)) /* c6+c10 */
1849	- MULTIPLY(tmp16, FIX(1.378756276)), /* c2 */
1850	CONST_BITS);
1851
1852	/* Odd part */
1853
1854	tmp10 = tmp1 + tmp2;
1855	tmp11 = tmp5 - tmp4;
1856	dataptr[7] = (DCTELEM) (tmp0 - tmp10 + tmp3 - tmp11 - tmp6);
1857	tmp3 <<= CONST_BITS;
1858	tmp10 = MULTIPLY(tmp10, - FIX(0.158341681)); /* -c13 */
1859	tmp11 = MULTIPLY(tmp11, FIX(1.405321284)); /* c1 */
1860	tmp10 += tmp11 - tmp3;
1861	tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) + /* c5 */
1862	MULTIPLY(tmp4 + tmp6, FIX(0.752406978)); /* c9 */
1863	dataptr[5] = (DCTELEM)
1864	DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
1865	+ MULTIPLY(tmp4, FIX(1.119999435)), /* c1+c11-c9 */
1866	CONST_BITS);
1867	tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) + /* c3 */
1868	MULTIPLY(tmp5 - tmp6, FIX(0.467085129)); /* c11 */
1869	dataptr[3] = (DCTELEM)
1870	DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
1871	- MULTIPLY(tmp5, FIX(3.069855259)), /* c1+c5+c11 */
1872	CONST_BITS);
1873	dataptr[1] = (DCTELEM)
1874	DESCALE(tmp11 + tmp12 + tmp3 + tmp6 -
1875	MULTIPLY(tmp0 + tmp6, FIX(1.126980169)), /* c3+c5-c1 */
1876	CONST_BITS);
1877
1878	ctr++;
1879
1880	if (ctr != DCTSIZE) {
1881	if (ctr == 14)
1882	break; /* Done. */
1883	dataptr += DCTSIZE; /* advance pointer to next row */
1884	} else
1885	dataptr = workspace; /* switch pointer to extended workspace */
1886	}
1887
1888	/* Pass 2: process columns.
1889	* We leave the results scaled up by an overall factor of 8.
1890	* We must also scale the output by (8/14)**2 = 16/49, which we partially
1891	* fold into the constant multipliers and final shifting:
1892	* cK now represents sqrt(2) * cos(Kpi/28) 32/49.
1893	*/
1894
1895	dataptr = data;
1896	wsptr = workspace;
1897	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
1898	/* Even part */
1899
1900	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE5];
1901	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE4];
1902	tmp2 = dataptr[DCTSIZE2] + wsptr[DCTSIZE3];
1903	tmp13 = dataptr[DCTSIZE3] + wsptr[DCTSIZE2];
1904	tmp4 = dataptr[DCTSIZE4] + wsptr[DCTSIZE1];
1905	tmp5 = dataptr[DCTSIZE5] + wsptr[DCTSIZE0];
1906	tmp6 = dataptr[DCTSIZE6] + dataptr[DCTSIZE7];
1907
1908	tmp10 = tmp0 + tmp6;
1909	tmp14 = tmp0 - tmp6;
1910	tmp11 = tmp1 + tmp5;
1911	tmp15 = tmp1 - tmp5;
1912	tmp12 = tmp2 + tmp4;
1913	tmp16 = tmp2 - tmp4;
1914
1915	tmp0 = dataptr[DCTSIZE0] - wsptr[DCTSIZE5];
1916	tmp1 = dataptr[DCTSIZE1] - wsptr[DCTSIZE4];
1917	tmp2 = dataptr[DCTSIZE2] - wsptr[DCTSIZE3];
1918	tmp3 = dataptr[DCTSIZE3] - wsptr[DCTSIZE2];
1919	tmp4 = dataptr[DCTSIZE4] - wsptr[DCTSIZE1];
1920	tmp5 = dataptr[DCTSIZE5] - wsptr[DCTSIZE0];
1921	tmp6 = dataptr[DCTSIZE6] - dataptr[DCTSIZE7];
1922
1923	dataptr[DCTSIZE*0] = (DCTELEM)
1924	DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
1925	FIX(0.653061224)), /* 32/49 */
1926	CONST_BITS+1);
1927	tmp13 += tmp13;
1928	dataptr[DCTSIZE*4] = (DCTELEM)
1929	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
1930	MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
1931	MULTIPLY(tmp12 - tmp13, FIX(0.575835255)), /* c8 */
1932	CONST_BITS+1);
1933
1934	tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570)); /* c6 */
1935
1936	dataptr[DCTSIZE*2] = (DCTELEM)
1937	DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691)) /* c2-c6 */
1938	+ MULTIPLY(tmp16, FIX(0.400721155)), /* c10 */
1939	CONST_BITS+1);
1940	dataptr[DCTSIZE*6] = (DCTELEM)
1941	DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725)) /* c6+c10 */
1942	- MULTIPLY(tmp16, FIX(0.900412262)), /* c2 */
1943	CONST_BITS+1);
1944
1945	/* Odd part */
1946
1947	tmp10 = tmp1 + tmp2;
1948	tmp11 = tmp5 - tmp4;
1949	dataptr[DCTSIZE*7] = (DCTELEM)
1950	DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
1951	FIX(0.653061224)), /* 32/49 */
1952	CONST_BITS+1);
1953	tmp3 = MULTIPLY(tmp3 , FIX(0.653061224)); /* 32/49 */
1954	tmp10 = MULTIPLY(tmp10, - FIX(0.103406812)); /* -c13 */
1955	tmp11 = MULTIPLY(tmp11, FIX(0.917760839)); /* c1 */
1956	tmp10 += tmp11 - tmp3;
1957	tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) + /* c5 */
1958	MULTIPLY(tmp4 + tmp6, FIX(0.491367823)); /* c9 */
1959	dataptr[DCTSIZE*5] = (DCTELEM)
1960	DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
1961	+ MULTIPLY(tmp4, FIX(0.731428202)), /* c1+c11-c9 */
1962	CONST_BITS+1);
1963	tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) + /* c3 */
1964	MULTIPLY(tmp5 - tmp6, FIX(0.305035186)); /* c11 */
1965	dataptr[DCTSIZE*3] = (DCTELEM)
1966	DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
1967	- MULTIPLY(tmp5, FIX(2.004803435)), /* c1+c5+c11 */
1968	CONST_BITS+1);
1969	dataptr[DCTSIZE*1] = (DCTELEM)
1970	DESCALE(tmp11 + tmp12 + tmp3
1971	- MULTIPLY(tmp0, FIX(0.735987049)) /* c3+c5-c1 */
1972	- MULTIPLY(tmp6, FIX(0.082925825)), /* c9-c11-c13 */
1973	CONST_BITS+1);
1974
1975	dataptr++; /* advance pointer to next column */
1976	wsptr++; /* advance pointer to next column */
1977	}
1978	}
1979
1980
1981	/*
1982	* Perform the forward DCT on a 15x15 sample block.
1983	*/
1984
1985	GLOBAL(void)
1986	jpeg_fdct_15x15 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
1987	{
1988	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1989	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
1990	INT32 z1, z2, z3;
1991	DCTELEM workspace[8*7];
1992	DCTELEM *dataptr;
1993	DCTELEM *wsptr;
1994	JSAMPROW elemptr;
1995	int ctr;
1996	SHIFT_TEMPS
1997
1998	/* Pass 1: process rows. */
1999	/* Note results are scaled up by sqrt(8) compared to a true DCT. */
2000	/* cK represents sqrt(2) * cos(Kpi/30). /
2001
2002	dataptr = data;
2003	ctr = 0;
2004	for (;;) {
2005	elemptr = sample_data[ctr] + start_col;
2006
2007	/* Even part */
2008
2009	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[14]);
2010	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[13]);
2011	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[12]);
2012	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[11]);
2013	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[10]);
2014	tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[9]);
2015	tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[8]);
2016	tmp7 = GETJSAMPLE(elemptr[7]);
2017
2018	tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[14]);
2019	tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[13]);
2020	tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[12]);
2021	tmp13 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[11]);
2022	tmp14 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[10]);
2023	tmp15 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[9]);
2024	tmp16 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[8]);
2025
2026	z1 = tmp0 + tmp4 + tmp5;
2027	z2 = tmp1 + tmp3 + tmp6;
2028	z3 = tmp2 + tmp7;
2029	/* Apply unsigned->signed conversion */
2030	dataptr[0] = (DCTELEM) (z1 + z2 + z3 - 15 * CENTERJSAMPLE);
2031	z3 += z3;
2032	dataptr[6] = (DCTELEM)
2033	DESCALE(MULTIPLY(z1 - z3, FIX(1.144122806)) - /* c6 */
2034	MULTIPLY(z2 - z3, FIX(0.437016024)), /* c12 */
2035	CONST_BITS);
2036	tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;
2037	z1 = MULTIPLY(tmp3 - tmp2, FIX(1.531135173)) - /* c2+c14 */
2038	MULTIPLY(tmp6 - tmp2, FIX(2.238241955)); /* c4+c8 */
2039	z2 = MULTIPLY(tmp5 - tmp2, FIX(0.798468008)) - /* c8-c14 */
2040	MULTIPLY(tmp0 - tmp2, FIX(0.091361227)); /* c2-c4 */
2041	z3 = MULTIPLY(tmp0 - tmp3, FIX(1.383309603)) + /* c2 */
2042	MULTIPLY(tmp6 - tmp5, FIX(0.946293579)) + /* c8 */
2043	MULTIPLY(tmp1 - tmp4, FIX(0.790569415)); /* (c6+c12)/2 */
2044
2045	dataptr[2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS);
2046	dataptr[4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS);
2047
2048	/* Odd part */
2049
2050	tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
2051	FIX(1.224744871)); /* c5 */
2052	tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.344997024)) + /* c3 */
2053	MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.831253876)); /* c9 */
2054	tmp12 = MULTIPLY(tmp12, FIX(1.224744871)); /* c5 */
2055	tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.406466353)) + /* c1 */
2056	MULTIPLY(tmp11 + tmp14, FIX(1.344997024)) + /* c3 */
2057	MULTIPLY(tmp13 + tmp15, FIX(0.575212477)); /* c11 */
2058	tmp0 = MULTIPLY(tmp13, FIX(0.475753014)) - /* c7-c11 */
2059	MULTIPLY(tmp14, FIX(0.513743148)) + /* c3-c9 */
2060	MULTIPLY(tmp16, FIX(1.700497885)) + tmp4 + tmp12; /* c1+c13 */
2061	tmp3 = MULTIPLY(tmp10, - FIX(0.355500862)) - /* -(c1-c7) */
2062	MULTIPLY(tmp11, FIX(2.176250899)) - /* c3+c9 */
2063	MULTIPLY(tmp15, FIX(0.869244010)) + tmp4 - tmp12; /* c11+c13 */
2064
2065	dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS);
2066	dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS);
2067	dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS);
2068	dataptr[7] = (DCTELEM) DESCALE(tmp3, CONST_BITS);
2069
2070	ctr++;
2071
2072	if (ctr != DCTSIZE) {
2073	if (ctr == 15)
2074	break; /* Done. */
2075	dataptr += DCTSIZE; /* advance pointer to next row */
2076	} else
2077	dataptr = workspace; /* switch pointer to extended workspace */
2078	}
2079
2080	/* Pass 2: process columns.
2081	* We leave the results scaled up by an overall factor of 8.
2082	* We must also scale the output by (8/15)**2 = 64/225, which we partially
2083	* fold into the constant multipliers and final shifting:
2084	* cK now represents sqrt(2) * cos(Kpi/30) 256/225.
2085	*/
2086
2087	dataptr = data;
2088	wsptr = workspace;
2089	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2090	/* Even part */
2091
2092	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE6];
2093	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE5];
2094	tmp2 = dataptr[DCTSIZE2] + wsptr[DCTSIZE4];
2095	tmp3 = dataptr[DCTSIZE3] + wsptr[DCTSIZE3];
2096	tmp4 = dataptr[DCTSIZE4] + wsptr[DCTSIZE2];
2097	tmp5 = dataptr[DCTSIZE5] + wsptr[DCTSIZE1];
2098	tmp6 = dataptr[DCTSIZE6] + wsptr[DCTSIZE0];
2099	tmp7 = dataptr[DCTSIZE*7];
2100
2101	tmp10 = dataptr[DCTSIZE0] - wsptr[DCTSIZE6];
2102	tmp11 = dataptr[DCTSIZE1] - wsptr[DCTSIZE5];
2103	tmp12 = dataptr[DCTSIZE2] - wsptr[DCTSIZE4];
2104	tmp13 = dataptr[DCTSIZE3] - wsptr[DCTSIZE3];
2105	tmp14 = dataptr[DCTSIZE4] - wsptr[DCTSIZE2];
2106	tmp15 = dataptr[DCTSIZE5] - wsptr[DCTSIZE1];
2107	tmp16 = dataptr[DCTSIZE6] - wsptr[DCTSIZE0];
2108
2109	z1 = tmp0 + tmp4 + tmp5;
2110	z2 = tmp1 + tmp3 + tmp6;
2111	z3 = tmp2 + tmp7;
2112	dataptr[DCTSIZE*0] = (DCTELEM)
2113	DESCALE(MULTIPLY(z1 + z2 + z3, FIX(1.137777778)), /* 256/225 */
2114	CONST_BITS+2);
2115	z3 += z3;
2116	dataptr[DCTSIZE*6] = (DCTELEM)
2117	DESCALE(MULTIPLY(z1 - z3, FIX(1.301757503)) - /* c6 */
2118	MULTIPLY(z2 - z3, FIX(0.497227121)), /* c12 */
2119	CONST_BITS+2);
2120	tmp2 += ((tmp1 + tmp4) >> 1) - tmp7 - tmp7;
2121	z1 = MULTIPLY(tmp3 - tmp2, FIX(1.742091575)) - /* c2+c14 */
2122	MULTIPLY(tmp6 - tmp2, FIX(2.546621957)); /* c4+c8 */
2123	z2 = MULTIPLY(tmp5 - tmp2, FIX(0.908479156)) - /* c8-c14 */
2124	MULTIPLY(tmp0 - tmp2, FIX(0.103948774)); /* c2-c4 */
2125	z3 = MULTIPLY(tmp0 - tmp3, FIX(1.573898926)) + /* c2 */
2126	MULTIPLY(tmp6 - tmp5, FIX(1.076671805)) + /* c8 */
2127	MULTIPLY(tmp1 - tmp4, FIX(0.899492312)); /* (c6+c12)/2 */
2128
2129	dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z3, CONST_BITS+2);
2130	dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(z2 + z3, CONST_BITS+2);
2131
2132	/* Odd part */
2133
2134	tmp2 = MULTIPLY(tmp10 - tmp12 - tmp13 + tmp15 + tmp16,
2135	FIX(1.393487498)); /* c5 */
2136	tmp1 = MULTIPLY(tmp10 - tmp14 - tmp15, FIX(1.530307725)) + /* c3 */
2137	MULTIPLY(tmp11 - tmp13 - tmp16, FIX(0.945782187)); /* c9 */
2138	tmp12 = MULTIPLY(tmp12, FIX(1.393487498)); /* c5 */
2139	tmp4 = MULTIPLY(tmp10 - tmp16, FIX(1.600246161)) + /* c1 */
2140	MULTIPLY(tmp11 + tmp14, FIX(1.530307725)) + /* c3 */
2141	MULTIPLY(tmp13 + tmp15, FIX(0.654463974)); /* c11 */
2142	tmp0 = MULTIPLY(tmp13, FIX(0.541301207)) - /* c7-c11 */
2143	MULTIPLY(tmp14, FIX(0.584525538)) + /* c3-c9 */
2144	MULTIPLY(tmp16, FIX(1.934788705)) + tmp4 + tmp12; /* c1+c13 */
2145	tmp3 = MULTIPLY(tmp10, - FIX(0.404480980)) - /* -(c1-c7) */
2146	MULTIPLY(tmp11, FIX(2.476089912)) - /* c3+c9 */
2147	MULTIPLY(tmp15, FIX(0.989006518)) + tmp4 - tmp12; /* c11+c13 */
2148
2149	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+2);
2150	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+2);
2151	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+2);
2152	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3, CONST_BITS+2);
2153
2154	dataptr++; /* advance pointer to next column */
2155	wsptr++; /* advance pointer to next column */
2156	}
2157	}
2158
2159
2160	/*
2161	* Perform the forward DCT on a 16x16 sample block.
2162	*/
2163
2164	GLOBAL(void)
2165	jpeg_fdct_16x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2166	{
2167	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2168	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
2169	DCTELEM workspace[DCTSIZE2];
2170	DCTELEM *dataptr;
2171	DCTELEM *wsptr;
2172	JSAMPROW elemptr;
2173	int ctr;
2174	SHIFT_TEMPS
2175
2176	/* Pass 1: process rows. */
2177	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
2178	/* furthermore, we scale the results by 2*PASS1_BITS. /
2179	/* cK represents sqrt(2) * cos(Kpi/32). /
2180
2181	dataptr = data;
2182	ctr = 0;
2183	for (;;) {
2184	elemptr = sample_data[ctr] + start_col;
2185
2186	/* Even part */
2187
2188	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
2189	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
2190	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
2191	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
2192	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
2193	tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
2194	tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
2195	tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
2196
2197	tmp10 = tmp0 + tmp7;
2198	tmp14 = tmp0 - tmp7;
2199	tmp11 = tmp1 + tmp6;
2200	tmp15 = tmp1 - tmp6;
2201	tmp12 = tmp2 + tmp5;
2202	tmp16 = tmp2 - tmp5;
2203	tmp13 = tmp3 + tmp4;
2204	tmp17 = tmp3 - tmp4;
2205
2206	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
2207	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
2208	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
2209	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
2210	tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
2211	tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
2212	tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
2213	tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
2214
2215	/* Apply unsigned->signed conversion */
2216	dataptr[0] = (DCTELEM)
2217	((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
2218	dataptr[4] = (DCTELEM)
2219	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
2220	MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
2221	CONST_BITS-PASS1_BITS);
2222
2223	tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
2224	MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
2225
2226	dataptr[2] = (DCTELEM)
2227	DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
2228	+ MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */
2229	CONST_BITS-PASS1_BITS);
2230	dataptr[6] = (DCTELEM)
2231	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
2232	- MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
2233	CONST_BITS-PASS1_BITS);
2234
2235	/* Odd part */
2236
2237	tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
2238	MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
2239	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
2240	MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
2241	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
2242	MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
2243	tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
2244	MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
2245	tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
2246	MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
2247	tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
2248	MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
2249	tmp10 = tmp11 + tmp12 + tmp13 -
2250	MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
2251	MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
2252	tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
2253	- MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
2254	tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
2255	+ MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
2256	tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
2257	+ MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
2258
2259	dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
2260	dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
2261	dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
2262	dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
2263
2264	ctr++;
2265
2266	if (ctr != DCTSIZE) {
2267	if (ctr == DCTSIZE * 2)
2268	break; /* Done. */
2269	dataptr += DCTSIZE; /* advance pointer to next row */
2270	} else
2271	dataptr = workspace; /* switch pointer to extended workspace */
2272	}
2273
2274	/* Pass 2: process columns.
2275	* We remove the PASS1_BITS scaling, but leave the results scaled up
2276	* by an overall factor of 8.
2277	* We must also scale the output by (8/16)2 = 1/22.
2278	*/
2279
2280	dataptr = data;
2281	wsptr = workspace;
2282	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2283	/* Even part */
2284
2285	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE7];
2286	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE6];
2287	tmp2 = dataptr[DCTSIZE2] + wsptr[DCTSIZE5];
2288	tmp3 = dataptr[DCTSIZE3] + wsptr[DCTSIZE4];
2289	tmp4 = dataptr[DCTSIZE4] + wsptr[DCTSIZE3];
2290	tmp5 = dataptr[DCTSIZE5] + wsptr[DCTSIZE2];
2291	tmp6 = dataptr[DCTSIZE6] + wsptr[DCTSIZE1];
2292	tmp7 = dataptr[DCTSIZE7] + wsptr[DCTSIZE0];
2293
2294	tmp10 = tmp0 + tmp7;
2295	tmp14 = tmp0 - tmp7;
2296	tmp11 = tmp1 + tmp6;
2297	tmp15 = tmp1 - tmp6;
2298	tmp12 = tmp2 + tmp5;
2299	tmp16 = tmp2 - tmp5;
2300	tmp13 = tmp3 + tmp4;
2301	tmp17 = tmp3 - tmp4;
2302
2303	tmp0 = dataptr[DCTSIZE0] - wsptr[DCTSIZE7];
2304	tmp1 = dataptr[DCTSIZE1] - wsptr[DCTSIZE6];
2305	tmp2 = dataptr[DCTSIZE2] - wsptr[DCTSIZE5];
2306	tmp3 = dataptr[DCTSIZE3] - wsptr[DCTSIZE4];
2307	tmp4 = dataptr[DCTSIZE4] - wsptr[DCTSIZE3];
2308	tmp5 = dataptr[DCTSIZE5] - wsptr[DCTSIZE2];
2309	tmp6 = dataptr[DCTSIZE6] - wsptr[DCTSIZE1];
2310	tmp7 = dataptr[DCTSIZE7] - wsptr[DCTSIZE0];
2311
2312	dataptr[DCTSIZE*0] = (DCTELEM)
2313	DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+2);
2314	dataptr[DCTSIZE*4] = (DCTELEM)
2315	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
2316	MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
2317	CONST_BITS+PASS1_BITS+2);
2318
2319	tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
2320	MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
2321
2322	dataptr[DCTSIZE*2] = (DCTELEM)
2323	DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
2324	+ MULTIPLY(tmp16, FIX(2.172734804)), /* c2+10 */
2325	CONST_BITS+PASS1_BITS+2);
2326	dataptr[DCTSIZE*6] = (DCTELEM)
2327	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
2328	- MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
2329	CONST_BITS+PASS1_BITS+2);
2330
2331	/* Odd part */
2332
2333	tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
2334	MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
2335	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
2336	MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
2337	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
2338	MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
2339	tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
2340	MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
2341	tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
2342	MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
2343	tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
2344	MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
2345	tmp10 = tmp11 + tmp12 + tmp13 -
2346	MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
2347	MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
2348	tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
2349	- MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
2350	tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
2351	+ MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
2352	tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
2353	+ MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
2354
2355	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+2);
2356	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+2);
2357	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+2);
2358	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+2);
2359
2360	dataptr++; /* advance pointer to next column */
2361	wsptr++; /* advance pointer to next column */
2362	}
2363	}
2364
2365
2366	/*
2367	* Perform the forward DCT on a 16x8 sample block.
2368	*
2369	* 16-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
2370	*/
2371
2372	GLOBAL(void)
2373	jpeg_fdct_16x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2374	{
2375	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2376	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
2377	INT32 z1;
2378	DCTELEM *dataptr;
2379	JSAMPROW elemptr;
2380	int ctr;
2381	SHIFT_TEMPS
2382
2383	/* Pass 1: process rows. */
2384	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
2385	/* furthermore, we scale the results by 2*PASS1_BITS. /
2386	/* 16-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/32). /
2387
2388	dataptr = data;
2389	ctr = 0;
2390	for (ctr = 0; ctr < DCTSIZE; ctr++) {
2391	elemptr = sample_data[ctr] + start_col;
2392
2393	/* Even part */
2394
2395	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[15]);
2396	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[14]);
2397	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[13]);
2398	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[12]);
2399	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[11]);
2400	tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[10]);
2401	tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[9]);
2402	tmp7 = GETJSAMPLE(elemptr[7]) + GETJSAMPLE(elemptr[8]);
2403
2404	tmp10 = tmp0 + tmp7;
2405	tmp14 = tmp0 - tmp7;
2406	tmp11 = tmp1 + tmp6;
2407	tmp15 = tmp1 - tmp6;
2408	tmp12 = tmp2 + tmp5;
2409	tmp16 = tmp2 - tmp5;
2410	tmp13 = tmp3 + tmp4;
2411	tmp17 = tmp3 - tmp4;
2412
2413	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[15]);
2414	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[14]);
2415	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[13]);
2416	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[12]);
2417	tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[11]);
2418	tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[10]);
2419	tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[9]);
2420	tmp7 = GETJSAMPLE(elemptr[7]) - GETJSAMPLE(elemptr[8]);
2421
2422	/* Apply unsigned->signed conversion */
2423	dataptr[0] = (DCTELEM)
2424	((tmp10 + tmp11 + tmp12 + tmp13 - 16 * CENTERJSAMPLE) << PASS1_BITS);
2425	dataptr[4] = (DCTELEM)
2426	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
2427	MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
2428	CONST_BITS-PASS1_BITS);
2429
2430	tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
2431	MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
2432
2433	dataptr[2] = (DCTELEM)
2434	DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
2435	+ MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */
2436	CONST_BITS-PASS1_BITS);
2437	dataptr[6] = (DCTELEM)
2438	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
2439	- MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
2440	CONST_BITS-PASS1_BITS);
2441
2442	/* Odd part */
2443
2444	tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
2445	MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
2446	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
2447	MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
2448	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
2449	MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
2450	tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
2451	MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
2452	tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
2453	MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
2454	tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
2455	MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
2456	tmp10 = tmp11 + tmp12 + tmp13 -
2457	MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
2458	MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
2459	tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
2460	- MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
2461	tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
2462	+ MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
2463	tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
2464	+ MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
2465
2466	dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
2467	dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
2468	dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
2469	dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
2470
2471	dataptr += DCTSIZE; /* advance pointer to next row */
2472	}
2473
2474	/* Pass 2: process columns.
2475	* We remove the PASS1_BITS scaling, but leave the results scaled up
2476	* by an overall factor of 8.
2477	* We must also scale the output by 8/16 = 1/2.
2478	*/
2479
2480	dataptr = data;
2481	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2482	/* Even part per LL&M figure 1 --- note that published figure is faulty;
2483	* rotator "sqrt(2)c1" should be "sqrt(2)c6".
2484	*/
2485
2486	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE7];
2487	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE6];
2488	tmp2 = dataptr[DCTSIZE2] + dataptr[DCTSIZE5];
2489	tmp3 = dataptr[DCTSIZE3] + dataptr[DCTSIZE4];
2490
2491	tmp10 = tmp0 + tmp3;
2492	tmp12 = tmp0 - tmp3;
2493	tmp11 = tmp1 + tmp2;
2494	tmp13 = tmp1 - tmp2;
2495
2496	tmp0 = dataptr[DCTSIZE0] - dataptr[DCTSIZE7];
2497	tmp1 = dataptr[DCTSIZE1] - dataptr[DCTSIZE6];
2498	tmp2 = dataptr[DCTSIZE2] - dataptr[DCTSIZE5];
2499	tmp3 = dataptr[DCTSIZE3] - dataptr[DCTSIZE4];
2500
2501	dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS+1);
2502	dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS+1);
2503
2504	z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
2505	dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865),
2506	CONST_BITS+PASS1_BITS+1);
2507	dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065),
2508	CONST_BITS+PASS1_BITS+1);
2509
2510	/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
2511	* 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
2512	* i0..i3 in the paper are tmp0..tmp3 here.
2513	*/
2514
2515	tmp10 = tmp0 + tmp3;
2516	tmp11 = tmp1 + tmp2;
2517	tmp12 = tmp0 + tmp2;
2518	tmp13 = tmp1 + tmp3;
2519	z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
2520
2521	tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
2522	tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
2523	tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
2524	tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
2525	tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
2526	tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
2527	tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
2528	tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
2529
2530	tmp12 += z1;
2531	tmp13 += z1;
2532
2533	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12,
2534	CONST_BITS+PASS1_BITS+1);
2535	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13,
2536	CONST_BITS+PASS1_BITS+1);
2537	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12,
2538	CONST_BITS+PASS1_BITS+1);
2539	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13,
2540	CONST_BITS+PASS1_BITS+1);
2541
2542	dataptr++; /* advance pointer to next column */
2543	}
2544	}
2545
2546
2547	/*
2548	* Perform the forward DCT on a 14x7 sample block.
2549	*
2550	* 14-point FDCT in pass 1 (rows), 7-point in pass 2 (columns).
2551	*/
2552
2553	GLOBAL(void)
2554	jpeg_fdct_14x7 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2555	{
2556	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
2557	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
2558	INT32 z1, z2, z3;
2559	DCTELEM *dataptr;
2560	JSAMPROW elemptr;
2561	int ctr;
2562	SHIFT_TEMPS
2563
2564	/* Zero bottom row of output coefficient block. */
2565	MEMZERO(&data[DCTSIZE7], SIZEOF(DCTELEM) DCTSIZE);
2566
2567	/* Pass 1: process rows. */
2568	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
2569	/* furthermore, we scale the results by 2*PASS1_BITS. /
2570	/* 14-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/28). /
2571
2572	dataptr = data;
2573	for (ctr = 0; ctr < 7; ctr++) {
2574	elemptr = sample_data[ctr] + start_col;
2575
2576	/* Even part */
2577
2578	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[13]);
2579	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[12]);
2580	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[11]);
2581	tmp13 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[10]);
2582	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[9]);
2583	tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[8]);
2584	tmp6 = GETJSAMPLE(elemptr[6]) + GETJSAMPLE(elemptr[7]);
2585
2586	tmp10 = tmp0 + tmp6;
2587	tmp14 = tmp0 - tmp6;
2588	tmp11 = tmp1 + tmp5;
2589	tmp15 = tmp1 - tmp5;
2590	tmp12 = tmp2 + tmp4;
2591	tmp16 = tmp2 - tmp4;
2592
2593	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[13]);
2594	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[12]);
2595	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[11]);
2596	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[10]);
2597	tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[9]);
2598	tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[8]);
2599	tmp6 = GETJSAMPLE(elemptr[6]) - GETJSAMPLE(elemptr[7]);
2600
2601	/* Apply unsigned->signed conversion */
2602	dataptr[0] = (DCTELEM)
2603	((tmp10 + tmp11 + tmp12 + tmp13 - 14 * CENTERJSAMPLE) << PASS1_BITS);
2604	tmp13 += tmp13;
2605	dataptr[4] = (DCTELEM)
2606	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.274162392)) + /* c4 */
2607	MULTIPLY(tmp11 - tmp13, FIX(0.314692123)) - /* c12 */
2608	MULTIPLY(tmp12 - tmp13, FIX(0.881747734)), /* c8 */
2609	CONST_BITS-PASS1_BITS);
2610
2611	tmp10 = MULTIPLY(tmp14 + tmp15, FIX(1.105676686)); /* c6 */
2612
2613	dataptr[2] = (DCTELEM)
2614	DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.273079590)) /* c2-c6 */
2615	+ MULTIPLY(tmp16, FIX(0.613604268)), /* c10 */
2616	CONST_BITS-PASS1_BITS);
2617	dataptr[6] = (DCTELEM)
2618	DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.719280954)) /* c6+c10 */
2619	- MULTIPLY(tmp16, FIX(1.378756276)), /* c2 */
2620	CONST_BITS-PASS1_BITS);
2621
2622	/* Odd part */
2623
2624	tmp10 = tmp1 + tmp2;
2625	tmp11 = tmp5 - tmp4;
2626	dataptr[7] = (DCTELEM) ((tmp0 - tmp10 + tmp3 - tmp11 - tmp6) << PASS1_BITS);
2627	tmp3 <<= CONST_BITS;
2628	tmp10 = MULTIPLY(tmp10, - FIX(0.158341681)); /* -c13 */
2629	tmp11 = MULTIPLY(tmp11, FIX(1.405321284)); /* c1 */
2630	tmp10 += tmp11 - tmp3;
2631	tmp11 = MULTIPLY(tmp0 + tmp2, FIX(1.197448846)) + /* c5 */
2632	MULTIPLY(tmp4 + tmp6, FIX(0.752406978)); /* c9 */
2633	dataptr[5] = (DCTELEM)
2634	DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(2.373959773)) /* c3+c5-c13 */
2635	+ MULTIPLY(tmp4, FIX(1.119999435)), /* c1+c11-c9 */
2636	CONST_BITS-PASS1_BITS);
2637	tmp12 = MULTIPLY(tmp0 + tmp1, FIX(1.334852607)) + /* c3 */
2638	MULTIPLY(tmp5 - tmp6, FIX(0.467085129)); /* c11 */
2639	dataptr[3] = (DCTELEM)
2640	DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.424103948)) /* c3-c9-c13 */
2641	- MULTIPLY(tmp5, FIX(3.069855259)), /* c1+c5+c11 */
2642	CONST_BITS-PASS1_BITS);
2643	dataptr[1] = (DCTELEM)
2644	DESCALE(tmp11 + tmp12 + tmp3 + tmp6 -
2645	MULTIPLY(tmp0 + tmp6, FIX(1.126980169)), /* c3+c5-c1 */
2646	CONST_BITS-PASS1_BITS);
2647
2648	dataptr += DCTSIZE; /* advance pointer to next row */
2649	}
2650
2651	/* Pass 2: process columns.
2652	* We remove the PASS1_BITS scaling, but leave the results scaled up
2653	* by an overall factor of 8.
2654	* We must also scale the output by (8/14)*(8/7) = 32/49, which we
2655	* partially fold into the constant multipliers and final shifting:
2656	* 7-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/14) 64/49.
2657	*/
2658
2659	dataptr = data;
2660	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2661	/* Even part */
2662
2663	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE6];
2664	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE5];
2665	tmp2 = dataptr[DCTSIZE2] + dataptr[DCTSIZE4];
2666	tmp3 = dataptr[DCTSIZE*3];
2667
2668	tmp10 = dataptr[DCTSIZE0] - dataptr[DCTSIZE6];
2669	tmp11 = dataptr[DCTSIZE1] - dataptr[DCTSIZE5];
2670	tmp12 = dataptr[DCTSIZE2] - dataptr[DCTSIZE4];
2671
2672	z1 = tmp0 + tmp2;
2673	dataptr[DCTSIZE*0] = (DCTELEM)
2674	DESCALE(MULTIPLY(z1 + tmp1 + tmp3, FIX(1.306122449)), /* 64/49 */
2675	CONST_BITS+PASS1_BITS+1);
2676	tmp3 += tmp3;
2677	z1 -= tmp3;
2678	z1 -= tmp3;
2679	z1 = MULTIPLY(z1, FIX(0.461784020)); /* (c2+c6-c4)/2 */
2680	z2 = MULTIPLY(tmp0 - tmp2, FIX(1.202428084)); /* (c2+c4-c6)/2 */
2681	z3 = MULTIPLY(tmp1 - tmp2, FIX(0.411026446)); /* c6 */
2682	dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS+PASS1_BITS+1);
2683	z1 -= z2;
2684	z2 = MULTIPLY(tmp0 - tmp1, FIX(1.151670509)); /* c4 */
2685	dataptr[DCTSIZE*4] = (DCTELEM)
2686	DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.923568041)), /* c2+c6-c4 */
2687	CONST_BITS+PASS1_BITS+1);
2688	dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS+PASS1_BITS+1);
2689
2690	/* Odd part */
2691
2692	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(1.221765677)); /* (c3+c1-c5)/2 */
2693	tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.222383464)); /* (c3+c5-c1)/2 */
2694	tmp0 = tmp1 - tmp2;
2695	tmp1 += tmp2;
2696	tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.800824523)); /* -c1 */
2697	tmp1 += tmp2;
2698	tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.801442310)); /* c5 */
2699	tmp0 += tmp3;
2700	tmp2 += tmp3 + MULTIPLY(tmp12, FIX(2.443531355)); /* c3+c1-c5 */
2701
2702	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp0, CONST_BITS+PASS1_BITS+1);
2703	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp1, CONST_BITS+PASS1_BITS+1);
2704	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp2, CONST_BITS+PASS1_BITS+1);
2705
2706	dataptr++; /* advance pointer to next column */
2707	}
2708	}
2709
2710
2711	/*
2712	* Perform the forward DCT on a 12x6 sample block.
2713	*
2714	* 12-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
2715	*/
2716
2717	GLOBAL(void)
2718	jpeg_fdct_12x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2719	{
2720	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
2721	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
2722	DCTELEM *dataptr;
2723	JSAMPROW elemptr;
2724	int ctr;
2725	SHIFT_TEMPS
2726
2727	/* Zero 2 bottom rows of output coefficient block. */
2728	MEMZERO(&data[DCTSIZE6], SIZEOF(DCTELEM) DCTSIZE * 2);
2729
2730	/* Pass 1: process rows. */
2731	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
2732	/* furthermore, we scale the results by 2*PASS1_BITS. /
2733	/* 12-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/24). /
2734
2735	dataptr = data;
2736	for (ctr = 0; ctr < 6; ctr++) {
2737	elemptr = sample_data[ctr] + start_col;
2738
2739	/* Even part */
2740
2741	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[11]);
2742	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[10]);
2743	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[9]);
2744	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[8]);
2745	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[7]);
2746	tmp5 = GETJSAMPLE(elemptr[5]) + GETJSAMPLE(elemptr[6]);
2747
2748	tmp10 = tmp0 + tmp5;
2749	tmp13 = tmp0 - tmp5;
2750	tmp11 = tmp1 + tmp4;
2751	tmp14 = tmp1 - tmp4;
2752	tmp12 = tmp2 + tmp3;
2753	tmp15 = tmp2 - tmp3;
2754
2755	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[11]);
2756	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[10]);
2757	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[9]);
2758	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[8]);
2759	tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[7]);
2760	tmp5 = GETJSAMPLE(elemptr[5]) - GETJSAMPLE(elemptr[6]);
2761
2762	/* Apply unsigned->signed conversion */
2763	dataptr[0] = (DCTELEM)
2764	((tmp10 + tmp11 + tmp12 - 12 * CENTERJSAMPLE) << PASS1_BITS);
2765	dataptr[6] = (DCTELEM) ((tmp13 - tmp14 - tmp15) << PASS1_BITS);
2766	dataptr[4] = (DCTELEM)
2767	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.224744871)), /* c4 */
2768	CONST_BITS-PASS1_BITS);
2769	dataptr[2] = (DCTELEM)
2770	DESCALE(tmp14 - tmp15 + MULTIPLY(tmp13 + tmp15, FIX(1.366025404)), /* c2 */
2771	CONST_BITS-PASS1_BITS);
2772
2773	/* Odd part */
2774
2775	tmp10 = MULTIPLY(tmp1 + tmp4, FIX_0_541196100); /* c9 */
2776	tmp14 = tmp10 + MULTIPLY(tmp1, FIX_0_765366865); /* c3-c9 */
2777	tmp15 = tmp10 - MULTIPLY(tmp4, FIX_1_847759065); /* c3+c9 */
2778	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.121971054)); /* c5 */
2779	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.860918669)); /* c7 */
2780	tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.580774953)) /* c5+c7-c1 */
2781	+ MULTIPLY(tmp5, FIX(0.184591911)); /* c11 */
2782	tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.184591911)); /* -c11 */
2783	tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.339493912)) /* c1+c5-c11 */
2784	+ MULTIPLY(tmp5, FIX(0.860918669)); /* c7 */
2785	tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.725788011)) /* c1+c11-c7 */
2786	- MULTIPLY(tmp5, FIX(1.121971054)); /* c5 */
2787	tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.306562965)) /* c3 */
2788	- MULTIPLY(tmp2 + tmp5, FIX_0_541196100); /* c9 */
2789
2790	dataptr[1] = (DCTELEM) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
2791	dataptr[3] = (DCTELEM) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
2792	dataptr[5] = (DCTELEM) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
2793	dataptr[7] = (DCTELEM) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
2794
2795	dataptr += DCTSIZE; /* advance pointer to next row */
2796	}
2797
2798	/* Pass 2: process columns.
2799	* We remove the PASS1_BITS scaling, but leave the results scaled up
2800	* by an overall factor of 8.
2801	* We must also scale the output by (8/12)*(8/6) = 8/9, which we
2802	* partially fold into the constant multipliers and final shifting:
2803	* 6-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/12) 16/9.
2804	*/
2805
2806	dataptr = data;
2807	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2808	/* Even part */
2809
2810	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE5];
2811	tmp11 = dataptr[DCTSIZE1] + dataptr[DCTSIZE4];
2812	tmp2 = dataptr[DCTSIZE2] + dataptr[DCTSIZE3];
2813
2814	tmp10 = tmp0 + tmp2;
2815	tmp12 = tmp0 - tmp2;
2816
2817	tmp0 = dataptr[DCTSIZE0] - dataptr[DCTSIZE5];
2818	tmp1 = dataptr[DCTSIZE1] - dataptr[DCTSIZE4];
2819	tmp2 = dataptr[DCTSIZE2] - dataptr[DCTSIZE3];
2820
2821	dataptr[DCTSIZE*0] = (DCTELEM)
2822	DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */
2823	CONST_BITS+PASS1_BITS+1);
2824	dataptr[DCTSIZE*2] = (DCTELEM)
2825	DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */
2826	CONST_BITS+PASS1_BITS+1);
2827	dataptr[DCTSIZE*4] = (DCTELEM)
2828	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
2829	CONST_BITS+PASS1_BITS+1);
2830
2831	/* Odd part */
2832
2833	tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */
2834
2835	dataptr[DCTSIZE*1] = (DCTELEM)
2836	DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
2837	CONST_BITS+PASS1_BITS+1);
2838	dataptr[DCTSIZE*3] = (DCTELEM)
2839	DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */
2840	CONST_BITS+PASS1_BITS+1);
2841	dataptr[DCTSIZE*5] = (DCTELEM)
2842	DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */
2843	CONST_BITS+PASS1_BITS+1);
2844
2845	dataptr++; /* advance pointer to next column */
2846	}
2847	}
2848
2849
2850	/*
2851	* Perform the forward DCT on a 10x5 sample block.
2852	*
2853	* 10-point FDCT in pass 1 (rows), 5-point in pass 2 (columns).
2854	*/
2855
2856	GLOBAL(void)
2857	jpeg_fdct_10x5 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2858	{
2859	INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
2860	INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
2861	DCTELEM *dataptr;
2862	JSAMPROW elemptr;
2863	int ctr;
2864	SHIFT_TEMPS
2865
2866	/* Zero 3 bottom rows of output coefficient block. */
2867	MEMZERO(&data[DCTSIZE5], SIZEOF(DCTELEM) DCTSIZE * 3);
2868
2869	/* Pass 1: process rows. */
2870	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
2871	/* furthermore, we scale the results by 2*PASS1_BITS. /
2872	/* 10-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/20). /
2873
2874	dataptr = data;
2875	for (ctr = 0; ctr < 5; ctr++) {
2876	elemptr = sample_data[ctr] + start_col;
2877
2878	/* Even part */
2879
2880	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[9]);
2881	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[8]);
2882	tmp12 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[7]);
2883	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[6]);
2884	tmp4 = GETJSAMPLE(elemptr[4]) + GETJSAMPLE(elemptr[5]);
2885
2886	tmp10 = tmp0 + tmp4;
2887	tmp13 = tmp0 - tmp4;
2888	tmp11 = tmp1 + tmp3;
2889	tmp14 = tmp1 - tmp3;
2890
2891	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[9]);
2892	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[8]);
2893	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[7]);
2894	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[6]);
2895	tmp4 = GETJSAMPLE(elemptr[4]) - GETJSAMPLE(elemptr[5]);
2896
2897	/* Apply unsigned->signed conversion */
2898	dataptr[0] = (DCTELEM)
2899	((tmp10 + tmp11 + tmp12 - 10 * CENTERJSAMPLE) << PASS1_BITS);
2900	tmp12 += tmp12;
2901	dataptr[4] = (DCTELEM)
2902	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.144122806)) - /* c4 */
2903	MULTIPLY(tmp11 - tmp12, FIX(0.437016024)), /* c8 */
2904	CONST_BITS-PASS1_BITS);
2905	tmp10 = MULTIPLY(tmp13 + tmp14, FIX(0.831253876)); /* c6 */
2906	dataptr[2] = (DCTELEM)
2907	DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.513743148)), /* c2-c6 */
2908	CONST_BITS-PASS1_BITS);
2909	dataptr[6] = (DCTELEM)
2910	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.176250899)), /* c2+c6 */
2911	CONST_BITS-PASS1_BITS);
2912
2913	/* Odd part */
2914
2915	tmp10 = tmp0 + tmp4;
2916	tmp11 = tmp1 - tmp3;
2917	dataptr[5] = (DCTELEM) ((tmp10 - tmp11 - tmp2) << PASS1_BITS);
2918	tmp2 <<= CONST_BITS;
2919	dataptr[1] = (DCTELEM)
2920	DESCALE(MULTIPLY(tmp0, FIX(1.396802247)) + /* c1 */
2921	MULTIPLY(tmp1, FIX(1.260073511)) + tmp2 + /* c3 */
2922	MULTIPLY(tmp3, FIX(0.642039522)) + /* c7 */
2923	MULTIPLY(tmp4, FIX(0.221231742)), /* c9 */
2924	CONST_BITS-PASS1_BITS);
2925	tmp12 = MULTIPLY(tmp0 - tmp4, FIX(0.951056516)) - /* (c3+c7)/2 */
2926	MULTIPLY(tmp1 + tmp3, FIX(0.587785252)); /* (c1-c9)/2 */
2927	tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.309016994)) + /* (c3-c7)/2 */
2928	(tmp11 << (CONST_BITS - 1)) - tmp2;
2929	dataptr[3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS-PASS1_BITS);
2930	dataptr[7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS-PASS1_BITS);
2931
2932	dataptr += DCTSIZE; /* advance pointer to next row */
2933	}
2934
2935	/* Pass 2: process columns.
2936	* We remove the PASS1_BITS scaling, but leave the results scaled up
2937	* by an overall factor of 8.
2938	* We must also scale the output by (8/10)*(8/5) = 32/25, which we
2939	* fold into the constant multipliers:
2940	* 5-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/10) 32/25.
2941	*/
2942
2943	dataptr = data;
2944	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
2945	/* Even part */
2946
2947	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE4];
2948	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE3];
2949	tmp2 = dataptr[DCTSIZE*2];
2950
2951	tmp10 = tmp0 + tmp1;
2952	tmp11 = tmp0 - tmp1;
2953
2954	tmp0 = dataptr[DCTSIZE0] - dataptr[DCTSIZE4];
2955	tmp1 = dataptr[DCTSIZE1] - dataptr[DCTSIZE3];
2956
2957	dataptr[DCTSIZE*0] = (DCTELEM)
2958	DESCALE(MULTIPLY(tmp10 + tmp2, FIX(1.28)), /* 32/25 */
2959	CONST_BITS+PASS1_BITS);
2960	tmp11 = MULTIPLY(tmp11, FIX(1.011928851)); /* (c2+c4)/2 */
2961	tmp10 -= tmp2 << 2;
2962	tmp10 = MULTIPLY(tmp10, FIX(0.452548340)); /* (c2-c4)/2 */
2963	dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS+PASS1_BITS);
2964	dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS+PASS1_BITS);
2965
2966	/* Odd part */
2967
2968	tmp10 = MULTIPLY(tmp0 + tmp1, FIX(1.064004961)); /* c3 */
2969
2970	dataptr[DCTSIZE*1] = (DCTELEM)
2971	DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.657591230)), /* c1-c3 */
2972	CONST_BITS+PASS1_BITS);
2973	dataptr[DCTSIZE*3] = (DCTELEM)
2974	DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.785601151)), /* c1+c3 */
2975	CONST_BITS+PASS1_BITS);
2976
2977	dataptr++; /* advance pointer to next column */
2978	}
2979	}
2980
2981
2982	/*
2983	* Perform the forward DCT on an 8x4 sample block.
2984	*
2985	* 8-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
2986	*/
2987
2988	GLOBAL(void)
2989	jpeg_fdct_8x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
2990	{
2991	INT32 tmp0, tmp1, tmp2, tmp3;
2992	INT32 tmp10, tmp11, tmp12, tmp13;
2993	INT32 z1;
2994	DCTELEM *dataptr;
2995	JSAMPROW elemptr;
2996	int ctr;
2997	SHIFT_TEMPS
2998
2999	/* Zero 4 bottom rows of output coefficient block. */
3000	MEMZERO(&data[DCTSIZE4], SIZEOF(DCTELEM) DCTSIZE * 4);
3001
3002	/* Pass 1: process rows. */
3003	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
3004	/* furthermore, we scale the results by 2*PASS1_BITS. /
3005	/* We must also scale the output by 8/4 = 2, which we add here. */
3006
3007	dataptr = data;
3008	for (ctr = 0; ctr < 4; ctr++) {
3009	elemptr = sample_data[ctr] + start_col;
3010
3011	/* Even part per LL&M figure 1 --- note that published figure is faulty;
3012	* rotator "sqrt(2)c1" should be "sqrt(2)c6".
3013	*/
3014
3015	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
3016	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
3017	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
3018	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
3019
3020	tmp10 = tmp0 + tmp3;
3021	tmp12 = tmp0 - tmp3;
3022	tmp11 = tmp1 + tmp2;
3023	tmp13 = tmp1 - tmp2;
3024
3025	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
3026	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
3027	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
3028	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
3029
3030	/* Apply unsigned->signed conversion */
3031	dataptr[0] = (DCTELEM)
3032	((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << (PASS1_BITS+1));
3033	dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << (PASS1_BITS+1));
3034
3035	z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
3036	/* Add fudge factor here for final descale. */
3037	z1 += ONE << (CONST_BITS-PASS1_BITS-2);
3038	dataptr[2] = (DCTELEM) RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865),
3039	CONST_BITS-PASS1_BITS-1);
3040	dataptr[6] = (DCTELEM) RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065),
3041	CONST_BITS-PASS1_BITS-1);
3042
3043	/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
3044	* 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
3045	* i0..i3 in the paper are tmp0..tmp3 here.
3046	*/
3047
3048	tmp10 = tmp0 + tmp3;
3049	tmp11 = tmp1 + tmp2;
3050	tmp12 = tmp0 + tmp2;
3051	tmp13 = tmp1 + tmp3;
3052	z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
3053	/* Add fudge factor here for final descale. */
3054	z1 += ONE << (CONST_BITS-PASS1_BITS-2);
3055
3056	tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
3057	tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
3058	tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
3059	tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
3060	tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
3061	tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
3062	tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
3063	tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
3064
3065	tmp12 += z1;
3066	tmp13 += z1;
3067
3068	dataptr[1] = (DCTELEM)
3069	RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS-1);
3070	dataptr[3] = (DCTELEM)
3071	RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS-1);
3072	dataptr[5] = (DCTELEM)
3073	RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS-1);
3074	dataptr[7] = (DCTELEM)
3075	RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS-1);
3076
3077	dataptr += DCTSIZE; /* advance pointer to next row */
3078	}
3079
3080	/* Pass 2: process columns.
3081	* We remove the PASS1_BITS scaling, but leave the results scaled up
3082	* by an overall factor of 8.
3083	* 4-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
3084	*/
3085
3086	dataptr = data;
3087	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
3088	/* Even part */
3089
3090	/* Add fudge factor here for final descale. */
3091	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE3] + (ONE << (PASS1_BITS-1));
3092	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE2];
3093
3094	tmp10 = dataptr[DCTSIZE0] - dataptr[DCTSIZE3];
3095	tmp11 = dataptr[DCTSIZE1] - dataptr[DCTSIZE2];
3096
3097	dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
3098	dataptr[DCTSIZE*2] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
3099
3100	/* Odd part */
3101
3102	tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
3103	/* Add fudge factor here for final descale. */
3104	tmp0 += ONE << (CONST_BITS+PASS1_BITS-1);
3105
3106	dataptr[DCTSIZE*1] = (DCTELEM)
3107	RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
3108	CONST_BITS+PASS1_BITS);
3109	dataptr[DCTSIZE*3] = (DCTELEM)
3110	RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
3111	CONST_BITS+PASS1_BITS);
3112
3113	dataptr++; /* advance pointer to next column */
3114	}
3115	}
3116
3117
3118	/*
3119	* Perform the forward DCT on a 6x3 sample block.
3120	*
3121	* 6-point FDCT in pass 1 (rows), 3-point in pass 2 (columns).
3122	*/
3123
3124	GLOBAL(void)
3125	jpeg_fdct_6x3 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3126	{
3127	INT32 tmp0, tmp1, tmp2;
3128	INT32 tmp10, tmp11, tmp12;
3129	DCTELEM *dataptr;
3130	JSAMPROW elemptr;
3131	int ctr;
3132	SHIFT_TEMPS
3133
3134	/* Pre-zero output coefficient block. */
3135	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3136
3137	/* Pass 1: process rows. */
3138	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
3139	/* furthermore, we scale the results by 2*PASS1_BITS. /
3140	/* We scale the results further by 2 as part of output adaption */
3141	/* scaling for different DCT size. */
3142	/* 6-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/12). /
3143
3144	dataptr = data;
3145	for (ctr = 0; ctr < 3; ctr++) {
3146	elemptr = sample_data[ctr] + start_col;
3147
3148	/* Even part */
3149
3150	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
3151	tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
3152	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
3153
3154	tmp10 = tmp0 + tmp2;
3155	tmp12 = tmp0 - tmp2;
3156
3157	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
3158	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
3159	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
3160
3161	/* Apply unsigned->signed conversion */
3162	dataptr[0] = (DCTELEM)
3163	((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << (PASS1_BITS+1));
3164	dataptr[2] = (DCTELEM)
3165	DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */
3166	CONST_BITS-PASS1_BITS-1);
3167	dataptr[4] = (DCTELEM)
3168	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
3169	CONST_BITS-PASS1_BITS-1);
3170
3171	/* Odd part */
3172
3173	tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */
3174	CONST_BITS-PASS1_BITS-1);
3175
3176	dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << (PASS1_BITS+1)));
3177	dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << (PASS1_BITS+1));
3178	dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << (PASS1_BITS+1)));
3179
3180	dataptr += DCTSIZE; /* advance pointer to next row */
3181	}
3182
3183	/* Pass 2: process columns.
3184	* We remove the PASS1_BITS scaling, but leave the results scaled up
3185	* by an overall factor of 8.
3186	* We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
3187	* fold into the constant multipliers (other part was done in pass 1):
3188	* 3-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/6) 16/9.
3189	*/
3190
3191	dataptr = data;
3192	for (ctr = 0; ctr < 6; ctr++) {
3193	/* Even part */
3194
3195	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE2];
3196	tmp1 = dataptr[DCTSIZE*1];
3197
3198	tmp2 = dataptr[DCTSIZE0] - dataptr[DCTSIZE2];
3199
3200	dataptr[DCTSIZE*0] = (DCTELEM)
3201	DESCALE(MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
3202	CONST_BITS+PASS1_BITS);
3203	dataptr[DCTSIZE*2] = (DCTELEM)
3204	DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(1.257078722)), /* c2 */
3205	CONST_BITS+PASS1_BITS);
3206
3207	/* Odd part */
3208
3209	dataptr[DCTSIZE*1] = (DCTELEM)
3210	DESCALE(MULTIPLY(tmp2, FIX(2.177324216)), /* c1 */
3211	CONST_BITS+PASS1_BITS);
3212
3213	dataptr++; /* advance pointer to next column */
3214	}
3215	}
3216
3217
3218	/*
3219	* Perform the forward DCT on a 4x2 sample block.
3220	*
3221	* 4-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
3222	*/
3223
3224	GLOBAL(void)
3225	jpeg_fdct_4x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3226	{
3227	INT32 tmp0, tmp1;
3228	INT32 tmp10, tmp11;
3229	DCTELEM *dataptr;
3230	JSAMPROW elemptr;
3231	int ctr;
3232	SHIFT_TEMPS
3233
3234	/* Pre-zero output coefficient block. */
3235	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3236
3237	/* Pass 1: process rows. */
3238	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
3239	/* furthermore, we scale the results by 2*PASS1_BITS. /
3240	/* We must also scale the output by (8/4)(8/2) = 23, which we add here. /
3241	/* 4-point FDCT kernel, */
3242	/* cK represents sqrt(2) * cos(Kpi/16) [refers to 8-point FDCT]. /
3243
3244	dataptr = data;
3245	for (ctr = 0; ctr < 2; ctr++) {
3246	elemptr = sample_data[ctr] + start_col;
3247
3248	/* Even part */
3249
3250	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
3251	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
3252
3253	tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
3254	tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
3255
3256	/* Apply unsigned->signed conversion */
3257	dataptr[0] = (DCTELEM)
3258	((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+3));
3259	dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+3));
3260
3261	/* Odd part */
3262
3263	tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
3264	/* Add fudge factor here for final descale. */
3265	tmp0 += ONE << (CONST_BITS-PASS1_BITS-4);
3266
3267	dataptr[1] = (DCTELEM)
3268	RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
3269	CONST_BITS-PASS1_BITS-3);
3270	dataptr[3] = (DCTELEM)
3271	RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
3272	CONST_BITS-PASS1_BITS-3);
3273
3274	dataptr += DCTSIZE; /* advance pointer to next row */
3275	}
3276
3277	/* Pass 2: process columns.
3278	* We remove the PASS1_BITS scaling, but leave the results scaled up
3279	* by an overall factor of 8.
3280	*/
3281
3282	dataptr = data;
3283	for (ctr = 0; ctr < 4; ctr++) {
3284	/* Even part */
3285
3286	/* Add fudge factor here for final descale. */
3287	tmp0 = dataptr[DCTSIZE*0] + (ONE << (PASS1_BITS-1));
3288	tmp1 = dataptr[DCTSIZE*1];
3289
3290	dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp0 + tmp1, PASS1_BITS);
3291
3292	/* Odd part */
3293
3294	dataptr[DCTSIZE*1] = (DCTELEM) RIGHT_SHIFT(tmp0 - tmp1, PASS1_BITS);
3295
3296	dataptr++; /* advance pointer to next column */
3297	}
3298	}
3299
3300
3301	/*
3302	* Perform the forward DCT on a 2x1 sample block.
3303	*
3304	* 2-point FDCT in pass 1 (rows), 1-point in pass 2 (columns).
3305	*/
3306
3307	GLOBAL(void)
3308	jpeg_fdct_2x1 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3309	{
3310	INT32 tmp0, tmp1;
3311	JSAMPROW elemptr;
3312
3313	/* Pre-zero output coefficient block. */
3314	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3315
3316	elemptr = sample_data[0] + start_col;
3317
3318	tmp0 = GETJSAMPLE(elemptr[0]);
3319	tmp1 = GETJSAMPLE(elemptr[1]);
3320
3321	/* We leave the results scaled up by an overall factor of 8.
3322	* We must also scale the output by (8/2)(8/1) = 2*5.
3323	*/
3324
3325	/* Even part */
3326	/* Apply unsigned->signed conversion */
3327	data[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 5);
3328
3329	/* Odd part */
3330	data[1] = (DCTELEM) ((tmp0 - tmp1) << 5);
3331	}
3332
3333
3334	/*
3335	* Perform the forward DCT on an 8x16 sample block.
3336	*
3337	* 8-point FDCT in pass 1 (rows), 16-point in pass 2 (columns).
3338	*/
3339
3340	GLOBAL(void)
3341	jpeg_fdct_8x16 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3342	{
3343	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3344	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;
3345	INT32 z1;
3346	DCTELEM workspace[DCTSIZE2];
3347	DCTELEM *dataptr;
3348	DCTELEM *wsptr;
3349	JSAMPROW elemptr;
3350	int ctr;
3351	SHIFT_TEMPS
3352
3353	/* Pass 1: process rows. */
3354	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
3355	/* furthermore, we scale the results by 2**PASS1_BITS. */
3356
3357	dataptr = data;
3358	ctr = 0;
3359	for (;;) {
3360	elemptr = sample_data[ctr] + start_col;
3361
3362	/* Even part per LL&M figure 1 --- note that published figure is faulty;
3363	* rotator "sqrt(2)c1" should be "sqrt(2)c6".
3364	*/
3365
3366	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[7]);
3367	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[6]);
3368	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[5]);
3369	tmp3 = GETJSAMPLE(elemptr[3]) + GETJSAMPLE(elemptr[4]);
3370
3371	tmp10 = tmp0 + tmp3;
3372	tmp12 = tmp0 - tmp3;
3373	tmp11 = tmp1 + tmp2;
3374	tmp13 = tmp1 - tmp2;
3375
3376	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[7]);
3377	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[6]);
3378	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[5]);
3379	tmp3 = GETJSAMPLE(elemptr[3]) - GETJSAMPLE(elemptr[4]);
3380
3381	/* Apply unsigned->signed conversion */
3382	dataptr[0] = (DCTELEM) ((tmp10 + tmp11 - 8 * CENTERJSAMPLE) << PASS1_BITS);
3383	dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
3384
3385	z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
3386	dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, FIX_0_765366865),
3387	CONST_BITS-PASS1_BITS);
3388	dataptr[6] = (DCTELEM) DESCALE(z1 - MULTIPLY(tmp13, FIX_1_847759065),
3389	CONST_BITS-PASS1_BITS);
3390
3391	/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
3392	* 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
3393	* i0..i3 in the paper are tmp0..tmp3 here.
3394	*/
3395
3396	tmp10 = tmp0 + tmp3;
3397	tmp11 = tmp1 + tmp2;
3398	tmp12 = tmp0 + tmp2;
3399	tmp13 = tmp1 + tmp3;
3400	z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
3401
3402	tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
3403	tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
3404	tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
3405	tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
3406	tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
3407	tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
3408	tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
3409	tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
3410
3411	tmp12 += z1;
3412	tmp13 += z1;
3413
3414	dataptr[1] = (DCTELEM) DESCALE(tmp0 + tmp10 + tmp12, CONST_BITS-PASS1_BITS);
3415	dataptr[3] = (DCTELEM) DESCALE(tmp1 + tmp11 + tmp13, CONST_BITS-PASS1_BITS);
3416	dataptr[5] = (DCTELEM) DESCALE(tmp2 + tmp11 + tmp12, CONST_BITS-PASS1_BITS);
3417	dataptr[7] = (DCTELEM) DESCALE(tmp3 + tmp10 + tmp13, CONST_BITS-PASS1_BITS);
3418
3419	ctr++;
3420
3421	if (ctr != DCTSIZE) {
3422	if (ctr == DCTSIZE * 2)
3423	break; /* Done. */
3424	dataptr += DCTSIZE; /* advance pointer to next row */
3425	} else
3426	dataptr = workspace; /* switch pointer to extended workspace */
3427	}
3428
3429	/* Pass 2: process columns.
3430	* We remove the PASS1_BITS scaling, but leave the results scaled up
3431	* by an overall factor of 8.
3432	* We must also scale the output by 8/16 = 1/2.
3433	* 16-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/32).
3434	*/
3435
3436	dataptr = data;
3437	wsptr = workspace;
3438	for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
3439	/* Even part */
3440
3441	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE7];
3442	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE6];
3443	tmp2 = dataptr[DCTSIZE2] + wsptr[DCTSIZE5];
3444	tmp3 = dataptr[DCTSIZE3] + wsptr[DCTSIZE4];
3445	tmp4 = dataptr[DCTSIZE4] + wsptr[DCTSIZE3];
3446	tmp5 = dataptr[DCTSIZE5] + wsptr[DCTSIZE2];
3447	tmp6 = dataptr[DCTSIZE6] + wsptr[DCTSIZE1];
3448	tmp7 = dataptr[DCTSIZE7] + wsptr[DCTSIZE0];
3449
3450	tmp10 = tmp0 + tmp7;
3451	tmp14 = tmp0 - tmp7;
3452	tmp11 = tmp1 + tmp6;
3453	tmp15 = tmp1 - tmp6;
3454	tmp12 = tmp2 + tmp5;
3455	tmp16 = tmp2 - tmp5;
3456	tmp13 = tmp3 + tmp4;
3457	tmp17 = tmp3 - tmp4;
3458
3459	tmp0 = dataptr[DCTSIZE0] - wsptr[DCTSIZE7];
3460	tmp1 = dataptr[DCTSIZE1] - wsptr[DCTSIZE6];
3461	tmp2 = dataptr[DCTSIZE2] - wsptr[DCTSIZE5];
3462	tmp3 = dataptr[DCTSIZE3] - wsptr[DCTSIZE4];
3463	tmp4 = dataptr[DCTSIZE4] - wsptr[DCTSIZE3];
3464	tmp5 = dataptr[DCTSIZE5] - wsptr[DCTSIZE2];
3465	tmp6 = dataptr[DCTSIZE6] - wsptr[DCTSIZE1];
3466	tmp7 = dataptr[DCTSIZE7] - wsptr[DCTSIZE0];
3467
3468	dataptr[DCTSIZE*0] = (DCTELEM)
3469	DESCALE(tmp10 + tmp11 + tmp12 + tmp13, PASS1_BITS+1);
3470	dataptr[DCTSIZE*4] = (DCTELEM)
3471	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(1.306562965)) + /* c4[16] = c2[8] */
3472	MULTIPLY(tmp11 - tmp12, FIX_0_541196100), /* c12[16] = c6[8] */
3473	CONST_BITS+PASS1_BITS+1);
3474
3475	tmp10 = MULTIPLY(tmp17 - tmp15, FIX(0.275899379)) + /* c14[16] = c7[8] */
3476	MULTIPLY(tmp14 - tmp16, FIX(1.387039845)); /* c2[16] = c1[8] */
3477
3478	dataptr[DCTSIZE*2] = (DCTELEM)
3479	DESCALE(tmp10 + MULTIPLY(tmp15, FIX(1.451774982)) /* c6+c14 */
3480	+ MULTIPLY(tmp16, FIX(2.172734804)), /* c2+c10 */
3481	CONST_BITS+PASS1_BITS+1);
3482	dataptr[DCTSIZE*6] = (DCTELEM)
3483	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(0.211164243)) /* c2-c6 */
3484	- MULTIPLY(tmp17, FIX(1.061594338)), /* c10+c14 */
3485	CONST_BITS+PASS1_BITS+1);
3486
3487	/* Odd part */
3488
3489	tmp11 = MULTIPLY(tmp0 + tmp1, FIX(1.353318001)) + /* c3 */
3490	MULTIPLY(tmp6 - tmp7, FIX(0.410524528)); /* c13 */
3491	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(1.247225013)) + /* c5 */
3492	MULTIPLY(tmp5 + tmp7, FIX(0.666655658)); /* c11 */
3493	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(1.093201867)) + /* c7 */
3494	MULTIPLY(tmp4 - tmp7, FIX(0.897167586)); /* c9 */
3495	tmp14 = MULTIPLY(tmp1 + tmp2, FIX(0.138617169)) + /* c15 */
3496	MULTIPLY(tmp6 - tmp5, FIX(1.407403738)); /* c1 */
3497	tmp15 = MULTIPLY(tmp1 + tmp3, - FIX(0.666655658)) + /* -c11 */
3498	MULTIPLY(tmp4 + tmp6, - FIX(1.247225013)); /* -c5 */
3499	tmp16 = MULTIPLY(tmp2 + tmp3, - FIX(1.353318001)) + /* -c3 */
3500	MULTIPLY(tmp5 - tmp4, FIX(0.410524528)); /* c13 */
3501	tmp10 = tmp11 + tmp12 + tmp13 -
3502	MULTIPLY(tmp0, FIX(2.286341144)) + /* c7+c5+c3-c1 */
3503	MULTIPLY(tmp7, FIX(0.779653625)); /* c15+c13-c11+c9 */
3504	tmp11 += tmp14 + tmp15 + MULTIPLY(tmp1, FIX(0.071888074)) /* c9-c3-c15+c11 */
3505	- MULTIPLY(tmp6, FIX(1.663905119)); /* c7+c13+c1-c5 */
3506	tmp12 += tmp14 + tmp16 - MULTIPLY(tmp2, FIX(1.125726048)) /* c7+c5+c15-c3 */
3507	+ MULTIPLY(tmp5, FIX(1.227391138)); /* c9-c11+c1-c13 */
3508	tmp13 += tmp15 + tmp16 + MULTIPLY(tmp3, FIX(1.065388962)) /* c15+c3+c11-c7 */
3509	+ MULTIPLY(tmp4, FIX(2.167985692)); /* c1+c13+c5-c9 */
3510
3511	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS+1);
3512	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS+1);
3513	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS+1);
3514	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS+1);
3515
3516	dataptr++; /* advance pointer to next column */
3517	wsptr++; /* advance pointer to next column */
3518	}
3519	}
3520
3521
3522	/*
3523	* Perform the forward DCT on a 7x14 sample block.
3524	*
3525	* 7-point FDCT in pass 1 (rows), 14-point in pass 2 (columns).
3526	*/
3527
3528	GLOBAL(void)
3529	jpeg_fdct_7x14 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3530	{
3531	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
3532	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
3533	INT32 z1, z2, z3;
3534	DCTELEM workspace[8*6];
3535	DCTELEM *dataptr;
3536	DCTELEM *wsptr;
3537	JSAMPROW elemptr;
3538	int ctr;
3539	SHIFT_TEMPS
3540
3541	/* Pre-zero output coefficient block. */
3542	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3543
3544	/* Pass 1: process rows. */
3545	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
3546	/* furthermore, we scale the results by 2*PASS1_BITS. /
3547	/* 7-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/14). /
3548
3549	dataptr = data;
3550	ctr = 0;
3551	for (;;) {
3552	elemptr = sample_data[ctr] + start_col;
3553
3554	/* Even part */
3555
3556	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[6]);
3557	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[5]);
3558	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[4]);
3559	tmp3 = GETJSAMPLE(elemptr[3]);
3560
3561	tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[6]);
3562	tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[5]);
3563	tmp12 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[4]);
3564
3565	z1 = tmp0 + tmp2;
3566	/* Apply unsigned->signed conversion */
3567	dataptr[0] = (DCTELEM)
3568	((z1 + tmp1 + tmp3 - 7 * CENTERJSAMPLE) << PASS1_BITS);
3569	tmp3 += tmp3;
3570	z1 -= tmp3;
3571	z1 -= tmp3;
3572	z1 = MULTIPLY(z1, FIX(0.353553391)); /* (c2+c6-c4)/2 */
3573	z2 = MULTIPLY(tmp0 - tmp2, FIX(0.920609002)); /* (c2+c4-c6)/2 */
3574	z3 = MULTIPLY(tmp1 - tmp2, FIX(0.314692123)); /* c6 */
3575	dataptr[2] = (DCTELEM) DESCALE(z1 + z2 + z3, CONST_BITS-PASS1_BITS);
3576	z1 -= z2;
3577	z2 = MULTIPLY(tmp0 - tmp1, FIX(0.881747734)); /* c4 */
3578	dataptr[4] = (DCTELEM)
3579	DESCALE(z2 + z3 - MULTIPLY(tmp1 - tmp3, FIX(0.707106781)), /* c2+c6-c4 */
3580	CONST_BITS-PASS1_BITS);
3581	dataptr[6] = (DCTELEM) DESCALE(z1 + z2, CONST_BITS-PASS1_BITS);
3582
3583	/* Odd part */
3584
3585	tmp1 = MULTIPLY(tmp10 + tmp11, FIX(0.935414347)); /* (c3+c1-c5)/2 */
3586	tmp2 = MULTIPLY(tmp10 - tmp11, FIX(0.170262339)); /* (c3+c5-c1)/2 */
3587	tmp0 = tmp1 - tmp2;
3588	tmp1 += tmp2;
3589	tmp2 = MULTIPLY(tmp11 + tmp12, - FIX(1.378756276)); /* -c1 */
3590	tmp1 += tmp2;
3591	tmp3 = MULTIPLY(tmp10 + tmp12, FIX(0.613604268)); /* c5 */
3592	tmp0 += tmp3;
3593	tmp2 += tmp3 + MULTIPLY(tmp12, FIX(1.870828693)); /* c3+c1-c5 */
3594
3595	dataptr[1] = (DCTELEM) DESCALE(tmp0, CONST_BITS-PASS1_BITS);
3596	dataptr[3] = (DCTELEM) DESCALE(tmp1, CONST_BITS-PASS1_BITS);
3597	dataptr[5] = (DCTELEM) DESCALE(tmp2, CONST_BITS-PASS1_BITS);
3598
3599	ctr++;
3600
3601	if (ctr != DCTSIZE) {
3602	if (ctr == 14)
3603	break; /* Done. */
3604	dataptr += DCTSIZE; /* advance pointer to next row */
3605	} else
3606	dataptr = workspace; /* switch pointer to extended workspace */
3607	}
3608
3609	/* Pass 2: process columns.
3610	* We remove the PASS1_BITS scaling, but leave the results scaled up
3611	* by an overall factor of 8.
3612	* We must also scale the output by (8/7)*(8/14) = 32/49, which we
3613	* fold into the constant multipliers:
3614	* 14-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/28) 32/49.
3615	*/
3616
3617	dataptr = data;
3618	wsptr = workspace;
3619	for (ctr = 0; ctr < 7; ctr++) {
3620	/* Even part */
3621
3622	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE5];
3623	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE4];
3624	tmp2 = dataptr[DCTSIZE2] + wsptr[DCTSIZE3];
3625	tmp13 = dataptr[DCTSIZE3] + wsptr[DCTSIZE2];
3626	tmp4 = dataptr[DCTSIZE4] + wsptr[DCTSIZE1];
3627	tmp5 = dataptr[DCTSIZE5] + wsptr[DCTSIZE0];
3628	tmp6 = dataptr[DCTSIZE6] + dataptr[DCTSIZE7];
3629
3630	tmp10 = tmp0 + tmp6;
3631	tmp14 = tmp0 - tmp6;
3632	tmp11 = tmp1 + tmp5;
3633	tmp15 = tmp1 - tmp5;
3634	tmp12 = tmp2 + tmp4;
3635	tmp16 = tmp2 - tmp4;
3636
3637	tmp0 = dataptr[DCTSIZE0] - wsptr[DCTSIZE5];
3638	tmp1 = dataptr[DCTSIZE1] - wsptr[DCTSIZE4];
3639	tmp2 = dataptr[DCTSIZE2] - wsptr[DCTSIZE3];
3640	tmp3 = dataptr[DCTSIZE3] - wsptr[DCTSIZE2];
3641	tmp4 = dataptr[DCTSIZE4] - wsptr[DCTSIZE1];
3642	tmp5 = dataptr[DCTSIZE5] - wsptr[DCTSIZE0];
3643	tmp6 = dataptr[DCTSIZE6] - dataptr[DCTSIZE7];
3644
3645	dataptr[DCTSIZE*0] = (DCTELEM)
3646	DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12 + tmp13,
3647	FIX(0.653061224)), /* 32/49 */
3648	CONST_BITS+PASS1_BITS);
3649	tmp13 += tmp13;
3650	dataptr[DCTSIZE*4] = (DCTELEM)
3651	DESCALE(MULTIPLY(tmp10 - tmp13, FIX(0.832106052)) + /* c4 */
3652	MULTIPLY(tmp11 - tmp13, FIX(0.205513223)) - /* c12 */
3653	MULTIPLY(tmp12 - tmp13, FIX(0.575835255)), /* c8 */
3654	CONST_BITS+PASS1_BITS);
3655
3656	tmp10 = MULTIPLY(tmp14 + tmp15, FIX(0.722074570)); /* c6 */
3657
3658	dataptr[DCTSIZE*2] = (DCTELEM)
3659	DESCALE(tmp10 + MULTIPLY(tmp14, FIX(0.178337691)) /* c2-c6 */
3660	+ MULTIPLY(tmp16, FIX(0.400721155)), /* c10 */
3661	CONST_BITS+PASS1_BITS);
3662	dataptr[DCTSIZE*6] = (DCTELEM)
3663	DESCALE(tmp10 - MULTIPLY(tmp15, FIX(1.122795725)) /* c6+c10 */
3664	- MULTIPLY(tmp16, FIX(0.900412262)), /* c2 */
3665	CONST_BITS+PASS1_BITS);
3666
3667	/* Odd part */
3668
3669	tmp10 = tmp1 + tmp2;
3670	tmp11 = tmp5 - tmp4;
3671	dataptr[DCTSIZE*7] = (DCTELEM)
3672	DESCALE(MULTIPLY(tmp0 - tmp10 + tmp3 - tmp11 - tmp6,
3673	FIX(0.653061224)), /* 32/49 */
3674	CONST_BITS+PASS1_BITS);
3675	tmp3 = MULTIPLY(tmp3 , FIX(0.653061224)); /* 32/49 */
3676	tmp10 = MULTIPLY(tmp10, - FIX(0.103406812)); /* -c13 */
3677	tmp11 = MULTIPLY(tmp11, FIX(0.917760839)); /* c1 */
3678	tmp10 += tmp11 - tmp3;
3679	tmp11 = MULTIPLY(tmp0 + tmp2, FIX(0.782007410)) + /* c5 */
3680	MULTIPLY(tmp4 + tmp6, FIX(0.491367823)); /* c9 */
3681	dataptr[DCTSIZE*5] = (DCTELEM)
3682	DESCALE(tmp10 + tmp11 - MULTIPLY(tmp2, FIX(1.550341076)) /* c3+c5-c13 */
3683	+ MULTIPLY(tmp4, FIX(0.731428202)), /* c1+c11-c9 */
3684	CONST_BITS+PASS1_BITS);
3685	tmp12 = MULTIPLY(tmp0 + tmp1, FIX(0.871740478)) + /* c3 */
3686	MULTIPLY(tmp5 - tmp6, FIX(0.305035186)); /* c11 */
3687	dataptr[DCTSIZE*3] = (DCTELEM)
3688	DESCALE(tmp10 + tmp12 - MULTIPLY(tmp1, FIX(0.276965844)) /* c3-c9-c13 */
3689	- MULTIPLY(tmp5, FIX(2.004803435)), /* c1+c5+c11 */
3690	CONST_BITS+PASS1_BITS);
3691	dataptr[DCTSIZE*1] = (DCTELEM)
3692	DESCALE(tmp11 + tmp12 + tmp3
3693	- MULTIPLY(tmp0, FIX(0.735987049)) /* c3+c5-c1 */
3694	- MULTIPLY(tmp6, FIX(0.082925825)), /* c9-c11-c13 */
3695	CONST_BITS+PASS1_BITS);
3696
3697	dataptr++; /* advance pointer to next column */
3698	wsptr++; /* advance pointer to next column */
3699	}
3700	}
3701
3702
3703	/*
3704	* Perform the forward DCT on a 6x12 sample block.
3705	*
3706	* 6-point FDCT in pass 1 (rows), 12-point in pass 2 (columns).
3707	*/
3708
3709	GLOBAL(void)
3710	jpeg_fdct_6x12 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3711	{
3712	INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3713	INT32 tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
3714	DCTELEM workspace[8*4];
3715	DCTELEM *dataptr;
3716	DCTELEM *wsptr;
3717	JSAMPROW elemptr;
3718	int ctr;
3719	SHIFT_TEMPS
3720
3721	/* Pre-zero output coefficient block. */
3722	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3723
3724	/* Pass 1: process rows. */
3725	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
3726	/* furthermore, we scale the results by 2*PASS1_BITS. /
3727	/* 6-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/12). /
3728
3729	dataptr = data;
3730	ctr = 0;
3731	for (;;) {
3732	elemptr = sample_data[ctr] + start_col;
3733
3734	/* Even part */
3735
3736	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[5]);
3737	tmp11 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[4]);
3738	tmp2 = GETJSAMPLE(elemptr[2]) + GETJSAMPLE(elemptr[3]);
3739
3740	tmp10 = tmp0 + tmp2;
3741	tmp12 = tmp0 - tmp2;
3742
3743	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[5]);
3744	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[4]);
3745	tmp2 = GETJSAMPLE(elemptr[2]) - GETJSAMPLE(elemptr[3]);
3746
3747	/* Apply unsigned->signed conversion */
3748	dataptr[0] = (DCTELEM)
3749	((tmp10 + tmp11 - 6 * CENTERJSAMPLE) << PASS1_BITS);
3750	dataptr[2] = (DCTELEM)
3751	DESCALE(MULTIPLY(tmp12, FIX(1.224744871)), /* c2 */
3752	CONST_BITS-PASS1_BITS);
3753	dataptr[4] = (DCTELEM)
3754	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(0.707106781)), /* c4 */
3755	CONST_BITS-PASS1_BITS);
3756
3757	/* Odd part */
3758
3759	tmp10 = DESCALE(MULTIPLY(tmp0 + tmp2, FIX(0.366025404)), /* c5 */
3760	CONST_BITS-PASS1_BITS);
3761
3762	dataptr[1] = (DCTELEM) (tmp10 + ((tmp0 + tmp1) << PASS1_BITS));
3763	dataptr[3] = (DCTELEM) ((tmp0 - tmp1 - tmp2) << PASS1_BITS);
3764	dataptr[5] = (DCTELEM) (tmp10 + ((tmp2 - tmp1) << PASS1_BITS));
3765
3766	ctr++;
3767
3768	if (ctr != DCTSIZE) {
3769	if (ctr == 12)
3770	break; /* Done. */
3771	dataptr += DCTSIZE; /* advance pointer to next row */
3772	} else
3773	dataptr = workspace; /* switch pointer to extended workspace */
3774	}
3775
3776	/* Pass 2: process columns.
3777	* We remove the PASS1_BITS scaling, but leave the results scaled up
3778	* by an overall factor of 8.
3779	* We must also scale the output by (8/6)*(8/12) = 8/9, which we
3780	* fold into the constant multipliers:
3781	* 12-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/24) 8/9.
3782	*/
3783
3784	dataptr = data;
3785	wsptr = workspace;
3786	for (ctr = 0; ctr < 6; ctr++) {
3787	/* Even part */
3788
3789	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE3];
3790	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE2];
3791	tmp2 = dataptr[DCTSIZE2] + wsptr[DCTSIZE1];
3792	tmp3 = dataptr[DCTSIZE3] + wsptr[DCTSIZE0];
3793	tmp4 = dataptr[DCTSIZE4] + dataptr[DCTSIZE7];
3794	tmp5 = dataptr[DCTSIZE5] + dataptr[DCTSIZE6];
3795
3796	tmp10 = tmp0 + tmp5;
3797	tmp13 = tmp0 - tmp5;
3798	tmp11 = tmp1 + tmp4;
3799	tmp14 = tmp1 - tmp4;
3800	tmp12 = tmp2 + tmp3;
3801	tmp15 = tmp2 - tmp3;
3802
3803	tmp0 = dataptr[DCTSIZE0] - wsptr[DCTSIZE3];
3804	tmp1 = dataptr[DCTSIZE1] - wsptr[DCTSIZE2];
3805	tmp2 = dataptr[DCTSIZE2] - wsptr[DCTSIZE1];
3806	tmp3 = dataptr[DCTSIZE3] - wsptr[DCTSIZE0];
3807	tmp4 = dataptr[DCTSIZE4] - dataptr[DCTSIZE7];
3808	tmp5 = dataptr[DCTSIZE5] - dataptr[DCTSIZE6];
3809
3810	dataptr[DCTSIZE*0] = (DCTELEM)
3811	DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(0.888888889)), /* 8/9 */
3812	CONST_BITS+PASS1_BITS);
3813	dataptr[DCTSIZE*6] = (DCTELEM)
3814	DESCALE(MULTIPLY(tmp13 - tmp14 - tmp15, FIX(0.888888889)), /* 8/9 */
3815	CONST_BITS+PASS1_BITS);
3816	dataptr[DCTSIZE*4] = (DCTELEM)
3817	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.088662108)), /* c4 */
3818	CONST_BITS+PASS1_BITS);
3819	dataptr[DCTSIZE*2] = (DCTELEM)
3820	DESCALE(MULTIPLY(tmp14 - tmp15, FIX(0.888888889)) + /* 8/9 */
3821	MULTIPLY(tmp13 + tmp15, FIX(1.214244803)), /* c2 */
3822	CONST_BITS+PASS1_BITS);
3823
3824	/* Odd part */
3825
3826	tmp10 = MULTIPLY(tmp1 + tmp4, FIX(0.481063200)); /* c9 */
3827	tmp14 = tmp10 + MULTIPLY(tmp1, FIX(0.680326102)); /* c3-c9 */
3828	tmp15 = tmp10 - MULTIPLY(tmp4, FIX(1.642452502)); /* c3+c9 */
3829	tmp12 = MULTIPLY(tmp0 + tmp2, FIX(0.997307603)); /* c5 */
3830	tmp13 = MULTIPLY(tmp0 + tmp3, FIX(0.765261039)); /* c7 */
3831	tmp10 = tmp12 + tmp13 + tmp14 - MULTIPLY(tmp0, FIX(0.516244403)) /* c5+c7-c1 */
3832	+ MULTIPLY(tmp5, FIX(0.164081699)); /* c11 */
3833	tmp11 = MULTIPLY(tmp2 + tmp3, - FIX(0.164081699)); /* -c11 */
3834	tmp12 += tmp11 - tmp15 - MULTIPLY(tmp2, FIX(2.079550144)) /* c1+c5-c11 */
3835	+ MULTIPLY(tmp5, FIX(0.765261039)); /* c7 */
3836	tmp13 += tmp11 - tmp14 + MULTIPLY(tmp3, FIX(0.645144899)) /* c1+c11-c7 */
3837	- MULTIPLY(tmp5, FIX(0.997307603)); /* c5 */
3838	tmp11 = tmp15 + MULTIPLY(tmp0 - tmp3, FIX(1.161389302)) /* c3 */
3839	- MULTIPLY(tmp2 + tmp5, FIX(0.481063200)); /* c9 */
3840
3841	dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10, CONST_BITS+PASS1_BITS);
3842	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp11, CONST_BITS+PASS1_BITS);
3843	dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp12, CONST_BITS+PASS1_BITS);
3844	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp13, CONST_BITS+PASS1_BITS);
3845
3846	dataptr++; /* advance pointer to next column */
3847	wsptr++; /* advance pointer to next column */
3848	}
3849	}
3850
3851
3852	/*
3853	* Perform the forward DCT on a 5x10 sample block.
3854	*
3855	* 5-point FDCT in pass 1 (rows), 10-point in pass 2 (columns).
3856	*/
3857
3858	GLOBAL(void)
3859	jpeg_fdct_5x10 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
3860	{
3861	INT32 tmp0, tmp1, tmp2, tmp3, tmp4;
3862	INT32 tmp10, tmp11, tmp12, tmp13, tmp14;
3863	DCTELEM workspace[8*2];
3864	DCTELEM *dataptr;
3865	DCTELEM *wsptr;
3866	JSAMPROW elemptr;
3867	int ctr;
3868	SHIFT_TEMPS
3869
3870	/* Pre-zero output coefficient block. */
3871	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
3872
3873	/* Pass 1: process rows. */
3874	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
3875	/* furthermore, we scale the results by 2*PASS1_BITS. /
3876	/* 5-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/10). /
3877
3878	dataptr = data;
3879	ctr = 0;
3880	for (;;) {
3881	elemptr = sample_data[ctr] + start_col;
3882
3883	/* Even part */
3884
3885	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[4]);
3886	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[3]);
3887	tmp2 = GETJSAMPLE(elemptr[2]);
3888
3889	tmp10 = tmp0 + tmp1;
3890	tmp11 = tmp0 - tmp1;
3891
3892	tmp0 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[4]);
3893	tmp1 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[3]);
3894
3895	/* Apply unsigned->signed conversion */
3896	dataptr[0] = (DCTELEM)
3897	((tmp10 + tmp2 - 5 * CENTERJSAMPLE) << PASS1_BITS);
3898	tmp11 = MULTIPLY(tmp11, FIX(0.790569415)); /* (c2+c4)/2 */
3899	tmp10 -= tmp2 << 2;
3900	tmp10 = MULTIPLY(tmp10, FIX(0.353553391)); /* (c2-c4)/2 */
3901	dataptr[2] = (DCTELEM) DESCALE(tmp11 + tmp10, CONST_BITS-PASS1_BITS);
3902	dataptr[4] = (DCTELEM) DESCALE(tmp11 - tmp10, CONST_BITS-PASS1_BITS);
3903
3904	/* Odd part */
3905
3906	tmp10 = MULTIPLY(tmp0 + tmp1, FIX(0.831253876)); /* c3 */
3907
3908	dataptr[1] = (DCTELEM)
3909	DESCALE(tmp10 + MULTIPLY(tmp0, FIX(0.513743148)), /* c1-c3 */
3910	CONST_BITS-PASS1_BITS);
3911	dataptr[3] = (DCTELEM)
3912	DESCALE(tmp10 - MULTIPLY(tmp1, FIX(2.176250899)), /* c1+c3 */
3913	CONST_BITS-PASS1_BITS);
3914
3915	ctr++;
3916
3917	if (ctr != DCTSIZE) {
3918	if (ctr == 10)
3919	break; /* Done. */
3920	dataptr += DCTSIZE; /* advance pointer to next row */
3921	} else
3922	dataptr = workspace; /* switch pointer to extended workspace */
3923	}
3924
3925	/* Pass 2: process columns.
3926	* We remove the PASS1_BITS scaling, but leave the results scaled up
3927	* by an overall factor of 8.
3928	* We must also scale the output by (8/5)*(8/10) = 32/25, which we
3929	* fold into the constant multipliers:
3930	* 10-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/20) 32/25.
3931	*/
3932
3933	dataptr = data;
3934	wsptr = workspace;
3935	for (ctr = 0; ctr < 5; ctr++) {
3936	/* Even part */
3937
3938	tmp0 = dataptr[DCTSIZE0] + wsptr[DCTSIZE1];
3939	tmp1 = dataptr[DCTSIZE1] + wsptr[DCTSIZE0];
3940	tmp12 = dataptr[DCTSIZE2] + dataptr[DCTSIZE7];
3941	tmp3 = dataptr[DCTSIZE3] + dataptr[DCTSIZE6];
3942	tmp4 = dataptr[DCTSIZE4] + dataptr[DCTSIZE5];
3943
3944	tmp10 = tmp0 + tmp4;
3945	tmp13 = tmp0 - tmp4;
3946	tmp11 = tmp1 + tmp3;
3947	tmp14 = tmp1 - tmp3;
3948
3949	tmp0 = dataptr[DCTSIZE0] - wsptr[DCTSIZE1];
3950	tmp1 = dataptr[DCTSIZE1] - wsptr[DCTSIZE0];
3951	tmp2 = dataptr[DCTSIZE2] - dataptr[DCTSIZE7];
3952	tmp3 = dataptr[DCTSIZE3] - dataptr[DCTSIZE6];
3953	tmp4 = dataptr[DCTSIZE4] - dataptr[DCTSIZE5];
3954
3955	dataptr[DCTSIZE*0] = (DCTELEM)
3956	DESCALE(MULTIPLY(tmp10 + tmp11 + tmp12, FIX(1.28)), /* 32/25 */
3957	CONST_BITS+PASS1_BITS);
3958	tmp12 += tmp12;
3959	dataptr[DCTSIZE*4] = (DCTELEM)
3960	DESCALE(MULTIPLY(tmp10 - tmp12, FIX(1.464477191)) - /* c4 */
3961	MULTIPLY(tmp11 - tmp12, FIX(0.559380511)), /* c8 */
3962	CONST_BITS+PASS1_BITS);
3963	tmp10 = MULTIPLY(tmp13 + tmp14, FIX(1.064004961)); /* c6 */
3964	dataptr[DCTSIZE*2] = (DCTELEM)
3965	DESCALE(tmp10 + MULTIPLY(tmp13, FIX(0.657591230)), /* c2-c6 */
3966	CONST_BITS+PASS1_BITS);
3967	dataptr[DCTSIZE*6] = (DCTELEM)
3968	DESCALE(tmp10 - MULTIPLY(tmp14, FIX(2.785601151)), /* c2+c6 */
3969	CONST_BITS+PASS1_BITS);
3970
3971	/* Odd part */
3972
3973	tmp10 = tmp0 + tmp4;
3974	tmp11 = tmp1 - tmp3;
3975	dataptr[DCTSIZE*5] = (DCTELEM)
3976	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp2, FIX(1.28)), /* 32/25 */
3977	CONST_BITS+PASS1_BITS);
3978	tmp2 = MULTIPLY(tmp2, FIX(1.28)); /* 32/25 */
3979	dataptr[DCTSIZE*1] = (DCTELEM)
3980	DESCALE(MULTIPLY(tmp0, FIX(1.787906876)) + /* c1 */
3981	MULTIPLY(tmp1, FIX(1.612894094)) + tmp2 + /* c3 */
3982	MULTIPLY(tmp3, FIX(0.821810588)) + /* c7 */
3983	MULTIPLY(tmp4, FIX(0.283176630)), /* c9 */
3984	CONST_BITS+PASS1_BITS);
3985	tmp12 = MULTIPLY(tmp0 - tmp4, FIX(1.217352341)) - /* (c3+c7)/2 */
3986	MULTIPLY(tmp1 + tmp3, FIX(0.752365123)); /* (c1-c9)/2 */
3987	tmp13 = MULTIPLY(tmp10 + tmp11, FIX(0.395541753)) + /* (c3-c7)/2 */
3988	MULTIPLY(tmp11, FIX(0.64)) - tmp2; /* 16/25 */
3989	dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp12 + tmp13, CONST_BITS+PASS1_BITS);
3990	dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp12 - tmp13, CONST_BITS+PASS1_BITS);
3991
3992	dataptr++; /* advance pointer to next column */
3993	wsptr++; /* advance pointer to next column */
3994	}
3995	}
3996
3997
3998	/*
3999	* Perform the forward DCT on a 4x8 sample block.
4000	*
4001	* 4-point FDCT in pass 1 (rows), 8-point in pass 2 (columns).
4002	*/
4003
4004	GLOBAL(void)
4005	jpeg_fdct_4x8 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4006	{
4007	INT32 tmp0, tmp1, tmp2, tmp3;
4008	INT32 tmp10, tmp11, tmp12, tmp13;
4009	INT32 z1;
4010	DCTELEM *dataptr;
4011	JSAMPROW elemptr;
4012	int ctr;
4013	SHIFT_TEMPS
4014
4015	/* Pre-zero output coefficient block. */
4016	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4017
4018	/* Pass 1: process rows. */
4019	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
4020	/* furthermore, we scale the results by 2*PASS1_BITS. /
4021	/* We must also scale the output by 8/4 = 2, which we add here. */
4022	/* 4-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/16). /
4023
4024	dataptr = data;
4025	for (ctr = 0; ctr < DCTSIZE; ctr++) {
4026	elemptr = sample_data[ctr] + start_col;
4027
4028	/* Even part */
4029
4030	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[3]);
4031	tmp1 = GETJSAMPLE(elemptr[1]) + GETJSAMPLE(elemptr[2]);
4032
4033	tmp10 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[3]);
4034	tmp11 = GETJSAMPLE(elemptr[1]) - GETJSAMPLE(elemptr[2]);
4035
4036	/* Apply unsigned->signed conversion */
4037	dataptr[0] = (DCTELEM)
4038	((tmp0 + tmp1 - 4 * CENTERJSAMPLE) << (PASS1_BITS+1));
4039	dataptr[2] = (DCTELEM) ((tmp0 - tmp1) << (PASS1_BITS+1));
4040
4041	/* Odd part */
4042
4043	tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
4044	/* Add fudge factor here for final descale. */
4045	tmp0 += ONE << (CONST_BITS-PASS1_BITS-2);
4046
4047	dataptr[1] = (DCTELEM)
4048	RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
4049	CONST_BITS-PASS1_BITS-1);
4050	dataptr[3] = (DCTELEM)
4051	RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
4052	CONST_BITS-PASS1_BITS-1);
4053
4054	dataptr += DCTSIZE; /* advance pointer to next row */
4055	}
4056
4057	/* Pass 2: process columns.
4058	* We remove the PASS1_BITS scaling, but leave the results scaled up
4059	* by an overall factor of 8.
4060	*/
4061
4062	dataptr = data;
4063	for (ctr = 0; ctr < 4; ctr++) {
4064	/* Even part per LL&M figure 1 --- note that published figure is faulty;
4065	* rotator "sqrt(2)c1" should be "sqrt(2)c6".
4066	*/
4067
4068	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE7];
4069	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE6];
4070	tmp2 = dataptr[DCTSIZE2] + dataptr[DCTSIZE5];
4071	tmp3 = dataptr[DCTSIZE3] + dataptr[DCTSIZE4];
4072
4073	/* Add fudge factor here for final descale. */
4074	tmp10 = tmp0 + tmp3 + (ONE << (PASS1_BITS-1));
4075	tmp12 = tmp0 - tmp3;
4076	tmp11 = tmp1 + tmp2;
4077	tmp13 = tmp1 - tmp2;
4078
4079	tmp0 = dataptr[DCTSIZE0] - dataptr[DCTSIZE7];
4080	tmp1 = dataptr[DCTSIZE1] - dataptr[DCTSIZE6];
4081	tmp2 = dataptr[DCTSIZE2] - dataptr[DCTSIZE5];
4082	tmp3 = dataptr[DCTSIZE3] - dataptr[DCTSIZE4];
4083
4084	dataptr[DCTSIZE*0] = (DCTELEM) RIGHT_SHIFT(tmp10 + tmp11, PASS1_BITS);
4085	dataptr[DCTSIZE*4] = (DCTELEM) RIGHT_SHIFT(tmp10 - tmp11, PASS1_BITS);
4086
4087	z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
4088	/* Add fudge factor here for final descale. */
4089	z1 += ONE << (CONST_BITS+PASS1_BITS-1);
4090	dataptr[DCTSIZE*2] = (DCTELEM)
4091	RIGHT_SHIFT(z1 + MULTIPLY(tmp12, FIX_0_765366865), CONST_BITS+PASS1_BITS);
4092	dataptr[DCTSIZE*6] = (DCTELEM)
4093	RIGHT_SHIFT(z1 - MULTIPLY(tmp13, FIX_1_847759065), CONST_BITS+PASS1_BITS);
4094
4095	/* Odd part per figure 8 --- note paper omits factor of sqrt(2).
4096	* 8-point FDCT kernel, cK represents sqrt(2) * cos(K*pi/16).
4097	* i0..i3 in the paper are tmp0..tmp3 here.
4098	*/
4099
4100	tmp10 = tmp0 + tmp3;
4101	tmp11 = tmp1 + tmp2;
4102	tmp12 = tmp0 + tmp2;
4103	tmp13 = tmp1 + tmp3;
4104	z1 = MULTIPLY(tmp12 + tmp13, FIX_1_175875602); /* c3 */
4105	/* Add fudge factor here for final descale. */
4106	z1 += ONE << (CONST_BITS+PASS1_BITS-1);
4107
4108	tmp0 = MULTIPLY(tmp0, FIX_1_501321110); /* c1+c3-c5-c7 */
4109	tmp1 = MULTIPLY(tmp1, FIX_3_072711026); /* c1+c3+c5-c7 */
4110	tmp2 = MULTIPLY(tmp2, FIX_2_053119869); /* c1+c3-c5+c7 */
4111	tmp3 = MULTIPLY(tmp3, FIX_0_298631336); /* -c1+c3+c5-c7 */
4112	tmp10 = MULTIPLY(tmp10, - FIX_0_899976223); /* c7-c3 */
4113	tmp11 = MULTIPLY(tmp11, - FIX_2_562915447); /* -c1-c3 */
4114	tmp12 = MULTIPLY(tmp12, - FIX_0_390180644); /* c5-c3 */
4115	tmp13 = MULTIPLY(tmp13, - FIX_1_961570560); /* -c3-c5 */
4116
4117	tmp12 += z1;
4118	tmp13 += z1;
4119
4120	dataptr[DCTSIZE*1] = (DCTELEM)
4121	RIGHT_SHIFT(tmp0 + tmp10 + tmp12, CONST_BITS+PASS1_BITS);
4122	dataptr[DCTSIZE*3] = (DCTELEM)
4123	RIGHT_SHIFT(tmp1 + tmp11 + tmp13, CONST_BITS+PASS1_BITS);
4124	dataptr[DCTSIZE*5] = (DCTELEM)
4125	RIGHT_SHIFT(tmp2 + tmp11 + tmp12, CONST_BITS+PASS1_BITS);
4126	dataptr[DCTSIZE*7] = (DCTELEM)
4127	RIGHT_SHIFT(tmp3 + tmp10 + tmp13, CONST_BITS+PASS1_BITS);
4128
4129	dataptr++; /* advance pointer to next column */
4130	}
4131	}
4132
4133
4134	/*
4135	* Perform the forward DCT on a 3x6 sample block.
4136	*
4137	* 3-point FDCT in pass 1 (rows), 6-point in pass 2 (columns).
4138	*/
4139
4140	GLOBAL(void)
4141	jpeg_fdct_3x6 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4142	{
4143	INT32 tmp0, tmp1, tmp2;
4144	INT32 tmp10, tmp11, tmp12;
4145	DCTELEM *dataptr;
4146	JSAMPROW elemptr;
4147	int ctr;
4148	SHIFT_TEMPS
4149
4150	/* Pre-zero output coefficient block. */
4151	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4152
4153	/* Pass 1: process rows. */
4154	/* Note results are scaled up by sqrt(8) compared to a true DCT; */
4155	/* furthermore, we scale the results by 2*PASS1_BITS. /
4156	/* We scale the results further by 2 as part of output adaption */
4157	/* scaling for different DCT size. */
4158	/* 3-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/6). /
4159
4160	dataptr = data;
4161	for (ctr = 0; ctr < 6; ctr++) {
4162	elemptr = sample_data[ctr] + start_col;
4163
4164	/* Even part */
4165
4166	tmp0 = GETJSAMPLE(elemptr[0]) + GETJSAMPLE(elemptr[2]);
4167	tmp1 = GETJSAMPLE(elemptr[1]);
4168
4169	tmp2 = GETJSAMPLE(elemptr[0]) - GETJSAMPLE(elemptr[2]);
4170
4171	/* Apply unsigned->signed conversion */
4172	dataptr[0] = (DCTELEM)
4173	((tmp0 + tmp1 - 3 * CENTERJSAMPLE) << (PASS1_BITS+1));
4174	dataptr[2] = (DCTELEM)
4175	DESCALE(MULTIPLY(tmp0 - tmp1 - tmp1, FIX(0.707106781)), /* c2 */
4176	CONST_BITS-PASS1_BITS-1);
4177
4178	/* Odd part */
4179
4180	dataptr[1] = (DCTELEM)
4181	DESCALE(MULTIPLY(tmp2, FIX(1.224744871)), /* c1 */
4182	CONST_BITS-PASS1_BITS-1);
4183
4184	dataptr += DCTSIZE; /* advance pointer to next row */
4185	}
4186
4187	/* Pass 2: process columns.
4188	* We remove the PASS1_BITS scaling, but leave the results scaled up
4189	* by an overall factor of 8.
4190	* We must also scale the output by (8/6)*(8/3) = 32/9, which we partially
4191	* fold into the constant multipliers (other part was done in pass 1):
4192	* 6-point FDCT kernel, cK represents sqrt(2) * cos(Kpi/12) 16/9.
4193	*/
4194
4195	dataptr = data;
4196	for (ctr = 0; ctr < 3; ctr++) {
4197	/* Even part */
4198
4199	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE5];
4200	tmp11 = dataptr[DCTSIZE1] + dataptr[DCTSIZE4];
4201	tmp2 = dataptr[DCTSIZE2] + dataptr[DCTSIZE3];
4202
4203	tmp10 = tmp0 + tmp2;
4204	tmp12 = tmp0 - tmp2;
4205
4206	tmp0 = dataptr[DCTSIZE0] - dataptr[DCTSIZE5];
4207	tmp1 = dataptr[DCTSIZE1] - dataptr[DCTSIZE4];
4208	tmp2 = dataptr[DCTSIZE2] - dataptr[DCTSIZE3];
4209
4210	dataptr[DCTSIZE*0] = (DCTELEM)
4211	DESCALE(MULTIPLY(tmp10 + tmp11, FIX(1.777777778)), /* 16/9 */
4212	CONST_BITS+PASS1_BITS);
4213	dataptr[DCTSIZE*2] = (DCTELEM)
4214	DESCALE(MULTIPLY(tmp12, FIX(2.177324216)), /* c2 */
4215	CONST_BITS+PASS1_BITS);
4216	dataptr[DCTSIZE*4] = (DCTELEM)
4217	DESCALE(MULTIPLY(tmp10 - tmp11 - tmp11, FIX(1.257078722)), /* c4 */
4218	CONST_BITS+PASS1_BITS);
4219
4220	/* Odd part */
4221
4222	tmp10 = MULTIPLY(tmp0 + tmp2, FIX(0.650711829)); /* c5 */
4223
4224	dataptr[DCTSIZE*1] = (DCTELEM)
4225	DESCALE(tmp10 + MULTIPLY(tmp0 + tmp1, FIX(1.777777778)), /* 16/9 */
4226	CONST_BITS+PASS1_BITS);
4227	dataptr[DCTSIZE*3] = (DCTELEM)
4228	DESCALE(MULTIPLY(tmp0 - tmp1 - tmp2, FIX(1.777777778)), /* 16/9 */
4229	CONST_BITS+PASS1_BITS);
4230	dataptr[DCTSIZE*5] = (DCTELEM)
4231	DESCALE(tmp10 + MULTIPLY(tmp2 - tmp1, FIX(1.777777778)), /* 16/9 */
4232	CONST_BITS+PASS1_BITS);
4233
4234	dataptr++; /* advance pointer to next column */
4235	}
4236	}
4237
4238
4239	/*
4240	* Perform the forward DCT on a 2x4 sample block.
4241	*
4242	* 2-point FDCT in pass 1 (rows), 4-point in pass 2 (columns).
4243	*/
4244
4245	GLOBAL(void)
4246	jpeg_fdct_2x4 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4247	{
4248	INT32 tmp0, tmp1;
4249	INT32 tmp10, tmp11;
4250	DCTELEM *dataptr;
4251	JSAMPROW elemptr;
4252	int ctr;
4253	SHIFT_TEMPS
4254
4255	/* Pre-zero output coefficient block. */
4256	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4257
4258	/* Pass 1: process rows. */
4259	/* Note results are scaled up by sqrt(8) compared to a true DCT. */
4260	/* We must also scale the output by (8/2)(8/4) = 23, which we add here. /
4261
4262	dataptr = data;
4263	for (ctr = 0; ctr < 4; ctr++) {
4264	elemptr = sample_data[ctr] + start_col;
4265
4266	/* Even part */
4267
4268	tmp0 = GETJSAMPLE(elemptr[0]);
4269	tmp1 = GETJSAMPLE(elemptr[1]);
4270
4271	/* Apply unsigned->signed conversion */
4272	dataptr[0] = (DCTELEM) ((tmp0 + tmp1 - 2 * CENTERJSAMPLE) << 3);
4273
4274	/* Odd part */
4275
4276	dataptr[1] = (DCTELEM) ((tmp0 - tmp1) << 3);
4277
4278	dataptr += DCTSIZE; /* advance pointer to next row */
4279	}
4280
4281	/* Pass 2: process columns.
4282	* We leave the results scaled up by an overall factor of 8.
4283	* 4-point FDCT kernel,
4284	* cK represents sqrt(2) * cos(K*pi/16) [refers to 8-point FDCT].
4285	*/
4286
4287	dataptr = data;
4288	for (ctr = 0; ctr < 2; ctr++) {
4289	/* Even part */
4290
4291	tmp0 = dataptr[DCTSIZE0] + dataptr[DCTSIZE3];
4292	tmp1 = dataptr[DCTSIZE1] + dataptr[DCTSIZE2];
4293
4294	tmp10 = dataptr[DCTSIZE0] - dataptr[DCTSIZE3];
4295	tmp11 = dataptr[DCTSIZE1] - dataptr[DCTSIZE2];
4296
4297	dataptr[DCTSIZE*0] = (DCTELEM) (tmp0 + tmp1);
4298	dataptr[DCTSIZE*2] = (DCTELEM) (tmp0 - tmp1);
4299
4300	/* Odd part */
4301
4302	tmp0 = MULTIPLY(tmp10 + tmp11, FIX_0_541196100); /* c6 */
4303	/* Add fudge factor here for final descale. */
4304	tmp0 += ONE << (CONST_BITS-1);
4305
4306	dataptr[DCTSIZE*1] = (DCTELEM)
4307	RIGHT_SHIFT(tmp0 + MULTIPLY(tmp10, FIX_0_765366865), /* c2-c6 */
4308	CONST_BITS);
4309	dataptr[DCTSIZE*3] = (DCTELEM)
4310	RIGHT_SHIFT(tmp0 - MULTIPLY(tmp11, FIX_1_847759065), /* c2+c6 */
4311	CONST_BITS);
4312
4313	dataptr++; /* advance pointer to next column */
4314	}
4315	}
4316
4317
4318	/*
4319	* Perform the forward DCT on a 1x2 sample block.
4320	*
4321	* 1-point FDCT in pass 1 (rows), 2-point in pass 2 (columns).
4322	*/
4323
4324	GLOBAL(void)
4325	jpeg_fdct_1x2 (DCTELEM * data, JSAMPARRAY sample_data, JDIMENSION start_col)
4326	{
4327	INT32 tmp0, tmp1;
4328
4329	/* Pre-zero output coefficient block. */
4330	MEMZERO(data, SIZEOF(DCTELEM) * DCTSIZE2);
4331
4332	tmp0 = GETJSAMPLE(sample_data[0][start_col]);
4333	tmp1 = GETJSAMPLE(sample_data[1][start_col]);
4334
4335	/* We leave the results scaled up by an overall factor of 8.
4336	* We must also scale the output by (8/1)(8/2) = 2*5.
4337	*/
4338
4339	/* Even part */
4340	/* Apply unsigned->signed conversion */
4341	data[DCTSIZE0] = (DCTELEM) ((tmp0 + tmp1 - 2 CENTERJSAMPLE) << 5);
4342
4343	/* Odd part */
4344	data[DCTSIZE*1] = (DCTELEM) ((tmp0 - tmp1) << 5);
4345	}
4346
4347	#endif /* DCT_SCALING_SUPPORTED */
4348	#endif /* DCT_ISLOW_SUPPORTED */

Note: See TracBrowser for help on using the repository browser.

Download in other formats: