source: rtems-tools/linkers/ConvertUTF.c @ ea29902

4.104.115
Last change on this file since ea29902 was ea29902, checked in by Chris Johns <chrisj@…>, on 08/01/14 at 06:44:32

Add initial support for the RTEMS Trace Linker.

The RTEMS Trace Linker or rtems-rld creates an RTEMS executable with
trace support built in without any changes to the existing code.

This commit is an initial starting point with function signatures
being read from INI files.

  • Property mode set to 100644
File size: 18.6 KB
Line 
1/*
2 * Copyright 2001-2004 Unicode, Inc.
3 *
4 * Disclaimer
5 *
6 * This source code is provided as is by Unicode, Inc. No claims are
7 * made as to fitness for any particular purpose. No warranties of any
8 * kind are expressed or implied. The recipient agrees to determine
9 * applicability of information provided. If this file has been
10 * purchased on magnetic or optical media from Unicode, Inc., the
11 * sole remedy for any claim will be exchange of defective media
12 * within 90 days of receipt.
13 *
14 * Limitations on Rights to Redistribute This Code
15 *
16 * Unicode, Inc. hereby grants the right to freely use the information
17 * supplied in this file in the creation of products supporting the
18 * Unicode Standard, and to make copies of this file in any form
19 * for internal or external distribution as long as this notice
20 * remains attached.
21 */
22
23/* ---------------------------------------------------------------------
24
25    Conversions between UTF32, UTF-16, and UTF-8. Source code file.
26    Author: Mark E. Davis, 1994.
27    Rev History: Rick McGowan, fixes & updates May 2001.
28    Sept 2001: fixed const & error conditions per
29        mods suggested by S. Parent & A. Lillich.
30    June 2002: Tim Dodd added detection and handling of incomplete
31        source sequences, enhanced error detection, added casts
32        to eliminate compiler warnings.
33    July 2003: slight mods to back out aggressive FFFE detection.
34    Jan 2004: updated switches in from-UTF8 conversions.
35    Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
36
37    See the header file "ConvertUTF.h" for complete documentation.
38
39------------------------------------------------------------------------ */
40
41
42#include "ConvertUTF.h"
43#ifdef CVTUTF_DEBUG
44#include <stdio.h>
45#endif
46
47static const int halfShift  = 10; /* used for shifting by 10 bits */
48
49static const UTF32 halfBase = 0x0010000UL;
50static const UTF32 halfMask = 0x3FFUL;
51
52#define UNI_SUR_HIGH_START  (UTF32)0xD800
53#define UNI_SUR_HIGH_END    (UTF32)0xDBFF
54#define UNI_SUR_LOW_START   (UTF32)0xDC00
55#define UNI_SUR_LOW_END     (UTF32)0xDFFF
56#define false      0
57#define true        1
58
59/* --------------------------------------------------------------------- */
60
61ConversionResult ConvertUTF32toUTF16 (
62        const UTF32** sourceStart, const UTF32* sourceEnd,
63        UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
64    ConversionResult result = conversionOK;
65    const UTF32* source = *sourceStart;
66    UTF16* target = *targetStart;
67    while (source < sourceEnd) {
68        UTF32 ch;
69        if (target >= targetEnd) {
70            result = targetExhausted; break;
71        }
72        ch = *source++;
73        if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
74            /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
75            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
76                if (flags == strictConversion) {
77                    --source; /* return to the illegal value itself */
78                    result = sourceIllegal;
79                    break;
80                } else {
81                    *target++ = UNI_REPLACEMENT_CHAR;
82                }
83            } else {
84                *target++ = (UTF16)ch; /* normal case */
85            }
86        } else if (ch > UNI_MAX_LEGAL_UTF32) {
87            if (flags == strictConversion) {
88                result = sourceIllegal;
89            } else {
90                *target++ = UNI_REPLACEMENT_CHAR;
91            }
92        } else {
93            /* target is a character in range 0xFFFF - 0x10FFFF. */
94            if (target + 1 >= targetEnd) {
95                --source; /* Back up source pointer! */
96                result = targetExhausted; break;
97            }
98            ch -= halfBase;
99            *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
100            *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
101        }
102    }
103    *sourceStart = source;
104    *targetStart = target;
105    return result;
106}
107
108/* --------------------------------------------------------------------- */
109
110ConversionResult ConvertUTF16toUTF32 (
111        const UTF16** sourceStart, const UTF16* sourceEnd,
112        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
113    ConversionResult result = conversionOK;
114    const UTF16* source = *sourceStart;
115    UTF32* target = *targetStart;
116    UTF32 ch, ch2;
117    while (source < sourceEnd) {
118        const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
119        ch = *source++;
120        /* If we have a surrogate pair, convert to UTF32 first. */
121        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
122            /* If the 16 bits following the high surrogate are in the source buffer... */
123            if (source < sourceEnd) {
124                ch2 = *source;
125                /* If it's a low surrogate, convert to UTF32. */
126                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
127                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
128                        + (ch2 - UNI_SUR_LOW_START) + halfBase;
129                    ++source;
130                } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
131                    --source; /* return to the illegal value itself */
132                    result = sourceIllegal;
133                    break;
134                }
135            } else { /* We don't have the 16 bits following the high surrogate. */
136                --source; /* return to the high surrogate */
137                result = sourceExhausted;
138                break;
139            }
140        } else if (flags == strictConversion) {
141            /* UTF-16 surrogate values are illegal in UTF-32 */
142            if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
143                --source; /* return to the illegal value itself */
144                result = sourceIllegal;
145                break;
146            }
147        }
148        if (target >= targetEnd) {
149            source = oldSource; /* Back up source pointer! */
150            result = targetExhausted; break;
151        }
152        *target++ = ch;
153    }
154    *sourceStart = source;
155    *targetStart = target;
156#ifdef CVTUTF_DEBUG
157if (result == sourceIllegal) {
158    fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
159    fflush(stderr);
160}
161#endif
162    return result;
163}
164
165/* --------------------------------------------------------------------- */
166
/*
 * Indexed by the first byte of a UTF-8 sequence; yields the number of
 * trailing bytes that are supposed to follow it.
 * Note that *legal* UTF-8 sequences never have 4 or 5 trailing bytes. Those
 * entries are kept for anyone who wants to handle the longer forms that
 * earlier algorithms allowed.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 - 0x0F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10 - 0x1F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 - 0x2F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30 - 0x3F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40 - 0x4F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50 - 0x5F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 - 0x6F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70 - 0x7F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 - 0x8F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90 - 0x9F */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 - 0xAF */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0 - 0xBF */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 - 0xCF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0 - 0xDF */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 - 0xEF */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 - 0xFF */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
184
185/*
186 * Magic values subtracted from a buffer value during UTF8 conversion.
187 * This table contains as many values as there might be trailing bytes
188 * in a UTF-8 sequence.
189 */
190static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
191                     0x03C82080UL, 0xFA082080UL, 0x82082080UL };
192
193/*
194 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
195 * into the first byte, depending on how many bytes follow.  There are
196 * as many entries in this table as there are UTF-8 sequence types.
197 * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
198 * for *legal* UTF-8 will be 4 or fewer bytes total.
199 */
200static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
201
202/* --------------------------------------------------------------------- */
203
204/* The interface converts a whole buffer to avoid function-call overhead.
205 * Constants have been gathered. Loops & conditionals have been removed as
206 * much as possible for efficiency, in favor of drop-through switches.
207 * (See "Note A" at the bottom of the file for equivalent code.)
208 * If your compiler supports it, the "isLegalUTF8" call can be turned
209 * into an inline function.
210 */
211
212/* --------------------------------------------------------------------- */
213
214ConversionResult ConvertUTF16toUTF8 (
215        const UTF16** sourceStart, const UTF16* sourceEnd,
216        UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
217    ConversionResult result = conversionOK;
218    const UTF16* source = *sourceStart;
219    UTF8* target = *targetStart;
220    while (source < sourceEnd) {
221        UTF32 ch;
222        unsigned short bytesToWrite = 0;
223        const UTF32 byteMask = 0xBF;
224        const UTF32 byteMark = 0x80;
225        const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
226        ch = *source++;
227        /* If we have a surrogate pair, convert to UTF32 first. */
228        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
229            /* If the 16 bits following the high surrogate are in the source buffer... */
230            if (source < sourceEnd) {
231                UTF32 ch2 = *source;
232                /* If it's a low surrogate, convert to UTF32. */
233                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
234                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
235                        + (ch2 - UNI_SUR_LOW_START) + halfBase;
236                    ++source;
237                } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
238                    --source; /* return to the illegal value itself */
239                    result = sourceIllegal;
240                    break;
241                }
242            } else { /* We don't have the 16 bits following the high surrogate. */
243                --source; /* return to the high surrogate */
244                result = sourceExhausted;
245                break;
246            }
247        } else if (flags == strictConversion) {
248            /* UTF-16 surrogate values are illegal in UTF-32 */
249            if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
250                --source; /* return to the illegal value itself */
251                result = sourceIllegal;
252                break;
253            }
254        }
255        /* Figure out how many bytes the result will require */
256        if (ch < (UTF32)0x80) {      bytesToWrite = 1;
257        } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
258        } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
259        } else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
260        } else {                            bytesToWrite = 3;
261                                            ch = UNI_REPLACEMENT_CHAR;
262        }
263
264        target += bytesToWrite;
265        if (target > targetEnd) {
266            source = oldSource; /* Back up source pointer! */
267            target -= bytesToWrite; result = targetExhausted; break;
268        }
269        switch (bytesToWrite) { /* note: everything falls through. */
270            case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
271            case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
272            case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
273            case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
274        }
275        target += bytesToWrite;
276    }
277    *sourceStart = source;
278    *targetStart = target;
279    return result;
280}
281
282/* --------------------------------------------------------------------- */
283
284/*
285 * Utility routine to tell whether a sequence of bytes is legal UTF-8.
286 * This must be called with the length pre-determined by the first byte.
287 * If not calling this from ConvertUTF8to*, then the length can be set by:
288 *  length = trailingBytesForUTF8[*source]+1;
289 * and the sequence is illegal right away if there aren't that many bytes
290 * available.
291 * If presented with a length > 4, this returns false.  The Unicode
292 * definition of UTF-8 goes up to 4-byte sequences.
293 */
294
295static Boolean isLegalUTF8(const UTF8 *source, int length) {
296    UTF8 a;
297    const UTF8 *srcptr = source+length;
298    switch (length) {
299    default: return false;
300        /* Everything else falls through when "true"... */
301    case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
302    case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
303    case 2: if ((a = (*--srcptr)) > 0xBF) return false;
304
305        switch (*source) {
306            /* no fall-through in this inner switch */
307            case 0xE0: if (a < 0xA0) return false; break;
308            case 0xED: if (a > 0x9F) return false; break;
309            case 0xF0: if (a < 0x90) return false; break;
310            case 0xF4: if (a > 0x8F) return false; break;
311            default:   if (a < 0x80) return false;
312        }
313
314    case 1: if (*source >= 0x80 && *source < 0xC2) return false;
315    }
316    if (*source > 0xF4) return false;
317    return true;
318}
319
320/* --------------------------------------------------------------------- */
321
322/*
323 * Exported function to return whether a UTF-8 sequence is legal or not.
324 * This is not used here; it's just exported.
325 */
326Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
327    int length = trailingBytesForUTF8[*source]+1;
328    if (source+length > sourceEnd) {
329        return false;
330    }
331    return isLegalUTF8(source, length);
332}
333
334/* --------------------------------------------------------------------- */
335
336ConversionResult ConvertUTF8toUTF16 (
337        const UTF8** sourceStart, const UTF8* sourceEnd,
338        UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
339    ConversionResult result = conversionOK;
340    const UTF8* source = *sourceStart;
341    UTF16* target = *targetStart;
342    while (source < sourceEnd) {
343        UTF32 ch = 0;
344        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
345        if (source + extraBytesToRead >= sourceEnd) {
346            result = sourceExhausted; break;
347        }
348        /* Do this check whether lenient or strict */
349        if (! isLegalUTF8(source, extraBytesToRead+1)) {
350            result = sourceIllegal;
351            break;
352        }
353        /*
354         * The cases all fall through. See "Note A" below.
355         */
356        switch (extraBytesToRead) {
357            case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
358            case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
359            case 3: ch += *source++; ch <<= 6;
360            case 2: ch += *source++; ch <<= 6;
361            case 1: ch += *source++; ch <<= 6;
362            case 0: ch += *source++;
363        }
364        ch -= offsetsFromUTF8[extraBytesToRead];
365
366        if (target >= targetEnd) {
367            source -= (extraBytesToRead+1); /* Back up source pointer! */
368            result = targetExhausted; break;
369        }
370        if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
371            /* UTF-16 surrogate values are illegal in UTF-32 */
372            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
373                if (flags == strictConversion) {
374                    source -= (extraBytesToRead+1); /* return to the illegal value itself */
375                    result = sourceIllegal;
376                    break;
377                } else {
378                    *target++ = UNI_REPLACEMENT_CHAR;
379                }
380            } else {
381                *target++ = (UTF16)ch; /* normal case */
382            }
383        } else if (ch > UNI_MAX_UTF16) {
384            if (flags == strictConversion) {
385                result = sourceIllegal;
386                source -= (extraBytesToRead+1); /* return to the start */
387                break; /* Bail out; shouldn't continue */
388            } else {
389                *target++ = UNI_REPLACEMENT_CHAR;
390            }
391        } else {
392            /* target is a character in range 0xFFFF - 0x10FFFF. */
393            if (target + 1 >= targetEnd) {
394                source -= (extraBytesToRead+1); /* Back up source pointer! */
395                result = targetExhausted; break;
396            }
397            ch -= halfBase;
398            *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
399            *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
400        }
401    }
402    *sourceStart = source;
403    *targetStart = target;
404    return result;
405}
406
407/* --------------------------------------------------------------------- */
408
409ConversionResult ConvertUTF32toUTF8 (
410        const UTF32** sourceStart, const UTF32* sourceEnd,
411        UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
412    ConversionResult result = conversionOK;
413    const UTF32* source = *sourceStart;
414    UTF8* target = *targetStart;
415    while (source < sourceEnd) {
416        UTF32 ch;
417        unsigned short bytesToWrite = 0;
418        const UTF32 byteMask = 0xBF;
419        const UTF32 byteMark = 0x80;
420        ch = *source++;
421        if (flags == strictConversion ) {
422            /* UTF-16 surrogate values are illegal in UTF-32 */
423            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
424                --source; /* return to the illegal value itself */
425                result = sourceIllegal;
426                break;
427            }
428        }
429        /*
430         * Figure out how many bytes the result will require. Turn any
431         * illegally large UTF32 things (> Plane 17) into replacement chars.
432         */
433        if (ch < (UTF32)0x80) {      bytesToWrite = 1;
434        } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
435        } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
436        } else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
437        } else {                            bytesToWrite = 3;
438                                            ch = UNI_REPLACEMENT_CHAR;
439                                            result = sourceIllegal;
440        }
441       
442        target += bytesToWrite;
443        if (target > targetEnd) {
444            --source; /* Back up source pointer! */
445            target -= bytesToWrite; result = targetExhausted; break;
446        }
447        switch (bytesToWrite) { /* note: everything falls through. */
448            case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
449            case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
450            case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
451            case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
452        }
453        target += bytesToWrite;
454    }
455    *sourceStart = source;
456    *targetStart = target;
457    return result;
458}
459
460/* --------------------------------------------------------------------- */
461
462ConversionResult ConvertUTF8toUTF32 (
463        const UTF8** sourceStart, const UTF8* sourceEnd,
464        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
465    ConversionResult result = conversionOK;
466    const UTF8* source = *sourceStart;
467    UTF32* target = *targetStart;
468    while (source < sourceEnd) {
469        UTF32 ch = 0;
470        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
471        if (source + extraBytesToRead >= sourceEnd) {
472            result = sourceExhausted; break;
473        }
474        /* Do this check whether lenient or strict */
475        if (! isLegalUTF8(source, extraBytesToRead+1)) {
476            result = sourceIllegal;
477            break;
478        }
479        /*
480         * The cases all fall through. See "Note A" below.
481         */
482        switch (extraBytesToRead) {
483            case 5: ch += *source++; ch <<= 6;
484            case 4: ch += *source++; ch <<= 6;
485            case 3: ch += *source++; ch <<= 6;
486            case 2: ch += *source++; ch <<= 6;
487            case 1: ch += *source++; ch <<= 6;
488            case 0: ch += *source++;
489        }
490        ch -= offsetsFromUTF8[extraBytesToRead];
491
492        if (target >= targetEnd) {
493            source -= (extraBytesToRead+1); /* Back up the source pointer! */
494            result = targetExhausted; break;
495        }
496        if (ch <= UNI_MAX_LEGAL_UTF32) {
497            /*
498             * UTF-16 surrogate values are illegal in UTF-32, and anything
499             * over Plane 17 (> 0x10FFFF) is illegal.
500             */
501            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
502                if (flags == strictConversion) {
503                    source -= (extraBytesToRead+1); /* return to the illegal value itself */
504                    result = sourceIllegal;
505                    break;
506                } else {
507                    *target++ = UNI_REPLACEMENT_CHAR;
508                }
509            } else {
510                *target++ = ch;
511            }
512        } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
513            result = sourceIllegal;
514            *target++ = UNI_REPLACEMENT_CHAR;
515        }
516    }
517    *sourceStart = source;
518    *targetStart = target;
519    return result;
520}
521
522/* ---------------------------------------------------------------------
523
524    Note A.
525    The fall-through switches in UTF-8 reading code save a
526    temp variable, some decrements & conditionals.  The switches
527    are equivalent to the following loop:
528        {
529            int tmpBytesToRead = extraBytesToRead+1;
530            do {
531                ch += *source++;
532                --tmpBytesToRead;
533                if (tmpBytesToRead) ch <<= 6;
534            } while (tmpBytesToRead > 0);
535        }
536    In UTF-8 writing code, the switches on "bytesToWrite" are
537    similarly unrolled loops.
538
539   --------------------------------------------------------------------- */
Note: See TracBrowser for help on using the repository browser.