source: rtems/cpukit/libmisc/utf8proc/utf8proc.c @ 0771936e

4.115
Last change on this file since 0771936e was 46b7f921, checked in by Ralf Kirchner <ralf.kirchner@…>, on 02/26/13 at 11:00:34

libmisc: Add utf8proc-v1.1.5

utf8proc is a small library for processing UTF-8 encoded Unicode strings.
Some features are Unicode normalization, stripping of default ignorable characters, case folding and detection of grapheme cluster boundaries.
For the time being, utf8proc is intended to be used for normalizing and folding UTF-8 strings
for comparison purposes when adding UTF-8 support to the FAT file system.

  • Property mode set to 100644
File size: 19.8 KB
Line 
/*
 *  Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
 *
 *  Permission is hereby granted, free of charge, to any person obtaining a
 *  copy of this software and associated documentation files (the "Software"),
 *  to deal in the Software without restriction, including without limitation
 *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
 *  and/or sell copies of the Software, and to permit persons to whom the
 *  Software is furnished to do so, subject to the following conditions:
 *
 *  The above copyright notice and this permission notice shall be included in
 *  all copies or substantial portions of the Software.
 *
 *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 *  DEALINGS IN THE SOFTWARE.
 */
22
/*
 *  This library contains derived data from a modified version of the
 *  Unicode data files.
 *
 *  The original data files are available at
 *  http://www.unicode.org/Public/UNIDATA/
 *
 *  Please note the copyright statement in the file "utf8proc_data.c".
 */
32
33
/*
 *  File name:    utf8proc.c
 *
 *  Description:
 *  Implementation of libutf8proc.
 */
40
41
#include "utf8proc.h"
#include "utf8proc_data.c"
44
45
/*
 *  Sequence length implied by the first byte of a UTF-8 sequence, indexed by
 *  that byte's value:
 *    0x00..0x7F -> 1
 *    0x80..0xBF -> 0  (cannot start a sequence)
 *    0xC0..0xDF -> 2
 *    0xE0..0xEF -> 3
 *    0xF0..0xF7 -> 4
 *    0xF8..0xFF -> 0  (cannot start a sequence)
 *  utf8proc_iterate() treats a class of 0 as invalid UTF-8.
 */
const int8_t utf8proc_utf8class[256] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xE0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* 0xF0 */ 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0
};
63
/*
 *  Constants for the arithmetic Hangul syllable (de)composition used in
 *  utf8proc_decompose_char() and utf8proc_reencode():
 *    xBASE  - code point from which the S/L/V/T index is counted
 *    xCOUNT - number of index values of that kind
 *  NCOUNT equals VCOUNT * TCOUNT and SCOUNT equals LCOUNT * NCOUNT.
 */
#define UTF8PROC_HANGUL_SBASE 0xAC00
#define UTF8PROC_HANGUL_LBASE 0x1100
#define UTF8PROC_HANGUL_VBASE 0x1161
#define UTF8PROC_HANGUL_TBASE 0x11A7
#define UTF8PROC_HANGUL_LCOUNT 19
#define UTF8PROC_HANGUL_VCOUNT 21
#define UTF8PROC_HANGUL_TCOUNT 28
#define UTF8PROC_HANGUL_NCOUNT 588
#define UTF8PROC_HANGUL_SCOUNT 11172
/*
 *  Code point ranges used to classify Hangul characters when detecting
 *  grapheme cluster boundaries. END is exclusive.
 */
#define UTF8PROC_HANGUL_L_START  0x1100
#define UTF8PROC_HANGUL_L_END    0x115A
#define UTF8PROC_HANGUL_L_FILLER 0x115F
#define UTF8PROC_HANGUL_V_START  0x1160
#define UTF8PROC_HANGUL_V_END    0x11A3
#define UTF8PROC_HANGUL_T_START  0x11A8
#define UTF8PROC_HANGUL_T_END    0x11FA
#define UTF8PROC_HANGUL_S_START  0xAC00
#define UTF8PROC_HANGUL_S_END    0xD7A4
83
84
/*
 *  Boundary classes tracked through '*last_boundclass' while
 *  utf8proc_decompose_char() runs with UTF8PROC_CHARBOUND.
 *  START is the state before the first character has been classified.
 */
#define UTF8PROC_BOUNDCLASS_START    0
#define UTF8PROC_BOUNDCLASS_OTHER    1
#define UTF8PROC_BOUNDCLASS_CR       2
#define UTF8PROC_BOUNDCLASS_LF       3
#define UTF8PROC_BOUNDCLASS_CONTROL  4
#define UTF8PROC_BOUNDCLASS_EXTEND   5
#define UTF8PROC_BOUNDCLASS_L        6
#define UTF8PROC_BOUNDCLASS_V        7
#define UTF8PROC_BOUNDCLASS_T        8
#define UTF8PROC_BOUNDCLASS_LV       9
#define UTF8PROC_BOUNDCLASS_LVT     10
96
97
/*
 *  Returns the library version as a static, NUL-terminated string.
 *  The caller must not modify or free it.
 */
const char *utf8proc_version(void) {
  return "1.1.5";
}
101
102const char *utf8proc_errmsg(ssize_t errcode) {
103  switch (errcode) {
104    case UTF8PROC_ERROR_NOMEM:
105    return "Memory for processing UTF-8 data could not be allocated.";
106    case UTF8PROC_ERROR_OVERFLOW:
107    return "UTF-8 string is too long to be processed.";
108    case UTF8PROC_ERROR_INVALIDUTF8:
109    return "Invalid UTF-8 string";
110    case UTF8PROC_ERROR_NOTASSIGNED:
111    return "Unassigned Unicode code point found in UTF-8 string.";
112    case UTF8PROC_ERROR_INVALIDOPTS:
113    return "Invalid options for UTF-8 processing chosen.";
114    default:
115    return "An unknown error occured while processing UTF-8 data.";
116  }
117}
118
119ssize_t utf8proc_iterate(
120  const uint8_t *str, ssize_t strlen, int32_t *dst
121) {
122  int length;
123  int i;
124  int32_t uc = -1;
125  *dst = -1;
126  if (!strlen) return 0;
127  length = utf8proc_utf8class[str[0]];
128  if (!length) return UTF8PROC_ERROR_INVALIDUTF8;
129  if (strlen >= 0 && length > strlen) return UTF8PROC_ERROR_INVALIDUTF8;
130  for (i=1; i<length; i++) {
131    if ((str[i] & 0xC0) != 0x80) return UTF8PROC_ERROR_INVALIDUTF8;
132  }
133  switch (length) {
134    case 1:
135    uc = str[0];
136    break;
137    case 2:
138    uc = ((str[0] & 0x1F) <<  6) + (str[1] & 0x3F);
139    if (uc < 0x80) uc = -1;
140    break;
141    case 3:
142    uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) <<  6)
143      + (str[2] & 0x3F);
144    if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) ||
145      (uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1;
146    break;
147    case 4:
148    uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12)
149      + ((str[2] & 0x3F) <<  6) + (str[3] & 0x3F);
150    if (uc < 0x10000 || uc >= 0x110000) uc = -1;
151    break;
152  }
153  if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE))
154    return UTF8PROC_ERROR_INVALIDUTF8;
155  *dst = uc;
156  return length;
157}
158
/*
 *  Returns true if 'uc' is accepted as a code point by this library.
 *  Rejected are: negative values, values of 0x110000 and above, values
 *  whose low 16 bits are 0xFFFE or 0xFFFF, the surrogate range
 *  0xD800..0xDFFF and the range 0xFDD0..0xFDEF.
 */
bool utf8proc_codepoint_valid(int32_t uc) {
  const bool out_of_range  = (uc < 0 || uc >= 0x110000);
  const bool plane_end     = ((uc & 0xFFFF) >= 0xFFFE);
  const bool surrogate     = (uc >= 0xD800 && uc < 0xE000);
  const bool fdd0_block    = (uc >= 0xFDD0 && uc < 0xFDF0);
  return !(out_of_range || plane_end || surrogate || fdd0_block);
}
165
/*
 *  Writes the UTF-8 encoding of 'uc' to 'dst' and returns the number of
 *  bytes written (0..4). No terminating NUL is appended.
 *
 *  Special cases:
 *    - uc < 0 or uc >= 0x110000 writes nothing and returns 0
 *    - 0xFFFF and 0xFFFE are written as the single bytes 0xFF and 0xFE;
 *      utf8proc_decompose_char() stores 0xFFFF as the boundary marker when
 *      UTF8PROC_CHARBOUND is set
 *  Surrogate values are not filtered here and get a three-byte encoding.
 *
 *  'dst' must have room for the bytes written (at most 4).
 */
ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) {
  if (uc < 0x00) {
    return 0;
  } else if (uc < 0x80) {
    dst[0] = (uint8_t)uc;
    return 1;
  } else if (uc < 0x800) {
    dst[0] = (uint8_t)(0xC0 + (uc >> 6));
    dst[1] = (uint8_t)(0x80 + (uc & 0x3F));
    return 2;
  } else if (uc == 0xFFFF) {
    dst[0] = 0xFF;
    return 1;
  } else if (uc == 0xFFFE) {
    dst[0] = 0xFE;
    return 1;
  } else if (uc < 0x10000) {
    dst[0] = (uint8_t)(0xE0 + (uc >> 12));
    dst[1] = (uint8_t)(0x80 + ((uc >> 6) & 0x3F));
    dst[2] = (uint8_t)(0x80 + (uc & 0x3F));
    return 3;
  } else if (uc < 0x110000) {
    dst[0] = (uint8_t)(0xF0 + (uc >> 18));
    dst[1] = (uint8_t)(0x80 + ((uc >> 12) & 0x3F));
    dst[2] = (uint8_t)(0x80 + ((uc >> 6) & 0x3F));
    dst[3] = (uint8_t)(0x80 + (uc & 0x3F));
    return 4;
  }
  return 0;
}
195
196const utf8proc_property_t *utf8proc_get_property(int32_t uc) {
197  /* ASSERT: uc >= 0 && uc < 0x110000 */
198  return utf8proc_properties + (
199    utf8proc_stage2table[
200      utf8proc_stage1table[uc >> 8] + (uc & 0xFF)
201    ]
202  );
203}
204
205#define utf8proc_decompose_lump(replacement_uc) \
206  return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
207  options & ~UTF8PROC_LUMP, last_boundclass)
208
209ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize,
210    int options, int *last_boundclass) {
211  /* ASSERT: uc >= 0 && uc < 0x110000 */
212  const utf8proc_property_t *property;
213  utf8proc_propval_t category;
214  int32_t hangul_sindex;
215  property = utf8proc_get_property(uc);
216  category = property->category;
217  hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
218  if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
219    if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
220      int32_t hangul_tindex;
221      if (bufsize >= 1) {
222        dst[0] = UTF8PROC_HANGUL_LBASE +
223          hangul_sindex / UTF8PROC_HANGUL_NCOUNT;
224        if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE +
225          (hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT;
226      }
227      hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT;
228      if (!hangul_tindex) return 2;
229      if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex;
230      return 3;
231    }
232  }
233  if (options & UTF8PROC_REJECTNA) {
234    if (!category) return UTF8PROC_ERROR_NOTASSIGNED;
235  }
236  if (options & UTF8PROC_IGNORE) {
237    if (property->ignorable) return 0;
238  }
239  if (options & UTF8PROC_LUMP) {
240    if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020);
241    if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8)
242      utf8proc_decompose_lump(0x0027);
243    if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212)
244      utf8proc_decompose_lump(0x002D);
245    if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F);
246    if (uc == 0x2236) utf8proc_decompose_lump(0x003A);
247    if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008)
248      utf8proc_decompose_lump(0x003C);
249    if (uc == 0x203A || uc == 0x232A || uc == 0x3009)
250      utf8proc_decompose_lump(0x003E);
251    if (uc == 0x2216) utf8proc_decompose_lump(0x005C);
252    if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303)
253      utf8proc_decompose_lump(0x005E);
254    if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD)
255      utf8proc_decompose_lump(0x005F);
256    if (uc == 0x02CB) utf8proc_decompose_lump(0x0060);
257    if (uc == 0x2223) utf8proc_decompose_lump(0x007C);
258    if (uc == 0x223C) utf8proc_decompose_lump(0x007E);
259    if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) {
260      if (category == UTF8PROC_CATEGORY_ZL ||
261          category == UTF8PROC_CATEGORY_ZP)
262        utf8proc_decompose_lump(0x000A);
263    }
264  }
265  if (options & UTF8PROC_STRIPMARK) {
266    if (category == UTF8PROC_CATEGORY_MN ||
267      category == UTF8PROC_CATEGORY_MC ||
268      category == UTF8PROC_CATEGORY_ME) return 0;
269  }
270  if (options & UTF8PROC_CASEFOLD) {
271    if (property->casefold_mapping) {
272      const int32_t *casefold_entry;
273      ssize_t written = 0;
274      for (casefold_entry = property->casefold_mapping;
275          *casefold_entry >= 0; casefold_entry++) {
276        written += utf8proc_decompose_char(*casefold_entry, dst+written,
277          (bufsize > written) ? (bufsize - written) : 0, options,
278          last_boundclass);
279        if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
280      }
281      return written;
282    }
283  }
284  if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
285    if (property->decomp_mapping &&
286        (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
287      const int32_t *decomp_entry;
288      ssize_t written = 0;
289      for (decomp_entry = property->decomp_mapping;
290          *decomp_entry >= 0; decomp_entry++) {
291        written += utf8proc_decompose_char(*decomp_entry, dst+written,
292          (bufsize > written) ? (bufsize - written) : 0, options,
293        last_boundclass);
294        if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
295      }
296      return written;
297    }
298  }
299  if (options & UTF8PROC_CHARBOUND) {
300    bool boundary;
301    int tbc, lbc;
302    tbc =
303      (uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR :
304      (uc == 0x000A) ? UTF8PROC_BOUNDCLASS_LF :
305      ((category == UTF8PROC_CATEGORY_ZL ||
306        category == UTF8PROC_CATEGORY_ZP ||
307        category == UTF8PROC_CATEGORY_CC ||
308        category == UTF8PROC_CATEGORY_CF) &&
309        !(uc == 0x200C || uc == 0x200D)) ? UTF8PROC_BOUNDCLASS_CONTROL :
310      property->extend ? UTF8PROC_BOUNDCLASS_EXTEND :
311      ((uc >= UTF8PROC_HANGUL_L_START && uc < UTF8PROC_HANGUL_L_END) ||
312        uc == UTF8PROC_HANGUL_L_FILLER) ? UTF8PROC_BOUNDCLASS_L :
313      (uc >= UTF8PROC_HANGUL_V_START && uc < UTF8PROC_HANGUL_V_END) ?
314        UTF8PROC_BOUNDCLASS_V :
315      (uc >= UTF8PROC_HANGUL_T_START && uc < UTF8PROC_HANGUL_T_END) ?
316        UTF8PROC_BOUNDCLASS_T :
317      (uc >= UTF8PROC_HANGUL_S_START && uc < UTF8PROC_HANGUL_S_END) ? (
318        ((uc-UTF8PROC_HANGUL_SBASE) % UTF8PROC_HANGUL_TCOUNT == 0) ?
319          UTF8PROC_BOUNDCLASS_LV : UTF8PROC_BOUNDCLASS_LVT
320      ) :
321      UTF8PROC_BOUNDCLASS_OTHER;
322    lbc = *last_boundclass;
323    boundary =
324      (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
325      (lbc == UTF8PROC_BOUNDCLASS_START) ? true :
326      (lbc == UTF8PROC_BOUNDCLASS_CR &&
327       tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
328      (lbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
329      (tbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
330      (lbc == UTF8PROC_BOUNDCLASS_L &&
331       (tbc == UTF8PROC_BOUNDCLASS_L ||
332        tbc == UTF8PROC_BOUNDCLASS_V ||
333        tbc == UTF8PROC_BOUNDCLASS_LV ||
334        tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
335      ((lbc == UTF8PROC_BOUNDCLASS_LV ||
336        lbc == UTF8PROC_BOUNDCLASS_V) &&
337       (tbc == UTF8PROC_BOUNDCLASS_V ||
338        tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
339      ((lbc == UTF8PROC_BOUNDCLASS_LVT ||
340        lbc == UTF8PROC_BOUNDCLASS_T) &&
341       tbc == UTF8PROC_BOUNDCLASS_T) ? false :
342       true;
343    *last_boundclass = tbc;
344    if (boundary) {
345      if (bufsize >= 1) dst[0] = 0xFFFF;
346      if (bufsize >= 2) dst[1] = uc;
347      return 2;
348    }
349  }
350  if (bufsize >= 1) *dst = uc;
351  return 1;
352}
353
354ssize_t utf8proc_decompose(
355  const uint8_t *str, ssize_t strlen,
356  int32_t *buffer, ssize_t bufsize, int options
357) {
358  /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */
359  ssize_t wpos = 0;
360  if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE))
361    return UTF8PROC_ERROR_INVALIDOPTS;
362  if ((options & UTF8PROC_STRIPMARK) &&
363      !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE))
364    return UTF8PROC_ERROR_INVALIDOPTS;
365  {
366    int32_t uc;
367    ssize_t rpos = 0;
368    ssize_t decomp_result;
369    int boundclass = UTF8PROC_BOUNDCLASS_START;
370    while (1) {
371      if (options & UTF8PROC_NULLTERM) {
372        rpos += utf8proc_iterate(str + rpos, -1, &uc);
373        /* checking of return value is not neccessary,
374           as 'uc' is < 0 in case of error */
375        if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
376        if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW;
377        if (uc == 0) break;
378      } else {
379        if (rpos >= strlen) break;
380        rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
381        if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
382      }
383      decomp_result = utf8proc_decompose_char(
384        uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
385        &boundclass
386      );
387      if (decomp_result < 0) return decomp_result;
388      wpos += decomp_result;
389      /* prohibiting integer overflows due to too long strings: */
390      if (wpos < 0 || wpos > SSIZE_MAX/sizeof(int32_t)/2)
391        return UTF8PROC_ERROR_OVERFLOW;
392    }
393  }
394  if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) {
395    ssize_t pos = 0;
396    while (pos < wpos-1) {
397      int32_t uc1, uc2;
398      const utf8proc_property_t *property1, *property2;
399      uc1 = buffer[pos];
400      uc2 = buffer[pos+1];
401      property1 = utf8proc_get_property(uc1);
402      property2 = utf8proc_get_property(uc2);
403      if (property1->combining_class > property2->combining_class &&
404          property2->combining_class > 0) {
405        buffer[pos] = uc2;
406        buffer[pos+1] = uc1;
407        if (pos > 0) pos--; else pos++;
408      } else {
409        pos++;
410      }
411    }
412  }
413  return wpos;
414}
415
416ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) {
417  /* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
418     ASSERT: 'buffer' has one spare byte of free space at the end! */
419  if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
420    ssize_t rpos;
421    ssize_t wpos = 0;
422    int32_t uc;
423    for (rpos = 0; rpos < length; rpos++) {
424      uc = buffer[rpos];
425      if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++;
426      if (uc == 0x000A || uc == 0x000D || uc == 0x0085 ||
427          ((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) {
428        if (options & UTF8PROC_NLF2LS) {
429          if (options & UTF8PROC_NLF2PS) {
430            buffer[wpos++] = 0x000A;
431          } else {
432            buffer[wpos++] = 0x2028;
433          }
434        } else {
435          if (options & UTF8PROC_NLF2PS) {
436            buffer[wpos++] = 0x2029;
437          } else {
438            buffer[wpos++] = 0x0020;
439          }
440        }
441      } else if ((options & UTF8PROC_STRIPCC) &&
442          (uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) {
443        if (uc == 0x0009) buffer[wpos++] = 0x0020;
444      } else {
445        buffer[wpos++] = uc;
446      }
447    }
448    length = wpos;
449  }
450  if (options & UTF8PROC_COMPOSE) {
451    int32_t *starter = NULL;
452    int32_t current_char;
453    const utf8proc_property_t *starter_property = NULL, *current_property;
454    utf8proc_propval_t max_combining_class = -1;
455    ssize_t rpos;
456    ssize_t wpos = 0;
457    int32_t composition;
458    for (rpos = 0; rpos < length; rpos++) {
459      current_char = buffer[rpos];
460      current_property = utf8proc_get_property(current_char);
461      if (starter && current_property->combining_class > max_combining_class) {
462        /* combination perhaps possible */
463        int32_t hangul_lindex;
464        int32_t hangul_sindex;
465        hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE;
466        if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) {
467          int32_t hangul_vindex;
468          hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE;
469          if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) {
470            *starter = UTF8PROC_HANGUL_SBASE +
471              (hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) *
472              UTF8PROC_HANGUL_TCOUNT;
473            starter_property = NULL;
474            continue;
475          }
476        }
477        hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE;
478        if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT &&
479            (hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) {
480          int32_t hangul_tindex;
481          hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE;
482          if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
483            *starter += hangul_tindex;
484            starter_property = NULL;
485            continue;
486          }
487        }
488        if (!starter_property) {
489          starter_property = utf8proc_get_property(*starter);
490        }
491        if (starter_property->comb1st_index >= 0 &&
492            current_property->comb2nd_index >= 0) {
493          composition = utf8proc_combinations[
494            starter_property->comb1st_index +
495            current_property->comb2nd_index
496          ];
497          if (composition >= 0 && (!(options & UTF8PROC_STABLE) ||
498              !(utf8proc_get_property(composition)->comp_exclusion))) {
499            *starter = composition;
500            starter_property = NULL;
501            continue;
502          }
503        }
504      }
505      buffer[wpos] = current_char;
506      if (current_property->combining_class) {
507        if (current_property->combining_class > max_combining_class) {
508          max_combining_class = current_property->combining_class;
509        }
510      } else {
511        starter = buffer + wpos;
512        starter_property = NULL;
513        max_combining_class = -1;
514      }
515      wpos++;
516    }
517    length = wpos;
518  }
519  {
520    ssize_t rpos, wpos = 0;
521    int32_t uc;
522    for (rpos = 0; rpos < length; rpos++) {
523      uc = buffer[rpos];
524      wpos += utf8proc_encode_char(uc, ((uint8_t *)buffer) + wpos);
525    }
526    ((uint8_t *)buffer)[wpos] = 0;
527    return wpos;
528  }
529}
530
531ssize_t utf8proc_map(
532  const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
533) {
534  int32_t *buffer;
535  ssize_t result;
536  *dstptr = NULL;
537  result = utf8proc_decompose(str, strlen, NULL, 0, options);
538  if (result < 0) return result;
539  buffer = malloc(result * sizeof(int32_t) + 1);
540  if (!buffer) return UTF8PROC_ERROR_NOMEM;
541  result = utf8proc_decompose(str, strlen, buffer, result, options);
542  if (result < 0) {
543    free(buffer);
544    return result;
545  }
546  result = utf8proc_reencode(buffer, result, options);
547  if (result < 0) {
548    free(buffer);
549    return result;
550  }
551  {
552    int32_t *newptr;
553    newptr = realloc(buffer, (size_t)result+1);
554    if (newptr) buffer = newptr;
555  }
556  *dstptr = (uint8_t *)buffer;
557  return result;
558}
559
560uint8_t *utf8proc_NFD(const uint8_t *str) {
561  uint8_t *retval;
562  utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
563    UTF8PROC_DECOMPOSE);
564  return retval;
565}
566
567uint8_t *utf8proc_NFC(const uint8_t *str) {
568  uint8_t *retval;
569  utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
570    UTF8PROC_COMPOSE);
571  return retval;
572}
573
574uint8_t *utf8proc_NFKD(const uint8_t *str) {
575  uint8_t *retval;
576  utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
577    UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
578  return retval;
579}
580
581uint8_t *utf8proc_NFKC(const uint8_t *str) {
582  uint8_t *retval;
583  utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
584    UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
585  return retval;
586}
587
Note: See TracBrowser for help on using the repository browser.