* libc/stdlib/mbtowc_r.c (__utf8_mbtowc): Allow CESU-8 surrogate
value encoding. * libc/stdlib/wctomb_r.c (__utf8_wctomb): Allow CESU-8 surrogate value decoding.
This commit is contained in:
parent
9c47bbb6e9
commit
6ff28fc3b1
|
@ -1,3 +1,10 @@
|
||||||
|
2009-10-03 Corinna Vinschen <corinna@vinschen.de>
|
||||||
|
|
||||||
|
* libc/stdlib/mbtowc_r.c (__utf8_mbtowc): Allow CESU-8 surrogate
|
||||||
|
value encoding.
|
||||||
|
* libc/stdlib/wctomb_r.c (__utf8_wctomb): Allow CESU-8 surrogate
|
||||||
|
value decoding.
|
||||||
|
|
||||||
2009-09-29 Corinna Vinschen <corinna@vinschen.de>
|
2009-09-29 Corinna Vinschen <corinna@vinschen.de>
|
||||||
|
|
||||||
* libc/locale/locale.c (loadlocale): Allow "C." same as "C-" as locale
|
* libc/locale/locale.c (loadlocale): Allow "C." same as "C-" as locale
|
||||||
|
|
|
@ -295,12 +295,6 @@ _DEFUN (__utf8_mbtowc, (r, pwc, s, n, charset, state),
|
||||||
tmp = (wchar_t)((state->__value.__wchb[0] & 0x0f) << 12)
|
tmp = (wchar_t)((state->__value.__wchb[0] & 0x0f) << 12)
|
||||||
| (wchar_t)((state->__value.__wchb[1] & 0x3f) << 6)
|
| (wchar_t)((state->__value.__wchb[1] & 0x3f) << 6)
|
||||||
| (wchar_t)(ch & 0x3f);
|
| (wchar_t)(ch & 0x3f);
|
||||||
/* Check for invalid CESU-8 encoding of UTF-16 surrogate values. */
|
|
||||||
if (tmp >= 0xd800 && tmp <= 0xdfff)
|
|
||||||
{
|
|
||||||
r->_errno = EILSEQ;
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
*pwc = tmp;
|
*pwc = tmp;
|
||||||
return i;
|
return i;
|
||||||
}
|
}
|
||||||
|
|
|
@ -63,72 +63,75 @@ _DEFUN (__utf8_wctomb, (r, s, wchar, charset, state),
|
||||||
mbstate_t *state)
|
mbstate_t *state)
|
||||||
{
|
{
|
||||||
wint_t wchar = _wchar;
|
wint_t wchar = _wchar;
|
||||||
|
int ret = 0;
|
||||||
|
|
||||||
if (s == NULL)
|
if (s == NULL)
|
||||||
return 0; /* UTF-8 encoding is not state-dependent */
|
return 0; /* UTF-8 encoding is not state-dependent */
|
||||||
|
|
||||||
if (state->__count == -4 && (wchar < 0xdc00 || wchar >= 0xdfff))
|
if (sizeof (wchar_t) == 2 && state->__count == -4
|
||||||
|
&& (wchar < 0xdc00 || wchar >= 0xdfff))
|
||||||
{
|
{
|
||||||
/* At this point only the second half of a surrogate pair is valid. */
|
/* There's a leftover lone high surrogate. Write out the CESU-8 value
|
||||||
r->_errno = EILSEQ;
|
of the surrogate and proceed to convert the given character. Note
|
||||||
return -1;
|
to return extra 3 bytes. */
|
||||||
|
wchar_t tmp;
|
||||||
|
tmp = (state->__value.__wchb[0] << 16 | state->__value.__wchb[1] << 8)
|
||||||
|
- 0x10000 >> 10 | 0xd80d;
|
||||||
|
*s++ = 0xe0 | ((tmp & 0xf000) >> 12);
|
||||||
|
*s++ = 0x80 | ((tmp & 0xfc0) >> 6);
|
||||||
|
*s++ = 0x80 | (tmp & 0x3f);
|
||||||
|
state->__count = 0;
|
||||||
|
ret = 3;
|
||||||
}
|
}
|
||||||
if (wchar <= 0x7f)
|
if (wchar <= 0x7f)
|
||||||
{
|
{
|
||||||
*s = wchar;
|
*s = wchar;
|
||||||
return 1;
|
return ret + 1;
|
||||||
}
|
}
|
||||||
if (wchar >= 0x80 && wchar <= 0x7ff)
|
if (wchar >= 0x80 && wchar <= 0x7ff)
|
||||||
{
|
{
|
||||||
*s++ = 0xc0 | ((wchar & 0x7c0) >> 6);
|
*s++ = 0xc0 | ((wchar & 0x7c0) >> 6);
|
||||||
*s = 0x80 | (wchar & 0x3f);
|
*s = 0x80 | (wchar & 0x3f);
|
||||||
return 2;
|
return ret + 2;
|
||||||
}
|
}
|
||||||
if (wchar >= 0x800 && wchar <= 0xffff)
|
if (wchar >= 0x800 && wchar <= 0xffff)
|
||||||
{
|
{
|
||||||
if (wchar >= 0xd800 && wchar <= 0xdfff)
|
/* No UTF-16 surrogate handling in UCS-4 */
|
||||||
|
if (sizeof (wchar_t) == 2 && wchar >= 0xd800 && wchar <= 0xdfff)
|
||||||
{
|
{
|
||||||
wint_t tmp;
|
wint_t tmp;
|
||||||
/* UTF-16 surrogates -- must not occur in normal UCS-4 data */
|
if (wchar <= 0xdbff)
|
||||||
if (sizeof (wchar_t) != 2)
|
|
||||||
{
|
{
|
||||||
r->_errno = EILSEQ;
|
/* First half of a surrogate pair. Store the state and
|
||||||
return -1;
|
return ret + 0. */
|
||||||
|
tmp = ((wchar & 0x3ff) << 10) + 0x10000;
|
||||||
|
state->__value.__wchb[0] = (tmp >> 16) & 0xff;
|
||||||
|
state->__value.__wchb[1] = (tmp >> 8) & 0xff;
|
||||||
|
state->__count = -4;
|
||||||
|
*s = (0xf0 | ((tmp & 0x1c0000) >> 18));
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
if (wchar >= 0xdc00)
|
if (state->__count == -4)
|
||||||
{
|
{
|
||||||
/* Second half of a surrogate pair. It's not valid if
|
/* Second half of a surrogate pair. Reconstruct the full
|
||||||
we don't have already read a first half of a surrogate
|
Unicode value and return the trailing three bytes of the
|
||||||
before. */
|
UTF-8 character. */
|
||||||
if (state->__count != -4)
|
|
||||||
{
|
|
||||||
r->_errno = EILSEQ;
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
/* If it's valid, reconstruct the full Unicode value and
|
|
||||||
return the trailing three bytes of the UTF-8 char. */
|
|
||||||
tmp = (state->__value.__wchb[0] << 16)
|
tmp = (state->__value.__wchb[0] << 16)
|
||||||
| (state->__value.__wchb[1] << 8)
|
| (state->__value.__wchb[1] << 8)
|
||||||
| (wchar & 0x3ff);
|
| (wchar & 0x3ff);
|
||||||
state->__count = 0;
|
state->__count = 0;
|
||||||
|
*s++ = 0xf0 | ((tmp & 0x1c0000) >> 18);
|
||||||
*s++ = 0x80 | ((tmp & 0x3f000) >> 12);
|
*s++ = 0x80 | ((tmp & 0x3f000) >> 12);
|
||||||
*s++ = 0x80 | ((tmp & 0xfc0) >> 6);
|
*s++ = 0x80 | ((tmp & 0xfc0) >> 6);
|
||||||
*s = 0x80 | (tmp & 0x3f);
|
*s = 0x80 | (tmp & 0x3f);
|
||||||
return 3;
|
return 4;
|
||||||
}
|
}
|
||||||
/* First half of a surrogate pair. Store the state and return
|
/* Otherwise translate into CESU-8 value. */
|
||||||
the first byte of the UTF-8 char. */
|
|
||||||
tmp = ((wchar & 0x3ff) << 10) + 0x10000;
|
|
||||||
state->__value.__wchb[0] = (tmp >> 16) & 0xff;
|
|
||||||
state->__value.__wchb[1] = (tmp >> 8) & 0xff;
|
|
||||||
state->__count = -4;
|
|
||||||
*s = (0xf0 | ((tmp & 0x1c0000) >> 18));
|
|
||||||
return 1;
|
|
||||||
}
|
}
|
||||||
*s++ = 0xe0 | ((wchar & 0xf000) >> 12);
|
*s++ = 0xe0 | ((wchar & 0xf000) >> 12);
|
||||||
*s++ = 0x80 | ((wchar & 0xfc0) >> 6);
|
*s++ = 0x80 | ((wchar & 0xfc0) >> 6);
|
||||||
*s = 0x80 | (wchar & 0x3f);
|
*s = 0x80 | (wchar & 0x3f);
|
||||||
return 3;
|
return ret + 3;
|
||||||
}
|
}
|
||||||
if (wchar >= 0x10000 && wchar <= 0x10ffff)
|
if (wchar >= 0x10000 && wchar <= 0x10ffff)
|
||||||
{
|
{
|
||||||
|
|
Loading…
Reference in New Issue