* strfuncs.cc (sys_cp_wcstombs): Convert lone surrogate pair
second halves to unambiguous ASCII SO sequence. When converting chars invalid in current codepage to ASCII SO sequence, make sure to check for surrogate pair second half only if at least one wide character is left. Decrement nwc if valid second half has been converted. (sys_cp_mbstowcs): Improve ASCII SO handling. Never break from loop if invalid character has been found. Recognize ASCII SO sequence representing originally invalid multibyte char converted into a lone surrogate pair second half. Convert accordingly.
This commit is contained in:
parent
70ecec052a
commit
e664429d77
|
@ -1,3 +1,16 @@
|
||||||
|
2009-09-23 Corinna Vinschen <corinna@vinschen.de>
|
||||||
|
|
||||||
|
* strfuncs.cc (sys_cp_wcstombs): Convert lone surrogate pair
|
||||||
|
second halves to unambiguous ASCII SO sequence. When converting
|
||||||
|
chars invalid in current codepage to ASCII SO sequence, make
|
||||||
|
sure to check for surrogate pair second half only if at least
|
||||||
|
one wide character is left. Decrement nwc if valid second half has
|
||||||
|
been converted.
|
||||||
|
(sys_cp_mbstowcs): Improve ASCII SO handling. Never break from loop
|
||||||
|
if invalid character has been found. Recognize ASCII SO sequence
|
||||||
|
representing originally invalid multibyte char converted into a
|
||||||
|
lone surrogate pair second half. Convert accordingly.
|
||||||
|
|
||||||
2009-09-22 Corinna Vinschen <corinna@vinschen.de>
|
2009-09-22 Corinna Vinschen <corinna@vinschen.de>
|
||||||
|
|
||||||
* autoload.cc (WSARecv): Define.
|
* autoload.cc (WSARecv): Define.
|
||||||
|
|
|
@ -435,8 +435,10 @@ sys_cp_wcstombs (wctomb_p f_wctomb, char *charset, char *dst, size_t len,
|
||||||
surrogate pair in the 0xDCxx range specifying an invalid byte
|
surrogate pair in the 0xDCxx range specifying an invalid byte
|
||||||
value when converting from MB to WC.
|
value when converting from MB to WC.
|
||||||
The comment in sys_cp_mbstowcs below explains it. */
|
The comment in sys_cp_mbstowcs below explains it. */
|
||||||
buf[0] = (char) (pw & 0xff);
|
buf[0] = 0x0e; /* ASCII SO */
|
||||||
bytes = 1;
|
buf[1] = 0xff;
|
||||||
|
buf[2] = (char) (pw & 0xff);
|
||||||
|
bytes = 3;
|
||||||
}
|
}
|
||||||
else if (bytes == -1 && *charset != 'U'/*TF-8*/)
|
else if (bytes == -1 && *charset != 'U'/*TF-8*/)
|
||||||
{
|
{
|
||||||
|
@ -451,7 +453,7 @@ sys_cp_wcstombs (wctomb_p f_wctomb, char *charset, char *dst, size_t len,
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
++bytes; /* Add the ASCII SO to the byte count. */
|
++bytes; /* Add the ASCII SO to the byte count. */
|
||||||
if (ps.__count == -4) /* First half of a surrogate pair. */
|
if (ps.__count == -4 && nwc > 0) /* First half of a surrogate pair. */
|
||||||
{
|
{
|
||||||
++pwcs;
|
++pwcs;
|
||||||
if ((*pwcs & 0xfc00) != 0xdc00) /* Invalid second half. */
|
if ((*pwcs & 0xfc00) != 0xdc00) /* Invalid second half. */
|
||||||
|
@ -461,6 +463,7 @@ sys_cp_wcstombs (wctomb_p f_wctomb, char *charset, char *dst, size_t len,
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
bytes += __utf8_wctomb (_REENT, buf + bytes, *pwcs, charset, &ps);
|
bytes += __utf8_wctomb (_REENT, buf + bytes, *pwcs, charset, &ps);
|
||||||
|
nwc--;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (n + bytes <= len)
|
if (n + bytes <= len)
|
||||||
|
@ -546,42 +549,64 @@ sys_cp_mbstowcs (mbtowc_p f_mbtowc, char *charset, wchar_t *dst, size_t dlen,
|
||||||
len = (size_t)-1;
|
len = (size_t)-1;
|
||||||
while (len > 0 && nms > 0)
|
while (len > 0 && nms > 0)
|
||||||
{
|
{
|
||||||
/* ASCII SO. Sanity check: If this is a lead SO byte for a following
|
/* ASCII SO handling. */
|
||||||
UTF-8 sequence, there must be at least two more bytes left, and the
|
if (*pmbs == 0x0e)
|
||||||
next byte must be a valid UTF-8 start byte. If the charset isn't
|
|
||||||
UTF-8 anyway, try to convert the following bytes as UTF-8 sequence. */
|
|
||||||
if (*pmbs == 0x0e && nms > 2 && pmbs[1] >= 0xc2
|
|
||||||
&& pmbs[1] <= 0xf4 && *charset != 'U'/*TF-8*/)
|
|
||||||
{
|
{
|
||||||
pmbs++;
|
/* Sanity check: If this is a lead SO byte for a following UTF-8
|
||||||
--nms;
|
sequence, there must be at least two more bytes left, and the
|
||||||
bytes = __utf8_mbtowc (_REENT, ptr, (const char *) pmbs, nms,
|
next byte must be a valid UTF-8 start byte. If the charset
|
||||||
charset, &ps);
|
isn't UTF-8 anyway, try to convert the following bytes as UTF-8
|
||||||
if (bytes < 0)
|
sequence. */
|
||||||
|
if (nms > 2 && pmbs[1] >= 0xc2 && pmbs[1] <= 0xf4 && *charset != 'U'/*TF-8*/)
|
||||||
|
{
|
||||||
|
bytes = __utf8_mbtowc (_REENT, ptr, (const char *) pmbs + 1,
|
||||||
|
nms - 1, charset, &ps);
|
||||||
|
if (bytes < 0)
|
||||||
|
{
|
||||||
|
/* Invalid UTF-8 sequence? Treat the ASCII SO character as
|
||||||
|
stand-alone ASCII SO char. */
|
||||||
|
bytes = 1;
|
||||||
|
if (dst)
|
||||||
|
*ptr = 0x0e;
|
||||||
|
memset (&ps, 0, sizeof ps);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
++bytes; /* Count SO byte */
|
||||||
|
if (bytes > 1 && ps.__count == 4)
|
||||||
|
{
|
||||||
|
/* First half of a surrogate. */
|
||||||
|
wchar_t *ptr2 = dst ? ptr + 1 : NULL;
|
||||||
|
int bytes2 = __utf8_mbtowc (_REENT, ptr2,
|
||||||
|
(const char *) pmbs + bytes,
|
||||||
|
nms - bytes, charset, &ps);
|
||||||
|
if (bytes2 < 0)
|
||||||
|
memset (&ps, 0, sizeof ps);
|
||||||
|
else
|
||||||
|
{
|
||||||
|
bytes += bytes2;
|
||||||
|
++count;
|
||||||
|
ptr = dst ? ptr + 1 : NULL;
|
||||||
|
--len;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/* Sequence for an invalid byte originally created in the next outer
|
||||||
|
else branch below. This must be converted back to a 0xDCxx value
|
||||||
|
as well. */
|
||||||
|
else if (nms > 2 && pmbs[1] == 0xff)
|
||||||
|
{
|
||||||
|
bytes = 3;
|
||||||
|
if (dst)
|
||||||
|
*ptr = L'\xdc80' | pmbs[2];
|
||||||
|
}
|
||||||
|
/* Otherwise it's just a simple ASCII SO. */
|
||||||
|
else
|
||||||
{
|
{
|
||||||
/* Invalid UTF-8 sequence? Treat the ASCII SO character as
|
|
||||||
stand-alone ASCII SO char. */
|
|
||||||
bytes = 1;
|
bytes = 1;
|
||||||
if (dst)
|
if (dst)
|
||||||
*ptr = 0x0e;
|
*ptr = 0x0e;
|
||||||
memset (&ps, 0, sizeof ps);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (bytes == 0)
|
|
||||||
break;
|
|
||||||
if (ps.__count == 4) /* First half of a surrogate. */
|
|
||||||
{
|
|
||||||
wchar_t *ptr2 = dst ? ptr + 1 : NULL;
|
|
||||||
int bytes2 = __utf8_mbtowc (_REENT, ptr2,
|
|
||||||
(const char *) pmbs + bytes,
|
|
||||||
nms - bytes, charset, &ps);
|
|
||||||
if (bytes2 < 0)
|
|
||||||
break;
|
|
||||||
pmbs += bytes2;
|
|
||||||
nms -= bytes2;
|
|
||||||
++count;
|
|
||||||
ptr = dst ? ptr + 1 : NULL;
|
|
||||||
--len;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if ((bytes = f_mbtowc (_REENT, ptr, (const char *) pmbs, nms,
|
else if ((bytes = f_mbtowc (_REENT, ptr, (const char *) pmbs, nms,
|
||||||
|
@ -598,10 +623,10 @@ sys_cp_mbstowcs (mbtowc_p f_mbtowc, char *charset, wchar_t *dst, size_t dlen,
|
||||||
characters converted to this format. It does allow processing of
|
characters converted to this format. It does allow processing of
|
||||||
src to continue, however, which, since there is no way to signal
|
src to continue, however, which, since there is no way to signal
|
||||||
decoding errors, seems like the best we can do. */
|
decoding errors, seems like the best we can do. */
|
||||||
|
bytes = 1;
|
||||||
if (dst)
|
if (dst)
|
||||||
*ptr = L'\xdc80' | *pmbs;
|
*ptr = L'\xdc80' | *pmbs;
|
||||||
memset (&ps, 0, sizeof ps);
|
memset (&ps, 0, sizeof ps);
|
||||||
bytes = 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (bytes > 0)
|
if (bytes > 0)
|
||||||
|
|
Loading…
Reference in New Issue