From 397775c6f62496fc0887bdda34d2084d38b598e7 Mon Sep 17 00:00:00 2001 From: Corinna Vinschen Date: Wed, 3 Jun 2009 17:23:39 +0000 Subject: [PATCH] * strfuncs.cc (sys_cp_wcstombs): Implement reverse functionality of the change to sys_cp_mbstowcs from 2009-05-30. (sys_cp_mbstowcs): Slightly reformat. Fix comment to accommodate change to sys_cp_wcstombs. Don't write to *ptr if dst is NULL. --- winsup/cygwin/ChangeLog | 7 +++++++ winsup/cygwin/strfuncs.cc | 34 ++++++++++++++++++++++------------ 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/winsup/cygwin/ChangeLog b/winsup/cygwin/ChangeLog index 696916c74..fdb85ab52 100644 --- a/winsup/cygwin/ChangeLog +++ b/winsup/cygwin/ChangeLog @@ -1,3 +1,10 @@ +2009-06-03 Corinna Vinschen + + * strfuncs.cc (sys_cp_wcstombs): Implement reverse functionality + of the change to sys_cp_mbstowcs from 2009-05-30. + (sys_cp_mbstowcs): Slightly reformat. Fix comment to accommodate + change to sys_cp_wcstombs. Don't write to *ptr if dst is NULL. + 2009-06-03 Corinna Vinschen * fhandler_console.cc (fhandler_console::read): Convert Alt-Backspace diff --git a/winsup/cygwin/strfuncs.cc b/winsup/cygwin/strfuncs.cc index 4563f98db..d7abcb3c3 100644 --- a/winsup/cygwin/strfuncs.cc +++ b/winsup/cygwin/strfuncs.cc @@ -427,10 +427,19 @@ sys_cp_wcstombs (wctomb_p f_wctomb, char *charset, char *dst, size_t len, if ((pw & 0xff00) == 0xf000) pw &= 0xff; int bytes = f_wctomb (_REENT, buf, pw, charset, &ps); - /* Convert chars invalid in the current codepage to a sequence - ASCII SO; UTF-8 representation of invalid char. */ - if (bytes == -1 && *charset != 'U'/*TF-8*/) + if (bytes == -1 && (pw & 0xff00) == 0xdc00) + { + /* Reverse functionality of the single invalid second half of a + surrogate pair in the 0xDCxx range specifying an invalid byte + value when converting from MB to WC. + The comment in sys_cp_mbstowcs below explains it. */ + buf[0] = (char) (pw & 0xff); + bytes = 1; + } + else if (bytes == -1 && *charset != 'U'/*TF-8*/) { + /* Convert chars invalid in the current codepage to a sequence + ASCII SO; UTF-8 representation of invalid char. */ buf[0] = 0x0e; /* ASCII SO */ bytes = __utf8_wctomb (_REENT, buf + 1, pw, charset, &ps); if (bytes == -1) @@ -561,7 +570,8 @@ sys_cp_mbstowcs (mbtowc_p f_mbtowc, char *charset, wchar_t *dst, size_t dlen, if (ps.__count == 4) /* First half of a surrogate. */ { wchar_t *ptr2 = dst ? ptr + 1 : NULL; - int bytes2 = __utf8_mbtowc (_REENT, ptr2, (const char *) pmbs + bytes, + int bytes2 = __utf8_mbtowc (_REENT, ptr2, + (const char *) pmbs + bytes, nms - bytes, charset, &ps); if (bytes2 < 0) break; @@ -572,7 +582,9 @@ sys_cp_mbstowcs (mbtowc_p f_mbtowc, char *charset, wchar_t *dst, size_t dlen, --len; } } - else if ((bytes = f_mbtowc (_REENT, ptr, (const char *) pmbs, nms, charset, &ps)) < 0 && *pmbs > '\x80') + else if ((bytes = f_mbtowc (_REENT, ptr, (const char *) pmbs, nms, + charset, &ps)) < 0 + && *pmbs > '\x80') { /* This should probably be handled in f_mbtowc which can operate on sequences rather than individual characters. @@ -581,13 +593,11 @@ sys_cp_mbstowcs (mbtowc_p f_mbtowc, char *charset, wchar_t *dst, size_t dlen, http://www.mail-archive.com/linux-utf8@nl.linux.org/msg00080.html This is hardly perfect. Windows doesn't do anything sensical with - characters converted to this format and (currently) we don't convert - them back into their original single byte form. It does allow - processing of src to continue, however, which, since there is no - way to signal decoding errors, seems like the best we can do. - - */ - *ptr = L'\xdc80' | *pmbs; + characters converted to this format. It does allow processing of + src to continue, however, which, since there is no way to signal + decoding errors, seems like the best we can do. */ + if (dst) + *ptr = L'\xdc80' | *pmbs; bytes = 1; }