* fhandler.h (class dev_console): Constify charset parameter of
str_to_con. * fhandler_console.cc (dev_console::con_to_str): Simplify. Always default to the current internal locale. (dev_console::get_console_cp): Always use codepage 437 for alternate charset. (dev_console::str_to_con): Constify charset parameter. (fhandler_console::write_normal): Always use codepage 437 for alternate charset. Otherwise always default to the current internal locale. Replace ASCII SO with ASCII CAN. * strfuncs.cc: Tweak comments according to below changes. (sys_cp_wcstombs): Constify charset parameter. Convert all wchar_t values in the Unicode private use area U+F0xx to the singlebyte counterpart. Drop special handling creating ASCII SO sequence from U+DCxx value. Rearrange for performance. Replace ASCII SO with ASCII CAN. (sys_cp_mbstowcs): Constify charset parameter. Replace ASCII SO with ASCII CAN. Drop special case for U+DCxx ASCII SO sequences. Always create a replacement from the Unicode private use area U+F0xx for invalid byte values in a multibyte sequence. Do the same for wchar_t values from the U+F0xx range to make them roundtrip safe. * wchar.h (sys_cp_wcstombs): Constify charset parameter. (sys_cp_mbstowcs): Ditto.
This commit is contained in:
parent
d856640e1c
commit
587b75f7bd
|
@ -1,3 +1,29 @@
|
||||||
|
2009-09-28 Corinna Vinschen <corinna@vinschen.de>
|
||||||
|
|
||||||
|
* fhandler.h (class dev_console): Constify charset parameter of
|
||||||
|
str_to_con.
|
||||||
|
* fhandler_console.cc (dev_console::con_to_str): Simplify. Always
|
||||||
|
default to the current internal locale.
|
||||||
|
(dev_console::get_console_cp): Always use codepage 437 for alternate
|
||||||
|
charset.
|
||||||
|
(dev_console::str_to_con): Constify charset parameter.
|
||||||
|
(fhandler_console::write_normal): Always use codepage 437 for alternate
|
||||||
|
charset. Otherwise always default to the current internal locale.
|
||||||
|
Replace ASCII SO with ASCII CAN.
|
||||||
|
* strfuncs.cc: Tweak comments according to below changes.
|
||||||
|
(sys_cp_wcstombs): Constify charset parameter. Convert all wchar_t
|
||||||
|
values in the Unicode private use area U+F0xx to the singlebyte
|
||||||
|
counterpart. Drop special handling creating ASCII SO sequence from
|
||||||
|
U+DCxx value. Rearrange for performance. Replace ASCII SO with
|
||||||
|
ASCII CAN.
|
||||||
|
(sys_cp_mbstowcs): Constify charset parameter. Replace ASCII SO with
|
||||||
|
ASCII CAN. Drop special case for U+DCxx ASCII SO sequences. Always
|
||||||
|
create a replacement from the Unicode private use area U+F0xx for
|
||||||
|
invalid byte values in a multibyte sequence. Do the same for wchar_t
|
||||||
|
values from the U+F0xx range to make them roundtrip safe.
|
||||||
|
* wchar.h (sys_cp_wcstombs): Constify charset parameter.
|
||||||
|
(sys_cp_mbstowcs): Ditto.
|
||||||
|
|
||||||
2009-09-28 Corinna Vinschen <corinna@vinschen.de>
|
2009-09-28 Corinna Vinschen <corinna@vinschen.de>
|
||||||
|
|
||||||
* cygheap.cc (cygheap_init): Default locale.charset to "UTF-8".
|
* cygheap.cc (cygheap_init): Default locale.charset to "UTF-8".
|
||||||
|
|
|
@ -934,7 +934,7 @@ class dev_console
|
||||||
|
|
||||||
inline UINT get_console_cp ();
|
inline UINT get_console_cp ();
|
||||||
DWORD con_to_str (char *d, int dlen, WCHAR w);
|
DWORD con_to_str (char *d, int dlen, WCHAR w);
|
||||||
DWORD str_to_con (mbtowc_p, char *, PWCHAR d, const char *s, DWORD sz);
|
DWORD str_to_con (mbtowc_p, const char *, PWCHAR d, const char *s, DWORD sz);
|
||||||
void set_color (HANDLE);
|
void set_color (HANDLE);
|
||||||
bool fillin_info (HANDLE);
|
bool fillin_info (HANDLE);
|
||||||
void set_default_attr ();
|
void set_default_attr ();
|
||||||
|
|
|
@ -127,19 +127,19 @@ tty_list::get_tty (int n)
|
||||||
inline DWORD
|
inline DWORD
|
||||||
dev_console::con_to_str (char *d, int dlen, WCHAR w)
|
dev_console::con_to_str (char *d, int dlen, WCHAR w)
|
||||||
{
|
{
|
||||||
return sys_cp_wcstombs (*cygheap->locale.charset == 'A'
|
return sys_cp_wcstombs (cygheap->locale.wctomb, cygheap->locale.charset,
|
||||||
? __ascii_wctomb : cygheap->locale.wctomb,
|
d, dlen, &w, 1);
|
||||||
cygheap->locale.charset, d, dlen, &w, 1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
inline UINT
|
inline UINT
|
||||||
dev_console::get_console_cp ()
|
dev_console::get_console_cp ()
|
||||||
{
|
{
|
||||||
return alternate_charset_active ? GetConsoleOutputCP () : 0;
|
/* The alternate charset is always 437, just as in the Linux console. */
|
||||||
|
return alternate_charset_active ? 437 : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline DWORD
|
inline DWORD
|
||||||
dev_console::str_to_con (mbtowc_p f_mbtowc, char *charset,
|
dev_console::str_to_con (mbtowc_p f_mbtowc, const char *charset,
|
||||||
PWCHAR d, const char *s, DWORD sz)
|
PWCHAR d, const char *s, DWORD sz)
|
||||||
{
|
{
|
||||||
return sys_cp_mbstowcs (f_mbtowc, charset, d, CONVERT_LIMIT, s, sz);
|
return sys_cp_mbstowcs (f_mbtowc, charset, d, CONVERT_LIMIT, s, sz);
|
||||||
|
@ -1454,16 +1454,19 @@ fhandler_console::write_normal (const unsigned char *src,
|
||||||
size_t ret;
|
size_t ret;
|
||||||
mbstate_t ps;
|
mbstate_t ps;
|
||||||
UINT cp = dev_state->get_console_cp ();
|
UINT cp = dev_state->get_console_cp ();
|
||||||
char charsetbuf[ENCODING_LEN + 1];
|
const char *charset;
|
||||||
char *charset;
|
|
||||||
mbtowc_p f_mbtowc;
|
mbtowc_p f_mbtowc;
|
||||||
|
|
||||||
if (cp)
|
if (cp)
|
||||||
f_mbtowc = __set_charset_from_codepage (cp, charset = charsetbuf);
|
{
|
||||||
|
/* The alternate charset is always 437, just as in the Linux console. */
|
||||||
|
f_mbtowc = __cp_mbtowc;
|
||||||
|
charset = "CP437";
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
f_mbtowc = cygheap->locale.mbtowc;
|
||||||
charset = cygheap->locale.charset;
|
charset = cygheap->locale.charset;
|
||||||
f_mbtowc = (*charset == 'A') ? __ascii_mbtowc : cygheap->locale.mbtowc;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* First check if we have cached lead bytes of a former try to write
|
/* First check if we have cached lead bytes of a former try to write
|
||||||
|
@ -1606,10 +1609,10 @@ fhandler_console::write_normal (const unsigned char *src,
|
||||||
cursor_set (false, 0, y);
|
cursor_set (false, 0, y);
|
||||||
break;
|
break;
|
||||||
case ERR:
|
case ERR:
|
||||||
/* Don't print chars marked as ERR chars, except for a SO sequence
|
/* Don't print chars marked as ERR chars, except for an ASCII CAN
|
||||||
which is printed as singlebyte chars from the UTF Basic Latin
|
sequence which is printed as singlebyte chars from the UTF
|
||||||
and Latin 1 Supplement plains. */
|
Basic Latin and Latin 1 Supplement blocks. */
|
||||||
if (*found == 0x0e)
|
if (*found == 0x18)
|
||||||
{
|
{
|
||||||
write_replacement_char ();
|
write_replacement_char ();
|
||||||
if (found + 1 < end)
|
if (found + 1 < end)
|
||||||
|
|
|
@ -310,8 +310,7 @@ __big5_mbtowc (struct _reent *r, wchar_t *pwc, const char *s, size_t n,
|
||||||
Called from newlib's setlocale() with codepage set to 0, if the
|
Called from newlib's setlocale() with codepage set to 0, if the
|
||||||
charset isn't given explicitly in the POSIX compatible locale specifier.
|
charset isn't given explicitly in the POSIX compatible locale specifier.
|
||||||
The function also returns a pointer to the corresponding _mbtowc_r
|
The function also returns a pointer to the corresponding _mbtowc_r
|
||||||
function. Also called from fhandler_console::write_normal() if the
|
function. */
|
||||||
"Alternate Charset" has been switched on by an escape sequence. */
|
|
||||||
extern "C" mbtowc_p
|
extern "C" mbtowc_p
|
||||||
__set_charset_from_codepage (UINT cp, char *charset)
|
__set_charset_from_codepage (UINT cp, char *charset)
|
||||||
{
|
{
|
||||||
|
@ -400,17 +399,17 @@ __set_charset_from_codepage (UINT cp, char *charset)
|
||||||
multibyte charset, then usually you wouldn't be able to access the
|
multibyte charset, then usually you wouldn't be able to access the
|
||||||
file. To fix this problem, sys_wcstombs creates a replacement multibyte
|
file. To fix this problem, sys_wcstombs creates a replacement multibyte
|
||||||
sequences for the non-representable wide-char. The sequence starts with
|
sequences for the non-representable wide-char. The sequence starts with
|
||||||
an ASCII SO (0x0e, Ctrl-N), followed by the UTF-8 representation of the
|
an ASCII CAN (0x18, Ctrl-X), followed by the UTF-8 representation of the
|
||||||
character. The sys_(cp_)mbstowcs function detects ASCII SO characters
|
character. The sys_(cp_)mbstowcs function detects ASCII CAN characters
|
||||||
in the input multibyte string and converts the following multibyte
|
in the input multibyte string and converts the following multibyte
|
||||||
sequence in by treating it as an UTF-8 char. If that fails, the ASCII
|
sequence in by treating it as an UTF-8 char. If that fails, the ASCII
|
||||||
SO was probably standalone and it gets just copied over as ASCII SO.
|
CAN was probably standalone and it gets just copied over as ASCII CAN.
|
||||||
|
|
||||||
- The functions always create 0-terminated results, no matter what.
|
- The functions always create 0-terminated results, no matter what.
|
||||||
If the result is truncated due to buffer size, it's a bug in Cygwin
|
If the result is truncated due to buffer size, it's a bug in Cygwin
|
||||||
and the buffer in the calling function should be raised. */
|
and the buffer in the calling function should be raised. */
|
||||||
size_t __stdcall
|
size_t __stdcall
|
||||||
sys_cp_wcstombs (wctomb_p f_wctomb, char *charset, char *dst, size_t len,
|
sys_cp_wcstombs (wctomb_p f_wctomb, const char *charset, char *dst, size_t len,
|
||||||
const wchar_t *src, size_t nwc)
|
const wchar_t *src, size_t nwc)
|
||||||
{
|
{
|
||||||
char buf[10];
|
char buf[10];
|
||||||
|
@ -426,46 +425,47 @@ sys_cp_wcstombs (wctomb_p f_wctomb, char *charset, char *dst, size_t len,
|
||||||
while (n < len && nwc-- > 0)
|
while (n < len && nwc-- > 0)
|
||||||
{
|
{
|
||||||
wchar_t pw = *pwcs;
|
wchar_t pw = *pwcs;
|
||||||
/* Convert UNICODE private use area. Reverse functionality (only for
|
int bytes;
|
||||||
path names) is transform_chars in path.cc. */
|
|
||||||
if ((pw & 0xff00) == 0xf000)
|
/* Convert UNICODE private use area. Reverse functionality for the
|
||||||
pw &= 0xff;
|
ASCII area <= 0x7f (only for path names) is transform_chars in
|
||||||
int bytes = f_wctomb (_REENT, buf, pw, charset, &ps);
|
path.cc. Reverse functionality for invalid bytes in a multibyte
|
||||||
if (bytes == -1 && (pw & 0xff00) == 0xdc00)
|
sequence is in sys_cp_mbstowcs. */
|
||||||
|
if ((pw & 0xff00) == 0xf000 && ((pw & 0xff) <= 0x7f || MB_CUR_MAX > 1))
|
||||||
{
|
{
|
||||||
/* Reverse functionality of the single invalid second half of a
|
buf[0] = pw & 0xff;
|
||||||
surrogate pair in the 0xDCxx range specifying an invalid byte
|
bytes = 1;
|
||||||
value when converting from MB to WC.
|
}
|
||||||
The comment in sys_cp_mbstowcs below explains it. */
|
else
|
||||||
buf[0] = 0x0e; /* ASCII SO */
|
|
||||||
buf[1] = 0xff;
|
|
||||||
buf[2] = (char) (pw & 0xff);
|
|
||||||
bytes = 3;
|
|
||||||
}
|
|
||||||
else if (bytes == -1 && *charset != 'U'/*TF-8*/)
|
|
||||||
{
|
{
|
||||||
/* Convert chars invalid in the current codepage to a sequence
|
bytes = f_wctomb (_REENT, buf, pw, charset, &ps);
|
||||||
ASCII SO; UTF-8 representation of invalid char. */
|
if (bytes == -1 && *charset != 'U'/*TF-8*/)
|
||||||
buf[0] = 0x0e; /* ASCII SO */
|
|
||||||
bytes = __utf8_wctomb (_REENT, buf + 1, pw, charset, &ps);
|
|
||||||
if (bytes == -1)
|
|
||||||
{
|
{
|
||||||
++pwcs;
|
/* Convert chars invalid in the current codepage to a sequence
|
||||||
ps.__count = 0;
|
ASCII CAN; UTF-8 representation of invalid char. */
|
||||||
continue;
|
buf[0] = 0x18; /* ASCII CAN */
|
||||||
}
|
bytes = __utf8_wctomb (_REENT, buf + 1, pw, charset, &ps);
|
||||||
++bytes; /* Add the ASCII SO to the byte count. */
|
if (bytes == -1)
|
||||||
if (ps.__count == -4 && nwc > 0) /* First half of a surrogate pair. */
|
|
||||||
{
|
|
||||||
++pwcs;
|
|
||||||
if ((*pwcs & 0xfc00) != 0xdc00) /* Invalid second half. */
|
|
||||||
{
|
{
|
||||||
++pwcs;
|
++pwcs;
|
||||||
ps.__count = 0;
|
ps.__count = 0;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
bytes += __utf8_wctomb (_REENT, buf + bytes, *pwcs, charset, &ps);
|
++bytes; /* Add the ASCII CAN to the byte count. */
|
||||||
nwc--;
|
if (ps.__count == -4 && nwc > 0)
|
||||||
|
{
|
||||||
|
/* First half of a surrogate pair. */
|
||||||
|
++pwcs;
|
||||||
|
if ((*pwcs & 0xfc00) != 0xdc00) /* Invalid second half. */
|
||||||
|
{
|
||||||
|
++pwcs;
|
||||||
|
ps.__count = 0;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
bytes += __utf8_wctomb (_REENT, buf + bytes, *pwcs, charset,
|
||||||
|
&ps);
|
||||||
|
nwc--;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (n + bytes <= len)
|
if (n + bytes <= len)
|
||||||
|
@ -535,8 +535,8 @@ sys_wcstombs_alloc (char **dst_p, int type, const wchar_t *src, size_t nwc)
|
||||||
charset, which is the charset returned by GetConsoleCP (). Most of the
|
charset, which is the charset returned by GetConsoleCP (). Most of the
|
||||||
time this is used for box and line drawing characters. */
|
time this is used for box and line drawing characters. */
|
||||||
size_t __stdcall
|
size_t __stdcall
|
||||||
sys_cp_mbstowcs (mbtowc_p f_mbtowc, char *charset, wchar_t *dst, size_t dlen,
|
sys_cp_mbstowcs (mbtowc_p f_mbtowc, const char *charset, wchar_t *dst,
|
||||||
const char *src, size_t nms)
|
size_t dlen, const char *src, size_t nms)
|
||||||
{
|
{
|
||||||
wchar_t *ptr = dst;
|
wchar_t *ptr = dst;
|
||||||
unsigned const char *pmbs = (unsigned const char *) src;
|
unsigned const char *pmbs = (unsigned const char *) src;
|
||||||
|
@ -551,10 +551,10 @@ sys_cp_mbstowcs (mbtowc_p f_mbtowc, char *charset, wchar_t *dst, size_t dlen,
|
||||||
len = (size_t)-1;
|
len = (size_t)-1;
|
||||||
while (len > 0 && nms > 0)
|
while (len > 0 && nms > 0)
|
||||||
{
|
{
|
||||||
/* ASCII SO handling. */
|
/* ASCII CAN handling. */
|
||||||
if (*pmbs == 0x0e)
|
if (*pmbs == 0x18)
|
||||||
{
|
{
|
||||||
/* Sanity check: If this is a lead SO byte for a following UTF-8
|
/* Sanity check: If this is a lead CAN byte for a following UTF-8
|
||||||
sequence, there must be at least two more bytes left, and the
|
sequence, there must be at least two more bytes left, and the
|
||||||
next byte must be a valid UTF-8 start byte. If the charset
|
next byte must be a valid UTF-8 start byte. If the charset
|
||||||
isn't UTF-8 anyway, try to convert the following bytes as UTF-8
|
isn't UTF-8 anyway, try to convert the following bytes as UTF-8
|
||||||
|
@ -565,16 +565,16 @@ sys_cp_mbstowcs (mbtowc_p f_mbtowc, char *charset, wchar_t *dst, size_t dlen,
|
||||||
nms - 1, charset, &ps);
|
nms - 1, charset, &ps);
|
||||||
if (bytes < 0)
|
if (bytes < 0)
|
||||||
{
|
{
|
||||||
/* Invalid UTF-8 sequence? Treat the ASCII SO character as
|
/* Invalid UTF-8 sequence? Treat the ASCII CAN character as
|
||||||
stand-alone ASCII SO char. */
|
stand-alone ASCII CAN char. */
|
||||||
bytes = 1;
|
bytes = 1;
|
||||||
if (dst)
|
if (dst)
|
||||||
*ptr = 0x0e;
|
*ptr = 0x18;
|
||||||
memset (&ps, 0, sizeof ps);
|
memset (&ps, 0, sizeof ps);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
++bytes; /* Count SO byte */
|
++bytes; /* Count CAN byte */
|
||||||
if (bytes > 1 && ps.__count == 4)
|
if (bytes > 1 && ps.__count == 4)
|
||||||
{
|
{
|
||||||
/* First half of a surrogate. */
|
/* First half of a surrogate. */
|
||||||
|
@ -594,40 +594,28 @@ sys_cp_mbstowcs (mbtowc_p f_mbtowc, char *charset, wchar_t *dst, size_t dlen,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* Sequence for an invalid byte originally created in the next outer
|
/* Otherwise it's just a simple ASCII CAN. */
|
||||||
else branch below. This must be converted back to a 0xDCxx value
|
|
||||||
as well. */
|
|
||||||
else if (nms > 2 && pmbs[1] == 0xff)
|
|
||||||
{
|
|
||||||
bytes = 3;
|
|
||||||
if (dst)
|
|
||||||
*ptr = L'\xdc80' | pmbs[2];
|
|
||||||
}
|
|
||||||
/* Otherwise it's just a simple ASCII SO. */
|
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
bytes = 1;
|
bytes = 1;
|
||||||
if (dst)
|
if (dst)
|
||||||
*ptr = 0x0e;
|
*ptr = 0x18;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if ((bytes = f_mbtowc (_REENT, ptr, (const char *) pmbs, nms,
|
else if ((bytes = f_mbtowc (_REENT, ptr, (const char *) pmbs, nms,
|
||||||
charset, &ps)) < 0
|
charset, &ps)) < 0
|
||||||
&& *pmbs >= 0x80)
|
|| (bytes == 3 && pmbs[0] == 0xef && (pmbs[1] & 0xf4) == 0x80))
|
||||||
{
|
{
|
||||||
/* This should probably be handled in f_mbtowc which can operate
|
/* The technique is based on a discussion here:
|
||||||
on sequences rather than individual characters.
|
|
||||||
The technique is based on a discussion here:
|
|
||||||
|
|
||||||
http://www.mail-archive.com/linux-utf8@nl.linux.org/msg00080.html
|
http://www.mail-archive.com/linux-utf8@nl.linux.org/msg00080.html
|
||||||
|
|
||||||
This is hardly perfect. Windows doesn't do anything sensical with
|
Invalid bytes in a multibyte sequence are converted to
|
||||||
characters converted to this format. It does allow processing of
|
the private use area which is already used to store ASCII
|
||||||
src to continue, however, which, since there is no way to signal
|
chars invalid in Windows filenames. This technique allows
|
||||||
decoding errors, seems like the best we can do. */
|
to store them in a symmetric way. */
|
||||||
bytes = 1;
|
bytes = 1;
|
||||||
if (dst)
|
if (dst)
|
||||||
*ptr = L'\xdc80' | *pmbs;
|
*ptr = L'\xf000' | *pmbs;
|
||||||
memset (&ps, 0, sizeof ps);
|
memset (&ps, 0, sizeof ps);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -51,7 +51,7 @@ extern mbtowc_p __set_charset_from_codepage (unsigned int cp, char *charset);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __INSIDE_CYGWIN__
|
#ifdef __INSIDE_CYGWIN__
|
||||||
size_t __stdcall sys_cp_wcstombs (wctomb_p, char *, char *, size_t,
|
size_t __stdcall sys_cp_wcstombs (wctomb_p, const char *, char *, size_t,
|
||||||
const wchar_t *, size_t = (size_t) -1)
|
const wchar_t *, size_t = (size_t) -1)
|
||||||
__attribute__ ((regparm(3)));
|
__attribute__ ((regparm(3)));
|
||||||
size_t __stdcall sys_wcstombs (char *dst, size_t len, const wchar_t * src,
|
size_t __stdcall sys_wcstombs (char *dst, size_t len, const wchar_t * src,
|
||||||
|
@ -61,7 +61,7 @@ size_t __stdcall sys_wcstombs_alloc (char **, int, const wchar_t *,
|
||||||
size_t = (size_t) -1)
|
size_t = (size_t) -1)
|
||||||
__attribute__ ((regparm(3)));
|
__attribute__ ((regparm(3)));
|
||||||
|
|
||||||
size_t __stdcall sys_cp_mbstowcs (mbtowc_p, char *, wchar_t *, size_t,
|
size_t __stdcall sys_cp_mbstowcs (mbtowc_p, const char *, wchar_t *, size_t,
|
||||||
const char *, size_t = (size_t) -1)
|
const char *, size_t = (size_t) -1)
|
||||||
__attribute__ ((regparm(3)));
|
__attribute__ ((regparm(3)));
|
||||||
size_t __stdcall sys_mbstowcs (wchar_t * dst, size_t dlen, const char *src,
|
size_t __stdcall sys_mbstowcs (wchar_t * dst, size_t dlen, const char *src,
|
||||||
|
|
Loading…
Reference in New Issue