* miscfuncs.h (transform_chars): Declare. Define inline variation here.
* mount.cc (mount_info::from_fstab): Remove extern declaration of transform_chars. * path.cc (tfx_chars): Move to strfuncs.cc. (transform_chars): Ditto. * strfuncs.cc (tfx_chars): Moved here from path.cc. (transform_chars): Ditto. (sys_cp_wcstombs): Make UNICODE private use area conversion roundtrip safe for all characters. (sys_cp_mbstowcs): Ditto, by removing special case for UTF-8 sequences representing U+f0XX UNICODE chars. Fix typo in comment.
This commit is contained in:
parent
9725900d86
commit
a657970571
|
@ -1,3 +1,17 @@
|
||||||
|
2009-11-02 Corinna Vinschen <corinna@vinschen.de>
|
||||||
|
|
||||||
|
* miscfuncs.h (transform_chars): Declare. Define inline variation here.
|
||||||
|
* mount.cc (mount_info::from_fstab): Remove extern declaration of
|
||||||
|
transform_chars.
|
||||||
|
* path.cc (tfx_chars): Move to strfuncs.cc.
|
||||||
|
(transform_chars): Ditto.
|
||||||
|
* strfuncs.cc (tfx_chars): Moved here from path.cc.
|
||||||
|
(transform_chars): Ditto.
|
||||||
|
(sys_cp_wcstombs): Make UNICODE private use area conversion roundtrip
|
||||||
|
safe for all characters.
|
||||||
|
(sys_cp_mbstowcs): Ditto, by removing special case for UTF-8 sequences
|
||||||
|
representing U+f0XX UNICODE chars. Fix typo in comment.
|
||||||
|
|
||||||
2009-11-02 Corinna Vinschen <corinna@vinschen.de>
|
2009-11-02 Corinna Vinschen <corinna@vinschen.de>
|
||||||
|
|
||||||
* path.cc (tfx_chars): Constify.
|
* path.cc (tfx_chars): Constify.
|
||||||
|
@ -362,7 +376,7 @@
|
||||||
(fhandler_console::write_normal): Always use codepage 437 for alternate
|
(fhandler_console::write_normal): Always use codepage 437 for alternate
|
||||||
charset. Otherwise always default to the current internal locale.
|
charset. Otherwise always default to the current internal locale.
|
||||||
Replace ASCII SO with ASCII CAN.
|
Replace ASCII SO with ASCII CAN.
|
||||||
* strfuncs.cc: Tweka comments according to below changes.
|
* strfuncs.cc: Tweak comments according to below changes.
|
||||||
(sys_cp_wcstombs): Constify charset parameter. Convert all wchar_t
|
(sys_cp_wcstombs): Constify charset parameter. Convert all wchar_t
|
||||||
values in the Unicode private use area U+F0xx to the singlebyte
|
values in the Unicode private use area U+F0xx to the singlebyte
|
||||||
counterpart. Drop special handling creating ASCII SO sequence from
|
counterpart. Drop special handling creating ASCII SO sequence from
|
||||||
|
|
|
@ -25,6 +25,14 @@ void backslashify (const char *, char *, bool);
|
||||||
void slashify (const char *, char *, bool);
|
void slashify (const char *, char *, bool);
|
||||||
#define isslash(c) ((c) == '/')
|
#define isslash(c) ((c) == '/')
|
||||||
|
|
||||||
|
extern void transform_chars (PWCHAR, PWCHAR);
|
||||||
|
inline void
|
||||||
|
transform_chars (PUNICODE_STRING upath, USHORT start_idx)
|
||||||
|
{
|
||||||
|
transform_chars (upath->Buffer + start_idx,
|
||||||
|
upath->Buffer + upath->Length / sizeof (WCHAR) - 1);
|
||||||
|
}
|
||||||
|
|
||||||
/* Memory checking */
|
/* Memory checking */
|
||||||
int __stdcall check_invalid_virtual_addr (const void *s, unsigned sz) __attribute__ ((regparm(2)));
|
int __stdcall check_invalid_virtual_addr (const void *s, unsigned sz) __attribute__ ((regparm(2)));
|
||||||
|
|
||||||
|
|
|
@ -997,7 +997,6 @@ mount_info::from_fstab (bool user, WCHAR fstab[], PWCHAR fstab_end)
|
||||||
|
|
||||||
if (user)
|
if (user)
|
||||||
{
|
{
|
||||||
extern void transform_chars (PWCHAR, PWCHAR);
|
|
||||||
PWCHAR username;
|
PWCHAR username;
|
||||||
sys_mbstowcs (username = wcpcpy (fstab_end, L".d\\"),
|
sys_mbstowcs (username = wcpcpy (fstab_end, L".d\\"),
|
||||||
NT_MAX_PATH - (fstab_end - fstab),
|
NT_MAX_PATH - (fstab_end - fstab),
|
||||||
|
|
|
@ -395,63 +395,6 @@ path_conv::set_normalized_path (const char *path_copy)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Transform characters invalid for Windows filenames to the Unicode private
|
|
||||||
use area in the U+f0XX range. The affected characters are all control
|
|
||||||
chars 1 <= c <= 31, as well as the characters " * : < > ? |. The backslash
|
|
||||||
is affected as well, but we can't transform it as long as we accept Win32
|
|
||||||
paths as input.
|
|
||||||
The reverse functionality is in strfuncs.cc, function sys_cp_wcstombs. */
|
|
||||||
static const WCHAR tfx_chars[] = {
|
|
||||||
0, 0xf000 | 1, 0xf000 | 2, 0xf000 | 3,
|
|
||||||
0xf000 | 4, 0xf000 | 5, 0xf000 | 6, 0xf000 | 7,
|
|
||||||
0xf000 | 8, 0xf000 | 9, 0xf000 | 10, 0xf000 | 11,
|
|
||||||
0xf000 | 12, 0xf000 | 13, 0xf000 | 14, 0xf000 | 15,
|
|
||||||
0xf000 | 16, 0xf000 | 17, 0xf000 | 18, 0xf000 | 19,
|
|
||||||
0xf000 | 20, 0xf000 | 21, 0xf000 | 22, 0xf000 | 23,
|
|
||||||
0xf000 | 24, 0xf000 | 25, 0xf000 | 26, 0xf000 | 27,
|
|
||||||
0xf000 | 28, 0xf000 | 29, 0xf000 | 30, 0xf000 | 31,
|
|
||||||
' ', '!', 0xf000 | '"', '#',
|
|
||||||
'$', '%', '&', 39,
|
|
||||||
'(', ')', 0xf000 | '*', '+',
|
|
||||||
',', '-', '.', '\\',
|
|
||||||
'0', '1', '2', '3',
|
|
||||||
'4', '5', '6', '7',
|
|
||||||
'8', '9', 0xf000 | ':', ';',
|
|
||||||
0xf000 | '<', '=', 0xf000 | '>', 0xf000 | '?',
|
|
||||||
'@', 'A', 'B', 'C',
|
|
||||||
'D', 'E', 'F', 'G',
|
|
||||||
'H', 'I', 'J', 'K',
|
|
||||||
'L', 'M', 'N', 'O',
|
|
||||||
'P', 'Q', 'R', 'S',
|
|
||||||
'T', 'U', 'V', 'W',
|
|
||||||
'X', 'Y', 'Z', '[',
|
|
||||||
'\\', ']', '^', '_',
|
|
||||||
'`', 'a', 'b', 'c',
|
|
||||||
'd', 'e', 'f', 'g',
|
|
||||||
'h', 'i', 'j', 'k',
|
|
||||||
'l', 'm', 'n', 'o',
|
|
||||||
'p', 'q', 'r', 's',
|
|
||||||
't', 'u', 'v', 'w',
|
|
||||||
'x', 'y', 'z', '{',
|
|
||||||
0xf000 | '|', '}', '~', 127
|
|
||||||
};
|
|
||||||
|
|
||||||
void
|
|
||||||
transform_chars (PWCHAR path, PWCHAR path_end)
|
|
||||||
{
|
|
||||||
for (; path <= path_end; ++path)
|
|
||||||
if (*path < 128)
|
|
||||||
*path = tfx_chars[*path];
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline
|
|
||||||
void
|
|
||||||
transform_chars (PUNICODE_STRING upath, USHORT start_idx)
|
|
||||||
{
|
|
||||||
transform_chars (upath->Buffer + start_idx,
|
|
||||||
upath->Buffer + upath->Length / sizeof (WCHAR) - 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline void
|
static inline void
|
||||||
str2uni_cat (UNICODE_STRING &tgt, const char *srcstr)
|
str2uni_cat (UNICODE_STRING &tgt, const char *srcstr)
|
||||||
{
|
{
|
||||||
|
|
|
@ -22,6 +22,55 @@ details. */
|
||||||
#include "cygheap.h"
|
#include "cygheap.h"
|
||||||
#include "tls_pbuf.h"
|
#include "tls_pbuf.h"
|
||||||
|
|
||||||
|
/* Transform characters invalid for Windows filenames to the Unicode private
|
||||||
|
use area in the U+f0XX range. The affected characters are all control
|
||||||
|
chars 1 <= c <= 31, as well as the characters " * : < > ? |. The backslash
|
||||||
|
is affected as well, but we can't transform it as long as we accept Win32
|
||||||
|
paths as input.
|
||||||
|
The reverse functionality is in function sys_cp_wcstombs. */
|
||||||
|
static const WCHAR tfx_chars[] = {
|
||||||
|
0, 0xf000 | 1, 0xf000 | 2, 0xf000 | 3,
|
||||||
|
0xf000 | 4, 0xf000 | 5, 0xf000 | 6, 0xf000 | 7,
|
||||||
|
0xf000 | 8, 0xf000 | 9, 0xf000 | 10, 0xf000 | 11,
|
||||||
|
0xf000 | 12, 0xf000 | 13, 0xf000 | 14, 0xf000 | 15,
|
||||||
|
0xf000 | 16, 0xf000 | 17, 0xf000 | 18, 0xf000 | 19,
|
||||||
|
0xf000 | 20, 0xf000 | 21, 0xf000 | 22, 0xf000 | 23,
|
||||||
|
0xf000 | 24, 0xf000 | 25, 0xf000 | 26, 0xf000 | 27,
|
||||||
|
0xf000 | 28, 0xf000 | 29, 0xf000 | 30, 0xf000 | 31,
|
||||||
|
' ', '!', 0xf000 | '"', '#',
|
||||||
|
'$', '%', '&', 39,
|
||||||
|
'(', ')', 0xf000 | '*', '+',
|
||||||
|
',', '-', '.', '\\',
|
||||||
|
'0', '1', '2', '3',
|
||||||
|
'4', '5', '6', '7',
|
||||||
|
'8', '9', 0xf000 | ':', ';',
|
||||||
|
0xf000 | '<', '=', 0xf000 | '>', 0xf000 | '?',
|
||||||
|
'@', 'A', 'B', 'C',
|
||||||
|
'D', 'E', 'F', 'G',
|
||||||
|
'H', 'I', 'J', 'K',
|
||||||
|
'L', 'M', 'N', 'O',
|
||||||
|
'P', 'Q', 'R', 'S',
|
||||||
|
'T', 'U', 'V', 'W',
|
||||||
|
'X', 'Y', 'Z', '[',
|
||||||
|
'\\', ']', '^', '_',
|
||||||
|
'`', 'a', 'b', 'c',
|
||||||
|
'd', 'e', 'f', 'g',
|
||||||
|
'h', 'i', 'j', 'k',
|
||||||
|
'l', 'm', 'n', 'o',
|
||||||
|
'p', 'q', 'r', 's',
|
||||||
|
't', 'u', 'v', 'w',
|
||||||
|
'x', 'y', 'z', '{',
|
||||||
|
0xf000 | '|', '}', '~', 127
|
||||||
|
};
|
||||||
|
|
||||||
|
void
|
||||||
|
transform_chars (PWCHAR path, PWCHAR path_end)
|
||||||
|
{
|
||||||
|
for (; path <= path_end; ++path)
|
||||||
|
if (*path < 128)
|
||||||
|
*path = tfx_chars[*path];
|
||||||
|
}
|
||||||
|
|
||||||
/* The SJIS, JIS and eucJP conversion in newlib does not use UTF as
|
/* The SJIS, JIS and eucJP conversion in newlib does not use UTF as
|
||||||
wchar_t character representation. That's unfortunate for us since
|
wchar_t character representation. That's unfortunate for us since
|
||||||
we require UTF for the OS. What we do here is to have our own
|
we require UTF for the OS. What we do here is to have our own
|
||||||
|
@ -426,14 +475,17 @@ sys_cp_wcstombs (wctomb_p f_wctomb, const char *charset, char *dst, size_t len,
|
||||||
{
|
{
|
||||||
wchar_t pw = *pwcs;
|
wchar_t pw = *pwcs;
|
||||||
int bytes;
|
int bytes;
|
||||||
|
unsigned char cwc;
|
||||||
|
|
||||||
/* Convert UNICODE private use area. Reverse functionality for the
|
/* Convert UNICODE private use area. Reverse functionality for the
|
||||||
ASCII area <= 0x7f (only for path names) is transform_chars in
|
ASCII area <= 0x7f (only for path names) is transform_chars above.
|
||||||
path.cc. Reverse functionality for invalid bytes in a multibyte
|
Reverse functionality for invalid bytes in a multibyte sequence is
|
||||||
sequence is in sys_cp_mbstowcs. */
|
in sys_cp_mbstowcs below. */
|
||||||
if ((pw & 0xff00) == 0xf000 && ((pw & 0xff) <= 0x7f || MB_CUR_MAX > 1))
|
if ((pw & 0xff00) == 0xf000
|
||||||
|
&& (((cwc = (pw & 0xff)) <= 0x7f && tfx_chars[cwc] >= 0xf000)
|
||||||
|
|| (cwc >= 0x80 && MB_CUR_MAX > 1)))
|
||||||
{
|
{
|
||||||
buf[0] = pw & 0xff;
|
buf[0] = (char) cwc;
|
||||||
bytes = 1;
|
bytes = 1;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -603,15 +655,14 @@ sys_cp_mbstowcs (mbtowc_p f_mbtowc, const char *charset, wchar_t *dst,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if ((bytes = f_mbtowc (_REENT, ptr, (const char *) pmbs, nms,
|
else if ((bytes = f_mbtowc (_REENT, ptr, (const char *) pmbs, nms,
|
||||||
charset, &ps)) < 0
|
charset, &ps)) < 0)
|
||||||
|| (bytes == 3 && pmbs[0] == 0xef && (pmbs[1] & 0xf4) == 0x80))
|
|
||||||
{
|
{
|
||||||
/* The technique is based on a discussion here:
|
/* The technique is based on a discussion here:
|
||||||
http://www.mail-archive.com/linux-utf8@nl.linux.org/msg00080.html
|
http://www.mail-archive.com/linux-utf8@nl.linux.org/msg00080.html
|
||||||
|
|
||||||
Invalid bytes in a multibyte sequence are converted to
|
Invalid bytes in a multibyte sequence are converted to
|
||||||
the private use area which is already used to store ASCII
|
the private use area which is already used to store ASCII
|
||||||
chars invalid in Windows filenames. This techinque allows
|
chars invalid in Windows filenames. This technique allows
|
||||||
to store them in a symmetric way. */
|
to store them in a symmetric way. */
|
||||||
bytes = 1;
|
bytes = 1;
|
||||||
if (dst)
|
if (dst)
|
||||||
|
|
Loading…
Reference in New Issue