* miscfuncs.h (transform_chars): Declare. Define inline variation here.

* mount.cc (mount_info::from_fstab): Remove extern declaration of transform_chars. * path.cc (tfx_chars): Move to strfuncs.cc. (transform_chars): Ditto. * strfuncs.cc (tfx_chars): Moved here from path.cc. (transform_chars): Ditto. (sys_cp_wcstombs): Make UNICODE private use area conversion roundtrip safe for all characters. (sys_cp_mbstowcs): Ditto, by removing special case for UTF-8 sequences representing U+f0XX UNICODE chars. Fix typo in comment.
2009-11-02 11:42:04 +00:00 · 2009-11-02 11:42:04 +00:00 · a657970571
parent 9725900d86
commit a657970571
5 changed files with 83 additions and 68 deletions
--- a/winsup/cygwin/ChangeLog
+++ b/winsup/cygwin/ChangeLog
@ -1,3 +1,17 @@
 2009-11-02  Corinna Vinschen  <corinna@vinschen.de>
 	* miscfuncs.h (transform_chars): Declare.  Define inline variation here.
 	* mount.cc (mount_info::from_fstab): Remove extern declaration of
 	transform_chars.
 	* path.cc (tfx_chars): Move to strfuncs.cc.
 	(transform_chars): Ditto.
 	* strfuncs.cc (tfx_chars): Moved here from path.cc.
 	(transform_chars): Ditto.
 	(sys_cp_wcstombs): Make UNICODE private use area conversion roundtrip
 	safe for all characters.
 	(sys_cp_mbstowcs): Ditto, by removing special case for UTF-8 sequences
 	representing U+f0XX UNICODE chars.  Fix typo in comment.
 2009-11-02  Corinna Vinschen  <corinna@vinschen.de>
 	* path.cc (tfx_chars): Constify.
@ -362,7 +376,7 @@
 	(fhandler_console::write_normal): Always use codepage 437 for alternate
 	charset.  Otherwise always default to the current internal locale.
 	Replace ASCII SO with ASCII CAN.
-	* strfuncs.cc: Tweka comments according to below changes.
+	* strfuncs.cc: Tweak comments according to below changes.
 	(sys_cp_wcstombs): Constify charset parameter.  Convert all wchar_t
 	values in the Unicode private use area U+F0xx to the singlebyte
 	counterpart.  Drop special handling creating ASCII SO sequence from
--- a/winsup/cygwin/miscfuncs.h
+++ b/winsup/cygwin/miscfuncs.h
@ -25,6 +25,14 @@ void backslashify (const char *, char *, bool);
 void slashify (const char *, char *, bool);
 #define isslash(c) ((c) == '/')
 extern void transform_chars (PWCHAR, PWCHAR);
 inline void
 transform_chars (PUNICODE_STRING upath, USHORT start_idx)
 {
  transform_chars (upath->Buffer + start_idx,
 		   upath->Buffer + upath->Length / sizeof (WCHAR) - 1);
 }
 /* Memory checking */
 int __stdcall check_invalid_virtual_addr (const void *s, unsigned sz) __attribute__ ((regparm(2)));
--- a/winsup/cygwin/mount.cc
+++ b/winsup/cygwin/mount.cc
@ -997,7 +997,6 @@ mount_info::from_fstab (bool user, WCHAR fstab[], PWCHAR fstab_end)
  if (user)
    {
      extern void transform_chars (PWCHAR, PWCHAR);
      PWCHAR username;
      sys_mbstowcs (username = wcpcpy (fstab_end, L".d\\"),
 		    NT_MAX_PATH - (fstab_end - fstab),
--- a/winsup/cygwin/path.cc
+++ b/winsup/cygwin/path.cc
@ -395,63 +395,6 @@ path_conv::set_normalized_path (const char *path_copy)
    }
 }
 /* Transform characters invalid for Windows filenames to the Unicode private
   use area in the U+f0XX range.  The affected characters are all control
   chars 1 <= c <= 31, as well as the characters " * : < > ? |.  The backslash
   is affected as well, but we can't transform it as long as we accept Win32
   paths as input.
   The reverse functionality is in strfuncs.cc, function sys_cp_wcstombs. */
 static const WCHAR tfx_chars[] = {
            0, 0xf000 |   1, 0xf000 |   2, 0xf000 |   3,
 0xf000 |   4, 0xf000 |   5, 0xf000 |   6, 0xf000 |   7,
 0xf000 |   8, 0xf000 |   9, 0xf000 |  10, 0xf000 |  11,
 0xf000 |  12, 0xf000 |  13, 0xf000 |  14, 0xf000 |  15,
 0xf000 |  16, 0xf000 |  17, 0xf000 |  18, 0xf000 |  19,
 0xf000 |  20, 0xf000 |  21, 0xf000 |  22, 0xf000 |  23,
 0xf000 |  24, 0xf000 |  25, 0xf000 |  26, 0xf000 |  27,
 0xf000 |  28, 0xf000 |  29, 0xf000 |  30, 0xf000 |  31,
          ' ',          '!', 0xf000 | '"',          '#',
          '$',          '%',          '&',           39,
          '(',          ')', 0xf000 | '*',          '+',
          ',',          '-',          '.',          '\\',
          '0',          '1',          '2',          '3',
          '4',          '5',          '6',          '7',
          '8',          '9', 0xf000 | ':',          ';',
 0xf000 | '<',          '=', 0xf000 | '>', 0xf000 | '?',
          '@',          'A',          'B',          'C',
          'D',          'E',          'F',          'G',
          'H',          'I',          'J',          'K',
          'L',          'M',          'N',          'O',
          'P',          'Q',          'R',          'S',
          'T',          'U',          'V',          'W',
          'X',          'Y',          'Z',          '[',
          '\\',          ']',          '^',          '_',
          '`',          'a',          'b',          'c',
          'd',          'e',          'f',          'g',
          'h',          'i',          'j',          'k',
          'l',          'm',          'n',          'o',
          'p',          'q',          'r',          's',
          't',          'u',          'v',          'w',
          'x',          'y',          'z',          '{',
 0xf000 | '|',          '}',          '~',          127
 };
 void
 transform_chars (PWCHAR path, PWCHAR path_end)
 {
  for (; path <= path_end; ++path)
    if (*path < 128)
      *path = tfx_chars[*path];
 }
 static inline
 void
 transform_chars (PUNICODE_STRING upath, USHORT start_idx)
 {
  transform_chars (upath->Buffer + start_idx,
 		   upath->Buffer + upath->Length / sizeof (WCHAR) - 1);
 }
 static inline void
 str2uni_cat (UNICODE_STRING &tgt, const char *srcstr)
 {
--- a/winsup/cygwin/strfuncs.cc
+++ b/winsup/cygwin/strfuncs.cc
@ -22,6 +22,55 @@ details. */
 #include "cygheap.h"
 #include "tls_pbuf.h"
 /* Transform characters invalid for Windows filenames to the Unicode private
   use area in the U+f0XX range.  The affected characters are all control
   chars 1 <= c <= 31, as well as the characters " * : < > ? |.  The backslash
   is affected as well, but we can't transform it as long as we accept Win32
   paths as input.
   The reverse functionality is in function sys_cp_wcstombs. */
 static const WCHAR tfx_chars[] = {
            0, 0xf000 |   1, 0xf000 |   2, 0xf000 |   3,
 0xf000 |   4, 0xf000 |   5, 0xf000 |   6, 0xf000 |   7,
 0xf000 |   8, 0xf000 |   9, 0xf000 |  10, 0xf000 |  11,
 0xf000 |  12, 0xf000 |  13, 0xf000 |  14, 0xf000 |  15,
 0xf000 |  16, 0xf000 |  17, 0xf000 |  18, 0xf000 |  19,
 0xf000 |  20, 0xf000 |  21, 0xf000 |  22, 0xf000 |  23,
 0xf000 |  24, 0xf000 |  25, 0xf000 |  26, 0xf000 |  27,
 0xf000 |  28, 0xf000 |  29, 0xf000 |  30, 0xf000 |  31,
          ' ',          '!', 0xf000 | '"',          '#',
          '$',          '%',          '&',           39,
          '(',          ')', 0xf000 | '*',          '+',
          ',',          '-',          '.',          '\\',
          '0',          '1',          '2',          '3',
          '4',          '5',          '6',          '7',
          '8',          '9', 0xf000 | ':',          ';',
 0xf000 | '<',          '=', 0xf000 | '>', 0xf000 | '?',
          '@',          'A',          'B',          'C',
          'D',          'E',          'F',          'G',
          'H',          'I',          'J',          'K',
          'L',          'M',          'N',          'O',
          'P',          'Q',          'R',          'S',
          'T',          'U',          'V',          'W',
          'X',          'Y',          'Z',          '[',
          '\\',          ']',          '^',          '_',
          '`',          'a',          'b',          'c',
          'd',          'e',          'f',          'g',
          'h',          'i',          'j',          'k',
          'l',          'm',          'n',          'o',
          'p',          'q',          'r',          's',
          't',          'u',          'v',          'w',
          'x',          'y',          'z',          '{',
 0xf000 | '|',          '}',          '~',          127
 };
 void
 transform_chars (PWCHAR path, PWCHAR path_end)
 {
  for (; path <= path_end; ++path)
    if (*path < 128)
      *path = tfx_chars[*path];
 }
 /* The SJIS, JIS and eucJP conversion in newlib does not use UTF as
   wchar_t character representation.  That's unfortunate for us since
   we require UTF for the OS.  What we do here is to have our own
@ -426,14 +475,17 @@ sys_cp_wcstombs (wctomb_p f_wctomb, const char *charset, char *dst, size_t len,
    {
      wchar_t pw = *pwcs;
      int bytes;
      unsigned char cwc;
      /* Convert UNICODE private use area.  Reverse functionality for the
-         ASCII area <= 0x7f (only for path names) is transform_chars in
+         ASCII area <= 0x7f (only for path names) is transform_chars above.
-	 path.cc.  Reverse functionality for invalid bytes in a multibyte
+	 Reverse functionality for invalid bytes in a multibyte sequence is
-	 sequence is in sys_cp_mbstowcs. */
+	 in sys_cp_mbstowcs below. */
-      if ((pw & 0xff00) == 0xf000 && ((pw & 0xff) <= 0x7f || MB_CUR_MAX > 1))
+      if ((pw & 0xff00) == 0xf000
 	  && (((cwc = (pw & 0xff)) <= 0x7f && tfx_chars[cwc] >= 0xf000)
 	      || (cwc >= 0x80 && MB_CUR_MAX > 1)))
 	{
-	  buf[0] = pw & 0xff;
+	  buf[0] = (char) cwc;
 	  bytes = 1;
 	}
      else
@ -603,15 +655,14 @@ sys_cp_mbstowcs (mbtowc_p f_mbtowc, const char *charset, wchar_t *dst,
 	    }
 	}
      else if ((bytes = f_mbtowc (_REENT, ptr, (const char *) pmbs, nms,
-				  charset, &ps)) < 0
+				  charset, &ps)) < 0)
 	       || (bytes == 3 && pmbs[0] == 0xef && (pmbs[1] & 0xf4) == 0x80))
 	{
 	  /* The technique is based on a discussion here:
 	     http://www.mail-archive.com/linux-utf8@nl.linux.org/msg00080.html
 	     Invalid bytes in a multibyte sequence are converted to
 	     the private use area which is already used to store ASCII
-	     chars invalid in Windows filenames.  This techinque allows 
+	     chars invalid in Windows filenames.  This technique allows 
 	     to store them in a symmetric way. */
 	  bytes = 1;
 	  if (dst)