From af8e63023a915511a640aaa6e22c96cf9fe10ba8 Mon Sep 17 00:00:00 2001 From: Keith Marshall Date: Thu, 8 Mar 2007 23:15:58 +0000 Subject: [PATCH] Make basename and dirname functions work with path names containing multibyte character strings. --- winsup/mingw/ChangeLog | 6 ++ winsup/mingw/mingwex/basename.c | 116 +++++++++++++++++----- winsup/mingw/mingwex/dirname.c | 167 ++++++++++++++++++++++---------- 3 files changed, 210 insertions(+), 79 deletions(-) diff --git a/winsup/mingw/ChangeLog b/winsup/mingw/ChangeLog index f225db48f..b0c559ae0 100644 --- a/winsup/mingw/ChangeLog +++ b/winsup/mingw/ChangeLog @@ -1,3 +1,9 @@ +2007-03-08 Keith Marshall + + * mingwex/basename.c: Make it work with path names containing + multibyte character strings. + * mingwex/dirname.c: Likewise. + 2007-03-05 Danny Smith * include/io.h (__mingw_access): New static inline wrapper to restore diff --git a/winsup/mingw/mingwex/basename.c b/winsup/mingw/mingwex/basename.c index 768b9a0f4..c531cab58 100755 --- a/winsup/mingw/mingwex/basename.c +++ b/winsup/mingw/mingwex/basename.c @@ -20,8 +20,10 @@ */ #include +#include #include #include +#include #ifndef __cdecl /* If compiling on any non-Win32 platform ... */ #define __cdecl /* this may not be defined. */ @@ -29,61 +31,114 @@ __cdecl char *basename( char *path ) { - char *retname; - static char retfail[] = "."; + size_t len; + static char *retfail = NULL; + + /* to handle path names for files in multibyte character locales, + * we need to set up LC_CTYPE to match the host file system locale + */ + + char *locale = setlocale( LC_CTYPE, NULL ); + if( locale != NULL ) locale = strdup( locale ); + setlocale( LC_CTYPE, "" ); if( path && *path ) { - /* step over the drive designator, if present ... - * (FIXME: maybe should confirm *path is a valid drive designator). + /* allocate sufficient local storage space, + * in which to create a wide character reference copy of path */ - if( path[1] == ':' ) - path += 2; + wchar_t refcopy[1 + (len = mbstowcs( NULL, path, 0 ))]; + + /* create the wide character reference copy of path, + * and step over the drive designator, if present ... + */ + + wchar_t *refpath = refcopy; + if( ((len = mbstowcs( refpath, path, len )) > 1) && (refpath[1] == L':') ) + { + /* FIXME: maybe should confirm *refpath is a valid drive designator */ + + refpath += 2; + } + + /* ensure that our wide character reference path is NUL terminated */ + + refcopy[ len ] = L'\0'; /* check again, just to ensure we still have a non-empty path name ... */ - if( *path ) + if( *refpath ) { - /* and, when we do ... - * scan from left to right, to the char after the final dir separator + /* and, when we do, process it in the wide character domain ... + * scanning from left to right, to the char after the final dir separator */ - for( retname = path ; *path ; ++path ) + wchar_t *refname; + for( refname = refpath ; *refpath ; ++refpath ) { - if( (*path == '/') || (*path == '\\') ) + if( (*refpath == L'/') || (*refpath == L'\\') ) { /* we found a dir separator ... * step over it, and any others which immediately follow it */ - while( (*path == '/') || (*path == '\\') ) - ++path; + while( (*refpath == L'/') || (*refpath == L'\\') ) + ++refpath; /* if we didn't reach the end of the path string ... */ - if( *path ) + if( *refpath ) /* then we have a new candidate for the base name */ - retname = path; + refname = refpath; /* otherwise ... * strip off any trailing dir separators which we found */ - else while( (path > retname) && ((*--path == '/') || (*path == '\\')) ) - *path = '\0'; + else while( (refpath > refname) + && ((*--refpath == L'/') || (*refpath == L'\\')) ) + *refpath = L'\0'; } } - /* retname now points at the resolved base name ... - * if it's not empty, then we return it as it is, otherwise ... - * we must have had only dir separators in the original path name, - * so we return "/". + /* in the wide character domain ... + * refname now points at the resolved base name ... */ - return *retname ? retname : strcpy( retfail, "/" ); + if( *refname ) + { + /* if it's not empty, + * then we transform the full normalised path back into + * the multibyte character domain, and skip over the dirname, + * to return the resolved basename. + */ + + if( (len = wcstombs( path, refcopy, len )) != (size_t)(-1) ) + path[ len ] = '\0'; + *refname = L'\0'; + if( (len = wcstombs( NULL, refcopy, 0 )) != (size_t)(-1) ) + path += len; + } + + else + { + /* the basename is empty, so return the default value of "/", + * transforming from wide char to multibyte char domain, and + * returning it in our own buffer. + */ + + retfail = realloc( retfail, len = 1 + wcstombs( NULL, L"/", 0 )); + wcstombs( path = retfail, L"/", len ); + } + + /* restore the caller's locale, clean up, and return the result */ + + setlocale( LC_CTYPE, locale ); + free( locale ); + return( path ); } /* or we had an empty residual path name, after the drive designator, @@ -93,11 +148,20 @@ __cdecl char *basename( char *path ) /* and, if we get to here ... * the path name is either NULL, or it decomposes to an empty string; - * in either case, we return the default value of "." in our static buffer, - * (but strcpy it, just in case the caller trashed it after a previous call). + * in either case, we return the default value of "." in our own buffer, + * reloading it with the correct value, transformed from the wide char + * to the multibyte char domain, just in case the caller trashed it + * after a previous call. */ - return strcpy( retfail, "." ); + retfail = realloc( retfail, len = 1 + wcstombs( NULL, L".", 0 )); + wcstombs( retfail, L".", len ); + + /* restore the caller's locale, clean up, and return the result */ + + setlocale( LC_CTYPE, locale ); + free( locale ); + return( retfail ); } -/* $RCSfile$: end of file */ +/* $RCSfile$$Revision$: end of file */ diff --git a/winsup/mingw/mingwex/dirname.c b/winsup/mingw/mingwex/dirname.c index 44256b19c..20d7a30bb 100755 --- a/winsup/mingw/mingwex/dirname.c +++ b/winsup/mingw/mingwex/dirname.c @@ -20,8 +20,10 @@ */ #include +#include #include #include +#include #ifndef __cdecl /* If compiling on any non-Win32 platform ... */ #define __cdecl /* this may not be defined. */ @@ -29,12 +31,30 @@ __cdecl char *dirname( char *path ) { - static char retfail[] = "?:."; - char *retname, *basename, *copyptr = retfail; + size_t len; + static char *retfail = NULL; + + /* to handle path names for files in multibyte character locales, + * we need to set up LC_CTYPE to match the host file system locale. + */ + + char *locale = setlocale( LC_CTYPE, NULL ); + if( locale != NULL ) locale = strdup( locale ); + setlocale( LC_CTYPE, "" ); if( path && *path ) { - retname = path; + /* allocate sufficient local storage space, + * in which to create a wide character reference copy of path + */ + + wchar_t refcopy[1 + (len = mbstowcs( NULL, path, 0 ))]; + + /* create the wide character reference copy of path */ + + wchar_t *refpath = refcopy; + len = mbstowcs( refpath, path, len ); + refcopy[ len ] = L'\0'; /* SUSv3 identifies a special case, where path is exactly equal to "//"; * (we will also accept "\\" in the Win32 context, but not "/\" or "\/", @@ -44,48 +64,55 @@ __cdecl char *dirname( char *path ) * simply return the path unchanged, (i.e. "//" or "\\"). */ - if( (*path == '/') || (*path == '\\') ) + if( (len > 1) && ((refpath[0] == L'/') || (refpath[0] == L'\\')) ) { - if( (path[1] == *retname) && (path[2] == '\0') ) - return retname; + if( (refpath[1] == refpath[0]) && (refpath[2] == L'\0') ) + { + setlocale( LC_CTYPE, locale ); + free( locale ); + return path; + } } /* For all other cases ... - * step over the drive designator, if present, copying it to retfail ... - * (FIXME: maybe should confirm *path is a valid drive designator). + * step over the drive designator, if present ... */ - else if( *path && (path[1] == ':') ) + else if( (len > 1) && (refpath[1] == L':') ) { - *copyptr++ = *path++; - *copyptr++ = *path++; + /* FIXME: maybe should confirm *refpath is a valid drive designator */ + + refpath += 2; } - if( *path ) + /* check again, just to ensure we still have a non-empty path name ... */ + + if( *refpath ) { /* reproduce the scanning logic of the "basename" function * to locate the basename component of the current path string, * (but also remember where the dirname component starts). */ - for( retname = basename = path ; *path ; ++path ) + wchar_t *refname, *basename; + for( refname = basename = refpath ; *refpath ; ++refpath ) { - if( (*path == '/') || (*path == '\\') ) + if( (*refpath == L'/') || (*refpath == L'\\') ) { /* we found a dir separator ... * step over it, and any others which immediately follow it */ - while( (*path == '/') || (*path == '\\') ) - ++path; + while( (*refpath == L'/') || (*refpath == L'\\') ) + ++refpath; /* if we didn't reach the end of the path string ... */ - if( *path ) + if( *refpath ) /* then we have a new candidate for the base name */ - basename = path; + basename = refpath; else @@ -102,73 +129,107 @@ __cdecl char *dirname( char *path ) * to confirm that we have distinct dirname and basename components */ - if( basename > retname ) + if( basename > refname ) { /* and, when we do ... * backtrack over all trailing separators on the dirname component, * (but preserve exactly two initial dirname separators, if identical), - * and add a NULL terminator in their place. + * and add a NUL terminator in their place. */ - --basename; - while( (basename > retname) && ((*basename == '/') || (*basename == '\\')) ) - --basename; - if( (basename == retname) && ((*retname == '/') || (*retname == '\\')) - && (retname[1] == *retname) && (retname[2] != '/') && (retname[2] != '\\') ) + do --basename; + while( (basename > refname) && ((*basename == L'/') || (*basename == L'\\')) ); + if( (basename == refname) && ((refname[0] == L'/') || (refname[0] == L'\\')) + && (refname[1] == refname[0]) && (refname[2] != L'/') && (refname[2] != L'\\') ) ++basename; - *++basename = '\0'; - - /* adjust the start point of the dirname, - * to accommodate the Win32 drive designator, if it was present. - */ - - if( copyptr > retfail ) - retname -= 2; + *++basename = L'\0'; /* if the resultant dirname begins with EXACTLY two dir separators, * AND both are identical, then we preserve them. */ - path = copyptr = retname; - while( ((*path == '/') || (*path == '\\')) ) - ++path; - if( ((path - retname) == 2) && (*++copyptr == *retname) ) - ++copyptr; + refpath = refcopy; + while( ((*refpath == L'/') || (*refpath == L'\\')) ) + ++refpath; + if( ((refpath - refcopy) > 2) || (refcopy[1] != refcopy[0]) ) + refpath = refcopy; /* and finally ... * we remove any residual, redundantly duplicated separators from the dirname, * reterminate, and return it. */ - path = copyptr; - while( *path ) + refname = refpath; + while( *refpath ) { - if( ((*copyptr++ = *path) == '/') || (*path++ == '\\') ) + if( ((*refname++ = *refpath) == L'/') || (*refpath++ == L'\\') ) { - while( (*path == '/') || (*path == '\\') ) - ++path; + while( (*refpath == L'/') || (*refpath == L'\\') ) + ++refpath; } } - *copyptr = '\0'; - return retname; + *refname = L'\0'; + + /* finally ... + * transform the resolved dirname back into the multibyte char domain, + * restore the caller's locale, and return the resultant dirname + */ + + if( (len = wcstombs( path, refcopy, len )) != (size_t)(-1) ) + path[ len ] = '\0'; } - else if( (*retname == '/') || (*retname == '\\') ) + else { - *copyptr++ = *retname; - *copyptr = '\0'; - return retfail; + /* either there were no dirname separators in the path name, + * or there was nothing else ... + */ + + if( (*refname == L'/') || (*refname == L'\\') ) + { + /* it was all separators, so return one */ + + ++refname; + } + + else + { + /* there were no separators, so return '.' */ + + *refname++ = L'.'; + } + + /* add a NUL terminator, in either case, + * then transform to the multibyte char domain, + * using our own buffer + */ + + *refname = L'\0'; + retfail = realloc( retfail, len = 1 + wcstombs( NULL, refcopy, 0 )); + wcstombs( path = retfail, refcopy, len ); } + + /* restore caller's locale, clean up, and return the resolved dirname */ + + setlocale( LC_CTYPE, locale ); + free( locale ); + return path; } } /* path is NULL, or an empty string; default return value is "." ... - * return this in our own static buffer, but strcpy it, just in case - * the caller trashed it after a previous call. + * return this in our own buffer, regenerated by wide char transform, + * in case the caller trashed it after a previous call. */ - strcpy( copyptr, "." ); + retfail = realloc( retfail, len = 1 + wcstombs( NULL, L".", 0 )); + wcstombs( retfail, L".", len ); + + /* restore caller's locale, clean up, and return the default dirname */ + + setlocale( LC_CTYPE, locale ); + free( locale ); return retfail; } -/* $RCSfile$: end of file */ +/* $RCSfile$$Revision$: end of file */