diff --git a/newlib/libc/ctype/mkcaseconv b/newlib/libc/ctype/mkcaseconv
new file mode 100755
index 000000000..ab0571d8f
--- /dev/null
+++ b/newlib/libc/ctype/mkcaseconv
@@ -0,0 +1,142 @@
+#! /bin/sh -f
+
+# generate a table for Unicode case conversion; entries:
+# struct caseconv_entry defined in towctrans_l.c
+#
+# input:  UnicodeData.txt (current directory, else the system UCD copy)
+# output: caseconv.t (current directory)
+
+if [ -r UnicodeData.txt ]
+then UnicodeData=UnicodeData.txt
+elif [ -r /usr/share/unicode/ucd/UnicodeData.txt ]
+then UnicodeData=/usr/share/unicode/ucd/UnicodeData.txt
+else echo UnicodeData.txt not found >&2
+  exit 1
+fi
+
+LC_ALL=C
+export LC_ALL
+
+# true: emit run-length compressed {first, last - first, mode, delta} entries;
+# otherwise emit one {code, upper - code, lower - code} entry per character
+compact=true
+
+#0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
+#0061;LATIN SMALL LETTER A;Ll;0;L;;;;;N;;;0041;;0041
+#0130;LATIN CAPITAL LETTER I WITH DOT ABOVE;Lu;0;L;0049 0307;;;;N;LATIN CAPITAL LETTER I DOT;;;0069;
+#01C4;LATIN CAPITAL LETTER DZ WITH CARON;Lu;0;L; 0044 017D;;;;N;LATIN CAPITAL LETTER D Z HACEK;;;01C6;01C5
+#01C5;LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON;Lt;0;L; 0044 017E;;;;N;LATIN LETTER CAPITAL D SMALL Z HACEK;;01C4;01C6;01C5
+#01C6;LATIN SMALL LETTER DZ WITH CARON;Ll;0;L; 0064 017E;;;;N;LATIN SMALL LETTER D Z HACEK;;01C4;;01C5
+
+# strip CRs, keep only the records that have at least one case mapping
+# (fields 13-15: upper, lower, title) and rewrite them as "src ..." calls
+tr -d '\015' < "$UnicodeData" |
+sed \
+-e 's,^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;][^;]*\);\([^;]*\);\([^;]*\)$,src \1 upper "\2" lower "\3" title "\4",' \
+-e t \
+-e 's,^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;]*\);\([^;][^;]*\);\([^;]*\)$,src \1 upper "\2" lower "\3" title "\4",' \
+-e t \
+-e 's,^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;]*\);\([^;]*\);\([^;][^;]*\)$,src \1 upper "\2" lower "\3" title "\4",' \
+-e t \
+-e d |
+(#src 01C5 upper "01C4" lower "01C6" title "01C5"
+if $compact
+then
+  # stage 1: prepend the src() definition to the stream and let sh run it,
+  # turning every record into " 0xCODE MODE ARG" (or a '#error' marker);
+  # uniq --group (a GNU coreutils extension) then separates runs of equal
+  # "MODE ARG" by an empty line, which sed maps to range / item calls;
+  # stage 2: prepend range()/item() and let sh fold each run of consecutive
+  # code points into one table entry (at most max+1 code points per entry)
+  (
+  cat <<\/EOS
+  src () {
+    if [ -n "$3" ]
+    then tohi=$(( 0x0$3 - 0x0$1 ))
+    else tohi=0
+    fi
+    if [ -n "$5" ]
+    then tolo=$(( 0x0$5 - 0x0$1 ))
+    else tolo=0
+    fi
+    case "$tolo.$tohi" in
+    0.0) true;;
+    0.*)
+      case "$1.$tohi" in
+      *[02468ACE].1) echo "'#error' U+$1 ODDSML";;
+      *[02468ACE].-1) echo " 0x$1 TO1 ODDCAP";;
+      *[13579BDF].1) echo "'#error' U+$1 EVENSML";;
+      *[13579BDF].-1) echo " 0x$1 TO1 EVENCAP";;
+      *) echo " 0x$1 TOUP $tohi";;
+      esac;;
+    *.0)
+      case "$1.$tolo" in
+      *[02468ACE].1) echo " 0x$1 TO1 EVENCAP";;
+      *[02468ACE].-1) echo "'#error' U+$1 EVENSML";;
+      *[13579BDF].1) echo " 0x$1 TO1 ODDCAP";;
+      *[13579BDF].-1) echo "'#error' U+$1 ODDSML";;
+      *) echo " 0x$1 TOLO $tolo";;
+      esac;;
+    *) case "$tolo.$tohi" in
+      1.-1) echo " 0x$1 TOBOTH 0";;
+      *) echo "'#error' U+$1";;
+      esac;;
+    esac
+  }
+/EOS
+  cat
+  ) | sh |
+  uniq -f1 --group=append | sed -e "s,^$,range," -e t -e "s,^,item ," |
+  (
+  cat <<\/EOS
+  first=
+  diff=-1
+  max=255
+  range () {
+    # $diff == $(($last - $first))
+    if [ "$diff" -ge 0 ]
+    then # we have items at all
+      echo " {$first, $diff, $v2, $v3},"
+    fi
+    first=
+    diff=-1
+  }
+  item () {
+    # compare with =, not ==: not every sh accepts the latter in test
+    if [ "$1" = "#error" ]
+    then echo "$*"
+      return
+    fi
+
+    if [ $diff -eq $max ]
+    then range
+    elif [ -n "$first" ]
+    then if [ $(( $1 )) -ne $(( ${last-0} + 1 )) ]
+      then range
+      fi
+    fi
+
+    if [ -z "$first" ]
+    then first=$1
+      v2=$2
+      v3=$3
+    fi
+
+    last=$1
+    diff=$(( $diff + 1 ))
+  }
+/EOS
+  cat
+  ) | sh
+elif false
+then
+  sed -e 's/src \([^ ]*\) upper "\([^ ]*\)" lower "\([^ ]*\)" title "\([^ ]*\)"/ {0x\1, 0x\2 - 0x\1, 0x\3 - 0x\1},/' \
+      -e 's/0x - 0x[^ ,}]*/0/g' -e 's/0x}/0}/' \
+      -e 's/\(0x[0-9A-F][0-9A-F]*\) - \(0x[0-9A-F][0-9A-F]*\)/$((`printf %d \1` - `printf %d \2`))/g' \
+      -e 's/^/echo "/' -e 's/$/"/' |
+  sh
+else
+  sed -e 's/src \([^ ]*\) upper "\([^ ]*\)" lower "\([^ ]*\)" title "\([^ ]*\)"/ {0x\1, 0x\2 - 0x\1, 0x\3 - 0x\1},/' \
+      -e 's/0x - 0x[^ ,}]*/0/g' -e 's/0x}/0}/'
+fi
+) > caseconv.t
diff --git a/newlib/libc/ctype/mkcategories b/newlib/libc/ctype/mkcategories
new file mode 100755
index 000000000..24dd93ad0
--- /dev/null
+++ b/newlib/libc/ctype/mkcategories
@@ -0,0 +1,72 @@
+#! /bin/sh
+
+# generate table of Unicode character category ranges;
+# note: undefined characters between two characters of the same category
+# are associated to the same category, e.g.
+#0A0A;GURMUKHI LETTER UU;Lo
+#0A0B..0A0E -> Lo
+#0A0F;GURMUKHI LETTER EE;Lo
+#
+# input:  UnicodeData.txt (current directory, else the system UCD copy)
+# output: categories.t and categories.cat (current directory)
+
+if [ -r UnicodeData.txt ]
+then UnicodeData=UnicodeData.txt
+elif [ -r /usr/share/unicode/ucd/UnicodeData.txt ]
+then UnicodeData=/usr/share/unicode/ucd/UnicodeData.txt
+else echo UnicodeData.txt not found >&2
+  exit 1
+fi
+
+# the code assumes foldall=false, foldcase=true
+foldall=false
+foldcase=true
+
+(
+cat <<\/EOS
+first=
+item () {
+  if [ -n "$first" ]
+  then if [ $(( 0x$1 )) -ne $(( 0x${last-0} + 1 )) ]
+    then range
+    fi
+  fi
+
+  if [ -z "$first" ]
+  then first=$1
+    val=$2
+  fi
+
+  last=$1
+}
+range () {
+# echo " {0x$first, 0x$last, CAT_$val},"
+# echo " {0x$first, $((0x$last - 0x$first)), CAT_$val},"
+# echo " {0x$first | (CAT_$val << 24), $((0x$last - 0x$first))},"
+  echo " {CAT_$val, 0x$first, $((0x$last - 0x$first))},"
+  first=
+}
+/EOS
+
+cat "$UnicodeData" |
+if $foldall
+then sed -e "s,;L[lu];,;LC;," -e "s,;C[fs];,;Cfs;," \
+  -e "s,;L[mo];,;Lmo;," -e "s,;Nl;,;Lmo;," \
+  -e "s,;P.;,;P;," -e "s,;No;,;P;," \
+  -e "s,;S.;,;S;," -e "s,;Z[lp];,;Zlp;," \
+  -e "s,;C[no];,;X;," -e "s,;M[cen];,;M;,"
+elif $foldcase
+then
+# fold Lu/Ll to LC only if lower/upper conversion is available
+  sed -e '/^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;]*\);\([^;][^;]*\);.*/ s/;Lu;/;LC;/' \
+      -e '/^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;][^;]*\);\([^;]*\);.*/ s/;Ll;/;LC;/' \
+      -e '/;Co;/ d'
+else cat
+fi |
+sed -e "s,^\([^;]*\);[^;]*;\([^;]*\);.*,\1 \2," |
+uniq -f1 --group=append | sed -e "s,^$,range," -e t -e "s,^,item ,"
+) | sh > categories.t
+
+sed -e "s/.*\(CAT_[A-Za-z]*\).*/ \1,/" categories.t |
+sort | uniq > categories.cat
+
diff --git a/newlib/libc/ctype/mkunidata b/newlib/libc/ctype/mkunidata
new file mode 100755
index 000000000..ea18e6759
--- /dev/null
+++ b/newlib/libc/ctype/mkunidata
@@ -0,0 +1,45 @@
+#! /bin/sh
+
+# regenerate the Unicode tables for newlib/libc/ctype;
+# with option -u, download UnicodeData.txt from unicode.org first
+
+echo generating Unicode character properties data for newlib/libc/ctype
+
+cd "$(dirname "$0")" || exit 9
+
+#############################################################################
+# checks and (with option -u) download
+
+case "$1" in
+-u)
+  # alternative: wget -N -t 1 --timeout=55 (drop the curl-only -z below)
+  WGET="curl -R -O --connect-timeout 55"
+
+  echo downloading data from unicode.org
+  for data in UnicodeData.txt
+  do
+    # -z FILE: only fetch when the remote copy is newer than the local one;
+    # $WGET is left unquoted on purpose so that it splits into words
+    $WGET -z "$data" "http://unicode.org/Public/UNIDATA/$data"
+  done
+  ;;
+*) echo checking package unicode-ucd
+  grep unicode-ucd /etc/setup/installed.db || exit 9
+  ;;
+esac
+
+for data in UnicodeData.txt
+do test -r "$data" || ln -s "/usr/share/unicode/ucd/$data" . || exit 9
+done
+
+#############################################################################
+# table generation
+
+echo generating character category table for "isw*.c"
+  sh ./mkcategories
+
+echo generating case conversion table for "tow*.c"
+  sh ./mkcaseconv
+
+#############################################################################
+# end