698 lines
		
	
	
		
			21 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
			
		
		
	
	
			698 lines
		
	
	
		
			21 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
| #!/usr/bin/perl
 | |
| # Uniset -- Unicode subset manager -- Markus Kuhn
 | |
| # http://www.cl.cam.ac.uk/~mgk25/download/uniset.tar.gz
 | |
| 
 | |
| require 5.014;
 | |
| use open ':utf8';
 | |
| use FindBin qw($RealBin);  # to find directory where this file is located
 | |
| 
 | |
| binmode(STDOUT, ":utf8");
 | |
| binmode(STDIN, ":utf8");
 | |
| 
 | |
| my (%name, %invname, %category, %comment);
 | |
| 
 | |
| print <<End if $#ARGV < 0;
 | |
| Uniset -- Unicode subset manager -- Markus Kuhn
 | |
| 
 | |
| Uniset merges and subtracts Unicode subsets. It can output and
 | |
| analyse the resulting character set in various formats.
 | |
| 
 | |
| Uniset understand the following command-line arguments:
 | |
| 
 | |
| Commands to define a set of characters:
 | |
| 
 | |
|   + filename   add the character set described in the file to the set
 | |
|   - filename   remove the character set described in the file from the set
 | |
|   +: filename  add the characters in the UTF-8 file to the set
 | |
|   -: filename  remove the characters in the UTF-8 file from the set
 | |
|   +xxxx..yyyy  add the range to the set (xxxx and yyyy are hex numbers)
 | |
|   -xxxx..yyyy  remove the range from the set (xxxx and yyyy are hex numbers)
 | |
|   +cat=Xx      add all Unicode characters with category code Xx
 | |
|   -cat=Xx      remove all Unicode characters with category code Xx
 | |
|   -cat!=Xx     remove all Unicode characters without category code Xx
 | |
|   clean        remove any elements that do not appear in the Unicode database
 | |
|   unknown      remove any elements that do appear in the Unicode database
 | |
| 
 | |
| Command to output descriptions of the constructed set of characters:
 | |
| 
 | |
|   table        write a full table with one line per character
 | |
|   compact      output the set in compact MES format
 | |
|   c            output the set as C interval array
 | |
|   nr           output the number of characters
 | |
|   sources      output a table that shows the number of characters contributed
 | |
|                by the various combinations of input sets added with +.
 | |
|   utf8-list    output a list of all characters encoded in UTF-8
 | |
| 
 | |
| Commands to tailor the following output commands:
 | |
| 
 | |
|   html         write HTML tables instead of plain text
 | |
|   ucs          add the unicode character itself to the table (UTF-8 in
 | |
|                plain table, numeric character reference in HTML)
 | |
| 
 | |
| Formats of character set input files read by the + and - command:
 | |
| 
 | |
| Empty lines, white space at the start and end of the line and any
 | |
| comment text following a \# are ignored. The following formats are
 | |
| recognized
 | |
| 
 | |
| xx yyyy             xx is the hex code in an 8-bit character set and yyyy
 | |
|                     is the corresponding Unicode value. Both can optionally
 | |
|                     be prefixed by 0x. This is the format used in the
 | |
|                     files on <ftp://ftp.unicode.org/Public/MAPPINGS/>.
 | |
| 
 | |
| yyyy                yyyy (optionally prefixed with 0x) is a Unicode character
 | |
|                     belonging to the specified subset.
 | |
| 
 | |
| yyyy-yyyy           a range of Unicode characters belonging to
 | |
| yyyy..yyyy          the specified subset.
 | |
| 
 | |
| xx yy yy yy-yy yy   xx denotes a row (high-byte) and the yy specify
 | |
|                     corresponding low bytes or with a hyphen also ranges of
 | |
|                     low bytes in the Unicode values that belong to this
 | |
|                     subset. This is also the format that is generated by
 | |
|                     the compact command.
 | |
| End
 | |
| exit 1 if $#ARGV < 0;
 | |
| 
 | |
| 
 | |
| # Subroutine to identify whether the ISO 10646/Unicode character code
 | |
| # ucs belongs into the East Asian Wide (W) or East Asian FullWidth
 | |
| # (F) category as defined in Unicode Technical Report #11.
 | |
| 
 | |
| sub iswide ($) {
 | |
|     my $ucs = shift(@_);
 | |
| 
 | |
|     return ($ucs >= 0x1100 &&
 | |
| 	    ($ucs <= 0x115f ||                     # Hangul Jamo
 | |
| 	     $ucs == 0x2329 || $ucs == 0x232a ||
 | |
| 	     ($ucs >= 0x2e80 && $ucs <= 0xa4cf &&
 | |
| 	      $ucs != 0x303f) ||                   # CJK .. Yi
 | |
| 	     ($ucs >= 0xac00 && $ucs <= 0xd7a3) || # Hangul Syllables
 | |
| 	     ($ucs >= 0xf900 && $ucs <= 0xfaff) || # CJK Comp. Ideographs
 | |
| 	     ($ucs >= 0xfe30 && $ucs <= 0xfe6f) || # CJK Comp. Forms
 | |
| 	     ($ucs >= 0xff00 && $ucs <= 0xff60) || # Fullwidth Forms
 | |
| 	     ($ucs >= 0xffe0 && $ucs <= 0xffe6) ||
 | |
| 	     ($ucs >= 0x20000 && $ucs <= 0x2fffd) ||
 | |
| 	     ($ucs >= 0x30000 && $ucs <= 0x3fffd)));
 | |
| }
 | |
| 
 | |
| # Return the Unicode name that belongs to a given character code
 | |
| 
 | |
| # Jamo short names, see Unicode 3.0, table 4-4, page 86
 | |
| 
 | |
| my @lname = ('G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '',
 | |
| 	     'J', 'JJ', 'C', 'K', 'T', 'P', 'H'); # 1100..1112
 | |
| my @vname = ('A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O',
 | |
| 	     'WA', 'WAE', 'OE', 'YO', 'U', 'WEO', 'WE', 'WI', 'YU',
 | |
| 	     'EU', 'YI', 'I'); # 1161..1175
 | |
| my @tname = ('G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM',
 | |
| 	     'LB', 'LS', 'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS',
 | |
| 	     'NG', 'J', 'C', 'K', 'T', 'P', 'H'); # 11a8..11c2
 | |
| 
 | |
| sub name {
 | |
|     my $ucs = shift(@_);
 | |
| 
 | |
|     # The intervals used here reflect Unicode Version 3.2
 | |
|     if (($ucs >=  0x3400 && $ucs <=  0x4db5) ||
 | |
| 	($ucs >=  0x4e00 && $ucs <=  0x9fa5) ||
 | |
| 	($ucs >= 0x20000 && $ucs <= 0x2a6d6)) {
 | |
| 	return "CJK UNIFIED IDEOGRAPH-" . sprintf("%04X", $ucs);
 | |
|     }
 | |
| 
 | |
|     if ($ucs >= 0xac00 && $ucs <= 0xd7a3) {
 | |
| 	my $s = $ucs - 0xac00;
 | |
| 	my $l = 0x1100 + int($s / (21 * 28));
 | |
| 	my $v = 0x1161 + int(($s % (21 * 28)) / 28);
 | |
| 	my $t = 0x11a7 + $s % 28;
 | |
| 	return "HANGUL SYLLABLE " .
 | |
| 	    ($lname[int($s / (21 * 28))] .
 | |
| 	     $vname[int(($s % (21 * 28)) / 28)] .
 | |
| 	     $tname[$s % 28 - 1]);
 | |
|     }
 | |
| 
 | |
|     return $name{$ucs};
 | |
| }
 | |
| 
 | |
| sub is_unicode {
 | |
|     my $ucs = shift(@_);
 | |
| 
 | |
|     # The intervals used here reflect Unicode Version 3.2
 | |
|     if (($ucs >=  0x3400 && $ucs <=  0x4db5) ||
 | |
| 	($ucs >=  0x4e00 && $ucs <=  0x9fa5) ||
 | |
| 	($ucs >=  0xac00 && $ucs <=  0xd7a3) ||
 | |
| 	($ucs >= 0x20000 && $ucs <= 0x2a6d6)) {
 | |
| 	return 1;
 | |
|     }
 | |
| 
 | |
|     return exists $name{$ucs};
 | |
| }
 | |
| 
 | |
| my @search_path = ();
 | |
| if ($RealBin =~ m|^(.*)/bin\z| && -d "$1/share/uniset") {
 | |
|     push @search_path, "$1/share/uniset";
 | |
| } else {
 | |
|     push @search_path, $RealBin;
 | |
| }
 | |
| 
 | |
| sub search_open {
 | |
|     my ($mode, $fn) = @_;
 | |
|     my $file;
 | |
|     return $file if open($file, $mode, $fn);
 | |
|     return undef if $fn =~ m|/|;
 | |
|     for my $path (@search_path) {
 | |
| 	return $file if open($file, $mode, "$path/$fn");
 | |
|     }
 | |
|     return undef;
 | |
| }
 | |
| 
 | |
| my $html = 0;
 | |
| my $image = 0;
 | |
| my $adducs = 0;
 | |
| my $unicodedata = "UnicodeData.txt";
 | |
| my $blockdata = "Blocks.txt";
 | |
| 
 | |
| # read list of all Unicode names
 | |
| my $data = search_open('<', $unicodedata);
 | |
| unless ($data) {
 | |
|     die ("Can't open Unicode database '$unicodedata':\n$!\n\n" .
 | |
| 	 "Please make sure that you have downloaded the file\n" .
 | |
| 	 "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n");
 | |
| }
 | |
| while (<$data>) {
 | |
|     if (/^([0-9,A-F]{4,8});([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*)$/) {
 | |
| 	next if $2 ne '<control>' && substr($2, 0, 1) eq '<';
 | |
| 	$ucs = hex($1);
 | |
|         $name{$ucs} = $2;
 | |
| 	$invname{$2} = $ucs;
 | |
| 	$category{$ucs} = $3;
 | |
|         $comment{$ucs} = $12;
 | |
|     } else {
 | |
|         die("Syntax error in line '$_' in file '$unicodedata'\n");
 | |
|     }
 | |
| }
 | |
| close($data);
 | |
| 
 | |
| # read list of all Unicode blocks
 | |
| $data = search_open('<', $blockdata);
 | |
| unless ($data) {
 | |
|     die ("Can't open Unicode blockname list '$blockdata':\n$!\n\n" .
 | |
| 	 "Please make sure that you have downloaded the file\n" .
 | |
| 	 "http://www.unicode.org/Public/UNIDATA/Blocks.txt\n");
 | |
| }
 | |
| my $blocks = 0;
 | |
| my (@blockstart, @blockend, @blockname);
 | |
| while (<$data>) {
 | |
|     if (/^\s*([0-9,A-F]{4,8})\s*\.\.\s*([0-9,A-F]{4,8})\s*;\s*(.*)$/) {
 | |
|         $blockstart[$blocks] = hex($1);
 | |
| 	$blockend  [$blocks] = hex($2);
 | |
|         $blockname [$blocks] = $3;
 | |
| 	$blocks++;
 | |
|     } elsif (/^\s*\#/ || /^\s*$/) {
 | |
| 	# ignore comments and empty lines
 | |
|     } else {
 | |
|         die("Syntax error in line '$_' in file '$blockdata'\n");
 | |
|     }
 | |
| }
 | |
| close($data);
 | |
| if ($blockend[$blocks-1] < 0x110000) {
 | |
|     $blockstart[$blocks] = 0x110000;
 | |
|     $blockend  [$blocks] = 0x7FFFFFFF;
 | |
|     $blockname [$blocks] = "Beyond Plane 16";
 | |
|     $blocks++;
 | |
| }
 | |
| 
 | |
| # process command line arguments
 | |
| while ($_ = shift(@ARGV)) {
 | |
|     if (/^html$/) {
 | |
| 	$html = 1;
 | |
|     } elsif (/^ucs$/) {
 | |
| 	$adducs = 1;
 | |
|     } elsif (/^img$/) {
 | |
| 	$html = 1;
 | |
| 	$image = 1;
 | |
|     } elsif (/^template$/) {
 | |
| 	$template = shift(@ARGV);
 | |
| 	open(TEMPLATE, $template) || die("Can't open template file '$template': $!\n");
 | |
| 	while (<TEMPLATE>) {
 | |
| 	    if (/^\#\s*include\s+\"([^\"]*)\"\s*$/) {
 | |
| 		open(INCLUDE, $1) || die("Can't open template include file '$1': $!\n");
 | |
| 		while (<INCLUDE>) {
 | |
| 		    print $_;
 | |
| 		}
 | |
| 		close(INCLUDE);
 | |
| 	    } elsif (/^\#\s*quote\s+\"([^\"]*)\"\s*$/) {
 | |
| 		open(INCLUDE, $1) || die("Can't open template include file '$1': $!\n");
 | |
| 		while (<INCLUDE>) {
 | |
| 		    s/&/&/g;
 | |
| 		    s/</</g;
 | |
| 		    print $_;
 | |
| 		}
 | |
| 		close(INCLUDE);
 | |
| 	    } else {
 | |
| 		print $_;
 | |
| 	    }
 | |
| 	}
 | |
| 	close(TEMPLATE);
 | |
|     } elsif (/^\+cat=(.+)$/) {
 | |
| 	# add characters with given category
 | |
| 	$cat = $1;
 | |
| 	for $i (keys(%category)) {
 | |
| 	    $used{$i} = "[${cat}]" if $category{$i} eq $cat;
 | |
| 	}
 | |
|     } elsif (/^\-cat=(.+)$/) {
 | |
| 	# remove characters with given category
 | |
| 	$cat = $1;
 | |
| 	for $i (keys(%category)) {
 | |
| 	    delete $used{$i} if $category{$i} eq $cat;
 | |
| 	}
 | |
|     } elsif (/^\-cat!=(.+)$/) {
 | |
| 	# remove characters without given category
 | |
| 	$cat = $1;
 | |
| 	for $i (keys(%category)) {
 | |
| 	    delete $used{$i} unless $category{$i} eq $cat;
 | |
| 	}
 | |
|     } elsif (/^([+-]):(.*)/) {
 | |
| 	$remove = $1 eq "-";
 | |
| 	$setfile = $2;
 | |
| 	$setfile = shift(@ARGV) if $setfile eq "";
 | |
| 	push(@SETS, $setfile);
 | |
| 	open(SET, $setfile) || die("Can't open set file '$setfile': $!\n");
 | |
| 	$setname = $setfile;
 | |
| 	while (<SET>) {
 | |
| 	    while ($_) {
 | |
| 		$i = ord($_);
 | |
| 		$used{$i} .= "[${setname}]" unless $remove;
 | |
| 		delete $used{$i} if $remove;
 | |
| 		$_ = substr($_, 1);
 | |
| 	    }
 | |
| 	}
 | |
| 	close SET;
 | |
|     } elsif (/^([+-])(.*)/) {
 | |
| 	$remove = $1 eq "-";
 | |
| 	$setfile = $2;
 | |
| 	$setfile = "$setfile..$setfile" if $setfile =~ /^([0-9A-Fa-f]{4,8})$/;
 | |
| 	if ($setfile =~ /^([0-9A-Fa-f]{4,8})(-|\.\.)([0-9A-Fa-f]{4,8})$/) {
 | |
| 	    # handle intervall specification on command line
 | |
| 	    $first = hex($1);
 | |
| 	    $last = hex($3);
 | |
| 	    for ($i = $first; $i <= $last; $i++) {
 | |
| 		$used{$i} .= "[ARG]" unless $remove;
 | |
| 		delete $used{$i} if $remove;
 | |
| 	    }
 | |
| 	    next;
 | |
| 	}
 | |
| 	$setfile = shift(@ARGV) if $setfile eq "";
 | |
| 	push(@SETS, $setfile);
 | |
| 	my $setf = search_open('<', $setfile);
 | |
| 	die("Can't open set file '$setfile': $!\n") unless $setf;
 | |
| 	$cedf = ($setfile =~ /cedf/); # detect Kosta Kosti's trans CEDF format by path name
 | |
| 	$setname = $setfile;
 | |
| 	$setname =~ s/([^.\[\]]*)\..*/$1/;
 | |
| 	while (<$setf>) {
 | |
| 	    if (/^<code_set_name>/) {
 | |
| 		# handle ISO 15897 (POSIX registry) charset mapping format
 | |
| 		undef $comment_char;
 | |
| 		undef $escape_char;
 | |
| 		while (<$setf>) {
 | |
| 		    if ($comment_char && /^$comment_char/) {
 | |
| 			# remove comments
 | |
| 			$_ = $`;
 | |
| 		    }
 | |
| 		    next if (/^\032?\s*$/);                                             # skip empty lines
 | |
| 		    if (/^<comment_char> (\S)$/) {
 | |
| 			$comment_char = $1;
 | |
| 		    } elsif (/^<escape_char> (\S)$/) {
 | |
| 			$escape_char = $1;
 | |
| 		    } elsif (/^(END )?CHARMAP$/) {
 | |
| 			#ignore
 | |
| 		    } elsif (/^<.*>\s*\/x([0-9A-F]{2})\s*<U([0-9A-F]{4,8})>/) {
 | |
| 			$used{hex($2)} .= "[${setname}{$1}]" unless $remove;
 | |
| 			delete $used{hex($2)} if $remove;
 | |
| 		    } else {
 | |
| 			die("Syntax error in line $. in file '$setfile':\n'$_'\n");
 | |
| 		    }
 | |
| 		}
 | |
| 		next;
 | |
| 	    } elsif (/^STARTFONT /) {
 | |
| 		# handle X11 BDF file
 | |
| 		while (<$setf>) {
 | |
| 		    if (/^ENCODING\s+([0-9]+)/) {
 | |
| 			$used{$1} .= "[${setname}]" unless $remove;
 | |
| 			delete $used{$1} if $remove;
 | |
| 		    }
 | |
| 		}
 | |
| 		next;
 | |
| 	    }
 | |
| 	    tr/a-z/A-Z/;           # make input uppercase
 | |
| 	    if ($cedf) {
 | |
| 		if ($. > 4) {
 | |
| 		    if (/^([0-9A-F]{2})\t.?\t(.*)$/) {
 | |
| 			# handle Kosta Kosti's trans CEDF format
 | |
| 			next if (hex($1) < 32 || (hex($1) > 0x7e && hex($1) < 0xa0));
 | |
| 			$ucs = $invname{$2};
 | |
| 			die "unknown ISO 10646 name '$2' in '$setfile' line $..\n" if ! $ucs;
 | |
| 			$used{$ucs} .= "[${setname}{$1}]" unless $remove;
 | |
| 			delete $used{$ucs} if $remove;
 | |
| 		    } else {
 | |
| 			die("Syntax error in line $. in CEDF file '$setfile':\n'$_'\n");
 | |
| 		    }
 | |
| 		}
 | |
| 		next;
 | |
| 	    }
 | |
| 	    if (/^\s*(0X|U\+|U-)?([0-9A-F]{2})\s+\#\s*UNDEFINED\s*$/) {
 | |
| 		# ignore ftp.unicode.org mapping file lines with #UNDEFINED
 | |
| 		next;
 | |
| 	    }
 | |
| 	    s/^([^\#]*)\#.*$/$1/;  # remove comments
 | |
| 	    next if (/^\032?\s*$/);     # skip empty lines
 | |
| 	    if (/^\s*(0X)?([0-9A-F-]{2})\s+(0X|U\+|U-)?([0-9A-F]{4,8})\s*$/) {
 | |
| 		# handle entry from a ftp.unicode.org mapping file
 | |
| 		$used{hex($4)} .= "[${setname}{$2}]" unless $remove;
 | |
| 		delete $used{hex($4)} if $remove;
 | |
| 	    } elsif (/^\s*(0X|U\+|U-)?([0-9A-F]{4,8})(\s*-\s*|\s*\.\.\s*|\s+)(0X|U\+|U-)?([0-9A-F]{4,8})\s*$/) {
 | |
| 		# handle interval specification
 | |
| 		$first = hex($2);
 | |
| 		$last = hex($5);
 | |
| 		for ($i = $first; $i <= $last; $i++) {
 | |
| 		    $used{$i} .= "[${setname}]" unless $remove;
 | |
| 		    delete $used{$i} if $remove;
 | |
| 		}
 | |
| 	    } elsif (/^\s*([0-9A-F]{2,6})(\s+[0-9A-F]{2},?|\s+[0-9A-F]{2}-[0-9A-F]{2},?)+/) {
 | |
| 		# handle lines from P10 MES draft
 | |
| 		$row = $1;
 | |
| 		$cols = $_;
 | |
| 		$cols =~ s/^\s*([0-9A-F]{2,6})\s*(.*)\s*$/$2/;
 | |
| 		$cols =~ tr/,//d;
 | |
| 		@cols = split(/\s+/, $cols);
 | |
| 		for (@cols) {
 | |
| 		    if (/^(..)$/) {
 | |
| 			$first = hex("$row$1");
 | |
| 			$last  = $first;
 | |
| 		    } elsif (/^(..)-(..)$/) {
 | |
| 			$first = hex("$row$1");
 | |
| 			$last  = hex("$row$2");
 | |
| 		    } else {
 | |
| 			die ("this should never happen '$_'");
 | |
| 		    }
 | |
| 		    for ($i = $first; $i <= $last; $i++) {
 | |
| 			$used{$i} .= "[${setname}]" unless $remove;
 | |
| 			delete $used{$i} if $remove;
 | |
| 		    }
 | |
| 		}
 | |
| 	    } elsif (/^\s*(0X|U\+|U-)?([0-9A-F]{4,8})\s*/) {
 | |
| 		# handle single character
 | |
| 		$used{hex($2)} .= "[${setname}]" unless $remove;
 | |
| 		delete $used{hex($2)} if $remove;
 | |
| 	    } else {
 | |
| 		die("Syntax error in line $. in file '$setfile':\n'$_'\n") unless /^\s*(\#.*)?$/;
 | |
| 	    }
 | |
| 	}
 | |
| 	close $setf;
 | |
|     } elsif (/^loadimages$/ || /^loadbigimages$/) {
 | |
| 	if (/^loadimages$/) {
 | |
| 	    $prefix = "Small.Glyphs";
 | |
| 	} else {
 | |
| 	    $prefix = "Glyphs";
 | |
| 	}
 | |
| 	$total = 0;
 | |
| 	for $i (keys(%used)) {
 | |
| 	    next if ($name{$i} eq "<control>");
 | |
| 	    $total++;
 | |
| 	}
 | |
| 	$count = 0;
 | |
| 	$| = 1;
 | |
| 	for $i (sort({$a <=> $b} keys(%used))) {
 | |
| 	    next if ($name{$i} eq "<control>");
 | |
| 	    $count++;
 | |
| 	    $j = sprintf("%04X", $i);
 | |
| 	    $j =~ /(..)(..)/;
 | |
| 	    $gif = "http://charts.unicode.org/Unicode.charts/$prefix/$1/U$j.gif";
 | |
| 	    print("\r$count/$total: $gif");
 | |
| 	    system("mkdir -p $prefix/$1; cd $prefix/$1; webcopy -u -s $gif &");
 | |
| 	    select(undef, undef, undef, 0.2);
 | |
| 	}
 | |
| 	print("\n");
 | |
| 	exit 0;
 | |
|     } elsif (/^giftable/) {
 | |
| 	# form a table of glyphs (requires pbmtools installed)
 | |
| 	$count = 0;
 | |
| 	for $i (keys(%used)) {
 | |
| 	    $count++ unless $name{$i} eq "<control>";
 | |
| 	}
 | |
| 	$width = int(sqrt($count/sqrt(2)) + 0.5);
 | |
| 	$width = $1 if /^giftable([0-9]+)$/;
 | |
| 	system("rm -f tmp-*.pnm table.pnm~ table.pnm");
 | |
| 	$col = 0;
 | |
| 	$row = 0;
 | |
| 	for $i (sort({$a <=> $b} keys(%used))) {
 | |
| 	    next if ($name{$i} eq "<control>");
 | |
| 	    $j = sprintf("%04X", $i);
 | |
| 	    $j =~ /(..)(..)/;
 | |
| 	    $gif = "Small.Glyphs/$1/U$j.gif";
 | |
| 	    $pnm = sprintf("tmp-%02x.pnm", $col);
 | |
| 	    $fallback = "Small.Glyphs/FF/UFFFD.gif";
 | |
| 	    system("giftopnm $gif >$pnm || { rm $pnm ; giftopnm $fallback >$pnm ; }");
 | |
| 	    if (++$col == $width) {
 | |
| 		system("pnmcat -lr tmp-*.pnm | cat >tmp-row.pnm");
 | |
| 		if ($row == 0) {
 | |
| 		    system("mv tmp-row.pnm table.pnm");
 | |
| 		} else {
 | |
| 		    system("mv table.pnm table.pnm~; pnmcat -tb table.pnm~ tmp-row.pnm >table.pnm");
 | |
| 		}
 | |
| 		$row++;
 | |
| 		$col = 0;
 | |
| 		system("rm -f tmp-*.pnm table.pnm~");
 | |
| 	    }
 | |
| 	}
 | |
| 	if ($col > 0) {
 | |
| 	    system("pnmcat -lr tmp-*.pnm | cat >tmp-row.pnm");
 | |
| 	    if ($row == 0) {
 | |
| 		system("mv tmp-row.pnm table.pnm");
 | |
| 	    } else {
 | |
| 		system("mv table.pnm table.pnm~; pnmcat -tb -jleft -black table.pnm~ tmp-row.pnm >table.pnm");
 | |
| 	    }
 | |
| 	}
 | |
| 	system("rm -f table.gif ; ppmtogif table.pnm > table.gif");
 | |
| 	system("rm -f tmp-*.pnm table.pnm~ table.pnm");
 | |
|     } elsif (/^table$/) {
 | |
| 	# go through all used names to print full table
 | |
| 	print "<TABLE border=2>\n" if $html;
 | |
| 	for $i (sort({$a <=> $b} keys(%used))) {
 | |
| 	    next if ($name{$i} eq "<control>");
 | |
| 	    if ($html) {
 | |
| 		$sources = $used{$i};
 | |
| 		$sources =~ s/\]\[/, /g;
 | |
| 		$sources =~ s/^\[//g;
 | |
| 		$sources =~ s/\]$//g;
 | |
| 		$sources =~ s/\{(..)\}/<SUB>$1<\/SUB>/g;
 | |
| 		$j = sprintf("%04X", $i);
 | |
| 		$j =~ /(..)(..)/;
 | |
| 		$gif = "Small.Glyphs/$1/U$j.gif";
 | |
| 		print "<TR>";
 | |
| 		print "<TD><img width=32 height=32 src=\"$gif\">" if $image;
 | |
| 		printf("<TD>&#%d;", $i) if $adducs;
 | |
| 		print "<TD><SAMP>$j</SAMP><TD><SAMP>" . name($i);
 | |
| 		print " ($comment{$i})" if $comment{$i};
 | |
| 		print "</SAMP><TD><SMALL>$sources</SMALL>\n";
 | |
| 	    } else {
 | |
| 		printf("%04X \# ", $i);
 | |
| 		print pack("U", $i) . " " if $adducs;
 | |
| 		print name($i) ."\n";
 | |
| 	    }
 | |
| 	}
 | |
| 	print "</TABLE>\n" if $html;
 | |
|     } elsif (/^imgblock$/) {
 | |
| 	$width = 16;
 | |
| 	$width = $1 if /giftable([0-9]+)/;
 | |
| 	$col = 0;
 | |
| 	$subline = "";
 | |
| 	print "\n<P><TABLE cellspacing=0 cellpadding=0>";
 | |
| 	for $i (sort({$a <=> $b} keys(%used))) {
 | |
| 	    print "<TR>" if $col == 0;
 | |
| 	    $j = sprintf("%04X", $i);
 | |
| 	    $j =~ /(..)(..)/;
 | |
| 	    $gif = "Small.Glyphs/$1/U$j.gif";
 | |
| 	    $alt = name($i);
 | |
| 	    print "<TD><img width=32 height=32 src=\"$gif\" alt=\"$alt\">";
 | |
| 	    $subline .= "<TD><SMALL><SAMP>$j</SAMP></SMALL>";
 | |
| 	    if (++$col == $width) {
 | |
| 		print "<TR align=center>$subline";
 | |
| 		$col = 0;
 | |
| 		$subline = "";
 | |
| 	    }
 | |
| 	}
 | |
| 	print "<TR align=center>$subline" if ($col > 0);
 | |
| 	print "</TABLE>\n";
 | |
|     } elsif (/^sources$/) {
 | |
| 	# count how many characters are attributed to the various source set combinations
 | |
| 	print "<P>Number of occurences of source character set combinations:\n<TABLE border=2>" if $html;
 | |
| 	for $i (keys(%used)) {
 | |
| 	    next if ($name{$i} eq "<control>");
 | |
| 	    $sources = $used{$i};
 | |
| 	    $sources =~ s/\]\[/, /g;
 | |
| 	    $sources =~ s/^\[//g;
 | |
| 	    $sources =~ s/\]$//g;
 | |
| 	    $sources =~ s/\{(..)\}//g;
 | |
| 	    $contribs{$sources} += 1;
 | |
| 	}
 | |
| 	for $j (keys(%contribs)) {
 | |
| 	    print "<TR><TD>$contribs{$j}<TD>$j\n" if $html;
 | |
| 	}
 | |
| 	print "</TABLE>\n" if $html;
 | |
|     } elsif (/^compact$/) {
 | |
| 	# print compact table in P10 MES format
 | |
| 	print "<P>Compact representation of this character set:\n<TABLE border=2>" if $html;
 | |
| 	print "<TR><TD><B>Rows</B><TD><B>Positions (Cells)</B>" if $html;
 | |
| 	print "\n# Plane 00\n# Rows\tPositions (Cells)\n" unless $html;
 | |
| 	$current_row = '';
 | |
| 	$start_col = '';
 | |
| 	$last_col = '';
 | |
| 	for $i (sort({$a <=> $b} keys(%used))) {
 | |
| 	    next if ($name{$i} eq "<control>");
 | |
| 	    $row = sprintf("%02X", $i >> 8);
 | |
| 	    $col = sprintf("%02X", $i & 0xff);
 | |
| 	    if ($row ne $current_row) {
 | |
| 		if (($last_col ne '') and ($last_col ne $start_col)) {
 | |
| 		    print "-$last_col";
 | |
| 		    print "</SAMP>" if $html;
 | |
| 		}
 | |
| 		print "<TR><TD><SAMP>$row</SAMP><TD><SAMP>" if $html;
 | |
| 		print "\n  $row\t" unless $html;
 | |
| 		$len = 0;
 | |
| 		$current_row = $row;
 | |
| 		$start_col = '';
 | |
| 	    }
 | |
| 	    if ($start_col eq '') {
 | |
| 		print "$col";
 | |
| 		$len += 2;
 | |
| 		$start_col = $col;
 | |
| 		$last_col = $col;
 | |
| 	    } elsif (hex($col) == hex($last_col) + 1) {
 | |
| 		$last_col = $col;
 | |
| 	    } else {
 | |
| 		if ($last_col ne $start_col) {
 | |
| 		    print "-$last_col";
 | |
| 		    $len += 3;
 | |
| 		}
 | |
| 		if ($len > 60 && !$html) {
 | |
| 		    print "\n  $row\t";
 | |
| 		    $len = 0;
 | |
| 		};
 | |
| 		print " " if $len;
 | |
| 		print "$col";
 | |
| 		$len += 2 + !! $len;
 | |
| 		$start_col = $col;
 | |
| 		$last_col = $col;
 | |
| 	    }
 | |
| 	}
 | |
| 	if (($last_col ne '') and ($last_col ne $start_col)) {
 | |
| 	    print "-$last_col";
 | |
| 	    print "</SAMP>" if $html;
 | |
| 	}
 | |
| 	print "\n" if ($current_row ne '');
 | |
| 	print "</TABLE>\n" if $html;
 | |
| 	print "\n";
 | |
|     } elsif (/^c$/) {
 | |
| 	# print table as C interval array
 | |
| 	print "{";
 | |
| 	$last_i = '';
 | |
| 	$columns = 3;
 | |
| 	$col = $columns;
 | |
| 	for $i (sort({$a <=> $b} keys(%used))) {
 | |
| 	    next if ($name{$i} eq "<control>");
 | |
| 	    if ($last_i eq '') {
 | |
| 		if (++$col > $columns) { $col = 1; print "\n "; }
 | |
| 		printf(" { 0x%04X, ", $i);
 | |
| 		$last_i = $i;
 | |
| 	    } elsif ($i == $last_i + 1) {
 | |
| 		$last_i = $i;
 | |
| 	    } else {
 | |
| 		printf("0x%04X },", $last_i);
 | |
| 		if (++$col > $columns) { $col = 1; print "\n "; }
 | |
| 		printf(" { 0x%04X, ", $i);
 | |
| 		$last_i = $i;
 | |
| 	    }
 | |
| 	}
 | |
| 	if ($last_i ne '') {
 | |
| 	    printf("0x%04X }", $last_i);
 | |
| 	}
 | |
| 	print "\n};\n";
 | |
|     } elsif (/^utf8-list$/) {
 | |
| 	$col = 0;
 | |
| 	$block = 0;
 | |
| 	$last = -1;
 | |
| 	for $i (sort({$a <=> $b} keys(%used))) {
 | |
| 	    next if ($name{$i} eq "<control>");
 | |
| 	    while ($blockend[$block] < $i && $block < $blocks - 1) {
 | |
| 		$block++;
 | |
| 	    }
 | |
| 	    if ($last <= $blockend[$block-1] &&
 | |
| 		$i < $blockstart[$block]) {
 | |
| 		print "\n" if ($col);
 | |
| 		printf "\nFree block (U+%04X-U+%04X):\n\n",
 | |
| 		    $blockend[$block-1] + 1, $blockstart[$block] - 1;
 | |
| 		$col = 0;
 | |
| 	    }
 | |
| 	    if ($last < $blockstart[$block] && $i >= $blockstart[$block]) {
 | |
| 		print "\n" if ($col);
 | |
| 		printf "\n$blockname[$block] (U+%04X-U+%04X):\n\n",
 | |
| 		$blockstart[$block], $blockend[$block];
 | |
| 		$col = 0;
 | |
| 	    }
 | |
| 	    if ($category{$i} eq 'Mn') {
 | |
| 		# prefix non-spacing character with U+25CC DOTTED CIRCLE
 | |
| 		print "\x{25CC}";
 | |
| 	    } elsif ($category{$i} eq 'Me') {
 | |
| 		# prefix enclosing non-spacing character with space
 | |
| 		print " ";
 | |
| 	    }
 | |
| 	    print pack("U", $i);
 | |
| 	    $col += 1 + iswide($i);
 | |
| 	    if ($col >= 64) {
 | |
| 		print "\n";
 | |
| 		$col = 0;
 | |
| 	    }
 | |
| 	    $last = $i;
 | |
| 	}
 | |
| 	print "\n" if ($col);
 | |
|     } elsif (/^collections$/) {
 | |
| 	$block = 0;
 | |
| 	$last = -1;
 | |
| 	for $i (sort({$a <=> $b} keys(%used))) {
 | |
| 	    next if ($name{$i} eq "<control>");
 | |
| 	    while ($blockend[$block] < $i && $block < $blocks - 1) {
 | |
| 		$block++;
 | |
| 	    }
 | |
| 	    if ($last < $blockstart[$block] && $i >= $blockstart[$block]) {
 | |
| 		print $blockname[$block],
 | |
| 		  " " x (40 - length($blockname[$block]));
 | |
| 		printf "%04X-%04X\n",
 | |
| 		  $blockstart[$block], $blockend[$block];
 | |
| 	    }
 | |
| 	    $last = $i;
 | |
| 	}
 | |
|     } elsif (/^nr$/) {
 | |
| 	print "<P>" if $html;
 | |
| 	print "# " unless $html;
 | |
| 	print "Number of characters in above table: ";
 | |
| 	$count = 0;
 | |
| 	for $i (keys(%used)) {
 | |
| 	    $count++ unless $name{$i} eq "<control>";
 | |
| 	}
 | |
| 	print $count;
 | |
| 	print "\n";
 | |
|     } elsif (/^clean$/) {
 | |
| 	# remove characters from set that are not in $unicodedata
 | |
| 	for $i (keys(%used)) {
 | |
| 	    delete $used{$i} unless is_unicode($i);
 | |
| 	}
 | |
|     } elsif (/^unknown$/) {
 | |
| 	# remove characters from set that are in $unicodedata
 | |
| 	for $i (keys(%used)) {
 | |
| 	    delete $used{$i} if is_unicode($i);
 | |
| 	}
 | |
|     } else {
 | |
| 	die("Unknown command line command '$_'\n");
 | |
|     };
 | |
| }
 |