#!/usr/bin/perl
#
# shiftutf8 - move a range of code points inside a UTF-8 stream to a new base.
#
# usage: shiftutf8 rangefrom rangeto new_from < filein > fileout
#
# All three arguments are hexadecimal code points.  Example:
#
#     shiftutf8 E650 E6FF EE00
#
# relocates E650..E6FF so that the range starts at EE00
# (an offset of EE00 - E650 = 1968).
#
# How it works, per input line:
#   1. every 2- and 3-byte UTF-8 sequence is rewritten as a decimal
#      numeric entity "&#N;";
#   2. every "&#N;" (including entities that were already present in the
#      input) is rewritten as "&##N;", with N shifted when it lies inside
#      the requested range;
#   3. every "&##N;" is rewritten as the UTF-8 encoding of N.
#
# Consequences worth knowing: numeric entities in the input come out as
# raw UTF-8, and 4-byte UTF-8 sequences in the input are passed through
# untouched because step 1 only recognises 2- and 3-byte sequences.

use strict;
use warnings;

# Range configuration, filled in by main() and read by sgmladd().
our ($range_from, $range_to, $new_from);

# Entry point: parse the three hex arguments, then filter STDIN to STDOUT.
# Returns the process exit status (0 on success, 1 on a usage error).
sub main {
    if (@ARGV != 3) {
        print STDERR "usage: shiftutf8 rangefrom rangeto new_from < filein > fileout$/";
        return 1;
    }

    # Inside a sub a bare shift would consume @_, so name @ARGV explicitly.
    $range_from = hex shift @ARGV;
    $range_to   = hex shift @ARGV;
    $new_from   = hex shift @ARGV;

    # @ARGV is empty now, so <> reads standard input.
    while (my $line = <>) {
        print shift_line($line);
    }

    return 0;
}

# Apply the decode / shift / encode pipeline to one line of bytes and
# return the transformed line.
sub shift_line {
    my ($line) = @_;

    # Step 1: UTF-8 sequences -> "&#N;".
    $line =~ s/([\xc0-\xdf])([\x80-\xbf])/utf8dec($1, $2)/ge;
    $line =~ s/([\xe0-\xef])([\x80-\xbf])([\x80-\xbf])/utf8dec($1, $2, $3)/ge;

    # Step 2: "&#N;" -> "&##N;" with the range shift applied.  The extra
    # '#' marks an entity as already processed so it is not shifted twice.
    1 while $line =~ s/&#(\d+);/sgmladd($1)/ge;

    # Step 3: "&##N;" -> UTF-8 bytes.  Repeated until nothing matches, as a
    # replacement can complete a new "&##N;" with the surrounding text.
    1 while $line =~ s/&##(\d+);/utf8enc($1)/ge;

    return $line;
}

# Return the "&##N;" marker for code point $_[0], relocated by
# ($new_from - $range_from) when it lies within [$range_from, $range_to].
sub sgmladd {
    my ($code) = @_;

    if ($code >= $range_from && $code <= $range_to) {
        return sprintf '&##%u;', $code + $new_from - $range_from;
    }
    return sprintf '&##%u;', $code;
}

# Encode a code point as a string of UTF-8 bytes.
# Code points above 0x10FFFF cannot be expressed in UTF-8; they are
# returned as a plain "&#N;" entity so no invalid bytes are emitted.
sub utf8enc {
    my ($code) = @_;

    if ($code > 0x10FFFF) {
        return "&#$code;";
    }
    if ($code >= 0x10000) {
        return pack 'C4',
            0xf0 | ($code >> 18),
            0x80 | (($code >> 12) & 0x3f),
            0x80 | (($code >> 6) & 0x3f),
            0x80 | ($code & 0x3f);
    }
    if ($code >= 0x800) {
        return pack 'C3',
            0xe0 | ($code >> 12),
            0x80 | (($code >> 6) & 0x3f),
            0x80 | ($code & 0x3f);
    }
    if ($code >= 0x80) {
        return pack 'C2',
            0xc0 | ($code >> 6),
            0x80 | ($code & 0x3f);
    }

    # ASCII: the character itself (the digits must not be kept in front).
    return chr $code;
}

#
# Works for UCS2
#
# Decode one UTF-8 sequence, passed as a lead byte followed by its
# continuation bytes (one single-byte string per argument), and return it
# as a decimal numeric entity "&#N;".
sub utf8dec {
    my ($lead, @continuation) = @_;

    # 0x1f keeps the payload bits of a 2-byte lead; for a 3-byte lead
    # (0xe0-0xef) bit 4 is already zero, so the same mask is correct.
    my $code = 0x1f & ord $lead;
    for my $byte (@continuation) {
        $code = ($code << 6) + (0x3f & ord $byte);
    }

    return sprintf '&#%u;', $code;
}

# Run as a program only when executed directly, so that the helpers above
# can be loaded (require/do) without touching STDIN.
exit main() unless caller;

1;