# Selima Website Content Management System # Unicode.pm: The subroutines to deal with character set encodings. # Copyright (c) 2003-2018 imacat. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Author: imacat # First written: 2003-03-23 package Selima::Unicode; use 5.008; use strict; use warnings; use base qw(Exporter); use vars qw(@EXPORT @EXPORT_OK); BEGIN { @EXPORT = qw(); push @EXPORT, qw(h_encode a_encode); push @EXPORT, qw(is_charset is_usascii_printable is_usascii_printable_text); push @EXPORT, qw(all_to_trad all_to_simp); push @EXPORT, qw(hcref_encode hcref_decode page_encode); @EXPORT_OK = @EXPORT; # Prototype declaration sub h_encode($;$); sub a_encode($;$); sub is_charset($$); sub is_usascii_printable($); sub is_usascii_printable_text($); sub all_to_trad($); sub all_to_simp($); sub hcref_encode($$); sub hcref_decode($$); sub page_encode($$); sub hcref2char($); sub hcref2char_keep($); } use Config qw(%Config); use Encode qw(encode decode from_to FB_CROAK FB_HTMLCREF); use File::Basename qw(dirname); use File::Spec::Functions qw(splitdir catdir); use GDBM_File qw(GDBM_READER); use HTML::Entities qw(%entity2char %char2entity); use Selima::A2HTML; use Selima::HTTP; use Selima::DataVars qw(:lninfo); use Selima::GetLang; use Selima::ShortCut; use vars qw(%BIG5_RANGES $ETEN_EXT); %BIG5_RANGES = ( "常用字" => [ 0xA440, 0xC67E ], # 5401 "次常用字" => [ 0xC940, 0xF9D5 ], # 7652 "常用符號" => [ 0xA140, 0xA3BF ], # 408 # "控制碼" => [ 0xA3C0, 0xA3E0 ], # 33 # "罕用符號" => [ 0xC6A1, 0xC8FE ], ); # Big5 is indeed Big5-ETen, which has ETen extension character that covers # Japanese, where ETen extension is not covered by Microsoft CP950. # Using Big5 (Big5-ETen) will preserve Japanese characters that cannot # be displayed under Microsoft Internet Explorer. # CP950 converts European characters into US-ASCII. European characters # will not be output correctly. use vars qw($ALL2TRAD %ALL2TRAD $ALL2SIMP %ALL2SIMP); BEGIN { my (@dirs, $arch); @dirs = splitdir(dirname((caller 1)[1])); $arch = $Config{"myarchname"}; $arch =~ s/-[^\-]+$//; $ALL2TRAD = catdir(@dirs[0..$#dirs-2], $arch, "all2trad.db"); $ALL2SIMP = catdir(@dirs[0..$#dirs-2], $arch, "all2simp.db"); } # h_encode: Encode and escape HTML special characters sub h_encode($;$) { local ($_, %_); my ($source, $encoding); ($source, $encoding) = @_; $encoding = getlang LN_CHARSET if !defined $encoding; return hcref_encode($encoding, h($source)); } # a_encode: Encode and convert from plain text to HTML sub a_encode($;$) { local ($_, %_); my ($source, $encoding); ($source, $encoding) = @_; $encoding = getlang LN_CHARSET if !defined $encoding; return hcref_encode($encoding, a2html($source)); } # is_charset: If a piece of text is in a certain character set sub is_charset($$) { local ($_, %_); my ($text, $charset); ($text, $charset) = @_; eval { encode($charset, $text, FB_CROAK); }; return $@ eq ""; } # is_usascii_printable: If a piece of text is US-ASCII printable # Positive range. Decoded unicode text may have a HEX value larger than \xFF. sub is_usascii_printable($) { local ($_, %_); $_ = $_[0]; # Scalar return /^[\x20-\x7E]+$/ if ref $_ eq ""; # Array if (ref $_ eq "ARRAY") { @_ = @$_; foreach (@_) { return 0 if !is_usascii_printable $_; } return 1; } # Hash if (ref $_ eq "HASH") { %_ = %$_; foreach (keys %_) { return 0 if !is_usascii_printable $_ || !is_usascii_printable $_{$_}; } return 1; } # False for others. We have no idea about other types of data. return 0; } # is_usascii_printable_text: If a piece of multi-line text is US-ASCII printable # Positive range. Decoded unicode text may have a HEX value larger than \xFF. sub is_usascii_printable_text($) { $_[0] =~ /^[\x20-\x7E\s]+$/; } # all_to_trad: Convert all Simplified characters in text to Traditional Chinese sub all_to_trad($) { local ($_, %_); my $text; $text = $_[0]; if (!tied %ALL2TRAD) { tie %ALL2TRAD, "GDBM_File", $ALL2TRAD, &GDBM_READER, 0644 or http_500 "$ALL2TRAD: $!"; } @_ = split //, $text; foreach my $c (@_) { $_ = ord $c; $c = chr $ALL2TRAD{$_} if exists $ALL2TRAD{$_}; } return join "", @_; } # all_to_simp: Convert all Simplified characters in text to Traditional Chinese sub all_to_simp($) { local ($_, %_); my $text; $text = $_[0]; if (!tied %ALL2SIMP) { tie %ALL2SIMP, "GDBM_File", $ALL2SIMP, &GDBM_READER, 0644 or http_500 "$ALL2SIMP: $!"; } @_ = split //, $text; foreach my $c (@_) { $_ = ord $c; $c = chr $ALL2SIMP{$_} if exists $ALL2SIMP{$_}; } return join "", @_; } # hcref_encode: Encode text with HTML character entity references sub hcref_encode($$) { local ($_, %_); my $charset; ($charset, $_) = @_; $_ = encode($charset, $_, FB_HTMLCREF); s/&#(\d{1,10});/exists $char2entity{chr $1}? $char2entity{chr $1}: "&#$1;";/ge; return $_; } # hcref_decode: Decode octets and HTML character references sub hcref_decode($$) { local ($_, %_); my $charset; ($charset, $_) = @_; eval { $_ = decode($charset, $_, FB_CROAK); 1; }; return undef if $@ ne ""; return hcref2char($_); } # page_encode: encode() an HTML page sub page_encode($$) { local ($_, %_); my $charset; ($_, $charset) = @_; $_ = hcref_encode($charset, $_); return if !defined $_; $charset = h($charset); s//$charset/g; return $_; } # hcref2char: Decode HTML character entity references # The input and output should have already been decoded # It preserves encoded US-ASCII characters. US-ASCII characters do # not need to be encoded. There must be some reason to encode them. # (like @/@, </<, >/>, etc.) sub hcref2char($) { local ($_, %_); $_ = $_[0]; # Numeric character references (decimal) s/&#(\d{1,10});/my $c = decode("UTF-32LE", pack("V", $1), FB_CROAK); !hcref2char_keep($c)? $c: "&#$1;";/ge; # Numeric character references (hexadecimal) s/&#x([0-9a-f]{1,8});/my $c = decode("UTF-32LE", pack("V", hex $1), FB_CROAK); !hcref2char_keep($c)? $c: "&#x$1;";/gei; # Character entity references s/&([a-z]{2,8}\d{0,2};)/ if ( !exists $entity2char{$1} || hcref2char_keep($entity2char{$1})) { "&$1"; } else { $entity2char{$1}; } /gei; return $_; } # hcref2char_keep: If this character should not be decoded # HTML characters are not decoded (<, >, ", &) " gettext sub hcref2char_keep($) { $_[0] =~ /^[<>s"&]$/; } # " gettext return 1;