245 lines
7.3 KiB
Perl
245 lines
7.3 KiB
Perl
# Selima Website Content Management System
|
|
# Unicode.pm: The subroutines to deal with character set encodings.
|
|
|
|
# Copyright (c) 2003-2018 imacat.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
# Author: imacat <imacat@mail.imacat.idv.tw>
|
|
# First written: 2003-03-23
|
|
|
|
package Selima::Unicode;
|
|
use 5.008;
|
|
use strict;
|
|
use warnings;
|
|
use base qw(Exporter);
|
|
use vars qw(@EXPORT @EXPORT_OK);
|
|
BEGIN {
|
|
@EXPORT = qw();
|
|
push @EXPORT, qw(h_encode a_encode);
|
|
push @EXPORT, qw(is_charset is_usascii_printable is_usascii_printable_text);
|
|
push @EXPORT, qw(all_to_trad all_to_simp);
|
|
push @EXPORT, qw(hcref_encode hcref_decode page_encode);
|
|
@EXPORT_OK = @EXPORT;
|
|
# Prototype declaration
|
|
sub h_encode($;$);
|
|
sub a_encode($;$);
|
|
sub is_charset($$);
|
|
sub is_usascii_printable($);
|
|
sub is_usascii_printable_text($);
|
|
sub all_to_trad($);
|
|
sub all_to_simp($);
|
|
sub hcref_encode($$);
|
|
sub hcref_decode($$);
|
|
sub page_encode($$);
|
|
sub hcref2char($);
|
|
sub hcref2char_keep($);
|
|
}
|
|
|
|
use Config qw(%Config);
|
|
use Encode qw(encode decode from_to FB_CROAK FB_HTMLCREF);
|
|
use File::Basename qw(dirname);
|
|
use File::Spec::Functions qw(splitdir catdir);
|
|
use GDBM_File qw(GDBM_READER);
|
|
use HTML::Entities qw(%entity2char %char2entity);
|
|
|
|
use Selima::A2HTML;
|
|
use Selima::HTTP;
|
|
use Selima::DataVars qw(:lninfo);
|
|
use Selima::GetLang;
|
|
use Selima::ShortCut;
|
|
|
|
use vars qw(%BIG5_RANGES $ETEN_EXT);
|
|
%BIG5_RANGES = (
|
|
"常用字" => [ 0xA440, 0xC67E ], # 5401
|
|
"次常用字" => [ 0xC940, 0xF9D5 ], # 7652
|
|
"常用符號" => [ 0xA140, 0xA3BF ], # 408
|
|
# "控制碼" => [ 0xA3C0, 0xA3E0 ], # 33
|
|
# "罕用符號" => [ 0xC6A1, 0xC8FE ],
|
|
);
|
|
|
|
# Big5 is indeed Big5-ETen, which has ETen extension character that covers
|
|
# Japanese, where ETen extension is not covered by Microsoft CP950.
|
|
# Using Big5 (Big5-ETen) will preserve Japanese characters that cannot
|
|
# be displayed under Microsoft Internet Explorer.
|
|
# CP950 converts European characters into US-ASCII. European characters
|
|
# will not be output correctly.
|
|
|
|
use vars qw($ALL2TRAD %ALL2TRAD $ALL2SIMP %ALL2SIMP);
|
|
BEGIN {
|
|
my (@dirs, $arch);
|
|
@dirs = splitdir(dirname((caller 1)[1]));
|
|
$arch = $Config{"myarchname"};
|
|
$arch =~ s/-[^\-]+$//;
|
|
$ALL2TRAD = catdir(@dirs[0..$#dirs-2], $arch, "all2trad.db");
|
|
$ALL2SIMP = catdir(@dirs[0..$#dirs-2], $arch, "all2simp.db");
|
|
}
|
|
|
|
# h_encode: Encode and escape HTML special characters
|
|
sub h_encode($;$) {
|
|
local ($_, %_);
|
|
my ($source, $encoding);
|
|
($source, $encoding) = @_;
|
|
$encoding = getlang LN_CHARSET if !defined $encoding;
|
|
return hcref_encode($encoding, h($source));
|
|
}
|
|
|
|
# a_encode: Encode and convert from plain text to HTML
|
|
sub a_encode($;$) {
|
|
local ($_, %_);
|
|
my ($source, $encoding);
|
|
($source, $encoding) = @_;
|
|
$encoding = getlang LN_CHARSET if !defined $encoding;
|
|
return hcref_encode($encoding, a2html($source));
|
|
}
|
|
|
|
# is_charset: If a piece of text is in a certain character set
|
|
sub is_charset($$) {
|
|
local ($_, %_);
|
|
my ($text, $charset);
|
|
($text, $charset) = @_;
|
|
eval { encode($charset, $text, FB_CROAK); };
|
|
return $@ eq "";
|
|
}
|
|
|
|
# is_usascii_printable: If a piece of text is US-ASCII printable
|
|
# Positive range. Decoded unicode text may have a HEX value larger than \xFF.
|
|
sub is_usascii_printable($) {
|
|
local ($_, %_);
|
|
$_ = $_[0];
|
|
# Scalar
|
|
return /^[\x20-\x7E]+$/ if ref $_ eq "";
|
|
# Array
|
|
if (ref $_ eq "ARRAY") {
|
|
@_ = @$_;
|
|
foreach (@_) {
|
|
return 0 if !is_usascii_printable $_;
|
|
}
|
|
return 1;
|
|
}
|
|
# Hash
|
|
if (ref $_ eq "HASH") {
|
|
%_ = %$_;
|
|
foreach (keys %_) {
|
|
return 0 if !is_usascii_printable $_
|
|
|| !is_usascii_printable $_{$_};
|
|
}
|
|
return 1;
|
|
}
|
|
# False for others. We have no idea about other types of data.
|
|
return 0;
|
|
}
|
|
|
|
# is_usascii_printable_text: If a piece of multi-line text is US-ASCII printable
|
|
# Positive range. Decoded unicode text may have a HEX value larger than \xFF.
|
|
sub is_usascii_printable_text($) { $_[0] =~ /^[\x20-\x7E\s]+$/; }
|
|
|
|
# all_to_trad: Convert all Simplified characters in text to Traditional Chinese
|
|
sub all_to_trad($) {
|
|
local ($_, %_);
|
|
my $text;
|
|
$text = $_[0];
|
|
if (!tied %ALL2TRAD) {
|
|
tie %ALL2TRAD, "GDBM_File", $ALL2TRAD, &GDBM_READER, 0644
|
|
or http_500 "$ALL2TRAD: $!";
|
|
}
|
|
@_ = split //, $text;
|
|
foreach my $c (@_) {
|
|
$_ = ord $c;
|
|
$c = chr $ALL2TRAD{$_} if exists $ALL2TRAD{$_};
|
|
}
|
|
return join "", @_;
|
|
}
|
|
|
|
# all_to_simp: Convert all Simplified characters in text to Traditional Chinese
|
|
sub all_to_simp($) {
|
|
local ($_, %_);
|
|
my $text;
|
|
$text = $_[0];
|
|
if (!tied %ALL2SIMP) {
|
|
tie %ALL2SIMP, "GDBM_File", $ALL2SIMP, &GDBM_READER, 0644
|
|
or http_500 "$ALL2SIMP: $!";
|
|
}
|
|
@_ = split //, $text;
|
|
foreach my $c (@_) {
|
|
$_ = ord $c;
|
|
$c = chr $ALL2SIMP{$_} if exists $ALL2SIMP{$_};
|
|
}
|
|
return join "", @_;
|
|
}
|
|
|
|
# hcref_encode: Encode text with HTML character entity references
|
|
sub hcref_encode($$) {
|
|
local ($_, %_);
|
|
my $charset;
|
|
($charset, $_) = @_;
|
|
$_ = encode($charset, $_, FB_HTMLCREF);
|
|
s/&#(\d{1,10});/exists $char2entity{chr $1}? $char2entity{chr $1}: "&#$1;";/ge;
|
|
return $_;
|
|
}
|
|
|
|
# hcref_decode: Decode octets and HTML character references
|
|
sub hcref_decode($$) {
|
|
local ($_, %_);
|
|
my $charset;
|
|
($charset, $_) = @_;
|
|
eval { $_ = decode($charset, $_, FB_CROAK); 1; };
|
|
return undef if $@ ne "";
|
|
return hcref2char($_);
|
|
}
|
|
|
|
# page_encode: encode() an HTML page
|
|
sub page_encode($$) {
|
|
local ($_, %_);
|
|
my $charset;
|
|
($_, $charset) = @_;
|
|
$_ = hcref_encode($charset, $_);
|
|
return if !defined $_;
|
|
$charset = h($charset);
|
|
s/<!--selima:charset-->/$charset/g;
|
|
return $_;
|
|
}
|
|
|
|
# hcref2char: Decode HTML character entity references
|
|
# The input and output should have already been decoded
|
|
# It preserves encoded US-ASCII characters. US-ASCII characters do
|
|
# not need to be encoded. There must be some reason to encode them.
|
|
# (like @/@, </<, >/>, etc.)
|
|
sub hcref2char($) {
|
|
local ($_, %_);
|
|
$_ = $_[0];
|
|
|
|
# Numeric character references (decimal)
|
|
s/&#(\d{1,10});/my $c = decode("UTF-32LE", pack("V", $1), FB_CROAK);
|
|
!hcref2char_keep($c)? $c: "&#$1;";/ge;
|
|
# Numeric character references (hexadecimal)
|
|
s/&#x([0-9a-f]{1,8});/my $c = decode("UTF-32LE", pack("V", hex $1), FB_CROAK);
|
|
!hcref2char_keep($c)? $c: "&#x$1;";/gei;
|
|
# Character entity references
|
|
s/&([a-z]{2,8}\d{0,2};)/
|
|
if ( !exists $entity2char{$1}
|
|
|| hcref2char_keep($entity2char{$1})) {
|
|
"&$1";
|
|
} else {
|
|
$entity2char{$1};
|
|
} /gei;
|
|
|
|
return $_;
|
|
}
|
|
|
|
# hcref2char_keep: If this character should not be decoded
|
|
# HTML characters are not decoded (<, >, ", &) " gettext
|
|
sub hcref2char_keep($) { $_[0] =~ /^[<>s"&]$/; } # " gettext
|
|
|
|
return 1;
|