Files
selima-perl/lib/perl5/Selima/Unicode.pm
2026-03-10 21:31:43 +08:00

245 lines
7.3 KiB
Perl

# Selima Website Content Management System
# Unicode.pm: The subroutines to deal with character set encodings.
# Copyright (c) 2003-2018 imacat.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Author: imacat <imacat@mail.imacat.idv.tw>
# First written: 2003-03-23
package Selima::Unicode;
use 5.008;
use strict;
use warnings;
use base qw(Exporter);
use vars qw(@EXPORT @EXPORT_OK);
BEGIN {
@EXPORT = qw();
push @EXPORT, qw(h_encode a_encode);
push @EXPORT, qw(is_charset is_usascii_printable is_usascii_printable_text);
push @EXPORT, qw(all_to_trad all_to_simp);
push @EXPORT, qw(hcref_encode hcref_decode page_encode);
@EXPORT_OK = @EXPORT;
# Prototype declaration
sub h_encode($;$);
sub a_encode($;$);
sub is_charset($$);
sub is_usascii_printable($);
sub is_usascii_printable_text($);
sub all_to_trad($);
sub all_to_simp($);
sub hcref_encode($$);
sub hcref_decode($$);
sub page_encode($$);
sub hcref2char($);
sub hcref2char_keep($);
}
use Config qw(%Config);
use Encode qw(encode decode from_to FB_CROAK FB_HTMLCREF);
use File::Basename qw(dirname);
use File::Spec::Functions qw(splitdir catdir);
use GDBM_File qw(GDBM_READER);
use HTML::Entities qw(%entity2char %char2entity);
use Selima::A2HTML;
use Selima::HTTP;
use Selima::DataVars qw(:lninfo);
use Selima::GetLang;
use Selima::ShortCut;
use vars qw(%BIG5_RANGES $ETEN_EXT);
%BIG5_RANGES = (
"常用字" => [ 0xA440, 0xC67E ], # 5401
"次常用字" => [ 0xC940, 0xF9D5 ], # 7652
"常用符號" => [ 0xA140, 0xA3BF ], # 408
# "控制碼" => [ 0xA3C0, 0xA3E0 ], # 33
# "罕用符號" => [ 0xC6A1, 0xC8FE ],
);
# Big5 is indeed Big5-ETen, which has ETen extension character that covers
# Japanese, where ETen extension is not covered by Microsoft CP950.
# Using Big5 (Big5-ETen) will preserve Japanese characters that cannot
# be displayed under Microsoft Internet Explorer.
# CP950 converts European characters into US-ASCII. European characters
# will not be output correctly.
use vars qw($ALL2TRAD %ALL2TRAD $ALL2SIMP %ALL2SIMP);
BEGIN {
my (@dirs, $arch);
@dirs = splitdir(dirname((caller 1)[1]));
$arch = $Config{"myarchname"};
$arch =~ s/-[^\-]+$//;
$ALL2TRAD = catdir(@dirs[0..$#dirs-2], $arch, "all2trad.db");
$ALL2SIMP = catdir(@dirs[0..$#dirs-2], $arch, "all2simp.db");
}
# h_encode: Encode and escape HTML special characters
sub h_encode($;$) {
local ($_, %_);
my ($source, $encoding);
($source, $encoding) = @_;
$encoding = getlang LN_CHARSET if !defined $encoding;
return hcref_encode($encoding, h($source));
}
# a_encode: Encode and convert from plain text to HTML
sub a_encode($;$) {
local ($_, %_);
my ($source, $encoding);
($source, $encoding) = @_;
$encoding = getlang LN_CHARSET if !defined $encoding;
return hcref_encode($encoding, a2html($source));
}
# is_charset: If a piece of text is in a certain character set
sub is_charset($$) {
local ($_, %_);
my ($text, $charset);
($text, $charset) = @_;
eval { encode($charset, $text, FB_CROAK); };
return $@ eq "";
}
# is_usascii_printable: If a piece of text is US-ASCII printable
# Positive range. Decoded unicode text may have a HEX value larger than \xFF.
sub is_usascii_printable($) {
local ($_, %_);
$_ = $_[0];
# Scalar
return /^[\x20-\x7E]+$/ if ref $_ eq "";
# Array
if (ref $_ eq "ARRAY") {
@_ = @$_;
foreach (@_) {
return 0 if !is_usascii_printable $_;
}
return 1;
}
# Hash
if (ref $_ eq "HASH") {
%_ = %$_;
foreach (keys %_) {
return 0 if !is_usascii_printable $_
|| !is_usascii_printable $_{$_};
}
return 1;
}
# False for others. We have no idea about other types of data.
return 0;
}
# is_usascii_printable_text: If a piece of multi-line text is US-ASCII printable
# Positive range. Decoded unicode text may have a HEX value larger than \xFF.
sub is_usascii_printable_text($) { $_[0] =~ /^[\x20-\x7E\s]+$/; }
# all_to_trad: Convert all Simplified characters in text to Traditional Chinese
sub all_to_trad($) {
local ($_, %_);
my $text;
$text = $_[0];
if (!tied %ALL2TRAD) {
tie %ALL2TRAD, "GDBM_File", $ALL2TRAD, &GDBM_READER, 0644
or http_500 "$ALL2TRAD: $!";
}
@_ = split //, $text;
foreach my $c (@_) {
$_ = ord $c;
$c = chr $ALL2TRAD{$_} if exists $ALL2TRAD{$_};
}
return join "", @_;
}
# all_to_simp: Convert all Simplified characters in text to Traditional Chinese
sub all_to_simp($) {
local ($_, %_);
my $text;
$text = $_[0];
if (!tied %ALL2SIMP) {
tie %ALL2SIMP, "GDBM_File", $ALL2SIMP, &GDBM_READER, 0644
or http_500 "$ALL2SIMP: $!";
}
@_ = split //, $text;
foreach my $c (@_) {
$_ = ord $c;
$c = chr $ALL2SIMP{$_} if exists $ALL2SIMP{$_};
}
return join "", @_;
}
# hcref_encode: Encode text with HTML character entity references
sub hcref_encode($$) {
local ($_, %_);
my $charset;
($charset, $_) = @_;
$_ = encode($charset, $_, FB_HTMLCREF);
s/&#(\d{1,10});/exists $char2entity{chr $1}? $char2entity{chr $1}: "&#$1;";/ge;
return $_;
}
# hcref_decode: Decode octets and HTML character references
sub hcref_decode($$) {
local ($_, %_);
my $charset;
($charset, $_) = @_;
eval { $_ = decode($charset, $_, FB_CROAK); 1; };
return undef if $@ ne "";
return hcref2char($_);
}
# page_encode: encode() an HTML page
sub page_encode($$) {
local ($_, %_);
my $charset;
($_, $charset) = @_;
$_ = hcref_encode($charset, $_);
return if !defined $_;
$charset = h($charset);
s/<!--selima:charset-->/$charset/g;
return $_;
}
# hcref2char: Decode HTML character entity references
# The input and output should have already been decoded
# It preserves encoded US-ASCII characters. US-ASCII characters do
# not need to be encoded. There must be some reason to encode them.
# (like &#64;/@, &lt;/<, &gt/>, etc.)
sub hcref2char($) {
local ($_, %_);
$_ = $_[0];
# Numeric character references (decimal)
s/&#(\d{1,10});/my $c = decode("UTF-32LE", pack("V", $1), FB_CROAK);
!hcref2char_keep($c)? $c: "&#$1;";/ge;
# Numeric character references (hexadecimal)
s/&#x([0-9a-f]{1,8});/my $c = decode("UTF-32LE", pack("V", hex $1), FB_CROAK);
!hcref2char_keep($c)? $c: "&#x$1;";/gei;
# Character entity references
s/&([a-z]{2,8}\d{0,2};)/
if ( !exists $entity2char{$1}
|| hcref2char_keep($entity2char{$1})) {
"&$1";
} else {
$entity2char{$1};
} /gei;
return $_;
}
# hcref2char_keep: If this character should not be decoded
# HTML characters are not decoded (<, >, ", &) " gettext
sub hcref2char_keep($) { $_[0] =~ /^[<>s"&]$/; } # " gettext
return 1;