Initial commit.
This commit is contained in:
244
lib/perl5/Selima/Unicode.pm
Normal file
244
lib/perl5/Selima/Unicode.pm
Normal file
@@ -0,0 +1,244 @@
|
||||
# Selima Website Content Management System
|
||||
# Unicode.pm: The subroutines to deal with character set encodings.
|
||||
|
||||
# Copyright (c) 2003-2018 imacat.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Author: imacat <imacat@mail.imacat.idv.tw>
|
||||
# First written: 2003-03-23
|
||||
|
||||
package Selima::Unicode;
|
||||
use 5.008;
|
||||
use strict;
|
||||
use warnings;
|
||||
use base qw(Exporter);
|
||||
use vars qw(@EXPORT @EXPORT_OK);
|
||||
BEGIN {
|
||||
@EXPORT = qw();
|
||||
push @EXPORT, qw(h_encode a_encode);
|
||||
push @EXPORT, qw(is_charset is_usascii_printable is_usascii_printable_text);
|
||||
push @EXPORT, qw(all_to_trad all_to_simp);
|
||||
push @EXPORT, qw(hcref_encode hcref_decode page_encode);
|
||||
@EXPORT_OK = @EXPORT;
|
||||
# Prototype declaration
|
||||
sub h_encode($;$);
|
||||
sub a_encode($;$);
|
||||
sub is_charset($$);
|
||||
sub is_usascii_printable($);
|
||||
sub is_usascii_printable_text($);
|
||||
sub all_to_trad($);
|
||||
sub all_to_simp($);
|
||||
sub hcref_encode($$);
|
||||
sub hcref_decode($$);
|
||||
sub page_encode($$);
|
||||
sub hcref2char($);
|
||||
sub hcref2char_keep($);
|
||||
}
|
||||
|
||||
use Config qw(%Config);
|
||||
use Encode qw(encode decode from_to FB_CROAK FB_HTMLCREF);
|
||||
use File::Basename qw(dirname);
|
||||
use File::Spec::Functions qw(splitdir catdir);
|
||||
use GDBM_File qw(GDBM_READER);
|
||||
use HTML::Entities qw(%entity2char %char2entity);
|
||||
|
||||
use Selima::A2HTML;
|
||||
use Selima::HTTP;
|
||||
use Selima::DataVars qw(:lninfo);
|
||||
use Selima::GetLang;
|
||||
use Selima::ShortCut;
|
||||
|
||||
use vars qw(%BIG5_RANGES $ETEN_EXT);
|
||||
%BIG5_RANGES = (
|
||||
"常用字" => [ 0xA440, 0xC67E ], # 5401
|
||||
"次常用字" => [ 0xC940, 0xF9D5 ], # 7652
|
||||
"常用符號" => [ 0xA140, 0xA3BF ], # 408
|
||||
# "控制碼" => [ 0xA3C0, 0xA3E0 ], # 33
|
||||
# "罕用符號" => [ 0xC6A1, 0xC8FE ],
|
||||
);
|
||||
|
||||
# Big5 is indeed Big5-ETen, which has ETen extension character that covers
|
||||
# Japanese, where ETen extension is not covered by Microsoft CP950.
|
||||
# Using Big5 (Big5-ETen) will preserve Japanese characters that cannot
|
||||
# be displayed under Microsoft Internet Explorer.
|
||||
# CP950 converts European characters into US-ASCII. European characters
|
||||
# will not be output correctly.
|
||||
|
||||
use vars qw($ALL2TRAD %ALL2TRAD $ALL2SIMP %ALL2SIMP);
|
||||
BEGIN {
|
||||
my (@dirs, $arch);
|
||||
@dirs = splitdir(dirname((caller 1)[1]));
|
||||
$arch = $Config{"myarchname"};
|
||||
$arch =~ s/-[^\-]+$//;
|
||||
$ALL2TRAD = catdir(@dirs[0..$#dirs-2], $arch, "all2trad.db");
|
||||
$ALL2SIMP = catdir(@dirs[0..$#dirs-2], $arch, "all2simp.db");
|
||||
}
|
||||
|
||||
# h_encode: Encode and escape HTML special characters
|
||||
sub h_encode($;$) {
|
||||
local ($_, %_);
|
||||
my ($source, $encoding);
|
||||
($source, $encoding) = @_;
|
||||
$encoding = getlang LN_CHARSET if !defined $encoding;
|
||||
return hcref_encode($encoding, h($source));
|
||||
}
|
||||
|
||||
# a_encode: Encode and convert from plain text to HTML
|
||||
sub a_encode($;$) {
|
||||
local ($_, %_);
|
||||
my ($source, $encoding);
|
||||
($source, $encoding) = @_;
|
||||
$encoding = getlang LN_CHARSET if !defined $encoding;
|
||||
return hcref_encode($encoding, a2html($source));
|
||||
}
|
||||
|
||||
# is_charset: If a piece of text is in a certain character set
|
||||
sub is_charset($$) {
|
||||
local ($_, %_);
|
||||
my ($text, $charset);
|
||||
($text, $charset) = @_;
|
||||
eval { encode($charset, $text, FB_CROAK); };
|
||||
return $@ eq "";
|
||||
}
|
||||
|
||||
# is_usascii_printable: If a piece of text is US-ASCII printable
|
||||
# Positive range. Decoded unicode text may have a HEX value larger than \xFF.
|
||||
sub is_usascii_printable($) {
|
||||
local ($_, %_);
|
||||
$_ = $_[0];
|
||||
# Scalar
|
||||
return /^[\x20-\x7E]+$/ if ref $_ eq "";
|
||||
# Array
|
||||
if (ref $_ eq "ARRAY") {
|
||||
@_ = @$_;
|
||||
foreach (@_) {
|
||||
return 0 if !is_usascii_printable $_;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
# Hash
|
||||
if (ref $_ eq "HASH") {
|
||||
%_ = %$_;
|
||||
foreach (keys %_) {
|
||||
return 0 if !is_usascii_printable $_
|
||||
|| !is_usascii_printable $_{$_};
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
# False for others. We have no idea about other types of data.
|
||||
return 0;
|
||||
}
|
||||
|
||||
# is_usascii_printable_text: If a piece of multi-line text is US-ASCII printable
|
||||
# Positive range. Decoded unicode text may have a HEX value larger than \xFF.
|
||||
sub is_usascii_printable_text($) { $_[0] =~ /^[\x20-\x7E\s]+$/; }
|
||||
|
||||
# all_to_trad: Convert all Simplified characters in text to Traditional Chinese
|
||||
sub all_to_trad($) {
|
||||
local ($_, %_);
|
||||
my $text;
|
||||
$text = $_[0];
|
||||
if (!tied %ALL2TRAD) {
|
||||
tie %ALL2TRAD, "GDBM_File", $ALL2TRAD, &GDBM_READER, 0644
|
||||
or http_500 "$ALL2TRAD: $!";
|
||||
}
|
||||
@_ = split //, $text;
|
||||
foreach my $c (@_) {
|
||||
$_ = ord $c;
|
||||
$c = chr $ALL2TRAD{$_} if exists $ALL2TRAD{$_};
|
||||
}
|
||||
return join "", @_;
|
||||
}
|
||||
|
||||
# all_to_simp: Convert all Simplified characters in text to Traditional Chinese
|
||||
sub all_to_simp($) {
|
||||
local ($_, %_);
|
||||
my $text;
|
||||
$text = $_[0];
|
||||
if (!tied %ALL2SIMP) {
|
||||
tie %ALL2SIMP, "GDBM_File", $ALL2SIMP, &GDBM_READER, 0644
|
||||
or http_500 "$ALL2SIMP: $!";
|
||||
}
|
||||
@_ = split //, $text;
|
||||
foreach my $c (@_) {
|
||||
$_ = ord $c;
|
||||
$c = chr $ALL2SIMP{$_} if exists $ALL2SIMP{$_};
|
||||
}
|
||||
return join "", @_;
|
||||
}
|
||||
|
||||
# hcref_encode: Encode text with HTML character entity references
|
||||
sub hcref_encode($$) {
|
||||
local ($_, %_);
|
||||
my $charset;
|
||||
($charset, $_) = @_;
|
||||
$_ = encode($charset, $_, FB_HTMLCREF);
|
||||
s/&#(\d{1,10});/exists $char2entity{chr $1}? $char2entity{chr $1}: "&#$1;";/ge;
|
||||
return $_;
|
||||
}
|
||||
|
||||
# hcref_decode: Decode octets and HTML character references
|
||||
sub hcref_decode($$) {
|
||||
local ($_, %_);
|
||||
my $charset;
|
||||
($charset, $_) = @_;
|
||||
eval { $_ = decode($charset, $_, FB_CROAK); 1; };
|
||||
return undef if $@ ne "";
|
||||
return hcref2char($_);
|
||||
}
|
||||
|
||||
# page_encode: encode() an HTML page
|
||||
sub page_encode($$) {
|
||||
local ($_, %_);
|
||||
my $charset;
|
||||
($_, $charset) = @_;
|
||||
$_ = hcref_encode($charset, $_);
|
||||
return if !defined $_;
|
||||
$charset = h($charset);
|
||||
s/<!--selima:charset-->/$charset/g;
|
||||
return $_;
|
||||
}
|
||||
|
||||
# hcref2char: Decode HTML character entity references
|
||||
# The input and output should have already been decoded
|
||||
# It preserves encoded US-ASCII characters. US-ASCII characters do
|
||||
# not need to be encoded. There must be some reason to encode them.
|
||||
# (like @/@, </<, >/>, etc.)
|
||||
sub hcref2char($) {
|
||||
local ($_, %_);
|
||||
$_ = $_[0];
|
||||
|
||||
# Numeric character references (decimal)
|
||||
s/&#(\d{1,10});/my $c = decode("UTF-32LE", pack("V", $1), FB_CROAK);
|
||||
!hcref2char_keep($c)? $c: "&#$1;";/ge;
|
||||
# Numeric character references (hexadecimal)
|
||||
s/&#x([0-9a-f]{1,8});/my $c = decode("UTF-32LE", pack("V", hex $1), FB_CROAK);
|
||||
!hcref2char_keep($c)? $c: "&#x$1;";/gei;
|
||||
# Character entity references
|
||||
s/&([a-z]{2,8}\d{0,2};)/
|
||||
if ( !exists $entity2char{$1}
|
||||
|| hcref2char_keep($entity2char{$1})) {
|
||||
"&$1";
|
||||
} else {
|
||||
$entity2char{$1};
|
||||
} /gei;
|
||||
|
||||
return $_;
|
||||
}
|
||||
|
||||
# hcref2char_keep: If this character should not be decoded
|
||||
# HTML characters are not decoded (<, >, ", &) " gettext
|
||||
sub hcref2char_keep($) { $_[0] =~ /^[<>s"&]$/; } # " gettext
|
||||
|
||||
return 1;
|
||||
Reference in New Issue
Block a user