selima-perl/lib/perl5/Selima/Unicode.pm

# Selima Website Content Management System
# Unicode.pm: The subroutines to deal with character set encodings.

#  Copyright (c) 2003-2018 imacat.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

# Author: imacat <imacat@mail.imacat.idv.tw>
# First written: 2003-03-23

package Selima::Unicode;
use 5.008;
use strict;
use warnings;
use base qw(Exporter);
use vars qw(@EXPORT @EXPORT_OK);
BEGIN {
@EXPORT = qw();
push @EXPORT, qw(h_encode a_encode);
push @EXPORT, qw(is_charset is_usascii_printable is_usascii_printable_text);
push @EXPORT, qw(all_to_trad all_to_simp);
push @EXPORT, qw(hcref_encode hcref_decode page_encode);
@EXPORT_OK = @EXPORT;
# Prototype declaration
sub h_encode($;$);
sub a_encode($;$);
sub is_charset($$);
sub is_usascii_printable($);
sub is_usascii_printable_text($);
sub all_to_trad($);
sub all_to_simp($);
sub hcref_encode($$);
sub hcref_decode($$);
sub page_encode($$);
sub hcref2char($);
sub hcref2char_keep($);
}

use Config qw(%Config);
use Encode qw(encode decode from_to FB_CROAK FB_HTMLCREF);
use File::Basename qw(dirname);
use File::Spec::Functions qw(splitdir catdir);
use GDBM_File qw(GDBM_READER);
use HTML::Entities qw(%entity2char %char2entity);

use Selima::A2HTML;
use Selima::HTTP;
use Selima::DataVars qw(:lninfo);
use Selima::GetLang;
use Selima::ShortCut;

use vars qw(%BIG5_RANGES $ETEN_EXT);
%BIG5_RANGES = (
    "常用字"    => [ 0xA440, 0xC67E  ],     # 5401
    "次常用字"  => [ 0xC940, 0xF9D5  ],     # 7652
    "常用符號"  => [ 0xA140, 0xA3BF  ],     #  408
#    "控制碼"    => [ 0xA3C0, 0xA3E0  ],    #   33
#    "罕用符號"  => [ 0xC6A1, 0xC8FE  ],
);

# Big5 is indeed Big5-ETen, which has ETen extension character that covers
#   Japanese, where ETen extension is not covered by Microsoft CP950.
#   Using Big5 (Big5-ETen) will preserve Japanese characters that cannot
#   be displayed under Microsoft Internet Explorer.
# CP950 converts European characters into US-ASCII.  European characters
#   will not be output correctly.

use vars qw($ALL2TRAD %ALL2TRAD $ALL2SIMP %ALL2SIMP);
BEGIN {
my (@dirs, $arch);
@dirs = splitdir(dirname((caller 1)[1]));
$arch = $Config{"myarchname"};
$arch =~ s/-[^\-]+$//;
$ALL2TRAD = catdir(@dirs[0..$#dirs-2], $arch, "all2trad.db");
$ALL2SIMP = catdir(@dirs[0..$#dirs-2], $arch, "all2simp.db");
}

# h_encode: Encode and escape HTML special characters
sub h_encode($;$) {
    local ($_, %_);
    my ($source, $encoding);
    ($source, $encoding) = @_;
    $encoding = getlang LN_CHARSET if !defined $encoding;
    return hcref_encode($encoding, h($source));
}

# a_encode: Encode and convert from plain text to HTML
sub a_encode($;$) {
    local ($_, %_);
    my ($source, $encoding);
    ($source, $encoding) = @_;
    $encoding = getlang LN_CHARSET if !defined $encoding;
    return hcref_encode($encoding, a2html($source));
}

# is_charset: If a piece of text is in a certain character set
sub is_charset($$) {
    local ($_, %_);
    my ($text, $charset);
    ($text, $charset) = @_;
    eval { encode($charset, $text, FB_CROAK); };
    return $@ eq "";
}

# is_usascii_printable: If a piece of text is US-ASCII printable
#   Positive range.  Decoded unicode text may have a HEX value larger than \xFF.
sub is_usascii_printable($) {
    local ($_, %_);
    $_ = $_[0];
    # Scalar
    return /^[\x20-\x7E]+$/ if ref $_ eq "";
    # Array
    if (ref $_ eq "ARRAY") {
        @_ = @$_;
        foreach (@_) {
            return 0 if !is_usascii_printable $_;
        }
        return 1;
    }
    # Hash
    if (ref $_ eq "HASH") {
        %_ = %$_;
        foreach (keys %_) {
            return 0 if !is_usascii_printable $_
                        || !is_usascii_printable $_{$_};
        }
        return 1;
    }
    # False for others.  We have no idea about other types of data.
    return 0;
}

# is_usascii_printable_text: If a piece of multi-line text is US-ASCII printable
#   Positive range.  Decoded unicode text may have a HEX value larger than \xFF.
sub is_usascii_printable_text($) { $_[0] =~ /^[\x20-\x7E\s]+$/; }

# all_to_trad: Convert all Simplified characters in text to Traditional Chinese
sub all_to_trad($) {
    local ($_, %_);
    my $text;
    $text = $_[0];
    if (!tied %ALL2TRAD) {
        tie %ALL2TRAD, "GDBM_File", $ALL2TRAD, &GDBM_READER, 0644
                                        or http_500 "$ALL2TRAD: $!";
    }
    @_ = split //, $text;
    foreach my $c (@_) {
        $_ = ord $c;
        $c = chr $ALL2TRAD{$_} if exists $ALL2TRAD{$_};
    }
    return join "", @_;
}

# all_to_simp: Convert all Simplified characters in text to Traditional Chinese
sub all_to_simp($) {
    local ($_, %_);
    my $text;
    $text = $_[0];
    if (!tied %ALL2SIMP) {
        tie %ALL2SIMP, "GDBM_File", $ALL2SIMP, &GDBM_READER, 0644
                                        or http_500 "$ALL2SIMP: $!";
    }
    @_ = split //, $text;
    foreach my $c (@_) {
        $_ = ord $c;
        $c = chr $ALL2SIMP{$_} if exists $ALL2SIMP{$_};
    }
    return join "", @_;
}

# hcref_encode: Encode text with HTML character entity references
sub hcref_encode($$) {
    local ($_, %_);
    my $charset;
    ($charset, $_) = @_;
    $_ = encode($charset, $_, FB_HTMLCREF);
    s/&#(\d{1,10});/exists $char2entity{chr $1}? $char2entity{chr $1}: "&#$1;";/ge;
    return $_;
}

# hcref_decode: Decode octets and HTML character references
sub hcref_decode($$) {
    local ($_, %_);
    my $charset;
    ($charset, $_) = @_;
    eval { $_ = decode($charset, $_, FB_CROAK); 1; };
    return undef if $@ ne "";
    return hcref2char($_);
}

# page_encode: encode() an HTML page
sub page_encode($$) {
    local ($_, %_);
    my $charset;
    ($_, $charset) = @_;
    $_ = hcref_encode($charset, $_);
    return if !defined $_;
    $charset = h($charset);
    s/<!--selima:charset-->/$charset/g;
    return $_;
}

# hcref2char: Decode HTML character entity references
#   The input and output should have already been decoded
#   It preserves encoded US-ASCII characters.  US-ASCII characters do
#     not need to be encoded.  There must be some reason to encode them.
#     (like &#64;/@, &lt;/<, &gt/>, etc.)
sub hcref2char($) {
    local ($_, %_);
    $_ = $_[0];

    # Numeric character references (decimal)
    s/&#(\d{1,10});/my $c = decode("UTF-32LE", pack("V", $1), FB_CROAK);
        !hcref2char_keep($c)? $c: "&#$1;";/ge;
    # Numeric character references (hexadecimal)
    s/&#x([0-9a-f]{1,8});/my $c = decode("UTF-32LE", pack("V", hex $1), FB_CROAK);
        !hcref2char_keep($c)? $c: "&#x$1;";/gei;
    # Character entity references
    s/&([a-z]{2,8}\d{0,2};)/
        if (    !exists $entity2char{$1}
                || hcref2char_keep($entity2char{$1})) {
            "&$1";
        } else {
             $entity2char{$1};
        } /gei;

    return $_;
}

# hcref2char_keep: If this character should not be decoded
#   HTML characters are not decoded (<, >, ", &)    " gettext
sub hcref2char_keep($) { $_[0] =~ /^[<>s"&]$/; }    # " gettext

return 1;