Initial commit.

2026-03-10 21:25:26 +08:00
commit 78739bf725
3089 changed files with 472990 additions and 0 deletions
--- a/lib/perl5/Selima/Unicode.pm
+++ b/lib/perl5/Selima/Unicode.pm
@@ -0,0 +1,244 @@
+# Selima Website Content Management System
+# Unicode.pm: The subroutines to deal with character set encodings.
+
+#  Copyright (c) 2003-2018 imacat.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+# Author: imacat <imacat@mail.imacat.idv.tw>
+# First written: 2003-03-23
+
+package Selima::Unicode;
+use 5.008;
+use strict;
+use warnings;
+use base qw(Exporter);
+use vars qw(@EXPORT @EXPORT_OK);
+BEGIN {
+@EXPORT = qw();
+push @EXPORT, qw(h_encode a_encode);
+push @EXPORT, qw(is_charset is_usascii_printable is_usascii_printable_text);
+push @EXPORT, qw(all_to_trad all_to_simp);
+push @EXPORT, qw(hcref_encode hcref_decode page_encode);
+@EXPORT_OK = @EXPORT;
+# Prototype declaration
+sub h_encode($;$);
+sub a_encode($;$);
+sub is_charset($$);
+sub is_usascii_printable($);
+sub is_usascii_printable_text($);
+sub all_to_trad($);
+sub all_to_simp($);
+sub hcref_encode($$);
+sub hcref_decode($$);
+sub page_encode($$);
+sub hcref2char($);
+sub hcref2char_keep($);
+}
+
+use Config qw(%Config);
+use Encode qw(encode decode from_to FB_CROAK FB_HTMLCREF);
+use File::Basename qw(dirname);
+use File::Spec::Functions qw(splitdir catdir);
+use GDBM_File qw(GDBM_READER);
+use HTML::Entities qw(%entity2char %char2entity);
+
+use Selima::A2HTML;
+use Selima::HTTP;
+use Selima::DataVars qw(:lninfo);
+use Selima::GetLang;
+use Selima::ShortCut;
+
+use vars qw(%BIG5_RANGES $ETEN_EXT);
+%BIG5_RANGES = (
+    "常用字"    => [ 0xA440, 0xC67E  ],     # 5401
+    "次常用字"  => [ 0xC940, 0xF9D5  ],     # 7652
+    "常用符號"  => [ 0xA140, 0xA3BF  ],     #  408
+#    "控制碼"    => [ 0xA3C0, 0xA3E0  ],    #   33
+#    "罕用符號"  => [ 0xC6A1, 0xC8FE  ],
+);
+
+# Big5 is indeed Big5-ETen, which has ETen extension character that covers
+#   Japanese, where ETen extension is not covered by Microsoft CP950.
+#   Using Big5 (Big5-ETen) will preserve Japanese characters that cannot
+#   be displayed under Microsoft Internet Explorer.
+# CP950 converts European characters into US-ASCII.  European characters
+#   will not be output correctly.
+
+use vars qw($ALL2TRAD %ALL2TRAD $ALL2SIMP %ALL2SIMP);
+BEGIN {
+my (@dirs, $arch);
+@dirs = splitdir(dirname((caller 1)[1]));
+$arch = $Config{"myarchname"};
+$arch =~ s/-[^\-]+$//;
+$ALL2TRAD = catdir(@dirs[0..$#dirs-2], $arch, "all2trad.db");
+$ALL2SIMP = catdir(@dirs[0..$#dirs-2], $arch, "all2simp.db");
+}
+
+# h_encode: Encode and escape HTML special characters
+sub h_encode($;$) {
+    local ($_, %_);
+    my ($source, $encoding);
+    ($source, $encoding) = @_;
+    $encoding = getlang LN_CHARSET if !defined $encoding;
+    return hcref_encode($encoding, h($source));
+}
+
+# a_encode: Encode and convert from plain text to HTML
+sub a_encode($;$) {
+    local ($_, %_);
+    my ($source, $encoding);
+    ($source, $encoding) = @_;
+    $encoding = getlang LN_CHARSET if !defined $encoding;
+    return hcref_encode($encoding, a2html($source));
+}
+
+# is_charset: If a piece of text is in a certain character set
+sub is_charset($$) {
+    local ($_, %_);
+    my ($text, $charset);
+    ($text, $charset) = @_;
+    eval { encode($charset, $text, FB_CROAK); };
+    return $@ eq "";
+}
+
+# is_usascii_printable: If a piece of text is US-ASCII printable
+#   Positive range.  Decoded unicode text may have a HEX value larger than \xFF.
+sub is_usascii_printable($) {
+    local ($_, %_);
+    $_ = $_[0];
+    # Scalar
+    return /^[\x20-\x7E]+$/ if ref $_ eq "";
+    # Array
+    if (ref $_ eq "ARRAY") {
+        @_ = @$_;
+        foreach (@_) {
+            return 0 if !is_usascii_printable $_;
+        }
+        return 1;
+    }
+    # Hash
+    if (ref $_ eq "HASH") {
+        %_ = %$_;
+        foreach (keys %_) {
+            return 0 if !is_usascii_printable $_
+                        || !is_usascii_printable $_{$_};
+        }
+        return 1;
+    }
+    # False for others.  We have no idea about other types of data.
+    return 0;
+}
+
+# is_usascii_printable_text: If a piece of multi-line text is US-ASCII printable
+#   Positive range.  Decoded unicode text may have a HEX value larger than \xFF.
+sub is_usascii_printable_text($) { $_[0] =~ /^[\x20-\x7E\s]+$/; }
+
+# all_to_trad: Convert all Simplified characters in text to Traditional Chinese
+sub all_to_trad($) {
+    local ($_, %_);
+    my $text;
+    $text = $_[0];
+    if (!tied %ALL2TRAD) {
+        tie %ALL2TRAD, "GDBM_File", $ALL2TRAD, &GDBM_READER, 0644
+                                        or http_500 "$ALL2TRAD: $!";
+    }
+    @_ = split //, $text;
+    foreach my $c (@_) {
+        $_ = ord $c;
+        $c = chr $ALL2TRAD{$_} if exists $ALL2TRAD{$_};
+    }
+    return join "", @_;
+}
+
+# all_to_simp: Convert all Simplified characters in text to Traditional Chinese
+sub all_to_simp($) {
+    local ($_, %_);
+    my $text;
+    $text = $_[0];
+    if (!tied %ALL2SIMP) {
+        tie %ALL2SIMP, "GDBM_File", $ALL2SIMP, &GDBM_READER, 0644
+                                        or http_500 "$ALL2SIMP: $!";
+    }
+    @_ = split //, $text;
+    foreach my $c (@_) {
+        $_ = ord $c;
+        $c = chr $ALL2SIMP{$_} if exists $ALL2SIMP{$_};
+    }
+    return join "", @_;
+}
+
+# hcref_encode: Encode text with HTML character entity references
+sub hcref_encode($$) {
+    local ($_, %_);
+    my $charset;
+    ($charset, $_) = @_;
+    $_ = encode($charset, $_, FB_HTMLCREF);
+    s/&#(\d{1,10});/exists $char2entity{chr $1}? $char2entity{chr $1}: "&#$1;";/ge;
+    return $_;
+}
+
+# hcref_decode: Decode octets and HTML character references
+sub hcref_decode($$) {
+    local ($_, %_);
+    my $charset;
+    ($charset, $_) = @_;
+    eval { $_ = decode($charset, $_, FB_CROAK); 1; };
+    return undef if $@ ne "";
+    return hcref2char($_);
+}
+
+# page_encode: encode() an HTML page
+sub page_encode($$) {
+    local ($_, %_);
+    my $charset;
+    ($_, $charset) = @_;
+    $_ = hcref_encode($charset, $_);
+    return if !defined $_;
+    $charset = h($charset);
+    s/<!--selima:charset-->/$charset/g;
+    return $_;
+}
+
+# hcref2char: Decode HTML character entity references
+#   The input and output should have already been decoded
+#   It preserves encoded US-ASCII characters.  US-ASCII characters do
+#     not need to be encoded.  There must be some reason to encode them.
+#     (like &#64;/@, &lt;/<, &gt/>, etc.)
+sub hcref2char($) {
+    local ($_, %_);
+    $_ = $_[0];
+    
+    # Numeric character references (decimal)
+    s/&#(\d{1,10});/my $c = decode("UTF-32LE", pack("V", $1), FB_CROAK);
+        !hcref2char_keep($c)? $c: "&#$1;";/ge;
+    # Numeric character references (hexadecimal)
+    s/&#x([0-9a-f]{1,8});/my $c = decode("UTF-32LE", pack("V", hex $1), FB_CROAK);
+        !hcref2char_keep($c)? $c: "&#x$1;";/gei;
+    # Character entity references
+    s/&([a-z]{2,8}\d{0,2};)/
+        if (    !exists $entity2char{$1}
+                || hcref2char_keep($entity2char{$1})) {
+            "&$1";
+        } else {
+             $entity2char{$1};
+        } /gei;
+    
+    return $_;
+}
+
+# hcref2char_keep: If this character should not be decoded
+#   HTML characters are not decoded (<, >, ", &)    " gettext
+sub hcref2char_keep($) { $_[0] =~ /^[<>s"&]$/; }    # " gettext
+
+return 1;