Namazu-devel-ja(旧)


[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Perl5.8Encode対応



臼田です。

knok@xxxxxxxxxxxxx wrote:
> > エンバグは心配なので、とりあえずUTF-8対応化のブランチを作ってはじ
> > めませんか。
>   HEAD でやってしまってもいいかもしれません。
インデックスのUTF-8化はゆっくりとやるとして
Perlも5.8.2が出ているようですし、Encodeモジュールを利用できるようにと
作業をはじめました。

> > ・mknmzでいままで使っているnkfは文字コード変換だけでなくmimeデコード機能
> >  も受け持っているので、Unicode変換ツールにはUTF-8変換と同時にmimeデコード
> >  機能もあるPerl5.8もしくはnkf2.0のみを対象とするということでよいか?
> >  (iconvなども選べるようにする場合はmimeデコードルーチンが別途必要になるが
> >  それでも対応しておくか。)
> >  また、国際化を考えると日本語コード処理に特化しているnkf2.0やjcode.pmの利
> >  用は後に問題になる気もします。
>   この辺りは新しいものに合わせる、で良いと思います。
configureで外部プログラムを検索している個所で
Encodeモジュール、nkf2.0のモジュール、nkf2.0
のどれかを$conf::NKFに設定することにしようと思います。
UTF-8文書の処理を煩雑にしないためにnkf1.xとlvは利用
しない形にしてしまいたいと思っています。

HEADへの変更は、こういう考え方で良いでしょうか?
大丈夫であればcommitしてもう少し作業を続けようと思います。

臼田幸生
Index: namazu/configure.in
===================================================================
RCS file: /storage/cvsroot/namazu/configure.in,v
retrieving revision 1.157
diff -u -r1.157 configure.in
--- namazu/configure.in	1 Aug 2003 08:18:10 -0000	1.157
+++ namazu/configure.in	12 Dec 2003 16:47:09 -0000
@@ -221,15 +221,23 @@
 fi
 
 AC_MSG_CHECKING(for NKF perl module)
-if $PERL -e "use lib '$PMDIR'; use NKF;" >/dev/null 2>&1; then
+if $PERL -e 'exit ($] < 5.008)'; then
+    NKF=module_encode
+    AC_MSG_RESULT(yes)
+elif $PERL -e "use lib '$PMDIR'; use NKF 2.00;" >/dev/null 2>&1; then
     NKF=module_nkf
     AC_MSG_RESULT(yes)
 else
     AC_MSG_RESULT(no)
     AC_PATH_PROG(NKF,nkf, no)
+    if test "$NKF" != "no"; then    # keep an external nkf only when it reports version 2 or later
+        if $NKF --version 2>&1 </dev/null | $PERL -0777 -ne 'exit((/Version\s+(\d+)\.\d/ && $1 >= 2) ? 1 : 0)'; then
+            NKF=no    # perl exited 0: no "Version N.x" with N of 2 or more was found in the output
+        fi
+    fi
 fi
 if test "$NKF" = "no"; then
-    AC_MSG_WARN(NKF (network kanji filter) not found)
+    AC_MSG_WARN(Encode and NKF2 (network kanji filter) not found)
 fi
 
 AC_MSG_CHECKING(for Text::Kakasi perl module)
Index: namazu/pl/codeconv.pl
===================================================================
RCS file: /storage/cvsroot/namazu/pl/codeconv.pl,v
retrieving revision 1.16
diff -u -r1.16 codeconv.pl
--- namazu/pl/codeconv.pl	1 Aug 2003 01:45:10 -0000	1.16
+++ namazu/pl/codeconv.pl	12 Dec 2003 16:47:09 -0000
@@ -126,27 +126,118 @@
     return $str;
 }
 
-# convert to EUC-JP by using NKF
-sub toeuc ($) {
-    my ($contref, $opt) = @_;
+my %nkfopt_f  = ( '7bit-jis' => 'J',  'euc-jp'   => 'E',    # source encoding name -> letter placed first in the nkf option string
+                  'utf-8'    => 'W',  'shiftjis' => 'S',
+                  'unknown'  => '' );                        # '' adds no source letter (presumably nkf then auto-detects -- confirm)
+my %nkfopt_t  = ( 'euc-jp'   => 'e',  'utf-8'    => 'w',    # target encoding name -> letter placed second in the nkf option string
+                  'shiftjis' => 's');
+
+# Convert $$contref in place from $code_f to $code_t with NKF::nkf().
+# $contref is a reference to the text; $code_f / $code_t are the encoding
+# names used as keys of %nkfopt_f / %nkfopt_t.
+sub from_to_by_nkf_m($$$){
+    my ($contref, $code_f, $code_t) = @_;
+    # Names missing from the tables contribute '' to the option string; the
+    # shared tables are no longer modified and undef is never concatenated.
+    $$contref = NKF::nkf("-" . ($nkfopt_f{$code_f} || '') . ($nkfopt_t{$code_t} || '') . "mXZ1", $$contref);
+}
+
+# Convert $$contref in place from $code_f to $code_t by piping it through
+# the external command in $conf::NKF, using a temporary file for the output.
+sub from_to_by_nkf($$$){
+    my ($contref, $code_f, $code_t) = @_;
+    # Names missing from the tables contribute '' to the option string; the
+    # shared tables are no longer modified and undef is never concatenated.
+    my $nkf_opt = "-" . ($nkfopt_f{$code_f} || '') . ($nkfopt_t{$code_t} || '') . "mXZ1";
+    my $nkftmp = util::tmpnam("NMZ.nkf");
+    {   # $nh is scoped to this block (presumably closing the pipe before the read below -- confirm)
+        my $nh = util::efopen("|$conf::NKF $nkf_opt > $nkftmp");
+        print $nh $$contref;
+    }
+    {
+        my $nh = util::efopen("< $nkftmp");
+        $$contref = util::readfile($nh);
+    }
+    unlink($nkftmp);
+}
+
+sub decode_mime_header($$){    # decode RFC 2047 encoded-words inside $$contref, in place, into charset $code_t
+    my ($contref,$code_t) = @_;
+    my $m_head = '=\?(?i:ISO-2022-JP|ISO-8859-1)\?[BbQq]\?[\x21-\x3e\x40-\x7e]+\?=';    # payload: printable ASCII except "?", so Q-encoded "=XX" and "_" match too
+    $$contref =~ s/($m_head)/de_mime_header_by_encode($1,$code_t)/ge;
+}
+
+# Decode one MIME encoded-word $str and return it as octets in $code_t.
+sub de_mime_header_by_encode($$){
+    my ($str,$code_t) = @_;
+    # decode() yields a character string; encode() it straight to the target
+    # charset instead of clearing the internal UTF-8 flag and re-converting.
+    return Encode::encode($code_t, Encode::decode('MIME-Header', $str));
+}
 
+# Convert $$contref in place from $code_f to $code_t using the backend
+# selected in $conf::NKF ('module_encode', 'module_nkf', or an nkf command).
+# A $code_f of 'unknown' is guessed with guess_encoding() (Encode backend only).
+# Returns undef on success, or an error message when guessing fails.
+sub encode_from_to($$$){
+    my ($contref, $code_f, $code_t) = @_;
+    if ($conf::NKF eq 'module_encode'){
+        if ($code_f eq 'unknown'){
+            my $enc = guess_encoding($$contref);
+            if (!ref $enc){
+                # A non-reference result means the guess failed: drop the text.
+                $$contref = "";
+                util::dprint("Encode::Guess couldn't find coding name");
+                return "Encode::Guess couldn't find encoding";
+            }
+            $code_f = $enc->name;
+            util::dprint("Encode guessed : $code_f\n");
+        }
+        Encode::from_to($$contref, $code_f ,$code_t);
+    }elsif ($conf::NKF eq 'module_nkf'){
+        from_to_by_nkf_m($contref, $code_f, $code_t);
+    }elsif ($conf::NKF ne 'no'){
+        from_to_by_nkf($contref, $code_f, $code_t);
+    }
+    return;    # explicit: a backend's own return value must not be taken for an error
+}
+
+sub to_index_charset($$) {    # convert $$contref in place to $var::INDEX_CHARSET; returns encode_from_to()'s result, or undef when util::islang("ja") is false
+    my ($contref, $code_from) = @_;
+    my $err = undef;
+    my $code_to = $var::INDEX_CHARSET;
+    if (!($code_from)) {
+        $code_from = 'unknown';    # a false/empty source name is treated as 'unknown'
+    }
     if (util::islang("ja")) {
-	my $nkf_opt = "-emXZ1";
+        $err = encode_from_to($contref,$code_from,$code_to);
+        if ($conf::NKF eq 'module_encode') { # the nkf backends are already invoked with the "m" option, so MIME decoding is done here only for Encode
+            decode_mime_header($contref,$code_to);
+            if ($code_to eq 'euc-jp'){
+                normalize_eucjp($contref);
+            }
+        }
+    }
+    return $err;
+}
+
+# Convert $$contref in place to the index charset ($var::INDEX_CHARSET),
+# passing 'unknown' as the source encoding.  The name predates the
+# configurable index charset.  Returns what to_index_charset() returns.
+sub toeuc($) {
+    my ($contref) = @_;
+    return to_index_charset($contref, 'unknown');
+}
 
-	if ($var::USE_NKF_MODULE) {
-	    $$contref = NKF::nkf($nkf_opt, $$contref); # namazu-devel-ja #3152 -> backed out, #3181
-	} else {
-	    my $nkftmp = util::tmpnam("NMZ.nkf");
-	    {
-		my $nh = util::efopen("|$conf::NKF $nkf_opt > $nkftmp");
-		print $nh $$contref;
-	    }
-	    {
-		my $nh = util::efopen("< $nkftmp");
-		$$contref = util::readfile($nh);
-	    }
-	    unlink($nkftmp);
-	}
+sub load_encode{    # load Encode and Encode::Guess at run time; returns 1 on success, 0 otherwise
+    if ($] >= 5.008){    # only attempted on perl 5.8 or later
+        eval 'use Encode qw/ from_to decode _utf8_off /;';    # string eval defers the "use" until this sub is called
+        if ($@) {return 0};
+        eval 'use Encode::Guess qw/ euc-jp shiftjis 7bit-jis utf-8 /;';    # the listed names are handed to Encode::Guess on import
+        if ($@) {return 0};
+        return 1;
+    }else{
+        return 0;
     }
 }
 
Index: namazu/pl/var.pl.in
===================================================================
RCS file: /storage/cvsroot/namazu/pl/var.pl.in,v
retrieving revision 1.15
diff -u -r1.15 var.pl.in
--- namazu/pl/var.pl.in	23 Sep 2003 07:34:21 -0000	1.15
+++ namazu/pl/var.pl.in	12 Dec 2003 16:47:09 -0000
@@ -127,7 +127,7 @@
 
 $OUTPUT_DIR = undef;
 $NO_TITLE   = N_("No title in original");
-$USE_NKF_MODULE = 0;
+$INDEX_CHARSET = 'euc-jp';
 
 %REQUIRE_ACTIONS = ();
 %RECURSIVE_ACTIONS = ();
Index: namazu/scripts/mknmz.in
===================================================================
RCS file: /storage/cvsroot/namazu/scripts/mknmz.in,v
retrieving revision 1.126
diff -u -r1.126 mknmz.in
--- namazu/scripts/mknmz.in	21 Oct 2003 05:53:48 -0000	1.126
+++ namazu/scripts/mknmz.in	12 Dec 2003 16:47:10 -0000
@@ -1015,7 +1015,9 @@
     if (util::islang("ja") && $conf::NKF =~ /^module_nkf/) {
 	require NKF || die "unable to require \"NKF\"\n";
         util::dprint(_("code conversion: using NKF module\n"));
-	$var::USE_NKF_MODULE = 1;
+    }elsif (util::islang("ja") && $conf::NKF =~ /^module_encode/) {
+        codeconv::load_encode();
+        util::dprint(_("code conversion: using Encode module\n"));
     }
     if (util::islang("ja") && $conf::WAKATI =~ /^module_kakasi/) {
 	require Text::Kakasi || die "unable to require \"Text::Kakasi\"\n";
@@ -2608,7 +2610,7 @@
      $var::Opt{'hiragana'}, $conf::DIRECTORY_INDEX,
      $usage::USAGE, $var::Opt{'noheadabst'}, $usage::VERSION_INFO,
      $var::Opt{'noencodeurl'}, $conf::HTML_SUFFIX,
-     $var::RECURSIVE_ACTIONS, $conf::META_TAGS, $var::USE_NKF_MODULE,
+     $var::RECURSIVE_ACTIONS, $conf::META_TAGS, 
      $conf::ADDRESS, $var::MAILING_ADDRESS,
      $conf::FILE_SIZE_MAX,
      $var::SupportedScheme, $var::RECURSIVE_SCHEME,