Namazu-devel-ja(旧)
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Perl5.8Encode対応
- From: Yukio USUDA <m6694ha392t@xxxxxxxxxxxxxxx>
- Date: Sat, 13 Dec 2003 02:03:36 +0900
- X-ml-name: namazu-devel-ja
- X-mail-count: 03404
臼田です。
knok@xxxxxxxxxxxxx wrote:
> > エンバグは心配なので、とりあえずUTF-8対応化のブランチを作ってはじ
> > めませんか。
> HEAD でやってしまってもいいかもしれません。
インデックスのUTF-8化はゆっくりとやるとして
Perlも5.8.2が出ているようですし、Encodeモジュールを利用できるようにと
作業をはじめました。
> > ・mknmzでいままで使っているnkfは文字コード変換だけでなくmimeデコード機能
> > も受け持っているので、Unicode変換ツールにはUTF-8変換と同時にmimeデコード
> > 機能もあるPerl5.8もしくはnkf2.0のみを対象とするということでよいか?
> > (iconvなども選べるようにする場合はmimeデコードルーチンが別途必要になるが
> > それでも対応しておくか。)
> > また、国際化を考えると日本語コード処理に特化しているnkf2.0やjcode.pmの利
> > 用は後に問題になる気もします。
> この辺りは新しいものに合わせる、で良いと思います。
configureで外部プログラムを検索している個所で
Encodeモジュール、nkf2.0のPerlモジュール(NKF)、nkf2.0コマンド
のうち最初に見つかったものを$conf::NKFに設定することにしようと思います。
UTF-8文書の処理を煩雑にしないためにnkf1.xとlvは利用
しない形にしてしまいたいと思っています。
HEADへの変更は、こういう考え方で良いでしょうか?
大丈夫であればcommitしてもう少し作業を続けようと思います。
臼田幸生
Index: namazu/configure.in
===================================================================
RCS file: /storage/cvsroot/namazu/configure.in,v
retrieving revision 1.157
diff -u -r1.157 configure.in
--- namazu/configure.in 1 Aug 2003 08:18:10 -0000 1.157
+++ namazu/configure.in 12 Dec 2003 16:47:09 -0000
@@ -221,15 +221,23 @@
fi
AC_MSG_CHECKING(for NKF perl module)
-if $PERL -e "use lib '$PMDIR'; use NKF;" >/dev/null 2>&1; then
+if $PERL -e 'exit ($] < 5.008)'; then
+ NKF=module_encode
+ AC_MSG_RESULT(yes)
+elif $PERL -e "use lib '$PMDIR'; use NKF 2.00;" >/dev/null 2>&1; then
NKF=module_nkf
AC_MSG_RESULT(yes)
else
AC_MSG_RESULT(no)
AC_PATH_PROG(NKF,nkf, no)
+ if test "$NKF" != "no"; then
+ if $NKF --version 2>&1 > /dev/null | $PERL -e ' /Version\s+(\d)\.\d/; exit ($1>=2)'; then
+ NKF=no
+ fi
+ fi
fi
if test "$NKF" = "no"; then
- AC_MSG_WARN(NKF (network kanji filter) not found)
+ AC_MSG_WARN(Encode and NKF2 (network kanji filter) not found)
fi
AC_MSG_CHECKING(for Text::Kakasi perl module)
Index: namazu/pl/codeconv.pl
===================================================================
RCS file: /storage/cvsroot/namazu/pl/codeconv.pl,v
retrieving revision 1.16
diff -u -r1.16 codeconv.pl
--- namazu/pl/codeconv.pl 1 Aug 2003 01:45:10 -0000 1.16
+++ namazu/pl/codeconv.pl 12 Dec 2003 16:47:09 -0000
@@ -126,27 +126,118 @@
return $str;
}
-# convert to EUC-JP by using NKF
-sub toeuc ($) {
- my ($contref, $opt) = @_;
+my %nkfopt_f = ( '7bit-jis' => 'J', 'euc-jp' => 'E',
+ 'utf-8' => 'W', 'shiftjis' => 'S',
+ 'unknown' => '' );
+my %nkfopt_t = ( 'euc-jp' => 'e', 'utf-8' => 'w',
+ 'shiftjis' => 's');
+
+sub from_to_by_nkf_m($$$){
+ my ($contref, $code_f, $code_t) = @_;
+ my $tmp = $nkfopt_f{$code_f};
+ if (!$tmp){
+ $nkfopt_f{$code_f}='';
+ }
+ my $nkf_opt = "-". $nkfopt_f{$code_f} . $nkfopt_t{$code_t} . "mXZ1";
+ $$contref = NKF::nkf($nkf_opt, $$contref);
+}
+
+sub from_to_by_nkf($$$){
+ my ($contref, $code_f, $code_t) = @_;
+ my $tmp = $nkfopt_f{$code_f};
+ if (!$tmp){
+ $nkfopt_f{$code_f}='';
+ }
+ my $nkf_opt = "-". $nkfopt_f{$code_f} . $nkfopt_t{$code_t} . "mXZ1";
+ my $nkftmp = util::tmpnam("NMZ.nkf");
+ {
+ my $nh = util::efopen("|$conf::NKF $nkf_opt > $nkftmp");
+ print $nh $$contref;
+ }
+ {
+ my $nh = util::efopen("< $nkftmp");
+ $$contref = util::readfile($nh);
+ }
+ unlink($nkftmp);
+}
+
+sub decode_mime_header($$){
+ my ($contref,$code_t) = @_;
+ my $m_head = '=\?(?:ISO-2022-JP|iso-2022-jp|ISO-8859-1|iso-8859-1)\?[BbQq]\?[A-Za-z0-9\+\/]+=*\?=';
+ $$contref =~ s/($m_head)/de_mime_header_by_encode($1,$code_t)/ge;
+}
+
+sub de_mime_header_by_encode($$){
+ my ($str,$code_t) = @_;
+ $str = Encode::decode('MIME-Header', $str);
+ _utf8_off($str);
+ Encode::from_to($str, 'utf8', $code_t);
+ return $str;
+}
+sub encode_from_to($$$){
+ my ($contref, $code_f, $code_t) = @_;
+
+ #dprint("Encode from $code_f to $code_t");
+
+ if ($conf::NKF eq 'module_encode'){
+ if ($code_f eq 'unknown'){
+ #$Encode::Guess::DEBUG=1;
+ my $enc = guess_encoding($$contref);
+ if (ref $enc){
+ $code_f = $enc->name;
+ util::dprint("Encode guessed : $code_f\n");
+ }else {
+ # print "Warning : Encode::Guess couldn't find coding name \n";
+ $$contref = "";
+ util::dprint("Encode::Guess couldn't find coding name");
+ return "Encode::Guess couldn't find encoding";
+ }
+ }
+ Encode::from_to($$contref, $code_f ,$code_t);
+ }elsif ($conf::NKF eq 'module_nkf'){
+ from_to_by_nkf_m($contref, $code_f, $code_t);
+ }elsif ($conf::NKF ne 'no'){
+ from_to_by_nkf($contref, $code_f, $code_t);
+ }
+}
+
+sub to_index_charset($$) {
+ my ($contref, $code_from) = @_;
+ my $err = undef;
+ my $code_to = $var::INDEX_CHARSET;
+ if (!($code_from)) {
+ $code_from = 'unknown';
+ }
if (util::islang("ja")) {
- my $nkf_opt = "-emXZ1";
+ $err = encode_from_to($contref,$code_from,$code_to);
+ if ($conf::NKF eq 'module_encode') {
+ decode_mime_header($contref,$code_to);
+ if ($code_to eq 'euc-jp'){
+ normalize_eucjp($contref);
+ }
+ }
+ }
+ return $err;
+}
+
+# convert to the index charset ($var::INDEX_CHARSET, 'euc-jp' by default) via Encode or NKF
+sub toeuc($) {
+ my ($contref, $tmp) = @_;
+ my $err = undef;
+ $err = to_index_charset($contref, 'unknown');
+ return $err;
+}
- if ($var::USE_NKF_MODULE) {
- $$contref = NKF::nkf($nkf_opt, $$contref); # namazu-devel-ja #3152 -> backed out, #3181
- } else {
- my $nkftmp = util::tmpnam("NMZ.nkf");
- {
- my $nh = util::efopen("|$conf::NKF $nkf_opt > $nkftmp");
- print $nh $$contref;
- }
- {
- my $nh = util::efopen("< $nkftmp");
- $$contref = util::readfile($nh);
- }
- unlink($nkftmp);
- }
+sub load_encode{
+ if ($] >= 5.008){
+ eval 'use Encode qw/ from_to decode _utf8_off /;';
+ if ($@) {return 0};
+ eval 'use Encode::Guess qw/ euc-jp shiftjis 7bit-jis utf-8 /;';
+ if ($@) {return 0};
+ return 1;
+ }else{
+ return 0;
}
}
Index: namazu/pl/var.pl.in
===================================================================
RCS file: /storage/cvsroot/namazu/pl/var.pl.in,v
retrieving revision 1.15
diff -u -r1.15 var.pl.in
--- namazu/pl/var.pl.in 23 Sep 2003 07:34:21 -0000 1.15
+++ namazu/pl/var.pl.in 12 Dec 2003 16:47:09 -0000
@@ -127,7 +127,7 @@
$OUTPUT_DIR = undef;
$NO_TITLE = N_("No title in original");
-$USE_NKF_MODULE = 0;
+$INDEX_CHARSET = 'euc-jp';
%REQUIRE_ACTIONS = ();
%RECURSIVE_ACTIONS = ();
Index: namazu/scripts/mknmz.in
===================================================================
RCS file: /storage/cvsroot/namazu/scripts/mknmz.in,v
retrieving revision 1.126
diff -u -r1.126 mknmz.in
--- namazu/scripts/mknmz.in 21 Oct 2003 05:53:48 -0000 1.126
+++ namazu/scripts/mknmz.in 12 Dec 2003 16:47:10 -0000
@@ -1015,7 +1015,9 @@
if (util::islang("ja") && $conf::NKF =~ /^module_nkf/) {
require NKF || die "unable to require \"NKF\"\n";
util::dprint(_("code conversion: using NKF module\n"));
- $var::USE_NKF_MODULE = 1;
+ }elsif (util::islang("ja") && $conf::NKF =~ /^module_encode/) {
+ codeconv::load_encode();
+ util::dprint(_("code conversion: using Encode module\n"));
}
if (util::islang("ja") && $conf::WAKATI =~ /^module_kakasi/) {
require Text::Kakasi || die "unable to require \"Text::Kakasi\"\n";
@@ -2608,7 +2610,7 @@
$var::Opt{'hiragana'}, $conf::DIRECTORY_INDEX,
$usage::USAGE, $var::Opt{'noheadabst'}, $usage::VERSION_INFO,
$var::Opt{'noencodeurl'}, $conf::HTML_SUFFIX,
- $var::RECURSIVE_ACTIONS, $conf::META_TAGS, $var::USE_NKF_MODULE,
+ $var::RECURSIVE_ACTIONS, $conf::META_TAGS,
$conf::ADDRESS, $var::MAILING_ADDRESS,
$conf::FILE_SIZE_MAX,
$var::SupportedScheme, $var::RECURSIVE_SCHEME,