Namazu-devel-ja(旧)


[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Perl58 Encodeへの対応(Re: 全角半角変換)



臼田です

Encodeの使い方のアドバイスも頂いたので気がかりだったnkf,lv依存
部の修正を試みました。ご意見をください。

UTF-8を含む文字コード変換と全角半角正規化、MIMEヘッダのデコード
の部分を、Encode, NKF.pm, nkf, lvの組み合わせのいずれかで利用できる
ようにしています。

優先順位はいまのところ
(Encode) > (NKF.pm Ver2) > (NKF.pm Ver1 + lv) > (nkf Ver2) > 
(nkf Ver1 + lv) としています。
どのツールまで環境判定の対象にしておくべきなのか、何を優先的に
使うべきなのかを考え出すとむやみに複雑になりそうなので、現行で
利用しているツール(NKF.pm, nkf, lv)とEncodeのみを対象としました。

これによってフィルターモジュールからの使い方で変更しようと
思う点としては
・utf8を扱うフィルター毎で、status()内でlvの有無のチェック
 をするかわりに $var::HAS_UTF8TOOLをチェックして
 フィルターの利用可否を決める
・フィルター内でのlvの呼出しをやめて
 codeconv::to_default_coding($contref, 'utf-8')
 のように、コンテンツへのリファレンス($contref)とコンテンツの
 文字コード名('utf-8','shiftjis'等)をcodeconv.plに渡すように
 する
というところです。

基本的にフィルターモジュール内で文字コード変換を完了させること
とし、フィルターモジュールの引数は当面そのままにしました。

臼田幸生
Index: namazu/filter/excel.pl
===================================================================
RCS file: /storage/cvsroot/namazu/filter/excel.pl,v
retrieving revision 1.16
diff -u -r1.16 excel.pl
--- namazu/filter/excel.pl	23 Sep 2002 08:52:32 -0000	1.16
+++ namazu/filter/excel.pl	15 Jun 2003 10:51:08 -0000
@@ -48,8 +48,7 @@
 	if (!util::islang("ja")) {
 	    return 'yes';
 	} else {
-	    $utfconvpath = util::checkcmd('lv');
-	    if (defined $utfconvpath) {
+	    if ($var::HAS_UTF8TOOL){
 		return 'yes';
 	    } else {
 		return 'no';
@@ -125,28 +124,13 @@
 
     # Code conversion for Japanese document.
     if (util::islang("ja")) {
-	my $encoding = "u8"; # UTF-8
+	my $encoding = "utf-8"; # UTF-8
 	# Pattern for xlHtml version 0.2.6.
 	if ($$cont =~ m!^<FONT SIZE="?-1"?><I>Last Updated(&nbsp;using| with) Excel 5.0 or 95</I></FONT><br>$!m) 
 	{
-	    $encoding = "s"; # Shift_JIS
-	}
-	{
-	    my $fh = util::efopen("> $tmpfile");
-	    print $fh $$cont;
-	}
-	{
-	    my @cmd = ($utfconvpath, "-I$encoding", "-Oej", $tmpfile);
-	    my ($status, $fh_out, $fh_err) = util::systemcmd(@cmd);
-	    my $size = util::filesize($fh_out);
-	    if ($size == 0) {
-		return "Unable to convert file ($xlconvpath error occurred)";
-	    }
-	    if ($size > $conf::TEXT_SIZE_MAX) {
-		return 'Too large excel file';
-	    }
-	    $$cont = util::readfile($fh_out);
+	    $encoding = "shiftjis"; # Shift_JIS
 	}
+	codeconv::to_default_coding($cont, $encoding);
     } 
 
     unlink $tmpfile;
Index: namazu/filter/msword.pl
===================================================================
RCS file: /storage/cvsroot/namazu/filter/msword.pl,v
retrieving revision 1.35
diff -u -r1.35 msword.pl
--- namazu/filter/msword.pl	12 Nov 2002 07:27:38 -0000	1.35
+++ namazu/filter/msword.pl	15 Jun 2003 10:51:08 -0000
@@ -47,8 +47,7 @@
 	    return 'yes';
 	} else {
 	    $wvversionpath = util::checkcmd('wvVersion');
-	    $utfconvpath   = util::checkcmd('lv');
-	    if (defined $wvversionpath && defined $utfconvpath) {
+	    if (defined $wvversionpath && ($var::HAS_UTF8TOOL)) {
 		return 'yes';
 	    } else {
 		return 'no';
@@ -147,16 +146,7 @@
 
     # Code conversion for Japanese document.
     if (util::islang("ja")) {
-	my @cmd = ($utfconvpath, "-Iu8", "-Oej", $tmpfile2);
-	my ($status, $fh_out, $fh_err) = util::systemcmd(@cmd);
-	my $size = util::filesize($fh_out);
-	if ($size == 0) {
-	    return "Unable to convert file ($utfconvpath error occurred).";
-	}
-	if ($size > $conf::TEXT_SIZE_MAX) {
-	    return 'Too large word file';
-	}
-	$$cont = util::readfile($fh_out);
+	codeconv::to_default_coding($cont, 'utf-8');
     }
 
     unlink $tmpfile;
Index: namazu/filter/ooo.pl
===================================================================
RCS file: /storage/cvsroot/namazu/filter/ooo.pl,v
retrieving revision 1.5
diff -u -r1.5 ooo.pl
--- namazu/filter/ooo.pl	13 May 2003 14:46:01 -0000	1.5
+++ namazu/filter/ooo.pl	15 Jun 2003 10:51:09 -0000
@@ -24,13 +24,7 @@
 package ooo;
 use strict;
 require 'util.pl';
-
-my $perlver =$];
-$perlver =~ s/\.//;
-$perlver =~ m/^(\d\d\d\d)\d*$/;
-$perlver = 0; 
-#$perlver = $1;
-my $utfconvpath = undef;
+require 'codeconv.pl';
 
 sub mediatype() {
     # http://framework.openoffice.org/documentation/mimetypes/mimetypes.html
@@ -41,16 +35,7 @@
     my $unzippath = util::checkcmd('unzip');
     if (defined $unzippath){
         if (util::islang("ja")) {
-           return 'yes' if ($perlver >= 5008);
-           $utfconvpath = util::checkcmd('lv');
-           if ($utfconvpath){ 
-               return 'yes';
-           }else{
-               $utfconvpath = util::checklib('unicode.pl');
-               if ($utfconvpath){ 
-                   return 'yes';
-               }
-           }
+           return 'yes' if ($var::HAS_UTF8TOOL);
            return 'no'; 
         } else {
            return 'yes'; 
@@ -108,9 +93,9 @@
 
     # Code conversion for Japanese document.
     if (util::islang("ja")) {
-        ooo::utoe(\$authorname);
-        ooo::utoe(\$title);
-        ooo::utoe(\$keywords);
+	codeconv::to_default_coding(\$authorname, 'utf-8');
+	codeconv::to_default_coding(\$title, 'utf-8');
+	codeconv::to_default_coding(\$keywords, 'utf-8');
     }
     if (!($authorname eq "")){
         $fields->{'author'} = $authorname;
@@ -121,7 +106,7 @@
         $$weighted_str .= "\x7f$weight\x7f$title\x7f/$weight\x7f\n";
     }else{
         $fields->{'title'} 
-           = gfilter::filename_to_title($cfile, $weighted_str)
+           = gfilter::filename_to_title($cfile, $weighted_str);
     }
     my @weight_str = split(' ',$keywords);
     for my $tmp (@weight_str) {
@@ -147,7 +132,7 @@
 
     # Code conversion for Japanese document.
     if (util::islang("ja")) {
-         ooo::utoe(\$xml);
+	codeconv::to_default_coding(\$xml, 'utf-8');
     }
     $$contref = $xml;
     gfilter::line_adjust_filter($contref);
@@ -187,34 +172,6 @@
       $$contref =~ s/<[^>]*>/\n/gs;
       $$contref =~ s/\n+/\n/gs;
       $$contref =~ s/^\n+//;
-}
-
-# convert utf-8 to euc
-# require Perl5.8 or unicode.pl
-sub utoe ($) {
-    my ($tmp) = @_;
-    if ($utfconvpath =~ /lv/){
-        my $tmpfile  = util::tmpnam('NMZ.ooo');
-	{
-	    my $fh = util::efopen("> $tmpfile");
-	    print $fh $$tmp;
-	}
-        my $cmd = ($utfconvpath . " -Iu8 " . "-Oej " . $tmpfile . " |");
-        $$tmp = "";
-        my $fh = util::efopen($cmd);
-        while (defined(my $line = <$fh>)){
-            $$tmp .= $line;
-        }
-        unlink $tmpfile;
-    }elsif ($perlver >= 5008){
-        eval 'use Encode;';
-        Encode::from_to($$tmp, "utf-8" ,"euc-jp");
-    }else{
-        eval require 'unicode.pl';
-        my @unicodeList = unicode::UTF8toUTF16($$tmp);
-        $$tmp = unicode::u2e(@unicodeList);
-        $$tmp =~ s/\00//g;
-    }
 }
 
 # Decode a numberd entity. Exclude an invalid number.
Index: namazu/pl/codeconv.pl
===================================================================
RCS file: /storage/cvsroot/namazu/pl/codeconv.pl,v
retrieving revision 1.13
diff -u -r1.13 codeconv.pl
--- namazu/pl/codeconv.pl	8 Aug 2001 09:05:48 -0000	1.13
+++ namazu/pl/codeconv.pl	15 Jun 2003 10:51:09 -0000
@@ -30,6 +30,15 @@
 package codeconv;
 use strict;
 
+my $lvpath = undef;
+my $nkfpath = undef;
+my $nkfopts = undef;
+my $HAS_LV = chk_lv();
+my $HAS_NKF = chk_nkf();
+my $HAS_NKF_MODULE = chk_nkf_m();
+my $HAS_ENCODE = load_encode();
+
+
 my @ktoe = (0xA3, 0xD6, 0xD7, 0xA2, 0xA6, 0xF2, 0xA1, 0xA3,
 	     0xA5, 0xA7, 0xA9, 0xE3, 0xE5, 0xE7, 0xC3, 0xBC,
 	     0xA2, 0xA4, 0xA6, 0xA8, 0xAA, 0xAB, 0xAD, 0xAF,
@@ -126,27 +135,225 @@
     return $str;
 }
 
-# convert to EUC-JP by using NKF
-sub toeuc ($) {
-    my ($contref, $opt) = @_;
+sub chk_nkf_m{
+    if (util::checklib('NKF.pm')){
+        eval {require NKF};
+	my $nkf_opt = "-eW";
+        my $result = NKF::nkf($nkf_opt,"\xef\xbb\xbf\xE3\x81\xaa\xe3\x81\xbe\xe3\x81\x9a");
+	if ($result =~ /\xa4\xca\xa4\xde\xa4\xba/) {
+	    $var::HAS_UTF8TOOL = 'nkfpm';
+            return 2;
+        }
+        return 1;
+    }else {
+        return 0;
+    }
+}
+
+sub chk_lv{
+    $lvpath = util::checkcmd('lv');
+    if (defined $lvpath) {
+	$var::HAS_UTF8TOOL = 'lv';
+        return 1;
+    }else {
+        return 0;
+    }
+}
+sub chk_nkf{
+    $nkfpath = util::checkcmd('nkf');
+    if (defined $nkfpath) {
+        $nkfopts = "--version";
+        my @cmd = ($nkfpath, $nkfopts);
+	my ($status, $fh_out, $fh_err) = util::systemcmd(@cmd);
+	my $result = util::readfile($fh_err);
+	if ($result =~ /Version\s+(\d)\.\d/) {
+	    my $nkfversion = $1;
+	    if ($nkfversion >= 2) {
+		$var::HAS_UTF8TOOL = 'nkf';
+                return 2;
+            }else{
+                return 1;
+            }
+	}
+    }else{
+        return 0;
+    }
+}
+
+sub load_encode{
+    if ($] >= 5.008){
+        eval 'use Encode qw/ from_to decode _utf8_off /;';
+        if ($@) {return 0};
+        eval 'use Encode::Guess qw/ euc-jp shiftjis 7bit-jis utf-8 /;';
+        if ($@) {return 0};
+	$var::HAS_UTF8TOOL = 'encode';
+        return 1;
+    }else{
+        return 0;
+    }
+}
+
+sub toeuc($) {
+    my ($contref, $tmp) = @_;
+    my $err = undef;
+    $err = to_default_coding($contref, 'unknown');
+    return $err; 
+}
 
+sub to_default_coding ($$) {
+    my ($contref, $code_from) = @_;
+    my $err = undef;
+    my $code_to = $var::Default_index_coding;
+    if (!($code_from)) {
+        $code_from = 'unknown';
+    }
     if (util::islang("ja")) {
-	my $nkf_opt = "-emXZ1";
-
-	if ($var::USE_NKF_MODULE) {
-	    $$contref = NKF::nkf($nkf_opt, $$contref);
-	} else {
-	    my $nkftmp = util::tmpnam("NMZ.nkf");
-	    {
-		my $nh = util::efopen("|$conf::NKF $nkf_opt > $nkftmp");
-		print $nh $$contref;
+        $err = encode_from_to($contref,$code_from,$code_to);
+        if ($HAS_ENCODE && !(($var::USE_NKF_MODULE)||($var::USE_NKF))){
+	    decode_mime_header($contref,$code_to);
+            if ($code_to eq 'euc-jp'){
+		normalize_euc_jp($contref);
 	    }
-	    {
-		my $nh = util::efopen("< $nkftmp");
-		$$contref = util::readfile($nh);
+	}
+    }
+    return $err;
+}
+
+sub encode_from_to($$$){
+    my ($contref, $code_f, $code_t) = @_;
+    if ((($code_f eq 'utf-8') || ($code_t eq 'utf-8')) && !($var::HAS_UTF8TOOL)){
+	print "Warning : utf-8 convert tool doesn't exist\n";
+        $$contref = "";
+	return;
+    }
+    if ($HAS_ENCODE && !(($var::USE_NKF_MODULE)||($var::USE_NKF))){
+	if ($code_f eq 'unknown'){
+	    #$Encode::Guess::DEBUG=1;
+	    my $enc = guess_encoding($$contref);
+	    if (ref $enc){
+		$code_f = $enc->name;
+	    }else {
+		# print "Warning : Encode::Guess couldn't find coding name \n";
+		$$contref = "";
+		return "Encode::Guess couldn't find encoding";
 	    }
-	    unlink($nkftmp);
 	}
+        Encode::from_to($$contref, $code_f ,$code_t);
+    }elsif ($HAS_NKF_MODULE && !($var::USE_NKF)){
+        #if NKF.pm ver1 then use lv
+	if (($HAS_NKF_MODULE == 1) && ($code_f eq 'utf-8')){
+	    from_to_by_lv($contref, $code_f, $code_t);
+	}else {
+	    from_to_by_nkf_m($contref, $code_f, $code_t);
+        }
+    }elsif ($HAS_NKF){
+        #if nkf ver1 then use lv
+	if (($HAS_NKF == 1) && ($code_f eq 'utf-8')){
+	    from_to_by_lv($contref, $code_f, $code_t);
+	}else {
+	    from_to_by_nkf($contref, $code_f, $code_t);
+        }
+    }elsif ($HAS_LV){
+	from_to_by_lv($contref, $code_f, $code_t);
     }
 }
+
+my %nkfopt_f  = ( '7bit-jis' => 'J',  'euc-jp'   => 'E',
+                  'utf-8'    => 'W',  'shiftjis' => 'S',
+                  'unknown'  => '' );
+my %nkfopt_t  = ( 'euc-jp'   => 'e',  'utf-8'    => 'w',
+                  'shiftjis' => 's');
+
+sub from_to_by_nkf_m($$$){
+    my ($contref, $code_f, $code_t) = @_;
+    my $tmp = $nkfopt_f{$code_f};
+    if (!$tmp){
+        $nkfopt_f{$code_f}='';
+    }
+    my $nkf_opt = "-". $nkfopt_f{$code_f} . $nkfopt_t{$code_t} . "mXZ1";
+    $$contref = NKF::nkf($nkf_opt, $$contref);
+}
+
+sub from_to_by_nkf($$$){
+    my ($contref, $code_f, $code_t) = @_;
+    my $tmp = $nkfopt_f{$code_f};
+    if (!$tmp){
+        $nkfopt_f{$code_f}='';
+    }
+
+    my $nkf_opt = "-". $nkfopt_f{$code_f} . $nkfopt_t{$code_t} . "mXZ1";
+    my $nkftmp = util::tmpnam("NMZ.nkf");
+    {
+        #$conf::NKF will be obsolute
+	#my $nh = util::efopen("|$conf::NKF $nkf_opt > $nkftmp");
+	my $nh = util::efopen("|$nkfpath $nkf_opt > $nkftmp");
+	print $nh $$contref;
+    }
+    {
+	my $nh = util::efopen("< $nkftmp");
+	$$contref = util::readfile($nh);
+    }
+    unlink($nkftmp);
+}
+
+my %lvopts = ( '7bit-jis' => 'j',  'euc-jp'   => 'ej',
+                  'utf-8'    => 'u8', 'shiftjis' => 's',
+                  'unknown'  => 'a' );
+
+sub from_to_by_lv($$$){
+    my ($contref, $code_f, $code_t) = @_;
+    my $tmp = $lvopts{$code_f};
+    if (!$tmp){
+        $lvopts{$code_f}='a';
+    }
+    my $lv_opt = "-I". $lvopts{$code_f} ." -O" . $lvopts{$code_t};
+    my $lvtmpfile  = util::tmpnam('NMZ.lv');
+    {
+	my $fh = util::efopen("> $lvtmpfile");
+	print $fh $$contref;
+    }
+    {
+	my @cmd = ($lvpath, $lv_opt, $lvtmpfile);
+	my ($status, $fh_out, $fh_err) = util::systemcmd(@cmd);
+	my $size = util::filesize($fh_out);
+	if ($size == 0) {
+	    return "Unable to convert file ($lvpath error occurred)";
+	}
+	$$contref = util::readfile($fh_out);
+    }
+    unlink($lvtmpfile);
+}
+
+sub decode_mime_header($$){
+    my ($contref,$code_t) = @_;
+    my $m_head = '=\?(?:ISO-2022-JP|iso-2022-jp|ISO-8859-1|iso-8859-1)\?[BbQq]\?[A-Za-z0-9\+\/]+=*\?=';
+    $$contref =~ s/($m_head)/de_mime_header_by_encode($1,$code_t)/ge;
+}
+
+sub de_mime_header_by_encode($$){
+    # this subroutine require Encode::MIME::Header
+    my ($str,$code_t) = @_;
+    $str = Encode::decode('MIME-Header', $str);
+    _utf8_off($str);
+    Encode::from_to($str, 'utf8', $code_t);
+    return $str;
+}
+
+sub eucjp_zen2han_ascii ($) {
+    my ($str) = @_;
+    if (util::islang("ja")) {
+        $str =~ s/\xa3([\xb0-\xb9\xc1-\xda\xe1-\xfa])/pack("C",unpack("C",$1)-0x89)/ge;
+    }
+    $str;
+}
+
+sub normalize_euc_jp ($) {
+    my ($contref) = @_;
+    if (util::islang("ja")) {
+       $$contref = codeconv::eucjp_han2zen_kana($$contref);
+       $$contref = codeconv::eucjp_zen2han_ascii($$contref);
+    }
+    $contref;
+}
+
 1;
Index: namazu/pl/var.pl.in
===================================================================
RCS file: /storage/cvsroot/namazu/pl/var.pl.in,v
retrieving revision 1.14
diff -u -r1.14 var.pl.in
--- namazu/pl/var.pl.in	27 Feb 2003 10:04:49 -0000	1.14
+++ namazu/pl/var.pl.in	15 Jun 2003 10:51:09 -0000
@@ -127,6 +127,11 @@
 $NO_TITLE   = N_("No title in original");
 $USE_NKF_MODULE = 0;
 
+$Default_index_coding = 'euc-jp';
+$USE_NKF_MODULE = 0;
+$USE_NKF = 0;
+$HAS_UTF8TOOL = undef;
+
 %REQUIRE_ACTIONS = ();
 %RECURSIVE_ACTIONS = ();
 %REQUIRE_PRE_CODECONV =