Namazu-devel-ja(旧)
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Perl58 Encodeへの対応(Re: 全角半角変換)
- From: Yukio USUDA <usuda@xxxxxxxxxx>
- Date: Mon, 16 Jun 2003 09:15:42 +0900
- X-ml-name: namazu-devel-ja
- X-mail-count: 02981
- References: <3EE23FCD.7607F6FD@asahi-net.or.jp>
臼田です
Encodeの使い方のアドバイスも頂いたので気がかりだったnkf,lv依存
部の修正を試みました。ご意見をください。
utf8を含む文字コード変換と全角半角正規化、mimeヘッダのデコード
の部分をEncode,NKF.pm,nkf,lvの組み合わせのいずれかで利用できる
ようにしています。
優先順位はいまのところ
(Encode) > (NKF.pm Ver2) > (NKF.pm Ver1 + lv) > (nkf Ver2) >
(nkf Ver1 + lv) としています。
どのツールまで環境判定の対象にしておくべきなのか、何を優先的に
つかうべきなのかでむやみに複雑になりそうなので現行並みとEncode
のみを対象としました。
これによってフィルターモジュールからの使い方で変更しようと
思う点としては
・utf8を扱うフィルター毎で、status()内でlvの有無のチェック
をするかわりに $var::HAS_UTF8TOOLをチェックして
フィルターの利用可否を決める
・フィルター内でのlvの呼出しをやめて
codeconv::to_default_coding($contref, 'utf-8')
としてコンテンツへのレファレンス($contref)、コンテンツの
文字コード名('utf-8','shiftjis'等)をcodeconv.plに渡すように
しました。
というところです。
基本的にフィルターモジュール内で文字コード変換を完了させること
とし、フィルターモジュールの引数は当面そのままにしました。
臼田幸生
Index: namazu/filter/excel.pl
===================================================================
RCS file: /storage/cvsroot/namazu/filter/excel.pl,v
retrieving revision 1.16
diff -u -r1.16 excel.pl
--- namazu/filter/excel.pl 23 Sep 2002 08:52:32 -0000 1.16
+++ namazu/filter/excel.pl 15 Jun 2003 10:51:08 -0000
@@ -48,8 +48,7 @@
if (!util::islang("ja")) {
return 'yes';
} else {
- $utfconvpath = util::checkcmd('lv');
- if (defined $utfconvpath) {
+ if ($var::HAS_UTF8TOOL){
return 'yes';
} else {
return 'no';
@@ -125,28 +124,13 @@
# Code conversion for Japanese document.
if (util::islang("ja")) {
- my $encoding = "u8"; # UTF-8
+ my $encoding = "utf-8"; # UTF-8
# Pattern for xlHtml version 0.2.6.
if ($$cont =~ m!^<FONT SIZE="?-1"?><I>Last Updated( using| with) Excel 5.0 or 95</I></FONT><br>$!m)
{
- $encoding = "s"; # Shift_JIS
- }
- {
- my $fh = util::efopen("> $tmpfile");
- print $fh $$cont;
- }
- {
- my @cmd = ($utfconvpath, "-I$encoding", "-Oej", $tmpfile);
- my ($status, $fh_out, $fh_err) = util::systemcmd(@cmd);
- my $size = util::filesize($fh_out);
- if ($size == 0) {
- return "Unable to convert file ($xlconvpath error occurred)";
- }
- if ($size > $conf::TEXT_SIZE_MAX) {
- return 'Too large excel file';
- }
- $$cont = util::readfile($fh_out);
+ $encoding = "shiftjis"; # Shift_JIS
}
+ codeconv::to_default_coding($cont, $encoding);
}
unlink $tmpfile;
Index: namazu/filter/msword.pl
===================================================================
RCS file: /storage/cvsroot/namazu/filter/msword.pl,v
retrieving revision 1.35
diff -u -r1.35 msword.pl
--- namazu/filter/msword.pl 12 Nov 2002 07:27:38 -0000 1.35
+++ namazu/filter/msword.pl 15 Jun 2003 10:51:08 -0000
@@ -47,8 +47,7 @@
return 'yes';
} else {
$wvversionpath = util::checkcmd('wvVersion');
- $utfconvpath = util::checkcmd('lv');
- if (defined $wvversionpath && defined $utfconvpath) {
+ if (defined $wvversionpath && ($var::HAS_UTF8TOOL)) {
return 'yes';
} else {
return 'no';
@@ -147,16 +146,7 @@
# Code conversion for Japanese document.
if (util::islang("ja")) {
- my @cmd = ($utfconvpath, "-Iu8", "-Oej", $tmpfile2);
- my ($status, $fh_out, $fh_err) = util::systemcmd(@cmd);
- my $size = util::filesize($fh_out);
- if ($size == 0) {
- return "Unable to convert file ($utfconvpath error occurred).";
- }
- if ($size > $conf::TEXT_SIZE_MAX) {
- return 'Too large word file';
- }
- $$cont = util::readfile($fh_out);
+ codeconv::to_default_coding($cont, 'utf-8');
}
unlink $tmpfile;
Index: namazu/filter/ooo.pl
===================================================================
RCS file: /storage/cvsroot/namazu/filter/ooo.pl,v
retrieving revision 1.5
diff -u -r1.5 ooo.pl
--- namazu/filter/ooo.pl 13 May 2003 14:46:01 -0000 1.5
+++ namazu/filter/ooo.pl 15 Jun 2003 10:51:09 -0000
@@ -24,13 +24,7 @@
package ooo;
use strict;
require 'util.pl';
-
-my $perlver =$];
-$perlver =~ s/\.//;
-$perlver =~ m/^(\d\d\d\d)\d*$/;
-$perlver = 0;
-#$perlver = $1;
-my $utfconvpath = undef;
+require 'codeconv.pl';
sub mediatype() {
# http://framework.openoffice.org/documentation/mimetypes/mimetypes.html
@@ -41,16 +35,7 @@
my $unzippath = util::checkcmd('unzip');
if (defined $unzippath){
if (util::islang("ja")) {
- return 'yes' if ($perlver >= 5008);
- $utfconvpath = util::checkcmd('lv');
- if ($utfconvpath){
- return 'yes';
- }else{
- $utfconvpath = util::checklib('unicode.pl');
- if ($utfconvpath){
- return 'yes';
- }
- }
+ return 'yes' if ($var::HAS_UTF8TOOL);
return 'no';
} else {
return 'yes';
@@ -108,9 +93,9 @@
# Code conversion for Japanese document.
if (util::islang("ja")) {
- ooo::utoe(\$authorname);
- ooo::utoe(\$title);
- ooo::utoe(\$keywords);
+ codeconv::to_default_coding(\$authorname, 'utf-8');
+ codeconv::to_default_coding(\$title, 'utf-8');
+ codeconv::to_default_coding(\$keywords, 'utf-8');
}
if (!($authorname eq "")){
$fields->{'author'} = $authorname;
@@ -121,7 +106,7 @@
$$weighted_str .= "\x7f$weight\x7f$title\x7f/$weight\x7f\n";
}else{
$fields->{'title'}
- = gfilter::filename_to_title($cfile, $weighted_str)
+ = gfilter::filename_to_title($cfile, $weighted_str);
}
my @weight_str = split(' ',$keywords);
for my $tmp (@weight_str) {
@@ -147,7 +132,7 @@
# Code conversion for Japanese document.
if (util::islang("ja")) {
- ooo::utoe(\$xml);
+ codeconv::to_default_coding(\$xml, 'utf-8');
}
$$contref = $xml;
gfilter::line_adjust_filter($contref);
@@ -187,34 +172,6 @@
$$contref =~ s/<[^>]*>/\n/gs;
$$contref =~ s/\n+/\n/gs;
$$contref =~ s/^\n+//;
-}
-
-# convert utf-8 to euc
-# require Perl5.8 or unicode.pl
-sub utoe ($) {
- my ($tmp) = @_;
- if ($utfconvpath =~ /lv/){
- my $tmpfile = util::tmpnam('NMZ.ooo');
- {
- my $fh = util::efopen("> $tmpfile");
- print $fh $$tmp;
- }
- my $cmd = ($utfconvpath . " -Iu8 " . "-Oej " . $tmpfile . " |");
- $$tmp = "";
- my $fh = util::efopen($cmd);
- while (defined(my $line = <$fh>)){
- $$tmp .= $line;
- }
- unlink $tmpfile;
- }elsif ($perlver >= 5008){
- eval 'use Encode;';
- Encode::from_to($$tmp, "utf-8" ,"euc-jp");
- }else{
- eval require 'unicode.pl';
- my @unicodeList = unicode::UTF8toUTF16($$tmp);
- $$tmp = unicode::u2e(@unicodeList);
- $$tmp =~ s/\00//g;
- }
}
# Decode a numberd entity. Exclude an invalid number.
Index: namazu/pl/codeconv.pl
===================================================================
RCS file: /storage/cvsroot/namazu/pl/codeconv.pl,v
retrieving revision 1.13
diff -u -r1.13 codeconv.pl
--- namazu/pl/codeconv.pl 8 Aug 2001 09:05:48 -0000 1.13
+++ namazu/pl/codeconv.pl 15 Jun 2003 10:51:09 -0000
@@ -30,6 +30,15 @@
package codeconv;
use strict;
+my $lvpath = undef;
+my $nkfpath = undef;
+my $nkfopts = undef;
+my $HAS_LV = chk_lv();
+my $HAS_NKF = chk_nkf();
+my $HAS_NKF_MODULE = chk_nkf_m();
+my $HAS_ENCODE = load_encode();
+
+
my @ktoe = (0xA3, 0xD6, 0xD7, 0xA2, 0xA6, 0xF2, 0xA1, 0xA3,
0xA5, 0xA7, 0xA9, 0xE3, 0xE5, 0xE7, 0xC3, 0xBC,
0xA2, 0xA4, 0xA6, 0xA8, 0xAA, 0xAB, 0xAD, 0xAF,
@@ -126,27 +135,225 @@
return $str;
}
-# convert to EUC-JP by using NKF
-sub toeuc ($) {
- my ($contref, $opt) = @_;
+sub chk_nkf_m{
+ if (util::checklib('NKF.pm')){
+ eval {require NKF};
+ my $nkf_opt = "-eW";
+ my $result = NKF::nkf($nkf_opt,"\xef\xbb\xbf\xE3\x81\xaa\xe3\x81\xbe\xe3\x81\x9a");
+ if ($result =~ /\xa4\xca\xa4\xde\xa4\xba/) {
+ $var::HAS_UTF8TOOL = 'nkfpm';
+ return 2;
+ }
+ return 1;
+ }else {
+ return 0;
+ }
+}
+
+sub chk_lv{
+ $lvpath = util::checkcmd('lv');
+ if (defined $lvpath) {
+ $var::HAS_UTF8TOOL = 'lv';
+ return 1;
+ }else {
+ return 0;
+ }
+}
+sub chk_nkf{
+ $nkfpath = util::checkcmd('nkf');
+ if (defined $nkfpath) {
+ $nkfopts = "--version";
+ my @cmd = ($nkfpath, $nkfopts);
+ my ($status, $fh_out, $fh_err) = util::systemcmd(@cmd);
+ my $result = util::readfile($fh_err);
+ if ($result =~ /Version\s+(\d)\.\d/) {
+ my $nkfversion = $1;
+ if ($nkfversion >= 2) {
+ $var::HAS_UTF8TOOL = 'nkf';
+ return 2;
+ }else{
+ return 1;
+ }
+ }
+ }else{
+ return 0;
+ }
+}
+
+sub load_encode{
+ if ($] >= 5.008){
+ eval 'use Encode qw/ from_to decode _utf8_off /;';
+ if ($@) {return 0};
+ eval 'use Encode::Guess qw/ euc-jp shiftjis 7bit-jis utf-8 /;';
+ if ($@) {return 0};
+ $var::HAS_UTF8TOOL = 'encode';
+ return 1;
+ }else{
+ return 0;
+ }
+}
+
+sub toeuc($) {
+ my ($contref, $tmp) = @_;
+ my $err = undef;
+ $err = to_default_coding($contref, 'unknown');
+ return $err;
+}
+sub to_default_coding ($$) {
+ my ($contref, $code_from) = @_;
+ my $err = undef;
+ my $code_to = $var::Default_index_coding;
+ if (!($code_from)) {
+ $code_from = 'unknown';
+ }
if (util::islang("ja")) {
- my $nkf_opt = "-emXZ1";
-
- if ($var::USE_NKF_MODULE) {
- $$contref = NKF::nkf($nkf_opt, $$contref);
- } else {
- my $nkftmp = util::tmpnam("NMZ.nkf");
- {
- my $nh = util::efopen("|$conf::NKF $nkf_opt > $nkftmp");
- print $nh $$contref;
+ $err = encode_from_to($contref,$code_from,$code_to);
+ if ($HAS_ENCODE && !(($var::USE_NKF_MODULE)||($var::USE_NKF))){
+ decode_mime_header($contref,$code_to);
+ if ($code_to eq 'euc-jp'){
+ normalize_euc_jp($contref);
}
- {
- my $nh = util::efopen("< $nkftmp");
- $$contref = util::readfile($nh);
+ }
+ }
+ return $err;
+}
+
+sub encode_from_to($$$){
+ my ($contref, $code_f, $code_t) = @_;
+ if ((($code_f eq 'utf-8') || ($code_t eq 'utf-8')) && !($var::HAS_UTF8TOOL)){
+ print "Warning : utf-8 convert tool doesn't exist\n";
+ $$contref = "";
+ return;
+ }
+ if ($HAS_ENCODE && !(($var::USE_NKF_MODULE)||($var::USE_NKF))){
+ if ($code_f eq 'unknown'){
+ #$Encode::Guess::DEBUG=1;
+ my $enc = guess_encoding($$contref);
+ if (ref $enc){
+ $code_f = $enc->name;
+ }else {
+ # print "Warning : Encode::Guess couldn't find coding name \n";
+ $$contref = "";
+ return "Encode::Guess couldn't find encoding";
}
- unlink($nkftmp);
}
+ Encode::from_to($$contref, $code_f ,$code_t);
+ }elsif ($HAS_NKF_MODULE && !($var::USE_NKF)){
+ #if NKF.pm ver1 then use lv
+ if (($HAS_NKF_MODULE == 1) && ($code_f eq 'utf-8')){
+ from_to_by_lv($contref, $code_f, $code_t);
+ }else {
+ from_to_by_nkf_m($contref, $code_f, $code_t);
+ }
+ }elsif ($HAS_NKF){
+ #if nkf ver1 then use lv
+ if (($HAS_NKF == 1) && ($code_f eq 'utf-8')){
+ from_to_by_lv($contref, $code_f, $code_t);
+ }else {
+ from_to_by_nkf($contref, $code_f, $code_t);
+ }
+ }elsif ($HAS_LV){
+ from_to_by_lv($contref, $code_f, $code_t);
}
}
+
+my %nkfopt_f = ( '7bit-jis' => 'J', 'euc-jp' => 'E',
+ 'utf-8' => 'W', 'shiftjis' => 'S',
+ 'unknown' => '' );
+my %nkfopt_t = ( 'euc-jp' => 'e', 'utf-8' => 'w',
+ 'shiftjis' => 's');
+
+sub from_to_by_nkf_m($$$){
+ my ($contref, $code_f, $code_t) = @_;
+ my $tmp = $nkfopt_f{$code_f};
+ if (!$tmp){
+ $nkfopt_f{$code_f}='';
+ }
+ my $nkf_opt = "-". $nkfopt_f{$code_f} . $nkfopt_t{$code_t} . "mXZ1";
+ $$contref = NKF::nkf($nkf_opt, $$contref);
+}
+
+sub from_to_by_nkf($$$){
+ my ($contref, $code_f, $code_t) = @_;
+ my $tmp = $nkfopt_f{$code_f};
+ if (!$tmp){
+ $nkfopt_f{$code_f}='';
+ }
+
+ my $nkf_opt = "-". $nkfopt_f{$code_f} . $nkfopt_t{$code_t} . "mXZ1";
+ my $nkftmp = util::tmpnam("NMZ.nkf");
+ {
+ #$conf::NKF will be obsolete
+ #my $nh = util::efopen("|$conf::NKF $nkf_opt > $nkftmp");
+ my $nh = util::efopen("|$nkfpath $nkf_opt > $nkftmp");
+ print $nh $$contref;
+ }
+ {
+ my $nh = util::efopen("< $nkftmp");
+ $$contref = util::readfile($nh);
+ }
+ unlink($nkftmp);
+}
+
+my %lvopts = ( '7bit-jis' => 'j', 'euc-jp' => 'ej',
+ 'utf-8' => 'u8', 'shiftjis' => 's',
+ 'unknown' => 'a' );
+
+sub from_to_by_lv($$$){
+ my ($contref, $code_f, $code_t) = @_;
+ my $tmp = $lvopts{$code_f};
+ if (!$tmp){
+ $lvopts{$code_f}='a';
+ }
+ my $lv_opt = "-I". $lvopts{$code_f} ." -O" . $lvopts{$code_t};
+ my $lvtmpfile = util::tmpnam('NMZ.lv');
+ {
+ my $fh = util::efopen("> $lvtmpfile");
+ print $fh $$contref;
+ }
+ {
+ my @cmd = ($lvpath, $lv_opt, $lvtmpfile);
+ my ($status, $fh_out, $fh_err) = util::systemcmd(@cmd);
+ my $size = util::filesize($fh_out);
+ if ($size == 0) {
+ return "Unable to convert file ($lvpath error occurred)";
+ }
+ $$contref = util::readfile($fh_out);
+ }
+ unlink($lvtmpfile);
+}
+
+sub decode_mime_header($$){
+ my ($contref,$code_t) = @_;
+ my $m_head = '=\?(?:ISO-2022-JP|iso-2022-jp|ISO-8859-1|iso-8859-1)\?[BbQq]\?[A-Za-z0-9\+\/]+=*\?=';
+ $$contref =~ s/($m_head)/de_mime_header_by_encode($1,$code_t)/ge;
+}
+
+sub de_mime_header_by_encode($$){
+ # this subroutine requires Encode::MIME::Header
+ my ($str,$code_t) = @_;
+ $str = Encode::decode('MIME-Header', $str);
+ _utf8_off($str);
+ Encode::from_to($str, 'utf8', $code_t);
+ return $str;
+}
+
+sub eucjp_zen2han_ascii ($) {
+ my ($str) = @_;
+ if (util::islang("ja")) {
+ $str =~ s/\xa3([\xb0-\xb9\xc1-\xda\xe1-\xfa])/pack("C",unpack("C",$1)-0x89)/ge;
+ }
+ $str;
+}
+
+sub normalize_euc_jp ($) {
+ my ($contref) = @_;
+ if (util::islang("ja")) {
+ $$contref = codeconv::eucjp_han2zen_kana($$contref);
+ $$contref = codeconv::eucjp_zen2han_ascii($$contref);
+ }
+ $contref;
+}
+
1;
Index: namazu/pl/var.pl.in
===================================================================
RCS file: /storage/cvsroot/namazu/pl/var.pl.in,v
retrieving revision 1.14
diff -u -r1.14 var.pl.in
--- namazu/pl/var.pl.in 27 Feb 2003 10:04:49 -0000 1.14
+++ namazu/pl/var.pl.in 15 Jun 2003 10:51:09 -0000
@@ -127,6 +127,11 @@
$NO_TITLE = N_("No title in original");
$USE_NKF_MODULE = 0;
+$Default_index_coding = 'euc-jp';
+$USE_NKF_MODULE = 0;
+$USE_NKF = 0;
+$HAS_UTF8TOOL = undef;
+
%REQUIRE_ACTIONS = ();
%RECURSIVE_ACTIONS = ();
%REQUIRE_PRE_CODECONV =