namazu-ml(ring)
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Word97の検索
- From: Jun Kurabe <jun-krb@xxxxxxxxxxxxxx>
- Date: Mon, 01 Nov 1999 06:04:27 +0900
はじめまして、倉部といいます。
Namazuを使って、Word97の検索をしようと、簡単なヘルパープログラムを作成
しました。
Win32, Active Perl, Word97が入っていないと動きません。
他に、どんな方法を取られているのでしょうか?
過去の履歴を見ると、 MSWordView, MSWord 8 converter for unix がある
ようですが、まだ試していません。
P.S. ReadMSWord.plの中で、Word文書が持っているProperty(Title, Subject,
Author, etc)の値に、
Subject:, From:, Date: 等のフィールド名を付けて、インデックスの作成をさ
せています。
------------- 以下 私のサンプル --------------
mknmzを変更したところ
my $TARGET_FILE = '.*\.html?|.*\.txt|.*_default|.*\.doc|.*\.rtf'; # docとrtfを追加
my %HELPER_PROGRAMS = (
'gz' => 'zcat',
'Z' => 'zcat',
'man' => 'groff -man -Tnippon',
'doc' => 'perl.exe d:/usr/local/namazu/bin/ReadMSWord.pl', # docを追加
'rtf' => 'perl.exe d:/usr/local/namazu/bin/ReadMSWord.pl', # rtfを追加
);
ReadMSWord.pl の内容
# Created by Jun Kurabe
# 1999/10/30
use Win32::OLE;
use Win32::OLE::Enum;
package ReadMSWord;
# ReadMSWord($fileName)
#
# Opens $fileName in Microsoft Word through OLE automation and returns, as a
# single string, the output of getProperties, getParagraphs, getShapes and
# getHeadersFooters (concatenated in that order).
#
# A running Word instance is reused when one exists; otherwise a new instance
# is started with a destructor that calls Quit on it.
#
# Dies when no file name is given, when Word cannot be reached or started,
# when the document cannot be opened, or when one of the extraction helpers
# dies (the document is closed before the error is re-thrown).
sub ReadMSWord {
    my $fileName = shift;
    die "No file name given" unless defined $fileName && length $fileName;

    my $word;
    # use existing instance if Word is already running
    eval { $word = Win32::OLE->GetActiveObject('Word.Application') };
    die "MSWord not installed" if $@;
    unless (defined $word) {
        $word = Win32::OLE->new('Word.Application', sub { $_[0]->Quit; })
            or die "Oops, cannot start Word";
    }
    # for debug
    # $word->{Visible} = 1;

    # Fail here with a readable message instead of letting the first helper
    # trip over an undefined document object.
    my $doc = $word->{Documents}->open($fileName)
        or die "Cannot open '$fileName' in Word: " . Win32::OLE->LastError();

    # Lexical accumulator; the extraction runs inside eval so that the
    # document is always closed, even when a helper dies.
    my $allText = '';
    my $ok = eval {
        $allText .= getProperties($doc);
        $allText .= getParagraphs($doc);
        $allText .= getShapes($doc);
        $allText .= getHeadersFooters($doc);
        1;
    };
    my $error = $@;

    $doc->close(0);    # 0: close without saving changes
    undef $doc;
    undef $word;

    die($error || "Text extraction from '$fileName' failed") unless $ok;
    return $allText;
}
# getParagraphs($doc)
#
# Walks the Paragraphs collection of the document object $doc and returns the
# text of every paragraph, one per line.
#
# The last character of each paragraph's Range text is removed with chop
# (presumably Word's trailing paragraph mark -- verify) and replaced by "\n".
sub getParagraphs {
    my $doc = shift;
    my $e = Win32::OLE::Enum->new($doc->Paragraphs);
    my $allText = '';
    # Loop variables are lexical so nothing leaks into package globals.
    while (my $paragraph = $e->Next) {
        my $text = $paragraph->Range->{Text};
        $text = '' unless defined $text;
        chop $text;
        $allText .= $text . "\n";
    }
    return $allText;
}
# getProperties($doc)
#
# Reads built-in document properties from the document object $doc and
# returns them as a mail-style header block followed by an empty line:
#
#   Subject: <property 1> <property 2>
#   From: <property 3>,<property 7>
#   Date: <property 11>
#
# The numeric indices are passed straight to BuiltInDocumentProperties; the
# variable names below (title, subject, author, last author, creation date)
# reflect what the original author intended them to be.
# Undefined property values are rendered as empty strings.
sub getProperties {
    my $doc = shift;

    # Fetch one built-in property value, mapping undef to ''.
    my $valueOf = sub {
        my $index = shift;
        my $value = $doc->BuiltInDocumentProperties($index)->{Value};
        return defined $value ? $value : '';
    };

    my $title      = $valueOf->(1);
    my $subject    = $valueOf->(2);
    my $author     = $valueOf->(3);
    my $lastAuthor = $valueOf->(7);
    my $createDate = $valueOf->(11);

    my $allText = '';
    # The subject was previously lost because of a misspelled variable name.
    $allText .= 'Subject: ' . $title . ' ' . $subject;
    $allText .= "\n";
    $allText .= 'From: ' . $author . ',' . $lastAuthor;
    $allText .= "\n";
    $allText .= 'Date: ' . $createDate;
    $allText .= "\n";
    $allText .= "\n";
    return $allText;
}
# getShapes($doc)
#
# Walks the Shapes collection of the document object $doc and returns the
# text of every shape whose Type is 17 (text box), one shape per line.
# Shapes of any other type are skipped.
#
# The last character of each text is removed with chop (presumably a trailing
# paragraph mark -- verify) and replaced by "\n".
sub getShapes {
    my $doc = shift;
    my $textBoxType = 17;    # msoShapeTextBox = 17
    my $e = Win32::OLE::Enum->new($doc->Shapes);
    my $allText = '';
    # Loop variables are lexical so nothing leaks into package globals.
    while (my $shape = $e->Next) {
        next unless $shape->{Type} == $textBoxType;
        my $text = $shape->TextFrame->TextRange->{Text};
        $text = '' unless defined $text;
        chop $text;
        $allText .= $text . "\n";
    }
    return $allText;
}
# getHeadersFooters($doc)
#
# Walks the Sections collection of the document object $doc and, for every
# section, returns the text of all its headers followed by the text of all
# its footers, one entry per line.
sub getHeadersFooters {
    my $doc = shift;
    my $allText = '';
    my $sections = Win32::OLE::Enum->new($doc->Sections);
    while (my $section = $sections->Next) {
        # Headers are fetched and fully enumerated before Footers is touched.
        $allText .= _headerFooterTexts($section->Headers);
        $allText .= _headerFooterTexts($section->Footers);
    }
    return $allText;
}

# _headerFooterTexts($collection)
#
# Private helper: enumerates $collection and returns the Range text of every
# item, with the last character removed by chop (presumably a trailing
# paragraph mark -- verify) and "\n" appended.
sub _headerFooterTexts {
    my $collection = shift;
    my $items = Win32::OLE::Enum->new($collection);
    my $texts = '';
    while (my $item = $items->Next) {
        my $text = $item->Range->{Text};
        $text = '' unless defined $text;
        chop $text;
        $texts .= $text . "\n";
    }
    return $texts;
}
#main
# Command-line entry point: prints the text extracted from the Word document
# named by the first argument to standard output. Exits with a usage message
# when no document is named.
die "Usage: $0 <word-document>\n"
    unless defined $ARGV[0] && length $ARGV[0];
print ReadMSWord::ReadMSWord($ARGV[0]);