Namazu-devel-ja(旧)
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
indexer.pl
- From: knok@xxxxxxxxxxxxx (NOKUBI Takatsugu)
- Date: Fri, 8 Nov 2002 14:22:11 JST
- X-ml-name: namazu-devel-ja
- X-mail-count: 02692
リファクタリングの一環として、index 関連のコードを pl/ 以下に追い出
してみました。
一応 make check した範囲では問題がないことを確認していますが、commit
する前に review していただけると嬉しいです。
# 次は非ファイル対応かな...
--
野首 貴嗣
E-mail: knok@xxxxxxxxxxxxx
knok@xxxxxxxxxx / knok@xxxxxxxxxx
diff -cNr namazu-head/pl/Makefile.am namazu-knok-im2/pl/Makefile.am
*** namazu-head/pl/Makefile.am 2001-09-21 17:11:30.000000000 +0900
--- namazu-knok-im2/pl/Makefile.am 2002-11-07 15:38:41.000000000 +0900
***************
*** 16,22 ****
usage.pl \
util.pl \
var.pl \
! wakati.pl
# Slightly different from perllib_DATA because of *.in files.
EXTRA_DIST = $\
--- 16,23 ----
usage.pl \
util.pl \
var.pl \
! wakati.pl \
! indexer.pl
# Slightly different from perllib_DATA because of *.in files.
EXTRA_DIST = $\
***************
*** 30,36 ****
usage.pl \
util.pl \
var.pl \
! wakati.pl
CLEANFILES = gettext.pl
--- 31,38 ----
usage.pl \
util.pl \
var.pl \
! wakati.pl \
! indexer.pl
CLEANFILES = gettext.pl
diff -cNr namazu-head/pl/indexer.pl namazu-knok-im2/pl/indexer.pl
*** namazu-head/pl/indexer.pl 1970-01-01 09:00:00.000000000 +0900
--- namazu-knok-im2/pl/indexer.pl 2002-11-07 17:05:46.000000000 +0900
***************
*** 0 ****
--- 1,124 ----
+ #
+
+ package mknmz::indexer;
+
+ sub new {
+ my $self = {};
+ my $proto = shift @_;
+ my $class = ref($proto) || $proto;
+ bless($self);
+
+ $self->init(@_);
+ return $self;
+ }
+
+ sub init {
+ my $self = shift @_;
+ $self->{'KeyIndex'} = {};
+ $self->{'content'} = shift @_;
+ $self->{'conf::WORD_LENG_MAX'} = shift @_;
+ $self->{'conf::nosymbol'} = shift @_;
+ $self->{'hook::word'} = undef;
+ }
+
+ sub get_keyindex {
+ my $self = shift @_;
+ return $self->{'KeyIndex'};
+ }
+
+ sub word_hook {
+ my $self = shift @_;
+ $self->{'hook::word'} = shift @_;
+ }
+
+ sub noedgesymbol {
+ my $self = shift @_;
+ $self->word_hook(sub {$_[0] =~ s/^[^\xa1-\xfea-z_0-9]*(.*?)[^\xa1-\xfea-z_0-9]*$/$1/g; $_[0];});
+ }
+
+ sub count_words {
+ my $self = shift @_;
+
+ my $contref = $self->{'content'};
+
+ my $part1 = "";
+ my $part2 = "";
+ if ($$contref =~ /\x7f/) {
+ $part1 = substr $$contref, 0, index($$contref, "\x7f");
+ $part2 = substr $$contref, index($$contref, "\x7f");
+ # $part1 = $PREMATCH; # $& and friends are not efficient
+ # $part2 = $MATCH . $POSTMATCH;
+ } else {
+ $part1 = $$contref;
+ $part2 = "";
+ }
+
+ # do scoring
+ my $word_count = $self->{'KeyIndex'};
+ $part2 =~ s!\x7f *(\d+) *\x7f([^\x7f]*)\x7f */ *\d+ *\x7f!
+ $self->wordcount_sub($2, $1, $word_count)!ge;
+ $self->wordcount_sub($part1, 1, $word_count);
+ }
+
+ sub wordcount_sub {
+ my $self = shift @_;
+ my ($text, $weight, $word_count) = @_;
+
+ # Count frequencies of words in a current document.
+ # Handle symbols as follows.
+ #
+ # tcp/ip -> tcp/ip, tcp, ip
+ # (tcp/ip) -> (tcp/ip), tcp/ip, tcp, ip
+ # ((tcpi/ip)) -> ((tcp/ip)), (tcp/ip), tcp
+ #
+ # Don't do processing for nested symbols.
+ # NOTE: When -K is specified, all symbols are already removed.
+
+ my @words = split /\s+/, $text;
+ for my $word (@words) {
+ next if ($word eq "" || length($word) > $self->{'conf::WORD_LENG_MAX'});
+ if (defined $self->{'hook::word'}) {
+ $word = &{$self->{'hook::word'}}($word);
+ }
+ $word_count->{$word} = 0 unless defined($word_count->{$word});
+ $word_count->{$word} += $weight;
+ unless ($self->{'option::nosymbol'}) {
+ $self->splitsymbol($word, $weight);
+ }
+ }
+ return "";
+ }
+
+ sub splitsymbol {
+ my $self = shift @_;
+ my $word = shift @_;
+ my $weight = shift @_;
+ my $word_count = $self->{'KeyIndex'};
+ if ($word =~ /^[^\xa1-\xfea-z_0-9](.+)[^\xa1-\xfea-z_0-9]$/) {
+ $word_count->{$1} = 0 unless defined($word_count->{$1});
+ $word_count->{$1} += $weight;
+ return unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
+ } elsif ($word =~ /^[^\xa1-\xfea-z_0-9](.+)/) {
+ $word_count->{$1} = 0 unless defined($word_count->{$1});
+ $word_count->{$1} += $weight;
+ return unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
+ } elsif ($word =~ /(.+)[^\xa1-\xfea-z_0-9]$/) {
+ $word_count->{$1} = 0 unless defined($word_count->{$1});
+ $word_count->{$1} += $weight;
+ return unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
+ }
+ my @words_ = split(/[^\xa1-\xfea-z_0-9]+/, $word)
+ if $word =~ /[^\xa1-\xfea-z_0-9]/;
+ for my $tmp (@words_) {
+ next if $tmp eq "";
+ $word_count->{$tmp} = 0 unless defined($word_count->{$tmp});
+ $word_count->{$tmp} += $weight;
+ }
+ }
+
+ 1;
diff -cNr namazu-head/scripts/mknmz.in namazu-knok-im2/scripts/mknmz.in
*** namazu-head/scripts/mknmz.in 2002-10-31 19:24:17.000000000 +0900
--- namazu-knok-im2/scripts/mknmz.in 2002-11-07 17:05:43.000000000 +0900
***************
*** 65,70 ****
--- 65,71 ----
my @Seed = ();
my @LoadedRcfiles = ();
my $Magic = new File::MMagic;
+ my $Indexer = undef;
my $ReceiveTERM = 0;
***************
*** 358,363 ****
--- 359,367 ----
require "wakati.pl" || die "unable to require \"wakati.pl\"\n";
require "seed.pl" || die "unable to require \"seed.pl\"\n";
require "gfilter.pl" || die "unable to require \"gfilter.pl\"\n";
+ require "indexer.pl" || die "unable to require \"indexer.pl\"\n";
+
+ $Indexer = new mknmz::indexer;
@Seed = seed::init();
}
***************
*** 484,491 ****
put_dateindex($cfile);
$content .= $weighted_str; # add weights
! normalize_content(\$content, $kanji);
! count_words($docid_count, $docid_base, \$content);
make_phrase_hash($docid_count, $docid_base, \$content);
# assertion
--- 488,500 ----
put_dateindex($cfile);
$content .= $weighted_str; # add weights
! my $indexer = $Indexer;
! $indexer->init(\$content, $conf::WORD_LENG_MAX, $var::Opt{'nosymbol'});
! $indexer->noedgesymbol() if ($var::Opt{'noedgesymbol'});
! do_wakatigaki(\$content, $kanji);
! $content =~ tr/A-Z/a-z/; # Normalize
! $indexer->count_words();
! add_key($indexer, $docid_count, $docid_base);
make_phrase_hash($docid_count, $docid_base, \$content);
# assertion
***************
*** 495,500 ****
--- 504,525 ----
return $cfile_size;
}
+ sub add_key($$$) {
+ my $indexer = shift @_;
+ my $docid_count = shift @_;
+ my $docid_base = shift @_;
+ my $keyref = $indexer->get_keyindex();
+ my $docid = $docid_count + $docid_base;
+ for my $word (keys(%$keyref)) {
+ next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX);
+ $KeyIndexLast{$word} = 0 unless defined $KeyIndexLast{$word};
+ $KeyIndex{$word} .= pack("w2",
+ $docid - $KeyIndexLast{$word},
+ $keyref->{$word});
+ $KeyIndexLast{$word} = $docid;
+ }
+ }
+
#
# Make the URI from the given file name.
#
***************
*** 2184,2290 ****
return $hash & 65535;
}
! # Nomalization
! sub normalize_content($$) {
my ($contref, $kanji) = @_;
-
- # Normalize into small letter.
- $$contref =~ tr/A-Z/a-z/;
# Do wakatigaki if necessary.
if (util::islang("ja")) {
wakati::wakatize_japanese($contref) if $kanji;
}
-
- # Remove all symbols when -K option is specified.
- $$contref =~ tr/\xa1-\xfea-z0-9/ /c if $var::Opt{'nosymbol'};
- }
-
- # Count frequencies of words.
- sub count_words ($$$) {
- my ($docid_count, $docid_base, $contref) = @_;
- my (@tmp);
-
- my $part1 = "";
- my $part2 = "";
- if ($$contref =~ /\x7f/) {
- $part1 = substr $$contref, 0, index($$contref, "\x7f");
- $part2 = substr $$contref, index($$contref, "\x7f");
- # $part1 = $PREMATCH; # $& and friends are not efficient
- # $part2 = $MATCH . $POSTMATCH;
- } else {
- $part1 = $$contref;
- $part2 = "";
- }
-
- # do scoring
- my %word_count = ();
- $part2 =~ s!\x7f *(\d+) *\x7f([^\x7f]*)\x7f */ *\d+ *\x7f!
- wordcount_sub($2, $1, \%word_count)!ge;
- wordcount_sub($part1, 1, \%word_count);
-
- # Add them to whole index
- my $docid = $docid_count + $docid_base;
- for my $word (keys(%word_count)) {
- next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX);
- $KeyIndexLast{$word} = 0 unless defined $KeyIndexLast{$word};
- $KeyIndex{$word} .= pack("w2",
- $docid - $KeyIndexLast{$word},
- $word_count{$word});
- $KeyIndexLast{$word} = $docid;
- }
- }
-
- #
- # Count words and do score weighting
- #
- sub wordcount_sub ($$\%) {
- my ($text, $weight, $word_count) = @_;
-
- # Count frequencies of words in a current document.
- # Handle symbols as follows.
- #
- # tcp/ip -> tcp/ip, tcp, ip
- # (tcp/ip) -> (tcp/ip), tcp/ip, tcp, ip
- # ((tcpi/ip)) -> ((tcp/ip)), (tcp/ip), tcp
- #
- # Don't do processing for nested symbols.
- # NOTE: When -K is specified, all symbols are already removed.
-
- my @words = split /\s+/, $text;
- for my $word (@words) {
- next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX);
- if ($var::Opt{'noedgesymbol'}) {
- # remove symbols at both ends
- $word =~ s/^[^\xa1-\xfea-z_0-9]*(.*?)[^\xa1-\xfea-z_0-9]*$/$1/g;
- }
- $word_count->{$word} = 0 unless defined($word_count->{$word});
- $word_count->{$word} += $weight;
- unless ($var::Opt{'nosymbol'}) {
- if ($word =~ /^[^\xa1-\xfea-z_0-9](.+)[^\xa1-\xfea-z_0-9]$/) {
- $word_count->{$1} = 0 unless defined($word_count->{$1});
- $word_count->{$1} += $weight;
- next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
- } elsif ($word =~ /^[^\xa1-\xfea-z_0-9](.+)/) {
- $word_count->{$1} = 0 unless defined($word_count->{$1});
- $word_count->{$1} += $weight;
- next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
- } elsif ($word =~ /(.+)[^\xa1-\xfea-z_0-9]$/) {
- $word_count->{$1} = 0 unless defined($word_count->{$1});
- $word_count->{$1} += $weight;
- next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
- }
- my @words_ = split(/[^\xa1-\xfea-z_0-9]+/, $word)
- if $word =~ /[^\xa1-\xfea-z_0-9]/;
- for my $tmp (@words_) {
- next if $tmp eq "";
- $word_count->{$tmp} = 0 unless defined($word_count->{$tmp});
- $word_count->{$tmp} += $weight;
- }
- @words_ = ();
- }
- }
- return "";
}
# Construct NMZ.i and NMZ.ii file. this processing is rather complex.
--- 2209,2221 ----
return $hash & 65535;
}
! sub do_wakatigaki ($$) {
my ($contref, $kanji) = @_;
# Do wakatigaki if necessary.
if (util::islang("ja")) {
wakati::wakatize_japanese($contref) if $kanji;
}
}
# Construct NMZ.i and NMZ.ii file. this processing is rather complex.
diff -cNr namazu-head/scripts/mknmz.in.orig namazu-knok-im2/scripts/mknmz.in.orig
*** namazu-head/scripts/mknmz.in.orig 1970-01-01 09:00:00.000000000 +0900
--- namazu-knok-im2/scripts/mknmz.in.orig 2002-10-31 19:24:17.000000000 +0900
***************
*** 0 ****
--- 1,2503 ----
+ #! %PERL% -w
+ # -*- Perl -*-
+ # mknmz - indexer of Namazu
+ # $Id: mknmz.in,v 1.114 2002/10/31 10:24:17 knok Exp $
+ #
+ # Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved.
+ # Copyright (C) 2000,2001 Namazu Project All rights reserved.
+ # This is free software with ABSOLUTELY NO WARRANTY.
+ #
+ # This program is free software; you can redistribute it and/or modify
+ # it under the terms of the GNU General Public License as published by
+ # the Free Software Foundation; either versions 2, or (at your option)
+ # any later version.
+ #
+ # This program is distributed in the hope that it will be useful
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU General Public License for more details.
+ #
+ # You should have received a copy of the GNU General Public License
+ # along with this program; if not, write to the Free Software
+ # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ # 02111-1307, USA
+ #
+ # This file must be encoded in EUC-JP encoding
+ #
+
+ package mknmz;
+ require 5.004;
+ use lib "%ADDITIONAL_INC%";
+ use Cwd;
+ use IO::File;
+ use File::Find;
+ use File::MMagic;
+ use Time::Local;
+ use strict; # be strict since v1.2.0
+ use Getopt::Long;
+ use File::Copy;
+ use DirHandle;
+
+ use vars qw($SYSTEM);
+ $SYSTEM = $^O;
+
+ my $NAMAZU_INDEX_VERSION = "2.0";
+
+ my $CodingSystem = "euc";
+ my $PKGDATADIR = $ENV{'pkgdatadir'} || "@pkgdatadir@";
+ my $CONFDIR = "@CONFDIR@"; # directory where mknmzrc are in.
+ my $LIBDIR = $PKGDATADIR . "/pl"; # directory where library etc. are in.
+ my $FILTERDIR = $PKGDATADIR . "/filter"; # directory where filters are in.
+ my $TEMPLATEDIR = $PKGDATADIR . "/template"; # directory where templates are in.
+
+ my $DeletedFilesCount = 0;
+ my $UpdatedFilesCount = 0;
+ my $APPENDMODE = 0;
+ my %PhraseHash = ();
+ my %PhraseHashLast = ();
+ my %KeyIndex = ();
+ my %KeyIndexLast = ();
+ my %CheckPoint = ("on" => undef, "continue" => undef);
+ my $ConfigFile = undef;
+ my $MediaType = undef;
+
+ my $ReplaceCode = undef; # perl code for transforming URI
+ my @Seed = ();
+ my @LoadedRcfiles = ();
+ my $Magic = new File::MMagic;
+
+ my $ReceiveTERM = 0;
+
+ STDOUT->autoflush(1);
+ STDERR->autoflush(1);
+ main();
+ sub main {
+ my $start_time = time;
+
+ init();
+
+ # At first, loading pl/conf.pl to prevent overriding some variables.
+ preload_modules();
+
+ # set LANG and bind textdomain
+ util::set_lang();
+ textdomain('namazu', $util::LANG_MSG);
+
+ load_rcfiles();
+ load_modules();
+ my ($output_dir, @targets) = parse_options();
+ my ($docid_base, $total_files_num) = prep($output_dir, @targets);
+
+ my $swap = 1;
+ my $docid_count = 0;
+ my $file_count = 0;
+ my $total_files_size = 0;
+ my $key_count = 0;
+ my $checkpoint = 0;
+ my $flist_ptr = 0;
+ my $processed_files_size = 0;
+
+ if ($CheckPoint{'continue'}) {
+ # Restore variables
+ eval util::readfile($var::NMZ{'_checkpoint'}) ;
+ } else {
+ print $total_files_num . _(" files are found to be indexed.\n");
+ }
+
+ {
+ my $fh_errorsfile = util::efopen(">>$var::NMZ{'err'}");
+ my $fh_flist = util::efopen($var::NMZ{'_flist'});
+ my %field_indices = ();
+ get_field_index_base(\%field_indices);
+
+ if ($CheckPoint{'continue'}) {
+ seek($fh_flist, $flist_ptr, 0);
+ }
diff -cNr namazu-head/scripts/mknmz.in.rej namazu-knok-im2/scripts/mknmz.in.rej
*** namazu-head/scripts/mknmz.in.rej 1970-01-01 09:00:00.000000000 +0900
--- namazu-knok-im2/scripts/mknmz.in.rej 2002-11-07 15:31:02.000000000 +0900
***************
*** 0 ****
--- 1,152 ----
+ ***************
+ *** 477,489 ****
+ print $msg_prefix . "$uri [$mtype]\n";
+ }
+
+ complete_field_info(\%fields, $cfile, $uri,
+ \$headings, \$content, \$weighted_str);
+ put_field_index(\%fields, $field_indices);
+
+ put_dateindex($cfile);
+ $content .= $weighted_str; # add weights
+ - count_words($docid_count, $docid_base, \$content, $kanji);
+ make_phrase_hash($docid_count, $docid_base, \$content);
+
+ # assertion
+ --- 481,499 ----
+ print $msg_prefix . "$uri [$mtype]\n";
+ }
+
+ +
+ complete_field_info(\%fields, $cfile, $uri,
+ \$headings, \$content, \$weighted_str);
+ put_field_index(\%fields, $field_indices);
+
+ put_dateindex($cfile);
+ $content .= $weighted_str; # add weights
+ + my $indexer = $Indexer;
+ + $indexer->init(\$content, $conf::WORD_LENG_MAX, $var::Opt{'nosymbol'});
+ + $indexer->noedgesymbol() if ($var::Opt{'noedgesymbol'});
+ + do_wakatigaki(\$content, $kanji);
+ + $indexer->count_words();
+ + add_key($indexer, $docid_count, $docid_base);
+ make_phrase_hash($docid_count, $docid_base, \$content);
+
+ # assertion
+ ***************
+ *** 2181,2282 ****
+ return $hash & 65535;
+ }
+
+ - # Count frequencies of words.
+ - sub count_words ($$$$) {
+ - my ($docid_count, $docid_base, $contref, $kanji) = @_;
+ - my (@tmp);
+ -
+ - # Normalize into small letter.
+ - $$contref =~ tr/A-Z/a-z/;
+ -
+ # Do wakatigaki if necessary.
+ if (util::islang("ja")) {
+ wakati::wakatize_japanese($contref) if $kanji;
+ }
+ -
+ - # Remove all symbols when -K option is specified.
+ - $$contref =~ tr/\xa1-\xfea-z0-9/ /c if $var::Opt{'nosymbol'};
+ -
+ - my $part1 = "";
+ - my $part2 = "";
+ - if ($$contref =~ /\x7f/) {