Namazu-devel-ja(旧)

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
indexer.pl

From: knok@xxxxxxxxxxxxx (NOKUBI Takatsugu)
Date: Fri, 8 Nov 2002 14:22:11 JST
X-ml-name: namazu-devel-ja
X-mail-count: 02692
  リファクタリングの一環として、index 関連のコードを pl/ 以下に追い出
してみました。

  一応 make check した範囲では問題がないことを確認していますが、commit
する前に review していただけると嬉しいです。

# 次は非ファイル対応かな...
-- 
野首 貴嗣
E-mail: knok@xxxxxxxxxxxxx
	knok@xxxxxxxxxx / knok@xxxxxxxxxx

diff -cNr namazu-head/pl/Makefile.am namazu-knok-im2/pl/Makefile.am
*** namazu-head/pl/Makefile.am	2001-09-21 17:11:30.000000000 +0900
--- namazu-knok-im2/pl/Makefile.am	2002-11-07 15:38:41.000000000 +0900
***************
*** 16,22 ****
  	usage.pl \
  	util.pl \
  	var.pl \
! 	wakati.pl
  
  # Slightly different from perllib_DATA because of *.in files.
  EXTRA_DIST = $\
--- 16,23 ----
  	usage.pl \
  	util.pl \
  	var.pl \
! 	wakati.pl \
! 	indexer.pl
  
  # Slightly different from perllib_DATA because of *.in files.
  EXTRA_DIST = $\
***************
*** 30,36 ****
  	usage.pl \
  	util.pl \
  	var.pl \
! 	wakati.pl
  
  CLEANFILES = gettext.pl
  
--- 31,38 ----
  	usage.pl \
  	util.pl \
  	var.pl \
! 	wakati.pl \
! 	indexer.pl
  
  CLEANFILES = gettext.pl
  
diff -cNr namazu-head/pl/indexer.pl namazu-knok-im2/pl/indexer.pl
*** namazu-head/pl/indexer.pl	1970-01-01 09:00:00.000000000 +0900
--- namazu-knok-im2/pl/indexer.pl	2002-11-07 17:05:46.000000000 +0900
***************
*** 0 ****
--- 1,124 ----
+ #
+ 
+ package mknmz::indexer;
+ 
+ sub new {
+     my $self = {};
+     my $proto = shift @_;
+     my $class = ref($proto) || $proto;
+     bless($self);
+ 
+     $self->init(@_);
+     return $self;
+ }
+ 
+ sub init {
+     my $self = shift @_;
+     $self->{'KeyIndex'} = {};
+     $self->{'content'} = shift @_;
+     $self->{'conf::WORD_LENG_MAX'} = shift @_;
+     $self->{'conf::nosymbol'} = shift @_;
+     $self->{'hook::word'} = undef;
+ }
+ 
+ sub get_keyindex {
+     my $self = shift @_;
+     return $self->{'KeyIndex'};
+ }
+ 
+ sub word_hook {
+     my $self = shift @_;
+     $self->{'hook::word'} = shift @_;
+ }
+ 
+ sub noedgesymbol {
+     my $self = shift @_;
+     $self->word_hook(sub {$_[0] =~ s/^[^\xa1-\xfea-z_0-9]*(.*?)[^\xa1-\xfea-z_0-9]*$/$1/g; $_[0];});
+ }
+ 
+ sub count_words {
+     my $self = shift @_;
+ 
+     my $contref = $self->{'content'};
+ 
+     my $part1 = "";
+     my $part2 = "";
+     if ($$contref =~ /\x7f/) {
+         $part1 = substr $$contref, 0, index($$contref, "\x7f");
+         $part2 = substr $$contref, index($$contref, "\x7f");
+ #       $part1 = $PREMATCH;  # $& and friends are not efficient
+ #       $part2 = $MATCH . $POSTMATCH;
+     } else {
+         $part1 = $$contref;
+         $part2 = "";
+     }
+ 
+     # do scoring
+     my $word_count = $self->{'KeyIndex'};
+     $part2 =~ s!\x7f *(\d+) *\x7f([^\x7f]*)\x7f */ *\d+ *\x7f!
+         $self->wordcount_sub($2, $1, $word_count)!ge;
+     $self->wordcount_sub($part1, 1, $word_count);
+ }
+ 
+ sub wordcount_sub {
+     my $self = shift @_;
+     my ($text, $weight, $word_count) = @_;
+ 
+     # Count frequencies of words in a current document.
+     # Handle symbols as follows.
+     #
+     # tcp/ip      ->  tcp/ip,     tcp,      ip
+     # (tcp/ip)    ->  (tcp/ip),   tcp/ip,   tcp, ip
+     # ((tcpi/ip)) ->  ((tcp/ip)), (tcp/ip), tcp
+     #
+     # Don't do processing for nested symbols.
+     # NOTE: When -K is specified, all symbols are already removed.
+ 
+     my @words = split /\s+/, $text;
+     for my $word (@words) {
+         next if ($word eq "" || length($word) > $self->{'conf::WORD_LENG_MAX'});
+ 	if (defined $self->{'hook::word'}) {
+ 	    $word = &{$self->{'hook::word'}}($word);
+ 	}
+         $word_count->{$word} = 0 unless defined($word_count->{$word});
+         $word_count->{$word} += $weight;
+         unless ($self->{'option::nosymbol'}) {
+ 	    $self->splitsymbol($word, $weight);
+         }
+     }
+     return "";
+ }
+ 
+ sub splitsymbol {
+     my $self = shift @_;
+     my $word = shift @_;
+     my $weight = shift @_;
+     my $word_count = $self->{'KeyIndex'};
+     if ($word =~ /^[^\xa1-\xfea-z_0-9](.+)[^\xa1-\xfea-z_0-9]$/) {
+ 	$word_count->{$1} = 0 unless defined($word_count->{$1});
+ 	$word_count->{$1} += $weight;
+ 	return unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
+     } elsif ($word =~ /^[^\xa1-\xfea-z_0-9](.+)/) {
+ 	$word_count->{$1} = 0 unless defined($word_count->{$1});
+ 	$word_count->{$1} += $weight;
+ 	return unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
+     } elsif ($word =~ /(.+)[^\xa1-\xfea-z_0-9]$/) {
+ 	$word_count->{$1} = 0 unless defined($word_count->{$1});
+ 	$word_count->{$1} += $weight;
+ 	return unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
+     }
+     my @words_ = split(/[^\xa1-\xfea-z_0-9]+/, $word)
+       if $word =~ /[^\xa1-\xfea-z_0-9]/;
+     for my $tmp (@words_) {
+ 	next if $tmp eq "";
+ 	$word_count->{$tmp} = 0 unless defined($word_count->{$tmp});
+ 	$word_count->{$tmp} += $weight;
+     }
+ }
+ 
+ 1;
diff -cNr namazu-head/scripts/mknmz.in namazu-knok-im2/scripts/mknmz.in
*** namazu-head/scripts/mknmz.in	2002-10-31 19:24:17.000000000 +0900
--- namazu-knok-im2/scripts/mknmz.in	2002-11-07 17:05:43.000000000 +0900
***************
*** 65,70 ****
--- 65,71 ----
  my @Seed = ();
  my @LoadedRcfiles = ();
  my $Magic = new File::MMagic;
+ my $Indexer = undef;
  
  my $ReceiveTERM = 0;
  
***************
*** 358,363 ****
--- 359,367 ----
      require "wakati.pl" || die "unable to require \"wakati.pl\"\n";
      require "seed.pl" || die "unable to require \"seed.pl\"\n";
      require "gfilter.pl" || die "unable to require \"gfilter.pl\"\n";
+     require "indexer.pl" || die "unable to require \"indexer.pl\"\n";
+ 
+     $Indexer = new mknmz::indexer;
  
      @Seed = seed::init();
  }
***************
*** 484,491 ****
  
      put_dateindex($cfile);
      $content .= $weighted_str;   # add weights
!     normalize_content(\$content, $kanji);
!     count_words($docid_count, $docid_base, \$content);
      make_phrase_hash($docid_count, $docid_base, \$content);
  
      # assertion
--- 488,500 ----
  
      put_dateindex($cfile);
      $content .= $weighted_str;   # add weights
!     my $indexer = $Indexer;
!     $indexer->init(\$content, $conf::WORD_LENG_MAX, $var::Opt{'nosymbol'});
!     $indexer->noedgesymbol() if ($var::Opt{'noedgesymbol'});
!     do_wakatigaki(\$content, $kanji);
!     $content =~ tr/A-Z/a-z/; # Normalize
!     $indexer->count_words();
!     add_key($indexer, $docid_count, $docid_base);
      make_phrase_hash($docid_count, $docid_base, \$content);
  
      # assertion
***************
*** 495,500 ****
--- 504,525 ----
      return $cfile_size;
  }
  
+ sub add_key($$$) {
+     my $indexer = shift @_;
+     my $docid_count = shift @_;
+     my $docid_base = shift @_;
+     my $keyref = $indexer->get_keyindex();
+     my $docid = $docid_count + $docid_base;
+     for my $word (keys(%$keyref)) {
+ 	next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX);
+ 	$KeyIndexLast{$word} = 0 unless defined $KeyIndexLast{$word};
+ 	$KeyIndex{$word} .= pack("w2", 
+ 				 $docid - $KeyIndexLast{$word}, 
+ 				 $keyref->{$word});
+ 	$KeyIndexLast{$word} = $docid;
+     }
+ }
+ 
  #
  # Make the URI from the given file name.
  #
***************
*** 2184,2290 ****
      return $hash & 65535;
  }
  
! # Nomalization
! sub normalize_content($$) {
      my ($contref, $kanji) = @_;
-     
-     # Normalize into small letter.
-     $$contref =~ tr/A-Z/a-z/;
  
      # Do wakatigaki if necessary.
      if (util::islang("ja")) {
  	wakati::wakatize_japanese($contref) if $kanji;
      }
- 
-     # Remove all symbols when -K option is specified.
-     $$contref =~ tr/\xa1-\xfea-z0-9/   /c if $var::Opt{'nosymbol'};
- }
- 
- # Count frequencies of words.
- sub count_words ($$$) {
-     my ($docid_count, $docid_base, $contref) = @_;
-     my (@tmp);
- 
-     my $part1 = "";
-     my $part2 = "";
-     if ($$contref =~ /\x7f/) {
- 	$part1 = substr $$contref, 0, index($$contref, "\x7f");
- 	$part2 = substr $$contref, index($$contref, "\x7f");
- #	$part1 = $PREMATCH;  # $& and friends are not efficient
- #	$part2 = $MATCH . $POSTMATCH;
-     } else {
- 	$part1 = $$contref;
- 	$part2 = "";
-     }
- 
-     # do scoring
-     my %word_count = ();
-     $part2 =~ s!\x7f *(\d+) *\x7f([^\x7f]*)\x7f */ *\d+ *\x7f!
- 	wordcount_sub($2, $1, \%word_count)!ge;
-     wordcount_sub($part1, 1, \%word_count);
- 
-     # Add them to whole index
-     my $docid = $docid_count + $docid_base;
-     for my $word (keys(%word_count)) {
- 	next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX);
- 	$KeyIndexLast{$word} = 0 unless defined $KeyIndexLast{$word};
- 	$KeyIndex{$word} .= pack("w2", 
- 				 $docid - $KeyIndexLast{$word}, 
- 				 $word_count{$word});
- 	$KeyIndexLast{$word} = $docid;
-     }
- }
- 
- #
- # Count words and do score weighting
- #
- sub wordcount_sub ($$\%) {
-     my ($text, $weight, $word_count) = @_;
- 
-     # Count frequencies of words in a current document.
-     # Handle symbols as follows.
-     #
-     # tcp/ip      ->  tcp/ip,     tcp,      ip
-     # (tcp/ip)    ->  (tcp/ip),   tcp/ip,   tcp, ip
-     # ((tcpi/ip)) ->  ((tcp/ip)), (tcp/ip), tcp
-     #
-     # Don't do processing for nested symbols.
-     # NOTE: When -K is specified, all symbols are already removed.
- 
-     my @words = split /\s+/, $text;
-     for my $word (@words) {
- 	next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX);
- 	if ($var::Opt{'noedgesymbol'}) {
- 	    # remove symbols at both ends
- 	    $word =~ s/^[^\xa1-\xfea-z_0-9]*(.*?)[^\xa1-\xfea-z_0-9]*$/$1/g;
- 	}
- 	$word_count->{$word} = 0 unless defined($word_count->{$word});
- 	$word_count->{$word} += $weight;
- 	unless ($var::Opt{'nosymbol'}) {
- 	    if ($word =~ /^[^\xa1-\xfea-z_0-9](.+)[^\xa1-\xfea-z_0-9]$/) {
- 		$word_count->{$1} = 0 unless defined($word_count->{$1});
- 		$word_count->{$1} += $weight;
- 		next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
- 	    } elsif ($word =~ /^[^\xa1-\xfea-z_0-9](.+)/) {
- 		$word_count->{$1} = 0 unless defined($word_count->{$1});
- 		$word_count->{$1} += $weight;
- 		next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
- 	    } elsif ($word =~ /(.+)[^\xa1-\xfea-z_0-9]$/) {
- 		$word_count->{$1} = 0 unless defined($word_count->{$1});
- 		$word_count->{$1} += $weight;
- 		next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
- 	    }
- 	    my @words_ = split(/[^\xa1-\xfea-z_0-9]+/, $word)
- 		if $word =~ /[^\xa1-\xfea-z_0-9]/;
- 	    for my $tmp (@words_) {
- 		next if $tmp eq "";
- 		$word_count->{$tmp} = 0 unless defined($word_count->{$tmp});
- 		$word_count->{$tmp} += $weight;
- 	    }
- 	    @words_ = ();
- 	}
-     }
-     return "";
  }
  
  # Construct NMZ.i and NMZ.ii file. this processing is rather complex.
--- 2209,2221 ----
      return $hash & 65535;
  }
  
! sub do_wakatigaki ($$) {
      my ($contref, $kanji) = @_;
  
      # Do wakatigaki if necessary.
      if (util::islang("ja")) {
  	wakati::wakatize_japanese($contref) if $kanji;
      }
  }
  
  # Construct NMZ.i and NMZ.ii file. this processing is rather complex.
diff -cNr namazu-head/scripts/mknmz.in.orig namazu-knok-im2/scripts/mknmz.in.orig
*** namazu-head/scripts/mknmz.in.orig	1970-01-01 09:00:00.000000000 +0900
--- namazu-knok-im2/scripts/mknmz.in.orig	2002-10-31 19:24:17.000000000 +0900
***************
*** 0 ****
--- 1,2503 ----
+ #! %PERL% -w
+ # -*- Perl -*-
+ # mknmz - indexer of Namazu
+ # $Id: mknmz.in,v 1.114 2002/10/31 10:24:17 knok Exp $
+ #
+ # Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved.
+ # Copyright (C) 2000,2001 Namazu Project All rights reserved.
+ #     This is free software with ABSOLUTELY NO WARRANTY.
+ #
+ #  This program is free software; you can redistribute it and/or modify
+ #  it under the terms of the GNU General Public License as published by
+ #  the Free Software Foundation; either versions 2, or (at your option)
+ #  any later version.
+ # 
+ #  This program is distributed in the hope that it will be useful
+ #  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ #  GNU General Public License for more details.
+ #
+ #  You should have received a copy of the GNU General Public License
+ #  along with this program; if not, write to the Free Software
+ #  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ #  02111-1307, USA
+ #
+ #  This file must be encoded in EUC-JP encoding
+ #
+ 
+ package mknmz;
+ require 5.004;
+ use lib "%ADDITIONAL_INC%";
+ use Cwd;
+ use IO::File;
+ use File::Find;
+ use File::MMagic;
+ use Time::Local;
+ use strict;  # be strict since v1.2.0
+ use Getopt::Long;
+ use File::Copy;
+ use DirHandle;
+ 
+ use vars qw($SYSTEM);
+ $SYSTEM = $^O;
+ 
+ my $NAMAZU_INDEX_VERSION = "2.0";
+ 
+ my $CodingSystem = "euc";
+ my $PKGDATADIR    = $ENV{'pkgdatadir'} || "@pkgdatadir@";
+ my $CONFDIR       = "@CONFDIR@";     # directory where mknmzrc are in.
+ my $LIBDIR        = $PKGDATADIR . "/pl";      # directory where library etc. are in.
+ my $FILTERDIR     = $PKGDATADIR . "/filter";   # directory where filters are in.
+ my $TEMPLATEDIR   = $PKGDATADIR . "/template"; # directory where templates are in.
+ 
+ my $DeletedFilesCount = 0;
+ my $UpdatedFilesCount = 0;
+ my $APPENDMODE = 0;
+ my %PhraseHash = ();
+ my %PhraseHashLast = ();
+ my %KeyIndex = ();
+ my %KeyIndexLast = ();
+ my %CheckPoint = ("on" => undef, "continue" => undef);
+ my $ConfigFile = undef;
+ my $MediaType  = undef;
+ 
+ my $ReplaceCode  = undef;  # perl code for transforming URI
+ my @Seed = ();
+ my @LoadedRcfiles = ();
+ my $Magic = new File::MMagic;
+ 
+ my $ReceiveTERM = 0;
+ 
+ STDOUT->autoflush(1);
+ STDERR->autoflush(1);
+ main();
+ sub main {
+     my $start_time = time;
+ 
+     init();
+ 
+     # At first, loading pl/conf.pl to prevent overriding some variables.
+     preload_modules();
+ 
+     # set LANG and bind textdomain
+     util::set_lang();
+     textdomain('namazu', $util::LANG_MSG);
+ 
+     load_rcfiles();
+     load_modules();
+     my ($output_dir, @targets) = parse_options();
+     my ($docid_base, $total_files_num) = prep($output_dir, @targets);
+ 
+     my $swap = 1;
+     my $docid_count = 0;
+     my $file_count = 0;
+     my $total_files_size = 0;
+     my $key_count = 0;
+     my $checkpoint = 0;
+     my $flist_ptr = 0;
+     my $processed_files_size = 0;
+ 
+     if ($CheckPoint{'continue'}) {
+ 	# Restore variables
+ 	eval util::readfile($var::NMZ{'_checkpoint'}) ;
+     } else {
+ 	print $total_files_num . _(" files are found to be indexed.\n");
+     }
+ 
+     {
+ 	my $fh_errorsfile = util::efopen(">>$var::NMZ{'err'}");
+ 	my $fh_flist = util::efopen($var::NMZ{'_flist'});
+ 	my %field_indices = ();
+ 	get_field_index_base(\%field_indices);
+ 
+ 	if ($CheckPoint{'continue'}) {
+ 	    seek($fh_flist, $flist_ptr, 0);
+ 	}
diff -cNr namazu-head/scripts/mknmz.in.rej namazu-knok-im2/scripts/mknmz.in.rej
*** namazu-head/scripts/mknmz.in.rej	1970-01-01 09:00:00.000000000 +0900
--- namazu-knok-im2/scripts/mknmz.in.rej	2002-11-07 15:31:02.000000000 +0900
***************
*** 0 ****
--- 1,152 ----
+ ***************
+ *** 477,489 ****
+   	print $msg_prefix . "$uri [$mtype]\n";
+       }
+   
+       complete_field_info(\%fields, $cfile, $uri, 
+   			\$headings, \$content, \$weighted_str);
+       put_field_index(\%fields, $field_indices);
+   
+       put_dateindex($cfile);
+       $content .= $weighted_str;   # add weights
+ -     count_words($docid_count, $docid_base, \$content, $kanji);
+       make_phrase_hash($docid_count, $docid_base, \$content);
+   
+       # assertion
+ --- 481,499 ----
+   	print $msg_prefix . "$uri [$mtype]\n";
+       }
+   
+ + 
+       complete_field_info(\%fields, $cfile, $uri, 
+   			\$headings, \$content, \$weighted_str);
+       put_field_index(\%fields, $field_indices);
+   
+       put_dateindex($cfile);
+       $content .= $weighted_str;   # add weights
+ +     my $indexer = $Indexer;
+ +     $indexer->init(\$content, $conf::WORD_LENG_MAX, $var::Opt{'nosymbol'});
+ +     $indexer->noedgesymbol() if ($var::Opt{'noedgesymbol'});
+ +     do_wakatigaki(\$content, $kanji);
+ +     $indexer->count_words();
+ +     add_key($indexer, $docid_count, $docid_base);
+       make_phrase_hash($docid_count, $docid_base, \$content);
+   
+       # assertion
+ ***************
+ *** 2181,2282 ****
+       return $hash & 65535;
+   }
+   
+ - # Count frequencies of words.
+ - sub count_words ($$$$) {
+ -     my ($docid_count, $docid_base, $contref, $kanji) = @_;
+ -     my (@tmp);
+ - 
+ -     # Normalize into small letter.
+ -     $$contref =~ tr/A-Z/a-z/;
+ - 
+       # Do wakatigaki if necessary.
+       if (util::islang("ja")) {
+   	wakati::wakatize_japanese($contref) if $kanji;
+       }
+ - 
+ -     # Remove all symbols when -K option is specified.
+ -     $$contref =~ tr/\xa1-\xfea-z0-9/   /c if $var::Opt{'nosymbol'};
+ - 
+ -     my $part1 = "";
+ -     my $part2 = "";
+ -     if ($$contref =~ /\x7f/) {
Follow-Ups:
- Re: indexer.pl
  - From: Ryuji Abe
- Re: indexer.pl
  - From: Rei FURUKAWA
Prev by Date: Re: apachecache.pl
Next by Date: Re: indexer.pl
Previous by thread: Re: 中国語
Next by thread: Re: indexer.pl
Index(es):
- Date
- Thread