#
# Copyright (c) 2008  Kazuhiro Ito
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
# 3. Neither the name of the project nor the names of its contributors
#    may be used to endorse or promote products derived from this software
#    without specific prior written permission.
# 
# THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#

use strict;
use warnings;
use Getopt::Long;

use English;
use FileHandle;

 MAIN: {
   # Build an index ("array") file for a dictionary text file.
   #
   # The text file is split into tokens (entities such as "&lt;", multibyte
   # characters, single bytes, and <K>/<H> tags).  For every token selected
   # by the --index type, the byte offset of the token within the text file
   # is appended to the array file as a 32-bit big-endian integer
   # (pack "N"), so offsets of 4 GiB or more cannot be represented.
   #
   # Command line: [--encoding=name] [--index=type] text_file [array_file]
   # The array file name defaults to "<text_file>.ary".
   my $encoding = 'EUC-JP';
   my $index = 'all';

   # GetOptions reports bad options on STDERR and returns false; do not
   # carry on with a half-parsed command line.
   if (!GetOptions('encoding=s' => \$encoding, 'index=s' => \$index)) {
     show_help();
     exit(1);
   }

   if (!defined($ARGV[0])) {
     print "$PROGRAM_NAME: Too few arguments.\n";
     show_help();
     exit(1);
   }

   # Check index type.
   #   $keyword_level  - initial value of the per-line "inside <K>...</K>"
   #                     counter; 1 means every token is eligible.
   #   $indexing_regexp - a token is indexed only when it matches this.
   my ($keyword_level, $indexing_regexp);

   if ($index eq 'all') {
     # Every token except newline and tags, plus the <K> tag itself.
     $keyword_level = 1;
     $indexing_regexp = "^([^<\n]|<K>)";
   } elsif ($index eq 'keyword') {
     # Tokens from <K> (inclusive) up to </K> (exclusive), except newline.
     $keyword_level = 0;
     $indexing_regexp = '.';
   } elsif ($index eq 'prefix') {
     # Only the <K> tags.
     $keyword_level = 1;
     $indexing_regexp = '<K>';
   } else {
     die "$PROGRAM_NAME: Unknown index type, $index\n";
   }
   my $indexing_re = qr{$indexing_regexp};


   # Check encoding.
   # Each pattern matches exactly one token: an entity, one (multibyte)
   # character, or a <K>/<H> open/close tag.  The bytes "&", "<" and ">"
   # are deliberately left out of the single-byte ranges, so they only
   # match as part of an entity or a tag.
   my $char_regexp;

   if ($encoding =~ m/euc/i) {
     $char_regexp = "(&[a-z]+;|[\x8e\xa1-\xfe][\xa1-\xfe]|\x8f[\xa1-\xfe]{2}|[\x00-\x25]|[\x27-\x3b]|[\x3f-\x7f]|=|<\/?[KH]>)";
   } elsif ($encoding =~ m/utf.*8/i) {
     $char_regexp = "(&[a-z]+;|[\xc0-\xdf][\x80-\xbf]|[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3}|[\x00-\x25]|[\x27-\x3b]|[\x3f-\x7f]|=|<\/?[KH]>)";
   } elsif ($encoding =~ m/1.*byte/i) {
     $char_regexp = "(&[a-z]+;|[\x00-\x25]|[\x27-\x3b]|[\x3f-\xff]|=|<\/?[KH]>)";
   } else {
     die "$PROGRAM_NAME: Unknown encoding, $encoding\n";
   }
   my $char_re = qr{$char_regexp}x;


   # Check files.
   my $src_name = $ARGV[0];
   my $dst_name = defined($ARGV[1]) ? $ARGV[1] : "$src_name.ary";

   print "text  file is $src_name\n";
   print "array file is $dst_name\n";

   my $src_handle = FileHandle->new;
   my $dst_handle = FileHandle->new;

   if (!$src_handle->open("$src_name", 'r')) {
     die "$PROGRAM_NAME: Failed to open the file. $ERRNO: $src_name\n";
   }
   binmode $src_handle;

   if (!$dst_handle->open("$dst_name", 'w')) {
     die "$PROGRAM_NAME: Failed to open the file. $ERRNO: $dst_name\n";
   }
   binmode $dst_handle;


   # Dump index points.
   my $line_start = 0;          # Byte offset of the current line in the file.
   my $buffer = '';             # Packed offsets not yet written out.

   # getline is a method call, so Perl does not add the implicit defined()
   # test it adds for <FH>; without it a final line "0" would end the loop.
   while (defined(my $line = $src_handle->getline)) {
     # Lines starting with "#" are not indexed.
     if ($line !~ /^\#/) {
       # The keyword nesting counter starts afresh on every line.
       my $in_keyword = $keyword_level;

       while ($line =~ m{$char_re}g) {
         my $key = $1;

         # The whole match is captured in $1, so the token starts
         # length($key) bytes before pos().  Deriving the offset from the
         # real match position keeps it correct even when the tokenizer
         # had to skip bytes it could not match (a stray "&", "<", ">" or
         # a broken multibyte sequence).
         my $point = $line_start + pos($line) - length($key);

         if ($keyword_level == 0) {
           if ($key eq '<K>') {
             $in_keyword += 1;
           } elsif ($key eq '</K>') {
             $in_keyword -= 1;
           }
         }

         if ($in_keyword > 0 && $key =~ $indexing_re) {
           $buffer .= pack("N", $point);

           # Flush in chunks of roughly 64 KiB to bound memory use.
           if (length($buffer) > 1024 * 64) {
             $dst_handle->print($buffer)
               or die "$PROGRAM_NAME: Failed to write the file. $ERRNO: $dst_name\n";
             $buffer = '';
           }
         }
       }
     }
     $line_start = $src_handle->tell();
   }

   $dst_handle->print($buffer)
     or die "$PROGRAM_NAME: Failed to write the file. $ERRNO: $dst_name\n";

   # Buffered write errors (e.g. disk full) may only surface at close time.
   $dst_handle->close
     or die "$PROGRAM_NAME: Failed to close the file. $ERRNO: $dst_name\n";
   $src_handle->close;
}

# Print the command-line usage summary to STDOUT.
# Takes no arguments; $PROGRAM_NAME (from English) is interpolated into
# the usage line.
sub show_help {
  print <<"END_OF_HELP";
Usage:$PROGRAM_NAME [Options] text_file [array_file]
Options
--encoding=name  Set encoding of dictionary. Below names are available.
                   EUC-JP for EUC-JP encoding. (default)
                   UTF-8  for UTF-8 encoding.
                   1BYTE  for encoding which encode 1character per byte.
--index=type     Set index type to be created. Below types are available.
                   all     for indexing all text. (default)
                   keyword for indexing keyword text.
                   prefix  for indexing headings of keyword.
END_OF_HELP
}
