#                                                         -*- Perl -*-
# Copyright (c) 2007  Kazuhiro Ito
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#

use strict;
use warnings;
use Getopt::Long;

use English;
use FreePWING::FPWUtils::FPWParser;
use Encode qw/ from_to /;
use FileHandle;
use Compress::Raw::Zlib;

use vars qw(%fpwoald7_conf);
require "oald7-fpw.conf";

use vars qw (%entity_table %utf2euc_table $utf2euc_regexp);
use vars qw (%tags_table $tags_regexp);
require "tables";

use vars qw (@tags_table);

# Formatting rules applied by register_content() to each opening tag.
#
# Each row is an array reference with seven fields:
#   [0] tag pattern: interpolated into a regular expression and matched
#       against the opening tag text right after the '<'; it must be
#       followed by a character other than a letter, '_' or '-'.
#   [1] number of newlines wanted before the element (beginning LF).
#   [2] number of newlines wanted after the element (ending LF).
#   [3] indent delta: added to the indent level when the tag opens and
#       subtracted again when it closes.
#   [4] ignore flag: when non-zero, the element and its content are skipped.
#   [5] text emitted when the tag opens.
#   [6] text emitted when the tag closes.
#
# Rows are tried in order and the first matching row wins, so the
# catch-all '.*' row must stay last.
@tags_table = (
  # tag, beginning LF, ending LF, indent, ignoring, beg-text, end-text
#    ['p:h'     , 0, 0, 0, 0, '', ''],
#    ['p:panel' , 1, 0, 0, 0, '', ''],
  ['p:althead'  , 1, 0, 0, 0, ''  , '' ],
  ['p:box'      , 2, 2, 2, 0, ''  , '' ],
  ['p:d'        , 0, 0, 0, 0, ''  , '' ],
  ['p:defs'     , 1, 0, 0, 0, ''  , '' ],
  ['p:defshead' , 1, 0, 0 ,0, ''  , '' ],
  ['p:defsref'  , 1, 0, 0, 0, ''  , '' ],
  ['p:dr-g'     , 1, 1, 1, 0, ''  , '' ],
  ['p:e_h'      , 1, 1, 0, 0, ''  , '' ],
  ['p:h-g'      , 0, 1, 0, 0, ''  , '' ],
  ['p:helppanel', 1, 1, 0, 0, ''  , '' ],
  ['p:id-g'     , 1, 0, 0, 0, ''  , '' ],
  ['p:idpanel'  , 1, 0, 0, 0, '[' , ']'],
  ['p:inref'    , 2, 2, 0, 0, ''  , '' ],
  ['p:more'     , 1, 0, 0 ,0, ''  , '' ],
  ['p:n-g'      , 1, 0, 1, 0, ''  , '' ],
  ['p:ndx'      , 1, 0, 0, 0, ''  , '' ],
  ['p:ndxnr'    , 0, 0, 0, 0, ''  , ' '],
  ['p:ngnum'    , 0, 0, 0, 0, ''  , ' '],
  ['p:np'       , 1, 0, 0, 0, '　', '' ],
  ['p:p-g'      , 1, 2, 0, 0, ''  , '' ],
  ['p:para'     , 1, 0, 0, 0, ''  , '' ],
  ['p:pv-g'     , 1, 0, 0, 0, ''  , '' ],
  ['p:pvpanel'  , 1, 0, 0, 0, '[' , ']'],
  ['p:sd-g'     , 1, 0, 0, 0, ''  , '' ],
  ['p:subentry' , 2, 0, 1, 0, ''  , '' ],
  ['p:subhdnr'  , 0, 0, 0, 0, ''  , ' '],
  ['p:sn'       , 1, 0, 0, 0, ''  , '' ],
  ['p:unpanel'  , 1, 1, 0, 0, ''  , '' ],
  ['p:x'        , 0, 0, 0, 0, ''  , ' '],
  ['p:zn'       , 0, 0, 0, 0, ''  , ' '],

#  ['p:ud'       , 0, 0, 0, 0, ' ', ''],
  ['span class="helpbox"', 2, 2, 2, 0, '', ''],
  ['span class="etymbox"', 2, 2, 2, 0, '', ''],
  ['span class="unbox"'  , 2, 2, 2, 0, '', ''],
#  ['span class="exa"'    , 0, 0, 0, 0, '', ''],

  # Elements whose content is dropped entirely (ignore flag set).
  ['p:i-g'   , 0, 0, 0, 1, '', ''],
  ['span class="activebutton"', 0, 0, 0, 1, '', ''],
 
  # Catch-all: any other tag gets no special formatting.
  ['.*'      , 0, 0, 0, 0, '', ''],
    );


 MAIN: {
   # Entry point: reads the compressed dictionary body (CONTENT.tda) chunk
   # by chunk, using the 8-byte records of the index file
   # (CONTENT.tda.tdz) to learn each chunk's compressed size, inflates
   # every chunk, splits it into NUL-separated entries and registers the
   # heading, the body text and the search keys of each entry.
   my $time = time;
   my $page_count = 0;

   my ($fpwtext, $fpwheading, $fpwword2, $fpwcopyright);
   initialize_fpwparser('text' => \$fpwtext,
                        'heading' => \$fpwheading,
                        'word2' => \$fpwword2,
                        'copyright' => \$fpwcopyright);

   my $SRCDIR = './SRC/';
   my $content_filename = $SRCDIR.'CONTENT.tda';
   my $content_index_filename = $SRCDIR.'CONTENT.tda.tdz';
#    my $index_filename = $SRCDIR.'files.dat';
#    my $heading_filename = $SRCDIR.'TITLE.tda';

   my $content_handle = FileHandle->new;
   if (!$content_handle->open($content_filename, 'r')) {
     die "$PROGRAM_NAME: Failed to open the file, $ERRNO: $content_filename\n";
   }
   binmode $content_handle;

   my $content_index_handle = FileHandle->new;
   if (!$content_index_handle->open($content_index_filename, 'r')) {
     die "$PROGRAM_NAME: Failed to open the file, $ERRNO: $content_index_filename\n";
   }
   binmode $content_index_handle;

 CHUNK:
   for (;;) {
     # Each index record is 8 bytes; the compressed chunk size is the
     # little-endian 32-bit value stored in the last 4 bytes.
     my $index_record;
     my $index_bytes = read($content_index_handle, $index_record, 8);
     if (!defined $index_bytes) {
       die "$PROGRAM_NAME: File reading error: $content_index_filename\n";
     }
     # A short (or empty) read means the index is exhausted.
     last CHUNK if $index_bytes != 8;
     my ($size) = unpack("x4V", $index_record);

     my $zipped_contents;
     my $content_bytes = read($content_handle, $zipped_contents, $size);
     if (!defined $content_bytes || $content_bytes != $size) {
       die "$PROGRAM_NAME: File reading error: $content_filename\n";
     }

     my ($inflater, $status) = Compress::Raw::Zlib::Inflate->new();
     if ($status != Z_OK) {
       die "$PROGRAM_NAME: Failed to initialize inflater\n";
     }
     # Keep the status returned by inflate(); without the assignment the
     # check below would only re-test the constructor's status and a
     # corrupted chunk would go unnoticed.
     my $contents;
     $status = $inflater->inflate($zipped_contents, $contents);
     if ($status != Z_OK && $status != Z_STREAM_END) {
       die "$PROGRAM_NAME: Failed to inflate\n";
     }

   ENTRY:
     foreach my $content (split(/\0+/, $contents)) {
       if (!($content =~ /<p:(ht?)>(.+?)<\/p:\1>/)) {
         print "Warning: no header is found.\n";
         next ENTRY;
       }
       my $heading = $2;
       $heading =~ s/<[^>]*>//g;

       $page_count++;

       print "Entry: $page_count; $heading\n";
       my $heading_position = register_heading(\$fpwheading, $heading);
       # The text must be registered before the <p:v> variants are
       # stripped from $content below.
       my $text_position = register_content(\$fpwtext, $heading, $content);

       # Collect the main heading plus every <p:v> variant spelling.
       my @headings = ($heading);
       while ($content =~ s/<p:v>(.+?)<\/p:v>//) {
         my $variant = $1;
         $variant =~ s/<[^>]*>//g;
         push @headings, $variant;
       }

       # Build the search keys: every heading with '%' removed, plus a
       # second key without a leading article ("a", "an", "the") when the
       # heading starts with one.
       my @search_keys;
       foreach my $candidate (@headings) {
         my $key = $candidate;
         $key =~ s/%//g;
         push @search_keys, $key;
         if ($key =~ /^(?:[aA]n?|[Tt]he) (([^a&]|a[^n]|an[^d]|and[^ ]).+)/) {
           push @search_keys, $1;
         }
       }

       register_search_entry(\$fpwword2, \@search_keys,
                             $heading_position, $text_position);
     }
   }

   $content_index_handle->close();
   $content_handle->close();

   finalize_fpwparser('text' => \$fpwtext,
                      'heading' => \$fpwheading,
                      'word2' => \$fpwword2,
                      'copyright' => \$fpwcopyright);

   # The program name is passed as an argument so that a '%' in it cannot
   # be misread as a printf conversion.
   printf("%s: Elapsed time     : %8dsec.\n", $PROGRAM_NAME, time - $time);
   printf("%s: Number of entries: %8d\n", $PROGRAM_NAME, $page_count);
}


# Registers one heading in the heading parser.
#
# Arguments:
#   $fpwheading - reference to the variable holding the heading parser
#   $heading    - heading text; entities are decoded and the text is
#                 converted with utf2euc() before it is stored
# Returns the entry position reported by the parser.
# Dies with the parser's error message when an operation fails.
sub register_heading {
  my ($fpwheading, $heading) = @_;

  my $parser = $$fpwheading;
  my $converted = utf2euc(decode_entity($heading));

  unless ($parser->new_entry()) {
    die "$PROGRAM_NAME: " . $parser->error_message() . "\n";
  }
  unless ($parser->add_text($converted)) {
    die "$PROGRAM_NAME: " . $parser->error_message() . "\n";
  }

  return $parser->entry_position();
}

# Shortens a search key to at most 127 leading characters, where a
# character is either a pair of bytes in the range 0x80-0xff or a single
# byte in the range 0x20-0x7f.  A key that does not start with such a
# character is returned unchanged.
sub trim_key {
  my ($key) = @_;

  $key =~ s{(^([\x80-\xff][\x80-\xff]|[\x20-\x7f]){1,127}).*$}{$1};

  return $key;
}

# Registers the search keys of one entry in the word index.
#
# Arguments:
#   $fpwword2 - reference to the variable holding the index parser
#   $keys     - array reference of candidate keys; the array is only
#               read, never modified
#   $heading  - heading position of the entry
#   $text     - text position of the entry
#
# Every key is entity-decoded and converted with utf2euc(); the byte pair
# 0xA1 0xA6, question marks and parenthesized parts are then removed.
# Duplicate keys are registered only once, in order of first appearance.
# Dies with the parser's error message when a key cannot be added.
sub register_search_entry {
  my ($fpwword2, $keys, $heading, $text) = @_;
  my %seen;
  my @unique_keys;

  # Work on copies: a foreach loop variable aliases the array elements,
  # so assigning to it would overwrite the caller's data.
  foreach my $raw_key (@$keys) {
    my $key = utf2euc(decode_entity($raw_key));
    $key =~ s/\xA1\xA6|\?|\(.*?\)//g;

    push @unique_keys, $key unless $seen{$key}++;
  }

  foreach my $key (@unique_keys) {
    if (verbose_mode()) {
      my $printable = $key;
      from_to($printable, 'euc-jp', 'utf-8');
      print "Entry key: $printable\n";
    }

    my $index_key = $fpwoald7_conf{'trim_long_index'} ? trim_key($key) : $key;

    $$fpwword2->add_entry($index_key, $heading, $text)
        || die "$PROGRAM_NAME: " . $$fpwword2->error_message() . "\n";
  }
}

# Replaces named entities of the form "&name;" with the corresponding
# value from %entity_table and returns the resulting string.
#
# An entity name without a defined value in the table is left in the
# text unchanged instead of being replaced by an undefined value, which
# would delete it and raise an "uninitialized value" warning.
sub decode_entity {
  my ($text) = @_;

  $text =~ s{&([0-9a-zA-Z]+);}
            {defined $entity_table{$1} ? $entity_table{$1} : "&$1;"}ge;

  return $text;
}

# Converts a UTF-8 byte string to EUC-JP and returns the result.
#
# Steps:
#   1. sequences matched by $utf2euc_regexp are replaced through
#      %utf2euc_table before the conversion;
#   2. the string is transcoded from UTF-8 to EUC-JP;
#   3. every three-byte sequence introduced by 0x8F is replaced by '?';
#   4. as a workaround, every 0x7f byte is replaced by '?'.
sub utf2euc {
  my $text = shift;

  $text =~ s/$utf2euc_regexp/$utf2euc_table{$1}/g;

  from_to($text, 'utf-8', 'euc-jp');

  # Variant that would also replace 0x8E-prefixed two-byte sequences:
  # $text =~ s/\x8F[\xA1-\xFE][\xA1-\xFE]|\x8E[\xE0-\xFE]/?/g;
  $text =~ s/\x8F[\xA1-\xFE][\xA1-\xFE]/?/g;

  # Workaround
  $text =~ tr/\x7f/?/;

  return $text;
}


# Registers the body text of one entry in the text parser.
#
# Arguments:
#   $fpwtext - reference to the variable holding the text parser
#   $heading - entry heading; written first as the keyword of the entry
#   $content - marked-up source text of the entry
#
# The content is split into tags and text pieces.  Opening and closing
# tags are formatted according to @tags_table (newlines before/after,
# indent changes, surrounding text, ignored elements); text pieces are
# entity-decoded, converted with utf2euc() and written out.
#
# Returns the entry position reported by the parser.
# On a parser failure the error is reported through
# register_content_error(), which dies.
sub register_content {
  my ($fpwtext, $heading, $content) = @_;
  my $converted_heading = utf2euc(decode_entity($heading));

  ($$fpwtext->new_entry()
   && $$fpwtext->add_keyword_start()
   && $$fpwtext->add_text($converted_heading)
   && $$fpwtext->add_keyword_end()
   && $$fpwtext->add_newline()
   && $$fpwtext->add_indent_level(2))
      || die "$PROGRAM_NAME: " . $$fpwtext->error_message() . "\n";

  my $indent_level = 2;   # indent wanted for the next piece of text
  my $indent_last  = 2;   # indent most recently sent to the parser
  my @tag_info;           # @tags_table row of the open tag at each level
  my $ignore_level = 0;   # nesting level of the ignored element, 0 if none
  my $tag_level = 0;      # current nesting depth
  my $lfed = 0;           # newline counter since the last text output

  # Reports a parser failure.  The reference to the parser variable is
  # passed on, which is what register_content_error() dereferences.
  my $fail = sub {
    register_content_error($fpwtext, $heading, $content);
  };

  # Sends the wanted indent level (capped at 6) to the parser, but only
  # when it differs from the level sent last.
  my $sync_indent = sub {
    my $capped = $indent_level >= 6 ? 6 : $indent_level;
    return if $capped == $indent_last;
    $$fpwtext->add_indent_level($capped) || $fail->();
    $indent_last = $capped;
  };

  # Writes already converted text at the current indent level.
  my $emit_text = sub {
    my ($euc_text) = @_;
    $sync_indent->();
    $$fpwtext->add_text($euc_text) || $fail->();
    $lfed = 0;
  };

  # Adds the newlines still missing to reach $wanted.
  # NOTE(review): the counter is set to the number of newlines added by
  # this call rather than to the running total; kept as it was so the
  # generated layout does not change.
  my $ensure_newlines = sub {
    my ($wanted) = @_;
    return if !$wanted;
    my $missing = $wanted - $lfed;
    return if $missing <= 0;
    $lfed = $missing;
    for my $n (1 .. $missing) {
      $$fpwtext->add_newline() || $fail->();
    }
  };

  # NOTE(review): this substitution replaces the match with itself and
  # therefore leaves $content unchanged; kept as in the original.
  $content =~ s/(<p:entry.*<\/p:entry>)/$1/;

 PIECE:
  foreach my $text (split(/(<[^<>]*?>)/, $content)) {
    if ($text =~ /^<[^\/>]([^>]*[^\/])?>/) {
      # some opening tag.
      if ($ignore_level != 0) {
        $tag_level++;
        next PIECE;
      }

      # A lexical variable is used for the pattern so that the global $_
      # of the caller is not clobbered.  The braces keep the following
      # '[' from being read as an array subscript.
    RULE:
      foreach my $rule (@tags_table) {
        my $pattern = $rule->[0];
        next RULE unless $text =~ /<${pattern}[^a-zA-Z_-]/;

        if ($rule->[4] != 0) {
          # Ignored element: skip everything up to its closing tag.
          # NOTE(review): level 0 doubles as "not ignoring", so an ignored
          # tag opening at nesting level 0 would not start ignoring.
          $ignore_level = $tag_level;
          last RULE;
        }
        $tag_info[$tag_level] = $rule;

        $ensure_newlines->($rule->[1]);

        if (length($rule->[5]) > 0) {
          $emit_text->(utf2euc($rule->[5]));
        }

        $indent_level += $rule->[3];
        last RULE;
      }
      $tag_level++;

    } elsif ($text =~ /^<\//) {
      # some ending tag.
      $tag_level--;

      if ($ignore_level) {
        if ($ignore_level >= $tag_level) {
          # This closes the ignored element itself.
          $ignore_level = 0;
        }
        next PIECE;
      }

      # Fall back to a neutral rule when no opening tag was recorded for
      # this level (unbalanced markup).
      my $rule = $tag_info[$tag_level] || [0, 0, 0, 0, 0, '', ''];

      if (length($rule->[6]) > 0) {
        $emit_text->(utf2euc($rule->[6]));
      }

      $ensure_newlines->($rule->[2]);

      $indent_level -= $rule->[3];

    } elsif ($text =~ /^</) {
      # some single tag.
      if ($text =~ /^<br[ >\/]/) {
        $$fpwtext->add_newline() || $fail->();
        $lfed++;
      }
    } elsif ($ignore_level == 0 && $text !~ /^\n*$/) {
      # text.
      $emit_text->(utf2euc(decode_entity($text)));
    }
  }

  # Make sure the entry ends with a newline.
  if ($lfed == 0) {
    $$fpwtext->add_newline() || $fail->();
  }

  return $$fpwtext->entry_position();
}

# Reports a failure of the text parser and aborts the program.
#
# Arguments:
#   $fpwtext - the text parser, given either as a reference to the
#              variable holding the parser object or as the parser object
#              itself
#   $heading - heading of the entry being registered
#   $content - source text of the entry being registered
#
# Prints the parser's error message, the heading and the content to
# standard output, then dies.
sub register_content_error {
  my ($fpwtext, $heading, $content) = @_;

  # A reference to a variable that holds an object reports 'REF'; in every
  # other case the argument is taken to be the parser object itself.
  my $parser = ref($fpwtext) eq 'REF' ? $$fpwtext : $fpwtext;

  # Double quotes are required so that $PROGRAM_NAME is interpolated.
  print "$PROGRAM_NAME: " . $parser->error_message() . "\n";
  print "Heading: $heading\n";
  print "Content: $content\n";
  die "$PROGRAM_NAME: Failed to register content\n";
}
