#                                                         -*- Perl -*-
# Copyright (c) 2007-2009  Kazuhiro Ito
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#

use strict;
use warnings;
use Getopt::Long;

use English;
use FreePWING::FPWUtils::FPWParser;
use Encode qw/ from_to /;
use FileHandle;
use Compress::Raw::Zlib;

use vars qw(%fpwoald7_conf @tags_table);

use vars qw (%entity_table %utf2euc_table $utf2euc_regexp);
use vars qw (%ipa_table %symbol_table %text_table);
require "../tables";

 MAIN: {
   # Script entry point.
   #
   # Usage: PROGRAM conf_file srcdir
   #
   # Reads srcdir/CONTENT.tda (a sequence of zlib-compressed chunks) using
   # srcdir/CONTENT.tda.tdz as the index of chunk sizes, and registers every
   # entry found in the inflated chunks as heading, text and search keys.
   my $start_time = time;
   my $page_count = 0;

   if (@ARGV < 2) {
     print "$PROGRAM_NAME: Too few arguments.\n\n";
     print "  $PROGRAM_NAME conf_file srcdir\n";
     die "\n";
   }

   # The configuration file is loaded as Perl code.
   # NOTE(review): presumably it fills %fpwoald7_conf and @tags_table,
   # which the subroutines below read -- confirm against the conf file.
   require "$ARGV[0]";

   my ($fpwtext, $fpwheading, $fpwword2, $fpwcopyright);
   initialize_fpwparser('text' => \$fpwtext,
                        'heading' => \$fpwheading,
                        'word2' => \$fpwword2,
                        'copyright' => \$fpwcopyright);

   # Make sure the source directory ends with exactly one trailing slash.
   my $srcdir = $ARGV[1];
   $srcdir =~ s/^(.+?)\/?$/$1\//;

   my $content_filename = $srcdir.'CONTENT.tda';
   my $content_index_filename = $srcdir.'CONTENT.tda.tdz';

   my $content_handle = FileHandle->new;
   if (!$content_handle->open("$content_filename", 'r')) {
     die "$PROGRAM_NAME: Failed to open the file, $ERRNO: $content_filename\n";
   }
   binmode $content_handle;

   my $content_index_handle = FileHandle->new;
   if (!$content_index_handle->open("$content_index_filename", 'r')) {
     die "$PROGRAM_NAME: Failed to open the file, $ERRNO: $content_index_filename\n";
   }
   binmode $content_index_handle;

   for (;;) {
     # Each index record is 8 bytes; the loop ends at the first short read.
     my $index_record;
     if (read($content_index_handle, $index_record, 8) != 8) {
       last;
     }
     # Skip the first 4 bytes; the next 4 are the little-endian size of
     # the compressed chunk in the content file.
     my ($size) = unpack("x4V", $index_record);

     my $zipped_contents;
     if (read($content_handle, $zipped_contents, $size) != $size) {
       die "$PROGRAM_NAME: File reading error: $content_filename\n";
     }

     my ($inflater, $status) = Compress::Raw::Zlib::Inflate->new();
     if (!defined($inflater) || $status != Z_OK) {
       die "$PROGRAM_NAME: Failed to initialize inflater\n";
     }

     # Keep the status returned by inflate() itself; testing the
     # constructor status again would never detect a corrupt chunk.
     my $contents;
     $status = $inflater->inflate($zipped_contents, $contents);
     if ($status != Z_OK && $status != Z_STREAM_END) {
       die "$PROGRAM_NAME: Failed to inflate\n";
     }

     # Entries inside an inflated chunk are separated by runs of NUL bytes.
     foreach my $content (split(/\0+/, $contents)) {
       if (!($content =~ /<p:(ht?)>(.+?)<\/p:\1>/)) {
         print "warning: no header was found.\n";
         next;
       }
       my $heading = $2;

       # Tag-free copy of the heading, used for the progress message only.
       my $plain_heading = $heading;
       $plain_heading =~ s/<[^>]*>//g;
       print "Entry: $page_count; $plain_heading\n";

       my $heading_position = register_heading(\$fpwheading, $heading);
       my $text_position
           = register_content(\$fpwtext, $page_count, $heading, $content);

       # Search keys: the heading plus every <p:v> element of the entry.
       # The <p:v> elements are removed from $content while collecting,
       # which is why this happens after register_content().
       my @headings = ($heading);
       while ($content =~ s/<p:v>(.+?)<\/p:v>//) {
         push(@headings, $1);
       }

       @headings = expand_search_entry(@headings);
       register_search_entry(\$fpwword2, \@headings,
                             $heading_position, $text_position);

       $page_count++;
     }
   }

   $content_handle->close();
   $content_index_handle->close();

   finalize_fpwparser('text' => \$fpwtext,
                      'heading' => \$fpwheading,
                      'word2' => \$fpwword2,
                      'copyright' => \$fpwcopyright);

   # Pass the program name as an argument so that a '%' in it cannot be
   # mistaken for a format directive.
   printf("%s: Elapsed time     : %8dsec.\n", $PROGRAM_NAME, time - $start_time);
   printf("%s: Number of entries: %8d\n", $PROGRAM_NAME, $page_count);
}


# Build the list of search keys derived from a list of headings.
#
# Pass 1: strip '%' characters and markup tags from each heading, keep the
#         result, and when it starts with an article ("a", "an", "the" in
#         either case) also keep the remainder after the article.
# Pass 2: for each key, keep it; when it contains two alternatives joined
#         by NBSP '/' NBSP, also add one key per alternative and one with
#         both alternatives concatenated; when the configured book is
#         'wfinder', also add the pieces obtained by splitting on '/'.
#
# Arguments: the headings (list of strings).
# Returns:   the expanded list of keys (may contain duplicates).
sub expand_search_entry {
  my @stripped;

  for my $raw (@_) {
    my $plain = $raw;
    $plain =~ s/%|<[^>]*>//g;
    push(@stripped, $plain);

    # Add heading removed leading 'the' or 'a[n]'.
    if ($plain =~ /^(?:[aA]n?|[Tt]he) (([^a&]|a[^n]|an[^d]|and[^ ]).+)/) {
      push(@stripped, $1);
    }
  }

  my $split_on_slash = $fpwoald7_conf{'book_name'} eq 'wfinder';
  my @expanded;

  for my $entry (@stripped) {
    push(@expanded, $entry);

    if (my ($lead, $first, $second, $trail)
        = $entry =~ /(.* )?([^ ]+)\xc2\xa0\/\xc2\xa0([^ ]+)( .*)?/) {
      $lead  = '' unless defined($lead);
      $trail = '' unless defined($trail);
      push(@expanded,
           $lead.$first.$trail,
           $lead.$second.$trail,
           $lead.$first.$second.$trail);
    }

    # Add heading separated by '/' (for wordfinder).
    if ($split_on_slash && $entry =~ /\//) {
      push(@expanded, split(/\//, $entry));
    }
  }

  return @expanded;
}

# Start a new entry in the heading writer and emit $heading into it.
#
# Arguments: $fpwheading - reference to the heading writer object
#            $heading    - heading markup (passed to register_text as is;
#                          no entity decoding or charset conversion is
#                          applied here)
# Returns:   the writer's entry position for the new heading.
# Dies with the writer's error message when either step fails.
sub register_heading {
  my ($fpwheading, $heading) = @_;

  unless ($$fpwheading->new_entry()
          && register_text($fpwheading, $heading)) {
    die "$PROGRAM_NAME: " . $$fpwheading->error_message() . "\n";
  }

  return $$fpwheading->entry_position();
}

# Cut a key down to at most 127 units, where a unit is either one byte in
# \x20-\x7f or a pair of bytes that are both in \x80-\xff.
#
# Argument: the key (byte string).
# Returns:  the shortened key; the string comes back unchanged when the
#           pattern does not match (for example an empty string, or one
#           that starts with a byte below \x20).
sub trim_key {
  my ($key) = @_;

  $key =~ s{
             ^
             ( (?: [\x80-\xff][\x80-\xff] | [\x20-\x7f] ){1,127} )
             .* $
           }{$1}x;

  return $key;
}

# Register a set of search keys that all point at one heading/text pair.
#
# Arguments: $fpwword2 - reference to the word index writer object
#            $keys     - array reference of candidate keys; the elements
#                        are rewritten in place with their converted form
#            $heading  - heading position stored with every key
#            $text     - text position stored with every key
# Dies with the writer's error message when a key cannot be added.
sub register_search_entry {
  my ($fpwword2, $keys, $heading, $text) = @_;
  my %unique;

  # Decode entities, convert to EUC-JP, then drop \xA1\xA6 sequences,
  # question marks and parenthesised parts.  The hash removes duplicates.
  for my $candidate (@$keys) {
    $candidate = utf2euc(decode_entity($candidate));
    $candidate =~ s/\xA1\xA6|\?|\(.*?\)//g; 
    $unique{$candidate} = 1;
  }

  for my $entry_key (keys(%unique)) {
    # The key is always echoed (converted back to UTF-8 for display).
    my $shown = $entry_key;
    from_to($shown, 'euc-jp', 'utf-8');
    print "Entry key: $shown\n";

    $entry_key = trim_key($entry_key) if $fpwoald7_conf{'trim_long_index'};

    unless ($$fpwword2->add_entry($entry_key, $heading, $text)) {
      die "$PROGRAM_NAME: " . $$fpwword2->error_message() . "\n";
    }
  }
}

# Replace named entities of the form &name; (name made of ASCII letters
# and digits) with their value from %entity_table.
#
# An entity whose name has no defined value in the table is left in the
# text verbatim, so it is neither silently deleted nor the cause of an
# "uninitialized value" warning.  The text is scanned once; replacement
# values are not re-scanned for further entities.
#
# Argument: the text to decode.
# Returns:  the decoded copy.
sub decode_entity {
  my $text = $_[0];

  $text =~ s{&([0-9a-zA-Z]+);}
            {defined($entity_table{$1}) ? $entity_table{$1} : "&$1;"}ge;

  return $text;
}

# Convert UTF-8 encoded text to EUC-JP.
#
# Steps, in order:
#   1. every match of $utf2euc_regexp is replaced by the %utf2euc_table
#      value for its first capture group;
#   2. the text is transcoded from UTF-8 to EUC-JP;
#   3. every three-byte sequence \x8F + two bytes in \xA1-\xFE is replaced
#      by '?';
#   4. every \x7f byte is replaced by '?' (workaround).
#
# Argument: the UTF-8 text.
# Returns:  the converted copy.
sub utf2euc {
  my $converted = shift;

  $converted =~ s/$utf2euc_regexp/$utf2euc_table{$1}/g;
  from_to($converted, 'utf-8', 'euc-jp');

  for ($converted) {
    s/\x8F[\xA1-\xFE]{2}/?/g;
    # Workaround
    tr/\x7f/?/;
  }

  return $converted;
}


# Write one complete entry to the text writer.
#
# The entry starts with a tag made of $count formatted as six digits,
# followed by the heading as keyword, a newline and indent level 2.  The
# body is whatever lies inside the outermost <p:OALD>, <p:WORDFINDER> or
# <p:GUIDE> element of $content (or all of $content when no such element
# matches).
#
# Arguments: $fpwtext - reference to the text writer object
#            $count   - running entry number
#            $heading - heading markup
#            $content - full markup of the entry
# Returns:   the writer's entry position.
# Dies when the entry header cannot be written; a failure while writing
# the body is handed to register_content_error().
sub register_content {
  my ($fpwtext, $count, $heading, $content) = @_;
  
  my $entry_tag = sprintf('%06d', $count);
  my $header_written = $$fpwtext->new_entry()
      && $$fpwtext->add_entry_tag($entry_tag)
      && $$fpwtext->add_keyword_start()
      && register_text($fpwtext, $heading)
      && $$fpwtext->add_keyword_end()
      && $$fpwtext->add_newline()
      && $$fpwtext->add_indent_level(2);
  unless ($header_written) {
    die "$PROGRAM_NAME: " . $$fpwtext->error_message() . "\n";
  }

  $content =~ s/^.*<p:(OALD|WORDFINDER|GUIDE)[^>]+>(.*)<\/p:\1.*$/$2/;

  unless (register_text($fpwtext, $content)) {
    register_content_error($fpwtext, $heading, $content);
  }

  return $$fpwtext->entry_position();
}

# Convert one piece of tagged markup into calls on a FreePWING writer.
#
# $content is split into tags and the text between them, and the pieces are
# processed in order while tracking nesting depth, indentation, style, font
# table, sound/reference spans and pending line feeds.
#
# Arguments: $fpwtext - reference to the writer object
#            $content - markup to emit
# Returns:   1 on success, 0 as soon as any writer call returns false.
#
# Each row of @tags_table is used here as an array reference:
#   [0] pattern fragment matched right after '<' (the tag name)
#   [1] number of line feeds wanted before the opening tag
#   [2] number of line feeds wanted after the closing tag
#   [3] amount added to the indent level while inside the element
#   [4] 1 means the element and everything inside it is skipped; a parent
#       whose value is 2 or more suppresses the line feeds of its children
#       (NOTE(review): the exact meaning of values >= 2 is defined by the
#       tables file -- confirm there)
#   [5] text emitted when the element opens
#   [6] text emitted when the element closes
#   [7] font table name ('ipa' and 'symbol' are handled below)
#   [8] style name; 'default' selects the empty (plain) style
sub register_text {
  my ($fpwtext, $content) = @_;

  my $text;

  my $indent_level = 2;   # indent wanted for the next output
  my $indent_last  = 2;   # indent most recently sent to the writer
  my $ignore_level = 0;   # depth of the element being skipped, 0 = none
  my $lfed = 0;           # line feeds emitted since the last text output
  my @tag_info;           # @tags_table row per nesting depth
  my $tag_level = 0;      # current nesting depth
  my $font_level = 0;     # number of active entries in @font_info
  my @font_info;          # stack of font table names
  my $ref_level = 0;      # depth of the open reference element, 0 = none
  my $ref_entry_id = '';  # six-digit target id of the open reference
  my %images = ();        # figures already emitted in this call
  my $sound_level = 0;    # depth of the open sound element, 0 = none
  my $style_level = 0;    # number of active entries in @style_info
  my @style_info;         # stack of style names
  my $style_last = '';    # style most recently sent to the writer

  my $tmp;
  my $i;

  # The capturing group in split keeps the tags themselves in the list.
  foreach $text (split(/(<[^<>]*?>)/, $content)) {
    if ($text =~ /^<[^\/>]([^>]*[^\/])?>/) {
      # some opening tags.
      # Check ignoring: inside a skipped element only the depth is tracked.
      if ($ignore_level != 0) {
	$tag_level++;
	next;
      }

      # Check for sound reference, superscript and subscript.
      if ($text =~ /^<span class="speaker(u[sk])"/) {
	if ($fpwoald7_conf{'sound_type'}) {
	  # Sound name is "<us|uk>_<MP3 base name>" with dots turned into
	  # underscores.
	  $tmp = $1;
	  $text =~ /\/([^\/]+)\.MP3/;
	  $tmp .= "_$1";
	  $tmp =~ s/\./_/g;
	  $$fpwtext->add_sound_start($tmp) || return 0;
	  $sound_level = $tag_level;
	} else {
	  # Sound output disabled: skip the whole speaker element.
	  $ignore_level = $tag_level;
	  $tag_level++;
	  next;
	}
      } elsif ($text =~ /^<p:sub[^a-zA-Z]/) {
	$$fpwtext->add_subscript_start() || return 0;
      } elsif ($text =~ /^<p:sup[^a-zA-Z]/) {
	$$fpwtext->add_superscript_start() || return 0;
      }

      # Find the first @tags_table row whose name matches this tag.
      foreach $tmp (@tags_table) {
	$_ = ${$tmp}[0];
	if  ($text !~ /^<$_[^a-zA-Z_-]/) {
	  next;
	}

	#Check ignore
	# NOTE(review): 0 doubles as "not ignoring", so an ignored tag at
	# depth 0 would not actually be skipped -- confirm that the input
	# never has one. The same applies to $sound_level and $ref_level.
	if (${$tmp}[4] == 1) {
	  $ignore_level = $tag_level;
	  last;
	}
	$tag_info[$tag_level] = $tmp;

	# Check whether LF is needed.
	if ((${$tmp}[1] != 0)
	    && (($tag_level == 0) || (${$tag_info[$tag_level - 1]}[4] < 2))) {
	  # Only emit the line feeds still missing after those already sent.
	  $i = ${$tmp}[1] - $lfed;
	  if ($i > 0) {
	    register_text_set_style($fpwtext, '', \$style_last) || return 0;
	    for (; $i > 0; $i--) {
	      $$fpwtext->add_newline() || return 0;
	      $lfed++;
	    }
	  }
	}

	# Check beginning text.
	if (length(${$tmp}[5]) > 0) {
	  register_text_set_indent($fpwtext, $indent_level, \$indent_last)
	      && register_text_set_style
	      ($fpwtext, $style_level ? $style_info[$style_level - 1] : '',
	       \$style_last)
	      && $$fpwtext->add_text(utf2euc(${$tmp}[5])) || return 0;
	  $lfed = 0;
	}
	$indent_level += ${$tmp}[3];

	# Check font_table.
	if (length(${$tmp}[7]) > 0) {
	  $font_info[$font_level] = ${$tmp}[7];
	  $font_level++;
	}
	  
	# Check style;
	if (length(${$tmp}[8]) > 0) {
	  $style_info[$style_level]
	      = (${$tmp}[8] eq 'default') ? '' :  ${$tmp}[8];
	  $style_level++;
	}
	  
	# Check reference.
	# Only references into the configured book become links.
	if ($text =~ /<p:xhz? type="$fpwoald7_conf{'book_name'}" entry="([0-9]+)"/) {
	  $ref_entry_id = substr("000000".$1, -6);
	  $ref_level = $tag_level;

	  # The style is closed before the reference starts and reopened
	  # inside it.
	  register_text_set_style($fpwtext, '', \$style_last)
	      && $$fpwtext->add_reference_start()
	      && register_text_set_style
	      ($fpwtext, $style_level ? $style_info[$style_level - 1] : '',
	       \$style_last) || return 0;
	}
	last;
      }
      $tag_level++;
    } elsif ($text =~ /^<\//) {
      # some ending tags.
      $tag_level--;

      # Check whether ignored.
      if ($ignore_level) {
	# Leaving the skipped element ends the skipping.
	if ($ignore_level >= $tag_level) {
	  $ignore_level = 0;
	}
	next;
      }

      # Check for sound reference, superscript and subscript.
      if ($sound_level && ($sound_level == $tag_level)) {
	$$fpwtext->add_sound_end() || return 0;
	$sound_level = 0;
      } elsif ($text =~ /^<\/p:sub[^a-zA-Z]/) {
	$$fpwtext->add_subscript_end() || return 0;
      } elsif ($text =~ /^<\/p:sup[^a-zA-Z]/) {
	$$fpwtext->add_superscript_end() || return 0;
      } 

      # NOTE(review): $tag_info[$tag_level] is whatever was last stored at
      # this depth; an opening tag without a @tags_table match leaves the
      # previous row (or nothing) there -- confirm every tag is listed.

      # Check font table.
      if (length(${$tag_info[$tag_level]}[7]) > 0) {
	if ($font_level == 0) {
	  print "warning: font level mismatch.\n";
	} else {
	  $font_level--;
	}
      }

      # Check style.
      if (length(${$tag_info[$tag_level]}[8]) > 0) {
	if ($style_level == 0) {
	  print "warning: style level mismatch.\n";
	} else {
	  $style_level--;
	}
      }

      # Check ending text.
      if (length(${$tag_info[$tag_level]}[6]) > 0) {
	register_text_set_indent($fpwtext, $indent_level, \$indent_last)
	    && register_text_set_style
	    ($fpwtext, $style_level ? $style_info[$style_level - 1] : '',
	     \$style_last)
	    && $$fpwtext->add_text(utf2euc(${$tag_info[$tag_level]}[6]))
	    || return 0;
	$lfed = 0;
      }

      # Check if LF is needed.
      if ((${$tag_info[$tag_level]}[2] != 0)
	  && (($tag_level == 0) || (${$tag_info[$tag_level - 1]}[4] < 2))) {
	$i = ${$tag_info[$tag_level]}[2] - $lfed;
	if ($i > 0) {
	  register_text_set_style($fpwtext, '', \$style_last) || return 0;
	  for (; $i > 0; $i--) {
	    $$fpwtext->add_newline() || return 0;
	    $lfed++;
	  }
	}
      }

      # Undo the indent change made when the element was opened.
      $indent_level -= ${$tag_info[$tag_level]}[3];

      #Check reference.
      if($ref_level && ($ref_level == $tag_level)) {
	register_text_set_style($fpwtext, '', \$style_last)
	    && $$fpwtext->add_reference_end($ref_entry_id) || return 0;
	$ref_level = 0;
      }
    } elsif ($text =~ /^</) {
      # some single tags.
      # NOTE(review): unlike the other branches this one does not test
      # $ignore_level, so single tags inside a skipped element are still
      # processed -- confirm this is intended.
      if ($text =~ /^<(br|p:z)([^a-zA-Z_-][^>]*)?\/ *>$/) {
	# Explicit line break, unless a line feed was just emitted.
	if ($lfed == 0) {
	  register_text_set_style($fpwtext, '', \$style_last)
	      && $$fpwtext->add_newline() || return 0;
	  $lfed++;
	}
      } elsif (($text =~ /^<span class="ill".+\/([^\/]+)\.htm/)
	       && ($fpwoald7_conf{'image_type'})
	       && !defined($images{$1})) {
	# figure.
	# Each figure (named after the .htm base name) is emitted once.
	my $title;
	$tmp = $1;
	$images{$tmp} = 1;

	if ($text =~ / title="(.+?)"/) {
	  $title = $1;
	} else {
	  $title ="Figure";
	}

	# image_type 1 selects the JPEG graphic calls, any other true value
	# the colour graphic calls.
	register_text_set_style($fpwtext, '', \$style_last)
	    && ($lfed || $$fpwtext->add_newline())
	    && ($fpwoald7_conf{'image_type'} == 1 ?
		$$fpwtext->add_jpeg_graphic_start($tmp) :
		$$fpwtext->add_color_graphic_start($tmp))
	    && register_text_internal($fpwtext, $title)
	    && ($fpwoald7_conf{'image_type'} == 1 ?
		$$fpwtext->add_jpeg_graphic_end() :
		$$fpwtext->add_color_graphic_end())		 
	    && $$fpwtext->add_newline() || return 0;
	$lfed = 1;
      }
    } elsif ($ignore_level == 0 && $text !~ /^\n*$/) {
      # text.
      # Pieces that are empty or consist only of newlines are dropped.
      register_text_set_indent($fpwtext, $indent_level, \$indent_last)
	  && register_text_set_style
	  ($fpwtext, $style_level ? $style_info[$style_level - 1] : '',
	   \$style_last) || return 0;

      # NOTE(review): when a font table other than 'ipa' or 'symbol' is
      # active, the text is not emitted at all -- confirm no such table
      # name occurs in @tags_table.
      if ($font_level > 0) {
	if ($font_info[$font_level - 1] eq 'ipa') {
	  # ERRATA
	  $text =~ s/\'m∋tJ\(r\)/'m&amp;ni;tJ(r)/;

	  register_text_with_table ($fpwtext, $text, \%ipa_table) || return 0;
	} elsif ($font_info[$font_level - 1] eq 'symbol') {
	  register_text_with_table ($fpwtext, $text, \%symbol_table)
	      || return 0;
	}
      } else {
	register_text_internal ($fpwtext, $text) || return 0;
      }
      $lfed = 0;
    }
  }
  return 1;
}

# Tell the writer about a new indent level, but only when it differs from
# the level sent last.  Levels of 6 and above are all sent as 6.
#
# Arguments: $fpwtext - reference to the writer object
#            $level   - wanted indent level
#            $last    - reference to the level sent last; updated here
# Returns:   1 when nothing had to be sent, otherwise the result of the
#            writer's add_indent_level().
sub register_text_set_indent {
  my ($fpwtext, $level, $last) = @_;

  my $capped = $level >=6 ? 6 : $level;
  return 1 if $capped == $$last;

  $$last = $capped;
  return $$fpwtext->add_indent_level ($$last);
}

# Switch the writer from the style sent last to $style.
#
# The empty string means "no style", 'em' maps to the emphasis calls and
# every other name to the font calls.  Nothing is sent when the style does
# not change.
#
# Arguments: $fpwtext - reference to the writer object
#            $style   - wanted style name
#            $last    - reference to the style sent last; updated only
#                       when the whole switch succeeds
# Returns:   1 on success, 0 when a writer call fails.
sub register_text_set_style {
  my ($fpwtext, $style, $last) = @_;

  return 1 if $$last eq $style;

  # Close whatever is currently open.
  if ($$last ne '') {
    my $closed = ($$last eq 'em')
        ? $$fpwtext->add_emphasis_end()
        : $$fpwtext->add_font_end();
    return 0 unless $closed;
  }

  # Open the requested style.
  if ($style ne '') {
    my $opened = ($style eq 'em')
        ? $$fpwtext->add_emphasis_start($style)
        : $$fpwtext->add_font_start($style);
    return 0 unless $opened;
  }

  $$last = $style;
  return 1;
}

# Emit plain (non-font-table) text.
#
# Entities are decoded first.  The text is then cut into runs of ASCII
# bytes and runs of non-ASCII bytes: ASCII runs go to the writer directly
# (with \x7f bytes removed), non-ASCII runs are emitted character by
# character through %text_table.
#
# A failure while emitting a non-ASCII run is reported as a warning and
# processing continues, so that the failure is visible without aborting
# the conversion.
#
# Arguments: $fpwtext - reference to the writer object
#            $text    - text to emit
# Returns:   1 on success, 0 when the writer rejects an ASCII run.
sub register_text_internal {
  my ($fpwtext, $text) = @_;
  $text = decode_entity($text);

  # The capturing group keeps the ASCII runs in the list; the other
  # elements are the (possibly empty) non-ASCII runs between them.
  foreach my $piece (split(/([\x00-\x7f]+)/, $text)) {
    if ($piece =~ /[\x00-\x7f]/) {
      $piece =~ s/\x7f//g;

      $$fpwtext->add_text($piece) || return 0;
    } elsif (!register_text_with_table($fpwtext, $piece, \%text_table)) {
      print "warning: failed to register text: "
          . $$fpwtext->error_message() . "\n";
    }
  }

  return 1;
}

# Emit text one character at a time through a substitution table.
#
# Entities are decoded first.  Each ASCII byte or multi-byte sequence
# (lead byte \xc0-\xff followed by bytes in \x80-\xbf) is looked up in the
# table; a non-empty value replaces the character.  A result starting with
# "u-" is emitted as a half-width user character, anything else is
# converted with utf2euc() and emitted as text.
#
# Arguments: $fpwtext    - reference to the writer object
#            $text       - text to emit
#            $font_table - hash reference mapping characters to
#                          replacements
# Returns:   1 on success, 0 as soon as a writer call fails.
sub register_text_with_table {
  my ($fpwtext, $text, $font_table) = @_;

  $text = decode_entity($text);

  foreach my $letter ($text =~ /([\x00-\x7f]|[\xc0-\xff][\x80-\xbf]+)/g ){
    # Characters missing from the table are passed through unchanged; the
    # defined() test keeps them from raising "uninitialized" warnings.
    my $mapped = $$font_table{$letter};
    my $glyph = (defined($mapped) && length($mapped) > 0) ? $mapped : $letter;

    if ($glyph =~ /^u-/) {
      $$fpwtext->add_half_user_character($glyph) || return 0;
    } else {
      $$fpwtext->add_text(utf2euc($glyph)) || return 0;
    }
  }
  return 1;
}

# Report a failure to write the body of an entry, then abort.
#
# Prints the program name with the writer's error message, followed by
# the heading and the content that could not be written, and dies.
#
# Arguments: $fpwtext - reference to the writer object
#            $heading - heading of the failing entry
#            $content - content of the failing entry
# Never returns.
sub register_content_error {
  my ($fpwtext, $heading, $content) = @_;

  # Double quotes are required here so that the program name is
  # interpolated rather than printed as the literal text '$PROGRAM_NAME'.
  print "$PROGRAM_NAME:  ".$$fpwtext->error_message()."\n";
  print "Heading: $heading\n";
  print "Content: $content\n";
  die;
}
