#                                                         -*- Perl -*-
# Copyright (c) 2007  Kazuhiro Ito
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#

use strict;
use warnings;
use Getopt::Long;

use English;
use FreePWING::FPWUtils::FPWParser;
use Encode qw/ from_to /;
use FileHandle;

use vars qw(%fpwwikipedia_conf);
require "wikipedia-fpw.conf";

use vars qw (%entity_table);
use vars qw (%utf2euc_table);
use vars qw ($utf2euc_regexp);
require "tables";

use vars qw(%entry_headings);

 # Entry point.  Reads a Wikipedia XML dump page by page and registers every
 # page that is not skipped as text, heading and search-index data through
 # the FreePWING parser handles.
 #
 # Command line: the first argument is the dump file and the last argument
 # is the copyright notice file, so at least two arguments are required.
 # The list of known entries is always read from the file 'entries'.
 MAIN: {
   my $time = time;
   my $page_count = 0;
   my $entry_count = 0;

   my ($fpwtext, $fpwheading, $fpwword2, $fpwcopyright);
   initialize_fpwparser('text' => \$fpwtext,
                        'heading' => \$fpwheading,
                        'word2' => \$fpwword2,
                        'copyright' => \$fpwcopyright);

   get_entry_headings('entries');

   # Report the zero-argument case separately so the message never
   # interpolates an undefined $ARGV[0].
   if (@ARGV < 1) {
     die("$PROGRAM_NAME: No input file is specified.");
   }
   if (@ARGV < 2) {
     die("$PROGRAM_NAME: Only a file ($ARGV[0]) is specified.");
   }
   my $copyright_filename = $ARGV[$#ARGV];
   register_copyright(\$fpwcopyright, $copyright_filename);

   my $wikipedia_filename = $ARGV[0];

   if (not -e $wikipedia_filename) {
     die("$PROGRAM_NAME: '$wikipedia_filename' does not exist.");
   }

   my $xml = FileHandle->new();
   if (!$xml->open("$wikipedia_filename", 'r')) {
     die "$PROGRAM_NAME: Failed to open the file, $ERRNO: $wikipedia_filename\n";
   }

   my ($text, $heading);
   my ($text_position, $heading_position);

   if (verbose_mode()) {
     print "Skipping headers: $fpwwikipedia_conf{'skip_heading'}\n";
   }

   PARSER: for (;;) {
     # Advance to the next line containing <page>; reaching the end of the
     # file here is the normal way out of the loop.
     my $line = '';
     while ($line !~ /<page>/) {
       $line = $xml->getline();
       if (!defined($line)) {
         last PARSER;
       }
     }

     # Accumulate lines up to and including the one containing </page>.
     $text = $line;
     while ($line !~ /<\/page>/) {
       $line = $xml->getline();
       if (!defined($line)) {
         die "$PROGRAM_NAME: Unexpected file end\n";
       }
       $text .= $line;
     }

     # Read the capture only when the match succeeded; after a failed match
     # $1 does not describe this page.
     $heading = ($text =~ /<title>([^<]+)<\/title>/) ? $1 : undef;

     # Skipping entries (for debug)
     $page_count++;
     if ($page_count <= $fpwwikipedia_conf{'skip_count'}) {
       next;
     }

     if (!defined($heading)) {
       warn "$PROGRAM_NAME: Page $page_count has no title, skipped.\n";
       next;
     }

     # An empty pattern would make Perl reuse the last successfully matched
     # regex, so only match when a pattern is actually configured.
     my $skip_heading = $fpwwikipedia_conf{'skip_heading'};
     if (defined($skip_heading) && $skip_heading ne ''
         && $heading =~ /$skip_heading/) {
       if (verbose_mode()) {
         print "Skipping page: $heading.\n";
       }
       next;
     }

     # Skipping redirect entries
     if (is_redirect_page($text)) {
       if (verbose_mode()) {
         print "Skipping redirect: $heading.\n";
       }
       next;
     }

     $text_position = register_content(\$fpwtext, $heading, $text);
     print "Entry: $page_count; $heading\n";
     $heading_position = register_heading(\$fpwheading, $heading);
     register_search_entry(\$fpwword2, $heading, $heading_position, $text_position);

     # Check number of entries (for debug)
     $entry_count++;
     if ($fpwwikipedia_conf{'entry_count'} > 0
         && $entry_count >= $fpwwikipedia_conf{'entry_count'}) {
       last;
     }
   }

   $xml->close();

   finalize_fpwparser('text' => \$fpwtext,
                      'heading' => \$fpwheading,
                      'word2' => \$fpwword2,
                      'copyright' => \$fpwcopyright);

   # The program name is passed as an argument so that a '%' in it cannot
   # be taken as a format directive.
   printf("%s: Elapsed time     : %8dsec.\n", $PROGRAM_NAME, time - $time);
   printf("%s: Number of entries: %8d\n", $PROGRAM_NAME, $entry_count);
}


# Adds one entry to the heading data and returns its position.
#
#   $fpwheading - reference to the heading writer object
#   $heading    - page title as found in the dump; entities are decoded and
#                 the result is converted to EUC-JP before it is written
#
# Dies with the writer's error message when the writer reports a failure.
sub register_heading {
  my ($fpwheading, $heading) = @_;

  my $writer = $$fpwheading;
  my $euc_heading = to_euc(decode_entity($heading), 'utf8');

  unless ($writer->new_entry()) {
    die "$PROGRAM_NAME: " . $writer->error_message() . "\n";
  }
  unless ($writer->add_text($euc_heading)) {
    die "$PROGRAM_NAME: " . $writer->error_message() . "\n";
  }

  return $writer->entry_position();
}

# Writes one page into the text data and returns the entry position.
#
#   $fpwtext - reference to the text writer object
#   $heading - page title as found in the dump
#   $content - raw text of the whole <page> element
#
# The content is run through format_content() and then split into markup
# tokens (<...> tags and &name; entities) and plain text pieces.  <IND>
# raises the indent level, <BR>/<Hn>/<DT>/<DD> tokens emit a newline,
# <R target>...</R> becomes a reference when the target is a known entry,
# and everything else is written as EUC-JP text.
#
# A failure while opening the entry dies directly; a failure while writing
# the body prints the writer's message and then goes through
# register_content_error(), which dumps the page and dies.
sub register_content {
  my ($fpwtext, $heading, $content) = @_;

  my $converted_heading = to_euc(decode_entity($heading), 'utf8');
  my $formatted_content = format_content($content);

  my @texts = split(/(<\/?[^>]+>|&[a-zA-Z0-9]+;)/, $formatted_content);

  ($$fpwtext->new_entry()
   && $$fpwtext->add_entry_tag(unpack('h*', $heading))
   && $$fpwtext->add_keyword_start()
   && $$fpwtext->add_text($converted_heading)
   && $$fpwtext->add_keyword_end()
   && $$fpwtext->add_newline()
   && $$fpwtext->add_indent_level(2))
      || die "$PROGRAM_NAME: " . $$fpwtext->error_message() . "\n";

  # Shared failure path for every write in the loop below.
  my $abort = sub {
    print "$PROGRAM_NAME: " . $$fpwtext->error_message() . "\n";
    register_content_error($fpwtext, $heading, $content, $formatted_content);
  };

  my $indent_level = 2;   # level requested by the markup seen so far
  my $indent_last  = 2;   # level most recently sent to the writer
  my $last_ref = '';      # target of the currently open reference, if any

  foreach my $text (@texts) {
    if ($text =~ /<IND>/) {
      $indent_level++;
      next;
    }

    if ($text =~ /<(BR|\/?(H[2-6]|DT|DD))>/) {
      # In reference, don't output newline.
      if ($last_ref !~ /./) {
        $$fpwtext->add_newline() || $abort->();
        $indent_level = 2;
      }
      next;
    }

    # Levels above 6 are written as 6; the writer is only told about a
    # level when the clamped value actually changes.
    my $wanted_indent = ($indent_level >= 6 ? 6 : $indent_level);
    if ($wanted_indent != $indent_last) {
      $$fpwtext->add_indent_level($wanted_indent) || $abort->();
      $indent_last = $wanted_indent;
    }

    if ($text =~ /<R ([^>]+)>/) {
      my $target = $1;
      if (defined($entry_headings{$target})) {
        $$fpwtext->add_reference_start() || $abort->();
        $last_ref = $target;
      }
    } elsif ($text =~ /<\/R>/) {
      if (defined($entry_headings{$last_ref})) {
        $$fpwtext->add_reference_end(unpack("h*", $last_ref)) || $abort->();
        $last_ref = '';
      }
    } else {
      $$fpwtext->add_text(to_euc($text)) || $abort->();
    }
  }

  return $$fpwtext->entry_position();
}

# Prints the heading, the raw content and the formatted content of the page
# being registered to standard output, then dies without a message of its
# own.  The $fpwtext argument is accepted but not used.
sub register_content_error {
  my ($fpwtext, $heading, $content, $formatted_content) = @_;

  my $report = join('',
                    "Heading: $heading\n",
                    "Content: $content\n",
                    "Formatted_content: $formatted_content\n");
  print $report;

  die;
}


# Registers the search keys of one page in the index.
#
#   $fpwword2 - reference to the index writer object
#   $key      - page title as found in the dump
#   $heading  - position returned by register_heading()
#   $text     - position returned by register_content()
#
# The title itself and, when %entry_headings holds an alias list for it,
# every alias are expanded by register_search_entry_internal() and each
# resulting key is added.  Keys are shortened with trim_key() when the
# 'trim_long_index' option is set.  Dies with the writer's error message
# when a key cannot be added.
sub register_search_entry {
  my ($fpwword2, $key, $heading, $text) = @_;

  my @keys = ($key);

  # %entry_headings stores either 1 or an array reference of aliases; test
  # for the reference explicitly instead of comparing it numerically to 1.
  my $aliases = $entry_headings{$key};
  if (ref($aliases) eq 'ARRAY') {
    push(@keys, @{$aliases});
  }

  my @extended_keys = register_search_entry_internal(@keys);

  foreach my $search_key (@extended_keys) {
    if (verbose_mode()) {
      # Keys are EUC-JP at this point; convert a copy for display only.
      my $tmp = $search_key;
      from_to($tmp, 'euc-jp', 'utf-8');
      print "Entry key: $tmp\n";
    }
    if ($fpwwikipedia_conf{'trim_long_index'}) {
      $search_key = trim_key($search_key);
    }
    $$fpwword2->add_entry($search_key, $heading, $text)
        || die "$PROGRAM_NAME: " . $$fpwword2->error_message() . "\n";
  }
}

# Expands a list of headings into the list of search keys.
#
# Each heading is converted to EUC-JP and always yields itself.  It yields
# two more keys when it applies:
#   - the part before a trailing " (...)" qualifier,
#   - the part after a leading "prefix:" that contains no blank.
# Returns the keys in that order for each heading.
sub register_search_entry_internal {
  my @headings = @_;
  my @keys;

  foreach my $raw_heading (@headings) {
    my $euc_heading = to_euc($raw_heading, 'utf-8');

    push(@keys, $euc_heading);
    push(@keys, $1) if $euc_heading =~ /^(.+)[ \t]+\(.*\)$/;
    push(@keys, $1) if $euc_heading =~ /^[^ \t]+:(.+$)/;
  }

  return @keys;
}


# Converts text to EUC-JP and returns the converted copy.
#
#   $text    - text to convert
#   $in_code - name of the source encoding; 'utf-8' when omitted or false
#
# Before the conversion every match of $utf2euc_regexp is replaced by its
# entry in %utf2euc_table.  After the conversion, three-byte sequences
# introduced by 0x8F and the byte 0x7F are replaced by '?'.
sub to_euc {
  my $text = shift;
  my $in_code = shift || 'utf-8';

  $text =~ s/$utf2euc_regexp/$utf2euc_table{$1}/g;

  from_to($text, $in_code, 'euc-jp');

  # Three-byte sequences starting with 0x8F are not passed on.
  $text =~ s/\x8F[\xA1-\xFE][\xA1-\xFE]/?/g;

  # Workaround
  $text =~ s/\x7f/?/g;

  return $text;
}

# Turns the raw text of a <page> element into the intermediate markup that
# register_content() consumes and returns it.
#
# Only the body of the <text ...> element is kept.  Comments, tables,
# galleries, remaining XHTML tags, templates, inter-language links and
# media links are removed.  Headings become <H2>..<H6>, definition lists
# become <DT>/<DD>, indentation and list markers become <IND>, paragraph
# breaks become <BR>, and article links become <R target>label</R> when the
# 'enable_reference' option is set (plain label text otherwise).  Entities
# are decoded once before and once after the formatting.
#
# The steps below depend on their order: each one works on the output of
# the previous one.
sub format_content {
  my $text = $_[0];
  my @texts;
  my $level;

  # Remove header and footer
  $text =~ s/.*<text [^>]+>[ \t\n]*//s;
  $text =~ s/[ \t\n]*<\/text>.*//s;

  # Decode entities
  $text = decode_entity($text);

  # Remove HTML comments.  Everything between "<!--" and the next "-->" is
  # dropped; the split avoids a non-greedy regex over the whole text.
  if ($text =~ /<!--/) {
    @texts = split(/(<!--|-->)/, $text);
    $text = '';
    $level = 0;
    foreach my $piece (@texts) {
      if ($piece =~ /<!--/) {
        $level = 1;
      } elsif ($piece =~ /-->/) {
        $level = 0;
      } elsif ($level == 0) {
        $text .= $piece;
      }
    }
  }

  # Remove tables, including nested ones.  $level counts the open "{|".
  # A "|}" seen while no table is open is not a table end (it also occurs
  # in templates such as "{{name|}}"), so it is kept as text and the
  # counter never drops below zero; otherwise all following text would be
  # treated as being inside or outside a table the wrong way round.
  if ($text =~ /\{\|/) {
    @texts = split(/(\{\||\|\})/, $text);
    $text = '';
    $level = 0;
    foreach my $piece (@texts) {
      if ($piece =~ /\{\|/) {
        $level++;
      } elsif ($piece =~ /\|\}/) {
        if ($level > 0) {
          $level--;
        } else {
          $text .= $piece;
        }
      } elsif ($level == 0) {
        $text .= $piece;
      }
    }
  }

  # Remove gallery
  $text =~ s/<gallery>.*?<\/gallery>//sg;

  # Format footnote
  $text =~ s/<ref>(.*?)<\/ref>/ \($1\) /sg;

  # Remove xhtml tags
  $text =~ s/<\/?[a-zA-Z]+( [^<>]*)?>//sg;

  # Format supported templates.
  $text =~ s/\{\{[lL]ang\|[a-z-]+\|([^\}]+)\}\}/$1/g;

  # Remove templates.  Only innermost templates match, so repeat until
  # nothing is left to remove.
  while ($text =~ s/\{{3}[^\{\}]*\}{3}//sg) {}
  while ($text =~ s/\{\{[^\{\}]*\}\}//sg) {}

  # Remove links to other languages.
  $text =~ s/\[\[[a-z-]+:[^\[\]]+\]\]//g;

  # Format links to articles in other languages
  $text =~ s/\[\[:[a-z-]+:([^\[\]\|]+)\]\]/$1/g;
  $text =~ s/\[\[:[a-z-]+:[^\[\]\|]+\|([^\[\]\|]+)\]\]/$1/g;

  # Remove links to media data.
  # (A disabled variant of this step left a "(media data: caption)"
  # placeholder in place of the link instead of removing it.)
  $text =~ s/\n*\[\[(?:[Mm]edia|[Ii]mage|画像):[^\[\]]+\|(([^\|\[\]]|\[\[[^\|\[\]]+\]\])+)\]\]//sg;

  # Format subheadings.
  $text =~ s/\n={6} *([^=\n]+?) *={6}\n/\n<H6>$1<\/H6>\n/sg;
  $text =~ s/\n={5} *([^=\n]+?) *={5}\n/\n<H5>・$1<\/H5>\n/sg;
  $text =~ s/\n={4,} *([^=\n]+?) *={4,}\n/\n<H4>○$1<\/H4>\n/sg;
  $text =~ s/\n=== *([^=\n]+?) *===\n/\n<H3>□$1<\/H3>\n/sg;
  $text =~ s/\n== *([^=\n]+?) *==\n/\n<H2>■$1<\/H2>\n/sg;

  # Format definitions
  $text =~ s/\s*\n;([^:\n]+)(?: +|\n):([^\n]+)\s*\n/\n<DT>$1<\/DT>\n<DD>$2<\/DD>\n/sg;
  $text =~ s/\s*\n;([^:\n]+)\s*\n/\n<DT>$1<\/DT>\n/sg;

  # Format indents: each pass turns one more leading ':' into <IND>.
  while ($text =~ s/(\n:*):/$1<IND>/sg) {}
  # Format itemize: every list marker gets an <IND> in front of it, then
  # the <IND>s are moved in front of the run of markers.
  while ($text =~ s/(\n[*\#]*)([*\#])/$1<IND>$2/sg) {}
  while ($text =~ s/(\n(<IND>)+)([*\#]+)<IND>/$1<IND>$3/sg) {}

  $text =~ s/(?: *\n)+(<IND>[^\n]*)\n/\n\n$1<BR>\n/sg;

  # Format links to other articles
  if ($fpwwikipedia_conf{'enable_reference'}) {
    $text =~ s/\[\[([^\[\]\|\#\n]+?)(?:\#[^\[\]\|\#\n]+)?\]\]/<R $1>$1<\/R>/sg;
    $text =~ s/\[\[([^\[\]\|\#\n]+?)(?:\#[^\[\]\|\#\n]+)?\|(([^\[\]\|\n]|\[[^\[\]]|\][^\]])+)\]\]/<R $1>$2<\/R>/sg;
  } else {
    $text =~ s/\[\[([^\[\]\|\#\n]+?)(?:\#[^\[\]\|\#\n]+)?\]\]/$1/sg;
    $text =~ s/\[\[([^\[\]\|\#\n]+?)(?:\#[^\[\]\|\#\n]+)?\|(([^\[\]\|\n]|\[[^\[\]]|\][^\]])+)\]\]/$2/sg;
  }

  # Format emphasis: the quote markers are simply removed.
  $text =~ s/'{2,}//g;

  # Format LFs
  $text =~ s/(?: *\n){2,}/<BR>\n/sg;
  $text =~ s/(?:<BR>\n)*(<(H|DT))/<BR>\n$1/sg;
  $text =~ s/(<\/(H[2-6]|DT)>)\n?(?:<BR>\n?)+/$1\n/sg;

  $text =~ s/(?:<BR>|[ \n])+$/<BR>/s;
  $text =~ s/^(?:<BR>|[ \n])+//s;

  return decode_entity($text);
}

# Replaces named entities (&name;) by their value from %entity_table and
# returns the result.  Only names made of ASCII letters and digits are
# considered, so numeric references such as &#123; are not touched.
#
# An entity without a defined table entry is left unchanged; substituting
# the missing value would silently delete it from the text and raise an
# "uninitialized value" warning.
sub decode_entity {
  my $text = $_[0];

  $text =~ s/&([0-9a-zA-Z]+);/defined($entity_table{$1}) ? $entity_table{$1} : "&$1;"/sge;

  return $text;
}

# Copies the copyright notice file into the copyright data, one text entry
# followed by a newline per input line.
#
#   $fpwcopyright - reference to the copyright writer object
#   $filename     - path of the notice file
#
# Dies when the file cannot be opened or when the writer reports a failure.
sub register_copyright {
  my ($fpwcopyright, $filename) = @_;
  my $handle = FileHandle->new();

  if (!$handle->open($filename, 'r')) {
    # Same wording, including the system error, as the other open failures
    # in this program.
    die "$PROGRAM_NAME: Failed to open the file, $ERRNO: $filename\n";
  }

  if (verbose_mode()) {
    print "Copyright notice: $filename.\n";
  }

  # A lexical line variable keeps the caller's $_ untouched.
  while (defined(my $line = $handle->getline())) {
    ($$fpwcopyright->add_text($line)
     && $$fpwcopyright->add_newline())
        || die "$PROGRAM_NAME: " . $$fpwcopyright->error_message() . "\n";
  }

  $handle->close();
}

# Shortens a key to at most 127 units, where a unit is either two bytes in
# the range 0x80-0xFF or one byte in the range 0x20-0x7F, and returns it.
# The key is returned unchanged when it does not start with such a unit.
sub trim_key {
  my ($key) = @_;

  my $unit = qr/[\x80-\xff][\x80-\xff]|[\x20-\x7f]/;
  $key =~ s/^((?:$unit){1,127}).*$/$1/;

  return $key;
}

# Loads the entry list into the global %entry_headings.
#
#   $filename - path of the entry list; one entry per line, the heading
#               first, optionally followed by tab-separated aliases
#
# A line with only a heading stores 1 for that heading; a line containing a
# tab stores a reference to the array of its non-empty aliases.  Blank
# lines and lines whose heading field is empty are ignored.  Dies when the
# file does not exist or cannot be opened.
sub get_entry_headings {
  my $filename = $_[0];

  if (not -e $filename) {
    die("$PROGRAM_NAME: '$filename' does not exist.");
  }

  my $entry_file = FileHandle->new();
  if (!$entry_file->open("$filename", 'r')) {
    die "$PROGRAM_NAME: Failed to open the file, $ERRNO: $filename\n";
  }

  # A lexical line variable keeps the caller's $_ untouched.
  while (defined(my $line = $entry_file->getline())) {
    # Strip the line end first so that a last line without a newline is
    # handled like any other line.
    $line =~ s/\n\z//;
    next if $line eq '';

    if ($line =~ /\t/) {
      my ($heading, @aliases) = split(/\t/, $line);
      # Never use a capture from an earlier line as the key.
      next if !defined($heading) || $heading eq '';
      $entry_headings{$heading} = [grep { $_ ne '' } @aliases];
    } else {
      $entry_headings{$line} = 1;
    }
  }

  $entry_file->close();
}
