#                                                         -*- Perl -*-
# Copyright (c) 2007, 2008  Kazuhiro Ito
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#

use strict;
use warnings;
use Getopt::Long;

use English;
use FreePWING::FPWUtils::FPWParser;
use Encode qw/ from_to /;
use FileHandle;

use vars qw(%fpwwikipedia_conf);
require "wikipedia-fpw.conf";

use vars qw (%entity_table);
use vars qw (%utf2euc_table);
use vars qw ($utf2euc_regexp);
require "tables";

use vars qw(%entry_headings);

 MAIN: {
   # Entry point.  Reads a Wikipedia XML dump page by page and, for every
   # page whose title is a key of %entry_headings, registers a heading, a
   # text entry and its search keys.
   #
   # Command line (as read below):
   #   $ARGV[0]      - the XML dump to convert
   #   $ARGV[$#ARGV] - the copyright notice file
   my $time = time;
   my $page_count = 0;
   my $entry_count = 0;

   my ($fpwtext, $fpwheading, $fpwword2, $fpwcopyright);
   initialize_fpwparser('text' => \$fpwtext,
                        'heading' => \$fpwheading,
                        'word2' => \$fpwword2,
                        'copyright' => \$fpwcopyright);

   get_entry_headings('entries');

   # Report "no arguments" separately: interpolating an undefined $ARGV[0]
   # into the one-argument message would only produce a warning and an
   # empty pair of parentheses.
   if (@ARGV == 0) {
     die("$PROGRAM_NAME: No input file is specified.");
   }
   if ($#ARGV < 1) {
     die("$PROGRAM_NAME: Only a file ($ARGV[0]) is specified.");
   }
   my $copyright_filename = $ARGV[$#ARGV];
   register_copyright(\$fpwcopyright, $copyright_filename);

   my $wikipedia_filename = $ARGV[0];

   if (not -e $wikipedia_filename) {
     die("$PROGRAM_NAME: '$wikipedia_filename' does not exist.");
   }

   my $xml = FileHandle->new();
   if (!$xml->open("$wikipedia_filename", 'r')) {
     die "$PROGRAM_NAME: Failed to open the file, $ERRNO: $wikipedia_filename\n";
   }

   my ($text, $heading);
   my ($text_position, $heading_position);

   if (verbose_mode()) {
     print "Skipping headers: $fpwwikipedia_conf{'skip_heading'}\n";
     print "Skipping contents: $fpwwikipedia_conf{'skip_content'}\n";
     print "Selecting headers: $fpwwikipedia_conf{'select_heading'}\n";
     print "Selecting contents: $fpwwikipedia_conf{'select_content'}\n";
   }

   PARSER: for (;;) {
     # Skip forward to the line that opens the next page; a clean end of
     # file here ends the conversion.
     $_ = '';
     while (!(/<page>/)) {
       $_ = $xml->getline();
       if (!defined($_)) {
         last PARSER;
       }
     }

     $text = $_;

     # Collect every line up to and including the one that closes the page.
     while (!(/<\/page>/)) {
       $_ = $xml->getline();
       if (!defined($_)) {
         die "$PROGRAM_NAME: Unexpected file end\n";
       }
       $text .= $_;
     }

     # Skipping entries (for debug)
     $page_count++;
     if ($page_count <= $fpwwikipedia_conf{'skip_count'}) {
       next PARSER;
     }

     # Only trust $1 when the match succeeded; otherwise it would still
     # hold whatever an earlier successful match left behind.
     if ($text =~ /<title>([^<]+)<\/title>/) {
       $heading = $1;
     } else {
       print "$PROGRAM_NAME: warning: page $page_count has no title; skipped.\n";
       next PARSER;
     }

     if (!defined($entry_headings{$heading})) {
       if (verbose_mode()) {
         print "Skipping page: $heading.\n";
       }
       next PARSER;
     }

     # Workarounds for markup errors in individual articles.  The disabled
     # branches below are kept as a record of workarounds that are no
     # longer needed.
     if ($heading =~ /^Ruby$/) {
       $text =~ s/\{\|([^\|]*\|[^\}]+\})/<nowiki>{|<\/nowiki>$1/g;
#     # Fixed at 2007/10/01
#     } elsif ($heading =~ /^ネイチャーライティング$/) {
#       $text =~ s/\|-\}/\|-\|\}/g;
#     } elsif ($heading =~ /^大原めぐみ$/) {
#       $text =~ s/ドラえもん\[\[Wii\]\]/ドラえもんWii/;
#     } elsif ($heading =~ /^岩田正太$/) {
#       $text =~ s/JaSRAサッカークラ\*2007年- \[\[ザスパ草津\]\]\nブ\]\]/JaSRAサッカークラブ]]\n/;
#      # Fixed at 2007/08/22
#      } elsif ($heading =~ /^予算$/) {
#        $text =~ s/\[\[s:地方自治法 第二編 第九章 財務#211\|地方自治法第211条）/[[s:地方自治法 第二編 第九章 財務#211|地方自治法第211条]]）/;
#      # Fixed at 2007/08/21
#      } elsif ($heading =~ /^加茂田重政$/) {
#        $text =~ s/\[\[加茂田組\|加茂田会\}\}/[[加茂田組|加茂田会]]/;
#      } elsif ($heading =~ /^石川県の廃止市町村一覧$/) {
#        $text =~ s/\[\[三谷村 \(石川県河北郡\)\|新設の為/[[三谷村 (石川県河北郡)|三谷村]]新設の為/;
#      } elsif ($heading =~ /^ドメインハック$/) {
#        $text =~ s/\[\[バーナー\|ウェブバーナー\]の/[[ウェブバーナー|バーナー]]の/;
#      } elsif ($heading =~ /^チャールストン \(サウスカロライナ州\)$/) {
#        $text =~ s/\[\[:en:University of Illinois Press\|U. of Illinois Press, /[[:en:University of Illinois Press|U. of Illinois Press]], /;
#        $text =~ s/\[\[:en:University Press of Mississippi\|U. Press of Mississippi, /[[:en:University Press of Mississippi|U. Press of Mississippi]], /;
#      # Fixed at 2007/07/09
#      } elsif ($heading =~ /^単位一覧$/) {
#        $text =~ s/(<sup>229376<\/sup>\n\|)-/$1\}/g;
     }

     print "Entry: $page_count; $heading\n";
     $heading_position = register_heading(\$fpwheading, $heading);
     $text_position = register_content(\$fpwtext, $heading, $text);
     register_search_entry(\$fpwword2, $heading, $heading_position, $text_position);

     # Check number of entries (for debug)
     $entry_count++;
     if ($fpwwikipedia_conf{'entry_count'} > 0
         && $entry_count >= $fpwwikipedia_conf{'entry_count'}) {
       last PARSER;
     }
   }

   # The dump is no longer needed once the last page has been read.
   $xml->close();

   finalize_fpwparser('text' => \$fpwtext,
                      'heading' => \$fpwheading,
                      'word2' => \$fpwword2,
                      'copyright' => \$fpwcopyright);

   # The program name is passed as data so that a '%' in it cannot be taken
   # for a format directive.
   printf("%s: Elapsed time     : %8dsec.\n", $PROGRAM_NAME, time - $time);
   printf("%s: Number of entries: %8d\n", $PROGRAM_NAME, $entry_count);
}


# Copy the copyright notice file into the copyright stream, line by line.
#
# Arguments:
#   $fpwcopyright - reference to the copyright object; add_text(),
#                   add_newline() and error_message() are called on it
#   $filename     - path of the notice file
#
# Dies when the file cannot be opened or when the object reports a failure.
sub register_copyright {
  my ($fpwcopyright, $filename) = @_;
  my $handle = FileHandle->new();

  if (!$handle->open($filename, 'r')) {
    # Same wording as the other open failures in this file, including the
    # system error text.
    die "$PROGRAM_NAME: Failed to open the file, $ERRNO: $filename\n";
  }

  if (verbose_mode()) {
    print "Copyright notice: $filename.\n";
  }

  # A lexical line variable keeps the caller's $_ untouched.  Each line is
  # handed over exactly as read, followed by an explicit newline request.
  while (defined(my $line = $handle->getline())) {
    ($$fpwcopyright->add_text($line)
     && $$fpwcopyright->add_newline())
        || die "$PROGRAM_NAME: " . $$fpwcopyright->error_message() . "\n";
  }

  $handle->close();
  return;
}

# Start a new heading entry and store the converted title in it.
#
# Arguments:
#   $fpwheading - reference to the heading object
#   $heading    - page title; entities are decoded and the result is passed
#                 through utf2euc() before it is stored
#
# Returns the value of entry_position() for the new entry.  Dies with the
# object's error message when new_entry() or add_text() fails.
sub register_heading {
  my ($fpwheading, $heading) = @_;

  my $converted = utf2euc(decode_entity($heading));
  my $writer = $$fpwheading;

  if (!$writer->new_entry()) {
    die "$PROGRAM_NAME: " . $writer->error_message() . "\n";
  }
  if (!$writer->add_text($converted)) {
    die "$PROGRAM_NAME: " . $writer->error_message() . "\n";
  }

  return $writer->entry_position();
}

# Shorten a search key to at most 127 leading units, where a unit is either
# a pair of bytes in the range 0x80-0xff or a single byte in 0x20-0x7f.
# A string that the pattern does not match is returned unchanged.
sub trim_key {
  my ($key) = @_;

  # $1 is the kept prefix; whatever follows it on the line is dropped.
  $key =~ s/(^([\x80-\xff][\x80-\xff]|[\x20-\x7f]){1,127}).*$/$1/o;

  return $key;
}

# Add the search keys of one page to the word index.
#
# Arguments:
#   $fpwword2 - reference to the index object
#   $key      - page title, used to look up %entry_headings
#   $heading  - heading position handed to add_entry()
#   $text     - text position handed to add_entry()
#
# The title, plus any extra keys stored for it in %entry_headings, is
# expanded by register_search_entry_internal(); every resulting key is
# registered.  Dies with the object's error message when add_entry() fails.
sub register_search_entry {
  my ($fpwword2, $key, $heading, $text) = @_;

  my @candidates = ($key);

  # A value other than 1 is a reference to the list of extra keys.
  my $extra = $entry_headings{$key};
  if (defined($extra) && $extra != 1) {
    push(@candidates, @{$extra});
  }

  my @index_keys = register_search_entry_internal(@candidates);

  for my $index_key (@index_keys) {
    if (verbose_mode()) {
      # The key is shown converted back to UTF-8, before any trimming.
      my $shown = $index_key;
      from_to($shown, 'euc-jp', 'utf-8');
      print "Entry key: $shown\n";
    }

    my $stored = $index_key;
    if ($fpwwikipedia_conf{'trim_long_index'}) {
      $stored = trim_key($stored);
    }

    $$fpwword2->add_entry($stored, $heading, $text)
        || die "$PROGRAM_NAME: " . $$fpwword2->error_message() . "\n";
  }
}

# Expand a list of titles into the list of keys to index.
#
# Each title is passed through utf2euc() and always yields itself as a key.
# Two variants are added when they apply:
#   "name (qualifier)" also yields "name";
#   "prefix:rest"      also yields "rest" (prefix without space or tab).
#
# Returns the keys in the order they were generated.
sub register_search_entry_internal {
  my @keys;

  for my $title (@_) {
    my $converted = utf2euc($title);

    push(@keys, $converted);
    push(@keys, $1) if $converted =~ /^(.+)[ \t]+\(.*\)$/;
    push(@keys, $1) if $converted =~ /^[^ \t]+:(.+$)/;
  }

  return @keys;
}


# Replace character references in a string.
#
# Named references (&name;) are looked up in %entity_table; an unknown name
# becomes '?'.  Numeric references (&#NNN; and &#xHHH;) are expanded by
# decode_utf8().  All other text is copied through unchanged.
sub decode_entity {
  my ($source) = @_;

  # The capturing group makes split() return the references themselves
  # between the pieces of ordinary text.
  my @pieces = split (/(&[0-9a-zA-Z]+;|&\#[0-9]+;|&\#x[0-9a-fA-F]+;)/, $source);

  my $decoded = '';
  for my $piece (@pieces) {
    if ($piece =~ /&([0-9a-zA-Z]+);/) {
      my $name = $1;
      $decoded .= defined($entity_table{$name}) ? $entity_table{$name} : '?';
      next;
    }

    if ($piece =~ /&\#([0-9]+|x[0-9a-fA-F]+);/) {
      $decoded .= decode_utf8($1);
      next;
    }

    $decoded .= $piece;
  }

  return $decoded;
}


# Turn the body of a numeric character reference into a byte sequence in
# UTF-8 layout.
#
# The argument is either a decimal number or 'x' followed by hexadecimal
# digits.  The length of the result is chosen by the first bit mask below
# that matches; a value that matches none is returned as a single chr().
sub decode_utf8 {
  my ($entity) = @_;

  if ($entity =~ /^x(.*)/) {
    $entity = hex ($1);
  }

  # One row per multi-byte form, longest first:
  #   [ selecting mask, lead byte marker, lead byte payload mask,
  #     number of continuation bytes ]
  my @forms = (
    [0x7c000000, 0xfc, 0x01, 5],
    [0x03e00000, 0xf8, 0x03, 4],
    [0x001f0000, 0xf0, 0x07, 3],
    [0x0000f800, 0xe0, 0x0f, 2],
    [0x00000780, 0xc0, 0x1f, 1],
  );

  for my $form (@forms) {
    my ($selector, $lead, $lead_mask, $tail) = @{$form};
    next unless $entity & $selector;

    # Lead byte first, then six payload bits per continuation byte.
    my $bytes = chr($lead | (($entity >> (6 * $tail)) & $lead_mask));
    for (my $shift = 6 * ($tail - 1); $shift >= 0; $shift -= 6) {
      $bytes .= chr(0x80 | (($entity >> $shift) & 0x3f));
    }
    return $bytes;
  }

  return chr($entity);
}

# Convert a UTF-8 string to EUC-JP.
#
# Steps, in this order:
#   1. sequences matched by $utf2euc_regexp are replaced from %utf2euc_table;
#   2. the string is converted in place with from_to();
#   3. every three-byte sequence introduced by 0x8F is replaced by '?';
#   4. every 0x7F byte is replaced by '?' (workaround).
#
# Returns the converted copy; the argument is not modified.
sub utf2euc {
  my $converted = $_[0];

  $converted =~ s/$utf2euc_regexp/$utf2euc_table{$1}/g;

  from_to($converted, 'utf-8', 'euc-jp');

  $converted =~ s/\x8F[\xA1-\xFE][\xA1-\xFE]/?/g;

  # Workaround
  $converted =~ tr/\x7f/?/;

  return $converted;
}


# Register the body of one page in the text stream.
#
# Arguments:
#   $fpwtext - reference to the text object
#   $heading - page title as found in the dump
#   $content - raw text of the page
#
# Returns the value of entry_position() for the new entry.
#
# The page is first turned into pseudo markup by format_content() and then
# written piece by piece:
#   <IND>                    raises the pending indent level (at most 6)
#   <BR>, <H2>..<H6>, <DT>,  emit a newline and reset the pending indent to
#   <DD> (opening/closing)   2, unless a reference is currently open
#   <R target> ... </R>      wrap the enclosed text in a reference when
#                            target is a key of %entry_headings
#   text                     is converted with utf2euc() and added
# Pieces made of newlines only are dropped, and so is any other tag not
# handled above -- it falls through to the text branch only if it does not
# match one of the tag patterns, so unknown tags are written as text.
#
# Failures while writing the entry header die directly; later failures are
# reported through register_content_error(), which also dies.
sub register_content {
  my ($fpwtext, $heading, $content) = @_;
  my $converted_heading = utf2euc(decode_entity($heading));

  ($$fpwtext->new_entry()
   && $$fpwtext->add_entry_tag(unpack('h*', $heading))
   && $$fpwtext->add_keyword_start()
   && $$fpwtext->add_text($converted_heading)
   && $$fpwtext->add_keyword_end()
   && $$fpwtext->add_newline()
   && $$fpwtext->add_indent_level(2))
      || die "$PROGRAM_NAME: " . $$fpwtext->error_message() . "\n";

  my $formatted_content = decode_entity(format_content($heading, $content));

  # The capturing group keeps the tags in the list, between the text runs.
  my @texts = split(/(<\/?[^<>]+>)/, $formatted_content);

  my $indent_level = 2;     # indent wanted for the next text run
  my $indent_last  = 2;     # indent most recently sent to the text object
  my $last_ref = '';        # target of the open reference, '' when none
  my %reference_hash = ();  # targets already referenced from this page

  foreach my $text (@texts) {
    if  ($text =~ /^<IND>/) {
      if ($indent_level < 6) {
        $indent_level++;
      }
    } elsif ($text =~ /^<(BR|\/?(H[2-6]|DT|DD))>/) {
      # In reference, don't output newline.
      if (!length($last_ref)) {
        $$fpwtext->add_newline()
            || register_content_error ($fpwtext, $heading, $content, $formatted_content);
        $indent_level = 2;
      }
    } elsif ($text =~ /^<R (.+)>$/) {
      # Keep the target in a lexical: the capture variable should not be
      # relied upon across the method call below.
      my $target = $1;

      # With enable_reference == 1 every link to a known entry becomes a
      # reference; otherwise only the first link to each target does.
      if (defined($entry_headings{$target})
          && ($fpwwikipedia_conf{'enable_reference'} == 1
              || !defined($reference_hash{$target}))) {
        $$fpwtext->add_reference_start()
            || register_content_error ($fpwtext, $heading, $content, $formatted_content);
        $last_ref = $target;
        $reference_hash{$target} = 1;
      }
    } elsif ($text =~ /^<\/R>/) {
      # Close only what was actually opened above.
      if (length($last_ref)) {
        $$fpwtext->add_reference_end(unpack("h*", $last_ref))
            || register_content_error ($fpwtext, $heading, $content, $formatted_content);
        $last_ref = '';
      }
    } elsif ($text =~ /^\n+$/) {
      # Runs of bare newlines carry no content.
    } else {
      # Send a changed indent level only when there is text to write.
      if ($indent_level != $indent_last) {
        $indent_last = $indent_level;
        $$fpwtext->add_indent_level ($indent_last)
            || register_content_error ($fpwtext, $heading, $content, $formatted_content);
      }
      $$fpwtext->add_text(utf2euc($text))
          || register_content_error ($fpwtext, $heading, $content, $formatted_content);
    }
  }

  # register_content_error() dereferences its first argument itself, so it
  # must receive the reference, together with the formatted text.
  $$fpwtext->add_newline()
      || register_content_error ($fpwtext, $heading, $content, $formatted_content);

  return $$fpwtext->entry_position();
}

# Turn the raw XML of one page into a string of text and pseudo tags.
#
# Arguments:
#   $heading - page title (not used inside this function)
#   $text    - raw page XML; everything up to the opening <text ...> tag and
#              everything from </text> on is discarded
#
# Returns the formatted string.  Wiki markup is either removed or replaced
# by the pseudo tags <H2>..<H6>, <DT>, <DD>, <IND>, <BR> and, when
# $fpwwikipedia_conf{'enable_reference'} is true, <R target>label</R>.
#
# The substitutions below are applied in sequence and later ones see the
# output of earlier ones, so their order matters.
sub format_content {
  my ($heading, $text) =@_;
  my @texts;
  my $level;

  # Text taken out of <nowiki> sections, put back at the very end.
  my @f_texts;
  my $f_texts_count = 0;


  # Remove header and footer
  $text =~ s/.*<text [^>]+>\s*//s;
  $text =~ s/\s<\/text>.*//s;

  # Decode entity
  $text = decode_entity($text);

  # Remove html comment: $level is 1 while inside <!-- ... -->, and only
  # pieces seen at level 0 are kept.
  if ($text =~ /<!--/) {
    @texts = split(/(<!--|-->)/, $text);
    $text ='';
    $level = 0;
    foreach $_ (@texts) {
      if ($_ =~ /<!--/) {
	$level = 1;
      } elsif ($_ =~ /-->/) {
	$level = 0;
      } elsif ($level == 0) {
	$text .= $_;
      }
    }
  }

  # Backup nowiki text: each section is replaced by a numbered
  # <F_TEXTS>n</F_TEXTS> placeholder so that the steps below leave it alone.
  # NOTE(review): the match below has no /s modifier, so a section that
  # contains a newline is not backed up and is copied through with its
  # tags instead -- confirm whether that is intended.
  if ($text =~ /<nowiki>/) {
    @texts = split(/(<nowiki>.*?<\/nowiki>)/s, $text);
    $text ='';
    foreach $_ (@texts) {
      if ($_ =~ /^<nowiki>(.*)<\/nowiki>$/) {
	$f_texts[$f_texts_count] = $1;
	$f_texts[$f_texts_count] =~ s/\n/<BR>\n/g;
	$text .= "<F_TEXTS>$f_texts_count</F_TEXTS>";
	$f_texts_count++;
      } else {
	$text .= $_;
      }
    }
  }

  # Format <pre> tags
  if ($text =~ /<pre>/) {
    $text = format_content_pre($text);
  }
  
  # Remove gallery
  $text =~ s/<gallery>.*?<\/gallery>//sg;

  # Remove math
  $text =~ s/<math>.*?<\/math>//sg;

  # Format supported templates.
  # {{lang|xx|text}} is reduced to its text.  The two templates after it
  # are replaced by a table opener "{|"; presumably what follows them is
  # then dropped by the table removal below -- TODO confirm.
  $text =~ s/\{\{[lL]ang\|[a-z-]+\|([^\}]+)\}\}/$1/g;
  $text =~s/\{\{:利用者:Bcjp\/t\/fbp国内表_top\|[^\}]*\}\}/\{\|/g;
  $text =~ s/\{\{サッカー代表個人成績\|[^\}]+\}\}/\{\|/g;

  # Remove templates.
  # The patterns only match a template that contains no braces, so each
  # pass removes the innermost ones; the loops repeat until nothing changes.
  while ($text =~ s/\{{3}[^\{\}]*\}{3}//sg) {}
  while ($text =~ s/\{\{[^\{\}]*\}\}//sg) {}

  # Remove tables
  if ($text =~ /\{\|/) {
    $text = format_content_table($text);
  }

  if ($text =~ /<table/) {
    $text = format_content_table_html($text);
  }

  # Format footnote: the footnote text is kept inline, in parentheses.
  $text =~ s/<ref>(.*?)<\/ref>/ \($1\) /sg;

  # Remove xhtml tags
  $text =~ s/<\/?[a-zA-Z]+( [^<>]*|\/)?>//g;

  # Remove links to other languages.
  $text =~ s/\[\[[a-z-]+:[^\[\]]+\]\]//g;

  # Format links to articles in other languages: keep the title, or the
  # label when one is given.

  $text =~ s/\[\[:[a-z-]+:([^\[\]\|]+)\]\]/$1/g;
  $text =~ s/\[\[:[a-z-]+:[^\[\]\|]+\|([^\[\]\|]+)\]\]/$1/g;

  # Remove links to media data.
  # $text =~ s/\n*\[\[(?:[Mm]edia|[Ii]mage|画像):[^\[\]]+\|(([^\|\[\]]|\[\[[^\|\[\]]+\]\])+)\]\]/\n\n(メディアデータ: $1)\n\n/sg;
  $text =~ s/\n*\[\[(?:[Mm]edia|[Ii]mage|画像):([^\[\]]|\[\[[^\[\]]+\]\])+\]\]//g;

  # Format subheadings.
  # Levels 6 and 5 are handled first; the third pattern then takes any
  # remaining line with four or more "=" on each side.
  $text =~ s/^={6} *([^=]+?) *={6}$/<H6>$1<\/H6>/mg;
  $text =~ s/^={5} *([^=]+?) *={5}$/<H5>・$1<\/H5>/mg;
  $text =~ s/^={4,} *([^=]+?) *={4,}$/<H4>○$1<\/H4>/mg;
  $text =~ s/^=== *([^=]+?) *===$/<H3>□$1<\/H3>/mg;
  $text =~ s/^== *([^=]+?) *==$/<H2>■$1<\/H2>/mg;
  
  # Format pre-formatted text: a leading space becomes one <IND>.
  $text =~ s/^ (.*)/<IND>$1/mg;

  # Format definitions
#  $text =~ s/\n;([^:\n]+)(?: +|\n):([^\n]+)\s*\n/\n<DT>$1<\/DT>\n<DD>$2<\/DD>\n/sg;
#  $text =~ s/\n;([^:\n]+)\s*\n/\n<DT>$1<\/DT>\n/sg;

  # ";term :definition" and ";[[term]]:definition" give a <DT>/<DD> pair;
  # any other line starting with ";" gives a <DT> only.
  $text =~ s/^;(.+?) :(.*)/<DT>$1<\/DT>\n<DD>$2<\/DD>/mg;
  $text =~ s/^;(.+?\]\]):(.*)/<DT>$1<\/DT>\n<DD>$2<\/DD>/mg;
  $text =~ s/^;(.*)/<DT>$1<\/DT>/mg;

  # Format indents: every leading ":" becomes one <IND>, one per pass.
  while ($text =~ s/^(:*):/$1<IND>/mg) {}
  # Format itemize: one <IND> is put in front of the line for every leading
  # "*" or "#"; the markers themselves stay, after the <IND> tags.
  while ($text =~ s/^([*\#]*)([*\#])((<IND>)*)/$1<IND>$3$2/mg) {}
#   while ($text =~ s/^((<IND>)+)([*\#]+)<IND>/$1<IND>$3/mg) {}
  # Every indented line ends with an explicit line break.
  $text =~ s/^(<IND>.*)$/$1<BR>/mg;
 
  # Remove trailing garbage.
  $text =~ s/^\[\[Category:.+\]\]$//gm;

  # Format links to other articles
  # A "#section" part of the target is dropped.  With references enabled
  # the link becomes <R target>label</R>; otherwise only the label (or the
  # target, for a link without label) is kept.
  if ($fpwwikipedia_conf{'enable_reference'}) {
    $text =~ s/\[\[([^\[\]\|\#]+?)(?:\#[^\[\]\|\#]+)?\|(([^\[\]\|]|\[[^\[\]]|\][^\]])+)\]\]/<R $1>$2<\/R>/g;
    $text =~ s/\[\[([^\[\]\|\#]+?)(?:\#[^\[\]\|\#]+)?\]\]/<R $1>$1<\/R>/g;
  } else {
    $text =~ s/\[\[([^\[\]\|\#]+?)(?:\#[^\[\]\|\#]+)?\|(([^\[\]\|]|\[[^\[\]]|\][^\]])+)\]\]/$2/g;
    $text =~ s/\[\[([^\[\]\|\#]+?)(?:\#[^\[\]\|\#]+)?\]\]/$1/g;
  }

   # Format links to self page.
  $text =~ s/\[\[\#[^\[\]\|\#]+\|(([^\[\]\|]|\[[^\[\]]|\][^\]])+)\]\]/$1/g;
  $text =~ s/\[\[\#([^\[\]\|\#]+)\]\]/$1/g;

  # Format emphasis: runs of two or more apostrophes are removed.
  $text =~ s/'{2,}//g; 

  # Format LFs
  # Blank lines become a single <BR>; line breaks in front of a heading,
  # <DT> or <IND> are merged into one, and those right after a closing
  # heading or </DT> are removed.
  $text =~ s/(?:\n{2,})/<BR>\n/g;
  $text =~ s/(?:<BR>\n?)*\n(<(H|DT|IND))/<BR>\n$1/g;
  $text =~ s/(<\/(H[2-6]|DT)>)\n?(?:<BR>\n?)+/$1\n/g;

  # Strip line breaks and white space from both ends of the text.
  $text =~ s/^(?:<BR>|\s)+//g;
  $text =~ s/(?:<BR>|\s)+$//g;

  # Recover nowiki texts
  $text =~ s/<F_TEXTS>([0-9]+)<\/F_TEXTS>/$f_texts[$1]/g;

  # print "Formatted_Content: $text\n";
  return $text;
}

# Rewrite <pre> sections as space-indented lines.
#
# The <pre> and </pre> tags themselves are dropped.  Inside a section a
# space is put in front of every non-empty line; text outside is copied
# unchanged.  A <pre> inside an open section, a </pre> without an open
# section and a section still open at the end each produce a warning.
#
# Returns the rewritten text.
sub format_content_pre {
  my ($source) = @_;
  my @pieces = split(/(<\/?pre>)/, $source);

  my $result = '';
  my $inside_pre = 0;

  for my $piece (@pieces) {
    if ($piece =~ /<pre>/) {
      format_content_warning ("closing pre tag recognition is failed")
          if $inside_pre;
      $inside_pre = 1;
      next;
    }

    if ($piece =~ /<\/pre>/) {
      format_content_warning ("opening pre tag recognition is failed")
          if $inside_pre == 0;
      $inside_pre = 0;
      next;
    }

    # Ordinary text: indent it only while a section is open.
    $piece =~ s/^(.)/ $1/mg if $inside_pre;
    $result .= $piece;
  }

  format_content_warning("closing pre tag recognition is failed")
      if $inside_pre;

  return $result;
}

# Remove wiki tables ("{|" ... "|}") from a text, nested ones included.
#
# Only text outside every table is kept.  A "|}" without an open table is
# kept in the output and reported with a warning; a table still open at the
# end of the text is reported as well.
#
# Returns the remaining text.
sub format_content_table {
  my ($source) = @_;
  my @pieces = split(/(\{\||\|\})/, $source);

  my $depth = 0;      # number of tables currently open
  my $kept = '';

  for my $piece (@pieces) {
    if ($piece =~ /^\{\|/) {
      $depth++;
      next;
    }

    if ($piece =~ /^\|\}/) {
      if ($depth != 0) {
        $depth--;
        next;
      }
      format_content_warning ("opening table tag recognition is failed");
      $kept .= $piece;
      next;
    }

    $kept .= $piece if $depth == 0;
  }

  format_content_warning("closing table tag recognition is failed")
      if $depth;

  return $kept;
}

# Remove HTML tables (<table ...> ... </table>) from a text, nested ones
# included.
#
# Only text outside every table is kept.  A closing tag without an open
# table is kept in the output and reported with a warning; a table still
# open at the end of the text is reported as well.
#
# Returns the remaining text.
sub format_content_table_html {
  my ($source) = @_;
  my @pieces = split(/(<\/?table[^<]*?>)/, $source);

  my $depth = 0;      # number of tables currently open
  my $kept = '';

  for my $piece (@pieces) {
    if ($piece =~ /^<table/) {
      $depth++;
      next;
    }

    if ($piece =~ /^<\/table/) {
      if ($depth != 0) {
        $depth--;
        next;
      }
      format_content_warning ("opening table html tag recognition is failed");
      $kept .= $piece;
      next;
    }

    $kept .= $piece if $depth == 0;
  }

  format_content_warning("closing table html tag recognition is failed")
      if $depth;

  return $kept;
}

# Report a failure of the text object while a page is being written, then
# stop the program.
#
# Arguments:
#   $fpwtext           - reference to the text object; its error_message()
#                        is printed
#   $heading           - title of the page being written
#   $content           - raw text of that page
#   $formatted_content - the same page after formatting
#
# Never returns: always dies after printing the diagnostics.
sub register_content_error {
  my ($fpwtext, $heading, $content, $formatted_content) = @_;
  my $message = $$fpwtext->error_message();

  # Double quotes are required here: in a single-quoted string the program
  # name would be printed as the literal text '$PROGRAM_NAME'.
  print "$PROGRAM_NAME: $message\n";
  print "Heading: $heading\n";
  print "Content: $content\n";
  print "Formatted_content: $formatted_content\n";
  die "$PROGRAM_NAME: failed to register the content of '$heading'\n";
}

# Print a formatting warning on standard output, prefixed with the program
# name and followed by a full stop.
sub format_content_warning {
  my $message = $_[0];

  print $PROGRAM_NAME . ': warning: ' . $message . ".\n";
}

# Load the list of pages to convert into the global %entry_headings.
#
# Argument:
#   $filename - path of the list file
#
# Each line holds a page title, optionally followed by tab-separated extra
# search keys:
#   title                    ->  $entry_headings{title} = 1
#   title<TAB>key1<TAB>...   ->  $entry_headings{title} = [key1, ...]
#
# Lines without a title (blank lines, lines starting with a tab) are
# skipped, and a last line without a trailing newline is accepted.
# Dies when the file does not exist or cannot be opened.
sub get_entry_headings {
  my $filename = $_[0];

  if (not -e $filename) {
    die("$PROGRAM_NAME: '$filename' does not exist.");
  }

  my $entry_file = FileHandle->new();
  if (!$entry_file->open("$filename", 'r')) {
    die "$PROGRAM_NAME: Failed to open the file, $ERRNO: $filename\n";
  }

  while (defined(my $line = $entry_file->getline())) {
    chomp $line;

    # Split off the title only.  $extra_keys is undefined when the line has
    # no tab at all, and defined (possibly empty) when it has one, which
    # keeps the "1" versus "list" distinction of the stored value.
    my ($title, $extra_keys) = split(/\t/, $line, 2);
    next if !defined($title) || $title eq '';

    $entry_headings{$title}
        = defined($extra_keys) ? [split(/\t/, $extra_keys)] : 1;
  }

  $entry_file->close();
  return;
}
