#                                                         -*- Perl -*-
# Copyright (c) 2009  Kazuhiro Ito
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#

# Replace the character references found in a string.
#
# $_[0] is the text to process.  It is cut into character references
# (`&name;', `&#123;', `&#x1f;') and the text between them:
#   - a named reference is replaced by its value in the global
#     %entity_table, or by `?' when the table has no such name;
#   - a numeric reference is converted by decode_entitry_internal();
#   - any other text is copied unchanged.
# Returns the resulting string.
sub decode_entity {
  my ($source) = @_;
  my $decoded = '';

  # The separator pattern is captured, so split() returns the
  # references themselves in between the plain text pieces.
  for my $piece (split (/(&[0-9a-zA-Z]+;|&\#[0-9]+;|&\#x[0-9a-fA-F]+;)/, $source)) {
    if ($piece =~ /&([0-9a-zA-Z]+);/) {
      # Named reference.
      $decoded .= defined($entity_table{$1}) ? $entity_table{$1} : '?';
      next;
    }

    if ($piece =~ /&\#([0-9]+|x[0-9a-fA-F]+);/) {
      # Numeric reference, decimal or `x'-prefixed hexadecimal.
      $decoded .= decode_entitry_internal($1);
      next;
    }

    # Plain text.
    $decoded .= $piece;
  }

  return $decoded;
}

# Encode the code point of a numeric character reference as UTF-8 bytes.
#
# $_[0] is the body of the reference without the leading `&#' and the
# trailing `;': either a decimal number (`12354') or `x' followed by
# hexadecimal digits (`x3042').
#
# Returns a byte string: one byte for code points below 0x80, otherwise
# a 2- to 6-byte sequence (code points up to 0x7fffffff).  A larger
# value cannot be encoded this way and yields `?', the same replacement
# decode_entity() uses for unknown named entities.
sub decode_entitry_internal {
  my $entity = $_[0];

  if ($entity =~ /^x(.*)/) {
    $entity = hex ($1);
  }

  # The masks below only cover bits 7..30.  Without this guard a value
  # with higher bits set was either silently truncated to its low bits
  # or reached chr() and produced a wide character instead of bytes.
  if ($entity > 0x7fffffff) {
    return '?';
  }

  if      ($entity & 0x7c000000) {
    # 6 bytes: 1 + 5 * 6 bits.
    return
        chr(0xfc | (($entity >> 30) & 0x01)).
        chr(0x80 | (($entity >> 24) & 0x3f)).
        chr(0x80 | (($entity >> 18) & 0x3f)).
        chr(0x80 | (($entity >> 12) & 0x3f)).
        chr(0x80 | (($entity >>  6) & 0x3f)).
        chr(0x80 | ($entity & 0x3f));
  } elsif ($entity & 0x03e00000) {
    # 5 bytes: 2 + 4 * 6 bits.
    return
        chr(0xf8 | (($entity >> 24) & 0x03)).
        chr(0x80 | (($entity >> 18) & 0x3f)).
        chr(0x80 | (($entity >> 12) & 0x3f)).
        chr(0x80 | (($entity >>  6) & 0x3f)).
        chr(0x80 | ($entity & 0x3f));
  } elsif ($entity & 0x001f0000) {
    # 4 bytes: 3 + 3 * 6 bits.
    return
        chr(0xf0 | (($entity >> 18) & 0x07)).
        chr(0x80 | (($entity >> 12) & 0x3f)).
        chr(0x80 | (($entity >>  6) & 0x3f)).
        chr(0x80 | ($entity & 0x3f));
  } elsif ($entity & 0x0000f800) {
    # 3 bytes: 4 + 2 * 6 bits.
    return
        chr(0xe0 | (($entity >> 12) & 0x0f)).
        chr(0x80 | (($entity >>  6) & 0x3f)).
        chr(0x80 | ($entity & 0x3f));
  } elsif ($entity & 0x00000780) {
    # 2 bytes: 5 + 6 bits.
    return
        chr(0xc0 | (($entity >>  6) & 0x1f)).
        chr(0x80 | ($entity & 0x3f));
  }

  # Below 0x80: a single byte.
  return chr($entity);
}

# Remove wiki-markup tables from a string.
#
# $_[0] is the text to process.  Everything from a `{|' up to its
# matching `|}' is dropped, nested tables included.  A `|}' without a
# preceding `{|' is kept in the output and reported through
# format_content_warning(); a `{|' that is never closed is reported as
# well, and the text following it is dropped.
# Returns the text that lies outside of all tables.
sub format_content_table {
  my ($source) = @_;
  my $depth = 0;
  my $kept = '';

  # The delimiters are captured, so they appear as tokens of their own.
  for my $token (split(/(\{\||\|\})/, $source)) {
    if ($token eq '{|') {
      $depth++;
      next;
    }

    if ($token eq '|}') {
      if ($depth > 0) {
        $depth--;
      } else {
        format_content_warning ("opening table tag recognition is failed");
        $kept .= $token;
      }
      next;
    }

    # Ordinary text survives only outside of every table.
    $kept .= $token if $depth == 0;
  }

  format_content_warning("closing table tag recognition is failed") if $depth;

  return $kept;
}

# Remove HTML tables from a string.
#
# $_[0] is the text to process.  Everything from a `<table ...>' tag up
# to its matching `</table>' tag is dropped, nested tables included.
# Tag names are matched case-insensitively, and a closing tag may
# contain white space before `>' (`</table >').  A closing tag without
# a preceding opening tag is kept in the output and reported through
# format_content_warning(); an opening tag that is never closed is
# reported as well, and the text following it is dropped.
# Returns the text that lies outside of all tables.
sub format_content_table_html {
  my $text = $_[0];
  # The tags are captured, so they appear as pieces of their own.
  my @texts = split(/(<\/?table[^<]*?>)/i, $text);
  my $level = 0;

  $text = '';
  foreach my $piece (@texts) {
    if ($piece =~ /^<table/i) {
      $level++;
    } elsif ($piece =~ /^<\/table[^<]*?>\z/i) {
      # Accept every closing tag the split pattern recognizes.  A plain
      # comparison with `</table>' let variants such as `</table >'
      # pass as ordinary text, so the level was never decremented and
      # the rest of the input was dropped.
      if ($level == 0) {
        format_content_warning ("opening table html tag recognition is failed");
        $text .= $piece;
      } else {
        $level--;
      }
    } elsif ($level == 0) {
      $text .= $piece;
    }
  }

  if ($level) {
    format_content_warning("closing table html tag recognition is failed");
  }

  return $text;
}

# Print a warning line on the currently selected output handle.
#
# $_[0] is the message text; it is prefixed with `$PROGRAM_NAME:
# warning: ' and followed by a period and a newline.
sub format_content_warning {
  my $message = $_[0];
  my $line = "$PROGRAM_NAME: warning: $message.\n";

  print($line);
}

# Return the distinct values of the argument list.
#
# The values are used as hash keys, so they come back as strings and
# in the order keys() happens to produce, not in the order given.
sub uniq_array {
  my %seen;

  $seen{$_} = 1 for @_;

  return keys(%seen);
}

# Load the entry heading list from a file into the global
# $entry_headings store.
#
# $_[0] is the name of the list file.  The store class is selected by
# the global $hash_module (`BDB', `GDBM', anything else: the plain Hash
# backend) and is opened as `entry_headings.db'.
#
# Every line of the file is either `KEY<TAB>VALUE' or just `KEY'; KEY
# ends at the first tab which is followed by a non-empty VALUE, and a
# bare KEY is stored with an empty value.  Blank lines are skipped, a
# final line without a newline is accepted, and a line that cannot be
# parsed (empty key) is reported on STDERR and ignored.
#
# Dies when the file does not exist or cannot be opened.
sub get_entry_headings {
  my ($filename) = @_;

  if ($hash_module eq 'BDB') {
    require WikipediaFpw::Hash::BDB;
    $entry_headings = WikipediaFpw::Hash::BDB->new();
  } elsif ($hash_module eq 'GDBM') {
    require WikipediaFpw::Hash::GDBM;
    $entry_headings = WikipediaFpw::Hash::GDBM->new();
  } else {
    require WikipediaFpw::Hash::Hash;
    $entry_headings = WikipediaFpw::Hash::Hash->new();
  }
  $entry_headings->open('entry_headings.db');

  if (not -e $filename) {
    die("$PROGRAM_NAME: '$filename' does not exist.");
  }

  my $entry_file = FileHandle->new();
  if (!$entry_file->open("$filename", 'r')) {
    die "$PROGRAM_NAME: Failed to open the file, $ERRNO: $filename\n";
  }

  while (defined(my $line = $entry_file->getline())) {
    # Strip the line terminator first so that the last line of a file
    # which does not end in a newline is parsed like any other line.
    $line =~ s/\n\z//;
    next if $line eq '';

    # Only use the captures of a match that succeeded: a failed match
    # does not set $1/$2, so using them unconditionally stored a bogus
    # (or undefined) key for blank, unterminated or malformed lines.
    if ($line =~ /^(.+?)\t(.+)\z/s) {
      $entry_headings->put($1, $2);
    } elsif ($line =~ /^([^\t]+)\t?\z/) {
      # Heading without a value (a lone trailing tab is tolerated).
      $entry_headings->put($1, '');
    } else {
      warn "$PROGRAM_NAME: warning: malformed line in '$filename' is ignored.\n";
    }
  }

  $entry_file->close();
}

1;
