#                                                         -*- Perl -*-
# Copyright (c) 2007  Kazuhiro Ito
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#

use strict;
use warnings;

use English;

use vars qw (%entity_table %utf2euc_table $utf2euc_regexp);
use vars qw (%ipa_table %symbol_table %text_table);

# Maps the entity names 'amp', 'gt' and 'lt' to the single characters
# they stand for.
%entity_table = (
    amp => '&',
    gt  => '>',
    lt  => '<',
);

# Replacement table keyed by UTF-8 byte sequences.  This file does not
# enable `use utf8', so every literal below (quoted character or \x
# escape) is a plain byte string.  The keys of this table are also
# joined into $utf2euc_regexp at the bottom of the file.
#
# NOTE(review): judging by the name, the replacements exist so that the
# text survives a later UTF-8 -> EUC conversion -- confirm with callers.
%utf2euc_table = (
  # (FULLWIDTH TILDE, U+FF5E) -> (WAVE DASH, U+301C)
  "\xEF\xBD\x9E" => "\xE3\x80\x9C",
  # (FULLWIDTH HYPHEN-MINUS, U+FF0D) -> (MINUS SIGN, U+2212)
  "\xEF\xBC\x8D" => "\xE2\x88\x92",

  # Accented letters are spelled out as base letter + ASCII mark.
  'è' => 'e`',
  'é' => "e'",
  'ñ' => 'n~',
  'Ĥ' => 'H^',
  'Ĵ' => 'J^',
  'ĥ' => 'h^',
  'ĵ' => 'j^',
  'Ĉ' => 'C^',
  'Ĝ' => 'G^',
  'Ŝ' => 'S^',
  'ĉ' => 'c^',
  'ĝ' => 'g^',
  'ŝ' => 's^',
  'œ' => 'oe',
  # 'Ŭ' => 'U',
  # 'ŭ' => 'u',

  "\xC2\xA0" => ' ',
  # MIDDLE DOT, U+00B7.  Listed once only: a literal '·' key is the
  # very same two bytes and would merely be a duplicate hash key.
  "\xC2\xB7" => '・',
  "\xC3\x87" => '・',
  "\xC3\xAC" => '★',
  "\xC3\xB2" => '→',
  "\xC3\xB5" => '◆',
  "\xC3\xB4" => '→',
  "\xC5\x92" => '■',
  "\xC5\xA0" => '▼',
  "\xC6\x92" => '⇔',
  "\xE2\x80\x82" => ' ',
  "\xE2\x80\x93" => '-',
  "\xE2\x80\x94" => '--',
  "\xE2\x84\xA2" => ' (TM)',
  # "\xE2\x9c\x93" => '', #Wordfinder sign, check sign
);

# Maps pronunciation-string input to 'u-XXXX' codes.  Keys are either
# UTF-8 byte sequences (written as \x escapes) or single ASCII
# characters; a value of '' means no code is assigned to that key.
%ipa_table = (
  "'" => 'u-02c8',
  "\xc3\xa6" => 'u-00e6',
  "\xc3\xb0" => 'u-00f0',
  "\xc5\x8b" => 'u-014b',
  "\xc9\x91" => 'u-0251',
  "\xc9\x92" => 'u-0252',
  "\xc9\x94" => 'u-0254',
  "\xc9\x99" => 'u-0259',
  "\xc9\x9c" => 'u-025c',
  "\xc9\xa1" => 'u-0261',
  "\xc9\xaa" => 'u-026a',
  "\xc9\xb5" => 'u-0275',
  "\xca\x83" => 'u-0283',
  "\xca\x8a" => 'u-028a',
  "\xca\x8c" => 'u-028c',
  "\xca\x92" => 'u-0292',
  # U+02C8 encodes as CB 88.  Using CB 8C here would collide with the
  # U+02CC entry below and leave U+02C8 itself unmapped.
  "\xcb\x88" => 'u-02c8',
  "\xcb\x8c" => 'u-02cc',
  "\xcb\x90" => 'u-02d0',
  "\xcd\x82" => 'u-0342',
  ' ' => '',
  '%' => 'u-02cc',
  '&' => 'u-00e6',
  '(' => '',
  ')' => '',
  ',' => '',
  '-' => '',
  '3' => 'u-025c',
  ':' => 'u-02d0',
  ';' => 'u-02d0',
  'A' => 'u-0251',
  'D' => 'u-00f0',
  'I' => 'u-026a',
  'J' => 'u-0259',
  'L' => '',
  'N' => 'u-014b',
  'O' => 'u-0254',
  'Q' => 'u-0252',
  'S' => 'u-0283',
  'T' => 'u-0275',
  'U' => 'u-028a',
  'V' => 'u-028c',
  'Z' => 'u-0292',
  'a' => '',
  'b' => '',
  'd' => '',
  'e' => '',
  'f' => '',
  'g' => 'u-0261',
  'h' => '',
  'i' => '',
  'j' => '',
  'k' => '',
  'l' => '',
  'm' => '',
  'n' => '',
  'o' => '',
  'p' => '',
  'r' => '',
  's' => '',
  't' => '',
  'u' => '',
  'v' => '',
  'w' => '',
  'x' => '',
  'y' => '',
  'z' => '',
    );

# Maps symbol input to its replacement: a 'u-XXXX' code, a literal
# replacement character, or '' when nothing is substituted.  Keys are
# single ASCII characters or UTF-8 byte sequences.
%symbol_table = (
  # --- ASCII keys ---
  ' '  => '',
  '%'  => 'u-02cc',               # second accent
  '&'  => 'u-00e6',               # ae ligature
  ','  => '',
  '-'  => '',
  "'"  => 'u-02c8',               # first accent

  # --- two-byte keys ---
  # "\xc3\x87" => "・",
  # "\xc3\x87" => "u-00b7",
  "\xc3\x87" => '◇',
  "\xc3\x89" => '→',              # Wordfinder sign
  "\xc3\x91" => ' ',              # speaker mark
  "\xc3\xaa" => 'u-20ac',         # Euro
  "\xc3\xac" => '★',
  "\xc3\xb2" => '→',
  "\xc3\xb4" => '→',
  "\xc3\xb5" => '◆',
  "\xc5\x92" => '■',
  "\xc5\xa0" => '▼',
  "\xc6\x92" => '⇔',

  # --- three-byte keys ---
  "\xe2\x80\xa0" => '♭',
  "\xe2\x80\xa1" => 'u-266e',     # natural
  "\xe2\x80\xa6" => 'u-221b',     # cubic root
  "\xe2\x80\xb0" => '♯',
);

# Maps characters found in body text to their replacement.  Keys are
# UTF-8 byte sequences (this file does not enable `use utf8'); each
# value is one of:
#   * a 'u-XXXX' code carrying the character's code point,
#   * a plain replacement string, or
#   * '' -- in which case the code point is kept only as a trailing
#     comment on the same line.
# Keys that are invisible or easily confused in an editor (combining
# marks, the various spaces) are written as \x escapes on purpose.
%text_table = (
  # ' ' => ' ',
  "\xc2\xa0" => ' ',
  '£' => '', # 'u-00a3',
  '©' => '', # 'u-00a9',
  '®' => '', # 'u-00ae',
  '¯' => 'u-00af',
  '°' => 'u-00b0',
  '²' => 'u-00b2',
  '·' => 'u-00b7',
  '¹' => 'u-00b9',
  '¼' => 'u-00bc',
  '½' => 'u-00bd',
  '¾' => 'u-00be',
  'Ä' => 'u-00c4',
  'Å' => 'u-00c5',
  'Æ' => 'u-00c6',
  'É' => 'u-00c9',
  '×' => '', # 'u-00d7',
  'Ü' => 'u-00dc',
  'à' => 'u-00e0',
  'á' => 'u-00e1',
  'â' => 'u-00e2',
  'ã' => 'u-00e3',
  'ä' => 'u-00e4',
  'å' => 'u-00e5',
  'æ' => 'u-00e6',
  'ç' => 'u-00e7',
  'è' => 'u-00e8',
  'é' => 'u-00e9',
  'ê' => 'u-00ea',
  'ë' => 'u-00eb',
  'ì' => 'u-00ec',
  'í' => 'u-00ed',
  'î' => 'u-00ee',
  'ï' => 'u-00ef',
  'ð' => 'u-00f0',
  'ñ' => 'u-00f1',
  'ò' => 'u-00f2',
  'ó' => 'u-00f3',
  'ô' => 'u-00f4',
  'ö' => 'u-00f6',
  '÷' => '', # 'u-00f7',
  'ø' => 'u-00f8',
  'ù' => 'u-00f9',
  'ú' => 'u-00fa',
  'û' => 'u-00fb',
  'ü' => 'u-00fc',
  'ý' => 'u-00fd',
  'þ' => 'u-00fe',
  'ā' => 'u-0101',
  'ă' => 'u-0103',
  'Č' => 'u-010c',
  'č' => 'u-010d',
  'Ē' => 'u-0112',
  'ē' => 'u-0113',
  'ě' => 'u-011b',
  'ĝ' => 'u-011d',
  'ġ' => 'u-0121',
  'ĩ' => 'u-0129',
  'Ī' => 'u-012a',
  'ī' => 'u-012b',
  'ĭ' => 'u-012d',
  'ı' => 'u-0131',
  'ł' => 'u-0142',
  'ň' => 'u-0148',
  'ō' => 'u-014d',
  'œ' => 'u-0153',
  'ř' => 'u-0159',
  'Ś' => 'u-015a',
  'ś' => 'u-015b',
  'ş' => 'u-015f',
  'š' => 'u-0161',
  'Ţ' => 'u-0162',
  'ũ' => 'u-0169',
  'ū' => 'u-016b',
  'ŭ' => 'u-016d',
  'ů' => 'u-016f',
  'ŷ' => 'u-0177',
  'ž' => 'u-017e',
  'ǐ' => 'u-01d0',
  'ǣ' => 'u-01e3',
  'ǧ' => 'u-01e7',
  'ǽ' => 'u-01fd',
  # Combining marks: escaped, because a bare combining character inside
  # quotes attaches itself to the quote glyph and is easy to damage.
  "\xcc\x84" => 'u-0304', # COMBINING MACRON
  "\xcc\xa3" => 'u-0323', # COMBINING DOT BELOW
  "\xcc\xa7" => 'u-0327', # COMBINING CEDILLA
  "\xcc\xb1" => 'u-0331', # COMBINING MACRON BELOW
  'Γ' => '', # 'u-0393',
  'Δ' => '', # 'u-0394',
  'Θ' => '', # 'u-0398',
  'Λ' => '', # 'u-039b',
  'Ξ' => '', # 'u-039e',
  'Π' => '', # 'u-03a0',
  'Σ' => '', # 'u-03a3',
  'Φ' => '', # 'u-03a6',
  'Ψ' => '', # 'u-03a8',
  'Ω' => '', # 'u-03a9',
  'α' => '', # 'u-03b1',
  'β' => '', # 'u-03b2',
  'γ' => '', # 'u-03b3',
  'δ' => '', # 'u-03b4',
  'ζ' => '', # 'u-03b6',
  'η' => '', # 'u-03b7',
  'θ' => '', # 'u-03b8',
  'ι' => '', # 'u-03b9',
  'κ' => '', # 'u-03ba',
  'λ' => '', # 'u-03bb',
  'μ' => '', # 'u-03bc',
  'ν' => '', # 'u-03bd',
  'ξ' => '', # 'u-03be',
  'ο' => '', # 'u-03bf',
  'π' => '', # 'u-03c0',
  'ρ' => '', # 'u-03c1',
  'σ' => '', # 'u-03c3',
  'τ' => '', # 'u-03c4',
  'υ' => '', # 'u-03c5',
  'χ' => '', # 'u-03c7',
  'ψ' => '', # 'u-03c8',
  'ω' => '', # 'u-03c9',
  'ϒ' => 'u-03d2',
  'ϕ' => 'u-03d5',
  'ϵ' => 'u-03f5',
  'ӯ' => 'u-04ef',
  'ḇ' => 'u-1e07',
  'ḍ' => 'u-1e0d',
  'ḏ' => 'u-1e0f',
  'ḥ' => 'u-1e25',
  'ḳ' => 'u-1e33',
  'Ḵ' => 'u-1e34',
  'ḵ' => 'u-1e35',
  'ḷ' => 'u-1e37',
  'ṁ' => 'u-1e41',
  'ṃ' => 'u-1e43',
  'ṅ' => 'u-1e45',
  'ṇ' => 'u-1e47',
  'ṛ' => 'u-1e5b',
  'ṟ' => 'u-1e5f',
  'ṣ' => 'u-1e63',
  'ṭ' => 'u-1e6d',
  'ṯ' => 'u-1e6f',
  'ẓ' => 'u-1e93',
  'ẽ' => 'u-1ebd',
  # EN SPACE / EM SPACE: escaped so they cannot be mistaken for (or
  # silently turned into) an ordinary ASCII space key.
  "\xe2\x80\x82" => '　', # 'u-2002'
  "\xe2\x80\x83" => '　', # 'u-2003',
  '‐' => '-', # 'u-2010',
  '–' => '--', # 'u-2013',
  '—' => '---', # 'u-2014',
  '‘' => 'u-2018',
  '’' => 'u-2019',
  '“' => '', # 'u-201c',
  '”' => '', # 'u-201d',
  '…' => '', # 'u-2026',
  '′' => '', # 'u-2032',
  '″' => '', # 'u-2033',
  '⁁' => 'u-2041',
  '⁄' => '/', # 'u-2044',
  '⁵' => 'u-2075',
  '₀' => 'u-2080',
  '₁' => 'u-2081',
  '₂' => 'u-2082',
  '₃' => 'u-2083',
  '₆' => 'u-2086',
  '™' => 'u-2122',
  '⅝' => 'u-215d',
  '⅞' => 'u-215e',
  "\xe2\x88\x92" => '', # 'u-2212', #'－'
  '√' => '', # 'u-221a',
  '∞' => '', # 'u-221e',
  '≠' => '', # 'u-2260',
  '✓' => 'u-2713',
  '➔' => '⇒', # 'u-2794',
  '〃' => '', # 'u-3003',
    );


# Build a single capturing alternation, "(key1|key2|...)", out of the
# keys of %utf2euc_table.
#   * Each key goes through quotemeta so it is always matched literally,
#     even if a key containing a regexp metacharacter is added later.
#   * Keys are sorted longest-first (ties broken bytewise) so the result
#     does not depend on hash ordering and a longer key can never be
#     shadowed by a shorter key that is its prefix.
#   * join() also yields a well-formed "()" when the table is empty,
#     instead of an unbalanced "(".
$utf2euc_regexp = '('
    . join('|',
           map  { quotemeta($_) }
           sort { length($b) <=> length($a) or $a cmp $b }
           keys %utf2euc_table)
    . ')';


# Following line must be at the end of this file.
1;
