#! /usr/local/bin/python
# -*- coding: UTF-8 -*-

# $Id$
#=============================================================================
#
#  @file    htkdic2pls.py
#
#  @author Fukasawa Mitsuo
#
#
#    Copyright (C) 2006 BEE Co.,Ltd. All rights reserved.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
#=============================================================================

import getopt, sys, string, os, shutil
import kanatable

# Opening part of the generated PLS document: the XML declaration followed by
# the opening <lexicon> tag.  convert_to_pls() writes it before any lexeme.
_lexicon_head = """<?xml version="1.0" encoding="UTF-8"?>
<lexicon version="1.0" xmlns="http://www.w3.org/2005/01/pronunciation-lexicon"
         schemaLocation="pls.xsd"
      alphabet="julius" xml:lang="ja">
"""

# Closing </lexicon> tag; convert_to_pls() writes it after the last lexeme.
_lexicon_tail = """</lexicon>
"""

# string.Template source for a lexeme that carries an <alias> element.
# Placeholders: ${lexid}, ${grapheme}, ${phoneme}, ${alias}.
_lexeme_template = """  <lexeme xml:id="${lexid}">
    <grapheme>${grapheme}</grapheme>
    <phoneme>${phoneme}</phoneme>
    <alias>${alias}</alias>
  </lexeme>
""" 

# string.Template source for a lexeme without an <alias> element.
# Placeholders: ${lexid}, ${grapheme}, ${phoneme}.
_lexeme_template0 = """  <lexeme xml:id="${lexid}">
    <grapheme>${grapheme}</grapheme>
    <phoneme>${phoneme}</phoneme>
  </lexeme>
""" 

###############################################################################
#
# Convert an HTK dictionary file into a PLS lexicon file
#
def convert_to_pls(dicname):
    """Convert the HTK dictionary file *dicname* into a PLS lexicon file.

    The output file is written next to the input, with the input's
    extension replaced by ``.pls``.  Each input line is handed to
    ``htkdic_to_pls`` together with its 1-based line number, and every
    non-empty result is written between the lexicon header and footer.

    dicname -- path of the HTK dictionary to read.

    Both files are closed even when reading, converting or writing raises.
    """
    # NOTE: if dicname itself ends in ".pls" the output path equals the
    # input path, and opening it for writing truncates the input.
    path = os.path.splitext(dicname)[0] + '.pls'

    infp = open(dicname)
    try:
        outfp = open(path, "w")
        try:
            outfp.write(_lexicon_head)
            num = 0
            for line in infp:
                num += 1
                plsline = htkdic_to_pls(line, num)
                # An empty string means the line produced no lexeme.
                if plsline:
                    outfp.write(plsline)
            outfp.write(_lexicon_tail)
        finally:
            outfp.close()
    finally:
        infp.close()


def htkdic_to_pls(ln, num):
    """Convert one HTK dictionary line into a PLS ``<lexeme>`` fragment.

    The line is read as ``HEAD [OUTPUT] PRONUNCIATION``.  The bracketed
    ``OUTPUT`` part is not used.  ``HEAD`` is split on ``:`` and its first
    field becomes the grapheme.  When the split yields more than two fields,
    the second field is treated as the reading and emitted as ``<alias>``.
    A reading of the form ``pre{alt1/alt2}post`` is expanded to
    ``pre + altN + post`` for every alternative, and the last alternative
    accepted by ``kanatable.kana_compare`` against the pronunciation is used.
    If none is accepted, a placeholder alias is used and the line is printed.

    ln  -- one raw line of the dictionary file.
    num -- 1-based line number; used as the lexeme ``xml:id`` and in
           diagnostics.

    Returns the XML fragment, or "" when the line is blank or has no
    ``[...]`` part (an error is printed in the latter case).
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    from xml.sax.saxutils import escape

    # NOTE(review): on Python 2, unicode() decodes byte strings with the
    # interpreter's default encoding -- confirm how the dictionary is encoded.
    line = unicode(ln)
    if not line.strip():
        return ""

    # find() returns -1 when the character is missing; index() would raise
    # ValueError and abort the whole conversion.
    idx0 = line.find('[')
    if idx0 < 0:
        print("Error: '[' not found(" + str(num) + ")")
        return ""
    idx1 = line.find(']', idx0 + 1)
    if idx1 < 0:
        print("Error: ']' not found(" + str(num) + ")")
        return ""

    pronunciation = line[idx1+1:].strip()
    a = line[:idx0].split(':')
    orthography = a[0]
    if len(a) > 2:
        kana = u""
        ku = a[1]
        lb = ku.find('{')
        if lb >= 0:
            rb = ku.find('}')
            m = ku[:lb]          # get "xyz" of "xyz{abc/def}"
            n = ku[rb+1:]        # get "xyz" of "{abc/def}xyz"
            k = ku[lb+1:rb]      # get "abc/def" of "{abc/def}xyz"
            for s in k.split('/'):
                x = m + s + n
                # kana_compare's return type is not visible here, so the
                # explicit comparison with True is kept as written.
                if kanatable.kana_compare(x, pronunciation) == True:
                    kana = x
            if len(kana) == 0:
                # NOTE(review): this literal looks like ISO-2022-JP text
                # whose escape bytes were lost; it is kept byte-for-byte --
                # confirm the intended placeholder.
                kana = u"$B!)!)!)(B"
                print(str(num) + ":" + line)
        else:
            kana = ku
        # Escape &, < and > so the substituted text stays well-formed XML.
        d = dict(lexid=num,
                 grapheme=escape(orthography),
                 phoneme=escape(pronunciation),
                 alias=escape(kana))
        xml = string.Template(_lexeme_template).substitute(d)
    else:
        d = dict(lexid=num,
                 grapheme=escape(orthography.strip()),
                 phoneme=escape(pronunciation))
        xml = string.Template(_lexeme_template0).substitute(d)
    return xml

#
# Print usage of this script
#
def usage():
    """Print the command-line synopsis of this script to standard output."""
    synopsis = "htkdic2pls.py [-d <dicname>] [-h]"
    print(synopsis)


#
# Command-line entry point
#
def main():
    """Parse the command line and convert the selected dictionary.

    Options:
      -d NAME, --dictionary=NAME  HTK dictionary to convert
                                  (default: "web.60k.htkdic").
      -h, --help                  print the usage line and exit.

    When the options cannot be parsed, the usage line is printed and the
    process exits with status 2.
    """
    try:
        # The trailing "=" makes --dictionary take an argument, matching the
        # ":" that follows "d" in the short-option string.
        opts, _ = getopt.getopt(sys.argv[1:], "hd:", ["help", "dictionary="])
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    dicname = "web.60k.htkdic"
    for o, a in opts:
        if o in ("-d", "--dictionary"):
            dicname = a
        elif o in ("-h", "--help"):
            usage()
            sys.exit()

    convert_to_pls(dicname)


# Run the conversion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
