Subversion Repositories LCARS

Rev

Rev 297 | Go to most recent revision | View as "text/plain" | Blame | Compare with Previous | Last modification | View Log | RSS feed

1
"""
Created on 2014-10-20

@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>

"""


from os import chdir, stat
from sys import stderr
from os.path import dirname, realpath, basename
from pickle import dump, load
from re import match, DOTALL, search, sub, split, compile

# Global verbosity threshold: dmsg() prints a message only if this value
# is greater than or equal to the message's min_level.
debug_level = 2

def dmsg(*args, **kwargs):
    """
    Print a debug message if the global debug level is high enough.

    Accepts the same positional and keyword arguments as print(), plus
    the keyword argument described below.

    :param min_level: Minimum value of ``debug_level`` at which the
        message is printed.  Defaults to 1 if omitted or None.
    :param file: Stream to write to.  Defaults to ``sys.stderr``
        (not ``sys.stdout`` as with print()).
    """
    # kwargs is a dict, so its keys have to be looked up as keys:
    # hasattr() checks *attributes* and therefore was always False here,
    # which silently discarded the caller's min_level and file.
    min_level = kwargs.pop('min_level', None)
    if min_level is None:
        min_level = 1

    kwargs.setdefault('file', stderr)

    if debug_level >= min_level:
        print(*args, **kwargs)

def sort_dict_alnum_english_key(phrase):
    """
    Return a case-insensitive sort key for a dictionary item.

    The first element of ``phrase`` is taken, a brace-enclosed part
    ``{...}`` in it is replaced by its content (greedy: from the first
    ``{`` to the last ``}``), and the result is lowercased.

    :param phrase: Indexable whose first element is the phrase string
        (presumably a ``(phrase, data)`` pair from ``dict.items()``).
    :return: The normalized phrase.
    :rtype: str
    """
    headword = phrase[0]
    without_braces = sub(r'\{(.+)\}', r'\1', headword)
    return without_braces.lower()

class Dictionary(dict):
    """
    A Dictionary (not to be confused with its ancestor, dict)
    represents a word dictionary stored in a file.

    Keys are phrases, values are dicts that map field names (one of
    the alternatives in ``_keys``) to strings.  Entries whose phrase
    contains optional or alternating parts are moved by clean() into
    a separate mapping of regular expressions, ``_expressions``.
    """

    # Field name that introduces a new phrase in the dictionary file
    _language_key = 'en'
    # Regular-expression alternation of the field names recognized by load()
    _keys = "ipa|en|lit|pos|com|tag|ex"
    # Class-level fallback only, for instances that were created without
    # __init__() (unpickled objects, subclasses not calling super()).
    # Each instance gets its own dict in __init__()/clean() so that
    # expressions are not shared between dictionaries.
    _expressions = {}

    def __init__(self, *args, **kwargs):
        """
        Creates a dictionary; arguments are passed on to dict().
        """
        super().__init__(*args, **kwargs)
        self._expressions = {}

    def load (self, dictionary_file, language_key='en'):
        """
        Loads a word dictionary from a file.

        The parsed dictionary is cached as ``<basename>.pickle`` in the
        directory of this module; the cache is used instead of the
        dictionary file unless the latter is newer.

        Side effect: the working directory of the process is changed
        to the directory of this module *before* the files are
        accessed, so a relative ``dictionary_file`` is resolved
        against that directory.

        :param dictionary_file: Path of the dictionary file
        :type dictionary_file: str
        :param language_key: Field name that introduces a new phrase
        :type language_key: str
        """

        self._language_key = language_key

        dmsg('Loading dictionary ', end='', min_level=1)

        chdir(dirname(realpath(__file__)))

        pickle_file = basename(dictionary_file) + '.pickle'

        try:
            pickle_mtime = stat(pickle_file).st_mtime
        except FileNotFoundError:
            pickle_mtime = None

        if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:
            dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)
            with open(dictionary_file) as f:
                self._parse_lines(f)

            dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)
            # TODO: Pickle should only contain strings to be small
            with open(pickle_file, mode='wb') as f:
                dump(self, f)
            dmsg(' done.', min_level=1)
        else:
            dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)
            # NOTE: unpickling can execute arbitrary code; the cache file
            # must not be writable by untrusted parties.
            # ("load" is the pickle function here, not this method.)
            with open(pickle_file, mode='rb') as f:
                cached = load(f)
            for key, value in cached.items():
                self[key] = value

        dmsg(' done ({0} entries).'.format(len(self)), min_level=1)

    def _parse_lines (self, lines):
        """
        Parses the lines of a dictionary file into entries of this object.

        Three kinds of lines are recognized, all others are ignored:

        * ``<language key>: <phrase>`` starts a new, empty entry;
        * ``<key>: <value>`` (key being one of ``_keys``) sets a field
          of the current entry;
        * a line indented by exactly two characters more than the
          preceding field line continues the value of that field;
          the parts are joined with a single space.

        A field line before the first phrase line raises KeyError.

        :param lines: Iterable of the lines to parse
        :type lines: iterable of str
        """

        re_phrase = compile(
            r'^\s*{0}:\s*(?P<phrase>.+)'.format(self._language_key))
        re_field = compile(
            r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(self._keys))
        re_continuation = compile(r'(?P<indent>\s+)(?P<continuation>\S.*)')

        phrase = None
        key = None
        # str for a single-line value, list of str once it is continued
        value = None
        # indentation of the last field line; None if no field is open
        indent = None

        for line in lines:
            m = re_phrase.match(line)
            if m is not None:
                # A new entry begins: join the pending multi-line value
                # first so that it is stored with the entry it belongs
                # to and does not leak into the new one.
                if isinstance(value, list):
                    self[phrase][key] = ' '.join(value)

                phrase = m.group("phrase")
                self[phrase] = {}
                key = value = indent = None
                continue

            m = re_field.match(line)
            if m is not None:
                # join previous value if necessary
                if isinstance(value, list):
                    self[phrase][key] = ' '.join(value)

                indent = m.group("indent")
                key = m.group("key")
                value = m.group("value")
                # assign a string for memory efficiency
                self[phrase][key] = value
                continue

            if indent is None:
                # no field is open, so this cannot be a continuation
                continue

            m = re_continuation.match(line)
            if m is not None and len(m.group("indent")) == len(indent) + 2:
                continuation = m.group("continuation")
                if isinstance(value, str):
                    # when a continuation is first found, convert to a
                    # list because there could be more continuations
                    value = self[phrase][key] = [value, continuation]
                else:
                    value.append(continuation)

        # join last value if necessary
        if isinstance(value, list):
            self[phrase][key] = ' '.join(value)

    def clean (self):
        """
        Cleans dictionary entries.

        * A phrase containing ``;`` is split into synonyms; each
          synonym (with enclosing braces and a trailing parenthesized
          variant removed) becomes a key referring to the *same* data
          object, and the original entry is removed.
        * A phrase enclosed in braces is replaced by an entry without
          the braces.  If the object has a callable ``clean_entry``
          attribute, it is called with the phrase and its return value
          is used instead.  If the resulting phrase contains
          parentheses, it is converted to a regular expression
          (``(x)`` without ``|`` becomes optional, ``~`` becomes a
          lookahead for any character) and stored in ``_expressions``
          instead of in this mapping.
        * All other entries are left unchanged.
        """

        # Make sure expressions are stored per instance even if
        # __init__() was bypassed (e.g. for unpickled objects).
        if '_expressions' not in self.__dict__:
            self._expressions = dict(self._expressions)

        re_parens = compile(r'\(.+\)', DOTALL)
        re_parens_no_alt = compile(r'\(([^\|]+)\)', DOTALL)
        re_braces = compile(
            r'^\s*\{(?P<phrase>.+)\}(?:\s*\((?P<variant>.+?)\))?\s*$',
            DOTALL)
        re_semicolon = compile(r'\s*;\s*')

        # iterate over a snapshot because entries are added and removed
        for orig_phrase, data in list(self.items()):
            # TODO: expand alternation "(a|b)" and optional parts "(a)"
            # in phrases regardless of enclosing braces

            if ';' in orig_phrase:
                for synonym in re_semicolon.split(orig_phrase):
                    self[re_braces.sub(r'\1', synonym)] = data

                del self[orig_phrase]
                continue

            m = re_braces.match(orig_phrase)
            if m is None:
                continue

            phrase = m.group('phrase')

            if callable(getattr(self, 'clean_entry', None)):
                phrase = self.clean_entry(phrase)

            if re_parens.search(phrase) is not None:
                # alternation and optional parts
                expr = re_parens_no_alt.sub(r'(?:\1)?', phrase)
                expr = expr.replace('~', '(?=.)')
                self._expressions[expr] = data
            else:
                # remove braces
                self[phrase] = data

            del self[orig_phrase]

    def translate (self, phrase):
        """
        Translate a phrase according to this dictionary.
        For language-specific processing, this method should be
        called/overridden by inheriting classes.

        The lowercased phrase is looked up.  On success the stored
        entry itself (not a copy) is returned after its language-key
        field has been set to the phrase as it was passed.

        :param phrase: Phrase to look up
        :type phrase: str
        :return: The entry for the phrase, or None if there is none
        :rtype: dict or None
        """

        translation = self.get(phrase.lower(), None)
        if translation is not None:
            translation[self._language_key] = phrase
            return translation

        return None

    def translate_expression (self, phrase):
        """
        Translate a phrase according to entries in this dictionary
        based on regular expressions.

        Each expression collected by clean() is matched against the
        whole phrase (the phrase is not lowercased).  On the first
        match, the stored entry itself (not a copy) is returned after
        its language-key field has been set to the matched text.

        :param phrase: Phrase to look up
        :type phrase: str
        :return: The entry of the first matching expression, or None
        :rtype: dict or None
        """

        # NOTE(review): candidates are ordered by the number of fields in
        # the entry (item[1] is the data), not by the length of the
        # expression (item[0]) -- confirm that this is intended.
        candidates = sorted(
            self._expressions.items(), key=lambda item: -len(item[1]))

        for expression, data in candidates:
            expression_match = match(r'{0}$'.format(expression), phrase)
            if expression_match is not None:
                data[self._language_key] = expression_match.group(0)
                return data

        return None