Subversion Repositories LCARS

Rev

Rev 294 | Go to most recent revision | View as "text/plain" | Blame | Compare with Previous | Last modification | View Log | RSS feed

1
"""
Created on 2014-10-20

@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>

"""


from os import chdir, stat
from sys import stderr
from os.path import dirname, realpath, basename
from pickle import dump, load
from re import match, DOTALL, search, sub, split, compile
from copy import deepcopy

# Verbosity threshold read by dmsg(): a message is printed only when
# this value is greater than or equal to the message's min_level.
debug_level = 2

def dmsg(*args, **kwargs):
    """
    Print a debug message if the module-level debug_level permits it.

    Takes the positional and keyword arguments of print(), plus:

    min_level -- lowest debug_level at which the message is printed;
                 1 if omitted or None.

    The message is written to standard error unless the 'file' keyword
    argument is given.
    """
    # kwargs is a dict: its entries are keys, not attributes, so they
    # have to be looked up as keys (hasattr() never finds them).
    # min_level is removed here because print() does not accept it.
    min_level = kwargs.pop('min_level', None)
    if min_level is None:
        min_level = 1

    kwargs.setdefault('file', stderr)

    if debug_level >= min_level:
        print(*args, **kwargs)

def sort_dict_alnum_english_key(phrase):
    """
    Return a case-insensitive sort key for a dictionary item.

    phrase -- indexable whose first element is the phrase string
              (presumably a (phrase, data) pair from dict.items();
              confirm against the callers)

    A brace pair in the phrase is dropped while its content is kept,
    and the result is lower-cased.
    """
    headword = phrase[0]
    without_braces = sub(r'\{(.+)\}', r'\1', headword)
    return without_braces.lower()

class Dictionary(dict):
    """
    Dictionary that maps each phrase to a dict of that phrase's fields.

    load() fills it from a dictionary text file (or from a pickle cache
    of that file); clean() normalises the phrases used as keys.

    """

    # Regex alternation of the field names accepted in an entry;
    # load() interpolates it into the pattern for "<key>: <value>" lines.
    _keys = "ipa|en|lit|pos|com|tag|ex"
    # Not referenced in the visible part of this file -- purpose to be
    # confirmed.  NOTE(review): as a class attribute this one dict is
    # shared by all instances.
    _expressions = {}

    def load (self, dictionary_file):
        dictionary = self

        dmsg('Loading dictionary '.format(dictionary_file), end='', min_level=1)

        chdir(dirname(realpath(__file__)))

        pickle_file = basename(dictionary_file) + '.pickle'

        try:
            pickle_mtime = stat(pickle_file).st_mtime
        except FileNotFoundError:
            pickle_mtime = None

        if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:
            dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)
            phrase = None
            key = None
            value = None
            with open(dictionary_file) as f:
                indent = None

                for line in f:
                    m = match(r'^\s*vuh:\s*(?P<phrase>.+)', line)
                    if m is not None:
                        phrase = m.group("phrase")
                        dictionary[phrase] = {}
                        indent = None
                    else:
                        m = match(r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(self._keys), line)
                        if m is not None:
                            # join previous value if necessary
                            if type(value) == list:
                                dictionary[phrase][key] = ' '.join(value)

                            indent = m.group("indent")
                            key = m.group("key")
                            value = m.group("value")
                            # assign a string for memory efficiency
                            dictionary[phrase][key] = value
                        elif indent is not None:
                            m = match(r'(?P<indent>\s+)(?P<continuation>\S.*)', line)
                            if m is not None:
                                if len(m.group("indent")) == len(indent) + 2:
                                    continuation = m.group("continuation")
                                    if type(value) == str:
                                        # when a continuation is first found, convert to a list
                                        # because there could be more continuations
                                        value = dictionary[phrase][key] = [value, continuation]
                                    else:
                                        value.append(continuation)

            # join last value if necessary
            if type(value) == list:
                dictionary[phrase][key] = ' '.join(value)

            dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)
            # TODO: Pickle should only contain strings to be small
            with open(pickle_file, mode='wb') as f: dump(dictionary, f)
            dmsg(' done.', min_level=1)
        else:
            dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)
            with open(pickle_file, mode='rb') as f: pickle = load(f)
            for key, value in pickle.items():
                dictionary[key] = value

        dmsg(' done ({0} entries).'.format(len(dictionary)), min_level=1)

    def clean (self):
        dictionary = self

        parens_re = compile(r'\(.+\)', DOTALL)
        braces_re = compile(r'^\s*\{(.+)\}\s*$', DOTALL)
        semicolon_re = compile(r'\s*;\s*')

        for orig_phrase, data in list(dictionary.items()):
            # if there are optional or alternating parts
            if search(parens_re, orig_phrase):
                if orig_phrase.find('|') > -1:
                    # TODO alternation
                    pass
                else:
                    # TODO optional parts
                    pass

            if orig_phrase.find(';') > -1:
                synonyms = map(
                    lambda x: sub(braces_re, r'\1', x),
                    split(semicolon_re, orig_phrase))

                for synonym in synonyms:
                    dictionary[synonym] = deepcopy(data)

                del dictionary[orig_phrase]
            else:
                m = match(braces_re, orig_phrase)
                if m is not None:
                    dictionary[m.group(1)] = deepcopy(dictionary[orig_phrase])
                    del dictionary[orig_phrase]