"""
Created on 2014-10-20

@author: Thomas 'PointedEars' Lahn
"""

from os import chdir, stat
from sys import stderr
from os.path import dirname, realpath, basename
from pickle import dump, load
from re import match, DOTALL, search, sub, split, compile

# Messages whose ``min_level`` exceeds this value are suppressed by dmsg().
debug_level = 2


def dmsg(*args, **kwargs):
    """
    Print a debug message if the module's debug level is high enough.

    Accepts the same arguments as :func:`print`, plus:

    :param min_level: Minimum value of ``debug_level`` required for the
                      message to be printed.  Defaults to 1 when omitted
                      or ``None``.
    :type min_level: int
    :param file: Output stream; defaults to ``sys.stderr``.
    """
    # kwargs is a dict, so its entries must be looked up as keys;
    # hasattr() would only inspect attributes of the dict object and
    # silently discard the caller's min_level and file.
    min_level = kwargs.pop('min_level', None)
    if min_level is None:
        min_level = 1

    kwargs.setdefault('file', stderr)

    if debug_level >= min_level:
        print(*args, **kwargs)


def sort_dict_alnum_english_key(phrase):
    """
    Return a sort key for a dictionary item.

    :param phrase: Item whose first element is the phrase, e.g. a
                   ``(phrase, data)`` tuple from ``dict.items()``.
    :return: The phrase in lower case with one pair of enclosing
             braces removed.
    :rtype: str
    """
    return sub(r'\{(.+)\}', r'\1', phrase[0]).lower()


class Dictionary(dict):
    """
    A Dictionary (not to be confused with its ancestor, dict)
    represents a word dictionary stored in a file.
    """

    # Key of the field that starts a new entry in the dictionary file.
    _language_key = 'en'

    # Alternation of the field keys recognized within an entry.
    _keys = "ipa|en|lit|pos|com|tag|ex"

    # Fallback for instances created without __init__() (for example
    # by unpickling); regular instances get their own dict.
    _expressions = {}

    def __init__(self, *args, **kwargs):
        """
        Create a dictionary; arguments are passed on to :class:`dict`.
        """
        super().__init__(*args, **kwargs)

        # Per-instance map of regular expression -> entry data, so that
        # expressions collected by clean() are not shared by all
        # dictionaries.
        self._expressions = {}

    def load(self, dictionary_file, language_key='en'):
        """
        Loads a word dictionary from a file.

        The parsed dictionary is cached as ``<basename>.pickle`` in the
        directory of this module; the cache is used instead of the
        dictionary file as long as it is not older than that file.
        Note that this method changes the current working directory
        to the directory of this module.

        :param dictionary_file: Path of the dictionary file.
        :type dictionary_file: str
        :param language_key: Key of the field that starts a new entry.
        :type language_key: str
        """
        self._language_key = language_key

        dmsg('Loading dictionary ', end='', min_level=1)

        chdir(dirname(realpath(__file__)))

        pickle_file = basename(dictionary_file) + '.pickle'

        try:
            pickle_mtime = stat(pickle_file).st_mtime
        except FileNotFoundError:
            pickle_mtime = None

        if (pickle_mtime is None
                or stat(dictionary_file).st_mtime > pickle_mtime):
            dmsg('from {0} ...'.format(dictionary_file), end='',
                 min_level=1)

            with open(dictionary_file) as f:
                self._parse_lines(f)

            dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='',
                 min_level=1)

            # TODO: Pickle should only contain strings to be small
            with open(pickle_file, mode='wb') as f:
                dump(self, f)

            dmsg(' done.', min_level=1)
        else:
            dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)

            # NOTE(review): unpickling executes code embedded in the
            # file; this is only safe while the cache file is written
            # exclusively by this method.
            with open(pickle_file, mode='rb') as f:
                cached = load(f)

            for key, value in cached.items():
                self[key] = value

        dmsg(' done ({0} entries).'.format(len(self)), min_level=1)

    def _parse_lines(self, lines):
        """
        Adds the entries described by the lines of a dictionary file.

        A line ``<language key>: <phrase>`` starts a new entry,
        a line ``<key>: <value>`` sets a field of the current entry, and
        a line indented two columns deeper than the previous field line
        continues the value of that field.

        :param lines: Iterable of the lines to parse.
        """
        re_phrase = compile(
            r'^\s*{0}:\s*(?P<phrase>.+)'.format(self._language_key))
        re_field = compile(
            r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(
                self._keys))
        re_continuation = compile(r'(?P<indent>\s+)(?P<continuation>\S.*)')

        phrase = None
        key = None
        value = None
        indent = None

        for line in lines:
            m = re_phrase.match(line)
            if m is not None:
                # join the pending value of the previous entry before
                # switching entries so that it is neither left behind
                # as a list nor written into the new entry
                if isinstance(value, list):
                    self[phrase][key] = ' '.join(value)

                phrase = m.group("phrase")
                self[phrase] = {}
                key = None
                value = None
                indent = None
                continue

            m = re_field.match(line)
            if m is not None:
                # join previous value if necessary
                if isinstance(value, list):
                    self[phrase][key] = ' '.join(value)

                indent = m.group("indent")
                key = m.group("key")
                value = m.group("value")

                # assign a string for memory efficiency
                self[phrase][key] = value
                continue

            if indent is None:
                # no field that this line could continue
                continue

            m = re_continuation.match(line)
            if (m is not None
                    and len(m.group("indent")) == len(indent) + 2):
                continuation = m.group("continuation")
                if isinstance(value, str):
                    # when a continuation is first found, convert to
                    # a list because there could be more continuations
                    value = self[phrase][key] = [value, continuation]
                else:
                    value.append(continuation)

        # join last value if necessary
        if isinstance(value, list):
            self[phrase][key] = ' '.join(value)

    def clean(self):
        """
        Cleans dictionary entries.

        * A phrase containing ``;`` is replaced by one entry per
          synonym; all synonyms share the data of the original entry.
        * A phrase enclosed in braces is replaced by an entry without
          the braces or, if it contains parentheses, by a regular
          expression that is used by :meth:`translate_expression`.

        If the object has a callable ``clean_entry`` attribute, it is
        called with each unbraced phrase and its return value is used
        instead.
        """
        re_parens = compile(r'\(.+\)', DOTALL)
        re_parens_no_alt = compile(r'\(([^\|]+)\)', DOTALL)

        # NOTE(review): the optional trailing "(...)" group is never
        # read; its name is a placeholder.
        re_braces = compile(
            r'^\s*\{(?P<phrase>.+)\}(?:\s*\((?P<note>.+?)\))?\s*$',
            DOTALL)
        re_semicolon = compile(r'\s*;\s*')

        clean_entry = getattr(self, 'clean_entry', None)

        # iterate over a snapshot because entries are added and removed
        for orig_phrase, data in list(self.items()):
            # TODO: alternation ("|") and optional parts in
            # parenthesized phrases are not handled here yet

            if ';' in orig_phrase:
                for synonym in re_semicolon.split(orig_phrase):
                    self[re_braces.sub(r'\1', synonym)] = data

                del self[orig_phrase]
                continue

            m = re_braces.match(orig_phrase)
            if m is None:
                continue

            phrase = m.group('phrase')

            if callable(clean_entry):
                phrase = clean_entry(phrase)

            if re_parens.search(phrase) is not None:
                # alternation and optional parts: parentheses without
                # alternation become optional groups
                expr = re_parens_no_alt.sub(r'(?:\1)?', phrase)
                expr = expr.replace('~', '(?=.)')
                self._expressions[expr] = data
            else:
                # remove braces
                self[phrase] = data

            # NOTE(review): the braced original is removed in both
            # cases -- confirm that expression entries are not meant
            # to stay available under their braced phrase.
            del self[orig_phrase]

    def translate(self, phrase):
        """
        Translate a phrase according to this dictionary.

        For language-specific processing, this method should be
        called/overridden by inheriting classes.

        :param phrase: Phrase to look up; the lookup uses its
                       lower-case form.
        :type phrase: str
        :return: The data of the matching entry, with the field named
                 by the language key set to ``phrase`` (the stored
                 entry itself is modified), or ``None`` if there is no
                 such entry.
        :rtype: dict or None
        """
        translation = self.get(phrase.lower(), None)
        if translation is None:
            return None

        translation[self._language_key] = phrase
        return translation

    def translate_expression(self, phrase):
        """
        Translate a phrase according to entries in this dictionary
        that are based on regular expressions.

        :param phrase: Phrase that must match an expression from its
                       start up to its end.
        :type phrase: str
        :return: The data of the first matching entry, with the field
                 named by the language key set to the matched text
                 (the stored entry itself is modified), or ``None`` if
                 no expression matches.
        :rtype: dict or None
        """
        # NOTE(review): entries are tried in descending order of the
        # number of fields in their data (item[1]); ordering by the
        # length of the expression (item[0]) may have been intended.
        candidates = sorted(self._expressions.items(),
                            key=lambda item: -len(item[1]))

        for expression, data in candidates:
            expression_match = match(r'{0}$'.format(expression), phrase)
            if expression_match is not None:
                data[self._language_key] = expression_match.group(0)
                return data

        return None