Rev 297 |
Go to most recent revision |
View as "text/plain" |
Blame |
Compare with Previous |
Last modification |
View Log
| RSS feed
1
"""
Created on 2014-10-20
@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>
"""
from os import chdir, stat
from sys import stderr
from os.path import dirname, realpath, basename
from pickle import dump, load
from re import match, DOTALL, search, sub, split, compile
# Verbosity threshold: dmsg() prints a message only if this value is
# greater than or equal to the message's min_level.
debug_level = 2


def dmsg(*args, **kwargs):
    """
    Prints a debug message if the current debug level permits it.

    Accepts the same positional and keyword arguments as print(),
    plus the following keyword arguments:

    :param min_level: Minimum value of debug_level at which the message
      is printed.  If omitted or None, 1 is used.
    :type min_level: int
    :param file: Stream to print to.  If omitted, stderr is used.
    """
    # kwargs is a dict, so its entries must be looked up as keys;
    # hasattr() inspects attributes and would never find them.
    min_level = kwargs.pop('min_level', None)
    if min_level is None:
        min_level = 1

    kwargs.setdefault('file', stderr)

    if debug_level >= min_level:
        print(*args, **kwargs)
def sort_dict_alnum_english_key(phrase):
    """
    Returns the sort key for a dictionary item.

    The first element of the item is taken, one pair of curly braces
    (from the first "{" to the last "}") is removed from it, and the
    result is lower-cased.

    :param phrase: Item whose first element is the phrase text
    :type phrase: sequence
    """
    headword = phrase[0]
    without_braces = sub(r'\{(.+)\}', r'\1', headword)
    return without_braces.lower()
class Dictionary(dict):
    """
    A Dictionary (not to be confused with its ancestor, dict)
    represents a word dictionary stored in a file.

    Each key is a phrase; each value is a dict that maps field names
    (see _keys) to the field text read from the dictionary file.
    """

    # Field name whose line starts a new entry; load() overrides it
    # per instance.
    _language_key = 'en'

    # Regular expression alternation of the field names recognised
    # within an entry.
    _keys = "ipa|en|lit|pos|com|tag|ex"

    # Maps regular expressions to entry data.  This class-level dict is
    # only a fallback; clean() gives every instance its own mapping so
    # that instances do not leak expressions into one another.
    _expressions = {}

    def load (self, dictionary_file, language_key='en'):
        """
        Loads a word dictionary from a file.

        The parsed entries are cached in a pickle file named after the
        dictionary file and stored in the directory of this module.
        The pickle is used instead of the dictionary file unless the
        dictionary file has been modified more recently.

        Side effect: the working directory is changed to the directory
        of this module.

        :param dictionary_file: Path of the dictionary file
        :type dictionary_file: str
        :param language_key: Field name whose line starts a new entry
        :type language_key: str
        """
        self._language_key = language_key

        dmsg('Loading dictionary ', end='', min_level=1)

        chdir(dirname(realpath(__file__)))

        pickle_file = basename(dictionary_file) + '.pickle'

        try:
            pickle_mtime = stat(pickle_file).st_mtime
        except FileNotFoundError:
            pickle_mtime = None

        if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:
            dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)

            self._parse_dictionary_file(dictionary_file)

            dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)

            # TODO: Pickle should only contain strings to be small
            with open(pickle_file, mode='wb') as f:
                dump(self, f)

            dmsg(' done.', min_level=1)
        else:
            dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)

            # NOTE: unpickling can execute arbitrary code; the pickle
            # file must only ever be one written by this method.
            with open(pickle_file, mode='rb') as f:
                cached = load(f)

            for key, value in cached.items():
                self[key] = value

        dmsg(' done ({0} entries).'.format(len(self)), min_level=1)

    def _parse_dictionary_file (self, dictionary_file):
        """
        Reads the entries of a dictionary file into this object.

        A line "<language key>: <phrase>" starts a new entry.  A line
        "<key>: <value>" with one of the names in _keys adds a field
        to the current entry.  A line indented by exactly two more
        characters than the preceding field line continues the value
        of that field; continuation lines are joined with a space.

        :param dictionary_file: Path of the dictionary file
        :type dictionary_file: str
        """
        re_phrase = compile(
            r'^\s*{0}:\s*(?P<phrase>.+)'.format(self._language_key))
        re_field = compile(
            r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(self._keys))
        re_continuation = compile(r'(?P<indent>\s+)(?P<continuation>\S.*)')

        phrase = None
        key = None
        value = None
        indent = None

        with open(dictionary_file) as f:
            for line in f:
                m = re_phrase.match(line)
                if m is not None:
                    # join the pending value while phrase and key still
                    # refer to the entry that it belongs to
                    if isinstance(value, list):
                        self[phrase][key] = ' '.join(value)

                    phrase = m.group("phrase")
                    self[phrase] = {}
                    key = None
                    value = None
                    indent = None
                    continue

                m = re_field.match(line)
                if m is not None:
                    # join previous value if necessary
                    if isinstance(value, list):
                        self[phrase][key] = ' '.join(value)

                    indent = m.group("indent")
                    key = m.group("key")
                    value = m.group("value")

                    # assign a string for memory efficiency
                    self[phrase][key] = value
                    continue

                # a continuation requires a preceding field line
                if indent is None:
                    continue

                m = re_continuation.match(line)
                if m is not None and len(m.group("indent")) == len(indent) + 2:
                    continuation = m.group("continuation")
                    if isinstance(value, str):
                        # when a continuation is first found, convert to a list
                        # because there could be more continuations
                        value = self[phrase][key] = [value, continuation]
                    else:
                        value.append(continuation)

        # join last value if necessary
        if isinstance(value, list):
            self[phrase][key] = ' '.join(value)

    def clean (self):
        """
        Cleans dictionary entries

        A phrase that contains ";" is replaced by one entry per
        semicolon-separated synonym (braces removed); the synonyms
        share the same entry data.  Any other phrase that is written
        in braces, optionally followed by a parenthesised variant, is
        replaced by its content: if the content contains parentheses,
        it is stored as a regular expression for
        translate_expression(), otherwise it becomes the new key.
        If this object has a callable clean_entry attribute, it is
        applied to the content of the braces first.
        """
        # do not write into the dict shared by all instances
        if '_expressions' not in vars(self):
            self._expressions = dict(self._expressions)

        re_parens = compile(r'\(.+\)', DOTALL)
        re_parens_no_alt = compile(r'\(([^\|]+)\)', DOTALL)
        re_braces = compile(
            r'^\s*\{(?P<phrase>.+)\}(?:\s*\((?P<variant>.+?)\))?\s*$',
            DOTALL)
        re_semicolon = compile(r'\s*;\s*')

        # iterate over a snapshot because entries are added and removed
        for orig_phrase, data in list(self.items()):
            # TODO: handle alternation ("|") and optional parts in
            # parenthesised phrases that are not written in braces

            if ';' in orig_phrase:
                for synonym in re_semicolon.split(orig_phrase):
                    self[re_braces.sub(r'\1', synonym)] = data

                del self[orig_phrase]
                continue

            m = re_braces.match(orig_phrase)
            if m is None:
                continue

            phrase = m.group('phrase')

            if callable(getattr(self, 'clean_entry', None)):
                phrase = self.clean_entry(phrase)

            if re_parens.search(phrase) is not None:
                # alternation and optional parts
                expr = re_parens_no_alt.sub(r'(?:\1)?', phrase)
                expr = expr.replace('~', '(?=.)')
                self._expressions[expr] = data
            else:
                # remove braces
                self[phrase] = data

            del self[orig_phrase]

    def translate (self, phrase):
        """
        Translate a phrase according to this dictionary.
        For language-specific processing, this method should be
        called/overridden by inheriting classes.

        :param phrase: Phrase to look up; the lookup is done with the
          lower-cased phrase
        :type phrase: str
        :return: A copy of the entry data with the phrase as passed
          stored under the language key, or None if there is no entry
        :rtype: dict
        """
        translation = self.get(phrase.lower(), None)
        if translation is None:
            return None

        # copy, because synonyms share one data dict and a result
        # returned earlier must not change with the next lookup
        result = dict(translation)
        result[self._language_key] = phrase
        return result

    def translate_expression (self, phrase):
        """
        Translate a phrase according entries in this dictionary
        based on regular expressions.

        :param phrase: Phrase that must be matched completely by an
          expression
        :type phrase: str
        :return: A copy of the data of the first matching expression
          with the matched text stored under the language key, or None
          if no expression matches
        :rtype: dict
        """
        # NOTE(review): the sort key is the number of fields of the entry
        # data (item[1]); ordering by the length of the expression
        # (item[0]) may have been intended -- confirm before changing.
        for expression, data in sorted(self._expressions.items(), key=lambda item:-len(item[1])):
            expression_match = match(r'{0}$'.format(expression), phrase)
            if expression_match is not None:
                # copy so that the stored entry data stays unmodified
                result = dict(data)
                result[self._language_key] = expression_match.group(0)
                return result

        return None