Subversion Repositories LCARS

Compare Revisions

Last modification

Regard whitespace Rev 292 → Rev 293

/trunk/tools/eazytrans/vuh.py
6,17 → 6,13
@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>
'''
from sys import argv, stderr
from re import findall, DOTALL, IGNORECASE, match, sub, compile, \
split
from os import chdir, stat
from os.path import dirname, realpath, basename
from collections import OrderedDict
from re import findall, DOTALL, match, sub, compile, \
escape, search
from os.path import basename
from functools import cmp_to_key
from copy import deepcopy
from collections.abc import MutableSequence
from pickle import dump, load
from Dictionary import Dictionary, dmsg, \
sort_dict_alnum_english_key
 
debug_level = 2
dictionary = {}
 
prepositions = {
25,124 → 21,9
"t'": 'of'
}
 
def dmsg(*args, **kwargs):
    '''
    Print a debug message if the module-level debug_level permits it.

    Takes the same arguments as the built-in print(), plus:

    min_level -- smallest value of debug_level at which the message is
                 printed; missing or None means 1
    file      -- output stream; defaults to standard error
    '''
    # kwargs is a dict, so its keys have to be looked up as keys.
    # hasattr() only inspects attributes and therefore never found them,
    # which silently replaced the caller's min_level and file.
    min_level = kwargs.pop('min_level', None)
    if min_level is None:
        min_level = 1

    kwargs.setdefault('file', stderr)

    if debug_level >= min_level:
        print(*args, **kwargs)
 
class MutableString2(MutableSequence):
    '''
    String-like sequence that can be appended to in place.

    The text is kept as a list of pieces that are only joined when the
    string value is requested.  Appending is supported through extend(),
    and therefore through ``+=``, which MutableSequence implements on top
    of extend().  Item assignment, item deletion and insert() are not
    supported and raise NotImplementedError.
    '''

    def __init__(self, value=None):
        self._values = [str(value)] if value is not None else []

    def __add__(self, value):
        # str.join() accepts only real str items, so joining
        # [self, value] raised TypeError; concatenate the text instead.
        return str(self) + str(value)

    def __delitem__(self, index):
        raise NotImplementedError

    def __getitem__(self, index):
        return str(self)[index]

    def __len__(self):
        return len(str(self))

    def __repr__(self):
        # deliberately the bare text, without quotes
        return ''.join(self._values)

    def __setitem__(self, index, value):
        raise NotImplementedError

    def __str__(self):
        return self.__repr__()

    def extend(self, values):
        # values is one piece of text, not an iterable of items; convert it
        # so that a non-str piece cannot break the join in __repr__()
        self._values.append(str(values))

    def insert(self, index, value):
        raise NotImplementedError
 
def cli_help():
    '''
    Print a one-line usage message to standard output.

    The script is named by the base name of argv[0].
    '''
    script_name = basename(argv[0])
    usage = 'Usage: {0} TEXT...'.format(script_name)
    print(usage)
 
def load_dictionary(dictionary, dictionary_file):
    '''
    Fill dictionary with the entries of dictionary_file.

    The working directory of the process is changed to the directory of
    this source file; dictionary_file and its pickle cache are then
    resolved relative to it.

    If "<base name of dictionary_file>.pickle" is missing or older than
    dictionary_file, the text file is parsed and the result is written to
    that pickle file.  Otherwise the entries are read from the pickle file.

    In the text file, a line "vuh: PHRASE" starts an entry.  Lines
    "KEY: VALUE" (KEY being one of ipa, en, lit, pos, com, tag, ex) add a
    field to it, and lines indented two characters deeper than the field
    line continue its value, separated by one space.

    dictionary      -- mapping to fill; modified in place
    dictionary_file -- name of the dictionary text file
    '''
    dmsg('Loading dictionary ', end='', min_level=1)

    chdir(dirname(realpath(__file__)))

    pickle_file = basename(dictionary_file) + '.pickle'

    try:
        pickle_mtime = stat(pickle_file).st_mtime
    except FileNotFoundError:
        pickle_mtime = None

    if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:
        dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)

        keys = "ipa|en|lit|pos|com|tag|ex"
        # compiled once instead of once per line
        phrase_re = compile(r'^\s*vuh:\s*(?P<phrase>.+)')
        key_re = compile(
            r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(keys))
        continuation_re = compile(r'(?P<indent>\s+)(?P<continuation>\S.*)')

        # None until the first phrase or field line has been read, so that
        # a stray field or continuation line cannot raise NameError
        phrase = None
        key = None
        indent = None

        with open(dictionary_file) as f:
            for line in f:
                m = phrase_re.match(line)
                if m is not None:
                    phrase = m.group("phrase")
                    dictionary[phrase] = {}
                    indent = None
                    continue

                m = key_re.match(line)
                if m is not None:
                    if phrase is None:
                        # field before the first phrase: there is no entry
                        # to attach it to
                        continue

                    indent = m.group("indent")
                    key = m.group("key")
                    dictionary[phrase][key] = MutableString2(m.group("value"))
                    continue

                if indent is None:
                    continue

                m = continuation_re.match(line)
                if (m is not None
                        and len(m.group("indent")) == len(indent) + 2):
                    # in-place append through MutableString2
                    dictionary[phrase][key] += " " + m.group("continuation")

        dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)
        # TODO: Pickle should only contain strings to be small
        with open(pickle_file, mode='wb') as f:
            dump(dictionary, f)
        dmsg(' done.', min_level=1)
    else:
        dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)
        # NOTE(review): unpickling can execute arbitrary code; the cache
        # file must only ever come from a trusted location.
        with open(pickle_file, mode='rb') as f:
            cached = load(f)
        for key, value in cached.items():
            dictionary[key] = value

    dmsg(' done ({0} entries).'.format(len(dictionary)), min_level=1)
 
def clean_dictionary(dictionary):
    '''
    Normalize the phrase keys of dictionary in place.

    A phrase containing ";" is a list of synonyms: it is replaced by one
    entry per synonym, each with its own deep copy of the data.  A phrase
    or synonym that is entirely wrapped in braces, "{...}", is stored
    without the braces.

    dictionary -- mapping of phrase to data; modified in place
    '''
    braces_re = compile(r'^\s*\{(.+)\}\s*$')
    semicolon_re = compile(r'\s*;\s*')

    # iterate over a snapshot because entries are added and removed
    for orig_phrase, data in list(dictionary.items()):
        if ';' in orig_phrase:
            del dictionary[orig_phrase]

            for synonym in semicolon_re.split(orig_phrase):
                # Strip the braces from the synonym itself.  Substituting
                # on orig_phrase stored every synonym under the original
                # key, which was then deleted, losing the entry.
                synonym = braces_re.sub(r'\1', synonym)
                if synonym:
                    dictionary[synonym] = deepcopy(data)
        else:
            m = braces_re.match(orig_phrase)
            if m is not None:
                dictionary[m.group(1)] = dictionary.pop(orig_phrase)
 
def sort_dict_alnum_english_key(phrase):
    '''
    Return the sort key for a dictionary item.

    phrase -- sequence whose first element is the phrase text

    The key is the phrase text with the braces around a "{...}" part
    removed, in lower case.
    '''
    text = phrase[0]
    without_braces = sub(r'\{(.+)\}', r'\1', text)
    return without_braces.lower()
 
def get_sort_dict_alnum_vulcan_key():
letters = list(map(str.lower, [
" ", 'S', 'T', 'P', 'K', 'R', 'L', 'A', 'Sh', 'O', 'U', 'D',
173,10 → 54,36
 
return cmp_to_key(sort_dict_alnum_vulcan)
 
def translate (phrase):
class VulcanDictionary(Dictionary):
    '''
    Dictionary that can also translate phrases that carry a known
    preposition prefix or the plural suffix "lar".
    '''

    def translate(self, phrase, search_prefix=True, search_plural=True):
        '''
        Return the translation of phrase, or None if there is none.

        phrase        -- text to translate; looked up in lower case
        search_prefix -- also try to split off one of the module-level
                         prepositions from the start of phrase
        search_plural -- also try to strip the plural suffix "lar"

        Returns one of:
        - the stored entry for phrase; its 'vuh' field is set to phrase,
          which modifies the stored entry itself
        - a list [prefix entry, tail entry] if a preposition was split off
        - a copy of the singular entry with " (pl.)" appended to its 'en'
          field if the plural suffix was stripped
        - None
        '''
        translation = self.get(phrase.lower(), None)
        if translation is not None:
            translation['vuh'] = phrase
            return translation

        if search_prefix:
            for preposition in prepositions:
                if not phrase.startswith(preposition):
                    continue

                # Look the preposition up directly.  With the prefix search
                # enabled, a preposition missing from the dictionary would
                # match itself as a prefix and recurse without end.
                prefix_translation = self.translate(
                    preposition, search_prefix=False, search_plural=False)
                if prefix_translation is None:
                    continue

                # Cut off only the leading preposition; a regex
                # substitution would remove every occurrence and would
                # treat the preposition as a pattern.
                tail = phrase[len(preposition):]
                tail_translation = self.translate(tail, search_prefix=False)
                if tail_translation is not None:
                    return [prefix_translation, tail_translation]

        # NOTE(review): this used to hang off the prefix search as "elif",
        # so plurals were never tried while search_prefix was true.  It now
        # runs whenever the prefix search found nothing - confirm intent.
        if search_plural and phrase.endswith('lar'):
            head = phrase[:-len('lar')]
            head_translation = self.translate(
                head, search_prefix=False, search_plural=False)
            if head_translation is not None:
                # copy so that the stored singular entry stays unchanged
                head_translation = dict(head_translation)
                head_translation['en'] = (
                    head_translation.get('en', '') + ' (pl.)')
                return head_translation

        return None
 
188,8 → 95,9
 
text = argv[1]
 
load_dictionary(dictionary, 'vuh-gol-en.dict.zdb.txt')
clean_dictionary(dictionary)
dictionary = VulcanDictionary(dictionary)
dictionary.load('vuh-gol-en.dict.zdb.txt')
dictionary.clean()
 
# try:
# for phrase, data in OrderedDict(sorted(
200,12 → 108,13
# except BrokenPipeError:
# pass
 
dmsg("text:", text, min_level=2)
sentences = findall(r'(?!\s+)(?:.+?\.{1,3}|.+$)', text, DOTALL)
dmsg("sentences:", sentences, min_level=2)
for sentence in sentences:
dmsg("sentence:", sentence, min_level=2)
 
clauses = split(r'\s+[-–—]\s+', sentence)
clauses = findall(r'(?!\s+)(?:.+?(?:\s+-\s*|\s*[–—]\s*|\.{1,3}|.+$))', sentence, DOTALL)
dmsg("clauses:", clauses, min_level=2)
for clause in clauses:
dmsg("clause:", clause, min_level=2)
217,18 → 126,19
while offset < len(words):
translation = None
 
for i in reversed(range(offset + 1, len(words) + 1)):
for i in range(len(words), offset, -1):
dmsg("words[{0}:{1}] = {2}".format(offset, i, words[offset:i]), min_level=2)
phrase = ' '.join(words[offset:i])
 
dmsg("phrase:", phrase, min_level=2)
 
translation = translate(phrase)
translation = dictionary.translate(phrase)
 
if translation is not None:
dmsg("phrase-translation:", translation, min_level=2)
dmsg("words[{0}:{1}] = [\"{2}\"]".format(offset, i, translation), min_level=2)
words[offset:i] = [translation]
offset += i - 1
offset += i - offset
break
 
if translation is None:
/trunk/tools/eazytrans/Dictionary.py
0,0 → 1,137
"""
Created on 2014-10-20
 
@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>
 
"""
 
from os import chdir, stat
from sys import stderr
from os.path import dirname, realpath, basename
from pickle import dump, load
from re import match, DOTALL, search, sub, split, compile
from copy import deepcopy
 
debug_level = 2
 
def dmsg(*args, **kwargs):
    """
    Print a debug message if the module-level debug_level permits it.

    Takes the same arguments as the built-in print(), plus:

    min_level -- smallest value of debug_level at which the message is
                 printed; missing or None means 1
    file      -- output stream; defaults to standard error
    """
    # kwargs is a dict, so its keys have to be looked up as keys.
    # hasattr() only inspects attributes and therefore never found them,
    # which silently replaced the caller's min_level and file.
    min_level = kwargs.pop('min_level', None)
    if min_level is None:
        min_level = 1

    kwargs.setdefault('file', stderr)

    if debug_level >= min_level:
        print(*args, **kwargs)
 
def sort_dict_alnum_english_key(phrase):
    """
    Return the sort key for a dictionary item.

    phrase -- sequence whose first element is the phrase text

    The key is the phrase text with the braces around a "{...}" part
    removed, in lower case.
    """
    text = phrase[0]
    without_braces = sub(r'\{(.+)\}', r'\1', text)
    return without_braces.lower()
 
class Dictionary(dict):
    """
    A dict of phrases read from a plain-text dictionary file.

    Each key is a phrase (the text after "vuh:" in the file); each value is
    a dict that maps the field names listed in _keys to their text.
    """
    # alternation of the field names recognized in a dictionary file
    _keys = "ipa|en|lit|pos|com|tag|ex"
    _expressions = {}

    def load(self, dictionary_file):
        """
        Fill this dictionary with the entries of dictionary_file.

        The working directory of the process is changed to the directory
        of this source file; dictionary_file and its pickle cache are then
        resolved relative to it.

        If "<base name of dictionary_file>.pickle" is missing or older than
        dictionary_file, the text file is parsed and this dictionary is
        written to that pickle file.  Otherwise the entries are read from
        the pickle file.

        dictionary_file -- name of the dictionary text file
        """
        dmsg('Loading dictionary ', end='', min_level=1)

        chdir(dirname(realpath(__file__)))

        pickle_file = basename(dictionary_file) + '.pickle'

        try:
            pickle_mtime = stat(pickle_file).st_mtime
        except FileNotFoundError:
            pickle_mtime = None

        if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:
            dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)
            with open(dictionary_file) as f:
                self._parse(f)

            dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)
            # TODO: Pickle should only contain strings to be small
            with open(pickle_file, mode='wb') as f:
                dump(self, f)
            dmsg(' done.', min_level=1)
        else:
            dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)
            # NOTE(review): unpickling can execute arbitrary code; the
            # cache file must only ever come from a trusted location.
            with open(pickle_file, mode='rb') as f:
                cached = load(f)
            for key, value in cached.items():
                self[key] = value

        dmsg(' done ({0} entries).'.format(len(self)), min_level=1)

    def _parse(self, lines):
        """
        Add the entries described by lines, an iterable of text lines.

        A line "vuh: PHRASE" starts an entry.  Lines "KEY: VALUE" (KEY
        being one of _keys) add a field to the current entry.  Lines that
        are indented two characters deeper than the field line continue
        its value; the pieces are joined with one space.  Field lines
        before the first phrase and all other lines are ignored.
        """
        phrase_re = compile(r'^\s*vuh:\s*(?P<phrase>.+)')
        key_re = compile(
            r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(self._keys))
        continuation_re = compile(r'(?P<indent>\s+)(?P<continuation>\S.*)')

        entry = None    # field dict of the phrase being read
        key = None      # field whose value may still be continued
        indent = None   # indentation of the current field line
        parts = None    # pieces of a multi-line value, else None

        for line in lines:
            m = phrase_re.match(line)
            if m is not None:
                # Finish the pending value before switching entries;
                # joining it later wrote it into the next phrase's entry
                # and left a list behind in this one.
                if parts is not None:
                    entry[key] = ' '.join(parts)
                    parts = None

                entry = self[m.group("phrase")] = {}
                key = indent = None
                continue

            m = key_re.match(line)
            if m is not None:
                if entry is None:
                    # field before the first phrase: there is no entry to
                    # attach it to
                    continue

                if parts is not None:
                    entry[key] = ' '.join(parts)
                    parts = None

                indent = m.group("indent")
                key = m.group("key")
                # a plain string for memory efficiency
                entry[key] = m.group("value")
                continue

            if indent is None:
                continue

            m = continuation_re.match(line)
            if m is not None and len(m.group("indent")) == len(indent) + 2:
                if parts is None:
                    # collect in a list because more continuations may
                    # follow; joined once the value is complete
                    parts = [entry[key]]
                parts.append(m.group("continuation"))

        if parts is not None:
            entry[key] = ' '.join(parts)

    def clean(self):
        """
        Normalize the phrase keys of this dictionary in place.

        A phrase containing ";" is a list of synonyms: it is replaced by
        one entry per synonym, each with its own deep copy of the data.
        A phrase or synonym that is entirely wrapped in braces, "{...}",
        is stored without the braces.
        """
        braces_re = compile(r'^\s*\{(.+)\}\s*$', DOTALL)
        semicolon_re = compile(r'\s*;\s*')

        # iterate over a snapshot because entries are added and removed
        for orig_phrase, data in list(self.items()):
            # TODO: expand optional "(...)" and alternating "(a|b)" parts

            if ';' in orig_phrase:
                del self[orig_phrase]

                for synonym in semicolon_re.split(orig_phrase):
                    synonym = braces_re.sub(r'\1', synonym)
                    # a leading or trailing ";" yields an empty synonym
                    if synonym:
                        self[synonym] = deepcopy(data)
            else:
                m = braces_re.match(orig_phrase)
                if m is not None:
                    self[m.group(1)] = self.pop(orig_phrase)
Property changes:
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property