Compare Revisions
Last modification
- Rev 293 2014-11-27 04:07:24
- Author: PointedEars
- Log message:
eazytrans:
* Dictionary.py
- Prepared translation of optional parts and alternation
* vuh.py
- Moved language-independent parts to class
- Translate prepositional prefixes and plurals
/trunk/tools/eazytrans/Dictionary.py |
File deleted |
|
Property changes: |
Deleted: svn:mime-type |
## -1 +0,0 ## |
-text/plain |
\ No newline at end of property |
Index: tools/eazytrans/vuh.py |
=================================================================== |
--- tools/eazytrans/vuh.py (revision 293) |
+++ tools/eazytrans/vuh.py (revision 292) |
@@ -6,13 +6,17 @@ |
@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de> |
''' |
from sys import argv, stderr |
-from re import findall, DOTALL, match, sub, compile, \ |
- escape, search |
-from os.path import basename |
+from re import findall, DOTALL, IGNORECASE, match, sub, compile, \ |
+ split |
+from os import chdir, stat |
+from os.path import dirname, realpath, basename |
+from collections import OrderedDict |
from functools import cmp_to_key |
-from Dictionary import Dictionary, dmsg, \ |
- sort_dict_alnum_english_key |
+from copy import deepcopy |
+from collections.abc import MutableSequence |
+from pickle import dump, load |
+debug_level = 2 |
dictionary = {} |
|
prepositions = { |
@@ -21,9 +25,124 @@ |
"t'": 'of' |
} |
|
+def dmsg(*args, **kwargs): |
+ if not hasattr(kwargs, 'min_level') or kwargs['min_level'] is None: |
+ kwargs['min_level'] = 1 |
+ |
+ if not hasattr(kwargs, 'file'): |
+ kwargs['file'] = stderr |
+ |
+ if debug_level >= kwargs['min_level']: |
+ del kwargs['min_level'] |
+ print(*args, **kwargs) |
+ |
+class MutableString2(MutableSequence): |
+ def __init__(self, value=None): |
+ self._values = [str(value)] if value is not None else [] |
+ |
+ def __add__(self, value): |
+ return ''.join([self, value]) |
+ |
+ def __delitem__(self): |
+ raise NotImplementedError |
+ |
+ def __getitem__(self, index): |
+ return str(self)[index] |
+ |
+ def __len__(self): |
+ return len(str(self)) |
+ |
+ def __repr__(self): |
+ return ''.join(self._values) |
+ |
+ def __setitem__(self, index, value): |
+ raise NotImplementedError |
+ |
+ def __str__(self): |
+ return self.__repr__() |
+ |
+ def extend(self, values): |
+ self._values.append(values) |
+ |
+ def insert(self, index, value): |
+ raise NotImplementedError |
+ |
def cli_help(): |
print('Usage: {0} TEXT...'.format(basename(argv[0]))) |
|
+def load_dictionary(dictionary, dictionary_file): |
+ dmsg('Loading dictionary '.format(dictionary_file), end='', min_level=1) |
+ |
+ chdir(dirname(realpath(__file__))) |
+ |
+ pickle_file = basename(dictionary_file) + '.pickle' |
+ |
+ try: |
+ pickle_mtime = stat(pickle_file).st_mtime |
+ except FileNotFoundError: |
+ pickle_mtime = None |
+ |
+ if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime: |
+ dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1) |
+ with open(dictionary_file) as f: |
+ keys = "ipa|en|lit|pos|com|tag|ex" |
+ indent = None |
+ value = None |
+ |
+ for line in f: |
+ m = match(r'^\s*vuh:\s*(?P<phrase>.+)', line) |
+ if m is not None: |
+ phrase = m.group("phrase") |
+ dictionary[phrase] = {} |
+ indent = None |
+ else: |
+ m = match(r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(keys), line) |
+ if m is not None: |
+ indent = m.group("indent") |
+ key = m.group("key") |
+ value = m.group("value") |
+ value = dictionary[phrase][key] = MutableString2(value) |
+ elif indent is not None: |
+ m = match(r'(?P<indent>\s+)(?P<continuation>\S.*)', line) |
+ if m is not None: |
+ if len(m.group("indent")) == len(indent) + 2: |
+ dictionary[phrase][key] += (" " + m.group("continuation")) |
+ |
+ dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1) |
+ # TODO: Pickle should only contain strings to be small |
+ with open(pickle_file, mode='wb') as f: dump(dictionary, f) |
+ dmsg(' done.', min_level=1) |
+ else: |
+ dmsg('from {0} ...'.format(pickle_file), end='', min_level=1) |
+ with open(pickle_file, mode='rb') as f: pickle = load(f) |
+ for key, value in pickle.items(): |
+ dictionary[key] = value |
+ |
+ dmsg(' done ({0} entries).'.format(len(dictionary)), min_level=1) |
+ |
+def clean_dictionary(dictionary): |
+ braces_re = compile(r'^\s*\{(.+)\}\s*$') |
+ semicolon_re = compile(r'\s*;\s*') |
+ |
+ for orig_phrase, data in list(dictionary.items()): |
+ if orig_phrase.find(";") > -1: |
+ synonyms = map( |
+ lambda x: sub(braces_re, r'\1', orig_phrase), |
+ split(semicolon_re, orig_phrase)) |
+ |
+ for synonym in synonyms: |
+ dictionary[synonym] = deepcopy(data) |
+ |
+ del dictionary[orig_phrase] |
+ else: |
+ m = match(braces_re, orig_phrase) |
+ if m is not None: |
+ dictionary[m.group(1)] = deepcopy(dictionary[orig_phrase]) |
+ del dictionary[orig_phrase] |
+ |
+def sort_dict_alnum_english_key(phrase): |
+ return sub(r'\{(.+)\}', r'\1', phrase[0]).lower() |
+ |
def get_sort_dict_alnum_vulcan_key(): |
letters = list(map(str.lower, [ |
" ", 'S', 'T', 'P', 'K', 'R', 'L', 'A', 'Sh', 'O', 'U', 'D', |
@@ -54,36 +173,10 @@ |
|
return cmp_to_key(sort_dict_alnum_vulcan) |
|
-class VulcanDictionary(Dictionary): |
- def translate (self, phrase, search_prefix=True, search_plural=True): |
- dictionary = self |
- |
+def translate (phrase): |
translation = dictionary.get(phrase.lower(), None) |
if translation is not None: |
- translation['vuh'] = phrase |
return translation |
- else: |
- if search_prefix: |
- # find prefix |
- for preposition in prepositions: |
- prefix = match(escape(preposition), phrase) |
- if prefix is not None: |
- prefix_translation = self.translate(prefix.group(0)) |
- if prefix_translation is not None: |
- tail = sub(preposition, '', phrase) |
- tail_translation = self.translate(tail, search_prefix=False) |
- if tail_translation is not None: |
- return [prefix_translation, tail_translation] |
- elif search_plural: |
- # find plural |
- suffix = search(r'lar$', phrase) |
- if suffix is not None: |
- head = sub(r'lar$', '', phrase) |
- head_translation = self.translate(head, search_prefix=False, search_plural=False) |
- if head_translation is not None: |
- head_translation = dict(head_translation) |
- head_translation['en'] += ' (pl.)' |
- return head_translation |
|
return None |
|
@@ -95,9 +188,8 @@ |
|
text = argv[1] |
|
- dictionary = VulcanDictionary(dictionary) |
- dictionary.load('vuh-gol-en.dict.zdb.txt') |
- dictionary.clean() |
+ load_dictionary(dictionary, 'vuh-gol-en.dict.zdb.txt') |
+ clean_dictionary(dictionary) |
|
# try: |
# for phrase, data in OrderedDict(sorted( |
@@ -108,13 +200,12 @@ |
# except BrokenPipeError: |
# pass |
|
- dmsg("text:", text, min_level=2) |
sentences = findall(r'(?!\s+)(?:.+?\.{1,3}|.+$)', text, DOTALL) |
dmsg("sentences:", sentences, min_level=2) |
for sentence in sentences: |
dmsg("sentence:", sentence, min_level=2) |
|
- clauses = findall(r'(?!\s+)(?:.+?(?:\s+-\s*|\s*[–—]\s*|\.{1,3}|.+$))', sentence, DOTALL) |
+ clauses = split(r'\s+[-–—]\s+', sentence) |
dmsg("clauses:", clauses, min_level=2) |
for clause in clauses: |
dmsg("clause:", clause, min_level=2) |
@@ -126,19 +217,18 @@ |
while offset < len(words): |
translation = None |
|
- for i in range(len(words), offset, -1): |
- dmsg("words[{0}:{1}] = {2}".format(offset, i, words[offset:i]), min_level=2) |
+ for i in reversed(range(offset + 1, len(words) + 1)): |
phrase = ' '.join(words[offset:i]) |
|
dmsg("phrase:", phrase, min_level=2) |
|
- translation = dictionary.translate(phrase) |
+ translation = translate(phrase) |
|
if translation is not None: |
dmsg("phrase-translation:", translation, min_level=2) |
dmsg("words[{0}:{1}] = [\"{2}\"]".format(offset, i, translation), min_level=2) |
words[offset:i] = [translation] |
- offset += i - offset |
+ offset += i - 1 |
break |
|
if translation is None: |