#!/usr/bin/env python3
'''
Created on 2014-10-20

@author: Thomas 'PointedEars' Lahn
'''

from sys import argv, stderr
from re import findall, DOTALL, IGNORECASE, match, sub, compile, \
    split
from os import chdir, stat
from os.path import dirname, realpath, basename
from collections import OrderedDict
from functools import cmp_to_key
from copy import deepcopy
from collections.abc import MutableSequence
from pickle import dump, load

# Messages whose ``min_level`` is greater than this value are suppressed.
debug_level = 2

# Headword -> {field name -> text}; filled by load_dictionary().
dictionary = {}

prepositions = {
    "fi'": 'on',
    "na'": 'at|to',
    "t'": 'of'
}


def dmsg(*args, **kwargs):
    """Print a debug message if the global debug level permits it.

    Accepts the same arguments as ``print()`` plus the keyword-only
    ``min_level`` (default 1): the message is printed only when
    ``debug_level >= min_level``.  Output goes to ``stderr`` unless the
    caller passes ``file``.
    """
    # kwargs is a dict, so membership must be tested with dict methods;
    # hasattr() on it never sees the keys.
    min_level = kwargs.pop('min_level', None)
    if min_level is None:
        min_level = 1

    kwargs.setdefault('file', stderr)

    if debug_level >= min_level:
        print(*args, **kwargs)


class MutableString2(MutableSequence):
    """Append-only string buffer.

    Chunks are collected in a list and joined on demand.  Only appending
    (``extend()`` / ``+=``) mutates the value; item assignment, deletion
    and insertion raise ``NotImplementedError``.
    """

    def __init__(self, value=None):
        self._values = [str(value)] if value is not None else []

    def __add__(self, value):
        """Return a new ``str``: this buffer's text followed by *value*."""
        return str(self) + str(value)

    def __delitem__(self, index):
        raise NotImplementedError

    def __getitem__(self, index):
        return str(self)[index]

    def __len__(self):
        return len(str(self))

    def __repr__(self):
        # Deliberately the plain text, not a quoted representation.
        return ''.join(self._values)

    def __setitem__(self, index, value):
        raise NotImplementedError

    def __str__(self):
        return self.__repr__()

    def extend(self, values):
        """Append *values* as one chunk (also used by ``+=``)."""
        self._values.append(str(values))

    def insert(self, index, value):
        raise NotImplementedError


def cli_help():
    """Print the command-line usage line to standard output."""
    print('Usage: {0} TEXT...'.format(basename(argv[0])))


def parse_dictionary(dictionary, lines):
    """Parse dictionary source *lines* into *dictionary* (in place).

    A ``vuh: PHRASE`` line starts a new entry.  Lines of the form
    ``KEY: VALUE`` (KEY being one of ipa, en, lit, pos, com, tag, ex) add
    a field to the current entry.  A following line indented exactly two
    columns deeper than the field line is appended to that field's value,
    separated by a single space.
    """
    keys = "ipa|en|lit|pos|com|tag|ex"
    phrase_re = compile(r'^\s*vuh:\s*(?P<phrase>.+)')
    field_re = compile(
        r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(keys))
    continuation_re = compile(r'(?P<indent>\s+)(?P<continuation>\S.*)')

    phrase = None
    key = None
    indent = None

    for line in lines:
        m = phrase_re.match(line)
        if m is not None:
            phrase = m.group("phrase")
            dictionary[phrase] = {}
            indent = None
            continue

        m = field_re.match(line)
        if m is not None:
            if phrase is None:
                # Field line before any headword: nothing to attach it to.
                continue
            indent = m.group("indent")
            key = m.group("key")
            dictionary[phrase][key] = MutableString2(m.group("value"))
            continue

        if indent is not None:
            m = continuation_re.match(line)
            if m is not None and len(m.group("indent")) == len(indent) + 2:
                dictionary[phrase][key] += (" " + m.group("continuation"))


def load_dictionary(dictionary, dictionary_file):
    """Fill *dictionary* from *dictionary_file* or from its pickle cache.

    Changes the working directory to the directory of this script.  The
    cache is ``basename(dictionary_file) + '.pickle'`` in that directory;
    it is used when it exists and is not older than the dictionary file,
    otherwise the dictionary file is parsed and the cache is rewritten.

    Raises ``FileNotFoundError`` if *dictionary_file* cannot be stat'ed
    when its modification time is needed.
    """
    dmsg('Loading dictionary ', end='', min_level=1)

    chdir(dirname(realpath(__file__)))

    pickle_file = basename(dictionary_file) + '.pickle'

    try:
        pickle_mtime = stat(pickle_file).st_mtime
    except FileNotFoundError:
        pickle_mtime = None

    if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:
        dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)
        with open(dictionary_file) as f:
            parse_dictionary(dictionary, f)

        dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='',
             min_level=1)
        # TODO: Pickle should only contain strings to be small
        with open(pickle_file, mode='wb') as f:
            dump(dictionary, f)
        dmsg(' done.', min_level=1)
    else:
        dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)
        # NOTE(security): unpickling executes code embedded in the file;
        # the cache must only ever be one written by this script.
        with open(pickle_file, mode='rb') as f:
            cached = load(f)
        dictionary.update(cached)

    dmsg(' done ({0} entries).'.format(len(dictionary)), min_level=1)


def clean_dictionary(dictionary):
    """Normalise the headwords of *dictionary* in place.

    * ``a; b`` headwords are split at the semicolons; every synonym gets
      its own independent copy of the entry and the combined headword is
      removed.
    * A headword (or synonym) wrapped in ``{...}`` loses the braces.
    """
    braces_re = compile(r'^\s*\{(.+)\}\s*$')
    semicolon_re = compile(r'\s*;\s*')

    # Iterate over a snapshot of the keys because entries are added and
    # removed along the way.
    for orig_phrase in list(dictionary):
        if ';' in orig_phrase:
            data = dictionary.pop(orig_phrase)
            for part in split(semicolon_re, orig_phrase):
                synonym = sub(braces_re, r'\1', part)
                if synonym:
                    dictionary[synonym] = deepcopy(data)
        else:
            m = match(braces_re, orig_phrase)
            if m is not None:
                dictionary[m.group(1)] = dictionary.pop(orig_phrase)


def sort_dict_alnum_english_key(phrase):
    """Return the sort key for a ``(headword, data)`` dictionary item.

    The key is the headword with ``{...}`` braces removed, lower-cased.
    """
    return sub(r'\{(.+)\}', r'\1', phrase[0]).lower()


def get_sort_dict_alnum_vulcan_key():
    """Return a ``sorted()`` key function ordering dictionary items by
    the Vulcan letter sequence listed below.

    Headwords are split into Vulcan letters (digraphs such as ``sh`` count
    as one letter); characters that are not in the letter list are
    ignored.  A headword that is a proper prefix of another sorts first.
    """
    letters = list(map(str.lower, [
        " ", 'S', 'T', 'P', 'K', 'R', 'L', 'A', 'Sh', 'O', 'U', 'D',
        'V', 'Kh', 'E', 'H', 'G', 'Ch', 'I', 'N', 'Zh', 'M', 'Y', 'F', 'Z',
        'Th', 'W', 'B', "'", '-']))
    letter_values = {letter: rank for rank, letter in enumerate(letters)}
    # Longest alternatives first so that digraphs win over single letters.
    letters_re = compile(r'(?:{0})'.format(
        '|'.join(sorted(letters, key=lambda char: -len(char)))))

    def sort_dict_alnum_vulcan(a, b):
        # split into Vulcan letters
        a = findall(letters_re, sort_dict_alnum_english_key(a))
        b = findall(letters_re, sort_dict_alnum_english_key(b))

        for char_a, char_b in zip(a, b):
            diff = letter_values[char_a] - letter_values[char_b]
            if diff != 0:
                return diff

        # Equal up to the shorter length: the shorter one sorts first.
        return len(a) - len(b)

    return cmp_to_key(sort_dict_alnum_vulcan)


def translate(phrase):
    """Return the dictionary entry for *phrase* (case-insensitive on the
    phrase side), or ``None`` if there is none."""
    return dictionary.get(phrase.lower())


def translate_words(words):
    """Return a copy of *words* with known phrases replaced by their
    dictionary entries.

    At each position the longest run of words that has a dictionary entry
    is replaced by that single entry; unknown words are kept unchanged.
    """
    words = list(words)
    offset = 0

    while offset < len(words):
        translation = None

        # Try the longest candidate phrase first.
        for i in reversed(range(offset + 1, len(words) + 1)):
            phrase = ' '.join(words[offset:i])
            dmsg("phrase:", phrase, min_level=2)

            translation = translate(phrase)
            if translation is not None:
                dmsg("phrase-translation:", translation, min_level=2)
                dmsg("words[{0}:{1}] = [\"{2}\"]".format(
                    offset, i, translation), min_level=2)
                words[offset:i] = [translation]
                break

        if translation is None:
            dmsg("phrase-translation:", translation, min_level=2)

        # Whether translated or not, exactly one element now occupies
        # position ``offset``; continue right after it.
        offset += 1

    return words


def main(args=None):
    """Translate the text given on the command line.

    *args* defaults to ``argv[1:]``; all arguments are joined with spaces
    to form the text.  Returns the process exit status: 1 if there is
    nothing to translate, otherwise 0.
    """
    args = argv[1:] if args is None else list(args)

    if not args:
        print('Nothing to translate.', end='\n\n', file=stderr)
        cli_help()
        return 1

    text = ' '.join(args)

    load_dictionary(dictionary, 'vuh-gol-en.dict.zdb.txt')
    clean_dictionary(dictionary)

#     try:
#         for phrase, data in OrderedDict(sorted(
#             dictionary.items(),
#             key=get_sort_dict_alnum_vulcan_key()
#         )).items():
#             print(phrase, "=", data)
#     except BrokenPipeError:
#         pass

    sentences = findall(r'(?!\s+)(?:.+?\.{1,3}|.+$)', text, DOTALL)
    dmsg("sentences:", sentences, min_level=2)

    for sentence in sentences:
        dmsg("sentence:", sentence, min_level=2)

        clauses = split(r'\s+[-–—]\s+', sentence)
        dmsg("clauses:", clauses, min_level=2)

        for clause in clauses:
            dmsg("clause:", clause, min_level=2)

            words = findall(r'[^\s.]+', clause)
            dmsg("words:", words, min_level=2)

            words = translate_words(words)
            dmsg("words-translation:", words, min_level=2)

    return 0


if __name__ == '__main__':
    raise SystemExit(main())