''' Created on 2014-10-20 @author: Thomas 'PointedEars' Lahn ''' from sys import argv, stderr from re import findall, DOTALL, IGNORECASE, match, sub, compile, \ split from os import chdir from os.path import dirname, realpath from collections import OrderedDict from functools import cmp_to_key from copy import deepcopy from collections.abc import MutableSequence dictionary = {} prepositions = { "fi'": 'on', "na'": 'at|to', "t'": 'of' } class MutableString2(MutableSequence): def __init__(self, value=None): self._values = [str(value)] if value is not None else [] def __add__(self, value): return ''.join([self, value]) def __delitem__(self): raise NotImplementedError def __getitem__(self, index): return str(self)[index] def __len__(self): return len(str(self)) def __repr__(self): return ''.join(self._values) def __setitem__(self, index, value): raise NotImplementedError def __str__(self): return self.__repr__() def extend(self, values): self._values.append(values) def insert(self, index, value): raise NotImplementedError def load_dictionary(dictionary, dictionary_file): print('Loading dictionary {0} ...'.format(dictionary_file), end='', file=stderr) chdir(dirname(realpath(__file__))) with open(dictionary_file) as f: keys = "ipa|en|lit|pos|com|tag|ex" indent = None value = None for line in f: m = match(r'^\s*vuh:\s*(?P.+)', line) if m is not None: phrase = m.group("phrase") dictionary[phrase] = {} indent = None else: m = match( r'(?P\s*)(?P{0}):\s*(?P.+)'.format(keys), line) if m is not None: indent = m.group("indent") key = m.group("key") value = m.group("value") value = dictionary[phrase][key] = MutableString2(value) elif indent is not None: m = match(r'(?P\s+)(?P\S.*)', line) if m is not None: if len(m.group("indent")) == len(indent) + 2: dictionary[phrase][key] += (" " + m.group("continuation")) print(' done ({0} entries).'.format(len(dictionary)), file=stderr) def clean_dictionary(dictionary): braces_re = compile(r'^\s*\{(.+)\}\s*$') semicolon_re = compile(r'\s*;\s*') for orig_phrase, data in list(dictionary.items()): if orig_phrase.find(";") > -1: synonyms = map( lambda x: sub(braces_re, r'\1', orig_phrase), split(semicolon_re, orig_phrase)) for synonym in synonyms: dictionary[synonym] = deepcopy(data) del dictionary[orig_phrase] else: m = match(braces_re, orig_phrase) if m is not None: dictionary[m.group(1)] = deepcopy(dictionary[orig_phrase]) del dictionary[orig_phrase] def sort_dict_alnum_english_key(phrase): return sub(r'\{(.+)\}', r'\1', phrase[0]).lower() def get_sort_dict_alnum_vulcan_key(): letters = list(map(str.lower, [ " ", 'S', 'T', 'P', 'K', 'R', 'L', 'A', 'Sh', 'O', 'U', 'D', 'V', 'Kh', 'E', 'H', 'G', 'Ch', 'I', 'N', 'Zh', 'M', 'Y', 'F', 'Z', 'Th', 'W', 'B', "'", '-'])) letter_values = dict(map(lambda x: (x[1], x[0]), enumerate(letters))) letters_re = compile(r'(?:{0})'.format('|'.join(sorted(letters, key=lambda char:-len(char))))) def sort_dict_alnum_vulcan (a, b): # split into Vulcan letters a = findall(letters_re, sort_dict_alnum_english_key(a)) b = findall(letters_re, sort_dict_alnum_english_key(b)) if len(a) < len(b): for index, char in enumerate(a): diff = letter_values[char] - letter_values[b[index]] if diff != 0: return diff return -1 # len(b) <= len(a) for index, char in enumerate(b): diff = letter_values[a[index]] - letter_values[char] if diff != 0: return diff return 1 if len(b) < len(a) else 0 return cmp_to_key(sort_dict_alnum_vulcan) def translate (word, recursion=False): translation = dictionary.get(word.lower(), None) if translation is not None: translation = translation["en"] if match('[A-Z]', word): return sub('[a-z]', lambda ch: ch.group(0).upper(), str(translation), count=1) return translation if not recursion: # prepositions attached? for prep, prep_transl in prepositions.items(): if (match(prep, word)): real_word = word.replace(r'^' + prep, '') real_word_transl = translate(real_word, recursion=True) if real_word_transl is not None: return prep_transl + ' ' + real_word_transl if recursion: return None else: # Not in dictionary: proper name or missing for other reasons return '{{{0}}}'.format(word) if __name__ == '__main__': load_dictionary(dictionary, 'vuh-gol-en.dict.zdb.txt') clean_dictionary(dictionary) # try: # for phrase, data in OrderedDict(sorted( # dictionary.items(), # key=get_sort_dict_alnum_vulcan_key() # )).items(): # print(phrase, "=", data) # except BrokenPipeError: # pass text = argv[1] sentences = findall(r'(?!\s+)(?:.+?\.{1,3}|.+$)', text, DOTALL) for sentence in sentences: print(sentence) words = findall(r"(?!\s+)[a-z'-]{2,}", sentence, IGNORECASE) print(words) translated_words = list(map(translate, words)) print(translated_words) for index, word in enumerate(words): sentence = sentence.replace(word, str(translated_words[index])) print(sentence) # replace punctuation for symbol, replacement in ({" - ": ", "}).items(): sentence = sentence.replace(symbol, replacement) print(sentence)