Rev 291 | Rev 293 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed
| Rev | Author | Line No. | Line |
|---|---|---|---|
| 292 | PointedEar | 1 | #!/usr/bin/env python3 |
| 2 | |||
| 291 | PointedEar | 3 | ''' |
| 4 | Created on 2014-10-20 |
||
| 5 | |||
| 6 | @author: Thomas 'PointedEars' Lahn <mail@PointedEars.de> |
||
| 7 | ''' |
||
| 8 | from sys import argv, stderr |
||
| 9 | from re import findall, DOTALL, IGNORECASE, match, sub, compile, \ |
||
| 10 | split |
||
| 292 | PointedEar | 11 | from os import chdir, stat |
| 12 | from os.path import dirname, realpath, basename |
||
| 291 | PointedEar | 13 | from collections import OrderedDict |
| 14 | from functools import cmp_to_key |
||
| 15 | from copy import deepcopy |
||
| 16 | from collections.abc import MutableSequence |
||
| 292 | PointedEar | 17 | from pickle import dump, load |
| 291 | PointedEar | 18 | |
# Verbosity threshold consulted by dmsg(): a message is printed when
# debug_level >= its min_level.
debug_level = 2
# Global translation dictionary, filled by load_dictionary() and
# normalised by clean_dictionary(): phrase -> {key: value} entry data.
dictionary = {}

# Vulcan prepositions with their English translations ('|' separates
# alternatives).  NOTE(review): not referenced anywhere in this chunk —
# presumably reserved for future use; confirm before removing.
prepositions = {
    "fi'": 'on',
    "na'": 'at|to',
    "t'": 'of'
}
||
| 27 | |||
def dmsg(*args, **kwargs):
    '''
    Print a debug message when the global debug_level is at least
    min_level (default: 1).

    Keyword-only options (consumed here, everything else is passed
    through to print()):
      min_level -- minimum debug_level required for output (default 1)
      file      -- output stream (default: stderr)
    '''
    # BUG FIX: hasattr(kwargs, 'min_level') tested for an *attribute* on
    # the kwargs dict object itself, which is always False, so any
    # caller-supplied min_level (and file) was unconditionally
    # overwritten.  Test for the dict *key* instead.
    if kwargs.get('min_level') is None:
        kwargs['min_level'] = 1

    if 'file' not in kwargs:
        kwargs['file'] = stderr

    if debug_level >= kwargs['min_level']:
        # min_level is not a print() keyword; drop it before delegating.
        del kwargs['min_level']
        print(*args, **kwargs)
| 38 | |||
class MutableString2(MutableSequence):
    '''
    A cheaply appendable string: fragments are collected in a list and
    only joined when the text is actually needed (str(), len(),
    indexing).

    `s += "more"` works through MutableSequence.__iadd__, which calls
    extend() and therefore appends the whole fragment in O(1) instead
    of copying the accumulated text.
    '''

    def __init__(self, value=None):
        # Seed with the initial fragment, or start empty.
        self._values = [str(value)] if value is not None else []

    def __add__(self, value):
        # BUG FIX: ''.join([self, value]) always raised TypeError
        # because str.join() accepts only str instances.  Concatenate
        # the rendered strings instead.
        return str(self) + str(value)

    def __delitem__(self, index):
        # BUG FIX: the override previously omitted the index parameter,
        # so `del s[i]` raised TypeError rather than the intended
        # NotImplementedError.  Deletion is deliberately unsupported.
        raise NotImplementedError

    def __getitem__(self, index):
        # Index/slice into the joined text.
        return str(self)[index]

    def __len__(self):
        return len(str(self))

    def __repr__(self):
        # The joined fragments; __str__() delegates here.
        return ''.join(self._values)

    def __setitem__(self, index, value):
        raise NotImplementedError

    def __str__(self):
        return self.__repr__()

    def extend(self, values):
        # NOTE: intentionally appends *values* as ONE fragment (not
        # element-wise as MutableSequence.extend normally would) so
        # that `s += "text"` concatenates the whole string.
        self._values.append(values)

    def insert(self, index, value):
        raise NotImplementedError
| 69 | |||
def cli_help():
    '''
    Print a one-line usage summary, naming this script after argv[0].
    '''
    script = basename(argv[0])
    print('Usage: {0} TEXT...'.format(script))
| 72 | |||
def load_dictionary(dictionary, dictionary_file):
    '''
    Populate *dictionary* (phrase -> {key: MutableString2}) from the
    text database *dictionary_file*, caching the parsed result in a
    pickle file next to this script.

    The text file is re-parsed only when it is newer than the pickle;
    otherwise the pickle is loaded and merged into *dictionary*.
    '''
    # NOTE(review): the .format() call has no placeholder in the string,
    # so dictionary_file is silently dropped here; the name is printed
    # by the 'from {0} ...' messages below instead.
    dmsg('Loading dictionary '.format(dictionary_file), end='', min_level=1)

    # Resolve relative paths against the script's own directory.
    chdir(dirname(realpath(__file__)))

    pickle_file = basename(dictionary_file) + '.pickle'

    # Missing pickle cache is expected on first run.
    try:
        pickle_mtime = stat(pickle_file).st_mtime
    except FileNotFoundError:
        pickle_mtime = None

    if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:
        # Cache absent or stale: parse the text database.
        dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)
        with open(dictionary_file) as f:
            # Recognised entry keys (regex alternation used below).
            keys = "ipa|en|lit|pos|com|tag|ex"
            indent = None
            value = None

            for line in f:
                # 'vuh: <phrase>' starts a new dictionary entry.
                m = match(r'^\s*vuh:\s*(?P<phrase>.+)', line)
                if m is not None:
                    phrase = m.group("phrase")
                    dictionary[phrase] = {}
                    indent = None
                else:
                    # '<key>: <value>' adds a field to the current entry.
                    # NOTE(review): if a key line precedes any 'vuh:'
                    # line, 'phrase' is unbound here — assumes a
                    # well-formed input file.
                    m = match(r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(keys), line)
                    if m is not None:
                        indent = m.group("indent")
                        key = m.group("key")
                        value = m.group("value")
                        value = dictionary[phrase][key] = MutableString2(value)
                    elif indent is not None:
                        # Continuation line: indented two columns deeper
                        # than the key line it extends.
                        m = match(r'(?P<indent>\s+)(?P<continuation>\S.*)', line)
                        if m is not None:
                            if len(m.group("indent")) == len(indent) + 2:
                                # += appends via MutableString2.extend().
                                dictionary[phrase][key] += (" " + m.group("continuation"))

        dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)
        # TODO: Pickle should only contain strings to be small
        with open(pickle_file, mode='wb') as f: dump(dictionary, f)
        dmsg(' done.', min_level=1)
    else:
        # Cache is fresh: load the pickle and merge it in place, so the
        # caller's dict object keeps its identity.
        dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)
        with open(pickle_file, mode='rb') as f: pickle = load(f)
        for key, value in pickle.items():
            dictionary[key] = value

    dmsg(' done ({0} entries).'.format(len(dictionary)), min_level=1)
| 122 | |||
def clean_dictionary(dictionary):
    '''
    Normalise dictionary keys in place.

    A key of the form 'a; b; c' is split on semicolons and each part
    becomes its own entry (with a deep copy of the data); surrounding
    '{...}' braces are stripped from keys in either case.
    '''
    braces_re = compile(r'^\s*\{(.+)\}\s*$')
    semicolon_re = compile(r'\s*;\s*')

    # Iterate over a snapshot, as entries are added/removed below.
    for orig_phrase, data in list(dictionary.items()):
        if orig_phrase.find(";") > -1:
            # BUG FIX: the lambda substituted on orig_phrase instead of
            # its own argument, so every "synonym" was the whole
            # unsplit phrase.  Strip braces from each split part.
            synonyms = map(
                lambda x: sub(braces_re, r'\1', x),
                split(semicolon_re, orig_phrase))

            # Each synonym gets an independent copy of the entry data.
            for synonym in synonyms:
                dictionary[synonym] = deepcopy(data)

            del dictionary[orig_phrase]
        else:
            # Single phrase: only unwrap surrounding braces, if any.
            m = match(braces_re, orig_phrase)
            if m is not None:
                dictionary[m.group(1)] = deepcopy(dictionary[orig_phrase])
                del dictionary[orig_phrase]
||
| 142 | |||
def sort_dict_alnum_english_key(phrase):
    '''
    Case-insensitive sort key for a (phrase, data) dictionary item:
    the phrase text with any '{...}' wrapper unwrapped, lowercased.
    '''
    unwrapped = sub(r'\{(.+)\}', r'\1', phrase[0])
    return unwrapped.lower()
||
| 145 | |||
def get_sort_dict_alnum_vulcan_key():
    '''
    Build a sort key function ordering (phrase, data) items by the
    Vulcan alphabet.

    Phrases are tokenised into Vulcan letters (digraphs like 'sh',
    'kh', 'ch', 'zh', 'th' count as single letters) and compared
    letter by letter; a phrase that is a prefix of another sorts
    first.
    '''
    alphabet = [s.lower() for s in (
        " ", 'S', 'T', 'P', 'K', 'R', 'L', 'A', 'Sh', 'O', 'U', 'D',
        'V', 'Kh', 'E', 'H', 'G', 'Ch', 'I', 'N', 'Zh', 'M', 'Y', 'F', 'Z',
        'Th', 'W', 'B', "'", '-')]
    # Position in the alphabet defines each letter's collation rank.
    rank = {letter: position for position, letter in enumerate(alphabet)}
    # Longest alternatives first so digraphs win over single letters.
    tokenizer = compile(r'(?:{0})'.format(
        '|'.join(sorted(alphabet, key=len, reverse=True))))

    def compare(a, b):
        # Split each phrase into Vulcan letters.
        left = findall(tokenizer, sort_dict_alnum_english_key(a))
        right = findall(tokenizer, sort_dict_alnum_english_key(b))

        # Letter-by-letter comparison over the common prefix.
        for la, lb in zip(left, right):
            diff = rank[la] - rank[lb]
            if diff != 0:
                return diff

        # Common prefix equal: the shorter phrase sorts first.
        return (len(left) > len(right)) - (len(left) < len(right))

    return cmp_to_key(compare)
||
| 175 | |||
def translate(phrase):
    '''
    Look up *phrase* case-insensitively in the global dictionary.

    Returns the entry's data, or None when the phrase is unknown.
    '''
    return dictionary.get(phrase.lower())
| 291 | PointedEar | 182 | |
if __name__ == '__main__':
    # Require at least one positional argument (the text to translate).
    if len(argv) < 2:
        print('Nothing to translate.', end='\n\n', file=stderr)
        cli_help()
        exit(1)

    text = argv[1]

    # Fill and normalise the global dictionary.
    load_dictionary(dictionary, 'vuh-gol-en.dict.zdb.txt')
    clean_dictionary(dictionary)

    # try:
    #     for phrase, data in OrderedDict(sorted(
    #             dictionary.items(),
    #             key=get_sort_dict_alnum_vulcan_key()
    #         )).items():
    #         print(phrase, "=", data)
    # except BrokenPipeError:
    #     pass

    # Split the input into sentences: a run ending in 1-3 dots, or the
    # trailing remainder; the lookahead skips leading whitespace.
    sentences = findall(r'(?!\s+)(?:.+?\.{1,3}|.+$)', text, DOTALL)
    dmsg("sentences:", sentences, min_level=2)
    for sentence in sentences:
        dmsg("sentence:", sentence, min_level=2)

        # Clauses are separated by a spaced hyphen/en-dash/em-dash.
        clauses = split(r'\s+[-–—]\s+', sentence)
        dmsg("clauses:", clauses, min_level=2)
        for clause in clauses:
            dmsg("clause:", clause, min_level=2)

            # Words: runs of characters that are not whitespace or dots.
            words = findall(r'[^\s.]+', clause)
            dmsg("words:", words, min_level=2)

            # Greedy longest-match translation: starting at 'offset',
            # try the longest phrase first and shrink until a
            # dictionary entry is found.
            offset = 0
            while offset < len(words):
                translation = None

                for i in reversed(range(offset + 1, len(words) + 1)):
                    phrase = ' '.join(words[offset:i])

                    dmsg("phrase:", phrase, min_level=2)

                    translation = translate(phrase)

                    if translation is not None:
                        dmsg("phrase-translation:", translation, min_level=2)
                        dmsg("words[{0}:{1}] = [\"{2}\"]".format(offset, i, translation), min_level=2)
                        # Replace the matched span with its translation.
                        words[offset:i] = [translation]
                        # NOTE(review): after the slice assignment the
                        # translated span occupies ONE slot at 'offset',
                        # so the next untranslated word is at offset+1;
                        # 'offset += i - 1' overshoots whenever
                        # offset > 0 (and does not advance for a
                        # single-word match at offset 0) — verify
                        # intended behaviour.
                        offset += i - 1
                        break

                if translation is None:
                    dmsg("phrase-translation:", translation, min_level=2)
                    offset += 1

            dmsg("words-translation:", words, min_level=2)