WebSVN - LCARS - Diff - Rev 293 and 294 - /trunk/tools/eazytrans/Dictionary.py


"""
Created on 2014-10-20

@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>

"""

from os import chdir, stat
from sys import stderr
from os.path import dirname, realpath, basename
from pickle import dump, load
from re import match, DOTALL, search, sub, split, compile
from copy import deepcopy

# Messages are printed when debug_level is at least their min_level
debug_level = 2

def dmsg(*args, **kwargs):
    """
    Print a debug message if the module-wide debug level permits it.

    Positional arguments and all remaining keyword arguments are passed
    through to print().

    Keyword arguments:
    min_level -- minimum value of debug_level required for the message
                 to be printed; 1 if omitted or None
    file      -- stream to write to; stderr if omitted

    """
    # kwargs is a dict, so options must be looked up by key; hasattr()
    # tests for attributes and never found either option, which made
    # the defaults below override whatever the caller had passed.
    min_level = kwargs.pop('min_level', None)
    if min_level is None:
        min_level = 1

    kwargs.setdefault('file', stderr)

    if debug_level >= min_level:
        print(*args, **kwargs)

def sort_dict_alnum_english_key(phrase):
    """
    Return the lower-cased first element of phrase without its braces.

    Every "{...}" span in phrase[0] is replaced by its content before
    the result is lower-cased.  Presumably meant as a key function for
    sorting (phrase, data) items -- confirm against the callers.

    """
    headword = phrase[0]
    unbraced = sub(r'\{(.+)\}', lambda found: found.group(1), headword)
    return unbraced.lower()

class Dictionary(dict):
    """
    Phrase dictionary that maps each phrase to a dict of its fields.

    Instances are filled by load() from a dictionary text file (or from
    a pickle cache of it) and can be normalized afterwards with clean().

    """
    # Alternation of the field names recognized in dictionary files;
    # interpolated into a regular expression by the parser
    _keys = "ipa|en|lit|pos|com|tag|ex"
    # Not referenced by the methods below
    _expressions = {}

    def load (self, dictionary_file, language_key='en'):
        """
        Fill this dictionary from a dictionary file or its pickle cache.

        The working directory is changed to the directory of this module
        first, so a relative dictionary_file is resolved against that
        directory and the cache file "<basename>.pickle" is kept there.
        The text file is parsed, and the cache rewritten, if the cache
        is missing or older than the text file; otherwise the entries
        are read from the cache.

        Arguments:
        dictionary_file -- path of the dictionary text file
        language_key    -- field name that starts a new entry; it is
                           inserted into a regular expression verbatim

        """
        dmsg('Loading dictionary ', end='', min_level=1)

        chdir(dirname(realpath(__file__)))

        # NOTE(review): the cache name depends on the file name only, so
        # a cache written for one language_key is reused for any other.
        pickle_file = basename(dictionary_file) + '.pickle'

        try:
            pickle_mtime = stat(pickle_file).st_mtime
        except FileNotFoundError:
            pickle_mtime = None

        if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:
            dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)
            with open(dictionary_file) as f:
                self._parse_lines(f, language_key)

            dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)
            # TODO: Pickle should only contain strings to be small
            with open(pickle_file, mode='wb') as f:
                dump(self, f)
            dmsg(' done.', min_level=1)
        else:
            dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)
            # NOTE: unpickling can execute arbitrary code; this is only
            # safe as long as the cache file comes from a trusted source.
            with open(pickle_file, mode='rb') as f:
                cached = load(f)
            self.update(cached)

        dmsg(' done ({0} entries).'.format(len(self)), min_level=1)

    def _parse_lines (self, lines, language_key):
        """
        Add the entries found in an iterable of dictionary file lines.

        A line "<language_key>: <phrase>" starts a new entry.  A line
        "<key>: <value>" with one of the keys in _keys adds a field to
        the current entry.  A line indented by exactly two characters
        more than the preceding field line continues that field's value;
        the pieces are joined with single spaces.  All other lines,
        including field lines before the first phrase, are ignored.

        """
        phrase_re = compile(
            r'^\s*{0}:\s*(?P<phrase>.+)'.format(language_key))
        field_re = compile(
            r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(self._keys))
        continuation_re = compile(r'(?P<indent>\s+)(?P<continuation>\S.*)')

        entry = None    # fields of the entry being read
        key = None      # name of the field read last
        indent = None   # indentation of the field line read last
        # Pieces of a continued value.  A list is only created once a
        # continuation is found, so that single-line values stay plain
        # strings.
        parts = None

        for line in lines:
            m = phrase_re.match(line)
            if m is not None:
                # Finish the previous entry before switching; otherwise
                # its pending continuation would end up in the new one.
                if parts is not None:
                    entry[key] = ' '.join(parts)

                entry = self[m.group("phrase")] = {}
                key = indent = parts = None
                continue

            m = field_re.match(line)
            if m is not None:
                if entry is None:
                    # no phrase yet that the field could belong to
                    continue

                # join previous value if necessary
                if parts is not None:
                    entry[key] = ' '.join(parts)
                    parts = None

                indent = m.group("indent")
                key = m.group("key")
                entry[key] = m.group("value")
                continue

            if indent is None:
                continue

            m = continuation_re.match(line)
            if m is not None and len(m.group("indent")) == len(indent) + 2:
                if parts is None:
                    parts = [entry[key]]
                parts.append(m.group("continuation"))

        # join last value if necessary
        if parts is not None:
            entry[key] = ' '.join(parts)

    def clean (self):
        """
        Normalize the phrases that are used as keys, in place.

        A phrase containing ";" is replaced by one entry per synonym,
        each with its own deep copy of the data; empty synonyms (from a
        leading, trailing or doubled ";") are dropped.  A phrase or
        synonym that is entirely enclosed in braces loses the braces.

        """
        # TODO: expand optional parts "(...)" and alternations
        #       "(...|...)" in phrases
        braces_re = compile(r'^\s*\{(.+)\}\s*$', DOTALL)
        semicolon_re = compile(r'\s*;\s*')

        # iterate over a snapshot because entries are added and removed
        for orig_phrase, data in list(self.items()):
            if ';' in orig_phrase:
                for part in semicolon_re.split(orig_phrase):
                    synonym = braces_re.sub(r'\1', part)
                    if synonym:
                        self[synonym] = deepcopy(data)

                del self[orig_phrase]
            else:
                m = braces_re.match(orig_phrase)
                if m is not None:
                    self[m.group(1)] = deepcopy(data)
                    del self[orig_phrase]
 

Rev 293	Rev 294
1	"""	1	"""
2	Created on 2014-10-20	2	Created on 2014-10-20
3		3
4	@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>	4	@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>
5		5
6	"""	6	"""
7		7
8	from os import chdir, stat	8	from os import chdir, stat
9	from sys import stderr	9	from sys import stderr
10	from os.path import dirname, realpath, basename	10	from os.path import dirname, realpath, basename
11	from pickle import dump, load	11	from pickle import dump, load
12	from re import match, DOTALL, search, sub, split, compile	12	from re import match, DOTALL, search, sub, split, compile
13	from copy import deepcopy	13	from copy import deepcopy
14		14
15	debug_level = 2	15	debug_level = 2
16		16
17	def dmsg(*args, **kwargs):	17	def dmsg(*args, **kwargs):
18	if not hasattr(kwargs, 'min_level') or kwargs['min_level'] is None:	18	if not hasattr(kwargs, 'min_level') or kwargs['min_level'] is None:
19	kwargs['min_level'] = 1	19	kwargs['min_level'] = 1
20		20
21	if not hasattr(kwargs, 'file'):	21	if not hasattr(kwargs, 'file'):
22	kwargs['file'] = stderr	22	kwargs['file'] = stderr
23		23
24	if debug_level >= kwargs['min_level']:	24	if debug_level >= kwargs['min_level']:
25	del kwargs['min_level']	25	del kwargs['min_level']
26	print(*args, **kwargs)	26	print(*args, **kwargs)
27		27
28	def sort_dict_alnum_english_key(phrase):	28	def sort_dict_alnum_english_key(phrase):
29	return sub(r'\{(.+)\}', r'\1', phrase[0]).lower()	29	return sub(r'\{(.+)\}', r'\1', phrase[0]).lower()
30		30
31	class Dictionary(dict):	31	class Dictionary(dict):
32	"""	32	"""
33	classdocs	33	classdocs
34		34
35	"""	35	"""
36	_keys = "ipa|en|lit|pos|com|tag|ex"	36	_keys = "ipa|en|lit|pos|com|tag|ex"
37	_expressions = {}	37	_expressions = {}
38		38
39	def load (self, dictionary_file):	39	def load (self, dictionary_file, language_key='en'):
40	dictionary = self	40	dictionary = self
41		41
42	dmsg('Loading dictionary '.format(dictionary_file), end='', min_level=1)	42	dmsg('Loading dictionary '.format(dictionary_file), end='', min_level=1)
43		43
44	chdir(dirname(realpath(__file__)))	44	chdir(dirname(realpath(__file__)))
45		45
46	pickle_file = basename(dictionary_file) + '.pickle'	46	pickle_file = basename(dictionary_file) + '.pickle'
47		47
48	try:	48	try:
49	pickle_mtime = stat(pickle_file).st_mtime	49	pickle_mtime = stat(pickle_file).st_mtime
50	except FileNotFoundError:	50	except FileNotFoundError:
51	pickle_mtime = None	51	pickle_mtime = None
52		52
53	if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:	53	if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:
54	dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)	54	dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)
55	phrase = None	55	phrase = None
56	key = None	56	key = None
57	value = None	57	value = None
58	with open(dictionary_file) as f:	58	with open(dictionary_file) as f:
59	indent = None	59	indent = None
60		60
61	for line in f:	61	for line in f:
62	m = match(r'^\s*vuh:\s*(?P<phrase>.+)', line)	62	m = match(r'^\s*{0}:\s*(?P<phrase>.+)'.format(language_key), line)
63	if m is not None:	63	if m is not None:
64	phrase = m.group("phrase")	64	phrase = m.group("phrase")
65	dictionary[phrase] = {}	65	dictionary[phrase] = {}
66	indent = None	66	indent = None
67	else:	67	else:
68	m = match(r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(self._keys), line)	68	m = match(r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(self._keys), line)
69	if m is not None:	69	if m is not None:
70	# join previous value if necessary	70	# join previous value if necessary
71	if type(value) == list:	71	if type(value) == list:
72	dictionary[phrase][key] = ' '.join(value)	72	dictionary[phrase][key] = ' '.join(value)
73		73
74	indent = m.group("indent")	74	indent = m.group("indent")
75	key = m.group("key")	75	key = m.group("key")
76	value = m.group("value")	76	value = m.group("value")
77	# assign a string for memory efficiency	77	# assign a string for memory efficiency
78	dictionary[phrase][key] = value	78	dictionary[phrase][key] = value
79	elif indent is not None:	79	elif indent is not None:
80	m = match(r'(?P<indent>\s+)(?P<continuation>\S.*)', line)	80	m = match(r'(?P<indent>\s+)(?P<continuation>\S.*)', line)
81	if m is not None:	81	if m is not None:
82	if len(m.group("indent")) == len(indent) + 2:	82	if len(m.group("indent")) == len(indent) + 2:
83	continuation = m.group("continuation")	83	continuation = m.group("continuation")
84	if type(value) == str:	84	if type(value) == str:
85	# when a continuation is first found, convert to a list	85	# when a continuation is first found, convert to a list
86	# because there could be more continuations	86	# because there could be more continuations
87	value = dictionary[phrase][key] = [value, continuation]	87	value = dictionary[phrase][key] = [value, continuation]
88	else:	88	else:
89	value.append(continuation)	89	value.append(continuation)
90		90
91	# join last value if necessary	91	# join last value if necessary
92	if type(value) == list:	92	if type(value) == list:
93	dictionary[phrase][key] = ' '.join(value)	93	dictionary[phrase][key] = ' '.join(value)
94		94
95	dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)	95	dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)
96	# TODO: Pickle should only contain strings to be small	96	# TODO: Pickle should only contain strings to be small
97	with open(pickle_file, mode='wb') as f: dump(dictionary, f)	97	with open(pickle_file, mode='wb') as f: dump(dictionary, f)
98	dmsg(' done.', min_level=1)	98	dmsg(' done.', min_level=1)
99	else:	99	else:
100	dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)	100	dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)
101	with open(pickle_file, mode='rb') as f: pickle = load(f)	101	with open(pickle_file, mode='rb') as f: pickle = load(f)
102	for key, value in pickle.items():	102	for key, value in pickle.items():
103	dictionary[key] = value	103	dictionary[key] = value
104		104
105	dmsg(' done ({0} entries).'.format(len(dictionary)), min_level=1)	105	dmsg(' done ({0} entries).'.format(len(dictionary)), min_level=1)
106		106
107	def clean (self):	107	def clean (self):
108	dictionary = self	108	dictionary = self
109		109
110	parens_re = compile(r'\(.+\)', DOTALL)	110	parens_re = compile(r'\(.+\)', DOTALL)
111	braces_re = compile(r'^\s*\{(.+)\}\s*$', DOTALL)	111	braces_re = compile(r'^\s*\{(.+)\}\s*$', DOTALL)
112	semicolon_re = compile(r'\s*;\s*')	112	semicolon_re = compile(r'\s*;\s*')
113		113
114	for orig_phrase, data in list(dictionary.items()):	114	for orig_phrase, data in list(dictionary.items()):
115	# if there are optional or alternating parts	115	# if there are optional or alternating parts
116	if search(parens_re, orig_phrase):	116	if search(parens_re, orig_phrase):
117	if orig_phrase.find('|') > -1:	117	if orig_phrase.find('|') > -1:
118	# TODO alternation	118	# TODO alternation
119	pass	119	pass
120	else:	120	else:
121	# TODO optional parts	121	# TODO optional parts
122	pass	122	pass
123		123
124	if orig_phrase.find(';') > -1:	124	if orig_phrase.find(';') > -1:
125	synonyms = map(	125	synonyms = map(
126	lambda x: sub(braces_re, r'\1', x),	126	lambda x: sub(braces_re, r'\1', x),
127	split(semicolon_re, orig_phrase))	127	split(semicolon_re, orig_phrase))
128		128
129	for synonym in synonyms:	129	for synonym in synonyms:
130	dictionary[synonym] = deepcopy(data)	130	dictionary[synonym] = deepcopy(data)
131		131
132	del dictionary[orig_phrase]	132	del dictionary[orig_phrase]
133	else:	133	else:
134	m = match(braces_re, orig_phrase)	134	m = match(braces_re, orig_phrase)
135	if m is not None:	135	if m is not None:
136	dictionary[m.group(1)] = deepcopy(dictionary[orig_phrase])	136	dictionary[m.group(1)] = deepcopy(dictionary[orig_phrase])
137	del dictionary[orig_phrase]	137	del dictionary[orig_phrase]
138		138

Subversion Repositories LCARS

(root)/trunk/tools/eazytrans/Dictionary.py @ 295 - Rev 293 → 294