WebSVN - LCARS - Diff - Rev 297 and 298 - /trunk/tools/eazytrans/Dictionary.py


"""
Created on 2014-10-20

@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>

"""

from os import chdir, stat
from sys import stderr
from os.path import dirname, realpath, basename
from pickle import dump, load
from re import match, DOTALL, search, sub, split, compile

# Verbosity threshold: dmsg() prints a message only when the message's
# min_level does not exceed this value.
debug_level = 2

def dmsg(*args, **kwargs):
    """
    Print a debug message if the current debug level permits it.

    All positional and keyword arguments are passed through to print(),
    except for the keyword argument described below.

    :param min_level: lowest value of debug_level at which the message
        is printed; defaults to 1 if omitted or None.
    :type min_level: int
    :param file: stream to write to; defaults to sys.stderr.
    """
    # kwargs is a dict, so its keys must be looked up as keys;
    # hasattr() only sees attributes and would never find them.
    min_level = kwargs.pop('min_level', None)
    if min_level is None:
        min_level = 1

    # honour a stream supplied by the caller, fall back to stderr otherwise
    kwargs.setdefault('file', stderr)

    if debug_level >= min_level:
        print(*args, **kwargs)

def sort_dict_alnum_english_key(phrase):
    """
    Compute a sort key from the first element of a sequence.

    A brace-delimited part of that element has its braces removed and
    the result is lower-cased.

    :param phrase: sequence whose first element is the string to sort by
    :return: the lower-cased string without the matched pair of braces
    :rtype: str
    """
    headword = phrase[0]
    without_braces = sub(r'\{(.+)\}', r'\1', headword)
    return without_braces.lower()

class Dictionary(dict):
    """
    A Dictionary (not to be confused with its ancestor, dict)
    represents a word dictionary stored in a file.

    Plain entries are items of the mapping itself (phrase -> dict of
    fields).  Entries whose phrase contains parenthesized parts are
    converted to regular expressions by clean() and kept separately
    in _expressions.
    """
    # Field under which translate()/translate_expression() store the
    # looked-up phrase; also the line prefix that starts a new entry
    # in the dictionary file.
    _language_key = 'en'
    # Alternation of the field names recognized in the dictionary file.
    _keys = "ipa|en|lit|pos|com|tag|ex"
    # Fallback only, for instances that were created without __init__()
    # (e.g. by unpickling); regular instances get their own dict.
    _expressions = {}

    def __init__(self, *args, **kwargs):
        """
        Creates a dictionary; all arguments are passed on to dict.
        """
        super().__init__(*args, **kwargs)
        # per-instance store so that dictionaries do not leak
        # expression entries into one another
        self._expressions = {}

    def load(self, dictionary_file, language_key='en'):
        """
        Loads a word dictionary from a file.

        The parsed dictionary is cached as a pickle next to this module;
        the cache is used unless the dictionary file is newer than it.
        Note that the working directory is changed to the directory of
        this module as a side effect.

        :param dictionary_file: path of the dictionary file
        :type dictionary_file: str
        :param language_key: prefix of the lines that start a new entry
        :type language_key: str
        """
        self._language_key = language_key

        dmsg('Loading dictionary ', end='', min_level=1)

        chdir(dirname(realpath(__file__)))

        pickle_file = basename(dictionary_file) + '.pickle'

        try:
            pickle_mtime = stat(pickle_file).st_mtime
        except FileNotFoundError:
            pickle_mtime = None

        if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:
            dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)
            with open(dictionary_file) as f:
                self._parse_lines(f)

            dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)
            # TODO: Pickle should only contain strings to be small
            with open(pickle_file, mode='wb') as f:
                dump(self, f)
            dmsg(' done.', min_level=1)
        else:
            dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)
            # NOTE: unpickling executes whatever the file describes; this
            # is only acceptable because the cache is written by load()
            # itself.  Do not point this at files from untrusted sources.
            with open(pickle_file, mode='rb') as f:
                cached = load(f)
            for key, value in cached.items():
                self[key] = value

        dmsg(' done ({0} entries).'.format(len(self)), min_level=1)

    def _parse_lines(self, lines):
        """
        Parses the lines of a dictionary file into entries of this object.

        A line starting with the language key begins a new entry; a line
        starting with one of the known field names adds a field to the
        current entry; a line indented two characters deeper than the
        preceding field line continues the value of that field.
        Continuation lines are joined to the value with single spaces.

        :param lines: iterable of the lines to parse
        :raise KeyError: if a field line occurs before any entry line
        """
        re_phrase = compile(
            r'^\s*{0}:\s*(?P<phrase>.+)'.format(self._language_key))
        re_field = compile(
            r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(self._keys))
        re_continuation = compile(r'(?P<indent>\s+)(?P<continuation>\S.*)')

        phrase = None
        key = None
        value = None
        indent = None

        for line in lines:
            m = re_phrase.match(line)
            if m is not None:
                # finish the previous entry first; otherwise its pending
                # continuation lines would later be written to the new entry
                if isinstance(value, list):
                    self[phrase][key] = ' '.join(value)

                phrase = m.group("phrase")
                self[phrase] = {}
                key = value = indent = None
                continue

            m = re_field.match(line)
            if m is not None:
                # join previous value if necessary
                if isinstance(value, list):
                    self[phrase][key] = ' '.join(value)

                indent = m.group("indent")
                key = m.group("key")
                value = m.group("value")
                # assign a string for memory efficiency
                self[phrase][key] = value
                continue

            if indent is None:
                # no field line seen in this entry yet, nothing to continue
                continue

            m = re_continuation.match(line)
            if m is None or len(m.group("indent")) != len(indent) + 2:
                continue

            continuation = m.group("continuation")
            if isinstance(value, str):
                # when a continuation is first found, convert to a list
                # because there could be more continuations
                value = self[phrase][key] = [value, continuation]
            else:
                value.append(continuation)

        # join last value if necessary
        if isinstance(value, list):
            self[phrase][key] = ' '.join(value)

    def clean(self):
        """
        Cleans dictionary entries.

        * A phrase containing semicolons is split into synonyms that
          all refer to the same data; braces around a synonym are removed.
        * A phrase in braces is replaced by its content.  If the object
          has a callable clean_entry attribute, the content is passed
          through it first.  If the content contains parentheses, it is
          converted to a regular expression and moved to the expression
          entries: parenthesized parts without "|" become optional and
          "~" becomes a lookahead requiring one more character.
        """
        re_parens = compile(r'\(.+\)', DOTALL)
        re_parens_no_alt = compile(r'\(([^\|]+)\)', DOTALL)
        re_braces = compile(
            r'^\s*\{(?P<phrase>.+)\}(?:\s*\((?P<variant>.+?)\))?\s*$',
            DOTALL)
        re_semicolon = compile(r'\s*;\s*')
        has_clean_entry = callable(getattr(self, 'clean_entry', None))

        # iterate over a snapshot because entries are added and removed
        for orig_phrase, data in list(self.items()):
            # TODO alternation and optional parts in phrases without braces

            if ';' in orig_phrase:
                for synonym in re_semicolon.split(orig_phrase):
                    self[re_braces.sub(r'\1', synonym)] = data

                del self[orig_phrase]
                continue

            m = re_braces.match(orig_phrase)
            if m is None:
                continue

            phrase = m.group('phrase')
            if has_clean_entry:
                phrase = self.clean_entry(phrase)

            if re_parens.search(phrase) is not None:
                # alternation and optional parts
                expr = re_parens_no_alt.sub(r'(?:\1)?', phrase)
                expr = expr.replace('~', '(?=.)')
                self._expressions[expr] = data
            else:
                # remove braces
                self[phrase] = data

            del self[orig_phrase]

    def translate(self, phrase):
        """
        Translate a phrase according to this dictionary.
        For language-specific processing, this method should be
        called/overridden by inheriting classes.

        The phrase is looked up in lower case.  The returned object is
        the stored entry itself, in which the phrase as passed is
        recorded under the language key.

        :param phrase: the phrase to look up
        :type phrase: str
        :return: the entry for the phrase, or None if there is none
        :rtype: dict
        """
        translation = self.get(phrase.lower(), None)
        if translation is None:
            return None

        # NOTE: this modifies the stored entry, not a copy of it
        translation[self._language_key] = phrase
        return translation

    def translate_expression(self, phrase):
        """
        Translate a phrase according to entries in this dictionary
        based on regular expressions.

        An expression must match the entire phrase.  The returned object
        is the stored entry itself, in which the matched text is recorded
        under the language key.

        :param phrase: the phrase to look up
        :type phrase: str
        :return: the entry of the first matching expression, or None
        :rtype: dict
        """
        # NOTE(review): the sort key is the number of fields of an entry,
        # not the length of its expression -- confirm that this is intended.
        candidates = sorted(
            self._expressions.items(), key=lambda item: -len(item[1]))

        for expression, data in candidates:
            # group the expression so that "$" applies to all of its
            # alternatives, not just to the last one
            expression_match = match(r'(?:{0})$'.format(expression), phrase)
            if expression_match is not None:
                data[self._language_key] = expression_match.group(0)
                return data

        return None
 

Rev 297	Rev 298
1	"""	1	"""
2	Created on 2014-10-20	2	Created on 2014-10-20
3		3
4	@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>	4	@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>
5		5
6	"""	6	"""
7		7
8	from os import chdir, stat	8	from os import chdir, stat
9	from sys import stderr	9	from sys import stderr
10	from os.path import dirname, realpath, basename	10	from os.path import dirname, realpath, basename
11	from pickle import dump, load	11	from pickle import dump, load
12	from re import match, DOTALL, search, sub, split, compile	12	from re import match, DOTALL, search, sub, split, compile
13		13
14	debug_level = 2	14	debug_level = 2
15		15
16	def dmsg(args, *kwargs):	16	def dmsg(args, *kwargs):
17	if not hasattr(kwargs, 'min_level') or kwargs['min_level'] is None:	17	if not hasattr(kwargs, 'min_level') or kwargs['min_level'] is None:
18	kwargs['min_level'] = 1	18	kwargs['min_level'] = 1
19		19
20	if not hasattr(kwargs, 'file'):	20	if not hasattr(kwargs, 'file'):
21	kwargs['file'] = stderr	21	kwargs['file'] = stderr
22		22
23	if debug_level >= kwargs['min_level']:	23	if debug_level >= kwargs['min_level']:
24	del kwargs['min_level']	24	del kwargs['min_level']
25	print(args, *kwargs)	25	print(args, *kwargs)
26		26
27	def sort_dict_alnum_english_key(phrase):	27	def sort_dict_alnum_english_key(phrase):
28	return sub(r'\{(.+)\}', r'\1', phrase[0]).lower()	28	return sub(r'\{(.+)\}', r'\1', phrase[0]).lower()
29		29
30	class Dictionary(dict):	30	class Dictionary(dict):
31	"""	31	"""
32	A Dictionary (not to be confused with its ancestor, dict)	32	A Dictionary (not to be confused with its ancestor, dict)
33	represents a word dictionary stored in a file.	33	represents a word dictionary stored in a file.
34		34
35	"""	35	"""
36	_language_key = 'en'	36	_language_key = 'en'
37	_keys = "ipa\|en\|lit\|pos\|com\|tag\|ex"	37	_keys = "ipa\|en\|lit\|pos\|com\|tag\|ex"
38	_expressions = {}	38	_expressions = {}
39		39
40	def load (self, dictionary_file, language_key='en'):	40	def load (self, dictionary_file, language_key='en'):
41	"""	41	"""
42	Loads a word dictionary from a file.	42	Loads a word dictionary from a file.
43	:param dictionary_file:	43	:param dictionary_file:
44	:type dictionary_file:	44	:type dictionary_file:
45	:param language_key:	45	:param language_key:
46	:type language_key:	46	:type language_key:
47	"""	47	"""
48	self._language_key = language_key	48	self._language_key = language_key
49		49
50	dmsg('Loading dictionary '.format(dictionary_file), end='', min_level=1)	50	dmsg('Loading dictionary '.format(dictionary_file), end='', min_level=1)
51		51
52	chdir(dirname(realpath(__file__)))	52	chdir(dirname(realpath(__file__)))
53		53
54	pickle_file = basename(dictionary_file) + '.pickle'	54	pickle_file = basename(dictionary_file) + '.pickle'
55		55
56	try:	56	try:
57	pickle_mtime = stat(pickle_file).st_mtime	57	pickle_mtime = stat(pickle_file).st_mtime
58	except FileNotFoundError:	58	except FileNotFoundError:
59	pickle_mtime = None	59	pickle_mtime = None
60		60
61	if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:	61	if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:
62	dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)	62	dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)
63	phrase = None	63	phrase = None
64	key = None	64	key = None
65	value = None	65	value = None
66	with open(dictionary_file) as f:	66	with open(dictionary_file) as f:
67	indent = None	67	indent = None
68		68
69	for line in f:	69	for line in f:
70	m = match(r'^\s{0}:\s(?P<phrase>.+)'.format(self._language_key), line)	70	m = match(r'^\s{0}:\s(?P<phrase>.+)'.format(self._language_key), line)
71	if m is not None:	71	if m is not None:
72	phrase = m.group("phrase")	72	phrase = m.group("phrase")
73	self[phrase] = {}	73	self[phrase] = {}
74	indent = None	74	indent = None
75	else:	75	else:
76	m = match(r'(?P<indent>\s)(?P<key>{0}):\s(?P<value>.+)'.format(self._keys), line)	76	m = match(r'(?P<indent>\s)(?P<key>{0}):\s(?P<value>.+)'.format(self._keys), line)
77	if m is not None:	77	if m is not None:
78	# join previous value if necessary	78	# join previous value if necessary
79	if type(value) == list:	79	if type(value) == list:
80	self[phrase][key] = ' '.join(value)	80	self[phrase][key] = ' '.join(value)
81		81
82	indent = m.group("indent")	82	indent = m.group("indent")
83	key = m.group("key")	83	key = m.group("key")
84	value = m.group("value")	84	value = m.group("value")
85	# assign a string for memory efficiency	85	# assign a string for memory efficiency
86	self[phrase][key] = value	86	self[phrase][key] = value
87	elif indent is not None:	87	elif indent is not None:
88	m = match(r'(?P<indent>\s+)(?P<continuation>\S.*)', line)	88	m = match(r'(?P<indent>\s+)(?P<continuation>\S.*)', line)
89	if m is not None:	89	if m is not None:
90	if len(m.group("indent")) == len(indent) + 2:	90	if len(m.group("indent")) == len(indent) + 2:
91	continuation = m.group("continuation")	91	continuation = m.group("continuation")
92	if type(value) == str:	92	if type(value) == str:
93	# when a continuation is first found, convert to a list	93	# when a continuation is first found, convert to a list
94	# because there could be more continuations	94	# because there could be more continuations
95	value = self[phrase][key] = [value, continuation]	95	value = self[phrase][key] = [value, continuation]
96	else:	96	else:
97	value.append(continuation)	97	value.append(continuation)
98		98
99	# join last value if necessary	99	# join last value if necessary
100	if type(value) == list:	100	if type(value) == list:
101	self[phrase][key] = ' '.join(value)	101	self[phrase][key] = ' '.join(value)
102		102
103	dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)	103	dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)
104	# TODO: Pickle should only contain strings to be small	104	# TODO: Pickle should only contain strings to be small
105	with open(pickle_file, mode='wb') as f: dump(self, f)	105	with open(pickle_file, mode='wb') as f: dump(self, f)
106	dmsg(' done.', min_level=1)	106	dmsg(' done.', min_level=1)
107	else:	107	else:
108	dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)	108	dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)
109	with open(pickle_file, mode='rb') as f: pickle = load(f)	109	with open(pickle_file, mode='rb') as f: pickle = load(f)
110	for key, value in pickle.items():	110	for key, value in pickle.items():
111	self[key] = value	111	self[key] = value
112		112
113	dmsg(' done ({0} entries).'.format(len(self)), min_level=1)	113	dmsg(' done ({0} entries).'.format(len(self)), min_level=1)
114		114
115	def clean (self):	115	def clean (self):
116	"""	116	"""
117	Cleans dictionary entries	117	Cleans dictionary entries
118	"""	118	"""
119	re_parens = compile(r'\(.+\)', DOTALL)	119	re_parens = compile(r'\(.+\)', DOTALL)
120	re_parens_no_alt = compile(r'\(([^\\|]+)\)', DOTALL)	120	re_parens_no_alt = compile(r'\(([^\\|]+)\)', DOTALL)
121	re_braces = compile(	121	re_braces = compile(
122	r'^\s\{(?P<phrase>.+)\}(?:\s\((?P<variant>.+?)\))?\s*$',	122	r'^\s\{(?P<phrase>.+)\}(?:\s\((?P<variant>.+?)\))?\s*$',
123	DOTALL)	123	DOTALL)
124	re_semicolon = compile(r'\s;\s')	124	re_semicolon = compile(r'\s;\s')
125		125
126	for orig_phrase, data in list(self.items()):	126	for orig_phrase, data in list(self.items()):
127	# if there are optional or alternating parts	127	# if there are optional or alternating parts
128	if search(re_parens, orig_phrase):	128	if search(re_parens, orig_phrase):
129	if orig_phrase.find('\|') > -1:	129	if orig_phrase.find('\|') > -1:
130	# TODO alternation	130	# TODO alternation
131	pass	131	pass
132	else:	132	else:
133	# TODO optional parts	133	# TODO optional parts
134	pass	134	pass
135		135
136	if orig_phrase.find(';') > -1:	136	if orig_phrase.find(';') > -1:
137	synonyms = map(	137	synonyms = map(
138	lambda x: sub(re_braces, r'\1', x),	138	lambda x: sub(re_braces, r'\1', x),
139	split(re_semicolon, orig_phrase))	139	split(re_semicolon, orig_phrase))
140		140
141	for synonym in synonyms:	141	for synonym in synonyms:
142	self[synonym] = data	142	self[synonym] = data
143		143
144	del self[orig_phrase]	144	del self[orig_phrase]
145	else:	145	else:
146	m = match(re_braces, orig_phrase)	146	m = match(re_braces, orig_phrase)
147	if m is not None:	147	if m is not None:
148	phrase = m.group('phrase')	148	phrase = m.group('phrase')
149		149
150	if callable(getattr(self, 'clean_entry', None)):	150	if callable(getattr(self, 'clean_entry', None)):
151	phrase = self.clean_entry(phrase)	151	phrase = self.clean_entry(phrase)
152		152
153	m_parens = search(re_parens, phrase)	153	m_parens = search(re_parens, phrase)
154	if m_parens is not None:	154	if m_parens is not None:
155	# alternation and optional parts	155	# alternation and optional parts
156	expr = sub(re_parens_no_alt, r'(?:\1)?', phrase)	156	expr = sub(re_parens_no_alt, r'(?:\1)?', phrase)
157	expr = sub('~', '(?=.)', expr)	157	expr = sub('~', '(?=.)', expr)
158	self._expressions[expr] = data	158	self._expressions[expr] = data
159	else:	159	else:
160	# remove braces	160	# remove braces
161	self[phrase] = data	161	self[phrase] = data
162		162
163	del self[orig_phrase]	163	del self[orig_phrase]
164		164
165	def translate (self, phrase):	165	def translate (self, phrase):
166	"""	166	"""
167	Translate a phrase according to this dictionary.	167	Translate a phrase according to this dictionary.
168	For language-specific processing, this method should be	168	For language-specific processing, this method should be
169	called/overridden by inheriting classes.	169	called/overridden by inheriting classes.
170	:param phrase:	170	:param phrase:
171	:type phrase: str	171	:type phrase: str
172	"""	172	"""
173	translation = self.get(phrase.lower(), None)	173	translation = self.get(phrase.lower(), None)
174	if translation is not None:	174	if translation is not None:
175	translation[self._language_key] = phrase	175	translation[self._language_key] = phrase
176	return translation	176	return translation
177		177
178	return None	178	return None
179		179
180	def translate_expression (self, phrase):	180	def translate_expression (self, phrase):
181	"""	181	"""
182	Translate a phrase according entries in this dictionary	182	Translate a phrase according entries in this dictionary
183	based on regular expressions.	183	based on regular expressions.
184	:param phrase:	184	:param phrase:
185	:type phrase:	185	:type phrase:
186	"""	186	"""
187	for expression, data in sorted(self._expressions.items(), key=lambda item:-len(item[1])):	187	for expression, data in sorted(self._expressions.items(), key=lambda item:-len(item[1])):
188	expression_match = match(expression, phrase)	188	expression_match = match(r'{0}$'.format(expression), phrase)
189	if expression_match is not None:	189	if expression_match is not None:
190	data[self._language_key] = expression_match.group(0)	190	data[self._language_key] = expression_match.group(0)
191	return data	191	return data
192		192
193	return None	193	return None
194		194

Subversion Repositories LCARS

(root)/trunk/tools/eazytrans/Dictionary.py @ 293 - Rev 297 → 298