WebSVN - LCARS - Diff - Rev 298 and 300 - /trunk/tools/eazytrans/Dictionary.py


"""
Created on 2014-10-20

@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>

"""

from os import chdir, stat
from sys import stderr
from os.path import dirname, realpath, basename
from pickle import dump, load
from re import match, DOTALL, search, sub, split, compile

# Messages whose min_level exceeds this threshold are suppressed by dmsg().
debug_level = 2

def dmsg (*args, **kwargs):
    """
    Prints a debug message if the module's debug level permits it.

    All positional and keyword arguments are handed to print(),
    except for the keyword argument described below.  If no (truthy)
    ``file`` keyword argument is given, the message goes to stderr.

    :param min_level: Lowest value of debug_level at which the message
                      is printed (keyword-only, default: 1)
    :type min_level: int
    """
    threshold = kwargs.pop('min_level', 1)

    # fall back to stderr when the caller named no usable target stream
    kwargs['file'] = kwargs.get('file') or stderr

    if debug_level >= threshold:
        print(*args, **kwargs)

def sort_dict_alnum_english_key (phrase):
    """
    Returns a case-insensitive sort key for a dictionary item.

    The key is the first element of the argument, lower-cased, with
    one pair of curly braces removed around the text they enclose.

    :param phrase: Sequence whose first element is the phrase text
                   (presumably a ``(phrase, data)`` item; verify
                   against the callers)
    :type phrase: tuple
    """
    headword = phrase[0]
    unbraced = sub(r'\{(.+)\}', r'\1', headword)
    return unbraced.lower()

class Dictionary (dict):
    """
    A Dictionary (not to be confused with its ancestor, dict)
    represents a word dictionary stored in a file.

    Its keys are phrases; each value is a dict that maps field keys
    (see _keys) to strings.
    """
    # key that introduces a new entry (phrase) in the dictionary file
    _language_key = 'en'
    # regular-expression alternation of the field keys recognized in entries
    _keys = "ipa|en|lit|pos|com|tag|ex"
    # class-level fallback only: __init__() and clean() give every
    # instance a mapping of its own so that entries are never shared
    _expressions = {}

    def __init__ (self, *args, **kwargs):
        """
        Creates a word dictionary; all arguments are passed on to dict.
        """
        super().__init__(*args, **kwargs)
        # regular expression -> entry data, filled by clean()
        self._expressions = {}

    def load (self, dictionary_file, keys=None, language_key=None):
        """
        Loads a word dictionary from a file.

        The current working directory is changed to the directory of
        this module first.  The parsed entries are cached there in a
        pickle file named after dictionary_file; the cache is read
        instead of dictionary_file unless dictionary_file is newer.

        :param dictionary_file: Path of the dictionary file
        :type dictionary_file: str
        :param keys: Regular-expression alternation of the field keys
                     to recognize; None keeps the current setting
        :type keys: str
        :param language_key: Key that introduces a new entry;
                             None keeps the current setting
        :type language_key: str
        """
        if keys is not None:
            self._keys = keys

        if language_key is not None:
            self._language_key = language_key

        dmsg('Loading dictionary ', end='', min_level=1)

        chdir(dirname(realpath(__file__)))

        pickle_file = basename(dictionary_file) + '.pickle'

        try:
            pickle_mtime = stat(pickle_file).st_mtime
        except FileNotFoundError:
            pickle_mtime = None

        if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:
            dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)

            # NOTE(review): the file is decoded with the locale's default
            # encoding -- confirm whether an explicit encoding is needed
            with open(dictionary_file) as f:
                self._parse_lines(f)

            dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)

            # TODO: Pickle should only contain strings to be small
            with open(pickle_file, mode='wb') as f:
                dump(self, f)

            dmsg(' done.', min_level=1)
        else:
            dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)

            # NOTE: unpickling can execute arbitrary code, so the cache
            # file must only ever be writable by trusted users
            with open(pickle_file, mode='rb') as f:
                cached = load(f)

            for key, value in cached.items():
                self[key] = value

        dmsg(' done ({0} entries).'.format(len(self)), min_level=1)

    def _parse_lines (self, lines):
        """
        Parses the lines of a dictionary file into entries of this object.

        A line "<language key>: <phrase>" starts a new, empty entry.
        A line "<key>: <value>" with one of the keys in _keys adds a
        field to the current entry.  A line indented by two more
        characters than the preceding field line continues the value of
        that field; the parts are joined with single spaces.

        :param lines: Lines of the dictionary file
        :type lines: iterable of str
        :raises KeyError: if a field line precedes the first phrase line
        """
        re_phrase = compile(
            r'^\s*{0}:\s*(?P<phrase>.+)'.format(self._language_key))
        re_field = compile(
            r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(self._keys))
        re_continuation = compile(r'(?P<indent>\s+)(?P<continuation>\S.*)')

        phrase = None
        key = None
        # str while a field has one line, list once it has continuations
        value = None
        indent = None

        for line in lines:
            m = re_phrase.match(line)
            if m is not None:
                # join the pending value *before* switching entries,
                # otherwise it would end up in the new entry
                if isinstance(value, list):
                    self[phrase][key] = ' '.join(value)

                phrase = m.group("phrase")
                self[phrase] = {}
                key = value = indent = None
                continue

            m = re_field.match(line)
            if m is not None:
                # join previous value if necessary
                if isinstance(value, list):
                    self[phrase][key] = ' '.join(value)

                indent = m.group("indent")
                key = m.group("key")
                value = m.group("value")
                # assign a string for memory efficiency
                self[phrase][key] = value
                continue

            if indent is None:
                # no field seen in the current entry, nothing to continue
                continue

            m = re_continuation.match(line)
            if m is not None and len(m.group("indent")) == len(indent) + 2:
                continuation = m.group("continuation")
                if isinstance(value, str):
                    # when a continuation is first found, convert to a list
                    # because there could be more continuations
                    value = self[phrase][key] = [value, continuation]
                else:
                    value.append(continuation)

        # join last value if necessary
        if isinstance(value, list):
            self[phrase][key] = ' '.join(value)

    def clean (self):
        """
        Cleans dictionary entries

        * A phrase containing semicolons is replaced with one entry per
          semicolon-separated synonym; all synonyms refer to the same data.
        * A phrase wrapped in curly braces is replaced with its content.
          If that content contains parentheses, it is stored as a regular
          expression for translate_expression() instead of as a plain key.

        If this object has a callable attribute ``clean_entry``, it is
        called with the content of each braced phrase, and its return
        value is used in place of that content.
        """
        re_parens = compile(r'\(.+\)', DOTALL)
        re_parens_no_alt = compile(r'\(([^\|]+)\)', DOTALL)
        re_braces = compile(
            r'^\s*\{(?P<phrase>.+)\}(?:\s*\((?P<variant>.+?)\))?\s*$',
            DOTALL)
        re_semicolon = compile(r'\s*;\s*')

        # make sure expressions are stored per instance, even if __init__()
        # was bypassed (e.g. by a subclass or by unpickling)
        expressions = vars(self).setdefault('_expressions', {})
        clean_entry = getattr(self, 'clean_entry', None)

        # iterate over a snapshot because entries are added and removed
        for orig_phrase, data in list(self.items()):
            # TODO: alternation and optional parts (parentheses) in
            # phrases that are not wrapped in braces

            if ';' in orig_phrase:
                for synonym in split(re_semicolon, orig_phrase):
                    self[sub(re_braces, r'\1', synonym)] = data

                del self[orig_phrase]
                continue

            m = match(re_braces, orig_phrase)
            if m is None:
                continue

            phrase = m.group('phrase')

            if callable(clean_entry):
                phrase = clean_entry(phrase)

            if search(re_parens, phrase) is not None:
                # alternation and optional parts:
                # parentheses without "|" become optional groups
                expr = sub(re_parens_no_alt, r'(?:\1)?', phrase)
                # "~" requires that at least one more character follows
                expr = sub('~', '(?=.)', expr)
                expressions[expr] = data
            else:
                # remove braces
                self[phrase] = data

            del self[orig_phrase]

    def translate (self, phrase):
        """
        Translate a phrase according to this dictionary.
        For language-specific processing, this method should be
        called/overridden by inheriting classes.

        :param phrase: Phrase to look up; the lookup is done in lower case
        :type phrase: str
        :return: A copy of the entry data with the phrase, as passed,
                 stored under the language key, or None if there is
                 no entry for the phrase
        :rtype: dict
        """
        entry = self.get(phrase.lower(), None)
        if entry is None:
            return None

        # copy: entries can be shared between synonyms (see clean()), so
        # writing to the stored entry would alter earlier results
        translation = dict(entry)
        translation[self._language_key] = phrase
        return translation

    def translate_expression (self, phrase):
        """
        Translate a phrase according entries in this dictionary
        based on regular expressions.

        :param phrase: Phrase to match against the expressions that
                       were collected by clean()
        :type phrase: str
        :return: A copy of the data of the first matching entry with the
                 matched text stored under the language key, or None if
                 no expression matches
        :rtype: dict
        """
        # Entries with the most fields are tried first.
        # NOTE(review): len(item[1]) is the number of fields of the entry;
        # possibly len(item[0]), the length of the expression, was meant
        candidates = sorted(
            self._expressions.items(),
            key=lambda item:-len(item[1]))

        for expression, data in candidates:
            expression_match = match(r'{0}$'.format(expression), phrase)
            if expression_match is not None:
                # copy so that the stored entry stays unmodified
                translation = dict(data)
                translation[self._language_key] = expression_match.group(0)
                return translation

        return None
 

Rev 298	Rev 300
1	"""	1	"""
2	Created on 2014-10-20	2	Created on 2014-10-20
3		3
4	@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>	4	@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>
5		5
6	"""	6	"""
7		7
8	from os import chdir, stat	8	from os import chdir, stat
9	from sys import stderr	9	from sys import stderr
10	from os.path import dirname, realpath, basename	10	from os.path import dirname, realpath, basename
11	from pickle import dump, load	11	from pickle import dump, load
12	from re import match, DOTALL, search, sub, split, compile	12	from re import match, DOTALL, search, sub, split, compile
13		13
14	debug_level = 2	14	debug_level = 2
15		15
16	def dmsg(args, *kwargs):	16	def dmsg (args, *kwargs):
17	if not hasattr(kwargs, 'min_level') or kwargs['min_level'] is None:	-
18	kwargs['min_level'] = 1	-
19		-
20	if not hasattr(kwargs, 'file'):	17	if not kwargs.get('file'):
21	kwargs['file'] = stderr	18	kwargs['file'] = stderr
22		19
23	if debug_level >= kwargs['min_level']:	20	min_level = kwargs.pop('min_level', 1)
-		21
24	del kwargs['min_level']	22	if debug_level >= min_level:
25	print(args, *kwargs)	23	print(args, *kwargs)
26		24
27	def sort_dict_alnum_english_key(phrase):	25	def sort_dict_alnum_english_key (phrase):
28	return sub(r'\{(.+)\}', r'\1', phrase[0]).lower()	26	return sub(r'\{(.+)\}', r'\1', phrase[0]).lower()
29		27
30	class Dictionary(dict):	28	class Dictionary (dict):
31	"""	29	"""
32	A Dictionary (not to be confused with its ancestor, dict)	30	A Dictionary (not to be confused with its ancestor, dict)
33	represents a word dictionary stored in a file.	31	represents a word dictionary stored in a file.
34		32
35	"""	33	"""
36	_language_key = 'en'	34	_language_key = 'en'
37	_keys = "ipa\|en\|lit\|pos\|com\|tag\|ex"	35	_keys = "ipa\|en\|lit\|pos\|com\|tag\|ex"
38	_expressions = {}	36	_expressions = {}
39		37
40	def load (self, dictionary_file, language_key='en'):	38	def load (self, dictionary_file, keys=None, language_key=None):
41	"""	39	"""
42	Loads a word dictionary from a file.	40	Loads a word dictionary from a file.
43	:param dictionary_file:	41	:param dictionary_file:
44	:type dictionary_file:	42	:type dictionary_file:
45	:param language_key:	43	:param language_key:
46	:type language_key:	44	:type language_key:
47	"""	45	"""
-		46	if keys is not None:
-		47	self._keys = keys
-		48
-		49	if language_key is not None:
48	self._language_key = language_key	50	self._language_key = language_key
49		51
50	dmsg('Loading dictionary '.format(dictionary_file), end='', min_level=1)	52	dmsg('Loading dictionary '.format(dictionary_file), end='', min_level=1)
51		53
52	chdir(dirname(realpath(__file__)))	54	chdir(dirname(realpath(__file__)))
53		55
54	pickle_file = basename(dictionary_file) + '.pickle'	56	pickle_file = basename(dictionary_file) + '.pickle'
55		57
56	try:	58	try:
57	pickle_mtime = stat(pickle_file).st_mtime	59	pickle_mtime = stat(pickle_file).st_mtime
58	except FileNotFoundError:	60	except FileNotFoundError:
59	pickle_mtime = None	61	pickle_mtime = None
60		62
61	if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:	63	if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:
62	dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)	64	dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)
-		65
63	phrase = None	66	phrase = None
64	key = None	67	key = None
65	value = None	68	value = None
66	with open(dictionary_file) as f:	69	with open(dictionary_file) as f:
67	indent = None	70	indent = None
68		71
69	for line in f:	72	for line in f:
70	m = match(r'^\s{0}:\s(?P<phrase>.+)'.format(self._language_key), line)	73	m = match(r'^\s{0}:\s(?P<phrase>.+)'.format(self._language_key), line)
71	if m is not None:	74	if m is not None:
72	phrase = m.group("phrase")	75	phrase = m.group("phrase")
73	self[phrase] = {}	76	self[phrase] = {}
74	indent = None	77	indent = None
75	else:	78	else:
76	m = match(r'(?P<indent>\s)(?P<key>{0}):\s(?P<value>.+)'.format(self._keys), line)	79	m = match(r'(?P<indent>\s)(?P<key>{0}):\s(?P<value>.+)'.format(self._keys), line)
77	if m is not None:	80	if m is not None:
78	# join previous value if necessary	81	# join previous value if necessary
79	if type(value) == list:	82	if type(value) == list:
80	self[phrase][key] = ' '.join(value)	83	self[phrase][key] = ' '.join(value)
81		84
82	indent = m.group("indent")	85	indent = m.group("indent")
83	key = m.group("key")	86	key = m.group("key")
84	value = m.group("value")	87	value = m.group("value")
85	# assign a string for memory efficiency	88	# assign a string for memory efficiency
86	self[phrase][key] = value	89	self[phrase][key] = value
87	elif indent is not None:	90	elif indent is not None:
88	m = match(r'(?P<indent>\s+)(?P<continuation>\S.*)', line)	91	m = match(r'(?P<indent>\s+)(?P<continuation>\S.*)', line)
89	if m is not None:	92	if m is not None:
90	if len(m.group("indent")) == len(indent) + 2:	93	if len(m.group("indent")) == len(indent) + 2:
91	continuation = m.group("continuation")	94	continuation = m.group("continuation")
92	if type(value) == str:	95	if type(value) == str:
93	# when a continuation is first found, convert to a list	96	# when a continuation is first found, convert to a list
94	# because there could be more continuations	97	# because there could be more continuations
95	value = self[phrase][key] = [value, continuation]	98	value = self[phrase][key] = [value, continuation]
96	else:	99	else:
97	value.append(continuation)	100	value.append(continuation)
98		101
99	# join last value if necessary	102	# join last value if necessary
100	if type(value) == list:	103	if type(value) == list:
101	self[phrase][key] = ' '.join(value)	104	self[phrase][key] = ' '.join(value)
102		105
103	dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)	106	dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)
-		107
104	# TODO: Pickle should only contain strings to be small	108	# TODO: Pickle should only contain strings to be small
105	with open(pickle_file, mode='wb') as f: dump(self, f)	109	with open(pickle_file, mode='wb') as f: dump(self, f)
-		110
106	dmsg(' done.', min_level=1)	111	dmsg(' done.', min_level=1)
107	else:	112	else:
108	dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)	113	dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)
-		114
109	with open(pickle_file, mode='rb') as f: pickle = load(f)	115	with open(pickle_file, mode='rb') as f: pickle = load(f)
110	for key, value in pickle.items():	116	for key, value in pickle.items():
111	self[key] = value	117	self[key] = value
112		118
113	dmsg(' done ({0} entries).'.format(len(self)), min_level=1)	119	dmsg(' done ({0} entries).'.format(len(self)), min_level=1)
114		120
115	def clean (self):	121	def clean (self):
116	"""	122	"""
117	Cleans dictionary entries	123	Cleans dictionary entries
118	"""	124	"""
119	re_parens = compile(r'\(.+\)', DOTALL)	125	re_parens = compile(r'\(.+\)', DOTALL)
120	re_parens_no_alt = compile(r'\(([^\\|]+)\)', DOTALL)	126	re_parens_no_alt = compile(r'\(([^\\|]+)\)', DOTALL)
121	re_braces = compile(	127	re_braces = compile(
122	r'^\s\{(?P<phrase>.+)\}(?:\s\((?P<variant>.+?)\))?\s*$',	128	r'^\s\{(?P<phrase>.+)\}(?:\s\((?P<variant>.+?)\))?\s*$',
123	DOTALL)	129	DOTALL)
124	re_semicolon = compile(r'\s;\s')	130	re_semicolon = compile(r'\s;\s')
125		131
126	for orig_phrase, data in list(self.items()):	132	for orig_phrase, data in list(self.items()):
127	# if there are optional or alternating parts	133	# if there are optional or alternating parts
128	if search(re_parens, orig_phrase):	134	if search(re_parens, orig_phrase):
129	if orig_phrase.find('\|') > -1:	135	if orig_phrase.find('\|') > -1:
130	# TODO alternation	136	# TODO alternation
131	pass	137	pass
132	else:	138	else:
133	# TODO optional parts	139	# TODO optional parts
134	pass	140	pass
135		141
136	if orig_phrase.find(';') > -1:	142	if orig_phrase.find(';') > -1:
137	synonyms = map(	143	synonyms = map(
138	lambda x: sub(re_braces, r'\1', x),	144	lambda x: sub(re_braces, r'\1', x),
139	split(re_semicolon, orig_phrase))	145	split(re_semicolon, orig_phrase))
140		146
141	for synonym in synonyms:	147	for synonym in synonyms:
142	self[synonym] = data	148	self[synonym] = data
143		149
144	del self[orig_phrase]	150	del self[orig_phrase]
145	else:	151	else:
146	m = match(re_braces, orig_phrase)	152	m = match(re_braces, orig_phrase)
147	if m is not None:	153	if m is not None:
148	phrase = m.group('phrase')	154	phrase = m.group('phrase')
149		155
150	if callable(getattr(self, 'clean_entry', None)):	156	if callable(getattr(self, 'clean_entry', None)):
151	phrase = self.clean_entry(phrase)	157	phrase = self.clean_entry(phrase)
152		158
153	m_parens = search(re_parens, phrase)	159	m_parens = search(re_parens, phrase)
154	if m_parens is not None:	160	if m_parens is not None:
155	# alternation and optional parts	161	# alternation and optional parts
156	expr = sub(re_parens_no_alt, r'(?:\1)?', phrase)	162	expr = sub(re_parens_no_alt, r'(?:\1)?', phrase)
157	expr = sub('~', '(?=.)', expr)	163	expr = sub('~', '(?=.)', expr)
158	self._expressions[expr] = data	164	self._expressions[expr] = data
159	else:	165	else:
160	# remove braces	166	# remove braces
161	self[phrase] = data	167	self[phrase] = data
162		168
163	del self[orig_phrase]	169	del self[orig_phrase]
164		170
165	def translate (self, phrase):	171	def translate (self, phrase):
166	"""	172	"""
167	Translate a phrase according to this dictionary.	173	Translate a phrase according to this dictionary.
168	For language-specific processing, this method should be	174	For language-specific processing, this method should be
169	called/overridden by inheriting classes.	175	called/overridden by inheriting classes.
170	:param phrase:	176	:param phrase:
171	:type phrase: str	177	:type phrase: str
172	"""	178	"""
173	translation = self.get(phrase.lower(), None)	179	translation = self.get(phrase.lower(), None)
174	if translation is not None:	180	if translation is not None:
175	translation[self._language_key] = phrase	181	translation[self._language_key] = phrase
176	return translation	182	return translation
177		183
178	return None	184	return None
179		185
180	def translate_expression (self, phrase):	186	def translate_expression (self, phrase):
181	"""	187	"""
182	Translate a phrase according entries in this dictionary	188	Translate a phrase according entries in this dictionary
183	based on regular expressions.	189	based on regular expressions.
184	:param phrase:	190	:param phrase:
185	:type phrase:	191	:type phrase:
186	"""	192	"""
187	for expression, data in sorted(self._expressions.items(), key=lambda item:-len(item[1])):	193	for expression, data in sorted(self._expressions.items(), key=lambda item:-len(item[1])):
188	expression_match = match(r'{0}$'.format(expression), phrase)	194	expression_match = match(r'{0}$'.format(expression), phrase)
189	if expression_match is not None:	195	if expression_match is not None:
190	data[self._language_key] = expression_match.group(0)	196	data[self._language_key] = expression_match.group(0)
191	return data	197	return data
192		198
193	return None	199	return None
194		200

Subversion Repositories LCARS

(root)/trunk/tools/eazytrans/Dictionary.py - Rev 298 → 300