| 1 | """
|
1 | """
|
| 2 | Created on 2014-10-20
|
2 | Created on 2014-10-20
|
| 3 | 3 | ||
| 4 | @author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>
|
4 | @author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>
|
| 5 | 5 | ||
| 6 | """
|
6 | """
|
| 7 | 7 | ||
from os import chdir, stat
from sys import stderr
from os.path import dirname, realpath, basename
from pickle import dump, load
from re import match, DOTALL, search, sub, split, compile

debug_level = 2

def dmsg(*args, **kwargs):
    # default the dmsg-specific 'min_level' option and the output stream;
    # kwargs is a plain dict, so key presence is tested with 'in'
    if 'min_level' not in kwargs or kwargs['min_level'] is None:
        kwargs['min_level'] = 1

    if 'file' not in kwargs:
        kwargs['file'] = stderr

    if debug_level >= kwargs['min_level']:
        del kwargs['min_level']
        print(*args, **kwargs)
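
# Usage sketch (illustrative only): with debug_level = 2 above,
#   dmsg('parsed', 42, 'entries', min_level=2)
# prints to stderr, while dmsg('details', min_level=3) is suppressed.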

def sort_dict_alnum_english_key(phrase):
    return sub(r'\{(.+)\}', r'\1', phrase[0]).lower()
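
# For example (illustrative, assuming it is used as a key function over
# (phrase, data) items): sorted(d.items(), key=sort_dict_alnum_english_key)
# orders the entry '{Fish} (animal)' under 'fish (animal)', ignoring the
# braces and letter case.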

class Dictionary(dict):
    """
    A Dictionary (not to be confused with its ancestor, dict)
    represents a word dictionary stored in a file.

    """
    _language_key = 'en'
    _keys = "ipa|en|lit|pos|com|tag|ex"
    _expressions = {}

    def load (self, dictionary_file, language_key='en'):
        """
        Loads a word dictionary from a file.

        :param dictionary_file: path of the plain-text dictionary file
        :type dictionary_file: str
        :param language_key: language key that introduces each entry in the file
        :type language_key: str
        """
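        # Expected file format, inferred from the regular expressions below
        # (illustrative sketch, not authoritative):
        #
        #   <language_key>: <phrase>
        #     <key>: <value>          (<key> is one of the names in _keys)
        #       <continuation>        (indented two spaces deeper than <key>)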
        self._language_key = language_key

        dmsg('Loading dictionary ', end='', min_level=1)

        chdir(dirname(realpath(__file__)))

        pickle_file = basename(dictionary_file) + '.pickle'

        try:
            pickle_mtime = stat(pickle_file).st_mtime
        except FileNotFoundError:
            pickle_mtime = None

        if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:
            dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)
            phrase = None
            key = None
            value = None
            with open(dictionary_file) as f:
                indent = None

                for line in f:
                    m = match(r'^\s*{0}:\s*(?P<phrase>.+)'.format(self._language_key), line)
                    if m is not None:
                        phrase = m.group("phrase")
                        self[phrase] = {}
                        indent = None
                    else:
                        m = match(r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(self._keys), line)
                        if m is not None:
                            # join previous value if necessary
                            if type(value) == list:
                                self[phrase][key] = ' '.join(value)

                            indent = m.group("indent")
                            key = m.group("key")
                            value = m.group("value")
                            # assign a string for memory efficiency
                            self[phrase][key] = value
                        elif indent is not None:
                            m = match(r'(?P<indent>\s+)(?P<continuation>\S.*)', line)
                            if m is not None:
                                if len(m.group("indent")) == len(indent) + 2:
                                    continuation = m.group("continuation")
                                    if type(value) == str:
                                        # when a continuation is first found, convert to a list
                                        # because there could be more continuations
                                        value = self[phrase][key] = [value, continuation]
                                    else:
                                        value.append(continuation)

            # join last value if necessary
            if type(value) == list:
                self[phrase][key] = ' '.join(value)

            dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)
            # TODO: Pickle should only contain strings to be small
            with open(pickle_file, mode='wb') as f:
                dump(self, f)
            dmsg(' done.', min_level=1)
        else:
            dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)
            with open(pickle_file, mode='rb') as f:
                pickle = load(f)
            for key, value in pickle.items():
                self[key] = value

        dmsg(' done ({0} entries).'.format(len(self)), min_level=1)
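        # Resulting structure (illustrative): each phrase maps to a dict of
        # its key/value pairs, e.g. self['some phrase'].get('ipa').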

    def clean (self):
        """
        Cleans dictionary entries: splits synonym lists, strips the braces
        around headwords, and turns entries with optional parts into regular
        expressions stored in _expressions.
        """
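        # Illustrative effects on hypothetical entries:
        #   '{cat}; {feline}'    -> two plain entries, 'cat' and 'feline'
        #   '{colour} (British)' -> plain entry 'colour'
        #   '{walk(ed)}'         -> pattern 'walk(?:ed)?' stored in _expressions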
        re_parens = compile(r'\(.+\)', DOTALL)
        re_parens_no_alt = compile(r'\(([^\|]+)\)', DOTALL)
        re_braces = compile(
            r'^\s*\{(?P<phrase>.+)\}(?:\s*\((?P<variant>.+?)\))?\s*$',
            DOTALL)
        re_semicolon = compile(r'\s*;\s*')

        for orig_phrase, data in list(self.items()):
            # if there are optional or alternating parts
            if search(re_parens, orig_phrase):
                if orig_phrase.find('|') > -1:
                    # TODO alternation
                    pass
                else:
                    # TODO optional parts
                    pass

            if orig_phrase.find(';') > -1:
                synonyms = map(
                    lambda x: sub(re_braces, r'\1', x),
                    split(re_semicolon, orig_phrase))

                for synonym in synonyms:
                    self[synonym] = data

                del self[orig_phrase]
            else:
                m = match(re_braces, orig_phrase)
                if m is not None:
                    phrase = m.group('phrase')

                    if callable(getattr(self, 'clean_entry', None)):
                        phrase = self.clean_entry(phrase)

                    m_parens = search(re_parens, phrase)
                    if m_parens is not None:
                        # alternation and optional parts
                        expr = sub(re_parens_no_alt, r'(?:\1)?', phrase)
                        expr = sub('~', '(?=.)', expr)
                        self._expressions[expr] = data
                    else:
                        # remove braces
                        self[phrase] = data

                    del self[orig_phrase]

    def translate (self, phrase):
        """
        Translate a phrase according to this dictionary.

        For language-specific processing, this method should be
        called/overridden by inheriting classes.

        :param phrase: the phrase to translate
        :type phrase: str
        """
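        # For example (illustrative): if 'hello' is an entry, translate('Hello')
        # returns its data dict with data[self._language_key] set to 'Hello';
        # unknown phrases yield None.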
        translation = self.get(phrase.lower(), None)
        if translation is not None:
            translation[self._language_key] = phrase
            return translation

        return None

    def translate_expression (self, phrase):
        """
        Translate a phrase according to entries in this dictionary
        based on regular expressions.

        :param phrase: the phrase to translate
        :type phrase: str
        """
        # entries whose data dict has the most fields are tried first
        for expression, data in sorted(self._expressions.items(), key=lambda item: -len(item[1])):
            expression_match = match(expression, phrase)
            if expression_match is not None:
                data[self._language_key] = expression_match.group(0)
                return data

        return None
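
# Minimal usage sketch (illustrative; 'example.dic' is a hypothetical file
# name in the format described in load() above):
#
#     d = Dictionary()
#     d.load('example.dic')
#     d.clean()
#     print(d.translate('some phrase'))             # exact-match lookup
#     print(d.translate_expression('some phrase'))  # regex-based lookup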