WebSVN - LCARS - Diff - Rev 294 and 295 - /trunk/tools/eazytrans/Dictionary.py


"""
Created on 2014-10-20

@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>

"""

from os import chdir, stat
from sys import stderr
from os.path import dirname, realpath, basename
from pickle import dump, load
from re import match, DOTALL, search, sub, split, compile
 

# Global verbosity threshold: dmsg() prints a message only if this value
# is greater than or equal to the message's min_level.
debug_level = 2

def dmsg(*args, **kwargs):
    """
    Print a debug message if the global debug level permits it.

    Takes the same positional and keyword arguments as print(), plus:

    min_level -- minimum value of debug_level at which the message
                 is printed; defaults to 1 if missing or None.

    If no 'file' keyword argument is given, the message is written
    to standard error.

    """
    # kwargs is a dict, so its keys must be looked up as keys;
    # hasattr() only sees attributes and therefore never found them.
    min_level = kwargs.pop('min_level', None)
    if min_level is None:
        min_level = 1

    kwargs.setdefault('file', stderr)

    if debug_level >= min_level:
        print(*args, **kwargs)

def sort_dict_alnum_english_key(phrase):
    """
    Return a lower-case sort key derived from the first element of phrase.

    phrase -- an indexable whose element 0 is a string (presumably a
              (phrase, data) dictionary item; verify against callers).

    A brace-delimited part of that string is replaced by its content,
    i.e. the outermost braces are dropped, before lower-casing.

    """
    text = phrase[0]
    without_braces = sub(r'\{(.+)\}', r'\1', text)
    return without_braces.lower()

class Dictionary(dict):
    """
    A dict that maps phrases to dicts of attribute values.

    Entries are read from a text dictionary file by load() and
    normalized (synonyms split, braces removed) by clean().

    """
    # Attribute keys recognized in dictionary files; interpolated
    # verbatim into a regular expression as an alternation.
    _keys = "ipa|en|lit|pos|com|tag|ex"
    _expressions = {}

    def _parse_lines(self, lines, language_key='en'):
        """
        Add the entries described by an iterable of text lines.

        lines        -- iterable of strings, one per line
        language_key -- key that starts a new entry; it is inserted
                        verbatim (unescaped) into a regular expression

        A line "<language_key>: <phrase>" starts a new entry.  A line
        "<key>: <value>", where <key> is one of _keys, sets an
        attribute of the current entry.  A line indented two columns
        deeper than the preceding attribute line continues that
        attribute's value; the parts are joined with single spaces.

        """
        phrase_re = compile(
            r'^\s*{0}:\s*(?P<phrase>.+)'.format(language_key))
        key_re = compile(
            r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(self._keys))
        continuation_re = compile(r'(?P<indent>\s+)(?P<continuation>\S.*)')

        entry = None    # attribute dict of the current phrase
        key = None      # attribute currently being read
        value = None    # str, or list of str once a continuation was seen
        indent = None   # indentation of the last attribute line

        for line in lines:
            m = phrase_re.match(line)
            if m is not None:
                # Join the previous entry's pending multi-line value
                # *before* switching entries; otherwise it would stay a
                # list there and leak into the new entry later on.
                if isinstance(value, list):
                    entry[key] = ' '.join(value)

                entry = self[m.group("phrase")] = {}
                key = value = indent = None
                continue

            m = key_re.match(line)
            if m is not None:
                if entry is None:
                    # No entry to attach the attribute to (this used to
                    # raise KeyError: None); report and skip the line.
                    dmsg('\nIgnoring attribute without phrase: {0}'.format(
                        line.strip()), min_level=1)
                    continue

                # join previous value if necessary
                if isinstance(value, list):
                    entry[key] = ' '.join(value)

                indent = m.group("indent")
                key = m.group("key")
                # keep a plain string unless a continuation follows
                value = entry[key] = m.group("value")
                continue

            if indent is None:
                continue

            m = continuation_re.match(line)
            if m is not None and len(m.group("indent")) == len(indent) + 2:
                continuation = m.group("continuation")
                if isinstance(value, list):
                    value.append(continuation)
                else:
                    # when a continuation is first found, convert to a
                    # list because there could be more continuations
                    value = [value, continuation]

        # join last value if necessary
        if isinstance(value, list):
            entry[key] = ' '.join(value)

    def load(self, dictionary_file, language_key='en'):
        """
        Fill this dictionary from a dictionary file or its pickle cache.

        dictionary_file -- path of the text dictionary file
        language_key    -- key that starts a new entry in the file

        The working directory is changed to the directory of this
        module, so a relative dictionary_file is resolved from there
        and the cache "<basename of dictionary_file>.pickle" is kept
        there.  The text file is parsed if the cache is missing or
        older than the text file; otherwise the cache is loaded.

        Raises OSError (e.g. FileNotFoundError) if dictionary_file
        cannot be accessed.

        """
        dmsg('Loading dictionary ', end='', min_level=1)

        # NOTE: this changes the working directory of the whole process.
        chdir(dirname(realpath(__file__)))

        pickle_file = basename(dictionary_file) + '.pickle'

        try:
            pickle_mtime = stat(pickle_file).st_mtime
        except FileNotFoundError:
            pickle_mtime = None

        if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:
            dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)
            # NOTE(review): opened with the locale's default encoding;
            # confirm whether the dictionary files are always UTF-8.
            with open(dictionary_file) as f:
                self._parse_lines(f, language_key)

            dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)
            # TODO: Pickle should only contain strings to be small
            with open(pickle_file, mode='wb') as f:
                dump(self, f)
            dmsg(' done.', min_level=1)
        else:
            dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)
            # NOTE(security): unpickling can execute arbitrary code; the
            # cache file must only be writable by trusted users.
            with open(pickle_file, mode='rb') as f:
                self.update(load(f))

        dmsg(' done ({0} entries).'.format(len(self)), min_level=1)

    def clean(self):
        """
        Normalize the phrases used as keys, in place.

        A phrase containing ';' is replaced by one entry per non-empty
        synonym, all sharing the same data dict (no copy is made).
        A phrase of the form "{phrase}" or "{phrase} (variant)",
        whether a synonym or a whole key, is reduced to "phrase".

        """
        braces_re = compile(
            r'^\s*\{(?P<phrase>.+)\}(?:\s*\((?P<variant>.+?)\))?\s*$',
            DOTALL)
        semicolon_re = compile(r'\s*;\s*')

        # Iterate over a snapshot, because entries are added and deleted.
        for orig_phrase, data in list(self.items()):
            # TODO alternation ("(a|b)") and optional parts ("(a)");
            # they were meant to be detected with r'\(.+\)' (DOTALL)

            if ';' in orig_phrase:
                for part in semicolon_re.split(orig_phrase):
                    synonym = braces_re.sub(r'\g<phrase>', part)
                    # A leading or trailing ';' yields an empty synonym,
                    # which must not become a key.
                    if synonym:
                        self[synonym] = data

                del self[orig_phrase]
            else:
                m = braces_re.match(orig_phrase)
                if m is not None:
                    # TODO alternation and optional parts
                    self[m.group("phrase")] = data
                    del self[orig_phrase]
 

Rev 294	Rev 295
1	"""	1	"""
2	Created on 2014-10-20	2	Created on 2014-10-20
3		3
4	@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>	4	@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>
5		5
6	"""	6	"""
7		7
8	from os import chdir, stat	8	from os import chdir, stat
9	from sys import stderr	9	from sys import stderr
10	from os.path import dirname, realpath, basename	10	from os.path import dirname, realpath, basename
11	from pickle import dump, load	11	from pickle import dump, load
12	from re import match, DOTALL, search, sub, split, compile	12	from re import match, DOTALL, search, sub, split, compile
13	from copy import deepcopy	-
14		13
15	debug_level = 2	14	debug_level = 2
16		15
17	def dmsg(*args, **kwargs):	16	def dmsg(*args, **kwargs):
18	if not hasattr(kwargs, 'min_level') or kwargs['min_level'] is None:	17	if not hasattr(kwargs, 'min_level') or kwargs['min_level'] is None:
19	kwargs['min_level'] = 1	18	kwargs['min_level'] = 1
20		19
21	if not hasattr(kwargs, 'file'):	20	if not hasattr(kwargs, 'file'):
22	kwargs['file'] = stderr	21	kwargs['file'] = stderr
23		22
24	if debug_level >= kwargs['min_level']:	23	if debug_level >= kwargs['min_level']:
25	del kwargs['min_level']	24	del kwargs['min_level']
26	print(*args, **kwargs)	25	print(*args, **kwargs)
27		26
28	def sort_dict_alnum_english_key(phrase):	27	def sort_dict_alnum_english_key(phrase):
29	return sub(r'\{(.+)\}', r'\1', phrase[0]).lower()	28	return sub(r'\{(.+)\}', r'\1', phrase[0]).lower()
30		29
31	class Dictionary(dict):	30	class Dictionary(dict):
32	"""	31	"""
33	classdocs	32	classdocs
34		33
35	"""	34	"""
36	_keys = "ipa|en|lit|pos|com|tag|ex"	35	_keys = "ipa|en|lit|pos|com|tag|ex"
37	_expressions = {}	36	_expressions = {}
38		37
39	def load (self, dictionary_file, language_key='en'):	38	def load (self, dictionary_file, language_key='en'):
40	dictionary = self	-
41		-
42	dmsg('Loading dictionary '.format(dictionary_file), end='', min_level=1)	39	dmsg('Loading dictionary '.format(dictionary_file), end='', min_level=1)
43		40
44	chdir(dirname(realpath(__file__)))	41	chdir(dirname(realpath(__file__)))
45		42
46	pickle_file = basename(dictionary_file) + '.pickle'	43	pickle_file = basename(dictionary_file) + '.pickle'
47		44
48	try:	45	try:
49	pickle_mtime = stat(pickle_file).st_mtime	46	pickle_mtime = stat(pickle_file).st_mtime
50	except FileNotFoundError:	47	except FileNotFoundError:
51	pickle_mtime = None	48	pickle_mtime = None
52		49
53	if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:	50	if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:
54	dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)	51	dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)
55	phrase = None	52	phrase = None
56	key = None	53	key = None
57	value = None	54	value = None
58	with open(dictionary_file) as f:	55	with open(dictionary_file) as f:
59	indent = None	56	indent = None
60		57
61	for line in f:	58	for line in f:
62	m = match(r'^\s*{0}:\s*(?P<phrase>.+)'.format(language_key), line)	59	m = match(r'^\s*{0}:\s*(?P<phrase>.+)'.format(language_key), line)
63	if m is not None:	60	if m is not None:
64	phrase = m.group("phrase")	61	phrase = m.group("phrase")
65	dictionary[phrase] = {}	62	self[phrase] = {}
66	indent = None	63	indent = None
67	else:	64	else:
68	m = match(r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(self._keys), line)	65	m = match(r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(self._keys), line)
69	if m is not None:	66	if m is not None:
70	# join previous value if necessary	67	# join previous value if necessary
71	if type(value) == list:	68	if type(value) == list:
72	dictionary[phrase][key] = ' '.join(value)	69	self[phrase][key] = ' '.join(value)
73		70
74	indent = m.group("indent")	71	indent = m.group("indent")
75	key = m.group("key")	72	key = m.group("key")
76	value = m.group("value")	73	value = m.group("value")
77	# assign a string for memory efficiency	74	# assign a string for memory efficiency
78	dictionary[phrase][key] = value	75	self[phrase][key] = value
79	elif indent is not None:	76	elif indent is not None:
80	m = match(r'(?P<indent>\s+)(?P<continuation>\S.*)', line)	77	m = match(r'(?P<indent>\s+)(?P<continuation>\S.*)', line)
81	if m is not None:	78	if m is not None:
82	if len(m.group("indent")) == len(indent) + 2:	79	if len(m.group("indent")) == len(indent) + 2:
83	continuation = m.group("continuation")	80	continuation = m.group("continuation")
84	if type(value) == str:	81	if type(value) == str:
85	# when a continuation is first found, convert to a list	82	# when a continuation is first found, convert to a list
86	# because there could be more continuations	83	# because there could be more continuations
87	value = dictionary[phrase][key] = [value, continuation]	84	value = self[phrase][key] = [value, continuation]
88	else:	85	else:
89	value.append(continuation)	86	value.append(continuation)
90		87
91	# join last value if necessary	88	# join last value if necessary
92	if type(value) == list:	89	if type(value) == list:
93	dictionary[phrase][key] = ' '.join(value)	90	self[phrase][key] = ' '.join(value)
94		91
95	dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)	92	dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)
96	# TODO: Pickle should only contain strings to be small	93	# TODO: Pickle should only contain strings to be small
97	with open(pickle_file, mode='wb') as f: dump(dictionary, f)	94	with open(pickle_file, mode='wb') as f: dump(self, f)
98	dmsg(' done.', min_level=1)	95	dmsg(' done.', min_level=1)
99	else:	96	else:
100	dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)	97	dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)
101	with open(pickle_file, mode='rb') as f: pickle = load(f)	98	with open(pickle_file, mode='rb') as f: pickle = load(f)
102	for key, value in pickle.items():	99	for key, value in pickle.items():
103	dictionary[key] = value	100	self[key] = value
104		101
105	dmsg(' done ({0} entries).'.format(len(dictionary)), min_level=1)	102	dmsg(' done ({0} entries).'.format(len(self)), min_level=1)
106		103
107	def clean (self):	104	def clean (self):
108	dictionary = self	-
109		-
110	parens_re = compile(r'\(.+\)', DOTALL)	105	parens_re = compile(r'\(.+\)', DOTALL)
111	braces_re = compile(r'^\s*\{(.+)\}\s*$', DOTALL)	106	braces_re = compile(
-		107	r'^\s*\{(?P<phrase>.+)\}(?:\s*\((?P<variant>.+?)\))?\s*$',
-		108	DOTALL)
112	semicolon_re = compile(r'\s*;\s*')	109	semicolon_re = compile(r'\s*;\s*')
113		110
114	for orig_phrase, data in list(dictionary.items()):	111	for orig_phrase, data in list(self.items()):
115	# if there are optional or alternating parts	112	# if there are optional or alternating parts
116	if search(parens_re, orig_phrase):	113	if search(parens_re, orig_phrase):
117	if orig_phrase.find('|') > -1:	114	if orig_phrase.find('|') > -1:
118	# TODO alternation	115	# TODO alternation
119	pass	116	pass
120	else:	117	else:
121	# TODO optional parts	118	# TODO optional parts
122	pass	119	pass
123		120
124	if orig_phrase.find(';') > -1:	121	if orig_phrase.find(';') > -1:
125	synonyms = map(	122	synonyms = map(
126	lambda x: sub(braces_re, r'\1', x),	123	lambda x: sub(braces_re, r'\1', x),
127	split(semicolon_re, orig_phrase))	124	split(semicolon_re, orig_phrase))
128		125
129	for synonym in synonyms:	126	for synonym in synonyms:
130	dictionary[synonym] = deepcopy(data)	127	self[synonym] = data
131		128
132	del dictionary[orig_phrase]	129	del self[orig_phrase]
133	else:	130	else:
134	m = match(braces_re, orig_phrase)	131	m = match(braces_re, orig_phrase)
135	if m is not None:	132	if m is not None:
-		133	phrase = m.group("phrase")
-		134	m2 = match(parens_re, phrase)
-		135	if m2 is not None:
136	dictionary[m.group(1)] = deepcopy(dictionary[orig_phrase])	136	# TODO alternation and optional parts
-		137	pass
-		138
-		139	self[phrase] = data
137	del dictionary[orig_phrase]	140	del self[orig_phrase]
138		141

Subversion Repositories LCARS

(root)/trunk/tools/eazytrans/Dictionary.py - Rev 294 → 295