Rev 294 | Rev 296 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed
| Rev 294 | Rev 295 | ||
|---|---|---|---|
| Line 8... | Line 8... | ||
| 8 | from os import chdir, stat |
8 | from os import chdir, stat |
| 9 | from sys import stderr |
9 | from sys import stderr |
| 10 | from os.path import dirname, realpath, basename |
10 | from os.path import dirname, realpath, basename |
| 11 | from pickle import dump, load |
11 | from pickle import dump, load |
| 12 | from re import match, DOTALL, search, sub, split, compile |
12 | from re import match, DOTALL, search, sub, split, compile |
| 13 | from copy import deepcopy |
- | |
| 14 | 13 | ||
| 15 | debug_level = 2 |
14 | debug_level = 2 |
| 16 | 15 | ||
| 17 | def dmsg(*args, **kwargs): |
16 | def dmsg(*args, **kwargs): |
| 18 | if not hasattr(kwargs, 'min_level') or kwargs['min_level'] is None: |
17 | if not hasattr(kwargs, 'min_level') or kwargs['min_level'] is None: |
| Line 35... | Line 34... | ||
| 35 | """
|
34 | """
|
| 36 | _keys = "ipa|en|lit|pos|com|tag|ex" |
35 | _keys = "ipa|en|lit|pos|com|tag|ex" |
| 37 | _expressions = {} |
36 | _expressions = {} |
| 38 | 37 | ||
| 39 | def load (self, dictionary_file, language_key='en'): |
38 | def load (self, dictionary_file, language_key='en'): |
| 40 | dictionary = self |
- | |
| 41 | - | ||
| 42 | dmsg('Loading dictionary '.format(dictionary_file), end='', min_level=1) |
39 | dmsg('Loading dictionary '.format(dictionary_file), end='', min_level=1) |
| 43 | 40 | ||
| 44 | chdir(dirname(realpath(__file__))) |
41 | chdir(dirname(realpath(__file__))) |
| 45 | 42 | ||
| 46 | pickle_file = basename(dictionary_file) + '.pickle' |
43 | pickle_file = basename(dictionary_file) + '.pickle' |
| Line 60... | Line 57... | ||
| 60 | 57 | ||
| 61 | for line in f: |
58 | for line in f: |
| 62 | m = match(r'^\s*{0}:\s*(?P<phrase>.+)'.format(language_key), line) |
59 | m = match(r'^\s*{0}:\s*(?P<phrase>.+)'.format(language_key), line) |
| 63 | if m is not None: |
60 | if m is not None: |
| 64 | phrase = m.group("phrase") |
61 | phrase = m.group("phrase") |
| 65 | dictionary[phrase] = {} |
62 | self[phrase] = {} |
| 66 | indent = None |
63 | indent = None |
| 67 | else:
|
64 | else:
|
| 68 | m = match(r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(self._keys), line) |
65 | m = match(r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(self._keys), line) |
| 69 | if m is not None: |
66 | if m is not None: |
| 70 | # join previous value if necessary
|
67 | # join previous value if necessary
|
| 71 | if type(value) == list: |
68 | if type(value) == list: |
| 72 | dictionary[phrase][key] = ' '.join(value) |
69 | self[phrase][key] = ' '.join(value) |
| 73 | 70 | ||
| 74 | indent = m.group("indent") |
71 | indent = m.group("indent") |
| 75 | key = m.group("key") |
72 | key = m.group("key") |
| 76 | value = m.group("value") |
73 | value = m.group("value") |
| 77 | # assign a string for memory efficiency
|
74 | # assign a string for memory efficiency
|
| 78 | dictionary[phrase][key] = value |
75 | self[phrase][key] = value |
| 79 | elif indent is not None: |
76 | elif indent is not None: |
| 80 | m = match(r'(?P<indent>\s+)(?P<continuation>\S.*)', line) |
77 | m = match(r'(?P<indent>\s+)(?P<continuation>\S.*)', line) |
| 81 | if m is not None: |
78 | if m is not None: |
| 82 | if len(m.group("indent")) == len(indent) + 2: |
79 | if len(m.group("indent")) == len(indent) + 2: |
| 83 | continuation = m.group("continuation") |
80 | continuation = m.group("continuation") |
| 84 | if type(value) == str: |
81 | if type(value) == str: |
| 85 | # when a continuation is first found, convert to a list
|
82 | # when a continuation is first found, convert to a list
|
| 86 | # because there could be more continuations
|
83 | # because there could be more continuations
|
| 87 | value = dictionary[phrase][key] = [value, continuation] |
84 | value = self[phrase][key] = [value, continuation] |
| 88 | else:
|
85 | else:
|
| 89 | value.append(continuation) |
86 | value.append(continuation) |
| 90 | 87 | ||
| 91 | # join last value if necessary
|
88 | # join last value if necessary
|
| 92 | if type(value) == list: |
89 | if type(value) == list: |
| 93 | dictionary[phrase][key] = ' '.join(value) |
90 | self[phrase][key] = ' '.join(value) |
| 94 | 91 | ||
| 95 | dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1) |
92 | dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1) |
| 96 | # TODO: Pickle should only contain strings to be small
|
93 | # TODO: Pickle should only contain strings to be small
|
| 97 | with open(pickle_file, mode='wb') as f: dump(dictionary, f) |
94 | with open(pickle_file, mode='wb') as f: dump(self, f) |
| 98 | dmsg(' done.', min_level=1) |
95 | dmsg(' done.', min_level=1) |
| 99 | else:
|
96 | else:
|
| 100 | dmsg('from {0} ...'.format(pickle_file), end='', min_level=1) |
97 | dmsg('from {0} ...'.format(pickle_file), end='', min_level=1) |
| 101 | with open(pickle_file, mode='rb') as f: pickle = load(f) |
98 | with open(pickle_file, mode='rb') as f: pickle = load(f) |
| 102 | for key, value in pickle.items(): |
99 | for key, value in pickle.items(): |
| 103 | dictionary[key] = value |
100 | self[key] = value |
| 104 | 101 | ||
| 105 | dmsg(' done ({0} entries).'.format(len(dictionary)), min_level=1) |
102 | dmsg(' done ({0} entries).'.format(len(self)), min_level=1) |
| 106 | 103 | ||
| 107 | def clean (self): |
104 | def clean (self): |
| 108 | dictionary = self |
- | |
| 109 | - | ||
| 110 | parens_re = compile(r'\(.+\)', DOTALL) |
105 | parens_re = compile(r'\(.+\)', DOTALL) |
| 111 | braces_re = compile(r'^\s*\{(.+)\}\s*$', DOTALL) |
106 | braces_re = compile( |
| - | 107 | r'^\s*\{(?P<phrase>.+)\}(?:\s*\((?P<variant>.+?)\))?\s*$', |
|
| - | 108 | DOTALL)
|
|
| 112 | semicolon_re = compile(r'\s*;\s*') |
109 | semicolon_re = compile(r'\s*;\s*') |
| 113 | 110 | ||
| 114 | for orig_phrase, data in list(dictionary.items()): |
111 | for orig_phrase, data in list(self.items()): |
| 115 | # if there are optional or alternating parts
|
112 | # if there are optional or alternating parts
|
| 116 | if search(parens_re, orig_phrase): |
113 | if search(parens_re, orig_phrase): |
| 117 | if orig_phrase.find('|') > -1: |
114 | if orig_phrase.find('|') > -1: |
| 118 | # TODO alternation
|
115 | # TODO alternation
|
| 119 | pass
|
116 | pass
|
| Line 125... | Line 122... | ||
| 125 | synonyms = map( |
122 | synonyms = map( |
| 126 | lambda x: sub(braces_re, r'\1', x), |
123 | lambda x: sub(braces_re, r'\1', x), |
| 127 | split(semicolon_re, orig_phrase)) |
124 | split(semicolon_re, orig_phrase)) |
| 128 | 125 | ||
| 129 | for synonym in synonyms: |
126 | for synonym in synonyms: |
| 130 | dictionary[synonym] = deepcopy(data) |
127 | self[synonym] = data |
| 131 | 128 | ||
| 132 | del dictionary[orig_phrase] |
129 | del self[orig_phrase] |
| 133 | else:
|
130 | else:
|
| 134 | m = match(braces_re, orig_phrase) |
131 | m = match(braces_re, orig_phrase) |
| 135 | if m is not None: |
132 | if m is not None: |
| - | 133 | phrase = m.group("phrase") |
|
| - | 134 | m2 = match(parens_re, phrase) |
|
| - | 135 | if m2 is not None: |
|
| 136 | dictionary[m.group(1)] = deepcopy(dictionary[orig_phrase]) |
136 | # TODO alternation and optional parts
|
| - | 137 | pass
|
|
| - | 138 | ||
| - | 139 | self[phrase] = data |
|
| 137 | del dictionary[orig_phrase] |
140 | del self[orig_phrase] |