Subversion Repositories LCARS

Rev

Rev 292 | Rev 294 | Go to most recent revision | Only display areas with differences | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 292 Rev 293
1
#!/usr/bin/env python3
1
#!/usr/bin/env python3
2
2
3
'''
3
'''
4
Created on 2014-10-20
4
Created on 2014-10-20
5

5

6
@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>
6
@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>
7
'''
7
'''
8
from sys import argv, stderr
8
from sys import argv, stderr
9
from re import findall, DOTALL, IGNORECASE, match, sub, compile, \
9
from re import findall, DOTALL, match, sub, compile, \
10
    split
10
    escape, search
11
from os import chdir, stat
-
 
12
from os.path import dirname, realpath, basename
11
from os.path import basename
13
from collections import OrderedDict
-
 
14
from functools import cmp_to_key
12
from functools import cmp_to_key
15
from copy import deepcopy
-
 
16
from collections.abc import MutableSequence
13
from Dictionary import Dictionary, dmsg, \
17
from pickle import dump, load
14
    sort_dict_alnum_english_key
18
15
19
# Verbosity threshold read by dmsg(): a message is printed only if
# debug_level is at least the message's min_level.
debug_level = 2

# Module-level phrase dictionary; starts empty and is filled by the
# loading code further below.
dictionary = {}

# Prefixes that translate() tries to split off the front of a phrase,
# mapped to their English gloss.
# NOTE(review): '|' presumably separates alternative glosses -- confirm.
prepositions = {
    "fi'": 'on',
    "na'": 'at|to',
    "t'": 'of'
}
27
23
28
def dmsg(*args, **kwargs):
    '''
    Print a debug message if the module-level debug_level permits it.

    Takes the same arguments as print(), plus one extra keyword:

    min_level -- smallest debug_level at which the message is printed
                 (default, also used when None is passed: 1)

    The message goes to stderr unless a ``file`` keyword is given.
    '''
    # kwargs is a dict, so its keys must be tested with pop()/get()/in;
    # hasattr() looks at attributes and never finds them, which used to
    # discard the caller's min_level and file on every call.
    min_level = kwargs.pop('min_level', None)
    if min_level is None:
        min_level = 1

    kwargs.setdefault('file', stderr)

    if debug_level >= min_level:
        print(*args, **kwargs)
-
 
38
-
 
39
class MutableString2(MutableSequence):
    '''
    Append-only string buffer.

    Text is kept as a list of chunks that is joined on demand.  ``+=``
    (inherited MutableSequence.__iadd__, which calls extend()) appends a
    chunk in place; item assignment, deletion and insertion are not
    supported and raise NotImplementedError.
    '''

    def __init__(self, value=None):
        # A missing value yields an empty buffer, not the text 'None'.
        self._values = [str(value)] if value is not None else []

    def __add__(self, value):
        # str.join() accepts only str items, so joining [self, value]
        # raised TypeError; concatenate the string forms instead.
        return str(self) + str(value)

    def __delitem__(self, index):
        # The sequence protocol passes the index; accept it so that
        # ``del s[i]`` raises NotImplementedError rather than TypeError.
        raise NotImplementedError

    def __getitem__(self, index):
        return str(self)[index]

    def __len__(self):
        return len(str(self))

    def __repr__(self):
        return ''.join(self._values)

    def __setitem__(self, index, value):
        raise NotImplementedError

    def __str__(self):
        return self.__repr__()

    def extend(self, values):
        # Appends ``values`` as ONE chunk (it is expected to be a str),
        # unlike list.extend(), which would add it item by item.
        self._values.append(values)

    def insert(self, index, value):
        raise NotImplementedError
-
 
69
-
 
70
def cli_help():
    '''
    Print a one-line usage summary to standard output.

    The program name shown is the base name of argv[0].
    '''
    print('Usage: {0} TEXT...'.format(basename(argv[0])))
72
26
73
def load_dictionary(dictionary, dictionary_file):
    '''
    Fill ``dictionary`` from ``dictionary_file``, using a pickle cache.

    dictionary      -- mapping that receives one entry per phrase; each
                       entry is a dict of field name -> MutableString2
    dictionary_file -- path of the text dictionary, resolved relative to
                       the directory of this script

    A line ``vuh: PHRASE`` starts a new entry.  Lines of the form
    ``KEY: VALUE`` (KEY being one of ipa, en, lit, pos, com, tag, ex) set
    a field of the current entry, and a line indented exactly two
    characters deeper than the last field line continues that field.

    The parsed result is saved next to the script as
    ``<basename of dictionary_file>.pickle`` and is loaded from there on
    later calls unless the text file has a newer modification time.

    Side effect: changes the process working directory to the directory
    of this script.
    '''
    # The follow-up message completes this line, hence end=''.
    dmsg('Loading dictionary ', end='', min_level=1)

    chdir(dirname(realpath(__file__)))

    pickle_file = basename(dictionary_file) + '.pickle'

    try:
        pickle_mtime = stat(pickle_file).st_mtime
    except FileNotFoundError:
        pickle_mtime = None

    if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:
        dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)

        keys = "ipa|en|lit|pos|com|tag|ex"

        # Compiled once instead of once per input line.
        phrase_re = compile(r'^\s*vuh:\s*(?P<phrase>.+)')
        field_re = compile(
            r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(keys))
        continuation_re = compile(r'(?P<indent>\s+)(?P<continuation>\S.*)')

        phrase = None
        key = None
        indent = None

        with open(dictionary_file) as f:
            for line in f:
                m = phrase_re.match(line)
                if m is not None:
                    phrase = m.group("phrase")
                    dictionary[phrase] = {}
                    indent = None
                    continue

                if phrase is None:
                    # Field or continuation line before the first
                    # 'vuh:' line: there is no entry to attach it to.
                    continue

                m = field_re.match(line)
                if m is not None:
                    indent = m.group("indent")
                    key = m.group("key")
                    dictionary[phrase][key] = MutableString2(m.group("value"))
                    continue

                if indent is None:
                    continue

                m = continuation_re.match(line)
                if (m is not None
                        and len(m.group("indent")) == len(indent) + 2):
                    dictionary[phrase][key] += (" " + m.group("continuation"))

        dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)
        # TODO: Pickle should only contain strings to be small
        with open(pickle_file, mode='wb') as f:
            dump(dictionary, f)
        dmsg(' done.', min_level=1)
    else:
        dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)
        # SECURITY: unpickling can execute arbitrary code; the cache file
        # must only ever be written by this program.
        with open(pickle_file, mode='rb') as f:
            cached = load(f)
        for cached_phrase, cached_data in cached.items():
            dictionary[cached_phrase] = cached_data

    dmsg(' done ({0} entries).'.format(len(dictionary)), min_level=1)
-
 
122
-
 
123
def clean_dictionary(dictionary):
    '''
    Normalize the phrase keys of ``dictionary`` in place.

    * A key containing ';' is split at the semicolons; every non-empty
      part becomes a key of its own with an independent deep copy of the
      entry data, and the combined key is removed.
    * A key, or a part of a split key, that is wrapped in braces
      ('{...}') is replaced by its content without the braces.
    '''
    braces_re = compile(r'^\s*\{(.+)\}\s*$')
    semicolon_re = compile(r'\s*;\s*')

    # Iterate over a snapshot because keys are added and removed.
    for orig_phrase, data in list(dictionary.items()):
        if ';' in orig_phrase:
            for part in semicolon_re.split(orig_phrase):
                # Strip braces from the individual part.  The previous
                # code applied this to the whole key, so nothing was
                # split and the entry could be deleted altogether.
                synonym = braces_re.sub(r'\1', part)
                if synonym:
                    dictionary[synonym] = deepcopy(data)

            del dictionary[orig_phrase]
        else:
            m = braces_re.match(orig_phrase)
            if m is not None:
                # The old key is dropped, so the data can simply move.
                dictionary[m.group(1)] = dictionary.pop(orig_phrase)
-
 
142
-
 
143
def sort_dict_alnum_english_key(phrase):
    '''
    Return a case-insensitive sort key for a dictionary item.

    phrase -- sequence whose first element is the phrase string, e.g. a
              (phrase, data) pair from dict.items()

    A brace pair in the phrase ('{...}') is dropped, keeping its content,
    and the result is lower-cased.
    '''
    return sub(r'\{(.+)\}', r'\1', phrase[0]).lower()
-
 
145
-
 
146
def get_sort_dict_alnum_vulcan_key():
    '''
    Return a key function for sorted() that orders dictionary items by
    the letter sequence listed below instead of by code point.

    The items being sorted are passed to sort_dict_alnum_english_key(),
    whose result is split into the known letters (digraphs such as 'sh'
    count as one letter).  Characters not in the list are ignored.
    '''
    letters = list(map(str.lower, [
        " ", 'S', 'T', 'P', 'K', 'R', 'L', 'A', 'Sh', 'O', 'U', 'D',
        'V', 'Kh', 'E', 'H', 'G', 'Ch', 'I', 'N', 'Zh', 'M', 'Y', 'F', 'Z',
        'Th', 'W', 'B', "'", '-']))

    # letter -> rank in the list above
    letter_values = {letter: rank for rank, letter in enumerate(letters)}

    # Longest alternatives first so that a digraph wins over its first
    # single letter ('sh' before 's').
    letters_re = compile(r'(?:{0})'.format(
        '|'.join(sorted(letters, key=lambda char: -len(char)))))

    def sort_dict_alnum_vulcan (a, b):
        # split into Vulcan letters
        a = findall(letters_re, sort_dict_alnum_english_key(a))
        b = findall(letters_re, sort_dict_alnum_english_key(b))

        # the first differing letter decides
        for char_a, char_b in zip(a, b):
            diff = letter_values[char_a] - letter_values[char_b]
            if diff != 0:
                return diff

        # one is a prefix of the other: the shorter one sorts first
        return (len(a) > len(b)) - (len(a) < len(b))

    return cmp_to_key(sort_dict_alnum_vulcan)
175
56
-
 
57
class VulcanDictionary(Dictionary):
    '''
    Dictionary that can also translate phrases carrying one of the
    module-level ``prepositions`` as a prefix or the plural suffix 'lar'.
    '''

    def translate (self, phrase, search_prefix=True, search_plural=True):
        '''
        Look up ``phrase`` and return its translation, or None.

        phrase        -- text to translate; the lookup is done on its
                         lower-cased form
        search_prefix -- if there is no direct match, try to split off a
                         leading preposition
        search_plural -- if there is still no match, try to strip a
                         trailing 'lar'

        Returns
          * the stored entry, with its 'vuh' item set to ``phrase``
            (note: this modifies the stored entry), for a direct match;
          * a list [prefix_entry, tail_entry] for a preposition match;
          * a copy of the singular entry with ' (pl.)' appended to its
            'en' item for a plural match;
          * None otherwise.
        '''
        translation = self.get(phrase.lower(), None)
        if translation is not None:
            translation['vuh'] = phrase
            return translation

        if search_prefix:
            # find prefix
            for preposition in prepositions:
                if not phrase.startswith(preposition):
                    continue

                # Direct lookup only: with prefix search enabled, a
                # preposition missing from the dictionary would match
                # itself again and recurse without end.
                prefix_translation = self.translate(
                    preposition, search_prefix=False, search_plural=False)
                if prefix_translation is None:
                    continue

                # Cut off the leading occurrence only; a regex sub() with
                # the unescaped preposition removed every occurrence.
                tail = phrase[len(preposition):]
                tail_translation = self.translate(tail, search_prefix=False)
                if tail_translation is not None:
                    return [prefix_translation, tail_translation]

        # A plain 'if': as 'elif' this was unreachable on a default call.
        if search_plural and phrase.endswith('lar'):
            # find plural
            head = phrase[:-len('lar')]
            head_translation = self.translate(
                head, search_prefix=False, search_plural=False)
            if head_translation is not None:
                # Copy so that the stored singular entry stays unchanged.
                head_translation = dict(head_translation)
                head_translation['en'] += ' (pl.)'
                return head_translation

        return None


def translate (phrase):
    '''
    Return the entry of the module-level ``dictionary`` for the
    lower-cased ``phrase`` (exact match only), or None if there is none.
    '''
    return dictionary.get(phrase.lower(), None)
182
89
183
if __name__ == '__main__':
    # Command-line entry point: translate the text given as the first
    # argument, phrase by phrase, logging every step through dmsg().
    if len(argv) < 2:
        print('Nothing to translate.', end='\n\n', file=stderr)
        cli_help()
        exit(1)

    text = argv[1]

    dictionary = VulcanDictionary(dictionary)
    dictionary.load('vuh-gol-en.dict.zdb.txt')
    dictionary.clean()

#     try:
#         for phrase, data in OrderedDict(sorted(
#             dictionary.items(),
#             key=get_sort_dict_alnum_vulcan_key()
#         )).items():
#             print(phrase, "=", data)
#     except BrokenPipeError:
#         pass

    dmsg("text:", text, min_level=2)
    sentences = findall(r'(?!\s+)(?:.+?\.{1,3}|.+$)', text, DOTALL)
    dmsg("sentences:", sentences, min_level=2)
    for sentence in sentences:
        dmsg("sentence:", sentence, min_level=2)

        clauses = findall(r'(?!\s+)(?:.+?(?:\s+-\s*|\s*[–—]\s*|\.{1,3}|.+$))', sentence, DOTALL)
        dmsg("clauses:", clauses, min_level=2)
        for clause in clauses:
            dmsg("clause:", clause, min_level=2)

            words = findall(r'[^\s.]+', clause)
            dmsg("words:", words, min_level=2)

            offset = 0
            while offset < len(words):
                translation = None

                # Longest match first: try words[offset:] and shorten the
                # candidate phrase by one word at a time.
                for i in range(len(words), offset, -1):
                    dmsg("words[{0}:{1}] = {2}".format(offset, i, words[offset:i]), min_level=2)
                    phrase = ' '.join(words[offset:i])

                    dmsg("phrase:", phrase, min_level=2)

                    translation = dictionary.translate(phrase)

                    if translation is not None:
                        dmsg("phrase-translation:", translation, min_level=2)
                        dmsg("words[{0}:{1}] = [\"{2}\"]".format(offset, i, translation), min_level=2)
                        words[offset:i] = [translation]
                        break

                if translation is None:
                    dmsg("phrase-translation:", translation, min_level=2)

                # Advance by exactly one element in both cases: a matched
                # phrase has just been collapsed into the single element
                # at ``offset``, so jumping to ``i`` would skip the
                # not-yet-translated words that moved up behind it.
                offset += 1

            dmsg("words-translation:", words, min_level=2)
239
 
149