Subversion Repositories LCARS

Rev

Rev 291 | Rev 293 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 291 Rev 292
Line -... Line 1...
-
 
1
#!/usr/bin/env python3
-
 
2
1
'''
3
'''
2
Created on 2014-10-20
4
Created on 2014-10-20
3

5

4
@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>
6
@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>
5
'''
7
'''
6
from sys import argv, stderr
8
from sys import argv, stderr
7
from re import findall, DOTALL, IGNORECASE, match, sub, compile, \
9
from re import findall, DOTALL, IGNORECASE, match, sub, compile, \
8
    split
10
    split
9
from os import chdir
11
from os import chdir, stat
10
from os.path import dirname, realpath
12
from os.path import dirname, realpath, basename
11
from collections import OrderedDict
13
from collections import OrderedDict
12
from functools import cmp_to_key
14
from functools import cmp_to_key
13
from copy import deepcopy
15
from copy import deepcopy
14
from collections.abc import MutableSequence
16
from collections.abc import MutableSequence
-
 
17
from pickle import dump, load
15
18
-
 
19
# Verbosity threshold: dmsg() prints a message only when this value is
# greater than or equal to the message's min_level.
debug_level = 2

# Phrase -> entry data; filled in place by load_dictionary().
dictionary = {}

# Prefixes that can be attached to a word, mapped to their English
# rendering.  NOTE(review): '|' presumably separates alternative
# translations -- confirm against the code that consumes this table.
prepositions = {
    "fi'": 'on',
    "na'": 'at|to',
    "t'": 'of',
}
23
27
-
 
28
def dmsg(*args, **kwargs):
    '''
    Print a debug message if the global debug_level permits it.

    Takes the same positional and keyword arguments as print(), plus:

    min_level -- lowest debug_level at which the message is printed;
                 a missing or None value is treated as 1.
    file      -- output stream; defaults to stderr when not given.
    '''
    # kwargs is a plain dict, so its keys must be inspected with dict
    # operations.  hasattr() looks at attributes and is never true for a
    # key, which made every message level 1 and always replaced `file`.
    min_level = kwargs.pop('min_level', None)
    if min_level is None:
        min_level = 1

    kwargs.setdefault('file', stderr)

    if debug_level >= min_level:
        print(*args, **kwargs)
-
 
38
24
class MutableString2(MutableSequence):
39
class MutableString2(MutableSequence):
25
    def __init__(self, value=None):
40
    def __init__(self, value=None):
26
        self._values = [str(value)] if value is not None else []
41
        self._values = [str(value)] if value is not None else []
27
42
28
    def __add__(self, value):
43
    def __add__(self, value):
Line 50... Line 65...
50
        self._values.append(values)
65
        self._values.append(values)
51
66
52
    def insert(self, index, value):
67
    def insert(self, index, value):
53
        raise NotImplementedError
68
        raise NotImplementedError
54
69
-
 
70
def cli_help():
    '''
    Print a one-line usage summary to standard output, naming the
    script by the base name of argv[0].
    '''
    script_name = basename(argv[0])
    usage = 'Usage: {0} TEXT...'.format(script_name)
    print(usage)
-
 
72
55
def _parse_dictionary_lines(lines, dictionary):
    '''
    Parse dictionary source lines into *dictionary* in place.

    A line matching "vuh: PHRASE" starts a new entry.  Lines of the form
    "KEY: VALUE" (KEY being one of ipa, en, lit, pos, com, tag, ex) add a
    field to the current entry.  A line indented exactly two characters
    deeper than the preceding KEY line is appended to that field,
    separated by a single space.
    '''
    keys = "ipa|en|lit|pos|com|tag|ex"

    # Compiled once here instead of being rebuilt for every line.
    phrase_re = compile(r'^\s*vuh:\s*(?P<phrase>.+)')
    field_re = compile(
        r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(keys))
    continuation_re = compile(r'(?P<indent>\s+)(?P<continuation>\S.*)')

    phrase = None
    key = None
    indent = None

    for line in lines:
        m = phrase_re.match(line)
        if m is not None:
            phrase = m.group("phrase")
            dictionary[phrase] = {}
            # A new entry ends any pending continuation context.
            indent = None
            continue

        if phrase is None:
            # Field or continuation line before the first "vuh:" line:
            # there is no entry to attach it to, so skip it rather than
            # fail on an unbound phrase.
            continue

        m = field_re.match(line)
        if m is not None:
            indent = m.group("indent")
            key = m.group("key")
            dictionary[phrase][key] = MutableString2(m.group("value"))
            continue

        if indent is None:
            continue

        m = continuation_re.match(line)
        if m is not None and len(m.group("indent")) == len(indent) + 2:
            dictionary[phrase][key] += (" " + m.group("continuation"))


def load_dictionary(dictionary, dictionary_file):
    '''
    Fill *dictionary* in place from *dictionary_file*, using a pickle
    cache when it is at least as new as the dictionary file.

    dictionary      -- mapping that receives one item per phrase.
    dictionary_file -- path of the dictionary source, resolved relative
                       to the directory of this script.

    Side effects: changes the process working directory to the directory
    containing this script; writes "<basename of dictionary_file>.pickle"
    there whenever the source file had to be parsed; reports progress
    through dmsg().

    Raises FileNotFoundError if *dictionary_file* does not exist.
    '''
    dmsg('Loading dictionary ', end='', min_level=1)

    # Both the dictionary file and its cache live next to the script.
    chdir(dirname(realpath(__file__)))

    pickle_file = basename(dictionary_file) + '.pickle'

    try:
        pickle_mtime = stat(pickle_file).st_mtime
    except FileNotFoundError:
        pickle_mtime = None

    if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:
        # No cache yet, or the source changed after the cache was written.
        dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)

        # NOTE(review): the file is decoded with the locale's default
        # encoding; confirm whether an explicit encoding is needed.
        with open(dictionary_file) as f:
            _parse_dictionary_lines(f, dictionary)

        dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)
        # TODO: Pickle should only contain strings to be small
        with open(pickle_file, mode='wb') as f:
            dump(dictionary, f)
        dmsg(' done.', min_level=1)
    else:
        dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)
        # SECURITY: unpickling can execute arbitrary code.  This is only
        # acceptable while the cache file is written by this script and
        # the script directory is not writable by untrusted users.
        with open(pickle_file, mode='rb') as f:
            cached = load(f)
        dictionary.update(cached)

    dmsg(' done ({0} entries).'.format(len(dictionary)), min_level=1)
86
122
87
def clean_dictionary(dictionary):
123
def clean_dictionary(dictionary):
88
    braces_re = compile(r'^\s*\{(.+)\}\s*$')
124
    braces_re = compile(r'^\s*\{(.+)\}\s*$')
89
    semicolon_re = compile(r'\s*;\s*')
125
    semicolon_re = compile(r'\s*;\s*')
90
126
Line 135... Line 171...
135
171
136
        return 1 if len(b) < len(a) else 0
172
        return 1 if len(b) < len(a) else 0
137
173
138
    return cmp_to_key(sort_dict_alnum_vulcan)
174
    return cmp_to_key(sort_dict_alnum_vulcan)
139
175
140
def translate(phrase):
    '''
    Look up *phrase* in the global dictionary, ignoring letter case.

    Returns the stored dictionary entry for the lower-cased phrase, or
    None if there is no such entry.
    '''
    return dictionary.get(phrase.lower())
-
 
162
182
163
if __name__ == '__main__':
183
if __name__ == '__main__':
-
 
184
    if len(argv) < 2:
-
 
185
        print('Nothing to translate.', end='\n\n', file=stderr)
-
 
186
        cli_help()
-
 
187
        exit(1)
-
 
188
-
 
189
    text = argv[1]
-
 
190
164
    load_dictionary(dictionary, 'vuh-gol-en.dict.zdb.txt')
191
    load_dictionary(dictionary, 'vuh-gol-en.dict.zdb.txt')
165
    clean_dictionary(dictionary)
192
    clean_dictionary(dictionary)
166
193
167
#     try:
194
#     try:
168
#         for phrase, data in OrderedDict(sorted(
195
#         for phrase, data in OrderedDict(sorted(
Line 171... Line 198...
171
#         )).items():
198
#         )).items():
172
#             print(phrase, "=", data)
199
#             print(phrase, "=", data)
173
#     except BrokenPipeError:
200
#     except BrokenPipeError:
174
#         pass
201
#         pass
175
202
176
    text = argv[1]
-
 
177
    sentences = findall(r'(?!\s+)(?:.+?\.{1,3}|.+$)', text, DOTALL)
203
    sentences = findall(r'(?!\s+)(?:.+?\.{1,3}|.+$)', text, DOTALL)
-
 
204
    dmsg("sentences:", sentences, min_level=2)
178
    for sentence in sentences:
205
    for sentence in sentences:
-
 
206
        dmsg("sentence:", sentence, min_level=2)
-
 
207
-
 
208
        clauses = split(r'\s+[-–—]\s+', sentence)
-
 
209
        dmsg("clauses:", clauses, min_level=2)
-
 
210
        for clause in clauses:
-
 
211
            dmsg("clause:", clause, min_level=2)
-
 
212
-
 
213
            words = findall(r'[^\s.]+', clause)
-
 
214
            dmsg("words:", words, min_level=2)
-
 
215
179
        print(sentence)
216
            offset = 0
-
 
217
            while offset < len(words):
-
 
218
                translation = None
-
 
219
-
 
220
                for i in reversed(range(offset + 1, len(words) + 1)):
-
 
221
                    phrase = ' '.join(words[offset:i])
180
222
181
        words = findall(r"(?!\s+)[a-z'-]{2,}", sentence, IGNORECASE)
223
                    dmsg("phrase:", phrase, min_level=2)
182
        print(words)
-
 
183
224
184
        translated_words = list(map(translate, words))
225
                    translation = translate(phrase)
185
        print(translated_words)
-
 
186
226
187
        for index, word in enumerate(words):
227
                    if translation is not None:
188
            sentence = sentence.replace(word, str(translated_words[index]))
228
                        dmsg("phrase-translation:", translation, min_level=2)
-
 
229
                        dmsg("words[{0}:{1}] = [\"{2}\"]".format(offset, i, translation), min_level=2)
-
 
230
                        words[offset:i] = [translation]
-
 
231
                        offset += i - 1
189
        print(sentence)
232
                        break
190
233
191
        # replace punctuation
234
                if translation is None:
192
        for symbol, replacement in ({" - ": ", "}).items():
235
                    dmsg("phrase-translation:", translation, min_level=2)
193
            sentence = sentence.replace(symbol, replacement)
236
                    offset += 1
194
237
195
        print(sentence)
238
            dmsg("words-translation:", words, min_level=2)