1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
|
/*
* Copyright (C) 2009 The Android Open Source Project
* Copyright (C) 2016 AnySoftKeyboard
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/** Compresses a list of words and frequencies into a tree structured binary dictionary. */
class MakeBinaryDictionary {
private static final int ALPHA_SIZE = 256;
private static final String TAG_WORD = "w";
private static final String ATTR_FREQ = "f";
private static final CharNode EMPTY_NODE = new CharNode();
private static final int CHAR_WIDTH = 8;
private static final int FLAGS_WIDTH = 1; // Terminal flag (word end)
private static final int ADDR_WIDTH = 23; // Offset to children
private static final int FREQ_WIDTH_BYTES = 1;
private static final int COUNT_WIDTH_BYTES = 1;
private static final int FLAG_ADDRESS_MASK = 0x400000;
private static final int FLAG_TERMINAL_MASK = 0x800000;
private static final int ADDRESS_MASK = 0x3FFFFF;
private final String srcFilename;
private final String destFilename;
private List<CharNode> roots;
private Map<String, Integer> mDictionary;
private int mWordCount;
private byte[] dict;
private int dictSize;
private int nullChildrenCount = 0;
private int notTerminalCount = 0;
public MakeBinaryDictionary(String srcFilename, String destFilename) {
this.srcFilename = srcFilename;
this.destFilename = destFilename;
}
public void makeDictionary() throws ParserConfigurationException, SAXException, IOException {
populateDictionary(srcFilename);
writeToDict(destFilename);
}
private void populateDictionary(String filename)
throws IOException, SAXException, ParserConfigurationException {
roots = new ArrayList<>();
SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
parser.parse(
new File(filename),
new DefaultHandler() {
boolean inWord;
int freq;
StringBuilder wordBuilder = new StringBuilder(48);
@Override
public void startElement(
String uri, String localName, String qName, Attributes attributes) {
if (qName.equals(TAG_WORD)) {
inWord = true;
freq = Integer.parseInt(attributes.getValue(0));
wordBuilder.setLength(0);
}
}
@Override
public void characters(char[] data, int offset, int length) {
// Ignore other whitespace
if (!inWord) return;
wordBuilder.append(data, offset, length);
}
@Override
public void endElement(String uri, String localName, String qName) {
if (qName.equals(TAG_WORD)) {
if (wordBuilder.length() > 1) {
addWordTop(wordBuilder.toString(), freq);
mWordCount++;
}
inWord = false;
}
}
});
System.out.println("Nodes = " + CharNode.sNodes);
}
private int indexOf(List<CharNode> children, char c) {
if (children == null) {
return -1;
}
for (int i = 0; i < children.size(); i++) {
if (children.get(i).data == c) {
return i;
}
}
return -1;
}
private void addWordTop(String word, int occur) {
if (occur > 255) occur = 255;
char firstChar = word.charAt(0);
int index = indexOf(roots, firstChar);
if (index == -1) {
CharNode newNode = new CharNode();
newNode.data = firstChar;
newNode.freq = occur;
index = roots.size();
roots.add(newNode);
} else {
roots.get(index).freq += occur;
}
if (word.length() > 1) {
addWordRec(roots.get(index), word, 1, occur);
} else {
roots.get(index).terminal = true;
}
}
private void addWordRec(CharNode parent, String word, int charAt, int occur) {
CharNode child = null;
char data = word.charAt(charAt);
if (parent.children == null) {
parent.children = new ArrayList<>();
} else {
for (int i = 0; i < parent.children.size(); i++) {
CharNode node = parent.children.get(i);
if (node.data == data) {
child = node;
break;
}
}
}
if (child == null) {
child = new CharNode();
parent.children.add(child);
}
child.data = data;
if (child.freq == 0) child.freq = occur;
if (word.length() > charAt + 1) {
addWordRec(child, word, charAt + 1, occur);
} else {
child.terminal = true;
child.freq = occur;
}
}
private void addCount(int count) {
dict[dictSize++] = (byte) (0xFF & count);
}
private void addNode(CharNode node) {
int charData = 0xFFFF & node.data;
if (charData > 254) {
dict[dictSize++] = (byte) 255;
dict[dictSize++] = (byte) ((node.data >> 8) & 0xFF);
dict[dictSize++] = (byte) (node.data & 0xFF);
} else {
dict[dictSize++] = (byte) (0xFF & node.data);
}
if (node.children != null) {
dictSize += 3; // Space for children address
} else {
dictSize += 1; // Space for just the terminal/address flags
}
if ((0xFFFFFF & node.freq) > 255) {
node.freq = 255;
}
if (node.terminal) {
byte freq = (byte) (0xFF & node.freq);
dict[dictSize++] = freq;
}
}
private void updateNodeAddress(int nodeAddress, CharNode node, int childrenAddress) {
if ((dict[nodeAddress] & 0xFF) == 0xFF) { // 3 byte character
nodeAddress += 2;
}
childrenAddress = ADDRESS_MASK & childrenAddress;
if (childrenAddress == 0) {
nullChildrenCount++;
} else {
childrenAddress |= FLAG_ADDRESS_MASK;
}
if (node.terminal) {
childrenAddress |= FLAG_TERMINAL_MASK;
} else {
notTerminalCount++;
}
dict[nodeAddress + 1] = (byte) (childrenAddress >> 16);
if ((childrenAddress & FLAG_ADDRESS_MASK) != 0) {
dict[nodeAddress + 2] = (byte) ((childrenAddress & 0xFF00) >> 8);
dict[nodeAddress + 3] = (byte) ((childrenAddress & 0xFF));
}
}
void writeWordsRec(List<CharNode> children) {
if (children == null || children.size() == 0) {
return;
}
final int childCount = children.size();
addCount(childCount);
// int childrenStart = dictSize;
int[] childrenAddresses = new int[childCount];
for (int j = 0; j < childCount; j++) {
CharNode node = children.get(j);
childrenAddresses[j] = dictSize;
addNode(node);
}
for (int j = 0; j < childCount; j++) {
CharNode node = children.get(j);
int nodeAddress = childrenAddresses[j];
int cacheDictSize = dictSize;
writeWordsRec(node.children);
updateNodeAddress(nodeAddress, node, node.children != null ? cacheDictSize : 0);
}
}
private void writeToDict(String dictFilename) throws IOException {
// 4MB max, 22-bit offsets
dict = new byte[4 * 1024 * 1024];
dictSize = 0;
writeWordsRec(roots);
System.out.println("Dict Size = " + dictSize);
try (FileOutputStream fos = new FileOutputStream(dictFilename)) {
fos.write(dict, 0, dictSize);
}
}
private static class CharNode {
static int sNodes;
char data;
int freq;
boolean terminal;
List<CharNode> children;
CharNode() {
sNodes++;
}
}
}
|