blob: 410fd349a51815343f7d3da17f0fdd8abee16ff6 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
|
import static org.gradle.api.tasks.PathSensitivity.RELATIVE;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipInputStream;
import org.gradle.api.DefaultTask;
import org.gradle.api.tasks.CacheableTask;
import org.gradle.api.tasks.Input;
import org.gradle.api.tasks.InputFile;
import org.gradle.api.tasks.OutputFile;
import org.gradle.api.tasks.PathSensitive;
import org.gradle.api.tasks.TaskAction;
/**
 * Task to generate a words-list XML file from an AOSP words-list file.
 * https://android.googlesource.com/platform/packages/inputmethods/LatinIME/+/master/dictionaries/
 *
 * <p>The input is read as UTF-8 text, one entry per line. Files whose name ends with
 * {@code .gz} are GZIP-decompressed on the fly; for files whose name ends with {@code .zip}
 * the first entry of the archive is read. Lines that do not match the expected
 * {@code word=...,f=...} shape are skipped.
 */
@CacheableTask
public class GenerateWordsListFromAOSPTask extends DefaultTask {
    /** Matches lines such as {@code word=heh,f=0,flags=,originalFreq=53,possibly_offensive=true}. */
    private static final Pattern WORD_LINE_PATTERN =
            Pattern.compile("^\\s*word=([\\w\\p{L}'\"-]+),f=(\\d+).*$");

    /** A progress marker is printed once per this many accepted words. */
    private static final int PROGRESS_REPORT_INTERVAL = 50000;

    private File inputFile;
    private File outputWordsListFile;
    private int maxWordsInList = 500000;

    /**
     * Parses the input file and hands every matched word, with its frequency, to a
     * {@link WordListWriter} created for the output file. Parsing stops early once
     * {@code maxWordsInList} words have been written.
     *
     * @throws IllegalArgumentException if the input file is unset or not a regular file, or if
     *     the output file is unset.
     * @throws IOException if reading the input fails, or if a {@code .zip} input has no entries.
     */
    @TaskAction
    public void generateWordsList() throws IOException {
        if (inputFile == null) {
            throw new IllegalArgumentException("Please provide inputFile value.");
        }
        if (!inputFile.isFile()) {
            throw new IllegalArgumentException("inputFile must be a file!");
        }
        if (outputWordsListFile == null) {
            throw new IllegalArgumentException("Please provide outputWordsListFile value.");
        }

        final long inputSize = inputFile.length();
        System.out.println(
                "Reading input file " + inputFile.getName() + " (size " + inputSize + ")...");

        // The raw stream is declared as its own resource so it is closed even when building the
        // decompressing wrapper fails (e.g. a corrupt GZIP header). Closing it twice is harmless.
        try (InputStream rawInput = new FileInputStream(inputFile);
                BufferedReader reader =
                        new BufferedReader(
                                new InputStreamReader(
                                        decompressIfNeeded(rawInput, inputFile.getName()),
                                        Charset.forName("UTF-8")));
                WordListWriter wordListWriter = new WordListWriter(outputWordsListFile)) {
            long charsRead = 0;
            long wordsWritten = 0;
            String wordDataLine;
            while (null != (wordDataLine = reader.readLine())) {
                charsRead += wordDataLine.length();
                // word=heh,f=0,flags=,originalFreq=53,possibly_offensive=true
                final Matcher matcher = WORD_LINE_PATTERN.matcher(wordDataLine);
                if (!matcher.matches()) {
                    continue;
                }

                final String word = matcher.group(1);
                final int frequency = Integer.parseInt(matcher.group(2));
                wordListWriter.addEntry(word, frequency);

                if ((wordsWritten % PROGRESS_REPORT_INTERVAL) == 0) {
                    System.out.print("." + progressPercent(charsRead, inputSize) + "%.");
                }
                wordsWritten++;
                if (maxWordsInList == wordsWritten) {
                    System.out.println("!!!!");
                    System.out.println(
                            "Reached " + maxWordsInList + " words! Breaking parsing.");
                    break;
                }
            }
            System.out.print(".100%.");
        }
        System.out.println("Done.");
    }

    /**
     * Wraps {@code rawInput} in a decompressing stream chosen by the file-name suffix.
     *
     * <p>A {@link ZipInputStream} yields no data until it has been positioned on an entry, so
     * for {@code .zip} files the stream is advanced to the first entry before it is returned.
     *
     * @param rawInput the opened input file stream.
     * @param fileName the input file name, used only to pick the decompression scheme.
     * @return a stream producing the words-list text.
     * @throws IOException if the compressed header cannot be read or the archive is empty.
     */
    private static InputStream decompressIfNeeded(InputStream rawInput, String fileName)
            throws IOException {
        if (fileName.endsWith(".zip")) {
            final ZipInputStream zipInput = new ZipInputStream(rawInput);
            if (zipInput.getNextEntry() == null) {
                throw new IOException("Zip file " + fileName + " does not contain any entry.");
            }
            return zipInput;
        }
        if (fileName.endsWith(".gz")) {
            return new GZIPInputStream(rawInput);
        }
        return rawInput;
    }

    /**
     * Computes an approximate progress percentage for console output.
     *
     * <p>The value is capped at 100 because {@code charsRead} counts decoded characters while
     * {@code totalSize} is the on-disk size, which is smaller for compressed inputs.
     */
    private static long progressPercent(long charsRead, long totalSize) {
        if (totalSize <= 0) {
            return 100;
        }
        return Math.min(100, (100 * charsRead) / totalSize);
    }

    /** Returns the AOSP words-list file to parse (plain text, {@code .gz} or {@code .zip}). */
    @InputFile
    @PathSensitive(RELATIVE)
    public File getInputFile() {
        return inputFile;
    }

    /** Sets the AOSP words-list file to parse. */
    public void setInputFile(File inputFile) {
        this.inputFile = inputFile;
    }

    /** Returns the file the generated words list is written to. */
    @OutputFile
    public File getOutputWordsListFile() {
        return outputWordsListFile;
    }

    /** Sets the file the generated words list is written to. */
    public void setOutputWordsListFile(File outputWordsListFile) {
        this.outputWordsListFile = outputWordsListFile;
    }

    /** Returns the number of written words at which parsing stops (500000 unless changed). */
    @Input
    public int getMaxWordsInList() {
        return maxWordsInList;
    }

    /** Sets the number of written words at which parsing stops. */
    public void setMaxWordsInList(int maxWordsInList) {
        this.maxWordsInList = maxWordsInList;
    }
}
|