aboutsummaryrefslogtreecommitdiff
path: root/buildSrc/src/main/java/GenerateWordsListFromAOSPTask.java
diff options
context:
space:
mode:
Diffstat (limited to 'buildSrc/src/main/java/GenerateWordsListFromAOSPTask.java')
-rw-r--r--buildSrc/src/main/java/GenerateWordsListFromAOSPTask.java109
1 files changed, 109 insertions, 0 deletions
diff --git a/buildSrc/src/main/java/GenerateWordsListFromAOSPTask.java b/buildSrc/src/main/java/GenerateWordsListFromAOSPTask.java
new file mode 100644
index 000000000..1f03427ea
--- /dev/null
+++ b/buildSrc/src/main/java/GenerateWordsListFromAOSPTask.java
@@ -0,0 +1,109 @@
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.ZipInputStream;
+import org.gradle.api.DefaultTask;
+import org.gradle.api.tasks.Input;
+import org.gradle.api.tasks.InputFile;
+import org.gradle.api.tasks.OutputFile;
+import org.gradle.api.tasks.TaskAction;
+
+/**
+ * Task to generate words-list XML file from a AOSP words-list file.
+ * https://android.googlesource.com/platform/packages/inputmethods/LatinIME/+/master/dictionaries/
+ */
+public class GenerateWordsListFromAOSPTask extends DefaultTask {
+ private static final Pattern mWordLineRegex =
+ Pattern.compile("^\\s*word=([\\w\\p{L}'\"-]+),f=(\\d+).*$");
+
+ private File inputFile;
+ private File outputWordsListFile;
+ private int maxWordsInList = 500000;
+
+ @TaskAction
+ public void generateWordsList() throws IOException {
+ if (inputFile == null) {
+ throw new IllegalArgumentException("Please provide inputFile value.");
+ }
+ if (!inputFile.isFile()) throw new IllegalArgumentException("inputFile must be a file!");
+ if (outputWordsListFile == null) {
+ throw new IllegalArgumentException("Please provide outputWordsListFile value.");
+ }
+
+ final long inputSize = inputFile.length();
+ System.out.println(
+ "Reading input file " + inputFile.getName() + " (size " + inputSize + ")...");
+
+ InputStream fileInput = new FileInputStream(inputFile);
+ if (inputFile.getName().endsWith(".zip")) {
+ fileInput = new ZipInputStream(fileInput);
+ } else if (inputFile.getName().endsWith(".gz")) {
+ fileInput = new GZIPInputStream(fileInput);
+ }
+ BufferedReader reader =
+ new BufferedReader(new InputStreamReader(fileInput, Charset.forName("UTF-8")));
+ String wordDataLine;
+
+ try (WordListWriter wordListWriter = new WordListWriter(outputWordsListFile)) {
+ long read = 0;
+ long wordsWritten = 0;
+ while (null != (wordDataLine = reader.readLine())) {
+ read += wordDataLine.length();
+ // word=heh,f=0,flags=,originalFreq=53,possibly_offensive=true
+ Matcher matcher = mWordLineRegex.matcher(wordDataLine);
+ if (matcher.matches()) {
+ String word = matcher.group(1);
+ int frequency = Integer.parseInt(matcher.group(2));
+ wordListWriter.addEntry(word, frequency);
+ if ((wordsWritten % 50000) == 0) {
+ System.out.print("." + ((100 * read) / inputSize) + "%.");
+ }
+ wordsWritten++;
+ if (maxWordsInList == wordsWritten) {
+ System.out.println("!!!!");
+ System.out.println(
+ "Reached " + maxWordsInList + " words! Breaking parsing.");
+ break;
+ }
+ }
+ }
+ System.out.print(".100%.");
+ }
+
+ System.out.println("Done.");
+ }
+
+ @InputFile
+ public File getInputFile() {
+ return inputFile;
+ }
+
+ public void setInputFile(File inputFile) {
+ this.inputFile = inputFile;
+ }
+
+ @OutputFile
+ public File getOutputWordsListFile() {
+ return outputWordsListFile;
+ }
+
+ public void setOutputWordsListFile(File outputWordsListFile) {
+ this.outputWordsListFile = outputWordsListFile;
+ }
+
+ @Input
+ public int getMaxWordsInList() {
+ return maxWordsInList;
+ }
+
+ public void setMaxWordsInList(int maxWordsInList) {
+ this.maxWordsInList = maxWordsInList;
+ }
+}