aboutsummaryrefslogtreecommitdiff
path: root/buildSrc/src/main/java/GenerateWordsListFromAOSPTask.java
blob: 410fd349a51815343f7d3da17f0fdd8abee16ff6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import static org.gradle.api.tasks.PathSensitivity.RELATIVE;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipInputStream;
import org.gradle.api.DefaultTask;
import org.gradle.api.tasks.CacheableTask;
import org.gradle.api.tasks.Input;
import org.gradle.api.tasks.InputFile;
import org.gradle.api.tasks.OutputFile;
import org.gradle.api.tasks.PathSensitive;
import org.gradle.api.tasks.TaskAction;

/**
 * Task to generate a words-list XML file from an AOSP words-list file.
 * https://android.googlesource.com/platform/packages/inputmethods/LatinIME/+/master/dictionaries/
 *
 * <p>The input file is read as UTF-8 text. If its name ends with {@code .gz} it is read through
 * a GZIP stream; if its name ends with {@code .zip} the first entry of the archive is read.
 * Every line of the form {@code word=<word>,f=<frequency>,...} is passed to a {@code
 * WordListWriter} targeting the output file, until the input ends or {@code maxWordsInList}
 * entries have been written.
 */
@CacheableTask
public class GenerateWordsListFromAOSPTask extends DefaultTask {
    /** Matches a word line; group 1 is the word, group 2 is its frequency. */
    private static final Pattern WORD_LINE_PATTERN =
            Pattern.compile("^\\s*word=([\\w\\p{L}'\"-]+),f=(\\d+).*$");

    /** A progress marker is printed once per this many written words. */
    private static final int PROGRESS_REPORT_INTERVAL = 50000;

    private File inputFile;
    private File outputWordsListFile;
    private int maxWordsInList = 500000;

    /**
     * Reads the input file line by line and writes every matching word entry to the output file.
     *
     * @throws IllegalArgumentException if the input file is missing or is not a regular file, or
     *     if no output file was provided.
     * @throws IOException if reading the input or writing the output fails, or if a {@code .zip}
     *     input contains no entries.
     */
    @TaskAction
    public void generateWordsList() throws IOException {
        if (inputFile == null) {
            throw new IllegalArgumentException("Please provide inputFile value.");
        }
        if (!inputFile.isFile()) {
            throw new IllegalArgumentException("inputFile must be a file!");
        }
        if (outputWordsListFile == null) {
            throw new IllegalArgumentException("Please provide outputWordsListFile value.");
        }

        final long inputSize = inputFile.length();
        System.out.println(
                "Reading input file " + inputFile.getName() + " (size " + inputSize + ")...");

        // The raw stream is declared as its own resource so it is closed even if wrapping it in
        // a decompression stream fails (for example, on a corrupt GZIP header).
        try (InputStream rawInput = new FileInputStream(inputFile);
                BufferedReader reader =
                        new BufferedReader(
                                new InputStreamReader(
                                        openContent(inputFile.getName(), rawInput),
                                        Charset.forName("UTF-8")));
                WordListWriter wordListWriter = new WordListWriter(outputWordsListFile)) {
            long read = 0;
            long wordsWritten = 0;
            String wordDataLine;
            while (null != (wordDataLine = reader.readLine())) {
                read += wordDataLine.length();
                // word=heh,f=0,flags=,originalFreq=53,possibly_offensive=true
                Matcher matcher = WORD_LINE_PATTERN.matcher(wordDataLine);
                if (!matcher.matches()) {
                    continue;
                }

                String word = matcher.group(1);
                int frequency = Integer.parseInt(matcher.group(2));
                wordListWriter.addEntry(word, frequency);
                if ((wordsWritten % PROGRESS_REPORT_INTERVAL) == 0) {
                    // 'read' counts decoded characters while 'inputSize' is the on-disk byte
                    // count, so compressed inputs would overshoot; clamp the estimate to 100.
                    System.out.print("." + Math.min(100, (100 * read) / inputSize) + "%.");
                }
                wordsWritten++;
                if (maxWordsInList == wordsWritten) {
                    System.out.println("!!!!");
                    System.out.println("Reached " + maxWordsInList + " words! Breaking parsing.");
                    break;
                }
            }
            System.out.print(".100%.");
        }

        System.out.println("Done.");
    }

    /**
     * Wraps the raw file stream in a decompressing stream chosen by the file-name suffix.
     *
     * @param fileName name of the input file; only its suffix is inspected.
     * @param rawInput stream over the bytes of the input file.
     * @return a stream over the text content: positioned at the first entry for {@code .zip},
     *     GZIP-decoded for {@code .gz}, or {@code rawInput} itself otherwise.
     * @throws IOException if the compressed header cannot be read, or the zip has no entries.
     */
    private static InputStream openContent(String fileName, InputStream rawInput)
            throws IOException {
        if (fileName.endsWith(".zip")) {
            ZipInputStream zipInput = new ZipInputStream(rawInput);
            // A ZipInputStream yields no data until it has been positioned on an entry.
            if (zipInput.getNextEntry() == null) {
                throw new IOException("Zip file " + fileName + " does not contain any entries.");
            }
            return zipInput;
        }
        if (fileName.endsWith(".gz")) {
            return new GZIPInputStream(rawInput);
        }
        return rawInput;
    }

    /** Returns the AOSP words-list file to read (plain text, {@code .gz} or {@code .zip}). */
    @InputFile
    @PathSensitive(RELATIVE)
    public File getInputFile() {
        return inputFile;
    }

    /** Sets the AOSP words-list file to read. */
    public void setInputFile(File inputFile) {
        this.inputFile = inputFile;
    }

    /** Returns the file the generated words list is written to. */
    @OutputFile
    public File getOutputWordsListFile() {
        return outputWordsListFile;
    }

    /** Sets the file the generated words list is written to. */
    public void setOutputWordsListFile(File outputWordsListFile) {
        this.outputWordsListFile = outputWordsListFile;
    }

    /** Returns the number of written words after which parsing stops. */
    @Input
    public int getMaxWordsInList() {
        return maxWordsInList;
    }

    /** Sets the number of written words after which parsing stops. */
    public void setMaxWordsInList(int maxWordsInList) {
        this.maxWordsInList = maxWordsInList;
    }
}