aboutsummaryrefslogtreecommitdiff
path: root/buildSrc/src/main/java/GenerateWordsListTask.java
blob: b0601e185258322d2697ddacd2e3a4905748789d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import org.gradle.api.DefaultTask;
import org.gradle.api.tasks.Input;
import org.gradle.api.tasks.InputFiles;
import org.gradle.api.tasks.OutputFile;
import org.gradle.api.tasks.TaskAction;
import org.jsoup.Jsoup;

/**
 * Task to generate a words-list XML file from a set of input files.
 *
 * <p>Inputs whose name ends with {@code .txt} are handed to {@code Parser} as-is. Inputs whose
 * name ends with {@code .html} or {@code .htm} are first reduced to their text content (via
 * Jsoup) and written to a temporary {@code .txt} file, which is removed once parsing finishes.
 * Any other input is skipped with a message on standard output.
 */
public class GenerateWordsListTask extends DefaultTask {
    /**
     * Collects the text inputs, makes sure the output folder exists and runs the {@code Parser}.
     *
     * @throws IllegalArgumentException if the output folder does not exist and can not be created
     * @throws Exception if reading an input, writing a temporary file or the parser itself fails
     */
    @TaskAction
    public void generateWordsList() throws Exception {
        final List<File> inputTextFiles = new ArrayList<>();
        // Temporary files created here; they are removed once parsing is done (or has failed).
        final List<File> temporaryFiles = new ArrayList<>();
        try {
            for (File it : inputFiles) {
                final String name = it.getName();
                if (name.endsWith(".html") || name.endsWith(".htm")) {
                    final File wordsInputFile =
                            File.createTempFile(name + "_stripped_html_", ".txt");
                    // Register before writing, so a failed write is still cleaned up.
                    temporaryFiles.add(wordsInputFile);
                    writeStrippedHtml(it, wordsInputFile);
                    inputTextFiles.add(wordsInputFile);
                } else if (name.endsWith(".txt")) {
                    inputTextFiles.add(it);
                } else {
                    System.out.println(
                            "Skipping file "
                                    + it.getAbsolutePath()
                                    + ", since it's not txt or html file.");
                }
            }

            // getParentFile() is null when the output path has no parent component; in that
            // case there is no folder to create.
            final File parentFile = outputWordsListFile.getParentFile();
            // mkdirs() also returns false when the folder already exists, hence the
            // isDirectory() re-check (which also covers a concurrent creation).
            if (parentFile != null && !parentFile.mkdirs() && !parentFile.isDirectory()) {
                throw new IllegalArgumentException(
                        "Failed to create output folder " + parentFile.getAbsolutePath());
            }
            Parser parser =
                    new Parser(
                            inputTextFiles,
                            outputWordsListFile,
                            wordCharacters,
                            locale,
                            additionalInnerCharacters,
                            maxWordsInList,
                            maxWordFrequency);
            parser.parse();
        } finally {
            // NOTE(review): assumes Parser does not need its input files after parse() returns.
            for (File temporaryFile : temporaryFiles) {
                if (!temporaryFile.delete()) {
                    // Best effort fallback; do not mask the original outcome of the task.
                    temporaryFile.deleteOnExit();
                }
            }
        }
    }

    /**
     * Extracts the text of {@code htmlFile} (read as UTF-8) and writes it, as UTF-8, into {@code
     * target}.
     *
     * @param htmlFile the HTML document to strip
     * @param target the file that receives the plain text
     * @throws IOException if the HTML can not be read or the text can not be written
     */
    private static void writeStrippedHtml(File htmlFile, File target) throws IOException {
        final String inputText = Jsoup.parse(htmlFile, UTF_8.name()).text();
        // try-with-resources closes (and therefore flushes) the writer even if write() throws.
        try (Writer writer = new OutputStreamWriter(new FileOutputStream(target), UTF_8)) {
            writer.write(inputText);
        }
    }

    /** Returns the files to read words from. */
    @InputFiles
    public File[] getInputFiles() {
        return inputFiles;
    }

    /** Sets the files to read words from; see the class description for supported extensions. */
    public void setInputFiles(File[] inputFiles) {
        this.inputFiles = inputFiles;
    }

    /** Returns the file that the generated words-list is written to. */
    @OutputFile
    public File getOutputWordsListFile() {
        return outputWordsListFile;
    }

    /** Sets the file that the generated words-list is written to. */
    public void setOutputWordsListFile(File outputWordsListFile) {
        this.outputWordsListFile = outputWordsListFile;
    }

    /** Returns the word characters passed to the parser; defaults to the ASCII letters. */
    @Input
    public char[] getWordCharacters() {
        return wordCharacters;
    }

    /** Sets the word characters passed to the parser. */
    public void setWordCharacters(char[] wordCharacters) {
        this.wordCharacters = wordCharacters;
    }

    /** Returns the additional inner characters passed to the parser; defaults to {@code '}. */
    @Input
    public char[] getAdditionalInnerCharacters() {
        return additionalInnerCharacters;
    }

    /** Sets the additional inner characters passed to the parser. */
    public void setAdditionalInnerCharacters(char[] additionalInnerCharacters) {
        this.additionalInnerCharacters = additionalInnerCharacters;
    }

    /** Returns the locale passed to the parser; defaults to {@link Locale#US}. */
    @Input
    public Locale getLocale() {
        return locale;
    }

    /** Sets the locale passed to the parser. */
    public void setLocale(Locale locale) {
        this.locale = locale;
    }

    /** Returns the maximum word frequency passed to the parser; defaults to 64. */
    @Input
    public int getMaxWordFrequency() {
        return maxWordFrequency;
    }

    /** Sets the maximum word frequency passed to the parser. */
    public void setMaxWordFrequency(int frequency) {
        maxWordFrequency = frequency;
    }

    /**
     * Returns the maximum number of words passed to the parser; defaults to {@link
     * Integer#MAX_VALUE}.
     */
    @Input
    public int getMaxWordsInList() {
        return maxWordsInList;
    }

    /** Sets the maximum number of words passed to the parser. */
    public void setMaxWordsInList(int maxWordsInList) {
        this.maxWordsInList = maxWordsInList;
    }

    /** Charset used both to read the HTML inputs and to write the stripped text. */
    private static final Charset UTF_8 = Charset.forName("UTF-8");

    private File[] inputFiles;
    private File outputWordsListFile;
    private char[] wordCharacters =
            "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz".toCharArray();
    private char[] additionalInnerCharacters = "'".toCharArray();
    private Locale locale = Locale.US;
    private int maxWordsInList = Integer.MAX_VALUE;
    private int maxWordFrequency = 64;
}