blob: b0601e185258322d2697ddacd2e3a4905748789d (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
|
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import org.gradle.api.DefaultTask;
import org.gradle.api.tasks.Input;
import org.gradle.api.tasks.InputFiles;
import org.gradle.api.tasks.OutputFile;
import org.gradle.api.tasks.TaskAction;
import org.jsoup.Jsoup;
/**
 * Task to generate a words-list XML file from a set of input files.
 *
 * <p>Inputs whose name ends with {@code .txt} are handed to the {@code Parser} as-is. Inputs whose
 * name ends with {@code .html} or {@code .htm} are first reduced to their text content with Jsoup
 * and written to a temporary UTF-8 text file, which is what the parser then reads. Any other file
 * is skipped with a message on standard output.
 */
public class GenerateWordsListTask extends DefaultTask {
    /**
     * Collects the usable input files, makes sure the output folder exists and runs the parser.
     *
     * @throws IllegalArgumentException if the output folder does not exist and cannot be created
     * @throws Exception if reading an input, writing a temporary file or parsing fails
     */
    @TaskAction
    public void generateWordsList() throws Exception {
        final List<File> inputTextFiles = new ArrayList<>();
        // Temporary files created for HTML inputs; removed once parsing is done (or has failed).
        final List<File> temporaryFiles = new ArrayList<>();
        try {
            for (File it : inputFiles) {
                final String fileName = it.getName();
                if (fileName.endsWith(".html") || fileName.endsWith(".htm")) {
                    File wordsInputFile = File.createTempFile(fileName + "_stripped_html_", ".txt");
                    // Registered before writing so it is cleaned up even if the write fails.
                    temporaryFiles.add(wordsInputFile);
                    String inputText = Jsoup.parse(it, "UTF-8").text();
                    // try-with-resources closes (and flushes) the writer on failure as well.
                    try (Writer writer =
                            new OutputStreamWriter(
                                    new FileOutputStream(wordsInputFile),
                                    Charset.forName("UTF-8"))) {
                        writer.write(inputText);
                    }
                    inputTextFiles.add(wordsInputFile);
                } else if (fileName.endsWith(".txt")) {
                    inputTextFiles.add(it);
                } else {
                    System.out.println(
                            "Skipping file "
                                    + it.getAbsolutePath()
                                    + ", since it's not txt or html file.");
                }
            }

            // getParentFile() is null when the output path has no parent component; in that case
            // there is no folder to create.
            final File parentFile = outputWordsListFile.getParentFile();
            if (parentFile != null && !parentFile.exists() && !parentFile.mkdirs()) {
                throw new IllegalArgumentException(
                        "Failed to create output folder " + parentFile.getAbsolutePath());
            }

            Parser parser =
                    new Parser(
                            inputTextFiles,
                            outputWordsListFile,
                            wordCharacters,
                            locale,
                            additionalInnerCharacters,
                            maxWordsInList,
                            maxWordFrequency);
            parser.parse();
        } finally {
            for (File temporaryFile : temporaryFiles) {
                // Fall back to JVM-exit cleanup if the file cannot be removed right now.
                if (!temporaryFile.delete()) {
                    temporaryFile.deleteOnExit();
                }
            }
        }
    }

    /** Returns the files to read words from; declared to Gradle via {@code @InputFiles}. */
    @InputFiles
    public File[] getInputFiles() {
        return inputFiles;
    }

    /** Sets the files to read words from ({@code .txt}, {@code .html} or {@code .htm}). */
    public void setInputFiles(File[] inputFiles) {
        this.inputFiles = inputFiles;
    }

    /** Returns the file the words list is written to; declared via {@code @OutputFile}. */
    @OutputFile
    public File getOutputWordsListFile() {
        return outputWordsListFile;
    }

    /** Sets the file the words list is written to. */
    public void setOutputWordsListFile(File outputWordsListFile) {
        this.outputWordsListFile = outputWordsListFile;
    }

    /** Returns the word characters passed to the parser. */
    @Input
    public char[] getWordCharacters() {
        return wordCharacters;
    }

    /** Sets the word characters passed to the parser. */
    public void setWordCharacters(char[] wordCharacters) {
        this.wordCharacters = wordCharacters;
    }

    /** Returns the additional inner characters passed to the parser. */
    @Input
    public char[] getAdditionalInnerCharacters() {
        return additionalInnerCharacters;
    }

    /** Sets the additional inner characters passed to the parser. */
    public void setAdditionalInnerCharacters(char[] additionalInnerCharacters) {
        this.additionalInnerCharacters = additionalInnerCharacters;
    }

    /** Returns the locale passed to the parser. */
    @Input
    public Locale getLocale() {
        return locale;
    }

    /** Sets the locale passed to the parser. */
    public void setLocale(Locale locale) {
        this.locale = locale;
    }

    /** Returns the maximum word frequency passed to the parser. */
    @Input
    public int getMaxWordFrequency() {
        return maxWordFrequency;
    }

    /** Sets the maximum word frequency passed to the parser. */
    public void setMaxWordFrequency(int frequency) {
        maxWordFrequency = frequency;
    }

    /** Returns the maximum number of words in the list, as passed to the parser. */
    @Input
    public int getMaxWordsInList() {
        return maxWordsInList;
    }

    /** Sets the maximum number of words in the list, as passed to the parser. */
    public void setMaxWordsInList(int maxWordsInList) {
        this.maxWordsInList = maxWordsInList;
    }

    private File[] inputFiles;
    private File outputWordsListFile;
    // Defaults to the ASCII letters A-Z and a-z.
    private char[] wordCharacters =
            "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz".toCharArray();
    // Defaults to the apostrophe only.
    private char[] additionalInnerCharacters = "'".toCharArray();
    private Locale locale = Locale.US;
    // Integer.MAX_VALUE is the default, i.e. no explicit cap is configured.
    private int maxWordsInList = Integer.MAX_VALUE;
    private int maxWordFrequency = 64;
}
|