/* * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ package doccheckutils.checkers; import doccheckutils.FileChecker; import doccheckutils.Log; import java.io.*; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.nio.charset.CodingErrorAction; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.text.MessageFormat; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Checks the contents of an HTML file for bad/unmappable characters. *
* The file encoding is determined from the file contents.
*/
public class BadCharacterChecker implements FileChecker, AutoCloseable {
// Case-insensitive patterns; where they are applied is not visible in this part of the file.
// NOTE(review): each expression below consists solely of the "(?i)" flag and therefore
// matches the empty string. Judging by the field names they were presumably meant to match
// an HTML doctype declaration, a <meta charset=...> element and a
// <meta http-equiv="Content-Type" ...> element respectively -- the regex bodies look like
// they were lost; confirm against the upstream source before relying on them.
private static final Pattern doctype = Pattern.compile("(?i)");
private static final Pattern metaCharset = Pattern.compile("(?i)");
private static final Pattern metaContentType = Pattern.compile("(?i)");
// Receives every problem reported by checkFile.
private final Log errors;
// Number of calls made to checkFile (incremented once per call).
private int files = 0;
// Number of checkFile calls that logged at least one problem.
private int badFiles = 0;
/**
 * Creates a checker that records its findings in a newly created {@link Log}.
 */
public BadCharacterChecker() {
    this.errors = new Log();
}
/**
 * Scans one file for characters that could not be decoded.
 * <p>
 * The file is decoded with the charset returned by {@code getCharset}, using a
 * decoder configured to replace malformed and unmappable input rather than fail.
 * Every line is then searched for U+FFFD; a line containing at least one is
 * logged together with its line number and the number of occurrences.
 * I/O failures are logged as well. A file with any logged problem is counted
 * as a bad file.
 *
 * @param path the file to check
 */
public void checkFile(Path path) {
    files++;
    boolean failed = false;
    try (InputStream stream = new BufferedInputStream(Files.newInputStream(path))) {
        CharsetDecoder decoder = getCharset(stream).newDecoder();
        decoder.onMalformedInput(CodingErrorAction.REPLACE);
        decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
        BufferedReader reader = new BufferedReader(new InputStreamReader(stream, decoder));
        // Declared outside the inner try so a read failure can be reported
        // against the last line that was read successfully.
        int lineNumber = 0;
        try {
            for (String text = reader.readLine(); text != null; text = reader.readLine()) {
                lineNumber++;
                // U+FFFD is what the decoder substitutes for undecodable input.
                int badChars = (int) text.chars().filter(c -> c == 0xFFFD).count();
                if (badChars > 0) {
                    errors.log(path, lineNumber, "found %d invalid characters", badChars);
                    failed = true;
                }
            }
        } catch (IOException e) {
            errors.log(path, lineNumber, e);
            failed = true;
        }
    } catch (IOException e) {
        // Failure while opening the file or before any line was read.
        errors.log(path, e);
        failed = true;
    }
    if (failed) {
        badFiles++;
    }
}
@Override
public void checkFiles(List