/*
 * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
package doccheckutils;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * Reads an HTML file, and calls a series of {@link HtmlChecker HTML checkers}
 * for the HTML constructs found therein.
 *
 * <p>The file is scanned one character at a time. Text between tags is passed
 * to {@link HtmlChecker#content}, start and end tags to
 * {@link HtmlChecker#startElement} and {@link HtmlChecker#endElement}, and a
 * recognized DOCTYPE to {@link HtmlChecker#docType}. Markup that cannot be
 * recognized is logged as {@code "bad html"}, unless it appears after a
 * {@code <script>} start tag that has not yet been closed.
 */
public class HtmlFileChecker implements FileChecker {
    /** Matches the text of a DOCTYPE declaration between {@code <!} and {@code >}. */
    private static final Pattern DOCTYPE_HTML = Pattern.compile("(?is)doctype\\s+html\\s?.*");

    /** UTF-8 decoder that silently drops malformed and unmappable input. */
    private final CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
            .onMalformedInput(CodingErrorAction.IGNORE)
            .onUnmappableCharacter(CodingErrorAction.IGNORE);

    private final Log log;
    private final HtmlChecker htmlChecker;

    /** The file currently being read. */
    private Path path;
    /** The reader for the file currently being read. */
    private BufferedReader in;
    /** The most recently read character, or -1 at end of file. */
    private int ch;
    /** One more than the number of newline characters read so far in the current file. */
    private int lineNumber;
    /** Whether a {@code <script>} start tag has been seen without a matching end tag. */
    private boolean inScript;
    /** Whether the current file declared itself as XML or XHTML. */
    private boolean xml;

    /**
     * Creates a file checker that forwards the constructs it finds to the given checker.
     *
     * @param htmlChecker the checker to be called for each construct found
     * @param baseDir     the base directory that is passed to the log
     */
    public HtmlFileChecker(HtmlChecker htmlChecker, Path baseDir) {
        this.log = new Log();
        log.setBaseDirectory(baseDir);
        this.htmlChecker = htmlChecker;
    }

    /**
     * Reads each of the given files in turn.
     *
     * @param files the files to be read
     */
    @Override
    public void checkFiles(List<Path> files) {
        for (Path file : files) {
            read(file);
        }
    }

    /** Prints the log to the standard error stream. */
    @Override
    public void report() {
        System.err.println(log);
    }

    /**
     * Closes the underlying HTML checker. This method does not call {@link #report()}.
     *
     * @throws IOException if the underlying checker fails to close
     */
    @Override
    public void close() throws IOException {
        htmlChecker.close();
    }

    /**
     * Reads a single file, dispatching the constructs found to the HTML checker.
     * Any exception is logged against the file rather than propagated.
     */
    private void read(Path path) {
        // Reset all per-file state up front, so that neither a failure to open
        // the file nor an unclosed <script> in a previous file affects this one.
        this.path = path;
        lineNumber = 1;
        xml = false;
        inScript = false;

        try (BufferedReader r = new BufferedReader(
                new InputStreamReader(Files.newInputStream(path), decoder))) {
            this.in = r;
            StringBuilder content = new StringBuilder();

            startFile(path);
            try {
                nextChar();

                while (ch != -1) {
                    if (ch == '<') {
                        // Flush the pending text (possibly empty) before the markup.
                        content(content.toString());
                        content.setLength(0);
                        html();
                    } else {
                        content.append((char) ch);
                        if (ch == '\n') {
                            // Text is delivered to the checker a line at a time.
                            content(content.toString());
                            content.setLength(0);
                        }
                        nextChar();
                    }
                }
            } finally {
                endFile();
            }
        } catch (IOException e) {
            log.log(path, lineNumber, e);
        } catch (Throwable t) {
            log.log(path, lineNumber, t);
            log.log(String.valueOf(t));
        }
    }

    private void startFile(Path path) {
        htmlChecker.startFile(path);
    }

    private void endFile() {
        htmlChecker.endFile();
    }

    private void docType(String s) {
        htmlChecker.docType(lineNumber, s);
    }

    private void startElement(String name, Map<String, String> attrs, boolean selfClosing) {
        htmlChecker.startElement(lineNumber, name, attrs, selfClosing);
    }

    private void endElement(String name) {
        htmlChecker.endElement(lineNumber, name);
    }

    private void content(String s) {
        htmlChecker.content(lineNumber, s);
    }

    /** Reads the next character into {@code ch}, counting newlines as it goes. */
    private void nextChar() throws IOException {
        ch = in.read();
        if (ch == '\n') {
            lineNumber++;
        }
    }

    /**
     * Reads the start or end of an HTML tag, or an HTML comment
     * {@literal <identifier attrs> } or {@literal </identifier> }
     * or {@literal <!-- comment --> }.
     * A CDATA section, a DOCTYPE declaration and the processing instruction
     * {@literal <?xml?> } are also recognized.
     * On entry, the current character is the opening {@literal < }.
     * If the markup is not recognized, and the reader is not inside a script,
     * {@code "bad html"} is logged.
     *
     * @throws IOException if there is a problem reading the file
     */
    protected void html() throws IOException {
        nextChar();
        boolean recognized;
        if (isIdentifierStart((char) ch)) {
            recognized = startTag();
        } else if (ch == '/') {
            nextChar();
            recognized = endTag();
        } else if (ch == '!') {
            nextChar();
            recognized = declaration();
        } else if (ch == '?') {
            nextChar();
            recognized = xmlDeclaration();
        } else {
            recognized = false;
        }

        if (!recognized && !inScript) {
            log.log(path, lineNumber, "bad html");
        }
    }

    /** Reads the rest of a start tag; the current character starts the tag name. */
    private boolean startTag() throws IOException {
        String name = readIdentifier().toLowerCase(Locale.US);
        Map<String, String> attrs = htmlAttrs();
        if (attrs == null) {
            return false;
        }
        boolean selfClosing = false;
        if (ch == '/') {
            nextChar();
            selfClosing = true;
        }
        if (ch != '>') {
            return false;
        }
        nextChar();
        startElement(name, attrs, selfClosing);
        if (name.equals("script")) {
            inScript = true;
        }
        return true;
    }

    /** Reads the rest of an end tag; the current character follows the {@code /}. */
    private boolean endTag() throws IOException {
        if (!isIdentifierStart((char) ch)) {
            return false;
        }
        String name = readIdentifier().toLowerCase(Locale.US);
        skipWhitespace();
        if (ch != '>') {
            return false;
        }
        nextChar();
        endElement(name);
        if (name.equals("script")) {
            inScript = false;
        }
        return true;
    }

    /** Reads a comment, CDATA section or DOCTYPE; the current character follows the {@code !}. */
    private boolean declaration() throws IOException {
        if (ch == '-') {
            return expect("--") && skipComment();
        }
        if (ch == '[') {
            nextChar();
            return expect("CDATA[") && skipCData();
        }
        return docTypeDeclaration();
    }

    /** Skips to the end of a comment; the current character follows the opening dashes. */
    private boolean skipComment() throws IOException {
        while (ch != -1) {
            int dash = 0;
            while (ch == '-') {
                dash++;
                nextChar();
            }
            // Strictly speaking, a comment should not contain "--"
            // so dash > 2 is an error, dash == 2 implies ch == '>'
            // See http://www.w3.org/TR/html-markup/syntax.html#syntax-comments
            // for more details.
            if (dash >= 2 && ch == '>') {
                nextChar();
                return true;
            }
            nextChar();
        }
        return false;
    }

    /**
     * Skips to the end of a CDATA section; the current character follows the
     * opening {@code CDATA[}. Returns false if end of file is reached first.
     */
    private boolean skipCData() throws IOException {
        // Count consecutive ']' so that a run such as "]]]>" still terminates the section.
        int brackets = 0;
        while (ch != -1) {
            if (ch == ']') {
                brackets++;
            } else if (ch == '>' && brackets >= 2) {
                nextChar();
                return true;
            } else {
                brackets = 0;
            }
            nextChar();
        }
        return false;
    }

    /** Reads a DOCTYPE declaration; the current character follows the {@code !}. */
    private boolean docTypeDeclaration() throws IOException {
        StringBuilder sb = new StringBuilder();
        while (ch != -1 && ch != '>') {
            sb.append((char) ch);
            nextChar();
        }
        String s = sb.toString();
        if (!DOCTYPE_HTML.matcher(s).matches()) {
            return false;
        }
        xml = s.contains("XHTML");
        docType(s);
        // Consume the closing '>' after the callback, so that the reported line
        // number is unaffected and the '>' is not mistaken for text content.
        if (ch == '>') {
            nextChar();
        }
        return true;
    }

    /** Reads the rest of {@code <?xml?>}; the current character follows the first {@code ?}. */
    private boolean xmlDeclaration() throws IOException {
        if (!expect("xml?>")) {
            return false;
        }
        xml = true;
        return true;
    }

    /**
     * Consumes the given characters if they appear next in the input, starting
     * with the current character. Reading stops at the first character that
     * does not match, which is left as the current character.
     */
    private boolean expect(String s) throws IOException {
        for (int i = 0; i < s.length(); i++) {
            if (ch != s.charAt(i)) {
                return false;
            }
            nextChar();
        }
        return true;
    }

    /**
     * Read a series of HTML attributes, terminated by {@literal > }.
     * Each attribute is of the form {@literal identifier[=value] }.
     * "value" may be unquoted, single-quoted, or double-quoted.
     * A quoted value may span more than one line.
     *
     * @return the attributes, in the order in which they were read;
     *         an attribute with no value is mapped to {@code null}
     * @throws IOException if there is a problem reading the file
     */
    protected Map<String, String> htmlAttrs() throws IOException {
        Map<String, String> map = new LinkedHashMap<>();
        skipWhitespace();

        while (isIdentifierStart((char) ch)) {
            String name = readAttributeName().toLowerCase(Locale.US);
            skipWhitespace();
            String value = null;
            if (ch == '=') {
                nextChar();
                skipWhitespace();
                if (ch == '\'' || ch == '"') {
                    char quote = (char) ch;
                    nextChar();
                    StringBuilder sb = new StringBuilder();
                    while (ch != -1 && ch != quote) {
                        sb.append((char) ch);
                        nextChar();
                    }
                    value = sb.toString() // hack to replace common entities
                            .replace("&lt;", "<")
                            .replace("&gt;", ">")
                            // "&amp;" must be last, so that "&amp;lt;" yields "&lt;", not "<"
                            .replace("&amp;", "&");
                    nextChar();
                } else {
                    StringBuilder sb = new StringBuilder();
                    while (ch != -1 && !isUnquotedAttrValueTerminator((char) ch)) {
                        sb.append((char) ch);
                        nextChar();
                    }
                    value = sb.toString();
                }
                skipWhitespace();
            }
            map.put(name, value);
        }

        return map;
    }

    /**
     * Returns whether the character may start a tag name or attribute name.
     *
     * @param ch the character
     * @return true if the character is a Unicode identifier start character
     */
    protected boolean isIdentifierStart(char ch) {
        return Character.isUnicodeIdentifierStart(ch);
    }

    /**
     * Reads a tag name, beginning with the current character, which is
     * accepted unconditionally.
     *
     * @return the name
     * @throws IOException if there is a problem reading the file
     */
    protected String readIdentifier() throws IOException {
        StringBuilder sb = new StringBuilder();
        sb.append((char) ch);
        nextChar();
        while (ch != -1 && Character.isUnicodeIdentifierPart(ch)) {
            sb.append((char) ch);
            nextChar();
        }
        return sb.toString();
    }

    /**
     * Reads an attribute name, beginning with the current character, which is
     * accepted unconditionally. In addition to identifier characters, the name
     * may contain {@code '-'} and, in XML mode, {@code ':'}.
     *
     * @return the name
     * @throws IOException if there is a problem reading the file
     */
    protected String readAttributeName() throws IOException {
        StringBuilder sb = new StringBuilder();
        sb.append((char) ch);
        nextChar();
        while ((ch != -1 && Character.isUnicodeIdentifierPart(ch))
                || ch == '-'
                || (xml && ch == ':')) {
            sb.append((char) ch);
            nextChar();
        }
        return sb.toString();
    }

    /**
     * Returns whether the character is whitespace.
     *
     * @param ch the character
     * @return true if the character is whitespace according to {@link Character#isWhitespace(char)}
     */
    protected boolean isWhitespace(char ch) {
        return Character.isWhitespace(ch);
    }

    /**
     * Skips the current character and any that follow it while they are whitespace.
     *
     * @throws IOException if there is a problem reading the file
     */
    protected void skipWhitespace() throws IOException {
        while (isWhitespace((char) ch)) {
            nextChar();
        }
    }

    /**
     * Returns whether the character ends an unquoted attribute value.
     *
     * @param ch the character
     * @return true if the character terminates an unquoted attribute value
     */
    protected boolean isUnquotedAttrValueTerminator(char ch) {
        return switch (ch) {
            case '\f', '\n', '\r', '\t', ' ', '"', '\'', '`', '=', '<', '>' -> true;
            default -> false;
        };
    }

    /**
     * Not supported by this checker.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public boolean isOK() {
        throw new UnsupportedOperationException();
    }
}