443 lines
14 KiB
Java

/*
* Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package doccheckutils.checkers;
import doccheckutils.HtmlChecker;
import doccheckutils.Log;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
/**
* Checks the links defined by and referenced in HTML files.
*/
public class LinkChecker implements HtmlChecker {
// Collects the diagnostics produced by this checker; also used to relativize paths for messages.
private final Log log;
// ID tables for local files, keyed by absolute, normalized path.
private final Map<Path, IDTable> allFiles;
// ID tables for absolute URIs, keyed by the URI with its fragment removed.
private final Map<URI, IDTable> allURIs;
// left for debugging: when true, href references are not followed, bad schemes are not
// reported, and duplicate ids are reported only for ids that have already been referenced
private final boolean checkInwardReferencesOnly = false;
// Number of files started via startFile.
private int files;
// Number of href references seen.
private int links;
// Number of ids declared more than once in a file.
private int duplicateIds;
// Number of missing files listed by report(); only updated while report() runs.
private int missingFiles;
// Number of references to ids that were not declared.
private int missingIds;
// Number of absolute URIs whose scheme was rejected by isSchemeOK.
private int badSchemes;
// Absolute, normalized path of the file currently being read.
private Path currFile;
// ID table of the file currently being read.
private IDTable currTable;
// Set by docType for the current file and reset by startFile; when true, the
// "name" attribute of <a> elements is not treated as an anchor.
private boolean html5;
/**
 * Creates a link checker with its own log and empty tables for files and URIs.
 */
public LinkChecker() {
    this.log = new Log();
    this.allFiles = new HashMap<>();
    this.allURIs = new HashMap<>();
}
/**
 * Sets the base directory on the log.
 * NOTE(review): presumably paths in messages are then shown relative to this
 * directory; confirm against Log.
 *
 * @param dir the base directory
 */
public void setBaseDir(Path dir) {
log.setBaseDirectory(dir);
}
/**
 * Begins a new file: makes it the current file, looks up (or creates) its ID table,
 * clears the HTML5 flag and counts the file.
 *
 * @param path the file about to be read
 */
@Override
public void startFile(Path path) {
    Path absolute = path.toAbsolutePath().normalize();
    currFile = absolute;
    // A table may already exist if an earlier file referred to this one.
    currTable = allFiles.computeIfAbsent(absolute, key -> new IDTable(log.relativize(key)));
    html5 = false;
    files += 1;
}
/**
 * Finishes the current file by checking its ID table, which reports references to
 * ids that were never declared and marks the table as checked.
 */
@Override
public void endFile() {
currTable.check();
}
//unused
/**
 * Returns the known files that have not been checked, whose path ends in
 * {@code .html}, and that exist on disk.
 *
 * @return an unmodifiable list of such files
 */
public List<Path> getUncheckedFiles() {
    List<Path> unchecked = new ArrayList<>();
    for (Map.Entry<Path, IDTable> entry : allFiles.entrySet()) {
        Path file = entry.getKey();
        if (entry.getValue().checked) {
            continue;
        }
        if (!file.toString().endsWith(".html")) {
            continue;
        }
        if (Files.exists(file)) {
            unchecked.add(file);
        }
    }
    return Collections.unmodifiableList(unchecked);
}
/**
 * Returns the known files that do not exist on disk.
 *
 * @return an unmodifiable list of the missing files
 */
public List<Path> getMissingFiles() {
    List<Path> missing = new ArrayList<>();
    for (Path file : allFiles.keySet()) {
        if (!Files.exists(file)) {
            missing.add(file);
        }
    }
    return Collections.unmodifiableList(missing);
}
/**
 * No-op: this checker does nothing with the XML declaration.
 */
@Override
public void xml(int line, Map<String, String> attrs) {
}
/**
 * Records whether the current file declares the HTML5 doctype. When it does,
 * the {@code name} attribute of {@code <a>} elements is no longer treated as
 * an anchor by {@code startElement}.
 *
 * NOTE(review): this assumes the parser passes the declaration including its
 * angle brackets; confirm against the caller.
 *
 * @param line    the line on which the declaration was found
 * @param doctype the text of the declaration
 */
@Override
public void docType(int line, String doctype) {
    // The HTML5 doctype is "<!DOCTYPE html>" (case-insensitive). The previous pattern
    // only accepted "<?doctype html>"; that spelling is still accepted so that anything
    // which matched before continues to match.
    html5 = doctype.matches("(?i)<[!?]doctype\\s+html>");
}
/**
 * Handles a start tag: records anchors declared by the element and follows its
 * {@code href}, if any.
 * <ul>
 * <li>{@code <a name=...>} declares an anchor, unless the file is HTML5;</li>
 * <li>{@code <a href=...>} and {@code <link href=...>} are recorded as references;</li>
 * <li>an {@code id} attribute on any element declares an anchor.</li>
 * </ul>
 *
 * @param line        the line of the tag
 * @param name        the element name
 * @param attrs       the attributes of the element
 * @param selfClosing whether the tag is self-closing (not used here)
 */
@Override
public void startElement(int line, String name, Map<String, String> attrs, boolean selfClosing) {
    boolean isAnchorElement = name.equals("a");
    if (isAnchorElement && !html5) {
        String legacyName = attrs.get("name");
        if (legacyName != null) {
            foundAnchor(line, legacyName);
        }
    }
    if (isAnchorElement || name.equals("link")) {
        String target = attrs.get("href");
        if (target != null && !checkInwardReferencesOnly) {
            foundReference(line, target);
        }
    }
    String id = attrs.get("id");
    if (id != null) {
        foundAnchor(line, id);
    }
}
/**
 * No-op: end tags carry no information this checker uses.
 */
@Override
public void endElement(int line, String name) {
}
/**
 * Delegates to the default implementation inherited from HtmlChecker; this
 * checker adds no handling of its own for text content.
 */
@Override
public void content(int line, String content) {
HtmlChecker.super.content(line, content);
}
/**
 * Logs a summary of the run: the missing files (each with the files that refer to
 * it), the number of files, references and referenced anchors, and the count of each
 * kind of problem that occurred. Finally, every error held by the log is printed to
 * {@code System.err}.
 */
@Override
public void report() {
    List<Path> pathList = getMissingFiles();
    log.log("");
    log.log("Link Checker Report");

    // reportMissingFile increments this once per file; start from zero so that
    // calling report() more than once does not inflate the total.
    missingFiles = 0;
    if (!pathList.isEmpty()) {
        log.log("");
        log.log("Missing files: (" + pathList.size() + ")");
        pathList.stream()
                .sorted()
                .forEach(this::reportMissingFile);
    }

    int anchors = countReferencedAnchors(allFiles.values())
            + countReferencedAnchors(allURIs.values());

    log.log("Checked " + files + " files.");
    log.log("Found " + links + " references to " + anchors + " anchors "
            + "in " + allFiles.size() + " files and " + allURIs.size() + " other URIs.");
    if (!pathList.isEmpty()) {
        log.log("%6d missing files", pathList.size());
    }
    if (duplicateIds > 0) {
        log.log("%6d duplicate ids", duplicateIds);
    }
    if (missingIds > 0) {
        log.log("%6d missing ids", missingIds);
    }
    if (badSchemes > 0) {
        log.log("%6d bad schemes", badSchemes);
    }

    for (String message : log.getErrors()) {
        System.err.println(message);
    }
}

/**
 * Counts the ids in the given tables that have at least one recorded reference.
 * Uses the null-safe accessor because an id that was only declared has no reference set.
 */
private static int countReferencedAnchors(Collection<IDTable> tables) {
    int count = 0;
    for (IDTable table : tables) {
        for (IDInfo info : table.map.values()) {
            if (!info.getReferences().isEmpty()) {
                count++;
            }
        }
    }
    return count;
}
/**
 * Logs one missing file followed by the files that refer to it, in sorted order.
 * At most ten referring files are listed; if there are more, a final line says how
 * many were left out. Also counts the file as missing.
 *
 * @param file a key of {@code allFiles} that does not exist on disk
 */
private void reportMissingFile(Path file) {
    final int maxRefs = 10;
    log.log(log.relativize(file).toString());
    IDTable table = allFiles.get(file);

    // Collect the distinct referring files; several ids may be referenced from the same file.
    Set<Path> refs = new TreeSet<>();
    for (IDInfo id : table.map.values()) {
        for (Position ref : id.getReferences()) {
            refs.add(ref.path);
        }
    }

    int shown = 0;
    for (Path ref : refs) {
        // Only announce a remainder when something was actually left out; with exactly
        // maxRefs referrers the loop simply ends instead of printing "and 0 more".
        if (shown == maxRefs) {
            log.log(" ... and %d more", refs.size() - shown);
            break;
        }
        log.log(" in " + log.relativize(ref));
        shown++;
    }
    missingFiles++;
}
/**
 * Returns whether no problems have been found: the log holds no errors and no
 * missing files have been counted. The missing-file count is only updated while
 * report() runs, so before that only the log is decisive.
 */
@Override
public boolean isOK() {
return log.noErrors() && (missingFiles == 0);
}
/**
 * Closes the checker. If the log holds errors, the report is written and the
 * run is failed with an exception; otherwise nothing happens.
 *
 * @throws RuntimeException if any error was logged
 */
@Override
public void close() {
    if (log.noErrors()) {
        return;
    }
    report();
    throw new RuntimeException("LinkChecker encountered errors; see log above.");
}
/**
 * Records {@code name} as an id declared at {@code line} in the current file's table.
 */
private void foundAnchor(int line, String name) {
currTable.addID(line, name);
}
/**
 * Handles the value of an {@code href} attribute found in the current file.
 * <p>
 * Absolute URIs are passed on to the URI overload. Relative references are resolved
 * against the directory of the current file; if the target file does not exist, that
 * is logged, otherwise a non-empty fragment is recorded as a reference to an id in
 * the target file. A reference that cannot be parsed as a URI, or whose path cannot
 * be converted to a file-system path, is logged as an error.
 *
 * @param line the line of the reference
 * @param ref  the raw attribute value
 */
private void foundReference(int line, String ref) {
    links++;
    try {
        String uriPath = ref;
        String fragment = null;
        // A URI cannot contain more than one '#', so everything after the first one is
        // taken as the fragment and any further '#' is escaped. With a single '#' the
        // replacement is a no-op.
        int hashIndex = ref.indexOf('#');
        if (hashIndex != -1) {
            uriPath = ref.substring(0, hashIndex);
            fragment = ref.substring(hashIndex + 1).replace("#", "%23");
        }
        URI uri = new URI(uriPath);
        if (fragment != null) {
            uri = new URI(uri + "#" + fragment);
        }

        if (uri.isAbsolute()) {
            foundReference(line, uri);
            return;
        }

        Path p;
        String resolvedUriPath = uri.getPath();
        if (resolvedUriPath == null || resolvedUriPath.isEmpty()) {
            // a bare "#fragment" refers to the current file
            p = currFile;
        } else {
            try {
                p = currFile.getParent().resolve(resolvedUriPath).normalize();
            } catch (java.nio.file.InvalidPathException e) {
                // The decoded URI path may contain characters the file system rejects
                // (for example a NUL from "%00"); report it rather than abort the run.
                log.log(currFile, line, "invalid path in URI: " + e);
                return;
            }
        }

        if (!Files.exists(p)) {
            log.log(currFile, line, "missing file reference: " + log.relativize(p));
            return;
        }
        if (fragment != null && !fragment.isEmpty()) {
            foundReference(line, p, fragment);
        }
    } catch (URISyntaxException e) {
        System.err.println("Failed to create URI: " + ref);
        log.log(currFile, line, "invalid URI: " + e);
    }
}
/**
 * Records a reference from the current file to the id {@code fragment} in the
 * local file {@code p}, creating the ID table for {@code p} on first use.
 */
private void foundReference(int line, Path p, String fragment) {
    IDTable target = allFiles.get(p);
    if (target == null) {
        target = new IDTable(log.relativize(p));
        allFiles.put(p, target);
    }
    target.addReference(fragment, currFile, line);
}
/**
 * Handles an absolute URI found in the current file. A URI with an unacceptable
 * scheme is logged and counted. If the URI has a non-empty fragment, the fragment is
 * recorded as a reference in the table kept for the URI without its fragment.
 *
 * @param line the line of the reference
 * @param uri  the absolute URI
 */
private void foundReference(int line, URI uri) {
    if (!isSchemeOK(uri.getScheme()) && !checkInwardReferencesOnly) {
        log.log(currFile, line, "bad scheme in URI");
        badSchemes++;
    }

    String fragment = uri.getRawFragment();
    if (fragment == null || fragment.isEmpty()) {
        return;
    }

    // A URI contains at most one '#', which starts the fragment, so everything
    // before it is the URI without its fragment.
    String text = uri.toString();
    int hashIndex = text.indexOf('#');
    String withoutFragment = hashIndex < 0 ? text : text.substring(0, hashIndex);
    try {
        URI noFrag = new URI(withoutFragment);
        IDTable t = allURIs.computeIfAbsent(noFrag, IDTable::new);
        t.addReference(fragment, currFile, line);
    } catch (URISyntaxException e) {
        // Not expected: a prefix of an already-parsed URI should itself be parseable.
        throw new AssertionError("cannot remove fragment from URI: " + uri, e);
    }
}
/**
 * Returns whether a URI scheme is acceptable in a link. A missing scheme is
 * acceptable; otherwise the scheme must be one of {@code ftp}, {@code http},
 * {@code https} or {@code javascript}.
 *
 * @param uriScheme the scheme, or {@code null} if the URI has none
 * @return {@code true} if the scheme is acceptable
 */
private boolean isSchemeOK(String uriScheme) {
    if (uriScheme == null) {
        return true;
    }
    // URI schemes are case-insensitive, so "HTTP" must be treated like "http".
    return switch (uriScheme.toLowerCase(Locale.ROOT)) {
        case "ftp", "http", "https", "javascript" -> true;
        default -> false;
    };
}
/**
 * A location in a file: a path and a line number. Positions are ordered by path and
 * then by line, consistently with {@code equals}.
 */
static class Position implements Comparable<Position> {
    // Final because positions are kept in sorted sets; changing a field after
    // insertion would corrupt the set's ordering.
    final Path path;
    final int line;

    Position(Path path, int line) {
        this.path = path;
        this.line = line;
    }

    @Override
    public int compareTo(Position o) {
        int byPath = path.compareTo(o.path);
        return byPath != 0 ? byPath : Integer.compare(line, o.line);
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        Position other = (Position) obj;
        return line == other.line && Objects.equals(path, other.path);
    }

    @Override
    public int hashCode() {
        return Objects.hashCode(path) * 37 + line;
    }

    @Override
    public String toString() {
        return path + ":" + line;
    }
}
/**
 * What is known about one id: whether it has been declared, and the places
 * that refer to it.
 */
static class IDInfo {
    // true once a declaration of the id has been seen
    boolean declared;
    // places that refer to the id; null until the first reference is recorded
    Set<Position> references;

    /**
     * Returns the recorded references, or an empty set if none have been recorded.
     */
    Set<Position> getReferences() {
        if (references != null) {
            return references;
        }
        return Collections.emptySet();
    }
}
/**
 * Orders host names by their dot-separated labels read from right to left, so that
 * hosts in the same domain sort next to each other.
 */
static class HostComparator implements Comparator<String> {
    @Override
    public int compare(String h1, String h2) {
        return reverseLabels(h1).compareTo(reverseLabels(h2));
    }

    /** Returns the host with the order of its dot-separated labels reversed. */
    private static String reverseLabels(String host) {
        String[] labels = host.split("\\.");
        StringBuilder reversed = new StringBuilder(host.length());
        for (int i = labels.length - 1; i >= 0; i--) {
            reversed.append(labels[i]);
            if (i > 0) {
                reversed.append('.');
            }
        }
        return reversed.toString();
    }
}
class IDTable {
private final Map<String, IDInfo> map = new HashMap<>();
private final String pathOrURI;
private boolean checked;
IDTable(Path path) {
this.pathOrURI = path.toString();
}
IDTable(URI uri) {
this.pathOrURI = uri.toString();
}
void addID(int line, String name) {
if (checked) {
throw new IllegalStateException("Adding ID after file has been checked");
}
Objects.requireNonNull(name);
IDInfo info = map.computeIfAbsent(name, _ -> new IDInfo());
if (info.declared) {
if (info.references != null || !checkInwardReferencesOnly) {
// don't report error if we're only checking inbound references
// and there are no references to this ID.
log.log(log.relativize(currFile), line, "name already declared: " + name);
duplicateIds++;
}
} else {
info.declared = true;
}
}
void addReference(String name, Path from, int line) {
if (checked) {
if (name != null) {
IDInfo id = map.get(name);
if (id == null || !id.declared) {
log.log(log.relativize(from), line,
"id not found: " + this.pathOrURI + "#" + name);
LinkChecker.this.missingIds++;
}
}
} else {
IDInfo id = map.computeIfAbsent(name, x -> new IDInfo());
if (id.references == null) {
id.references = new TreeSet<>();
}
id.references.add(new Position(from, line));
}
}
void check() {
map.forEach((name, id) -> {
if (name != null && !id.declared) {
for (Position ref : id.references) {
log.log(log.relativize(ref.path), ref.line,
"id not found: " + this.pathOrURI + "#" + name);
}
missingIds++;
}
});
checked = true;
}
}
}