#!/usr/bin/env python3
# SPDX-FileCopyrightText: 2023 Blender Authors
#
# SPDX-License-Identifier: GPL-2.0-or-later

"""
Script for checking source code spelling.

   python3 tools/check_source/check_spelling.py some_source_file.py

- Pass in a directory for it to be checked recursively.
- Pass in '--extract=STRINGS' to check strings instead of comments.
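- Pass in '--cache-file=spelling.cache' to re-use results from a previous run
  (the cache file name here is only an illustrative example).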

C/C++, CMake & Python sources are supported (see ``SOURCE_EXT``).
"""

__all__ = (
    "main",
)

import argparse
import os
import re
import sys

from enum import Enum

from collections.abc import (
    Callable,
    Iterator,
)


# Report: word, line, column.
Report = tuple[str, int, int]
# Cache: {filepath: (length, hash, reports)}.
CacheData = dict[str, tuple[int, bytes, list[Report]]]
# Map word to suggestions.
SuggestMap = dict[str, str]

ONLY_ONCE = True
USE_COLOR = True

# Ignore: `/*identifier*/` as these are used in C++ for unused arguments or to denote struct members.
# These identifiers can be ignored in most cases.
USE_SKIP_SINGLE_IDENTIFIER_COMMENTS = True

_words_visited = set()
_files_visited = set()

# Lowercase word -> suggestion list.
_suggest_map: SuggestMap = {}

VERBOSE_CACHE = False

if USE_COLOR:
    COLOR_WORD = "\033[92m"
    COLOR_ENDC = "\033[0m"
else:
    COLOR_WORD = ""
    COLOR_ENDC = ""

from check_spelling_config import (
    dict_custom,
    dict_ignore,
    dict_ignore_hyphenated_prefix,
    dict_ignore_hyphenated_suffix,
    files_ignore,
    directories_ignore,
)

SOURCE_EXT = (
    "c",
    "cc",
    "inl",
    "cpp",
    "cxx",
    "hpp",
    "hxx",
    "h",
    "hh",
    "m",
    "mm",
    "metal",
    "msl",
    "glsl",
    "osl",
    "py",
    "txt",  # For `CMakeLists.txt`.
    "cmake",
)


class TokenType(Enum):
    COMMENT = 0
    STRING = 1
    # NOTE: this must be distinct from `STRING`, otherwise `Enum` treats it as
    # an alias and the doc-string entries in `re_ignore_map` would collide.
    DOCSTRING = 2


class LangType(Enum):
    C = 0
    CMAKE = 1
    PYTHON = 2


LangTokenType = tuple[LangType, TokenType]


BASEDIR = os.path.abspath(os.path.dirname(__file__))
ROOTDIR = os.path.normpath(os.path.join(BASEDIR, "..", ".."))
ROOTDIR_WITH_SLASH = ROOTDIR + os.sep

# Ensure native slashes.
files_ignore = {
    os.path.normpath(os.path.join(ROOTDIR, f.replace("/", os.sep)))
    for f in files_ignore
}

directories_ignore = {
    os.path.normpath(os.path.join(ROOTDIR, f.replace("/", os.sep)))
    for f in directories_ignore
}

# -----------------------------------------------------------------------------
# Dictionary Utilities


def dictionary_create():  # type: ignore
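    """Create the `enchant` dictionary, extended with our custom words."""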
    import enchant  # type: ignore
    dict_spelling = enchant.Dict("en_US")

    # Don't add words from `dict_ignore` to the dictionary, as they would then be suggested.
    for w in dict_custom:
        # Also, don't use `add(w)`, as this would modify the user's personal dictionary.
        dict_spelling.add_to_session(w)
    return dict_spelling


def dictionary_check(w: str, code_words: set[str]) -> bool:
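    """
    Return True when ``w`` is correctly spelled or explicitly ignored,
    checking hyphenated compounds part by part.
    """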
    w_lower = w.lower()
    if w_lower in dict_ignore:
        return True

    is_correct: bool = _dict.check(w)
    # Split by hyphenation and check.
    if not is_correct:
        if "-" in w:
            is_correct = True

            # Allow: `un-word`, `re-word`.
            w_split = w.strip("-").split("-")
            if len(w_split) > 1:
                if w_split and w_split[0].lower() in dict_ignore_hyphenated_prefix:
                    del w_split[0]
                # Allow: `word-ish`, `word-ness`.
                if len(w_split) > 1:
                    if w_split and w_split[-1].lower() in dict_ignore_hyphenated_suffix:
                        del w_split[-1]

            for w_sub in w_split:
                if w_sub:
                    if w_sub in code_words:
                        continue
                    w_sub_lower = w_sub.lower()
                    if w_sub_lower in dict_ignore:
                        continue
                    if not _dict.check(w_sub):
                        is_correct = False
                        break
    return is_correct


def dictionary_suggest(w: str) -> list[str]:
    return _dict.suggest(w)  # type: ignore


_dict = dictionary_create()  # type: ignore


# -----------------------------------------------------------------------------
# General Utilities

def hash_of_file_and_len(fp: str) -> tuple[bytes, int]:
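    """Return the SHA-512 digest and the byte length of the file at ``fp``."""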
    import hashlib
    with open(fp, 'rb') as fh:
        data = fh.read()
    m = hashlib.sha512()
    m.update(data)
    return m.digest(), len(data)


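# Matches any run of ASCII letters, used to harvest identifier-like words from code.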
re_vars = re.compile("[A-Za-z]+")


def re_compile_from_sequence(ls: tuple[str, ...]) -> re.Pattern[str]:
    return re.compile(
        "({:s})".format("|".join(ls)), re.MULTILINE | re.DOTALL,
    )


# First remove these from comments, so we don't spell check example code, DOXYGEN commands, etc.
re_ignore_elems_generic_url_email_tags: tuple[str, ...] = (
    # URL.
    r'\b(https?|ftp)://\S+',
    # Email address: <me@email.com>
    # <someone@foo.bar-baz.com>
    r"<\w+@[\w\.\-]+>",

    # Convention for TODO/FIXME messages: TODO(my name) OR FIXME(name+name) OR XXX(some-name) OR NOTE(name/other-name):
    r"\b(TODO|FIXME|XXX|NOTE|WARNING|WORKAROUND)\(@?[\w\s\+\-/]+\)",
)

re_ignore_elems_generic_expressions: tuple[str, ...] = (
    # Words containing underscores: a_b
    r'\S*\w+_\S+',
    # Words containing arrows: a->b
    r'\S*\w+\->\S+',
    # Words containing dot notation: a.b (NOT ab... since this is used in English).
    r'\w+\.\w+\S*',
)

re_ignore_elems_generic_single_backtick: tuple[str, ...] = (
    # Single back-tick quotes (often used to reference code).
    # Allow white-space or any bracket prefix, e.g:
    # (`expr a+b`)
    r"[\s\(\[\{]\`[^\n`]+\`",
)

re_ignore_elems_generic_double_backtick: tuple[str, ...] = (
    # Double back-ticks are used in doc-strings for literals:
    # (``expr a+b``)
    r"[\s\(\[\{]\`\`[^\n`]+\`\`",
)

re_ignore_elems_lang_c_doxygen: tuple[str, ...] = (
    # DOXYGEN style: `<pre> ... </pre>`
    r"<pre>.+</pre>",
    # DOXYGEN style: `\code ... \endcode`
    r"\s+\\code\b.+\s\\endcode\b",
    # DOXYGEN style `#SOME_CODE`.
    r'#\S+',
    # DOXYGEN commands: `\param foo`
    r"\\(section|subsection|subsubsection|defgroup|ingroup|addtogroup|param|tparam|page|a|see)\s+\S+",
    # DOXYGEN commands without any arguments after them: \command
    r"\\(retval|todo|name)\b",
    # DOXYGEN 'param' syntax used rarely: `\param foo[in,out]`
    r"\\param\[[a-z,]+\]\S*",
)

re_ignore_map: dict[tuple[LangType, TokenType], re.Pattern[str]] = {
    (LangType.C, TokenType.COMMENT): re_compile_from_sequence((
        *re_ignore_elems_generic_url_email_tags,
        *re_ignore_elems_lang_c_doxygen,
        *re_ignore_elems_generic_expressions,
        *re_ignore_elems_generic_single_backtick,
    )),
    (LangType.C, TokenType.STRING): re_compile_from_sequence((
        *re_ignore_elems_generic_url_email_tags,
        *re_ignore_elems_generic_expressions,
        *re_ignore_elems_generic_single_backtick,
    )),

    (LangType.PYTHON, TokenType.COMMENT): re_compile_from_sequence((
        *re_ignore_elems_generic_url_email_tags,
        *re_ignore_elems_generic_expressions,
        *re_ignore_elems_generic_single_backtick,
    )),
    (LangType.PYTHON, TokenType.STRING): re_compile_from_sequence((
        *re_ignore_elems_generic_url_email_tags,
        *re_ignore_elems_generic_expressions,
        *re_ignore_elems_generic_single_backtick,
    )),
    # Only Python uses the doc-string type.
    (LangType.PYTHON, TokenType.DOCSTRING): re_compile_from_sequence((
        *re_ignore_elems_generic_url_email_tags,
        *re_ignore_elems_generic_expressions,
        *re_ignore_elems_generic_double_backtick,
    )),

    (LangType.CMAKE, TokenType.COMMENT): re_compile_from_sequence((
        *re_ignore_elems_generic_url_email_tags,
        *re_ignore_elems_generic_expressions,
        *re_ignore_elems_generic_single_backtick,
    )),
    (LangType.CMAKE, TokenType.STRING): re_compile_from_sequence((
        *re_ignore_elems_generic_url_email_tags,
        *re_ignore_elems_generic_expressions,
        *re_ignore_elems_generic_single_backtick,
    )),
}

del re_ignore_elems_generic_url_email_tags
del re_ignore_elems_generic_expressions
del re_ignore_elems_generic_single_backtick
del re_ignore_elems_generic_double_backtick
del re_ignore_elems_lang_c_doxygen


# Then extract words.
re_words = re.compile(
    r"\b("
    # Capital words, with optional '-' and "'".
    r"[A-Z]+[\-'A-Z]*[A-Z]|"
    # Lowercase words, with optional '-' and "'".
    r"[A-Za-z][\-'a-z]*[a-z]+"
    r")\b"
)

re_not_newline = re.compile("[^\n]")

if USE_SKIP_SINGLE_IDENTIFIER_COMMENTS:
    re_single_word_c_comments = re.compile(r"\/\*[\s]*[a-zA-Z_]+[a-zA-Z0-9_]*[\s]*\*\/")


def words_from_text(
        text: str,
        lang: LangType,
        type: TokenType,
        check_type: str,
) -> list[tuple[str, int]]:
    """
    Extract words to treat as English for spell checking.
    """
    # Replace non-newlines with white-space, so all alignment is kept.
    def replace_ignore(match: re.Match[str]) -> str:
        start, end = match.span()
        return re_not_newline.sub(" ", match.string[start:end])

    # Handy for checking what we ignore, in case we ignore too much and miss real errors.
    # for match in re_ignore.finditer(text):
    #     print(match.group(0))

    # Strip out URL's, code-blocks, etc.
    re_ignore = re_ignore_map[(lang, type)]

    text = re_ignore.sub(replace_ignore, text)

    words = []

    if check_type == 'SPELLING':
        for match in re_words.finditer(text):
            words.append((match.group(0), match.start()))

        def word_ok(w: str) -> bool:
            # Ignore all uppercase words.
            if w.isupper():
                return False
            return True
        words[:] = [w for w in words if word_ok(w[0])]

    elif check_type == 'DUPLICATES':
        w_prev = ""
        w_prev_start = 0
        for match in re_words.finditer(text):
            w = match.group(0)
            w_start = match.start()
            w_lower = w.lower()
            if w_lower == w_prev:
                text_ws = text[w_prev_start + len(w_prev): w_start]
                if text_ws == " ":
                    words.append((w_lower, w_start))
            w_prev = w_lower
            w_prev_start = w_start
    else:
        assert False, "unreachable"

    return words


class Comment:
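    """A comment or string extracted from a file, with enough context to locate reports."""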
    __slots__ = (
        "file",
        "text",
        "line",
        "lang",
        "type",
    )

    def __init__(self, file: str, text: str, line: int, lang: LangType, type: TokenType):
        self.file = file
        self.text = text
        self.line = line
        self.lang = lang
        self.type = type

    def parse(self, check_type: str) -> list[tuple[str, int]]:
        return words_from_text(self.text, self.lang, self.type, check_type=check_type)

    def line_and_column_from_comment_offset(self, pos: int) -> tuple[int, int]:
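        """Convert ``pos`` (an offset into this comment's text) into a (line, column) pair."""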
        text = self.text
        slineno = self.line + text.count("\n", 0, pos)
        # Allow for -1 to be not found.
        scol = text.rfind("\n", 0, pos) + 1
        if scol == 0:
            # Not found.
            scol = pos
        else:
            scol = pos - scol
        return slineno, scol


def extract_code_strings(filepath: str) -> tuple[list[Comment], set[str]]:
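    """Extract string literals (as ``Comment`` items) and identifier words from a source file."""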
    from pygments import lexers
    from pygments.token import Token

    comments = []
    code_words = set()

    # lex = lexers.find_lexer_class_for_filename(filepath)
    # if lex is None:
    #     return comments, code_words
    if filepath.endswith(".py"):
        lex = lexers.get_lexer_by_name("python")
        lang_type = LangType.PYTHON
    elif filepath.endswith((".cmake", ".txt")):
        lex = lexers.get_lexer_by_name("cmake")
        lang_type = LangType.CMAKE
    else:
        lex = lexers.get_lexer_by_name("c")
        lang_type = LangType.C

    slineno = 0
    with open(filepath, encoding='utf-8') as fh:
        source = fh.read()

    for ty, ttext in lex.get_tokens(source):
        if ty in {
                Token.Literal.String,
                Token.Literal.String.Double,
                Token.Literal.String.Single,
        }:
            comments.append(Comment(filepath, ttext, slineno, lang_type, TokenType.STRING))
        else:
            for match in re_vars.finditer(ttext):
                code_words.add(match.group(0))
        # Ugh - not nice or fast.
        slineno += ttext.count("\n")

    return comments, code_words


def extract_py_comments(filepath: str) -> tuple[list[Comment], set[str]]:
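    """Extract comments and doc-strings from a Python file using the ``tokenize`` module."""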
    import token
    import tokenize

    comments = []
    code_words = set()

    prev_toktype = token.INDENT

    with open(filepath, encoding='utf-8') as source:
        tokgen = tokenize.generate_tokens(source.readline)
        for toktype, ttext, (slineno, scol), (elineno, ecol), ltext in tokgen:
            if toktype == token.STRING:
                if prev_toktype == token.INDENT:
                    comments.append(Comment(filepath, ttext, slineno - 1, LangType.PYTHON, TokenType.DOCSTRING))
            elif toktype == tokenize.COMMENT:
                # Non-standard hint for commented code that we can ignore.
                if not ttext.startswith("#~"):
                    comments.append(Comment(filepath, ttext, slineno - 1, LangType.PYTHON, TokenType.COMMENT))
            else:
                for match in re_vars.finditer(ttext):
                    code_words.add(match.group(0))

            prev_toktype = toktype
    return comments, code_words


def extract_cmake_comments(filepath: str) -> tuple[list[Comment], set[str]]:
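    """Extract comments from a CMake file using the pygments lexer."""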
    from pygments import lexers
    from pygments.token import Token

    lex = lexers.get_lexer_by_name("cmake")

    with open(filepath, encoding='utf-8') as fh:
        source = fh.read()

    comments = []
    code_words = set()

    slineno = 0
    for ty, ttext in lex.get_tokens(source):
        if ty in {Token.Literal.String, Token.Literal.String.Double, Token.Literal.String.Single}:
            # Disabled because most CMake strings are references to paths/code.
            if False:
                comments.append(Comment(filepath, ttext, slineno, LangType.CMAKE, TokenType.STRING))
        elif ty in {Token.Comment, Token.Comment.Single}:
            comments.append(Comment(filepath, ttext, slineno, LangType.CMAKE, TokenType.COMMENT))
        else:
            for match in re_vars.finditer(ttext):
                code_words.add(match.group(0))
        # Ugh - not nice or fast.
        slineno += ttext.count("\n")

    return comments, code_words


def extract_c_comments(filepath: str) -> tuple[list[Comment], set[str]]:
    """
    Extracts comments like this:

    /*
     * This is a multi-line comment, notice the '*'s are aligned.
     */
    """
    with open(filepath, encoding='utf-8') as fh:
        text = fh.read()

    BEGIN = "/*"
    END = "*/"

    # Reverse these to find blocks we won't parse.
    PRINT_NON_ALIGNED = False
    PRINT_SPELLING = True

    comment_ranges = []

    if USE_SKIP_SINGLE_IDENTIFIER_COMMENTS:
        comment_ignore_offsets = set()
        for match in re_single_word_c_comments.finditer(text):
            comment_ignore_offsets.add(match.start(0))

    i = 0
    while i != -1:
        i = text.find(BEGIN, i)
        if i != -1:
            i_next = text.find(END, i)
            if i_next != -1:
                do_comment_add = True
                if USE_SKIP_SINGLE_IDENTIFIER_COMMENTS:
                    if i in comment_ignore_offsets:
                        do_comment_add = False

                # Not essential but seek back to find beginning of line.
                while i > 0 and text[i - 1] in {"\t", " "}:
                    i -= 1
                i_next += len(END)
                if do_comment_add:
                    comment_ranges.append((i, i_next))
                i = i_next
            else:
                # Unterminated comment block: stop, otherwise `find`
                # would return the same offset forever.
                break

    if PRINT_NON_ALIGNED:
        for i, i_next in comment_ranges:
            # Seek `i` back to the line start.
            i_bol = text.rfind("\n", 0, i) + 1
            l_ofs_first = i - i_bol
            star_offsets = set()
            block = text[i_bol:i_next]
            for line_index, l in enumerate(block.split("\n")):
                star_offsets.add(l.find("*", l_ofs_first))
                l_ofs_first = 0
                if len(star_offsets) > 1:
                    print("{:s}:{:d}".format(filepath, line_index + text.count("\n", 0, i)))
                    break

    if not PRINT_SPELLING:
        return [], set()

    # Collect variables from code, so we can reference variables from code blocks
    # without this generating noise from the spell checker.

    code_ranges = []
    if not comment_ranges:
        code_ranges.append((0, len(text)))
    else:
        for index in range(len(comment_ranges) + 1):
            if index == 0:
                i_prev = 0
            else:
                i_prev = comment_ranges[index - 1][1]

            if index == len(comment_ranges):
                i_next = len(text)
            else:
                i_next = comment_ranges[index][0]

            code_ranges.append((i_prev, i_next))

    code_words = set()

    for i, i_next in code_ranges:
        for match in re_vars.finditer(text[i:i_next]):
            w = match.group(0)
            code_words.add(w)
            # Allow plurals of these variables too.
            code_words.add(w + "'s")
            # Allow `th` suffix, mainly for indices, e.g. the `i'th` element.
            code_words.add(w + "'th")

    comments = []

    slineno = 0
    i_prev = 0
    for i, i_next in comment_ranges:
        block = text[i:i_next]
        # Add white-space in front of the block (for alignment test),
        # allow for -1 being not found, which results as zero.
        j = text.rfind("\n", 0, i) + 1
        block = (" " * (i - j)) + block

        slineno += text.count("\n", i_prev, i)
        comments.append(Comment(filepath, block, slineno, LangType.C, TokenType.COMMENT))
        i_prev = i

    return comments, code_words


def spell_check_report(filepath: str, check_type: str, report: Report) -> None:
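    """Print a single report: either a misspelling with suggestions or a duplicate word."""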
    w, slineno, scol = report

    if check_type == 'SPELLING':
        w_lower = w.lower()

        if ONLY_ONCE:
            if w_lower in _words_visited:
                return
            else:
                _words_visited.add(w_lower)

        suggest = _suggest_map.get(w_lower)
        if suggest is None:
            _suggest_map[w_lower] = suggest = " ".join(dictionary_suggest(w))

        print("{:s}:{:d}:{:d}: {:s}{:s}{:s}, suggest ({:s})".format(
            filepath,
            slineno + 1,
            scol + 1,
            COLOR_WORD,
            w,
            COLOR_ENDC,
            suggest,
        ))
    elif check_type == 'DUPLICATES':
        print("{:s}:{:d}:{:d}: {:s}{:s}{:s}, duplicate".format(
            filepath,
            slineno + 1,
            scol + 1,
            COLOR_WORD,
            w,
            COLOR_ENDC,
        ))


def spell_check_file(
        filepath: str,
        check_type: str,
        extract_type: str = 'COMMENTS',
) -> Iterator[Report]:
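    """Yield a report for each misspelled or duplicate word found in ``filepath``."""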
    if extract_type == 'COMMENTS':
        if filepath.endswith(".py"):
            comment_list, code_words = extract_py_comments(filepath)
        elif filepath.endswith((".cmake", ".txt")):
            comment_list, code_words = extract_cmake_comments(filepath)
        else:
            comment_list, code_words = extract_c_comments(filepath)
    elif extract_type == 'STRINGS':
        comment_list, code_words = extract_code_strings(filepath)

    if check_type == 'SPELLING':
        for comment in comment_list:
            words = comment.parse(check_type='SPELLING')
            for w, pos in words:
                w_lower = w.lower()
                if w_lower in dict_ignore:
                    continue

                is_good_spelling = dictionary_check(w, code_words)
                if not is_good_spelling:
                    # Ignore literals that show up in code,
                    # gets rid of a lot of noise from comments that reference variables.
                    if w in code_words:
                        # print("Skipping", w)
                        continue

                    slineno, scol = comment.line_and_column_from_comment_offset(pos)
                    yield (w, slineno, scol)
    elif check_type == 'DUPLICATES':
        for comment in comment_list:
            words = comment.parse(check_type='DUPLICATES')
            for w, pos in words:
                slineno, scol = comment.line_and_column_from_comment_offset(pos)
                # print(filepath + ":" + str(slineno + 1) + ":" + str(scol), w, "(duplicates)")
                yield (w, slineno, scol)
    else:
        assert False, "unreachable"


def spell_check_file_recursive(
        dirpath: str,
        check_type: str,
        regex_list: list[re.Pattern[str]],
        extract_type: str = 'COMMENTS',
        cache_data: CacheData | None = None,
) -> None:
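    """Run the requested check over every matching source file under ``dirpath``."""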
    from os.path import join

    def source_list(
            path: str,
            filename_check: Callable[[str], bool] | None = None,
    ) -> Iterator[str]:
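        """Yield file paths under ``path``, skipping ignored and hidden files & directories."""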
        for dirpath, dirnames, filenames in os.walk(path):
            # Only needed so this can be matched against ignore paths.
            dirpath = os.path.abspath(dirpath)
            if dirpath in directories_ignore:
                dirnames.clear()
                continue
            # Skip '.git' and other dot-directories.
            dirnames[:] = [d for d in dirnames if not d.startswith(".")]
            for filename in filenames:
                if filename.startswith("."):
                    continue
                filepath = join(dirpath, filename)
                if not (filename_check is None or filename_check(filepath)):
                    continue
                if filepath in files_ignore:
                    continue
                yield filepath

    def is_source(filename: str) -> bool:
        from os.path import splitext
        filename = filename.removeprefix(ROOTDIR_WITH_SLASH)
        for regex in regex_list:
            if regex.match(filename) is not None:
                ext = splitext(filename)[1].removeprefix(".")
                if ext not in SOURCE_EXT:
                    raise Exception("Unknown extension \".{:s}\" aborting!".format(ext))
                return True
        return False

    for filepath in source_list(dirpath, is_source):
        for report in spell_check_file_with_cache_support(
                filepath, check_type, extract_type=extract_type, cache_data=cache_data,
        ):
            spell_check_report(filepath, check_type, report)


# -----------------------------------------------------------------------------
# Cache File Support
#
# Cache is formatted as follows:
# (
#     # Store all misspelled words.
#     {filepath: (size, sha512, [reports, ...])},
#
#     # Store suggestions, as these are slow to re-calculate.
#     {lowercase_words: suggestions},
# )
#

def spell_cache_read(cache_filepath: str) -> tuple[CacheData, SuggestMap]:
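    """Load the pickled cache, returning empty containers when no cache file exists yet."""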
    import pickle
    cache_store: tuple[CacheData, SuggestMap] = {}, {}
    if os.path.exists(cache_filepath):
        with open(cache_filepath, 'rb') as fh:
            cache_store = pickle.load(fh)
    return cache_store


def spell_cache_write(cache_filepath: str, cache_store: tuple[CacheData, SuggestMap]) -> None:
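    """Pickle the cache to ``cache_filepath`` (the inverse of ``spell_cache_read``)."""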
    import pickle
    with open(cache_filepath, 'wb') as fh:
        pickle.dump(cache_store, fh)


def spell_check_file_with_cache_support(
        filepath: str,
        check_type: str,
        *,
        extract_type: str = 'COMMENTS',
        cache_data: CacheData | None = None,
) -> Iterator[Report]:
    """
    An iterator where each item is a report: ``(word, line_number, column_number)``.
    """
    _files_visited.add(filepath)

    if cache_data is None:
        yield from spell_check_file(filepath, check_type, extract_type=extract_type)
        return

    cache_data_for_file = cache_data.get(filepath)
    if cache_data_for_file and len(cache_data_for_file) != 3:
        cache_data_for_file = None

    cache_hash_test, cache_len_test = hash_of_file_and_len(filepath)
    if cache_data_for_file is not None:
        cache_len, cache_hash, cache_reports = cache_data_for_file
        if cache_len_test == cache_len:
            if cache_hash_test == cache_hash:
                if VERBOSE_CACHE:
                    print("Using cache for:", filepath)
                yield from cache_reports
                return

    cache_reports = []
    for report in spell_check_file(filepath, check_type, extract_type=extract_type):
        cache_reports.append(report)

    cache_data[filepath] = (cache_len_test, cache_hash_test, cache_reports)

    yield from cache_reports


# -----------------------------------------------------------------------------
# Extract Bad Spelling from a Source File


# -----------------------------------------------------------------------------
# Main & Argument Parsing

def argparse_create() -> argparse.ArgumentParser:
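    """Build the command line argument parser (see ``main`` for how it is used)."""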
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawTextHelpFormatter,
    )

    parser.add_argument(
        "--match",
        nargs='+',
        default=(
            r".*\.(" + "|".join(SOURCE_EXT) + ")$",
        ),
        required=False,
        metavar="REGEX",
        help="Match file paths against this expression",
    )

    parser.add_argument(
        '--extract',
        dest='extract',
        choices=('COMMENTS', 'STRINGS'),
        default='COMMENTS',
        required=False,
        metavar='WHAT',
        help=(
            'Text to extract for checking.\n'
            '\n'
            '- ``COMMENTS`` extracts comments from source code.\n'
            '- ``STRINGS`` extracts string literals.'
        ),
    )

    parser.add_argument(
        '--check',
        dest='check_type',
        choices=('SPELLING', 'DUPLICATES'),
        default='SPELLING',
        required=False,
        metavar='CHECK_TYPE',
        help=(
            'The check to perform.\n'
            '\n'
            '- ``SPELLING`` check spelling.\n'
            '- ``DUPLICATES`` report repeated words.'
        ),
    )

    parser.add_argument(
        "--cache-file",
        dest="cache_file",
        help=(
            "Optional cache, for fast re-execution, "
            "avoiding re-extracting spelling when files have not been modified."
        ),
        required=False,
    )

    parser.add_argument(
        "paths",
        nargs='+',
        help="Files or directories to walk recursively.",
    )

    return parser


def main() -> int:
    global _suggest_map

    args = argparse_create().parse_args()

    regex_list = []
    for expr in args.match:
        try:
            regex_list.append(re.compile(expr))
        except Exception as ex:
            print("Error in expression: {!r}\n {!r}".format(expr, ex))
            return 1

    extract_type = args.extract
    cache_filepath = args.cache_file
    check_type = args.check_type

    cache_data: CacheData | None = None
    if cache_filepath:
        cache_data, _suggest_map = spell_cache_read(cache_filepath)
    clear_stale_cache = True

    # print(extract_type)
    try:
        for filepath in args.paths:
            if os.path.isdir(filepath):
                # Recursive search.
                spell_check_file_recursive(
                    filepath,
                    check_type,
                    regex_list=regex_list,
                    extract_type=extract_type,
                    cache_data=cache_data,
                )
            else:
                # Single file.
                for report in spell_check_file_with_cache_support(
                        filepath,
                        check_type,
                        extract_type=extract_type,
                        cache_data=cache_data,
                ):
                    spell_check_report(filepath, check_type, report)
    except KeyboardInterrupt:
        clear_stale_cache = False

    if cache_filepath:
        assert cache_data is not None
        if VERBOSE_CACHE:
            print("Writing cache:", len(cache_data))

        if clear_stale_cache:
            # Don't keep suggestions for old misspellings.
            _suggest_map = {w_lower: _suggest_map[w_lower] for w_lower in _words_visited}

            for filepath in list(cache_data.keys()):
                if filepath not in _files_visited:
                    del cache_data[filepath]

        spell_cache_write(cache_filepath, (cache_data, _suggest_map))
    return 0


if __name__ == "__main__":
    sys.exit(main())