From d0115672ddec7a6cf88059faa1ea8a9176b2bcde Mon Sep 17 00:00:00 2001 From: Steffen Beyer Date: Wed, 1 Apr 2026 23:35:50 +0200 Subject: [PATCH] First iteration --- .envrc | 12 + .gitignore | 13 + README.md | 215 +++++++ bin/gulie | 24 + devenv.lock | 123 ++++ devenv.nix | 48 ++ devenv.yaml | 15 + docs/INSPIRATION.md | 813 +++++++++++++++++++++++++ docs/PLAN.md | 470 ++++++++++++++ gulie/cli.scm | 105 ++++ gulie/compiler.scm | 92 +++ gulie/config.scm | 129 ++++ gulie/cst.scm | 95 +++ gulie/diagnostic.scm | 91 +++ gulie/engine.scm | 139 +++++ gulie/rule.scm | 61 ++ gulie/rules/comments.scm | 71 +++ gulie/rules/surface.scm | 93 +++ gulie/suppression.scm | 139 +++++ gulie/tokenizer.scm | 348 +++++++++++ test/fixtures/clean/well-formatted.scm | 13 + test/fixtures/violations/semantic.scm | 9 + test/fixtures/violations/surface.scm | 12 + test/run-tests.scm | 34 ++ test/test-compiler.scm | 56 ++ test/test-cst.scm | 59 ++ test/test-rules.scm | 104 ++++ test/test-suppression.scm | 43 ++ test/test-tokenizer.scm | 127 ++++ 29 files changed, 3553 insertions(+) create mode 100644 .envrc create mode 100644 .gitignore create mode 100644 README.md create mode 100755 bin/gulie create mode 100644 devenv.lock create mode 100644 devenv.nix create mode 100644 devenv.yaml create mode 100644 docs/INSPIRATION.md create mode 100644 docs/PLAN.md create mode 100644 gulie/cli.scm create mode 100644 gulie/compiler.scm create mode 100644 gulie/config.scm create mode 100644 gulie/cst.scm create mode 100644 gulie/diagnostic.scm create mode 100644 gulie/engine.scm create mode 100644 gulie/rule.scm create mode 100644 gulie/rules/comments.scm create mode 100644 gulie/rules/surface.scm create mode 100644 gulie/suppression.scm create mode 100644 gulie/tokenizer.scm create mode 100644 test/fixtures/clean/well-formatted.scm create mode 100644 test/fixtures/violations/semantic.scm create mode 100644 test/fixtures/violations/surface.scm create mode 100644 test/run-tests.scm create mode 100644 
test/test-compiler.scm create mode 100644 test/test-cst.scm create mode 100644 test/test-rules.scm create mode 100644 test/test-suppression.scm create mode 100644 test/test-tokenizer.scm diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..cc5c18b --- /dev/null +++ b/.envrc @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +export DIRENV_WARN_TIMEOUT=20s + +eval "$(devenv direnvrc)" + +# `use devenv` supports the same options as the `devenv shell` command. +# +# To silence all output, use `--quiet`. +# +# Example usage: use devenv --quiet --impure --option services.postgres.enable:bool true +use devenv diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..49cac37 --- /dev/null +++ b/.gitignore @@ -0,0 +1,13 @@ +# Devenv +.devenv* +devenv.local.nix +devenv.local.yaml + +# direnv +.direnv + +# pre-commit +.pre-commit-config.yaml + +# Temporary +/refs/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..8e72e07 --- /dev/null +++ b/README.md @@ -0,0 +1,215 @@ +# gulie + +A linter, static analyser, and formatter for [Guile Scheme](https://www.gnu.org/software/guile/). + +``` +$ gulie gulie/ +gulie/engine.scm:97:80: warning: line-length: line exceeds 80 characters (82) +gulie/tokenizer.scm:131:0: warning: trailing-whitespace: trailing whitespace +``` + +## Why + +No linter, formatter, or static analysis tool exists for Guile Scheme. gulie +fills that gap with a two-pass architecture that catches both surface-level +formatting issues and deep semantic problems. 
+ +## Features + +- **Surface rules** (no parsing needed): trailing whitespace, line length, tabs, + excessive blank lines, comment style conventions +- **Semantic rules** (via Guile's compiler): unused variables, unbound + variables, arity mismatches, format string errors, shadowed top-levels, + unused modules +- **Inline suppression**: `; gulie:suppress rule-name` on a line, or region + disable/enable blocks +- **Auto-fix mode**: `--fix` applies automatic corrections where available +- **Configuration**: `.gulie.sexp` in your project root, overridable via CLI +- **CI friendly**: exit code 0 for clean, 1 for findings + +## Requirements + +- [Guile](https://www.gnu.org/software/guile/) 3.0 or later + +## Installation + +Clone the repository and ensure `bin/gulie` is on your `PATH`, or run it +directly: + +```sh +bin/gulie --check . +``` + +## Usage + +``` +gulie [OPTIONS] [FILE|DIR...] + +Options: + -h, --help Show help message + -v, --version Print version + --check Check mode (default): report issues, exit non-zero on findings + --fix Fix mode: auto-fix what's possible, report the rest + --init Generate .gulie.sexp template in current directory + --pass PASS Run only: surface, semantic, all (default: all) + --config FILE Config file path (default: auto-discover .gulie.sexp) + --rule RULE Enable only this rule + --disable RULE Disable this rule + --severity SEV Minimum severity: error, warning, info + --output FORMAT Output format: standard (default), json, compact + --list-rules List all available rules +``` + +### Examples + +Check a single file: + +```sh +gulie mylib.scm +``` + +Check an entire project: + +```sh +gulie src/ +``` + +Auto-fix trailing whitespace and other fixable issues: + +```sh +gulie --fix src/ +``` + +Generate a config template: + +```sh +gulie --init +``` + +## Configuration + +gulie looks for `.gulie.sexp` in the current directory and parent directories. +Generate a template with `gulie --init`. + +```scheme +((line-length . 80) + (indent . 
2) + (max-blank-lines . 2) + (enable trailing-whitespace line-length no-tabs blank-lines + comment-semicolons unused-variable unbound-variable + arity-mismatch) + (disable) + (rules + (line-length (max . 100))) + (indent-rules + (define . 1) (let . 1) (lambda . 1) + (with-syntax . 1) (match . 1)) + (ignore "build/**" ".direnv/**")) +``` + +## Inline Suppression + +Suppress a rule on the current line: + +```scheme +(define x "messy") ; gulie:suppress trailing-whitespace +``` + +Suppress all rules on the next line: + +```scheme +;; gulie:suppress +(define intentionally-long-variable-name "value") +``` + +Region disable/enable: + +```scheme +;; gulie:disable line-length +(define long-line ...............................................) +(define another .................................................) +;; gulie:enable line-length +``` + +## Architecture + +gulie uses a two-pass design: + +``` + .gulie.sexp + | + file.scm --+--> [Tokenizer] --> tokens --> [CST parser] --> CST + | | + | [Pass 1: Surface] line rules + CST rules + | | + | diagnostics-1 + | + +--> [Guile compiler] --> Tree-IL --> CPS + | + [Pass 2: Semantic] Guile's built-in analyses + | + diagnostics-2 + | + [merge + suppress + sort + report] +``` + +**Pass 1** uses a hand-written tokenizer that preserves all whitespace, comments, +and exact source text. The critical invariant: +`(string-concatenate (map token-text (tokenize input)))` reproduces the input +exactly. This feeds a lightweight concrete syntax tree for formatting checks. + +**Pass 2** delegates to Guile's own compiler and analysis infrastructure: +`unused-variable-analysis`, `arity-analysis`, `format-analysis`, and others. +These are battle-tested and handle macroexpansion correctly. + +The two passes are independent because Guile's reader irrecoverably strips +comments and whitespace — there is no way to get formatting info and semantic +info from a single parse. 
+ +## Rules + +| Rule | Type | Category | Description | +|------|------|----------|-------------| +| `trailing-whitespace` | line | format | Trailing spaces or tabs | +| `line-length` | line | format | Line exceeds maximum width | +| `no-tabs` | line | format | Tab characters in source | +| `blank-lines` | line | format | Excessive consecutive blank lines | +| `comment-semicolons` | cst | style | Comment style conventions (`;`/`;;`/`;;;`) | +| `unused-variable` | semantic | correctness | Unused local variable | +| `unused-toplevel` | semantic | correctness | Unused top-level definition | +| `unused-module` | semantic | correctness | Unused module import | +| `unbound-variable` | semantic | correctness | Reference to undefined variable | +| `arity-mismatch` | semantic | correctness | Wrong number of arguments | +| `shadowed-toplevel` | semantic | correctness | Top-level binding shadows import | +| `format-string` | semantic | correctness | Format string validation | + +## Module Structure + +``` +gulie/ + cli.scm Command-line interface + config.scm Configuration loading and merging + diagnostic.scm Diagnostic record type and formatting + tokenizer.scm Hand-written lexer preserving all tokens + cst.scm Token stream to concrete syntax tree + compiler.scm Guile compiler wrapper for semantic analysis + rule.scm Rule record type and registry + engine.scm Orchestrator: file discovery, pass sequencing + suppression.scm Inline suppression parsing and filtering + rules/ + surface.scm Line-based formatting rules + comments.scm Comment style rules +``` + +## Testing + +```sh +guile --no-auto-compile -L . -s test/run-tests.scm +``` + +84 tests covering tokenizer roundtrip, CST parsing, surface rules, suppression, +and semantic analysis. 
+ +## Licence + +[TODO: add licence] diff --git a/bin/gulie b/bin/gulie new file mode 100755 index 0000000..2d079d6 --- /dev/null +++ b/bin/gulie @@ -0,0 +1,24 @@ +#!/usr/bin/env -S guile --no-auto-compile -e main -s +!# +;;; gulie — a linter and formatter for Guile Scheme + +;; Add project root to load path +(let ((dir (dirname (dirname (current-filename))))) + (set! %load-path (cons dir %load-path))) + +;; Load rule modules (registering rules as a side effect) +(use-modules (gulie rules surface)) + +;; Load optional modules if available +(false-if-exception (use-modules (gulie tokenizer))) +(false-if-exception (use-modules (gulie cst))) +(false-if-exception (use-modules (gulie rules comments))) +(false-if-exception (use-modules (gulie rules indentation))) +(false-if-exception (use-modules (gulie compiler))) +(false-if-exception (use-modules (gulie rules semantic))) + +;; Run +(use-modules (gulie cli)) + +(define (main args) + (exit ((@@ (gulie cli) main) args))) diff --git a/devenv.lock b/devenv.lock new file mode 100644 index 0000000..77b838b --- /dev/null +++ b/devenv.lock @@ -0,0 +1,123 @@ +{ + "nodes": { + "devenv": { + "locked": { + "dir": "src/modules", + "lastModified": 1775040883, + "owner": "cachix", + "repo": "devenv", + "rev": "c277ffa27759cd230089700da568864446528e80", + "type": "github" + }, + "original": { + "dir": "src/modules", + "owner": "cachix", + "repo": "devenv", + "type": "github" + } + }, + "flake-compat": { + "flake": false, + "locked": { + "lastModified": 1767039857, + "owner": "NixOS", + "repo": "flake-compat", + "rev": "5edf11c44bc78a0d334f6334cdaf7d60d732daab", + "type": "github" + }, + "original": { + "owner": "NixOS", + "repo": "flake-compat", + "type": "github" + } + }, + "git-hooks": { + "inputs": { + "flake-compat": "flake-compat", + "gitignore": "gitignore", + "nixpkgs": [ + "nixpkgs" + ] + }, + "locked": { + "lastModified": 1775036584, + "owner": "cachix", + "repo": "git-hooks.nix", + "rev": 
"4e0eb042b67d863b1b34b3f64d52ceb9cd926735", + "type": "github" + }, + "original": { + "owner": "cachix", + "repo": "git-hooks.nix", + "type": "github" + } + }, + "gitignore": { + "inputs": { + "nixpkgs": [ + "git-hooks", + "nixpkgs" + ] + }, + "locked": { + "lastModified": 1762808025, + "owner": "hercules-ci", + "repo": "gitignore.nix", + "rev": "cb5e3fdca1de58ccbc3ef53de65bd372b48f567c", + "type": "github" + }, + "original": { + "owner": "hercules-ci", + "repo": "gitignore.nix", + "type": "github" + } + }, + "nixpkgs": { + "inputs": { + "nixpkgs-src": "nixpkgs-src" + }, + "locked": { + "lastModified": 1774287239, + "owner": "cachix", + "repo": "devenv-nixpkgs", + "rev": "fa7125ea7f1ae5430010a6e071f68375a39bd24c", + "type": "github" + }, + "original": { + "owner": "cachix", + "ref": "rolling", + "repo": "devenv-nixpkgs", + "type": "github" + } + }, + "nixpkgs-src": { + "flake": false, + "locked": { + "lastModified": 1769922788, + "narHash": "sha256-H3AfG4ObMDTkTJYkd8cz1/RbY9LatN5Mk4UF48VuSXc=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "207d15f1a6603226e1e223dc79ac29c7846da32e", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixpkgs-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "devenv": "devenv", + "git-hooks": "git-hooks", + "nixpkgs": "nixpkgs", + "pre-commit-hooks": [ + "git-hooks" + ] + } + } + }, + "root": "root", + "version": 7 +} diff --git a/devenv.nix b/devenv.nix new file mode 100644 index 0000000..a5301bf --- /dev/null +++ b/devenv.nix @@ -0,0 +1,48 @@ +{ pkgs, lib, config, inputs, ... 
}: + +{ + # https://devenv.sh/basics/ + # env.GREET = "devenv"; + + # https://devenv.sh/packages/ + packages = with pkgs; [ + guile + ]; + + # https://devenv.sh/languages/ + # languages.rust.enable = true; + + # https://devenv.sh/processes/ + # processes.dev.exec = "${lib.getExe pkgs.watchexec} -n -- ls -la"; + + # https://devenv.sh/services/ + # services.postgres.enable = true; + + # https://devenv.sh/scripts/ + # scripts.hello.exec = '' + # echo hello from $GREET + # ''; + + # https://devenv.sh/basics/ + # enterShell = '' + # hello # Run scripts directly + # git --version # Use packages + # ''; + + # https://devenv.sh/tasks/ + # tasks = { + # "myproj:setup".exec = "mytool build"; + # "devenv:enterShell".after = [ "myproj:setup" ]; + # }; + + # https://devenv.sh/tests/ + # enterTest = '' + # echo "Running tests" + # git --version | grep --color=auto "${pkgs.git.version}" + # ''; + + # https://devenv.sh/git-hooks/ + # git-hooks.hooks.shellcheck.enable = true; + + # See full reference at https://devenv.sh/reference/options/ +} diff --git a/devenv.yaml b/devenv.yaml new file mode 100644 index 0000000..116a2ad --- /dev/null +++ b/devenv.yaml @@ -0,0 +1,15 @@ +# yaml-language-server: $schema=https://devenv.sh/devenv.schema.json +inputs: + nixpkgs: + url: github:cachix/devenv-nixpkgs/rolling + +# If you're using non-OSS software, you can set allowUnfree to true. +# allowUnfree: true + +# If you're willing to use a package that's vulnerable +# permittedInsecurePackages: +# - "openssl-1.1.1w" + +# If you have more than one devenv you can merge them +#imports: +# - ./backend diff --git a/docs/INSPIRATION.md b/docs/INSPIRATION.md new file mode 100644 index 0000000..463adcb --- /dev/null +++ b/docs/INSPIRATION.md @@ -0,0 +1,813 @@ +# Inspiration: Existing Lisp Linters, Formatters & Static Analysers + +Survey of reference tools in `./refs/` — what they do, how they work, and what +we can steal for a Guile linter/formatter. 
+ +--- + +## Table of Contents + +| Tool | Ecosystem | Type | Language | +|------|-----------|------|----------| +| [Eastwood](#eastwood) | Clojure | Linter (bug-finder) | Clojure/JVM | +| [fmt](#fmt) | Racket | Formatter | Racket | +| [Kibit](#kibit) | Clojure | Linter (idiom suggester) | Clojure | +| [Mallet](#mallet) | Common Lisp | Linter + formatter + fixer | Common Lisp | +| [OCICL Lint](#ocicl-lint) | Common Lisp | Linter + fixer | Common Lisp | +| [racket-review](#racket-review) | Racket | Linter | Racket | +| [SBLint](#sblint) | Common Lisp (SBCL) | Compiler-driven linter | Common Lisp | + +--- + +## Eastwood + +**Repo:** `refs/eastwood/` — Clojure linter (v1.4.3, by Jonas Enlund) + +### What it does + +A **bug-finding linter** for Clojure. Focuses on detecting actual errors +(wrong arity, undefined vars, misplaced docstrings) rather than enforcing style. +Achieves high accuracy by using the same compilation infrastructure as the +Clojure compiler itself. + +### How it works + +``` +File discovery (tools.namespace) + → Topological sort by :require/:use deps + → For each namespace: + Parse → Macroexpand → AST (tools.analyzer.jvm) → eval + → Run linter functions over AST nodes + → Filter warnings by config + → Report +``` + +Key: uses `tools.analyzer.jvm/analyze+eval` — it actually **compiles and +evaluates** source code to build an AST. This gives compiler-grade accuracy but +means it can only lint code that successfully loads. 
+ +### Architecture + +- **`lint.clj`** — Central coordinator: linter registry, namespace ordering, + main analysis loop +- **`analyze-ns.clj`** — AST generation via tools.analyzer +- **`passes.clj`** — Custom analysis passes (reflection validation, def-name + propagation) +- **`linters/*.clj`** — Individual linter implementations (~8 files) +- **`reporting-callbacks.clj`** — Output formatters (multimethod dispatch) +- **`util.clj`** — Config loading, AST walking, warning filtering + +### Rules (25+) + +| Category | Examples | +|----------|----------| +| Arity | `:wrong-arity` — function called with wrong arg count | +| Definitions | `:def-in-def`, `:redefd-vars`, `:misplaced-docstrings` | +| Unused | `:unused-private-vars`, `:unused-fn-args`, `:unused-locals`, `:unused-namespaces` | +| Suspicious | `:constant-test`, `:suspicious-expression`, `:suspicious-test` | +| Style | `:unlimited-use`, `:non-dynamic-earmuffs`, `:local-shadows-var` | +| Interop | `:reflection`, `:boxed-math`, `:performance` | +| Types | `:wrong-tag`, `:deprecations` | + +### Configuration + +Rules are suppressed via **Clojure code** (not YAML/JSON): + +```clojure +(disable-warning + {:linter :suspicious-expression + :for-macro 'clojure.core/let + :if-inside-macroexpansion-of #{'clojure.core/when-first} + :within-depth 6 + :reason "False positive from when-first expansion"}) +``` + +Builtin config files ship for `clojure.core`, contrib libs, and popular +third-party libraries. Users add their own via `:config-files` option. + +### What we can learn + +- **Macroexpansion-aware suppression** — Can distinguish user code from + macro-generated code; suppression rules can target specific macro expansions. + Critical for any Lisp linter. +- **Topological namespace ordering** — Analyse dependencies before dependents. + Relevant if we want cross-module analysis. +- **Linter registry pattern** — Each linter is a map `{:name :fn :enabled-by-default :url}`. + Simple, extensible. 
+- **Warning filtering pipeline** — Raw warnings → handle result → remove ignored + faults → remove excluded kinds → filter by config → final warnings. Clean + composable chain. +- **Metadata preservation through AST transforms** — Custom `postwalk` that + preserves metadata. Essential for accurate source locations. + +--- + +## fmt + +**Repo:** `refs/fmt/` — Racket code formatter (v0.0.3, by Sorawee Porncharoenwase) + +### What it does + +An **extensible code formatter** for Racket. Reads source, reformats according +to style conventions using **cost-based optimal layout selection**. Supports +custom formatting rules via pluggable formatter maps. + +### How it works + +Clean **4-stage pipeline**: + +``` +Source string + → [1] Tokenize (syntax-color/module-lexer) + → [2] Read/Parse → tree of node/atom/wrapper structs + → [3] Realign (fix sexp-comments, quotes) + → [4] Pretty-print (pretty-expressive library, cost-based) + → Formatted string +``` + +The pretty-printer uses the **Wadler/Leijen optimal layout algorithm** via the +`pretty-expressive` library. It evaluates multiple layout alternatives and +selects the one with the lowest cost vector. + +### Architecture + +- **`tokenize.rkt`** (72 lines) — Lexer wrapper around Racket's `syntax-color` +- **`read.rkt`** (135 lines) — Token stream → tree IR; preserves comments +- **`realign.rkt`** (75 lines) — Post-process sexp-comments and quote prefixes +- **`conventions.rkt`** (640 lines) — **All formatting rules** for 100+ Racket forms +- **`core.rkt`** (167 lines) — `define-pretty` DSL, AST-to-document conversion +- **`main.rkt`** (115 lines) — Public API, cost factory, entry point +- **`params.rkt`** (38 lines) — Configuration parameters (width, indent, etc.) 
+- **`raco.rkt`** (148 lines) — CLI interface (`raco fmt`) + +### Formatting rules (100+) + +Rules are organised by form type in `conventions.rkt`: + +| Category | Forms | +|----------|-------| +| Control flow | `if`, `when`, `unless`, `cond`, `case-lambda` | +| Definitions | `define`, `define-syntax`, `lambda`, `define/contract` | +| Bindings | `let`, `let*`, `letrec`, `parameterize`, `with-handlers` | +| Loops | `for`, `for/list`, `for/fold`, `for/hash` (15+ variants) | +| Modules | `module`, `begin`, `class`, `interface` | +| Macros | `syntax-rules`, `match`, `syntax-parse`, `syntax-case` | +| Imports | `require`, `provide` — vertically stacked | + +### Configuration + +**Pluggable formatter maps** — a function `(string? → procedure?)`: + +```racket +;; .fmt.rkt +(define (the-formatter-map s) + (case s + [("my-form") (format-uniform-body/helper 4)] + [else #f])) ; delegate to standard +``` + +Formatter maps compose via `compose-formatter-map` (chain of responsibility). + +**Runtime parameters:** + +| Parameter | Default | Purpose | +|-----------|---------|---------| +| `current-width` | 102 | Page width limit | +| `current-limit` | 120 | Computation width limit | +| `current-max-blank-lines` | 1 | Max consecutive blank lines | +| `current-indent` | 0 | Extra indentation | + +### Cost-based layout selection + +The pretty-printer evaluates layout alternatives using a **3-dimensional cost +vector** `[badness, height, characters]`: + +- **Badness** — Quadratic penalty for exceeding page width +- **Height** — Number of lines used +- **Characters** — Total character count (tiebreaker) + +This means the formatter provably selects the **optimal layout** within the +configured width, not just the first one that fits. + +### What we can learn + +- **Cost-based layout is the gold standard** for formatter quality. Worth + investing in an optimal pretty-printer (Wadler/Leijen family) rather than + ad-hoc heuristics. 
+- **Staged pipeline** (tokenize → parse → realign → pretty-print) is clean, + testable, and easy to reason about. Each stage has well-defined I/O. +- **Form-specific formatting rules** (`define-pretty` DSL) — each Scheme + special form gets a dedicated formatter. Extensible via user-provided maps. +- **Comment preservation as metadata** — Comments are attached to AST nodes, not + discarded. Essential for a practical formatter. +- **Pattern-based extraction** — `match/extract` identifies which elements can + stay inline vs. must be on separate lines. Smart structural analysis. +- **Memoisation via weak hash tables** — Performance optimisation for AST + traversal without memory leaks. +- **Config file convention** — `.fmt.rkt` in project root, auto-discovered. We + should do similar (`.gulie.scm` or similar). + +--- + +## Kibit + +**Repo:** `refs/kibit/` — Clojure idiom suggester (v0.1.11, by Jonas Enlund) + +### What it does + +A **static code analyser** that identifies non-idiomatic Clojure code and +suggests more idiomatic replacements. Example: `(if x y nil)` → `(when x y)`. +Supports auto-replacement via `--replace` flag. + +**Status:** Maintenance mode. Authors recommend **Splint** as successor +(faster, more extensible). + +### How it works + +``` +Source file + → Parse with edamame (side-effect-free reader) + → Extract S-expressions + → Tree walk (depth-first via clojure.walk/prewalk) + → Match each node against rules (core.logic unification) + → Simplify (iterative rewriting until fixpoint) + → Report or Replace (via rewrite-clj zippers) +``` + +The key insight: rules are expressed as **logic programming patterns** using +`clojure.core.logic`. Pattern variables (`?x`, `?y`) unify against arbitrary +subexpressions. + +### Architecture + +- **`core.clj`** (33 lines) — Core simplification logic (tiny!) 
+- **`check.clj`** (204 lines) — Public API for checking expressions/files +- **`check/reader.clj`** (189 lines) — Source parsing with alias tracking +- **`rules.clj`** (39 lines) — Rule aggregation and indexing +- **`rules/*.clj`** (~153 lines) — Rule definitions by category +- **`reporters.clj`** (59 lines) — Output formatters (text, markdown) +- **`replace.clj`** (134 lines) — Auto-replacement via rewrite-clj zippers +- **`driver.clj`** (144 lines) — CLI entry point, file discovery + +Total: ~1,105 lines. Remarkably compact. + +### Rules (~60) + +Rules are defined via the `defrules` macro: + +```clojure +(defrules rules + ;; Control structures + [(if ?x ?y nil) (when ?x ?y)] + [(if ?x nil ?y) (when-not ?x ?y)] + [(if (not ?x) ?y ?z) (if-not ?x ?y ?z)] + [(do ?x) ?x] + + ;; Arithmetic + [(+ ?x 1) (inc ?x)] + [(- ?x 1) (dec ?x)] + + ;; Collections + [(not (empty? ?x)) (seq ?x)] + [(into [] ?coll) (vec ?coll)] + + ;; Equality + [(= ?x nil) (nil? ?x)] + [(= 0 ?x) (zero? ?x)]) +``` + +Categories: **control structures**, **arithmetic**, **collections**, +**equality**, **miscellaneous** (string ops, Java interop, threading macros). + +### Auto-replacement + +Uses **rewrite-clj zippers** — functional tree navigation that preserves +whitespace, comments, and formatting when applying replacements. Navigate to the +target node, swap it, regenerate text. + +### What we can learn + +- **Logic programming for pattern matching** is beautifully expressive for + "suggest X instead of Y" rules. `core.logic` unification makes patterns + concise and bidirectional. We could use Guile's pattern matching or even a + miniKanren implementation. +- **Rule-as-data pattern** — Rules are just vectors `[pattern replacement]`. + Easy to add, easy to test, easy for users to contribute. +- **Iterative rewriting to fixpoint** — Apply rules until nothing changes. + Catches nested patterns that only become apparent after an inner rewrite. 
+- **Zipper-based source rewriting** — Preserves formatting/comments when + applying fixes. Critical for auto-fix functionality. +- **Side-effect-free parsing** — Using edamame instead of `clojure.core/read` + avoids executing reader macros. Important for security and for analysing code + with unknown dependencies. +- **Guard-based filtering** — Composable predicates that decide whether to + report a suggestion. Users can plug in custom guards. +- **Two resolution modes** — `:toplevel` (entire defn) vs `:subform` (individual + expressions). Different granularity for different use cases. + +--- + +## Mallet + +**Repo:** `refs/mallet/` — Common Lisp linter + formatter + fixer (~15,800 LOC) + +### What it does + +A **production-grade linter** for Common Lisp with 40+ rules across 7 +categories, auto-fixing, a powerful configuration system (presets with +inheritance), and multiple suppression mechanisms. Targets SBCL. + +### How it works + +**Three-phase pipeline:** + +``` +File content + → [1] Tokenize (hand-written tokenizer, preserves all tokens incl. comments) + → [2] Parse (Eclector reader with parse-result protocol → forms with precise positions) + → [3] Rule checking (text rules, token rules, form rules) + → Suppression filtering + → Auto-fix & formatting + → Report +``` + +**Critical design decision:** Symbols are stored as **strings**, not interned. +This means the parser never needs to resolve packages — safe to analyse code +with unknown dependencies. 
+ +### Architecture + +| Module | Lines | Purpose | +|--------|-------|---------| +| `main.lisp` | ~600 | CLI parsing, entry point | +| `engine.lisp` | ~900 | Linting orchestration, suppression filtering | +| `config.lisp` | ~1,200 | Config files, presets, path-specific overrides | +| `parser/reader.lisp` | ~800 | Eclector integration, position tracking | +| `parser/tokenizer.lisp` | ~200 | Hand-written tokenizer | +| `suppression.lisp` | ~600 | Suppression state management | +| `formatter.lisp` | ~400 | Output formatters (text, JSON, line) | +| `fixer.lisp` | ~300 | Auto-fix application | +| `rules/` | ~5,500 | 40+ individual rule implementations | + +### Rules (40+) + +| Category | Rules | Examples | +|----------|-------|----------| +| Correctness | 2 | `ecase` with `otherwise`, missing `otherwise` | +| Suspicious | 5 | Runtime `eval`, symbol interning, `ignore-errors` | +| Practice | 6 | Avoid `:use` in `defpackage`, one package per file | +| Cleanliness | 4 | Unused variables, unused loop vars, unused imports | +| Style | 5 | `when`/`unless` vs `if` without else, needless `let*` | +| Format | 6 | Line length, trailing whitespace, tabs, blank lines | +| Metrics | 3 | Function length, cyclomatic complexity, comment ratio | +| ASDF | 8 | Component strings, redundant prefixes, secondary systems | +| Naming | 4 | `*special*` and `+constant+` conventions | +| Documentation | 4 | Missing docstrings (functions, packages, variables) | + +Rules are **classes** inheriting from a base `rule` class with generic methods: + +```lisp +(defclass if-without-else-rule (base:rule) + () + (:default-initargs + :name :missing-else + :severity :warning + :category :style + :type :form)) + +(defmethod base:check-form ((rule if-without-else-rule) form file) + ...) 
+``` + +### Configuration system + +**Layered presets with inheritance:** + +```lisp +(:mallet-config + (:extends :strict) + (:ignore "**/vendor/**") + (:enable :cyclomatic-complexity :max 15) + (:disable :function-length) + (:set-severity :metrics :info) + (:for-paths ("tests") + (:enable :line-length :max 120) + (:disable :unused-variables))) +``` + +Built-in presets: `:default`, `:strict`, `:all`, `:none`. + +Precedence: CLI flags > config file > preset inheritance > built-in defaults. + +### Suppression mechanisms (3 levels) + +1. **Declarations** — `#+mallet (declaim (mallet:suppress-next :rule-name))` +2. **Inline comments** — `; mallet:suppress rule-name` +3. **Region-based** — `; mallet:disable rule-name` / `; mallet:enable rule-name` +4. **Stale suppression detection** — Warns when suppressions don't match any violation + +### Auto-fix + +Fixes are collected, sorted bottom-to-top (to preserve line numbers), and +applied in a single pass. Fix types: `:replace-line`, `:delete-range`, +`:delete-lines`, `:replace-form`. + +### What we can learn + +- **Symbols as strings** is a crucial insight for Lisp linters. Avoids + package/module resolution entirely. We should do the same for Guile — parse + symbols without interning them. +- **Eclector-style parse-result protocol** — Every sub-expression gets precise + line/column info. Invest in this early; it's the foundation of accurate error + reporting. +- **Three rule types** (text, token, form) — Clean separation. Text rules don't + need parsing, token rules don't need a full AST, form rules get the full tree. + Efficient and composable. +- **Preset inheritance with path-specific overrides** — Powerful configuration + that scales from solo projects to monorepos. `:for-paths` is particularly + useful (different rules for `src/` vs `tests/`). +- **Multiple suppression mechanisms** — Comment-based, declaration-based, + region-based. Users need all three for real-world use. 
+- **Stale suppression detection** — Prevents suppression comments from + accumulating after the underlying issue is fixed. Brilliant. +- **Rule metaclass pattern** — Base class + generic methods scales cleanly to + 40+ rules. Each rule is self-contained with its own severity, category, and + check method. +- **Bottom-to-top fix application** — Simple trick that avoids line number + invalidation when applying multiple fixes to the same file. + +--- + +## OCICL Lint + +**Repo:** `refs/ocicl/` — Common Lisp linter (part of the OCICL package manager) + +### What it does + +A **129-rule linter with auto-fix** for Common Lisp, integrated into the OCICL +package manager as a subcommand (`ocicl lint`). Supports dry-run mode, +per-line suppression, and `.ocicl-lint.conf` configuration. + +### How it works + +**Three-pass analysis:** + +``` +File content + → [Pass 1] Line-based rules (text-level: whitespace, tabs, line length) + → [Pass 2] AST-based rules (via rewrite-cl zippers: naming, bindings, packages) + → [Pass 3] Single-pass visitor rules (pattern matching: 50+ checks in one traversal) + → Suppression filtering (per-line ; lint:suppress comments) + → Auto-fix (via fixer registry) + → Report +``` + +### Architecture + +``` +lint/ +├── linter.lisp — Main orchestrator, issue aggregation, output formatting +├── config.lisp — .ocicl-lint.conf parsing +├── parsing.lisp — rewrite-cl wrapper (zipper API) +├── fixer.lisp — Auto-fix infrastructure with RCS/backup support +├── main.lisp — CLI entry point +├── rules/ +│ ├── line-based.lisp — Text-level rules (9 rules) +│ ├── ast.lisp — AST-based rules (naming, lambda lists, bindings) +│ └── single-pass.lisp — Pattern matching rules (50+ in one walk) +└── fixes/ + ├── whitespace.lisp — Formatting fixes + └── style.lisp — Style rule fixes +``` + +### Rules (129) + +| Category | Count | Examples | +|----------|-------|---------| +| Formatting | 9 | Trailing whitespace, tabs, line length, blank lines | +| File structure | 3 | SPDX 
headers, package declarations, reader errors | +| Naming | 6 | Underscores, `*special*` style, `+constant+` style, vague names | +| Boolean/conditionals | 18 | `(IF test T NIL)` → `test`, `(WHEN (NOT x) ...)` → `(UNLESS x ...)` | +| Logic simplification | 12 | Flatten nested `AND`/`OR`, redundant conditions | +| Arithmetic | 4 | `(+ x 1)` → `(1+ x)`, `(= x 0)` → `(zerop x)` | +| List operations | 13 | `FIRST`/`REST` vs `CAR`/`CDR`, `(cons x nil)` → `(list x)` | +| Comparison | 5 | `EQL` vs `EQ`, string equality, membership testing | +| Sequence operations | 6 | `-IF-NOT` variants, `ASSOC` patterns | +| Advanced/safety | 26 | Library suggestions, destructive ops on constants | + +### Configuration + +INI-style `.ocicl-lint.conf`: + +```ini +max-line-length = 180 +suppress-rules = rule1, rule2, rule3 +suggest-libraries = alexandria, uiop, serapeum +``` + +Per-line suppression: + +```lisp +(some-code) ; lint:suppress rule-name1 rule-name2 +(other-code) ; lint:suppress ;; suppress ALL rules on this line +``` + +### Fixer registry + +```lisp +(register-fixer "rule-name" #'fixer-function) +``` + +Fixers are decoupled from rule detection. Each fixer takes `(content issue)` and +returns modified content or NIL. Supports RCS backup before modification. + +### What we can learn + +- **Single-pass visitor for pattern rules** — 50+ pattern checks in one tree + traversal. Much faster than running each rule separately. Good model for + performance-sensitive linting. +- **Quote awareness** — Detects quoted contexts (`'x`, `quote`, backtick) to + avoid false positives inside macro templates. We'll need the same for Guile. +- **Fixer registry pattern** — Decouples detection from fixing. Easy to add + auto-fix for a rule without touching the rule itself. +- **Library suggestion rules** — "You could use `(alexandria:when-let ...)` + instead of this pattern." Interesting category that could work for Guile + (SRFI suggestions, etc.). 
+- **Three-pass architecture** — Line-based first (fastest, no parsing needed), + then AST, then pattern matching. Each pass adds cost; skip what you don't need. + +--- + +## racket-review + +**Repo:** `refs/racket-review/` — Racket linter (v0.2, by Bogdan Popa) + +### What it does + +A **surface-level linter** for Racket modules. Intentionally does NOT expand +macros — analyses syntax only, optimised for **speed**. Designed for tight +editor integration (ships with Flycheck for Emacs). + +### How it works + +``` +File → read-syntax (Racket's built-in reader) + → Validate as module form (#lang) + → Walk syntax tree via syntax-parse + → Track scopes, bindings, provides, usages + → Report problems +``` + +The entire rule system is built on Racket's `syntax/parse` — pattern matching +on syntax objects with guard conditions and side effects. + +### Architecture + +Remarkably compact: + +| File | Lines | Purpose | +|------|-------|---------| +| `lint.rkt` | 1,130 | **All linting rules** + semantic tracking | +| `problem.rkt` | 26 | Problem data structure | +| `cli.rkt` | 25 | CLI interface | +| `ext.rkt` | 59 | Extension mechanism | + +### Semantic tracking + +Maintains multiple **parameter-based state machines**: + +- **Scope stack** — Hierarchical scope with parent links, binding hash at each level +- **Binding info** — Per-identifier: syntax object, usage count, check flag, + related identifiers +- **Provide tracking** — What's explicitly `provide`d vs `all-defined-out` +- **Punted bindings** — Forward references resolved when definition is encountered +- **Savepoints** — Save/restore state for tentative matching in complex patterns + +### Rules + +**Errors (23 patterns):** +- Identifier already defined in same scope +- `if` missing else branch +- `let`/`for` missing body +- `case` clauses not quoted literals +- Wrong match fallthrough pattern (`_` not `else`) +- Provided but not defined + +**Warnings (17+ patterns):** +- Identifier never used +- Brackets: `let` 
bindings should use `[]`, not `()` +- Requires not sorted (for-syntax first, then alphabetical) +- Cond without else clause +- Nested if (flatten to cond) +- `racket/contract` → use `racket/contract/base` + +### Suppression + +```racket +#|review: ignore|# ;; Ignore entire file +;; noqa ;; Ignore this line +;; review: ignore ;; Ignore this line +``` + +### Extension mechanism + +Plugins register via Racket's package system: + +```racket +(define review-exts + '((module-path predicate-proc lint-proc))) +``` + +Extensions receive a `current-reviewer` parameter with API: +`recur`, `track-error!`, `track-warning!`, `track-binding!`, `push-scope!`, +`pop-scope!`, `save!`, `undo!`. + +### What we can learn + +- **Surface-level analysis is fast and useful** — No macro expansion means + instant feedback. Catches the majority of real mistakes. Good default for + editor integration; deeper analysis can be opt-in. +- **syntax-parse as rule DSL** — Pattern matching on syntax objects is a natural + fit for Lisp linters. Guile has `syntax-case` and `match` which serve a + similar role. +- **Scope tracking with punted bindings** — Handles forward references in a + single pass. Elegant solution for `letrec`-style bindings and mutual recursion. +- **Savepoints for tentative matching** — Save/restore state when the parser + enters a complex branch. If the branch fails, roll back. Useful for `cond`, + `match`, etc. +- **Plugin API via reviewer parameter** — Extensions get a well-defined API + surface. Clean contract between core and plugins. +- **Snapshot-based testing** — 134 test files with `.rkt`/`.rkt.out` pairs. + Lint a file, compare output to expected. Simple, maintainable, high coverage. +- **Bracket style enforcement** — Racket uses `[]` for bindings, `()` for + application. Guile doesn't have this, but we could enforce consistent bracket + usage or other parenthesis conventions. 
+ +--- + +## SBLint + +**Repo:** `refs/sblint/` — SBCL compiler-driven linter (~650 LOC) + +### What it does + +A **compiler-assisted linter** for Common Lisp. Doesn't implement its own rules — +instead, it **compiles code through SBCL** and surfaces all compiler diagnostics +(errors, warnings, style notes) with proper file locations. + +### How it works + +``` +Source code + → Resolve ASDF dependencies (topological sort) + → Load dependencies via Quicklisp + → Compile project via SBCL (handler-bind captures conditions) + → Extract file/position from compiler internals (Swank protocol) + → Convert byte offset → line:column + → Deduplicate and report +``` + +No custom parser. No AST. Just the compiler. + +### Architecture + +| File | Lines | Purpose | +|------|-------|---------| +| `run-lint.lisp` | 277 | Core logic: lint file/system/directory | +| `compiler-aux.lisp` | 33 | SBCL introspection bridge | +| `asdf.lisp` | 153 | Dependency resolution graph | +| `file-position.lisp` | 18 | Byte offset → line:column conversion | +| `quicklisp.lisp` | 41 | Auto-install missing dependencies | +| `sblint.ros` | — | CLI entry point (Roswell script) | + +### What it catches + +Whatever SBCL catches: +- Undefined variables and functions +- Type mismatches (with SBCL's type inference) +- Style warnings (ANSI compliance, naming) +- Reader/syntax errors +- Dead code paths +- Unused declarations + +Filters out: redefinition warnings, Quicklisp dependency warnings, SBCL +contrib warnings. + +### What we can learn + +- **Leverage the host compiler** — Guile itself has `compile` and can produce + warnings. We should capture Guile's own compiler diagnostics (undefined + variables, unused imports, etc.) as a baseline — it's "free" accuracy. +- **Condition-based error collection** — CL's condition system (≈ Guile's + exception/handler system) lets you catch errors without stopping compilation. + `handler-bind` continues execution after catching. 
Guile's `with-exception-handler` + can do the same. +- **Dependency-aware compilation** — Load dependencies first, then compile + project. Catches "symbol not found" errors that surface-level analysis misses. +- **Deduplication** — Multiple compilation passes can report the same issue. + Hash table dedup is simple and effective. +- **Minimal is viable** — 650 LOC total. A compiler-driven linter layer could + be our first deliverable, augmented with custom rules later. + +--- + +## Cross-cutting themes + +### Parsing strategies + +| Strategy | Used by | Pros | Cons | +|----------|---------|------|------| +| Host compiler | SBLint, Eastwood | Maximum accuracy, type checking | Requires loading code, slow | +| Custom reader with positions | Mallet, fmt | Full control, no side effects | Must maintain parser | +| Language's built-in reader | racket-review | Free, well-tested | May lack position info | +| Side-effect-free reader lib | Kibit (edamame) | Safe, preserves metadata | External dependency | +| Zipper-based AST | OCICL (rewrite-cl) | Preserves formatting for fixes | Complex API | + +**For Guile:** We should explore whether `(ice-9 read)` or Guile's reader +provides sufficient source location info. If not, a custom reader (or a reader +wrapper that annotates with positions) is needed. Guile's `read-syntax` (if +available) or source properties on read forms could be the answer. 
+ +### Rule definition patterns + +| Pattern | Used by | Character | +|---------|---------|-----------| +| Logic programming (unification) | Kibit | Elegant, concise; slow | +| OOP classes + generic methods | Mallet | Scales well, self-contained rules | +| Registry maps | Eastwood | Simple, data-driven | +| Syntax-parse patterns | racket-review, fmt | Natural for Lisps | +| Single-pass visitor | OCICL | High performance | +| Compiler conditions | SBLint | Zero-effort, limited scope | + +**For Guile:** A combination seems right — `match`/`syntax-case` patterns for +the rule DSL (natural in Scheme), with a registry for rule metadata (name, +severity, category, enabled-by-default). + +### Configuration patterns + +| Feature | Mallet | OCICL | Eastwood | Kibit | racket-review | fmt | +|---------|--------|-------|----------|-------|---------------|-----| +| Config file | `.mallet.lisp` | `.ocicl-lint.conf` | Clojure maps | `project.clj` | - | `.fmt.rkt` | +| Presets | Yes (4) | - | - | - | - | - | +| Preset inheritance | Yes | - | - | - | - | - | +| Path-specific rules | Yes | - | - | - | - | - | +| Inline suppression | Yes (3 mechanisms) | Yes | Yes | - | Yes | - | +| Stale suppression detection | Yes | - | - | - | - | - | +| CLI override | Yes | Yes | Yes | Yes | - | Yes | + +**For Guile:** Mallet's configuration system is the most sophisticated and +worth emulating — presets, inheritance, path-specific overrides, and stale +suppression detection. + +### Auto-fix patterns + +| Tool | Fix mechanism | Preserves formatting? | +|------|--------------|----------------------| +| Kibit | rewrite-clj zippers | Yes | +| Mallet | Bottom-to-top line replacement | Partial | +| OCICL | Fixer registry + zipper AST | Yes | + +**For Guile:** Zipper-based AST manipulation (or Guile's SXML tools) for +formatting-preserving fixes. The fixer registry pattern (OCICL) keeps rule +detection and fixing decoupled. 
+ +### Output formats + +All tools support at minimum: `file:line:column: severity: message` + +Additional formats: JSON (Mallet), Markdown (Kibit), line-only for CI (Mallet). + +--- + +## Feature wishlist for gulie + +Based on this survey, the features worth cherry-picking: + +### Must-have (core) + +1. **Guile compiler diagnostics** — Capture Guile's own warnings as baseline (SBLint approach) +2. **Custom reader with source positions** — Every form, subform, and token gets line:column +3. **Staged pipeline** — Text rules → token rules → form rules (Mallet/OCICL) +4. **Pattern-based rule DSL** — Using Guile's `match` or `syntax-case` (Kibit/racket-review inspiration) +5. **Rule registry** — `{name, severity, category, enabled-by-default, check-fn}` (Eastwood) +6. **Standard output format** — `file:line:column: severity: rule: message` +7. **Inline suppression** — `; gulie:suppress rule-name` (Mallet/OCICL) + +### Should-have (v1) + +8. **Config file** — `.gulie.scm` with presets and rule enable/disable (Mallet) +9. **Auto-fix infrastructure** — Fixer registry, bottom-to-top application (OCICL/Mallet) +10. **Idiom suggestions** — Pattern → replacement rules (Kibit style) +11. **Unused binding detection** — Scope tracking with forward reference handling (racket-review) +12. **Quote/unquote awareness** — Don't lint inside quoted forms (OCICL) +13. **Snapshot-based testing** — `.scm`/`.expected` pairs (racket-review) + +### Nice-to-have (v2+) + +14. **Code formatter** — Cost-based optimal layout (fmt) +15. **Pluggable formatter maps** — Per-form formatting rules (fmt) +16. **Path-specific rule overrides** — Different rules for `src/` vs `tests/` (Mallet) +17. **Stale suppression detection** (Mallet) +18. **Editor integration** — Flycheck/flymake for Emacs (racket-review) +19. **Macroexpansion-aware analysis** — Suppress false positives from macro output (Eastwood) +20. 
**Cyclomatic complexity and other metrics** (Mallet) diff --git a/docs/PLAN.md b/docs/PLAN.md new file mode 100644 index 0000000..086298a --- /dev/null +++ b/docs/PLAN.md @@ -0,0 +1,470 @@ +# Gulie — Guile Linter/Formatter: Architecture & Implementation Plan + +## Context + +No linter, formatter, or static analyser exists for Guile Scheme. We're building +one from scratch, called **gulie**. The tool is written in Guile itself, reusing +as much of Guile's infrastructure as possible (reader, compiler, Tree-IL +analyses, warning system). The design draws on patterns observed in 7 reference +tools (see `docs/INSPIRATION.md`). + +Guile 3.0.11 is available in the devenv. No source code exists yet. + +--- + +## High-level architecture + +Two independent passes, extensible to three: + +``` + .gulie.sexp (config) + | + file.scm ──┬──> [Tokenizer] ──> tokens ──> [CST parser] ──> CST + | | + | [Pass 1: Surface] line rules + CST rules + | | + | diagnostics-1 + | + └──> [Guile reader] ──> s-exprs ──> [Guile compiler] ──> Tree-IL + | + [Pass 2: Semantic] built-in analyses + custom Tree-IL rules + | + diagnostics-2 + | + [merge + suppress + sort + report/fix] +``` + +**Why two passes?** Guile's reader (`ice-9/read.scm:949-973`) irrecoverably +strips comments, whitespace, and datum comments in `next-non-whitespace`. There +is no way to get formatting info AND semantic info from one parse. Accepting this +and building two clean, independent passes is simpler than fighting the reader. 
+
+---
+
+## Module structure
+
+```
+gulie/
+  bin/gulie            # CLI entry point (executable Guile script)
+  gulie/
+    cli.scm            # (gulie cli) — arg parsing, dispatch
+    config.scm         # (gulie config) — .gulie.sexp loading, defaults, merging
+    diagnostic.scm     # (gulie diagnostic) — record type, sorting, formatting
+    tokenizer.scm      # (gulie tokenizer) — hand-written lexer, preserves everything
+    cst.scm            # (gulie cst) — token stream → concrete syntax tree
+    compiler.scm       # (gulie compiler) — Guile compile wrapper, warning capture
+    rule.scm           # (gulie rule) — rule record, registry, define-rule macros
+    engine.scm         # (gulie engine) — orchestrator: file discovery, pass sequencing
+    fixer.scm          # (gulie fixer) — fix application (bottom-to-top edits)
+    suppression.scm    # (gulie suppression) — ; gulie:suppress parsing/filtering
+    formatter.scm      # (gulie formatter) — cost-based optimal pretty-printer
+    rules/
+      surface.scm      # (gulie rules surface) — trailing-ws, line-length, tabs, blanks
+      indentation.scm  # (gulie rules indentation) — indent checking vs CST
+      comments.scm     # (gulie rules comments) — comment style conventions
+      semantic.scm     # (gulie rules semantic) — wrappers around Guile's analyses
+      idiom.scm        # (gulie rules idiom) — pattern-based suggestions via match
+      module-form.scm  # (gulie rules module-form) — define-module checks
+  test/
+    test-tokenizer.scm
+    test-cst.scm
+    test-rules-surface.scm
+    test-rules-semantic.scm
+    fixtures/
+      clean/           # .scm files producing zero diagnostics
+      violations/      # .scm + .expected pairs (snapshot testing)
+```
+
+~16 source files. Each has one clear job.
+
+---
+
+## Key components
+
+### Tokenizer (`gulie/tokenizer.scm`)
+
+Hand-written character-by-character state machine. Must handle the same lexical
+syntax as Guile's reader but **preserve** what the reader discards.
+
+```scheme
+(define-record-type <token>
+  (make-token type text line column)
+  token?
+  (type token-type)      ;; symbol (see list below)
+  (text token-text)      ;; string: exact source text
+  (line token-line)      ;; integer: 1-based
+  (column token-column)) ;; integer: 0-based
+```
+
+Token types (~15): `open-paren`, `close-paren`, `symbol`, `number`, `string`,
+`keyword`, `boolean`, `character`, `prefix` (`'`, `` ` ``, `,`, `,@`, `#'`,
+etc.), `special` (`#;`, `#(`, `#vu8(`, etc.), `line-comment`, `block-comment`,
+`whitespace`, `newline`, `dot`.
+
+**Critical invariant:** `(string-concatenate (map token-text (tokenize input)))` must
+reproduce the original input exactly. This is our primary roundtrip test.
+
+Estimated size: ~200-250 lines. Reference: Mallet's tokenizer (163 lines CL).
+
+### CST (`gulie/cst.scm`)
+
+Trivial parenthesised tree built from the token stream:
+
+```scheme
+(define-record-type <cst-node>
+  (make-cst-node open close children)
+  cst-node?
+  (open cst-node-open)          ;; <token> for ( [ {
+  (close cst-node-close)        ;; <token> for ) ] }
+  (children cst-node-children)) ;; list of <token> | <cst-node>
+```
+
+Children is a flat list of interleaved atoms (tokens) and nested nodes. Comments
+and whitespace are children like anything else.
+
+The first non-whitespace symbol child of a `<cst-node>` identifies the form
+(`define`, `let`, `cond`, etc.) — enough for indentation rules.
+
+Estimated size: ~80-100 lines.
+
+### Compiler wrapper (`gulie/compiler.scm`)
+
+Wraps Guile's compile pipeline to capture warnings as structured diagnostics:
+
+```scheme
+;; Key Guile APIs we delegate to:
+;; - (system base compile): read-and-compile, compile, default-warning-level
+;; - (language tree-il analyze): make-analyzer, analyze-tree
+;; - (system base message): %warning-types, current-warning-port
+```
+
+Strategy: call `read-and-compile` with `#:to 'tree-il` and `#:warning-level 2`
+while redirecting `current-warning-port` to a string port, then parse the
+warning output into `<diagnostic>` records. Alternatively, invoke `make-analyzer`
+directly and hook the warning printers.
+
+Guile's built-in analyses (all free):
+- `unused-variable-analysis`
+- `unused-toplevel-analysis`
+- `unused-module-analysis`
+- `shadowed-toplevel-analysis`
+- `make-use-before-definition-analysis` (unbound variables)
+- `arity-analysis` (wrong arg count)
+- `format-analysis` (format string validation)
+
+### Rule system (`gulie/rule.scm`)
+
+```scheme
+(define-record-type <rule>
+  (make-rule name description severity category type check-proc fix-proc)
+  rule?
+  (name rule-name)               ;; symbol
+  (description rule-description) ;; string
+  (severity rule-severity)       ;; 'error | 'warning | 'info
+  (category rule-category)       ;; 'format | 'style | 'correctness | 'idiom
+  (type rule-type)               ;; 'line | 'cst | 'tree-il
+  (check-proc rule-check-proc)   ;; procedure (signature depends on type)
+  (fix-proc rule-fix-proc))      ;; procedure | #f
+```
+
+Three rule types with different check signatures:
+- **`'line`** — `(lambda (file line-num line-text config) -> diagnostics)` — fastest, no parsing
+- **`'cst`** — `(lambda (file cst config) -> diagnostics)` — needs tokenizer+CST
+- **`'tree-il`** — `(lambda (file tree-il env config) -> diagnostics)` — needs compilation
+
+Global registry: `*rules*` alist, populated at module load time via
+`register-rule!`. Convenience macros: `define-line-rule`, `define-cst-rule`,
+`define-tree-il-rule`.
+
+### Diagnostic record (`gulie/diagnostic.scm`)
+
+```scheme
+(define-record-type <diagnostic>
+  (make-diagnostic file line column severity rule message fix)
+  diagnostic?
+  (file diagnostic-file)         ;; string
+  (line diagnostic-line)         ;; integer, 1-based
+  (column diagnostic-column)     ;; integer, 0-based
+  (severity diagnostic-severity) ;; symbol
+  (rule diagnostic-rule)         ;; symbol
+  (message diagnostic-message)   ;; string
+  (fix diagnostic-fix))          ;; <fix> | #f
+```
+
+Standard output: `file:line:column: severity: rule: message`
+
+### Config (`gulie/config.scm`)
+
+File: `.gulie.sexp` in project root (plain s-expression, read with `(read)`,
+never evaluated):
+
+```scheme
+((line-length . 80)
+ (indent . 2)
+ (enable trailing-whitespace line-length unused-variable arity-mismatch)
+ (disable tabs)
+ (rules
+  (line-length (max . 100)))
+ (indent-rules
+  (with-syntax . 1)
+  (match . 1))
+ (ignore "build/**" ".direnv/**"))
+```
+
+Precedence: CLI flags > config file > built-in defaults.
+
+`--init` generates a template with all rules listed and commented.
+
+### Suppression (`gulie/suppression.scm`)
+
+```scheme
+;; gulie:suppress trailing-whitespace — suppress on next line
+(define x "messy")
+
+(define x "messy") ; gulie:suppress — suppress on this line
+
+;; gulie:disable line-length — region disable
+... code ...
+;; gulie:enable line-length — region enable
+```
+
+Parsed from raw text before rules run. Produces a suppression map that filters
+diagnostics after all rules have emitted.
+
+---
+
+## Indentation rules
+
+The key data is `scheme-indent-function` values from `.dir-locals.el` — an
+integer N meaning "N arguments on first line, then body indented +2":
+
+```scheme
+(define *default-indent-rules*
+  '((define . 1) (define* . 1) (define-public . 1) (define-syntax . 1)
+    (define-module . 0) (lambda . 1) (lambda* . 1)
+    (let . 1) (let* . 1) (letrec . 1) (letrec* . 1)
+    (if . #f) (cond . 0) (case . 1) (when . 1) (unless . 1)
+    (match . 1) (syntax-case . 2) (with-syntax . 1)
+    (begin . 0) (do . 2) (parameterize . 1) (guard . 1)))
+```
+
+Overridable via config `indent-rules`.
The indentation checker walks the CST, +identifies the form by its first symbol child, looks up the rule, and compares +actual indentation to expected. + +--- + +## Formatting conventions (Guile vs Guix) + +Both use 2-space indent, same special-form conventions. Key difference: +- **Guile:** 72-char fill column, `;;; {Section}` headers +- **Guix:** 78-80 char fill column, `;;` headers + +Our default config targets Guile conventions. A Guix preset can override +`line-length` and comment style. + +--- + +## Formatter: cost-based optimal pretty-printing + +The formatter (`gulie/formatter.scm`) is a later-phase component that +**rewrites** files with correct layout, as opposed to the indentation checker +which merely **reports** violations. + +### Why cost-based? + +When deciding where to break lines in a long expression, there are often multiple +valid options. A greedy approach (fill as much as fits, then break) produces +mediocre output — it can't "look ahead" to see that a break earlier would produce +a better overall layout. The Wadler/Leijen family of algorithms evaluates +alternative layouts and selects the optimal one. + +### The algorithm (Wadler/Leijen, as used by fmt's `pretty-expressive`) + +The pretty-printer works with an abstract **document** type: + +``` +doc = text(string) — literal text + | line — line break (or space if flattened) + | nest(n, doc) — increase indent by n + | concat(doc, doc) — concatenation + | alt(doc, doc) — choose better of two layouts + | group(doc) — try flat first, break if doesn't fit +``` + +The key operator is `alt(a, b)` — "try layout A, but if it overflows the page +width, use layout B instead." 
The algorithm evaluates both alternatives and +picks the one with the lower **cost vector**: + +``` +cost = [badness, height, characters] + + badness — quadratic penalty for exceeding page width + height — number of lines used + characters — total chars (tiebreaker) +``` + +This produces provably optimal output: the layout that minimises overflow while +using the fewest lines. + +### How it fits our architecture + +``` +CST (from tokenizer + cst.scm) + → [doc generator] convert CST nodes to abstract doc, using form-specific rules + → [layout solver] evaluate alternatives, select optimal layout + → [renderer] emit formatted text with comments preserved +``` + +The **doc generator** uses the same form-identification logic as the indentation +checker (first symbol child of a CST node) to apply form-specific layout rules. +For example: + +- `define` — name on first line, body indented +- `let` — bindings as aligned block, body indented +- `cond` — each clause on its own line + +These rules are data (the `indent-rules` table extended with layout hints), +making the formatter configurable just like the checker. + +### Implementation approach + +We can either: +1. **Port `pretty-expressive`** from Racket — the core algorithm is ~300 lines, + well-documented in academic papers +2. **Upgrade Guile's `(ice-9 pretty-print)`** — it already knows form-specific + indentation rules but uses greedy layout; we'd replace the layout engine with + cost-based selection + +Option 1 is cleaner (purpose-built). Option 2 reuses more existing code but +would be a heavier modification. We'll decide when we reach that phase. + +### Phase note + +The formatter is **Phase 6** work. Phases 0-4 deliver a useful checker without +it. The indentation checker (Phase 4) validates existing formatting; the +formatter (Phase 6) rewrites it. The checker comes first because it's simpler +and immediately useful in CI. + +--- + +## CLI interface + +``` +gulie [OPTIONS] [FILE|DIR...] 
+ + --check Report issues, exit non-zero on findings (default) + --fix Fix mode: auto-fix what's possible, report the rest + --format Format mode: rewrite files with optimal layout + --init Generate .gulie.sexp template + --pass PASS Run only: surface, semantic, all (default: all) + --rule RULE Enable only this rule (repeatable) + --disable RULE Disable this rule (repeatable) + --severity SEV Minimum severity: error, warning, info + --output FORMAT Output: standard (default), json, compact + --config FILE Config file path (default: auto-discover) + --list-rules List all rules and exit + --version Print version +``` + +Exit codes: 0 = clean, 1 = findings, 2 = config error, 3 = internal error. + +--- + +## Implementation phases + +### Phase 0: Skeleton +- `bin/gulie` — shebang script, loads CLI module +- `(gulie cli)` — basic arg parsing (`--check`, `--version`, file args) +- `(gulie diagnostic)` — record type + standard formatter +- `(gulie rule)` — record type + registry + `register-rule!` +- `(gulie engine)` — discovers `.scm` files, runs line rules, reports +- One trivial rule: `trailing-whitespace` (line rule) +- **Verification:** `gulie --check some-file.scm` reports trailing whitespace + +### Phase 1: Tokenizer + CST + surface rules +- `(gulie tokenizer)` — hand-written lexer +- `(gulie cst)` — token → tree +- Surface rules: `trailing-whitespace`, `line-length`, `no-tabs`, `blank-lines` +- Comment rule: `comment-semicolons` (check `;`/`;;`/`;;;` usage) +- Roundtrip test: tokenize → concat = original +- Snapshot tests for each rule + +### Phase 2: Semantic rules (compiler pass) +- `(gulie compiler)` — `read-and-compile` wrapper, warning capture +- Semantic rules wrapping Guile's built-in analyses: + `unused-variable`, `unused-toplevel`, `unbound-variable`, `arity-mismatch`, + `format-string`, `shadowed-toplevel`, `unused-module` +- **Verification:** run against Guile and Guix source files, check false-positive rate + +### Phase 3: Config + suppression +- 
`(gulie config)` — `.gulie.sexp` loading + merging +- `(gulie suppression)` — inline comment suppression +- `--init` command +- Rule enable/disable via config and CLI + +### Phase 4: Indentation checking +- `(gulie rules indentation)` — CST-based indent checker +- Default indent rules for standard Guile forms +- Configurable `indent-rules` in `.gulie.sexp` + +### Phase 5: Fix mode + idiom rules +- `(gulie fixer)` — bottom-to-top edit application +- Auto-fix for: trailing whitespace, line-length (where possible) +- `(gulie rules idiom)` — `match`-based pattern suggestions on Tree-IL +- `(gulie rules module-form)` — `define-module` form checks (sorted imports, etc.) + +### Phase 6: Formatter (cost-based optimal layout) +- `(gulie formatter)` — Wadler/Leijen pretty-printer with cost-based selection +- Abstract document type: `text`, `line`, `nest`, `concat`, `alt`, `group` +- Form-specific layout rules (reuse indent-rules table + layout hints) +- Comment preservation through formatting +- `--format` CLI mode +- **Verification:** format Guile/Guix source files, diff against originals, + verify roundtrip stability (format twice = same output) + +### Phase 7: Cross-module analysis (future) +- Load multiple modules, walk dependency graph +- Unused exports, cross-module arity checks +- `--pass cross-module` CLI option + +--- + +## Testing strategy + +1. **Roundtrip test** (tokenizer): tokenize → concat must equal original input +2. **Snapshot tests**: `fixtures/violations/rule-name.scm` + `.expected` pairs +3. **Clean file tests**: `fixtures/clean/*.scm` must produce zero diagnostics +4. **Unit tests**: `(srfi srfi-64)` for tokenizer, CST, config, diagnostics +5. **Real-world corpus**: run against `test/guix/` and `refs/guile/module/` for + false-positive rate validation +6. 
**Formatter idempotency**: `format(format(x)) = format(x)` for all test files + +--- + +## Key design decisions + +| Decision | Rationale | +|----------|-----------| +| Hand-written tokenizer, not extending Guile's reader | The reader is ~1000 lines of nested closures not designed for extension. A clean 200-line tokenizer is easier to write/test. | +| Two independent passes, not a unified AST | Reader strips comments irrecoverably. Accepting this gives clean separation. | +| Delegate to Guile's built-in analyses | They're battle-tested, handle macroexpansion edge cases, and are maintained upstream. | +| `(ice-9 match)` for idiom rules, not logic programming | Built-in, fast, sufficient. miniKanren can be added later if needed. | +| S-expression config, not YAML/TOML | Zero deps. Our users write Scheme. `(read)` does the parsing. | +| Flat CST (parens + interleaved tokens), not rich AST | Enough for indentation/formatting checks. No overengineering. | +| Cost-based optimal layout for the formatter | Greedy formatters produce mediocre output. Wadler/Leijen is cleaner and provably correct. Worth the investment when we reach that phase. | +| Checker first, formatter later | Checking is simpler, immediately useful in CI, and validates the tokenizer/CST infrastructure that the formatter will build on. 
| + +--- + +## Critical files to reference during implementation + +- `refs/guile/module/ice-9/read.scm:949-973` — what the reader discards (our tokenizer must keep) +- `refs/guile/module/language/tree-il/analyze.scm:1461-1479` — `make-analyzer` API +- `refs/guile/module/system/base/compile.scm:298-340` — `read-and-compile` / `compile` +- `refs/guile/module/system/base/message.scm:83-220` — `%warning-types` definitions +- `refs/guile/module/language/tree-il.scm` — Tree-IL node types and traversal +- `refs/guile/module/ice-9/pretty-print.scm` — existing pretty-printer (form-specific rules to extract) +- `refs/mallet/src/parser/tokenizer.lisp` — reference tokenizer (163 lines) +- `refs/fmt/conventions.rkt` — form-specific formatting rules (100+ forms) +- `refs/fmt/main.rkt` — cost-based layout selection implementation diff --git a/gulie/cli.scm b/gulie/cli.scm new file mode 100644 index 0000000..199211c --- /dev/null +++ b/gulie/cli.scm @@ -0,0 +1,105 @@ +;;; (gulie cli) — command-line argument parsing and main dispatch + +(define-module (gulie cli) + #:use-module (ice-9 getopt-long) + #:use-module (ice-9 format) + #:use-module (srfi srfi-1) + #:use-module (gulie config) + #:use-module (gulie engine) + #:use-module (gulie rule) + #:use-module (gulie diagnostic) + #:export (main)) + +(define version "0.1.0") + +(define option-spec + '((help (single-char #\h) (value #f)) + (version (single-char #\v) (value #f)) + (check (value #f)) + (fix (value #f)) + (init (value #f)) + (pass (value #t)) + (config (value #t)) + (rule (value #t)) + (disable (value #t)) + (severity (value #t)) + (output (value #t)) + (list-rules (value #f)))) + +(define (show-help) + (display "gulie — a linter and formatter for Guile Scheme\n\n") + (display "Usage: gulie [OPTIONS] [FILE|DIR...]\n\n") + (display "Options:\n") + (display " -h, --help Show this help message\n") + (display " -v, --version Print version\n") + (display " --check Check mode (default): report issues\n") + (display " --fix Fix 
mode: auto-fix what's possible\n") + (display " --init Generate .gulie.sexp template\n") + (display " --pass PASS Run only: surface, semantic, all (default: all)\n") + (display " --config FILE Config file path\n") + (display " --rule RULE Enable only this rule (repeatable)\n") + (display " --disable RULE Disable this rule\n") + (display " --severity SEV Minimum severity: error, warning, info\n") + (display " --output FORMAT Output format: standard, json, compact\n") + (display " --list-rules List all available rules\n")) + +(define (show-version) + (format #t "gulie ~a~%" version)) + +(define (list-all-rules) + (let ((rules (all-rules))) + (if (null? rules) + (display "No rules registered.\n") + (for-each + (lambda (r) + (format #t " ~20a ~8a ~10a ~a~%" + (rule-name r) + (rule-severity r) + (rule-category r) + (rule-description r))) + (sort rules (lambda (a b) + (stringstring (rule-name a)) + (symbol->string (rule-name b))))))))) + +(define (main args) + (let* ((options (getopt-long args option-spec)) + (rest (option-ref options '() '()))) + + (cond + ((option-ref options 'help #f) + (show-help) + 0) + + ((option-ref options 'version #f) + (show-version) + 0) + + ((option-ref options 'list-rules #f) + ;; Ensure rules are loaded + (list-all-rules) + 0) + + ((option-ref options 'init #f) + (let ((path ".gulie.sexp")) + (if (file-exists? path) + (begin + (format (current-error-port) "~a already exists~%" path) + 2) + (begin + (call-with-output-file path generate-template) + (format #t "i Generated ~a~%" path) + 0)))) + + (else + (let* ((config-path (option-ref options 'config #f)) + (user-config (load-config config-path)) + (config (merge-configs default-config user-config)) + (paths (if (null? rest) (list ".") rest)) + (ignore-pats (config-ignore-patterns config)) + (files (discover-scheme-files paths ignore-pats))) + (if (null? 
files) + (begin + (display "No Scheme files found.\n" (current-error-port)) + 0) + (let ((count (lint-files files config))) + (if (> count 0) 1 0)))))))) diff --git a/gulie/compiler.scm b/gulie/compiler.scm new file mode 100644 index 0000000..5b0b1cd --- /dev/null +++ b/gulie/compiler.scm @@ -0,0 +1,92 @@ +;;; (gulie compiler) — Guile compiler wrapper for semantic analysis +;;; +;;; Wraps Guile's compile pipeline to capture compiler warnings +;;; (unused variables, arity mismatches, format string errors, etc.) +;;; as structured records. + +(define-module (gulie compiler) + #:use-module (system base compile) + #:use-module (system base message) + #:use-module (ice-9 regex) + #:use-module (ice-9 match) + #:use-module (srfi srfi-1) + #:use-module (gulie diagnostic) + #:export (compile-and-capture-warnings)) + +;; Regex to parse Guile's warning format: +;; ;;; file:line:column: warning: message +(define *warning-re* + (make-regexp "^;;; ([^:]+):([0-9]+):([0-9]+): warning: (.+)$")) + +(define (parse-warning-line text file) + "Parse a warning line from Guile's compiler output into a ." + (let ((m (regexp-exec *warning-re* text))) + (if m + (let ((wfile (match:substring m 1)) + (wline (string->number (match:substring m 2))) + (wcol (string->number (match:substring m 3))) + (wmsg (match:substring m 4))) + (make-diagnostic + (if (string=? wfile "") file wfile) + wline wcol + 'warning + (classify-warning wmsg) + wmsg + #f)) + #f))) + +(define (classify-warning msg) + "Derive a rule name symbol from a warning message." 
+ (cond + ((string-contains msg "unused variable") 'unused-variable) + ((string-contains msg "unused local top-level") 'unused-toplevel) + ((string-contains msg "unused module") 'unused-module) + ((string-contains msg "shadows previous") 'shadowed-toplevel) + ((string-contains msg "unbound variable") 'unbound-variable) + ((string-contains msg "wrong number of arguments") 'arity-mismatch) + ((string-contains msg "used before definition") 'use-before-definition) + ((string-contains msg "macro") 'macro-use-before-definition) + ((string-contains msg "format") 'format-string) + ((string-contains msg "non-idempotent") 'non-idempotent-definition) + ((string-contains msg "duplicate datum") 'duplicate-case-datum) + ((string-contains msg "cannot be meaningfully") 'bad-case-datum) + (else 'compiler-warning))) + +(define (compile-and-capture-warnings file text config) + "Compile TEXT (as if from FILE) and capture all compiler warnings. +Returns a list of records." + (let* ((warning-output (open-output-string)) + (diagnostics '())) + (parameterize ((current-warning-port warning-output)) + (catch #t + (lambda () + (let ((port (open-input-string text))) + (set-port-filename! port file) + ;; Compile to CPS — analyses run during tree-il→CPS lowering + (read-and-compile port + #:from 'scheme + #:to 'cps + #:warning-level 3 + #:env (make-fresh-user-module)))) + (lambda (key . args) + ;; Compilation errors become diagnostics too + (let ((msg (call-with-output-string + (lambda (p) + (display key p) + (display ": " p) + (for-each (lambda (a) (display a p) (display " " p)) + args))))) + (set! diagnostics + (cons (make-diagnostic file 1 0 'error 'compile-error msg #f) + diagnostics)))))) + ;; Parse captured warnings + (let* ((output (get-output-string warning-output)) + (lines (string-split output #\newline))) + (for-each + (lambda (line) + (when (> (string-length line) 0) + (let ((diag (parse-warning-line line file))) + (when diag + (set! 
diagnostics (cons diag diagnostics)))))) + lines)) + diagnostics)) diff --git a/gulie/config.scm b/gulie/config.scm new file mode 100644 index 0000000..b576efa --- /dev/null +++ b/gulie/config.scm @@ -0,0 +1,129 @@ +;;; (gulie config) — configuration loading and merging +;;; +;;; Reads .gulie.sexp from project root (or CLI-specified path), +;;; merges with built-in defaults, and provides config accessors. + +(define-module (gulie config) + #:use-module (ice-9 rdelim) + #:use-module (srfi srfi-1) + #:export (default-config + load-config + merge-configs + config-ref + config-line-length + config-indent-width + config-max-blank-lines + config-enabled-rules + config-disabled-rules + config-ignore-patterns + config-indent-rules + generate-template)) + +(define default-config + '((line-length . 80) + (indent . 2) + (max-blank-lines . 2) + (enable . ()) + (disable . ()) + (ignore . ()) + (indent-rules + (define . 1) + (define* . 1) + (define-public . 1) + (define-syntax . 1) + (define-syntax-rule . 1) + (define-module . 0) + (define-record-type . 1) + (lambda . 1) + (lambda* . 1) + (let . 1) + (let* . 1) + (letrec . 1) + (letrec* . 1) + (let-values . 1) + (if . special) + (cond . 0) + (case . 1) + (when . 1) + (unless . 1) + (match . 1) + (match-lambda . 0) + (match-lambda* . 0) + (syntax-case . 2) + (syntax-rules . 1) + (with-syntax . 1) + (begin . 0) + (do . 2) + (parameterize . 1) + (guard . 1) + (with-exception-handler . 1) + (call-with-values . 1) + (receive . 2) + (use-modules . 0) + (with-fluids . 1) + (dynamic-wind . 0)))) + +(define (config-ref config key . default) + "Look up KEY in CONFIG alist, returning DEFAULT if not found." + (let ((pair (assq key config))) + (if pair + (cdr pair) + (if (null? 
default) #f (car default))))) + +(define (config-line-length config) + (or (config-ref config 'line-length) 80)) + +(define (config-indent-width config) + (or (config-ref config 'indent) 2)) + +(define (config-max-blank-lines config) + (or (config-ref config 'max-blank-lines) 2)) + +(define (config-enabled-rules config) + (or (config-ref config 'enable) '())) + +(define (config-disabled-rules config) + (or (config-ref config 'disable) '())) + +(define (config-ignore-patterns config) + (or (config-ref config 'ignore) '())) + +(define (config-indent-rules config) + (or (config-ref config 'indent-rules) '())) + +(define (load-config path) + "Load a .gulie.sexp config file at PATH. Returns an alist." + (if (and path (file-exists? path)) + (call-with-input-file path + (lambda (port) + (let ((data (read port))) + (if (list? data) data '())))) + '())) + +(define (merge-configs base override) + "Merge OVERRIDE config on top of BASE. Override wins for scalar values; +lists are replaced, not appended." + (let lp ((result base) + (pairs override)) + (if (null? pairs) + result + (let ((pair (car pairs))) + (lp (assq-set! (list-copy result) (car pair) (cdr pair)) + (cdr pairs)))))) + +(define (find-config-file start-dir) + "Search upward from START-DIR for .gulie.sexp. Returns path or #f." + (let lp ((dir start-dir)) + (let ((candidate (string-append dir "/.gulie.sexp"))) + (cond + ((file-exists? candidate) candidate) + ((string=? dir "/") #f) + (else (lp (dirname dir))))))) + +(define (generate-template port) + "Write a template .gulie.sexp to PORT." 
+ (display ";;; gulie configuration\n" port) + (display ";;; Place this file as .gulie.sexp in your project root.\n" port) + (display ";;; All fields are optional — defaults are shown below.\n\n" port) + (write default-config port) + (newline port)) diff --git a/gulie/cst.scm b/gulie/cst.scm new file mode 100644 index 0000000..0de8722 --- /dev/null +++ b/gulie/cst.scm @@ -0,0 +1,95 @@ +;;; (gulie cst) — concrete syntax tree from token stream +;;; +;;; Builds a tree that mirrors the parenthesised structure of the source +;;; while preserving ALL tokens (whitespace, comments, atoms). + +(define-module (gulie cst) + #:use-module (srfi srfi-9) + #:use-module (srfi srfi-1) + #:use-module (srfi srfi-11) + #:use-module (gulie tokenizer) + #:export ( + make-cst-node + cst-node? + cst-node-open + cst-node-close + cst-node-children + + parse-cst + cst-form-name + cst-significant-children)) + +;; A parenthesised group in the CST. +(define-record-type + (make-cst-node open close children) + cst-node? + (open cst-node-open) ;; for ( [ { + (close cst-node-close) ;; for ) ] } or #f if unmatched + (children cst-node-children)) ;; list of | + +(define (parse-cst tokens) + "Parse a flat list of tokens into a CST. +Returns a with a synthetic root (no open/close parens) +whose children are the top-level forms." + (let lp ((remaining tokens) (children '())) + (if (null? remaining) + (make-cst-node #f #f (reverse children)) + (let ((tok (car remaining)) + (rest (cdr remaining))) + (case (token-type tok) + ((open-paren) + ;; Recursively parse until matching close + (let-values (((node remaining*) (parse-group tok rest))) + (lp remaining* (cons node children)))) + ((close-paren) + ;; Unmatched close paren at top level — keep as-is + (lp rest (cons tok children))) + (else + (lp rest (cons tok children)))))))) + +(define (parse-group open-token tokens) + "Parse tokens after an open paren until the matching close. +Returns (values remaining-tokens)." 
+ (let lp ((remaining tokens) (children '())) + (if (null? remaining) + ;; Unclosed paren + (values (make-cst-node open-token #f (reverse children)) + '()) + (let ((tok (car remaining)) + (rest (cdr remaining))) + (case (token-type tok) + ((close-paren) + (values (make-cst-node open-token tok (reverse children)) + rest)) + ((open-paren) + (let-values (((node remaining*) (parse-group tok rest))) + (lp remaining* (cons node children)))) + (else + (lp rest (cons tok children)))))))) + +(define (cst-form-name node) + "For a , return the first significant symbol child as a string, +or #f if the node has no symbol children." + (let lp ((children (cst-node-children node))) + (if (null? children) + #f + (let ((child (car children))) + (cond + ((and (token? child) + (eq? (token-type child) 'symbol)) + (token-text child)) + ((and (token? child) + (memq (token-type child) '(whitespace newline line-comment + block-comment prefix))) + (lp (cdr children))) + (else #f)))))) + +(define (cst-significant-children node) + "Return children of NODE that are not whitespace, newlines, or comments." + (filter (lambda (child) + (or (cst-node? child) + (and (token? child) + (not (memq (token-type child) + '(whitespace newline line-comment + block-comment)))))) + (cst-node-children node))) diff --git a/gulie/diagnostic.scm b/gulie/diagnostic.scm new file mode 100644 index 0000000..d15e55f --- /dev/null +++ b/gulie/diagnostic.scm @@ -0,0 +1,91 @@ +;;; (gulie diagnostic) — diagnostic record type, sorting, formatting +;;; +;;; A represents a single finding from a rule check. +;;; Diagnostics are the universal currency between rules, the engine, +;;; suppression filtering, and the reporter. + +(define-module (gulie diagnostic) + #:use-module (srfi srfi-9) + #:use-module (ice-9 format) + #:export ( + make-diagnostic + diagnostic? + diagnostic-file + diagnostic-line + diagnostic-column + diagnostic-severity + diagnostic-rule + diagnostic-message + diagnostic-fix + + + make-fix + fix? 
+ fix-type + fix-line + fix-column + fix-end-line + fix-end-column + fix-replacement + + diagnostic + (make-diagnostic file line column severity rule message fix) + diagnostic? + (file diagnostic-file) ;; string: file path + (line diagnostic-line) ;; integer: 1-based line number + (column diagnostic-column) ;; integer: 0-based column + (severity diagnostic-severity) ;; symbol: error | warning | info + (rule diagnostic-rule) ;; symbol: rule name + (message diagnostic-message) ;; string: human-readable message + (fix diagnostic-fix)) ;; | #f + +;; An auto-fix that can be applied to resolve a diagnostic. +(define-record-type + (make-fix type line column end-line end-column replacement) + fix? + (type fix-type) ;; symbol: replace-line | delete-range | replace-range + (line fix-line) ;; integer: 1-based + (column fix-column) ;; integer: 0-based + (end-line fix-end-line) ;; integer: 1-based + (end-column fix-end-column) ;; integer: 0-based + (replacement fix-replacement)) ;; string | #f + +(define (diagnostic? fa fb) #f) + (else + (let ((la (diagnostic-line a)) + (lb (diagnostic-line b))) + (cond + ((< la lb) #t) + ((> la lb) #f) + (else (< (diagnostic-column a) (diagnostic-column b))))))))) + +(define (severity->string sev) + (symbol->string sev)) + +(define (format-diagnostic diag) + "Format a diagnostic as file:line:column: severity: rule: message" + (format #f "~a:~a:~a: ~a: ~a: ~a" + (diagnostic-file diag) + (diagnostic-line diag) + (diagnostic-column diag) + (severity->string (diagnostic-severity diag)) + (diagnostic-rule diag) + (diagnostic-message diag))) + +(define (format-diagnostics diags port) + "Write all diagnostics to PORT, sorted by location." + (for-each (lambda (d) + (display (format-diagnostic d) port) + (newline port)) + (sort diags diagnostic + make-rule + rule? + rule-name + rule-description + rule-severity + rule-category + rule-type + rule-check-proc + rule-fix-proc + + register-rule! 
+ all-rules + rules-of-type + find-rule + clear-rules!)) + +;; A lint/format rule. +(define-record-type + (make-rule name description severity category type check-proc fix-proc) + rule? + (name rule-name) ;; symbol + (description rule-description) ;; string + (severity rule-severity) ;; symbol: error | warning | info + (category rule-category) ;; symbol: format | style | correctness | idiom + (type rule-type) ;; symbol: line | cst | tree-il + (check-proc rule-check-proc) ;; procedure + (fix-proc rule-fix-proc)) ;; procedure | #f + +;; Global rule registry. +(define *rules* '()) + +(define (register-rule! rule) + "Register a rule in the global registry." + (set! *rules* (cons rule *rules*))) + +(define (all-rules) + "Return all registered rules." + (reverse *rules*)) + +(define (rules-of-type type) + "Return all registered rules of the given TYPE." + (filter (lambda (r) (eq? (rule-type r) type)) + *rules*)) + +(define (find-rule name) + "Find a rule by NAME, or #f." + (find (lambda (r) (eq? (rule-name r) name)) + *rules*)) + +(define (clear-rules!) + "Clear all registered rules. Useful for testing." + (set! *rules* '())) diff --git a/gulie/rules/comments.scm b/gulie/rules/comments.scm new file mode 100644 index 0000000..1ec6629 --- /dev/null +++ b/gulie/rules/comments.scm @@ -0,0 +1,71 @@ +;;; (gulie rules comments) — comment style conventions +;;; +;;; Checks that comments follow standard Scheme conventions: +;;; ; — inline comments (after code on same line) +;;; ;; — line comments (own line, aligned with code) +;;; ;;; — section/file-level comments +;;; ;;;; — file headers + +(define-module (gulie rules comments) + #:use-module (gulie rule) + #:use-module (gulie diagnostic)) + +;; Count leading semicolons in a comment string. +(define (count-semicolons text) + (let lp ((i 0)) + (if (and (< i (string-length text)) + (char=? (string-ref text i) #\;)) + (lp (1+ i)) + i))) + +;; Is a line (before the comment) only whitespace? +(define (comment-only-line? 
line-text comment-col) + (let lp ((i 0)) + (cond + ((>= i comment-col) #t) + ((char-whitespace? (string-ref line-text i)) (lp (1+ i))) + (else #f)))) + +(register-rule! + (make-rule + 'comment-semicolons + "Check comment semicolon count follows conventions" + 'info 'style 'line + (lambda (file line-num line-text config) + (let ((pos (string-index line-text #\;))) + (if (not pos) + '() + ;; Check if the semicolon is inside a string (rough heuristic: + ;; count quotes before the semicolon position) + (let ((quotes-before (let lp ((i 0) (count 0) (in-escape #f)) + (cond + ((>= i pos) count) + ((and (not in-escape) (char=? (string-ref line-text i) #\\)) + (lp (1+ i) count #t)) + ((and (not in-escape) (char=? (string-ref line-text i) #\")) + (lp (1+ i) (1+ count) #f)) + (else (lp (1+ i) count #f)))))) + (if (odd? quotes-before) + ;; Inside a string — not a real comment + '() + (let* ((semis (count-semicolons (substring line-text pos))) + (own-line? (comment-only-line? line-text pos))) + (cond + ;; Inline comment (after code) should use single ; + ;; But we don't enforce this strictly — just flag ;;; or more inline + ((and (not own-line?) (>= semis 3)) + (list (make-diagnostic + file line-num pos + 'info 'comment-semicolons + "inline comments should use ; or ;; not ;;;" + #f))) + ;; Own-line comment with single ; (should be ;;) + ((and own-line? (= semis 1) (> (string-length line-text) (1+ pos)) + (not (char=? (string-ref line-text (1+ pos)) #\!))) + (list (make-diagnostic + file line-num pos + 'info 'comment-semicolons + "line comments should use ;; not ;" + #f))) + (else '())))))))) + #f)) diff --git a/gulie/rules/surface.scm b/gulie/rules/surface.scm new file mode 100644 index 0000000..5a7746b --- /dev/null +++ b/gulie/rules/surface.scm @@ -0,0 +1,93 @@ +;;; (gulie rules surface) — surface-level line rules +;;; +;;; These rules operate on raw text lines. They need no parsing — +;;; just the file path, line number, and line content. 
+ +(define-module (gulie rules surface) + #:use-module (gulie rule) + #:use-module (gulie diagnostic)) + +;;; trailing-whitespace — trailing spaces or tabs at end of line + +(register-rule! + (make-rule + 'trailing-whitespace + "Line has trailing whitespace" + 'warning 'format 'line + (lambda (file line-num line-text config) + (let ((trimmed (string-trim-right line-text))) + (if (and (not (string=? line-text trimmed)) + (> (string-length line-text) 0)) + (list (make-diagnostic + file line-num + (string-length trimmed) + 'warning 'trailing-whitespace + "trailing whitespace" + (make-fix 'replace-line line-num 0 + line-num (string-length line-text) + trimmed))) + '()))) + #f)) + +;;; line-length — line exceeds maximum width + +(define (config-max-line-length config) + (or (assq-ref config 'line-length) 80)) + +(register-rule! + (make-rule + 'line-length + "Line exceeds maximum length" + 'warning 'format 'line + (lambda (file line-num line-text config) + (let ((max-len (config-max-line-length config))) + (if (> (string-length line-text) max-len) + (list (make-diagnostic + file line-num max-len + 'warning 'line-length + (format #f "line exceeds ~a characters (~a)" + max-len (string-length line-text)) + #f)) + '()))) + #f)) + +;;; no-tabs — tab characters in source + +(register-rule! + (make-rule + 'no-tabs + "Tab character found in source" + 'warning 'format 'line + (lambda (file line-num line-text config) + (let ((pos (string-index line-text #\tab))) + (if pos + (list (make-diagnostic + file line-num pos + 'warning 'no-tabs + "tab character found; use spaces for indentation" + #f)) + '()))) + #f)) + +;;; blank-lines — excessive consecutive blank lines + +(register-rule! + (make-rule + 'blank-lines + "Excessive consecutive blank lines" + 'warning 'format 'line + (lambda (file line-num line-text config) + ;; This rule uses a stateful approach: the engine tracks consecutive + ;; blank lines and passes the count via config. 
See engine.scm for + ;; the blank-line counting logic. + (let ((max-blanks (or (assq-ref config 'max-blank-lines) 2)) + (consecutive (or (assq-ref config '%consecutive-blanks) 0))) + (if (and (string-every char-whitespace? line-text) + (> consecutive max-blanks)) + (list (make-diagnostic + file line-num 0 + 'warning 'blank-lines + (format #f "more than ~a consecutive blank lines" max-blanks) + #f)) + '()))) + #f)) diff --git a/gulie/suppression.scm b/gulie/suppression.scm new file mode 100644 index 0000000..40cc12d --- /dev/null +++ b/gulie/suppression.scm @@ -0,0 +1,139 @@ +;;; (gulie suppression) — inline suppression via comments +;;; +;;; Parses ; gulie:suppress and ; gulie:disable/enable directives +;;; from raw source text. Returns a suppression map used to filter +;;; diagnostics after all rules have run. + +(define-module (gulie suppression) + #:use-module (ice-9 regex) + #:use-module (ice-9 rdelim) + #:use-module (srfi srfi-1) + #:use-module (gulie diagnostic) + #:export (parse-suppressions + filter-suppressions)) + +;; A suppression entry. +;; line: 1-based line number this applies to +;; rules: list of rule name symbols, or #t for "all rules" +;; kind: 'this-line | 'next-line | 'region-start | 'region-end +(define (make-suppression line rules kind) + (list line rules kind)) + +(define (suppression-line s) (car s)) +(define (suppression-rules s) (cadr s)) +(define (suppression-kind s) (caddr s)) + +(define *suppress-re* + (make-regexp ";+\\s*gulie:suppress\\s*(.*)$")) + +(define *disable-re* + (make-regexp ";+\\s*gulie:disable\\s+(.+)$")) + +(define *enable-re* + (make-regexp ";+\\s*gulie:enable\\s+(.+)$")) + +(define (parse-rule-names str) + "Parse space-separated rule names from STR. Empty → #t (all rules)." + (let ((trimmed (string-trim-both str))) + (if (string-null? trimmed) + #t + (map string->symbol (string-split trimmed #\space))))) + +(define (parse-suppressions text) + "Parse suppression directives from source TEXT. 
+Returns a list of (line rules kind) entries." + (let ((lines (string-split text #\newline)) + (result '())) + (let lp ((lines lines) (line-num 1) (acc '())) + (if (null? lines) + (reverse acc) + (let ((line (car lines))) + (cond + ;; ; gulie:suppress [rules...] — on the SAME line if code precedes it, + ;; or on the NEXT line if the line is comment-only + ((regexp-exec *suppress-re* line) + => (lambda (m) + (let* ((rules (parse-rule-names (match:substring m 1))) + (trimmed (string-trim line)) + ;; If line starts with ;, it's comment-only → suppress next line + (kind (if (char=? (string-ref trimmed 0) #\;) + 'next-line + 'this-line)) + (target-line (if (eq? kind 'next-line) + (1+ line-num) + line-num))) + (lp (cdr lines) (1+ line-num) + (cons (make-suppression target-line rules kind) acc))))) + ;; ; gulie:disable rule — region start + ((regexp-exec *disable-re* line) + => (lambda (m) + (let ((rules (parse-rule-names (match:substring m 1)))) + (lp (cdr lines) (1+ line-num) + (cons (make-suppression line-num rules 'region-start) acc))))) + ;; ; gulie:enable rule — region end + ((regexp-exec *enable-re* line) + => (lambda (m) + (let ((rules (parse-rule-names (match:substring m 1)))) + (lp (cdr lines) (1+ line-num) + (cons (make-suppression line-num rules 'region-end) acc))))) + (else + (lp (cdr lines) (1+ line-num) acc)))))))) + +(define (build-suppression-set suppressions) + "Build a procedure (line rule-name) -> #t if suppressed." + ;; Point suppressions: hash of line-num -> rules + (let ((point-map (make-hash-table)) + (regions '())) + ;; Collect point suppressions and regions + (for-each + (lambda (s) + (case (suppression-kind s) + ((this-line next-line) + (hashv-set! point-map (suppression-line s) + (suppression-rules s))) + ((region-start) + (set! regions (cons s regions))))) + suppressions) + ;; Build region intervals + (let ((region-intervals + (let lp ((remaining (reverse regions)) (intervals '())) + (if (null? 
remaining) + intervals + (let* ((start (car remaining)) + ;; Find matching end + (end-entry (find (lambda (s) + (and (eq? (suppression-kind s) 'region-end) + (> (suppression-line s) (suppression-line start)) + (equal? (suppression-rules s) + (suppression-rules start)))) + suppressions))) + (lp (cdr remaining) + (cons (list (suppression-line start) + (if end-entry (suppression-line end-entry) 999999) + (suppression-rules start)) + intervals))))))) + ;; Return predicate + (lambda (line rule-name) + (or + ;; Check point suppressions + (let ((rules (hashv-ref point-map line))) + (and rules + (or (eq? rules #t) + (memq rule-name rules)))) + ;; Check region suppressions + (any (lambda (interval) + (and (>= line (car interval)) + (<= line (cadr interval)) + (let ((rules (caddr interval))) + (or (eq? rules #t) + (memq rule-name rules))))) + region-intervals)))))) + +(define (filter-suppressions diagnostics suppressions) + "Remove diagnostics that are suppressed." + (if (null? suppressions) + diagnostics + (let ((suppressed? (build-suppression-set suppressions))) + (filter (lambda (d) + (not (suppressed? (diagnostic-line d) (diagnostic-rule d)))) + diagnostics)))) diff --git a/gulie/tokenizer.scm b/gulie/tokenizer.scm new file mode 100644 index 0000000..07c171d --- /dev/null +++ b/gulie/tokenizer.scm @@ -0,0 +1,348 @@ +;;; (gulie tokenizer) — hand-written lexer preserving all tokens +;;; +;;; Tokenises Guile Scheme source code into a flat list of +;;; records, preserving whitespace, comments, and exact source text. +;;; +;;; Critical invariant: +;;; (string-concatenate (map token-text (tokenize input))) +;;; ≡ input + +(define-module (gulie tokenizer) + #:use-module (srfi srfi-9) + #:use-module (srfi srfi-11) + #:export ( + make-token + token? + token-type + token-text + token-line + token-column + + tokenize)) + +(define-record-type + (make-token type text line column) + token? 
+ (type token-type) ;; symbol + (text token-text) ;; string: exact source text + (line token-line) ;; integer: 1-based + (column token-column)) ;; integer: 0-based + +(define (tokenize input filename) + "Tokenise INPUT string. Returns a list of records. +FILENAME is used only for error messages." + (let ((port (open-input-string input)) + (tokens '()) + (line 1) + (col 0)) + + (define (peek) (peek-char port)) + + (define (advance!) + (let ((ch (read-char port))) + (cond + ((eof-object? ch) ch) + ((char=? ch #\newline) + (set! line (1+ line)) + (set! col 0) + ch) + (else + (set! col (1+ col)) + ch)))) + + (define (emit! type text start-line start-col) + (set! tokens (cons (make-token type text start-line start-col) + tokens))) + + (define (collect-while first pred) + "Collect chars starting with FIRST while PRED holds on peek." + (let lp ((acc (list first))) + (let ((ch (peek))) + (if (and (not (eof-object? ch)) (pred ch)) + (begin (advance!) (lp (cons ch acc))) + (list->string (reverse acc)))))) + + (define (char-hex? ch) + (or (char-numeric? ch) + (memv ch '(#\a #\b #\c #\d #\e #\f + #\A #\B #\C #\D #\E #\F)))) + + (define (delimiter? ch) + (or (eof-object? ch) + (char-whitespace? ch) + (memv ch '(#\( #\) #\[ #\] #\{ #\} #\" #\; #\#)))) + + (define (read-string-literal) + "Read string body after opening quote. Returns full string including quotes." + (let lp ((acc (list #\"))) + (let ((ch (advance!))) + (cond + ((eof-object? ch) + (list->string (reverse acc))) + ((char=? ch #\\) + (let ((next (advance!))) + (if (eof-object? next) + (list->string (reverse (cons ch acc))) + (lp (cons next (cons ch acc)))))) + ((char=? ch #\") + (list->string (reverse (cons ch acc)))) + (else + (lp (cons ch acc))))))) + + (define (read-line-comment first) + "Read from FIRST (;) to end of line, not including the newline." + (collect-while first (lambda (ch) (not (char=? ch #\newline))))) + + (define (read-block-comment) + "Read block comment body after #|. 
Returns full text including #| and |#." + (let lp ((acc (list #\| #\#)) (depth 1)) + (let ((ch (advance!))) + (cond + ((eof-object? ch) + (list->string (reverse acc))) + ((and (char=? ch #\|) (eqv? (peek) #\#)) + (let ((hash (advance!))) + (if (= depth 1) + (list->string (reverse (cons hash (cons ch acc)))) + (lp (cons hash (cons ch acc)) (1- depth))))) + ((and (char=? ch #\#) (eqv? (peek) #\|)) + (let ((pipe (advance!))) + (lp (cons pipe (cons ch acc)) (1+ depth)))) + (else + (lp (cons ch acc) depth)))))) + + (define (read-character-literal) + "Read character literal after #\\. Returns full text including #\\." + (let ((ch (advance!))) + (cond + ((eof-object? ch) "#\\") + ((and (char-alphabetic? ch) + (let ((pk (peek))) + (and (not (eof-object? pk)) + (char-alphabetic? pk)))) + (string-append "#\\" + (collect-while ch char-alphabetic?))) + ((and (char=? ch #\x) + (let ((pk (peek))) + (and (not (eof-object? pk)) + (char-hex? pk)))) + (string-append "#\\" + (collect-while ch + (lambda (c) (or (char-hex? c) (char=? c #\x)))))) + (else + (string-append "#\\" (string ch)))))) + + (define (read-shebang) + "Read shebang/directive after #!. Returns full text." + (let lp ((acc (list #\! #\#))) + (let ((c (advance!))) + (cond + ((eof-object? c) + (list->string (reverse acc))) + ((and (char=? c #\!) (eqv? (peek) #\#)) + (let ((h (advance!))) + (list->string (reverse (cons h (cons c acc)))))) + (else + (lp (cons c acc))))))) + + (define (read-other-sharp next) + "Read # sequence where NEXT is first char after #. Returns full text." + (let ((text (string-append "#" + (collect-while next + (lambda (c) (not (delimiter? c))))))) + (if (eqv? (peek) #\() + (begin (advance!) + (values 'special (string-append text "("))) + (values 'special text)))) + + ;; Main tokenisation loop + (let lp () + (let ((ch (peek))) + (cond + ((eof-object? ch) + (reverse tokens)) + + ;; Newline + ((char=? ch #\newline) + (let ((sl line) (sc col)) + (advance!) + (emit! 
'newline "\n" sl sc) + (lp))) + + ;; Whitespace (non-newline) + ((char-whitespace? ch) + (let ((sl line) (sc col)) + (advance!) + (emit! 'whitespace + (collect-while ch + (lambda (c) (and (char-whitespace? c) + (not (char=? c #\newline))))) + sl sc) + (lp))) + + ;; Line comment + ((char=? ch #\;) + (let ((sl line) (sc col)) + (advance!) + (emit! 'line-comment (read-line-comment ch) sl sc) + (lp))) + + ;; String + ((char=? ch #\") + (let ((sl line) (sc col)) + (advance!) + (emit! 'string (read-string-literal) sl sc) + (lp))) + + ;; Open paren + ((memv ch '(#\( #\[ #\{)) + (let ((sl line) (sc col)) + (advance!) + (emit! 'open-paren (string ch) sl sc) + (lp))) + + ;; Close paren + ((memv ch '(#\) #\] #\})) + (let ((sl line) (sc col)) + (advance!) + (emit! 'close-paren (string ch) sl sc) + (lp))) + + ;; Quote + ((char=? ch #\') + (let ((sl line) (sc col)) + (advance!) + (emit! 'prefix "'" sl sc) + (lp))) + + ;; Quasiquote + ((char=? ch #\`) + (let ((sl line) (sc col)) + (advance!) + (emit! 'prefix "`" sl sc) + (lp))) + + ;; Unquote / unquote-splicing + ((char=? ch #\,) + (let ((sl line) (sc col)) + (advance!) + (cond + ((eqv? (peek) #\@) + (advance!) + (emit! 'prefix ",@" sl sc)) + (else + (emit! 'prefix "," sl sc))) + (lp))) + + ;; Sharp sequences + ((char=? ch #\#) + (let ((sl line) (sc col)) + (advance!) ;; consume # + (let ((next (peek))) + (cond + ;; Block comment #|...|# + ((eqv? next #\|) + (advance!) + (emit! 'block-comment (read-block-comment) sl sc) + (lp)) + + ;; Datum comment #; + ((eqv? next #\;) + (advance!) + (emit! 'special "#;" sl sc) + (lp)) + + ;; Boolean #t, #f, #true, #false + ((or (eqv? next #\t) (eqv? next #\f)) + (advance!) + (let* ((rest (if (and (not (eof-object? (peek))) + (char-alphabetic? (peek))) + (collect-while next char-alphabetic?) + (string next))) + (text (string-append "#" rest))) + (emit! 'boolean text sl sc) + (lp))) + + ;; Character literal #\x + ((eqv? next #\\) + (advance!) + (emit! 
'character (read-character-literal) sl sc) + (lp)) + + ;; Keyword #:foo + ((eqv? next #\:) + (advance!) + (let ((name (if (and (not (eof-object? (peek))) + (not (delimiter? (peek)))) + (collect-while (advance!) + (lambda (c) (not (delimiter? c)))) + ""))) + (emit! 'keyword (string-append "#:" name) sl sc) + (lp))) + + ;; Syntax shorthands: #', #`, #,, #,@ + ((eqv? next #\') + (advance!) + (emit! 'prefix "#'" sl sc) + (lp)) + ((eqv? next #\`) + (advance!) + (emit! 'prefix "#`" sl sc) + (lp)) + ((eqv? next #\,) + (advance!) + (cond + ((eqv? (peek) #\@) + (advance!) + (emit! 'prefix "#,@" sl sc)) + (else + (emit! 'prefix "#," sl sc))) + (lp)) + + ;; Vector #( + ((eqv? next #\() + (advance!) + (emit! 'special "#(" sl sc) + (lp)) + + ;; Shebang #!...!# + ((eqv? next #\!) + (advance!) + (emit! 'block-comment (read-shebang) sl sc) + (lp)) + + ;; Other # sequences: #vu8(, #*, etc. + ((and (not (eof-object? next)) + (not (delimiter? next))) + (advance!) + (let-values (((type text) (read-other-sharp next))) + (emit! type text sl sc) + (lp))) + + ;; Bare # at delimiter boundary + (else + (emit! 'symbol "#" sl sc) + (lp)))))) + + ;; Dot + ((char=? ch #\.) + (let ((sl line) (sc col)) + (advance!) + (if (delimiter? (peek)) + (begin + (emit! 'dot "." sl sc) + (lp)) + (let ((text (collect-while ch + (lambda (c) (not (delimiter? c)))))) + (emit! 'symbol text sl sc) + (lp))))) + + ;; Everything else: symbol or number + (else + (let ((sl line) (sc col)) + (advance!) + (let ((text (collect-while ch + (lambda (c) (not (delimiter? c)))))) + (emit! (if (string->number text) 'number 'symbol) + text sl sc) + (lp))))))))) diff --git a/test/fixtures/clean/well-formatted.scm b/test/fixtures/clean/well-formatted.scm new file mode 100644 index 0000000..13c8bd5 --- /dev/null +++ b/test/fixtures/clean/well-formatted.scm @@ -0,0 +1,13 @@ +;;; A well-formatted Guile source file. +;;; This should produce zero surface diagnostics. 
+ +(define-module (test well-formatted) + #:export (greet add)) + +;; Greet a person by name. +(define (greet name) + (string-append "Hello, " name "!")) + +;; Add two numbers. +(define (add a b) + (+ a b)) diff --git a/test/fixtures/violations/semantic.scm b/test/fixtures/violations/semantic.scm new file mode 100644 index 0000000..eb9d76a --- /dev/null +++ b/test/fixtures/violations/semantic.scm @@ -0,0 +1,9 @@ +(define-module (test semantic) + #:use-module (ice-9 format)) + +(define (foo x y) + (let ((unused 42)) + (+ x 1))) + +(define (bar a) + (baz a)) diff --git a/test/fixtures/violations/surface.scm b/test/fixtures/violations/surface.scm new file mode 100644 index 0000000..f16c253 --- /dev/null +++ b/test/fixtures/violations/surface.scm @@ -0,0 +1,12 @@ +(define x 42) +(define y "hello") +(define z (+ x y)) + +;; This line is fine +(define (long-function-name-that-exceeds-the-default-eighty-character-limit arg1 arg2 arg3 arg4 arg5) + (+ arg1 arg2)) + + + +;; After too many blank lines +(define w 99) diff --git a/test/run-tests.scm b/test/run-tests.scm new file mode 100644 index 0000000..cc0efba --- /dev/null +++ b/test/run-tests.scm @@ -0,0 +1,34 @@ +#!/usr/bin/env -S guile --no-auto-compile -s +!# +;;; Test runner for gulie + +;; Add project root to load path +(let ((dir (dirname (dirname (current-filename))))) + (set! %load-path (cons dir %load-path))) + +(use-modules (srfi srfi-64)) + +;; Configure test runner for CI-friendly output +(test-runner-current + (let ((runner (test-runner-simple))) + (test-runner-on-final! 
runner + (lambda (runner) + (let ((pass (test-runner-pass-count runner)) + (fail (test-runner-fail-count runner)) + (skip (test-runner-skip-count runner))) + (newline) + (format #t "Results: ~a passed, ~a failed, ~a skipped~%" + pass fail skip) + (when (> fail 0) + (exit 1))))) + runner)) + +;; Load and run all test files (paths relative to project root) +(let ((root (dirname (dirname (current-filename))))) + (define (load-test name) + (load (string-append root "/test/" name))) + (load-test "test-tokenizer.scm") + (load-test "test-cst.scm") + (load-test "test-rules.scm") + (load-test "test-suppression.scm") + (load-test "test-compiler.scm")) diff --git a/test/test-compiler.scm b/test/test-compiler.scm new file mode 100644 index 0000000..de16361 --- /dev/null +++ b/test/test-compiler.scm @@ -0,0 +1,56 @@ +;;; Tests for (gulie compiler) — semantic analysis pass + +(use-modules (srfi srfi-64) + (srfi srfi-1) + (gulie compiler) + (gulie diagnostic)) + +(test-begin "compiler") + +(test-group "unused-variable" + (let ((diags (compile-and-capture-warnings + "test.scm" + "(define (foo x)\n (let ((unused 42))\n x))\n" + '()))) + (test-assert "detects unused variable" + (any (lambda (d) (eq? (diagnostic-rule d) 'unused-variable)) + diags)))) + +(test-group "unbound-variable" + (let ((diags (compile-and-capture-warnings + "test.scm" + "(define (foo x)\n (+ x unknown-thing))\n" + '()))) + (test-assert "detects unbound variable" + (any (lambda (d) (eq? (diagnostic-rule d) 'unbound-variable)) + diags)))) + +(test-group "arity-mismatch" + (let ((diags (compile-and-capture-warnings + "test.scm" + "(define (foo x) x)\n(define (bar) (foo 1 2 3))\n" + '()))) + (test-assert "detects arity mismatch" + (any (lambda (d) (eq? 
(diagnostic-rule d) 'arity-mismatch)) + diags)))) + +(test-group "clean-code" + (let ((diags (compile-and-capture-warnings + "test.scm" + "(define (foo x) (+ x 1))\n" + '()))) + ;; May have unused-toplevel but no real errors + (test-assert "no compile errors" + (not (any (lambda (d) (eq? (diagnostic-severity d) 'error)) + diags))))) + +(test-group "syntax-error" + (let ((diags (compile-and-capture-warnings + "test.scm" + "(define (foo x) (+ x" + '()))) + (test-assert "catches syntax error" + (any (lambda (d) (eq? (diagnostic-severity d) 'error)) + diags)))) + +(test-end "compiler") diff --git a/test/test-cst.scm b/test/test-cst.scm new file mode 100644 index 0000000..d6514a6 --- /dev/null +++ b/test/test-cst.scm @@ -0,0 +1,59 @@ +;;; Tests for (gulie cst) + +(use-modules (srfi srfi-1) + (srfi srfi-64) + (gulie tokenizer) + (gulie cst)) + +(test-begin "cst") + +(test-group "basic-parsing" + (let* ((tokens (tokenize "(define x 42)" "test.scm")) + (cst (parse-cst tokens))) + (test-assert "root is cst-node" (cst-node? cst)) + (test-assert "root has no open paren" (not (cst-node-open cst))) + (let ((sig (cst-significant-children cst))) + (test-equal "one top-level form" 1 (length sig)) + (test-assert "top-level is cst-node" (cst-node? (car sig)))))) + +(test-group "form-name" + (let* ((tokens (tokenize "(define x 42)" "test.scm")) + (cst (parse-cst tokens)) + (form (car (cst-significant-children cst)))) + (test-equal "form name is define" "define" (cst-form-name form)))) + +(test-group "nested-forms" + (let* ((tokens (tokenize "(let ((x 1)) (+ x 2))" "test.scm")) + (cst (parse-cst tokens)) + (form (car (cst-significant-children cst)))) + (test-equal "form name is let" "let" (cst-form-name form)) + ;; Should have nested cst-nodes for ((x 1)) and (+ x 2) + (let ((inner-nodes (filter cst-node? 
(cst-node-children form)))) + (test-assert "has nested nodes" (>= (length inner-nodes) 2))))) + +(test-group "multiple-top-level" + (let* ((tokens (tokenize "(define a 1)\n(define b 2)\n(define c 3)" "test.scm")) + (cst (parse-cst tokens)) + (sig (cst-significant-children cst))) + (test-equal "three top-level forms" 3 (length sig)))) + +(test-group "comments-preserved" + (let* ((tokens (tokenize ";; header\n(define x 1)\n" "test.scm")) + (cst (parse-cst tokens)) + (children (cst-node-children cst))) + ;; Should include the comment as a token child + (test-assert "has comment token" + (any (lambda (c) + (and (token? c) (eq? (token-type c) 'line-comment))) + children)))) + +(test-group "prefix-handling" + (let* ((tokens (tokenize "'(1 2 3)" "test.scm")) + (cst (parse-cst tokens)) + (children (cst-node-children cst))) + (test-assert "has prefix token" + (any (lambda (c) + (and (token? c) (eq? (token-type c) 'prefix))) + children)))) + +(test-end "cst") diff --git a/test/test-rules.scm b/test/test-rules.scm new file mode 100644 index 0000000..dd7afc7 --- /dev/null +++ b/test/test-rules.scm @@ -0,0 +1,104 @@ +;;; Tests for rule modules + +(use-modules (srfi srfi-64) + (srfi srfi-1) + (gulie rule) + (gulie diagnostic) + (gulie rules surface) + (gulie rules comments)) + +(test-begin "rules") + +;;; Surface rules + +(test-group "trailing-whitespace" + (let ((rule (find-rule 'trailing-whitespace))) + (test-assert "rule registered" rule) + + (test-equal "clean line produces no diagnostics" + '() + ((rule-check-proc rule) "f.scm" 1 "(define x 42)" '())) + + (let ((diags ((rule-check-proc rule) "f.scm" 1 "(define x 42) " '()))) + (test-equal "trailing spaces detected" 1 (length diags)) + (test-equal "correct column" + (string-length "(define x 42)") + (diagnostic-column (car diags)))) + + (test-equal "empty line no diagnostic" + '() + ((rule-check-proc rule) "f.scm" 1 "" '())))) + +(test-group "line-length" + (let ((rule (find-rule 'line-length))) + (test-assert "rule 
registered" rule) + + (test-equal "short line ok" + '() + ((rule-check-proc rule) "f.scm" 1 "(define x 42)" '())) + + (let* ((long-line (make-string 81 #\x)) + (diags ((rule-check-proc rule) "f.scm" 1 long-line '()))) + (test-equal "long line detected" 1 (length diags))) + + (let* ((config '((line-length . 120))) + (line (make-string 100 #\x)) + (diags ((rule-check-proc rule) "f.scm" 1 line config))) + (test-equal "respects config" 0 (length diags))))) + +(test-group "no-tabs" + (let ((rule (find-rule 'no-tabs))) + (test-assert "rule registered" rule) + + (test-equal "no tabs ok" + '() + ((rule-check-proc rule) "f.scm" 1 " (define x 1)" '())) + + (let ((diags ((rule-check-proc rule) "f.scm" 1 "\t(define x 1)" '()))) + (test-equal "tab detected" 1 (length diags))))) + +(test-group "blank-lines" + (let ((rule (find-rule 'blank-lines))) + (test-assert "rule registered" rule) + + (test-equal "normal blank ok" + '() + ((rule-check-proc rule) "f.scm" 5 "" + '((max-blank-lines . 2) (%consecutive-blanks . 1)))) + + (let ((diags ((rule-check-proc rule) "f.scm" 5 "" + '((max-blank-lines . 2) (%consecutive-blanks . 
3))))) + (test-equal "excessive blanks detected" 1 (length diags))))) + +;;; Comment rules + +(test-group "comment-semicolons" + (let ((rule (find-rule 'comment-semicolons))) + (test-assert "rule registered" rule) + + (test-equal "double semicolon on own line ok" + '() + ((rule-check-proc rule) "f.scm" 1 " ;; good comment" '())) + + ;; Single semicolon on own line + (let ((diags ((rule-check-proc rule) "f.scm" 1 " ; bad comment" '()))) + (test-equal "single ; on own line flagged" 1 (length diags))))) + +;;; Diagnostic formatting + +(test-group "diagnostic-format" + (let ((d (make-diagnostic "foo.scm" 10 5 'warning 'test-rule "oops" #f))) + (test-equal "format matches expected" + "foo.scm:10:5: warning: test-rule: oops" + (format-diagnostic d)))) + +(test-group "diagnostic-sorting" + (let ((d1 (make-diagnostic "a.scm" 10 0 'warning 'r "m" #f)) + (d2 (make-diagnostic "a.scm" 5 0 'warning 'r "m" #f)) + (d3 (make-diagnostic "b.scm" 1 0 'warning 'r "m" #f))) + (let ((sorted (sort (list d1 d2 d3) diagnostic= (length supps) 2)))) + +(test-end "suppression") diff --git a/test/test-tokenizer.scm b/test/test-tokenizer.scm new file mode 100644 index 0000000..3363b77 --- /dev/null +++ b/test/test-tokenizer.scm @@ -0,0 +1,127 @@ +;;; Tests for (gulie tokenizer) + +(use-modules (srfi srfi-1) + (srfi srfi-64) + (ice-9 rdelim) + (gulie tokenizer)) + +(test-begin "tokenizer") + +;;; Roundtrip invariant — the most critical test + +(test-group "roundtrip" + (define (roundtrip-ok? input) + (let* ((tokens (tokenize input "test.scm")) + (result (string-concatenate (map token-text tokens)))) + (string=? input result))) + + (test-assert "empty input" + (roundtrip-ok? "")) + + (test-assert "simple expression" + (roundtrip-ok? "(define x 42)")) + + (test-assert "nested expressions" + (roundtrip-ok? "(define (foo x)\n (+ x 1))\n")) + + (test-assert "string with escapes" + (roundtrip-ok? "(define s \"hello \\\"world\\\"\")")) + + (test-assert "line comment" + (roundtrip-ok? 
";; a comment\n(define x 1)\n")) + + (test-assert "block comment" + (roundtrip-ok? "#| block\ncomment |#\n(define x 1)")) + + (test-assert "nested block comment" + (roundtrip-ok? "#| outer #| inner |# outer |#")) + + (test-assert "datum comment" + (roundtrip-ok? "#;(skip this) (keep this)")) + + (test-assert "character literals" + (roundtrip-ok? "(list #\\space #\\newline #\\a #\\x41)")) + + (test-assert "keywords" + (roundtrip-ok? "(foo #:bar #:baz)")) + + (test-assert "booleans" + (roundtrip-ok? "(list #t #f #true #false)")) + + (test-assert "vectors" + (roundtrip-ok? "#(1 2 3)")) + + (test-assert "quasiquote and unquote" + (roundtrip-ok? "`(a ,b ,@c)")) + + (test-assert "syntax shorthands" + (roundtrip-ok? "#'x #`x #,x #,@x")) + + (test-assert "dot notation" + (roundtrip-ok? "(a . b)")) + + (test-assert "numbers" + (roundtrip-ok? "(+ 1 2.5 -3 +4 1/3 #xff)")) + + (test-assert "square brackets" + (roundtrip-ok? "(let ([x 1] [y 2]) (+ x y))")) + + (test-assert "multiline with mixed content" + (roundtrip-ok? "(define-module (foo bar) + #:use-module (ice-9 format) + #:export (baz)) + +;;; Section header + +(define (baz x) + ;; body comment + (format #t \"value: ~a\\n\" x)) +")) + + ;; Real-world file roundtrip + (test-assert "real guile source file" + (let ((text (call-with-input-file "refs/guile/module/ice-9/pretty-print.scm" + (lambda (port) + (let lp ((acc '())) + (let ((ch (read-char port))) + (if (eof-object? ch) + (list->string (reverse acc)) + (lp (cons ch acc))))))))) + (roundtrip-ok? 
text)))) + +;;; Token type classification + +(test-group "token-types" + (define (first-token-type input) + (token-type (car (tokenize input "test.scm")))) + + (test-eq "symbol" 'symbol (first-token-type "foo")) + (test-eq "number" 'number (first-token-type "42")) + (test-eq "string" 'string (first-token-type "\"hello\"")) + (test-eq "open-paren" 'open-paren (first-token-type "(")) + (test-eq "close-paren" 'close-paren (first-token-type ")")) + (test-eq "boolean-true" 'boolean (first-token-type "#t")) + (test-eq "boolean-false" 'boolean (first-token-type "#f")) + (test-eq "keyword" 'keyword (first-token-type "#:foo")) + (test-eq "character" 'character (first-token-type "#\\a")) + (test-eq "line-comment" 'line-comment (first-token-type ";; hi")) + (test-eq "prefix-quote" 'prefix (first-token-type "'")) + (test-eq "prefix-quasiquote" 'prefix (first-token-type "`")) + (test-eq "dot" 'dot (first-token-type ". ")) + (test-eq "newline" 'newline (first-token-type "\n"))) + +;;; Source location tracking + +(test-group "source-locations" + (let ((tokens (tokenize "(define\n x\n 42)" "test.scm"))) + (test-equal "first token line" 1 (token-line (car tokens))) + (test-equal "first token col" 0 (token-column (car tokens))) + ;; Find 'x' token + (let ((x-tok (find (lambda (t) (and (eq? (token-type t) 'symbol) + (string=? (token-text t) "x"))) + tokens))) + (test-assert "found x token" x-tok) + (test-equal "x on line 2" 2 (token-line x-tok)) + (test-equal "x at column 2" 2 (token-column x-tok))))) + +(test-end "tokenizer")