diff --git a/src/org/rascalmpl/interpreter/result/ResultFactory.java b/src/org/rascalmpl/interpreter/result/ResultFactory.java index 4607b274ce0..d30ecc94ef5 100644 --- a/src/org/rascalmpl/interpreter/result/ResultFactory.java +++ b/src/org/rascalmpl/interpreter/result/ResultFactory.java @@ -209,6 +209,9 @@ else if (value instanceof OverloadedFunction) { return (OverloadedFunction) value; } } + else if (value instanceof ComposedFunctionResult) { + return (Result) value; + } else { // otherwise this is an abstract ICalleableValue // for which no further operations are defined? diff --git a/src/org/rascalmpl/library/ParseTree.rsc b/src/org/rascalmpl/library/ParseTree.rsc index 0df550b3352..6a8cee8c19e 100644 --- a/src/org/rascalmpl/library/ParseTree.rsc +++ b/src/org/rascalmpl/library/ParseTree.rsc @@ -142,9 +142,12 @@ run-time already uses `.src` while the source code still uses `@\loc`. module ParseTree -extend Type; -extend Message; extend List; +extend Message; +extend Type; + +import Node; +import Set; @synopsis{The Tree data type as produced by the parser.} @description{ @@ -355,9 +358,52 @@ Production associativity(Symbol s, Associativity as, {*Production a, priority(Sy @synopsis{Annotate a parse tree node with a source location.} +@description{ +A generated ((parser)) will produce ((Tree)) instances annotated with @\loc. In this way every node knows its own precise +range in the file, _and_ the file it originally came from. The ((reposition)) function +can simulate the same behavior without erasing other information (keyword parameters) that was produced after parsing. + +It is here, with ((parser)), ((parsers)) and ((reposition)), that location information is given its exact semantics for parse ((Tree))s: +* The URI points to a single file location that is the source (or target) for the current parse tree. +* Right after parsing and repositioning, the URI is the same for all \@loc annotation in a single ((Tree)) instance. 
+However, after tree rewriting this is not the case anymore. +* The `offset` is _zero-based_, inclusive, and is increasing from left to right, as long as the tree has not changed yet. +The offset of the very first character in a file is `0`. +* The `length` is always zero or positive. The length of a character (Unicode codepoint) is always 1, even if it is a control +code like `\n` or `\r`. Even `\t` has length `1`! +* The `begin.line` is _one-based_, inclusive, and increasing from top to bottom, as long as the tree has not changed yet. This follows the +POSIX convention that the first line on a screen or a punch card is labeled with `1`. +* The `begin.column` is _zero-based_, inclusive, and increasing from left-to-right, as long as the tree has not changed yet. +The column is reset to `0` on `\r` and `\n` characters. Zero-based columns are also a POSIX convention. It is sometimes motivated +by the `|` bar cursor being _before_ the first character initially. +* The `end.line` is _one-based_ and inclusive, always greater than or equal to `begin.line`. +* The `end.column` is _zero-based_ and inclusive, and _not_ always greater than or equal to `begin.column`. That's true only if `begin.line == end.line`. +} +@benefits{ +* @\loc can be used to point to the origins of trees, even if rewritten parse trees are composed of values +from different sources, their @\loc value will explain where they come from. This can be used to construct +debugging interfaces for DSLs and PLs, for example. +* @\loc contains offset/length and line/column information to cater for all kinds of different ways that editors work. +* @\loc follows POSIX conventions to help in minimizing off-by-one errors when mapping to editor APIs. +* @\loc indexes work on the basic concept of an "abstract character", namely Unicode codepoints. The character is +what most easily relates to what a user sees as a character on the screen. +} +@pitfalls{ +* @\loc is based on Unicode's abstract characters, a.k.a. codepoints. 
If your editor is byte-based or follows another character +encoding than the 24-bit integer codepoints (e.g. java/javascript 16-bit characters), then you need smart just-in-time bidirectional +conversion methods to make sure selection and highlighting ranges (for example) are always exact. +* If a concrete character ("grapheme") on screen is composed of several abstract characters ("codepoints"), then the @\loc +character metaphor breaks. It depends on how the editor internally handles graphemes and on the way it is connected to Rascal +what the effect for the user is. +* @\loc annotations make ((Tree)) instances _unique_, where otherwise they could be semantically and syntactically equivalent. +Therefore, if you want to test for ((Tree)) (in)equality, always use `t1 := t2` and `t1 !:= t2`. Pattern matching already automatically +ignores @\loc annotations and whitespace and comments. +* Annotated trees are strictly too big for optimal memory usage. Often `@\loc` is the first and only annotation, so it introduces a map for keyword parameters +for every node. Also more nodes are different, impeding optimal reference sharing. If you require long-term storage of many +parse trees it may be useful to strip them of annotations for selected categories of nodes, using ((reposition)). +} anno loc Tree@\loc; - @synopsis{Parse input text (from a string or a location) and return a parse tree.} @description{ * Parse a string and return a parse tree. 
@@ -741,38 +787,30 @@ data Exp = add(Exp, Exp); } java &T<:value implode(type[&T<:value] t, Tree tree); - @synopsis{Annotate a parse tree node with an (error) message.} anno Message Tree@message; - @synopsis{Annotate a parse tree node with a list of (error) messages.} anno set[Message] Tree@messages; - @synopsis{Annotate a parse tree node with a documentation string.} anno str Tree@doc; - @synopsis{Annotate a parse tree node with documentation strings for several locations.} anno map[loc,str] Tree@docs; - - @synopsis{Annotate a parse tree node with the target of a reference.} anno loc Tree@link; - @synopsis{Annotate a parse tree node with multiple targets for a reference.} anno set[loc] Tree@links; - -@synopsis{Annotate the top of the tree with hyperlinks between entities in the tree (or other trees) - -This is similar to link and links annotations, except that you can put it as one set at the top of the tree.} +@synopsis{Annotate the top of the tree with hyperlinks between entities in the tree (or other trees)} +@description{ +This is similar to link and links annotations, except that you can put it as one set at the top of the tree. +} anno rel[loc,loc] Tree@hyperlinks; - @synopsis{Tree search result type for ((treeAt)).} data TreeSearchResult[&T<:Tree] = treeFound(&T tree) | treeNotFound(); @@ -814,3 +852,173 @@ bool isNonTerminalType(Symbol::\parameterized-sort(str _, list[Symbol] _)) = tru bool isNonTerminalType(Symbol::\parameterized-lex(str _, list[Symbol] _)) = true; bool isNonTerminalType(Symbol::\start(Symbol s)) = isNonTerminalType(s); default bool isNonTerminalType(Symbol s) = false; + +private alias NewLineChar = [\n]; +private alias ReturnChar = [\r]; + +@synopsis{Re-compute and overwrite origin locations for all sub-trees of a ((Tree))} +@description{ +This function takes a ((Tree)) and overwrites the old \loc annotations of every subtree +with fresh locations. 
The new locations are as-if the file was parsed again from the unparsed result: +the locations describe the left-to-right order of the sub-trees again exactly, and they are all +from the same top-level location (read "file"). + +Typically, with the default options, this algorithm changes _nothing_ in a ((Tree)) which +has just been produced by the parser. It will rebuild the tree and recompute the exact +locations as they were originally. However, there are many reasons why the (location) fields +in a ((Tree)) are not at all anymore what they were just after parsing: +1. subtrees may have been removed; +2. subtrees may have been relocated to different parts of the tree; +3. subtrees may have been introduced from other source files; +4. subtrees may have been introduced from concrete syntax expressions in Rascal code; +5. other algorithms may have added more keyword fields, for example fully resolved qualified names, +resolved types, error messages or future computations (closures); +6. location fields themselves may have been lost accidentally when rewriting trees with `visit`; +7. etc. + +Some downstream algorithms (e.g. ((HiFiLayoutDiff))) require source locations to be consistent with the current actual position +of every source tree. ((reposition)) provides this contract. Even if one of the above transformations has happened, +after ((reposition)) every node has an accurate position with respect to the hypothetical file contents that would be generated +if the tree is unparsed (written to a string or a file). + +Next to this feature, ((reposition)) may add locations to ((Tree)) nodes which were not annotated +initially by the ((parser)): layout nodes, literal nodes, and sub-lexical nodes. Some algorithms on +parse trees (like formatting), require more detailed location information than provided by the ((parser)): +* markSubLexical=true, ensures the sub-structure of lexicals is annotated as well. 
+* markLayout=true, ensures annotating layout nodes and their sub-structure as well. +* markLit=true, ensures literal trees and case-insensitive literal trees are annotated as well. +* markAmb=true, ensures ambiguity nodes are annotated. NB: the sub-structure of a cluster is always annotated according to the other flags. +* etc. every kind of node has a "mark" flag for completeness' sake. + +Finally, ((reposition)) can be used to remove superfluous locations from ((Tree)) nodes. Every node which +originally had a position will lose it unless ((reposition)) is configured to recompute it. + +By default ((reposition)) simulates the behavior of a ((parser)) exactly. Reparsing the +yield of a tree should always produce the exact same locations as ((reposition)) does. +} +@benefits{ +* Unlike reparsing, ((reposition)) will maintain all other keyword parameters of ((Tree)) nodes, like resolved qualified names and type attributes. +* Can be used to erase superfluous annotations for memory efficiency, while keeping the essential ones. +* The default mark options simulate the behavior of ((parser)) functions. 
+}
+&T <: Tree reposition(
+    &T <: Tree tree,
+    loc file = tree@\loc.top,
+    bool \markStart = true,
+    bool \markSyntax = true,
+    bool \markLexical = true,
+    bool \markSubLexical = true,
+    bool \markRegular = true,
+    bool \markLayout = true,
+    bool \markSubLayout = true,
+    bool \markLit = false,
+    bool \markSubLit = false,
+    bool \markAmb = false,
+    bool \markCycle = false,
+    bool \markChar = false
+    ) {
+    // the cur variables are state shared by the local `rec` overloads while they recurse over the entire tree; only `char` nodes advance them
+    int curOffset = 0;
+    int curLine = 1;
+    int curColumn = 0;
+
+    @synopsis{Check if this rule is configured to be annotated}
+    default bool doAnno(Production _) = false;
+    bool doAnno(prod(\lex(_), _, _)) = markLexical;
+    bool doAnno(prod(\label(_, \lex(_)), _, _)) = markLexical;
+    bool doAnno(prod(\parameterized-lex(_, _), _, _)) = markLexical;
+    bool doAnno(prod(\label(_, \parameterized-lex(_, _)), _, _)) = markLexical;
+    bool doAnno(prod(\layouts(_), _, _)) = markLayout;
+    bool doAnno(prod(\label(_, \layouts(_)), _, _)) = markLayout;
+    bool doAnno(prod(\sort(_), _, _)) = markSyntax;
+    bool doAnno(prod(\label(_, \sort(_)), _, _)) = markSyntax;
+    bool doAnno(prod(\parameterized-sort(_, _), _, _)) = markSyntax;
+    bool doAnno(prod(\label(_, \parameterized-sort(_, _)), _, _)) = markSyntax;
+    bool doAnno(\regular(_)) = markRegular;
+    bool doAnno(prod(\lit(_), _, _)) = markLit;
+    bool doAnno(prod(\cilit(_), _, _)) = markLit;
+    bool doAnno(prod(\start(_), _, _)) = markStart;
+
+    @synopsis{Check if sub-structure of this rule is configured to be annotated}
+    default bool doSub(Production _) = true;
+    bool doSub(prod(\lex(_), _, _)) = \markSubLexical;
+    bool doSub(prod(\label(_, \lex(_)), _, _)) = \markSubLexical;
+    bool doSub(prod(\layouts(_), _, _)) = \markSubLayout;
+    bool doSub(prod(\label(_, \layouts(_)), _, _)) = \markSubLayout;
+    bool doSub(prod(\lit(_), _, _)) = \markSubLit;
+    bool doSub(prod(\cilit(_), _, _)) = \markSubLit;
+
+    // the character nodes drive the actual current position: offset, line and column
+    Tree rec(Tree t:char(int ch), bool _sub) {
+        beginOffset = curOffset;
+        beginLine = curLine;
+        beginColumn = curColumn;
+
+        curOffset += 1;
+        curColumn += 1;
+
+        switch (t) {
+            case ReturnChar _: {
+                curColumn = 0;
+            }
+
+            case NewLineChar _ : {
+                curLine += 1;
+                curColumn = 0;
+            }
+        }
+
+        Tree washCC(Tree x) = x; // workaround for issue #2342
+
+        return markChar
+            ? washCC(char(ch))[@\loc=file(beginOffset, 1, <beginLine, beginColumn>, <curLine, curColumn>)]
+            : washCC(char(ch))
+            ;
+    }
+
+    // cycles take no space: zero length, and begin and end coincide with the current position
+    Tree rec(cycle(Symbol s, int up), bool _sub) = markCycle
+        ? cycle(s, up)[@\loc=file(curOffset, 0, <curLine, curColumn>, <curLine, curColumn>)]
+        : cycle(s, up)
+        ;
+
+    // application nodes always have children to traverse, to get to the individual characters eventually
+    // different types of nodes lead to annotation, or not, depending on the parameters of ((reposition))
+    Tree rec(appl(Production prod, list[Tree] args), bool sub) {
+        beginOffset = curOffset;
+        beginLine = curLine;
+        beginColumn = curColumn;
+
+        // once `sub` is false, going down, we can never turn it on again
+        newArgs = [mergeRec(a, sub && doSub(prod)) | a <- args];
+
+        return (sub && doAnno(prod))
+            ? appl(prod, newArgs)[@\loc=file(beginOffset, curOffset - beginOffset, <beginLine, beginColumn>, <curLine, curColumn>)]
+            : appl(prod, newArgs)
+            ;
+    }
+
+    // ambiguity nodes are simply choices between alternatives which each receive their own positions. NOTE(review): the alternatives are traversed one after another over the shared cur* state, so each alternative starts where the previous one ended - confirm whether the state should be restored per alternative.
+    Tree rec(amb(set[Tree] alts), bool sub) {
+        newAlts = {mergeRec(a, sub) | a <- alts};
+        // inherit the outermost positions from one of the alternatives, since they are all the same by definition.
+        Tree x = getFirstFrom(newAlts);
+        return markAmb && x@\loc?
+            ? amb(newAlts)[@\loc=x@\loc]
+            : amb(newAlts)
+            ;
+    }
+
+    @synopsis{Recurse, but not without recovering all other keyword parameters except "src" a.k.a. @\loc from the original.}
+    Tree mergeRec(Tree t, bool sub) {
+        oldParams = getKeywordParameters(t);
+        t = rec(t, sub);
+        newParams = getKeywordParameters(t);
+        mergedParams = (oldParams - ("src" : |unknown:///|)) + newParams;
+        return setKeywordParameters(t, mergedParams);
+    }
+
+    // we start recursion at the top, not forgetting to merge its other keyword fields
+    return mergeRec(tree, true);
+}
+ 
\ No newline at end of file
diff --git a/src/org/rascalmpl/library/Prelude.java b/src/org/rascalmpl/library/Prelude.java
index f70a0bd2b6f..c66dae2b9d7 100644
--- a/src/org/rascalmpl/library/Prelude.java
+++ b/src/org/rascalmpl/library/Prelude.java
@@ -3105,6 +3105,10 @@ public IValue stringChars(IList lst){
 
         return values.string(chars);
     }
+
+    public IString indent(IString indentation, IString content, IBool indentFirstLine) {
+        return content.indent(indentation, indentFirstLine.getValue());
+    }
 
     public IValue charAt(IString s, IInteger i) throws IndexOutOfBoundsException
     //@doc{charAt -- return the character at position i in string s.}
diff --git a/src/org/rascalmpl/library/String.rsc b/src/org/rascalmpl/library/String.rsc
index 4172068c45a..a9fe8ce8ade 100644
--- a/src/org/rascalmpl/library/String.rsc
+++ b/src/org/rascalmpl/library/String.rsc
@@ -665,3 +665,19 @@ str substitute(str src, map[loc,str] s) {
     order = sort([ k | k <- s ], bool(loc a, loc b) { return a.offset < b.offset; });
     return ( src | subst1(it, x, s[x]) | x <- order );
 }
+
+@synopsis{Indent a block of text}
+@description{
+Every line in `content` will be indented using the characters
+of `indentation`.
+}
+@benefits{
+* This operation executes in constant time, independent of the size of the content
+or the indentation.
+* Indent is the identity function if `indentation == ""`
+}
+@pitfalls{
+* This function works fine if `indentation` is not spaces or tabs; but it does not make much sense. 
+} +@javaClass{org.rascalmpl.library.Prelude} +java str indent(str indentation, str content, bool indentFirstLine=false); \ No newline at end of file diff --git a/src/org/rascalmpl/library/analysis/diff/edits/ExecuteTextEdits.rsc b/src/org/rascalmpl/library/analysis/diff/edits/ExecuteTextEdits.rsc index dbfd814f7b6..f1c41d5c695 100644 --- a/src/org/rascalmpl/library/analysis/diff/edits/ExecuteTextEdits.rsc +++ b/src/org/rascalmpl/library/analysis/diff/edits/ExecuteTextEdits.rsc @@ -38,16 +38,21 @@ void executeFileSystemChange(changed(loc file)) { @synopsis{Edit a file according to the given ((TextEdit)) instructions} void executeFileSystemChange(changed(loc file, list[TextEdit] edits)) { + str content = readFile(file); + + content = executeTextEdits(content, edits); + + writeFile(file.top, content); +} + +str executeTextEdits(str content, list[TextEdit] edits) { assert isSorted(edits, less=bool (TextEdit e1, TextEdit e2) { return e1.range.offset < e2.range.offset; }); - str content = readFile(file); - for (replace(loc range, str repl) <- reverse(edits)) { - assert range.top == file.top; content = "<content[..range.offset]><repl><content[range.offset+range.length..]>"; } - writeFile(file.top, content); + return content; } diff --git a/src/org/rascalmpl/library/analysis/diff/edits/HiFiLayoutDiff.rsc b/src/org/rascalmpl/library/analysis/diff/edits/HiFiLayoutDiff.rsc new file mode 100644 index 00000000000..2bdc6a878bd --- /dev/null +++ b/src/org/rascalmpl/library/analysis/diff/edits/HiFiLayoutDiff.rsc @@ -0,0 +1,166 @@ +@synopsis{Compare equal-modulo-layout parse trees and extract the exact whitespace text edits that will format the original file.} +@description{ +This algorithm is the final component of a declarative high fidelity source code formatting pipeline. + +We have the following assumptions: +1. One original text file exists. +2. One ((ParseTree)) of the original file to be formatted, containing all original layout and source code comments and case-insensitive literals in the exact order of the original text file. 
In other words, +nothing may have happened to the parse tree after parsing. +3. One ((ParseTree)) of the _same_ file, but formatted (using a formatting algorithm like ((Tree2Box)) `|` ((Box2Text)), or string templates, and then re-parsing). This is typically obtained by +translating the tree to a `str` using some formatting tools, and then reparsing the file. +4. Typically comments and specific capitalization of case-insensitive literals have been lost in step 3. +5. We use ((analysis::diff::edits::TextEdits)) to communicate the effect of formatting to the IDE context. +} +@pitfalls{ +* if `originalTree !:= formattedTree` the algorithm will produce junk. It will break the syntactical correctness of the source code and forget source code comments. +* if comments are not marked with `@category("Comment")` in the original grammar, then this algorithm cannot recover them. +} +@benefits{ +* Recovers source code comments which have been lost during earlier steps in the formatting pipeline. This makes losing source code comments an independent concern of a declarative formatter. +* Recovers the original capitalization of case-insensitive literals which may have been lost during earlier steps in the formatting pipeline. +* Can standardize the layout of case insensitive literals to ALLCAPS, all lowercase, or capitalized. Or can leave the literal as it was formatted by an earlier stage. +* Is agnostic towards the design of earlier steps in the formatting pipeline, so long as `formattedTree := originalTree`. This means that +the pipeline may change layout (whitespace and comments and capitalization of case-insensitive literals), but nothing else. 
+}
+module analysis::diff::edits::HiFiLayoutDiff
+
+extend analysis::diff::edits::HiFiTreeDiff;
+import ParseTree; // this should not be necessary because imported by HiFiTreeDiff
+import String; // this should not be necessary because imported by HiFiTreeDiff
+
+@synopsis{Normalization choices for case-insensitive literals.}
+data CaseInsensitivity
+    = toLower()
+    | toUpper()
+    | toCapitalized()
+    | asIs()
+    | asFormatted()
+    ;
+
+@synopsis{Extract TextEdits for the differences in whitespace between two otherwise identical ((ParseTree))s.}
+@description{
+See ((HiFiLayoutDiff)).
+}
+list[TextEdit] layoutDiff(Tree original, Tree formatted, bool recoverComments = true, CaseInsensitivity ci = asIs()) {
+    assert original := formatted : "nothing except layout and keyword fields may be different for layoutDiff to work correctly.";
+
+    @synopsis{rec is the recursive workhorse, doing a pairwise recursion over the original and the formatted tree}
+    @description{
+    We recursively skip over every "equal" pair of nodes, until we detect two different _layout_ nodes. The original location
+    of that node and the new contents of the formatted node is used to construct a replace ((TextEdit)), and
+    optionally the original layout is inspected for source code comments which may have been lost. Literals are skipped
+    explicitly to avoid arbitrary edits for case insensitive literals, and to save some time.
+    }
+
+    // if layout differences are detected, here we produce a `replace` node:
+    list[TextEdit] rec(
+        t:appl(prod(Symbol tS, _, _), list[Tree] tArgs), // layout is not necessarily parsed with the same rules (i.e. comments are lost!)
+        u:appl(prod(Symbol uS, _, _), list[Tree] uArgs))
+        = [replace(t@\loc, recoverComments ? learnComments(t, u) : "<u>") | tArgs != uArgs, "<t>" != "<u>" /* avoid useless edits */]
+        when
+            delabel(tS) is layouts,
+            delabel(uS) is layouts,
+            tArgs != uArgs;
+
+    // matched literal trees generate empty diffs
+    list[TextEdit] rec(
+        appl(prod(lit(_), _, _), list[Tree] _),
+        appl(prod(lit(_), _, _), list[Tree] _))
+        = [];
+
+    // matched case-insensitive literal trees generate empty diffs such that the original is maintained.
+    // however, we also offer some convenience functionality to standardize their formatting right here.
+    list[TextEdit] rec(
+        t:appl(prod(cilit(_), _, _), list[Tree] _),
+        u:appl(prod(cilit(_), _, _), list[Tree] _)) {
+
+        str yield = "<t>";
+
+        switch (ci) {
+            case asIs():
+                return [];
+            case asFormatted():
+                return [replace(t@\loc, result) | str result := "<u>", result != yield];
+            case toUpper():
+                return [replace(t@\loc, result) | str result := toUpperCase(yield), result != yield];
+            case toLower():
+                return [replace(t@\loc, result) | str result := toLowerCase(yield), result != yield];
+            case toCapitalized():
+                return [replace(t@\loc, result) | str result := capitalize(yield), result != yield];
+            default:
+                throw "unexpected option: <ci>";
+        }
+    }
+
+    list[TextEdit] rec(
+        char(_),
+        char(_)
+    ) = [];
+
+    list[TextEdit] rec(
+        cycle(Symbol _, int _),
+        cycle(_, _)
+    ) = [];
+
+    // recurse through the entire parse tree to collect layout edits. NOTE(review): no overload here matches `amb` nodes; presumably both trees are unambiguous - confirm.
+    default list[TextEdit] rec(
+        Tree t:appl(Production p, list[Tree] argsA),
+        appl(p /* must be the same by the above assert */, list[Tree] argsB))
+        = [*rec(a, b) | <a, b> <- zip2(argsA, argsB)];
+
+    // first add required locations to layout nodes
+    original = reposition(original, markLit=true, markLayout=true, markSubLayout=true);
+
+    return rec(original, formatted);
+}
+
+@synopsis{Make sure the new layout still contains all the source code comments of the original layout}
+@description{
+This algorithm uses the @category("Comment") tag to detect source code comments inside layout substrings. 
If the original
+layout contains comments, we re-introduce the comments at the expected level of indentation. New comments present in the
+replacement are kept and will overwrite any original comments.
+
+This trick is complicated by the syntax of multiline comments and single line comments that have
+to end with a newline.
+}
+@benefits{
+* if comments are kept and formatted by tools like Tree2Box, then this algorithm does not overwrite these.
+* if comments were completely lost, then this algorithm _always_ puts them back (under assumptions of ((layoutDiff)))
+* recovered comments are indented according to the indentation discovered in the _formatted_ replacement tree.
+}
+@pitfalls{
+* if comments are not marked with `@category("Comment")` in the original grammar, then this algorithm recovers nothing.
+}
+private str learnComments(Tree original, Tree replacement) {
+    originalComments = ["<c>" | /c:appl(prod(_,_,{\tag("category"(/^[Cc]omment$/)), *_}), _) := original];
+
+    if (originalComments == []) {
+        // if the original did not contain comments, stick with the replacements
+        return "<replacement>";
+    }
+
+    replacementComments = ["<c>" | /c:appl(prod(_,_,{\tag("category"(/^[Cc]omment$/)), *_}), _) := replacement];
+
+    if (replacementComments != []) {
+        // if the replacement contains comments, we assume they've been accurately retained by a previous stage (like Tree2Box):
+        return "<replacement>";
+    }
+
+    // At this point, we know that: (a) comments are not present in the replacement and (b) they used to be there in the original.
+    // So the old comments are going to be the new output. However, we want to learn indentation from the replacement.
+
+    // Drop the last newline of single-line comments, because we don't want two newlines in the output for every comment:
+    str dropEndNl(str line:/^.*\n$/) = (line[..-1]);
+    default str dropEndNl(str line) = line;
+
+    // the first line of the replacement is the indentation to use.
+    str replacementIndent = split("\n", "<replacement>")[0];
+
+    // trimming each line makes sure we forget about the original indentation, and drop accidental spaces after comment lines. NOTE(review): the template below was reconstructed from a copy whose `<...>` interpolations had been stripped; confirm the leading and trailing strings and the loop body against the upstream file.
+    return "<replacement>" + indent(replacementIndent,
+        "<for (c <- originalComments, str line <- split("\n", dropEndNl(c))) {><trim(line)>
+        '<}>"[..-1], indentFirstLine=false) + "<replacement>";
+}
+
+private Symbol delabel(label(_, Symbol t)) = t;
+private default Symbol delabel(Symbol x) = x;
\ No newline at end of file
diff --git a/src/org/rascalmpl/library/analysis/diff/edits/HiFiTreeDiff.rsc b/src/org/rascalmpl/library/analysis/diff/edits/HiFiTreeDiff.rsc
new file mode 100644
index 00000000000..2fcb2d3b3d5
--- /dev/null
+++ b/src/org/rascalmpl/library/analysis/diff/edits/HiFiTreeDiff.rsc
@@ -0,0 +1,438 @@
+@license{
+Copyright (c) 2018-2023, NWO-I Centrum Wiskunde & Informatica
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +} +@synopsis{Infer ((TextEdit)) from the differences between two parse ((ParseTree::Tree))s} +@description{ +This module provides an essential building block for creating high-fidelity source-to-source code transformations. +It is common for industrial use cases of source-to-source transformation to extract +a list of text edits programmatically using parse tree pattern matching. This way the +changes are made on the textual level, with less introduction of noise and fewer removals +of valuable layout (indentation) and source code comments. + +The construction of such high-fidelity edit lists can be rather involved because it tangles +and scatters a number of concerns: +1. syntax-directed pattern matching +2. string substitution; construction of the rewritten text + * retention of layout and in particular indentation + * retention of source code comments + * retention of specific case-insensitive keyword style + * syntactic correctness of the result; especially in relation to list separators there are many corner-cases to think of + +On the other hand, ParseTree to ParseTree rewrites are much easier to write and get correct. +They are "syntax directed" via the shape of the tree that follows the grammar of the language. +Some if not all of the above aspects are tackled by the rewriting mechanism with concrete patterns. +Especially the corner cases w.r.t. list separators are all handled by the rewriting mechanisms. 
+Also the rules are in "concrete syntax", on both the matching and the substitution side. So they are +readable for all who know the object language. The rules guarantee syntactic correctness of the +rewritten source code. However, rewrite rules do quite some noisy damage to the layout, indentation +and comments, of the result. + +With this module we bring these two modalities of source-to-source transformations together: +1. The language engineer uses concrete syntax rewrite rules to derive a new ParseTree from the original; +2. We run ((treeDiff)) to obtain a set of minimal text edits; +3. We apply the text edits to the editor contents or the file system. +} +@benefits{ +* Because the derived text edits change fewer characters, the end result is more "hifi" than simply +unparsing the rewritten ParseTree. More comments are retained and more indentation is kept the same. More +case-insensitive keywords retain their original shape. +* At the same time the rewrite rules are easier to maintain as they remain "syntax directed". +* Changes to the grammar will be picked up when checking all source and target patterns. +* The diff algorithm uses cross-cutting information from the parse tree (what is layout and what not, + what is case-insensitive, etc.) which would otherwise have to be managed by the language engineer in _every rewrite rule_. +* The diff algorithm understands what indentation is and brings new sub-trees to the original level +of indentation (same as the sub-trees they are replacing) +* Typically the algorithm's run-time is linear in the size of the tree, or better. Same for memory usage. +} +@pitfalls{ +* ((treeDiff)) only works under the assumption that the second tree was derived from the first +by applying concrete syntax rewrite rules in Rascal. If there is no origin relation between the two +then its heuristics will not work. 
The algorithm could degenerate to substituting the entire file, +or worse it could degenerate to an exponential search for commonalities in long lists. +* ((treeDiff))'s efficiency is predicated on the two trees being derived from each other in main memory of the currently running JVM. +This way both trees will share pointers where they are the same, which leads to very efficient equality +testing. If the trees are first independently serialized to disk and then deserialized again, and then ((treeDiff)) is called, +this optimization is not present and the algorithm will perform (very) poorly. +* Substitution patterns should be formatted as best as possible. The algorithm will not infer +spacing or relative indentation inside of the substituted subtree. It will only infer indentation +for the entire subtree. Another way of resolving this is using a code formatter on the substituted patterns. +} +module analysis::diff::edits::HiFiTreeDiff + +extend analysis::diff::edits::TextEdits; + +import List; +import Location; +import ParseTree; +import String; +import util::Math; + +@synopsis{Detects minimal differences between parse trees and makes them explicit as ((TextEdit)) instructions.} +@description{ +This is a "diff" algorithm of two parse trees to generate a ((TextEdit)) script that applies the differences on +the textual level, _with minimal collateral damage in whitespace_. This is why it is called "HiFi": minimal unnecessary +noise introduction to the original file. It also tries to conserve source code comments; where still possible. + +The resulting ((TextEdit))s are an intermediate representation for making changes in source code text files. +They can be executed independently via ((ExecuteTextEdits)), or interactively via ((IDEServices)), or LanguageServer features. + +This top-down diff algorithm takes two arguments: +1. an _original_ parse tree for a text file, +2. 
and a _derived_ parse tree that is mostly equal to the original but has pieces of it substituted or rewritten. + +From the tree node differences between these two trees, ((TextEdit))s are derived such that: +* when the edited source text is parsed again, the resulting tree would match the derived tree. +However, the parsed tree could be different from the derived tree in terms of whitespace, indentation and case-insensitive literals (see below). +* when tree nodes (grammar rules) are equal, smaller edits are searched by pair-wise comparison of the children +* differences between respective layout or (case-insensitive) literal nodes are always ignored +* when lists have changed, careful editing of possible separators ensures syntactic correctness +* when new sub-trees are inserted, the replacement will be at the same indentation level as the original. +* when case-insensitive literals have been changed under a grammar rule that remained the same, no edits are produced. + +The function comes in handy when we use Rascal to rewrite parse trees, and then need to communicate the effect +back to the IDE (for example using ((util::IDEServices)) or `util::LanguageServer` interfaces). We use +((ExecuteTextEdits)) to _test_ the effect of ((TextEdits)) while developing a source-to-source transformation. +} +@benefits{ +* This function allows the language engineer to work in terms of abstract and concrete syntax trees while manipulating source text. The +((TextEdit))s intermediate representation bridges the gap to the minute details of IDE interaction such as "undo" and "preview" features. +* Text editing is fraught with details of whitespace, comments, list separators; all of which are handled here by +the exactness of syntactic and semantic knowledge of the parse trees. +* Where possible the algorithm also retains the capitalization of case-insensitive literals.
+* The algorithm retrieves and retains indentation levels from the original tree, even if sub-trees in the +derived tree have mangled indentation. This allows us to ignore the indentation concern while thinking of rewrite +rules for source-to-source transformation, and focus on the semantic effect. +* The algorithm inherits source code comments from the original, wherever sub-trees of the original and the +rewritten tree still line up. +} +@pitfalls{ +* If the first argument is not an original parse tree, then basic assumptions of the algorithm fail and it may produce erroneous text edits. +* If the second argument is not derived from the original, then the algorithm will produce a single text edit to replace the entire source text. +* If the second argument was not produced from the first in the same JVM memory, it will not share many pointers to equal sub-trees +and the performance of the algorithm will degenerate quickly. +* If the parse tree of the original does not reflect the current state of the text in the file, then the generated text edits will do harm. +* If the original tree is not annotated with source locations, the algorithm fails. +* Both parse trees must be type correct, e.g. the number of symbols in a production rule, must be equal to the number of elements of the argument list of ((appl)). +* This algorithm does not work with ambiguous (sub)trees. +* When large sub-trees or sub-lists are moved to other parts of the tree, comment inheritance is not possible anymore. +} +@examples{ +If we rewrite parse trees, this can be done with concrete syntax matching.
+The following example swaps the if-branch with the else-branch in Pico: + +```rascal-shell +import lang::pico::\syntax::Main; +import IO; +import analysis::diff::edits::ExecuteTextEdits; +import analysis::diff::edits::TextEdits; +import analysis::diff::edits::HiFiTreeDiff; +// an example Pico program: +writeFile(|tmp:///example.pico|, + "begin + ' declare + ' a : natural, + ' b : natural; + ' if a then + ' a := b + ' else + ' b := a + ' fi + 'end"); +import ParseTree; +original = parse(#start[Program], |tmp:///example.pico|); +// match and replace all conditionals +rewritten = visit(original) { + case (Statement) `if then <{Statement ";"}* ifBranch> else <{Statement ";"}* elseBranch> fi` + => (Statement) `if then + ' <{Statement ";"}* elseBranch> + 'else + ' <{Statement ";"}* ifBranch> + 'fi` +} +// Check the result as a string. It worked, but we see some collatoral damage in whitespace (indentation). +"" +// Now derive text edits from the two parse trees: +edits = treeDiff(original, rewritten); +// Wrap them in a single document edit +edit = changed(original@\loc.top, edits); +// Apply the document edit on disk: +executeDocumentEdits([edit]); +// and when we read the result back, we see the transformation succeeded, and indentation was not lost: +readFile(tmp://example.pico|); +// It's also possible to directly rewrite the original string, for debugging purposes: +executeTextEdits("", treeDiff(original, rewritten)) +``` +} +// equal trees generate empty diffs (note this already ignores whitespace differences because non-linear matching ignores layout nodes) +list[TextEdit] treeDiff(Tree a, a) = []; + +// skip production labels of original rules when diffing, to be able to focus on the Symbol constructor for downstream case-distinction +list[TextEdit] treeDiff( + Tree t:appl(prod(label(_, Symbol s), list[Symbol] syms, set[Attr] attrs), list[Tree] args), + Tree u) + = treeDiff(appl(prod(s, syms, attrs), args)[@\loc=t@\loc?|bla:///|], u); + +// skip production 
labels of original rules when diffing, to be able to focus on the Symbol constructor for downstream case-distinction +list[TextEdit] treeDiff( + Tree t, + Tree u:appl(prod(label(_, Symbol s), list[Symbol] syms, set[Attr] attrs), list[Tree] args)) + = treeDiff(t, appl(prod(s, syms, attrs), args)[@\loc=u@\loc?|bla:///|]); + +// matched layout trees generate empty diffs such that the original is maintained +list[TextEdit] treeDiff( + appl(prod(layouts(_), _, _), list[Tree] _), + appl(prod(layouts(_), _, _), list[Tree] _)) + = []; + +// matched literal trees generate empty diffs +list[TextEdit] treeDiff( + appl(prod(lit(str l), _, _), list[Tree] _), + appl(prod(lit(l) , _, _), list[Tree] _)) + = []; + +// matched case-insensitive literal trees generate empty diffs such that the original is maintained +list[TextEdit] treeDiff( + appl(prod(cilit(str l), _, _), list[Tree] _), + appl(prod(cilit(l) , _, _), list[Tree] _)) + = []; + +// different lexicals generate small diffs even if the parent is equal. This avoids extremely small edits within the boundaries of single identifiers. +list[TextEdit] treeDiff( + t:appl(prod(lex(str l), _, _), list[Tree] _), + r:appl(prod(lex(l) , _, _), list[Tree] _)) + = [replace(t@\loc, learnIndentation(t@\loc, "", ""))] + when t != r; + +// When the productions are different, we've found an edit, and there is no need to recurse deeper. +default list[TextEdit] treeDiff( + t:appl(Production p:prod(_,_,_), list[Tree] _), + r:appl(Production q:!p , list[Tree] _)) + = [replace(t@\loc, learnIndentation(t@\loc, "", ""))]; + +// If list production are the same, then the element lists can still be of different length +// and we switch to listDiff which has different heuristics than normal trees to detect large identical sublists. 
+list[TextEdit] treeDiff( + Tree t:appl(Production p:regular(Symbol reg), list[Tree] aElems), + appl(p, list[Tree] bElems)) + = listDiff(t@\loc, seps(reg), aElems, bElems); + +// When the productions are equal, but the children may be different, we dig deeper for differences +default list[TextEdit] treeDiff(t:appl(Production p, list[Tree] argsA), appl(p, list[Tree] argsB)) + = [*treeDiff(a, b) | <- zip2(argsA, argsB)]; + +@synopsis{decide how many separators we have} +private int seps(\iter-seps(_, list[Symbol] s)) = size(s); +private int seps(\iter-star-seps(_, list[Symbol] s)) = size(s); +private default int seps(Symbol _) = 0; + +@synopsis{List diff finds minimal differences between the elements of two lists.} +@description{ +This algorithm uses heuristics to avoid searching for the largest common sublist all too often. +Also it minimized the sublists that largest common sublist is executed on. + +1. Since many patches to parse tree lists typically only change a prefix or a postfix, and we +can detect this quickly, we first extract patches for those instances. +2. It is also fast and easy to detect unchanged prefixes and postfixes, so by focusing +on the changes parts in the middle we generate more instances of case 1. +3. Another simple and quick case is when simply all elements are different (the prefix==the list==the postfix) +3. What we are left with is either an empty list and we are done, or a more complex situation +where we apply the "findEqualSubList" algorithm, which splits the list in three parts: + * two unequal prefixes + * two equal sublists in the middle + * two unequal postfixes +4. the algorithm then concatenates the diffs by recursing to step 1 on the prefixes and the diffs by recursing to step 1. on the postfixes +5. 
two empty lists terminate the recursion, +} +list[TextEdit] listDiff(loc span, int seps, list[Tree] originals, list[Tree] replacements) { + edits = []; + + // this algorithm isolates commonalities between the two lists + // by handling different special cases. It continues always with + // what is left to be different. By maximizing commonalities, + // the edits are minimized. Note that we float on source location parameters + // not only for the edit locations but also for sub-tree identity. + + = trimEqualElements(span, originals, replacements); + + = commonSpecialCases(span, seps, originals, replacements); + edits += specialEdits; + + equalSubList = findEqualSubList(originals, replacements); + + // by using the (or "a") largest common sublist as a pivot to divide-and-conquer + // to the left and right of it, we minimize the number of necessary + // edit actions for the entire list. + if (equalSubList != [], + [*preO, *equalSubList, *postO] := originals, + [*preR, *equalSubList, *postR] := replacements) { + // TODO: what about the separators? + // we align the prefixes and the postfixes and + // continue recursively. 
+ + return edits + + listDiff(beginCover(span, preO), seps, preO, preR) + + listDiff(endCover(span, postO), seps, postO, postR) + ; + } + else if (originals == [], replacements == []) { + return edits; + } + else { + // here we know there are no common elements anymore, only a common amount of different elements + common = min([size(originals), size(replacements)]); + + return edits + // first the minimal length pairwise replacements, essential for finding accidental commonalities + + [*treeDiff(a, b) | <- zip2(originals[..common], replacements[..common])] + // then we either remove the tail that became shorter: + + [replace(cover([after(last@\loc), cover(originals[common+1..])]), "") | size(originals) > size(replacements), [*_, last] := originals[..common]] + // or we add new elements to the end, while inheriting indentation from the originals: + + [replace(after(span), learnIndentation(span, yield(replacements[common..]), yield(originals))) | size(originals) < size(replacements)] + ; + } +} + +@synopsis{Finds the largest sublist that occurs in both lists} +@description{ +Using list matching and backtracking, this algorithm detects which common +sublist is the largest. It assumes ((trimEqualElements)) has happened already, +and thus there are interesting differences left, even if we remove any equal +sublist. + +Note that this is not a general algorithm for Largest Common Subsequence (LCS), since it +uses particular properties of the relation between the original and the replacement list: +* New elements are never equal to old elements (due to source locations) +* Equal prefixes and postfixes may be assumed to be maximal sublists as well (see above). +* Candidate equal sublists always have consecutive source locations from the origin. 
+} +list[Tree] findEqualSubList([*Tree sub], [*_, *sub, *_]) = sub; +list[Tree] findEqualSubList([*_, *Tree sub, *_], [*sub]) = sub; +list[Tree] findEqualSubList([*_, p, *Tree sub, q, *_], [*_, !p, *sub, !q, *_]) = sub; +default list[Tree] findEqualSubList(list[Tree] _orig, list[Tree] _repl) = []; + +@synopsis{trips equal elements from the front and the back of both lists, if any.} +tuple[loc, list[Tree], list[Tree]] trimEqualElements(loc span, + [Tree a, *Tree aPostfix], [ a, *Tree bPostfix]) + = trimEqualElements(endCover(span, aPostfix), aPostfix, bPostfix); + +tuple[loc, list[Tree], list[Tree]] trimEqualElements(loc span, + [*Tree aPrefix, Tree a], [*Tree bPrefix, a]) + = trimEqualElements(beginCover(span, aPrefix), aPrefix, bPrefix); + +default tuple[loc, list[Tree], list[Tree]] trimEqualElements(loc span, + list[Tree] a, list[Tree] b) + = ; + +// only one element removed in front, then we are done +tuple[list[TextEdit], list[Tree], list[Tree]] commonSpecialCases(loc span, 0, [Tree a, *Tree tail], [*tail]) + = <[replace(a@\loc, "")], [], []>; + +// only one element removed in front, plus 1 separator, then we are done because everything is the same +tuple[list[TextEdit], list[Tree], list[Tree]] commonSpecialCases(loc span, 1, + [Tree a, Tree _sep, Tree tHead, *Tree tail], [tHead, *tail]) + = <[replace(fromUntil(a, tHead), "")], [], []>; + +// only one element removed in front, plus 1 separator, then we are done because everything is the same +tuple[list[TextEdit], list[Tree], list[Tree]] commonSpecialCases(loc span, 3, + [Tree a, Tree _l1, Tree _sep, Tree _l2, Tree tHead, *Tree tail], [tHead, *tail]) + = <[replace(fromUntil(a, tHead), "")], [], []>; + +// singleton replacement +tuple[list[TextEdit], list[Tree], list[Tree]] commonSpecialCases(loc span, int _, + [Tree a], [Tree b]) + = ; + +default tuple[list[TextEdit], list[Tree], list[Tree]] commonSpecialCases(loc span, int _, list[Tree] a, list[Tree] b) + = <[], a, b>; + +@synopsis{convenience overload for 
shorter code} +private loc fromUntil(Tree from, Tree until) = fromUntil(from@\loc, until@\loc); + +@synopsis{Compute location span that is common between an element and a succeeding element} +@description{ +The resulting loc is including the `from` but excluding the `until`. It goes right +up to `until`. +```ascii-art + [from] gap [until] + <---------> +```` +} +private loc fromUntil(loc from, loc until) = from.top(from.offset, until.offset - from.offset); +private int end(loc src) = src.offset + src.length; + +private loc after(loc src) = src(end(src), 0); + +private loc endCover(loc span, []) = span(span.offset + span.length, 0); +private loc endCover(loc span, [Tree x]) = x@\loc; +private default loc endCover(loc span, list[Tree] l) = cover(l); + +private loc beginCover(loc span, []) = span(span.offset, 0); +private loc beginCover(loc span, [Tree x]) = x@\loc; +private default loc beginCover(loc span, list[Tree] l) = cover(l); + +private loc cover(list[Tree] elems:[_, *_]) = cover([e@\loc | Tree e <- elems, e@\loc?]); + +@synopsis{yield a consecutive list of trees} +private str yield(list[Tree] elems) = "<}>"; + +@synopsis{Make sure the subtitution is at least as far indented as the original} +@description{ +This algorithm ignores the first line, since the first line is always preceeded by the layout of a parent node. + +Then it measures the depth of indentation of every line in the original, and takes the minimum. +That minimum indentation is stripped off every line that already has that much indentation in the replacement, +and then _all_ lines are re-indented with the discovered minimum. +} +private str learnIndentation(loc span, str replacement, str original) { + list[str] indents(str text) = [indent | /^[^\ \t]/ <- split("\n", text)]; + + origIndents = indents(original); + replLines = split("\n", replacement); + + if (replLines == []) { + return ""; + } + + minIndent = ""; + + if ([_] := origIndents) { + // only one line. 
have to invent indentation from span + minIndent = " <}>"; + } + else { + // we skip the first line for learning indentation, because that one would typically be embedded in a previous line. + minIndent = sort(origIndents[1..])[0]? ""; + } + + // we remove the leading spaces _up to_ the minimal indentation of the original, + // keep the rest of the indentation from the replacement (if any is left), and then the actual content. + // that entire multiline result is then lazily indented with the minimal indentation we learned from the original. + return indent( + minIndent, + "$/ <- replLines) {> + '<}>"[..-1]) + ; +} diff --git a/src/org/rascalmpl/library/lang/box/syntax/Box.rsc b/src/org/rascalmpl/library/lang/box/syntax/Box.rsc index 23100d75818..9e43f84eda1 100644 --- a/src/org/rascalmpl/library/lang/box/syntax/Box.rsc +++ b/src/org/rascalmpl/library/lang/box/syntax/Box.rsc @@ -37,7 +37,7 @@ set on every `I` Box according to the current preferences of the user. @pitfalls{ * `U(boxes)` is rendered as `H(boxes)` if it's the outermost Box. } -data Box(int hs=1, int vs=0, int is=2) +data Box(int hs=1, int vs=0, int is=4) = H(list[Box] boxes) | V(list[Box] boxes) | HOV(list[Box] boxes) diff --git a/src/org/rascalmpl/library/lang/box/util/Box2Text.rsc b/src/org/rascalmpl/library/lang/box/util/Box2Text.rsc index 265c2413442..1c4a64fe043 100644 --- a/src/org/rascalmpl/library/lang/box/util/Box2Text.rsc +++ b/src/org/rascalmpl/library/lang/box/util/Box2Text.rsc @@ -120,7 +120,7 @@ between horizontal and vertical for HOV boxes. 
data Options = options( int hs = 1, int vs = 0, - int is = 2, + int is = 4, int maxWidth=80, int wrapAfter=70 ); diff --git a/src/org/rascalmpl/library/lang/box/util/Tree2Box.rsc b/src/org/rascalmpl/library/lang/box/util/Tree2Box.rsc index 0c0f3f13596..6aff94d64e0 100644 --- a/src/org/rascalmpl/library/lang/box/util/Tree2Box.rsc +++ b/src/org/rascalmpl/library/lang/box/util/Tree2Box.rsc @@ -67,6 +67,7 @@ module lang::box::util::Tree2Box import ParseTree; import lang::box::\syntax::Box; import String; +import IO; @synopsis{Configuration options for toBox} data FormatOptions = formatOptions( @@ -77,12 +78,13 @@ data FormatOptions = formatOptions( data CaseInsensitivity = toLower() | toUpper() + | toCapitalized() | asIs() ; @synopsis{This is the generic default formatter} @description{ -This generic formatter is to be overridden by someone constructig a formatter tools +This generic formatter is to be overridden by someone constructing a formatter tools for a specific language. The goal is that this `toBox` default rule maps syntax trees to plausible Box expressions, and that only a minimal amount of specialization by the user is necessary. 
@@ -117,8 +119,59 @@ default Box toBox(t:appl(Production p, list[Tree] args), FO opts = fo()) { case : return H([toBox(e, opts=opts) | e <- elements], hs=0); + // comma's are usually for parameters separation case : - return HV([G([toBox(e, opts=opts) | e <- elements], gs=4, hs=0, op=H)], hs=1); + return HOV([ + H([ + toBox(elements[i], opts=opts), + *[H([toBox(elements[i+2], opts=opts)], hs=1) | i + 2 < size(elements)] + ], hs=0) | int i <- [0,4..size(elements)] + ]); + + // comma's are usually for parameters separation + case : + return HOV([ + H([ + toBox(elements[i], opts=opts), + *[H([toBox(elements[i+2], opts=opts)], hs=1) | i + 2 < size(elements)] + ], hs=0) | int i <- [0,4..size(elements)] + ]); + + // semi-colons are usually for statement separation + case : + return V([ + H([ + toBox(elements[i], opts=opts), + *[H([toBox(elements[i+2], opts=opts)], hs=1) | i + 2 < size(elements)] + ], hs=0) | int i <- [0,4..size(elements)] + ]); + + // optional semi-colons also happen often + case : + return V([ + H([ + toBox(elements[i], opts=opts), + *[H([toBox(elements[i+2], opts=opts)], hs=1) | i + 2 < size(elements)] + ], hs=0) | int i <- [0,4..size(elements)] + ]); + + // semi-colons are usually for parameters separation + case : + return V([ + H([ + toBox(elements[i], opts=opts), + *[H([toBox(elements[i+2], opts=opts)], hs=1) | i + 2 < size(elements)] + ], hs=0) | int i <- [0,4..size(elements)] + ]); + + // optional semi-colons also happen often + case : + return V([ + H([ + toBox(elements[i], opts=opts), + *[H([toBox(elements[i+2], opts=opts)], hs=1) | i + 2 < size(elements)] + ], hs=0) | int i <- [0,4..size(elements)] + ]); case : return V([G([toBox(e, opts=opts) | e <- elements], gs=4, hs=0, op=H)], hs=1); @@ -133,17 +186,13 @@ default Box toBox(t:appl(Production p, list[Tree] args), FO opts = fo()) { case : return V([G([toBox(e, opts=opts) | e <- elements], gs=2, hs=0, op=H)], hs=0); - // if comments are found in layout trees, then we include them here - // and 
splice them into our context. If the deep match does not find any - // comments, then layout positions are reduced to U([]) which dissappears - // by splicing the empty list. + // We remove all layout node positions to make the number of children predictable + // Comments can be recovered by `layoutDiff`. By not recursing into layout + // positions `toBox` becomes more than twice as fast. case : - return U([toBox(u, opts=opts) | /u:appl(prod(_, _, {*_,\tag("category"(/^[Cc]omment$/))}), _) <- content]); + return NULL(); - // single line comments are special, since they have the newline in a literal - // we must guarantee that the formatter will print the newline, but we don't - // want an additional newline due to the formatter. We do remove any unnecessary - // spaces + // if we are given a comment node, then we can format it here for use by layoutDiff case : return V([ H([toBox(elements[0], opts=opts), @@ -151,6 +200,7 @@ default Box toBox(t:appl(Production p, list[Tree] args), FO opts = fo()) { ], hs=1) ]); + // if we are given a comment node, then we can pretty print it here for use by layoutDiff case : return V([ H([toBox(elements[0], opts=opts), @@ -173,31 +223,28 @@ default Box toBox(t:appl(Production p, list[Tree] args), FO opts = fo()) { // Those kinds of structures appear again and again as many languages share inspiration // from their pre-decessors. Watching out not to loose any comments... - // we flatten binary operators into their context for better flow of deeply nested - // operators. The effect will be somewhat like a separated list of expressions where - // the operators are the separators. 
case : - return U([toBox(e) | e <- elements]); + return HOV([toBox(elements[0], opts=opts), H([toBox(e, opts=opts) | e <- elements[1..]])]); // postfix operators stick case : - return H([toBox(e) | e <- elements], hs=0); + return H([toBox(e, opts=opts) | e <- elements], hs=0); // prefix operators stick case : - return H([toBox(e) | e <- elements], hs=0); + return H([toBox(e, opts=opts) | e <- elements], hs=0); // brackets stick case : - return H([toBox(e) | e <- elements], hs=0); + return H([toBox(e, opts=opts) | e <- elements], hs=0); // if the sort name is statement-like and the structure block-like, we go for // vertical with indentation // program: "begin" Declarations decls {Statement ";"}* body "end" ; - case : + case : return V([ - toBox(elements[0], opts=opts), - I([V([toBox(e, opts=opts) | e <- elements[1..-1]])]), + H([*[toBox(p, opts=opts) | Tree p <- elements[0..size(pre)]], toBox(elements[size(pre)], opts=opts)]), + I([V([toBox(e, opts=opts) | Tree e <- elements[size(pre)+1..-1]])]), toBox(elements[-1], opts=opts) ]); } @@ -234,6 +281,7 @@ private FO fo() = formatOptions(); @synopsis{Implements normalization of case-insensitive literals} private str ci(str word, toLower()) = toLowerCase(word); private str ci(str word, toUpper()) = toUpperCase(word); +private str ci(str word, toCapitalized()) = capitalize(word); private str ci(str word, asIs()) = word; @synopsis{Split a text by the supported whitespace characters} diff --git a/src/org/rascalmpl/library/lang/pico/HiFiDemo.rsc b/src/org/rascalmpl/library/lang/pico/HiFiDemo.rsc new file mode 100644 index 00000000000..3decb0f5c00 --- /dev/null +++ b/src/org/rascalmpl/library/lang/pico/HiFiDemo.rsc @@ -0,0 +1,48 @@ +@synopsis{Demonstrates HiFi source-to-source transformations through concrete syntax rewrites and text edits.} +module lang::pico::HiFiDemo + +import lang::pico::\syntax::Main; +import IO; +import ParseTree; +import analysis::diff::edits::HiFiTreeDiff; +import 
analysis::diff::edits::ExecuteTextEdits; + +@synopsis{Blindly swaps the branches of all the conditionals in a program} +@description{ +This rule is syntactically correct and has a clear semantics. The +layout of the resulting if-then-else-fi statement is also clear. +} +start[Program] flipConditionals(start[Program] program) = visit(program) { + case (Statement) `if then + ' <{Statement ";"}* ifBranch> + 'else + ' <{Statement ";"}* elseBranch> + 'fi` => + (Statement) `if then + ' <{Statement ";"}* elseBranch> + 'else + ' <{Statement ";"}* ifBranch> + 'fi` +}; + +void main() { + t = parse(#start[Program], |project://rascal/src/org/rascalmpl/library/lang/pico/examples/flip.pico|); + println("The original: + '"); + + u = flipConditionals(t); + println("Branches swapped, comments and indentation lost: + '"); + + edits = treeDiff(t, u); + println("Smaller text edits:"); + iprintln(edits); + + newContent = executeTextEdits("", edits); + println("Better output after executeTextEdits: + '"); + + newU = parse(#start[Program], newContent); + + assert u := newU : "the rewritten tree matches the newly parsed"; +} diff --git a/src/org/rascalmpl/library/lang/pico/examples/flip.pico b/src/org/rascalmpl/library/lang/pico/examples/flip.pico new file mode 100644 index 00000000000..63f58c62b40 --- /dev/null +++ b/src/org/rascalmpl/library/lang/pico/examples/flip.pico @@ -0,0 +1,21 @@ +begin + declare + a : natural, + b : natural; + a := 0; + b := 1; + if a then + % comment 1 % + b := a; + z := z + else + % comment 2 % + a := b; + if b then + z := a + else + z := b + fi; + z := z + fi +end \ No newline at end of file diff --git a/src/org/rascalmpl/library/lang/pico/format/Formatting.rsc b/src/org/rascalmpl/library/lang/pico/format/Formatting.rsc new file mode 100644 index 00000000000..20941f8a5f8 --- /dev/null +++ b/src/org/rascalmpl/library/lang/pico/format/Formatting.rsc @@ -0,0 +1,64 @@ +@synopsis{Demonstrates ((Tree2Box)), ((Box2Text)) and ((HiFiLayoutDiff)) for constructing a 
declarative and HiFi Pico formatting pipeline} +@description{ +Using four generic or generated, "language parametric", building blocks we construct a Pico formatting pipeline: + +* ((ParseTree)) is used to _generate_ a parser for Pico. +* ((Tree2Box)) provides the extensible/overridable and declarative ((toBox)) function which maps language constructs to Box expressions. +The ((toBox)) function combines generic language-parametric rules, as well as bespoke language-specific rules. +* ((Box2Text)) is a _generic_ reusable algorithm for two-dimensional string layout. +* Finally, ((HiFiLayoutDiff)) _generically_ extracts ((TextEdit))s from two trees which are equal modulo whitespace and comments. +} +@benefits{ +* The formatting style is programmed _declaratively_ by mapping language patterns to Box expressions. +* The pipeline never loses source code comments, and this requires no attention from the language engineer. +} +@pitfalls{ +* ((Tree2Box)) must be _extended_ for the open recursive calls of ((toBox)) to reach the extensions in the current module. +If you import ((Tree2Box)) the extended ((toBox)) rules will only be found if they describe top-level tree nodes.
+} +module lang::pico::format::Formatting + +extend lang::box::util::Tree2Box; + +import ParseTree; +import analysis::diff::edits::ExecuteTextEdits; +import analysis::diff::edits::HiFiLayoutDiff; +import lang::box::\syntax::Box; +import lang::box::util::Box2Text; +import lang::pico::\syntax::Main; + +@synopsis{In-place formatting of an entire Pico file} +void formatPicoFile(loc file) { + edits = formatPicoTree(parse(#start[Program], file)); + executeFileSystemChanges([changed(file, edits)]); +} + +@synopsis{Format a string that contains an entire Pico program} +str formatPicoString(str file) { + start[Program] tree = parse(#start[Program], file, |unknown:///|); + return executeTextEdits(file, formatPicoTree(tree)); +} + +@synopsis{Pico Format function for reuse in file, str or IDE-based formatting contexts} +list[TextEdit] formatPicoTree(start[Program] file) { + formatted = format(toBox(file)); + return layoutDiff(file, parse(#start[Program], formatted, file@\loc.top)); +} + +@synopsis{Format while} +Box toBox((Statement) `while do <{Statement ";"}* block> od`, FO opts = fo()) + = V([ + H([L("while"), toBox(e, opts=opts), L("do")]), + I([toBox(block, opts=opts)]), + L("od") + ]); + +@synopsis{Format if-then-else } +Box toBox((Statement) `if then <{Statement ";"}* thenPart> else <{Statement ";"}* elsePart> fi`, FO opts = fo()) + = V([ + H([L("if"), toBox(e, opts=opts), L("then")]), + I([toBox(thenPart, opts=opts)]), + L("else"), + I([toBox(elsePart, opts=opts)]), + L("fi") + ]); \ No newline at end of file diff --git a/src/org/rascalmpl/library/lang/rascal/tests/basic/RepositionTree.rsc b/src/org/rascalmpl/library/lang/rascal/tests/basic/RepositionTree.rsc new file mode 100644 index 00000000000..66618f19506 --- /dev/null +++ b/src/org/rascalmpl/library/lang/rascal/tests/basic/RepositionTree.rsc @@ -0,0 +1,43 @@ +module lang::rascal::tests::basic::RepositionTree + +import List; +import ParseTree; +import lang::pico::\syntax::Main; + +loc facPico = 
|project://rascal/src/org/rascalmpl/library/lang/pico/examples/fac.pico|; + +private list[loc] collect(Tree t) = [s@\loc | /Tree s := t, s@\loc?]; + +test bool repositionSimulatesReparse() { + t1 = parse(#start[Program], facPico); + t2 = reposition(t1); // defaults set + assert t1 := t2; // but that skips keyword parameters and layout + return collect(t1) == collect(t2); +} + +test bool removeAllAnnotations() { + t1 = parse(#start[Program], facPico); + t2 = reposition(t1, + markSyntax=false, + markLexical=false, + markSubLexical=false, + markAmb=false, + markChar=false, + markLayout=false, + markLit=false, + markStart=false, + markSubLit=false, + markSubLayout=false, + markRegular=false); + assert t1 := t2; // but that skips keyword parameters and layout + return collect(t2) == []; +} + +test bool charsFromLeftToRight() { + t1 = parse(#start[Program], facPico); + t2 = reposition(t1, markChar=true); + allChars = [ch | /ch:char(_) := t2]; + sortedChars = sort(allChars, bool (c1, c2) { return c1@\loc.offset < c2@\loc.offset;}); + + return allChars == sortedChars; +} \ No newline at end of file diff --git a/src/org/rascalmpl/library/lang/rascal/tests/library/analysis/diff/edits/HiFiTreeDiffTests.rsc b/src/org/rascalmpl/library/lang/rascal/tests/library/analysis/diff/edits/HiFiTreeDiffTests.rsc new file mode 100644 index 00000000000..a71610f1357 --- /dev/null +++ b/src/org/rascalmpl/library/lang/rascal/tests/library/analysis/diff/edits/HiFiTreeDiffTests.rsc @@ -0,0 +1,214 @@ +module lang::rascal::tests::library::analysis::diff::edits::HiFiTreeDiffTests + +extend analysis::diff::edits::ExecuteTextEdits; +extend analysis::diff::edits::HiFiLayoutDiff; +extend analysis::diff::edits::HiFiTreeDiff; +extend lang::pico::\syntax::Main; + +import IO; +import ParseTree; +import String; + +public str simpleExample + = "begin + ' declare + ' a : natural, + ' b : natural; + ' a := a + b; + ' b := a - b; + ' a := a - b + 'end + '"; + +public str ifThenElseExample + = "begin + ' 
declare + ' a : natural; + ' if a then + ' a := 10 + ' else + ' if a then + ' a := 11 + ' else + ' a := 12 + ' fi + ' fi + 'end + '"; + +@synopsis{Specification of what it means for `treeDiff` to be syntactically correct} +@description{ +TreeDiff is syntactically correct if: +* The tree after rewriting _matches_ the tree after applying the edits tot the source text and parsing that. +* Note that _matching_ ignores case-insensitive literals and layout, indentation and comments +} +bool editsAreSyntacticallyCorrect(type[&T<:Tree] grammar, str example, (&T<:Tree)(&T<:Tree) transform, list[TextEdit](Tree, Tree) diff) { + orig = parse(grammar, example); + transformed = transform(orig); + edits = diff(orig, transformed); + edited = executeTextEdits(example, edits); + + try { + if (transformed := parse(grammar, edited)) { + return true; + } + else { + println("The edited result is not the same:"); + println(edited); + println("As the transformed:"); + println(transformed); + return false; + } + } + catch ParseError(loc l): { + println(" caused a parse error in:"); + println(edited); + return false; + } +} + +@synopsis{Extract the leading spaces of each line of code} +list[str] indentationLevels(str example) + = [ i | /^[^\ ]*/ <- split("\n", example)]; + +@synopsis{In many cases, but not always, treeDiff maintains the indentation levels} +@description{ +Typically when a rewrite does not change the lines of code count, +and when the structure of the statements remains comparable, treeDiff +can guarantee that the indentation of a file remains unchanged, even if +significant changes to the code have been made. +} +@pitfalls{ +* This specification is not true for any transformation. Only apply it to +a test case if you can expect indentation-preservation for _the entire file_. 
+} +bool editsMaintainIndentationLevels(type[&T<:Tree] grammar, str example, (&T<:Tree)(&T<:Tree) transform, list[TextEdit](Tree, Tree) diff) { + orig = parse(grammar, example); + transformed = transform(orig); + edits = diff(orig, transformed); + edited = executeTextEdits(example, edits); + + return indentationLevels(example) == indentationLevels(edited); +} + +(&X<:Tree) identity(&X<:Tree x) = x; + +start[Program] swapAB(start[Program] p) = visit(p) { + case (Id) `a` => (Id) `b` + case (Id) `b` => (Id) `a` +}; + +start[Program] swapIfBranches(start[Program] p) = visit(p) { + case (Statement) `if <Expression cond> then <{Statement ";"}* thenBranch> else <{Statement ";"}* elseBranch> fi` + => (Statement) `if <Expression cond> then + ' <{Statement ";"}* elseBranch> + 'else + ' <{Statement ";"}* thenBranch> + 'fi` +}; + +start[Program] naturalToString(start[Program] p) = visit(p) { + case (Type) `natural` => (Type) `string` +}; + +start[Program] addDeclarationToEnd(start[Program] p) = visit(p) { + case (Program) `begin declare <{IdType ","}* decls>; <{Statement ";"}* body> end` + => (Program) `begin + ' declare + ' <{IdType ","}* decls>, + ' c : natural; + ' <{Statement ";"}* body> + 'end` +}; + +start[Program] addDeclarationToStart(start[Program] p) = visit(p) { + case (Program) `begin declare <{IdType ","}* decls>; <{Statement ";"}* body> end` + => (Program) `begin + ' declare + ' c : natural, + ' <{IdType ","}* decls>; + ' <{Statement ";"}* body> + 'end` +}; + +start[Program] addDeclarationToMiddle(start[Program] p) = visit(p) { + case (Program) `begin declare <{IdType ","}* pre>, <IdType middle>, <{IdType ","}* post>; <{Statement ";"}* body> end` + => (Program) `begin + ' declare + ' <{IdType ","}* pre>, + ' <IdType middle>, + ' middle : natural, + ' <{IdType ","}* post>; + ' <{Statement ";"}* body> + 'end` +}; + +start[Program](start[Program]) indent(str indentation = " ", bool indentFirstLine = true) { + return start[Program](start[Program] p) { + return parse(#start[Program], indent(indentation, "<p>", indentFirstLine=indentFirstLine)); + }; +} + +start[Program] insertSpacesInDeclaration(start[Program] p) = visit(p) { + case (IdType) `<Id id> : <Type t>` + => (IdType) `<Id id>  :  <Type t>` +}; + +test bool nulTestWithId() + = editsAreSyntacticallyCorrect(#start[Program], simpleExample, identity, treeDiff); + +test bool simpleSwapper() + = editsAreSyntacticallyCorrect(#start[Program], simpleExample, swapAB, treeDiff) + && editsMaintainIndentationLevels(#start[Program], simpleExample, swapAB, treeDiff); + +test bool addDeclarationToEndTest() + = editsAreSyntacticallyCorrect(#start[Program], simpleExample, addDeclarationToEnd, treeDiff); + +test bool addDeclarationToStartTest() + = editsAreSyntacticallyCorrect(#start[Program], simpleExample, addDeclarationToStart, treeDiff); + +test bool addDeclarationToMiddleTest() + = editsAreSyntacticallyCorrect(#start[Program], simpleExample, addDeclarationToMiddle, treeDiff); + +test bool addDeclarationToStartAndMiddleTest() + = editsAreSyntacticallyCorrect(#start[Program], simpleExample, addDeclarationToStart o addDeclarationToMiddle, treeDiff); + +test bool addDeclarationToMiddleAndEndTest() + = editsAreSyntacticallyCorrect(#start[Program], simpleExample, addDeclarationToMiddle o addDeclarationToEnd, treeDiff); + +test bool addDeclarationToStartAndEndTest() + = editsAreSyntacticallyCorrect(#start[Program], simpleExample, addDeclarationToStart o addDeclarationToEnd, treeDiff); + +test bool addDeclarationToStartMiddleAndEndTest() + = editsAreSyntacticallyCorrect(#start[Program], simpleExample, addDeclarationToStart o addDeclarationToMiddle o addDeclarationToEnd, treeDiff); + +test bool addDeclarationToEndAndSwapABTest() + = editsAreSyntacticallyCorrect(#start[Program], simpleExample, addDeclarationToEnd o swapAB, treeDiff); + +test bool addDeclarationToStartAndSwapABTest() + = editsAreSyntacticallyCorrect(#start[Program], simpleExample, addDeclarationToStart o swapAB, treeDiff); + +test bool addDeclarationToStartAndEndAndSwapABTest() + = 
editsAreSyntacticallyCorrect(#start[Program], simpleExample, addDeclarationToStart o addDeclarationToEnd o swapAB, treeDiff); + +test bool naturalToStringTest() + = editsAreSyntacticallyCorrect(#start[Program], simpleExample, naturalToString, treeDiff); + +test bool naturalToStringAndAtoBTest() + = editsAreSyntacticallyCorrect(#start[Program], simpleExample, naturalToString o swapAB, treeDiff); + +test bool swapBranchesTest() + = editsAreSyntacticallyCorrect(#start[Program], ifThenElseExample, swapIfBranches, treeDiff); + +test bool nulTestWithIdLayout() + = editsAreSyntacticallyCorrect(#start[Program], simpleExample, identity, layoutDiff) + && editsMaintainIndentationLevels(#start[Program], simpleExample, identity, layoutDiff); + +test bool indentAllLayout() + = editsAreSyntacticallyCorrect(#start[Program], simpleExample, indent(), layoutDiff) + && !editsMaintainIndentationLevels(#start[Program], simpleExample, indent(), layoutDiff); + +test bool insertSpacesInDeclarationLayout() + = editsAreSyntacticallyCorrect(#start[Program], simpleExample, insertSpacesInDeclaration, layoutDiff) + && editsMaintainIndentationLevels(#start[Program], simpleExample, insertSpacesInDeclaration, layoutDiff); + diff --git a/src/org/rascalmpl/library/util/Highlight.rsc b/src/org/rascalmpl/library/util/Highlight.rsc index 65f8d530690..54ebdc4c04a 100644 --- a/src/org/rascalmpl/library/util/Highlight.rsc +++ b/src/org/rascalmpl/library/util/Highlight.rsc @@ -1,4 +1,3 @@ - @license{ Copyright (c) 2013-2024 CWI All rights reserved. 
This program and the accompanying materials diff --git a/src/org/rascalmpl/tutor/lang/rascal/tutor/Compiler.rsc b/src/org/rascalmpl/tutor/lang/rascal/tutor/Compiler.rsc index 31ac9d800b8..2f8398142b0 100644 --- a/src/org/rascalmpl/tutor/lang/rascal/tutor/Compiler.rsc +++ b/src/org/rascalmpl/tutor/lang/rascal/tutor/Compiler.rsc @@ -101,7 +101,7 @@ int main(PathConfig pcfg = getProjectPathConfig(|cwd:///|), messages = compile(pcfg); - return mainMessageHandler(messages, srcs=pcfg.srcs, errorsAsWarnings=errorsAsWarnings, warningsAsErrors=warningsAsErrors); + return mainMessageHandler(messages, projectRoot=pcfg.projectRoot, errorsAsWarnings=errorsAsWarnings, warningsAsErrors=warningsAsErrors); } @synopsis{compiles each pcfg.srcs folder as a course root} diff --git a/src/org/rascalmpl/types/NonTerminalType.java b/src/org/rascalmpl/types/NonTerminalType.java index 761221a1ccf..e85a4399feb 100644 --- a/src/org/rascalmpl/types/NonTerminalType.java +++ b/src/org/rascalmpl/types/NonTerminalType.java @@ -345,6 +345,9 @@ public boolean intersects(Type other) { if (other == RascalValueFactory.Tree) { return true; } + else if (other.isParameter()) { + return other.intersects(this); + } else if (other instanceof NonTerminalType) { return ((NonTerminalType) other).intersectsWithNonTerminal(this); } diff --git a/src/org/rascalmpl/values/parsetrees/TreeAdapter.java b/src/org/rascalmpl/values/parsetrees/TreeAdapter.java index fb53274cb7c..5f633d0dc68 100644 --- a/src/org/rascalmpl/values/parsetrees/TreeAdapter.java +++ b/src/org/rascalmpl/values/parsetrees/TreeAdapter.java @@ -25,7 +25,6 @@ import org.jline.jansi.Ansi.Color; import org.rascalmpl.exceptions.ImplementationError; import org.rascalmpl.interpreter.utils.LimitedResultWriter; -import org.rascalmpl.values.IRascalValueFactory; import org.rascalmpl.values.RascalValueFactory; import org.rascalmpl.values.ValueFactoryFactory; import org.rascalmpl.values.parsetrees.visitors.TreeVisitor;