diff --git a/cccppp/include/builtins.hpp b/cccppp/include/builtins.hpp index 8e0d33b..7e36211 100644 --- a/cccppp/include/builtins.hpp +++ b/cccppp/include/builtins.hpp @@ -1 +1,2 @@ bool __builtin_is_constant_evaluated(); +int __builtin_va_arg_pack(); diff --git a/cccppp/include/prelude.hpp b/cccppp/include/prelude.hpp index 78fac03..1cd407d 100644 --- a/cccppp/include/prelude.hpp +++ b/cccppp/include/prelude.hpp @@ -20,7 +20,7 @@ struct __has_subscript_overload template static auto subscript_test(const U* u) -> decltype(operator[]((*u), 0), char(0)) {} // a static member function that returns a length-2 thing - static short subscript_test(...) {} + static short subscript_test(...) {return 0;} static const bool value = (sizeof(subscript_test((T*)0)) == 1); }; diff --git a/cccppp/src/Makefile b/cccppp/src/Makefile index 5fcf7f6..bb7c84f 100644 --- a/cccppp/src/Makefile +++ b/cccppp/src/Makefile @@ -11,7 +11,7 @@ LLVM_CONFIG ?= llvm-config-13 # possibly be relevant... the linker will figure otu what to pull in. # This changes by version of LLVM. They are all in a group, so # the order doesn't matter (it's alphabetical). 
-dump cccppp-tool interstitial-tool simple-tool: LDLIBS += \ +dump cccppp-tool interstitial-tool simple-tool easyTool-tool: LDLIBS += \ -Wl,--start-group \ -lclangAST \ -lclangASTMatchers \ @@ -52,7 +52,8 @@ interstitial-tool: interstitial.o $(CXX) -o $@ $+ $(LDFLAGS) $(LDLIBS) dump: dump.o $(CXX) -o $@ $+ $(LDFLAGS) $(LDLIBS) - +simple-tool: simple.o + $(CXX) -o $@ $+ $(LDFLAGS) $(LDLIBS) OCAMLOPTFLAGS += -fPIC CFLAGS += -fPIC diff --git a/cccppp/src/Working-Interstitial b/cccppp/src/Working-Interstitial new file mode 100644 index 0000000..53bad0a --- /dev/null +++ b/cccppp/src/Working-Interstitial @@ -0,0 +1,365 @@ + What we are trying to achieve through cccppp + ______________________________________________ + + Let us take binary expression arr[0] as example + This expression will be represented in AST as :---------+ + | + | + v + +--------------------+ + +---------| [ ] |--------+ + | +--------------------+ | + | | + | | + v v + +--------------------+ +--------------------+ + | arr | | 0 | + +--------------------+ +--------------------+ + + Now we want to translate this binary expression (Specifically ArraySubscript Expressions for now) + + +---------------------------------------------------+ +-------------------------------------------------------------------------------+ + |(*) Binary Expression before translation : arr[0] | -----------------------> |(*) Binary Expression after translation : __primop_subscript()(arr, 0) | + +---------------------------------------------------+ +-------------------------------------------------------------------------------+ + + + +-------------------------------------------------------------------------------+ + | | + +----|-------------------------------------------------------------+ | + | | | | + | | | | + ^ ^ | | + | | v v + +--------------------+ +-----+-----+-----+-----+ + (AST) +---------| [ ] |--------+ | [ |arr | 0 | ] | + | +--------------------+ | | | | | | (Output Buffer) + | | +-----+-----+-----+-----+ + | | | 
| ^ | ^ | | + v v | | | | | | | + +--------------------+ +--------------------+ | | | | | | | + | arr | | 0 | +-----+--|--+--|--+-----+ + +--------------------+ +--------------------+ | | + | | | | + | | | | + | +----------------------------------------------|-----+ + | | + +------------------------------------------------------------------------------------+ + + + | Current Translation Done by Interstitial.cpp| ++-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +| When we take: +| Optional leftIntersticeRange = mkStartLengthRange(leftIntersticeStart, leftIntersticeLength); +| +| and replace with this range: +| if (leftIntersticeRange) TheRewriter.ReplaceText( +| leftIntersticeRange->getBegin(), +| TheRewriter.getRewrittenText(*leftIntersticeRange).length(), +| leftInterstice +| ); ++__________________________________________________________________________________+___________________________________________________________________________________________________________________+ +| What we get is this : | | +| Let us take binary expression: __s[__i] | | +| | How Interstices Look | +| ----------Outer Expression ------------- | ---------------------- | +| | | | **Ignore the spaces in the expression (added space to visualize better)** | +| v v | __s [ __i ] | +| ___primop_subscript()(__s, __i) | ^^ ^ ^ | +| ^ ^ ^ ^ | || || || | +| | | | | | LI : Left Interstices LI MI RI | +| |_______________________________| |_| | RI : Right Interstices | +| eLeft eRight -> '__i' | MI : Middle Interstices | +| | | | +| v | In Ideal Case We are getting "MI Start == MI End" and "RI Start == RI End" | +| '___primop_subscript()(__s' | But in cases where there is actual space like arr [ 0 ] then we are getting | +| | | +| | MI Start = [ RI Start = ] | +| | MI End = "" RI End = ] | +| | | +| | MI End is not '['(have to figure out, Whereas 
RI End is ok) now the twist is the final whole expression we get is:| +| | `__primop_subscript()(arr , 0 )' | +| | | | +| | v | +| | Why is there ' ' between ? | ++----------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------+ + + + +Good Enough to Push into Main +But We should open Issue to solve this particular section + + + +_________________________________________________________________________________________________________________________________________________________________________________________________________ + + +Q) Why SourceLocation is 32 bit ? + +- The 'SourceLocation' class is important for tracking positions within the source code, particularly for various stages of compilation and analysis. +This class is designed to be (32 bits in size) to ensure efficiency and memory saving. + +ID (32 bits): This is the core component of a SourceLocation. It contains the actual encoded value that represents the position within the source code. This value is a combination of bits that encode various pieces of information, including the file ID, offset within the file, and whether it's a regular source location or a macro expansion location. + +Here's a break down what this 32-bit value represents using a diagram + + ++--------------------------------------------------------------------+ +| | +| SourceLocation | * Bit-31 : (Macro ID Bit) Indicates whether it's regular source location or macro expansion location ++--------------------------------------------------------------------+ * Bit-30 to 0: Offset within the file or macro expansion (these bits store the actual position information) +| | +| Macro ID Bit (Bit 31) | ++--------------------------------------------------------------------+ [*] Macro Expansion Location: Specific point in the source code where a macro has been expanded. 
+| | +| Offset within the file or macro expansion (Bit 30-0) | +| (These bits store actual position information) | +| | +| | ++--------------------------------------------------------------------+ + +SourceLocation Overflow +----------------------- +There's a LLVM discussion I found which talks about overflow in SourceLocation. +(https://discourse.llvm.org/t/rfc-clang-sourcelocation-overflow/57541) + +Basically when a large file is included multiple times using #include directives, it can lead to an overflow issue with the SourceLocation values in Clang. The repeated inclusions cause the cumulative size of included files to exceed the capacity that can be represented by the 32-bit SourceLocation, resulting in an overflow problem. This overflow can impact how the compiler tracks positions within the source code and may lead to unexpected behavior or errors. + +There's another discussion on why SourceLocation needs to be 64 bit +(https://discourse.llvm.org/t/macro-performance-lexer-and-sourcemanager/65713/6) + + +There is also a point that mentions this as current limitation in Clang's Compiler User Manual: +Some source locations are not displayed correctly. The front end has a more detailed source location tracking than the locations included in the debug info (e.g., the front end can locate code inside macro expansions). However, the locations used by -Rpass are translated from debug annotations. That translation can be lossy, which results in some remarks having no location information. 
+(https://intel.github.io/llvm-docs/clang/UsersManual.html) + + +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +AddInsertDelta (Inside Rewrite.cpp of Clang Directory) +------------------------------------------------------ + +/// AddInsertDelta - When an insertion is made at a position, this + /// method is used to record that information. + void AddInsertDelta(unsigned OrigOffset, int Change) { + return Deltas.AddDelta(2*OrigOffset, Change); + } + +AddDelta (Inside DeltaTree.cpp of Clang Rewrite Directory) +--------------------------------------------------------- + /// this method is used to record that info. It inserts a delta of 'Delta' +/// into the current DeltaTree at offset FileIndex. +void DeltaTree::AddDelta(unsigned FileIndex, int Delta) { + assert(Delta && "Adding a noop?"); + DeltaTreeNode *MyRoot = getRoot(Root); + + DeltaTreeNode::InsertResult InsertRes; + if (MyRoot->DoInsertion(FileIndex, Delta, &InsertRes)) { + Root = new DeltaTreeInteriorNode(InsertRes); +#ifdef VERIFY_TREE + MyRoot = Root; +#endif + } + + + +_____________________+ + |Overview of DeltaTree| ++-----------------------------------------------------------------------+---------------------+-----------------------------------------------------------------------------------+ +|DeltaTree : Maintains information about how source code changes affect the positions within a source code buffer. 
It tracks changes such as insertions and deletions of code and | +|allows fast lookups to determine the accumulated delta at any specific file offset | +| | +| DeltaTree (Root) | +| | | +| +-----------+-----------+----------+---------+ | +| | | | | | | +| v v v v v | +| Node A Node B Node C Node D Node E | +| | | | | | | +| v v v v v | +| Value 1 Value 2 Value 3 Value 4 Value 5 | +| | +|In this low-level representation: | +|(*) 'DeltaTree' : The top-level structure representing the entire 'DeltaTree'. | +|(*) 'Node A', 'Node B', 'Node C', 'Node D', 'Node E' : Interior Nodes in B-Tree structure | +|(*) 'Value 1', 'Value 2', 'Value 3', 'Value 4', 'Value 5' : Leaf nodes (or final partitions) that store SourceDelta values. | +| | +|-Each interior node (e.g. 'Node A' to 'Node E') may have multiple child nodes and values. I have shown only one child node and one value per interior node for the sake of simplicity | ++---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + + + Key Concepts: + ------------- + + (*) Delta - a delta represents how the source code at a specific file location has been modified (added or deleted). Deltas are recorded as 'SourceDelta' objects with a file location and a delta value. + (*) Full Delta - Each Node in the tree maintains a 'Full Delta', which is the accumulated delta of all the values or deltas within the node and its children. + This allows for quick retrieval of the total delta for a subtree. + (** Important **) Deltas can be positive or negative. + + How it works: + ------------- + + (*) Initialisation: The 'DeltaTree' is initialised with an empty root node. + (*) AddDeltaFunction: When a change is made to the source code (e.g. Insertion or Deletion), the 'AddDelta' function is called. + This function records the change (delta) at a specific file offset. If necessary, it may split nodes in the tree to maintain the B-Tree structure. 
+ (*) DeltaTreeNodeSplit: When a node becomes full (reaches a certain capacity), it is split into two nodes; this ensures that the B-Tree remains balanced. + (*) Querying Accumulated Delta: The 'getDeltaAt' function is used to query the accumulated delta at a specific file offset. + It traverses the tree, taking into account the delta stored in each node and its children. + + + + here's a simple example to understand how DeltaTree works--| + v + Let us take a source code and see how insertion and deletion are recorded as delta: + +---------------------+ + |Original Source Code:| + |---------------------| + | | + |int main() { | + | int a = 10; | + | return 0; | + |} | + +---------------------+ + + Case 1: Insertion + ----------------- + let's insert a piece of code at a specific location (e.g. after the declaration of a) + + int b = 2; + + Here's how the change will be represented : + 1) Delta Representation :-> Delta: FileLoc = 29 (position after 'int a = 10;'), Delta = +13 + (*) FileLoc = The file location where the insertion occurs (character position is taken) + (*) Delta = The change in file location due to insertion (In this example +13 characters) + + 2) DeltaTree Modification + Tree After Insertion:---| + v + [Root] + / \ + / \ + [Leaf Node A] [Leaf Node B] + (FileLoc: 0) (FileLoc: 29) ----------------------------------------------------------------+ + / | \ / | \ | + ... ... ... ... ... ... | + | + So what's happening in this case is: +---------->| + - The root node has two leaf nodes. | + - 'Leaf Node A' represents the original code. | If we use function 'getDeltaAt' with a specific position + - 'Leaf Node B' represents the inserted code. | we get to see this: + - The root node now tracks the accumulated delta of both nodes. (It achieves this by using function 'getDeltaAt') | + - The insertion causes the split at the root node to maintain balance in the B-tree. | (*) getDeltaAt(10) -> would return '0' because + | no changes occurred before this position. 
+ Case 2: Deletion | (*) getDeltaAt(29) -> would return '13' because + ---------------- | we inserted 13 characters at position 29. + let's delete a piece of code from a specific location (e.g. the declaration of a) | (*) getDeltaAt(16) -> would return '-13' because + | we deleted 13 characters at position 16. + Here's how the change will be represented : | + 1) Delta Representation :-> Delta: FileLoc = 16 (position of 'int a = 10;'), Delta = -13 | + (*) FileLoc = The file location where the deletion occurs (character position is taken) +--------->| + (*) Delta = The change in file location due to deletion (In this example -13 characters) | + | + 2) DeltaTree Modification | + Tree After Deletion :---| | + v | + [Root] | + / \ | + / \ | + [Leaf Node A] [Leaf Node B] | + (FileLoc: 0) (FileLoc: 16) -----------------------------------------------------------------+ + / | \ / | \ + ... ... ... ... ... ... + + So what's happening in this case is: + - The root node has two leaf nodes. + - 'Leaf Node A' represents the original code. + - 'Leaf Node B' represents the deleted code. + - The root node now tracks the accumulated delta of both nodes. (It achieves this by using function 'getDeltaAt') + - The deletion causes the split at the root node to maintain balance in the B-tree. + + + +Notes on functions : getDecomposedLoc, getSpellingLoc, getMappedOffset, insertText, getRewrittenText. + +getDecomposedLoc: +---------------- + takes a SourceLocation object called Loc as its input and returns a std::pair containing two values: a FileID and an unsigned integer. + + Here's what the function does step by step: + + 1) It starts by calling the getFileID function with the Loc argument to obtain the FileID corresponding to the provided source location. + + 2) Next, it calls the getSLocEntryOrNull function with the obtained FileID to get a pointer (Entry) to the source location entry associated with that file. 
+ + 3) If the Entry pointer is nullptr (i.e., if there is no entry found for the given FileID), the function returns a std::pair with an empty FileID and an offset of 0. This suggests that the provided SourceLocation does not have a valid mapping to a file. + + 4) If an Entry is found, it calculates the offset of the provided SourceLocation within the file by subtracting the offset of the Entry (obtained using Entry->getOffset()) from the offset of the Loc (obtained using Loc.getOffset()). It then returns a std::pair with the FileID and the calculated offset. + + +getSpellingLoc: +--------------- + takes a SourceLocation object called Loc as its input and returns a SourceLocation. + + Here's what the function does: + + 1) It checks whether the Loc is an ordinary file location rather than a macro location (i.e., whether its macro-ID bit is clear). This is done using the Loc.isFileID() function call. + + 2) If the Loc is a file location (i.e., Loc.isFileID() returns true), it means that the Loc itself is the spelling location, and there's no need for further processing. In this case, the function simply returns the input Loc as-is. + + 3) If the Loc is not a file location (i.e., Loc.isFileID() returns false), it implies that the Loc is a macro location (i.e., it points into a macro expansion). In this case, the function delegates the task of finding the spelling location to the getSpellingLocSlowCase function by passing Loc as an argument. This slow case handling typically involves traversing through the expansion history to determine the original source location or spelling location. + +getMappedOffset: +---------------- + +why 2*origoffset ? (I don't know whether this answer, which I came up with by analyzing the function, is correct :'( ) + +The getMappedOffset function takes an offset into the original SourceBuffer and returns the corresponding offset in the RewriteBuffer. The AfterInserts flag selects whether the returned offset lies before (false) or after (true) any text already inserted at that original offset. 
+ +The 2*OrigOffset term exists because the DeltaTree is keyed on doubled offsets, so every original offset owns two slots: the even key 2*OrigOffset is where AddInsertDelta records insertions made at that offset, and the odd key 2*OrigOffset+1 is where AddReplaceDelta records replacements/deletions. getDeltaAt(K) sums only the deltas recorded at keys before K, so querying the even key gives the position before any text inserted at OrigOffset, and querying the odd key gives the position after it. For example, if 5 characters were inserted at original offset 10, getMappedOffset(10, false) is 10 and getMappedOffset(10, true) is 15. + +overall--v + +AfterInserts: If this boolean flag is true (it defaults to false in getMappedOffset), 1 is added to the 2*OrigOffset key. This moves the query to the odd slot, so the returned position is "after" any text inserted at this offset. + +OrigOffset: Adding the original offset to the accumulated delta turns the total shift caused by earlier rewrites into an actual offset in the RewriteBuffer. + + +insertText: +----------- + + It takes three parameters: + * OrigOffset: An unsigned integer representing the original offset where the insertion should occur. + * Str: A StringRef containing the text to be inserted. + * InsertAfter: A boolean flag indicating whether the new text should be placed after (true) or before (false) any text previously inserted at the same original offset. + + 1) The function first checks if the Str to be inserted is empty. If it is empty, there's nothing to insert, so the function exits early without making any changes. + + 2) It calculates the RealOffset by calling the getMappedOffset function with the OrigOffset and the InsertAfter flag. This calculation adjusts the original offset to reflect any previous insertions or modifications made in the buffer. + + 3) The function then uses the RealOffset to insert the characters from Str into the Buffer at the specified position. + + 4) Finally, it calls AddInsertDelta to record a delta value that represents the change made by this insertion. This delta value is used to keep track of how future changes should be offset correctly within the buffer. + + +getRewrittenText: +----------------- + + It takes a CharSourceRange named Range as its input parameter. This Range represents the character source range for which rewritten text is desired. 
+ + 1) The function first checks whether the beginning and end of the Range are rewritable by calling the isRewritable function for both the start and end of the range. If either the start or end is not rewritable, it returns an empty string (indicating that the requested range cannot be rewritten). + + 2) It then proceeds to obtain the file IDs and offsets for both the start and end of the Range. This involves calling the getLocationOffsetAndFileID function, which appears to determine the file and offset information for a given source location. + + 3) It checks whether the StartFileID and EndFileID are the same. If they are not, it returns an empty string because the start and end of the range belong to different buffers. + + 4) If both the start and end are in the same buffer, the function checks if any rewrites have been made to this buffer by searching for the buffer in the RewriteBuffers map. If no rewrite has been made, it retrieves the text from the original source using the SourceMgr (Source Manager) and adjusts the end offset if the Range represents a token range. The function then returns the corresponding string. + + 5) If there have been rewrites to the buffer, it retrieves the RewriteBuffer for the buffer in question from the RewriteBuffers map. + + 6) It adjusts the start and end offsets using the getMappedOffset function, which presumably adjusts the offsets to account for previous insertions or changes made in the buffer. + + 7) Again, it adjusts the end offset if the Range represents a token range. + + 8) The function then iterates over the RewriteBuffer to retrieve the rewritten text within the specified range by advancing the iterators accordingly. + + 9) Finally, it constructs and returns a std::string using the retrieved iterators to obtain the rewritten text within the specified range. 
+ + diff --git a/cccppp/src/interstitial.cpp b/cccppp/src/interstitial.cpp index 5d9f89b..57dac9a 100644 --- a/cccppp/src/interstitial.cpp +++ b/cccppp/src/interstitial.cpp @@ -1,5 +1,6 @@ #include #include +#include #ifdef USE_STD_UNIQUE_PTR #include #endif @@ -39,6 +40,9 @@ using llvm::make_unique; #endif using llvm::Optional; +std::set statementOffsets; +std::vector> rewrittenRanges; + class SimpleRewriteASTVisitor : public RecursiveASTVisitor { public: bool shouldTraversePostOrder() const /* override */ { return true; } @@ -195,45 +199,83 @@ Initially right interstice is: `0]' should be ']' // forwards and its end backwards. // the start of eSubLeft and the start of eSubRight // It's easiest to do this right-to-left, and calculating lengths first. - unsigned leftIntersticeLength = rangeLength(mkRightOpenRange(eOuter->getSourceRange().getBegin(), + // + // Is this better name for variables ? + /* + unsigned leftIntersticeEstimatedLength = rangeLength(mkRightOpenRange(eOuter->getSourceRange().getBegin(), + eSubLeft->getSourceRange().getBegin(), false, false)); + unsigned beforeMidIntersticeEstimatedLength = rangeLength(mkRightOpenRange(eSubLeft->getSourceRange().getEnd(), + eSubRight->getSourceRange().getBegin(), true, false)); + unsigned rightIntersticeEstimatedLength = rangeLength(mkRightOpenRange(eSubRight->getSourceRange().getEnd(), + eOuter->getSourceRange().getEnd(), true, true)); + */ + + // or something like this: beforelhSubExprLength, lhSubExprLength, afterRhExprLength + + + unsigned leftIntersticeLength = rangeLength(mkRightOpenRange(eOuter->getSourceRange().getBegin(), eSubLeft->getSourceRange().getBegin(), false, false)); unsigned midIntersticeLength = rangeLength(mkRightOpenRange(eSubLeft->getSourceRange().getEnd(), eSubRight->getSourceRange().getBegin(), true, false)); unsigned rightIntersticeLength = rangeLength(mkRightOpenRange(eSubRight->getSourceRange().getEnd(), - eOuter->getSourceRange().getEnd(), true, true)); - /* Now we've got the lengths, 
get the start positions. */ + eOuter->getSourceRange().getEnd(), true, true)); + + /* Now we've got the lengths, get the start positions. */ + + llvm::errs() << "Length of leftInterstice is (according to range) :-> " << leftIntersticeLength << "\n"; + llvm::errs() << "Length of midInterstice is (according to range) :-> " << midIntersticeLength << "\n"; + llvm::errs() << "Length of rightInterstice is (according to range) :-> " << rightIntersticeLength << "\n"; + SourceLocation leftIntersticeStart = eOuter->getSourceRange().getBegin(); - SourceLocation midIntersticeStart = eSubLeft->getSourceRange().getEnd().getLocWithOffset(1); - SourceLocation rightIntersticeStart = eSubRight->getSourceRange().getEnd().getLocWithOffset(1); - SourceLocation leftIntersticeEnd = eSubLeft->getSourceRange().getBegin().getLocWithOffset(-1); + std::pair leftStartDecomposedLoc = TheRewriter.getSourceMgr().getDecomposedLoc(leftIntersticeStart); + llvm::errs() << "DecomposedLoc for LeftStartInterstice :-> " << leftStartDecomposedLoc.second << "\n"; + + SourceLocation midIntersticeStart = eSubLeft->getSourceRange().getEnd().getLocWithOffset(midIntersticeLength); + std::pair midStartDecomposedLoc = TheRewriter.getSourceMgr().getDecomposedLoc(midIntersticeStart); + llvm::errs() << "DecomposedLoc for MidStartInterstice :-> " << midStartDecomposedLoc.second << "\n"; + + SourceLocation rightIntersticeStart = eSubRight->getSourceRange().getEnd().getLocWithOffset(rightIntersticeLength); + std::pair rightStartDecomposedLoc = TheRewriter.getSourceMgr().getDecomposedLoc(rightIntersticeStart); + llvm::errs() << "DecomposedLoc for RightStartInterstice :-> " << rightStartDecomposedLoc.second << "\n"; + + SourceLocation leftIntersticeEnd = eSubLeft->getSourceRange().getBegin().getLocWithOffset(-1); //Initially it was set to -1 + std::pair leftEndDecomposedLoc = TheRewriter.getSourceMgr().getDecomposedLoc(leftIntersticeEnd); + llvm::errs() << "DecomposedLoc for LeftEndInterstice :-> " << 
leftEndDecomposedLoc.second << "\n"; + SourceLocation midIntersticeEnd = eSubRight->getSourceRange().getBegin().getLocWithOffset(-1); - SourceLocation rightIntersticeEnd = eOuter->getSourceRange().getEnd(); + std::pair midEndDecomposedLoc = TheRewriter.getSourceMgr().getDecomposedLoc(midIntersticeEnd); + llvm::errs() << "DecomposedLoc for MidEndInterstice :-> " << midEndDecomposedLoc.second << "\n"; + + SourceLocation rightIntersticeEnd = eOuter->getSourceRange().getEnd(); + std::pair rightEndDecomposedLoc = TheRewriter.getSourceMgr().getDecomposedLoc(rightIntersticeEnd); + llvm::errs() << "DecomposedLoc for RightEndInterstice :-> " << rightEndDecomposedLoc.second << "\n"; // now does this work? + Optional leftIntersticeRange = mkStartLengthRange( - leftIntersticeStart, leftIntersticeLength); - Optional midIntersticeRange = mkStartLengthRange( - midIntersticeStart, midIntersticeLength); - Optional rightIntersticeRange = mkStartLengthRange( - rightIntersticeStart, rightIntersticeLength); - + /*SourceRange(leftIntersticeStart, leftIntersticeEnd);*/leftIntersticeStart, leftIntersticeLength); + Optional midIntersticeRange = //mkStartLengthRange( + SourceRange(midIntersticeStart, midIntersticeEnd);/*midIntersticeStart, midIntersticeLength)*/; + Optional rightIntersticeRange = //mkStartLengthRange( + SourceRange(rightIntersticeStart, rightIntersticeEnd); /*rightIntersticeStart, rightIntersticeLength);*/ + /* The sum of lengths of the interstices should be equal to the length of the whole expression * minus the lengths of the subexpression. 
*/ - unsigned intersticesLength = rangeLength(leftIntersticeRange) + /*unsigned intersticesLength = rangeLength(leftIntersticeRange) + rangeLength(midIntersticeRange) - + rangeLength(rightIntersticeRange); + + rangeLength(rightIntersticeRange);*/ //assert(intersticesLength + leftSubLength + rightSubLength == outerLength); llvm::errs() << "Initially left interstice (length " << rangeLength(leftIntersticeRange) << ") is: `" - << (!leftIntersticeRange ? "" : TheRewriter.getRewrittenText(*leftIntersticeRange)) << "'\n"; + << (!leftIntersticeRange ? "" : TheRewriter.getRewrittenText(*leftIntersticeRange)) << "'\n"; llvm::errs() << "Initially mid interstice (length " << rangeLength(midIntersticeRange) << ") is: `" - << (!midIntersticeRange ? "" : TheRewriter.getRewrittenText(*midIntersticeRange)) << "'\n"; + << (!midIntersticeRange ? "" : TheRewriter.getRewrittenText(*midIntersticeRange)) << "'\n"; llvm::errs() << "Initially right interstice (length " << rangeLength(rightIntersticeRange) << ") is: `" << (!rightIntersticeRange ? "" : TheRewriter.getRewrittenText(*rightIntersticeRange)) << "'\n"; - + /* Do three small rewrites, not one big one. 
*/ if (leftIntersticeRange) TheRewriter.ReplaceText( leftIntersticeRange->getBegin(), - TheRewriter.getRewrittenText(*leftIntersticeRange).length(), + TheRewriter.getRewrittenText(*leftIntersticeRange).length()/*leftIntersticeLength*/, leftInterstice ); else { TheRewriter.InsertTextBefore(eOuter->getSourceRange().getBegin(), leftInterstice); @@ -265,7 +307,7 @@ Initially right interstice is: `0]' should be ']' midIntersticeRange->getBegin(), TheRewriter.getRewrittenText(*midIntersticeRange).length(), midInterstice - ); else TheRewriter.InsertTextAfter(eSubLeft->getSourceRange().getEnd(), midInterstice); + ); else TheRewriter.InsertTextAfter(eSubLeft->getSourceRange().getEnd().getLocWithOffset(midIntersticeLength), midInterstice); if (rightIntersticeRange) TheRewriter.ReplaceText( rightIntersticeRange->getBegin(), TheRewriter.getRewrittenText(*rightIntersticeRange).length(), @@ -334,6 +376,7 @@ Initially right interstice is: `0]' should be ']' llvm::errs() << "Skipping something under a sizeof, _Alignof, decltype or similar expr!\n"; return true; } + return false; }(); @@ -356,13 +399,40 @@ Initially right interstice is: `0]' should be ']' llvm::errs() << "\n"; e->dump(); std::string lhBefore = TheRewriter.getRewrittenText(e->getLHS()->getSourceRange()); + llvm::errs() << "The lhBefore :-> " << lhBefore << "\n"; std::string rhBefore = TheRewriter.getRewrittenText(e->getRHS()->getSourceRange()); // replace it with some text we have crafted + llvm::errs() << "The rhBefore :-> " << rhBefore << "\n"; QualType indexedType = e->getLHS()->getType(); // is this definitely an array? 
std::string leftInterstice; std::string midInterstice = ", "; std::string rightInterstice = ")"; + + unsigned int offset = TheRewriter.getSourceMgr().getFileOffset(e->getLHS()->getBeginLoc()); // Calculate the offset + unsigned int endOffset = TheRewriter.getSourceMgr().getFileOffset(e->getRHS()->getEndLoc()); + + llvm::errs() << "\e[1;32m Replacing at offset: \e[0m" << "\e[1;32m Beginning offset: \e[0m" << offset << "\e[1;32m Ending offset: \e[0m" << endOffset << "\n"; + // Check if the current range falls within any previously rewritten range + bool withinPreviousRange = false; + for (const auto& range : rewrittenRanges) { + if (offset == range.first && endOffset == range.second) { + withinPreviousRange = true; + break; + } + } + + std::string string1= "__primop_subscript<"; + std::string string2= "__maybe_primop_subscript<"; + + std::size_t found1 = TheRewriter.getRewrittenText(e->getLHS()->getSourceRange()).find(string1); + std::size_t found2 = TheRewriter.getRewrittenText(e->getLHS()->getSourceRange()).find(string2); + + if(withinPreviousRange && (found1 != std::string::npos || found2 != std::string::npos)) { + llvm::errs() << "Already Translated" << "\n"; + return true; + } + if (!(indexedType.getTypePtr()->isTemplateTypeParmType() || indexedType.getTypePtr()->isDependentType() || indexedType.getTypePtr()->isInstantiationDependentType() @@ -380,13 +450,20 @@ Initially right interstice is: `0]' should be ']' * as it already appears in the code. But in many cases this * comes out as "", and I haven't figured out a * way to make clang print what we want. So use decltype() for now. 
*/ + //flag = "0"; leftInterstice = std::string("__maybe_primop_subscript<") + "decltype(" + lhBefore + "), !__has_subscript_overload::value>()("; + + //This was not here before } + ReplaceBinaryExpressionInterstices(e, e->getLHS(), e->getRHS(), leftInterstice, midInterstice, rightInterstice); + + rewrittenRanges.emplace_back(offset, endOffset); + //added this just to check // after replacement, we should still have the same view of the subexpressions // FIXME: EXCEPT we can't do this if the LHS begins at the same place as the // outer expression @@ -396,7 +473,7 @@ Initially right interstice is: `0]' should be ']' } return true; } - + private: Rewriter &TheRewriter; ASTContext &TheContext; @@ -431,7 +508,7 @@ class SimpleRewriteConsumer : public ASTConsumer { bool HandleTopLevelDecl(DeclGroupRef DR) override { unsigned count = 0; SourceLocation lastSourceLoc; - //llvm::errs() << "== Saw top-level decl\n"; + llvm::errs() << "== Saw top-level decl\n"; for (DeclGroupRef::iterator b = DR.begin(), e = DR.end(); b != e; ++b) { // HACK: to get parent info, I have to do this, but I have no idea why. C.setTraversalScope({*b}); @@ -445,8 +522,8 @@ class SimpleRewriteConsumer : public ASTConsumer { llvm::errs() << "== The last one ended at "; lastSourceLoc.print(llvm::errs(), R.getSourceMgr()); llvm::errs() << " (written in main file? " - << R.getSourceMgr().isWrittenInMainFile(lastSourceLoc) - << ", presumed in main file? " + << R.getSourceMgr().isWrittenInMainFile(lastSourceLoc) + << ", presumed in main file? 
" << R.getSourceMgr().isInMainFile(lastSourceLoc) << "; immediate spelling loc: "; R.getSourceMgr().getImmediateSpellingLoc(lastSourceLoc).print(llvm::errs(), R.getSourceMgr()); diff --git a/cccppp/src/simple.cpp b/cccppp/src/simple.cpp index 792b362..1693cd4 100644 --- a/cccppp/src/simple.cpp +++ b/cccppp/src/simple.cpp @@ -2,6 +2,8 @@ #include #ifdef USE_STD_UNIQUE_PTR #include +#include +#include #endif #include "clang/AST/AST.h" @@ -37,6 +39,9 @@ using std::make_unique; using llvm::make_unique; #endif +std::set statementOffsets; +std::vector> rewrittenRanges; + class SimpleRewriteASTVisitor : public RecursiveASTVisitor { public: bool shouldTraversePostOrder() const /* override */ { return true; } @@ -90,9 +95,12 @@ class SimpleRewriteASTVisitor : public RecursiveASTVisitorgetSourceRange()) << "'\n"; llvm::errs() << "Initially whole binary expression is: `" << TheRewriter.getRewrittenText(eOuter->getSourceRange()) << "'\n"; + std::string rewrittenStmt = TheRewriter.getRewrittenText(eOuter->getSourceRange()); + assert(rewrittenStmt.length() < 2048); + assert(overallReplacement.length() < 2048); TheRewriter.ReplaceText( eOuter->getSourceRange().getBegin(), - TheRewriter.getRewrittenText(eOuter->getSourceRange()).length(), + rewrittenStmt.length(), overallReplacement ); llvm::errs() << "Now left-hand subexpression is: `" @@ -110,6 +118,8 @@ class SimpleRewriteASTVisitor : public RecursiveASTVisitor(s)) { /* There are two cases: * (1) it's really the builtin array subscript; * (2) it's a template that might actually bind to an overload. 
*/ ArraySubscriptExpr *e = cast(s); + + unsigned int offset = TheRewriter.getSourceMgr().getFileOffset(e->getLHS()->getBeginLoc()); // Calculate the offset + unsigned int endOffset = TheRewriter.getSourceMgr().getFileOffset(e->getRHS()->getEndLoc()); + + llvm::errs() << "\e[1;32m Replacing at offset: \e[0m" << "\e[1;32m Beginning offset: \e[0m" << offset << "\e[1;32m Ending offset: \e[0m" << endOffset << "\n"; + // Check if the current range falls within any previously rewritten range + bool withinPreviousRange = false; + for (const auto& range : rewrittenRanges) { + if (offset == range.first && endOffset == range.second) { + withinPreviousRange = true; + count++; + break; + } + } + /* + if(withinPreviousRange){ + if(count > 1) { + llvm::errs() << "Sup You've already seen me!" << "\n"; + return true; + } + } + */ + std::string string1= "__primop_subscript<"; + std::string string2= "__maybe_primop_subscript<"; + + std::size_t found1 = TheRewriter.getRewrittenText(e->getLHS()->getSourceRange()).find(string1); + std::size_t found2 = TheRewriter.getRewrittenText(e->getLHS()->getSourceRange()).find(string2); + + if(withinPreviousRange && (found1 != std::string::npos || found2 != std::string::npos)) { + llvm::errs() << "Already Translated" << "\n"; + return true; + } + llvm::errs() << "Post-order-reached a new ArraySubscriptExpr: "; e->printPretty(llvm::errs(), nullptr, PrintingPolicy(LangOptions())); llvm::errs() << " at "; @@ -190,36 +237,48 @@ class SimpleRewriteASTVisitor : public RecursiveASTVisitorisUndeducedType())) { replacement = std::string("__primop_subscript<") - + indexedType.getAsString() - + " >()(" - + lhBefore - + ", " - + rhBefore - + " )"; + + indexedType.getAsString() + + " >()(" + + lhBefore + + ", " + + rhBefore + + ")"; } else // nasty case { /* We want to print the type name, or an expression for it, - * as it already appears in the code. 
But in many cases this - * comes out as "", and I haven't figured out a - * way to make clang print what we want. So use decltype() for now. */ + * as it already appears in the code. But in many cases this + * comes out as "", and I haven't figured out a + * way to make clang print what we want. So use decltype() for now. */ replacement = std::string("__maybe_primop_subscript<") - + "decltype(" - + lhBefore - + "), !__has_subscript_overload::value>()(" - + lhBefore - + ", " - + rhBefore - + " )"; + + "decltype(" + + lhBefore + + "), !__has_subscript_overload::value>()(" + + lhBefore + + ", " + + rhBefore + + " )"; } - ReplaceBinaryExpression(e, e->getLHS(), e->getRHS(), - replacement); - // after replacement, we should still have the same view of the subexpressions - // FIXME: EXCEPT we can't do this if the LHS begins at the same place as the - // outer expression. So disable these assertions. We'll try again in 'interstitial'. - //assert(e->getLHS()->getSourceRange().getBegin() == e->getSourceRange().getBegin() - //|| lhBefore == TheRewriter.getRewrittenText(e->getLHS()->getSourceRange())); - //assert(rhBefore == TheRewriter.getRewrittenText(e->getRHS()->getSourceRange())); + + /* unsigned int offset = TheRewriter.getSourceMgr().getFileOffset(e->getBeginLoc()); + + if(statementOffsets.find(offset) != statementOffsets.end()) { + return false; + } + statementOffsets.insert(offset); + */ + /*ReplaceBinaryExpression(e, e->getLHS(), e->getRHS(), + replacement);*/ + // after replacement, we should still have the same view of the subexpressions + // FIXME: EXCEPT we can't do this if the LHS begins at the same place as the + // outer expression. So disable these assertions. We'll try again in 'interstitial'. 
+ //assert(e->getLHS()->getSourceRange().getBegin() == e->getSourceRange().getBegin() + //|| lhBefore == TheRewriter.getRewrittenText(e->getLHS()->getSourceRange())); + //assert(rhBefore == TheRewriter.getRewrittenText(e->getRHS()->getSourceRange())); + + + ReplaceBinaryExpression(e, e->getLHS(), e->getRHS(), replacement); + rewrittenRanges.emplace_back(offset, endOffset); } return true; } diff --git a/cccppp/test/brackets/brack.cpp b/cccppp/test/brackets/brack.cpp new file mode 100644 index 0000000..68e9b8c --- /dev/null +++ b/cccppp/test/brackets/brack.cpp @@ -0,0 +1,7 @@ +#include +const char* getString() { + return "Hello :)"; +} +int main() { + return getString()[0]; +} diff --git a/cccppp/test/multi-dim/mult-dim.cpp b/cccppp/test/multi-dim/mult-dim.cpp new file mode 100644 index 0000000..a63a8fd --- /dev/null +++ b/cccppp/test/multi-dim/mult-dim.cpp @@ -0,0 +1,11 @@ +#include +using namespace std; + +int main() { + int b[1]; + b[0] = 1; + int a[1][2]; + a[0][0] = 1; + a[0][1] = 7; + cout << a[0][b[0]] << endl; +}