diff --git a/src/ado/reprun.ado b/src/ado/reprun.ado index 3b56b85..09fb4bf 100644 --- a/src/ado/reprun.ado +++ b/src/ado/reprun.ado @@ -7,6 +7,9 @@ qui { version 14.1 + * Store start time + local start_time = clock(c(current_time), "hms") + syntax anything [using/] , [Verbose] [Compact] [noClear] [Debug] [Suppress(passthru)] /***************************************************************************** @@ -65,12 +68,16 @@ qui { /************************************************************************* Generate the run 1 and run 2 do-files *************************************************************************/ - + + + noi di as res "" noi di as err "{phang}Starting reprun. Creating the do-files for run 1 and run 2.{p_end}" noi reprun_recurse, dofile("`dofile'") output("`dirout'") stub("m") local code_file_run1 "`r(code_file_run1)'" local code_file_run2 "`r(code_file_run2)'" + if "`r(mmmflag)'" != "" local mmmflag "`mmmflag' `r(mmmflag)'" + if "`r(sssflag)'" != "" local sssflag "`sssflag' `r(sssflag)'" noi di as err "{phang}Done creating the do-files for run 1 and run 2.{p_end}" /************************************************************************* @@ -126,6 +133,19 @@ qui { l2(`"{pstd}{c BLC}{hline 1}> `dofile'{p_end}"') l3("{hline}") file close `h_smcl' + if "`mmmflag'" != "" { + noi di as res `"{pstd}{red:Reproducibility Warning:} Your code contains many-to-many merges on lines:`mmmflag'.{p_end}"' + noi di as res `"{pstd}As the {mansection D merge:Stata Manual} says: {it:if you think you need to perform an m:m merge, then we suspect you are wrong}.{p_end}"' + noi di as res `"{pstd}Reference the above section of the Stata Manual for troubleshooting.{p_end}"' + } + + if "`sssflag'" != "" { + noi di as res `" "' + noi di as res `"{pstd}{red:Reproducibility Warning:} Your code set the sortseed on lines:`sssflag'.{p_end}"' + noi di as res `"{pstd}As the {mansection D sort:Stata Manual} says: {it:You must be sure that the ordering really does not matter. 
If that is the case, then why did you sort in the first place?}{p_end}"' + noi di as res `"{pstd}Reference the above section of the Stata Manual for troubleshooting.{p_end}"' + } + /***************************************************************************** Write smcl file to disk and clean up intermediate files unless debugging *****************************************************************************/ @@ -140,6 +160,31 @@ qui { } } + //display timer + * Store end time + local end_time = clock(c(current_time), "hms") + + * Calculate and display elapsed time + local elapsed_time = (`end_time' - `start_time') / 1000 + local hours = floor(`elapsed_time' / 3600) + local minutes = floor(mod(`elapsed_time', 3600) / 60) + local seconds = mod(`elapsed_time', 60) + + noi di as res "" + if (`elapsed_time' >= 3600) { + noi di as res `"{phang}Total run time: `hours':`minutes':`seconds' (HH:MM:SS){p_end}"' + } + + else if (`elapsed_time' >= 60) { + noi di as res `"{phang}Total run time: `minutes':`seconds' (MM:SS){p_end}"' + } + + else { + noi di as res `"{phang}Total run time: `seconds' seconds{p_end}"' + } + + + // Remove tmahen command is no longer in beta noi repkit "beta reprun" @@ -159,7 +204,7 @@ end program define reprun_recurse, rclass qui { - syntax, dofile(string) output(string) stub(string) + syntax, dofile(string asis) output(string) stub(string) /************************************************************************* Create the files that this recursive call needs @@ -198,7 +243,8 @@ end * Open the orginal file tempname code_orig - file open `code_orig' using "`dofile'", read + + file open `code_orig' using `dofile', read * Loop until end of file while `leof' == 0 { @@ -255,22 +301,61 @@ end * Not part of a multiline line else { - *Reset the last line local + * Reset the last line locals local last_line = "" local line_command = "OTHER" local dofile "" local doflag 0 - foreach w in `macval(line)' { - get_command, word("`w'") - if `doflag' == 1 local dofile 
= "`w'" + local looptype "" + local loopflag 0 + + // Sanitize that string! + local 0 `"`macval(line)'"' + + + // Identify all commands in line + while `"`0'"' != "" { + + gettoken 1 0 : 0 , quotes + if strpos(`"`1'"',"//") local 0 "" // End on comments + if strpos(`"`1'"',"*") & "`line_command'" == "OTHER" local 0 "" // End on comments + + // di as err `"`1' // `0'"' + + cap get_command, word(`"`1'"') + + if `doflag' == 1 local dofile = `"`1'"' + if `loopflag' == 1 local looptype = "`1'" + + * Dofiles if "`r(command)'" == "do" | "`r(command)'" == "run" { local doflag = 1 } else local doflag 0 + + * Loops + if "`r(command)'" == "foreach" | "`r(command)'" == "forvalues" { + local loopflag = 1 + } + else local loopflag 0 + local line_command = "`line_command' `r(command)'" + mac shift } local line_command : list uniq line_command + * If MMM + if (strpos("`line_command'","mmm")) { + di as err "Reproducibility Warning: Many-to-many merge on Line `lnum'" + return local mmmflag = `lnum' + } + + * If SSS + if (strpos("`line_command'","sss")) { + di as err "Reproducibility Warning: Sortseed set on Line `lnum'" + return local sssflag = `lnum' + } + * If using capture, log it and take second word as command if (strpos("`line_command'","capture")) { local lastline_capture = 1 @@ -290,20 +375,20 @@ end } * Line is do or run, so call recursive function - if (strpos("`line_command'","do")) | (strpos("`line_command'","run")) { + if (strpos(`"`line_command'"',"do")) | (strpos("`line_command'","run")) { * Write line handling recursion in data file local write_recline = 1 * Get the file path from the second word local file = `"`dofile'"' - local file_rev = strreverse("`file'") + local file_rev = strreverse(`"`file'"') * Only recurse on .do files and add .do when no extension is used - if (substr("`file_rev'",1,3) == "od.") { + if strpos(`"`file'"' , ".do") { local recurse 1 } - else if (substr("`file_rev'",1,4) == "oda.") { + else if strpos(`"`file'"' , ".ado") { local recurse 0 // 
skip recursing reprun on adofiles } else { @@ -312,28 +397,35 @@ end } * Skip recursion instead of error if file not found - cap confirm file "`file'" + cap confirm file `file' if _rc { local recurse 0 + di as err `" Skipping recursion -- file not found: `file' "' } * Test if it should recurse or not if `recurse' == 1 { + * Keep working on the stub local recursestub "`stub'_`++subf_n'" - noi reprun_recurse, dofile("`file'") /// + + + noi reprun_recurse, dofile(`file') /// output("`output'") /// stub("`recursestub'") local sub_f1 "`r(code_file_run1)'" local sub_f2 "`r(code_file_run2)'" + * Substitute the original sub-dofile with the check/write ones + if !strpos(`"`file'"',`"""') local file `""`file'""' + local run1_line = /// - subinstr(`"`line'"',`"`file'"',`""`sub_f1'""',1) + subinstr(`"`line'"',`file',`""`sub_f1'""',1) local run2_line = /// - subinstr(`"`line'"',`"`file'"',`""`sub_f2'""',1) + subinstr(`"`line'"',`file',`""`sub_f2'""',1) *Correct potential ""path"" to "path" local run1_line = subinstr(`"`run1_line'"',`""""',`"""',.) 
@@ -357,16 +449,21 @@ end * Write foreach/forvalues to block stack and * it's macro name to loop stack - if (strpos("`line_command'","foreach")) | (strpos("`line_command'","forvalues")) { - local block_stack "`line_command' `block_stack' " - local loop_stack = trim("`loop_stack' `secondw'") + if (strpos("`line_command'","foreach")) { + local block_stack "foreach `block_stack' " + local loop_stack = trim("`loop_stack' `looptype'") + } + + if (strpos("`line_command'","forvalues")) { + local block_stack "forvalues `block_stack' " + local loop_stack = trim("`loop_stack' `looptype'") } * Write while to block stack and * also "while" to loop stack as it does not have a macro name if strpos("`line_command'","while") { - local block_stack "`line_command' `block_stack' " - local loop_stack = trim("`loop_stack' `line_command'") + local block_stack "while `block_stack' " + local loop_stack = trim("`loop_stack' while") } } @@ -414,6 +511,7 @@ end return local code_file_run1 "`code_f1'" return local code_file_run2 "`code_f2'" } + end cap program drop org_line_parse @@ -538,6 +636,16 @@ end } } + if "`word'" == "m:m" { + return local command "mmm" + local match = 1 + } + + if "`word'" == "sortseed" { + return local command "sss" + local match = 1 + } + * No match, return OTHER if (`match'==0) { return local command "OTHER" @@ -570,6 +678,9 @@ end local prev_line1 "" local prev_line2 "" + * Local for empty tables + local any_lines_written 0 + * Loop over all lines in the two data files local eof = 0 while `eof' == 0 { @@ -624,12 +735,23 @@ end * Write end to previous table, write the file tree for the next * recursion, and write the beginning of that table - output_writetitle , outputcolumns("`outputcolumns'") - noi write_and_print_output, h_smcl(`h_smcl') /// - l1("`r(botline)'") l2(" ") /// + if (`any_lines_written' == 1 ) { + * Close the table for his file + output_writetitle , outputcolumns("`outputcolumns'") + noi write_and_print_output, h_smcl(`h_smcl') /// + 
l1("`r(botline)'") l2(" ") /// l3(`"{pstd} Stepping into sub-file:{p_end}"') + } + else { + * If the table is empty + output_writetitle , outputcolumns("`outputcolumns'") + noi write_and_print_output, h_smcl(`h_smcl') /// + l1("`r(botline)'") l2("No mismatches and/or changes detected") l3(" ") /// + l4(`"{pstd} Stepping into sub-file:{p_end}"') + } + noi print_filetree_and_verbose_title, /// - files(`" "`orgfile'" "`new_orgfile'" "') h_smcl(`h_smcl') `verbose' `compact' + files(`" `orgfile' "`new_orgfile'" "') h_smcl(`h_smcl') `verbose' `compact' output_writetitle , outputcolumns("`outputcolumns'") noi write_and_print_output, h_smcl(`h_smcl') /// l1("`r(topline)'") l2("`r(state_titles)'") /// @@ -637,7 +759,7 @@ end * Make the recurisive call for next file noi recurse_comp_lines , dirout("`dirout'") stub("`new_stub'") /// - orgfile(`"`orgfile' "`new_orgfile'" "') /// + orgfile(`" `orgfile' "`new_orgfile'" "') /// outputcolumns("`outputcolumns'") h_smcl(`h_smcl') `verbose' `compact' `suppress' * Step back into this data file after the recursive call and: @@ -646,7 +768,7 @@ end noi write_and_print_output, h_smcl(`h_smcl') /// l1(`"{phang} Stepping back into file:{p_end}"') noi print_filetree_and_verbose_title, /// - files(`" "`orgfile'" "') h_smcl(`h_smcl') `verbose' `compact' + files(`" `orgfile' "') h_smcl(`h_smcl') `verbose' `compact' output_writetitle , outputcolumns("`outputcolumns'") noi write_and_print_output, h_smcl(`h_smcl') /// l1("`r(topline)'") l2("`r(state_titles)'") /// @@ -689,6 +811,7 @@ end dsum1("`r(dsum_c1)'") dsum2("`r(dsum_c2)'") dsumm("`r(dsum_m)'") /// loopiteration("`r(loopt)'") noi write_and_print_output, h_smcl(`h_smcl') l1("`r(outputline)'") + local any_lines_written 1 } * Load these lines into pre_line locals for next run @@ -698,10 +821,19 @@ end } * End of this data file else { - * Close the table for his file - output_writetitle , outputcolumns("`outputcolumns'") - noi write_and_print_output, h_smcl(`h_smcl') /// - l1("`r(botline)'") 
l2(" ") + * If the table is not empty + if (`any_lines_written' == 1 ) { + * Close the table for his file + output_writetitle , outputcolumns("`outputcolumns'") + noi write_and_print_output, h_smcl(`h_smcl') /// + l1("`r(botline)'") l2(" ") + } + else { + * If the table is empty + output_writetitle , outputcolumns("`outputcolumns'") + noi write_and_print_output, h_smcl(`h_smcl') /// + l1("`r(botline)'") l2("No mismatches and/or changes detected") l3(" ") + } } } } @@ -965,6 +1097,7 @@ end program define print_filetree_and_verbose_title, rclass syntax , files(string) h_smcl(string) [verbose] [compact] local file_count = 0 + foreach file of local files { noi write_and_print_output, h_smcl(`h_smcl') /// l1(`"{pstd}{c BLC}{hline `++file_count'}> `file'{p_end}"') diff --git a/src/ado/reprun_dataline.ado b/src/ado/reprun_dataline.ado index 8e20f9a..62f779b 100644 --- a/src/ado/reprun_dataline.ado +++ b/src/ado/reprun_dataline.ado @@ -14,7 +14,7 @@ cap program drop reprun_dataline [ /// datatmp(string) /// The tempfile that holds the RNG etc. 
data recursestub(string) /// keep track of sub-do-file - orgsubfile(string) /// + orgsubfile(string asis) /// looptracker(string) /// keeps track of inside a loop ] @@ -36,9 +36,8 @@ cap program drop reprun_dataline local loopt = trim("`looptracker'") * Handle data state - tempfile checksum - cap export delimited using `checksum' , replace - cap qui checksum `checksum' + cap export delimited using "`datatmp'.dta" , replace + cap qui checksum "`datatmp'.dta" cap local srngcheck = `r(checksum)' if _rc local srngcheck = 0 local dsum "`srngcheck'" @@ -50,7 +49,7 @@ cap program drop reprun_dataline * Recurse line else { *Build recurse instructions line - local line `"recurse `recursestub' "`orgsubfile'" "' + local line `"recurse `recursestub' `orgsubfile' "' } *Write line and close file diff --git a/src/dev/run-adodown-util.do b/src/dev/run-adodown-util.do index ea62fe2..7927148 100644 --- a/src/dev/run-adodown-util.do +++ b/src/dev/run-adodown-util.do @@ -7,6 +7,10 @@ global clone "/Users/bbdaniels/GitHub/" } + if "`c(username)'" == "ankritisingh" { + global clone "/Users/ankritisingh/GitHub/" + } + local rk "${clone}/repkit" //ad_sthlp , adfolder("`rk'") commands(repadolog) diff --git a/src/mdhlp/reprun.md b/src/mdhlp/reprun.md index 421cb73..9d37df7 100644 --- a/src/mdhlp/reprun.md +++ b/src/mdhlp/reprun.md @@ -1,6 +1,6 @@ # Title -__reprun__ - This command is used to automate a reproducibility check for a single Stata do-file, or a set of do-files called by a main do-file. The command should be used interactively; __reprun__ will execute one run of the do-file and record the state of Stata after the execution of each line. It will then run the entire do-file a second time and flag all potential reproducibility error causes by comparing the Stata state to the first run _after each line_. Debugging and reporting options are available. 
+__reprun__ - This command is used to automate a reproducibility check for a single Stata do-file, or a set of do-files called by a main do-file. The command should be used interactively; __reprun__ will execute one run of the do-file and record the state of Stata after the execution of each line. It will then run the entire do-file a second time and flag all potential causes of reproducibility errors by comparing the Stata state to the first run _after each line_. Debugging and reporting options are available. # Syntax @@ -12,17 +12,26 @@ By default, __reprun__ will execute the complete do-file specified in "_do-file. | _options_ | Description | |-----------|-------------| -| __**v**erbose__ | Report all lines where Run 1 and Run 2 mismatch __or__ change for any value | -| __**c**ompact__ | Report only lines where Run 1 and Run 2 mismatch __and__ change for either the seed or sort RNG | +| __**v**erbose__ | Report all lines where Run 1 and Run 2 mismatch __**or**__ change for any value | +| __**c**ompact__ | Report only lines where Run 1 and Run 2 mismatch __**and**__ change for either the seed or sort RNG | | __**s**uppress(types)__ | Suppress reporting of state changes that do not result in mismatches for seed RNG state (`rng`), sort order RNG (`srng`), and/or data checksum (`dsum`), for any reporting setting | | __**d**ebug__ | Save all records of Stata states in Run 1 and Run 2 for inspection in the `/reprun/` folder | | __**noc**lear__ | Do not reset the Stata state before beginning reproducibility Run 1 | # Description -The __reprun__ command is intended to be used to check the reproducibility of a do-file or set of do-files (called by a main do-file) that are ready to be transferred to other users or published. The command will ensure that the outputs produced by the do-file or set of do-files are stable across runs, such that they do not produce reproducibility errors caused by incorrectly managed randomness in Stata. 
To do so, __reprun__ will check three key sources of reproducibility failure at each point in execution of the do-file(s): the state of the random number generator, the sort order of the data, and the contents of the data itself. +The __reprun__ command is intended to be used to check the reproducibility of a do-file or set of do-files (called by a main do-file) that are ready to be transferred to other users or published. The command will ensure that the outputs produced by the do-file or set of do-files are stable across runs, such that they do not produce reproducibility errors caused by incorrectly managed randomness in Stata. To do so, __reprun__ will check three key sources of reproducibility failure at each point in execution of the do-file(s): the state of the random number generator, the sort order of the data, and the contents of the data itself (see detailed description below). + +After completing Run 2, __reprun__ will report all lines where there are mismatches between Run 1 and Run 2 in any of these values. Lines where _changes_ lead to _mismatches_ will be highlighted. Problems should be approached top-to-bottom, as solving earlier issues will often resolve later ones. Additionally, addressing issues from left-to-right in the table is effective. RNG states are responsible for most errors, followed by unstable sorts, while data mismatches are typically symptoms of these reproducibility failures rather than causes in and of themselves. + +__Mismatches are defined as follows:__ + +__Seed RNG State:__ A mismatch occurs whenever the RNG state differs from Run 1 to Run 2, _except_ any time the RNG state is exactly equivalent to `set seed 12345` in Run 1 (the initialization default). By default, __reprun__ invokes __clear__ and __set seed 12345__ to match the default Stata state before beginning Run 1. The __noclear__ option prevents this behavior; this is not recommended unless you have a rare issue that you need to check at the very beginning of the file. 
Most projects should quickly set the randomization seed appropriately for replicability. + +__Sort Order RNG:__ Since the sort RNG state should _always_ differ between Run 1 and Run 2, a mismatch is defined as any line where the sort RNG state is advanced _and_ __checksum__ fails to match when compared with the Run 1 data (as a CSV) at the same line. This mismatch occurs when the sort order RNG is used in a command that results in the data taking a different order between the two runs. Users should never manually set the `sortseed` (See `help seed` and `help sortseed`) to override these mismatches; instead, they should implement a unique sort on the data using a command like `isid` (See `help isid`). + +__Data Checksum:__ A mismatch occurs whenever __checksum__ fails to match when comparing the result from the Run 1 data (as a CSV) in Run 2. Users should understand that lines where _only_ the data checksum fails to match are unlikely to be where problems originate in the code; these mismatches are generally consequences of earlier reproducibility failures in randomization or sorting. Users should also note that results from __datasignature__ are only unique up to the sort order of each column independently; hence, we do not use this command. -After completing Run 2, __reprun__ will report all lines where there are mismatches in any of these values between Run 1 and Run 2. Lines where _changes_ lead to _mismatches_ will be highlighted, and an indicator for potentially "cascading" mismatches (those caused by previous changes) will be shown by a vertical line and the absence of the change flag. In general, this structure means that problems should be, to a first approximation, approached top-to-bottom, as solving an earlier issue will often resolve later ones (since later changes may not be the ones causing the mismatches). In addition, we have set things up so that problems should also in general be approached from left-to-right in this table. 
RNG states are responsible for most errors; then unstable sorts; and data mismatches are typically symptoms of these reproducibility failures, rather than causes in and of themselves. # Options @@ -30,9 +39,9 @@ By default, __reprun__ returns a list of _mismatches_ in Stata state between Run ## Line flagging options -The __**v**erbose__ option can be used to produce even more detail than the default. If the __**v**erbose__ option is specified, then any line in which the state changes _during_ Run 1 or Run 2; or mismatches _between_ the runs will be flagged and reported. This is intended to allow the user to do a deep-dive into the function and structure of the do-file's execution. +The __**v**erbose__ option can be used to produce even more detail than the default. If the __**v**erbose__ option is specified, then any line in which the state changes _during_ Run 1 or Run 2; __**or**__ mismatches between the runs will be flagged and reported. This is intended to allow the user to do a deep-dive into the function and structure of the do-file's execution. -The __**c**ompact__ option, by contrast, produces less detailed reporting, but is often a good first step to begin locating issues in the code. If the __**c**ompact__ option is specified, then _only_ those lines which have mismatched seed or sort order RNG changes _during_ Run 1 or Run 2 __and__ mismatches _between_ the runs will be flagged and reported. Data checksum mismatches alone will be ignored; as will RNG mismatches not accompanied by a change in the state. This is intended to reduce the reporting of "cascading" differences, which are caused because some state value changes inconsistently at a single point and remains inconsistent for the remainder of the run (making every subsequent data change a mismatch, for example). +The __**c**ompact__ option, by contrast, produces less detailed reporting, but is often a good first step to begin locating issues in the code. 
If the __**c**ompact__ option is specified, then _only_ those lines which have mismatched seed or sort order RNG changes _during_ Run 1 or Run 2 __**and**__ mismatches between the runs will be flagged and reported. Data checksum mismatches alone will be ignored; as will RNG mismatches not accompanied by a change in the state. This is intended to reduce the reporting of "cascading" differences, which are caused because some state value changes inconsistently at a single point and remains inconsistent for the remainder of the run (making every subsequent data change a mismatch, for example). The __**s**uppress()__ option is used to hide the reporting of changes that do not lead to mismatches (especially when the __**v**erbose__ option is specified) for one or more of the types. In particular, since the sort order RNG frequently changes and should _not_ be forced to match between runs, it will very often have changes that do not produce errors, specifying __**s**uppress(srng)__ will remove a great deal of unhelpful output from the reporting table. To do this for all states, write __**s**uppress(rng srng dsum)__. Suppressing `loop` will clean up the display of loops so that the titles are only shown on the first line; but if combined with `compact` may not display at all. @@ -44,6 +53,151 @@ The __**d**ebug__ option allows the user to save all of the underlying materials By default, __reprun__ invokes __clear__ and __set seed 12345__ to match the default Stata state before beginning Run 1. __**noc**lear__ prevents this behavior. It is not recommended unless you have a rare issue that you need to check at the very beginning of the file, because most projects should very quickly set these states appropriately for reproducibility. +# Examples + +## Example 1 + +This is the most basic usage of __reprun__. 
Specified in any of the following ways, either in the Stata command window or as part of a new do-file, __reprun__ will execute the complete do-file "_myfile.do_" once (Run 1), and record the "seed RNG state", "sort order RNG", and "data checksum" after the execution of every line, as well as the exact data in certain cases. __reprun__ will then execute "_myfile.do_" a second time (Run 2), and find all _changes_ and _mismatches_ in these states throughout Run 2. A table of mismatches will be reported in the Results window, as well as in a SMCL file in a new directory called `/reprun/` in the same location as "_myfile.do_". + +``` +reprun "myfile.do" +``` + +or + +``` +reprun "path/to/folder/myfile.do" +``` + +or + +``` +local myfolder "/path/to/folder" +reprun "`myfolder'/myfile.do" +``` + +## Example 2 + +This example is similar to example 1, but the `/reprun/` directory containing the SMCL file will be stored in the location specified by the __using__ argument. + +``` +reprun "myfile.do" using "path/to/report" +``` + +or + +``` +reprun "path/to/folder/myfile.do" using "path/to/report" +``` + +or + +``` +local myfolder "/path/to/folder" +reprun "`myfolder'/myfile.do" using "`myfolder'/report" +``` + +## Example 3 + +Assume "_myfile1.do_" contains the following code: + +``` +sysuse census, clear +isid state, sort +gen group = runiform() < .5 +``` + +Running a reproducibility check on this do-file using __reprun__ will generate a table listing _mismatches_ in Stata state between Run 1 and Run 2. + +``` +reprun "myfile1.do" +``` + +In "_myfile1.do_", Line 3 (`gen group = runiform() < .5`) generates a new variable `group` based on a random uniform distribution. The RNG state will differ between Run 1 and Run 2 unless the random seed is explicitly set before this command. As a result, a mismatch in the "seed RNG state" as well as "data checksum" will be flagged. 
+ +The issue can be resolved by setting a seed before the command: + +``` +sysuse census, clear +isid state, sort +set seed 346290 +gen group = runiform() < .5 +``` + +Running the reproducibility check on the modified do-file using reprun will confirm that there are no mismatches in Stata state between Run 1 and Run 2. + +## Example 4 + +Using the __**v**erbose__ option generates more detailed tables where any lines across Run 1 and Run 2 mismatch __**or**__ change for any value. In addition to the output in Example 3, it will also report line 2 for __changes__ in "sort order RNG" and "data checksum". + +``` +reprun "myfile1.do", verbose +``` + +## Example 5 + +Assume "_myfile2.do_" contains the following code: + +``` +sysuse auto, clear +sort mpg +gen sequence = _n +``` + +Running a reproducibility check on this do-file using __reprun__ will generate a table listing _mismatches_ in Stata state between Run 1 and Run 2. + +``` +reprun "myfile2.do" +``` + +In "_myfile2.do_", Line 2 sorts the data by the non-unique variable `mpg`, causing the sort order to vary between runs. This results in a mismatch in the "sort order RNG". Consequently, Line 2 and Line 3 (`gen sequence = _n`) will be flagged for "data checksum" mismatches due to the differences in sort order, leading to discrepancies in the generated `sequence` variable. + +The issue can be resolved by sorting the data on a unique combination of variables: + +``` +sysuse auto, clear +sort mpg make +gen sequence = _n +``` + +## Example 6 + +Using the __**c**ompact__ option generates less detailed tables where only lines with mismatched seed or sort order RNG changes during Run 1 or Run 2, and mismatches between the runs, are flagged and reported. The output will be similar to Example 5, except that line 3 will no longer be flagged for "data checksum". + +``` +reprun "myfile2.do", compact +``` + +## Example 7 + +`reprun` will perform a reproducibility check on a do-file, including all do-files it calls recursively. 
For example, the main do-file might contain the following code that calls on "_myfile1.do_" (Example 3) and "_myfile2.do_" (Example 5): + +``` +local myfolder "/path/to/folder" +do "`myfolder'/myfile1.do" +do "`myfolder'/myfile2.do" +``` + +``` +reprun "main.do" +``` + +`reprun` on "_main.do_" performs reproducibility checks across "_main.do_", as well as "_myfile1.do_", and "_myfile2.do_". The output will include tables for each do-file, illustrating the following process: + +- __main.do__: The initial check reveals no mismatches in "_main.do_", indicating no discrepancies introduced directly by it. + +- __Sub-file 1__ ("_myfile1.do_"): `reprun` steps into "_myfile1.do_", where Line 3 is flagged for mismatches, as shown in Example 3. This table will show the issues specific to "_myfile1.do_". + +- __Return to "main.do"__: After checking "_myfile1.do_", `reprun` returns to "_main.do_". Here, Line 2 is flagged because it calls "_myfile1.do_", reflecting the issues from the sub-file. + +- __Sub-file 2__ ("_myfile2.do_"): `reprun` then steps into "_myfile2.do_", where Line 2 is flagged for mismatches, as detailed in Example 5. + +- __Return to "main.do" (final check)__: After checking "_myfile2.do_", `reprun` returns to "_main.do_". Line 3 in "_main.do_" is flagged due to the issues in "_myfile2.do_" propagating up. + +In summary, `reprun` provides a comprehensive view by stepping through each do-file, showing where mismatches occur and how issues in sub-files impact the main do-file. + + + # Feedback, bug reports and contributions Read more about these commands on [this repo](https://github.com/worldbank/repkit) where this package is developed. Please provide any feedback by [opening an issue](https://github.com/worldbank/repkit/issues). PRs with suggestions for improvements are also greatly appreciated. 
diff --git a/src/sthlp/reprun.sthlp b/src/sthlp/reprun.sthlp index 1738252..7c8ae45 100644 --- a/src/sthlp/reprun.sthlp +++ b/src/sthlp/reprun.sthlp @@ -6,7 +6,7 @@ {title:Title} -{phang}{bf:reprun} - This command is used to automate a reproducibility check for a single Stata do-file, or a set of do-files called by a main do-file. The command should be used interactively; {bf:reprun} will execute one run of the do-file and record the state of Stata after the execution of each line. It will then run the entire do-file a second time and flag all potential reproducibility error causes by comparing the Stata state to the first run {it:after each line}. Debugging and reporting options are available. +{phang}{bf:reprun} - This command is used to automate a reproducibility check for a single Stata do-file, or a set of do-files called by a main do-file. The command should be used interactively; {bf:reprun} will execute one run of the do-file and record the state of Stata after the execution of each line. It will then run the entire do-file a second time and flag all potential causes of reproducibility errors by comparing the Stata state to the first run {it:after each line}. Debugging and reporting options are available. 
{p_end} {title:Syntax} @@ -20,10 +20,10 @@ {p_end} {synoptset 15}{...} -{synopthdr:options} +{p2coldent:{it:options}}Description{p_end} {synoptline} -{synopt: {bf:{ul:v}erbose}}Report all lines where Run 1 and Run 2 mismatch {bf:or} change for any value{p_end} -{synopt: {bf:{ul:c}ompact}}Report only lines where Run 1 and Run 2 mismatch {bf:and} change for either the seed or sort RNG{p_end} +{synopt: {bf:{ul:v}erbose}}Report all lines where Run 1 and Run 2 mismatch {bf:{ul:or}} change for any value{p_end} +{synopt: {bf:{ul:c}ompact}}Report only lines where Run 1 and Run 2 mismatch {bf:{ul:and}} change for either the seed or sort RNG{p_end} {synopt: {bf:{ul:s}uppress(types)}}Suppress reporting of state changes that do not result in mismatches for seed RNG state ({inp:rng}), sort order RNG ({inp:srng}), and/or data checksum ({inp:dsum}), for any reporting setting{p_end} {synopt: {bf:{ul:d}ebug}}Save all records of Stata states in Run 1 and Run 2 for inspection in the {inp:/reprun/} folder{p_end} {synopt: {bf:{ul:noc}lear}}Do not reset the Stata state before beginning reproducibility Run 1{p_end} @@ -31,10 +31,22 @@ {title:Description} -{pstd}The {bf:reprun} command is intended to be used to check the reproducibility of a do-file or set of do-files (called by a main do-file) that are ready to be transferred to other users or published. The command will ensure that the outputs produced by the do-file or set of do-files are stable across runs, such that they do not produce reproducibility errors caused by incorrectly managed randomness in Stata. To do so, {bf:reprun} will check three key sources of reproducibility failure at each point in execution of the do-file(s): the state of the random number generator, the sort order of the data, and the contents of the data itself. 
+{pstd}The {bf:reprun} command is intended to be used to check the reproducibility of a do-file or set of do-files (called by a main do-file) that are ready to be transferred to other users or published. The command will ensure that the outputs produced by the do-file or set of do-files are stable across runs, such that they do not produce reproducibility errors caused by incorrectly managed randomness in Stata. To do so, {bf:reprun} will check three key sources of reproducibility failure at each point in execution of the do-file(s): the state of the random number generator, the sort order of the data, and the contents of the data itself (see detailed description below). {p_end} -{pstd}After completing Run 2, {bf:reprun} will report all lines where there are mismatches in any of these values between Run 1 and Run 2. Lines where {it:changes} lead to {it:mismatches} will be highlighted, and an indicator for potentially {c 34}cascading{c 34} mismatches (those caused by previous changes) will be shown by a vertical line and the absence of the change flag. In general, this structure means that problems should be, to a first approximation, approached top-to-bottom, as solving an earlier issue will often resolve later ones (since later changes may not be the ones causing the mismatches). In addition, we have set things up so that problems should also in general be approached from left-to-right in this table. RNG states are responsible for most errors; then unstable sorts; and data mismatches are typically symptoms of these reproducibility failures, rather than causes in and of themselves. +{pstd}After completing Run 2, {bf:reprun} will report all lines where there are mismatches between Run 1 and Run 2 in any of these values. Lines where {it:changes} lead to {it:mismatches} will be highlighted. Problems should be approached top-to-bottom, as solving earlier issues will often resolve later ones. Additionally, addressing issues from left-to-right in the table is effective. 
RNG states are responsible for most errors, followed by unstable sorts, while data mismatches are typically symptoms of these reproducibility failures rather than causes in and of themselves. +{p_end} + +{pstd}{bf:Mismatches are defined as follows:} +{p_end} + +{pstd}{bf:Seed RNG State:} A mismatch occurs whenever the RNG state differs from Run 1 to Run 2, {it:except} any time the RNG state is exactly equivalent to {inp:set seed 12345} in Run 1 (the initialization default). By default, {bf:reprun} invokes {bf:clear} and {bf:set seed 12345} to match the default Stata state before beginning Run 1. The {bf:noclear} option prevents this behavior; this is not recommended unless you have a rare issue that you need to check at the very beginning of the file. Most projects should quickly set the randomization seed appropriately for replicability. +{p_end} + +{pstd}{bf:Sort Order RNG:} Since the sort RNG state should {it:always} differ between Run 1 and Run 2, a mismatch is defined as any line where the sort RNG state is advanced {it:and} {bf:checksum} fails to match when compared with the Run 1 data (as a CSV) at the same line. This mismatch occurs when the sort order RNG is used in a command that results in the data taking a different order between the two runs. Users should never manually set the {inp:sortseed} (See {inp:help seed} and {inp:help sortseed}) to override these mismatches; instead, they should implement a unique sort on the data using a command like {inp:isid} (See {inp:help isid}). +{p_end} + +{pstd}{bf:Data Checksum:} A mismatch occurs whenever {bf:checksum} fails to match when comparing the result from the Run 1 data (as a CSV) in Run 2. Users should understand that lines where {it:only} the data checksum fails to match are unlikely to be where problems originate in the code; these mismatches are generally consequences of earlier reproducibility failures in randomization or sorting. 
Users should also note that results from {bf:datasignature} are only unique up to the sort order of each column independently; hence, we do not use this command. {p_end} {title:Options} @@ -44,10 +56,10 @@ {dlgtab:Line flagging options} -{pstd}The {bf:{ul:v}erbose} option can be used to produce even more detail than the default. If the {bf:{ul:v}erbose} option is specified, then any line in which the state changes {it:during} Run 1 or Run 2; or mismatches {it:between} the runs will be flagged and reported. This is intended to allow the user to do a deep-dive into the function and structure of the do-file{c 39}s execution. +{pstd}The {bf:{ul:v}erbose} option can be used to produce even more detail than the default. If the {bf:{ul:v}erbose} option is specified, then any line in which the state changes {it:during} Run 1 or Run 2; {bf:{ul:or}} mismatches between the runs will be flagged and reported. This is intended to allow the user to do a deep-dive into the function and structure of the do-file{c 39}s execution. {p_end} -{pstd}The {bf:{ul:c}ompact} option, by contrast, produces less detailed reporting, but is often a good first step to begin locating issues in the code. If the {bf:{ul:c}ompact} option is specified, then {it:only} those lines which have mismatched seed or sort order RNG changes {it:during} Run 1 or Run 2 {bf:and} mismatches {it:between} the runs will be flagged and reported. Data checksum mismatches alone will be ignored; as will RNG mismatches not accompanied by a change in the state. This is intended to reduce the reporting of {c 34}cascading{c 34} differences, which are caused because some state value changes inconsistently at a single point and remains inconsistent for the remainder of the run (making every subsequent data change a mismatch, for example). +{pstd}The {bf:{ul:c}ompact} option, by contrast, produces less detailed reporting, but is often a good first step to begin locating issues in the code. 
If the {bf:{ul:c}ompact} option is specified, then {it:only} those lines which have mismatched seed or sort order RNG changes {it:during} Run 1 or Run 2 {bf:{ul:and}} mismatches between the runs will be flagged and reported. Data checksum mismatches alone will be ignored; as will RNG mismatches not accompanied by a change in the state. This is intended to reduce the reporting of {c 34}cascading{c 34} differences, which are caused because some state value changes inconsistently at a single point and remains inconsistent for the remainder of the run (making every subsequent data change a mismatch, for example). {p_end} {pstd}The {bf:{ul:s}uppress()} option is used to hide the reporting of changes that do not lead to mismatches (especially when the {bf:{ul:v}erbose} option is specified) for one or more of the types. In particular, since the sort order RNG frequently changes and should {it:not} be forced to match between runs, it will very often have changes that do not produce errors, specifying {bf:{ul:s}uppress(srng)} will remove a great deal of unhelpful output from the reporting table. To do this for all states, write {bf:{ul:s}uppress(rng srng dsum)}. Suppressing {inp:loop} will clean up the display of loops so that the titles are only shown on the first line; but if combined with {inp:compact} may not display at all. @@ -63,6 +75,142 @@ {pstd}By default, {bf:reprun} invokes {bf:clear} and {bf:set seed 12345} to match the default Stata state before beginning Run 1. {bf:{ul:noc}lear} prevents this behavior. It is not recommended unless you have a rare issue that you need to check at the very beginning of the file, because most projects should very quickly set these states appropriately for reproducibility. {p_end} +{title:Examples} + +{dlgtab:Example 1} + +{pstd}This is the most basic usage of {bf:reprun}. 
Specified in any of the following ways, either in the Stata command window or as part of a new do-file, {bf:reprun} will execute the complete do-file {c 34}{it:myfile.do}{c 34} once (Run 1), and record the {c 34}seed RNG state{c 34}, {c 34}sort order RNG{c 34}, and {c 34}data checksum{c 34} after the execution of every line, as well as the exact data in certain cases. {bf:reprun} will then execute {c 34}{it:myfile.do}{c 34} a second time (Run 2), and find all {it:changes} and {it:mismatches} in these states throughout Run 2. A table of mismatches will be reported in the Results window, as well as in a SMCL file in a new directory called {inp:/reprun/} in the same location as {c 34}{it:myfile.do}{c 34}. +{p_end} + +{input}{space 8}reprun "myfile.do" +{text} +{pstd}or +{p_end} + +{input}{space 8}reprun "path/to/folder/myfile.do" +{text} +{pstd}or +{p_end} + +{input}{space 8}local myfolder "/path/to/folder" +{space 8}reprun "`myfolder'/myfile.do" +{text} +{dlgtab:Example 2} + +{pstd}This example is similar to example 1, but the {inp:/reprun/} directory containing the SMCL file will be stored in the location specified by the {bf:using} argument. +{p_end} + +{input}{space 8}reprun "myfile.do" using "path/to/report" +{text} +{pstd}or +{p_end} + +{input}{space 8}reprun "path/to/folder/myfile.do" using "path/to/report" +{text} +{pstd}or +{p_end} + +{input}{space 8}local myfolder "/path/to/folder" +{space 8}reprun "`myfolder'/myfile.do" using "`myfolder'/report" +{text} +{dlgtab:Example 3} + +{pstd}Assume {c 34}{it:myfile1.do}{c 34} contains the following code: +{p_end} + +{input}{space 8}sysuse census, clear +{space 8}isid state, sort +{space 8}gen group = runiform() < .5 +{text} +{pstd}Running a reproducibility check on this do-file using {bf:reprun} will generate a table listing {it:mismatches} in Stata state between Run 1 and Run 2. 
+{p_end} + +{input}{space 8}reprun "myfile1.do" +{text} +{pstd}In {c 34}{it:myfile1.do}{c 34}, Line 3 ({inp:gen group = runiform() < .5}) generates a new variable {inp:group} based on a random uniform distribution. The RNG state will differ between Run 1 and Run 2 unless the random seed is explicitly set before this command. As a result, a mismatch in the {c 34}seed RNG state{c 34} as well as {c 34}data checksum{c 34} will be flagged. +{p_end} + +{pstd}The issue can be resolved by setting a seed before the command: +{p_end} + +{input}{space 8}sysuse census, clear +{space 8}isid state, sort +{space 8}set seed 346290 +{space 8}gen group = runiform() < .5 +{text} +{pstd}Running the reproducibility check on the modified do-file using reprun will confirm that there are no mismatches in Stata state between Run 1 and Run 2. +{p_end} + +{dlgtab:Example 4} + +{pstd}Using the {bf:{ul:v}erbose} option generates more detailed tables where any lines across Run 1 and Run 2 mismatch {bf:{ul:or}} change for any value. In addition to the output in Example 3, it will also report line 2 for {bf:changes} in {c 34}sort order RNG{c 34} and {c 34}data checksum{c 34}. +{p_end} + +{input}{space 8}reprun "myfile1.do", verbose +{text} +{dlgtab:Example 5} + +{pstd}Assume {c 34}{it:myfile2.do}{c 34} contains the following code: +{p_end} + +{input}{space 8}sysuse auto, clear +{space 8}sort mpg +{space 8}gen sequence = _n +{text} +{pstd}Running a reproducibility check on this do-file using {bf:reprun} will generate a table listing {it:mismatches} in Stata state between Run 1 and Run 2. +{p_end} + +{input}{space 8}reprun "myfile2.do" +{text} +{pstd}In {c 34}{it:myfile2.do}{c 34}, Line 2 sorts the data by the non-unique variable {inp:mpg}, causing the sort order to vary between runs. This results in a mismatch in the {c 34}sort order RNG{c 34}. 
Consequently, Line 2 and Line 3 ({inp:gen sequence = _n}) will be flagged for {c 34}data checksum{c 34} mismatches due to the differences in sort order, leading to discrepancies in the generated {inp:sequence} variable. +{p_end} + +{pstd}The issue can be resolved by sorting the data on a unique combination of variables: +{p_end} + +{input}{space 8}sysuse auto, clear +{space 8}sort mpg make +{space 8}gen sequence = _n +{text} +{dlgtab:Example 6} + +{pstd}Using the {bf:{ul:c}ompact} option generates less detailed tables where only lines with mismatched seed or sort order RNG changes during Run 1 or Run 2, and mismatches between the runs, are flagged and reported. The output will be similar to Example 5, except that line 3 will no longer be flagged for {c 34}data checksum{c 34}. +{p_end} + +{input}{space 8}reprun "myfile2.do", compact +{text} +{dlgtab:Example 7} + +{pstd}{inp:reprun} will perform a reproducibility check on a do-file, including all do-files it calls recursively. For example, the main do-file might contain the following code that calls on {c 34}{it:myfile1.do}{c 34} (Example 3) and {c 34}{it:myfile2.do}{c 34} (Example 5): +{p_end} + +{input}{space 8}local myfolder "/path/to/folder" +{space 8}do "`myfolder'/myfile1.do" +{space 8}do "`myfolder'/myfile2.do" +{text} +{input}{space 8}reprun "main.do" +{text} +{pstd}{inp:reprun} on {c 34}{it:main.do}{c 34} performs reproducibility checks across {c 34}{it:main.do}{c 34}, as well as {c 34}{it:myfile1.do}{c 34}, and {c 34}{it:myfile2.do}{c 34}. The output will include tables for each do-file, illustrating the following process: +{p_end} + +{pstd}- {bf:main.do}: The initial check reveals no mismatches in {c 34}{it:main.do}{c 34}, indicating no discrepancies introduced directly by it. +{p_end} + +{pstd}- {bf:Sub-file 1} ({c 34}{it:myfile1.do}{c 34}) : {inp:reprun} steps into {c 34}{it:myfile1.do}{c 34}, where Line 3 is flagged for mismatches, as shown in Example 3. 
This table will show the issues specific to {c 34}{it:myfile1.do}{c 34}. +{p_end} + +{pstd}- {bf:Return to {c 34}main.do{c 34}}: After checking {c 34}{it:myfile1.do}{c 34}, {inp:reprun} returns to {c 34}{it:main.do}{c 34}. Here, Line 2 is flagged because it calls {c 34}{it:myfile1.do}{c 34}, reflecting the issues from the sub-file. +{p_end} + +{pstd}- {bf:Sub-file 2} ({c 34}{it:myfile2.do}{c 34}): {inp:reprun} then steps into {c 34}{it:myfile2.do}{c 34}, where Line 2 is flagged for mismatches, as detailed in Example 5. +{p_end} + +{pstd}- {bf:Return to {c 34}main.do{c 34} (final check) }: After checking {c 34}{it:myfile2.do}{c 34}, {inp:reprun} returns to {c 34}{it:main.do}{c 34}. Line 3 in {c 34}{it:main.do}{c 34} is flagged due to the issues in {c 34}{it:myfile2.do}{c 34} propagating up. +{p_end} + +{pstd}In summary, {inp:reprun} provides a comprehensive view by stepping through each do-file, showing where mismatches occur and how issues in sub-files impact the main do-file. +{p_end} + {title:Feedback, bug reports and contributions} {pstd}Read more about these commands on {browse "https://github.com/worldbank/repkit":this repo} where this package is developed. Please provide any feedback by {browse "https://github.com/worldbank/repkit/issues":opening an issue}. PRs with suggestions for improvements are also greatly appreciated. 
diff --git a/src/tests/reprun/reprun.do b/src/tests/reprun/reprun.do index 1bc2a10..1c1347d 100644 --- a/src/tests/reprun/reprun.do +++ b/src/tests/reprun/reprun.do @@ -10,6 +10,10 @@ global repkit_clone "/Users/bbdaniels/GitHub/repkit" } + if "`c(username)'" == "wb558768" { + global repkit_clone "C:/Users/wb558768/Documents/GitHub/repkit" + } + * Set global to ado_fldr global src_fldr "${repkit_clone}/src" global test_fldr "${src_fldr}/tests" @@ -20,6 +24,7 @@ global lf "${run_fldr}/loop-file" global wca "${run_fldr}/with-clear-all" global waf "${run_fldr}/with-ado-folder" + global swu "${run_fldr}/stable-with-unstable" * Install the version of this package in * the plus-ado folder in the test folder @@ -36,12 +41,17 @@ cap mkdir "${tf}/output-1" cap mkdir "${tf}/output-2" cap mkdir "${tf}/output-3" + cap mkdir "${tf}/comments" + + reprun "${tf}/comments.do" using "${tf}/comments" , debug + reprun "${tf}/target-1.do" using "${tf}/output-1" , debug reprun "${tf}/target-1.do" using "${tf}/output-1" , v debug reprun "${tf}/target-1.do" using "${tf}/output-1" , c debug reprun "${tf}/target-1.do" using "${tf}/output-2" , s(loop) reprun "${tf}/target-1.do" using "${tf}/output-3" , v s(rng) reprun "${tf}/target-1.do" , verbose + reprun "${tf}/target-mmm.do" * Example A - single file cap mkdir "${sf}/output" @@ -67,4 +77,8 @@ cap mkdir "${waf}/output" reprun "${waf}/main.do" using "${waf}/output" , debug + * Example g - output with stable and unstable do-files + cap mkdir "${swu}/output" + reprun "${swu}/main.do" using "${swu}/output" + net install repkit, from("${src_fldr}") replace diff --git a/src/tests/reprun/stable-with-unstable/main.do b/src/tests/reprun/stable-with-unstable/main.do new file mode 100644 index 0000000..4966674 --- /dev/null +++ b/src/tests/reprun/stable-with-unstable/main.do @@ -0,0 +1,10 @@ +/******************************************************************************* +To test how results are displayed if some do-files have changes and others 
don't +*******************************************************************************/ + +local fldr "${run_fldr}/stable-with-unstable" + + +do "`fldr'/test1.do" +do "`fldr'/test2.do" +do "`fldr'/test3.do" diff --git a/src/tests/reprun/stable-with-unstable/test1.do b/src/tests/reprun/stable-with-unstable/test1.do new file mode 100644 index 0000000..02e572d --- /dev/null +++ b/src/tests/reprun/stable-with-unstable/test1.do @@ -0,0 +1,9 @@ +* test 1 (unstable results) + +* Load data +sysuse auto, clear + +bys mpg: /// sorting by mpg +gen dup = cond(_N==1,0,_n) + +drop if dup > 1 \ No newline at end of file diff --git a/src/tests/reprun/stable-with-unstable/test2.do b/src/tests/reprun/stable-with-unstable/test2.do new file mode 100644 index 0000000..530e87f --- /dev/null +++ b/src/tests/reprun/stable-with-unstable/test2.do @@ -0,0 +1,9 @@ +* test 2 (stable results) + +sysuse auto, clear + +bys make: /// sorting by make +gen dup = cond(_N==1,0,_n) + +drop if dup > 1 + \ No newline at end of file diff --git a/src/tests/reprun/stable-with-unstable/test3.do b/src/tests/reprun/stable-with-unstable/test3.do new file mode 100644 index 0000000..66e0257 --- /dev/null +++ b/src/tests/reprun/stable-with-unstable/test3.do @@ -0,0 +1,7 @@ +* test 3 (also stable) + +sysuse auto, clear + +bys foreign: gen count = _N + + \ No newline at end of file diff --git a/src/tests/reprun/targets/comments.do b/src/tests/reprun/targets/comments.do new file mode 100644 index 0000000..25fd3d0 --- /dev/null +++ b/src/tests/reprun/targets/comments.do @@ -0,0 +1,37 @@ + +* Bad comment +* Worse comment " +/* a comment " + from the " + bad place */ +/* Weird comment */ +// TEST COMMENT + +// TEST COMMENT do + +// TEST COMMENT [ +// TEST COMMENT ( +// TEST COMMENT * + +do "target-2.do" // missing do-file do "${tf}/target-dontrun.do" + +* do "${tf}/target-dontrun.do" + * do "${tf}/target-dontrun.do" + + +// do "${tf}/target-dontrun.do" + +// do "${tf}/target-dontrun.do" + +/* a comment " + from the " + 
do bad place */ + + /* do "${tf}/target-dontrun.do" */ + /* do "${tf}/target-dontrun.do" */ +/* +do "${tf}/target-dontrun.do" +do nothing +*/ + +// EOF diff --git a/src/tests/reprun/targets/target-1.do b/src/tests/reprun/targets/target-1.do index eec6ba3..163cad0 100644 --- a/src/tests/reprun/targets/target-1.do +++ b/src/tests/reprun/targets/target-1.do @@ -1,4 +1,13 @@ -// +/************************************************************* + +Name of do file: Data_Clean.do +3 +Author: Maria J. Urbina Date: 14/05/2024 + +Description: Dofile in charge of cleanning the data +45 +Input files: Output files: +********************/ clear @@ -17,7 +26,11 @@ di as err "Should be 6165... `r(mean)'" #d cr -// TEST COMMENT +local check : var lab price`domain_num' + +su /// error 196 + price + global something "nothing" @@ -42,6 +55,7 @@ gen y = rnormal() cap duplicates drop make , force + if (1 == 1) & (1 == 1) do "${tf}/target-2.do" diff --git a/src/tests/reprun/targets/target-dontrun.do b/src/tests/reprun/targets/target-dontrun.do new file mode 100644 index 0000000..70de3a8 --- /dev/null +++ b/src/tests/reprun/targets/target-dontrun.do @@ -0,0 +1,2 @@ + +di as err "Wasn't supposed to run this!" diff --git a/src/tests/reprun/targets/target-mmm.do b/src/tests/reprun/targets/target-mmm.do new file mode 100644 index 0000000..3be413c --- /dev/null +++ b/src/tests/reprun/targets/target-mmm.do @@ -0,0 +1,49 @@ +// + +clear + +sysuse auto.dta + + tempfile a + save `a' , replace + +merge m:m foreign using `a' , nogen + +isid make, sort +sort foreign + +local MYFAKELOCAL = `MYFAKELOCAL' + 1 + +su price + +di as err "Should be 6165... `r(mean)'" + +di as err "Should be 6165... 
`r(mean)'" + +#d cr + +isid make, sort + +bys foreign : gen x2 = _N + +// TEST COMMENT + +global something "nothing" + +expand 2 , gen(check) + +isid make check, sort + +sort foreign + +di as err "RUNFILE: THIS VALUE SHOULD ALWAYS BE THE NUMBER ONE: `MYFAKELOCAL'" + +gen x = _n + +set sortseed 12345 + +cap duplicates drop make , force + + + +//