diff --git a/.gitignore b/.gitignore index e0dfab9d..915b07c0 100644 --- a/.gitignore +++ b/.gitignore @@ -29,14 +29,14 @@ Signals/pyData/* __*__* # Testing and archive +Logs/ deleteme* Signals/Debug/* Signals/pyCode/Debug/* Signals/zzz-archive/* Signals/Human/* -dev_run_pred.sh -devone.sh -run_batch_predictors.py +dev*.sh +dev_batch_predictors.py # Other Portfolios/Code/Rplots.pdf diff --git a/Signals/Logs/02_CreatePredictors_console.txt b/Signals/Logs/02_CreatePredictors_console.txt deleted file mode 100644 index cc5850ae..00000000 --- a/Signals/Logs/02_CreatePredictors_console.txt +++ /dev/null @@ -1,3633 +0,0 @@ -Create Predictors Log - Started at 2025-08-31 08:11:41 -================================================================================ - - -🔄 Creating Signal Master Table: SignalMasterTable.py -============================================================ -Starting SignalMasterTable.py... -Loading monthly CRSP data... -Loaded monthlyCRSP: 5153763 rows, 9 columns -Filtering for common stocks and major exchanges... -After filtering: 4047630 rows -Merging with m_aCompustat... -After Compustat merge: 4047630 rows -Adding auxiliary variables... -After adding auxiliary vars: 4047630 rows, 13 columns -Checking for IBES-CRSP linking table... -Adding IBES-CRSP link... -After IBES link merge: 4047630 rows, 14 columns -Checking for OptionMetrics-CRSP linking table... -Adding OptionMetrics-CRSP link... -After OptionMetrics link merge: 4047630 rows, 16 columns -Saving SignalMasterTable... 
-SignalMasterTable saved to: ../pyData/Intermediate/SignalMasterTable.parquet -Final shape: 4047630 rows, 16 columns -Column names: ['permno', 'ret', 'prc', 'shrcd', 'exchcd', 'sicCRSP', 'ticker', 'time_avail_m', 'mve_c', 'gvkey', 'sicCS', 'NYSE', 'bh1m', 'tickerIBES', 'secid', 'om_score'] -============================================================ -✅ Completed: SignalMasterTable.py -Execution time: 5.00 seconds - - -🔄 Starting: Accruals.py -============================================================ -Starting Accruals.py... -Loading m_aCompustat data... -Loaded m_aCompustat: 3625491 rows, 10 columns -Deduplicating by permno time_avail_m... -After deduplication: 3625491 rows -Setting up panel data structure... -Creating lag variables... -Calculating Accruals... -Calculated Accruals for 3276202 observations -saving Accruals -Saved 3276202 rows to ../pyData/Predictors/Accruals.csv -Accruals.py completed successfully -============================================================ -✅ Completed: Accruals.py -Execution time: 1.77 seconds - - -🔄 Starting: AccrualsBM.py -============================================================ -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/.venv/lib/python3.13/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: divide by zero encountered in log - result = getattr(ufunc, method)(*inputs, **kwargs) -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/.venv/lib/python3.13/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: invalid value encountered in log - result = getattr(ufunc, method)(*inputs, **kwargs) -Starting AccrualsBM.py... -Loading m_aCompustat data... -Loaded m_aCompustat: 3625491 rows, 10 columns -Deduplicating by permno time_avail_m... -After deduplication: 3625491 rows -Merging with SignalMasterTable... -After merging with SignalMasterTable: 4047630 rows -Setting up panel data structure... -Calculating BM... 
-Creating lag variables for accruals... -Calculating tempacc... -Creating BM quintiles... -Creating accruals quintiles... -Generating AccrualsBM signal... -Calculated AccrualsBM for 220635 observations -saving AccrualsBM -Saved 220635 rows to ../pyData/Predictors/AccrualsBM.csv -AccrualsBM.py completed successfully -============================================================ -✅ Completed: AccrualsBM.py -Execution time: 4.81 seconds - - -🔄 Starting: AdExp.py -============================================================ -Starting AdExp.py... -Loading m_aCompustat data... -Loaded m_aCompustat: 3625491 rows, 3 columns -Merging with SignalMasterTable... -After merging with SignalMasterTable: 4047630 rows -Calculating AdExp... -Calculated AdExp for 1049037 observations -saving AdExp -Saved 1049037 rows to ../pyData/Predictors/AdExp.csv -AdExp.py completed successfully -============================================================ -✅ Completed: AdExp.py -Execution time: 1.85 seconds - - -🔄 Starting: AgeIPO.py -============================================================ -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/Predictors/AgeIPO.py:85: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'nan' has dtype incompatible with bool, please explicitly cast to a compatible dtype first. - df.loc[df['IPOdate'].isna(), 'tempipo'] = np.nan -Starting AgeIPO.py... -Loading SignalMasterTable data... -Loaded SignalMasterTable: 4047630 rows, 2 columns -Merging with IPODates... -After merging with IPODates: 4047630 rows -Calculating recent IPO filter... -Calculating AgeIPO... -Applying minimum IPO firms per month filter... 
-Calculated AgeIPO for 353486 observations -saving AgeIPO -Saved 353486 rows to ../pyData/Predictors/AgeIPO.csv -AgeIPO.py completed successfully -============================================================ -✅ Completed: AgeIPO.py -Execution time: 1.04 seconds - - -🔄 Starting: AM.py -============================================================ -AM predictor saved successfully -============================================================ -✅ Completed: AM.py -Execution time: 4.47 seconds - - -🔄 Starting: AnalystRevision.py -============================================================ -Starting AnalystRevision.py... -Loading and preparing IBES data... -Prepared IBES data: 2382154 rows, 3 columns -Loading SignalMasterTable... -Loaded SignalMasterTable: 4047630 rows, 3 columns -Merging with IBES data... -After merging with IBES data: 4047630 rows -Setting up panel data (sorting by permno, time_avail_m)... -Calculating 1-month lag and AnalystRevision... -Calculated AnalystRevision for 1923490 observations -saving AnalystRevision -Saved 1923490 rows to ../pyData/Predictors/AnalystRevision.csv -AnalystRevision.py completed successfully -============================================================ -✅ Completed: AnalystRevision.py -Execution time: 1.87 seconds - - -🔄 Starting: AssetGrowth.py -============================================================ -Starting AssetGrowth.py... -Loading m_aCompustat data... -Loaded m_aCompustat: 3625491 rows, 4 columns -Setting up panel data (sorting by permno, time_avail_m)... -Calculating 12-month lag and AssetGrowth... 
-Calculated AssetGrowth for 3311751 observations -saving AssetGrowth -Saved 3311751 rows to ../pyData/Predictors/AssetGrowth.csv -AssetGrowth.py completed successfully -============================================================ -✅ Completed: AssetGrowth.py -Execution time: 1.39 seconds - - -🔄 Starting: Beta.py -============================================================ -================================================================================ -🏗️ Beta.py -Generating CAPM Beta predictor using direct polars-ols rolling regression -================================================================================ -📊 Loading monthly CRSP, FF, and Market data... -Loading monthlyCRSP.parquet... -Loaded CRSP: 5,153,763 monthly observations -Loading monthlyFF.parquet... -Loaded FF: 1,187 monthly observations -Loading monthlyMarket.parquet... -Loaded Market: 1,189 monthly observations -🔗 Merging datasets... -After merging: 5,150,010 observations -🧮 Computing CAPM Beta using direct polars-ols rolling 60-observation regressions... -Computing rolling regressions by permno using 60-observation windows... -Rolling window regression with minimum 20 observations per window -Processing 38,835 unique permnos... -Generated Beta values: 4,353,773 observations -Beta summary stats: - Mean: 0.9936 - Std: 0.7517 - Min: -29.7167 - Max: 52.6339 -💾 Saving Beta predictor... 
-saving Beta -Saved 4353773 rows to ../pyData/Predictors/Beta.csv -✅ Beta.csv saved successfully -================================================================================ -✅ Beta.py Complete -CAPM Beta predictor generated using rolling 60-observation regression windows -================================================================================ -============================================================ -✅ Completed: Beta.py -Execution time: 6.22 seconds - - -🔄 Starting: BetaLiquidityPS.py -============================================================ -================================================================================ -🏗️ BetaLiquidityPS.py -Generating Pastor-Stambaugh liquidity beta predictor -================================================================================ -📊 Loading monthly CRSP, FF, and Liquidity data... -Loading monthlyCRSP.parquet... -Loaded CRSP: 5,153,763 monthly observations -Loading monthlyFF.parquet... -Loaded FF: 1,187 monthly observations -Loading monthlyLiquidity.parquet... -Loaded Liquidity: 749 monthly observations -🔗 Merging datasets... -After merging: 5,150,010 observations -🧮 Computing Pastor-Stambaugh liquidity beta using direct polars-ols rolling 60-observation 4-factor regressions... -Computing rolling 4-factor regressions for 38,835 unique permnos... -Using 60-observation rolling windows with 36-observation minimum for each stock -Generated BetaLiquidityPS values: 3,479,410 observations -BetaLiquidityPS summary stats: - Mean: -0.0021 - Std: 0.4792 - Min: -23.6664 - Max: 41.6906 -💾 Saving BetaLiquidityPS predictor... 
-saving BetaLiquidityPS -Saved 3479410 rows to ../pyData/Predictors/BetaLiquidityPS.csv -✅ BetaLiquidityPS.csv saved successfully -================================================================================ -✅ BetaLiquidityPS.py Complete -Pastor-Stambaugh liquidity beta predictor generated using rolling 4-factor regressions -================================================================================ -============================================================ -✅ Completed: BetaLiquidityPS.py -Execution time: 10.28 seconds - - -🔄 Starting: BetaTailRisk.py -============================================================ -================================================================================ -🏗️ BetaTailRisk.py -Generating Tail Risk Beta predictor -================================================================================ -📊 Part 1: Creating monthly tail risk factor from daily returns... -Loading dailyCRSP.parquet... -Loaded daily CRSP: 107,662,961 daily observations -Calculating 5th percentile returns by month... -Generated monthly 5th percentiles for 1,188 months -Filtering to tail observations (bottom 5%) and calculating tail excess returns... -Filtered to 5,325,782 tail observations -Generated monthly tail risk factor for 1,188 months -Saved TailRisk.parquet -📊 Part 2: Computing tail risk betas from monthly returns... -Loading monthlyCRSP.parquet... -Loaded monthly CRSP: 5,153,763 monthly observations -Merging with tail risk factor... -After merging: 5,153,763 observations -Computing rolling 120-month tail risk betas for 38,843 unique permnos... -Rolling 120-month regression windows with minimum 72 observations per permno -Generated BetaTailRisk values: 2,332,084 observations -BetaTailRisk summary stats: - Mean: 0.6372 - Std: 0.7609 - Min: -96.2499 - Max: 18.9120 -💾 Saving BetaTailRisk predictor... 
-saving BetaTailRisk -Saved 2332084 rows to ../pyData/Predictors/BetaTailRisk.csv -✅ BetaTailRisk.csv saved successfully -================================================================================ -✅ BetaTailRisk.py Complete -Tail risk beta predictor generated using polars rolling window regression -================================================================================ -============================================================ -✅ Completed: BetaTailRisk.py -Execution time: 7.71 seconds - - -🔄 Starting: BidAskSpread.py -============================================================ -Starting BidAskSpread.py... -Loading BAspreadsCorwin data... -Loaded BAspreadsCorwin: 4481622 rows, 3 columns -BidAskSpread available for 4481622 observations -saving BidAskSpread -Saved 4481622 rows to ../pyData/Predictors/BidAskSpread.csv -BidAskSpread.py completed successfully -============================================================ -✅ Completed: BidAskSpread.py -Execution time: 0.50 seconds - - -🔄 Starting: BM.py -============================================================ -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/.venv/lib/python3.13/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: divide by zero encountered in log - result = getattr(ufunc, method)(*inputs, **kwargs) -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/.venv/lib/python3.13/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: invalid value encountered in log - result = getattr(ufunc, method)(*inputs, **kwargs) -Starting BM.py... -Loading m_aCompustat data... -Loading SignalMasterTable... -Loaded m_aCompustat: 3625491 rows -Loaded SignalMasterTable: 4047630 rows -Merging with SignalMasterTable... -After merge: 4047630 rows -Setting up panel data structure... -Creating 6-month lag for market equity... -Calculating BM signal... 
-Calculated BM for 2715230 observations -BM.py completed successfully -============================================================ -✅ Completed: BM.py -Execution time: 4.83 seconds - - -🔄 Starting: BMdec.py -============================================================ -Starting BMdec predictor... -Loading m_aCompustat data... -Loaded 3,625,491 Compustat observations -After deduplication: 3,625,491 observations -Loading monthlyCRSP data... -Loaded 5,153,763 CRSP observations -Merging Compustat and CRSP data... -After merge: 3,362,017 observations -Constructing BMdec signal... -Generated BMdec values for 2,998,697 observations -Saving predictor... -saving BMdec -Saved 2998697 rows to ../pyData/Predictors/BMdec.csv -BMdec predictor completed successfully! -============================================================ -✅ Completed: BMdec.py -Execution time: 2.67 seconds - - -🔄 Starting: BookLeverage.py -============================================================ -Starting BookLeverage.py... -Loading m_aCompustat data... -Loaded m_aCompustat: 3625491 rows, 10 columns -Deduplicating by permno time_avail_m... -After deduplication: 3625491 rows -Calculating tempPS with fallback logic... -Calculating tempSE with fallback logic... -Calculating BookLeverage... -Calculated BookLeverage for 3607287 observations -saving BookLeverage -Saved 3607287 rows to ../pyData/Predictors/BookLeverage.csv -BookLeverage.py completed successfully -============================================================ -✅ Completed: BookLeverage.py -Execution time: 1.49 seconds - - -🔄 Starting: BrandInvest.py -============================================================ -Starting BrandInvest predictor... -Loading a_aCompustat data... -Loaded 302,326 annual Compustat observations -Constructing BrandInvest signal... -Calculating brand capital accumulation... -Applying depreciation and accumulation... -Applying industry filters... 
-Filtered out 65,241 observations (utilities/financials) -Kept only December observations: 145,800 -Expanding annual data to monthly... -After monthly expansion: 1,749,600 observations -After deduplication: 1,749,600 observations -Generated BrandInvest values for 509,472 observations -Saving predictor... -saving BrandInvest -Saved 509472 rows to ../pyData/Predictors/BrandInvest.csv -BrandInvest predictor completed successfully! -============================================================ -✅ Completed: BrandInvest.py -Execution time: 32.20 seconds - - -🔄 Starting: Cash.py -============================================================ -Starting Cash predictor... -Loading m_QCompustat data... -Loaded 5,444,089 quarterly Compustat observations -Processing quarterly data... -After deduplication: 1,146,836 observations -Expanding quarterly data to monthly... -After monthly expansion: 3,440,508 observations -After removing overlapping announcements: 3,197,454 observations -Loading SignalMasterTable... -Loaded SignalMasterTable: 3,041,661 observations -Merging with SignalMasterTable... -After merge: 2,246,778 observations -Constructing Cash signal... -Generated Cash values for 2,246,763 observations -Saving predictor... -saving Cash -Saved 2246763 rows to ../pyData/Predictors/Cash.csv -Cash predictor completed successfully! -============================================================ -✅ Completed: Cash.py -Execution time: 3.17 seconds - - -🔄 Starting: CashProd.py -============================================================ -Starting CashProd.py... -Loading m_aCompustat data... -Loaded m_aCompustat: 3625491 rows, 4 columns -Deduplicating by permno time_avail_m... -After deduplication: 3625491 rows -Merging with SignalMasterTable... -After merging with SignalMasterTable: 3041661 rows -Calculating CashProd... 
-Calculated CashProd for 3038208 observations -saving CashProd -Saved 3038208 rows to ../pyData/Predictors/CashProd.csv -CashProd.py completed successfully -============================================================ -✅ Completed: CashProd.py -Execution time: 1.93 seconds - - -🔄 Starting: CBOperProf.py -============================================================ -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/.venv/lib/python3.13/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: divide by zero encountered in log - result = getattr(ufunc, method)(*inputs, **kwargs) -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/.venv/lib/python3.13/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: invalid value encountered in log - result = getattr(ufunc, method)(*inputs, **kwargs) -Starting CBOperProf predictor... -Loading SignalMasterTable... -Loaded SignalMasterTable: 4,047,630 observations -Loading m_aCompustat... -Loaded m_aCompustat: 3,625,491 observations -Merging SignalMasterTable with m_aCompustat... -After merge: 4,047,630 observations -Constructing CBOperProf signal... -Generated CBOperProf values for 2,283,897 observations -Excluded 1,731,811 observations due to filtering criteria -Saving predictor... -saving CBOperProf -Saved 2283897 rows to ../pyData/Predictors/CBOperProf.csv -CBOperProf predictor completed successfully! -============================================================ -✅ Completed: CBOperProf.py -Execution time: 6.23 seconds - - -🔄 Starting: CF.py -============================================================ -Starting CF.py... -Loading m_aCompustat data... -Loaded m_aCompustat: 3625491 rows, 5 columns -Deduplicating by permno time_avail_m... -After deduplication: 3625491 rows -Merging with SignalMasterTable... -After merging with SignalMasterTable: 4047630 rows -Calculating CF... 
-Calculated CF for 3053133 observations -saving CF -Saved 3053133 rows to ../pyData/Predictors/CF.csv -CF.py completed successfully -============================================================ -✅ Completed: CF.py -Execution time: 2.06 seconds - - -🔄 Starting: cfp.py -============================================================ -Starting cfp predictor... -Loading m_aCompustat data... -Loaded 3,625,491 Compustat observations -After deduplication: 3,625,491 observations -Loading SignalMasterTable... -Loaded SignalMasterTable: 4,047,630 observations -Merging with SignalMasterTable... -After merge: 3,041,661 observations -Constructing cfp signal... -Generated cfp values for 2,614,930 observations -Used oancf for 1,925,462 observations -Saving predictor... -saving cfp -Saved 2614930 rows to ../pyData/Predictors/cfp.csv -cfp predictor completed successfully! -============================================================ -✅ Completed: cfp.py -Execution time: 4.12 seconds - - -🔄 Starting: ChangeInRecommendation.py -============================================================ -ChangeInRecommendation predictor created with 450458 observations -============================================================ -✅ Completed: ChangeInRecommendation.py -Execution time: 12.96 seconds - - -🔄 Starting: ChAssetTurnover.py -============================================================ -ChAssetTurnover predictor saved: 2517970 observations -============================================================ -✅ Completed: ChAssetTurnover.py -Execution time: 4.13 seconds - - -🔄 Starting: ChEQ.py -============================================================ -Starting ChEQ predictor... -Loading m_aCompustat data... -Loaded 3,625,491 Compustat observations -After deduplication: 3,625,491 observations -Constructing ChEQ signal... -Generated ChEQ values for 3,060,165 observations -Saving predictor... -saving ChEQ -Saved 3060165 rows to ../pyData/Predictors/ChEQ.csv -ChEQ predictor completed successfully! 
-============================================================ -✅ Completed: ChEQ.py -Execution time: 0.79 seconds - - -🔄 Starting: ChForecastAccrual.py -============================================================ -ChForecastAccrual predictor saved: 628490 observations -============================================================ -✅ Completed: ChForecastAccrual.py -Execution time: 7.24 seconds - - -🔄 Starting: ChInv.py -============================================================ -ChInv predictor saved: 3311811 observations -============================================================ -✅ Completed: ChInv.py -Execution time: 3.21 seconds - - -🔄 Starting: ChInvIA.py -============================================================ -ChInvIA predictor saved: 2678515 observations -============================================================ -✅ Completed: ChInvIA.py -Execution time: 5.38 seconds - - -🔄 Starting: ChNAnalyst.py -============================================================ -saving ChNAnalyst -Saved 371936 rows to ../pyData/Predictors/ChNAnalyst.csv -============================================================ -✅ Completed: ChNAnalyst.py -Execution time: 3.50 seconds - - -🔄 Starting: ChNNCOA.py -============================================================ -ChNNCOA predictor saved: 3262618 observations -============================================================ -✅ Completed: ChNNCOA.py -Execution time: 3.48 seconds - - -🔄 Starting: ChNWC.py -============================================================ -Starting ChNWC predictor... -Loading m_aCompustat data... -Loaded 3,625,491 Compustat observations -After deduplication: 3,625,491 observations -Constructing ChNWC signal... -Generated ChNWC values for 3,275,986 observations -Saving predictor... -saving ChNWC -Saved 3275986 rows to ../pyData/Predictors/ChNWC.csv -ChNWC predictor completed successfully! 
-============================================================ -✅ Completed: ChNWC.py -Execution time: 0.87 seconds - - -🔄 Starting: ChTax.py -============================================================ -Starting ChTax predictor... -Loading m_aCompustat data... -Loaded 3,625,491 Compustat observations -Loading m_QCompustat data... -Loaded 5,444,089 quarterly Compustat observations -Merging Compustat and quarterly data... -After merge: 3,251,448 observations -Constructing ChTax signal... -Generated ChTax values for 2,828,411 observations -Saving predictor... -saving ChTax -Saved 2828411 rows to ../pyData/Predictors/ChTax.csv -ChTax predictor completed successfully! -============================================================ -✅ Completed: ChTax.py -Execution time: 2.23 seconds - - -🔄 Starting: CitationsRD.py -============================================================ -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/Predictors/CitationsRD.py:97: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning. - df = df.groupby('gvkey').apply(lambda x: x.iloc[2:]).reset_index(drop=True) -Loading SignalMasterTable... -After early date filter: (3481556, 6) -Loading and merging Compustat... -After Compustat merge: (3481556, 11) -After dropping missing gvkey: (2875781, 11) -Loading and merging patent data... -After patent merge: (2875781, 12) -Creating calendar-based lags... -After creating lags: (2936322, 13) -Filtering to June observations... -After June filter: (237996, 13) -Creating calendar-based rolling sums... - Computing 48-month calendar rolling XRD sums... - Computing 48-month calendar rolling citation sums... -Creating size categories... 
-Creating CitationsRD tercile categories... -Expanding to monthly observations... -saving CitationsRD -Saved 660588 rows to ../pyData/Predictors/CitationsRD.csv -============================================================ -✅ Completed: CitationsRD.py -Execution time: 5.77 seconds - - -🔄 Starting: CompEquIss.py -============================================================ -Starting CompEquIss predictor... -Loading SignalMasterTable... -Loaded 4,047,630 SignalMasterTable observations -Constructing CompEquIss signal... -Creating 60-month lags with calendar validation... -Generated CompEquIss values for 2,560,488 observations -Saving predictor... -saving CompEquIss -Saved 2560488 rows to ../pyData/Predictors/CompEquIss.csv -CompEquIss predictor completed successfully! -============================================================ -✅ Completed: CompEquIss.py -Execution time: 2.91 seconds - - -🔄 Starting: CompositeDebtIssuance.py -============================================================ -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/.venv/lib/python3.13/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: divide by zero encountered in log - result = getattr(ufunc, method)(*inputs, **kwargs) -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/.venv/lib/python3.13/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: invalid value encountered in log - result = getattr(ufunc, method)(*inputs, **kwargs) -Starting CompositeDebtIssuance predictor... -Loading m_aCompustat data... -Loaded 3,625,491 Compustat observations -Constructing CompositeDebtIssuance signal... -After deduplication: 3,625,491 observations -Calculating 60-month calendar-based lag... -Generated CompositeDebtIssuance values for 2,157,897 observations -Saving predictor... -saving CompositeDebtIssuance -Saved 2157897 rows to ../pyData/Predictors/CompositeDebtIssuance.csv -CompositeDebtIssuance predictor completed successfully! 
-============================================================ -✅ Completed: CompositeDebtIssuance.py -Execution time: 1.40 seconds - - -🔄 Starting: ConsRecomm.py -============================================================ -Starting ConsRecomm predictor... -Loading IBES Recommendations data... -Loaded 864,089 IBES recommendations observations -After first collapse: 838,063 observations -After second collapse: 550,582 observations -Generated ConsRecomm values for 160,544 observations -Loading SignalMasterTable... -Loaded 4,047,630 SignalMasterTable observations -Merging data... -After merging: 464,223 observations -Saving predictor... -saving ConsRecomm -Saved 134129 rows to ../pyData/Predictors/ConsRecomm.csv -ConsRecomm predictor completed successfully! -============================================================ -✅ Completed: ConsRecomm.py -Execution time: 1.01 seconds - - -🔄 Starting: ConvDebt.py -============================================================ -Starting ConvDebt predictor... -Loading m_aCompustat data... -Loaded 3,625,491 Compustat observations -Constructing ConvDebt signal... -After deduplication: 3,625,491 observations -Generated ConvDebt values for 3,625,491 observations -ConvDebt = 1 for 481,630 observations -Saving predictor... -saving ConvDebt -Saved 3625491 rows to ../pyData/Predictors/ConvDebt.csv -ConvDebt predictor completed successfully! -============================================================ -✅ Completed: ConvDebt.py -Execution time: 0.71 seconds - - -🔄 Starting: CoskewACX.py -============================================================ -Loading daily CRSP data... -Loaded dailyCRSP: (107662961, 7) rows -After 1962-07-02 filter: (98768585, 7) rows -Loading daily Fama-French data... -Loaded dailyFF: (26003, 6) rows -Merging CRSP and FF data... -After merging with FF: (98768585, 9) rows -Converting to continuous-time compounded returns... -Data ready for processing: (98768585, 9) rows -Starting 12-month batch processing loop... 
-Processing batch 1/12... - Batch 1: 93486210 observations after time assignment - Batch 1: 344824 final observations -Processing batch 2/12... - Batch 2: 93663925 observations after time assignment - Batch 2: 344947 final observations -Processing batch 3/12... - Batch 3: 93934869 observations after time assignment - Batch 3: 345413 final observations -Processing batch 4/12... - Batch 4: 94115199 observations after time assignment - Batch 4: 345627 final observations -Processing batch 5/12... - Batch 5: 94361510 observations after time assignment - Batch 5: 346194 final observations -Processing batch 6/12... - Batch 6: 94571108 observations after time assignment - Batch 6: 347141 final observations -Processing batch 7/12... - Batch 7: 94742725 observations after time assignment - Batch 7: 349622 final observations -Processing batch 8/12... - Batch 8: 94992396 observations after time assignment - Batch 8: 349461 final observations -Processing batch 9/12... - Batch 9: 95149874 observations after time assignment - Batch 9: 350302 final observations -Processing batch 10/12... - Batch 10: 95407990 observations after time assignment - Batch 10: 350850 final observations -Processing batch 11/12... - Batch 11: 95566186 observations after time assignment - Batch 11: 351241 final observations -Processing batch 12/12... - Batch 12: 95769074 observations after time assignment - Batch 12: 353523 final observations -Combining all batches... 
-Final combined dataset: (4179145, 3) rows -Final summary statistics: -shape: (9, 4) -┌────────────┬──────────────┬───────────────┬────────────┐ -│ statistic ┆ permno ┆ yyyymm ┆ CoskewACX │ -│ --- ┆ --- ┆ --- ┆ --- │ -│ str ┆ f64 ┆ f64 ┆ f64 │ -╞════════════╪══════════════╪═══════════════╪════════════╡ -│ count ┆ 4.179145e6 ┆ 4.179145e6 ┆ 4.179145e6 │ -│ null_count ┆ 0.0 ┆ 0.0 ┆ 0.0 │ -│ mean ┆ 55200.306551 ┆ 199858.951931 ┆ -0.139802 │ -│ std ┆ 28840.682547 ┆ 1620.653336 ┆ 0.33586 │ -│ min ┆ 10000.0 ┆ 196207.0 ┆ -6.276088 │ -│ 25% ┆ 24803.0 ┆ 198611.0 ┆ -0.220816 │ -│ 50% ┆ 60186.0 ┆ 199907.0 ┆ -0.085899 │ -│ 75% ┆ 81674.0 ┆ 201211.0 ┆ 0.020687 │ -│ max ┆ 93436.0 ┆ 202412.0 ┆ 3.566706 │ -└────────────┴──────────────┴───────────────┴────────────┘ -Saving to CSV... -✅ CoskewACX.csv saved with 4179145 observations -CoskewACX translation complete! -============================================================ -✅ Completed: CoskewACX.py -Execution time: 83.08 seconds - - -🔄 Starting: Coskewness.py -============================================================ -Loading monthly CRSP data... -Loaded monthlyCRSP: (5153763, 17) rows -Loading monthly Fama-French data... -Loaded monthlyFF: (1187, 6) rows -Merging CRSP and FF data... -After merging with FF: (5150010, 19) rows -Data ready for processing: (5150010, 18) rows -Generated m60 variable with values 0-59 -Starting 60-batch processing loop... -Processing batch 1/60... - Batch 1: 3914622 observations after time assignment - Batch 1: 73434 final observations -Processing batch 2/60... - Batch 2: 3920954 observations after time assignment - Batch 2: 73418 final observations -Processing batch 3/60... - Batch 3: 3930228 observations after time assignment - Batch 3: 73485 final observations -Processing batch 4/60... - Batch 4: 3936849 observations after time assignment - Batch 4: 73487 final observations -Processing batch 5/60... 
- Batch 5: 3947083 observations after time assignment - Batch 5: 73610 final observations -Processing batch 6/60... - Batch 6: 3953857 observations after time assignment - Batch 6: 73690 final observations -Processing batch 7/60... - Batch 7: 3958786 observations after time assignment - Batch 7: 73718 final observations -Processing batch 8/60... - Batch 8: 3966657 observations after time assignment - Batch 8: 73707 final observations -Processing batch 9/60... - Batch 9: 3972098 observations after time assignment - Batch 9: 73651 final observations -Processing batch 10/60... - Batch 10: 3979291 observations after time assignment - Batch 10: 73741 final observations -Processing batch 11/60... - Batch 11: 3986631 observations after time assignment - Batch 11: 73818 final observations -Processing batch 12/60... - Batch 12: 3993311 observations after time assignment - Batch 12: 73854 final observations -Processing batch 13/60... - Batch 13: 3994640 observations after time assignment - Batch 13: 73700 final observations -Processing batch 14/60... - Batch 14: 4001278 observations after time assignment - Batch 14: 73703 final observations -Processing batch 15/60... - Batch 15: 4012120 observations after time assignment - Batch 15: 73778 final observations -Processing batch 16/60... - Batch 16: 4019683 observations after time assignment - Batch 16: 73800 final observations -Processing batch 17/60... - Batch 17: 4026528 observations after time assignment - Batch 17: 73807 final observations -Processing batch 18/60... - Batch 18: 4035080 observations after time assignment - Batch 18: 74041 final observations -Processing batch 19/60... - Batch 19: 4043026 observations after time assignment - Batch 19: 74204 final observations -Processing batch 20/60... - Batch 20: 4053320 observations after time assignment - Batch 20: 74345 final observations -Processing batch 21/60... 
- Batch 21: 4061445 observations after time assignment - Batch 21: 74407 final observations -Processing batch 22/60... - Batch 22: 4073119 observations after time assignment - Batch 22: 74587 final observations -Processing batch 23/60... - Batch 23: 4082686 observations after time assignment - Batch 23: 74726 final observations -Processing batch 24/60... - Batch 24: 4091912 observations after time assignment - Batch 24: 74890 final observations -Processing batch 25/60... - Batch 25: 4093565 observations after time assignment - Batch 25: 74761 final observations -Processing batch 26/60... - Batch 26: 4105875 observations after time assignment - Batch 26: 74850 final observations -Processing batch 27/60... - Batch 27: 4119577 observations after time assignment - Batch 27: 75168 final observations -Processing batch 28/60... - Batch 28: 4128351 observations after time assignment - Batch 28: 75400 final observations -Processing batch 29/60... - Batch 29: 4138703 observations after time assignment - Batch 29: 75730 final observations -Processing batch 30/60... - Batch 30: 4148217 observations after time assignment - Batch 30: 76575 final observations -Processing batch 31/60... - Batch 31: 4153874 observations after time assignment - Batch 31: 76825 final observations -Processing batch 32/60... - Batch 32: 4158241 observations after time assignment - Batch 32: 76821 final observations -Processing batch 33/60... - Batch 33: 4163439 observations after time assignment - Batch 33: 76900 final observations -Processing batch 34/60... - Batch 34: 4170894 observations after time assignment - Batch 34: 77130 final observations -Processing batch 35/60... - Batch 35: 4176977 observations after time assignment - Batch 35: 77278 final observations -Processing batch 36/60... - Batch 36: 4189057 observations after time assignment - Batch 36: 77513 final observations -Processing batch 37/60... 
- Batch 37: 4192051 observations after time assignment - Batch 37: 77414 final observations -Processing batch 38/60... - Batch 38: 4202245 observations after time assignment - Batch 38: 77551 final observations -Processing batch 39/60... - Batch 39: 4212354 observations after time assignment - Batch 39: 77674 final observations -Processing batch 40/60... - Batch 40: 4218136 observations after time assignment - Batch 40: 77645 final observations -Processing batch 41/60... - Batch 41: 4225288 observations after time assignment - Batch 41: 77697 final observations -Processing batch 42/60... - Batch 42: 4233999 observations after time assignment - Batch 42: 77817 final observations -Processing batch 43/60... - Batch 43: 4238948 observations after time assignment - Batch 43: 78637 final observations -Processing batch 44/60... - Batch 44: 4246853 observations after time assignment - Batch 44: 78681 final observations -Processing batch 45/60... - Batch 45: 4255845 observations after time assignment - Batch 45: 78695 final observations -Processing batch 46/60... - Batch 46: 4265646 observations after time assignment - Batch 46: 78824 final observations -Processing batch 47/60... - Batch 47: 4271719 observations after time assignment - Batch 47: 79008 final observations -Processing batch 48/60... - Batch 48: 4280329 observations after time assignment - Batch 48: 81419 final observations -Processing batch 49/60... - Batch 49: 4279390 observations after time assignment - Batch 49: 81194 final observations -Processing batch 50/60... - Batch 50: 4288923 observations after time assignment - Batch 50: 81202 final observations -Processing batch 51/60... - Batch 51: 4297370 observations after time assignment - Batch 51: 81239 final observations -Processing batch 52/60... - Batch 52: 4305370 observations after time assignment - Batch 52: 81291 final observations -Processing batch 53/60... 
- Batch 53: 4315479 observations after time assignment - Batch 53: 81408 final observations -Processing batch 54/60... - Batch 54: 4325582 observations after time assignment - Batch 54: 81590 final observations -Processing batch 55/60... - Batch 55: 4332723 observations after time assignment - Batch 55: 81668 final observations -Processing batch 56/60... - Batch 56: 4341627 observations after time assignment - Batch 56: 81759 final observations -Processing batch 57/60... - Batch 57: 4351337 observations after time assignment - Batch 57: 81872 final observations -Processing batch 58/60... - Batch 58: 4358505 observations after time assignment - Batch 58: 81987 final observations -Processing batch 59/60... - Batch 59: 4366507 observations after time assignment - Batch 59: 82106 final observations -Processing batch 60/60... - Batch 60: 4373841 observations after time assignment - Batch 60: 82228 final observations -Combining all batches... -Final combined dataset: (4609158, 3) rows -Final summary statistics: -shape: (9, 4) -┌────────────┬──────────────┬───────────────┬────────────┐ -│ statistic ┆ permno ┆ yyyymm ┆ Coskewness │ -│ --- ┆ --- ┆ --- ┆ --- │ -│ str ┆ f64 ┆ f64 ┆ f64 │ -╞════════════╪══════════════╪═══════════════╪════════════╡ -│ count ┆ 4.609158e6 ┆ 4.609158e6 ┆ 4.609158e6 │ -│ null_count ┆ 0.0 ┆ 0.0 ┆ 0.0 │ -│ mean ┆ 52288.249363 ┆ 199446.017069 ┆ -0.20003 │ -│ std ┆ 29515.076279 ┆ 2104.481887 ┆ 0.38264 │ -│ min ┆ 10000.0 ┆ 192706.0 ┆ -4.491474 │ -│ 25% ┆ 20802.0 ┆ 198208.0 ┆ -0.384801 │ -│ 50% ┆ 54148.0 ┆ 199708.0 ┆ -0.179389 │ -│ 75% ┆ 80581.0 ┆ 201107.0 ┆ 0.012401 │ -│ max ┆ 93436.0 ┆ 202412.0 ┆ 2.53694 │ -└────────────┴──────────────┴───────────────┴────────────┘ -Saving to CSV... -✅ Coskewness.csv saved with 4609158 observations -Coskewness translation complete! 
-============================================================ -✅ Completed: Coskewness.py -Execution time: 13.83 seconds - - -🔄 Starting: CPVolSpread.py -============================================================ -Starting CPVolSpread.py... -Loading and cleaning OptionMetrics data... -Cleaned OptionMetrics data: 1867260 rows, 7 columns -Computed CPVolSpread for 881808 observations -Loading SignalMasterTable... -After merging with options data: 2846377 rows -After filtering closed-end funds and REITs: 2839702 rows -Final CPVolSpread for 682114 observations -saving CPVolSpread -Saved 682114 rows to ../pyData/Predictors/CPVolSpread.csv -CPVolSpread.py completed successfully -============================================================ -✅ Completed: CPVolSpread.py -Execution time: 1.44 seconds - - -🔄 Starting: CredRatDG.py -============================================================ -Starting CredRatDG predictor... -Loading m_SP_creditratings data... -Generated dataset of 12,902 SP downgrades -Loading m_CIQ_creditratings data... -Loading SignalMasterTable... -Loaded 3,041,661 SignalMasterTable observations -Merging data... -After merging: 3,041,661 observations -Constructing CredRatDG signal... -Saving predictor... -saving CredRatDG -Saved 3041661 rows to ../pyData/Predictors/CredRatDG.csv -CredRatDG predictor completed successfully! -============================================================ -✅ Completed: CredRatDG.py -Execution time: 1.76 seconds - - -🔄 Starting: CustomerMomentum.py -============================================================ -Starting CustomerMomentum predictor... -Loading customerMom data... -Loaded 356,510 customerMom observations -Constructing CustomerMomentum signal... -Generated CustomerMomentum values for 356,510 observations -Saving predictor... -saving CustomerMomentum -Saved 356510 rows to ../pyData/Predictors/CustomerMomentum.csv -CustomerMomentum predictor completed successfully! 
-============================================================ -✅ Completed: CustomerMomentum.py -Execution time: 0.33 seconds - - -🔄 Starting: dCPVolSpread.py -============================================================ -Starting dCPVolSpread.py... -Loading OptionMetrics volatility surface data... -Loaded options data: 4617772 rows -After screening: 2308886 rows -Calculating volatility changes... -Merging with SignalMasterTable... -After merge: 4047630 rows -Calculated dCPVolSpread for 851001 observations -saving dCPVolSpread -Saved 851001 rows to ../pyData/Predictors/dCPVolSpread.csv -dCPVolSpread.py completed successfully -============================================================ -✅ Completed: dCPVolSpread.py -Execution time: 1.42 seconds - - -🔄 Starting: DebtIssuance.py -============================================================ -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/.venv/lib/python3.13/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: divide by zero encountered in log - result = getattr(ufunc, method)(*inputs, **kwargs) -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/.venv/lib/python3.13/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: invalid value encountered in log - result = getattr(ufunc, method)(*inputs, **kwargs) -Starting DebtIssuance predictor... -Loading m_aCompustat data... -Loaded 3,625,491 Compustat observations -Loading SignalMasterTable... -Loaded 4,047,630 SignalMasterTable observations -Merging data... -After merging: 3,041,661 observations -Constructing DebtIssuance signal... -Generated DebtIssuance values for 2,726,038 observations -Saving predictor... -saving DebtIssuance -Saved 2726038 rows to ../pyData/Predictors/DebtIssuance.csv -DebtIssuance predictor completed successfully! 
-============================================================ -✅ Completed: DebtIssuance.py -Execution time: 2.20 seconds - - -🔄 Starting: DelBreadth.py -============================================================ -DelBreadth predictor saved successfully -============================================================ -✅ Completed: DelBreadth.py -Execution time: 1.77 seconds - - -🔄 Starting: DelCOA.py -============================================================ -Starting DelCOA.py... -Loading m_aCompustat data... -Loaded m_aCompustat: 3625491 rows, 6 columns -Deduplicating by permno time_avail_m... -After deduplication: 3625491 rows -Setting up panel data structure... -Creating lag variables... -Creating tempAvAT... -Calculating DelCOA... -Calculated DelCOA for 3311811 observations -saving DelCOA -Saved 3311811 rows to ../pyData/Predictors/DelCOA.csv -DelCOA.py completed successfully -============================================================ -✅ Completed: DelCOA.py -Execution time: 1.55 seconds - - -🔄 Starting: DelCOL.py -============================================================ -Starting DelCOL.py... -Loading m_aCompustat data... -Loaded m_aCompustat: 3625491 rows, 6 columns -Deduplicating by permno time_avail_m... -After deduplication: 3625491 rows -Setting up panel data structure... -Creating lag variables... -Creating tempAvAT... -Calculating DelCOL... -Calculated DelCOL for 3276202 observations -saving DelCOL -Saved 3276202 rows to ../pyData/Predictors/DelCOL.csv -DelCOL.py completed successfully -============================================================ -✅ Completed: DelCOL.py -Execution time: 1.58 seconds - - -🔄 Starting: DelDRC.py -============================================================ -Starting DelDRC.py... -Loading m_aCompustat data... -Loaded m_aCompustat: 3625491 rows, 8 columns -Deduplicating by permno time_avail_m... -After deduplication: 3625491 rows -Setting up panel data structure... -Ensuring sic is numeric... 
-Creating lag variables... -Calculating DelDRC... -Applying filters... -Calculated DelDRC for 462430 observations -saving DelDRC -Saved 462430 rows to ../pyData/Predictors/DelDRC.csv -DelDRC.py completed successfully -============================================================ -✅ Completed: DelDRC.py -Execution time: 2.12 seconds - - -🔄 Starting: DelEqu.py -============================================================ -DelEqu: Saved 3,195,504 observations -============================================================ -✅ Completed: DelEqu.py -Execution time: 3.84 seconds - - -🔄 Starting: DelFINL.py -============================================================ -DelFINL: Saved 3,251,941 observations -============================================================ -✅ Completed: DelFINL.py -Execution time: 3.98 seconds - - -🔄 Starting: DelLTI.py -============================================================ -DelLTI: Saved 3,296,136 observations -============================================================ -✅ Completed: DelLTI.py -Execution time: 3.44 seconds - - -🔄 Starting: DelNetFin.py -============================================================ -DelNetFin: Saved 3,251,941 observations -============================================================ -✅ Completed: DelNetFin.py -Execution time: 3.98 seconds - - -🔄 Starting: DivInit.py -============================================================ -================================================================================ -🏗️ DivInit.py -Creating dividend initiation predictor -================================================================================ -📊 Loading distributions data... -Loaded distributions: 1,064,085 observations -🧮 Computing dividend initiation signal... -💾 Saving DivInit predictor... 
-saving DivInit -Saved 4112633 rows to ../pyData/Predictors/DivInit.csv -✅ DivInit.csv saved successfully -================================================================================ -✅ DivInit.py Complete -Dividend initiation predictor generated successfully -================================================================================ -============================================================ -✅ Completed: DivInit.py -Execution time: 2.20 seconds - - -🔄 Starting: DivOmit.py -============================================================ -DivOmit predictor saved successfully -============================================================ -✅ Completed: DivOmit.py -Execution time: 6.32 seconds - - -🔄 Starting: DivSeason.py -============================================================ -Starting DivSeason.py... -Loading CRSP distributions data... -Loaded distributions data: 1064085 rows -After filtering for regular cash dividends: 974654 rows -Loading SignalMasterTable... -Loaded SignalMasterTable: 4047630 rows -Merging with dividend amounts... -After merge: 4047630 rows -Calculating DivSeason signal... -Creating 12-month rolling dividend payments... -Creating lag variables for dividend prediction... -Calculated DivSeason for 1785982 observations -saving DivSeason -Saved 1785982 rows to ../pyData/Predictors/DivSeason.csv -DivSeason.py completed successfully -============================================================ -✅ Completed: DivSeason.py -Execution time: 1.98 seconds - - -🔄 Starting: DivYieldST.py -============================================================ -DivYieldST predictor saved successfully -============================================================ -✅ Completed: DivYieldST.py -Execution time: 4.20 seconds - - -🔄 Starting: dNoa.py -============================================================ -Starting dNoa predictor... -Loading m_aCompustat data... -Loaded 3,625,491 m_aCompustat observations -Deduplicating by permno time_avail_m... 
-After deduplication: 3,625,491 observations -Constructing dNoa signal... -Generated dNoa values for 3,195,426 observations -Saving predictor... -saving dNoa -Saved 3195426 rows to ../pyData/Predictors/dNoa.csv -dNoa predictor completed successfully! -============================================================ -✅ Completed: dNoa.py -Execution time: 1.66 seconds - - -🔄 Starting: DolVol.py -============================================================ -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/.venv/lib/python3.13/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: divide by zero encountered in log - result = getattr(ufunc, method)(*inputs, **kwargs) -DolVol predictor saved successfully -============================================================ -✅ Completed: DolVol.py -Execution time: 5.14 seconds - - -🔄 Starting: DownRecomm.py -============================================================ -DownRecomm predictor created with 464223 observations -============================================================ -✅ Completed: DownRecomm.py -Execution time: 12.63 seconds - - -🔄 Starting: dVolCall.py -============================================================ -Starting dVolCall.py... -Loading data... -Calculated dVolCall for 851001 observations -saving dVolCall -Saved 851001 rows to ../pyData/Predictors/dVolCall.csv -dVolCall.py completed successfully -============================================================ -✅ Completed: dVolCall.py -Execution time: 0.85 seconds - - -🔄 Starting: dVolPut.py -============================================================ -Starting dVolPut.py... -Loading data... 
-Calculated dVolPut for 851001 observations -saving dVolPut -Saved 851001 rows to ../pyData/Predictors/dVolPut.csv -dVolPut.py completed successfully -============================================================ -✅ Completed: dVolPut.py -Execution time: 0.84 seconds - - -🔄 Starting: EarningsConsistency.py -============================================================ -Starting EarningsConsistency.py... -Loading m_aCompustat data... -Loaded data: 3625491 rows -Setting up panel data structure... -Creating lag variables for earnings... -Creating additional lag variables for earnings growth... -Calculating earnings consistency... -Calculated EarningsConsistency for 1387458 observations -saving EarningsConsistency -Saved 1387458 rows to ../pyData/Predictors/EarningsConsistency.csv -EarningsConsistency.py completed successfully -============================================================ -✅ Completed: EarningsConsistency.py -Execution time: 1.89 seconds - - -🔄 Starting: EarningsForecastDisparity.py -============================================================ -Starting EarningsForecastDisparity.py... -============================================================ -✅ Completed: EarningsForecastDisparity.py -Execution time: 4.34 seconds - - -🔄 Starting: EarningsStreak.py -============================================================ -EarningsStreak predictor created with 1225443 observations -============================================================ -✅ Completed: EarningsStreak.py -Execution time: 4.72 seconds - - -🔄 Starting: EarningsSurprise.py -============================================================ -EarningsSurprise predictor created with 2324021 observations -============================================================ -✅ Completed: EarningsSurprise.py -Execution time: 10.93 seconds - - -🔄 Starting: EarnSupBig.py -============================================================ -Creating EarnSupBig predictor... 
-Final dataset has 3775465 observations with 2336093 non-missing EarnSupBig values -saving EarnSupBig -Saved 2336093 rows to ../pyData/Predictors/EarnSupBig.csv -EarnSupBig predictor saved successfully -============================================================ -✅ Completed: EarnSupBig.py -Execution time: 42.63 seconds - - -🔄 Starting: EntMult.py -============================================================ -EntMult predictor saved successfully -============================================================ -✅ Completed: EntMult.py -Execution time: 3.82 seconds - - -🔄 Starting: EP.py -============================================================ -EP: Saved 2,203,166 observations -============================================================ -✅ Completed: EP.py -Execution time: 3.45 seconds - - -🔄 Starting: EquityDuration.py -============================================================ -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/Predictors/EquityDuration.py:98: SettingWithCopyWarning: -A value is trying to be set on a copy of a slice from a DataFrame. -Try using .loc[row_indexer,col_indexer] = value instead - -See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy - df_monthly['temp_na_sort'] = df_monthly['EquityDuration'].isna() -Loading and processing EquityDuration... -Expanding to monthly observations... 
-EquityDuration predictor saved successfully -============================================================ -✅ Completed: EquityDuration.py -Execution time: 6.88 seconds - - -🔄 Starting: ExchSwitch.py -============================================================ -ExchSwitch predictor saved successfully -============================================================ -✅ Completed: ExchSwitch.py -Execution time: 4.26 seconds - - -🔄 Starting: ExclExp.py -============================================================ -ExclExp predictor saved successfully -============================================================ -✅ Completed: ExclExp.py -Execution time: 3.80 seconds - - -🔄 Starting: FEPS.py -============================================================ -Starting FEPS.py... -Loading and preparing IBES data... -Prepared IBES data: 2382154 rows, 3 columns -Loading SignalMasterTable... -Loaded SignalMasterTable: 4047630 rows, 3 columns -Merging with IBES data... -After merging with IBES data: 4047630 rows -Setting up panel data (sorting by permno, time_avail_m)... -Calculating FEPS... -Calculated FEPS for 1958211 observations -saving FEPS -Saved 1958211 rows to ../pyData/Predictors/FEPS.csv -FEPS.py completed successfully -============================================================ -✅ Completed: FEPS.py -Execution time: 1.83 seconds - - -🔄 Starting: fgr5yrLag.py -============================================================ -Loading IBES data... -IBES loaded: 7,842,868 rows -Filtering IBES... -Renaming IBES columns... -IBES ready: 1,423,499 rows -Loading Compustat data... -Compustat loaded: 3,625,491 rows -After dedup: 3,625,491 rows -Merging with SignalMasterTable... -After SMT merge: 3,041,661 rows -Optimizing IBES merge... -IBES filtered from 1,423,499 to 1,223,321 rows -After IBES merge: 1,138,573 rows -Filtering required variables... -After dropna: 996,104 rows -Calculating 6-month calendar lags... 
-fgr5yrLag predictor saved successfully -============================================================ -✅ Completed: fgr5yrLag.py -Execution time: 35.94 seconds - - -🔄 Starting: FirmAge.py -============================================================ -FirmAge predictor saved successfully -============================================================ -✅ Completed: FirmAge.py -Execution time: 3.21 seconds - - -🔄 Starting: FirmAgeMom.py -============================================================ -saving FirmAgeMom -Saved 550955 rows to ../pyData/Predictors/FirmAgeMom.csv -FirmAgeMom predictor completed -============================================================ -✅ Completed: FirmAgeMom.py -Execution time: 1.83 seconds - - -🔄 Starting: ForecastDispersion.py -============================================================ -Starting ForecastDispersion.py... -Loading and preparing IBES data... -Prepared IBES data: 2382154 rows, 9 columns -Loading SignalMasterTable... -Loaded SignalMasterTable: 4047630 rows, 3 columns -Merging with IBES data... -After merging with IBES data: 4047630 rows -Calculating ForecastDispersion... -Calculated ForecastDispersion for 1620034 observations -saving ForecastDispersion -Saved 1620034 rows to ../pyData/Predictors/ForecastDispersion.csv -ForecastDispersion.py completed successfully -============================================================ -✅ Completed: ForecastDispersion.py -Execution time: 1.84 seconds - - -🔄 Starting: Frontier.py -============================================================ -Frontier.py starting with max_year=None -Loading data... -Computing FF48 industry codes... -Processing 889 time periods (starting from 1950-12)... -Processed 151/889 periods (17.0%), stored 66 predictions for 1963-06 - Elapsed: 0.6s, Est. remaining: 64.8s, Avg: 0.088s/period (last 7 periods) -Processed 181/889 periods (20.4%), stored 55 predictions for 1965-12 - Elapsed: 1.1s, Est. 
remaining: 10.3s, Avg: 0.015s/period (last 30 periods) -Processed 211/889 periods (23.7%), stored 152 predictions for 1968-06 - Elapsed: 1.7s, Est. remaining: 13.9s, Avg: 0.020s/period (last 30 periods) -Processed 241/889 periods (27.1%), stored 191 predictions for 1970-12 - Elapsed: 2.5s, Est. remaining: 18.9s, Avg: 0.029s/period (last 30 periods) -Processed 271/889 periods (30.5%), stored 1230 predictions for 1973-06 - Elapsed: 4.5s, Est. remaining: 40.6s, Avg: 0.066s/period (last 30 periods) -Processed 301/889 periods (33.9%), stored 1600 predictions for 1975-12 - Elapsed: 8.8s, Est. remaining: 83.9s, Avg: 0.143s/period (last 30 periods) -Processed 331/889 periods (37.2%), stored 1487 predictions for 1978-06 - Elapsed: 13.9s, Est. remaining: 94.4s, Avg: 0.169s/period (last 30 periods) -Processed 361/889 periods (40.6%), stored 1527 predictions for 1980-12 - Elapsed: 19.1s, Est. remaining: 91.5s, Avg: 0.173s/period (last 30 periods) -Processed 391/889 periods (44.0%), stored 1714 predictions for 1983-06 - Elapsed: 24.5s, Est. remaining: 89.7s, Avg: 0.180s/period (last 30 periods) -Processed 421/889 periods (47.4%), stored 2048 predictions for 1985-12 - Elapsed: 30.7s, Est. remaining: 96.8s, Avg: 0.207s/period (last 30 periods) -Processed 451/889 periods (50.7%), stored 2144 predictions for 1988-06 - Elapsed: 37.4s, Est. remaining: 98.3s, Avg: 0.224s/period (last 30 periods) -Processed 481/889 periods (54.1%), stored 2013 predictions for 1990-12 - Elapsed: 44.4s, Est. remaining: 94.8s, Avg: 0.232s/period (last 30 periods) -Processed 511/889 periods (57.5%), stored 2244 predictions for 1993-06 - Elapsed: 51.5s, Est. remaining: 90.0s, Avg: 0.238s/period (last 30 periods) -Processed 541/889 periods (60.9%), stored 2616 predictions for 1995-12 - Elapsed: 59.3s, Est. remaining: 90.7s, Avg: 0.261s/period (last 30 periods) -Processed 571/889 periods (64.2%), stored 3163 predictions for 1998-06 - Elapsed: 68.2s, Est. 
remaining: 94.1s, Avg: 0.296s/period (last 30 periods) -Processed 601/889 periods (67.6%), stored 2723 predictions for 2000-12 - Elapsed: 77.5s, Est. remaining: 89.2s, Avg: 0.310s/period (last 30 periods) -Processed 631/889 periods (71.0%), stored 2437 predictions for 2003-06 - Elapsed: 86.4s, Est. remaining: 76.2s, Avg: 0.295s/period (last 30 periods) -Processed 661/889 periods (74.4%), stored 2283 predictions for 2005-12 - Elapsed: 94.4s, Est. remaining: 60.9s, Avg: 0.267s/period (last 30 periods) -Processed 691/889 periods (77.7%), stored 2180 predictions for 2008-06 - Elapsed: 101.9s, Est. remaining: 50.0s, Avg: 0.252s/period (last 30 periods) -Processed 721/889 periods (81.1%), stored 1838 predictions for 2010-12 - Elapsed: 108.9s, Est. remaining: 39.0s, Avg: 0.232s/period (last 30 periods) -Processed 751/889 periods (84.5%), stored 1667 predictions for 2013-06 - Elapsed: 115.1s, Est. remaining: 28.5s, Avg: 0.206s/period (last 30 periods) -Processed 781/889 periods (87.9%), stored 1578 predictions for 2015-12 - Elapsed: 120.8s, Est. remaining: 20.4s, Avg: 0.189s/period (last 30 periods) -Processed 811/889 periods (91.2%), stored 1409 predictions for 2018-06 - Elapsed: 126.0s, Est. remaining: 13.7s, Avg: 0.175s/period (last 30 periods) -Processed 841/889 periods (94.6%), stored 1259 predictions for 2020-12 - Elapsed: 130.9s, Est. remaining: 7.9s, Avg: 0.164s/period (last 30 periods) -Processed 871/889 periods (98.0%), stored 1296 predictions for 2023-06 - Elapsed: 135.3s, Est. 
remaining: 2.6s, Avg: 0.147s/period (last 30 periods) -Total predictions generated: 1222287 -Before ceq filter: 2860911 -After ceq filter: 2695300 -Before dropping NaN Frontier: 2695300 -Final output: 1221161 observations - -Timing Summary: -Total execution time: 160.53 seconds (2.68 minutes) -Processed 889 periods at 6.39 periods/second -Frontier predictor saved successfully -============================================================ -✅ Completed: Frontier.py -Execution time: 161.85 seconds - - -🔄 Starting: Governance.py -============================================================ -Governance predictor saved successfully -============================================================ -✅ Completed: Governance.py -Execution time: 1.31 seconds - - -🔄 Starting: GP.py -============================================================ -GP: Saved 2,972,251 observations -============================================================ -✅ Completed: GP.py -Execution time: 3.38 seconds - - -🔄 Starting: GrAdExp.py -============================================================ -Starting GrAdExp.py... -Loading m_aCompustat data... -Loaded m_aCompustat: 3625491 rows, 4 columns -Removing duplicate observations... -Merging with SignalMasterTable... -After merging with SignalMasterTable: 3625491 rows -Sorting data by permno and time_avail_m... -Calculating GrAdExp... -Calculating size deciles... -Applying filters... -Filtered out 145922 observations (xad < 0.1 or smallest size decile) -Final GrAdExp calculated for 905831 observations -saving GrAdExp -Saved 905831 rows to ../pyData/Predictors/GrAdExp.csv -GrAdExp.py completed successfully -============================================================ -✅ Completed: GrAdExp.py -Execution time: 2.55 seconds - - -🔄 Starting: GrLTNOA.py -============================================================ -Starting GrLTNOA.py... -Loading m_aCompustat data... -Loaded m_aCompustat: 3625491 rows, 14 columns -Removing duplicate observations... 
-Setting up panel data (sorting by permno, time_avail_m)... -Calculating 12-month lags... -Calculating GrLTNOA... -Calculated GrLTNOA for 3235740 observations -saving GrLTNOA -Saved 3235740 rows to ../pyData/Predictors/GrLTNOA.csv -GrLTNOA.py completed successfully -============================================================ -✅ Completed: GrLTNOA.py -Execution time: 2.00 seconds - - -🔄 Starting: GrSaleToGrInv.py -============================================================ -Starting GrSaleToGrInv.py... -Loading m_aCompustat data... -Loaded m_aCompustat: 3625491 rows, 5 columns -Removing duplicate permno-time_avail_m observations... -Setting up panel data (sorting by permno, time_avail_m)... -Calculating 12 and 24-month lags... -Calculating primary GrSaleToGrInv formula... -Applying fallback formula for missing values... -Calculated GrSaleToGrInv for 2545662 observations -saving GrSaleToGrInv -Saved 2545662 rows to ../pyData/Predictors/GrSaleToGrInv.csv -GrSaleToGrInv.py completed successfully -============================================================ -✅ Completed: GrSaleToGrInv.py -Execution time: 1.49 seconds - - -🔄 Starting: GrSaleToGrOverhead.py -============================================================ -Starting GrSaleToGrOverhead.py... -Loading m_aCompustat data... -Loaded m_aCompustat: 3625491 rows, 5 columns -Removing duplicate observations... -Setting up panel data (sorting by permno, time_avail_m)... -Calculating 12-month and 24-month lags... -Calculating GrSaleToGrOverhead using primary formula... -Applying fallback formula where primary formula is missing... 
-Calculated GrSaleToGrOverhead for 2681589 observations -saving GrSaleToGrOverhead -Saved 2681589 rows to ../pyData/Predictors/GrSaleToGrOverhead.csv -GrSaleToGrOverhead.py completed successfully -============================================================ -✅ Completed: GrSaleToGrOverhead.py -Execution time: 1.65 seconds - - -🔄 Starting: Herf.py -============================================================ -Herf predictor saved successfully -============================================================ -✅ Completed: Herf.py -Execution time: 7.01 seconds - - -🔄 Starting: HerfAsset.py -============================================================ -HerfAsset predictor saved successfully -============================================================ -✅ Completed: HerfAsset.py -Execution time: 5.83 seconds - - -🔄 Starting: HerfBE.py -============================================================ -HerfBE predictor saved successfully -============================================================ -✅ Completed: HerfBE.py -Execution time: 6.27 seconds - - -🔄 Starting: High52.py -============================================================ -Starting High52 calculation... -Loaded dailyCRSP data: 107662961 observations -After collapse by permno and time_avail_m: 5116571 observations -High52 calculated for 4995429 observations -saving High52 -Saved 4995429 rows to ../pyData/Predictors/High52.csv -High52.csv saved successfully -============================================================ -✅ Completed: High52.py -Execution time: 12.50 seconds - - -🔄 Starting: hire.py -============================================================ -hire: Saved 3,498,027 observations -============================================================ -✅ Completed: hire.py -Execution time: 3.73 seconds - - -🔄 Starting: Illiquidity.py -============================================================ -Loading dailyCRSP data... -Constructing Illiquidity signal... 
-Final dataset shape: (4278152, 3) -Sample of final data: - permno yyyymm Illiquidity -11 10000 198612 0.000004 -12 10000 198701 0.000004 -13 10000 198702 0.000005 -14 10000 198703 0.000008 -15 10000 198704 0.000008 -Saved to ../pyData/Predictors/Illiquidity.csv -============================================================ -✅ Completed: Illiquidity.py -Execution time: 13.14 seconds - - -🔄 Starting: IndIPO.py -============================================================ -IndIPO predictor saved successfully -============================================================ -✅ Completed: IndIPO.py -Execution time: 2.52 seconds - - -🔄 Starting: IndMom.py -============================================================ -saving IndMom -Saved 4043138 rows to ../pyData/Predictors/IndMom.csv -============================================================ -✅ Completed: IndMom.py -Execution time: 25.36 seconds - - -🔄 Starting: IndRetBig.py -============================================================ -saving IndRetBig -Saved 2616695 rows to ../pyData/Predictors/IndRetBig.csv -IndRetBig predictor saved successfully -============================================================ -✅ Completed: IndRetBig.py -Execution time: 27.99 seconds - - -🔄 Starting: IntMom.py -============================================================ -saving IntMom -Saved 3686625 rows to ../pyData/Predictors/IntMom.csv -Saved 4047630 observations to IntMom.csv -============================================================ -✅ Completed: IntMom.py -Execution time: 4.34 seconds - - -🔄 Starting: Investment.py -============================================================ -Investment predictor saved successfully -============================================================ -✅ Completed: Investment.py -Execution time: 3.87 seconds - - -🔄 Starting: InvestPPEInv.py -============================================================ -Starting InvestPPEInv.py... -Loading m_aCompustat data... 
-Loaded m_aCompustat: 3625491 rows, 6 columns -Removing duplicate permno-time_avail_m observations... -Removed 0 duplicate observations -Setting up panel data (sorting by permno, time_avail_m)... -Calculating 12-month lags... -Calculating tempPPE... -Calculating tempInv... -Calculating InvestPPEInv... -Calculated InvestPPEInv for 2943499 observations -saving InvestPPEInv -Saved 2943499 rows to ../pyData/Predictors/InvestPPEInv.csv -InvestPPEInv.py completed successfully -============================================================ -✅ Completed: InvestPPEInv.py -Execution time: 1.53 seconds - - -🔄 Starting: InvGrowth.py -============================================================ -Starting InvGrowth predictor... -Loading m_aCompustat data... -Loaded 3,625,491 Compustat observations -Loading GNPdefl data... -Loaded 939 GNPdefl observations -Merging with GNPdefl... -After merging with GNPdefl: 3,571,081 observations -Adjusting invt for inflation... -Applying sample selection filters... -After SIC filter (dropped SIC 4xxx and 6xxx): 2,614,476 observations (dropped 956,605) -After AT/PPENT filter: 2,595,179 observations (dropped 19,297) -Constructing InvGrowth signal... -After deduplication: 2,595,179 observations (dropped 0 duplicates) -Calculating 12-month lag for inventory growth... -Implementing efficient calendar-based 12-month lag... -Generated InvGrowth values for 1,996,001 observations -Saving predictor... -saving InvGrowth -Saved 1996001 rows to ../pyData/Predictors/InvGrowth.csv -InvGrowth predictor completed successfully! -============================================================ -✅ Completed: InvGrowth.py -Execution time: 3.64 seconds - - -🔄 Starting: IO_ShortInterest.py -============================================================ -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/Predictors/IO_ShortInterest.py:54: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. 
This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning. - temps99_by_month = df.groupby('time_avail_m').apply( -Starting IO_ShortInterest.py... -Loading data... -Calculated IO_ShortInterest for 16077 observations -saving IO_ShortInterest -Saved 16077 rows to ../pyData/Predictors/IO_ShortInterest.csv -IO_ShortInterest.py completed successfully -============================================================ -✅ Completed: IO_ShortInterest.py -Execution time: 3.42 seconds - - -🔄 Starting: iomom_cust.py -============================================================ -Starting iomom_cust predictor... -Loading SignalMasterTable... -Loaded 3,041,661 observations with gvkey -Loading InputOutputMomentumProcessed... -Loaded 2,906,316 InputOutputMomentum observations -Merging with InputOutputMomentumProcessed... -After merge: 3,041,661 observations -After dropping missing iomom_cust: 1,637,670 observations -Saving predictor... -saving iomom_cust -Saved 1637670 rows to ../pyData/Predictors/iomom_cust.csv -iomom_cust predictor completed successfully! -============================================================ -✅ Completed: iomom_cust.py -Execution time: 0.90 seconds - - -🔄 Starting: iomom_supp.py -============================================================ -Starting iomom_supp predictor... -Loading SignalMasterTable... -Loaded 3,041,661 observations with gvkey -Loading InputOutputMomentumProcessed... -Loaded 2,906,316 InputOutputMomentum observations -Merging with InputOutputMomentumProcessed... -After merge: 3,041,661 observations -After dropping missing iomom_supp: 1,639,842 observations -Saving predictor... -saving iomom_supp -Saved 1639842 rows to ../pyData/Predictors/iomom_supp.csv -iomom_supp predictor completed successfully! 
-============================================================ -✅ Completed: iomom_supp.py -Execution time: 0.90 seconds - - -🔄 Starting: Leverage.py -============================================================ -Leverage: Saved 3,014,667 observations -============================================================ -✅ Completed: Leverage.py -Execution time: 3.41 seconds - - -🔄 Starting: LRreversal.py -============================================================ -LRreversal predictor saved successfully -============================================================ -✅ Completed: LRreversal.py -Execution time: 3.92 seconds - - -🔄 Starting: MaxRet.py -============================================================ -MaxRet predictor saved successfully -============================================================ -✅ Completed: MaxRet.py -Execution time: 13.96 seconds - - -🔄 Starting: MeanRankRevGrowth.py -============================================================ -MeanRankRevGrowth predictor saved successfully -============================================================ -✅ Completed: MeanRankRevGrowth.py -Execution time: 6.88 seconds - - -🔄 Starting: Mom12m.py -============================================================ -================================================================================ -🏗️ Mom12m.py -Creating twelve-month momentum predictor -================================================================================ -📊 Loading SignalMasterTable data... -Loaded: 4,047,630 observations -🧮 Computing 12-month momentum signal... -💾 Saving Mom12m predictor... 
-saving Mom12m -Saved 3715128 rows to ../pyData/Predictors/Mom12m.csv -✅ Mom12m.csv saved successfully -================================================================================ -✅ Mom12m.py Complete -Twelve-month momentum predictor generated successfully -================================================================================ -============================================================ -✅ Completed: Mom12m.py -Execution time: 1.45 seconds - - -🔄 Starting: Mom12mOffSeason.py -============================================================ -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/Predictors/Mom12mOffSeason.py:107: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning. - df = df.groupby('permno', group_keys=False).apply(calculate_calendar_rolling_fast) -Starting Mom12mOffSeason predictor translation... -Loading SignalMasterTable... -Loaded 4047630 observations -Data sorted by permno and time_avail_m -Starting signal construction... -Replaced missing returns with 0 -Computing 10-month calendar-based rolling statistics excluding focal return... -Computing true calendar-based rolling statistics (this may take several minutes)... -Processing groups (this will take time for large dataset)... 
-Generated 3865561 valid Mom12mOffSeason observations -saving Mom12mOffSeason -Saved 3865561 rows to ../pyData/Predictors/Mom12mOffSeason.csv -============================================================ -✅ Completed: Mom12mOffSeason.py -Execution time: 24.03 seconds - - -🔄 Starting: Mom6m.py -============================================================ -================================================================================ -🏗️ Mom6m.py -Creating six-month momentum predictor -================================================================================ -📊 Loading SignalMasterTable data... -Loaded: 4,047,630 observations -🧮 Computing 6-month momentum signal... -💾 Saving Mom6m predictor... -saving Mom6m -Saved 3895206 rows to ../pyData/Predictors/Mom6m.csv -✅ Mom6m.csv saved successfully -================================================================================ -✅ Mom6m.py Complete -Six-month momentum predictor generated successfully -================================================================================ -============================================================ -✅ Completed: Mom6m.py -Execution time: 1.31 seconds - - -🔄 Starting: Mom6mJunk.py -============================================================ -left join with sp ratings, nrow = 3041661 -left join with ciq ratings, nrow = 3041661 -saving Mom6mJunk -Saved 346566 rows to ../pyData/Predictors/Mom6mJunk.csv -Mom6mJunk saved with 3041661 observations -============================================================ -✅ Completed: Mom6mJunk.py -Execution time: 4.15 seconds - - -🔄 Starting: MomOffSeason.py -============================================================ -Starting MomOffSeason.py... -Loading SignalMasterTable... -Loaded data: 4047630 rows -Filling missing returns with 0... -Creating lag variables for returns (23, 35, 47, 59 months)... -Calculating MomOffSeason signal... 
-Calculated MomOffSeason for 3436969 observations -saving MomOffSeason -Saved 3436969 rows to ../pyData/Predictors/MomOffSeason.csv -MomOffSeason.py completed successfully -============================================================ -✅ Completed: MomOffSeason.py -Execution time: 3.07 seconds - - -🔄 Starting: MomOffSeason06YrPlus.py -============================================================ -Starting MomOffSeason06YrPlus.py... -Loading data... -Calculated MomOffSeason06YrPlus for 2472210 observations -saving MomOffSeason06YrPlus -Saved 2472210 rows to ../pyData/Predictors/MomOffSeason06YrPlus.csv -MomOffSeason06YrPlus.py completed successfully -============================================================ -✅ Completed: MomOffSeason06YrPlus.py -Execution time: 3.11 seconds - - -🔄 Starting: MomOffSeason11YrPlus.py -============================================================ -Starting MomOffSeason11YrPlus.py... -Loading data... -Calculated MomOffSeason11YrPlus for 1707608 observations -saving MomOffSeason11YrPlus -Saved 1707608 rows to ../pyData/Predictors/MomOffSeason11YrPlus.csv -MomOffSeason11YrPlus.py completed successfully -============================================================ -✅ Completed: MomOffSeason11YrPlus.py -Execution time: 3.10 seconds - - -🔄 Starting: MomOffSeason16YrPlus.py -============================================================ -Starting MomOffSeason16YrPlus.py... -Loading data... 
-Calculated MomOffSeason16YrPlus for 1044703 observations -saving MomOffSeason16YrPlus -Saved 1044703 rows to ../pyData/Predictors/MomOffSeason16YrPlus.csv -MomOffSeason16YrPlus.py completed successfully -============================================================ -✅ Completed: MomOffSeason16YrPlus.py -Execution time: 3.08 seconds - - -🔄 Starting: MomRev.py -============================================================ -================================================================================ -🏗️ MomRev.py -Creating momentum and long-term reversal signal based on 6m and 36m momentum -================================================================================ -📊 Loading SignalMasterTable data... -Loaded: 4,047,630 observations -🧮 Computing 6m and 36m momentum signals... -💾 Saving MomRev predictor... -saving MomRev -Saved 261618 rows to ../pyData/Predictors/MomRev.csv -✅ MomRev.csv saved successfully -================================================================================ -✅ MomRev.py Complete -Momentum and long-term reversal signal generated successfully -================================================================================ -============================================================ -✅ Completed: MomRev.py -Execution time: 3.91 seconds - - -🔄 Starting: MomSeason.py -============================================================ -Starting MomSeason.py... -Loading SignalMasterTable... -Loaded data: 4047630 rows -Filling missing returns with 0... -Creating lag variables for returns (23, 35, 47, 59 months)... -Calculating seasonal momentum signal... 
-Calculated MomSeason for 3437360 observations -saving MomSeason -Saved 3437360 rows to ../pyData/Predictors/MomSeason.csv -MomSeason.py completed successfully -============================================================ -✅ Completed: MomSeason.py -Execution time: 1.70 seconds - - -🔄 Starting: MomSeason06YrPlus.py -============================================================ -Starting MomSeason06YrPlus.py... -Loading data... -Calculated MomSeason06YrPlus for 2472493 observations -saving MomSeason06YrPlus -Saved 2472493 rows to ../pyData/Predictors/MomSeason06YrPlus.csv -MomSeason06YrPlus.py completed successfully -============================================================ -✅ Completed: MomSeason06YrPlus.py -Execution time: 1.70 seconds - - -🔄 Starting: MomSeason11YrPlus.py -============================================================ -Starting MomSeason11YrPlus.py... -Loading data... -Calculated MomSeason11YrPlus for 1707812 observations -saving MomSeason11YrPlus -Saved 1707812 rows to ../pyData/Predictors/MomSeason11YrPlus.csv -MomSeason11YrPlus.py completed successfully -============================================================ -✅ Completed: MomSeason11YrPlus.py -Execution time: 1.66 seconds - - -🔄 Starting: MomSeason16YrPlus.py -============================================================ -Starting MomSeason16YrPlus.py... -Loading data... -Calculated MomSeason16YrPlus for 1211157 observations -saving MomSeason16YrPlus -Saved 1211157 rows to ../pyData/Predictors/MomSeason16YrPlus.csv -MomSeason16YrPlus.py completed successfully -============================================================ -✅ Completed: MomSeason16YrPlus.py -Execution time: 1.66 seconds - - -🔄 Starting: MomSeasonShort.py -============================================================ -Starting MomSeasonShort.py... -Loading SignalMasterTable... -Loaded data: 4047630 rows -Filling missing returns with 0... -Creating 11-month lag for seasonal momentum... 
-Calculated MomSeasonShort for 3730640 observations -saving MomSeasonShort -Saved 3730640 rows to ../pyData/Predictors/MomSeasonShort.csv -MomSeasonShort.py completed successfully -============================================================ -✅ Completed: MomSeasonShort.py -Execution time: 1.20 seconds - - -🔄 Starting: MomVol.py -============================================================ -Calculating 6-month calendar-based rolling mean volume... -saving MomVol -Saved 1096816 rows to ../pyData/Predictors/MomVol.csv -============================================================ -✅ Completed: MomVol.py -Execution time: 3.27 seconds - - -🔄 Starting: MRreversal.py -============================================================ -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/Predictors/MRreversal.py:47: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)` - df[f'ret_lag{lag}_orig_missing'] = df[f'ret_lag{lag}_orig_missing'].fillna(True).infer_objects(copy=False) -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/Predictors/MRreversal.py:47: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)` - df[f'ret_lag{lag}_orig_missing'] = df[f'ret_lag{lag}_orig_missing'].fillna(True).infer_objects(copy=False) -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/Predictors/MRreversal.py:47: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. 
To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)` - df[f'ret_lag{lag}_orig_missing'] = df[f'ret_lag{lag}_orig_missing'].fillna(True).infer_objects(copy=False) -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/Predictors/MRreversal.py:47: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)` - df[f'ret_lag{lag}_orig_missing'] = df[f'ret_lag{lag}_orig_missing'].fillna(True).infer_objects(copy=False) -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/Predictors/MRreversal.py:47: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)` - df[f'ret_lag{lag}_orig_missing'] = df[f'ret_lag{lag}_orig_missing'].fillna(True).infer_objects(copy=False) -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/Predictors/MRreversal.py:47: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. 
To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)` - df[f'ret_lag{lag}_orig_missing'] = df[f'ret_lag{lag}_orig_missing'].fillna(True).infer_objects(copy=False) -MRreversal predictor saved successfully -============================================================ -✅ Completed: MRreversal.py -Execution time: 8.47 seconds - - -🔄 Starting: MS.py -============================================================ -================================================================================ -🏗️ MS.py -Generating Mohanram G-score predictor -================================================================================ -📊 Loading Compustat and SignalMasterTable data... -Loading m_aCompustat.parquet... -Loaded m_aCompustat: 3,625,491 observations -After deduplication: 3,625,491 observations -Loading SignalMasterTable.parquet... -Loaded SignalMasterTable: 4,047,630 observations -Loading m_QCompustat.parquet... -Loaded m_QCompustat: 5,444,089 observations -🔗 Merging datasets... -After merging: 3,041,661 observations -🎯 Applying sample selection criteria... -Calculating BM quintiles with enhanced fastxtile... -After BM quintile filter: 572,872 observations -After SIC2D minimum filter: 554,113 observations -🧮 Preparing financial variables... -📈 Computing quarterly aggregations... -🎯 Constructing Mohanram G-score components... - Computing profitability and cash flow signals... - Computing naive extrapolation (volatility) measures... - Computing conservatism (intensity) measures... -📅 Applying timing logic... -Generated MS values: 473,079 observations -MS summary stats: - Mean: 3.8112 - Std: 1.5210 - Min: 1 - Max: 6 -💾 Saving MS predictor... 
-saving MS -Saved 473079 rows to ../pyData/Predictors/MS.csv -✅ MS.csv saved successfully -============================================================ -✅ Completed: MS.py -Execution time: 3.19 seconds - - -🔄 Starting: NetDebtFinance.py -============================================================ -Starting NetDebtFinance calculation... -Loaded m_aCompustat data: 3625491 observations -After deduplicating by permno time_avail_m: 3625491 observations -NetDebtFinance calculated for 2797645 observations -Final output: 2797645 observations -NetDebtFinance.csv saved successfully -============================================================ -✅ Completed: NetDebtFinance.py -Execution time: 2.86 seconds - - -🔄 Starting: NetDebtPrice.py -============================================================ -NetDebtPrice: Saved 1,425,639 observations -============================================================ -✅ Completed: NetDebtPrice.py -Execution time: 3.92 seconds - - -🔄 Starting: NetEquityFinance.py -============================================================ -Starting NetEquityFinance calculation... -Loaded m_aCompustat data: 3625491 observations -After deduplicating by permno time_avail_m: 3625491 observations -NetEquityFinance calculated for 2889052 observations -Final output: 2889052 observations -NetEquityFinance.csv saved successfully -============================================================ -✅ Completed: NetEquityFinance.py -Execution time: 2.97 seconds - - -🔄 Starting: NetPayoutYield.py -============================================================ -NetPayoutYield predictor created with 1818920 observations -============================================================ -✅ Completed: NetPayoutYield.py -Execution time: 3.85 seconds - - -🔄 Starting: NOA.py -============================================================ -Starting NOA calculation... 
-Loaded m_aCompustat data: 3625491 observations -After deduplicating by permno time_avail_m: 3625491 observations -NOA calculated for 3213348 observations -Final output: 3213348 observations -NOA.csv saved successfully -============================================================ -✅ Completed: NOA.py -Execution time: 3.22 seconds - - -🔄 Starting: NumEarnIncrease.py -============================================================ -NumEarnIncrease: 2823459 observations saved -============================================================ -✅ Completed: NumEarnIncrease.py -Execution time: 6.06 seconds - - -🔄 Starting: OperProf.py -============================================================ -OperProf: Saved 1,407,793 observations -============================================================ -✅ Completed: OperProf.py -Execution time: 2.52 seconds - - -🔄 Starting: OperProfRD.py -============================================================ -OperProfRD predictor saved successfully -============================================================ -✅ Completed: OperProfRD.py -Execution time: 3.91 seconds - - -🔄 Starting: OPLeverage.py -============================================================ -Loading m_aCompustat data... 
-Loaded 3625491 observations -After removing duplicates: 3625491 observations -Calculated OPLeverage for shape: (1, 8) -┌─────────┬──────────────┬─────────┬─────────┬─────────┬─────────┬──────────┬────────────┐ -│ permno ┆ time_avail_m ┆ gvkey ┆ xsga ┆ cogs ┆ at ┆ tempxsga ┆ OPLeverage │ -│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ -│ u32 ┆ u32 ┆ u32 ┆ u32 ┆ u32 ┆ u32 ┆ u32 ┆ u32 │ -╞═════════╪══════════════╪═════════╪═════════╪═════════╪═════════╪══════════╪════════════╡ -│ 3609010 ┆ 3609010 ┆ 3609010 ┆ 3018224 ┆ 3609010 ┆ 3609010 ┆ 3609010 ┆ 3609010 │ -└─────────┴──────────────┴─────────┴─────────┴─────────┴─────────┴──────────┴────────────┘ observations -saving OPLeverage -Saved 3609010 rows to ../pyData/Predictors/OPLeverage.csv -OPLeverage predictor completed successfully -============================================================ -✅ Completed: OPLeverage.py -Execution time: 0.96 seconds - - -🔄 Starting: OrderBacklog.py -============================================================ -OrderBacklog predictor saved successfully -============================================================ -✅ Completed: OrderBacklog.py -Execution time: 1.92 seconds - - -🔄 Starting: OrderBacklogChg.py -============================================================ -OrderBacklogChg predictor saved successfully -============================================================ -✅ Completed: OrderBacklogChg.py -Execution time: 1.87 seconds - - -🔄 Starting: OScore.py -============================================================ -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/.venv/lib/python3.13/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: divide by zero encountered in log - result = getattr(ufunc, method)(*inputs, **kwargs) -OScore predictor saved successfully -============================================================ -✅ Completed: OScore.py -Execution time: 4.59 seconds - - -🔄 Starting: PatentsRD.py 
-============================================================ -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/Predictors/PatentsRD.py:91: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning. - df = df.groupby('gvkey').apply(lambda x: x.iloc[2:] if len(x) > 2 else x.iloc[0:0]).reset_index(drop=True) -PatentsRD predictor saved successfully -============================================================ -✅ Completed: PatentsRD.py -Execution time: 52.50 seconds - - -🔄 Starting: PayoutYield.py -============================================================ -PayoutYield predictor created with 1419513 observations -============================================================ -✅ Completed: PayoutYield.py -Execution time: 3.52 seconds - - -🔄 Starting: PctAcc.py -============================================================ -PctAcc predictor saved successfully -============================================================ -✅ Completed: PctAcc.py -Execution time: 4.25 seconds - - -🔄 Starting: PctTotAcc.py -============================================================ -PctTotAcc predictor saved successfully -============================================================ -✅ Completed: PctTotAcc.py -Execution time: 3.26 seconds - - -🔄 Starting: Price.py -============================================================ -Price: Saved 4,029,252 observations -============================================================ -✅ Completed: Price.py -Execution time: 3.61 seconds - - -🔄 Starting: ProbInformedTrading.py -============================================================ -ProbInformedTrading predictor saved with 24028 observations 
-============================================================ -✅ Completed: ProbInformedTrading.py -Execution time: 1.54 seconds - - -🔄 Starting: PS.py -============================================================ -Starting PS.py... -Loading m_aCompustat data... -Merging with SignalMasterTable... -Loaded and merged data: 3041661 rows -Setting up panel data structure and calculating Piotroski score... -Calculated PS for 464239 observations -saving PS -Saved 464239 rows to ../pyData/Predictors/PS.csv -PS.py completed successfully -============================================================ -✅ Completed: PS.py -Execution time: 6.12 seconds - - -🔄 Starting: RD.py -============================================================ -RD: Saved 1,419,157 observations -============================================================ -✅ Completed: RD.py -Execution time: 1.88 seconds - - -🔄 Starting: RDAbility.py -============================================================ -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/Predictors/RDAbility.py:146: DeprecationWarning: the argument `min_periods` for `Expr.rolling_mean` is deprecated. It was renamed to `min_samples` in version 1.21.0. - "mean": lambda col: col.rolling_mean( -================================================================================ -RDAbility.py -Generating R&D ability predictor using Cohen, Diether and Malloy (2013) methodology -================================================================================ -Loading a_aCompustat data... -Loaded Compustat: 302,326 observations -Expanding to monthly observations... -Before expansion: 302,326 observations -After expansion: 3,627,912 observations -Applying final filters... 
-After gvkey-time filter: 3,625,613 observations -After permno-time filter: 3,625,491 observations -Generated RDAbility values: 3,625,491 observations -Non-null RDAbility: 181,016 observations -RDAbility summary stats: - Mean: 0.474592 - Std: 5.419300 - Min: -170.731482 - Max: 125.476431 -Saving RDAbility predictor... -saving RDAbility -Saved 181016 rows to ../pyData/Predictors/RDAbility.csv -RDAbility.csv saved successfully -============================================================ -✅ Completed: RDAbility.py -Execution time: 12.25 seconds - - -🔄 Starting: RDcap.py -============================================================ -saving RDcap -Saved 537864 rows to ../pyData/Predictors/RDcap.csv -RDcap predictor saved successfully -============================================================ -✅ Completed: RDcap.py -Execution time: 2.55 seconds - - -🔄 Starting: RDIPO.py -============================================================ -RDIPO predictor saved successfully -============================================================ -✅ Completed: RDIPO.py -Execution time: 3.00 seconds - - -🔄 Starting: RDS.py -============================================================ -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/Predictors/RDS.py:84: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)` - (df['recta_orig_missing'] & df['l12_recta_orig_missing'].fillna(True).infer_objects(copy=False)) & -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/Predictors/RDS.py:85: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. 
To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)` - (df['msa_orig_missing'] & df['l12_msa_orig_missing'].fillna(True).infer_objects(copy=False)) -RDS: Saved 2,816,659 observations -============================================================ -✅ Completed: RDS.py -Execution time: 27.76 seconds - - -🔄 Starting: realestate.py -============================================================ -realestate predictor saved successfully -============================================================ -✅ Completed: realestate.py -Execution time: 4.00 seconds - - -🔄 Starting: Recomm_ShortInterest.py -============================================================ -================================================================================ -🏗️ Recomm_ShortInterest.py -Generating Recommendation and Short Interest predictor -================================================================================ -📊 Loading IBES Recommendations data... -Loaded IBES Recommendations: 864,089 observations -After taking mean recommendation within each stock-month: 1,302,063 observations -📊 Loading SignalMasterTable, CRSP, and Short Interest data... -Loaded SignalMasterTable: 3,041,661 observations -SignalMasterTable merged with CRSP: 3,041,661 observations -After merging with short interest: 1,577,922 observations -After merging with recommendations: 792,823 observations -🧮 Signal construction... -📊 Computing quintiles using stata_fastxtile... ---- Signal summary --- -Signal distribution: shape: (3, 2) -┌──────────────────────┬────────┐ -│ Recomm_ShortInterest ┆ count │ -│ --- ┆ --- │ -│ i32 ┆ u32 │ -╞══════════════════════╪════════╡ -│ 1 ┆ 41321 │ -│ 0 ┆ 26442 │ -│ null ┆ 725060 │ -└──────────────────────┴────────┘ -Time period: -1993-10-01 to 2024-12-01 -💾 Saving Recomm_ShortInterest predictor... 
-saving Recomm_ShortInterest -Saved 67763 rows to ../pyData/Predictors/Recomm_ShortInterest.csv -✅ Recomm_ShortInterest.csv saved successfully -============================================================ -✅ Completed: Recomm_ShortInterest.py -Execution time: 3.18 seconds - - -🔄 Starting: retConglomerate.py -============================================================ -Starting retConglomerate predictor (rewritten from scratch)... -Loading CCMLinkingTable... -Initial crosswalk shape: (31890, 4) -tempCW shape after destring: (31890, 4) -Loading monthlyCRSP... -Initial CRSP shape: (5153763, 3) -tempCRSP shape: (5153763, 3) -Loading a_aCompustat... -Initial compustat annual shape: (302326, 4) -tempCS shape after filtering: (301502, 4) -Loading CompustatSegments... -Initial segments shape: (2766341, 5) -After stype filter: (1417450, 5) -After sales filter: (1370647, 5) -Identifying conglomerates... -After collapse: (535678, 4) -After merge with tempCS: (395587, 7) -Columns after merge: ['gvkey', 'sic2D', 'datadate', 'sales', 'fyear', 'permno', 'saleACS'] -Industry count distribution:\ntempNInd -1 165217 -2 88404 -3 78291 -4 38856 -5 15370 -Name: count, dtype: int64 -After dropping missing Conglomerate: (232612, 11) -Conglomerate distribution:\nConglomerate -0.0 159834 -1.0 72778 -Name: count, dtype: int64 -Calculating industry returns from stand-alones... -Stand-alone segments shape: (159834, 11) -After merge with tempCW: (191296, 14) -Columns after merge: ['gvkey', 'sic2D', 'datadate', 'sales', 'fyear', 'permno_x', 'saleACS', 'temptotalSales', 'tempCSSegmentShare', 'tempNInd', 'Conglomerate', 'permno_y', 'timeLinkStart_d', 'timeLinkEnd_d'] -Valid links: 159,008 out of 191,296 -After link validity filter: (159008, 14) -After duplicates drop: (158797, 3) -After merge with tempCRSP: (40258781, 5) -After year filter: (1844601, 6) -Industry returns shape: (38392, 3) -Constructing conglomerate returns signal... 
-Conglomerate segments shape: (72778, 11) -After dropping missing sic2DCSS: (72502, 4) -After merge with industry returns: (42072631, 6) -After year filter: (866190, 7) -Weight distribution:\ncount 864726.000000 -mean 0.877258 -std 0.227946 -min 0.000000 -25% 1.000000 -50% 1.000000 -75% 1.000000 -max 1.000000 -Name: tempweight, dtype: float64 -Number of observations with weight < 1: 209516 -Final result shape: (759500, 3) -Saving predictor... -saving retConglomerate -Saved 759500 rows to ../pyData/Predictors/retConglomerate.csv -retConglomerate predictor completed successfully! -============================================================ -✅ Completed: retConglomerate.py -Execution time: 5.30 seconds - - -🔄 Starting: ReturnSkew.py -============================================================ -================================================================================ -🏗️ ReturnSkew.py -Generating ReturnSkew predictor (skewness of daily returns) -================================================================================ -📊 Loading daily CRSP data... -Loaded CRSP: 107,662,961 daily observations - -🔧 Starting signal construction... -Creating time_avail_m (year-month identifier)... -Date range: 1926-01-02 00:00:00 to 2024-12-31 00:00:00 -Calculating return skewness by permno-month... -Generated 5,116,571 permno-month observations before filtering -Filtering to permno-months with >=15 observations... -After >=15 filter: 5,072,664 observations - -📈 Predictor summary statistics: -shape: (1, 4) -┌─────────────────┬────────────────┬────────────────┬────────────────┐ -│ ReturnSkew_mean ┆ ReturnSkew_std ┆ ReturnSkew_min ┆ ReturnSkew_max │ -│ --- ┆ --- ┆ --- ┆ --- │ -│ f64 ┆ f64 ┆ f64 ┆ f64 │ -╞═════════════════╪════════════════╪════════════════╪════════════════╡ -│ NaN ┆ NaN ┆ -4.902903 ┆ 4.902903 │ -└─────────────────┴────────────────┴────────────────┴────────────────┘ - -💾 Saving predictor... 
-saving ReturnSkew -Saved 4952730 rows to ../pyData/Predictors/ReturnSkew.csv - -================================================================================ -✅ ReturnSkew.py completed successfully -Generated 1 predictor: - • ReturnSkew: Return Skewness -================================================================================ -============================================================ -✅ Completed: ReturnSkew.py -Execution time: 5.00 seconds - - -🔄 Starting: REV6.py -============================================================ -Loading and processing REV6... -REV6 predictor saved successfully -============================================================ -✅ Completed: REV6.py -Execution time: 3.95 seconds - - -🔄 Starting: RevenueSurprise.py -============================================================ -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/.venv/lib/python3.13/site-packages/pandas/core/nanops.py:1016: RuntimeWarning: invalid value encountered in subtract - sqr = _ensure_numeric((avg - values) ** 2) -RevenueSurprise predictor created with 2107507 observations -============================================================ -✅ Completed: RevenueSurprise.py -Execution time: 11.47 seconds - - -🔄 Starting: roaq.py -============================================================ -roaq: Saved 2,490,872 observations -============================================================ -✅ Completed: roaq.py -Execution time: 3.66 seconds - - -🔄 Starting: RoE.py -============================================================ -RoE: Saved 3,528,982 observations -============================================================ -✅ Completed: RoE.py -Execution time: 3.46 seconds - - -🔄 Starting: sfe.py -============================================================ -sfe predictor saved successfully -============================================================ -✅ Completed: sfe.py -Execution time: 22.18 seconds - - -🔄 Starting: ShareIss1Y.py 
-============================================================ -Starting ShareIss1Y calculation... -After merge: 4047630 observations -ShareIss1Y calculated for 3517511 observations -Final output: 3517511 observations -ShareIss1Y.csv saved successfully -============================================================ -✅ Completed: ShareIss1Y.py -Execution time: 5.26 seconds - - -🔄 Starting: ShareIss5Y.py -============================================================ -Starting ShareIss5Y calculation... -After merge: 4047630 observations -ShareIss5Y calculated for 2508021 observations -Final output: 2508021 observations -ShareIss5Y.csv saved successfully -============================================================ -✅ Completed: ShareIss5Y.py -Execution time: 4.53 seconds - - -🔄 Starting: ShareRepurchase.py -============================================================ -ShareRepurchase predictor created with 3625491 observations -============================================================ -✅ Completed: ShareRepurchase.py -Execution time: 1.73 seconds - - -🔄 Starting: ShareVol.py -============================================================ -Starting ShareVol.py... -Loading SignalMasterTable... -Loaded SignalMasterTable: 4047630 rows -Merging with monthly CRSP data... -After merge: 4047630 rows -Setting up panel data structure... -Creating lag variables for volume calculation... -Calculating ShareVol signal... -Calculated ShareVol for 1906936 observations -saving ShareVol -Saved 1661295 rows to ../pyData/Predictors/ShareVol.csv -ShareVol.py completed successfully -============================================================ -✅ Completed: ShareVol.py -Execution time: 0.79 seconds - - -🔄 Starting: ShortInterest.py -============================================================ -Starting ShortInterest calculation... 
-After dropping missing gvkey: 3041661 observations -After merge with monthlyCRSP: 3041661 observations -After merge with monthlyShortInterest: 1577922 observations -ShortInterest calculated for 1577922 observations -Final output: 1577922 observations -ShortInterest.csv saved successfully -============================================================ -✅ Completed: ShortInterest.py -Execution time: 3.21 seconds - - -🔄 Starting: sinAlgo.py -============================================================ -Starting sinAlgo.py... -Loading Compustat segments data... -Loaded segments data: 2766341 rows -Identifying sin segments... -Loading SignalMasterTable... -Loaded SignalMasterTable: 4047630 rows -Merging with Compustat annual data... -After merge: 4047630 rows -Identifying sin stocks... -Calculating sinAlgo signal... -Calculated sinAlgo for 233996 observations -saving sinAlgo -Saved 233996 rows to ../pyData/Predictors/sinAlgo.csv -sinAlgo.py completed successfully -============================================================ -✅ Completed: sinAlgo.py -Execution time: 30.95 seconds - - -🔄 Starting: Size.py -============================================================ -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/.venv/lib/python3.13/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: divide by zero encountered in log - result = getattr(ufunc, method)(*inputs, **kwargs) -Size: Saved 4,029,252 observations -============================================================ -✅ Completed: Size.py -Execution time: 2.87 seconds - - -🔄 Starting: skew1.py -============================================================ -skew1 predictor saved successfully -============================================================ -✅ Completed: skew1.py -Execution time: 1.12 seconds - - -🔄 Starting: SmileSlope.py -============================================================ -saving SmileSlope -Saved 859994 rows to ../pyData/Predictors/SmileSlope.csv 
-============================================================ -✅ Completed: SmileSlope.py -Execution time: 1.16 seconds - - -🔄 Starting: SP.py -============================================================ -SP: Saved 3,030,928 observations -============================================================ -✅ Completed: SP.py -Execution time: 3.35 seconds - - -🔄 Starting: Spinoff.py -============================================================ -Spinoff predictor saved successfully -============================================================ -✅ Completed: Spinoff.py -Execution time: 2.47 seconds - - -🔄 Starting: std_turn.py -============================================================ -Starting std_turn.py... -Loading data... -saving std_turn -Saved 2200763 rows to ../pyData/Predictors/std_turn.csv -std_turn.py completed successfully -============================================================ -✅ Completed: std_turn.py -Execution time: 1.43 seconds - - -🔄 Starting: STreversal.py -============================================================ -Starting STreversal.py... -Loading SignalMasterTable... -Loaded data: 4047630 rows -Calculating STreversal... 
-Calculated STreversal for 4047630 observations -saving STreversal -Saved 4047630 rows to ../pyData/Predictors/STreversal.csv -STreversal.py completed successfully -============================================================ -✅ Completed: STreversal.py -Execution time: 0.44 seconds - - -🔄 Starting: SurpriseRD.py -============================================================ -SurpriseRD predictor saved successfully -============================================================ -✅ Completed: SurpriseRD.py -Execution time: 2.18 seconds - - -🔄 Starting: tang.py -============================================================ -tang: Saved 1,517,875 observations -============================================================ -✅ Completed: tang.py -Execution time: 2.96 seconds - - -🔄 Starting: Tax.py -============================================================ -Tax: Saved 3,213,292 observations -============================================================ -✅ Completed: Tax.py -Execution time: 2.95 seconds - - -🔄 Starting: TotalAccruals.py -============================================================ -TotalAccruals predictor saved successfully -============================================================ -✅ Completed: TotalAccruals.py -Execution time: 4.47 seconds - - -🔄 Starting: TrendFactor.py -============================================================ -================================================================================ -🏗️ TrendFactor.py -Generating TrendFactor predictor using daily data with moving averages -================================================================================ -📊 Loading daily CRSP data... -Daily CRSP data: 107,662,961 observations -🔄 Computing adjusted prices and time variables... -📈 Computing moving averages for 11 different lags... - Computing 3-day moving average... - Computing 5-day moving average... - Computing 10-day moving average... - Computing 20-day moving average... - Computing 50-day moving average... 
- Computing 100-day moving average... - Computing 200-day moving average... - Computing 400-day moving average... - Computing 600-day moving average... - Computing 800-day moving average... - Computing 1000-day moving average... -📅 Keeping only end-of-month observations... -Monthly data after filtering: 5,116,571 observations -📊 Creating monthly data with future returns and moving averages for regressions -🎯 Computing size deciles based on NYSE stocks... -🔍 Applying filters for regression sample... -After applying filters: 2,077,308 observations -After merging moving averages: 2,076,870 observations -Preparing for cross-sectional regression... -Carefully creating fRet with left join -🔧 asreg regressions with asreg_collinear -Warning: could not process group 2024-12-01 00:00:00: No rows left after applying estimation-sample mask. -📊 Computing 12-month rolling averages of beta coefficients... -🎯 Computing TrendFactor as the smoothed regression model predictions -Generated TrendFactor values: 2,058,243 observations -💾 Saving TrendFactor predictor... -saving TrendFactor -Saved 2058243 rows to ../pyData/Predictors/TrendFactor.csv -✅ TrendFactor.csv saved successfully -🎉 TrendFactor computation completed! -============================================================ -✅ Completed: TrendFactor.py -Execution time: 128.04 seconds - - -🔄 Starting: UpRecomm.py -============================================================ -UpRecomm predictor created with 464223 observations -============================================================ -✅ Completed: UpRecomm.py -Execution time: 12.44 seconds - - -🔄 Starting: VarCF.py -============================================================ -Calculating rolling statistics for 29339 firms... 
-Rolling statistics calculation completed -VarCF predictor saved successfully -============================================================ -✅ Completed: VarCF.py -Execution time: 4.91 seconds - - -🔄 Starting: VolMkt.py -============================================================ -Starting VolMkt.py... -Loading monthly CRSP data... -Loaded data: 5153763 rows -Calculating market value and dollar volume... -Creating 12-month rolling mean of dollar volume... -Calculated VolMkt for 5153763 observations -saving VolMkt -Saved 4361398 rows to ../pyData/Predictors/VolMkt.csv -VolMkt.py completed successfully -============================================================ -✅ Completed: VolMkt.py -Execution time: 0.65 seconds - - -🔄 Starting: VolSD.py -============================================================ -================================================================================ -🏗️ VolSD.py -Creating volume variance predictor using 36-month rolling standard deviation -================================================================================ -📊 Loading monthly CRSP data... -Loaded: 5,153,763 observations -🧮 Computing 36-month rolling volume standard deviation... -💾 Saving VolSD predictor... -saving VolSD -Saved 3922399 rows to ../pyData/Predictors/VolSD.csv -✅ VolSD.csv saved successfully -================================================================================ -✅ VolSD.py Complete -Volume variance predictor generated successfully -================================================================================ -============================================================ -✅ Completed: VolSD.py -Execution time: 0.62 seconds - - -🔄 Starting: VolumeTrend.py -============================================================ -Loading and processing VolumeTrend... -Rolling window regressions of volume on time... -Calculating 60-month rolling mean of vol... 
-saving VolumeTrend -Saved 3677088 rows to ../pyData/Predictors/VolumeTrend.csv -VolumeTrend predictor saved successfully -============================================================ -✅ Completed: VolumeTrend.py -Execution time: 6.41 seconds - - -🔄 Starting: XFIN.py -============================================================ -XFIN predictor saved successfully -============================================================ -✅ Completed: XFIN.py -Execution time: 3.89 seconds - - -🔄 Starting: ZZ0_RealizedVol_IdioVol3F_ReturnSkew3F.py -============================================================ -================================================================================ -🏗️ ZZ0_RealizedVol_IdioVol3F_ReturnSkew3F.py -Generating RealizedVol, IdioVol3F, and ReturnSkew3F predictors -================================================================================ -📊 Loading daily CRSP and Fama-French data... -Loading dailyCRSP.parquet... -Loaded CRSP: 107,662,961 daily observations -Loading dailyFF.parquet... -Loaded FF factors: 26,003 daily observations -Merging CRSP and FF data... -Merged dataset: 107,583,998 observations -Adjusting returns by risk-free rate... - -🔧 Starting signal construction... -Creating time_avail_m (year-month identifier)... -Date range: 1926-07-01 00:00:00 to 2024-12-31 00:00:00 -Running FF3 regressions by permno-month to extract residuals... -Adding _Nobs to track observations used in regression... -⚠️ Warning: 63491 observations with missing residuals (likely singular matrices) -Completed regressions: 105,390,189 observations -Filtering out observations where FF3 regression failed (null residuals)... -After removing null residuals: 105,326,698 observations -Permno-month groups after filtering: 4,980,936 -Calculating predictors using group aggregations... 
-Generated predictors: 4,980,936 permno-month observations - -📈 Predictor summary statistics: -shape: (1, 6) -┌────────────────┬────────────────┬────────────────┬───────────────┬───────────────┬───────────────┐ -│ RealizedVol_me ┆ RealizedVol_st ┆ IdioVol3F_mean ┆ IdioVol3F_std ┆ ReturnSkew3F_ ┆ ReturnSkew3F_ │ -│ an ┆ d ┆ --- ┆ --- ┆ mean ┆ std │ -│ --- ┆ --- ┆ f64 ┆ f64 ┆ --- ┆ --- │ -│ f64 ┆ f64 ┆ ┆ ┆ f64 ┆ f64 │ -╞════════════════╪════════════════╪════════════════╪═══════════════╪═══════════════╪═══════════════╡ -│ 0.029741 ┆ 0.031077 ┆ 0.025296 ┆ 0.028492 ┆ NaN ┆ NaN │ -└────────────────┴────────────────┴────────────────┴───────────────┴───────────────┴───────────────┘ - -💾 Saving predictors... -saving RealizedVol -Saved 4980936 rows to ../pyData/Predictors/RealizedVol.csv -saving IdioVol3F -Saved 4980936 rows to ../pyData/Predictors/IdioVol3F.csv -saving ReturnSkew3F -Saved 4967368 rows to ../pyData/Predictors/ReturnSkew3F.csv - -================================================================================ -✅ ZZ0_RealizedVol_IdioVol3F_ReturnSkew3F.py completed successfully -Generated 3 predictors: - • RealizedVol: Realized (Total) Vol (Daily) - • IdioVol3F: Idiosyncratic Risk (3 factor) - • ReturnSkew3F: Skewness of daily idiosyncratic returns (3F model) -================================================================================ -============================================================ -✅ Completed: ZZ0_RealizedVol_IdioVol3F_ReturnSkew3F.py -Execution time: 87.22 seconds - - -🔄 Starting: ZZ1_Activism1_Activism2.py -============================================================ -Loading SignalMasterTable... -Initial data loaded: 4047630 rows -Merging with TR_13F... -After TR_13F merge: 4047630 rows -Merging with monthlyCRSP... -After monthlyCRSP merge: 4047630 rows -Handling ticker-based merge with GovIndex... -Records with ticker: 4047630 -Records without ticker: 0 -After GovIndex merge and append: 4047630 rows -Constructing Activism1 signal... 
-Calculating block holding quartiles by time_avail_m... -Activism1 signal constructed -Non-missing Activism1 values: 108768 -Constructing Activism2 signal... -Activism2 signal constructed -Non-missing Activism2 values: 30170 -Saving Activism1... -saving Activism1 -Saved 108768 rows to ../pyData/Predictors/Activism1.csv -Saving Activism2... -saving Activism2 -Saved 30170 rows to ../pyData/Predictors/Activism2.csv -Activism1 and Activism2 predictors completed! -============================================================ -✅ Completed: ZZ1_Activism1_Activism2.py -Execution time: 3.09 seconds - - -🔄 Starting: ZZ1_AnalystValue_AOP_PredictedFE_IntrinsicValue.py -============================================================ -================================================================================ -🏗️ ZZ1_AnalystValue_AOP_PredictedFE_IntrinsicValue.py -Generating analyst value predictors: AnalystValue, AOP, PredictedFE. Also the placebo IntrinsicValue. -================================================================================ -📊 Preparing IBES forecast data... -Loading IBES EPS Unadj for FROE1... -FROE1 data: 183,523 observations -Loading IBES EPS Unadj for FROE2... -FROE2 data: 181,697 observations -Loading IBES EPS Unadj for LTG... -LTG data: 1,423,499 observations -📊 Loading main data sources... -SignalMasterTable: 4,047,630 observations -After merging CRSP and Compustat: 4,047,630 observations -After filtering to June observations: 336,692 observations -After merging IBES data: 336,692 observations -🧮 Computing financial variables and screens... -📈 Computing forecast-based equity values... -🔍 Applying data screens... -After applying screens: 108,632 observations -💰 Computing analyst and intrinsic values... -🔮 Computing predicted forecast error... -📅 Expanding to monthly observations... -💾 Saving predictors... 
-Generated AnalystValue: 1,299,504 observations - Mean: 0.780050 - Std: 10.226677 -saving AnalystValue -Saved 1299504 rows to ../pyData/Predictors/AnalystValue.csv -✅ AnalystValue.csv saved successfully -Generated AOP: 1,299,504 observations - Mean: 160.357614 - Std: 46464.774095 -saving AOP -Saved 1299504 rows to ../pyData/Predictors/AOP.csv -✅ AOP.csv saved successfully -Generated PredictedFE: 635,124 observations - Mean: 0.052645 - Std: 0.033105 -saving PredictedFE -Saved 635124 rows to ../pyData/Predictors/PredictedFE.csv -✅ PredictedFE.csv saved successfully -saving IntrinsicValue -Saved 1299504 rows to ../pyData/Placebos/IntrinsicValue.csv -✅ IntrinsicValue.csv saved successfully -🎉 All analyst value predictors completed! -============================================================ -✅ Completed: ZZ1_AnalystValue_AOP_PredictedFE_IntrinsicValue.py -Execution time: 1.82 seconds - - -🔄 Starting: ZZ1_EBM_BPEBM.py -============================================================ -Starting ZZ1_EBM_BPEBM.py... -Loading data... -saving EBM -Saved 2924826 rows to ../pyData/Predictors/EBM.csv -ZZ1_EBM_BPEBM.py completed successfully -saving BPEBM -Saved 2924826 rows to ../pyData/Predictors/BPEBM.csv -ZZ1_EBM_BPEBM.py completed successfully -============================================================ -✅ Completed: ZZ1_EBM_BPEBM.py -Execution time: 1.28 seconds - - -🔄 Starting: ZZ1_FR_FRbook.py -============================================================ -FR (Predictor) and FRbook (Placobo) saved successfully -============================================================ -✅ Completed: ZZ1_FR_FRbook.py -Execution time: 2.93 seconds - - -🔄 Starting: ZZ1_grcapx_grcapx1y_grcapx3y.py -============================================================ -Starting ZZ1_grcapx_grcapx1y_grcapx3y.py... -Loading m_aCompustat data... -Loaded m_aCompustat: 3625491 rows, 6 columns -Removing duplicate permno-time_avail_m observations... 
-Removed 0 duplicate observations -Merging with SignalMasterTable... -After merge: 3041661 rows, 7 columns -Setting up panel data (sorting by permno, time_avail_m)... -Calculating FirmAge... -Calculating tempcrsptime and applying FirmAge restriction... -Creating l12_ppent lag for conditional replacement... -Applying conditional capx replacement... -Creating lags for capx after replacement... -Calculating predictors... -Calculated grcapx for 2444969 observations -Calculated grcapx1y for 2440804 observations -Calculated grcapx3y for 2236619 observations -saving grcapx -Saved 2444969 rows to ../pyData/Predictors/grcapx.csv -saving grcapx1y -Saved 2440804 rows to ../pyData/Placebos/grcapx1y.csv -saving grcapx3y -Saved 2236619 rows to ../pyData/Predictors/grcapx3y.csv -ZZ1_grcapx_grcapx1y_grcapx3y.py completed successfully -============================================================ -✅ Completed: ZZ1_grcapx_grcapx1y_grcapx3y.py -Execution time: 2.56 seconds - - -🔄 Starting: ZZ1_IntanBM_IntanSP_IntanCFP_IntanEP.py -============================================================ -Starting ZZ1_IntanBM_IntanSP_IntanCFP_IntanEP.py -Loading data... -Data loaded. Shape: (3041661, 10) -Constructing signals... -Calculating cumulative returns... -Calculating 60-month calendar-based lag for cumulative returns... -Processing tempAccBM... - Calculating 60-month calendar-based lag for tempAccBM... -Processing tempAccSP... - Calculating 60-month calendar-based lag for tempAccSP... -Processing tempAccCFP... - Calculating 60-month calendar-based lag for tempAccCFP... -Processing tempAccEP... - Calculating 60-month calendar-based lag for tempAccEP... -Regressions completed. -Saving IntanBM... -Saved IntanBM with 1728573 observations -Saving IntanSP... -Saved IntanSP with 1876808 observations -Saving IntanCFP... -Saved IntanCFP with 1881252 observations -Saving IntanEP... -Saved IntanEP with 1881252 observations -ZZ1_IntanBM_IntanSP_IntanCFP_IntanEP.py completed successfully! 
-============================================================ -✅ Completed: ZZ1_IntanBM_IntanSP_IntanCFP_IntanEP.py -Execution time: 29.20 seconds - - -🔄 Starting: ZZ1_OptionVolume1_OptionVolume2.py -============================================================ -Starting ZZ1_OptionVolume1_OptionVolume2.py... -Loading data... -saving OptionVolume1 -Saved 852949 rows to ../pyData/Predictors/OptionVolume1.csv -ZZ1_OptionVolume1_OptionVolume2.py completed successfully -saving OptionVolume2 -Saved 841828 rows to ../pyData/Predictors/OptionVolume2.csv -ZZ1_OptionVolume1_OptionVolume2.py completed successfully -============================================================ -✅ Completed: ZZ1_OptionVolume1_OptionVolume2.py -Execution time: 1.93 seconds - - -🔄 Starting: ZZ1_OrgCap_OrgCapNoAdj.py -============================================================ -Loading data files... -After filtering: 1,583,658 observations -Applying recursive organizational capital formula... -After OrgCapNoAdj calculation: 1,343,203 non-missing values -After FF17 classification: 1,465,660 observations -Final OrgCap values: 1,243,528 non-missing -saving OrgCap -Saved 1243528 rows to ../pyData/Predictors/OrgCap.csv -saving OrgCapNoAdj -Saved 1243563 rows to ../pyData/Predictors/OrgCapNoAdj.csv -OrgCap and OrgCapNoAdj calculation completed successfully! -============================================================ -✅ Completed: ZZ1_OrgCap_OrgCapNoAdj.py -Execution time: 91.72 seconds - - -🔄 Starting: ZZ1_ResidualMomentum6m_ResidualMomentum.py -============================================================ -================================================================================ -🏗️ ZZ1_ResidualMomentum6m_ResidualMomentum.py -Generating ResidualMomentum6m and ResidualMomentum predictors -================================================================================ -📊 Loading monthly CRSP and Fama-French data... -Loading monthlyCRSP.parquet... 
-Loaded CRSP: 5,153,763 monthly observations -Loading monthlyFF.parquet... -Loaded FF factors: 1,187 monthly observations -Merging CRSP and FF data... -Merged dataset: 5,150,010 observations - -🔧 Starting signal construction... -Calculating excess returns (retrf = ret - rf)... -Creating time_temp position index by permno... -Running rolling 36-observation FF3 regressions by permno using direct polars-ols helper... -Processing 38835 unique permnos... -Completed rolling regressions for 5,150,010 observations -Calculating lagged residuals and momentum signals... -Calculating 6-observation and 11-observation rolling momentum signals... - -📈 Signal summary statistics: -ResidualMomentum6m - Mean: -0.0597, Std: 0.5126 -ResidualMomentum - Mean: -0.0383, Std: 0.3300 -Non-missing ResidualMomentum6m: 3,601,799 -Non-missing ResidualMomentum: 3,458,602 - -💾 Saving signals... -saving ResidualMomentum6m -Saved 3601799 rows to ../pyData/Placebos/ResidualMomentum6m.csv -saving ResidualMomentum -Saved 3458602 rows to ../pyData/Predictors/ResidualMomentum.csv - -================================================================================ -✅ ZZ1_ResidualMomentum6m_ResidualMomentum.py completed successfully -Generated 2 signals: - • ResidualMomentum6m: 6 month residual momentum (Placebo) - • ResidualMomentum: Momentum based on FF3 residuals (Predictor) -================================================================================ -============================================================ -✅ Completed: ZZ1_ResidualMomentum6m_ResidualMomentum.py -Execution time: 8.04 seconds - - -🔄 Starting: ZZ1_RIO_MB_RIO_Disp_RIO_Turnover_RIO_Volatility.py -============================================================ -================================================================================ -🏗️ ZZ1_RIO_MB_RIO_Disp_RIO_Turnover_RIO_Volatility.py -Generating Real Investment Opportunities (RIO) predictors -================================================================================ -📊 
Preparing IBES data... -IBES EPS data: 2,382,154 observations -📊 Loading main data sources... -SignalMasterTable: 4,047,630 observations -After merging all data sources: 4,047,630 observations -🔍 Applying size filters... -After filtering bottom size quintile: 2,620,054 observations -🏛️ Computing Residual Institutional Ownership (RIO)... -📊 Computing characteristic variables... -🏷️ Creating characteristic quintiles and RIO interactions... -💾 Saving RIO predictors... -Generated RIO_MB: 354,474 observations - Value distribution: -shape: (5, 2) -┌────────┬───────┐ -│ RIO_MB ┆ count │ -│ --- ┆ --- │ -│ f64 ┆ u32 │ -╞════════╪═══════╡ -│ 1.0 ┆ 79108 │ -│ 2.0 ┆ 82962 │ -│ 3.0 ┆ 76270 │ -│ 4.0 ┆ 65044 │ -│ 5.0 ┆ 51090 │ -└────────┴───────┘ -saving RIO_MB -Saved 354474 rows to ../pyData/Predictors/RIO_MB.csv -✅ RIO_MB.csv saved successfully -Generated RIO_Disp: 497,742 observations - Value distribution: -shape: (5, 2) -┌──────────┬────────┐ -│ RIO_Disp ┆ count │ -│ --- ┆ --- │ -│ f64 ┆ u32 │ -╞══════════╪════════╡ -│ 1.0 ┆ 42951 │ -│ 2.0 ┆ 59618 │ -│ 3.0 ┆ 106473 │ -│ 4.0 ┆ 138134 │ -│ 5.0 ┆ 150566 │ -└──────────┴────────┘ -saving RIO_Disp -Saved 497742 rows to ../pyData/Predictors/RIO_Disp.csv -✅ RIO_Disp.csv saved successfully -Generated RIO_Turnover: 445,570 observations - Value distribution: -shape: (5, 2) -┌──────────────┬────────┐ -│ RIO_Turnover ┆ count │ -│ --- ┆ --- │ -│ f64 ┆ u32 │ -╞══════════════╪════════╡ -│ 1.0 ┆ 56529 │ -│ 2.0 ┆ 85650 │ -│ 3.0 ┆ 99589 │ -│ 4.0 ┆ 96332 │ -│ 5.0 ┆ 107470 │ -└──────────────┴────────┘ -saving RIO_Turnover -Saved 445570 rows to ../pyData/Predictors/RIO_Turnover.csv -✅ RIO_Turnover.csv saved successfully -Generated RIO_Volatility: 470,257 observations - Value distribution: -shape: (5, 2) -┌────────────────┬────────┐ -│ RIO_Volatility ┆ count │ -│ --- ┆ --- │ -│ f64 ┆ u32 │ -╞════════════════╪════════╡ -│ 1.0 ┆ 56810 │ -│ 2.0 ┆ 64167 │ -│ 3.0 ┆ 97960 │ -│ 4.0 ┆ 120450 │ -│ 5.0 ┆ 130870 │ -└────────────────┴────────┘ -saving 
RIO_Volatility -Saved 470257 rows to ../pyData/Predictors/RIO_Volatility.csv -✅ RIO_Volatility.csv saved successfully -🎉 All RIO predictors completed! -============================================================ -✅ Completed: ZZ1_RIO_MB_RIO_Disp_RIO_Turnover_RIO_Volatility.py -Execution time: 31.97 seconds - - -🔄 Starting: ZZ1_RIVolSpread.py -============================================================ -Starting ZZ1_RIVolSpread.py... -Loading data... -saving RIVolSpread -Saved 748931 rows to ../pyData/Predictors/RIVolSpread.csv -ZZ1_RIVolSpread.py completed successfully -============================================================ -✅ Completed: ZZ1_RIVolSpread.py -Execution time: 4.31 seconds - - -🔄 Starting: ZZ1_zerotrade_zerotradeAlt1_zerotradeAlt12.py -============================================================ -Starting ZZ1_zerotrade_zerotradeAlt1_zerotradeAlt12.py... -Loading data... -============================================================ -✅ Completed: ZZ1_zerotrade_zerotradeAlt1_zerotradeAlt12.py -Execution time: 21.84 seconds - - -🔄 Starting: ZZ2_AbnormalAccruals_AbnormalAccrualsPercent.py -============================================================ -================================================================================ -🏗️ ZZ2_AbnormalAccruals_AbnormalAccrualsPercent.py -Generating Abnormal Accruals predictor using Xie (2001) methodology -================================================================================ -📊 Loading a_aCompustat data... -Loaded a_aCompustat: 302,326 observations -After merging with SignalMasterTable: 302,326 observations -🧮 Computing abnormal accruals following Xie (2001)... -📊 Applying winsorization at 0.1% and 99.9% levels... -🏭 Running cross-sectional regressions by year and industry (SIC2)... -After cross-sectional regressions and filtering: 264,668 observations -📅 Expanding to permno-monthly observations... -💾 Saving AbnormalAccruals predictor... 
-saving AbnormalAccruals -Saved 2685478 rows to ../pyData/Predictors/AbnormalAccruals.csv -✅ AbnormalAccruals.csv saved successfully -💾 Saving AbnormalAccrualsPercent predictor... -saving AbnormalAccrualsPercent -Saved 2629101 rows to ../pyData/Placebos/AbnormalAccrualsPercent.csv -✅ AbnormalAccrualsPercent.csv saved successfully -Generated AbnormalAccruals values: 3,247,004 observations -Non-null AbnormalAccruals: 2,685,478 observations -AbnormalAccruals summary stats: - Mean: -0.001561 - Std: 0.166208 - Min: -8.278991 - Max: 2.811868 -============================================================ -✅ Completed: ZZ2_AbnormalAccruals_AbnormalAccrualsPercent.py -Execution time: 1.66 seconds - - -🔄 Starting: ZZ2_AnnouncementReturn.py -============================================================ -Starting AnnouncementReturn calculation... -Loading CCMLinkingTable... -Loading m_QCompustat... -Loading dailyCRSP... -Merging with crosswalk... -Observations within link validity period: 84955193 out of 131267640 -Merging with dailyFF... -Creating announcement windows... -Collapsing by announcement windows... -Filling missing months... -Columns after reset_index: ['index', 'AnnouncementReturn', 'permno'] -Index name after reset_index: None -Final dataset shape: (2922354, 3) -Saving to pyData/Predictors/AnnouncementReturn.csv... -AnnouncementReturn calculation completed. -============================================================ -✅ Completed: ZZ2_AnnouncementReturn.py -Execution time: 76.20 seconds - - -🔄 Starting: ZZ2_BetaFP.py -============================================================ -================================================================================ -🏗️ ZZ2_BetaFP.py -Generating Frazzini-Pedersen beta using rolling correlations -================================================================================ -📊 Loading daily CRSP and Fama-French data... 
-Loaded CRSP: 107,662,961 daily observations -Loaded FF factors: 26,003 daily observations -Merged dataset: 107,583,998 observations - -🔧 Starting signal construction... -Calculating excess log returns... -Computing 252-day rolling volatilities... -Creating 3-day overlapping returns... -Calculating rolling R-squared (1260-day window, min 500 obs)... -Computing Frazzini-Pedersen beta... - -📅 Converting to monthly frequency... -Aggregating to permno-month level... -Generated predictors: 4,156,049 permno-month observations - -📈 Predictor summary statistics: -shape: (9, 2) -┌────────────┬────────────┐ -│ statistic ┆ BetaFP │ -│ --- ┆ --- │ -│ str ┆ f64 │ -╞════════════╪════════════╡ -│ count ┆ 4.156049e6 │ -│ null_count ┆ 0.0 │ -│ mean ┆ 0.976942 │ -│ std ┆ 0.645869 │ -│ min ┆ 0.0 │ -│ 25% ┆ 0.511331 │ -│ 50% ┆ 0.890906 │ -│ 75% ┆ 1.3149 │ -│ max ┆ 12.562312 │ -└────────────┴────────────┘ - -💾 Saving predictor... -saving BetaFP -Saved 4156049 rows to ../pyData/Predictors/BetaFP.csv - -================================================================================ -✅ ZZ2_BetaFP.py completed successfully -Generated predictor: BetaFP (Frazzini-Pedersen Beta) -================================================================================ -============================================================ -✅ Completed: ZZ2_BetaFP.py -Execution time: 10.89 seconds - - -🔄 Starting: ZZ2_betaVIX.py -============================================================ -Starting ZZ2_betaVIX.py... -Loading data... 
-saving betaVIX -Saved 3553481 rows to ../pyData/Predictors/betaVIX.csv -ZZ2_betaVIX.py completed successfully -============================================================ -✅ Completed: ZZ2_betaVIX.py -Execution time: 61.08 seconds - - -🔄 Starting: ZZ2_IdioVolAHT.py -============================================================ -/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/Signals/pyCode/Predictors/ZZ2_IdioVolAHT.py:84: DeprecationWarning: the argument `min_periods` for `Expr.rolling_mean` is deprecated. It was renamed to `min_samples` in version 1.21.0. - .rolling_mean(window_size=252, min_periods=100) -Starting ZZ2_IdioVolAHT.py... -Loading daily CRSP and Fama-French data... -Daily CRSP data: 107662961 rows -Merging with Fama-French factors... -After merge: 107583998 rows -Running 252-day rolling CAPM regressions... -Calculating idiosyncratic volatility... -Calculated IdioVolAHT for 5030419 observations -saving IdioVolAHT -Saved 4670398 rows to ../pyData/Predictors/IdioVolAHT.csv -ZZ2_IdioVolAHT.py completed successfully -============================================================ -✅ Completed: ZZ2_IdioVolAHT.py -Execution time: 142.43 seconds - - -🔄 Starting: ZZ2_PriceDelaySlope_PriceDelayRsq_PriceDelayTstat.py -============================================================ -================================================================================ -🏗️ ZZ2_PriceDelaySlope_PriceDelayRsq_PriceDelayTstat.py -Generating price delay predictors using daily data regressions -================================================================================ -📊 Preparing daily Fama-French data with lags... -Daily FF data with 4 lags: 26,003 observations -📊 Loading daily CRSP data... -Daily CRSP data: 107,662,961 observations -After merging and adjusting returns: 107,583,998 observations - -📊 FULL DATASET MODE: Processing all permnos -📅 Setting up time variables for June regressions... -🏃 Running regressions by group... 
- Filtering groups with minimum 26 observations and data quality checks... -Groups before quality filtering: 461,455 -Groups after quality filtering: 448,811 - Filtered out 12,644 groups with data quality issues -🚀 Processing all valid groups -After filtering for minimum observations and data quality: 106,071,675 observations - Running regressions (one per group)... - Running restricted regressions on 448,811 groups... - Running unrestricted regressions on 448,811 groups... -📊 Extracting coefficients and R-squared values... -📅 Filtering for valid results and June endpoints... -Monthly data after filtering: 415,512 observations -🎯 Constructing price delay signals... -📊 Applying winsorization and time adjustment... -📅 Forward-filling to monthly frequency... - Calculated values: 415,512 observations - Creating complete time series per permno... - Processing 37576 permnos... - Processing batch 1/376... - Processing batch 11/376... - Processing batch 21/376... - Processing batch 31/376... - Processing batch 41/376... - Processing batch 51/376... - Processing batch 61/376... - Processing batch 71/376... - Processing batch 81/376... - Processing batch 91/376... - Processing batch 101/376... - Processing batch 111/376... - Processing batch 121/376... - Processing batch 131/376... - Processing batch 141/376... - Processing batch 151/376... - Processing batch 161/376... - Processing batch 171/376... - Processing batch 181/376... - Processing batch 191/376... - Processing batch 201/376... - Processing batch 211/376... - Processing batch 221/376... - Processing batch 231/376... - Processing batch 241/376... - Processing batch 251/376... - Processing batch 261/376... - Processing batch 271/376... - Processing batch 281/376... - Processing batch 291/376... - Processing batch 301/376... - Processing batch 311/376... - Processing batch 321/376... - Processing batch 331/376... - Processing batch 341/376... - Processing batch 351/376... - Processing batch 361/376... 
- Processing batch 371/376... - Complete grid after tsfill: 4,636,840 observations - Forward-filling missing values within each permno... - After forward-filling: 4,636,840 observations -💾 Saving price delay predictors... -Generated PriceDelaySlope: 4,636,840 observations - Mean: -0.172447 - Std: 251.199930 -saving PriceDelaySlope -Saved 4636840 rows to ../pyData/Predictors/PriceDelaySlope.csv -✅ PriceDelaySlope.csv saved successfully -Generated PriceDelayRsq: 4,636,840 observations - Mean: 0.364095 - Std: 0.327532 -saving PriceDelayRsq -Saved 4636840 rows to ../pyData/Predictors/PriceDelayRsq.csv -✅ PriceDelayRsq.csv saved successfully -Generated PriceDelayTstat: 4,636,840 observations - Mean: 0.633005 - Std: 191.998269 -saving PriceDelayTstat -Saved 4636840 rows to ../pyData/Predictors/PriceDelayTstat.csv -✅ PriceDelayTstat.csv saved successfully -🎉 All price delay predictors completed! -============================================================ -✅ Completed: ZZ2_PriceDelaySlope_PriceDelayRsq_PriceDelayTstat.py -Execution time: 212.69 seconds - -Log completed at 2025-08-31 08:46:37 \ No newline at end of file diff --git a/Signals/Logs/testout_predictors.md b/Signals/Logs/testout_predictors.md index 5186bc18..34f25f0b 100644 --- a/Signals/Logs/testout_predictors.md +++ b/Signals/Logs/testout_predictors.md @@ -1,6 +1,6 @@ # Predictor Validation Results -**Generated**: 2025-08-31 12:47:37 +**Generated**: 2025-09-12 09:31:11 **Configuration**: - TOL_SUPERSET: 1.0% @@ -18,17 +18,229 @@ Numbers report the **FAILURE** rate. ❌ (100.00%) is BAD. 
| Predictor | Python CSV | Superset | NumRows | Precision1 | Precision2 | T-stat | |---------------------------|------------|------------|---------------|--------------|---------------|------------| -| IdioVolAHT | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (1.3E-04) | SKIP | +| AgeIPO | ✅ | NA | NA | NA | NA | NA | +| IndIPO | ✅ | NA | NA | NA | NA | NA | +| OrgCapNoAdj | ✅ | NA | NA | NA | NA | NA | +| RDIPO | ✅ | NA | NA | NA | NA | NA | +| Recomm_ShortInterest* | ✅ | ❌ (58.72%) | ❌ (+95.4%) | ✅ (0.0%) | ✅ (0.0E+00) | ❌ (-0.67) | +| IO_ShortInterest* | ✅ | ❌ (42.47%) | ❌ (+81.8%) | ❌ (1.3%) | ❌ (8.6E-01) | ✅ (+0.00) | +| ChNAnalyst | ✅ | ❌ (26.34%) | ❌ (+76.3%) | ✅ (0.0%) | ✅ (0.0E+00) | ✅ (+0.10) | +| Mom6mJunk* | ✅ | ❌ (12.51%) | ✅ (-11.5%) | ✅ (0.3%) | ❌ (5.6E-01) | ✅ (+0.04) | +| OptionVolume2 | ✅ | ❌ (10.32%) | ✅ (-9.2%) | ❌ (93.9%) | ❌ (1.9E+01) | ❌ (+0.51) | +| OptionVolume1 | ✅ | ❌ (10.08%) | ✅ (-8.9%) | ❌ (94.2%) | ❌ (7.8E+00) | ✅ (-0.07) | +| ShortInterest* | ✅ | ✅ (0.00%) | ❌ (+80.7%) | ❌ (81.9%) | ❌ (2.4E+00) | ✅ (+0.00) | +| PredictedFE* | ✅ | ✅ (0.27%) | ❌ (+29.2%) | ❌ (85.3%) | ❌ (3.1E-01) | ❌ (+0.35) | +| CredRatDG* | ✅ | ✅ (0.00%) | ❌ (+18.8%) | ✅ (0.3%) | ❌ (6.6E+00) | ❌ (-0.39) | +| CompEquIss* | ✅ | ✅ (0.73%) | ❌ (+17.9%) | ✅ (0.0%) | ✅ (2.1E-06) | ✅ (-0.19) | +| BetaFP* | ✅ | ✅ (0.24%) | ❌ (+9.5%) | ❌ (6.3%) | ❌ (8.8E-01) | ✅ (-0.13) | +| Cash* | ✅ | ✅ (0.02%) | ❌ (+7.2%) | ✅ (0.0%) | ✅ (1.5E-07) | ✅ (+0.11) | +| PriceDelayTstat* | ✅ | ✅ (0.00%) | ✅ (+2.5%) | ❌ (98.9%) | ❌ (2.1E+02) | ❌ (+0.33) | +| TrendFactor* | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ❌ (25.1%) | ❌ (9.1E-01) | ✅ (-0.03) | +| MS* | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ❌ (15.5%) | ❌ (1.9E+00) | ❌ (-0.38) | +| AbnormalAccruals* | ✅ | ✅ (0.60%) | ✅ (+4.5%) | ❌ (12.1%) | ❌ (1.1E+00) | ✅ (-0.16) | +| RDAbility* | ✅ | ✅ (0.01%) | ✅ (+4.5%) | ❌ (4.3%) | ❌ (2.2E+00) | ❌ (+0.22) | +| ReturnSkew3F | ✅ | ✅ (0.26%) | ✅ (-0.2%) | ❌ (2.3%) | ❌ (1.8E+00) | ✅ (+0.18) | +| CitationsRD | ✅ | ✅ (0.49%) | ✅ (+2.4%) 
| ❌ (1.7%) | ❌ (2.4E+00) | ✅ (+0.00) | +| MomOffSeason11YrPlus | ✅ | ✅ (0.00%) | ✅ (+1.8%) | ❌ (1.5%) | ❌ (4.6E+00) | ✅ (-0.03) | +| MomOffSeason06YrPlus | ✅ | ✅ (0.00%) | ✅ (+1.9%) | ❌ (1.3%) | ❌ (3.5E+00) | ✅ (+0.02) | +| OrgCap | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ❌ (1.2%) | ✅ (7.7E-02) | ✅ (-0.10) | +| PriceDelayRsq | ✅ | ✅ (0.00%) | ✅ (+0.1%) | ❌ (1.2%) | ❌ (1.9E+00) | ✅ (+0.02) | +| retConglomerate | ✅ | ✅ (0.00%) | ✅ (+0.2%) | ❌ (1.2%) | ❌ (1.7E-01) | ✅ (+0.04) | +| MomOffSeason | ✅ | ✅ (0.00%) | ✅ (+1.2%) | ✅ (0.9%) | ❌ (2.2E+00) | ✅ (-0.04) | +| MomOffSeason16YrPlus | ✅ | ✅ (0.00%) | ✅ (+1.7%) | ✅ (0.7%) | ❌ (9.9E-01) | ✅ (-0.07) | +| iomom_supp | ✅ | ✅ (0.00%) | ✅ (-0.0%) | ✅ (0.7%) | ✅ (3.7E-02) | ✅ (+0.00) | +| ResidualMomentum | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.7%) | ✅ (2.2E-02) | ✅ (-0.10) | +| CustomerMomentum | ✅ | ✅ (0.05%) | ✅ (-0.0%) | ✅ (0.7%) | ❌ (7.2E-01) | ✅ (+0.02) | +| PriceDelaySlope | ✅ | ✅ (0.00%) | ✅ (+0.1%) | ✅ (0.6%) | ✅ (7.0E-02) | ✅ (-0.03) | +| iomom_cust | ✅ | ✅ (0.00%) | ✅ (-0.0%) | ✅ (0.4%) | ✅ (2.2E-02) | ✅ (+0.02) | +| Tax | ✅ | ✅ (0.00%) | ✅ (+0.1%) | ✅ (0.4%) | ✅ (5.3E-02) | ✅ (-0.04) | +| BetaLiquidityPS | ✅ | ✅ (0.00%) | ✅ (+1.6%) | ✅ (0.3%) | ✅ (1.5E-02) | ✅ (+0.09) | +| AnalystValue | ✅ | ✅ (0.22%) | ✅ (+4.4%) | ✅ (0.3%) | ✅ (3.1E-02) | ✅ (-0.09) | +| skew1 | ✅ | ✅ (0.06%) | ✅ (+1.2%) | ✅ (0.2%) | ✅ (7.9E-02) | ❌ (-2.16) | +| MomVol | ✅ | ✅ (0.00%) | ✅ (+0.1%) | ✅ (0.2%) | ❌ (3.5E-01) | ✅ (+0.02) | +| REV6 | ✅ | ✅ (0.17%) | ✅ (+0.0%) | ✅ (0.2%) | ✅ (1.7E-02) | ✅ (-0.14) | +| IntanEP | ✅ | ✅ (0.00%) | ✅ (-0.0%) | ✅ (0.2%) | ✅ (8.1E-02) | ✅ (-0.05) | +| EarnSupBig | ✅ | ✅ (0.16%) | ✅ (+0.4%) | ✅ (0.2%) | ❌ (1.5E+00) | ✅ (-0.12) | +| IntanCFP | ✅ | ✅ (0.00%) | ✅ (-0.0%) | ✅ (0.1%) | ✅ (4.1E-02) | ✅ (-0.07) | +| RIO_Volatility | ✅ | ✅ (0.17%) | ✅ (+0.0%) | ✅ (0.1%) | ❌ (7.5E-01) | ✅ (-0.04) | +| RIO_Turnover | ✅ | ✅ (0.11%) | ✅ (+0.0%) | ✅ (0.1%) | ❌ (7.4E-01) | ✅ (-0.04) | +| LRreversal | ✅ | ✅ (0.00%) | ✅ (+1.2%) | ✅ (0.1%) | ✅ 
(3.2E-02) | ✅ (-0.01) | +| ExclExp | ✅ | ✅ (0.12%) | ✅ (+2.1%) | ✅ (0.1%) | ✅ (8.4E-02) | ✅ (-0.11) | +| Investment | ✅ | ✅ (0.00%) | ✅ (+0.6%) | ✅ (0.1%) | ✅ (3.0E-02) | ✅ (-0.02) | +| DivInit | ✅ | ✅ (0.00%) | ✅ (+1.6%) | ✅ (0.1%) | ❌ (7.3E+00) | ✅ (-0.14) | +| RIO_Disp | ✅ | ✅ (0.23%) | ✅ (+0.1%) | ✅ (0.1%) | ❌ (7.9E-01) | ❌ (-0.20) | +| RIO_MB | ✅ | ✅ (0.03%) | ✅ (+0.1%) | ✅ (0.1%) | ✅ (0.0E+00) | ✅ (-0.01) | +| EarningsForecastDisparity | ✅ | ✅ (0.22%) | ✅ (-0.0%) | ✅ (0.1%) | ✅ (8.0E-07) | ✅ (+0.02) | +| ForecastDispersion | ✅ | ✅ (0.16%) | ✅ (+0.0%) | ✅ (0.1%) | ✅ (7.0E-07) | ✅ (-0.02) | +| fgr5yrLag | ✅ | ✅ (0.22%) | ✅ (-0.0%) | ✅ (0.1%) | ✅ (3.2E-07) | ✅ (+0.00) | +| DivYieldST | ✅ | ✅ (0.00%) | ✅ (+0.6%) | ✅ (0.1%) | ✅ (0.0E+00) | ❌ (-0.81) | +| betaVIX | ✅ | ✅ (0.00%) | ✅ (+1.2%) | ✅ (0.1%) | ✅ (8.1E-03) | ✅ (+0.03) | +| PatentsRD | ✅ | ✅ (0.04%) | ✅ (+0.6%) | ✅ (0.1%) | ✅ (0.0E+00) | ❌ (-0.93) | +| ChangeInRecommendation | ✅ | ✅ (0.23%) | ✅ (+0.1%) | ✅ (0.0%) | ✅ (2.5E-07) | ✅ (-0.01) | +| ExchSwitch | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (0.0E+00) | ✅ (-0.01) | +| EarningsSurprise | ✅ | ✅ (0.02%) | ✅ (-0.0%) | ✅ (0.0%) | ✅ (8.6E-04) | ✅ (+0.01) | +| grcapx | ✅ | ✅ (0.74%) | ✅ (+0.1%) | ✅ (0.0%) | ✅ (2.7E-03) | ❌ (+0.20) | +| realestate | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (8.3E-04) | ✅ (+0.01) | +| GrSaleToGrInv | ✅ | ✅ (0.00%) | ✅ (+0.5%) | ✅ (0.0%) | ✅ (4.2E-03) | ✅ (+0.03) | +| AnalystRevision | ✅ | ✅ (0.16%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (1.4E-07) | ✅ (+0.07) | +| BM | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (2.2E-07) | ✅ (-0.04) | +| DivSeason | ✅ | ✅ (0.00%) | ✅ (+0.6%) | ✅ (0.0%) | ✅ (0.0E+00) | ❌ (-0.47) | +| ChForecastAccrual | ✅ | ✅ (0.18%) | ✅ (+0.1%) | ✅ (0.0%) | ✅ (0.0E+00) | ✅ (+0.04) | +| FEPS | ✅ | ✅ (0.16%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (5.0E-07) | ✅ (+0.03) | +| DownRecomm | ✅ | ✅ (0.23%) | ✅ (+0.1%) | ✅ (0.0%) | ✅ (0.0E+00) | ✅ (-0.05) | +| GP | ✅ | ✅ (0.00%) | ✅ (+0.1%) | ✅ (0.0%) | ✅ (2.4E-07) | ✅ (+0.02) | +| UpRecomm | ✅ | ✅ 
(0.23%) | ✅ (+0.1%) | ✅ (0.0%) | ✅ (0.0E+00) | ✅ (+0.06) | +| sfe | ✅ | ✅ (0.20%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (4.4E-08) | ✅ (-0.13) | +| IdioVol3F | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (5.3E-03) | ✅ (+0.02) | +| DelNetFin | ✅ | ✅ (0.00%) | ✅ (+0.1%) | ✅ (0.0%) | ✅ (6.2E-07) | ✅ (+0.00) | +| RIVolSpread | ✅ | ✅ (0.07%) | ✅ (+1.1%) | ✅ (0.0%) | ✅ (7.9E-07) | ✅ (+0.10) | +| RevenueSurprise | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (4.1E-04) | ✅ (+0.02) | +| roaq | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (1.1E-07) | ✅ (-0.01) | +| SmileSlope | ✅ | ✅ (0.08%) | ✅ (+1.1%) | ✅ (0.0%) | ✅ (8.4E-07) | ❌ (-2.61) | +| GrSaleToGrOverhead | ✅ | ✅ (0.00%) | ✅ (+0.5%) | ✅ (0.0%) | ✅ (2.6E-03) | ✅ (-0.04) | +| GrLTNOA | ✅ | ✅ (0.00%) | ✅ (+0.5%) | ✅ (0.0%) | ✅ (1.4E-07) | ✅ (+0.07) | +| Accruals | ✅ | ✅ (0.00%) | ✅ (+0.5%) | ✅ (0.0%) | ✅ (2.9E-07) | ✅ (-0.03) | +| CPVolSpread | ✅ | ✅ (0.06%) | ✅ (+1.1%) | ✅ (0.0%) | ✅ (1.9E-06) | ❌ (-1.19) | +| dCPVolSpread | ✅ | ✅ (0.09%) | ✅ (+1.3%) | ✅ (0.0%) | ✅ (8.6E-07) | ❌ (-1.00) | +| DelDRC | ✅ | ✅ (0.00%) | ✅ (+0.6%) | ✅ (0.0%) | ✅ (3.2E-07) | ✅ (+0.00) | +| dVolPut | ✅ | ✅ (0.09%) | ✅ (+1.3%) | ✅ (0.0%) | ✅ (7.5E-07) | ✅ (+0.15) | +| dVolCall | ✅ | ✅ (0.09%) | ✅ (+1.3%) | ✅ (0.0%) | ✅ (7.6E-07) | ❌ (-1.90) | +| BetaTailRisk | ✅ | ✅ (0.00%) | ✅ (+1.7%) | ✅ (0.0%) | ✅ (6.3E-03) | ✅ (+0.01) | +| NumEarnIncrease | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (0.0E+00) | ✅ (+0.00) | +| DelLTI | ✅ | ✅ (0.00%) | ✅ (+0.1%) | ✅ (0.0%) | ✅ (6.1E-07) | ✅ (+0.00) | +| ChNNCOA | ✅ | ✅ (0.00%) | ✅ (+0.5%) | ✅ (0.0%) | ✅ (1.9E-07) | ✅ (-0.07) | +| tang | ✅ | ✅ (0.03%) | ✅ (+0.1%) | ✅ (0.0%) | ✅ (4.9E-07) | ✅ (+0.01) | +| IntanBM | ✅ | ✅ (0.00%) | ✅ (-0.0%) | ✅ (0.0%) | ✅ (3.1E-03) | ✅ (-0.01) | +| VolSD | ✅ | ✅ (0.02%) | ✅ (-0.0%) | ✅ (0.0%) | ✅ (1.5E-04) | ✅ (+0.00) | +| hire | ✅ | ✅ (0.00%) | ✅ (+0.1%) | ✅ (0.0%) | ✅ (3.0E-07) | ✅ (-0.03) | +| sinAlgo | ✅ | ✅ (0.00%) | ✅ (+0.2%) | ✅ (0.0%) | ✅ (0.0E+00) | ✅ (+0.07) | +| OPLeverage | ✅ | ✅ (0.00%) | 
✅ (+0.1%) | ✅ (0.0%) | ✅ (2.9E-07) | ✅ (+0.01) | +| IntanSP | ✅ | ✅ (0.00%) | ✅ (-0.0%) | ✅ (0.0%) | ✅ (1.7E-03) | ✅ (+0.03) | +| VolMkt | ✅ | ✅ (0.02%) | ✅ (-0.0%) | ✅ (0.0%) | ✅ (2.2E-05) | ✅ (-0.01) | +| DelFINL | ✅ | ✅ (0.00%) | ✅ (+0.1%) | ✅ (0.0%) | ✅ (4.8E-07) | ✅ (-0.01) | +| DelCOL | ✅ | ✅ (0.00%) | ✅ (+0.5%) | ✅ (0.0%) | ✅ (4.3E-07) | ✅ (+0.05) | +| CompositeDebtIssuance | ✅ | ✅ (0.00%) | ✅ (+0.9%) | ✅ (0.0%) | ✅ (2.0E-07) | ❌ (+0.44) | +| CBOperProf | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (3.7E-07) | ✅ (+0.00) | +| dNoa | ✅ | ✅ (0.00%) | ✅ (+0.1%) | ✅ (0.0%) | ✅ (2.8E-07) | ✅ (+0.01) | +| EarningsStreak | ✅ | ✅ (0.19%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (7.4E-08) | ✅ (-0.03) | +| ChNWC | ✅ | ✅ (0.00%) | ✅ (+0.5%) | ✅ (0.0%) | ✅ (1.5E-07) | ❌ (-0.23) | +| OperProfRD | ✅ | ✅ (0.00%) | ✅ (-0.0%) | ✅ (0.0%) | ✅ (2.9E-07) | ✅ (+0.00) | +| SurpriseRD | ✅ | ✅ (0.00%) | ✅ (+0.5%) | ✅ (0.0%) | ✅ (0.0E+00) | ✅ (+0.07) | +| DelCOA | ✅ | ✅ (0.00%) | ✅ (+0.5%) | ✅ (0.0%) | ✅ (4.5E-07) | ✅ (+0.08) | +| NOA | ✅ | ✅ (0.00%) | ✅ (+0.5%) | ✅ (0.0%) | ✅ (2.0E-07) | ✅ (+0.02) | +| NetDebtFinance | ✅ | ✅ (0.00%) | ✅ (+0.6%) | ✅ (0.0%) | ✅ (2.4E-07) | ✅ (+0.01) | +| XFIN | ✅ | ✅ (0.00%) | ✅ (+0.1%) | ✅ (0.0%) | ✅ (1.6E-07) | ✅ (+0.00) | +| AnnouncementReturn | ✅ | ✅ (0.00%) | ✅ (-0.0%) | ✅ (0.0%) | ✅ (9.7E-04) | ✅ (-0.04) | +| ShareRepurchase | ✅ | ✅ (0.00%) | ✅ (+0.1%) | ✅ (0.0%) | ✅ (0.0E+00) | ✅ (+0.03) | +| ChInv | ✅ | ✅ (0.00%) | ✅ (+0.5%) | ✅ (0.0%) | ✅ (2.7E-07) | ❌ (+0.26) | +| OrderBacklogChg | ✅ | ✅ (0.00%) | ✅ (+0.9%) | ✅ (0.0%) | ✅ (6.0E-07) | ✅ (-0.01) | +| MeanRankRevGrowth | ✅ | ✅ (0.00%) | ✅ (+0.1%) | ✅ (0.0%) | ✅ (1.5E-03) | ✅ (-0.02) | +| TotalAccruals | ✅ | ✅ (0.00%) | ✅ (+0.5%) | ✅ (0.0%) | ✅ (2.3E-07) | ✅ (-0.03) | +| DelEqu | ✅ | ✅ (0.00%) | ✅ (+0.1%) | ✅ (0.0%) | ✅ (2.1E-07) | ✅ (-0.01) | +| ConsRecomm | ✅ | ✅ (0.26%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (0.0E+00) | ✅ (-0.03) | +| NetEquityFinance | ✅ | ✅ (0.00%) | ✅ (+0.5%) | ✅ (0.0%) | ✅ (2.2E-07) | ✅ (+0.01) | +| 
InvestPPEInv | ✅ | ✅ (0.00%) | ✅ (+0.6%) | ✅ (0.0%) | ✅ (1.9E-07) | ✅ (+0.00) | +| AOP | ✅ | ✅ (0.22%) | ✅ (+4.4%) | ✅ (0.0%) | ✅ (8.0E-05) | ✅ (-0.07) | +| OrderBacklog | ✅ | ✅ (0.00%) | ✅ (+0.5%) | ✅ (0.0%) | ✅ (2.9E-07) | ✅ (-0.09) | +| CF | ✅ | ✅ (0.00%) | ✅ (+0.5%) | ✅ (0.0%) | ✅ (1.5E-07) | ✅ (-0.09) | +| DivOmit | ✅ | ✅ (0.00%) | ✅ (+1.6%) | ✅ (0.0%) | ✅ (0.0E+00) | ❌ (-0.20) | +| OperProf | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (8.1E-08) | ✅ (+0.01) | +| BPEBM | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (2.3E-06) | ✅ (+0.02) | +| EBM | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (2.3E-06) | ✅ (+0.03) | +| RDS | ✅ | ✅ (0.00%) | ✅ (+3.4%) | ✅ (0.0%) | ✅ (9.4E-08) | ✅ (+0.16) | +| PctAcc | ✅ | ✅ (0.00%) | ✅ (+0.2%) | ✅ (0.0%) | ✅ (9.2E-08) | ✅ (-0.02) | +| AssetGrowth | ✅ | ✅ (0.00%) | ✅ (+0.5%) | ✅ (0.0%) | ✅ (1.3E-07) | ✅ (+0.02) | +| PctTotAcc | ✅ | ✅ (0.00%) | ✅ (+0.1%) | ✅ (0.0%) | ✅ (7.5E-08) | ✅ (+0.02) | +| ChAssetTurnover | ✅ | ✅ (0.00%) | ✅ (+0.6%) | ✅ (0.0%) | ✅ (3.2E-07) | ✅ (-0.04) | +| EarningsConsistency | ✅ | ✅ (0.00%) | ✅ (+0.1%) | ✅ (0.0%) | ✅ (3.3E-07) | ✅ (+0.04) | +| ConvDebt | ✅ | ✅ (0.00%) | ✅ (+0.1%) | ✅ (0.0%) | ✅ (0.0E+00) | ✅ (+0.01) | +| Frontier | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (2.3E-05) | ✅ (+0.02) | +| ChTax | ✅ | ✅ (0.00%) | ✅ (+0.5%) | ✅ (0.0%) | ✅ (2.8E-09) | ✅ (+0.03) | +| DebtIssuance | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (0.0E+00) | ✅ (-0.03) | +| CoskewACX | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (3.4E-03) | ✅ (+0.00) | +| EntMult | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (6.5E-08) | ✅ (+0.01) | +| PayoutYield | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (2.7E-07) | ✅ (+0.03) | +| NetDebtPrice | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (2.3E-07) | ✅ (-0.02) | +| cfp | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (1.8E-07) | ✅ (+0.01) | +| NetPayoutYield | ✅ | ✅ (0.00%) | ✅ (+0.1%) | ✅ (0.0%) | ✅ (1.6E-07) | ✅ (+0.01) | +| RD | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (9.7E-08) | ✅ (-0.02) | +| Coskewness | ✅ | ✅ 
(0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (2.6E-03) | ✅ (+0.03) | +| InvGrowth | ✅ | ✅ (0.00%) | ✅ (+0.4%) | ✅ (0.0%) | ✅ (7.2E-06) | ❌ (+0.44) | +| IdioVolAHT | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (1.4E-04) | ✅ (-0.03) | +| Herf | ✅ | ✅ (0.00%) | ✅ (+0.7%) | ✅ (0.0%) | ✅ (1.0E-05) | ✅ (+0.03) | +| HerfAsset | ✅ | ✅ (0.00%) | ✅ (+1.0%) | ✅ (0.0%) | ✅ (4.2E-06) | ✅ (+0.03) | +| FR | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (7.0E-07) | ✅ (+0.01) | +| ProbInformedTrading | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (6.0E-07) | ✅ (+0.00) | +| High52 | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (4.1E-07) | ✅ (-0.01) | +| MomSeason16YrPlus | ✅ | ✅ (0.00%) | ✅ (+1.4%) | ✅ (0.0%) | ✅ (3.7E-07) | ✅ (+0.00) | +| RDcap | ✅ | ✅ (0.02%) | ✅ (+3.9%) | ✅ (0.0%) | ✅ (3.7E-07) | ✅ (-0.14) | +| MomSeason11YrPlus | ✅ | ✅ (0.00%) | ✅ (+1.6%) | ✅ (0.0%) | ✅ (3.6E-07) | ✅ (-0.01) | +| MomSeason06YrPlus | ✅ | ✅ (0.00%) | ✅ (+1.6%) | ✅ (0.0%) | ✅ (3.4E-07) | ✅ (+0.02) | +| BidAskSpread | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (3.2E-07) | ✅ (+0.00) | +| IndMom | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (3.2E-07) | ✅ (-0.03) | +| BMdec | ✅ | ✅ (0.00%) | ✅ (+0.1%) | ✅ (0.0%) | ✅ (3.1E-07) | ✅ (-0.01) | +| MomSeason | ✅ | ✅ (0.00%) | ✅ (+1.1%) | ✅ (0.0%) | ✅ (3.0E-07) | ✅ (-0.01) | +| zerotrade1M | ✅ | ✅ (0.00%) | ✅ (+0.1%) | ✅ (0.0%) | ✅ (2.8E-07) | ✅ (-0.05) | +| zerotrade6M | ✅ | ✅ (0.00%) | ✅ (+0.1%) | ✅ (0.0%) | ✅ (2.8E-07) | ✅ (+0.03) | +| HerfBE | ✅ | ✅ (0.00%) | ✅ (+1.0%) | ✅ (0.0%) | ✅ (2.8E-07) | ✅ (+0.04) | +| AdExp | ✅ | ✅ (0.00%) | ✅ (-0.0%) | ✅ (0.0%) | ✅ (2.7E-07) | ✅ (+0.01) | +| FirmAgeMom | ✅ | ✅ (0.00%) | ✅ (+0.1%) | ✅ (0.0%) | ✅ (2.7E-07) | ✅ (+0.00) | +| MRreversal | ✅ | ✅ (0.34%) | ✅ (+3.0%) | ✅ (0.0%) | ✅ (2.7E-07) | ✅ (+0.00) | +| GrAdExp | ✅ | ✅ (0.01%) | ✅ (+0.4%) | ✅ (0.0%) | ✅ (2.7E-07) | ✅ (-0.08) | +| IntMom | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (2.7E-07) | ✅ (-0.01) | +| SP | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (2.6E-07) | ✅ (-0.01) | +| Mom6m | ✅ | ✅ 
(0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (2.6E-07) | ✅ (-0.06) | +| zerotrade12M | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (2.6E-07) | ✅ (-0.04) | +| Mom12m | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (2.5E-07) | ✅ (-0.05) | +| Activism2 | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (2.4E-07) | ✅ (+0.00) | +| DelBreadth | ✅ | ✅ (0.06%) | ✅ (+0.5%) | ✅ (0.0%) | ✅ (2.2E-07) | ✅ (-0.01) | +| Size | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (2.1E-07) | ✅ (+0.00) | +| BookLeverage | ✅ | ✅ (0.00%) | ✅ (+0.1%) | ✅ (0.0%) | ✅ (2.0E-07) | ✅ (-0.02) | +| Price | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (2.0E-07) | ✅ (+0.00) | +| Leverage | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (1.9E-07) | ✅ (+0.00) | +| Beta | ✅ | ✅ (0.00%) | ✅ (+1.6%) | ✅ (0.0%) | ✅ (1.9E-07) | ✅ (+0.03) | +| MomSeasonShort | ✅ | ✅ (0.00%) | ✅ (+0.3%) | ✅ (0.0%) | ✅ (1.7E-07) | ✅ (+0.00) | +| STreversal | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (1.6E-07) | ✅ (+0.00) | +| DolVol | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (1.5E-07) | ✅ (+0.04) | +| AM | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (1.5E-07) | ✅ (+0.00) | +| Illiquidity | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (1.3E-07) | ✅ (+0.00) | +| VolumeTrend | ✅ | ✅ (0.00%) | ✅ (+0.6%) | ✅ (0.0%) | ✅ (1.2E-07) | ✅ (-0.03) | +| BrandInvest | ✅ | ✅ (0.01%) | ✅ (+4.9%) | ✅ (0.0%) | ✅ (1.2E-07) | ✅ (+0.07) | +| EP | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (1.0E-07) | ✅ (-0.02) | +| std_turn | ✅ | ✅ (0.02%) | ✅ (+1.6%) | ✅ (0.0%) | ✅ (1.0E-07) | ✅ (-0.12) | +| CashProd | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (5.0E-08) | ❌ (-0.21) | +| ChEQ | ✅ | ✅ (0.00%) | ✅ (+0.4%) | ✅ (0.0%) | ✅ (4.1E-08) | ✅ (+0.02) | +| RoE | ✅ | ✅ (0.00%) | ✅ (+0.1%) | ✅ (0.0%) | ✅ (2.4E-08) | ✅ (+0.02) | +| ChInvIA | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (2.3E-08) | ✅ (+0.04) | +| VarCF | ✅ | ✅ (0.00%) | ✅ (+0.7%) | ✅ (0.0%) | ✅ (1.4E-08) | ✅ (-0.02) | +| ShareIss5Y | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (7.6E-10) | ✅ (+0.03) | +| ShareIss1Y | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | 
✅ (7.5E-10) | ✅ (+0.02) | +| EquityDuration | ✅ | ✅ (0.00%) | ✅ (+2.5%) | ✅ (0.0%) | ✅ (5.8E-14) | ❌ (+0.22) | +| ReturnSkew | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (5.4E-15) | ✅ (-0.01) | +| RealizedVol | ✅ | ✅ (0.13%) | ✅ (-0.1%) | ✅ (0.0%) | ✅ (2.7E-15) | ✅ (+0.01) | +| IndRetBig | ✅ | ✅ (0.21%) | ✅ (+0.3%) | ✅ (0.0%) | ✅ (2.4E-15) | ✅ (+0.05) | +| Mom12mOffSeason | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (1.4E-15) | ✅ (+0.00) | +| grcapx3y | ✅ | ✅ (0.76%) | ✅ (+0.9%) | ✅ (0.0%) | ✅ (9.3E-20) | ✅ (-0.07) | +| ShareVol | ✅ | ✅ (0.00%) | ✅ (+0.1%) | ✅ (0.0%) | ✅ (0.0E+00) | ✅ (-0.04) | +| PS | ✅ | ✅ (0.00%) | ✅ (+0.1%) | ✅ (0.0%) | ✅ (0.0E+00) | ✅ (+0.03) | +| AccrualsBM | ✅ | ✅ (0.18%) | ✅ (+0.3%) | ✅ (0.0%) | ✅ (0.0E+00) | ✅ (-0.02) | +| Activism1 | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (0.0E+00) | ✅ (-0.01) | +| FirmAge | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (0.0E+00) | ✅ (+0.00) | +| Governance | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (0.0E+00) | ✅ (+0.00) | +| MaxRet | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (0.0E+00) | ✅ (+0.00) | +| MomRev | ✅ | ✅ (0.46%) | ✅ (-0.2%) | ✅ (0.0%) | ✅ (0.0E+00) | ✅ (+0.00) | +| OScore | ✅ | ✅ (0.06%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (0.0E+00) | ✅ (+0.00) | +| Spinoff | ✅ | ✅ (0.00%) | ✅ (+0.0%) | ✅ (0.0%) | ✅ (0.0E+00) | ✅ (+0.00) | -**Overall**: 1/1 available predictors passed validation - - Natural passes: 1 - - Overridden passes: 0 -**Python CSVs**: 1/1 predictors have Python implementation +**Overall**: 174/213 available predictors passed validation + - Natural passes: 160 + - Overridden passes: 14 +**Python CSVs**: 213/213 predictors have Python implementation \* = Manual override applied (see Predictors/overrides.yaml for details) ## Detailed Results -### IdioVolAHT +### AM **Status**: ✅ PASSED @@ -37,44 +249,11754 @@ Numbers report the **FAILURE** rate. ❌ (100.00%) is BAD. 
- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) - Test 3 - Precision1 check: ✅ PASSED - Test 4 - Precision2 check: ✅ PASSED -- Test 5 - T-stat check: NA (Skipped - use --tstat to enable) +- Test 5 - T-stat check: NA **Observations**: -- Stata: 4,849,170 -- Python: 4,849,170 -- Common: 4,849,170 +- Stata: 3,038,206 +- Python: 3,038,217 +- Common: 3,038,206 **Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) -**Precision2**: 100th percentile diff = 1.31e-04 (tolerance: < 1.00e-01) +**Precision2**: 100th percentile diff = 1.48e-07 (tolerance: < 1.00e-01) **Summary Statistics** (Common Observations): | Statistic | Stata | Python | Difference | Std Difference | |------------|----------------|----------------|----------------|----------------| -| count | 4.85e+06 | 4.85e+06 | 4.85e+06 | 4.85e+06 | -| mean | 0.0300 | 0.0300 | -1.18e-08 | -4.46e-07 | -| std | 0.0264 | 0.0264 | 3.13e-07 | 1.19e-05 | -| min | 1.02e-05 | 1.02e-05 | -5.29e-05 | -0.0020 | -| 25% | 0.0142 | 0.0142 | -2.78e-17 | -1.05e-15 | -| 50% | 0.0232 | 0.0232 | 0.0000 | 0.0000 | -| 75% | 0.0379 | 0.0379 | 2.78e-17 | 1.05e-15 | -| max | 2.5092 | 2.5092 | 1.31e-05 | 4.95e-04 | +| count | 3.04e+06 | 3.04e+06 | 3.04e+06 | 3.04e+06 | +| mean | 3.6848 | 3.6848 | -3.74e-07 | -1.40e-08 | +| std | 26.7559 | 26.7559 | 2.63e-04 | 9.82e-06 | +| min | 0.0000 | 0.0000 | -0.2458 | -0.0092 | +| 25% | 0.6412 | 0.6412 | -2.47e-08 | -9.23e-10 | +| 50% | 1.3833 | 1.3833 | 2.24e-11 | 8.36e-13 | +| 75% | 3.1169 | 3.1169 | 2.48e-08 | 9.27e-10 | +| max | 12309.6130 | 12309.6128 | 3.48e-04 | 1.30e-05 | **Regression Analysis** (Python ~ Stata): - **Model**: python = -0.0000 + 1.0000 * stata - **R-squared**: 1.0000 -- **N observations**: 4,849,170 +- **N observations**: 3,038,206 | Coefficient | Estimate | Std Error | t-statistic | p-value | |-------------|--------------|--------------|-------------|----------| -| Intercept | -1.74e-08 | 2.15e-10 | -80.6055 | 0.000 | -| Slope | 1.0000 | 
5.38e-09 | 1.86e+08 | 0.000 | +| Intercept | -3.72e-07 | 1.52e-07 | -2.4473 | 0.014 | +| Slope | 1.0000 | 5.64e-09 | 1.77e+08 | 0.000 | **Feedback**: -- Num observations with std_diff >= TOL_DIFF_1: 0/4849170 (0.000%) -- Stata standard deviation: 2.64e-02 +- Num observations with std_diff >= TOL_DIFF_1: 0/3038206 (0.000%) +- Stata standard deviation: 2.68e+01 + +--- + +### AOP + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +4.41% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,244,664 +- Python: 1,299,504 +- Common: 1,241,880 + +**Precision1**: 0.002% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 8.02e-05 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.24e+06 | 1.24e+06 | 1.24e+06 | 1.24e+06 | +| mean | 158.1360 | 158.1633 | 0.0273 | 5.75e-07 | +| std | 47467.4186 | 47467.4174 | 8.8469 | 1.86e-04 | +| min | -23548.3180 | -23547.9593 | -231.9706 | -0.0049 | +| 25% | 0.0627 | 0.0611 | -7.34e-08 | -1.55e-12 | +| 50% | 0.3989 | 0.3939 | -3.63e-09 | -7.66e-14 | +| 75% | 1.3596 | 1.3519 | 6.21e-08 | 1.31e-12 | +| max | 1.53e+07 | 1.53e+07 | 2399.7758 | 0.0506 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0273 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 1,241,880 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 0.0273 | 0.0079 | 3.4367 | 0.001 | +| Slope | 1.0000 | 1.67e-07 | 5.98e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 24/1241880 (0.002%) +- Stata standard deviation: 
4.75e+04 + +--- + +### AbnormalAccruals + +**Status**: ✅ PASSED (with override) + +**Override Applied**: +- Reviewed on: 2025-08-31 +- Reviewed by: ac +- Details: Precision1 is failing at 9.0%. But this is a complicated predictor with custom accounting variables (including ones with missing values), many lags, regressions by sic code and window. One can only expect that there will be deviations of more than 0.01 standard deviations. The long-short t-stat is nearly identical to the 2024 10 release. + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +4.50% rows vs Stata) +- Test 3 - Precision1 check: ❌ FAILED +- Test 4 - Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,570,664 +- Python: 2,686,252 +- Common: 2,555,191 + +**Precision1**: 12.056% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.08e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.56e+06 | 2.56e+06 | 2.56e+06 | 2.56e+06 | +| mean | 3.63e-05 | 1.22e-05 | -2.41e-05 | -1.50e-04 | +| std | 0.1611 | 0.1594 | 0.0255 | 0.1583 | +| min | -8.2957 | -8.2779 | -2.0177 | -12.5231 | +| 25% | -0.0406 | -0.0404 | -6.56e-09 | -4.07e-08 | +| 50% | 0.0069 | 0.0069 | -2.10e-10 | -1.30e-09 | +| 75% | 0.0526 | 0.0526 | 4.88e-09 | 3.03e-08 | +| max | 2.8119 | 2.8119 | 7.8119 | 48.4851 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 0.9771 * stata +- **R-squared**: 0.9749 +- **N observations**: 2,555,191 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -2.33e-05 | 1.58e-05 | -1.4765 | 0.140 | +| Slope | 0.9771 | 9.80e-05 | 9970.5039 | 0.000 | + +**Feedback**: +- Num 
observations with std_diff >= TOL_DIFF_1: 308052/2555191 (12.056%) +- Stata standard deviation: 1.61e-01 + +**Most Recent Bad Observations**: +``` + permno yyyymm python stata diff +0 21742 202609 -0.023373 -0.025642 0.002269 +1 29946 202609 0.110688 0.092957 0.017731 +2 82598 202609 0.006953 0.004917 0.002036 +3 12783 202608 0.025802 0.028171 -0.002369 +4 13142 202608 -0.125493 -0.145704 0.020211 +5 14033 202608 1.395182 1.391868 0.003313 +6 14759 202608 0.020095 0.023689 -0.003594 +7 15623 202608 -0.091755 -0.093782 0.002027 +8 16632 202608 -0.025460 -0.029087 0.003627 +9 17920 202608 0.066617 0.064850 0.001767 +``` + +**Largest Differences**: +``` + permno yyyymm python stata diff +0 82215 202206 0.288585 -7.523344 7.811929 +1 82215 202207 0.288585 -7.523344 7.811929 +2 82215 202208 0.288585 -7.523344 7.811929 +3 82215 202209 0.288585 -7.523344 7.811929 +4 82215 202210 0.288585 -7.523344 7.811929 +5 82215 202211 0.288585 -7.523344 7.811929 +6 82215 202212 0.288585 -7.523344 7.811929 +7 82215 202301 0.288585 -7.523344 7.811929 +8 82215 202302 0.288585 -7.523344 7.811929 +9 82215 202303 0.288585 -7.523344 7.811929 +``` + +**Largest Differences Before 1950**: +``` +No data before 1950 +``` + +--- + +### Accruals + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.53% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,259,701 +- Python: 3,276,994 +- Common: 3,259,701 + +**Precision1**: 0.017% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.88e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.26e+06 | 3.26e+06 | 3.26e+06 | 3.26e+06 | +| mean | -0.0314 | 
-0.0314 | 2.75e-06 | 1.95e-05 | +| std | 0.1407 | 0.1406 | 0.0011 | 0.0077 | +| min | -24.3138 | -24.3138 | -0.0315 | -0.2238 | +| 25% | -0.0723 | -0.0723 | -9.67e-10 | -6.87e-09 | +| 50% | -0.0291 | -0.0291 | 6.51e-13 | 4.63e-12 | +| 75% | 0.0117 | 0.0117 | 9.77e-10 | 6.95e-09 | +| max | 8.1853 | 8.1853 | 0.5525 | 3.9279 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 0.9995 * stata +- **R-squared**: 0.9999 +- **N observations**: 3,259,701 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -1.37e-05 | 6.17e-07 | -22.2202 | 0.000 | +| Slope | 0.9995 | 4.28e-06 | 233376.2723 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 552/3259701 (0.017%) +- Stata standard deviation: 1.41e-01 + +--- + +### AccrualsBM + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.26% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 220,066 +- Python: 220,635 +- Common: 219,661 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 0.00e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 219661.0000 | 219661.0000 | 219661.0000 | 219661.0000 | +| mean | 0.4836 | 0.4836 | 0.0000 | 0.0000 | +| std | 0.4997 | 0.4997 | 0.0000 | 0.0000 | +| min | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 25% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 1.0000 | 1.0000 | 0.0000 | 0.0000 | +| max | 1.0000 | 1.0000 | 0.0000 | 0.0000 | + +**Regression Analysis** (Python ~ Stata): + +- 
**Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 219,661 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 3.44e-13 | 8.98e-16 | 383.7900 | 0.000 | +| Slope | 1.0000 | 1.29e-15 | 7.75e+14 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/219661 (0.000%) +- Stata standard deviation: 5.00e-01 + +--- + +### Activism1 + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.03% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 108,733 +- Python: 108,768 +- Common: 108,733 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 0.00e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 108733.0000 | 108733.0000 | 108733.0000 | 108733.0000 | +| mean | 14.8865 | 14.8865 | 0.0000 | 0.0000 | +| std | 2.7243 | 2.7243 | 0.0000 | 0.0000 | +| min | 6.0000 | 6.0000 | 0.0000 | 0.0000 | +| 25% | 13.0000 | 13.0000 | 0.0000 | 0.0000 | +| 50% | 15.0000 | 15.0000 | 0.0000 | 0.0000 | +| 75% | 17.0000 | 17.0000 | 0.0000 | 0.0000 | +| max | 23.0000 | 23.0000 | 0.0000 | 0.0000 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 108,733 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 7.68e-13 | 2.68e-15 | 286.4583 | 0.000 | +| Slope | 1.0000 | 1.77e-16 | 5.64e+15 | 0.000 | + +**Feedback**: +- Num observations 
with std_diff >= TOL_DIFF_1: 0/108733 (0.000%) +- Stata standard deviation: 2.72e+00 + +--- + +### Activism2 + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 30,170 +- Python: 30,170 +- Common: 30,170 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.37e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 30170.0000 | 30170.0000 | 30170.0000 | 30170.0000 | +| mean | 9.2631 | 9.2631 | -9.04e-09 | -7.15e-10 | +| std | 12.6421 | 12.6421 | 3.44e-07 | 2.72e-08 | +| min | 0.0000 | 0.0000 | -4.00e-06 | -3.16e-07 | +| 25% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 50% | 7.4388 | 7.4388 | 0.0000 | 0.0000 | +| 75% | 10.7284 | 10.7284 | 0.0000 | 0.0000 | +| max | 221.2826 | 221.2826 | 4.00e-06 | 3.16e-07 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 30,170 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 2.30e-08 | 2.44e-09 | 9.4380 | 0.000 | +| Slope | 1.0000 | 1.56e-10 | 6.43e+09 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/30170 (0.000%) +- Stata standard deviation: 1.26e+01 + +--- + +### AdExp + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has -0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + 
+**Observations**: +- Stata: 1,049,030 +- Python: 1,049,006 +- Common: 1,048,999 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.74e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.05e+06 | 1.05e+06 | 1.05e+06 | 1.05e+06 | +| mean | 0.0758 | 0.0758 | 3.14e-13 | 8.42e-13 | +| std | 0.3727 | 0.3727 | 1.05e-08 | 2.81e-08 | +| min | 8.40e-07 | 8.40e-07 | -1.30e-06 | -3.49e-06 | +| 25% | 0.0049 | 0.0049 | -2.80e-10 | -7.50e-10 | +| 50% | 0.0161 | 0.0161 | 1.01e-13 | 2.71e-13 | +| 75% | 0.0556 | 0.0556 | 2.81e-10 | 7.55e-10 | +| max | 94.1189 | 94.1189 | 3.84e-06 | 1.03e-05 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 1,048,999 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -3.24e-11 | 1.04e-11 | -3.1049 | 0.002 | +| Slope | 1.0000 | 2.74e-11 | 3.65e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/1048999 (0.000%) +- Stata standard deviation: 3.73e-01 + +--- + +### AnalystRevision + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.02% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,920,473 +- Python: 1,920,793 +- Common: 1,917,427 + +**Precision1**: 0.038% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.38e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | 
+|------------|----------------|----------------|----------------|----------------| +| count | 1.92e+06 | 1.92e+06 | 1.92e+06 | 1.92e+06 | +| mean | 1.0111 | 1.0111 | 3.70e-05 | 7.66e-06 | +| std | 4.8381 | 4.8380 | 0.0550 | 0.0114 | +| min | -1046.0000 | -1046.0000 | -22.2222 | -4.5932 | +| 25% | 0.9915 | 0.9915 | -5.38e-09 | -1.11e-09 | +| 50% | 1.0000 | 1.0000 | 0.0000 | 0.0000 | +| 75% | 1.0068 | 1.0068 | 1.43e-09 | 2.95e-10 | +| max | 5783.5542 | 5783.5544 | 54.4133 | 11.2469 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0001 + 0.9999 * stata +- **R-squared**: 0.9999 +- **N observations**: 1,917,427 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.20e-04 | 4.06e-05 | 2.9512 | 0.003 | +| Slope | 0.9999 | 8.21e-06 | 121756.8500 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 738/1917427 (0.038%) +- Stata standard deviation: 4.84e+00 + +--- + +### AnalystValue + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +4.41% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,244,664 +- Python: 1,299,504 +- Common: 1,241,880 + +**Precision1**: 0.263% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 3.10e-02 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.24e+06 | 1.24e+06 | 1.24e+06 | 1.24e+06 | +| mean | 0.7787 | 0.7792 | 5.50e-04 | 5.24e-05 | +| std | 10.4895 | 10.4536 | 0.0510 | 0.0049 | +| min | -45.8197 | -45.8197 | -11.7195 | -1.1173 | +| 25% | 0.4419 | 0.4418 | -4.09e-08 | -3.90e-09 | +| 50% | 0.6860 | 
0.6861 | -1.25e-08 | -1.19e-09 | +| 75% | 0.9878 | 0.9881 | 8.97e-09 | 8.55e-10 | +| max | 3338.2976 | 3326.5781 | 5.3913 | 0.5140 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0032 + 0.9966 * stata +- **R-squared**: 1.0000 +- **N observations**: 1,241,880 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 0.0032 | 3.26e-05 | 98.8219 | 0.000 | +| Slope | 0.9966 | 3.10e-06 | 321904.8846 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 3264/1241880 (0.263%) +- Stata standard deviation: 1.05e+01 + +--- + +### AnnouncementReturn + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has -0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,922,373 +- Python: 2,922,369 +- Common: 2,922,293 + +**Precision1**: 0.003% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 9.73e-04 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.92e+06 | 2.92e+06 | 2.92e+06 | 2.92e+06 | +| mean | 0.0024 | 0.0024 | 3.11e-07 | 3.02e-06 | +| std | 0.1028 | 0.1028 | 7.25e-04 | 0.0071 | +| min | -1.6087 | -1.6087 | -0.2544 | -2.4751 | +| 25% | -0.0382 | -0.0382 | -1.09e-09 | -1.06e-08 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 0.0394 | 0.0394 | 1.09e-09 | 1.06e-08 | +| max | 9.4535 | 9.4535 | 0.3185 | 3.0989 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 2,922,293 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | 
+|-------------|--------------|--------------|-------------|----------| +| Intercept | 3.37e-07 | 4.24e-07 | 0.7944 | 0.427 | +| Slope | 1.0000 | 4.13e-06 | 242343.2638 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 102/2922293 (0.003%) +- Stata standard deviation: 1.03e-01 + +--- + +### AssetGrowth + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.53% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,295,125 +- Python: 3,312,591 +- Common: 3,295,125 + +**Precision1**: 0.001% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.29e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.30e+06 | 3.30e+06 | 3.30e+06 | 3.30e+06 | +| mean | 0.1700 | 0.1700 | 1.88e-07 | 9.98e-08 | +| std | 1.8876 | 1.8876 | 2.84e-04 | 1.50e-04 | +| min | -1.0000 | -1.0000 | -0.0968 | -0.0513 | +| 25% | -0.0280 | -0.0280 | -2.07e-09 | -1.10e-09 | +| 50% | 0.0638 | 0.0638 | -8.15e-13 | -4.32e-13 | +| 75% | 0.1864 | 0.1864 | 2.04e-09 | 1.08e-09 | +| max | 679.3918 | 679.3918 | 0.1080 | 0.0572 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,295,125 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.88e-07 | 1.57e-07 | 1.1952 | 0.232 | +| Slope | 1.0000 | 8.29e-08 | 1.21e+07 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 36/3295125 (0.001%) +- Stata standard deviation: 1.89e+00 + +--- + +### BM + +**Status**: ✅ PASSED + +**Test 
Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,715,090 +- Python: 2,715,182 +- Common: 2,715,084 + +**Precision1**: 0.032% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.24e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.72e+06 | 2.72e+06 | 2.72e+06 | 2.72e+06 | +| mean | -0.7346 | -0.7346 | 3.61e-05 | 3.45e-05 | +| std | 1.0457 | 1.0457 | 0.0164 | 0.0156 | +| min | -10.1310 | -10.1310 | -3.5093 | -3.3559 | +| 25% | -1.2856 | -1.2856 | -1.19e-08 | -1.14e-08 | +| 50% | -0.6000 | -0.6000 | -2.61e-12 | -2.50e-12 | +| 75% | -0.0485 | -0.0485 | 1.18e-08 | 1.13e-08 | +| max | 6.8214 | 6.8214 | 3.6885 | 3.5273 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 0.9999 * stata +- **R-squared**: 0.9998 +- **N observations**: 2,715,084 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -3.94e-05 | 1.21e-05 | -3.2484 | 0.001 | +| Slope | 0.9999 | 9.50e-06 | 105295.3648 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 882/2715084 (0.032%) +- Stata standard deviation: 1.05e+00 + +--- + +### BMdec + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.07% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,996,716 +- Python: 2,998,704 +- Common: 2,996,716 + +**Precision1**: 0.000% obs with std_diff >= 
1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 3.05e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.00e+06 | 3.00e+06 | 3.00e+06 | 3.00e+06 | +| mean | 2.9269 | 2.9269 | -2.83e-07 | -5.41e-09 | +| std | 52.4336 | 52.4336 | 1.87e-04 | 3.56e-06 | +| min | -4881.0220 | -4881.0222 | -0.1222 | -0.0023 | +| 25% | 0.3604 | 0.3604 | -2.03e-08 | -3.88e-10 | +| 50% | 0.6804 | 0.6804 | 0.0000 | 0.0000 | +| 75% | 1.1675 | 1.1675 | 2.03e-08 | 3.88e-10 | +| max | 13961.4660 | 13961.4667 | 6.67e-04 | 1.27e-05 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 2,996,716 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -2.92e-07 | 1.08e-07 | -2.7020 | 0.007 | +| Slope | 1.0000 | 2.06e-09 | 4.86e+08 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/2996716 (0.000%) +- Stata standard deviation: 5.24e+01 + +--- + +### BPEBM + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,924,820 +- Python: 2,924,835 +- Common: 2,924,820 + +**Precision1**: 0.001% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.26e-06 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.92e+06 | 2.92e+06 | 2.92e+06 | 2.92e+06 | 
+| mean | 0.0397 | 0.0400 | 3.10e-04 | 9.26e-07 | +| std | 335.4250 | 333.8920 | 2.9995 | 0.0089 | +| min | -471732.0600 | -467702.6380 | -3149.6733 | -9.3901 | +| 25% | -0.0674 | -0.0674 | -1.74e-08 | -5.18e-11 | +| 50% | 0.0164 | 0.0164 | 0.0000 | 0.0000 | +| 75% | 0.2348 | 0.2348 | 1.73e-08 | 5.17e-11 | +| max | 121033.5200 | 121033.5167 | 4029.4220 | 12.0129 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0005 + 0.9954 * stata +- **R-squared**: 0.9999 +- **N observations**: 2,924,820 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 4.93e-04 | 0.0015 | 0.3277 | 0.743 | +| Slope | 0.9954 | 4.48e-06 | 221984.3294 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 43/2924820 (0.001%) +- Stata standard deviation: 3.35e+02 + +--- + +### Beta + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +1.59% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 4,285,574 +- Python: 4,353,773 +- Common: 4,285,574 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.93e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 4.29e+06 | 4.29e+06 | 4.29e+06 | 4.29e+06 | +| mean | 0.9893 | 0.9893 | 1.97e-09 | 2.64e-09 | +| std | 0.7459 | 0.7459 | 2.18e-08 | 2.92e-08 | +| min | -17.8663 | -17.8663 | -1.20e-06 | -1.61e-06 | +| 25% | 0.5301 | 0.5301 | -7.41e-09 | -9.93e-09 | +| 50% | 0.8981 | 0.8981 | 7.08e-10 | 9.49e-10 | +| 75% | 1.3300 | 1.3300 | 1.02e-08 | 1.37e-08 | +| max | 52.6339 | 52.6339 | 2.25e-06 | 
3.01e-06 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 4,285,574 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -1.53e-09 | 1.74e-11 | -87.9658 | 0.000 | +| Slope | 1.0000 | 1.40e-11 | 7.13e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/4285574 (0.000%) +- Stata standard deviation: 7.46e-01 + +--- + +### BetaFP + +**Status**: ✅ PASSED (with override) + +**Override Applied**: +- Reviewed on: 2025-08-28 +- Reviewed by: ac +- Details: This predictor is failing NumRows by 8 percent and Precision1 by 6.3%. The signal is complicated and I went through the py script line by line. The t-stat is close to zero, which is unchanged from the 2024 10 release. I don't think it's quite worth it to dig into this more. + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ❌ FAILED (Python has +9.54% rows vs Stata) +- Test 3 - Precision1 check: ❌ FAILED +- Test 4 - Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,794,018 +- Python: 4,156,049 +- Common: 3,784,837 + +**Precision1**: 6.256% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 8.77e-01 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.78e+06 | 3.78e+06 | 3.78e+06 | 3.78e+06 | +| mean | 0.9809 | 0.9797 | -0.0012 | -0.0019 | +| std | 0.6411 | 0.6407 | 0.0384 | 0.0599 | +| min | 7.25e-07 | 0.0000 | -3.9823 | -6.2115 | +| 25% | 0.5198 | 0.5188 | -0.0018 | -0.0028 | +| 50% | 0.8964 | 0.8954 | -0.0010 | -0.0016 | +| 75% | 1.3175 | 1.3161 | -4.96e-04 | -7.73e-04 | +| max | 12.6047 | 12.5623 | 4.7939 | 
7.4774 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0012 + 0.9976 * stata +- **R-squared**: 0.9964 +- **N observations**: 3,784,837 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 0.0012 | 3.61e-05 | 32.2256 | 0.000 | +| Slope | 0.9976 | 3.08e-05 | 32422.6064 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 236766/3784837 (6.256%) +- Stata standard deviation: 6.41e-01 + +**Most Recent Bad Observations**: +``` + permno yyyymm python stata diff +0 10066 202412 0.366743 0.247929 0.118814 +1 11153 202412 0.194080 0.253281 -0.059201 +2 11379 202412 1.593682 1.445916 0.147766 +3 12928 202412 0.551711 0.931920 -0.380209 +4 13563 202412 0.903823 0.608259 0.295564 +5 13828 202412 0.846890 0.970209 -0.123319 +6 13878 202412 0.978247 0.966509 0.011738 +7 13947 202412 2.604774 2.657374 -0.052600 +8 14051 202412 3.479786 3.465529 0.014258 +9 14469 202412 2.212077 1.759658 0.452419 +``` + +**Largest Differences**: +``` + permno yyyymm python stata diff +0 11453 199312 7.664115 2.870236 4.793879 +1 65622 199401 0.593349 4.575622 -3.982273 +2 65622 199402 0.930784 4.732967 -3.802183 +3 65622 199312 0.867006 4.276299 -3.409292 +4 10872 199403 0.647807 4.045698 -3.397891 +5 10216 199301 0.659991 4.034531 -3.374539 +6 10872 199404 0.422160 3.782309 -3.360148 +7 10216 199304 0.823704 4.174334 -3.350630 +8 10216 199212 0.615237 3.899257 -3.284020 +9 10872 199405 0.912626 4.134042 -3.221416 +``` + +**Largest Differences Before 1950**: +``` + permno yyyymm python stata diff +0 14269 194112 4.258444 5.277860 -1.019417 +1 13389 194108 3.766282 2.927140 0.839142 +2 14269 194201 3.999881 4.830401 -0.830520 +3 11797 193702 2.478371 1.648720 0.829651 +4 11252 194112 4.024742 4.843852 -0.819109 +5 20271 194408 1.522680 2.332971 -0.810292 +6 18649 193710 1.339925 2.143693 -0.803768 +7 11797 193701 2.275098 1.506865 0.768232 +8 12677 
192910 0.713803 1.460106 -0.746303 +9 14269 194202 4.036062 4.760625 -0.724563 +``` + +--- + +### BetaLiquidityPS + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +1.62% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,423,856 +- Python: 3,479,410 +- Common: 3,423,856 + +**Precision1**: 0.310% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.52e-02 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.42e+06 | 3.42e+06 | 3.42e+06 | 3.42e+06 | +| mean | -0.0013 | -0.0013 | 7.66e-05 | 1.69e-04 | +| std | 0.4525 | 0.4524 | 5.65e-04 | 0.0012 | +| min | -23.6664 | -23.6664 | -0.0358 | -0.0790 | +| 25% | -0.1738 | -0.1738 | -6.41e-09 | -1.42e-08 | +| 50% | 8.48e-04 | 8.98e-04 | 3.97e-10 | 8.77e-10 | +| 75% | 0.1792 | 0.1792 | 8.25e-09 | 1.82e-08 | +| max | 41.3486 | 41.3530 | 0.0465 | 0.1029 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0001 + 0.9999 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,423,856 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 7.66e-05 | 3.05e-07 | 250.9598 | 0.000 | +| Slope | 0.9999 | 6.74e-07 | 1.48e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 10630/3423856 (0.310%) +- Stata standard deviation: 4.52e-01 + +--- + +### BetaTailRisk + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +1.73% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 
- T-stat check: NA + +**Observations**: +- Stata: 2,292,350 +- Python: 2,332,084 +- Common: 2,292,350 + +**Precision1**: 0.013% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 6.31e-03 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.29e+06 | 2.29e+06 | 2.29e+06 | 2.29e+06 | +| mean | 0.6390 | 0.6390 | 1.65e-06 | 3.22e-06 | +| std | 0.5111 | 0.5111 | 3.74e-04 | 7.32e-04 | +| min | -10.7373 | -10.7363 | -0.0114 | -0.0224 | +| 25% | 0.3065 | 0.3065 | -2.17e-05 | -4.24e-05 | +| 50% | 0.5661 | 0.5661 | 5.54e-08 | 1.08e-07 | +| 75% | 0.8925 | 0.8926 | 8.11e-05 | 1.59e-04 | +| max | 8.5702 | 8.5702 | 0.0068 | 0.0133 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0001 * stata +- **R-squared**: 1.0000 +- **N observations**: 2,292,350 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -4.64e-05 | 3.93e-07 | -118.0172 | 0.000 | +| Slope | 1.0001 | 4.81e-07 | 2.08e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 307/2292350 (0.013%) +- Stata standard deviation: 5.11e-01 + +--- + +### BidAskSpread + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 4,481,622 +- Python: 4,481,622 +- Common: 4,481,622 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 3.22e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std 
Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 4.48e+06 | 4.48e+06 | 4.48e+06 | 4.48e+06 | +| mean | 0.0164 | 0.0164 | 2.17e-13 | 7.01e-12 | +| std | 0.0310 | 0.0310 | 5.81e-10 | 1.87e-08 | +| min | 0.0000 | 0.0000 | -1.00e-07 | -3.22e-06 | +| 25% | 0.0046 | 0.0046 | 0.0000 | 0.0000 | +| 50% | 0.0086 | 0.0086 | 0.0000 | 0.0000 | +| 75% | 0.0174 | 0.0174 | 0.0000 | 0.0000 | +| max | 1.5145 | 1.5145 | 3.00e-08 | 9.67e-07 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 4,481,622 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 5.46e-14 | 3.11e-13 | 0.1758 | 0.860 | +| Slope | 1.0000 | 8.85e-12 | 1.13e+11 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/4481622 (0.000%) +- Stata standard deviation: 3.10e-02 + +--- + +### BookLeverage + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.06% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,606,159 +- Python: 3,608,415 +- Common: 3,606,159 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.05e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.61e+06 | 3.61e+06 | 3.61e+06 | 3.61e+06 | +| mean | 4.0992 | 4.0992 | -1.24e-05 | -6.14e-08 | +| std | 201.7771 | 201.7765 | 0.0028 | 1.40e-05 | +| min | -11894.4540 | -11894.3333 | -0.9911 | -0.0049 | +| 25% | 1.4322 | 1.4322 | -6.10e-08 | -3.02e-10 | +| 50% | 
2.0021 | 2.0021 | 0.0000 | 0.0000 | +| 75% | 3.1980 | 3.1980 | 6.11e-08 | 3.03e-10 | +| max | 87702.4920 | 87702.5000 | 0.1693 | 8.39e-04 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,606,159 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 7.77e-07 | 1.45e-06 | 0.5363 | 0.592 | +| Slope | 1.0000 | 7.18e-09 | 1.39e+08 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/3606159 (0.000%) +- Stata standard deviation: 2.02e+02 + +--- + +### BrandInvest + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +4.88% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 485,304 +- Python: 508,980 +- Common: 485,268 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.23e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 485268.0000 | 485268.0000 | 485268.0000 | 485268.0000 | +| mean | 2683.1545 | 2683.1488 | -0.0057 | -8.23e-08 | +| std | 69012.6163 | 69012.6189 | 1.5280 | 2.21e-05 | +| min | -142.1329 | -142.1329 | -299.3499 | -0.0043 | +| 25% | 20.6166 | 20.6166 | -3.45e-06 | -5.01e-11 | +| 50% | 127.1275 | 127.1245 | 0.0000 | 0.0000 | +| 75% | 774.0426 | 774.0425 | 3.78e-06 | 5.47e-11 | +| max | 1.36e+07 | 1.36e+07 | 69.2994 | 0.0010 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0058 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 485,268 + +| Coefficient | Estimate | Std Error | 
t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -0.0058 | 0.0022 | -2.6323 | 0.008 | +| Slope | 1.0000 | 3.18e-08 | 3.15e+07 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/485268 (0.000%) +- Stata standard deviation: 6.90e+04 + +--- + +### CBOperProf + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,283,861 +- Python: 2,283,899 +- Common: 2,283,861 + +**Precision1**: 0.007% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 3.71e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.28e+06 | 2.28e+06 | 2.28e+06 | 2.28e+06 | +| mean | 0.0904 | 0.0904 | 2.80e-07 | 1.27e-06 | +| std | 0.2200 | 0.2200 | 2.06e-04 | 9.37e-04 | +| min | -11.3370 | -11.3370 | -0.0428 | -0.1946 | +| 25% | 0.0313 | 0.0313 | -3.14e-09 | -1.43e-08 | +| 50% | 0.1154 | 0.1154 | 4.39e-12 | 2.00e-11 | +| 75% | 0.1888 | 0.1888 | 3.20e-09 | 1.45e-08 | +| max | 21.0025 | 21.0025 | 0.0637 | 0.2893 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 2,283,861 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 5.23e-07 | 1.47e-07 | 3.5452 | 0.000 | +| Slope | 1.0000 | 6.20e-07 | 1.61e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 169/2283861 (0.007%) +- Stata standard deviation: 2.20e-01 + +--- + +### CF + +**Status**: ✅ PASSED + 
+**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.49% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,038,206 +- Python: 3,053,142 +- Common: 3,038,206 + +**Precision1**: 0.002% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.53e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.04e+06 | 3.04e+06 | 3.04e+06 | 3.04e+06 | +| mean | -0.0077 | -0.0077 | -5.05e-08 | -1.83e-08 | +| std | 2.7569 | 2.7569 | 3.57e-04 | 1.29e-04 | +| min | -2140.1667 | -2140.1666 | -0.2334 | -0.0847 | +| 25% | 0.0214 | 0.0214 | -1.99e-09 | -7.22e-10 | +| 50% | 0.0794 | 0.0794 | 1.73e-18 | 6.29e-19 | +| 75% | 0.1498 | 0.1498 | 1.99e-09 | 7.22e-10 | +| max | 221.9462 | 221.9462 | 0.1304 | 0.0473 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,038,206 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -5.03e-08 | 2.05e-07 | -0.2457 | 0.806 | +| Slope | 1.0000 | 7.42e-08 | 1.35e+07 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 55/3038206 (0.002%) +- Stata standard deviation: 2.76e+00 + +--- + +### CPVolSpread + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +1.15% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 684,140 +- Python: 691,983 +- Common: 683,725 + +**Precision1**: 0.016% obs with 
std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.94e-06 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 683725.0000 | 683725.0000 | 683725.0000 | 683725.0000 | +| mean | 0.0131 | 0.0131 | -1.16e-06 | -2.12e-05 | +| std | 0.0547 | 0.0547 | 7.34e-04 | 0.0134 | +| min | -3.8959 | -3.8959 | -0.1978 | -3.6139 | +| 25% | -0.0037 | -0.0037 | -1.00e-08 | -1.83e-07 | +| 50% | 0.0059 | 0.0059 | 0.0000 | 0.0000 | +| 75% | 0.0220 | 0.0220 | 1.00e-08 | 1.83e-07 | +| max | 4.0661 | 4.0661 | 0.2430 | 4.4403 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 0.9999 * stata +- **R-squared**: 0.9998 +- **N observations**: 683,725 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 3.19e-07 | 9.13e-07 | 0.3491 | 0.727 | +| Slope | 0.9999 | 1.62e-05 | 61609.4177 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 110/683725 (0.016%) +- Stata standard deviation: 5.47e-02 + +--- + +### Cash + +**Status**: ✅ PASSED (with override) + +**Override Applied**: +- Reviewed on: 2025-08-31 +- Reviewed by: ac +- Details: Cash.py is producing 7.2% more rows than Stata but is matching everything else extremely well. The additional rows seem to be due to the conversion of quarterly rdq into monthly time_avail_m. I checked several rows in the python code that were missing from stata and they all look correct. 
+ +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ❌ FAILED (Python has +7.18% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,096,350 +- Python: 2,246,775 +- Common: 2,095,920 + +**Precision1**: 0.045% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.54e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.10e+06 | 2.10e+06 | 2.10e+06 | 2.10e+06 | +| mean | 0.1672 | 0.1672 | -2.14e-06 | -1.00e-05 | +| std | 0.2141 | 0.2141 | 0.0015 | 0.0071 | +| min | -0.1432 | -0.1432 | -0.3312 | -1.5468 | +| 25% | 0.0249 | 0.0249 | -1.29e-09 | -6.04e-09 | +| 50% | 0.0754 | 0.0754 | 0.0000 | 0.0000 | +| 75% | 0.2202 | 0.2202 | 1.29e-09 | 6.02e-09 | +| max | 1.0000 | 1.0000 | 0.6778 | 3.1653 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 0.9999 +- **N observations**: 2,095,920 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.91e-06 | 1.34e-06 | 1.4258 | 0.154 | +| Slope | 1.0000 | 4.93e-06 | 202783.6668 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 951/2095920 (0.045%) +- Stata standard deviation: 2.14e-01 + +--- + +### CashProd + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,002,825 +- Python: 3,002,836 +- Common: 3,002,825 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 
(tolerance: < 1%) + +**Precision2**: 100th percentile diff = 4.98e-08 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.00e+06 | 3.00e+06 | 3.00e+06 | 3.00e+06 | +| mean | -12.6735 | -12.6735 | 5.10e-06 | 1.33e-09 | +| std | 3820.7124 | 3820.7124 | 0.0017 | 4.50e-07 | +| min | -921994.3800 | -921994.3600 | -0.0738 | -1.93e-05 | +| 25% | -13.6941 | -13.6941 | -1.41e-07 | -3.69e-11 | +| 50% | -2.1096 | -2.1096 | 0.0000 | 0.0000 | +| 75% | 3.5938 | 3.5938 | 1.41e-07 | 3.69e-11 | +| max | 1.99e+06 | 1.99e+06 | 0.8730 | 2.29e-04 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,002,825 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 5.07e-06 | 9.93e-07 | 5.1059 | 0.000 | +| Slope | 1.0000 | 2.60e-10 | 3.85e+09 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/3002825 (0.000%) +- Stata standard deviation: 3.82e+03 + +--- + +### ChAssetTurnover + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.61% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,503,228 +- Python: 2,518,462 +- Common: 2,503,228 + +**Precision1**: 0.001% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 3.18e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.50e+06 | 2.50e+06 | 2.50e+06 | 
2.50e+06 | +| mean | -0.0575 | -0.0577 | -1.58e-04 | -5.77e-07 | +| std | 274.6743 | 274.6787 | 0.0492 | 1.79e-04 | +| min | -108816.4100 | -108818.8964 | -18.4493 | -0.0672 | +| 25% | -0.1986 | -0.1986 | -4.59e-08 | -1.67e-10 | +| 50% | 4.34e-05 | 4.20e-05 | 0.0000 | 0.0000 | +| 75% | 0.1868 | 0.1868 | 4.63e-08 | 1.69e-10 | +| max | 39275.6840 | 39275.6489 | 0.6386 | 0.0023 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0002 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 2,503,228 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -1.58e-04 | 3.10e-05 | -5.0813 | 0.000 | +| Slope | 1.0000 | 1.13e-07 | 8.86e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 24/2503228 (0.001%) +- Stata standard deviation: 2.75e+02 + +--- + +### ChEQ + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.44% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,047,458 +- Python: 3,060,849 +- Common: 3,047,458 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 4.05e-08 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.05e+06 | 3.05e+06 | 3.05e+06 | 3.05e+06 | +| mean | 1.3348 | 1.3348 | 3.74e-08 | 2.11e-09 | +| std | 17.7186 | 17.7186 | 4.79e-04 | 2.70e-05 | +| min | 4.17e-05 | 4.17e-05 | -0.1655 | -0.0093 | +| 25% | 0.9647 | 0.9647 | -2.33e-08 | -1.32e-09 | +| 50% | 1.0716 | 1.0716 | 0.0000 | 0.0000 | +| 75% | 1.1870 | 1.1870 | 2.33e-08 | 1.32e-09 | +| max | 5799.5884 | 5799.5882 | 
0.1144 | 0.0065 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,047,458 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 5.42e-08 | 2.75e-07 | 0.1968 | 0.844 | +| Slope | 1.0000 | 1.55e-08 | 6.46e+07 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/3047458 (0.000%) +- Stata standard deviation: 1.77e+01 + +--- + +### ChForecastAccrual + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.07% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 628,022 +- Python: 628,491 +- Common: 626,886 + +**Precision1**: 0.030% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 0.00e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 626886.0000 | 626886.0000 | 626886.0000 | 626886.0000 | +| mean | 0.4775 | 0.4775 | 2.55e-05 | 5.11e-05 | +| std | 0.4995 | 0.4995 | 0.0174 | 0.0349 | +| min | 0.0000 | 0.0000 | -1.0000 | -2.0020 | +| 25% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 1.0000 | 1.0000 | 0.0000 | 0.0000 | +| max | 1.0000 | 1.0000 | 1.0000 | 2.0020 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0003 + 0.9994 * stata +- **R-squared**: 0.9988 +- **N observations**: 626,886 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 3.14e-04 | 3.04e-05 | 10.3393 | 0.000 | +| Slope | 0.9994 
| 4.40e-05 | 22706.2133 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 190/626886 (0.030%) +- Stata standard deviation: 4.99e-01 + +--- + +### ChInv + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.53% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,295,155 +- Python: 3,312,651 +- Common: 3,295,155 + +**Precision1**: 0.003% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.71e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.30e+06 | 3.30e+06 | 3.30e+06 | 3.30e+06 | +| mean | 0.0097 | 0.0097 | 1.21e-07 | 1.83e-06 | +| std | 0.0664 | 0.0664 | 4.89e-05 | 7.36e-04 | +| min | -1.6890 | -1.6890 | -0.0078 | -0.1170 | +| 25% | -0.0012 | -0.0012 | -1.12e-10 | -1.69e-09 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 0.0174 | 0.0174 | 1.11e-10 | 1.67e-09 | +| max | 1.7133 | 1.7133 | 0.0198 | 0.2984 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,295,155 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.26e-07 | 2.72e-08 | 4.6298 | 0.000 | +| Slope | 1.0000 | 4.06e-07 | 2.47e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 96/3295155 (0.003%) +- Stata standard deviation: 6.64e-02 + +--- + +### ChInvIA + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- 
Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,678,515 +- Python: 2,678,522 +- Common: 2,678,515 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.33e-08 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.68e+06 | 2.68e+06 | 2.68e+06 | 2.68e+06 | +| mean | -1143.9086 | -2.17e-05 | 1143.9086 | 1.99e-11 | +| std | 5.76e+13 | 5.76e+13 | 2.33e+06 | 4.04e-08 | +| min | -7.94e+15 | -7.94e+15 | -1.74e+09 | -3.02e-05 | +| 25% | -0.9389 | -0.9388 | -2.05e-08 | -3.56e-22 | +| 50% | -0.3699 | -0.3698 | 1.54e-11 | 2.67e-25 | +| 75% | 0.1936 | 0.1936 | 2.07e-08 | 3.60e-22 | +| max | 2.19e+16 | 2.19e+16 | 7.67e+08 | 1.33e-05 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 1143.9039 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 2,678,515 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1143.9039 | 1401.5512 | 0.8162 | 0.414 | +| Slope | 1.0000 | 2.44e-11 | 4.11e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/2678515 (0.000%) +- Stata standard deviation: 5.76e+13 + +--- + +### ChNAnalyst + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ❌ FAILED (Python missing 55570 Stata observations) +- Test 2 - NumRows check: ❌ FAILED (Python has +76.28% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 210,988 +- Python: 371,936 +- Common: 155,418 + +**Precision1**: 0.005% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 0.00e+00 (tolerance: < 1.00e-01) 
+ +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 155418.0000 | 155418.0000 | 155418.0000 | 155418.0000 | +| mean | 0.1566 | 0.1565 | -2.57e-05 | -7.08e-05 | +| std | 0.3634 | 0.3634 | 0.0072 | 0.0197 | +| min | 0.0000 | 0.0000 | -1.0000 | -2.7519 | +| 25% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| max | 1.0000 | 1.0000 | 1.0000 | 2.7519 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 0.9997 * stata +- **R-squared**: 0.9996 +- **N observations**: 155,418 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.53e-05 | 1.98e-05 | 0.7700 | 0.441 | +| Slope | 0.9997 | 5.01e-05 | 19963.9526 | 0.000 | + +**Missing Observations Sample**: +``` + index permno yyyymm ChNAnalyst + 0 10003 198711 0.0 + 1 10003 198712 0.0 + 2 10003 198802 0.0 + 3 10003 198803 0.0 + 4 10003 198805 0.0 + 5 10003 198809 1.0 + 6 10003 198811 0.0 + 7 10003 198812 0.0 + 8 10003 198901 0.0 + 9 10003 198902 0.0 +``` + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 8/155418 (0.005%) +- Stata standard deviation: 3.63e-01 + +--- + +### ChNNCOA + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.53% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,246,170 +- Python: 3,263,326 +- Common: 3,246,170 + +**Precision1**: 0.013% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.88e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | 
Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.25e+06 | 3.25e+06 | 3.25e+06 | 3.25e+06 | +| mean | -0.0036 | -0.0036 | -4.06e-07 | -7.23e-07 | +| std | 0.5613 | 0.5613 | 0.0019 | 0.0033 | +| min | -166.4192 | -166.4192 | -0.7531 | -1.3418 | +| 25% | -0.0412 | -0.0412 | -5.10e-09 | -9.09e-09 | +| 50% | -8.74e-04 | -8.75e-04 | -4.99e-12 | -8.89e-12 | +| 75% | 0.0417 | 0.0417 | 5.10e-09 | 9.09e-09 | +| max | 51.9783 | 51.9783 | 0.4751 | 0.8465 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,246,170 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -4.75e-07 | 1.03e-06 | -0.4610 | 0.645 | +| Slope | 1.0000 | 1.83e-06 | 545119.3591 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 408/3246170 (0.013%) +- Stata standard deviation: 5.61e-01 + +--- + +### ChNWC + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.53% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,259,599 +- Python: 3,276,826 +- Common: 3,259,599 + +**Precision1**: 0.006% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.52e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.26e+06 | 3.26e+06 | 3.26e+06 | 3.26e+06 | +| mean | -0.0059 | -0.0059 | 1.79e-06 | 4.19e-06 | +| std | 0.4264 | 0.4263 | 9.35e-04 | 0.0022 | +| min | -166.1671 | -166.1671 | -0.0565 | -0.1324 | +| 25% | -0.0307 | -0.0307 | 
-2.80e-09 | -6.56e-09 | +| 50% | -5.74e-04 | -5.73e-04 | 0.0000 | 0.0000 | +| 75% | 0.0280 | 0.0280 | 2.81e-09 | 6.59e-09 | +| max | 16.2765 | 16.2765 | 0.4751 | 1.1144 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,259,599 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.56e-06 | 5.18e-07 | 3.0040 | 0.003 | +| Slope | 1.0000 | 1.21e-06 | 823262.6101 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 192/3259599 (0.006%) +- Stata standard deviation: 4.26e-01 + +--- + +### ChTax + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.48% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,827,726 +- Python: 2,841,393 +- Common: 2,827,634 + +**Precision1**: 0.001% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.77e-09 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.83e+06 | 2.83e+06 | 2.83e+06 | 2.83e+06 | +| mean | 0.0017 | 0.0017 | -3.09e-07 | -8.00e-08 | +| std | 3.8678 | 3.8678 | 4.46e-04 | 1.15e-04 | +| min | -990.0000 | -990.0000 | -0.6855 | -0.1772 | +| 25% | -0.0013 | -0.0013 | -3.19e-11 | -8.25e-12 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 0.0029 | 0.0029 | 3.23e-11 | 8.36e-12 | +| max | 3440.0000 | 3440.0000 | 0.0771 | 0.0199 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 2,827,634 + +| Coefficient | Estimate | Std 
Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -3.09e-07 | 2.65e-07 | -1.1663 | 0.244 | +| Slope | 1.0000 | 6.86e-08 | 1.46e+07 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 16/2827634 (0.001%) +- Stata standard deviation: 3.87e+00 + +--- + +### ChangeInRecommendation + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.05% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 450,217 +- Python: 450,458 +- Common: 449,187 + +**Precision1**: 0.048% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.49e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 449187.0000 | 449187.0000 | 449187.0000 | 449187.0000 | +| mean | -0.0238 | -0.0238 | 2.69e-05 | 2.47e-05 | +| std | 1.0884 | 1.0884 | 0.0355 | 0.0326 | +| min | -4.0000 | -4.0000 | -4.0000 | -3.6751 | +| 25% | -1.0000 | -1.0000 | 0.0000 | 0.0000 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 1.0000 | 1.0000 | 0.0000 | 0.0000 | +| max | 4.0000 | 4.0000 | 5.0000 | 4.5938 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 0.9994 * stata +- **R-squared**: 0.9989 +- **N observations**: 449,187 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.36e-05 | 5.29e-05 | 0.2579 | 0.796 | +| Slope | 0.9994 | 4.86e-05 | 20557.0847 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 215/449187 (0.048%) +- Stata standard deviation: 1.09e+00 + +--- + +### CitationsRD + 
+**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +2.36% rows vs Stata) +- Test 3 - Precision1 check: ❌ FAILED +- Test 4 - Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 645,360 +- Python: 660,600 +- Common: 642,216 + +**Precision1**: 1.678% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.43e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 642216.0000 | 642216.0000 | 642216.0000 | 642216.0000 | +| mean | 0.2162 | 0.2330 | 0.0168 | 0.0408 | +| std | 0.4117 | 0.4228 | 0.1284 | 0.3120 | +| min | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 25% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| max | 1.0000 | 1.0000 | 1.0000 | 2.4291 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0214 + 0.9786 * stata +- **R-squared**: 0.9081 +- **N observations**: 642,216 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 0.0214 | 1.81e-04 | 118.5325 | 0.000 | +| Slope | 0.9786 | 3.88e-04 | 2519.5064 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 10776/642216 (1.678%) +- Stata standard deviation: 4.12e-01 + +**Most Recent Bad Observations**: +``` + permno yyyymm python stata diff +0 10382 202505 1.0 0.0 1.0 +1 11587 202505 1.0 0.0 1.0 +2 15457 202505 1.0 0.0 1.0 +3 18973 202505 1.0 0.0 1.0 +4 31480 202505 1.0 0.0 1.0 +5 35991 202505 1.0 0.0 1.0 +6 36768 202505 1.0 0.0 1.0 +7 36898 202505 1.0 0.0 1.0 +8 37875 202505 1.0 0.0 1.0 +9 48144 202505 1.0 0.0 1.0 +``` + +**Largest Differences**: +``` + permno yyyymm python 
stata diff +0 10382 201106 1.0 0.0 1.0 +1 10382 201107 1.0 0.0 1.0 +2 10382 201108 1.0 0.0 1.0 +3 10382 201109 1.0 0.0 1.0 +4 10382 201110 1.0 0.0 1.0 +5 10382 201111 1.0 0.0 1.0 +6 10382 201112 1.0 0.0 1.0 +7 10382 201201 1.0 0.0 1.0 +8 10382 201202 1.0 0.0 1.0 +9 10382 201203 1.0 0.0 1.0 +``` + +**Largest Differences Before 1950**: +``` +No data before 1950 +``` + +--- + +### CompEquIss + +**Status**: ✅ PASSED (with override) + +**Override Applied**: +- Reviewed on: 2025-08-26 +- Reviewed by: ac +- Details: The predictor is failing the NumRows check: it has 16.6% more rows than Stata. But it passes all other tests very nicely, including matching the 2024 long-short t-stat. Precision1 and 2 are near perfect. I eyeballed several of the obs that are missing from the Stata file and it seems like the Python code is correct. These observations have valid lagged mve_c, and that's pretty much what is required for this predictor. + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ❌ FAILED (Python has +17.86% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,172,395 +- Python: 2,560,488 +- Common: 2,156,555 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.06e-06 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.16e+06 | 2.16e+06 | 2.16e+06 | 2.16e+06 | +| mean | -0.6781 | -0.6781 | 2.13e-10 | 7.04e-11 | +| std | 3.0255 | 3.0255 | 6.78e-07 | 2.24e-07 | +| min | -1948.3774 | -1948.3773 | -8.87e-05 | -2.93e-05 | +| 25% | -0.7631 | -0.7631 | -1.63e-07 | -5.40e-08 | +| 50% | -0.3171 | -0.3171 | 7.66e-10 | 2.53e-10 | +| 75% | -0.0574 | -0.0574 | 1.64e-07 | 5.42e-08 | +| max | 6.8094 | 6.8094 | 
1.37e-04 | 4.53e-05 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 2,156,555 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -1.12e-08 | 4.72e-10 | -23.6512 | 0.000 | +| Slope | 1.0000 | 1.52e-10 | 6.57e+09 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/2156555 (0.000%) +- Stata standard deviation: 3.03e+00 + +--- + +### CompositeDebtIssuance + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.88% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,898,755 +- Python: 1,915,414 +- Common: 1,898,755 + +**Precision1**: 0.008% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.01e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.90e+06 | 1.90e+06 | 1.90e+06 | 1.90e+06 | +| mean | 0.5016 | 0.5016 | -2.22e-06 | -1.56e-06 | +| std | 1.4262 | 1.4262 | 0.0026 | 0.0018 | +| min | -11.3807 | -11.3807 | -0.6525 | -0.4575 | +| 25% | -0.1400 | -0.1400 | -3.01e-08 | -2.11e-08 | +| 50% | 0.3966 | 0.3966 | -6.72e-11 | -4.71e-11 | +| 75% | 1.0520 | 1.0520 | 2.99e-08 | 2.10e-08 | +| max | 12.3441 | 12.3441 | 0.5735 | 0.4021 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 1,898,755 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.50e-06 | 2.02e-06 | 
0.7400 | 0.459 | +| Slope | 1.0000 | 1.34e-06 | 747260.9158 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 144/1898755 (0.008%) +- Stata standard deviation: 1.43e+00 + +--- + +### ConsRecomm + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.02% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 134,102 +- Python: 134,129 +- Common: 133,755 + +**Precision1**: 0.002% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 0.00e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 133755.0000 | 133755.0000 | 133755.0000 | 133755.0000 | +| mean | 0.2638 | 0.2638 | -7.48e-06 | -1.70e-05 | +| std | 0.4407 | 0.4407 | 0.0047 | 0.0107 | +| min | 0.0000 | 0.0000 | -1.0000 | -2.2692 | +| 25% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 1.0000 | 1.0000 | 0.0000 | 0.0000 | +| max | 1.0000 | 1.0000 | 1.0000 | 2.2692 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 0.9999 * stata +- **R-squared**: 0.9999 +- **N observations**: 133,755 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.02e-05 | 1.51e-05 | 0.6729 | 0.501 | +| Slope | 0.9999 | 2.94e-05 | 34028.5403 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 3/133755 (0.002%) +- Stata standard deviation: 4.41e-01 + +--- + +### ConvDebt + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.06% rows vs Stata) +- Test 3 - 
Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,624,363 +- Python: 3,626,619 +- Common: 3,624,363 + +**Precision1**: 0.001% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 0.00e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.62e+06 | 3.62e+06 | 3.62e+06 | 3.62e+06 | +| mean | 0.1329 | 0.1329 | 0.0000 | 0.0000 | +| std | 0.3394 | 0.3394 | 0.0026 | 0.0076 | +| min | 0.0000 | 0.0000 | -1.0000 | -2.9460 | +| 25% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| max | 1.0000 | 1.0000 | 1.0000 | 2.9460 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 0.9999 +- **N observations**: 3,624,363 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 3.82e-06 | 1.45e-06 | 2.6305 | 0.009 | +| Slope | 1.0000 | 3.98e-06 | 251120.2086 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 24/3624363 (0.001%) +- Stata standard deviation: 3.39e-01 + +--- + +### CoskewACX + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 4,179,145 +- Python: 4,179,145 +- Common: 4,179,145 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 3.40e-03 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic 
| Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 4.18e+06 | 4.18e+06 | 4.18e+06 | 4.18e+06 | +| mean | -0.1398 | -0.1398 | -6.97e-06 | -2.08e-05 | +| std | 0.3359 | 0.3359 | 9.16e-05 | 2.73e-04 | +| min | -6.2761 | -6.2761 | -0.0047 | -0.0139 | +| 25% | -0.2208 | -0.2208 | -1.93e-08 | -5.74e-08 | +| 50% | -0.0859 | -0.0859 | -1.65e-09 | -4.91e-09 | +| 75% | 0.0207 | 0.0207 | 1.42e-08 | 4.24e-08 | +| max | 3.5667 | 3.5667 | 0.0047 | 0.0140 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 4,179,145 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -5.98e-06 | 4.85e-08 | -123.3363 | 0.000 | +| Slope | 1.0000 | 1.33e-07 | 7.50e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 17/4179145 (0.000%) +- Stata standard deviation: 3.36e-01 + +--- + +### Coskewness + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 4,609,158 +- Python: 4,609,158 +- Common: 4,609,158 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.63e-03 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 4.61e+06 | 4.61e+06 | 4.61e+06 | 4.61e+06 | +| mean | -0.2000 | -0.2000 | 1.89e-05 | 4.95e-05 | +| std | 0.3826 | 0.3826 | 1.10e-04 | 2.88e-04 | +| min | -4.4915 | -4.4915 | -0.0045 | -0.0118 | +| 25% | -0.3848 
| -0.3848 | -1.09e-08 | -2.86e-08 | +| 50% | -0.1794 | -0.1794 | 1.94e-09 | 5.08e-09 | +| 75% | 0.0124 | 0.0124 | 1.75e-08 | 4.58e-08 | +| max | 2.5369 | 2.5369 | 0.0043 | 0.0111 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 4,609,158 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.73e-05 | 5.78e-08 | 299.9166 | 0.000 | +| Slope | 1.0000 | 1.34e-07 | 7.47e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 4/4609158 (0.000%) +- Stata standard deviation: 3.83e-01 + +--- + +### CredRatDG + +**Status**: ✅ PASSED (with override) + +**Override Applied**: +- Reviewed on: 2025-08-20 +- Reviewed by: ac +- Details: The sample deviations are all downgrades found in Python but not in Stata. I manually checked a few and found these all have CIQ downgrades. This is likely an improvement due to patching the CIQ deduplication bugs in the Stata DataDownloads code. 
+ +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ❌ FAILED (Python has +18.83% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,559,713 +- Python: 3,041,670 +- Common: 2,559,713 + +**Precision1**: 0.342% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 6.63e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.56e+06 | 2.56e+06 | 2.56e+06 | 2.56e+06 | +| mean | 0.0233 | 0.0266 | 0.0033 | 0.0221 | +| std | 0.1508 | 0.1610 | 0.0584 | 0.3874 | +| min | 0.0000 | 0.0000 | -1.0000 | -6.6310 | +| 25% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| max | 1.0000 | 1.0000 | 1.0000 | 6.6310 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0035 + 0.9945 * stata +- **R-squared**: 0.8683 +- **N observations**: 2,559,713 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 0.0035 | 3.69e-05 | 93.6060 | 0.000 | +| Slope | 0.9945 | 2.42e-04 | 4107.2676 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 8767/2559713 (0.342%) +- Stata standard deviation: 1.51e-01 + +**Most Recent Bad Observations**: +``` + permno yyyymm python stata diff +0 17956 202412 1.0 0.0 1.0 +1 18144 202412 1.0 0.0 1.0 +2 22174 202412 1.0 0.0 1.0 +3 38703 202412 1.0 0.0 1.0 +4 47896 202412 1.0 0.0 1.0 +5 58318 202412 1.0 0.0 1.0 +6 66157 202412 1.0 0.0 1.0 +7 70519 202412 1.0 0.0 1.0 +8 88284 202412 1.0 0.0 1.0 +9 89199 202412 1.0 0.0 1.0 +``` + +**Largest Differences**: +``` + permno yyyymm python stata diff +0 10026 
198911 1.0 0.0 1.0 +1 10026 198912 1.0 0.0 1.0 +2 10026 199001 1.0 0.0 1.0 +3 10026 199002 1.0 0.0 1.0 +4 10026 199003 1.0 0.0 1.0 +5 10026 199004 1.0 0.0 1.0 +6 10047 199311 1.0 0.0 1.0 +7 10047 199312 1.0 0.0 1.0 +8 10047 199401 1.0 0.0 1.0 +9 10047 199402 1.0 0.0 1.0 +``` + +**Largest Differences Before 1950**: +``` +No data before 1950 +``` + +--- + +### CustomerMomentum + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has -0.04% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 356,600 +- Python: 356,474 +- Common: 356,426 + +**Precision1**: 0.653% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 7.18e-01 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 356426.0000 | 356426.0000 | 356426.0000 | 356426.0000 | +| mean | 0.0114 | 0.0114 | 5.85e-05 | 5.25e-04 | +| std | 0.1115 | 0.1116 | 0.0065 | 0.0585 | +| min | -0.9813 | -0.9813 | -0.4973 | -4.4621 | +| 25% | -0.0407 | -0.0407 | -2.00e-10 | -1.79e-09 | +| 50% | 0.0102 | 0.0102 | 0.0000 | 0.0000 | +| 75% | 0.0606 | 0.0606 | 1.00e-10 | 8.97e-10 | +| max | 8.1384 | 8.1384 | 0.8190 | 7.3487 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0001 + 0.9993 * stata +- **R-squared**: 0.9966 +- **N observations**: 356,426 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 6.69e-05 | 1.10e-05 | 6.0957 | 0.000 | +| Slope | 0.9993 | 9.80e-05 | 10200.2189 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 2327/356426 (0.653%) +- Stata standard deviation: 1.11e-01 + +**Most Recent 
Bad Observations**: +``` + permno yyyymm python stata diff +0 12513 202411 0.079070 0.050838 0.028232 +1 85265 202411 0.062140 0.038684 0.023456 +2 87299 202411 0.045972 0.041455 0.004517 +3 12513 202410 0.078612 0.080130 -0.001518 +4 85265 202410 0.011421 0.017658 -0.006237 +5 87299 202410 -0.021951 -0.017611 -0.004340 +6 12513 202409 0.050790 0.028884 0.021906 +7 85265 202409 0.014368 0.015946 -0.001578 +8 12513 202408 0.037782 0.004082 0.033701 +9 85265 202408 -0.048965 -0.060372 0.011407 +``` + +**Largest Differences**: +``` + permno yyyymm python stata diff +0 86313 202007 0.809589 -0.009438 0.819028 +1 86313 202004 0.865116 0.208269 0.656847 +2 90508 201304 0.688312 0.067991 0.620321 +3 13887 202007 0.559422 0.013403 0.546019 +4 68187 202007 0.590193 0.044175 0.546019 +5 90508 201210 0.457268 -0.084444 0.541712 +6 86013 200804 0.613770 0.101695 0.512075 +7 22089 202208 0.018277 0.515593 -0.497316 +8 86313 202208 -0.101489 0.395827 -0.497316 +9 22128 202208 -0.090914 0.406401 -0.497316 +``` + +**Largest Differences Before 1950**: +``` +No data before 1950 +``` + +--- + +### DebtIssuance + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,725,997 +- Python: 2,726,040 +- Common: 2,725,997 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 0.00e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.73e+06 | 2.73e+06 | 2.73e+06 | 2.73e+06 | +| mean | 0.5023 | 0.5023 | 4.40e-06 | 8.80e-06 | +| std | 0.5000 | 0.5000 | 0.0021 | 0.0042 | +| min | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 25% 
| 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 50% | 1.0000 | 1.0000 | 0.0000 | 0.0000 | +| 75% | 1.0000 | 1.0000 | 0.0000 | 0.0000 | +| max | 1.0000 | 1.0000 | 1.0000 | 2.0000 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 2,725,997 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 8.84e-06 | 1.80e-06 | 4.9101 | 0.000 | +| Slope | 1.0000 | 2.54e-06 | 393457.8670 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 12/2725997 (0.000%) +- Stata standard deviation: 5.00e-01 + +--- + +### DelBreadth + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.46% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,062,671 +- Python: 1,067,556 +- Common: 1,062,049 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.25e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.06e+06 | 1.06e+06 | 1.06e+06 | 1.06e+06 | +| mean | 0.1317 | 0.1317 | -1.47e-10 | -1.65e-10 | +| std | 0.8890 | 0.8890 | 2.45e-08 | 2.76e-08 | +| min | -47.2500 | -47.2500 | -2.00e-06 | -2.25e-06 | +| 25% | -0.1820 | -0.1820 | -1.00e-09 | -1.12e-09 | +| 50% | 0.0800 | 0.0800 | 0.0000 | 0.0000 | +| 75% | 0.3990 | 0.3990 | 0.0000 | 0.0000 | +| max | 48.0560 | 48.0560 | 1.00e-06 | 1.12e-06 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 1,062,049 + +| Coefficient | Estimate | Std 
Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.90e-11 | 2.40e-11 | 0.7899 | 0.430 | +| Slope | 1.0000 | 2.67e-11 | 3.74e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/1062049 (0.000%) +- Stata standard deviation: 8.89e-01 + +--- + +### DelCOA + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.53% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,295,155 +- Python: 3,312,651 +- Common: 3,295,155 + +**Precision1**: 0.005% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 4.53e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.30e+06 | 3.30e+06 | 3.30e+06 | 3.30e+06 | +| mean | 0.0221 | 0.0221 | -4.49e-08 | -3.70e-07 | +| std | 0.1215 | 0.1215 | 8.00e-05 | 6.59e-04 | +| min | -1.8713 | -1.8713 | -0.0225 | -0.1855 | +| 25% | -0.0140 | -0.0140 | -8.83e-10 | -7.27e-09 | +| 50% | 0.0104 | 0.0104 | 0.0000 | 0.0000 | +| 75% | 0.0557 | 0.0557 | 8.67e-10 | 7.14e-09 | +| max | 1.8202 | 1.8202 | 0.0239 | 0.1970 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,295,155 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -1.00e-07 | 4.48e-08 | -2.2375 | 0.025 | +| Slope | 1.0000 | 3.63e-07 | 2.76e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 168/3295155 (0.005%) +- Stata standard deviation: 1.21e-01 + +--- + +### DelCOL + +**Status**: ✅ 
PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.53% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,259,701 +- Python: 3,276,994 +- Common: 3,259,701 + +**Precision1**: 0.008% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 4.33e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.26e+06 | 3.26e+06 | 3.26e+06 | 3.26e+06 | +| mean | 0.0146 | 0.0146 | -2.25e-06 | -1.92e-05 | +| std | 0.1171 | 0.1170 | 0.0011 | 0.0092 | +| min | -8.3977 | -8.3977 | -0.5525 | -4.7197 | +| 25% | -0.0119 | -0.0119 | -6.63e-10 | -5.66e-09 | +| 50% | 0.0079 | 0.0079 | 0.0000 | 0.0000 | +| 75% | 0.0389 | 0.0389 | 6.64e-10 | 5.67e-09 | +| max | 25.3738 | 25.3738 | 0.0315 | 0.2689 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 0.9993 * stata +- **R-squared**: 0.9999 +- **N observations**: 3,259,701 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 7.40e-06 | 5.97e-07 | 12.3787 | 0.000 | +| Slope | 0.9993 | 5.06e-06 | 197311.2746 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 252/3259701 (0.008%) +- Stata standard deviation: 1.17e-01 + +--- + +### DelDRC + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.58% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 460,159 +- Python: 462,838 +- Common: 460,159 + +**Precision1**: 0.016% obs with std_diff >= 
1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 3.24e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 460159.0000 | 460159.0000 | 460159.0000 | 460159.0000 | +| mean | 0.0070 | 0.0070 | 2.99e-06 | 6.60e-05 | +| std | 0.0453 | 0.0453 | 4.15e-04 | 0.0091 | +| min | -2.4175 | -2.4175 | -4.14e-05 | -9.13e-04 | +| 25% | -0.0016 | -0.0016 | -8.65e-11 | -1.91e-09 | +| 50% | 0.0014 | 0.0014 | 8.11e-14 | 1.79e-12 | +| 75% | 0.0099 | 0.0099 | 9.26e-11 | 2.04e-09 | +| max | 1.1627 | 1.1627 | 0.0792 | 1.7468 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 0.9999 * stata +- **R-squared**: 0.9999 +- **N observations**: 460,159 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 3.44e-06 | 6.18e-07 | 5.5610 | 0.000 | +| Slope | 0.9999 | 1.35e-05 | 74167.0903 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 72/460159 (0.016%) +- Stata standard deviation: 4.53e-02 + +--- + +### DelEqu + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.06% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,194,475 +- Python: 3,196,296 +- Common: 3,194,475 + +**Precision1**: 0.002% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.07e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.19e+06 | 3.19e+06 | 3.19e+06 | 3.19e+06 | +| 
mean | 0.0212 | 0.0212 | 2.28e-06 | 4.18e-06 | +| std | 0.5462 | 0.5462 | 0.0012 | 0.0021 | +| min | -240.0000 | -240.0000 | -0.1665 | -0.3048 | +| 25% | -0.0179 | -0.0179 | -1.39e-09 | -2.54e-09 | +| 50% | 0.0212 | 0.0212 | 4.35e-13 | 7.96e-13 | +| 75% | 0.0750 | 0.0750 | 1.40e-09 | 2.56e-09 | +| max | 24.9274 | 24.9274 | 0.5525 | 1.0115 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,194,475 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 2.91e-06 | 6.55e-07 | 4.4438 | 0.000 | +| Slope | 1.0000 | 1.20e-06 | 834625.7387 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 72/3194475 (0.002%) +- Stata standard deviation: 5.46e-01 + +--- + +### DelFINL + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.06% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,250,876 +- Python: 3,252,733 +- Common: 3,250,876 + +**Precision1**: 0.008% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 4.78e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.25e+06 | 3.25e+06 | 3.25e+06 | 3.25e+06 | +| mean | 0.0265 | 0.0265 | 3.64e-07 | 2.05e-06 | +| std | 0.1773 | 0.1773 | 2.14e-04 | 0.0012 | +| min | -30.5807 | -30.5807 | -0.0490 | -0.2763 | +| 25% | -0.0174 | -0.0174 | -8.28e-10 | -4.67e-09 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 0.0538 | 0.0538 | 8.25e-10 | 4.66e-09 | +| max | 12.4151 | 12.4151 | 0.0466 | 0.2627 | + +**Regression Analysis** 
(Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,250,876 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 5.89e-07 | 1.20e-07 | 4.9162 | 0.000 | +| Slope | 1.0000 | 6.69e-07 | 1.50e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 264/3250876 (0.008%) +- Stata standard deviation: 1.77e-01 + +--- + +### DelLTI + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.06% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,295,155 +- Python: 3,296,976 +- Common: 3,295,155 + +**Precision1**: 0.013% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 6.14e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.30e+06 | 3.30e+06 | 3.30e+06 | 3.30e+06 | +| mean | 0.0063 | 0.0063 | 2.39e-06 | 3.08e-05 | +| std | 0.0776 | 0.0776 | 0.0015 | 0.0198 | +| min | -1.8040 | -1.8040 | -0.2490 | -3.2078 | +| 25% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 5.23e-04 | 5.24e-04 | 0.0000 | 0.0000 | +| max | 1.9377 | 1.9377 | 0.7521 | 9.6876 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 0.9999 * stata +- **R-squared**: 0.9996 +- **N observations**: 3,295,155 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 3.03e-06 | 8.48e-07 | 3.5680 | 0.000 | +| Slope | 0.9999 | 1.09e-05 | 91827.2517 | 0.000 | + +**Feedback**: 
+- Num observations with std_diff >= TOL_DIFF_1: 420/3295155 (0.013%) +- Stata standard deviation: 7.76e-02 + +--- + +### DelNetFin + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.06% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,250,876 +- Python: 3,252,733 +- Common: 3,250,876 + +**Precision1**: 0.020% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 6.19e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.25e+06 | 3.25e+06 | 3.25e+06 | 3.25e+06 | +| mean | -0.0184 | -0.0184 | 1.78e-06 | 8.67e-06 | +| std | 0.2049 | 0.2049 | 0.0016 | 0.0079 | +| min | -12.4151 | -12.4151 | -0.2490 | -1.2157 | +| 25% | -0.0667 | -0.0667 | -5.33e-09 | -2.60e-08 | +| 50% | -0.0015 | -0.0015 | 0.0000 | 0.0000 | +| 75% | 0.0412 | 0.0412 | 5.38e-09 | 2.63e-08 | +| max | 30.5807 | 30.5807 | 0.7521 | 3.6714 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 0.9999 +- **N observations**: 3,250,876 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 8.96e-07 | 9.03e-07 | 0.9925 | 0.321 | +| Slope | 1.0000 | 4.39e-06 | 227758.2534 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 636/3250876 (0.020%) +- Stata standard deviation: 2.05e-01 + +--- + +### DivInit + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +1.61% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ❌ FAILED +- 
Test 5 - T-stat check: NA + +**Observations**: +- Stata: 4,047,630 +- Python: 4,112,633 +- Common: 4,047,630 + +**Precision1**: 0.103% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 7.30e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 4.05e+06 | 4.05e+06 | 4.05e+06 | 4.05e+06 | +| mean | 0.0191 | 0.0181 | -0.0010 | -0.0074 | +| std | 0.1369 | 0.1333 | 0.0320 | 0.2337 | +| min | 0.0000 | 0.0000 | -1.0000 | -7.3042 | +| 25% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| max | 1.0000 | 1.0000 | 1.0000 | 7.3042 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 0.9466 * stata +- **R-squared**: 0.9454 +- **N observations**: 4,047,630 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 4.03e-06 | 1.56e-05 | 0.2578 | 0.797 | +| Slope | 0.9466 | 1.13e-04 | 8369.1440 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 4149/4047630 (0.103%) +- Stata standard deviation: 1.37e-01 + +**Most Recent Bad Observations**: +``` + permno yyyymm python stata diff +0 79145 202412 0.0 1.0 -1.0 +1 81784 202412 0.0 1.0 -1.0 +2 79145 202411 0.0 1.0 -1.0 +3 79145 202410 0.0 1.0 -1.0 +4 79145 202409 0.0 1.0 -1.0 +5 10517 202408 0.0 1.0 -1.0 +6 88988 202408 0.0 1.0 -1.0 +7 10517 202407 0.0 1.0 -1.0 +8 12009 202407 0.0 1.0 -1.0 +9 88988 202407 0.0 1.0 -1.0 +``` + +**Largest Differences**: +``` + permno yyyymm python stata diff +0 10001 200510 0.0 1.0 -1.0 +1 10001 200511 0.0 1.0 -1.0 +2 10001 200512 0.0 1.0 -1.0 +3 10001 200601 0.0 1.0 -1.0 +4 10001 200602 0.0 1.0 -1.0 +5 10001 200603 0.0 1.0 -1.0 +6 10056 199410 0.0 1.0 -1.0 +7 
10056 199411 0.0 1.0 -1.0 +8 10056 199412 0.0 1.0 -1.0 +9 10056 199501 0.0 1.0 -1.0 +``` + +**Largest Differences Before 1950**: +``` + permno yyyymm python stata diff +0 10372 193609 0.0 1.0 -1.0 +1 10372 193610 0.0 1.0 -1.0 +2 10372 193611 0.0 1.0 -1.0 +3 10372 193612 0.0 1.0 -1.0 +4 10372 193701 0.0 1.0 -1.0 +5 10372 193702 0.0 1.0 -1.0 +6 10751 193511 0.0 1.0 -1.0 +7 10751 193512 0.0 1.0 -1.0 +8 10751 193601 0.0 1.0 -1.0 +9 10751 193602 0.0 1.0 -1.0 +``` + +--- + +### DivOmit + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +1.61% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 4,047,630 +- Python: 4,112,633 +- Common: 4,047,630 + +**Precision1**: 0.002% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 0.00e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 4.05e+06 | 4.05e+06 | 4.05e+06 | 4.05e+06 | +| mean | 0.0039 | 0.0039 | -1.46e-05 | -2.34e-04 | +| std | 0.0622 | 0.0621 | 0.0042 | 0.0683 | +| min | 0.0000 | 0.0000 | -1.0000 | -16.0714 | +| 25% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| max | 1.0000 | 1.0000 | 1.0000 | 16.0714 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 0.9958 * stata +- **R-squared**: 0.9953 +- **N observations**: 4,047,630 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.74e-06 | 2.11e-06 | 0.8224 | 0.411 | +| Slope | 0.9958 | 3.39e-05 | 29409.2443 | 0.000 | + +**Feedback**: +- Num observations with 
std_diff >= TOL_DIFF_1: 73/4047630 (0.002%) +- Stata standard deviation: 6.22e-02 + +--- + +### DivSeason + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.60% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,775,339 +- Python: 1,785,982 +- Common: 1,775,339 + +**Precision1**: 0.032% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 0.00e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.78e+06 | 1.78e+06 | 1.78e+06 | 1.78e+06 | +| mean | 0.4456 | 0.4453 | -2.26e-04 | -4.56e-04 | +| std | 0.4970 | 0.4970 | 0.0179 | 0.0361 | +| min | 0.0000 | 0.0000 | -1.0000 | -2.0120 | +| 25% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 1.0000 | 1.0000 | 0.0000 | 0.0000 | +| max | 1.0000 | 1.0000 | 1.0000 | 2.0120 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0001 + 0.9993 * stata +- **R-squared**: 0.9987 +- **N observations**: 1,775,339 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 8.64e-05 | 1.81e-05 | 4.7743 | 0.000 | +| Slope | 0.9993 | 2.71e-05 | 36878.7410 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 572/1775339 (0.032%) +- Stata standard deviation: 4.97e-01 + +--- + +### DivYieldST + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.61% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: 
+- Stata: 1,591,700 +- Python: 1,601,389 +- Common: 1,591,694 + +**Precision1**: 0.069% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 0.00e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.59e+06 | 1.59e+06 | 1.59e+06 | 1.59e+06 | +| mean | 0.6292 | 0.6298 | 6.12e-04 | 5.91e-04 | +| std | 1.0349 | 1.0358 | 0.0286 | 0.0276 | +| min | 0.0000 | 0.0000 | -1.0000 | -0.9663 | +| 25% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 1.0000 | 1.0000 | 0.0000 | 0.0000 | +| max | 3.0000 | 3.0000 | 3.0000 | 2.8989 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0003 + 1.0005 * stata +- **R-squared**: 0.9992 +- **N observations**: 1,591,694 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 3.17e-04 | 2.65e-05 | 11.9651 | 0.000 | +| Slope | 1.0005 | 2.19e-05 | 45689.0915 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 1092/1591694 (0.069%) +- Stata standard deviation: 1.03e+00 + +--- + +### DolVol + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 4,640,493 +- Python: 4,640,493 +- Common: 4,640,493 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.54e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | 
+|------------|----------------|----------------|----------------|----------------| +| count | 4.64e+06 | 4.64e+06 | 4.64e+06 | 4.64e+06 | +| mean | 1.8039 | 1.8039 | 2.66e-10 | 8.56e-11 | +| std | 3.1052 | 3.1052 | 9.44e-08 | 3.04e-08 | +| min | -12.2705 | -12.2705 | -9.62e-07 | -3.10e-07 | +| 25% | -0.3808 | -0.3808 | -4.02e-08 | -1.29e-08 | +| 50% | 1.6649 | 1.6649 | 9.04e-11 | 2.91e-11 | +| 75% | 3.9554 | 3.9554 | 4.12e-08 | 1.33e-08 | +| max | 14.2392 | 14.2392 | 9.61e-07 | 3.10e-07 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 4,640,493 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 5.01e-10 | 5.07e-11 | 9.8872 | 0.000 | +| Slope | 1.0000 | 1.41e-11 | 7.09e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/4640493 (0.000%) +- Stata standard deviation: 3.11e+00 + +--- + +### DownRecomm + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.05% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 463,983 +- Python: 464,223 +- Common: 462,936 + +**Precision1**: 0.025% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 0.00e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 462936.0000 | 462936.0000 | 462936.0000 | 462936.0000 | +| mean | 0.3701 | 0.3701 | -2.16e-06 | -4.47e-06 | +| std | 0.4828 | 0.4828 | 0.0158 | 0.0326 | +| min | 0.0000 | 0.0000 | -1.0000 | -2.0711 | +| 25% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 50% | 0.0000 | 
0.0000 | 0.0000 | 0.0000 | +| 75% | 1.0000 | 1.0000 | 0.0000 | 0.0000 | +| max | 1.0000 | 1.0000 | 1.0000 | 2.0711 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0002 + 0.9995 * stata +- **R-squared**: 0.9989 +- **N observations**: 462,936 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.95e-04 | 2.92e-05 | 6.6982 | 0.000 | +| Slope | 0.9995 | 4.80e-05 | 20835.2921 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 115/462936 (0.025%) +- Stata standard deviation: 4.83e-01 + +--- + +### EBM + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,924,820 +- Python: 2,924,835 +- Common: 2,924,820 + +**Precision1**: 0.001% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.26e-06 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.92e+06 | 2.92e+06 | 2.92e+06 | 2.92e+06 | +| mean | 0.7855 | 0.7852 | -3.11e-04 | -9.27e-07 | +| std | 335.4142 | 333.8812 | 2.9995 | 0.0089 | +| min | -121033.2100 | -121033.2121 | -4029.4330 | -12.0133 | +| 25% | 0.1588 | 0.1588 | -1.34e-08 | -3.99e-11 | +| 50% | 0.4869 | 0.4869 | -2.40e-12 | -7.16e-15 | +| 75% | 0.9656 | 0.9656 | 1.33e-08 | 3.98e-11 | +| max | 471732.5300 | 467703.0970 | 3149.6636 | 9.3904 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0033 + 0.9954 * stata +- **R-squared**: 0.9999 +- **N observations**: 2,924,820 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | 
+|-------------|--------------|--------------|-------------|----------| +| Intercept | 0.0033 | 0.0015 | 2.1955 | 0.028 | +| Slope | 0.9954 | 4.48e-06 | 221979.7775 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 43/2924820 (0.001%) +- Stata standard deviation: 3.35e+02 + +--- + +### EP + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,203,166 +- Python: 2,203,173 +- Common: 2,203,166 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.03e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.20e+06 | 2.20e+06 | 2.20e+06 | 2.20e+06 | +| mean | 0.0885 | 0.0885 | -1.56e-12 | -5.16e-12 | +| std | 0.3016 | 0.3016 | 9.02e-09 | 2.99e-08 | +| min | 0.0000 | 0.0000 | -8.71e-06 | -2.89e-05 | +| 25% | 0.0403 | 0.0403 | -1.24e-09 | -4.10e-09 | +| 50% | 0.0666 | 0.0666 | 0.0000 | 0.0000 | +| 75% | 0.1061 | 0.1061 | 1.23e-09 | 4.09e-09 | +| max | 213.4789 | 213.4789 | 2.83e-06 | 9.39e-06 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 2,203,166 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 4.99e-10 | 6.22e-12 | 80.2365 | 0.000 | +| Slope | 1.0000 | 1.98e-11 | 5.06e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/2203166 (0.000%) +- Stata standard deviation: 3.02e-01 + +--- + +### EarnSupBig + +**Status**: ❌ FAILED + +**Test Results**: 
+- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.37% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,327,518 +- Python: 2,336,093 +- Common: 2,323,705 + +**Precision1**: 0.152% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.47e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.32e+06 | 2.32e+06 | 2.32e+06 | 2.32e+06 | +| mean | 5.45e+10 | -0.1171 | -5.45e+10 | -0.0117 | +| std | 4.64e+12 | 1.1736 | 4.64e+12 | 1.0000 | +| min | -6.79e+13 | -60.7447 | -3.58e+14 | -77.1882 | +| 25% | -0.4066 | -0.3995 | -0.0034 | -7.23e-16 | +| 50% | -0.0832 | -0.0823 | 1.84e-09 | 3.97e-22 | +| 75% | 0.2326 | 0.2270 | 0.0052 | 1.12e-15 | +| max | 3.58e+14 | 60.4149 | 6.79e+13 | 14.6460 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.1172 + 0.0000 * stata +- **R-squared**: 0.0000 +- **N observations**: 2,323,705 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -0.1172 | 7.70e-04 | -152.1701 | 0.000 | +| Slope | 1.53e-15 | 1.66e-16 | 9.2441 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 3539/2323705 (0.152%) +- Stata standard deviation: 4.64e+12 + +**Most Recent Bad Observations**: +``` + permno yyyymm python stata diff +0 10100 200105 0.068662 -5.363412e+13 5.363412e+13 +1 10488 200105 0.068662 -5.363412e+13 5.363412e+13 +2 10680 200105 0.068662 -5.363412e+13 5.363412e+13 +3 11833 200105 0.068662 -5.363412e+13 5.363412e+13 +4 20248 200105 0.068662 -5.363412e+13 5.363412e+13 +5 39773 200105 0.068662 -5.363412e+13 5.363412e+13 +6 62296 200105 0.068662 
-5.363412e+13 5.363412e+13 +7 69200 200105 0.068662 -5.363412e+13 5.363412e+13 +8 75526 200105 0.068662 -5.363412e+13 5.363412e+13 +9 75609 200105 0.068662 -5.363412e+13 5.363412e+13 +``` + +**Largest Differences**: +``` + permno yyyymm python stata diff +0 10613 197308 0.451178 3.580440e+14 -3.580440e+14 +1 11165 197308 0.451178 3.580440e+14 -3.580440e+14 +2 12141 197308 0.451178 3.580440e+14 -3.580440e+14 +3 14227 197308 0.451178 3.580440e+14 -3.580440e+14 +4 14569 197308 0.451178 3.580440e+14 -3.580440e+14 +5 14702 197308 0.451178 3.580440e+14 -3.580440e+14 +6 15078 197308 0.451178 3.580440e+14 -3.580440e+14 +7 15457 197308 0.451178 3.580440e+14 -3.580440e+14 +8 16986 197308 0.451178 3.580440e+14 -3.580440e+14 +9 17523 197308 0.451178 3.580440e+14 -3.580440e+14 +``` + +**Largest Differences Before 1950**: +``` +No data before 1950 +``` + +--- + +### EarningsConsistency + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.12% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,386,008 +- Python: 1,387,650 +- Common: 1,386,008 + +**Precision1**: 0.001% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 3.29e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.39e+06 | 1.39e+06 | 1.39e+06 | 1.39e+06 | +| mean | 0.0649 | 0.0649 | 1.81e-07 | 1.05e-07 | +| std | 1.7181 | 1.7181 | 6.06e-05 | 3.53e-05 | +| min | -274.7972 | -274.7972 | -3.86e-06 | -2.25e-06 | +| 25% | -0.0931 | -0.0931 | -7.10e-09 | -4.13e-09 | +| 50% | 0.1024 | 0.1024 | -9.30e-11 | -5.41e-11 | +| 75% | 0.3403 | 0.3403 | 6.26e-09 | 3.65e-09 | +| max | 82.0000 | 82.0000 | 0.0206 | 0.0120 | + 
+**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 1,386,008 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.81e-07 | 5.15e-08 | 3.5117 | 0.000 | +| Slope | 1.0000 | 3.00e-08 | 3.34e+07 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 12/1386008 (0.001%) +- Stata standard deviation: 1.72e+00 + +--- + +### EarningsForecastDisparity + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has -0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 975,097 +- Python: 975,050 +- Common: 972,933 + +**Precision1**: 0.079% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 8.03e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 972933.0000 | 972933.0000 | 972933.0000 | 972933.0000 | +| mean | -35.2700 | -35.2816 | -0.0115 | -2.09e-05 | +| std | 553.4147 | 553.4348 | 8.0958 | 0.0146 | +| min | -87875.0000 | -87875.0000 | -3590.1192 | -6.4872 | +| 25% | -19.5800 | -19.5778 | -4.39e-07 | -7.93e-10 | +| 50% | -0.5030 | -0.5022 | 0.0000 | 0.0000 | +| 75% | 11.9556 | 11.9517 | 4.17e-07 | 7.53e-10 | +| max | 24225.0000 | 24225.0000 | 1076.2586 | 1.9448 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0140 + 0.9999 * stata +- **R-squared**: 0.9998 +- **N observations**: 972,933 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -0.0140 | 0.0082 
| -1.7065 | 0.088 | +| Slope | 0.9999 | 1.48e-05 | 67422.8889 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 766/972933 (0.079%) +- Stata standard deviation: 5.53e+02 + +--- + +### EarningsStreak + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.03% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,225,060 +- Python: 1,225,411 +- Common: 1,222,765 + +**Precision1**: 0.006% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 7.44e-08 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.22e+06 | 1.22e+06 | 1.22e+06 | 1.22e+06 | +| mean | -0.0014 | -0.0014 | -1.67e-06 | -5.27e-07 | +| std | 3.1638 | 3.1638 | 0.0021 | 6.73e-04 | +| min | -154.1053 | -154.1053 | -0.4182 | -0.1322 | +| 25% | -0.0024 | -0.0024 | -4.35e-11 | -1.38e-11 | +| 50% | 5.09e-04 | 5.09e-04 | 0.0000 | 0.0000 | +| 75% | 0.0025 | 0.0025 | 4.51e-11 | 1.43e-11 | +| max | 915.0000 | 915.0000 | 1.3338 | 0.4216 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 1,222,765 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -1.67e-06 | 1.92e-06 | -0.8667 | 0.386 | +| Slope | 1.0000 | 6.08e-07 | 1.64e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 77/1222765 (0.006%) +- Stata standard deviation: 3.16e+00 + +--- + +### EarningsSurprise + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED 
(Python has -0.02% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,324,394 +- Python: 2,324,036 +- Common: 2,323,952 + +**Precision1**: 0.043% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 8.63e-04 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.32e+06 | 2.32e+06 | 2.32e+06 | 2.32e+06 | +| mean | -0.1913 | -0.1911 | 1.96e-04 | 1.45e-05 | +| std | 13.5570 | 13.5571 | 0.1467 | 0.0108 | +| min | -10442.0660 | -10442.0669 | -33.4490 | -2.4673 | +| 25% | -0.6708 | -0.6708 | -2.36e-08 | -1.74e-09 | +| 50% | 2.18e-08 | 2.07e-16 | -1.85e-10 | -1.36e-11 | +| 75% | 0.6469 | 0.6469 | 2.31e-08 | 1.70e-09 | +| max | 1153.9985 | 1153.9983 | 92.3953 | 6.8153 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0002 + 1.0000 * stata +- **R-squared**: 0.9999 +- **N observations**: 2,323,952 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.87e-04 | 9.62e-05 | 1.9397 | 0.052 | +| Slope | 1.0000 | 7.10e-06 | 140874.8131 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 988/2323952 (0.043%) +- Stata standard deviation: 1.36e+01 + +--- + +### EntMult + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,407,850 +- Python: 2,407,850 +- Common: 2,407,843 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 
6.46e-08 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.41e+06 | 2.41e+06 | 2.41e+06 | 2.41e+06 | +| mean | 23.3369 | 23.3368 | -4.80e-05 | -6.38e-08 | +| std | 753.1756 | 753.1756 | 0.0406 | 5.39e-05 | +| min | -1760.3278 | -1760.3277 | -25.2232 | -0.0335 | +| 25% | 5.1076 | 5.1076 | -1.57e-07 | -2.09e-10 | +| 50% | 8.0373 | 8.0372 | 2.63e-11 | 3.49e-14 | +| 75% | 13.0989 | 13.0990 | 1.57e-07 | 2.09e-10 | +| max | 445235.1900 | 445235.1727 | 15.2084 | 0.0202 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 2,407,843 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -4.80e-05 | 2.62e-05 | -1.8320 | 0.067 | +| Slope | 1.0000 | 3.47e-08 | 2.88e+07 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 9/2407843 (0.000%) +- Stata standard deviation: 7.53e+02 + +--- + +### EquityDuration + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +2.49% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,124,663 +- Python: 3,202,620 +- Common: 3,124,663 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 5.80e-14 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.12e+06 | 3.12e+06 | 3.12e+06 | 3.12e+06 | +| mean | 1.28e+07 | 1.28e+07 | 1.4628 | 2.57e-10 | 
+| std | 5.69e+09 | 5.69e+09 | 644.0980 | 1.13e-07 | +| min | -4.06e+06 | -4.06e+06 | -3500.1904 | -6.16e-07 | +| 25% | 14.1395 | 14.1394 | -5.84e-07 | -1.03e-16 | +| 50% | 16.2921 | 16.2921 | -2.59e-10 | -4.56e-20 | +| 75% | 18.0605 | 18.0605 | 5.82e-07 | 1.02e-16 | +| max | 2.88e+12 | 2.88e+12 | 325978.6533 | 5.73e-05 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0195 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,124,663 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 0.0195 | 0.0166 | 1.1768 | 0.239 | +| Slope | 1.0000 | 2.92e-12 | 3.43e+11 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/3124663 (0.000%) +- Stata standard deviation: 5.69e+09 + +--- + +### ExchSwitch + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 4,047,630 +- Python: 4,047,630 +- Common: 4,047,630 + +**Precision1**: 0.047% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 0.00e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 4.05e+06 | 4.05e+06 | 4.05e+06 | 4.05e+06 | +| mean | 0.0090 | 0.0095 | 4.69e-04 | 0.0050 | +| std | 0.0946 | 0.0970 | 0.0217 | 0.2290 | +| min | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 25% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| max | 1.0000 | 1.0000 | 1.0000 | 10.5750 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 
0.0005 + 0.9995 * stata +- **R-squared**: 0.9501 +- **N observations**: 4,047,630 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 4.73e-04 | 1.08e-05 | 43.7858 | 0.000 | +| Slope | 0.9995 | 1.14e-04 | 8781.2394 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 1899/4047630 (0.047%) +- Stata standard deviation: 9.46e-02 + +--- + +### ExclExp + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +2.11% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,726,232 +- Python: 1,762,591 +- Common: 1,724,204 + +**Precision1**: 0.117% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 8.43e-02 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.72e+06 | 1.72e+06 | 1.72e+06 | 1.72e+06 | +| mean | 0.0473 | 0.0472 | -4.31e-05 | -1.21e-04 | +| std | 0.3560 | 0.3563 | 0.0233 | 0.0654 | +| min | -1.7990 | -1.8000 | -3.6900 | -10.3663 | +| 25% | -0.0200 | -0.0200 | -5.55e-17 | -1.56e-16 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 0.0700 | 0.0700 | 2.78e-17 | 7.80e-17 | +| max | 2.6900 | 2.6900 | 4.1300 | 11.6024 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 0.9989 * stata +- **R-squared**: 0.9957 +- **N observations**: 1,724,204 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 8.38e-06 | 1.79e-05 | 0.4688 | 0.639 | +| Slope | 0.9989 | 4.98e-05 | 20073.2641 | 0.000 | + +**Feedback**: +- Num observations with 
std_diff >= TOL_DIFF_1: 2020/1724204 (0.117%) +- Stata standard deviation: 3.56e-01 + +--- + +### FEPS + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.01% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,957,995 +- Python: 1,958,211 +- Common: 1,954,911 + +**Precision1**: 0.026% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 5.03e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.95e+06 | 1.95e+06 | 1.95e+06 | 1.95e+06 | +| mean | 2.7245 | 2.7242 | -2.83e-04 | -1.42e-06 | +| std | 198.6832 | 198.6832 | 0.0805 | 4.05e-04 | +| min | -129751.1900 | -129751.1900 | -17.6700 | -0.0889 | +| 25% | 0.2400 | 0.2400 | 0.0000 | 0.0000 | +| 50% | 1.0800 | 1.0800 | 0.0000 | 0.0000 | +| 75% | 2.2600 | 2.2600 | 0.0000 | 0.0000 | +| max | 30007.4900 | 30007.4900 | 9.2800 | 0.0467 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0003 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 1,954,911 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -2.82e-04 | 5.76e-05 | -4.9012 | 0.000 | +| Slope | 1.0000 | 2.90e-07 | 3.45e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 513/1954911 (0.026%) +- Stata standard deviation: 1.99e+02 + +--- + +### FR + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + 
+**Observations**: +- Stata: 683,893 +- Python: 683,893 +- Common: 683,893 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 7.02e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 683893.0000 | 683893.0000 | 683893.0000 | 683893.0000 | +| mean | -0.0325 | -0.0325 | 1.52e-10 | 1.96e-10 | +| std | 0.7738 | 0.7738 | 7.28e-08 | 9.41e-08 | +| min | -334.6507 | -334.6506 | -1.82e-05 | -2.36e-05 | +| 25% | -0.0239 | -0.0239 | -1.40e-09 | -1.80e-09 | +| 50% | -0.0021 | -0.0021 | 0.0000 | 0.0000 | +| 75% | 0.0078 | 0.0078 | 1.38e-09 | 1.78e-09 | +| max | 44.5372 | 44.5372 | 2.46e-05 | 3.18e-05 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 683,893 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -1.04e-09 | 8.11e-11 | -12.8577 | 0.000 | +| Slope | 1.0000 | 1.05e-10 | 9.55e+09 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/683893 (0.000%) +- Stata standard deviation: 7.74e-01 + +--- + +### FirmAge + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 4,045,796 +- Python: 4,045,796 +- Common: 4,045,796 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 0.00e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | 
+|------------|----------------|----------------|----------------|----------------| +| count | 4.05e+06 | 4.05e+06 | 4.05e+06 | 4.05e+06 | +| mean | 158.4381 | 158.4381 | 0.0000 | 0.0000 | +| std | 169.9697 | 169.9697 | 0.0000 | 0.0000 | +| min | 1.0000 | 1.0000 | 0.0000 | 0.0000 | +| 25% | 39.0000 | 39.0000 | 0.0000 | 0.0000 | +| 50% | 100.0000 | 100.0000 | 0.0000 | 0.0000 | +| 75% | 219.0000 | 219.0000 | 0.0000 | 0.0000 | +| max | 1189.0000 | 1189.0000 | 0.0000 | 0.0000 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 4,045,796 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 3.77e-11 | 2.04e-14 | 1847.3259 | 0.000 | +| Slope | 1.0000 | 8.78e-17 | 1.14e+16 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/4045796 (0.000%) +- Stata standard deviation: 1.70e+02 + +--- + +### FirmAgeMom + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.09% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 550,434 +- Python: 550,955 +- Common: 550,434 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.70e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 550434.0000 | 550434.0000 | 550434.0000 | 550434.0000 | +| mean | 0.0904 | 0.0904 | -1.46e-12 | -4.12e-12 | +| std | 0.3547 | 0.3547 | 1.05e-08 | 2.97e-08 | +| min | -0.9374 | -0.9374 | -1.04e-06 | -2.93e-06 | +| 25% | -0.0898 | -0.0898 | -2.60e-09 | -7.33e-09 | +| 50% | 0.0415 | 
0.0415 | 0.0000 | 0.0000 | +| 75% | 0.2113 | 0.2113 | 2.60e-09 | 7.33e-09 | +| max | 27.4976 | 27.4976 | 9.60e-07 | 2.71e-06 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 550,434 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -2.83e-12 | 1.47e-11 | -0.1927 | 0.847 | +| Slope | 1.0000 | 4.01e-11 | 2.50e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/550434 (0.000%) +- Stata standard deviation: 3.55e-01 + +--- + +### ForecastDispersion + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.02% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,616,983 +- Python: 1,617,348 +- Common: 1,614,371 + +**Precision1**: 0.070% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 7.03e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.61e+06 | 1.61e+06 | 1.61e+06 | 1.61e+06 | +| mean | 0.2221 | 0.2221 | -7.84e-05 | -5.52e-05 | +| std | 1.4215 | 1.4213 | 0.0267 | 0.0188 | +| min | 0.0000 | 0.0000 | -14.8289 | -10.4320 | +| 25% | 0.0204 | 0.0204 | -1.16e-09 | -8.13e-10 | +| 50% | 0.0476 | 0.0476 | -4.84e-11 | -3.41e-11 | +| 75% | 0.1273 | 0.1273 | 4.62e-10 | 3.25e-10 | +| max | 207.0000 | 207.0000 | 11.0000 | 7.7384 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 0.9997 * stata +- **R-squared**: 0.9996 +- **N observations**: 1,614,371 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | 
+|-------------|--------------|--------------|-------------|----------| +| Intercept | -1.36e-05 | 2.13e-05 | -0.6388 | 0.523 | +| Slope | 0.9997 | 1.48e-05 | 67630.4185 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 1132/1614371 (0.070%) +- Stata standard deviation: 1.42e+00 + +--- + +### Frontier + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,221,161 +- Python: 1,221,161 +- Common: 1,221,161 + +**Precision1**: 0.001% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.31e-05 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.22e+06 | 1.22e+06 | 1.22e+06 | 1.22e+06 | +| mean | -0.0086 | -0.0086 | 1.35e-07 | 1.38e-07 | +| std | 0.9776 | 0.9776 | 5.22e-05 | 5.34e-05 | +| min | -11.5532 | -11.5532 | -0.0033 | -0.0034 | +| 25% | -0.5722 | -0.5722 | -1.10e-07 | -1.12e-07 | +| 50% | 0.0113 | 0.0113 | 1.48e-10 | 1.51e-10 | +| 75% | 0.5788 | 0.5788 | 1.10e-07 | 1.13e-07 | +| max | 23.1422 | 23.1422 | 0.0231 | 0.0237 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 1,221,161 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.35e-07 | 4.73e-08 | 2.8467 | 0.004 | +| Slope | 1.0000 | 4.84e-08 | 2.07e+07 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 7/1221161 (0.001%) +- Stata standard deviation: 9.78e-01 + +--- + +### GP + +**Status**: ✅ PASSED + +**Test 
Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.09% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,970,775 +- Python: 2,973,391 +- Common: 2,970,667 + +**Precision1**: 0.025% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.41e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.97e+06 | 2.97e+06 | 2.97e+06 | 2.97e+06 | +| mean | 0.3281 | 0.3281 | -3.46e-07 | -7.47e-07 | +| std | 0.4633 | 0.4633 | 0.0048 | 0.0103 | +| min | -134.2381 | -134.2381 | -1.4998 | -3.2375 | +| 25% | 0.1555 | 0.1554 | -5.79e-09 | -1.25e-08 | +| 50% | 0.3051 | 0.3051 | 0.0000 | 0.0000 | +| 75% | 0.4835 | 0.4835 | 5.76e-09 | 1.24e-08 | +| max | 12.9441 | 12.9441 | 1.1110 | 2.3980 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 0.9999 * stata +- **R-squared**: 0.9999 +- **N observations**: 2,970,667 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 2.06e-05 | 3.39e-06 | 6.0807 | 0.000 | +| Slope | 0.9999 | 5.98e-06 | 167237.6923 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 732/2970667 (0.025%) +- Stata standard deviation: 4.63e-01 + +--- + +### Governance + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 334,058 +- Python: 334,058 +- Common: 334,058 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 
(tolerance: < 1%) + +**Precision2**: 100th percentile diff = 0.00e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 334058.0000 | 334058.0000 | 334058.0000 | 334058.0000 | +| mean | 9.0443 | 9.0443 | 0.0000 | 0.0000 | +| std | 2.5733 | 2.5733 | 0.0000 | 0.0000 | +| min | 5.0000 | 5.0000 | 0.0000 | 0.0000 | +| 25% | 7.0000 | 7.0000 | 0.0000 | 0.0000 | +| 50% | 9.0000 | 9.0000 | 0.0000 | 0.0000 | +| 75% | 11.0000 | 11.0000 | 0.0000 | 0.0000 | +| max | 14.0000 | 14.0000 | 0.0000 | 0.0000 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 334,058 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 2.54e-12 | 1.14e-14 | 222.5103 | 0.000 | +| Slope | 1.0000 | 1.22e-15 | 8.23e+14 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/334058 (0.000%) +- Stata standard deviation: 2.57e+00 + +--- + +### GrAdExp + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.38% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 898,855 +- Python: 902,283 +- Common: 898,807 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.68e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 898807.0000 | 898807.0000 | 898807.0000 | 898807.0000 | +| mean | 0.0958 | 0.0958 | 
-2.48e-11 | -5.21e-11 | +| std | 0.4754 | 0.4754 | 1.39e-08 | 2.92e-08 | +| min | -5.4159 | -5.4159 | -2.35e-07 | -4.95e-07 | +| 25% | -0.0747 | -0.0747 | -3.07e-09 | -6.47e-09 | +| 50% | 0.0773 | 0.0773 | 0.0000 | 0.0000 | +| 75% | 0.2478 | 0.2478 | 3.13e-09 | 6.59e-09 | +| max | 7.6737 | 7.6737 | 2.71e-07 | 5.71e-07 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 898,807 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 4.06e-11 | 1.50e-11 | 2.7164 | 0.007 | +| Slope | 1.0000 | 3.08e-11 | 3.24e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/898807 (0.000%) +- Stata standard deviation: 4.75e-01 + +--- + +### GrLTNOA + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.53% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,219,259 +- Python: 3,236,460 +- Common: 3,219,235 + +**Precision1**: 0.017% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.38e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.22e+06 | 3.22e+06 | 3.22e+06 | 3.22e+06 | +| mean | 0.0277 | 0.0277 | -1.55e-07 | -2.82e-07 | +| std | 0.5488 | 0.5488 | 6.04e-04 | 0.0011 | +| min | -166.3113 | -166.3113 | -0.1253 | -0.2284 | +| 25% | -0.0128 | -0.0128 | -8.68e-10 | -1.58e-09 | +| 50% | 0.0255 | 0.0255 | 2.69e-13 | 4.90e-13 | +| 75% | 0.0758 | 0.0758 | 8.75e-10 | 1.59e-09 | +| max | 51.4748 | 51.4748 | 0.1309 | 0.2385 | + +**Regression Analysis** (Python ~ Stata): + 
+- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,219,235 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -4.70e-08 | 3.37e-07 | -0.1395 | 0.889 | +| Slope | 1.0000 | 6.14e-07 | 1.63e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 552/3219235 (0.017%) +- Stata standard deviation: 5.49e-01 + +--- + +### GrSaleToGrInv + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.55% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,532,290 +- Python: 2,546,202 +- Common: 2,532,290 + +**Precision1**: 0.039% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 4.24e-03 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.53e+06 | 2.53e+06 | 2.53e+06 | 2.53e+06 | +| mean | -0.6004 | -0.6010 | -5.57e-04 | -5.65e-06 | +| std | 98.7042 | 98.7036 | 0.3541 | 0.0036 | +| min | -27478.1970 | -27478.1975 | -142.5476 | -1.4442 | +| 25% | -0.1477 | -0.1476 | -3.21e-09 | -3.26e-11 | +| 50% | 0.0238 | 0.0239 | 0.0000 | 0.0000 | +| 75% | 0.2039 | 0.2041 | 3.26e-09 | 3.30e-11 | +| max | 16243.9730 | 16243.9730 | 48.0774 | 0.4871 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0006 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 2,532,290 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -5.65e-04 | 2.23e-04 | -2.5395 | 0.011 | +| Slope | 1.0000 | 2.25e-06 | 443591.9824 | 
0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 986/2532290 (0.039%) +- Stata standard deviation: 9.87e+01 + +--- + +### GrSaleToGrOverhead + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.52% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,668,375 +- Python: 2,682,273 +- Common: 2,668,303 + +**Precision1**: 0.017% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.56e-03 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.67e+06 | 2.67e+06 | 2.67e+06 | 2.67e+06 | +| mean | 0.2114 | 0.2127 | 0.0013 | 1.00e-05 | +| std | 134.4912 | 134.4933 | 0.8887 | 0.0066 | +| min | -35415.9490 | -35415.9507 | -153.7168 | -1.1430 | +| 25% | -0.1013 | -0.1012 | -1.73e-09 | -1.29e-11 | +| 50% | -0.0032 | -0.0031 | -1.20e-12 | -8.90e-15 | +| 75% | 0.0962 | 0.0965 | 1.76e-09 | 1.31e-11 | +| max | 51774.7580 | 51774.7570 | 336.9123 | 2.5051 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0013 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 2,668,303 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 0.0013 | 5.44e-04 | 2.4797 | 0.013 | +| Slope | 1.0000 | 4.05e-06 | 247190.9589 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 466/2668303 (0.017%) +- Stata standard deviation: 1.34e+02 + +--- + +### Herf + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.68% rows vs Stata) +- Test 3 - Precision1 check: ✅ 
PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,158,336 +- Python: 3,179,806 +- Common: 3,158,336 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.00e-05 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.16e+06 | 3.16e+06 | 3.16e+06 | 3.16e+06 | +| mean | 0.3294 | 0.3294 | 6.55e-09 | 2.36e-08 | +| std | 0.2778 | 0.2778 | 1.84e-06 | 6.63e-06 | +| min | 0.0000 | -4.16e-17 | -6.07e-04 | -0.0022 | +| 25% | 0.1184 | 0.1184 | -2.40e-09 | -8.64e-09 | +| 50% | 0.2537 | 0.2537 | 0.0000 | 0.0000 | +| 75% | 0.4723 | 0.4723 | 2.66e-09 | 9.58e-09 | +| max | 5.5471 | 5.5471 | 2.19e-04 | 7.89e-04 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,158,336 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -3.19e-09 | 1.61e-09 | -1.9814 | 0.048 | +| Slope | 1.0000 | 3.73e-09 | 2.68e+08 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/3158336 (0.000%) +- Stata standard deviation: 2.78e-01 + +--- + +### HerfAsset + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.98% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,547,057 +- Python: 2,571,906 +- Common: 2,547,057 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 4.22e-06 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| 
Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.55e+06 | 2.55e+06 | 2.55e+06 | 2.55e+06 | +| mean | 0.3431 | 0.3431 | -2.59e-09 | -9.33e-09 | +| std | 0.2778 | 0.2778 | 5.93e-08 | 2.13e-07 | +| min | 0.0162 | 0.0162 | -3.80e-06 | -1.37e-05 | +| 25% | 0.1214 | 0.1214 | -2.45e-09 | -8.80e-09 | +| 50% | 0.2657 | 0.2657 | 0.0000 | 0.0000 | +| 75% | 0.4885 | 0.4885 | 2.79e-09 | 1.00e-08 | +| max | 1.0000 | 1.0000 | 1.38e-07 | 4.95e-07 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 2,547,057 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -7.21e-09 | 5.89e-11 | -122.5264 | 0.000 | +| Slope | 1.0000 | 1.33e-10 | 7.50e+09 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/2547057 (0.000%) +- Stata standard deviation: 2.78e-01 + +--- + +### HerfBE + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.98% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,547,057 +- Python: 2,571,906 +- Common: 2,547,057 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.78e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.55e+06 | 2.55e+06 | 2.55e+06 | 2.55e+06 | +| mean | 70.1127 | 70.1143 | 0.0016 | 2.46e-07 | +| std | 6716.9685 | 6717.1728 | 0.2161 | 3.22e-05 | +| min | 0.0000 | -6.17e-18 | -0.7337 | -1.09e-04 | +| 25% | 
0.1251 | 0.1251 | -2.40e-09 | -3.58e-13 | +| 50% | 0.2675 | 0.2675 | 0.0000 | 0.0000 | +| 75% | 0.5118 | 0.5118 | 2.58e-09 | 3.84e-13 | +| max | 859657.6583 | 859686.7510 | 29.0949 | 0.0043 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0005 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 2,547,057 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -4.83e-04 | 4.42e-05 | -10.9300 | 0.000 | +| Slope | 1.0000 | 6.58e-09 | 1.52e+08 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/2547057 (0.000%) +- Stata standard deviation: 6.72e+03 + +--- + +### High52 + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 4,995,429 +- Python: 4,995,429 +- Common: 4,995,429 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 4.11e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 5.00e+06 | 5.00e+06 | 5.00e+06 | 5.00e+06 | +| mean | 0.7624 | 0.7624 | -5.38e-10 | -1.61e-09 | +| std | 0.3336 | 0.3336 | 2.93e-08 | 8.80e-08 | +| min | 1.76e-04 | 1.76e-04 | -7.54e-06 | -2.26e-05 | +| 25% | 0.6090 | 0.6090 | -1.58e-08 | -4.72e-08 | +| 50% | 0.8130 | 0.8130 | 0.0000 | 0.0000 | +| 75% | 0.9416 | 0.9416 | 1.49e-08 | 4.47e-08 | +| max | 262.3832 | 262.3832 | 1.02e-05 | 3.07e-05 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 4,995,429 + +| 
Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -8.69e-10 | 3.28e-11 | -26.5324 | 0.000 | +| Slope | 1.0000 | 3.94e-11 | 2.54e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/4995429 (0.000%) +- Stata standard deviation: 3.34e-01 + +--- + +### IO_ShortInterest + +**Status**: ✅ PASSED (with override) + +**Override Applied**: +- Reviewed on: 2025-08-30 +- Reviewed by: ac +- Details: Like the other short interest predictors, the only test here that is really informative is the t-stat, and the t-stat test passes. + +**Test Results**: +- Test 1 - Superset check: ❌ FAILED (Python missing 3755 Stata observations) +- Test 2 - NumRows check: ❌ FAILED (Python has +81.83% rows vs Stata) +- Test 3 - Precision1 check: ❌ FAILED +- Test 4 - Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 8,842 +- Python: 16,077 +- Common: 5,087 + +**Precision1**: 1.317% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 8.55e-01 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 5087.0000 | 5087.0000 | 5087.0000 | 5087.0000 | +| mean | 85.8106 | 86.4172 | 0.6067 | 0.0047 | +| std | 127.9503 | 127.7218 | 6.7609 | 0.0528 | +| min | 0.0000 | 0.0000 | -1.00e-05 | -7.82e-08 | +| 25% | 53.6845 | 54.5490 | 0.0000 | 0.0000 | +| 50% | 94.0330 | 94.2970 | 0.0000 | 0.0000 | +| 75% | 115.9700 | 116.0100 | 0.0000 | 0.0000 | +| max | 7896.3999 | 7896.4000 | 124.2600 | 0.9712 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.8796 + 0.9968 * stata +- **R-squared**: 0.9972 +- **N observations**: 5,087 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | 
+|-------------|--------------|--------------|-------------|----------| +| Intercept | 0.8796 | 0.1139 | 7.7196 | 0.000 | +| Slope | 0.9968 | 7.40e-04 | 1347.6823 | 0.000 | + +**Missing Observations Sample**: +``` + index permno yyyymm IO_ShortInterest + 0 10258 201408 89.815002 + 1 10258 201410 89.972000 + 2 10659 201110 92.874001 + 3 10659 201111 92.874001 + 4 10659 201209 90.058998 + 5 10659 201402 85.178001 + 6 10890 200910 1.404000 + 7 10890 201810 121.020000 + 8 10890 201811 121.020000 + 9 10909 200901 89.556999 +``` + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 67/5087 (1.317%) +- Stata standard deviation: 1.28e+02 + +**Most Recent Bad Observations**: +``` + permno yyyymm python stata diff +0 13947 202412 29.321 0.0 29.321 +1 14523 202412 16.578 0.0 16.578 +2 14813 202412 62.901 0.0 62.901 +3 15272 202412 99.702 0.0 99.702 +4 15417 202412 2.684 0.0 2.684 +5 15540 202412 25.440 0.0 25.440 +6 16964 202412 40.467 0.0 40.467 +7 18453 202412 34.810 0.0 34.810 +8 18572 202412 51.215 0.0 51.215 +9 18676 202412 109.680 0.0 109.680 +``` + +**Largest Differences**: +``` + permno yyyymm python stata diff +0 77606 202410 124.26 0.0 124.26 +1 77606 202411 124.26 0.0 124.26 +2 78875 202410 119.86 0.0 119.86 +3 78875 202411 119.86 0.0 119.86 +4 18676 202412 109.68 0.0 109.68 +5 78875 202412 109.44 0.0 109.44 +6 18676 202411 107.16 0.0 107.16 +7 18815 202410 106.76 0.0 106.76 +8 18815 202411 106.76 0.0 106.76 +9 77606 202412 105.53 0.0 105.53 +``` + +**Largest Differences Before 1950**: +``` +No data before 1950 +``` + +--- + +### IdioVol3F + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 4,980,936 +- Python: 4,980,936 +- Common: 4,980,936 + +**Precision1**: 0.021% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + 
+**Precision2**: 100th percentile diff = 5.32e-03 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 4.98e+06 | 4.98e+06 | 4.98e+06 | 4.98e+06 | +| mean | 0.0253 | 0.0253 | -1.28e-07 | -4.50e-06 | +| std | 0.0285 | 0.0285 | 1.57e-05 | 5.50e-04 | +| min | 0.0000 | 0.0000 | -0.0176 | -0.6184 | +| 25% | 0.0103 | 0.0103 | -2.78e-17 | -9.74e-16 | +| 50% | 0.0180 | 0.0180 | 0.0000 | 0.0000 | +| 75% | 0.0312 | 0.0312 | 2.78e-17 | 9.74e-16 | +| max | 7.8173 | 7.7997 | 0.0036 | 0.1266 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 4,980,936 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 7.03e-07 | 9.37e-09 | 75.0922 | 0.000 | +| Slope | 1.0000 | 2.46e-07 | 4.07e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 1059/4980936 (0.021%) +- Stata standard deviation: 2.85e-02 + +--- + +### IdioVolAHT + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 4,849,170 +- Python: 4,849,170 +- Common: 4,849,170 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.40e-04 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 4.85e+06 | 4.85e+06 | 4.85e+06 | 4.85e+06 | +| mean | 0.0300 | 0.0300 | -1.26e-08 | 
-4.75e-07 | +| std | 0.0264 | 0.0264 | 3.39e-07 | 1.28e-05 | +| min | 1.02e-05 | 1.02e-05 | -5.29e-05 | -0.0020 | +| 25% | 0.0142 | 0.0142 | -2.78e-17 | -1.05e-15 | +| 50% | 0.0232 | 0.0232 | 0.0000 | 0.0000 | +| 75% | 0.0379 | 0.0379 | 2.78e-17 | 1.05e-15 | +| max | 2.5092 | 2.5092 | 1.41e-05 | 5.35e-04 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 4,849,170 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -1.92e-08 | 2.33e-10 | -82.5140 | 0.000 | +| Slope | 1.0000 | 5.82e-09 | 1.72e+08 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/4849170 (0.000%) +- Stata standard deviation: 2.64e-02 + +--- + +### Illiquidity + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 4,278,152 +- Python: 4,278,152 +- Common: 4,278,152 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.29e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 4.28e+06 | 4.28e+06 | 4.28e+06 | 4.28e+06 | +| mean | 8.77e-06 | 8.77e-06 | 2.76e-15 | 1.68e-11 | +| std | 1.64e-04 | 1.64e-04 | 5.46e-12 | 3.32e-08 | +| min | 0.0000 | 0.0000 | -4.04e-09 | -2.46e-05 | +| 25% | 7.73e-09 | 7.73e-09 | -2.01e-15 | -1.22e-11 | +| 50% | 1.27e-07 | 1.27e-07 | -4.76e-21 | -2.90e-17 | +| 75% | 1.47e-06 | 1.47e-06 | 2.01e-15 | 1.22e-11 | +| max | 0.0761 | 0.0761 | 3.75e-09 | 2.28e-05 | + +**Regression Analysis** 
(Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 4,278,152 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 3.71e-15 | 2.64e-15 | 1.4041 | 0.160 | +| Slope | 1.0000 | 1.61e-11 | 6.22e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/4278152 (0.000%) +- Stata standard deviation: 1.64e-04 + +--- + +### IndMom + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 4,043,138 +- Python: 4,043,138 +- Common: 4,043,138 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 3.20e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 4.04e+06 | 4.04e+06 | 4.04e+06 | 4.04e+06 | +| mean | 0.0857 | 0.0857 | -4.84e-11 | -2.79e-10 | +| std | 0.1736 | 0.1736 | 5.91e-09 | 3.41e-08 | +| min | -0.9265 | -0.9265 | -2.53e-07 | -1.46e-06 | +| 25% | -0.0099 | -0.0099 | -2.11e-09 | -1.21e-08 | +| 50% | 0.0775 | 0.0775 | -2.82e-11 | -1.62e-10 | +| 75% | 0.1676 | 0.1676 | 2.02e-09 | 1.17e-08 | +| max | 10.5068 | 10.5068 | 2.19e-07 | 1.26e-06 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 4,043,138 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 2.32e-11 | 3.28e-12 | 7.0801 | 0.000 | +| Slope | 1.0000 | 1.69e-11 | 5.90e+10 | 
0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/4043138 (0.000%) +- Stata standard deviation: 1.74e-01 + +--- + +### IndRetBig + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.34% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,607,795 +- Python: 2,616,695 +- Common: 2,602,394 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.37e-15 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.60e+06 | 2.60e+06 | 2.60e+06 | 2.60e+06 | +| mean | 0.0180 | 0.0180 | 7.40e-20 | 1.05e-18 | +| std | 0.0704 | 0.0704 | 3.26e-17 | 4.63e-16 | +| min | -0.4860 | -0.4860 | -2.22e-16 | -3.15e-15 | +| 25% | -0.0206 | -0.0206 | -2.52e-17 | -3.57e-16 | +| 50% | 0.0176 | 0.0176 | 0.0000 | 0.0000 | +| 75% | 0.0554 | 0.0554 | 2.69e-17 | 3.82e-16 | +| max | 1.8831 | 1.8831 | 1.94e-16 | 2.76e-15 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 2,602,394 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 7.14e-16 | 3.80e-18 | 188.0642 | 0.000 | +| Slope | 1.0000 | 5.23e-17 | 1.91e+16 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/2602394 (0.000%) +- Stata standard deviation: 7.04e-02 + +--- + +### IntMom + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - 
Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,686,625 +- Python: 3,686,625 +- Common: 3,686,625 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.66e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.69e+06 | 3.69e+06 | 3.69e+06 | 3.69e+06 | +| mean | 0.0700 | 0.0700 | -8.55e-12 | -1.77e-11 | +| std | 0.4819 | 0.4819 | 1.37e-08 | 2.85e-08 | +| min | -1.0000 | -1.0000 | -2.15e-06 | -4.47e-06 | +| 25% | -0.1555 | -0.1555 | -3.20e-09 | -6.64e-09 | +| 50% | 0.0226 | 0.0226 | 0.0000 | 0.0000 | +| 75% | 0.2110 | 0.2110 | 3.21e-09 | 6.65e-09 | +| max | 80.0474 | 80.0474 | 1.27e-06 | 2.65e-06 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,686,625 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 2.92e-11 | 7.22e-12 | 4.0433 | 0.000 | +| Slope | 1.0000 | 1.48e-11 | 6.74e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/3686625 (0.000%) +- Stata standard deviation: 4.82e-01 + +--- + +### IntanBM + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has -0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,728,575 +- Python: 1,728,573 +- Common: 1,728,572 + +**Precision1**: 0.011% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 3.08e-03 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | 
Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.73e+06 | 1.73e+06 | 1.73e+06 | 1.73e+06 | +| mean | 1.02e-06 | 1.22e-06 | 2.05e-07 | 2.72e-07 | +| std | 0.7511 | 0.7511 | 1.75e-04 | 2.33e-04 | +| min | -5.6063 | -5.6063 | -0.0179 | -0.0238 | +| 25% | -0.4156 | -0.4156 | -2.43e-08 | -3.23e-08 | +| 50% | -0.0282 | -0.0282 | 1.42e-10 | 1.89e-10 | +| 75% | 0.3716 | 0.3716 | 2.46e-08 | 3.27e-08 | +| max | 8.4253 | 8.4253 | 0.0146 | 0.0194 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 1,728,572 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 2.05e-07 | 1.33e-07 | 1.5357 | 0.125 | +| Slope | 1.0000 | 1.77e-07 | 5.64e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 192/1728572 (0.011%) +- Stata standard deviation: 7.51e-01 + +--- + +### IntanCFP + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has -0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,881,254 +- Python: 1,881,252 +- Common: 1,881,251 + +**Precision1**: 0.149% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 4.12e-02 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.88e+06 | 1.88e+06 | 1.88e+06 | 1.88e+06 | +| mean | -8.00e-06 | 6.88e-08 | 8.07e-06 | 1.72e-05 | +| std | 0.4700 | 0.4700 | 0.0019 | 0.0041 | +| min | -47.2334 | -47.2334 | -0.2540 | -0.5403 | +| 25% | -0.1325 | -0.1324 | 
-1.44e-08 | -3.06e-08 | +| 50% | -0.0266 | -0.0266 | 1.33e-09 | 2.83e-09 | +| 75% | 0.0715 | 0.0715 | 2.01e-08 | 4.29e-08 | +| max | 40.0578 | 40.0578 | 0.3426 | 0.7289 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 1,881,251 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 8.07e-06 | 1.40e-06 | 5.7560 | 0.000 | +| Slope | 1.0000 | 2.98e-06 | 335121.7072 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 2808/1881251 (0.149%) +- Stata standard deviation: 4.70e-01 + +--- + +### IntanEP + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has -0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,881,254 +- Python: 1,881,252 +- Common: 1,881,251 + +**Precision1**: 0.155% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 8.14e-02 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.88e+06 | 1.88e+06 | 1.88e+06 | 1.88e+06 | +| mean | -1.15e-05 | -4.50e-08 | 1.14e-05 | 2.22e-05 | +| std | 0.5140 | 0.5140 | 0.0045 | 0.0088 | +| min | -33.2328 | -33.2328 | -1.0573 | -2.0569 | +| 25% | -0.1659 | -0.1658 | -1.53e-08 | -2.97e-08 | +| 50% | -0.0563 | -0.0563 | -2.85e-11 | -5.55e-11 | +| 75% | 0.0607 | 0.0607 | 1.53e-08 | 2.97e-08 | +| max | 42.0109 | 42.0109 | 0.8334 | 1.6212 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 0.9999 * stata +- **R-squared**: 0.9999 +- **N observations**: 1,881,251 + +| Coefficient | Estimate | Std 
Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.14e-05 | 3.29e-06 | 3.4723 | 0.001 | +| Slope | 0.9999 | 6.40e-06 | 156147.6503 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 2907/1881251 (0.155%) +- Stata standard deviation: 5.14e-01 + +--- + +### IntanSP + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has -0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,876,810 +- Python: 1,876,808 +- Common: 1,876,807 + +**Precision1**: 0.009% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.72e-03 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.88e+06 | 1.88e+06 | 1.88e+06 | 1.88e+06 | +| mean | -2.40e-06 | 7.69e-07 | 3.17e-06 | 2.21e-06 | +| std | 1.4374 | 1.4374 | 4.14e-04 | 2.88e-04 | +| min | -37.6318 | -37.6318 | -0.2453 | -0.1706 | +| 25% | -0.8289 | -0.8289 | -4.04e-08 | -2.81e-08 | +| 50% | -0.2497 | -0.2497 | 2.77e-09 | 1.93e-09 | +| 75% | 0.4664 | 0.4664 | 4.87e-08 | 3.39e-08 | +| max | 11.3881 | 11.3881 | 0.0977 | 0.0680 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 1,876,807 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 3.17e-06 | 3.02e-07 | 10.5054 | 0.000 | +| Slope | 1.0000 | 2.10e-07 | 4.76e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 164/1876807 (0.009%) +- Stata standard deviation: 1.44e+00 + +--- + +### 
InvGrowth + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.37% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,973,756 +- Python: 1,981,028 +- Common: 1,973,660 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 7.19e-06 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.97e+06 | 1.97e+06 | 1.97e+06 | 1.97e+06 | +| mean | 0.3649 | 0.3649 | -8.49e-07 | -3.59e-08 | +| std | 23.6303 | 23.6303 | 2.70e-04 | 1.14e-05 | +| min | -1.7528 | -1.7528 | -0.3565 | -0.0151 | +| 25% | -0.1052 | -0.1052 | -1.09e-08 | -4.63e-10 | +| 50% | 0.0340 | 0.0340 | 6.80e-12 | 2.88e-13 | +| 75% | 0.2045 | 0.2045 | 1.32e-08 | 5.58e-10 | +| max | 8214.9326 | 8214.9323 | 0.0879 | 0.0037 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 1,973,660 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -5.68e-07 | 1.92e-07 | -2.9571 | 0.003 | +| Slope | 1.0000 | 8.12e-09 | 1.23e+08 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 1/1973660 (0.000%) +- Stata standard deviation: 2.36e+01 + +--- + +### InvestPPEInv + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.55% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,928,130 +- Python: 2,944,339 +- Common: 2,928,106 + 
+**Precision1**: 0.002% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.85e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.93e+06 | 2.93e+06 | 2.93e+06 | 2.93e+06 | +| mean | 0.0854 | 0.0854 | 1.84e-06 | 2.56e-06 | +| std | 0.7165 | 0.7165 | 8.47e-04 | 0.0012 | +| min | -13.0490 | -13.0490 | -0.0085 | -0.0119 | +| 25% | 0.0000 | 0.0000 | -1.36e-09 | -1.90e-09 | +| 50% | 0.0422 | 0.0422 | 0.0000 | 0.0000 | +| 75% | 0.1171 | 0.1171 | 1.35e-09 | 1.88e-09 | +| max | 264.5906 | 264.5906 | 0.4182 | 0.5836 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 2,928,106 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.86e-06 | 4.99e-07 | 3.7309 | 0.000 | +| Slope | 1.0000 | 6.91e-07 | 1.45e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 60/2928106 (0.002%) +- Stata standard deviation: 7.17e-01 + +--- + +### Investment + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.57% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,411,862 +- Python: 2,425,728 +- Common: 2,411,862 + +**Precision1**: 0.110% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 3.00e-02 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.41e+06 | 
2.41e+06 | 2.41e+06 | 2.41e+06 | +| mean | 1.0036 | 1.0028 | -8.65e-04 | -4.73e-04 | +| std | 1.8269 | 1.8267 | 0.0574 | 0.0314 | +| min | -2512.3491 | -2512.3180 | -25.0000 | -13.6843 | +| 25% | 0.6665 | 0.6660 | -2.20e-08 | -1.21e-08 | +| 50% | 0.9327 | 0.9324 | 0.0000 | 0.0000 | +| 75% | 1.2036 | 1.2033 | 2.19e-08 | 1.20e-08 | +| max | 253.6225 | 253.6223 | 24.4389 | 13.3771 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0003 + 0.9994 * stata +- **R-squared**: 0.9990 +- **N observations**: 2,411,862 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -2.52e-04 | 4.22e-05 | -5.9789 | 0.000 | +| Slope | 0.9994 | 2.02e-05 | 49379.4474 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 2662/2411862 (0.110%) +- Stata standard deviation: 1.83e+00 + +--- + +### LRreversal + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +1.18% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,059,782 +- Python: 3,095,900 +- Common: 3,059,642 + +**Precision1**: 0.119% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 3.18e-02 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.06e+06 | 3.06e+06 | 3.06e+06 | 3.06e+06 | +| mean | 0.3427 | 0.3427 | 6.08e-05 | 4.60e-05 | +| std | 1.3198 | 1.3200 | 0.0394 | 0.0298 | +| min | -1.0000 | -1.0000 | -23.2919 | -17.6487 | +| 25% | -0.2333 | -0.2333 | -7.12e-09 | -5.40e-09 | +| 50% | 0.1445 | 0.1446 | 0.0000 | 0.0000 | +| 75% | 0.5972 | 0.5973 | 7.12e-09 | 5.39e-09 | +| max | 544.3116 | 544.3116 
| 26.7868 | 20.2968 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0002 + 0.9997 * stata +- **R-squared**: 0.9991 +- **N observations**: 3,059,642 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.61e-04 | 2.33e-05 | 6.9044 | 0.000 | +| Slope | 0.9997 | 1.71e-05 | 58612.7996 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 3631/3059642 (0.119%) +- Stata standard deviation: 1.32e+00 + +--- + +### Leverage + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,014,665 +- Python: 3,014,676 +- Common: 3,014,665 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.95e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.01e+06 | 3.01e+06 | 3.01e+06 | 3.01e+06 | +| mean | 2.6680 | 2.6680 | 1.29e-07 | 7.20e-09 | +| std | 17.9005 | 17.9005 | 6.48e-05 | 3.62e-06 | +| min | 0.0000 | 0.0000 | -1.80e-04 | -1.01e-05 | +| 25% | 0.2228 | 0.2228 | -1.12e-08 | -6.24e-10 | +| 50% | 0.6455 | 0.6455 | 6.12e-13 | 3.42e-14 | +| 75% | 1.8667 | 1.8667 | 1.11e-08 | 6.21e-10 | +| max | 5277.1953 | 5277.1954 | 0.0512 | 0.0029 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,014,665 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.18e-07 | 3.77e-08 | 3.1409 | 
0.002 | +| Slope | 1.0000 | 2.08e-09 | 4.80e+08 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/3014665 (0.000%) +- Stata standard deviation: 1.79e+01 + +--- + +### MRreversal + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +2.96% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,518,261 +- Python: 3,622,343 +- Common: 3,506,447 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.69e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.51e+06 | 3.51e+06 | 3.51e+06 | 3.51e+06 | +| mean | 0.0738 | 0.0738 | -8.49e-12 | -1.78e-11 | +| std | 0.4762 | 0.4762 | 1.37e-08 | 2.87e-08 | +| min | -1.0000 | -1.0000 | -2.15e-06 | -4.52e-06 | +| 25% | -0.1515 | -0.1515 | -3.19e-09 | -6.70e-09 | +| 50% | 0.0258 | 0.0258 | 0.0000 | 0.0000 | +| 75% | 0.2131 | 0.2131 | 3.20e-09 | 6.71e-09 | +| max | 80.0474 | 80.0474 | 1.27e-06 | 2.68e-06 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,506,447 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 2.90e-11 | 7.39e-12 | 3.9223 | 0.000 | +| Slope | 1.0000 | 1.53e-11 | 6.52e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/3506447 (0.000%) +- Stata standard deviation: 4.76e-01 + +--- + +### MS + +**Status**: ✅ PASSED (with override) + +**Override Applied**: +- Reviewed on: 2025-08-26 +- Reviewed by: ac +- Details: This is a complicated predictor with 
many many places for deviations to begin. Best I could do is get a Precision1 failure rate of 15.7%. The deviations seem to come from (1) tiny differences in the Compustat quarterly data that may be related to the deduplication in the DataDownloads code, and (2) bizarre edge behavior of Stata's asrol function that I could not make heads or tails of despite many hours of debugging. The first case we may want to fix: basically we may want to make sure when we deduplicate that we keep the row with more observations, or at least do something more systematic. The second case I think we should abandon. There are rare instances where Stata asrol decides to return Null despite there being enough observations in the window. + +These tiny differences accumulate in the complicated algorithm, which involves using many large rolling windows. I looked at many of the largest deviations and found all of them were linked to either problem (1) or (2). Most of them seemed to be due to problem (1). + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ❌ FAILED +- Test 4 - Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 473,079 +- Python: 473,080 +- Common: 473,073 + +**Precision1**: 15.549% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.95e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 473073.0000 | 473073.0000 | 473073.0000 | 473073.0000 | +| mean | 3.8814 | 3.8113 | -0.0701 | -0.0454 | +| std | 1.5421 | 1.5211 | 0.4388 | 0.2846 | +| min | 1.0000 | 1.0000 | -4.0000 | -2.5939 | +| 25% | 3.0000 | 3.0000 | 0.0000 | 0.0000 | +| 50% | 4.0000 | 4.0000 | 0.0000 | 0.0000 | +| 75% | 5.0000 | 5.0000 | 0.0000 | 0.0000 | +| 
max | 6.0000 | 6.0000 | 3.0000 | 1.9454 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.1396 + 0.9460 * stata +- **R-squared**: 0.9198 +- **N observations**: 473,073 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 0.1396 | 0.0017 | 82.2851 | 0.000 | +| Slope | 0.9460 | 4.06e-04 | 2328.6786 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 73558/473073 (15.549%) +- Stata standard deviation: 1.54e+00 + +**Most Recent Bad Observations**: +``` + permno yyyymm python stata diff +0 10966 202412 4.0 5.0 -1.0 +1 11275 202412 4.0 5.0 -1.0 +2 11308 202412 4.0 3.0 1.0 +3 11884 202412 2.0 3.0 -1.0 +4 12084 202412 5.0 6.0 -1.0 +5 12411 202412 4.0 5.0 -1.0 +6 13721 202412 5.0 6.0 -1.0 +7 13956 202412 4.0 5.0 -1.0 +8 14182 202412 2.0 1.0 1.0 +9 14258 202412 6.0 5.0 1.0 +``` + +**Largest Differences**: +``` + permno yyyymm python stata diff +0 11600 200906 1.0 5.0 -4.0 +1 11600 200907 1.0 5.0 -4.0 +2 11600 200908 1.0 5.0 -4.0 +3 11600 200909 1.0 5.0 -4.0 +4 11600 200910 1.0 5.0 -4.0 +5 11600 201603 1.0 5.0 -4.0 +6 11600 201604 1.0 5.0 -4.0 +7 11600 201605 1.0 5.0 -4.0 +8 12169 200306 2.0 6.0 -4.0 +9 12169 200307 2.0 6.0 -4.0 +``` + +**Largest Differences Before 1950**: +``` +No data before 1950 +``` + +--- + +### MaxRet + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 5,033,574 +- Python: 5,033,574 +- Common: 5,033,574 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 0.00e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | 
+|------------|----------------|----------------|----------------|----------------| +| count | 5.03e+06 | 5.03e+06 | 5.03e+06 | 5.03e+06 | +| mean | 0.0688 | 0.0688 | 0.0000 | 0.0000 | +| std | 0.1031 | 0.1031 | 0.0000 | 0.0000 | +| min | -0.8696 | -0.8696 | 0.0000 | 0.0000 | +| 25% | 0.0248 | 0.0248 | 0.0000 | 0.0000 | +| 50% | 0.0449 | 0.0449 | 0.0000 | 0.0000 | +| 75% | 0.0811 | 0.0811 | 0.0000 | 0.0000 | +| max | 39.7253 | 39.7253 | 0.0000 | 0.0000 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 5,033,574 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 8.53e-15 | 5.31e-18 | 1606.4818 | 0.000 | +| Slope | 1.0000 | 4.29e-17 | 2.33e+16 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/5033574 (0.000%) +- Stata standard deviation: 1.03e-01 + +--- + +### MeanRankRevGrowth + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.05% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,028,817 +- Python: 2,029,858 +- Common: 2,028,817 + +**Precision1**: 0.003% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.53e-03 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.03e+06 | 2.03e+06 | 2.03e+06 | 2.03e+06 | +| mean | 2350.3064 | 2350.3229 | 0.0165 | 1.58e-05 | +| std | 1044.5183 | 1044.5141 | 2.5820 | 0.0025 | +| min | 10.6000 | 10.6000 | -827.3333 | -0.7921 | +| 25% | 1639.9333 | 1640.0000 | -3.33e-05 | -3.19e-08 | +| 50% | 2327.5334 | 2327.5333 
| 0.0000 | 0.0000 | +| 75% | 3009.8667 | 3009.8667 | 3.33e-05 | 3.19e-08 | +| max | 6667.5332 | 6667.5333 | 38.9334 | 0.0373 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0330 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 2,028,817 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 0.0330 | 0.0045 | 7.3956 | 0.000 | +| Slope | 1.0000 | 1.74e-06 | 576210.3065 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 56/2028817 (0.003%) +- Stata standard deviation: 1.04e+03 + +--- + +### Mom12m + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.04% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,713,622 +- Python: 3,715,128 +- Common: 3,713,622 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.50e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.71e+06 | 3.71e+06 | 3.71e+06 | 3.71e+06 | +| mean | 0.1328 | 0.1328 | -4.81e-12 | -6.24e-12 | +| std | 0.7707 | 0.7707 | 2.10e-08 | 2.72e-08 | +| min | -1.0000 | -1.0000 | -3.45e-06 | -4.48e-06 | +| 25% | -0.2091 | -0.2091 | -4.67e-09 | -6.06e-09 | +| 50% | 0.0459 | 0.0459 | 0.0000 | 0.0000 | +| 75% | 0.3214 | 0.3214 | 4.66e-09 | 6.05e-09 | +| max | 436.6845 | 436.6845 | 4.31e-06 | 5.60e-06 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,713,622 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | 
+|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.26e-11 | 1.10e-11 | 1.1398 | 0.254 | +| Slope | 1.0000 | 1.41e-11 | 7.09e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/3713622 (0.000%) +- Stata standard deviation: 7.71e-01 + +--- + +### Mom12mOffSeason + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,865,561 +- Python: 3,865,561 +- Common: 3,865,561 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.43e-15 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.87e+06 | 3.87e+06 | 3.87e+06 | 3.87e+06 | +| mean | 0.0113 | 0.0113 | 2.56e-21 | 4.39e-20 | +| std | 0.0582 | 0.0582 | 2.93e-17 | 5.03e-16 | +| min | -0.5758 | -0.5758 | -8.88e-16 | -1.53e-14 | +| 25% | -0.0153 | -0.0153 | -2.78e-17 | -4.77e-16 | +| 50% | 0.0096 | 0.0096 | 0.0000 | 0.0000 | +| 75% | 0.0351 | 0.0351 | 2.78e-17 | 4.77e-16 | +| max | 4.2943 | 4.2943 | 8.88e-16 | 1.53e-14 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,865,561 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -7.00e-16 | 1.75e-18 | -399.0188 | 0.000 | +| Slope | 1.0000 | 2.96e-17 | 3.38e+16 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/3865561 (0.000%) +- Stata standard deviation: 5.82e-02 + +--- + +### Mom6m + +**Status**: ✅ PASSED + +**Test 
Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.04% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,893,591 +- Python: 3,895,206 +- Common: 3,893,591 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.63e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.89e+06 | 3.89e+06 | 3.89e+06 | 3.89e+06 | +| mean | 0.0556 | 0.0556 | 7.28e-12 | 1.67e-11 | +| std | 0.4365 | 0.4365 | 1.24e-08 | 2.85e-08 | +| min | -1.0000 | -1.0000 | -1.69e-06 | -3.88e-06 | +| 25% | -0.1471 | -0.1471 | -2.94e-09 | -6.73e-09 | +| 50% | 0.0171 | 0.0171 | 0.0000 | 0.0000 | +| 75% | 0.1891 | 0.1891 | 2.95e-09 | 6.75e-09 | +| max | 66.9428 | 66.9428 | 1.27e-06 | 2.91e-06 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,893,591 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.43e-11 | 6.35e-12 | 2.2572 | 0.024 | +| Slope | 1.0000 | 1.44e-11 | 6.93e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/3893591 (0.000%) +- Stata standard deviation: 4.37e-01 + +--- + +### Mom6mJunk + +**Status**: ✅ PASSED (with override) + +**Override Applied**: +- Reviewed on: 2025-08-21 +- Reviewed by: ac +- Details: Python is missing permno 10026 (gvkey 12825) in 198907. This is because the CIQ security rating has "NR" in 1989-07, which means that it should be excluded (see Avramov et al 2007 JF Table 3). We want only not-investment-grade stocks, excluding not-rated stocks. 
The old CIQ data likely missed this due to the poor deduplication code. The original paper only used SP ratings, so it's unclear what to do here. But the long-short portfolio t-stat and mean return match the OP quite well, so I'm accepting this. + +**Test Results**: +- Test 1 - Superset check: ❌ FAILED (Python missing 48991 Stata observations) +- Test 2 - NumRows check: ✅ PASSED (Python has -11.54% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 391,738 +- Python: 346,535 +- Common: 342,747 + +**Precision1**: 0.282% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 5.64e-01 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 342747.0000 | 342747.0000 | 342747.0000 | 342747.0000 | +| mean | 0.0549 | 0.0548 | -1.63e-04 | -4.27e-04 | +| std | 0.3822 | 0.3826 | 0.0172 | 0.0450 | +| min | -0.9947 | -0.9947 | -1.1543 | -3.0200 | +| 25% | -0.1348 | -0.1351 | -3.00e-09 | -7.85e-09 | +| 50% | 0.0326 | 0.0327 | 1.31e-14 | 3.42e-14 | +| 75% | 0.2011 | 0.2013 | 3.02e-09 | 7.90e-09 | +| max | 47.6527 | 47.6527 | 1.5000 | 3.9245 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0002 + 0.9999 * stata +- **R-squared**: 0.9980 +- **N observations**: 342,747 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -1.59e-04 | 2.97e-05 | -5.3608 | 0.000 | +| Slope | 0.9999 | 7.69e-05 | 13001.7151 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 967/342747 (0.282%) +- Stata standard deviation: 3.82e-01 + +**Most Recent Bad Observations**: +``` + permno yyyymm python stata diff +0 90979 202401 -0.064794 0.150524 
-0.215318 +1 90979 202312 -0.190532 0.065445 -0.255977 +2 90979 202311 -0.348997 -0.103403 -0.245594 +3 90353 202310 0.379207 0.206912 0.172295 +4 90979 202310 -0.478251 -0.271199 -0.207052 +5 93338 202310 0.035490 0.263470 -0.227979 +6 90353 202309 0.537673 0.248826 0.288847 +7 90979 202309 -0.290354 -0.240253 -0.050101 +8 93338 202309 0.500335 0.372355 0.127980 +9 90353 202308 0.491526 0.339849 0.151677 +``` + +**Largest Differences**: +``` + permno yyyymm python stata diff +0 67969 199207 1.499999 0.000000 1.499999 +1 10342 200001 1.736612 0.487289 1.249323 +2 86360 200106 -0.469291 0.685000 -1.154291 +3 90352 201210 -0.685484 0.426830 -1.112313 +4 80658 200110 1.066668 0.000000 1.066668 +5 48565 199307 1.333332 0.272727 1.060605 +6 24731 198604 -0.459091 0.545455 -1.004546 +7 67126 199002 1.821039 0.829268 0.991771 +8 83161 200409 0.763565 -0.222222 0.985787 +9 79338 200206 0.933027 -0.043428 0.976455 +``` + +**Largest Differences Before 1950**: +``` +No data before 1950 +``` + +--- + +### MomOffSeason + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +1.18% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,396,704 +- Python: 3,436,865 +- Common: 3,396,704 + +**Precision1**: 0.893% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.20e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.40e+06 | 3.40e+06 | 3.40e+06 | 3.40e+06 | +| mean | 0.0125 | 0.0124 | -6.59e-05 | -0.0024 | +| std | 0.0270 | 0.0264 | 0.0060 | 0.2208 | +| min | -4.1713 | -0.3549 | -1.2656 | -46.9028 | +| 25% | 3.95e-04 | 3.59e-04 | -5.00e-10 | -1.85e-08 | +| 50% | 0.0119 | 0.0118 | 0.0000 
| 0.0000 | +| 75% | 0.0240 | 0.0240 | 5.00e-10 | 1.85e-08 | +| max | 1.5150 | 1.5150 | 3.8837 | 143.9315 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0005 + 0.9545 * stata +- **R-squared**: 0.9513 +- **N observations**: 3,396,704 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 5.00e-04 | 3.48e-06 | 143.6172 | 0.000 | +| Slope | 0.9545 | 1.17e-04 | 8143.1419 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 30342/3396704 (0.893%) +- Stata standard deviation: 2.70e-02 + +**Most Recent Bad Observations**: +``` + permno yyyymm python stata diff +0 11379 202412 -0.009633 0.014271 -0.023904 +1 12799 202412 0.024784 0.074968 -0.050184 +2 14051 202412 -0.002322 -0.003499 0.001177 +3 16086 202412 -0.064238 -0.019903 -0.044335 +4 16794 202412 -0.000793 0.013392 -0.014186 +5 17147 202412 -0.067903 -0.071290 0.003386 +6 17901 202412 -0.031193 -0.021832 -0.009361 +7 18065 202412 0.002178 0.002903 -0.000726 +8 18103 202412 -0.019200 0.002476 -0.021675 +9 19833 202412 -0.054061 -0.049822 -0.004239 +``` + +**Largest Differences**: +``` + permno yyyymm python stata diff +0 89169 202105 -0.287618 -4.171327 3.883708 +1 44230 198407 -0.059679 -1.585526 1.525847 +2 92161 199008 -0.128534 -1.574922 1.446388 +3 79704 200304 -0.098913 1.166667 -1.265580 +4 10097 199202 -0.105667 1.000000 -1.105667 +5 77324 200102 0.013863 1.086957 -1.073094 +6 10685 199512 -0.048412 -1.021461 0.973049 +7 82810 200509 -0.172706 -1.145503 0.972797 +8 78414 198610 0.354448 1.300000 -0.945552 +9 79704 200302 0.079939 0.960578 -0.880639 +``` + +**Largest Differences Before 1950**: +``` +No data before 1950 +``` + +--- + +### MomOffSeason06YrPlus + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +1.93% rows vs Stata) +- Test 3 - Precision1 check: ❌ FAILED +- Test 4 - 
Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,425,319 +- Python: 2,472,139 +- Common: 2,425,319 + +**Precision1**: 1.311% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 3.46e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.43e+06 | 2.43e+06 | 2.43e+06 | 2.43e+06 | +| mean | 0.0130 | 0.0132 | 1.15e-04 | 0.0036 | +| std | 0.0324 | 0.0229 | 0.0223 | 0.6882 | +| min | -4.8725 | -0.7500 | -15.4780 | -478.1221 | +| 25% | 0.0027 | 0.0027 | -4.55e-10 | -1.40e-08 | +| 50% | 0.0125 | 0.0125 | 0.0000 | 0.0000 | +| 75% | 0.0233 | 0.0233 | 4.55e-10 | 1.40e-08 | +| max | 15.8923 | 0.7811 | 4.8320 | 149.2626 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0065 + 0.5136 * stata +- **R-squared**: 0.5268 +- **N observations**: 2,425,319 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 0.0065 | 1.09e-05 | 591.9489 | 0.000 | +| Slope | 0.5136 | 3.13e-04 | 1643.0826 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 31795/2425319 (1.311%) +- Stata standard deviation: 3.24e-02 + +**Most Recent Bad Observations**: +``` + permno yyyymm python stata diff +0 11379 202412 -0.016035 -0.019014 0.002980 +1 12799 202412 0.019427 0.023379 -0.003953 +2 13563 202412 -0.016118 -0.048507 0.032389 +3 13828 202412 -0.018830 -0.008100 -0.010729 +4 13878 202412 0.007271 0.019389 -0.012117 +5 14051 202412 -0.052231 -0.051790 -0.000441 +6 15294 202412 -0.048290 -0.073018 0.024727 +7 15793 202412 -0.051124 -0.048555 -0.002569 +8 16086 202412 -0.011129 -0.002476 -0.008653 +9 16773 202412 -0.069418 -0.072436 0.003018 +``` + +**Largest Differences**: +``` + permno yyyymm python stata 
diff +0 13755 202011 0.414330 15.892301 -15.477971 +1 13755 202111 0.414330 15.892301 -15.477971 +2 13755 202211 0.414330 15.892301 -15.477971 +3 83382 200510 -0.040478 -4.872470 4.831992 +4 10685 199412 -0.042227 -3.649201 3.606974 +5 86237 201012 -0.029337 3.244835 -3.274172 +6 81728 200612 -0.093591 -3.352354 3.258763 +7 88321 200611 -0.122625 2.289885 -2.412510 +8 76356 198510 0.037454 -2.300340 2.337795 +9 12715 198603 -0.003988 2.214316 -2.218304 +``` + +**Largest Differences Before 1950**: +``` + permno yyyymm python stata diff +0 16408 193402 0.015001 -0.110645 0.125646 +1 16408 193403 0.015493 -0.043396 0.058889 +2 16408 193802 0.018760 -0.036882 0.055642 +3 16408 193502 0.013422 -0.036882 0.050304 +4 16408 193602 0.013422 -0.036882 0.050304 +5 16408 193702 0.013422 -0.036882 0.050304 +6 16408 193807 0.014277 -0.031911 0.046187 +7 16408 193407 0.011901 -0.031911 0.043812 +8 16408 193507 0.011901 -0.031911 0.043812 +9 16408 193607 0.011901 -0.031911 0.043812 +``` + +--- + +### MomOffSeason11YrPlus + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +1.79% rows vs Stata) +- Test 3 - Precision1 check: ❌ FAILED +- Test 4 - Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,677,532 +- Python: 1,707,556 +- Common: 1,677,532 + +**Precision1**: 1.504% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 4.55e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.68e+06 | 1.68e+06 | 1.68e+06 | 1.68e+06 | +| mean | 0.0135 | 0.0136 | 5.08e-05 | 0.0020 | +| std | 0.0254 | 0.0224 | 0.0117 | 0.4586 | +| min | -2.6111 | -0.6522 | -2.2556 | -88.7393 | +| 25% | 0.0034 | 0.0034 | -4.55e-10 | -1.79e-08 | +| 50% | 0.0128 | 0.0128 | 0.0000 | 
0.0000 | +| 75% | 0.0235 | 0.0234 | 4.55e-10 | 1.79e-08 | +| max | 2.2478 | 1.2500 | 2.6319 | 103.5419 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0030 + 0.7842 * stata +- **R-squared**: 0.7898 +- **N observations**: 1,677,532 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 0.0030 | 8.99e-06 | 329.9016 | 0.000 | +| Slope | 0.7842 | 3.12e-04 | 2510.2788 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 25230/1677532 (1.504%) +- Stata standard deviation: 2.54e-02 + +**Most Recent Bad Observations**: +``` + permno yyyymm python stata diff +0 11379 202412 0.022511 0.029216 -0.006706 +1 12799 202412 -0.005228 -0.080205 0.074976 +2 14051 202412 -0.029381 -0.028140 -0.001241 +3 32791 202412 -0.002256 -0.009110 0.006855 +4 77900 202412 0.015898 0.024432 -0.008534 +5 79666 202412 0.023037 0.032062 -0.009025 +6 79903 202412 -0.000149 0.000592 -0.000741 +7 82156 202412 -0.025388 -0.023979 -0.001408 +8 84321 202412 0.014481 -0.102155 0.116636 +9 86812 202412 0.023247 0.024643 -0.001396 +``` + +**Largest Differences**: +``` + permno yyyymm python stata diff +0 11803 201112 0.020796 -2.611104 2.631900 +1 77729 200404 0.016586 -2.373474 2.390060 +2 33136 197712 -0.007831 2.247807 -2.255638 +3 24110 198612 -0.046078 2.129370 -2.175448 +4 65518 200005 -0.014146 1.961642 -1.975788 +5 36492 197902 0.057469 1.643900 -1.586431 +6 11803 201110 0.081820 -1.379478 1.461298 +7 86092 201008 0.196499 1.608412 -1.411913 +8 82163 201403 0.054147 -1.259050 1.313197 +9 14761 200804 0.022150 1.294914 -1.272764 +``` + +**Largest Differences Before 1950**: +``` +No data before 1950 +``` + +--- + +### MomOffSeason16YrPlus + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +1.68% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ❌ 
FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,027,449 +- Python: 1,044,703 +- Common: 1,027,449 + +**Precision1**: 0.740% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 9.90e-01 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.03e+06 | 1.03e+06 | 1.03e+06 | 1.03e+06 | +| mean | 0.0150 | 0.0150 | -1.69e-06 | -9.62e-05 | +| std | 0.0175 | 0.0175 | 0.0013 | 0.0742 | +| min | -0.1110 | -0.1110 | -0.0954 | -5.4417 | +| 25% | 0.0053 | 0.0053 | -4.27e-10 | -2.44e-08 | +| 50% | 0.0134 | 0.0134 | 0.0000 | 0.0000 | +| 75% | 0.0230 | 0.0230 | 4.44e-10 | 2.54e-08 | +| max | 0.3670 | 0.3670 | 0.1226 | 6.9931 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0001 + 0.9945 * stata +- **R-squared**: 0.9945 +- **N observations**: 1,027,449 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 8.13e-05 | 1.68e-06 | 48.3098 | 0.000 | +| Slope | 0.9945 | 7.30e-05 | 13619.9181 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 7604/1027449 (0.740%) +- Stata standard deviation: 1.75e-02 + +**Most Recent Bad Observations**: +``` + permno yyyymm python stata diff +0 11379 202412 0.023706 0.014200 0.009506 +1 32791 202412 0.012446 -0.002500 0.014946 +2 52231 202412 -0.040137 -0.045180 0.005043 +3 77900 202412 -0.001673 -0.005192 0.003519 +4 79903 202412 -0.007497 -0.007966 0.000470 +5 82156 202412 -0.007633 -0.002425 -0.005208 +6 86812 202412 -0.013185 -0.011868 -0.001317 +7 87043 202412 0.042220 0.040528 0.001692 +8 87404 202412 0.014837 0.019622 -0.004785 +9 89169 202412 0.016141 0.020761 -0.004619 +``` + +**Largest Differences**: +``` + permno yyyymm python stata diff +0 82848 201810 0.037143 
-0.085432 0.122575 +1 82848 201809 0.050950 -0.060400 0.111350 +2 82848 201901 0.033924 -0.073479 0.107403 +3 82848 201811 0.052141 -0.051648 0.103789 +4 82848 201905 0.037850 -0.065358 0.103208 +5 80577 201812 0.066467 -0.035637 0.102103 +6 80577 201810 0.081964 -0.019242 0.101206 +7 82848 201812 0.052138 -0.045694 0.097832 +8 80577 201809 0.086215 -0.011225 0.097440 +9 77393 201210 0.079592 0.174974 -0.095382 +``` + +**Largest Differences Before 1950**: +``` +No data before 1950 +``` + +--- + +### MomRev + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has -0.23% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 262,210 +- Python: 261,618 +- Common: 261,010 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 0.00e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 261010.0000 | 261010.0000 | 261010.0000 | 261010.0000 | +| mean | 0.5576 | 0.5576 | 0.0000 | 0.0000 | +| std | 0.4967 | 0.4967 | 0.0000 | 0.0000 | +| min | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 25% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 50% | 1.0000 | 1.0000 | 0.0000 | 0.0000 | +| 75% | 1.0000 | 1.0000 | 0.0000 | 0.0000 | +| max | 1.0000 | 1.0000 | 0.0000 | 0.0000 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 261,010 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -2.84e-13 | 9.00e-16 | -315.5839 | 0.000 | +| Slope | 1.0000 | 1.20e-15 | 8.30e+14 | 0.000 | + +**Feedback**: +- Num 
observations with std_diff >= TOL_DIFF_1: 0/261010 (0.000%) +- Stata standard deviation: 4.97e-01 + +--- + +### MomSeason + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +1.15% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,398,424 +- Python: 3,437,360 +- Common: 3,398,424 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.97e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.40e+06 | 3.40e+06 | 3.40e+06 | 3.40e+06 | +| mean | 0.0122 | 0.0122 | 4.37e-12 | 4.32e-11 | +| std | 0.1011 | 0.1011 | 4.16e-09 | 4.11e-08 | +| min | -0.9957 | -0.9957 | -3.50e-07 | -3.46e-06 | +| 25% | -0.0317 | -0.0317 | -1.00e-09 | -9.89e-09 | +| 50% | 0.0070 | 0.0070 | 0.0000 | 0.0000 | +| 75% | 0.0487 | 0.0487 | 1.00e-09 | 9.89e-09 | +| max | 15.9845 | 15.9845 | 6.00e-07 | 5.93e-06 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,398,424 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -4.34e-12 | 2.27e-12 | -1.9124 | 0.056 | +| Slope | 1.0000 | 2.23e-11 | 4.48e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/3398424 (0.000%) +- Stata standard deviation: 1.01e-01 + +--- + +### MomSeason06YrPlus + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +1.63% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED 
+- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,432,862 +- Python: 2,472,493 +- Common: 2,432,862 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 3.40e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.43e+06 | 2.43e+06 | 2.43e+06 | 2.43e+06 | +| mean | 0.0127 | 0.0127 | 5.42e-12 | 6.14e-11 | +| std | 0.0882 | 0.0882 | 4.07e-09 | 4.62e-08 | +| min | -0.9062 | -0.9062 | -3.00e-07 | -3.40e-06 | +| 25% | -0.0260 | -0.0260 | -1.00e-09 | -1.13e-08 | +| 50% | 0.0084 | 0.0084 | 0.0000 | 0.0000 | +| 75% | 0.0455 | 0.0455 | 1.00e-09 | 1.13e-08 | +| max | 6.7025 | 6.7025 | 3.67e-07 | 4.16e-06 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 2,432,862 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -6.09e-12 | 2.64e-12 | -2.3082 | 0.021 | +| Slope | 1.0000 | 2.96e-11 | 3.38e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/2432862 (0.000%) +- Stata standard deviation: 8.82e-02 + +--- + +### MomSeason11YrPlus + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +1.62% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,680,518 +- Python: 1,707,812 +- Common: 1,680,518 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 3.58e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference 
| Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.68e+06 | 1.68e+06 | 1.68e+06 | 1.68e+06 | +| mean | 0.0132 | 0.0132 | 5.62e-12 | 6.70e-11 | +| std | 0.0839 | 0.0839 | 3.93e-09 | 4.69e-08 | +| min | -0.9062 | -0.9062 | -2.00e-07 | -2.38e-06 | +| 25% | -0.0241 | -0.0241 | -1.00e-09 | -1.19e-08 | +| 50% | 0.0091 | 0.0091 | 0.0000 | 0.0000 | +| 75% | 0.0450 | 0.0450 | 1.00e-09 | 1.19e-08 | +| max | 3.7568 | 3.7568 | 2.00e-07 | 2.38e-06 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 1,680,518 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -5.15e-12 | 3.07e-12 | -1.6778 | 0.093 | +| Slope | 1.0000 | 3.61e-11 | 2.77e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/1680518 (0.000%) +- Stata standard deviation: 8.39e-02 + +--- + +### MomSeason16YrPlus + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +1.36% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,194,902 +- Python: 1,211,157 +- Common: 1,194,902 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 3.71e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.19e+06 | 1.19e+06 | 1.19e+06 | 1.19e+06 | +| mean | 0.0134 | 0.0134 | -4.78e-13 | -5.92e-12 | +| std | 0.0808 | 0.0808 | 3.79e-09 | 4.70e-08 | +| min | -0.9062 | -0.9062 | -1.00e-07 | -1.24e-06 | +| 25% | -0.0223 | -0.0223 | -1.00e-09 | 
-1.24e-08 | +| 50% | 0.0096 | 0.0096 | 0.0000 | 0.0000 | +| 75% | 0.0441 | 0.0441 | 1.00e-09 | 1.24e-08 | +| max | 3.7568 | 3.7568 | 2.00e-07 | 2.48e-06 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 1,194,902 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -1.00e-11 | 3.52e-12 | -2.8522 | 0.004 | +| Slope | 1.0000 | 4.29e-11 | 2.33e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/1194902 (0.000%) +- Stata standard deviation: 8.08e-02 + +--- + +### MomSeasonShort + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.33% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,718,320 +- Python: 3,730,640 +- Common: 3,718,320 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.73e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.72e+06 | 3.72e+06 | 3.72e+06 | 3.72e+06 | +| mean | 0.0117 | 0.0117 | 1.75e-14 | 1.01e-13 | +| std | 0.1730 | 0.1730 | 4.34e-09 | 2.51e-08 | +| min | -0.9957 | -0.9957 | -5.00e-07 | -2.89e-06 | +| 25% | -0.0633 | -0.0633 | -1.11e-16 | -6.42e-16 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 0.0690 | 0.0690 | 1.11e-16 | 6.42e-16 | +| max | 24.0000 | 24.0000 | 4.00e-07 | 2.31e-06 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,718,320 + +| Coefficient | Estimate | Std Error | 
t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -9.95e-12 | 2.25e-12 | -4.4149 | 0.000 | +| Slope | 1.0000 | 1.30e-11 | 7.69e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/3718320 (0.000%) +- Stata standard deviation: 1.73e-01 + +--- + +### MomVol + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.11% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,095,615 +- Python: 1,096,816 +- Common: 1,095,601 + +**Precision1**: 0.175% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 3.47e-01 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.10e+06 | 1.10e+06 | 1.10e+06 | 1.10e+06 | +| mean | 5.7085 | 5.7101 | 0.0016 | 5.63e-04 | +| std | 2.8802 | 2.8804 | 0.0418 | 0.0145 | +| min | 1.0000 | 1.0000 | -1.0000 | -0.3472 | +| 25% | 3.0000 | 3.0000 | 0.0000 | 0.0000 | +| 50% | 6.0000 | 6.0000 | 0.0000 | 0.0000 | +| 75% | 8.0000 | 8.0000 | 0.0000 | 0.0000 | +| max | 10.0000 | 10.0000 | 1.0000 | 0.3472 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0017 + 1.0000 * stata +- **R-squared**: 0.9998 +- **N observations**: 1,095,601 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 0.0017 | 8.86e-05 | 18.7027 | 0.000 | +| Slope | 1.0000 | 1.39e-05 | 72162.0290 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 1915/1095601 (0.175%) +- Stata standard deviation: 2.88e+00 + +**Most Recent Bad Observations**: +``` + permno yyyymm python 
stata diff +0 19495 202412 9.0 8.0 1.0 +1 22293 202412 8.0 7.0 1.0 +2 13586 202411 9.0 8.0 1.0 +3 34817 202411 4.0 3.0 1.0 +4 22752 202407 7.0 6.0 1.0 +5 12872 202406 9.0 8.0 1.0 +6 15059 202406 2.0 1.0 1.0 +7 21027 202406 3.0 2.0 1.0 +8 89394 202406 2.0 1.0 1.0 +9 17272 202404 10.0 9.0 1.0 +``` + +**Largest Differences**: +``` + permno yyyymm python stata diff +0 10006 194301 4.0 3.0 1.0 +1 10006 195503 8.0 7.0 1.0 +2 10014 196802 10.0 9.0 1.0 +3 10064 198901 4.0 3.0 1.0 +4 10066 198710 2.0 1.0 1.0 +5 10078 200911 3.0 2.0 1.0 +6 10102 193202 8.0 7.0 1.0 +7 10102 193210 7.0 6.0 1.0 +8 10104 199104 9.0 8.0 1.0 +9 10104 202009 5.0 4.0 1.0 +``` + +**Largest Differences Before 1950**: +``` + permno yyyymm python stata diff +0 10006 194301 4.0 3.0 1.0 +1 10102 193202 8.0 7.0 1.0 +2 10102 193210 7.0 6.0 1.0 +3 10145 193101 7.0 6.0 1.0 +4 10145 193311 10.0 9.0 1.0 +5 10153 194907 9.0 8.0 1.0 +6 10196 193108 4.0 3.0 1.0 +7 10196 193302 3.0 2.0 1.0 +8 10225 193306 4.0 3.0 1.0 +9 10233 192910 8.0 7.0 1.0 +``` + +--- + +### NOA + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.54% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,196,825 +- Python: 3,214,068 +- Common: 3,196,825 + +**Precision1**: 0.005% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.97e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.20e+06 | 3.20e+06 | 3.20e+06 | 3.20e+06 | +| mean | 0.5611 | 0.5611 | 9.15e-07 | 5.65e-07 | +| std | 1.6192 | 1.6192 | 0.0015 | 9.17e-04 | +| min | -498.1454 | -498.1455 | -0.3434 | -0.2121 | +| 25% | 0.3049 | 0.3049 | -1.98e-08 | -1.22e-08 | +| 50% | 0.5864 | 
0.5864 | 0.0000 | 0.0000 | +| 75% | 0.7647 | 0.7647 | 1.97e-08 | 1.22e-08 | +| max | 362.4152 | 362.4152 | 0.6600 | 0.4076 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,196,825 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 4.31e-06 | 8.79e-07 | 4.9076 | 0.000 | +| Slope | 1.0000 | 5.13e-07 | 1.95e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 156/3196825 (0.005%) +- Stata standard deviation: 1.62e+00 + +--- + +### NetDebtFinance + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.56% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,782,808 +- Python: 2,798,509 +- Common: 2,782,808 + +**Precision1**: 0.004% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.37e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.78e+06 | 2.78e+06 | 2.78e+06 | 2.78e+06 | +| mean | 0.0183 | 0.0183 | 4.46e-08 | 3.79e-07 | +| std | 0.1177 | 0.1177 | 1.78e-04 | 0.0015 | +| min | -0.9958 | -0.9958 | -0.0584 | -0.4958 | +| 25% | -0.0178 | -0.0178 | -4.01e-10 | -3.40e-09 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 0.0359 | 0.0359 | 3.96e-10 | 3.36e-09 | +| max | 0.9994 | 0.9994 | 0.0473 | 0.4016 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 2,782,808 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | 
+|-------------|--------------|--------------|-------------|----------| +| Intercept | -5.64e-08 | 1.08e-07 | -0.5234 | 0.601 | +| Slope | 1.0000 | 9.04e-07 | 1.11e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 120/2782808 (0.004%) +- Stata standard deviation: 1.18e-01 + +--- + +### NetDebtPrice + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.04% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,425,163 +- Python: 1,425,763 +- Common: 1,425,117 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.33e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.43e+06 | 1.43e+06 | 1.43e+06 | 1.43e+06 | +| mean | 1.0156 | 1.0156 | 6.45e-07 | 9.21e-08 | +| std | 7.0024 | 7.0024 | 2.04e-04 | 2.91e-05 | +| min | -185.6921 | -185.6921 | -0.0424 | -0.0061 | +| 25% | -0.0787 | -0.0787 | -8.38e-09 | -1.20e-09 | +| 50% | 0.2917 | 0.2917 | -1.33e-12 | -1.90e-13 | +| 75% | 0.9448 | 0.9448 | 8.32e-09 | 1.19e-09 | +| max | 2387.4009 | 2387.4008 | 0.0962 | 0.0137 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 1,425,117 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 6.95e-07 | 1.72e-07 | 4.0317 | 0.000 | +| Slope | 1.0000 | 2.44e-08 | 4.10e+07 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 4/1425117 (0.000%) +- Stata standard deviation: 7.00e+00 + +--- + +### NetEquityFinance + +**Status**: ✅ 
PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.54% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,874,470 +- Python: 2,889,868 +- Common: 2,874,470 + +**Precision1**: 0.002% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.16e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.87e+06 | 2.87e+06 | 2.87e+06 | 2.87e+06 | +| mean | 0.0161 | 0.0161 | -3.13e-08 | -2.26e-07 | +| std | 0.1385 | 0.1385 | 1.73e-04 | 0.0012 | +| min | -0.9976 | -0.9976 | -0.0583 | -0.4209 | +| 25% | -0.0217 | -0.0217 | -2.35e-10 | -1.70e-09 | +| 50% | -0.0014 | -0.0014 | 0.0000 | 0.0000 | +| 75% | 0.0046 | 0.0046 | 2.30e-10 | 1.66e-09 | +| max | 0.9999 | 0.9999 | 0.0482 | 0.3482 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 2,874,470 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -4.98e-08 | 1.03e-07 | -0.4842 | 0.628 | +| Slope | 1.0000 | 7.37e-07 | 1.36e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 60/2874470 (0.002%) +- Stata standard deviation: 1.39e-01 + +--- + +### NetPayoutYield + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.08% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,817,567 +- Python: 1,819,089 +- Common: 1,817,494 + +**Precision1**: 0.000% obs with 
std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.56e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.82e+06 | 1.82e+06 | 1.82e+06 | 1.82e+06 | +| mean | -0.0082 | -0.0082 | 1.99e-08 | 3.69e-08 | +| std | 0.5404 | 0.5404 | 1.35e-05 | 2.50e-05 | +| min | -589.1729 | -589.1729 | -5.92e-06 | -1.10e-05 | +| 25% | -0.0054 | -0.0054 | -4.40e-10 | -8.14e-10 | +| 50% | 0.0107 | 0.0107 | 2.48e-14 | 4.59e-14 | +| 75% | 0.0389 | 0.0389 | 4.43e-10 | 8.20e-10 | +| max | 19.2230 | 19.2230 | 0.0102 | 0.0189 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 1,817,494 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 2.00e-08 | 1.00e-08 | 2.0017 | 0.045 | +| Slope | 1.0000 | 1.85e-08 | 5.40e+07 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 4/1817494 (0.000%) +- Stata standard deviation: 5.40e-01 + +--- + +### NumEarnIncrease + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,823,456 +- Python: 2,823,464 +- Common: 2,823,455 + +**Precision1**: 0.013% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 0.00e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.82e+06 | 2.82e+06 | 
2.82e+06 | 2.82e+06 | +| mean | 1.2268 | 1.2267 | -5.28e-05 | -2.74e-05 | +| std | 1.9293 | 1.9293 | 0.0364 | 0.0189 | +| min | 0.0000 | 0.0000 | -8.0000 | -4.1465 | +| 25% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 2.0000 | 2.0000 | 0.0000 | 0.0000 | +| max | 8.0000 | 8.0000 | 8.0000 | 4.1465 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0002 + 0.9998 * stata +- **R-squared**: 0.9996 +- **N observations**: 2,823,455 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 2.00e-04 | 2.57e-05 | 7.7897 | 0.000 | +| Slope | 0.9998 | 1.12e-05 | 89096.4323 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 378/2823455 (0.013%) +- Stata standard deviation: 1.93e+00 + +--- + +### OPLeverage + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.06% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,607,726 +- Python: 3,609,982 +- Common: 3,607,726 + +**Precision1**: 0.009% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.92e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.61e+06 | 3.61e+06 | 3.61e+06 | 3.61e+06 | +| mean | 0.9531 | 0.9531 | -2.90e-06 | -2.41e-06 | +| std | 1.2034 | 1.2034 | 0.0017 | 0.0014 | +| min | -0.7604 | -0.7604 | -0.5914 | -0.4914 | +| 25% | 0.2942 | 0.2942 | -1.23e-08 | -1.03e-08 | +| 50% | 0.7583 | 0.7583 | -6.77e-14 | -5.62e-14 | +| 75% | 1.3112 | 1.3112 | 1.21e-08 | 1.01e-08 | +| max | 218.0000 | 218.0000 | 0.3060 | 0.2543 | + +**Regression 
Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,607,726 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.91e-06 | 1.14e-06 | 1.6769 | 0.094 | +| Slope | 1.0000 | 7.42e-07 | 1.35e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 324/3607726 (0.009%) +- Stata standard deviation: 1.20e+00 + +--- + +### OScore + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.02% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,197,546 +- Python: 1,197,776 +- Common: 1,196,849 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 0.00e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.20e+06 | 1.20e+06 | 1.20e+06 | 1.20e+06 | +| mean | 0.1247 | 0.1247 | 0.0000 | 0.0000 | +| std | 0.3304 | 0.3304 | 0.0000 | 0.0000 | +| min | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 25% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| max | 1.0000 | 1.0000 | 0.0000 | 0.0000 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 1,196,849 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -4.80e-13 | 1.18e-15 | -405.1335 | 0.000 | +| Slope | 1.0000 | 3.35e-15 | 2.98e+14 | 0.000 | + +**Feedback**: 
+- Num observations with std_diff >= TOL_DIFF_1: 0/1196849 (0.000%) +- Stata standard deviation: 3.30e-01 + +--- + +### OperProf + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.01% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,407,636 +- Python: 1,407,752 +- Common: 1,407,593 + +**Precision1**: 0.002% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 8.11e-08 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.41e+06 | 1.41e+06 | 1.41e+06 | 1.41e+06 | +| mean | 0.1773 | 0.1773 | 1.49e-05 | 9.20e-07 | +| std | 16.1890 | 16.1890 | 0.0034 | 2.10e-04 | +| min | -5640.1177 | -5640.1176 | -0.0947 | -0.0059 | +| 25% | 0.1291 | 0.1291 | -5.13e-09 | -3.17e-10 | +| 50% | 0.2478 | 0.2478 | 0.0000 | 0.0000 | +| 75% | 0.3634 | 0.3634 | 5.15e-09 | 3.18e-10 | +| max | 861.0132 | 861.0132 | 1.0978 | 0.0678 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 1,407,593 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.51e-05 | 2.87e-06 | 5.2814 | 0.000 | +| Slope | 1.0000 | 1.77e-07 | 5.65e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 25/1407593 (0.002%) +- Stata standard deviation: 1.62e+01 + +--- + +### OperProfRD + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has -0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- 
Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,097,471 +- Python: 2,097,467 +- Common: 2,097,421 + +**Precision1**: 0.006% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.94e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.10e+06 | 2.10e+06 | 2.10e+06 | 2.10e+06 | +| mean | 0.1110 | 0.1110 | -1.75e-08 | -6.39e-08 | +| std | 0.2734 | 0.2734 | 2.24e-04 | 8.19e-04 | +| min | -85.0000 | -85.0000 | -0.0477 | -0.1743 | +| 25% | 0.0650 | 0.0650 | -2.97e-09 | -1.09e-08 | +| 50% | 0.1351 | 0.1351 | 3.91e-12 | 1.43e-11 | +| 75% | 0.2031 | 0.2031 | 3.00e-09 | 1.10e-08 | +| max | 47.9633 | 47.9633 | 0.0637 | 0.2328 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 2,097,421 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.65e-07 | 1.67e-07 | 0.9885 | 0.323 | +| Slope | 1.0000 | 5.66e-07 | 1.77e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 116/2097421 (0.006%) +- Stata standard deviation: 2.73e-01 + +--- + +### OptionVolume1 + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ❌ FAILED (Python missing 86188 Stata observations) +- Test 2 - NumRows check: ✅ PASSED (Python has -8.94% rows vs Stata) +- Test 3 - Precision1 check: ❌ FAILED +- Test 4 - Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 855,113 +- Python: 778,681 +- Common: 768,925 + +**Precision1**: 94.227% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 7.83e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic 
| Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 768925.0000 | 768925.0000 | 768925.0000 | 768925.0000 | +| mean | 883.5215 | 322.4275 | -561.0940 | -0.3086 | +| std | 1818.1984 | 733.0803 | 1218.1822 | 0.6700 | +| min | 0.0000 | 0.0027 | -200679.7756 | -110.3729 | +| 25% | 141.1318 | 29.6563 | -613.3719 | -0.3374 | +| 50% | 393.9321 | 112.3404 | -250.2070 | -0.1376 | +| 75% | 975.0862 | 342.0078 | -90.1569 | -0.0496 | +| max | 223027.0200 | 219859.3074 | 17753.9114 | 9.7646 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 7.1559 + 0.3568 * stata +- **R-squared**: 0.7833 +- **N observations**: 768,925 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 7.1559 | 0.4327 | 16.5375 | 0.000 | +| Slope | 0.3568 | 2.14e-04 | 1667.0383 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 724537/768925 (94.227%) +- Stata standard deviation: 1.82e+03 + +**Most Recent Bad Observations**: +``` + permno yyyymm python stata diff +0 10026 202308 46.812612 76.414703 -29.602091 +1 10028 202308 8.890018 85.090172 -76.200154 +2 10032 202308 5.707763 49.467274 -43.759511 +3 10066 202308 3082.926829 6985.365700 -3902.438871 +4 10104 202308 2565.689584 7407.254900 -4841.565316 +5 10107 202308 5442.418572 16818.195000 -11375.776428 +6 10138 202308 614.449467 1514.908300 -900.458833 +7 10145 202308 1506.068466 2873.675800 -1367.607334 +8 10158 202308 151.885034 445.312960 -293.427926 +9 10200 202308 107.684585 390.085940 -282.401355 +``` + +**Largest Differences**: +``` + permno yyyymm python stata diff +0 89627 200901 293.604431 200973.380 -200679.775569 +1 91068 201408 12716.909722 106741.660 -94024.750278 +2 84788 202109 29265.285925 123144.680 -93879.394075 +3 84788 202111 27465.128876 115090.130 -87625.001124 +4 84788 202110 26998.250169 
112166.110 -85167.859831 +5 84788 202106 27914.822181 108274.760 -80359.937819 +6 84788 202107 26120.508908 105506.180 -79385.671092 +7 84788 202108 25946.857222 105126.660 -79179.802778 +8 84788 202112 26322.997150 105262.390 -78939.392850 +9 84788 202104 25508.049163 94597.203 -69089.153837 +``` + +**Largest Differences Before 1950**: +``` +No data before 1950 +``` + +--- + +### OptionVolume2 + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ❌ FAILED (Python missing 87091 Stata observations) +- Test 2 - NumRows check: ✅ PASSED (Python has -9.18% rows vs Stata) +- Test 3 - Precision1 check: ❌ FAILED +- Test 4 - Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 843,512 +- Python: 766,055 +- Common: 756,421 + +**Precision1**: 93.913% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.93e+01 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 756421.0000 | 756421.0000 | 756421.0000 | 756421.0000 | +| mean | 1.1995 | 1.4829 | 0.2834 | 0.1098 | +| std | 2.5826 | 11.6045 | 11.3561 | 4.3972 | +| min | 0.0000 | 2.97e-06 | -1049.1393 | -406.2357 | +| 25% | 0.5649 | 0.4112 | -0.2821 | -0.1092 | +| 50% | 0.9008 | 0.8140 | -0.0334 | -0.0129 | +| 75% | 1.3604 | 1.4229 | 0.2714 | 0.1051 | +| max | 1049.6909 | 7072.6796 | 7050.9285 | 2730.1796 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.3705 + 0.9275 * stata +- **R-squared**: 0.0426 +- **N observations**: 756,421 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 0.3705 | 0.0144 | 25.7364 | 0.000 | +| Slope | 0.9275 | 0.0051 | 183.4665 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 
710376/756421 (93.913%) +- Stata standard deviation: 2.58e+00 + +**Most Recent Bad Observations**: +``` + permno yyyymm python stata diff +0 10026 202308 2.279692 0.784778 1.494914 +1 10028 202308 0.481896 0.791394 -0.309499 +2 10032 202308 0.199350 0.505502 -0.306153 +3 10066 202308 0.654055 1.027572 -0.373517 +4 10104 202308 1.096916 1.236225 -0.139309 +5 10145 202308 2.238413 1.542941 0.695471 +6 10158 202308 2.489557 2.701363 -0.211806 +7 10200 202308 0.582056 0.936441 -0.354385 +8 10220 202308 0.462774 1.884915 -1.422141 +9 10257 202308 0.714418 0.818952 -0.104534 +``` + +**Largest Differences**: +``` + permno yyyymm python stata diff +0 31325 200702 7072.679560 21.751108 7050.928452 +1 91184 201703 2760.485638 6.929793 2753.555844 +2 19402 202211 2495.881829 178.229480 2317.652349 +3 87601 200910 1847.323494 7.190881 1840.132613 +4 83946 200502 1547.964602 14.013381 1533.951221 +5 91154 201009 1271.031239 4.160315 1266.870924 +6 92919 202308 1283.909067 225.710010 1058.199057 +7 12356 201109 0.551604 1049.690900 -1049.139296 +8 13652 201408 976.580283 11.884981 964.695302 +9 31974 201212 846.547702 0.697687 845.850015 +``` + +**Largest Differences Before 1950**: +``` +No data before 1950 +``` + +--- + +### OrderBacklog + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.52% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 634,164 +- Python: 637,473 +- Common: 634,164 + +**Precision1**: 0.002% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.89e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 634164.0000 | 634164.0000 | 634164.0000 | 634164.0000 | +| 
mean | 0.5886 | 0.5886 | 5.24e-07 | 4.51e-07 | +| std | 1.1620 | 1.1620 | 1.21e-04 | 1.04e-04 | +| min | 9.96e-06 | 9.96e-06 | -2.71e-06 | -2.33e-06 | +| 25% | 0.1418 | 0.1418 | -5.76e-09 | -4.96e-09 | +| 50% | 0.3146 | 0.3146 | 8.80e-12 | 7.57e-12 | +| 75% | 0.6607 | 0.6607 | 5.88e-09 | 5.06e-09 | +| max | 102.1352 | 102.1352 | 0.0277 | 0.0238 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 634,164 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 6.20e-07 | 1.70e-07 | 3.6570 | 0.000 | +| Slope | 1.0000 | 1.30e-07 | 7.68e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 12/634164 (0.002%) +- Stata standard deviation: 1.16e+00 + +--- + +### OrderBacklogChg + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.87% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 564,785 +- Python: 569,697 +- Common: 564,785 + +**Precision1**: 0.003% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 6.03e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 564785.0000 | 564785.0000 | 564785.0000 | 564785.0000 | +| mean | -0.0077 | -0.0077 | 2.74e-07 | 3.71e-07 | +| std | 0.7388 | 0.7388 | 1.74e-04 | 2.36e-04 | +| min | -98.8127 | -98.8127 | -0.0444 | -0.0601 | +| 25% | -0.0733 | -0.0733 | -7.55e-09 | -1.02e-08 | +| 50% | -0.0035 | -0.0035 | -9.05e-12 | -1.22e-11 | +| 75% | 0.0562 | 0.0562 | 7.51e-09 | 1.02e-08 | +| max | 73.7469 | 73.7469 | 0.0277 | 0.0375 | + 
+**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 564,785 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 2.77e-07 | 2.32e-07 | 1.1959 | 0.232 | +| Slope | 1.0000 | 3.13e-07 | 3.19e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 16/564785 (0.003%) +- Stata standard deviation: 7.39e-01 + +--- + +### OrgCap + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.04% rows vs Stata) +- Test 3 - Precision1 check: ❌ FAILED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,243,383 +- Python: 1,243,846 +- Common: 1,243,383 + +**Precision1**: 1.232% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 7.73e-02 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.24e+06 | 1.24e+06 | 1.24e+06 | 1.24e+06 | +| mean | 2.37e-10 | -3.12e-04 | -3.12e-04 | -3.14e-04 | +| std | 0.9941 | 0.9937 | 0.0058 | 0.0058 | +| min | -2.3446 | -2.3446 | -0.3658 | -0.3680 | +| 25% | -0.6402 | -0.6404 | -7.50e-08 | -7.55e-08 | +| 50% | -0.2736 | -0.2739 | -8.13e-09 | -8.18e-09 | +| 75% | 0.3358 | 0.3354 | 4.44e-08 | 4.47e-08 | +| max | 10.1323 | 10.1323 | 0.3632 | 0.3654 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0003 + 0.9996 * stata +- **R-squared**: 1.0000 +- **N observations**: 1,243,383 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -3.12e-04 | 5.19e-06 | -60.1990 | 0.000 | +| Slope | 0.9996 | 
5.22e-06 | 191506.4118 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 15318/1243383 (1.232%) +- Stata standard deviation: 9.94e-01 + +**Most Recent Bad Observations**: +``` + permno yyyymm python stata diff +0 12369 202412 -0.712835 -0.637767 -0.075068 +1 13103 202412 -0.661869 -0.593880 -0.067989 +2 14076 202412 0.277988 0.215437 0.062550 +3 14185 202412 -0.039569 -0.058013 0.018444 +4 14987 202412 -0.583248 -0.526179 -0.057069 +5 15113 202412 -0.622727 -0.569727 -0.053000 +6 16655 202412 -0.064895 -0.079821 0.014927 +7 16696 202412 0.351770 0.278972 0.072798 +8 18936 202412 -0.813204 -0.724195 -0.089008 +9 20120 202412 -0.394750 -0.363862 -0.030888 +``` + +**Largest Differences**: +``` + permno yyyymm python stata diff +0 86536 202410 5.545956 5.911780 -0.365824 +1 86536 202411 5.545956 5.911780 -0.365824 +2 86536 202407 5.548397 5.912627 -0.364230 +3 86536 202408 5.548397 5.912627 -0.364230 +4 86536 202409 5.548397 5.912627 -0.364230 +5 86536 202406 5.549277 5.912725 -0.363448 +6 91826 202410 2.451869 2.088624 0.363245 +7 91826 202411 2.451869 2.088624 0.363245 +8 91826 202407 2.450749 2.088611 0.362138 +9 91826 202408 2.450749 2.088611 0.362138 +``` + +**Largest Differences Before 1950**: +``` +No data before 1950 +``` + +--- + +### PS + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.06% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 463,944 +- Python: 464,239 +- Common: 463,941 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 0.00e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 463941.0000 | 
463941.0000 | 463941.0000 | 463941.0000 | +| mean | 5.0197 | 5.0197 | 0.0000 | 0.0000 | +| std | 1.6958 | 1.6958 | 0.0000 | 0.0000 | +| min | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 25% | 4.0000 | 4.0000 | 0.0000 | 0.0000 | +| 50% | 5.0000 | 5.0000 | 0.0000 | 0.0000 | +| 75% | 6.0000 | 6.0000 | 0.0000 | 0.0000 | +| max | 9.0000 | 9.0000 | 0.0000 | 0.0000 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 463,941 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -7.68e-13 | 4.19e-15 | -183.1897 | 0.000 | +| Slope | 1.0000 | 7.91e-16 | 1.26e+15 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/463941 (0.000%) +- Stata standard deviation: 1.70e+00 + +--- + +### PatentsRD + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.61% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 671,832 +- Python: 675,924 +- Common: 671,580 + +**Precision1**: 0.055% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 0.00e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 671580.0000 | 671580.0000 | 671580.0000 | 671580.0000 | +| mean | 0.2099 | 0.2104 | 5.54e-04 | 0.0014 | +| std | 0.4072 | 0.4076 | 0.0235 | 0.0578 | +| min | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 25% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| max | 1.0000 | 1.0000 | 1.0000 | 2.4557 | + +**Regression Analysis** 
(Python ~ Stata): + +- **Model**: python = 0.0007 + 0.9993 * stata +- **R-squared**: 0.9967 +- **N observations**: 671,580 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 7.01e-04 | 3.23e-05 | 21.7059 | 0.000 | +| Slope | 0.9993 | 7.05e-05 | 14174.4527 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 372/671580 (0.055%) +- Stata standard deviation: 4.07e-01 + +--- + +### PayoutYield + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.02% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,419,344 +- Python: 1,419,574 +- Common: 1,419,344 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.75e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.42e+06 | 1.42e+06 | 1.42e+06 | 1.42e+06 | +| mean | 0.1105 | 0.1105 | 2.55e-08 | 5.36e-08 | +| std | 0.4766 | 0.4766 | 1.53e-05 | 3.20e-05 | +| min | 5.47e-08 | 5.47e-08 | -1.99e-06 | -4.17e-06 | +| 25% | 0.0153 | 0.0153 | -6.85e-10 | -1.44e-09 | +| 50% | 0.0377 | 0.0377 | -1.03e-14 | -2.16e-14 | +| 75% | 0.0825 | 0.0825 | 6.82e-10 | 1.43e-09 | +| max | 204.5146 | 204.5146 | 0.0102 | 0.0215 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 1,419,344 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 2.66e-08 | 1.32e-08 | 2.0239 | 0.043 | +| Slope | 1.0000 | 2.69e-08 | 3.72e+07 
| 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 4/1419344 (0.000%) +- Stata standard deviation: 4.77e-01 + +--- + +### PctAcc + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.19% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,174,456 +- Python: 3,180,606 +- Common: 3,174,456 + +**Precision1**: 0.001% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 9.16e-08 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.17e+06 | 3.17e+06 | 3.17e+06 | 3.17e+06 | +| mean | -1.6989 | -1.6989 | -3.49e-05 | -3.63e-07 | +| std | 95.9587 | 95.9587 | 0.0224 | 2.33e-04 | +| min | -22400.0000 | -22400.0000 | -10.0625 | -0.1049 | +| 25% | -1.2865 | -1.2865 | -1.59e-08 | -1.66e-10 | +| 50% | -0.3594 | -0.3594 | 0.0000 | 0.0000 | +| 75% | 0.4757 | 0.4757 | 1.60e-08 | 1.66e-10 | +| max | 14452.5710 | 14452.5714 | 4.2589 | 0.0444 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,174,456 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -3.49e-05 | 1.26e-05 | -2.7798 | 0.005 | +| Slope | 1.0000 | 1.31e-07 | 7.64e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 36/3174456 (0.001%) +- Stata standard deviation: 9.60e+01 + +--- + +### PctTotAcc + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.09% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- 
Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,412,359 +- Python: 2,414,639 +- Common: 2,412,359 + +**Precision1**: 0.001% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 7.53e-08 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.41e+06 | 2.41e+06 | 2.41e+06 | 2.41e+06 | +| mean | 2.5870 | 2.5870 | -1.01e-05 | -7.06e-08 | +| std | 142.8889 | 142.8889 | 0.0124 | 8.70e-05 | +| min | -11719.7500 | -11719.7500 | -2.8089 | -0.0197 | +| 25% | -0.6329 | -0.6330 | -1.78e-08 | -1.24e-10 | +| 50% | 0.4821 | 0.4820 | 0.0000 | 0.0000 | +| 75% | 1.2312 | 1.2311 | 1.77e-08 | 1.24e-10 | +| max | 49788.3980 | 49788.4000 | 4.4289 | 0.0310 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 2,412,359 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -1.02e-05 | 8.01e-06 | -1.2683 | 0.205 | +| Slope | 1.0000 | 5.60e-08 | 1.78e+07 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 24/2412359 (0.001%) +- Stata standard deviation: 1.43e+02 + +--- + +### PredictedFE + +**Status**: ✅ PASSED (with override) + +**Override Applied**: +- Reviewed on: 2025-08-13 +- Reviewed by: ac +- Details: The standardized deviation is on average 1% with a sd of 7 pp. So it's above the threshold, but it's small. Sumstats and regressions show that the replication works very well. Regressing python on stata shows that the coefficient is 0.9959 and the Rsq is 0.995. This is a complicated file, so it makes sense that there will be some deviations later in the code, which is where PredictedFE is created. 
+ +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ❌ FAILED (Python has +29.22% rows vs Stata) +- Test 3 - Precision1 check: ❌ FAILED +- Test 4 - Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 491,508 +- Python: 635,124 +- Common: 490,188 + +**Precision1**: 85.268% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 3.14e-01 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 490188.0000 | 490188.0000 | 490188.0000 | 490188.0000 | +| mean | 0.0519 | 0.0523 | 4.00e-04 | 0.0126 | +| std | 0.0316 | 0.0316 | 0.0022 | 0.0695 | +| min | -0.1080 | -0.1098 | -0.0430 | -1.3585 | +| 25% | 0.0308 | 0.0310 | -8.23e-04 | -0.0260 | +| 50% | 0.0476 | 0.0480 | 3.15e-04 | 0.0100 | +| 75% | 0.0681 | 0.0685 | 0.0016 | 0.0519 | +| max | 0.2809 | 0.2700 | 0.0289 | 0.9139 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0006 + 0.9959 * stata +- **R-squared**: 0.9952 +- **N observations**: 490,188 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 6.10e-04 | 6.02e-06 | 101.2931 | 0.000 | +| Slope | 0.9959 | 9.91e-05 | 10051.6551 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 417972/490188 (85.268%) +- Stata standard deviation: 3.16e-02 + +**Most Recent Bad Observations**: +``` + permno yyyymm python stata diff +0 10107 202505 0.083963 0.078498 0.005465 +1 10145 202505 0.044157 0.040432 0.003725 +2 10200 202505 0.115949 0.113642 0.002307 +3 10397 202505 0.049261 0.049793 -0.000531 +4 10606 202505 0.046297 0.044046 0.002251 +5 10693 202505 0.042425 0.036774 0.005651 +6 10696 202505 0.107327 0.105464 0.001862 +7 11308 202505 0.073800 0.072780 
0.001020 +8 11403 202505 0.095454 0.090274 0.005179 +9 11547 202505 0.080375 0.076702 0.003674 +``` + +**Largest Differences**: +``` + permno yyyymm python stata diff +0 91575 202106 0.008267 0.051235 -0.042968 +1 91575 202107 0.008267 0.051235 -0.042968 +2 91575 202108 0.008267 0.051235 -0.042968 +3 91575 202109 0.008267 0.051235 -0.042968 +4 91575 202110 0.008267 0.051235 -0.042968 +5 91575 202111 0.008267 0.051235 -0.042968 +6 91575 202112 0.008267 0.051235 -0.042968 +7 91575 202201 0.008267 0.051235 -0.042968 +8 91575 202202 0.008267 0.051235 -0.042968 +9 91575 202203 0.008267 0.051235 -0.042968 +``` + +**Largest Differences Before 1950**: +``` +No data before 1950 +``` + +--- + +### Price + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 4,029,252 +- Python: 4,029,252 +- Common: 4,029,252 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.97e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 4.03e+06 | 4.03e+06 | 4.03e+06 | 4.03e+06 | +| mean | 2.3896 | 2.3896 | -5.50e-09 | -4.14e-09 | +| std | 1.3279 | 1.3279 | 7.22e-08 | 5.43e-08 | +| min | -4.8536 | -4.8536 | -9.12e-07 | -6.87e-07 | +| 25% | 1.6214 | 1.6214 | -4.87e-08 | -3.67e-08 | +| 50% | 2.5887 | 2.5887 | -3.86e-09 | -2.91e-09 | +| 75% | 3.3080 | 3.3080 | 4.02e-08 | 3.03e-08 | +| max | 13.4926 | 13.4926 | 8.70e-07 | 6.55e-07 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 4,029,252 + +| Coefficient | Estimate | Std Error | 
t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -3.19e-09 | 7.40e-11 | -43.0915 | 0.000 | +| Slope | 1.0000 | 2.71e-11 | 3.69e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/4029252 (0.000%) +- Stata standard deviation: 1.33e+00 + +--- + +### PriceDelayRsq + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.14% rows vs Stata) +- Test 3 - Precision1 check: ❌ FAILED +- Test 4 - Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 4,630,424 +- Python: 4,636,840 +- Common: 4,630,424 + +**Precision1**: 1.210% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.94e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 4.63e+06 | 4.63e+06 | 4.63e+06 | 4.63e+06 | +| mean | 0.3626 | 0.3636 | 0.0011 | 0.0032 | +| std | 0.3266 | 0.3273 | 0.0385 | 0.1179 | +| min | 5.86e-06 | 5.86e-06 | -0.9410 | -2.8811 | +| 25% | 0.0727 | 0.0727 | -7.10e-09 | -2.17e-08 | +| 50% | 0.2485 | 0.2494 | -1.51e-11 | -4.61e-11 | +| 75% | 0.6262 | 0.6293 | 6.72e-09 | 2.06e-08 | +| max | 1.0000 | 1.0000 | 0.9574 | 2.9313 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0028 + 0.9952 * stata +- **R-squared**: 0.9862 +- **N observations**: 4,630,424 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 0.0028 | 2.67e-05 | 104.3798 | 0.000 | +| Slope | 0.9952 | 5.48e-05 | 18174.8456 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 56043/4630424 (1.210%) +- Stata standard deviation: 3.27e-01 + +**Most Recent Bad Observations**: 
+``` + permno yyyymm python stata diff +0 19066 202407 0.504903 0.510167 -0.005264 +1 22546 202407 0.824044 0.829349 -0.005306 +2 77202 202407 0.727244 0.723967 0.003277 +3 20665 202406 0.885442 0.656071 0.229371 +4 20665 202405 0.885442 0.656071 0.229371 +5 20665 202404 0.885442 0.656071 0.229371 +6 20665 202403 0.885442 0.656071 0.229371 +7 20665 202402 0.885442 0.656071 0.229371 +8 20665 202401 0.885442 0.656071 0.229371 +9 20665 202312 0.885442 0.656071 0.229371 +``` + +**Largest Differences**: +``` + permno yyyymm python stata diff +0 10066 199007 0.990682 0.033295 0.957386 +1 10066 199008 0.990682 0.033295 0.957386 +2 10066 199009 0.990682 0.033295 0.957386 +3 10066 199010 0.990682 0.033295 0.957386 +4 10066 199011 0.990682 0.033295 0.957386 +5 10066 199012 0.990682 0.033295 0.957386 +6 10066 199101 0.990682 0.033295 0.957386 +7 10066 199102 0.990682 0.033295 0.957386 +8 10066 199103 0.990682 0.033295 0.957386 +9 10066 199104 0.990682 0.033295 0.957386 +``` + +**Largest Differences Before 1950**: +``` + permno yyyymm python stata diff +0 17283 193007 0.995758 0.236111 0.759647 +1 17283 193008 0.995758 0.236111 0.759647 +2 17283 193009 0.995758 0.236111 0.759647 +3 17283 193010 0.995758 0.236111 0.759647 +4 17283 193011 0.995758 0.236111 0.759647 +5 17283 193012 0.995758 0.236111 0.759647 +6 17283 193101 0.995758 0.236111 0.759647 +7 17283 193102 0.995758 0.236111 0.759647 +8 17283 193103 0.995758 0.236111 0.759647 +9 17283 193104 0.995758 0.236111 0.759647 +``` + +--- + +### PriceDelaySlope + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.14% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 4,630,424 +- Python: 4,636,840 +- Common: 4,630,424 + +**Precision1**: 0.584% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 7.00e-02 
(tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 4.63e+06 | 4.63e+06 | 4.63e+06 | 4.63e+06 | +| mean | -0.1887 | -0.2305 | -0.0418 | -1.66e-04 | +| std | 251.9312 | 250.4587 | 29.1253 | 0.1156 | +| min | -85166.8980 | -85251.4764 | -16276.3988 | -64.6065 | +| 25% | -0.2251 | -0.2250 | -6.16e-08 | -2.44e-10 | +| 50% | 0.4319 | 0.4334 | -1.48e-09 | -5.86e-12 | +| 75% | 1.2235 | 1.2261 | 4.94e-08 | 1.96e-10 | +| max | 60258.0310 | 60248.4687 | 6282.3819 | 24.9369 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0442 + 0.9875 * stata +- **R-squared**: 0.9866 +- **N observations**: 4,630,424 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -0.0442 | 0.0135 | -3.2835 | 0.001 | +| Slope | 0.9875 | 5.34e-05 | 18488.9200 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 27056/4630424 (0.584%) +- Stata standard deviation: 2.52e+02 + +--- + +### PriceDelayTstat + +**Status**: ✅ PASSED (with override) + +**Override Applied**: +- Reviewed on: 2025-08-14 +- Reviewed by: ac +- Details: There was a bug with the Stata code's winsorization. No way to replicate this. There's also a typo in the Stata formula for this. 
See https://github.com/OpenSourceAP/CrossSection/issues/177 + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +2.50% rows vs Stata) +- Test 3 - Precision1 check: ❌ FAILED +- Test 4 - Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 4,523,656 +- Python: 4,636,840 +- Common: 4,523,656 + +**Precision1**: 98.942% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.14e+02 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 4.52e+06 | 4.52e+06 | 4.52e+06 | 4.52e+06 | +| mean | 1.6229 | -0.2347 | -1.8575 | -1.3425 | +| std | 1.3836 | 424.9721 | 424.9748 | 307.1411 | +| min | -5.3533 | -229867.1486 | -229870.3812 | -166133.6866 | +| 25% | 0.8336 | -0.1933 | -1.5821 | -1.1434 | +| 50% | 1.6661 | 0.4442 | -0.6690 | -0.4835 | +| 75% | 2.4069 | 1.2248 | -0.1619 | -0.1170 | +| max | 7.5741 | 47949.6348 | 47947.1826 | 34652.7559 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0705 + -0.1012 * stata +- **R-squared**: 0.0000 +- **N observations**: 4,523,656 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -0.0705 | 0.3080 | -0.2289 | 0.819 | +| Slope | -0.1012 | 0.1444 | -0.7005 | 0.484 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 4475798/4523656 (98.942%) +- Stata standard deviation: 1.38e+00 + +**Most Recent Bad Observations**: +``` + permno yyyymm python stata diff +0 10026 202407 1.243024 1.904548 -0.661524 +1 10028 202407 -0.178130 -0.614981 0.436851 +2 10032 202407 -1.887050 5.075211 -6.962260 +3 10044 202407 1.177566 2.110964 -0.933397 +4 10065 202407 0.374065 2.339563 -1.965497 +5 10066 202407 0.583050 1.005340 
-0.422290 +6 10104 202407 0.744722 2.798215 -2.053493 +7 10107 202407 0.113950 1.981288 -1.867338 +8 10113 202407 0.170261 3.936723 -3.766462 +9 10138 202407 0.535726 1.885572 -1.349847 +``` + +**Largest Differences**: +``` + permno yyyymm python stata diff +0 22356 202207 -229867.148626 3.232525 -229870.381152 +1 22356 202208 -229867.148626 3.232525 -229870.381152 +2 22356 202209 -229867.148626 3.232525 -229870.381152 +3 22356 202210 -229867.148626 3.232525 -229870.381152 +4 22356 202211 -229867.148626 3.232525 -229870.381152 +5 22356 202212 -229867.148626 3.232525 -229870.381152 +6 22356 202301 -229867.148626 3.232525 -229870.381152 +7 22356 202302 -229867.148626 3.232525 -229870.381152 +8 22356 202303 -229867.148626 3.232525 -229870.381152 +9 22356 202304 -229867.148626 3.232525 -229870.381152 +``` + +**Largest Differences Before 1950**: +``` + permno yyyymm python stata diff +0 21805 194507 3749.878542 3.33705 3746.541492 +1 21805 194508 3749.878542 3.33705 3746.541492 +2 21805 194509 3749.878542 3.33705 3746.541492 +3 21805 194510 3749.878542 3.33705 3746.541492 +4 21805 194511 3749.878542 3.33705 3746.541492 +5 21805 194512 3749.878542 3.33705 3746.541492 +6 21805 194601 3749.878542 3.33705 3746.541492 +7 21805 194602 3749.878542 3.33705 3746.541492 +8 21805 194603 3749.878542 3.33705 3746.541492 +9 21805 194604 3749.878542 3.33705 3746.541492 +``` + +--- + +### ProbInformedTrading + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 24,028 +- Python: 24,028 +- Common: 24,028 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 5.99e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std 
Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 24028.0000 | 24028.0000 | 24028.0000 | 24028.0000 | +| mean | 0.1930 | 0.1930 | 2.66e-10 | 3.99e-09 | +| std | 0.0667 | 0.0667 | 1.12e-08 | 1.68e-07 | +| min | 0.0191 | 0.0191 | -4.00e-08 | -5.99e-07 | +| 25% | 0.1469 | 0.1469 | 0.0000 | 0.0000 | +| 50% | 0.1935 | 0.1935 | 0.0000 | 0.0000 | +| 75% | 0.2405 | 0.2405 | 0.0000 | 0.0000 | +| max | 0.4767 | 0.4767 | 5.00e-08 | 7.49e-07 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 24,028 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -6.59e-10 | 2.21e-10 | -2.9775 | 0.003 | +| Slope | 1.0000 | 1.08e-09 | 9.23e+08 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/24028 (0.000%) +- Stata standard deviation: 6.67e-02 + +--- + +### RD + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,419,136 +- Python: 1,419,166 +- Common: 1,419,136 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 9.73e-08 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.42e+06 | 1.42e+06 | 1.42e+06 | 1.42e+06 | +| mean | 0.0927 | 0.0927 | -5.77e-08 | -4.81e-08 | +| std | 1.1988 | 1.1988 | 2.60e-05 | 2.17e-05 | +| min | -0.0744 | -0.0744 | -0.0132 | -0.0110 | +| 25% | 0.0061 | 0.0061 | -4.62e-10 | -3.86e-10 | +| 50% | 0.0273 | 0.0273 | 
0.0000 | 0.0000 | +| 75% | 0.0770 | 0.0770 | 4.61e-10 | 3.85e-10 | +| max | 1344.9330 | 1344.9330 | 2.08e-06 | 1.73e-06 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 1,419,136 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -5.56e-08 | 2.19e-08 | -2.5371 | 0.011 | +| Slope | 1.0000 | 1.82e-08 | 5.48e+07 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 2/1419136 (0.000%) +- Stata standard deviation: 1.20e+00 + +--- + +### RDAbility + +**Status**: ✅ PASSED (with override) + +**Override Applied**: +- Reviewed on: 2025-08-28 +- Reviewed by: ac +- Details: Given the complicated nature of this predictor (many regressions with many missing values), the 4.3% Precision1 failure rate is amazing. I think we actually improved the replication a bit, since the long-short t-stat is higher by a bit. 
+ +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +4.50% rows vs Stata) +- Test 3 - Precision1 check: ❌ FAILED +- Test 4 - Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 173,266 +- Python: 181,066 +- Common: 173,242 + +**Precision1**: 4.336% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.17e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 173242.0000 | 173242.0000 | 173242.0000 | 173242.0000 | +| mean | 0.4685 | 0.4644 | -0.0040 | -7.50e-04 | +| std | 5.3534 | 5.2908 | 0.7769 | 0.1451 | +| min | -170.7315 | -170.7315 | -25.1031 | -4.6892 | +| 25% | -0.2961 | -0.2951 | -1.56e-07 | -2.91e-08 | +| 50% | 0.4038 | 0.4001 | 5.57e-10 | 1.04e-10 | +| 75% | 1.3891 | 1.3673 | 1.58e-07 | 2.94e-08 | +| max | 83.8592 | 83.8592 | 35.2219 | 6.5794 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0064 + 0.9778 * stata +- **R-squared**: 0.9789 +- **N observations**: 173,242 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 0.0064 | 0.0019 | 3.4395 | 0.001 | +| Slope | 0.9778 | 3.45e-04 | 2837.9742 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 7512/173242 (4.336%) +- Stata standard deviation: 5.35e+00 + +**Most Recent Bad Observations**: +``` + permno yyyymm python stata diff +0 14033 202608 0.379405 0.192091 0.187314 +1 14033 202607 0.379405 0.192091 0.187314 +2 14033 202606 0.379405 0.192091 0.187314 +3 14033 202605 0.379405 0.192091 0.187314 +4 14245 202605 0.943359 0.997209 -0.053850 +5 14432 202605 0.304188 0.448311 -0.144123 +6 14668 202605 0.619375 0.359465 0.259910 +7 15059 202605 0.805267 
-4.663055 5.468322 +8 16533 202605 -0.130891 -0.232203 0.101312 +9 82670 202605 0.300767 0.394679 -0.093913 +``` + +**Largest Differences**: +``` + permno yyyymm python stata diff +0 79283 200206 -24.396323 -59.618244 35.221921 +1 79283 200207 -24.396323 -59.618244 35.221921 +2 79283 200208 -24.396323 -59.618244 35.221921 +3 79283 200209 -24.396323 -59.618244 35.221921 +4 79283 200210 -24.396323 -59.618244 35.221921 +5 79283 200211 -24.396323 -59.618244 35.221921 +6 79283 200212 -24.396323 -59.618244 35.221921 +7 79283 200301 -24.396323 -59.618244 35.221921 +8 79283 200302 -24.396323 -59.618244 35.221921 +9 79283 200303 -24.396323 -59.618244 35.221921 +``` + +**Largest Differences Before 1950**: +``` +No data before 1950 +``` + +--- + +### RDS + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +3.38% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,725,375 +- Python: 2,817,595 +- Common: 2,725,375 + +**Precision1**: 0.001% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 9.44e-08 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.73e+06 | 2.73e+06 | 2.73e+06 | 2.73e+06 | +| mean | -142.9794 | -142.9782 | 0.0012 | 1.50e-07 | +| std | 7944.9900 | 7944.9896 | 1.6490 | 2.08e-04 | +| min | -2.68e+06 | -2.68e+06 | -576.0000 | -0.0725 | +| 25% | -6.6615 | -6.6614 | -1.00e-08 | -1.26e-12 | +| 50% | -0.0935 | -0.0935 | 0.0000 | 0.0000 | +| 75% | 0.7535 | 0.7535 | 1.00e-08 | 1.26e-12 | +| max | 692140.2500 | 692140.2477 | 454.0000 | 0.0571 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0012 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N 
observations**: 2,725,375 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 0.0012 | 9.99e-04 | 1.1792 | 0.238 | +| Slope | 1.0000 | 1.26e-07 | 7.95e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 36/2725375 (0.001%) +- Stata standard deviation: 7.94e+03 + +--- + +### RDcap + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +3.89% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 517,737 +- Python: 537,864 +- Common: 517,652 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 3.65e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 517652.0000 | 517652.0000 | 517652.0000 | 517652.0000 | +| mean | 0.2067 | 0.2067 | 7.36e-12 | 1.06e-11 | +| std | 0.6976 | 0.6976 | 2.23e-08 | 3.20e-08 | +| min | -0.0011 | -0.0011 | -1.04e-06 | -1.49e-06 | +| 25% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 0.1533 | 0.1533 | 0.0000 | 0.0000 | +| max | 34.7810 | 34.7810 | 1.32e-06 | 1.90e-06 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 517,652 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -1.63e-10 | 3.24e-11 | -5.0472 | 0.000 | +| Slope | 1.0000 | 4.45e-11 | 2.25e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/517652 (0.000%) +- Stata standard 
deviation: 6.98e-01 + +--- + +### REV6 + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.05% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,762,090 +- Python: 1,762,915 +- Common: 1,759,158 + +**Precision1**: 0.164% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.74e-02 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.76e+06 | 1.76e+06 | 1.76e+06 | 1.76e+06 | +| mean | -0.0685 | -0.0645 | 0.0040 | 6.18e-05 | +| std | 64.8875 | 65.3849 | 8.3431 | 0.1286 | +| min | -58190.5590 | -58190.5598 | -3284.2074 | -50.6138 | +| 25% | -0.0119 | -0.0122 | -9.71e-10 | -1.50e-11 | +| 50% | 0.0016 | 0.0017 | 0.0000 | 0.0000 | +| 75% | 0.0124 | 0.0126 | 9.70e-10 | 1.49e-11 | +| max | 32747.2600 | 32748.0882 | 8424.1383 | 129.8267 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0040 + 0.9994 * stata +- **R-squared**: 0.9837 +- **N observations**: 1,759,158 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 0.0040 | 0.0063 | 0.6313 | 0.528 | +| Slope | 0.9994 | 9.69e-05 | 10309.6465 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 2888/1759158 (0.164%) +- Stata standard deviation: 6.49e+01 + +--- + +### RIO_Disp + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.06% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 497,437 +- Python: 
497,742 +- Common: 496,313 + +**Precision1**: 0.100% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 7.90e-01 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 496313.0000 | 496313.0000 | 496313.0000 | 496313.0000 | +| mean | 3.5899 | 3.5909 | 9.95e-04 | 7.86e-04 | +| std | 1.2664 | 1.2666 | 0.0317 | 0.0250 | +| min | 1.0000 | 1.0000 | -1.0000 | -0.7896 | +| 25% | 3.0000 | 3.0000 | 0.0000 | 0.0000 | +| 50% | 4.0000 | 4.0000 | 0.0000 | 0.0000 | +| 75% | 5.0000 | 5.0000 | 0.0000 | 0.0000 | +| max | 5.0000 | 5.0000 | 1.0000 | 0.7896 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0018 + 0.9998 * stata +- **R-squared**: 0.9994 +- **N observations**: 496,313 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 0.0018 | 1.35e-04 | 13.0912 | 0.000 | +| Slope | 0.9998 | 3.55e-05 | 28174.5881 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 498/496313 (0.100%) +- Stata standard deviation: 1.27e+00 + +**Most Recent Bad Observations**: +``` + permno yyyymm python stata diff +0 21563 202412 4.0 3.0 1.0 +1 91910 202412 5.0 4.0 1.0 +2 92597 202411 4.0 3.0 1.0 +3 22758 202406 4.0 3.0 1.0 +4 16630 202405 5.0 4.0 1.0 +5 18937 202405 4.0 3.0 1.0 +6 10382 202403 5.0 4.0 1.0 +7 18572 202403 4.0 3.0 1.0 +8 15291 202401 3.0 2.0 1.0 +9 25590 202401 3.0 2.0 1.0 +``` + +**Largest Differences**: +``` + permno yyyymm python stata diff +0 10026 199903 5.0 4.0 1.0 +1 10026 201806 3.0 2.0 1.0 +2 10035 199004 5.0 4.0 1.0 +3 10083 198705 4.0 3.0 1.0 +4 10091 198808 4.0 3.0 1.0 +5 10180 200001 5.0 4.0 1.0 +6 10182 201804 4.0 3.0 1.0 +7 10192 199007 5.0 4.0 1.0 +8 10258 199104 5.0 4.0 1.0 +9 10258 201705 5.0 4.0 1.0 +``` + 
+**Largest Differences Before 1950**: +``` +No data before 1950 +``` + +--- + +### RIO_MB + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.09% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 354,170 +- Python: 354,474 +- Common: 354,047 + +**Precision1**: 0.089% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 0.00e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 354047.0000 | 354047.0000 | 354047.0000 | 354047.0000 | +| mean | 2.7904 | 2.7913 | 8.70e-04 | 6.41e-04 | +| std | 1.3572 | 1.3576 | 0.0299 | 0.0220 | +| min | 1.0000 | 1.0000 | -1.0000 | -0.7368 | +| 25% | 2.0000 | 2.0000 | 0.0000 | 0.0000 | +| 50% | 3.0000 | 3.0000 | 0.0000 | 0.0000 | +| 75% | 4.0000 | 4.0000 | 0.0000 | 0.0000 | +| max | 5.0000 | 5.0000 | 1.0000 | 0.7368 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0008 + 1.0000 * stata +- **R-squared**: 0.9995 +- **N observations**: 354,047 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 7.91e-04 | 1.15e-04 | 6.8901 | 0.000 | +| Slope | 1.0000 | 3.70e-05 | 27043.8615 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 316/354047 (0.089%) +- Stata standard deviation: 1.36e+00 + +--- + +### RIO_Turnover + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.01% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 
445,546 +- Python: 445,570 +- Common: 445,078 + +**Precision1**: 0.131% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 7.42e-01 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 445078.0000 | 445078.0000 | 445078.0000 | 445078.0000 | +| mean | 3.2513 | 3.2526 | 0.0013 | 9.62e-04 | +| std | 1.3475 | 1.3479 | 0.0362 | 0.0269 | +| min | 1.0000 | 1.0000 | -1.0000 | -0.7421 | +| 25% | 2.0000 | 2.0000 | 0.0000 | 0.0000 | +| 50% | 3.0000 | 3.0000 | 0.0000 | 0.0000 | +| 75% | 4.0000 | 4.0000 | 0.0000 | 0.0000 | +| max | 5.0000 | 5.0000 | 1.0000 | 0.7421 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0015 + 0.9999 * stata +- **R-squared**: 0.9993 +- **N observations**: 445,078 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 0.0015 | 1.42e-04 | 10.5857 | 0.000 | +| Slope | 0.9999 | 4.03e-05 | 24810.6590 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 585/445078 (0.131%) +- Stata standard deviation: 1.35e+00 + +**Most Recent Bad Observations**: +``` + permno yyyymm python stata diff +0 21563 202412 4.0 3.0 1.0 +1 18937 202405 4.0 3.0 1.0 +2 18572 202403 4.0 3.0 1.0 +3 15291 202401 3.0 2.0 1.0 +4 16436 202311 4.0 3.0 1.0 +5 78003 202309 4.0 3.0 1.0 +6 91606 202307 4.0 3.0 1.0 +7 18558 202301 3.0 2.0 1.0 +8 18576 202207 2.0 1.0 1.0 +9 21589 202207 4.0 3.0 1.0 +``` + +**Largest Differences**: +``` + permno yyyymm python stata diff +0 10006 195909 4.0 3.0 1.0 +1 10014 196902 5.0 4.0 1.0 +2 10022 192809 5.0 4.0 1.0 +3 10022 192901 5.0 4.0 1.0 +4 10035 199004 5.0 4.0 1.0 +5 10057 193607 5.0 4.0 1.0 +6 10057 193609 5.0 4.0 1.0 +7 10057 194101 4.0 3.0 1.0 +8 10057 195009 5.0 4.0 1.0 +9 10083 198705 4.0 
3.0 1.0 +``` + +**Largest Differences Before 1950**: +``` + permno yyyymm python stata diff +0 10022 192809 5.0 4.0 1.0 +1 10022 192901 5.0 4.0 1.0 +2 10057 193607 5.0 4.0 1.0 +3 10057 193609 5.0 4.0 1.0 +4 10057 194101 4.0 3.0 1.0 +5 10137 194602 3.0 2.0 1.0 +6 10137 194703 3.0 2.0 1.0 +7 10233 193204 2.0 1.0 1.0 +8 10559 194405 5.0 4.0 1.0 +9 10671 192705 3.0 2.0 1.0 +``` + +--- + +### RIO_Volatility + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.04% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 470,062 +- Python: 470,257 +- Common: 469,253 + +**Precision1**: 0.138% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 7.46e-01 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 469253.0000 | 469253.0000 | 469253.0000 | 469253.0000 | +| mean | 3.4332 | 3.4345 | 0.0014 | 0.0010 | +| std | 1.3412 | 1.3417 | 0.0371 | 0.0276 | +| min | 1.0000 | 1.0000 | -1.0000 | -0.7456 | +| 25% | 2.0000 | 2.0000 | 0.0000 | 0.0000 | +| 50% | 4.0000 | 4.0000 | 0.0000 | 0.0000 | +| 75% | 5.0000 | 5.0000 | 0.0000 | 0.0000 | +| max | 5.0000 | 5.0000 | 1.0000 | 0.7456 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0016 + 0.9999 * stata +- **R-squared**: 0.9992 +- **N observations**: 469,253 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 0.0016 | 1.49e-04 | 10.5698 | 0.000 | +| Slope | 0.9999 | 4.04e-05 | 24777.9202 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 646/469253 (0.138%) +- Stata standard deviation: 1.34e+00 + 
+**Most Recent Bad Observations**: +``` + permno yyyymm python stata diff +0 21563 202412 4.0 3.0 1.0 +1 14045 202410 5.0 4.0 1.0 +2 22758 202406 4.0 3.0 1.0 +3 18937 202405 4.0 3.0 1.0 +4 18572 202403 4.0 3.0 1.0 +5 88264 202401 5.0 4.0 1.0 +6 18955 202309 5.0 4.0 1.0 +7 78003 202309 4.0 3.0 1.0 +8 17357 202305 3.0 2.0 1.0 +9 18561 202305 5.0 4.0 1.0 +``` + +**Largest Differences**: +``` + permno yyyymm python stata diff +0 10035 199004 5.0 4.0 1.0 +1 10062 199006 5.0 4.0 1.0 +2 10062 199403 4.0 3.0 1.0 +3 10083 198705 4.0 3.0 1.0 +4 10125 199008 4.0 3.0 1.0 +5 10137 194308 4.0 3.0 1.0 +6 10137 194602 3.0 2.0 1.0 +7 10166 199002 5.0 4.0 1.0 +8 10233 193204 2.0 1.0 1.0 +9 10258 199104 5.0 4.0 1.0 +``` + +**Largest Differences Before 1950**: +``` + permno yyyymm python stata diff +0 10137 194308 4.0 3.0 1.0 +1 10137 194602 3.0 2.0 1.0 +2 10233 193204 2.0 1.0 1.0 +3 10284 192910 3.0 2.0 1.0 +4 10559 194405 5.0 4.0 1.0 +5 10591 193704 3.0 2.0 1.0 +6 10671 192705 3.0 2.0 1.0 +7 10698 194503 5.0 4.0 1.0 +8 10823 194908 4.0 3.0 1.0 +9 11148 193810 5.0 4.0 1.0 +``` + +--- + +### RIVolSpread + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +1.13% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 750,937 +- Python: 759,401 +- Common: 750,406 + +**Precision1**: 0.019% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 7.88e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 750406.0000 | 750406.0000 | 750406.0000 | 750406.0000 | +| mean | -0.0296 | -0.0296 | -2.40e-06 | -1.04e-05 | +| std | 0.2302 | 0.2301 | 0.0034 | 0.0149 | +| min | -7.4048 | -7.4048 | -0.7485 | -3.2519 | 
+| 25% | -0.1138 | -0.1138 | -1.32e-08 | -5.75e-08 | +| 50% | -0.0452 | -0.0452 | 2.83e-11 | 1.23e-10 | +| 75% | 0.0295 | 0.0295 | 1.32e-08 | 5.75e-08 | +| max | 23.6805 | 23.6805 | 0.7300 | 3.1718 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 0.9998 * stata +- **R-squared**: 0.9998 +- **N observations**: 750,406 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -9.27e-06 | 4.00e-06 | -2.3170 | 0.021 | +| Slope | 0.9998 | 1.72e-05 | 57964.5216 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 145/750406 (0.019%) +- Stata standard deviation: 2.30e-01 + +--- + +### RealizedVol + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has -0.13% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 4,987,397 +- Python: 4,980,936 +- Common: 4,980,936 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.68e-15 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 4.98e+06 | 4.98e+06 | 4.98e+06 | 4.98e+06 | +| mean | 0.0297 | 0.0297 | 2.57e-20 | 8.27e-19 | +| std | 0.0311 | 0.0311 | 2.95e-17 | 9.50e-16 | +| min | 0.0000 | 0.0000 | -6.66e-16 | -2.14e-14 | +| 25% | 0.0131 | 0.0131 | -2.52e-17 | -8.09e-16 | +| 50% | 0.0219 | 0.0219 | 0.0000 | 0.0000 | +| 75% | 0.0367 | 0.0367 | 2.60e-17 | 8.37e-16 | +| max | 8.4777 | 8.4777 | 3.55e-15 | 1.14e-13 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 4,980,936 + +| 
Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -1.44e-15 | 9.16e-19 | -1577.0941 | 0.000 | +| Slope | 1.0000 | 2.13e-17 | 4.70e+16 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/4980936 (0.000%) +- Stata standard deviation: 3.11e-02 + +--- + +### Recomm_ShortInterest + +**Status**: ✅ PASSED (with override) + +**Override Applied**: +- Reviewed on: 2025-08-20 +- Reviewed by: ac +- Details: The do file was using asrol with stat(first) to fill in missing values. This method is not used anywhere else. Also, this method does not work properly. I really don't understand what it's doing See https://github.com/OpenSourceAP/CrossSection/issues/178. + +I wrote Recomm_ShortInterest.py from scratch to fill in the missing values properly. It results in far more observations than the do file. I checked a few of the Stata observations that are missing in Python and they all should be missing. They had ConsRecomm scores of around 3.0, which should not be an extreme quintile and therefore should be dropped. 
+ +**Test Results**: +- Test 1 - Superset check: ❌ FAILED (Python missing 20330 Stata observations) +- Test 2 - NumRows check: ❌ FAILED (Python has +95.43% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 34,619 +- Python: 67,656 +- Common: 14,289 + +**Precision1**: 0.007% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 0.00e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 14289.0000 | 14289.0000 | 14289.0000 | 14289.0000 | +| mean | 0.5285 | 0.5286 | 7.00e-05 | 1.40e-04 | +| std | 0.4992 | 0.4992 | 0.0084 | 0.0168 | +| min | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 25% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 50% | 1.0000 | 1.0000 | 0.0000 | 0.0000 | +| 75% | 1.0000 | 1.0000 | 0.0000 | 0.0000 | +| max | 1.0000 | 1.0000 | 1.0000 | 2.0032 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0001 + 0.9999 * stata +- **R-squared**: 0.9997 +- **N observations**: 14,289 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.48e-04 | 1.02e-04 | 1.4564 | 0.145 | +| Slope | 0.9999 | 1.40e-04 | 7131.8407 | 0.000 | + +**Missing Observations Sample**: +``` + index permno yyyymm Recomm_ShortInterest + 0 10044 201106 1.0 + 1 10051 200704 1.0 + 2 10104 200607 1.0 + 3 10104 200807 1.0 + 4 10104 200808 1.0 + 5 10104 200903 1.0 + 6 10104 200904 1.0 + 7 10104 200906 1.0 + 8 10104 201402 1.0 + 9 10104 201507 1.0 +``` + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 1/14289 (0.007%) +- Stata standard deviation: 4.99e-01 + +--- + +### ResidualMomentum + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ 
PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.01% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,458,422 +- Python: 3,458,602 +- Common: 3,458,422 + +**Precision1**: 0.696% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.19e-02 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.46e+06 | 3.46e+06 | 3.46e+06 | 3.46e+06 | +| mean | -0.0384 | -0.0383 | 5.14e-05 | 1.56e-04 | +| std | 0.3299 | 0.3300 | 6.41e-04 | 0.0019 | +| min | -4.1338 | -4.1338 | -0.0317 | -0.0960 | +| 25% | -0.2366 | -0.2366 | -1.03e-08 | -3.12e-08 | +| 50% | -0.0220 | -0.0219 | 6.87e-10 | 2.08e-09 | +| 75% | 0.1765 | 0.1766 | 1.23e-08 | 3.74e-08 | +| max | 2.8989 | 2.8989 | 0.0460 | 0.1393 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0001 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,458,422 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 5.25e-05 | 3.47e-07 | 151.5271 | 0.000 | +| Slope | 1.0000 | 1.04e-06 | 957999.2235 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 24062/3458422 (0.696%) +- Stata standard deviation: 3.30e-01 + +--- + +### ReturnSkew + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 4,952,730 +- Python: 4,952,730 +- Common: 4,952,730 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + 
+**Precision2**: 100th percentile diff = 5.42e-15 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 4.95e+06 | 4.95e+06 | 4.95e+06 | 4.95e+06 | +| mean | 0.1796 | 0.1796 | -2.56e-19 | -2.60e-19 | +| std | 0.9830 | 0.9830 | 5.13e-16 | 5.21e-16 | +| min | -4.9029 | -4.9029 | -1.60e-14 | -1.63e-14 | +| 25% | -0.2931 | -0.2931 | -1.39e-16 | -1.41e-16 | +| 50% | 0.1487 | 0.1487 | 0.0000 | 0.0000 | +| 75% | 0.6344 | 0.6344 | 1.39e-16 | 1.41e-16 | +| max | 4.9029 | 4.9029 | 1.60e-14 | 1.63e-14 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 4,952,730 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -1.30e-14 | 2.93e-17 | -442.6160 | 0.000 | +| Slope | 1.0000 | 2.93e-17 | 3.41e+16 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/4952730 (0.000%) +- Stata standard deviation: 9.83e-01 + +--- + +### ReturnSkew3F + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has -0.23% rows vs Stata) +- Test 3 - Precision1 check: ❌ FAILED +- Test 4 - Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 4,978,948 +- Python: 4,967,368 +- Common: 4,966,101 + +**Precision1**: 2.345% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.84e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 4.97e+06 | 4.97e+06 | 4.97e+06 | 4.97e+06 | +| mean | 0.1547 | 0.1542 | 
-5.20e-04 | -6.12e-04 | +| std | 0.8501 | 0.8526 | 0.1130 | 0.1330 | +| min | -4.8206 | -4.8206 | -8.4971 | -9.9954 | +| 25% | -0.2807 | -0.2807 | -3.33e-16 | -3.92e-16 | +| 50% | 0.1302 | 0.1295 | 0.0000 | 0.0000 | +| 75% | 0.5710 | 0.5707 | 3.33e-16 | 3.92e-16 | +| max | 4.7150 | 4.8000 | 8.7287 | 10.2679 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0004 + 0.9941 * stata +- **R-squared**: 0.9825 +- **N observations**: 4,966,101 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 3.93e-04 | 5.15e-05 | 7.6380 | 0.000 | +| Slope | 0.9941 | 5.96e-05 | 16679.1949 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 116456/4966101 (2.345%) +- Stata standard deviation: 8.50e-01 + +**Most Recent Bad Observations**: +``` + permno yyyymm python stata diff +0 10777 202412 -0.081862 -0.091706 0.009843 +1 10890 202412 -2.034009 -2.020654 -0.013356 +2 11369 202412 -0.138541 -0.147520 0.008979 +3 11404 202412 -0.595165 -0.604012 0.008846 +4 11674 202412 -0.248130 -0.262484 0.014354 +5 12397 202412 0.255910 0.246951 0.008959 +6 12476 202412 -1.112346 -1.139055 0.026710 +7 12558 202412 -0.473364 -0.491103 0.017739 +8 12680 202412 1.581095 1.593038 -0.011943 +9 12753 202412 -0.725019 -0.734683 0.009664 +``` + +**Largest Differences**: +``` + permno yyyymm python stata diff +0 30605 196406 4.364358 -4.364358 8.728716 +1 24994 198601 -4.248529 4.248529 -8.497058 +2 92954 198601 -4.248529 4.248529 -8.497058 +3 11651 198709 -4.129483 4.129483 -8.258966 +4 10005 198812 -4.248529 1.269106 -5.517636 +5 10041 198812 -4.248529 1.269106 -5.517636 +6 10066 198812 -4.248529 1.269106 -5.517636 +7 10086 198812 -4.248529 1.269106 -5.517636 +8 10176 198812 -4.248529 1.269106 -5.517636 +9 10216 198812 -4.248529 1.269106 -5.517636 +``` + +**Largest Differences Before 1950**: +``` + permno yyyymm python stata diff +0 10540 193207 -4.587317 0.845579 
-5.432896 +1 18833 193509 4.587317 -0.710719 5.298036 +2 19123 193311 4.587317 -0.591974 5.179292 +3 15253 193208 -4.800000 0.334008 -5.134008 +4 18577 193205 -4.477215 0.489618 -4.966833 +5 16918 193108 4.800000 -0.037412 4.837412 +6 17873 193108 4.800000 -0.037412 4.837412 +7 15632 192608 -4.800000 -0.091409 -4.708591 +8 15923 192608 -4.800000 -0.091409 -4.708591 +9 13127 193304 -4.587317 0.022204 -4.609522 +``` + +--- + +### RevenueSurprise + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,107,489 +- Python: 2,107,517 +- Common: 2,107,427 + +**Precision1**: 0.019% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 4.06e-04 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.11e+06 | 2.11e+06 | 2.11e+06 | 2.11e+06 | +| mean | 0.0943 | 0.0972 | 0.0030 | 2.58e-05 | +| std | 116.2027 | 114.0864 | 2.3993 | 0.0206 | +| min | -86414.3670 | -84415.3972 | -36.9605 | -0.3181 | +| 25% | -0.7785 | -0.7789 | -1.56e-07 | -1.34e-09 | +| 50% | 0.1277 | 0.1276 | 0.0000 | 0.0000 | +| 75% | 0.8664 | 0.8662 | 1.56e-07 | 1.34e-09 | +| max | 27665.5370 | 27665.5361 | 1998.9698 | 17.2024 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0047 + 0.9817 * stata +- **R-squared**: 0.9999 +- **N observations**: 2,107,427 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 0.0047 | 7.72e-04 | 6.1102 | 0.000 | +| Slope | 0.9817 | 6.64e-06 | 147845.2056 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= 
TOL_DIFF_1: 401/2107427 (0.019%) +- Stata standard deviation: 1.16e+02 + +--- + +### RoE + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.06% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,527,662 +- Python: 3,529,918 +- Common: 3,527,662 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.37e-08 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.53e+06 | 3.53e+06 | 3.53e+06 | 3.53e+06 | +| mean | -0.1424 | -0.1424 | -4.73e-07 | -6.18e-09 | +| std | 76.4924 | 76.4924 | 9.26e-04 | 1.21e-05 | +| min | -31837.0000 | -31837.0000 | -0.3429 | -0.0045 | +| 25% | -0.0331 | -0.0331 | -2.80e-09 | -3.66e-11 | +| 50% | 0.0884 | 0.0884 | -9.15e-13 | -1.20e-14 | +| 75% | 0.1559 | 0.1559 | 2.76e-09 | 3.61e-11 | +| max | 7770.3335 | 7770.3333 | 0.1953 | 0.0026 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,527,662 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -4.73e-07 | 4.93e-07 | -0.9594 | 0.337 | +| Slope | 1.0000 | 6.45e-09 | 1.55e+08 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/3527662 (0.000%) +- Stata standard deviation: 7.65e+01 + +--- + +### SP + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + 
+**Observations**: +- Stata: 3,030,926 +- Python: 3,030,937 +- Common: 3,030,926 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.63e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.03e+06 | 3.03e+06 | 3.03e+06 | 3.03e+06 | +| mean | 2.5432 | 2.5432 | 5.39e-08 | 6.24e-09 | +| std | 8.6324 | 8.6324 | 1.98e-05 | 2.29e-06 | +| min | -61.8389 | -61.8389 | -0.0092 | -0.0011 | +| 25% | 0.3973 | 0.3973 | -1.71e-08 | -1.98e-09 | +| 50% | 0.9928 | 0.9928 | 0.0000 | 0.0000 | +| 75% | 2.4125 | 2.4125 | 1.70e-08 | 1.97e-09 | +| max | 3668.3628 | 3668.3627 | 0.0050 | 5.80e-04 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,030,926 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 6.25e-08 | 1.18e-08 | 5.2826 | 0.000 | +| Slope | 1.0000 | 1.32e-09 | 7.60e+08 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/3030926 (0.000%) +- Stata standard deviation: 8.63e+00 + +--- + +### STreversal + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 4,047,630 +- Python: 4,047,630 +- Common: 4,047,630 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.62e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | 
+|------------|----------------|----------------|----------------|----------------| +| count | 4.05e+06 | 4.05e+06 | 4.05e+06 | 4.05e+06 | +| mean | 0.0111 | 0.0111 | -2.51e-12 | -1.35e-11 | +| std | 0.1856 | 0.1856 | 4.56e-09 | 2.46e-08 | +| min | -1.0000 | -1.0000 | -5.00e-07 | -2.69e-06 | +| 25% | -0.0650 | -0.0650 | -1.11e-16 | -5.98e-16 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 0.0690 | 0.0690 | 1.08e-16 | 5.79e-16 | +| max | 39.0000 | 39.0000 | 5.00e-07 | 2.69e-06 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 4,047,630 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -1.22e-11 | 2.27e-12 | -5.3703 | 0.000 | +| Slope | 1.0000 | 1.22e-11 | 8.20e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/4047630 (0.000%) +- Stata standard deviation: 1.86e-01 + +--- + +### ShareIss1Y + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,517,326 +- Python: 3,517,326 +- Common: 3,517,326 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 7.53e-10 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.52e+06 | 3.52e+06 | 3.52e+06 | 3.52e+06 | +| mean | 0.7841 | 0.7841 | -3.05e-08 | -5.13e-11 | +| std | 593.7148 | 593.7147 | 3.01e-05 | 5.07e-08 | +| min | -0.9982 | -0.9982 | -0.0413 | -6.95e-05 | +| 25% | 0.0000 | 0.0000 | -1.80e-08 | -3.03e-11 | +| 50% | 
0.0024 | 0.0024 | 0.0000 | 0.0000 | +| 75% | 0.0285 | 0.0285 | 1.81e-08 | 3.05e-11 | +| max | 707720.6900 | 707720.6487 | 2.57e-04 | 4.32e-07 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,517,326 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 8.19e-09 | 3.81e-09 | 2.1507 | 0.032 | +| Slope | 1.0000 | 6.42e-12 | 1.56e+11 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/3517326 (0.000%) +- Stata standard deviation: 5.94e+02 + +--- + +### ShareIss5Y + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,507,320 +- Python: 2,507,320 +- Common: 2,507,320 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 7.63e-10 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.51e+06 | 2.51e+06 | 2.51e+06 | 2.51e+06 | +| mean | 19.3920 | 19.3920 | 1.92e-07 | 2.44e-11 | +| std | 7858.8404 | 7858.8406 | 1.99e-04 | 2.53e-08 | +| min | -0.9913 | -0.9913 | -0.0359 | -4.57e-06 | +| 25% | -1.32e-04 | -1.32e-04 | -2.67e-08 | -3.40e-12 | +| 50% | 0.0471 | 0.0471 | 0.0000 | 0.0000 | +| 75% | 0.2961 | 0.2961 | 2.71e-08 | 3.45e-12 | +| max | 6.54e+06 | 6.54e+06 | 0.2681 | 3.41e-05 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 2,507,320 + +| Coefficient | Estimate | Std Error | t-statistic | 
p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -1.55e-07 | 8.89e-08 | -1.7387 | 0.082 | +| Slope | 1.0000 | 1.13e-11 | 8.84e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/2507320 (0.000%) +- Stata standard deviation: 7.86e+03 + +--- + +### ShareRepurchase + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.06% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,624,363 +- Python: 3,626,619 +- Common: 3,624,363 + +**Precision1**: 0.003% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 0.00e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.62e+06 | 3.62e+06 | 3.62e+06 | 3.62e+06 | +| mean | 0.3376 | 0.3377 | 2.65e-05 | 5.60e-05 | +| std | 0.4729 | 0.4729 | 0.0058 | 0.0122 | +| min | 0.0000 | 0.0000 | -1.0000 | -2.1146 | +| 25% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 1.0000 | 1.0000 | 0.0000 | 0.0000 | +| max | 1.0000 | 1.0000 | 1.0000 | 2.1146 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 0.9999 * stata +- **R-squared**: 0.9999 +- **N observations**: 3,624,363 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 4.50e-05 | 3.71e-06 | 12.1141 | 0.000 | +| Slope | 0.9999 | 6.39e-06 | 156457.1023 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 120/3624363 (0.003%) +- Stata standard deviation: 4.73e-01 + +--- + +### ShareVol + +**Status**: ✅ PASSED + +**Test Results**: 
+- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.06% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,660,340 +- Python: 1,661,295 +- Common: 1,660,340 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 0.00e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.66e+06 | 1.66e+06 | 1.66e+06 | 1.66e+06 | +| mean | 0.3061 | 0.3061 | 0.0000 | 0.0000 | +| std | 0.4609 | 0.4609 | 0.0000 | 0.0000 | +| min | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 25% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 1.0000 | 1.0000 | 0.0000 | 0.0000 | +| max | 1.0000 | 1.0000 | 0.0000 | 0.0000 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 1,660,340 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -1.36e-12 | 1.89e-15 | -717.5598 | 0.000 | +| Slope | 1.0000 | 3.42e-15 | 2.92e+14 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/1660340 (0.000%) +- Stata standard deviation: 4.61e-01 + +--- + +### ShortInterest + +**Status**: ✅ PASSED (with override) + +**Override Applied**: +- Reviewed on: 2025-08-30 +- Reviewed by: ac +- Details: Like the other short interest predictors, the only test here that is really informative is the t-stat, and the t-stat test passes. 
+ +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ❌ FAILED (Python has +80.71% rows vs Stata) +- Test 3 - Precision1 check: ❌ FAILED +- Test 4 - Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 873,175 +- Python: 1,577,931 +- Common: 873,175 + +**Precision1**: 81.892% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.45e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 873175.0000 | 873175.0000 | 873175.0000 | 873175.0000 | +| mean | 36854.3903 | 0.0465 | -36854.3438 | -0.2180 | +| std | 169080.3104 | 1.0116 | 169080.1378 | 1.0000 | +| min | 0.0000 | 0.0000 | -1.34e+08 | -794.3804 | +| 25% | 3359.3736 | 0.0092 | -45982.8982 | -0.2720 | +| 50% | 15037.8830 | 0.0256 | -15037.8507 | -0.0889 | +| 75% | 45982.9375 | 0.0591 | -3359.3612 | -0.0199 | +| max | 1.34e+08 | 931.6428 | 0.3482 | 2.06e-06 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0089 + 0.0000 * stata +- **R-squared**: 0.0291 +- **N observations**: 873,175 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 0.0089 | 0.0011 | 8.1081 | 0.000 | +| Slope | 1.02e-06 | 6.31e-09 | 161.7411 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 715057/873175 (81.892%) +- Stata standard deviation: 1.69e+05 + +**Most Recent Bad Observations**: +``` + permno yyyymm python stata diff +0 10026 202412 0.039553 41805.5230 -41805.483447 +1 10028 202412 0.004616 4117.0947 -4117.090084 +2 10032 202412 0.021802 4939.5396 -4939.517798 +3 10104 202412 0.007313 2725.3242 -2725.316887 +4 10138 202412 0.053843 77018.0780 -77018.024157 +5 10145 202412 0.013544 7984.1108 -7984.097256 +6 
10158 202412 0.087012 91591.9610 -91591.873988 +7 10200 202412 0.057009 42890.2150 -42890.157991 +8 10257 202412 0.011584 7048.5195 -7048.507916 +9 10258 202412 0.110158 47825.1520 -47825.041842 +``` + +**Largest Differences**: +``` + permno yyyymm python stata diff +0 18558 202312 140.529397 134314224.0 -1.343141e+08 +1 15833 202312 15.763534 24347756.0 -2.434774e+07 +2 12350 202409 8.195340 21476184.0 -2.147618e+07 +3 19649 202308 16.101878 20176900.0 -2.017688e+07 +4 12350 202312 11.357373 19826456.0 -1.982644e+07 +5 14444 202405 3.291250 19450570.0 -1.945057e+07 +6 84302 202305 10.957621 15805988.0 -1.580598e+07 +7 14523 202412 1.754496 15724548.0 -1.572455e+07 +8 20422 202406 6.212520 14799202.0 -1.479920e+07 +9 78877 202004 0.649375 11417539.0 -1.141754e+07 +``` + +**Largest Differences Before 1950**: +``` +No data before 1950 +``` + +--- + +### Size + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 4,029,130 +- Python: 4,029,130 +- Common: 4,029,130 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.14e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 4.03e+06 | 4.03e+06 | 4.03e+06 | 4.03e+06 | +| mean | 4.6018 | 4.6018 | 1.36e-11 | 5.81e-12 | +| std | 2.3329 | 2.3329 | 1.00e-07 | 4.29e-08 | +| min | -5.9915 | -5.9915 | -5.00e-07 | -2.14e-07 | +| 25% | 2.8983 | 2.8983 | 0.0000 | 0.0000 | +| 50% | 4.3953 | 4.3953 | 0.0000 | 0.0000 | +| 75% | 6.1365 | 6.1365 | 0.0000 | 0.0000 | +| max | 15.1466 | 15.1466 | 5.00e-07 | 2.14e-07 | + +**Regression Analysis** (Python ~ Stata): + 
+- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 4,029,130 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 4.68e-11 | 1.10e-10 | 0.4246 | 0.671 | +| Slope | 1.0000 | 2.14e-11 | 4.68e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/4029130 (0.000%) +- Stata standard deviation: 2.33e+00 + +--- + +### SmileSlope + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +1.15% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 862,230 +- Python: 872,108 +- Common: 861,502 + +**Precision1**: 0.018% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 8.40e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 861502.0000 | 861502.0000 | 861502.0000 | 861502.0000 | +| mean | -0.0371 | -0.0371 | -4.58e-07 | -9.61e-07 | +| std | 0.4764 | 0.4764 | 0.0046 | 0.0096 | +| min | -7.8254 | -7.8254 | -3.4528 | -7.2473 | +| 25% | -0.0311 | -0.0311 | -1.00e-08 | -2.10e-08 | +| 50% | -0.0066 | -0.0066 | 0.0000 | 0.0000 | +| 75% | 0.0022 | 0.0022 | 1.00e-08 | 2.10e-08 | +| max | 7.7511 | 7.7511 | 1.5023 | 3.1533 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 0.9999 +- **N observations**: 861,502 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -9.09e-07 | 4.94e-06 | -0.1842 | 0.854 | +| Slope | 1.0000 | 1.03e-05 | 96806.9915 | 0.000 | + 
+**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 153/861502 (0.018%) +- Stata standard deviation: 4.76e-01 + +--- + +### Spinoff + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 4,047,630 +- Python: 4,047,630 +- Common: 4,047,630 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 0.00e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 4.05e+06 | 4.05e+06 | 4.05e+06 | 4.05e+06 | +| mean | 0.0253 | 0.0253 | 0.0000 | 0.0000 | +| std | 0.1571 | 0.1571 | 0.0000 | 0.0000 | +| min | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 25% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| max | 1.0000 | 1.0000 | 0.0000 | 0.0000 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 4,047,630 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 3.75e-13 | 1.18e-15 | 319.4918 | 0.000 | +| Slope | 1.0000 | 7.38e-15 | 1.35e+14 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/4047630 (0.000%) +- Stata standard deviation: 1.57e-01 + +--- + +### SurpriseRD + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.50% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat 
check: NA + +**Observations**: +- Stata: 1,545,193 +- Python: 1,552,935 +- Common: 1,545,193 + +**Precision1**: 0.005% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 0.00e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.55e+06 | 1.55e+06 | 1.55e+06 | 1.55e+06 | +| mean | 0.2861 | 0.2860 | -3.88e-05 | -8.59e-05 | +| std | 0.4519 | 0.4519 | 0.0074 | 0.0163 | +| min | 0.0000 | 0.0000 | -1.0000 | -2.2127 | +| 25% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 1.0000 | 1.0000 | 0.0000 | 0.0000 | +| max | 1.0000 | 1.0000 | 1.0000 | 2.2127 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 0.9998 * stata +- **R-squared**: 0.9997 +- **N observations**: 1,545,193 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.09e-05 | 7.02e-06 | 1.5497 | 0.121 | +| Slope | 0.9998 | 1.31e-05 | 76184.5432 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 84/1545193 (0.005%) +- Stata standard deviation: 4.52e-01 + +--- + +### Tax + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.09% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,211,651 +- Python: 3,214,396 +- Common: 3,211,651 + +**Precision1**: 0.357% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 5.26e-02 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | 
+|------------|----------------|----------------|----------------|----------------| +| count | 3.21e+06 | 3.21e+06 | 3.21e+06 | 3.21e+06 | +| mean | 1.1689 | 1.1693 | 4.25e-04 | 2.23e-05 | +| std | 19.0252 | 19.0298 | 0.4189 | 0.0220 | +| min | -2742.5000 | -2742.5000 | -1.0000 | -0.0526 | +| 25% | 0.0341 | 0.0303 | -1.62e-08 | -8.52e-10 | +| 50% | 1.0000 | 1.0000 | 0.0000 | 0.0000 | +| 75% | 1.4267 | 1.4286 | 3.68e-11 | 1.93e-12 | +| max | 4463.7114 | 4463.7113 | 193.2381 | 10.1570 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0004 + 1.0000 * stata +- **R-squared**: 0.9995 +- **N observations**: 3,211,651 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 4.25e-04 | 2.34e-04 | 1.8143 | 0.070 | +| Slope | 1.0000 | 1.23e-05 | 81391.9517 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 11460/3211651 (0.357%) +- Stata standard deviation: 1.90e+01 + +--- + +### TotalAccruals + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.53% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,141,468 +- Python: 3,158,229 +- Common: 3,141,468 + +**Precision1**: 0.002% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.31e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.14e+06 | 3.14e+06 | 3.14e+06 | 3.14e+06 | +| mean | 9.92e-04 | 9.93e-04 | 6.76e-07 | 8.65e-07 | +| std | 0.7815 | 0.7815 | 5.68e-04 | 7.27e-04 | +| min | -161.4356 | -161.4356 | -0.0902 | -0.1155 | +| 25% | -0.0461 | -0.0461 | -2.74e-09 | -3.51e-09 | +| 
50% | 0.0111 | 0.0111 | -2.37e-12 | -3.03e-12 | +| 75% | 0.0655 | 0.0655 | 2.70e-09 | 3.46e-09 | +| max | 190.7895 | 190.7895 | 0.2643 | 0.3381 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,141,468 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 6.76e-07 | 3.21e-07 | 2.1074 | 0.035 | +| Slope | 1.0000 | 4.10e-07 | 2.44e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 72/3141468 (0.002%) +- Stata standard deviation: 7.82e-01 + +--- + +### TrendFactor + +**Status**: ✅ PASSED (with override) + +**Override Applied**: +- Reviewed on: 2025-08-28 +- Reviewed by: ac +- Details: Given how complicated this predictor is, with many statistical calculations and daily data, I'm accepting the 25.1% Precision1 failure rate. This seems like reasonable numerical precision with so many calculations. + +The long-short t-stat is almost identical to the 2024 10 release. + +I also nearly went crazy trying to replicate the exact details of Stata, including the exact quantile function, and Stata's handling of collinearity regressors. 
+ +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ❌ FAILED +- Test 4 - Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,058,231 +- Python: 2,058,231 +- Common: 2,058,231 + +**Precision1**: 25.129% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 9.08e-01 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.06e+06 | 2.06e+06 | 2.06e+06 | 2.06e+06 | +| mean | 0.2096 | 0.2089 | -7.75e-04 | -0.0050 | +| std | 0.1540 | 0.1547 | 0.0237 | 0.1539 | +| min | -1.0711 | -1.0711 | -0.2349 | -1.5251 | +| 25% | 0.1242 | 0.1256 | -4.61e-08 | -2.99e-07 | +| 50% | 0.2187 | 0.2184 | 1.35e-09 | 8.77e-09 | +| 75% | 0.3000 | 0.2978 | 5.79e-08 | 3.76e-07 | +| max | 3.2757 | 3.2757 | 0.3254 | 2.1131 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0007 + 0.9927 * stata +- **R-squared**: 0.9766 +- **N observations**: 2,058,231 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 7.48e-04 | 2.79e-05 | 26.8560 | 0.000 | +| Slope | 0.9927 | 1.07e-04 | 9267.5981 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 517211/2058231 (25.129%) +- Stata standard deviation: 1.54e-01 + +**Most Recent Bad Observations**: +``` + permno yyyymm python stata diff +0 10026 202412 -0.085958 0.032569 -0.118526 +1 10032 202412 -0.081422 0.035968 -0.117390 +2 10104 202412 -0.083346 0.034036 -0.117382 +3 10107 202412 -0.082053 0.038111 -0.120163 +4 10138 202412 -0.080283 0.037830 -0.118113 +5 10145 202412 -0.082692 0.036421 -0.119112 +6 10158 202412 -0.084249 0.032967 -0.117216 +7 10200 202412 
-0.082381 0.036853 -0.119233 +8 10220 202412 -0.086329 0.030636 -0.116965 +9 10252 202412 -0.083978 0.033475 -0.117454 +``` + +**Largest Differences**: +``` + permno yyyymm python stata diff +0 90505 200811 -0.040092 -0.365492 0.325401 +1 90505 200812 0.192350 -0.085925 0.278276 +2 82775 200811 -0.058161 -0.301578 0.243417 +3 14328 202407 -0.051984 0.182878 -0.234862 +4 16280 193306 1.259595 1.030139 0.229456 +5 82775 200810 -0.030812 -0.259940 0.229129 +6 13442 193209 1.082268 0.867052 0.215216 +7 75175 200807 -0.192257 -0.406882 0.214625 +8 11236 193212 0.931366 0.723557 0.207809 +9 51043 200808 -0.114697 -0.316233 0.201536 +``` + +**Largest Differences Before 1950**: +``` + permno yyyymm python stata diff +0 16280 193306 1.259595 1.030139 0.229456 +1 13442 193209 1.082268 0.867052 0.215216 +2 11236 193212 0.931366 0.723557 0.207809 +3 13864 193210 0.987406 0.790193 0.197213 +4 11252 193210 0.853703 0.660276 0.193427 +5 24512 193208 0.623597 0.430485 0.193113 +6 13426 193304 0.887916 0.697897 0.190020 +7 24512 193209 0.828814 0.641206 0.187608 +8 13864 193212 0.814815 0.628115 0.186700 +9 14509 193211 1.110083 0.929206 0.180877 +``` + +--- + +### UpRecomm + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.05% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 463,983 +- Python: 464,223 +- Common: 462,936 + +**Precision1**: 0.023% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 0.00e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 462936.0000 | 462936.0000 | 462936.0000 | 462936.0000 | +| mean | 0.3514 | 0.3514 | 0.0000 | 0.0000 | +| std | 0.4774 | 0.4774 | 
0.0151 | 0.0317 | +| min | 0.0000 | 0.0000 | -1.0000 | -2.0946 | +| 25% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 1.0000 | 1.0000 | 0.0000 | 0.0000 | +| max | 1.0000 | 1.0000 | 1.0000 | 2.0946 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0002 + 0.9995 * stata +- **R-squared**: 0.9990 +- **N observations**: 462,936 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.77e-04 | 2.76e-05 | 6.3929 | 0.000 | +| Slope | 0.9995 | 4.66e-05 | 21458.7276 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 106/462936 (0.023%) +- Stata standard deviation: 4.77e-01 + +--- + +### VarCF + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.69% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,547,003 +- Python: 2,564,607 +- Common: 2,547,003 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.40e-08 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.55e+06 | 2.55e+06 | 2.55e+06 | 2.55e+06 | +| mean | 1.3777 | 1.3777 | -5.85e-08 | -2.66e-10 | +| std | 219.5846 | 219.5846 | 3.40e-05 | 1.55e-07 | +| min | 0.0000 | 0.0000 | -0.0179 | -8.13e-05 | +| 25% | 6.28e-04 | 6.28e-04 | -6.13e-11 | -2.79e-13 | +| 50% | 0.0026 | 0.0026 | 4.83e-14 | 2.20e-16 | +| 75% | 0.0139 | 0.0139 | 6.25e-11 | 2.84e-13 | +| max | 106471.4300 | 106471.4184 | 0.0062 | 2.83e-05 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- 
**R-squared**: 1.0000 +- **N observations**: 2,547,003 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 6.41e-08 | 1.75e-08 | 3.6723 | 0.000 | +| Slope | 1.0000 | 7.95e-11 | 1.26e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/2547003 (0.000%) +- Stata standard deviation: 2.20e+02 + +--- + +### VolMkt + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has -0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 4,359,237 +- Python: 4,359,149 +- Common: 4,358,313 + +**Precision1**: 0.008% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.22e-05 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 4.36e+06 | 4.36e+06 | 4.36e+06 | 4.36e+06 | +| mean | 0.1938 | 0.1938 | 6.37e-06 | 2.97e-06 | +| std | 2.1410 | 2.1410 | 0.0054 | 0.0025 | +| min | 0.0000 | -1.56e-17 | -3.7592 | -1.7558 | +| 25% | 0.0234 | 0.0234 | -1.35e-09 | -6.33e-10 | +| 50% | 0.0593 | 0.0593 | -6.41e-13 | -3.00e-13 | +| 75% | 0.1414 | 0.1415 | 1.36e-09 | 6.34e-10 | +| max | 1330.3551 | 1330.3551 | 5.2599 | 2.4568 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 4,358,313 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 7.29e-06 | 2.58e-06 | 2.8284 | 0.005 | +| Slope | 1.0000 | 1.20e-06 | 834545.5875 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 
368/4358313 (0.008%) +- Stata standard deviation: 2.14e+00 + +--- + +### VolSD + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has -0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,922,498 +- Python: 3,922,399 +- Common: 3,921,598 + +**Precision1**: 0.010% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.53e-04 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.92e+06 | 3.92e+06 | 3.92e+06 | 3.92e+06 | +| mean | 5.3349 | 5.3351 | 2.56e-04 | 6.62e-06 | +| std | 38.6789 | 38.6790 | 0.0517 | 0.0013 | +| min | 0.0000 | 4.33e-10 | -4.1070 | -0.1062 | +| 25% | 0.0640 | 0.0640 | -1.11e-16 | -2.87e-18 | +| 50% | 0.3696 | 0.3697 | 0.0000 | 0.0000 | +| 75% | 2.1897 | 2.1900 | 1.25e-16 | 3.23e-18 | +| max | 6121.4561 | 6121.4561 | 30.1748 | 0.7801 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0003 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,921,598 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 2.53e-04 | 2.63e-05 | 9.6092 | 0.000 | +| Slope | 1.0000 | 6.75e-07 | 1.48e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 396/3921598 (0.010%) +- Stata standard deviation: 3.87e+01 + +--- + +### VolumeTrend + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.58% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- 
Stata: 3,655,889 +- Python: 3,677,088 +- Common: 3,655,882 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.23e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.66e+06 | 3.66e+06 | 3.66e+06 | 3.66e+06 | +| mean | 0.0057 | 0.0057 | 4.01e-13 | 1.94e-11 | +| std | 0.0207 | 0.0207 | 5.81e-10 | 2.81e-08 | +| min | -0.0566 | -0.0566 | -4.20e-09 | -2.03e-07 | +| 25% | -0.0068 | -0.0068 | -2.21e-10 | -1.07e-08 | +| 50% | 0.0052 | 0.0052 | 1.43e-14 | 6.92e-13 | +| 75% | 0.0184 | 0.0184 | 2.22e-10 | 1.07e-08 | +| max | 0.0664 | 0.0664 | 4.19e-09 | 2.02e-07 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,655,882 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 4.80e-13 | 3.15e-13 | 1.5252 | 0.127 | +| Slope | 1.0000 | 1.47e-11 | 6.81e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/3655882 (0.000%) +- Stata standard deviation: 2.07e-02 + +--- + +### XFIN + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.08% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,022,290 +- Python: 3,024,594 +- Common: 3,022,290 + +**Precision1**: 0.004% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.64e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | 
+|------------|----------------|----------------|----------------|----------------| +| count | 3.02e+06 | 3.02e+06 | 3.02e+06 | 3.02e+06 | +| mean | 0.0642 | 0.0642 | -2.21e-07 | -3.58e-07 | +| std | 0.6161 | 0.6161 | 2.85e-04 | 4.63e-04 | +| min | -165.5014 | -165.5014 | -0.0876 | -0.1423 | +| 25% | -0.0373 | -0.0373 | -8.92e-10 | -1.45e-09 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 0.0785 | 0.0785 | 8.91e-10 | 1.45e-09 | +| max | 64.3333 | 64.3333 | 0.0513 | 0.0833 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,022,290 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -1.82e-07 | 1.65e-07 | -1.1048 | 0.269 | +| Slope | 1.0000 | 2.66e-07 | 3.76e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 120/3022290 (0.004%) +- Stata standard deviation: 6.16e-01 + +--- + +### betaVIX + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +1.22% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,510,758 +- Python: 3,553,481 +- Common: 3,510,758 + +**Precision1**: 0.068% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 8.15e-03 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.51e+06 | 3.51e+06 | 3.51e+06 | 3.51e+06 | +| mean | 3.88e-04 | 3.88e-04 | 8.17e-08 | 4.75e-06 | +| std | 0.0172 | 0.0172 | 1.30e-05 | 7.53e-04 | +| min | -1.5701 | -1.5701 | -0.0029 | -0.1703 | +| 25% | -0.0039 | -0.0039 | -2.31e-16 | -1.34e-14 | +| 50% | 8.83e-05 | 
8.83e-05 | 8.67e-18 | 5.04e-16 | +| 75% | 0.0046 | 0.0046 | 3.73e-16 | 2.17e-14 | +| max | 1.9930 | 1.9930 | 0.0033 | 0.1944 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,510,758 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 8.70e-08 | 6.91e-09 | 12.5862 | 0.000 | +| Slope | 1.0000 | 4.02e-07 | 2.49e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 2386/3510758 (0.068%) +- Stata standard deviation: 1.72e-02 + +--- + +### cfp + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.04% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,613,997 +- Python: 2,614,939 +- Common: 2,613,997 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.83e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.61e+06 | 2.61e+06 | 2.61e+06 | 2.61e+06 | +| mean | -0.0066 | -0.0066 | -3.31e-07 | -1.74e-07 | +| std | 1.8996 | 1.8996 | 2.16e-04 | 1.14e-04 | +| min | -1800.0656 | -1800.0656 | -0.1876 | -0.0988 | +| 25% | -0.0438 | -0.0438 | -1.93e-09 | -1.02e-09 | +| 50% | 0.0402 | 0.0402 | 3.71e-13 | 1.95e-13 | +| 75% | 0.1136 | 0.1136 | 1.94e-09 | 1.02e-09 | +| max | 574.5096 | 574.5097 | 2.59e-05 | 1.36e-05 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 2,613,997 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | 
+|-------------|--------------|--------------|-------------|----------| +| Intercept | -3.31e-07 | 1.34e-07 | -2.4741 | 0.013 | +| Slope | 1.0000 | 7.04e-08 | 1.42e+07 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 7/2613997 (0.000%) +- Stata standard deviation: 1.90e+00 + +--- + +### dCPVolSpread + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +1.32% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 851,720 +- Python: 862,966 +- Common: 850,981 + +**Precision1**: 0.016% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 8.60e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 850981.0000 | 850981.0000 | 850981.0000 | 850981.0000 | +| mean | -8.04e-04 | -7.98e-04 | 6.12e-06 | 1.05e-05 | +| std | 0.5813 | 0.5813 | 0.0048 | 0.0083 | +| min | -12.5800 | -12.5800 | -0.6180 | -1.0631 | +| 25% | -0.0233 | -0.0233 | -1.50e-08 | -2.58e-08 | +| 50% | -1.30e-05 | -1.30e-05 | 0.0000 | 0.0000 | +| 75% | 0.0230 | 0.0230 | 1.50e-08 | 2.58e-08 | +| max | 12.5655 | 12.5655 | 3.4017 | 5.8519 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 0.9999 +- **N observations**: 850,981 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 6.10e-06 | 5.25e-06 | 1.1632 | 0.245 | +| Slope | 1.0000 | 9.03e-06 | 110782.6852 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 136/850981 (0.016%) +- Stata standard deviation: 5.81e-01 + +--- + +### dNoa + +**Status**: ✅ PASSED + 
+**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.06% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,194,445 +- Python: 3,196,266 +- Common: 3,194,445 + +**Precision1**: 0.007% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.75e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.19e+06 | 3.19e+06 | 3.19e+06 | 3.19e+06 | +| mean | 0.0925 | 0.0925 | 3.80e-06 | 2.55e-06 | +| std | 1.4933 | 1.4933 | 0.0014 | 9.35e-04 | +| min | -361.9091 | -361.9091 | -0.1253 | -0.0839 | +| 25% | -0.0325 | -0.0325 | -2.71e-08 | -1.82e-08 | +| 50% | 0.0308 | 0.0308 | -8.04e-13 | -5.38e-13 | +| 75% | 0.1209 | 0.1209 | 2.71e-08 | 1.82e-08 | +| max | 457.1988 | 457.1988 | 0.6600 | 0.4419 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 3,194,445 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 4.36e-06 | 7.82e-07 | 5.5684 | 0.000 | +| Slope | 1.0000 | 5.23e-07 | 1.91e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 216/3194445 (0.007%) +- Stata standard deviation: 1.49e+00 + +--- + +### dVolCall + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +1.32% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 851,720 +- Python: 862,966 +- Common: 850,981 + +**Precision1**: 0.015% obs with std_diff >= 
1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 7.56e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 850981.0000 | 850981.0000 | 850981.0000 | 850981.0000 | +| mean | 0.0163 | 0.0163 | -4.44e-06 | -8.40e-06 | +| std | 0.5288 | 0.5288 | 0.0047 | 0.0089 | +| min | -7.9158 | -7.9158 | -3.4558 | -6.5358 | +| 25% | -0.0575 | -0.0575 | -1.00e-08 | -1.89e-08 | +| 50% | -9.10e-04 | -9.10e-04 | 0.0000 | 0.0000 | +| 75% | 0.0573 | 0.0573 | 1.00e-08 | 1.89e-08 | +| max | 8.2983 | 8.2983 | 0.9739 | 1.8419 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 1.0000 * stata +- **R-squared**: 0.9999 +- **N observations**: 850,981 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -4.07e-06 | 5.08e-06 | -0.8019 | 0.423 | +| Slope | 1.0000 | 9.60e-06 | 104198.4244 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 131/850981 (0.015%) +- Stata standard deviation: 5.29e-01 + +--- + +### dVolPut + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +1.32% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 851,720 +- Python: 862,966 +- Common: 850,981 + +**Precision1**: 0.016% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 7.50e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 850981.0000 | 850981.0000 | 850981.0000 | 
850981.0000 | +| mean | 0.0155 | 0.0155 | 1.68e-06 | 3.15e-06 | +| std | 0.5333 | 0.5333 | 0.0027 | 0.0050 | +| min | -8.1177 | -8.1177 | -0.6832 | -1.2812 | +| 25% | -0.0547 | -0.0547 | -1.00e-08 | -1.88e-08 | +| 50% | -0.0011 | -0.0011 | 0.0000 | 0.0000 | +| 75% | 0.0539 | 0.0539 | 1.00e-08 | 1.88e-08 | +| max | 8.1240 | 8.1240 | 1.5933 | 2.9879 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 850,981 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.88e-06 | 2.90e-06 | 0.6491 | 0.516 | +| Slope | 1.0000 | 5.43e-06 | 184153.1154 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 132/850981 (0.016%) +- Stata standard deviation: 5.33e-01 + +--- + +### fgr5yrLag + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has -0.02% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 875,784 +- Python: 875,652 +- Common: 873,864 + +**Precision1**: 0.069% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 3.22e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 873864.0000 | 873864.0000 | 873864.0000 | 873864.0000 | +| mean | 16.7174 | 16.7174 | -3.97e-06 | -3.20e-07 | +| std | 12.4128 | 12.4123 | 0.2384 | 0.0192 | +| min | -259.1000 | -259.1000 | -31.0000 | -2.4974 | +| 25% | 10.6000 | 10.6000 | 0.0000 | 0.0000 | +| 50% | 15.0000 | 15.0000 | 0.0000 | 0.0000 | +| 75% | 20.0000 | 20.0000 | 0.0000 | 0.0000 | +| max | 473.9000 | 473.9000 | 45.0000 | 3.6253 | + 
+**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0037 + 0.9998 * stata +- **R-squared**: 0.9996 +- **N observations**: 873,864 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 0.0037 | 4.28e-04 | 8.7331 | 0.000 | +| Slope | 0.9998 | 2.05e-05 | 48669.1061 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 600/873864 (0.069%) +- Stata standard deviation: 1.24e+01 + +--- + +### grcapx + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.14% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,425,711 +- Python: 2,429,227 +- Common: 2,407,796 + +**Precision1**: 0.042% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.73e-03 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.41e+06 | 2.41e+06 | 2.41e+06 | 2.41e+06 | +| mean | 3.7290 | 3.4938 | -0.2352 | -5.96e-04 | +| std | 394.4922 | 367.2597 | 144.0547 | 0.3652 | +| min | -9061.0000 | -9061.0000 | -91202.9468 | -231.1908 | +| 25% | -0.3587 | -0.3588 | -9.08e-09 | -2.30e-11 | +| 50% | 0.1307 | 0.1308 | 0.0000 | 0.0000 | +| 75% | 0.8916 | 0.8918 | 9.27e-09 | 2.35e-11 | +| max | 141782.2000 | 141782.2000 | 1490.2660 | 3.7777 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.2619 + 0.8667 * stata +- **R-squared**: 0.8667 +- **N observations**: 2,407,796 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 0.2619 | 0.0864 | 3.0306 | 0.002 | +| Slope | 
0.8667 | 2.19e-04 | 3955.8852 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 1014/2407796 (0.042%) +- Stata standard deviation: 3.94e+02 + +--- + +### grcapx3y + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.91% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,214,095 +- Python: 2,234,287 +- Common: 2,197,378 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 9.32e-20 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.20e+06 | 2.20e+06 | 2.20e+06 | 2.20e+06 | +| mean | -4.24e+10 | -4.24e+10 | -553.6178 | -1.23e-11 | +| std | 4.50e+13 | 4.50e+13 | 248482.1794 | 5.52e-09 | +| min | -2.57e+16 | -2.57e+16 | -1.92e+08 | -4.27e-06 | +| 25% | 0.6523 | 0.6522 | -1.95e-08 | -4.32e-22 | +| 50% | 1.0687 | 1.0686 | 0.0000 | 0.0000 | +| 75% | 1.6003 | 1.6003 | 1.93e-08 | 4.28e-22 | +| max | 1.44e+16 | 1.44e+16 | 1.50e+07 | 3.34e-07 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -513.4838 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 2,197,378 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -513.4838 | 165.2024 | -3.1082 | 0.002 | +| Slope | 1.0000 | 3.67e-12 | 2.72e+11 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/2197378 (0.000%) +- Stata standard deviation: 4.50e+13 + +--- + +### hire + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.06% rows vs Stata) +- 
Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 3,496,899 +- Python: 3,499,155 +- Common: 3,496,899 + +**Precision1**: 0.010% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 3.02e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 3.50e+06 | 3.50e+06 | 3.50e+06 | 3.50e+06 | +| mean | 0.0353 | 0.0354 | 5.19e-06 | 1.88e-05 | +| std | 0.2767 | 0.2767 | 0.0040 | 0.0145 | +| min | -2.0000 | -2.0000 | -1.2440 | -4.4960 | +| 25% | -0.0274 | -0.0274 | -1.10e-09 | -3.97e-09 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 0.0997 | 0.0998 | 9.43e-10 | 3.41e-09 | +| max | 2.0000 | 2.0000 | 1.6490 | 5.9598 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 0.9999 * stata +- **R-squared**: 0.9998 +- **N observations**: 3,496,899 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 9.39e-06 | 2.17e-06 | 4.3345 | 0.000 | +| Slope | 0.9999 | 7.76e-06 | 128770.2511 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 348/3496899 (0.010%) +- Stata standard deviation: 2.77e-01 + +--- + +### iomom_cust + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has -0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,637,670 +- Python: 1,637,617 +- Common: 1,637,610 + +**Precision1**: 0.427% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.15e-02 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common 
Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.64e+06 | 1.64e+06 | 1.64e+06 | 1.64e+06 | +| mean | 1.7274 | 1.7273 | -1.71e-04 | -2.88e-05 | +| std | 5.9574 | 5.9572 | 0.0435 | 0.0073 | +| min | -50.7987 | -50.7987 | -13.2564 | -2.2252 | +| 25% | -1.4085 | -1.4112 | -6.97e-08 | -1.17e-08 | +| 50% | 1.8440 | 1.8440 | 8.59e-10 | 1.44e-10 | +| 75% | 5.1077 | 5.1077 | 7.12e-08 | 1.20e-08 | +| max | 147.0000 | 147.0000 | 17.9093 | 3.0062 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0001 + 0.9999 * stata +- **R-squared**: 0.9999 +- **N observations**: 1,637,610 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -6.21e-05 | 3.54e-05 | -1.7544 | 0.079 | +| Slope | 0.9999 | 5.71e-06 | 175156.9352 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 6997/1637610 (0.427%) +- Stata standard deviation: 5.96e+00 + +--- + +### iomom_supp + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has -0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,639,842 +- Python: 1,639,789 +- Common: 1,639,782 + +**Precision1**: 0.707% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 3.70e-02 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.64e+06 | 1.64e+06 | 1.64e+06 | 1.64e+06 | +| mean | 1.6156 | 1.6152 | -3.63e-04 | -6.94e-05 | +| std | 5.2255 | 5.2250 | 0.0235 | 0.0045 | +| min | -46.2534 | -46.2534 | -5.1255 
| -0.9809 | +| 25% | -1.0213 | -1.0228 | -6.63e-08 | -1.27e-08 | +| 50% | 1.7934 | 1.7927 | -4.03e-10 | -7.71e-11 | +| 75% | 4.5754 | 4.5744 | 6.28e-08 | 1.20e-08 | +| max | 135.8487 | 135.8487 | 6.3144 | 1.2084 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0002 + 0.9999 * stata +- **R-squared**: 1.0000 +- **N observations**: 1,639,782 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -2.01e-04 | 1.92e-05 | -10.4445 | 0.000 | +| Slope | 0.9999 | 3.51e-06 | 284855.8988 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 11589/1639782 (0.707%) +- Stata standard deviation: 5.23e+00 + +--- + +### realestate + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,448,154 +- Python: 1,448,165 +- Common: 1,448,154 + +**Precision1**: 0.040% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 8.32e-04 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 1.45e+06 | 1.45e+06 | 1.45e+06 | 1.45e+06 | +| mean | -9.58e-12 | 1.95e-06 | 1.95e-06 | 7.90e-06 | +| std | 0.2476 | 0.2476 | 8.24e-05 | 3.33e-04 | +| min | -1.6407 | -1.6407 | -5.41e-06 | -2.19e-05 | +| 25% | -0.1188 | -0.1188 | -7.52e-09 | -3.04e-08 | +| 50% | -0.0155 | -0.0155 | -2.46e-11 | -9.93e-11 | +| 75% | 0.0987 | 0.0987 | 7.47e-09 | 3.02e-08 | +| max | 56.9154 | 56.9154 | 0.0043 | 0.0172 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N 
observations**: 1,448,154 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.95e-06 | 6.85e-08 | 28.5395 | 0.000 | +| Slope | 1.0000 | 2.77e-07 | 3.61e+06 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 583/1448154 (0.040%) +- Stata standard deviation: 2.48e-01 + +--- + +### retConglomerate + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.16% rows vs Stata) +- Test 3 - Precision1 check: ❌ FAILED +- Test 4 - Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 758,394 +- Python: 759,644 +- Common: 758,382 + +**Precision1**: 1.151% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.65e-01 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 758382.0000 | 758382.0000 | 758382.0000 | 758382.0000 | +| mean | 0.0106 | 0.0106 | -8.37e-06 | -9.95e-05 | +| std | 0.0841 | 0.0840 | 0.0012 | 0.0141 | +| min | -0.8000 | -0.8000 | -0.1242 | -1.4774 | +| 25% | -0.0313 | -0.0313 | -2.78e-17 | -3.30e-16 | +| 50% | 0.0105 | 0.0105 | 0.0000 | 0.0000 | +| 75% | 0.0495 | 0.0495 | 2.78e-17 | 3.30e-16 | +| max | 4.3779 | 4.3779 | 0.1480 | 1.7604 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = -0.0000 + 0.9999 * stata +- **R-squared**: 0.9998 +- **N observations**: 758,382 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | -6.82e-06 | 1.37e-06 | -4.9871 | 0.000 | +| Slope | 0.9999 | 1.61e-05 | 61967.2954 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 8727/758382 (1.151%) +- 
Stata standard deviation: 8.41e-02 + +**Most Recent Bad Observations**: +``` + permno yyyymm python stata diff +0 10104 202412 0.056786 0.054399 0.002387 +1 10220 202412 0.044009 0.017972 0.026037 +2 10253 202412 0.056786 0.054399 0.002387 +3 10318 202412 -0.065976 -0.068694 0.002718 +4 10516 202412 -0.034073 -0.030640 -0.003433 +5 10517 202412 -0.042844 -0.046507 0.003663 +6 10547 202412 0.157909 0.156824 0.001085 +7 11292 202412 -0.065976 -0.068694 0.002718 +8 11308 202412 -0.034073 -0.030640 -0.003433 +9 11499 202412 0.044009 0.017972 0.026037 +``` + +**Largest Differences**: +``` + permno yyyymm python stata diff +0 12749 199504 0.173066 0.025096 0.147970 +1 13100 199504 0.173066 0.025096 0.147970 +2 18403 199504 0.173066 0.025096 0.147970 +3 77077 199504 0.173066 0.025096 0.147970 +4 16086 202405 -0.030832 0.093346 -0.124178 +5 20342 202405 -0.030832 0.093346 -0.124178 +6 81073 202405 -0.030832 0.093346 -0.124178 +7 16086 202410 -0.127533 -0.018281 -0.109252 +8 20342 202410 -0.127533 -0.018281 -0.109252 +9 81073 202410 -0.127533 -0.018281 -0.109252 +``` + +**Largest Differences Before 1950**: +``` +No data before 1950 +``` + +--- + +### roaq + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,490,858 +- Python: 2,490,900 +- Common: 2,490,812 + +**Precision1**: 0.018% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.06e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.49e+06 | 2.49e+06 | 2.49e+06 | 2.49e+06 | +| mean | -0.0085 | -0.0085 | 4.19e-06 | 1.42e-05 | +| std | 0.2948 | 0.2948 | 0.0026 | 
0.0090 | +| min | -58.0917 | -58.0917 | -0.5312 | -1.8018 | +| 25% | -0.0053 | -0.0053 | -2.63e-10 | -8.91e-10 | +| 50% | 0.0059 | 0.0059 | 0.0000 | 0.0000 | +| 75% | 0.0184 | 0.0184 | 2.64e-10 | 8.94e-10 | +| max | 171.9972 | 171.9972 | 1.8407 | 6.2435 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 0.9999 +- **N observations**: 2,490,812 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 4.08e-06 | 1.68e-06 | 2.4376 | 0.015 | +| Slope | 1.0000 | 5.68e-06 | 176049.8422 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 445/2490812 (0.018%) +- Stata standard deviation: 2.95e-01 + +--- + +### sfe + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.00% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 611,076 +- Python: 611,100 +- Common: 609,876 + +**Precision1**: 0.022% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 4.38e-08 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 609876.0000 | 609876.0000 | 609876.0000 | 609876.0000 | +| mean | -0.1056 | -0.1052 | 3.61e-04 | 1.91e-05 | +| std | 18.9485 | 18.9486 | 0.0463 | 0.0024 | +| min | -4241.4189 | -4241.4190 | -0.1008 | -0.0053 | +| 25% | 0.0258 | 0.0258 | -1.56e-09 | -8.21e-11 | +| 50% | 0.0642 | 0.0642 | -1.73e-18 | -9.15e-20 | +| 75% | 0.0970 | 0.0971 | 1.38e-09 | 7.29e-11 | +| max | 12.2928 | 12.2928 | 9.9914 | 0.5273 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0004 + 1.0000 * stata +- 
**R-squared**: 1.0000 +- **N observations**: 609,876 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 3.61e-04 | 5.93e-05 | 6.0906 | 0.000 | +| Slope | 1.0000 | 3.13e-06 | 319348.4277 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 132/609876 (0.022%) +- Stata standard deviation: 1.89e+01 + +--- + +### sinAlgo + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.21% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 233,503 +- Python: 233,996 +- Common: 233,503 + +**Precision1**: 0.010% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 0.00e+00 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 233503.0000 | 233503.0000 | 233503.0000 | 233503.0000 | +| mean | 0.1802 | 0.1803 | 9.85e-05 | 2.56e-04 | +| std | 0.3843 | 0.3844 | 0.0099 | 0.0258 | +| min | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 25% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 50% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| 75% | 0.0000 | 0.0000 | 0.0000 | 0.0000 | +| max | 1.0000 | 1.0000 | 1.0000 | 2.6019 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0001 + 0.9999 * stata +- **R-squared**: 0.9993 +- **N observations**: 233,503 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.20e-04 | 2.27e-05 | 5.2970 | 0.000 | +| Slope | 0.9999 | 5.34e-05 | 18711.4230 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 23/233503 (0.010%) +- 
Stata standard deviation: 3.84e-01 + +--- + +### skew1 + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +1.24% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 473,447 +- Python: 479,319 +- Common: 473,182 + +**Precision1**: 0.206% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 7.85e-02 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 473182.0000 | 473182.0000 | 473182.0000 | 473182.0000 | +| mean | 0.0514 | 0.0514 | -3.92e-06 | -4.75e-05 | +| std | 0.0824 | 0.0824 | 0.0015 | 0.0178 | +| min | -1.3802 | -1.3802 | -0.5310 | -6.4426 | +| 25% | 0.0170 | 0.0170 | -6.67e-10 | -8.09e-09 | +| 50% | 0.0381 | 0.0381 | 0.0000 | 0.0000 | +| 75% | 0.0684 | 0.0684 | 6.67e-10 | 8.09e-09 | +| max | 1.6481 | 1.6481 | 0.2257 | 2.7385 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 0.9998 * stata +- **R-squared**: 0.9997 +- **N observations**: 473,182 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 8.03e-06 | 2.51e-06 | 3.2008 | 0.001 | +| Slope | 0.9998 | 2.58e-05 | 38717.6394 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 973/473182 (0.206%) +- Stata standard deviation: 8.24e-02 + +--- + +### std_turn + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +1.58% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 2,166,584 +- Python: 
2,200,763 +- Common: 2,166,204 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 1.03e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 2.17e+06 | 2.17e+06 | 2.17e+06 | 2.17e+06 | +| mean | 0.2270 | 0.2270 | 2.09e-10 | 5.96e-11 | +| std | 3.5115 | 3.5115 | 7.13e-08 | 2.03e-08 | +| min | 6.96e-06 | 6.96e-06 | -1.89e-05 | -5.37e-06 | +| 25% | 0.0161 | 0.0161 | -3.90e-10 | -1.11e-10 | +| 50% | 0.0367 | 0.0367 | -2.34e-12 | -6.67e-13 | +| 75% | 0.0941 | 0.0941 | 3.74e-10 | 1.07e-10 | +| max | 711.4860 | 711.4860 | 9.53e-06 | 2.71e-06 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 2,166,204 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 3.73e-10 | 4.85e-11 | 7.6862 | 0.000 | +| Slope | 1.0000 | 1.38e-11 | 7.25e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/2166204 (0.000%) +- Stata standard deviation: 3.51e+00 + +--- + +### tang + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.07% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 1,517,431 +- Python: 1,518,487 +- Common: 1,516,951 + +**Precision1**: 0.012% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 4.90e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | 
+|------------|----------------|----------------|----------------|----------------| +| count | 1.52e+06 | 1.52e+06 | 1.52e+06 | 1.52e+06 | +| mean | 0.6912 | 0.6912 | 2.17e-06 | 1.15e-05 | +| std | 0.1888 | 0.1888 | 5.59e-04 | 0.0030 | +| min | 0.0000 | 0.0000 | -0.0029 | -0.0154 | +| 25% | 0.5956 | 0.5956 | -1.35e-08 | -7.14e-08 | +| 50% | 0.6959 | 0.6959 | -1.89e-11 | -1.00e-10 | +| 75% | 0.7977 | 0.7977 | 1.34e-08 | 7.12e-08 | +| max | 7.1460 | 7.1460 | 0.1969 | 1.0427 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 1,516,951 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.48e-05 | 1.72e-06 | 8.5747 | 0.000 | +| Slope | 1.0000 | 2.40e-06 | 415836.8367 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 180/1516951 (0.012%) +- Stata standard deviation: 1.89e-01 + +--- + +### zerotrade12M + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.05% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 4,342,889 +- Python: 4,345,044 +- Common: 4,342,889 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.61e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 4.34e+06 | 4.34e+06 | 4.34e+06 | 4.34e+06 | +| mean | 17.8741 | 17.8741 | 4.12e-08 | 1.02e-09 | +| std | 40.3912 | 40.3912 | 1.19e-06 | 2.94e-08 | +| min | 1.49e-11 | 1.49e-11 | -1.26e-05 | -3.11e-07 | +| 25% | 6.15e-08 | 6.15e-08 | -3.76e-15 | -9.31e-17 | +| 50% | 
2.22e-07 | 2.22e-07 | 3.08e-16 | 7.63e-18 | +| 75% | 10.8837 | 10.8837 | 2.37e-14 | 5.87e-16 | +| max | 251.0160 | 251.0160 | 1.25e-05 | 3.09e-07 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 4,342,889 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 7.81e-09 | 6.22e-10 | 12.5575 | 0.000 | +| Slope | 1.0000 | 1.41e-11 | 7.10e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/4342889 (0.000%) +- Stata standard deviation: 4.04e+01 + +--- + +### zerotrade1M + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.06% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 4,680,231 +- Python: 4,682,859 +- Common: 4,680,231 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.83e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 4.68e+06 | 4.68e+06 | 4.68e+06 | 4.68e+06 | +| mean | 1.5093 | 1.5093 | 8.03e-10 | 2.21e-10 | +| std | 3.6329 | 3.6329 | 1.17e-07 | 3.22e-08 | +| min | 7.34e-13 | 7.34e-13 | -1.44e-06 | -3.97e-07 | +| 25% | 1.65e-08 | 1.65e-08 | -9.05e-16 | -2.49e-16 | +| 50% | 4.69e-08 | 4.69e-08 | 1.09e-17 | 2.99e-18 | +| 75% | 0.9130 | 0.9130 | 1.08e-15 | 2.96e-16 | +| max | 20.2225 | 20.2225 | 1.44e-06 | 3.98e-07 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 4,680,231 + +| Coefficient | Estimate | Std Error | 
t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 1.71e-10 | 5.85e-11 | 2.9209 | 0.003 | +| Slope | 1.0000 | 1.49e-11 | 6.72e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/4680231 (0.000%) +- Stata standard deviation: 3.63e+00 + +--- + +### zerotrade6M + +**Status**: ✅ PASSED + +**Test Results**: +- Test 1 - Superset check: ✅ PASSED +- Test 2 - NumRows check: ✅ PASSED (Python has +0.05% rows vs Stata) +- Test 3 - Precision1 check: ✅ PASSED +- Test 4 - Precision2 check: ✅ PASSED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 4,530,678 +- Python: 4,533,091 +- Common: 4,530,678 + +**Precision1**: 0.000% obs with std_diff >= 1.00e-02 (tolerance: < 1%) + +**Precision2**: 100th percentile diff = 2.78e-07 (tolerance: < 1.00e-01) + +**Summary Statistics** (Common Observations): + +| Statistic | Stata | Python | Difference | Std Difference | +|------------|----------------|----------------|----------------|----------------| +| count | 4.53e+06 | 4.53e+06 | 4.53e+06 | 4.53e+06 | +| mean | 9.1074 | 9.1074 | 1.65e-08 | 7.91e-10 | +| std | 20.8086 | 20.8086 | 6.23e-07 | 3.00e-08 | +| min | 2.14e-11 | 2.14e-11 | -8.78e-06 | -4.22e-07 | +| 25% | 1.21e-07 | 1.21e-07 | -9.31e-15 | -4.47e-16 | +| 50% | 3.90e-07 | 3.90e-07 | 1.93e-16 | 9.25e-18 | +| 75% | 5.0000 | 5.0000 | 1.45e-14 | 6.97e-16 | +| max | 125.1831 | 125.1831 | 8.73e-06 | 4.19e-07 | + +**Regression Analysis** (Python ~ Stata): + +- **Model**: python = 0.0000 + 1.0000 * stata +- **R-squared**: 1.0000 +- **N observations**: 4,530,678 + +| Coefficient | Estimate | Std Error | t-statistic | p-value | +|-------------|--------------|--------------|-------------|----------| +| Intercept | 2.42e-09 | 3.19e-10 | 7.5938 | 0.000 | +| Slope | 1.0000 | 1.41e-11 | 7.11e+10 | 0.000 | + +**Feedback**: +- Num observations with std_diff >= TOL_DIFF_1: 0/4530678 (0.000%) +- Stata standard deviation: 2.08e+01 + +--- + +### AgeIPO + 
+**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ❌ FAILED (Python missing 0 Stata observations) +- Test 2 - NumRows check: ❌ FAILED +- Test 3 - Precision1 check: ❌ FAILED +- Test 4 - Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 0 +- Python: 353,486 +- Common: 0 + +--- + +### IndIPO + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ❌ FAILED (Python missing 0 Stata observations) +- Test 2 - NumRows check: ❌ FAILED +- Test 3 - Precision1 check: ❌ FAILED +- Test 4 - Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 0 +- Python: 4,047,630 +- Common: 0 + +--- + +### OrgCapNoAdj + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ❌ FAILED (Python missing 0 Stata observations) +- Test 2 - NumRows check: ❌ FAILED +- Test 3 - Precision1 check: ❌ FAILED +- Test 4 - Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 0 +- Python: 1,243,881 +- Common: 0 + +--- + +### RDIPO + +**Status**: ❌ FAILED + +**Test Results**: +- Test 1 - Superset check: ❌ FAILED (Python missing 0 Stata observations) +- Test 2 - NumRows check: ❌ FAILED +- Test 3 - Precision1 check: ❌ FAILED +- Test 4 - Precision2 check: ❌ FAILED +- Test 5 - T-stat check: NA + +**Observations**: +- Stata: 0 +- Python: 3,626,619 +- Common: 0 --- diff --git a/Signals/pyCode/DataDownloads/01_columns.yaml b/Signals/pyCode/DataDownloads/01_columns.yaml index 129ddce9..f8c9001d 100644 --- a/Signals/pyCode/DataDownloads/01_columns.yaml +++ b/Signals/pyCode/DataDownloads/01_columns.yaml @@ -319,8 +319,9 @@ IBESCRSPLinkingTable: int32: permno OPTIONMETRICSCRSPLinkingTable: - column_order: secid, permno, om_score - int32: secid, permno + column_order: permno, time_avail_m, secid, om_score + int32: permno, secid + datetime64[ns]: time_avail_m int8: om_score PatentDataProcessed: diff --git a/Signals/pyCode/DataDownloads/OptionMetricsCRSPLink.py 
b/Signals/pyCode/DataDownloads/OptionMetricsCRSPLink.py new file mode 100644 index 00000000..c31175a3 --- /dev/null +++ b/Signals/pyCode/DataDownloads/OptionMetricsCRSPLink.py @@ -0,0 +1,94 @@ +# %% + +# ABOUTME: OptionMetricsCRSPLink.py makes pyData/Intermediate/OPTIONMETRICSCRSPLinkingTable.parquet with columns permno, time_avail_m, secid, score, sdate_m, edate_m, volume +# ABOUTME: the extra columns are for record keeping +""" +Usage: + python3 DataDownloads/OptionMetricsCRSPLink.py + +Outputs: + - pyData/Intermediate/OPTIONMETRICSCRSPLinkingTable.parquet + +Currently, even though the option volume data isn't actually helping with deduplication, we're keeping it around for clarity in case it's useful later (2025-09-02) +""" + +import os +import pandas as pd +from pathlib import Path +from dotenv import load_dotenv +from sqlalchemy import create_engine +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) +from utils.column_standardizer_yaml import standardize_columns + +load_dotenv() + + +# %% load data + +print("Processing CRSP-OptionMetrics data...") + +engine = create_engine( + f"postgresql://{os.getenv('WRDS_USERNAME')}:{os.getenv('WRDS_PASSWORD')}@wrds-pgdata.wharton.upenn.edu:9737/wrds" +) + +# Query WRDS database for OptionMetrics linking data +QUERY = """ +SELECT secid, sdate, edate, permno, score +FROM wrdsapps_link_crsp_optionm.opcrsphist as a +WHERE permno is not null +""" + +# Read from WRDS database +omlink0 = pd.read_sql_query(QUERY, engine) +print(f"Loaded {len(omlink0)} OptionMetrics linking records from WRDS") + +engine.dispose() + +# load CRSP monthly data +crspm = pd.read_parquet("../pyData/Intermediate/monthlyCRSP.parquet", columns=['permno', 'time_avail_m']) + +# %% Convert link dates to monthly +omlink = omlink0.copy() + +# start date -> same month +omlink['sdate_m'] = pd.to_datetime(omlink['sdate']).dt.to_period('M').dt.to_timestamp() + +# end date -> previous month (since monthly dates assume end of the 
month) +omlink["edate_m"] = ( + pd.to_datetime(omlink["edate"]).dt.to_period("M") - 1 +).dt.to_timestamp() + +omlink.drop(columns=['sdate', 'edate'], inplace=True) + +# %% join data + +# full join +df0 = omlink.merge(crspm, on=['permno'], how='outer').query( + "secid.notna()" # keep if a link exists +).query( + "time_avail_m >= sdate_m & time_avail_m <= edate_m" # keep if link date is valid +) + +print(f'joined om-crsp link, crspm, and option volume data: {len(df0)} rows') + +#%% remove duplicates + +# keep the lowest score +# (comparing old data based on Code/PrepScripts/oclink_to_csv.sas with new data, lowest score is the best) +df = df0.sort_values(['permno','time_avail_m','score']).groupby(['permno','time_avail_m']).first().reset_index() + +print(f'removed duplicates by score: {len(df)} rows') + +#%% standardize columns + +df = standardize_columns(df, "OPTIONMETRICSCRSPLinkingTable") + +#%% + +# save +df.to_parquet("../pyData/Intermediate/OPTIONMETRICSCRSPLinkingTable.parquet", index=False) + +print(f"OptionMetrics linking data saved with {len(df)} records") +print(f"Head: {df.head()}") \ No newline at end of file diff --git a/Signals/pyCode/DataDownloads/ZL_CRSPOPTIONMETRICS.py b/Signals/pyCode/DataDownloads/ZL_CRSPOPTIONMETRICS.py deleted file mode 100644 index 070725fc..00000000 --- a/Signals/pyCode/DataDownloads/ZL_CRSPOPTIONMETRICS.py +++ /dev/null @@ -1,91 +0,0 @@ -#!/usr/bin/env python3 -""" -CRSP-OptionMetrics data script - Python equivalent of ZL_CRSPOPTIONMETRICS.do - -Processes OptionMetrics data from preprocessed file. 
-""" - -import os -import pandas as pd -from pathlib import Path -from dotenv import load_dotenv -import sys -import os -sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) -from config import MAX_ROWS_DL -from utils.column_standardizer_yaml import standardize_columns - -load_dotenv() - -def main(): - """Process CRSP-OptionMetrics data""" - print("Processing CRSP-OptionMetrics data...") - - # Ensure directories exist - os.makedirs("../pyData/Intermediate", exist_ok=True) - - # Check for oclink.csv in Prep folder - om_path = Path("../pyData/Prep/oclink.csv") - - if om_path.exists(): - # Read the OptionMetrics linking file - om_data = pd.read_csv(om_path) - print(f"Loaded {len(om_data)} OptionMetrics linking records") - - # Keep only records with score <= 6 (good matches) - if 'SCORE' in om_data.columns: - om_data = om_data[om_data['SCORE'] <= 6] - print(f"After filtering for score <= 6: {len(om_data)} records") - - # Rename columns to match expected output - column_mapping = { - 'PERMNO': 'permno', - 'SCORE': 'om_score' - } - om_data = om_data.rename(columns=column_mapping) - - # Keep only required columns - required_cols = ['secid', 'permno', 'om_score'] - available_cols = [col for col in required_cols if col in om_data.columns] - om_data = om_data[available_cols] - - # Keep best match (lowest score) per permno - if 'om_score' in om_data.columns and 'permno' in om_data.columns: - om_data = om_data.sort_values('om_score').groupby('permno').first().reset_index() - - # Standardize columns to match DTA file - om_data = standardize_columns(om_data, "OPTIONMETRICSCRSPLinkingTable") - - # Save processed data - om_data.to_parquet("../pyData/Intermediate/OPTIONMETRICSCRSPLinkingTable.parquet", index=False) - - print(f"OptionMetrics linking data saved with {len(om_data)} records") - print(f"Unique permnos: {om_data['permno'].nunique()}") - - else: - print("WARNING: oclink.csv not found in ../pyData/Prep/") - print("Creating placeholder OptionMetrics linking 
data") - - # Create placeholder data - placeholder_data = pd.DataFrame({ - 'secid': [100001, 100002, 100003], - 'permno': [10001, 10002, 10003], - 'om_score': [1, 2, 3] - }) - - # Apply row limit for debugging if configured - if MAX_ROWS_DL > 0: - placeholder_data = placeholder_data.head(MAX_ROWS_DL) - print(f"DEBUG MODE: Limited to {MAX_ROWS_DL} rows") - - # Standardize columns to match DTA file - placeholder_data = standardize_columns(placeholder_data, "OPTIONMETRICSCRSPLinkingTable") - - # Save the data - placeholder_data.to_parquet("../pyData/Intermediate/OPTIONMETRICSCRSPLinkingTable.parquet", index=False) - print(f"Placeholder OptionMetrics linking data saved with {len(placeholder_data)} records") - - print("CRSP-OptionMetrics processing completed") - -if __name__ == "__main__": - main() diff --git a/Signals/pyCode/Predictors/sinAlgo.py b/Signals/pyCode/Predictors/sinAlgo.py index c9afe9c0..572fee3e 100644 --- a/Signals/pyCode/Predictors/sinAlgo.py +++ b/Signals/pyCode/Predictors/sinAlgo.py @@ -83,7 +83,7 @@ # DATA LOAD (Firm-level industry codes) print("Loading SignalMasterTable...") df = pd.read_parquet('../pyData/Intermediate/SignalMasterTable.parquet') -df = df[['permno', 'gvkey', 'time_avail_m', 'sicCRSP', 'shrcd', 'bh1m']].copy() +df = df[['permno', 'gvkey', 'time_avail_m', 'sicCRSP', 'shrcd']].copy() print(f"Loaded SignalMasterTable: {df.shape[0]} rows") # Add NAICS codes from Compustat annual data diff --git a/Signals/pyCode/SignalMasterTable.py b/Signals/pyCode/SignalMasterTable.py index 59b7c19e..a99440a2 100644 --- a/Signals/pyCode/SignalMasterTable.py +++ b/Signals/pyCode/SignalMasterTable.py @@ -1,169 +1,139 @@ -# ABOUTME: SignalMasterTable.py - creates monthly master table with firm identifiers and meta information -# ABOUTME: Direct line-by-line translation from Stata SignalMasterTable.do script +# ABOUTME: Creates the "backbone" used in most predictors. +# ABOUTME: Incorporates basic info from monthly CRSP and annual Compustat. 
+""" +Inputs: + - monthlyCRSP.parquet + - m_aCompustat.parquet + - IBESCRSPLinkingTable.parquet (optional) + - OPTIONMETRICSCRSPLinkingTable.parquet (optional) +Outputs: + - SignalMasterTable.parquet + +""" import pandas as pd import numpy as np from pathlib import Path -import os - -def main(): - """ - SignalMasterTable - Holds monthly list of firms with identifiers and some meta information - """ - - print("Starting SignalMasterTable.py...") - - # DATA LOAD - print("Loading monthly CRSP data...") - - # Start with monthly CRSP - equivalent to Stata: u permno ticker exchcd shrcd time_avail_m mve_c prc ret sicCRSP using monthlyCRSP - monthlyCRSP_path = Path("../pyData/Intermediate/monthlyCRSP.parquet") - if not monthlyCRSP_path.exists(): - raise FileNotFoundError(f"Required input file not found: {monthlyCRSP_path}") - - df = pd.read_parquet(monthlyCRSP_path) - - # Keep only the columns we need (equivalent to Stata's 'using' with specific variables) - required_cols = ['permno', 'ticker', 'exchcd', 'shrcd', 'time_avail_m', 'mve_c', 'prc', 'ret', 'sicCRSP'] - missing_cols = [col for col in required_cols if col not in df.columns] - if missing_cols: - raise ValueError(f"Missing required columns in monthlyCRSP: {missing_cols}") - - df = df[required_cols].copy() - - print(f"Loaded monthlyCRSP: {df.shape[0]} rows, {df.shape[1]} columns") - - # Screen on Stock market information: common stocks and major exchanges - print("Filtering for common stocks and major exchanges...") - - # keep if (shrcd == 10 | shrcd == 11 | shrcd == 12) & (exchcd == 1 | exchcd == 2 | exchcd == 3) - df = df[(df['shrcd'].isin([10, 11, 12])) & (df['exchcd'].isin([1, 2, 3]))].copy() - - print(f"After filtering: {df.shape[0]} rows") - - # Merge with Compustat monthly data - print("Merging with m_aCompustat...") - - # merge 1:1 permno time_avail_m using m_aCompustat, keepusing(gvkey sic) keep(master match) nogenerate - m_aCompustat_path = Path("../pyData/Intermediate/m_aCompustat.parquet") - if not 
m_aCompustat_path.exists(): - raise FileNotFoundError(f"Required input file not found: {m_aCompustat_path}") - - compustat_df = pd.read_parquet(m_aCompustat_path) - - # Keep only the columns we need - comp_cols = ['permno', 'time_avail_m', 'gvkey', 'sic'] - missing_comp_cols = [col for col in comp_cols if col not in compustat_df.columns] - if missing_comp_cols: - raise ValueError(f"Missing required columns in m_aCompustat: {missing_comp_cols}") - - compustat_df = compustat_df[comp_cols].copy() - - # Merge (left join to keep all CRSP observations) - df = df.merge(compustat_df, on=['permno', 'time_avail_m'], how='left') - - print(f"After Compustat merge: {df.shape[0]} rows") - - # rename sic sicCS - df = df.rename(columns={'sic': 'sicCS'}) - - # Standardize sicCS string format to match Stata (handle None -> empty string) - df['sicCS'] = df['sicCS'].fillna('') - - # add some auxiliary vars and clean up - print("Adding auxiliary variables...") - - # gen NYSE = exchcd == 1 - df['NYSE'] = (df['exchcd'] == 1).astype(int) - - # xtset permno time_avail_m and gen bh1m = f.ret (Future buy and hold return) - df = df.sort_values(['permno', 'time_avail_m']) - df['bh1m'] = df.groupby('permno')['ret'].shift(-1) - - # keep gvkey permno ticker time_avail_m ret bh1m mve_c prc NYSE exchcd shrcd sicCS sicCRSP - keep_cols = ['gvkey', 'permno', 'ticker', 'time_avail_m', 'ret', 'bh1m', 'mve_c', 'prc', 'NYSE', 'exchcd', 'shrcd', 'sicCS', 'sicCRSP'] - df = df[keep_cols].copy() - - # Fix data types to match Stata output - df['exchcd'] = df['exchcd'].astype('int8') - df['shrcd'] = df['shrcd'].astype('int8') - df['sicCRSP'] = df['sicCRSP'].astype('int16') - df['NYSE'] = df['NYSE'].astype('int8') - df['bh1m'] = df['bh1m'].astype('float32') - - # Comprehensive string column cleanup to match Stata format (handle None -> empty string) - string_columns = ['ticker', 'sicCS'] - for col in string_columns: - if col in df.columns: - df[col] = df[col].fillna('') - - print(f"After adding auxiliary vars: 
{df.shape[0]} rows, {df.shape[1]} columns") - - # Add IBES ticker (if available) - print("Checking for IBES-CRSP linking table...") - - IBESCRSPLink_path = Path("../pyData/Intermediate/IBESCRSPLinkingTable.parquet") - if IBESCRSPLink_path.exists(): - print("Adding IBES-CRSP link...") - - # merge m:1 permno using IBESCRSPLinkingTable, keep(master match) nogenerate - ibes_link = pd.read_parquet(IBESCRSPLink_path) - - # Merge on permno (many-to-one) - df = df.merge(ibes_link, on='permno', how='left') - - # Standardize IBES ticker string format to match Stata (handle None -> empty string) - if 'tickerIBES' in df.columns: - df['tickerIBES'] = df['tickerIBES'].fillna('') - - print(f"After IBES link merge: {df.shape[0]} rows, {df.shape[1]} columns") - else: - print("Not adding IBES-CRSP link. Some signals cannot be generated.") - - # Add OptionMetrics secid (if available) - print("Checking for OptionMetrics-CRSP linking table...") - - OptionMetricsLink_path = Path("../pyData/Intermediate/OPTIONMETRICSCRSPLinkingTable.parquet") - if OptionMetricsLink_path.exists(): - print("Adding OptionMetrics-CRSP link...") - - # merge m:1 permno using OPTIONMETRICSCRSPLinkingTable, keep(master match) nogenerate - om_link = pd.read_parquet(OptionMetricsLink_path) - - # Merge on permno (many-to-one) - df = df.merge(om_link, on='permno', how='left') - - print(f"After OptionMetrics link merge: {df.shape[0]} rows, {df.shape[1]} columns") - else: - print("Not adding OptionMetrics-CRSP link. 
Some signals cannot be generated.") - - # reinforce sort (equivalent to xtset permno time_avail_m) - df = df.sort_values(['permno', 'time_avail_m']) - - # Reorder columns to match Stata output exactly - stata_column_order = ['permno', 'ret', 'prc', 'shrcd', 'exchcd', 'sicCRSP', 'ticker', 'time_avail_m', 'mve_c', 'gvkey', 'sicCS', 'NYSE', 'bh1m', 'tickerIBES', 'secid', 'om_score'] - available_cols = [col for col in stata_column_order if col in df.columns] - extra_cols = [col for col in df.columns if col not in stata_column_order] - final_cols = available_cols + extra_cols - df = df[final_cols] - - # SAVE - print("Saving SignalMasterTable...") - - # Create output directory if it doesn't exist - output_dir = Path("../pyData/Intermediate/") - output_dir.mkdir(parents=True, exist_ok=True) - - # Save as parquet (equivalent to Stata's save) - output_path = output_dir / "SignalMasterTable.parquet" - df.to_parquet(output_path, index=False) - - print(f"SignalMasterTable saved to: {output_path}") - print(f"Final shape: {df.shape[0]} rows, {df.shape[1]} columns") - print(f"Column names: {list(df.columns)}") - - return df - -if __name__ == "__main__": - main() \ No newline at end of file + + +print("Starting SignalMasterTable.py...") + +# DATA LOAD +print("Loading monthly CRSP data...") + +# Start with monthly CRSP +df = pd.read_parquet('../pyData/Intermediate/monthlyCRSP.parquet', + columns=['permno', 'ticker', 'exchcd', 'shrcd', 'time_avail_m', 'mve_c', 'prc', 'ret', 'sicCRSP']) + +print(f"Loaded monthlyCRSP: {df.shape[0]} rows, {df.shape[1]} columns") + +# Screen on Stock market information: common stocks and major exchanges +# TBC: remove and use this filter as default in SignalDoc.csv +print("Filtering for common stocks and major exchanges...") + +# keep if (shrcd == 10 | shrcd == 11 | shrcd == 12) & (exchcd == 1 | exchcd == 2 | exchcd == 3) +df = df[(df['shrcd'].isin([10, 11, 12])) & (df['exchcd'].isin([1, 2, 3]))].copy() + +print(f"After filtering: {df.shape[0]} rows") + 
+# Merge with Compustat monthly data +print("Merging with m_aCompustat...") +compustat_df = pd.read_parquet('../pyData/Intermediate/m_aCompustat.parquet', + columns=['permno', 'time_avail_m', 'gvkey', 'sic']) + +# Merge (left join to keep all CRSP observations) +df = df.merge(compustat_df, on=['permno', 'time_avail_m'], how='left') + +print(f"After Compustat merge: {df.shape[0]} rows") + +# rename sic sicCS +df = df.rename(columns={'sic': 'sicCS'}) + +# Standardize sicCS string format to match Stata (handle None -> empty string) +df['sicCS'] = df['sicCS'].fillna('') + +# add some auxiliary vars and clean up +print("Adding auxiliary variables...") + +# gen NYSE = exchcd == 1 +df['NYSE'] = (df['exchcd'] == 1).astype(int) + +# Fix data types to match Stata output +# TBC: clean this up +df['exchcd'] = df['exchcd'].astype('int8') +df['shrcd'] = df['shrcd'].astype('int8') +df['sicCRSP'] = df['sicCRSP'].astype('int16') +df['NYSE'] = df['NYSE'].astype('int8') + +# Comprehensive string column cleanup to match Stata format (handle None -> empty string) +string_columns = ['ticker', 'sicCS'] +for col in string_columns: + if col in df.columns: + df[col] = df[col].fillna('') + +print(f"After adding auxiliary vars: {df.shape[0]} rows, {df.shape[1]} columns") + +# === Optional Columns === + +# Add IBES ticker (if available) +print("Checking for IBES-CRSP linking table...") + +IBESCRSPLink_path = Path("../pyData/Intermediate/IBESCRSPLinkingTable.parquet") +if IBESCRSPLink_path.exists(): + print("Adding IBES-CRSP link...") + ibes_link = pd.read_parquet( + IBESCRSPLink_path, columns=["permno", "tickerIBES"] + ) + df = df.merge(ibes_link, on=['permno'], how='left') + + # Standardize IBES ticker string format to match Stata (handle None -> empty string) + if 'tickerIBES' in df.columns: + df['tickerIBES'] = df['tickerIBES'].fillna('') + + print(f"After IBES link merge: {df.shape[0]} rows, {df.shape[1]} columns") +else: + print("Not adding IBES-CRSP link. 
Some signals cannot be generated.") + df['tickerIBES'] = '' + +# Add OptionMetrics secid (if available) +print("Checking for OptionMetrics-CRSP linking table...") + +OptionMetricsLink_path = Path( + "../pyData/Intermediate/OPTIONMETRICSCRSPLinkingTable.parquet" +) +if OptionMetricsLink_path.exists(): + print("Adding OptionMetrics-CRSP link...") + + om_link = pd.read_parquet( + OptionMetricsLink_path, columns=["permno", "time_avail_m", "secid"] + ) + + df = df.merge(om_link, on=["permno", "time_avail_m"], how="left") + + print(f"After OptionMetrics link merge: {df.shape[0]} rows, {df.shape[1]} columns") +else: + print("Not adding OptionMetrics-CRSP link. Some signals cannot be generated.") + df['secid'] = np.nan + +# reinforce sort (equivalent to xtset permno time_avail_m) +df = df.sort_values(['permno', 'time_avail_m']) + +# Reorder columns: ['permno', 'time_avail_m'] + everything else +main_cols = ['permno', 'time_avail_m'] +other_cols = [col for col in df.columns if col not in main_cols] +df = df[main_cols + other_cols] + +# SAVE +print("Saving SignalMasterTable...") + +# Create output directory if it doesn't exist +output_dir = Path("../pyData/Intermediate/") +output_dir.mkdir(parents=True, exist_ok=True) + +# Save as parquet (equivalent to Stata's save) +output_path = output_dir / "SignalMasterTable.parquet" +df.to_parquet(output_path, index=False) + +print(f"SignalMasterTable saved to: {output_path}") +print(f"Final shape: {df.shape[0]} rows, {df.shape[1]} columns") +print(f"Head: {df.head()}") \ No newline at end of file diff --git a/Signals/pyCode/utils/TestPortFocused.R b/Signals/pyCode/utils/TestPortFocused.R index dade8d65..823c256f 100644 --- a/Signals/pyCode/utils/TestPortFocused.R +++ b/Signals/pyCode/utils/TestPortFocused.R @@ -1,3 +1,7 @@ +#%% + +args = commandArgs(trailingOnly=TRUE) + #%% ==== # From master.R pathProject = '/Users/chen1678/Library/CloudStorage/Dropbox/oap-ac/CrossSection/' @@ -7,9 +11,6 @@ 
setwd(paste0(pathProject,'Portfolios/Code/')) source('00_SettingsAndTools.R', echo=T) source('01_PortfolioFunction.R', echo=T) -# Get command line arguments -args <- commandArgs(trailingOnly = TRUE) - # Default values quickrun = T # use T if you want to run quickly for testing quickrunlist = c('OptionVolume1','OptionVolume2') # default list @@ -21,7 +22,7 @@ if (length(args) > 0) { skipdaily = T # use T to skip daily CRSP which is very slow feed.verbose = F # use T if you want lots of feedback -# ENVIRONMENT AND DATA ==== +# ENVIRONMENT AND DATA crspinfo = read.fst( paste0(pathProject,'Portfolios/Data/Intermediate/crspminfo.fst') ) %>% # me, screens, @@ -55,7 +56,7 @@ if (dim(missing)[1]>0){ if (temp=='quit'){print('erroring out'); stop()} } -# BASE PORTS ==== +# BASE PORTS port <- loop_over_strategies( strategylist0 ) @@ -91,4 +92,23 @@ print('===============================================') print('\n\n New vs old portfolios:') print(new_vs) +#%% +# save to md file + +# first just t-stats +tstat = new_vs %>% filter(metric == 'tstat') + +txt <- capture.output(print(format(tstat, justify = "right"), row.names = FALSE)) +writeLines(txt, paste0(pathProject, 'Signals/Logs/TestOutPortFocused.md')) + +# then all +cat('\n\n All metrics:\n', file = paste0(pathProject, 'Signals/Logs/TestOutPortFocused.md'), append = TRUE) + + +txt <- capture.output(print(format(new_vs, justify = "right"), row.names = FALSE)) +cat( + paste0(txt, '\n'), + file = paste0(pathProject, 'Signals/Logs/TestOutPortFocused.md'), + append = TRUE) + #%% \ No newline at end of file