"""PEAD R+1 to R+5 LONG analysis on top-N gap-up days as earnings proxy.""" import pandas as pd import numpy as np import json GAPS = 'C:/datum-api-examples-main/enrichment_data/daily_4yr_gaps.parquet' OHLCV = 'C:/datum-api-examples-main/enrichment_data/daily_4yr_ohlcv.parquet' UNIVERSE = 'C:/datum-api-examples-main/universe_adv_2m.csv' print('Loading universe...') uni = pd.read_csv(UNIVERSE) uni_tickers = set(uni['ticker'].tolist()) print(f' universe: {len(uni_tickers)} tickers') print('Loading gaps...') gaps = pd.read_parquet(GAPS) gaps = gaps[gaps['ticker'].isin(uni_tickers)] print(f' gaps in universe: {len(gaps)}') print('Loading OHLCV...') ohlcv = pd.read_parquet(OHLCV) ohlcv = ohlcv[ohlcv['ticker'].isin(uni_tickers)] ohlcv = ohlcv[(ohlcv['open'] > 0) & (ohlcv['close'] > 0)] print(f' ohlcv rows: {len(ohlcv)}') # Merge gap with same-day open/close df = ohlcv.merge(gaps, on=['date', 'ticker'], how='left') df = df.sort_values(['ticker', 'date']).reset_index(drop=True) # Compute forward closes per ticker df['close_p1'] = df.groupby('ticker')['close'].shift(-1) df['close_p2'] = df.groupby('ticker')['close'].shift(-2) df['close_p3'] = df.groupby('ticker')['close'].shift(-3) df['close_p5'] = df.groupby('ticker')['close'].shift(-5) # Returns from CURRENT close (entry assumed = R0 close) df['r1'] = (df['close_p1'] / df['close'] - 1) * 100 df['r2'] = (df['close_p2'] / df['close'] - 1) * 100 df['r3'] = (df['close_p3'] / df['close'] - 1) * 100 df['r5'] = (df['close_p5'] / df['close'] - 1) * 100 df['close_above_open'] = df['close'] > df['open'] # Top-N gap proxy for earnings (no earnings flag in data) # All reports proxy: gap >= +5% mask_all = (df['gap_pct'] >= 5.0) & (df['gap_pct'] <= 50.0) # cap to exclude penny noise mask_strong = (df['gap_pct'] >= 10.0) & (df['gap_pct'] <= 50.0) & (df['close_above_open']) print(f'\nALL reports (gap >= +5%, capped 50%): N = {mask_all.sum()}') print(f'STRONG reports (gap >= +10% AND close>open): N = {mask_strong.sum()}') def stats(sub, label): out = {'label': label, 'N': int(len(sub))} for h in ['r1','r2','r3','r5']: s = sub[h].dropna() out[f'{h}_n'] = int(len(s)) out[f'{h}_mean_pct'] = round(float(s.mean()), 4) out[f'{h}_wr'] = round(float((s > 0).mean()), 4) out[f'{h}_median_pct'] = round(float(s.median()), 4) return out all_r = stats(df[mask_all], 'all_reports') strong_r = stats(df[mask_strong], 'strong_reports') # Mid-band: gap +5-10% mixed mask_mid = (df['gap_pct'] >= 5.0) & (df['gap_pct'] < 10.0) mid_r = stats(df[mask_mid], 'mid_reports_gap_5_10') print(json.dumps([all_r, strong_r, mid_r], indent=2)) with open('C:/Users/wsu/Downloads/viz/_pead_intermediate.json', 'w') as f: json.dump([all_r, strong_r, mid_r], f, indent=2) print('Saved intermediate.')