"""PEAD R+1 to R+5 LONG analysis — clean version with split filter.""" import pandas as pd import numpy as np import json GAPS = 'C:/datum-api-examples-main/enrichment_data/daily_4yr_gaps.parquet' OHLCV = 'C:/datum-api-examples-main/enrichment_data/daily_4yr_ohlcv.parquet' UNIVERSE = 'C:/datum-api-examples-main/universe_adv_2m.csv' uni = pd.read_csv(UNIVERSE) uni_tickers = set(uni['ticker'].tolist()) gaps = pd.read_parquet(GAPS) gaps = gaps[gaps['ticker'].isin(uni_tickers)] ohlcv = pd.read_parquet(OHLCV) ohlcv = ohlcv[ohlcv['ticker'].isin(uni_tickers)] ohlcv = ohlcv[(ohlcv['open'] > 0) & (ohlcv['close'] > 0)] df = ohlcv.merge(gaps, on=['date','ticker'], how='left') df = df.sort_values(['ticker','date']).reset_index(drop=True) # Price filter: prev close >=$5 means open >= ~$5 too if gap moderate # Use current open as proxy for "trading >= $5" df = df[df['open'] >= 5.0].copy() # Forward closes g = df.groupby('ticker') df['close_p1'] = g['close'].shift(-1) df['close_p2'] = g['close'].shift(-2) df['close_p3'] = g['close'].shift(-3) df['close_p5'] = g['close'].shift(-5) # Split-detector: huge gap + huge volume often = split-adjustment artifact # Use gap_pct directly cap at 30% (real earnings rarely > 30% gap on liquid names) # also clip per-row returns to [-25, +50] to neutralize residual splits def safe_ret(num, den): return (num / den - 1) * 100 for h, col in [('r1','close_p1'),('r2','close_p2'),('r3','close_p3'),('r5','close_p5')]: df[h] = safe_ret(df[col], df['close']) # Clip extreme outliers (split artifacts) — beyond ±50% is non-credible for next-day on $5+ ticker df[h] = df[h].clip(-50, 50) df['close_above_open'] = df['close'] > df['open'] # Restrict gap range strictly mask_all = (df['gap_pct'] >= 5.0) & (df['gap_pct'] <= 30.0) mask_strong = (df['gap_pct'] >= 10.0) & (df['gap_pct'] <= 30.0) & df['close_above_open'] mask_mid = (df['gap_pct'] >= 5.0) & (df['gap_pct'] < 10.0) mask_mid_close_pos = mask_mid & df['close_above_open'] print(f'all_reports N: {mask_all.sum()}') print(f'strong_reports N: {mask_strong.sum()}') print(f'mid_reports N: {mask_mid.sum()}') def stats(sub, label): out = {'label': label, 'N': int(len(sub))} for h in ['r1','r2','r3','r5']: s = sub[h].dropna() out[f'{h}_n'] = int(len(s)) out[f'{h}_mean_pct'] = round(float(s.mean()), 4) out[f'{h}_median_pct'] = round(float(s.median()), 4) out[f'{h}_wr'] = round(float((s > 0).mean()), 4) out[f'{h}_std_pct'] = round(float(s.std()), 4) return out results = { 'all_reports': stats(df[mask_all], 'all_reports'), 'strong_reports': stats(df[mask_strong], 'strong_reports'), 'mid_reports_gap_5_10': stats(df[mask_mid], 'mid_reports_5_10'), 'mid_close_pos': stats(df[mask_mid_close_pos], 'mid_close_pos'), } # baseline: all $5+ stocks any day, no gap filter mask_baseline = pd.Series(True, index=df.index) sample_baseline = df.sample(n=min(200000, len(df)), random_state=42) results['baseline_random'] = stats(sample_baseline, 'baseline_random_200k') print(json.dumps(results, indent=2)) with open('C:/Users/wsu/Downloads/viz/_pead_intermediate2.json','w') as f: json.dump(results, f, indent=2)