4+ years embedded in logistics and SaaS analytics. Six in-depth case studies covering real business problems, structured analytical approaches, working code, and measurable commercial outcomes.
WITH shipment_sla AS ( SELECT s.shipment_id, s.carrier_id, s.region, WIDTH_BUCKET(s.weight_kg, 0, 30, 6) AS weight_band, DAYNAME(s.shipped_at) AS day_of_week, DATEDIFF('hour', s.shipped_at, s.delivered_at) AS transit_hours, c.sla_hours, CASE WHEN s.delivered_at <= DATEADD('hour', c.sla_hours, s.shipped_at) THEN 1 ELSE 0 END AS met_sla FROM shipments s JOIN carrier_sla_contracts c ON s.carrier_id = c.carrier_id AND s.region = c.region WHERE s.shipped_at >= DATEADD(day, -90, CURRENT_DATE()) AND s.status = 'delivered' ), -- P95 transit time surfaces chronic underperformers vs. one-off spikes segmented AS ( SELECT carrier_id, region, weight_band, day_of_week, COUNT(*) AS shipments, ROUND(1 - AVG(met_sla), 3) AS miss_rate, AVG(transit_hours) AS avg_transit_h, PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY transit_hours) AS p95_transit_h, SUM(1 - met_sla) AS total_missed FROM shipment_sla GROUP BY 1,2,3,4 HAVING shipments > 50 ) SELECT *, -- Contribution to overall miss volume, not just local rate ROUND(total_missed / SUM(total_missed) OVER () * 100, 1) AS pct_of_total_misses FROM segmented ORDER BY total_missed DESC
import pandas as pd import numpy as np from sklearn.linear_model import LogisticRegression from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler from sklearn.model_selection import TimeSeriesSplit from sklearn.metrics import roc_auc_score, classification_report # ── Feature engineering ─────────────────────────────────────── def build_features(usage_df, billing_df, support_df) -> pd.DataFrame: features = usage_df.groupby('account_id').agg( features_adopted = ('feature_key', 'nunique'), days_since_login = ('last_login_date', lambda x: (pd.Timestamp.today() - x.max()).days), sessions_30d = ('session_id', 'count') ) billing_feats = billing_df.groupby('account_id').agg( billing_failures = ('status', lambda x: (x == 'failed').sum()) ) support_feats = support_df.groupby('account_id').agg( support_tickets_30d = ('ticket_id', 'count') ) return features.join([billing_feats, support_feats], how='left').fillna(0) # ── Model pipeline ──────────────────────────────────────────── FEATURES = ['features_adopted', 'billing_failures', 'support_tickets_30d', 'days_since_login'] pipeline = Pipeline([ ('scaler', StandardScaler()), ('model', LogisticRegression( C=0.5, class_weight='balanced', max_iter=1000 )) ]) # TimeSeriesSplit prevents future data leaking into training window tscv = TimeSeriesSplit(n_splits=5) scores = [] for tr, te in tscv.split(X): pipeline.fit(X.iloc[tr], y.iloc[tr]) p = pipeline.predict_proba(X.iloc[te])[:, 1] scores.append(roc_auc_score(y.iloc[te], p)) print(f"Mean AUC: {np.mean(scores):.3f} ± {np.std(scores):.3f}") # ── Weekly scoring output for CS team ─────────────────────── df['churn_prob'] = pipeline.predict_proba(df[FEATURES])[:, 1] df['risk_tier'] = pd.cut(df['churn_prob'], bins=[0, .30, .60, 1], labels=['low', 'medium', 'high']) # Export top 50 high-risk accounts to Deepnote report df[df['risk_tier'] == 'high'].sort_values( 'churn_prob', ascending=False).head(50)
WITH first_orders AS ( SELECT customer_id, DATE_TRUNC('month', MIN(order_date)) AS cohort_month, MIN(order_date) AS first_order_dt, MIN(delivered_at) AS first_delivery_dt FROM orders WHERE status = 'completed' GROUP BY customer_id ), email_timing AS ( -- First re-engagement email after first delivery SELECT e.customer_id, DATEDIFF('day', f.first_delivery_dt, MIN(e.sent_at)) AS days_to_first_email FROM email_sends e JOIN first_orders f USING (customer_id) WHERE e.sent_at > f.first_delivery_dt AND e.campaign_type = 're_engagement' GROUP BY e.customer_id, f.first_delivery_dt ), cohort_activity AS ( SELECT o.customer_id, f.cohort_month, DATEDIFF('month', f.first_order_dt, o.order_date) AS month_number FROM orders o JOIN first_orders f USING (customer_id) WHERE o.status = 'completed' AND o.order_date > f.first_order_dt ) SELECT f.cohort_month, -- Bucket email timing for comparison CASE WHEN et.days_to_first_email BETWEEN 3 AND 5 THEN 'day_3_5' WHEN et.days_to_first_email BETWEEN 6 AND 9 THEN 'day_6_9' WHEN et.days_to_first_email >= 10 THEN 'day_10+' ELSE 'no_email' END AS email_bucket, ca.month_number, COUNT(DISTINCT f.customer_id) AS cohort_size, COUNT(DISTINCT ca.customer_id) AS retained, ROUND(COUNT(DISTINCT ca.customer_id) * 100.0 / NULLIF(COUNT(DISTINCT f.customer_id), 0), 1) AS retention_pct FROM first_orders f LEFT JOIN email_timing et USING (customer_id) LEFT JOIN cohort_activity ca USING (customer_id) GROUP BY 1,2,3 ORDER BY 1,3,2
import pandas as pd import numpy as np from dataclasses import dataclass @dataclass class AnomalyConfig: window: int = 12 # rolling weeks for baseline min_periods: int = 4 # minimum data before flagging sigma_threshold: float = 2.5 # standard deviations to flag group_cols: list = None # e.g. ['lane', 'weight_band'] def detect_anomalies( df: pd.DataFrame, cost_col: str, date_col: str, cfg: AnomalyConfig ) -> pd.DataFrame: df = df.sort_values(cfg.group_cols + [date_col]) def rolling_stats(series: pd.Series): # shift(1) excludes current week from its own baseline shifted = series.shift(1) r_mean = shifted.rolling( cfg.window, min_periods=cfg.min_periods).mean() r_std = shifted.rolling( cfg.window, min_periods=cfg.min_periods).std() return r_mean, r_std grp = df.groupby(cfg.group_cols)[cost_col] df['rolling_mean'], df['rolling_std'] = \ zip(*grp.transform(lambda x: pd.DataFrame({'m': x.shift(1).rolling( cfg.window, min_periods=cfg.min_periods).mean(), 's': x.shift(1).rolling( cfg.window, min_periods=cfg.min_periods).std()} )['m'] # trick: transform returns one column )) df['z_score'] = (df[cost_col] - df['rolling_mean']) \ / df['rolling_std'] df['is_anomaly'] = df['z_score'].abs() > cfg.sigma_threshold df['deviation_pct'] = ( (df[cost_col] - df['rolling_mean']) / df['rolling_mean'] * 100).round(1) return df cfg = AnomalyConfig(group_cols=['lane', 'weight_band']) result = detect_anomalies(invoice_df, 'cost_per_kg', 'week', cfg) alerts = result[result['is_anomaly']][[ 'lane', 'week', 'cost_per_kg', 'rolling_mean', 'z_score', 'deviation_pct' ]]
from statsmodels.tsa.seasonal import STL from statsmodels.tsa.holtwinters import ExponentialSmoothing import pandas as pd import numpy as np def classify_sku(series: pd.Series) -> str: if len(series) < 26: return 'new' cv = series.std() / series.mean() if series.mean() > 0 else 99 zero_pct = (series == 0).mean() if zero_pct > 0.40: return 'intermittent' if cv < 0.30: return 'stable' return 'seasonal' def forecast_stable(series, horizon=12): # Linear trend projection for clockwork-demand SKUs x = np.arange(len(series)) slope, intercept = np.polyfit(x, series, 1) return slope * np.arange( len(series), len(series) + horizon) + intercept def forecast_seasonal(series, horizon=12): # STL decomposition for seasonal SKUs — separates # trend, seasonality, and noise cleanly stl = STL(series, period=52, robust=True).fit() trend_slope = np.polyfit( range(len(stl.trend)), stl.trend, 1) future_trend = np.polyval( trend_slope, range(len(stl.trend), len(stl.trend) + horizon)) seasonal_idx = stl.seasonal[-52:-52+horizon] return np.maximum(future_trend + seasonal_idx, 0) # ── Run across all SKUs ────────────────────────────────────── results = {} for sku_id, grp in df.groupby('sku_id'): s = grp.set_index('week')['units_sold'] seg = classify_sku(s) if seg == 'stable': fc = forecast_stable(s) elif seg == 'seasonal': fc = forecast_seasonal(s) else: fc = np.full(12, s[s > 0].mean()) # Croston approx results[sku_id] = {'segment': seg, 'forecast': fc}
-- models/revenue/fct_revenue_attribution.sql -- {{ config(materialized='table', tags=['revenue','weekly']) }} WITH normalised_touches AS ( -- Standardise channel taxonomy before attribution SELECT t.opportunity_id, t.touched_at, CASE WHEN t.source ILIKE '%linkedin%' THEN 'marketing' WHEN t.source ILIKE '%outbound%' THEN 'sales' WHEN t.source ILIKE '%referral%' THEN 'partnerships' ELSE 'other' END AS channel FROM {{ ref('stg_crm_touchpoints') }} t WHERE t.opportunity_id IS NOT NULL ), touch_windows AS ( SELECT opportunity_id, channel, touched_at, ROW_NUMBER() OVER (PARTITION BY opportunity_id ORDER BY touched_at) AS touch_seq, COUNT(*) OVER (PARTITION BY opportunity_id) AS total_touches FROM normalised_touches ), won_opps AS ( SELECT opportunity_id, arr, close_date, owner_team, contract_months FROM {{ ref('stg_opportunities') }} WHERE stage = 'Closed Won' AND arr > 0 ) SELECT tw.opportunity_id, tw.channel, o.close_date, o.arr, o.contract_months, -- Linear: equal share to every touchpoint o.arr / tw.total_touches AS linear_arr, -- First-touch: full credit to the opening touchpoint CASE WHEN tw.touch_seq = 1 THEN o.arr ELSE 0 END AS first_touch_arr, -- Last-touch: full credit to the closing touchpoint CASE WHEN tw.touch_seq = tw.total_touches THEN o.arr ELSE 0 END AS last_touch_arr, -- Time-decay: more recent touches get higher weight o.arr * (tw.touch_seq / ( tw.total_touches * (tw.total_touches + 1) / 2.0)) AS time_decay_arr FROM touch_windows tw JOIN won_opps o USING (opportunity_id)
I'm a Data Analyst based in South Jakarta, Indonesia, with four years embedded in logistics and SaaS analytics across Southeast Asia and Australia. My background in Industrial Engineering shaped how I think about problems. Every dataset is a system, and the real work is understanding the feedback loops before reaching for a model.
The case studies in this portfolio reflect how I work in practice: start with a clear business question, validate assumptions before building, keep methodology explainable to stakeholders who don't speak SQL, and always end with a recommendation someone can act on. Data that doesn't change a decision isn't worth generating.
I'm open to Data Analyst and Analytics roles where data drives product and commercial decisions, not just answers ad-hoc requests.