| """ |
| EDA helpers for Streamlit: Stage 1 (sensors + labels) and Stage 2 (IMS + merged). |
| """ |
|
|
| from __future__ import annotations |
|
|
| import sys |
| from pathlib import Path |
|
|
# Directory one level above this file's folder. It is prepended to sys.path
# (once) -- presumably so the `config` and `src` packages imported inside the
# helpers below can be resolved; confirm against the project layout.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
|
|
| import pandas as pd |
| import numpy as np |
|
|
|
|
def get_stage1_eda():
    """Load Stage 1 labels and an optional sensor sample for EDA plots.

    Reads ``stage1_labels.csv`` from ``settings.PROCESSED_DIR``. If a wide
    sensor file exists (``settings.SENSORS_WIDE_PATH``, falling back to
    ``settings.SENSORS_WIDE_SAMPLE_PATH``), its first 50000 rows are read and
    then filtered to ``Air1_PAR_ref > 50``.

    Returns:
        dict with the keys:

        * ``labels`` -- DataFrame indexed by UTC timestamps, or ``None``.
        * ``labels_stats`` -- dict with ``count``, ``date_min``, ``date_max``
          and ``A_mean`` / ``A_std`` / ``A_min`` / ``A_max`` computed on the
          first label column, or ``None``.
        * ``sensor_sample`` -- filtered sensor DataFrame, or ``None`` when no
          sensor file exists or it could not be loaded.
        * ``error`` -- message string when the labels are unusable, else
          ``None``.
    """
    import logging

    from config import settings

    out = {"labels": None, "labels_stats": None, "sensor_sample": None, "error": None}

    labels_path = settings.PROCESSED_DIR / "stage1_labels.csv"
    if not labels_path.exists():
        out["error"] = "Stage 1 labels not found. Run Stage 1 first."
        return out

    labels = pd.read_csv(labels_path, index_col=0, parse_dates=True)
    labels.index = pd.to_datetime(labels.index, utc=True)
    if labels.shape[1] == 0:
        # Only the index column is present; report it through the error
        # channel instead of crashing with IndexError on iloc[:, 0].
        out["error"] = "Stage 1 labels file has no value column."
        return out

    out["labels"] = labels
    first_col = labels.iloc[:, 0]
    out["labels_stats"] = {
        "count": len(labels),
        "date_min": labels.index.min(),
        "date_max": labels.index.max(),
        "A_mean": first_col.mean(),
        "A_std": first_col.std(),
        "A_min": first_col.min(),
        "A_max": first_col.max(),
    }

    # Prefer the full wide sensor file; fall back to the sample file.
    sensor_path = settings.SENSORS_WIDE_PATH
    if not sensor_path.exists():
        sensor_path = settings.SENSORS_WIDE_SAMPLE_PATH
    if not sensor_path.exists():
        return out

    wanted_cols = (
        "time",
        "Air1_PAR_ref",
        "Air1_leafTemperature_ref",
        "Air1_airTemperature_ref",
        "Air1_CO2_ref",
        "Air1_VPD_ref",
    )
    try:
        # The callable form of usecols tolerates files missing some columns.
        sensor = pd.read_csv(
            sensor_path,
            usecols=lambda c: c in wanted_cols,
            nrows=50000,
        )
        if "time" in sensor.columns:
            sensor["time"] = pd.to_datetime(sensor["time"], utc=True)
        out["sensor_sample"] = sensor[sensor["Air1_PAR_ref"] > 50]
    except Exception:
        # The sensor sample is optional, so loading stays best-effort: any
        # failure (unreadable file, missing PAR column, bad timestamps)
        # yields None. Log it so the cause is not lost silently.
        logging.getLogger(__name__).warning(
            "Could not load sensor sample from %s", sensor_path, exc_info=True
        )
        out["sensor_sample"] = None
    return out
|
|
|
|
def get_stage2_eda():
    """Load cached IMS data and Stage 1 labels, merge them, and summarize.

    The first column of ``stage1_labels.csv`` (UTC-indexed) is merged with the
    cached IMS frame via ``Preprocessor.merge_ims_with_labels``, and the
    result is passed through ``Preprocessor.create_time_features``.

    Returns:
        dict with the keys:

        * ``merged`` -- merged DataFrame with time features, or ``None``.
        * ``ims`` -- the cached IMS DataFrame, or ``None``.
        * ``labels`` -- the first label column as a Series, or ``None``.
        * ``stats`` -- dict with ``ims_rows``, ``ims_date_min``,
          ``ims_date_max``, ``merged_rows`` and ``feature_cols`` (numeric
          columns of ``merged`` other than ``"A"``), or ``None``.
        * ``error`` -- message string when any step cannot proceed, else
          ``None``.
    """
    from config import settings
    from src.ims_client import IMSClient
    from src.preprocessor import Preprocessor

    out = {"merged": None, "ims": None, "labels": None, "stats": None, "error": None}

    labels_path = settings.PROCESSED_DIR / "stage1_labels.csv"
    if not labels_path.exists():
        out["error"] = "Stage 1 labels not found."
        return out

    labels_df = pd.read_csv(labels_path, index_col=0, parse_dates=True)
    labels_df.index = pd.to_datetime(labels_df.index, utc=True)
    if labels_df.shape[1] == 0:
        # Only the index column is present; report it through the error
        # channel instead of crashing with IndexError on iloc[:, 0].
        out["error"] = "Stage 1 labels file has no value column."
        return out
    labels = labels_df.iloc[:, 0]

    ims = IMSClient().load_cached()
    # NOTE(review): load_cached() appears to return an empty frame when the
    # cache is missing; None is also treated as "missing" defensively.
    if ims is None or ims.empty:
        out["error"] = "IMS cache not found. Run download_ims_data first."
        return out

    preproc = Preprocessor()
    merged = preproc.merge_ims_with_labels(ims, labels, timestamp_index_labels=True)
    if merged.empty:
        out["error"] = "No overlap between IMS and labels."
        return out
    merged = preproc.create_time_features(merged)

    # Parse the timestamp column once and reuse it for both bounds.
    ims_times = pd.to_datetime(ims["timestamp_utc"])
    numeric_cols = merged.select_dtypes(include=[np.number]).columns

    out["merged"] = merged
    out["ims"] = ims
    out["labels"] = labels
    out["stats"] = {
        "ims_rows": len(ims),
        "ims_date_min": ims_times.min(),
        "ims_date_max": ims_times.max(),
        "merged_rows": len(merged),
        # "A" is excluded from the feature list.
        "feature_cols": [c for c in numeric_cols if c != "A"],
    }
    return out
|
|