api / scripts /eda.py
safraeli's picture
Deploy: 2026 sensor migration + redesign + bucket B endpoints
13fc29d verified
"""
EDA helpers for Streamlit: Stage 1 (sensors + labels) and Stage 2 (IMS + merged).
"""
from __future__ import annotations
import sys
from pathlib import Path
# Two levels up from this file (the directory that contains this file's
# folder). It is put at the front of sys.path, presumably so that the
# function-scope imports of `config` and `src.*` below can be resolved when
# this module is loaded from elsewhere -- confirm against the repo layout.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
_project_root_entry = str(PROJECT_ROOT)
if _project_root_entry not in sys.path:
    sys.path.insert(0, _project_root_entry)
import pandas as pd
import numpy as np
def get_stage1_eda():
    """Load Stage 1 labels and, best-effort, a sensor sample for EDA plots.

    Reads ``stage1_labels.csv`` from ``settings.PROCESSED_DIR`` and summarises
    its first data column. It then tries to read at most 50,000 rows of a
    fixed set of ``Air1_*_ref`` columns (plus ``time``) from
    ``settings.SENSORS_WIDE_PATH``, falling back to
    ``settings.SENSORS_WIDE_SAMPLE_PATH`` when the former does not exist, and
    keeps only the rows where ``Air1_PAR_ref > 50``.

    Returns:
        dict with the keys:

        * ``"labels"`` -- labels DataFrame with its index converted to UTC
          timestamps, or ``None`` when the labels could not be loaded.
        * ``"labels_stats"`` -- dict with ``count``, ``date_min``,
          ``date_max`` and ``A_mean`` / ``A_std`` / ``A_min`` / ``A_max``
          computed on the first data column, or ``None``.
        * ``"sensor_sample"`` -- filtered sensor DataFrame, or ``None`` when
          no sensor file exists or reading/filtering it failed.
        * ``"error"`` -- human-readable message when the labels file is
          missing or unusable, otherwise ``None``.
    """
    import logging

    from config import settings

    out = {"labels": None, "labels_stats": None, "sensor_sample": None, "error": None}

    labels_path = settings.PROCESSED_DIR / "stage1_labels.csv"
    if not labels_path.exists():
        out["error"] = "Stage 1 labels not found. Run Stage 1 first."
        return out

    # A zero-byte file or a file holding only the index column cannot be
    # summarised; report that through out["error"] like the missing-file case
    # instead of letting EmptyDataError / IndexError escape to the caller.
    unusable_msg = "Stage 1 labels file has no label column. Run Stage 1 first."
    try:
        labels = pd.read_csv(labels_path, index_col=0, parse_dates=True)
    except pd.errors.EmptyDataError:
        out["error"] = unusable_msg
        return out
    if labels.shape[1] == 0:
        out["error"] = unusable_msg
        return out

    labels.index = pd.to_datetime(labels.index, utc=True)
    first_col = labels.iloc[:, 0]  # the statistics below describe this column
    out["labels"] = labels
    out["labels_stats"] = {
        "count": len(labels),
        "date_min": labels.index.min(),
        "date_max": labels.index.max(),
        "A_mean": first_col.mean(),
        "A_std": first_col.std(),
        "A_min": first_col.min(),
        "A_max": first_col.max(),
    }

    # Optional: load a sample of sensor data for PAR/T (limit rows for speed)
    sensor_path = settings.SENSORS_WIDE_PATH
    if not sensor_path.exists():
        sensor_path = settings.SENSORS_WIDE_SAMPLE_PATH
    if not sensor_path.exists():
        return out

    wanted_cols = {
        "time",
        "Air1_PAR_ref",
        "Air1_leafTemperature_ref",
        "Air1_airTemperature_ref",
        "Air1_CO2_ref",
        "Air1_VPD_ref",
    }
    try:
        # A callable `usecols` tolerates files that lack some of the columns.
        sensor = pd.read_csv(
            sensor_path, usecols=lambda c: c in wanted_cols, nrows=50000
        )
        if "time" in sensor.columns:
            sensor["time"] = pd.to_datetime(sensor["time"], utc=True)
        out["sensor_sample"] = sensor[sensor["Air1_PAR_ref"] > 50]
    except Exception:
        # The sensor sample is optional, so any failure (unreadable file,
        # missing Air1_PAR_ref column, ...) still yields a usable result; the
        # cause is logged rather than silently discarded.
        logging.getLogger(__name__).warning(
            "Could not load sensor sample from %s", sensor_path, exc_info=True
        )
        out["sensor_sample"] = None
    return out
def get_stage2_eda():
    """Load cached IMS data and Stage 1 labels, merge them, and summarise.

    Reads ``stage1_labels.csv`` from ``settings.PROCESSED_DIR`` and keeps its
    first data column (index converted to UTC timestamps). The cached IMS
    frame comes from ``IMSClient().load_cached()``; the two are combined with
    ``Preprocessor.merge_ims_with_labels(..., timestamp_index_labels=True)``
    and the result is passed through ``Preprocessor.create_time_features``.

    Returns:
        dict with the keys:

        * ``"merged"`` -- merged frame after ``create_time_features``.
        * ``"ims"`` -- the cached IMS frame as loaded.
        * ``"labels"`` -- first data column of the labels file (a Series).
        * ``"stats"`` -- dict with ``ims_rows``, ``ims_date_min``,
          ``ims_date_max`` (from the ``timestamp_utc`` column),
          ``merged_rows`` and ``feature_cols`` (numeric columns of the merged
          frame other than ``"A"``).
        * ``"error"`` -- message when the labels file is missing or unusable,
          the IMS cache is empty, or the merge produced no rows; else ``None``.

        On any error every other value stays ``None``.
    """
    from config import settings
    from src.ims_client import IMSClient
    from src.preprocessor import Preprocessor

    out = {"merged": None, "ims": None, "labels": None, "stats": None, "error": None}

    labels_path = settings.PROCESSED_DIR / "stage1_labels.csv"
    if not labels_path.exists():
        out["error"] = "Stage 1 labels not found."
        return out

    # A zero-byte file or a file holding only the index column has no label
    # series to merge; report it through out["error"] instead of letting
    # EmptyDataError / IndexError escape to the caller.
    unusable_msg = "Stage 1 labels file has no label column."
    try:
        labels = pd.read_csv(labels_path, index_col=0, parse_dates=True)
    except pd.errors.EmptyDataError:
        out["error"] = unusable_msg
        return out
    if labels.shape[1] == 0:
        out["error"] = unusable_msg
        return out
    labels.index = pd.to_datetime(labels.index, utc=True)
    labels = labels.iloc[:, 0]

    client = IMSClient()
    ims = client.load_cached()
    if ims.empty:
        out["error"] = "IMS cache not found. Run download_ims_data first."
        return out

    preproc = Preprocessor()
    merged = preproc.merge_ims_with_labels(ims, labels, timestamp_index_labels=True)
    if merged.empty:
        out["error"] = "No overlap between IMS and labels."
        return out
    merged = preproc.create_time_features(merged)

    # Parse the IMS timestamps once and reuse them for both bounds.
    ims_times = pd.to_datetime(ims["timestamp_utc"])
    numeric_cols = merged.select_dtypes(include=[np.number]).columns

    out["merged"] = merged
    out["ims"] = ims
    out["labels"] = labels
    out["stats"] = {
        "ims_rows": len(ims),
        "ims_date_min": ims_times.min(),
        "ims_date_max": ims_times.max(),
        "merged_rows": len(merged),
        # "A" is excluded from the feature list.
        "feature_cols": [c for c in numeric_cols if c != "A"],
    }
    return out