Spaces:

SolarWine
/

api

Running

App Files Files Community

api / scripts /eda.py

safraeli

Deploy: 2026 sensor migration + redesign + bucket B endpoints

13fc29d verified 21 days ago

raw

history blame contribute delete

3.5 kB

	"""
	EDA helpers for Streamlit: Stage 1 (sensors + labels) and Stage 2 (IMS + merged).
	"""

	from __future__ import annotations

	import sys
	from pathlib import Path

	# Directory one level above this file's folder. It is prepended to sys.path
	# (once) -- presumably so the `config` and `src.*` imports used by the
	# functions below resolve when this module is loaded as a script.
	PROJECT_ROOT = Path(__file__).resolve().parent.parent
	if str(PROJECT_ROOT) not in sys.path:
	    sys.path.insert(0, str(PROJECT_ROOT))

	import pandas as pd
	import numpy as np


	def get_stage1_eda():
	    """Load Stage 1 outputs for the EDA page.

	    Reads ``stage1_labels.csv`` from ``settings.PROCESSED_DIR`` and, when a
	    sensor file exists, up to 50,000 rows of the wide sensor CSV
	    (``settings.SENSORS_WIDE_PATH``, falling back to
	    ``settings.SENSORS_WIDE_SAMPLE_PATH``).

	    Returns:
	        dict with the keys:
	            "labels": DataFrame of the labels file with a UTC datetime
	                index, or None.
	            "labels_stats": dict with "count", "date_min", "date_max" and
	                "A_mean" / "A_std" / "A_min" / "A_max" computed on the
	                first column of the labels file, or None.
	            "sensor_sample": sensor rows with ``Air1_PAR_ref > 50``, or
	                None when no sensor file exists or it could not be loaded.
	            "error": message when the labels file is missing or has no
	                data column, otherwise None.
	    """
	    import logging

	    from config import settings

	    out = {"labels": None, "labels_stats": None, "sensor_sample": None, "error": None}

	    labels_path = settings.PROCESSED_DIR / "stage1_labels.csv"
	    if not labels_path.exists():
	        out["error"] = "Stage 1 labels not found. Run Stage 1 first."
	        return out

	    labels = pd.read_csv(labels_path, index_col=0, parse_dates=True)
	    labels.index = pd.to_datetime(labels.index, utc=True)
	    if labels.shape[1] == 0:
	        # Only an index column is present; report it through the "error"
	        # key instead of letting iloc[:, 0] raise IndexError.
	        out["error"] = "Stage 1 labels file has no label column."
	        return out

	    first_col = labels.iloc[:, 0]
	    out["labels"] = labels
	    out["labels_stats"] = {
	        "count": len(labels),
	        "date_min": labels.index.min(),
	        "date_max": labels.index.max(),
	        "A_mean": first_col.mean(),
	        "A_std": first_col.std(),
	        "A_min": first_col.min(),
	        "A_max": first_col.max(),
	    }

	    # Optional: load a sample of sensor data for plots (row limit keeps it fast).
	    sensor_path = settings.SENSORS_WIDE_PATH
	    if not sensor_path.exists():
	        sensor_path = settings.SENSORS_WIDE_SAMPLE_PATH
	    if not sensor_path.exists():
	        return out

	    wanted = {
	        "time",
	        "Air1_PAR_ref",
	        "Air1_leafTemperature_ref",
	        "Air1_airTemperature_ref",
	        "Air1_CO2_ref",
	        "Air1_VPD_ref",
	    }
	    try:
	        # A callable `usecols` tolerates files that lack some of the columns.
	        sensor = pd.read_csv(sensor_path, usecols=lambda c: c in wanted, nrows=50000)
	        if "time" in sensor.columns:
	            sensor["time"] = pd.to_datetime(sensor["time"], utc=True)
	        out["sensor_sample"] = sensor[sensor["Air1_PAR_ref"] > 50]
	    except Exception:
	        # The sensor sample is best-effort: keep returning None on failure,
	        # but leave a trace in the log instead of discarding the cause.
	        logging.getLogger(__name__).warning(
	            "Could not load sensor sample from %s", sensor_path, exc_info=True
	        )
	        out["sensor_sample"] = None
	    return out


	def get_stage2_eda():
	    """Load cached IMS data and Stage 1 labels, merge them, and summarise for EDA.

	    Returns:
	        dict with the keys:
	            "merged": result of ``Preprocessor.merge_ims_with_labels``
	                passed through ``Preprocessor.create_time_features``, or None.
	            "ims": the frame returned by ``IMSClient.load_cached``, or None.
	            "labels": first column of ``stage1_labels.csv`` with a UTC
	                datetime index, or None.
	            "stats": dict with "ims_rows", "ims_date_min", "ims_date_max",
	                "merged_rows" and "feature_cols" (numeric columns of the
	                merged frame other than "A"), or None.
	            "error": message when an input is missing or the merge is
	                empty, otherwise None. On error the other keys stay None.
	    """
	    from config import settings
	    from src.ims_client import IMSClient
	    from src.preprocessor import Preprocessor

	    out = {"merged": None, "ims": None, "labels": None, "stats": None, "error": None}

	    labels_path = settings.PROCESSED_DIR / "stage1_labels.csv"
	    if not labels_path.exists():
	        out["error"] = "Stage 1 labels not found."
	        return out

	    labels_frame = pd.read_csv(labels_path, index_col=0, parse_dates=True)
	    labels_frame.index = pd.to_datetime(labels_frame.index, utc=True)
	    if labels_frame.shape[1] == 0:
	        # Only an index column is present; report it through the "error"
	        # key instead of letting iloc[:, 0] raise IndexError.
	        out["error"] = "Stage 1 labels file has no label column."
	        return out
	    labels = labels_frame.iloc[:, 0]

	    ims = IMSClient().load_cached()
	    # Defensive: treat a None result the same as an empty cache.
	    if ims is None or ims.empty:
	        out["error"] = "IMS cache not found. Run download_ims_data first."
	        return out

	    preproc = Preprocessor()
	    merged = preproc.merge_ims_with_labels(ims, labels, timestamp_index_labels=True)
	    if merged.empty:
	        out["error"] = "No overlap between IMS and labels."
	        return out
	    merged = preproc.create_time_features(merged)

	    # Parse the IMS timestamps once and reuse them for both range bounds.
	    ims_times = pd.to_datetime(ims["timestamp_utc"])
	    numeric_cols = merged.select_dtypes(include=[np.number]).columns

	    out["merged"] = merged
	    out["ims"] = ims
	    out["labels"] = labels
	    out["stats"] = {
	        "ims_rows": len(ims),
	        "ims_date_min": ims_times.min(),
	        "ims_date_max": ims_times.max(),
	        "merged_rows": len(merged),
	        "feature_cols": [c for c in numeric_cols if c != "A"],
	    }
	    return out