File size: 3,496 Bytes
13fc29d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
"""
EDA helpers for Streamlit: Stage 1 (sensors + labels) and Stage 2 (IMS + merged).
"""

from __future__ import annotations

import sys
from pathlib import Path

# Directory two levels above this file's resolved location (the parent of the
# directory that contains this module).
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Put that directory at the front of sys.path, once, so the function-local
# project imports used further down in this module can be resolved.
# NOTE(review): presumably the `config` and `src` packages live directly under
# PROJECT_ROOT -- confirm against the repository layout.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

import pandas as pd
import numpy as np


def get_stage1_eda() -> dict:
    """Load the Stage 1 labels and an optional sensor sample for EDA plots.

    Returns:
        A dict with the keys:

        * ``labels`` -- DataFrame read from ``stage1_labels.csv`` with a
          UTC datetime index, or ``None`` when the file is missing.
        * ``labels_stats`` -- row count, index min/max and the
          mean/std/min/max of the first labels column (``A_*`` keys).
        * ``sensor_sample`` -- up to 50,000 rows of selected ``Air1_*`` sensor
          columns, or ``None`` when no sensor file exists or it could not be
          read.
        * ``error`` -- message when the labels file is missing, else ``None``.

    The sensor sample is best-effort: a failure while loading it is logged
    and leaves ``sensor_sample`` as ``None`` without setting ``error``.
    """
    import logging

    from config import settings

    out = {"labels": None, "labels_stats": None, "sensor_sample": None, "error": None}
    labels_path = settings.PROCESSED_DIR / "stage1_labels.csv"
    if not labels_path.exists():
        out["error"] = "Stage 1 labels not found. Run Stage 1 first."
        return out

    labels = pd.read_csv(labels_path, index_col=0, parse_dates=True)
    labels.index = pd.to_datetime(labels.index, utc=True)
    out["labels"] = labels

    # The statistics describe the first data column; select it once.
    target = labels.iloc[:, 0]
    out["labels_stats"] = {
        "count": len(labels),
        "date_min": labels.index.min(),
        "date_max": labels.index.max(),
        "A_mean": target.mean(),
        "A_std": target.std(),
        "A_min": target.min(),
        "A_max": target.max(),
    }

    # Optional: load a sample of sensor data for PAR/T (limit rows for speed).
    # Prefer the full wide file and fall back to the sample file.
    sensor_path = settings.SENSORS_WIDE_PATH
    if not sensor_path.exists():
        sensor_path = settings.SENSORS_WIDE_SAMPLE_PATH
    if not sensor_path.exists():
        return out

    wanted_cols = (
        "time",
        "Air1_PAR_ref",
        "Air1_leafTemperature_ref",
        "Air1_airTemperature_ref",
        "Air1_CO2_ref",
        "Air1_VPD_ref",
    )
    try:
        # The callable form of usecols tolerates files lacking some columns.
        sensor = pd.read_csv(sensor_path, usecols=lambda c: c in wanted_cols, nrows=50000)
        if "time" in sensor.columns:
            sensor["time"] = pd.to_datetime(sensor["time"], utc=True)
        # Guard the PAR filter on its own column: a file without PAR used to
        # raise KeyError here, which discarded the whole sample.
        if "Air1_PAR_ref" in sensor.columns:
            sensor = sensor[sensor["Air1_PAR_ref"] > 50]
    except Exception:
        # Best-effort: the sample is optional, so report the failure and carry on.
        logging.getLogger(__name__).warning(
            "Could not load sensor sample from %s", sensor_path, exc_info=True
        )
        sensor = None
    out["sensor_sample"] = sensor
    return out


def get_stage2_eda():
    """Build the Stage 2 EDA bundle from the cached IMS data and Stage 1 labels.

    Returns a dict with the keys ``merged``, ``ims``, ``labels``, ``stats`` and
    ``error``. If the labels file is missing, the IMS cache is empty, or the
    merge yields no rows, only ``error`` is filled in and the rest stay ``None``.
    """
    from config import settings
    from src.ims_client import IMSClient
    from src.preprocessor import Preprocessor

    result = dict.fromkeys(("merged", "ims", "labels", "stats", "error"))

    csv_path = settings.PROCESSED_DIR / "stage1_labels.csv"
    if not csv_path.exists():
        result["error"] = "Stage 1 labels not found."
        return result

    # Only the first column of the labels file is used, as a Series on a UTC index.
    label_frame = pd.read_csv(csv_path, index_col=0, parse_dates=True)
    label_frame.index = pd.to_datetime(label_frame.index, utc=True)
    target = label_frame.iloc[:, 0]

    ims_frame = IMSClient().load_cached()
    if ims_frame.empty:
        result["error"] = "IMS cache not found. Run download_ims_data first."
        return result

    prep = Preprocessor()
    joined = prep.merge_ims_with_labels(ims_frame, target, timestamp_index_labels=True)
    if joined.empty:
        result["error"] = "No overlap between IMS and labels."
        return result
    joined = prep.create_time_features(joined)

    # Parse the IMS timestamps once and reuse them for both range bounds.
    ims_times = pd.to_datetime(ims_frame["timestamp_utc"])
    numeric_cols = joined.select_dtypes(include=[np.number]).columns
    result.update(
        merged=joined,
        ims=ims_frame,
        labels=target,
        stats={
            "ims_rows": len(ims_frame),
            "ims_date_min": ims_times.min(),
            "ims_date_max": ims_times.max(),
            "merged_rows": len(joined),
            # Every numeric column except the column named "A".
            "feature_cols": [name for name in numeric_cols if name != "A"],
        },
    )
    return result