# Uploaded by: bdck
# File: depthpro_wrapper/point_cloud.py
# Revision: 180521e (verified)
"""
Back-projection utilities: depth map → 3D point cloud.
DepthPro outputs metric depth (meters) and an estimated focal length.
Using the standard pinhole camera model, each pixel can be back-projected
into a 3D point relative to the camera centre.
"""
from __future__ import annotations
from typing import Optional, Tuple
import numpy as np
def depth_to_point_cloud(
    depth: np.ndarray,
    focal_length: float,
    principal_point: Optional[Tuple[float, float]] = None,
    *,
    mask: Optional[np.ndarray] = None,
    sample_step: int = 1,
) -> np.ndarray:
    """
    Back-project a metric depth map into a 3D point cloud.

    Parameters
    ----------
    depth : np.ndarray
        (H, W) array of metric depths in meters. It is converted to
        ``float32`` internally.
    focal_length : float
        Focal length in pixels (for the resolution of *depth*). A single
        value is used for both axes: ``fx == fy == focal_length``.
    principal_point : (cx, cy), optional
        Principal point in pixel coordinates. Defaults to ``(W/2, H/2)``.
    mask : np.ndarray, optional
        (H, W) array; pixels whose mask value is truthy are kept. Any dtype
        that converts to ``bool`` is accepted (e.g. a ``uint8`` 0/255 mask).
    sample_step : int, default 1
        Spatial sub-sampling step applied to both axes. ``2`` keeps every
        2nd row and column (75 % reduction), ``4`` every 4th (93.75 %).

    Returns
    -------
    points : np.ndarray
        (N, 3) ``float32`` array of 3D points in the camera frame, ordered
        row-major over the sampled grid. ``+Z`` points forward, ``+X`` is
        right, ``+Y`` is down (image convention). Pixels whose depth is
        non-positive, NaN or infinite are dropped.

    Raises
    ------
    ValueError
        If *depth* is not 2D, *sample_step* is not a positive integer,
        *focal_length* is zero or non-finite, or *mask* does not have the
        same shape as *depth*.

    Notes
    -----
    The pinhole back-projection equations are::

        X = (u - cx) * Z / fx
        Y = (v - cy) * Z / fy
        Z = depth[v, u]

    where ``(u, v)`` are pixel column/row indices.
    """
    depth = np.asarray(depth, dtype=np.float32)
    if depth.ndim != 2:
        raise ValueError(f"depth must be a 2D (H, W) array, got shape {depth.shape}")
    H, W = depth.shape

    # np.arange with a zero step raises ZeroDivisionError and a negative step
    # silently yields an empty grid, so reject both up front.
    step = int(sample_step)
    if step != sample_step or step < 1:
        raise ValueError(f"sample_step must be a positive integer, got {sample_step!r}")

    fx = fy = float(focal_length)
    if not np.isfinite(fx) or fx == 0.0:
        raise ValueError(
            f"focal_length must be a finite, non-zero number of pixels, got {focal_length!r}"
        )

    if principal_point is None:
        cx, cy = W / 2.0, H / 2.0
    else:
        cx, cy = float(principal_point[0]), float(principal_point[1])

    # Pixel coordinates and depths on the sub-sampled grid.
    u, v = np.meshgrid(np.arange(0, W, step), np.arange(0, H, step))
    Z = depth[::step, ::step]

    # Keep finite, strictly positive depths only; +inf would otherwise pass
    # the ``> 0`` test and produce inf/NaN coordinates.
    valid = np.isfinite(Z) & (Z > 0.0)

    if mask is not None:
        mask = np.asarray(mask)
        if mask.shape != (H, W):
            raise ValueError(f"mask shape {mask.shape} does not match depth shape {(H, W)}")
        # Cast to bool so integer/float masks do not break the in-place AND.
        valid &= mask[::step, ::step].astype(bool)

    u = u[valid]
    v = v[valid]
    Z = Z[valid]

    X = (u - cx) * Z / fx
    Y = (v - cy) * Z / fy
    return np.stack([X, Y, Z], axis=-1).astype(np.float32)
def rgbd_to_point_cloud(
    depth: np.ndarray,
    rgb: np.ndarray,
    focal_length: float,
    principal_point: Optional[Tuple[float, float]] = None,
    *,
    mask: Optional[np.ndarray] = None,
    sample_step: int = 1,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Back-project an RGB-D pair into a coloured 3D point cloud.

    Parameters
    ----------
    depth : np.ndarray
        (H, W) metric depth map.
    rgb : np.ndarray
        (H, W, 3) RGB image; the returned colours are cast to ``uint8``.
    focal_length : float
        Focal length in pixels, used for both axes.
    principal_point : (cx, cy), optional
        Principal point in pixel coordinates. Defaults to ``(W/2, H/2)``.
    mask : np.ndarray, optional
        (H, W) array; pixels whose mask value is truthy are kept. Any dtype
        that converts to ``bool`` is accepted.
    sample_step : int, default 1
        Spatial sub-sampling step applied to both axes.

    Returns
    -------
    points : np.ndarray
        (N, 3) ``float32`` 3D points (``X = (u - cx) * Z / fx``,
        ``Y = (v - cy) * Z / fy``, ``Z = depth[v, u]``), ordered row-major
        over the sampled grid. Pixels whose depth is non-positive, NaN or
        infinite are dropped.
    colors : np.ndarray
        ``uint8`` colours aligned row-for-row with *points*.

    Raises
    ------
    ValueError
        If *depth* and *rgb* disagree on H×W, *depth* is not 2D,
        *sample_step* is not a positive integer, *focal_length* is zero or
        non-finite, or *mask* does not have the same shape as *depth*.
    """
    depth = np.asarray(depth)
    rgb = np.asarray(rgb)
    if depth.shape[:2] != rgb.shape[:2]:
        raise ValueError(
            f"depth shape {depth.shape} and rgb shape {rgb.shape} must have same H×W"
        )
    if depth.ndim != 2:
        raise ValueError(f"depth must be a 2D (H, W) array, got shape {depth.shape}")
    H, W = depth.shape

    # np.arange with a zero step raises ZeroDivisionError and a negative step
    # silently yields an empty grid, so reject both up front.
    step = int(sample_step)
    if step != sample_step or step < 1:
        raise ValueError(f"sample_step must be a positive integer, got {sample_step!r}")

    fx = fy = float(focal_length)
    if not np.isfinite(fx) or fx == 0.0:
        raise ValueError(
            f"focal_length must be a finite, non-zero number of pixels, got {focal_length!r}"
        )

    if principal_point is None:
        cx, cy = W / 2.0, H / 2.0
    else:
        cx, cy = float(principal_point[0]), float(principal_point[1])

    # Pixel coordinates, depths and colours on the sub-sampled grid.
    u, v = np.meshgrid(np.arange(0, W, step), np.arange(0, H, step))
    Z = depth[::step, ::step]
    colors_sampled = rgb[::step, ::step]

    # Keep finite, strictly positive depths only.
    valid = np.isfinite(Z) & (Z > 0.0)

    if mask is not None:
        mask = np.asarray(mask)
        # Without this check a larger mask would be sampled silently from its
        # top-left corner and a smaller one would fail with an IndexError.
        if mask.shape != (H, W):
            raise ValueError(f"mask shape {mask.shape} does not match depth shape {(H, W)}")
        # Cast to bool so integer/float masks do not break the in-place AND.
        valid &= mask[::step, ::step].astype(bool)

    u = u[valid]
    v = v[valid]
    Z = Z[valid]
    colors = colors_sampled[valid]

    X = (u - cx) * Z / fx
    Y = (v - cy) * Z / fy
    points = np.stack([X, Y, Z], axis=-1).astype(np.float32)
    colors = np.asarray(colors, dtype=np.uint8)
    return points, colors
def normals_from_depth(
    depth: np.ndarray,
    focal_length: float,
    principal_point: Optional[Tuple[float, float]] = None,
) -> np.ndarray:
    """
    Compute per-pixel surface normals directly from the depth map.

    Every pixel is back-projected to a 3D point with the pinhole model
    (``X = (u - cx) * Z / fx``, ``Y = (v - cy) * Z / fy``). The surface
    tangents along the image columns and rows are estimated by finite
    differences of those points (central in the interior, one-sided on the
    image border), and the normal is their normalised cross product.

    Parameters
    ----------
    depth : np.ndarray
        (H, W) metric depth map.
    focal_length : float
        Focal length in pixels, used for both axes.
    principal_point : (cx, cy), optional
        Principal point in pixel coordinates. Defaults to ``(W/2, H/2)``.

    Returns
    -------
    normals : np.ndarray
        (H, W, 3) ``float32`` array of **unoriented** unit normals.
        ``normals[v, u]`` is the normal at pixel ``(u, v)``. The normal is
        the zero vector where it cannot be estimated: the pixel or one of
        its 4-neighbours has a non-positive or non-finite depth, or the
        cross product is degenerate (norm <= 1e-8).

    Raises
    ------
    ValueError
        If *depth* is not 2D or *focal_length* is zero or non-finite.
    """
    depth = np.asarray(depth, dtype=np.float64)
    if depth.ndim != 2:
        raise ValueError(f"depth must be a 2D (H, W) array, got shape {depth.shape}")
    H, W = depth.shape

    fx = fy = float(focal_length)
    if not np.isfinite(fx) or fx == 0.0:
        raise ValueError(
            f"focal_length must be a finite, non-zero number of pixels, got {focal_length!r}"
        )

    if principal_point is None:
        cx, cy = W / 2.0, H / 2.0
    else:
        cx, cy = float(principal_point[0]), float(principal_point[1])

    # Zero-out invalid depths so NaN/inf never enter the arithmetic; the
    # affected pixels are excluded through ``usable`` below.
    valid = np.isfinite(depth) & (depth > 0.0)
    Z = np.where(valid, depth, 0.0)

    # Back-project every pixel to a 3D point.
    u, v = np.meshgrid(np.arange(W), np.arange(H))
    X = (u - cx) * Z / fx
    Y = (v - cy) * Z / fy
    points = np.stack([X, Y, Z], axis=-1)

    # Tangent vectors along columns (u) and rows (v). np.gradient needs at
    # least two samples along the axis; a degenerate axis gets zero tangents.
    grad_u = np.gradient(points, axis=1) if W >= 2 else np.zeros_like(points)
    grad_v = np.gradient(points, axis=0) if H >= 2 else np.zeros_like(points)

    normals = np.cross(grad_u, grad_v)

    # A normal is only trustworthy when the pixel and each existing
    # 4-neighbour carry a valid depth.
    usable = valid.copy()
    usable[:, 1:] &= valid[:, :-1]
    usable[:, :-1] &= valid[:, 1:]
    usable[1:, :] &= valid[:-1, :]
    usable[:-1, :] &= valid[1:, :]

    # Normalise without dividing by zero: entries not selected by ``keep``
    # retain the zero they were initialised with.
    norm = np.linalg.norm(normals, axis=-1, keepdims=True)
    keep = usable[..., None] & (norm > 1e-8)
    unit = np.zeros_like(normals)
    np.divide(normals, norm, out=unit, where=keep)
    return unit.astype(np.float32)