Upload depthpro_wrapper/point_cloud.py

180521e verified 22 days ago

8.43 kB

	"""
	Back-projection utilities: depth map → 3D point cloud.

	DepthPro outputs metric depth (meters) and an estimated focal length.
	Using the standard pinhole camera model, each pixel can be back-projected
	into a 3D point relative to the camera centre.
	"""

	from __future__ import annotations

	from typing import Optional, Tuple

	import numpy as np


	def depth_to_point_cloud(
	    depth: np.ndarray,
	    focal_length: float,
	    principal_point: Optional[Tuple[float, float]] = None,
	    *,
	    mask: Optional[np.ndarray] = None,
	    sample_step: int = 1,
	) -> np.ndarray:
	    """
	    Back-project a metric depth map into a 3D point cloud.

	    Parameters
	    ----------
	    depth : np.ndarray
	        (H, W) float array of metric depths in meters.
	    focal_length : float
	        Focal length in pixels (for the resolution of ``depth``).
	        DepthPro returns this automatically via ``DepthResult.focal_length``.
	    principal_point : (cx, cy), optional
	        Principal point in pixel coordinates. Defaults to the image centre
	        ``(W/2, H/2)``.
	    mask : np.ndarray, optional
	        (H, W) array; only pixels where the mask is truthy are kept.
	        Boolean and 0/1 integer masks are both accepted.
	        Useful for removing sky/background, invalid depths, etc.
	    sample_step : int, default 1
	        Spatial sub-sampling step. ``2`` keeps every 2nd pixel (75 % reduction),
	        ``4`` keeps every 4th (93.75 % reduction). Handy for real-time viz.

	    Returns
	    -------
	    points : np.ndarray
	        (N, 3) float32 array of 3D points in the camera coordinate frame,
	        in row-major pixel order. ``+Z`` points forward (into the scene),
	        ``+X`` is right, ``+Y`` is down (standard image convention).
	        Pixels whose depth is zero, negative, NaN or infinite are dropped.

	    Raises
	    ------
	    ValueError
	        If ``depth`` is not 2D, ``mask`` does not have the shape of
	        ``depth``, ``sample_step`` is smaller than 1, or ``focal_length``
	        is zero or not finite.

	    Notes
	    -----
	    DepthPro assumes square pixels (aspect ratio = 1) and therefore a single
	    focal length value is sufficient: ``fx == fy == focal_length``.

	    The standard pinhole projection equations are::

	        X = (u - cx) * Z / fx
	        Y = (v - cy) * Z / fy
	        Z = depth[v, u]

	    where ``(u, v)`` are pixel column/row indices.
	    """
	    depth = np.asarray(depth, dtype=np.float32)
	    if depth.ndim != 2:
	        raise ValueError(f"depth must be a 2D (H, W) array, got shape {depth.shape}")
	    H, W = depth.shape

	    # A step of 0 or below would give an empty grid or an obscure NumPy error.
	    if sample_step < 1:
	        raise ValueError(f"sample_step must be a positive integer, got {sample_step}")

	    fx = fy = float(focal_length)
	    if not np.isfinite(fx) or fx == 0.0:
	        raise ValueError(f"focal_length must be finite and non-zero, got {focal_length}")

	    if principal_point is None:
	        cx, cy = W / 2.0, H / 2.0
	    else:
	        cx, cy = float(principal_point[0]), float(principal_point[1])

	    # Strided slicing gives a view of every ``sample_step``-th pixel; the
	    # index vectors map positions in that view back to full-resolution pixels.
	    Z = depth[::sample_step, ::sample_step]
	    u_idx = np.arange(0, W, sample_step)
	    v_idx = np.arange(0, H, sample_step)

	    # Remove invalid depths. +inf passes a plain ``> 0`` test and would
	    # produce inf/NaN coordinates, so finiteness is checked explicitly.
	    valid = np.isfinite(Z) & (Z > 0.0)
	    if mask is not None:
	        mask = np.asarray(mask)
	        if mask.shape != (H, W):
	            raise ValueError(f"mask shape {mask.shape} does not match depth shape {(H, W)}")
	        # Cast to bool so integer masks do not break the in-place ``&=``.
	        valid &= mask[::sample_step, ::sample_step].astype(bool)

	    # Row-major indices of the surviving samples (same order as boolean
	    # indexing), without materialising a full meshgrid.
	    rows, cols = np.nonzero(valid)
	    u = u_idx[cols]
	    v = v_idx[rows]
	    Z = Z[rows, cols]

	    X = (u - cx) * Z / fx
	    Y = (v - cy) * Z / fy

	    points = np.stack([X, Y, Z], axis=-1).astype(np.float32)
	    return points


	def rgbd_to_point_cloud(
	    depth: np.ndarray,
	    rgb: np.ndarray,
	    focal_length: float,
	    principal_point: Optional[Tuple[float, float]] = None,
	    *,
	    mask: Optional[np.ndarray] = None,
	    sample_step: int = 1,
	) -> Tuple[np.ndarray, np.ndarray]:
	    """
	    Back-project an RGB-D pair into a coloured 3D point cloud.

	    Parameters
	    ----------
	    depth : np.ndarray
	        (H, W) metric depth map.
	    rgb : np.ndarray
	        (H, W, 3) uint8 RGB image.
	    focal_length : float
	        Estimated focal length in pixels.
	    principal_point : (cx, cy), optional
	        Defaults to image centre ``(W/2, H/2)``.
	    mask : np.ndarray, optional
	        (H, W) mask selecting pixels to keep; boolean and 0/1 integer
	        masks are both accepted.
	    sample_step : int, default 1
	        Spatial sub-sampling step.

	    Returns
	    -------
	    points : np.ndarray
	        (N, 3) float32 3D points, in row-major pixel order. Pixels whose
	        depth is zero, negative, NaN or infinite are dropped.
	    colors : np.ndarray
	        (N, 3) uint8 RGB colours aligned with points.

	    Raises
	    ------
	    ValueError
	        If ``depth`` is not 2D, ``rgb`` or ``mask`` does not match the
	        H×W of ``depth``, ``sample_step`` is smaller than 1, or
	        ``focal_length`` is zero or not finite.
	    """
	    # float32 matches depth_to_point_cloud, so both functions return the
	    # same points for the same input.
	    depth = np.asarray(depth, dtype=np.float32)
	    rgb = np.asarray(rgb)
	    if depth.ndim != 2:
	        raise ValueError(f"depth must be a 2D (H, W) array, got shape {depth.shape}")
	    if depth.shape[:2] != rgb.shape[:2]:
	        raise ValueError(
	            f"depth shape {depth.shape} and rgb shape {rgb.shape} must have same H×W"
	        )
	    H, W = depth.shape

	    # A step of 0 or below would give an empty grid or an obscure NumPy error.
	    if sample_step < 1:
	        raise ValueError(f"sample_step must be a positive integer, got {sample_step}")

	    fx = fy = float(focal_length)
	    if not np.isfinite(fx) or fx == 0.0:
	        raise ValueError(f"focal_length must be finite and non-zero, got {focal_length}")

	    if principal_point is None:
	        cx, cy = W / 2.0, H / 2.0
	    else:
	        cx, cy = float(principal_point[0]), float(principal_point[1])

	    # Strided views of every ``sample_step``-th pixel of depth and colour.
	    Z = depth[::sample_step, ::sample_step]
	    colors_sampled = rgb[::sample_step, ::sample_step]
	    u_idx = np.arange(0, W, sample_step)
	    v_idx = np.arange(0, H, sample_step)

	    # +inf passes a plain ``> 0`` test, so finiteness is checked explicitly.
	    valid = np.isfinite(Z) & (Z > 0.0)
	    if mask is not None:
	        mask = np.asarray(mask)
	        # Without this check a wrongly sized mask is silently mis-indexed.
	        if mask.shape != (H, W):
	            raise ValueError(f"mask shape {mask.shape} does not match depth shape {(H, W)}")
	        # Cast to bool so integer masks do not break the in-place ``&=``.
	        valid &= mask[::sample_step, ::sample_step].astype(bool)

	    # The same (row, col) pairs index depth and colour, keeping them aligned.
	    rows, cols = np.nonzero(valid)
	    u = u_idx[cols]
	    v = v_idx[rows]
	    Z = Z[rows, cols]
	    colors = colors_sampled[rows, cols]

	    X = (u - cx) * Z / fx
	    Y = (v - cy) * Z / fy

	    points = np.stack([X, Y, Z], axis=-1).astype(np.float32)
	    colors = np.asarray(colors, dtype=np.uint8)
	    return points, colors


	def normals_from_depth(
	    depth: np.ndarray,
	    focal_length: float,
	    principal_point: Optional[Tuple[float, float]] = None,
	) -> np.ndarray:
	    """
	    Compute per-pixel surface normals directly from the depth map.

	    Every pixel is back-projected to a 3D point with the pinhole model;
	    the normal is the normalised cross product of the point differences
	    along the image column (u) and row (v) directions. Central differences
	    are used in the interior and one-sided differences on the border.

	    This is a fast, approximate normal estimator that works well for
	    visualisation or as input to downstream surface-reconstruction methods
	    (e.g. Poisson, NKSR).

	    Parameters
	    ----------
	    depth : np.ndarray
	        (H, W) metric depth map.
	    focal_length : float
	        Focal length in pixels.
	    principal_point : (cx, cy), optional
	        Defaults to image centre ``(W/2, H/2)``.

	    Returns
	    -------
	    normals : np.ndarray
	        (H, W, 3) float32 array of unit normals.
	        ``normals[v, u]`` is the normal at pixel ``(u, v)``.
	        The normal is the zero vector where it cannot be estimated: the
	        pixel or one of its 4-neighbours has a non-positive or non-finite
	        depth, the cross product is degenerate, or the image is only one
	        pixel wide or high. No view-dependent re-orientation is applied;
	        with this construction a fronto-parallel surface gets ``(0, 0, 1)``.

	    Raises
	    ------
	    ValueError
	        If ``depth`` is not 2D or ``focal_length`` is zero or not finite.
	    """
	    depth = np.asarray(depth, dtype=np.float64)
	    if depth.ndim != 2:
	        raise ValueError(f"depth must be a 2D (H, W) array, got shape {depth.shape}")
	    H, W = depth.shape

	    fx = fy = float(focal_length)
	    if not np.isfinite(fx) or fx == 0.0:
	        raise ValueError(f"focal_length must be finite and non-zero, got {focal_length}")

	    if principal_point is None:
	        cx, cy = W / 2.0, H / 2.0
	    else:
	        cx, cy = float(principal_point[0]), float(principal_point[1])

	    # Zero out invalid depths so NaN/inf cannot spread through the
	    # arithmetic; the affected pixels are blanked at the end.
	    valid = np.isfinite(depth) & (depth > 0.0)
	    Z = np.where(valid, depth, 0.0)

	    # Back-project every pixel to a 3D point.
	    u, v = np.meshgrid(np.arange(W), np.arange(H))
	    X = (u - cx) * Z / fx
	    Y = (v - cy) * Z / fy
	    points = np.stack([X, Y, Z], axis=-1)

	    # Tangent vectors along the column (u) and row (v) directions. The
	    # interior difference spans two pixels and the border one pixel; the
	    # scale does not matter because only the direction survives below.
	    grad_x = np.zeros_like(points)
	    grad_y = np.zeros_like(points)
	    if W > 1:
	        grad_x[:, 0] = points[:, 1] - points[:, 0]
	        grad_x[:, -1] = points[:, -1] - points[:, -2]
	        grad_x[:, 1:-1] = points[:, 2:] - points[:, :-2]
	    if H > 1:
	        grad_y[0, :] = points[1, :] - points[0, :]
	        grad_y[-1, :] = points[-1, :] - points[-2, :]
	        grad_y[1:-1, :] = points[2:, :] - points[:-2, :]

	    # Cross product for normal
	    normals = np.cross(grad_x, grad_y)

	    # Normalise. The cross-product magnitude scales like (Z / f)^2, which is
	    # legitimately tiny for near surfaces and long focal lengths, so only
	    # (numerically) exact zeros count as degenerate. ``where=`` skips the
	    # 0/0 division.
	    norm = np.linalg.norm(normals, axis=-1, keepdims=True)
	    unit = np.divide(normals, norm, out=np.zeros_like(normals), where=norm > 1e-20)

	    # A normal is only meaningful if the pixel and all of its existing
	    # 4-neighbours carry a valid depth.
	    usable = valid.copy()
	    usable[:, 1:] &= valid[:, :-1]
	    usable[:, :-1] &= valid[:, 1:]
	    usable[1:, :] &= valid[:-1, :]
	    usable[:-1, :] &= valid[1:, :]
	    unit[~usable] = 0.0

	    return unit.astype(np.float32)