CHAPTER 6 4 / 10

3D 포인트 클라우드 처리

PointNet, 물체 분할, 6D 포즈 추정을 위한 3D 딥러닝

1. 3D 센서와 포인트 클라우드

로봇 비전에서 3D 정보는 물체의 위치와 자세를 정확히 파악하는 데 필수적입니다. 다양한 3D 센서에서 생성되는 포인트 클라우드(Point Cloud)는 공간 상의 점들의 집합으로, (x, y, z) 좌표와 선택적으로 색상, 법선 정보를 포함합니다.

3D Point Cloud Processing Pipeline

Stereo Camera | Structured Light | ToF Sensor | LiDAR Scanner
▼
Raw Point Cloud (x, y, z, RGB)
▼
Preprocessing Pipeline: PassThru Filter → Voxel Downsamp → Outlier Removal → Normal Estimate
▼
Deep Learning Analysis: Plane Segment (RANSAC) → Instance Segment (PointNet) → 6D Pose Estimate (DenseFusion)

센서 타입	원리	정밀도	특징
스테레오 카메라	삼각측량	~1mm	텍스처 필요, 가성비 좋음
구조광 (Structured Light)	패턴 투사	~0.1mm	고정밀, 반사면 취약
ToF (Time of Flight)	비행시간	~5mm	넓은 범위, 속도 빠름
LiDAR	레이저 스캔	~2mm	장거리, 야외용

2. 포인트 클라우드 전처리

원시 포인트 클라우드는 노이즈, 이상치, 과도한 밀도 등의 문제가 있어 전처리가 필요합니다.

from collections import deque
from typing import List

import numpy as np
import open3d as o3d
from scipy.spatial import KDTree

class PointCloudPreprocessor:
    """Point cloud preprocessing pipeline.

    Runs four stages in order: pass-through (Z range) filter, voxel-grid
    downsampling, statistical outlier removal and normal estimation.

    Args:
        voxel_size: Edge length of the voxel grid used for downsampling.
        statistical_k: Number of neighbours used by the statistical
            outlier filter.
        statistical_std: Standard-deviation ratio used by the statistical
            outlier filter.
        z_min: Lower Z bound (inclusive) of the region of interest used by
            ``process``.
        z_max: Upper Z bound (inclusive) of the region of interest used by
            ``process``.
        normal_radius: Search radius for normal estimation.
        normal_max_nn: Maximum number of neighbours for normal estimation.

    All lengths are expressed in the unit of the input coordinates.
    """

    def __init__(self, voxel_size: float = 0.005,
                 statistical_k: int = 20,
                 statistical_std: float = 2.0,
                 z_min: float = 0.01,
                 z_max: float = 1.5,
                 normal_radius: float = 0.02,
                 normal_max_nn: int = 30):
        self.voxel_size = voxel_size
        self.statistical_k = statistical_k
        self.statistical_std = statistical_std
        # Previously hard-coded inside the individual stages; the defaults
        # reproduce the former behaviour.
        self.z_min = z_min
        self.z_max = z_max
        self.normal_radius = normal_radius
        self.normal_max_nn = normal_max_nn

    def process(self, points: np.ndarray,
                colors: np.ndarray = None) -> o3d.geometry.PointCloud:
        """Run the full preprocessing pipeline.

        Args:
            points: Array of XYZ coordinates, one row per point.
            colors: Optional per-point colours, one row per point.
                NOTE(review): Open3D conventionally expects RGB in [0, 1];
                confirm against the caller.

        Returns:
            The filtered, downsampled point cloud with normals estimated.
        """
        # Vector3dVector is backed by float64; convert once up front so
        # float32 sensor buffers are accepted as well.
        pcd = o3d.geometry.PointCloud()
        pcd.points = o3d.utility.Vector3dVector(
            np.asarray(points, dtype=np.float64)
        )
        if colors is not None:
            pcd.colors = o3d.utility.Vector3dVector(
                np.asarray(colors, dtype=np.float64)
            )

        # 1. Pass-through filter (ROI extraction)
        pcd = self._passthrough_filter(pcd, self.z_min, self.z_max)

        # 2. Downsampling
        pcd = self._voxel_downsample(pcd)

        # 3. Outlier removal
        pcd = self._remove_outliers(pcd)

        # 4. Normal estimation
        pcd = self._estimate_normals(pcd)

        return pcd

    def _passthrough_filter(self, pcd,
                           z_min: float = 0.01,
                           z_max: float = 1.5) -> o3d.geometry.PointCloud:
        """Keep only the points whose Z coordinate lies in [z_min, z_max]."""
        z_values = np.asarray(pcd.points)[:, 2]
        keep = (z_values >= z_min) & (z_values <= z_max)
        return pcd.select_by_index(np.flatnonzero(keep))

    def _voxel_downsample(self, pcd) -> o3d.geometry.PointCloud:
        """Downsample the cloud on a voxel grid of size ``self.voxel_size``."""
        return pcd.voxel_down_sample(voxel_size=self.voxel_size)

    def _remove_outliers(self, pcd) -> o3d.geometry.PointCloud:
        """Remove statistical outliers and return the inlier cloud."""
        # The second return value (inlier indices) is not needed here.
        inlier_cloud, _ = pcd.remove_statistical_outlier(
            nb_neighbors=self.statistical_k,
            std_ratio=self.statistical_std
        )
        return inlier_cloud

    def _estimate_normals(self, pcd) -> o3d.geometry.PointCloud:
        """Estimate normals in place and orient them towards the origin.

        The input cloud is modified and also returned for chaining.
        """
        pcd.estimate_normals(
            search_param=o3d.geometry.KDTreeSearchParamHybrid(
                radius=self.normal_radius, max_nn=self.normal_max_nn
            )
        )
        # Make all normals point towards the camera, taken to sit at the
        # origin of the cloud's coordinate frame.
        pcd.orient_normals_towards_camera_location(
            camera_location=np.array([0, 0, 0])
        )
        return pcd


class PlaneSegmentation:
    """RANSAC-based plane segmentation."""

    def segment_plane(self, pcd: o3d.geometry.PointCloud,
                     distance_threshold: float = 0.01,
                     ransac_n: int = 3,
                     num_iterations: int = 1000):
        """
        Segment the dominant plane (e.g. to remove a table or the floor).

        Args:
            pcd: Cloud to search.
            distance_threshold: Maximum point-to-plane distance for a point
                to count as an inlier.
            ransac_n: Number of points sampled per RANSAC hypothesis.
            num_iterations: Number of RANSAC iterations.

        Returns:
            plane_model: Plane equation coefficients [a, b, c, d]
            inliers: Indices of the points that belong to the plane
        """
        ransac_options = {
            "distance_threshold": distance_threshold,
            "ransac_n": ransac_n,
            "num_iterations": num_iterations,
        }
        model, plane_indices = pcd.segment_plane(**ransac_options)
        return model, plane_indices

    def remove_plane(self, pcd: o3d.geometry.PointCloud,
                    distance_threshold: float = 0.01):
        """Return the cloud with the dominant plane's points removed."""
        plane_indices = self.segment_plane(pcd, distance_threshold)[1]
        # invert=True selects everything that is NOT a plane inlier.
        remainder = pcd.select_by_index(plane_indices, invert=True)
        return remainder

3. PointNet과 PointNet++

PointNet은 순서가 없는(unordered) 점 집합을 직접 입력으로 받아, 점의 입력 순서에 불변(permutation invariant)인 출력을 내는 최초의 딥러닝 아키텍처입니다. PointNet++는 계층적 학습을 추가하여 지역 구조를 더 잘 캡처합니다.

import torch
import torch.nn as nn
import torch.nn.functional as F

class PointNetEncoder(nn.Module):
    """PointNet encoder producing a global feature and per-point features."""

    def __init__(self, input_channels=3, output_channels=1024):
        super().__init__()

        # T-Net: learned transform applied to the raw input channels.
        self.input_transform = TNet(k=input_channels)

        # Shared per-point MLP: input_channels -> 64 -> 64.
        self.mlp1 = self._shared_mlp(input_channels, 64, 64)

        # T-Net: learned transform applied to the 64-d point features.
        self.feature_transform = TNet(k=64)

        # Shared per-point MLP: 64 -> 128 -> output_channels.
        self.mlp2 = self._shared_mlp(64, 128, output_channels)

    @staticmethod
    def _shared_mlp(in_dim, hidden_dim, out_dim):
        """Build a two-layer pointwise (kernel size 1) Conv-BN-ReLU stack."""
        return nn.Sequential(
            nn.Conv1d(in_dim, hidden_dim, 1),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Conv1d(hidden_dim, out_dim, 1),
            nn.BatchNorm1d(out_dim),
            nn.ReLU(),
        )

    def forward(self, x):
        """
        Args:
            x: (B, N, C) point cloud
        Returns:
            global_feat: (B, output_channels) global feature
            point_feat: (B, 64, N) per-point features taken after the
                first MLP
        """
        # Unpacking doubles as a rank check: anything but a 3-D tensor
        # raises here.
        _batch, _num_points, _channels = x.shape
        channels_first = x.transpose(2, 1)  # (B, C, N)

        # Input transform: multiply every point by the learned C x C matrix.
        input_matrix = self.input_transform(channels_first)
        points_last = channels_first.transpose(2, 1)  # (B, N, C)
        aligned = torch.bmm(points_last, input_matrix).transpose(2, 1)

        # First shared MLP
        point_feat = self.mlp1(aligned)  # (B, 64, N)

        # Feature transform: same idea in the 64-d feature space.
        feature_matrix = self.feature_transform(point_feat)
        aligned_feat = torch.bmm(
            point_feat.transpose(2, 1), feature_matrix
        ).transpose(2, 1)

        # Second shared MLP
        per_point = self.mlp2(aligned_feat)  # (B, output_channels, N)

        # Max pooling over the point axis makes the result independent of
        # the order of the points.
        global_feat = per_point.max(dim=2).values  # (B, output_channels)

        return global_feat, point_feat


class TNet(nn.Module):
    """Transformation network that regresses a k x k alignment matrix.

    The network predicts an offset that is added to the identity matrix.

    NOTE: the BatchNorm1d layers in ``fc`` receive (B, C) input, which
    PyTorch rejects in training mode when B == 1; use a batch size of at
    least 2 for training or switch to ``eval()``.
    """

    def __init__(self, k=3):
        super().__init__()
        self.k = k

        # Pointwise feature extractor: k -> 64 -> 128 -> 1024.
        self.conv = nn.Sequential(
            nn.Conv1d(k, 64, 1),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Conv1d(64, 128, 1),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Conv1d(128, 1024, 1),
            nn.BatchNorm1d(1024),
            nn.ReLU()
        )

        # Regressor from the pooled feature to the k*k matrix entries.
        self.fc = nn.Sequential(
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Linear(256, k * k)
        )

    def forward(self, x):
        """
        Args:
            x: (B, k, N) channels-first point features
        Returns:
            (B, k, k) transformation matrices
        """
        batch_size = x.shape[0]

        pooled = torch.max(self.conv(x), dim=2)[0]  # (B, 1024)
        offsets = self.fc(pooled).view(batch_size, self.k, self.k)

        # Add the identity in the SAME dtype as the prediction. A default
        # float32 identity would promote a half-precision output to
        # float32 and break the caller's torch.bmm on mixed dtypes.
        identity = torch.eye(
            self.k, device=offsets.device, dtype=offsets.dtype
        )
        return offsets + identity


class PointNetPlusPlusSA(nn.Module):
    """PointNet++ set-abstraction layer.

    Samples ``npoint`` centroids with farthest point sampling, groups up to
    ``nsample`` neighbours inside a ball of ``radius`` around each centroid,
    and runs a shared MLP followed by max pooling on every group.

    Args:
        npoint: Number of centroids to sample.
        radius: Ball-query radius.
        nsample: Maximum number of neighbours per centroid.
        in_channel: Number of feature channels per point (0 when the layer
            is fed coordinates only).
        mlp: Output widths of the shared MLP layers.
    """

    def __init__(self, npoint, radius, nsample, in_channel, mlp):
        super().__init__()
        self.npoint = npoint
        self.radius = radius
        self.nsample = nsample

        # Shared MLP implemented as 1x1 2-D convolutions over
        # (centroid, neighbour) positions.
        mlp_layers = []
        last_channel = in_channel + 3  # relative XYZ is prepended
        for out_channel in mlp:
            mlp_layers.extend([
                nn.Conv2d(last_channel, out_channel, 1),
                nn.BatchNorm2d(out_channel),
                nn.ReLU()
            ])
            last_channel = out_channel
        self.mlp = nn.Sequential(*mlp_layers)

    def forward(self, xyz, points):
        """
        Args:
            xyz: (B, N, 3) coordinates
            points: (B, N, C) features (may be None)
        Returns:
            new_xyz: (B, npoint, 3)
            new_points: (B, npoint, mlp[-1])
        """
        # Farthest point sampling
        new_xyz = self._farthest_point_sample(xyz, self.npoint)

        # Group neighbours with a ball query
        idx = self._ball_query(xyz, new_xyz, self.radius, self.nsample)

        # Grouped coordinates, expressed relative to their centroid.
        grouped_xyz = self._index_points(xyz, idx)  # (B, npoint, K, 3)
        grouped_xyz = grouped_xyz - new_xyz.unsqueeze(2)

        # Concatenate the features, if any
        if points is not None:
            grouped_points = self._index_points(points, idx)
            new_points = torch.cat([grouped_xyz, grouped_points], dim=-1)
        else:
            new_points = grouped_xyz

        # Rearrange to (B, C, npoint, K) for Conv2d
        new_points = new_points.permute(0, 3, 1, 2)

        # Apply the shared MLP
        new_points = self.mlp(new_points)

        # Max pooling over the neighbours
        new_points = torch.max(new_points, dim=3)[0]  # (B, C, npoint)
        new_points = new_points.permute(0, 2, 1)  # (B, npoint, C)

        return new_xyz, new_points

    def _index_points(self, points, idx):
        """Gather rows of ``points`` (B, N, C) at ``idx`` (B, S) or (B, S, K).

        Returns a tensor of shape ``idx.shape + (C,)``.
        """
        batch = points.shape[0]
        # One batch index per leading entry, broadcast against idx.
        batch_shape = [batch] + [1] * (idx.dim() - 1)
        batch_idx = torch.arange(batch, device=points.device).view(batch_shape)
        return points[batch_idx, idx]

    def _farthest_point_sample(self, xyz, npoint):
        """Pick ``npoint`` mutually distant points from ``xyz`` (B, N, 3).

        Sampling starts from index 0 of every cloud, so the result is
        deterministic. Returns the sampled coordinates, (B, npoint, 3).
        """
        batch, n_src, _ = xyz.shape
        device = xyz.device
        # Index selection is not differentiable; skip autograd tracking.
        with torch.no_grad():
            chosen = torch.zeros(batch, npoint, dtype=torch.long, device=device)
            # Distance from every point to its nearest already-chosen point.
            nearest = torch.full(
                (batch, n_src), float("inf"), dtype=xyz.dtype, device=device
            )
            farthest = torch.zeros(batch, dtype=torch.long, device=device)
            batch_idx = torch.arange(batch, device=device)
            for i in range(npoint):
                chosen[:, i] = farthest
                centroid = xyz[batch_idx, farthest].unsqueeze(1)  # (B, 1, 3)
                sq_dist = ((xyz - centroid) ** 2).sum(dim=-1)
                nearest = torch.minimum(nearest, sq_dist)
                farthest = nearest.argmax(dim=-1)
        return self._index_points(xyz, chosen)

    def _ball_query(self, xyz, new_xyz, radius, nsample):
        """Return neighbour indices of ``new_xyz`` within ``radius`` in ``xyz``.

        The result has shape (B, S, K) with K = min(nsample, N). Balls with
        fewer than K members are padded by repeating their first member.
        Every ball is assumed to be non-empty, which holds in ``forward``
        because each centroid is itself a point of ``xyz``.
        """
        batch, n_src, _ = xyz.shape
        n_query = new_xyz.shape[1]
        with torch.no_grad():
            # Exact (non-matmul) distances so a centroid is at distance 0
            # from itself.
            dist = torch.cdist(
                new_xyz, xyz, compute_mode="donot_use_mm_for_euclid_dist"
            )  # (B, S, N)
            candidates = torch.arange(n_src, device=xyz.device)
            candidates = candidates.view(1, 1, n_src).expand(batch, n_query, n_src)
            # n_src is a sentinel for "outside the ball"; it sorts last.
            outside = torch.full_like(candidates, n_src)
            candidates = torch.where(dist > radius, outside, candidates)
            candidates = candidates.sort(dim=-1)[0][:, :, :nsample]
            first = candidates[:, :, :1].expand_as(candidates)
            idx = torch.where(candidates == n_src, first, candidates)
        return idx

PointNet의 핵심은 Max Pooling입니다. 이를 통해 점의 순서에 무관하게 동일한 글로벌 피처를 추출할 수 있습니다.

4. 3D 물체 분할

포인트 클라우드에서 개별 물체를 분할하는 것은 빈 피킹의 핵심입니다. 클러스터링 기반 방법과 딥러닝 기반 인스턴스 세그멘테이션을 모두 활용합니다.

class PointCloudSegmentation:
    """3D point cloud segmentation by clustering and region growing."""

    def euclidean_clustering(self, pcd: o3d.geometry.PointCloud,
                            eps: float = 0.02,
                            min_points: int = 50) -> List[np.ndarray]:
        """
        Euclidean clustering based on DBSCAN.

        Args:
            pcd: Cloud to cluster.
            eps: Neighbour search radius.
            min_points: Minimum number of points per cluster.

        Returns:
            One array of point coordinates per cluster. Points labelled as
            noise (label -1) are dropped. An empty cloud yields ``[]``.
        """
        labels = np.asarray(pcd.cluster_dbscan(
            eps=eps,
            min_points=min_points,
            print_progress=False
        ))
        # labels.max() raises on an empty array, so bail out early.
        if labels.size == 0:
            return []

        points = np.asarray(pcd.points)
        clusters = []
        # Labels run from 0 to max; starting at 0 skips the noise label -1.
        for label in range(labels.max() + 1):
            member_idx = np.flatnonzero(labels == label)
            if len(member_idx) >= min_points:
                clusters.append(points[member_idx])

        return clusters

    def region_growing(self, pcd: o3d.geometry.PointCloud,
                      angle_threshold: float = 30.0,
                      curvature_threshold: float = 0.05,
                      search_radius: float = 0.02,
                      min_cluster_size: int = 50) -> List[np.ndarray]:
        """Normal-based region-growing segmentation.

        Starting from every unvisited point, the region grows breadth-first
        to neighbours within ``search_radius`` whose normal deviates from
        the current point's normal by less than ``angle_threshold``.

        Args:
            pcd: Cloud to segment. Normals are estimated in place when the
                cloud has none.
            angle_threshold: Maximum normal deviation in degrees.
            curvature_threshold: Currently unused; kept for interface
                compatibility.
            search_radius: Neighbour search radius.
            min_cluster_size: Regions smaller than this are discarded.

        Returns:
            One array of point coordinates per accepted region.

        NOTE(review): the angle test is signed, so normals with an
        inconsistent orientation will not merge; orient them first if
        that matters.
        """
        # Estimate normals when the cloud has none.
        if not pcd.has_normals():
            pcd.estimate_normals()

        points = np.asarray(pcd.points)
        normals = np.asarray(pcd.normals)
        n_points = len(points)
        if n_points == 0:
            return []

        kdtree = KDTree(points)

        visited = np.zeros(n_points, dtype=bool)
        clusters = []

        for seed_idx in range(n_points):
            if visited[seed_idx]:
                continue

            # Points are marked when enqueued, so each one enters the
            # queue at most once; the visiting order is unchanged.
            visited[seed_idx] = True
            cluster = []
            queue = deque([seed_idx])

            while queue:
                current_idx = queue.popleft()
                cluster.append(current_idx)

                # Neighbour search
                neighbors = np.asarray(
                    kdtree.query_ball_point(
                        points[current_idx], r=search_radius
                    ),
                    dtype=np.intp,
                )
                neighbors = neighbors[~visited[neighbors]]
                if neighbors.size == 0:
                    continue

                # Angle between the current normal and each candidate's
                # normal, computed for all candidates at once.
                cosines = np.clip(
                    normals[neighbors] @ normals[current_idx], -1.0, 1.0
                )
                angles = np.degrees(np.arccos(cosines))
                smooth = neighbors[angles < angle_threshold]

                visited[smooth] = True
                queue.extend(smooth.tolist())

            if len(cluster) >= min_cluster_size:
                clusters.append(points[cluster])

        return clusters


class InstanceSegmentationNet(nn.Module):
    """Deep-learning instance segmentation with semantic and embedding heads.

    Args:
        num_classes: Number of semantic classes (class 0 is treated as
            background by ``get_instances``).
        embed_dim: Dimension of the per-point instance embedding.
    """

    def __init__(self, num_classes: int, embed_dim: int = 64):
        super().__init__()

        # Backbone: PointNet++ encoder (defined outside this listing).
        self.encoder = PointNetPlusPlusEncoder()

        # Semantic segmentation head
        self.semantic_head = nn.Sequential(
            nn.Conv1d(128, 64, 1),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Conv1d(64, num_classes, 1)
        )

        # Instance embedding head
        self.embedding_head = nn.Sequential(
            nn.Conv1d(128, 64, 1),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Conv1d(64, embed_dim, 1)
        )

    def forward(self, xyz, features=None):
        """
        Args:
            xyz: Point coordinates passed to the encoder.
            features: Optional per-point features passed to the encoder.

        Returns:
            semantic_logits: (B, N, num_classes)
            embeddings: (B, N, embed_dim), L2-normalised per point
        """
        # Encode. Both heads expect 128 input channels, so the encoder is
        # assumed to return (B, 128, N) - TODO confirm against the encoder.
        point_features = self.encoder(xyz, features)

        # Semantic prediction
        semantic = self.semantic_head(point_features)  # (B, num_classes, N)
        semantic = semantic.transpose(2, 1)

        # Instance embedding
        embeddings = self.embedding_head(point_features)  # (B, embed_dim, N)
        embeddings = embeddings.transpose(2, 1)
        embeddings = F.normalize(embeddings, p=2, dim=-1)

        return semantic, embeddings

    def get_instances(self, embeddings, semantic_pred,
                     bandwidth: float = 0.5):
        """Extract instances by mean-shift clustering of the embeddings.

        Args:
            embeddings: Per-point embeddings whose leading dimensions match
                ``semantic_pred`` (e.g. (N, embed_dim) with an (N,) label
                tensor).
            semantic_pred: Predicted class id per point.
            bandwidth: Mean-shift kernel bandwidth.

        Returns:
            A list of dicts with keys ``'class'`` (int class id) and
            ``'indices'`` (NumPy array of indices into the flattened
            ``semantic_pred``). Class 0 and classes with fewer than 10
            points are skipped.
        """
        from sklearn.cluster import MeanShift

        instances = []
        unique_classes = torch.unique(semantic_pred)

        for cls in unique_classes:
            if cls == 0:  # skip background
                continue

            mask = semantic_pred == cls
            # detach(): .numpy() refuses tensors that require grad.
            cls_embeddings = embeddings[mask].detach().cpu().numpy()

            if len(cls_embeddings) < 10:
                continue

            # Move the mask to host memory before handing it to NumPy;
            # np.where on a CUDA tensor fails. flatnonzero follows the
            # same row-major order as embeddings[mask].
            point_idx = np.flatnonzero(mask.detach().cpu().numpy())

            # Mean-shift clustering
            ms = MeanShift(bandwidth=bandwidth)
            cluster_labels = ms.fit_predict(cls_embeddings)

            for inst_id in np.unique(cluster_labels):
                inst_mask = cluster_labels == inst_id
                instances.append({
                    'class': cls.item(),
                    'indices': point_idx[inst_mask]
                })

        return instances

5. 6D 포즈 추정

6D 포즈 추정은 물체의 위치(x, y, z)와 자세(roll, pitch, yaw)를 예측하는 것으로, 로봇 그래스핑에 필수적입니다.

class PoseEstimator6D(nn.Module):
    """DenseFusion-style 6D pose estimator.

    Fuses a global point-cloud feature with per-point RGB features and
    predicts one pose hypothesis (quaternion, translation, confidence) per
    point through an object-specific head.

    ``forward`` relies on ``_depth_to_points`` and ``_sample_rgb_features``,
    which are not defined in this listing.

    Args:
        num_objects: Number of object types; one ``PoseHead`` each.
        num_points: Nominal number of points per object.
    """

    def __init__(self, num_objects: int, num_points: int = 1000):
        super().__init__()
        self.num_objects = num_objects
        self.num_points = num_points

        # RGB feature extractor (PSPNet style)
        self.rgb_encoder = RGBFeatureExtractor()

        # Point cloud feature extractor
        self.point_encoder = PointNetEncoder(
            input_channels=3, output_channels=512
        )

        # Feature fusion
        self.fusion = nn.Sequential(
            nn.Conv1d(512 + 256, 512, 1),  # Point + RGB
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Conv1d(512, 256, 1),
            nn.BatchNorm1d(256),
            nn.ReLU()
        )

        # Pose prediction heads (one per object)
        self.pose_heads = nn.ModuleList([
            PoseHead() for _ in range(num_objects)
        ])

    def forward(self, rgb, depth, mask, obj_id):
        """
        Args:
            rgb: (B, 3, H, W) RGB image
            depth: (B, 1, H, W) depth image
            mask: (B, 1, H, W) object mask
            obj_id: Index of the object whose pose head is used

        Returns:
            rotation: (B, N, 4) quaternions
            translation: (B, N, 3) translation vectors
            confidence: (B, N) confidence scores
            where N is the number of points extracted from the depth map.
        """
        # Extract the point cloud
        points = self._depth_to_points(depth, mask)  # (B, N, 3)

        # Global point-cloud feature, tiled over the points. Tile to the
        # actual point count rather than self.num_points so concatenation
        # with the per-point RGB features works whenever the two differ.
        point_feat, _ = self.point_encoder(points)  # (B, 512)
        num_pts = points.shape[1]
        point_feat = point_feat.unsqueeze(-1).expand(-1, -1, num_pts)

        # RGB feature extraction and sampling
        rgb_feat = self.rgb_encoder(rgb)  # (B, 256, H', W')
        rgb_feat = self._sample_rgb_features(rgb_feat, points, mask)  # (B, 256, N)

        # Feature fusion
        fused = torch.cat([point_feat, rgb_feat], dim=1)  # (B, 768, N)
        fused = self.fusion(fused)  # (B, 256, N)

        # Pose prediction
        pose_head = self.pose_heads[obj_id]
        rotation, translation, confidence = pose_head(fused, points)

        return rotation, translation, confidence

    def predict_pose(self, rotation, translation, confidence):
        """Fuse the per-point hypotheses into one pose (weighted average).

        Args:
            rotation: (B, N, 4) quaternions
            translation: (B, N, 3) translation vectors
            confidence: (B, N) confidence scores

        Returns:
            pred_q: (B, 4) unit quaternion
            pred_t: (B, 3) translation
        """
        # Confidence-based softmax weights
        weights = F.softmax(confidence, dim=1).unsqueeze(-1)  # (B, N, 1)

        # Weighted-average translation
        pred_t = torch.sum(weights * translation, dim=1)  # (B, 3)

        # q and -q encode the same rotation, so a plain average can cancel
        # out. Flip every quaternion into the hemisphere of the most
        # confident one before averaging.
        batch_idx = torch.arange(rotation.shape[0], device=rotation.device)
        best = confidence.argmax(dim=1)
        reference = rotation[batch_idx, best].unsqueeze(1)  # (B, 1, 4)
        dots = (rotation * reference).sum(dim=-1, keepdim=True)
        signs = torch.where(
            dots < 0, -torch.ones_like(dots), torch.ones_like(dots)
        )

        # Weighted-average quaternion
        pred_q = torch.sum(weights * rotation * signs, dim=1)  # (B, 4)
        pred_q = F.normalize(pred_q, p=2, dim=-1)

        return pred_q, pred_t


class PoseHead(nn.Module):
    """Pose prediction head for a single object.

    Three parallel pointwise branches predict, for every point, a unit
    quaternion, a translation vector and a confidence score.
    """

    def __init__(self, feat_dim=256):
        super().__init__()

        # Every branch consumes the fused feature plus the XYZ coordinates.
        branch_input = feat_dim + 3

        # Rotation (quaternion), translation and confidence branches.
        self.rotation_mlp = self._branch(branch_input, 4)
        self.translation_mlp = self._branch(branch_input, 3)
        self.confidence_mlp = self._branch(branch_input, 1)

    @staticmethod
    def _branch(in_dim, out_dim):
        """Build one Conv-BN-ReLU-Conv branch with a 128-wide hidden layer."""
        return nn.Sequential(
            nn.Conv1d(in_dim, 128, 1),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Conv1d(128, out_dim, 1),
        )

    def forward(self, features, points):
        """
        Args:
            features: (B, feat_dim, N)
            points: (B, N, 3)

        Returns:
            rotation: (B, N, 4) unit quaternions
            translation: (B, N, 3)
            confidence: (B, N)
        """
        # Stack coordinates under the features: (B, feat_dim + 3, N)
        stacked = torch.cat((features, points.transpose(2, 1)), dim=1)

        # Per-point quaternion, normalised to unit length
        quat = self.rotation_mlp(stacked).transpose(2, 1)  # (B, N, 4)
        quat = F.normalize(quat, p=2, dim=-1)

        shift = self.translation_mlp(stacked).transpose(2, 1)  # (B, N, 3)
        score = self.confidence_mlp(stacked).squeeze(1)  # (B, N)

        return quat, shift, score

6D 포즈 추정의 정확도를 높이려면 CAD 모델과의 ICP 정합을 후처리로 추가하세요. DenseFusion + ICP 조합이 가장 안정적입니다.

6. 산업 응용과 성능 최적화

실제 산업 환경에서 3D 포인트 클라우드 처리를 적용할 때의 고려사항입니다.

최적화 항목	방법	효과
추론 속도	TensorRT, 포인트 샘플링	10x 가속
메모리 사용	다운샘플링, 16-bit 양자화	50% 절감
정밀도	다단계 정합 (Coarse-to-Fine)	±0.5mm 달성
강건성	데이터 증강, 노이즈 학습	불량률 3% 이하

3D 센서는 반사면(금속, 유리)에서 노이즈가 발생합니다. 문제 발생 시 편광 필터, 매트 코팅, 또는 멀티뷰 촬영을 고려하세요.