Semantic Labels#

Semantic labels are USD SemanticsAPI metadata used by annotation outputs such as semantic segmentation. In ovrtx, labels can be authored directly in USD or layered over an existing scene with inline USDA.

Author Labels in USDA#

Base Scene

# Semantic class labels for two prims under /World.
# `over` only contributes opinions; the prims themselves must be defined by
# another layer in the composition.
over "World"
{
    # Apply the SemanticsAPI schema instance named "class" to /World/logo.
    over "logo" (
        prepend apiSchemas = ["SemanticsAPI:class"]
    )
    {
        # semanticData holds the label value; semanticType holds the label's type.
        string semantic:class:params:semanticData = "logo"
        string semantic:class:params:semanticType = "class"
    }

    # Same pattern for /World/Plane, labeled as class "ground".
    over "Plane" (
        prepend apiSchemas = ["SemanticsAPI:class"]
    )
    {
        string semantic:class:params:semanticData = "ground"
        string semantic:class:params:semanticType = "class"
    }
}

Override Layer

# Layer metadata: compose semantic_label_base.usda as a sublayer of this layer.
# Opinions authored in this layer are stronger than those in its sublayers.
(
    subLayers = [
        @semantic_label_base.usda@
    ]
)

over "World"
{
    # Two SemanticsAPI instances on one prim: the "class" instance carries the
    # category shared by both robots ("robot"), and the "label" instance carries
    # a per-prim name.
    over "robot_alpha" (
        prepend apiSchemas = ["SemanticsAPI:label", "SemanticsAPI:class"]
    )
    {
        string semantic:class:params:semanticData = "robot"
        string semantic:class:params:semanticType = "class"
        string semantic:label:params:semanticData = "robot_alpha"
        string semantic:label:params:semanticType = "label"
    }

    # Same class as robot_alpha, with its own "label" instance value.
    over "robot_beta" (
        prepend apiSchemas = ["SemanticsAPI:label", "SemanticsAPI:class"]
    )
    {
        string semantic:class:params:semanticData = "robot"
        string semantic:class:params:semanticType = "class"
        string semantic:label:params:semanticData = "robot_beta"
        string semantic:label:params:semanticType = "label"
    }
}

Runtime Overrides#

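Use ordinary attribute writes to update semantic class and label metadata after the stage is loaded. The setup below loads an inline root layer that sublayers the semantic label layer with the scene layer and declares a render product with the SemanticSegmentation and SemanticIdMap render vars.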

Python

# Inline USDA root layer used by the semantic segmentation example.
# - subLayers composes SEMANTIC_LABELS_PATH and TEST_BASE_PATH; the labels layer
#   is listed first, and USD orders sublayers strongest first.
# - /Render/SemanticCamera is a RenderProduct that targets </World/Camera> and
#   orders two render vars: SemanticSegmentation and SemanticIdMap.
# - Doubled braces ({{ and }}) are f-string escapes that produce literal USDA
#   braces; single-brace fields are substituted when this assignment is evaluated.
SEMANTIC_CLASS_USDA = f"""#usda 1.0
(
    subLayers = [
        @{SEMANTIC_LABELS_PATH}@,
        @{TEST_BASE_PATH}@
    ]
)

def "Render"
{{
    def RenderProduct "SemanticCamera"
    {{
        int2 resolution = {RESOLUTION}
        rel camera = </World/Camera>
        rel orderedVars = [<SemanticSegmentation>, <SemanticIdMap>]

        def RenderVar "SemanticSegmentation"
        {{
            string sourceName = "SemanticSegmentation"
        }}

        def RenderVar "SemanticIdMap"
        {{
            string sourceName = "SemanticIdMap"
        }}
    }}
}}
"""

C

// Build an inline USDA root layer as a string. The layer sublayers the semantic
// label file ahead of the scene file (USD orders sublayers strongest first) and
// defines /Render/SemanticCamera, a RenderProduct that targets </World/Camera>
// at 1280x720 and orders the SemanticSegmentation and SemanticIdMap render vars.
std::string usda = "#usda 1.0\n"
                   "(\n"
                   "    subLayers = [\n"
                   "        @" + semantic_labels_path + "@,\n"
                   "        @" + scene_path + "@\n"
                   "    ]\n"
                   ")\n\n"
                   "def \"Render\"\n"
                   "{\n"
                   "    def RenderProduct \"SemanticCamera\"\n"
                   "    {\n"
                   "        int2 resolution = (1280, 720)\n"
                   "        rel camera = </World/Camera>\n"
                   "        rel orderedVars = [<SemanticSegmentation>, <SemanticIdMap>]\n"
                   "\n"
                   "        def RenderVar \"SemanticSegmentation\"\n"
                   "        {\n"
                   "            string sourceName = \"SemanticSegmentation\"\n"
                   "        }\n"
                   "\n"
                   "        def RenderVar \"SemanticIdMap\"\n"
                   "        {\n"
                   "            string sourceName = \"SemanticIdMap\"\n"
                   "        }\n"
                   "    }\n"
                   "}\n";

// Enqueue opening the inline layer; the text is passed as a (pointer, length) pair.
ovrtx_enqueue_result_t enqueue_result =
    ovrtx_open_usd_from_string(renderer, {usda.c_str(), usda.size()});
ASSERT_API_SUCCESS(enqueue_result.status);
// NOTE(review): docs_wait_no_errors is defined outside this snippet; presumably it
// waits for the enqueued operation identified by op_index and fails on errors -- confirm.
docs_wait_no_errors(renderer, enqueue_result.op_index);

Interpreting Segmentation Output#

Semantic segmentation output maps each pixel to a numeric semantic ID. Use the SemanticIdMap render var from the same frame to map those IDs back to their labels.

Python

def _map_render_var(frame, name: str) -> np.ndarray:
    """Return an independent NumPy copy of the render var ``name`` in ``frame``.

    The render var is mapped with the CPU device, viewed through DLPack, and
    copied. The DLPack view and the mapping object are dropped before returning,
    so only the copy leaves this function.
    """
    mapping = frame.render_vars[name].map(device=ovrtx.Device.CPU)
    dlpack_view = np.from_dlpack(mapping)
    owned_copy = dlpack_view.copy()
    # Drop the view before the mapping it was created from.
    del dlpack_view, mapping
    return owned_copy


def _decode_semantic_id_map(tensor: np.ndarray) -> dict[int, str]:
    """Decode a SemanticIdMap buffer into a ``{semantic_id: label}`` dictionary.

    The buffer is read as raw bytes. Its last four bytes hold a little-endian
    entry count. The entry table starts at byte 0; each entry is six
    little-endian ``uint32`` words: four ``id`` words (only the first is used),
    then ``label_length`` and ``label_offset``. Offsets are relative to the start
    of the buffer. Labels are decoded as UTF-8 with trailing NULs and then
    trailing whitespace removed.

    Returns an empty dict when the buffer is shorter than four bytes. Raises
    ``AssertionError`` when the entry table or a label range does not fit.
    """
    raw = np.ascontiguousarray(tensor).view(np.uint8).reshape(-1)
    total_bytes = raw.size
    if total_bytes < 4:
        return {}

    record_dtype = np.dtype(
        [("id", "<u4", (4,)), ("label_length", "<u4"), ("label_offset", "<u4")]
    )
    count = int.from_bytes(raw[-4:].tobytes(), byteorder="little")
    table_bytes = count * record_dtype.itemsize
    # The table must fit in front of the trailing count word.
    assert table_bytes <= total_bytes - 4

    records = raw[:table_bytes].view(record_dtype).reshape(count)
    # Pull each column out as plain Python ints so offset + length cannot wrap.
    first_id_words = records["id"][:, 0].tolist()
    lengths = records["label_length"].tolist()
    offsets = records["label_offset"].tolist()

    decoded = {}
    for semantic_id, length, offset in zip(first_id_words, lengths, offsets):
        end = offset + length
        assert end <= total_bytes

        text = raw[offset:end].tobytes().decode("utf-8")
        decoded[semantic_id] = text.rstrip("\x00").rstrip()

    return decoded


def test_semantic_class_labels_are_rendered(renderer):
    """Render the labeled scene and check that both class labels reach the output.

    Loads ``SEMANTIC_CLASS_USDA``, steps the renderer, and verifies that the IDs
    the SemanticIdMap assigns to ``class: logo;`` and ``class: ground;`` both
    occur in the SemanticSegmentation image.
    """

    def advance():
        return renderer.step(render_products={RENDER_PRODUCT_PATH}, delta_time=1.0 / 60.0)

    renderer.open_usd_from_string(SEMANTIC_CLASS_USDA)

    # Five steps whose results are discarded, followed by the step that is inspected.
    for _ in range(5):
        advance()
    frame = advance()[RENDER_PRODUCT_PATH].frames[0]

    # Invert the id -> label table so the expected labels can be looked up by name.
    id_to_label = _decode_semantic_id_map(_map_render_var(frame, "SemanticIdMap"))
    label_to_id = {label: semantic_id for semantic_id, label in id_to_label.items()}
    expected_ids = (label_to_id["class: logo;"], label_to_id["class: ground;"])

    segmentation = np.squeeze(_map_render_var(frame, "SemanticSegmentation"))
    present_ids = {int(value) for value in np.unique(segmentation)}

    for expected in expected_ids:
        assert expected in present_ids
    for expected in expected_ids:
        assert np.count_nonzero(segmentation == expected) > 0

C

/// Decodes a SemanticIdMap tensor into a map from semantic id to label text.
///
/// Layout as read by this function: the last 4 bytes of the buffer hold a
/// little-endian entry count. The entry table starts at byte 0 and each entry is
/// six 32-bit words: the id is read from word 0, the label length from word 4,
/// and the label offset (relative to the start of the buffer) from word 5.
///
/// @param tensor Tensor whose bytes contain the id map.
/// @return Labels keyed by semantic id. Empty when the buffer is too small to
///         hold the count or the table does not fit; entries whose label range
///         falls outside the buffer are skipped.
static std::map<uint32_t, std::string> decode_semantic_id_map(DLTensor const& tensor) {
    constexpr size_t kWordSize = sizeof(uint32_t);
    constexpr size_t kEntrySize = kWordSize * 6;

    auto const* bytes = static_cast<uint8_t const*>(tensor.data) + tensor.byte_offset;
    size_t const total_bytes = tensor_byte_size(tensor);

    std::map<uint32_t, std::string> result;
    if (total_bytes < kWordSize) {
        return result;
    }

    // The entry count lives in the trailing word; everything before it may hold the table.
    size_t const table_capacity = total_bytes - kWordSize;
    uint32_t const entry_count = read_u32_le(bytes + table_capacity);
    if (static_cast<size_t>(entry_count) * kEntrySize > table_capacity) {
        return result;
    }

    for (uint32_t index = 0; index < entry_count; ++index) {
        uint8_t const* record = bytes + static_cast<size_t>(index) * kEntrySize;
        uint32_t const id = read_u32_le(record);
        uint32_t const length = read_u32_le(record + kWordSize * 4);
        uint32_t const offset = read_u32_le(record + kWordSize * 5);

        // Widen before adding so offset + length cannot wrap in 32 bits.
        size_t const end = static_cast<size_t>(offset) + length;
        if (end <= total_bytes) {
            std::string text(reinterpret_cast<char const*>(bytes + offset),
                             static_cast<size_t>(length));
            trim_semantic_label(text);
            result[id] = text;
        }
    }

    return result;
}

/// Collects the distinct pixel values of a SemanticSegmentation tensor.
///
/// The tensor is expected to hold single-lane 32-bit integers (signed or
/// unsigned) with rank 2, or rank 3 with a trailing dimension of 1. shape[0] is
/// treated as the height and shape[1] as the width. Strides, when present, are
/// used as element counts, which is the DLPack convention.
///
/// The EXPECT_* checks are non-fatal, so execution continues after a failure.
/// When the rank or dtype expectation fails, the function returns early:
/// indexing shape[1] / strides[1] or reading 32-bit values would otherwise
/// touch memory outside the tensor.
///
/// @param tensor Segmentation tensor to scan.
/// @return Set of semantic ids found; empty when the tensor layout is unsupported
///         (a test failure has been recorded in that case).
static std::set<uint32_t> collect_semantic_segmentation_ids(DLTensor const& tensor) {
    std::set<uint32_t> ids;

    EXPECT_EQ(tensor.dtype.bits, 32);
    EXPECT_EQ(tensor.dtype.lanes, 1);
    EXPECT_TRUE(tensor.ndim == 2 || tensor.ndim == 3);

    bool const has_channel_axis = tensor.ndim == 3;
    bool const rank_ok = tensor.ndim == 2 || has_channel_axis;
    bool const element_ok = tensor.dtype.bits == 32 && tensor.dtype.lanes == 1;
    if (!rank_ok || !element_ok) {
        // Failure already recorded above; do not index an unsupported layout.
        return ids;
    }
    if (has_channel_axis) {
        EXPECT_EQ(tensor.shape[2], 1);
    }

    if (tensor.dtype.code != kDLUInt && tensor.dtype.code != kDLInt) {
        ADD_FAILURE() << "SemanticSegmentation must use 32-bit integer pixels";
        return ids;
    }

    int64_t const height = tensor.shape[0];
    int64_t const width = tensor.shape[1];
    int64_t const channels = has_channel_axis ? tensor.shape[2] : 1;
    // Without explicit strides the tensor is compact row-major.
    int64_t const stride_y = tensor.strides ? tensor.strides[0] : width * channels;
    int64_t const stride_x = tensor.strides ? tensor.strides[1] : channels;

    uint8_t const* base = static_cast<uint8_t const*>(tensor.data) + tensor.byte_offset;

    // Signed pixels are read through the corresponding unsigned type, which
    // yields the same value as converting each int32_t to uint32_t.
    uint32_t const* values = reinterpret_cast<uint32_t const*>(base);
    for (int64_t y = 0; y < height; ++y) {
        for (int64_t x = 0; x < width; ++x) {
            ids.insert(values[y * stride_y + x * stride_x]);
        }
    }
    return ids;
}

Notes#

Put labels on the prims whose pixels or hits should carry that semantic meaning.
Inline override layers are useful when a source asset cannot be edited.
For sensor material behavior, use Non-Visual Materials instead. Semantic labels and non-visual material labels serve different systems.