xinjie.wang committed on
Commit
c948747
1 Parent(s): 7f124e2
app.py CHANGED
@@ -17,8 +17,6 @@
 
 import os
 
-os.environ["GRADIO_APP"] = "textto3d"
-
 # GRADIO_APP == "textto3d_sam3d", sam3d object model, by default.
 # GRADIO_APP == "textto3d", TRELLIS model.
 os.environ["GRADIO_APP"] = "textto3d_sam3d"
@@ -35,19 +33,16 @@ from common import (
     get_cached_image,
     get_seed,
     get_selected_image,
+    image_to_3d,
     start_session,
     text2image_fn,
 )
 
 app_name = os.getenv("GRADIO_APP")
 if app_name == "textto3d_sam3d":
-    from common import image_to_3d_sam3d as image_to_3d
-
     enable_pre_resize = False
     sample_step = 25
 elif app_name == "textto3d":
-    from common import image_to_3d
-
     enable_pre_resize = True
     sample_step = 12
 
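Note: per the comments kept in app.py, the backend is chosen by the hard-coded GRADIO_APP value rather than an external environment variable. A minimal sketch (assuming you edit app.py directly) of selecting the TRELLIS backend instead of the default SAM3D one:

# In app.py, keep exactly one assignment before importing from common:
# os.environ["GRADIO_APP"] = "textto3d"        # TRELLIS model
os.environ["GRADIO_APP"] = "textto3d_sam3d"    # SAM3D object model (default)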
common.py CHANGED
@@ -14,6 +14,10 @@
 # implied. See the License for the specific language governing
 # permissions and limitations under the License.
 
+from embodied_gen.utils.monkey_patches import monkey_path_trellis
+
+monkey_path_trellis()
+
 import gc
 import logging
 import os
@@ -27,9 +31,7 @@ import gradio as gr
 import numpy as np
 import spaces
 import torch
-import torch.nn.functional as F
 import trimesh
-from easydict import EasyDict as edict
 from PIL import Image
 from embodied_gen.data.backproject_v2 import entrypoint as backproject_api
 from embodied_gen.data.backproject_v3 import entrypoint as backproject_api_v3
@@ -37,6 +39,7 @@ from embodied_gen.data.differentiable_render import entrypoint as render_api
 from embodied_gen.data.utils import trellis_preprocess, zip_files
 from embodied_gen.models.delight_model import DelightingModel
 from embodied_gen.models.gs_model import GaussianOperator
+from embodied_gen.models.sam3d import Sam3dInference
 from embodied_gen.models.segment_model import (
     BMGG14Remover,
     RembgRemover,
@@ -57,7 +60,7 @@ from embodied_gen.utils.process_media import (
     merge_images_video,
 )
 from embodied_gen.utils.tags import VERSION
-from embodied_gen.utils.trender import render_video
+from embodied_gen.utils.trender import pack_state, render_video, unpack_state
 from embodied_gen.validators.quality_checkers import (
     BaseChecker,
     ImageAestheticChecker,
@@ -70,15 +73,6 @@ current_file_path = os.path.abspath(__file__)
 current_dir = os.path.dirname(current_file_path)
 sys.path.append(os.path.join(current_dir, ".."))
 from thirdparty.TRELLIS.trellis.pipelines import TrellisImageTo3DPipeline
-from thirdparty.TRELLIS.trellis.representations import (
-    Gaussian,
-    MeshExtractResult,
-)
-from thirdparty.TRELLIS.trellis.representations.gaussian.general_utils import (
-    build_scaling_rotation,
-    inverse_sigmoid,
-    strip_symmetric,
-)
 from thirdparty.TRELLIS.trellis.utils import postprocessing_utils
 
 logging.basicConfig(
@@ -86,79 +80,24 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
 
-
-os.environ["TORCH_EXTENSIONS_DIR"] = os.path.expanduser(
-    "~/.cache/torch_extensions"
-)
 os.environ["GRADIO_ANALYTICS_ENABLED"] = "false"
-os.environ["SPCONV_ALGO"] = "native"
 
 MAX_SEED = 100000
 
-
-def patched_setup_functions(self):
-    def inverse_softplus(x):
-        return x + torch.log(-torch.expm1(-x))
-
-    def build_covariance_from_scaling_rotation(
-        scaling, scaling_modifier, rotation
-    ):
-        L = build_scaling_rotation(scaling_modifier * scaling, rotation)
-        actual_covariance = L @ L.transpose(1, 2)
-        symm = strip_symmetric(actual_covariance)
-        return symm
-
-    if self.scaling_activation_type == "exp":
-        self.scaling_activation = torch.exp
-        self.inverse_scaling_activation = torch.log
-    elif self.scaling_activation_type == "softplus":
-        self.scaling_activation = F.softplus
-        self.inverse_scaling_activation = inverse_softplus
-
-    self.covariance_activation = build_covariance_from_scaling_rotation
-    self.opacity_activation = torch.sigmoid
-    self.inverse_opacity_activation = inverse_sigmoid
-    self.rotation_activation = F.normalize
-
-    self.scale_bias = self.inverse_scaling_activation(
-        torch.tensor(self.scaling_bias)
-    ).to(self.device)
-    self.rots_bias = torch.zeros((4)).to(self.device)
-    self.rots_bias[0] = 1
-    self.opacity_bias = self.inverse_opacity_activation(
-        torch.tensor(self.opacity_bias)
-    ).to(self.device)
-
-
-Gaussian.setup_functions = patched_setup_functions
-
-
 # DELIGHT = DelightingModel()
 # IMAGESR_MODEL = ImageRealESRGAN(outscale=4)
 # IMAGESR_MODEL = ImageStableSR()
-if os.getenv("GRADIO_APP") == "imageto3d":
-    RBG_REMOVER = RembgRemover()
-    RBG14_REMOVER = BMGG14Remover()
-    SAM_PREDICTOR = SAMPredictor(model_type="vit_h", device="cpu")
-    PIPELINE = TrellisImageTo3DPipeline.from_pretrained(
-        "microsoft/TRELLIS-image-large"
-    )
-    # PIPELINE.cuda()
-    SEG_CHECKER = ImageSegChecker(GPT_CLIENT)
-    GEO_CHECKER = MeshGeoChecker(GPT_CLIENT)
-    AESTHETIC_CHECKER = ImageAestheticChecker()
-    CHECKERS = [GEO_CHECKER, SEG_CHECKER, AESTHETIC_CHECKER]
-    TMP_DIR = os.path.join(
-        os.path.dirname(os.path.abspath(__file__)), "sessions/imageto3d"
-    )
-    os.makedirs(TMP_DIR, exist_ok=True)
-elif os.getenv("GRADIO_APP") == "imageto3d_sam3d":
-    from embodied_gen.models.sam3d import Sam3dInference
-
+if os.getenv("GRADIO_APP").startswith("imageto3d"):
     RBG_REMOVER = RembgRemover()
     RBG14_REMOVER = BMGG14Remover()
     SAM_PREDICTOR = SAMPredictor(model_type="vit_h", device="cpu")
-    PIPELINE = Sam3dInference()
+    if "sam3d" in os.getenv("GRADIO_APP"):
+        PIPELINE = Sam3dInference()
+    else:
+        PIPELINE = TrellisImageTo3DPipeline.from_pretrained(
+            "microsoft/TRELLIS-image-large"
+        )
+        # PIPELINE.cuda()
     SEG_CHECKER = ImageSegChecker(GPT_CLIENT)
     GEO_CHECKER = MeshGeoChecker(GPT_CLIENT)
     AESTHETIC_CHECKER = ImageAestheticChecker()
@@ -167,30 +106,16 @@ elif os.getenv("GRADIO_APP") == "imageto3d_sam3d":
         os.path.dirname(os.path.abspath(__file__)), "sessions/imageto3d"
     )
     os.makedirs(TMP_DIR, exist_ok=True)
-elif os.getenv("GRADIO_APP") == "textto3d":
+elif os.getenv("GRADIO_APP").startswith("textto3d"):
     RBG_REMOVER = RembgRemover()
     RBG14_REMOVER = BMGG14Remover()
-    PIPELINE = TrellisImageTo3DPipeline.from_pretrained(
-        "microsoft/TRELLIS-image-large"
-    )
-    # PIPELINE.cuda()
-    text_model_dir = "weights/Kolors"
-    PIPELINE_IMG_IP = build_text2img_ip_pipeline(text_model_dir, ref_scale=0.3)
-    PIPELINE_IMG = build_text2img_pipeline(text_model_dir)
-    SEG_CHECKER = ImageSegChecker(GPT_CLIENT)
-    GEO_CHECKER = MeshGeoChecker(GPT_CLIENT)
-    AESTHETIC_CHECKER = ImageAestheticChecker()
-    CHECKERS = [GEO_CHECKER, SEG_CHECKER, AESTHETIC_CHECKER]
-    TMP_DIR = os.path.join(
-        os.path.dirname(os.path.abspath(__file__)), "sessions/textto3d"
-    )
-    os.makedirs(TMP_DIR, exist_ok=True)
-elif os.getenv("GRADIO_APP") == "textto3d_sam3d":
-    from embodied_gen.models.sam3d import Sam3dInference
-
-    RBG_REMOVER = RembgRemover()
-    RBG14_REMOVER = BMGG14Remover()
-    PIPELINE = Sam3dInference()
+    if "sam3d" in os.getenv("GRADIO_APP"):
+        PIPELINE = Sam3dInference()
+    else:
+        PIPELINE = TrellisImageTo3DPipeline.from_pretrained(
+            "microsoft/TRELLIS-image-large"
+        )
+        # PIPELINE.cuda()
     text_model_dir = "weights/Kolors"
     PIPELINE_IMG_IP = build_text2img_ip_pipeline(text_model_dir, ref_scale=0.3)
     PIPELINE_IMG = build_text2img_pipeline(text_model_dir)
@@ -302,50 +227,6 @@ def get_cached_image(image_path: str) -> Image.Image:
     return Image.open(image_path).resize((512, 512))
 
 
-@spaces.GPU
-def pack_state(gs: Gaussian, mesh: MeshExtractResult) -> dict:
-    return {
-        "gaussian": {
-            **gs.init_params,
-            "_xyz": gs._xyz.cpu().numpy(),
-            "_features_dc": gs._features_dc.cpu().numpy(),
-            "_scaling": gs._scaling.cpu().numpy(),
-            "_rotation": gs._rotation.cpu().numpy(),
-            "_opacity": gs._opacity.cpu().numpy(),
-        },
-        "mesh": {
-            "vertices": mesh.vertices.cpu().numpy(),
-            "faces": mesh.faces.cpu().numpy(),
-        },
-    }
-
-
-def unpack_state(state: dict, device: str = "cpu") -> tuple[Gaussian, dict]:
-    gs = Gaussian(
-        aabb=state["gaussian"]["aabb"],
-        sh_degree=state["gaussian"]["sh_degree"],
-        mininum_kernel_size=state["gaussian"]["mininum_kernel_size"],
-        scaling_bias=state["gaussian"]["scaling_bias"],
-        opacity_bias=state["gaussian"]["opacity_bias"],
-        scaling_activation=state["gaussian"]["scaling_activation"],
-        device=device,
-    )
-    gs._xyz = torch.tensor(state["gaussian"]["_xyz"], device=device)
-    gs._features_dc = torch.tensor(
-        state["gaussian"]["_features_dc"], device=device
-    )
-    gs._scaling = torch.tensor(state["gaussian"]["_scaling"], device=device)
-    gs._rotation = torch.tensor(state["gaussian"]["_rotation"], device=device)
-    gs._opacity = torch.tensor(state["gaussian"]["_opacity"], device=device)
-
-    mesh = edict(
-        vertices=torch.tensor(state["mesh"]["vertices"], device=device),
-        faces=torch.tensor(state["mesh"]["faces"], device=device),
-    )
-
-    return gs, mesh
-
-
 def get_seed(randomize_seed: bool, seed: int, max_seed: int = MAX_SEED) -> int:
     return np.random.randint(0, max_seed) if randomize_seed else seed
 
@@ -399,87 +280,48 @@ def image_to_3d(
     if is_sam_image:
         seg_image = filter_image_small_connected_components(sam_image)
         seg_image = Image.fromarray(seg_image, mode="RGBA")
-        seg_image = trellis_preprocess(seg_image)
     else:
        seg_image = image
 
     if isinstance(seg_image, np.ndarray):
         seg_image = Image.fromarray(seg_image)
 
-    output_root = os.path.join(TMP_DIR, str(req.session_hash))
-    os.makedirs(output_root, exist_ok=True)
-    seg_image.save(f"{output_root}/seg_image.png")
-    raw_image_cache.save(f"{output_root}/raw_image.png")
-    PIPELINE.cuda()
-    outputs = PIPELINE.run(
-        seg_image,
-        seed=seed,
-        formats=["gaussian", "mesh"],
-        preprocess_image=False,
-        sparse_structure_sampler_params={
-            "steps": ss_sampling_steps,
-            "cfg_strength": ss_guidance_strength,
-        },
-        slat_sampler_params={
-            "steps": slat_sampling_steps,
-            "cfg_strength": slat_guidance_strength,
-        },
-    )
-    # Set to cpu for memory saving.
-    PIPELINE.cpu()
+    if isinstance(PIPELINE, Sam3dInference):
+        outputs = PIPELINE.run(
+            seg_image,
+            seed=seed,
+            stage1_inference_steps=ss_sampling_steps,
+            stage2_inference_steps=slat_sampling_steps,
+        )
+    else:
+        PIPELINE.cuda()
+        seg_image = trellis_preprocess(seg_image)
+        outputs = PIPELINE.run(
+            seg_image,
+            seed=seed,
+            formats=["gaussian", "mesh"],
+            preprocess_image=False,
+            sparse_structure_sampler_params={
+                "steps": ss_sampling_steps,
+                "cfg_strength": ss_guidance_strength,
+            },
+            slat_sampler_params={
+                "steps": slat_sampling_steps,
+                "cfg_strength": slat_guidance_strength,
+            },
+        )
+        # Set back to cpu for memory saving.
+        PIPELINE.cpu()
 
     gs_model = outputs["gaussian"][0]
     mesh_model = outputs["mesh"][0]
     color_images = render_video(gs_model, r=1.85)["color"]
     normal_images = render_video(mesh_model, r=1.85)["normal"]
 
-    video_path = os.path.join(output_root, "gs_mesh.mp4")
-    merge_images_video(color_images, normal_images, video_path)
-    state = pack_state(gs_model, mesh_model)
-
-    gc.collect()
-    torch.cuda.empty_cache()
-
-    return state, video_path
-
-
-@spaces.GPU
-def image_to_3d_sam3d(
-    image: Image.Image,
-    seed: int,
-    ss_sampling_steps: int,
-    slat_sampling_steps: int,
-    raw_image_cache: Image.Image,
-    ss_guidance_strength: float = None,
-    slat_guidance_strength: float = None,
-    sam_image: Image.Image = None,
-    is_sam_image: bool = False,
-    req: gr.Request = None,
-) -> tuple[dict, str]:
-    if is_sam_image:
-        seg_image = filter_image_small_connected_components(sam_image)
-        seg_image = Image.fromarray(seg_image, mode="RGBA")
-    else:
-        seg_image = image
-
-    if isinstance(seg_image, np.ndarray):
-        seg_image = Image.fromarray(seg_image)
-
     output_root = os.path.join(TMP_DIR, str(req.session_hash))
     os.makedirs(output_root, exist_ok=True)
     seg_image.save(f"{output_root}/seg_image.png")
     raw_image_cache.save(f"{output_root}/raw_image.png")
-    outputs = PIPELINE.run(
-        seg_image,
-        seed=seed,
-        stage1_inference_steps=ss_sampling_steps,
-        stage2_inference_steps=slat_sampling_steps,
-    )
-
-    gs_model = outputs["gaussian"][0]
-    mesh_model = outputs["mesh"][0]
-    color_images = render_video(gs_model, r=1.85)["color"]
-    normal_images = render_video(mesh_model, r=1.85)["normal"]
 
     video_path = os.path.join(output_root, "gs_mesh.mp4")
     merge_images_video(color_images, normal_images, video_path)
@@ -491,56 +333,13 @@ def image_to_3d_sam3d(
     return state, video_path
 
 
-@spaces.GPU
-def extract_3d_representations(
-    state: dict, enable_delight: bool, texture_size: int, req: gr.Request
-):
-    output_root = TMP_DIR
-    output_root = os.path.join(output_root, str(req.session_hash))
-    gs_model, mesh_model = unpack_state(state, device="cuda")
-
-    mesh = postprocessing_utils.to_glb(
-        gs_model,
-        mesh_model,
-        simplify=0.9,
-        texture_size=1024,
-        verbose=True,
-    )
-    filename = "sample"
-    gs_path = os.path.join(output_root, f"{filename}_gs.ply")
-    gs_model.save_ply(gs_path)
-
-    # Rotate mesh and GS by 90 degrees around Z-axis.
-    rot_matrix = [[0, 0, -1], [0, 1, 0], [1, 0, 0]]
-    # Addtional rotation for GS to align mesh.
-    gs_rot = np.array([[1, 0, 0], [0, -1, 0], [0, 0, -1]]) @ np.array(
-        rot_matrix
-    )
-    pose = GaussianOperator.trans_to_quatpose(gs_rot)
-    aligned_gs_path = gs_path.replace(".ply", "_aligned.ply")
-    GaussianOperator.resave_ply(
-        in_ply=gs_path,
-        out_ply=aligned_gs_path,
-        instance_pose=pose,
-    )
-
-    mesh.vertices = mesh.vertices @ np.array(rot_matrix)
-    mesh_obj_path = os.path.join(output_root, f"{filename}.obj")
-    mesh.export(mesh_obj_path)
-    mesh_glb_path = os.path.join(output_root, f"{filename}.glb")
-    mesh.export(mesh_glb_path)
-
-    torch.cuda.empty_cache()
-
-    return mesh_glb_path, gs_path, mesh_obj_path, aligned_gs_path
-
-
 def extract_3d_representations_v2(
     state: dict,
     enable_delight: bool,
     texture_size: int,
     req: gr.Request,
 ):
+    """Back-Projection Version of Texture Super-Resolution."""
     output_root = TMP_DIR
     user_dir = os.path.join(output_root, str(req.session_hash))
     gs_model, mesh_model = unpack_state(state, device="cpu")
@@ -607,6 +406,7 @@ def extract_3d_representations_v3(
     texture_size: int,
     req: gr.Request,
 ):
+    """Back-Projection Version with Optimization-Based."""
     output_root = TMP_DIR
     user_dir = os.path.join(output_root, str(req.session_hash))
     gs_model, mesh_model = unpack_state(state, device="cpu")
embodied_gen/data/asset_converter.py CHANGED
@@ -1,3 +1,20 @@
+# Project EmbodiedGen
+#
+# Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied. See the License for the specific language governing
+# permissions and limitations under the License.
+
+
 from __future__ import annotations
 
 import logging
embodied_gen/models/sam3d.py CHANGED
@@ -19,7 +19,6 @@ from embodied_gen.utils.monkey_patches import monkey_patch_sam3d
 monkey_patch_sam3d()
 import os
 import sys
-from typing import Optional, Union
 
 import numpy as np
 from hydra.utils import instantiate
@@ -31,29 +30,38 @@ from PIL import Image
 current_file_path = os.path.abspath(__file__)
 current_dir = os.path.dirname(current_file_path)
 sys.path.append(os.path.join(current_dir, "../.."))
+from loguru import logger
 from thirdparty.sam3d.sam3d_objects.pipeline.inference_pipeline_pointmap import (
     InferencePipelinePointMap,
 )
 
+logger.remove()
+logger.add(lambda _: None, level="ERROR")
+
+
 __all__ = ["Sam3dInference"]
 
 
-def load_image(path: str) -> np.ndarray:
-    image = Image.open(path)
-    image = np.array(image)
-    image = image.astype(np.uint8)
-    return image
-
-
-def load_mask(path: str) -> np.ndarray:
-    mask = load_image(path)
-    mask = mask > 0
-    if mask.ndim == 3:
-        mask = mask[..., -1]
-    return mask
-
-class Sam3dInference:
+class Sam3dInference:
+    """Wrapper for the SAM-3D-Objects inference pipeline.
+
+    This class handles loading the SAM-3D-Objects model, configuring it for inference,
+    and running the pipeline on input images (optionally with masks and pointmaps).
+    It supports distillation options and inference step customization.
+
+    Args:
+        local_dir (str): Directory to store or load model weights and configs.
+        compile (bool): Whether to compile the model for faster inference.
+
+    Methods:
+        merge_mask_to_rgba(image, mask):
+            Merges a binary mask into the alpha channel of an RGB image.
+
+        run(image, mask=None, seed=None, pointmap=None, use_stage1_distillation=False,
+            use_stage2_distillation=False, stage1_inference_steps=25, stage2_inference_steps=25):
+            Runs the inference pipeline and returns the output dictionary.
+    """
+
     def __init__(
         self, local_dir: str = "weights/sam-3d-objects", compile: bool = False
     ) -> None:
@@ -65,7 +73,7 @@ class Sam3dInference:
         config.rendering_engine = "nvdiffrast"
         config.compile_model = compile
         config.workspace_dir = os.path.dirname(config_file)
-        # Generate 4 gs in each pixel.
+        # Generate 4 instead of 32 gs in each pixel for efficient storage.
         config["slat_decoder_gs_config_path"] = config.pop(
             "slat_decoder_gs_4_config_path", "slat_decoder_gs_4.yaml"
         )
@@ -118,25 +126,22 @@ class Sam3dInference:
 if __name__ == "__main__":
     pipeline = Sam3dInference()
 
-    # load image
-    image = load_image(
-        "/home/users/xinjie.wang/xinjie/sam-3d-objects/notebook/images/shutterstock_stylish_kidsroom_1640806567/image.png"
-    )
-    mask = load_mask(
-        "/home/users/xinjie.wang/xinjie/sam-3d-objects/notebook/images/shutterstock_stylish_kidsroom_1640806567/13.png"
-    )
+    from time import time
 
     import torch
+    from embodied_gen.models.segment_model import RembgRemover
+
+    input_image = "apps/assets/example_image/sample_00.jpg"
+    output_gs = "outputs/splat.ply"
+    remover = RembgRemover()
+    clean_image = remover(input_image)
 
     if torch.cuda.is_available():
         torch.cuda.reset_peak_memory_stats()
         torch.cuda.empty_cache()
 
-    from time import time
-
     start = time()
-
-    output = pipeline.run(image, mask, seed=42)
+    output = pipeline.run(clean_image, seed=42)
     print(f"Running cost: {round(time()-start, 1)}")
 
     if torch.cuda.is_available():
@@ -145,5 +150,5 @@ if __name__ == "__main__":
 
     print(f"End: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
 
-    output["gs"].save_ply(f"outputs/splat.ply")
-    print("Your reconstruction has been saved to outputs/splat.ply")
+    output["gs"].save_ply(output_gs)
+    print(f"Saved to {output_gs}")
embodied_gen/scripts/gen_scene3d.py CHANGED
@@ -1,3 +1,20 @@
+# Project EmbodiedGen
+#
+# Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied. See the License for the specific language governing
+# permissions and limitations under the License.
+
+
 import logging
 import os
 import random
embodied_gen/scripts/gen_texture.py CHANGED
@@ -1,3 +1,20 @@
+# Project EmbodiedGen
+#
+# Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied. See the License for the specific language governing
+# permissions and limitations under the License.
+
+
 import os
 import shutil
 from dataclasses import dataclass
embodied_gen/scripts/imageto3d.py CHANGED
@@ -14,30 +14,30 @@
 # implied. See the License for the specific language governing
 # permissions and limitations under the License.
 
-
 import argparse
 import os
 import random
-import sys
 from glob import glob
 from shutil import copy, copytree, rmtree
 
 import numpy as np
-import torch
 import trimesh
 from PIL import Image
 from embodied_gen.data.backproject_v3 import entrypoint as backproject_api
-from embodied_gen.data.utils import delete_dir, trellis_preprocess
+from embodied_gen.data.utils import delete_dir
 
+# from embodied_gen.models.sr_model import ImageRealESRGAN
 # from embodied_gen.models.delight_model import DelightingModel
 from embodied_gen.models.gs_model import GaussianOperator
 from embodied_gen.models.segment_model import RembgRemover
-
-# from embodied_gen.models.sr_model import ImageRealESRGAN
 from embodied_gen.scripts.render_gs import entrypoint as render_gs_api
 from embodied_gen.utils.gpt_clients import GPT_CLIENT
+from embodied_gen.utils.inference import image3d_model_infer
 from embodied_gen.utils.log import logger
-from embodied_gen.utils.process_media import merge_images_video
+from embodied_gen.utils.process_media import (
+    combine_images_to_grid,
+    merge_images_video,
+)
 from embodied_gen.utils.tags import VERSION
 from embodied_gen.utils.trender import render_video
 from embodied_gen.validators.quality_checkers import (
@@ -48,26 +48,24 @@ from embodied_gen.validators.quality_checkers import (
 )
 from embodied_gen.validators.urdf_convertor import URDFGenerator
 
-current_file_path = os.path.abspath(__file__)
-current_dir = os.path.dirname(current_file_path)
-sys.path.append(os.path.join(current_dir, "../.."))
-from thirdparty.TRELLIS.trellis.pipelines import TrellisImageTo3DPipeline
+# random.seed(0)
+IMAGE3D_MODEL = "SAM3D"  # TRELLIS or SAM3D
+logger.info(f"Loading {IMAGE3D_MODEL} as Image3D Models...")
+if IMAGE3D_MODEL == "TRELLIS":
+    from thirdparty.TRELLIS.trellis.pipelines import TrellisImageTo3DPipeline
 
-os.environ["TORCH_EXTENSIONS_DIR"] = os.path.expanduser(
-    "~/.cache/torch_extensions"
-)
-os.environ["GRADIO_ANALYTICS_ENABLED"] = "false"
-os.environ["SPCONV_ALGO"] = "native"
-random.seed(0)
+    PIPELINE = TrellisImageTo3DPipeline.from_pretrained(
+        "microsoft/TRELLIS-image-large"
+    )
+    # PIPELINE.cuda()
+elif IMAGE3D_MODEL == "SAM3D":
+    from embodied_gen.models.sam3d import Sam3dInference
+
+    PIPELINE = Sam3dInference()
 
-logger.info("Loading Image3D Models...")
 # DELIGHT = DelightingModel()
 # IMAGESR_MODEL = ImageRealESRGAN(outscale=4)
 RBG_REMOVER = RembgRemover()
-PIPELINE = TrellisImageTo3DPipeline.from_pretrained(
-    "microsoft/TRELLIS-image-large"
-)
-# PIPELINE.cuda()
 SEG_CHECKER = ImageSegChecker(GPT_CLIENT)
 GEO_CHECKER = MeshGeoChecker(GPT_CLIENT)
 AESTHETIC_CHECKER = ImageAestheticChecker()
@@ -151,7 +149,6 @@ def entrypoint(**kwargs):
         # Segmentation: Get segmented image using Rembg.
         seg_path = f"{output_root}/{filename}_cond.png"
         seg_image = RBG_REMOVER(image) if image.mode != "RGBA" else image
-        seg_image = trellis_preprocess(seg_image)
         seg_image.save(seg_path)
 
         seed = args.seed
@@ -162,27 +159,8 @@ def entrypoint(**kwargs):
             logger.info(
                 f"Try: {try_idx + 1}/{args.n_retry}, Seed: {seed}, Prompt: {seg_path}"
            )
-            # Run the pipeline
             try:
-                PIPELINE.cuda()
-                outputs = PIPELINE.run(
-                    seg_image,
-                    preprocess_image=False,
-                    seed=(
-                        random.randint(0, 100000) if seed is None else seed
-                    ),
-                    # Optional parameters
-                    # sparse_structure_sampler_params={
-                    #     "steps": 12,
-                    #     "cfg_strength": 7.5,
-                    # },
-                    # slat_sampler_params={
-                    #     "steps": 12,
-                    #     "cfg_strength": 3,
-                    # },
-                )
-                PIPELINE.cpu()
-                torch.cuda.empty_cache()
+                outputs = image3d_model_infer(PIPELINE, seg_image, seed)
             except Exception as e:
                 logger.error(
                     f"[Pipeline Failed] process {image_path}: {e}, skip."
@@ -215,14 +193,13 @@ def entrypoint(**kwargs):
                 render_gs_api(
                     input_gs=aligned_gs_path,
                     output_path=color_path,
-                    elevation=[20, -10, 60, -50],
-                    num_images=12,
+                    elevation=[30, -30],
+                    num_images=4,
                 )
-
                 color_img = Image.open(color_path)
-                keep_height = int(color_img.height * 2 / 3)
-                crop_img = color_img.crop((0, 0, color_img.width, keep_height))
-                geo_flag, geo_result = GEO_CHECKER([crop_img], text=asset_node)
+                geo_flag, geo_result = GEO_CHECKER(
+                    [color_img], text=asset_node
+                )
                 logger.warning(
                     f"{GEO_CHECKER.__class__.__name__}: {geo_result} for {seg_path}"
                 )
@@ -232,8 +209,8 @@ def entrypoint(**kwargs):
             seed = random.randint(0, 100000) if seed is not None else None
 
         # Render the video for generated 3D asset.
-        color_images = render_video(gs_model)["color"]
-        normal_images = render_video(mesh_model)["normal"]
+        color_images = render_video(gs_model, r=1.85)["color"]
+        normal_images = render_video(mesh_model, r=1.85)["normal"]
         video_path = os.path.join(output_root, "gs_mesh.mp4")
         merge_images_video(color_images, normal_images, video_path)
 
@@ -312,7 +289,7 @@ def entrypoint(**kwargs):
         image_paths = glob(f"{image_dir}/*.png")
         images_list = []
         for checker in CHECKERS:
-            images = image_paths
+            images = combine_images_to_grid(image_paths)
             if isinstance(checker, ImageSegChecker):
                 images = [
                     f"{output_root}/{filename}_raw.png",
@@ -334,9 +311,12 @@ def entrypoint(**kwargs):
                 f"{result_dir}/{urdf_convertor.output_mesh_dir}",
            )
             copy(video_path, f"{result_dir}/video.mp4")
+
             if not args.keep_intermediate:
                 delete_dir(output_root, keep_subs=["result"])
 
+            logger.info(f"Saved results for {image_path} in {result_dir}")
+
         except Exception as e:
             logger.error(f"Failed to process {image_path}: {e}, skip.")
             continue
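Note: the batch script now selects its image-to-3D backend through the module-level IMAGE3D_MODEL constant instead of a separate code path. A minimal sketch (editing the constant at the top of embodied_gen/scripts/imageto3d.py) of switching back to TRELLIS:

IMAGE3D_MODEL = "TRELLIS"  # TRELLIS or SAM3D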
embodied_gen/scripts/render_gs.py CHANGED
@@ -27,7 +27,6 @@ from tqdm import tqdm
 from embodied_gen.data.utils import (
     CameraSetting,
     init_kal_camera,
-    normalize_vertices_array,
 )
 from embodied_gen.models.gs_model import load_gs_model
 from embodied_gen.utils.process_media import combine_images_to_grid
embodied_gen/scripts/textto3d.py CHANGED
@@ -30,6 +30,7 @@ from embodied_gen.utils.gpt_clients import GPT_CLIENT
 from embodied_gen.utils.log import logger
 from embodied_gen.utils.process_media import (
     check_object_edge_truncated,
+    combine_images_to_grid,
     render_asset3d,
 )
 from embodied_gen.validators.quality_checkers import (
@@ -51,7 +52,6 @@ BG_REMOVER = RembgRemover()
 
 
 __all__ = [
-    "text_to_image",
     "text_to_3d",
 ]
 
@@ -176,12 +176,12 @@ def text_to_3d(**kwargs) -> dict:
     image_path = render_asset3d(
         mesh_path,
         output_root=f"{node_save_dir}/result",
-        num_images=6,
+        num_images=4,
         elevation=(30, -30),
         output_subdir="renders",
         no_index_file=True,
     )
-
+    image_path = combine_images_to_grid(image_path)
     check_text = asset_type if asset_type is not None else prompt
     qa_flag, qa_result = TXTGEN_CHECKER(check_text, image_path)
     logger.warning(
embodied_gen/utils/gpt_clients.py CHANGED
@@ -21,13 +21,14 @@ import os
 from io import BytesIO
 from typing import Optional
 
+import openai
 import yaml
 from openai import AzureOpenAI, OpenAI  # pip install openai
 from PIL import Image
 from tenacity import (
     retry,
+    retry_if_not_exception_type,
     stop_after_attempt,
-    stop_after_delay,
     wait_random_exponential,
 )
 from embodied_gen.utils.process_media import combine_images_to_grid
@@ -106,8 +107,9 @@ class GPTclient:
         logger.info(f"Using GPT model: {self.model_name}.")
 
     @retry(
-        wait=wait_random_exponential(min=1, max=20),
-        stop=(stop_after_attempt(10) | stop_after_delay(30)),
+        retry=retry_if_not_exception_type(openai.BadRequestError),
+        wait=wait_random_exponential(min=1, max=10),
+        stop=stop_after_attempt(5),
     )
     def completion_with_backoff(self, **kwargs):
         """Performs a chat completion request with retry/backoff."""
@@ -246,3 +248,8 @@ GPT_CLIENT = GPTclient(
     model_name=model_name,
     check_connection=False,
 )
+
+
+if __name__ == "__main__":
+    response = GPT_CLIENT.query("What is the capital of China?")
+    print(f"Response: {response}")
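The new retry policy stops retrying on openai.BadRequestError (a non-transient client error) while keeping jittered exponential backoff for everything else. A minimal, self-contained sketch of the same tenacity pattern, with a hypothetical flaky_call() standing in for the chat-completion request:

import openai
from tenacity import (
    retry,
    retry_if_not_exception_type,
    stop_after_attempt,
    wait_random_exponential,
)


@retry(
    retry=retry_if_not_exception_type(openai.BadRequestError),
    wait=wait_random_exponential(min=1, max=10),
    stop=stop_after_attempt(5),
)
def flaky_call():
    # Transient failures (timeouts, rate limits) are retried up to 5 times
    # with 1-10 s jittered backoff; a BadRequestError propagates immediately.
    ...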
embodied_gen/utils/inference.py ADDED
@@ -0,0 +1,59 @@
+from embodied_gen.utils.monkey_patches import monkey_path_trellis
+
+monkey_path_trellis()
+import random
+
+import torch
+from PIL import Image
+from embodied_gen.data.utils import trellis_preprocess
+from embodied_gen.models.sam3d import Sam3dInference
+from embodied_gen.utils.trender import pack_state, unpack_state
+from thirdparty.TRELLIS.trellis.pipelines import TrellisImageTo3DPipeline
+
+__all__ = [
+    "image3d_model_infer",
+]
+
+
+def image3d_model_infer(
+    pipe: TrellisImageTo3DPipeline | Sam3dInference,
+    seg_image: Image.Image,
+    seed: int = None,
+    **kwargs: dict,
+) -> dict[str, any]:
+    if isinstance(pipe, TrellisImageTo3DPipeline):
+        pipe.cuda()
+        seg_image = trellis_preprocess(seg_image)
+        outputs = pipe.run(
+            seg_image,
+            preprocess_image=False,
+            seed=(random.randint(0, 100000) if seed is None else seed),
+            # Optional parameters
+            # sparse_structure_sampler_params={
+            #     "steps": 12,
+            #     "cfg_strength": 7.5,
+            # },
+            # slat_sampler_params={
+            #     "steps": 12,
+            #     "cfg_strength": 3,
+            # },
+            **kwargs,
+        )
+        pipe.cpu()
+    elif isinstance(pipe, Sam3dInference):
+        outputs = pipe.run(
+            seg_image,
+            seed=(random.randint(0, 100000) if seed is None else seed),
+            # stage1_inference_steps=25,
+            # stage2_inference_steps=25,
+            **kwargs,
+        )
+        state = pack_state(outputs["gaussian"][0], outputs["mesh"][0])
+        # Align GS3D from SAM3D with TRELLIS format.
+        outputs["gaussian"][0], _ = unpack_state(state, device="cuda")
+    else:
+        raise ValueError(f"Unsupported pipeline type: {type(pipe)}")
+
+    torch.cuda.empty_cache()
+
+    return outputs
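A minimal usage sketch of the new image3d_model_infer helper, assuming a background-removed RGBA input image (the file path is illustrative only):

from PIL import Image
from embodied_gen.models.sam3d import Sam3dInference
from embodied_gen.utils.inference import image3d_model_infer

pipeline = Sam3dInference()  # or TrellisImageTo3DPipeline.from_pretrained(...)
seg_image = Image.open("outputs/seg_image.png")  # illustrative path
outputs = image3d_model_infer(pipeline, seg_image, seed=42)
# Both backends return the same structure: one Gaussian splat and one mesh.
gs_model, mesh_model = outputs["gaussian"][0], outputs["mesh"][0]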
embodied_gen/utils/monkey_patches.py CHANGED
@@ -32,6 +32,67 @@ __all__ = [
 ]
 
 
+def monkey_path_trellis():
+    import torch.nn.functional as F
+
+    current_file_path = os.path.abspath(__file__)
+    current_dir = os.path.dirname(current_file_path)
+    sys.path.append(os.path.join(current_dir, "../.."))
+
+    from thirdparty.TRELLIS.trellis.representations import Gaussian
+    from thirdparty.TRELLIS.trellis.representations.gaussian.general_utils import (
+        build_scaling_rotation,
+        inverse_sigmoid,
+        strip_symmetric,
+    )
+
+    os.environ["TORCH_EXTENSIONS_DIR"] = os.path.expanduser(
+        "~/.cache/torch_extensions"
+    )
+    os.environ["SPCONV_ALGO"] = "auto"  # Can be 'native' or 'auto'
+    os.environ['ATTN_BACKEND'] = (
+        "xformers"  # Can be 'flash-attn' or 'xformers'
+    )
+    from thirdparty.TRELLIS.trellis.modules.sparse import set_attn
+
+    set_attn("xformers")
+
+    def patched_setup_functions(self):
+        def inverse_softplus(x):
+            return x + torch.log(-torch.expm1(-x))
+
+        def build_covariance_from_scaling_rotation(
+            scaling, scaling_modifier, rotation
+        ):
+            L = build_scaling_rotation(scaling_modifier * scaling, rotation)
+            actual_covariance = L @ L.transpose(1, 2)
+            symm = strip_symmetric(actual_covariance)
+            return symm
+
+        if self.scaling_activation_type == "exp":
+            self.scaling_activation = torch.exp
+            self.inverse_scaling_activation = torch.log
+        elif self.scaling_activation_type == "softplus":
+            self.scaling_activation = F.softplus
+            self.inverse_scaling_activation = inverse_softplus
+
+        self.covariance_activation = build_covariance_from_scaling_rotation
+        self.opacity_activation = torch.sigmoid
+        self.inverse_opacity_activation = inverse_sigmoid
+        self.rotation_activation = F.normalize
+
+        self.scale_bias = self.inverse_scaling_activation(
+            torch.tensor(self.scaling_bias)
+        ).to(self.device)
+        self.rots_bias = torch.zeros((4)).to(self.device)
+        self.rots_bias[0] = 1
+        self.opacity_bias = self.inverse_opacity_activation(
+            torch.tensor(self.opacity_bias)
+        ).to(self.device)
+
+    Gaussian.setup_functions = patched_setup_functions
+
+
 def monkey_patch_pano2room():
     current_file_path = os.path.abspath(__file__)
     current_dir = os.path.dirname(current_file_path)
@@ -240,8 +301,6 @@ def monkey_patch_sam3d():
     if sam3d_root not in sys.path:
         sys.path.insert(0, sam3d_root)
 
-    print(f"[MonkeyPatch] Added to sys.path: {sam3d_root}")
-
     def patch_pointmap_infer_pipeline():
         from copy import deepcopy
 
@@ -317,9 +376,6 @@ def monkey_patch_sam3d():
             )
         )
 
-        logger.info(
-            f"Rescaling scale by {ss_return_dict['downsample_factor']} after downsampling"
-        )
         ss_return_dict["scale"] = (
             ss_return_dict["scale"]
             * ss_return_dict["downsample_factor"]
@@ -471,11 +527,6 @@ def monkey_patch_sam3d():
         self.rendering_engine = rendering_engine
         self.device = torch.device(device)
         self.compile_model = compile_model
-        logger.info(f"self.device: {self.device}")
-        logger.info(
-            f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', None)}"
-        )
-        logger.info(f"Actually using GPU: {torch.cuda.current_device()}")
         with self.device:
             self.decode_formats = decode_formats
             self.pad_size = pad_size
@@ -511,7 +562,6 @@ def monkey_patch_sam3d():
         )
         self.slat_preprocessor = slat_preprocessor
 
-        logger.info("Loading model weights...")
         raw_device = self.device
         self.device = torch.device("cpu")
         ss_generator = self.init_ss_generator(
@@ -578,7 +628,7 @@ def monkey_patch_sam3d():
                 "slat_decoder_mesh": slat_decoder_mesh,
             }
         )
-        logger.info("Loading model weights completed!")
+        logger.info("Loading SAM3D model weights completed.")
 
         if self.compile_model:
             logger.info("Compiling model...")
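monkey_path_trellis() patches Gaussian.setup_functions and sets the TRELLIS environment variables (TORCH_EXTENSIONS_DIR, SPCONV_ALGO, ATTN_BACKEND), so it has to run before any TRELLIS module is imported, which is the ordering common.py and inference.py now follow. A minimal ordering sketch:

from embodied_gen.utils.monkey_patches import monkey_path_trellis

monkey_path_trellis()  # must precede any thirdparty.TRELLIS import

from thirdparty.TRELLIS.trellis.pipelines import TrellisImageTo3DPipeline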
embodied_gen/utils/process_media.py CHANGED
@@ -96,7 +96,7 @@ def render_asset3d(
         image_paths = render_asset3d(
             mesh_path="path_to_mesh.obj",
             output_root="path_to_save_dir",
-            num_images=6,
+            num_images=4,
             elevation=(30, -30),
             output_subdir="renders",
             no_index_file=True,
embodied_gen/utils/tags.py CHANGED
@@ -1 +1 @@
-VERSION = "v0.1.6"
+VERSION = "v0.1.7"
embodied_gen/utils/trender.py CHANGED
@@ -21,18 +21,25 @@ from collections import defaultdict
 import numpy as np
 import spaces
 import torch
+from easydict import EasyDict as edict
 from tqdm import tqdm
 
 current_file_path = os.path.abspath(__file__)
 current_dir = os.path.dirname(current_file_path)
 sys.path.append(os.path.join(current_dir, "../.."))
 from thirdparty.TRELLIS.trellis.renderers import GaussianRenderer, MeshRenderer
+from thirdparty.TRELLIS.trellis.representations import (
+    Gaussian,
+    MeshExtractResult,
+)
 from thirdparty.TRELLIS.trellis.utils.render_utils import (
     yaw_pitch_r_fov_to_extrinsics_intrinsics,
 )
 
 __all__ = [
     "render_video",
+    "pack_state",
+    "unpack_state",
 ]
 
 
@@ -140,3 +147,47 @@ def render_video(
     )
 
     return result
+
+
+@spaces.GPU
+def pack_state(gs: Gaussian, mesh: MeshExtractResult) -> dict:
+    return {
+        "gaussian": {
+            **gs.init_params,
+            "_xyz": gs._xyz.cpu().numpy(),
+            "_features_dc": gs._features_dc.cpu().numpy(),
+            "_scaling": gs._scaling.cpu().numpy(),
+            "_rotation": gs._rotation.cpu().numpy(),
+            "_opacity": gs._opacity.cpu().numpy(),
+        },
+        "mesh": {
+            "vertices": mesh.vertices.cpu().numpy(),
+            "faces": mesh.faces.cpu().numpy(),
+        },
+    }
+
+
+def unpack_state(state: dict, device: str = "cpu") -> tuple[Gaussian, dict]:
+    gs = Gaussian(
+        aabb=state["gaussian"]["aabb"],
+        sh_degree=state["gaussian"]["sh_degree"],
+        mininum_kernel_size=state["gaussian"]["mininum_kernel_size"],
+        scaling_bias=state["gaussian"]["scaling_bias"],
+        opacity_bias=state["gaussian"]["opacity_bias"],
+        scaling_activation=state["gaussian"]["scaling_activation"],
+        device=device,
+    )
+    gs._xyz = torch.tensor(state["gaussian"]["_xyz"], device=device)
+    gs._features_dc = torch.tensor(
+        state["gaussian"]["_features_dc"], device=device
+    )
+    gs._scaling = torch.tensor(state["gaussian"]["_scaling"], device=device)
+    gs._rotation = torch.tensor(state["gaussian"]["_rotation"], device=device)
+    gs._opacity = torch.tensor(state["gaussian"]["_opacity"], device=device)
+
+    mesh = edict(
+        vertices=torch.tensor(state["mesh"]["vertices"], device=device),
+        faces=torch.tensor(state["mesh"]["faces"], device=device),
+    )
+
+    return gs, mesh
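pack_state/unpack_state, now exported from embodied_gen.utils.trender, round-trip the Gaussian and mesh outputs through plain numpy arrays so they can be stored in Gradio session state. A minimal round-trip sketch, assuming outputs produced by one of the pipelines above:

from embodied_gen.utils.trender import pack_state, unpack_state

state = pack_state(outputs["gaussian"][0], outputs["mesh"][0])  # tensors -> numpy dict
gs_model, mesh_model = unpack_state(state, device="cpu")        # numpy dict -> tensors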
embodied_gen/validators/aesthetic_predictor.py CHANGED
@@ -125,7 +125,11 @@ class AestheticPredictor:
         Returns:
             float: Predicted aesthetic score.
         """
-        pil_image = Image.open(image_path)
+        if isinstance(image_path, str):
+            pil_image = Image.open(image_path)
+        else:
+            pil_image = image_path
+
         image = self.preprocess(pil_image).unsqueeze(0).to(self.device)
 
         with torch.no_grad():
embodied_gen/validators/quality_checkers.py CHANGED
@@ -126,6 +126,30 @@ class MeshGeoChecker(BaseChecker):
         super().__init__(prompt, verbose)
         self.gpt_client = gpt_client
         if self.prompt is None:
+            # Old version for TRELLIS.
+            # self.prompt = """
+            # You are an expert in evaluating the geometry quality of generated 3D asset.
+            # You will be given rendered views of a generated 3D asset, type {}, with black background.
+            # Your task is to evaluate the quality of the 3D asset generation,
+            # including geometry, structure, and appearance, based on the rendered views.
+            # Criteria:
+            # - Is the object in the image a single, complete, and well-formed instance,
+            # without truncation, missing parts, overlapping duplicates, or redundant geometry?
+            # - Minor flaws, asymmetries, or simplifications (e.g., less detail on sides or back,
+            # soft edges) are acceptable if the object is structurally sound and recognizable.
+            # - Only evaluate geometry. Do not assess texture quality.
+            # - The asset should not contain any unrelated elements, such as
+            # ground planes, platforms, or background props (e.g., paper, flooring).
+
+            # If all the above criteria are met, return "YES". Otherwise, return
+            # "NO" followed by a brief explanation (no more than 20 words).
+
+            # Example:
+            # Images show a yellow cup standing on a flat white plane -> NO
+            # -> Response: NO: extra white surface under the object.
+            # Image shows a chair with simplified back legs and soft edges -> YES
+            # """
+
             self.prompt = """
             You are an expert in evaluating the geometry quality of generated 3D asset.
             You will be given rendered views of a generated 3D asset, type {}, with black background.
@@ -137,16 +161,13 @@ class MeshGeoChecker(BaseChecker):
             - Minor flaws, asymmetries, or simplifications (e.g., less detail on sides or back,
             soft edges) are acceptable if the object is structurally sound and recognizable.
             - Only evaluate geometry. Do not assess texture quality.
-            - The asset should not contain any unrelated elements, such as
-            ground planes, platforms, or background props (e.g., paper, flooring).
 
-            If all the above criteria are met, return "YES". Otherwise, return
+            If all the above criteria are met, return "YES" only. Otherwise, return
             "NO" followed by a brief explanation (no more than 20 words).
 
             Example:
-            Images show a yellow cup standing on a flat white plane -> NO
-            -> Response: NO: extra white surface under the object.
-            Image shows a chair with simplified back legs and soft edges -> YES
+            Image shows a chair with one leg missing -> NO: the chair missing leg.
+            Image shows a geometrically complete cup -> YES
             """
 
     def query(
embodied_gen/validators/urdf_convertor.py CHANGED
@@ -27,7 +27,10 @@ import trimesh
 from scipy.spatial.transform import Rotation
 from embodied_gen.data.convex_decomposer import decompose_convex_mesh
 from embodied_gen.utils.gpt_clients import GPT_CLIENT, GPTclient
-from embodied_gen.utils.process_media import render_asset3d
+from embodied_gen.utils.process_media import (
+    combine_images_to_grid,
+    render_asset3d,
+)
 from embodied_gen.utils.tags import VERSION
 
 logging.basicConfig(level=logging.INFO)
@@ -482,7 +485,7 @@ class URDFGenerator(object):
             output_subdir=self.output_render_dir,
             no_index_file=True,
         )
-
+        # image_path = combine_images_to_grid(image_path)
         response = self.gpt_client.query(text_prompt, image_path)
         # logger.info(response)
         if response is None: