robotics-diffusion-transformer
/

rdt-1b

Inference Endpoints

Model card Files Files and versions Community

robotics-diffusion-transformer committed on Aug 28

Commit

5aa1106

•

1 Parent(s): f4fb9ee

Update README.md

Files changed (1) hide show

README.md +6 -6

README.md CHANGED Viewed

@@ -36,23 +36,23 @@ Here's an example of how to use the RDT-1B model for inference on a Mobile-ALOHA
 from scripts.agilex_model import create_model
 CAMERA_NAMES = ['cam_high', 'cam_right_wrist', 'cam_left_wrist'] # Names of cameras used for visual input
 config = {
-    'episode_len': 1000,  # Length of one episode
     'state_dim': 14,      # Dimension of the robot's state
     'chunk_size': 64,     # Number of actions to predict in one step
     'camera_names': CAMERA_NAMES,
 }
-ctrl_freq=25 # Set the control frequency (Hz)
-pretrained_vision_encoder_name_or_path = "google/siglip-so400m-patch14-384" # The pre-trained vision encoder model
 # Create the model with specified configuration
 model = create_model(
     args=config,
-    dtype=torch.bfloat16,  # Use bfloat16 for improved performance
     pretrained_vision_encoder_name_or_path=pretrained_vision_encoder_name_or_path,
-    control_frequency=ctrl_freq,
 )
 # Start inference process
 lang_embeddings_path = 'your/language/embedding/path'
-text_embedding = torch.load(lang_embeddings_path)['embeddings'] # Load pre-computed language embeddings
 images: List(PIL.Image) = ... #  The images from last 2 frame
 proprio = ... # The current robot state
 # Perform inference to predict the next chunk_size actions

 from scripts.agilex_model import create_model
 CAMERA_NAMES = ['cam_high', 'cam_right_wrist', 'cam_left_wrist'] # Names of cameras used for visual input
 config = {
+    'episode_len': 1000,  # Max length of one episode
     'state_dim': 14,      # Dimension of the robot's state
     'chunk_size': 64,     # Number of actions to predict in one step
     'camera_names': CAMERA_NAMES,
 }
+control_frequency=25
+pretrained_vision_encoder_name_or_path = "google/siglip-so400m-patch14-384"
 # Create the model with specified configuration
 model = create_model(
     args=config,
+    dtype=torch.bfloat16,
     pretrained_vision_encoder_name_or_path=pretrained_vision_encoder_name_or_path,
+    control_frequency=control_frequency,
 )
 # Start inference process
 lang_embeddings_path = 'your/language/embedding/path'
+text_embedding = torch.load(lang_embeddings_path)['embeddings']  # Pre-computed language embeddings
 images: List(PIL.Image) = ... #  The images from last 2 frame
 proprio = ... # The current robot state
 # Perform inference to predict the next chunk_size actions