代码可以跑起来了

2026-02-19 15:32:28 +08:00
parent b701d939c2
commit 88d14221ae
11 changed files with 503 additions and 89 deletions
--- a/ENDOSCOPE_ACT_ADAPTATION_PLAN.md
+++ b/ENDOSCOPE_ACT_ADAPTATION_PLAN.md
@@ -268,10 +268,30 @@
 ## 8. 你接下来只需提供的最小信息（进入代码改造前）
-1. 2 个电机各自的物理含义与取值范围（单位、上下限）。
+1. 2 个电机各自的物理含义与取值范围（单位、上下限）：电机分别为 motor_x 和 motor_y，x 的范围为 7000-17384，y 的范围为 8000-18884。对应的 action_x 和 action_y 都为 0~65535 之间
-2. 你当前数据中 `qpos` 和 `action` 的实际定义（是否相同）。
+2. 你当前数据中 `qpos` 和 `action` 的实际定义（是否相同）：action 和 qpos 定义接近，只不过是将 0~65535 分别映射到电机磁编码器数值上。
-3. text instruction 是每个 episode 一条，还是每个 timestep 一条。
+3. text instruction 是每个 episode 一条，还是每个 timestep 一条：text instruction 是每个 timestep 一条。
-4. 相机数量、分辨率、帧率。
+4. 相机数量、分辨率、帧率：相机数量为 1，分辨率为 224*224，帧率为 30Hz；对应的电机控制频率也为 30Hz
-5. 是否在训练时冻结 DistilBERT（`freeze_text_encoder=True/False`）。
+5. 是否在训练时冻结 DistilBERT（`freeze_text_encoder=True/False`）：DistilBERT 完全冻结。构建训练集时，先将每一个 frame 的 text instruction 用 DistilBERT 编码以后再保存。这样训练过程中不需要调用 DistilBERT。
-> 有了这 5 项，即可进入下一步代码改造。
+> 有了这 5 项，即可进入下一步代码改造。
 text_input_ids、text_attention_mask 什么意思；'instruction_mode': 'episode-level'没有用到；
 ```python
 instruction = ''
 if self.use_text_instruction:
    if '/instruction_timestep' in root:
        instruction = self._decode_instruction(root['/instruction_timestep'][start_ts])
    elif '/instruction' in root:
        instruction_node = root['/instruction']
        if getattr(instruction_node, 'shape', ()) == ():
            instruction = self._decode_instruction(instruction_node[()])
        else:
            if len(instruction_node.shape) == 1 and instruction_node.shape[0] == episode_len:
                instruction = self._decode_instruction(instruction_node[start_ts])
            else:
                instruction = self._decode_instruction(instruction_node[0])
 ```
 为什么修改了 Transformer 的定义？这里是否会生效？
--- a/conda_env.yaml
+++ b/conda_env.yaml
@@ -21,3 +21,5 @@ dependencies:
  - packaging=23.0
  - h5py=3.8.0
  - ipython=8.12.0
  - pip:
    - transformers==4.38.2
--- a/constants.py
+++ b/constants.py
@@ -1,7 +1,7 @@
 import pathlib
 ### Task parameters
-DATA_DIR = '<put your data dir here>'
+DATA_DIR = str(pathlib.Path(__file__).parent.resolve() / 'data')
 SIM_TASK_CONFIGS = {
    'sim_transfer_cube_scripted':{
        'dataset_dir': DATA_DIR + '/sim_transfer_cube_scripted',
@@ -32,6 +32,43 @@ SIM_TASK_CONFIGS = {
    },
 }
 ENDOSCOPE_TASK_CONFIGS = {
    'endoscope_default': {
        'dataset_dir': DATA_DIR + '/endoscope_default',
        'num_episodes': 50,
        'episode_len': 400,
        'camera_names': ['top'],
        'state_dim': 2,
        'action_dim': 2,
        'use_text_instruction': True,
        'instruction_mode': 'timestep-level',
        'use_cached_text_features': True,
        'text_encoder_type': 'distilbert',
        'text_feature_dim': 768,
        'text_fusion_type': 'concat_transformer_input',
        'freeze_text_encoder': True,
        'text_max_length': 32,
        'text_tokenizer_name': 'distilbert-base-uncased',
    },
    'endoscope_follow': {
        'dataset_dir': DATA_DIR + '/follow',
        'num_episodes': 3,
        'episode_len': 400,
        'camera_names': ['top'],
        'state_dim': 2,
        'action_dim': 2,
        'use_text_instruction': True,
        'instruction_mode': 'timestep-level',
        'use_cached_text_features': True,
        'text_encoder_type': 'distilbert',
        'text_feature_dim': 768,
        'text_fusion_type': 'concat_transformer_input',
        'freeze_text_encoder': True,
        'text_max_length': 32,
        'text_tokenizer_name': 'distilbert-base-uncased',
    },
 }
 ### Simulation envs fixed constants
 DT = 0.02
 JOINT_NAMES = ["waist", "shoulder", "elbow", "forearm_roll", "wrist_angle", "wrist_rotate"]
--- a/detr/main.py
+++ b/detr/main.py
@@ -4,6 +4,7 @@ from pathlib import Path
 import numpy as np
 import torch
 from torch.optim.adamw import AdamW
 from .models import build_ACT_model, build_CNNMLP_model
 import IPython
@@ -30,6 +31,15 @@ def get_args_parser():
                        help="Type of positional embedding to use on top of the image features")
    parser.add_argument('--camera_names', default=[], type=list, # will be overridden
                        help="A list of camera names")
    parser.add_argument('--state_dim', default=14, type=int)
    parser.add_argument('--action_dim', default=14, type=int)
    parser.add_argument('--use_text', action='store_true')
    parser.add_argument('--text_encoder_type', default='distilbert', type=str)
    parser.add_argument('--text_feature_dim', default=768, type=int)
    parser.add_argument('--text_fusion_type', default='concat_transformer_input', type=str)
    parser.add_argument('--freeze_text_encoder', action='store_true')
    parser.add_argument('--text_max_length', default=32, type=int)
    parser.add_argument('--text_tokenizer_name', default='distilbert-base-uncased', type=str)
    # * Transformer
    parser.add_argument('--enc_layers', default=4, type=int, # will be overridden
@@ -84,8 +94,8 @@ def build_ACT_model_and_optimizer(args_override):
            "lr": args.lr_backbone,
        },
    ]
-    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
+    optimizer = AdamW(param_dicts, lr=args.lr,
-                                  weight_decay=args.weight_decay)
+                      weight_decay=args.weight_decay)
    return model, optimizer
@@ -107,8 +117,8 @@ def build_CNNMLP_model_and_optimizer(args_override):
            "lr": args.lr_backbone,
        },
    ]
-    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
+    optimizer = AdamW(param_dicts, lr=args.lr,
-                                  weight_decay=args.weight_decay)
+                      weight_decay=args.weight_decay)
    return model, optimizer
--- a/detr/models/backbone.py
+++ b/detr/models/backbone.py
@@ -89,9 +89,32 @@ class Backbone(BackboneBase):
                 train_backbone: bool,
                 return_interm_layers: bool,
                 dilation: bool):
-        backbone = getattr(torchvision.models, name)(
+        backbone_builder = getattr(torchvision.models, name)
-            replace_stride_with_dilation=[False, False, dilation],
+        weights = None
-            pretrained=is_main_process(), norm_layer=FrozenBatchNorm2d) # pretrained # TODO do we want frozen batch_norm??
+        if is_main_process():
            weight_enum_name_map = {
                'resnet18': 'ResNet18_Weights',
                'resnet34': 'ResNet34_Weights',
                'resnet50': 'ResNet50_Weights',
                'resnet101': 'ResNet101_Weights',
            }
            enum_name = weight_enum_name_map.get(name)
            if enum_name is not None and hasattr(torchvision.models, enum_name):
                weights = getattr(getattr(torchvision.models, enum_name), 'DEFAULT')
        try:
            backbone = backbone_builder(
                replace_stride_with_dilation=[False, False, dilation],
                weights=weights,
                norm_layer=FrozenBatchNorm2d,
            )
        except TypeError:
            # Backward compatibility for older torchvision that still expects `pretrained`.
            backbone = backbone_builder(
                replace_stride_with_dilation=[False, False, dilation],
                pretrained=(weights is not None),
                norm_layer=FrozenBatchNorm2d,
            )
        num_channels = 512 if name in ('resnet18', 'resnet34') else 2048
        super().__init__(backbone, train_backbone, num_channels, return_interm_layers)
--- a/detr/models/detr_vae.py
+++ b/detr/models/detr_vae.py
@@ -33,7 +33,8 @@ def get_sinusoid_encoding_table(n_position, d_hid):
 class DETRVAE(nn.Module):
    """ This is the DETR module that performs object detection """
-    def __init__(self, backbones, transformer, encoder, state_dim, num_queries, camera_names):
+    def __init__(self, backbones, transformer, encoder, state_dim, action_dim, num_queries, camera_names,
                 use_text=False, text_feature_dim=768, text_fusion_type='concat_transformer_input'):
        """ Initializes the model.
        Parameters:
            backbones: torch module of the backbone to be used. See backbone.py
@@ -48,17 +49,18 @@ class DETRVAE(nn.Module):
        self.camera_names = camera_names
        self.transformer = transformer
        self.encoder = encoder
        self.use_text = use_text
        self.text_fusion_type = text_fusion_type
        hidden_dim = transformer.d_model
-        self.action_head = nn.Linear(hidden_dim, state_dim)
+        self.action_head = nn.Linear(hidden_dim, action_dim)
        self.is_pad_head = nn.Linear(hidden_dim, 1)
        self.query_embed = nn.Embedding(num_queries, hidden_dim)
        if backbones is not None:
            self.input_proj = nn.Conv2d(backbones[0].num_channels, hidden_dim, kernel_size=1)
            self.backbones = nn.ModuleList(backbones)
-            self.input_proj_robot_state = nn.Linear(14, hidden_dim)
+            self.input_proj_robot_state = nn.Linear(state_dim, hidden_dim)
        else:
-            # input_dim = 14 + 7 # robot_state + env_state
+            self.input_proj_robot_state = nn.Linear(state_dim, hidden_dim)
            self.input_proj_robot_state = nn.Linear(14, hidden_dim)
            self.input_proj_env_state = nn.Linear(7, hidden_dim)
            self.pos = torch.nn.Embedding(2, hidden_dim)
            self.backbones = None
@@ -66,16 +68,18 @@ class DETRVAE(nn.Module):
        # encoder extra parameters
        self.latent_dim = 32 # final size of latent z # TODO tune
        self.cls_embed = nn.Embedding(1, hidden_dim) # extra cls token embedding
-        self.encoder_action_proj = nn.Linear(14, hidden_dim) # project action to embedding
+        self.encoder_action_proj = nn.Linear(action_dim, hidden_dim) # project action to embedding
-        self.encoder_joint_proj = nn.Linear(14, hidden_dim)  # project qpos to embedding
+        self.encoder_joint_proj = nn.Linear(state_dim, hidden_dim)  # project qpos to embedding
        self.latent_proj = nn.Linear(hidden_dim, self.latent_dim*2) # project hidden state to latent std, var
        self.register_buffer('pos_table', get_sinusoid_encoding_table(1+1+num_queries, hidden_dim)) # [CLS], qpos, a_seq
        # decoder extra parameters
        self.latent_out_proj = nn.Linear(self.latent_dim, hidden_dim) # project latent sample to embedding
-        self.additional_pos_embed = nn.Embedding(2, hidden_dim) # learned position embedding for proprio and latent
+        num_extra_tokens = 3 if self.use_text else 2
        self.additional_pos_embed = nn.Embedding(num_extra_tokens, hidden_dim) # latent, proprio, optional text
        self.text_proj = nn.Linear(text_feature_dim, hidden_dim) if self.use_text else None
-    def forward(self, qpos, image, env_state, actions=None, is_pad=None):
+    def forward(self, qpos, image, env_state, text_features=None, actions=None, is_pad=None):
        """
        qpos: batch, qpos_dim
        image: batch, num_cam, channel, height, width
@@ -125,10 +129,25 @@ class DETRVAE(nn.Module):
                all_cam_pos.append(pos)
            # proprioception features
            proprio_input = self.input_proj_robot_state(qpos)
            extra_input_tokens = None
            if self.use_text and text_features is not None:
                if self.text_fusion_type != 'concat_transformer_input':
                    raise NotImplementedError(f'Unsupported text fusion type: {self.text_fusion_type}')
                text_input = self.text_proj(text_features)
                extra_input_tokens = text_input.unsqueeze(0)
            # fold camera dimension into width dimension
            src = torch.cat(all_cam_features, axis=3)
            pos = torch.cat(all_cam_pos, axis=3)
-            hs = self.transformer(src, None, self.query_embed.weight, pos, latent_input, proprio_input, self.additional_pos_embed.weight)[0]
+            hs = self.transformer(
                src,
                None,
                self.query_embed.weight,
                pos,
                latent_input,
                proprio_input,
                self.additional_pos_embed.weight,
                extra_input_tokens=extra_input_tokens,
            )[0]
        else:
            qpos = self.input_proj_robot_state(qpos)
            env_state = self.input_proj_env_state(env_state)
@@ -141,7 +160,7 @@ class DETRVAE(nn.Module):
 class CNNMLP(nn.Module):
-    def __init__(self, backbones, state_dim, camera_names):
+    def __init__(self, backbones, state_dim, action_dim, camera_names):
        """ Initializes the model.
        Parameters:
            backbones: torch module of the backbone to be used. See backbone.py
@@ -153,7 +172,7 @@ class CNNMLP(nn.Module):
        """
        super().__init__()
        self.camera_names = camera_names
-        self.action_head = nn.Linear(1000, state_dim) # TODO add more
+        self.action_head = nn.Linear(1000, action_dim) # TODO add more
        if backbones is not None:
            self.backbones = nn.ModuleList(backbones)
            backbone_down_projs = []
@@ -166,8 +185,8 @@ class CNNMLP(nn.Module):
                backbone_down_projs.append(down_proj)
            self.backbone_down_projs = nn.ModuleList(backbone_down_projs)
-            mlp_in_dim = 768 * len(backbones) + 14
+            mlp_in_dim = 768 * len(backbones) + state_dim
-            self.mlp = mlp(input_dim=mlp_in_dim, hidden_dim=1024, output_dim=14, hidden_depth=2)
+            self.mlp = mlp(input_dim=mlp_in_dim, hidden_dim=1024, output_dim=action_dim, hidden_depth=2)
        else:
            raise NotImplementedError
@@ -192,7 +211,7 @@ class CNNMLP(nn.Module):
        for cam_feature in all_cam_features:
            flattened_features.append(cam_feature.reshape([bs, -1]))
        flattened_features = torch.cat(flattened_features, axis=1) # 768 each
-        features = torch.cat([flattened_features, qpos], axis=1) # qpos: 14
+        features = torch.cat([flattened_features, qpos], axis=1)
        a_hat = self.mlp(features)
        return a_hat
@@ -227,7 +246,8 @@ def build_encoder(args):
 def build(args):
-    state_dim = 14 # TODO hardcode
+    state_dim = args.state_dim
    action_dim = args.action_dim
    # From state
    # backbone = None # from state for now, no need for conv nets
@@ -245,8 +265,12 @@ def build(args):
        transformer,
        encoder,
        state_dim=state_dim,
        action_dim=action_dim,
        num_queries=args.num_queries,
        camera_names=args.camera_names,
        use_text=args.use_text,
        text_feature_dim=args.text_feature_dim,
        text_fusion_type=args.text_fusion_type,
    )
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
@@ -255,7 +279,8 @@ def build(args):
    return model
 def build_cnnmlp(args):
-    state_dim = 14 # TODO hardcode
+    state_dim = args.state_dim
    action_dim = args.action_dim
    # From state
    # backbone = None # from state for now, no need for conv nets
@@ -268,6 +293,7 @@ def build_cnnmlp(args):
    model = CNNMLP(
        backbones,
        state_dim=state_dim,
        action_dim=action_dim,
        camera_names=args.camera_names,
    )
--- a/detr/models/transformer.py
+++ b/detr/models/transformer.py
@@ -46,7 +46,7 @@ class Transformer(nn.Module):
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)
-    def forward(self, src, mask, query_embed, pos_embed, latent_input=None, proprio_input=None, additional_pos_embed=None):
+    def forward(self, src, mask, query_embed, pos_embed, latent_input=None, proprio_input=None, additional_pos_embed=None, extra_input_tokens=None):
        # TODO flatten only when input has H and W
        if len(src.shape) == 4: # has H and W
            # flatten NxCxHxW to HWxNxC
@@ -56,10 +56,19 @@ class Transformer(nn.Module):
            query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1)
            # mask = mask.flatten(1)
-            additional_pos_embed = additional_pos_embed.unsqueeze(1).repeat(1, bs, 1) # seq, bs, dim
+            additional_inputs = [latent_input, proprio_input]
-            pos_embed = torch.cat([additional_pos_embed, pos_embed], axis=0)
+            if extra_input_tokens is not None:
                if len(extra_input_tokens.shape) == 2:
                    extra_input_tokens = extra_input_tokens.unsqueeze(0)
                for i in range(extra_input_tokens.shape[0]):
                    additional_inputs.append(extra_input_tokens[i])
            addition_input = torch.stack(additional_inputs, axis=0)
            if additional_pos_embed is not None:
                additional_pos_embed = additional_pos_embed[:addition_input.shape[0]]
                additional_pos_embed = additional_pos_embed.unsqueeze(1).repeat(1, bs, 1) # seq, bs, dim
                pos_embed = torch.cat([additional_pos_embed, pos_embed], axis=0)
            addition_input = torch.stack([latent_input, proprio_input], axis=0)
            src = torch.cat([addition_input, src], axis=0)
        else:
            assert len(src.shape) == 3
--- a/imitate_episodes.py
+++ b/imitate_episodes.py
@@ -10,14 +10,13 @@ from einops import rearrange
 from constants import DT
 from constants import PUPPET_GRIPPER_JOINT_OPEN
 from constants import SIM_TASK_CONFIGS, ENDOSCOPE_TASK_CONFIGS
 from utils import load_data # data functions
 from utils import sample_box_pose, sample_insertion_pose # robot functions
 from utils import compute_dict_mean, set_seed, detach_dict # helper functions
 from policy import ACTPolicy, CNNMLPPolicy
 from visualize_episodes import save_videos
 from sim_env import BOX_POSE
 import IPython
 e = IPython.embed
@@ -34,25 +33,47 @@ def main(args):
    num_epochs = args['num_epochs']
    # get task parameters
-    is_sim = task_name[:4] == 'sim_'
+    is_endoscope = task_name in ENDOSCOPE_TASK_CONFIGS
-    if is_sim:
+    if is_endoscope:
-        from constants import SIM_TASK_CONFIGS
+        task_config = ENDOSCOPE_TASK_CONFIGS[task_name]
        is_sim = False
    elif task_name in SIM_TASK_CONFIGS:
        task_config = SIM_TASK_CONFIGS[task_name]
        is_sim = True
    else:
        from aloha_scripts.constants import TASK_CONFIGS
        task_config = TASK_CONFIGS[task_name]
        is_sim = False
    dataset_dir = task_config['dataset_dir']
    num_episodes = task_config['num_episodes']
    episode_len = task_config['episode_len']
    camera_names = task_config['camera_names']
    state_dim = task_config.get('state_dim', 14)
    action_dim = task_config.get('action_dim', state_dim)
    use_text_instruction = task_config.get('use_text_instruction', False)
    instruction_mode = task_config.get('instruction_mode', 'timestep-level')
    use_cached_text_features = task_config.get('use_cached_text_features', True)
    text_encoder_type = task_config.get('text_encoder_type', 'distilbert')
    text_feature_dim = task_config.get('text_feature_dim', 768)
    text_fusion_type = task_config.get('text_fusion_type', 'concat_transformer_input')
    text_max_length = task_config.get('text_max_length', 32)
    text_tokenizer_name = task_config.get('text_tokenizer_name', 'distilbert-base-uncased')
    freeze_text_encoder = task_config.get('freeze_text_encoder', True)
    if args.get('text_encoder_type') is not None:
        text_encoder_type = args['text_encoder_type']
    if args.get('text_max_length') is not None:
        text_max_length = args['text_max_length']
    if args.get('freeze_text_encoder', False):
        freeze_text_encoder = True
    # fixed parameters
    state_dim = 14
    lr_backbone = 1e-5
    backbone = 'resnet18'
    if policy_class == 'ACT':
-        enc_layers = 4
+        enc_layers = 2
-        dec_layers = 7
+        dec_layers = 4
        nheads = 8
        policy_config = {'lr': args['lr'],
                         'num_queries': args['chunk_size'],
@@ -65,10 +86,25 @@ def main(args):
                         'dec_layers': dec_layers,
                         'nheads': nheads,
                         'camera_names': camera_names,
                         'state_dim': state_dim,
                         'action_dim': action_dim,
                         'use_text': use_text_instruction,
                         'text_encoder_type': text_encoder_type,
                         'text_feature_dim': text_feature_dim,
                         'text_fusion_type': text_fusion_type,
                         'freeze_text_encoder': freeze_text_encoder,
                         'instruction_mode': instruction_mode,
                         'use_cached_text_features': use_cached_text_features,
                         'text_max_length': text_max_length,
                         'text_tokenizer_name': text_tokenizer_name,
                         }
    elif policy_class == 'CNNMLP':
        policy_config = {'lr': args['lr'], 'lr_backbone': lr_backbone, 'backbone' : backbone, 'num_queries': 1,
-                         'camera_names': camera_names,}
+                         'camera_names': camera_names,
                         'state_dim': state_dim,
                         'action_dim': action_dim,
                         'use_text': use_text_instruction,
                         }
    else:
        raise NotImplementedError
@@ -77,6 +113,7 @@ def main(args):
        'ckpt_dir': ckpt_dir,
        'episode_len': episode_len,
        'state_dim': state_dim,
        'action_dim': action_dim,
        'lr': args['lr'],
        'policy_class': policy_class,
        'onscreen_render': onscreen_render,
@@ -85,7 +122,12 @@ def main(args):
        'seed': args['seed'],
        'temporal_agg': args['temporal_agg'],
        'camera_names': camera_names,
-        'real_robot': not is_sim
+        'real_robot': (not is_sim) and (not is_endoscope),
        'use_text_instruction': use_text_instruction,
        'instruction_mode': instruction_mode,
        'use_cached_text_features': use_cached_text_features,
        'text_tokenizer_name': text_tokenizer_name,
        'text_max_length': text_max_length,
    }
    if is_eval:
@@ -100,7 +142,19 @@ def main(args):
        print()
        exit()
-    train_dataloader, val_dataloader, stats, _ = load_data(dataset_dir, num_episodes, camera_names, batch_size_train, batch_size_val)
+    train_dataloader, val_dataloader, stats, _ = load_data(
        dataset_dir,
        num_episodes,
        camera_names,
        batch_size_train,
        batch_size_val,
        use_text_instruction=use_text_instruction,
        instruction_mode=instruction_mode,
        use_cached_text_features=use_cached_text_features,
        text_feature_dim=text_feature_dim,
        text_tokenizer_name=text_tokenizer_name,
        text_max_length=text_max_length,
    )
    # save dataset stats
    if not os.path.isdir(ckpt_dir):
@@ -152,6 +206,7 @@ def eval_bc(config, ckpt_name, save_episode=True):
    set_seed(1000)
    ckpt_dir = config['ckpt_dir']
    state_dim = config['state_dim']
    action_dim = config['action_dim']
    real_robot = config['real_robot']
    policy_class = config['policy_class']
    onscreen_render = config['onscreen_render']
@@ -161,6 +216,7 @@ def eval_bc(config, ckpt_name, save_episode=True):
    task_name = config['task_name']
    temporal_agg = config['temporal_agg']
    onscreen_cam = 'angle'
    BOX_POSE = None
    # load policy and stats
    ckpt_path = os.path.join(ckpt_dir, ckpt_name)
@@ -202,8 +258,14 @@ def eval_bc(config, ckpt_name, save_episode=True):
        rollout_id += 0
        ### set task
        if 'sim_transfer_cube' in task_name:
            if BOX_POSE is None:
                from sim_env import BOX_POSE as _BOX_POSE
                BOX_POSE = _BOX_POSE
            BOX_POSE[0] = sample_box_pose() # used in sim reset
        elif 'sim_insertion' in task_name:
            if BOX_POSE is None:
                from sim_env import BOX_POSE as _BOX_POSE
                BOX_POSE = _BOX_POSE
            BOX_POSE[0] = np.concatenate(sample_insertion_pose()) # used in sim reset
        ts = env.reset()
@@ -216,7 +278,7 @@ def eval_bc(config, ckpt_name, save_episode=True):
        ### evaluation loop
        if temporal_agg:
-            all_time_actions = torch.zeros([max_timesteps, max_timesteps+num_queries, state_dim]).cuda()
+            all_time_actions = torch.zeros([max_timesteps, max_timesteps+num_queries, action_dim]).cuda()
        qpos_history = torch.zeros((1, max_timesteps, state_dim)).cuda()
        image_list = [] # for visualization
@@ -314,9 +376,29 @@ def eval_bc(config, ckpt_name, save_episode=True):
 def forward_pass(data, policy):
-    image_data, qpos_data, action_data, is_pad = data
+    image_data, qpos_data, action_data, is_pad, text_input_ids, text_attention_mask, text_feature_data, text_feature_valid = data
-    image_data, qpos_data, action_data, is_pad = image_data.cuda(), qpos_data.cuda(), action_data.cuda(), is_pad.cuda()
+    image_data = image_data.cuda()
-    return policy(qpos_data, image_data, action_data, is_pad) # TODO remove None
+    qpos_data = qpos_data.cuda()
    action_data = action_data.cuda()
    is_pad = is_pad.cuda()
    text_input_ids = text_input_ids.cuda()
    text_attention_mask = text_attention_mask.cuda()
    text_feature_data = text_feature_data.cuda()
    text_feature_valid = text_feature_valid.cuda()
    text_features = None
    if torch.any(text_feature_valid):
        text_features = text_feature_data
    return policy(
        qpos_data,
        image_data,
        text_input_ids=text_input_ids,
        text_attention_mask=text_attention_mask,
        text_features=text_features,
        actions=action_data,
        is_pad=is_pad,
    )
 def train_bc(train_dataloader, val_dataloader, config):
@@ -431,5 +513,8 @@ if __name__ == '__main__':
    parser.add_argument('--hidden_dim', action='store', type=int, help='hidden_dim', required=False)
    parser.add_argument('--dim_feedforward', action='store', type=int, help='dim_feedforward', required=False)
    parser.add_argument('--temporal_agg', action='store_true')
    parser.add_argument('--text_encoder_type', action='store', type=str, required=False)
    parser.add_argument('--freeze_text_encoder', action='store_true')
    parser.add_argument('--text_max_length', action='store', type=int, required=False)
    main(vars(parser.parse_args()))
--- a/policy.py
+++ b/policy.py
@@ -3,6 +3,7 @@ from torch.nn import functional as F
 import torchvision.transforms as transforms
 from detr.main import build_ACT_model_and_optimizer, build_CNNMLP_model_and_optimizer
 from models.text_encoder import DistilBERTTextEncoder
 import IPython
 e = IPython.embed
@@ -13,18 +14,44 @@ class ACTPolicy(nn.Module):
        self.model = model # CVAE decoder
        self.optimizer = optimizer
        self.kl_weight = args_override['kl_weight']
        self.use_text = args_override.get('use_text', False)
        self.text_encoder = None
        if self.use_text:
            text_encoder_type = args_override.get('text_encoder_type', 'distilbert')
            if text_encoder_type != 'distilbert':
                raise NotImplementedError(f'Unsupported text encoder: {text_encoder_type}')
            self.text_encoder = DistilBERTTextEncoder(
                model_name=args_override.get('text_tokenizer_name', 'distilbert-base-uncased'),
                output_dim=args_override.get('text_feature_dim', 768),
                freeze=args_override.get('freeze_text_encoder', True),
            )
        print(f'KL Weight {self.kl_weight}')
-    def __call__(self, qpos, image, actions=None, is_pad=None):
+    def __call__(self, qpos, image, text_input_ids=None, text_attention_mask=None, text_features=None, actions=None, is_pad=None):
        env_state = None
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
        image = normalize(image)
        if self.use_text and text_features is None and text_input_ids is not None and text_attention_mask is not None:
            if self.text_encoder is None:
                raise RuntimeError('Text encoder is not initialized while use_text=True.')
            text_features = self.text_encoder(text_input_ids, text_attention_mask)
        if actions is not None: # training time
            if is_pad is None:
                raise ValueError('`is_pad` must be provided during training when `actions` is not None.')
            actions = actions[:, :self.model.num_queries]
            is_pad = is_pad[:, :self.model.num_queries]
-            a_hat, is_pad_hat, (mu, logvar) = self.model(qpos, image, env_state, actions, is_pad)
+            a_hat, is_pad_hat, (mu, logvar) = self.model(
                qpos,
                image,
                env_state,
                text_features=text_features,
                actions=actions,
                is_pad=is_pad,
            )
            total_kld, dim_wise_kld, mean_kld = kl_divergence(mu, logvar)
            loss_dict = dict()
            all_l1 = F.l1_loss(actions, a_hat, reduction='none')
@@ -34,7 +61,7 @@ class ACTPolicy(nn.Module):
            loss_dict['loss'] = loss_dict['l1'] + loss_dict['kl'] * self.kl_weight
            return loss_dict
        else: # inference time
-            a_hat, _, (_, _) = self.model(qpos, image, env_state) # no action, sample from prior
+            a_hat, _, (_, _) = self.model(qpos, image, env_state, text_features=text_features) # no action, sample from prior
            return a_hat
    def configure_optimizers(self):
@@ -48,7 +75,7 @@ class CNNMLPPolicy(nn.Module):
        self.model = model # decoder
        self.optimizer = optimizer
-    def __call__(self, qpos, image, actions=None, is_pad=None):
+    def __call__(self, qpos, image, text_input_ids=None, text_attention_mask=None, text_features=None, actions=None, is_pad=None):
        env_state = None # TODO
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
--- a/utils.py
+++ b/utils.py
@@ -2,21 +2,88 @@ import numpy as np
 import torch
 import os
 import h5py
 import re
 from torch.utils.data import TensorDataset, DataLoader
 import IPython
 e = IPython.embed
 class EpisodicDataset(torch.utils.data.Dataset):
-    def __init__(self, episode_ids, dataset_dir, camera_names, norm_stats):
+    def __init__(self, episode_ids, dataset_dir, camera_names, norm_stats,
                 use_text_instruction=False,
                 instruction_mode='timestep-level',
                 use_cached_text_features=True,
                 text_feature_dim=768,
                 text_tokenizer_name='distilbert-base-uncased',
                 text_max_length=32):
        super(EpisodicDataset).__init__()
        self.episode_ids = episode_ids
        self.dataset_dir = dataset_dir
        self.camera_names = camera_names
        self.norm_stats = norm_stats
        self.use_text_instruction = use_text_instruction
        self.instruction_mode = instruction_mode
        self.use_cached_text_features = use_cached_text_features
        self.text_feature_dim = text_feature_dim
        self.text_max_length = text_max_length
        self.is_sim = None
        self.max_episode_len = None
        self.action_dim = None
        self.text_tokenizer = None
        if self.use_text_instruction:
            try:
                from transformers import DistilBertTokenizerFast
            except ImportError as exc:
                raise ImportError(
                    'transformers is required for text instruction loading. '
                    'Install it with: pip install transformers'
                ) from exc
            self.text_tokenizer = DistilBertTokenizerFast.from_pretrained(text_tokenizer_name)
        self._init_episode_shapes()
        self.__getitem__(0) # initialize self.is_sim
    def _init_episode_shapes(self):
        max_len = 0
        action_dim = None
        for episode_id in self.episode_ids:
            dataset_path = os.path.join(self.dataset_dir, f'episode_{episode_id}.hdf5')
            with h5py.File(dataset_path, 'r') as root:
                shape = root['/action'].shape
                if len(shape) != 2:
                    raise ValueError(f'Expected /action to have shape [T, D], got {shape} in {dataset_path}')
                max_len = max(max_len, int(shape[0]))
                if action_dim is None:
                    action_dim = int(shape[1])
                elif int(shape[1]) != action_dim:
                    raise ValueError(
                        f'Inconsistent action dim in dataset. Expected {action_dim}, got {shape[1]} in {dataset_path}'
                    )
        if max_len <= 0 or action_dim is None:
            raise ValueError(f'Invalid dataset metadata in {self.dataset_dir}')
        self.max_episode_len = max_len
        self.action_dim = action_dim
    @staticmethod
    def _decode_instruction(raw_value):
        if raw_value is None:
            return ''
        if isinstance(raw_value, bytes):
            return raw_value.decode('utf-8')
        if isinstance(raw_value, np.bytes_):
            return raw_value.tobytes().decode('utf-8')
        if isinstance(raw_value, np.ndarray):
            if raw_value.shape == ():
                return EpisodicDataset._decode_instruction(raw_value.item())
            if raw_value.size == 0:
                return ''
            return EpisodicDataset._decode_instruction(raw_value.reshape(-1)[0])
        return str(raw_value)
    def __len__(self):
        return len(self.episode_ids)
@@ -26,7 +93,7 @@ class EpisodicDataset(torch.utils.data.Dataset):
        episode_id = self.episode_ids[index]
        dataset_path = os.path.join(self.dataset_dir, f'episode_{episode_id}.hdf5')
        with h5py.File(dataset_path, 'r') as root:
-            is_sim = root.attrs['sim']
+            is_sim = bool(root.attrs.get('sim', False))
            original_action_shape = root['/action'].shape
            episode_len = original_action_shape[0]
            if sample_full_episode:
@@ -35,10 +102,40 @@ class EpisodicDataset(torch.utils.data.Dataset):
                start_ts = np.random.choice(episode_len)
            # get observation at start_ts only
            qpos = root['/observations/qpos'][start_ts]
            qvel = root['/observations/qvel'][start_ts]
            image_dict = dict()
            for cam_name in self.camera_names:
                image_dict[cam_name] = root[f'/observations/images/{cam_name}'][start_ts]
            instruction = ''
            text_feature = None
            if self.use_text_instruction:
                effective_mode = self.instruction_mode
                if effective_mode == 'timestep-level' and '/instruction_timestep' in root:
                    instruction = self._decode_instruction(root['/instruction_timestep'][start_ts])
                elif '/instruction' in root:
                    instruction_node = root['/instruction']
                    if getattr(instruction_node, 'shape', ()) == ():
                        instruction = self._decode_instruction(instruction_node[()])
                    else:
                        if len(instruction_node.shape) == 1 and instruction_node.shape[0] == episode_len:
                            instruction = self._decode_instruction(instruction_node[start_ts])
                        else:
                            instruction = self._decode_instruction(instruction_node[0])
                if self.use_cached_text_features:
                    if effective_mode == 'timestep-level' and '/instruction_features_timestep' in root:
                        text_feature = root['/instruction_features_timestep'][start_ts]
                    elif '/instruction_features' in root:
                        feat_node = root['/instruction_features']
                        if getattr(feat_node, 'shape', ()) == ():
                            text_feature = np.array(feat_node[()])
                        elif len(feat_node.shape) == 1:
                            text_feature = feat_node[()]
                        elif len(feat_node.shape) == 2 and feat_node.shape[0] == episode_len:
                            text_feature = feat_node[start_ts]
                        else:
                            text_feature = feat_node[0]
            # get all actions after and including start_ts
            if is_sim:
                action = root['/action'][start_ts:]
@@ -48,10 +145,10 @@ class EpisodicDataset(torch.utils.data.Dataset):
                action_len = episode_len - max(0, start_ts - 1) # hack, to make timesteps more aligned
        self.is_sim = is_sim
-        padded_action = np.zeros(original_action_shape, dtype=np.float32)
+        padded_action = np.zeros((self.max_episode_len, self.action_dim), dtype=np.float32)
        padded_action[:action_len] = action
-        is_pad = np.zeros(episode_len)
+        is_pad = np.ones(self.max_episode_len)
-        is_pad[action_len:] = 1
+        is_pad[:action_len] = 0
        # new axis for different cameras
        all_cam_images = []
@@ -73,55 +170,132 @@ class EpisodicDataset(torch.utils.data.Dataset):
        action_data = (action_data - self.norm_stats["action_mean"]) / self.norm_stats["action_std"]
        qpos_data = (qpos_data - self.norm_stats["qpos_mean"]) / self.norm_stats["qpos_std"]
-        return image_data, qpos_data, action_data, is_pad
+        if self.use_text_instruction and text_feature is not None:
            text_feature_data = torch.from_numpy(np.array(text_feature)).float()
            text_feature_valid = torch.tensor(True, dtype=torch.bool)
            text_input_ids = torch.zeros(1, dtype=torch.long)
            text_attention_mask = torch.zeros(1, dtype=torch.long)
        elif self.use_text_instruction:
            tokenized = self.text_tokenizer(
                instruction,
                padding='max_length',
                truncation=True,
                max_length=self.text_max_length,
                return_tensors='pt',
            )
            text_input_ids = tokenized['input_ids'].squeeze(0).long()
            text_attention_mask = tokenized['attention_mask'].squeeze(0).long()
            text_feature_data = torch.zeros(self.text_feature_dim, dtype=torch.float32)
            text_feature_valid = torch.tensor(False, dtype=torch.bool)
        else:
            text_input_ids = torch.zeros(1, dtype=torch.long)
            text_attention_mask = torch.zeros(1, dtype=torch.long)
            text_feature_data = torch.zeros(self.text_feature_dim, dtype=torch.float32)
            text_feature_valid = torch.tensor(False, dtype=torch.bool)
        return image_data, qpos_data, action_data, is_pad, text_input_ids, text_attention_mask, text_feature_data, text_feature_valid
-def get_norm_stats(dataset_dir, num_episodes):
+def _discover_episode_ids(dataset_dir, num_episodes=None):
    pattern = re.compile(r'^episode_(\d+)\.hdf5$')
    episode_ids = []
    for fname in os.listdir(dataset_dir):
        m = pattern.match(fname)
        if m:
            episode_ids.append(int(m.group(1)))
    episode_ids.sort()
    if num_episodes is not None:
        episode_ids = episode_ids[:num_episodes]
    return episode_ids
 def get_norm_stats(dataset_dir, episode_ids):
    all_qpos_data = []
    all_action_data = []
-    for episode_idx in range(num_episodes):
+    example_qpos = None
    for episode_idx in episode_ids:
        dataset_path = os.path.join(dataset_dir, f'episode_{episode_idx}.hdf5')
        with h5py.File(dataset_path, 'r') as root:
            qpos = root['/observations/qpos'][()]
            qvel = root['/observations/qvel'][()]
            action = root['/action'][()]
-        all_qpos_data.append(torch.from_numpy(qpos))
+        qpos_t = torch.from_numpy(qpos)
-        all_action_data.append(torch.from_numpy(action))
+        action_t = torch.from_numpy(action)
-    all_qpos_data = torch.stack(all_qpos_data)
+        all_qpos_data.append(qpos_t)
-    all_action_data = torch.stack(all_action_data)
+        all_action_data.append(action_t)
-    all_action_data = all_action_data
+        if example_qpos is None and len(qpos) > 0:
            example_qpos = qpos[0]
    # Episodes may have different lengths; concatenate over time axis.
    all_qpos_data = torch.cat(all_qpos_data, dim=0)
    all_action_data = torch.cat(all_action_data, dim=0)
    # normalize action data
-    action_mean = all_action_data.mean(dim=[0, 1], keepdim=True)
+    action_mean = all_action_data.mean(dim=0, keepdim=True)
-    action_std = all_action_data.std(dim=[0, 1], keepdim=True)
+    action_std = all_action_data.std(dim=0, keepdim=True)
    action_std = torch.clip(action_std, 1e-2, np.inf) # clipping
    # normalize qpos data
-    qpos_mean = all_qpos_data.mean(dim=[0, 1], keepdim=True)
+    qpos_mean = all_qpos_data.mean(dim=0, keepdim=True)
-    qpos_std = all_qpos_data.std(dim=[0, 1], keepdim=True)
+    qpos_std = all_qpos_data.std(dim=0, keepdim=True)
    qpos_std = torch.clip(qpos_std, 1e-2, np.inf) # clipping
    stats = {"action_mean": action_mean.numpy().squeeze(), "action_std": action_std.numpy().squeeze(),
             "qpos_mean": qpos_mean.numpy().squeeze(), "qpos_std": qpos_std.numpy().squeeze(),
-             "example_qpos": qpos}
+             "example_qpos": example_qpos}
    return stats
-
+def load_data(dataset_dir, num_episodes, camera_names, batch_size_train, batch_size_val,
-def load_data(dataset_dir, num_episodes, camera_names, batch_size_train, batch_size_val):
+              use_text_instruction=False,
              instruction_mode='timestep-level',
              use_cached_text_features=True,
              text_feature_dim=768,
              text_tokenizer_name='distilbert-base-uncased',
              text_max_length=32):
    print(f'\nData from: {dataset_dir}\n')
    episode_ids = _discover_episode_ids(dataset_dir, num_episodes)
    if len(episode_ids) == 0:
        raise FileNotFoundError(f'No episode_*.hdf5 found in {dataset_dir}')
    if len(episode_ids) < 2:
        raise ValueError(f'Need at least 2 episodes for train/val split, found {len(episode_ids)} in {dataset_dir}')
    # obtain train test split
    train_ratio = 0.8
-    shuffled_indices = np.random.permutation(num_episodes)
+    shuffled_indices = np.random.permutation(len(episode_ids))
-    train_indices = shuffled_indices[:int(train_ratio * num_episodes)]
+    train_count = int(train_ratio * len(episode_ids))
-    val_indices = shuffled_indices[int(train_ratio * num_episodes):]
+    train_indices = shuffled_indices[:train_count]
    val_indices = shuffled_indices[train_count:]
    train_episode_ids = np.array(episode_ids)[train_indices]
    val_episode_ids = np.array(episode_ids)[val_indices]
    # obtain normalization stats for qpos and action
-    norm_stats = get_norm_stats(dataset_dir, num_episodes)
+    norm_stats = get_norm_stats(dataset_dir, episode_ids)
    # construct dataset and dataloader
-    train_dataset = EpisodicDataset(train_indices, dataset_dir, camera_names, norm_stats)
+    train_dataset = EpisodicDataset(
-    val_dataset = EpisodicDataset(val_indices, dataset_dir, camera_names, norm_stats)
+        train_episode_ids,
        dataset_dir,
        camera_names,
        norm_stats,
        use_text_instruction=use_text_instruction,
        instruction_mode=instruction_mode,
        use_cached_text_features=use_cached_text_features,
        text_feature_dim=text_feature_dim,
        text_tokenizer_name=text_tokenizer_name,
        text_max_length=text_max_length,
    )
    val_dataset = EpisodicDataset(
        val_episode_ids,
        dataset_dir,
        camera_names,
        norm_stats,
        use_text_instruction=use_text_instruction,
        instruction_mode=instruction_mode,
        use_cached_text_features=use_cached_text_features,
        text_feature_dim=text_feature_dim,
        text_tokenizer_name=text_tokenizer_name,
        text_max_length=text_max_length,
    )
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size_train, shuffle=True, pin_memory=True, num_workers=1, prefetch_factor=1)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size_val, shuffle=True, pin_memory=True, num_workers=1, prefetch_factor=1)
--- a/visualize_episodes.py
+++ b/visualize_episodes.py
@@ -21,32 +21,33 @@ def load_hdf5(dataset_dir, dataset_name):
    with h5py.File(dataset_path, 'r') as root:
        is_sim = root.attrs['sim']
        dt = float(root.attrs.get('dt', DT))
        qpos = root['/observations/qpos'][()]
-        qvel = root['/observations/qvel'][()]
+        qvel = root['/observations/qvel'][()] if '/observations/qvel' in root else np.zeros_like(qpos)
        action = root['/action'][()]
        image_dict = dict()
        for cam_name in root[f'/observations/images/'].keys():
            image_dict[cam_name] = root[f'/observations/images/{cam_name}'][()]
-    return qpos, qvel, action, image_dict
+    return qpos, qvel, action, image_dict, dt
 def main(args):
    dataset_dir = args['dataset_dir']
    episode_idx = args['episode_idx']
    dataset_name = f'episode_{episode_idx}'
-    qpos, qvel, action, image_dict = load_hdf5(dataset_dir, dataset_name)
+    qpos, qvel, action, image_dict, dt = load_hdf5(dataset_dir, dataset_name)
-    save_videos(image_dict, DT, video_path=os.path.join(dataset_dir, dataset_name + '_video.mp4'))
+    save_videos(image_dict, dt, video_path=os.path.join(dataset_dir, dataset_name + '_video.mp4'))
    visualize_joints(qpos, action, plot_path=os.path.join(dataset_dir, dataset_name + '_qpos.png'))
    # visualize_timestamp(t_list, dataset_path) # TODO addn timestamp back
-def save_videos(video, dt, video_path=None):
+def save_videos(video, dt, video_path):
    if isinstance(video, list):
        cam_names = list(video[0].keys())
        h, w, _ = video[0][cam_names[0]].shape
        w = w * len(cam_names)
-        fps = int(1/dt)
+        fps = max(1, int(round(1 / dt)))
        out = cv2.VideoWriter(video_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h))
        for ts, image_dict in enumerate(video):
            images = []
@@ -66,7 +67,7 @@ def save_videos(video, dt, video_path=None):
        all_cam_videos = np.concatenate(all_cam_videos, axis=2) # width dimension
        n_frames, h, w, _ = all_cam_videos.shape
-        fps = int(1 / dt)
+        fps = max(1, int(round(1 / dt)))
        out = cv2.VideoWriter(video_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h))
        for t in range(n_frames):
            image = all_cam_videos[t]