From d85cce8a521d8d5eb7474ee87225dd95945c6b08 Mon Sep 17 00:00:00 2001
From: JC6123 <jimmychan6123@163.com>
Date: Fri, 20 Feb 2026 14:13:25 +0800
Subject: [PATCH] =?UTF-8?q?=E6=9E=84=E5=BB=BAno-text=E6=95=B0=E6=8D=AE?=
 =?UTF-8?q?=E9=9B=86?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 build_endoscope_act_dataset.py | 191 ++++++++++++++++++++++-----------
 build_no_text_dataset.sh       |  25 +++++
 build_text_dataset.sh          |  29 +++++
 constants.py                   |   9 ++
 utils.py                       |   2 +-
 5 files changed, 194 insertions(+), 62 deletions(-)
 create mode 100755 build_no_text_dataset.sh
 create mode 100755 build_text_dataset.sh

diff --git a/build_endoscope_act_dataset.py b/build_endoscope_act_dataset.py
index e1c592a..cd511d1 100644
--- a/build_endoscope_act_dataset.py
+++ b/build_endoscope_act_dataset.py
@@ -152,6 +152,16 @@ def parse_args() -> argparse.Namespace:
         action="store_true",
         help="Disable head/tail stationary instruction override",
     )
+    parser.add_argument(
+        "--trim_stationary_edges",
+        action="store_true",
+        help="Trim stationary head/tail segments and keep only the middle moving segment",
+    )
+    parser.add_argument(
+        "--no_text_instruction",
+        action="store_true",
+        help="Do not save instruction/instruction_timestep (and disable text feature encoding)",
+    )
     return parser.parse_args()
 
 
@@ -317,48 +327,21 @@ def override_stationary_edge_instructions(
     motion_threshold: float,
 ) -> Tuple[List[str], int, int]:
     """
-    Override instruction text at head/tail when deviation is below threshold.
-    Start side: use the first frame as reference and expand forward until
-    deviation exceeds threshold.
-    End side: use the last frame as reference and expand backward until
-    deviation exceeds threshold.
+    Override instruction text at head/tail using qpos velocity.
+    Start side: once qpos speed is above threshold for consecutive frames,
+    stop applying stop_instruction from that point onward.
+    End side: similarly scan backward from the end.
     """
     num = len(instructions)
     if num == 0:
         return instructions, 0, 0
 
-    # normalize to comparable scales (0~1)
-    py = _normalize_series(motor_pos_y.astype(np.float32), 8000.0, 18884.0)
-    px = _normalize_series(motor_pos_x.astype(np.float32), 7000.0, 17384.0)
-    c0 = _normalize_series(motor_cmd_0.astype(np.float32), 0.0, 65535.0)
-    c1 = _normalize_series(motor_cmd_1.astype(np.float32), 0.0, 65535.0)
-
-    if num == 1:
-        return [stop_instruction], 1, 1
-
-    # keep argument for backward CLI compatibility
-    _ = motion_window
-
-    start_ref = np.array([py[0], px[0], c0[0], c1[0]], dtype=np.float32)
-    end_ref = np.array([py[-1], px[-1], c0[-1], c1[-1]], dtype=np.float32)
-
-    def deviation_to_ref(i: int, ref: np.ndarray) -> float:
-        cur = np.array([py[i], px[i], c0[i], c1[i]], dtype=np.float32)
-        return float(np.max(np.abs(cur - ref)))
-
-    start_count = 0
-    for i in range(num):
-        if deviation_to_ref(i, start_ref) <= motion_threshold:
-            start_count += 1
-        else:
-            break
-
-    end_count = 0
-    for i in range(num - 1, -1, -1):
-        if deviation_to_ref(i, end_ref) <= motion_threshold:
-            end_count += 1
-        else:
-            break
+    start_count, end_count = detect_stationary_edge_counts_from_qpos(
+        motor_pos_y=motor_pos_y,
+        motor_pos_x=motor_pos_x,
+        motion_window=motion_window,
+        motion_threshold=motion_threshold,
+    )
 
     updated = list(instructions)
     for i in range(start_count):
@@ -369,6 +352,56 @@ def override_stationary_edge_instructions(
     return updated, start_count, end_count
 
 
+def detect_stationary_edge_counts_from_qpos(
+    motor_pos_y: np.ndarray,
+    motor_pos_x: np.ndarray,
+    motion_window: int,
+    motion_threshold: float,
+) -> Tuple[int, int]:
+    """Return stationary frame counts on head and tail using qpos velocity rule."""
+    num = int(len(motor_pos_y))
+    if num == 0:
+        return 0, 0
+    if num == 1:
+        return 1, 1
+
+    py = _normalize_series(motor_pos_y.astype(np.float32), 8000.0, 18884.0)
+    px = _normalize_series(motor_pos_x.astype(np.float32), 7000.0, 17384.0)
+
+    consecutive = max(1, int(motion_window))
+    dt = 1.0 / 30.0
+
+    frame_speed = np.zeros((num,), dtype=np.float32)
+    dy = np.abs(np.diff(py)) / dt
+    dx = np.abs(np.diff(px)) / dt
+    frame_speed[1:] = np.maximum(dy, dx)
+
+    high_run = 0
+    start_count = num
+    for i in range(1, num):
+        if frame_speed[i] > motion_threshold:
+            high_run += 1
+            if high_run >= consecutive:
+                start_count = i - consecutive + 1
+                break
+        else:
+            high_run = 0
+
+    high_run = 0
+    end_count = num
+    for i in range(num - 1, 0, -1):
+        if frame_speed[i] > motion_threshold:
+            high_run += 1
+            if high_run >= consecutive:
+                tail_start = i + consecutive
+                end_count = max(0, num - tail_start)
+                break
+        else:
+            high_run = 0
+
+    return start_count, end_count
+
+
 def find_segment_csv(segment_dir: Path) -> Path:
     csvs = sorted(segment_dir.glob("*.csv"))
     if not csvs:
@@ -441,6 +474,9 @@ def save_episode_plot_with_stop_segments(
 def main() -> None:
     args = parse_args()
 
+    if args.no_text_instruction and args.encode_text_features:
+        raise ValueError('--no_text_instruction and --encode_text_features cannot be used together.')
+
     segment_dir = Path(args.segment_dir).resolve()
     output_dir = Path(args.output_dir).resolve()
     output_dir.mkdir(parents=True, exist_ok=True)
@@ -513,17 +549,48 @@ def main() -> None:
         action[i, 0] = normalize_value(np.array([motor_cmd_0], dtype=np.float32), cmd_min, cmd_max, args.action_norm)[0]
         action[i, 1] = normalize_value(np.array([motor_cmd_1], dtype=np.float32), cmd_min, cmd_max, args.action_norm)[0]
 
-        ins = instruction_from_annotation(
-            ann,
-            crop,
-            args.instruction_template,
-            args.instruction_empty,
-        )
-        instructions.append(ins)
+        if not args.no_text_instruction:
+            ins = instruction_from_annotation(
+                ann,
+                crop,
+                args.instruction_template,
+                args.instruction_empty,
+            )
+            instructions.append(ins)
 
-    start_stop_count = 0
-    end_stop_count = 0
-    if not args.disable_stop_override:
+    start_stop_count, end_stop_count = detect_stationary_edge_counts_from_qpos(
+        motor_pos_y=motor_pos_y_series,
+        motor_pos_x=motor_pos_x_series,
+        motion_window=args.motion_window,
+        motion_threshold=args.motion_threshold,
+    )
+
+    if args.trim_stationary_edges:
+        keep_start = int(start_stop_count)
+        keep_end = int(num - end_stop_count)
+        if keep_end <= keep_start:
+            raise ValueError(
+                f'No moving segment left after trim: start={start_stop_count}, end={end_stop_count}, num={num}. '
+                f'Consider lowering --motion_threshold or --motion_window.'
+            )
+        images = images[keep_start:keep_end]
+        qpos = qpos[keep_start:keep_end]
+        action = action[keep_start:keep_end]
+        motor_pos_y_series = motor_pos_y_series[keep_start:keep_end]
+        motor_pos_x_series = motor_pos_x_series[keep_start:keep_end]
+        motor_cmd_0_series = motor_cmd_0_series[keep_start:keep_end]
+        motor_cmd_1_series = motor_cmd_1_series[keep_start:keep_end]
+        if not args.no_text_instruction:
+            instructions = instructions[keep_start:keep_end]
+
+        print(
+            f'Trim stationary edges: removed head={start_stop_count}, tail={end_stop_count}, '
+            f'kept={keep_end - keep_start}'
+        )
+        # After trimming, full kept segment is the moving region.
+        start_stop_count, end_stop_count = 0, 0
+
+    if (not args.disable_stop_override) and (not args.no_text_instruction):
         instructions, start_stop_count, end_stop_count = override_stationary_edge_instructions(
             instructions=instructions,
             motor_pos_y=motor_pos_y_series,
@@ -564,19 +631,20 @@ def main() -> None:
 
         root.create_dataset("action", data=action, dtype=np.float32)
 
-        str_dtype = h5py.string_dtype(encoding="utf-8")
-        root.create_dataset(
-            "instruction_timestep",
-            shape=(num,),
-            dtype=str_dtype,
-            data=np.asarray(instructions, dtype=object),
-        )
-        root.create_dataset(
-            "instruction",
-            shape=(),
-            dtype=str_dtype,
-            data=instructions[0] if len(instructions) > 0 else "",
-        )
+        if not args.no_text_instruction:
+            str_dtype = h5py.string_dtype(encoding="utf-8")
+            root.create_dataset(
+                "instruction_timestep",
+                shape=(len(instructions),),
+                dtype=str_dtype,
+                data=np.asarray(instructions, dtype=object),
+            )
+            root.create_dataset(
+                "instruction",
+                shape=(),
+                dtype=str_dtype,
+                data=instructions[0] if len(instructions) > 0 else "",
+            )
 
         if text_features is not None:
             root.create_dataset("instruction_features_timestep", data=text_features, dtype=np.float32)
@@ -589,7 +657,8 @@ def main() -> None:
     if not args.disable_stop_override:
         print(
             f"stationary override: head={start_stop_count}, tail={end_stop_count}, "
-            f"mode=endpoint_reference, threshold={args.motion_threshold}, "
+            f"mode=qpos_velocity_consecutive, consecutive={args.motion_window}, "
+            f"threshold={args.motion_threshold}, "
             f"instruction='{args.stop_instruction}'"
         )
     if text_features is not None:
diff --git a/build_no_text_dataset.sh b/build_no_text_dataset.sh
new file mode 100755
index 0000000..0990d11
--- /dev/null
+++ b/build_no_text_dataset.sh
@@ -0,0 +1,25 @@
+SEG_ROOT="/home/cyx6123/DuodenoVLA/data/ACT/aloha/act/data/raw_data/00-follow"
+OUT_DIR="/home/cyx6123/DuodenoVLA/data/ACT/aloha/act/data/follow-no-text"
+SCRIPT="/home/cyx6123/DuodenoVLA/data/ACT/aloha/act/build_endoscope_act_dataset.py"
+
+mkdir -p "$OUT_DIR"
+i=50
+for d in "$SEG_ROOT"/follow_seg_*; do
+  [ -d "$d" ] || continue
+  echo "Building $d -> episode_$i"
+  python "$SCRIPT" \
+    --segment_dir "$d" \
+    --output_dir "$OUT_DIR" \
+    --episode_idx "$i" \
+    --max_frames -1 \
+    --camera_name top \
+    --crop 733 30 1754 1051 \
+    --resize 224 224 \
+    --motion_window 3 \
+    --motion_threshold 0.05 \
+    --state_norm minus1_1 \
+    --action_norm minus1_1 \
+    --trim_stationary_edges \
+    --no_text_instruction
+  i=$((i+1))
+done
\ No newline at end of file
diff --git a/build_text_dataset.sh b/build_text_dataset.sh
new file mode 100755
index 0000000..343b591
--- /dev/null
+++ b/build_text_dataset.sh
@@ -0,0 +1,29 @@
+SEG_ROOT="/home/cyx6123/DuodenoVLA/data/ACT/aloha/act/data/raw_data/01-cannulation"
+OUT_DIR="/home/cyx6123/DuodenoVLA/data/ACT/aloha/act/data/cannulation"
+SCRIPT="/home/cyx6123/DuodenoVLA/data/ACT/aloha/act/build_endoscope_act_dataset.py"
+
+mkdir -p "$OUT_DIR"
+i=12
+for d in "$SEG_ROOT"/seg_*; do
+  [ -d "$d" ] || continue
+  echo "Building $d -> episode_$i"
+  python "$SCRIPT" \
+    --segment_dir "$d" \
+    --output_dir "$OUT_DIR" \
+    --episode_idx "$i" \
+    --max_frames -1 \
+    --camera_name top \
+    --crop 733 30 1754 1051 \
+    --resize 224 224 \
+    --instruction_template 'Cannulate the {label} on the phantom located at the {region} with the sphincterotome.' \
+    --instruction_empty 'No target visible.' \
+    --stop_instruction 'Stop move.' \
+    --motion_window 3 \
+    --motion_threshold 0.05 \
+    --state_norm minus1_1 \
+    --action_norm minus1_1 \
+    --encode_text_features \
+    --text_model_name distilbert-base-uncased \
+    --text_batch_size 32
+  i=$((i+1))
+done
\ No newline at end of file
diff --git a/constants.py b/constants.py
index 33debc8..463acad 100644
--- a/constants.py
+++ b/constants.py
@@ -67,6 +67,15 @@ ENDOSCOPE_TASK_CONFIGS = {
         'text_max_length': 32,
         'text_tokenizer_name': 'distilbert-base-uncased',
     },
+    'endoscope_both_no_text': {
+        'dataset_dir': DATA_DIR + '/both-no-text',
+        'num_episodes': 3,
+        'episode_len': 400,
+        'camera_names': ['top'],
+        'state_dim': 2,
+        'action_dim': 2,
+        'use_text_instruction': False,
+    },
 }
 
 ### Simulation envs fixed constants
diff --git a/utils.py b/utils.py
index e2ea612..6d2054c 100644
--- a/utils.py
+++ b/utils.py
@@ -344,7 +344,7 @@ def load_data(dataset_dir, num_episodes, camera_names, batch_size_train, batch_s
         raise ValueError(f'Need at least 2 episodes for train/val split, found {len(episode_ids)} in {dataset_dir}')
 
     # obtain train test split
-    train_ratio = 0.8
+    train_ratio = 0.9
     shuffled_indices = np.random.permutation(len(episode_ids))
     train_count = int(train_ratio * len(episode_ids))
     train_indices = shuffled_indices[:train_count]