From 14839e97d976f62bb38226e3e68e91dd1a68b3bc Mon Sep 17 00:00:00 2001 From: GuillaumePELLUET Date: Wed, 24 Jun 2026 17:50:11 +0200 Subject: [PATCH 1/5] Fix dataframe issue with index --- weightslab/utils/tools.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/weightslab/utils/tools.py b/weightslab/utils/tools.py index 4e8fa025..b2442f95 100644 --- a/weightslab/utils/tools.py +++ b/weightslab/utils/tools.py @@ -34,10 +34,12 @@ def safe_reset_index(df: "pd.DataFrame") -> "pd.DataFrame": """ import pandas as _pd if not isinstance(df, _pd.DataFrame) or not isinstance(df.index, _pd.MultiIndex): - # Single-level index: only reset if the name isn't already a column. - if df.index.name and df.index.name in df.columns: - return df - return df.reset_index() + # Single-level index: only promote if the name is meaningful and missing. + # Unnamed index (None) has nothing to promote; drop it to avoid inserting + # a spurious 'index' / 'level_0' column when one already exists. + if df.index.name and df.index.name not in df.columns: + return df.reset_index() + return df.reset_index(drop=True) missing = [n for n in df.index.names if n and n not in df.columns] if not missing: # All levels already present as columns — nothing to promote. From 58e63eb345c99e96fc9e8e2793bd64cf323ad5c7 Mon Sep 17 00:00:00 2001 From: GuillaumePELLUET Date: Wed, 24 Jun 2026 18:14:38 +0200 Subject: [PATCH 2/5] feat: add __discard_by_tag__ handler in EditDataSample MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New special stat_name that discards all samples carrying a given tag across the whole dataset without requiring the frontend to enumerate sample IDs. Uses the existing upsert_df path — same as per-sample discard — grouped by origin for correctness. Co-Authored-By: Claude Sonnet 4.6 --- weightslab/trainer/services/data_service.py | 56 ++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/weightslab/trainer/services/data_service.py b/weightslab/trainer/services/data_service.py index b12495c3..5b518ee6 100755 --- a/weightslab/trainer/services/data_service.py +++ b/weightslab/trainer/services/data_service.py @@ -3765,10 +3765,64 @@ def EditDataSample(self, request, context): message=f"Failed to delete metadata column: {str(e)}", ) + if request.stat_name == "__discard_by_tag__": + tag_name = str(request.string_value or "").strip() + if not tag_name: + return pb2.DataEditsResponse( + success=False, + message="Missing tag name for discard-by-tag operation.", + ) + + with self._watched_lock("_lock[EditDataSample/__discard_by_tag__]"): + try: + self._slowUpdateInternals() + if self._all_datasets_df is None or self._all_datasets_df.empty: + return pb2.DataEditsResponse( + success=False, + message="No dataframe available.", + ) + + tag_col = f"{SampleStatsEx.TAG.value}:{tag_name}" + df = safe_reset_index(self._all_datasets_df) + if tag_col not in df.columns: + return pb2.DataEditsResponse( + success=True, + message=f"No samples found with tag '{tag_name}'.", + ) + + tagged = df[df[tag_col] == 1] + if tagged.empty: + return pb2.DataEditsResponse( + success=True, + message=f"No samples found with tag '{tag_name}'.", + ) + + for origin, origin_df in tagged.groupby(SampleStatsEx.ORIGIN.value, sort=False): + sample_ids = origin_df.index.astype(str).tolist() + rows = [ + {"sample_id": sid, SampleStatsEx.ORIGIN.value: origin, SampleStatsEx.DISCARDED.value: True} + for sid in sample_ids + ] + df_update = pd.DataFrame(rows).set_index("sample_id") + self._df_manager.upsert_df(df_update, origin=origin, force_flush=True) + + count = len(tagged) + self._slowUpdateInternals(force=True) + return pb2.DataEditsResponse( + success=True, + message=f"Discarded {count} samples with tag '{tag_name}'.", + ) + except Exception as e: + logger.error(f"[EditDataSample] discard_by_tag failed: {e}", exc_info=True) + return pb2.DataEditsResponse( + success=False, + message=f"Failed to discard by tag: {str(e)}", + ) + if not request.stat_name or not request.stat_name.startswith(SampleStatsEx.TAG.value) and request.stat_name not in [SampleStatsEx.DISCARDED.value]: return pb2.DataEditsResponse( success=False, - message="Only 'tags', 'discarded', '__copy_metadata__', '__delete_metadata__', '__save_data_state__', and '__force_h5_write_and_save__' edits are supported.", + message="Only 'tags', 'discarded', '__copy_metadata__', '__delete_metadata__', '__save_data_state__', '__force_h5_write_and_save__', and '__discard_by_tag__' edits are supported.", ) # ===================================================================== From ff42584bfdd603310ad39fd90a44091c59147600 Mon Sep 17 00:00:00 2001 From: GuillaumePELLUET Date: Thu, 25 Jun 2026 12:52:51 +0200 Subject: [PATCH 3/5] fix: metadata clone naming (@hash format) + delete no longer blocks gRPC thread MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - build_metadata_copy_column_names: first clone of original field uses {source}@{hash} (no _{n}); conflicts get _{n} suffix; cloning an already-cloned field reuses the same @hash, only increments _{n} - is_copy_metadata_column_name: updated regex to match new .+@.+ format - __delete_metadata__ handler: replace blocking _slowUpdateInternals(force=True) with non-blocking background kick — view is already up-to-date from manual drop, no need to stall the gRPC response for 5-10s Co-Authored-By: Claude Sonnet 4.6 --- weightslab/trainer/services/data_service.py | 49 ++++++++++++++++----- 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/weightslab/trainer/services/data_service.py b/weightslab/trainer/services/data_service.py index 5b518ee6..c2f97d2b 100755 --- a/weightslab/trainer/services/data_service.py +++ b/weightslab/trainer/services/data_service.py @@ -72,19 +72,39 @@ def normalize_metadata_copy_source_name(source_name: str, experiment_hash: str = return name -def build_metadata_copy_column_names(existing_columns, experiment_hash: str, source_name: str): - """Build backend/ui copied metadata names with incrementing suffix _1, _2, ...""" - exp_hash = str(experiment_hash or "current_experiment_hash").strip() or "current_experiment_hash" - normalized_source = normalize_metadata_copy_source_name(source_name, exp_hash) +def build_metadata_copy_column_names(existing_columns, experiment_hash: str, source_name: str) -> str: + """Build backend/ui copied metadata column names. + + Naming rules: + - Cloning an original field (no '@'): first copy is ``{source}@{hash}``; + subsequent copies of the same source get ``{source}@{hash}_1``, ``_2``, ... + - Cloning an already-cloned field (contains '@'): strip any trailing ``_N`` + suffix to find the base name, then append ``_1``, ``_2``, ... — no new hash. + """ existing_iterable = [] if existing_columns is None else existing_columns existing = {str(col) for col in existing_iterable} - index = 1 - while True: - copy_name = f"{normalized_source}_{index}@{exp_hash}" - if copy_name not in existing: - return copy_name - index += 1 + if "@" in source_name: + # Cloning a clone: reuse the same @hash suffix, just increment _{n} + base = re.sub(r"_\d+$", "", source_name) + n = 1 + while True: + candidate = f"{base}_{n}" + if candidate not in existing: + return candidate + n += 1 + else: + # Cloning an original field: first copy has no index, then _1, _2, ... + exp_hash = str(experiment_hash or "current_experiment_hash").strip() or "current_experiment_hash" + base = f"{source_name}@{exp_hash}" + if base not in existing: + return base + n = 1 + while True: + candidate = f"{base}_{n}" + if candidate not in existing: + return candidate + n += 1 def duplicate_metadata_column_in_dataframe(df: pd.DataFrame, source_column: str, experiment_hash: str): @@ -99,7 +119,9 @@ def duplicate_metadata_column_in_dataframe(df: pd.DataFrame, source_column: str, def is_copy_metadata_column_name(column_name: str) -> bool: name = str(column_name or "").strip() - return bool(re.match(r".+_\d+@.+$", name)) + # Matches any column that was produced by a clone operation: contains '@' + # with at least one character on each side (e.g. "loss@abc123" or "loss@abc123_2"). + return bool(re.match(r".+@.+", name)) def detect_bbox_format(bboxes: np.ndarray) -> str: @@ -3752,7 +3774,10 @@ def EditDataSample(self, request, context): if target_column in self._all_datasets_df.columns: self._all_datasets_df = self._all_datasets_df.drop(columns=[target_column]) - self._slowUpdateInternals(force=True) + # Kick a background view-refresh (non-blocking) — the in-memory view + # is already consistent after the drop above, so blocking inline rebuild + # is unnecessary and causes the gRPC response to stall for 5-10 s. + self._slowUpdateInternals() return pb2.DataEditsResponse( success=True, From f198af2e11bb610de39a68631e6e8fe6dfb1ea3d Mon Sep 17 00:00:00 2001 From: GuillaumePELLUET Date: Thu, 25 Jun 2026 14:06:44 +0200 Subject: [PATCH 4/5] fix: always serialize/deserialize prediction and target as lists in H5 Write path: serialize_value now handles numpy arrays of ALL ndim (not just <= 1). A 2D array was falling through to str() which produced numpy repr strings (space-separated, no commas) that aren't valid JSON. Read path: deserialize_value for MODEL_INOUT_LIST columns now: - maps "nan"/"none"/"" strings to np.nan instead of returning them as-is - removes the single-element unwrap (a [0.5] list stays [0.5], not 0.5) - adds a fallback regex pass to recover old data stored as numpy repr Co-Authored-By: Claude Sonnet 4.6 --- weightslab/data/h5_dataframe_store.py | 37 +++++++++++++++++---------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/weightslab/data/h5_dataframe_store.py b/weightslab/data/h5_dataframe_store.py index e2271995..def47790 100644 --- a/weightslab/data/h5_dataframe_store.py +++ b/weightslab/data/h5_dataframe_store.py @@ -1,4 +1,5 @@ import os +import re import json import time import logging @@ -344,10 +345,8 @@ def serialize_value(val): if not isinstance(val, (list, set, np.ndarray)) and pd.isna(val): return np.nan - if isinstance(val, np.ndarray) and val.ndim <= 1: - if val.ndim == 0: - val = val.reshape(-1) - val = val.tolist() + if isinstance(val, np.ndarray): + val = val.item() if val.ndim == 0 else val.tolist() if isinstance(val, (list, dict)): try: @@ -407,18 +406,30 @@ def _normalize_for_read(self, df: pd.DataFrame, origin: str) -> pd.DataFrame: # Handle deserialization of nested objects (lists, dicts) stored as JSON strings cols_to_deserialize = [col for col in SampleStats.MODEL_INOUT_LIST if col in df.columns] if cols_to_deserialize: + _MISSING = {"nan", "none", "", ""} + def deserialize_value(val): - if not isinstance(val, str) or not (val.startswith('[') or val.startswith('{')): + if not isinstance(val, str): return val - try: - obj = json.loads(val) - except Exception: + stripped = val.strip() + if stripped.lower() in _MISSING: + return np.nan + if not (stripped.startswith('[') or stripped.startswith('{')): return val - - # Unwrap single-element lists to scalars for consistency with active training data - if isinstance(obj, list) and len(obj) == 1: - return obj[0] - return obj + try: + return json.loads(stripped) + except json.JSONDecodeError: + # Fallback: numpy repr uses spaces as delimiters without commas. + # E.g. "[0.1 0.2]" or "[[0.1 0.2]\n [0.3 0.4]]" + try: + normalized = re.sub( + r'(?<=[0-9.])\s+(?=[-0-9.\[])', + ', ', + stripped.replace('\n', ' '), + ) + return json.loads(normalized) + except Exception: + return val for col in cols_to_deserialize: df[col] = df[col].apply(deserialize_value) From 70d11e1dfc5c73288fe73b28ed053ce4e713da4c Mon Sep 17 00:00:00 2001 From: GuillaumePELLUET Date: Thu, 25 Jun 2026 16:39:41 +0200 Subject: [PATCH 5/5] Fix ultralytics BB rendering on UI side and weightslab examples --- weightslab/data/data_utils.py | 8 +- weightslab/data/dataframe_manager.py | 2 +- .../examples/PyTorch/ws-detection/main.py | 56 ++++++------ .../PyTorch/ws-segmentation/config.yaml | 3 +- .../examples/PyTorch/ws-segmentation/main.py | 89 +++++++++++-------- .../PyTorch/ws-segmentation/utils/data.py | 27 ++++-- .../Ultralytics/ws-detection/config.yaml | 4 +- .../ws-2d-lidar-detection/config.yaml | 1 + weightslab/src.py | 13 --- weightslab/trainer/services/data_service.py | 6 +- 10 files changed, 114 insertions(+), 95 deletions(-) diff --git a/weightslab/data/data_utils.py b/weightslab/data/data_utils.py index 9ce19864..2d65ba52 100644 --- a/weightslab/data/data_utils.py +++ b/weightslab/data/data_utils.py @@ -5,6 +5,8 @@ from PIL import Image +from weightslab.data.array_proxy import ArrayH5Proxy + __all__ = [ "_detect_dataset_split", @@ -21,9 +23,6 @@ logger = logging.getLogger(__name__) - - - def _to_uint8_image(img_float: np.ndarray) -> np.ndarray: """ Convert float image to uint8 for visualization. @@ -222,6 +221,9 @@ def _downsample_nn(arr: np.ndarray, max_hw: int = 96) -> np.ndarray: def to_numpy_safe(x): + if isinstance(x, ArrayH5Proxy): + return np.asanyarray(x) + if isinstance(x, (int, float)): return np.array([x]) diff --git a/weightslab/data/dataframe_manager.py b/weightslab/data/dataframe_manager.py index 07644190..21e3fd91 100644 --- a/weightslab/data/dataframe_manager.py +++ b/weightslab/data/dataframe_manager.py @@ -683,7 +683,7 @@ def upsert_df(self, df_local: List | pd.DataFrame, origin: str = None, force_flu # "Cannot setitem on a Categorical with a new category". The # _optimize_dataframe_memory pass below re-applies categorical dtypes. for col in all_cols: - if col in self._df.columns and isinstance(self._df[col].dtype, pd.CategoricalDtype): + if col in self._df.columns and col != SampleStatsEx.ORIGIN and isinstance(self._df[col].dtype, pd.CategoricalDtype): self._df[col] = self._df[col].astype(object) self._df.loc[existing_idx, all_cols] = df_norm.loc[existing_idx, all_cols] diff --git a/weightslab/examples/PyTorch/ws-detection/main.py b/weightslab/examples/PyTorch/ws-detection/main.py index 7da1d950..0f6d9320 100644 --- a/weightslab/examples/PyTorch/ws-detection/main.py +++ b/weightslab/examples/PyTorch/ws-detection/main.py @@ -49,7 +49,7 @@ def train(loader, model, optimizer, sig, device, grid_size, conf_thresh): targets = [t.to(device) for t in targets] optimizer.zero_grad() - outputs = model(inputs) # [B, S, S, 5 + num_classes] + outputs = model(inputs) # [B, S, S, 5 + num_classes] # Decoded boxes for the UI overlay (detached — display only). preds = decode_predictions(outputs.detach(), grid_size, conf_thresh=conf_thresh) @@ -90,7 +90,7 @@ def test(loader, model, sig, device, grid_size, conf_thresh, test_loader_len): loss = float((losses / test_loader_len).detach().cpu().item()) iou = float((ious / test_loader_len).detach().cpu().item()) - return loss, iou * 100.0 # Return mean IoU as percentage + return loss, iou * 100.0 # Return mean IoU as percentage # ============================================================================= @@ -111,7 +111,7 @@ def test(loader, model, sig, device, grid_size, conf_thresh, test_loader_len): parameters.setdefault("training_steps_to_do", 500) parameters.setdefault("eval_full_to_train_steps_ratio", 50) parameters.setdefault("number_of_workers", 4) - parameters.setdefault("num_classes", 1) # Penn-Fudan: single class (person) + parameters.setdefault("num_classes", 1) # Penn-Fudan: single class (person) parameters.setdefault("image_size", 256) parameters.setdefault("grid_size", 8) parameters.setdefault("conf_thresh", 0.3) @@ -119,40 +119,40 @@ def test(loader, model, sig, device, grid_size, conf_thresh, test_loader_len): parameters.setdefault("freeze_backbone", True) parameters.setdefault("compute_natural_sort", True) + # --- 2) Register hyperparameters --- exp_name = parameters["experiment_name"] + wl.watch_or_edit( + parameters, + flag="hyperparameters", + name=exp_name, + defaults=parameters, + poll_interval=1.0, + ) + num_classes = int(parameters["num_classes"]) image_size = int(parameters["image_size"]) grid_size = int(parameters["grid_size"]) conf_thresh = float(parameters["conf_thresh"]) - # --- 2) Device selection --- + # --- 3) Device selection --- if parameters.get("device", "auto") == "auto": parameters["device"] = torch.device( "cuda" if torch.cuda.is_available() else "cpu" ) device = parameters["device"] - # --- 3) Logging directory --- + # --- 4) Logging directory --- if not parameters.get("root_log_dir"): tmp_dir = tempfile.mkdtemp() parameters["root_log_dir"] = tmp_dir print(f"No root_log_dir specified, using temporary directory: {parameters['root_log_dir']}") os.makedirs(parameters["root_log_dir"], exist_ok=True) - log_dir = parameters["root_log_dir"] max_steps = parameters["training_steps_to_do"] eval_full_to_train_steps_ratio = parameters["eval_full_to_train_steps_ratio"] verbose = parameters.get("verbose", True) tqdm_display = parameters.get("tqdm_display", True) - # --- 4) Register hyperparameters --- - wl.watch_or_edit( - parameters, - flag="hyperparameters", - name=exp_name, - defaults=parameters, - poll_interval=1.0, - ) # --- 5) Data (Penn-Fudan pedestrians, downloaded on first run) --- default_data_root = os.path.abspath( @@ -255,7 +255,7 @@ def compute_class_weights(dataset, num_classes, max_samples=200): class_counts = np.zeros(num_classes, dtype=np.float64) num_samples = min(len(dataset), max_samples) - for idx in tqdm.tqdm(range(num_samples), desc="📊 Analyzing Distribution"): + for idx in tqdm.tqdm(range(num_samples), desc=" Analyzing Distribution"): _, _, target, _ = dataset.get_items(idx, include_labels=True) if target is None or len(target) == 0: continue @@ -263,10 +263,10 @@ def compute_class_weights(dataset, num_classes, max_samples=200): if 0 <= c < num_classes: class_counts[c] += 1 - class_counts = np.maximum(class_counts, 1) # Avoid div by zero + class_counts = np.maximum(class_counts, 1) # Avoid div by zero total = class_counts.sum() class_weights = total / (num_classes * class_counts) - class_weights = class_weights / class_weights.mean() # Normalize + class_weights = class_weights / class_weights.mean() # Normalize print("\nClass distribution and weights:", flush=True) for c in range(num_classes): @@ -287,16 +287,16 @@ def compute_class_weights(dataset, num_classes, max_samples=200): ) print("=" * 60) - print("🚀 STARTING PENN-FUDAN PEDESTRIAN DETECTION TRAINING") - print(f"📈 Total steps: {max_steps}") - print(f"🔄 Evaluation every {eval_full_to_train_steps_ratio} steps") - print(f"💾 Logs will be saved to: {log_dir}") - print(f"📂 Data root: {data_root}") + print(" STARTING PENN-FUDAN PEDESTRIAN DETECTION TRAINING") + print(f" Total steps: {max_steps}") + print(f" Evaluation every {eval_full_to_train_steps_ratio} steps") + print(f" Logs will be saved to: {log_dir}") + print(f" Data root: {data_root}") print("=" * 60 + "\n") - # ================ - # Training Loop - wl.start_training(timeout=3) # Blocks and keeps the main thread alive while background services run. Optionally set a timeout (seconds) to auto-stop. + # # ================ + # # Training Loop + # wl.start_training(timeout=3) # Blocks and keeps the main thread alive while background services run. Optionally set a timeout (seconds) to auto-stop. # ================ train_range = tqdm.tqdm(itertools.count(), desc="Training") if tqdm_display else itertools.count() @@ -310,7 +310,7 @@ def compute_class_weights(dataset, num_classes, max_samples=200): # Test if age == 0 or age % eval_full_to_train_steps_ratio == 0: - test_loader_len = len(test_loader) # Store length before wrapping with tqdm + test_loader_len = len(test_loader) # Store length before wrapping with tqdm test_loader_it = tqdm.tqdm(test_loader, desc="Evaluating") if tqdm_display else test_loader test_loss, test_metric = test(test_loader_it, model, test_sig, device, grid_size, conf_thresh, test_loader_len) @@ -332,8 +332,8 @@ def compute_class_weights(dataset, num_classes, max_samples=200): ) print("\n" + "=" * 60) - print(f"✅ Training completed in {time.time() - start_time:.2f} seconds") - print(f"💾 Logs saved to: {log_dir}") + print(f" Training completed in {time.time() - start_time:.2f} seconds") + print(f" Logs saved to: {log_dir}") print("=" * 60) # Keep the main thread alive to allow background serving threads to run diff --git a/weightslab/examples/PyTorch/ws-segmentation/config.yaml b/weightslab/examples/PyTorch/ws-segmentation/config.yaml index 5054332b..3fd60d3b 100644 --- a/weightslab/examples/PyTorch/ws-segmentation/config.yaml +++ b/weightslab/examples/PyTorch/ws-segmentation/config.yaml @@ -28,7 +28,8 @@ ledger_flush_interval: 60.0 # Data num_classes: 6 -image_size: 180 +class_names: ["Background", "Ego Road", "Driveable Area", "Lane Line 1", "Lane Line 2", "Lane Line 3"] +image_size: 128 data_root: .\BDD_subset # Bdd format data: train_loader: diff --git a/weightslab/examples/PyTorch/ws-segmentation/main.py b/weightslab/examples/PyTorch/ws-segmentation/main.py index f3a94d47..0df3795a 100644 --- a/weightslab/examples/PyTorch/ws-segmentation/main.py +++ b/weightslab/examples/PyTorch/ws-segmentation/main.py @@ -45,13 +45,13 @@ def _instance_batch_idx(labels): def _run_instance_signals(sig, outputs, labels, ids, preds, return_metric=False): """Compute + log/save the per-sample AND per-instance Dice (metric) and BCE (loss).""" bce_sample = sig["bce_sample"](outputs, labels, batch_ids=ids, preds=preds) - dice_sample = sig["dice_sample"](outputs, labels, batch_ids=ids) # Register processed predictions one time only + dice_sample = sig["dice_sample"](outputs, labels, batch_ids=ids) # Register processed predictions one time only - sig["dice_instance"](outputs, labels, batch_ids=ids) # Register processed predictions one time only + sig["dice_instance"](outputs, labels, batch_ids=ids) # Register processed predictions one time only sig["bce_instance"](outputs, labels, batch_ids=ids) avg_loss = 0.5 * dice_sample + 0.5 * bce_sample - wl.save_signals({"combined_bce_dice_per_sample": avg_loss}, ids) # Save the per-sample aggregate loss for backward step + wl.save_signals({"combined_bce_dice_per_sample": avg_loss}, ids) # Save the per-sample aggregate loss for backward step if return_metric: return avg_loss, dice_sample return avg_loss @@ -91,11 +91,11 @@ def train(loader, model, optimizer, sig, device): with guard_training_context: (inputs, ids, labels, _) = next(loader) inputs = inputs.to(device) - labels = [[m.to(device) for m in insts] for insts in labels] # per-sample list of instances + labels = [[m.to(device) for m in insts] for insts in labels] # per-sample list of instances optimizer.zero_grad() - outputs = model(inputs) # [B,C,H,W] - preds = outputs.argmax(dim=1) # [B,H,W] + outputs = model(inputs) # [B,C,H,W] + preds = outputs.argmax(dim=1) # [B,H,W] # Per-instance + per-sample Dice/BCE (tracked & saved at annotation level). loss_per_sample = _run_instance_signals(sig, outputs, labels, ids, preds=preds) @@ -110,7 +110,7 @@ def train(loader, model, optimizer, sig, device): wl.save_signals( _user_custom_signals(preds, labels), ids - ) # Save the per-sample predictions for visualization + ) # Save the per-sample predictions for visualization return float(loss.detach().cpu().item()) @@ -122,23 +122,23 @@ def test(loader, model, sig, device, test_loader_len): with guard_testing_context, torch.no_grad(): for inputs, ids, labels, _ in loader: inputs = inputs.to(device) - labels = [[m.to(device) for m in insts] for insts in labels] # per-sample list of instances + labels = [[m.to(device) for m in insts] for insts in labels] # per-sample list of instances outputs = model(inputs) - preds = outputs.argmax(dim=1) # [B,H,W] + preds = outputs.argmax(dim=1) # [B,H,W] # Per-instance + per-sample Dice/BCE (tracked & saved at annotation level). loss_per_sample, dice_sample = _run_instance_signals(sig, outputs, labels, ids, preds=preds, return_metric=True) - losses += torch.mean(loss_per_sample) # Average over the batch and accumulate - dices += torch.mean(dice_sample) # Average over the batch and accumulate + losses += torch.mean(loss_per_sample) # Average over the batch and accumulate + dices += torch.mean(dice_sample) # Average over the batch and accumulate # I want to see in the UI the per-sample classes predicted by the model - wl.save_signals(_user_custom_signals(preds, labels), ids) # Save the per-sample predictions for visualization + wl.save_signals(_user_custom_signals(preds, labels), ids) # Save the per-sample predictions for visualization loss = float((losses / test_loader_len).detach().cpu().item()) dice = float((dices / test_loader_len).detach().cpu().item()) - return loss, dice * 100.0 # Return average Dice as percentage + return loss, dice * 100.0 # Return average Dice as percentage # ============================================================================= @@ -159,13 +159,23 @@ def test(loader, model, sig, device, test_loader_len): parameters.setdefault("training_steps_to_do", 500) parameters.setdefault("eval_full_to_train_steps_ratio", 50) parameters.setdefault("number_of_workers", 4) - parameters.setdefault("num_classes", 6) # adjust to your label set - parameters.setdefault("ignore_index", 255) # if you have void pixels + parameters.setdefault("num_classes", 6) # adjust to your label set + parameters.setdefault("class_names", None) # adjust to your label set + parameters.setdefault("ignore_index", 255) # if you have void pixels parameters.setdefault("image_size", 256) parameters.setdefault("compute_natural_sort", True) + # --- 4) Register hyperparameters --- exp_name = parameters["experiment_name"] + wl.watch_or_edit( + parameters, + flag="hyperparameters", + name=exp_name, + defaults=parameters, + poll_interval=1.0, + ) num_classes = int(parameters["num_classes"]) + class_names = parameters["class_names"] ignore_index = int(parameters["ignore_index"]) image_size = int(parameters["image_size"]) @@ -186,18 +196,10 @@ def test(loader, model, sig, device, test_loader_len): log_dir = parameters["root_log_dir"] max_steps = parameters["training_steps_to_do"] eval_full_to_train_steps_ratio = parameters["eval_full_to_train_steps_ratio"] + write_export_ratio = parameters.get("write_export_ratio", 100) verbose = parameters.get("verbose", True) tqdm_display = parameters.get("tqdm_display", True) - # --- 4) Register hyperparameters --- - wl.watch_or_edit( - parameters, - flag="hyperparameters", - name=exp_name, - defaults=parameters, - poll_interval=1.0, - ) - # --- 5) Data (BDD100k reduced) --- default_data_root = os.path.abspath( os.path.join(os.path.dirname(__file__), "..", "..", "data", "BDD100k_reduced") @@ -211,17 +213,19 @@ def test(loader, model, sig, device, test_loader_len): root=data_root, split="train", num_classes=num_classes, + class_names=class_names, ignore_index=ignore_index, image_size=image_size, - max_samples=train_cfg.get("max_samples", None) # Optionally limit number of samples for faster testing + max_samples=train_cfg.get("max_samples", None) # Optionally limit number of samples for faster testing ) _val_dataset = BDD100kSegDataset( root=data_root, split="val", num_classes=num_classes, + class_names=class_names, ignore_index=ignore_index, image_size=image_size, - max_samples=test_cfg.get("max_samples", None) # Optionally limit number of samples for faster testing + max_samples=test_cfg.get("max_samples", None) # Optionally limit number of samples for faster testing ) train_loader = wl.watch_or_edit( @@ -300,8 +304,8 @@ def compute_class_weights(dataset, num_classes, ignore_index=255, max_samples=10 class_counts = np.zeros(num_classes, dtype=np.float64) num_samples = min(len(dataset), max_samples) - for idx in tqdm.tqdm(range(num_samples), desc="📊 Analyzing Distribution"): - _, _, label, _ = dataset.get_items(idx, include_labels=True) # Get the label/mask for this sample + for idx in tqdm.tqdm(range(num_samples), desc=" Analyzing Distribution"): + _, _, label, _ = dataset.get_items(idx, include_labels=True) # Get the label/mask for this sample label_np = label.numpy() if hasattr(label, 'numpy') else np.array(label) for c in range(num_classes): class_counts[c] += (label_np == c).sum() @@ -330,33 +334,38 @@ def compute_class_weights(dataset, num_classes, ignore_index=255, max_samples=10 ) print("=" * 60) - print("🚀 STARTING BDD100k SEGMENTATION TRAINING") - print(f"📈 Total steps: {max_steps}") - print(f"🔄 Evaluation every {eval_full_to_train_steps_ratio} steps") - print(f"💾 Logs will be saved to: {log_dir}") - print(f"📂 Data root: {data_root}") + print(" STARTING BDD100k SEGMENTATION TRAINING") + print(f" Total steps: {max_steps}") + print(f" Evaluation every {eval_full_to_train_steps_ratio} steps") + print(f" Logs will be saved to: {log_dir}") + print(f" Data root: {data_root}") print("=" * 60 + "\n") # ================ # Training Loop - wl.start_training(timeout=3) # This will block and keep the main thread alive while background services run. You can optionally set a timeout (in seconds) to automatically stop after a certain duration. + # wl.start_training(timeout=3) # This will block and keep the main thread alive while background services run. You can optionally set a timeout (in seconds) to automatically stop after a certain duration. # ================ train_range = tqdm.tqdm(itertools.count(), desc="Training") if tqdm_display else itertools.count() test_loss, test_metric = None, None start_time = time.time() for train_step in train_range: - age = model.get_age() if hasattr(model, "get_age") else train_step # Get model age in steps (not necessarily equal to train_step if model was reloaded or has seen more data than training steps) + age = model.get_age() if hasattr(model, "get_age") else train_step # Get model age in steps (not necessarily equal to train_step if model was reloaded or has seen more data than training steps) # Train train_loss = train(train_loader, model, optimizer, train_sig, device) # Test if age == 0 or age % eval_full_to_train_steps_ratio == 0: - test_loader_len = len(test_loader) # Store length before wrapping with tqdm + test_loader_len = len(test_loader) # Store length before wrapping with tqdm test_loader_it = tqdm.tqdm(test_loader, desc="Evaluating") if tqdm_display else test_loader test_loss, test_metric = test(test_loader_it, model, test_sig, device, test_loader_len) + # Periodic history + dataframe export (JSON/CSV snapshots to root_log_dir) + if age > 0 and age % write_export_ratio == 0: + wl.write_history() + wl.write_dataframe() + # Verbose if verbose and not tqdm_display: print( @@ -375,9 +384,13 @@ def compute_class_weights(dataset, num_classes, ignore_index=255, max_samples=10 ) print("\n" + "=" * 60) - print(f"✅ Training completed in {time.time() - start_time:.2f} seconds") - print(f"💾 Logs saved to: {log_dir}") + print(f" Training completed in {time.time() - start_time:.2f} seconds") + print(f" Logs saved to: {log_dir}") print("=" * 60) + # Final export of signal history and data grid to root_log_dir + wl.write_history() + wl.write_dataframe() + # Keep the main thread alive to allow background serving threads to run wl.keep_serving() diff --git a/weightslab/examples/PyTorch/ws-segmentation/utils/data.py b/weightslab/examples/PyTorch/ws-segmentation/utils/data.py index 864c6853..fcc14cc5 100644 --- a/weightslab/examples/PyTorch/ws-segmentation/utils/data.py +++ b/weightslab/examples/PyTorch/ws-segmentation/utils/data.py @@ -32,6 +32,7 @@ def __init__( root, split="train", num_classes=6, + class_names=None, ignore_index=255, image_size=256, max_samples=None @@ -40,6 +41,7 @@ def __init__( self.root = root self.split = split self.num_classes = num_classes + self.class_names = class_names self.ignore_index = ignore_index self.task_type = "segmentation" @@ -51,7 +53,7 @@ def __init__( for f in os.listdir(img_dir) if f.lower().endswith((".jpg", ".jpeg", ".png")) ] - image_files = sorted(set(image_files))[:max_samples] if max_samples is not None else sorted(set(image_files)) # Optionally limit number of samples for faster testing + image_files = sorted(set(image_files))[:max_samples] if max_samples != None else sorted(set(image_files)) # Optionally limit number of samples for faster testing self.images = [] self.masks = [] @@ -113,6 +115,16 @@ def get_items(self, idx, include_metadata=False, include_labels=False, include_i img = Image.open(img_path).convert("RGB") img_t = self.image_transform(img) + # Process labels/masks + # # # Sample wise segmentation + # mask_t = None + # if include_labels: + # mask = Image.open(mask_path) + # mask_r = self.mask_resize(mask) + # mask_np = np.array(mask_r, dtype=np.int64) + # mask_t = torch.from_numpy(mask_np) # [H, W] int64 + # return img_t, uid, mask_t, metadata + # # Instance wise segmentaiton # Process labels/masks mask_t_instances = list() mask_t = None @@ -120,17 +132,16 @@ def get_items(self, idx, include_metadata=False, include_labels=False, include_i mask = Image.open(mask_path) mask_r = self.mask_resize(mask) mask_np = np.array(mask_r, dtype=np.int64) - mask_t = torch.from_numpy(mask_np) # [H, W] int64 + mask_t = torch.from_numpy(mask_np) # [H, W] int64 # Format labels to register multiple instance_ids lbl_max = mask_t.max().item() for i in range(1, lbl_max + 1): m = torch.zeros_like(mask_t) - m[mask_t == i] = i # Assign class ID as instance ID for simplicity; if set to 1, all instances of the same class would be merged... + m[mask_t == i] = i # Assign class ID as instance ID for simplicity; if set to 1, all instances of the same class would be merged... mask_t_instances.append(m) return img_t, uid, mask_t_instances, metadata - def seg_collate(batch): """Collate WL per-sample tuples for instance-segmentation. @@ -141,10 +152,10 @@ def seg_collate(batch): background) are filtered out so every kept instance is a real annotation. Returns: - images: FloatTensor [B, C, H, W] - ids: list[str] of length B - labels: list[B] where labels[s] is a list of instance mask tensors - metas: list[B] of metadata dicts + images: FloatTensor [B, C, H, W] + ids: list[str] of length B + labels: list[B] where labels[s] is a list of instance mask tensors + metas: list[B] of metadata dicts """ images = torch.stack([b[0] for b in batch], dim=0) ids = [b[1] for b in batch] diff --git a/weightslab/examples/Ultralytics/ws-detection/config.yaml b/weightslab/examples/Ultralytics/ws-detection/config.yaml index 2ba8184e..512f60a4 100644 --- a/weightslab/examples/Ultralytics/ws-detection/config.yaml +++ b/weightslab/examples/Ultralytics/ws-detection/config.yaml @@ -46,14 +46,14 @@ data: preload_labels: true preload_metadata: true drop_last: false - num_workers: 4 + num_workers: 0 # Force to 0 for windows for perfs. opt val_loader: batch_size: 2 shuffle: false preload_labels: true preload_metadata: true drop_last: false - num_workers: 4 + num_workers: 0 # Force to 0 for windows for perfs. opt signals_cfg: train_nms: diff --git a/weightslab/examples/Usecases/ws-2d-lidar-detection/config.yaml b/weightslab/examples/Usecases/ws-2d-lidar-detection/config.yaml index 80b795c2..edde4c24 100644 --- a/weightslab/examples/Usecases/ws-2d-lidar-detection/config.yaml +++ b/weightslab/examples/Usecases/ws-2d-lidar-detection/config.yaml @@ -2,6 +2,7 @@ device: auto experiment_name: lidar2d_detection_usecase training_steps_to_do: null # null = infinite training until manually stopped +# root_log_dir: # Empty to write in tmp directory, or specify a path to store logs and checkpoints checkpoint_manager: load_config: false diff --git a/weightslab/src.py b/weightslab/src.py index a41d9939..e55be200 100644 --- a/weightslab/src.py +++ b/weightslab/src.py @@ -3012,22 +3012,9 @@ def _check_cancel_or_timeout(self) -> None: if self._controller.is_cancel_requested(): raise _EvalCanceled(f"Evaluation on '{self._split_name}' canceled by user") - elapsed = time.monotonic() - self._start_time - if self._absolute_timeout > 0 and elapsed > self._absolute_timeout: - raise _EvalTimeout( - f"Evaluation timeout on '{self._split_name}' after {elapsed:.1f}s (configured {self._absolute_timeout:.1f}s)" - ) - if self._total_batches <= 0 or self._processed_batches <= 0 or self._avg_batch_seconds <= 0: return - projected = self._avg_batch_seconds * self._total_batches - timeout_seconds = max(self._min_seconds, projected * self._multiplier) - if elapsed > timeout_seconds: - raise _EvalTimeout( - f"Evaluation timeout on '{self._split_name}' after {elapsed:.1f}s " - f"(projected={projected:.1f}s, limit={timeout_seconds:.1f}s, multiplier={self._multiplier:.2f})" - ) def __len__(self): return len(self._loader) diff --git a/weightslab/trainer/services/data_service.py b/weightslab/trainer/services/data_service.py index c2f97d2b..4860517f 100755 --- a/weightslab/trainer/services/data_service.py +++ b/weightslab/trainer/services/data_service.py @@ -1362,6 +1362,8 @@ def _process_sample_row(self, args): except Exception: label_arr = np.array([]) + if label_arr.ndim == 1 and label_arr.size >= 4: + label_arr = label_arr.reshape(1, -1) if label_arr.size > 0 and label_arr.ndim == 2 and label_arr.shape[-1] >= 4: # 6-col rows when class+score present: [x,y,x,y,class,score] # (or [tl_x,tl_y,w,h,class,score] for the xywh-top-left flavor). @@ -2771,7 +2773,7 @@ def _build_metadata_only_response(self, df_slice: pd.DataFrame, request): # detection) pred/target are large JSON arrays (~310 KB/record) that bloat the # response to 100s of MB and silently break the histogram fetch. if not requested_cols: - _HEAVY_BLOB_COLS = {"pred", "prediction", "prediction_raw", "target"} + _HEAVY_BLOB_COLS = {"prediction_raw"} requested_cols = [c for c in df_slice.columns if c not in _HEAVY_BLOB_COLS] metadata_cols = [ @@ -3808,6 +3810,8 @@ def EditDataSample(self, request, context): ) tag_col = f"{SampleStatsEx.TAG.value}:{tag_name}" + if tag_col not in self._all_datasets_df.columns: + self._slowUpdateInternals() df = safe_reset_index(self._all_datasets_df) if tag_col not in df.columns: return pb2.DataEditsResponse(