diff --git a/.claude/settings.json b/.claude/settings.json
new file mode 100644
index 0000000..1eb082d
--- /dev/null
+++ b/.claude/settings.json
@@ -0,0 +1,191 @@
+{
+ "permissions": {
+ "allow": [
+ "Read(*)",
+ "Edit(*)",
+ "Write(*)",
+ "Glob(*)",
+ "Grep(*)",
+ "WebFetch(*)",
+ "WebSearch(*)",
+ "Task(*)",
+ "NotebookEdit(*)",
+ "Skill(*)",
+ "Agent(*)",
+ "Bash(uv *)",
+ "Bash(pnpm *)",
+ "Bash(npm *)",
+ "Bash(npx *)",
+ "Bash(pip *)",
+ "Bash(python *)",
+ "Bash(python3 *)",
+ "Bash(node *)",
+ "Bash(tsx *)",
+ "Bash(tsc *)",
+ "Bash(pytest *)",
+ "Bash(rg *)",
+ "Bash(find *)",
+ "Bash(ls *)",
+ "Bash(cat *)",
+ "Bash(head *)",
+ "Bash(tail *)",
+ "Bash(wc *)",
+ "Bash(sort *)",
+ "Bash(grep *)",
+ "Bash(awk *)",
+ "Bash(sed *)",
+ "Bash(echo *)",
+ "Bash(printf *)",
+ "Bash(mkdir *)",
+ "Bash(cp *)",
+ "Bash(mv *)",
+ "Bash(touch *)",
+ "Bash(chmod +x *)",
+ "Bash(git add *)",
+ "Bash(git commit *)",
+ "Bash(git status*)",
+ "Bash(git log *)",
+ "Bash(git diff *)",
+ "Bash(git branch *)",
+ "Bash(git checkout *)",
+ "Bash(git stash *)",
+ "Bash(git tag *)",
+ "Bash(git remote -v*)",
+ "Bash(git rev-parse *)",
+ "Bash(git show *)",
+ "Bash(docker compose *)",
+ "Bash(docker build *)",
+ "Bash(docker ps*)",
+ "Bash(docker images*)",
+ "Bash(docker logs *)",
+ "Bash(docker inspect *)",
+ "Bash(docker exec *)",
+ "Bash(docker run *)",
+ "Bash(docker stop *)",
+ "Bash(docker start *)",
+ "Bash(curl *)",
+ "Bash(wget *)",
+ "Bash(ssh *)",
+ "Bash(rsync *)",
+ "Bash(scp *)",
+ "Bash(ping *)",
+ "Bash(ifconfig*)",
+ "Bash(networksetup *)",
+ "Bash(brew *)",
+ "Bash(which *)",
+ "Bash(env *)",
+ "Bash(export *)",
+ "Bash(source *)",
+ "Bash(eval *)",
+ "Bash(cd *)",
+ "Bash(pwd*)",
+ "Bash(date*)",
+ "Bash(df *)",
+ "Bash(du *)",
+ "Bash(free *)",
+ "Bash(top *)",
+ "Bash(htop*)",
+ "Bash(ps *)",
+ "Bash(lsof *)",
+ "Bash(nc *)",
+ "Bash(tar *)",
+ "Bash(unzip *)",
+ "Bash(zip *)",
+ "Bash(jq *)",
+ "Bash(yq *)",
+ "Bash(tree *)",
+ "Bash(xargs *)",
+ "Bash(tee *)",
+ "Bash(diff *)",
+ "Bash(patch *)",
+ "Bash(ruff *)",
+ "Bash(mypy *)",
+ "Bash(black *)",
+ "Bash(isort *)",
+ "Bash(eslint *)",
+ "Bash(prettier *)",
+ "Bash(cargo *)",
+ "Bash(rustc *)",
+ "Bash(go *)",
+ "Bash(make *)",
+ "Bash(cmake *)",
+ "Bash(conda *)",
+ "Bash(mamba *)",
+ "Bash(ros2 *)",
+ "Bash(colcon *)",
+ "Bash(osgrep *)",
+ "Bash(gh *)",
+ "Bash(rtk *)"
+ ],
+ "deny": [
+ "Bash(rm -rf /)*",
+ "Bash(rm -rf ~)*",
+ "Bash(rm -rf /*)*",
+ "Bash(rm -rf .)*",
+ "Bash(rm -rf ..)*",
+ "Bash(sudo rm -rf *)",
+ "Bash(sudo rm -r /)*",
+ "Bash(git push --force *)",
+ "Bash(git push -f *)",
+ "Bash(git push --force-with-lease *)",
+ "Bash(git reset --hard *)",
+ "Bash(git clean -fd*)",
+ "Bash(git checkout -- .)*",
+ "Bash(git restore .)*",
+ "Bash(git rebase -i *)",
+ "Bash(git push origin master*)",
+ "Bash(mkfs *)",
+ "Bash(dd if=*of=/dev/*)",
+ "Bash(shutdown *)",
+ "Bash(reboot *)",
+ "Bash(halt *)",
+ "Bash(init 0*)",
+ "Bash(:(){ :|:& };:)*",
+ "Bash(> /dev/sd*)",
+ "Bash(> /dev/nvme*)",
+ "Bash(curl * | sh)*",
+ "Bash(curl * | bash)*",
+ "Bash(wget * | sh)*",
+ "Bash(wget * | bash)*",
+ "Bash(chmod 777 *)",
+ "Bash(chmod -R 777 *)",
+ "Bash(chown -R *)",
+ "Bash(chgrp -R *)",
+ "Bash(pkill -9 *)",
+ "Bash(killall *)",
+ "Bash(kill -9 -1*)",
+ "Bash(sudo *)",
+ "Bash(su *)",
+ "Bash(passwd *)",
+ "Bash(usermod *)",
+ "Bash(useradd *)",
+ "Bash(userdel *)",
+ "Bash(visudo *)",
+ "Bash(crontab -r*)",
+ "Bash(iptables -F*)",
+ "Bash(systemctl stop *)",
+ "Bash(systemctl disable *)",
+ "Bash(launchctl unload *)",
+ "Bash(npm publish *)",
+ "Bash(pip upload *)",
+ "Bash(twine upload *)",
+ "Bash(docker push *)",
+ "Bash(docker rmi -f *)",
+ "Bash(docker system prune -a*)",
+ "Bash(docker volume rm *)",
+ "Bash(dropdb *)",
+ "Bash(drop database *)",
+ "Bash(DROP DATABASE *)",
+ "Bash(mongo * --eval *dropDatabase*)",
+ "Bash(redis-cli FLUSHALL*)",
+ "Bash(aws s3 rm *--recursive*)",
+ "Bash(aws s3 rb *--force*)",
+ "Bash(terraform destroy *)",
+ "Bash(kubectl delete namespace *)",
+ "Bash(kubectl delete -f * --all*)",
+ "Bash(gh repo delete *)",
+ "Bash(gh issue close *)",
+ "Bash(gh pr close *)"
+ ]
+ }
+}
diff --git a/.gitignore b/.gitignore
index aeac006..9ea9cb9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -65,4 +65,12 @@ gradio_queue.db
# stable diffusion
*.ckpt
-*.o
\ No newline at end of file
+*.o
+
+# optional third_party checkouts (source-only forks can be re-cloned locally)
+third_party/latent-diffusion/.git
+third_party/taming-transformers/.git
+
+# Third-party cloned repos (managed by bootstrap scripts)
+third_party/latent-diffusion/
+third_party/taming-transformers/
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..aa459b1
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,7 @@
+[submodule "third_party/latent-diffusion"]
+ path = third_party/latent-diffusion
+ url = https://github.com/CompVis/latent-diffusion.git
+
+[submodule "third_party/taming-transformers"]
+ path = third_party/taming-transformers
+ url = https://github.com/CompVis/taming-transformers.git
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..33d4c41
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,62 @@
+# ODISE — Open-Vocabulary Panoptic Segmentation
+
+Open-vocabulary panoptic segmentation using pre-trained text-image diffusion and discriminative models (CVPR 2023 Highlight, NVIDIA).
+
+## Architecture
+```
+odise/
+├── checkpoint/ # Custom checkpointer (ODISE weights)
+├── config/ # Detectron2-style configs
+├── data/ # Dataset registration & transforms
+├── engine/ # Training loop & defaults
+├── evaluation/ # Eval metrics
+├── model_zoo/ # Pre-built model configs
+├── modeling/ # Core models (diffusion, meta-arch, backbone, wrapper)
+└── utils/ # Env collection, misc helpers
+configs/ # YAML/Python training configs
+third_party/ # Mask2Former, latent-diffusion, taming-transformers
+tools/ # train_net.py, extract_features.py, bootstrap script
+demo/ # Gradio demo app
+```
+
+## Key Dependencies
+- Python >=3.10, PyTorch >=2.0
+- detectron2, Mask2Former (local third_party)
+- open-clip-torch==2.0.2, timm==0.6.11
+- numpy<2.0, omegaconf>=2.3
+- Stable Diffusion via latent-diffusion/taming-transformers submodules
+
+## Dev Commands
+```bash
+# Activate env (GPU server)
+source /mnt/forge-data/activate.sh
+
+# Install
+uv pip install -e .
+
+# Bootstrap third-party submodules
+bash tools/bootstrap_third_party.sh
+
+# Train
+CUDA_VISIBLE_DEVICES=0,1,2,3 python tools/train_net.py --config-file configs/common/train.py --num-gpus 4
+
+# Demo
+python demo/demo.py
+
+# Lint
+ruff check odise/ --select E,F,I,B,UP
+isort --check odise/
+mypy odise/
+```
+
+## Conventions
+- Package manager: `uv` (never pip directly)
+- Search: `rg` (ripgrep), never `grep`
+- Line length: 100
+- Style: isort + ruff
+- Config: Detectron2 LazyConfig system (Python-based configs)
+- Git commit prefix: `[ODISE]`
+- Training outputs: `/mnt/artifacts-datai/`
+
+# currentDate
+Today's date is 2026-03-29.
diff --git a/GETTING_STARTED.md b/GETTING_STARTED.md
index 7eb67e2..e5fd1c4 100644
--- a/GETTING_STARTED.md
+++ b/GETTING_STARTED.md
@@ -6,6 +6,22 @@ For further reading, please refer to [Getting Started with Detectron2](https://g
**Important Note**: ODISE's `demo/demo.py` and `tools/train_net.py` scripts link to the original pre-trained models for [Stable Diffusion v1.3](https://huggingface.co/CompVis/stable-diffusion-v-1-3-original/resolve/main/sd-v1-3.ckpt) and [CLIP](https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt). When you run them for the very first time, these scripts will automatically download the pre-trained models for Stable Diffuson and CLIP, from their original sources, to your local directories `$HOME/.torch/` and `$HOME/.cache/clip`, respectively. Their use is subject to the original license terms defined at [https://github.com/CompVis/stable-diffusion](https://github.com/CompVis/stable-diffusion) and [https://github.com/openai/CLIP](https://github.com/openai/CLIP), respectively.
+If you use Stable Diffusion backbones (which rely on latent-diffusion and taming-transformers), initialize the optional `third_party` checkouts first:
+
+```bash
+bash tools/bootstrap_third_party.sh
+```
+
+If your clone did not include submodules, or if you need a clean refresh:
+
+```bash
+bash tools/bootstrap_third_party.sh --force
+```
+or
+```bash
+git submodule update --init --recursive
+```
+
### Inference Demo with Pre-trained ODISE Models
@@ -49,39 +65,40 @@ python demo/demo.py --input demo/examples/purse.jpeg --output demo/purse_pred.jp
We provide a script `tools/train_net.py` that trains all configurations of ODISE.
To train a model with `tools/train_net.py`, first prepare the datasets following the instructions in
-[datasets/README.md](./datasets/README.md) and then run, for single-node (8-GPUs) NVIDIA AMP-based training:
-```bash
-(node0)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --num-gpus 8 --amp
-```
-For 4-node (32-GPUs) AMP-based training, run:
+[datasets/README.md](./datasets/README.md) and then run, for CPU-first single-process training:
```bash
-(node0)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --machine-rank 0 --num-machines 4 --dist-url tcp://${MASTER_ADDR}:29500 --num-gpus 8 --amp
-(node1)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --machine-rank 1 --num-machines 4 --dist-url tcp://${MASTER_ADDR}:29500 --num-gpus 8 --amp
-(node2)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --machine-rank 2 --num-machines 4 --dist-url tcp://${MASTER_ADDR}:29500 --num-gpus 8 --amp
-(node3)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --machine-rank 3 --num-machines 4 --dist-url tcp://${MASTER_ADDR}:29500 --num-gpus 8 --amp
+./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --num-gpus 1 --force-cpu
```
-Note that our default training configurations are designed for 32 GPUs.
-Since we use the AdamW optimizer, it is not clear as to how to scale the learning rate with batch size.
-However, we provide the ability to automatically scale the learning rate and the batch size for any number of GPUs used for training by passing in the`--ref $REFERENCE_WORLD_SIZE` argument.
-For example, if you set `$REFERENCE_WORLD_SIZE=32` while training on 8 GPUs, the batch size and learning rate will be set to 8/32 = 0.25 of the original ones.
+AMP is only enabled when CUDA is available. On CPU-only machines, training falls back to full precision.
-```bash
-(node0)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --num-gpus 8 --amp --ref 32
-```
+For multi-GPU training (optional, if you still run distributed CUDA), keep your existing launch pattern and pass `--num-gpus` plus `--amp` as before.
-ODISE trains in 6 days on 32 NVIDIA V100 GPUs.
+### High-throughput Feature Extraction
+
+`tools/extract_features.py` supports distributed extraction. For CPU-only use:
-To evaluate a trained ODISE model's performance, run on single node
-```
-(node0)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --num-gpus 8 --eval-only --init-from /path/to/checkpoint
-```
-or for multi-node inference:
```bash
-(node0)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --machine-rank 0 --num-machines 4 --dist-url tcp://${MASTER_ADDR}:29500 --num-gpus 8 --eval-only --init-from /path/to/checkpoint
-(node1)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --machine-rank 1 --num-machines 4 --dist-url tcp://${MASTER_ADDR}:29500 --num-gpus 8 --eval-only --init-from /path/to/checkpoint
-(node2)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --machine-rank 2 --num-machines 4 --dist-url tcp://${MASTER_ADDR}:29500 --num-gpus 8 --eval-only --init-from /path/to/checkpoint
-(node3)$ ./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --machine-rank 3 --num-machines 4 --dist-url tcp://${MASTER_ADDR}:29500 --num-gpus 8 --eval-only --init-from /path/to/checkpoint
+python tools/extract_features.py \
+ --config-file configs/Panoptic/odise_label_coco_50e.py \
+ --num-gpus 1 \
+ --force-cpu \
+ --num-machines 1 \
+ --init-from /path/to/checkpoint.pth \
+ --output /path/to/feature_out \
+ --dataloader dataloader.test \
+ --feature-layers s2,s3,s4,s5
+```
+
+You can scale this to multi-GPU later by increasing `--num-gpus` and `--num-machines` once your environment is configured for distributed execution.
+
+`--dataloader` is a dotted path inside the config; for the built-in Panoptic configs this is `dataloader.test`.
+Each `.pt` file stores a single image's normalized feature maps and metadata and can be merged later as needed.
+
+To evaluate a trained ODISE model in a single CPU-only process:
+```
+./tools/train_net.py --config-file configs/Panoptic/odise_label_coco_50e.py --num-gpus 1 --force-cpu --eval-only --init-from /path/to/checkpoint
```
+or use distributed multi-node/multi-GPU launch flags as needed in your own environment.
To use the our provided ODISE [model zoo](README.md#model-zoo), you can pass in the arguments `--config-file configs/Panoptic/odise_label_coco_50e.py --init-from odise://Panoptic/odise_label_coco_50e` or `--config-file configs/Panoptic/odise_label_coco_50e.py --init-from odise://Panoptic/odise_caption_coco_50e` to `./tools/train_net.py`, respectively.
diff --git a/README.md b/README.md
index fea6d56..35a3ee3 100644
--- a/README.md
+++ b/README.md
@@ -52,32 +52,67 @@ If you find our work useful in your research, please cite:
## Environment Setup
-Install dependencies by running:
+Install with PyTorch 2.x using `uv` (CPU-first path by default):
```bash
-conda create -n odise python=3.9
-conda activate odise
-conda install pytorch=1.13.1 torchvision=0.14.1 pytorch-cuda=11.6 -c pytorch -c nvidia
-conda install -c "nvidia/label/cuda-11.6.1" libcusolver-dev
-git clone git@github.com:NVlabs/ODISE.git
-cd ODISE
-pip install -e .
+uv venv .venv --python 3.10
+source .venv/bin/activate
+uv pip install --upgrade pip setuptools wheel
+uv pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu
+uv pip install -e .
+
+# Optional S3 path support (used only when training/inference references s3:// URLs):
+uv pip install -e ".[s3]"
+
+# LDM/Stable Diffusion integrations require optional third-party checkouts:
+# initialize them with submodules or bootstrap script:
+#
+# git submodule update --init --recursive
+#
+# If you prefer a one-command local bootstrap, or if cloning was done without submodules:
+#
+# bash tools/bootstrap_third_party.sh
+# For a clean reset of existing accidental nested git checkouts, pass `--force`:
+# bash tools/bootstrap_third_party.sh --force
+
+# If you are running on CUDA machines and want GPU support, install CUDA wheels instead:
+# uv pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121
+```
+
+Optional: rebuild Mask2Former CUDA kernels after any Torch/CUDA update:
+
+```bash
+cd third_party/Mask2Former
+python setup.py build install
+```
+
+For offline feature extraction (CPU/default path):
+
+```bash
+python tools/extract_features.py \
+ --config-file configs/Panoptic/odise_label_coco_50e.py \
+ --force-cpu \
+ --init-from /path/to/checkpoint.pth \
+ --output /path/to/feature_out \
+ --num-gpus 1 \
+ --dataloader dataloader.test \
+ --feature-layers s2,s3,s4,s5
```
(Optional) install [xformers](https://github.com/facebookresearch/xformers) for efficient transformer implementation:
One could either install the pre-built version
```
-pip install xformers==0.0.16
+uv pip install xformers==0.0.16
```
or build from latest source
```bash
# (Optional) Makes the build much faster
-pip install ninja
+uv pip install ninja
# Set TORCH_CUDA_ARCH_LIST if running and building on different GPU types
-pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
+uv pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
# (this can take dozens of minutes)
```
diff --git a/configs/common/train.py b/configs/common/train.py
index 8382d13..386618c 100644
--- a/configs/common/train.py
+++ b/configs/common/train.py
@@ -34,7 +34,7 @@
checkpointer=dict(period=5000, max_to_keep=2), # options for PeriodicCheckpointer
eval_period="${train.checkpointer.period}",
log_period=50,
- device="cuda",
+ device="cpu",
seed=42,
# ...
wandb=dict(
diff --git a/demo/app.py b/demo/app.py
index c7eaea4..78cedb1 100644
--- a/demo/app.py
+++ b/demo/app.py
@@ -9,9 +9,11 @@
# ------------------------------------------------------------------------------
import itertools
-import json
-from contextlib import ExitStack
-import gradio as gr
+from contextlib import ExitStack, nullcontext
+try:
+ import gradio as gr
+except Exception:
+ gr = None
import torch
from detectron2.config import instantiate
from detectron2.data import MetadataCatalog
@@ -24,7 +26,6 @@
from detectron2.utils.visualizer import ColorMode, Visualizer, random_color
from mask2former.data.datasets.register_ade20k_panoptic import ADE20K_150_CATEGORIES
from PIL import Image
-from torch.cuda.amp import autocast
from odise import model_zoo
from odise.checkpoint import ODISECheckpointer
@@ -83,6 +84,7 @@ def __init__(self, model, metadata, aug, instance_mode=ColorMode.IMAGE):
self.aug = aug
self.cpu_device = torch.device("cpu")
self.instance_mode = instance_mode
+ self._autocast_ctx = nullcontext()
def predict(self, original_image):
"""
@@ -102,7 +104,7 @@ def predict(self, original_image):
inputs = {"image": image, "height": height, "width": width}
logger.info("forwarding")
- with autocast():
+ with self._autocast_ctx:
predictions = self.model([inputs])[0]
logger.info("done")
return predictions
@@ -137,29 +139,36 @@ def run_on_image(self, image):
models = {}
-for model_name, cfg_name in zip(
- ["ODISE(Label)", "ODISE(Caption)"],
- ["Panoptic/odise_label_coco_50e.py", "Panoptic/odise_caption_coco_50e.py"],
-):
+# Cache of loaded demo models, keyed by display name; filled by _load_demo_models().
+_DEMO_MODELS = {}
+# (display name, model-zoo config path) pairs for the models offered by the demo.
+_DEMO_MODEL_CONFIGS = [
+ ("ODISE(Label)", "Panoptic/odise_label_coco_50e.py"),
+ ("ODISE(Caption)", "Panoptic/odise_caption_coco_50e.py"),
+]
+
- cfg = model_zoo.get_config(cfg_name, trained=True)
+def _load_demo_models():
+ """Load every model listed in ``_DEMO_MODEL_CONFIGS`` once and return the cache.
+
+ Returns:
+ dict mapping each model name to ``{"model": ..., "aug": ...}``. When the cache is
+ already populated it is returned as-is without loading anything again.
+ """
+ if _DEMO_MODELS:
+ return _DEMO_MODELS
- cfg.model.overlap_threshold = 0
- cfg.model.clip_head.alpha = 0.35
- cfg.model.clip_head.beta = 0.65
- cfg.train.device = "cuda" if torch.cuda.is_available() else "cpu"
- seed_all_rng(42)
+ for model_name, cfg_name in _DEMO_MODEL_CONFIGS:
+ cfg = model_zoo.get_config(cfg_name, trained=True)
- dataset_cfg = cfg.dataloader.test
- wrapper_cfg = cfg.dataloader.wrapper
+ cfg.model.overlap_threshold = 0
+ cfg.model.clip_head.alpha = 0.35
+ cfg.model.clip_head.beta = 0.65
+ # The device is pinned to CPU here, so the float32 branch below is the one taken.
+ cfg.train.device = "cpu"
+ seed_all_rng(42)
- aug = instantiate(dataset_cfg.mapper).augmentations
+ dataset_cfg = cfg.dataloader.test
+ aug = instantiate(dataset_cfg.mapper).augmentations
- model = instantiate_odise(cfg.model)
- model.to(torch.float16)
- model.to(cfg.train.device)
- ODISECheckpointer(model).load(cfg.train.init_checkpoint)
- models[model_name] = model
+ model = instantiate_odise(cfg.model)
+ model.to(torch.float32 if cfg.train.device == "cpu" else torch.float16)
+ model.to(cfg.train.device)
+ ODISECheckpointer(model).load(cfg.train.init_checkpoint)
+ # Keep the test-time augmentations next to the model they were built for.
+ _DEMO_MODELS[model_name] = {"model": model, "aug": aug}
+
+ return _DEMO_MODELS
title = "ODISE"
@@ -249,10 +258,13 @@ def inference(image_path, vocab, label_list, model_name):
demo_classes, demo_metadata = build_demo_classes_and_metadata(vocab, label_list)
if model_name is None:
model_name = "ODISE(Label)"
+ model_bundle = _load_demo_models().get(model_name, _load_demo_models()["ODISE(Label)"])
+ model = model_bundle["model"]
+ aug = model_bundle["aug"]
with ExitStack() as stack:
logger.info(f"loading model {model_name}")
inference_model = OpenPanopticInference(
- model=models[model_name],
+ model=model,
labels=demo_classes,
metadata=demo_metadata,
semantic_on=False,
@@ -268,65 +280,87 @@ def inference(image_path, vocab, label_list, model_name):
return Image.fromarray(visualized_output.get_image())
-with gr.Blocks(title=title) as demo:
- gr.Markdown("
" + title + "
")
- gr.Markdown(description)
- input_components = []
- output_components = []
+def build_demo():
+ if gr is None:
+ raise ImportError(
+ "gradio is required to build the app. Install with `pip install 'odise[app]'`."
+ )
+ with gr.Blocks(title=title) as demo:
+ gr.Markdown("" + title + "
")
+ gr.Markdown(description)
+ input_components = []
+ output_components = []
+
+ with gr.Row():
+ output_image_gr = gr.Image(label="Panoptic Segmentation", type="pil")
+ output_components.append(output_image_gr)
+
+ with gr.Row(equal_height=True):
+ with gr.Column(scale=3, variant="panel") as input_component_column:
+ input_image_gr = gr.Image(type="filepath")
+ model_name_gr = gr.Dropdown(
+ label="Model",
+ choices=["ODISE(Label)", "ODISE(Caption)"],
+ value="ODISE(Label)",
+ )
+ extra_vocab_gr = gr.Textbox(value="", label="Extra Vocabulary")
+ category_list_gr = gr.CheckboxGroup(
+ choices=[
+ "COCO (133 categories)",
+ "ADE (150 categories)",
+ "LVIS (1203 categories)",
+ ],
+ value=[
+ "COCO (133 categories)",
+ "ADE (150 categories)",
+ "LVIS (1203 categories)",
+ ],
+ label="Category to use",
+ )
+ input_components.extend([input_image_gr, extra_vocab_gr, category_list_gr])
+
+ with gr.Column(scale=2):
+ examples_handler = gr.Examples(
+ examples=examples,
+ inputs=[c for c in input_components if not isinstance(c, gr.State)],
+ outputs=[c for c in output_components if not isinstance(c, gr.State)],
+ fn=inference,
+ cache_examples=False,
+ examples_per_page=5,
+ )
+ with gr.Row():
+ clear_btn = gr.Button("Clear")
+ submit_btn = gr.Button("Submit", variant="primary")
+
+ gr.Markdown(article)
+
+ submit_btn.click(
+ inference,
+ input_components + [model_name_gr],
+ output_components,
+ api_name="predict",
+ scroll_to_output=True,
+ )
- with gr.Row():
- output_image_gr = gr.outputs.Image(label="Panoptic Segmentation", type="pil")
- output_components.append(output_image_gr)
+ def clear_inputs():
+ return [None, "", [
+ "COCO (133 categories)",
+ "ADE (150 categories)",
+ "LVIS (1203 categories)",
+ ], None]
+
+ clear_btn.click(
+ clear_inputs,
+ [],
+ input_components + output_components,
+ )
+ return demo
- with gr.Row().style(equal_height=True, mobile_collapse=True):
- with gr.Column(scale=3, variant="panel") as input_component_column:
- input_image_gr = gr.inputs.Image(type="filepath")
- model_name_gr = gr.inputs.Dropdown(
- label="Model", choices=["ODISE(Label)", "ODISE(Caption)"], default="ODISE(Label)"
- )
- extra_vocab_gr = gr.inputs.Textbox(default="", label="Extra Vocabulary")
- category_list_gr = gr.inputs.CheckboxGroup(
- choices=["COCO (133 categories)", "ADE (150 categories)", "LVIS (1203 categories)"],
- default=["COCO (133 categories)", "ADE (150 categories)", "LVIS (1203 categories)"],
- label="Category to use",
- )
- input_components.extend([input_image_gr, extra_vocab_gr, category_list_gr])
-
- with gr.Column(scale=2):
- examples_handler = gr.Examples(
- examples=examples,
- inputs=[c for c in input_components if not isinstance(c, gr.State)],
- outputs=[c for c in output_components if not isinstance(c, gr.State)],
- fn=inference,
- cache_examples=torch.cuda.is_available(),
- examples_per_page=5,
- )
- with gr.Row():
- clear_btn = gr.Button("Clear")
- submit_btn = gr.Button("Submit", variant="primary")
-
- gr.Markdown(article)
-
- submit_btn.click(
- inference,
- input_components + [model_name_gr],
- output_components,
- api_name="predict",
- scroll_to_output=True,
- )
-
- clear_btn.click(
- None,
- [],
- (input_components + output_components + [input_component_column]),
- _js=f"""() => {json.dumps(
- [component.cleared_value if hasattr(component, "cleared_value") else None
- for component in input_components + output_components] + (
- [gr.Column.update(visible=True)]
- )
- + ([gr.Column.update(visible=False)])
- )}
- """,
- )
-
-demo.launch()
+
+def main():
+ """Build the demo UI with ``build_demo()`` and launch it."""
+ demo = build_demo()
+ demo.launch()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/demo/demo.py b/demo/demo.py
index 2c8af51..34c9ceb 100644
--- a/demo/demo.py
+++ b/demo/demo.py
@@ -49,12 +49,17 @@
from odise.data import get_openseg_labels
from odise.engine.defaults import get_model_from_module
-nltk.download("popular", quiet=True)
-nltk.download("universal_tagset", quiet=True)
-
# constants
WINDOW_NAME = "ODISE demo"
+
+def _ensure_nltk_resources():
+ try:
+ nltk.download("popular", quiet=True)
+ nltk.download("universal_tagset", quiet=True)
+ except Exception as e:
+ warnings.warn(f"Skipping NLTK corpus downloads: {e}")
+
COCO_THING_CLASSES = [
label
for idx, label in enumerate(get_openseg_labels("coco_panoptic", True))
@@ -328,6 +333,7 @@ def test_opencv_video_format(codec, file_ext):
extra_classes.append([word.strip() for word in words.split(",")])
if args.caption:
+ _ensure_nltk_resources()
caption_words = []
caption_words.extend(get_nouns(args.caption, True))
caption_words.extend(get_nouns(args.caption, False))
@@ -351,7 +357,7 @@ def test_opencv_video_format(codec, file_ext):
demo_thing_classes += COCO_THING_CLASSES
demo_stuff_classes += COCO_STUFF_CLASSES
demo_thing_colors += COCO_THING_COLORS
- demo_stuff_colors = COCO_STUFF_COLORS
+ demo_stuff_colors += COCO_STUFF_COLORS
if "ADE" in args.label:
demo_thing_classes += ADE_THING_CLASSES
demo_stuff_classes += ADE_STUFF_CLASSES
diff --git a/odise/__init__.py b/odise/__init__.py
index b01ac44..99e21ce 100644
--- a/odise/__init__.py
+++ b/odise/__init__.py
@@ -10,4 +10,20 @@
# This line will be programatically read/write by setup.py.
# Leave them at the bottom of this file and don't touch them.
+
+import os
+import sys
+
+
+def _bootstrap_vendor_paths() -> None:
+ project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+ third_party = os.path.join(project_root, "third_party")
+ for name in ("Mask2Former", "latent-diffusion", "taming-transformers"):
+ pkg_root = os.path.join(third_party, name)
+ if os.path.isdir(pkg_root) and pkg_root not in sys.path:
+ sys.path.insert(0, pkg_root)
+
+
+_bootstrap_vendor_paths()
+
__version__ = "0.1"
diff --git a/odise/checkpoint/odise_checkpointer.py b/odise/checkpoint/odise_checkpointer.py
index cb281c9..fa106ff 100644
--- a/odise/checkpoint/odise_checkpointer.py
+++ b/odise/checkpoint/odise_checkpointer.py
@@ -19,6 +19,7 @@
from typing import List
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.checkpoint.c2_model_loading import align_and_update_state_dicts
+import torch
from fvcore.common.checkpoint import Checkpointer
from odise.utils.file_io import PathManager
@@ -138,3 +139,10 @@ def _load_model(self, checkpoint):
# rename the keys in checkpoint
checkpoint["model"] = checkpoint.pop("state_dict")
return super()._load_model(checkpoint)
+
+ def _load_file(self, file):
+ """Load the checkpoint at ``file`` onto the CPU with full (non-weights-only) unpickling."""
+ # PyTorch 2.6 changed the torch.load default to weights_only=True, which breaks
+ # legacy ODISE LDV checkpoints containing optimizer/scheduler objects.
+ # These checkpoints are trusted and loaded from project-provided sources, so keep legacy behavior.
+ # SECURITY: weights_only=False unpickles arbitrary objects; never pass untrusted files here.
+ with self.path_manager.open(file, "rb") as f:
+ return torch.load(f, map_location=torch.device("cpu"), weights_only=False)
diff --git a/odise/engine/defaults.py b/odise/engine/defaults.py
index a2f906e..5fc7091 100644
--- a/odise/engine/defaults.py
+++ b/odise/engine/defaults.py
@@ -24,8 +24,6 @@
from detectron2.utils.file_io import PathManager
from detectron2.utils.logger import setup_logger
-from odise.utils.collect_env import collect_env_info
-
def get_model_from_module(model):
if hasattr(model, "module"):
@@ -65,16 +63,22 @@ def default_setup(cfg, args):
logger = setup_logger(log_dir, distributed_rank=rank)
logger.info("Rank of current process: {}. World size: {}".format(rank, comm.get_world_size()))
- logger.info("Environment info:\n" + collect_env_info())
+ try:
+ from odise.utils.collect_env import collect_env_info
+
+ logger.info("Environment info:\n" + collect_env_info())
+ except Exception as e:
+ logger.warning(f"Skipping environment collection due: {e}")
logger.info("Command line arguments: " + str(args))
if hasattr(args, "config_file") and args.config_file != "":
- logger.info(
- "Contents of args.config_file={}:\n{}".format(
- args.config_file,
- _highlight(PathManager.open(args.config_file, "r").read(), args.config_file),
+ with PathManager.open(args.config_file, "r") as f:
+ logger.info(
+ "Contents of args.config_file={}:\n{}".format(
+ args.config_file,
+ _highlight(f.read(), args.config_file),
+ )
)
- )
if comm.is_main_process() and log_dir:
# Note: some of our scripts may expect the existence of
diff --git a/odise/engine/train_loop.py b/odise/engine/train_loop.py
index a147a02..abb7cef 100644
--- a/odise/engine/train_loop.py
+++ b/odise/engine/train_loop.py
@@ -18,12 +18,13 @@
import logging
import numpy as np
import time
+from math import inf
from typing import Iterable, Mapping, Union
+
import detectron2.utils.comm as comm
import torch
from detectron2.engine import SimpleTrainer as _SimpleTrainer
from detectron2.utils.events import get_event_storage
-from torch._six import inf
from torch.nn.parallel import DataParallel, DistributedDataParallel
from odise.utils.parameter_count import parameter_count_table
@@ -197,9 +198,10 @@ class NativeScalerWithGradNormCount:
state_dict_key = "amp_scaler"
def __init__(self):
- from torch.cuda.amp import GradScaler
+ from torch.amp import GradScaler
- self._scaler = GradScaler()
+ assert torch.cuda.is_available(), "AMPTrainer requires CUDA"
+ self._scaler = GradScaler('cuda')
def __call__(
self, loss, optimizer, clip_grad=None, parameters=None, create_graph=False, update_grad=True
@@ -263,7 +265,7 @@ def run_step(self):
"""
assert self.model.training, "[AMPTrainer] model was changed to eval mode!"
assert torch.cuda.is_available(), "[AMPTrainer] CUDA is required for AMP training!"
- from torch.cuda.amp import autocast
+ from torch.amp import autocast
start = time.perf_counter()
data = next(self._data_loader_iter)
@@ -277,7 +279,7 @@ def run_step(self):
data["runner_meta"] = dict()
data["runner_meta"]["iter"] = self.iter
data["runner_meta"]["max_iter"] = self.max_iter
- with autocast():
+ with autocast('cuda'):
loss_dict = self.model(data)
if isinstance(loss_dict, torch.Tensor):
losses = loss_dict
diff --git a/odise/evaluation/evaluator.py b/odise/evaluation/evaluator.py
index c89aa40..9139f80 100644
--- a/odise/evaluation/evaluator.py
+++ b/odise/evaluation/evaluator.py
@@ -72,10 +72,10 @@ def inference_on_dataset(
total_eval_time = 0
if use_amp and torch.cuda.is_available():
- from torch.cuda.amp import autocast
+ from torch.amp import autocast
+ amp_ctx = autocast('cuda')
else:
- # Use ExitStack as placeholder
- autocast = nullcontext
+ amp_ctx = nullcontext()
with ExitStack() as stack:
if isinstance(model, nn.Module):
@@ -93,7 +93,7 @@ def inference_on_dataset(
total_eval_time = 0
start_compute_time = time.perf_counter()
- with autocast():
+ with amp_ctx:
outputs = model(inputs)
if torch.cuda.is_available():
torch.cuda.synchronize()
diff --git a/odise/model_zoo/model_zoo.py b/odise/model_zoo/model_zoo.py
index 32c2edd..b0a672c 100644
--- a/odise/model_zoo/model_zoo.py
+++ b/odise/model_zoo/model_zoo.py
@@ -17,7 +17,7 @@
import logging
import os
from typing import Optional
-import pkg_resources
+from importlib import resources as importlib_resources
import torch
from detectron2.config import LazyConfig
@@ -86,9 +86,7 @@ def get_config_file(config_path):
Returns:
str: the real path to the config file.
"""
- cfg_file = pkg_resources.resource_filename(
- "odise.model_zoo", os.path.join("configs", config_path)
- )
+ cfg_file = str(importlib_resources.files("odise.model_zoo").joinpath("configs", config_path))
if not os.path.exists(cfg_file):
raise RuntimeError("{} not available in Model Zoo!".format(config_path))
return cfg_file
diff --git a/odise/modeling/diffusion/resample.py b/odise/modeling/diffusion/resample.py
index 3d86f92..a4daf29 100644
--- a/odise/modeling/diffusion/resample.py
+++ b/odise/modeling/diffusion/resample.py
@@ -140,7 +140,7 @@ def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001):
self.history_per_term = history_per_term
self.uniform_prob = uniform_prob
self._loss_history = np.zeros([diffusion.num_timesteps, history_per_term], dtype=np.float64)
- self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=np.int)
+ self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=np.int64)
def weights(self):
if not self._warmed_up():
diff --git a/odise/utils/collect_env.py b/odise/utils/collect_env.py
index 0d391fd..74a6457 100644
--- a/odise/utils/collect_env.py
+++ b/odise/utils/collect_env.py
@@ -22,14 +22,20 @@
from collections import defaultdict
import PIL
import torch
-import torchvision
+try:
+ import torchvision
+except Exception:
+ torchvision = None
from detectron2.utils.collect_env import (
collect_torch_env,
detect_compute_compatibility,
get_env_module,
test_nccl_ops,
)
-from tabulate import tabulate
+try:
+ from tabulate import tabulate
+except Exception:
+ tabulate = None
__all__ = ["collect_env_info"]
@@ -116,7 +122,7 @@ def collect_env_info():
try:
# this is how torch/utils/cpp_extensions.py choose compiler
cxx = os.environ.get("CXX", "c++")
- cxx = subprocess.check_output("'{}' --version".format(cxx), shell=True)
+ cxx = subprocess.check_output([cxx, "--version"])
cxx = cxx.decode("utf-8").strip().split("\n")[0]
except subprocess.SubprocessError:
cxx = "Not found"
@@ -125,7 +131,7 @@ def collect_env_info():
if has_cuda and CUDA_HOME is not None:
try:
nvcc = os.path.join(CUDA_HOME, "bin", "nvcc")
- nvcc = subprocess.check_output("'{}' -V".format(nvcc), shell=True)
+ nvcc = subprocess.check_output([nvcc, "-V"])
nvcc = nvcc.decode("utf-8").strip().split("\n")[-1]
except subprocess.SubprocessError:
nvcc = "Not found"
@@ -184,22 +190,25 @@ def collect_env_info():
data.append(("TORCH_CUDA_ARCH_LIST", cuda_arch_list))
data.append(("Pillow", PIL.__version__))
- try:
- data.append(
- (
- "torchvision",
- str(torchvision.__version__) + " @" + os.path.dirname(torchvision.__file__),
+ if torchvision is None:
+ data.append(("torchvision", "not found"))
+ else:
+ try:
+ data.append(
+ (
+ "torchvision",
+ str(torchvision.__version__) + " @" + os.path.dirname(torchvision.__file__),
+ )
)
- )
- if has_cuda:
- try:
- torchvision_C = importlib.util.find_spec("torchvision._C").origin
- msg = detect_compute_compatibility(CUDA_HOME, torchvision_C)
- data.append(("torchvision arch flags", msg))
- except (ImportError, AttributeError):
- data.append(("torchvision._C", "Not found"))
- except AttributeError:
- data.append(("torchvision", "unknown"))
+ if has_cuda:
+ try:
+ torchvision_C = importlib.util.find_spec("torchvision._C").origin
+ msg = detect_compute_compatibility(CUDA_HOME, torchvision_C)
+ data.append(("torchvision arch flags", msg))
+ except (ImportError, AttributeError):
+ data.append(("torchvision._C", "Not found"))
+ except AttributeError:
+ data.append(("torchvision", "unknown"))
try:
import fvcore
@@ -222,7 +231,10 @@ def collect_env_info():
except (ImportError, AttributeError):
data.append(("cv2", "Not found"))
- env_str = tabulate(data) + "\n"
+ if tabulate is None:
+ env_str = "\n".join(f"{k}: {v}" for k, v in data) + "\n"
+ else:
+ env_str = tabulate(data) + "\n"
env_str += collect_torch_env()
return env_str
diff --git a/requirements/constraints.txt b/requirements/constraints.txt
new file mode 100644
index 0000000..ab25323
--- /dev/null
+++ b/requirements/constraints.txt
@@ -0,0 +1,5 @@
+# Compatibility pins shared by ODISE + extensions.
+# Keep NumPy on the 1.x ABI line for this stack and pin timm per Python version.
+numpy<2.0
+timm==0.6.11; python_version < '3.11'
+timm==0.6.13; python_version >= '3.11'
diff --git a/scripts/bootstrap_third_party.py b/scripts/bootstrap_third_party.py
new file mode 100644
index 0000000..ae1d914
--- /dev/null
+++ b/scripts/bootstrap_third_party.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+
+"""Bootstrap optional third_party repositories for ODISE."""
+
+from __future__ import annotations
+
+import argparse
+import os
+import subprocess
+from pathlib import Path
+from typing import Dict
+
+
+THIRD_PARTY_ROOT_REPOS: Dict[str, str] = {
+ "latent-diffusion": "https://github.com/CompVis/latent-diffusion.git",
+ "taming-transformers": "https://github.com/CompVis/taming-transformers.git",
+}
+
+
+def _run(cmd, cwd=None):
+ subprocess.run(cmd, cwd=cwd, check=True)
+
+
+def _ensure_repo(name: str, destination: Path) -> None:
+ url = THIRD_PARTY_ROOT_REPOS[name]
+ marker = destination / ".git"
+
+ if destination.exists():
+ if not marker.exists():
+ raise RuntimeError(
+ f"{destination} already exists but is not a git repository. "
+ "Please move/rename it before retrying."
+ )
+ _run(["git", "-C", str(destination), "fetch", "--all"])
+ return
+
+ _run(["git", "clone", "--depth", "1", url, str(destination)])
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument(
+ "--root", default=".", help="Repository root where third_party/ lives (default: '.')"
+ )
+ parser.add_argument("--all", action="store_true", help="Clone all optional repos.")
+ parser.add_argument(
+ "--latent-diffusion",
+ action="store_true",
+ help="Clone optional latent-diffusion integration.",
+ )
+ parser.add_argument(
+ "--taming-transformers",
+ action="store_true",
+ help="Clone optional taming-transformers integration.",
+ )
+ parser.add_argument(
+ "--force",
+ action="store_true",
+ help="Refresh existing checkouts by fetching remotes.",
+ )
+
+ args = parser.parse_args()
+ root = Path(args.root).resolve()
+ third_party_root = root / "third_party"
+ third_party_root.mkdir(parents=True, exist_ok=True)
+ os.chdir(third_party_root)
+
+ selected = []
+ if args.all:
+ selected = sorted(THIRD_PARTY_ROOT_REPOS.keys())
+ else:
+ if args.latent_diffusion:
+ selected.append("latent-diffusion")
+ if args.taming_transformers:
+ selected.append("taming-transformers")
+
+ if not selected:
+ raise SystemExit(
+ "No repository selected. Use --all, --latent-diffusion, or --taming-transformers."
+ )
+
+ for repo in selected:
+ destination = third_party_root / repo
+ _ensure_repo(repo, destination)
+
+ if args.force:
+ for repo in selected:
+ destination = third_party_root / repo
+ if (destination / ".git").exists():
+ _run(["git", "-C", str(destination), "pull", "--ff-only"])
+
+ print("Bootstrap completed:", ", ".join(selected))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/setup.cfg b/setup.cfg
index 3314793..219ece5 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -12,7 +12,7 @@ sections=FUTURE,STDLIB,THIRDPARTY,myself,FIRSTPARTY,LOCALFOLDER
default_section=FIRSTPARTY
[mypy]
-python_version=3.6
+python_version=3.10
ignore_missing_imports = True
warn_unused_configs = True
disallow_untyped_defs = True
diff --git a/setup.py b/setup.py
index 794d47d..adb77e4 100644
--- a/setup.py
+++ b/setup.py
@@ -12,14 +12,25 @@
import glob
import os
+import warnings
import shutil
from os import path
from setuptools import find_packages, setup
from typing import List
-import torch
-torch_ver = [int(x) for x in torch.__version__.split(".")[:2]]
-assert torch_ver >= [1, 8], "Requires PyTorch >= 1.8"
+try:
+ import torch
+
+ torch_ver = [int(x) for x in torch.__version__.split(".")[:2]]
+ assert torch_ver >= [2, 0], "Requires PyTorch >= 2.0"
+except ImportError:
+ # keep installation possible in isolated environments where torch is installed later.
+ pass
+
+
+MASK2FORMER_PATH = path.abspath(
+ path.join(path.dirname(__file__), "third_party", "Mask2Former")
+)
def get_version():
@@ -62,6 +73,30 @@ def get_model_zoo_configs() -> List[str]:
return config_paths
+install_requires = [
+ "numpy<2.0",
+ "timm==0.6.11; python_version < '3.11'", # freeze timm version for stability
+ "timm==0.6.13; python_version >= '3.11'", # adjusted for Python 3.11 dataclass compatibility
+ "opencv-python==4.6.0.66",
+ "diffdist==0.1",
+ "nltk>=3.6.2",
+ "einops>=0.3.0",
+ "wandb>=0.12.11",
+ # "transformers==4.20.1", # freeze transformers version for stability
+ # omegaconf 2.2.1 introduced a backward-compatibility break
+ # see: https://github.com/omry/omegaconf/issues/939
+ "omegaconf>=2.3,<3",
+ "open-clip-torch==2.0.2",
+]
+
+if path.isdir(MASK2FORMER_PATH):
+ install_requires.append(f"mask2former @ file://localhost/{MASK2FORMER_PATH}")
+else:
+ warnings.warn(
+ "third_party/Mask2Former directory not found; skipping local mask2former dependency. "
+ "Set up this submodule before packaging if needed."
+ )
+
setup(
name="odise",
version=get_version(),
@@ -70,23 +105,14 @@ def get_model_zoo_configs() -> List[str]:
description="Open-vocabulary DIffusion-based Panoptic Segmentation",
packages=find_packages(exclude=("configs", "tests*")),
package_data={"odise.model_zoo": get_model_zoo_configs()},
- python_requires=">=3.8",
- install_requires=[
- "timm==0.6.11", # freeze timm version for stabliity
- "opencv-python==4.6.0.66",
- "diffdist==0.1",
- "nltk>=3.6.2",
- "einops>=0.3.0",
- "wandb>=0.12.11",
- # "transformers==4.20.1", # freeze transformers version for stabliity
- # there is BC breaking in omegaconf 2.2.1
- # see: https://github.com/omry/omegaconf/issues/939
- "omegaconf==2.1.1",
- "open-clip-torch==2.0.2",
- f"mask2former @ file://localhost/{os.getcwd()}/third_party/Mask2Former/",
- "stable-diffusion-sdkit==2.1.3",
- ],
+ python_requires=">=3.10",
+ install_requires=install_requires,
extras_require={
+ "sdkit": ["stable-diffusion-sdkit==2.1.3"],
+ "app": ["gradio>=4.44"],
+ "s3": [
+ "boto3",
+ ],
# dev dependencies. Install them by `pip install 'odise[dev]'`
"dev": [
"flake8==3.8.1",
diff --git a/third_party/Mask2Former/INSTALL.md b/third_party/Mask2Former/INSTALL.md
index e0bbead..9beeb14 100644
--- a/third_party/Mask2Former/INSTALL.md
+++ b/third_party/Mask2Former/INSTALL.md
@@ -1,13 +1,20 @@
## Installation
### Requirements
-- Linux or macOS with Python ≥ 3.6
-- PyTorch ≥ 1.9 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation.
- Install them together at [pytorch.org](https://pytorch.org) to make sure of this. Note, please check
- PyTorch version matches that is required by Detectron2.
+- Linux or macOS with Python ≥ 3.10.
+- PyTorch 2.x and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation.
+ Install them together at [pytorch.org](https://pytorch.org) to make sure of this.
- Detectron2: follow [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html).
-- OpenCV is optional but needed by demo and visualization
-- `pip install -r requirements.txt`
+- OpenCV is optional but needed by demo and visualization.
+
+Example setup (CPU-first):
+
+```bash
+uv venv .venv --python 3.10
+source .venv/bin/activate
+uv pip install --upgrade pip setuptools wheel
+uv pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu
+```
### CUDA kernel for MSDeformAttn
After preparing the required environment, run the following command to compile CUDA kernel for MSDeformAttn:
@@ -15,8 +22,8 @@ After preparing the required environment, run the following command to compile C
`CUDA_HOME` must be defined and points to the directory of the installed CUDA toolkit.
```bash
-cd mask2former/modeling/pixel_decoder/ops
-sh make.sh
+cd third_party/Mask2Former
+python setup.py build install
```
#### Building on another system
@@ -25,24 +32,21 @@ To build on a system that does not have a GPU device but provide the drivers:
TORCH_CUDA_ARCH_LIST='8.0' FORCE_CUDA=1 python setup.py build install
```
-### Example conda environment setup
+### Example environment setup
+```bash
+cd third_party/Mask2Former
+uv venv .venv --python 3.10
+source .venv/bin/activate
+uv pip install -e .
+python setup.py build install
+```
+
+To follow the CPU-first workflow used in this fork, install the CPU-only PyTorch wheels before the editable install:
+
```bash
-conda create --name mask2former python=3.8 -y
-conda activate mask2former
-conda install pytorch==1.9.0 torchvision==0.10.0 cudatoolkit=11.1 -c pytorch -c nvidia
-pip install -U opencv-python
-
-# under your working directory
-git clone git@github.com:facebookresearch/detectron2.git
-cd detectron2
-pip install -e .
-pip install git+https://github.com/cocodataset/panopticapi.git
-pip install git+https://github.com/mcordts/cityscapesScripts.git
-
-cd ..
-git clone git@github.com:facebookresearch/Mask2Former.git
-cd Mask2Former
-pip install -r requirements.txt
-cd mask2former/modeling/pixel_decoder/ops
-sh make.sh
+uv venv .venv --python 3.10
+source .venv/bin/activate
+uv pip install --upgrade pip setuptools wheel
+uv pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu
+uv pip install -e .
```
diff --git a/third_party/Mask2Former/cog.yaml b/third_party/Mask2Former/cog.yaml
index 4476c3a..4c03f02 100644
--- a/third_party/Mask2Former/cog.yaml
+++ b/third_party/Mask2Former/cog.yaml
@@ -22,7 +22,6 @@ build:
- pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html
- pip install git+https://github.com/cocodataset/panopticapi.git
- pip install git+https://github.com/mcordts/cityscapesScripts.git
- - git clone https://github.com/facebookresearch/Mask2Former
- - TORCH_CUDA_ARCH_LIST='7.5' FORCE_CUDA=1 python Mask2Former/mask2former/modeling/pixel_decoder/ops/setup.py build install
+ - cd third_party/Mask2Former && TORCH_CUDA_ARCH_LIST='7.5' FORCE_CUDA=1 python setup.py build install
predict: "predict.py:Predictor"
diff --git a/third_party/Mask2Former/demo_video/demo.py b/third_party/Mask2Former/demo_video/demo.py
index 7f30def..6d89d5b 100644
--- a/third_party/Mask2Former/demo_video/demo.py
+++ b/third_party/Mask2Former/demo_video/demo.py
@@ -18,7 +18,7 @@
import numpy as np
import tqdm
-from torch.cuda.amp import autocast
+from torch.amp import autocast
from detectron2.config import get_cfg
from detectron2.data.detection_utils import read_image
@@ -131,7 +131,7 @@ def test_opencv_video_format(codec, file_ext):
vid_frames.append(img)
start_time = time.time()
- with autocast():
+ with autocast('cuda'):
predictions, visualized_output = demo.run_on_video(vid_frames)
logger.info(
"detected {} instances per frame in {:.2f}s".format(
@@ -168,7 +168,7 @@ def test_opencv_video_format(codec, file_ext):
break
start_time = time.time()
- with autocast():
+ with autocast('cuda'):
predictions, visualized_output = demo.run_on_video(vid_frames)
logger.info(
"detected {} instances per frame in {:.2f}s".format(
diff --git a/third_party/Mask2Former/mask2former/modeling/backbone/swin.py b/third_party/Mask2Former/mask2former/modeling/backbone/swin.py
index 3b099d8..ab17036 100644
--- a/third_party/Mask2Former/mask2former/modeling/backbone/swin.py
+++ b/third_party/Mask2Former/mask2former/modeling/backbone/swin.py
@@ -110,7 +110,7 @@ def __init__(
# get pair-wise relative position index for each token inside the window
coords_h = torch.arange(self.window_size[0])
coords_w = torch.arange(self.window_size[1])
- coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w], indexing='ij')) # 2, Wh, Ww
coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
@@ -442,7 +442,7 @@ def forward(self, x, H, W):
for blk in self.blocks:
blk.H, blk.W = H, W
if self.use_checkpoint:
- x = checkpoint.checkpoint(blk, x, attn_mask)
+ x = checkpoint.checkpoint(blk, x, attn_mask, use_reentrant=False)
else:
x = blk(x, attn_mask)
if self.downsample is not None:
diff --git a/third_party/Mask2Former/mask2former/modeling/matcher.py b/third_party/Mask2Former/mask2former/modeling/matcher.py
index 7c6af7f..c1f9b25 100644
--- a/third_party/Mask2Former/mask2former/modeling/matcher.py
+++ b/third_party/Mask2Former/mask2former/modeling/matcher.py
@@ -7,7 +7,7 @@
import torch.nn.functional as F
from scipy.optimize import linear_sum_assignment
from torch import nn
-from torch.cuda.amp import autocast
+from torch.amp import autocast
from detectron2.projects.point_rend.point_features import point_sample
@@ -131,7 +131,7 @@ def memory_efficient_forward(self, outputs, targets):
align_corners=False,
).squeeze(1)
- with autocast(enabled=False):
+ with autocast('cuda', enabled=False):
out_mask = out_mask.float()
tgt_mask = tgt_mask.float()
# Compute the focal loss between masks
diff --git a/third_party/Mask2Former/mask2former/modeling/pixel_decoder/fpn.py b/third_party/Mask2Former/mask2former/modeling/pixel_decoder/fpn.py
index 7df65a1..4b77ce0 100644
--- a/third_party/Mask2Former/mask2former/modeling/pixel_decoder/fpn.py
+++ b/third_party/Mask2Former/mask2former/modeling/pixel_decoder/fpn.py
@@ -8,7 +8,7 @@
from torch import nn
from torch.nn import functional as F
from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_
-from torch.cuda.amp import autocast
+from torch.amp import autocast
from detectron2.config import configurable
from detectron2.layers import Conv2d, DeformConv, ShapeSpec, get_norm
diff --git a/third_party/Mask2Former/mask2former/modeling/pixel_decoder/msdeformattn.py b/third_party/Mask2Former/mask2former/modeling/pixel_decoder/msdeformattn.py
index 26c9f57..3f955ef 100644
--- a/third_party/Mask2Former/mask2former/modeling/pixel_decoder/msdeformattn.py
+++ b/third_party/Mask2Former/mask2former/modeling/pixel_decoder/msdeformattn.py
@@ -8,7 +8,7 @@
from torch import nn
from torch.nn import functional as F
from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_
-from torch.cuda.amp import autocast
+from torch.amp import autocast
from detectron2.config import configurable
from detectron2.layers import Conv2d, ShapeSpec, get_norm
@@ -312,7 +312,7 @@ def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
return ret
def forward_features(self, features):
- with autocast(enabled=not self.training and torch.is_autocast_enabled()):
+ with autocast('cuda', enabled=False):
srcs = []
pos = []
# Reverse feature maps into top-down order (from low to high resolution)
diff --git a/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py b/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py
index 47b531e..34224fe 100644
--- a/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py
+++ b/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py
@@ -23,8 +23,8 @@
except ModuleNotFoundError as e:
info_string = (
"\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n"
- "\t`cd mask2former/modeling/pixel_decoder/ops`\n"
- "\t`sh make.sh`\n"
+ "\t`cd third_party/Mask2Former`\n"
+ "\t`python setup.py build install`\n"
)
MSDA = None
diff --git a/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/modules/ms_deform_attn.py b/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/modules/ms_deform_attn.py
index e7b4c42..e65205b 100644
--- a/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/modules/ms_deform_attn.py
+++ b/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/modules/ms_deform_attn.py
@@ -80,7 +80,7 @@ def _reset_parameters(self):
constant_(self.output_proj.bias.data, 0.)
def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
- """
+ r"""
:param query (N, Length_{query}, C)
:param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
@@ -116,7 +116,7 @@ def forward(self, query, reference_points, input_flatten, input_spatial_shapes,
try:
output = MSDeformAttnFunction.apply(
value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
- except:
+ except Exception:
# CPU
output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights)
# # For FLOPs calculation only
diff --git a/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp b/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp
index 48757e2..7d24675 100644
--- a/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp
+++ b/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp
@@ -28,7 +28,7 @@ ms_deform_attn_cpu_forward(
const at::Tensor &attn_weight,
const int im2col_step)
{
- AT_ERROR("Not implement on cpu");
+ TORCH_CHECK(false, "Not implement on cpu");
}
std::vector
@@ -41,6 +41,6 @@ ms_deform_attn_cpu_backward(
const at::Tensor &grad_output,
const int im2col_step)
{
- AT_ERROR("Not implement on cpu");
+ TORCH_CHECK(false, "Not implement on cpu");
}
diff --git a/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu b/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu
index 0c465da..626cc1b 100644
--- a/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu
+++ b/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu
@@ -30,17 +30,17 @@ at::Tensor ms_deform_attn_cuda_forward(
const at::Tensor &attn_weight,
const int im2col_step)
{
- AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
- AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
- AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
- AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
- AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
-
- AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
- AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
- AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
- AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
- AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
+ TORCH_CHECK(value.is_contiguous(), "value tensor has to be contiguous");
+ TORCH_CHECK(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
+ TORCH_CHECK(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
+ TORCH_CHECK(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+ TORCH_CHECK(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+
+ TORCH_CHECK(value.is_cuda(), "value must be a CUDA tensor");
+ TORCH_CHECK(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
+ TORCH_CHECK(level_start_index.is_cuda(), "level_start_index must be a CUDA tensor");
+ TORCH_CHECK(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
+ TORCH_CHECK(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
const int batch = value.size(0);
const int spatial_size = value.size(1);
@@ -54,34 +54,57 @@ at::Tensor ms_deform_attn_cuda_forward(
const int im2col_step_ = std::min(batch, im2col_step);
- AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
+ TORCH_CHECK(
+ batch % im2col_step_ == 0,
+ "batch(",
+ batch,
+ ") must divide im2col_step(",
+ im2col_step_,
+ ")"
+ );
- auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
-
- const int batch_n = im2col_step_;
- auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
- auto per_value_size = spatial_size * num_heads * channels;
- auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
- auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
- for (int n = 0; n < batch/im2col_step_; ++n)
- {
- auto columns = output_n.select(0, n);
- AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] {
- ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
- value.data() + n * im2col_step_ * per_value_size,
- spatial_shapes.data(),
- level_start_index.data(),
- sampling_loc.data() + n * im2col_step_ * per_sample_loc_size,
- attn_weight.data() + n * im2col_step_ * per_attn_weight_size,
+ auto dispatch_forward_impl = [&](auto scalar_type_tag) {
+ using scalar_t = decltype(scalar_type_tag);
+
+ auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
+
+ const int batch_n = im2col_step_;
+ auto output_n = output.view({batch / im2col_step_, batch_n, num_query, num_heads, channels});
+ auto per_value_size = spatial_size * num_heads * channels;
+ auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
+ auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
+ for (int n = 0; n < batch / im2col_step_; ++n)
+ {
+ auto columns = output_n.select(0, n);
+ ms_deformable_im2col_cuda(
+ at::cuda::getCurrentCUDAStream().stream(),
+ value.data_ptr() + n * im2col_step_ * per_value_size,
+ spatial_shapes.data_ptr(),
+ level_start_index.data_ptr(),
+ sampling_loc.data_ptr() + n * im2col_step_ * per_sample_loc_size,
+ attn_weight.data_ptr() + n * im2col_step_ * per_attn_weight_size,
batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
- columns.data());
+ columns.data_ptr());
+ }
- }));
- }
+ return output.view({batch, num_query, num_heads * channels});
+ };
- output = output.view({batch, num_query, num_heads*channels});
+ switch (value.scalar_type())
+ {
+ case at::ScalarType::Float:
+ return dispatch_forward_impl(float());
+ case at::ScalarType::Double:
+ return dispatch_forward_impl(double());
+ case at::ScalarType::Half:
+ return dispatch_forward_impl(at::Half());
+ case at::ScalarType::BFloat16:
+ return dispatch_forward_impl(at::BFloat16());
+ default:
+ TORCH_CHECK(false, "ms_deform_attn_cuda_forward supports only float, double, half, bfloat16");
+ }
- return output;
+ throw std::runtime_error("Unsupported dtype for ms_deform_attn_cuda_forward");
}
@@ -95,19 +118,19 @@ std::vector ms_deform_attn_cuda_backward(
const int im2col_step)
{
- AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
- AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
- AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
- AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
- AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
- AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
+ TORCH_CHECK(value.is_contiguous(), "value tensor has to be contiguous");
+ TORCH_CHECK(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
+ TORCH_CHECK(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
+ TORCH_CHECK(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
+ TORCH_CHECK(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
+ TORCH_CHECK(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
- AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
- AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
- AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
- AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
- AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
- AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
+ TORCH_CHECK(value.is_cuda(), "value must be a CUDA tensor");
+ TORCH_CHECK(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
+ TORCH_CHECK(level_start_index.is_cuda(), "level_start_index must be a CUDA tensor");
+ TORCH_CHECK(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
+ TORCH_CHECK(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
+ TORCH_CHECK(grad_output.is_cuda(), "grad_output must be a CUDA tensor");
const int batch = value.size(0);
const int spatial_size = value.size(1);
@@ -121,38 +144,61 @@ std::vector ms_deform_attn_cuda_backward(
const int im2col_step_ = std::min(batch, im2col_step);
- AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
+ TORCH_CHECK(
+ batch % im2col_step_ == 0,
+ "batch(",
+ batch,
+ ") must divide im2col_step(",
+ im2col_step_,
+ ")"
+ );
+
+ auto dispatch_backward_impl = [&](auto scalar_type_tag) {
+ using scalar_t = decltype(scalar_type_tag);
+
+ auto grad_value = at::zeros_like(value);
+ auto grad_sampling_loc = at::zeros_like(sampling_loc);
+ auto grad_attn_weight = at::zeros_like(attn_weight);
+
+ const int batch_n = im2col_step_;
+ auto per_value_size = spatial_size * num_heads * channels;
+ auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
+ auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
+ auto grad_output_n = grad_output.view({batch / im2col_step_, batch_n, num_query, num_heads, channels});
+
+ for (int n = 0; n < batch / im2col_step_; ++n)
+ {
+ auto grad_output_g = grad_output_n.select(0, n);
+ ms_deformable_col2im_cuda(
+ at::cuda::getCurrentCUDAStream().stream(),
+ grad_output_g.data_ptr(),
+ value.data_ptr() + n * im2col_step_ * per_value_size,
+ spatial_shapes.data_ptr(),
+ level_start_index.data_ptr(),
+ sampling_loc.data_ptr() + n * im2col_step_ * per_sample_loc_size,
+ attn_weight.data_ptr() + n * im2col_step_ * per_attn_weight_size,
+ batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
+ grad_value.data_ptr() + n * im2col_step_ * per_value_size,
+ grad_sampling_loc.data_ptr() + n * im2col_step_ * per_sample_loc_size,
+ grad_attn_weight.data_ptr() + n * im2col_step_ * per_attn_weight_size);
+ }
- auto grad_value = at::zeros_like(value);
- auto grad_sampling_loc = at::zeros_like(sampling_loc);
- auto grad_attn_weight = at::zeros_like(attn_weight);
+ return std::vector({grad_value, grad_sampling_loc, grad_attn_weight});
+ };
- const int batch_n = im2col_step_;
- auto per_value_size = spatial_size * num_heads * channels;
- auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
- auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
- auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
-
- for (int n = 0; n < batch/im2col_step_; ++n)
+ switch (value.scalar_type())
{
- auto grad_output_g = grad_output_n.select(0, n);
- AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] {
- ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
- grad_output_g.data(),
- value.data() + n * im2col_step_ * per_value_size,
- spatial_shapes.data(),
- level_start_index.data(),
- sampling_loc.data() + n * im2col_step_ * per_sample_loc_size,
- attn_weight.data() + n * im2col_step_ * per_attn_weight_size,
- batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
- grad_value.data() + n * im2col_step_ * per_value_size,
- grad_sampling_loc.data() + n * im2col_step_ * per_sample_loc_size,
- grad_attn_weight.data() + n * im2col_step_ * per_attn_weight_size);
-
- }));
+ case at::ScalarType::Float:
+ return dispatch_backward_impl(float());
+ case at::ScalarType::Double:
+ return dispatch_backward_impl(double());
+ case at::ScalarType::Half:
+ return dispatch_backward_impl(at::Half());
+ case at::ScalarType::BFloat16:
+ return dispatch_backward_impl(at::BFloat16());
+ default:
+ TORCH_CHECK(false, "ms_deform_attn_cuda_backward supports only float, double, half, bfloat16");
}
- return {
- grad_value, grad_sampling_loc, grad_attn_weight
- };
-}
\ No newline at end of file
+ throw std::runtime_error("Unsupported dtype for ms_deform_attn_cuda_backward");
+}
diff --git a/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_im2col_cuda.cuh b/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_im2col_cuda.cuh
index c04e0d4..cf50ce0 100644
--- a/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_im2col_cuda.cuh
+++ b/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/cuda/ms_deform_im2col_cuda.cuh
@@ -20,8 +20,46 @@
#include
#include
+#include
+#include
-#include
+// Generic wrapper: forward to atomicAdd for float/double
+template
+__device__ __forceinline__ void gpuAtomicAdd(scalar_t* address, scalar_t val) {
+ atomicAdd(address, val);
+}
+
+// Specialization for c10::Half — reinterpret as __half and use the native half atomicAdd
+template <>
+__device__ __forceinline__ void gpuAtomicAdd(c10::Half* address, c10::Half val) {
+ atomicAdd(reinterpret_cast<__half*>(address), static_cast<__half>(val));
+}
+
+// Specialization for c10::BFloat16 — native atomicAdd on sm_80+, otherwise a 32-bit CAS loop doing the add in float
+template <>
+__device__ __forceinline__ void gpuAtomicAdd(c10::BFloat16* address, c10::BFloat16 val) {
+#if __CUDA_ARCH__ >= 800
+ atomicAdd(reinterpret_cast<__nv_bfloat16*>(address), static_cast<__nv_bfloat16>(val));
+#else
+ // Fallback: CAS loop via float
+ unsigned int* address_as_uint = reinterpret_cast(
+ reinterpret_cast(address) - (reinterpret_cast(address) & 2));
+ unsigned int old = *address_as_uint;
+ unsigned int assumed;
+ bool is_upper = (reinterpret_cast(address) & 2);
+ do {
+ assumed = old;
+ unsigned short raw = is_upper ? (old >> 16) : (old & 0xFFFF);
+ __nv_bfloat16 bf_val = *reinterpret_cast<__nv_bfloat16*>(&raw);
+ float sum = __bfloat162float(bf_val) + static_cast(val);
+ __nv_bfloat16 new_bf = __float2bfloat16(sum);
+ unsigned short new_raw = *reinterpret_cast(&new_bf);
+ unsigned int new_val = is_upper ? ((old & 0xFFFF) | (new_raw << 16))
+ : ((old & 0xFFFF0000) | new_raw);
+ old = atomicCAS(address_as_uint, assumed, new_val);
+ } while (old != assumed);
+#endif
+}
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
@@ -127,7 +165,7 @@ __device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data,
v1 = bottom_data[ptr1];
grad_h_weight -= hw * v1;
grad_w_weight -= hh * v1;
- atomicAdd(grad_value+ptr1, w1*top_grad_value);
+ gpuAtomicAdd(grad_value+ptr1, w1*top_grad_value);
}
scalar_t v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
@@ -136,7 +174,7 @@ __device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data,
v2 = bottom_data[ptr2];
grad_h_weight -= lw * v2;
grad_w_weight += hh * v2;
- atomicAdd(grad_value+ptr2, w2*top_grad_value);
+ gpuAtomicAdd(grad_value+ptr2, w2*top_grad_value);
}
scalar_t v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
@@ -145,7 +183,7 @@ __device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data,
v3 = bottom_data[ptr3];
grad_h_weight += hw * v3;
grad_w_weight -= lh * v3;
- atomicAdd(grad_value+ptr3, w3*top_grad_value);
+ gpuAtomicAdd(grad_value+ptr3, w3*top_grad_value);
}
scalar_t v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
@@ -154,7 +192,7 @@ __device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data,
v4 = bottom_data[ptr4];
grad_h_weight += lw * v4;
grad_w_weight += lh * v4;
- atomicAdd(grad_value+ptr4, w4*top_grad_value);
+ gpuAtomicAdd(grad_value+ptr4, w4*top_grad_value);
}
const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
@@ -202,7 +240,7 @@ __device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data,
v1 = bottom_data[ptr1];
grad_h_weight -= hw * v1;
grad_w_weight -= hh * v1;
- atomicAdd(grad_value+ptr1, w1*top_grad_value);
+ gpuAtomicAdd(grad_value+ptr1, w1*top_grad_value);
}
scalar_t v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
@@ -211,7 +249,7 @@ __device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data,
v2 = bottom_data[ptr2];
grad_h_weight -= lw * v2;
grad_w_weight += hh * v2;
- atomicAdd(grad_value+ptr2, w2*top_grad_value);
+ gpuAtomicAdd(grad_value+ptr2, w2*top_grad_value);
}
scalar_t v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
@@ -220,7 +258,7 @@ __device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data,
v3 = bottom_data[ptr3];
grad_h_weight += hw * v3;
grad_w_weight -= lh * v3;
- atomicAdd(grad_value+ptr3, w3*top_grad_value);
+ gpuAtomicAdd(grad_value+ptr3, w3*top_grad_value);
}
scalar_t v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
@@ -229,13 +267,13 @@ __device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data,
v4 = bottom_data[ptr4];
grad_h_weight += lw * v4;
grad_w_weight += lh * v4;
- atomicAdd(grad_value+ptr4, w4*top_grad_value);
+ gpuAtomicAdd(grad_value+ptr4, w4*top_grad_value);
}
const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
- atomicAdd(grad_attn_weight, top_grad * val);
- atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);
- atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);
+ gpuAtomicAdd(grad_attn_weight, top_grad * val);
+ gpuAtomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);
+ gpuAtomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);
}
@@ -831,9 +869,9 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const
if (tid == 0)
{
- atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]);
- atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]);
- atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]);
+ gpuAtomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]);
+ gpuAtomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]);
+ gpuAtomicAdd(grad_attn_weight, cache_grad_attn_weight[0]);
}
__syncthreads();
@@ -1329,4 +1367,4 @@ void ms_deformable_col2im_cuda(cudaStream_t stream,
printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
}
-}
\ No newline at end of file
+}
diff --git a/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/ms_deform_attn.h b/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/ms_deform_attn.h
index 2f80a1b..06d73f8 100644
--- a/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/ms_deform_attn.h
+++ b/third_party/Mask2Former/mask2former/modeling/pixel_decoder/ops/src/ms_deform_attn.h
@@ -31,16 +31,21 @@ ms_deform_attn_forward(
const at::Tensor &attn_weight,
const int im2col_step)
{
- if (value.type().is_cuda())
+ if (value.is_cuda())
{
#ifdef WITH_CUDA
return ms_deform_attn_cuda_forward(
value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
#else
- AT_ERROR("Not compiled with GPU support");
+ TORCH_CHECK(false, "Not compiled with GPU support");
#endif
}
- AT_ERROR("Not implemented on the CPU");
+ if (value.is_cpu())
+ {
+ return ms_deform_attn_cpu_forward(
+ value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
+ }
+ TORCH_CHECK(false, "Unsupported device type");
}
std::vector
@@ -53,15 +58,25 @@ ms_deform_attn_backward(
const at::Tensor &grad_output,
const int im2col_step)
{
- if (value.type().is_cuda())
+ if (value.is_cuda())
{
#ifdef WITH_CUDA
return ms_deform_attn_cuda_backward(
value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
#else
- AT_ERROR("Not compiled with GPU support");
+ TORCH_CHECK(false, "Not compiled with GPU support");
#endif
}
- AT_ERROR("Not implemented on the CPU");
+ if (value.is_cpu())
+ {
+ return ms_deform_attn_cpu_backward(
+ value,
+ spatial_shapes,
+ level_start_index,
+ sampling_loc,
+ attn_weight,
+ grad_output,
+ im2col_step);
+ }
+ TORCH_CHECK(false, "Unsupported device type");
}
-
diff --git a/third_party/Mask2Former/mask2former_video/data_video/datasets/ytvis_api/ytvoseval.py b/third_party/Mask2Former/mask2former_video/data_video/datasets/ytvis_api/ytvoseval.py
index f2cb8be..20b613c 100644
--- a/third_party/Mask2Former/mask2former_video/data_video/datasets/ytvis_api/ytvoseval.py
+++ b/third_party/Mask2Former/mask2former_video/data_video/datasets/ytvis_api/ytvoseval.py
@@ -407,8 +407,8 @@ def accumulate(self, p = None):
tps = np.logical_and( dtm, np.logical_not(dtIg) )
fps = np.logical_and(np.logical_not(dtm), np.logical_not(dtIg) )
- tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float)
- fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float)
+ tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float64)
+ fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float64)
for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)):
tp = np.array(tp)
fp = np.array(fp)
@@ -548,8 +548,8 @@ def setKpParams(self):
self.vidIds = []
self.catIds = []
# np.arange causes trouble. the data point on arange is slightly larger than the true value
- self.iouThrs = np.linspace(.5, 0.95, np.round((0.95 - .5) / .05) + 1, endpoint=True)
- self.recThrs = np.linspace(.0, 1.00, np.round((1.00 - .0) / .01) + 1, endpoint=True)
+ self.iouThrs = np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
+ self.recThrs = np.linspace(.0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True)
self.maxDets = [20]
self.areaRng = [[0 ** 2, 1e5 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]]
self.areaRngLbl = ['all', 'medium', 'large']
diff --git a/third_party/Mask2Former/mask2former_video/modeling/matcher.py b/third_party/Mask2Former/mask2former_video/modeling/matcher.py
index 642f360..fe231aa 100644
--- a/third_party/Mask2Former/mask2former_video/modeling/matcher.py
+++ b/third_party/Mask2Former/mask2former_video/modeling/matcher.py
@@ -7,7 +7,7 @@
import torch.nn.functional as F
from scipy.optimize import linear_sum_assignment
from torch import nn
-from torch.cuda.amp import autocast
+from torch.amp import autocast
from detectron2.projects.point_rend.point_features import point_sample
@@ -131,7 +131,7 @@ def memory_efficient_forward(self, outputs, targets):
align_corners=False,
).flatten(1)
- with autocast(enabled=False):
+ with autocast('cuda', enabled=False):
out_mask = out_mask.float()
tgt_mask = tgt_mask.float()
# Compute the focal loss between masks
diff --git a/third_party/Mask2Former/mask2former_video/utils/memory.py b/third_party/Mask2Former/mask2former_video/utils/memory.py
index 7ee5f15..e9ed0c4 100644
--- a/third_party/Mask2Former/mask2former_video/utils/memory.py
+++ b/third_party/Mask2Former/mask2former_video/utils/memory.py
@@ -4,8 +4,6 @@
from contextlib import contextmanager
from functools import wraps
import torch
-from torch.cuda.amp import autocast
-
__all__ = ["retry_if_cuda_oom"]
@@ -74,7 +72,6 @@ def wrapped(*args, **kwargs):
logger.info("Attempting to copy inputs to CPU due to CUDA OOM")
new_args = (maybe_to_cpu(x) for x in args)
new_kwargs = {k: maybe_to_cpu(v) for k, v in kwargs.items()}
- with autocast(enabled=False):
- return func(*new_args, **new_kwargs)
+ return func(*new_args, **new_kwargs)
return wrapped
diff --git a/third_party/Mask2Former/setup.py b/third_party/Mask2Former/setup.py
index 399dfbb..9b5b236 100644
--- a/third_party/Mask2Former/setup.py
+++ b/third_party/Mask2Former/setup.py
@@ -46,9 +46,6 @@ def get_extensions():
define_macros += [("WITH_CUDA", None)]
extra_compile_args["nvcc"] = [
"-DCUDA_HAS_FP16=1",
- "-D__CUDA_NO_HALF_OPERATORS__",
- "-D__CUDA_NO_HALF_CONVERSIONS__",
- "-D__CUDA_NO_HALF2_OPERATORS__",
]
else:
if CUDA_HOME is None:
@@ -83,13 +80,13 @@ def get_extensions():
packages=find_packages(exclude=("configs", "tests*")),
python_requires=">=3.6",
install_requires=[
- "detectron2 @ https://github.com/facebookresearch/detectron2/archive/v0.6.zip",
+ "detectron2",
"scipy>=1.7.3",
"boto3>=1.21.25",
- "hydra-core==1.1.1",
- # there is BC breaking in omegaconf 2.2.1
- # see: https://github.com/omry/omegaconf/issues/939
- "omegaconf==2.1.1",
+ # Hydra <1.3 breaks on Python 3.11 due to mutable dataclass defaults.
+ "hydra-core>=1.3,<3",
+ # omegaconf 2.2.1 introduced a backward-incompatible change; require a later stable release line.
+ "omegaconf>=2.3,<3",
"panopticapi @ https://github.com/cocodataset/panopticapi/archive/master.zip",
"lvis @ https://github.com/lvis-dataset/lvis-api/archive/master.zip",
],
diff --git a/third_party/Mask2Former/train_net_video.py b/third_party/Mask2Former/train_net_video.py
index 2d22345..db41c2b 100644
--- a/third_party/Mask2Former/train_net_video.py
+++ b/third_party/Mask2Former/train_net_video.py
@@ -195,7 +195,7 @@ def test(cls, cfg, model, evaluators=None):
Returns:
dict: a dict of result metrics
"""
- from torch.cuda.amp import autocast
+ from torch.amp import autocast
logger = logging.getLogger(__name__)
if isinstance(evaluators, DatasetEvaluator):
evaluators = [evaluators]
@@ -221,7 +221,7 @@ def test(cls, cfg, model, evaluators=None):
)
results[dataset_name] = {}
continue
- with autocast():
+ with autocast('cuda'):
results_i = inference_on_dataset(model, data_loader, evaluator)
results[dataset_name] = results_i
if comm.is_main_process():
diff --git a/tools/bootstrap_third_party.sh b/tools/bootstrap_third_party.sh
new file mode 100755
index 0000000..7960030
--- /dev/null
+++ b/tools/bootstrap_third_party.sh
@@ -0,0 +1,71 @@
+#!/usr/bin/env bash
+# Fetch the third_party checkouts (latent-diffusion, taming-transformers) as git
+# submodules, or as shallow clones when run outside a git work tree.
+# Usage: tools/bootstrap_third_party.sh [--force]
+set -euo pipefail
+
+FORCE_REINIT=false
+if [[ "${1-}" == "--force" ]]; then
+  FORCE_REINIT=true
+fi
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+cd "$ROOT_DIR"
+
+USE_CLONE_FALLBACK=true
+if git rev-parse --is-inside-work-tree >/dev/null 2>&1; then
+  USE_CLONE_FALLBACK=false
+fi
+
+# boot_dep NAME URL PATH: make sure PATH holds a checkout of URL.
+boot_dep() {
+  local name="$1"
+  local url="$2"
+  local path="$3"
+
+  if [ -f "$path/.git" ]; then
+    echo "[odise] $name already initialized as submodule link ($path)"
+    return
+  fi
+
+  if [ -d "$path/.git" ]; then
+    if [ "$FORCE_REINIT" = "true" ]; then
+      echo "[odise] Replacing nested git checkout at $path with submodule/clone..."
+      rm -rf "$path"
+    else
+      echo "[odise] $name already has a nested git checkout ($path/.git)."
+      echo "[odise] Keeping as-is; remove that directory and rerun this script for a clean submodule checkout."
+      return
+    fi
+  fi
+
+  # An empty directory is what an uninitialized submodule leaves behind, so only a
+  # directory that already has content is left untouched.
+  if [ -d "$path" ] && [ -n "$(ls -A "$path")" ]; then
+    echo "[odise] $name directory exists without git metadata; skipping auto-bootstrap."
+    echo "[odise] Ensure this directory comes from a clean git checkout before running installs that depend on it."
+    return
+  fi
+
+  if [ "$USE_CLONE_FALLBACK" = "true" ]; then
+    echo "[odise] Cloning $name (non-git context)..."
+    git clone --depth 1 "$url" "$path"
+  else
+    echo "[odise] Adding $name as submodule..."
+    git submodule add --depth 1 "$url" "$path" || git submodule update --init --recursive "$path"
+  fi
+}
+
+boot_dep "latent-diffusion" "https://github.com/CompVis/latent-diffusion.git" "third_party/latent-diffusion"
+boot_dep "taming-transformers" "https://github.com/CompVis/taming-transformers.git" "third_party/taming-transformers"
+
+if [ "$USE_CLONE_FALLBACK" = "false" ]; then
+  # Best-effort refresh: report a failure instead of claiming success.
+  if git submodule update --init --recursive third_party/latent-diffusion third_party/taming-transformers; then
+    echo "[odise] Submodule records refreshed."
+  else
+    echo "[odise] WARNING: git submodule update failed; third_party checkouts may be incomplete." >&2
+  fi
+fi
+
+echo "[odise] Third_party bootstrap complete."
diff --git a/tools/extract_features.py b/tools/extract_features.py
new file mode 100644
index 0000000..7faac5d
--- /dev/null
+++ b/tools/extract_features.py
@@ -0,0 +1,286 @@
+#!/usr/bin/env python
+#
+# ------------------------------------------------------------------------------
+# Copyright (c) NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# This work is made available under the Nvidia Source Code License.
+# ------------------------------------------------------------------------------
+
+import argparse
+import os
+import os.path as osp
+import sys
+from contextlib import nullcontext
+from typing import Dict, List, Optional
+
+# Put the vendored third_party checkouts on sys.path when their directories exist.
+PROJECT_ROOT = osp.dirname(osp.dirname(osp.abspath(__file__)))
+MASK2FORMER_PATH = osp.join(PROJECT_ROOT, "third_party", "Mask2Former")
+if osp.isdir(MASK2FORMER_PATH) and MASK2FORMER_PATH not in sys.path:
+    sys.path.insert(0, MASK2FORMER_PATH)
+LATENT_DIFFUSION_PATH = osp.join(PROJECT_ROOT, "third_party", "latent-diffusion")
+if osp.isdir(LATENT_DIFFUSION_PATH) and LATENT_DIFFUSION_PATH not in sys.path:
+    sys.path.insert(0, LATENT_DIFFUSION_PATH)
+TAMING_TRANSFORMERS_PATH = osp.join(PROJECT_ROOT, "third_party", "taming-transformers")
+if osp.isdir(TAMING_TRANSFORMERS_PATH) and TAMING_TRANSFORMERS_PATH not in sys.path:
+    sys.path.insert(0, TAMING_TRANSFORMERS_PATH)
+
+import torch
+from detectron2.config import LazyConfig, instantiate
+from detectron2.engine import create_ddp_model, default_argument_parser, launch
+from detectron2.structures import ImageList
+from detectron2.utils import comm
+from detectron2.utils.file_io import PathManager
+from detectron2.utils.logger import setup_logger
+
+from odise.checkpoint import ODISECheckpointer
+from odise.config import auto_scale_workers, instantiate_odise
+from odise.engine.defaults import default_setup, get_model_from_module
+
+
+def _resolve_cfg_entry(cfg, dotted_key: str):
+    """Return the config node reached by following ``dotted_key`` attribute by attribute.
+
+    Raises:
+        ValueError: if a component of ``dotted_key`` is not an attribute of its parent.
+    """
+    target = cfg
+    for part in dotted_key.split("."):
+        if not hasattr(target, part):
+            raise ValueError(f"Cannot find config entry '{dotted_key}' at '{part}'.")
+        target = getattr(target, part)
+    return target
+
+
+def _safe_image_id(sample: Dict, fallback: int) -> str:
+    """Return a file-name-safe identifier for ``sample``.
+
+    The first non-``None`` value among ``image_id``, ``id`` and ``file_name`` is used,
+    ``sample_<fallback>`` otherwise; "/" is replaced by "_" in the chosen value.
+    """
+    for key in ("image_id", "id", "file_name"):
+        value = sample.get(key)
+        if value is not None:
+            return str(value).replace("/", "_")
+    return f"sample_{fallback}"
+
+
+def _assert_file_exists(path: str, label: str) -> None:
+    """Raise ``ValueError`` when ``path`` is empty or is a local path that does not exist.
+
+    Paths that contain a URI scheme (``odise://``, ``https://``, ...) are not checked here.
+    """
+    if not path:
+        raise ValueError(f"{label} is required and cannot be empty.")
+    if "://" in path:
+        return
+    if not osp.exists(path):
+        raise ValueError(f"{label} does not exist: {path}")
+
+
+def _filter_layers(
+    features: Dict[str, torch.Tensor], layer_names: Optional[List[str]]
+) -> Dict[str, torch.Tensor]:
+    """Return the ``layer_names`` entries of ``features``, or all of them when none are requested.
+
+    Raises:
+        KeyError: if a requested layer is not a key of ``features``.
+    """
+    if not layer_names:
+        return features
+
+    missing = [name for name in layer_names if name not in features]
+    if missing:
+        raise KeyError(f"Requested feature layers not present: {missing}")
+    return {name: features[name] for name in layer_names}
+
+
+def _get_model_device(model) -> torch.device:
+    """Return ``model.device`` when defined, otherwise the device of its first parameter."""
+    if hasattr(model, "device"):
+        return model.device
+    first_param = next(iter(model.parameters()), None)
+    if first_param is None:
+        raise ValueError("Could not infer model device: model has no parameters and no device attribute.")
+    return first_param.device
+
+
+def _select_device(cfg, args, logger) -> str:
+    """Return the device to run on, using CPU when forced or when CUDA is unavailable."""
+    device = cfg.train.device
+    if getattr(args, "force_cpu", False):
+        if device != "cpu":
+            logger.warning("CPU-only execution requested via --force-cpu. Setting cfg.train.device=cpu.")
+        return "cpu"
+    if str(device).startswith("cuda") and not torch.cuda.is_available():
+        logger.warning("CUDA is not available, switching feature extraction to CPU.")
+        return "cpu"
+    return device
+
+
+@torch.no_grad()
+def extract_features(cfg, args):
+    """Run the model backbone over a dataloader and save one ``.pt`` file per sample.
+
+    Files go to ``<cfg.train.output_dir>/features/rank_<rank>_of_<world_size>/``.
+
+    Args:
+        cfg: lazy config providing ``model``, ``train`` and the dataloader entry.
+        args: namespace returned by :func:`parse_args`.
+
+    Raises:
+        ValueError: if no checkpoint is configured or a local checkpoint path is missing.
+    """
+    cfg = auto_scale_workers(cfg, comm.get_world_size())
+    if args.init_from:
+        cfg.train.init_checkpoint = args.init_from
+    if args.output:
+        cfg.train.output_dir = args.output
+        cfg.train.log_dir = cfg.train.output_dir
+    cfg = LazyConfig.apply_overrides(cfg, args.opts)
+
+    default_setup(cfg, args)
+    logger = setup_logger(cfg.train.log_dir, distributed_rank=comm.get_rank(), name="odise")
+
+    # Settle the device before the model is moved: model.to("cuda") on a machine
+    # without CUDA would fail before any later fallback could take effect.
+    cfg.train.device = _select_device(cfg, args, logger)
+
+    # Fail fast on a missing checkpoint instead of after the model has been built.
+    if not cfg.train.init_checkpoint:
+        raise ValueError("`--init-from` is required for extraction.")
+    _assert_file_exists(cfg.train.init_checkpoint, "Checkpoint path")
+
+    logger.info(f"Running with config:\n{LazyConfig.to_py(cfg)}")
+    logger.info(
+        f"extract_features args: num_gpus={args.num_gpus}, num_machines={args.num_machines}, "
+        f"dataloader={args.dataloader}, feature_layers={args.feature_layers or 'ALL'}, "
+        f"output={args.output}, output_dtype={args.output_dtype}, max_images={args.max_images}"
+    )
+
+    model = instantiate_odise(cfg.model)
+    model.to(cfg.train.device)
+    model = create_ddp_model(model)
+    model_module = get_model_from_module(model)
+    model_device = torch.device(_get_model_device(model_module))
+
+    checkpointer = ODISECheckpointer(model, cfg.train.output_dir)
+    checkpointer.resume_or_load(cfg.train.init_checkpoint, resume=args.resume)
+    model.eval()
+
+    dataloader_cfg = _resolve_cfg_entry(cfg, args.dataloader)
+    data_loader = instantiate(dataloader_cfg)
+
+    on_cuda = model_device.type == "cuda"
+    if args.amp and not on_cuda:
+        logger.warning("AMP requested but CUDA is unavailable; running without autocast.")
+    amp_ctx = torch.amp.autocast("cuda", enabled=args.amp) if on_cuda else nullcontext()
+
+    rank = comm.get_rank()
+    world_size = comm.get_world_size()
+    layer_names = [name.strip() for name in args.feature_layers.split(",") if name.strip()]
+    dtype_map = {
+        "fp16": torch.float16,
+        "fp32": torch.float32,
+        "bf16": torch.bfloat16,
+    }
+    output_dtype = dtype_map[args.output_dtype]
+    output_root = osp.join(cfg.train.output_dir, "features")
+    rank_root = osp.join(output_root, f"rank_{rank:02d}_of_{world_size:02d}")
+    PathManager.mkdirs(rank_root)
+    logger.info(f"Writing feature shards to {rank_root}")
+
+    limit = args.max_images if args.max_images > 0 else None
+    processed = 0
+    for batched_inputs in data_loader:
+        if limit is not None and processed >= limit:
+            break
+        images = [sample["image"].to(device=model_device, non_blocking=True) for sample in batched_inputs]
+        images = [(x - model_module.pixel_mean) / model_module.pixel_std for x in images]
+        image_batch = ImageList.from_tensors(images, model_module.size_divisibility)
+
+        with amp_ctx:
+            features = model_module.backbone(image_batch.tensor)
+
+        features = _filter_layers(features, layer_names)
+
+        for local_idx, sample in enumerate(batched_inputs):
+            if limit is not None and processed >= limit:
+                break
+            # `processed` never repeats within a rank, so fallback ids stay unique even if the last batch is short.
+            image_id = _safe_image_id(sample, processed)
+            out_file = osp.join(rank_root, f"{image_id}_bs{local_idx:02d}_r{rank:02d}.pt")
+            # Test for an existing shard before paying for the dtype cast and host copy.
+            if not (args.skip_existing and PathManager.exists(out_file)):
+                feature_entry = {
+                    name: value[local_idx].to(dtype=output_dtype).cpu()
+                    for name, value in features.items()
+                }
+                payload = {
+                    "image_id": sample.get("image_id", image_id),
+                    "file_name": sample.get("file_name"),
+                    "height": sample.get("height"),
+                    "width": sample.get("width"),
+                    "layer_names": sorted(feature_entry.keys()),
+                    "features": feature_entry,
+                }
+                torch.save(payload, out_file)
+            processed += 1
+
+            if processed % 50 == 0 and comm.is_main_process():
+                logger.info(f"Rank {rank}: processed {processed} samples")
+
+    comm.synchronize()
+    if comm.is_main_process():
+        logger.info(f"Feature extraction finished with total_local={processed}.")
+
+
+def parse_args():
+    """Parse the detectron2 default arguments plus the feature-extraction options."""
+    parser = default_argument_parser()
+    parser.add_argument("--output", required=True, type=str, help="Output directory for feature shards")
+    parser.add_argument(
+        "--dataloader",
+        default="dataloader.test",
+        type=str,
+        help="Config key path for dataloader, for example `dataloader.test`.",
+    )
+    parser.add_argument(
+        "--feature-layers",
+        default="",
+        type=str,
+        help="Comma-separated backbone feature keys. Leave empty to export all.",
+    )
+    parser.add_argument(
+        "--output-dtype",
+        default="fp16",
+        type=str,
+        choices=["fp16", "fp32", "bf16"],
+        help="Dtype to store extracted feature tensors.",
+    )
+    parser.add_argument("--max-images", default=-1, type=int, help="Stop after N images per rank.")
+    parser.add_argument("--skip-existing", action="store_true", help="Skip samples already written.")
+    parser.add_argument("--amp", action="store_true", help="Use AMP for backbone inference.")
+    parser.add_argument("--force-cpu", action="store_true", help="Force CPU-only execution")
+    parser.add_argument("--init-from", type=str, default="", help="Model checkpoint path.")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    # CPU execution runs as a single process and cannot use CUDA autocast.
+    if args.force_cpu or not torch.cuda.is_available():
+        if args.num_gpus != 1:
+            print("CPU-only execution requested. Forcing --num-gpus=1 for feature extraction.")
+            args.num_gpus = 1
+        if args.amp:
+            print("GPU-only AMP requested without CUDA. Forcing --amp disabled for feature extraction.")
+            args.amp = False
+    cfg = LazyConfig.load(args.config_file)
+    launch(
+        extract_features,
+        args.num_gpus,
+        num_machines=args.num_machines,
+        machine_rank=args.machine_rank,
+        dist_url=args.dist_url,
+        args=(cfg, args),
+    )
diff --git a/tools/train_net.py b/tools/train_net.py
index c19fecf..78162ad 100755
--- a/tools/train_net.py
+++ b/tools/train_net.py
@@ -39,7 +39,11 @@
from detectron2.utils.events import JSONWriter
from detectron2.utils.file_io import PathManager
from detectron2.utils.logger import setup_logger
-from iopath.common.s3 import S3PathHandler
+import torch
+try:
+ from iopath.common.s3 import S3PathHandler
+except Exception:
+ S3PathHandler = None
from omegaconf import OmegaConf
from odise.checkpoint import ODISECheckpointer
@@ -50,7 +54,12 @@
from odise.evaluation import inference_on_dataset
from odise.utils.events import CommonMetricPrinter, WandbWriter, WriterStack
-PathManager.register_handler(S3PathHandler())
+if S3PathHandler is not None:
+ try:
+ PathManager.register_handler(S3PathHandler())
+ except Exception:
+ # Optional dependency for S3 access. Boto3 may not be installed in CPU-only envs.
+ S3PathHandler = None
logger = logging.getLogger("odise")
@@ -210,6 +219,32 @@ def do_test(cfg, model, *, final_iter=False, next_iter=0):
return all_ret
+def _apply_cpu_fallback(cfg, args, logger):
+ if not getattr(args, "force_cpu", False) and torch.cuda.is_available():
+ return
+
+ if getattr(args, "force_cpu", False):
+ logger.warning("CPU-only execution requested via --force-cpu.")
+
+ if cfg.train.device == "cuda":
+ logger.warning("Forcing cpu execution by setting cfg.train.device=cpu.")
+ cfg.train.device = "cpu"
+
+ if getattr(args, "amp", False):
+ logger.warning("CPU execution requested. Forcing --amp disabled.")
+ args.amp = False
+
+ if cfg.train.amp.enabled:
+ logger.warning("AMP is enabled in config but unsupported on CPU. Disabling.")
+ cfg.train.amp.enabled = False
+
+ if getattr(args, "num_gpus", 1) != 1:
+ logger.warning(
+ "CPU execution uses single process only. Forcing --num-gpus=1."
+ )
+ args.num_gpus = 1
+
+
def do_train(args, cfg):
"""
Args:
@@ -235,8 +270,7 @@ def do_train(args, cfg):
cfg.train.output_dir
)
# create writers at the beginning for W&B logging
- if comm.is_main_process():
- writers = default_writers(cfg)
+ writers = default_writers(cfg) if comm.is_main_process() else None
comm.synchronize()
# not sure why d2 use ExitStack(), maybe easier for multiple context
@@ -327,7 +361,6 @@ def main(args):
cfg.train.output_dir = osp.join(cfg.train.output_dir, cfg.train.run_tag)
if hasattr(args, "wandb") and args.wandb:
cfg.train.wandb.enable_writer = args.wandb
- cfg.train.wandb.enable_visualizer = args.wandb
if hasattr(args, "amp") and args.amp:
cfg.train.amp.enabled = args.amp
if hasattr(args, "init_from") and args.init_from:
@@ -338,6 +371,7 @@ def main(args):
cfg = LazyConfig.apply_overrides(cfg, args.opts)
default_setup(cfg, args)
logger = setup_logger(cfg.train.log_dir, distributed_rank=comm.get_rank(), name="odise")
+ _apply_cpu_fallback(cfg, args, logger)
logger.info(f"Running with config:\n{LazyConfig.to_py(cfg)}")
@@ -380,6 +414,7 @@ def parse_args():
parser.add_argument("--log-tag", type=str, help="tag of experiment")
parser.add_argument("--wandb", action="store_true", help="Use W&B to log experiments")
parser.add_argument("--amp", action="store_true", help="Use AMP for mixed precision training")
+ parser.add_argument("--force-cpu", action="store_true", help="Force CPU-only execution")
parser.add_argument("--reference-world-size", "--ref", type=int)
args = parser.parse_args()
@@ -389,6 +424,10 @@ def parse_args():
if __name__ == "__main__":
args = parse_args()
+ if args.force_cpu or not torch.cuda.is_available():
+ if args.num_gpus != 1:
+ print("CPU-only execution requested. Forcing --num-gpus=1.")
+ args.num_gpus = 1
launch(
main,
args.num_gpus,