Commit 441ab5e (0 parents): showing 20 changed files with 3,411 additions and 0 deletions.
@@ -0,0 +1,2 @@
.DS_store
.idea
@@ -0,0 +1,3 @@
{
  "liveServer.settings.port": 5501
}
@@ -0,0 +1,16 @@
# Nerfies

This is the repository that contains source code for the [Nerfies website](https://nerfies.github.io).

If you find Nerfies useful for your work, please cite:
```
@article{park2021nerfies,
  author  = {Park, Keunhong and Sinha, Utkarsh and Barron, Jonathan T. and Bouaziz, Sofien and Goldman, Dan B and Seitz, Steven M. and Martin-Brualla, Ricardo},
  title   = {Nerfies: Deformable Neural Radiance Fields},
  journal = {ICCV},
  year    = {2021},
}
```

# Website License
<a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-sa/4.0/88x31.png" /></a><br />This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Attribution-ShareAlike 4.0 International License</a>.
@@ -0,0 +1,307 @@
<!DOCTYPE html>
<html>
<head>
  <meta charset="utf-8">
  <meta name="description"
        content="Open Vocabulary Monocular 3D Object Detection">
  <!-- <meta name="keywords" content="Nerfies, D-NeRF, NeRF">
  <meta name="viewport" content="width=device-width, initial-scale=1"> -->
  <title>Open Vocabulary Monocular 3D Object Detection</title>

  <!-- Global site tag (gtag.js) - Google Analytics -->
  <script async src="https://www.googletagmanager.com/gtag/js?id=G-PYVRSFMDRL"></script>
  <script>
    window.dataLayer = window.dataLayer || [];

    function gtag() {
      dataLayer.push(arguments);
    }

    gtag('js', new Date());

    gtag('config', 'G-PYVRSFMDRL');
  </script>

  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
        rel="stylesheet">

  <link rel="stylesheet" href="./static/css/bulma.min.css">
  <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
  <link rel="stylesheet"
        href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="./static/css/index.css">
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script defer src="./static/js/fontawesome.all.min.js"></script>
  <script src="./static/js/bulma-carousel.min.js"></script>
  <script src="./static/js/bulma-slider.min.js"></script>
  <script src="./static/js/index.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
</head>
<body>

<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <h1 class="title is-1 publication-title">Open Vocabulary Monocular 3D Object Detection</h1>
          <div class="is-size-5 publication-authors">
            <div class="is-size-5 publication-authors">
              <a href="https://yaojin17.github.io/" class="author-block" style="margin-right: 20px; text-decoration: none; position: relative;">
                <span>Jin Yao</span>
                <sup style="position: absolute; top: -5px; right: -5px; font-size: 0.8em;">1</sup>
              </a>
              <a href="https://www.linkedin.com/in/hao--gu/" class="author-block" style="margin-right: 20px; text-decoration: none; position: relative;">
                <span>Hao Gu</span>
                <sup style="position: absolute; top: -5px; right: -5px; font-size: 0.8em;">1</sup>
              </a>
              <a href="https://xuweiyichen.github.io/" class="author-block" style="margin-right: 20px; text-decoration: none; position: relative;">
                <span>Xuweiyi Chen</span>
                <sup style="position: absolute; top: -5px; right: -5px; font-size: 0.8em;">1</sup>
              </a>
              <a href="https://pwang.pw/" class="author-block" style="margin-right: 20px; text-decoration: none; position: relative;">
                <span>Jiayun Wang</span>
                <sup style="position: absolute; top: -5px; right: -8px; font-size: 0.8em;">2</sup>
              </a>
              <a href="https://sites.google.com/site/zezhoucheng/" class="author-block" style="text-decoration: none; position: relative;">
                <span>Zezhou Cheng</span>
                <sup style="position: absolute; top: -5px; right: -5px; font-size: 0.8em;">1</sup>
              </a>
            </div>
          </div>
          <div class="is-size-5 publication-authors">
            <span class="author-block" style="position: relative; margin-right: 20px;">
              <sup style="position: absolute; top: -5px; left: -10px; font-size: 0.8em;">1</sup>
              University of Virginia
            </span>
            <span class="author-block" style="position: relative;">
              <sup style="position: absolute; top: -5px; left: -10px; font-size: 0.8em;">2</sup>
              California Institute of Technology
            </span>
          </div>

          <div class="column has-text-centered">
            <div class="publication-links">
              <!-- PDF Link. -->
              <!-- <span class="link-block">
                <a href="https://arxiv.org/pdf/2011.12948"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fas fa-file-pdf"></i>
                  </span>
                  <span>Paper</span>
                </a>
              </span> -->
              <span class="link-block">
                <a href="https://arxiv.org/abs/2011.12948"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="ai ai-arxiv"></i>
                  </span>

                  <span>arXiv</span>
                </a>
              </span>
              <!-- Video Link. -->
              <!-- <span class="link-block">
                <a href="https://www.youtube.com/watch?v=MrKrnHhk8IA"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fab fa-youtube"></i>
                  </span>
                  <span>Video</span>
                </a>
              </span> -->
              <!-- Code Link. -->
              <span class="link-block">
                <a href="https://github.com/UVA-Computer-Vision-Lab/ovmono3d"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fab fa-github"></i>
                  </span>
                  <span>GitHub</span>
                </a>
              </span>
              <!-- Dataset Link. -->
              <!-- <span class="link-block">
                <a href="https://github.com/google/nerfies/releases/tag/0.1"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="far fa-images"></i>
                  </span>
                  <span>Data</span>
                </a> -->
            </div>

          </div>
        </div>
      </div>
    </div>
  </div>
</section>

<section class="section">
  <div class="container is-max-desktop">
    <!-- title -->
    <!-- <h2 class="title is-4" style="text-align: center; margin-bottom: 1rem;">Qualitative Visualizations on the Omni3D Test Set</h2> -->

    <!-- figure and caption -->
    <figure>
      <img src="static/images/coco_large.png" alt="Zero-Shot Performance on In-the-Wild COCO Images" style="width: 100%; display: block; margin: 0 auto;">
      <figcaption>
        <div class="content">
          <p>
            <strong>OVMono3D-LIFT's Zero-Shot Performance on In-the-Wild COCO Images</strong>.
            We display 3D predictions overlaid on the images and the top-down views with a base grid of
            <span class="math">\(1\,\text{m} \times 1\,\text{m}\)</span> tiles.
            For single-object images, only front-views are displayed.
          </p>
        </div>
      </figcaption>
    </figure>
  </div>
</section>

<section class="section">
  <div class="container is-max-desktop">
    <!-- Abstract. -->
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Abstract</h2>
        <div class="content has-text-justified">
          <p>
            In this work, we pioneer the study of open-vocabulary monocular 3D object detection, a novel task that aims to detect and localize objects in 3D space from a single RGB image without limiting detection to a predefined set of categories.
          </p>
          <p>
            We formalize this problem, establish baseline methods, and introduce a class-agnostic approach that leverages open-vocabulary 2D detectors and lifts 2D bounding boxes into 3D space.
            Our approach decouples the recognition and localization of objects in 2D from the task of estimating 3D bounding boxes, enabling generalization across unseen categories.
            Additionally, we propose a target-aware evaluation protocol to address inconsistencies in existing datasets, improving the reliability of model performance assessment.
          </p>
          <p>
            Extensive experiments on the Omni3D dataset demonstrate the effectiveness of the proposed method in zero-shot 3D detection for novel object categories, validating its robust generalization capabilities.
            Our method and evaluation protocols contribute towards the development of open-vocabulary object detection models that can effectively operate in real-world, category-diverse environments.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>

<section class="section">
  <div class="container is-max-desktop">
    <!-- title -->
    <h2 class="title is-4" style="text-align: center; margin-bottom: 1rem;">Proposed Methods</h2>
    <!-- figure and caption -->
    <figure>
      <img src="static/images/pipeline.png" alt="Proposed Methods" style="width: 70%; display: block; margin: 0 auto;">
      <figcaption>
        <div class="content">
          <p>
            <strong>(a)</strong> OVMono3D-GEO is a training-free method that predicts 3D detections from 2D via geometric unprojection.
            It exploits off-the-shelf depth estimation (e.g.,
            <a href="https://github.com/apple/ml-depth-pro" target="_blank">Depth Pro</a>),
            segmentation (e.g.,
            <a href="https://github.com/facebookresearch/segment-anything" target="_blank">SAM</a>),
            and an open-vocabulary 2D detector (<a href="https://github.com/IDEA-Research/GroundingDINO" target="_blank">Grounding DINO</a>).
            <strong>(b)</strong> OVMono3D-LIFT is a learning-based approach that trains a class-agnostic neural network to lift 2D detections to 3D.
            Both approaches disentangle recognition and localization in 2D from the estimation of 3D bounding boxes.
          </p>
        </div>
      </figcaption>
    </figure>
  </div>
</section>
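
The caption above describes OVMono3D-GEO as lifting 2D detections to 3D by geometrically unprojecting off-the-shelf depth and segmentation outputs. The following minimal NumPy sketch only illustrates that unprojection step and is not the authors' released code: it assumes a metric depth map, a binary instance mask, and pinhole intrinsics K, and it fits a simple axis-aligned box, whereas the real pipeline would also handle outliers and box orientation.

```python
import numpy as np

def unproject_to_3d_box(depth, mask, K):
    """Back-project masked pixels into camera space and fit an axis-aligned 3D box."""
    v, u = np.nonzero(mask)                    # pixel rows/cols inside the instance mask
    z = depth[v, u]                            # metric depth at those pixels
    fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]
    x = (u - cx) * z / fx                      # pinhole unprojection to camera coordinates
    y = (v - cy) * z / fy
    points = np.stack([x, y, z], axis=1)       # (N, 3) object point cloud
    lo, hi = points.min(axis=0), points.max(axis=0)
    return {"center": (lo + hi) / 2.0, "dimensions": hi - lo}

# Toy usage with dummy inputs: a constant 3 m depth plane and a rectangular mask.
K = np.array([[500.0, 0.0, 320.0], [0.0, 500.0, 240.0], [0.0, 0.0, 1.0]])
depth = np.full((480, 640), 3.0)
mask = np.zeros((480, 640), dtype=bool)
mask[200:280, 300:380] = True
print(unproject_to_3d_box(depth, mask, K))
```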

<section class="section">
  <div class="container is-max-desktop">
    <!-- title -->
    <h2 class="title is-4" style="text-align: center; margin-bottom: 1rem;">Qualitative Visualizations on the Omni3D Test Set</h2>

    <!-- figure and caption -->
    <figure>
      <img src="static/images/qualitative1.png" alt="Qualitative Results 1" style="width: 100%; display: block; margin: 0 auto;">
      <figcaption>
        <div class="content">
          <p>
            For each example, we present the predictions of Cube R-CNN and
            OVMono3D-LIFT, displaying both the 3D predictions overlaid on the image
            and a top-down view with a base grid of
            <span class="math">\(1\,\text{m} \times 1\,\text{m}\)</span> tiles. Base categories are depicted
            with brown cubes, while novel categories are represented in other colors.
          </p>
        </div>
      </figcaption>
    </figure>
  </div>
</section>

<section class="section">
  <div class="container is-max-desktop">
    <!-- title -->
    <h2 class="title is-4" style="text-align: center; margin-bottom: 1rem;">Target-Aware Evaluation</h2>

    <!-- figure and caption -->
    <figure>
      <img src="static/images/annotationissue.png" alt="Target-Aware Evaluation" style="width: 100%; display: block; margin: 0 auto;">
      <figcaption>
        <div class="content">
          <p>
            <strong>(a)</strong>
            <span style="color: blue;">Naming ambiguity</span> and
            <span style="color: red;">missing annotations</span> are common in the benchmarks.
            In this example, the 3D annotations for "books" are missing, and "shelves" is highly similar to "bookcases".
            <strong>(b)</strong>
            This leads to inaccurate performance assessment of open-vocabulary 3D detection under the standard evaluation.
            <strong>(c)</strong>
            Our target-aware evaluation resolves this issue by prompting only with the categories present in the annotations.
          </p>
        </div>
      </figcaption>
    </figure>
  </div>
</section>
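
As a rough illustration of the target-aware protocol described in the caption above (an assumption about its mechanics, not the released evaluation code): each image is prompted only with the category names that appear in its own ground-truth annotations, so categories the benchmark never annotated in that image cannot be scored as false positives. The `detector` callable and the annotation format below are hypothetical.

```python
from typing import Callable, Dict, List

def target_aware_predictions(
    detector: Callable[[str, List[str]], List[dict]],  # hypothetical: (image_id, prompts) -> detections
    annotations: Dict[str, List[dict]],                 # image_id -> ground-truth boxes with a "category" field
) -> Dict[str, List[dict]]:
    """Run the open-vocabulary detector with per-image target categories only."""
    predictions = {}
    for image_id, gt_boxes in annotations.items():
        target_classes = sorted({box["category"] for box in gt_boxes})
        # Prompt with only the categories actually annotated in this image.
        predictions[image_id] = detector(image_id, target_classes)
    return predictions
```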

<section class="section" id="BibTeX">
  <div class="container is-max-desktop content">
    <h2 class="title">Citation</h2>
    <pre><code>
    </code></pre>
  </div>
</section>

<footer class="footer">
  <div class="container">
    <div class="content has-text-centered">
      <a class="icon-link"
         href="./static/videos/nerfies_paper.pdf">
        <i class="fas fa-file-pdf"></i>
      </a>
<a class="icon-link" href="https://github.com/keunhong" class="external-link" disabled> | ||
<i class="fab fa-github"></i> | ||
</a> | ||
</div> | ||
<div class="columns is-centered"> | ||
<div class="column is-8"> | ||
<div class="content"> | ||
<p> | ||
This website is adapted from <a href="https://github.com/nerfies/nerfies.github.io">Nerfies</a>, licensed | ||
under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/">Creative | ||
Commons Attribution-ShareAlike 4.0 International License</a>. | ||
</p> | ||
</div> | ||
</div> | ||
</div> | ||
</div> | ||
</footer> | ||
|
||
</body> | ||
</html> |