From e51c33f5ffcbb49fe708fbf01ffd0e65ccf24ac3 Mon Sep 17 00:00:00 2001
From: whu-lyh <2942563671@qq.com>
Date: Tue, 14 Nov 2023 00:53:14 +0000
Subject: [PATCH] Github Action Automatic Update CV Arxiv Papers

---
 README.md                    | 24 +++++++++++++++---------
 docs/cv-arxiv-daily-web.json |  2 +-
 docs/cv-arxiv-daily.json     |  2 +-
 docs/index.md                |  8 +++++++-
 4 files changed, 24 insertions(+), 12 deletions(-)
diff --git a/README.md b/README.md
index f2b98f83a0..ffade35663 100755
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 [![Stargazers][stars-shield]][stars-url]
 [![Issues][issues-shield]][issues-url]
 
-## Updated on 2023.11.13
+## Updated on 2023.11.14
 > Usage instructions: [here](./docs/README.md#usage)
 
 <details>
@@ -113,7 +113,7 @@
 |**2022-11-23**|**Wi-Closure: Reliable and Efficient Search of Inter-robot Loop Closures Using Wireless Sensing**|Weiying Wang et.al.|[2210.01320v2](http://arxiv.org/abs/2210.01320v2)|null|
 |**2023-02-22**|**GANet: Goal Area Network for Motion Forecasting**|Mingkun Wang et.al.|[2209.09723v3](http://arxiv.org/abs/2209.09723v3)|**[link](https://github.com/kingwmk/ganet)**|
 
-<p align=right>(<a href=#Updated-on-20231113>back to top</a>)</p>
+<p align=right>(<a href=#Updated-on-20231114>back to top</a>)</p>
 
 ## Map fusion
 
@@ -177,12 +177,16 @@
 |**2019-03-14**|**AgriColMap: Aerial-Ground Collaborative 3D Mapping for Precision Farming**|Ciro Potena et.al.|[1810.00457v2](http://arxiv.org/abs/1810.00457v2)|null|
 |**2019-03-05**|**Efficient Constellation-Based Map-Merging for Semantic SLAM**|Kristoffer M. Frey et.al.|[1809.09646v2](http://arxiv.org/abs/1809.09646v2)|null|
 
-<p align=right>(<a href=#Updated-on-20231113>back to top</a>)</p>
+<p align=right>(<a href=#Updated-on-20231114>back to top</a>)</p>
 
 ## MultiModality
 
 |Publish Date|Title|Authors|PDF|Code|
 |---|---|---|---|---|
+|**2023-11-10**|**Automated Heterogeneous Low-Bit Quantization of Multi-Model Deep Learning Inference Pipeline**|Jayeeta Mondal et.al.|[2311.05870v1](http://arxiv.org/abs/2311.05870v1)|null|
+|**2023-11-10**|**Watermarking Vision-Language Pre-trained Models for Multi-modal Embedding as a Service**|Yuanmin Tang et.al.|[2311.05863v1](http://arxiv.org/abs/2311.05863v1)|**[link](https://github.com/Pter61/vlpmarker)**|
+|**2023-11-09**|**Cosmological parameter estimation with Genetic Algorithms**|Ricardo Medel-Esquivel et.al.|[2311.05699v1](http://arxiv.org/abs/2311.05699v1)|null|
+|**2023-11-09**|**Multi-Modal Gaze Following in Conversational Scenarios**|Yuqi Hou et.al.|[2311.05669v1](http://arxiv.org/abs/2311.05669v1)|null|
 |**2023-11-09**|**Object-centric Cross-modal Feature Distillation for Event-based Object Detection**|Lei Li et.al.|[2311.05494v1](http://arxiv.org/abs/2311.05494v1)|null|
 |**2023-11-09**|**3DStyle-Diffusion: Pursuing Fine-grained Text-driven 3D Stylization with 2D Diffusion Models**|Haibo Yang et.al.|[2311.05464v1](http://arxiv.org/abs/2311.05464v1)|**[link](https://github.com/yanghb22-fdu/3dstyle-diffusion-official)**|
 |**2023-11-09**|**ControlStyle: Text-Driven Stylized Image Generation Using Diffusion Priors**|Jingwen Chen et.al.|[2311.05463v1](http://arxiv.org/abs/2311.05463v1)|null|
@@ -1886,7 +1890,7 @@
 |**2023-02-13**|**CLIP-RR: Improved CLIP Network for Relation-Focused Cross-Modal Information Retrieval**|Yan Gong et.al.|[2302.06350v1](http://arxiv.org/abs/2302.06350v1)|null|
 |**2023-02-13**|**CoMAE: Single Model Hybrid Pre-training on Small-Scale RGB-D Datasets**|Jiange Yang et.al.|[2302.06148v1](http://arxiv.org/abs/2302.06148v1)|**[link](https://github.com/mcg-nju/comae)**|
 
-<p align=right>(<a href=#Updated-on-20231113>back to top</a>)</p>
+<p align=right>(<a href=#Updated-on-20231114>back to top</a>)</p>
 
 ## Point Cloud Localization
 
@@ -1902,7 +1906,7 @@
 |**2018-04-03**|**Mining Point Cloud Local Structures by Kernel Correlation and Graph Pooling**|Yiru Shen et.al.|[1712.06760v2](http://arxiv.org/abs/1712.06760v2)|null|
 |**2017-02-14**|**Graph Based Over-Segmentation Methods for 3D Point Clouds**|Yizhak Ben-Shabat et.al.|[1702.04114v1](http://arxiv.org/abs/1702.04114v1)|null|
 
-<p align=right>(<a href=#Updated-on-20231113>back to top</a>)</p>
+<p align=right>(<a href=#Updated-on-20231114>back to top</a>)</p>
 
 ## Place Recognization
 
@@ -1973,7 +1977,7 @@
 |**2021-05-24**|**OverlapNet: Loop Closing for LiDAR-based SLAM**|Xieyuanli Chen et.al.|[2105.11344v1](http://arxiv.org/abs/2105.11344v1)|**[link](https://github.com/PRBonn/OverlapNet)**|
 |**2021-03-23**|**NDT-Transformer: Large-Scale 3D Point Cloud Localisation using the Normal Distribution Transform Representation**|Zhicheng Zhou et.al.|[2103.12292v1](http://arxiv.org/abs/2103.12292v1)|**[link](https://github.com/dachengxiaocheng/NDT-Transformer)**|
 
-<p align=right>(<a href=#Updated-on-20231113>back to top</a>)</p>
+<p align=right>(<a href=#Updated-on-20231114>back to top</a>)</p>
 
 ## LiDAR SLAM
 
@@ -2042,7 +2046,7 @@
 |**2020-08-09**|**LiDAR Data Enrichment Using Deep Learning Based on High-Resolution Image: An Approach to Achieve High-Performance LiDAR SLAM Using Low-cost LiDAR**|Jiang Yue et.al.|[2008.03694v1](http://arxiv.org/abs/2008.03694v1)|null|
 |**2020-08-05**|**Elasticity Meets Continuous-Time: Map-Centric Dense 3D LiDAR SLAM**|Chanoh Park et.al.|[2008.02274v1](http://arxiv.org/abs/2008.02274v1)|null|
 
-<p align=right>(<a href=#Updated-on-20231113>back to top</a>)</p>
+<p align=right>(<a href=#Updated-on-20231114>back to top</a>)</p>
 
 ## Transformer
 
@@ -2126,12 +2130,14 @@
 |**2021-10-14**|**Investigating Attention Mechanism in 3D Point Cloud Object Detection**|Shi Qiu et.al.|[2108.00620v2](http://arxiv.org/abs/2108.00620v2)|**[link](https://github.com/ShiQiu0419/attentions_in_3D_detection)**|
 |**2021-07-29**|**Reduction of balance laws in (3+1)--dimensions to autonomous conservation laws by means of equivalence transformations**|Matteo Gorgone et.al.|[2107.14144v1](http://arxiv.org/abs/2107.14144v1)|null|
 
-<p align=right>(<a href=#Updated-on-20231113>back to top</a>)</p>
+<p align=right>(<a href=#Updated-on-20231114>back to top</a>)</p>
 
 ## NeRF
 
 |Publish Date|Title|Authors|PDF|Code|
 |---|---|---|---|---|
+|**2023-11-10**|**Instant3D: Fast Text-to-3D with Sparse-View Generation and Large Reconstruction Model**|Jiahao Li et.al.|[2311.06214v1](http://arxiv.org/abs/2311.06214v1)|null|
+|**2023-11-10**|**A Neural Height-Map Approach for the Binocular Photometric Stereo Problem**|Fotios Logothetis et.al.|[2311.05958v1](http://arxiv.org/abs/2311.05958v1)|null|
 |**2023-11-09**|**BakedAvatar: Baking Neural Fields for Real-Time Head Avatar Synthesis**|Hao-Bin Duan et.al.|[2311.05521v1](http://arxiv.org/abs/2311.05521v1)|null|
 |**2023-11-09**|**Control3D: Towards Controllable Text-to-3D Generation**|Yang Chen et.al.|[2311.05461v1](http://arxiv.org/abs/2311.05461v1)|null|
 |**2023-11-08**|**LRM: Large Reconstruction Model for Single Image to 3D**|Yicong Hong et.al.|[2311.04400v1](http://arxiv.org/abs/2311.04400v1)|null|
@@ -2579,7 +2585,7 @@
 |**2022-12-08**|**GazeNeRF: 3D-Aware Gaze Redirection with Neural Radiance Fields**|Alessandro Ruzzi et.al.|[2212.04823v1](http://arxiv.org/abs/2212.04823v1)|**[link](https://github.com/alessandroruzzi/gazenerf)**|
 |**2022-12-09**|**4K-NeRF: High Fidelity Neural Radiance Fields at Ultra High Resolutions**|Zhongshu Wang et.al.|[2212.04701v1](http://arxiv.org/abs/2212.04701v1)|**[link](https://github.com/frozoul/4k-nerf)**|
 
-<p align=right>(<a href=#Updated-on-20231113>back to top</a>)</p>
+<p align=right>(<a href=#Updated-on-20231114>back to top</a>)</p>
 
 [contributors-shield]: https://img.shields.io/github/contributors/Vincentqyw/cv-arxiv-daily.svg?style=for-the-badge
 [contributors-url]: https://github.com/Vincentqyw/cv-arxiv-daily/graphs/contributors
diff --git a/docs/cv-arxiv-daily-web.json b/docs/cv-arxiv-daily-web.json
index 4b83168b71..9f21f1dfce 100644
--- a/docs/cv-arxiv-daily-web.json
+++ b/docs/cv-arxiv-daily-web.json
@@ -1 +1 @@
-{"Kinematic Mapping": {"2302.11988": "|**2023-02-23**|**Time Complexity of Broadcast and Consensus for Randomized Oblivious Message Adversaries**|Antoine El-Hayek et.al.|[2302.11988v1](http://arxiv.org/abs/2302.11988v1)|null|\n", "2302.09743": "|**2023-02-20**|**Dynamic Optimal Control: A Real-Time Control Optimization Algorithm for Dynamic Networks**|Chunyu Pan et.al.|[2302.09743v1](http://arxiv.org/abs/2302.09743v1)|null|\n", "2302.09382": "|**2023-02-18**|**Co-trading networks for modeling dynamic interdependency structures and estimating high-dimensional covariances in US equity markets**|Yutong Lu et.al.|[2302.09382v1](http://arxiv.org/abs/2302.09382v1)|null|\n", "2302.07657": "|**2023-02-15**|**Dynamic Flows with Time-Dependent Capacities**|Thomas Bl\u00e4sius et.al.|[2302.07657v1](http://arxiv.org/abs/2302.07657v1)|null|\n", "2302.04377": "|**2023-02-08**|**ER network heterogeneity guides diffusive transport and kinetics**|Zubenelgenubi C. Scott et.al.|[2302.04377v1](http://arxiv.org/abs/2302.04377v1)|null|\n", "2302.03677": "|**2023-02-24**|**Wealth distribution on a dynamic complex network**|Gustavo Kohlrausch et.al.|[2302.03677v2](http://arxiv.org/abs/2302.03677v2)|null|\n", "2302.03039": "|**2023-02-06**|**SUPER VII. Morphology and kinematics of H$\u03b1$ emission in AGN host galaxies at Cosmic noon using SINFONI**|D. Kakkad et.al.|[2302.03039v1](http://arxiv.org/abs/2302.03039v1)|null|\n", "2302.02313": "|**2023-02-05**|**A Game-Theoretic Approach to Solving the Roman Domination Problem**|Xiuyang Chen et.al.|[2302.02313v1](http://arxiv.org/abs/2302.02313v1)|null|\n", "2302.01694": "|**2023-02-03**|**Coevolving Boolean and Multi-Valued Regulatory Networks**|Larry Bull et.al.|[2302.01694v1](http://arxiv.org/abs/2302.01694v1)|null|\n", "2301.12892": "|**2023-01-30**|**Quantifying and maximizing the information flux in recurrent neural networks**|Claus Metzner et.al.|[2301.12892v1](http://arxiv.org/abs/2301.12892v1)|null|\n", "2301.12156": "|**2023-03-23**|**Perspective: How to overcome dynamical density functional theory**|Daniel de las Heras et.al.|[2301.12156v2](http://arxiv.org/abs/2301.12156v2)|null|\n", "2301.11982": "|**2023-02-01**|**Strategy evolution on dynamic networks**|Qi Su et.al.|[2301.11982v2](http://arxiv.org/abs/2301.11982v2)|null|\n", "2301.10962": "|**2023-01-26**|**Scheduling Policy for Value-of-Information (VoI) in Trajectory Estimation for Digital Twins**|Van-Phuc Bui et.al.|[2301.10962v1](http://arxiv.org/abs/2301.10962v1)|null|\n", "2301.07849": "|**2023-01-19**|**Efficient Computation in Congested Anonymous Dynamic Networks**|Giuseppe A. Di Luna et.al.|[2301.07849v1](http://arxiv.org/abs/2301.07849v1)|null|\n", "2301.07515": "|**2023-01-15**|**Towards the development of Dynamic Networked Psychology Hypotheses**|Liaquat Hossain et.al.|[2301.07515v1](http://arxiv.org/abs/2301.07515v1)|null|\n", "2301.04904": "|**2023-01-12**|**Lesion-aware Dynamic Kernel for Polyp Segmentation**|Ruifei Zhang et.al.|[2301.04904v1](http://arxiv.org/abs/2301.04904v1)|**[link](https://github.com/reafly/ldnet)**|\n", "2301.04296": "|**2023-01-11**|**A degree-corrected Cox model for dynamic networks**|Yuguo Chen et.al.|[2301.04296v1](http://arxiv.org/abs/2301.04296v1)|null|\n", "2301.03965": "|**2023-01-10**|**BiCurNet: Pre-Movement EEG based Neural Decoder for Biceps Curl Trajectory Estimation**|Manali Saini et.al.|[2301.03965v1](http://arxiv.org/abs/2301.03965v1)|null|\n", "2301.01314": "|**2023-01-03**|**Network-theoretic modeling of fluid-structure interactions**|Aditya G. Nair et.al.|[2301.01314v1](http://arxiv.org/abs/2301.01314v1)|null|\n", "2212.12843": "|**2022-12-25**|**A Note on Improved Results for One Round Distributed Clique Listing**|Quanquan C. Liu et.al.|[2212.12843v1](http://arxiv.org/abs/2212.12843v1)|null|\n", "2212.12345": "|**2022-12-23**|**Piecewise-Velocity Model for Learning Continuous-time Dynamic Node Representations**|Abdulkadir \u00c7elikkanat et.al.|[2212.12345v1](http://arxiv.org/abs/2212.12345v1)|null|\n", "2212.12130": "|**2023-02-04**|**Learning to Detect and Segment for Open Vocabulary Object Detection**|Tao Wang et.al.|[2212.12130v2](http://arxiv.org/abs/2212.12130v2)|null|\n", "2212.09483": "|**2022-12-19**|**Adaptive Control of Client Selection and Gradient Compression for Efficient Federated Learning**|Zhida Jiang et.al.|[2212.09483v1](http://arxiv.org/abs/2212.09483v1)|null|\n", "2212.08358": "|**2022-12-16**|**Some recent trends in embeddings of time series and dynamic networks**|Dag Tj\u00f8stheim et.al.|[2212.08358v1](http://arxiv.org/abs/2212.08358v1)|null|\n", "2212.08314": "|**2023-01-30**|**Synchronization-preserving clusters in hypergraphs**|Anirban Banerjee et.al.|[2212.08314v2](http://arxiv.org/abs/2212.08314v2)|null|\n", "2212.08239": "|**2022-12-16**|**Discovering Structural Hole Spanners in Dynamic Networks via Graph Neural Networks**|Diksha Goel et.al.|[2212.08239v1](http://arxiv.org/abs/2212.08239v1)|null|\n", "2212.07961": "|**2022-12-15**|**Topological Data Analysis Detects Percolation Thresholds in Arctic Melt-Pond Evolution**|Wilfred Offord et.al.|[2212.07961v1](http://arxiv.org/abs/2212.07961v1)|**[link](https://github.com/wilfofford/tda-for-sea-ice-percolation)**|\n", "2212.05980": "|**2022-12-12**|**Evaluation of RGB-D SLAM in Large Indoor Environments**|Kirill Muravyev et.al.|[2212.05980v1](http://arxiv.org/abs/2212.05980v1)|null|\n", "2212.03999": "|**2022-12-07**|**On the application of dimensionality reduction and clustering algorithms for the classification of kinematic morphologies of galaxies**|M. S. Rosito et.al.|[2212.03999v1](http://arxiv.org/abs/2212.03999v1)|null|\n", "2212.02410": "|**2023-03-17**|**Antipodal Self-Duality for a Four-Particle Form Factor**|Lance J. Dixon et.al.|[2212.02410v2](http://arxiv.org/abs/2212.02410v2)|null|\n", "2212.02383": "|**2022-12-05**|**An Approach for Detecting Dynamic Communities in Social Networks**|Souaad Boudebza et.al.|[2212.02383v1](http://arxiv.org/abs/2212.02383v1)|**[link](https://github.com/Yquetzal/ECML_PKDD_2019)**|\n", "2212.01594": "|**2022-12-03**|**Parameterized temporal exploration problems**|Thomas Erlebach et.al.|[2212.01594v1](http://arxiv.org/abs/2212.01594v1)|null|\n", "2211.16726": "|**2022-11-30**|**Boosted Dynamic Neural Networks**|Haichao Yu et.al.|[2211.16726v1](http://arxiv.org/abs/2211.16726v1)|**[link](https://github.com/SHI-Labs/Boosted-Dynamic-Networks)**|\n", "2211.15301": "|**2022-11-28**|**Learning Coherent Clusters in Weakly-Connected Network Systems**|Hancheng Min et.al.|[2211.15301v1](http://arxiv.org/abs/2211.15301v1)|null|\n", "2211.15043": "|**2022-11-28**|**Higher-order Knowledge Transfer for Dynamic Community Detection with Great Changes**|Huixin Ma et.al.|[2211.15043v1](http://arxiv.org/abs/2211.15043v1)|null|\n", "2211.14560": "|**2023-01-24**|**A dynamic multi-region MFD model for ride-sourcing with ridesplitting**|Caio Vitor Beojone et.al.|[2211.14560v2](http://arxiv.org/abs/2211.14560v2)|null|\n", "2211.12589": "|**2022-11-22**|**Building Squares with Optimal State Complexity in Restricted Active Self-Assembly**|Robert M. Alaniz et.al.|[2211.12589v1](http://arxiv.org/abs/2211.12589v1)|**[link](https://github.com/asarg/autotile)**|\n", "2211.11876": "|**2022-11-21**|**Structural Modelling of Dynamic Networks and Identifying Maximum Likelihood**|Christian Gourieroux et.al.|[2211.11876v1](http://arxiv.org/abs/2211.11876v1)|null|\n", "2211.11352": "|**2023-01-30**|**Brief Announcement: Broadcasting Time in Dynamic Rooted Trees is Linear**|Antoine El-Hayek et.al.|[2211.11352v3](http://arxiv.org/abs/2211.11352v3)|null|\n", "2211.11069": "|**2022-11-20**|**Learning Nonlinear Couplings in Network of Agents from a Single Sample Trajectory**|Arash Amini et.al.|[2211.11069v1](http://arxiv.org/abs/2211.11069v1)|null|\n", "2211.10825": "|**2022-11-20**|**Identifiability of dynamic networks: the essential r\u00f4le of dources and dinks**|Eduardo Mapurunga et.al.|[2211.10825v1](http://arxiv.org/abs/2211.10825v1)|null|\n", "2211.10151": "|**2023-01-27**|**Asymptotically Tight Bounds on the Time Complexity of Broadcast and its Variants in Dynamic Networks**|Antoine El-Hayek et.al.|[2211.10151v2](http://arxiv.org/abs/2211.10151v2)|null|\n", "2211.09139": "|**2022-11-16**|**The Pandora project. I: the impact of radiation and cosmic rays on baryonic and dark matter properties of dwarf galaxies**|Sergio Martin-Alvarez et.al.|[2211.09139v1](http://arxiv.org/abs/2211.09139v1)|null|\n", "2211.08820": "|**2022-11-16**|**Computing-Aware Routing for LEO Satellite Networks: A Transmission and Computation Integration Approach**|Jiaqi Cao et.al.|[2211.08820v1](http://arxiv.org/abs/2211.08820v1)|null|\n", "2211.08700": "|**2023-02-14**|**Bi-directional Digital Twin and Edge Computing in the Metaverse**|Jiadong Yu et.al.|[2211.08700v2](http://arxiv.org/abs/2211.08700v2)|null|\n", "2211.08639": "|**2022-11-16**|**Hierarchical Dynamic Image Harmonization**|Haoxing Chen et.al.|[2211.08639v1](http://arxiv.org/abs/2211.08639v1)|**[link](https://github.com/chenhaoxing/hdnet)**|\n", "2211.08378": "|**2022-11-15**|**Anomaly Detection in Multiplex Dynamic Networks: from Blockchain Security to Brain Disease Prediction**|Ali Behrouz et.al.|[2211.08378v1](http://arxiv.org/abs/2211.08378v1)|**[link](https://github.com/ubc-systopia/anomuly)**|\n", "2211.09664": "|**2022-11-15**|**Influencer Detection with Dynamic Graph Neural Networks**|Elena Tiukhova et.al.|[2211.09664v1](http://arxiv.org/abs/2211.09664v1)|**[link](https://github.com/banking-analytics-lab/dynamicgraphlearning)**|\n", "2211.07570": "|**2022-11-14**|**Tides Need STEMMED: A Locally Operating Spatio-Temporal Mutually Exciting Point Process with Dynamic Network for Improving Opioid Overdose Death Prediction**|Che-Yi Liao et.al.|[2211.07570v1](http://arxiv.org/abs/2211.07570v1)|null|\n", "2211.07449": "|**2022-11-14**|**Dual-based Online Learning of Dynamic Network Topologies**|Seyed Saman Saboksayr et.al.|[2211.07449v1](http://arxiv.org/abs/2211.07449v1)|null|\n", "2302.12759": "|**2023-02-24**|**Modularity-based approach for tracking communities in dynamic social networks**|Michele Mazza et.al.|[2302.12759v1](http://arxiv.org/abs/2302.12759v1)|null|\n", "2302.13629": "|**2023-02-27**|**Estimation of continuous environments by robot swarms: Correlated networks and decision-making**|Mohsen Raoufi et.al.|[2302.13629v1](http://arxiv.org/abs/2302.13629v1)|null|\n", "2302.13292": "|**2023-02-26**|**Discovering Top-k Structural Hole Spanners in Dynamic Networks**|Diksha Goel et.al.|[2302.13292v1](http://arxiv.org/abs/2302.13292v1)|null|\n", "2211.05668": "|**2022-12-07**|**Mapping the Milky Way Disk with Gaia DR3: 3D extended kinematic maps and rotation curve to $\\approx 30$ kpc**|Hai-Feng Wang et.al.|[2211.05668v2](http://arxiv.org/abs/2211.05668v2)|null|\n", "2211.01538": "|**2023-03-12**|**$D^2$SLAM: Decentralized and Distributed Collaborative Visual-inertial SLAM System for Aerial Swarm**|Hao Xu et.al.|[2211.01538v3](http://arxiv.org/abs/2211.01538v3)|**[link](https://github.com/hkust-aerial-robotics/d2slam)**|\n", "2210.14842": "|**2022-10-26**|**Continuum Robot State Estimation Using Gaussian Process Regression on $SE(3)$**|Sven Lilge et.al.|[2210.14842v1](http://arxiv.org/abs/2210.14842v1)|null|\n", "2210.04572": "|**2022-10-10**|**Floorplan-Aware Camera Poses Refinement**|Anna Sokolova et.al.|[2210.04572v1](http://arxiv.org/abs/2210.04572v1)|null|\n", "2210.03412": "|**2022-10-07**|**The Trajectory PHD Filter for Coexisting Point and Extended Target Tracking**|Shaoxiu Wei et.al.|[2210.03412v1](http://arxiv.org/abs/2210.03412v1)|null|\n", "2210.01320": "|**2022-11-23**|**Wi-Closure: Reliable and Efficient Search of Inter-robot Loop Closures Using Wireless Sensing**|Weiying Wang et.al.|[2210.01320v2](http://arxiv.org/abs/2210.01320v2)|null|\n", "2209.09723": "|**2023-02-22**|**GANet: Goal Area Network for Motion Forecasting**|Mingkun Wang et.al.|[2209.09723v3](http://arxiv.org/abs/2209.09723v3)|**[link](https://github.com/kingwmk/ganet)**|\n", "2212.03441": "|**2023-03-23**|**Higher topological complexity of a map**|Cesar A. Ipanaque Zapata et.al.|[2212.03441v2](http://arxiv.org/abs/2212.03441v2)|null|\n", "2304.09043": "|**2023-05-16**|**Continuous-Time Range-Only Pose Estimation**|Abhishek Goudar et.al.|[2304.09043v2](http://arxiv.org/abs/2304.09043v2)|null|\n", "2304.11694": "|**2023-04-25**|**Vehicle State Estimation and Prediction**|Xinchen Li et.al.|[2304.11694v2](http://arxiv.org/abs/2304.11694v2)|null|\n", "2306.01188": "|**2023-09-12**|**Event-based Stereo Visual Odometry with Native Temporal Resolution via Continuous-time Gaussian Process Regression**|Jianeng Wang et.al.|[2306.01188v5](http://arxiv.org/abs/2306.01188v5)|null|\n", "2306.01056": "|**2023-06-01**|**ERGO-ML: Towards a robust machine learning model for inferring the fraction of accreted stars in galaxies from integral-field spectroscopic maps**|Eirini Angeloudi et.al.|[2306.01056v1](http://arxiv.org/abs/2306.01056v1)|null|\n", "2306.11091": "|**2023-06-19**|**Composite Bulges -- IV. Detecting Signatures of Gas Inflows in the IFU data: The MUSE View of Ionized Gas Kinematics in NGC 1097**|Tutku Kolcu et.al.|[2306.11091v1](http://arxiv.org/abs/2306.11091v1)|null|\n", "2306.14573": "|**2023-06-26**|**Hydrodynamic simulations of the Disk of Gas Around Supermassive black holes (HDGAS) -I; Molecular Gas Dynamics**|Mojtaba Raouf et.al.|[2306.14573v1](http://arxiv.org/abs/2306.14573v1)|null|\n", "2307.00728": "|**2023-07-03**|**A new approach to QCD evolution in processes with massive partons**|Benoit Assi et.al.|[2307.00728v1](http://arxiv.org/abs/2307.00728v1)|null|\n", "2307.03207": "|**2023-07-06**|**H$\u03b1$ Kinematics of Superbubbles and Supernova Remnants of the Dwarf galaxy NGC 4214**|M. S\u00e1nchez-Cruces et.al.|[2307.03207v1](http://arxiv.org/abs/2307.03207v1)|null|\n", "2307.10381": "|**2023-07-19**|**Accelerating galaxy dynamical modeling using a neural network for joint lensing and kinematics analyses**|Matthew R. Gomer et.al.|[2307.10381v1](http://arxiv.org/abs/2307.10381v1)|null|\n", "2307.14125": "|**2023-07-26**|**Multi-IMU Proprioceptive State Estimator for Humanoid Robots**|Fabio Elnecave Xavier et.al.|[2307.14125v1](http://arxiv.org/abs/2307.14125v1)|null|\n", "2308.04071": "|**2023-08-08**|**Path Signatures for Diversity in Probabilistic Trajectory Optimisation**|Lucas Barcelos et.al.|[2308.04071v1](http://arxiv.org/abs/2308.04071v1)|null|\n", "2308.08654": "|**2023-08-16**|**Advancing Brain-Computer Interface System Performance in Hand Trajectory Estimation with NeuroKinect**|Sidharth Pancholi et.al.|[2308.08654v1](http://arxiv.org/abs/2308.08654v1)|null|\n", "2308.11493": "|**2023-08-22**|**Looking into the faintEst WIth MUSE (LEWIS): on the nature of ultra-diffuse galaxies in the Hydra-I cluster.I. Project description and preliminary results**|Enrichetta Iodice et.al.|[2308.11493v1](http://arxiv.org/abs/2308.11493v1)|null|\n", "2308.12418": "|**2023-08-23**|**Certifiably Optimal Rotation and Pose Estimation Based on the Cayley Map**|Timothy D Barfoot et.al.|[2308.12418v1](http://arxiv.org/abs/2308.12418v1)|null|\n", "2308.16620": "|**2023-08-31**|**GA-NIFS: JWST/NIRSpec IFU observations of HFLS3 reveal a dense galaxy group at z~6.3**|G. C. Jones et.al.|[2308.16620v1](http://arxiv.org/abs/2308.16620v1)|null|\n", "2309.03396": "|**2023-09-06**|**Detection of open cluster rotation fields from Gaia EDR3 proper motions**|Pedro Guilherme-Garcia et.al.|[2309.03396v1](http://arxiv.org/abs/2309.03396v1)|null|\n", "2309.06792": "|**2023-09-13**|**Motion-Bias-Free Feature-Based SLAM**|Alejandro Fontan et.al.|[2309.06792v1](http://arxiv.org/abs/2309.06792v1)|null|\n", "2309.09808": "|**2023-09-18**|**Coco-LIC: Continuous-Time Tightly-Coupled LiDAR-Inertial-Camera Odometry using Non-Uniform B-spline**|Xiaolei Lang et.al.|[2309.09808v1](http://arxiv.org/abs/2309.09808v1)|**[link](https://github.com/april-zju/coco-lic)**|\n", "2309.09011": "|**2023-09-16**|**Optimal Initialization Strategies for Range-Only Trajectory Estimation**|Abhishek Goudar et.al.|[2309.09011v1](http://arxiv.org/abs/2309.09011v1)|null|\n", "2309.08780": "|**2023-09-15**|**Simultaneous Trajectory Estimation and Mapping for Autonomous Underwater Proximity Operations**|Aldo Ter\u00e1n Espinoza et.al.|[2309.08780v1](http://arxiv.org/abs/2309.08780v1)|null|\n", "2309.11134": "|**2023-09-20**|**GNSS/Multi-Sensor Fusion Using Continuous-Time Factor Graph Optimization for Robust Localization**|Haoming Zhang et.al.|[2309.11134v1](http://arxiv.org/abs/2309.11134v1)|**[link](https://github.com/rwth-irt/gnssfgo)**|\n", "2309.15065": "|**2023-09-26**|**Language-EXtended Indoor SLAM (LEXIS): A Versatile System for Real-time Visual Scene Understanding**|Christina Kassab et.al.|[2309.15065v1](http://arxiv.org/abs/2309.15065v1)|null|\n", "2310.03353": "|**2023-10-05**|**Deep Geometric Learning with Monotonicity Constraints for Alzheimer's Disease Progression**|Seungwoo Jeong et.al.|[2310.03353v1](http://arxiv.org/abs/2310.03353v1)|null|\n", "2310.06249": "|**2023-10-10**|**l-dyno: framework to learn consistent visual features using robot's motion**|Kartikeya Singh et.al.|[2310.06249v1](http://arxiv.org/abs/2310.06249v1)|null|\n", "2310.10723": "|**2023-10-16**|**Kinematical coherence between satellite galaxies and host stellar discs for MaNGA & SAMI galaxies**|Sen Wang et.al.|[2310.10723v1](http://arxiv.org/abs/2310.10723v1)|null|\n", "2310.12776": "|**2023-10-19**|**First holistic modelling of meteoroid ablation and fragmentation: A case study of the Orionids recorded by the Canadian Automated Meteor Observatory**|Denis Vida et.al.|[2310.12776v1](http://arxiv.org/abs/2310.12776v1)|null|\n", "2310.14506": "|**2023-10-23**|**Label Space Partition Selection for Multi-Object Tracking Using Two-Layer Partitioning**|Ji Youn Lee et.al.|[2310.14506v1](http://arxiv.org/abs/2310.14506v1)|null|\n"}, "Map fusion": {"2302.11106": "|**2023-02-22**|**Multi-Head Feature Pyramid Networks for Breast Mass Detection**|Hexiang Zhang et.al.|[2302.11106v1](http://arxiv.org/abs/2302.11106v1)|null|\n", "2301.09213": "|**2023-01-24**|**FRAME: Fast and Robust Autonomous 3D point cloud Map-merging for Egocentric multi-robot exploration**|Nikolaos Stathoulopoulos et.al.|[2301.09213v2](http://arxiv.org/abs/2301.09213v2)|null|\n", "2212.01538": "|**2022-12-03**|**Multi-resolution Monocular Depth Map Fusion by Self-supervised Gradient-based Composition**|Yaqiao Dai et.al.|[2212.01538v1](http://arxiv.org/abs/2212.01538v1)|**[link](https://github.com/yuinsky/gradient-based-depth-map-fusion)**|\n", "2211.03423": "|**2022-11-07**|**Detecting Invalid Map Merges in Lifelong SLAM**|Matthias Holoch et.al.|[2211.03423v1](http://arxiv.org/abs/2211.03423v1)|null|\n", "2209.10775": "|**2022-09-22**|**MUI-TARE: Multi-Agent Cooperative Exploration with Unknown Initial Position**|Jingtian Yan et.al.|[2209.10775v1](http://arxiv.org/abs/2209.10775v1)|null|\n", "2209.08988": "|**2022-09-19**|**MSA-GCN:Multiscale Adaptive Graph Convolution Network for Gait Emotion Recognition**|Yunfei Yin et.al.|[2209.08988v1](http://arxiv.org/abs/2209.08988v1)|null|\n", "2209.03096": "|**2022-09-07**|**Spherical wedge billiard: from chaos to fractals and Talbot carpets**|Tom\u00e1\u0161 Tyc et.al.|[2209.03096v1](http://arxiv.org/abs/2209.03096v1)|null|\n", "2208.06293": "|**2022-08-12**|**dual unet:a novel siamese network for change detection with cascade differential fusion**|Kaixuan Jiang et.al.|[2208.06293v1](http://arxiv.org/abs/2208.06293v1)|null|\n", "2207.09210": "|**2023-10-23**|**KinD-LCE Curve Estimation And Retinex Fusion On Low-Light Image**|Xiaochun Lei et.al.|[2207.09210v3](http://arxiv.org/abs/2207.09210v3)|null|\n", "2207.06965": "|**2023-06-27**|**AutoMerge: A Framework for Map Assembling and Smoothing in City-scale Environments**|Peng Yin et.al.|[2207.06965v4](http://arxiv.org/abs/2207.06965v4)|null|\n", "2203.00436": "|**2022-03-01**|**Boundary Corrected Multi-scale Fusion Network for Real-time Semantic Segmentation**|Tianjiao Jiang et.al.|[2203.00436v1](http://arxiv.org/abs/2203.00436v1)|null|\n", "2202.08498": "|**2022-02-17**|**Mirror-Yolo: An attention-based instance segmentation and detection model for mirrors**|Fengze Li et.al.|[2202.08498v1](http://arxiv.org/abs/2202.08498v1)|null|\n", "2201.11937": "|**2022-01-28**|**Stereo Matching with Cost Volume based Sparse Disparity Propagation**|Wei Xue et.al.|[2201.11937v1](http://arxiv.org/abs/2201.11937v1)|null|\n", "2201.10152": "|**2022-01-29**|**Unsupervised Image Fusion Method based on Feature Mutual Mapping**|Dongyu Rao et.al.|[2201.10152v2](http://arxiv.org/abs/2201.10152v2)|null|\n", "2112.13222": "|**2022-01-24**|**Edge Robotics: Edge-Computing-Accelerated Multi-Robot Simultaneous Localization and Mapping**|Peng Huang et.al.|[2112.13222v2](http://arxiv.org/abs/2112.13222v2)|null|\n", "2112.11044": "|**2021-12-21**|**Extending Merge Resolution to a Family of Proof Systems**|Sravanthi Chede et.al.|[2112.11044v1](http://arxiv.org/abs/2112.11044v1)|null|\n", "2111.14990": "|**2021-11-29**|**MIXER: A Principled Framework for Multimodal, Multiway Data Association**|Parker C. Lusk et.al.|[2111.14990v1](http://arxiv.org/abs/2111.14990v1)|null|\n", "2110.12338": "|**2021-10-24**|**Quality Map Fusion for Adversarial Learning**|Uche Osahor et.al.|[2110.12338v1](http://arxiv.org/abs/2110.12338v1)|null|\n", "2110.08172": "|**2021-10-18**|**MLFC: From 10 to 50 Planners in the Multi-Agent Programming Contest**|Rafael C. Cardoso et.al.|[2110.08172v2](http://arxiv.org/abs/2110.08172v2)|null|\n", "2110.06697": "|**2021-10-13**|**Semantic Image Fusion**|P. R. Hill et.al.|[2110.06697v1](http://arxiv.org/abs/2110.06697v1)|null|\n", "2110.06436": "|**2021-10-13**|**Non-local Recurrent Regularization Networks for Multi-view Stereo**|Qingshan Xu et.al.|[2110.06436v1](http://arxiv.org/abs/2110.06436v1)|null|\n", "2108.08623": "|**2021-08-19**|**VolumeFusion: Deep Depth Fusion for 3D Scene Reconstruction**|Jaesung Choe et.al.|[2108.08623v1](http://arxiv.org/abs/2108.08623v1)|null|\n", "2106.11515": "|**2021-06-23**|**Cooperative mmWave PHD-SLAM with Moving Scatterers**|Hyowon Kim et.al.|[2106.11515v2](http://arxiv.org/abs/2106.11515v2)|null|\n", "2106.10220": "|**2021-06-18**|**Semantic navigation with domain knowledge**|Rafael Gomes Braga et.al.|[2106.10220v1](http://arxiv.org/abs/2106.10220v1)|null|\n", "2106.04512": "|**2021-06-22**|**Formal Verification of a Map Merging Protocol in the Multi-Agent Programming Contest**|Matt Luckcuck et.al.|[2106.04512v2](http://arxiv.org/abs/2106.04512v2)|null|\n", "2105.14994": "|**2021-05-31**|**MAOMaps: A Photo-Realistic Benchmark For vSLAM and Map Merging Quality Assessment**|Andrey Bokovoy et.al.|[2105.14994v1](http://arxiv.org/abs/2105.14994v1)|**[link](https://github.com/CnnDepth/MAOMaps)**|\n", "2103.13246": "|**2021-03-24**|**Generic Merging of Structure from Motion Maps with a Low Memory Footprint**|Gabrielle Flood et.al.|[2103.13246v1](http://arxiv.org/abs/2103.13246v1)|null|\n", "2103.03786": "|**2022-09-22**|**Distributed Dynamic Map Fusion via Federated Learning for Intelligent Networked Vehicles**|Zijian Zhang et.al.|[2103.03786v3](http://arxiv.org/abs/2103.03786v3)|**[link](https://github.com/zijianzhang/CARLA_INVS)**|\n", "2102.10929": "|**2021-02-22**|**Deep Learning for Robust Motion Segmentation with Non-Static Cameras**|Markus Bosch et.al.|[2102.10929v1](http://arxiv.org/abs/2102.10929v1)|null|\n", "2012.10658": "|**2021-02-24**|**Generalize a Small Pre-trained Model to Arbitrarily Large TSP Instances**|Zhang-Hua Fu et.al.|[2012.10658v2](http://arxiv.org/abs/2012.10658v2)|**[link](https://github.com/Spider-scnu/TSP)**|\n", "2011.14791": "|**2021-06-08**|**NeuralFusion: Online Depth Fusion in Latent Space**|Silvan Weder et.al.|[2011.14791v2](http://arxiv.org/abs/2011.14791v2)|**[link](https://github.com/weders/NeuralFusion)**|\n", "2011.03975": "|**2020-11-11**|**Mapless-Planner: A Robust and Fast Planning Framework for Aggressive Autonomous Flight without Map Fusion**|Jialin Ji et.al.|[2011.03975v2](http://arxiv.org/abs/2011.03975v2)|null|\n", "2010.03026": "|**2020-11-16**|**Place Recognition in Forests with Urquhart Tessellations**|Guilherme V. Nardari et.al.|[2010.03026v2](http://arxiv.org/abs/2010.03026v2)|**[link](https://github.com/gnardari/urquhart)**|\n", "2009.05819": "|**2020-09-12**|**Map-merging Algorithms for Visual SLAM: Feasibility Study and Empirical Evaluation**|Andrey Bokovoy et.al.|[2009.05819v1](http://arxiv.org/abs/2009.05819v1)|null|\n", "2007.14177": "|**2020-07-28**|**Generative networks as inverse problems with fractional wavelet scattering networks**|Jiasong Wu et.al.|[2007.14177v1](http://arxiv.org/abs/2007.14177v1)|null|\n", "2007.02295": "|**2020-07-05**|**Multi view stereo with semantic priors**|Elisavet Konstantina Stathopoulou et.al.|[2007.02295v1](http://arxiv.org/abs/2007.02295v1)|null|\n", "2007.02108": "|**2020-07-04**|**SplitFusion: Simultaneous Tracking and Mapping for Non-Rigid Scenes**|Yang Li et.al.|[2007.02108v1](http://arxiv.org/abs/2007.02108v1)|null|\n", "2006.00420": "|**2020-05-31**|**VIR-SLAM: Visual, Inertial, and Ranging SLAM for single and multi-robot systems**|Yanjun Cao et.al.|[2006.00420v1](http://arxiv.org/abs/2006.00420v1)|null|\n", "2002.10342": "|**2020-02-24**|**Comparing View-Based and Map-Based Semantic Labelling in Real-Time SLAM**|Zoe Landgraf et.al.|[2002.10342v1](http://arxiv.org/abs/2002.10342v1)|null|\n", "2001.09796": "|**2020-01-16**|**Knowledge Integration of Collaborative Product Design Using Cloud Computing Infrastructure**|Mahdi Bohlouli et.al.|[2001.09796v1](http://arxiv.org/abs/2001.09796v1)|null|\n", "2001.04388": "|**2020-04-03**|**RoutedFusion: Learning Real-time Depth Map Fusion**|Silvan Weder et.al.|[2001.04388v2](http://arxiv.org/abs/2001.04388v2)|**[link](https://github.com/weders/RoutedFusion)**|\n", "1909.00703": "|**2019-09-02**|**Learned Semantic Multi-Sensor Depth Map Fusion**|Denys Rozumnyi et.al.|[1909.00703v1](http://arxiv.org/abs/1909.00703v1)|null|\n", "1908.11585": "|**2019-08-30**|**ORBSLAM-Atlas: a robust and accurate multi-map system**|Richard Elvira et.al.|[1908.11585v1](http://arxiv.org/abs/1908.11585v1)|null|\n", "1908.10541": "|**2020-06-07**|**Search and Rescue under the Forest Canopy using Multiple UAVs**|Yulun Tian et.al.|[1908.10541v2](http://arxiv.org/abs/1908.10541v2)|null|\n", "1908.09806": "|**2020-02-26**|**5G mmWave Cooperative Positioning and Mapping using Multi-Model PHD Filter and Map Fusion**|Hyowon Kim et.al.|[1908.09806v3](http://arxiv.org/abs/1908.09806v3)|**[link](https://github.com/HyowonKim-P1/5GmmWavePHDFilterMapFusion)**|\n", "1905.11257": "|**2019-05-27**|**IRAS23385+6053: An embedded massive cluster in the making**|R. Cesaroni et.al.|[1905.11257v1](http://arxiv.org/abs/1905.11257v1)|null|\n", "1812.08402": "|**2018-12-20**|**SFA: Small Faces Attention Face Detector**|Shi Luo et.al.|[1812.08402v1](http://arxiv.org/abs/1812.08402v1)|**[link](https://github.com/shiluo1990/SFA)**|\n", "1811.07632": "|**2018-11-21**|**Collaborative Dense SLAM**|Louis Gallagher et.al.|[1811.07632v2](http://arxiv.org/abs/1811.07632v2)|null|\n", "1810.00457": "|**2019-03-14**|**AgriColMap: Aerial-Ground Collaborative 3D Mapping for Precision Farming**|Ciro Potena et.al.|[1810.00457v2](http://arxiv.org/abs/1810.00457v2)|null|\n", "1809.09646": "|**2019-03-05**|**Efficient Constellation-Based Map-Merging for Semantic SLAM**|Kristoffer M. Frey et.al.|[1809.09646v2](http://arxiv.org/abs/1809.09646v2)|null|\n", "2306.15416": "|**2023-07-04**|**Irregular Change Detection in Sparse Bi-Temporal Point Clouds using Learned Place Recognition Descriptors and Point-to-Voxel Comparison**|Nikolaos Stathoulopoulos et.al.|[2306.15416v2](http://arxiv.org/abs/2306.15416v2)|null|\n", "2307.00500": "|**2023-07-02**|**CQLite: Communication-Efficient Multi-Robot Exploration Using Coverage-biased Distributed Q-Learning**|Ehsan Latif et.al.|[2307.00500v1](http://arxiv.org/abs/2307.00500v1)|null|\n", "2212.08334": "|**2023-07-10**|**Lightweight integration of 3D features to improve 2D image segmentation**|Olivier Pradelle et.al.|[2212.08334v2](http://arxiv.org/abs/2212.08334v2)|**[link](https://github.com/opradelle/2dguidedlight3d)**|\n", "2307.07126": "|**2023-07-14**|**Multi-Session, Localization-oriented and Lightweight LiDAR Mapping Using Semantic Lines and Planes**|Zehuan Yu et.al.|[2307.07126v1](http://arxiv.org/abs/2307.07126v1)|null|\n", "2308.02674": "|**2023-08-04**|**Group-$k$ consistent measurement set maximization via maximum clique over k-Uniform hypergraphs for robust multi-robot map merging**|Brendon Forsgren et.al.|[2308.02674v1](http://arxiv.org/abs/2308.02674v1)|**[link](https://bitbucket.org/jmangelson/gkcm)**|\n", "2308.08715": "|**2023-08-17**|**V-FUSE: Volumetric Depth Map Fusion with Long-Range Constraints**|Nathaniel Burgdorfer et.al.|[2308.08715v1](http://arxiv.org/abs/2308.08715v1)|**[link](https://github.com/nburgdorfer/v-fuse)**|\n", "2311.03146": "|**2023-11-06**|**Enabling In-Situ Resources Utilisation by leveraging collaborative robotics and astronaut-robot interaction**|Silvia Romero-Azpitarte et.al.|[2311.03146v1](http://arxiv.org/abs/2311.03146v1)|null|\n"}, "MultiModality": {"2302.12248": "|**2023-02-23**|**Learning Visual Representations via Language-Guided Sampling**|Mohamed El Banani et.al.|[2302.12248v1](http://arxiv.org/abs/2302.12248v1)|**[link](https://github.com/mbanani/lgssl)**|\n", "2302.11939": "|**2023-02-23**|**Power Time Series Forecasting by Pretrained LM**|Tian Zhou et.al.|[2302.11939v1](http://arxiv.org/abs/2302.11939v1)|**[link](https://github.com/damo-di-ml/one_fits_all)**|\n", "2302.11713": "|**2023-02-24**|**Can Pre-trained Vision and Language Models Answer Visual Information-Seeking Questions?**|Yang Chen et.al.|[2302.11713v2](http://arxiv.org/abs/2302.11713v2)|**[link](https://github.com/edchengg/infoseek_eval)**|\n", "2302.11529": "|**2023-02-22**|**Modular Deep Learning**|Jonas Pfeiffer et.al.|[2302.11529v1](http://arxiv.org/abs/2302.11529v1)|null|\n", "2302.11458": "|**2023-02-22**|**Fusing Visual Appearance and Geometry for Multi-modality 6DoF Object Tracking**|Manuel Stoiber et.al.|[2302.11458v1](http://arxiv.org/abs/2302.11458v1)|**[link](https://github.com/dlr-rm/3dobjecttracking)**|\n", "2302.11352": "|**2023-02-22**|**X-TRA: Improving Chest X-ray Tasks with Cross-Modal Retrieval Augmentation**|Tom van Sonsbeek et.al.|[2302.11352v1](http://arxiv.org/abs/2302.11352v1)|null|\n", "2302.11254": "|**2023-02-22**|**Cross-modal Audio-visual Co-learning for Text-independent Speaker Verification**|Meng Liu et.al.|[2302.11254v1](http://arxiv.org/abs/2302.11254v1)|**[link](https://github.com/danielmengliu/audiovisuallip)**|\n", "2302.11154": "|**2023-02-24**|**Open-domain Visual Entity Recognition: Towards Recognizing Millions of Wikipedia Entities**|Hexiang Hu et.al.|[2302.11154v2](http://arxiv.org/abs/2302.11154v2)|**[link](https://github.com/edchengg/oven_eval)**|\n", "2302.11097": "|**2023-02-22**|**A Multi-Modal Neural Geometric Solver with Textual Clauses Parsed from Diagram**|Ming-Liang Zhang et.al.|[2302.11097v1](http://arxiv.org/abs/2302.11097v1)|**[link](https://github.com/mingliangzhang2018/pgps)**|\n", "2302.11082": "|**2023-02-22**|**BB-GCN: A Bi-modal Bridged Graph Convolutional Network for Multi-label Chest X-Ray Recognition**|Guoli Wang et.al.|[2302.11082v1](http://arxiv.org/abs/2302.11082v1)|null|\n", "2302.11025": "|**2023-02-21**|**Asteroseismology of $\u03b4$ Scuti stars: emulating model grids using a neural network**|Owen J. Scutt et.al.|[2302.11025v1](http://arxiv.org/abs/2302.11025v1)|null|\n", "2302.11021": "|**2023-02-21**|**MVMTnet: A Multi-variate Multi-modal Transformer for Multi-class Classification of Cardiac Irregularities Using ECG Waveforms and Clinical Notes**|Ankur Samanta et.al.|[2302.11021v1](http://arxiv.org/abs/2302.11021v1)|null|\n", "2302.10873": "|**2023-02-21**|**Context-Aware Timewise VAEs for Real-Time Vehicle Trajectory Prediction**|Pei Xu et.al.|[2302.10873v1](http://arxiv.org/abs/2302.10873v1)|**[link](https://github.com/xupei0610/contextvae)**|\n", "2302.10859": "|**2023-02-21**|**SF2Former: Amyotrophic Lateral Sclerosis Identification From Multi-center MRI Data Using Spatial and Frequency Fusion Transformer**|Rafsanjany Kushol et.al.|[2302.10859v1](http://arxiv.org/abs/2302.10859v1)|**[link](https://github.com/raoyongming/GFNet)**|\n", "2302.10813": "|**2023-02-21**|**Tracking Objects and Activities with Attention for Temporal Sentence Grounding**|Zeyu Xiong et.al.|[2302.10813v1](http://arxiv.org/abs/2302.10813v1)|null|\n", "2302.10632": "|**2023-02-23**|**Multi-Modal Self-Supervised Learning for Recommendation**|Wei Wei et.al.|[2302.10632v2](http://arxiv.org/abs/2302.10632v2)|**[link](https://github.com/hkuds/mmssl)**|\n", "2302.10511": "|**2023-02-21**|**MVFusion: Multi-View 3D Object Detection with Semantic-aligned Radar and Camera Fusion**|Zizhang Wu et.al.|[2302.10511v1](http://arxiv.org/abs/2302.10511v1)|null|\n", "2302.10465": "|**2023-02-21**|**A Flexible Multi-view Multi-modal Imaging System for Outdoor Scenes**|Meng Zhang et.al.|[2302.10465v1](http://arxiv.org/abs/2302.10465v1)|null|\n", "2302.10035": "|**2023-02-20**|**Large-scale Multi-Modal Pre-trained Models: A Comprehensive Survey**|Xiao Wang et.al.|[2302.10035v1](http://arxiv.org/abs/2302.10035v1)|**[link](https://github.com/wangxiao5791509/multimodal_bigmodels_survey)**|\n", "2302.09934": "|**2023-02-20**|**CISum: Learning Cross-modality Interaction to Enhance Multimodal Semantic Coverage for Multimodal Summarization**|Litian Zhang et.al.|[2302.09934v1](http://arxiv.org/abs/2302.09934v1)|null|\n", "2302.09850": "|**2023-02-20**|**Constraint and Union for Partially-Supervised Temporal Sentence Grounding**|Chen Ju et.al.|[2302.09850v1](http://arxiv.org/abs/2302.09850v1)|null|\n", "2302.09636": "|**2023-02-19**|**Interpretable Medical Image Visual Question Answering via Multi-Modal Relationship Graph Learning**|Xinyue Hu et.al.|[2302.09636v1](http://arxiv.org/abs/2302.09636v1)|null|\n", "2302.09328": "|**2023-02-18**|**SSVMR: Saliency-based Self-training for Video-Music Retrieval**|Xuxin Cheng et.al.|[2302.09328v1](http://arxiv.org/abs/2302.09328v1)|null|\n", "2302.08958": "|**2023-02-17**|**Towards Unifying Medical Vision-and-Language Pre-training via Soft Prompts**|Zhihong Chen et.al.|[2302.08958v1](http://arxiv.org/abs/2302.08958v1)|**[link](https://github.com/zhjohnchan/ptunifier)**|\n", "2302.08888": "|**2023-02-17**|**Multimodal Federated Learning via Contrastive Representation Ensemble**|Qiying Yu et.al.|[2302.08888v1](http://arxiv.org/abs/2302.08888v1)|**[link](https://github.com/flair-thu/creamfl)**|\n", "2302.08820": "|**2023-02-17**|**Understanding Stationary and Moving Direct Skin Vibrotactile Stimulation on the Palm**|Hesham Elsayed et.al.|[2302.08820v1](http://arxiv.org/abs/2302.08820v1)|null|\n", "2302.08774": "|**2023-02-17**|**Vision, Deduction and Alignment: An Empirical Study on Multi-modal Knowledge Graph Alignment**|Yangning Li et.al.|[2302.08774v1](http://arxiv.org/abs/2302.08774v1)|null|\n", "2302.08706": "|**2023-02-20**|**Fine-grained Cross-modal Fusion based Refinement for Text-to-Image Synthesis**|Haoran Sun et.al.|[2302.08706v2](http://arxiv.org/abs/2302.08706v2)|**[link](https://github.com/haoranhfut/ff-gan)**|\n", "2302.08670": "|**2023-02-17**|**Cascaded information enhancement and cross-modal attention feature fusion for multispectral pedestrian detection**|Yang Yang et.al.|[2302.08670v1](http://arxiv.org/abs/2302.08670v1)|null|\n", "2302.09302": "|**2023-02-16**|**Bridge the Gap between Language models and Tabular Understanding**|Nuo Chen et.al.|[2302.09302v1](http://arxiv.org/abs/2302.09302v1)|null|\n", "2302.08326": "|**2023-02-16**|**NUAA-QMUL-AIIT at Memotion 3: Multi-modal Fusion with Squeeze-and-Excitation for Internet Meme Emotion Analysis**|Xiaoyu Guo et.al.|[2302.08326v1](http://arxiv.org/abs/2302.08326v1)|**[link](https://github.com/xxxxxxxxy/memotion3-SEFusion)**|\n", "2302.08212": "|**2023-02-16**|**Visible-Infrared Person Re-Identification via Patch-Mixed Cross-Modality Learning**|Zhihao Qian et.al.|[2302.08212v1](http://arxiv.org/abs/2302.08212v1)|null|\n", "2302.08180": "|**2023-02-16**|**Cross Modal Distillation for Flood Extent Mapping**|Shubhika Garg et.al.|[2302.08180v1](http://arxiv.org/abs/2302.08180v1)|null|\n", "2302.08052": "|**2023-02-16**|**Hierarchical Cross-modal Transformer for RGB-D Salient Object Detection**|Hao Chen et.al.|[2302.08052v1](http://arxiv.org/abs/2302.08052v1)|null|\n", "2302.08020": "|**2023-02-16**|**All-Electrical Skyrmionic Bits in a Chiral Magnetic Tunnel Junction**|Shaohai Chen et.al.|[2302.08020v1](http://arxiv.org/abs/2302.08020v1)|null|\n", "2302.08016": "|**2023-02-16**|**Unsupervised Domain Adaptation for MRI Volume Segmentation and Classification Using Image-to-Image Translation**|Satoshi Kondo et.al.|[2302.08016v1](http://arxiv.org/abs/2302.08016v1)|null|\n", "2302.07919": "|**2023-02-15**|**COVID-VTS: Fact Extraction and Verification on Short Video Platforms**|Fuxiao Liu et.al.|[2302.07919v1](http://arxiv.org/abs/2302.07919v1)|**[link](https://github.com/fuxiaoliu/twitter-video-dataset)**|\n", "2302.07702": "|**2023-02-15**|**Audio-Visual Contrastive Learning with Temporal Self-Supervision**|Simon Jenni et.al.|[2302.07702v1](http://arxiv.org/abs/2302.07702v1)|null|\n", "2302.07693": "|**2023-02-16**|**Fine-tuning of sign language recognition models: a technical report**|Maxim Novopoltsev et.al.|[2302.07693v2](http://arxiv.org/abs/2302.07693v2)|**[link](https://github.com/ds-hub-sochi/sl-techreport)**|\n", "2302.07661": "|**2023-02-15**|**Depth- and Semantics-aware Multi-modal Domain Translation: Generating 3D Panoramic Color Images from LiDAR Point Clouds**|Tiago Cortinhal et.al.|[2302.07661v1](http://arxiv.org/abs/2302.07661v1)|**[link](https://github.com/tiagocortinhal/titan-next)**|\n", "2302.07456": "|**2023-02-15**|**Continuous-Time Fixed-Lag Smoothing for LiDAR-Inertial-Camera SLAM**|Jiajun Lv et.al.|[2302.07456v1](http://arxiv.org/abs/2302.07456v1)|**[link](https://github.com/april-zju/clic)**|\n", "2302.07269": "|**2023-02-14**|**Dual-mode adaptive-SVD ghost imaging**|Dajing Wang et.al.|[2302.07269v1](http://arxiv.org/abs/2302.07269v1)|null|\n", "2302.06914": "|**2023-02-14**|**Heterogeneous Anomaly Detection for Software Systems via Semi-supervised Cross-modal Attention**|Cheryl Lee et.al.|[2302.06914v1](http://arxiv.org/abs/2302.06914v1)|**[link](https://github.com/bebillionaireusd/hades)**|\n", "2302.10909": "|**2023-02-14**|**Multi-modal Machine Learning in Engineering Design: A Review and Future Directions**|Binyang Song et.al.|[2302.10909v1](http://arxiv.org/abs/2302.10909v1)|null|\n", "2302.06643": "|**2023-02-13**|**Vision-RADAR fusion for Robotics BEV Detections: A Survey**|Apoorv Singh et.al.|[2302.06643v1](http://arxiv.org/abs/2302.06643v1)|null|\n", "2302.06605": "|**2023-02-13**|**UniAdapter: Unified Parameter-Efficient Transfer Learning for Cross-modal Modeling**|Haoyu Lu et.al.|[2302.06605v1](http://arxiv.org/abs/2302.06605v1)|**[link](https://github.com/rerv/uniadapter)**|\n", "2302.06560": "|**2023-02-13**|**Large Scale Multi-Lingual Multi-Modal Summarization Dataset**|Yash Verma et.al.|[2302.06560v1](http://arxiv.org/abs/2302.06560v1)|**[link](https://github.com/anubhav-jangra/m3ls)**|\n", "2302.06452": "|**2023-02-13**|**Mixed Multi-Model Semantic Interaction for Graph-based Narrative Visualizations**|Brian Keith Norambuena et.al.|[2302.06452v1](http://arxiv.org/abs/2302.06452v1)|null|\n", "2302.06350": "|**2023-02-13**|**CLIP-RR: Improved CLIP Network for Relation-Focused Cross-Modal Information Retrieval**|Yan Gong et.al.|[2302.06350v1](http://arxiv.org/abs/2302.06350v1)|null|\n", "2302.06148": "|**2023-02-13**|**CoMAE: Single Model Hybrid Pre-training on Small-Scale RGB-D Datasets**|Jiange Yang et.al.|[2302.06148v1](http://arxiv.org/abs/2302.06148v1)|**[link](https://github.com/mcg-nju/comae)**|\n", "2302.12816": "|**2023-02-24**|**Floquet Analysis of Frequency Collisions**|Kentaro Heya et.al.|[2302.12816v1](http://arxiv.org/abs/2302.12816v1)|null|\n", "2302.12610": "|**2023-02-24**|**A Joint Modeling of Vision-Language-Action for Target-oriented Grasping in Clutter**|Kechun Xu et.al.|[2302.12610v1](http://arxiv.org/abs/2302.12610v1)|**[link](https://github.com/xukechun/Vision-Language-Grasping)**|\n", "2302.12552": "|**2023-02-24**|**Deep Learning for Video-Text Retrieval: a Review**|Cunjuan Zhu et.al.|[2302.12552v1](http://arxiv.org/abs/2302.12552v1)|null|\n", "2302.12258": "|**2023-02-23**|**Data leakage in cross-modal retrieval training: A case study**|Benno Weck et.al.|[2302.12258v1](http://arxiv.org/abs/2302.12258v1)|null|\n", "2302.14045": "|**2023-02-27**|**Language Is Not All You Need: Aligning Perception with Language Models**|Shaohan Huang et.al.|[2302.14045v1](http://arxiv.org/abs/2302.14045v1)|**[link](https://github.com/microsoft/unilm)**|\n", "2302.14042": "|**2023-02-27**|**Knowledge-enhanced Pre-training for Auto-diagnosis of Chest Radiology Images**|Xiaoman Zhang et.al.|[2302.14042v1](http://arxiv.org/abs/2302.14042v1)|null|\n", "2302.14007": "|**2023-02-27**|**Joint-MAE: 2D-3D Joint Masked Autoencoders for 3D Point Cloud Pre-training**|Ziyu Guo et.al.|[2302.14007v1](http://arxiv.org/abs/2302.14007v1)|null|\n", "2302.13838": "|**2023-02-27**|**Cross-modal Face- and Voice-style Transfer**|Naoya Takahashi et.al.|[2302.13838v1](http://arxiv.org/abs/2302.13838v1)|null|\n", "2302.13668": "|**2023-02-27**|**Contrastive Video Question Answering via Video Graph Transformer**|Junbin Xiao et.al.|[2302.13668v1](http://arxiv.org/abs/2302.13668v1)|**[link](https://github.com/doc-doc/covgt)**|\n", "2302.13321": "|**2023-02-26**|**Multi-Modality in Music: Predicting Emotion in Music from High-Level Audio Features and Lyrics**|Tibor Krols et.al.|[2302.13321v1](http://arxiv.org/abs/2302.13321v1)|**[link](https://github.com/tibor-krols/cogsci2-spotify)**|\n", "2302.13311": "|**2023-02-26**|**Understanding Social Media Cross-Modality Discourse in Linguistic Space**|Chunpu Xu et.al.|[2302.13311v1](http://arxiv.org/abs/2302.13311v1)|**[link](https://github.com/cpaaax/multimodal_discourse)**|\n", "2302.13187": "|**2023-02-25**|**Tractable Diversity: Scalable Multiperspective Ontology Management via Standpoint EL**|Luc\u00eda G\u00f3mez \u00c1lvarez et.al.|[2302.13187v1](http://arxiv.org/abs/2302.13187v1)|null|\n", "2302.13094": "|**2023-02-25**|**Knowledge-infused Contrastive Learning for Urban Imagery-based Socioeconomic Prediction**|Yu Liu et.al.|[2302.13094v1](http://arxiv.org/abs/2302.13094v1)|**[link](https://github.com/tsinghua-fib-lab/urbankg-knowcl)**|\n", "2302.12971": "|**2023-02-25**|**BrainCLIP: Bridging Brain and Visual-Linguistic Representation via CLIP for Generic Natural Visual Stimulus Decoding from fMRI**|Yulong Liu et.al.|[2302.12971v1](http://arxiv.org/abs/2302.12971v1)|**[link](https://github.com/YulongBonjour/BrainCLIP)**|\n", "2302.14785": "|**2023-02-28**|**Joint Representations of Text and Knowledge Graphs for Retrieval and Evaluation**|Teven Le Scao et.al.|[2302.14785v1](http://arxiv.org/abs/2302.14785v1)|null|\n", "2302.14777": "|**2023-02-28**|**VQA with Cascade of Self- and Co-Attention Blocks**|Aakansha Mishra et.al.|[2302.14777v1](http://arxiv.org/abs/2302.14777v1)|null|\n", "2302.14564": "|**2023-02-28**|**Exploring Self-supervised Pre-trained ASR Models For Dysarthric and Elderly Speech Recognition**|Shujie Hu et.al.|[2302.14564v1](http://arxiv.org/abs/2302.14564v1)|null|\n", "2302.14418": "|**2023-02-28**|**PCR-CG: Point Cloud Registration via Deep Color and Geometry**|Yu Zhang et.al.|[2302.14418v1](http://arxiv.org/abs/2302.14418v1)|**[link](https://github.com/gardlin/pcr-cg)**|\n", "2302.14264": "|**2023-02-28**|**RGB-D Grasp Detection via Depth Guided Learning with Cross-modal Attention**|Ran Qin et.al.|[2302.14264v1](http://arxiv.org/abs/2302.14264v1)|null|\n", "2302.14115": "|**2023-02-27**|**Vid2Seq: Large-Scale Pretraining of a Visual Language Model for Dense Video Captioning**|Antoine Yang et.al.|[2302.14115v1](http://arxiv.org/abs/2302.14115v1)|**[link](https://github.com/google-research/scenic/tree/main/scenic/projects/vid2seq)**|\n", "2302.14082": "|**2023-02-27**|**Detecting and Mitigating Mode-Collapse for Flow-based Sampling of Lattice Field Theories**|Kim A. Nicoli et.al.|[2302.14082v1](http://arxiv.org/abs/2302.14082v1)|null|\n", "2303.00720": "|**2023-03-01**|**Cross-Modal Entity Matching for Visually Rich Documents**|Ritesh Sarkhel et.al.|[2303.00720v1](http://arxiv.org/abs/2303.00720v1)|null|\n", "2303.00534": "|**2023-03-01**|**RAMM: Retrieval-augmented Biomedical Visual Question Answering with Multi-modal Pre-training**|Zheng Yuan et.al.|[2303.00534v1](http://arxiv.org/abs/2303.00534v1)|**[link](https://github.com/GanjinZero/RAMM)**|\n", "2303.00462": "|**2023-03-02**|**Hidden Gems: 4D Radar Scene Flow Learning Using Cross-Modal Supervision**|Fangqiang Ding et.al.|[2303.00462v2](http://arxiv.org/abs/2303.00462v2)|**[link](https://github.com/toytiny/cmflow)**|\n", "2303.00448": "|**2023-03-01**|**The style transformer with common knowledge optimization for image-text retrieval**|Wenrui Li et.al.|[2303.00448v1](http://arxiv.org/abs/2303.00448v1)|null|\n", "2303.00369": "|**2023-03-02**|**Indescribable Multi-modal Spatial Evaluator**|Lingke Kong et.al.|[2303.00369v2](http://arxiv.org/abs/2303.00369v2)|**[link](https://github.com/kid-liet/imse)**|\n", "2303.00289": "|**2023-03-01**|**StrucTexTv2: Masked Visual-Textual Prediction for Document Image Pre-training**|Yuechen Yu et.al.|[2303.00289v1](http://arxiv.org/abs/2303.00289v1)|**[link](https://github.com/PaddlePaddle/VIMER/tree/main/StrucTexT/v2)**|\n", "2303.00277": "|**2023-03-01**|**UAV Tracking with Lidar as a Camera Sensors in GNSS-Denied Environments**|Ha Sier et.al.|[2303.00277v1](http://arxiv.org/abs/2303.00277v1)|**[link](https://github.com/tiers/uav-tracking-based-on-lidar-as-a-camera)**|\n", "2303.00233": "|**2023-03-01**|**Single-Cell Multimodal Prediction via Transformers**|Wenzhuo Tang et.al.|[2303.00233v1](http://arxiv.org/abs/2303.00233v1)|**[link](https://github.com/omicsml/scmoformer)**|\n", "2303.00200": "|**2023-03-01**|**Feature Extraction Matters More: Universal Deepfake Disruption through Attacking Ensemble Feature Extractors**|Long Tang et.al.|[2303.00200v1](http://arxiv.org/abs/2303.00200v1)|null|\n", "2303.00073": "|**2023-02-28**|**Cross-correlated quantum thermometry using diamond containing dual-defect centers**|Madhav Gupta et.al.|[2303.00073v1](http://arxiv.org/abs/2303.00073v1)|null|\n", "2303.00040": "|**2023-02-28**|**Towards Generalisable Video Moment Retrieval: Visual-Dynamic Injection to Image-Text Pre-Training**|Dezhao Luo et.al.|[2303.00040v1](http://arxiv.org/abs/2303.00040v1)|null|\n", "2303.01480": "|**2023-03-02**|**Delivering Arbitrary-Modal Semantic Segmentation**|Jiaming Zhang et.al.|[2303.01480v1](http://arxiv.org/abs/2303.01480v1)|**[link](https://github.com/jamycheung/DELIVER)**|\n", "2303.01311": "|**2023-03-02**|**Zero-Shot Text-to-Parameter Translation for Game Character Auto-Creation**|Rui Zhao et.al.|[2303.01311v1](http://arxiv.org/abs/2303.01311v1)|null|\n", "2303.01310": "|**2023-03-02**|**Learning Language-Conditioned Deformable Object Manipulation with Graph Dynamics**|Kai Mo et.al.|[2303.01310v1](http://arxiv.org/abs/2303.01310v1)|null|\n", "2303.01217": "|**2023-03-02**|**Synthetic Misinformers: Generating and Combating Multimodal Misinformation**|Stefanos-Iordanis Papadopoulos et.al.|[2303.01217v1](http://arxiv.org/abs/2303.01217v1)|null|\n", "2303.01043": "|**2023-03-02**|**I2P-Rec: Recognizing Images on Large-scale Point Cloud Maps through Bird's Eye View Projections**|Yixuan Li et.al.|[2303.01043v1](http://arxiv.org/abs/2303.01043v1)|null|\n", "2303.00882": "|**2023-03-02**|**X-Ray2EM: Uncertainty-Aware Cross-Modality Image Reconstruction from X-Ray to Electron Microscopy in Connectomics**|Yicong Li et.al.|[2303.00882v1](http://arxiv.org/abs/2303.00882v1)|null|\n", "2303.00865": "|**2023-03-01**|**AMIGO: Sparse Multi-Modal Graph Transformer with Shared-Context Processing for Representation Learning of Giga-pixel Images**|Ramin Nakhli et.al.|[2303.00865v1](http://arxiv.org/abs/2303.00865v1)|**[link](https://github.com/raminnakhli/amigo)**|\n", "2303.00806": "|**2023-03-01**|**Survival modelling of smartphone trigger data for earthquake parameter estimation in early warning. With applications to 2023 Turkish-Syrian and 2019 Ridgecrest events**|Luca Aiello et.al.|[2303.00806v1](http://arxiv.org/abs/2303.00806v1)|null|\n", "2303.02139": "|**2023-03-03**|**Data Association Aware POMDP Planning with Hypothesis Pruning Performance Guarantees**|Moran Barenboim et.al.|[2303.02139v1](http://arxiv.org/abs/2303.02139v1)|null|\n", "2303.01933": "|**2023-03-03**|**BogieCopter: A Multi-Modal Aerial-Ground Vehicle for Long-Endurance Inspection Applications**|Teodoro Dias et.al.|[2303.01933v1](http://arxiv.org/abs/2303.01933v1)|null|\n", "2303.01510": "|**2023-03-02**|**INO at Factify 2: Structure Coherence based Multi-Modal Fact Verification**|Yinuo Zhang et.al.|[2303.01510v1](http://arxiv.org/abs/2303.01510v1)|**[link](https://github.com/catrin-baze/ino-of-factify)**|\n", "2303.03378": "|**2023-03-06**|**PaLM-E: An Embodied Multimodal Language Model**|Danny Driess et.al.|[2303.03378v1](http://arxiv.org/abs/2303.03378v1)|null|\n", "2303.03131": "|**2023-03-08**|**Video Question Answering Using CLIP-Guided Visual-Text Attention**|Shuhong Ye et.al.|[2303.03131v2](http://arxiv.org/abs/2303.03131v2)|null|\n", "2303.03093": "|**2023-03-06**|**A Miniaturised Camera-based Multi-Modal Tactile Sensor**|Kaspar Althoefer et.al.|[2303.03093v1](http://arxiv.org/abs/2303.03093v1)|null|\n", "2303.03056": "|**2023-03-07**|**MOISST: Multi-modal Optimization of Implicit Scene for SpatioTemporal calibration**|Quentin Herau et.al.|[2303.03056v2](http://arxiv.org/abs/2303.03056v2)|null|\n", "2303.03032": "|**2023-03-06**|**DeCap: Decoding CLIP Latents for Zero-Shot Captioning via Text-Only Training**|Wei Li et.al.|[2303.03032v1](http://arxiv.org/abs/2303.03032v1)|**[link](https://github.com/dhg-wei/decap)**|\n", "2303.02995": "|**2023-03-06**|**HiCLIP: Contrastive Language-Image Pretraining with Hierarchy-aware Attention**|Shijie Geng et.al.|[2303.02995v1](http://arxiv.org/abs/2303.02995v1)|**[link](https://github.com/jeykigung/hiclip)**|\n", "2303.02976": "|**2023-03-06**|**Dronument: System for Reliable Deployment of Micro Aerial Vehicles in Dark Areas of Large Historical Monuments**|Pavel Petracek et.al.|[2303.02976v1](http://arxiv.org/abs/2303.02976v1)|null|\n", "2303.02688": "|**2023-03-05**|**Text2Face: A Multi-Modal 3D Face Model**|Will Rowan et.al.|[2303.02688v1](http://arxiv.org/abs/2303.02688v1)|null|\n", "2303.02684": "|**2023-03-05**|**Robust Multi-Modal Multi-LiDAR-Inertial Odometry and Mapping for Indoor Environments**|Li Qingqing et.al.|[2303.02684v1](http://arxiv.org/abs/2303.02684v1)|**[link](https://github.com/tiers/multi-modal-loam)**|\n", "2303.02506": "|**2023-03-04**|**Prismer: A Vision-Language Model with An Ensemble of Experts**|Shikun Liu et.al.|[2303.02506v1](http://arxiv.org/abs/2303.02506v1)|**[link](https://github.com/nvlabs/prismer)**|\n", "2303.02483": "|**2023-03-04**|**FAME-ViL: Multi-Tasking Vision-Language Model for Heterogeneous Fashion Tasks**|Xiao Han et.al.|[2303.02483v1](http://arxiv.org/abs/2303.02483v1)|**[link](https://github.com/brandonhanx/fame-vil)**|\n", "2303.02479": "|**2023-03-04**|**Chronic Kidney Disease of Unknown Aetiolgy (CKDu)-the search for causes and the impact of its politicization**|Chandre Dharma-wardana et.al.|[2303.02479v1](http://arxiv.org/abs/2303.02479v1)|null|\n", "2303.02407": "|**2023-03-04**|**Local Navigation Among Movable Obstacles with Deep Reinforcement Learning**|Linghong Yao et.al.|[2303.02407v1](http://arxiv.org/abs/2303.02407v1)|null|\n", "2303.02323": "|**2023-03-04**|**APE: An Open and Shared Annotated Dataset for Learning Urban Pedestrian Path Networks**|Yuxiang Zhang et.al.|[2303.02323v1](http://arxiv.org/abs/2303.02323v1)|null|\n", "2303.02203": "|**2023-03-03**|**X$^3$KD: Knowledge Distillation Across Modalities, Tasks and Stages for Multi-Camera 3D Object Detection**|Marvin Klingner et.al.|[2303.02203v1](http://arxiv.org/abs/2303.02203v1)|null|\n", "2303.03991": "|**2023-03-07**|**OpenOccupancy: A Large Scale Benchmark for Surrounding Semantic Occupancy Perception**|Xiaofeng Wang et.al.|[2303.03991v1](http://arxiv.org/abs/2303.03991v1)|**[link](https://github.com/jeffwang987/openoccupancy)**|\n", "2303.03878": "|**2023-03-07**|**A convergence analysis of a structure-preserving gradient flow method for the all-electron Kohn-Sham model**|Yedan Shen et.al.|[2303.03878v1](http://arxiv.org/abs/2303.03878v1)|null|\n", "2303.03595": "|**2023-03-07**|**LoGoNet: Towards Accurate 3D Object Detection with Local-to-Global Cross-Modal Fusion**|Xin Li et.al.|[2303.03595v1](http://arxiv.org/abs/2303.03595v1)|**[link](https://github.com/sankin97/logonet)**|\n", "2303.03449": "|**2023-03-06**|**Dual-encoded magnetization transfer and diffusion imaging and its application to tract-specific microstructure mapping**|Ilana R Leppert et.al.|[2303.03449v1](http://arxiv.org/abs/2303.03449v1)|**[link](https://github.com/tardiflab/mt-diff)**|\n", "2303.04748": "|**2023-03-08**|**CLIP-FO3D: Learning Free Open-world 3D Scene Representations from 2D Dense CLIP**|Junbo Zhang et.al.|[2303.04748v1](http://arxiv.org/abs/2303.04748v1)|null|\n", "2303.04585": "|**2023-03-08**|**New Audio Representations Image Gan Generation from BriVL**|Sen Fang et.al.|[2303.04585v1](http://arxiv.org/abs/2303.04585v1)|**[link](https://github.com/fangsen9000/brivl-generation)**|\n", "2303.04439": "|**2023-03-08**|**A Light Weight Model for Active Speaker Detection**|Junhua Liao et.al.|[2303.04439v1](http://arxiv.org/abs/2303.04439v1)|**[link](https://github.com/junhua-liao/light-asd)**|\n", "2303.04398": "|**2023-03-08**|**Implications of Personality on Cognitive Workload, Affect, and Task Performance in Robot Remote Control**|Go-Eum Cha et.al.|[2303.04398v1](http://arxiv.org/abs/2303.04398v1)|null|\n", "2303.04364": "|**2023-03-08**|**Dynamic Scenario Representation Learning for Motion Forecasting with Heterogeneous Graph Convolutional Recurrent Networks**|Xing Gao et.al.|[2303.04364v1](http://arxiv.org/abs/2303.04364v1)|null|\n", "2303.05499": "|**2023-03-10**|**Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection**|Shilong Liu et.al.|[2303.05499v2](http://arxiv.org/abs/2303.05499v2)|**[link](https://github.com/idea-research/groundingdino)**|\n", "2303.05338": "|**2023-03-11**|**MMCosine: Multi-Modal Cosine Loss Towards Balanced Audio-Visual Fine-Grained Learning**|Ruize Xu et.al.|[2303.05338v2](http://arxiv.org/abs/2303.05338v2)|null|\n", "2303.05313": "|**2023-03-09**|**Replacement as a Self-supervision for Fine-grained Vision-language Pre-training**|Lisai Zhang et.al.|[2303.05313v1](http://arxiv.org/abs/2303.05313v1)|null|\n", "2303.05309": "|**2023-03-09**|**MixSpeech: Cross-Modality Self-Learning with Audio-Visual Stream Mixup for Visual Speech Translation and Recognition**|Xize Cheng et.al.|[2303.05309v1](http://arxiv.org/abs/2303.05309v1)|**[link](https://github.com/exgc/avmust-ted)**|\n", "2303.05193": "|**2023-03-09**|**GOATS: Goal Sampling Adaptation for Scooping with Curriculum Reinforcement Learning**|Yaru Niu et.al.|[2303.05193v1](http://arxiv.org/abs/2303.05193v1)|null|\n", "2303.05093": "|**2023-03-09**|**Improving Video Retrieval by Adaptive Margin**|Feng He et.al.|[2303.05093v1](http://arxiv.org/abs/2303.05093v1)|null|\n", "2303.05026": "|**2023-03-09**|**SSL^2: Self-Supervised Learning meets Semi-Supervised Learning: Multiple Sclerosis Segmentation in 7T-MRI from large-scale 3T-MRI**|Jiacheng Wang et.al.|[2303.05026v1](http://arxiv.org/abs/2303.05026v1)|null|\n", "2303.04955": "|**2023-03-09**|**Exploring Smart Commercial Building Occupants' Perceptions and Notification Preferences of Internet of Things Data Collection in the United States**|Tu Le et.al.|[2303.04955v1](http://arxiv.org/abs/2303.04955v1)|null|\n", "2303.06129": "|**2023-03-10**|**Single-branch Network for Multimodal Training**|Muhammad Saad Saeed et.al.|[2303.06129v1](http://arxiv.org/abs/2303.06129v1)|**[link](https://github.com/msaadsaeed/sbnet)**|\n", "2303.05952": "|**2023-03-10**|**Understanding and Constructing Latent Modality Structures in Multi-modal Representation Learning**|Qian Jiang et.al.|[2303.05952v1](http://arxiv.org/abs/2303.05952v1)|null|\n", "2303.05936": "|**2023-03-10**|**Learning Decoupled Multi-touch Force Estimation, Localization and Stretch for Soft Capacitive E-skin**|Abu Bakar Dawood et.al.|[2303.05936v1](http://arxiv.org/abs/2303.05936v1)|null|\n", "2303.05793": "|**2023-03-10**|**Analyzing covariate clustering effects in healthcare cost subgroups: insights and applications for prediction**|Zhengxiao Li et.al.|[2303.05793v1](http://arxiv.org/abs/2303.05793v1)|**[link](https://github.com/huangyf2217/fmr-covariates-clustering)**|\n", "2303.05725": "|**2023-03-10**|**CVT-SLR: Contrastive Visual-Textual Transformation for Sign Language Recognition with Variational Alignment**|Jiangbin Zheng et.al.|[2303.05725v1](http://arxiv.org/abs/2303.05725v1)|**[link](https://github.com/binbinjiang/cvt-slr)**|\n", "2303.05714": "|**2023-03-10**|**Simultaneous estimation of multiple eigenvalues with short-depth quantum circuit on early fault-tolerant quantum computers**|Zhiyan Ding et.al.|[2303.05714v1](http://arxiv.org/abs/2303.05714v1)|null|\n", "2303.05707": "|**2023-03-10**|**MuLTI: Efficient Video-and-Language Understanding with MultiWay-Sampler and Multiple Choice Modeling**|Jiaqi Xu et.al.|[2303.05707v1](http://arxiv.org/abs/2303.05707v1)|null|\n", "2303.07284": "|**2023-03-13**|**Align and Attend: Multimodal Summarization with Dual Contrastive Losses**|Bo He et.al.|[2303.07284v1](http://arxiv.org/abs/2303.07284v1)|**[link](https://github.com/boheumd/A2Summ)**|\n", "2303.07274": "|**2023-03-14**|**Breaking Common Sense: WHOOPS! A Vision-and-Language Benchmark of Synthetic and Compositional Images**|Nitzan Bitton-Guetta et.al.|[2303.07274v2](http://arxiv.org/abs/2303.07274v2)|null|\n", "2303.07265": "|**2023-03-13**|**Multimodal Reinforcement Learning for Robots Collaborating with Humans**|Afagh Mehri Shervedani et.al.|[2303.07265v1](http://arxiv.org/abs/2303.07265v1)|null|\n", "2303.07064": "|**2023-03-13**|**A Generalized Multi-Modal Fusion Detection Framework**|Leichao Cui et.al.|[2303.07064v1](http://arxiv.org/abs/2303.07064v1)|null|\n", "2303.07000": "|**2023-03-13**|**Predicting Density of States via Multi-modal Transformer**|Namkyeong Lee et.al.|[2303.07000v1](http://arxiv.org/abs/2303.07000v1)|**[link](https://github.com/heewoongnoh/dostransformer)**|\n", "2303.06947": "|**2023-03-13**|**A Multi-Modal Simulation Framework to Enable Digital Twin-based V2X Communications in Dynamic Environments**|Lorenzo Cazzella et.al.|[2303.06947v1](http://arxiv.org/abs/2303.06947v1)|null|\n", "2303.06840": "|**2023-03-13**|**DDFM: Denoising Diffusion Model for Multi-Modality Image Fusion**|Zixiang Zhao et.al.|[2303.06840v1](http://arxiv.org/abs/2303.06840v1)|**[link](https://github.com/zhaozixiang1228/mmif-ddfm)**|\n", "2303.06662": "|**2023-03-12**|**Fuzzy Alignments in Directed Acyclic Graph for Non-Autoregressive Machine Translation**|Zhengrui Ma et.al.|[2303.06662v1](http://arxiv.org/abs/2303.06662v1)|**[link](https://github.com/ictnlp/fa-dat)**|\n", "2303.06555": "|**2023-03-12**|**One Transformer Fits All Distributions in Multi-Modal Diffusion at Scale**|Fan Bao et.al.|[2303.06555v1](http://arxiv.org/abs/2303.06555v1)|**[link](https://github.com/thu-ml/unidiffuser)**|\n", "2303.06536": "|**2023-03-12**|**AutoOptLib: A Library of Automatically Designing Metaheuristic Optimization Algorithms in MATLAB**|Qi Zhao et.al.|[2303.06536v1](http://arxiv.org/abs/2303.06536v1)|**[link](https://github.com/qz89/AutoOpt)**|\n", "2303.06464": "|**2023-03-11**|**PARASOL: Parametric Style Control for Diffusion Image Synthesis**|Gemma Canet Tarr\u00e9s et.al.|[2303.06464v1](http://arxiv.org/abs/2303.06464v1)|null|\n", "2303.06422": "|**2023-03-11**|**An approximate control variates approach to multifidelity distribution estimation**|Ruijian Han et.al.|[2303.06422v1](http://arxiv.org/abs/2303.06422v1)|null|\n", "2303.06398": "|**2023-03-11**|**Variational Gaussian filtering via Wasserstein gradient flows**|Adrie Corenflos et.al.|[2303.06398v1](http://arxiv.org/abs/2303.06398v1)|**[link](https://github.com/hanyas/wasserstein-flow-filter)**|\n", "2303.06378": "|**2023-03-11**|**Learning Grounded Vision-Language Representation for Versatile Understanding in Untrimmed Videos**|Teng Wang et.al.|[2303.06378v1](http://arxiv.org/abs/2303.06378v1)|**[link](https://github.com/zjr2000/gvl)**|\n", "2303.06345": "|**2023-03-11**|**Semantics-Aware Dynamic Localization and Refinement for Referring Image Segmentation**|Zhao Yang et.al.|[2303.06345v1](http://arxiv.org/abs/2303.06345v1)|null|\n", "2303.08129": "|**2023-03-14**|**PiMAE: Point Cloud and Image Interactive Masked Autoencoders for 3D Object Detection**|Anthony Chen et.al.|[2303.08129v1](http://arxiv.org/abs/2303.08129v1)|**[link](https://github.com/blvlab/pimae)**|\n", "2303.08054": "|**2023-03-15**|**Statistical Hardware Design With Multi-model Active Learning**|Alireza Ghaffari et.al.|[2303.08054v2](http://arxiv.org/abs/2303.08054v2)|null|\n", "2303.08017": "|**2023-03-14**|**Reliable Beamforming at Terahertz Bands: Are Causal Representations the Way Forward?**|Christo Kurisummoottil Thomas et.al.|[2303.08017v1](http://arxiv.org/abs/2303.08017v1)|null|\n", "2303.07896": "|**2023-03-16**|**Exploring Weakly Supervised Semantic Segmentation Ensembles for Medical Imaging Systems**|Erik Ostrowski et.al.|[2303.07896v2](http://arxiv.org/abs/2303.07896v2)|**[link](https://github.com/erikostrowski/automated_ensemble)**|\n", "2303.07775": "|**2023-03-14**|**Data-Free Sketch-Based Image Retrieval**|Abhra Chaudhuri et.al.|[2303.07775v1](http://arxiv.org/abs/2303.07775v1)|**[link](https://github.com/abhrac/data-free-sbir)**|\n", "2303.07748": "|**2023-03-14**|**Generation-Guided Multi-Level Unified Network for Video Grounding**|Xing Cheng et.al.|[2303.07748v1](http://arxiv.org/abs/2303.07748v1)|null|\n", "2303.07742": "|**2023-03-14**|**ForDigitStress: A multi-modal stress dataset employing a digital job interview scenario**|Alexander Heimerl et.al.|[2303.07742v1](http://arxiv.org/abs/2303.07742v1)|null|\n", "2303.07674": "|**2023-03-14**|**Koos Classification of Vestibular Schwannoma via Image Translation-Based Unsupervised Cross-Modality Domain Adaptation**|Tao Yang et.al.|[2303.07674v1](http://arxiv.org/abs/2303.07674v1)|null|\n", "2303.07667": "|**2023-03-14**|**Improving Music Genre Classification from multi-modal properties of music and genre correlations Perspective**|Ganghui Ru et.al.|[2303.07667v1](http://arxiv.org/abs/2303.07667v1)|null|\n", "2303.07647": "|**2023-03-15**|**Recent Advances and Applications of Machine Learning in Experimental Solid Mechanics: A Review**|Hanxun Jin et.al.|[2303.07647v2](http://arxiv.org/abs/2303.07647v2)|null|\n", "2303.07601": "|**2023-03-14**|**V2V4Real: A Real-world Large-scale Dataset for Vehicle-to-Vehicle Cooperative Perception**|Runsheng Xu et.al.|[2303.07601v1](http://arxiv.org/abs/2303.07601v1)|**[link](https://github.com/ucla-mobility/v2v4real)**|\n", "2303.07543": "|**2023-03-14**|**WDiscOOD: Out-of-Distribution Detection via Whitened Linear Discriminative Analysis**|Yiye Chen et.al.|[2303.07543v1](http://arxiv.org/abs/2303.07543v1)|**[link](https://github.com/ivalab/wdiscood)**|\n", "2303.07522": "|**2023-03-13**|**Audio Visual Language Maps for Robot Navigation**|Chenguang Huang et.al.|[2303.07522v1](http://arxiv.org/abs/2303.07522v1)|null|\n", "2303.08692": "|**2023-03-15**|**SpiderMesh: Spatial-aware Demand-guided Recursive Meshing for RGB-T Semantic Segmentation**|Siqi Fan et.al.|[2303.08692v1](http://arxiv.org/abs/2303.08692v1)|**[link](https://github.com/leofansq/spidermesh)**|\n", "2303.08600": "|**2023-03-15**|**MSeg3D: Multi-modal 3D Semantic Segmentation for Autonomous Driving**|Jiale Li et.al.|[2303.08600v1](http://arxiv.org/abs/2303.08600v1)|**[link](https://github.com/jialeli1/lidarseg3d)**|\n", "2303.08562": "|**2023-03-15**|**MGA: Medical generalist agent through text-guided knowledge transformation**|Weijian Huang et.al.|[2303.08562v1](http://arxiv.org/abs/2303.08562v1)|null|\n", "2303.08518": "|**2023-03-15**|**UPRISE: Universal Prompt Retrieval for Improving Zero-Shot Evaluation**|Daixuan Cheng et.al.|[2303.08518v1](http://arxiv.org/abs/2303.08518v1)|**[link](https://github.com/microsoft/lmops)**|\n", "2303.08419": "|**2023-03-15**|**Multi-Modal Facial Expression Recognition with Transformer-Based Fusion Networks and Dynamic Sampling**|Jun-Hwa Kim et.al.|[2303.08419v1](http://arxiv.org/abs/2303.08419v1)|null|\n", "2303.08372": "|**2023-03-15**|**Target Sound Extraction with Variable Cross-modality Clues**|Chenda Li et.al.|[2303.08372v1](http://arxiv.org/abs/2303.08372v1)|**[link](https://github.com/lichenda/multi-clue-tse-data)**|\n", "2303.08367": "|**2023-03-15**|**Uncertainty-Aware Pedestrian Trajectory Prediction via Distributional Diffusion**|Yao Liu et.al.|[2303.08367v1](http://arxiv.org/abs/2303.08367v1)|null|\n", "2303.08359": "|**2023-03-15**|**Haptics-Enabled Forceps with Multi-Modal Force Sensing: Towards Task-Autonomous Robotic Surgery**|Tangyou Liu et.al.|[2303.08359v1](http://arxiv.org/abs/2303.08359v1)|null|\n", "2303.08356": "|**2023-03-15**|**Continuous emotion recognition based on TCN and Transformer**|Weiwei Zhou et.al.|[2303.08356v1](http://arxiv.org/abs/2303.08356v1)|**[link](https://github.com/upczww/abaw5)**|\n", "2303.09463": "|**2023-03-16**|**An Autonomous System for Head-to-Head Race: Design, Implementation and Analysis; Team KAIST at the Indy Autonomous Challenge**|Chanyoung Jung et.al.|[2303.09463v1](http://arxiv.org/abs/2303.09463v1)|null|\n", "2303.09381": "|**2023-03-16**|**Multi-modal Differentiable Unsupervised Feature Selection**|Junchen Yang et.al.|[2303.09381v1](http://arxiv.org/abs/2303.09381v1)|**[link](https://github.com/jcyang34/mmdufs)**|\n", "2303.09373": "|**2023-03-16**|**3D Masked Autoencoding and Pseudo-labeling for Domain Adaptive Segmentation of Heterogeneous Infant Brain MRI**|Xuzhe Zhang et.al.|[2303.09373v1](http://arxiv.org/abs/2303.09373v1)|null|\n", "2303.09367": "|**2023-03-16**|**Goal-conditioned Offline Reinforcement Learning through State Space Partitioning**|Mianchu Wang et.al.|[2303.09367v1](http://arxiv.org/abs/2303.09367v1)|null|\n", "2303.09319": "|**2023-03-16**|**Unified Multi-Modal Latent Diffusion for Joint Subject and Text Conditional Image Generation**|Yiyang Ma et.al.|[2303.09319v1](http://arxiv.org/abs/2303.09319v1)|null|\n", "2303.09270": "|**2023-03-16**|**SpectralCLIP: Preventing Artifacts in Text-Guided Style Transfer from a Spectral Perspective**|Zipeng Xu et.al.|[2303.09270v1](http://arxiv.org/abs/2303.09270v1)|**[link](https://github.com/zipengxuc/spectralclip)**|\n", "2303.09167": "|**2023-03-16**|**Emotional Reaction Intensity Estimation Based on Multimodal Data**|Shangfei Wang et.al.|[2303.09167v1](http://arxiv.org/abs/2303.09167v1)|null|\n", "2303.09119": "|**2023-03-16**|**Taming Diffusion Models for Audio-Driven Co-Speech Gesture Generation**|Lingting Zhu et.al.|[2303.09119v1](http://arxiv.org/abs/2303.09119v1)|**[link](https://github.com/advocate99/diffgesture)**|\n", "2303.09117": "|**2023-03-16**|**Visual-Linguistic Causal Intervention for Radiology Report Generation**|Weixing Chen et.al.|[2303.09117v1](http://arxiv.org/abs/2303.09117v1)|**[link](https://github.com/wissingchen/vlci)**|\n", "2303.08942": "|**2023-03-15**|**Spherical Space Feature Decomposition for Guided Depth Map Super-Resolution**|Zixiang Zhao et.al.|[2303.08942v1](http://arxiv.org/abs/2303.08942v1)|null|\n", "2303.10056": "|**2023-03-17**|**GlueGen: Plug and Play Multi-modal Encoders for X-to-image Generation**|Can Qin et.al.|[2303.10056v1](http://arxiv.org/abs/2303.10056v1)|**[link](https://github.com/salesforce/gluegen)**|\n", "2303.10033": "|**2023-03-17**|**Multi-modal Expression Recognition with Ensemble Method**|Chuanhe Liu et.al.|[2303.10033v1](http://arxiv.org/abs/2303.10033v1)|null|\n", "2303.09858": "|**2023-03-20**|**MedLocker: A Transferable Adversarial Watermarking for Preventing Unauthorized Analysis of Medical Image Dataset**|Bangzheng Pu et.al.|[2303.09858v2](http://arxiv.org/abs/2303.09858v2)|null|\n", "2303.09830": "|**2023-03-17**|**Prototype Knowledge Distillation for Medical Segmentation with Missing Modality**|Shuai Wang et.al.|[2303.09830v1](http://arxiv.org/abs/2303.09830v1)|**[link](https://github.com/sakurajimamaiii/protokd)**|\n", "2303.09825": "|**2023-03-17**|**LCE-Calib: Automatic LiDAR-Frame/Event Camera Extrinsic Calibration With A Globally Optimal Solution**|Jianhao Jiao et.al.|[2303.09825v1](http://arxiv.org/abs/2303.09825v1)|**[link](https://github.com/hkustgz-iadc/lcecalib)**|\n", "2303.09817": "|**2023-03-17**|**Hospital Length of Stay Prediction Based on Multi-modal Data towards Trustworthy Human-AI Collaboration in Radiomics**|Hubert Baniecki et.al.|[2303.09817v1](http://arxiv.org/abs/2303.09817v1)|**[link](https://github.com/modeloriented/survex)**|\n", "2303.09800": "|**2023-03-17**|**GOOD: General Optimization-based Fusion for 3D Object Detection via LiDAR-Camera Object Candidates**|Bingqi Shen et.al.|[2303.09800v1](http://arxiv.org/abs/2303.09800v1)|null|\n", "2303.09797": "|**2023-03-17**|**MMFace4D: A Large-Scale Multi-Modal 4D Face Dataset for Audio-Driven 3D Face Animation**|Haozhe Wu et.al.|[2303.09797v1](http://arxiv.org/abs/2303.09797v1)|null|\n", "2303.09756": "|**2023-03-17**|**Video Action Recognition with Attentive Semantic Units**|Yifei Chen et.al.|[2303.09756v1](http://arxiv.org/abs/2303.09756v1)|null|\n", "2303.09733": "|**2023-03-17**|**Scribble-Supervised RGB-T Salient Object Detection**|Zhengyi Liu et.al.|[2303.09733v1](http://arxiv.org/abs/2303.09733v1)|**[link](https://github.com/liuzywen/rgbtscribble-icme2023)**|\n", "2303.09695": "|**2023-03-17**|**PersonalTailor: Personalizing 2D Pattern Design from 3D Garment Point Clouds**|Anran Qi et.al.|[2303.09695v1](http://arxiv.org/abs/2303.09695v1)|null|\n", "2303.11181": "|**2023-03-20**|**Non-Markovian paths and cycles in NFT trades**|Haaroon Yousaf et.al.|[2303.11181v1](http://arxiv.org/abs/2303.11181v1)|null|\n", "2303.11090": "|**2023-03-20**|**Scene Graph Based Fusion Network For Image-Text Retrieval**|Guoliang Wang et.al.|[2303.11090v1](http://arxiv.org/abs/2303.11090v1)|null|\n", "2303.10895": "|**2023-03-20**|**Leapfrog Diffusion Model for Stochastic Trajectory Prediction**|Weibo Mao et.al.|[2303.10895v1](http://arxiv.org/abs/2303.10895v1)|**[link](https://github.com/mediabrain-sjtu/led)**|\n", "2303.10865": "|**2023-03-21**|**Rotating Objects via In-Hand Pivoting using Vision, Force and Touch**|Shiyu Xu et.al.|[2303.10865v2](http://arxiv.org/abs/2303.10865v2)|null|\n", "2303.10849": "|**2023-03-20**|**Facial Affective Analysis based on MAE and Multi-modal Information for 5th ABAW Competition**|Wei Zhang et.al.|[2303.10849v1](http://arxiv.org/abs/2303.10849v1)|null|\n", "2303.10839": "|**2023-03-21**|**MXM-CLR: A Unified Framework for Contrastive Learning of Multifold Cross-Modal Representations**|Ye Wang et.al.|[2303.10839v2](http://arxiv.org/abs/2303.10839v2)|null|\n", "2303.10835": "|**2023-03-20**|**Bifurcation analysis of the Keynesian cross model**|Xinyu Li et.al.|[2303.10835v1](http://arxiv.org/abs/2303.10835v1)|null|\n", "2303.10826": "|**2023-03-20**|**Visual Prompt Multi-Modal Tracking**|Jiawen Zhu et.al.|[2303.10826v1](http://arxiv.org/abs/2303.10826v1)|**[link](https://github.com/jiawen-zhu/vipt)**|\n", "2303.10794": "|**2023-03-19**|**PheME: A deep ensemble framework for improving phenotype prediction from multi-modal data**|Shenghan Zhang et.al.|[2303.10794v1](http://arxiv.org/abs/2303.10794v1)|null|\n", "2303.10766": "|**2023-03-21**|**Multi-modal reward for visual relationships-based image captioning**|Ali Abedi et.al.|[2303.10766v2](http://arxiv.org/abs/2303.10766v2)|null|\n", "2303.10667": "|**2023-03-19**|**Audio-Text Models Do Not Yet Leverage Natural Language**|Ho-Hsiang Wu et.al.|[2303.10667v1](http://arxiv.org/abs/2303.10667v1)|**[link](https://github.com/hohsiangwu/preposition-synthesis)**|\n", "2303.10590": "|**2023-03-19**|**Multi-modal Facial Action Unit Detection with Large Pre-trained Models for the 5th Competition on Affective Behavior Analysis in-the-wild**|Yufeng Yin et.al.|[2303.10590v1](http://arxiv.org/abs/2303.10590v1)|null|\n", "2303.10571": "|**2023-03-19**|**CLIP4MC: An RL-Friendly Vision-Language Model for Minecraft**|Ziluo Ding et.al.|[2303.10571v1](http://arxiv.org/abs/2303.10571v1)|**[link](https://github.com/PKU-RL/CLIP4MC)**|\n", "2303.10457": "|**2023-03-18**|**Multi-Modal Continual Test-Time Adaptation for 3D Semantic Segmentation**|Haozhi Cao et.al.|[2303.10457v1](http://arxiv.org/abs/2303.10457v1)|null|\n", "2303.10406": "|**2023-03-18**|**3DQD: Generalized Deep 3D Shape Prior via Part-Discretized Diffusion Process**|Yuhan Li et.al.|[2303.10406v1](http://arxiv.org/abs/2303.10406v1)|**[link](https://github.com/colorful-liyu/3dqd)**|\n", "2303.12060": "|**2023-03-21**|**VideoXum: Cross-modal Visual and Textural Summarization of Videos**|Jingyang Lin et.al.|[2303.12060v1](http://arxiv.org/abs/2303.12060v1)|null|\n", "2303.11771": "|**2023-03-21**|**Self-Sufficient Framework for Continuous Sign Language Recognition**|Youngjoon Jang et.al.|[2303.11771v1](http://arxiv.org/abs/2303.11771v1)|null|\n", "2303.11732": "|**2023-03-21**|**Multi-modal Prompting for Low-Shot Temporal Action Localization**|Chen Ju et.al.|[2303.11732v1](http://arxiv.org/abs/2303.11732v1)|null|\n", "2303.11625": "|**2023-03-21**|**Information-containing Adversarial Perturbation for Combating Facial Manipulation Systems**|Yao Zhu et.al.|[2303.11625v1](http://arxiv.org/abs/2303.11625v1)|null|\n", "2303.12501": "|**2023-03-22**|**Cross-Modal Implicit Relation Reasoning and Aligning for Text-to-Image Person Retrieval**|Ding Jiang et.al.|[2303.12501v1](http://arxiv.org/abs/2303.12501v1)|**[link](https://github.com/anosorae/irra)**|\n", "2303.12445": "|**2023-03-22**|**MEDIMP: Medical Images and Prompts for renal transplant representation learning**|Leo Milecki et.al.|[2303.12445v1](http://arxiv.org/abs/2303.12445v1)|**[link](https://github.com/leomlck/medimp)**|\n", "2303.12423": "|**2023-03-22**|**Text with Knowledge Graph Augmented Transformer for Video Captioning**|Xin Gu et.al.|[2303.12423v1](http://arxiv.org/abs/2303.12423v1)|null|\n", "2303.12419": "|**2023-03-22**|**BiCro: Noisy Correspondence Rectification for Multi-modality Data via Bi-directional Cross-modal Similarity Consistency**|Shuo Yang et.al.|[2303.12419v1](http://arxiv.org/abs/2303.12419v1)|**[link](https://github.com/xu5zhao/bicro)**|\n", "2303.12417": "|**2023-03-22**|**CLIP^2: Contrastive Language-Image-Point Pretraining from Real-World Point Cloud Data**|Yihan Zeng et.al.|[2303.12417v1](http://arxiv.org/abs/2303.12417v1)|null|\n", "2303.12379": "|**2023-03-22**|**VMCML: Video and Music Matching via Cross-Modality Lifting**|Yi-Shan Lee et.al.|[2303.12379v1](http://arxiv.org/abs/2303.12379v1)|null|\n", "2303.12112": "|**2023-03-21**|**Positive-Augmented Constrastive Learning for Image and Video Captioning Evaluation**|Sara Sarto et.al.|[2303.12112v1](http://arxiv.org/abs/2303.12112v1)|**[link](https://github.com/aimagelab/pacscore)**|\n", "2303.13471": "|**2023-03-23**|**Egocentric Audio-Visual Object Localization**|Chao Huang et.al.|[2303.13471v1](http://arxiv.org/abs/2303.13471v1)|**[link](https://github.com/wikichao/ego-av-loc)**|\n", "2303.13455": "|**2023-03-23**|**CoBIT: A Contrastive Bi-directional Image-Text Generation Model**|Haoxuan You et.al.|[2303.13455v1](http://arxiv.org/abs/2303.13455v1)|null|\n", "2303.13430": "|**2023-03-23**|**Medical diffusion on a budget: textual inversion for medical image generation**|Bram de Wilde et.al.|[2303.13430v1](http://arxiv.org/abs/2303.13430v1)|null|\n", "2303.13371": "|**2023-03-23**|**Plug-and-Play Regulators for Image-Text Matching**|Haiwen Diao et.al.|[2303.13371v1](http://arxiv.org/abs/2303.13371v1)|**[link](https://github.com/paranioar/rcar)**|\n", "2303.13233": "|**2023-03-23**|**Visually-Prompted Language Model for Fine-Grained Scene Graph Generation in an Open World**|Qifan Yu et.al.|[2303.13233v1](http://arxiv.org/abs/2303.13233v1)|**[link](https://github.com/Yuqifan1117/CaCao)**|\n", "2303.13095": "|**2023-03-23**|**Modeling Entities as Semantic Points for Visual Information Extraction in the Wild**|Zhibo Yang et.al.|[2303.13095v1](http://arxiv.org/abs/2303.13095v1)|null|\n", "2303.13041": "|**2023-03-23**|**gDoc: Automatic Generation of Structured API Documentation**|Shujun Wang et.al.|[2303.13041v1](http://arxiv.org/abs/2303.13041v1)|null|\n", "2303.13009": "|**2023-03-23**|**MELTR: Meta Loss Transformer for Learning to Fine-tune Video Foundation Models**|Dohwan Ko et.al.|[2303.13009v1](http://arxiv.org/abs/2303.13009v1)|**[link](https://github.com/mlvlab/MELTR)**|\n", "2303.12997": "|**2023-03-23**|**FER-former: Multi-modal Transformer for Facial Expression Recognition**|Yande Li et.al.|[2303.12997v1](http://arxiv.org/abs/2303.12997v1)|null|\n", "2303.12930": "|**2023-03-24**|**Dense-Localizing Audio-Visual Events in Untrimmed Videos: A Large-Scale Benchmark and Baseline**|Tiantian Geng et.al.|[2303.12930v2](http://arxiv.org/abs/2303.12930v2)|**[link](https://github.com/ttgeng233/UnAV)**|\n", "2303.14153": "|**2023-03-24**|**Local Contrastive Learning for Medical Image Recognition**|S. A. Rizvi et.al.|[2303.14153v1](http://arxiv.org/abs/2303.14153v1)|null|\n", "2303.14139": "|**2023-03-24**|**MindDiffuser: Controlled Image Reconstruction from Human Brain Activity with Semantic and Structural Diffusion**|Yizhuo Lu et.al.|[2303.14139v1](http://arxiv.org/abs/2303.14139v1)|null|\n", "2303.14081": "|**2023-03-24**|**CoLa-Diff: Conditional Latent Diffusion Model for Multi-Modal MRI Synthesis**|Lan Jiang et.al.|[2303.14081v1](http://arxiv.org/abs/2303.14081v1)|null|\n", "2303.13885": "|**2023-03-24**|**ARKitTrack: A New Diverse Dataset for Tracking Using Mobile RGB-D Data**|Haojie Zhao et.al.|[2303.13885v1](http://arxiv.org/abs/2303.13885v1)|**[link](https://github.com/lawrence-cj/ARKitTrack)**|\n", "2303.13839": "|**2023-03-24**|**HRDoc: Dataset and Baseline Method Toward Hierarchical Reconstruction of Document Structures**|Jiefeng Ma et.al.|[2303.13839v1](http://arxiv.org/abs/2303.13839v1)|**[link](https://github.com/jfma-ustc/hrdoc)**|\n", "2303.13810": "|**2023-03-24**|**Evidence-aware multi-modal data fusion and its application to total knee replacement prediction**|Xinwen Liu et.al.|[2303.13810v1](http://arxiv.org/abs/2303.13810v1)|null|\n", "2303.15444": "|**2023-03-27**|**Quantum Multi-Model Fitting**|Matteo Farina et.al.|[2303.15444v1](http://arxiv.org/abs/2303.15444v1)|**[link](https://github.com/farinamatteo/qmmf)**|\n", "2303.15230": "|**2023-03-27**|**Troika: Multi-Path Cross-Modal Traction for Compositional Zero-Shot Learning**|Siteng Huang et.al.|[2303.15230v1](http://arxiv.org/abs/2303.15230v1)|null|\n", "2303.15219": "|**2023-03-27**|**Knowing the Distance: Understanding the Gap Between Synthetic and Real Data For Face Parsing**|Eli Friedman et.al.|[2303.15219v1](http://arxiv.org/abs/2303.15219v1)|null|\n", "2303.15103": "|**2023-03-27**|**Contrastive Learning Is Spectral Clustering On Similarity Graph**|Zhiquan Tan et.al.|[2303.15103v1](http://arxiv.org/abs/2303.15103v1)|**[link](https://github.com/yifanzhang-pro/kernel-infonce)**|\n", "2303.15083": "|**2023-03-27**|**UniDistill: A Universal Cross-Modality Knowledge Distillation Framework for 3D Object Detection in Bird's-Eye View**|Shengchao Zhou et.al.|[2303.15083v1](http://arxiv.org/abs/2303.15083v1)|**[link](https://github.com/megvii-research/cvpr2023-unidistill)**|\n", "2303.15016": "|**2023-03-27**|**Borrowing Human Senses: Comment-Aware Self-Training for Social Media Multimodal Classification**|Chunpu Xu et.al.|[2303.15016v1](http://arxiv.org/abs/2303.15016v1)|**[link](https://github.com/cpaaax/multimodal_cast)**|\n", "2303.15006": "|**2023-03-27**|**Curriculum Learning for Compositional Visual Reasoning**|Wafa Aissa et.al.|[2303.15006v1](http://arxiv.org/abs/2303.15006v1)|null|\n", "2303.14998": "|**2023-03-27**|**Multi-view Cross-Modality MR Image Translation for Vestibular Schwannoma and Cochlea Segmentation**|Bogyeong Kang et.al.|[2303.14998v1](http://arxiv.org/abs/2303.14998v1)|null|\n", "2303.14880": "|**2023-03-27**|**Toward Human-Like Social Robot Navigation: A Large-Scale, Multi-Modal, Social Human Navigation Dataset**|Duc M. Nguyen et.al.|[2303.14880v1](http://arxiv.org/abs/2303.14880v1)|null|\n", "2303.14865": "|**2023-03-27**|**Revisiting Multimodal Representation in Contrastive Learning: From Patch and Token Embeddings to Finite Discrete Tokens**|Yuxiao Chen et.al.|[2303.14865v1](http://arxiv.org/abs/2303.14865v1)|**[link](https://github.com/yuxiaochen1103/fdt)**|\n", "2303.14840": "|**2023-03-26**|**On the Importance of Accurate Geometry Data for Dense 3D Vision Tasks**|HyunJun Jung et.al.|[2303.14840v1](http://arxiv.org/abs/2303.14840v1)|**[link](https://github.com/junggy/hammer-dataset)**|\n", "2303.14768": "|**2023-03-26**|**Collaborative Noisy Label Cleaner: Learning Scene-aware Trailers for Multi-modal Highlight Detection in Movies**|Bei Gan et.al.|[2303.14768v1](http://arxiv.org/abs/2303.14768v1)|**[link](https://github.com/tencentyouturesearch/highlightdetection-clc)**|\n", "2303.14730": "|**2023-03-26**|**Semantic Neural Decoding via Cross-Modal Generation**|Xuelin Qian et.al.|[2303.14730v1](http://arxiv.org/abs/2303.14730v1)|null|\n", "2303.14666": "|**2023-03-26**|**Generalization Matters: Loss Minima Flattening via Parameter Hybridization for Efficient Online Knowledge Distillation**|Tianli Zhang et.al.|[2303.14666v1](http://arxiv.org/abs/2303.14666v1)|null|\n", "2303.14626": "|**2023-03-26**|**MRCN: A Novel Modality Restitution and Compensation Network for Visible-Infrared Person Re-identification**|Yukang Zhang et.al.|[2303.14626v1](http://arxiv.org/abs/2303.14626v1)|null|\n", "2303.16199": "|**2023-03-28**|**LLaMA-Adapter: Efficient Fine-tuning of Language Models with Zero-init Attention**|Renrui Zhang et.al.|[2303.16199v1](http://arxiv.org/abs/2303.16199v1)|**[link](https://github.com/zrrskywalker/llama-adapter)**|\n", "2303.16099": "|**2023-03-28**|**Medical Image Analysis using Deep Relational Learning**|Zhihua Liu et.al.|[2303.16099v1](http://arxiv.org/abs/2303.16099v1)|null|\n", "2303.16058": "|**2023-03-28**|**Unmasked Teacher: Towards Training-Efficient Video Foundation Models**|Kunchang Li et.al.|[2303.16058v1](http://arxiv.org/abs/2303.16058v1)|**[link](https://github.com/opengvlab/unmasked_teacher)**|\n", "2303.15932": "|**2023-03-29**|**Unify, Align and Refine: Multi-Level Semantic Alignment for Radiology Report Generation**|Yaowei Li et.al.|[2303.15932v2](http://arxiv.org/abs/2303.15932v2)|null|\n", "2303.15826": "|**2023-03-28**|**MS-MT: Multi-Scale Mean Teacher with Contrastive Unpaired Translation for Cross-Modality Vestibular Schwannoma and Cochlea Segmentation**|Ziyuan Zhao et.al.|[2303.15826v1](http://arxiv.org/abs/2303.15826v1)|null|\n", "2303.15777": "|**2023-03-28**|**Imbalance Knowledge-Driven Multi-modal Network for Land-Cover Semantic Segmentation Using Images and LiDAR Point Clouds**|Yameng Wang et.al.|[2303.15777v1](http://arxiv.org/abs/2303.15777v1)|null|\n", "2303.15770": "|**2023-03-28**|**DDMM-Synth: A Denoising Diffusion Model for Cross-modal Medical Image Synthesis with Sparse-view Measurement Embedding**|Xiaoyue Li et.al.|[2303.15770v1](http://arxiv.org/abs/2303.15770v1)|null|\n", "2303.15710": "|**2023-03-28**|**Explicit Attention-Enhanced Fusion for RGB-Thermal Perception Tasks**|Mingjian Liang et.al.|[2303.15710v1](http://arxiv.org/abs/2303.15710v1)|**[link](https://github.com/freeformrobotics/eaefnet)**|\n", "2303.16818": "|**2023-03-30**|**BEVSimDet: Simulated Multi-modal Distillation in Bird's-Eye View for Multi-view 3D Object Detection**|Haimei Zhao et.al.|[2303.16818v2](http://arxiv.org/abs/2303.16818v2)|**[link](https://github.com/vitae-transformer/bevsimdet)**|\n", "2303.16604": "|**2023-03-29**|**Bi-directional Training for Composed Image Retrieval via Text Prompt Learning**|Zheyuan Liu et.al.|[2303.16604v1](http://arxiv.org/abs/2303.16604v1)|**[link](https://github.com/Cuberick-Orion/Bi-Blip4CIR)**|\n", "2303.16541": "|**2023-03-29**|**Sounding Video Generator: A Unified Framework for Text-guided Sounding Video Generation**|Jiawei Liu et.al.|[2303.16541v1](http://arxiv.org/abs/2303.16541v1)|**[link](https://github.com/jwliu-cc/svg)**|\n", "2303.16443": "|**2023-03-29**|**A tensor based varying-coefficient model for multi-modal neuroimaging data analysis**|Pratim Guha Niyogi et.al.|[2303.16443v1](http://arxiv.org/abs/2303.16443v1)|null|\n", "2303.17561": "|**2023-03-30**|**SoftCLIP: Softer Cross-modal Alignment Makes CLIP Stronger**|Yuting Gao et.al.|[2303.17561v1](http://arxiv.org/abs/2303.17561v1)|null|\n", "2303.17531": "|**2023-03-30**|**Asymmetric Face Recognition with Cross Model Compatible Ensembles**|Ori Linial et.al.|[2303.17531v1](http://arxiv.org/abs/2303.17531v1)|null|\n", "2303.17517": "|**2023-03-30**|**Hindi as a Second Language: Improving Visually Grounded Speech with Semantically Similar Samples**|Hyeonggon Ryu et.al.|[2303.17517v1](http://arxiv.org/abs/2303.17517v1)|null|\n", "2303.17490": "|**2023-03-30**|**Sound to Visual Scene Generation by Audio-to-Visual Latent Alignment**|Kim Sung-Bin et.al.|[2303.17490v1](http://arxiv.org/abs/2303.17490v1)|null|\n", "2303.17409": "|**2023-03-30**|**Steered Mixture of Experts Regression for Image Denoising with Multi-Model-Inference**|Aytac \u00d6zkan et.al.|[2303.17409v1](http://arxiv.org/abs/2303.17409v1)|null|\n", "2303.17386": "|**2023-03-30**|**Complementary Random Masking for RGB-Thermal Semantic Segmentation**|Ukcheol Shin et.al.|[2303.17386v1](http://arxiv.org/abs/2303.17386v1)|**[link](https://github.com/UkcheolShin/CRM_RGBTSeg)**|\n", "2303.17297": "|**2023-03-30**|**Understanding the Robustness of 3D Object Detection with Bird's-Eye-View Representations in Autonomous Driving**|Zijian Zhu et.al.|[2303.17297v1](http://arxiv.org/abs/2303.17297v1)|**[link](https://github.com/zzj403/BEV_Robust)**|\n", "2303.17285": "|**2023-03-30**|**Decomposed Cross-modal Distillation for RGB-based Temporal Action Detection**|Pilhyeon Lee et.al.|[2303.17285v1](http://arxiv.org/abs/2303.17285v1)|null|\n", "2303.17169": "|**2023-03-30**|**Task-Oriented Multi-Modal Mutual Leaning for Vision-Language Models**|Sifan Long et.al.|[2303.17169v1](http://arxiv.org/abs/2303.17169v1)|null|\n", "2303.17099": "|**2023-03-30**|**BEVFusion4D: Learning LiDAR-Camera Fusion Under Bird's-Eye-View via Cross-Modality Guidance and Temporal Aggregation**|Hongxiang Cai et.al.|[2303.17099v1](http://arxiv.org/abs/2303.17099v1)|null|\n", "2303.18248": "|**2023-03-31**|**Towards Flexible Multi-modal Document Models**|Naoto Inoue et.al.|[2303.18248v1](http://arxiv.org/abs/2303.18248v1)|**[link](https://github.com/CyberAgentAILab/flex-dm)**|\n", "2303.17981": "|**2023-03-31**|**Knowledge Distillation for Feature Extraction in Underwater VSLAM**|Jinghe Yang et.al.|[2303.17981v1](http://arxiv.org/abs/2303.17981v1)|**[link](https://github.com/jinghe-mel/ufen-slam)**|\n", "2303.17859": "|**2023-03-31**|**MapFormer: Boosting Change Detection by Using Pre-change Information**|Maximilian Bernhard et.al.|[2303.17859v1](http://arxiv.org/abs/2303.17859v1)|**[link](https://github.com/mxbh/mapformer)**|\n", "2303.17811": "|**2023-04-03**|**Zero-shot Referring Image Segmentation with Global-Local Context Features**|Seonghoon Yu et.al.|[2303.17811v2](http://arxiv.org/abs/2303.17811v2)|**[link](https://github.com/seonghoon-yu/zero-shot-ris)**|\n", "2304.00932": "|**2023-04-03**|**HypLiLoc: Towards Effective LiDAR Pose Regression with Hyperbolic Fusion**|Sijie Wang et.al.|[2304.00932v1](http://arxiv.org/abs/2304.00932v1)|**[link](https://github.com/sijieaaa/hypliloc)**|\n", "2304.00827": "|**2023-04-03**|**Multi-modal Fake News Detection on Social Media via Multi-grained Information Fusion**|Yangming Zhou et.al.|[2304.00827v1](http://arxiv.org/abs/2304.00827v1)|null|\n", "2304.00788": "|**2023-04-03**|**Open-Vocabulary Point-Cloud Object Detection without 3D Annotation**|Yuheng Lu et.al.|[2304.00788v1](http://arxiv.org/abs/2304.00788v1)|**[link](https://github.com/lyhdet/ov-3det)**|\n", "2304.00719": "|**2023-04-03**|**Multi-Modal Representation Learning with Text-Driven Soft Masks**|Jaeyoo Park et.al.|[2304.00719v1](http://arxiv.org/abs/2304.00719v1)|null|\n", "2304.00670": "|**2023-04-03**|**CRN: Camera Radar Net for Accurate, Robust, Efficient 3D Perception**|Youngseok Kim et.al.|[2304.00670v1](http://arxiv.org/abs/2304.00670v1)|null|\n", "2304.00495": "|**2023-04-02**|**Multimodal Hyperspectral Image Classification via Interconnected Fusion**|Lu Huo et.al.|[2304.00495v1](http://arxiv.org/abs/2304.00495v1)|null|\n", "2304.00450": "|**2023-04-02**|**Sketch-based Video Object Localization**|Sangmin Woo et.al.|[2304.00450v1](http://arxiv.org/abs/2304.00450v1)|null|\n", "2304.00379": "|**2023-04-01**|**Improved Multimodal Fusion for Small Datasets with Auxiliary Supervision**|Gregory Holste et.al.|[2304.00379v1](http://arxiv.org/abs/2304.00379v1)|null|\n", "2304.00157": "|**2023-03-31**|**Robotic Perception of Transparent Objects: A Review**|Jiaqi Jiang et.al.|[2304.00157v1](http://arxiv.org/abs/2304.00157v1)|null|\n", "2304.01961": "|**2023-04-04**|**AToMiC: An Image/Text Retrieval Test Collection to Support Multimedia Content Creation**|Jheng-Hong Yang et.al.|[2304.01961v1](http://arxiv.org/abs/2304.01961v1)|**[link](https://github.com/trec-atomic/atomic)**|\n", "2304.01799": "|**2023-04-04**|**naplib-python: Neural Acoustic Data Processing and Analysis Tools in Python**|Gavin Mischler et.al.|[2304.01799v1](http://arxiv.org/abs/2304.01799v1)|**[link](https://github.com/naplab/naplib-python)**|\n", "2304.01705": "|**2023-04-04**|**Cross-modal tumor segmentation using generative blending augmentation and self training**|Guillaume Sall\u00e9 et.al.|[2304.01705v1](http://arxiv.org/abs/2304.01705v1)|null|\n", "2304.01603": "|**2023-04-04**|**Locate Then Generate: Bridging Vision and Language with Bounding Box for Scene-Text VQA**|Yongxin Zhu et.al.|[2304.01603v1](http://arxiv.org/abs/2304.01603v1)|null|\n", "2304.01601": "|**2023-04-04**|**Primitive Simultaneous Optimization of Similarity Metrics for Image Registration**|Diana Waldmannstetter et.al.|[2304.01601v1](http://arxiv.org/abs/2304.01601v1)|null|\n", "2304.01563": "|**2023-04-04**|**Attribute-Consistent Knowledge Graph Representation Learning for Multi-Modal Entity Alignment**|Qian Li et.al.|[2304.01563v1](http://arxiv.org/abs/2304.01563v1)|null|\n", "2304.01491": "|**2023-04-04**|**Multi model LSTM architecture for Track Association based on Automatic Identification System Data**|Md Asif Bin Syed et.al.|[2304.01491v1](http://arxiv.org/abs/2304.01491v1)|null|\n", "2304.01440": "|**2023-04-04**|**A Deep Multi-Modal Cyber-Attack Detection in Industrial Control Systems**|Sepideh Bahadoripour et.al.|[2304.01440v1](http://arxiv.org/abs/2304.01440v1)|null|\n", "2304.01430": "|**2023-04-04**|**Divided Attention: Unsupervised Multi-Object Discovery with Contextually Separated Slots**|Dong Lao et.al.|[2304.01430v1](http://arxiv.org/abs/2304.01430v1)|null|\n", "2304.01233": "|**2023-04-03**|**Multi-Modal Perceiver Language Model for Outcome Prediction in Emergency Department**|Sabri Boughorbel et.al.|[2304.01233v1](http://arxiv.org/abs/2304.01233v1)|null|\n", "2304.02556": "|**2023-04-05**|**Detecting and Grounding Multi-Modal Media Manipulation**|Rui Shao et.al.|[2304.02556v1](http://arxiv.org/abs/2304.02556v1)|**[link](https://github.com/rshaojimmy/multimodal-deepfake)**|\n", "2304.02532": "|**2023-04-05**|**Goal-Conditioned Imitation Learning using Score-based Diffusion Policies**|Moritz Reuss et.al.|[2304.02532v1](http://arxiv.org/abs/2304.02532v1)|null|\n", "2304.02419": "|**2023-04-05**|**TM2D: Bimodality Driven 3D Dance Generation via Music-Text Integration**|Kehong Gong et.al.|[2304.02419v1](http://arxiv.org/abs/2304.02419v1)|**[link](https://github.com/Garfield-kh/TM2D)**|\n", "2304.02407": "|**2023-04-05**|**Explaining Multimodal Data Fusion: Occlusion Analysis for Wilderness Mapping**|Burak Ekim et.al.|[2304.02407v1](http://arxiv.org/abs/2304.02407v1)|null|\n", "2304.02328": "|**2023-04-05**|**Enhancing Multimodal Entity and Relation Extraction with Variational Information Bottleneck**|Shiyao Cui et.al.|[2304.02328v1](http://arxiv.org/abs/2304.02328v1)|null|\n", "2304.02278": "|**2023-04-05**|**Calibrating Cross-modal Feature for Text-Based Person Searching**|Donglai Wei et.al.|[2304.02278v1](http://arxiv.org/abs/2304.02278v1)|null|\n", "2304.03047": "|**2023-04-07**|**ETPNav: Evolving Topological Planning for Vision-Language Navigation in Continuous Environments**|Dong An et.al.|[2304.03047v2](http://arxiv.org/abs/2304.03047v2)|**[link](https://github.com/marsaki/etpnav)**|\n", "2304.02991": "|**2023-04-06**|**Exploiting the Complementarity of 2D and 3D Networks to Address Domain-Shift in 3D Semantic Segmentation**|Adriano Cardace et.al.|[2304.02991v1](http://arxiv.org/abs/2304.02991v1)|**[link](https://github.com/cvlab-unibo/mm2d3d)**|\n", "2304.02948": "|**2023-04-06**|**FengWu: Pushing the Skillful Global Medium-range Weather Forecast beyond 10 Days Lead**|Kang Chen et.al.|[2304.02948v1](http://arxiv.org/abs/2304.02948v1)|null|\n", "2304.02916": "|**2023-04-06**|**Efficient Audio Captioning Transformer with Patchout and Text Guidance**|Thodoris Kouzelis et.al.|[2304.02916v1](http://arxiv.org/abs/2304.02916v1)|null|\n", "2304.02902": "|**2023-04-06**|**Towards Efficient MCMC Sampling in Bayesian Neural Networks by Exploiting Symmetry**|Jonas Gregor Wiese et.al.|[2304.02902v1](http://arxiv.org/abs/2304.02902v1)|null|\n", "2304.02853": "|**2023-04-06**|**Learning Instance-Level Representation for Large-Scale Multi-Modal Pretraining in E-commerce**|Yang Jin et.al.|[2304.02853v1](http://arxiv.org/abs/2304.02853v1)|null|\n", "2304.03669": "|**2023-04-07**|**DATE: Domain Adaptive Product Seeker for E-commerce**|Haoyuan Li et.al.|[2304.03669v1](http://arxiv.org/abs/2304.03669v1)|null|\n", "2304.03542": "|**2023-04-07**|**Better \"CMOS\" Produces Clearer Images: Learning Space-Variant Blur Estimation for Blind Image Super-Resolution**|Xuhai Chen et.al.|[2304.03542v1](http://arxiv.org/abs/2304.03542v1)|null|\n", "2304.03391": "|**2023-04-06**|**Exposing and Mitigating Spurious Correlations for Cross-Modal Retrieval**|Jae Myung Kim et.al.|[2304.03391v1](http://arxiv.org/abs/2304.03391v1)|null|\n", "2304.04523": "|**2023-04-10**|**PoseFusion: Robust Object-in-Hand Pose Estimation with SelectLSTM**|Yuyang Tu et.al.|[2304.04523v1](http://arxiv.org/abs/2304.04523v1)|null|\n", "2304.04302": "|**2023-04-09**|**Bionic Collapsible Wings in Aquatic-aerial Robot**|Xiao Xiong et.al.|[2304.04302v1](http://arxiv.org/abs/2304.04302v1)|null|\n", "2304.04298": "|**2023-04-09**|**Unsupervised Sampling Promoting for Stochastic Human Trajectory Prediction**|Guangyi Chen et.al.|[2304.04298v1](http://arxiv.org/abs/2304.04298v1)|**[link](https://github.com/viewsetting/unsupervised_sampling_promoting)**|\n", "2304.04290": "|**2023-04-09**|**Distributed Conditional GAN (discGAN) For Synthetic Healthcare Data Generation**|David Fuentes et.al.|[2304.04290v1](http://arxiv.org/abs/2304.04290v1)|null|\n", "2304.04231": "|**2023-04-09**|**CrowdCLIP: Unsupervised Crowd Counting via Vision-Language Model**|Dingkang Liang et.al.|[2304.04231v1](http://arxiv.org/abs/2304.04231v1)|**[link](https://github.com/dk-liang/crowdclip)**|\n", "2304.04187": "|**2023-04-09**|**Similarity-Aware Multimodal Prompt Learning for Fake News Detection**|Ye Jiang et.al.|[2304.04187v1](http://arxiv.org/abs/2304.04187v1)|null|\n", "2304.04113": "|**2023-04-08**|**An Automated Fully-Computational Framework to Construct Printability Maps for Additively Manufactured Metal Alloys**|Sofia Sheikh et.al.|[2304.04113v1](http://arxiv.org/abs/2304.04113v1)|null|\n", "2304.04062": "|**2023-04-08**|**Predicting multiple sclerosis disease severity with multimodal deep neural networks**|Kai Zhang et.al.|[2304.04062v1](http://arxiv.org/abs/2304.04062v1)|**[link](https://github.com/anotherkaizhang/ms)**|\n", "2304.03916": "|**2023-04-08**|**Mitigating Spurious Correlations in Multi-modal Models during Fine-tuning**|Yu Yang et.al.|[2304.03916v1](http://arxiv.org/abs/2304.03916v1)|null|\n", "2304.03910": "|**2023-04-08**|**Co-attention Propagation Network for Zero-Shot Video Object Segmentation**|Gensheng Pei et.al.|[2304.03910v1](http://arxiv.org/abs/2304.03910v1)|**[link](https://github.com/nust-machine-intelligence-laboratory/hcpn)**|\n", "2304.03897": "|**2023-04-08**|**Factify 2: A Multimodal Fake News and Satire News Dataset**|S Suryavardan et.al.|[2304.03897v1](http://arxiv.org/abs/2304.03897v1)|**[link](https://github.com/surya1701/factify-2.0)**|\n", "2304.05340": "|**2023-04-11**|**Unified Multi-Modal Image Synthesis for Missing Modality Imputation**|Yue Zhang et.al.|[2304.05340v1](http://arxiv.org/abs/2304.05340v1)|null|\n", "2304.05171": "|**2023-04-11**|**Curriculum-Based Imitation of Versatile Skills**|Maximilian Xiling Li et.al.|[2304.05171v1](http://arxiv.org/abs/2304.05171v1)|**[link](https://github.com/intuitive-robots/ml-cur)**|\n", "2304.05166": "|**2023-04-11**|**TrajFlow: Learning the Distribution over Trajectories**|Anna M\u00e9sz\u00e1ros et.al.|[2304.05166v1](http://arxiv.org/abs/2304.05166v1)|null|\n", "2304.05080": "|**2023-04-11**|**Investigating Imbalances Between SAR and Optical Utilization for Multi-Modal Urban Mapping**|Sebastian Hafner et.al.|[2304.05080v1](http://arxiv.org/abs/2304.05080v1)|null|\n", "2304.05051": "|**2023-04-11**|**FashionSAP: Symbols and Attributes Prompt for Fine-grained Fashion Vision-Language Pre-training**|Yunpeng Han et.al.|[2304.05051v1](http://arxiv.org/abs/2304.05051v1)|**[link](https://github.com/hssip/fashionsap)**|\n", "2304.05979": "|**2023-04-12**|**NaviSTAR: Socially Aware Robot Navigation with Hybrid Spatio-Temporal Graph Transformer and Preference Learning**|Weizheng Wang et.al.|[2304.05979v1](http://arxiv.org/abs/2304.05979v1)|null|\n", "2304.05754": "|**2023-04-12**|**Self-Supervised Learning with Cluster-Aware-DINO for High-Performance Robust Speaker Verification**|Bing Han et.al.|[2304.05754v1](http://arxiv.org/abs/2304.05754v1)|null|\n", "2304.05720": "|**2023-04-12**|**Towards a more comprehensive open-source model for interdisciplinary smart integrated energy systems**|B\u00e9la Wiegel et.al.|[2304.05720v1](http://arxiv.org/abs/2304.05720v1)|null|\n", "2304.05646": "|**2023-04-12**|**Modality-Invariant Representation for Infrared and Visible Image Registration**|Zhiying Jiang et.al.|[2304.05646v1](http://arxiv.org/abs/2304.05646v1)|null|\n", "2304.05645": "|**2023-04-12**|**WildRefer: 3D Object Localization in Large-scale Dynamic Scenes with Multi-modal Visual Data and Natural Language**|Zhenxiang Lin et.al.|[2304.05645v1](http://arxiv.org/abs/2304.05645v1)|null|\n", "2304.05600": "|**2023-04-12**|**Looking Similar, Sounding Different: Leveraging Counterfactual Cross-Modal Pairs for Audiovisual Representation Learning**|Nikhil Singh et.al.|[2304.05600v1](http://arxiv.org/abs/2304.05600v1)|null|\n", "2304.05523": "|**2023-04-11**|**MoMo: A shared encoder Model for text, image and multi-Modal representations**|Rakesh Chada et.al.|[2304.05523v1](http://arxiv.org/abs/2304.05523v1)|null|\n", "2304.05402": "|**2023-04-11**|**Boosting Cross-task Transferability of Adversarial Patches with Visual Relations**|Tony Ma et.al.|[2304.05402v1](http://arxiv.org/abs/2304.05402v1)|null|\n", "2304.06708": "|**2023-04-13**|**Verbs in Action: Improving verb understanding in video-language models**|Liliane Momeni et.al.|[2304.06708v1](http://arxiv.org/abs/2304.06708v1)|null|\n", "2304.06306": "|**2023-04-13**|**Efficient Multimodal Fusion via Interactive Prompting**|Yaowei Li et.al.|[2304.06306v1](http://arxiv.org/abs/2304.06306v1)|null|\n", "2304.06275": "|**2023-04-13**|**Noisy Correspondence Learning with Meta Similarity Correction**|Haochen Han et.al.|[2304.06275v1](http://arxiv.org/abs/2304.06275v1)|**[link](https://github.com/hhc1997/mscn)**|\n", "2304.06264": "|**2023-04-13**|**Loosely Coupled Odometry, UWB Ranging, and Cooperative Spatial Detection for Relative Monte-Carlo Multi-Robot Localization**|Xianjia Yu et.al.|[2304.06264v1](http://arxiv.org/abs/2304.06264v1)|**[link](https://github.com/tiers/uwb-cooperative-mrs-localization)**|\n", "2304.06051": "|**2023-04-12**|**Open-TransMind: A New Baseline and Benchmark for 1st Foundation Model Challenge of Intelligent Transportation**|Yifeng Shi et.al.|[2304.06051v1](http://arxiv.org/abs/2304.06051v1)|**[link](https://github.com/Traffic-X/Open-TransMind)**|\n", "2304.07199": "|**2023-04-14**|**CROVIA: Seeing Drone Scenes from Car Perspective via Cross-View Adaptation**|Thanh-Dat Truong et.al.|[2304.07199v1](http://arxiv.org/abs/2304.07199v1)|null|\n", "2304.07151": "|**2023-04-14**|**End-to-End Learning with Multiple Modalities for System-Optimised Renewables Nowcasting**|Rushil Vohra et.al.|[2304.07151v1](http://arxiv.org/abs/2304.07151v1)|null|\n", "2304.07147": "|**2023-04-14**|**Cross Attention Transformers for Multi-modal Unsupervised Whole-Body PET Anomaly Detection**|Ashay Patel et.al.|[2304.07147v1](http://arxiv.org/abs/2304.07147v1)|null|\n", "2304.06991": "|**2023-04-14**|**WYTIWYR: A User Intent-Aware Framework with Multi-modal Inputs for Visualization Retrieval**|Shishi Xiao et.al.|[2304.06991v1](http://arxiv.org/abs/2304.06991v1)|**[link](https://github.com/serendipitysx/wytiwyr)**|\n", "2304.06910": "|**2023-04-14**|**HCAM -- Hierarchical Cross Attention Model for Multi-modal Emotion Recognition**|Soumya Dutta et.al.|[2304.06910v1](http://arxiv.org/abs/2304.06910v1)|null|\n", "2304.06786": "|**2023-04-13**|**The future of hearing aid technology**|Volker Hohmann et.al.|[2304.06786v1](http://arxiv.org/abs/2304.06786v1)|null|\n", "2304.08345": "|**2023-04-17**|**VALOR: Vision-Audio-Language Omni-Perception Pretraining Model and Dataset**|Sihan Chen et.al.|[2304.08345v1](http://arxiv.org/abs/2304.08345v1)|**[link](https://github.com/TXH-mercury/VALOR)**|\n", "2304.08304": "|**2023-04-17**|**SDVRF: Sparse-to-Dense Voxel Region Fusion for Multi-modal 3D Object Detection**|Binglu Ren et.al.|[2304.08304v1](http://arxiv.org/abs/2304.08304v1)|null|\n", "2304.08083": "|**2023-04-17**|**Causality-aware Visual Scene Discovery for Cross-Modal Question Reasoning**|Yang Liu et.al.|[2304.08083v1](http://arxiv.org/abs/2304.08083v1)|null|\n", "2304.08072": "|**2023-04-17**|**Two-stage MR Image Segmentation Method for Brain Tumors based on Attention Mechanism**|Li Zhu et.al.|[2304.08072v1](http://arxiv.org/abs/2304.08072v1)|null|\n", "2304.08058": "|**2023-04-17**|**One-Class SVM on siamese neural network latent space for Unsupervised Anomaly Detection on brain MRI White Matter Hyperintensities**|Nicolas Pinon et.al.|[2304.08058v1](http://arxiv.org/abs/2304.08058v1)|null|\n", "2304.08054": "|**2023-04-17**|**Fed-MIWAE: Federated Imputation of Incomplete Data via Deep Generative Models**|Irene Balelli et.al.|[2304.08054v1](http://arxiv.org/abs/2304.08054v1)|null|\n", "2304.07775": "|**2023-04-16**|**Robust Cross-Modal Knowledge Distillation for Unconstrained Videos**|Wenke Xia et.al.|[2304.07775v1](http://arxiv.org/abs/2304.07775v1)|**[link](https://github.com/gewu-lab/cross-modal-distillation)**|\n", "2304.07728": "|**2023-04-16**|**TransFusionOdom: Interpretable Transformer-based LiDAR-Inertial Fusion Odometry Estimation**|Leyuan Sun et.al.|[2304.07728v1](http://arxiv.org/abs/2304.07728v1)|**[link](https://github.com/rakugenson/multi-modal-dataset-for-odometry-estimation)**|\n", "2304.07633": "|**2023-04-15**|**Detecting Out-of-Context Multimodal Misinformation with interpretable neural-symbolic model**|Yizhou Zhang et.al.|[2304.07633v1](http://arxiv.org/abs/2304.07633v1)|null|\n", "2304.07567": "|**2023-04-15**|**CoVLR: Coordinating Cross-Modal Consistency and Intra-Modal Structure for Vision-Language Retrieval**|Yang Yang et.al.|[2304.07567v1](http://arxiv.org/abs/2304.07567v1)|null|\n", "2304.07549": "|**2023-04-15**|**MA-ViT: Modality-Agnostic Vision Transformers for Face Anti-Spoofing**|Ajian Liu et.al.|[2304.07549v1](http://arxiv.org/abs/2304.07549v1)|null|\n", "2304.07387": "|**2023-04-14**|**Cross-domain Food Image-to-Recipe Retrieval by Weighted Adversarial Learning**|Bin Zhu et.al.|[2304.07387v1](http://arxiv.org/abs/2304.07387v1)|null|\n", "2304.09172": "|**2023-04-18**|**Hyperbolic Image-Text Representations**|Karan Desai et.al.|[2304.09172v1](http://arxiv.org/abs/2304.09172v1)|null|\n", "2304.09164": "|**2023-04-18**|**Structure Preserving Cycle-GAN for Unsupervised Medical Image Domain Adaptation**|Paolo Iacono et.al.|[2304.09164v1](http://arxiv.org/abs/2304.09164v1)|null|\n", "2304.08965": "|**2023-04-18**|**Unsupervised Semantic Segmentation of 3D Point Clouds via Cross-modal Distillation and Super-Voxel Clustering**|Zisheng Chen et.al.|[2304.08965v1](http://arxiv.org/abs/2304.08965v1)|**[link](https://github.com/scut-bip-lab/pointdc)**|\n", "2304.08881": "|**2023-04-18**|**Segmentation of glioblastomas in early post-operative multi-modal MRI with deep neural networks**|Ragnhild Holden Helland et.al.|[2304.08881v1](http://arxiv.org/abs/2304.08881v1)|**[link](https://github.com/dbouget/validation_metrics_computation)**|\n", "2304.08709": "|**2023-04-18**|**You Only Need Two Detectors to Achieve Multi-Modal 3D Multi-Object Tracking**|Xiyang Wang et.al.|[2304.08709v1](http://arxiv.org/abs/2304.08709v1)|**[link](https://github.com/wangxiyang2022/YONTD-MOT)**|\n", "2304.08660": "|**2023-04-17**|**(LC)$^2$: LiDAR-Camera Loop Constraints For Cross-Modal Place Recognition**|Alex Junho Lee et.al.|[2304.08660v1](http://arxiv.org/abs/2304.08660v1)|null|\n", "2304.08658": "|**2023-04-20**|**In-situ surface porosity prediction in DED (directed energy deposition) printed SS316L parts using multimodal sensor fusion**|Adithyaa Karthikeyan et.al.|[2304.08658v2](http://arxiv.org/abs/2304.08658v2)|null|\n", "2304.09801": "|**2023-04-19**|**MetaBEV: Solving Sensor Failures for BEV Detection and Map Segmentation**|Chongjian Ge et.al.|[2304.09801v1](http://arxiv.org/abs/2304.09801v1)|**[link](https://github.com/ChongjianGE/MetaBEV)**|\n", "2304.09694": "|**2023-04-19**|**CrossFusion: Interleaving Cross-modal Complementation for Noise-resistant 3D Object Detection**|Yang Yang et.al.|[2304.09694v1](http://arxiv.org/abs/2304.09694v1)|null|\n", "2304.09609": "|**2023-04-19**|**MMDR: A Result Feature Fusion Object Detection Approach for Autonomous System**|Wendong Zhang et.al.|[2304.09609v1](http://arxiv.org/abs/2304.09609v1)|null|\n", "2304.09498": "|**2023-04-19**|**Learning Robust Visual-Semantic Embedding for Generalizable Person Re-identification**|Suncheng Xiang et.al.|[2304.09498v1](http://arxiv.org/abs/2304.09498v1)|**[link](https://github.com/jeremyxsc/mmet)**|\n", "2304.09448": "|**2023-04-19**|**EC^2: Emergent Communication for Embodied Control**|Yao Mu et.al.|[2304.09448v1](http://arxiv.org/abs/2304.09448v1)|null|\n", "2304.09421": "|**2023-04-19**|**TieFake: Title-Text Similarity and Emotion-Aware Fake News Detection**|Quanjiang Guo et.al.|[2304.09421v1](http://arxiv.org/abs/2304.09421v1)|**[link](https://github.com/uestc-gqj/tiefake)**|\n", "2304.09370": "|**2023-04-19**|**Integrating Reconfigurable Foot Design, Multi-modal Contact Sensing, and Terrain Classification for Bipedal Locomotion**|Ted Tyler et.al.|[2304.09370v1](http://arxiv.org/abs/2304.09370v1)|null|\n", "2304.09322": "|**2023-04-18**|**Multi-Modality Multi-Scale Cardiovascular Disease Subtypes Classification Using Raman Image and Medical History**|Bo Yu et.al.|[2304.09322v1](http://arxiv.org/abs/2304.09322v1)|null|\n", "2304.10530": "|**2023-04-20**|**Collaborative Diffusion for Multi-Modal Face Generation and Editing**|Ziqi Huang et.al.|[2304.10530v1](http://arxiv.org/abs/2304.10530v1)|**[link](https://github.com/ziqihuangg/collaborative-diffusion)**|\n", "2304.10309": "|**2023-04-20**|**Improving Speech Translation by Cross-Modal Multi-Grained Contrastive Learning**|Hao Zhang et.al.|[2304.10309v1](http://arxiv.org/abs/2304.10309v1)|null|\n", "2304.10254": "|**2023-04-20**|**Image-text Retrieval via preserving main Semantics of Vision**|Xu Zhang et.al.|[2304.10254v1](http://arxiv.org/abs/2304.10254v1)|**[link](https://github.com/zhangxu0963/vsl)**|\n", "2304.10091": "|**2023-04-20**|**Learning CLIP Guided Visual-Text Fusion Transformer for Video-based Pedestrian Attribute Recognition**|Jun Zhu et.al.|[2304.10091v1](http://arxiv.org/abs/2304.10091v1)|**[link](https://github.com/event-ahu/vtf_par)**|\n", "2304.09941": "|**2023-04-19**|**A robust and interpretable deep learning framework for multi-modal registration via keypoints**|Alan Q. Wang et.al.|[2304.09941v1](http://arxiv.org/abs/2304.09941v1)|**[link](https://github.com/evanmy/keymorph)**|\n", "2304.09921": "|**2023-04-19**|**Regularization for distributionally robust state estimation and prediction**|Jean-S\u00e9bastien Brouillon et.al.|[2304.09921v1](http://arxiv.org/abs/2304.09921v1)|null|\n", "2304.10382": "|**2023-04-21**|**Conditional Generative Models for Learning Stochastic Processes**|Salvatore Certo et.al.|[2304.10382v2](http://arxiv.org/abs/2304.10382v2)|null|\n", "2304.11098": "|**2023-04-21**|**Generative AI-enabled Vehicular Networks: Fundamentals, Framework, and Case Study**|Ruichen Zhang et.al.|[2304.11098v1](http://arxiv.org/abs/2304.11098v1)|null|\n", "2304.11029": "|**2023-04-24**|**CLaMP: Contrastive Language-Music Pre-training for Cross-Modal Symbolic Music Information Retrieval**|Shangda Wu et.al.|[2304.11029v2](http://arxiv.org/abs/2304.11029v2)|**[link](https://github.com/microsoft/muzic/tree/main/clamp)**|\n", "2304.10893": "|**2023-04-21**|**FindVehicle and VehicleFinder: A NER dataset for natural language-based vehicle retrieval and a keyword-based cross-modal vehicle retrieval system**|Runwei Guan et.al.|[2304.10893v1](http://arxiv.org/abs/2304.10893v1)|**[link](https://github.com/guanrunwei/vehiclefinder-ctim)**|\n", "2304.10824": "|**2023-04-21**|**Rethinking Benchmarks for Cross-modal Image-text Retrieval**|Weijing Chen et.al.|[2304.10824v1](http://arxiv.org/abs/2304.10824v1)|**[link](https://github.com/cwj1412/mscoco-flikcr30k_fg)**|\n", "2304.10759": "|**2023-04-21**|**GeoLayoutLM: Geometric Pre-training for Visual Information Extraction**|Chuwei Luo et.al.|[2304.10759v1](http://arxiv.org/abs/2304.10759v1)|**[link](https://github.com/alibabaresearch/advancedliteratemachinery)**|\n", "2304.10756": "|**2023-04-21**|**Missing Modality Robustness in Semi-Supervised Multi-Modal Semantic Segmentation**|Harsh Maheshwari et.al.|[2304.10756v1](http://arxiv.org/abs/2304.10756v1)|**[link](https://github.com/harshm121/m3l)**|\n", "2304.10740": "|**2023-04-21**|**Multi-Modal Deep Learning for Credit Rating Prediction Using Text and Numerical Data Streams**|Mahsa Tavakoli et.al.|[2304.10740v1](http://arxiv.org/abs/2304.10740v1)|**[link](https://github.com/banking-analytics-lab/multimodalfusionratings)**|\n", "2304.10727": "|**2023-04-21**|**RoCOCO: Robust Benchmark MS-COCO to Stress-test Robustness of Image-Text Matching Models**|Seulki Park et.al.|[2304.10727v1](http://arxiv.org/abs/2304.10727v1)|**[link](https://github.com/pseulki/rococo)**|\n", "2304.10658": "|**2023-04-20**|**Linear to multi-linear algebra and systems using tensors**|Divyanshu Pandey et.al.|[2304.10658v1](http://arxiv.org/abs/2304.10658v1)|null|\n", "2304.10628": "|**2023-04-20**|**HM-ViT: Hetero-modal Vehicle-to-Vehicle Cooperative perception with vision transformer**|Hao Xiang et.al.|[2304.10628v1](http://arxiv.org/abs/2304.10628v1)|null|\n", "2304.10592": "|**2023-04-20**|**MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models**|Deyao Zhu et.al.|[2304.10592v1](http://arxiv.org/abs/2304.10592v1)|**[link](https://github.com/vision-cair/minigpt-4)**|\n", "2304.12269": "|**2023-04-24**|**Enriching Source Code with Contextual Data for Code Completion Models: An Empirical Study**|Tim van Dam et.al.|[2304.12269v1](http://arxiv.org/abs/2304.12269v1)|**[link](https://github.com/aise-tudelft/contextualdatacodecompletion)**|\n", "2304.12259": "|**2023-04-24**|**Imaging 3D Chemistry at 1 nm Resolution with Fused Multi-Modal Electron Tomography**|Jonathan Schwartz et.al.|[2304.12259v1](http://arxiv.org/abs/2304.12259v1)|**[link](https://github.com/jtschwar/projection_refinement)**|\n", "2304.11993": "|**2023-04-25**|**MMC: Multi-Modal Colorization of Images using Textual Descriptions**|Subhankar Ghosh et.al.|[2304.11993v2](http://arxiv.org/abs/2304.11993v2)|null|\n", "2304.11875": "|**2023-04-24**|**Underwater object classification combining SAS and transferred optical-to-SAS Imagery**|Avi Abu et.al.|[2304.11875v1](http://arxiv.org/abs/2304.11875v1)|null|\n", "2304.11829": "|**2023-04-25**|**Hierarchical Diffusion Autoencoders and Disentangled Image Manipulation**|Zeyu Lu et.al.|[2304.11829v2](http://arxiv.org/abs/2304.11829v2)|null|\n", "2304.11764": "|**2023-04-23**|**Learning-enabled multi-modal motion prediction in urban environments**|Vinicius Trentin et.al.|[2304.11764v1](http://arxiv.org/abs/2304.11764v1)|null|\n", "2304.11697": "|**2023-04-23**|**Informative Data Selection with Uncertainty for Multi-modal Object Detection**|Xinyu Zhang et.al.|[2304.11697v1](http://arxiv.org/abs/2304.11697v1)|null|\n", "2304.11618": "|**2023-04-23**|**Modality-Aware Negative Sampling for Multi-modal Knowledge Graph Embedding**|Yichi Zhang et.al.|[2304.11618v1](http://arxiv.org/abs/2304.11618v1)|**[link](https://github.com/zjukg/mans)**|\n", "2304.11603": "|**2023-04-23**|**LaMD: Latent Motion Diffusion for Video Generation**|Yaosi Hu et.al.|[2304.11603v1](http://arxiv.org/abs/2304.11603v1)|null|\n", "2304.11193": "|**2023-04-21**|**Combining Vision and Tactile Sensation for Video Prediction**|Willow Mandil et.al.|[2304.11193v1](http://arxiv.org/abs/2304.11193v1)|null|\n", "2304.12995": "|**2023-04-25**|**AudioGPT: Understanding and Generating Speech, Music, Sound, and Talking Head**|Rongjie Huang et.al.|[2304.12995v1](http://arxiv.org/abs/2304.12995v1)|**[link](https://github.com/aigc-audio/audiogpt)**|\n", "2304.12725": "|**2023-04-25**|**Quantitative analysis of collagen remodeling in pancreatic lesions using computationally translated collagen images derived from brightfield microscopy images**|Varun Nair et.al.|[2304.12725v1](http://arxiv.org/abs/2304.12725v1)|null|\n", "2304.12570": "|**2023-04-25**|**Learnable Pillar-based Re-ranking for Image-Text Retrieval**|Leigang Qu et.al.|[2304.12570v1](http://arxiv.org/abs/2304.12570v1)|**[link](https://github.com/lgqu/leaprr)**|\n", "2304.12412": "|**2023-04-24**|**End-to-End Lidar-Camera Self-Calibration for Autonomous Vehicles**|Arya Rachman et.al.|[2304.12412v1](http://arxiv.org/abs/2304.12412v1)|null|\n", "2304.13649": "|**2023-04-26**|**A Symmetric Dual Encoding Dense Retrieval Framework for Knowledge-Intensive Visual Question Answering**|Alireza Salemi et.al.|[2304.13649v1](http://arxiv.org/abs/2304.13649v1)|**[link](https://github.com/alirezasalemi7/dedr-mm-fid)**|\n", "2304.13583": "|**2023-04-26**|**Multi-Modality Deep Network for Extreme Learned Image Compression**|Xuhao Jiang et.al.|[2304.13583v1](http://arxiv.org/abs/2304.13583v1)|null|\n", "2304.13559": "|**2023-04-28**|**Towards Multi-Modal DBMSs for Seamless Querying of Texts and Tables**|Matthias Urban et.al.|[2304.13559v2](http://arxiv.org/abs/2304.13559v2)|null|\n", "2304.13425": "|**2023-04-26**|**Learnable Ophthalmology SAM**|Zhongxi Qiu et.al.|[2304.13425v1](http://arxiv.org/abs/2304.13425v1)|**[link](https://github.com/qsingle/learnablepromptsam)**|\n", "2304.13357": "|**2023-04-26**|**Deep Lifelong Cross-modal Hashing**|Liming Xu et.al.|[2304.13357v1](http://arxiv.org/abs/2304.13357v1)|null|\n", "2304.13277": "|**2023-04-26**|**Self-Supervised Multi-Modal Sequential Recommendation**|Kunzhe Song et.al.|[2304.13277v1](http://arxiv.org/abs/2304.13277v1)|**[link](https://github.com/kz-song/mmsrec)**|\n", "2304.13273": "|**2023-04-27**|**From Association to Generation: Text-only Captioning by Unsupervised Cross-modal Mapping**|Junyang Wang et.al.|[2304.13273v2](http://arxiv.org/abs/2304.13273v2)|**[link](https://github.com/junyangwang0410/knight)**|\n", "2304.13181": "|**2023-04-25**|**Sample-Specific Debiasing for Better Image-Text Models**|Peiqi Wang et.al.|[2304.13181v1](http://arxiv.org/abs/2304.13181v1)|null|\n", "2304.13172": "|**2023-04-25**|**Generating Procedural Materials from Text or Image Prompts**|Yiwei Hu et.al.|[2304.13172v1](http://arxiv.org/abs/2304.13172v1)|null|\n", "2304.13130": "|**2023-04-25**|**Hypernymization of named entity-rich captions for grounding-based multi-modal pretraining**|Giacomo Nebbia et.al.|[2304.13130v1](http://arxiv.org/abs/2304.13130v1)|null|\n", "2304.13103": "|**2023-04-25**|**HyMo: Vulnerability Detection in Smart Contracts using a Novel Multi-Modal Hybrid Model**|Mohammad Khodadadi et.al.|[2304.13103v1](http://arxiv.org/abs/2304.13103v1)|null|\n", "2304.13097": "|**2023-04-25**|**Bridging graph data models: RDF, RDF-star, and property graphs as directed acyclic graphs**|Ewout Gelling et.al.|[2304.13097v1](http://arxiv.org/abs/2304.13097v1)|**[link](https://github.com/ewoutgelling/bridging-data-models)**|\n", "2304.14340": "|**2023-04-27**|**SparseFusion: Fusing Multi-Modal Sparse Representations for Multi-Sensor 3D Object Detection**|Yichen Xie et.al.|[2304.14340v1](http://arxiv.org/abs/2304.14340v1)|**[link](https://github.com/yichen928/sparsefusion)**|\n", "2304.14323": "|**2023-04-27**|**Pushing the Boundaries of Tractable Multiperspective Reasoning: A Deduction Calculus for Standpoint EL+**|Luc\u00eda {G\u00f3mez \u00c1lvarez} et.al.|[2304.14323v1](http://arxiv.org/abs/2304.14323v1)|**[link](https://github.com/cl-tud/standpoint-el-souffle-reasoner)**|\n", "2304.14243": "|**2023-04-27**|**Standpoint Linear Temporal Logic**|Nicola Gigante et.al.|[2304.14243v1](http://arxiv.org/abs/2304.14243v1)|null|\n", "2304.14178": "|**2023-04-27**|**mPLUG-Owl: Modularization Empowers Large Language Models with Multimodality**|Qinghao Ye et.al.|[2304.14178v1](http://arxiv.org/abs/2304.14178v1)|**[link](https://github.com/x-plug/mplug-owl)**|\n", "2304.13979": "|**2023-04-27**|**Adaptive-Mask Fusion Network for Segmentation of Drivable Road and Negative Obstacle With Untrustworthy Features**|Zhen Feng et.al.|[2304.13979v1](http://arxiv.org/abs/2304.13979v1)|**[link](https://github.com/lab-sun/amfnet)**|\n", "2304.13923": "|**2023-04-27**|**Retrieval-based Knowledge Augmented Vision Language Pre-training**|Jiahua Rao et.al.|[2304.13923v1](http://arxiv.org/abs/2304.13923v1)|null|\n", "2304.13833": "|**2023-04-26**|**Mixtures of Gaussian process experts based on kernel stick-breaking processes**|Yuji Saikai et.al.|[2304.13833v1](http://arxiv.org/abs/2304.13833v1)|**[link](https://github.com/ysaikai/gpksbp)**|\n", "2304.14204": "|**2023-04-26**|**Towards Medical Artificial General Intelligence via Knowledge-Enhanced Multimodal Pretraining**|Bingqian Lin et.al.|[2304.14204v1](http://arxiv.org/abs/2304.14204v1)|**[link](https://github.com/chenzcv7/motor)**|\n", "2304.15010": "|**2023-04-28**|**LLaMA-Adapter V2: Parameter-Efficient Visual Instruction Model**|Peng Gao et.al.|[2304.15010v1](http://arxiv.org/abs/2304.15010v1)|**[link](https://github.com/zrrskywalker/llama-adapter)**|\n", "2304.14942": "|**2023-04-28**|**The Emotions of the Crowd: Learning Image Sentiment from Tweets via Cross-modal Distillation**|Alessio Serra et.al.|[2304.14942v1](http://arxiv.org/abs/2304.14942v1)|null|\n", "2304.14936": "|**2023-04-28**|**Information Redundancy and Biases in Public Document Information Extraction Benchmarks**|Seif Laatiri et.al.|[2304.14936v1](http://arxiv.org/abs/2304.14936v1)|**[link](https://github.com/seif-lat/bias-study-funsd-sroie)**|\n", "2304.14933": "|**2023-04-28**|**An Empirical Study of Multimodal Model Merging**|Yi-Lin Sung et.al.|[2304.14933v1](http://arxiv.org/abs/2304.14933v1)|**[link](https://github.com/ylsung/vl-merging)**|\n", "2304.14880": "|**2023-04-28**|**SGAligner : 3D Scene Alignment with Scene Graphs**|Sayan Deb Sarkar et.al.|[2304.14880v1](http://arxiv.org/abs/2304.14880v1)|**[link](https://github.com/sayands/sgaligner)**|\n", "2305.00970": "|**2023-05-01**|**ArK: Augmented Reality with Knowledge Interactive Emergent Ability**|Qiuyuan Huang et.al.|[2305.00970v1](http://arxiv.org/abs/2305.00970v1)|null|\n", "2305.00769": "|**2023-05-01**|**Multi-scale Transformer-based Network for Emotion Recognition from Multi Physiological Signals**|Tu Vu et.al.|[2305.00769v1](http://arxiv.org/abs/2305.00769v1)|**[link](https://github.com/vsl-team/EPiC-2023-ACII)**|\n", "2305.00537": "|**2023-04-30**|**Interpretability of Machine Learning: Recent Advances and Future Prospects**|Lei Gao et.al.|[2305.00537v1](http://arxiv.org/abs/2305.00537v1)|null|\n", "2305.00355": "|**2023-04-29**|**MH-DETR: Video Moment and Highlight Detection with Cross-modal Transformer**|Yifang Xu et.al.|[2305.00355v1](http://arxiv.org/abs/2305.00355v1)|null|\n", "2305.00320": "|**2023-04-29**|**Fusion for Visual-Infrared Person ReID in Real-World Surveillance Using Corrupted Multimodal Data**|Arthur Josi et.al.|[2305.00320v1](http://arxiv.org/abs/2305.00320v1)|**[link](https://github.com/art2611/mreid-ucd-ccd)**|\n", "2305.00314": "|**2023-04-29**|**InfraDet3D: Multi-Modal 3D Object Detection based on Roadside Infrastructure Camera and LiDAR Sensors**|Walter Zimmer et.al.|[2305.00314v1](http://arxiv.org/abs/2305.00314v1)|null|\n", "2305.00207": "|**2023-04-29**|**Mixed-Response State-Space Model for Analyzing Multi-Dimensional Digital Phenotypes**|Tianchen Xu et.al.|[2305.00207v1](http://arxiv.org/abs/2305.00207v1)|**[link](https://github.com/zjph602xtc/MRSS)**|\n", "2305.00201": "|**2023-04-29**|**Instruction-ViT: Multi-Modal Prompts for Instruction Learning in ViT**|Zhenxiang Xiao et.al.|[2305.00201v1](http://arxiv.org/abs/2305.00201v1)|null|\n", "2305.00042": "|**2023-04-28**|**Cycle-guided Denoising Diffusion Probability Model for 3D Cross-modality MRI Synthesis**|Shaoyan Pan et.al.|[2305.00042v1](http://arxiv.org/abs/2305.00042v1)|null|\n", "2305.00976": "|**2023-05-02**|**TMR: Text-to-Motion Retrieval Using Contrastive 3D Human Motion Synthesis**|Mathis Petrovich et.al.|[2305.00976v1](http://arxiv.org/abs/2305.00976v1)|null|\n", "2305.01412": "|**2023-05-02**|**A Computational Approach for the Characterization of Airborne Pathogen Transmission in Turbulent Molecular Communication Channels**|Fatih Gulec et.al.|[2305.01412v1](http://arxiv.org/abs/2305.01412v1)|null|\n", "2305.01366": "|**2023-05-02**|**Establishing a Learning Model for Correct Hand Hygiene Technique in a NICU**|Ir\u00e9n A. Kopcs\u00f3n\u00e9 N\u00e9meth et.al.|[2305.01366v1](http://arxiv.org/abs/2305.01366v1)|null|\n", "2305.01245": "|**2023-05-02**|**MDENet: Multi-modal Dual-embedding Networks for Malware Open-set Recognition**|Jingcai Guo et.al.|[2305.01245v1](http://arxiv.org/abs/2305.01245v1)|null|\n", "2305.01233": "|**2023-05-03**|**On Uni-Modal Feature Learning in Supervised Multi-Modal Learning**|Chenzhuang Du et.al.|[2305.01233v2](http://arxiv.org/abs/2305.01233v2)|**[link](https://github.com/gewu-lab/ogm-ge_cvpr2022)**|\n", "2305.01111": "|**2023-05-01**|**Local and Global Contextual Features Fusion for Pedestrian Intention Prediction**|Mohsen Azarmi et.al.|[2305.01111v1](http://arxiv.org/abs/2305.01111v1)|null|\n", "2305.02269": "|**2023-05-03**|**M2-CTTS: End-to-End Multi-scale Multi-modal Conversational Text-to-Speech Synthesis**|Jinlong Xue et.al.|[2305.02269v1](http://arxiv.org/abs/2305.02269v1)|null|\n", "2305.01971": "|**2023-05-03**|**District-scale surface temperatures generated from high-resolution longitudinal thermal infrared images**|Subin Lin et.al.|[2305.01971v1](http://arxiv.org/abs/2305.01971v1)|**[link](https://github.com/buds-lab/project-iris-dataset)**|\n", "2305.01915": "|**2023-05-03**|**Denoising Multi-modal Sequential Recommenders with Contrastive Learning**|Dong Yao et.al.|[2305.01915v1](http://arxiv.org/abs/2305.01915v1)|null|\n", "2305.01912": "|**2023-05-03**|**MolKD: Distilling Cross-Modal Knowledge in Chemical Reactions for Molecular Property Prediction**|Liang Zeng et.al.|[2305.01912v1](http://arxiv.org/abs/2305.01912v1)|null|\n", "2305.01877": "|**2023-05-04**|**The Impacts of Dimensionality, Diffusion, and Directedness on Intrinsic Cross-Model Simulation in Tile-Based Self-Assembly**|Daniel Hader et.al.|[2305.01877v2](http://arxiv.org/abs/2305.01877v2)|null|\n", "2305.01864": "|**2023-05-05**|**Unsupervised Improvement of Audio-Text Cross-Modal Representations**|Zhepei Wang et.al.|[2305.01864v2](http://arxiv.org/abs/2305.01864v2)|**[link](https://github.com/zhepeiw/clap_curation)**|\n", "2305.01836": "|**2023-05-03**|**AV-SAM: Segment Anything Model Meets Audio-Visual Localization and Segmentation**|Shentong Mo et.al.|[2305.01836v1](http://arxiv.org/abs/2305.01836v1)|null|\n", "2305.01778": "|**2023-05-02**|**SLTUNET: A Simple Unified Model for Sign Language Translation**|Biao Zhang et.al.|[2305.01778v1](http://arxiv.org/abs/2305.01778v1)|**[link](https://github.com/bzhangGo/sltunet)**|\n", "2305.01661": "|**2023-05-02**|**SIA-FTP: A Spoken Instruction Aware Flight Trajectory Prediction Framework**|Dongyue Guo et.al.|[2305.01661v1](http://arxiv.org/abs/2305.01661v1)|null|\n", "2305.02930": "|**2023-05-04**|**Piecewise Normalizing Flows**|Harry Bevins et.al.|[2305.02930v1](http://arxiv.org/abs/2305.02930v1)|**[link](https://github.com/htjb/margarine)**|\n", "2305.02774": "|**2023-05-04**|**Spatial and Modal Optimal Transport for Fast Cross-Modal MRI Reconstruction**|Qi Wang et.al.|[2305.02774v1](http://arxiv.org/abs/2305.02774v1)|null|\n", "2305.02760": "|**2023-05-04**|**Multi-Modality Deep Network for JPEG Artifacts Reduction**|Xuhao Jiang et.al.|[2305.02760v1](http://arxiv.org/abs/2305.02760v1)|null|\n", "2305.02577": "|**2023-05-04**|**Text Reading Order in Uncontrolled Conditions by Sparse Graph Segmentation**|Renshen Wang et.al.|[2305.02577v1](http://arxiv.org/abs/2305.02577v1)|null|\n", "2305.02572": "|**2023-05-04**|**High-fidelity Generalized Emotional Talking Face Generation with Multi-modal Emotion Space Learning**|Chao Xu et.al.|[2305.02572v1](http://arxiv.org/abs/2305.02572v1)|null|\n", "2305.02504": "|**2023-05-04**|**Learning Missing Modal Electronic Health Records with Unified Multi-modal Data Embedding and Modality-Aware Attention**|Kwanhyung Lee et.al.|[2305.02504v1](http://arxiv.org/abs/2305.02504v1)|null|\n", "2305.03726": "|**2023-05-05**|**Otter: A Multi-Modal Model with In-Context Instruction Tuning**|Bo Li et.al.|[2305.03726v1](http://arxiv.org/abs/2305.03726v1)|**[link](https://github.com/luodian/otter)**|\n", "2305.03724": "|**2023-05-05**|**DualCross: Cross-Modality Cross-Domain Adaptation for Monocular BEV Perception**|Yunze Man et.al.|[2305.03724v1](http://arxiv.org/abs/2305.03724v1)|null|\n", "2305.03689": "|**2023-05-05**|**COLA: How to adapt vision-language models to Compose Objects Localized with Attributes?**|Arijit Ray et.al.|[2305.03689v1](http://arxiv.org/abs/2305.03689v1)|**[link](https://github.com/arijitray1993/COLA)**|\n", "2305.03347": "|**2023-05-05**|**A Large Cross-Modal Video Retrieval Dataset with Reading Comprehension**|Weijia Wu et.al.|[2305.03347v1](http://arxiv.org/abs/2305.03347v1)|**[link](https://github.com/callsys/textvr)**|\n", "2305.03314": "|**2023-05-05**|**Block the Label and Noise: An N-Gram Masked Speller for Chinese Spell Checking**|Haiyun Yang et.al.|[2305.03314v1](http://arxiv.org/abs/2305.03314v1)|null|\n", "2305.03277": "|**2023-05-05**|**FM-ViT: Flexible Modal Vision Transformers for Face Anti-Spoofing**|Ajian Liu et.al.|[2305.03277v1](http://arxiv.org/abs/2305.03277v1)|null|\n", "2305.03252": "|**2023-05-05**|**HeteroEdge: Addressing Asymmetry in Heterogeneous Collaborative Autonomous Systems**|Mohammad Saeid Anwar et.al.|[2305.03252v1](http://arxiv.org/abs/2305.03252v1)|null|\n", "2305.03212": "|**2023-05-04**|**LLM2Loss: Leveraging Language Models for Explainable Model Diagnostics**|Shervin Ardeshir et.al.|[2305.03212v1](http://arxiv.org/abs/2305.03212v1)|null|\n", "2305.03187": "|**2023-05-04**|**Generating Virtual On-body Accelerometer Data from Virtual Textual Descriptions for Human Activity Recognition**|Zikang Leng et.al.|[2305.03187v1](http://arxiv.org/abs/2305.03187v1)|**[link](https://github.com/ZikangLeng/IMUGPT)**|\n", "2305.03506": "|**2023-05-04**|**SI-LSTM: Speaker Hybrid Long-short Term Memory and Cross Modal Attention for Emotion Recognition in Conversation**|Xingwei Liang et.al.|[2305.03506v1](http://arxiv.org/abs/2305.03506v1)|null|\n", "2305.04824": "|**2023-05-08**|**Learning Summary-Worthy Visual Representation for Abstractive Summarization in Video**|Zenan Xu et.al.|[2305.04824v1](http://arxiv.org/abs/2305.04824v1)|null|\n", "2305.04790": "|**2023-05-09**|**MultiModal-GPT: A Vision and Language Model for Dialogue with Humans**|Tao Gong et.al.|[2305.04790v2](http://arxiv.org/abs/2305.04790v2)|**[link](https://github.com/open-mmlab/multimodal-gpt)**|\n", "2305.04685": "|**2023-05-08**|**ARDIE: AR, Dialogue, and Eye Gaze Policies for Human-Robot Collaboration**|Chelsea Zou et.al.|[2305.04685v1](http://arxiv.org/abs/2305.04685v1)|null|\n", "2305.04530": "|**2023-05-08**|**A Multi-Modal Context Reasoning Approach for Conditional Inference on Joint Textual and Visual Clues**|Yunxin Li et.al.|[2305.04530v1](http://arxiv.org/abs/2305.04530v1)|**[link](https://github.com/yunxinli/multimodal-context-reasoning)**|\n", "2305.04476": "|**2023-05-09**|**AlignSTS: Speech-to-Singing Conversion via Cross-Modal Alignment**|Ruiqi Li et.al.|[2305.04476v2](http://arxiv.org/abs/2305.04476v2)|null|\n", "2305.04474": "|**2023-05-09**|**Vision Langauge Pre-training by Contrastive Learning with Cross-Modal Similarity Regulation**|Chaoya Jiang et.al.|[2305.04474v2](http://arxiv.org/abs/2305.04474v2)|null|\n", "2305.04469": "|**2023-05-08**|**HACK: Learning a Parametric Head and Neck Model for High-fidelity Animation**|Longwen Zhang et.al.|[2305.04469v1](http://arxiv.org/abs/2305.04469v1)|**[link](https://github.com/zonelikewonderland/hack-model)**|\n", "2305.04451": "|**2023-05-08**|**FashionTex: Controllable Virtual Try-on with Text and Texture**|Anran Lin et.al.|[2305.04451v1](http://arxiv.org/abs/2305.04451v1)|**[link](https://github.com/picksh/fashiontex)**|\n", "2305.04298": "|**2023-05-07**|**Poses as Queries: Image-to-LiDAR Map Localization with Transformers**|Jinyu Miao et.al.|[2305.04298v1](http://arxiv.org/abs/2305.04298v1)|null|\n", "2305.04239": "|**2023-05-07**|**Instance-Variant Loss with Gaussian RBF Kernel for 3D Cross-modal Retriveal**|Zhitao Liu et.al.|[2305.04239v1](http://arxiv.org/abs/2305.04239v1)|null|\n", "2305.04224": "|**2023-05-07**|**Visual Causal Scene Refinement for Video Question Answering**|Yushen Wei et.al.|[2305.04224v1](http://arxiv.org/abs/2305.04224v1)|**[link](https://github.com/yangliu9208/vcsr)**|\n", "2305.04195": "|**2023-05-07**|**Cross-Modal Retrieval for Motion and Text via MildTriple Loss**|Sheng Yan et.al.|[2305.04195v1](http://arxiv.org/abs/2305.04195v1)|**[link](https://github.com/eanson023/rehamot)**|\n", "2305.04160": "|**2023-05-07**|**X-LLM: Bootstrapping Advanced Large Language Models by Treating Multi-Modalities as Foreign Languages**|Feilong Chen et.al.|[2305.04160v1](http://arxiv.org/abs/2305.04160v1)|null|\n", "2305.04156": "|**2023-05-07**|**SynthMix: Mixing up Aligned Synthesis for Medical Cross-Modality Domain Adaptation**|Xinwen Zhang et.al.|[2305.04156v1](http://arxiv.org/abs/2305.04156v1)|null|\n", "2305.04072": "|**2023-05-06**|**Keyword-Based Diverse Image Retrieval by Semantics-aware Contrastive Learning and Transformer**|Minyi Zhao et.al.|[2305.04072v1](http://arxiv.org/abs/2305.04072v1)|null|\n", "2305.05665": "|**2023-05-09**|**ImageBind: One Embedding Space To Bind Them All**|Rohit Girdhar et.al.|[2305.05665v1](http://arxiv.org/abs/2305.05665v1)|**[link](https://github.com/facebookresearch/imagebind)**|\n", "2305.05662": "|**2023-05-11**|**InternGPT: Solving Vision-Centric Tasks by Interacting with ChatGPT Beyond Language**|Zhaoyang Liu et.al.|[2305.05662v3](http://arxiv.org/abs/2305.05662v3)|**[link](https://github.com/opengvlab/interngpt)**|\n", "2305.05534": "|**2023-05-09**|**Integrating Holistic and Local Information to Estimate Emotional Reaction Intensity**|Yini Fang et.al.|[2305.05534v1](http://arxiv.org/abs/2305.05534v1)|**[link](https://github.com/hkust-nisl/abaw5)**|\n", "2305.05496": "|**2023-05-09**|**Exploiting Pseudo Image Captions for Multimodal Summarization**|Chaoya Jiang et.al.|[2305.05496v1](http://arxiv.org/abs/2305.05496v1)|**[link](https://github.com/sitaproject/sita)**|\n", "2305.05260": "|**2023-05-09**|**Guided Focal Stack Refinement Network for Light Field Salient Object Detection**|Bo Yuan et.al.|[2305.05260v1](http://arxiv.org/abs/2305.05260v1)|null|\n", "2305.05189": "|**2023-05-09**|**SUR-adapter: Enhancing Text-to-Image Pre-trained Diffusion Models with Large Language Models**|Shanshan Zhong et.al.|[2305.05189v1](http://arxiv.org/abs/2305.05189v1)|**[link](https://github.com/Qrange-group/SUR-adapter)**|\n", "2305.05166": "|**2023-05-10**|**E2TIMT: Efficient and Effective Modal Adapter for Text Image Machine Translation**|Cong Ma et.al.|[2305.05166v2](http://arxiv.org/abs/2305.05166v2)|**[link](https://github.com/ericongma/e2timt)**|\n", "2305.05126": "|**2023-05-09**|**Comparing Foundation Models using Data Kernels**|Brandon Duderstadt et.al.|[2305.05126v1](http://arxiv.org/abs/2305.05126v1)|null|\n", "2305.04961": "|**2023-05-08**|**Joint Moment Retrieval and Highlight Detection Via Natural Language Queries**|Richard Luo et.al.|[2305.04961v1](http://arxiv.org/abs/2305.04961v1)|**[link](https://github.com/skyline-9/visionary-vids)**|\n", "2305.06292": "|**2023-05-10**|**Joint Metrics Matter: A Better Standard for Trajectory Forecasting**|Erica Weng et.al.|[2305.06292v1](http://arxiv.org/abs/2305.06292v1)|**[link](https://github.com/ericaweng/joint-metrics-matter)**|\n", "2305.06278": "|**2023-05-10**|**A Multi-modal Garden Dataset and Hybrid 3D Dense Reconstruction Framework Based on Panoramic Stereo Images for a Trimming Robot**|Can Pu et.al.|[2305.06278v1](http://arxiv.org/abs/2305.06278v1)|**[link](https://github.com/canpu999/trimbot-wageningen-slam-dataset)**|\n", "2305.06225": "|**2023-05-10**|**DaGAN++: Depth-Aware Generative Adversarial Network for Talking Head Video Generation**|Fa-Ting Hong et.al.|[2305.06225v1](http://arxiv.org/abs/2305.06225v1)|**[link](https://github.com/harlanhong/cvpr2022-dagan)**|\n", "2305.06221": "|**2023-05-10**|**Multi-Prompt with Depth Partitioned Cross-Modal Learning**|Yiqi Wang et.al.|[2305.06221v1](http://arxiv.org/abs/2305.06221v1)|**[link](https://github.com/wangyiqi/pmpo)**|\n", "2305.06203": "|**2023-05-10**|**Multiclass MRI Brain Tumor Segmentation using 3D Attention-based U-Net**|Maryann M. Gitonga et.al.|[2305.06203v1](http://arxiv.org/abs/2305.06203v1)|null|\n", "2305.06179": "|**2023-05-11**|**A Multi-modal Approach to Single-modal Visual Place Classification**|Tomoya Iwasaki et.al.|[2305.06179v2](http://arxiv.org/abs/2305.06179v2)|null|\n", "2305.05992": "|**2023-05-10**|**MMoT: Mixture-of-Modality-Tokens Transformer for Composed Multimodal Conditional Image Synthesis**|Jianbin Zheng et.al.|[2305.05992v1](http://arxiv.org/abs/2305.05992v1)|null|\n", "2305.05880": "|**2023-05-10**|**ChinaOpen: A Dataset for Open-world Multimodal Learning**|Aozhu Chen et.al.|[2305.05880v1](http://arxiv.org/abs/2305.05880v1)|**[link](https://github.com/dong03/GenerativeVideo2Text)**|\n", "2305.06978": "|**2023-05-11**|**Meta-hallucinator: Towards Few-Shot Cross-Modality Cardiac Image Segmentation**|Ziyuan Zhao et.al.|[2305.06978v1](http://arxiv.org/abs/2305.06978v1)|null|\n", "2305.06923": "|**2023-05-11**|**EAML: Ensemble Self-Attention-based Mutual Learning Network for Document Image Classification**|Souhail Bakkali et.al.|[2305.06923v1](http://arxiv.org/abs/2305.06923v1)|null|\n", "2305.06794": "|**2023-05-11**|**Multi-modal Multi-level Fusion for 3D Single Object Tracking**|Zhiheng Li et.al.|[2305.06794v1](http://arxiv.org/abs/2305.06794v1)|null|\n", "2305.06720": "|**2023-05-11**|**Bi-level Dynamic Learning for Jointly Multi-modality Image Fusion and Beyond**|Zhu Liu et.al.|[2305.06720v1](http://arxiv.org/abs/2305.06720v1)|**[link](https://github.com/LiuZhu-CV/BDLFusion)**|\n", "2305.06472": "|**2023-05-12**|**ChatGPT-Like Large-Scale Foundation Models for Prognostics and Health Management: A Survey and Roadmaps**|Yan-Fu Li et.al.|[2305.06472v2](http://arxiv.org/abs/2305.06472v2)|null|\n", "2305.06407": "|**2023-05-10**|**Combo of Thinking and Observing for Outside-Knowledge VQA**|Qingyi Si et.al.|[2305.06407v1](http://arxiv.org/abs/2305.06407v1)|**[link](https://github.com/phoebussi/thinking-while-observing)**|\n", "2305.06386": "|**2023-05-10**|**Text-To-Concept (and Back) via Cross-Model Alignment**|Mazda Moayeri et.al.|[2305.06386v1](http://arxiv.org/abs/2305.06386v1)|null|\n", "2305.07358": "|**2023-05-12**|**Towards Versatile and Efficient Visual Knowledge Injection into Pre-trained Language Models with Cross-Modal Adapters**|Xinyun Zhang et.al.|[2305.07358v1](http://arxiv.org/abs/2305.07358v1)|null|\n", "2305.07334": "|**2023-05-12**|**Locking and Quacking: Stacking Bayesian model predictions by log-pooling and superposition**|Yuling Yao et.al.|[2305.07334v1](http://arxiv.org/abs/2305.07334v1)|null|\n", "2305.07216": "|**2023-05-12**|**Versatile Audio-Visual Learning for Handling Single and Multi Modalities in Emotion Regression and Classification Tasks**|Lucas Goncalves et.al.|[2305.07216v1](http://arxiv.org/abs/2305.07216v1)|**[link](https://github.com/ilucasgoncalves/vavl)**|\n", "2305.07214": "|**2023-05-12**|**MMG-Ego4D: Multi-Modal Generalization in Egocentric Action Recognition**|Xinyu Gong et.al.|[2305.07214v1](http://arxiv.org/abs/2305.07214v1)|null|\n", "2305.07437": "|**2023-05-15**|**Continual Vision-Language Representation Learning with Off-Diagonal Information**|Zixuan Ni et.al.|[2305.07437v2](http://arxiv.org/abs/2305.07437v2)|null|\n", "2305.08706": "|**2023-05-15**|**Understanding and Bridging the Modality Gap for Speech Translation**|Qingkai Fang et.al.|[2305.08706v1](http://arxiv.org/abs/2305.08706v1)|**[link](https://github.com/ictnlp/cress)**|\n", "2305.08698": "|**2023-05-15**|**Continual Multimodal Knowledge Graph Construction**|Xiang Chen et.al.|[2305.08698v1](http://arxiv.org/abs/2305.08698v1)|**[link](https://github.com/zjunlp/ContinueMKGC)**|\n", "2305.08685": "|**2023-05-15**|**CLIP-VG: Self-paced Curriculum Adapting of CLIP via Exploiting Pseudo-Language Labels for Visual Grounding**|Linhui Xiao et.al.|[2305.08685v1](http://arxiv.org/abs/2305.08685v1)|**[link](https://github.com/linhuixiao/clip-vg)**|\n", "2305.08532": "|**2023-05-15**|**Benchmarking UWB-Based Infrastructure-Free Positioning and Multi-Robot Relative Localization: Dataset and Characterization**|Paola Torrico Mor\u00f3n et.al.|[2305.08532v1](http://arxiv.org/abs/2305.08532v1)|null|\n", "2305.08522": "|**2023-05-15**|**Cross-Modality Time-Variant Relation Learning for Generating Dynamic Scene Graphs**|Jingyi Wang et.al.|[2305.08522v1](http://arxiv.org/abs/2305.08522v1)|**[link](https://github.com/qncsn2016/TR2)**|\n", "2305.08386": "|**2023-05-15**|**PLIP: Language-Image Pre-training for Person Representation Learning**|Jialong Zuo et.al.|[2305.08386v1](http://arxiv.org/abs/2305.08386v1)|**[link](https://github.com/zplusdragon/plip)**|\n", "2305.08381": "|**2023-05-15**|**Mode Approximation Makes Good Vision-Language Prompts**|Haixin Wang et.al.|[2305.08381v1](http://arxiv.org/abs/2305.08381v1)|**[link](https://github.com/willdreamer/aurora)**|\n", "2305.08372": "|**2023-05-15**|**A Novel Framework for Multimodal Named Entity Recognition with Multi-level Alignments**|Peipei Liu et.al.|[2305.08372v1](http://arxiv.org/abs/2305.08372v1)|null|\n", "2305.08252": "|**2023-05-14**|**Parameter-Efficient Fine-Tuning for Medical Image Analysis: The Missed Opportunity**|Raman Dutt et.al.|[2305.08252v1](http://arxiv.org/abs/2305.08252v1)|null|\n", "2305.08120": "|**2023-05-14**|**Unraveling Cold Start Enigmas in Predictive Analytics for OTT Media: Synergistic Meta-Insights and Multimodal Ensemble Mastery**|K. Ganguly et.al.|[2305.08120v1](http://arxiv.org/abs/2305.08120v1)|null|\n", "2305.07927": "|**2023-05-13**|**RC3: Regularized Contrastive Cross-lingual Cross-modal Pre-training**|Chulun Zhou et.al.|[2305.07927v1](http://arxiv.org/abs/2305.07927v1)|null|\n", "2305.07920": "|**2023-05-13**|**Multi-task Paired Masking with Alignment Modeling for Medical Vision-Language Pre-training**|Ke Zhang et.al.|[2305.07920v1](http://arxiv.org/abs/2305.07920v1)|null|\n", "2305.07910": "|**2023-05-13**|**Mask to reconstruct: Cooperative Semantics Completion for Video-text Retrieval**|Han Fang et.al.|[2305.07910v1](http://arxiv.org/abs/2305.07910v1)|null|\n", "2305.07825": "|**2023-05-13**|**Student Classroom Behavior Detection based on YOLOv7-BRA and Multi-Model Fusion**|Fan Yang et.al.|[2305.07825v1](http://arxiv.org/abs/2305.07825v1)|**[link](https://github.com/whiffe/scb-dataset)**|\n", "2305.07792": "|**2023-05-12**|**Contextuality in multi-agent paradoxes**|Sidiney B. Montanhano et.al.|[2305.07792v1](http://arxiv.org/abs/2305.07792v1)|null|\n", "2305.09641": "|**2023-05-16**|**FitMe: Deep Photorealistic 3D Morphable Model Avatars**|Alexandros Lattas et.al.|[2305.09641v1](http://arxiv.org/abs/2305.09641v1)|null|\n", "2305.09600": "|**2023-05-16**|**Deep Reinforcement Learning to Maximize Arterial Usage during Extreme Congestion**|Ashutosh Dutta et.al.|[2305.09600v1](http://arxiv.org/abs/2305.09600v1)|null|\n", "2305.09333": "|**2023-05-16**|**Multi-modal Visual Understanding with Prompts for Semantic Information Disentanglement of Image**|Yuzhou Peng et.al.|[2305.09333v1](http://arxiv.org/abs/2305.09333v1)|null|\n", "2305.09272": "|**2023-05-16**|**Age of Incorrect Information in Semantic Communications for NOMA Aided XR Applications**|Jianrui Chen et.al.|[2305.09272v1](http://arxiv.org/abs/2305.09272v1)|null|\n", "2305.09255": "|**2023-05-16**|**Trust-Worthy Semantic Communications for the Metaverse Relying on Federated Learning**|Jianrui Chen et.al.|[2305.09255v1](http://arxiv.org/abs/2305.09255v1)|null|\n", "2305.09212": "|**2023-05-16**|**Cross-Modal Global Interaction and Local Alignment for Audio-Visual Speech Recognition**|Yuchen Hu et.al.|[2305.09212v1](http://arxiv.org/abs/2305.09212v1)|**[link](https://github.com/yuchen005/gila)**|\n", "2305.09011": "|**2023-05-18**|**The Brain Tumor Segmentation (BraTS) Challenge 2023: Brain MR Image Synthesis for Tumor Segmentation (BraSyn)**|Hongwei Bran Li et.al.|[2305.09011v2](http://arxiv.org/abs/2305.09011v2)|null|\n", "2305.10420": "|**2023-05-17**|**CLIP-GCD: Simple Language Guided Generalized Category Discovery**|Rabah Ouldnoughi et.al.|[2305.10420v1](http://arxiv.org/abs/2305.10420v1)|null|\n", "2305.10046": "|**2023-05-17**|**Probing the Role of Positional Information in Vision-Language Models**|Philipp J. R\u00f6sch et.al.|[2305.10046v1](http://arxiv.org/abs/2305.10046v1)|null|\n", "2305.09946": "|**2023-05-17**|**DeepMSS: Deep Multi-Modality Segmentation-to-Survival Learning for Survival Outcome Prediction from PET/CT Images**|Mingyuan Meng et.al.|[2305.09946v1](http://arxiv.org/abs/2305.09946v1)|**[link](https://github.com/mungomeng/survival-deepmss)**|\n", "2305.11176": "|**2023-05-18**|**Instruct2Act: Mapping Multi-modality Instructions to Robotic Actions with Large Language Model**|Siyuan Huang et.al.|[2305.11176v1](http://arxiv.org/abs/2305.11176v1)|**[link](https://github.com/opengvlab/instruct2act)**|\n", "2305.11172": "|**2023-05-18**|**ONE-PEACE: Exploring One General Representation Model Toward Unlimited Modalities**|Peng Wang et.al.|[2305.11172v1](http://arxiv.org/abs/2305.11172v1)|**[link](https://github.com/OFA-Sys/ONE-PEACE)**|\n", "2305.11101": "|**2023-05-18**|**XFormer: Fast and Accurate Monocular 3D Body Capture**|Lihui Qian et.al.|[2305.11101v1](http://arxiv.org/abs/2305.11101v1)|null|\n", "2305.11096": "|**2023-05-22**|**Cross-modality Data Augmentation for End-to-End Sign Language Translation**|Jinhui Ye et.al.|[2305.11096v2](http://arxiv.org/abs/2305.11096v2)|**[link](https://github.com/atrewin/signxmda)**|\n", "2305.11012": "|**2023-05-18**|**SDC-UDA: Volumetric Unsupervised Domain Adaptation Framework for Slice-Direction Continuous Cross-Modality Medical Image Segmentation**|Hyungseob Shin et.al.|[2305.11012v1](http://arxiv.org/abs/2305.11012v1)|null|\n", "2305.11000": "|**2023-05-19**|**SpeechGPT: Empowering Large Language Models with Intrinsic Cross-Modal Conversational Abilities**|Dong Zhang et.al.|[2305.11000v2](http://arxiv.org/abs/2305.11000v2)|**[link](https://github.com/0nutation/speechgpt)**|\n", "2305.10920": "|**2023-05-18**|**Emergent Communication with Attention**|Ryokan Ri et.al.|[2305.10920v1](http://arxiv.org/abs/2305.10920v1)|null|\n", "2305.10838": "|**2023-05-18**|**ProgSG: Cross-Modality Representation Learning for Programs in Electronic Design Automation**|Yunsheng Bai et.al.|[2305.10838v1](http://arxiv.org/abs/2305.10838v1)|null|\n", "2305.10783": "|**2023-05-18**|**Transforming Human-Centered AI Collaboration: Redefining Embodied Agents Capabilities through Interactive Grounded Language Instructions**|Shrestha Mohanty et.al.|[2305.10783v1](http://arxiv.org/abs/2305.10783v1)|**[link](https://github.com/iglu-contest/nlp-baselines-2022)**|\n", "2305.10773": "|**2023-05-18**|**Rate-Adaptive Coding Mechanism for Semantic Communications With Multi-Modal Data**|Yangshuo He et.al.|[2305.10773v1](http://arxiv.org/abs/2305.10773v1)|null|\n", "2305.10764": "|**2023-05-18**|**OpenShape: Scaling Up 3D Shape Representation Towards Open-World Understanding**|Minghua Liu et.al.|[2305.10764v1](http://arxiv.org/abs/2305.10764v1)|null|\n", "2305.10763": "|**2023-05-18**|**CLAPSpeech: Learning Prosody from Text Context with Contrastive Language-Audio Pre-training**|Zhenhui Ye et.al.|[2305.10763v1](http://arxiv.org/abs/2305.10763v1)|null|\n", "2305.10724": "|**2023-05-18**|**Segment Any Anomaly without Training via Hybrid Prompt Regularization**|Yunkang Cao et.al.|[2305.10724v1](http://arxiv.org/abs/2305.10724v1)|**[link](https://github.com/caoyunkang/segment-any-anomaly)**|\n", "2305.10547": "|**2023-05-17**|**Rethinking Multimodal Content Moderation from an Asymmetric Angle with Mixed-modality**|Jialin Yuan et.al.|[2305.10547v1](http://arxiv.org/abs/2305.10547v1)|null|\n", "2305.10512": "|**2023-05-17**|**IMAD: IMage-Augmented multi-modal Dialogue**|Moskvoretskii Viktor et.al.|[2305.10512v1](http://arxiv.org/abs/2305.10512v1)|**[link](https://github.com/vityavitalich/imad)**|\n", "2305.11832": "|**2023-05-19**|**Improving Multimodal Joint Variational Autoencoders through Normalizing Flows and Correlation Analysis**|Agathe Senellart et.al.|[2305.11832v1](http://arxiv.org/abs/2305.11832v1)|null|\n", "2305.11818": "|**2023-05-19**|**MaGIC: Multi-modality Guided Image Completion**|Yongsheng Yu et.al.|[2305.11818v1](http://arxiv.org/abs/2305.11818v1)|null|\n", "2305.11719": "|**2023-05-19**|**Information Screening whilst Exploiting! Multimodal Relation Extraction with Feature Denoising and Multimodal Topic Modeling**|Shengqiong Wu et.al.|[2305.11719v1](http://arxiv.org/abs/2305.11719v1)|**[link](https://github.com/chocowu/mre-ise)**|\n", "2305.11579": "|**2023-05-19**|**Speech-Text Dialog Pre-training for Spoken Dialog Understanding with Explicit Cross-Modal Alignment**|Tianshu Yu et.al.|[2305.11579v1](http://arxiv.org/abs/2305.11579v1)|**[link](https://github.com/alibabaresearch/damo-convai)**|\n", "2305.11503": "|**2023-05-19**|**A Topic-aware Summarization Framework with Different Modal Side Information**|Xiuying Chen et.al.|[2305.11503v1](http://arxiv.org/abs/2305.11503v1)|null|\n", "2305.11481": "|**2023-05-22**|**CM-MaskSD: Cross-Modality Masked Self-Distillation for Referring Image Segmentation**|Wenxuan Wang et.al.|[2305.11481v2](http://arxiv.org/abs/2305.11481v2)|null|\n", "2305.11443": "|**2023-05-19**|**Equivariant Multi-Modality Image Fusion**|Zixiang Zhao et.al.|[2305.11443v1](http://arxiv.org/abs/2305.11443v1)|null|\n", "2305.11439": "|**2023-05-19**|**Few-Shot Learning with Visual Distribution Calibration and Cross-Modal Distribution Alignment**|Runqi Wang et.al.|[2305.11439v1](http://arxiv.org/abs/2305.11439v1)|**[link](https://github.com/bhrqw/sada)**|\n", "2305.11392": "|**2023-05-19**|**Fast-StrucTexT: An Efficient Hourglass Transformer with Modality-guided Dynamic Token Merge for Document Understanding**|Mingliang Zhai et.al.|[2305.11392v1](http://arxiv.org/abs/2305.11392v1)|null|\n", "2305.11349": "|**2023-05-18**|**Unsupervised Domain-agnostic Fake News Detection using Multi-modal Weak Signals**|Amila Silva et.al.|[2305.11349v1](http://arxiv.org/abs/2305.11349v1)|null|\n", "2305.11327": "|**2023-05-18**|**MALM: Mask Augmentation based Local Matching for Food-Recipe Retrieval**|Bhanu Prakash Voutharoja et.al.|[2305.11327v1](http://arxiv.org/abs/2305.11327v1)|**[link](https://github.com/myfoodchoice/malm_mask_augmentation_based_local_matching-_for-_food_recipe_retrieval)**|\n", "2305.13220": "|**2023-05-22**|**Fast Monocular Scene Reconstruction with Global-Sparse Local-Dense Grids**|Wei Dong et.al.|[2305.13220v1](http://arxiv.org/abs/2305.13220v1)|null|\n", "2305.12953": "|**2023-05-22**|**Enhancing Next Active Object-based Egocentric Action Anticipation with Guided Attention**|Sanket Thakur et.al.|[2305.12953v1](http://arxiv.org/abs/2305.12953v1)|**[link](https://github.com/sanketsans/ganov2)**|\n", "2305.12903": "|**2023-05-22**|**DiffAVA: Personalized Text-to-Audio Generation with Visual Alignment**|Shentong Mo et.al.|[2305.12903v1](http://arxiv.org/abs/2305.12903v1)|null|\n", "2305.12878": "|**2023-05-22**|**Non-Autoregressive Document-Level Machine Translation (NA-DMT): Exploring Effective Approaches, Challenges, and Opportunities**|Guangsheng Bao et.al.|[2305.12878v1](http://arxiv.org/abs/2305.12878v1)|**[link](https://github.com/baoguangsheng/nat-on-doc)**|\n", "2305.12807": "|**2023-05-22**|**Multi-task Combinatorial Optimization: Adaptive Multi-modality Knowledge Transfer by an Explicit Inter-task Distance**|Peng Li et.al.|[2305.12807v1](http://arxiv.org/abs/2305.12807v1)|null|\n", "2305.12793": "|**2023-05-22**|**Zero-Shot End-to-End Spoken Language Understanding via Cross-Modal Selective Self-Training**|Jianfeng He et.al.|[2305.12793v1](http://arxiv.org/abs/2305.12793v1)|null|\n", "2305.12711": "|**2023-05-22**|**Unsupervised Visible-Infrared Person ReID by Collaborative Learning with Neighbor-Guided Label Refinement**|De Cheng et.al.|[2305.12711v1](http://arxiv.org/abs/2305.12711v1)|null|\n", "2305.12703": "|**2023-05-22**|**Progressive Sub-Graph Clustering Algorithm for Semi-Supervised Domain Adaptation Speaker Verification**|Zhuo Li et.al.|[2305.12703v1](http://arxiv.org/abs/2305.12703v1)|null|\n", "2305.12673": "|**2023-05-22**|**Efficient Bilateral Cross-Modality Cluster Matching for Unsupervised Visible-Infrared Person ReID**|De cheng et.al.|[2305.12673v1](http://arxiv.org/abs/2305.12673v1)|null|\n", "2305.12530": "|**2023-05-21**|**Towards Robust Family-Infant Audio Analysis Based on Unsupervised Pretraining of Wav2vec 2.0 on Large-Scale Unlabeled Family Audio**|Jialu Li et.al.|[2305.12530v1](http://arxiv.org/abs/2305.12530v1)|null|\n", "2305.12452": "|**2023-05-21**|**Advancing Referring Expression Segmentation Beyond Single Image**|Yixuan Wu et.al.|[2305.12452v1](http://arxiv.org/abs/2305.12452v1)|null|\n", "2305.12369": "|**2023-05-21**|**HIINT: Historical, Intra- and Inter- personal Dynamics Modeling with Cross-person Memory Transformer**|Yubin Kim et.al.|[2305.12369v1](http://arxiv.org/abs/2305.12369v1)|null|\n", "2305.12260": "|**2023-05-20**|**Cross2StrA: Unpaired Cross-lingual Image Captioning with Cross-lingual Cross-modal Structure-pivoted Alignment**|Shengqiong Wu et.al.|[2305.12260v1](http://arxiv.org/abs/2305.12260v1)|null|\n", "2305.12218": "|**2023-05-20**|**Text-Video Retrieval with Disentangled Conceptualization and Set-to-Set Alignment**|Peng Jin et.al.|[2305.12218v1](http://arxiv.org/abs/2305.12218v1)|**[link](https://github.com/jpthu17/dicosa)**|\n", "2305.12011": "|**2023-05-19**|**Boosting Crop Classification by Hierarchically Fusing Satellite, Rotational, and Contextual Data**|Barriere Valentin et.al.|[2305.12011v1](http://arxiv.org/abs/2305.12011v1)|null|\n", "2305.14312": "|**2023-05-23**|**Text-guided 3D Human Generation from 2D Collections**|Tsu-Jui Fu et.al.|[2305.14312v1](http://arxiv.org/abs/2305.14312v1)|null|\n", "2305.14167": "|**2023-05-24**|**DetGPT: Detect What You Need via Reasoning**|Renjie Pi et.al.|[2305.14167v2](http://arxiv.org/abs/2305.14167v2)|null|\n", "2305.14042": "|**2023-05-23**|**Improving speech translation by fusing speech and text**|Wenbiao Yin et.al.|[2305.14042v1](http://arxiv.org/abs/2305.14042v1)|null|\n", "2305.14017": "|**2023-05-23**|**Faster Video Moment Retrieval with Point-Level Supervision**|Xun Jiang et.al.|[2305.14017v1](http://arxiv.org/abs/2305.14017v1)|null|\n", "2305.14014": "|**2023-05-23**|**CLIP4STR: A Simple Baseline for Scene Text Recognition with Pre-trained Vision-Language Model**|Shuai Zhao et.al.|[2305.14014v1](http://arxiv.org/abs/2305.14014v1)|null|\n", "2305.13986": "|**2023-05-23**|**A Multi-Modal Network Equilibrium Model with Interacting Mobility Service Providers'Strategies**|Claudia Bandiera et.al.|[2305.13986v1](http://arxiv.org/abs/2305.13986v1)|null|\n", "2305.13705": "|**2023-05-23**|**DiffHand: End-to-End Hand Mesh Reconstruction via Diffusion Models**|Lijun Li et.al.|[2305.13705v1](http://arxiv.org/abs/2305.13705v1)|null|\n", "2305.13697": "|**2023-05-23**|**UNIMO-3: Multi-granularity Interaction for Vision-Language Representation Learning**|Hao Yang et.al.|[2305.13697v1](http://arxiv.org/abs/2305.13697v1)|null|\n", "2305.13667": "|**2023-05-23**|**Optimizing Non-Autoregressive Transformers with Contrastive Learning**|Chenxin An et.al.|[2305.13667v1](http://arxiv.org/abs/2305.13667v1)|null|\n", "2305.13659": "|**2023-05-23**|**Flare-Aware Cross-modal Enhancement Network for Multi-spectral Vehicle Re-identification**|Aihua Zheng et.al.|[2305.13659v1](http://arxiv.org/abs/2305.13659v1)|**[link](https://github.com/Mzq12138/Official-Implementation-for-Flare-Aware-Cross-modal-Enhancement-for-Multi-spectral-Vehicle-ReID)**|\n", "2305.13653": "|**2023-05-23**|**RaSa: Relation and Sensitivity Aware Representation Learning for Text-based Person Search**|Yang Bai et.al.|[2305.13653v1](http://arxiv.org/abs/2305.13653v1)|**[link](https://github.com/flame-chasers/rasa)**|\n", "2305.13631": "|**2023-05-23**|**EDIS: Entity-Driven Image Search over Multimodal Web Content**|Siqi Liu et.al.|[2305.13631v1](http://arxiv.org/abs/2305.13631v1)|**[link](https://github.com/emerisly/edis)**|\n", "2305.13503": "|**2023-05-22**|**Asynchronous Multi-Model Federated Learning over Wireless Networks: Theory, Modeling, and Optimization**|Zhan-Lun Chang et.al.|[2305.13503v1](http://arxiv.org/abs/2305.13503v1)|null|\n", "2305.15403": "|**2023-05-24**|**AV-TranSpeech: Audio-Visual Robust Speech-to-Speech Translation**|Rongjie Huang et.al.|[2305.15403v1](http://arxiv.org/abs/2305.15403v1)|null|\n", "2305.15302": "|**2023-05-24**|**Multi-Modal Mutual Attention and Iterative Interaction for Referring Image Segmentation**|Chang Liu et.al.|[2305.15302v1](http://arxiv.org/abs/2305.15302v1)|null|\n", "2305.15296": "|**2023-05-24**|**MultiFusion: Fusing Pre-Trained Models for Multi-Lingual, Multi-Modal Image Generation**|Marco Bellagente et.al.|[2305.15296v1](http://arxiv.org/abs/2305.15296v1)|null|\n", "2305.15218": "|**2023-05-24**|**Multi-modal Machine Learning for Vehicle Rating Predictions Using Image, Text, and Parametric Data**|Hanqi Su et.al.|[2305.15218v1](http://arxiv.org/abs/2305.15218v1)|null|\n", "2305.15217": "|**2023-05-24**|**L-CAD: Language-based Colorization with Any-level Descriptions**|Zheng Chang et.al.|[2305.15217v1](http://arxiv.org/abs/2305.15217v1)|null|\n", "2305.15159": "|**2023-05-24**|**Collaborative Recommendation Model Based on Multi-modal Multi-view Attention Network: Movie and literature cases**|Zheng Hu et.al.|[2305.15159v1](http://arxiv.org/abs/2305.15159v1)|null|\n", "2305.15094": "|**2023-05-24**|**InpaintNeRF360: Text-Guided 3D Inpainting on Unbounded Neural Radiance Fields**|Dongqing Wang et.al.|[2305.15094v1](http://arxiv.org/abs/2305.15094v1)|null|\n", "2305.15033": "|**2023-05-24**|**SmartTrim: Adaptive Tokens and Parameters Pruning for Efficient Vision-Language Models**|Zekun Wang et.al.|[2305.15033v1](http://arxiv.org/abs/2305.15033v1)|null|\n", "2305.15023": "|**2023-05-24**|**Cheap and Quick: Efficient Vision-Language Instruction Tuning for Large Language Models**|Gen Luo et.al.|[2305.15023v1](http://arxiv.org/abs/2305.15023v1)|null|\n", "2305.15021": "|**2023-05-24**|**EmbodiedGPT: Vision-Language Pre-Training via Embodied Chain of Thought**|Yao Mu et.al.|[2305.15021v1](http://arxiv.org/abs/2305.15021v1)|**[link](https://github.com/EmbodiedGPT/EmbodiedGPT_Pytorch)**|\n", "2305.14969": "|**2023-05-24**|**MMNet: Multi-Mask Network for Referring Image Segmentation**|Yichen Yan et.al.|[2305.14969v1](http://arxiv.org/abs/2305.14969v1)|null|\n", "2305.14914": "|**2023-05-24**|**GAMUS: A Geometry-aware Multi-modal Semantic Segmentation Benchmark for Remote Sensing Data**|Zhitong Xiong et.al.|[2305.14914v1](http://arxiv.org/abs/2305.14914v1)|**[link](https://github.com/earthnets/rsi-mmsegmentation)**|\n", "2305.14897": "|**2023-05-24**|**Text encoders are performance bottlenecks in contrastive vision-language models**|Amita Kamath et.al.|[2305.14897v1](http://arxiv.org/abs/2305.14897v1)|**[link](https://github.com/amitakamath/vl_text_encoders_are_bottlenecks)**|\n", "2305.14843": "|**2023-05-24**|**Meta-Learning For Vision-and-Language Cross-lingual Transfer**|Hanxu Hu et.al.|[2305.14843v1](http://arxiv.org/abs/2305.14843v1)|null|\n", "2305.14839": "|**2023-05-24**|**PaCE: Unified Multi-modal Dialogue Pre-training with Progressive and Compositional Experts**|Yunshui Li et.al.|[2305.14839v1](http://arxiv.org/abs/2305.14839v1)|**[link](https://github.com/AlibabaResearch/DAMO-ConvAI/tree/main/pace)**|\n", "2305.16318": "|**2023-05-25**|**Referred by Multi-Modality: A Unified Temporal Transformer for Video Object Segmentation**|Shilin Yan et.al.|[2305.16318v1](http://arxiv.org/abs/2305.16318v1)|**[link](https://github.com/opengvlab/mutr)**|\n", "2305.16304": "|**2023-05-25**|**Candidate Set Re-ranking for Composed Image Retrieval with Dual Multi-modal Encoder**|Zheyuan Liu et.al.|[2305.16304v1](http://arxiv.org/abs/2305.16304v1)|null|\n", "2305.16166": "|**2023-05-25**|**Multimodal Relation Extraction with Cross-Modal Retrieval and Synthesis**|Xuming Hu et.al.|[2305.16166v1](http://arxiv.org/abs/2305.16166v1)|null|\n", "2305.16107": "|**2023-05-25**|**VioLA: Unified Codec Language Models for Speech Recognition, Synthesis, and Translation**|Tianrui Wang et.al.|[2305.16107v1](http://arxiv.org/abs/2305.16107v1)|null|\n", "2305.15957": "|**2023-05-25**|**DiffCLIP: Leveraging Stable Diffusion for Language Grounded 3D Classification**|Sitian Shen et.al.|[2305.15957v1](http://arxiv.org/abs/2305.15957v1)|null|\n", "2305.15920": "|**2023-05-25**|**Learning and accurate generation of stochastic dynamics based on multi-model Generative Adversarial Networks**|Daniele Lanzoni et.al.|[2305.15920v1](http://arxiv.org/abs/2305.15920v1)|null|\n", "2305.15913": "|**2023-05-27**|**MEMEX: Detecting Explanatory Evidence for Memes via Knowledge-Enriched Contextualization**|Shivam Sharma et.al.|[2305.15913v2](http://arxiv.org/abs/2305.15913v2)|**[link](https://github.com/lcs2-iiitd/memex_meme_evidence)**|\n", "2305.15765": "|**2023-05-25**|**Language-Guided 3D Object Detection in Point Cloud for Autonomous Driving**|Wenhao Cheng et.al.|[2305.15765v1](http://arxiv.org/abs/2305.15765v1)|null|\n", "2305.15762": "|**2023-05-25**|**Dynamic Enhancement Network for Partial Multi-modality Person Re-identification**|Aihua Zheng et.al.|[2305.15762v1](http://arxiv.org/abs/2305.15762v1)|null|\n", "2305.15753": "|**2023-05-25**|**T2TD: Text-3D Generation Model based on Prior Knowledge Guidance**|Weizhi Nie et.al.|[2305.15753v1](http://arxiv.org/abs/2305.15753v1)|null|\n", "2305.15732": "|**2023-05-26**|**CLIP3Dstyler: Language Guided 3D Arbitrary Neural Style Transfer**|Ming Gao et.al.|[2305.15732v2](http://arxiv.org/abs/2305.15732v2)|null|\n", "2305.15688": "|**2023-05-25**|**Frame-Event Alignment and Fusion Network for High Frame Rate Tracking**|Jiqing Zhang et.al.|[2305.15688v1](http://arxiv.org/abs/2305.15688v1)|null|\n", "2305.15483": "|**2023-05-24**|**Weakly Supervised Vision-and-Language Pre-training with Relative Representations**|Chi Chen et.al.|[2305.15483v1](http://arxiv.org/abs/2305.15483v1)|null|\n", "2305.17102": "|**2023-05-26**|**GeoVLN: Learning Geometry-Enhanced Visual Representation with Slot Attention for Vision-and-Language Navigation**|Jingyang Huo et.al.|[2305.17102v1](http://arxiv.org/abs/2305.17102v1)|**[link](https://github.com/jingyanghuo/GeoVLN)**|\n", "2305.17100": "|**2023-05-26**|**BiomedGPT: A Unified and Generalist Biomedical Generative Pre-trained Transformer for Vision, Language, and Multimodal Tasks**|Kai Zhang et.al.|[2305.17100v1](http://arxiv.org/abs/2305.17100v1)|**[link](https://github.com/taokz/biomedgpt)**|\n", "2305.17011": "|**2023-05-26**|**SOC: Semantic-Assisted Object Cluster for Referring Video Object Segmentation**|Zhuoyan Luo et.al.|[2305.17011v1](http://arxiv.org/abs/2305.17011v1)|null|\n", "2305.16986": "|**2023-05-29**|**NavGPT: Explicit Reasoning in Vision-and-Language Navigation with Large Language Models**|Gengze Zhou et.al.|[2305.16986v2](http://arxiv.org/abs/2305.16986v2)|**[link](https://github.com/gengzezhou/navgpt)**|\n", "2305.16685": "|**2023-05-26**|**S4M: Generating Radiology Reports by A Single Model for Multiple Body Parts**|Qi Chen et.al.|[2305.16685v1](http://arxiv.org/abs/2305.16685v1)|**[link](https://github.com/ytongxie/s4m)**|\n", "2305.16556": "|**2023-05-26**|**LANISTR: Multimodal Learning from Structured and Unstructured Data**|Sayna Ebrahimi et.al.|[2305.16556v1](http://arxiv.org/abs/2305.16556v1)|null|\n", "2305.16434": "|**2023-05-25**|**Credit Valuation Adjustment in Financial Networks**|Irena Barja\u0161i\u0107 et.al.|[2305.16434v1](http://arxiv.org/abs/2305.16434v1)|null|\n", "2305.16406": "|**2023-05-25**|**Context-Aware Attention Layers coupled with Optimal Transport Domain Adaptation methods for recognizing dementia from spontaneous speech**|Loukas Ilias et.al.|[2305.16406v1](http://arxiv.org/abs/2305.16406v1)|null|\n", "2305.18171": "|**2023-05-29**|**Improved Probabilistic Image-Text Representations**|Sanghyuk Chun et.al.|[2305.18171v1](http://arxiv.org/abs/2305.18171v1)|**[link](https://github.com/naver-ai/pcmepp)**|\n", "2305.18009": "|**2023-05-29**|**Multi-Modal Face Stylization with a Generative Prior**|Mengtian Li et.al.|[2305.18009v1](http://arxiv.org/abs/2305.18009v1)|null|\n", "2305.17993": "|**2023-05-29**|**Multi-Scale Attention for Audio Question Answering**|Guangyao Li et.al.|[2305.17993v1](http://arxiv.org/abs/2305.17993v1)|**[link](https://github.com/gewu-lab/mwafm)**|\n", "2305.17941": "|**2023-05-29**|**Safety of autonomous vehicles: A survey on Model-based vs. AI-based approaches**|Dimia Iberraken et.al.|[2305.17941v1](http://arxiv.org/abs/2305.17941v1)|null|\n", "2305.17925": "|**2023-05-29**|**Identifying shifts in multi-modal travel patterns during special events using mobile data: Celebrating Vappu in Helsinki**|Zhiren Huang et.al.|[2305.17925v1](http://arxiv.org/abs/2305.17925v1)|null|\n", "2305.17911": "|**2023-05-29**|**TotalDefMeme: A Multi-Attribute Meme dataset on Total Defence in Singapore**|Nirmalendu Prakash et.al.|[2305.17911v1](http://arxiv.org/abs/2305.17911v1)|null|\n", "2305.17903": "|**2023-05-30**|**Deeply Coupled Cross-Modal Prompt Learning**|Xuejing Liu et.al.|[2305.17903v2](http://arxiv.org/abs/2305.17903v2)|**[link](https://github.com/gingl/cmpa)**|\n", "2305.17652": "|**2023-05-28**|**ConaCLIP: Exploring Distillation of Fully-Connected Knowledge Interaction Graph for Lightweight Text-Image Retrieval**|Jiapeng Wang et.al.|[2305.17652v1](http://arxiv.org/abs/2305.17652v1)|null|\n", "2305.17629": "|**2023-05-28**|**Multi-Modal Wireless Flexible Gel-Free Sensors with Edge Deep Learning for Detecting and Alerting Freezing of Gait in Parkinson's Patients**|Yuhan Hou et.al.|[2305.17629v1](http://arxiv.org/abs/2305.17629v1)|null|\n", "2305.17600": "|**2023-05-28**|**GAME-UP: Game-Aware Mode Enumeration and Understanding for Trajectory Prediction**|Justin Lidard et.al.|[2305.17600v1](http://arxiv.org/abs/2305.17600v1)|null|\n", "2305.17530": "|**2023-05-27**|**PuMer: Pruning and Merging Tokens for Efficient Vision Language Models**|Qingqing Cao et.al.|[2305.17530v1](http://arxiv.org/abs/2305.17530v1)|**[link](https://github.com/csarron/pumer)**|\n", "2305.17499": "|**2023-05-27**|**CIF-PT: Bridging Speech and Text Representations for Spoken Language Understanding via Continuous Integrate-and-Fire Pre-Training**|Linhao Dong et.al.|[2305.17499v1](http://arxiv.org/abs/2305.17499v1)|null|\n", "2305.17455": "|**2023-05-27**|**CrossGET: Cross-Guided Ensemble of Tokens for Accelerating Vision-Language Transformers**|Dachuan Shi et.al.|[2305.17455v1](http://arxiv.org/abs/2305.17455v1)|**[link](https://github.com/sdc17/crossget)**|\n", "2305.17343": "|**2023-05-27**|**Modality-Independent Teachers Meet Weakly-Supervised Audio-Visual Event Parser**|Yung-Hsuan Lai et.al.|[2305.17343v1](http://arxiv.org/abs/2305.17343v1)|**[link](https://github.com/franklin905/valor)**|\n", "2305.17219": "|**2023-05-26**|**GVdoc: Graph-based Visual Document Classification**|Fnu Mohbat et.al.|[2305.17219v1](http://arxiv.org/abs/2305.17219v1)|**[link](https://github.com/mohbattharani/GVdoc)**|\n", "2305.19270": "|**2023-05-30**|**Learning without Forgetting for Vision-Language Models**|Da-Wei Zhou et.al.|[2305.19270v1](http://arxiv.org/abs/2305.19270v1)|null|\n", "2305.19240": "|**2023-05-30**|**NetHack is Hard to Hack**|Ulyana Piterbarg et.al.|[2305.19240v1](http://arxiv.org/abs/2305.19240v1)|**[link](https://github.com/upiterbarg/hihack)**|\n", "2305.19228": "|**2023-05-30**|**Unsupervised Melody-to-Lyric Generation**|Yufei Tian et.al.|[2305.19228v1](http://arxiv.org/abs/2305.19228v1)|**[link](https://github.com/amazon-science/unsupervised-melody-to-lyrics-generation)**|\n", "2305.19216": "|**2023-05-30**|**Translation-Enhanced Multilingual Text-to-Image Generation**|Yaoyiran Li et.al.|[2305.19216v1](http://arxiv.org/abs/2305.19216v1)|null|\n", "2305.18980": "|**2023-05-30**|**Multi-modal Queried Object Detection in the Wild**|Yifan Xu et.al.|[2305.18980v1](http://arxiv.org/abs/2305.18980v1)|**[link](https://github.com/yifanxu74/mq-det)**|\n", "2305.18969": "|**2023-05-30**|**MS-DETR: Natural Language Video Localization with Sampling Moment-Moment Interaction**|Jing Wang et.al.|[2305.18969v1](http://arxiv.org/abs/2305.18969v1)|**[link](https://github.com/k-nick/ms-detr)**|\n", "2305.18898": "|**2023-05-30**|**AlphaBlock: Embodied Finetuning for Vision-Language Reasoning in Robot Manipulation**|Chuhao Jin et.al.|[2305.18898v1](http://arxiv.org/abs/2305.18898v1)|null|\n", "2305.18842": "|**2023-05-30**|**Generate then Select: Open-ended Visual Question Answering Guided by World Knowledge**|Xingyu Fu et.al.|[2305.18842v1](http://arxiv.org/abs/2305.18842v1)|null|\n", "2305.18752": "|**2023-05-30**|**GPT4Tools: Teaching Large Language Model to Use Tools via Self-instruction**|Rui Yang et.al.|[2305.18752v1](http://arxiv.org/abs/2305.18752v1)|**[link](https://github.com/stevengrove/gpt4tools)**|\n", "2305.18721": "|**2023-05-30**|**LayoutMask: Enhance Text-Layout Interaction in Multi-modal Pre-training for Document Understanding**|Yi Tu et.al.|[2305.18721v1](http://arxiv.org/abs/2305.18721v1)|null|\n", "2305.18641": "|**2023-05-29**|**Enhanced Chart Understanding in Vision and Language Task via Cross-modal Pre-training on Plot Table Pairs**|Mingyang Zhou et.al.|[2305.18641v1](http://arxiv.org/abs/2305.18641v1)|null|\n", "2305.18500": "|**2023-05-29**|**VAST: A Vision-Audio-Subtitle-Text Omni-Modality Foundation Model and Dataset**|Sihan Chen et.al.|[2305.18500v1](http://arxiv.org/abs/2305.18500v1)|**[link](https://github.com/txh-mercury/vast)**|\n", "2305.19972": "|**2023-05-31**|**ViLaS: Integrating Vision and Language into Automatic Speech Recognition**|Minglun Han et.al.|[2305.19972v1](http://arxiv.org/abs/2305.19972v1)|null|\n", "2305.19924": "|**2023-06-01**|**Joint Adaptive Representations for Image-Language Learning**|AJ Piergiovanni et.al.|[2305.19924v2](http://arxiv.org/abs/2305.19924v2)|null|\n", "2305.19912": "|**2023-05-31**|**Structure-Aware Language Model Pretraining Improves Dense Retrieval on Structured Data**|Xinze Li et.al.|[2305.19912v1](http://arxiv.org/abs/2305.19912v1)|**[link](https://github.com/openmatch/openmatch)**|\n", "2305.19894": "|**2023-05-31**|**Med-UniC: Unifying Cross-Lingual Medical Vision-Language Pre-Training by Diminishing Bias**|Zhongwei Wan et.al.|[2305.19894v1](http://arxiv.org/abs/2305.19894v1)|**[link](https://github.com/SUSTechBruce/Med-UniC)**|\n", "2305.19664": "|**2023-05-31**|**Unveiling Cross Modality Bias in Visual Question Answering: A Causal View with Possible Worlds VQA**|Ali Vosoughi et.al.|[2305.19664v1](http://arxiv.org/abs/2305.19664v1)|null|\n", "2305.19624": "|**2023-05-31**|**A Multi-Modal Transformer Network for Action Detection**|Matthew Korban et.al.|[2305.19624v1](http://arxiv.org/abs/2305.19624v1)|null|\n", "2305.19595": "|**2023-06-01**|**Dense and Aligned Captions (DAC) Promote Compositional Reasoning in VL Models**|Sivan Doveh et.al.|[2305.19595v2](http://arxiv.org/abs/2305.19595v2)|null|\n", "2305.19522": "|**2023-06-01**|**PromptStyle: Controllable Style Transfer for Text-to-Speech with Natural Language Descriptions**|Guanghou Liu et.al.|[2305.19522v2](http://arxiv.org/abs/2305.19522v2)|null|\n", "2306.00978": "|**2023-06-01**|**AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration**|Ji Lin et.al.|[2306.00978v1](http://arxiv.org/abs/2306.00978v1)|**[link](https://github.com/mit-han-lab/llm-awq)**|\n", "2306.00964": "|**2023-06-01**|**Cocktail: Mixing Multi-Modality Controls for Text-Conditional Image Generation**|Minghui Hu et.al.|[2306.00964v1](http://arxiv.org/abs/2306.00964v1)|null|\n", "2306.00958": "|**2023-06-01**|**LIV: Language-Image Representations and Rewards for Robotic Control**|Yecheng Jason Ma et.al.|[2306.00958v1](http://arxiv.org/abs/2306.00958v1)|**[link](https://github.com/penn-pal-lab/liv)**|\n", "2306.00932": "|**2023-06-01**|**Cross Modal Data Discovery over Structured and Unstructured Data Lakes**|Mohamed Y. Eltabakh et.al.|[2306.00932v1](http://arxiv.org/abs/2306.00932v1)|**[link](https://github.com/qcri/cmdl)**|\n", "2306.00813": "|**2023-06-01**|**UniDiff: Advancing Vision-Language Models with Generative and Discriminative Learning**|Xiao Dong et.al.|[2306.00813v1](http://arxiv.org/abs/2306.00813v1)|null|\n", "2306.00792": "|**2023-06-01**|**Learning Across Decentralized Multi-Modal Remote Sensing Archives with Federated Learning**|Bar\u0131\u015f B\u00fcy\u00fckta\u015f et.al.|[2306.00792v1](http://arxiv.org/abs/2306.00792v1)|null|\n", "2306.00789": "|**2023-06-01**|**Improved Cross-Lingual Transfer Learning For Automatic Speech Translation**|Sameer Khurana et.al.|[2306.00789v1](http://arxiv.org/abs/2306.00789v1)|null|\n", "2306.00783": "|**2023-06-01**|**FDNeRF: Semantics-Driven Face Reconstruction, Prompt Editing and Relighting with Diffusion Models**|Hao Zhang et.al.|[2306.00783v1](http://arxiv.org/abs/2306.00783v1)|**[link](https://github.com/billyxyb/fdnerf)**|\n", "2306.00640": "|**2023-06-01**|**Multi-Modal Deep Learning for Multi-Temporal Urban Mapping With a Partly Missing Optical Modality**|Sebastian Hafner et.al.|[2306.00640v1](http://arxiv.org/abs/2306.00640v1)|null|\n", "2306.00424": "|**2023-06-01**|**End-to-end Knowledge Retrieval with Multi-modal Queries**|Man Luo et.al.|[2306.00424v1](http://arxiv.org/abs/2306.00424v1)|**[link](https://github.com/luomancs/remuq)**|\n", "2306.00409": "|**2023-06-01**|**Adapting Pre-trained Language Models to Vision-Language Tasks via Dynamic Visual Prompting**|Shubin Huang et.al.|[2306.00409v1](http://arxiv.org/abs/2306.00409v1)|**[link](https://github.com/hsb1357173526/dynamic_visual_prompting)**|\n", "2306.00386": "|**2023-06-01**|**Symmetric Uncertainty-Aware Feature Transmission for Depth Super-Resolution**|Wuxuan Shi et.al.|[2306.00386v1](http://arxiv.org/abs/2306.00386v1)|**[link](https://github.com/shiwuxuan/suft)**|\n", "2306.00228": "|**2023-05-31**|**Using Visual Cropping to Enhance Fine-Detail Question Answering of BLIP-Family Models**|Jiarui Zhang et.al.|[2306.00228v1](http://arxiv.org/abs/2306.00228v1)|null|\n", "2306.00179": "|**2023-05-31**|**LeggedWalking on Inclined Surfaces**|Chenghao Wang et.al.|[2306.00179v1](http://arxiv.org/abs/2306.00179v1)|null|\n", "2306.00103": "|**2023-05-31**|**ManagerTower: Aggregating the Insights of Uni-Modal Experts for Vision-Language Representation Learning**|Xiao Xu et.al.|[2306.00103v1](http://arxiv.org/abs/2306.00103v1)|**[link](https://github.com/looperxx/managertower)**|\n", "2306.01733": "|**2023-06-02**|**DocFormerv2: Local Features for Document Understanding**|Srikar Appalaraju et.al.|[2306.01733v1](http://arxiv.org/abs/2306.01733v1)|null|\n", "2306.01675": "|**2023-06-02**|**Bayesian Segmentation Modeling of Epidemic Growth**|Tejasv Bedi et.al.|[2306.01675v1](http://arxiv.org/abs/2306.01675v1)|null|\n", "2306.01656": "|**2023-06-02**|**Backchannel Detection and Agreement Estimation from Video with Transformer Networks**|Ahmed Amer et.al.|[2306.01656v1](http://arxiv.org/abs/2306.01656v1)|**[link](https://git.opendfki.de/body_language/ijcnn23-backchannel-detection)**|\n", "2306.01523": "|**2023-06-02**|**Transformer-based Multi-Modal Learning for Multi Label Remote Sensing Image Classification**|David Hoffmann et.al.|[2306.01523v1](http://arxiv.org/abs/2306.01523v1)|null|\n", "2306.01492": "|**2023-06-02**|**Multi-Modal Emotion Recognition for Enhanced Requirements Engineering: A Novel Approach**|Ben Cheng et.al.|[2306.01492v1](http://arxiv.org/abs/2306.01492v1)|null|\n", "2306.01312": "|**2023-06-02**|**Syntax-aware Hybrid prompt model for Few-shot multi-modal sentiment analysis**|Zikai Zhou et.al.|[2306.01312v1](http://arxiv.org/abs/2306.01312v1)|null|\n", "2306.01311": "|**2023-06-02**|**MetaVL: Transferring In-Context Learning Ability From Language Models to Vision-Language Models**|Masoud Monajatipoor et.al.|[2306.01311v1](http://arxiv.org/abs/2306.01311v1)|null|\n", "2306.01163": "|**2023-06-01**|**A Multi-Modal Latent-Features based Service Recommendation System for the Social Internet of Things**|Amar Khelloufi et.al.|[2306.01163v1](http://arxiv.org/abs/2306.01163v1)|null|\n", "2306.01144": "|**2023-06-01**|**Evaluating the Capabilities of Multi-modal Reasoning Models with Synthetic Task Data**|Nathan Vaska et.al.|[2306.01144v1](http://arxiv.org/abs/2306.01144v1)|null|\n", "2306.01112": "|**2023-06-01**|**What if We Enrich day-ahead Solar Irradiance Time Series Forecasting with Spatio-Temporal Context?**|Oussama Boussif et.al.|[2306.01112v1](http://arxiv.org/abs/2306.01112v1)|**[link](https://github.com/gitbooo/CrossViVit)**|\n", "2306.02972": "|**2023-06-05**|**Simultaneous or Sequential Training? How Speech Representations Cooperate in a Multi-Task Self-Supervised Learning System**|Khazar Khorrami et.al.|[2306.02972v1](http://arxiv.org/abs/2306.02972v1)|null|\n", "2306.02901": "|**2023-06-05**|**A Vessel-Segmentation-Based CycleGAN for Unpaired Multi-modal Retinal Image Synthesis**|Aline Sindel et.al.|[2306.02901v1](http://arxiv.org/abs/2306.02901v1)|null|\n", "2306.02894": "|**2023-06-05**|**Recyclable Semi-supervised Method Based on Multi-model Ensemble for Video Scene Parsing**|Biao Wu et.al.|[2306.02894v1](http://arxiv.org/abs/2306.02894v1)|null|\n", "2306.02858": "|**2023-06-06**|**Video-LLaMA: An Instruction-tuned Audio-Visual Language Model for Video Understanding**|Hang Zhang et.al.|[2306.02858v2](http://arxiv.org/abs/2306.02858v2)|**[link](https://github.com/damo-nlp-sg/video-llama)**|\n", "2306.02841": "|**2023-06-05**|**CTRL: Connect Tabular and Language Model for CTR Prediction**|Xiangyang Li et.al.|[2306.02841v1](http://arxiv.org/abs/2306.02841v1)|null|\n", "2306.02831": "|**2023-06-05**|**MM-DAG: Multi-task DAG Learning for Multi-modal Data -- with Application for Traffic Congestion Analysis**|Tian Lan et.al.|[2306.02831v1](http://arxiv.org/abs/2306.02831v1)|**[link](https://github.com/lantian72/mm-dag)**|\n", "2306.02673": "|**2023-06-05**|**Cross-Modal Vertical Federated Learning for MRI Reconstruction**|Yunlu Yan et.al.|[2306.02673v1](http://arxiv.org/abs/2306.02673v1)|null|\n", "2306.02596": "|**2023-06-05**|**A Novel Interpretable and Generalizable Re-synchronization Model for Cued Speech based on a Multi-Cuer Corpus**|Lufei Gao et.al.|[2306.02596v1](http://arxiv.org/abs/2306.02596v1)|**[link](https://github.com/lufei321/resync-cs)**|\n", "2306.02546": "|**2023-06-05**|**LmPa: Improving Decompilation by Synergy of Large Language Model and Program Analysis**|Xiangzhe Xu et.al.|[2306.02546v1](http://arxiv.org/abs/2306.02546v1)|null|\n", "2306.02329": "|**2023-06-04**|**Multi-CLIP: Contrastive Vision-Language Pre-training for Question Answering tasks in 3D Scenes**|Alexandros Delitzas et.al.|[2306.02329v1](http://arxiv.org/abs/2306.02329v1)|null|\n", "2306.02307": "|**2023-06-04**|**Finding the SWEET Spot: Analysis and Improvement of Adaptive Inference in Low Resource Settings**|Daniel Rotem et.al.|[2306.02307v1](http://arxiv.org/abs/2306.02307v1)|null|\n", "2306.02259": "|**2023-06-04**|**Predicting Information Pathways Across Online Communities**|Yiqiao Jin et.al.|[2306.02259v1](http://arxiv.org/abs/2306.02259v1)|**[link](https://github.com/claws-lab/inpac)**|\n", "2306.02137": "|**2023-06-03**|**Inconsistent Matters: A Knowledge-guided Dual-consistency Network for Multi-modal Rumor Detection**|Mengzhu Sun et.al.|[2306.02137v1](http://arxiv.org/abs/2306.02137v1)|**[link](https://github.com/mengzsun/kdcn)**|\n", "2306.02050": "|**2023-06-06**|**Provable Dynamic Fusion for Low-Quality Multimodal Data**|Qingyang Zhang et.al.|[2306.02050v2](http://arxiv.org/abs/2306.02050v2)|**[link](https://github.com/qingyangzhang/qmf)**|\n", "2306.01929": "|**2023-06-02**|**Recent Advances of Local Mechanisms in Computer Vision: A Survey and Outlook of Recent Work**|Qiangchang Wang et.al.|[2306.01929v1](http://arxiv.org/abs/2306.01929v1)|null|\n", "2306.03899": "|**2023-06-06**|**Towards Label-free Scene Understanding by Vision Foundation Models**|Runnan Chen et.al.|[2306.03899v1](http://arxiv.org/abs/2306.03899v1)|**[link](https://github.com/runnanchen/label-free-scene-understanding)**|\n", "2306.03810": "|**2023-06-06**|**X-Align++: cross-modal cross-view alignment for Bird's-eye-view segmentation**|Shubhankar Borse et.al.|[2306.03810v1](http://arxiv.org/abs/2306.03810v1)|null|\n", "2306.03802": "|**2023-06-06**|**Learning to Ground Instructional Articles in Videos through Narrations**|Effrosyni Mavroudi et.al.|[2306.03802v1](http://arxiv.org/abs/2306.03802v1)|null|\n", "2306.03730": "|**2023-06-06**|**Modality-Agnostic Learning for Medical Image Segmentation Using Multi-modality Self-distillation**|Qisheng He et.al.|[2306.03730v1](http://arxiv.org/abs/2306.03730v1)|null|\n", "2306.03678": "|**2023-06-06**|**On the Difference of BERT-style and CLIP-style Text Encoders**|Zhihong Chen et.al.|[2306.03678v1](http://arxiv.org/abs/2306.03678v1)|**[link](https://github.com/zhjohnchan/bert-clip-synesthesia)**|\n", "2306.03650": "|**2023-06-06**|**A Quantum Probability Driven Framework for Joint Multi-Modal Sarcasm, Sentiment and Emotion Analysis**|Yaochen Liu et.al.|[2306.03650v1](http://arxiv.org/abs/2306.03650v1)|null|\n", "2306.03617": "|**2023-06-06**|**A Data-Efficient Approach for Long-Term Human Motion Prediction Using Maps of Dynamics**|Yufei Zhu et.al.|[2306.03617v1](http://arxiv.org/abs/2306.03617v1)|null|\n", "2306.03367": "|**2023-06-06**|**Bridging the Gap Between Multi-Step and One-Shot Trajectory Prediction via Self-Supervision**|Faris Janjo\u0161 et.al.|[2306.03367v1](http://arxiv.org/abs/2306.03367v1)|null|\n", "2306.03252": "|**2023-06-05**|**RACECAR -- The Dataset for High-Speed Autonomous Racing**|Amar Kulkarni et.al.|[2306.03252v1](http://arxiv.org/abs/2306.03252v1)|**[link](https://github.com/linklab-uva/racecar_data)**|\n", "2306.04445": "|**2023-06-07**|**Multi-modal Latent Diffusion**|Mustapha Bounoua et.al.|[2306.04445v1](http://arxiv.org/abs/2306.04445v1)|null|\n", "2306.04387": "|**2023-06-08**|**M$^3$IT: A Large-Scale Dataset towards Multi-Modal Multilingual Instruction Tuning**|Lei Li et.al.|[2306.04387v2](http://arxiv.org/abs/2306.04387v2)|null|\n", "2306.04362": "|**2023-06-07**|**Youku-mPLUG: A 10 Million Large-scale Chinese Video-Language Dataset for Pre-training and Benchmarks**|Haiyang Xu et.al.|[2306.04362v1](http://arxiv.org/abs/2306.04362v1)|**[link](https://github.com/x-plug/youku-mplug)**|\n", "2306.04272": "|**2023-06-07**|**On the Generalization of Multi-modal Contrastive Learning**|Qi Zhang et.al.|[2306.04272v1](http://arxiv.org/abs/2306.04272v1)|**[link](https://github.com/pku-ml/clip-help-simclr)**|\n", "2306.04163": "|**2023-06-07**|**Enhancing Virtual Assistant Intelligence: Precise Area Targeting for Instance-level User Intents beyond Metadata**|Mengyu Chen et.al.|[2306.04163v1](http://arxiv.org/abs/2306.04163v1)|null|\n", "2306.04083": "|**2023-06-07**|**Coverage Path Planning with Budget Constraints for Multiple Unmanned Ground Vehicles**|Vu Phi Tran et.al.|[2306.04083v1](http://arxiv.org/abs/2306.04083v1)|null|\n", "2306.04021": "|**2023-06-06**|**Energy-Based Models for Cross-Modal Localization using Convolutional Transformers**|Alan Wu et.al.|[2306.04021v1](http://arxiv.org/abs/2306.04021v1)|null|\n", "2306.05425": "|**2023-06-08**|**MIMIC-IT: Multi-Modal In-Context Instruction Tuning**|Bo Li et.al.|[2306.05425v1](http://arxiv.org/abs/2306.05425v1)|**[link](https://github.com/luodian/otter)**|\n", "2306.04928": "|**2023-06-08**|**Underwater Intention Recognition using Head Motion and Throat Vibration for Supernumerary Robotic Assistance**|Yuqin Guo et.al.|[2306.04928v1](http://arxiv.org/abs/2306.04928v1)|null|\n", "2306.06048": "|**2023-06-09**|**How Does Fine-Tuning Impact Out-of-Distribution Detection for Vision-Language Models?**|Yifei Ming et.al.|[2306.06048v1](http://arxiv.org/abs/2306.06048v1)|null|\n", "2306.05716": "|**2023-06-09**|**Pave the Way to Grasp Anything: Transferring Foundation Models for Universal Pick-Place Robots**|Jiange Yang et.al.|[2306.05716v1](http://arxiv.org/abs/2306.05716v1)|null|\n", "2306.05493": "|**2023-06-08**|**Multi-Modal Classifiers for Open-Vocabulary Object Detection**|Prannay Kaul et.al.|[2306.05493v1](http://arxiv.org/abs/2306.05493v1)|null|\n", "2306.07272": "|**2023-06-12**|**Zero-shot Composed Text-Image Retrieval**|Yikun Liu et.al.|[2306.07272v1](http://arxiv.org/abs/2306.07272v1)|**[link](https://github.com/Code-kunkun/ZS-CIR)**|\n", "2306.07257": "|**2023-06-12**|**MovieFactory: Automatic Movie Creation from Text using Large Generative Models for Language and Images**|Junchen Zhu et.al.|[2306.07257v1](http://arxiv.org/abs/2306.07257v1)|null|\n", "2306.07207": "|**2023-06-12**|**Valley: Video Assistant with Large Language model Enhanced abilitY**|Ruipu Luo et.al.|[2306.07207v1](http://arxiv.org/abs/2306.07207v1)|**[link](https://github.com/rupertluo/valley)**|\n", "2306.07196": "|**2023-06-12**|**Retrieval-Enhanced Contrastive Vision-Text Models**|Ahmet Iscen et.al.|[2306.07196v1](http://arxiv.org/abs/2306.07196v1)|null|\n", "2306.07187": "|**2023-06-12**|**Video-to-Music Recommendation using Temporal Alignment of Segments**|Laure Pr\u00e9tet et.al.|[2306.07187v1](http://arxiv.org/abs/2306.07187v1)|null|\n", "2306.07096": "|**2023-06-12**|**Global and Local Semantic Completion Learning for Vision-Language Pre-training**|Rong-Cheng Tu et.al.|[2306.07096v1](http://arxiv.org/abs/2306.07096v1)|**[link](https://github.com/iigroup/scl)**|\n", "2306.06885": "|**2023-06-12**|**NPVForensics: Jointing Non-critical Phonemes and Visemes for Deepfake Detection**|Yu Chen et.al.|[2306.06885v1](http://arxiv.org/abs/2306.06885v1)|null|\n", "2306.06691": "|**2023-06-11**|**Self-Enhancement Improves Text-Image Retrieval in Foundation Visual-Language Models**|Yuguang Yang et.al.|[2306.06691v1](http://arxiv.org/abs/2306.06691v1)|null|\n", "2306.06687": "|**2023-06-11**|**LAMM: Language-Assisted Multi-Modal Instruction-Tuning Dataset, Framework, and Benchmark**|Zhenfei Yin et.al.|[2306.06687v1](http://arxiv.org/abs/2306.06687v1)|**[link](https://github.com/openlamm/lamm)**|\n", "2306.06615": "|**2023-06-11**|**Empowering Molecule Discovery for Molecule-Caption Translation with Large Language Models: A ChatGPT Perspective**|Jiatong Li et.al.|[2306.06615v1](http://arxiv.org/abs/2306.06615v1)|**[link](https://github.com/phenixace/molregpt)**|\n", "2306.06583": "|**2023-06-11**|**REACT2023: the first Multi-modal Multiple Appropriate Facial Reaction Generation Challenge**|Siyang Song et.al.|[2306.06583v1](http://arxiv.org/abs/2306.06583v1)|**[link](https://github.com/reactmultimodalchallenge/baseline_react2023)**|\n", "2306.06494": "|**2023-06-10**|**Multi-modal Pre-training for Medical Vision-language Understanding and Generation: An Empirical Study with A New Benchmark**|Li Xu et.al.|[2306.06494v1](http://arxiv.org/abs/2306.06494v1)|**[link](https://github.com/control-xl/medical-vision-langauge-transformer)**|\n", "2306.06476": "|**2023-06-10**|**Modality Influence in Multimodal Machine Learning**|Abdelhamid Haouhat et.al.|[2306.06476v1](http://arxiv.org/abs/2306.06476v1)|null|\n", "2306.06465": "|**2023-06-10**|**Simultaneous Trajectory Optimization and Contact Selection for Multi-Modal Manipulation Planning**|Mengchao Zhang et.al.|[2306.06465v1](http://arxiv.org/abs/2306.06465v1)|null|\n", "2306.06410": "|**2023-06-10**|**OpenSR: Open-Modality Speech Recognition via Maintaining Multi-Modality Alignment**|Xize Cheng et.al.|[2306.06410v1](http://arxiv.org/abs/2306.06410v1)|**[link](https://github.com/exgc/opensr)**|\n", "2306.07744": "|**2023-06-13**|**Contrastive Learning-Based Audio to Lyrics Alignment for Multiple Languages**|Simon Durand et.al.|[2306.07744v1](http://arxiv.org/abs/2306.07744v1)|**[link](https://github.com/f90/jamendolyrics)**|\n", "2306.07646": "|**2023-06-13**|**Enhanced Multimodal Representation Learning with Cross-modal KD**|Mengxi Chen et.al.|[2306.07646v1](http://arxiv.org/abs/2306.07646v1)|null|\n", "2306.07505": "|**2023-06-13**|**Deep learning radiomics for assessment of gastroesophageal varices in people with compensated advanced chronic liver disease**|Lan Wang et.al.|[2306.07505v1](http://arxiv.org/abs/2306.07505v1)|null|\n", "2306.07303": "|**2023-06-11**|**A Comprehensive Survey on Applications of Transformers for Deep Learning Tasks**|Saidul Islam et.al.|[2306.07303v1](http://arxiv.org/abs/2306.07303v1)|null|\n", "2306.09347": "|**2023-06-15**|**Segment Any Point Cloud Sequences by Distilling Vision Foundation Models**|Youquan Liu et.al.|[2306.09347v1](http://arxiv.org/abs/2306.09347v1)|**[link](https://github.com/youquanl/segment-any-point-cloud)**|\n", "2306.09265": "|**2023-06-15**|**LVLM-eHub: A Comprehensive Evaluation Benchmark for Large Vision-Language Models**|Peng Xu et.al.|[2306.09265v1](http://arxiv.org/abs/2306.09265v1)|**[link](https://github.com/opengvlab/multi-modality-arena)**|\n", "2306.09093": "|**2023-06-15**|**Macaw-LLM: Multi-Modal Language Modeling with Image, Audio, Video, and Text Integration**|Chenyang Lyu et.al.|[2306.09093v1](http://arxiv.org/abs/2306.09093v1)|**[link](https://github.com/lyuchenyang/macaw-llm)**|\n", "2306.09067": "|**2023-06-15**|**Winning Solution for the CVPR2023 Visual Anomaly and Novelty Detection Challenge: Multimodal Prompting for Data-centric Anomaly Detection**|Yunkang Cao et.al.|[2306.09067v1](http://arxiv.org/abs/2306.09067v1)|**[link](https://github.com/caoyunkang/segment-any-anomaly)**|\n", "2306.08966": "|**2023-06-15**|**Training Multimedia Event Extraction With Generated Images and Captions**|Zilin Du et.al.|[2306.08966v1](http://arxiv.org/abs/2306.08966v1)|null|\n", "2306.08893": "|**2023-06-15**|**LOVM: Language-Only Vision Model Selection**|Orr Zohar et.al.|[2306.08893v1](http://arxiv.org/abs/2306.08893v1)|**[link](https://github.com/orrzohar/lovm)**|\n", "2306.08871": "|**2023-06-15**|**Med-MMHL: A Multi-Modal Dataset for Detecting Human- and LLM-Generated Misinformation in the Medical Domain**|Yanshen Sun et.al.|[2306.08871v1](http://arxiv.org/abs/2306.08871v1)|**[link](https://github.com/styxsys0927/med-mmhl)**|\n", "2306.08832": "|**2023-06-15**|**Contrasting Intra-Modal and Ranking Cross-Modal Hard Negatives to Enhance Visio-Linguistic Fine-grained Understanding**|Le Zhang et.al.|[2306.08832v1](http://arxiv.org/abs/2306.08832v1)|**[link](https://github.com/magiccircuit/enhance-finegrained)**|\n", "2306.08789": "|**2023-06-15**|**Efficient Token-Guided Image-Text Retrieval with Consistent Multimodal Contrastive Training**|Chong Liu et.al.|[2306.08789v1](http://arxiv.org/abs/2306.08789v1)|null|\n", "2306.08749": "|**2023-06-14**|**Utilizing Longitudinal Chest X-Rays and Reports to Pre-Fill Radiology Reports**|Qingqing Zhu et.al.|[2306.08749v1](http://arxiv.org/abs/2306.08749v1)|null|\n", "2306.08657": "|**2023-06-14**|**EMERSK -- Explainable Multimodal Emotion Recognition with Situational Knowledge**|Mijanur Palash et.al.|[2306.08657v1](http://arxiv.org/abs/2306.08657v1)|null|\n", "2306.08640": "|**2023-06-14**|**AssistGPT: A General Multi-modal Assistant that can Plan, Execute, Inspect, and Learn**|Difei Gao et.al.|[2306.08640v1](http://arxiv.org/abs/2306.08640v1)|null|\n", "2306.08522": "|**2023-06-14**|**Challenges of Indoor SLAM: A multi-modal multi-floor dataset for SLAM evaluation**|Pushyami Kaveti et.al.|[2306.08522v1](http://arxiv.org/abs/2306.08522v1)|**[link](https://github.com/neufieldrobotics/nufr-m3f)**|\n", "2306.08498": "|**2023-06-14**|**RISCLIP: Referring Image Segmentation Framework using CLIP**|Seoyeon Kim et.al.|[2306.08498v1](http://arxiv.org/abs/2306.08498v1)|**[link](https://github.com/Yeon07/RISCLIP)**|\n", "2306.08247": "|**2023-06-14**|**Diffusion in Diffusion: Cyclic One-Way Diffusion for Text-Vision-Conditioned Generation**|Yongqi Yang et.al.|[2306.08247v1](http://arxiv.org/abs/2306.08247v1)|null|\n", "2306.09851": "|**2023-06-16**|**Joint multi-modal Self-Supervised pre-training in Remote Sensing: Application to Methane Source Classification**|Paul Berg et.al.|[2306.09851v1](http://arxiv.org/abs/2306.09851v1)|null|\n", "2306.09546": "|**2023-06-15**|**Cross-Modal Video to Body-joints Augmentation for Rehabilitation Exercise Quality Assessment**|Ali Abedi et.al.|[2306.09546v1](http://arxiv.org/abs/2306.09546v1)|null|\n", "2306.09523": "|**2023-06-19**|**Tell Me Where to Go: A Composable Framework for Context-Aware Embodied Robot Navigation**|Harel Biggie et.al.|[2306.09523v2](http://arxiv.org/abs/2306.09523v2)|null|\n", "2306.09417": "|**2023-06-15**|**Diff-TTSG: Denoising probabilistic integrated speech and gesture synthesis**|Shivam Mehta et.al.|[2306.09417v1](http://arxiv.org/abs/2306.09417v1)|null|\n", "2306.11510": "|**2023-06-20**|**Pushing the Limits of 3D Shape Generation at Scale**|Wang Yu et.al.|[2306.11510v1](http://arxiv.org/abs/2306.11510v1)|null|\n", "2306.11504": "|**2023-06-20**|**Align, Adapt and Inject: Sound-guided Unified Image Generation**|Yue Yang et.al.|[2306.11504v1](http://arxiv.org/abs/2306.11504v1)|null|\n", "2306.11400": "|**2023-06-20**|**MuDPT: Multi-modal Deep-symphysis Prompt Tuning for Large Pre-trained Vision-Language Models**|Yongzhu Miao et.al.|[2306.11400v1](http://arxiv.org/abs/2306.11400v1)|**[link](https://github.com/mechrev0/mudpt)**|\n", "2306.11207": "|**2023-06-22**|**Quilt-1M: One Million Image-Text Pairs for Histopathology**|Wisdom Oluchi Ikezogwo et.al.|[2306.11207v2](http://arxiv.org/abs/2306.11207v2)|**[link](https://github.com/wisdomikezogwo/quilt1m)**|\n", "2306.11137": "|**2023-06-19**|**Deep Learning Framework with Multi-Head Dilated Encoders for Enhanced Segmentation of Cervical Cancer on Multiparametric Magnetic Resonance Imaging**|Reza Kalantar et.al.|[2306.11137v1](http://arxiv.org/abs/2306.11137v1)|null|\n", "2306.11065": "|**2023-06-19**|**Cross-Modal Attribute Insertions for Assessing the Robustness of Vision-and-Language Learning**|Shivaen Ramshetty et.al.|[2306.11065v1](http://arxiv.org/abs/2306.11065v1)|**[link](https://github.com/claws-lab/multimodal-robustness-xmai)**|\n", "2306.11025": "|**2023-06-19**|**Temporal Data Meets LLM -- Explainable Financial Time Series Forecasting**|Xinli Yu et.al.|[2306.11025v1](http://arxiv.org/abs/2306.11025v1)|null|\n", "2306.11020": "|**2023-06-19**|**Dual-Gated Fusion with Prefix-Tuning for Multi-Modal Relation Extraction**|Qian Li et.al.|[2306.11020v1](http://arxiv.org/abs/2306.11020v1)|null|\n", "2306.10830": "|**2023-06-19**|**3D VR Sketch Guided 3D Shape Prototyping and Exploration**|Ling Luo et.al.|[2306.10830v1](http://arxiv.org/abs/2306.10830v1)|**[link](https://github.com/rowl1ng/3dsketch2shape)**|\n", "2306.10799": "|**2023-06-19**|**SelfTalk: A Self-Supervised Commutative Training Diagram to Comprehend 3D Talking Faces**|Ziqiao Peng et.al.|[2306.10799v1](http://arxiv.org/abs/2306.10799v1)|**[link](https://github.com/psyai-net/SelfTalk_release)**|\n", "2306.10772": "|**2023-06-19**|**Learning an Interpretable End-to-End Network for Real-Time Acoustic Beamforming**|Hao Liang et.al.|[2306.10772v1](http://arxiv.org/abs/2306.10772v1)|null|\n", "2306.10750": "|**2023-06-19**|**WiCo: Win-win Cooperation of Bottom-up and Top-down Referring Image Segmentation**|Zesen Cheng et.al.|[2306.10750v1](http://arxiv.org/abs/2306.10750v1)|null|\n", "2306.10730": "|**2023-06-19**|**UniG3D: A Unified 3D Object Generation Dataset**|Qinghong Sun et.al.|[2306.10730v1](http://arxiv.org/abs/2306.10730v1)|null|\n", "2306.10687": "|**2023-06-19**|**Categories of Response-Based, Feature-Based, and Relation-Based Knowledge Distillation**|Chuanguang Yang et.al.|[2306.10687v1](http://arxiv.org/abs/2306.10687v1)|null|\n", "2306.10567": "|**2023-06-18**|**MIR-GAN: Refining Frame-Level Modality-Invariant Representations with Adversarial Network for Audio-Visual Speech Recognition**|Yuchen Hu et.al.|[2306.10567v1](http://arxiv.org/abs/2306.10567v1)|**[link](https://github.com/yuchen005/mir-gan)**|\n", "2306.12387": "|**2023-06-21**|**Solving Dialogue Grounding Embodied Task in a Simulated Environment using Further Masked Language Modeling**|Weijie Jack Zhang et.al.|[2306.12387v1](http://arxiv.org/abs/2306.12387v1)|null|\n", "2306.11762": "|**2023-06-20**|**MultiEarth 2023 Deforestation Challenge -- Team FOREVER**|Seunghan Park et.al.|[2306.11762v1](http://arxiv.org/abs/2306.11762v1)|null|\n", "2306.13076": "|**2023-06-22**|**A Comparison of Time-based Models for Multimodal Emotion Recognition**|Ege Kesim et.al.|[2306.13076v1](http://arxiv.org/abs/2306.13076v1)|null|\n", "2306.12819": "|**2023-06-22**|**XACML Extension for Graphs: Flexible Authorization Policy Specification and Datastore-independent Enforcement**|Aya Mohamed et.al.|[2306.12819v1](http://arxiv.org/abs/2306.12819v1)|null|\n", "2306.12795": "|**2023-06-22**|**Learning Unseen Modality Interaction**|Yunhua Zhang et.al.|[2306.12795v1](http://arxiv.org/abs/2306.12795v1)|null|\n", "2306.12725": "|**2023-06-22**|**Generative Multimodal Entity Linking**|Senbao Shi et.al.|[2306.12725v1](http://arxiv.org/abs/2306.12725v1)|**[link](https://github.com/hitsz-tmg/gemel)**|\n", "2306.12559": "|**2023-06-21**|**Exploring the Role of Audio in Video Captioning**|Yuhan Shen et.al.|[2306.12559v1](http://arxiv.org/abs/2306.12559v1)|null|\n", "2306.12525": "|**2023-06-21**|**LPFormer: LiDAR Pose Estimation Transformer with Multi-Task Network**|Dongqiangzi Ye et.al.|[2306.12525v1](http://arxiv.org/abs/2306.12525v1)|null|\n", "2306.13592": "|**2023-06-23**|**TACOformer:Token-channel compounded Cross Attention for Multimodal Emotion Recognition**|Xinda Li et.al.|[2306.13592v1](http://arxiv.org/abs/2306.13592v1)|null|\n", "2306.13285": "|**2023-06-23**|**Learning Scene Flow With Skeleton Guidance For 3D Action Recognition**|Vasileios Magoulianitis et.al.|[2306.13285v1](http://arxiv.org/abs/2306.13285v1)|null|\n", "2306.13240": "|**2023-06-22**|**Continuous Online Extrinsic Calibration of Fisheye Camera and LiDAR**|Jack Borer et.al.|[2306.13240v1](http://arxiv.org/abs/2306.13240v1)|null|\n", "2306.14795": "|**2023-06-26**|**MotionGPT: Human Motion as a Foreign Language**|Biao Jiang et.al.|[2306.14795v1](http://arxiv.org/abs/2306.14795v1)|**[link](https://github.com/openmotionlab/motiongpt)**|\n", "2306.14565": "|**2023-06-26**|**Aligning Large Multi-Modal Model with Robust Instruction Tuning**|Fuxiao Liu et.al.|[2306.14565v1](http://arxiv.org/abs/2306.14565v1)|**[link](https://github.com/FuxiaoLiu/LRV-Instruction)**|\n", "2306.14406": "|**2023-06-26**|**TCEIP: Text Condition Embedded Regression Network for Dental Implant Position Prediction**|Xinquan Yang et.al.|[2306.14406v1](http://arxiv.org/abs/2306.14406v1)|null|\n", "2306.14399": "|**2023-06-26**|**Mutual Query Network for Multi-Modal Product Image Segmentation**|Yun Guo et.al.|[2306.14399v1](http://arxiv.org/abs/2306.14399v1)|**[link](https://github.com/weifeng-github/mqn)**|\n", "2306.14177": "|**2023-06-25**|**Enhancing Mapless Trajectory Prediction through Knowledge Distillation**|Yuning Wang et.al.|[2306.14177v1](http://arxiv.org/abs/2306.14177v1)|null|\n", "2306.14170": "|**2023-06-25**|**AV-SepFormer: Cross-Attention SepFormer for Audio-Visual Target Speaker Extraction**|Jiuxin Lin et.al.|[2306.14170v1](http://arxiv.org/abs/2306.14170v1)|**[link](https://github.com/lin9x/av-sepformer)**|\n", "2306.14143": "|**2023-06-25**|**Intelligent Multi-Modal Sensing-Communication Integration: Synesthesia of Machines**|Xiang Cheng et.al.|[2306.14143v1](http://arxiv.org/abs/2306.14143v1)|null|\n", "2306.14125": "|**2023-06-25**|**M$^3$SC: A Generic Dataset for Mixed Multi-Modal (MMM) Sensing and Communication Integration**|Xiang Cheng et.al.|[2306.14125v1](http://arxiv.org/abs/2306.14125v1)|null|\n", "2306.14112": "|**2023-06-25**|**Enhancing Dynamic Image Advertising with Vision-Language Pre-training**|Zhoufutu Wen et.al.|[2306.14112v1](http://arxiv.org/abs/2306.14112v1)|null|\n", "2306.13856": "|**2023-06-24**|**Learning-to-Rank Meets Language: Boosting Language-Driven Ordering Alignment for Ordinal Classification**|Rui Wang et.al.|[2306.13856v1](http://arxiv.org/abs/2306.13856v1)|**[link](https://github.com/raywang335/l2rclip)**|\n", "2306.13804": "|**2023-06-27**|**Cross-Language Speech Emotion Recognition Using Multimodal Dual Attention Transformers**|Syed Aun Muhammad Zaidi et.al.|[2306.13804v2](http://arxiv.org/abs/2306.13804v2)|null|\n", "2306.15644": "|**2023-06-27**|**Style-transfer based Speech and Audio-visual Scene Understanding for Robot Action Sequence Acquisition from Videos**|Chiori Hori et.al.|[2306.15644v1](http://arxiv.org/abs/2306.15644v1)|null|\n", "2306.15612": "|**2023-06-27**|**Rethinking Cross-Entropy Loss for Stereo Matching Networks**|Peng Xu et.al.|[2306.15612v1](http://arxiv.org/abs/2306.15612v1)|null|\n", "2306.15605": "|**2023-06-27**|**Deep Normalizing Flows for State Estimation**|Harrison Delecki et.al.|[2306.15605v1](http://arxiv.org/abs/2306.15605v1)|**[link](https://github.com/sisl/deepnfstateestimation)**|\n", "2306.15464": "|**2023-06-27**|**Large-scale unsupervised audio pre-training for video-to-speech synthesis**|Triantafyllos Kefalas et.al.|[2306.15464v1](http://arxiv.org/abs/2306.15464v1)|null|\n", "2306.15255": "|**2023-06-27**|**GroundNLQ @ Ego4D Natural Language Queries Challenge 2023**|Zhijian Hou et.al.|[2306.15255v1](http://arxiv.org/abs/2306.15255v1)|**[link](https://github.com/houzhijian/groundnlq)**|\n", "2306.15231": "|**2023-06-27**|**Emulating Reader Behaviors for Fake News Detection**|Junwei Yin et.al.|[2306.15231v1](http://arxiv.org/abs/2306.15231v1)|null|\n", "2306.15114": "|**2023-06-26**|**Transfer: Cross Modality Knowledge Transfer using Adversarial Networks -- A Study on Gesture Recognition**|Payal Kamboj et.al.|[2306.15114v1](http://arxiv.org/abs/2306.15114v1)|null|\n", "2306.16349": "|**2023-06-28**|**Accurate, uncertainty-aware classification of molecular chemical motifs from multi-modal X-ray absorption spectroscopy**|Matthew R. Carbone et.al.|[2306.16349v1](http://arxiv.org/abs/2306.16349v1)|null|\n", "2306.16329": "|**2023-06-28**|**DiffComplete: Diffusion-based Generative 3D Shape Completion**|Ruihang Chu et.al.|[2306.16329v1](http://arxiv.org/abs/2306.16329v1)|null|\n", "2306.16207": "|**2023-06-28**|**Inferring the Goals of Communicating Agents from Actions and Instructions**|Lance Ying et.al.|[2306.16207v1](http://arxiv.org/abs/2306.16207v1)|null|\n", "2306.16034": "|**2023-06-28**|**Stone Needle: A General Multimodal Large-scale Model Framework towards Healthcare**|Weihua Liu et.al.|[2306.16034v1](http://arxiv.org/abs/2306.16034v1)|null|\n", "2306.15977": "|**2023-06-28**|**A Dimensional Structure based Knowledge Distillation Method for Cross-Modal Learning**|Lingyu Si et.al.|[2306.15977v1](http://arxiv.org/abs/2306.15977v1)|null|\n", "2306.15955": "|**2023-06-29**|**Bridging the Gap: Neural Collapse Inspired Prompt Tuning for Generalization under Class Imbalance**|Didi Zhu et.al.|[2306.15955v2](http://arxiv.org/abs/2306.15955v2)|null|\n", "2306.15946": "|**2023-06-28**|**Knowledge-Enhanced Hierarchical Information Correlation Learning for Multi-Modal Rumor Detection**|Jiawei Liu et.al.|[2306.15946v1](http://arxiv.org/abs/2306.15946v1)|null|\n", "2306.15943": "|**2023-06-28**|**No Transfers Required: Integrating Last Mile with Public Transit Using Opti-Mile**|Raashid Altaf et.al.|[2306.15943v1](http://arxiv.org/abs/2306.15943v1)|null|\n", "2306.15837": "|**2023-06-27**|**Symbol emergence as interpersonal cross-situational learning: the emergence of lexical knowledge with combinatoriality**|Yoshinobu Hagiwara et.al.|[2306.15837v1](http://arxiv.org/abs/2306.15837v1)|null|\n", "2306.15808": "|**2023-06-27**|**Classification of Infant Sleep/Wake States: Cross-Attention among Large Scale Pretrained Transformer Networks using Audio, ECG, and IMU Data**|Kai Chieh Chang et.al.|[2306.15808v1](http://arxiv.org/abs/2306.15808v1)|null|\n", "2306.15711": "|**2023-06-27**|**Semi-supervised Multimodal Representation Learning through a Global Workspace**|Benjamin Devillers et.al.|[2306.15711v1](http://arxiv.org/abs/2306.15711v1)|**[link](https://github.com/bdvllrs/bimgw)**|\n", "2306.17115": "|**2023-07-03**|**Michelangelo: Conditional 3D Shape Generation based on Shape-Image-Text Aligned Latent Representation**|Zibo Zhao et.al.|[2306.17115v2](http://arxiv.org/abs/2306.17115v2)|**[link](https://github.com/neuralcarver/michelangelo)**|\n", "2306.17107": "|**2023-06-29**|**LLaVAR: Enhanced Visual Instruction Tuning for Text-Rich Image Understanding**|Yanzhe Zhang et.al.|[2306.17107v1](http://arxiv.org/abs/2306.17107v1)|**[link](https://github.com/SALT-NLP/LLaVAR)**|\n", "2306.17000": "|**2023-06-29**|**MotionTrack: End-to-End Transformer-based Multi-Object Tracing with LiDAR-Camera Fusion**|Ce Zhang et.al.|[2306.17000v1](http://arxiv.org/abs/2306.17000v1)|null|\n", "2306.16927": "|**2023-06-29**|**End-to-end Autonomous Driving: Challenges and Frontiers**|Li Chen et.al.|[2306.16927v1](http://arxiv.org/abs/2306.16927v1)|**[link](https://github.com/opendrivelab/end-to-end-autonomous-driving)**|\n", "2306.16862": "|**2023-06-29**|**Sustainable Palm Tree Farming: Leveraging IoT and Multi-Modal Data for Early Detection and Mapping of Red Palm Weevil**|Yosra Hajjaji et.al.|[2306.16862v1](http://arxiv.org/abs/2306.16862v1)|null|\n", "2306.16762": "|**2023-06-29**|**Unified Language Representation for Question Answering over Text, Tables, and Images**|Bowen Yu et.al.|[2306.16762v1](http://arxiv.org/abs/2306.16762v1)|null|\n", "2306.16478": "|**2023-06-28**|**Pre-Training Multi-Modal Dense Retrievers for Outside-Knowledge Visual Question Answering**|Alireza Salemi et.al.|[2306.16478v1](http://arxiv.org/abs/2306.16478v1)|**[link](https://github.com/alirezasalemi7/pretraining-multimodal-dense-retriever-for-okvqa)**|\n", "2306.17525": "|**2023-06-30**|**MeLM, a generative pretrained language modeling framework that solves forward and inverse mechanics problems**|Markus J. Buehler et.al.|[2306.17525v1](http://arxiv.org/abs/2306.17525v1)|null|\n", "2306.17400": "|**2023-06-30**|**Topological Data Analysis Guided Segment Anything Model Prompt Optimization for Zero-Shot Segmentation in Biological Imaging**|Ruben Glatt et.al.|[2306.17400v1](http://arxiv.org/abs/2306.17400v1)|null|\n", "2306.17371": "|**2023-06-30**|**Capturing functional connectomics using Riemannian partial least squares**|Matt Ryan et.al.|[2306.17371v1](http://arxiv.org/abs/2306.17371v1)|null|\n", "2307.01146": "|**2023-07-05**|**AVSegFormer: Audio-Visual Segmentation with Transformer**|Shengyi Gao et.al.|[2307.01146v2](http://arxiv.org/abs/2307.01146v2)|**[link](https://github.com/vvvb-github/avsegformer)**|\n", "2307.01124": "|**2023-07-03**|**Cross-modality Attention Adapter: A Glioma Segmentation Fine-tuning Method for SAM Using Multimodal Brain MR Images**|Xiaoyu Shi et.al.|[2307.01124v1](http://arxiv.org/abs/2307.01124v1)|null|\n", "2307.01121": "|**2023-07-03**|**Artifacts Mapping: Multi-Modal Semantic Mapping for Object Detection and 3D Localization**|Federico Rollo et.al.|[2307.01121v1](http://arxiv.org/abs/2307.01121v1)|null|\n", "2307.01047": "|**2023-07-03**|**Cross-modal Place Recognition in Image Databases using Event-based Sensors**|Xiang Ji et.al.|[2307.01047v1](http://arxiv.org/abs/2307.01047v1)|null|\n", "2307.01003": "|**2023-07-03**|**Visual Instruction Tuning with Polite Flamingo**|Delong Chen et.al.|[2307.01003v1](http://arxiv.org/abs/2307.01003v1)|**[link](https://github.com/chendelong1999/polite_flamingo)**|\n", "2307.00997": "|**2023-07-03**|**RefSAM: Efficiently Adapting Segmenting Anything Model for Referring Video Object Segmentation**|Yonglin Li et.al.|[2307.00997v1](http://arxiv.org/abs/2307.00997v1)|**[link](https://github.com/lancasterli/refsam)**|\n", "2307.00954": "|**2023-07-03**|**HODINet: High-Order Discrepant Interaction Network for RGB-D Salient Object Detection**|Kang Yi et.al.|[2307.00954v1](http://arxiv.org/abs/2307.00954v1)|null|\n", "2307.00877": "|**2023-07-03**|**Exploring the Multi-modal Demand Dynamics During Transport System Disruptions**|Ali Shateri Benam et.al.|[2307.00877v1](http://arxiv.org/abs/2307.00877v1)|null|\n", "2307.00873": "|**2023-07-03**|**End-To-End Prediction of Knee Osteoarthritis Progression With Multi-Modal Transformers**|Egor Panfilov et.al.|[2307.00873v1](http://arxiv.org/abs/2307.00873v1)|null|\n", "2307.00716": "|**2023-07-03**|**JourneyDB: A Benchmark for Generative Image Understanding**|Junting Pan et.al.|[2307.00716v1](http://arxiv.org/abs/2307.00716v1)|null|\n", "2307.00671": "|**2023-07-02**|**Leveraging Multi-modal Sensing for Robotic Insertion Tasks in R&D Laboratories**|Aaron Butterworth et.al.|[2307.00671v1](http://arxiv.org/abs/2307.00671v1)|null|\n", "2307.00610": "|**2023-07-02**|**Fraunhofer SIT at CheckThat! 2023: Mixing Single-Modal Classifiers to Estimate the Check-Worthiness of Multi-Modal Tweets**|Raphael Frick et.al.|[2307.00610v1](http://arxiv.org/abs/2307.00610v1)|null|\n", "2307.00595": "|**2023-07-02**|**RH20T: A Robotic Dataset for Learning Diverse Skills in One-Shot**|Hao-Shu Fang et.al.|[2307.00595v1](http://arxiv.org/abs/2307.00595v1)|null|\n", "2307.00536": "|**2023-07-02**|**Referring Video Object Segmentation with Inter-Frame Interaction and Cross-Modal Correlation**|Meng Lan et.al.|[2307.00536v1](http://arxiv.org/abs/2307.00536v1)|null|\n", "2307.00398": "|**2023-07-01**|**ProbVLM: Probabilistic Adapter for Frozen Vison-Language Models**|Uddeshya Upadhyay et.al.|[2307.00398v1](http://arxiv.org/abs/2307.00398v1)|**[link](https://github.com/explainableml/probvlm)**|\n", "2307.02469": "|**2023-07-05**|**What Matters in Training a GPT4-Style Language Model with Multimodal Inputs?**|Yan Zeng et.al.|[2307.02469v1](http://arxiv.org/abs/2307.02469v1)|null|\n", "2307.02280": "|**2023-07-05**|**Interactive Image Segmentation with Cross-Modality Vision Transformers**|Kun Li et.al.|[2307.02280v1](http://arxiv.org/abs/2307.02280v1)|**[link](https://github.com/lik1996/icmformer)**|\n", "2307.02041": "|**2023-07-05**|**Multimodal Imbalance-Aware Gradient Modulation for Weakly-supervised Audio-Visual Video Parsing**|Jie Fu et.al.|[2307.02041v1](http://arxiv.org/abs/2307.02041v1)|null|\n", "2307.02003": "|**2023-07-05**|**Multi-Modal Prototypes for Open-Set Semantic Segmentation**|Yuhuan Yang et.al.|[2307.02003v1](http://arxiv.org/abs/2307.02003v1)|null|\n", "2307.01947": "|**2023-07-04**|**Causal Video Summarizer for Video Exploration**|Jia-Hong Huang et.al.|[2307.01947v1](http://arxiv.org/abs/2307.01947v1)|null|\n", "2307.01824": "|**2023-07-04**|**Multi-Channel Feature Extraction for Virtual Histological Staining of Photon Absorption Remote Sensing Images**|Marian Boktor et.al.|[2307.01824v1](http://arxiv.org/abs/2307.01824v1)|null|\n", "2307.01798": "|**2023-07-04**|**Edge-aware Multi-task Network for Integrating Quantification Segmentation and Uncertainty Prediction of Liver Tumor on Multi-modality Non-contrast MRI**|Xiaojiao Xiao et.al.|[2307.01798v1](http://arxiv.org/abs/2307.01798v1)|null|\n", "2307.01741": "|**2023-07-04**|**Ben-ge: Extending BigEarthNet with Geographical and Environmental Data**|Michael Mommert et.al.|[2307.01741v1](http://arxiv.org/abs/2307.01741v1)|**[link](https://github.com/hsg-aiml/ben-ge)**|\n", "2307.01704": "|**2023-07-04**|**Graph-Ensemble Learning Model for Multi-label Skin Lesion Classification using Dermoscopy and Clinical Images**|Peng Tang et.al.|[2307.01704v1](http://arxiv.org/abs/2307.01704v1)|null|\n", "2307.01691": "|**2023-07-06**|**SeePrivacy: Automated Contextual Privacy Policy Generation for Mobile Applications**|Shidong Pan et.al.|[2307.01691v2](http://arxiv.org/abs/2307.01691v2)|**[link](https://github.com/cpp4app/cpp4app)**|\n", "2307.01577": "|**2023-07-04**|**Conceptual Cognitive Maps Formation with Neural Successor Networks and Word Embeddings**|Paul Stoewer et.al.|[2307.01577v1](http://arxiv.org/abs/2307.01577v1)|null|\n", "2307.01515": "|**2023-07-04**|**LPN: Language-guided Prototypical Network for few-shot classification**|Kaihui Cheng et.al.|[2307.01515v1](http://arxiv.org/abs/2307.01515v1)|null|\n", "2307.01425": "|**2023-07-04**|**Consistent Multimodal Generation via A Unified GAN Framework**|Zhen Zhu et.al.|[2307.01425v1](http://arxiv.org/abs/2307.01425v1)|null|\n", "2307.01422": "|**2023-07-04**|**Generative Flow Networks: a Markov Chain Perspective**|Tristan Deleu et.al.|[2307.01422v1](http://arxiv.org/abs/2307.01422v1)|null|\n", "2307.03068": "|**2023-07-06**|**A Hybrid End-to-End Spatio-Temporal Attention Neural Network with Graph-Smooth Signals for EEG Emotion Recognition**|Shadi Sartipi et.al.|[2307.03068v1](http://arxiv.org/abs/2307.03068v1)|null|\n", "2307.02978": "|**2023-07-06**|**Multi-modal multi-class Parkinson disease classification using CNN and decision level fusion**|Sushanta Kumar Sahu et.al.|[2307.02978v1](http://arxiv.org/abs/2307.02978v1)|null|\n", "2307.02971": "|**2023-07-06**|**On the Cultural Gap in Text-to-Image Generation**|Bingshuai Liu et.al.|[2307.02971v1](http://arxiv.org/abs/2307.02971v1)|null|\n", "2307.02862": "|**2023-07-06**|**A Critical Look at the Current Usage of Foundation Model for Dense Recognition Task**|Shiqi Yang et.al.|[2307.02862v1](http://arxiv.org/abs/2307.02862v1)|null|\n", "2307.02796": "|**2023-07-06**|**VerifAI: Verified Generative AI**|Nan Tang et.al.|[2307.02796v1](http://arxiv.org/abs/2307.02796v1)|null|\n", "2307.02761": "|**2023-07-06**|**Cross-Modal Content Inference and Feature Enrichment for Cold-Start Recommendation**|Haokai Ma et.al.|[2307.02761v1](http://arxiv.org/abs/2307.02761v1)|null|\n", "2307.02730": "|**2023-07-06**|**Fine-grained Action Analysis: A Multi-modality and Multi-task Dataset of Figure Skating**|Sheng-Lan Liu et.al.|[2307.02730v1](http://arxiv.org/abs/2307.02730v1)|null|\n", "2307.03706": "|**2023-07-07**|**Counterion-controlled phase equilibria in a charge-regulated polymer solution**|Giulia L. Celora et.al.|[2307.03706v1](http://arxiv.org/abs/2307.03706v1)|null|\n", "2307.03638": "|**2023-07-07**|**Physical-aware Cross-modal Adversarial Network for Wearable Sensor-based Human Action Recognition**|Jianyuan Ni et.al.|[2307.03638v1](http://arxiv.org/abs/2307.03638v1)|null|\n", "2307.03623": "|**2023-07-07**|**Robust Human Detection under Visual Degradation via Thermal and mmWave Radar Fusion**|Kaiwen Cai et.al.|[2307.03623v1](http://arxiv.org/abs/2307.03623v1)|**[link](https://github.com/ramdrop/utm)**|\n", "2307.03535": "|**2023-07-07**|**Matching in the Wild: Learning Anatomical Embeddings for Multi-Modality Images**|Xiaoyu Bai et.al.|[2307.03535v1](http://arxiv.org/abs/2307.03535v1)|null|\n", "2307.03427": "|**2023-07-07**|**Merging-Diverging Hybrid Transformer Networks for Survival Prediction in Head and Neck Cancer**|Mingyuan Meng et.al.|[2307.03427v1](http://arxiv.org/abs/2307.03427v1)|**[link](https://github.com/mungomeng/survival-xsurv)**|\n", "2307.03388": "|**2023-07-07**|**General-Purpose Multimodal Transformer meets Remote Sensing Semantic Segmentation**|Nhi Kieu et.al.|[2307.03388v1](http://arxiv.org/abs/2307.03388v1)|**[link](https://github.com/nhikieu/spatialvolumetricmultimodal)**|\n", "2307.03373": "|**2023-07-07**|**All in One: Exploring Unified Vision-Language Tracking with Multi-Modal Alignment**|Chunhui Zhang et.al.|[2307.03373v1](http://arxiv.org/abs/2307.03373v1)|null|\n", "2307.03339": "|**2023-07-07**|**Open-Vocabulary Object Detection via Scene Graph Discovery**|Hengcan Shi et.al.|[2307.03339v1](http://arxiv.org/abs/2307.03339v1)|null|\n", "2307.03274": "|**2023-07-06**|**It is not Sexually Suggestive, It is Educative. Separating Sex Education from Suggestive Content on TikTok Videos**|Enfa George et.al.|[2307.03274v1](http://arxiv.org/abs/2307.03274v1)|null|\n", "2307.03240": "|**2023-07-06**|**Adaptive Generation of Privileged Intermediate Information for Visible-Infrared Person Re-Identification**|Mahdi Alehdaghi et.al.|[2307.03240v1](http://arxiv.org/abs/2307.03240v1)|null|\n", "2307.03591": "|**2023-07-06**|**Structure Guided Multi-modal Pre-trained Transformer for Knowledge Graph Reasoning**|Ke Liang et.al.|[2307.03591v1](http://arxiv.org/abs/2307.03591v1)|null|\n", "2307.04751": "|**2023-07-10**|**Shelving, Stacking, Hanging: Relational Pose Diffusion for Multi-modal Rearrangement**|Anthony Simeonov et.al.|[2307.04751v1](http://arxiv.org/abs/2307.04751v1)|null|\n", "2307.04749": "|**2023-07-10**|**Divide, Evaluate, and Refine: Evaluating and Improving Text-to-Image Alignment with Iterative VQA Feedback**|Jaskirat Singh et.al.|[2307.04749v1](http://arxiv.org/abs/2307.04749v1)|null|\n", "2307.04722": "|**2023-07-10**|**Advances and Challenges in Meta-Learning: A Technical Review**|Anna Vettoruzzo et.al.|[2307.04722v1](http://arxiv.org/abs/2307.04722v1)|null|\n", "2307.04470": "|**2023-07-10**|**Test-Time Adaptation for Nighttime Color-Thermal Semantic Segmentation**|Yexin Liu et.al.|[2307.04470v1](http://arxiv.org/abs/2307.04470v1)|null|\n", "2307.04461": "|**2023-07-10**|**Multi-modal Graph Learning over UMLS Knowledge Graphs**|Manuel Burger et.al.|[2307.04461v1](http://arxiv.org/abs/2307.04461v1)|**[link](https://github.com/ratschlab/mmugl)**|\n", "2307.04421": "|**2023-07-13**|**Towards Enabling Cardiac Digital Twins of Myocardial Infarction Using Deep Computational Models for Inverse Inference**|Lei Li et.al.|[2307.04421v2](http://arxiv.org/abs/2307.04421v2)|null|\n", "2307.04361": "|**2023-07-10**|**Enhancing Cross-lingual Transfer via Phonemic Transcription Integration**|Hoang H. Nguyen et.al.|[2307.04361v1](http://arxiv.org/abs/2307.04361v1)|**[link](https://github.com/nhhoang96/phonemic_xlingual)**|\n", "2307.04296": "|**2023-07-10**|**K-Space-Aware Cross-Modality Score for Synthesized Neuroimage Quality Assessment**|Jinbao Wang et.al.|[2307.04296v1](http://arxiv.org/abs/2307.04296v1)|null|\n", "2307.04231": "|**2023-07-09**|**Mx2M: Masked Cross-Modality Modeling in Domain Adaptation for 3D Semantic Segmentation**|Boxiang Zhang et.al.|[2307.04231v1](http://arxiv.org/abs/2307.04231v1)|null|\n", "2307.04129": "|**2023-07-09**|**Cross-modal Orthogonal High-rank Augmentation for RGB-Event Transformer-trackers**|Zhiyu Zhu et.al.|[2307.04129v1](http://arxiv.org/abs/2307.04129v1)|**[link](https://github.com/ZHU-Zhiyu/High-Rank_RGB-Event_Tracker)**|\n", "2307.04091": "|**2023-07-09**|**CMDFusion: Bidirectional Fusion Network with Cross-modality Knowledge Distillation for LIDAR Semantic Segmentation**|Jun Cen et.al.|[2307.04091v1](http://arxiv.org/abs/2307.04091v1)|null|\n", "2307.03990": "|**2023-07-08**|**FTFDNet: Learning to Detect Talking Face Video Manipulation with Tri-Modality Interaction**|Ganglai Wang et.al.|[2307.03990v1](http://arxiv.org/abs/2307.03990v1)|null|\n", "2307.03942": "|**2023-07-08**|**Ariadne's Thread:Using Text Prompts to Improve Segmentation of Infected Areas from Chest X-ray images**|Yi Zhong et.al.|[2307.03942v1](http://arxiv.org/abs/2307.03942v1)|**[link](https://github.com/junelin2333/languidemedseg-miccai2023)**|\n", "2307.03903": "|**2023-07-08**|**Adversarial Self-Attack Defense and Spatial-Temporal Relation Mining for Visible-Infrared Video Person Re-Identification**|Huafeng Li et.al.|[2307.03903v1](http://arxiv.org/abs/2307.03903v1)|null|\n", "2307.03798": "|**2023-07-07**|**CLIPMasterPrints: Fooling Contrastive Language-Image Pre-training Using Latent Variable Evolution**|Matthias Freiberger et.al.|[2307.03798v1](http://arxiv.org/abs/2307.03798v1)|**[link](https://github.com/matfrei/clipmasterprints)**|\n", "2307.05463": "|**2023-07-11**|**EgoVLPv2: Egocentric Video-Language Pre-training with Fusion in the Backbone**|Shraman Pramanick et.al.|[2307.05463v1](http://arxiv.org/abs/2307.05463v1)|null|\n", "2307.05435": "|**2023-07-11**|**One-Versus-Others Attention: Scalable Multimodal Integration**|Michal Golovanevsky et.al.|[2307.05435v1](http://arxiv.org/abs/2307.05435v1)|**[link](https://github.com/rsinghlab/ovo)**|\n", "2307.04978": "|**2023-07-11**|**Diffusion idea exploration for art generation**|Nikhil Verma et.al.|[2307.04978v1](http://arxiv.org/abs/2307.04978v1)|null|\n", "2307.06281": "|**2023-07-12**|**MMBench: Is Your Multi-modal Model an All-around Player?**|Yuan Liu et.al.|[2307.06281v1](http://arxiv.org/abs/2307.06281v1)|**[link](https://github.com/InternLM/opencompass)**|\n", "2307.06174": "|**2023-07-12**|**Identification in Multiple Treatment Models under Discrete Variation**|Vishal Kamat et.al.|[2307.06174v1](http://arxiv.org/abs/2307.06174v1)|null|\n", "2307.05591": "|**2023-07-10**|**SITTA: A Semantic Image-Text Alignment for Image Captioning**|Fabian Paischer et.al.|[2307.05591v1](http://arxiv.org/abs/2307.05591v1)|**[link](https://github.com/ml-jku/semantic-image-text-alignment)**|\n", "2307.06505": "|**2023-07-13**|**WaterScenes: A Multi-Task 4D Radar-Camera Fusion Dataset and Benchmark for Autonomous Driving on Water Surfaces**|Shanliang Yao et.al.|[2307.06505v1](http://arxiv.org/abs/2307.06505v1)|**[link](https://github.com/waterscenes/waterscenes)**|\n", "2307.06424": "|**2023-07-12**|**Robust scalable initialization for Bayesian variational inference with multi-modal Laplace approximations**|Wyatt Bridgman et.al.|[2307.06424v1](http://arxiv.org/abs/2307.06424v1)|null|\n", "2307.07453": "|**2023-07-14**|**Investigation of Deep Learning-Based Filtered Density Function for Large Eddy Simulation of Turbulent Scalar Mixing**|Shubhangi Bansude et.al.|[2307.07453v1](http://arxiv.org/abs/2307.07453v1)|null|\n", "2307.07362": "|**2023-07-14**|**A scoping review on multimodal deep learning in biomedical images and texts**|Zhaoyi Sun et.al.|[2307.07362v1](http://arxiv.org/abs/2307.07362v1)|null|\n", "2307.07341": "|**2023-07-14**|**PiTL: Cross-modal Retrieval with Weakly-supervised Vision-language Pre-training via Prompting**|Zixin Guo et.al.|[2307.07341v1](http://arxiv.org/abs/2307.07341v1)|null|\n", "2307.07184": "|**2023-07-14**|**TVPR: Text-to-Video Person Retrieval and a New Benchmark**|Fan Ni et.al.|[2307.07184v1](http://arxiv.org/abs/2307.07184v1)|null|\n", "2307.07177": "|**2023-07-14**|**TriFormer: A Multi-modal Transformer Framework For Mild Cognitive Impairment Conversion Prediction**|Linfeng Liu et.al.|[2307.07177v1](http://arxiv.org/abs/2307.07177v1)|null|\n", "2307.07142": "|**2023-07-14**|**CFI2P: Coarse-to-Fine Cross-Modal Correspondence Learning for Image-to-Point Cloud Registration**|Gongxin Yao et.al.|[2307.07142v1](http://arxiv.org/abs/2307.07142v1)|null|\n", "2307.07135": "|**2023-07-14**|**MMSD2.0: Towards a Reliable Multi-modal Sarcasm Detection System**|Libo Qin et.al.|[2307.07135v1](http://arxiv.org/abs/2307.07135v1)|**[link](https://github.com/joeying1019/mmsd2.0)**|\n", "2307.08581": "|**2023-07-17**|**BuboGPT: Enabling Visual Grounding in Multi-Modal LLMs**|Yang Zhao et.al.|[2307.08581v1](http://arxiv.org/abs/2307.08581v1)|null|\n", "2307.08492": "|**2023-07-17**|**SVDFormer: Complementing Point Cloud via Self-view Augmentation and Self-structure Dual-generator**|Zhe Zhu et.al.|[2307.08492v1](http://arxiv.org/abs/2307.08492v1)|**[link](https://github.com/czvvd/svdformer)**|\n", "2307.08415": "|**2023-07-17**|**Monocular 3D Object Detection with LiDAR Guided Semi Supervised Active Learning**|Aral Hekimoglu et.al.|[2307.08415v1](http://arxiv.org/abs/2307.08415v1)|null|\n", "2307.08339": "|**2023-07-17**|**Multi-Task Cross-Modality Attention-Fusion for 2D Object Detection**|Huawei Sun et.al.|[2307.08339v1](http://arxiv.org/abs/2307.08339v1)|null|\n", "2307.08316": "|**2023-07-17**|**Bridging the Gap: Multi-Level Cross-Modality Joint Alignment for Visible-Infrared Person Re-Identification**|Tengfei Liang et.al.|[2307.08316v1](http://arxiv.org/abs/2307.08316v1)|null|\n", "2307.08238": "|**2023-07-17**|**Unified Open-Vocabulary Dense Visual Prediction**|Hengcan Shi et.al.|[2307.08238v1](http://arxiv.org/abs/2307.08238v1)|null|\n", "2307.08233": "|**2023-07-17**|**ROFusion: Efficient Object Detection using Hybrid Point-wise Radar-Optical Fusion**|Liu Liu et.al.|[2307.08233v1](http://arxiv.org/abs/2307.08233v1)|**[link](https://github.com/liuliu-55/rofusion)**|\n", "2307.08228": "|**2023-07-17**|**Video Frame Interpolation with Stereo Event and Intensity Camera**|Chao Ding et.al.|[2307.08228v1](http://arxiv.org/abs/2307.08228v1)|null|\n", "2307.08098": "|**2023-07-16**|**CalibNet: Dual-branch Cross-modal Calibration for RGB-D Salient Instance Segmentation**|Jialun Pei et.al.|[2307.08098v1](http://arxiv.org/abs/2307.08098v1)|**[link](https://github.com/pjlallen/calibnet)**|\n", "2307.08019": "|**2023-07-16**|**A Multi-model and Multi-scenario Assessment of the Impact of Climate Change on the Heating and Cooling Load Components of an Archetypical Residential Room in Major Indian Cities**|Raj S. Srivastava et.al.|[2307.08019v1](http://arxiv.org/abs/2307.08019v1)|null|\n", "2307.08016": "|**2023-07-16**|**Breaking Down the Task: A Unit-Grained Hybrid Training Framework for Vision and Language Decision Making**|Ruipu Luo et.al.|[2307.08016v1](http://arxiv.org/abs/2307.08016v1)|null|\n", "2307.07859": "|**2023-07-15**|**Unified Adversarial Patch for Cross-modal Attacks in the Physical World**|Xingxing Wei et.al.|[2307.07859v1](http://arxiv.org/abs/2307.07859v1)|null|\n", "2307.07807": "|**2023-07-15**|**MUVF-YOLOX: A Multi-modal Ultrasound Video Fusion Network for Renal Tumor Diagnosis**|Junyu Li et.al.|[2307.07807v1](http://arxiv.org/abs/2307.07807v1)|**[link](https://github.com/jeunyuli/muaf)**|\n", "2307.07791": "|**2023-07-15**|**Joint Adversarial and Collaborative Learning for Self-Supervised Action Recognition**|Tianyu Guo et.al.|[2307.07791v1](http://arxiv.org/abs/2307.07791v1)|**[link](https://github.com/levigty/acl)**|\n", "2307.07763": "|**2023-07-15**|**Tightly-Coupled LiDAR-Visual SLAM Based on Geometric Features for Mobile Agents**|Ke Cao et.al.|[2307.07763v1](http://arxiv.org/abs/2307.07763v1)|null|\n", "2307.09356": "|**2023-07-18**|**OnlineRefer: A Simple Online Baseline for Referring Video Object Segmentation**|Dongming Wu et.al.|[2307.09356v1](http://arxiv.org/abs/2307.09356v1)|**[link](https://github.com/wudongming97/onlinerefer)**|\n", "2307.09329": "|**2023-07-18**|**Towards a performance analysis on pre-trained Visual Question Answering models for autonomous driving**|Kaavya Rekanar et.al.|[2307.09329v1](http://arxiv.org/abs/2307.09329v1)|**[link](https://github.com/kaavyarekanar/towards-a-performance-analysis-on-pre-trained-vqa-models-for-autonomous-driving)**|\n", "2307.09323": "|**2023-07-18**|**Efficient Region-Aware Neural Radiance Fields for High-Fidelity Talking Portrait Synthesis**|Jiahe Li et.al.|[2307.09323v1](http://arxiv.org/abs/2307.09323v1)|**[link](https://github.com/fictionarry/er-nerf)**|\n", "2307.09312": "|**2023-07-18**|**Multi-Modal Discussion Transformer: Integrating Text, Images and Graph Transformers to Detect Hate Speech on Social Media**|Liam Hebert et.al.|[2307.09312v1](http://arxiv.org/abs/2307.09312v1)|**[link](https://github.com/liamhebert/multimodaldiscussiontransformer)**|\n", "2307.09306": "|**2023-07-18**|**EigenTrajectory: Low-Rank Descriptors for Multi-Modal Trajectory Forecasting**|Inhwan Bae et.al.|[2307.09306v1](http://arxiv.org/abs/2307.09306v1)|**[link](https://github.com/inhwanbae/eigentrajectory)**|\n", "2307.09184": "|**2023-07-18**|**You've Got Two Teachers: Co-evolutionary Image and Report Distillation for Semi-supervised Anatomical Abnormality Detection in Chest X-ray**|Jinghan Sun et.al.|[2307.09184v1](http://arxiv.org/abs/2307.09184v1)|null|\n", "2307.09155": "|**2023-07-18**|**MLF-DET: Multi-Level Fusion for Cross-Modal 3D Object Detection**|Zewei Lin et.al.|[2307.09155v1](http://arxiv.org/abs/2307.09155v1)|null|\n", "2307.09066": "|**2023-07-18**|**PatchCT: Aligning Patch Set and Label Set with Conditional Transport for Multi-Label Image Classification**|Miaoge Li et.al.|[2307.09066v1](http://arxiv.org/abs/2307.09066v1)|**[link](https://github.com/keepgoingjkg/patchct)**|\n", "2307.09059": "|**2023-07-18**|**Unleashing the Imagination of Text: A Novel Framework for Text-to-image Person Retrieval via Exploring the Power of Words**|Delong Liu et.al.|[2307.09059v1](http://arxiv.org/abs/2307.09059v1)|null|\n", "2307.09050": "|**2023-07-18**|**R-Cut: Enhancing Explainability in Vision Transformers with Relationship Weighted Out and Cut**|Yingjie Niu et.al.|[2307.09050v1](http://arxiv.org/abs/2307.09050v1)|null|\n", "2307.09036": "|**2023-07-18**|**PromptMagician: Interactive Prompt Engineering for Text-to-Image Creation**|Yingchaojie Feng et.al.|[2307.09036v1](http://arxiv.org/abs/2307.09036v1)|**[link](https://github.com/yingchaojiefeng/promptmagician)**|\n", "2307.08991": "|**2023-07-18**|**EgoVM: Achieving Precise Ego-Localization using Lightweight Vectorized Maps**|Yuzhe He et.al.|[2307.08991v1](http://arxiv.org/abs/2307.08991v1)|null|\n", "2307.08788": "|**2023-07-17**|**Uncovering Load-Altering Attacks Against N-1 Secure Power Grids: A Rare-Event Sampling Approach**|Maldon Patrice Goodridge et.al.|[2307.08788v1](http://arxiv.org/abs/2307.08788v1)|null|\n", "2307.08752": "|**2023-07-17**|**A Re-Appraisal of CO/O$_2$ Runaway on Habitable Planets Orbiting Low-Mass Stars**|Sukrit Ranjan et.al.|[2307.08752v1](http://arxiv.org/abs/2307.08752v1)|null|\n", "2307.10094": "|**2023-07-19**|**Make-A-Volume: Leveraging Latent Diffusion Models for Cross-Modality 3D Brain MRI Synthesis**|Lingting Zhu et.al.|[2307.10094v1](http://arxiv.org/abs/2307.10094v1)|null|\n", "2307.09931": "|**2023-07-19**|**DISA: DIfferentiable Similarity Approximation for Universal Multimodal Registration**|Matteo Ronchetti et.al.|[2307.09931v1](http://arxiv.org/abs/2307.09931v1)|**[link](https://github.com/imfusiongmbh/disa-universal-multimodal-registration)**|\n", "2307.09915": "|**2023-07-19**|**Embedded Heterogeneous Attention Transformer for Cross-lingual Image Captioning**|Zijie Song et.al.|[2307.09915v1](http://arxiv.org/abs/2307.09915v1)|null|\n", "2307.09823": "|**2023-07-19**|**Multi-modal Learning based Prediction for Disease**|Yaran Chen et.al.|[2307.09823v1](http://arxiv.org/abs/2307.09823v1)|null|\n", "2307.09769": "|**2023-07-19**|**Source-Free Domain Adaptation for Medical Image Segmentation via Prototype-Anchored Feature Alignment and Contrastive Learning**|Qinji Yu et.al.|[2307.09769v1](http://arxiv.org/abs/2307.09769v1)|**[link](https://github.com/cscyqj/miccai23-protocontra-sfda)**|\n", "2307.09749": "|**2023-07-19**|**Towards Robust Scene Text Image Super-resolution via Explicit Location Enhancement**|Hang Guo et.al.|[2307.09749v1](http://arxiv.org/abs/2307.09749v1)|**[link](https://github.com/csguoh/lemma)**|\n", "2307.09721": "|**2023-07-19**|**Multi-Grained Multimodal Interaction Network for Entity Linking**|Pengfei Luo et.al.|[2307.09721v1](http://arxiv.org/abs/2307.09721v1)|**[link](https://github.com/pengfei-luo/mimic)**|\n", "2307.10810": "|**2023-07-20**|**On Combining Expert Demonstrations in Imitation Learning via Optimal Transport**|Ilana Sebag et.al.|[2307.10810v1](http://arxiv.org/abs/2307.10810v1)|null|\n", "2307.10782": "|**2023-07-20**|**See More and Know More: Zero-shot Point Cloud Segmentation via Multi-modal Visual Data**|Yuhang Lu et.al.|[2307.10782v1](http://arxiv.org/abs/2307.10782v1)|null|\n", "2307.10763": "|**2023-07-20**|**MSQNet: Actor-agnostic Action Recognition with Multi-modal Query**|Anindya Mondal et.al.|[2307.10763v1](http://arxiv.org/abs/2307.10763v1)|**[link](https://github.com/mondalanindya/msqnet)**|\n", "2307.10685": "|**2023-07-20**|**Pre-train, Adapt and Detect: Multi-Task Adapter Tuning for Camouflaged Object Detection**|Yinghui Xing et.al.|[2307.10685v1](http://arxiv.org/abs/2307.10685v1)|null|\n", "2307.10601": "|**2023-07-20**|**SCA-PVNet: Self-and-Cross Attention Based Aggregation of Point Cloud and Multi-View for 3D Object Retrieval**|Dongyun Lin et.al.|[2307.10601v1](http://arxiv.org/abs/2307.10601v1)|null|\n", "2307.10577": "|**2023-07-21**|**Ethosight: A Reasoning-Guided Iterative Learning System for Nuanced Perception based on Joint-Embedding & Contextual Label Affinity**|Hugo Latapie et.al.|[2307.10577v2](http://arxiv.org/abs/2307.10577v2)|null|\n", "2307.10519": "|**2023-07-20**|**Probabilistic Multimodal Depth Estimation Based on Camera-LiDAR Sensor Fusion**|Johan S. Obando-Ceron et.al.|[2307.10519v1](http://arxiv.org/abs/2307.10519v1)|null|\n", "2307.10490": "|**2023-07-24**|**(Ab)using Images and Sounds for Indirect Instruction Injection in Multi-Modal LLMs**|Eugene Bagdasaryan et.al.|[2307.10490v3](http://arxiv.org/abs/2307.10490v3)|**[link](https://github.com/ebagdasa/multimodal_injection)**|\n", "2307.10475": "|**2023-07-19**|**Findings of Factify 2: Multimodal Fake News Detection**|S Suryavardan et.al.|[2307.10475v1](http://arxiv.org/abs/2307.10475v1)|null|\n", "2307.11552": "|**2023-07-21**|**A multi-modal representation of El Ni\u00f1o Southern Oscillation Diversity**|Jakob Schl\u00f6r et.al.|[2307.11552v1](http://arxiv.org/abs/2307.11552v1)|**[link](https://github.com/jakob-schloer/latentgmm)**|\n", "2307.11545": "|**2023-07-21**|**Bridging Vision and Language Encoders: Parameter-Efficient Tuning for Referring Image Segmentation**|Zunnan Xu et.al.|[2307.11545v1](http://arxiv.org/abs/2307.11545v1)|**[link](https://github.com/kkakkkka/etris)**|\n", "2307.11530": "|**2023-07-21**|**UWAT-GAN: Fundus Fluorescein Angiography Synthesis via Ultra-wide-angle Transformation Multi-scale GAN**|Zhaojie Fang et.al.|[2307.11530v1](http://arxiv.org/abs/2307.11530v1)|**[link](https://github.com/Tinysqua/UWAT-GAN)**|\n", "2307.11450": "|**2023-07-21**|**Topic Identification For Spontaneous Speech: Enriching Audio Features With Embedded Linguistic Information**|Dejan Porjazovski et.al.|[2307.11450v1](http://arxiv.org/abs/2307.11450v1)|**[link](https://github.com/aalto-speech/Topic-identification-for-spontaneous-Finnish-speech)**|\n", "2307.11323": "|**2023-07-21**|**HVDetFusion: A Simple and Robust Camera-Radar Fusion Framework**|Kai Lei et.al.|[2307.11323v1](http://arxiv.org/abs/2307.11323v1)|**[link](https://github.com/hvxlab/hvdetfusion)**|\n", "2307.12964": "|**2023-07-24**|**Audio-Enhanced Text-to-Video Retrieval using Text-Conditioned Feature Alignment**|Sarah Ibrahimi et.al.|[2307.12964v1](http://arxiv.org/abs/2307.12964v1)|null|\n", "2307.12853": "|**2023-07-25**|**Spatiotemporal Modeling Encounters 3D Medical Image Analysis: Slice-Shift UNet with Multi-View Fusion**|C. I. Ugwu et.al.|[2307.12853v2](http://arxiv.org/abs/2307.12853v2)|null|\n", "2307.12732": "|**2023-07-24**|**CLIP-KD: An Empirical Study of Distilling CLIP Models**|Chuanguang Yang et.al.|[2307.12732v1](http://arxiv.org/abs/2307.12732v1)|null|\n", "2307.12626": "|**2023-07-24**|**Enhancing Human-like Multi-Modal Reasoning: A New Challenging Dataset and Comprehensive Framework**|Jingxuan Wei et.al.|[2307.12626v1](http://arxiv.org/abs/2307.12626v1)|**[link](https://github.com/weijingxuan/COCO-MMR)**|\n", "2307.12577": "|**2023-07-24**|**PRIOR: Prototype Representation Joint Learning from Medical Images and Reports**|Pujin Cheng et.al.|[2307.12577v1](http://arxiv.org/abs/2307.12577v1)|**[link](https://github.com/qtacierp/prior)**|\n", "2307.12545": "|**2023-07-24**|**Towards Video Anomaly Retrieval from Video Anomaly Detection: New Benchmarks and Model**|Peng Wu et.al.|[2307.12545v1](http://arxiv.org/abs/2307.12545v1)|null|\n", "2307.12242": "|**2023-07-23**|**HealthPrism: A Visual Analytics System for Exploring Children's Physical and Mental Health Profiles with Multimodal Data**|Zhihan Jiang et.al.|[2307.12242v1](http://arxiv.org/abs/2307.12242v1)|null|\n", "2307.12236": "|**2023-07-23**|**Multi-Modal Machine Learning for Assessing Gaming Skills in Online Streaming: A Case Study with CS:GO**|Longxiang Zhang et.al.|[2307.12236v1](http://arxiv.org/abs/2307.12236v1)|null|\n", "2307.12180": "|**2023-07-22**|**Prototype-Driven and Multi-Expert Integrated Multi-Modal MR Brain Tumor Image Segmentation**|Yafei Zhang et.al.|[2307.12180v1](http://arxiv.org/abs/2307.12180v1)|**[link](https://github.com/linzy0227/pdminet)**|\n", "2307.12067": "|**2023-07-22**|**Replay: Multi-modal Multi-view Acted Videos for Casual Holography**|Roman Shapovalov et.al.|[2307.12067v1](http://arxiv.org/abs/2307.12067v1)|**[link](https://github.com/facebookresearch/replay_dataset)**|\n", "2307.12058": "|**2023-07-22**|**Discovering Spatio-Temporal Rationales for Video Question Answering**|Yicong Li et.al.|[2307.12058v1](http://arxiv.org/abs/2307.12058v1)|null|\n", "2307.11921": "|**2023-07-21**|**Poverty rate prediction using multi-modal survey and earth observation data**|Simone Fobi et.al.|[2307.11921v1](http://arxiv.org/abs/2307.11921v1)|null|\n", "2307.13600": "|**2023-07-25**|**Decisive Data using Multi-Modality Optical Sensors for Advanced Vehicular Systems**|Muhammad Ali Farooq et.al.|[2307.13600v1](http://arxiv.org/abs/2307.13600v1)|null|\n", "2307.13537": "|**2023-07-25**|**Spectrum-guided Multi-granularity Referring Video Object Segmentation**|Bo Miao et.al.|[2307.13537v1](http://arxiv.org/abs/2307.13537v1)|**[link](https://github.com/bo-miao/sgmg)**|\n", "2307.13529": "|**2023-07-25**|**Re-mine, Learn and Reason: Exploring the Cross-modal Semantic Correlations for Language-guided HOI detection**|Yichao Cao et.al.|[2307.13529v1](http://arxiv.org/abs/2307.13529v1)|null|\n", "2307.13205": "|**2023-07-25**|**Text-oriented Modality Reinforcement Network for Multimodal Sentiment Analysis from Unaligned Multimodal Sequences**|Yuxuan Lei et.al.|[2307.13205v1](http://arxiv.org/abs/2307.13205v1)|null|\n", "2307.13125": "|**2023-07-24**|**Deep Learning Approaches for Data Augmentation in Medical Imaging: A Review**|Aghiles Kebaili et.al.|[2307.13125v1](http://arxiv.org/abs/2307.13125v1)|null|\n", "2307.13069": "|**2023-07-24**|**General-Purpose Multi-Modal OOD Detection Framework**|Viet Duong et.al.|[2307.13069v1](http://arxiv.org/abs/2307.13069v1)|null|\n", "2307.14277": "|**2023-07-26**|**G2L: Semantically Aligned and Uniform Video Grounding via Geodesic and Game Theory**|Hongxiang Li et.al.|[2307.14277v1](http://arxiv.org/abs/2307.14277v1)|null|\n", "2307.14273": "|**2023-07-26**|**Deepfake Image Generation for Improved Brain Tumor Segmentation**|Roa'a Al-Emaryeen et.al.|[2307.14273v1](http://arxiv.org/abs/2307.14273v1)|null|\n", "2307.14244": "|**2023-07-26**|**Neural-based Cross-modal Search and Retrieval of Artwork**|Yan Gong et.al.|[2307.14244v1](http://arxiv.org/abs/2307.14244v1)|null|\n", "2307.14240": "|**2023-07-26**|**Boon: A Neural Search Engine for Cross-Modal Information Retrieval**|Yan Gong et.al.|[2307.14240v1](http://arxiv.org/abs/2307.14240v1)|null|\n", "2307.14185": "|**2023-07-26**|**A comparison of machine learning surrogate models of street-scale flooding in Norfolk, Virginia**|Diana McSpadden et.al.|[2307.14185v1](http://arxiv.org/abs/2307.14185v1)|null|\n", "2307.14126": "|**2023-07-26**|**Multi-modal Learning with Missing Modality via Shared-Specific Feature Modelling**|Hu Wang et.al.|[2307.14126v1](http://arxiv.org/abs/2307.14126v1)|null|\n", "2307.14061": "|**2023-07-26**|**Set-level Guidance Attack: Boosting Adversarial Transferability of Vision-Language Pre-training Models**|Dong Lu et.al.|[2307.14061v1](http://arxiv.org/abs/2307.14061v1)|**[link](https://github.com/Zoky-2020/Set-level_Guidance_Attack)**|\n", "2307.13950": "|**2023-07-26**|**Deep Robust Multi-Robot Re-localisation in Natural Environments**|Milad Ramezani et.al.|[2307.13950v1](http://arxiv.org/abs/2307.13950v1)|null|\n", "2307.13933": "|**2023-07-26**|**AIDE: A Vision-Driven Multi-View, Multi-Modal, Multi-Tasking Dataset for Assistive Driving Perception**|Dingkang Yang et.al.|[2307.13933v1](http://arxiv.org/abs/2307.13933v1)|**[link](https://github.com/ydk122024/aide)**|\n", "2307.13925": "|**2023-07-27**|**EasyNet: An Easy Network for 3D Industrial Anomaly Detection**|Ruitao Chen et.al.|[2307.13925v2](http://arxiv.org/abs/2307.13925v2)|null|\n", "2307.13871": "|**2023-07-26**|**Emulating Expert Insight: A Robust Strategy for Optimal Experimental Design**|Matthew R. Carbone et.al.|[2307.13871v1](http://arxiv.org/abs/2307.13871v1)|**[link](https://github.com/matthewcarbone/scientificvalueagent)**|\n", "2307.15016": "|**2023-07-27**|**How Good is Google Bard's Visual Understanding? An Empirical Study on Open Challenges**|Haotong Qin et.al.|[2307.15016v1](http://arxiv.org/abs/2307.15016v1)|**[link](https://github.com/htqin/googlebard-visunderstand)**|\n", "2307.14901": "|**2023-07-27**|**Text-guided Foundation Model Adaptation for Pathological Image Classification**|Yunkun Zhang et.al.|[2307.14901v1](http://arxiv.org/abs/2307.14901v1)|**[link](https://github.com/yunkun-zhang/cite)**|\n", "2307.14889": "|**2023-07-27**|**Weakly Supervised Multi-Modal 3D Human Body Pose Estimation for Autonomous Driving**|Peter Bauer et.al.|[2307.14889v1](http://arxiv.org/abs/2307.14889v1)|null|\n", "2307.14878": "|**2023-07-27**|**MESED: A Multi-modal Entity Set Expansion Dataset with Fine-grained Semantic Classes and Hard Negative Entities**|Yangning Li et.al.|[2307.14878v1](http://arxiv.org/abs/2307.14878v1)|**[link](https://github.com/thukelab/mesed)**|\n", "2307.14682": "|**2023-07-27**|**Unified Adversarial Patch for Visible-Infrared Cross-modal Attacks in the Physical World**|Xingxing Wei et.al.|[2307.14682v1](http://arxiv.org/abs/2307.14682v1)|**[link](https://github.com/aries-iai/cross-modal_patch_attack)**|\n", "2307.14619": "|**2023-07-29**|**Imitating Complex Trajectories: Bridging Low-Level Stability and High-Level Behavior**|Adam Block et.al.|[2307.14619v2](http://arxiv.org/abs/2307.14619v2)|null|\n", "2307.14572": "|**2023-07-27**|**Non-invasive Deep-Brain Imaging with 3D Integrated Photoacoustic Tomography and Ultrasound Localization Microscopy (3D-PAULM)**|Yuqi Tang et.al.|[2307.14572v1](http://arxiv.org/abs/2307.14572v1)|null|\n", "2307.14539": "|**2023-07-26**|**Plug and Pray: Exploiting off-the-shelf components of Multi-Modal Models**|Erfan Shayegani et.al.|[2307.14539v1](http://arxiv.org/abs/2307.14539v1)|null|\n", "2307.14523": "|**2023-07-26**|**Towards multi-modal anatomical landmark detection for ultrasound-guided brain tumor resection with contrastive learning**|Soorena Salari et.al.|[2307.14523v1](http://arxiv.org/abs/2307.14523v1)|null|\n", "2307.14491": "|**2023-07-26**|**Modality-Agnostic Audio-Visual Deepfake Detection**|Cai Yu et.al.|[2307.14491v1](http://arxiv.org/abs/2307.14491v1)|null|\n", "2307.15554": "|**2023-07-28**|**'What are you referring to?' Evaluating the Ability of Multi-Modal Dialogue Models to Process Clarificational Exchanges**|Javier Chiyah-Garcia et.al.|[2307.15554v1](http://arxiv.org/abs/2307.15554v1)|**[link](https://github.com/jchiyah/what-are-you-referring-to)**|\n", "2307.15460": "|**2023-07-28**|**Cross-Modal Concept Learning and Inference for Vision-Language Models**|Yi Zhang et.al.|[2307.15460v1](http://arxiv.org/abs/2307.15460v1)|null|\n", "2307.15432": "|**2023-07-28**|**CFN-ESA: A Cross-Modal Fusion Network with Emotion-Shift Awareness for Dialogue Emotion Recognition**|Jiang Li et.al.|[2307.15432v1](http://arxiv.org/abs/2307.15432v1)|null|\n", "2307.15344": "|**2023-07-28**|**Improving Audio-Text Retrieval via Hierarchical Cross-Modal Interaction and Auxiliary Captions**|Yifei Xin et.al.|[2307.15344v1](http://arxiv.org/abs/2307.15344v1)|null|\n", "2307.15220": "|**2023-07-27**|**Learning Multi-modal Representations by Watching Hundreds of Surgical Video Lectures**|Kun Yuan et.al.|[2307.15220v1](http://arxiv.org/abs/2307.15220v1)|**[link](https://github.com/camma-public/surgvlp)**|\n", "2307.15167": "|**2023-07-27**|**PEANUT: A Human-AI Collaborative Tool for Annotating Audio-Visual Data**|Zheng Zhang et.al.|[2307.15167v1](http://arxiv.org/abs/2307.15167v1)|null|\n", "2307.15097": "|**2023-07-27**|**Cascaded Cross-Modal Transformer for Request and Complaint Detection**|Nicolae-Catalin Ristea et.al.|[2307.15097v1](http://arxiv.org/abs/2307.15097v1)|null|\n", "2307.16896": "|**2023-07-31**|**Disruptive Autoencoders: Leveraging Low-level features for 3D Medical Image Pre-training**|Jeya Maria Jose Valanarasu et.al.|[2307.16896v1](http://arxiv.org/abs/2307.16896v1)|null|\n", "2307.16847": "|**2023-07-31**|**Latent Masking for Multimodal Self-supervised Learning in Health Timeseries**|Shohreh Deldari et.al.|[2307.16847v1](http://arxiv.org/abs/2307.16847v1)|null|\n", "2307.16745": "|**2023-07-31**|**Advancing Smart Malnutrition Monitoring: A Multi-Modal Learning Approach for Vital Health Parameter Estimation**|Ashish Marisetty et.al.|[2307.16745v1](http://arxiv.org/abs/2307.16745v1)|null|\n", "2307.16617": "|**2023-07-31**|**FULLER: Unified Multi-modality Multi-task 3D Perception via Multi-level Gradient Calibration**|Zhijian Huang et.al.|[2307.16617v1](http://arxiv.org/abs/2307.16617v1)|null|\n", "2307.16532": "|**2023-07-31**|**Echoes Beyond Points: Unleashing the Power of Raw Radar Data in Multi-modality Fusion**|Yang Liu et.al.|[2307.16532v1](http://arxiv.org/abs/2307.16532v1)|null|\n", "2307.16395": "|**2023-07-31**|**Bridging the Gap: Exploring the Capabilities of Bridge-Architectures for Complex Visual Reasoning Tasks**|Kousik Rajesh et.al.|[2307.16395v1](http://arxiv.org/abs/2307.16395v1)|null|\n", "2307.16366": "|**2023-07-31**|**Multi-modal Graph Neural Network for Early Diagnosis of Alzheimer's Disease from sMRI and PET Scans**|Yanteng Zhanga et.al.|[2307.16366v1](http://arxiv.org/abs/2307.16366v1)|null|\n", "2307.16210": "|**2023-08-01**|**Rethinking Uncertainly Missing and Ambiguous Visual Modality in Multi-Modal Entity Alignment**|Zhuo Chen et.al.|[2307.16210v2](http://arxiv.org/abs/2307.16210v2)|**[link](https://github.com/zjukg/umaea)**|\n", "2307.16142": "|**2023-07-30**|**Implicit Neural Representation in Medical Imaging: A Comparative Survey**|Amirali Molaei et.al.|[2307.16142v1](http://arxiv.org/abs/2307.16142v1)|**[link](https://github.com/mindflow-institue/awesome-implicit-neural-representations-in-medical-imaging)**|\n", "2307.16121": "|**2023-07-30**|**Uncertainty-Encoded Multi-Modal Fusion for Robust Object Detection in Autonomous Driving**|Yang Lou et.al.|[2307.16121v1](http://arxiv.org/abs/2307.16121v1)|null|\n", "2307.16106": "|**2023-07-30**|**TransFusion: A Practical and Effective Transformer-based Diffusion Model for 3D Human Motion Prediction**|Sibo Tian et.al.|[2307.16106v1](http://arxiv.org/abs/2307.16106v1)|null|\n", "2307.16013": "|**2023-07-29**|**Marrying Dialogue Systems with Data Visualization: Interactive Data Visualization Generation from Natural Language Conversations**|Yuanfeng Song et.al.|[2307.16013v1](http://arxiv.org/abs/2307.16013v1)|null|\n", "2307.15988": "|**2023-07-29**|**RGB-D-Fusion: Image Conditioned Depth Diffusion of Humanoid Subjects**|Sascha Kirch et.al.|[2307.15988v1](http://arxiv.org/abs/2307.15988v1)|**[link](https://github.com/sascha-kirch/rgb-d-fusion)**|\n", "2307.15942": "|**2023-07-29**|**CMDA: Cross-Modality Domain Adaptation for Nighttime Semantic Segmentation**|Ruihao Xia et.al.|[2307.15942v1](http://arxiv.org/abs/2307.15942v1)|**[link](https://github.com/xiarho/cmda)**|\n", "2307.15872": "|**2023-07-29**|**Cross-dimensional transfer learning in medical image segmentation with deep learning**|Hicham Messaoudi et.al.|[2307.15872v1](http://arxiv.org/abs/2307.15872v1)|**[link](https://github.com/hic-messaoudi/cross-dimensional-transfer-learning-in-medical-image-segmentation-with-deep-learning)**|\n", "2308.00692": "|**2023-08-03**|**LISA: Reasoning Segmentation via Large Language Model**|Xin Lai et.al.|[2308.00692v2](http://arxiv.org/abs/2308.00692v2)|**[link](https://github.com/dvlab-research/lisa)**|\n", "2308.00628": "|**2023-08-01**|**Human-M3: A Multi-view Multi-modal Dataset for 3D Human Pose Estimation in Outdoor Scenes**|Bohao Fan et.al.|[2308.00628v1](http://arxiv.org/abs/2308.00628v1)|**[link](https://github.com/soullessrobot/human-m3-dataset)**|\n", "2308.00588": "|**2023-08-01**|**Relation-Aware Distribution Representation Network for Person Clustering with Multiple Modalities**|Kaijian Liu et.al.|[2308.00588v1](http://arxiv.org/abs/2308.00588v1)|null|\n", "2308.00330": "|**2023-08-01**|**Advancing Frame-Dropping in Multi-Object Tracking-by-Detection Systems Through Event-Based Detection Triggering**|Matti Henning et.al.|[2308.00330v1](http://arxiv.org/abs/2308.00330v1)|null|\n", "2308.00295": "|**2023-08-01**|**Making the V in Text-VQA Matter**|Shamanthak Hegde et.al.|[2308.00295v1](http://arxiv.org/abs/2308.00295v1)|null|\n", "2308.00291": "|**2023-08-01**|**Fundus-Enhanced Disease-Aware Distillation Model for Retinal Disease Classification from OCT Images**|Lehan Wang et.al.|[2308.00291v1](http://arxiv.org/abs/2308.00291v1)|**[link](https://github.com/xmed-lab/fddm)**|\n", "2308.00264": "|**2023-08-01**|**Multi-Modality Multi-Loss Fusion Network**|Zehui Wu et.al.|[2308.00264v1](http://arxiv.org/abs/2308.00264v1)|null|\n", "2308.00235": "|**2023-08-01**|**Demonstrating Autonomous 3D Path Planning on a Novel Scalable UGV-UAV Morphing Robot**|Eric Sihite et.al.|[2308.00235v1](http://arxiv.org/abs/2308.00235v1)|null|\n", "2308.00228": "|**2023-08-01**|**Using Scene and Semantic Features for Multi-modal Emotion Recognition**|Zhifeng Wang et.al.|[2308.00228v1](http://arxiv.org/abs/2308.00228v1)|null|\n", "2307.16620": "|**2023-08-01**|**Audio-Visual Segmentation by Exploring Cross-Modal Mutual Semantics**|Chen Liu et.al.|[2307.16620v2](http://arxiv.org/abs/2307.16620v2)|null|\n", "2308.01217": "|**2023-08-02**|**TeachCLIP: Multi-Grained Teaching for Efficient Text-to-Video Retrieval**|Kaibin Tian et.al.|[2308.01217v1](http://arxiv.org/abs/2308.01217v1)|null|\n", "2308.01147": "|**2023-08-02**|**Contrast-augmented Diffusion Model with Fine-grained Sequence Alignment for Markup-to-Image Generation**|Guojin Zhong et.al.|[2308.01147v1](http://arxiv.org/abs/2308.01147v1)|**[link](https://github.com/zgj77/fsacdm)**|\n", "2308.01006": "|**2023-08-03**|**FusionAD: Multi-modality Fusion for Prediction and Planning Tasks of Autonomous Driving**|Tengju Ye et.al.|[2308.01006v2](http://arxiv.org/abs/2308.01006v2)|**[link](https://github.com/westlake-autolab/fusionad)**|\n", "2308.00980": "|**2023-08-02**|**Grasp Stability Assessment Through Attention-Guided Cross-Modality Fusion and Transfer Learning**|Zhuangzhuang Zhang et.al.|[2308.00980v1](http://arxiv.org/abs/2308.00980v1)|null|\n", "2308.00906": "|**2023-08-02**|**ImageBrush: Learning Visual In-Context Instructions for Exemplar-Based Image Manipulation**|Yasheng Sun et.al.|[2308.00906v1](http://arxiv.org/abs/2308.00906v1)|null|\n", "2308.00856": "|**2023-08-01**|**Differential Privacy for Adaptive Weight Aggregation in Federated Tumor Segmentation**|Muhammad Irfan Khan et.al.|[2308.00856v1](http://arxiv.org/abs/2308.00856v1)|null|\n", "2308.01731": "|**2023-08-03**|**Quantification of Predictive Uncertainty via Inference-Time Sampling**|Katar\u00edna T\u00f3thov\u00e1 et.al.|[2308.01731v1](http://arxiv.org/abs/2308.01731v1)|null|\n", "2308.01546": "|**2023-08-03**|**MusicLDM: Enhancing Novelty in Text-to-Music Generation Using Beat-Synchronous Mixup Strategies**|Ke Chen et.al.|[2308.01546v1](http://arxiv.org/abs/2308.01546v1)|**[link](https://github.com/retrocirce/musicldm)**|\n", "2308.01526": "|**2023-08-03**|**Data Augmentation for Human Behavior Analysis in Multi-Person Conversations**|Kun Li et.al.|[2308.01526v1](http://arxiv.org/abs/2308.01526v1)|null|\n", "2308.01328": "|**2023-08-02**|**A vision transformer-based framework for knowledge transfer from multi-modal to mono-modal lymphoma subtyping models**|Bilel Guetarni et.al.|[2308.01328v1](http://arxiv.org/abs/2308.01328v1)|null|\n", "2308.02487": "|**2023-08-04**|**Convolutions Die Hard: Open-Vocabulary Segmentation with Single Frozen Convolutional CLIP**|Qihang Yu et.al.|[2308.02487v1](http://arxiv.org/abs/2308.02487v1)|**[link](https://github.com/bytedance/fc-clip)**|\n", "2308.02463": "|**2023-08-04**|**Towards Generalist Foundation Model for Radiology**|Chaoyi Wu et.al.|[2308.02463v1](http://arxiv.org/abs/2308.02463v1)|**[link](https://github.com/chaoyi-wu/radfm)**|\n", "2308.02239": "|**2023-08-04**|**DTF-Net: Category-Level Pose Estimation and Shape Reconstruction via Deformable Template Field**|Haowen Wang et.al.|[2308.02239v1](http://arxiv.org/abs/2308.02239v1)|null|\n", "2308.02097": "|**2023-08-04**|**Multi-interactive Feature Learning and a Full-time Multi-modality Benchmark for Image Fusion and Segmentation**|Jinyuan Liu et.al.|[2308.02097v1](http://arxiv.org/abs/2308.02097v1)|**[link](https://github.com/jinyuanliu-cv/segmif)**|\n", "2308.01994": "|**2023-08-03**|**Explainable unsupervised multi-modal image registration using deep networks**|Chengjia Wang et.al.|[2308.01994v1](http://arxiv.org/abs/2308.01994v1)|null|\n", "2308.02299": "|**2023-08-03**|**RegionBLIP: A Unified Multi-modal Pre-training Framework for Holistic and Regional Comprehension**|Qiang Zhou et.al.|[2308.02299v1](http://arxiv.org/abs/2308.02299v1)|**[link](https://github.com/mightyzau/regionblip)**|\n", "2308.03729": "|**2023-08-07**|**Tiny LVLM-eHub: Early Multimodal Experiments with Bard**|Wenqi Shao et.al.|[2308.03729v1](http://arxiv.org/abs/2308.03729v1)|**[link](https://github.com/opengvlab/multi-modality-arena)**|\n", "2308.03666": "|**2023-08-07**|**Bridging Trustworthiness and Open-World Learning: An Exploratory Neural Approach for Enhancing Interpretability, Generalization, and Robustness**|Shide Du et.al.|[2308.03666v1](http://arxiv.org/abs/2308.03666v1)|null|\n", "2308.03475": "|**2023-08-07**|**COPA: Efficient Vision-Language Pre-training Through Collaborative Object- and Patch-Text Alignment**|Chaoya Jiang et.al.|[2308.03475v1](http://arxiv.org/abs/2308.03475v1)|null|\n", "2308.03432": "|**2023-08-07**|**Cuing Without Sharing: A Federated Cued Speech Recognition Framework via Mutual Knowledge Distillation**|Yuxuan Zhang et.al.|[2308.03432v1](http://arxiv.org/abs/2308.03432v1)|**[link](https://github.com/yuxuanzhang0713/fedcsr)**|\n", "2308.03424": "|**2023-08-07**|**CAESURA: Language Models as Multi-Modal Query Planners**|Matthias Urban et.al.|[2308.03424v1](http://arxiv.org/abs/2308.03424v1)|null|\n", "2308.03267": "|**2023-08-07**|**Redundancy-aware Transformer for Video Question Answering**|Yicong Li et.al.|[2308.03267v1](http://arxiv.org/abs/2308.03267v1)|null|\n", "2308.03256": "|**2023-08-07**|**Learning a Graph Neural Network with Cross Modality Interaction for Image Fusion**|Jiawei Li et.al.|[2308.03256v1](http://arxiv.org/abs/2308.03256v1)|**[link](https://github.com/lok-18/ignet)**|\n", "2308.03151": "|**2023-08-06**|**Food-500 Cap: A Fine-Grained Food Caption Benchmark for Evaluating Vision-Language Models**|Zheng Ma et.al.|[2308.03151v1](http://arxiv.org/abs/2308.03151v1)|**[link](https://github.com/aaronma2020/Food500-Cap)**|\n", "2308.03135": "|**2023-08-06**|**E-CLIP: Towards Label-efficient Event-based Open-world Understanding by CLIP**|Jiazhou Zhou et.al.|[2308.03135v1](http://arxiv.org/abs/2308.03135v1)|null|\n", "2308.02982": "|**2023-08-06**|**Beyond First Impressions: Integrating Joint Multi-modal Cues for Comprehensive 3D Representation**|Haowei Wang et.al.|[2308.02982v1](http://arxiv.org/abs/2308.02982v1)|**[link](https://github.com/mr-neko/jm3d)**|\n", "2308.02883": "|**2023-08-05**|**Cross-modal & Cross-domain Learning for Unsupervised LiDAR Semantic Segmentation**|Yiyang Chen et.al.|[2308.02883v1](http://arxiv.org/abs/2308.02883v1)|null|\n", "2308.02872": "|**2023-08-05**|**Data-Based Design of Multi-Model Inferential Sensors**|Martin Mojto et.al.|[2308.02872v1](http://arxiv.org/abs/2308.02872v1)|null|\n", "2308.02823": "|**2023-08-05**|**A Symbolic Character-Aware Model for Solving Geometry Problems**|Maizhen Ning et.al.|[2308.02823v1](http://arxiv.org/abs/2308.02823v1)|**[link](https://github.com/ning-mz/sca-gps)**|\n", "2308.04369": "|**2023-08-08**|**SSTFormer: Bridging Spiking Neural Network and Memory Support Transformer for Frame-Event based Recognition**|Xiao Wang et.al.|[2308.04369v1](http://arxiv.org/abs/2308.04369v1)|**[link](https://github.com/event-ahu/sstformer)**|\n", "2308.04352": "|**2023-08-08**|**3D-VisTA: Pre-trained Transformer for 3D Vision and Text Alignment**|Ziyu Zhu et.al.|[2308.04352v1](http://arxiv.org/abs/2308.04352v1)|null|\n", "2308.04343": "|**2023-08-08**|**Unifying Two-Stream Encoders with Transformers for Cross-Modal Retrieval**|Yi Bin et.al.|[2308.04343v1](http://arxiv.org/abs/2308.04343v1)|**[link](https://github.com/luminosityx/hat)**|\n", "2308.04126": "|**2023-08-08**|**OmniDataComposer: A Unified Data Structure for Multimodal Data Fusion and Infinite Data Generation**|Dongyang Yu et.al.|[2308.04126v1](http://arxiv.org/abs/2308.04126v1)|**[link](https://github.com/shajiayu1/OmniDataComposer)**|\n", "2308.04067": "|**2023-08-08**|**Online Distillation-enhanced Multi-modal Transformer for Sequential Recommendation**|Wei Ji et.al.|[2308.04067v1](http://arxiv.org/abs/2308.04067v1)|**[link](https://github.com/xyliugo/odmt)**|\n", "2308.03908": "|**2023-08-07**|**ViLP: Knowledge Exploration using Vision, Language, and Pose Embeddings for Video Action Recognition**|Soumyabrata Chaudhuri et.al.|[2308.03908v1](http://arxiv.org/abs/2308.03908v1)|null|\n", "2308.05061": "|**2023-08-09**|**Prompting In-Context Operator Learning with Sensor Data, Equations, and Natural Language**|Liu Yang et.al.|[2308.05061v1](http://arxiv.org/abs/2308.05061v1)|**[link](https://github.com/liuyangmage/in-context-operator-networks)**|\n", "2308.04992": "|**2023-08-09**|**AspectMMKG: A Multi-modal Knowledge Graph with Aspect-aware Entities**|Jingdan Zhang et.al.|[2308.04992v1](http://arxiv.org/abs/2308.04992v1)|**[link](https://github.com/thezjd/aspectmmkg)**|\n", "2308.04829": "|**2023-08-09**|**MixReorg: Cross-Modal Mixed Patch Reorganization is a Good Mask Learner for Open-World Semantic Segmentation**|Kaixin Cai et.al.|[2308.04829v1](http://arxiv.org/abs/2308.04829v1)|null|\n", "2308.04820": "|**2023-08-09**|**Strategic Interactions in Multi-modal Mobility Systems: A Game-Theoretic Perspective**|Gioele Zardini et.al.|[2308.04820v1](http://arxiv.org/abs/2308.04820v1)|null|\n", "2308.04779": "|**2023-08-09**|**Multi-View Fusion and Distillation for Subgrade Distresses Detection based on 3D-GPR**|Chunpeng Zhou et.al.|[2308.04779v1](http://arxiv.org/abs/2308.04779v1)|null|\n", "2308.04778": "|**2023-08-09**|**Multi-modal Multi-view Clustering based on Non-negative Matrix Factorization**|Yasser Khalafaoui et.al.|[2308.04778v1](http://arxiv.org/abs/2308.04778v1)|null|\n", "2308.04706": "|**2023-08-09**|**Pareto Invariant Representation Learning for Multimedia Recommendation**|Shanshan Huang et.al.|[2308.04706v1](http://arxiv.org/abs/2308.04706v1)|null|\n", "2308.04702": "|**2023-08-09**|**Continual Road-Scene Semantic Segmentation via Feature-Aligned Symmetric Multi-Modal Network**|Francesco Barbato et.al.|[2308.04702v1](http://arxiv.org/abs/2308.04702v1)|null|\n", "2308.04663": "|**2023-08-09**|**Classification of lung cancer subtypes on CT images with synthetic pathological priors**|Wentao Zhu et.al.|[2308.04663v1](http://arxiv.org/abs/2308.04663v1)|null|\n", "2308.04579": "|**2023-08-08**|**RECipe: Does a Multi-Modal Recipe Knowledge Graph Fit a Multi-Purpose Recommendation System?**|Ali Pesaranghader et.al.|[2308.04579v1](http://arxiv.org/abs/2308.04579v1)|null|\n", "2308.04556": "|**2023-08-08**|**FocalFormer3D : Focusing on Hard Instance for 3D Object Detection**|Yilun Chen et.al.|[2308.04556v1](http://arxiv.org/abs/2308.04556v1)|**[link](https://github.com/NVlabs/FocalFormer3D)**|\n", "2308.05667": "|**2023-08-14**|**2D3D-MATR: 2D-3D Matching Transformer for Detection-free Registration between Images and Point Clouds**|Minhao Li et.al.|[2308.05667v2](http://arxiv.org/abs/2308.05667v2)|**[link](https://github.com/minhaolee/2d3dmatr)**|\n", "2308.05648": "|**2023-08-10**|**Counterfactual Cross-modality Reasoning for Weakly Supervised Video Moment Localization**|Zezhong Lv et.al.|[2308.05648v1](http://arxiv.org/abs/2308.05648v1)|**[link](https://github.com/sldz0306/ccr)**|\n", "2308.05478": "|**2023-08-10**|**Reviewing 3D Object Detectors in the Context of High-Resolution 3+1D Radar**|Patrick Palmer et.al.|[2308.05478v1](http://arxiv.org/abs/2308.05478v1)|null|\n", "2308.05438": "|**2023-08-10**|**Deep Fusion Transformer Network with Weighted Vector-Wise Keypoints Voting for Robust 6D Object Pose Estimation**|Jun Zhou et.al.|[2308.05438v1](http://arxiv.org/abs/2308.05438v1)|**[link](https://github.com/junzastar/dftr_voting)**|\n", "2308.05421": "|**2023-08-10**|**Progressive Spatio-temporal Perception for Audio-Visual Question Answering**|Guangyao Li et.al.|[2308.05421v1](http://arxiv.org/abs/2308.05421v1)|**[link](https://github.com/gewu-lab/pstp-net)**|\n", "2308.05128": "|**2023-08-09**|**High-Level Features Parallelization for Inference Cost Reduction Through Selective Attention**|Andr\u00e9 Peter Kelm et.al.|[2308.05128v1](http://arxiv.org/abs/2308.05128v1)|null|\n", "2308.06262": "|**2023-08-11**|**Foundation Model is Efficient Multimodal Multitask Model Selector**|Fanqing Meng et.al.|[2308.06262v1](http://arxiv.org/abs/2308.06262v1)|**[link](https://github.com/opengvlab/multitask-model-selector)**|\n", "2308.06207": "|**2023-08-11**|**Thinking Like an Expert:Multimodal Hypergraph-of-Thought (HoT) Reasoning to boost Foundation Modals**|Fanglong Yao et.al.|[2308.06207v1](http://arxiv.org/abs/2308.06207v1)|null|\n", "2308.06125": "|**2023-08-11**|**Improving Joint Speech-Text Representations Without Alignment**|Cal Peyser et.al.|[2308.06125v1](http://arxiv.org/abs/2308.06125v1)|null|\n", "2308.06024": "|**2023-08-11**|**Spatial-information Guided Adaptive Context-aware Network for Efficient RGB-D Semantic Segmentation**|Yang Zhang et.al.|[2308.06024v1](http://arxiv.org/abs/2308.06024v1)|**[link](https://github.com/mvme-hbut/sgacnet)**|\n", "2308.06009": "|**2023-08-11**|**ViGT: Proposal-free Video Grounding with Learnable Token in Transformer**|Kun Li et.al.|[2308.06009v1](http://arxiv.org/abs/2308.06009v1)|null|\n", "2308.05993": "|**2023-08-11**|**Image-based Geolocalization by Ground-to-2.5D Map Matching**|Mengjie Zhou et.al.|[2308.05993v1](http://arxiv.org/abs/2308.05993v1)|**[link](https://github.com/zhoumengjie/2-5dmap-dataset)**|\n", "2308.05948": "|**2023-08-11**|**Uncertainty-Aware Cross-Modal Transfer Network for Sketch-Based 3D Shape Retrieval**|Yiyang Cai et.al.|[2308.05948v1](http://arxiv.org/abs/2308.05948v1)|null|\n", "2308.05864": "|**2023-08-10**|**The Multi-modality Cell Segmentation Challenge: Towards Universal Solutions**|Jun Ma et.al.|[2308.05864v1](http://arxiv.org/abs/2308.05864v1)|null|\n", "2308.07222": "|**2023-08-14**|**MM-GEF: Multi-modal representation meet collaborative filtering**|Hao Wu et.al.|[2308.07222v1](http://arxiv.org/abs/2308.07222v1)|null|\n", "2308.07214": "|**2023-08-14**|**Automated Ensemble-Based Segmentation of Adult Brain Tumors: A Novel Approach Using the BraTS AFRICA Challenge Data**|Chiranjeewee Prasad Koirala et.al.|[2308.07214v1](http://arxiv.org/abs/2308.07214v1)|null|\n", "2308.07173": "|**2023-08-14**|**Enhancing State Estimator for Autonomous Race Car : Leveraging Multi-modal System and Managing Computing Resources**|Daegyu Lee et.al.|[2308.07173v1](http://arxiv.org/abs/2308.07173v1)|null|\n", "2308.07146": "|**2023-08-14**|**CTP: Towards Vision-Language Continual Pretraining via Compatible Momentum Contrast and Topology Preservation**|Hongguang Zhu et.al.|[2308.07146v1](http://arxiv.org/abs/2308.07146v1)|**[link](https://github.com/kevinlight831/ctp)**|\n", "2308.07026": "|**2023-08-14**|**AdvCLIP: Downstream-agnostic Adversarial Examples in Multimodal Contrastive Learning**|Ziqi Zhou et.al.|[2308.07026v1](http://arxiv.org/abs/2308.07026v1)|**[link](https://github.com/cgcl-codes/advclip)**|\n", "2308.06911": "|**2023-08-14**|**GIT-Mol: A Multi-modal Large Language Model for Molecular Science with Graph, Image, and Text**|Pengfei Liu et.al.|[2308.06911v1](http://arxiv.org/abs/2308.06911v1)|null|\n", "2308.06866": "|**2023-08-13**|**Improving Face Recognition from Caption Supervision with Multi-Granular Contextual Feature Aggregation**|Md Mahedi Hasan et.al.|[2308.06866v1](http://arxiv.org/abs/2308.06866v1)|null|\n", "2308.06735": "|**2023-08-13**|**AerialVLN: Vision-and-Language Navigation for UAVs**|Shubo Liu et.al.|[2308.06735v1](http://arxiv.org/abs/2308.06735v1)|**[link](https://github.com/airvln/airvln)**|\n", "2308.06696": "|**2023-08-13**|**MACO: A Modality Adversarial and Contrastive Framework for Modality-missing Multi-modal Knowledge Graph Completion**|Yichi Zhang et.al.|[2308.06696v1](http://arxiv.org/abs/2308.06696v1)|**[link](https://github.com/zjukg/maco)**|\n", "2308.06573": "|**2023-08-12**|**4DRVO-Net: Deep 4D Radar-Visual Odometry Using Multi-Modal and Multi-Scale Adaptive Fusion**|Guirong Zhuo et.al.|[2308.06573v1](http://arxiv.org/abs/2308.06573v1)|null|\n", "2308.06556": "|**2023-08-12**|**Contrastive Learning for Cross-modal Artist Retrieval**|Andres Ferraro et.al.|[2308.06556v1](http://arxiv.org/abs/2308.06556v1)|null|\n", "2308.06530": "|**2023-08-12**|**BEV-DG: Cross-Modal Learning under Bird's-Eye View for Domain Generalization of 3D Semantic Segmentation**|Miaoyu Li et.al.|[2308.06530v1](http://arxiv.org/abs/2308.06530v1)|null|\n", "2308.06498": "|**2023-08-12**|**Latent Emission-Augmented Perspective-Taking (LEAPT) for Human-Robot Interaction**|Kaiqi Chen et.al.|[2308.06498v1](http://arxiv.org/abs/2308.06498v1)|null|\n", "2308.06394": "|**2023-08-11**|**Detecting and Preventing Hallucinations in Large Vision Language Models**|Anisha Gunjal et.al.|[2308.06394v1](http://arxiv.org/abs/2308.06394v1)|null|\n", "2308.06377": "|**2023-08-11**|**CATS v2: Hybrid encoders for robust medical segmentation**|Hao Li et.al.|[2308.06377v1](http://arxiv.org/abs/2308.06377v1)|**[link](https://github.com/haoli12345/cats)**|\n", "2308.07907": "|**2023-08-15**|**Sequential Monte Carlo with Cross-validated Neural Networks for Complexity of Hyperbolic Black Hole Solutions in 4D**|Armin Hatefi et.al.|[2308.07907v1](http://arxiv.org/abs/2308.07907v1)|null|\n", "2308.07777": "|**2023-08-15**|**Enhancing Visually-Rich Document Understanding via Layout Structure Modeling**|Qiwei Li et.al.|[2308.07777v1](http://arxiv.org/abs/2308.07777v1)|null|\n", "2308.07751": "|**2023-08-15**|**CASPNet++: Joint Multi-Agent Motion Prediction**|Maximilian Sch\u00e4fer et.al.|[2308.07751v1](http://arxiv.org/abs/2308.07751v1)|null|\n", "2308.07732": "|**2023-08-15**|**UniTR: A Unified and Efficient Multi-Modal Transformer for Bird's-Eye-View Representation**|Haiyang Wang et.al.|[2308.07732v1](http://arxiv.org/abs/2308.07732v1)|**[link](https://github.com/haiyang-w/unitr)**|\n", "2308.07686": "|**2023-08-15**|**Boosting Multi-modal Model Performance with Adaptive Gradient Modulation**|Hong Li et.al.|[2308.07686v1](http://arxiv.org/abs/2308.07686v1)|**[link](https://github.com/lihong2303/agm_iccv2023)**|\n", "2308.07648": "|**2023-08-15**|**Prompt Switch: Efficient CLIP Adaptation for Text-Video Retrieval**|Chaorui Deng et.al.|[2308.07648v1](http://arxiv.org/abs/2308.07648v1)|**[link](https://github.com/bladewaltz1/promptswitch)**|\n", "2308.07622": "|**2023-08-15**|**EMID: An Emotional Aligned Dataset in Audio-Visual Modality**|Jialing Zou et.al.|[2308.07622v1](http://arxiv.org/abs/2308.07622v1)|**[link](https://github.com/ecnu-aigc/emid)**|\n", "2308.07605": "|**2023-08-15**|**SGDiff: A Style Guided Diffusion Model for Fashion Synthesis**|Zhengwentai Sun et.al.|[2308.07605v1](http://arxiv.org/abs/2308.07605v1)|**[link](https://github.com/taited/sgdiff)**|\n", "2308.08546": "|**2023-08-16**|**What is the source of the PTA GW signal?**|John Ellis et.al.|[2308.08546v1](http://arxiv.org/abs/2308.08546v1)|null|\n", "2308.08409": "|**2023-08-16**|**X-PSI Parameter Recovery for Temperature Map Configurations Inspired by PSR J0030+0451**|Serena Vinciguerra et.al.|[2308.08409v1](http://arxiv.org/abs/2308.08409v1)|null|\n", "2308.08303": "|**2023-08-16**|**Leveraging Next-Active Objects for Context-Aware Anticipation in Egocentric Videos**|Sanket Thakur et.al.|[2308.08303v1](http://arxiv.org/abs/2308.08303v1)|null|\n", "2308.08157": "|**2023-08-16**|**Learning to Generate Semantic Layouts for Higher Text-Image Correspondence in Text-to-Image Synthesis**|Minho Park et.al.|[2308.08157v1](http://arxiv.org/abs/2308.08157v1)|**[link](https://github.com/pmh9960/GCDP)**|\n", "2308.08143": "|**2023-08-16**|**SCANet: A Self- and Cross-Attention Network for Audio-Visual Speech Separation**|Kai Li et.al.|[2308.08143v1](http://arxiv.org/abs/2308.08143v1)|null|\n", "2308.08125": "|**2023-08-16**|**Radio2Text: Streaming Speech Recognition Using mmWave Radio Signals**|Running Zhao et.al.|[2308.08125v1](http://arxiv.org/abs/2308.08125v1)|null|\n", "2308.08088": "|**2023-08-16**|**Pro-Cap: Leveraging a Frozen Vision-Language Model for Hateful Meme Detection**|Rui Cao et.al.|[2308.08088v1](http://arxiv.org/abs/2308.08088v1)|**[link](https://github.com/social-ai-studio/pro-cap)**|\n", "2308.09622": "|**2023-08-18**|**Is context all you need? Scaling Neural Sign Language Translation to Large Domains of Discourse**|Ozge Mercanoglu Sincan et.al.|[2308.09622v1](http://arxiv.org/abs/2308.09622v1)|null|\n", "2308.09599": "|**2023-08-18**|**Language-Guided Diffusion Model for Visual Grounding**|Sijia Chen et.al.|[2308.09599v1](http://arxiv.org/abs/2308.09599v1)|null|\n", "2308.09568": "|**2023-08-18**|**PUMGPT: A Large Vision-Language Model for Product Understanding**|Shuhui Wu et.al.|[2308.09568v1](http://arxiv.org/abs/2308.09568v1)|null|\n", "2308.09475": "|**2023-08-18**|**Video-Instrument Synergistic Network for Referring Video Instrument Segmentation in Robotic Surgery**|Hongqiu Wang et.al.|[2308.09475v1](http://arxiv.org/abs/2308.09475v1)|null|\n", "2308.09469": "|**2023-08-18**|**An updated mass-radius analysis of the 2017-2018 NICER data set of PSR J0030+0451**|Serena Vinciguerra et.al.|[2308.09469v1](http://arxiv.org/abs/2308.09469v1)|null|\n", "2308.09442": "|**2023-08-21**|**BioMedGPT: Open Multimodal Generative Pre-trained Transformer for BioMedicine**|Yizhen Luo et.al.|[2308.09442v2](http://arxiv.org/abs/2308.09442v2)|**[link](https://github.com/pharmolix/openbiomed)**|\n", "2308.09369": "|**2023-08-18**|**Single Frame Semantic Segmentation Using Multi-Modal Spherical Images**|Suresh Guttikonda et.al.|[2308.09369v1](http://arxiv.org/abs/2308.09369v1)|**[link](https://github.com/sguttikon/SFSS-MMSI)**|\n", "2308.09363": "|**2023-08-18**|**Open-vocabulary Video Question Answering: A New Benchmark for Evaluating the Generalizability of Video Question Answering Models**|Dohwan Ko et.al.|[2308.09363v1](http://arxiv.org/abs/2308.09363v1)|**[link](https://github.com/mlvlab/ovqa)**|\n", "2308.09351": "|**2023-08-18**|**RLIPv2: Fast Scaling of Relational Language-Image Pre-training**|Hangjie Yuan et.al.|[2308.09351v1](http://arxiv.org/abs/2308.09351v1)|**[link](https://github.com/jacobyuan7/rlipv2)**|\n", "2308.09322": "|**2023-08-18**|**Audio-Visual Glance Network for Efficient Video Recognition**|Muhammad Adi Nugroho et.al.|[2308.09322v1](http://arxiv.org/abs/2308.09322v1)|null|\n", "2308.09306": "|**2023-08-18**|**DiffDis: Empowering Generative Diffusion Model with Cross-Modal Discrimination Capability**|Runhui Huang et.al.|[2308.09306v1](http://arxiv.org/abs/2308.09306v1)|null|\n", "2308.09300": "|**2023-08-21**|**V2A-Mapper: A Lightweight Solution for Vision-to-Audio Generation by Connecting Foundation Models**|Heng Wang et.al.|[2308.09300v2](http://arxiv.org/abs/2308.09300v2)|**[link](https://github.com/heng-hw/V2A-Mapper)**|\n", "2308.09234": "|**2023-08-18**|**Deep Boosting Multi-Modal Ensemble Face Recognition with Sample-Level Weighting**|Sahar Rahimi Malakshan et.al.|[2308.09234v1](http://arxiv.org/abs/2308.09234v1)|null|\n", "2308.09179": "|**2023-08-17**|**Versatile Multi-Contact Planning and Control for Legged Loco-Manipulation**|Jean-Pierre Sleiman et.al.|[2308.09179v1](http://arxiv.org/abs/2308.09179v1)|null|\n", "2308.08930": "|**2023-08-17**|**Point-aware Interaction and CNN-induced Refinement Network for RGB-D Salient Object Detection**|Runmin Cong et.al.|[2308.08930v1](http://arxiv.org/abs/2308.08930v1)|**[link](https://github.com/rmcong/picr-net_acmmm23)**|\n", "2308.10777": "|**2023-08-21**|**I-BaR: Integrated Balance Rehabilitation Framework**|Tugce Ersoy et.al.|[2308.10777v1](http://arxiv.org/abs/2308.10777v1)|null|\n", "2308.10741": "|**2023-08-21**|**On the Adversarial Robustness of Multi-Modal Foundation Models**|Christian Schlarmann et.al.|[2308.10741v1](http://arxiv.org/abs/2308.10741v1)|null|\n", "2308.10631": "|**2023-08-21**|**PsyMo: A Dataset for Estimating Self-Reported Psychological Traits from Gait**|Adrian Cosma et.al.|[2308.10631v1](http://arxiv.org/abs/2308.10631v1)|null|\n", "2308.10627": "|**2023-08-21**|**Polarimetric Information for Multi-Modal 6D Pose Estimation of Photometrically Challenging Objects with Limited Data**|Patrick Ruhkamp et.al.|[2308.10627v1](http://arxiv.org/abs/2308.10627v1)|null|\n", "2308.10621": "|**2023-08-21**|**Multi-Modal Dataset Acquisition for Photometrically Challenging Object**|HyunJun Jung et.al.|[2308.10621v1](http://arxiv.org/abs/2308.10621v1)|null|\n", "2308.10491": "|**2023-08-21**|**SynDrone -- Multi-modal UAV Dataset for Urban Scenarios**|Giulia Rizzoli et.al.|[2308.10491v1](http://arxiv.org/abs/2308.10491v1)|**[link](https://github.com/lttm/syndrone)**|\n", "2308.10486": "|**2023-08-21**|**Deep Metric Loss for Multimodal Learning**|Sehwan Moon et.al.|[2308.10486v1](http://arxiv.org/abs/2308.10486v1)|**[link](https://github.com/sehwanmoon/multimodalloss)**|\n", "2308.10454": "|**2023-08-21**|**Elucidating STEM Concepts through Generative AI: A Multi-modal Exploration of Analogical Reasoning**|Chen Cao et.al.|[2308.10454v1](http://arxiv.org/abs/2308.10454v1)|null|\n", "2308.10421": "|**2023-08-21**|**UniM$^2$AE: Multi-modal Masked Autoencoders with Unified 3D Representation for 3D Perception in Autonomous Driving**|Jian Zou et.al.|[2308.10421v1](http://arxiv.org/abs/2308.10421v1)|**[link](https://github.com/hollow-503/unim2ae)**|\n", "2308.10362": "|**2023-08-20**|**Vehicle Cameras Guide mmWave Beams: Approach and Real-World V2V Demonstration**|Tawfik Osman et.al.|[2308.10362v1](http://arxiv.org/abs/2308.10362v1)|null|\n", "2308.10240": "|**2023-08-20**|**Generic Attention-model Explainability by Weighted Relevance Accumulation**|Yiming Huang et.al.|[2308.10240v1](http://arxiv.org/abs/2308.10240v1)|null|\n", "2308.10175": "|**2023-08-20**|**BAVS: Bootstrapping Audio-Visual Segmentation by Integrating Foundation Knowledge**|Chen Liu et.al.|[2308.10175v1](http://arxiv.org/abs/2308.10175v1)|null|\n", "2308.10172": "|**2023-08-20**|**VLN-PETL: Parameter-Efficient Transfer Learning for Vision-and-Language Navigation**|Yanyuan Qiao et.al.|[2308.10172v1](http://arxiv.org/abs/2308.10172v1)|**[link](https://github.com/yanyuanqiao/vln-petl)**|\n", "2308.10161": "|**2023-08-20**|**ThermRad: A Multi-modal Dataset for Robust 3D Object Detection under Challenging Conditions**|Qiao Yan et.al.|[2308.10161v1](http://arxiv.org/abs/2308.10161v1)|null|\n", "2308.10146": "|**2023-08-20**|**OCHID-Fi: Occlusion-Robust Hand Pose Estimation in 3D via RF-Vision**|Shujie Zhang et.al.|[2308.10146v1](http://arxiv.org/abs/2308.10146v1)|null|\n", "2308.11601": "|**2023-08-23**|**Tryage: Real-time, intelligent Routing of User Prompts to Large Language Models**|Surya Narayanan Hari et.al.|[2308.11601v2](http://arxiv.org/abs/2308.11601v2)|null|\n", "2308.11561": "|**2023-08-23**|**Target-Grounded Graph-Aware Transformer for Aerial Vision-and-Dialog Navigation**|Yifei Su et.al.|[2308.11561v2](http://arxiv.org/abs/2308.11561v2)|**[link](https://github.com/yifeisu/avdn-challenge)**|\n", "2308.11551": "|**2023-08-22**|**Multi-event Video-Text Retrieval**|Gengyuan Zhang et.al.|[2308.11551v1](http://arxiv.org/abs/2308.11551v1)|**[link](https://github.com/gengyuanmax/mevtr)**|\n", "2308.11530": "|**2023-08-22**|**Furnishing Sound Event Detection with Language Model Abilities**|Hualei Wang et.al.|[2308.11530v1](http://arxiv.org/abs/2308.11530v1)|null|\n", "2308.11513": "|**2023-08-22**|**TrackFlow: Multi-Object Tracking with Normalizing Flows**|Gianluca Mancusi et.al.|[2308.11513v1](http://arxiv.org/abs/2308.11513v1)|null|\n", "2308.11501": "|**2023-08-22**|**Four years of multi-modal odometry and mapping on the rail vehicles**|Yusheng Wang et.al.|[2308.11501v1](http://arxiv.org/abs/2308.11501v1)|null|\n", "2308.11492": "|**2023-08-22**|**A LiDAR-Inertial SLAM Tightly-Coupled with Dropout-Tolerant GNSS Fusion for Autonomous Mine Service Vehicles**|Yusheng Wang et.al.|[2308.11492v1](http://arxiv.org/abs/2308.11492v1)|null|\n", "2308.11356": "|**2023-08-22**|**Semantic RGB-D Image Synthesis**|Shijie Li et.al.|[2308.11356v1](http://arxiv.org/abs/2308.11356v1)|null|\n", "2308.11351": "|**2023-08-22**|**M3PS: End-to-End Multi-Grained Multi-Modal Attribute-Aware Product Summarization in E-commerce**|Tao Chen et.al.|[2308.11351v1](http://arxiv.org/abs/2308.11351v1)|null|\n", "2308.11331": "|**2023-08-22**|**GrowCLIP: Data-aware Automatic Model Growing for Large-scale Contrastive Language-Image Pre-training**|Xinchi Deng et.al.|[2308.11331v1](http://arxiv.org/abs/2308.11331v1)|null|\n", "2308.11206": "|**2023-08-22**|**DiffCloth: Diffusion Based Garment Synthesis and Manipulation via Structural Cross-modal Semantic Alignment**|Xujie Zhang et.al.|[2308.11206v1](http://arxiv.org/abs/2308.11206v1)|null|\n", "2308.11175": "|**2023-08-22**|**MISSRec: Pre-training and Transferring Multi-modal Interest-aware Sequence Representation for Recommendation**|Jinpeng Wang et.al.|[2308.11175v1](http://arxiv.org/abs/2308.11175v1)|**[link](https://github.com/gimpong/MM23-MISSRec)**|\n", "2308.11165": "|**2023-08-22**|**Improving Misaligned Multi-modality Image Fusion with One-stage Progressive Dense Registration**|Di Wang et.al.|[2308.11165v1](http://arxiv.org/abs/2308.11165v1)|null|\n", "2308.12199": "|**2023-08-23**|**Towards Real-Time Analysis of Broadcast Badminton Videos**|Nitin Nilesh et.al.|[2308.12199v1](http://arxiv.org/abs/2308.12199v1)|**[link](https://gitlab.com/nitin.nilesh/badminton-analysis-star)**|\n", "2308.12163": "|**2023-08-23**|**NPF-200: A Multi-Modal Eye Fixation Dataset and Method for Non-Photorealistic Videos**|Ziyu Yang et.al.|[2308.12163v1](http://arxiv.org/abs/2308.12163v1)|**[link](https://github.com/yangziyu/npf200)**|\n", "2308.12111": "|**2023-08-23**|**Cross-Modality Proposal-guided Feature Mining for Unregistered RGB-Thermal Pedestrian Detection**|Chao Tian et.al.|[2308.12111v1](http://arxiv.org/abs/2308.12111v1)|null|\n", "2308.12049": "|**2023-08-23**|**Towards Privacy-Supporting Fall Detection via Deep Unsupervised RGB2Depth Adaptation**|Hejun Xiao et.al.|[2308.12049v1](http://arxiv.org/abs/2308.12049v1)|**[link](https://github.com/1015206533/privacy_supporting_fall_detection)**|\n", "2308.11994": "|**2023-08-23**|**Progressive Feature Mining and External Knowledge-Assisted Text-Pedestrian Image Retrieval**|Huafeng Li et.al.|[2308.11994v1](http://arxiv.org/abs/2308.11994v1)|null|\n", "2308.11983": "|**2023-08-23**|**Multi-Modal Multi-Task (3MT) Road Segmentation**|Erkan Milli et.al.|[2308.11983v1](http://arxiv.org/abs/2308.11983v1)|**[link](https://github.com/erkanmilli/3mt-roadseg)**|\n", "2308.11880": "|**2023-08-23**|**SUMMIT: Source-Free Adaptation of Uni-Modal Models to Multi-Modal Targets**|Cody Simons et.al.|[2308.11880v1](http://arxiv.org/abs/2308.11880v1)|**[link](https://github.com/csimo005/summit)**|\n", "2308.11877": "|**2023-08-24**|**Integrated Image and Location Analysis for Wound Classification: A Deep Learning Approach**|Yash Patel et.al.|[2308.11877v2](http://arxiv.org/abs/2308.11877v2)|null|\n", "2308.11804": "|**2023-08-22**|**Ceci n'est pas une pomme: Adversarial Illusions in Multi-Modal Embeddings**|Eugene Bagdasaryan et.al.|[2308.11804v1](http://arxiv.org/abs/2308.11804v1)|**[link](https://github.com/ebagdasa/adversarial_illusions)**|\n", "2308.11797": "|**2023-08-22**|**CLIP Multi-modal Hashing: A new baseline CLIPMH**|Jian Zhu et.al.|[2308.11797v1](http://arxiv.org/abs/2308.11797v1)|null|\n", "2308.12956": "|**2023-08-24**|**DLIP: Distilling Language-Image Pre-training**|Huafeng Kuang et.al.|[2308.12956v1](http://arxiv.org/abs/2308.12956v1)|null|\n", "2308.12871": "|**2023-08-24**|**IPA: Inference Pipeline Adaptation to Achieve High Accuracy and Cost-Efficiency**|Saeid Ghafouri et.al.|[2308.12871v1](http://arxiv.org/abs/2308.12871v1)|null|\n", "2308.12863": "|**2023-08-24**|**SkipcrossNets: Adaptive Skip-cross Fusion for Road Detection**|Xinyu Zhang et.al.|[2308.12863v1](http://arxiv.org/abs/2308.12863v1)|null|\n", "2308.12755": "|**2023-08-24**|**Acquiring Qualitative Explainable Graphs for Automated Driving Scene Interpretation**|Nassim Belmecheri et.al.|[2308.12755v1](http://arxiv.org/abs/2308.12755v1)|**[link](https://github.com/simula-vias/qxg-builder)**|\n", "2308.12736": "|**2023-08-24**|**FastSurfer-HypVINN: Automated sub-segmentation of the hypothalamus and adjacent structures on high-resolutional brain MRI**|Santiago Estrada et.al.|[2308.12736v1](http://arxiv.org/abs/2308.12736v1)|**[link](https://github.com/Deep-MI/FastSurfer)**|\n", "2308.12610": "|**2023-08-24**|**Emotion-Aligned Contrastive Learning Between Images and Music**|Shanti Stewart et.al.|[2308.12610v1](http://arxiv.org/abs/2308.12610v1)|null|\n", "2308.12604": "|**2023-08-24**|**PromptMRG: Diagnosis-Driven Prompts for Medical Report Generation**|Haibo Jin et.al.|[2308.12604v1](http://arxiv.org/abs/2308.12604v1)|null|\n", "2308.12587": "|**2023-08-24**|**Grounded Entity-Landmark Adaptive Pre-training for Vision-and-Language Navigation**|Yibo Cui et.al.|[2308.12587v1](http://arxiv.org/abs/2308.12587v1)|**[link](https://github.com/csir1996/vln-gela)**|\n", "2308.12558": "|**2023-08-24**|**Hyperbolic Audio-visual Zero-shot Learning**|Jie Hong et.al.|[2308.12558v1](http://arxiv.org/abs/2308.12558v1)|null|\n", "2308.12509": "|**2023-08-24**|**Parameter-Efficient Transfer Learning for Remote Sensing Image-Text Retrieval**|Yuan Yuan et.al.|[2308.12509v1](http://arxiv.org/abs/2308.12509v1)|**[link](https://github.com/ZhanYang-nwpu/PE-RSITR)**|\n", "2308.12370": "|**2023-08-23**|**AdVerb: Visually Guided Audio Dereverberation**|Sanjoy Chowdhury et.al.|[2308.12370v1](http://arxiv.org/abs/2308.12370v1)|null|\n", "2308.12320": "|**2023-08-23**|**Understanding Dark Scenes by Contrasting Multi-Modal Observations**|Xiaoyu Dong et.al.|[2308.12320v1](http://arxiv.org/abs/2308.12320v1)|**[link](https://github.com/palmdong/smmcl)**|\n", "2308.13437": "|**2023-08-25**|**Position-Enhanced Visual Instruction Tuning for Multimodal Large Language Models**|Chi Chen et.al.|[2308.13437v1](http://arxiv.org/abs/2308.13437v1)|**[link](https://github.com/pvit-official/pvit)**|\n", "2308.13392": "|**2023-08-25**|**Self-Supervised Representation Learning with Cross-Context Learning between Global and Hypercolumn Features**|Zheng Gao et.al.|[2308.13392v1](http://arxiv.org/abs/2308.13392v1)|null|\n", "2308.13355": "|**2023-08-25**|**WorldSmith: Iterative and Expressive Prompting for World Building with a Generative AI**|Hai Dang et.al.|[2308.13355v1](http://arxiv.org/abs/2308.13355v1)|null|\n", "2308.13340": "|**2023-08-25**|**TriGait: Aligning and Fusing Skeleton and Silhouette Gait Data via a Tri-Branch Network**|Yan Sun et.al.|[2308.13340v1](http://arxiv.org/abs/2308.13340v1)|**[link](https://github.com/feng-xueling/trigait)**|\n", "2308.13077": "|**2023-08-24**|**Preserving Modality Structure Improves Multi-Modal Learning**|Swetha Sirnam et.al.|[2308.13077v1](http://arxiv.org/abs/2308.13077v1)|null|\n", "2308.14713": "|**2023-08-28**|**R3D3: Dense 3D Reconstruction of Dynamic Scenes from Multiple Cameras**|Aron Schmied et.al.|[2308.14713v1](http://arxiv.org/abs/2308.14713v1)|null|\n", "2308.14619": "|**2023-08-29**|**Compositional Semantic Mix for Domain Adaptation in Point Cloud Segmentation**|Cristiano Saltori et.al.|[2308.14619v2](http://arxiv.org/abs/2308.14619v2)|**[link](https://github.com/saltoricristiano/cosmix-uda)**|\n", "2308.14613": "|**2023-08-28**|**MS-Net: A Multi-modal Self-supervised Network for Fine-Grained Classification of Aircraft in SAR Images**|Bingying Yue et.al.|[2308.14613v1](http://arxiv.org/abs/2308.14613v1)|null|\n", "2308.14482": "|**2023-08-28**|**An Empirical Study of Consistency Regularization for End-to-End Speech-to-Text Translation**|Pengzhi Gao et.al.|[2308.14482v1](http://arxiv.org/abs/2308.14482v1)|**[link](https://github.com/gpengzhi/simcr)**|\n", "2308.14383": "|**2023-08-28**|**Multi-Modal Neural Radiance Field for Monocular Dense SLAM with a Light-Weight ToF Sensor**|Xinyang Liu et.al.|[2308.14383v1](http://arxiv.org/abs/2308.14383v1)|null|\n", "2308.14263": "|**2023-08-28**|**Cross-Modal Retrieval: A Systematic Review of Methods and Future Directions**|Lei Zhu et.al.|[2308.14263v1](http://arxiv.org/abs/2308.14263v1)|**[link](https://github.com/bmc-sdnu/cross-modal-retrieval)**|\n", "2308.14212": "|**2023-08-27**|**Exploring the Transfer Learning Capabilities of CLIP in Domain Generalization for Diabetic Retinopathy**|Sanoojan Baliah et.al.|[2308.14212v1](http://arxiv.org/abs/2308.14212v1)|**[link](https://github.com/sanoojan/clip-drdg)**|\n", "2308.14177": "|**2023-08-27**|**AIGC for Various Data Modalities: A Survey**|Lin Geng Foo et.al.|[2308.14177v1](http://arxiv.org/abs/2308.14177v1)|null|\n", "2308.14160": "|**2023-08-27**|**A Unified Transformer-based Network for multimodal Emotion Recognition**|Kamran Ali et.al.|[2308.14160v1](http://arxiv.org/abs/2308.14160v1)|null|\n", "2308.14105": "|**2023-08-29**|**Unified and Dynamic Graph for Temporal Character Grouping in Long Videos**|Xiujun Shu et.al.|[2308.14105v2](http://arxiv.org/abs/2308.14105v2)|null|\n", "2308.14083": "|**2023-08-27**|**4D Myocardium Reconstruction with Decoupled Motion and Shape Model**|Xiaohan Yuan et.al.|[2308.14083v1](http://arxiv.org/abs/2308.14083v1)|**[link](https://github.com/yuan-xiaohan/4d-myocardium-reconstruction-with-decoupled-motion-and-shape-model)**|\n", "2308.14064": "|**2023-08-27**|**Multi-model fusion for Aerial Vision and Dialog Navigation based on human attention aids**|Xinyi Wang et.al.|[2308.14064v1](http://arxiv.org/abs/2308.14064v1)|null|\n", "2308.14023": "|**2023-08-27**|**Domain-Specificity Inducing Transformers for Source-Free Domain Adaptation**|Sunandini Sanyal et.al.|[2308.14023v1](http://arxiv.org/abs/2308.14023v1)|null|\n", "2308.14009": "|**2023-08-27**|**Towards Fast and Accurate Image-Text Retrieval with Self-Supervised Fine-Grained Alignment**|Jiamin Zhuang et.al.|[2308.14009v1](http://arxiv.org/abs/2308.14009v1)|**[link](https://github.com/zjamie813/selfalign)**|\n", "2308.13976": "|**2023-08-27**|**Label Denoising through Cross-Model Agreement**|Yu Wang et.al.|[2308.13976v1](http://arxiv.org/abs/2308.13976v1)|null|\n", "2308.15273": "|**2023-08-29**|**Cross-Modal Retrieval Meets Inference:Improving Zero-Shot Classification with Cross-Modal Retrieval**|Seongha Eom et.al.|[2308.15273v1](http://arxiv.org/abs/2308.15273v1)|null|\n", "2308.15063": "|**2023-08-29**|**Learning Cross-modality Information Bottleneck Representation for Heterogeneous Person Re-Identification**|Haichao Shi et.al.|[2308.15063v1](http://arxiv.org/abs/2308.15063v1)|null|\n", "2308.14978": "|**2023-08-29**|**Vision Grid Transformer for Document Layout Analysis**|Cheng Da et.al.|[2308.14978v1](http://arxiv.org/abs/2308.14978v1)|**[link](https://github.com/alibabaresearch/advancedliteratemachinery)**|\n", "2308.14786": "|**2023-08-28**|**Extending Cross-Modal Retrieval with Interactive Learning to Improve Image Retrieval Performance in Forensics**|Nils B\u00f6hne et.al.|[2308.14786v1](http://arxiv.org/abs/2308.14786v1)|null|\n", "2308.16150": "|**2023-08-30**|**Modality Cycles with Masked Conditional Diffusion for Unsupervised Anomaly Segmentation in MRI**|Ziyun Liang et.al.|[2308.16150v1](http://arxiv.org/abs/2308.16150v1)|**[link](https://github.com/ziyunliang/mmccd)**|\n", "2308.16071": "|**2023-08-30**|**Semantic Image Synthesis via Class-Adaptive Cross-Attention**|Tomaso Fontanini et.al.|[2308.16071v1](http://arxiv.org/abs/2308.16071v1)|null|\n", "2308.16021": "|**2023-08-30**|**CALM: Contrastive Cross-modal Speaking Style Modeling for Expressive Text-to-Speech Synthesis**|Yi Meng et.al.|[2308.16021v1](http://arxiv.org/abs/2308.16021v1)|null|\n", "2308.15980": "|**2023-08-30**|**Adaptive Multi-Modalities Fusion in Sequential Recommendation Systems**|Hengchang Hu et.al.|[2308.15980v1](http://arxiv.org/abs/2308.15980v1)|**[link](https://github.com/holdenhu/mmsr)**|\n", "2308.15930": "|**2023-08-30**|**LLaSM: Large Language and Speech Model**|Yu Shu et.al.|[2308.15930v1](http://arxiv.org/abs/2308.15930v1)|**[link](https://github.com/linksoul-ai/llasm)**|\n", "2308.15846": "|**2023-08-30**|**Exploring Multi-Modal Contextual Knowledge for Open-Vocabulary Object Detection**|Yifan Xu et.al.|[2308.15846v1](http://arxiv.org/abs/2308.15846v1)|null|\n", "2308.15670": "|**2023-08-29**|**Multimodal Foundation Models For Echocardiogram Interpretation**|Matthew Christensen et.al.|[2308.15670v1](http://arxiv.org/abs/2308.15670v1)|**[link](https://github.com/echonet/echo_CLIP)**|\n", "2308.15640": "|**2023-08-29**|**Identifying Constitutive Parameters for Complex Hyperelastic Solids using Physics-Informed Neural Networks**|Siyuan Song et.al.|[2308.15640v1](http://arxiv.org/abs/2308.15640v1)|null|\n", "2308.15609": "|**2023-08-29**|**InstaTune: Instantaneous Neural Architecture Search During Fine-Tuning**|Sharath Nittur Sridhar et.al.|[2308.15609v1](http://arxiv.org/abs/2308.15609v1)|null|\n", "2308.15592": "|**2023-08-29**|**Non-local Interactions are Essential Elements for Dark Matter Halo Stability: A Cross-Model Study**|Ahmad Borzou et.al.|[2308.15592v1](http://arxiv.org/abs/2308.15592v1)|null|\n", "2308.16896": "|**2023-08-31**|**PointOcc: Cylindrical Tri-Perspective View for Point-based 3D Semantic Occupancy Prediction**|Sicheng Zuo et.al.|[2308.16896v1](http://arxiv.org/abs/2308.16896v1)|**[link](https://github.com/wzzheng/pointocc)**|\n", "2308.16777": "|**2023-09-01**|**Ref-Diff: Zero-shot Referring Image Segmentation with Generative Models**|Minheng Ni et.al.|[2308.16777v2](http://arxiv.org/abs/2308.16777v2)|null|\n", "2308.16758": "|**2023-08-31**|**Towards High-Fidelity Text-Guided 3D Face Generation and Manipulation Using only Images**|Cuican Yu et.al.|[2308.16758v1](http://arxiv.org/abs/2308.16758v1)|null|\n", "2308.16649": "|**2023-08-31**|**Learning with Multi-modal Gradient Attention for Explainable Composed Image Retrieval**|Prateksha Udhayanan et.al.|[2308.16649v1](http://arxiv.org/abs/2308.16649v1)|null|\n", "2308.16632": "|**2023-08-31**|**3D-STMN: Dependency-Driven Superpoint-Text Matching Network for End-to-End 3D Referring Expression Segmentation**|Changli Wu et.al.|[2308.16632v1](http://arxiv.org/abs/2308.16632v1)|**[link](https://github.com/sosppxo/3d-stmn)**|\n", "2308.16493": "|**2023-08-31**|**Expanding Frozen Vision-Language Models without Retraining: Towards Improved Robot Perception**|Riley Tavassoli et.al.|[2308.16493v1](http://arxiv.org/abs/2308.16493v1)|null|\n", "2308.16474": "|**2023-08-31**|**Enhancing Subtask Performance of Multi-modal Large Language Model**|Yongqiang Zhao et.al.|[2308.16474v1](http://arxiv.org/abs/2308.16474v1)|null|\n", "2308.16437": "|**2023-08-31**|**AntM$^{2}$C: A Large Scale Dataset For Multi-Scenario Multi-Modal CTR Prediction**|Zhaoxin Huan et.al.|[2308.16437v1](http://arxiv.org/abs/2308.16437v1)|null|\n", "2308.16386": "|**2023-08-31**|**RGB-T Tracking via Multi-Modal Mutual Prompt Learning**|Yang Luo et.al.|[2308.16386v1](http://arxiv.org/abs/2308.16386v1)|**[link](https://github.com/husteryoung/mplt)**|\n", "2309.00615": "|**2023-09-01**|**Point-Bind & Point-LLM: Aligning Point Cloud with Multi-modality for 3D Understanding, Generation, and Instruction Following**|Ziyu Guo et.al.|[2309.00615v1](http://arxiv.org/abs/2309.00615v1)|**[link](https://github.com/ziyuguo99/point-bind_point-llm)**|\n", "2309.00406": "|**2023-09-01**|**Constraining X-ray variability of the blazar 3C 273 using XMM-Newton observations over two decades**|Adithiya Dinesh et.al.|[2309.00406v1](http://arxiv.org/abs/2309.00406v1)|null|\n", "2309.00380": "|**2023-09-01**|**Learning multi-modal generative models with permutation-invariant encoders and tighter variational bounds**|Marcel Hirt et.al.|[2309.00380v1](http://arxiv.org/abs/2309.00380v1)|null|\n", "2309.00372": "|**2023-09-01**|**On the Localization of Ultrasound Image Slices within Point Distribution Models**|Lennart Bastian et.al.|[2309.00372v1](http://arxiv.org/abs/2309.00372v1)|**[link](https://github.com/vuenc/slice-to-shape)**|\n", "2309.00227": "|**2023-09-01**|**What Makes Good Open-Vocabulary Detector: A Disassembling Perspective**|Jincheng Li et.al.|[2309.00227v1](http://arxiv.org/abs/2309.00227v1)|null|\n", "2309.00133": "|**2023-08-31**|**Distraction-free Embeddings for Robust VQA**|Atharvan Dogra et.al.|[2309.00133v1](http://arxiv.org/abs/2309.00133v1)|null|\n", "2309.00030": "|**2023-08-31**|**Audio-Driven Dubbing for User Generated Contents via Style-Aware Semi-Parametric Synthesis**|Linsen Song et.al.|[2309.00030v1](http://arxiv.org/abs/2309.00030v1)|null|\n", "2309.02320": "|**2023-09-05**|**SeisCLIP: A seismology foundation model pre-trained by multi-modal data for multi-purpose seismic feature extraction**|Xu Si et.al.|[2309.02320v1](http://arxiv.org/abs/2309.02320v1)|**[link](https://github.com/sixu0/SeisCLIP)**|\n", "2309.02169": "|**2023-09-05**|**Dual Relation Alignment for Composed Image Retrieval**|Xintong Jiang et.al.|[2309.02169v1](http://arxiv.org/abs/2309.02169v1)|null|\n", "2309.02124": "|**2023-09-05**|**Exploiting Spatial-temporal Data for Sleep Stage Classification via Hypergraph Learning**|Yuze Liu et.al.|[2309.02124v1](http://arxiv.org/abs/2309.02124v1)|null|\n", "2309.02043": "|**2023-09-05**|**Decomposed Guided Dynamic Filters for Efficient RGB-Guided Depth Completion**|Yufei Wang et.al.|[2309.02043v1](http://arxiv.org/abs/2309.02043v1)|null|\n", "2309.02041": "|**2023-09-05**|**Learning Cross-Modal Affinity for Referring Video Object Segmentation Targeting Limited Samples**|Guanghui Li et.al.|[2309.02041v1](http://arxiv.org/abs/2309.02041v1)|**[link](https://github.com/hengliusky/few_shot_rvos)**|\n", "2309.01981": "|**2023-09-05**|**Graph-Based Interaction-Aware Multimodal 2D Vehicle Trajectory Prediction using Diffusion Graph Convolutional Networks**|Keshu Wu et.al.|[2309.01981v1](http://arxiv.org/abs/2309.01981v1)|null|\n", "2309.01955": "|**2023-09-05**|**A Survey on Interpretable Cross-modal Reasoning**|Dizhan Xue et.al.|[2309.01955v1](http://arxiv.org/abs/2309.01955v1)|**[link](https://github.com/ZuyiZhou/Awesome-Interpretable-Cross-modal-Reasoning)**|\n", "2309.01918": "|**2023-09-05**|**RoboAgent: Generalization and Efficiency in Robot Manipulation via Semantic Augmentations and Action Chunking**|Homanga Bharadhwaj et.al.|[2309.01918v1](http://arxiv.org/abs/2309.01918v1)|null|\n", "2309.01860": "|**2023-09-06**|**Attention-Driven Multi-Modal Fusion: Enhancing Sign Language Recognition and Translation**|Zaber Ibn Abdul Hakim et.al.|[2309.01860v2](http://arxiv.org/abs/2309.01860v2)|null|\n", "2309.01728": "|**2023-09-04**|**Generative-based Fusion Mechanism for Multi-Modal Tracking**|Zhangyong Tang et.al.|[2309.01728v1](http://arxiv.org/abs/2309.01728v1)|**[link](https://github.com/zhangyong-tang/gmmt)**|\n", "2309.01516": "|**2023-09-04**|**MultiWay-Adapater: Adapting large-scale multi-modal models for scalable image-text retrieval**|Zijun Long et.al.|[2309.01516v1](http://arxiv.org/abs/2309.01516v1)|**[link](https://github.com/longkukuhi/multiway-adapter)**|\n", "2309.01420": "|**2023-09-04**|**Unified Pre-training with Pseudo Texts for Text-To-Image Person Re-identification**|Zhiyin Shao et.al.|[2309.01420v1](http://arxiv.org/abs/2309.01420v1)|**[link](https://github.com/zhiyinshao-h/unipt)**|\n", "2309.01327": "|**2023-09-04**|**Can I Trust Your Answer? Visually Grounded Video Question Answering**|Junbin Xiao et.al.|[2309.01327v1](http://arxiv.org/abs/2309.01327v1)|**[link](https://github.com/doc-doc/next-gqa)**|\n", "2309.01256": "|**2023-09-03**|**BDC-Adapter: Brownian Distance Covariance for Better Vision-Language Reasoning**|Yi Zhang et.al.|[2309.01256v1](http://arxiv.org/abs/2309.01256v1)|null|\n", "2309.01073": "|**2023-09-03**|**Spatial and Visual Perspective-Taking via View Rotation and Relation Reasoning for Embodied Reference Understanding**|Cheng Shi et.al.|[2309.01073v1](http://arxiv.org/abs/2309.01073v1)|null|\n", "2309.03177": "|**2023-09-06**|**3D Object Positioning Using Differentiable Multimodal Learning**|Sean Zanyk-McLean et.al.|[2309.03177v1](http://arxiv.org/abs/2309.03177v1)|null|\n", "2309.03147": "|**2023-09-06**|**Real-Time Non-Invasive Imaging and Detection of Spreading Depolarizations through EEG: An Ultra-Light Explainable Deep Learning Approach**|Yinzhe Wu et.al.|[2309.03147v1](http://arxiv.org/abs/2309.03147v1)|null|\n", "2309.03100": "|**2023-09-06**|**FArMARe: a Furniture-Aware Multi-task methodology for Recommending Apartments based on the user interests**|Ali Abdari et.al.|[2309.03100v1](http://arxiv.org/abs/2309.03100v1)|**[link](https://github.com/aliabdari/farmare)**|\n", "2309.02965": "|**2023-09-06**|**Dynamic Hyperbolic Attention Network for Fine Hand-object Reconstruction**|Zhiying Leng et.al.|[2309.02965v1](http://arxiv.org/abs/2309.02965v1)|null|\n", "2309.02875": "|**2023-09-06**|**MAD: Modality Agnostic Distance Measure for Image Registration**|Vasiliki Sideri-Lampretsa et.al.|[2309.02875v1](http://arxiv.org/abs/2309.02875v1)|null|\n", "2309.02702": "|**2023-09-06**|**Gene-induced Multimodal Pre-training for Image-omic Classification**|Ting Jin et.al.|[2309.02702v1](http://arxiv.org/abs/2309.02702v1)|null|\n", "2309.02616": "|**2023-09-05**|**Generative AI-aided Joint Training-free Secure Semantic Communications via Multi-modal Prompts**|Hongyang Du et.al.|[2309.02616v1](http://arxiv.org/abs/2309.02616v1)|null|\n", "2309.02591": "|**2023-09-05**|**Scaling Autoregressive Multi-Modal Models: Pretraining and Instruction Tuning**|Lili Yu et.al.|[2309.02591v1](http://arxiv.org/abs/2309.02591v1)|null|\n", "2309.03905": "|**2023-09-07**|**ImageBind-LLM: Multi-modality Instruction Tuning**|Jiaming Han et.al.|[2309.03905v1](http://arxiv.org/abs/2309.03905v1)|**[link](https://github.com/opengvlab/llama-adapter)**|\n", "2309.03869": "|**2023-09-07**|**Text-to-feature diffusion for audio-visual few-shot learning**|Otniel-Bogdan Mercea et.al.|[2309.03869v1](http://arxiv.org/abs/2309.03869v1)|**[link](https://github.com/explainableml/avdiff-gfsl)**|\n", "2309.03734": "|**2023-09-07**|**ClusterFusion: Leveraging Radar Spatial Features for Radar-Camera 3D Object Detection in Autonomous Vehicles**|Irfan Tito Kurniawan et.al.|[2309.03734v1](http://arxiv.org/abs/2309.03734v1)|null|\n", "2309.03661": "|**2023-09-07**|**Prompt-based Context- and Domain-aware Pretraining for Vision and Language Navigation**|Ting Liu et.al.|[2309.03661v1](http://arxiv.org/abs/2309.03661v1)|null|\n", "2309.03473": "|**2023-09-07**|**Temporal Collection and Distribution for Referring Video Object Segmentation**|Jiajin Tang et.al.|[2309.03473v1](http://arxiv.org/abs/2309.03473v1)|null|\n", "2309.03452": "|**2023-09-07**|**Multi-Modality Guidance Network For Missing Modality Inference**|Zhuokai Zhao et.al.|[2309.03452v1](http://arxiv.org/abs/2309.03452v1)|null|\n", "2309.04453": "|**2023-09-08**|**WiSARD: A Labeled Visual and Thermal Image Dataset for Wilderness Search and Rescue**|Daniel Broyles et.al.|[2309.04453v1](http://arxiv.org/abs/2309.04453v1)|null|\n", "2309.04399": "|**2023-09-08**|**MaskDiffusion: Boosting Text-to-Image Consistency with Conditional Mask**|Yupeng Zhou et.al.|[2309.04399v1](http://arxiv.org/abs/2309.04399v1)|null|\n", "2309.04302": "|**2023-09-08**|**Have We Ever Encountered This Before? Retrieving Out-of-Distribution Road Obstacles from Driving Scenes**|Youssef Shoeb et.al.|[2309.04302v1](http://arxiv.org/abs/2309.04302v1)|null|\n", "2309.04287": "|**2023-09-08**|**Sequential Semantic Generative Communication for Progressive Text-to-Image Generation**|Hyelin Nam et.al.|[2309.04287v1](http://arxiv.org/abs/2309.04287v1)|null|\n", "2309.04109": "|**2023-09-08**|**From Text to Mask: Localizing Entities Using the Attention of Text-to-Image Diffusion Models**|Changming Xiao et.al.|[2309.04109v1](http://arxiv.org/abs/2309.04109v1)|null|\n", "2309.04062": "|**2023-09-08**|**3D Denoisers are Good 2D Teachers: Molecular Pretraining via Denoising and Cross-Modal Distillation**|Sungjun Cho et.al.|[2309.04062v1](http://arxiv.org/abs/2309.04062v1)|null|\n", "2309.04001": "|**2023-09-07**|**Multimodal Transformer for Material Segmentation**|Md Kaykobad Reza et.al.|[2309.04001v1](http://arxiv.org/abs/2309.04001v1)|**[link](https://github.com/csiplab/mmsformer)**|\n", "2309.05644": "|**2023-09-11**|**Grid-based Hybrid 3DMA GNSS and Terrestrial Positioning**|Paul Schwarzbach et.al.|[2309.05644v1](http://arxiv.org/abs/2309.05644v1)|null|\n", "2309.05608": "|**2023-09-11**|**Incorporating Pre-trained Model Prompting in Multimodal Stock Volume Movement Prediction**|Ruibo Chen et.al.|[2309.05608v1](http://arxiv.org/abs/2309.05608v1)|**[link](https://github.com/rayruibochen/promuse)**|\n", "2309.05573": "|**2023-09-11**|**UniSeg: A Unified Multi-Modal LiDAR Segmentation Network and the OpenPCSeg Codebase**|Youquan Liu et.al.|[2309.05573v1](http://arxiv.org/abs/2309.05573v1)|**[link](https://github.com/pjlab-adg/pcseg)**|\n", "2309.05519": "|**2023-09-13**|**NExT-GPT: Any-to-Any Multimodal LLM**|Shengqiong Wu et.al.|[2309.05519v2](http://arxiv.org/abs/2309.05519v2)|**[link](https://github.com/NExT-GPT/NExT-GPT)**|\n", "2309.05503": "|**2023-09-11**|**Long-Range Transformer Architectures for Document Understanding**|Thibault Douzon et.al.|[2309.05503v1](http://arxiv.org/abs/2309.05503v1)|**[link](https://github.com/thibaultdouzon/long-range-document-transformer)**|\n", "2309.05451": "|**2023-09-11**|**Dual-view Curricular Optimal Transport for Cross-lingual Cross-modal Retrieval**|Yabing Wang et.al.|[2309.05451v1](http://arxiv.org/abs/2309.05451v1)|null|\n", "2309.05423": "|**2023-09-11**|**Multi-Modal Automatic Prosody Annotation with Contrastive Pretraining of SSWP**|Jinzuomu Zhong et.al.|[2309.05423v1](http://arxiv.org/abs/2309.05423v1)|null|\n", "2309.05396": "|**2023-09-12**|**SlideSpeech: A Large-Scale Slide-Enriched Audio-Visual Corpus**|Haoxu Wang et.al.|[2309.05396v2](http://arxiv.org/abs/2309.05396v2)|null|\n", "2309.05298": "|**2023-09-11**|**Real-Time Parallel Trajectory Optimization with Spatiotemporal Safety Constraints for Autonomous Driving in Congested Traffic**|Lei Zheng et.al.|[2309.05298v1](http://arxiv.org/abs/2309.05298v1)|null|\n", "2309.05281": "|**2023-09-11**|**Class-Incremental Grouping Network for Continual Audio-Visual Learning**|Shentong Mo et.al.|[2309.05281v1](http://arxiv.org/abs/2309.05281v1)|**[link](https://github.com/stonemo/cign)**|\n", "2309.05257": "|**2023-09-11**|**FusionFormer: A Multi-sensory Fusion in Bird's-Eye-View and Temporal Consistent Transformer for 3D Objection**|Chunyong Hu et.al.|[2309.05257v1](http://arxiv.org/abs/2309.05257v1)|null|\n", "2309.05251": "|**2023-09-11**|**Multi3DRefer: Grounding Text Description to Multiple 3D Objects**|Yiming Zhang et.al.|[2309.05251v1](http://arxiv.org/abs/2309.05251v1)|null|\n", "2309.05248": "|**2023-09-11**|**Enhancing Speaker Diarization with Large Language Models: A Contextual Beam Search Approach**|Tae Jin Park et.al.|[2309.05248v1](http://arxiv.org/abs/2309.05248v1)|null|\n", "2309.05203": "|**2023-09-11**|**From Artificially Real to Real: Leveraging Pseudo Data from Large Language Models for Low-Resource Molecule Discovery**|Yuhan Chen et.al.|[2309.05203v1](http://arxiv.org/abs/2309.05203v1)|null|\n", "2309.05090": "|**2023-09-10**|**Sculpting Efficiency: Pruning Medical Imaging Models for On-Device Inference**|Sudarshan Sreeram et.al.|[2309.05090v1](http://arxiv.org/abs/2309.05090v1)|null|\n", "2309.06262": "|**2023-09-12**|**Modality Unifying Network for Visible-Infrared Person Re-Identification**|Hao Yu et.al.|[2309.06262v1](http://arxiv.org/abs/2309.06262v1)|null|\n", "2309.06255": "|**2023-09-12**|**Enhancing Multi-modal Cooperation via Fine-grained Modality Valuation**|Yake Wei et.al.|[2309.06255v1](http://arxiv.org/abs/2309.06255v1)|null|\n", "2309.06176": "|**2023-09-12**|**Dual-Path Temporal Map Optimization for Make-up Temporal Video Grounding**|Jiaxiu Li et.al.|[2309.06176v1](http://arxiv.org/abs/2309.06176v1)|null|\n", "2309.06102": "|**2023-09-12**|**Can we predict the Most Replayed data of video streaming platforms?**|Alessandro Duico et.al.|[2309.06102v1](http://arxiv.org/abs/2309.06102v1)|**[link](https://github.com/ombretta/most-replayed-data)**|\n", "2309.06081": "|**2023-09-12**|**Information Flow in Graph Neural Networks: A Clinical Triage Use Case**|V\u00edctor Valls et.al.|[2309.06081v1](http://arxiv.org/abs/2309.06081v1)|null|\n", "2309.05904": "|**2023-09-12**|**Enhancing Representation in Radiography-Reports Foundation Model: A Granular Alignment Algorithm Using Masked Contrastive Learning**|Weijian Huang et.al.|[2309.05904v1](http://arxiv.org/abs/2309.05904v1)|null|\n", "2309.05818": "|**2023-09-11**|**Rice Plant Disease Detection and Diagnosis using Deep Convolutional Neural Networks and Multispectral Imaging**|Yara Ali Alnaggar et.al.|[2309.05818v1](http://arxiv.org/abs/2309.05818v1)|null|\n", "2309.05803": "|**2023-09-11**|**Revisiting Energy Based Models as Policies: Ranking Noise Contrastive Estimation and Interpolating Energy Models**|Sumeet Singh et.al.|[2309.05803v1](http://arxiv.org/abs/2309.05803v1)|null|\n", "2309.05756": "|**2023-09-11**|**TransferDoc: A Self-Supervised Transferable Document Representation Learning Model Unifying Vision and Language**|Souhail Bakkali et.al.|[2309.05756v1](http://arxiv.org/abs/2309.05756v1)|null|\n", "2309.07120": "|**2023-09-13**|**Sight Beyond Text: Multi-Modal Training Enhances LLMs in Truthfulness and Ethics**|Haoqin Tu et.al.|[2309.07120v1](http://arxiv.org/abs/2309.07120v1)|**[link](https://github.com/ucsc-vlaa/sight-beyond-text)**|\n", "2309.07066": "|**2023-09-13**|**CLiFF-LHMP: Using Spatial Dynamics Patterns for Long-Term Human Motion Prediction**|Yufei Zhu et.al.|[2309.07066v1](http://arxiv.org/abs/2309.07066v1)|null|\n", "2309.06799": "|**2023-09-13**|**When Geoscience Meets Foundation Models: Towards General Geoscience Artificial Intelligence System**|Hao Zhang et.al.|[2309.06799v1](http://arxiv.org/abs/2309.06799v1)|null|\n", "2309.06735": "|**2023-09-13**|**GelFlow: Self-supervised Learning of Optical Flow for Vision-Based Tactile Sensor Displacement Measurement**|Zhiyuan Zhang et.al.|[2309.06735v1](http://arxiv.org/abs/2309.06735v1)|null|\n", "2309.06728": "|**2023-09-13**|**Leveraging Foundation models for Unsupervised Audio-Visual Segmentation**|Swapnil Bhosale et.al.|[2309.06728v1](http://arxiv.org/abs/2309.06728v1)|null|\n", "2309.06599": "|**2023-09-12**|**Reasoning with Latent Diffusion in Offline Reinforcement Learning**|Siddarth Venkatraman et.al.|[2309.06599v1](http://arxiv.org/abs/2309.06599v1)|**[link](https://github.com/ldcq/ldcq)**|\n", "2309.06597": "|**2023-09-12**|**Rank2Tell: A Multimodal Driving Dataset for Joint Importance Ranking and Reasoning**|Enna Sachdeva et.al.|[2309.06597v1](http://arxiv.org/abs/2309.06597v1)|null|\n", "2309.06547": "|**2023-09-12**|**AmodalSynthDrive: A Synthetic Amodal Perception Dataset for Autonomous Driving**|Ahmed Rida Sekkat et.al.|[2309.06547v1](http://arxiv.org/abs/2309.06547v1)|null|\n", "2309.06517": "|**2023-09-12**|**Overview of Memotion 3: Sentiment and Emotion Analysis of Codemixed Hinglish Memes**|Shreyash Mishra et.al.|[2309.06517v1](http://arxiv.org/abs/2309.06517v1)|null|\n", "2309.06511": "|**2023-09-12**|**DF-TransFusion: Multimodal Deepfake Detection via Lip-Audio Cross-Attention and Facial Self-Attention**|Aaditya Kharel et.al.|[2309.06511v1](http://arxiv.org/abs/2309.06511v1)|null|\n", "2309.07915": "|**2023-09-14**|**MMICL: Empowering Vision-language Model with Multi-Modal In-Context Learning**|Haozhe Zhao et.al.|[2309.07915v1](http://arxiv.org/abs/2309.07915v1)|**[link](https://github.com/haozhezhao/mic)**|\n", "2309.07794": "|**2023-09-14**|**Improving Multimodal Classification of Social Media Posts by Leveraging Image-Text Auxiliary tasks**|Danae S\u00e1nchez Villegas et.al.|[2309.07794v1](http://arxiv.org/abs/2309.07794v1)|null|\n", "2309.07759": "|**2023-09-14**|**PROGrasp: Pragmatic Human-Robot Communication for Object Grasping**|Gi-Cheon Kang et.al.|[2309.07759v1](http://arxiv.org/abs/2309.07759v1)|null|\n", "2309.07623": "|**2023-09-14**|**SwitchGPT: Adapting Large Language Models for Non-Text Outputs**|Xinyu Wang et.al.|[2309.07623v1](http://arxiv.org/abs/2309.07623v1)|null|\n", "2309.07495": "|**2023-09-14**|**HDTR-Net: A Real-Time High-Definition Teeth Restoration Network for Arbitrary Talking Face Generation Methods**|Yongyuan Li et.al.|[2309.07495v1](http://arxiv.org/abs/2309.07495v1)|**[link](https://github.com/yylgoodlucky/hdtr)**|\n", "2309.07387": "|**2023-09-14**|**VDialogUE: A Unified Evaluation Benchmark for Visually-grounded Dialogue**|Yunshui Li et.al.|[2309.07387v1](http://arxiv.org/abs/2309.07387v1)|null|\n", "2309.07332": "|**2023-09-13**|**Reliability-based cleaning of noisy training labels with inductive conformal prediction in multi-modal biomedical data mining**|Xianghao Zhan et.al.|[2309.07332v1](http://arxiv.org/abs/2309.07332v1)|**[link](https://github.com/xzhan96-stf/icp_train_clean)**|\n", "2309.07297": "|**2023-09-13**|**Multi-Modal Hybrid Learning and Sequential Training for RGB-T Saliency Detection**|Guangyu Ren et.al.|[2309.07297v1](http://arxiv.org/abs/2309.07297v1)|null|\n", "2309.08531": "|**2023-09-15**|**Towards Practical and Efficient Image-to-Speech Captioning with Vision-Language Pre-training and Multi-modal Tokens**|Minsu Kim et.al.|[2309.08531v1](http://arxiv.org/abs/2309.08531v1)|null|\n", "2309.08508": "|**2023-09-15**|**MOSAIC: Learning Unified Multi-Sensory Object Property Representations for Robot Perception**|Gyan Tatiya et.al.|[2309.08508v1](http://arxiv.org/abs/2309.08508v1)|**[link](https://github.com/gtatiya/MOSAIC)**|\n", "2309.08229": "|**2023-09-15**|**Automated Multi-Drugs Administration During Total Intravenous Anesthesia Using Multi-Model Predictive Control**|Bob Aubouin-Pairault et.al.|[2309.08229v1](http://arxiv.org/abs/2309.08229v1)|**[link](https://github.com/bobaubouin/tiva_drug_control)**|\n", "2309.08204": "|**2023-09-15**|**One-stage Modality Distillation for Incomplete Multimodal Learning**|Shicai Wei et.al.|[2309.08204v1](http://arxiv.org/abs/2309.08204v1)|null|\n", "2309.08160": "|**2023-09-15**|**Cross-Modal Synthesis of Structural MRI and Functional Connectivity Networks via Conditional ViT-GANs**|Yuda Bi et.al.|[2309.08160v1](http://arxiv.org/abs/2309.08160v1)|null|\n", "2309.08154": "|**2023-09-15**|**Uncertainty-Aware Multi-View Visual Semantic Embedding**|Wenzhang Wei et.al.|[2309.08154v1](http://arxiv.org/abs/2309.08154v1)|null|\n", "2309.08096": "|**2023-09-15**|**GelSplitter: Tactile Reconstruction from Near Infrared and Visible Images**|Yuankai Lin et.al.|[2309.08096v1](http://arxiv.org/abs/2309.08096v1)|null|\n", "2309.08088": "|**2023-09-15**|**Interactive Model Fusion-Based GM-PHD Filter**|Jiacheng He et.al.|[2309.08088v1](http://arxiv.org/abs/2309.08088v1)|null|\n", "2309.08021": "|**2023-09-14**|**Vision-based Analysis of Driver Activity and Driving Performance Under the Influence of Alcohol**|Ross Greer et.al.|[2309.08021v1](http://arxiv.org/abs/2309.08021v1)|null|\n", "2309.09958": "|**2023-09-18**|**An Empirical Study of Scaling Instruct-Tuned Large Multimodal Models**|Yadong Lu et.al.|[2309.09958v1](http://arxiv.org/abs/2309.09958v1)|**[link](https://github.com/haotian-liu/LLaVA)**|\n", "2309.09875": "|**2023-09-18**|**RaLF: Flow-based Global and Metric Radar Localization in LiDAR Maps**|Abhijeet Nayak et.al.|[2309.09875v1](http://arxiv.org/abs/2309.09875v1)|null|\n", "2309.09867": "|**2023-09-18**|**EGFE: End-to-end Grouping of Fragmented Elements in UI Designs with Multimodal Learning**|Liuqing Chen et.al.|[2309.09867v1](http://arxiv.org/abs/2309.09867v1)|**[link](https://github.com/test2975/egfe)**|\n", "2309.09832": "|**2023-09-18**|**Task Selection and Assignment for Multi-modal Multi-task Dialogue Act Classification with Non-stationary Multi-armed Bandits**|Xiangheng He et.al.|[2309.09832v1](http://arxiv.org/abs/2309.09832v1)|null|\n", "2309.09667": "|**2023-09-18**|**Unified Frequency-Assisted Transformer Framework for Detecting and Grounding Multi-Modal Manipulation**|Huan Liu et.al.|[2309.09667v1](http://arxiv.org/abs/2309.09667v1)|null|\n", "2309.09646": "|**2023-09-18**|**Concurrent Haptic, Audio, and Visual Data Set During Bare Finger Interaction with Textured Surfaces**|Alexis W. M. Devillard et.al.|[2309.09646v1](http://arxiv.org/abs/2309.09646v1)|null|\n", "2309.09592": "|**2023-09-18**|**Multi-Semantic Fusion Model for Generalized Zero-Shot Skeleton-Based Action Recognition**|Ming-Zhe Li et.al.|[2309.09592v1](http://arxiv.org/abs/2309.09592v1)|**[link](https://github.com/EHZ9NIWI7/MSF-GZSSAR)**|\n", "2309.09513": "|**2023-09-18**|**Learning Parallax for Stereo Event-based Motion Deblurring**|Mingyuan Lin et.al.|[2309.09513v1](http://arxiv.org/abs/2309.09513v1)|null|\n", "2309.09501": "|**2023-09-18**|**Discovering Sounding Objects by Audio Queries for Audio Visual Segmentation**|Shaofei Huang et.al.|[2309.09501v1](http://arxiv.org/abs/2309.09501v1)|null|\n", "2309.09473": "|**2023-09-18**|**Self-supervised Multi-view Clustering in Computer Vision: A Survey**|Jiatai Wang et.al.|[2309.09473v1](http://arxiv.org/abs/2309.09473v1)|null|\n", "2309.09421": "|**2023-09-18**|**Unified Pretraining Target Based Video-music Retrieval With Music Rhythm And Video Optical Flow Information**|Tianjun Mao et.al.|[2309.09421v1](http://arxiv.org/abs/2309.09421v1)|null|\n", "2309.09246": "|**2023-09-17**|**Image-level supervision and self-training for transformer-based cross-modality tumor segmentation**|Malo de Boisredon et.al.|[2309.09246v1](http://arxiv.org/abs/2309.09246v1)|null|\n", "2309.09088": "|**2023-09-16**|**Enhancing GAN-Based Vocoders with Contrastive Learning Under Data-limited Condition**|Haoming Guo et.al.|[2309.09088v1](http://arxiv.org/abs/2309.09088v1)|null|\n", "2309.09067": "|**2023-09-19**|**MMST-ViT: Climate Change-aware Crop Yield Prediction via Multi-Modal Spatial-Temporal Vision Transformer**|Fudong Lin et.al.|[2309.09067v2](http://arxiv.org/abs/2309.09067v2)|**[link](https://github.com/fudong03/mmst-vit)**|\n", "2309.08966": "|**2023-09-16**|**FF-LOGO: Cross-Modality Point Cloud Registration with Feature Filtering and Local to Global Optimization**|Nan Ma et.al.|[2309.08966v1](http://arxiv.org/abs/2309.08966v1)|null|\n", "2309.10724": "|**2023-09-19**|**Sound Source Localization is All about Cross-Modal Alignment**|Arda Senocak et.al.|[2309.10724v1](http://arxiv.org/abs/2309.10724v1)|null|\n", "2309.10649": "|**2023-09-19**|**Cross-modal and Cross-domain Knowledge Transfer for Label-free 3D Segmentation**|Jingyu Zhang et.al.|[2309.10649v1](http://arxiv.org/abs/2309.10649v1)|null|\n", "2309.10606": "|**2023-09-19**|**A Novel Hybrid Algorithm for Optimized Solutions in Ocean Renewable Energy Industry: Enhancing Power Take-Off Parameters and Site Selection Procedure of Wave Energy Converters**|Hossein Mehdipour et.al.|[2309.10606v1](http://arxiv.org/abs/2309.10606v1)|null|\n", "2309.10537": "|**2023-09-19**|**FoleyGen: Visually-Guided Audio Generation**|Xinhao Mei et.al.|[2309.10537v1](http://arxiv.org/abs/2309.10537v1)|null|\n", "2309.10365": "|**2023-09-19**|**Testable Likelihoods for Beyond-the-Standard Model Fits**|Anja Beck et.al.|[2309.10365v1](http://arxiv.org/abs/2309.10365v1)|null|\n", "2309.10361": "|**2023-09-19**|**Improving CLIP Robustness with Knowledge Distillation and Self-Training**|Clement Laroudie et.al.|[2309.10361v1](http://arxiv.org/abs/2309.10361v1)|null|\n", "2309.10283": "|**2023-09-19**|**FRAMU: Attention-based Machine Unlearning using Federated Reinforcement Learning**|Thanveer Shaik et.al.|[2309.10283v1](http://arxiv.org/abs/2309.10283v1)|null|\n", "2309.10244": "|**2023-09-19**|**UPL-SFDA: Uncertainty-aware Pseudo Label Guided Source-Free Domain Adaptation for Medical Image Segmentation**|Jianghao Wu et.al.|[2309.10244v1](http://arxiv.org/abs/2309.10244v1)|**[link](https://github.com/hilab-git/upl-sfda)**|\n", "2309.10195": "|**2023-09-20**|**Multi-modality Meets Re-learning: Mitigating Negative Transfer in Sequential Recommendation**|Bo Peng et.al.|[2309.10195v2](http://arxiv.org/abs/2309.10195v2)|null|\n", "2309.10091": "|**2023-09-18**|**Unified Coarse-to-Fine Alignment for Video-Text Retrieval**|Ziyang Wang et.al.|[2309.10091v1](http://arxiv.org/abs/2309.10091v1)|**[link](https://github.com/ziyang412/ucofia)**|\n", "2309.10077": "|**2023-09-18**|**GAME: Generalized deep learning model towards multimodal data integration for early screening of adolescent mental disorders**|Zhicheng Du et.al.|[2309.10077v1](http://arxiv.org/abs/2309.10077v1)|null|\n", "2309.11335": "|**2023-09-20**|**2D-3D Pose Tracking with Multi-View Constraints**|Huai Yu et.al.|[2309.11335v1](http://arxiv.org/abs/2309.11335v1)|null|\n", "2309.11119": "|**2023-09-21**|**BroadBEV: Collaborative LiDAR-camera Fusion for Broad-sighted Bird's Eye View Map Construction**|Minsu Kim et.al.|[2309.11119v2](http://arxiv.org/abs/2309.11119v2)|null|\n", "2309.11082": "|**2023-09-20**|**Dual-Modal Attention-Enhanced Text-Video Retrieval with Triplet Partial Margin Contrastive Learning**|Chen Jiang et.al.|[2309.11082v1](http://arxiv.org/abs/2309.11082v1)|null|\n", "2309.11081": "|**2023-09-20**|**Dense 2D-3D Indoor Prediction with Sound via Aligned Cross-Modal Distillation**|Heeseung Yun et.al.|[2309.11081v1](http://arxiv.org/abs/2309.11081v1)|**[link](https://github.com/hs-yn/daps)**|\n", "2309.12314": "|**2023-09-21**|**TinyCLIP: CLIP Distillation via Affinity Mimicking and Weight Inheritance**|Kan Wu et.al.|[2309.12314v1](http://arxiv.org/abs/2309.12314v1)|**[link](https://github.com/microsoft/Cream/tree/main/TinyCLIP)**|\n", "2309.12224": "|**2023-09-21**|**Towards Answering Health-related Questions from Medical Videos: Datasets and Approaches**|Deepak Gupta et.al.|[2309.12224v1](http://arxiv.org/abs/2309.12224v1)|null|\n", "2309.12158": "|**2023-09-21**|**Towards Robust and Truly Large-Scale Audio-Sheet Music Retrieval**|Luis Carvalho et.al.|[2309.12158v1](http://arxiv.org/abs/2309.12158v1)|null|\n", "2309.12134": "|**2023-09-21**|**Self-Supervised Contrastive Learning for Robust Audio-Sheet Music Retrieval Systems**|Luis Carvalho et.al.|[2309.12134v1](http://arxiv.org/abs/2309.12134v1)|null|\n", "2309.12111": "|**2023-09-21**|**Passage Summarization with Recurrent Models for Audio-Sheet Music Retrieval**|Luis Carvalho et.al.|[2309.12111v1](http://arxiv.org/abs/2309.12111v1)|null|\n", "2309.12110": "|**2023-09-21**|**Exploiting CLIP-based Multi-modal Approach for Artwork Classification and Retrieval**|Alberto Baldrati et.al.|[2309.12110v1](http://arxiv.org/abs/2309.12110v1)|null|\n", "2309.12030": "|**2023-09-21**|**CAMERA: A Multimodal Dataset and Benchmark for Ad Text Generation**|Masato Mita et.al.|[2309.12030v1](http://arxiv.org/abs/2309.12030v1)|**[link](https://github.com/cyberagentailab/camera)**|\n", "2309.12009": "|**2023-09-21**|**Elevating Skeleton-Based Action Recognition with Efficient Multi-Modality Self-Supervision**|Yiping Wei et.al.|[2309.12009v1](http://arxiv.org/abs/2309.12009v1)|**[link](https://github.com/desehuileng0o0/ikem)**|\n", "2309.11933": "|**2023-09-21**|**Fully Transformer-Equipped Architecture for End-to-End Referring Video Object Segmentation**|Ping Li et.al.|[2309.11933v1](http://arxiv.org/abs/2309.11933v1)|null|\n", "2309.11923": "|**2023-09-21**|**TextCLIP: Text-Guided Face Image Generation And Manipulation Without Adversarial Training**|Xiaozhou You et.al.|[2309.11923v1](http://arxiv.org/abs/2309.11923v1)|null|\n", "2309.11860": "|**2023-09-21**|**QUEST: An Efficient Query Evaluation Scheme Towards Scan-Intensive Cross-Model Analysis**|Jianfeng Huang et.al.|[2309.11860v1](http://arxiv.org/abs/2309.11860v1)|null|\n", "2309.11845": "|**2023-09-21**|**TMac: Temporal Multi-Modal Graph Learning for Acoustic Event Classification**|Meng Liu et.al.|[2309.11845v1](http://arxiv.org/abs/2309.11845v1)|**[link](https://github.com/mgithubl/tmac)**|\n", "2309.11839": "|**2023-09-21**|**MoPA: Multi-Modal Prior Aided Domain Adaptation for 3D Semantic Segmentation**|Haozhi Cao et.al.|[2309.11839v1](http://arxiv.org/abs/2309.11839v1)|null|\n", "2309.11837": "|**2023-09-21**|**Stellar model calibrations with the Ai Phe binary system. Open questions about the robustness of the fit**|G. Valle et.al.|[2309.11837v1](http://arxiv.org/abs/2309.11837v1)|null|\n", "2309.11755": "|**2023-09-21**|**2DDATA: 2D Detection Annotations Transmittable Aggregation for Semantic Segmentation on Point Cloud**|Guan-Cheng Lee et.al.|[2309.11755v1](http://arxiv.org/abs/2309.11755v1)|null|\n", "2309.13007": "|**2023-09-22**|**ReConcile: Round-Table Conference Improves Reasoning via Consensus among Diverse LLMs**|Justin Chih-Yao Chen et.al.|[2309.13007v1](http://arxiv.org/abs/2309.13007v1)|**[link](https://github.com/dinobby/reconcile)**|\n", "2309.12865": "|**2023-09-22**|**Bridging Sensor Gaps via Single-Direction Tuning for Hyperspectral Image Classification**|Xizhe Xue et.al.|[2309.12865v1](http://arxiv.org/abs/2309.12865v1)|**[link](https://github.com/cecilia-xue/hyt-nas)**|\n", "2309.12855": "|**2023-09-22**|**Cross-Modal Translation and Alignment for Survival Analysis**|Fengtao Zhou et.al.|[2309.12855v1](http://arxiv.org/abs/2309.12855v1)|**[link](https://github.com/ft-zhou-zzz/cmta)**|\n", "2309.12764": "|**2023-09-22**|**Multi-Modal Embeddings for Isolating Cross-Platform Coordinated Information Campaigns on Social Media**|Fabio Barbero et.al.|[2309.12764v1](http://arxiv.org/abs/2309.12764v1)|null|\n", "2309.12657": "|**2023-09-22**|**Exploiting Modality-Specific Features For Multi-Modal Manipulation Detection And Grounding**|Jiazhen Wang et.al.|[2309.12657v1](http://arxiv.org/abs/2309.12657v1)|null|\n", "2309.12572": "|**2023-09-22**|**Interpretable 3D Multi-Modal Residual Convolutional Neural Network for Mild Traumatic Brain Injury Diagnosis**|Hanem Ellethy et.al.|[2309.12572v1](http://arxiv.org/abs/2309.12572v1)|null|\n", "2309.14327": "|**2023-09-25**|**DeepSpeed-VisualChat: Multi-Round Multi-Image Interleave Chat via Multi-Modal Causal Attention**|Zhewei Yao et.al.|[2309.14327v1](http://arxiv.org/abs/2309.14327v1)|**[link](https://github.com/microsoft/deepspeedexamples)**|\n", "2309.14320": "|**2023-09-25**|**MUTEX: Learning Unified Policies from Multimodal Task Specifications**|Rutav Shah et.al.|[2309.14320v1](http://arxiv.org/abs/2309.14320v1)|null|\n", "2309.14203": "|**2023-09-25**|**Detecting and Grounding Multi-Modal Media Manipulation and Beyond**|Rui Shao et.al.|[2309.14203v1](http://arxiv.org/abs/2309.14203v1)|**[link](https://github.com/rshaojimmy/multimodal-deepfake)**|\n", "2309.14183": "|**2023-09-26**|**Species196: A One-Million Semi-supervised Dataset for Fine-grained Species Recognition**|Wei He et.al.|[2309.14183v2](http://arxiv.org/abs/2309.14183v2)|**[link](https://github.com/Species-Dataset/species-dataset.github.io)**|\n", "2309.14181": "|**2023-09-25**|**Q-Bench: A Benchmark for General-Purpose Foundation Models on Low-level Vision**|Haoning Wu et.al.|[2309.14181v1](http://arxiv.org/abs/2309.14181v1)|**[link](https://github.com/VQAssessment/Q-Bench)**|\n", "2309.14065": "|**2023-09-26**|**AsymFormer: Asymmetrical Cross-Modal Representation Learning for Mobile Platform Real-Time RGB-D Semantic Segmentation**|Siqi Du et.al.|[2309.14065v2](http://arxiv.org/abs/2309.14065v2)|**[link](https://github.com/Fourier7754/AsymFormer)**|\n", "2309.14050": "|**2023-09-26**|**NNgTL: Neural Network Guided Optimal Temporal Logic Task Planning for Mobile Robots**|Ruijia Liu et.al.|[2309.14050v2](http://arxiv.org/abs/2309.14050v2)|null|\n", "2309.14003": "|**2023-09-25**|**Hierarchical Imitation Learning for Stochastic Environments**|Maximilian Igl et.al.|[2309.14003v1](http://arxiv.org/abs/2309.14003v1)|null|\n", "2309.13770": "|**2023-09-24**|**Devil in the Number: Towards Robust Multi-modality Data Filter**|Yichen Xu et.al.|[2309.13770v1](http://arxiv.org/abs/2309.13770v1)|null|\n", "2309.13650": "|**2023-09-24**|**Cross-modal Alignment with Optimal Transport for CTC-based ASR**|Xugang Lu et.al.|[2309.13650v1](http://arxiv.org/abs/2309.13650v1)|null|\n", "2309.13554": "|**2023-09-24**|**A Novel Stochastic Interacting Particle-Field Algorithm for 3D Parabolic-Parabolic Keller-Segel Chemotaxis System**|Zhongjian Wang et.al.|[2309.13554v1](http://arxiv.org/abs/2309.13554v1)|null|\n", "2309.13504": "|**2023-09-23**|**Attention Is All You Need For Blind Room Volume Estimation**|Chunxi Wang et.al.|[2309.13504v1](http://arxiv.org/abs/2309.13504v1)|null|\n", "2309.13470": "|**2023-09-23**|**HAVE-Net: Hallucinated Audio-Visual Embeddings for Few-Shot Classification with Unimodal Cues**|Ankit Jha et.al.|[2309.13470v1](http://arxiv.org/abs/2309.13470v1)|null|\n", "2309.13322": "|**2023-09-23**|**From Text to Source: Results in Detecting Large Language Model-Generated Content**|Wissam Antoun et.al.|[2309.13322v1](http://arxiv.org/abs/2309.13322v1)|null|\n", "2309.13266": "|**2023-09-23**|**Robust Navigation with Cross-Modal Fusion and Knowledge Transfer**|Wenzhe Cai et.al.|[2309.13266v1](http://arxiv.org/abs/2309.13266v1)|**[link](https://github.com/wzcai99/Distill-Navigator)**|\n", "2309.15117": "|**2023-09-26**|**Generating Visual Scenes from Touch**|Fengyu Yang et.al.|[2309.15117v1](http://arxiv.org/abs/2309.15117v1)|null|\n", "2309.15112": "|**2023-09-27**|**InternLM-XComposer: A Vision-Language Large Model for Advanced Text-image Comprehension and Composition**|Pan Zhang et.al.|[2309.15112v2](http://arxiv.org/abs/2309.15112v2)|**[link](https://github.com/internlm/internlm-xcomposer)**|\n", "2309.15109": "|**2023-09-26**|**DistillBEV: Boosting Multi-Camera 3D Object Detection with Cross-Modal Knowledge Distillation**|Zeyu Wang et.al.|[2309.15109v1](http://arxiv.org/abs/2309.15109v1)|**[link](https://github.com/qcraftai/distill-bev)**|\n", "2309.15082": "|**2023-09-26**|**RPEFlow: Multimodal Fusion of RGB-PointCloud-Event for Joint Optical Flow and Scene Flow Estimation**|Zhexiong Wan et.al.|[2309.15082v1](http://arxiv.org/abs/2309.15082v1)|**[link](https://github.com/danqu130/RPEFlow)**|\n", "2309.14704": "|**2023-09-26**|**Tile Classification Based Viewport Prediction with Multi-modal Fusion Transformer**|Zhihao Zhang et.al.|[2309.14704v1](http://arxiv.org/abs/2309.14704v1)|null|\n", "2309.14673": "|**2023-09-26**|**ALEX: Towards Effective Graph Transfer Learning with Noisy Labels**|Jingyang Yuan et.al.|[2309.14673v1](http://arxiv.org/abs/2309.14673v1)|null|\n", "2309.14611": "|**2023-09-26**|**Event Stream-based Visual Object Tracking: A High-Resolution Benchmark Dataset and A Novel Baseline**|Xiao Wang et.al.|[2309.14611v1](http://arxiv.org/abs/2309.14611v1)|**[link](https://github.com/event-ahu/eventvot_benchmark)**|\n", "2309.14580": "|**2023-09-26**|**CWCL: Cross-Modal Transfer with Continuously Weighted Contrastive Loss**|Rakshith Sharma Srinivasa et.al.|[2309.14580v1](http://arxiv.org/abs/2309.14580v1)|null|\n", "2309.14516": "|**2023-09-25**|**UniBEV: Multi-modal 3D Object Detection with Uniform BEV Encoders for Robustness against Missing Sensor Modalities**|Shiming Wang et.al.|[2309.14516v1](http://arxiv.org/abs/2309.14516v1)|null|\n", "2309.14491": "|**2023-09-25**|**Unsupervised 3D Perception with 2D Vision-Language Distillation for Autonomous Driving**|Mahyar Najibi et.al.|[2309.14491v1](http://arxiv.org/abs/2309.14491v1)|null|\n", "2309.15826": "|**2023-09-27**|**Cross-Modal Multi-Tasking for Speech-to-Text Translation via Hard Parameter Sharing**|Brian Yan et.al.|[2309.15826v1](http://arxiv.org/abs/2309.15826v1)|null|\n", "2309.15751": "|**2023-09-27**|**InfraParis: A multi-modal and multi-task autonomous driving dataset**|Gianni Franchi et.al.|[2309.15751v1](http://arxiv.org/abs/2309.15751v1)|null|\n", "2309.15739": "|**2023-09-27**|**Experience and Evidence are the eyes of an excellent summarizer! Towards Knowledge Infused Multi-modal Clinical Conversation Summarization**|Abhisek Tiwari et.al.|[2309.15739v1](http://arxiv.org/abs/2309.15739v1)|**[link](https://github.com/nlp-rl/mm-cliconsummation)**|\n", "2309.15683": "|**2023-09-27**|**End-to-End Streaming Video Temporal Action Segmentation with Reinforce Learning**|Wujun Wen et.al.|[2309.15683v1](http://arxiv.org/abs/2309.15683v1)|**[link](https://github.com/Thinksky5124/SVTAS)**|\n", "2309.15599": "|**2023-09-27**|**OceanBench: The Sea Surface Height Edition**|J. Emmanuel Johnson et.al.|[2309.15599v1](http://arxiv.org/abs/2309.15599v1)|**[link](https://github.com/jejjohnson/oceanbench)**|\n", "2309.15529": "|**2023-09-27**|**Missing-modality Enabled Multi-modal Fusion Architecture for Medical Data**|Muyu Wang et.al.|[2309.15529v1](http://arxiv.org/abs/2309.15529v1)|null|\n", "2309.15427": "|**2023-09-27**|**Graph Neural Prompting with Large Language Models**|Yijun Tian et.al.|[2309.15427v1](http://arxiv.org/abs/2309.15427v1)|null|\n", "2309.15402": "|**2023-09-27**|**A Survey of Chain of Thought Reasoning: Advances, Frontiers and Future**|Zheng Chu et.al.|[2309.15402v1](http://arxiv.org/abs/2309.15402v1)|**[link](https://github.com/zchuz/cot-reasoning-survey)**|\n", "2309.15390": "|**2023-09-27**|**MINS: Efficient and Robust Multisensor-aided Inertial Navigation System**|Woosik Lee et.al.|[2309.15390v1](http://arxiv.org/abs/2309.15390v1)|**[link](https://github.com/rpng/mins)**|\n", "2309.15313": "|**2023-09-26**|**M$^{3}$3D: Learning 3D priors using Multi-Modal Masked Autoencoders for 2D image and video understanding**|Muhammad Abdullah Jamal et.al.|[2309.15313v1](http://arxiv.org/abs/2309.15313v1)|null|\n", "2309.15302": "|**2023-09-26**|**Self-Supervised Terrain Representation Learning from Unconstrained Robot Experience**|Haresh Karnan et.al.|[2309.15302v1](http://arxiv.org/abs/2309.15302v1)|null|\n", "2309.15283": "|**2023-09-26**|**Multi-Modal Planning on Regrasping for Stable Manipulation**|Jiaming Hu et.al.|[2309.15283v1](http://arxiv.org/abs/2309.15283v1)|null|\n", "2309.16592": "|**2023-09-28**|**Tensor Factorization for Leveraging Cross-Modal Knowledge in Data-Constrained Infrared Object Detection**|Manish Sharma et.al.|[2309.16592v1](http://arxiv.org/abs/2309.16592v1)|null|\n", "2309.16569": "|**2023-09-28**|**Audio-Visual Speaker Verification via Joint Cross-Attention**|R. Gnana Praveen et.al.|[2309.16569v1](http://arxiv.org/abs/2309.16569v1)|null|\n", "2309.16283": "|**2023-09-28**|**Self-supervised Cross-view Representation Reconstruction for Change Captioning**|Yunbin Tu et.al.|[2309.16283v1](http://arxiv.org/abs/2309.16283v1)|null|\n", "2309.16211": "|**2023-09-28**|**VDC: Versatile Data Cleanser for Detecting Dirty Samples via Visual-Linguistic Inconsistency**|Zihao Zhu et.al.|[2309.16211v1](http://arxiv.org/abs/2309.16211v1)|null|\n", "2309.16206": "|**2023-09-28**|**Cross-Modal Transformer GAN: Brain Structural-Functional Deep Fusing Network for Alzheimer's Disease Analysis**|Qiankun Zuo et.al.|[2309.16206v1](http://arxiv.org/abs/2309.16206v1)|null|\n", "2309.16203": "|**2023-09-28**|**The Cloud Strikes Back: Investigating the Decentralization of IPFS**|Leonhard Balduf et.al.|[2309.16203v1](http://arxiv.org/abs/2309.16203v1)|null|\n", "2309.16141": "|**2023-09-28**|**Align before Search: Aligning Ads Image to Text for Accurate Cross-Modal Sponsored Search**|Yuanmin Tang et.al.|[2309.16141v1](http://arxiv.org/abs/2309.16141v1)|**[link](https://github.com/pter61/aligncmss)**|\n", "2309.16093": "|**2023-09-28**|**Hierarchical Cross-Modality Knowledge Transfer with Sinkhorn Attention for CTC-based ASR**|Xugang Lu et.al.|[2309.16093v1](http://arxiv.org/abs/2309.16093v1)|null|\n", "2309.15954": "|**2023-09-27**|**The Devil is in the Details: A Deep Dive into the Rabbit Hole of Data Filtering**|Haichao Yu et.al.|[2309.15954v1](http://arxiv.org/abs/2309.15954v1)|null|\n", "2309.15915": "|**2023-09-27**|**Zero-Shot and Few-Shot Video Question Answering with Multi-Modal Prompts**|Deniz Engin et.al.|[2309.15915v1](http://arxiv.org/abs/2309.15915v1)|**[link](https://github.com/engindeniz/vitis)**|\n", "2309.17395": "|**2023-09-29**|**AV-CPL: Continuous Pseudo-Labeling for Audio-Visual Speech Recognition**|Andrew Rouditchenko et.al.|[2309.17395v1](http://arxiv.org/abs/2309.17395v1)|null|\n", "2309.17336": "|**2023-09-29**|**See Beyond Seeing: Robust 3D Object Detection from Point Clouds via Cross-Modal Hallucination**|Jianning Deng et.al.|[2309.17336v1](http://arxiv.org/abs/2309.17336v1)|null|\n", "2309.17264": "|**2023-09-29**|**A Foundation Model for General Moving Object Segmentation in Medical Images**|Zhongnuo Yan et.al.|[2309.17264v1](http://arxiv.org/abs/2309.17264v1)|null|\n", "2309.17239": "|**2023-09-29**|**EGVD: Event-Guided Video Deraining**|Yueyi Zhang et.al.|[2309.17239v1](http://arxiv.org/abs/2309.17239v1)|**[link](https://github.com/booker-max/egvd)**|\n", "2309.17175": "|**2023-09-29**|**TextField3D: Towards Enhancing Open-Vocabulary 3D Generation with Noisy Text Fields**|Tianyu Huang et.al.|[2309.17175v1](http://arxiv.org/abs/2309.17175v1)|null|\n", "2309.17133": "|**2023-09-29**|**Fine-grained Late-interaction Multi-modal Retrieval for Retrieval Augmented Visual Question Answering**|Weizhe Lin et.al.|[2309.17133v1](http://arxiv.org/abs/2309.17133v1)|**[link](https://github.com/linweizhedragon/retrieval-augmented-visual-question-answering)**|\n", "2309.17104": "|**2023-10-03**|**Prototype-guided Cross-modal Completion and Alignment for Incomplete Text-based Person Re-identification**|Tiantian Gong et.al.|[2309.17104v2](http://arxiv.org/abs/2309.17104v2)|null|\n", "2309.17102": "|**2023-09-29**|**Guiding Instruction-based Image Editing via Multimodal Large Language Models**|Tsu-Jui Fu et.al.|[2309.17102v1](http://arxiv.org/abs/2309.17102v1)|**[link](https://github.com/tsujuifu/pytorch_mgie)**|\n", "2309.17093": "|**2023-09-29**|**Prototype-based Aleatoric Uncertainty Quantification for Cross-modal Retrieval**|Hao Li et.al.|[2309.17093v1](http://arxiv.org/abs/2309.17093v1)|**[link](https://github.com/leolee99/pau)**|\n", "2309.17037": "|**2023-09-29**|**Beyond Co-occurrence: Multi-modal Session-based Recommendation**|Xiaokun Zhang et.al.|[2309.17037v1](http://arxiv.org/abs/2309.17037v1)|**[link](https://github.com/zhang-xiaokun/mmsbr)**|\n", "2309.16984": "|**2023-09-29**|**Consistency Models as a Rich and Efficient Policy Class for Reinforcement Learning**|Zihan Ding et.al.|[2309.16984v1](http://arxiv.org/abs/2309.16984v1)|null|\n", "2309.16949": "|**2023-09-29**|**CrossZoom: Simultaneously Motion Deblurring and Event Super-Resolving**|Chi Zhang et.al.|[2309.16949v1](http://arxiv.org/abs/2309.16949v1)|**[link](https://github.com/bestrivenzc/CZ-Net)**|\n", "2309.16830": "|**2023-09-28**|**Robust Safe Control with Multi-Modal Uncertainty**|Tianhao Wei et.al.|[2309.16830v1](http://arxiv.org/abs/2309.16830v1)|null|\n", "2309.16818": "|**2023-09-28**|**MEM: Multi-Modal Elevation Mapping for Robotics and Learning**|Gian Erni et.al.|[2309.16818v1](http://arxiv.org/abs/2309.16818v1)|**[link](https://github.com/leggedrobotics/elevation_mapping_cupy)**|\n", "2309.16772": "|**2023-10-02**|**XVO: Generalized Visual Odometry via Cross-Modal Self-Training**|Lei Lai et.al.|[2309.16772v2](http://arxiv.org/abs/2309.16772v2)|null|\n", "2310.02071": "|**2023-10-03**|**Towards End-to-End Embodied Decision Making via Multi-modal Large Language Model: Explorations with GPT4-Vision and Beyond**|Liang Chen et.al.|[2310.02071v1](http://arxiv.org/abs/2310.02071v1)|**[link](https://github.com/pkunlp-icler/pca-eval)**|\n", "2310.02050": "|**2023-10-03**|**Tuning Large language model for End-to-end Speech Translation**|Hao Zhang et.al.|[2310.02050v1](http://arxiv.org/abs/2310.02050v1)|null|\n", "2310.01852": "|**2023-10-04**|**LanguageBind: Extending Video-Language Pretraining to N-modality by Language-based Semantic Alignment**|Bin Zhu et.al.|[2310.01852v2](http://arxiv.org/abs/2310.01852v2)|**[link](https://github.com/pku-yuangroup/languagebind)**|\n", "2310.01733": "|**2023-10-03**|**Health Guardian: Using Multi-modal Data to Understand Individual Health**|Vince S. Siu et.al.|[2310.01733v1](http://arxiv.org/abs/2310.01733v1)|null|\n", "2310.01358": "|**2023-10-02**|**NEUCORE: Neural Concept Reasoning for Composed Image Retrieval**|Shu Zhao et.al.|[2310.01358v1](http://arxiv.org/abs/2310.01358v1)|null|\n", "2310.01351": "|**2023-10-02**|**Streaming Motion Forecasting for Autonomous Driving**|Ziqi Pang et.al.|[2310.01351v1](http://arxiv.org/abs/2310.01351v1)|**[link](https://github.com/ziqipang/streamingforecasting)**|\n", "2310.01330": "|**2023-10-02**|**Towards reporting bias in visual-language datasets: bimodal augmentation by decoupling object-attribute association**|Qiyu Wu et.al.|[2310.01330v1](http://arxiv.org/abs/2310.01330v1)|null|\n", "2310.01286": "|**2023-10-02**|**A Dynamic Macroscopic Framework for Pricing of Ride-hailing Services with an Optional Bus Lane Access for Pool Vehicles**|Lynn Fayed et.al.|[2310.01286v1](http://arxiv.org/abs/2310.01286v1)|null|\n", "2310.01232": "|**2023-10-02**|**Modality-aware Transformer for Time series Forecasting**|Hajar Emami et.al.|[2310.01232v1](http://arxiv.org/abs/2310.01232v1)|null|\n", "2310.01035": "|**2023-10-02**|**Learnable Cross-modal Knowledge Distillation for Multi-modal Learning with Missing Modality**|Hu Wang et.al.|[2310.01035v1](http://arxiv.org/abs/2310.01035v1)|null|\n", "2310.00927": "|**2023-10-02**|**Understanding Transferable Representation Learning and Zero-shot Transfer in CLIP**|Zixiang Chen et.al.|[2310.00927v1](http://arxiv.org/abs/2310.00927v1)|null|\n", "2310.00862": "|**2023-10-02**|**Shack-Hartmann wavefront sensing: A new approach to time-resolved measurement of stress intensity during dynamic fracture of small brittle specimens**|Liuchi Li et.al.|[2310.00862v1](http://arxiv.org/abs/2310.00862v1)|null|\n", "2310.00745": "|**2023-10-01**|**Deterministic Langevin Unconstrained Optimization with Normalizing Flows**|James M. Sullivan et.al.|[2310.00745v1](http://arxiv.org/abs/2310.00745v1)|null|\n", "2310.00740": "|**2023-10-01**|**Top-down Green-ups: Satellite Sensing and Deep Models to Predict Buffelgrass Phenology**|Lucas Rosenblatt et.al.|[2310.00740v1](http://arxiv.org/abs/2310.00740v1)|**[link](https://github.com/lurosenb/phenology_projects)**|\n", "2310.00672": "|**2023-10-01**|**GeRA: Label-Efficient Geometrically Regularized Alignment**|Dustin Klebe et.al.|[2310.00672v1](http://arxiv.org/abs/2310.00672v1)|null|\n", "2310.03024": "|**2023-10-04**|**AstroCLIP: Cross-Modal Pre-Training for Astronomical Foundation Models**|Francois Lanusse et.al.|[2310.03024v1](http://arxiv.org/abs/2310.03024v1)|**[link](https://github.com/PolymathicAI/AstroCLIP)**|\n", "2310.02960": "|**2023-10-04**|**CoDA: Collaborative Novel Box Discovery and Cross-modal Alignment for Open-vocabulary 3D Object Detection**|Yang Cao et.al.|[2310.02960v1](http://arxiv.org/abs/2310.02960v1)|**[link](https://github.com/yangcaoai/CoDA_NeurIPS2023)**|\n", "2310.02821": "|**2023-10-04**|**Improving Vision Anomaly Detection with the Guidance of Language Modality**|Dong Chen et.al.|[2310.02821v1](http://arxiv.org/abs/2310.02821v1)|**[link](https://github.com/Anfeather/CMG)**|\n", "2310.02777": "|**2023-10-04**|**The Role of Linguistic Priors in Measuring Compositional Generalization of Vision-Language Models**|Chenwei Wu et.al.|[2310.02777v1](http://arxiv.org/abs/2310.02777v1)|null|\n", "2310.02690": "|**2023-10-04**|**Multi-Dimension-Embedding-Aware Modality Fusion Transformer for Psychiatric Disorder Clasification**|Guoxin Wang et.al.|[2310.02690v1](http://arxiv.org/abs/2310.02690v1)|null|\n", "2310.02663": "|**2023-10-04**|**MedPrompt: Cross-Modal Prompting for Multi-Task Medical Image Translation**|Xuhang Chen et.al.|[2310.02663v1](http://arxiv.org/abs/2310.02663v1)|null|\n", "2310.02569": "|**2023-10-04**|**ReForm-Eval: Evaluating Large Vision Language Models via Unified Re-Formulation of Task-Oriented Benchmarks**|Zejun Li et.al.|[2310.02569v1](http://arxiv.org/abs/2310.02569v1)|**[link](https://github.com/fudandisc/reform-eval)**|\n", "2310.02561": "|**2023-10-04**|**Integrated Sensing and Communications towards Proactive Beamforming in mmWave V2I via Multi-Modal Feature Fusion (MMFF)**|Haotian Zhang et.al.|[2310.02561v1](http://arxiv.org/abs/2310.02561v1)|null|\n", "2310.02528": "|**2023-10-04**|**On the Cognition of Visual Question Answering Models and Human Intelligence: A Comparative Study**|Liben Chen et.al.|[2310.02528v1](http://arxiv.org/abs/2310.02528v1)|null|\n", "2310.02361": "|**2023-10-03**|**Event-Enhanced Multi-Modal Spiking Neural Network for Dynamic Obstacle Avoidance**|Yang Wang et.al.|[2310.02361v1](http://arxiv.org/abs/2310.02361v1)|null|\n", "2310.03744": "|**2023-10-05**|**Improved Baselines with Visual Instruction Tuning**|Haotian Liu et.al.|[2310.03744v1](http://arxiv.org/abs/2310.03744v1)|null|\n", "2310.03724": "|**2023-10-05**|**Modular Speech-to-Text Translation for Zero-Shot Cross-Modal Transfer**|Paul-Ambroise Duquenne et.al.|[2310.03724v1](http://arxiv.org/abs/2310.03724v1)|null|\n", "2310.03485": "|**2023-10-07**|**BTDNet: a Multi-Modal Approach for Brain Tumor Radiogenomic Classification**|Dimitrios Kollias et.al.|[2310.03485v2](http://arxiv.org/abs/2310.03485v2)|null|\n", "2310.03420": "|**2023-10-05**|**FreeReg: Image-to-Point Cloud Registration Leveraging Pretrained Diffusion Models and Monocular Depth Estimators**|Haiping Wang et.al.|[2310.03420v1](http://arxiv.org/abs/2310.03420v1)|**[link](https://github.com/WHU-USI3DV/FreeReg)**|\n", "2310.03333": "|**2023-10-05**|**Real-time Multi-modal Object Detection and Tracking on Edge for Regulatory Compliance Monitoring**|Jia Syuen Lim et.al.|[2310.03333v1](http://arxiv.org/abs/2310.03333v1)|null|\n", "2310.03320": "|**2023-10-05**|**BioBridge: Bridging Biomedical Foundation Models via Knowledge Graph**|Zifeng Wang et.al.|[2310.03320v1](http://arxiv.org/abs/2310.03320v1)|null|\n", "2310.03221": "|**2023-10-05**|**Know2BIO: A Comprehensive Dual-View Benchmark for Evolving Biomedical Knowledge Graphs**|Yijia Xiao et.al.|[2310.03221v1](http://arxiv.org/abs/2310.03221v1)|**[link](https://github.com/yijia-xiao/know2bio)**|\n", "2310.03218": "|**2023-10-05**|**Learning Energy-Based Prior Model with Diffusion-Amortized MCMC**|Peiyu Yu et.al.|[2310.03218v1](http://arxiv.org/abs/2310.03218v1)|**[link](https://github.com/yupeiyu98/diffusion-amortized-mcmc)**|\n", "2310.03140": "|**2023-10-04**|**ViFiT: Reconstructing Vision Trajectories from IMU and Wi-Fi Fine Time Measurements**|Bryan Bo Cao et.al.|[2310.03140v1](http://arxiv.org/abs/2310.03140v1)|**[link](https://github.com/bryanbocao/vifit)**|\n", "2310.03111": "|**2023-10-04**|**Multi-modal Gaussian Process Variational Autoencoders for Neural and Behavioral Data**|Rabia Gondur et.al.|[2310.03111v1](http://arxiv.org/abs/2310.03111v1)|null|\n", "2310.03059": "|**2023-10-04**|**Point-PEFT: Parameter-Efficient Fine-Tuning for 3D Pre-trained Models**|Ivan Tang et.al.|[2310.03059v1](http://arxiv.org/abs/2310.03059v1)|**[link](https://github.com/EvenJoker/Point-PEFT)**|\n", "2310.04122": "|**2023-10-06**|**VI-Diff: Unpaired Visible-Infrared Translation Diffusion Model for Single Modality Labeled Visible-Infrared Person Re-identification**|Han Huang et.al.|[2310.04122v1](http://arxiv.org/abs/2310.04122v1)|null|\n", "2310.03958": "|**2023-10-06**|**The \"Seen but Unnoticed\" Vocabulary of Natural Touch: Revolutionizing Direct Interaction with Our Devices and One Another (UIST 2021 Vision)**|Ken Hinckley et.al.|[2310.03958v1](http://arxiv.org/abs/2310.03958v1)|null|\n", "2310.05863": "|**2023-10-10**|**Fine-grained Audio-Visual Joint Representations for Multimodal Large Language Models**|Guangzhi Sun et.al.|[2310.05863v2](http://arxiv.org/abs/2310.05863v2)|**[link](https://github.com/briansidp/audiovisualllm)**|\n", "2310.05628": "|**2023-10-09**|**Glitter or Gold? Deriving Structured Insights from Sustainability Reports via Large Language Models**|Marco Bronzini et.al.|[2310.05628v1](http://arxiv.org/abs/2310.05628v1)|**[link](https://github.com/saturnmars/derivingstructuredinsightsfromsustainabilityreportsvialargelanguagemodels)**|\n", "2310.05608": "|**2023-10-09**|**FlexKnot and Gaussian Process for 21 cm global signal analysis and foreground separation**|Stefan Heimersheim et.al.|[2310.05608v1](http://arxiv.org/abs/2310.05608v1)|null|\n", "2310.05572": "|**2023-10-09**|**A Simple and Robust Framework for Cross-Modality Medical Image Segmentation applied to Vision Transformers**|Matteo Bastico et.al.|[2310.05572v1](http://arxiv.org/abs/2310.05572v1)|**[link](https://github.com/matteo-bastico/mi-seg)**|\n", "2310.05462": "|**2023-10-09**|**AdaFuse: Adaptive Medical Image Fusion Based on Spatial-Frequential Cross Attention**|Xianming Gu et.al.|[2310.05462v1](http://arxiv.org/abs/2310.05462v1)|**[link](https://github.com/xianming-gu/adafuse)**|\n", "2310.05401": "|**2023-10-09**|**Entropy-MCMC: Sampling from Flat Basins with Ease**|Bolian Li et.al.|[2310.05401v1](http://arxiv.org/abs/2310.05401v1)|null|\n", "2310.05364": "|**2023-10-10**|**Universal Multi-modal Entity Alignment via Iteratively Fusing Modality Similarity Paths**|Bolin Zhu et.al.|[2310.05364v2](http://arxiv.org/abs/2310.05364v2)|**[link](https://github.com/blzhu0823/pathfusion)**|\n", "2310.05355": "|**2023-10-09**|**C^2M-DoT: Cross-modal consistent multi-view medical report generation with domain transfer network**|Ruizhi Wang et.al.|[2310.05355v1](http://arxiv.org/abs/2310.05355v1)|null|\n", "2310.05245": "|**2023-10-08**|**Influence of Camera-LiDAR Configuration on 3D Object Detection for Autonomous Driving**|Ye Li et.al.|[2310.05245v1](http://arxiv.org/abs/2310.05245v1)|**[link](https://github.com/safeai-lab/lidar-camera-placement)**|\n", "2310.05193": "|**2023-10-08**|**Improving Discriminative Multi-Modal Learning with Large-Scale Pre-Trained Models**|Chenzhuang Du et.al.|[2310.05193v1](http://arxiv.org/abs/2310.05193v1)|null|\n", "2310.05181": "|**2023-10-08**|**Unified speech and gesture synthesis using flow matching**|Shivam Mehta et.al.|[2310.05181v1](http://arxiv.org/abs/2310.05181v1)|null|\n", "2310.05060": "|**2023-10-08**|**Video-CSR: Complex Video Digest Creation for Visual-Language Models**|Tingkai Liu et.al.|[2310.05060v1](http://arxiv.org/abs/2310.05060v1)|null|\n", "2310.04992": "|**2023-10-08**|**VisionFM: a Multi-Modal Multi-Task Vision Foundation Model for Generalist Ophthalmic Artificial Intelligence**|Jianing Qiu et.al.|[2310.04992v1](http://arxiv.org/abs/2310.04992v1)|null|\n", "2310.04991": "|**2023-10-10**|**Video-Teller: Enhancing Cross-Modal Generation with Fusion and Decoupling**|Haogeng Liu et.al.|[2310.04991v2](http://arxiv.org/abs/2310.04991v2)|null|\n", "2310.04971": "|**2023-10-08**|**Understanding the Robustness of Multi-modal Contrastive Learning to Distribution Shift**|Yihao Xue et.al.|[2310.04971v1](http://arxiv.org/abs/2310.04971v1)|null|\n", "2310.06633": "|**2023-10-10**|**Blind Dates: Examining the Expression of Temporality in Historical Photographs**|Alexandra Barancov\u00e1 et.al.|[2310.06633v1](http://arxiv.org/abs/2310.06633v1)|null|\n", "2310.06627": "|**2023-10-10**|**What If the TV Was Off? Examining Counterfactual Reasoning Abilities of Multi-modal Language Models**|Letian Zhang et.al.|[2310.06627v1](http://arxiv.org/abs/2310.06627v1)|**[link](https://github.com/letian2003/c-vqa)**|\n", "2310.06440": "|**2023-10-10**|**Solution for SMART-101 Challenge of ICCV Multi-modal Algorithmic Reasoning Task 2023**|Xiangyu Wu et.al.|[2310.06440v1](http://arxiv.org/abs/2310.06440v1)|null|\n", "2310.06434": "|**2023-10-10**|**Whispering LLaMA: A Cross-Modal Generative Error Correction Framework for Speech Recognition**|Srijith Radhakrishnan et.al.|[2310.06434v1](http://arxiv.org/abs/2310.06434v1)|**[link](https://github.com/srijith-rkr/whispering-llama)**|\n", "2310.06383": "|**2023-10-10**|**What Makes for Robust Multi-Modal Models in the Face of Missing Modalities?**|Siting Li et.al.|[2310.06383v1](http://arxiv.org/abs/2310.06383v1)|null|\n", "2310.06365": "|**2023-10-10**|**Multi-Modal Knowledge Graph Transformer Framework for Multi-Modal Entity Alignment**|Qian Li et.al.|[2310.06365v1](http://arxiv.org/abs/2310.06365v1)|**[link](https://github.com/xiaoqian19940510/moalign)**|\n", "2310.06342": "|**2023-10-10**|**Contrastive Prompt Learning-based Code Search based on Interaction Matrix**|Yubo Zhang et.al.|[2310.06342v1](http://arxiv.org/abs/2310.06342v1)|null|\n", "2310.06282": "|**2023-10-11**|**MuseChat: A Conversational Music Recommendation System for Videos**|Zhikang Dong et.al.|[2310.06282v2](http://arxiv.org/abs/2310.06282v2)|null|\n", "2310.06259": "|**2023-10-10**|**Cross-modal Cognitive Consensus guided Audio-Visual Segmentation**|Zhaofeng Shi et.al.|[2310.06259v1](http://arxiv.org/abs/2310.06259v1)|null|\n", "2310.06212": "|**2023-10-09**|**Comparison of deep-learning data fusion strategies in mandibular osteoradionecrosis prediction modelling using clinical variables and radiation dose distribution volumes**|Laia Humbert-Vidan et.al.|[2310.06212v1](http://arxiv.org/abs/2310.06212v1)|null|\n", "2310.06008": "|**2023-10-09**|**CoBEVFusion: Cooperative Perception with LiDAR-Camera Bird's-Eye View Fusion**|Donghao Qiao et.al.|[2310.06008v1](http://arxiv.org/abs/2310.06008v1)|null|\n", "2310.07706": "|**2023-10-11**|**Pixel State Value Network for Combined Prediction and Planning in Interactive Environments**|Sascha Rosbach et.al.|[2310.07706v1](http://arxiv.org/abs/2310.07706v1)|null|\n", "2310.07668": "|**2023-10-11**|**GRaMuFeN: Graph-based Multi-modal Fake News Detection in Social Media**|Makan Kananian et.al.|[2310.07668v1](http://arxiv.org/abs/2310.07668v1)|null|\n", "2310.07602": "|**2023-10-11**|**Dual Radar: A Multi-modal Dataset with Dual 4D Radar for Autononous Driving**|Xinyu Zhang et.al.|[2310.07602v1](http://arxiv.org/abs/2310.07602v1)|**[link](https://github.com/adept-thu/dual-radar)**|\n", "2310.07591": "|**2023-10-11**|**PeP: a Point enhanced Painting method for unified point cloud tasks**|Zichao Dong et.al.|[2310.07591v1](http://arxiv.org/abs/2310.07591v1)|null|\n", "2310.07552": "|**2023-10-11**|**ProtoHPE: Prototype-guided High-frequency Patch Enhancement for Visible-Infrared Person Re-identification**|Guiwei Zhang et.al.|[2310.07552v1](http://arxiv.org/abs/2310.07552v1)|null|\n", "2310.07517": "|**2023-10-11**|**CM-PIE: Cross-modal perception for interactive-enhanced audio-visual video parsing**|Yaru Chen et.al.|[2310.07517v1](http://arxiv.org/abs/2310.07517v1)|null|\n", "2310.07355": "|**2023-10-11**|**IMITATE: Clinical Prior Guided Hierarchical Vision-Language Pre-training**|Che Liu et.al.|[2310.07355v1](http://arxiv.org/abs/2310.07355v1)|null|\n", "2310.07276": "|**2023-10-11**|**BioT5: Enriching Cross-modal Integration in Biology with Chemical Knowledge and Natural Language Associations**|Qizhi Pei et.al.|[2310.07276v1](http://arxiv.org/abs/2310.07276v1)|**[link](https://github.com/QizhiPei/BioT5)**|\n", "2310.07265": "|**2023-10-11**|**Distilling Efficient Vision Transformers from CNNs for Semantic Segmentation**|Xu Zheng et.al.|[2310.07265v1](http://arxiv.org/abs/2310.07265v1)|null|\n", "2310.07222": "|**2023-10-11**|**Uni-paint: A Unified Framework for Multimodal Image Inpainting with Pretrained Diffusion Model**|Shiyuan Yang et.al.|[2310.07222v1](http://arxiv.org/abs/2310.07222v1)|**[link](https://github.com/ysy31415/unipaint)**|\n", "2310.07005": "|**2023-10-10**|**Sound-skwatter (Did You Mean: Sound-squatter?) AI-powered Generator for Phishing Prevention**|Rodolfo Valentim et.al.|[2310.07005v1](http://arxiv.org/abs/2310.07005v1)|null|\n", "2310.08530": "|**2023-10-12**|**UniPose: Detecting Any Keypoints**|Jie Yang et.al.|[2310.08530v1](http://arxiv.org/abs/2310.08530v1)|**[link](https://github.com/IDEA-Research/UniPose)**|\n", "2310.08487": "|**2023-10-12**|**GraphextQA: A Benchmark for Evaluating Graph-Enhanced Large Language Models**|Yuanchun Shen et.al.|[2310.08487v1](http://arxiv.org/abs/2310.08487v1)|**[link](https://github.com/happen2me/cross-gnn)**|\n", "2310.08446": "|**2023-10-12**|**Towards Robust Multi-Modal Reasoning via Model Selection**|Xiangyan Liu et.al.|[2310.08446v1](http://arxiv.org/abs/2310.08446v1)|null|\n", "2310.08303": "|**2023-10-12**|**Multimodal Variational Auto-encoder based Audio-Visual Segmentation**|Yuxin Mao et.al.|[2310.08303v1](http://arxiv.org/abs/2310.08303v1)|**[link](https://github.com/opennlplab/mmvae-avs)**|\n", "2310.08285": "|**2023-10-12**|**How would mobility-as-a-service (MaaS) platform survive as an intermediary? From the viewpoint of stability in many-to-many matching**|Rui Yao et.al.|[2310.08285v1](http://arxiv.org/abs/2310.08285v1)|null|\n", "2310.08270": "|**2023-10-12**|**Hilbert Space Embedding-based Trajectory Optimization for Multi-Modal Uncertain Obstacle Trajectory Prediction**|Basant Sharma et.al.|[2310.08270v1](http://arxiv.org/abs/2310.08270v1)|null|\n", "2310.08261": "|**2023-10-12**|**GraphAlign: Enhancing Accurate Feature Alignment by Graph matching for Multi-Modal 3D Object Detection**|Ziying Song et.al.|[2310.08261v1](http://arxiv.org/abs/2310.08261v1)|null|\n", "2310.08166": "|**2023-10-12**|**Ziya-VL: Bilingual Large Vision-Language Model via Multi-Task Instruction Tuning**|Junyu Lu et.al.|[2310.08166v1](http://arxiv.org/abs/2310.08166v1)|null|\n", "2310.08114": "|**2023-10-12**|**Multi-Modal Sensor Fusion and Object Tracking for Autonomous Racing**|Phillip Karle et.al.|[2310.08114v1](http://arxiv.org/abs/2310.08114v1)|**[link](https://github.com/tumftm/fusiontracking)**|\n", "2310.08103": "|**2023-10-12**|**Radio Galaxy Zoo: tagging radio subjects using text**|Dawei Chen et.al.|[2310.08103v1](http://arxiv.org/abs/2310.08103v1)|null|\n", "2310.08027": "|**2023-10-12**|**Exploring Large Language Models for Multi-Modal Out-of-Distribution Detection**|Yi Dai et.al.|[2310.08027v1](http://arxiv.org/abs/2310.08027v1)|null|\n", "2310.08026": "|**2023-10-12**|**Beyond Sharing Weights in Decoupling Feature Learning Network for UAV RGB-Infrared Vehicle Re-Identification**|Xingyue Liu et.al.|[2310.08026v1](http://arxiv.org/abs/2310.08026v1)|null|\n", "2310.07990": "|**2023-10-12**|**Multi-View Variational Autoencoder for Missing Value Imputation in Untargeted Metabolomics**|Chen Zhao et.al.|[2310.07990v1](http://arxiv.org/abs/2310.07990v1)|null|\n", "2310.07944": "|**2023-10-11**|**AutoRepo: A general framework for multi-modal LLM-based automated construction reporting**|Hongxu Pu et.al.|[2310.07944v1](http://arxiv.org/abs/2310.07944v1)|null|\n", "2310.07940": "|**2023-10-11**|**Cost-Driven Hardware-Software Co-Optimization of Machine Learning Pipelines**|Ravit Sharma et.al.|[2310.07940v1](http://arxiv.org/abs/2310.07940v1)|null|\n", "2310.10651": "|**2023-10-16**|**HairCLIPv2: Unifying Hair Editing via Proxy Feature Blending**|Tianyi Wei et.al.|[2310.10651v1](http://arxiv.org/abs/2310.10651v1)|**[link](https://github.com/wty-ustc/hairclipv2)**|\n", "2310.10414": "|**2023-10-16**|**Style transfer between Microscopy and Magnetic Resonance Imaging via Generative Adversarial Network in small sample size settings**|Monika Pytlarz et.al.|[2310.10414v1](http://arxiv.org/abs/2310.10414v1)|null|\n", "2310.10371": "|**2023-10-16**|**Camera-LiDAR Fusion with Latent Contact for Place Recognition in Challenging Cross-Scenes**|Yan Pan et.al.|[2310.10371v1](http://arxiv.org/abs/2310.10371v1)|null|\n", "2310.10347": "|**2023-10-16**|**Editable-DeepSC: Cross-Modal Editable Semantic Communication Systems**|Wenbo Yu et.al.|[2310.10347v1](http://arxiv.org/abs/2310.10347v1)|null|\n", "2310.10290": "|**2023-10-16**|**Autonomous Mapping and Navigation using Fiducial Markers and Pan-Tilt Camera for Assisting Indoor Mobility of Blind and Visually Impaired People**|Dharmateja Adapa et.al.|[2310.10290v1](http://arxiv.org/abs/2310.10290v1)|null|\n", "2310.10125": "|**2023-10-16**|**Few-shot Action Recognition with Captioning Foundation Models**|Xiang Wang et.al.|[2310.10125v1](http://arxiv.org/abs/2310.10125v1)|null|\n", "2310.10010": "|**2023-10-16**|**Black-box Targeted Adversarial Attack on Segment Anything (SAM)**|Sheng Zheng et.al.|[2310.10010v1](http://arxiv.org/abs/2310.10010v1)|null|\n", "2310.09761": "|**2023-10-15**|**CAPro: Webly Supervised Learning with Cross-Modality Aligned Prototypes**|Yulei Qin et.al.|[2310.09761v1](http://arxiv.org/abs/2310.09761v1)|**[link](https://github.com/yuleiqin/capro)**|\n", "2310.09755": "|**2023-10-15**|**Beyond Segmentation: Road Network Generation with Multi-Modal LLMs**|Sumedh Rasal et.al.|[2310.09755v1](http://arxiv.org/abs/2310.09755v1)|null|\n", "2310.09714": "|**2023-10-15**|**Enhancing Task Performance of Learned Simplified Models via Reinforcement Learning**|Hien Bui et.al.|[2310.09714v1](http://arxiv.org/abs/2310.09714v1)|null|\n", "2310.09696": "|**2023-10-15**|**Progressive Evidence Refinement for Open-domain Multimodal Retrieval Question Answering**|Shuwen Yang et.al.|[2310.09696v1](http://arxiv.org/abs/2310.09696v1)|null|\n", "2310.09503": "|**2023-10-14**|**JM3D & JM3D-LLM: Elevating 3D Representation with Joint Multi-modal Cues**|Jiayi Ji et.al.|[2310.09503v1](http://arxiv.org/abs/2310.09503v1)|**[link](https://github.com/mr-neko/jm3d)**|\n", "2310.09478": "|**2023-10-14**|**MiniGPT-v2: large language model as a unified interface for vision-language multi-task learning**|Jun Chen et.al.|[2310.09478v1](http://arxiv.org/abs/2310.09478v1)|null|\n", "2310.09199": "|**2023-10-13**|**PaLI-3 Vision Language Models: Smaller, Faster, Stronger**|Xi Chen et.al.|[2310.09199v1](http://arxiv.org/abs/2310.09199v1)|null|\n", "2310.09165": "|**2023-10-13**|**Towards Robust UAV Tracking in GNSS-Denied Environments: A Multi-LiDAR Multi-UAV Dataset**|Iacopo Catalano et.al.|[2310.09165v1](http://arxiv.org/abs/2310.09165v1)|**[link](https://github.com/tiers/uav_multi_lidar_dataset)**|\n", "2310.11374": "|**2023-10-17**|**DialogueLLM: Context and Emotion Knowledge-Tuned LLaMA Models for Emotion Recognition in Conversations**|Yazhou Zhang et.al.|[2310.11374v1](http://arxiv.org/abs/2310.11374v1)|null|\n", "2310.11316": "|**2023-10-17**|**MonoSKD: General Distillation Framework for Monocular 3D Object Detection via Spearman Correlation Coefficient**|Sen Wang et.al.|[2310.11316v1](http://arxiv.org/abs/2310.11316v1)|**[link](https://github.com/senwang98/monoskd)**|\n", "2310.11307": "|**2023-10-17**|**Multi Self-supervised Pre-fine-tuned Transformer Fusion for Better Intelligent Transportation Detection**|Juwu Zheng et.al.|[2310.11307v1](http://arxiv.org/abs/2310.11307v1)|null|\n", "2310.11295": "|**2023-10-17**|**CorrTalk: Correlation Between Hierarchical Speech and Facial Activity Variances for 3D Animation**|Zhaojie Chu et.al.|[2310.11295v1](http://arxiv.org/abs/2310.11295v1)|null|\n", "2310.10942": "|**2023-10-17**|**Unanswerable Visual Question Answering**|Yanyang Guo et.al.|[2310.10942v1](http://arxiv.org/abs/2310.10942v1)|**[link](https://github.com/guoyang9/unk-vqa)**|\n", "2310.10844": "|**2023-10-16**|**Survey of Vulnerabilities in Large Language Models Revealed by Adversarial Attacks**|Erfan Shayegani et.al.|[2310.10844v1](http://arxiv.org/abs/2310.10844v1)|null|\n", "2310.12081": "|**2023-10-18**|**DHOT-GM: Robust Graph Matching Using A Differentiable Hierarchical Optimal Transport Framework**|Haoran Cheng et.al.|[2310.12081v1](http://arxiv.org/abs/2310.12081v1)|null|\n", "2310.11989": "|**2023-10-18**|**Image Clustering with External Guidance**|Yunfan Li et.al.|[2310.11989v1](http://arxiv.org/abs/2310.11989v1)|null|\n", "2310.11939": "|**2023-10-18**|**Mixture distributions for probabilistic forecasts of disease outbreaks**|Spencer Wadsworth et.al.|[2310.11939v1](http://arxiv.org/abs/2310.11939v1)|null|\n", "2310.11938": "|**2023-10-18**|**Grounded and Well-rounded: A Methodological Approach to the Study of Cross-modal and Cross-lingual Grounding**|Timothee Mickus et.al.|[2310.11938v1](http://arxiv.org/abs/2310.11938v1)|null|\n", "2310.11910": "|**2023-10-18**|**Multi-modal Medical Neurological Image Fusion using Wavelet Pooled Edge Preserving Autoencoder**|Manisha Das et.al.|[2310.11910v1](http://arxiv.org/abs/2310.11910v1)|null|\n", "2310.11713": "|**2023-10-18**|**Separating Invisible Sounds Toward Universal Audiovisual Scene-Aware Sound Separation**|Yiyang Su et.al.|[2310.11713v1](http://arxiv.org/abs/2310.11713v1)|null|\n", "2310.11612": "|**2023-10-17**|**Balance Act: Mitigating Hubness in Cross-Modal Retrieval with Query and Gallery Banks**|Yimu Wang et.al.|[2310.11612v1](http://arxiv.org/abs/2310.11612v1)|**[link](https://github.com/yimuwangcs/Better_Cross_Modal_Retrieval)**|\n", "2310.12973": "|**2023-10-19**|**Frozen Transformers in Language Models Are Effective Visual Encoder Layers**|Ziqi Pang et.al.|[2310.12973v1](http://arxiv.org/abs/2310.12973v1)|**[link](https://github.com/ziqipang/lm4visualencoding)**|\n", "2310.12798": "|**2023-10-19**|**MolCA: Molecular Graph-Language Modeling with Cross-Modal Projector and Uni-Modal Adapter**|Zhiyuan Liu et.al.|[2310.12798v1](http://arxiv.org/abs/2310.12798v1)|**[link](https://github.com/acharkq/molca)**|\n", "2310.12609": "|**2023-10-19**|**Denoising Heat-inspired Diffusion with Insulators for Collision Free Motion Planning**|Junwoo Chang et.al.|[2310.12609v1](http://arxiv.org/abs/2310.12609v1)|null|\n", "2310.12520": "|**2023-10-19**|**Lost in Translation: When GPT-4V(ision) Can't See Eye to Eye with Text. A Vision-Language-Consistency Analysis of VLLMs and Beyond**|Xiang Zhang et.al.|[2310.12520v1](http://arxiv.org/abs/2310.12520v1)|null|\n", "2310.12518": "|**2023-10-19**|**Light-enhanced van der Waals force microscopy**|Han Yu-Xiao et.al.|[2310.12518v1](http://arxiv.org/abs/2310.12518v1)|null|\n", "2310.12344": "|**2023-10-18**|**LACMA: Language-Aligning Contrastive Learning with Meta-Actions for Embodied Instruction Following**|Cheng-Fu Yang et.al.|[2310.12344v1](http://arxiv.org/abs/2310.12344v1)|**[link](https://github.com/joeyy5588/lacma)**|\n", "2310.13619": "|**2023-10-20**|**Semi-supervised multimodal coreference resolution in image narrations**|Arushi Goel et.al.|[2310.13619v1](http://arxiv.org/abs/2310.13619v1)|**[link](https://github.com/vico-uoe/cin-ssl)**|\n", "2310.13596": "|**2023-10-20**|**MarineGPT: Unlocking Secrets of Ocean to the Public**|Ziqiang Zheng et.al.|[2310.13596v1](http://arxiv.org/abs/2310.13596v1)|**[link](https://github.com/hkust-vgd/marinegpt)**|\n", "2310.13451": "|**2023-10-20**|**Two-Stage Triplet Loss Training with Curriculum Augmentation for Audio-Visual Retrieval**|Donghuo Zeng et.al.|[2310.13451v1](http://arxiv.org/abs/2310.13451v1)|null|\n", "2310.13398": "|**2023-10-20**|**OpenAnnotate3D: Open-Vocabulary Auto-Labeling System for Multi-modal 3D Data**|Yijie Zhou et.al.|[2310.13398v1](http://arxiv.org/abs/2310.13398v1)|null|\n", "2310.13289": "|**2023-10-20**|**SALMONN: Towards Generic Hearing Abilities for Large Language Models**|Changli Tang et.al.|[2310.13289v1](http://arxiv.org/abs/2310.13289v1)|**[link](https://github.com/bytedance/salmonn)**|\n", "2310.13276": "|**2023-10-20**|**InvGC: Robust Cross-Modal Retrieval by Inverse Graph Convolution**|Xiangru Jian et.al.|[2310.13276v1](http://arxiv.org/abs/2310.13276v1)|**[link](https://github.com/yimuwangcs/Better_Cross_Modal_Retrieval)**|\n", "2310.13267": "|**2023-10-20**|**On the Language Encoder of Contrastive Cross-modal Models**|Mengjie Zhao et.al.|[2310.13267v1](http://arxiv.org/abs/2310.13267v1)|null|\n", "2310.13265": "|**2023-10-20**|**MoqaGPT : Zero-Shot Multi-modal Open-domain Question Answering with Large Language Model**|Le Zhang et.al.|[2310.13265v1](http://arxiv.org/abs/2310.13265v1)|**[link](https://github.com/lezhang7/moqagpt)**|\n", "2310.13257": "|**2023-10-20**|**Visual Grounding Helps Learn Word Meanings in Low-Data Regimes**|Chengxu Zhuang et.al.|[2310.13257v1](http://arxiv.org/abs/2310.13257v1)|null|\n", "2310.13235": "|**2023-10-20**|**Auxiliary Features-Guided Super Resolution for Monte Carlo Rendering**|Qiqi Hou et.al.|[2310.13235v1](http://arxiv.org/abs/2310.13235v1)|null|\n", "2310.13103": "|**2023-10-19**|**AVTENet: Audio-Visual Transformer-based Ensemble Network Exploiting Multiple Experts for Video Deepfake Detection**|Ammarah Hashmi et.al.|[2310.13103v1](http://arxiv.org/abs/2310.13103v1)|null|\n", "2310.14924": "|**2023-10-23**|**Converting Depth Images and Point Clouds for Feature-based Pose Estimation**|Robert L\u00f6sch et.al.|[2310.14924v1](http://arxiv.org/abs/2310.14924v1)|**[link](https://github.com/rlsch/depth-conversions)**|\n", "2310.14805": "|**2023-10-23**|**Cross-Modal Conceptualization in Bottleneck Models**|Danis Alukaev et.al.|[2310.14805v1](http://arxiv.org/abs/2310.14805v1)|**[link](https://github.com/danisalukaev/xcbs)**|\n", "2310.14785": "|**2023-10-23**|**Vision-Enhanced Semantic Entity Recognition in Document Images via Visually-Asymmetric Consistency Learning**|Hao Wang et.al.|[2310.14785v1](http://arxiv.org/abs/2310.14785v1)|null|\n", "2310.14720": "|**2023-10-23**|**Extended Deep Adaptive Input Normalization for Preprocessing Time Series Data for Neural Networks**|Marcus A. K. September et.al.|[2310.14720v1](http://arxiv.org/abs/2310.14720v1)|**[link](https://github.com/marcusgh/edain_paper)**|\n", "2310.14702": "|**2023-10-23**|**BM2CP: Efficient Collaborative Perception with LiDAR-Camera Modalities**|Binyu Zhao et.al.|[2310.14702v1](http://arxiv.org/abs/2310.14702v1)|**[link](https://github.com/byzhaoai/bm2cp)**|\n", "2310.14643": "|**2023-10-23**|**Dynamic gain and frequency comb formation in exceptional-point lasers**|Xingwei Gao et.al.|[2310.14643v1](http://arxiv.org/abs/2310.14643v1)|null|\n", "2310.14566": "|**2023-10-23**|**HallusionBench: You See What You Think? Or You Think What You See? An Image-Context Reasoning Benchmark Challenging for GPT-4V(ision), LLaVA-1.5, and Other Multi-modality Models**|Fuxiao Liu et.al.|[2310.14566v1](http://arxiv.org/abs/2310.14566v1)|**[link](https://github.com/tianyi-lab/hallusionbench)**|\n", "2310.14549": "|**2023-10-23**|**Multimodal Graph Learning for Modeling Emerging Pandemics with Big Data**|Khanh-Tung Tran et.al.|[2310.14549v1](http://arxiv.org/abs/2310.14549v1)|**[link](https://github.com/khanhtungtran/mgl4mep)**|\n", "2310.14278": "|**2023-10-22**|**Conversational Speech Recognition by Learning Audio-textual Cross-modal Contextual Representation**|Kun Wei et.al.|[2310.14278v1](http://arxiv.org/abs/2310.14278v1)|null|\n", "2310.14226": "|**2023-10-22**|**Multi-stream Cell Segmentation with Low-level Cues for Multi-modality Images**|Wei Lou et.al.|[2310.14226v1](http://arxiv.org/abs/2310.14226v1)|**[link](https://github.com/lhaof/cellseg)**|\n", "2310.14216": "|**2023-10-22**|**UniMAP: Universal SMILES-Graph Representation Learning**|Shikun Feng et.al.|[2310.14216v1](http://arxiv.org/abs/2310.14216v1)|**[link](https://github.com/fengshikun/unimap)**|\n", "2310.14158": "|**2023-10-22**|**Visual-Attribute Prompt Learning for Progressive Mild Cognitive Impairment Prediction**|Luoyao Kang et.al.|[2310.14158v1](http://arxiv.org/abs/2310.14158v1)|**[link](https://github.com/lhaof/vapl)**|\n", "2310.14075": "|**2023-10-21**|**Unsupervised Sim-to-Real Adaptation of Soft Robot Proprioception using a Dual Cross-modal Autoencoder**|Chaeree Park et.al.|[2310.14075v1](http://arxiv.org/abs/2310.14075v1)|null|\n", "2310.14037": "|**2023-10-21**|**Unlock Multi-Modal Capability of Dense Retrieval via Visual Module Plugin**|Tianshuo Zhou et.al.|[2310.14037v1](http://arxiv.org/abs/2310.14037v1)|**[link](https://github.com/openmatch/marvel)**|\n", "2310.13898": "|**2023-10-21**|**Computational and Systems Biology Advances to Enable for Bioagent Agnostic Signatures**|Andy Lin et.al.|[2310.13898v1](http://arxiv.org/abs/2310.13898v1)|null|\n", "2310.15887": "|**2023-10-24**|**AdaptiX -- A Transitional XR Framework for Development and Evaluation of Shared Control Applications in Assistive Robotics**|Max Pascher et.al.|[2310.15887v1](http://arxiv.org/abs/2310.15887v1)|**[link](https://github.com/maxpascher/AdaptiX)**|\n", "2310.15676": "|**2023-10-24**|**Recent Advances in Multi-modal 3D Scene Understanding: A Comprehensive Survey and Evaluation**|Yinjie Lei et.al.|[2310.15676v1](http://arxiv.org/abs/2310.15676v1)|null|\n", "2310.15670": "|**2023-10-24**|**Leveraging Vision-Centric Multi-Modal Expertise for 3D Object Detection**|Linyan Huang et.al.|[2310.15670v1](http://arxiv.org/abs/2310.15670v1)|**[link](https://github.com/opendrivelab/birds-eye-view-perception)**|\n", "2310.15587": "|**2023-10-24**|**ScanDL: A Diffusion Model for Generating Synthetic Scanpaths on Texts**|Lena S. Bolliger et.al.|[2310.15587v1](http://arxiv.org/abs/2310.15587v1)|**[link](https://github.com/dili-lab/scandl)**|\n", "2310.15585": "|**2023-10-24**|**Multimodal Representations for Teacher-Guided Compositional Visual Reasoning**|Wafa Aissa et.al.|[2310.15585v1](http://arxiv.org/abs/2310.15585v1)|null|\n", "2310.15568": "|**2023-10-24**|**I$^2$MD: 3D Action Representation Learning with Inter- and Intra-modal Mutual Distillation**|Yunyao Mao et.al.|[2310.15568v1](http://arxiv.org/abs/2310.15568v1)|null|\n", "2310.15482": "|**2023-10-24**|**Salient Object Detection in RGB-D Videos**|Ao Mou et.al.|[2310.15482v1](http://arxiv.org/abs/2310.15482v1)|**[link](https://github.com/kerenfu/rdvs)**|\n", "2310.15325": "|**2023-10-23**|**LXMERT Model Compression for Visual Question Answering**|Maryam Hashemi et.al.|[2310.15325v1](http://arxiv.org/abs/2310.15325v1)|**[link](https://github.com/ghazaleh-mahmoodi/lxmert_compression)**|\n", "2310.15301": "|**2023-10-23**|**ADMarker: A Multi-Modal Federated Learning System for Monitoring Digital Biomarkers of Alzheimer's Disease**|Xiaomin Ouyang et.al.|[2310.15301v1](http://arxiv.org/abs/2310.15301v1)|null|\n", "2310.15281": "|**2023-10-23**|**UncertaintyPlayground: A Fast and Simplified Python Library for Uncertainty Estimation**|Ilia Azizi et.al.|[2310.15281v1](http://arxiv.org/abs/2310.15281v1)|**[link](https://github.com/Unco3892/UncertaintyPlayground)**|\n", "2310.16781": "|**2023-10-25**|**Kiki or Bouba? Sound Symbolism in Vision-and-Language Models**|Morris Alper et.al.|[2310.16781v1](http://arxiv.org/abs/2310.16781v1)|null|\n", "2310.16754": "|**2023-10-25**|**CAD -- Contextual Multi-modal Alignment for Dynamic AVQA**|Asmar Nadeem et.al.|[2310.16754v1](http://arxiv.org/abs/2310.16754v1)|null|\n", "2310.16641": "|**2023-10-25**|**The Next Evolution of Artificial Sense of Touch**|Sonja Gro\u00df et.al.|[2310.16641v1](http://arxiv.org/abs/2310.16641v1)|null|\n", "2310.16629": "|**2023-10-25**|**EdgeCalib: Multi-Frame Weighted Edge Features for Automatic Targetless LiDAR-Camera Calibration**|Xingchen Li et.al.|[2310.16629v1](http://arxiv.org/abs/2310.16629v1)|null|\n", "2310.16590": "|**2023-10-25**|**$\\mathbb{VD}$-$\\mathbb{GR}$: Boosting $\\mathbb{V}$isual $\\mathbb{D}$ialog with Cascaded Spatial-Temporal Multi-Modal $\\mathbb{GR}$aphs**|Adnen Abdessaied et.al.|[2310.16590v1](http://arxiv.org/abs/2310.16590v1)|null|\n", "2310.16477": "|**2023-10-25**|**Show from Tell: Audio-Visual Modelling in Clinical Settings**|Jianbo Jiao et.al.|[2310.16477v1](http://arxiv.org/abs/2310.16477v1)|null|\n", "2310.16402": "|**2023-10-25**|**Video Referring Expression Comprehension via Transformer with Content-conditioned Query**|Ji Jiang et.al.|[2310.16402v1](http://arxiv.org/abs/2310.16402v1)|null|\n", "2310.16380": "|**2023-10-25**|**A model for multi-attack classification to improve intrusion detection performance using deep learning approaches**|Arun Kumar Silivery et.al.|[2310.16380v1](http://arxiv.org/abs/2310.16380v1)|null|\n", "2310.16356": "|**2023-10-25**|**A Multi-Modal Multilingual Benchmark for Document Image Classification**|Yoshinari Fujinuma et.al.|[2310.16356v1](http://arxiv.org/abs/2310.16356v1)|null|\n", "2310.16273": "|**2023-10-25**|**Deep Learning for Plant Identification and Disease Classification from Leaf Images: Multi-prediction Approaches**|Jianping Yao et.al.|[2310.16273v1](http://arxiv.org/abs/2310.16273v1)|**[link](https://github.com/funzi-son/plant_pathology_dl)**|\n", "2310.17642": "|**2023-10-26**|**Drive Anywhere: Generalizable End-to-end Autonomous Driving with Multi-modal Foundation Models**|Tsun-Hsuan Wang et.al.|[2310.17642v1](http://arxiv.org/abs/2310.17642v1)|null|\n", "2310.17568": "|**2023-10-26**|**Navigating to Success in Multi-Modal Human-Robot Collaboration: Analysis and Corpus Release**|Stephanie M. Lukin et.al.|[2310.17568v1](http://arxiv.org/abs/2310.17568v1)|null|\n", "2310.17540": "|**2023-10-26**|**EqDrive: Efficient Equivariant Motion Forecasting with Multi-Modality for Autonomous Driving**|Yuping Wang et.al.|[2310.17540v1](http://arxiv.org/abs/2310.17540v1)|null|\n", "2310.17468": "|**2023-10-26**|**Cross-modal Active Complementary Learning with Self-refining Correspondence**|Yang Qin et.al.|[2310.17468v1](http://arxiv.org/abs/2310.17468v1)|**[link](https://github.com/qinyang79/crcl)**|\n", "2310.17323": "|**2023-10-26**|**IndustReal: A Dataset for Procedure Step Recognition Handling Execution Errors in Egocentric Videos in an Industrial-Like Setting**|Tim J. Schoonbeek et.al.|[2310.17323v1](http://arxiv.org/abs/2310.17323v1)|**[link](https://github.com/timschoonbeek/industreal)**|\n", "2310.17133": "|**2023-10-26**|**Incorporating Probing Signals into Multimodal Machine Translation via Visual Question-Answering Pairs**|Yuxin Zuo et.al.|[2310.17133v1](http://arxiv.org/abs/2310.17133v1)|**[link](https://github.com/libeineu/mmt-vqa)**|\n", "2310.17025": "|**2023-10-25**|**netFound: Foundation Model for Network Security**|Satyandra Guthula et.al.|[2310.17025v1](http://arxiv.org/abs/2310.17025v1)|null|\n", "2310.16917": "|**2023-10-25**|**MimicTouch: Learning Human's Control Strategy with Multi-Modal Tactile Feedback**|Kelin Yu et.al.|[2310.16917v1](http://arxiv.org/abs/2310.16917v1)|null|\n", "2310.18049": "|**2023-10-27**|**Text Augmented Spatial-aware Zero-shot Referring Image Segmentation**|Yucheng Suo et.al.|[2310.18049v1](http://arxiv.org/abs/2310.18049v1)|null|\n", "2310.17956": "|**2023-10-27**|**Qilin-Med-VL: Towards Chinese Large Vision-Language Model for General Healthcare**|Junling Liu et.al.|[2310.17956v1](http://arxiv.org/abs/2310.17956v1)|**[link](https://github.com/williamliujl/qilin-med-vl)**|\n", "2310.17933": "|**2023-10-27**|**A barycenter-based approach for the multi-model ensembling of subseasonal forecasts**|Camille Le Coz et.al.|[2310.17933v1](http://arxiv.org/abs/2310.17933v1)|null|\n", "2310.17852": "|**2023-10-27**|**Function Space Bayesian Pseudocoreset for Bayesian Neural Networks**|Balhae Kim et.al.|[2310.17852v1](http://arxiv.org/abs/2310.17852v1)|null|\n", "2310.17796": "|**2023-10-26**|**ControlLLM: Augment Language Models with Tools by Searching on Graphs**|Zhaoyang Liu et.al.|[2310.17796v1](http://arxiv.org/abs/2310.17796v1)|**[link](https://github.com/opengvlab/controlllm)**|\n", "2310.17770": "|**2023-10-26**|**GROOViST: A Metric for Grounding Objects in Visual Storytelling**|Aditya K Surikuchi et.al.|[2310.17770v1](http://arxiv.org/abs/2310.17770v1)|**[link](https://github.com/akskuchi/groovist)**|\n", "2310.17737": "|**2023-10-26**|**ArchBERT: Bi-Modal Understanding of Neural Architectures and Natural Languages**|Mohammad Akbari et.al.|[2310.17737v1](http://arxiv.org/abs/2310.17737v1)|null|\n", "2310.19168": "|**2023-10-29**|**BirdSAT: Cross-View Contrastive Masked Autoencoders for Bird Species Classification and Mapping**|Srikumar Sastry et.al.|[2310.19168v1](http://arxiv.org/abs/2310.19168v1)|**[link](https://github.com/mvrl/birdsat)**|\n", "2310.19070": "|**2023-10-29**|**Myriad: Large Multimodal Model by Applying Vision Experts for Industrial Anomaly Detection**|Yuanze Li et.al.|[2310.19070v1](http://arxiv.org/abs/2310.19070v1)|null|\n", "2310.19062": "|**2023-10-29**|**A multi-modal table tennis robot system**|Andreas Ziegler et.al.|[2310.19062v1](http://arxiv.org/abs/2310.19062v1)|null|\n", "2310.19001": "|**2023-10-29**|**Uncovering Prototypical Knowledge for Weakly Open-Vocabulary Semantic Segmentation**|Fei Zhang et.al.|[2310.19001v1](http://arxiv.org/abs/2310.19001v1)|null|\n", "2310.18949": "|**2023-10-29**|**Customize StyleGAN with One Hand Sketch**|Shaocong Zhang et.al.|[2310.18949v1](http://arxiv.org/abs/2310.18949v1)|null|\n", "2310.18890": "|**2023-10-29**|**Towards Generalized Multi-stage Clustering: Multi-view Self-distillation**|Jiatai Wang et.al.|[2310.18890v1](http://arxiv.org/abs/2310.18890v1)|null|\n", "2310.18728": "|**2023-10-28**|**Online Multi-view Anomaly Detection with Disentangled Product-of-Experts Modeling**|Hao Wang et.al.|[2310.18728v1](http://arxiv.org/abs/2310.18728v1)|null|\n", "2310.18709": "|**2023-10-28**|**Audio-Visual Instance Segmentation**|Ruohao Guo et.al.|[2310.18709v1](http://arxiv.org/abs/2310.18709v1)|null|\n", "2310.18652": "|**2023-10-28**|**EHRXQA: A Multi-Modal Question Answering Dataset for Electronic Health Records with Chest X-ray Images**|Seongsu Bae et.al.|[2310.18652v1](http://arxiv.org/abs/2310.18652v1)|**[link](https://github.com/baeseongsu/ehrxqa)**|\n", "2310.18620": "|**2023-10-28**|**ODM3D: Alleviating Foreground Sparsity for Enhanced Semi-Supervised Monocular 3D Object Detection**|Weijia Zhang et.al.|[2310.18620v1](http://arxiv.org/abs/2310.18620v1)|null|\n", "2310.18583": "|**2023-10-28**|**Self-Supervised Multi-Modality Learning for Multi-Label Skin Lesion Classification**|Hao Wang et.al.|[2310.18583v1](http://arxiv.org/abs/2310.18583v1)|**[link](https://github.com/dylan-h-wang/skin-sm3)**|\n", "2310.18481": "|**2023-10-27**|**MOSEL: Inference Serving Using Dynamic Modality Selection**|Bodun Hu et.al.|[2310.18481v1](http://arxiv.org/abs/2310.18481v1)|null|\n", "2310.18438": "|**2023-10-27**|**Exploring Shape Embedding for Cloth-Changing Person Re-Identification via 2D-3D Correspondences**|Yubin Wang et.al.|[2310.18438v1](http://arxiv.org/abs/2310.18438v1)|null|\n", "2310.20561": "|**2023-10-31**|**Predictive Control for Autonomous Driving with Uncertain, Multi-modal Predictions**|Siddharth H. Nair et.al.|[2310.20561v1](http://arxiv.org/abs/2310.20561v1)|null|\n", "2310.20446": "|**2023-10-31**|**LAVSS: Location-Guided Audio-Visual Spatial Audio Separation**|Yuxin Ye et.al.|[2310.20446v1](http://arxiv.org/abs/2310.20446v1)|null|\n", "2310.20357": "|**2023-11-01**|**Enhancing the Spatial Awareness Capability of Multi-Modal Large Language Model**|Yongqiang Zhao et.al.|[2310.20357v2](http://arxiv.org/abs/2310.20357v2)|null|\n", "2310.20343": "|**2023-10-31**|**Large Multi-modal Encoders for Recommendation**|Zixuan Yi et.al.|[2310.20343v1](http://arxiv.org/abs/2310.20343v1)|null|\n", "2310.20025": "|**2023-10-30**|**GOPlan: Goal-conditioned Offline Reinforcement Learning by Planning with Learned Models**|Mianchu Wang et.al.|[2310.20025v1](http://arxiv.org/abs/2310.20025v1)|null|\n", "2310.19795": "|**2023-10-30**|**SimMMDG: A Simple and Effective Framework for Multi-modal Domain Generalization**|Hao Dong et.al.|[2310.19795v1](http://arxiv.org/abs/2310.19795v1)|**[link](https://github.com/donghao51/simmmdg)**|\n", "2310.19743": "|**2023-10-30**|**Tell Me What Is Good About This Property: Leveraging Reviews For Segment-Personalized Image Collection Summarization**|Monika Wysoczanska et.al.|[2310.19743v1](http://arxiv.org/abs/2310.19743v1)|null|\n", "2310.19654": "|**2023-10-30**|**MCAD: Multi-teacher Cross-modal Alignment Distillation for efficient image-text retrieval**|Youbo Lei et.al.|[2310.19654v1](http://arxiv.org/abs/2310.19654v1)|null|\n", "2310.19635": "|**2023-10-30**|**Bidirectional Captioning for Clinically Accurate and Interpretable Models**|Keegan Quigley et.al.|[2310.19635v1](http://arxiv.org/abs/2310.19635v1)|null|\n", "2310.19608": "|**2023-10-30**|**On Feynman--Kac training of partial Bayesian neural networks**|Zheng Zhao et.al.|[2310.19608v1](http://arxiv.org/abs/2310.19608v1)|null|\n", "2310.19559": "|**2023-10-30**|**Disentangled Counterfactual Learning for Physical Audiovisual Commonsense Reasoning**|Changsheng Lv et.al.|[2310.19559v1](http://arxiv.org/abs/2310.19559v1)|null|\n", "2310.19554": "|**2023-10-30**|**Harvest Video Foundation Models via Efficient Post-Pretraining**|Yizhuo Li et.al.|[2310.19554v1](http://arxiv.org/abs/2310.19554v1)|**[link](https://github.com/opengvlab/internvideo)**|\n", "2310.19432": "|**2023-10-30**|**Explaining the Decisions of Deep Policy Networks for Robotic Manipulations**|Seongun Kim et.al.|[2310.19432v1](http://arxiv.org/abs/2310.19432v1)|null|\n", "2310.19264": "|**2023-10-30**|**Sound of Story: Multi-modal Storytelling with Audio**|Jaeyeon Bae et.al.|[2310.19264v1](http://arxiv.org/abs/2310.19264v1)|null|\n", "2311.00618": "|**2023-11-01**|**De-Diffusion Makes Text a Strong Cross-Modal Interface**|Chen Wei et.al.|[2311.00618v1](http://arxiv.org/abs/2311.00618v1)|null|\n", "2311.00566": "|**2023-11-01**|**CROMA: Remote Sensing Representations with Contrastive Radar-Optical Masked Autoencoders**|Anthony Fuller et.al.|[2311.00566v1](http://arxiv.org/abs/2311.00566v1)|**[link](https://github.com/antofuller/croma)**|\n", "2311.00436": "|**2023-11-01**|**Enhancing Traffic Object Detection in Variable Illumination with RGB-Event Fusion**|Zhanwen Liu et.al.|[2311.00436v1](http://arxiv.org/abs/2311.00436v1)|null|\n", "2311.00265": "|**2023-11-01**|**Adaptive Latent Diffusion Model for 3D Medical Image to Image Translation: Multi-modal Magnetic Resonance Imaging Study**|Jonghun Kim et.al.|[2311.00265v1](http://arxiv.org/abs/2311.00265v1)|**[link](https://github.com/jongdory/aldm)**|\n", "2311.00207": "|**2023-11-01**|**Magmaw: Modality-Agnostic Adversarial Attacks on Machine Learning-Based Wireless Communication Systems**|Jung-Woo Chang et.al.|[2311.00207v1](http://arxiv.org/abs/2311.00207v1)|null|\n", "2311.01459": "|**2023-11-02**|**Align Your Prompts: Test-Time Prompting with Distribution Alignment for Zero-Shot Generalization**|Jameel Hassan et.al.|[2311.01459v1](http://arxiv.org/abs/2311.01459v1)|null|\n", "2311.01361": "|**2023-11-02**|**GPT-4V(ision) as a Generalist Evaluator for Vision-Language Tasks**|Xinlu Zhang et.al.|[2311.01361v1](http://arxiv.org/abs/2311.01361v1)|null|\n", "2311.01202": "|**2023-11-02**|**Cross-Modal Information-Guided Network using Contrastive Learning for Point Cloud Registration**|Yifan Xie et.al.|[2311.01202v1](http://arxiv.org/abs/2311.01202v1)|**[link](https://github.com/ivanxie416/cmignet)**|\n", "2311.01092": "|**2023-11-02**|**Learning A Multi-Task Transformer Via Unified And Customized Instruction Tuning For Chest Radiograph Interpretation**|Lijian Xu et.al.|[2311.01092v1](http://arxiv.org/abs/2311.01092v1)|**[link](https://github.com/medhk23/omnifm-dr)**|\n", "2311.01066": "|**2023-11-02**|**Dynamic Multimodal Information Bottleneck for Multimodality Classification**|Yingying Fang et.al.|[2311.01066v1](http://arxiv.org/abs/2311.01066v1)|**[link](https://github.com/bii-wushuang/dmib)**|\n", "2311.00807": "|**2023-11-01**|**VQA-GEN: A Visual Question Answering Benchmark for Domain Generalization**|Suraj Jyothi Unni et.al.|[2311.00807v1](http://arxiv.org/abs/2311.00807v1)|null|\n", "2311.00737": "|**2023-11-01**|**Real-Time Magnetic Tracking and Diagnosis of COVID-19 via Machine Learning**|Dang Nguyen et.al.|[2311.00737v1](http://arxiv.org/abs/2311.00737v1)|null|\n", "2311.01908": "|**2023-11-03**|**LLM-driven Multimodal Target Volume Contouring in Radiation Oncology**|Yujin Oh et.al.|[2311.01908v1](http://arxiv.org/abs/2311.01908v1)|null|\n", "2311.01886": "|**2023-11-03**|**Bridging the Gap between Multi-focus and Multi-modal: A Focused Integration Framework for Multi-modal Image Fusion**|Xilai Li et.al.|[2311.01886v1](http://arxiv.org/abs/2311.01886v1)|null|\n", "2311.01881": "|**2023-11-03**|**Quantitative Evaluation of a Multi-Modal Camera Setup for Fusing Event Data with RGB Images**|Julian Moosmann et.al.|[2311.01881v1](http://arxiv.org/abs/2311.01881v1)|null|\n", "2311.01831": "|**2023-11-03**|**Universal Multi-modal Multi-domain Pre-trained Recommendation**|Wenqi Sun et.al.|[2311.01831v1](http://arxiv.org/abs/2311.01831v1)|null|\n", "2311.01807": "|**2023-11-03**|**Cross-modal Consistency Learning with Fine-grained Fusion Network for Multimodal Fake News Detection**|Jun Li et.al.|[2311.01807v1](http://arxiv.org/abs/2311.01807v1)|**[link](https://github.com/uestc-lj/cffn)**|\n", "2311.01767": "|**2023-11-03**|**PPTC Benchmark: Evaluating Large Language Models for PowerPoint Task Completion**|Yiduo Guo et.al.|[2311.01767v1](http://arxiv.org/abs/2311.01767v1)|**[link](https://github.com/gydpku/pptc)**|\n", "2311.01766": "|**2023-11-03**|**Support or Refute: Analyzing the Stance of Evidence to Detect Out-of-Context Mis- and Disinformation**|Xin Yuan et.al.|[2311.01766v1](http://arxiv.org/abs/2311.01766v1)|null|\n", "2311.01740": "|**2023-11-03**|**SAC$^3$: Reliable Hallucination Detection in Black-Box Language Models via Semantic-aware Cross-check Consistency**|Jiaxin Zhang et.al.|[2311.01740v1](http://arxiv.org/abs/2311.01740v1)|null|\n", "2311.01734": "|**2023-11-03**|**MixCon3D: Synergizing Multi-View and Cross-Modal Contrastive Learning for Enhancing 3D Representation**|Yipeng Gao et.al.|[2311.01734v1](http://arxiv.org/abs/2311.01734v1)|**[link](https://github.com/ucsc-vlaa/mixcon3d)**|\n", "2311.01487": "|**2023-11-02**|**What Makes for Good Visual Instructions? Synthesizing Complex Visual Reasoning Instructions for Visual Instruction Tuning**|Yifan Du et.al.|[2311.01487v1](http://arxiv.org/abs/2311.01487v1)|**[link](https://github.com/rucaibox/comvint)**|\n", "2311.03328": "|**2023-11-06**|**On Asynchrony, Memory, and Communication: Separations and Landscapes**|Paola Flocchini et.al.|[2311.03328v1](http://arxiv.org/abs/2311.03328v1)|null|\n", "2311.03217": "|**2023-11-06**|**Leveraging Transformers to Improve Breast Cancer Classification and Risk Assessment with Multi-modal and Longitudinal Data**|Yiqiu Shen et.al.|[2311.03217v1](http://arxiv.org/abs/2311.03217v1)|null|\n", "2311.03106": "|**2023-11-06**|**Unified Multi-modal Unsupervised Representation Learning for Skeleton-based Action Understanding**|Shengkai Sun et.al.|[2311.03106v1](http://arxiv.org/abs/2311.03106v1)|**[link](https://github.com/huiguanlab/umurl)**|\n", "2311.03090": "|**2023-11-06**|**A multi-modal approach to continuous material identification through tactile sensing**|Augusto G\u00f3mez Egu\u00edluz et.al.|[2311.03090v1](http://arxiv.org/abs/2311.03090v1)|null|\n", "2311.03079": "|**2023-11-06**|**CogVLM: Visual Expert for Pretrained Language Models**|Weihan Wang et.al.|[2311.03079v1](http://arxiv.org/abs/2311.03079v1)|**[link](https://github.com/thudm/cogvlm)**|\n", "2311.02863": "|**2023-11-06**|**Temporal Shift -- Multi-Objective Loss Function for Improved Anomaly Fall Detection**|Stefan Denkovski et.al.|[2311.02863v1](http://arxiv.org/abs/2311.02863v1)|null|\n", "2311.02850": "|**2023-11-06**|**IR-STP: Enhancing Autonomous Driving with Interaction Reasoning in Spatio-Temporal Planning**|Yingbing Chen et.al.|[2311.02850v1](http://arxiv.org/abs/2311.02850v1)|**[link](https://github.com/chenyingbing/ir-stp-planner)**|\n", "2311.02842": "|**2023-11-06**|**An invariant feature extraction for multi-modal images matching**|Chenzhong Gao et.al.|[2311.02842v1](http://arxiv.org/abs/2311.02842v1)|null|\n", "2311.02820": "|**2023-11-06**|**Mesh Neural Cellular Automata**|Ehsan Pajouheshgar et.al.|[2311.02820v1](http://arxiv.org/abs/2311.02820v1)|null|\n", "2311.02782": "|**2023-11-05**|**Towards Generic Anomaly Detection and Understanding: Large-scale Visual-linguistic Model (GPT-4V) Takes the Lead**|Yunkang Cao et.al.|[2311.02782v1](http://arxiv.org/abs/2311.02782v1)|**[link](https://github.com/caoyunkang/gpt4v-for-generic-anomaly-detection)**|\n", "2311.02733": "|**2023-11-05**|**AV-Lip-Sync+: Leveraging AV-HuBERT to Exploit Multimodal Inconsistency for Video Deepfake Detection**|Sahibzada Adil Shahzad et.al.|[2311.02733v1](http://arxiv.org/abs/2311.02733v1)|null|\n", "2311.02559": "|**2023-11-05**|**Rotation Invariant Transformer for Recognizing Object in UAVs**|Shuoyi Chen et.al.|[2311.02559v1](http://arxiv.org/abs/2311.02559v1)|null|\n", "2311.02329": "|**2023-11-04**|**Complex Organ Mask Guided Radiology Report Generation**|Gu Tiancheng et.al.|[2311.02329v1](http://arxiv.org/abs/2311.02329v1)|**[link](https://github.com/garygutc/comg_model)**|\n", "2311.02282": "|**2023-11-04**|**Contrastive Multi-Modal Representation Learning for Spark Plug Fault Diagnosis**|Ardavan Modarres et.al.|[2311.02282v1](http://arxiv.org/abs/2311.02282v1)|null|\n", "2311.02248": "|**2023-11-03**|**COSMIC: Data Efficient Instruction-tuning For Speech In-Context Learning**|Jing Pan et.al.|[2311.02248v1](http://arxiv.org/abs/2311.02248v1)|null|\n", "2311.04219": "|**2023-11-07**|**OtterHD: A High-Resolution Multi-modality Model**|Bo Li et.al.|[2311.04219v1](http://arxiv.org/abs/2311.04219v1)|null|\n", "2311.04160": "|**2023-11-07**|**\"Tell me about that church\": Exploring the Design and User Experience of In-Vehicle Multi-modal Intuitive Interface in the Context of Driving Scenario**|Yueteng Yu et.al.|[2311.04160v1](http://arxiv.org/abs/2311.04160v1)|null|\n", "2311.04091": "|**2023-11-07**|**Proceedings of the 5th International Workshop on Reading Music Systems**|Jorge Calvo-Zaragoza et.al.|[2311.04091v1](http://arxiv.org/abs/2311.04091v1)|**[link](https://github.com/suziai/gui-tools)**|\n", "2311.04058": "|**2023-11-07**|**mmFUSION: Multimodal Fusion for 3D Objects Detection**|Javed Ahmad et.al.|[2311.04058v1](http://arxiv.org/abs/2311.04058v1)|null|\n", "2311.04056": "|**2023-11-07**|**Multi-View Causal Representation Learning with Partial Observability**|Dingling Yao et.al.|[2311.04056v1](http://arxiv.org/abs/2311.04056v1)|null|\n", "2311.03810": "|**2023-11-07**|**Rethinking and Improving Multi-task Learning for End-to-end Speech Translation**|Yuhao Zhang et.al.|[2311.03810v1](http://arxiv.org/abs/2311.03810v1)|**[link](https://github.com/xiaozhang521/imtl)**|\n", "2311.03620": "|**2023-11-07**|**FusionViT: Hierarchical 3D Object Detection via LiDAR-Camera Vision Transformer Fusion**|Xinhao Xiang et.al.|[2311.03620v1](http://arxiv.org/abs/2311.03620v1)|null|\n", "2311.03606": "|**2023-11-06**|**Multimodal Stress Detection Using Facial Landmarks and Biometric Signals**|Majid Hosseini et.al.|[2311.03606v1](http://arxiv.org/abs/2311.03606v1)|null|\n", "2311.03413": "|**2023-11-06**|**Discret2Di -- Deep Learning based Discretization for Model-based Diagnosis**|Lukas Moddemann et.al.|[2311.03413v1](http://arxiv.org/abs/2311.03413v1)|null|\n", "2311.04766": "|**2023-11-08**|**DualTalker: A Cross-Modal Dual Learning Approach for Speech-Driven 3D Facial Animation**|Guinan Su et.al.|[2311.04766v1](http://arxiv.org/abs/2311.04766v1)|null|\n", "2311.04678": "|**2023-11-08**|**Weakly supervised cross-model learning in high-content screening**|Watkinson Gabriel et.al.|[2311.04678v1](http://arxiv.org/abs/2311.04678v1)|null|\n", "2311.04589": "|**2023-11-08**|**TEAL: Tokenize and Embed ALL for Multi-modal Large Language Models**|Zhen Yang et.al.|[2311.04589v1](http://arxiv.org/abs/2311.04589v1)|null|\n", "2311.04563": "|**2023-11-08**|**Investigating the Nature of Disagreements on Mid-Scale Ratings: A Case Study on the Abstractness-Concreteness Continuum**|Urban Knuple\u0161 et.al.|[2311.04563v1](http://arxiv.org/abs/2311.04563v1)|null|\n", "2311.04552": "|**2023-11-08**|**A 3D generative model of pathological multi-modal MR images and segmentations**|Virginia Fernandez et.al.|[2311.04552v1](http://arxiv.org/abs/2311.04552v1)|**[link](https://github.com/virginiafdez/brainspade3d_rel)**|\n", "2311.04512": "|**2023-11-08**|**FFINet: Future Feedback Interaction Network for Motion Forecasting**|Miao Kang et.al.|[2311.04512v1](http://arxiv.org/abs/2311.04512v1)|null|\n", "2311.04507": "|**2023-11-08**|**Conversation Understanding using Relational Temporal Graph Neural Networks with Auxiliary Cross-Modality Interaction**|Cam-Van Thi Nguyen et.al.|[2311.04507v1](http://arxiv.org/abs/2311.04507v1)|null|\n", "2311.04390": "|**2023-11-07**|**Force-Constrained Visual Policy: Safe Robot-Assisted Dressing via Multi-Modal Sensing**|Zhanyi Sun et.al.|[2311.04390v1](http://arxiv.org/abs/2311.04390v1)|null|\n", "2311.04257": "|**2023-11-07**|**mPLUG-Owl2: Revolutionizing Multi-modal Large Language Model with Modality Collaboration**|Qinghao Ye et.al.|[2311.04257v1](http://arxiv.org/abs/2311.04257v1)|**[link](https://github.com/x-plug/mplug-owl)**|\n", "2311.05494": "|**2023-11-09**|**Object-centric Cross-modal Feature Distillation for Event-based Object Detection**|Lei Li et.al.|[2311.05494v1](http://arxiv.org/abs/2311.05494v1)|null|\n", "2311.05464": "|**2023-11-09**|**3DStyle-Diffusion: Pursuing Fine-grained Text-driven 3D Stylization with 2D Diffusion Models**|Haibo Yang et.al.|[2311.05464v1](http://arxiv.org/abs/2311.05464v1)|**[link](https://github.com/yanghb22-fdu/3dstyle-diffusion-official)**|\n", "2311.05463": "|**2023-11-09**|**ControlStyle: Text-Driven Stylized Image Generation Using Diffusion Priors**|Jingwen Chen et.al.|[2311.05463v1](http://arxiv.org/abs/2311.05463v1)|null|\n", "2311.05348": "|**2023-11-09**|**u-LLaVA: Unifying Multi-Modal Tasks via Large Language Model**|Jinjin Xu et.al.|[2311.05348v1](http://arxiv.org/abs/2311.05348v1)|null|\n", "2311.05319": "|**2023-11-09**|**TLCFuse: Temporal Multi-Modality Fusion Towards Occlusion-Aware Semantic Segmentation-Aided Motion Planning**|Gustavo Salazar-Gomez et.al.|[2311.05319v1](http://arxiv.org/abs/2311.05319v1)|null|\n", "2311.05298": "|**2023-11-09**|**Improving Vision-and-Language Reasoning via Spatial Relations Modeling**|Cheng Yang et.al.|[2311.05298v1](http://arxiv.org/abs/2311.05298v1)|null|\n", "2311.05152": "|**2023-11-09**|**Cross-modal Prompts: Adapting Large Pre-trained Models for Audio-Visual Downstream Tasks**|Haoyi Duan et.al.|[2311.05152v1](http://arxiv.org/abs/2311.05152v1)|**[link](https://github.com/haoyi-duan/dg-sct)**|\n", "2311.05032": "|**2023-11-08**|**Transfer learning from a sparsely annotated dataset of 3D medical images**|Gabriel Efrain Humpire-Mamani et.al.|[2311.05032v1](http://arxiv.org/abs/2311.05032v1)|**[link](https://github.com/diagnijmegen/medicaltransferlearning3d-unet)**|\n"}, "Point Cloud Localization": {"2301.05372": "|**2023-01-13**|**Text to Point Cloud Localization with Relation-Enhanced Transformer**|Guangzhi Wang et.al.|[2301.05372v1](http://arxiv.org/abs/2301.05372v1)|null|\n", "2209.15475": "|**2022-09-30**|**Point Cloud Quality Assessment using 3D Saliency Maps**|Zhengyu Wang et.al.|[2209.15475v1](http://arxiv.org/abs/2209.15475v1)|null|\n", "2207.05317": "|**2022-07-12**|**CPO: Change Robust Panorama to Point Cloud Localization**|Junho Kim et.al.|[2207.05317v1](http://arxiv.org/abs/2207.05317v1)|null|\n", "2205.14965": "|**2022-05-31**|**PSNet: Fast Data Structuring for Hierarchical Deep Learning on Point Cloud**|Luyang Li et.al.|[2205.14965v2](http://arxiv.org/abs/2205.14965v2)|**[link](https://github.com/lly007/pointstructuringnet)**|\n", "2203.15125": "|**2022-04-05**|**Text2Pos: Text-to-Point-Cloud Cross-Modal Localization**|Manuel Kolmet et.al.|[2203.15125v2](http://arxiv.org/abs/2203.15125v2)|null|\n", "2003.02392": "|**2021-11-22**|**PointLoc: Deep Pose Regressor for LiDAR Point Cloud Localization**|Wei Wang et.al.|[2003.02392v3](http://arxiv.org/abs/2003.02392v3)|**[link](https://github.com/loveoxford/vreloc)**|\n", "1812.01711": "|**2018-11-28**|**A Graph-CNN for 3D Point Cloud Classification**|Yingxue Zhang et.al.|[1812.01711v1](http://arxiv.org/abs/1812.01711v1)|**[link](https://github.com/maggie0106/Graph-CNN-in-3D-Point-Cloud-Classification)**|\n", "1712.06760": "|**2018-04-03**|**Mining Point Cloud Local Structures by Kernel Correlation and Graph Pooling**|Yiru Shen et.al.|[1712.06760v2](http://arxiv.org/abs/1712.06760v2)|null|\n", "1702.04114": "|**2017-02-14**|**Graph Based Over-Segmentation Methods for 3D Point Clouds**|Yizhak Ben-Shabat et.al.|[1702.04114v1](http://arxiv.org/abs/1702.04114v1)|null|\n"}, "Place Recognization": {"2302.06149": "|**2023-02-13**|**Contour Context: Abstract Structural Distribution for 3D LiDAR Loop Detection and Metric Pose Estimation**|Binqian Jiang et.al.|[2302.06149v1](http://arxiv.org/abs/2302.06149v1)|**[link](https://github.com/lewisjiang/contour-context)**|\n", "2301.05604": "|**2023-01-13**|**A LiDAR-Inertial-Visual SLAM System with Loop Detection**|Kangcheng Liu et.al.|[2301.05604v1](http://arxiv.org/abs/2301.05604v1)|null|\n", "2212.12745": "|**2022-12-24**|**GraffMatch: Global Matching of 3D Lines and Planes for Wide Baseline LiDAR Registration**|Parker C. Lusk et.al.|[2212.12745v1](http://arxiv.org/abs/2212.12745v1)|null|\n", "2211.14864": "|**2022-11-27**|**A Faster, Lighter and Stronger Deep Learning-Based Approach for Place Recognition**|Rui Huang et.al.|[2211.14864v1](http://arxiv.org/abs/2211.14864v1)|null|\n", "2211.12732": "|**2023-03-02**|**Wild-Places: A Large-Scale Dataset for Lidar Place Recognition in Unstructured Natural Environments**|Joshua Knights et.al.|[2211.12732v3](http://arxiv.org/abs/2211.12732v3)|**[link](https://github.com/csiro-robotics/Wild-Places)**|\n", "2210.13856": "|**2022-11-02**|**A Framework for Collaborative Multi-Robot Mapping using Spectral Graph Wavelets**|Lukas Bernreiter et.al.|[2210.13856v2](http://arxiv.org/abs/2210.13856v2)|null|\n", "2210.11029": "|**2022-10-20**|**DeepRING: Learning Roto-translation Invariant Representation for LiDAR based Place Recognition**|Sha Lu et.al.|[2210.11029v1](http://arxiv.org/abs/2210.11029v1)|null|\n", "2210.04432": "|**2023-03-06**|**Spectral Geometric Verification: Re-Ranking Point Cloud Retrieval for Metric Localization**|Kavisha Vidanapathirana et.al.|[2210.04432v2](http://arxiv.org/abs/2210.04432v2)|**[link](https://github.com/csiro-robotics/spectralgv)**|\n", "2210.04236": "|**2022-10-09**|**Fusing Event-based Camera and Radar for SLAM Using Spiking Neural Networks with Continual STDP Learning**|Ali Safa et.al.|[2210.04236v1](http://arxiv.org/abs/2210.04236v1)|null|\n", "2210.01320": "|**2022-11-23**|**Wi-Closure: Reliable and Efficient Search of Inter-robot Loop Closures Using Wireless Sensing**|Weiying Wang et.al.|[2210.01320v2](http://arxiv.org/abs/2210.01320v2)|null|\n", "2209.12513": "|**2022-09-26**|**NDD: A 3D Point Cloud Descriptor Based on Normal Distribution for Loop Closure Detection**|Ruihao Zhou et.al.|[2209.12513v1](http://arxiv.org/abs/2209.12513v1)|**[link](https://github.com/zhouruihao1001/ndd)**|\n", "2209.11894": "|**2022-09-24**|**Closing the Loop: Graph Networks to Unify Semantic Objects and Visual Features for Multi-object Scenes**|Jonathan J. Y. Kim et.al.|[2209.11894v1](http://arxiv.org/abs/2209.11894v1)|null|\n", "2209.09699": "|**2023-03-28**|**PADLoC: LiDAR-Based Deep Loop Closure Detection and Registration Using Panoptic Attention**|Jos\u00e9 Arce et.al.|[2209.09699v3](http://arxiv.org/abs/2209.09699v3)|**[link](https://github.com/robot-learning-freiburg/PADLoC)**|\n", "2209.08608": "|**2022-09-18**|**HGI-SLAM: Loop Closure With Human and Geometric Importance Features**|Shuhul Mujoo et.al.|[2209.08608v1](http://arxiv.org/abs/2209.08608v1)|null|\n", "2209.08578": "|**2022-09-18**|**Data-driven Loop Closure Detection in Bathymetric Point Clouds for Underwater SLAM**|Jiarui Tan et.al.|[2209.08578v1](http://arxiv.org/abs/2209.08578v1)|**[link](https://github.com/tjr16/bathy_nn_learning)**|\n", "2209.06779": "|**2022-10-15**|**Efficient Planar Pose Estimation via UWB Measurements**|Haodong Jiang et.al.|[2209.06779v3](http://arxiv.org/abs/2209.06779v3)|**[link](https://github.com/SLAMLab-CUHKSZ/Efficient-Pose-Estimation-via-UWB-measurements)**|\n", "2209.06545": "|**2023-01-12**|**Tac2Structure: Object Surface Reconstruction Only through Multi Times Touch**|Junyuan Lu et.al.|[2209.06545v3](http://arxiv.org/abs/2209.06545v3)|**[link](https://github.com/ljy-zju/tac2structure)**|\n", "2209.04497": "|**2022-09-09**|**General Place Recognition Survey: Towards the Real-world Autonomy Age**|Peng Yin et.al.|[2209.04497v1](http://arxiv.org/abs/2209.04497v1)|**[link](https://github.com/MetaSLAM/GPRS)**|\n", "2207.10916": "|**2022-07-22**|**PLD-SLAM: A Real-Time Visual SLAM Using Points and Line Segments in Dynamic Scenes**|BaoSheng Zhang et.al.|[2207.10916v1](http://arxiv.org/abs/2207.10916v1)|null|\n", "2207.06965": "|**2022-09-28**|**AutoMerge: A Framework for Map Assembling and Smoothing in City-scale Environments**|Peng Yin et.al.|[2207.06965v3](http://arxiv.org/abs/2207.06965v3)|null|\n", "2207.06738": "|**2022-07-14**|**Semi-supervised Vector-Quantization in Visual SLAM using HGCN**|Amir Zarringhalam et.al.|[2207.06738v1](http://arxiv.org/abs/2207.06738v1)|null|\n", "2207.06732": "|**2022-07-14**|**Self-supervised Vector-Quantization in Visual SLAM using Deep Convolutional Autoencoders**|Amir Zarringhalam et.al.|[2207.06732v1](http://arxiv.org/abs/2207.06732v1)|null|\n", "2206.12628": "|**2022-09-27**|**FreSCo: Frequency-Domain Scan Context for LiDAR-based Place Recognition with Translation and Rotation Invariance**|Yongzhi Fan et.al.|[2206.12628v2](http://arxiv.org/abs/2206.12628v2)|**[link](https://github.com/soytony/fresco)**|\n", "2206.08733": "|**2022-06-17**|**Efficient WiFi LiDAR SLAM for Autonomous Robots in Large Environments**|Khairuldanial Ismail et.al.|[2206.08733v1](http://arxiv.org/abs/2206.08733v1)|null|\n", "2205.13135": "|**2022-07-09**|**LAMP 2.0: A Robust Multi-Robot SLAM System for Operation in Challenging Large-Scale Underground Environments**|Yun Chang et.al.|[2205.13135v3](http://arxiv.org/abs/2205.13135v3)|**[link](https://github.com/nebula-autonomy/nebula-multirobot-dataset)**|\n", "2204.12831": "|**2022-11-09**|**The Revisiting Problem in Simultaneous Localization and Mapping: A Survey on Visual Loop Closure Detection**|Konstantinos A. Tsintotas et.al.|[2204.12831v3](http://arxiv.org/abs/2204.12831v3)|null|\n", "2204.05481": "|**2022-04-12**|**HiTPR: Hierarchical Transformer for Place Recognition in Point Cloud**|Zhixing Hou et.al.|[2204.05481v1](http://arxiv.org/abs/2204.05481v1)|null|\n", "2204.04932": "|**2022-04-11**|**Optimized SC-F-LOAM: Optimized Fast LiDAR Odometry and Mapping Using Scan Context**|Lizhou Liao et.al.|[2204.04932v1](http://arxiv.org/abs/2204.04932v1)|**[link](https://github.com/SlamCabbage/Optimized-SC-F-LOAM)**|\n", "2204.01524": "|**2022-04-01**|**Bi-directional Loop Closure for Visual SLAM**|Ihtisham Ali et.al.|[2204.01524v1](http://arxiv.org/abs/2204.01524v1)|null|\n", "2203.03454": "|**2022-03-07**|**Multi-Modal Lidar Dataset for Benchmarking General-Purpose Localization and Mapping Algorithms**|Qingqing Li et.al.|[2203.03454v1](http://arxiv.org/abs/2203.03454v1)|**[link](https://github.com/tiers/tiers-lidars-dataset)**|\n", "2201.13360": "|**2022-06-20**|**Hydra: A Real-time Spatial Perception System for 3D Scene Graph Construction and Optimization**|Nathan Hughes et.al.|[2201.13360v2](http://arxiv.org/abs/2201.13360v2)|null|\n", "2201.09048": "|**2022-01-22**|**Phase-SLAM: Phase Based Simultaneous Localization and Mapping for Mobile Structured Light Illumination Systems**|Xi Zheng et.al.|[2201.09048v1](http://arxiv.org/abs/2201.09048v1)|**[link](https://github.com/zhengxi-git/phase-slam)**|\n", "2201.03212": "|**2022-01-10**|**Why-So-Deep: Towards Boosting Previously Trained Models for Visual Place Recognition**|M. Usman Maqbool Bhutta et.al.|[2201.03212v1](http://arxiv.org/abs/2201.03212v1)|**[link](https://github.com/UsmanMaqbool/why-so-deep)**|\n", "2111.14990": "|**2021-11-29**|**MIXER: A Principled Framework for Multimodal, Multiway Data Association**|Parker C. Lusk et.al.|[2111.14990v1](http://arxiv.org/abs/2111.14990v1)|null|\n", "2111.13838": "|**2021-11-27**|**DSC: Deep Scan Context Descriptor for Large-Scale Place Recognition**|Jiafeng Cui et.al.|[2111.13838v1](http://arxiv.org/abs/2111.13838v1)|null|\n", "2111.13826": "|**2021-11-27**|**Average Outward Flux Skeletons for Environment Mapping and Topology Matching**|Morteza Rezanejad et.al.|[2111.13826v1](http://arxiv.org/abs/2111.13826v1)|null|\n", "2111.00440": "|**2022-02-27**|**Loop closure detection using local 3D deep descriptors**|Youjie Zhou et.al.|[2111.00440v2](http://arxiv.org/abs/2111.00440v2)|**[link](https://github.com/yiming107/l3d_loop_closure)**|\n", "2110.11491": "|**2021-10-21**|**SymbioLCD: Ensemble-Based Loop Closure Detection using CNN-Extracted Objects and Visual Bag-of-Words**|Jonathan J. Y. Kim et.al.|[2110.11491v1](http://arxiv.org/abs/2110.11491v1)|null|\n", "2109.08975": "|**2022-03-09**|**AirLoop: Lifelong Loop Closure Detection**|Dasong Gao et.al.|[2109.08975v3](http://arxiv.org/abs/2109.08975v3)|**[link](https://github.com/wang-chen/airloop)**|\n", "2109.06596": "|**2021-09-14**|**GPGM-SLAM: a Robust SLAM System for Unstructured Planetary Environments with Gaussian Process Gradient Maps**|Riccardo Giubilato et.al.|[2109.06596v1](http://arxiv.org/abs/2109.06596v1)|null|\n", "2108.12790": "|**2022-08-28**|**RPR-Net: A Point Cloud-based Rotation-aware Large Scale Place Recognition Network**|Zhaoxin Fan et.al.|[2108.12790v3](http://arxiv.org/abs/2108.12790v3)|null|\n", "2108.02028": "|**2021-08-04**|**Incorporating Learnt Local and Global Embeddings into Monocular Visual SLAM**|Huaiyang Huang et.al.|[2108.02028v1](http://arxiv.org/abs/2108.02028v1)|null|\n", "2108.01383": "|**2021-08-03**|**On the descriptive power of LiDAR intensity images for segment-based loop closing in 3-D SLAM**|Jan Wietrzykowski et.al.|[2108.01383v1](http://arxiv.org/abs/2108.01383v1)|**[link](https://github.com/LRMPUT/segmap_vis_views)**|\n", "2107.14611": "|**2021-07-30**|**Automatic Vocabulary and Graph Verification for Accurate Loop Closure Detection**|Haosong Yue et.al.|[2107.14611v1](http://arxiv.org/abs/2107.14611v1)|null|\n", "2107.07707": "|**2021-07-16**|**Probabilistic Appearance-Invariant Topometric Localization with New Place Awareness**|Ming Xu et.al.|[2107.07707v1](http://arxiv.org/abs/2107.07707v1)|**[link](https://github.com/mingu6/TopometricLoc)**|\n", "2107.07133": "|**2021-07-15**|**A life-long SLAM approach using adaptable local maps based on rasterized LIDAR images**|Waqas Ali et.al.|[2107.07133v1](http://arxiv.org/abs/2107.07133v1)|null|\n", "2106.11516": "|**2021-07-01**|**SA-LOAM: Semantic-aided LiDAR SLAM with Loop Closure**|Lin Li et.al.|[2106.11516v2](http://arxiv.org/abs/2106.11516v2)|null|\n", "2106.09637": "|**2023-01-04**|**AttDLNet: Attention-based DL Network for 3D LiDAR Place Recognition**|Tiago Barros et.al.|[2106.09637v4](http://arxiv.org/abs/2106.09637v4)|**[link](https://github.com/cybonic/attdlnet)**|\n", "2105.11344": "|**2021-05-24**|**OverlapNet: Loop Closing for LiDAR-based SLAM**|Xieyuanli Chen et.al.|[2105.11344v1](http://arxiv.org/abs/2105.11344v1)|**[link](https://github.com/PRBonn/OverlapNet)**|\n", "2103.12292": "|**2021-03-23**|**NDT-Transformer: Large-Scale 3D Point Cloud Localisation using the Normal Distribution Transform Representation**|Zhicheng Zhou et.al.|[2103.12292v1](http://arxiv.org/abs/2103.12292v1)|**[link](https://github.com/dachengxiaocheng/NDT-Transformer)**|\n", "2303.00477": "|**2023-03-01**|**ORCHNet: A Robust Global Feature Aggregation approach for 3D LiDAR-based Place recognition in Orchards**|T. Barros et.al.|[2303.00477v1](http://arxiv.org/abs/2303.00477v1)|**[link](https://github.com/cybonic/orchnet)**|\n", "2303.00295": "|**2023-03-01**|**Region Prediction for Efficient Robot Localization on Large Maps**|Matteo Scucchia et.al.|[2303.00295v1](http://arxiv.org/abs/2303.00295v1)|null|\n", "2304.03872": "|**2023-06-24**|**LSGDDN-LCD: An Appearance-based Loop Closure Detection using Local Superpixel Grid Descriptors and Incremental Dynamic Nodes**|Baosheng Zhang et.al.|[2304.03872v2](http://arxiv.org/abs/2304.03872v2)|null|\n", "2304.05146": "|**2023-04-14**|**Loop Closure Detection Based on Object-level Spatial Layout and Semantic Consistency**|Xingwu Ji et.al.|[2304.05146v2](http://arxiv.org/abs/2304.05146v2)|**[link](https://github.com/jixingwu/ss-lcd)**|\n", "2304.13487": "|**2023-04-26**|**Hydra-Multi: Collaborative Online Construction of 3D Scene Graphs with Multi-Robot Teams**|Yun Chang et.al.|[2304.13487v1](http://arxiv.org/abs/2304.13487v1)|null|\n", "2305.07154": "|**2023-05-11**|**Foundations of Spatial Perception for Robotics: Hierarchical Representations and Real-time Systems**|Nathan Hughes et.al.|[2305.07154v1](http://arxiv.org/abs/2305.07154v1)|**[link](https://github.com/mit-spark/hydra)**|\n", "2305.18013": "|**2023-05-29**|**TReR: A Lightweight Transformer Re-Ranking Approach for 3D LiDAR Place Recognition**|Tiago Barros et.al.|[2305.18013v1](http://arxiv.org/abs/2305.18013v1)|null|\n", "2307.04321": "|**2023-07-10**|**RaPlace: Place Recognition for Imaging Radar using Radon Transform and Mutable Threshold**|Hyesu Jang et.al.|[2307.04321v1](http://arxiv.org/abs/2307.04321v1)|**[link](https://github.com/hyesu-jang/raplace)**|\n", "2307.08221": "|**2023-07-17**|**NDT-Map-Code: A 3D global descriptor for real-time loop closure detection in lidar SLAM**|Lizhou Liao et.al.|[2307.08221v1](http://arxiv.org/abs/2307.08221v1)|**[link](https://github.com/SlamCabbage/NDTMC)**|\n", "2307.09044": "|**2023-07-18**|**3D-SeqMOS: A Novel Sequential 3D Moving Object Segmentation in Autonomous Driving**|Qipeng Li et.al.|[2307.09044v1](http://arxiv.org/abs/2307.09044v1)|null|\n", "2309.02394": "|**2023-09-05**|**Magnetic Navigation using Attitude-Invariant Magnetic Field Information for Loop Closure Detection**|Natalia Pavlasek et.al.|[2309.02394v1](http://arxiv.org/abs/2309.02394v1)|null|\n", "2309.07094": "|**2023-09-13**|**RadarLCD: Learnable Radar-based Loop Closure Detection Pipeline**|Mirko Usuelli et.al.|[2309.07094v1](http://arxiv.org/abs/2309.07094v1)|null|\n", "2309.09879": "|**2023-09-18**|**DynaPix SLAM: A Pixel-Based Dynamic SLAM Approach**|Chenghao Xu et.al.|[2309.09879v1](http://arxiv.org/abs/2309.09879v1)|null|\n", "2309.08914": "|**2023-09-16**|**Outram: One-shot Global Localization via Triangulated Scene Graph and Global Outlier Pruning**|Pengyu Yin et.al.|[2309.08914v1](http://arxiv.org/abs/2309.08914v1)|**[link](https://github.com/pamphlett/outram)**|\n"}, "LiDAR SLAM": {"2212.14209": "|**2022-12-29**|**An Enhanced LiDAR-Inertial SLAM System for Robotics Localization and Mapping**|Kangcheng Liu et.al.|[2212.14209v1](http://arxiv.org/abs/2212.14209v1)|**[link](https://github.com/KangchengLiu/slam_resources)**|\n", "2212.05705": "|**2022-12-12**|**An Integrated LiDAR-SLAM System for Complex Environment with Noisy Point Clouds**|Kangcheng Liu et.al.|[2212.05705v1](http://arxiv.org/abs/2212.05705v1)|**[link](https://github.com/KangchengLiu/DLC_LiDAR_SLAM)**|\n", "2212.02077": "|**2022-12-05**|**DL-SLOT: Dynamic LiDAR SLAM and object tracking based on collaborative graph optimization**|Xuebo Tian et.al.|[2212.02077v1](http://arxiv.org/abs/2212.02077v1)|null|\n", "2211.03484": "|**2022-11-07**|**When Geometry is not Enough: Using Reflector Markers in Lidar SLAM**|Gerhard Kurz et.al.|[2211.03484v1](http://arxiv.org/abs/2211.03484v1)|null|\n", "2211.02445": "|**2023-04-14**|**Lidar-level localization with radar? The CFEAR approach to accurate, fast and robust large-scale radar odometry in diverse environments**|Daniel Adolfsson et.al.|[2211.02445v3](http://arxiv.org/abs/2211.02445v3)|**[link](https://github.com/dan11003/CFEAR_Radarodometry_code_public)**|\n", "2210.11978": "|**2023-04-13**|**DCL-SLAM: A Distributed Collaborative LiDAR SLAM Framework for a Robotic Swarm**|Shipeng Zhong et.al.|[2210.11978v2](http://arxiv.org/abs/2210.11978v2)|**[link](https://github.com/pengyu-team/dcl-slam)**|\n", "2210.00812": "|**2022-10-03**|**A Benchmark for Multi-Modal Lidar SLAM with Ground Truth in GNSS-Denied Environments**|Ha Sier et.al.|[2210.00812v1](http://arxiv.org/abs/2210.00812v1)|**[link](https://github.com/tiers/tiers-lidars-dataset-enhanced)**|\n", "2209.08810": "|**2022-09-19**|**LMBAO: A Landmark Map for Bundle Adjustment Odometry in LiDAR SLAM**|Letian Zhang et.al.|[2209.08810v1](http://arxiv.org/abs/2209.08810v1)|null|\n", "2209.08248": "|**2022-09-29**|**PlaneSLAM: Plane-based LiDAR SLAM for Motion Planning in Structured 3D Environments**|Adam Dai et.al.|[2209.08248v2](http://arxiv.org/abs/2209.08248v2)|**[link](https://github.com/stanford-navlab/planeslam)**|\n", "2209.08091": "|**2022-09-16**|**ViWiD: Leveraging WiFi for Robust and Resource-Efficient SLAM**|Aditya Arun et.al.|[2209.08091v1](http://arxiv.org/abs/2209.08091v1)|null|\n", "2208.11855": "|**2022-08-25**|**Lidar SLAM for Autonomous Driving Vehicles**|Farhad Aghili et.al.|[2208.11855v1](http://arxiv.org/abs/2208.11855v1)|null|\n", "2208.09777": "|**2022-09-08**|**JVLDLoc: a Joint Optimization of Visual-LiDAR Constraints and Direction Priors for Localization in Driving Scenario**|Longrui Dong et.al.|[2208.09777v3](http://arxiv.org/abs/2208.09777v3)|null|\n", "2208.07473": "|**2022-11-18**|**BoW3D: Bag of Words for Real-Time Loop Closing in 3D LiDAR SLAM**|Yunge Cui et.al.|[2208.07473v2](http://arxiv.org/abs/2208.07473v2)|**[link](https://github.com/yungecui/bow3d)**|\n", "2207.06815": "|**2022-07-14**|**Challenges of SLAM in extremely unstructured environments: the DLR Planetary Stereo, Solid-State LiDAR, Inertial Dataset**|Riccardo Giubilato et.al.|[2207.06815v1](http://arxiv.org/abs/2207.06815v1)|null|\n", "2206.09463": "|**2022-06-19**|**RF-LIO: Removal-First Tightly-coupled Lidar Inertial Odometry in High Dynamic Environments**|Chenglong Qian et.al.|[2206.09463v1](http://arxiv.org/abs/2206.09463v1)|null|\n", "2206.08733": "|**2022-06-17**|**Efficient WiFi LiDAR SLAM for Autonomous Robots in Large Environments**|Khairuldanial Ismail et.al.|[2206.08733v1](http://arxiv.org/abs/2206.08733v1)|null|\n", "2206.00266": "|**2022-06-01**|**PaGO-LOAM: Robust Ground-Optimized LiDAR Odometry**|Dong-Uk Seo et.al.|[2206.00266v1](http://arxiv.org/abs/2206.00266v1)|**[link](https://github.com/url-kaist/alterground-lego-loam)**|\n", "2205.08556": "|**2022-05-17**|**Global Data Association for SLAM with 3D Grassmannian Manifold Objects**|Parker C. Lusk et.al.|[2205.08556v1](http://arxiv.org/abs/2205.08556v1)|null|\n", "2204.12769": "|**2022-04-27**|**Dynamic Registration: Joint Ego Motion Estimation and 3D Moving Object Detection in Dynamic Environment**|Wenyu Li et.al.|[2204.12769v1](http://arxiv.org/abs/2204.12769v1)|null|\n", "2204.08163": "|**2022-04-18**|**Mapping While Following: 2D LiDAR SLAM in Indoor Dynamic Environments with a Person Tracker**|Hanjing Ye et.al.|[2204.08163v1](http://arxiv.org/abs/2204.08163v1)|null|\n", "2203.13799": "|**2022-03-25**|**Gravity-constrained point cloud registration**|Vladim\u00edr Kubelka et.al.|[2203.13799v1](http://arxiv.org/abs/2203.13799v1)|null|\n", "2202.11431": "|**2022-02-23**|**DL-SLOT: Dynamic Lidar SLAM and Object Tracking Based On Graph Optimization**|Xuebo Tian et.al.|[2202.11431v1](http://arxiv.org/abs/2202.11431v1)|null|\n", "2201.06423": "|**2022-01-17**|**SC-LiDAR-SLAM: a Front-end Agnostic Versatile LiDAR SLAM System**|Giseop Kim et.al.|[2201.06423v1](http://arxiv.org/abs/2201.06423v1)|null|\n", "2110.11517": "|**2021-10-21**|**Real-Time Ground-Plane Refined LiDAR SLAM**|Fan Yang et.al.|[2110.11517v1](http://arxiv.org/abs/2110.11517v1)|null|\n", "2110.02018": "|**2021-10-03**|**AEROS: Adaptive RObust least-Squares for Graph-Based SLAM**|Milad Ramezani et.al.|[2110.02018v1](http://arxiv.org/abs/2110.02018v1)|null|\n", "2109.05483": "|**2021-09-12**|**ART-SLAM: Accurate Real-Time 6DoF LiDAR SLAM**|Matteo Frosi et.al.|[2109.05483v1](http://arxiv.org/abs/2109.05483v1)|**[link](https://github.com/matteof94/artslam)**|\n", "2109.00200": "|**2021-09-01**|**A real-time global re-localization framework for 3D LiDAR SLAM**|Ziqi Chai et.al.|[2109.00200v1](http://arxiv.org/abs/2109.00200v1)|null|\n", "2108.01383": "|**2021-08-03**|**On the descriptive power of LiDAR intensity images for segment-based loop closing in 3-D SLAM**|Jan Wietrzykowski et.al.|[2108.01383v1](http://arxiv.org/abs/2108.01383v1)|**[link](https://github.com/LRMPUT/segmap_vis_views)**|\n", "2107.05283": "|**2021-07-12**|**Benchmark of visual and 3D lidar SLAM systems in simulation environment for vineyards**|Ibrahim Hroob et.al.|[2107.05283v1](http://arxiv.org/abs/2107.05283v1)|null|\n", "2106.11516": "|**2021-07-01**|**SA-LOAM: Semantic-aided LiDAR SLAM with Loop Closure**|Lin Li et.al.|[2106.11516v2](http://arxiv.org/abs/2106.11516v2)|null|\n", "2105.08941": "|**2021-05-19**|**Large-scale Localization Datasets in Crowded Indoor Spaces**|Donghwan Lee et.al.|[2105.08941v1](http://arxiv.org/abs/2105.08941v1)|null|\n", "2105.03296": "|**2021-10-05**|**VIRAL SLAM: Tightly Coupled Camera-IMU-UWB-Lidar SLAM**|Thien-Minh Nguyen et.al.|[2105.03296v3](http://arxiv.org/abs/2105.03296v3)|null|\n", "2104.05347": "|**2021-04-12**|**Radar SLAM: A Robust SLAM System for All Weather Conditions**|Ziyang Hong et.al.|[2104.05347v1](http://arxiv.org/abs/2104.05347v1)|null|\n", "2104.03657": "|**2021-04-08**|**Dynamic Object Aware LiDAR SLAM based on Automatic Generation of Training Data**|Patrick Pfreundschuh et.al.|[2104.03657v1](http://arxiv.org/abs/2104.03657v1)|null|\n", "2103.13090": "|**2021-03-24**|**Greedy-Based Feature Selection for Efficient LiDAR SLAM**|Jianhao Jiao et.al.|[2103.13090v1](http://arxiv.org/abs/2103.13090v1)|null|\n", "2103.10678": "|**2021-03-19**|**6-DOF Feature based LIDAR SLAM using ORB Features from Rasterized Images of 3D LIDAR Point Cloud**|Waqas Ali et.al.|[2103.10678v1](http://arxiv.org/abs/2103.10678v1)|null|\n", "2103.09523": "|**2021-12-30**|**A Universal LiDAR SLAM Accelerator System on Low-cost FPGA**|Keisuke Sugiura et.al.|[2103.09523v2](http://arxiv.org/abs/2103.09523v2)|null|\n", "2103.05056": "|**2022-02-08**|**LCDNet: Deep Loop Closure Detection and Point Cloud Registration for LiDAR SLAM**|Daniele Cattaneo et.al.|[2103.05056v4](http://arxiv.org/abs/2103.05056v4)|**[link](https://github.com/robot-learning-freiburg/LCDNet)**|\n", "2103.03713": "|**2021-03-05**|**Ground-SLAM: Ground Constrained LiDAR SLAM for Structured Multi-Floor Environments**|Xin Wei et.al.|[2103.03713v1](http://arxiv.org/abs/2103.03713v1)|null|\n", "2102.03800": "|**2021-02-17**|**Lightweight 3-D Localization and Mapping for Solid-State LiDAR**|Han Wang et.al.|[2102.03800v2](http://arxiv.org/abs/2102.03800v2)|**[link](https://github.com/wh200720041/SSL_SLAM)**|\n", "2102.03798": "|**2021-02-17**|**Intensity-SLAM: Intensity Assisted Localization and Mapping for Large Scale Environment**|Han Wang et.al.|[2102.03798v2](http://arxiv.org/abs/2102.03798v2)|**[link](https://github.com/wh200720041/intensity_slam)**|\n", "2102.03771": "|**2021-04-27**|**MULLS: Versatile LiDAR SLAM via Multi-metric Linear Least Square**|Yue Pan et.al.|[2102.03771v3](http://arxiv.org/abs/2102.03771v3)|**[link](https://github.com/YuePanEdward/MULLS)**|\n", "2101.06615": "|**2021-05-31**|**Online Robust Sliding-Windowed LiDAR SLAM in Natural Environments**|Quang-Ha Pham et.al.|[2101.06615v6](http://arxiv.org/abs/2101.06615v6)|null|\n", "2012.03455": "|**2020-12-07**|**TP-TIO: A Robust Thermal-Inertial Odometry with Deep ThermalPoint**|Shibo Zhao et.al.|[2012.03455v1](http://arxiv.org/abs/2012.03455v1)|null|\n", "2012.02399": "|**2020-12-04**|**P3-LOAM: PPP/LiDAR Loosely Coupled SLAM with Accurate Covariance Estimation and Robust RAIM in Urban Canyon Environment**|Tao Li et.al.|[2012.02399v1](http://arxiv.org/abs/2012.02399v1)|null|\n", "2011.11357": "|**2020-11-23**|**CamVox: A Low-cost and Accurate Lidar-assisted Visual SLAM System**|Yuewen Zhu et.al.|[2011.11357v1](http://arxiv.org/abs/2011.11357v1)|**[link](https://github.com/ISEE-Technology/CamVox)**|\n", "2011.02306": "|**2021-09-11**|**A Comparison of LiDAR-based SLAM Systems for Control of Unmanned Aerial Vehicles**|Robert Milijas et.al.|[2011.02306v3](http://arxiv.org/abs/2011.02306v3)|null|\n", "2010.08215": "|**2021-01-13**|**BALM: Bundle Adjustment for Lidar Mapping**|Zheng Liu et.al.|[2010.08215v2](http://arxiv.org/abs/2010.08215v2)|**[link](https://github.com/hku-mars/BALM)**|\n", "2008.03694": "|**2020-08-09**|**LiDAR Data Enrichment Using Deep Learning Based on High-Resolution Image: An Approach to Achieve High-Performance LiDAR SLAM Using Low-cost LiDAR**|Jiang Yue et.al.|[2008.03694v1](http://arxiv.org/abs/2008.03694v1)|null|\n", "2008.02274": "|**2020-08-05**|**Elasticity Meets Continuous-Time: Map-Centric Dense 3D LiDAR SLAM**|Chanoh Park et.al.|[2008.02274v1](http://arxiv.org/abs/2008.02274v1)|null|\n", "2302.13613": "|**2023-03-13**|**Evaluation of Lidar-based 3D SLAM algorithms in SubT environment**|Anton Koval et.al.|[2302.13613v2](http://arxiv.org/abs/2302.13613v2)|null|\n", "2303.01155": "|**2023-04-07**|**Marker-based Visual SLAM leveraging Hierarchical Representations**|Ali Tourani et.al.|[2303.01155v2](http://arxiv.org/abs/2303.01155v2)|null|\n", "2303.05252": "|**2023-03-09**|**SLAMesh: Real-time LiDAR Simultaneous Localization and Meshing**|Jianyuan Ruan et.al.|[2303.05252v1](http://arxiv.org/abs/2303.05252v1)|**[link](https://github.com/RuanJY/SLAMesh)**|\n", "2305.01843": "|**2023-05-03**|**Direct LiDAR-Inertial Odometry and Mapping: Perceptive and Connective SLAM**|Kenny Chen et.al.|[2305.01843v1](http://arxiv.org/abs/2305.01843v1)|null|\n", "2306.03660": "|**2023-06-06**|**PQM: A Point Quality Evaluation Metric for Dense Maps**|Yash Turkar et.al.|[2306.03660v1](http://arxiv.org/abs/2306.03660v1)|**[link](https://github.com/droneslab/pqm-sim)**|\n", "2307.08221": "|**2023-07-17**|**NDT-Map-Code: A 3D global descriptor for real-time loop closure detection in lidar SLAM**|Lizhou Liao et.al.|[2307.08221v1](http://arxiv.org/abs/2307.08221v1)|**[link](https://github.com/SlamCabbage/NDTMC)**|\n", "2307.09044": "|**2023-07-18**|**3D-SeqMOS: A Novel Sequential 3D Moving Object Segmentation in Autonomous Driving**|Qipeng Li et.al.|[2307.09044v1](http://arxiv.org/abs/2307.09044v1)|null|\n", "2307.15005": "|**2023-07-27**|**FLiCR: A Fast and Lightweight LiDAR Point Cloud Compression Based on Lossy RI**|Jin Heo et.al.|[2307.15005v1](http://arxiv.org/abs/2307.15005v1)|null|\n", "2309.04937": "|**2023-09-12**|**LONER: LiDAR Only Neural Representations for Real-Time SLAM**|Seth Isaacson et.al.|[2309.04937v2](http://arxiv.org/abs/2309.04937v2)|null|\n", "2309.08086": "|**2023-09-15**|**Fast and Accurate Deep Loop Closing and Relocalization for Reliable LiDAR SLAM**|Chenghao Shi et.al.|[2309.08086v1](http://arxiv.org/abs/2309.08086v1)|null|\n", "2311.00928": "|**2023-11-02**|**Quatro++: Robust Global Registration Exploiting Ground Segmentation for Loop Closing in LiDAR SLAM**|Hyungtae Lim et.al.|[2311.00928v1](http://arxiv.org/abs/2311.00928v1)|null|\n", "2311.02327": "|**2023-11-04**|**ECMD: An Event-Centric Multisensory Driving Dataset for SLAM**|Peiyu Chen et.al.|[2311.02327v1](http://arxiv.org/abs/2311.02327v1)|null|\n"}, "Transformer": {"2302.08104": "|**2023-02-16**|**Multiscalar field cosmological model and possible solutions using Noether symmetry approach**|Santu Mondal et.al.|[2302.08104v1](http://arxiv.org/abs/2302.08104v1)|null|\n", "2301.11622": "|**2023-01-30**|**Darboux transformations for Dunkl-Schroedinger equations with energy dependent potential and position dependent mass**|Axel Schulze-Halberg et.al.|[2301.11622v2](http://arxiv.org/abs/2301.11622v2)|null|\n", "2301.09364": "|**2023-04-06**|**On uniqueness of submaximally symmetric vector ordinary differential equations of C-class**|Johnson Allen Kessy et.al.|[2301.09364v2](http://arxiv.org/abs/2301.09364v2)|null|\n", "2301.08739": "|**2023-03-30**|**FlatFormer: Flattened Window Attention for Efficient Point Cloud Transformer**|Zhijian Liu et.al.|[2301.08739v2](http://arxiv.org/abs/2301.08739v2)|null|\n", "2301.07301": "|**2023-01-18**|**PTA-Det: Point Transformer Associating Point cloud and Image for 3D Object Detection**|Rui Wan et.al.|[2301.07301v1](http://arxiv.org/abs/2301.07301v1)|null|\n", "2301.02650": "|**2023-01-06**|**Model-Agnostic Hierarchical Attention for 3D Object Detection**|Manli Shu et.al.|[2301.02650v1](http://arxiv.org/abs/2301.02650v1)|null|\n", "2212.13736": "|**2022-12-28**|**Hermitian Topologies originating from non-Hermitian braidings**|W. B. Rui et.al.|[2212.13736v1](http://arxiv.org/abs/2212.13736v1)|null|\n", "2212.13276": "|**2022-12-26**|**Generalization of non-Cartan Symmetries to arbitrary dimensions**|J. C. Ndogmo et.al.|[2212.13276v1](http://arxiv.org/abs/2212.13276v1)|null|\n", "2212.13244": "|**2022-12-26**|**Equivalence classes and Linearization of the Riccati and Abel chain**|J. C. Ndogmo et.al.|[2212.13244v1](http://arxiv.org/abs/2212.13244v1)|null|\n", "2211.12510": "|**2022-11-22**|**Reconstructing the Image Scanning Microscopy Dataset: an Inverse Problem**|Alessandro Zunino et.al.|[2211.12510v1](http://arxiv.org/abs/2211.12510v1)|null|\n", "2211.02079": "|**2022-11-03**|**On Darboux non-integrability of the Hietarinta equation**|S. Ya. Startsev et.al.|[2211.02079v1](http://arxiv.org/abs/2211.02079v1)|null|\n", "2210.15933": "|**2022-10-28**|**PSFormer: Point Transformer for 3D Salient Object Detection**|Baian Chen et.al.|[2210.15933v1](http://arxiv.org/abs/2210.15933v1)|null|\n", "2210.06668": "|**2022-11-05**|**Aspects of the Equivalence Between the $f^\u03bc$ and $c^{\u03bd\u03bc}$ Terms in Lorentz-Violating Quantum Field Theory**|Sapan Karki et.al.|[2210.06668v2](http://arxiv.org/abs/2210.06668v2)|null|\n", "2210.05666": "|**2022-10-12**|**Point Transformer V2: Grouped Vector Attention and Partition-based Pooling**|Xiaoyang Wu et.al.|[2210.05666v2](http://arxiv.org/abs/2210.05666v2)|**[link](https://github.com/gofinge/pointtransformerv2)**|\n", "2209.11255": "|**2022-09-21**|**3DPCT: 3D Point Cloud Transformer with Dual Self-attention**|Dening Lu et.al.|[2209.11255v1](http://arxiv.org/abs/2209.11255v1)|null|\n", "2208.10395": "|**2022-08-22**|**Symmetry Classification of Scalar $n$th Order Ordinary Differential Equations**|Said Waqas Shah et.al.|[2208.10395v1](http://arxiv.org/abs/2208.10395v1)|null|\n", "2208.00281": "|**2022-12-20**|**Point Primitive Transformer for Long-Term 4D Point Cloud Video Understanding**|Hao Wen et.al.|[2208.00281v2](http://arxiv.org/abs/2208.00281v2)|**[link](https://github.com/hoi4d/PPTr)**|\n", "2207.13226": "|**2022-08-15**|**Boosting Point-BERT by Multi-choice Tokens**|Kexue Fu et.al.|[2207.13226v2](http://arxiv.org/abs/2207.13226v2)|**[link](https://github.com/fukexue/mcp-bert)**|\n", "2207.11995": "|**2022-07-26**|**3D Siamese Transformer Network for Single Object Tracking on Point Clouds**|Le Hui et.al.|[2207.11995v2](http://arxiv.org/abs/2207.11995v2)|**[link](https://github.com/fpthink/stnet)**|\n", "2207.10994": "|**2022-07-22**|**Learning Generalized Non-Rigid Multimodal Biomedical Image Registration from Generic Point Set Data**|Zachary MC Baum et.al.|[2207.10994v1](http://arxiv.org/abs/2207.10994v1)|null|\n", "2207.08575": "|**2022-07-18**|**Anisotropic spacetimes in $f(T,B)$ theory IV: Noether symmetry analysis**|Andronikos Paliathanasis et.al.|[2207.08575v1](http://arxiv.org/abs/2207.08575v1)|null|\n", "2206.15191": "|**2022-06-30**|**Lewis-Riesenfeld invariants for PT-symmetrically coupled oscillators from two dimensional point transformations and Lie algebraic expansions**|Andreas Fring et.al.|[2206.15191v1](http://arxiv.org/abs/2206.15191v1)|null|\n", "2206.04670": "|**2022-10-12**|**PointNeXt: Revisiting PointNet++ with Improved Training and Scaling Strategies**|Guocheng Qian et.al.|[2206.04670v2](http://arxiv.org/abs/2206.04670v2)|**[link](https://github.com/guochengqian/pointnext)**|\n", "2206.04511": "|**2022-08-29**|**Efficient Human Pose Estimation via 3D Event Point Cloud**|Jiaan Chen et.al.|[2206.04511v2](http://arxiv.org/abs/2206.04511v2)|**[link](https://github.com/masterhow/eventpointpose)**|\n", "2205.08886": "|**2022-05-18**|**GeoPointGAN: Synthetic Spatial Data with Local Label Differential Privacy**|Teddy Cunningham et.al.|[2205.08886v1](http://arxiv.org/abs/2205.08886v1)|**[link](https://github.com/konstantinklemmer/geopointgan)**|\n", "2204.03957": "|**2022-04-08**|**Points to Patches: Enabling the Use of Self-Attention for 3D Shape Recognition**|Axel Berg et.al.|[2204.03957v1](http://arxiv.org/abs/2204.03957v1)|**[link](https://github.com/axeber01/point-tnt)**|\n", "2203.12758": "|**2022-03-23**|**Mokey: Enabling Narrow Fixed-Point Inference for Out-of-the-Box Floating-Point Transformer Models**|Ali Hadi Zadeh et.al.|[2203.12758v1](http://arxiv.org/abs/2203.12758v1)|null|\n", "2203.04007": "|**2022-08-31**|**DuMLP-Pin: A Dual-MLP-dot-product Permutation-invariant Network for Set Feature Extraction**|Jiajun Fei et.al.|[2203.04007v2](http://arxiv.org/abs/2203.04007v2)|**[link](https://github.com/jaronthu/dumlp-pin)**|\n", "2203.00972": "|**2022-04-07**|**Improving Point Cloud Based Place Recognition with Ranking-based Loss and Large Batch Training**|Jacek Komorowski et.al.|[2203.00972v2](http://arxiv.org/abs/2203.00972v2)|**[link](https://github.com/jac99/minkloc3dv2)**|\n", "2201.05140": "|**2022-01-13**|**An introduction to PT-symmetric quantum mechanics -- time-dependent systems**|Andreas Fring et.al.|[2201.05140v1](http://arxiv.org/abs/2201.05140v1)|null|\n", "2112.13725": "|**2021-12-27**|**Near-Optimal Bounds for Generalized Orthogonal Procrustes Problem via Generalized Power Method**|Shuyang Ling et.al.|[2112.13725v1](http://arxiv.org/abs/2112.13725v1)|null|\n", "2112.11959": "|**2021-12-22**|**Dynamics of a symmetrically decoupled three-dimensional point transformation**|Hacene Gharout et.al.|[2112.11959v1](http://arxiv.org/abs/2112.11959v1)|null|\n", "2112.05635": "|**2021-12-10**|**Geometry of inhomogeneous Poisson brackets, multicomponent Harry Dym hierarchies and multicomponent Hunter-Saxton equations**|Andrey Yu. Konyaev et.al.|[2112.05635v1](http://arxiv.org/abs/2112.05635v1)|null|\n", "2112.04863": "|**2021-12-17**|**3D Medical Point Transformer: Introducing Convolution to Attention Networks for Medical Point Cloud Analysis**|Jianhui Yu et.al.|[2112.04863v2](http://arxiv.org/abs/2112.04863v2)|**[link](https://github.com/crane-papercode/3dmedpt)**|\n", "2112.04702": "|**2022-04-04**|**Fast Point Transformer**|Chunghyun Park et.al.|[2112.04702v2](http://arxiv.org/abs/2112.04702v2)|**[link](https://github.com/POSTECH-CVLab/FastPointTransformer)**|\n", "2111.14819": "|**2022-06-06**|**Point-BERT: Pre-training 3D Point Cloud Transformers with Masked Point Modeling**|Xumin Yu et.al.|[2111.14819v2](http://arxiv.org/abs/2111.14819v2)|**[link](https://github.com/lulutang0608/Point-BERT)**|\n", "2111.14451": "|**2022-03-31**|**HDR-NeRF: High Dynamic Range Neural Radiance Fields**|Xin Huang et.al.|[2111.14451v3](http://arxiv.org/abs/2111.14451v3)|null|\n", "2111.13702": "|**2022-12-12**|**The Information Content of Projected Galaxy Fields**|Lucas Porth et.al.|[2111.13702v2](http://arxiv.org/abs/2111.13702v2)|null|\n", "2111.10866": "|**2021-11-21**|**CpT: Convolutional Point Transformer for 3D Point Cloud Processing**|Chaitanya Kaul et.al.|[2111.10866v1](http://arxiv.org/abs/2111.10866v1)|null|\n", "2111.08973": "|**2021-11-19**|**Generating Unrestricted 3D Adversarial Point Clouds**|Xuelong Dai et.al.|[2111.08973v2](http://arxiv.org/abs/2111.08973v2)|**[link](https://github.com/EricDai0/AdvGCGAN)**|\n", "2111.00207": "|**2022-03-24**|**PatchFormer: An Efficient Point Transformer with Patch Attention**|Zhang Cheng et.al.|[2111.00207v3](http://arxiv.org/abs/2111.00207v3)|null|\n", "2110.05609": "|**2021-11-03**|**Comparison between time-independent and time-dependent quantum systems in the context of energy, Heisenberg uncertainty, average energy, force, average force and thermodynamic quantities**|Debraj Nath et.al.|[2110.05609v2](http://arxiv.org/abs/2110.05609v2)|null|\n", "2110.09230": "|**2021-10-07**|**A study on the Friedmann like Universe with Torsion using Noether Symmetry**|Ramkumar Radhakrishnan et.al.|[2110.09230v1](http://arxiv.org/abs/2110.09230v1)|null|\n", "2109.05023": "|**2021-09-20**|**Real-time multimodal image registration with partial intraoperative point-set data**|Zachary M C Baum et.al.|[2109.05023v2](http://arxiv.org/abs/2109.05023v2)|null|\n", "2109.02107": "|**2021-09-05**|**Normal Forms of second order Ordinary Differential Equations $y_{xx}=J(x,y,y_{x})$ under Fibre-Preserving Maps**|Wei Guo Foo et.al.|[2109.02107v1](http://arxiv.org/abs/2109.02107v1)|null|\n", "2108.08958": "|**2021-08-20**|**Exact solutions for time-dependent non-Hermitian oscillators: classical and quantum pictures**|Kevin Zelaya et.al.|[2108.08958v1](http://arxiv.org/abs/2108.08958v1)|null|\n", "2108.08891": "|**2021-08-19**|**Neural TMDlayer: Modeling Instantaneous flow of features via SDE Generators**|Zihang Meng et.al.|[2108.08891v1](http://arxiv.org/abs/2108.08891v1)|**[link](https://github.com/zihangm/neural-tmd-layer)**|\n", "2108.06076": "|**2022-05-25**|**PVT: Point-Voxel Transformer for Point Cloud Learning**|Cheng Zhang et.al.|[2108.06076v4](http://arxiv.org/abs/2108.06076v4)|**[link](https://github.com/HaochengWan/PVT)**|\n", "2108.00620": "|**2021-10-14**|**Investigating Attention Mechanism in 3D Point Cloud Object Detection**|Shi Qiu et.al.|[2108.00620v2](http://arxiv.org/abs/2108.00620v2)|**[link](https://github.com/ShiQiu0419/attentions_in_3D_detection)**|\n", "2107.14144": "|**2021-07-29**|**Reduction of balance laws in (3+1)--dimensions to autonomous conservation laws by means of equivalence transformations**|Matteo Gorgone et.al.|[2107.14144v1](http://arxiv.org/abs/2107.14144v1)|null|\n", "2303.01166": "|**2023-03-02**|**BPT: Binary Point Cloud Transformer for Place Recognition**|Zhixing Hou et.al.|[2303.01166v1](http://arxiv.org/abs/2303.01166v1)|null|\n", "2303.04458": "|**2023-03-08**|**Full Point Encoding for Local Feature Aggregation in 3D Point Clouds**|Yong He et.al.|[2303.04458v1](http://arxiv.org/abs/2303.04458v1)|null|\n", "2303.07766": "|**2023-03-14**|**Classical and quantum cosmology in $f(T)$-gravity theory: A Noether symmetry approach**|Roshni Bhaumik et.al.|[2303.07766v1](http://arxiv.org/abs/2303.07766v1)|null|\n", "2303.08274": "|**2023-03-14**|**GeoSpark: Sparking up Point Cloud Segmentation with Geometry Clue**|Zhening Huang et.al.|[2303.08274v1](http://arxiv.org/abs/2303.08274v1)|null|\n", "2303.15320": "|**2023-03-22**|**Noether's theorem and Lie symmetries for time-dependent Hamilton-Lagrange systems**|J\u00fcrgen Struckmeier et.al.|[2303.15320v1](http://arxiv.org/abs/2303.15320v1)|null|\n", "2303.17815": "|**2023-03-31**|**APPT : Asymmetric Parallel Point Transformer for 3D Point Cloud Understanding**|Hengjia Li et.al.|[2303.17815v1](http://arxiv.org/abs/2303.17815v1)|null|\n", "2304.02013": "|**2023-09-01**|**NPC: Neural Point Characters from Video**|Shih-Yang Su et.al.|[2304.02013v2](http://arxiv.org/abs/2304.02013v2)|null|\n", "2304.08279": "|**2023-05-27**|**MoDA: Modeling Deformable 3D Objects from Casual Videos**|Chaoyue Song et.al.|[2304.08279v2](http://arxiv.org/abs/2304.08279v2)|**[link](https://github.com/chaoyuesong/moda)**|\n", "2304.08681": "|**2023-09-07**|**The integer point transform as a complete invariant**|Sinai Robins et.al.|[2304.08681v4](http://arxiv.org/abs/2304.08681v4)|null|\n", "2304.14132": "|**2023-04-28**|**Human Semantic Segmentation using Millimeter-Wave Radar Sparse Point Clouds**|Pengfei Song et.al.|[2304.14132v2](http://arxiv.org/abs/2304.14132v2)|null|\n", "2305.00773": "|**2023-05-01**|**Point Cloud Semantic Segmentation**|Ivan Martinovi\u0107 et.al.|[2305.00773v1](http://arxiv.org/abs/2305.00773v1)|null|\n", "2305.03045": "|**2023-05-08**|**OctFormer: Octree-based Transformers for 3D Point Clouds**|Peng-Shuai Wang et.al.|[2305.03045v2](http://arxiv.org/abs/2305.03045v2)|**[link](https://github.com/octree-nn/octformer)**|\n", "2305.02533": "|**2023-05-04**|**Point Transformer For Coronary Artery Labeling**|Xu Wang et.al.|[2305.02533v1](http://arxiv.org/abs/2305.02533v1)|null|\n", "2306.10759": "|**2023-10-31**|**Simplifying and Empowering Transformers for Large-Graph Representations**|Qitian Wu et.al.|[2306.10759v3](http://arxiv.org/abs/2306.10759v3)|**[link](https://github.com/qitianwu/sgformer)**|\n", "2306.12361": "|**2023-06-21**|**Sigma-point Kalman Filter with Nonlinear Unknown Input Estimation via Optimization and Data-driven Approach for Dynamic Systems**|Junn Yong Loo et.al.|[2306.12361v1](http://arxiv.org/abs/2306.12361v1)|null|\n", "2306.10798": "|**2023-06-23**|**ExpPoint-MAE: Better interpretability and performance for self-supervised point cloud transformers**|Ioannis Romanelis et.al.|[2306.10798v2](http://arxiv.org/abs/2306.10798v2)|**[link](https://github.com/vvrpanda/exppoint-mae)**|\n", "2307.04723": "|**2023-07-18**|**Quark/Gluon Discrimination and Top Tagging with Dual Attention Transformer**|Minxuan He et.al.|[2307.04723v2](http://arxiv.org/abs/2307.04723v2)|null|\n", "2307.11973": "|**2023-07-22**|**Two-stream Multi-level Dynamic Point Transformer for Two-person Interaction Recognition**|Yao Liu et.al.|[2307.11973v1](http://arxiv.org/abs/2307.11973v1)|null|\n", "2308.04637": "|**2023-08-09**|**Sparse Binary Transformers for Multivariate Time Series Modeling**|Matt Gorbett et.al.|[2308.04637v1](http://arxiv.org/abs/2308.04637v1)|null|\n", "2308.09403": "|**2023-08-18**|**Target Clustering Based Multi-Bernoulli Filter for Superpositional Sensors**|Wang Sen et.al.|[2308.09403v1](http://arxiv.org/abs/2308.09403v1)|null|\n", "2309.00339": "|**2023-09-01**|**Robust Point Cloud Processing through Positional Embedding**|Jianqiao Zheng et.al.|[2309.00339v1](http://arxiv.org/abs/2309.00339v1)|null|\n", "2309.04105": "|**2023-09-08**|**Weakly Supervised Point Clouds Transformer for 3D Object Detection**|Zuojin Tang et.al.|[2309.04105v1](http://arxiv.org/abs/2309.04105v1)|null|\n", "2310.01545": "|**2023-10-02**|**RF-ULM: Deep Learning for Radio-Frequency Ultrasound Localization Microscopy**|Christopher Hahne et.al.|[2310.01545v1](http://arxiv.org/abs/2310.01545v1)|**[link](https://github.com/hahnec/rf-ulm)**|\n", "2310.05780": "|**2023-10-09**|**Lie symmetries for the cosmological field equations in brane-world gravity with bulk scalar field**|Andronikos Paliathanasis et.al.|[2310.05780v1](http://arxiv.org/abs/2310.05780v1)|null|\n", "2310.16861": "|**2023-10-25**|**General Point Model with Autoencoding and Autoregressive**|Zhe Li et.al.|[2310.16861v1](http://arxiv.org/abs/2310.16861v1)|null|\n", "2310.19772": "|**2023-10-22**|**Exact FLRW cosmological solutions via invariants of the symmetry groups**|E. Ahmadi Azar et.al.|[2310.19772v1](http://arxiv.org/abs/2310.19772v1)|null|\n", "2311.04081": "|**2023-11-07**|**Learning Super-Resolution Ultrasound Localization Microscopy from Radio-Frequency Data**|Christopher Hahne et.al.|[2311.04081v1](http://arxiv.org/abs/2311.04081v1)|null|\n"}, "NeRF": {"2302.12237": "|**2023-02-24**|**Learning Neural Volumetric Representations of Dynamic Humans in Minutes**|Chen Geng et.al.|[2302.12237v2](http://arxiv.org/abs/2302.12237v2)|**[link](https://github.com/zju3dv/instant-nvr)**|\n", "2302.12231": "|**2023-02-23**|**DiffusioNeRF: Regularizing Neural Radiance Fields with Denoising Diffusion Models**|Jamie Wynn et.al.|[2302.12231v1](http://arxiv.org/abs/2302.12231v1)|**[link](https://github.com/nianticlabs/diffusionerf)**|\n", "2302.10109": "|**2023-02-20**|**NerfDiff: Single-image View Synthesis with NeRF-guided Distillation from 3D-aware Diffusion**|Jiatao Gu et.al.|[2302.10109v1](http://arxiv.org/abs/2302.10109v1)|null|\n", "2302.09486": "|**2023-02-19**|**LC-NeRF: Local Controllable Face Generation in Neural Randiance Field**|Wenyang Zhou et.al.|[2302.09486v1](http://arxiv.org/abs/2302.09486v1)|null|\n", "2302.08788": "|**2023-02-17**|**MixNeRF: Modeling a Ray with Mixture Density for Novel View Synthesis from Sparse Inputs**|Seunghyeon Seo et.al.|[2302.08788v1](http://arxiv.org/abs/2302.08788v1)|**[link](https://github.com/shawn615/MixNeRF)**|\n", "2302.06833": "|**2023-02-14**|**VQ3D: Learning a 3D-Aware Generative Model on ImageNet**|Kyle Sargent et.al.|[2302.06833v1](http://arxiv.org/abs/2302.06833v1)|null|\n", "2302.06608": "|**2023-02-13**|**3D-aware Blending with Generative NeRFs**|Hyunsu Kim et.al.|[2302.06608v1](http://arxiv.org/abs/2302.06608v1)|**[link](https://github.com/naver-ai/BlendNeRF)**|\n", "2302.05573": "|**2023-02-11**|**3D Colored Shape Reconstruction from a Single RGB Image through Diffusion**|Bo Li et.al.|[2302.05573v1](http://arxiv.org/abs/2302.05573v1)|null|\n", "2302.04264": "|**2023-02-08**|**Nerfstudio: A Modular Framework for Neural Radiance Field Development**|Matthew Tancik et.al.|[2302.04264v1](http://arxiv.org/abs/2302.04264v1)|null|\n", "2302.02088": "|**2023-02-07**|**AV-NeRF: Learning Neural Fields for Real-World Audio-Visual Scene Synthesis**|Susan Liang et.al.|[2302.02088v2](http://arxiv.org/abs/2302.02088v2)|null|\n", "2302.01579": "|**2023-02-03**|**Semantic 3D-aware Portrait Synthesis and Manipulation Based on Compositional Neural Radiance Field**|Tianxiang Ma et.al.|[2302.01579v1](http://arxiv.org/abs/2302.01579v1)|**[link](https://github.com/tianxiangma/cnerf)**|\n", "2302.01571": "|**2023-02-03**|**Robust Camera Pose Refinement for Multi-Resolution Hash Encoding**|Hwan Heo et.al.|[2302.01571v1](http://arxiv.org/abs/2302.01571v1)|null|\n", "2302.01532": "|**2023-02-03**|**INV: Towards Streaming Incremental Neural Videos**|Shengze Wang et.al.|[2302.01532v1](http://arxiv.org/abs/2302.01532v1)|null|\n", "2302.01226": "|**2023-02-02**|**Factor Fields: A Unified Framework for Neural Fields and Beyond**|Anpei Chen et.al.|[2302.01226v1](http://arxiv.org/abs/2302.01226v1)|null|\n", "2302.00833": "|**2023-02-02**|**RobustNeRF: Ignoring Distractors with Robust Losses**|Sara Sabour et.al.|[2302.00833v1](http://arxiv.org/abs/2302.00833v1)|null|\n", "2301.13430": "|**2023-01-31**|**GeneFace: Generalized and High-Fidelity Audio-Driven 3D Talking Face Synthesis**|Zhenhui Ye et.al.|[2301.13430v1](http://arxiv.org/abs/2301.13430v1)|null|\n", "2301.12780": "|**2023-01-30**|**Equivariant Architectures for Learning in Deep Weight Spaces**|Aviv Navon et.al.|[2301.12780v1](http://arxiv.org/abs/2301.12780v1)|**[link](https://github.com/AvivNavon/DWSNets)**|\n", "2301.11631": "|**2023-01-27**|**HyperNeRFGAN: Hypernetwork approach to 3D NeRF GAN**|Adam Kania et.al.|[2301.11631v1](http://arxiv.org/abs/2301.11631v1)|**[link](https://github.com/gmum/hypernerfgan)**|\n", "2301.11522": "|**2023-01-27**|**A Comparison of Tiny-nerf versus Spatial Representations for 3d Reconstruction**|Saulo Abraham Gante et.al.|[2301.11522v1](http://arxiv.org/abs/2301.11522v1)|null|\n", "2301.11520": "|**2023-01-27**|**SNeRL: Semantic-aware Neural Radiance Fields for Reinforcement Learning**|Dongseok Shim et.al.|[2301.11520v1](http://arxiv.org/abs/2301.11520v1)|null|\n", "2301.11280": "|**2023-01-26**|**Text-To-4D Dynamic Scene Generation**|Uriel Singer et.al.|[2301.11280v1](http://arxiv.org/abs/2301.11280v1)|null|\n", "2301.10941": "|**2023-01-26**|**GeCoNeRF: Few-shot Neural Radiance Fields via Geometric Consistency**|Minseop Kwak et.al.|[2301.10941v1](http://arxiv.org/abs/2301.10941v1)|**[link](https://github.com/KU-CVLAB/GeCoNeRF)**|\n", "2301.09632": "|**2023-01-23**|**HexPlane: A Fast Representation for Dynamic Scenes**|Ang Cao et.al.|[2301.09632v1](http://arxiv.org/abs/2301.09632v1)|**[link](https://github.com/Caoang327/HexPlane)**|\n", "2301.09060": "|**2023-02-02**|**3D Reconstruction of Non-cooperative Resident Space Objects using Instant NGP-accelerated NeRF and D-NeRF**|Trupti Mahendrakar et.al.|[2301.09060v2](http://arxiv.org/abs/2301.09060v2)|null|\n", "2301.07958": "|**2023-02-05**|**RecolorNeRF: Layer Decomposed Radiance Fields for Efficient Color Editing of 3D Scenes**|Bingchen Gong et.al.|[2301.07958v2](http://arxiv.org/abs/2301.07958v2)|null|\n", "2301.08556": "|**2023-01-18**|**NeRF in the Palm of Your Hand: Corrective Augmentation for Robotics via Novel-View Synthesis**|Allan Zhou et.al.|[2301.08556v1](http://arxiv.org/abs/2301.08556v1)|null|\n", "2301.07668": "|**2023-01-18**|**Behind the Scenes: Density Fields for Single View Reconstruction**|Felix Wimbauer et.al.|[2301.07668v1](http://arxiv.org/abs/2301.07668v1)|**[link](https://github.com/Brummi/BehindTheScenes)**|\n", "2301.06782": "|**2023-01-17**|**A Large-Scale Outdoor Multi-modal Dataset and Benchmark for Novel View Synthesis and Implicit Scene Reconstruction**|Chongshan Lu et.al.|[2301.06782v1](http://arxiv.org/abs/2301.06782v1)|null|\n", "2301.05747": "|**2023-01-13**|**Laser: Latent Set Representations for 3D Generative Modeling**|Pol Moreno et.al.|[2301.05747v1](http://arxiv.org/abs/2301.05747v1)|null|\n", "2301.04075": "|**2023-01-10**|**Benchmarking Robustness in Neural Radiance Fields**|Chen Wang et.al.|[2301.04075v1](http://arxiv.org/abs/2301.04075v1)|null|\n", "2301.03102": "|**2023-01-08**|**Towards Open World NeRF-Based SLAM**|Daniil Lisus et.al.|[2301.03102v1](http://arxiv.org/abs/2301.03102v1)|null|\n", "2301.02975": "|**2023-01-10**|**Traditional Readability Formulas Compared for English**|Bruce W. Lee et.al.|[2301.02975v2](http://arxiv.org/abs/2301.02975v2)|null|\n", "2301.00950": "|**2023-01-09**|**Class-Continuous Conditional Generative Neural Radiance Field**|Jiwook Kim et.al.|[2301.00950v2](http://arxiv.org/abs/2301.00950v2)|**[link](https://github.com/tom919654/C3G-NeRF)**|\n", "2301.00411": "|**2023-01-11**|**Detachable Novel Views Synthesis of Dynamic Scenes Using Distribution-Driven Neural Radiance Fields**|Boyu Zhang et.al.|[2301.00411v2](http://arxiv.org/abs/2301.00411v2)|**[link](https://github.com/luciferbobo/d4nerf)**|\n", "2212.13056": "|**2022-12-26**|**MonoNeRF: Learning a Generalizable Dynamic Radiance Field from Monocular Videos**|Fengrui Tian et.al.|[2212.13056v1](http://arxiv.org/abs/2212.13056v1)|**[link](https://github.com/tianfr/mononerf)**|\n", "2212.12871": "|**2022-12-25**|**PaletteNeRF: Palette-based Color Editing for NeRFs**|Qiling Wu et.al.|[2212.12871v1](http://arxiv.org/abs/2212.12871v1)|null|\n", "2212.11966": "|**2022-12-22**|**Removing Objects From Neural Radiance Fields**|Silvan Weder et.al.|[2212.11966v1](http://arxiv.org/abs/2212.11966v1)|null|\n", "2212.10950": "|**2022-12-21**|**Incremental Learning for Neural Radiance Field with Uncertainty-Filtered Knowledge Distillation**|Mengqi Guo et.al.|[2212.10950v1](http://arxiv.org/abs/2212.10950v1)|null|\n", "2212.10699": "|**2023-01-24**|**PaletteNeRF: Palette-based Appearance Editing of Neural Radiance Fields**|Zhengfei Kuang et.al.|[2212.10699v2](http://arxiv.org/abs/2212.10699v2)|null|\n", "2212.09735": "|**2022-12-20**|**Correspondence Distillation from NeRF-based GAN**|Yushi Lan et.al.|[2212.09735v2](http://arxiv.org/abs/2212.09735v2)|null|\n", "2212.09330": "|**2022-12-19**|**StyleTRF: Stylizing Tensorial Radiance Fields**|Rahul Goel et.al.|[2212.09330v1](http://arxiv.org/abs/2212.09330v1)|null|\n", "2212.09100": "|**2022-12-18**|**SPARF: Large-Scale Learning of 3D Sparse Radiance Fields from Few Input Images**|Abdullah Hamdi et.al.|[2212.09100v1](http://arxiv.org/abs/2212.09100v1)|**[link](https://github.com/ajhamdi/sparf_pytorch)**|\n", "2212.09069": "|**2022-12-18**|**Masked Wavelet Representation for Compact Neural Radiance Fields**|Daniel Rho et.al.|[2212.09069v1](http://arxiv.org/abs/2212.09069v1)|**[link](https://github.com/daniel03c1/masked_wavelet_nerf)**|\n", "2212.08328": "|**2022-12-31**|**MEIL-NeRF: Memory-Efficient Incremental Learning of Neural Radiance Fields**|Jaeyoung Chung et.al.|[2212.08328v2](http://arxiv.org/abs/2212.08328v2)|null|\n", "2212.08070": "|**2022-12-15**|**NeRF-Art: Text-Driven Neural Radiance Fields Stylization**|Can Wang et.al.|[2212.08070v1](http://arxiv.org/abs/2212.08070v1)|**[link](https://github.com/cassiePython/NeRF-Art)**|\n", "2212.08057": "|**2022-12-15**|**Real-Time Neural Light Field on Mobile Devices**|Junli Cao et.al.|[2212.08057v1](http://arxiv.org/abs/2212.08057v1)|**[link](https://github.com/snap-research/mobiler2l)**|\n", "2212.08476": "|**2022-12-15**|**SteerNeRF: Accelerating NeRF Rendering via Smooth Viewpoint Trajectory**|Sicheng Li et.al.|[2212.08476v1](http://arxiv.org/abs/2212.08476v1)|null|\n", "2212.07388": "|**2022-12-14**|**NoPe-NeRF: Optimising Neural Radiance Field with No Pose Prior**|Wenjing Bian et.al.|[2212.07388v1](http://arxiv.org/abs/2212.07388v1)|**[link](https://github.com/ActiveVisionLab/nope-nerf)**|\n", "2212.04701": "|**2022-12-09**|**4K-NeRF: High Fidelity Neural Radiance Fields at Ultra High Resolutions**|Zhongshu Wang et.al.|[2212.04701v1](http://arxiv.org/abs/2212.04701v1)|**[link](https://github.com/frozoul/4k-nerf)**|\n", "2212.04823": "|**2022-12-08**|**GazeNeRF: 3D-Aware Gaze Redirection with Neural Radiance Fields**|Alessandro Ruzzi et.al.|[2212.04823v1](http://arxiv.org/abs/2212.04823v1)|**[link](https://github.com/alessandroruzzi/gazenerf)**|\n", "2302.13543": "|**2023-02-27**|**BaLi-RF: Bandlimited Radiance Fields for Dynamic Scene Modeling**|Sameera Ramasinghe et.al.|[2302.13543v1](http://arxiv.org/abs/2302.13543v1)|null|\n", "2302.13397": "|**2023-02-26**|**Efficient physics-informed neural networks using hash encoding**|Xinquan Huang et.al.|[2302.13397v1](http://arxiv.org/abs/2302.13397v1)|null|\n", "2302.12931": "|**2023-02-24**|**CATNIPS: Collision Avoidance Through Neural Implicit Probabilistic Scenes**|Timothy Chen et.al.|[2302.12931v1](http://arxiv.org/abs/2302.12931v1)|null|\n", "2302.14683": "|**2023-03-09**|**IntrinsicNGP: Intrinsic Coordinate based Hash Encoding for Human NeRF**|Bo Peng et.al.|[2302.14683v2](http://arxiv.org/abs/2302.14683v2)|null|\n", "2303.00749": "|**2023-03-01**|**S-NeRF: Neural Radiance Fields for Street Views**|Ziyang Xie et.al.|[2303.00749v1](http://arxiv.org/abs/2303.00749v1)|null|\n", "2303.02091": "|**2023-03-03**|**Delicate Textured Mesh Recovery from NeRF via Adaptive Surface Refinement**|Jiaxiang Tang et.al.|[2303.02091v1](http://arxiv.org/abs/2303.02091v1)|**[link](https://github.com/ashawkey/nerf2mesh)**|\n", "2303.01736": "|**2023-03-03**|**Multi-Plane Neural Radiance Fields for Novel View Synthesis**|Youssef Abdelkareem et.al.|[2303.01736v1](http://arxiv.org/abs/2303.01736v1)|null|\n", "2303.03361": "|**2023-03-10**|**Nerflets: Local Radiance Fields for Efficient Structure-Aware 3D Scene Representation from 2D Supervision**|Xiaoshuai Zhang et.al.|[2303.03361v2](http://arxiv.org/abs/2303.03361v2)|null|\n", "2303.03003": "|**2023-03-07**|**Efficient Large-scale Scene Representation with a Hybrid of High-resolution Grid and Plane Features**|Yuqi Zhang et.al.|[2303.03003v2](http://arxiv.org/abs/2303.03003v2)|**[link](https://github.com/zyqz97/gp-nerf)**|\n", "2303.04086": "|**2023-03-07**|**NEPHELE: A Neural Platform for Highly Realistic Cloud Radiance Rendering**|Haimin Luo et.al.|[2303.04086v1](http://arxiv.org/abs/2303.04086v1)|null|\n", "2303.03808": "|**2023-03-07**|**Multiscale Tensor Decomposition and Rendering Equation Encoding for View Synthesis**|Kang Han et.al.|[2303.03808v1](http://arxiv.org/abs/2303.03808v1)|**[link](https://github.com/imkanghan/nrff)**|\n", "2303.03966": "|**2023-03-05**|**Semantic-aware Occlusion Filtering Neural Radiance Fields in the Wild**|Jaewon Lee et.al.|[2303.03966v1](http://arxiv.org/abs/2303.03966v1)|null|\n", "2303.04508": "|**2023-03-08**|**FastSurf: Fast Neural RGB-D Surface Reconstruction using Per-Frame Intrinsic Refinement and TSDF Fusion Prior Learning**|Seunghwan Lee et.al.|[2303.04508v1](http://arxiv.org/abs/2303.04508v1)|**[link](https://github.com/ROKIT-Healthcare/FastSurf)**|\n", "2303.04322": "|**2023-03-08**|**DroNeRF: Real-time Multi-agent Drone Pose Optimization for Computing Neural Radiance Fields**|Dipam Patel et.al.|[2303.04322v1](http://arxiv.org/abs/2303.04322v1)|null|\n", "2303.05512": "|**2023-03-09**|**PAC-NeRF: Physics Augmented Continuum Neural Radiance Fields for Geometry-Agnostic System Identification**|Xuan Li et.al.|[2303.05512v1](http://arxiv.org/abs/2303.05512v1)|null|\n", "2303.05835": "|**2023-03-10**|**You Only Train Once: Multi-Identity Free-Viewpoint Neural Human Rendering from Monocular Videos**|Jaehyeok Kim et.al.|[2303.05835v1](http://arxiv.org/abs/2303.05835v1)|null|\n", "2303.05807": "|**2023-03-10**|**Aleth-NeRF: Low-light Condition View Synthesis with Concealing Fields**|Ziteng Cui et.al.|[2303.05807v1](http://arxiv.org/abs/2303.05807v1)|null|\n", "2303.05775": "|**2023-03-10**|**Self-NeRF: A Self-Training Pipeline for Few-Shot Neural Radiance Fields**|Jiayang Bai et.al.|[2303.05775v1](http://arxiv.org/abs/2303.05775v1)|null|\n", "2303.05735": "|**2023-03-14**|**Hardware Acceleration of Neural Graphics**|Muhammad Husnain Mubarik et.al.|[2303.05735v2](http://arxiv.org/abs/2303.05735v2)|null|\n", "2303.05703": "|**2023-03-10**|**MovingParts: Motion-based 3D Part Discovery in Dynamic Radiance Field**|Kaizhi Yang et.al.|[2303.05703v1](http://arxiv.org/abs/2303.05703v1)|null|\n", "2303.06919": "|**2023-03-13**|**NeRFLiX: High-Quality Neural View Synthesis by Learning a Degradation-Driven Inter-viewpoint MiXer**|Kun Zhou et.al.|[2303.06919v1](http://arxiv.org/abs/2303.06919v1)|**[link](https://github.com/redrock303/NeRFLiX_CPVR2023)**|\n", "2303.06335": "|**2023-03-11**|**Just Flip: Flipped Observation Generation and Optimization for Neural Radiance Fields to Cover Unobserved View**|Minjae Lee et.al.|[2303.06335v1](http://arxiv.org/abs/2303.06335v1)|**[link](https://github.com/minjae-lulu/just-flip)**|\n", "2303.06226": "|**2023-03-10**|**NeRFlame: FLAME-based conditioning of NeRF for 3D face rendering**|Wojciech Zaj\u0105c et.al.|[2303.06226v1](http://arxiv.org/abs/2303.06226v1)|**[link](https://github.com/wojtekz4/nerflame)**|\n", "2303.08096": "|**2023-03-14**|**MELON: NeRF with Unposed Images Using Equivalence Class Estimation**|Axel Levy et.al.|[2303.08096v1](http://arxiv.org/abs/2303.08096v1)|null|\n", "2303.07937": "|**2023-03-16**|**Let 2D Diffusion Model Know 3D-Consistency for Robust Text-to-3D Generation**|Junyoung Seo et.al.|[2303.07937v3](http://arxiv.org/abs/2303.07937v3)|**[link](https://github.com/KU-CVLAB/3DFuse)**|\n", "2303.07653": "|**2023-03-16**|**NEF: Neural Edge Fields for 3D Parametric Curve Reconstruction from Multi-view Images**|Yunfan Ye et.al.|[2303.07653v2](http://arxiv.org/abs/2303.07653v2)|**[link](https://github.com/yunfan1202/NEF_code)**|\n", "2303.07596": "|**2023-03-18**|**Frequency-Modulated Point Cloud Rendering with Easy Editing**|Yi Zhang et.al.|[2303.07596v2](http://arxiv.org/abs/2303.07596v2)|**[link](https://github.com/yizhangphd/freqpcr)**|\n", "2303.07418": "|**2023-03-13**|**FreeNeRF: Improving Few-shot Neural Rendering with Free Frequency Regularization**|Jiawei Yang et.al.|[2303.07418v1](http://arxiv.org/abs/2303.07418v1)|**[link](https://github.com/jiawei-yang/freenerf)**|\n", "2303.08808": "|**2023-03-15**|**Mesh Strikes Back: Fast and Efficient Human Reconstruction from RGB videos**|Rohit Jena et.al.|[2303.08808v1](http://arxiv.org/abs/2303.08808v1)|null|\n", "2303.08717": "|**2023-03-15**|**Re-ReND: Real-time Rendering of NeRFs across Devices**|Sara Rojas et.al.|[2303.08717v1](http://arxiv.org/abs/2303.08717v1)|**[link](https://github.com/sararoma95/Re-ReND)**|\n", "2303.08695": "|**2023-03-15**|**RefiNeRF: Modelling dynamic neural radiance fields with inconsistent or missing camera parameters**|Shuja Khalid et.al.|[2303.08695v1](http://arxiv.org/abs/2303.08695v1)|null|\n", "2303.08370": "|**2023-03-15**|**Harnessing Low-Frequency Neural Fields for Few-Shot View Synthesis**|Liangchen Song et.al.|[2303.08370v1](http://arxiv.org/abs/2303.08370v1)|**[link](https://github.com/lsongx/halo)**|\n", "2303.09554": "|**2023-03-21**|**PartNeRF: Generating Part-Aware Editable 3D Shapes without 3D Supervision**|Konstantinos Tertikas et.al.|[2303.09554v3](http://arxiv.org/abs/2303.09554v3)|null|\n", "2303.09553": "|**2023-03-16**|**LERF: Language Embedded Radiance Fields**|Justin Kerr et.al.|[2303.09553v1](http://arxiv.org/abs/2303.09553v1)|null|\n", "2303.09431": "|**2023-03-16**|**NeRFMeshing: Distilling Neural Radiance Fields into Geometrically-Accurate 3D Meshes**|Marie-Julie Rakotosaona et.al.|[2303.09431v1](http://arxiv.org/abs/2303.09431v1)|null|\n", "2303.09412": "|**2023-03-17**|**NeRFtrinsic Four: An End-To-End Trainable NeRF Jointly Optimizing Diverse Intrinsic and Extrinsic Camera Parameters**|Hannah Schieber et.al.|[2303.09412v2](http://arxiv.org/abs/2303.09412v2)|**[link](https://github.com/hannahhaensen/nerftrinsic_four)**|\n", "2303.09153": "|**2023-03-16**|**Reliable Image Dehazing by NeRF**|Zheyan Jin et.al.|[2303.09153v1](http://arxiv.org/abs/2303.09153v1)|null|\n", "2303.10083": "|**2023-03-17**|**$\u03b1$Surf: Implicit Surface Reconstruction for Semi-Transparent and Thin Objects with Decoupled Geometry and Opacity**|Tianhao Wu et.al.|[2303.10083v1](http://arxiv.org/abs/2303.10083v1)|null|\n", "2303.09952": "|**2023-03-17**|**Single-view Neural Radiance Fields with Depth Teacher**|Yurui Chen et.al.|[2303.09952v1](http://arxiv.org/abs/2303.09952v1)|null|\n", "2303.11052": "|**2023-03-20**|**ContraNeRF: Generalizable Neural Radiance Fields for Synthetic-to-real Novel View Synthesis via Contrastive Learning**|Hao Yang et.al.|[2303.11052v1](http://arxiv.org/abs/2303.11052v1)|null|\n", "2303.10735": "|**2023-03-19**|**SKED: Sketch-guided Text-based 3D Editing**|Aryan Mikaeili et.al.|[2303.10735v1](http://arxiv.org/abs/2303.10735v1)|null|\n", "2303.10709": "|**2023-03-19**|**NeRF-LOAM: Neural Implicit Representation for Large-Scale Incremental LiDAR Odometry and Mapping**|Junyuan Deng et.al.|[2303.10709v1](http://arxiv.org/abs/2303.10709v1)|**[link](https://github.com/junyuandeng/nerf-loam)**|\n", "2303.10340": "|**2023-03-18**|**3D Data Augmentation for Driving Scenes on Camera**|Wenwen Tong et.al.|[2303.10340v1](http://arxiv.org/abs/2303.10340v1)|null|\n", "2303.11938": "|**2023-03-21**|**3D-CLFusion: Fast Text-to-3D Rendering with Contrastive Latent Diffusion**|Yu-Jhe Li et.al.|[2303.11938v1](http://arxiv.org/abs/2303.11938v1)|null|\n", "2303.11728": "|**2023-03-22**|**ExtremeNeRF: Few-shot Neural Radiance Fields Under Unconstrained Illumination**|SeokYeong Lee et.al.|[2303.11728v2](http://arxiv.org/abs/2303.11728v2)|null|\n", "2303.11364": "|**2023-03-20**|**DehazeNeRF: Multiple Image Haze Removal and 3D Shape Reconstruction using Neural Radiance Fields**|Wei-Ting Chen et.al.|[2303.11364v1](http://arxiv.org/abs/2303.11364v1)|null|\n", "2303.12791": "|**2023-03-22**|**SHERF: Generalizable Human NeRF from a Single Image**|Shoukang Hu et.al.|[2303.12791v1](http://arxiv.org/abs/2303.12791v1)|**[link](https://github.com/skhu101/sherf)**|\n", "2303.12789": "|**2023-03-22**|**Instruct-NeRF2NeRF: Editing 3D Scenes with Instructions**|Ayaan Haque et.al.|[2303.12789v1](http://arxiv.org/abs/2303.12789v1)|null|\n", "2303.12786": "|**2023-03-22**|**FeatureNeRF: Learning Generalizable NeRFs by Distilling Foundation Models**|Jianglong Ye et.al.|[2303.12786v1](http://arxiv.org/abs/2303.12786v1)|null|\n", "2303.12408": "|**2023-03-24**|**Balanced Spherical Grid for Egocentric View Synthesis**|Changwoon Choi et.al.|[2303.12408v2](http://arxiv.org/abs/2303.12408v2)|**[link](https://github.com/changwoonchoi/EgoNeRF)**|\n", "2303.12234": "|**2023-03-21**|**Pre-NeRF 360: Enriching Unbounded Appearances for Neural Radiance Fields**|Ahmad AlMughrabi et.al.|[2303.12234v1](http://arxiv.org/abs/2303.12234v1)|**[link](https://github.com/amughrabi/pre-nerf)**|\n", "2303.13497": "|**2023-03-23**|**TriPlaneNet: An Encoder for EG3D Inversion**|Ananta R. Bhattarai et.al.|[2303.13497v1](http://arxiv.org/abs/2303.13497v1)|null|\n", "2303.13472": "|**2023-03-23**|**Plotting Behind the Scenes: Towards Learnable Game Engines**|Willi Menapace et.al.|[2303.13472v1](http://arxiv.org/abs/2303.13472v1)|null|\n", "2303.13450": "|**2023-03-23**|**Set-the-Scene: Global-Local Training for Generating Controllable NeRF Scenes**|Dana Cohen-Bar et.al.|[2303.13450v1](http://arxiv.org/abs/2303.13450v1)|**[link](https://github.com/DanaCohen95/Set-the-Scene)**|\n", "2303.13277": "|**2023-03-25**|**SINE: Semantic-driven Image-based NeRF Editing with Prior-guided Editing Field**|Chong Bao et.al.|[2303.13277v2](http://arxiv.org/abs/2303.13277v2)|null|\n", "2303.13232": "|**2023-03-23**|**Transforming Radiance Field with Lipschitz Network for Photorealistic 3D Scene Stylization**|Zicheng Zhang et.al.|[2303.13232v1](http://arxiv.org/abs/2303.13232v1)|null|\n", "2303.13014": "|**2023-03-23**|**Semantic Ray: Learning a Generalizable Semantic Field with Cross-Reprojection Attention**|Fangfu Liu et.al.|[2303.13014v1](http://arxiv.org/abs/2303.13014v1)|**[link](https://github.com/liuff19/Semantic-Ray)**|\n", "2303.12865": "|**2023-03-22**|**NeRF-GAN Distillation for Efficient 3D-Aware Generation with Convolutions**|Mohamad Shahbazi et.al.|[2303.12865v1](http://arxiv.org/abs/2303.12865v1)|**[link](https://github.com/mshahbazi72/nerf-gan-distillation)**|\n", "2303.14001": "|**2023-03-24**|**Grid-guided Neural Radiance Fields for Large Urban Scenes**|Linning Xu et.al.|[2303.14001v1](http://arxiv.org/abs/2303.14001v1)|null|\n", "2303.13843": "|**2023-03-24**|**CompoNeRF: Text-guided Multi-object Compositional NeRF with Editable 3D Scene Layout**|Yiqi Lin et.al.|[2303.13843v1](http://arxiv.org/abs/2303.13843v1)|null|\n", "2303.13825": "|**2023-03-24**|**HandNeRF: Neural Radiance Fields for Animatable Interacting Hands**|Zhiyang Guo et.al.|[2303.13825v1](http://arxiv.org/abs/2303.13825v1)|null|\n", "2303.13817": "|**2023-03-24**|**ABLE-NeRF: Attention-Based Rendering with Learnable Embeddings for Neural Radiance Field**|Zhe Jun Tang et.al.|[2303.13817v1](http://arxiv.org/abs/2303.13817v1)|**[link](https://github.com/tangzj/able-nerf)**|\n", "2303.13777": "|**2023-03-24**|**GM-NeRF: Learning Generalizable Model-based Neural Radiance Fields from Multi-view Images**|Jianchuan Chen et.al.|[2303.13777v1](http://arxiv.org/abs/2303.13777v1)|null|\n", "2303.13743": "|**2023-03-24**|**TEGLO: High Fidelity Canonical Texture Mapping from Single-View Images**|Vishal Vinod et.al.|[2303.13743v1](http://arxiv.org/abs/2303.13743v1)|null|\n", "2303.13582": "|**2023-03-23**|**SCADE: NeRFs from Space Carving with Ambiguity-Aware Depth Estimates**|Mikaela Angelina Uy et.al.|[2303.13582v1](http://arxiv.org/abs/2303.13582v1)|null|\n", "2303.15427": "|**2023-03-27**|**JAWS: Just A Wild Shot for Cinematic Transfer in Neural Radiance Fields**|Xi Wang et.al.|[2303.15427v1](http://arxiv.org/abs/2303.15427v1)|**[link](https://github.com/robincourant/jaws)**|\n", "2303.15387": "|**2023-03-27**|**Generalizable Neural Voxels for Fast Human Radiance Fields**|Taoran Yi et.al.|[2303.15387v1](http://arxiv.org/abs/2303.15387v1)|null|\n", "2303.15368": "|**2023-03-27**|**NeUDF: Learning Unsigned Distance Fields from Multi-view Images for Reconstructing Non-watertight Models**|Fei Hou et.al.|[2303.15368v1](http://arxiv.org/abs/2303.15368v1)|null|\n", "2303.15012": "|**2023-03-27**|**3D-Aware Multi-Class Image-to-Image Translation with NeRFs**|Senmao Li et.al.|[2303.15012v1](http://arxiv.org/abs/2303.15012v1)|**[link](https://github.com/sen-mao/3di2i-translation)**|\n", "2303.14707": "|**2023-03-26**|**Clean-NeRF: Reformulating NeRF to account for View-Dependent Observations**|Xinhang Liu et.al.|[2303.14707v1](http://arxiv.org/abs/2303.14707v1)|null|\n", "2303.14536": "|**2023-03-25**|**SUDS: Scalable Urban Dynamic Scenes**|Haithem Turki et.al.|[2303.14536v1](http://arxiv.org/abs/2303.14536v1)|null|\n", "2303.14478": "|**2023-03-25**|**DBARF: Deep Bundle-Adjusting Generalizable Neural Radiance Fields**|Yu Chen et.al.|[2303.14478v1](http://arxiv.org/abs/2303.14478v1)|null|\n", "2303.14435": "|**2023-03-25**|**NeRF-DS: Neural Radiance Fields for Dynamic Specular Objects**|Zhiwen Yan et.al.|[2303.14435v1](http://arxiv.org/abs/2303.14435v1)|**[link](https://github.com/jokeryan/nerf-ds)**|\n", "2303.15206": "|**2023-03-24**|**Perceptual Quality Assessment of NeRF and Neural View Synthesis Methods for Front-Facing Views**|Hanxue Liang et.al.|[2303.15206v1](http://arxiv.org/abs/2303.15206v1)|null|\n", "2303.16196": "|**2023-03-28**|**SparseNeRF: Distilling Depth Ranking for Few-shot Novel View Synthesis**|Guangcong Wang et.al.|[2303.16196v1](http://arxiv.org/abs/2303.16196v1)|null|\n", "2303.16184": "|**2023-03-28**|**VMesh: Hybrid Volume-Mesh Representation for Efficient View Synthesis**|Yuan-Chen Guo et.al.|[2303.16184v1](http://arxiv.org/abs/2303.16184v1)|null|\n", "2303.16001": "|**2023-03-30**|**Adaptive Voronoi NeRFs**|Tim Elsner et.al.|[2303.16001v2](http://arxiv.org/abs/2303.16001v2)|null|\n", "2303.15951": "|**2023-03-28**|**F$^{2}$-NeRF: Fast Neural Radiance Field Training with Free Camera Trajectories**|Peng Wang et.al.|[2303.15951v1](http://arxiv.org/abs/2303.15951v1)|**[link](https://github.com/Totoro97/f2-nerf)**|\n", "2303.16485": "|**2023-03-29**|**TriVol: Point Cloud Rendering via Triple Volumes**|Tao Hu et.al.|[2303.16485v1](http://arxiv.org/abs/2303.16485v1)|**[link](https://github.com/dvlab-research/trivol)**|\n", "2303.16482": "|**2023-03-29**|**Point2Pix: Photo-Realistic Point Cloud Rendering via Neural Radiance Fields**|Tao Hu et.al.|[2303.16482v1](http://arxiv.org/abs/2303.16482v1)|null|\n", "2303.16333": "|**2023-03-28**|**Flow supervision for Deformable NeRF**|Chaoyang Wang et.al.|[2303.16333v1](http://arxiv.org/abs/2303.16333v1)|null|\n", "2303.17603": "|**2023-03-30**|**NeRF-Supervised Deep Stereo**|Fabio Tosi et.al.|[2303.17603v1](http://arxiv.org/abs/2303.17603v1)|**[link](https://github.com/fabiotosi92/nerf-supervised-deep-stereo)**|\n", "2303.17368": "|**2023-03-30**|**SynBody: Synthetic Dataset with Layered Human Models for 3D Human Perception and Modeling**|Zhitao Yang et.al.|[2303.17368v1](http://arxiv.org/abs/2303.17368v1)|**[link](https://github.com/openxrlab/xrfeitoria)**|\n", "2303.17147": "|**2023-03-30**|**NeILF++: Inter-Reflectable Light Fields for Geometry and Material Estimation**|Jingyang Zhang et.al.|[2303.17147v1](http://arxiv.org/abs/2303.17147v1)|null|\n", "2303.17094": "|**2023-03-30**|**Enhanced Stable View Synthesis**|Nishant Jain et.al.|[2303.17094v1](http://arxiv.org/abs/2303.17094v1)|null|\n", "2303.17968": "|**2023-03-31**|**VDN-NeRF: Resolving Shape-Radiance Ambiguity via View-Dependence Normalization**|Bingfan Zhu et.al.|[2303.17968v1](http://arxiv.org/abs/2303.17968v1)|**[link](https://github.com/boifz/vdn-nerf)**|\n", "2304.00916": "|**2023-04-06**|**DreamAvatar: Text-and-Shape Guided 3D Human Avatar Generation via Diffusion Models**|Yukang Cao et.al.|[2304.00916v2](http://arxiv.org/abs/2304.00916v2)|null|\n", "2304.00341": "|**2023-04-01**|**JacobiNeRF: NeRF Shaping with Mutual Information Gradients**|Xiaomeng Xu et.al.|[2304.00341v1](http://arxiv.org/abs/2304.00341v1)|**[link](https://github.com/xxm19/jacobinerf)**|\n", "2304.02001": "|**2023-04-04**|**MonoHuman: Animatable Human Neural Field from Monocular Video**|Zhengming Yu et.al.|[2304.02001v1](http://arxiv.org/abs/2304.02001v1)|null|\n", "2304.02061": "|**2023-04-11**|**Generating Continual Human Motion in Diverse 3D Scenes**|Aymen Mir et.al.|[2304.02061v2](http://arxiv.org/abs/2304.02061v2)|null|\n", "2304.03280": "|**2023-04-06**|**LANe: Lighting-Aware Neural Fields for Compositional Scene Synthesis**|Akshay Krishnan et.al.|[2304.03280v1](http://arxiv.org/abs/2304.03280v1)|null|\n", "2304.03266": "|**2023-04-06**|**Neural Fields meet Explicit Geometric Representation for Inverse Rendering of Urban Scenes**|Zian Wang et.al.|[2304.03266v1](http://arxiv.org/abs/2304.03266v1)|null|\n", "2304.02827": "|**2023-04-06**|**DITTO-NeRF: Diffusion-based Iterative Text To Omni-directional 3D Model**|Hoigi Seo et.al.|[2304.02827v1](http://arxiv.org/abs/2304.02827v1)|null|\n", "2304.02736": "|**2023-04-05**|**Image Stabilization for Hololens Camera in Remote Collaboration**|Gowtham Senthil et.al.|[2304.02736v1](http://arxiv.org/abs/2304.02736v1)|null|\n", "2304.03526": "|**2023-04-07**|**Lift3D: Synthesize 3D Training Data by Lifting 2D GAN to 3D Generative Radiance Field**|Leheng Li et.al.|[2304.03526v1](http://arxiv.org/abs/2304.03526v1)|null|\n", "2304.03384": "|**2023-04-06**|**Beyond NeRF Underwater: Learning Neural Reflectance Fields for True Color Correction of Marine Imagery**|Tianyi Zhang et.al.|[2304.03384v1](http://arxiv.org/abs/2304.03384v1)|**[link](https://github.com/tyz1030/neuralsea)**|\n", "2304.04452": "|**2023-04-10**|**Neural Residual Radiance Fields for Streamably Free-Viewpoint Videos**|Liao Wang et.al.|[2304.04452v1](http://arxiv.org/abs/2304.04452v1)|null|\n", "2304.04446": "|**2023-04-10**|**Inferring Fluid Dynamics via Inverse Rendering**|Jinxian Liu et.al.|[2304.04446v1](http://arxiv.org/abs/2304.04446v1)|null|\n", "2304.04395": "|**2023-04-10**|**Instance Neural Radiance Field**|Benran Hu et.al.|[2304.04395v1](http://arxiv.org/abs/2304.04395v1)|**[link](https://github.com/lyclyc52/instance_nerf)**|\n", "2304.04133": "|**2023-04-12**|**NeRF applied to satellite imagery for surface reconstruction**|Federico Semeraro et.al.|[2304.04133v3](http://arxiv.org/abs/2304.04133v3)|**[link](https://github.com/fsemerar/satnerf)**|\n", "2304.04012": "|**2023-04-08**|**PVD-AL: Progressive Volume Distillation with Active Learning for Efficient Conversion Between Different NeRF Architectures**|Shuangkang Fang et.al.|[2304.04012v1](http://arxiv.org/abs/2304.04012v1)|**[link](https://github.com/megvii-research/AAAI2023-PVD)**|\n", "2304.04559": "|**2023-04-07**|**Event-based Camera Tracker by $\\nabla$t NeRF**|Mana Masuda et.al.|[2304.04559v1](http://arxiv.org/abs/2304.04559v1)|null|\n", "2304.05218": "|**2023-04-11**|**Improving Neural Radiance Fields with Depth-aware Optimization for Novel View Synthesis**|Shu Chen et.al.|[2304.05218v1](http://arxiv.org/abs/2304.05218v1)|**[link](https://github.com/xtu-pr-lab/sfmnerf)**|\n", "2304.05097": "|**2023-04-11**|**One-Shot High-Fidelity Talking-Head Synthesis with Deformable Neural Radiance Field**|Weichuang Li et.al.|[2304.05097v1](http://arxiv.org/abs/2304.05097v1)|null|\n", "2304.04962": "|**2023-04-11**|**MRVM-NeRF: Mask-Based Pretraining for Neural Radiance Fields**|Ganlin Yang et.al.|[2304.04962v1](http://arxiv.org/abs/2304.04962v1)|null|\n", "2304.04897": "|**2023-04-10**|**Neural Image-based Avatars: Generalizable Radiance Fields for Human Avatar Modeling**|Youngjoong Kwon et.al.|[2304.04897v1](http://arxiv.org/abs/2304.04897v1)|null|\n", "2304.05620": "|**2023-04-12**|**NutritionVerse-Thin: An Optimized Strategy for Enabling Improved Rendering of 3D Thin Food Models**|Chi-en Amy Tai et.al.|[2304.05620v1](http://arxiv.org/abs/2304.05620v1)|null|\n", "2304.06714": "|**2023-04-17**|**Single-Stage Diffusion NeRF: A Unified Approach to 3D Generation and Reconstruction**|Hansheng Chen et.al.|[2304.06714v2](http://arxiv.org/abs/2304.06714v2)|**[link](https://github.com/Lakonik/SSDNeRF)**|\n", "2304.06706": "|**2023-04-13**|**Zip-NeRF: Anti-Aliased Grid-Based Neural Radiance Fields**|Jonathan T. Barron et.al.|[2304.06706v1](http://arxiv.org/abs/2304.06706v1)|null|\n", "2304.06287": "|**2023-04-13**|**NeRFVS: Neural Radiance Fields for Free View Synthesis via Geometry Scaffolds**|Chen Yang et.al.|[2304.06287v1](http://arxiv.org/abs/2304.06287v1)|null|\n", "2304.06969": "|**2023-04-14**|**UVA: Towards Unified Volumetric Avatar for View Synthesis, Pose rendering, Geometry and Texture Editing**|Jinlong Fan et.al.|[2304.06969v1](http://arxiv.org/abs/2304.06969v1)|null|\n", "2304.08279": "|**2023-04-17**|**MoDA: Modeling Deformable 3D Objects from Casual Videos**|Chaoyue Song et.al.|[2304.08279v1](http://arxiv.org/abs/2304.08279v1)|**[link](https://github.com/chaoyuesong/moda)**|\n", "2304.07979": "|**2023-04-17**|**NeRF-Loc: Visual Localization with Conditional Neural Radiance Field**|Jianlin Liu et.al.|[2304.07979v1](http://arxiv.org/abs/2304.07979v1)|**[link](https://github.com/jenningsl/nerf-loc)**|\n", "2304.07918": "|**2023-04-16**|**Likelihood-Based Generative Radiance Field with Latent Space Energy-Based Model for 3D-Aware Disentangled Image Representation**|Yaxuan Zhu et.al.|[2304.07918v1](http://arxiv.org/abs/2304.07918v1)|null|\n", "2304.07915": "|**2023-04-16**|**CAT-NeRF: Constancy-Aware Tx$^2$Former for Dynamic Body Modeling**|Haidong Zhu et.al.|[2304.07915v1](http://arxiv.org/abs/2304.07915v1)|**[link](https://github.com/haidongz-usc/CAT-NeRF)**|\n", "2304.07743": "|**2023-04-16**|**SeaThru-NeRF: Neural Radiance Fields in Scattering Media**|Deborah Levy et.al.|[2304.07743v1](http://arxiv.org/abs/2304.07743v1)|**[link](https://github.com/deborahLevy130/seathru_NeRF)**|\n", "2304.08971": "|**2023-04-18**|**SurfelNeRF: Neural Surfel Radiance Fields for Online Photorealistic Reconstruction of Indoor Scenes**|Yiming Gao et.al.|[2304.08971v1](http://arxiv.org/abs/2304.08971v1)|null|\n", "2304.08757": "|**2023-04-18**|**NeAI: A Pre-convoluted Representation for Plug-and-Play Neural Ambient Illumination**|Yiyu Zhuang et.al.|[2304.08757v1](http://arxiv.org/abs/2304.08757v1)|null|\n", "2304.09677": "|**2023-04-20**|**Reference-guided Controllable Inpainting of Neural Radiance Fields**|Ashkan Mirzaei et.al.|[2304.09677v2](http://arxiv.org/abs/2304.09677v2)|null|\n", "2304.10537": "|**2023-04-20**|**Learning Neural Duplex Radiance Fields for Real-Time View Synthesis**|Ziyu Wan et.al.|[2304.10537v1](http://arxiv.org/abs/2304.10537v1)|null|\n", "2304.10532": "|**2023-04-21**|**Nerfbusters: Removing Ghostly Artifacts from Casually Captured NeRFs**|Frederik Warburg et.al.|[2304.10532v2](http://arxiv.org/abs/2304.10532v2)|**[link](https://github.com/ethanweber/nerfbusters)**|\n", "2304.10448": "|**2023-04-20**|**ReLight My NeRF: A Dataset for Novel View Synthesis and Relighting of Real World Objects**|Marco Toschi et.al.|[2304.10448v1](http://arxiv.org/abs/2304.10448v1)|null|\n", "2304.10406": "|**2023-04-20**|**LiDAR-NeRF: Novel LiDAR View Synthesis via Neural Radiance Fields**|Tang Tao et.al.|[2304.10406v1](http://arxiv.org/abs/2304.10406v1)|**[link](https://github.com/tangtaogo/lidar-nerf)**|\n", "2304.10250": "|**2023-04-20**|**Revisiting Implicit Neural Representations in Low-Level Vision**|Wentian Xu et.al.|[2304.10250v1](http://arxiv.org/abs/2304.10250v1)|**[link](https://github.com/wentxul/linr)**|\n", "2304.10075": "|**2023-04-20**|**Multiscale Representation for Real-Time Anti-Aliasing Neural Rendering**|Dongting Hu et.al.|[2304.10075v1](http://arxiv.org/abs/2304.10075v1)|null|\n", "2304.10050": "|**2023-04-20**|**Neural Radiance Fields: Past, Present, and Future**|Ansh Mittal et.al.|[2304.10050v1](http://arxiv.org/abs/2304.10050v1)|null|\n", "2304.09987": "|**2023-04-19**|**Tetra-NeRF: Representing Neural Radiance Fields Using Tetrahedra**|Jonas Kulhanek et.al.|[2304.09987v1](http://arxiv.org/abs/2304.09987v1)|**[link](https://github.com/jkulhanek/tetra-nerf)**|\n", "2304.10780": "|**2023-04-21**|**Omni-Line-of-Sight Imaging for Holistic Shape Reconstruction**|Binbin Huang et.al.|[2304.10780v1](http://arxiv.org/abs/2304.10780v1)|null|\n", "2304.10664": "|**2023-04-20**|**A Comparative Neural Radiance Field (NeRF) 3D Analysis of Camera Poses from HoloLens Trajectories and Structure from Motion**|Miriam J\u00e4ger et.al.|[2304.10664v1](http://arxiv.org/abs/2304.10664v1)|null|\n", "2304.12308": "|**2023-04-26**|**Segment Anything in 3D with NeRFs**|Jiazhong Cen et.al.|[2304.12308v2](http://arxiv.org/abs/2304.12308v2)|null|\n", "2304.12294": "|**2023-04-24**|**Explicit Correspondence Matching for Generalizable Neural Radiance Fields**|Yuedong Chen et.al.|[2304.12294v1](http://arxiv.org/abs/2304.12294v1)|**[link](https://github.com/donydchen/matchnerf)**|\n", "2304.11842": "|**2023-04-25**|**Gen-NeRF: Efficient and Generalizable Neural Radiance Fields via Algorithm-Hardware Co-Design**|Yonggan Fu et.al.|[2304.11842v2](http://arxiv.org/abs/2304.11842v2)|null|\n", "2304.11470": "|**2023-04-22**|**3D-IntPhys: Towards More Generalized 3D-grounded Visual Intuitive Physics under Challenging Scenes**|Haotian Xue et.al.|[2304.11470v1](http://arxiv.org/abs/2304.11470v1)|null|\n", "2304.11448": "|**2023-04-22**|**Dehazing-NeRF: Neural Radiance Fields from Hazy Images**|Tian Li et.al.|[2304.11448v1](http://arxiv.org/abs/2304.11448v1)|null|\n", "2304.11342": "|**2023-04-22**|**NaviNeRF: NeRF-based 3D Representation Disentanglement by Latent Semantic Navigation**|Baao Xie et.al.|[2304.11342v1](http://arxiv.org/abs/2304.11342v1)|null|\n", "2304.11241": "|**2023-04-21**|**AutoNeRF: Training Implicit Scene Representations with Autonomous Agents**|Pierre Marza et.al.|[2304.11241v1](http://arxiv.org/abs/2304.11241v1)|null|\n", "2304.12746": "|**2023-04-25**|**Local Implicit Ray Function for Generalizable Radiance Field Representation**|Xin Huang et.al.|[2304.12746v1](http://arxiv.org/abs/2304.12746v1)|null|\n", "2304.12587": "|**2023-04-27**|**MF-NeRF: Memory Efficient NeRF with Mixed-Feature Hash Table**|Yongjae Lee et.al.|[2304.12587v3](http://arxiv.org/abs/2304.12587v3)|**[link](https://github.com/nfyfamr/mf-nerf)**|\n", "2304.12467": "|**2023-04-24**|**Instant-3D: Instant Neural Radiance Field Training Towards On-Device AR/VR 3D Reconstruction**|Sixu Li et.al.|[2304.12467v1](http://arxiv.org/abs/2304.12467v1)|null|\n", "2304.12439": "|**2023-04-24**|**TextMesh: Generation of Realistic 3D Meshes From Text Prompts**|Christina Tsalicoglou et.al.|[2304.12439v1](http://arxiv.org/abs/2304.12439v1)|null|\n", "2304.13518": "|**2023-04-26**|**Super-NeRF: View-consistent Detail Generation for NeRF super-resolution**|Yuqi Han et.al.|[2304.13518v1](http://arxiv.org/abs/2304.13518v1)|null|\n", "2304.13386": "|**2023-04-26**|**VGOS: Voxel Grid Optimization for View Synthesis from Sparse Inputs**|Jiakai Sun et.al.|[2304.13386v1](http://arxiv.org/abs/2304.13386v1)|**[link](https://github.com/sjojok/vgos)**|\n", "2304.14401": "|**2023-04-27**|**ActorsNeRF: Animatable Few-shot Human Rendering with Generalizable NeRFs**|Jiteng Mu et.al.|[2304.14401v1](http://arxiv.org/abs/2304.14401v1)|null|\n", "2304.14301": "|**2023-05-03**|**Combining HoloLens with Instant-NeRFs: Advanced Real-Time 3D Mobile Mapping**|Dennis Haitz et.al.|[2304.14301v2](http://arxiv.org/abs/2304.14301v2)|null|\n", "2304.14070": "|**2023-04-27**|**Compositional 3D Human-Object Neural Animation**|Zhi Hou et.al.|[2304.14070v1](http://arxiv.org/abs/2304.14070v1)|null|\n", "2304.14811": "|**2023-04-28**|**NeRF-LiDAR: Generating Realistic LiDAR Point Clouds with Neural Radiance Fields**|Junge Zhang et.al.|[2304.14811v1](http://arxiv.org/abs/2304.14811v1)|null|\n", "2304.14473": "|**2023-04-27**|**Learning a Diffusion Prior for NeRFs**|Guandao Yang et.al.|[2304.14473v1](http://arxiv.org/abs/2304.14473v1)|null|\n", "2305.00787": "|**2023-05-01**|**GeneFace++: Generalized and Stable Real-Time Audio-Driven 3D Talking Face Generation**|Zhenhui Ye et.al.|[2305.00787v1](http://arxiv.org/abs/2305.00787v1)|null|\n", "2305.00375": "|**2023-04-30**|**Neural Radiance Fields (NeRFs): A Review and Some Recent Developments**|Mohamed Debbagh et.al.|[2305.00375v1](http://arxiv.org/abs/2305.00375v1)|null|\n", "2305.00041": "|**2023-04-28**|**ViP-NeRF: Visibility Prior for Sparse Input Neural Radiance Fields**|Nagabhushan Somraj et.al.|[2305.00041v1](http://arxiv.org/abs/2305.00041v1)|**[link](https://github.com/NagabhushanSN95/ViP-NeRF)**|\n", "2305.01643": "|**2023-05-02**|**Neural LiDAR Fields for Novel View Synthesis**|Shengyu Huang et.al.|[2305.01643v1](http://arxiv.org/abs/2305.01643v1)|null|\n", "2305.01190": "|**2023-05-03**|**LatentAvatar: Learning Latent Expression Code for Expressive Neural Head Avatar**|Yuelang Xu et.al.|[2305.01190v2](http://arxiv.org/abs/2305.01190v2)|null|\n", "2305.01163": "|**2023-05-02**|**Federated Neural Radiance Fields**|Lachlan Holden et.al.|[2305.01163v1](http://arxiv.org/abs/2305.01163v1)|**[link](https://github.com/lachholden/fednerf-pytorch)**|\n", "2305.03049": "|**2023-05-04**|**NeuralEditor: Editing Neural Radiance Fields via Manipulating Point Clouds**|Jun-Kun Chen et.al.|[2305.03049v1](http://arxiv.org/abs/2305.03049v1)|null|\n", "2305.02756": "|**2023-05-04**|**Radiance Field Gradient Scaling for Unbiased Near-Camera Training**|Julien Philip et.al.|[2305.02756v1](http://arxiv.org/abs/2305.02756v1)|**[link](https://github.com/gradient-scaling/gradient-scaling.github.io)**|\n", "2305.02618": "|**2023-05-04**|**Semantic-aware Generation of Multi-view Portrait Drawings**|Biao Ma et.al.|[2305.02618v1](http://arxiv.org/abs/2305.02618v1)|**[link](https://github.com/aiart-hdu/sage)**|\n", "2305.03176": "|**2023-05-04**|**NeRF-QA: Neural Radiance Fields Quality Assessment Database**|Pedro Martin et.al.|[2305.03176v1](http://arxiv.org/abs/2305.03176v1)|null|\n", "2305.04789": "|**2023-05-08**|**AvatarReX: Real-time Expressive Full-body Avatars**|Zerong Zheng et.al.|[2305.04789v1](http://arxiv.org/abs/2305.04789v1)|null|\n", "2305.04296": "|**2023-05-07**|**HashCC: Lightweight Method to Improve the Quality of the Camera-less NeRF Scene Generation**|Jan Olszewski et.al.|[2305.04296v1](http://arxiv.org/abs/2305.04296v1)|null|\n", "2305.04268": "|**2023-05-07**|**Multi-Space Neural Radiance Fields**|Ze-Xin Yin et.al.|[2305.04268v1](http://arxiv.org/abs/2305.04268v1)|null|\n", "2305.05594": "|**2023-05-09**|**PET-NeuS: Positional Encoding Tri-Planes for Neural Surfaces**|Yiqun Wang et.al.|[2305.05594v1](http://arxiv.org/abs/2305.05594v1)|**[link](https://github.com/yiqun-wang/pet-neus)**|\n", "2305.04966": "|**2023-05-08**|**NerfAcc: Efficient Sampling Accelerates NeRFs**|Ruilong Li et.al.|[2305.04966v1](http://arxiv.org/abs/2305.04966v1)|null|\n", "2305.06131": "|**2023-05-10**|**Generative AI meets 3D: A Survey on Text-to-3D in AIGC Era**|Chenghao Li et.al.|[2305.06131v1](http://arxiv.org/abs/2305.06131v1)|null|\n", "2305.06118": "|**2023-05-10**|**NeRF$^\\textbf{2}$: Neural Radio-Frequency Radiance Fields**|Xiaopeng Zhao et.al.|[2305.06118v1](http://arxiv.org/abs/2305.06118v1)|null|\n", "2305.05766": "|**2023-05-09**|**Instant-NeRF: Instant On-Device Neural Radiance Field Training via Algorithm-Accelerator Co-Designed Near-Memory Processing**|Yang Zhao et.al.|[2305.05766v1](http://arxiv.org/abs/2305.05766v1)|null|\n", "2305.07342": "|**2023-05-12**|**BundleRecon: Ray Bundle-Based 3D Neural Reconstruction**|Weikun Zhang et.al.|[2305.07342v1](http://arxiv.org/abs/2305.07342v1)|null|\n", "2305.08851": "|**2023-05-15**|**MV-Map: Offboard HD-Map Generation with Multi-view Consistency**|Ziyang Xie et.al.|[2305.08851v1](http://arxiv.org/abs/2305.08851v1)|**[link](https://github.com/ziyang-xie/mv-map)**|\n", "2305.09761": "|**2023-05-16**|**NerfBridge: Bringing Real-time, Online Neural Radiance Field Training to Robotics**|Javier Yu et.al.|[2305.09761v1](http://arxiv.org/abs/2305.09761v1)|**[link](https://github.com/javieryu/nerf_bridge)**|\n", "2305.11167": "|**2023-05-18**|**MVPSNet: Fast Generalizable Multi-view Photometric Stereo**|Dongxu Zhao et.al.|[2305.11167v1](http://arxiv.org/abs/2305.11167v1)|null|\n", "2305.11031": "|**2023-05-18**|**ConsistentNeRF: Enhancing Neural Radiance Fields with 3D Consistency for Sparse View Synthesis**|Shoukang Hu et.al.|[2305.11031v1](http://arxiv.org/abs/2305.11031v1)|**[link](https://github.com/skhu101/consistentnerf)**|\n", "2305.10579": "|**2023-05-17**|**MultiPlaneNeRF: Neural Radiance Field with Non-Trainable Representation**|Dominik Zimny et.al.|[2305.10579v1](http://arxiv.org/abs/2305.10579v1)|**[link](https://github.com/gmum/multiplanenerf)**|\n", "2305.10503": "|**2023-05-24**|**OR-NeRF: Object Removing from 3D Scenes Guided by Multiview Segmentation with Neural Radiance Fields**|Youtan Yin et.al.|[2305.10503v2](http://arxiv.org/abs/2305.10503v2)|**[link](https://github.com/cuteyyt/or-nerf)**|\n", "2305.11588": "|**2023-05-19**|**Text2NeRF: Text-Driven 3D Scene Generation with Neural Radiance Fields**|Jingbo Zhang et.al.|[2305.11588v1](http://arxiv.org/abs/2305.11588v1)|null|\n", "2305.13307": "|**2023-05-22**|**NeRFuser: Large-Scale Scene Representation by NeRF Fusion**|Jiading Fang et.al.|[2305.13307v1](http://arxiv.org/abs/2305.13307v1)|**[link](https://github.com/ripl/nerfuser)**|\n", "2305.12843": "|**2023-05-22**|**Registering Neural Radiance Fields as 3D Density Images**|Han Jiang et.al.|[2305.12843v1](http://arxiv.org/abs/2305.12843v1)|null|\n", "2305.14093": "|**2023-05-24**|**3D Open-vocabulary Segmentation with Foundation Models**|Kunhao Liu et.al.|[2305.14093v2](http://arxiv.org/abs/2305.14093v2)|**[link](https://github.com/kunhao-liu/3d-ovs)**|\n", "2305.15171": "|**2023-05-31**|**Deceptive-NeRF: Enhancing NeRF Reconstruction using Pseudo-Observations from Diffusion Models**|Xinhang Liu et.al.|[2305.15171v2](http://arxiv.org/abs/2305.15171v2)|null|\n", "2305.15094": "|**2023-05-24**|**InpaintNeRF360: Text-Guided 3D Inpainting on Unbounded Neural Radiance Fields**|Dongqing Wang et.al.|[2305.15094v1](http://arxiv.org/abs/2305.15094v1)|null|\n", "2305.14831": "|**2023-05-24**|**OD-NeRF: Efficient Training of On-the-Fly Dynamic Neural Radiance Fields**|Zhiwen Yan et.al.|[2305.14831v1](http://arxiv.org/abs/2305.14831v1)|null|\n", "2305.16233": "|**2023-05-25**|**Interactive Segment Anything NeRF with Feature Imitation**|Xiaokang Chen et.al.|[2305.16233v1](http://arxiv.org/abs/2305.16233v1)|null|\n", "2305.16213": "|**2023-05-25**|**ProlificDreamer: High-Fidelity and Diverse Text-to-3D Generation with Variational Score Distillation**|Zhengyi Wang et.al.|[2305.16213v1](http://arxiv.org/abs/2305.16213v1)|**[link](https://github.com/thu-ml/prolificdreamer)**|\n", "2305.16914": "|**2023-06-06**|**PlaNeRF: SVD Unsupervised 3D Plane Regularization for NeRF Large-Scale Scene Reconstruction**|Fusang Wang et.al.|[2305.16914v3](http://arxiv.org/abs/2305.16914v3)|null|\n", "2305.16411": "|**2023-05-25**|**ZeroAvatar: Zero-shot 3D Avatar Generation from a Single Image**|Zhenzhen Weng et.al.|[2305.16411v1](http://arxiv.org/abs/2305.16411v1)|null|\n", "2305.18079": "|**2023-05-31**|**Towards a Robust Framework for NeRF Evaluation**|Adrian Azzarelli et.al.|[2305.18079v3](http://arxiv.org/abs/2305.18079v3)|**[link](https://github.com/azzarelli/wape)**|\n", "2305.17916": "|**2023-05-31**|**Volume Feature Rendering for Fast Neural Radiance Field Reconstruction**|Kang Han et.al.|[2305.17916v2](http://arxiv.org/abs/2305.17916v2)|null|\n", "2305.19201": "|**2023-05-30**|**D\u00e4RF: Boosting Radiance Fields from Sparse Inputs with Monocular Depth Adaptation**|Jiuhn Song et.al.|[2305.19201v1](http://arxiv.org/abs/2305.19201v1)|**[link](https://github.com/KU-CVLAB/DaRF)**|\n", "2305.19065": "|**2023-05-30**|**Template-free Articulated Neural Point Clouds for Reposable View Synthesis**|Lukas Uzolas et.al.|[2305.19065v1](http://arxiv.org/abs/2305.19065v1)|**[link](https://github.com/lukasuz/articulated-point-nerf)**|\n", "2305.18766": "|**2023-05-31**|**HiFA: High-fidelity Text-to-3D with Advanced Diffusion Guidance**|Junzhe Zhu et.al.|[2305.18766v2](http://arxiv.org/abs/2305.18766v2)|null|\n", "2306.00783": "|**2023-06-01**|**FDNeRF: Semantics-Driven Face Reconstruction, Prompt Editing and Relighting with Diffusion Models**|Hao Zhang et.al.|[2306.00783v1](http://arxiv.org/abs/2306.00783v1)|**[link](https://github.com/billyxyb/fdnerf)**|\n", "2306.00696": "|**2023-06-01**|**Analyzing the Internals of Neural Radiance Fields**|Lukas Radl et.al.|[2306.00696v1](http://arxiv.org/abs/2306.00696v1)|**[link](https://github.com/r4dl/nerfinternals)**|\n", "2306.00547": "|**2023-06-02**|**AvatarStudio: Text-driven Editing of 3D Dynamic Human Head Avatars**|Mohit Mendiratta et.al.|[2306.00547v2](http://arxiv.org/abs/2306.00547v2)|null|\n", "2306.03000": "|**2023-06-05**|**BeyondPixels: A Comprehensive Review of the Evolution of Neural Radiance Fields**|AKM Shahariar Azad Rabby et.al.|[2306.03000v1](http://arxiv.org/abs/2306.03000v1)|null|\n", "2306.02741": "|**2023-06-05**|**ZIGNeRF: Zero-shot 3D Scene Representation with Invertible Generative Neural Radiance Fields**|Kanghyeok Ko et.al.|[2306.02741v1](http://arxiv.org/abs/2306.02741v1)|null|\n", "2306.03727": "|**2023-06-06**|**Towards Visual Foundational Models of Physical Scenes**|Chethan Parameshwara et.al.|[2306.03727v1](http://arxiv.org/abs/2306.03727v1)|null|\n", "2306.03576": "|**2023-06-06**|**Human 3D Avatar Modeling with Implicit Neural Representation: A Brief Survey**|Mingyang Sun et.al.|[2306.03576v1](http://arxiv.org/abs/2306.03576v1)|null|\n", "2306.03207": "|**2023-06-05**|**H2-Mapping: Real-time Dense Mapping Using Hierarchical Hybrid Representation**|Chenxing Jiang et.al.|[2306.03207v1](http://arxiv.org/abs/2306.03207v1)|**[link](https://github.com/sysu-star/h2-mapping)**|\n", "2306.05410": "|**2023-06-08**|**LU-NeRF: Scene and Pose Estimation by Synchronizing Local Unposed NeRFs**|Zezhou Cheng et.al.|[2306.05410v1](http://arxiv.org/abs/2306.05410v1)|null|\n", "2306.05303": "|**2023-06-08**|**Enhance-NeRF: Multiple Performance Evaluation for Neural Radiance Fields**|Qianqiu Tan et.al.|[2306.05303v1](http://arxiv.org/abs/2306.05303v1)|**[link](https://github.com/tanqianq/enhance-nerf)**|\n", "2306.06093": "|**2023-06-09**|**HyP-NeRF: Learning Improved NeRF Priors using a HyperNetwork**|Bipasha Sen et.al.|[2306.06093v1](http://arxiv.org/abs/2306.06093v1)|null|\n", "2306.06044": "|**2023-06-09**|**GANeRF: Leveraging Discriminators to Optimize Neural Radiance Fields**|Barbara Roessle et.al.|[2306.06044v1](http://arxiv.org/abs/2306.06044v1)|null|\n", "2306.05668": "|**2023-06-09**|**RePaint-NeRF: NeRF Editting via Semantic Masks and Diffusion Models**|Xingchen Zhou et.al.|[2306.05668v1](http://arxiv.org/abs/2306.05668v1)|null|\n", "2306.06388": "|**2023-06-10**|**From NeRFLiX to NeRFLiX++: A General NeRF-Agnostic Restorer Paradigm**|Kun Zhou et.al.|[2306.06388v1](http://arxiv.org/abs/2306.06388v1)|null|\n", "2306.06300": "|**2023-06-15**|**NERFBK: A High-Quality Benchmark for NERF-Based 3D Reconstruction**|Ali Karami et.al.|[2306.06300v2](http://arxiv.org/abs/2306.06300v2)|**[link](https://github.com/3dom-fbk/nerfbk)**|\n", "2306.07581": "|**2023-06-13**|**Binary Radiance Fields**|Seungjoo Shin et.al.|[2306.07581v1](http://arxiv.org/abs/2306.07581v1)|null|\n", "2306.09349": "|**2023-06-16**|**UrbanIR: Large-Scale Urban Scene Inverse Rendering from a Single Video**|Zhi-Hao Lin et.al.|[2306.09349v2](http://arxiv.org/abs/2306.09349v2)|null|\n", "2306.08068": "|**2023-06-13**|**DORSal: Diffusion for Object-centric Representations of Scenes $\\textit{et al.}$**|Allan Jabri et.al.|[2306.08068v1](http://arxiv.org/abs/2306.08068v1)|null|\n", "2306.09551": "|**2023-06-15**|**Edit-DiffNeRF: Editing 3D Neural Radiance Fields using 2D Diffusion Model**|Lu Yu et.al.|[2306.09551v1](http://arxiv.org/abs/2306.09551v1)|null|\n", "2306.11556": "|**2023-06-20**|**NeRF synthesis with shading guidance**|Chenbin Li et.al.|[2306.11556v1](http://arxiv.org/abs/2306.11556v1)|null|\n", "2306.10350": "|**2023-06-24**|**MA-NeRF: Motion-Assisted Neural Radiance Fields for Face Synthesis from Sparse Images**|Weichen Zhang et.al.|[2306.10350v2](http://arxiv.org/abs/2306.10350v2)|null|\n", "2306.12423": "|**2023-06-21**|**Benchmarking and Analyzing 3D-aware Image Synthesis with a Modularized Codebase**|Qiuyu Wang et.al.|[2306.12423v1](http://arxiv.org/abs/2306.12423v1)|**[link](https://github.com/qiuyu96/carver)**|\n", "2306.12422": "|**2023-06-21**|**DreamTime: An Improved Optimization Strategy for Text-to-3D Content Creation**|Yukun Huang et.al.|[2306.12422v1](http://arxiv.org/abs/2306.12422v1)|null|\n", "2306.12760": "|**2023-06-22**|**Blended-NeRF: Zero-Shot Object Generation and Blending in Existing Neural Radiance Fields**|Ori Gordon et.al.|[2306.12760v1](http://arxiv.org/abs/2306.12760v1)|**[link](https://github.com/orig333/Blended-NeRF)**|\n", "2306.12570": "|**2023-06-21**|**Local 3D Editing via 3D Distillation of CLIP Knowledge**|Junha Hyung et.al.|[2306.12570v1](http://arxiv.org/abs/2306.12570v1)|null|\n", "2306.15203": "|**2023-06-27**|**Unsupervised Polychromatic Neural Representation for CT Metal Artifact Reduction**|Qing Wu et.al.|[2306.15203v1](http://arxiv.org/abs/2306.15203v1)|**[link](https://github.com/iwuqing/polyner)**|\n", "2306.16541": "|**2023-06-28**|**Envisioning a Next Generation Extended Reality Conferencing System with Efficient Photorealistic Human Rendering**|Chuanyue Shen et.al.|[2306.16541v1](http://arxiv.org/abs/2306.16541v1)|null|\n", "2306.17723": "|**2023-07-16**|**FlipNeRF: Flipped Reflection Rays for Few-shot Novel View Synthesis**|Seunghyeon Seo et.al.|[2306.17723v2](http://arxiv.org/abs/2306.17723v2)|**[link](https://github.com/shawn615/FlipNeRF)**|\n", "2306.17624": "|**2023-07-03**|**Sphere2Vec: A General-Purpose Location Representation Learning over a Spherical Surface for Large-Scale Geospatial Predictions**|Gengchen Mai et.al.|[2306.17624v2](http://arxiv.org/abs/2306.17624v2)|null|\n", "2307.03441": "|**2023-07-07**|**NOFA: NeRF-based One-shot Facial Avatar Reconstruction**|Wangbo Yu et.al.|[2307.03441v1](http://arxiv.org/abs/2307.03441v1)|null|\n", "2307.03404": "|**2023-07-07**|**RGB-D Mapping and Tracking in a Plenoxel Radiance Field**|Andreas L. Teigen et.al.|[2307.03404v1](http://arxiv.org/abs/2307.03404v1)|**[link](https://github.com/ysus33/rgb-d_plenoxel_mapping_tracking)**|\n", "2307.05087": "|**2023-07-11**|**SAR-NeRF: Neural Radiance Fields for Synthetic Aperture Radar Multi-View Representation**|Zhengxin Lei et.al.|[2307.05087v1](http://arxiv.org/abs/2307.05087v1)|null|\n", "2307.08093": "|**2023-07-16**|**Cross-Ray Neural Radiance Fields for Novel-view Synthesis from Unconstrained Image Collections**|Yifan Yang et.al.|[2307.08093v1](http://arxiv.org/abs/2307.08093v1)|**[link](https://github.com/yifyang993/cr-nerf-pytorch)**|\n", "2307.07729": "|**2023-07-15**|**Improving NeRF with Height Data for Utilization of GIS Data**|Hinata Aoki et.al.|[2307.07729v1](http://arxiv.org/abs/2307.07729v1)|null|\n", "2307.09323": "|**2023-07-18**|**Efficient Region-Aware Neural Radiance Fields for High-Fidelity Talking Portrait Synthesis**|Jiahe Li et.al.|[2307.09323v1](http://arxiv.org/abs/2307.09323v1)|**[link](https://github.com/fictionarry/er-nerf)**|\n", "2307.10135": "|**2023-07-19**|**An Improved NeuMIP with Better Accuracy**|Bowen Xue et.al.|[2307.10135v1](http://arxiv.org/abs/2307.10135v1)|null|\n", "2307.09860": "|**2023-07-19**|**Magic NeRF Lens: Interactive Fusion of Neural Radiance Fields for Virtual Facility Inspection**|Ke Li et.al.|[2307.09860v1](http://arxiv.org/abs/2307.09860v1)|**[link](https://github.com/uhhhci/immersive-ngp)**|\n", "2307.09555": "|**2023-07-14**|**Transient Neural Radiance Fields for Lidar View Synthesis and 3D Reconstruction**|Anagh Malik et.al.|[2307.09555v1](http://arxiv.org/abs/2307.09555v1)|null|\n", "2307.10776": "|**2023-07-20**|**Urban Radiance Field Representation with Deformable Neural Mesh Primitives**|Fan Lu et.al.|[2307.10776v1](http://arxiv.org/abs/2307.10776v1)|null|\n", "2307.10664": "|**2023-07-20**|**Lighting up NeRF via Unsupervised Decomposition and Enhancement**|Haoyuan Wang et.al.|[2307.10664v1](http://arxiv.org/abs/2307.10664v1)|**[link](https://github.com/onpix/LLNeRF)**|\n", "2307.11526": "|**2023-07-29**|**CopyRNeRF: Protecting the CopyRight of Neural Radiance Fields**|Ziyuan Luo et.al.|[2307.11526v2](http://arxiv.org/abs/2307.11526v2)|null|\n", "2307.11418": "|**2023-08-07**|**FaceCLIPNeRF: Text-driven 3D Face Manipulation using Deformable Neural Radiance Fields**|Sungwon Hwang et.al.|[2307.11418v2](http://arxiv.org/abs/2307.11418v2)|null|\n", "2307.11335": "|**2023-07-21**|**Tri-MipRF: Tri-Mip Representation for Efficient Anti-Aliasing Neural Radiance Fields**|Wenbo Hu et.al.|[2307.11335v1](http://arxiv.org/abs/2307.11335v1)|null|\n", "2307.12909": "|**2023-07-24**|**Dyn-E: Local Appearance Editing of Dynamic Neural Radiance Fields**|Shangzhan Zhang et.al.|[2307.12909v1](http://arxiv.org/abs/2307.12909v1)|null|\n", "2307.12718": "|**2023-07-24**|**CarPatch: A Synthetic Benchmark for Radiance Field Evaluation on Vehicle Components**|Davide Di Nucci et.al.|[2307.12718v1](http://arxiv.org/abs/2307.12718v1)|null|\n", "2307.12291": "|**2023-07-23**|**TransHuman: A Transformer-based Human Representation for Generalizable Neural Human Rendering**|Xiao Pan et.al.|[2307.12291v1](http://arxiv.org/abs/2307.12291v1)|null|\n", "2307.13908": "|**2023-07-26**|**Points-to-3D: Bridging the Gap between Sparse Points and Shape-Controllable Text-to-3D Generation**|Chaohui Yu et.al.|[2307.13908v1](http://arxiv.org/abs/2307.13908v1)|null|\n", "2307.15058": "|**2023-07-27**|**MARS: An Instance-aware, Modular and Realistic Simulator for Autonomous Driving**|Zirui Wu et.al.|[2307.15058v1](http://arxiv.org/abs/2307.15058v1)|**[link](https://github.com/open-air-sun/mars)**|\n", "2307.14620": "|**2023-07-27**|**NeRF-Det: Learning Geometry-Aware Volumetric Representation for Multi-View 3D Object Detection**|Chenfeng Xu et.al.|[2307.14620v1](http://arxiv.org/abs/2307.14620v1)|**[link](https://github.com/facebookresearch/nerf-det)**|\n", "2307.15333": "|**2023-07-28**|**Dynamic PlenOctree for Adaptive Sampling Refinement in Explicit NeRF**|Haotian Bai et.al.|[2307.15333v1](http://arxiv.org/abs/2307.15333v1)|null|\n", "2307.15131": "|**2023-07-27**|**Seal-3D: Interactive Pixel-Level Editing for Neural Radiance Fields**|Xiangyu Wang et.al.|[2307.15131v1](http://arxiv.org/abs/2307.15131v1)|**[link](https://github.com/windingwind/seal-3d)**|\n", "2308.00462": "|**2023-08-01**|**Context-Aware Talking-Head Video Editing**|Songlin Yang et.al.|[2308.00462v1](http://arxiv.org/abs/2308.00462v1)|null|\n", "2308.01262": "|**2023-08-02**|**Incorporating Season and Solar Specificity into Renderings made by a NeRF Architecture using Satellite Images**|Michael Gableman et.al.|[2308.01262v1](http://arxiv.org/abs/2308.01262v1)|**[link](https://github.com/enterprisecv-6/season-nerf)**|\n", "2308.00773": "|**2023-08-01**|**High-Fidelity Eye Animatable Neural Radiance Fields for Human Face**|Hengfei Wang et.al.|[2308.00773v1](http://arxiv.org/abs/2308.00773v1)|null|\n", "2308.02191": "|**2023-08-04**|**ES-MVSNet: Efficient Framework for End-to-end Self-supervised Multi-View Stereo**|Qiang Zhou et.al.|[2308.02191v1](http://arxiv.org/abs/2308.02191v1)|null|\n", "2308.03280": "|**2023-08-07**|**Mirror-NeRF: Learning Neural Radiance Fields for Mirrors with Whitted-Style Ray Tracing**|Junyi Zeng et.al.|[2308.03280v1](http://arxiv.org/abs/2308.03280v1)|null|\n", "2308.02908": "|**2023-08-05**|**Where and How: Mitigating Confusion in Neural Radiance Fields from Sparse Inputs**|Yanqi Bao et.al.|[2308.02908v1](http://arxiv.org/abs/2308.02908v1)|**[link](https://github.com/bbbbby-99/wah-nerf)**|\n", "2308.02840": "|**2023-08-05**|**Learning Unified Decompositional and Compositional NeRF for Editable Novel View Synthesis**|Yuxin Wang et.al.|[2308.02840v1](http://arxiv.org/abs/2308.02840v1)|null|\n", "2308.02751": "|**2023-08-05**|**NeRFs: The Search for the Best 3D Representation**|Ravi Ramamoorthi et.al.|[2308.02751v1](http://arxiv.org/abs/2308.02751v1)|null|\n", "2308.04413": "|**2023-08-08**|**Digging into Depth Priors for Outdoor Neural Radiance Fields**|Chen Wang et.al.|[2308.04413v1](http://arxiv.org/abs/2308.04413v1)|null|\n", "2308.03772": "|**2023-07-27**|**Improved Neural Radiance Fields Using Pseudo-depth and Fusion**|Jingliang Li et.al.|[2308.03772v1](http://arxiv.org/abs/2308.03772v1)|null|\n", "2308.04826": "|**2023-08-09**|**WaveNeRF: Wavelet-based Generalizable Neural Radiance Fields**|Muyu Xu et.al.|[2308.04826v1](http://arxiv.org/abs/2308.04826v1)|null|\n", "2308.04669": "|**2023-08-14**|**A General Implicit Framework for Fast NeRF Composition and Rendering**|Xinyu Gao et.al.|[2308.04669v2](http://arxiv.org/abs/2308.04669v2)|null|\n", "2308.05970": "|**2023-08-11**|**Focused Specific Objects NeRF**|Yuesong Li et.al.|[2308.05970v1](http://arxiv.org/abs/2308.05970v1)|null|\n", "2308.05939": "|**2023-08-11**|**VERF: Runtime Monitoring of Pose Estimation with Neural Radiance Fields**|Dominic Maggio et.al.|[2308.05939v1](http://arxiv.org/abs/2308.05939v1)|null|\n", "2308.07118": "|**2023-08-16**|**Neural radiance fields in the industrial and robotics domain: applications, research opportunities and use cases**|Eugen \u0160lapak et.al.|[2308.07118v2](http://arxiv.org/abs/2308.07118v2)|**[link](https://github.com/maftej/iisnerf)**|\n", "2308.07032": "|**2023-08-14**|**S3IM: Stochastic Structural SIMilarity and Its Unreasonable Effectiveness for Neural Fields**|Zeke Xie et.al.|[2308.07032v1](http://arxiv.org/abs/2308.07032v1)|**[link](https://github.com/madaoer/s3im_nerf)**|\n", "2308.08530": "|**2023-08-21**|**Ref-DVGO: Reflection-Aware Direct Voxel Grid Optimization for an Improved Quality-Efficiency Trade-Off in Reflective Scene Reconstruction**|Georgios Kouros et.al.|[2308.08530v3](http://arxiv.org/abs/2308.08530v3)|**[link](https://github.com/gkouros/ref-dvgo)**|\n", "2308.08258": "|**2023-08-16**|**SceNeRFlow: Time-Consistent Reconstruction of General Dynamic Scenes**|Edith Tretschk et.al.|[2308.08258v1](http://arxiv.org/abs/2308.08258v1)|null|\n", "2308.09421": "|**2023-08-18**|**MonoNeRD: NeRF-like Representations for Monocular 3D Object Detection**|Junkai Xu et.al.|[2308.09421v1](http://arxiv.org/abs/2308.09421v1)|**[link](https://github.com/cskkxjk/mononerd)**|\n", "2308.09386": "|**2023-08-18**|**DReg-NeRF: Deep Registration for Neural Radiance Fields**|Yu Chen et.al.|[2308.09386v1](http://arxiv.org/abs/2308.09386v1)|**[link](https://github.com/aibluefisher/dreg-nerf)**|\n", "2308.08947": "|**2023-08-17**|**Watch Your Steps: Local Image and Scene Editing by Text Instructions**|Ashkan Mirzaei et.al.|[2308.08947v1](http://arxiv.org/abs/2308.08947v1)|null|\n", "2308.10902": "|**2023-08-30**|**CamP: Camera Preconditioning for Neural Radiance Fields**|Keunhong Park et.al.|[2308.10902v2](http://arxiv.org/abs/2308.10902v2)|null|\n", "2308.10337": "|**2023-08-20**|**Strata-NeRF : Neural Radiance Fields for Stratified Scenes**|Ankit Dhiman et.al.|[2308.10337v1](http://arxiv.org/abs/2308.10337v1)|null|\n", "2308.10122": "|**2023-08-19**|**HollowNeRF: Pruning Hashgrid-Based NeRFs with Trainable Collision Mitigation**|Xiufeng Xie et.al.|[2308.10122v1](http://arxiv.org/abs/2308.10122v1)|null|\n", "2308.10001": "|**2023-08-19**|**AltNeRF: Learning Robust Neural Radiance Field via Alternating Depth-Pose Optimization**|Kun Wang et.al.|[2308.10001v1](http://arxiv.org/abs/2308.10001v1)|null|\n", "2308.09894": "|**2023-08-19**|**Semantic-Human: Neural Rendering of Humans from Monocular Video with Human Parsing**|Jie Zhang et.al.|[2308.09894v1](http://arxiv.org/abs/2308.09894v1)|null|\n", "2308.11198": "|**2023-08-22**|**Novel-view Synthesis and Pose Estimation for Hand-Object Interaction from Sparse Views**|Wentian Qu et.al.|[2308.11198v1](http://arxiv.org/abs/2308.11198v1)|null|\n", "2308.11130": "|**2023-08-22**|**Efficient View Synthesis with Neural Radiance Distribution Field**|Yushuang Wu et.al.|[2308.11130v1](http://arxiv.org/abs/2308.11130v1)|null|\n", "2308.11974": "|**2023-08-23**|**Blending-NeRF: Text-Driven Localized Editing in Neural Radiance Fields**|Hyeonseop Song et.al.|[2308.11974v1](http://arxiv.org/abs/2308.11974v1)|null|\n", "2308.11951": "|**2023-08-25**|**Pose Modulated Avatars from Video**|Chunjin Song et.al.|[2308.11951v2](http://arxiv.org/abs/2308.11951v2)|null|\n", "2308.11793": "|**2023-08-22**|**Enhancing NeRF akin to Enhancing LLMs: Generalizable NeRF Transformer with Mixture-of-View-Experts**|Wenyan Cong et.al.|[2308.11793v1](http://arxiv.org/abs/2308.11793v1)|**[link](https://github.com/vita-group/gnt-move)**|\n", "2308.11774": "|**2023-08-22**|**SAMSNeRF: Segment Anything Model (SAM) Guides Dynamic Surgical Scene Reconstruction by Neural Radiance Field (NeRF)**|Ange Lou et.al.|[2308.11774v1](http://arxiv.org/abs/2308.11774v1)|null|\n", "2308.12560": "|**2023-08-24**|**NOVA: NOvel View Augmentation for Neural Composition of Dynamic Objects**|Dakshit Agrawal et.al.|[2308.12560v1](http://arxiv.org/abs/2308.12560v1)|**[link](https://github.com/dakshitagrawal/nova)**|\n", "2308.13897": "|**2023-08-26**|**InsertNeRF: Instilling Generalizability into NeRF with HyperNet Modules**|Yanqi Bao et.al.|[2308.13897v1](http://arxiv.org/abs/2308.13897v1)|**[link](https://github.com/bbbbby-99/insertnerf)**|\n", "2308.15049": "|**2023-08-29**|**Pose-Free Neural Radiance Fields via Implicit Pose Regularization**|Jiahui Zhang et.al.|[2308.15049v1](http://arxiv.org/abs/2308.15049v1)|null|\n", "2308.14816": "|**2023-08-28**|**CLNeRF: Continual Learning Meets NeRF**|Zhipeng Cai et.al.|[2308.14816v1](http://arxiv.org/abs/2308.14816v1)|**[link](https://github.com/intellabs/clnerf)**|\n", "2308.16041": "|**2023-08-30**|**From Pixels to Portraits: A Comprehensive Survey of Talking Head Generation Techniques and Applications**|Shreyank N Gowda et.al.|[2308.16041v1](http://arxiv.org/abs/2308.16041v1)|null|\n", "2308.15733": "|**2023-08-30**|**Drone-NeRF: Efficient NeRF Based 3D Scene Reconstruction for Large-Scale Drone Survey**|Zhihao Jia et.al.|[2308.15733v1](http://arxiv.org/abs/2308.15733v1)|null|\n", "2308.15547": "|**2023-08-29**|**Efficient Ray Sampling for Radiance Fields Reconstruction**|Shilei Sun et.al.|[2308.15547v1](http://arxiv.org/abs/2308.15547v1)|null|\n", "2308.16576": "|**2023-09-03**|**GHuNeRF: Generalizable Human NeRF from a Monocular Video**|Chen Li et.al.|[2308.16576v2](http://arxiv.org/abs/2308.16576v2)|null|\n", "2309.00277": "|**2023-09-01**|**SparseSat-NeRF: Dense Depth Supervised Neural Radiance Fields for Sparse Satellite Images**|Lulin Zhang et.al.|[2309.00277v1](http://arxiv.org/abs/2309.00277v1)|**[link](https://github.com/lulinzhang/sps-nerf)**|\n", "2309.00014": "|**2023-09-04**|**Improving NeRF Quality by Progressive Camera Placement for Unrestricted Navigation in Complex Environments**|Georgios Kopanas et.al.|[2309.00014v2](http://arxiv.org/abs/2309.00014v2)|null|\n", "2309.01811": "|**2023-09-06**|**Instant Continual Learning of Neural Radiance Fields**|Ryan Po et.al.|[2309.01811v2](http://arxiv.org/abs/2309.01811v2)|null|\n", "2309.01351": "|**2023-09-04**|**Adv3D: Generating 3D Adversarial Examples in Driving Scenarios with NeRF**|Leheng Li et.al.|[2309.01351v1](http://arxiv.org/abs/2309.01351v1)|null|\n", "2309.03185": "|**2023-09-06**|**Bayes' Rays: Uncertainty Quantification for Neural Radiance Fields**|Lily Goli et.al.|[2309.03185v1](http://arxiv.org/abs/2309.03185v1)|**[link](https://github.com/BayesRays/BayesRays)**|\n", "2309.03160": "|**2023-09-06**|**ResFields: Residual Neural Fields for Spatiotemporal Signals**|Marko Mihajlovic et.al.|[2309.03160v1](http://arxiv.org/abs/2309.03160v1)|**[link](https://github.com/markomih/ResFields)**|\n", "2309.03550": "|**2023-09-07**|**Text2Control3D: Controllable 3D Avatar Generation in Neural Radiance Fields using Geometry-Guided Text-to-Image Diffusion Model**|Sungwon Hwang et.al.|[2309.03550v1](http://arxiv.org/abs/2309.03550v1)|null|\n", "2309.04410": "|**2023-09-08**|**DeformToon3D: Deformable 3D Toonification from Neural Radiance Fields**|Junzhe Zhang et.al.|[2309.04410v1](http://arxiv.org/abs/2309.04410v1)|**[link](https://github.com/junzhezhang/deformtoon3d)**|\n", "2309.03955": "|**2023-09-14**|**SimpleNeRF: Regularizing Sparse Input Neural Radiance Fields with Simpler Solutions**|Nagabhushan Somraj et.al.|[2309.03955v2](http://arxiv.org/abs/2309.03955v2)|null|\n", "2309.03933": "|**2023-09-07**|**BluNF: Blueprint Neural Field**|Robin Courant et.al.|[2309.03933v1](http://arxiv.org/abs/2309.03933v1)|null|\n", "2309.05339": "|**2023-09-11**|**PAg-NeRF: Towards fast and efficient end-to-end panoptic 3D representations for agricultural robotics**|Claus Smitt et.al.|[2309.05339v1](http://arxiv.org/abs/2309.05339v1)|null|\n", "2309.04917": "|**2023-09-10**|**Text-driven Editing of 3D Scenes without Retraining**|Shuangkang Fang et.al.|[2309.04917v1](http://arxiv.org/abs/2309.04917v1)|**[link](https://github.com/Fangkang515/DN2N)**|\n", "2309.04750": "|**2023-09-09**|**Mirror-Aware Neural Humans**|Daniel Ajisafe et.al.|[2309.04750v1](http://arxiv.org/abs/2309.04750v1)|null|\n", "2309.04581": "|**2023-09-08**|**Dynamic Mesh-Aware Radiance Fields**|Yi-Ling Qiao et.al.|[2309.04581v1](http://arxiv.org/abs/2309.04581v1)|null|\n", "2309.06030": "|**2023-09-12**|**Federated Learning for Large-Scale Scene Modeling with Neural Radiance Fields**|Teppei Suzuki et.al.|[2309.06030v1](http://arxiv.org/abs/2309.06030v1)|null|\n", "2309.07125": "|**2023-09-13**|**Text-Guided Generation and Editing of Compositional 3D Avatars**|Hao Zhang et.al.|[2309.07125v1](http://arxiv.org/abs/2309.07125v1)|null|\n", "2309.06802": "|**2023-09-13**|**Dynamic NeRFs for Soccer Scenes**|Sacha Lewin et.al.|[2309.06802v1](http://arxiv.org/abs/2309.06802v1)|null|\n", "2309.07846": "|**2023-09-14**|**MC-NeRF: Muti-Camera Neural Radiance Fields for Muti-Camera Image Acquisition Systems**|Yu Gao et.al.|[2309.07846v1](http://arxiv.org/abs/2309.07846v1)|null|\n", "2309.07752": "|**2023-09-14**|**DT-NeRF: Decomposed Triplane-Hash Neural Radiance Fields for High-Fidelity Talking Portrait Synthesis**|Yaoyu Su et.al.|[2309.07752v1](http://arxiv.org/abs/2309.07752v1)|null|\n", "2309.07668": "|**2023-09-14**|**CoRF : Colorizing Radiance Fields using Knowledge Distillation**|Ankit Dhiman et.al.|[2309.07668v1](http://arxiv.org/abs/2309.07668v1)|null|\n", "2309.08596": "|**2023-09-15**|**Robust e-NeRF: NeRF from Sparse & Noisy Events under Non-Uniform Motion**|Weng Fei Low et.al.|[2309.08596v1](http://arxiv.org/abs/2309.08596v1)|**[link](https://github.com/wengflow/robust-e-nerf)**|\n", "2309.08040": "|**2023-09-14**|**Gradient based Grasp Pose Optimization on a NeRF that Approximates Grasp Success**|Gergely S\u00f3ti et.al.|[2309.08040v1](http://arxiv.org/abs/2309.08040v1)|null|\n", "2309.09502": "|**2023-09-18**|**RenderOcc: Vision-Centric 3D Occupancy Prediction with 2D Rendering Supervision**|Mingjie Pan et.al.|[2309.09502v1](http://arxiv.org/abs/2309.09502v1)|**[link](https://github.com/pmj110119/renderocc)**|\n", "2309.09295": "|**2023-09-17**|**NeRF-VINS: A Real-time Neural Radiance Field Map-based Visual-Inertial Navigation System**|Saimouli Katragadda et.al.|[2309.09295v1](http://arxiv.org/abs/2309.09295v1)|null|\n", "2309.08927": "|**2023-09-16**|**DynaMoN: Motion-Aware Fast And Robust Camera Localization for Dynamic NeRF**|Mert Asim Karaoglu et.al.|[2309.08927v1](http://arxiv.org/abs/2309.08927v1)|null|\n", "2309.10684": "|**2023-09-19**|**Locally Stylized Neural Radiance Fields**|Hong-Wing Pang et.al.|[2309.10684v1](http://arxiv.org/abs/2309.10684v1)|null|\n", "2309.10503": "|**2023-09-19**|**Steganography for Neural Radiance Fields by Backdooring**|Weina Dong et.al.|[2309.10503v1](http://arxiv.org/abs/2309.10503v1)|null|\n", "2309.10011": "|**2023-09-18**|**Instant Photorealistic Style Transfer: A Lightweight and Adaptive Approach**|Rong Liu et.al.|[2309.10011v1](http://arxiv.org/abs/2309.10011v1)|null|\n", "2309.11009": "|**2023-09-21**|**Controllable Dynamic Appearance for Neural 3D Portraits**|ShahRukh Athar et.al.|[2309.11009v2](http://arxiv.org/abs/2309.11009v2)|null|\n", "2309.10987": "|**2023-09-20**|**Spiking NeRF: Making Bio-inspired Neural Networks See through the Real World**|Xingting Yao et.al.|[2309.10987v1](http://arxiv.org/abs/2309.10987v1)|null|\n", "2309.12183": "|**2023-09-21**|**ORTexME: Occlusion-Robust Human Shape and Pose via Temporal Average Texture and Mesh Encoding**|Yu Cheng et.al.|[2309.12183v1](http://arxiv.org/abs/2309.12183v1)|null|\n", "2309.11966": "|**2023-09-21**|**NeuralLabeling: A versatile toolset for labeling vision datasets using Neural Radiance Fields**|Floris Erich et.al.|[2309.11966v1](http://arxiv.org/abs/2309.11966v1)|**[link](https://github.com/FlorisE/neural-labeling)**|\n", "2309.11767": "|**2023-09-21**|**Fast Satellite Tensorial Radiance Field for Multi-date Satellite Imagery of Large Size**|Tongtong Zhang et.al.|[2309.11767v1](http://arxiv.org/abs/2309.11767v1)|null|\n", "2309.11747": "|**2023-09-21**|**MarkNerf:Watermarking for Neural Radiance Field**|Lifeng Chen et.al.|[2309.11747v1](http://arxiv.org/abs/2309.11747v1)|null|\n", "2309.11698": "|**2023-09-21**|**Rendering stable features improves sampling-based localisation with Neural radiance fields**|Boxuan Zhang et.al.|[2309.11698v1](http://arxiv.org/abs/2309.11698v1)|null|\n", "2309.11627": "|**2023-09-20**|**GenLayNeRF: Generalizable Layered Representations with 3D Model Alignment for Multi-Human View Synthesis**|Youssef Abdelkareem et.al.|[2309.11627v1](http://arxiv.org/abs/2309.11627v1)|null|\n", "2309.11525": "|**2023-09-23**|**Light Field Diffusion for Single-View Novel View Synthesis**|Yifeng Xiong et.al.|[2309.11525v2](http://arxiv.org/abs/2309.11525v2)|null|\n", "2309.13039": "|**2023-09-22**|**NeRRF: 3D Reconstruction and View Synthesis for Transparent and Specular Objects with Neural Refractive-Reflective Fields**|Xiaoxue Chen et.al.|[2309.13039v1](http://arxiv.org/abs/2309.13039v1)|**[link](https://github.com/dawning77/nerrf)**|\n", "2309.14293": "|**2023-09-25**|**NAS-NeRF: Generative Neural Architecture Search for Neural Radiance Fields**|Saeejith Nair et.al.|[2309.14293v1](http://arxiv.org/abs/2309.14293v1)|null|\n", "2309.14010": "|**2023-09-25**|**Variational Inference for Scalable 3D Object-centric Learning**|Tianyu Wang et.al.|[2309.14010v1](http://arxiv.org/abs/2309.14010v1)|null|\n", "2309.13607": "|**2023-09-24**|**MM-NeRF: Multimodal-Guided 3D Multi-Style Transfer of Neural Radiance Field**|Zijiang Yang et.al.|[2309.13607v1](http://arxiv.org/abs/2309.13607v1)|null|\n", "2309.13240": "|**2023-09-23**|**NeRF-Enhanced Outpainting for Faithful Field-of-View Extrapolation**|Rui Yu et.al.|[2309.13240v1](http://arxiv.org/abs/2309.13240v1)|null|\n", "2309.14800": "|**2023-09-26**|**3D Density-Gradient based Edge Detection on Neural Radiance Fields (NeRFs) for Geometric Reconstruction**|Miriam J\u00e4ger et.al.|[2309.14800v1](http://arxiv.org/abs/2309.14800v1)|null|\n", "2309.15526": "|**2023-09-27**|**P2I-NET: Mapping Camera Pose to Image via Adversarial Learning for New View Synthesis in Real Indoor Environments**|Xujie Kang et.al.|[2309.15526v1](http://arxiv.org/abs/2309.15526v1)|null|\n", "2309.15329": "|**2023-09-27**|**BASED: Bundle-Adjusting Surgical Endoscopic Dynamic Video Reconstruction using Neural Radiance Fields**|Shreya Saha et.al.|[2309.15329v1](http://arxiv.org/abs/2309.15329v1)|null|\n", "2309.16553": "|**2023-09-28**|**MatrixCity: A Large-scale City Dataset for City-scale Neural Rendering and Beyond**|Yixuan Li et.al.|[2309.16553v1](http://arxiv.org/abs/2309.16553v1)|null|\n", "2309.16364": "|**2023-10-04**|**FG-NeRF: Flow-GAN based Probabilistic Neural Radiance Field for Independence-Assumption-Free Uncertainty Estimation**|Songlin Wei et.al.|[2309.16364v2](http://arxiv.org/abs/2309.16364v2)|null|\n", "2309.16110": "|**2023-09-28**|**Learning Effective NeRFs and SDFs Representations with 3D Generative Adversarial Networks for 3D Object Generation: Technical Report for ICCV 2023 OmniObject3D Challenge**|Zheyuan Yang et.al.|[2309.16110v1](http://arxiv.org/abs/2309.16110v1)|null|\n", "2309.17450": "|**2023-09-29**|**Multi-task View Synthesis with Neural Radiance Fields**|Shuhong Zheng et.al.|[2309.17450v1](http://arxiv.org/abs/2309.17450v1)|**[link](https://github.com/zsh2000/muvienerf)**|\n", "2309.17390": "|**2023-09-29**|**Forward Flow for Novel View Synthesis of Dynamic Scenes**|Xiang Guo et.al.|[2309.17390v1](http://arxiv.org/abs/2309.17390v1)|null|\n", "2309.17128": "|**2023-09-29**|**HAvatar: High-fidelity Head Avatar via Facial Model Conditioned Neural Radiance Field**|Xiaochen Zhao et.al.|[2309.17128v1](http://arxiv.org/abs/2309.17128v1)|null|\n", "2309.16859": "|**2023-09-28**|**Preface: A Data-driven Volumetric Prior for Few-shot Ultra High-resolution Face Synthesis**|Marcel C. B\u00fchler et.al.|[2309.16859v1](http://arxiv.org/abs/2309.16859v1)|null|\n", "2310.01881": "|**2023-10-03**|**Adaptive Multi-NeRF: Exploit Efficient Parallelism in Adaptive Multiple Scale Neural Radiance Field Rendering**|Tong Wang et.al.|[2310.01881v1](http://arxiv.org/abs/2310.01881v1)|null|\n", "2310.01821": "|**2023-10-03**|**MIMO-NeRF: Fast Neural Rendering with Multi-input Multi-output Neural Radiance Fields**|Takuhiro Kaneko et.al.|[2310.01821v1](http://arxiv.org/abs/2310.01821v1)|null|\n", "2310.00874": "|**2023-10-02**|**PC-NeRF: Parent-Child Neural Radiance Fields under Partial Sensor Data Loss in Autonomous Driving Environments**|Xiuzhong Hu et.al.|[2310.00874v1](http://arxiv.org/abs/2310.00874v1)|**[link](https://github.com/biter0088/pc-nerf)**|\n", "2310.00684": "|**2023-10-01**|**How Many Views Are Needed to Reconstruct an Unknown Object Using NeRF?**|Sicong Pan et.al.|[2310.00684v1](http://arxiv.org/abs/2310.00684v1)|**[link](https://github.com/psc0628/nerf-prv)**|\n", "2310.00530": "|**2023-10-01**|**Enabling Neural Radiance Fields (NeRF) for Large-scale Aerial Images -- A Multi-tiling Approaching and the Geometry Assessment of NeRF**|Ningli Xu et.al.|[2310.00530v1](http://arxiv.org/abs/2310.00530v1)|null|\n", "2310.00249": "|**2023-09-30**|**MMPI: a Flexible Radiance Field Representation by Multiple Multi-plane Images Blending**|Yuze He et.al.|[2310.00249v1](http://arxiv.org/abs/2310.00249v1)|null|\n", "2310.02977": "|**2023-10-04**|**T$^3$Bench: Benchmarking Current Progress in Text-to-3D Generation**|Yuze He et.al.|[2310.02977v1](http://arxiv.org/abs/2310.02977v1)|**[link](https://github.com/THU-LYJ-Lab/T3Bench)**|\n", "2310.02712": "|**2023-10-04**|**ED-NeRF: Efficient Text-Guided Editing of 3D Scene using Latent Space NeRF**|Jangho Park et.al.|[2310.02712v1](http://arxiv.org/abs/2310.02712v1)|null|\n", "2310.02687": "|**2023-10-05**|**USB-NeRF: Unrolling Shutter Bundle Adjusted Neural Radiance Fields**|Moyang Li et.al.|[2310.02687v2](http://arxiv.org/abs/2310.02687v2)|null|\n", "2310.02437": "|**2023-10-03**|**EvDNeRF: Reconstructing Event Data with Dynamic Neural Radiance Fields**|Anish Bhattacharya et.al.|[2310.02437v1](http://arxiv.org/abs/2310.02437v1)|**[link](https://github.com/anish-bhattacharya/evdnerf)**|\n", "2310.03704": "|**2023-10-05**|**Drag View: Generalizable Novel View Synthesis with Unposed Imagery**|Zhiwen Fan et.al.|[2310.03704v1](http://arxiv.org/abs/2310.03704v1)|**[link](https://github.com/zhiwenfan/DragView)**|\n", "2310.03578": "|**2023-10-05**|**Targeted Adversarial Attacks on Generalizable Neural Radiance Fields**|Andras Horvath et.al.|[2310.03578v1](http://arxiv.org/abs/2310.03578v1)|null|\n", "2310.03563": "|**2023-10-05**|**BID-NeRF: RGB-D image pose estimation with inverted Neural Radiance Fields**|\u00c1goston Istv\u00e1n Csehi et.al.|[2310.03563v1](http://arxiv.org/abs/2310.03563v1)|null|\n", "2310.03125": "|**2023-10-04**|**Shielding the Unseen: Privacy Protection through Poisoning NeRF with Spatial Deformation**|Yihan Wu et.al.|[2310.03125v1](http://arxiv.org/abs/2310.03125v1)|null|\n", "2310.04152": "|**2023-10-06**|**Improving Neural Radiance Field using Near-Surface Sampling with Point Cloud Generation**|Hye Bin Yoo et.al.|[2310.04152v1](http://arxiv.org/abs/2310.04152v1)|null|\n", "2310.05837": "|**2023-10-09**|**A Real-time Method for Inserting Virtual Objects into Neural Radiance Fields**|Keyang Ye et.al.|[2310.05837v1](http://arxiv.org/abs/2310.05837v1)|null|\n", "2310.05391": "|**2023-10-09**|**Neural Impostor: Editing Neural Radiance Fields with Explicit Shape Manipulation**|Ruiyang Liu et.al.|[2310.05391v1](http://arxiv.org/abs/2310.05391v1)|null|\n", "2310.05134": "|**2023-10-08**|**LocoNeRF: A NeRF-based Approach for Local Structure from Motion for Precise Localization**|Artem Nenashev et.al.|[2310.05134v1](http://arxiv.org/abs/2310.05134v1)|null|\n", "2310.05133": "|**2023-10-08**|**Geometry Aware Field-to-field Transformations for 3D Semantic Segmentation**|Dominik Hollidt et.al.|[2310.05133v1](http://arxiv.org/abs/2310.05133v1)|null|\n", "2310.06275": "|**2023-10-10**|**High-Fidelity 3D Head Avatars Reconstruction through Spatially-Varying Expression Conditioned Neural Radiance Field**|Minghan Qin et.al.|[2310.06275v1](http://arxiv.org/abs/2310.06275v1)|null|\n", "2310.07449": "|**2023-10-12**|**PoRF: Pose Residual Field for Accurate Neural Surface Reconstruction**|Jia-Wang Bian et.al.|[2310.07449v2](http://arxiv.org/abs/2310.07449v2)|null|\n", "2310.07179": "|**2023-10-11**|**rpcPRF: Generalizable MPI Neural Radiance Field for Satellite Camera**|Tongtong Zhang et.al.|[2310.07179v1](http://arxiv.org/abs/2310.07179v1)|null|\n", "2310.06984": "|**2023-10-10**|**Leveraging Neural Radiance Fields for Uncertainty-Aware Visual Localization**|Le Chen et.al.|[2310.06984v1](http://arxiv.org/abs/2310.06984v1)|null|\n", "2310.07916": "|**2023-10-11**|**Dynamic Appearance Particle Neural Radiance Field**|Ancheng Lin et.al.|[2310.07916v1](http://arxiv.org/abs/2310.07916v1)|null|\n", "2310.10650": "|**2023-10-16**|**TraM-NeRF: Tracing Mirror and Near-Perfect Specular Reflections through Neural Radiance Fields**|Leif Van Holland et.al.|[2310.10650v1](http://arxiv.org/abs/2310.10650v1)|**[link](https://github.com/Rubikalubi/TraM-NeRF)**|\n", "2310.10624": "|**2023-10-16**|**DynVideo-E: Harnessing Dynamic NeRF for Large-Scale Motion- and View-Change Human-Centric Video Editing**|Jia-Wei Liu et.al.|[2310.10624v1](http://arxiv.org/abs/2310.10624v1)|null|\n", "2310.10209": "|**2023-10-16**|**Self-supervised Fetal MRI 3D Reconstruction Based on Radiation Diffusion Generation Model**|Junpeng Tan et.al.|[2310.10209v1](http://arxiv.org/abs/2310.10209v1)|null|\n", "2310.09965": "|**2023-10-15**|**ProteusNeRF: Fast Lightweight NeRF Editing using 3D-Aware Image Context**|Binglun Wang et.al.|[2310.09965v1](http://arxiv.org/abs/2310.09965v1)|null|\n", "2310.09892": "|**2023-10-15**|**Active Perception using Neural Radiance Fields**|Siming He et.al.|[2310.09892v1](http://arxiv.org/abs/2310.09892v1)|**[link](https://github.com/grasp-lyrl/active-perception-using-neural-radiance-fields)**|\n", "2310.09776": "|**2023-10-15**|**CBARF: Cascaded Bundle-Adjusting Neural Radiance Fields from Imperfect Camera Poses**|Hongyu Fu et.al.|[2310.09776v1](http://arxiv.org/abs/2310.09776v1)|null|\n", "2310.11864": "|**2023-10-18**|**VQ-NeRF: Neural Reflectance Decomposition and Editing with Vector Quantization**|Hongliang Zhong et.al.|[2310.11864v1](http://arxiv.org/abs/2310.11864v1)|null|\n", "2310.11645": "|**2023-10-18**|**Towards Abdominal 3-D Scene Rendering from Laparoscopy Surgical Videos using NeRFs**|Khoa Tuan Nguyen et.al.|[2310.11645v1](http://arxiv.org/abs/2310.11645v1)|null|\n", "2310.13670": "|**2023-10-20**|**ManifoldNeRF: View-dependent Image Feature Supervision for Few-shot Neural Radiance Fields**|Daiju Kanaoka et.al.|[2310.13670v1](http://arxiv.org/abs/2310.13670v1)|null|\n", "2310.13356": "|**2023-10-20**|**Sync-NeRF: Generalizing Dynamic NeRFs to Unsynchronized Videos**|Seoha Kim et.al.|[2310.13356v1](http://arxiv.org/abs/2310.13356v1)|**[link](https://github.com/seoha-kim/Sync-NeRF)**|\n", "2310.13263": "|**2023-10-20**|**UE4-NeRF:Neural Radiance Field for Real-Time Rendering of Large-Scale Scene**|Jiaming Gu et.al.|[2310.13263v1](http://arxiv.org/abs/2310.13263v1)|null|\n", "2310.14695": "|**2023-10-23**|**CAwa-NeRF: Instant Learning of Compression-Aware NeRF Features**|Omnia Mahmoud et.al.|[2310.14695v1](http://arxiv.org/abs/2310.14695v1)|null|\n", "2310.14487": "|**2023-10-23**|**VQ-NeRF: Vector Quantization Enhances Implicit Neural Representations**|Yiying Yang et.al.|[2310.14487v1](http://arxiv.org/abs/2310.14487v1)|null|\n", "2310.15504": "|**2023-10-24**|**Cross-view Self-localization from Synthesized Scene-graphs**|Ryogo Yamamoto et.al.|[2310.15504v1](http://arxiv.org/abs/2310.15504v1)|null|\n", "2310.16832": "|**2023-10-26**|**LightSpeed: Light and Fast Neural Light Fields on Mobile Devices**|Aarush Gupta et.al.|[2310.16832v2](http://arxiv.org/abs/2310.16832v2)|**[link](https://github.com/lightspeed-r2l/lightspeed)**|\n", "2310.16831": "|**2023-10-28**|**PERF: Panoramic Neural Radiance Field from a Single Panorama**|Guangcong Wang et.al.|[2310.16831v2](http://arxiv.org/abs/2310.16831v2)|**[link](https://github.com/perf-project/PeRF)**|\n", "2310.16383": "|**2023-10-25**|**Open-NeRF: Towards Open Vocabulary NeRF Decomposition**|Hao Zhang et.al.|[2310.16383v1](http://arxiv.org/abs/2310.16383v1)|null|\n", "2310.16255": "|**2023-10-25**|**UAV-Sim: NeRF-based Synthetic Data Generation for UAV-based Perception**|Christopher Maxey et.al.|[2310.16255v1](http://arxiv.org/abs/2310.16255v1)|null|\n", "2310.17075": "|**2023-10-27**|**HyperFields: Towards Zero-Shot Generation of NeRFs from Text**|Sudarshan Babu et.al.|[2310.17075v2](http://arxiv.org/abs/2310.17075v2)|null|\n", "2310.16858": "|**2023-10-25**|**4D-Editor: Interactive Object-level Editing in Dynamic Neural Radiance Fields via 4D Semantic Segmentation**|Dadong Jiang et.al.|[2310.16858v1](http://arxiv.org/abs/2310.16858v1)|null|\n", "2310.17994": "|**2023-10-27**|**ZeroNVS: Zero-Shot 360-Degree View Synthesis from a Single Real Image**|Kyle Sargent et.al.|[2310.17994v1](http://arxiv.org/abs/2310.17994v1)|null|\n", "2310.17880": "|**2023-10-27**|**Reconstructive Latent-Space Neural Radiance Fields for Efficient 3D Scene Representations**|Tristan Aumentado-Armstrong et.al.|[2310.17880v1](http://arxiv.org/abs/2310.17880v1)|null|\n", "2310.18917": "|**2023-11-04**|**TiV-NeRF: Tracking and Mapping via Time-Varying Representation with Dynamic Neural Radiance Fields**|Chengyao Duan et.al.|[2310.18917v2](http://arxiv.org/abs/2310.18917v2)|null|\n", "2310.18846": "|**2023-10-28**|**INCODE: Implicit Neural Conditioning with Prior Knowledge Embeddings**|Amirhossein Kazerouni et.al.|[2310.18846v1](http://arxiv.org/abs/2310.18846v1)|**[link](https://github.com/xmindflow/INCODE)**|\n", "2310.20710": "|**2023-10-31**|**FPO++: Efficient Encoding and Rendering of Dynamic Neural Radiance Fields by Analyzing and Enhancing Fourier PlenOctrees**|Saskia Rabich et.al.|[2310.20710v1](http://arxiv.org/abs/2310.20710v1)|null|\n", "2310.20685": "|**2023-10-31**|**NeRF Revisited: Fixing Quadrature Instability in Volume Rendering**|Mikaela Angelina Uy et.al.|[2310.20685v1](http://arxiv.org/abs/2310.20685v1)|null|\n", "2310.19464": "|**2023-10-30**|**Generative Neural Fields by Mixtures of Neural Implicit Functions**|Tackgeun You et.al.|[2310.19464v1](http://arxiv.org/abs/2310.19464v1)|null|\n", "2311.01065": "|**2023-11-02**|**Novel View Synthesis from a Single RGBD Image for Indoor Scenes**|Congrui Hetang et.al.|[2311.01065v1](http://arxiv.org/abs/2311.01065v1)|null|\n", "2311.01815": "|**2023-11-03**|**Estimating 3D Uncertainty Field: Quantifying Uncertainty for Neural Radiance Fields**|Jianxiong Shen et.al.|[2311.01815v1](http://arxiv.org/abs/2311.01815v1)|null|\n", "2311.01773": "|**2023-11-03**|**PDF: Point Diffusion Implicit Function for Large-scale Scene Neural Representation**|Yuhan Ding et.al.|[2311.01773v1](http://arxiv.org/abs/2311.01773v1)|null|\n", "2311.01659": "|**2023-11-03**|**Efficient Cloud Pipelines for Neural Radiance Fields**|Derek Jacoby et.al.|[2311.01659v1](http://arxiv.org/abs/2311.01659v1)|null|\n", "2311.03140": "|**2023-11-06**|**Animating NeRFs from Texture Space: A Framework for Pose-Dependent Rendering of Human Performances**|Paul Knoll et.al.|[2311.03140v1](http://arxiv.org/abs/2311.03140v1)|null|\n", "2311.02826": "|**2023-11-06**|**InstructPix2NeRF: Instructed 3D Portrait Editing from a Single Image**|Jianhui Li et.al.|[2311.02826v1](http://arxiv.org/abs/2311.02826v1)|**[link](https://github.com/mybabyyh/instructpix2nerf)**|\n", "2311.04154": "|**2023-11-07**|**High-fidelity 3D Reconstruction of Plants using Neural Radiance Field**|Kewei Hu et.al.|[2311.04154v1](http://arxiv.org/abs/2311.04154v1)|null|\n", "2311.03965": "|**2023-11-07**|**Fast Sun-aligned Outdoor Scene Relighting based on TensoRF**|Yeonjin Chang et.al.|[2311.03965v1](http://arxiv.org/abs/2311.03965v1)|null|\n", "2311.03784": "|**2023-11-08**|**UP-NeRF: Unconstrained Pose-Prior-Free Neural Radiance Fields**|Injae Kim et.al.|[2311.03784v2](http://arxiv.org/abs/2311.03784v2)|**[link](https://github.com/mlvlab/upnerf)**|\n", "2311.03484": "|**2023-11-06**|**Osprey: Multi-Session Autonomous Aerial Mapping with LiDAR-based SLAM and Next Best View Planning**|Rowan Border et.al.|[2311.03484v1](http://arxiv.org/abs/2311.03484v1)|null|\n", "2311.04400": "|**2023-11-08**|**LRM: Large Reconstruction Model for Single Image to 3D**|Yicong Hong et.al.|[2311.04400v1](http://arxiv.org/abs/2311.04400v1)|null|\n", "2311.04246": "|**2023-11-07**|**ADFactory: Automated Data Factory for Optical Flow Tasks**|Han Ling et.al.|[2311.04246v1](http://arxiv.org/abs/2311.04246v1)|null|\n", "2311.05521": "|**2023-11-09**|**BakedAvatar: Baking Neural Fields for Real-Time Head Avatar Synthesis**|Hao-Bin Duan et.al.|[2311.05521v1](http://arxiv.org/abs/2311.05521v1)|null|\n", "2311.05461": "|**2023-11-09**|**Control3D: Towards Controllable Text-to-3D Generation**|Yang Chen et.al.|[2311.05461v1](http://arxiv.org/abs/2311.05461v1)|null|\n"}}
\ No newline at end of file
+{"Kinematic Mapping": {"2302.11988": "|**2023-02-23**|**Time Complexity of Broadcast and Consensus for Randomized Oblivious Message Adversaries**|Antoine El-Hayek et.al.|[2302.11988v1](http://arxiv.org/abs/2302.11988v1)|null|\n", "2302.09743": "|**2023-02-20**|**Dynamic Optimal Control: A Real-Time Control Optimization Algorithm for Dynamic Networks**|Chunyu Pan et.al.|[2302.09743v1](http://arxiv.org/abs/2302.09743v1)|null|\n", "2302.09382": "|**2023-02-18**|**Co-trading networks for modeling dynamic interdependency structures and estimating high-dimensional covariances in US equity markets**|Yutong Lu et.al.|[2302.09382v1](http://arxiv.org/abs/2302.09382v1)|null|\n", "2302.07657": "|**2023-02-15**|**Dynamic Flows with Time-Dependent Capacities**|Thomas Bl\u00e4sius et.al.|[2302.07657v1](http://arxiv.org/abs/2302.07657v1)|null|\n", "2302.04377": "|**2023-02-08**|**ER network heterogeneity guides diffusive transport and kinetics**|Zubenelgenubi C. Scott et.al.|[2302.04377v1](http://arxiv.org/abs/2302.04377v1)|null|\n", "2302.03677": "|**2023-02-24**|**Wealth distribution on a dynamic complex network**|Gustavo Kohlrausch et.al.|[2302.03677v2](http://arxiv.org/abs/2302.03677v2)|null|\n", "2302.03039": "|**2023-02-06**|**SUPER VII. Morphology and kinematics of H$\u03b1$ emission in AGN host galaxies at Cosmic noon using SINFONI**|D. Kakkad et.al.|[2302.03039v1](http://arxiv.org/abs/2302.03039v1)|null|\n", "2302.02313": "|**2023-02-05**|**A Game-Theoretic Approach to Solving the Roman Domination Problem**|Xiuyang Chen et.al.|[2302.02313v1](http://arxiv.org/abs/2302.02313v1)|null|\n", "2302.01694": "|**2023-02-03**|**Coevolving Boolean and Multi-Valued Regulatory Networks**|Larry Bull et.al.|[2302.01694v1](http://arxiv.org/abs/2302.01694v1)|null|\n", "2301.12892": "|**2023-01-30**|**Quantifying and maximizing the information flux in recurrent neural networks**|Claus Metzner et.al.|[2301.12892v1](http://arxiv.org/abs/2301.12892v1)|null|\n", "2301.12156": "|**2023-03-23**|**Perspective: How to overcome dynamical density functional theory**|Daniel de las Heras et.al.|[2301.12156v2](http://arxiv.org/abs/2301.12156v2)|null|\n", "2301.11982": "|**2023-02-01**|**Strategy evolution on dynamic networks**|Qi Su et.al.|[2301.11982v2](http://arxiv.org/abs/2301.11982v2)|null|\n", "2301.10962": "|**2023-01-26**|**Scheduling Policy for Value-of-Information (VoI) in Trajectory Estimation for Digital Twins**|Van-Phuc Bui et.al.|[2301.10962v1](http://arxiv.org/abs/2301.10962v1)|null|\n", "2301.07849": "|**2023-01-19**|**Efficient Computation in Congested Anonymous Dynamic Networks**|Giuseppe A. Di Luna et.al.|[2301.07849v1](http://arxiv.org/abs/2301.07849v1)|null|\n", "2301.07515": "|**2023-01-15**|**Towards the development of Dynamic Networked Psychology Hypotheses**|Liaquat Hossain et.al.|[2301.07515v1](http://arxiv.org/abs/2301.07515v1)|null|\n", "2301.04904": "|**2023-01-12**|**Lesion-aware Dynamic Kernel for Polyp Segmentation**|Ruifei Zhang et.al.|[2301.04904v1](http://arxiv.org/abs/2301.04904v1)|**[link](https://github.com/reafly/ldnet)**|\n", "2301.04296": "|**2023-01-11**|**A degree-corrected Cox model for dynamic networks**|Yuguo Chen et.al.|[2301.04296v1](http://arxiv.org/abs/2301.04296v1)|null|\n", "2301.03965": "|**2023-01-10**|**BiCurNet: Pre-Movement EEG based Neural Decoder for Biceps Curl Trajectory Estimation**|Manali Saini et.al.|[2301.03965v1](http://arxiv.org/abs/2301.03965v1)|null|\n", "2301.01314": "|**2023-01-03**|**Network-theoretic modeling of fluid-structure interactions**|Aditya G. Nair et.al.|[2301.01314v1](http://arxiv.org/abs/2301.01314v1)|null|\n", "2212.12843": "|**2022-12-25**|**A Note on Improved Results for One Round Distributed Clique Listing**|Quanquan C. Liu et.al.|[2212.12843v1](http://arxiv.org/abs/2212.12843v1)|null|\n", "2212.12345": "|**2022-12-23**|**Piecewise-Velocity Model for Learning Continuous-time Dynamic Node Representations**|Abdulkadir \u00c7elikkanat et.al.|[2212.12345v1](http://arxiv.org/abs/2212.12345v1)|null|\n", "2212.12130": "|**2023-02-04**|**Learning to Detect and Segment for Open Vocabulary Object Detection**|Tao Wang et.al.|[2212.12130v2](http://arxiv.org/abs/2212.12130v2)|null|\n", "2212.09483": "|**2022-12-19**|**Adaptive Control of Client Selection and Gradient Compression for Efficient Federated Learning**|Zhida Jiang et.al.|[2212.09483v1](http://arxiv.org/abs/2212.09483v1)|null|\n", "2212.08358": "|**2022-12-16**|**Some recent trends in embeddings of time series and dynamic networks**|Dag Tj\u00f8stheim et.al.|[2212.08358v1](http://arxiv.org/abs/2212.08358v1)|null|\n", "2212.08314": "|**2023-01-30**|**Synchronization-preserving clusters in hypergraphs**|Anirban Banerjee et.al.|[2212.08314v2](http://arxiv.org/abs/2212.08314v2)|null|\n", "2212.08239": "|**2022-12-16**|**Discovering Structural Hole Spanners in Dynamic Networks via Graph Neural Networks**|Diksha Goel et.al.|[2212.08239v1](http://arxiv.org/abs/2212.08239v1)|null|\n", "2212.07961": "|**2022-12-15**|**Topological Data Analysis Detects Percolation Thresholds in Arctic Melt-Pond Evolution**|Wilfred Offord et.al.|[2212.07961v1](http://arxiv.org/abs/2212.07961v1)|**[link](https://github.com/wilfofford/tda-for-sea-ice-percolation)**|\n", "2212.05980": "|**2022-12-12**|**Evaluation of RGB-D SLAM in Large Indoor Environments**|Kirill Muravyev et.al.|[2212.05980v1](http://arxiv.org/abs/2212.05980v1)|null|\n", "2212.03999": "|**2022-12-07**|**On the application of dimensionality reduction and clustering algorithms for the classification of kinematic morphologies of galaxies**|M. S. Rosito et.al.|[2212.03999v1](http://arxiv.org/abs/2212.03999v1)|null|\n", "2212.02410": "|**2023-03-17**|**Antipodal Self-Duality for a Four-Particle Form Factor**|Lance J. Dixon et.al.|[2212.02410v2](http://arxiv.org/abs/2212.02410v2)|null|\n", "2212.02383": "|**2022-12-05**|**An Approach for Detecting Dynamic Communities in Social Networks**|Souaad Boudebza et.al.|[2212.02383v1](http://arxiv.org/abs/2212.02383v1)|**[link](https://github.com/Yquetzal/ECML_PKDD_2019)**|\n", "2212.01594": "|**2022-12-03**|**Parameterized temporal exploration problems**|Thomas Erlebach et.al.|[2212.01594v1](http://arxiv.org/abs/2212.01594v1)|null|\n", "2211.16726": "|**2022-11-30**|**Boosted Dynamic Neural Networks**|Haichao Yu et.al.|[2211.16726v1](http://arxiv.org/abs/2211.16726v1)|**[link](https://github.com/SHI-Labs/Boosted-Dynamic-Networks)**|\n", "2211.15301": "|**2022-11-28**|**Learning Coherent Clusters in Weakly-Connected Network Systems**|Hancheng Min et.al.|[2211.15301v1](http://arxiv.org/abs/2211.15301v1)|null|\n", "2211.15043": "|**2022-11-28**|**Higher-order Knowledge Transfer for Dynamic Community Detection with Great Changes**|Huixin Ma et.al.|[2211.15043v1](http://arxiv.org/abs/2211.15043v1)|null|\n", "2211.14560": "|**2023-01-24**|**A dynamic multi-region MFD model for ride-sourcing with ridesplitting**|Caio Vitor Beojone et.al.|[2211.14560v2](http://arxiv.org/abs/2211.14560v2)|null|\n", "2211.12589": "|**2022-11-22**|**Building Squares with Optimal State Complexity in Restricted Active Self-Assembly**|Robert M. Alaniz et.al.|[2211.12589v1](http://arxiv.org/abs/2211.12589v1)|**[link](https://github.com/asarg/autotile)**|\n", "2211.11876": "|**2022-11-21**|**Structural Modelling of Dynamic Networks and Identifying Maximum Likelihood**|Christian Gourieroux et.al.|[2211.11876v1](http://arxiv.org/abs/2211.11876v1)|null|\n", "2211.11352": "|**2023-01-30**|**Brief Announcement: Broadcasting Time in Dynamic Rooted Trees is Linear**|Antoine El-Hayek et.al.|[2211.11352v3](http://arxiv.org/abs/2211.11352v3)|null|\n", "2211.11069": "|**2022-11-20**|**Learning Nonlinear Couplings in Network of Agents from a Single Sample Trajectory**|Arash Amini et.al.|[2211.11069v1](http://arxiv.org/abs/2211.11069v1)|null|\n", "2211.10825": "|**2022-11-20**|**Identifiability of dynamic networks: the essential r\u00f4le of dources and dinks**|Eduardo Mapurunga et.al.|[2211.10825v1](http://arxiv.org/abs/2211.10825v1)|null|\n", "2211.10151": "|**2023-01-27**|**Asymptotically Tight Bounds on the Time Complexity of Broadcast and its Variants in Dynamic Networks**|Antoine El-Hayek et.al.|[2211.10151v2](http://arxiv.org/abs/2211.10151v2)|null|\n", "2211.09139": "|**2022-11-16**|**The Pandora project. I: the impact of radiation and cosmic rays on baryonic and dark matter properties of dwarf galaxies**|Sergio Martin-Alvarez et.al.|[2211.09139v1](http://arxiv.org/abs/2211.09139v1)|null|\n", "2211.08820": "|**2022-11-16**|**Computing-Aware Routing for LEO Satellite Networks: A Transmission and Computation Integration Approach**|Jiaqi Cao et.al.|[2211.08820v1](http://arxiv.org/abs/2211.08820v1)|null|\n", "2211.08700": "|**2023-02-14**|**Bi-directional Digital Twin and Edge Computing in the Metaverse**|Jiadong Yu et.al.|[2211.08700v2](http://arxiv.org/abs/2211.08700v2)|null|\n", "2211.08639": "|**2022-11-16**|**Hierarchical Dynamic Image Harmonization**|Haoxing Chen et.al.|[2211.08639v1](http://arxiv.org/abs/2211.08639v1)|**[link](https://github.com/chenhaoxing/hdnet)**|\n", "2211.08378": "|**2022-11-15**|**Anomaly Detection in Multiplex Dynamic Networks: from Blockchain Security to Brain Disease Prediction**|Ali Behrouz et.al.|[2211.08378v1](http://arxiv.org/abs/2211.08378v1)|**[link](https://github.com/ubc-systopia/anomuly)**|\n", "2211.09664": "|**2022-11-15**|**Influencer Detection with Dynamic Graph Neural Networks**|Elena Tiukhova et.al.|[2211.09664v1](http://arxiv.org/abs/2211.09664v1)|**[link](https://github.com/banking-analytics-lab/dynamicgraphlearning)**|\n", "2211.07570": "|**2022-11-14**|**Tides Need STEMMED: A Locally Operating Spatio-Temporal Mutually Exciting Point Process with Dynamic Network for Improving Opioid Overdose Death Prediction**|Che-Yi Liao et.al.|[2211.07570v1](http://arxiv.org/abs/2211.07570v1)|null|\n", "2211.07449": "|**2022-11-14**|**Dual-based Online Learning of Dynamic Network Topologies**|Seyed Saman Saboksayr et.al.|[2211.07449v1](http://arxiv.org/abs/2211.07449v1)|null|\n", "2302.12759": "|**2023-02-24**|**Modularity-based approach for tracking communities in dynamic social networks**|Michele Mazza et.al.|[2302.12759v1](http://arxiv.org/abs/2302.12759v1)|null|\n", "2302.13629": "|**2023-02-27**|**Estimation of continuous environments by robot swarms: Correlated networks and decision-making**|Mohsen Raoufi et.al.|[2302.13629v1](http://arxiv.org/abs/2302.13629v1)|null|\n", "2302.13292": "|**2023-02-26**|**Discovering Top-k Structural Hole Spanners in Dynamic Networks**|Diksha Goel et.al.|[2302.13292v1](http://arxiv.org/abs/2302.13292v1)|null|\n", "2211.05668": "|**2022-12-07**|**Mapping the Milky Way Disk with Gaia DR3: 3D extended kinematic maps and rotation curve to $\\approx 30$ kpc**|Hai-Feng Wang et.al.|[2211.05668v2](http://arxiv.org/abs/2211.05668v2)|null|\n", "2211.01538": "|**2023-03-12**|**$D^2$SLAM: Decentralized and Distributed Collaborative Visual-inertial SLAM System for Aerial Swarm**|Hao Xu et.al.|[2211.01538v3](http://arxiv.org/abs/2211.01538v3)|**[link](https://github.com/hkust-aerial-robotics/d2slam)**|\n", "2210.14842": "|**2022-10-26**|**Continuum Robot State Estimation Using Gaussian Process Regression on $SE(3)$**|Sven Lilge et.al.|[2210.14842v1](http://arxiv.org/abs/2210.14842v1)|null|\n", "2210.04572": "|**2022-10-10**|**Floorplan-Aware Camera Poses Refinement**|Anna Sokolova et.al.|[2210.04572v1](http://arxiv.org/abs/2210.04572v1)|null|\n", "2210.03412": "|**2022-10-07**|**The Trajectory PHD Filter for Coexisting Point and Extended Target Tracking**|Shaoxiu Wei et.al.|[2210.03412v1](http://arxiv.org/abs/2210.03412v1)|null|\n", "2210.01320": "|**2022-11-23**|**Wi-Closure: Reliable and Efficient Search of Inter-robot Loop Closures Using Wireless Sensing**|Weiying Wang et.al.|[2210.01320v2](http://arxiv.org/abs/2210.01320v2)|null|\n", "2209.09723": "|**2023-02-22**|**GANet: Goal Area Network for Motion Forecasting**|Mingkun Wang et.al.|[2209.09723v3](http://arxiv.org/abs/2209.09723v3)|**[link](https://github.com/kingwmk/ganet)**|\n", "2212.03441": "|**2023-03-23**|**Higher topological complexity of a map**|Cesar A. Ipanaque Zapata et.al.|[2212.03441v2](http://arxiv.org/abs/2212.03441v2)|null|\n", "2304.09043": "|**2023-05-16**|**Continuous-Time Range-Only Pose Estimation**|Abhishek Goudar et.al.|[2304.09043v2](http://arxiv.org/abs/2304.09043v2)|null|\n", "2304.11694": "|**2023-04-25**|**Vehicle State Estimation and Prediction**|Xinchen Li et.al.|[2304.11694v2](http://arxiv.org/abs/2304.11694v2)|null|\n", "2306.01188": "|**2023-09-12**|**Event-based Stereo Visual Odometry with Native Temporal Resolution via Continuous-time Gaussian Process Regression**|Jianeng Wang et.al.|[2306.01188v5](http://arxiv.org/abs/2306.01188v5)|null|\n", "2306.01056": "|**2023-06-01**|**ERGO-ML: Towards a robust machine learning model for inferring the fraction of accreted stars in galaxies from integral-field spectroscopic maps**|Eirini Angeloudi et.al.|[2306.01056v1](http://arxiv.org/abs/2306.01056v1)|null|\n", "2306.11091": "|**2023-06-19**|**Composite Bulges -- IV. Detecting Signatures of Gas Inflows in the IFU data: The MUSE View of Ionized Gas Kinematics in NGC 1097**|Tutku Kolcu et.al.|[2306.11091v1](http://arxiv.org/abs/2306.11091v1)|null|\n", "2306.14573": "|**2023-06-26**|**Hydrodynamic simulations of the Disk of Gas Around Supermassive black holes (HDGAS) -I; Molecular Gas Dynamics**|Mojtaba Raouf et.al.|[2306.14573v1](http://arxiv.org/abs/2306.14573v1)|null|\n", "2307.00728": "|**2023-07-03**|**A new approach to QCD evolution in processes with massive partons**|Benoit Assi et.al.|[2307.00728v1](http://arxiv.org/abs/2307.00728v1)|null|\n", "2307.03207": "|**2023-07-06**|**H$\u03b1$ Kinematics of Superbubbles and Supernova Remnants of the Dwarf galaxy NGC 4214**|M. S\u00e1nchez-Cruces et.al.|[2307.03207v1](http://arxiv.org/abs/2307.03207v1)|null|\n", "2307.10381": "|**2023-07-19**|**Accelerating galaxy dynamical modeling using a neural network for joint lensing and kinematics analyses**|Matthew R. Gomer et.al.|[2307.10381v1](http://arxiv.org/abs/2307.10381v1)|null|\n", "2307.14125": "|**2023-07-26**|**Multi-IMU Proprioceptive State Estimator for Humanoid Robots**|Fabio Elnecave Xavier et.al.|[2307.14125v1](http://arxiv.org/abs/2307.14125v1)|null|\n", "2308.04071": "|**2023-08-08**|**Path Signatures for Diversity in Probabilistic Trajectory Optimisation**|Lucas Barcelos et.al.|[2308.04071v1](http://arxiv.org/abs/2308.04071v1)|null|\n", "2308.08654": "|**2023-08-16**|**Advancing Brain-Computer Interface System Performance in Hand Trajectory Estimation with NeuroKinect**|Sidharth Pancholi et.al.|[2308.08654v1](http://arxiv.org/abs/2308.08654v1)|null|\n", "2308.11493": "|**2023-08-22**|**Looking into the faintEst WIth MUSE (LEWIS): on the nature of ultra-diffuse galaxies in the Hydra-I cluster.I. Project description and preliminary results**|Enrichetta Iodice et.al.|[2308.11493v1](http://arxiv.org/abs/2308.11493v1)|null|\n", "2308.12418": "|**2023-08-23**|**Certifiably Optimal Rotation and Pose Estimation Based on the Cayley Map**|Timothy D Barfoot et.al.|[2308.12418v1](http://arxiv.org/abs/2308.12418v1)|null|\n", "2308.16620": "|**2023-08-31**|**GA-NIFS: JWST/NIRSpec IFU observations of HFLS3 reveal a dense galaxy group at z~6.3**|G. C. Jones et.al.|[2308.16620v1](http://arxiv.org/abs/2308.16620v1)|null|\n", "2309.03396": "|**2023-09-06**|**Detection of open cluster rotation fields from Gaia EDR3 proper motions**|Pedro Guilherme-Garcia et.al.|[2309.03396v1](http://arxiv.org/abs/2309.03396v1)|null|\n", "2309.06792": "|**2023-09-13**|**Motion-Bias-Free Feature-Based SLAM**|Alejandro Fontan et.al.|[2309.06792v1](http://arxiv.org/abs/2309.06792v1)|null|\n", "2309.09808": "|**2023-09-18**|**Coco-LIC: Continuous-Time Tightly-Coupled LiDAR-Inertial-Camera Odometry using Non-Uniform B-spline**|Xiaolei Lang et.al.|[2309.09808v1](http://arxiv.org/abs/2309.09808v1)|**[link](https://github.com/april-zju/coco-lic)**|\n", "2309.09011": "|**2023-09-16**|**Optimal Initialization Strategies for Range-Only Trajectory Estimation**|Abhishek Goudar et.al.|[2309.09011v1](http://arxiv.org/abs/2309.09011v1)|null|\n", "2309.08780": "|**2023-09-15**|**Simultaneous Trajectory Estimation and Mapping for Autonomous Underwater Proximity Operations**|Aldo Ter\u00e1n Espinoza et.al.|[2309.08780v1](http://arxiv.org/abs/2309.08780v1)|null|\n", "2309.11134": "|**2023-09-20**|**GNSS/Multi-Sensor Fusion Using Continuous-Time Factor Graph Optimization for Robust Localization**|Haoming Zhang et.al.|[2309.11134v1](http://arxiv.org/abs/2309.11134v1)|**[link](https://github.com/rwth-irt/gnssfgo)**|\n", "2309.15065": "|**2023-09-26**|**Language-EXtended Indoor SLAM (LEXIS): A Versatile System for Real-time Visual Scene Understanding**|Christina Kassab et.al.|[2309.15065v1](http://arxiv.org/abs/2309.15065v1)|null|\n", "2310.03353": "|**2023-10-05**|**Deep Geometric Learning with Monotonicity Constraints for Alzheimer's Disease Progression**|Seungwoo Jeong et.al.|[2310.03353v1](http://arxiv.org/abs/2310.03353v1)|null|\n", "2310.06249": "|**2023-10-10**|**l-dyno: framework to learn consistent visual features using robot's motion**|Kartikeya Singh et.al.|[2310.06249v1](http://arxiv.org/abs/2310.06249v1)|null|\n", "2310.10723": "|**2023-10-16**|**Kinematical coherence between satellite galaxies and host stellar discs for MaNGA & SAMI galaxies**|Sen Wang et.al.|[2310.10723v1](http://arxiv.org/abs/2310.10723v1)|null|\n", "2310.12776": "|**2023-10-19**|**First holistic modelling of meteoroid ablation and fragmentation: A case study of the Orionids recorded by the Canadian Automated Meteor Observatory**|Denis Vida et.al.|[2310.12776v1](http://arxiv.org/abs/2310.12776v1)|null|\n", "2310.14506": "|**2023-10-23**|**Label Space Partition Selection for Multi-Object Tracking Using Two-Layer Partitioning**|Ji Youn Lee et.al.|[2310.14506v1](http://arxiv.org/abs/2310.14506v1)|null|\n"}, "Map fusion": {"2302.11106": "|**2023-02-22**|**Multi-Head Feature Pyramid Networks for Breast Mass Detection**|Hexiang Zhang et.al.|[2302.11106v1](http://arxiv.org/abs/2302.11106v1)|null|\n", "2301.09213": "|**2023-01-24**|**FRAME: Fast and Robust Autonomous 3D point cloud Map-merging for Egocentric multi-robot exploration**|Nikolaos Stathoulopoulos et.al.|[2301.09213v2](http://arxiv.org/abs/2301.09213v2)|null|\n", "2212.01538": "|**2022-12-03**|**Multi-resolution Monocular Depth Map Fusion by Self-supervised Gradient-based Composition**|Yaqiao Dai et.al.|[2212.01538v1](http://arxiv.org/abs/2212.01538v1)|**[link](https://github.com/yuinsky/gradient-based-depth-map-fusion)**|\n", "2211.03423": "|**2022-11-07**|**Detecting Invalid Map Merges in Lifelong SLAM**|Matthias Holoch et.al.|[2211.03423v1](http://arxiv.org/abs/2211.03423v1)|null|\n", "2209.10775": "|**2022-09-22**|**MUI-TARE: Multi-Agent Cooperative Exploration with Unknown Initial Position**|Jingtian Yan et.al.|[2209.10775v1](http://arxiv.org/abs/2209.10775v1)|null|\n", "2209.08988": "|**2022-09-19**|**MSA-GCN:Multiscale Adaptive Graph Convolution Network for Gait Emotion Recognition**|Yunfei Yin et.al.|[2209.08988v1](http://arxiv.org/abs/2209.08988v1)|null|\n", "2209.03096": "|**2022-09-07**|**Spherical wedge billiard: from chaos to fractals and Talbot carpets**|Tom\u00e1\u0161 Tyc et.al.|[2209.03096v1](http://arxiv.org/abs/2209.03096v1)|null|\n", "2208.06293": "|**2022-08-12**|**dual unet:a novel siamese network for change detection with cascade differential fusion**|Kaixuan Jiang et.al.|[2208.06293v1](http://arxiv.org/abs/2208.06293v1)|null|\n", "2207.09210": "|**2023-10-23**|**KinD-LCE Curve Estimation And Retinex Fusion On Low-Light Image**|Xiaochun Lei et.al.|[2207.09210v3](http://arxiv.org/abs/2207.09210v3)|null|\n", "2207.06965": "|**2023-06-27**|**AutoMerge: A Framework for Map Assembling and Smoothing in City-scale Environments**|Peng Yin et.al.|[2207.06965v4](http://arxiv.org/abs/2207.06965v4)|null|\n", "2203.00436": "|**2022-03-01**|**Boundary Corrected Multi-scale Fusion Network for Real-time Semantic Segmentation**|Tianjiao Jiang et.al.|[2203.00436v1](http://arxiv.org/abs/2203.00436v1)|null|\n", "2202.08498": "|**2022-02-17**|**Mirror-Yolo: An attention-based instance segmentation and detection model for mirrors**|Fengze Li et.al.|[2202.08498v1](http://arxiv.org/abs/2202.08498v1)|null|\n", "2201.11937": "|**2022-01-28**|**Stereo Matching with Cost Volume based Sparse Disparity Propagation**|Wei Xue et.al.|[2201.11937v1](http://arxiv.org/abs/2201.11937v1)|null|\n", "2201.10152": "|**2022-01-29**|**Unsupervised Image Fusion Method based on Feature Mutual Mapping**|Dongyu Rao et.al.|[2201.10152v2](http://arxiv.org/abs/2201.10152v2)|null|\n", "2112.13222": "|**2022-01-24**|**Edge Robotics: Edge-Computing-Accelerated Multi-Robot Simultaneous Localization and Mapping**|Peng Huang et.al.|[2112.13222v2](http://arxiv.org/abs/2112.13222v2)|null|\n", "2112.11044": "|**2021-12-21**|**Extending Merge Resolution to a Family of Proof Systems**|Sravanthi Chede et.al.|[2112.11044v1](http://arxiv.org/abs/2112.11044v1)|null|\n", "2111.14990": "|**2021-11-29**|**MIXER: A Principled Framework for Multimodal, Multiway Data Association**|Parker C. Lusk et.al.|[2111.14990v1](http://arxiv.org/abs/2111.14990v1)|null|\n", "2110.12338": "|**2021-10-24**|**Quality Map Fusion for Adversarial Learning**|Uche Osahor et.al.|[2110.12338v1](http://arxiv.org/abs/2110.12338v1)|null|\n", "2110.08172": "|**2021-10-18**|**MLFC: From 10 to 50 Planners in the Multi-Agent Programming Contest**|Rafael C. Cardoso et.al.|[2110.08172v2](http://arxiv.org/abs/2110.08172v2)|null|\n", "2110.06697": "|**2021-10-13**|**Semantic Image Fusion**|P. R. Hill et.al.|[2110.06697v1](http://arxiv.org/abs/2110.06697v1)|null|\n", "2110.06436": "|**2021-10-13**|**Non-local Recurrent Regularization Networks for Multi-view Stereo**|Qingshan Xu et.al.|[2110.06436v1](http://arxiv.org/abs/2110.06436v1)|null|\n", "2108.08623": "|**2021-08-19**|**VolumeFusion: Deep Depth Fusion for 3D Scene Reconstruction**|Jaesung Choe et.al.|[2108.08623v1](http://arxiv.org/abs/2108.08623v1)|null|\n", "2106.11515": "|**2021-06-23**|**Cooperative mmWave PHD-SLAM with Moving Scatterers**|Hyowon Kim et.al.|[2106.11515v2](http://arxiv.org/abs/2106.11515v2)|null|\n", "2106.10220": "|**2021-06-18**|**Semantic navigation with domain knowledge**|Rafael Gomes Braga et.al.|[2106.10220v1](http://arxiv.org/abs/2106.10220v1)|null|\n", "2106.04512": "|**2021-06-22**|**Formal Verification of a Map Merging Protocol in the Multi-Agent Programming Contest**|Matt Luckcuck et.al.|[2106.04512v2](http://arxiv.org/abs/2106.04512v2)|null|\n", "2105.14994": "|**2021-05-31**|**MAOMaps: A Photo-Realistic Benchmark For vSLAM and Map Merging Quality Assessment**|Andrey Bokovoy et.al.|[2105.14994v1](http://arxiv.org/abs/2105.14994v1)|**[link](https://github.com/CnnDepth/MAOMaps)**|\n", "2103.13246": "|**2021-03-24**|**Generic Merging of Structure from Motion Maps with a Low Memory Footprint**|Gabrielle Flood et.al.|[2103.13246v1](http://arxiv.org/abs/2103.13246v1)|null|\n", "2103.03786": "|**2022-09-22**|**Distributed Dynamic Map Fusion via Federated Learning for Intelligent Networked Vehicles**|Zijian Zhang et.al.|[2103.03786v3](http://arxiv.org/abs/2103.03786v3)|**[link](https://github.com/zijianzhang/CARLA_INVS)**|\n", "2102.10929": "|**2021-02-22**|**Deep Learning for Robust Motion Segmentation with Non-Static Cameras**|Markus Bosch et.al.|[2102.10929v1](http://arxiv.org/abs/2102.10929v1)|null|\n", "2012.10658": "|**2021-02-24**|**Generalize a Small Pre-trained Model to Arbitrarily Large TSP Instances**|Zhang-Hua Fu et.al.|[2012.10658v2](http://arxiv.org/abs/2012.10658v2)|**[link](https://github.com/Spider-scnu/TSP)**|\n", "2011.14791": "|**2021-06-08**|**NeuralFusion: Online Depth Fusion in Latent Space**|Silvan Weder et.al.|[2011.14791v2](http://arxiv.org/abs/2011.14791v2)|**[link](https://github.com/weders/NeuralFusion)**|\n", "2011.03975": "|**2020-11-11**|**Mapless-Planner: A Robust and Fast Planning Framework for Aggressive Autonomous Flight without Map Fusion**|Jialin Ji et.al.|[2011.03975v2](http://arxiv.org/abs/2011.03975v2)|null|\n", "2010.03026": "|**2020-11-16**|**Place Recognition in Forests with Urquhart Tessellations**|Guilherme V. Nardari et.al.|[2010.03026v2](http://arxiv.org/abs/2010.03026v2)|**[link](https://github.com/gnardari/urquhart)**|\n", "2009.05819": "|**2020-09-12**|**Map-merging Algorithms for Visual SLAM: Feasibility Study and Empirical Evaluation**|Andrey Bokovoy et.al.|[2009.05819v1](http://arxiv.org/abs/2009.05819v1)|null|\n", "2007.14177": "|**2020-07-28**|**Generative networks as inverse problems with fractional wavelet scattering networks**|Jiasong Wu et.al.|[2007.14177v1](http://arxiv.org/abs/2007.14177v1)|null|\n", "2007.02295": "|**2020-07-05**|**Multi view stereo with semantic priors**|Elisavet Konstantina Stathopoulou et.al.|[2007.02295v1](http://arxiv.org/abs/2007.02295v1)|null|\n", "2007.02108": "|**2020-07-04**|**SplitFusion: Simultaneous Tracking and Mapping for Non-Rigid Scenes**|Yang Li et.al.|[2007.02108v1](http://arxiv.org/abs/2007.02108v1)|null|\n", "2006.00420": "|**2020-05-31**|**VIR-SLAM: Visual, Inertial, and Ranging SLAM for single and multi-robot systems**|Yanjun Cao et.al.|[2006.00420v1](http://arxiv.org/abs/2006.00420v1)|null|\n", "2002.10342": "|**2020-02-24**|**Comparing View-Based and Map-Based Semantic Labelling in Real-Time SLAM**|Zoe Landgraf et.al.|[2002.10342v1](http://arxiv.org/abs/2002.10342v1)|null|\n", "2001.09796": "|**2020-01-16**|**Knowledge Integration of Collaborative Product Design Using Cloud Computing Infrastructure**|Mahdi Bohlouli et.al.|[2001.09796v1](http://arxiv.org/abs/2001.09796v1)|null|\n", "2001.04388": "|**2020-04-03**|**RoutedFusion: Learning Real-time Depth Map Fusion**|Silvan Weder et.al.|[2001.04388v2](http://arxiv.org/abs/2001.04388v2)|**[link](https://github.com/weders/RoutedFusion)**|\n", "1909.00703": "|**2019-09-02**|**Learned Semantic Multi-Sensor Depth Map Fusion**|Denys Rozumnyi et.al.|[1909.00703v1](http://arxiv.org/abs/1909.00703v1)|null|\n", "1908.11585": "|**2019-08-30**|**ORBSLAM-Atlas: a robust and accurate multi-map system**|Richard Elvira et.al.|[1908.11585v1](http://arxiv.org/abs/1908.11585v1)|null|\n", "1908.10541": "|**2020-06-07**|**Search and Rescue under the Forest Canopy using Multiple UAVs**|Yulun Tian et.al.|[1908.10541v2](http://arxiv.org/abs/1908.10541v2)|null|\n", "1908.09806": "|**2020-02-26**|**5G mmWave Cooperative Positioning and Mapping using Multi-Model PHD Filter and Map Fusion**|Hyowon Kim et.al.|[1908.09806v3](http://arxiv.org/abs/1908.09806v3)|**[link](https://github.com/HyowonKim-P1/5GmmWavePHDFilterMapFusion)**|\n", "1905.11257": "|**2019-05-27**|**IRAS23385+6053: An embedded massive cluster in the making**|R. Cesaroni et.al.|[1905.11257v1](http://arxiv.org/abs/1905.11257v1)|null|\n", "1812.08402": "|**2018-12-20**|**SFA: Small Faces Attention Face Detector**|Shi Luo et.al.|[1812.08402v1](http://arxiv.org/abs/1812.08402v1)|**[link](https://github.com/shiluo1990/SFA)**|\n", "1811.07632": "|**2018-11-21**|**Collaborative Dense SLAM**|Louis Gallagher et.al.|[1811.07632v2](http://arxiv.org/abs/1811.07632v2)|null|\n", "1810.00457": "|**2019-03-14**|**AgriColMap: Aerial-Ground Collaborative 3D Mapping for Precision Farming**|Ciro Potena et.al.|[1810.00457v2](http://arxiv.org/abs/1810.00457v2)|null|\n", "1809.09646": "|**2019-03-05**|**Efficient Constellation-Based Map-Merging for Semantic SLAM**|Kristoffer M. Frey et.al.|[1809.09646v2](http://arxiv.org/abs/1809.09646v2)|null|\n", "2306.15416": "|**2023-07-04**|**Irregular Change Detection in Sparse Bi-Temporal Point Clouds using Learned Place Recognition Descriptors and Point-to-Voxel Comparison**|Nikolaos Stathoulopoulos et.al.|[2306.15416v2](http://arxiv.org/abs/2306.15416v2)|null|\n", "2307.00500": "|**2023-07-02**|**CQLite: Communication-Efficient Multi-Robot Exploration Using Coverage-biased Distributed Q-Learning**|Ehsan Latif et.al.|[2307.00500v1](http://arxiv.org/abs/2307.00500v1)|null|\n", "2212.08334": "|**2023-07-10**|**Lightweight integration of 3D features to improve 2D image segmentation**|Olivier Pradelle et.al.|[2212.08334v2](http://arxiv.org/abs/2212.08334v2)|**[link](https://github.com/opradelle/2dguidedlight3d)**|\n", "2307.07126": "|**2023-07-14**|**Multi-Session, Localization-oriented and Lightweight LiDAR Mapping Using Semantic Lines and Planes**|Zehuan Yu et.al.|[2307.07126v1](http://arxiv.org/abs/2307.07126v1)|null|\n", "2308.02674": "|**2023-08-04**|**Group-$k$ consistent measurement set maximization via maximum clique over k-Uniform hypergraphs for robust multi-robot map merging**|Brendon Forsgren et.al.|[2308.02674v1](http://arxiv.org/abs/2308.02674v1)|**[link](https://bitbucket.org/jmangelson/gkcm)**|\n", "2308.08715": "|**2023-08-17**|**V-FUSE: Volumetric Depth Map Fusion with Long-Range Constraints**|Nathaniel Burgdorfer et.al.|[2308.08715v1](http://arxiv.org/abs/2308.08715v1)|**[link](https://github.com/nburgdorfer/v-fuse)**|\n", "2311.03146": "|**2023-11-06**|**Enabling In-Situ Resources Utilisation by leveraging collaborative robotics and astronaut-robot interaction**|Silvia Romero-Azpitarte et.al.|[2311.03146v1](http://arxiv.org/abs/2311.03146v1)|null|\n"}, "MultiModality": {"2302.12248": "|**2023-02-23**|**Learning Visual Representations via Language-Guided Sampling**|Mohamed El Banani et.al.|[2302.12248v1](http://arxiv.org/abs/2302.12248v1)|**[link](https://github.com/mbanani/lgssl)**|\n", "2302.11939": "|**2023-02-23**|**Power Time Series Forecasting by Pretrained LM**|Tian Zhou et.al.|[2302.11939v1](http://arxiv.org/abs/2302.11939v1)|**[link](https://github.com/damo-di-ml/one_fits_all)**|\n", "2302.11713": "|**2023-02-24**|**Can Pre-trained Vision and Language Models Answer Visual Information-Seeking Questions?**|Yang Chen et.al.|[2302.11713v2](http://arxiv.org/abs/2302.11713v2)|**[link](https://github.com/edchengg/infoseek_eval)**|\n", "2302.11529": "|**2023-02-22**|**Modular Deep Learning**|Jonas Pfeiffer et.al.|[2302.11529v1](http://arxiv.org/abs/2302.11529v1)|null|\n", "2302.11458": "|**2023-02-22**|**Fusing Visual Appearance and Geometry for Multi-modality 6DoF Object Tracking**|Manuel Stoiber et.al.|[2302.11458v1](http://arxiv.org/abs/2302.11458v1)|**[link](https://github.com/dlr-rm/3dobjecttracking)**|\n", "2302.11352": "|**2023-02-22**|**X-TRA: Improving Chest X-ray Tasks with Cross-Modal Retrieval Augmentation**|Tom van Sonsbeek et.al.|[2302.11352v1](http://arxiv.org/abs/2302.11352v1)|null|\n", "2302.11254": "|**2023-02-22**|**Cross-modal Audio-visual Co-learning for Text-independent Speaker Verification**|Meng Liu et.al.|[2302.11254v1](http://arxiv.org/abs/2302.11254v1)|**[link](https://github.com/danielmengliu/audiovisuallip)**|\n", "2302.11154": "|**2023-02-24**|**Open-domain Visual Entity Recognition: Towards Recognizing Millions of Wikipedia Entities**|Hexiang Hu et.al.|[2302.11154v2](http://arxiv.org/abs/2302.11154v2)|**[link](https://github.com/edchengg/oven_eval)**|\n", "2302.11097": "|**2023-02-22**|**A Multi-Modal Neural Geometric Solver with Textual Clauses Parsed from Diagram**|Ming-Liang Zhang et.al.|[2302.11097v1](http://arxiv.org/abs/2302.11097v1)|**[link](https://github.com/mingliangzhang2018/pgps)**|\n", "2302.11082": "|**2023-02-22**|**BB-GCN: A Bi-modal Bridged Graph Convolutional Network for Multi-label Chest X-Ray Recognition**|Guoli Wang et.al.|[2302.11082v1](http://arxiv.org/abs/2302.11082v1)|null|\n", "2302.11025": "|**2023-02-21**|**Asteroseismology of $\u03b4$ Scuti stars: emulating model grids using a neural network**|Owen J. Scutt et.al.|[2302.11025v1](http://arxiv.org/abs/2302.11025v1)|null|\n", "2302.11021": "|**2023-02-21**|**MVMTnet: A Multi-variate Multi-modal Transformer for Multi-class Classification of Cardiac Irregularities Using ECG Waveforms and Clinical Notes**|Ankur Samanta et.al.|[2302.11021v1](http://arxiv.org/abs/2302.11021v1)|null|\n", "2302.10873": "|**2023-02-21**|**Context-Aware Timewise VAEs for Real-Time Vehicle Trajectory Prediction**|Pei Xu et.al.|[2302.10873v1](http://arxiv.org/abs/2302.10873v1)|**[link](https://github.com/xupei0610/contextvae)**|\n", "2302.10859": "|**2023-02-21**|**SF2Former: Amyotrophic Lateral Sclerosis Identification From Multi-center MRI Data Using Spatial and Frequency Fusion Transformer**|Rafsanjany Kushol et.al.|[2302.10859v1](http://arxiv.org/abs/2302.10859v1)|**[link](https://github.com/raoyongming/GFNet)**|\n", "2302.10813": "|**2023-02-21**|**Tracking Objects and Activities with Attention for Temporal Sentence Grounding**|Zeyu Xiong et.al.|[2302.10813v1](http://arxiv.org/abs/2302.10813v1)|null|\n", "2302.10632": "|**2023-02-23**|**Multi-Modal Self-Supervised Learning for Recommendation**|Wei Wei et.al.|[2302.10632v2](http://arxiv.org/abs/2302.10632v2)|**[link](https://github.com/hkuds/mmssl)**|\n", "2302.10511": "|**2023-02-21**|**MVFusion: Multi-View 3D Object Detection with Semantic-aligned Radar and Camera Fusion**|Zizhang Wu et.al.|[2302.10511v1](http://arxiv.org/abs/2302.10511v1)|null|\n", "2302.10465": "|**2023-02-21**|**A Flexible Multi-view Multi-modal Imaging System for Outdoor Scenes**|Meng Zhang et.al.|[2302.10465v1](http://arxiv.org/abs/2302.10465v1)|null|\n", "2302.10035": "|**2023-02-20**|**Large-scale Multi-Modal Pre-trained Models: A Comprehensive Survey**|Xiao Wang et.al.|[2302.10035v1](http://arxiv.org/abs/2302.10035v1)|**[link](https://github.com/wangxiao5791509/multimodal_bigmodels_survey)**|\n", "2302.09934": "|**2023-02-20**|**CISum: Learning Cross-modality Interaction to Enhance Multimodal Semantic Coverage for Multimodal Summarization**|Litian Zhang et.al.|[2302.09934v1](http://arxiv.org/abs/2302.09934v1)|null|\n", "2302.09850": "|**2023-02-20**|**Constraint and Union for Partially-Supervised Temporal Sentence Grounding**|Chen Ju et.al.|[2302.09850v1](http://arxiv.org/abs/2302.09850v1)|null|\n", "2302.09636": "|**2023-02-19**|**Interpretable Medical Image Visual Question Answering via Multi-Modal Relationship Graph Learning**|Xinyue Hu et.al.|[2302.09636v1](http://arxiv.org/abs/2302.09636v1)|null|\n", "2302.09328": "|**2023-02-18**|**SSVMR: Saliency-based Self-training for Video-Music Retrieval**|Xuxin Cheng et.al.|[2302.09328v1](http://arxiv.org/abs/2302.09328v1)|null|\n", "2302.08958": "|**2023-02-17**|**Towards Unifying Medical Vision-and-Language Pre-training via Soft Prompts**|Zhihong Chen et.al.|[2302.08958v1](http://arxiv.org/abs/2302.08958v1)|**[link](https://github.com/zhjohnchan/ptunifier)**|\n", "2302.08888": "|**2023-02-17**|**Multimodal Federated Learning via Contrastive Representation Ensemble**|Qiying Yu et.al.|[2302.08888v1](http://arxiv.org/abs/2302.08888v1)|**[link](https://github.com/flair-thu/creamfl)**|\n", "2302.08820": "|**2023-02-17**|**Understanding Stationary and Moving Direct Skin Vibrotactile Stimulation on the Palm**|Hesham Elsayed et.al.|[2302.08820v1](http://arxiv.org/abs/2302.08820v1)|null|\n", "2302.08774": "|**2023-02-17**|**Vision, Deduction and Alignment: An Empirical Study on Multi-modal Knowledge Graph Alignment**|Yangning Li et.al.|[2302.08774v1](http://arxiv.org/abs/2302.08774v1)|null|\n", "2302.08706": "|**2023-02-20**|**Fine-grained Cross-modal Fusion based Refinement for Text-to-Image Synthesis**|Haoran Sun et.al.|[2302.08706v2](http://arxiv.org/abs/2302.08706v2)|**[link](https://github.com/haoranhfut/ff-gan)**|\n", "2302.08670": "|**2023-02-17**|**Cascaded information enhancement and cross-modal attention feature fusion for multispectral pedestrian detection**|Yang Yang et.al.|[2302.08670v1](http://arxiv.org/abs/2302.08670v1)|null|\n", "2302.09302": "|**2023-02-16**|**Bridge the Gap between Language models and Tabular Understanding**|Nuo Chen et.al.|[2302.09302v1](http://arxiv.org/abs/2302.09302v1)|null|\n", "2302.08326": "|**2023-02-16**|**NUAA-QMUL-AIIT at Memotion 3: Multi-modal Fusion with Squeeze-and-Excitation for Internet Meme Emotion Analysis**|Xiaoyu Guo et.al.|[2302.08326v1](http://arxiv.org/abs/2302.08326v1)|**[link](https://github.com/xxxxxxxxy/memotion3-SEFusion)**|\n", "2302.08212": "|**2023-02-16**|**Visible-Infrared Person Re-Identification via Patch-Mixed Cross-Modality Learning**|Zhihao Qian et.al.|[2302.08212v1](http://arxiv.org/abs/2302.08212v1)|null|\n", "2302.08180": "|**2023-02-16**|**Cross Modal Distillation for Flood Extent Mapping**|Shubhika Garg et.al.|[2302.08180v1](http://arxiv.org/abs/2302.08180v1)|null|\n", "2302.08052": "|**2023-02-16**|**Hierarchical Cross-modal Transformer for RGB-D Salient Object Detection**|Hao Chen et.al.|[2302.08052v1](http://arxiv.org/abs/2302.08052v1)|null|\n", "2302.08020": "|**2023-02-16**|**All-Electrical Skyrmionic Bits in a Chiral Magnetic Tunnel Junction**|Shaohai Chen et.al.|[2302.08020v1](http://arxiv.org/abs/2302.08020v1)|null|\n", "2302.08016": "|**2023-02-16**|**Unsupervised Domain Adaptation for MRI Volume Segmentation and Classification Using Image-to-Image Translation**|Satoshi Kondo et.al.|[2302.08016v1](http://arxiv.org/abs/2302.08016v1)|null|\n", "2302.07919": "|**2023-02-15**|**COVID-VTS: Fact Extraction and Verification on Short Video Platforms**|Fuxiao Liu et.al.|[2302.07919v1](http://arxiv.org/abs/2302.07919v1)|**[link](https://github.com/fuxiaoliu/twitter-video-dataset)**|\n", "2302.07702": "|**2023-02-15**|**Audio-Visual Contrastive Learning with Temporal Self-Supervision**|Simon Jenni et.al.|[2302.07702v1](http://arxiv.org/abs/2302.07702v1)|null|\n", "2302.07693": "|**2023-02-16**|**Fine-tuning of sign language recognition models: a technical report**|Maxim Novopoltsev et.al.|[2302.07693v2](http://arxiv.org/abs/2302.07693v2)|**[link](https://github.com/ds-hub-sochi/sl-techreport)**|\n", "2302.07661": "|**2023-02-15**|**Depth- and Semantics-aware Multi-modal Domain Translation: Generating 3D Panoramic Color Images from LiDAR Point Clouds**|Tiago Cortinhal et.al.|[2302.07661v1](http://arxiv.org/abs/2302.07661v1)|**[link](https://github.com/tiagocortinhal/titan-next)**|\n", "2302.07456": "|**2023-02-15**|**Continuous-Time Fixed-Lag Smoothing for LiDAR-Inertial-Camera SLAM**|Jiajun Lv et.al.|[2302.07456v1](http://arxiv.org/abs/2302.07456v1)|**[link](https://github.com/april-zju/clic)**|\n", "2302.07269": "|**2023-02-14**|**Dual-mode adaptive-SVD ghost imaging**|Dajing Wang et.al.|[2302.07269v1](http://arxiv.org/abs/2302.07269v1)|null|\n", "2302.06914": "|**2023-02-14**|**Heterogeneous Anomaly Detection for Software Systems via Semi-supervised Cross-modal Attention**|Cheryl Lee et.al.|[2302.06914v1](http://arxiv.org/abs/2302.06914v1)|**[link](https://github.com/bebillionaireusd/hades)**|\n", "2302.10909": "|**2023-02-14**|**Multi-modal Machine Learning in Engineering Design: A Review and Future Directions**|Binyang Song et.al.|[2302.10909v1](http://arxiv.org/abs/2302.10909v1)|null|\n", "2302.06643": "|**2023-02-13**|**Vision-RADAR fusion for Robotics BEV Detections: A Survey**|Apoorv Singh et.al.|[2302.06643v1](http://arxiv.org/abs/2302.06643v1)|null|\n", "2302.06605": "|**2023-02-13**|**UniAdapter: Unified Parameter-Efficient Transfer Learning for Cross-modal Modeling**|Haoyu Lu et.al.|[2302.06605v1](http://arxiv.org/abs/2302.06605v1)|**[link](https://github.com/rerv/uniadapter)**|\n", "2302.06560": "|**2023-02-13**|**Large Scale Multi-Lingual Multi-Modal Summarization Dataset**|Yash Verma et.al.|[2302.06560v1](http://arxiv.org/abs/2302.06560v1)|**[link](https://github.com/anubhav-jangra/m3ls)**|\n", "2302.06452": "|**2023-02-13**|**Mixed Multi-Model Semantic Interaction for Graph-based Narrative Visualizations**|Brian Keith Norambuena et.al.|[2302.06452v1](http://arxiv.org/abs/2302.06452v1)|null|\n", "2302.06350": "|**2023-02-13**|**CLIP-RR: Improved CLIP Network for Relation-Focused Cross-Modal Information Retrieval**|Yan Gong et.al.|[2302.06350v1](http://arxiv.org/abs/2302.06350v1)|null|\n", "2302.06148": "|**2023-02-13**|**CoMAE: Single Model Hybrid Pre-training on Small-Scale RGB-D Datasets**|Jiange Yang et.al.|[2302.06148v1](http://arxiv.org/abs/2302.06148v1)|**[link](https://github.com/mcg-nju/comae)**|\n", "2302.12816": "|**2023-02-24**|**Floquet Analysis of Frequency Collisions**|Kentaro Heya et.al.|[2302.12816v1](http://arxiv.org/abs/2302.12816v1)|null|\n", "2302.12610": "|**2023-02-24**|**A Joint Modeling of Vision-Language-Action for Target-oriented Grasping in Clutter**|Kechun Xu et.al.|[2302.12610v1](http://arxiv.org/abs/2302.12610v1)|**[link](https://github.com/xukechun/Vision-Language-Grasping)**|\n", "2302.12552": "|**2023-02-24**|**Deep Learning for Video-Text Retrieval: a Review**|Cunjuan Zhu et.al.|[2302.12552v1](http://arxiv.org/abs/2302.12552v1)|null|\n", "2302.12258": "|**2023-02-23**|**Data leakage in cross-modal retrieval training: A case study**|Benno Weck et.al.|[2302.12258v1](http://arxiv.org/abs/2302.12258v1)|null|\n", "2302.14045": "|**2023-02-27**|**Language Is Not All You Need: Aligning Perception with Language Models**|Shaohan Huang et.al.|[2302.14045v1](http://arxiv.org/abs/2302.14045v1)|**[link](https://github.com/microsoft/unilm)**|\n", "2302.14042": "|**2023-02-27**|**Knowledge-enhanced Pre-training for Auto-diagnosis of Chest Radiology Images**|Xiaoman Zhang et.al.|[2302.14042v1](http://arxiv.org/abs/2302.14042v1)|null|\n", "2302.14007": "|**2023-02-27**|**Joint-MAE: 2D-3D Joint Masked Autoencoders for 3D Point Cloud Pre-training**|Ziyu Guo et.al.|[2302.14007v1](http://arxiv.org/abs/2302.14007v1)|null|\n", "2302.13838": "|**2023-02-27**|**Cross-modal Face- and Voice-style Transfer**|Naoya Takahashi et.al.|[2302.13838v1](http://arxiv.org/abs/2302.13838v1)|null|\n", "2302.13668": "|**2023-02-27**|**Contrastive Video Question Answering via Video Graph Transformer**|Junbin Xiao et.al.|[2302.13668v1](http://arxiv.org/abs/2302.13668v1)|**[link](https://github.com/doc-doc/covgt)**|\n", "2302.13321": "|**2023-02-26**|**Multi-Modality in Music: Predicting Emotion in Music from High-Level Audio Features and Lyrics**|Tibor Krols et.al.|[2302.13321v1](http://arxiv.org/abs/2302.13321v1)|**[link](https://github.com/tibor-krols/cogsci2-spotify)**|\n", "2302.13311": "|**2023-02-26**|**Understanding Social Media Cross-Modality Discourse in Linguistic Space**|Chunpu Xu et.al.|[2302.13311v1](http://arxiv.org/abs/2302.13311v1)|**[link](https://github.com/cpaaax/multimodal_discourse)**|\n", "2302.13187": "|**2023-02-25**|**Tractable Diversity: Scalable Multiperspective Ontology Management via Standpoint EL**|Luc\u00eda G\u00f3mez \u00c1lvarez et.al.|[2302.13187v1](http://arxiv.org/abs/2302.13187v1)|null|\n", "2302.13094": "|**2023-02-25**|**Knowledge-infused Contrastive Learning for Urban Imagery-based Socioeconomic Prediction**|Yu Liu et.al.|[2302.13094v1](http://arxiv.org/abs/2302.13094v1)|**[link](https://github.com/tsinghua-fib-lab/urbankg-knowcl)**|\n", "2302.12971": "|**2023-02-25**|**BrainCLIP: Bridging Brain and Visual-Linguistic Representation via CLIP for Generic Natural Visual Stimulus Decoding from fMRI**|Yulong Liu et.al.|[2302.12971v1](http://arxiv.org/abs/2302.12971v1)|**[link](https://github.com/YulongBonjour/BrainCLIP)**|\n", "2302.14785": "|**2023-02-28**|**Joint Representations of Text and Knowledge Graphs for Retrieval and Evaluation**|Teven Le Scao et.al.|[2302.14785v1](http://arxiv.org/abs/2302.14785v1)|null|\n", "2302.14777": "|**2023-02-28**|**VQA with Cascade of Self- and Co-Attention Blocks**|Aakansha Mishra et.al.|[2302.14777v1](http://arxiv.org/abs/2302.14777v1)|null|\n", "2302.14564": "|**2023-02-28**|**Exploring Self-supervised Pre-trained ASR Models For Dysarthric and Elderly Speech Recognition**|Shujie Hu et.al.|[2302.14564v1](http://arxiv.org/abs/2302.14564v1)|null|\n", "2302.14418": "|**2023-02-28**|**PCR-CG: Point Cloud Registration via Deep Color and Geometry**|Yu Zhang et.al.|[2302.14418v1](http://arxiv.org/abs/2302.14418v1)|**[link](https://github.com/gardlin/pcr-cg)**|\n", "2302.14264": "|**2023-02-28**|**RGB-D Grasp Detection via Depth Guided Learning with Cross-modal Attention**|Ran Qin et.al.|[2302.14264v1](http://arxiv.org/abs/2302.14264v1)|null|\n", "2302.14115": "|**2023-02-27**|**Vid2Seq: Large-Scale Pretraining of a Visual Language Model for Dense Video Captioning**|Antoine Yang et.al.|[2302.14115v1](http://arxiv.org/abs/2302.14115v1)|**[link](https://github.com/google-research/scenic/tree/main/scenic/projects/vid2seq)**|\n", "2302.14082": "|**2023-02-27**|**Detecting and Mitigating Mode-Collapse for Flow-based Sampling of Lattice Field Theories**|Kim A. Nicoli et.al.|[2302.14082v1](http://arxiv.org/abs/2302.14082v1)|null|\n", "2303.00720": "|**2023-03-01**|**Cross-Modal Entity Matching for Visually Rich Documents**|Ritesh Sarkhel et.al.|[2303.00720v1](http://arxiv.org/abs/2303.00720v1)|null|\n", "2303.00534": "|**2023-03-01**|**RAMM: Retrieval-augmented Biomedical Visual Question Answering with Multi-modal Pre-training**|Zheng Yuan et.al.|[2303.00534v1](http://arxiv.org/abs/2303.00534v1)|**[link](https://github.com/GanjinZero/RAMM)**|\n", "2303.00462": "|**2023-03-02**|**Hidden Gems: 4D Radar Scene Flow Learning Using Cross-Modal Supervision**|Fangqiang Ding et.al.|[2303.00462v2](http://arxiv.org/abs/2303.00462v2)|**[link](https://github.com/toytiny/cmflow)**|\n", "2303.00448": "|**2023-03-01**|**The style transformer with common knowledge optimization for image-text retrieval**|Wenrui Li et.al.|[2303.00448v1](http://arxiv.org/abs/2303.00448v1)|null|\n", "2303.00369": "|**2023-03-02**|**Indescribable Multi-modal Spatial Evaluator**|Lingke Kong et.al.|[2303.00369v2](http://arxiv.org/abs/2303.00369v2)|**[link](https://github.com/kid-liet/imse)**|\n", "2303.00289": "|**2023-03-01**|**StrucTexTv2: Masked Visual-Textual Prediction for Document Image Pre-training**|Yuechen Yu et.al.|[2303.00289v1](http://arxiv.org/abs/2303.00289v1)|**[link](https://github.com/PaddlePaddle/VIMER/tree/main/StrucTexT/v2)**|\n", "2303.00277": "|**2023-03-01**|**UAV Tracking with Lidar as a Camera Sensors in GNSS-Denied Environments**|Ha Sier et.al.|[2303.00277v1](http://arxiv.org/abs/2303.00277v1)|**[link](https://github.com/tiers/uav-tracking-based-on-lidar-as-a-camera)**|\n", "2303.00233": "|**2023-03-01**|**Single-Cell Multimodal Prediction via Transformers**|Wenzhuo Tang et.al.|[2303.00233v1](http://arxiv.org/abs/2303.00233v1)|**[link](https://github.com/omicsml/scmoformer)**|\n", "2303.00200": "|**2023-03-01**|**Feature Extraction Matters More: Universal Deepfake Disruption through Attacking Ensemble Feature Extractors**|Long Tang et.al.|[2303.00200v1](http://arxiv.org/abs/2303.00200v1)|null|\n", "2303.00073": "|**2023-02-28**|**Cross-correlated quantum thermometry using diamond containing dual-defect centers**|Madhav Gupta et.al.|[2303.00073v1](http://arxiv.org/abs/2303.00073v1)|null|\n", "2303.00040": "|**2023-02-28**|**Towards Generalisable Video Moment Retrieval: Visual-Dynamic Injection to Image-Text Pre-Training**|Dezhao Luo et.al.|[2303.00040v1](http://arxiv.org/abs/2303.00040v1)|null|\n", "2303.01480": "|**2023-03-02**|**Delivering Arbitrary-Modal Semantic Segmentation**|Jiaming Zhang et.al.|[2303.01480v1](http://arxiv.org/abs/2303.01480v1)|**[link](https://github.com/jamycheung/DELIVER)**|\n", "2303.01311": "|**2023-03-02**|**Zero-Shot Text-to-Parameter Translation for Game Character Auto-Creation**|Rui Zhao et.al.|[2303.01311v1](http://arxiv.org/abs/2303.01311v1)|null|\n", "2303.01310": "|**2023-03-02**|**Learning Language-Conditioned Deformable Object Manipulation with Graph Dynamics**|Kai Mo et.al.|[2303.01310v1](http://arxiv.org/abs/2303.01310v1)|null|\n", "2303.01217": "|**2023-03-02**|**Synthetic Misinformers: Generating and Combating Multimodal Misinformation**|Stefanos-Iordanis Papadopoulos et.al.|[2303.01217v1](http://arxiv.org/abs/2303.01217v1)|null|\n", "2303.01043": "|**2023-03-02**|**I2P-Rec: Recognizing Images on Large-scale Point Cloud Maps through Bird's Eye View Projections**|Yixuan Li et.al.|[2303.01043v1](http://arxiv.org/abs/2303.01043v1)|null|\n", "2303.00882": "|**2023-03-02**|**X-Ray2EM: Uncertainty-Aware Cross-Modality Image Reconstruction from X-Ray to Electron Microscopy in Connectomics**|Yicong Li et.al.|[2303.00882v1](http://arxiv.org/abs/2303.00882v1)|null|\n", "2303.00865": "|**2023-03-01**|**AMIGO: Sparse Multi-Modal Graph Transformer with Shared-Context Processing for Representation Learning of Giga-pixel Images**|Ramin Nakhli et.al.|[2303.00865v1](http://arxiv.org/abs/2303.00865v1)|**[link](https://github.com/raminnakhli/amigo)**|\n", "2303.00806": "|**2023-03-01**|**Survival modelling of smartphone trigger data for earthquake parameter estimation in early warning. With applications to 2023 Turkish-Syrian and 2019 Ridgecrest events**|Luca Aiello et.al.|[2303.00806v1](http://arxiv.org/abs/2303.00806v1)|null|\n", "2303.02139": "|**2023-03-03**|**Data Association Aware POMDP Planning with Hypothesis Pruning Performance Guarantees**|Moran Barenboim et.al.|[2303.02139v1](http://arxiv.org/abs/2303.02139v1)|null|\n", "2303.01933": "|**2023-03-03**|**BogieCopter: A Multi-Modal Aerial-Ground Vehicle for Long-Endurance Inspection Applications**|Teodoro Dias et.al.|[2303.01933v1](http://arxiv.org/abs/2303.01933v1)|null|\n", "2303.01510": "|**2023-03-02**|**INO at Factify 2: Structure Coherence based Multi-Modal Fact Verification**|Yinuo Zhang et.al.|[2303.01510v1](http://arxiv.org/abs/2303.01510v1)|**[link](https://github.com/catrin-baze/ino-of-factify)**|\n", "2303.03378": "|**2023-03-06**|**PaLM-E: An Embodied Multimodal Language Model**|Danny Driess et.al.|[2303.03378v1](http://arxiv.org/abs/2303.03378v1)|null|\n", "2303.03131": "|**2023-03-08**|**Video Question Answering Using CLIP-Guided Visual-Text Attention**|Shuhong Ye et.al.|[2303.03131v2](http://arxiv.org/abs/2303.03131v2)|null|\n", "2303.03093": "|**2023-03-06**|**A Miniaturised Camera-based Multi-Modal Tactile Sensor**|Kaspar Althoefer et.al.|[2303.03093v1](http://arxiv.org/abs/2303.03093v1)|null|\n", "2303.03056": "|**2023-03-07**|**MOISST: Multi-modal Optimization of Implicit Scene for SpatioTemporal calibration**|Quentin Herau et.al.|[2303.03056v2](http://arxiv.org/abs/2303.03056v2)|null|\n", "2303.03032": "|**2023-03-06**|**DeCap: Decoding CLIP Latents for Zero-Shot Captioning via Text-Only Training**|Wei Li et.al.|[2303.03032v1](http://arxiv.org/abs/2303.03032v1)|**[link](https://github.com/dhg-wei/decap)**|\n", "2303.02995": "|**2023-03-06**|**HiCLIP: Contrastive Language-Image Pretraining with Hierarchy-aware Attention**|Shijie Geng et.al.|[2303.02995v1](http://arxiv.org/abs/2303.02995v1)|**[link](https://github.com/jeykigung/hiclip)**|\n", "2303.02976": "|**2023-03-06**|**Dronument: System for Reliable Deployment of Micro Aerial Vehicles in Dark Areas of Large Historical Monuments**|Pavel Petracek et.al.|[2303.02976v1](http://arxiv.org/abs/2303.02976v1)|null|\n", "2303.02688": "|**2023-03-05**|**Text2Face: A Multi-Modal 3D Face Model**|Will Rowan et.al.|[2303.02688v1](http://arxiv.org/abs/2303.02688v1)|null|\n", "2303.02684": "|**2023-03-05**|**Robust Multi-Modal Multi-LiDAR-Inertial Odometry and Mapping for Indoor Environments**|Li Qingqing et.al.|[2303.02684v1](http://arxiv.org/abs/2303.02684v1)|**[link](https://github.com/tiers/multi-modal-loam)**|\n", "2303.02506": "|**2023-03-04**|**Prismer: A Vision-Language Model with An Ensemble of Experts**|Shikun Liu et.al.|[2303.02506v1](http://arxiv.org/abs/2303.02506v1)|**[link](https://github.com/nvlabs/prismer)**|\n", "2303.02483": "|**2023-03-04**|**FAME-ViL: Multi-Tasking Vision-Language Model for Heterogeneous Fashion Tasks**|Xiao Han et.al.|[2303.02483v1](http://arxiv.org/abs/2303.02483v1)|**[link](https://github.com/brandonhanx/fame-vil)**|\n", "2303.02479": "|**2023-03-04**|**Chronic Kidney Disease of Unknown Aetiolgy (CKDu)-the search for causes and the impact of its politicization**|Chandre Dharma-wardana et.al.|[2303.02479v1](http://arxiv.org/abs/2303.02479v1)|null|\n", "2303.02407": "|**2023-03-04**|**Local Navigation Among Movable Obstacles with Deep Reinforcement Learning**|Linghong Yao et.al.|[2303.02407v1](http://arxiv.org/abs/2303.02407v1)|null|\n", "2303.02323": "|**2023-03-04**|**APE: An Open and Shared Annotated Dataset for Learning Urban Pedestrian Path Networks**|Yuxiang Zhang et.al.|[2303.02323v1](http://arxiv.org/abs/2303.02323v1)|null|\n", "2303.02203": "|**2023-03-03**|**X$^3$KD: Knowledge Distillation Across Modalities, Tasks and Stages for Multi-Camera 3D Object Detection**|Marvin Klingner et.al.|[2303.02203v1](http://arxiv.org/abs/2303.02203v1)|null|\n", "2303.03991": "|**2023-03-07**|**OpenOccupancy: A Large Scale Benchmark for Surrounding Semantic Occupancy Perception**|Xiaofeng Wang et.al.|[2303.03991v1](http://arxiv.org/abs/2303.03991v1)|**[link](https://github.com/jeffwang987/openoccupancy)**|\n", "2303.03878": "|**2023-03-07**|**A convergence analysis of a structure-preserving gradient flow method for the all-electron Kohn-Sham model**|Yedan Shen et.al.|[2303.03878v1](http://arxiv.org/abs/2303.03878v1)|null|\n", "2303.03595": "|**2023-03-07**|**LoGoNet: Towards Accurate 3D Object Detection with Local-to-Global Cross-Modal Fusion**|Xin Li et.al.|[2303.03595v1](http://arxiv.org/abs/2303.03595v1)|**[link](https://github.com/sankin97/logonet)**|\n", "2303.03449": "|**2023-03-06**|**Dual-encoded magnetization transfer and diffusion imaging and its application to tract-specific microstructure mapping**|Ilana R Leppert et.al.|[2303.03449v1](http://arxiv.org/abs/2303.03449v1)|**[link](https://github.com/tardiflab/mt-diff)**|\n", "2303.04748": "|**2023-03-08**|**CLIP-FO3D: Learning Free Open-world 3D Scene Representations from 2D Dense CLIP**|Junbo Zhang et.al.|[2303.04748v1](http://arxiv.org/abs/2303.04748v1)|null|\n", "2303.04585": "|**2023-03-08**|**New Audio Representations Image Gan Generation from BriVL**|Sen Fang et.al.|[2303.04585v1](http://arxiv.org/abs/2303.04585v1)|**[link](https://github.com/fangsen9000/brivl-generation)**|\n", "2303.04439": "|**2023-03-08**|**A Light Weight Model for Active Speaker Detection**|Junhua Liao et.al.|[2303.04439v1](http://arxiv.org/abs/2303.04439v1)|**[link](https://github.com/junhua-liao/light-asd)**|\n", "2303.04398": "|**2023-03-08**|**Implications of Personality on Cognitive Workload, Affect, and Task Performance in Robot Remote Control**|Go-Eum Cha et.al.|[2303.04398v1](http://arxiv.org/abs/2303.04398v1)|null|\n", "2303.04364": "|**2023-03-08**|**Dynamic Scenario Representation Learning for Motion Forecasting with Heterogeneous Graph Convolutional Recurrent Networks**|Xing Gao et.al.|[2303.04364v1](http://arxiv.org/abs/2303.04364v1)|null|\n", "2303.05499": "|**2023-03-10**|**Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection**|Shilong Liu et.al.|[2303.05499v2](http://arxiv.org/abs/2303.05499v2)|**[link](https://github.com/idea-research/groundingdino)**|\n", "2303.05338": "|**2023-03-11**|**MMCosine: Multi-Modal Cosine Loss Towards Balanced Audio-Visual Fine-Grained Learning**|Ruize Xu et.al.|[2303.05338v2](http://arxiv.org/abs/2303.05338v2)|null|\n", "2303.05313": "|**2023-03-09**|**Replacement as a Self-supervision for Fine-grained Vision-language Pre-training**|Lisai Zhang et.al.|[2303.05313v1](http://arxiv.org/abs/2303.05313v1)|null|\n", "2303.05309": "|**2023-03-09**|**MixSpeech: Cross-Modality Self-Learning with Audio-Visual Stream Mixup for Visual Speech Translation and Recognition**|Xize Cheng et.al.|[2303.05309v1](http://arxiv.org/abs/2303.05309v1)|**[link](https://github.com/exgc/avmust-ted)**|\n", "2303.05193": "|**2023-03-09**|**GOATS: Goal Sampling Adaptation for Scooping with Curriculum Reinforcement Learning**|Yaru Niu et.al.|[2303.05193v1](http://arxiv.org/abs/2303.05193v1)|null|\n", "2303.05093": "|**2023-03-09**|**Improving Video Retrieval by Adaptive Margin**|Feng He et.al.|[2303.05093v1](http://arxiv.org/abs/2303.05093v1)|null|\n", "2303.05026": "|**2023-03-09**|**SSL^2: Self-Supervised Learning meets Semi-Supervised Learning: Multiple Sclerosis Segmentation in 7T-MRI from large-scale 3T-MRI**|Jiacheng Wang et.al.|[2303.05026v1](http://arxiv.org/abs/2303.05026v1)|null|\n", "2303.04955": "|**2023-03-09**|**Exploring Smart Commercial Building Occupants' Perceptions and Notification Preferences of Internet of Things Data Collection in the United States**|Tu Le et.al.|[2303.04955v1](http://arxiv.org/abs/2303.04955v1)|null|\n", "2303.06129": "|**2023-03-10**|**Single-branch Network for Multimodal Training**|Muhammad Saad Saeed et.al.|[2303.06129v1](http://arxiv.org/abs/2303.06129v1)|**[link](https://github.com/msaadsaeed/sbnet)**|\n", "2303.05952": "|**2023-03-10**|**Understanding and Constructing Latent Modality Structures in Multi-modal Representation Learning**|Qian Jiang et.al.|[2303.05952v1](http://arxiv.org/abs/2303.05952v1)|null|\n", "2303.05936": "|**2023-03-10**|**Learning Decoupled Multi-touch Force Estimation, Localization and Stretch for Soft Capacitive E-skin**|Abu Bakar Dawood et.al.|[2303.05936v1](http://arxiv.org/abs/2303.05936v1)|null|\n", "2303.05793": "|**2023-03-10**|**Analyzing covariate clustering effects in healthcare cost subgroups: insights and applications for prediction**|Zhengxiao Li et.al.|[2303.05793v1](http://arxiv.org/abs/2303.05793v1)|**[link](https://github.com/huangyf2217/fmr-covariates-clustering)**|\n", "2303.05725": "|**2023-03-10**|**CVT-SLR: Contrastive Visual-Textual Transformation for Sign Language Recognition with Variational Alignment**|Jiangbin Zheng et.al.|[2303.05725v1](http://arxiv.org/abs/2303.05725v1)|**[link](https://github.com/binbinjiang/cvt-slr)**|\n", "2303.05714": "|**2023-03-10**|**Simultaneous estimation of multiple eigenvalues with short-depth quantum circuit on early fault-tolerant quantum computers**|Zhiyan Ding et.al.|[2303.05714v1](http://arxiv.org/abs/2303.05714v1)|null|\n", "2303.05707": "|**2023-03-10**|**MuLTI: Efficient Video-and-Language Understanding with MultiWay-Sampler and Multiple Choice Modeling**|Jiaqi Xu et.al.|[2303.05707v1](http://arxiv.org/abs/2303.05707v1)|null|\n", "2303.07284": "|**2023-03-13**|**Align and Attend: Multimodal Summarization with Dual Contrastive Losses**|Bo He et.al.|[2303.07284v1](http://arxiv.org/abs/2303.07284v1)|**[link](https://github.com/boheumd/A2Summ)**|\n", "2303.07274": "|**2023-03-14**|**Breaking Common Sense: WHOOPS! A Vision-and-Language Benchmark of Synthetic and Compositional Images**|Nitzan Bitton-Guetta et.al.|[2303.07274v2](http://arxiv.org/abs/2303.07274v2)|null|\n", "2303.07265": "|**2023-03-13**|**Multimodal Reinforcement Learning for Robots Collaborating with Humans**|Afagh Mehri Shervedani et.al.|[2303.07265v1](http://arxiv.org/abs/2303.07265v1)|null|\n", "2303.07064": "|**2023-03-13**|**A Generalized Multi-Modal Fusion Detection Framework**|Leichao Cui et.al.|[2303.07064v1](http://arxiv.org/abs/2303.07064v1)|null|\n", "2303.07000": "|**2023-03-13**|**Predicting Density of States via Multi-modal Transformer**|Namkyeong Lee et.al.|[2303.07000v1](http://arxiv.org/abs/2303.07000v1)|**[link](https://github.com/heewoongnoh/dostransformer)**|\n", "2303.06947": "|**2023-03-13**|**A Multi-Modal Simulation Framework to Enable Digital Twin-based V2X Communications in Dynamic Environments**|Lorenzo Cazzella et.al.|[2303.06947v1](http://arxiv.org/abs/2303.06947v1)|null|\n", "2303.06840": "|**2023-03-13**|**DDFM: Denoising Diffusion Model for Multi-Modality Image Fusion**|Zixiang Zhao et.al.|[2303.06840v1](http://arxiv.org/abs/2303.06840v1)|**[link](https://github.com/zhaozixiang1228/mmif-ddfm)**|\n", "2303.06662": "|**2023-03-12**|**Fuzzy Alignments in Directed Acyclic Graph for Non-Autoregressive Machine Translation**|Zhengrui Ma et.al.|[2303.06662v1](http://arxiv.org/abs/2303.06662v1)|**[link](https://github.com/ictnlp/fa-dat)**|\n", "2303.06555": "|**2023-03-12**|**One Transformer Fits All Distributions in Multi-Modal Diffusion at Scale**|Fan Bao et.al.|[2303.06555v1](http://arxiv.org/abs/2303.06555v1)|**[link](https://github.com/thu-ml/unidiffuser)**|\n", "2303.06536": "|**2023-03-12**|**AutoOptLib: A Library of Automatically Designing Metaheuristic Optimization Algorithms in MATLAB**|Qi Zhao et.al.|[2303.06536v1](http://arxiv.org/abs/2303.06536v1)|**[link](https://github.com/qz89/AutoOpt)**|\n", "2303.06464": "|**2023-03-11**|**PARASOL: Parametric Style Control for Diffusion Image Synthesis**|Gemma Canet Tarr\u00e9s et.al.|[2303.06464v1](http://arxiv.org/abs/2303.06464v1)|null|\n", "2303.06422": "|**2023-03-11**|**An approximate control variates approach to multifidelity distribution estimation**|Ruijian Han et.al.|[2303.06422v1](http://arxiv.org/abs/2303.06422v1)|null|\n", "2303.06398": "|**2023-03-11**|**Variational Gaussian filtering via Wasserstein gradient flows**|Adrie Corenflos et.al.|[2303.06398v1](http://arxiv.org/abs/2303.06398v1)|**[link](https://github.com/hanyas/wasserstein-flow-filter)**|\n", "2303.06378": "|**2023-03-11**|**Learning Grounded Vision-Language Representation for Versatile Understanding in Untrimmed Videos**|Teng Wang et.al.|[2303.06378v1](http://arxiv.org/abs/2303.06378v1)|**[link](https://github.com/zjr2000/gvl)**|\n", "2303.06345": "|**2023-03-11**|**Semantics-Aware Dynamic Localization and Refinement for Referring Image Segmentation**|Zhao Yang et.al.|[2303.06345v1](http://arxiv.org/abs/2303.06345v1)|null|\n", "2303.08129": "|**2023-03-14**|**PiMAE: Point Cloud and Image Interactive Masked Autoencoders for 3D Object Detection**|Anthony Chen et.al.|[2303.08129v1](http://arxiv.org/abs/2303.08129v1)|**[link](https://github.com/blvlab/pimae)**|\n", "2303.08054": "|**2023-03-15**|**Statistical Hardware Design With Multi-model Active Learning**|Alireza Ghaffari et.al.|[2303.08054v2](http://arxiv.org/abs/2303.08054v2)|null|\n", "2303.08017": "|**2023-03-14**|**Reliable Beamforming at Terahertz Bands: Are Causal Representations the Way Forward?**|Christo Kurisummoottil Thomas et.al.|[2303.08017v1](http://arxiv.org/abs/2303.08017v1)|null|\n", "2303.07896": "|**2023-03-16**|**Exploring Weakly Supervised Semantic Segmentation Ensembles for Medical Imaging Systems**|Erik Ostrowski et.al.|[2303.07896v2](http://arxiv.org/abs/2303.07896v2)|**[link](https://github.com/erikostrowski/automated_ensemble)**|\n", "2303.07775": "|**2023-03-14**|**Data-Free Sketch-Based Image Retrieval**|Abhra Chaudhuri et.al.|[2303.07775v1](http://arxiv.org/abs/2303.07775v1)|**[link](https://github.com/abhrac/data-free-sbir)**|\n", "2303.07748": "|**2023-03-14**|**Generation-Guided Multi-Level Unified Network for Video Grounding**|Xing Cheng et.al.|[2303.07748v1](http://arxiv.org/abs/2303.07748v1)|null|\n", "2303.07742": "|**2023-03-14**|**ForDigitStress: A multi-modal stress dataset employing a digital job interview scenario**|Alexander Heimerl et.al.|[2303.07742v1](http://arxiv.org/abs/2303.07742v1)|null|\n", "2303.07674": "|**2023-03-14**|**Koos Classification of Vestibular Schwannoma via Image Translation-Based Unsupervised Cross-Modality Domain Adaptation**|Tao Yang et.al.|[2303.07674v1](http://arxiv.org/abs/2303.07674v1)|null|\n", "2303.07667": "|**2023-03-14**|**Improving Music Genre Classification from multi-modal properties of music and genre correlations Perspective**|Ganghui Ru et.al.|[2303.07667v1](http://arxiv.org/abs/2303.07667v1)|null|\n", "2303.07647": "|**2023-03-15**|**Recent Advances and Applications of Machine Learning in Experimental Solid Mechanics: A Review**|Hanxun Jin et.al.|[2303.07647v2](http://arxiv.org/abs/2303.07647v2)|null|\n", "2303.07601": "|**2023-03-14**|**V2V4Real: A Real-world Large-scale Dataset for Vehicle-to-Vehicle Cooperative Perception**|Runsheng Xu et.al.|[2303.07601v1](http://arxiv.org/abs/2303.07601v1)|**[link](https://github.com/ucla-mobility/v2v4real)**|\n", "2303.07543": "|**2023-03-14**|**WDiscOOD: Out-of-Distribution Detection via Whitened Linear Discriminative Analysis**|Yiye Chen et.al.|[2303.07543v1](http://arxiv.org/abs/2303.07543v1)|**[link](https://github.com/ivalab/wdiscood)**|\n", "2303.07522": "|**2023-03-13**|**Audio Visual Language Maps for Robot Navigation**|Chenguang Huang et.al.|[2303.07522v1](http://arxiv.org/abs/2303.07522v1)|null|\n", "2303.08692": "|**2023-03-15**|**SpiderMesh: Spatial-aware Demand-guided Recursive Meshing for RGB-T Semantic Segmentation**|Siqi Fan et.al.|[2303.08692v1](http://arxiv.org/abs/2303.08692v1)|**[link](https://github.com/leofansq/spidermesh)**|\n", "2303.08600": "|**2023-03-15**|**MSeg3D: Multi-modal 3D Semantic Segmentation for Autonomous Driving**|Jiale Li et.al.|[2303.08600v1](http://arxiv.org/abs/2303.08600v1)|**[link](https://github.com/jialeli1/lidarseg3d)**|\n", "2303.08562": "|**2023-03-15**|**MGA: Medical generalist agent through text-guided knowledge transformation**|Weijian Huang et.al.|[2303.08562v1](http://arxiv.org/abs/2303.08562v1)|null|\n", "2303.08518": "|**2023-03-15**|**UPRISE: Universal Prompt Retrieval for Improving Zero-Shot Evaluation**|Daixuan Cheng et.al.|[2303.08518v1](http://arxiv.org/abs/2303.08518v1)|**[link](https://github.com/microsoft/lmops)**|\n", "2303.08419": "|**2023-03-15**|**Multi-Modal Facial Expression Recognition with Transformer-Based Fusion Networks and Dynamic Sampling**|Jun-Hwa Kim et.al.|[2303.08419v1](http://arxiv.org/abs/2303.08419v1)|null|\n", "2303.08372": "|**2023-03-15**|**Target Sound Extraction with Variable Cross-modality Clues**|Chenda Li et.al.|[2303.08372v1](http://arxiv.org/abs/2303.08372v1)|**[link](https://github.com/lichenda/multi-clue-tse-data)**|\n", "2303.08367": "|**2023-03-15**|**Uncertainty-Aware Pedestrian Trajectory Prediction via Distributional Diffusion**|Yao Liu et.al.|[2303.08367v1](http://arxiv.org/abs/2303.08367v1)|null|\n", "2303.08359": "|**2023-03-15**|**Haptics-Enabled Forceps with Multi-Modal Force Sensing: Towards Task-Autonomous Robotic Surgery**|Tangyou Liu et.al.|[2303.08359v1](http://arxiv.org/abs/2303.08359v1)|null|\n", "2303.08356": "|**2023-03-15**|**Continuous emotion recognition based on TCN and Transformer**|Weiwei Zhou et.al.|[2303.08356v1](http://arxiv.org/abs/2303.08356v1)|**[link](https://github.com/upczww/abaw5)**|\n", "2303.09463": "|**2023-03-16**|**An Autonomous System for Head-to-Head Race: Design, Implementation and Analysis; Team KAIST at the Indy Autonomous Challenge**|Chanyoung Jung et.al.|[2303.09463v1](http://arxiv.org/abs/2303.09463v1)|null|\n", "2303.09381": "|**2023-03-16**|**Multi-modal Differentiable Unsupervised Feature Selection**|Junchen Yang et.al.|[2303.09381v1](http://arxiv.org/abs/2303.09381v1)|**[link](https://github.com/jcyang34/mmdufs)**|\n", "2303.09373": "|**2023-03-16**|**3D Masked Autoencoding and Pseudo-labeling for Domain Adaptive Segmentation of Heterogeneous Infant Brain MRI**|Xuzhe Zhang et.al.|[2303.09373v1](http://arxiv.org/abs/2303.09373v1)|null|\n", "2303.09367": "|**2023-03-16**|**Goal-conditioned Offline Reinforcement Learning through State Space Partitioning**|Mianchu Wang et.al.|[2303.09367v1](http://arxiv.org/abs/2303.09367v1)|null|\n", "2303.09319": "|**2023-03-16**|**Unified Multi-Modal Latent Diffusion for Joint Subject and Text Conditional Image Generation**|Yiyang Ma et.al.|[2303.09319v1](http://arxiv.org/abs/2303.09319v1)|null|\n", "2303.09270": "|**2023-03-16**|**SpectralCLIP: Preventing Artifacts in Text-Guided Style Transfer from a Spectral Perspective**|Zipeng Xu et.al.|[2303.09270v1](http://arxiv.org/abs/2303.09270v1)|**[link](https://github.com/zipengxuc/spectralclip)**|\n", "2303.09167": "|**2023-03-16**|**Emotional Reaction Intensity Estimation Based on Multimodal Data**|Shangfei Wang et.al.|[2303.09167v1](http://arxiv.org/abs/2303.09167v1)|null|\n", "2303.09119": "|**2023-03-16**|**Taming Diffusion Models for Audio-Driven Co-Speech Gesture Generation**|Lingting Zhu et.al.|[2303.09119v1](http://arxiv.org/abs/2303.09119v1)|**[link](https://github.com/advocate99/diffgesture)**|\n", "2303.09117": "|**2023-03-16**|**Visual-Linguistic Causal Intervention for Radiology Report Generation**|Weixing Chen et.al.|[2303.09117v1](http://arxiv.org/abs/2303.09117v1)|**[link](https://github.com/wissingchen/vlci)**|\n", "2303.08942": "|**2023-03-15**|**Spherical Space Feature Decomposition for Guided Depth Map Super-Resolution**|Zixiang Zhao et.al.|[2303.08942v1](http://arxiv.org/abs/2303.08942v1)|null|\n", "2303.10056": "|**2023-03-17**|**GlueGen: Plug and Play Multi-modal Encoders for X-to-image Generation**|Can Qin et.al.|[2303.10056v1](http://arxiv.org/abs/2303.10056v1)|**[link](https://github.com/salesforce/gluegen)**|\n", "2303.10033": "|**2023-03-17**|**Multi-modal Expression Recognition with Ensemble Method**|Chuanhe Liu et.al.|[2303.10033v1](http://arxiv.org/abs/2303.10033v1)|null|\n", "2303.09858": "|**2023-03-20**|**MedLocker: A Transferable Adversarial Watermarking for Preventing Unauthorized Analysis of Medical Image Dataset**|Bangzheng Pu et.al.|[2303.09858v2](http://arxiv.org/abs/2303.09858v2)|null|\n", "2303.09830": "|**2023-03-17**|**Prototype Knowledge Distillation for Medical Segmentation with Missing Modality**|Shuai Wang et.al.|[2303.09830v1](http://arxiv.org/abs/2303.09830v1)|**[link](https://github.com/sakurajimamaiii/protokd)**|\n", "2303.09825": "|**2023-03-17**|**LCE-Calib: Automatic LiDAR-Frame/Event Camera Extrinsic Calibration With A Globally Optimal Solution**|Jianhao Jiao et.al.|[2303.09825v1](http://arxiv.org/abs/2303.09825v1)|**[link](https://github.com/hkustgz-iadc/lcecalib)**|\n", "2303.09817": "|**2023-03-17**|**Hospital Length of Stay Prediction Based on Multi-modal Data towards Trustworthy Human-AI Collaboration in Radiomics**|Hubert Baniecki et.al.|[2303.09817v1](http://arxiv.org/abs/2303.09817v1)|**[link](https://github.com/modeloriented/survex)**|\n", "2303.09800": "|**2023-03-17**|**GOOD: General Optimization-based Fusion for 3D Object Detection via LiDAR-Camera Object Candidates**|Bingqi Shen et.al.|[2303.09800v1](http://arxiv.org/abs/2303.09800v1)|null|\n", "2303.09797": "|**2023-03-17**|**MMFace4D: A Large-Scale Multi-Modal 4D Face Dataset for Audio-Driven 3D Face Animation**|Haozhe Wu et.al.|[2303.09797v1](http://arxiv.org/abs/2303.09797v1)|null|\n", "2303.09756": "|**2023-03-17**|**Video Action Recognition with Attentive Semantic Units**|Yifei Chen et.al.|[2303.09756v1](http://arxiv.org/abs/2303.09756v1)|null|\n", "2303.09733": "|**2023-03-17**|**Scribble-Supervised RGB-T Salient Object Detection**|Zhengyi Liu et.al.|[2303.09733v1](http://arxiv.org/abs/2303.09733v1)|**[link](https://github.com/liuzywen/rgbtscribble-icme2023)**|\n", "2303.09695": "|**2023-03-17**|**PersonalTailor: Personalizing 2D Pattern Design from 3D Garment Point Clouds**|Anran Qi et.al.|[2303.09695v1](http://arxiv.org/abs/2303.09695v1)|null|\n", "2303.11181": "|**2023-03-20**|**Non-Markovian paths and cycles in NFT trades**|Haaroon Yousaf et.al.|[2303.11181v1](http://arxiv.org/abs/2303.11181v1)|null|\n", "2303.11090": "|**2023-03-20**|**Scene Graph Based Fusion Network For Image-Text Retrieval**|Guoliang Wang et.al.|[2303.11090v1](http://arxiv.org/abs/2303.11090v1)|null|\n", "2303.10895": "|**2023-03-20**|**Leapfrog Diffusion Model for Stochastic Trajectory Prediction**|Weibo Mao et.al.|[2303.10895v1](http://arxiv.org/abs/2303.10895v1)|**[link](https://github.com/mediabrain-sjtu/led)**|\n", "2303.10865": "|**2023-03-21**|**Rotating Objects via In-Hand Pivoting using Vision, Force and Touch**|Shiyu Xu et.al.|[2303.10865v2](http://arxiv.org/abs/2303.10865v2)|null|\n", "2303.10849": "|**2023-03-20**|**Facial Affective Analysis based on MAE and Multi-modal Information for 5th ABAW Competition**|Wei Zhang et.al.|[2303.10849v1](http://arxiv.org/abs/2303.10849v1)|null|\n", "2303.10839": "|**2023-03-21**|**MXM-CLR: A Unified Framework for Contrastive Learning of Multifold Cross-Modal Representations**|Ye Wang et.al.|[2303.10839v2](http://arxiv.org/abs/2303.10839v2)|null|\n", "2303.10835": "|**2023-03-20**|**Bifurcation analysis of the Keynesian cross model**|Xinyu Li et.al.|[2303.10835v1](http://arxiv.org/abs/2303.10835v1)|null|\n", "2303.10826": "|**2023-03-20**|**Visual Prompt Multi-Modal Tracking**|Jiawen Zhu et.al.|[2303.10826v1](http://arxiv.org/abs/2303.10826v1)|**[link](https://github.com/jiawen-zhu/vipt)**|\n", "2303.10794": "|**2023-03-19**|**PheME: A deep ensemble framework for improving phenotype prediction from multi-modal data**|Shenghan Zhang et.al.|[2303.10794v1](http://arxiv.org/abs/2303.10794v1)|null|\n", "2303.10766": "|**2023-03-21**|**Multi-modal reward for visual relationships-based image captioning**|Ali Abedi et.al.|[2303.10766v2](http://arxiv.org/abs/2303.10766v2)|null|\n", "2303.10667": "|**2023-03-19**|**Audio-Text Models Do Not Yet Leverage Natural Language**|Ho-Hsiang Wu et.al.|[2303.10667v1](http://arxiv.org/abs/2303.10667v1)|**[link](https://github.com/hohsiangwu/preposition-synthesis)**|\n", "2303.10590": "|**2023-03-19**|**Multi-modal Facial Action Unit Detection with Large Pre-trained Models for the 5th Competition on Affective Behavior Analysis in-the-wild**|Yufeng Yin et.al.|[2303.10590v1](http://arxiv.org/abs/2303.10590v1)|null|\n", "2303.10571": "|**2023-03-19**|**CLIP4MC: An RL-Friendly Vision-Language Model for Minecraft**|Ziluo Ding et.al.|[2303.10571v1](http://arxiv.org/abs/2303.10571v1)|**[link](https://github.com/PKU-RL/CLIP4MC)**|\n", "2303.10457": "|**2023-03-18**|**Multi-Modal Continual Test-Time Adaptation for 3D Semantic Segmentation**|Haozhi Cao et.al.|[2303.10457v1](http://arxiv.org/abs/2303.10457v1)|null|\n", "2303.10406": "|**2023-03-18**|**3DQD: Generalized Deep 3D Shape Prior via Part-Discretized Diffusion Process**|Yuhan Li et.al.|[2303.10406v1](http://arxiv.org/abs/2303.10406v1)|**[link](https://github.com/colorful-liyu/3dqd)**|\n", "2303.12060": "|**2023-03-21**|**VideoXum: Cross-modal Visual and Textural Summarization of Videos**|Jingyang Lin et.al.|[2303.12060v1](http://arxiv.org/abs/2303.12060v1)|null|\n", "2303.11771": "|**2023-03-21**|**Self-Sufficient Framework for Continuous Sign Language Recognition**|Youngjoon Jang et.al.|[2303.11771v1](http://arxiv.org/abs/2303.11771v1)|null|\n", "2303.11732": "|**2023-03-21**|**Multi-modal Prompting for Low-Shot Temporal Action Localization**|Chen Ju et.al.|[2303.11732v1](http://arxiv.org/abs/2303.11732v1)|null|\n", "2303.11625": "|**2023-03-21**|**Information-containing Adversarial Perturbation for Combating Facial Manipulation Systems**|Yao Zhu et.al.|[2303.11625v1](http://arxiv.org/abs/2303.11625v1)|null|\n", "2303.12501": "|**2023-03-22**|**Cross-Modal Implicit Relation Reasoning and Aligning for Text-to-Image Person Retrieval**|Ding Jiang et.al.|[2303.12501v1](http://arxiv.org/abs/2303.12501v1)|**[link](https://github.com/anosorae/irra)**|\n", "2303.12445": "|**2023-03-22**|**MEDIMP: Medical Images and Prompts for renal transplant representation learning**|Leo Milecki et.al.|[2303.12445v1](http://arxiv.org/abs/2303.12445v1)|**[link](https://github.com/leomlck/medimp)**|\n", "2303.12423": "|**2023-03-22**|**Text with Knowledge Graph Augmented Transformer for Video Captioning**|Xin Gu et.al.|[2303.12423v1](http://arxiv.org/abs/2303.12423v1)|null|\n", "2303.12419": "|**2023-03-22**|**BiCro: Noisy Correspondence Rectification for Multi-modality Data via Bi-directional Cross-modal Similarity Consistency**|Shuo Yang et.al.|[2303.12419v1](http://arxiv.org/abs/2303.12419v1)|**[link](https://github.com/xu5zhao/bicro)**|\n", "2303.12417": "|**2023-03-22**|**CLIP^2: Contrastive Language-Image-Point Pretraining from Real-World Point Cloud Data**|Yihan Zeng et.al.|[2303.12417v1](http://arxiv.org/abs/2303.12417v1)|null|\n", "2303.12379": "|**2023-03-22**|**VMCML: Video and Music Matching via Cross-Modality Lifting**|Yi-Shan Lee et.al.|[2303.12379v1](http://arxiv.org/abs/2303.12379v1)|null|\n", "2303.12112": "|**2023-03-21**|**Positive-Augmented Constrastive Learning for Image and Video Captioning Evaluation**|Sara Sarto et.al.|[2303.12112v1](http://arxiv.org/abs/2303.12112v1)|**[link](https://github.com/aimagelab/pacscore)**|\n", "2303.13471": "|**2023-03-23**|**Egocentric Audio-Visual Object Localization**|Chao Huang et.al.|[2303.13471v1](http://arxiv.org/abs/2303.13471v1)|**[link](https://github.com/wikichao/ego-av-loc)**|\n", "2303.13455": "|**2023-03-23**|**CoBIT: A Contrastive Bi-directional Image-Text Generation Model**|Haoxuan You et.al.|[2303.13455v1](http://arxiv.org/abs/2303.13455v1)|null|\n", "2303.13430": "|**2023-03-23**|**Medical diffusion on a budget: textual inversion for medical image generation**|Bram de Wilde et.al.|[2303.13430v1](http://arxiv.org/abs/2303.13430v1)|null|\n", "2303.13371": "|**2023-03-23**|**Plug-and-Play Regulators for Image-Text Matching**|Haiwen Diao et.al.|[2303.13371v1](http://arxiv.org/abs/2303.13371v1)|**[link](https://github.com/paranioar/rcar)**|\n", "2303.13233": "|**2023-03-23**|**Visually-Prompted Language Model for Fine-Grained Scene Graph Generation in an Open World**|Qifan Yu et.al.|[2303.13233v1](http://arxiv.org/abs/2303.13233v1)|**[link](https://github.com/Yuqifan1117/CaCao)**|\n", "2303.13095": "|**2023-03-23**|**Modeling Entities as Semantic Points for Visual Information Extraction in the Wild**|Zhibo Yang et.al.|[2303.13095v1](http://arxiv.org/abs/2303.13095v1)|null|\n", "2303.13041": "|**2023-03-23**|**gDoc: Automatic Generation of Structured API Documentation**|Shujun Wang et.al.|[2303.13041v1](http://arxiv.org/abs/2303.13041v1)|null|\n", "2303.13009": "|**2023-03-23**|**MELTR: Meta Loss Transformer for Learning to Fine-tune Video Foundation Models**|Dohwan Ko et.al.|[2303.13009v1](http://arxiv.org/abs/2303.13009v1)|**[link](https://github.com/mlvlab/MELTR)**|\n", "2303.12997": "|**2023-03-23**|**FER-former: Multi-modal Transformer for Facial Expression Recognition**|Yande Li et.al.|[2303.12997v1](http://arxiv.org/abs/2303.12997v1)|null|\n", "2303.12930": "|**2023-03-24**|**Dense-Localizing Audio-Visual Events in Untrimmed Videos: A Large-Scale Benchmark and Baseline**|Tiantian Geng et.al.|[2303.12930v2](http://arxiv.org/abs/2303.12930v2)|**[link](https://github.com/ttgeng233/UnAV)**|\n", "2303.14153": "|**2023-03-24**|**Local Contrastive Learning for Medical Image Recognition**|S. A. Rizvi et.al.|[2303.14153v1](http://arxiv.org/abs/2303.14153v1)|null|\n", "2303.14139": "|**2023-03-24**|**MindDiffuser: Controlled Image Reconstruction from Human Brain Activity with Semantic and Structural Diffusion**|Yizhuo Lu et.al.|[2303.14139v1](http://arxiv.org/abs/2303.14139v1)|null|\n", "2303.14081": "|**2023-03-24**|**CoLa-Diff: Conditional Latent Diffusion Model for Multi-Modal MRI Synthesis**|Lan Jiang et.al.|[2303.14081v1](http://arxiv.org/abs/2303.14081v1)|null|\n", "2303.13885": "|**2023-03-24**|**ARKitTrack: A New Diverse Dataset for Tracking Using Mobile RGB-D Data**|Haojie Zhao et.al.|[2303.13885v1](http://arxiv.org/abs/2303.13885v1)|**[link](https://github.com/lawrence-cj/ARKitTrack)**|\n", "2303.13839": "|**2023-03-24**|**HRDoc: Dataset and Baseline Method Toward Hierarchical Reconstruction of Document Structures**|Jiefeng Ma et.al.|[2303.13839v1](http://arxiv.org/abs/2303.13839v1)|**[link](https://github.com/jfma-ustc/hrdoc)**|\n", "2303.13810": "|**2023-03-24**|**Evidence-aware multi-modal data fusion and its application to total knee replacement prediction**|Xinwen Liu et.al.|[2303.13810v1](http://arxiv.org/abs/2303.13810v1)|null|\n", "2303.15444": "|**2023-03-27**|**Quantum Multi-Model Fitting**|Matteo Farina et.al.|[2303.15444v1](http://arxiv.org/abs/2303.15444v1)|**[link](https://github.com/farinamatteo/qmmf)**|\n", "2303.15230": "|**2023-03-27**|**Troika: Multi-Path Cross-Modal Traction for Compositional Zero-Shot Learning**|Siteng Huang et.al.|[2303.15230v1](http://arxiv.org/abs/2303.15230v1)|null|\n", "2303.15219": "|**2023-03-27**|**Knowing the Distance: Understanding the Gap Between Synthetic and Real Data For Face Parsing**|Eli Friedman et.al.|[2303.15219v1](http://arxiv.org/abs/2303.15219v1)|null|\n", "2303.15103": "|**2023-03-27**|**Contrastive Learning Is Spectral Clustering On Similarity Graph**|Zhiquan Tan et.al.|[2303.15103v1](http://arxiv.org/abs/2303.15103v1)|**[link](https://github.com/yifanzhang-pro/kernel-infonce)**|\n", "2303.15083": "|**2023-03-27**|**UniDistill: A Universal Cross-Modality Knowledge Distillation Framework for 3D Object Detection in Bird's-Eye View**|Shengchao Zhou et.al.|[2303.15083v1](http://arxiv.org/abs/2303.15083v1)|**[link](https://github.com/megvii-research/cvpr2023-unidistill)**|\n", "2303.15016": "|**2023-03-27**|**Borrowing Human Senses: Comment-Aware Self-Training for Social Media Multimodal Classification**|Chunpu Xu et.al.|[2303.15016v1](http://arxiv.org/abs/2303.15016v1)|**[link](https://github.com/cpaaax/multimodal_cast)**|\n", "2303.15006": "|**2023-03-27**|**Curriculum Learning for Compositional Visual Reasoning**|Wafa Aissa et.al.|[2303.15006v1](http://arxiv.org/abs/2303.15006v1)|null|\n", "2303.14998": "|**2023-03-27**|**Multi-view Cross-Modality MR Image Translation for Vestibular Schwannoma and Cochlea Segmentation**|Bogyeong Kang et.al.|[2303.14998v1](http://arxiv.org/abs/2303.14998v1)|null|\n", "2303.14880": "|**2023-03-27**|**Toward Human-Like Social Robot Navigation: A Large-Scale, Multi-Modal, Social Human Navigation Dataset**|Duc M. Nguyen et.al.|[2303.14880v1](http://arxiv.org/abs/2303.14880v1)|null|\n", "2303.14865": "|**2023-03-27**|**Revisiting Multimodal Representation in Contrastive Learning: From Patch and Token Embeddings to Finite Discrete Tokens**|Yuxiao Chen et.al.|[2303.14865v1](http://arxiv.org/abs/2303.14865v1)|**[link](https://github.com/yuxiaochen1103/fdt)**|\n", "2303.14840": "|**2023-03-26**|**On the Importance of Accurate Geometry Data for Dense 3D Vision Tasks**|HyunJun Jung et.al.|[2303.14840v1](http://arxiv.org/abs/2303.14840v1)|**[link](https://github.com/junggy/hammer-dataset)**|\n", "2303.14768": "|**2023-03-26**|**Collaborative Noisy Label Cleaner: Learning Scene-aware Trailers for Multi-modal Highlight Detection in Movies**|Bei Gan et.al.|[2303.14768v1](http://arxiv.org/abs/2303.14768v1)|**[link](https://github.com/tencentyouturesearch/highlightdetection-clc)**|\n", "2303.14730": "|**2023-03-26**|**Semantic Neural Decoding via Cross-Modal Generation**|Xuelin Qian et.al.|[2303.14730v1](http://arxiv.org/abs/2303.14730v1)|null|\n", "2303.14666": "|**2023-03-26**|**Generalization Matters: Loss Minima Flattening via Parameter Hybridization for Efficient Online Knowledge Distillation**|Tianli Zhang et.al.|[2303.14666v1](http://arxiv.org/abs/2303.14666v1)|null|\n", "2303.14626": "|**2023-03-26**|**MRCN: A Novel Modality Restitution and Compensation Network for Visible-Infrared Person Re-identification**|Yukang Zhang et.al.|[2303.14626v1](http://arxiv.org/abs/2303.14626v1)|null|\n", "2303.16199": "|**2023-03-28**|**LLaMA-Adapter: Efficient Fine-tuning of Language Models with Zero-init Attention**|Renrui Zhang et.al.|[2303.16199v1](http://arxiv.org/abs/2303.16199v1)|**[link](https://github.com/zrrskywalker/llama-adapter)**|\n", "2303.16099": "|**2023-03-28**|**Medical Image Analysis using Deep Relational Learning**|Zhihua Liu et.al.|[2303.16099v1](http://arxiv.org/abs/2303.16099v1)|null|\n", "2303.16058": "|**2023-03-28**|**Unmasked Teacher: Towards Training-Efficient Video Foundation Models**|Kunchang Li et.al.|[2303.16058v1](http://arxiv.org/abs/2303.16058v1)|**[link](https://github.com/opengvlab/unmasked_teacher)**|\n", "2303.15932": "|**2023-03-29**|**Unify, Align and Refine: Multi-Level Semantic Alignment for Radiology Report Generation**|Yaowei Li et.al.|[2303.15932v2](http://arxiv.org/abs/2303.15932v2)|null|\n", "2303.15826": "|**2023-03-28**|**MS-MT: Multi-Scale Mean Teacher with Contrastive Unpaired Translation for Cross-Modality Vestibular Schwannoma and Cochlea Segmentation**|Ziyuan Zhao et.al.|[2303.15826v1](http://arxiv.org/abs/2303.15826v1)|null|\n", "2303.15777": "|**2023-03-28**|**Imbalance Knowledge-Driven Multi-modal Network for Land-Cover Semantic Segmentation Using Images and LiDAR Point Clouds**|Yameng Wang et.al.|[2303.15777v1](http://arxiv.org/abs/2303.15777v1)|null|\n", "2303.15770": "|**2023-03-28**|**DDMM-Synth: A Denoising Diffusion Model for Cross-modal Medical Image Synthesis with Sparse-view Measurement Embedding**|Xiaoyue Li et.al.|[2303.15770v1](http://arxiv.org/abs/2303.15770v1)|null|\n", "2303.15710": "|**2023-03-28**|**Explicit Attention-Enhanced Fusion for RGB-Thermal Perception Tasks**|Mingjian Liang et.al.|[2303.15710v1](http://arxiv.org/abs/2303.15710v1)|**[link](https://github.com/freeformrobotics/eaefnet)**|\n", "2303.16818": "|**2023-03-30**|**BEVSimDet: Simulated Multi-modal Distillation in Bird's-Eye View for Multi-view 3D Object Detection**|Haimei Zhao et.al.|[2303.16818v2](http://arxiv.org/abs/2303.16818v2)|**[link](https://github.com/vitae-transformer/bevsimdet)**|\n", "2303.16604": "|**2023-03-29**|**Bi-directional Training for Composed Image Retrieval via Text Prompt Learning**|Zheyuan Liu et.al.|[2303.16604v1](http://arxiv.org/abs/2303.16604v1)|**[link](https://github.com/Cuberick-Orion/Bi-Blip4CIR)**|\n", "2303.16541": "|**2023-03-29**|**Sounding Video Generator: A Unified Framework for Text-guided Sounding Video Generation**|Jiawei Liu et.al.|[2303.16541v1](http://arxiv.org/abs/2303.16541v1)|**[link](https://github.com/jwliu-cc/svg)**|\n", "2303.16443": "|**2023-03-29**|**A tensor based varying-coefficient model for multi-modal neuroimaging data analysis**|Pratim Guha Niyogi et.al.|[2303.16443v1](http://arxiv.org/abs/2303.16443v1)|null|\n", "2303.17561": "|**2023-03-30**|**SoftCLIP: Softer Cross-modal Alignment Makes CLIP Stronger**|Yuting Gao et.al.|[2303.17561v1](http://arxiv.org/abs/2303.17561v1)|null|\n", "2303.17531": "|**2023-03-30**|**Asymmetric Face Recognition with Cross Model Compatible Ensembles**|Ori Linial et.al.|[2303.17531v1](http://arxiv.org/abs/2303.17531v1)|null|\n", "2303.17517": "|**2023-03-30**|**Hindi as a Second Language: Improving Visually Grounded Speech with Semantically Similar Samples**|Hyeonggon Ryu et.al.|[2303.17517v1](http://arxiv.org/abs/2303.17517v1)|null|\n", "2303.17490": "|**2023-03-30**|**Sound to Visual Scene Generation by Audio-to-Visual Latent Alignment**|Kim Sung-Bin et.al.|[2303.17490v1](http://arxiv.org/abs/2303.17490v1)|null|\n", "2303.17409": "|**2023-03-30**|**Steered Mixture of Experts Regression for Image Denoising with Multi-Model-Inference**|Aytac \u00d6zkan et.al.|[2303.17409v1](http://arxiv.org/abs/2303.17409v1)|null|\n", "2303.17386": "|**2023-03-30**|**Complementary Random Masking for RGB-Thermal Semantic Segmentation**|Ukcheol Shin et.al.|[2303.17386v1](http://arxiv.org/abs/2303.17386v1)|**[link](https://github.com/UkcheolShin/CRM_RGBTSeg)**|\n", "2303.17297": "|**2023-03-30**|**Understanding the Robustness of 3D Object Detection with Bird's-Eye-View Representations in Autonomous Driving**|Zijian Zhu et.al.|[2303.17297v1](http://arxiv.org/abs/2303.17297v1)|**[link](https://github.com/zzj403/BEV_Robust)**|\n", "2303.17285": "|**2023-03-30**|**Decomposed Cross-modal Distillation for RGB-based Temporal Action Detection**|Pilhyeon Lee et.al.|[2303.17285v1](http://arxiv.org/abs/2303.17285v1)|null|\n", "2303.17169": "|**2023-03-30**|**Task-Oriented Multi-Modal Mutual Leaning for Vision-Language Models**|Sifan Long et.al.|[2303.17169v1](http://arxiv.org/abs/2303.17169v1)|null|\n", "2303.17099": "|**2023-03-30**|**BEVFusion4D: Learning LiDAR-Camera Fusion Under Bird's-Eye-View via Cross-Modality Guidance and Temporal Aggregation**|Hongxiang Cai et.al.|[2303.17099v1](http://arxiv.org/abs/2303.17099v1)|null|\n", "2303.18248": "|**2023-03-31**|**Towards Flexible Multi-modal Document Models**|Naoto Inoue et.al.|[2303.18248v1](http://arxiv.org/abs/2303.18248v1)|**[link](https://github.com/CyberAgentAILab/flex-dm)**|\n", "2303.17981": "|**2023-03-31**|**Knowledge Distillation for Feature Extraction in Underwater VSLAM**|Jinghe Yang et.al.|[2303.17981v1](http://arxiv.org/abs/2303.17981v1)|**[link](https://github.com/jinghe-mel/ufen-slam)**|\n", "2303.17859": "|**2023-03-31**|**MapFormer: Boosting Change Detection by Using Pre-change Information**|Maximilian Bernhard et.al.|[2303.17859v1](http://arxiv.org/abs/2303.17859v1)|**[link](https://github.com/mxbh/mapformer)**|\n", "2303.17811": "|**2023-04-03**|**Zero-shot Referring Image Segmentation with Global-Local Context Features**|Seonghoon Yu et.al.|[2303.17811v2](http://arxiv.org/abs/2303.17811v2)|**[link](https://github.com/seonghoon-yu/zero-shot-ris)**|\n", "2304.00932": "|**2023-04-03**|**HypLiLoc: Towards Effective LiDAR Pose Regression with Hyperbolic Fusion**|Sijie Wang et.al.|[2304.00932v1](http://arxiv.org/abs/2304.00932v1)|**[link](https://github.com/sijieaaa/hypliloc)**|\n", "2304.00827": "|**2023-04-03**|**Multi-modal Fake News Detection on Social Media via Multi-grained Information Fusion**|Yangming Zhou et.al.|[2304.00827v1](http://arxiv.org/abs/2304.00827v1)|null|\n", "2304.00788": "|**2023-04-03**|**Open-Vocabulary Point-Cloud Object Detection without 3D Annotation**|Yuheng Lu et.al.|[2304.00788v1](http://arxiv.org/abs/2304.00788v1)|**[link](https://github.com/lyhdet/ov-3det)**|\n", "2304.00719": "|**2023-04-03**|**Multi-Modal Representation Learning with Text-Driven Soft Masks**|Jaeyoo Park et.al.|[2304.00719v1](http://arxiv.org/abs/2304.00719v1)|null|\n", "2304.00670": "|**2023-04-03**|**CRN: Camera Radar Net for Accurate, Robust, Efficient 3D Perception**|Youngseok Kim et.al.|[2304.00670v1](http://arxiv.org/abs/2304.00670v1)|null|\n", "2304.00495": "|**2023-04-02**|**Multimodal Hyperspectral Image Classification via Interconnected Fusion**|Lu Huo et.al.|[2304.00495v1](http://arxiv.org/abs/2304.00495v1)|null|\n", "2304.00450": "|**2023-04-02**|**Sketch-based Video Object Localization**|Sangmin Woo et.al.|[2304.00450v1](http://arxiv.org/abs/2304.00450v1)|null|\n", "2304.00379": "|**2023-04-01**|**Improved Multimodal Fusion for Small Datasets with Auxiliary Supervision**|Gregory Holste et.al.|[2304.00379v1](http://arxiv.org/abs/2304.00379v1)|null|\n", "2304.00157": "|**2023-03-31**|**Robotic Perception of Transparent Objects: A Review**|Jiaqi Jiang et.al.|[2304.00157v1](http://arxiv.org/abs/2304.00157v1)|null|\n", "2304.01961": "|**2023-04-04**|**AToMiC: An Image/Text Retrieval Test Collection to Support Multimedia Content Creation**|Jheng-Hong Yang et.al.|[2304.01961v1](http://arxiv.org/abs/2304.01961v1)|**[link](https://github.com/trec-atomic/atomic)**|\n", "2304.01799": "|**2023-04-04**|**naplib-python: Neural Acoustic Data Processing and Analysis Tools in Python**|Gavin Mischler et.al.|[2304.01799v1](http://arxiv.org/abs/2304.01799v1)|**[link](https://github.com/naplab/naplib-python)**|\n", "2304.01705": "|**2023-04-04**|**Cross-modal tumor segmentation using generative blending augmentation and self training**|Guillaume Sall\u00e9 et.al.|[2304.01705v1](http://arxiv.org/abs/2304.01705v1)|null|\n", "2304.01603": "|**2023-04-04**|**Locate Then Generate: Bridging Vision and Language with Bounding Box for Scene-Text VQA**|Yongxin Zhu et.al.|[2304.01603v1](http://arxiv.org/abs/2304.01603v1)|null|\n", "2304.01601": "|**2023-04-04**|**Primitive Simultaneous Optimization of Similarity Metrics for Image Registration**|Diana Waldmannstetter et.al.|[2304.01601v1](http://arxiv.org/abs/2304.01601v1)|null|\n", "2304.01563": "|**2023-04-04**|**Attribute-Consistent Knowledge Graph Representation Learning for Multi-Modal Entity Alignment**|Qian Li et.al.|[2304.01563v1](http://arxiv.org/abs/2304.01563v1)|null|\n", "2304.01491": "|**2023-04-04**|**Multi model LSTM architecture for Track Association based on Automatic Identification System Data**|Md Asif Bin Syed et.al.|[2304.01491v1](http://arxiv.org/abs/2304.01491v1)|null|\n", "2304.01440": "|**2023-04-04**|**A Deep Multi-Modal Cyber-Attack Detection in Industrial Control Systems**|Sepideh Bahadoripour et.al.|[2304.01440v1](http://arxiv.org/abs/2304.01440v1)|null|\n", "2304.01430": "|**2023-04-04**|**Divided Attention: Unsupervised Multi-Object Discovery with Contextually Separated Slots**|Dong Lao et.al.|[2304.01430v1](http://arxiv.org/abs/2304.01430v1)|null|\n", "2304.01233": "|**2023-04-03**|**Multi-Modal Perceiver Language Model for Outcome Prediction in Emergency Department**|Sabri Boughorbel et.al.|[2304.01233v1](http://arxiv.org/abs/2304.01233v1)|null|\n", "2304.02556": "|**2023-04-05**|**Detecting and Grounding Multi-Modal Media Manipulation**|Rui Shao et.al.|[2304.02556v1](http://arxiv.org/abs/2304.02556v1)|**[link](https://github.com/rshaojimmy/multimodal-deepfake)**|\n", "2304.02532": "|**2023-04-05**|**Goal-Conditioned Imitation Learning using Score-based Diffusion Policies**|Moritz Reuss et.al.|[2304.02532v1](http://arxiv.org/abs/2304.02532v1)|null|\n", "2304.02419": "|**2023-04-05**|**TM2D: Bimodality Driven 3D Dance Generation via Music-Text Integration**|Kehong Gong et.al.|[2304.02419v1](http://arxiv.org/abs/2304.02419v1)|**[link](https://github.com/Garfield-kh/TM2D)**|\n", "2304.02407": "|**2023-04-05**|**Explaining Multimodal Data Fusion: Occlusion Analysis for Wilderness Mapping**|Burak Ekim et.al.|[2304.02407v1](http://arxiv.org/abs/2304.02407v1)|null|\n", "2304.02328": "|**2023-04-05**|**Enhancing Multimodal Entity and Relation Extraction with Variational Information Bottleneck**|Shiyao Cui et.al.|[2304.02328v1](http://arxiv.org/abs/2304.02328v1)|null|\n", "2304.02278": "|**2023-04-05**|**Calibrating Cross-modal Feature for Text-Based Person Searching**|Donglai Wei et.al.|[2304.02278v1](http://arxiv.org/abs/2304.02278v1)|null|\n", "2304.03047": "|**2023-04-07**|**ETPNav: Evolving Topological Planning for Vision-Language Navigation in Continuous Environments**|Dong An et.al.|[2304.03047v2](http://arxiv.org/abs/2304.03047v2)|**[link](https://github.com/marsaki/etpnav)**|\n", "2304.02991": "|**2023-04-06**|**Exploiting the Complementarity of 2D and 3D Networks to Address Domain-Shift in 3D Semantic Segmentation**|Adriano Cardace et.al.|[2304.02991v1](http://arxiv.org/abs/2304.02991v1)|**[link](https://github.com/cvlab-unibo/mm2d3d)**|\n", "2304.02948": "|**2023-04-06**|**FengWu: Pushing the Skillful Global Medium-range Weather Forecast beyond 10 Days Lead**|Kang Chen et.al.|[2304.02948v1](http://arxiv.org/abs/2304.02948v1)|null|\n", "2304.02916": "|**2023-04-06**|**Efficient Audio Captioning Transformer with Patchout and Text Guidance**|Thodoris Kouzelis et.al.|[2304.02916v1](http://arxiv.org/abs/2304.02916v1)|null|\n", "2304.02902": "|**2023-04-06**|**Towards Efficient MCMC Sampling in Bayesian Neural Networks by Exploiting Symmetry**|Jonas Gregor Wiese et.al.|[2304.02902v1](http://arxiv.org/abs/2304.02902v1)|null|\n", "2304.02853": "|**2023-04-06**|**Learning Instance-Level Representation for Large-Scale Multi-Modal Pretraining in E-commerce**|Yang Jin et.al.|[2304.02853v1](http://arxiv.org/abs/2304.02853v1)|null|\n", "2304.03669": "|**2023-04-07**|**DATE: Domain Adaptive Product Seeker for E-commerce**|Haoyuan Li et.al.|[2304.03669v1](http://arxiv.org/abs/2304.03669v1)|null|\n", "2304.03542": "|**2023-04-07**|**Better \"CMOS\" Produces Clearer Images: Learning Space-Variant Blur Estimation for Blind Image Super-Resolution**|Xuhai Chen et.al.|[2304.03542v1](http://arxiv.org/abs/2304.03542v1)|null|\n", "2304.03391": "|**2023-04-06**|**Exposing and Mitigating Spurious Correlations for Cross-Modal Retrieval**|Jae Myung Kim et.al.|[2304.03391v1](http://arxiv.org/abs/2304.03391v1)|null|\n", "2304.04523": "|**2023-04-10**|**PoseFusion: Robust Object-in-Hand Pose Estimation with SelectLSTM**|Yuyang Tu et.al.|[2304.04523v1](http://arxiv.org/abs/2304.04523v1)|null|\n", "2304.04302": "|**2023-04-09**|**Bionic Collapsible Wings in Aquatic-aerial Robot**|Xiao Xiong et.al.|[2304.04302v1](http://arxiv.org/abs/2304.04302v1)|null|\n", "2304.04298": "|**2023-04-09**|**Unsupervised Sampling Promoting for Stochastic Human Trajectory Prediction**|Guangyi Chen et.al.|[2304.04298v1](http://arxiv.org/abs/2304.04298v1)|**[link](https://github.com/viewsetting/unsupervised_sampling_promoting)**|\n", "2304.04290": "|**2023-04-09**|**Distributed Conditional GAN (discGAN) For Synthetic Healthcare Data Generation**|David Fuentes et.al.|[2304.04290v1](http://arxiv.org/abs/2304.04290v1)|null|\n", "2304.04231": "|**2023-04-09**|**CrowdCLIP: Unsupervised Crowd Counting via Vision-Language Model**|Dingkang Liang et.al.|[2304.04231v1](http://arxiv.org/abs/2304.04231v1)|**[link](https://github.com/dk-liang/crowdclip)**|\n", "2304.04187": "|**2023-04-09**|**Similarity-Aware Multimodal Prompt Learning for Fake News Detection**|Ye Jiang et.al.|[2304.04187v1](http://arxiv.org/abs/2304.04187v1)|null|\n", "2304.04113": "|**2023-04-08**|**An Automated Fully-Computational Framework to Construct Printability Maps for Additively Manufactured Metal Alloys**|Sofia Sheikh et.al.|[2304.04113v1](http://arxiv.org/abs/2304.04113v1)|null|\n", "2304.04062": "|**2023-04-08**|**Predicting multiple sclerosis disease severity with multimodal deep neural networks**|Kai Zhang et.al.|[2304.04062v1](http://arxiv.org/abs/2304.04062v1)|**[link](https://github.com/anotherkaizhang/ms)**|\n", "2304.03916": "|**2023-04-08**|**Mitigating Spurious Correlations in Multi-modal Models during Fine-tuning**|Yu Yang et.al.|[2304.03916v1](http://arxiv.org/abs/2304.03916v1)|null|\n", "2304.03910": "|**2023-04-08**|**Co-attention Propagation Network for Zero-Shot Video Object Segmentation**|Gensheng Pei et.al.|[2304.03910v1](http://arxiv.org/abs/2304.03910v1)|**[link](https://github.com/nust-machine-intelligence-laboratory/hcpn)**|\n", "2304.03897": "|**2023-04-08**|**Factify 2: A Multimodal Fake News and Satire News Dataset**|S Suryavardan et.al.|[2304.03897v1](http://arxiv.org/abs/2304.03897v1)|**[link](https://github.com/surya1701/factify-2.0)**|\n", "2304.05340": "|**2023-04-11**|**Unified Multi-Modal Image Synthesis for Missing Modality Imputation**|Yue Zhang et.al.|[2304.05340v1](http://arxiv.org/abs/2304.05340v1)|null|\n", "2304.05171": "|**2023-04-11**|**Curriculum-Based Imitation of Versatile Skills**|Maximilian Xiling Li et.al.|[2304.05171v1](http://arxiv.org/abs/2304.05171v1)|**[link](https://github.com/intuitive-robots/ml-cur)**|\n", "2304.05166": "|**2023-04-11**|**TrajFlow: Learning the Distribution over Trajectories**|Anna M\u00e9sz\u00e1ros et.al.|[2304.05166v1](http://arxiv.org/abs/2304.05166v1)|null|\n", "2304.05080": "|**2023-04-11**|**Investigating Imbalances Between SAR and Optical Utilization for Multi-Modal Urban Mapping**|Sebastian Hafner et.al.|[2304.05080v1](http://arxiv.org/abs/2304.05080v1)|null|\n", "2304.05051": "|**2023-04-11**|**FashionSAP: Symbols and Attributes Prompt for Fine-grained Fashion Vision-Language Pre-training**|Yunpeng Han et.al.|[2304.05051v1](http://arxiv.org/abs/2304.05051v1)|**[link](https://github.com/hssip/fashionsap)**|\n", "2304.05979": "|**2023-04-12**|**NaviSTAR: Socially Aware Robot Navigation with Hybrid Spatio-Temporal Graph Transformer and Preference Learning**|Weizheng Wang et.al.|[2304.05979v1](http://arxiv.org/abs/2304.05979v1)|null|\n", "2304.05754": "|**2023-04-12**|**Self-Supervised Learning with Cluster-Aware-DINO for High-Performance Robust Speaker Verification**|Bing Han et.al.|[2304.05754v1](http://arxiv.org/abs/2304.05754v1)|null|\n", "2304.05720": "|**2023-04-12**|**Towards a more comprehensive open-source model for interdisciplinary smart integrated energy systems**|B\u00e9la Wiegel et.al.|[2304.05720v1](http://arxiv.org/abs/2304.05720v1)|null|\n", "2304.05646": "|**2023-04-12**|**Modality-Invariant Representation for Infrared and Visible Image Registration**|Zhiying Jiang et.al.|[2304.05646v1](http://arxiv.org/abs/2304.05646v1)|null|\n", "2304.05645": "|**2023-04-12**|**WildRefer: 3D Object Localization in Large-scale Dynamic Scenes with Multi-modal Visual Data and Natural Language**|Zhenxiang Lin et.al.|[2304.05645v1](http://arxiv.org/abs/2304.05645v1)|null|\n", "2304.05600": "|**2023-04-12**|**Looking Similar, Sounding Different: Leveraging Counterfactual Cross-Modal Pairs for Audiovisual Representation Learning**|Nikhil Singh et.al.|[2304.05600v1](http://arxiv.org/abs/2304.05600v1)|null|\n", "2304.05523": "|**2023-04-11**|**MoMo: A shared encoder Model for text, image and multi-Modal representations**|Rakesh Chada et.al.|[2304.05523v1](http://arxiv.org/abs/2304.05523v1)|null|\n", "2304.05402": "|**2023-04-11**|**Boosting Cross-task Transferability of Adversarial Patches with Visual Relations**|Tony Ma et.al.|[2304.05402v1](http://arxiv.org/abs/2304.05402v1)|null|\n", "2304.06708": "|**2023-04-13**|**Verbs in Action: Improving verb understanding in video-language models**|Liliane Momeni et.al.|[2304.06708v1](http://arxiv.org/abs/2304.06708v1)|null|\n", "2304.06306": "|**2023-04-13**|**Efficient Multimodal Fusion via Interactive Prompting**|Yaowei Li et.al.|[2304.06306v1](http://arxiv.org/abs/2304.06306v1)|null|\n", "2304.06275": "|**2023-04-13**|**Noisy Correspondence Learning with Meta Similarity Correction**|Haochen Han et.al.|[2304.06275v1](http://arxiv.org/abs/2304.06275v1)|**[link](https://github.com/hhc1997/mscn)**|\n", "2304.06264": "|**2023-04-13**|**Loosely Coupled Odometry, UWB Ranging, and Cooperative Spatial Detection for Relative Monte-Carlo Multi-Robot Localization**|Xianjia Yu et.al.|[2304.06264v1](http://arxiv.org/abs/2304.06264v1)|**[link](https://github.com/tiers/uwb-cooperative-mrs-localization)**|\n", "2304.06051": "|**2023-04-12**|**Open-TransMind: A New Baseline and Benchmark for 1st Foundation Model Challenge of Intelligent Transportation**|Yifeng Shi et.al.|[2304.06051v1](http://arxiv.org/abs/2304.06051v1)|**[link](https://github.com/Traffic-X/Open-TransMind)**|\n", "2304.07199": "|**2023-04-14**|**CROVIA: Seeing Drone Scenes from Car Perspective via Cross-View Adaptation**|Thanh-Dat Truong et.al.|[2304.07199v1](http://arxiv.org/abs/2304.07199v1)|null|\n", "2304.07151": "|**2023-04-14**|**End-to-End Learning with Multiple Modalities for System-Optimised Renewables Nowcasting**|Rushil Vohra et.al.|[2304.07151v1](http://arxiv.org/abs/2304.07151v1)|null|\n", "2304.07147": "|**2023-04-14**|**Cross Attention Transformers for Multi-modal Unsupervised Whole-Body PET Anomaly Detection**|Ashay Patel et.al.|[2304.07147v1](http://arxiv.org/abs/2304.07147v1)|null|\n", "2304.06991": "|**2023-04-14**|**WYTIWYR: A User Intent-Aware Framework with Multi-modal Inputs for Visualization Retrieval**|Shishi Xiao et.al.|[2304.06991v1](http://arxiv.org/abs/2304.06991v1)|**[link](https://github.com/serendipitysx/wytiwyr)**|\n", "2304.06910": "|**2023-04-14**|**HCAM -- Hierarchical Cross Attention Model for Multi-modal Emotion Recognition**|Soumya Dutta et.al.|[2304.06910v1](http://arxiv.org/abs/2304.06910v1)|null|\n", "2304.06786": "|**2023-04-13**|**The future of hearing aid technology**|Volker Hohmann et.al.|[2304.06786v1](http://arxiv.org/abs/2304.06786v1)|null|\n", "2304.08345": "|**2023-04-17**|**VALOR: Vision-Audio-Language Omni-Perception Pretraining Model and Dataset**|Sihan Chen et.al.|[2304.08345v1](http://arxiv.org/abs/2304.08345v1)|**[link](https://github.com/TXH-mercury/VALOR)**|\n", "2304.08304": "|**2023-04-17**|**SDVRF: Sparse-to-Dense Voxel Region Fusion for Multi-modal 3D Object Detection**|Binglu Ren et.al.|[2304.08304v1](http://arxiv.org/abs/2304.08304v1)|null|\n", "2304.08083": "|**2023-04-17**|**Causality-aware Visual Scene Discovery for Cross-Modal Question Reasoning**|Yang Liu et.al.|[2304.08083v1](http://arxiv.org/abs/2304.08083v1)|null|\n", "2304.08072": "|**2023-04-17**|**Two-stage MR Image Segmentation Method for Brain Tumors based on Attention Mechanism**|Li Zhu et.al.|[2304.08072v1](http://arxiv.org/abs/2304.08072v1)|null|\n", "2304.08058": "|**2023-04-17**|**One-Class SVM on siamese neural network latent space for Unsupervised Anomaly Detection on brain MRI White Matter Hyperintensities**|Nicolas Pinon et.al.|[2304.08058v1](http://arxiv.org/abs/2304.08058v1)|null|\n", "2304.08054": "|**2023-04-17**|**Fed-MIWAE: Federated Imputation of Incomplete Data via Deep Generative Models**|Irene Balelli et.al.|[2304.08054v1](http://arxiv.org/abs/2304.08054v1)|null|\n", "2304.07775": "|**2023-04-16**|**Robust Cross-Modal Knowledge Distillation for Unconstrained Videos**|Wenke Xia et.al.|[2304.07775v1](http://arxiv.org/abs/2304.07775v1)|**[link](https://github.com/gewu-lab/cross-modal-distillation)**|\n", "2304.07728": "|**2023-04-16**|**TransFusionOdom: Interpretable Transformer-based LiDAR-Inertial Fusion Odometry Estimation**|Leyuan Sun et.al.|[2304.07728v1](http://arxiv.org/abs/2304.07728v1)|**[link](https://github.com/rakugenson/multi-modal-dataset-for-odometry-estimation)**|\n", "2304.07633": "|**2023-04-15**|**Detecting Out-of-Context Multimodal Misinformation with interpretable neural-symbolic model**|Yizhou Zhang et.al.|[2304.07633v1](http://arxiv.org/abs/2304.07633v1)|null|\n", "2304.07567": "|**2023-04-15**|**CoVLR: Coordinating Cross-Modal Consistency and Intra-Modal Structure for Vision-Language Retrieval**|Yang Yang et.al.|[2304.07567v1](http://arxiv.org/abs/2304.07567v1)|null|\n", "2304.07549": "|**2023-04-15**|**MA-ViT: Modality-Agnostic Vision Transformers for Face Anti-Spoofing**|Ajian Liu et.al.|[2304.07549v1](http://arxiv.org/abs/2304.07549v1)|null|\n", "2304.07387": "|**2023-04-14**|**Cross-domain Food Image-to-Recipe Retrieval by Weighted Adversarial Learning**|Bin Zhu et.al.|[2304.07387v1](http://arxiv.org/abs/2304.07387v1)|null|\n", "2304.09172": "|**2023-04-18**|**Hyperbolic Image-Text Representations**|Karan Desai et.al.|[2304.09172v1](http://arxiv.org/abs/2304.09172v1)|null|\n", "2304.09164": "|**2023-04-18**|**Structure Preserving Cycle-GAN for Unsupervised Medical Image Domain Adaptation**|Paolo Iacono et.al.|[2304.09164v1](http://arxiv.org/abs/2304.09164v1)|null|\n", "2304.08965": "|**2023-04-18**|**Unsupervised Semantic Segmentation of 3D Point Clouds via Cross-modal Distillation and Super-Voxel Clustering**|Zisheng Chen et.al.|[2304.08965v1](http://arxiv.org/abs/2304.08965v1)|**[link](https://github.com/scut-bip-lab/pointdc)**|\n", "2304.08881": "|**2023-04-18**|**Segmentation of glioblastomas in early post-operative multi-modal MRI with deep neural networks**|Ragnhild Holden Helland et.al.|[2304.08881v1](http://arxiv.org/abs/2304.08881v1)|**[link](https://github.com/dbouget/validation_metrics_computation)**|\n", "2304.08709": "|**2023-04-18**|**You Only Need Two Detectors to Achieve Multi-Modal 3D Multi-Object Tracking**|Xiyang Wang et.al.|[2304.08709v1](http://arxiv.org/abs/2304.08709v1)|**[link](https://github.com/wangxiyang2022/YONTD-MOT)**|\n", "2304.08660": "|**2023-04-17**|**(LC)$^2$: LiDAR-Camera Loop Constraints For Cross-Modal Place Recognition**|Alex Junho Lee et.al.|[2304.08660v1](http://arxiv.org/abs/2304.08660v1)|null|\n", "2304.08658": "|**2023-04-20**|**In-situ surface porosity prediction in DED (directed energy deposition) printed SS316L parts using multimodal sensor fusion**|Adithyaa Karthikeyan et.al.|[2304.08658v2](http://arxiv.org/abs/2304.08658v2)|null|\n", "2304.09801": "|**2023-04-19**|**MetaBEV: Solving Sensor Failures for BEV Detection and Map Segmentation**|Chongjian Ge et.al.|[2304.09801v1](http://arxiv.org/abs/2304.09801v1)|**[link](https://github.com/ChongjianGE/MetaBEV)**|\n", "2304.09694": "|**2023-04-19**|**CrossFusion: Interleaving Cross-modal Complementation for Noise-resistant 3D Object Detection**|Yang Yang et.al.|[2304.09694v1](http://arxiv.org/abs/2304.09694v1)|null|\n", "2304.09609": "|**2023-04-19**|**MMDR: A Result Feature Fusion Object Detection Approach for Autonomous System**|Wendong Zhang et.al.|[2304.09609v1](http://arxiv.org/abs/2304.09609v1)|null|\n", "2304.09498": "|**2023-04-19**|**Learning Robust Visual-Semantic Embedding for Generalizable Person Re-identification**|Suncheng Xiang et.al.|[2304.09498v1](http://arxiv.org/abs/2304.09498v1)|**[link](https://github.com/jeremyxsc/mmet)**|\n", "2304.09448": "|**2023-04-19**|**EC^2: Emergent Communication for Embodied Control**|Yao Mu et.al.|[2304.09448v1](http://arxiv.org/abs/2304.09448v1)|null|\n", "2304.09421": "|**2023-04-19**|**TieFake: Title-Text Similarity and Emotion-Aware Fake News Detection**|Quanjiang Guo et.al.|[2304.09421v1](http://arxiv.org/abs/2304.09421v1)|**[link](https://github.com/uestc-gqj/tiefake)**|\n", "2304.09370": "|**2023-04-19**|**Integrating Reconfigurable Foot Design, Multi-modal Contact Sensing, and Terrain Classification for Bipedal Locomotion**|Ted Tyler et.al.|[2304.09370v1](http://arxiv.org/abs/2304.09370v1)|null|\n", "2304.09322": "|**2023-04-18**|**Multi-Modality Multi-Scale Cardiovascular Disease Subtypes Classification Using Raman Image and Medical History**|Bo Yu et.al.|[2304.09322v1](http://arxiv.org/abs/2304.09322v1)|null|\n", "2304.10530": "|**2023-04-20**|**Collaborative Diffusion for Multi-Modal Face Generation and Editing**|Ziqi Huang et.al.|[2304.10530v1](http://arxiv.org/abs/2304.10530v1)|**[link](https://github.com/ziqihuangg/collaborative-diffusion)**|\n", "2304.10309": "|**2023-04-20**|**Improving Speech Translation by Cross-Modal Multi-Grained Contrastive Learning**|Hao Zhang et.al.|[2304.10309v1](http://arxiv.org/abs/2304.10309v1)|null|\n", "2304.10254": "|**2023-04-20**|**Image-text Retrieval via preserving main Semantics of Vision**|Xu Zhang et.al.|[2304.10254v1](http://arxiv.org/abs/2304.10254v1)|**[link](https://github.com/zhangxu0963/vsl)**|\n", "2304.10091": "|**2023-04-20**|**Learning CLIP Guided Visual-Text Fusion Transformer for Video-based Pedestrian Attribute Recognition**|Jun Zhu et.al.|[2304.10091v1](http://arxiv.org/abs/2304.10091v1)|**[link](https://github.com/event-ahu/vtf_par)**|\n", "2304.09941": "|**2023-04-19**|**A robust and interpretable deep learning framework for multi-modal registration via keypoints**|Alan Q. Wang et.al.|[2304.09941v1](http://arxiv.org/abs/2304.09941v1)|**[link](https://github.com/evanmy/keymorph)**|\n", "2304.09921": "|**2023-04-19**|**Regularization for distributionally robust state estimation and prediction**|Jean-S\u00e9bastien Brouillon et.al.|[2304.09921v1](http://arxiv.org/abs/2304.09921v1)|null|\n", "2304.10382": "|**2023-04-21**|**Conditional Generative Models for Learning Stochastic Processes**|Salvatore Certo et.al.|[2304.10382v2](http://arxiv.org/abs/2304.10382v2)|null|\n", "2304.11098": "|**2023-04-21**|**Generative AI-enabled Vehicular Networks: Fundamentals, Framework, and Case Study**|Ruichen Zhang et.al.|[2304.11098v1](http://arxiv.org/abs/2304.11098v1)|null|\n", "2304.11029": "|**2023-04-24**|**CLaMP: Contrastive Language-Music Pre-training for Cross-Modal Symbolic Music Information Retrieval**|Shangda Wu et.al.|[2304.11029v2](http://arxiv.org/abs/2304.11029v2)|**[link](https://github.com/microsoft/muzic/tree/main/clamp)**|\n", "2304.10893": "|**2023-04-21**|**FindVehicle and VehicleFinder: A NER dataset for natural language-based vehicle retrieval and a keyword-based cross-modal vehicle retrieval system**|Runwei Guan et.al.|[2304.10893v1](http://arxiv.org/abs/2304.10893v1)|**[link](https://github.com/guanrunwei/vehiclefinder-ctim)**|\n", "2304.10824": "|**2023-04-21**|**Rethinking Benchmarks for Cross-modal Image-text Retrieval**|Weijing Chen et.al.|[2304.10824v1](http://arxiv.org/abs/2304.10824v1)|**[link](https://github.com/cwj1412/mscoco-flikcr30k_fg)**|\n", "2304.10759": "|**2023-04-21**|**GeoLayoutLM: Geometric Pre-training for Visual Information Extraction**|Chuwei Luo et.al.|[2304.10759v1](http://arxiv.org/abs/2304.10759v1)|**[link](https://github.com/alibabaresearch/advancedliteratemachinery)**|\n", "2304.10756": "|**2023-04-21**|**Missing Modality Robustness in Semi-Supervised Multi-Modal Semantic Segmentation**|Harsh Maheshwari et.al.|[2304.10756v1](http://arxiv.org/abs/2304.10756v1)|**[link](https://github.com/harshm121/m3l)**|\n", "2304.10740": "|**2023-04-21**|**Multi-Modal Deep Learning for Credit Rating Prediction Using Text and Numerical Data Streams**|Mahsa Tavakoli et.al.|[2304.10740v1](http://arxiv.org/abs/2304.10740v1)|**[link](https://github.com/banking-analytics-lab/multimodalfusionratings)**|\n", "2304.10727": "|**2023-04-21**|**RoCOCO: Robust Benchmark MS-COCO to Stress-test Robustness of Image-Text Matching Models**|Seulki Park et.al.|[2304.10727v1](http://arxiv.org/abs/2304.10727v1)|**[link](https://github.com/pseulki/rococo)**|\n", "2304.10658": "|**2023-04-20**|**Linear to multi-linear algebra and systems using tensors**|Divyanshu Pandey et.al.|[2304.10658v1](http://arxiv.org/abs/2304.10658v1)|null|\n", "2304.10628": "|**2023-04-20**|**HM-ViT: Hetero-modal Vehicle-to-Vehicle Cooperative perception with vision transformer**|Hao Xiang et.al.|[2304.10628v1](http://arxiv.org/abs/2304.10628v1)|null|\n", "2304.10592": "|**2023-04-20**|**MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models**|Deyao Zhu et.al.|[2304.10592v1](http://arxiv.org/abs/2304.10592v1)|**[link](https://github.com/vision-cair/minigpt-4)**|\n", "2304.12269": "|**2023-04-24**|**Enriching Source Code with Contextual Data for Code Completion Models: An Empirical Study**|Tim van Dam et.al.|[2304.12269v1](http://arxiv.org/abs/2304.12269v1)|**[link](https://github.com/aise-tudelft/contextualdatacodecompletion)**|\n", "2304.12259": "|**2023-04-24**|**Imaging 3D Chemistry at 1 nm Resolution with Fused Multi-Modal Electron Tomography**|Jonathan Schwartz et.al.|[2304.12259v1](http://arxiv.org/abs/2304.12259v1)|**[link](https://github.com/jtschwar/projection_refinement)**|\n", "2304.11993": "|**2023-04-25**|**MMC: Multi-Modal Colorization of Images using Textual Descriptions**|Subhankar Ghosh et.al.|[2304.11993v2](http://arxiv.org/abs/2304.11993v2)|null|\n", "2304.11875": "|**2023-04-24**|**Underwater object classification combining SAS and transferred optical-to-SAS Imagery**|Avi Abu et.al.|[2304.11875v1](http://arxiv.org/abs/2304.11875v1)|null|\n", "2304.11829": "|**2023-04-25**|**Hierarchical Diffusion Autoencoders and Disentangled Image Manipulation**|Zeyu Lu et.al.|[2304.11829v2](http://arxiv.org/abs/2304.11829v2)|null|\n", "2304.11764": "|**2023-04-23**|**Learning-enabled multi-modal motion prediction in urban environments**|Vinicius Trentin et.al.|[2304.11764v1](http://arxiv.org/abs/2304.11764v1)|null|\n", "2304.11697": "|**2023-04-23**|**Informative Data Selection with Uncertainty for Multi-modal Object Detection**|Xinyu Zhang et.al.|[2304.11697v1](http://arxiv.org/abs/2304.11697v1)|null|\n", "2304.11618": "|**2023-04-23**|**Modality-Aware Negative Sampling for Multi-modal Knowledge Graph Embedding**|Yichi Zhang et.al.|[2304.11618v1](http://arxiv.org/abs/2304.11618v1)|**[link](https://github.com/zjukg/mans)**|\n", "2304.11603": "|**2023-04-23**|**LaMD: Latent Motion Diffusion for Video Generation**|Yaosi Hu et.al.|[2304.11603v1](http://arxiv.org/abs/2304.11603v1)|null|\n", "2304.11193": "|**2023-04-21**|**Combining Vision and Tactile Sensation for Video Prediction**|Willow Mandil et.al.|[2304.11193v1](http://arxiv.org/abs/2304.11193v1)|null|\n", "2304.12995": "|**2023-04-25**|**AudioGPT: Understanding and Generating Speech, Music, Sound, and Talking Head**|Rongjie Huang et.al.|[2304.12995v1](http://arxiv.org/abs/2304.12995v1)|**[link](https://github.com/aigc-audio/audiogpt)**|\n", "2304.12725": "|**2023-04-25**|**Quantitative analysis of collagen remodeling in pancreatic lesions using computationally translated collagen images derived from brightfield microscopy images**|Varun Nair et.al.|[2304.12725v1](http://arxiv.org/abs/2304.12725v1)|null|\n", "2304.12570": "|**2023-04-25**|**Learnable Pillar-based Re-ranking for Image-Text Retrieval**|Leigang Qu et.al.|[2304.12570v1](http://arxiv.org/abs/2304.12570v1)|**[link](https://github.com/lgqu/leaprr)**|\n", "2304.12412": "|**2023-04-24**|**End-to-End Lidar-Camera Self-Calibration for Autonomous Vehicles**|Arya Rachman et.al.|[2304.12412v1](http://arxiv.org/abs/2304.12412v1)|null|\n", "2304.13649": "|**2023-04-26**|**A Symmetric Dual Encoding Dense Retrieval Framework for Knowledge-Intensive Visual Question Answering**|Alireza Salemi et.al.|[2304.13649v1](http://arxiv.org/abs/2304.13649v1)|**[link](https://github.com/alirezasalemi7/dedr-mm-fid)**|\n", "2304.13583": "|**2023-04-26**|**Multi-Modality Deep Network for Extreme Learned Image Compression**|Xuhao Jiang et.al.|[2304.13583v1](http://arxiv.org/abs/2304.13583v1)|null|\n", "2304.13559": "|**2023-04-28**|**Towards Multi-Modal DBMSs for Seamless Querying of Texts and Tables**|Matthias Urban et.al.|[2304.13559v2](http://arxiv.org/abs/2304.13559v2)|null|\n", "2304.13425": "|**2023-04-26**|**Learnable Ophthalmology SAM**|Zhongxi Qiu et.al.|[2304.13425v1](http://arxiv.org/abs/2304.13425v1)|**[link](https://github.com/qsingle/learnablepromptsam)**|\n", "2304.13357": "|**2023-04-26**|**Deep Lifelong Cross-modal Hashing**|Liming Xu et.al.|[2304.13357v1](http://arxiv.org/abs/2304.13357v1)|null|\n", "2304.13277": "|**2023-04-26**|**Self-Supervised Multi-Modal Sequential Recommendation**|Kunzhe Song et.al.|[2304.13277v1](http://arxiv.org/abs/2304.13277v1)|**[link](https://github.com/kz-song/mmsrec)**|\n", "2304.13273": "|**2023-04-27**|**From Association to Generation: Text-only Captioning by Unsupervised Cross-modal Mapping**|Junyang Wang et.al.|[2304.13273v2](http://arxiv.org/abs/2304.13273v2)|**[link](https://github.com/junyangwang0410/knight)**|\n", "2304.13181": "|**2023-04-25**|**Sample-Specific Debiasing for Better Image-Text Models**|Peiqi Wang et.al.|[2304.13181v1](http://arxiv.org/abs/2304.13181v1)|null|\n", "2304.13172": "|**2023-04-25**|**Generating Procedural Materials from Text or Image Prompts**|Yiwei Hu et.al.|[2304.13172v1](http://arxiv.org/abs/2304.13172v1)|null|\n", "2304.13130": "|**2023-04-25**|**Hypernymization of named entity-rich captions for grounding-based multi-modal pretraining**|Giacomo Nebbia et.al.|[2304.13130v1](http://arxiv.org/abs/2304.13130v1)|null|\n", "2304.13103": "|**2023-04-25**|**HyMo: Vulnerability Detection in Smart Contracts using a Novel Multi-Modal Hybrid Model**|Mohammad Khodadadi et.al.|[2304.13103v1](http://arxiv.org/abs/2304.13103v1)|null|\n", "2304.13097": "|**2023-04-25**|**Bridging graph data models: RDF, RDF-star, and property graphs as directed acyclic graphs**|Ewout Gelling et.al.|[2304.13097v1](http://arxiv.org/abs/2304.13097v1)|**[link](https://github.com/ewoutgelling/bridging-data-models)**|\n", "2304.14340": "|**2023-04-27**|**SparseFusion: Fusing Multi-Modal Sparse Representations for Multi-Sensor 3D Object Detection**|Yichen Xie et.al.|[2304.14340v1](http://arxiv.org/abs/2304.14340v1)|**[link](https://github.com/yichen928/sparsefusion)**|\n", "2304.14323": "|**2023-04-27**|**Pushing the Boundaries of Tractable Multiperspective Reasoning: A Deduction Calculus for Standpoint EL+**|Luc\u00eda {G\u00f3mez \u00c1lvarez} et.al.|[2304.14323v1](http://arxiv.org/abs/2304.14323v1)|**[link](https://github.com/cl-tud/standpoint-el-souffle-reasoner)**|\n", "2304.14243": "|**2023-04-27**|**Standpoint Linear Temporal Logic**|Nicola Gigante et.al.|[2304.14243v1](http://arxiv.org/abs/2304.14243v1)|null|\n", "2304.14178": "|**2023-04-27**|**mPLUG-Owl: Modularization Empowers Large Language Models with Multimodality**|Qinghao Ye et.al.|[2304.14178v1](http://arxiv.org/abs/2304.14178v1)|**[link](https://github.com/x-plug/mplug-owl)**|\n", "2304.13979": "|**2023-04-27**|**Adaptive-Mask Fusion Network for Segmentation of Drivable Road and Negative Obstacle With Untrustworthy Features**|Zhen Feng et.al.|[2304.13979v1](http://arxiv.org/abs/2304.13979v1)|**[link](https://github.com/lab-sun/amfnet)**|\n", "2304.13923": "|**2023-04-27**|**Retrieval-based Knowledge Augmented Vision Language Pre-training**|Jiahua Rao et.al.|[2304.13923v1](http://arxiv.org/abs/2304.13923v1)|null|\n", "2304.13833": "|**2023-04-26**|**Mixtures of Gaussian process experts based on kernel stick-breaking processes**|Yuji Saikai et.al.|[2304.13833v1](http://arxiv.org/abs/2304.13833v1)|**[link](https://github.com/ysaikai/gpksbp)**|\n", "2304.14204": "|**2023-04-26**|**Towards Medical Artificial General Intelligence via Knowledge-Enhanced Multimodal Pretraining**|Bingqian Lin et.al.|[2304.14204v1](http://arxiv.org/abs/2304.14204v1)|**[link](https://github.com/chenzcv7/motor)**|\n", "2304.15010": "|**2023-04-28**|**LLaMA-Adapter V2: Parameter-Efficient Visual Instruction Model**|Peng Gao et.al.|[2304.15010v1](http://arxiv.org/abs/2304.15010v1)|**[link](https://github.com/zrrskywalker/llama-adapter)**|\n", "2304.14942": "|**2023-04-28**|**The Emotions of the Crowd: Learning Image Sentiment from Tweets via Cross-modal Distillation**|Alessio Serra et.al.|[2304.14942v1](http://arxiv.org/abs/2304.14942v1)|null|\n", "2304.14936": "|**2023-04-28**|**Information Redundancy and Biases in Public Document Information Extraction Benchmarks**|Seif Laatiri et.al.|[2304.14936v1](http://arxiv.org/abs/2304.14936v1)|**[link](https://github.com/seif-lat/bias-study-funsd-sroie)**|\n", "2304.14933": "|**2023-04-28**|**An Empirical Study of Multimodal Model Merging**|Yi-Lin Sung et.al.|[2304.14933v1](http://arxiv.org/abs/2304.14933v1)|**[link](https://github.com/ylsung/vl-merging)**|\n", "2304.14880": "|**2023-04-28**|**SGAligner : 3D Scene Alignment with Scene Graphs**|Sayan Deb Sarkar et.al.|[2304.14880v1](http://arxiv.org/abs/2304.14880v1)|**[link](https://github.com/sayands/sgaligner)**|\n", "2305.00970": "|**2023-05-01**|**ArK: Augmented Reality with Knowledge Interactive Emergent Ability**|Qiuyuan Huang et.al.|[2305.00970v1](http://arxiv.org/abs/2305.00970v1)|null|\n", "2305.00769": "|**2023-05-01**|**Multi-scale Transformer-based Network for Emotion Recognition from Multi Physiological Signals**|Tu Vu et.al.|[2305.00769v1](http://arxiv.org/abs/2305.00769v1)|**[link](https://github.com/vsl-team/EPiC-2023-ACII)**|\n", "2305.00537": "|**2023-04-30**|**Interpretability of Machine Learning: Recent Advances and Future Prospects**|Lei Gao et.al.|[2305.00537v1](http://arxiv.org/abs/2305.00537v1)|null|\n", "2305.00355": "|**2023-04-29**|**MH-DETR: Video Moment and Highlight Detection with Cross-modal Transformer**|Yifang Xu et.al.|[2305.00355v1](http://arxiv.org/abs/2305.00355v1)|null|\n", "2305.00320": "|**2023-04-29**|**Fusion for Visual-Infrared Person ReID in Real-World Surveillance Using Corrupted Multimodal Data**|Arthur Josi et.al.|[2305.00320v1](http://arxiv.org/abs/2305.00320v1)|**[link](https://github.com/art2611/mreid-ucd-ccd)**|\n", "2305.00314": "|**2023-04-29**|**InfraDet3D: Multi-Modal 3D Object Detection based on Roadside Infrastructure Camera and LiDAR Sensors**|Walter Zimmer et.al.|[2305.00314v1](http://arxiv.org/abs/2305.00314v1)|null|\n", "2305.00207": "|**2023-04-29**|**Mixed-Response State-Space Model for Analyzing Multi-Dimensional Digital Phenotypes**|Tianchen Xu et.al.|[2305.00207v1](http://arxiv.org/abs/2305.00207v1)|**[link](https://github.com/zjph602xtc/MRSS)**|\n", "2305.00201": "|**2023-04-29**|**Instruction-ViT: Multi-Modal Prompts for Instruction Learning in ViT**|Zhenxiang Xiao et.al.|[2305.00201v1](http://arxiv.org/abs/2305.00201v1)|null|\n", "2305.00042": "|**2023-04-28**|**Cycle-guided Denoising Diffusion Probability Model for 3D Cross-modality MRI Synthesis**|Shaoyan Pan et.al.|[2305.00042v1](http://arxiv.org/abs/2305.00042v1)|null|\n", "2305.00976": "|**2023-05-02**|**TMR: Text-to-Motion Retrieval Using Contrastive 3D Human Motion Synthesis**|Mathis Petrovich et.al.|[2305.00976v1](http://arxiv.org/abs/2305.00976v1)|null|\n", "2305.01412": "|**2023-05-02**|**A Computational Approach for the Characterization of Airborne Pathogen Transmission in Turbulent Molecular Communication Channels**|Fatih Gulec et.al.|[2305.01412v1](http://arxiv.org/abs/2305.01412v1)|null|\n", "2305.01366": "|**2023-05-02**|**Establishing a Learning Model for Correct Hand Hygiene Technique in a NICU**|Ir\u00e9n A. Kopcs\u00f3n\u00e9 N\u00e9meth et.al.|[2305.01366v1](http://arxiv.org/abs/2305.01366v1)|null|\n", "2305.01245": "|**2023-05-02**|**MDENet: Multi-modal Dual-embedding Networks for Malware Open-set Recognition**|Jingcai Guo et.al.|[2305.01245v1](http://arxiv.org/abs/2305.01245v1)|null|\n", "2305.01233": "|**2023-05-03**|**On Uni-Modal Feature Learning in Supervised Multi-Modal Learning**|Chenzhuang Du et.al.|[2305.01233v2](http://arxiv.org/abs/2305.01233v2)|**[link](https://github.com/gewu-lab/ogm-ge_cvpr2022)**|\n", "2305.01111": "|**2023-05-01**|**Local and Global Contextual Features Fusion for Pedestrian Intention Prediction**|Mohsen Azarmi et.al.|[2305.01111v1](http://arxiv.org/abs/2305.01111v1)|null|\n", "2305.02269": "|**2023-05-03**|**M2-CTTS: End-to-End Multi-scale Multi-modal Conversational Text-to-Speech Synthesis**|Jinlong Xue et.al.|[2305.02269v1](http://arxiv.org/abs/2305.02269v1)|null|\n", "2305.01971": "|**2023-05-03**|**District-scale surface temperatures generated from high-resolution longitudinal thermal infrared images**|Subin Lin et.al.|[2305.01971v1](http://arxiv.org/abs/2305.01971v1)|**[link](https://github.com/buds-lab/project-iris-dataset)**|\n", "2305.01915": "|**2023-05-03**|**Denoising Multi-modal Sequential Recommenders with Contrastive Learning**|Dong Yao et.al.|[2305.01915v1](http://arxiv.org/abs/2305.01915v1)|null|\n", "2305.01912": "|**2023-05-03**|**MolKD: Distilling Cross-Modal Knowledge in Chemical Reactions for Molecular Property Prediction**|Liang Zeng et.al.|[2305.01912v1](http://arxiv.org/abs/2305.01912v1)|null|\n", "2305.01877": "|**2023-05-04**|**The Impacts of Dimensionality, Diffusion, and Directedness on Intrinsic Cross-Model Simulation in Tile-Based Self-Assembly**|Daniel Hader et.al.|[2305.01877v2](http://arxiv.org/abs/2305.01877v2)|null|\n", "2305.01864": "|**2023-05-05**|**Unsupervised Improvement of Audio-Text Cross-Modal Representations**|Zhepei Wang et.al.|[2305.01864v2](http://arxiv.org/abs/2305.01864v2)|**[link](https://github.com/zhepeiw/clap_curation)**|\n", "2305.01836": "|**2023-05-03**|**AV-SAM: Segment Anything Model Meets Audio-Visual Localization and Segmentation**|Shentong Mo et.al.|[2305.01836v1](http://arxiv.org/abs/2305.01836v1)|null|\n", "2305.01778": "|**2023-05-02**|**SLTUNET: A Simple Unified Model for Sign Language Translation**|Biao Zhang et.al.|[2305.01778v1](http://arxiv.org/abs/2305.01778v1)|**[link](https://github.com/bzhangGo/sltunet)**|\n", "2305.01661": "|**2023-05-02**|**SIA-FTP: A Spoken Instruction Aware Flight Trajectory Prediction Framework**|Dongyue Guo et.al.|[2305.01661v1](http://arxiv.org/abs/2305.01661v1)|null|\n", "2305.02930": "|**2023-05-04**|**Piecewise Normalizing Flows**|Harry Bevins et.al.|[2305.02930v1](http://arxiv.org/abs/2305.02930v1)|**[link](https://github.com/htjb/margarine)**|\n", "2305.02774": "|**2023-05-04**|**Spatial and Modal Optimal Transport for Fast Cross-Modal MRI Reconstruction**|Qi Wang et.al.|[2305.02774v1](http://arxiv.org/abs/2305.02774v1)|null|\n", "2305.02760": "|**2023-05-04**|**Multi-Modality Deep Network for JPEG Artifacts Reduction**|Xuhao Jiang et.al.|[2305.02760v1](http://arxiv.org/abs/2305.02760v1)|null|\n", "2305.02577": "|**2023-05-04**|**Text Reading Order in Uncontrolled Conditions by Sparse Graph Segmentation**|Renshen Wang et.al.|[2305.02577v1](http://arxiv.org/abs/2305.02577v1)|null|\n", "2305.02572": "|**2023-05-04**|**High-fidelity Generalized Emotional Talking Face Generation with Multi-modal Emotion Space Learning**|Chao Xu et.al.|[2305.02572v1](http://arxiv.org/abs/2305.02572v1)|null|\n", "2305.02504": "|**2023-05-04**|**Learning Missing Modal Electronic Health Records with Unified Multi-modal Data Embedding and Modality-Aware Attention**|Kwanhyung Lee et.al.|[2305.02504v1](http://arxiv.org/abs/2305.02504v1)|null|\n", "2305.03726": "|**2023-05-05**|**Otter: A Multi-Modal Model with In-Context Instruction Tuning**|Bo Li et.al.|[2305.03726v1](http://arxiv.org/abs/2305.03726v1)|**[link](https://github.com/luodian/otter)**|\n", "2305.03724": "|**2023-05-05**|**DualCross: Cross-Modality Cross-Domain Adaptation for Monocular BEV Perception**|Yunze Man et.al.|[2305.03724v1](http://arxiv.org/abs/2305.03724v1)|null|\n", "2305.03689": "|**2023-05-05**|**COLA: How to adapt vision-language models to Compose Objects Localized with Attributes?**|Arijit Ray et.al.|[2305.03689v1](http://arxiv.org/abs/2305.03689v1)|**[link](https://github.com/arijitray1993/COLA)**|\n", "2305.03347": "|**2023-05-05**|**A Large Cross-Modal Video Retrieval Dataset with Reading Comprehension**|Weijia Wu et.al.|[2305.03347v1](http://arxiv.org/abs/2305.03347v1)|**[link](https://github.com/callsys/textvr)**|\n", "2305.03314": "|**2023-05-05**|**Block the Label and Noise: An N-Gram Masked Speller for Chinese Spell Checking**|Haiyun Yang et.al.|[2305.03314v1](http://arxiv.org/abs/2305.03314v1)|null|\n", "2305.03277": "|**2023-05-05**|**FM-ViT: Flexible Modal Vision Transformers for Face Anti-Spoofing**|Ajian Liu et.al.|[2305.03277v1](http://arxiv.org/abs/2305.03277v1)|null|\n", "2305.03252": "|**2023-05-05**|**HeteroEdge: Addressing Asymmetry in Heterogeneous Collaborative Autonomous Systems**|Mohammad Saeid Anwar et.al.|[2305.03252v1](http://arxiv.org/abs/2305.03252v1)|null|\n", "2305.03212": "|**2023-05-04**|**LLM2Loss: Leveraging Language Models for Explainable Model Diagnostics**|Shervin Ardeshir et.al.|[2305.03212v1](http://arxiv.org/abs/2305.03212v1)|null|\n", "2305.03187": "|**2023-05-04**|**Generating Virtual On-body Accelerometer Data from Virtual Textual Descriptions for Human Activity Recognition**|Zikang Leng et.al.|[2305.03187v1](http://arxiv.org/abs/2305.03187v1)|**[link](https://github.com/ZikangLeng/IMUGPT)**|\n", "2305.03506": "|**2023-05-04**|**SI-LSTM: Speaker Hybrid Long-short Term Memory and Cross Modal Attention for Emotion Recognition in Conversation**|Xingwei Liang et.al.|[2305.03506v1](http://arxiv.org/abs/2305.03506v1)|null|\n", "2305.04824": "|**2023-05-08**|**Learning Summary-Worthy Visual Representation for Abstractive Summarization in Video**|Zenan Xu et.al.|[2305.04824v1](http://arxiv.org/abs/2305.04824v1)|null|\n", "2305.04790": "|**2023-05-09**|**MultiModal-GPT: A Vision and Language Model for Dialogue with Humans**|Tao Gong et.al.|[2305.04790v2](http://arxiv.org/abs/2305.04790v2)|**[link](https://github.com/open-mmlab/multimodal-gpt)**|\n", "2305.04685": "|**2023-05-08**|**ARDIE: AR, Dialogue, and Eye Gaze Policies for Human-Robot Collaboration**|Chelsea Zou et.al.|[2305.04685v1](http://arxiv.org/abs/2305.04685v1)|null|\n", "2305.04530": "|**2023-05-08**|**A Multi-Modal Context Reasoning Approach for Conditional Inference on Joint Textual and Visual Clues**|Yunxin Li et.al.|[2305.04530v1](http://arxiv.org/abs/2305.04530v1)|**[link](https://github.com/yunxinli/multimodal-context-reasoning)**|\n", "2305.04476": "|**2023-05-09**|**AlignSTS: Speech-to-Singing Conversion via Cross-Modal Alignment**|Ruiqi Li et.al.|[2305.04476v2](http://arxiv.org/abs/2305.04476v2)|null|\n", "2305.04474": "|**2023-05-09**|**Vision Langauge Pre-training by Contrastive Learning with Cross-Modal Similarity Regulation**|Chaoya Jiang et.al.|[2305.04474v2](http://arxiv.org/abs/2305.04474v2)|null|\n", "2305.04469": "|**2023-05-08**|**HACK: Learning a Parametric Head and Neck Model for High-fidelity Animation**|Longwen Zhang et.al.|[2305.04469v1](http://arxiv.org/abs/2305.04469v1)|**[link](https://github.com/zonelikewonderland/hack-model)**|\n", "2305.04451": "|**2023-05-08**|**FashionTex: Controllable Virtual Try-on with Text and Texture**|Anran Lin et.al.|[2305.04451v1](http://arxiv.org/abs/2305.04451v1)|**[link](https://github.com/picksh/fashiontex)**|\n", "2305.04298": "|**2023-05-07**|**Poses as Queries: Image-to-LiDAR Map Localization with Transformers**|Jinyu Miao et.al.|[2305.04298v1](http://arxiv.org/abs/2305.04298v1)|null|\n", "2305.04239": "|**2023-05-07**|**Instance-Variant Loss with Gaussian RBF Kernel for 3D Cross-modal Retriveal**|Zhitao Liu et.al.|[2305.04239v1](http://arxiv.org/abs/2305.04239v1)|null|\n", "2305.04224": "|**2023-05-07**|**Visual Causal Scene Refinement for Video Question Answering**|Yushen Wei et.al.|[2305.04224v1](http://arxiv.org/abs/2305.04224v1)|**[link](https://github.com/yangliu9208/vcsr)**|\n", "2305.04195": "|**2023-05-07**|**Cross-Modal Retrieval for Motion and Text via MildTriple Loss**|Sheng Yan et.al.|[2305.04195v1](http://arxiv.org/abs/2305.04195v1)|**[link](https://github.com/eanson023/rehamot)**|\n", "2305.04160": "|**2023-05-07**|**X-LLM: Bootstrapping Advanced Large Language Models by Treating Multi-Modalities as Foreign Languages**|Feilong Chen et.al.|[2305.04160v1](http://arxiv.org/abs/2305.04160v1)|null|\n", "2305.04156": "|**2023-05-07**|**SynthMix: Mixing up Aligned Synthesis for Medical Cross-Modality Domain Adaptation**|Xinwen Zhang et.al.|[2305.04156v1](http://arxiv.org/abs/2305.04156v1)|null|\n", "2305.04072": "|**2023-05-06**|**Keyword-Based Diverse Image Retrieval by Semantics-aware Contrastive Learning and Transformer**|Minyi Zhao et.al.|[2305.04072v1](http://arxiv.org/abs/2305.04072v1)|null|\n", "2305.05665": "|**2023-05-09**|**ImageBind: One Embedding Space To Bind Them All**|Rohit Girdhar et.al.|[2305.05665v1](http://arxiv.org/abs/2305.05665v1)|**[link](https://github.com/facebookresearch/imagebind)**|\n", "2305.05662": "|**2023-05-11**|**InternGPT: Solving Vision-Centric Tasks by Interacting with ChatGPT Beyond Language**|Zhaoyang Liu et.al.|[2305.05662v3](http://arxiv.org/abs/2305.05662v3)|**[link](https://github.com/opengvlab/interngpt)**|\n", "2305.05534": "|**2023-05-09**|**Integrating Holistic and Local Information to Estimate Emotional Reaction Intensity**|Yini Fang et.al.|[2305.05534v1](http://arxiv.org/abs/2305.05534v1)|**[link](https://github.com/hkust-nisl/abaw5)**|\n", "2305.05496": "|**2023-05-09**|**Exploiting Pseudo Image Captions for Multimodal Summarization**|Chaoya Jiang et.al.|[2305.05496v1](http://arxiv.org/abs/2305.05496v1)|**[link](https://github.com/sitaproject/sita)**|\n", "2305.05260": "|**2023-05-09**|**Guided Focal Stack Refinement Network for Light Field Salient Object Detection**|Bo Yuan et.al.|[2305.05260v1](http://arxiv.org/abs/2305.05260v1)|null|\n", "2305.05189": "|**2023-05-09**|**SUR-adapter: Enhancing Text-to-Image Pre-trained Diffusion Models with Large Language Models**|Shanshan Zhong et.al.|[2305.05189v1](http://arxiv.org/abs/2305.05189v1)|**[link](https://github.com/Qrange-group/SUR-adapter)**|\n", "2305.05166": "|**2023-05-10**|**E2TIMT: Efficient and Effective Modal Adapter for Text Image Machine Translation**|Cong Ma et.al.|[2305.05166v2](http://arxiv.org/abs/2305.05166v2)|**[link](https://github.com/ericongma/e2timt)**|\n", "2305.05126": "|**2023-05-09**|**Comparing Foundation Models using Data Kernels**|Brandon Duderstadt et.al.|[2305.05126v1](http://arxiv.org/abs/2305.05126v1)|null|\n", "2305.04961": "|**2023-05-08**|**Joint Moment Retrieval and Highlight Detection Via Natural Language Queries**|Richard Luo et.al.|[2305.04961v1](http://arxiv.org/abs/2305.04961v1)|**[link](https://github.com/skyline-9/visionary-vids)**|\n", "2305.06292": "|**2023-05-10**|**Joint Metrics Matter: A Better Standard for Trajectory Forecasting**|Erica Weng et.al.|[2305.06292v1](http://arxiv.org/abs/2305.06292v1)|**[link](https://github.com/ericaweng/joint-metrics-matter)**|\n", "2305.06278": "|**2023-05-10**|**A Multi-modal Garden Dataset and Hybrid 3D Dense Reconstruction Framework Based on Panoramic Stereo Images for a Trimming Robot**|Can Pu et.al.|[2305.06278v1](http://arxiv.org/abs/2305.06278v1)|**[link](https://github.com/canpu999/trimbot-wageningen-slam-dataset)**|\n", "2305.06225": "|**2023-05-10**|**DaGAN++: Depth-Aware Generative Adversarial Network for Talking Head Video Generation**|Fa-Ting Hong et.al.|[2305.06225v1](http://arxiv.org/abs/2305.06225v1)|**[link](https://github.com/harlanhong/cvpr2022-dagan)**|\n", "2305.06221": "|**2023-05-10**|**Multi-Prompt with Depth Partitioned Cross-Modal Learning**|Yiqi Wang et.al.|[2305.06221v1](http://arxiv.org/abs/2305.06221v1)|**[link](https://github.com/wangyiqi/pmpo)**|\n", "2305.06203": "|**2023-05-10**|**Multiclass MRI Brain Tumor Segmentation using 3D Attention-based U-Net**|Maryann M. Gitonga et.al.|[2305.06203v1](http://arxiv.org/abs/2305.06203v1)|null|\n", "2305.06179": "|**2023-05-11**|**A Multi-modal Approach to Single-modal Visual Place Classification**|Tomoya Iwasaki et.al.|[2305.06179v2](http://arxiv.org/abs/2305.06179v2)|null|\n", "2305.05992": "|**2023-05-10**|**MMoT: Mixture-of-Modality-Tokens Transformer for Composed Multimodal Conditional Image Synthesis**|Jianbin Zheng et.al.|[2305.05992v1](http://arxiv.org/abs/2305.05992v1)|null|\n", "2305.05880": "|**2023-05-10**|**ChinaOpen: A Dataset for Open-world Multimodal Learning**|Aozhu Chen et.al.|[2305.05880v1](http://arxiv.org/abs/2305.05880v1)|**[link](https://github.com/dong03/GenerativeVideo2Text)**|\n", "2305.06978": "|**2023-05-11**|**Meta-hallucinator: Towards Few-Shot Cross-Modality Cardiac Image Segmentation**|Ziyuan Zhao et.al.|[2305.06978v1](http://arxiv.org/abs/2305.06978v1)|null|\n", "2305.06923": "|**2023-05-11**|**EAML: Ensemble Self-Attention-based Mutual Learning Network for Document Image Classification**|Souhail Bakkali et.al.|[2305.06923v1](http://arxiv.org/abs/2305.06923v1)|null|\n", "2305.06794": "|**2023-05-11**|**Multi-modal Multi-level Fusion for 3D Single Object Tracking**|Zhiheng Li et.al.|[2305.06794v1](http://arxiv.org/abs/2305.06794v1)|null|\n", "2305.06720": "|**2023-05-11**|**Bi-level Dynamic Learning for Jointly Multi-modality Image Fusion and Beyond**|Zhu Liu et.al.|[2305.06720v1](http://arxiv.org/abs/2305.06720v1)|**[link](https://github.com/LiuZhu-CV/BDLFusion)**|\n", "2305.06472": "|**2023-05-12**|**ChatGPT-Like Large-Scale Foundation Models for Prognostics and Health Management: A Survey and Roadmaps**|Yan-Fu Li et.al.|[2305.06472v2](http://arxiv.org/abs/2305.06472v2)|null|\n", "2305.06407": "|**2023-05-10**|**Combo of Thinking and Observing for Outside-Knowledge VQA**|Qingyi Si et.al.|[2305.06407v1](http://arxiv.org/abs/2305.06407v1)|**[link](https://github.com/phoebussi/thinking-while-observing)**|\n", "2305.06386": "|**2023-05-10**|**Text-To-Concept (and Back) via Cross-Model Alignment**|Mazda Moayeri et.al.|[2305.06386v1](http://arxiv.org/abs/2305.06386v1)|null|\n", "2305.07358": "|**2023-05-12**|**Towards Versatile and Efficient Visual Knowledge Injection into Pre-trained Language Models with Cross-Modal Adapters**|Xinyun Zhang et.al.|[2305.07358v1](http://arxiv.org/abs/2305.07358v1)|null|\n", "2305.07334": "|**2023-05-12**|**Locking and Quacking: Stacking Bayesian model predictions by log-pooling and superposition**|Yuling Yao et.al.|[2305.07334v1](http://arxiv.org/abs/2305.07334v1)|null|\n", "2305.07216": "|**2023-05-12**|**Versatile Audio-Visual Learning for Handling Single and Multi Modalities in Emotion Regression and Classification Tasks**|Lucas Goncalves et.al.|[2305.07216v1](http://arxiv.org/abs/2305.07216v1)|**[link](https://github.com/ilucasgoncalves/vavl)**|\n", "2305.07214": "|**2023-05-12**|**MMG-Ego4D: Multi-Modal Generalization in Egocentric Action Recognition**|Xinyu Gong et.al.|[2305.07214v1](http://arxiv.org/abs/2305.07214v1)|null|\n", "2305.07437": "|**2023-05-15**|**Continual Vision-Language Representation Learning with Off-Diagonal Information**|Zixuan Ni et.al.|[2305.07437v2](http://arxiv.org/abs/2305.07437v2)|null|\n", "2305.08706": "|**2023-05-15**|**Understanding and Bridging the Modality Gap for Speech Translation**|Qingkai Fang et.al.|[2305.08706v1](http://arxiv.org/abs/2305.08706v1)|**[link](https://github.com/ictnlp/cress)**|\n", "2305.08698": "|**2023-05-15**|**Continual Multimodal Knowledge Graph Construction**|Xiang Chen et.al.|[2305.08698v1](http://arxiv.org/abs/2305.08698v1)|**[link](https://github.com/zjunlp/ContinueMKGC)**|\n", "2305.08685": "|**2023-05-15**|**CLIP-VG: Self-paced Curriculum Adapting of CLIP via Exploiting Pseudo-Language Labels for Visual Grounding**|Linhui Xiao et.al.|[2305.08685v1](http://arxiv.org/abs/2305.08685v1)|**[link](https://github.com/linhuixiao/clip-vg)**|\n", "2305.08532": "|**2023-05-15**|**Benchmarking UWB-Based Infrastructure-Free Positioning and Multi-Robot Relative Localization: Dataset and Characterization**|Paola Torrico Mor\u00f3n et.al.|[2305.08532v1](http://arxiv.org/abs/2305.08532v1)|null|\n", "2305.08522": "|**2023-05-15**|**Cross-Modality Time-Variant Relation Learning for Generating Dynamic Scene Graphs**|Jingyi Wang et.al.|[2305.08522v1](http://arxiv.org/abs/2305.08522v1)|**[link](https://github.com/qncsn2016/TR2)**|\n", "2305.08386": "|**2023-05-15**|**PLIP: Language-Image Pre-training for Person Representation Learning**|Jialong Zuo et.al.|[2305.08386v1](http://arxiv.org/abs/2305.08386v1)|**[link](https://github.com/zplusdragon/plip)**|\n", "2305.08381": "|**2023-05-15**|**Mode Approximation Makes Good Vision-Language Prompts**|Haixin Wang et.al.|[2305.08381v1](http://arxiv.org/abs/2305.08381v1)|**[link](https://github.com/willdreamer/aurora)**|\n", "2305.08372": "|**2023-05-15**|**A Novel Framework for Multimodal Named Entity Recognition with Multi-level Alignments**|Peipei Liu et.al.|[2305.08372v1](http://arxiv.org/abs/2305.08372v1)|null|\n", "2305.08252": "|**2023-05-14**|**Parameter-Efficient Fine-Tuning for Medical Image Analysis: The Missed Opportunity**|Raman Dutt et.al.|[2305.08252v1](http://arxiv.org/abs/2305.08252v1)|null|\n", "2305.08120": "|**2023-05-14**|**Unraveling Cold Start Enigmas in Predictive Analytics for OTT Media: Synergistic Meta-Insights and Multimodal Ensemble Mastery**|K. Ganguly et.al.|[2305.08120v1](http://arxiv.org/abs/2305.08120v1)|null|\n", "2305.07927": "|**2023-05-13**|**RC3: Regularized Contrastive Cross-lingual Cross-modal Pre-training**|Chulun Zhou et.al.|[2305.07927v1](http://arxiv.org/abs/2305.07927v1)|null|\n", "2305.07920": "|**2023-05-13**|**Multi-task Paired Masking with Alignment Modeling for Medical Vision-Language Pre-training**|Ke Zhang et.al.|[2305.07920v1](http://arxiv.org/abs/2305.07920v1)|null|\n", "2305.07910": "|**2023-05-13**|**Mask to reconstruct: Cooperative Semantics Completion for Video-text Retrieval**|Han Fang et.al.|[2305.07910v1](http://arxiv.org/abs/2305.07910v1)|null|\n", "2305.07825": "|**2023-05-13**|**Student Classroom Behavior Detection based on YOLOv7-BRA and Multi-Model Fusion**|Fan Yang et.al.|[2305.07825v1](http://arxiv.org/abs/2305.07825v1)|**[link](https://github.com/whiffe/scb-dataset)**|\n", "2305.07792": "|**2023-05-12**|**Contextuality in multi-agent paradoxes**|Sidiney B. Montanhano et.al.|[2305.07792v1](http://arxiv.org/abs/2305.07792v1)|null|\n", "2305.09641": "|**2023-05-16**|**FitMe: Deep Photorealistic 3D Morphable Model Avatars**|Alexandros Lattas et.al.|[2305.09641v1](http://arxiv.org/abs/2305.09641v1)|null|\n", "2305.09600": "|**2023-05-16**|**Deep Reinforcement Learning to Maximize Arterial Usage during Extreme Congestion**|Ashutosh Dutta et.al.|[2305.09600v1](http://arxiv.org/abs/2305.09600v1)|null|\n", "2305.09333": "|**2023-05-16**|**Multi-modal Visual Understanding with Prompts for Semantic Information Disentanglement of Image**|Yuzhou Peng et.al.|[2305.09333v1](http://arxiv.org/abs/2305.09333v1)|null|\n", "2305.09272": "|**2023-05-16**|**Age of Incorrect Information in Semantic Communications for NOMA Aided XR Applications**|Jianrui Chen et.al.|[2305.09272v1](http://arxiv.org/abs/2305.09272v1)|null|\n", "2305.09255": "|**2023-05-16**|**Trust-Worthy Semantic Communications for the Metaverse Relying on Federated Learning**|Jianrui Chen et.al.|[2305.09255v1](http://arxiv.org/abs/2305.09255v1)|null|\n", "2305.09212": "|**2023-05-16**|**Cross-Modal Global Interaction and Local Alignment for Audio-Visual Speech Recognition**|Yuchen Hu et.al.|[2305.09212v1](http://arxiv.org/abs/2305.09212v1)|**[link](https://github.com/yuchen005/gila)**|\n", "2305.09011": "|**2023-05-18**|**The Brain Tumor Segmentation (BraTS) Challenge 2023: Brain MR Image Synthesis for Tumor Segmentation (BraSyn)**|Hongwei Bran Li et.al.|[2305.09011v2](http://arxiv.org/abs/2305.09011v2)|null|\n", "2305.10420": "|**2023-05-17**|**CLIP-GCD: Simple Language Guided Generalized Category Discovery**|Rabah Ouldnoughi et.al.|[2305.10420v1](http://arxiv.org/abs/2305.10420v1)|null|\n", "2305.10046": "|**2023-05-17**|**Probing the Role of Positional Information in Vision-Language Models**|Philipp J. R\u00f6sch et.al.|[2305.10046v1](http://arxiv.org/abs/2305.10046v1)|null|\n", "2305.09946": "|**2023-05-17**|**DeepMSS: Deep Multi-Modality Segmentation-to-Survival Learning for Survival Outcome Prediction from PET/CT Images**|Mingyuan Meng et.al.|[2305.09946v1](http://arxiv.org/abs/2305.09946v1)|**[link](https://github.com/mungomeng/survival-deepmss)**|\n", "2305.11176": "|**2023-05-18**|**Instruct2Act: Mapping Multi-modality Instructions to Robotic Actions with Large Language Model**|Siyuan Huang et.al.|[2305.11176v1](http://arxiv.org/abs/2305.11176v1)|**[link](https://github.com/opengvlab/instruct2act)**|\n", "2305.11172": "|**2023-05-18**|**ONE-PEACE: Exploring One General Representation Model Toward Unlimited Modalities**|Peng Wang et.al.|[2305.11172v1](http://arxiv.org/abs/2305.11172v1)|**[link](https://github.com/OFA-Sys/ONE-PEACE)**|\n", "2305.11101": "|**2023-05-18**|**XFormer: Fast and Accurate Monocular 3D Body Capture**|Lihui Qian et.al.|[2305.11101v1](http://arxiv.org/abs/2305.11101v1)|null|\n", "2305.11096": "|**2023-05-22**|**Cross-modality Data Augmentation for End-to-End Sign Language Translation**|Jinhui Ye et.al.|[2305.11096v2](http://arxiv.org/abs/2305.11096v2)|**[link](https://github.com/atrewin/signxmda)**|\n", "2305.11012": "|**2023-05-18**|**SDC-UDA: Volumetric Unsupervised Domain Adaptation Framework for Slice-Direction Continuous Cross-Modality Medical Image Segmentation**|Hyungseob Shin et.al.|[2305.11012v1](http://arxiv.org/abs/2305.11012v1)|null|\n", "2305.11000": "|**2023-05-19**|**SpeechGPT: Empowering Large Language Models with Intrinsic Cross-Modal Conversational Abilities**|Dong Zhang et.al.|[2305.11000v2](http://arxiv.org/abs/2305.11000v2)|**[link](https://github.com/0nutation/speechgpt)**|\n", "2305.10920": "|**2023-05-18**|**Emergent Communication with Attention**|Ryokan Ri et.al.|[2305.10920v1](http://arxiv.org/abs/2305.10920v1)|null|\n", "2305.10838": "|**2023-05-18**|**ProgSG: Cross-Modality Representation Learning for Programs in Electronic Design Automation**|Yunsheng Bai et.al.|[2305.10838v1](http://arxiv.org/abs/2305.10838v1)|null|\n", "2305.10783": "|**2023-05-18**|**Transforming Human-Centered AI Collaboration: Redefining Embodied Agents Capabilities through Interactive Grounded Language Instructions**|Shrestha Mohanty et.al.|[2305.10783v1](http://arxiv.org/abs/2305.10783v1)|**[link](https://github.com/iglu-contest/nlp-baselines-2022)**|\n", "2305.10773": "|**2023-05-18**|**Rate-Adaptive Coding Mechanism for Semantic Communications With Multi-Modal Data**|Yangshuo He et.al.|[2305.10773v1](http://arxiv.org/abs/2305.10773v1)|null|\n", "2305.10764": "|**2023-05-18**|**OpenShape: Scaling Up 3D Shape Representation Towards Open-World Understanding**|Minghua Liu et.al.|[2305.10764v1](http://arxiv.org/abs/2305.10764v1)|null|\n", "2305.10763": "|**2023-05-18**|**CLAPSpeech: Learning Prosody from Text Context with Contrastive Language-Audio Pre-training**|Zhenhui Ye et.al.|[2305.10763v1](http://arxiv.org/abs/2305.10763v1)|null|\n", "2305.10724": "|**2023-05-18**|**Segment Any Anomaly without Training via Hybrid Prompt Regularization**|Yunkang Cao et.al.|[2305.10724v1](http://arxiv.org/abs/2305.10724v1)|**[link](https://github.com/caoyunkang/segment-any-anomaly)**|\n", "2305.10547": "|**2023-05-17**|**Rethinking Multimodal Content Moderation from an Asymmetric Angle with Mixed-modality**|Jialin Yuan et.al.|[2305.10547v1](http://arxiv.org/abs/2305.10547v1)|null|\n", "2305.10512": "|**2023-05-17**|**IMAD: IMage-Augmented multi-modal Dialogue**|Moskvoretskii Viktor et.al.|[2305.10512v1](http://arxiv.org/abs/2305.10512v1)|**[link](https://github.com/vityavitalich/imad)**|\n", "2305.11832": "|**2023-05-19**|**Improving Multimodal Joint Variational Autoencoders through Normalizing Flows and Correlation Analysis**|Agathe Senellart et.al.|[2305.11832v1](http://arxiv.org/abs/2305.11832v1)|null|\n", "2305.11818": "|**2023-05-19**|**MaGIC: Multi-modality Guided Image Completion**|Yongsheng Yu et.al.|[2305.11818v1](http://arxiv.org/abs/2305.11818v1)|null|\n", "2305.11719": "|**2023-05-19**|**Information Screening whilst Exploiting! Multimodal Relation Extraction with Feature Denoising and Multimodal Topic Modeling**|Shengqiong Wu et.al.|[2305.11719v1](http://arxiv.org/abs/2305.11719v1)|**[link](https://github.com/chocowu/mre-ise)**|\n", "2305.11579": "|**2023-05-19**|**Speech-Text Dialog Pre-training for Spoken Dialog Understanding with Explicit Cross-Modal Alignment**|Tianshu Yu et.al.|[2305.11579v1](http://arxiv.org/abs/2305.11579v1)|**[link](https://github.com/alibabaresearch/damo-convai)**|\n", "2305.11503": "|**2023-05-19**|**A Topic-aware Summarization Framework with Different Modal Side Information**|Xiuying Chen et.al.|[2305.11503v1](http://arxiv.org/abs/2305.11503v1)|null|\n", "2305.11481": "|**2023-05-22**|**CM-MaskSD: Cross-Modality Masked Self-Distillation for Referring Image Segmentation**|Wenxuan Wang et.al.|[2305.11481v2](http://arxiv.org/abs/2305.11481v2)|null|\n", "2305.11443": "|**2023-05-19**|**Equivariant Multi-Modality Image Fusion**|Zixiang Zhao et.al.|[2305.11443v1](http://arxiv.org/abs/2305.11443v1)|null|\n", "2305.11439": "|**2023-05-19**|**Few-Shot Learning with Visual Distribution Calibration and Cross-Modal Distribution Alignment**|Runqi Wang et.al.|[2305.11439v1](http://arxiv.org/abs/2305.11439v1)|**[link](https://github.com/bhrqw/sada)**|\n", "2305.11392": "|**2023-05-19**|**Fast-StrucTexT: An Efficient Hourglass Transformer with Modality-guided Dynamic Token Merge for Document Understanding**|Mingliang Zhai et.al.|[2305.11392v1](http://arxiv.org/abs/2305.11392v1)|null|\n", "2305.11349": "|**2023-05-18**|**Unsupervised Domain-agnostic Fake News Detection using Multi-modal Weak Signals**|Amila Silva et.al.|[2305.11349v1](http://arxiv.org/abs/2305.11349v1)|null|\n", "2305.11327": "|**2023-05-18**|**MALM: Mask Augmentation based Local Matching for Food-Recipe Retrieval**|Bhanu Prakash Voutharoja et.al.|[2305.11327v1](http://arxiv.org/abs/2305.11327v1)|**[link](https://github.com/myfoodchoice/malm_mask_augmentation_based_local_matching-_for-_food_recipe_retrieval)**|\n", "2305.13220": "|**2023-05-22**|**Fast Monocular Scene Reconstruction with Global-Sparse Local-Dense Grids**|Wei Dong et.al.|[2305.13220v1](http://arxiv.org/abs/2305.13220v1)|null|\n", "2305.12953": "|**2023-05-22**|**Enhancing Next Active Object-based Egocentric Action Anticipation with Guided Attention**|Sanket Thakur et.al.|[2305.12953v1](http://arxiv.org/abs/2305.12953v1)|**[link](https://github.com/sanketsans/ganov2)**|\n", "2305.12903": "|**2023-05-22**|**DiffAVA: Personalized Text-to-Audio Generation with Visual Alignment**|Shentong Mo et.al.|[2305.12903v1](http://arxiv.org/abs/2305.12903v1)|null|\n", "2305.12878": "|**2023-05-22**|**Non-Autoregressive Document-Level Machine Translation (NA-DMT): Exploring Effective Approaches, Challenges, and Opportunities**|Guangsheng Bao et.al.|[2305.12878v1](http://arxiv.org/abs/2305.12878v1)|**[link](https://github.com/baoguangsheng/nat-on-doc)**|\n", "2305.12807": "|**2023-05-22**|**Multi-task Combinatorial Optimization: Adaptive Multi-modality Knowledge Transfer by an Explicit Inter-task Distance**|Peng Li et.al.|[2305.12807v1](http://arxiv.org/abs/2305.12807v1)|null|\n", "2305.12793": "|**2023-05-22**|**Zero-Shot End-to-End Spoken Language Understanding via Cross-Modal Selective Self-Training**|Jianfeng He et.al.|[2305.12793v1](http://arxiv.org/abs/2305.12793v1)|null|\n", "2305.12711": "|**2023-05-22**|**Unsupervised Visible-Infrared Person ReID by Collaborative Learning with Neighbor-Guided Label Refinement**|De Cheng et.al.|[2305.12711v1](http://arxiv.org/abs/2305.12711v1)|null|\n", "2305.12703": "|**2023-05-22**|**Progressive Sub-Graph Clustering Algorithm for Semi-Supervised Domain Adaptation Speaker Verification**|Zhuo Li et.al.|[2305.12703v1](http://arxiv.org/abs/2305.12703v1)|null|\n", "2305.12673": "|**2023-05-22**|**Efficient Bilateral Cross-Modality Cluster Matching for Unsupervised Visible-Infrared Person ReID**|De cheng et.al.|[2305.12673v1](http://arxiv.org/abs/2305.12673v1)|null|\n", "2305.12530": "|**2023-05-21**|**Towards Robust Family-Infant Audio Analysis Based on Unsupervised Pretraining of Wav2vec 2.0 on Large-Scale Unlabeled Family Audio**|Jialu Li et.al.|[2305.12530v1](http://arxiv.org/abs/2305.12530v1)|null|\n", "2305.12452": "|**2023-05-21**|**Advancing Referring Expression Segmentation Beyond Single Image**|Yixuan Wu et.al.|[2305.12452v1](http://arxiv.org/abs/2305.12452v1)|null|\n", "2305.12369": "|**2023-05-21**|**HIINT: Historical, Intra- and Inter- personal Dynamics Modeling with Cross-person Memory Transformer**|Yubin Kim et.al.|[2305.12369v1](http://arxiv.org/abs/2305.12369v1)|null|\n", "2305.12260": "|**2023-05-20**|**Cross2StrA: Unpaired Cross-lingual Image Captioning with Cross-lingual Cross-modal Structure-pivoted Alignment**|Shengqiong Wu et.al.|[2305.12260v1](http://arxiv.org/abs/2305.12260v1)|null|\n", "2305.12218": "|**2023-05-20**|**Text-Video Retrieval with Disentangled Conceptualization and Set-to-Set Alignment**|Peng Jin et.al.|[2305.12218v1](http://arxiv.org/abs/2305.12218v1)|**[link](https://github.com/jpthu17/dicosa)**|\n", "2305.12011": "|**2023-05-19**|**Boosting Crop Classification by Hierarchically Fusing Satellite, Rotational, and Contextual Data**|Barriere Valentin et.al.|[2305.12011v1](http://arxiv.org/abs/2305.12011v1)|null|\n", "2305.14312": "|**2023-05-23**|**Text-guided 3D Human Generation from 2D Collections**|Tsu-Jui Fu et.al.|[2305.14312v1](http://arxiv.org/abs/2305.14312v1)|null|\n", "2305.14167": "|**2023-05-24**|**DetGPT: Detect What You Need via Reasoning**|Renjie Pi et.al.|[2305.14167v2](http://arxiv.org/abs/2305.14167v2)|null|\n", "2305.14042": "|**2023-05-23**|**Improving speech translation by fusing speech and text**|Wenbiao Yin et.al.|[2305.14042v1](http://arxiv.org/abs/2305.14042v1)|null|\n", "2305.14017": "|**2023-05-23**|**Faster Video Moment Retrieval with Point-Level Supervision**|Xun Jiang et.al.|[2305.14017v1](http://arxiv.org/abs/2305.14017v1)|null|\n", "2305.14014": "|**2023-05-23**|**CLIP4STR: A Simple Baseline for Scene Text Recognition with Pre-trained Vision-Language Model**|Shuai Zhao et.al.|[2305.14014v1](http://arxiv.org/abs/2305.14014v1)|null|\n", "2305.13986": "|**2023-05-23**|**A Multi-Modal Network Equilibrium Model with Interacting Mobility Service Providers'Strategies**|Claudia Bandiera et.al.|[2305.13986v1](http://arxiv.org/abs/2305.13986v1)|null|\n", "2305.13705": "|**2023-05-23**|**DiffHand: End-to-End Hand Mesh Reconstruction via Diffusion Models**|Lijun Li et.al.|[2305.13705v1](http://arxiv.org/abs/2305.13705v1)|null|\n", "2305.13697": "|**2023-05-23**|**UNIMO-3: Multi-granularity Interaction for Vision-Language Representation Learning**|Hao Yang et.al.|[2305.13697v1](http://arxiv.org/abs/2305.13697v1)|null|\n", "2305.13667": "|**2023-05-23**|**Optimizing Non-Autoregressive Transformers with Contrastive Learning**|Chenxin An et.al.|[2305.13667v1](http://arxiv.org/abs/2305.13667v1)|null|\n", "2305.13659": "|**2023-05-23**|**Flare-Aware Cross-modal Enhancement Network for Multi-spectral Vehicle Re-identification**|Aihua Zheng et.al.|[2305.13659v1](http://arxiv.org/abs/2305.13659v1)|**[link](https://github.com/Mzq12138/Official-Implementation-for-Flare-Aware-Cross-modal-Enhancement-for-Multi-spectral-Vehicle-ReID)**|\n", "2305.13653": "|**2023-05-23**|**RaSa: Relation and Sensitivity Aware Representation Learning for Text-based Person Search**|Yang Bai et.al.|[2305.13653v1](http://arxiv.org/abs/2305.13653v1)|**[link](https://github.com/flame-chasers/rasa)**|\n", "2305.13631": "|**2023-05-23**|**EDIS: Entity-Driven Image Search over Multimodal Web Content**|Siqi Liu et.al.|[2305.13631v1](http://arxiv.org/abs/2305.13631v1)|**[link](https://github.com/emerisly/edis)**|\n", "2305.13503": "|**2023-05-22**|**Asynchronous Multi-Model Federated Learning over Wireless Networks: Theory, Modeling, and Optimization**|Zhan-Lun Chang et.al.|[2305.13503v1](http://arxiv.org/abs/2305.13503v1)|null|\n", "2305.15403": "|**2023-05-24**|**AV-TranSpeech: Audio-Visual Robust Speech-to-Speech Translation**|Rongjie Huang et.al.|[2305.15403v1](http://arxiv.org/abs/2305.15403v1)|null|\n", "2305.15302": "|**2023-05-24**|**Multi-Modal Mutual Attention and Iterative Interaction for Referring Image Segmentation**|Chang Liu et.al.|[2305.15302v1](http://arxiv.org/abs/2305.15302v1)|null|\n", "2305.15296": "|**2023-05-24**|**MultiFusion: Fusing Pre-Trained Models for Multi-Lingual, Multi-Modal Image Generation**|Marco Bellagente et.al.|[2305.15296v1](http://arxiv.org/abs/2305.15296v1)|null|\n", "2305.15218": "|**2023-05-24**|**Multi-modal Machine Learning for Vehicle Rating Predictions Using Image, Text, and Parametric Data**|Hanqi Su et.al.|[2305.15218v1](http://arxiv.org/abs/2305.15218v1)|null|\n", "2305.15217": "|**2023-05-24**|**L-CAD: Language-based Colorization with Any-level Descriptions**|Zheng Chang et.al.|[2305.15217v1](http://arxiv.org/abs/2305.15217v1)|null|\n", "2305.15159": "|**2023-05-24**|**Collaborative Recommendation Model Based on Multi-modal Multi-view Attention Network: Movie and literature cases**|Zheng Hu et.al.|[2305.15159v1](http://arxiv.org/abs/2305.15159v1)|null|\n", "2305.15094": "|**2023-05-24**|**InpaintNeRF360: Text-Guided 3D Inpainting on Unbounded Neural Radiance Fields**|Dongqing Wang et.al.|[2305.15094v1](http://arxiv.org/abs/2305.15094v1)|null|\n", "2305.15033": "|**2023-05-24**|**SmartTrim: Adaptive Tokens and Parameters Pruning for Efficient Vision-Language Models**|Zekun Wang et.al.|[2305.15033v1](http://arxiv.org/abs/2305.15033v1)|null|\n", "2305.15023": "|**2023-05-24**|**Cheap and Quick: Efficient Vision-Language Instruction Tuning for Large Language Models**|Gen Luo et.al.|[2305.15023v1](http://arxiv.org/abs/2305.15023v1)|null|\n", "2305.15021": "|**2023-05-24**|**EmbodiedGPT: Vision-Language Pre-Training via Embodied Chain of Thought**|Yao Mu et.al.|[2305.15021v1](http://arxiv.org/abs/2305.15021v1)|**[link](https://github.com/EmbodiedGPT/EmbodiedGPT_Pytorch)**|\n", "2305.14969": "|**2023-05-24**|**MMNet: Multi-Mask Network for Referring Image Segmentation**|Yichen Yan et.al.|[2305.14969v1](http://arxiv.org/abs/2305.14969v1)|null|\n", "2305.14914": "|**2023-05-24**|**GAMUS: A Geometry-aware Multi-modal Semantic Segmentation Benchmark for Remote Sensing Data**|Zhitong Xiong et.al.|[2305.14914v1](http://arxiv.org/abs/2305.14914v1)|**[link](https://github.com/earthnets/rsi-mmsegmentation)**|\n", "2305.14897": "|**2023-05-24**|**Text encoders are performance bottlenecks in contrastive vision-language models**|Amita Kamath et.al.|[2305.14897v1](http://arxiv.org/abs/2305.14897v1)|**[link](https://github.com/amitakamath/vl_text_encoders_are_bottlenecks)**|\n", "2305.14843": "|**2023-05-24**|**Meta-Learning For Vision-and-Language Cross-lingual Transfer**|Hanxu Hu et.al.|[2305.14843v1](http://arxiv.org/abs/2305.14843v1)|null|\n", "2305.14839": "|**2023-05-24**|**PaCE: Unified Multi-modal Dialogue Pre-training with Progressive and Compositional Experts**|Yunshui Li et.al.|[2305.14839v1](http://arxiv.org/abs/2305.14839v1)|**[link](https://github.com/AlibabaResearch/DAMO-ConvAI/tree/main/pace)**|\n", "2305.16318": "|**2023-05-25**|**Referred by Multi-Modality: A Unified Temporal Transformer for Video Object Segmentation**|Shilin Yan et.al.|[2305.16318v1](http://arxiv.org/abs/2305.16318v1)|**[link](https://github.com/opengvlab/mutr)**|\n", "2305.16304": "|**2023-05-25**|**Candidate Set Re-ranking for Composed Image Retrieval with Dual Multi-modal Encoder**|Zheyuan Liu et.al.|[2305.16304v1](http://arxiv.org/abs/2305.16304v1)|null|\n", "2305.16166": "|**2023-05-25**|**Multimodal Relation Extraction with Cross-Modal Retrieval and Synthesis**|Xuming Hu et.al.|[2305.16166v1](http://arxiv.org/abs/2305.16166v1)|null|\n", "2305.16107": "|**2023-05-25**|**VioLA: Unified Codec Language Models for Speech Recognition, Synthesis, and Translation**|Tianrui Wang et.al.|[2305.16107v1](http://arxiv.org/abs/2305.16107v1)|null|\n", "2305.15957": "|**2023-05-25**|**DiffCLIP: Leveraging Stable Diffusion for Language Grounded 3D Classification**|Sitian Shen et.al.|[2305.15957v1](http://arxiv.org/abs/2305.15957v1)|null|\n", "2305.15920": "|**2023-05-25**|**Learning and accurate generation of stochastic dynamics based on multi-model Generative Adversarial Networks**|Daniele Lanzoni et.al.|[2305.15920v1](http://arxiv.org/abs/2305.15920v1)|null|\n", "2305.15913": "|**2023-05-27**|**MEMEX: Detecting Explanatory Evidence for Memes via Knowledge-Enriched Contextualization**|Shivam Sharma et.al.|[2305.15913v2](http://arxiv.org/abs/2305.15913v2)|**[link](https://github.com/lcs2-iiitd/memex_meme_evidence)**|\n", "2305.15765": "|**2023-05-25**|**Language-Guided 3D Object Detection in Point Cloud for Autonomous Driving**|Wenhao Cheng et.al.|[2305.15765v1](http://arxiv.org/abs/2305.15765v1)|null|\n", "2305.15762": "|**2023-05-25**|**Dynamic Enhancement Network for Partial Multi-modality Person Re-identification**|Aihua Zheng et.al.|[2305.15762v1](http://arxiv.org/abs/2305.15762v1)|null|\n", "2305.15753": "|**2023-05-25**|**T2TD: Text-3D Generation Model based on Prior Knowledge Guidance**|Weizhi Nie et.al.|[2305.15753v1](http://arxiv.org/abs/2305.15753v1)|null|\n", "2305.15732": "|**2023-05-26**|**CLIP3Dstyler: Language Guided 3D Arbitrary Neural Style Transfer**|Ming Gao et.al.|[2305.15732v2](http://arxiv.org/abs/2305.15732v2)|null|\n", "2305.15688": "|**2023-05-25**|**Frame-Event Alignment and Fusion Network for High Frame Rate Tracking**|Jiqing Zhang et.al.|[2305.15688v1](http://arxiv.org/abs/2305.15688v1)|null|\n", "2305.15483": "|**2023-05-24**|**Weakly Supervised Vision-and-Language Pre-training with Relative Representations**|Chi Chen et.al.|[2305.15483v1](http://arxiv.org/abs/2305.15483v1)|null|\n", "2305.17102": "|**2023-05-26**|**GeoVLN: Learning Geometry-Enhanced Visual Representation with Slot Attention for Vision-and-Language Navigation**|Jingyang Huo et.al.|[2305.17102v1](http://arxiv.org/abs/2305.17102v1)|**[link](https://github.com/jingyanghuo/GeoVLN)**|\n", "2305.17100": "|**2023-05-26**|**BiomedGPT: A Unified and Generalist Biomedical Generative Pre-trained Transformer for Vision, Language, and Multimodal Tasks**|Kai Zhang et.al.|[2305.17100v1](http://arxiv.org/abs/2305.17100v1)|**[link](https://github.com/taokz/biomedgpt)**|\n", "2305.17011": "|**2023-05-26**|**SOC: Semantic-Assisted Object Cluster for Referring Video Object Segmentation**|Zhuoyan Luo et.al.|[2305.17011v1](http://arxiv.org/abs/2305.17011v1)|null|\n", "2305.16986": "|**2023-05-29**|**NavGPT: Explicit Reasoning in Vision-and-Language Navigation with Large Language Models**|Gengze Zhou et.al.|[2305.16986v2](http://arxiv.org/abs/2305.16986v2)|**[link](https://github.com/gengzezhou/navgpt)**|\n", "2305.16685": "|**2023-05-26**|**S4M: Generating Radiology Reports by A Single Model for Multiple Body Parts**|Qi Chen et.al.|[2305.16685v1](http://arxiv.org/abs/2305.16685v1)|**[link](https://github.com/ytongxie/s4m)**|\n", "2305.16556": "|**2023-05-26**|**LANISTR: Multimodal Learning from Structured and Unstructured Data**|Sayna Ebrahimi et.al.|[2305.16556v1](http://arxiv.org/abs/2305.16556v1)|null|\n", "2305.16434": "|**2023-05-25**|**Credit Valuation Adjustment in Financial Networks**|Irena Barja\u0161i\u0107 et.al.|[2305.16434v1](http://arxiv.org/abs/2305.16434v1)|null|\n", "2305.16406": "|**2023-05-25**|**Context-Aware Attention Layers coupled with Optimal Transport Domain Adaptation methods for recognizing dementia from spontaneous speech**|Loukas Ilias et.al.|[2305.16406v1](http://arxiv.org/abs/2305.16406v1)|null|\n", "2305.18171": "|**2023-05-29**|**Improved Probabilistic Image-Text Representations**|Sanghyuk Chun et.al.|[2305.18171v1](http://arxiv.org/abs/2305.18171v1)|**[link](https://github.com/naver-ai/pcmepp)**|\n", "2305.18009": "|**2023-05-29**|**Multi-Modal Face Stylization with a Generative Prior**|Mengtian Li et.al.|[2305.18009v1](http://arxiv.org/abs/2305.18009v1)|null|\n", "2305.17993": "|**2023-05-29**|**Multi-Scale Attention for Audio Question Answering**|Guangyao Li et.al.|[2305.17993v1](http://arxiv.org/abs/2305.17993v1)|**[link](https://github.com/gewu-lab/mwafm)**|\n", "2305.17941": "|**2023-05-29**|**Safety of autonomous vehicles: A survey on Model-based vs. AI-based approaches**|Dimia Iberraken et.al.|[2305.17941v1](http://arxiv.org/abs/2305.17941v1)|null|\n", "2305.17925": "|**2023-05-29**|**Identifying shifts in multi-modal travel patterns during special events using mobile data: Celebrating Vappu in Helsinki**|Zhiren Huang et.al.|[2305.17925v1](http://arxiv.org/abs/2305.17925v1)|null|\n", "2305.17911": "|**2023-05-29**|**TotalDefMeme: A Multi-Attribute Meme dataset on Total Defence in Singapore**|Nirmalendu Prakash et.al.|[2305.17911v1](http://arxiv.org/abs/2305.17911v1)|null|\n", "2305.17903": "|**2023-05-30**|**Deeply Coupled Cross-Modal Prompt Learning**|Xuejing Liu et.al.|[2305.17903v2](http://arxiv.org/abs/2305.17903v2)|**[link](https://github.com/gingl/cmpa)**|\n", "2305.17652": "|**2023-05-28**|**ConaCLIP: Exploring Distillation of Fully-Connected Knowledge Interaction Graph for Lightweight Text-Image Retrieval**|Jiapeng Wang et.al.|[2305.17652v1](http://arxiv.org/abs/2305.17652v1)|null|\n", "2305.17629": "|**2023-05-28**|**Multi-Modal Wireless Flexible Gel-Free Sensors with Edge Deep Learning for Detecting and Alerting Freezing of Gait in Parkinson's Patients**|Yuhan Hou et.al.|[2305.17629v1](http://arxiv.org/abs/2305.17629v1)|null|\n", "2305.17600": "|**2023-05-28**|**GAME-UP: Game-Aware Mode Enumeration and Understanding for Trajectory Prediction**|Justin Lidard et.al.|[2305.17600v1](http://arxiv.org/abs/2305.17600v1)|null|\n", "2305.17530": "|**2023-05-27**|**PuMer: Pruning and Merging Tokens for Efficient Vision Language Models**|Qingqing Cao et.al.|[2305.17530v1](http://arxiv.org/abs/2305.17530v1)|**[link](https://github.com/csarron/pumer)**|\n", "2305.17499": "|**2023-05-27**|**CIF-PT: Bridging Speech and Text Representations for Spoken Language Understanding via Continuous Integrate-and-Fire Pre-Training**|Linhao Dong et.al.|[2305.17499v1](http://arxiv.org/abs/2305.17499v1)|null|\n", "2305.17455": "|**2023-05-27**|**CrossGET: Cross-Guided Ensemble of Tokens for Accelerating Vision-Language Transformers**|Dachuan Shi et.al.|[2305.17455v1](http://arxiv.org/abs/2305.17455v1)|**[link](https://github.com/sdc17/crossget)**|\n", "2305.17343": "|**2023-05-27**|**Modality-Independent Teachers Meet Weakly-Supervised Audio-Visual Event Parser**|Yung-Hsuan Lai et.al.|[2305.17343v1](http://arxiv.org/abs/2305.17343v1)|**[link](https://github.com/franklin905/valor)**|\n", "2305.17219": "|**2023-05-26**|**GVdoc: Graph-based Visual Document Classification**|Fnu Mohbat et.al.|[2305.17219v1](http://arxiv.org/abs/2305.17219v1)|**[link](https://github.com/mohbattharani/GVdoc)**|\n", "2305.19270": "|**2023-05-30**|**Learning without Forgetting for Vision-Language Models**|Da-Wei Zhou et.al.|[2305.19270v1](http://arxiv.org/abs/2305.19270v1)|null|\n", "2305.19240": "|**2023-05-30**|**NetHack is Hard to Hack**|Ulyana Piterbarg et.al.|[2305.19240v1](http://arxiv.org/abs/2305.19240v1)|**[link](https://github.com/upiterbarg/hihack)**|\n", "2305.19228": "|**2023-05-30**|**Unsupervised Melody-to-Lyric Generation**|Yufei Tian et.al.|[2305.19228v1](http://arxiv.org/abs/2305.19228v1)|**[link](https://github.com/amazon-science/unsupervised-melody-to-lyrics-generation)**|\n", "2305.19216": "|**2023-05-30**|**Translation-Enhanced Multilingual Text-to-Image Generation**|Yaoyiran Li et.al.|[2305.19216v1](http://arxiv.org/abs/2305.19216v1)|null|\n", "2305.18980": "|**2023-05-30**|**Multi-modal Queried Object Detection in the Wild**|Yifan Xu et.al.|[2305.18980v1](http://arxiv.org/abs/2305.18980v1)|**[link](https://github.com/yifanxu74/mq-det)**|\n", "2305.18969": "|**2023-05-30**|**MS-DETR: Natural Language Video Localization with Sampling Moment-Moment Interaction**|Jing Wang et.al.|[2305.18969v1](http://arxiv.org/abs/2305.18969v1)|**[link](https://github.com/k-nick/ms-detr)**|\n", "2305.18898": "|**2023-05-30**|**AlphaBlock: Embodied Finetuning for Vision-Language Reasoning in Robot Manipulation**|Chuhao Jin et.al.|[2305.18898v1](http://arxiv.org/abs/2305.18898v1)|null|\n", "2305.18842": "|**2023-05-30**|**Generate then Select: Open-ended Visual Question Answering Guided by World Knowledge**|Xingyu Fu et.al.|[2305.18842v1](http://arxiv.org/abs/2305.18842v1)|null|\n", "2305.18752": "|**2023-05-30**|**GPT4Tools: Teaching Large Language Model to Use Tools via Self-instruction**|Rui Yang et.al.|[2305.18752v1](http://arxiv.org/abs/2305.18752v1)|**[link](https://github.com/stevengrove/gpt4tools)**|\n", "2305.18721": "|**2023-05-30**|**LayoutMask: Enhance Text-Layout Interaction in Multi-modal Pre-training for Document Understanding**|Yi Tu et.al.|[2305.18721v1](http://arxiv.org/abs/2305.18721v1)|null|\n", "2305.18641": "|**2023-05-29**|**Enhanced Chart Understanding in Vision and Language Task via Cross-modal Pre-training on Plot Table Pairs**|Mingyang Zhou et.al.|[2305.18641v1](http://arxiv.org/abs/2305.18641v1)|null|\n", "2305.18500": "|**2023-05-29**|**VAST: A Vision-Audio-Subtitle-Text Omni-Modality Foundation Model and Dataset**|Sihan Chen et.al.|[2305.18500v1](http://arxiv.org/abs/2305.18500v1)|**[link](https://github.com/txh-mercury/vast)**|\n", "2305.19972": "|**2023-05-31**|**ViLaS: Integrating Vision and Language into Automatic Speech Recognition**|Minglun Han et.al.|[2305.19972v1](http://arxiv.org/abs/2305.19972v1)|null|\n", "2305.19924": "|**2023-06-01**|**Joint Adaptive Representations for Image-Language Learning**|AJ Piergiovanni et.al.|[2305.19924v2](http://arxiv.org/abs/2305.19924v2)|null|\n", "2305.19912": "|**2023-05-31**|**Structure-Aware Language Model Pretraining Improves Dense Retrieval on Structured Data**|Xinze Li et.al.|[2305.19912v1](http://arxiv.org/abs/2305.19912v1)|**[link](https://github.com/openmatch/openmatch)**|\n", "2305.19894": "|**2023-05-31**|**Med-UniC: Unifying Cross-Lingual Medical Vision-Language Pre-Training by Diminishing Bias**|Zhongwei Wan et.al.|[2305.19894v1](http://arxiv.org/abs/2305.19894v1)|**[link](https://github.com/SUSTechBruce/Med-UniC)**|\n", "2305.19664": "|**2023-05-31**|**Unveiling Cross Modality Bias in Visual Question Answering: A Causal View with Possible Worlds VQA**|Ali Vosoughi et.al.|[2305.19664v1](http://arxiv.org/abs/2305.19664v1)|null|\n", "2305.19624": "|**2023-05-31**|**A Multi-Modal Transformer Network for Action Detection**|Matthew Korban et.al.|[2305.19624v1](http://arxiv.org/abs/2305.19624v1)|null|\n", "2305.19595": "|**2023-06-01**|**Dense and Aligned Captions (DAC) Promote Compositional Reasoning in VL Models**|Sivan Doveh et.al.|[2305.19595v2](http://arxiv.org/abs/2305.19595v2)|null|\n", "2305.19522": "|**2023-06-01**|**PromptStyle: Controllable Style Transfer for Text-to-Speech with Natural Language Descriptions**|Guanghou Liu et.al.|[2305.19522v2](http://arxiv.org/abs/2305.19522v2)|null|\n", "2306.00978": "|**2023-06-01**|**AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration**|Ji Lin et.al.|[2306.00978v1](http://arxiv.org/abs/2306.00978v1)|**[link](https://github.com/mit-han-lab/llm-awq)**|\n", "2306.00964": "|**2023-06-01**|**Cocktail: Mixing Multi-Modality Controls for Text-Conditional Image Generation**|Minghui Hu et.al.|[2306.00964v1](http://arxiv.org/abs/2306.00964v1)|null|\n", "2306.00958": "|**2023-06-01**|**LIV: Language-Image Representations and Rewards for Robotic Control**|Yecheng Jason Ma et.al.|[2306.00958v1](http://arxiv.org/abs/2306.00958v1)|**[link](https://github.com/penn-pal-lab/liv)**|\n", "2306.00932": "|**2023-06-01**|**Cross Modal Data Discovery over Structured and Unstructured Data Lakes**|Mohamed Y. Eltabakh et.al.|[2306.00932v1](http://arxiv.org/abs/2306.00932v1)|**[link](https://github.com/qcri/cmdl)**|\n", "2306.00813": "|**2023-06-01**|**UniDiff: Advancing Vision-Language Models with Generative and Discriminative Learning**|Xiao Dong et.al.|[2306.00813v1](http://arxiv.org/abs/2306.00813v1)|null|\n", "2306.00792": "|**2023-06-01**|**Learning Across Decentralized Multi-Modal Remote Sensing Archives with Federated Learning**|Bar\u0131\u015f B\u00fcy\u00fckta\u015f et.al.|[2306.00792v1](http://arxiv.org/abs/2306.00792v1)|null|\n", "2306.00789": "|**2023-06-01**|**Improved Cross-Lingual Transfer Learning For Automatic Speech Translation**|Sameer Khurana et.al.|[2306.00789v1](http://arxiv.org/abs/2306.00789v1)|null|\n", "2306.00783": "|**2023-06-01**|**FDNeRF: Semantics-Driven Face Reconstruction, Prompt Editing and Relighting with Diffusion Models**|Hao Zhang et.al.|[2306.00783v1](http://arxiv.org/abs/2306.00783v1)|**[link](https://github.com/billyxyb/fdnerf)**|\n", "2306.00640": "|**2023-06-01**|**Multi-Modal Deep Learning for Multi-Temporal Urban Mapping With a Partly Missing Optical Modality**|Sebastian Hafner et.al.|[2306.00640v1](http://arxiv.org/abs/2306.00640v1)|null|\n", "2306.00424": "|**2023-06-01**|**End-to-end Knowledge Retrieval with Multi-modal Queries**|Man Luo et.al.|[2306.00424v1](http://arxiv.org/abs/2306.00424v1)|**[link](https://github.com/luomancs/remuq)**|\n", "2306.00409": "|**2023-06-01**|**Adapting Pre-trained Language Models to Vision-Language Tasks via Dynamic Visual Prompting**|Shubin Huang et.al.|[2306.00409v1](http://arxiv.org/abs/2306.00409v1)|**[link](https://github.com/hsb1357173526/dynamic_visual_prompting)**|\n", "2306.00386": "|**2023-06-01**|**Symmetric Uncertainty-Aware Feature Transmission for Depth Super-Resolution**|Wuxuan Shi et.al.|[2306.00386v1](http://arxiv.org/abs/2306.00386v1)|**[link](https://github.com/shiwuxuan/suft)**|\n", "2306.00228": "|**2023-05-31**|**Using Visual Cropping to Enhance Fine-Detail Question Answering of BLIP-Family Models**|Jiarui Zhang et.al.|[2306.00228v1](http://arxiv.org/abs/2306.00228v1)|null|\n", "2306.00179": "|**2023-05-31**|**LeggedWalking on Inclined Surfaces**|Chenghao Wang et.al.|[2306.00179v1](http://arxiv.org/abs/2306.00179v1)|null|\n", "2306.00103": "|**2023-05-31**|**ManagerTower: Aggregating the Insights of Uni-Modal Experts for Vision-Language Representation Learning**|Xiao Xu et.al.|[2306.00103v1](http://arxiv.org/abs/2306.00103v1)|**[link](https://github.com/looperxx/managertower)**|\n", "2306.01733": "|**2023-06-02**|**DocFormerv2: Local Features for Document Understanding**|Srikar Appalaraju et.al.|[2306.01733v1](http://arxiv.org/abs/2306.01733v1)|null|\n", "2306.01675": "|**2023-06-02**|**Bayesian Segmentation Modeling of Epidemic Growth**|Tejasv Bedi et.al.|[2306.01675v1](http://arxiv.org/abs/2306.01675v1)|null|\n", "2306.01656": "|**2023-06-02**|**Backchannel Detection and Agreement Estimation from Video with Transformer Networks**|Ahmed Amer et.al.|[2306.01656v1](http://arxiv.org/abs/2306.01656v1)|**[link](https://git.opendfki.de/body_language/ijcnn23-backchannel-detection)**|\n", "2306.01523": "|**2023-06-02**|**Transformer-based Multi-Modal Learning for Multi Label Remote Sensing Image Classification**|David Hoffmann et.al.|[2306.01523v1](http://arxiv.org/abs/2306.01523v1)|null|\n", "2306.01492": "|**2023-06-02**|**Multi-Modal Emotion Recognition for Enhanced Requirements Engineering: A Novel Approach**|Ben Cheng et.al.|[2306.01492v1](http://arxiv.org/abs/2306.01492v1)|null|\n", "2306.01312": "|**2023-06-02**|**Syntax-aware Hybrid prompt model for Few-shot multi-modal sentiment analysis**|Zikai Zhou et.al.|[2306.01312v1](http://arxiv.org/abs/2306.01312v1)|null|\n", "2306.01311": "|**2023-06-02**|**MetaVL: Transferring In-Context Learning Ability From Language Models to Vision-Language Models**|Masoud Monajatipoor et.al.|[2306.01311v1](http://arxiv.org/abs/2306.01311v1)|null|\n", "2306.01163": "|**2023-06-01**|**A Multi-Modal Latent-Features based Service Recommendation System for the Social Internet of Things**|Amar Khelloufi et.al.|[2306.01163v1](http://arxiv.org/abs/2306.01163v1)|null|\n", "2306.01144": "|**2023-06-01**|**Evaluating the Capabilities of Multi-modal Reasoning Models with Synthetic Task Data**|Nathan Vaska et.al.|[2306.01144v1](http://arxiv.org/abs/2306.01144v1)|null|\n", "2306.01112": "|**2023-06-01**|**What if We Enrich day-ahead Solar Irradiance Time Series Forecasting with Spatio-Temporal Context?**|Oussama Boussif et.al.|[2306.01112v1](http://arxiv.org/abs/2306.01112v1)|**[link](https://github.com/gitbooo/CrossViVit)**|\n", "2306.02972": "|**2023-06-05**|**Simultaneous or Sequential Training? How Speech Representations Cooperate in a Multi-Task Self-Supervised Learning System**|Khazar Khorrami et.al.|[2306.02972v1](http://arxiv.org/abs/2306.02972v1)|null|\n", "2306.02901": "|**2023-06-05**|**A Vessel-Segmentation-Based CycleGAN for Unpaired Multi-modal Retinal Image Synthesis**|Aline Sindel et.al.|[2306.02901v1](http://arxiv.org/abs/2306.02901v1)|null|\n", "2306.02894": "|**2023-06-05**|**Recyclable Semi-supervised Method Based on Multi-model Ensemble for Video Scene Parsing**|Biao Wu et.al.|[2306.02894v1](http://arxiv.org/abs/2306.02894v1)|null|\n", "2306.02858": "|**2023-06-06**|**Video-LLaMA: An Instruction-tuned Audio-Visual Language Model for Video Understanding**|Hang Zhang et.al.|[2306.02858v2](http://arxiv.org/abs/2306.02858v2)|**[link](https://github.com/damo-nlp-sg/video-llama)**|\n", "2306.02841": "|**2023-06-05**|**CTRL: Connect Tabular and Language Model for CTR Prediction**|Xiangyang Li et.al.|[2306.02841v1](http://arxiv.org/abs/2306.02841v1)|null|\n", "2306.02831": "|**2023-06-05**|**MM-DAG: Multi-task DAG Learning for Multi-modal Data -- with Application for Traffic Congestion Analysis**|Tian Lan et.al.|[2306.02831v1](http://arxiv.org/abs/2306.02831v1)|**[link](https://github.com/lantian72/mm-dag)**|\n", "2306.02673": "|**2023-06-05**|**Cross-Modal Vertical Federated Learning for MRI Reconstruction**|Yunlu Yan et.al.|[2306.02673v1](http://arxiv.org/abs/2306.02673v1)|null|\n", "2306.02596": "|**2023-06-05**|**A Novel Interpretable and Generalizable Re-synchronization Model for Cued Speech based on a Multi-Cuer Corpus**|Lufei Gao et.al.|[2306.02596v1](http://arxiv.org/abs/2306.02596v1)|**[link](https://github.com/lufei321/resync-cs)**|\n", "2306.02546": "|**2023-06-05**|**LmPa: Improving Decompilation by Synergy of Large Language Model and Program Analysis**|Xiangzhe Xu et.al.|[2306.02546v1](http://arxiv.org/abs/2306.02546v1)|null|\n", "2306.02329": "|**2023-06-04**|**Multi-CLIP: Contrastive Vision-Language Pre-training for Question Answering tasks in 3D Scenes**|Alexandros Delitzas et.al.|[2306.02329v1](http://arxiv.org/abs/2306.02329v1)|null|\n", "2306.02307": "|**2023-06-04**|**Finding the SWEET Spot: Analysis and Improvement of Adaptive Inference in Low Resource Settings**|Daniel Rotem et.al.|[2306.02307v1](http://arxiv.org/abs/2306.02307v1)|null|\n", "2306.02259": "|**2023-06-04**|**Predicting Information Pathways Across Online Communities**|Yiqiao Jin et.al.|[2306.02259v1](http://arxiv.org/abs/2306.02259v1)|**[link](https://github.com/claws-lab/inpac)**|\n", "2306.02137": "|**2023-06-03**|**Inconsistent Matters: A Knowledge-guided Dual-consistency Network for Multi-modal Rumor Detection**|Mengzhu Sun et.al.|[2306.02137v1](http://arxiv.org/abs/2306.02137v1)|**[link](https://github.com/mengzsun/kdcn)**|\n", "2306.02050": "|**2023-06-06**|**Provable Dynamic Fusion for Low-Quality Multimodal Data**|Qingyang Zhang et.al.|[2306.02050v2](http://arxiv.org/abs/2306.02050v2)|**[link](https://github.com/qingyangzhang/qmf)**|\n", "2306.01929": "|**2023-06-02**|**Recent Advances of Local Mechanisms in Computer Vision: A Survey and Outlook of Recent Work**|Qiangchang Wang et.al.|[2306.01929v1](http://arxiv.org/abs/2306.01929v1)|null|\n", "2306.03899": "|**2023-06-06**|**Towards Label-free Scene Understanding by Vision Foundation Models**|Runnan Chen et.al.|[2306.03899v1](http://arxiv.org/abs/2306.03899v1)|**[link](https://github.com/runnanchen/label-free-scene-understanding)**|\n", "2306.03810": "|**2023-06-06**|**X-Align++: cross-modal cross-view alignment for Bird's-eye-view segmentation**|Shubhankar Borse et.al.|[2306.03810v1](http://arxiv.org/abs/2306.03810v1)|null|\n", "2306.03802": "|**2023-06-06**|**Learning to Ground Instructional Articles in Videos through Narrations**|Effrosyni Mavroudi et.al.|[2306.03802v1](http://arxiv.org/abs/2306.03802v1)|null|\n", "2306.03730": "|**2023-06-06**|**Modality-Agnostic Learning for Medical Image Segmentation Using Multi-modality Self-distillation**|Qisheng He et.al.|[2306.03730v1](http://arxiv.org/abs/2306.03730v1)|null|\n", "2306.03678": "|**2023-06-06**|**On the Difference of BERT-style and CLIP-style Text Encoders**|Zhihong Chen et.al.|[2306.03678v1](http://arxiv.org/abs/2306.03678v1)|**[link](https://github.com/zhjohnchan/bert-clip-synesthesia)**|\n", "2306.03650": "|**2023-06-06**|**A Quantum Probability Driven Framework for Joint Multi-Modal Sarcasm, Sentiment and Emotion Analysis**|Yaochen Liu et.al.|[2306.03650v1](http://arxiv.org/abs/2306.03650v1)|null|\n", "2306.03617": "|**2023-06-06**|**A Data-Efficient Approach for Long-Term Human Motion Prediction Using Maps of Dynamics**|Yufei Zhu et.al.|[2306.03617v1](http://arxiv.org/abs/2306.03617v1)|null|\n", "2306.03367": "|**2023-06-06**|**Bridging the Gap Between Multi-Step and One-Shot Trajectory Prediction via Self-Supervision**|Faris Janjo\u0161 et.al.|[2306.03367v1](http://arxiv.org/abs/2306.03367v1)|null|\n", "2306.03252": "|**2023-06-05**|**RACECAR -- The Dataset for High-Speed Autonomous Racing**|Amar Kulkarni et.al.|[2306.03252v1](http://arxiv.org/abs/2306.03252v1)|**[link](https://github.com/linklab-uva/racecar_data)**|\n", "2306.04445": "|**2023-06-07**|**Multi-modal Latent Diffusion**|Mustapha Bounoua et.al.|[2306.04445v1](http://arxiv.org/abs/2306.04445v1)|null|\n", "2306.04387": "|**2023-06-08**|**M$^3$IT: A Large-Scale Dataset towards Multi-Modal Multilingual Instruction Tuning**|Lei Li et.al.|[2306.04387v2](http://arxiv.org/abs/2306.04387v2)|null|\n", "2306.04362": "|**2023-06-07**|**Youku-mPLUG: A 10 Million Large-scale Chinese Video-Language Dataset for Pre-training and Benchmarks**|Haiyang Xu et.al.|[2306.04362v1](http://arxiv.org/abs/2306.04362v1)|**[link](https://github.com/x-plug/youku-mplug)**|\n", "2306.04272": "|**2023-06-07**|**On the Generalization of Multi-modal Contrastive Learning**|Qi Zhang et.al.|[2306.04272v1](http://arxiv.org/abs/2306.04272v1)|**[link](https://github.com/pku-ml/clip-help-simclr)**|\n", "2306.04163": "|**2023-06-07**|**Enhancing Virtual Assistant Intelligence: Precise Area Targeting for Instance-level User Intents beyond Metadata**|Mengyu Chen et.al.|[2306.04163v1](http://arxiv.org/abs/2306.04163v1)|null|\n", "2306.04083": "|**2023-06-07**|**Coverage Path Planning with Budget Constraints for Multiple Unmanned Ground Vehicles**|Vu Phi Tran et.al.|[2306.04083v1](http://arxiv.org/abs/2306.04083v1)|null|\n", "2306.04021": "|**2023-06-06**|**Energy-Based Models for Cross-Modal Localization using Convolutional Transformers**|Alan Wu et.al.|[2306.04021v1](http://arxiv.org/abs/2306.04021v1)|null|\n", "2306.05425": "|**2023-06-08**|**MIMIC-IT: Multi-Modal In-Context Instruction Tuning**|Bo Li et.al.|[2306.05425v1](http://arxiv.org/abs/2306.05425v1)|**[link](https://github.com/luodian/otter)**|\n", "2306.04928": "|**2023-06-08**|**Underwater Intention Recognition using Head Motion and Throat Vibration for Supernumerary Robotic Assistance**|Yuqin Guo et.al.|[2306.04928v1](http://arxiv.org/abs/2306.04928v1)|null|\n", "2306.06048": "|**2023-06-09**|**How Does Fine-Tuning Impact Out-of-Distribution Detection for Vision-Language Models?**|Yifei Ming et.al.|[2306.06048v1](http://arxiv.org/abs/2306.06048v1)|null|\n", "2306.05716": "|**2023-06-09**|**Pave the Way to Grasp Anything: Transferring Foundation Models for Universal Pick-Place Robots**|Jiange Yang et.al.|[2306.05716v1](http://arxiv.org/abs/2306.05716v1)|null|\n", "2306.05493": "|**2023-06-08**|**Multi-Modal Classifiers for Open-Vocabulary Object Detection**|Prannay Kaul et.al.|[2306.05493v1](http://arxiv.org/abs/2306.05493v1)|null|\n", "2306.07272": "|**2023-06-12**|**Zero-shot Composed Text-Image Retrieval**|Yikun Liu et.al.|[2306.07272v1](http://arxiv.org/abs/2306.07272v1)|**[link](https://github.com/Code-kunkun/ZS-CIR)**|\n", "2306.07257": "|**2023-06-12**|**MovieFactory: Automatic Movie Creation from Text using Large Generative Models for Language and Images**|Junchen Zhu et.al.|[2306.07257v1](http://arxiv.org/abs/2306.07257v1)|null|\n", "2306.07207": "|**2023-06-12**|**Valley: Video Assistant with Large Language model Enhanced abilitY**|Ruipu Luo et.al.|[2306.07207v1](http://arxiv.org/abs/2306.07207v1)|**[link](https://github.com/rupertluo/valley)**|\n", "2306.07196": "|**2023-06-12**|**Retrieval-Enhanced Contrastive Vision-Text Models**|Ahmet Iscen et.al.|[2306.07196v1](http://arxiv.org/abs/2306.07196v1)|null|\n", "2306.07187": "|**2023-06-12**|**Video-to-Music Recommendation using Temporal Alignment of Segments**|Laure Pr\u00e9tet et.al.|[2306.07187v1](http://arxiv.org/abs/2306.07187v1)|null|\n", "2306.07096": "|**2023-06-12**|**Global and Local Semantic Completion Learning for Vision-Language Pre-training**|Rong-Cheng Tu et.al.|[2306.07096v1](http://arxiv.org/abs/2306.07096v1)|**[link](https://github.com/iigroup/scl)**|\n", "2306.06885": "|**2023-06-12**|**NPVForensics: Jointing Non-critical Phonemes and Visemes for Deepfake Detection**|Yu Chen et.al.|[2306.06885v1](http://arxiv.org/abs/2306.06885v1)|null|\n", "2306.06691": "|**2023-06-11**|**Self-Enhancement Improves Text-Image Retrieval in Foundation Visual-Language Models**|Yuguang Yang et.al.|[2306.06691v1](http://arxiv.org/abs/2306.06691v1)|null|\n", "2306.06687": "|**2023-06-11**|**LAMM: Language-Assisted Multi-Modal Instruction-Tuning Dataset, Framework, and Benchmark**|Zhenfei Yin et.al.|[2306.06687v1](http://arxiv.org/abs/2306.06687v1)|**[link](https://github.com/openlamm/lamm)**|\n", "2306.06615": "|**2023-06-11**|**Empowering Molecule Discovery for Molecule-Caption Translation with Large Language Models: A ChatGPT Perspective**|Jiatong Li et.al.|[2306.06615v1](http://arxiv.org/abs/2306.06615v1)|**[link](https://github.com/phenixace/molregpt)**|\n", "2306.06583": "|**2023-06-11**|**REACT2023: the first Multi-modal Multiple Appropriate Facial Reaction Generation Challenge**|Siyang Song et.al.|[2306.06583v1](http://arxiv.org/abs/2306.06583v1)|**[link](https://github.com/reactmultimodalchallenge/baseline_react2023)**|\n", "2306.06494": "|**2023-06-10**|**Multi-modal Pre-training for Medical Vision-language Understanding and Generation: An Empirical Study with A New Benchmark**|Li Xu et.al.|[2306.06494v1](http://arxiv.org/abs/2306.06494v1)|**[link](https://github.com/control-xl/medical-vision-langauge-transformer)**|\n", "2306.06476": "|**2023-06-10**|**Modality Influence in Multimodal Machine Learning**|Abdelhamid Haouhat et.al.|[2306.06476v1](http://arxiv.org/abs/2306.06476v1)|null|\n", "2306.06465": "|**2023-06-10**|**Simultaneous Trajectory Optimization and Contact Selection for Multi-Modal Manipulation Planning**|Mengchao Zhang et.al.|[2306.06465v1](http://arxiv.org/abs/2306.06465v1)|null|\n", "2306.06410": "|**2023-06-10**|**OpenSR: Open-Modality Speech Recognition via Maintaining Multi-Modality Alignment**|Xize Cheng et.al.|[2306.06410v1](http://arxiv.org/abs/2306.06410v1)|**[link](https://github.com/exgc/opensr)**|\n", "2306.07744": "|**2023-06-13**|**Contrastive Learning-Based Audio to Lyrics Alignment for Multiple Languages**|Simon Durand et.al.|[2306.07744v1](http://arxiv.org/abs/2306.07744v1)|**[link](https://github.com/f90/jamendolyrics)**|\n", "2306.07646": "|**2023-06-13**|**Enhanced Multimodal Representation Learning with Cross-modal KD**|Mengxi Chen et.al.|[2306.07646v1](http://arxiv.org/abs/2306.07646v1)|null|\n", "2306.07505": "|**2023-06-13**|**Deep learning radiomics for assessment of gastroesophageal varices in people with compensated advanced chronic liver disease**|Lan Wang et.al.|[2306.07505v1](http://arxiv.org/abs/2306.07505v1)|null|\n", "2306.07303": "|**2023-06-11**|**A Comprehensive Survey on Applications of Transformers for Deep Learning Tasks**|Saidul Islam et.al.|[2306.07303v1](http://arxiv.org/abs/2306.07303v1)|null|\n", "2306.09347": "|**2023-06-15**|**Segment Any Point Cloud Sequences by Distilling Vision Foundation Models**|Youquan Liu et.al.|[2306.09347v1](http://arxiv.org/abs/2306.09347v1)|**[link](https://github.com/youquanl/segment-any-point-cloud)**|\n", "2306.09265": "|**2023-06-15**|**LVLM-eHub: A Comprehensive Evaluation Benchmark for Large Vision-Language Models**|Peng Xu et.al.|[2306.09265v1](http://arxiv.org/abs/2306.09265v1)|**[link](https://github.com/opengvlab/multi-modality-arena)**|\n", "2306.09093": "|**2023-06-15**|**Macaw-LLM: Multi-Modal Language Modeling with Image, Audio, Video, and Text Integration**|Chenyang Lyu et.al.|[2306.09093v1](http://arxiv.org/abs/2306.09093v1)|**[link](https://github.com/lyuchenyang/macaw-llm)**|\n", "2306.09067": "|**2023-06-15**|**Winning Solution for the CVPR2023 Visual Anomaly and Novelty Detection Challenge: Multimodal Prompting for Data-centric Anomaly Detection**|Yunkang Cao et.al.|[2306.09067v1](http://arxiv.org/abs/2306.09067v1)|**[link](https://github.com/caoyunkang/segment-any-anomaly)**|\n", "2306.08966": "|**2023-06-15**|**Training Multimedia Event Extraction With Generated Images and Captions**|Zilin Du et.al.|[2306.08966v1](http://arxiv.org/abs/2306.08966v1)|null|\n", "2306.08893": "|**2023-06-15**|**LOVM: Language-Only Vision Model Selection**|Orr Zohar et.al.|[2306.08893v1](http://arxiv.org/abs/2306.08893v1)|**[link](https://github.com/orrzohar/lovm)**|\n", "2306.08871": "|**2023-06-15**|**Med-MMHL: A Multi-Modal Dataset for Detecting Human- and LLM-Generated Misinformation in the Medical Domain**|Yanshen Sun et.al.|[2306.08871v1](http://arxiv.org/abs/2306.08871v1)|**[link](https://github.com/styxsys0927/med-mmhl)**|\n", "2306.08832": "|**2023-06-15**|**Contrasting Intra-Modal and Ranking Cross-Modal Hard Negatives to Enhance Visio-Linguistic Fine-grained Understanding**|Le Zhang et.al.|[2306.08832v1](http://arxiv.org/abs/2306.08832v1)|**[link](https://github.com/magiccircuit/enhance-finegrained)**|\n", "2306.08789": "|**2023-06-15**|**Efficient Token-Guided Image-Text Retrieval with Consistent Multimodal Contrastive Training**|Chong Liu et.al.|[2306.08789v1](http://arxiv.org/abs/2306.08789v1)|null|\n", "2306.08749": "|**2023-06-14**|**Utilizing Longitudinal Chest X-Rays and Reports to Pre-Fill Radiology Reports**|Qingqing Zhu et.al.|[2306.08749v1](http://arxiv.org/abs/2306.08749v1)|null|\n", "2306.08657": "|**2023-06-14**|**EMERSK -- Explainable Multimodal Emotion Recognition with Situational Knowledge**|Mijanur Palash et.al.|[2306.08657v1](http://arxiv.org/abs/2306.08657v1)|null|\n", "2306.08640": "|**2023-06-14**|**AssistGPT: A General Multi-modal Assistant that can Plan, Execute, Inspect, and Learn**|Difei Gao et.al.|[2306.08640v1](http://arxiv.org/abs/2306.08640v1)|null|\n", "2306.08522": "|**2023-06-14**|**Challenges of Indoor SLAM: A multi-modal multi-floor dataset for SLAM evaluation**|Pushyami Kaveti et.al.|[2306.08522v1](http://arxiv.org/abs/2306.08522v1)|**[link](https://github.com/neufieldrobotics/nufr-m3f)**|\n", "2306.08498": "|**2023-06-14**|**RISCLIP: Referring Image Segmentation Framework using CLIP**|Seoyeon Kim et.al.|[2306.08498v1](http://arxiv.org/abs/2306.08498v1)|**[link](https://github.com/Yeon07/RISCLIP)**|\n", "2306.08247": "|**2023-06-14**|**Diffusion in Diffusion: Cyclic One-Way Diffusion for Text-Vision-Conditioned Generation**|Yongqi Yang et.al.|[2306.08247v1](http://arxiv.org/abs/2306.08247v1)|null|\n", "2306.09851": "|**2023-06-16**|**Joint multi-modal Self-Supervised pre-training in Remote Sensing: Application to Methane Source Classification**|Paul Berg et.al.|[2306.09851v1](http://arxiv.org/abs/2306.09851v1)|null|\n", "2306.09546": "|**2023-06-15**|**Cross-Modal Video to Body-joints Augmentation for Rehabilitation Exercise Quality Assessment**|Ali Abedi et.al.|[2306.09546v1](http://arxiv.org/abs/2306.09546v1)|null|\n", "2306.09523": "|**2023-06-19**|**Tell Me Where to Go: A Composable Framework for Context-Aware Embodied Robot Navigation**|Harel Biggie et.al.|[2306.09523v2](http://arxiv.org/abs/2306.09523v2)|null|\n", "2306.09417": "|**2023-06-15**|**Diff-TTSG: Denoising probabilistic integrated speech and gesture synthesis**|Shivam Mehta et.al.|[2306.09417v1](http://arxiv.org/abs/2306.09417v1)|null|\n", "2306.11510": "|**2023-06-20**|**Pushing the Limits of 3D Shape Generation at Scale**|Wang Yu et.al.|[2306.11510v1](http://arxiv.org/abs/2306.11510v1)|null|\n", "2306.11504": "|**2023-06-20**|**Align, Adapt and Inject: Sound-guided Unified Image Generation**|Yue Yang et.al.|[2306.11504v1](http://arxiv.org/abs/2306.11504v1)|null|\n", "2306.11400": "|**2023-06-20**|**MuDPT: Multi-modal Deep-symphysis Prompt Tuning for Large Pre-trained Vision-Language Models**|Yongzhu Miao et.al.|[2306.11400v1](http://arxiv.org/abs/2306.11400v1)|**[link](https://github.com/mechrev0/mudpt)**|\n", "2306.11207": "|**2023-06-22**|**Quilt-1M: One Million Image-Text Pairs for Histopathology**|Wisdom Oluchi Ikezogwo et.al.|[2306.11207v2](http://arxiv.org/abs/2306.11207v2)|**[link](https://github.com/wisdomikezogwo/quilt1m)**|\n", "2306.11137": "|**2023-06-19**|**Deep Learning Framework with Multi-Head Dilated Encoders for Enhanced Segmentation of Cervical Cancer on Multiparametric Magnetic Resonance Imaging**|Reza Kalantar et.al.|[2306.11137v1](http://arxiv.org/abs/2306.11137v1)|null|\n", "2306.11065": "|**2023-06-19**|**Cross-Modal Attribute Insertions for Assessing the Robustness of Vision-and-Language Learning**|Shivaen Ramshetty et.al.|[2306.11065v1](http://arxiv.org/abs/2306.11065v1)|**[link](https://github.com/claws-lab/multimodal-robustness-xmai)**|\n", "2306.11025": "|**2023-06-19**|**Temporal Data Meets LLM -- Explainable Financial Time Series Forecasting**|Xinli Yu et.al.|[2306.11025v1](http://arxiv.org/abs/2306.11025v1)|null|\n", "2306.11020": "|**2023-06-19**|**Dual-Gated Fusion with Prefix-Tuning for Multi-Modal Relation Extraction**|Qian Li et.al.|[2306.11020v1](http://arxiv.org/abs/2306.11020v1)|null|\n", "2306.10830": "|**2023-06-19**|**3D VR Sketch Guided 3D Shape Prototyping and Exploration**|Ling Luo et.al.|[2306.10830v1](http://arxiv.org/abs/2306.10830v1)|**[link](https://github.com/rowl1ng/3dsketch2shape)**|\n", "2306.10799": "|**2023-06-19**|**SelfTalk: A Self-Supervised Commutative Training Diagram to Comprehend 3D Talking Faces**|Ziqiao Peng et.al.|[2306.10799v1](http://arxiv.org/abs/2306.10799v1)|**[link](https://github.com/psyai-net/SelfTalk_release)**|\n", "2306.10772": "|**2023-06-19**|**Learning an Interpretable End-to-End Network for Real-Time Acoustic Beamforming**|Hao Liang et.al.|[2306.10772v1](http://arxiv.org/abs/2306.10772v1)|null|\n", "2306.10750": "|**2023-06-19**|**WiCo: Win-win Cooperation of Bottom-up and Top-down Referring Image Segmentation**|Zesen Cheng et.al.|[2306.10750v1](http://arxiv.org/abs/2306.10750v1)|null|\n", "2306.10730": "|**2023-06-19**|**UniG3D: A Unified 3D Object Generation Dataset**|Qinghong Sun et.al.|[2306.10730v1](http://arxiv.org/abs/2306.10730v1)|null|\n", "2306.10687": "|**2023-06-19**|**Categories of Response-Based, Feature-Based, and Relation-Based Knowledge Distillation**|Chuanguang Yang et.al.|[2306.10687v1](http://arxiv.org/abs/2306.10687v1)|null|\n", "2306.10567": "|**2023-06-18**|**MIR-GAN: Refining Frame-Level Modality-Invariant Representations with Adversarial Network for Audio-Visual Speech Recognition**|Yuchen Hu et.al.|[2306.10567v1](http://arxiv.org/abs/2306.10567v1)|**[link](https://github.com/yuchen005/mir-gan)**|\n", "2306.12387": "|**2023-06-21**|**Solving Dialogue Grounding Embodied Task in a Simulated Environment using Further Masked Language Modeling**|Weijie Jack Zhang et.al.|[2306.12387v1](http://arxiv.org/abs/2306.12387v1)|null|\n", "2306.11762": "|**2023-06-20**|**MultiEarth 2023 Deforestation Challenge -- Team FOREVER**|Seunghan Park et.al.|[2306.11762v1](http://arxiv.org/abs/2306.11762v1)|null|\n", "2306.13076": "|**2023-06-22**|**A Comparison of Time-based Models for Multimodal Emotion Recognition**|Ege Kesim et.al.|[2306.13076v1](http://arxiv.org/abs/2306.13076v1)|null|\n", "2306.12819": "|**2023-06-22**|**XACML Extension for Graphs: Flexible Authorization Policy Specification and Datastore-independent Enforcement**|Aya Mohamed et.al.|[2306.12819v1](http://arxiv.org/abs/2306.12819v1)|null|\n", "2306.12795": "|**2023-06-22**|**Learning Unseen Modality Interaction**|Yunhua Zhang et.al.|[2306.12795v1](http://arxiv.org/abs/2306.12795v1)|null|\n", "2306.12725": "|**2023-06-22**|**Generative Multimodal Entity Linking**|Senbao Shi et.al.|[2306.12725v1](http://arxiv.org/abs/2306.12725v1)|**[link](https://github.com/hitsz-tmg/gemel)**|\n", "2306.12559": "|**2023-06-21**|**Exploring the Role of Audio in Video Captioning**|Yuhan Shen et.al.|[2306.12559v1](http://arxiv.org/abs/2306.12559v1)|null|\n", "2306.12525": "|**2023-06-21**|**LPFormer: LiDAR Pose Estimation Transformer with Multi-Task Network**|Dongqiangzi Ye et.al.|[2306.12525v1](http://arxiv.org/abs/2306.12525v1)|null|\n", "2306.13592": "|**2023-06-23**|**TACOformer:Token-channel compounded Cross Attention for Multimodal Emotion Recognition**|Xinda Li et.al.|[2306.13592v1](http://arxiv.org/abs/2306.13592v1)|null|\n", "2306.13285": "|**2023-06-23**|**Learning Scene Flow With Skeleton Guidance For 3D Action Recognition**|Vasileios Magoulianitis et.al.|[2306.13285v1](http://arxiv.org/abs/2306.13285v1)|null|\n", "2306.13240": "|**2023-06-22**|**Continuous Online Extrinsic Calibration of Fisheye Camera and LiDAR**|Jack Borer et.al.|[2306.13240v1](http://arxiv.org/abs/2306.13240v1)|null|\n", "2306.14795": "|**2023-06-26**|**MotionGPT: Human Motion as a Foreign Language**|Biao Jiang et.al.|[2306.14795v1](http://arxiv.org/abs/2306.14795v1)|**[link](https://github.com/openmotionlab/motiongpt)**|\n", "2306.14565": "|**2023-06-26**|**Aligning Large Multi-Modal Model with Robust Instruction Tuning**|Fuxiao Liu et.al.|[2306.14565v1](http://arxiv.org/abs/2306.14565v1)|**[link](https://github.com/FuxiaoLiu/LRV-Instruction)**|\n", "2306.14406": "|**2023-06-26**|**TCEIP: Text Condition Embedded Regression Network for Dental Implant Position Prediction**|Xinquan Yang et.al.|[2306.14406v1](http://arxiv.org/abs/2306.14406v1)|null|\n", "2306.14399": "|**2023-06-26**|**Mutual Query Network for Multi-Modal Product Image Segmentation**|Yun Guo et.al.|[2306.14399v1](http://arxiv.org/abs/2306.14399v1)|**[link](https://github.com/weifeng-github/mqn)**|\n", "2306.14177": "|**2023-06-25**|**Enhancing Mapless Trajectory Prediction through Knowledge Distillation**|Yuning Wang et.al.|[2306.14177v1](http://arxiv.org/abs/2306.14177v1)|null|\n", "2306.14170": "|**2023-06-25**|**AV-SepFormer: Cross-Attention SepFormer for Audio-Visual Target Speaker Extraction**|Jiuxin Lin et.al.|[2306.14170v1](http://arxiv.org/abs/2306.14170v1)|**[link](https://github.com/lin9x/av-sepformer)**|\n", "2306.14143": "|**2023-06-25**|**Intelligent Multi-Modal Sensing-Communication Integration: Synesthesia of Machines**|Xiang Cheng et.al.|[2306.14143v1](http://arxiv.org/abs/2306.14143v1)|null|\n", "2306.14125": "|**2023-06-25**|**M$^3$SC: A Generic Dataset for Mixed Multi-Modal (MMM) Sensing and Communication Integration**|Xiang Cheng et.al.|[2306.14125v1](http://arxiv.org/abs/2306.14125v1)|null|\n", "2306.14112": "|**2023-06-25**|**Enhancing Dynamic Image Advertising with Vision-Language Pre-training**|Zhoufutu Wen et.al.|[2306.14112v1](http://arxiv.org/abs/2306.14112v1)|null|\n", "2306.13856": "|**2023-06-24**|**Learning-to-Rank Meets Language: Boosting Language-Driven Ordering Alignment for Ordinal Classification**|Rui Wang et.al.|[2306.13856v1](http://arxiv.org/abs/2306.13856v1)|**[link](https://github.com/raywang335/l2rclip)**|\n", "2306.13804": "|**2023-06-27**|**Cross-Language Speech Emotion Recognition Using Multimodal Dual Attention Transformers**|Syed Aun Muhammad Zaidi et.al.|[2306.13804v2](http://arxiv.org/abs/2306.13804v2)|null|\n", "2306.15644": "|**2023-06-27**|**Style-transfer based Speech and Audio-visual Scene Understanding for Robot Action Sequence Acquisition from Videos**|Chiori Hori et.al.|[2306.15644v1](http://arxiv.org/abs/2306.15644v1)|null|\n", "2306.15612": "|**2023-06-27**|**Rethinking Cross-Entropy Loss for Stereo Matching Networks**|Peng Xu et.al.|[2306.15612v1](http://arxiv.org/abs/2306.15612v1)|null|\n", "2306.15605": "|**2023-06-27**|**Deep Normalizing Flows for State Estimation**|Harrison Delecki et.al.|[2306.15605v1](http://arxiv.org/abs/2306.15605v1)|**[link](https://github.com/sisl/deepnfstateestimation)**|\n", "2306.15464": "|**2023-06-27**|**Large-scale unsupervised audio pre-training for video-to-speech synthesis**|Triantafyllos Kefalas et.al.|[2306.15464v1](http://arxiv.org/abs/2306.15464v1)|null|\n", "2306.15255": "|**2023-06-27**|**GroundNLQ @ Ego4D Natural Language Queries Challenge 2023**|Zhijian Hou et.al.|[2306.15255v1](http://arxiv.org/abs/2306.15255v1)|**[link](https://github.com/houzhijian/groundnlq)**|\n", "2306.15231": "|**2023-06-27**|**Emulating Reader Behaviors for Fake News Detection**|Junwei Yin et.al.|[2306.15231v1](http://arxiv.org/abs/2306.15231v1)|null|\n", "2306.15114": "|**2023-06-26**|**Transfer: Cross Modality Knowledge Transfer using Adversarial Networks -- A Study on Gesture Recognition**|Payal Kamboj et.al.|[2306.15114v1](http://arxiv.org/abs/2306.15114v1)|null|\n", "2306.16349": "|**2023-06-28**|**Accurate, uncertainty-aware classification of molecular chemical motifs from multi-modal X-ray absorption spectroscopy**|Matthew R. Carbone et.al.|[2306.16349v1](http://arxiv.org/abs/2306.16349v1)|null|\n", "2306.16329": "|**2023-06-28**|**DiffComplete: Diffusion-based Generative 3D Shape Completion**|Ruihang Chu et.al.|[2306.16329v1](http://arxiv.org/abs/2306.16329v1)|null|\n", "2306.16207": "|**2023-06-28**|**Inferring the Goals of Communicating Agents from Actions and Instructions**|Lance Ying et.al.|[2306.16207v1](http://arxiv.org/abs/2306.16207v1)|null|\n", "2306.16034": "|**2023-06-28**|**Stone Needle: A General Multimodal Large-scale Model Framework towards Healthcare**|Weihua Liu et.al.|[2306.16034v1](http://arxiv.org/abs/2306.16034v1)|null|\n", "2306.15977": "|**2023-06-28**|**A Dimensional Structure based Knowledge Distillation Method for Cross-Modal Learning**|Lingyu Si et.al.|[2306.15977v1](http://arxiv.org/abs/2306.15977v1)|null|\n", "2306.15955": "|**2023-06-29**|**Bridging the Gap: Neural Collapse Inspired Prompt Tuning for Generalization under Class Imbalance**|Didi Zhu et.al.|[2306.15955v2](http://arxiv.org/abs/2306.15955v2)|null|\n", "2306.15946": "|**2023-06-28**|**Knowledge-Enhanced Hierarchical Information Correlation Learning for Multi-Modal Rumor Detection**|Jiawei Liu et.al.|[2306.15946v1](http://arxiv.org/abs/2306.15946v1)|null|\n", "2306.15943": "|**2023-06-28**|**No Transfers Required: Integrating Last Mile with Public Transit Using Opti-Mile**|Raashid Altaf et.al.|[2306.15943v1](http://arxiv.org/abs/2306.15943v1)|null|\n", "2306.15837": "|**2023-06-27**|**Symbol emergence as interpersonal cross-situational learning: the emergence of lexical knowledge with combinatoriality**|Yoshinobu Hagiwara et.al.|[2306.15837v1](http://arxiv.org/abs/2306.15837v1)|null|\n", "2306.15808": "|**2023-06-27**|**Classification of Infant Sleep/Wake States: Cross-Attention among Large Scale Pretrained Transformer Networks using Audio, ECG, and IMU Data**|Kai Chieh Chang et.al.|[2306.15808v1](http://arxiv.org/abs/2306.15808v1)|null|\n", "2306.15711": "|**2023-06-27**|**Semi-supervised Multimodal Representation Learning through a Global Workspace**|Benjamin Devillers et.al.|[2306.15711v1](http://arxiv.org/abs/2306.15711v1)|**[link](https://github.com/bdvllrs/bimgw)**|\n", "2306.17115": "|**2023-07-03**|**Michelangelo: Conditional 3D Shape Generation based on Shape-Image-Text Aligned Latent Representation**|Zibo Zhao et.al.|[2306.17115v2](http://arxiv.org/abs/2306.17115v2)|**[link](https://github.com/neuralcarver/michelangelo)**|\n", "2306.17107": "|**2023-06-29**|**LLaVAR: Enhanced Visual Instruction Tuning for Text-Rich Image Understanding**|Yanzhe Zhang et.al.|[2306.17107v1](http://arxiv.org/abs/2306.17107v1)|**[link](https://github.com/SALT-NLP/LLaVAR)**|\n", "2306.17000": "|**2023-06-29**|**MotionTrack: End-to-End Transformer-based Multi-Object Tracing with LiDAR-Camera Fusion**|Ce Zhang et.al.|[2306.17000v1](http://arxiv.org/abs/2306.17000v1)|null|\n", "2306.16927": "|**2023-06-29**|**End-to-end Autonomous Driving: Challenges and Frontiers**|Li Chen et.al.|[2306.16927v1](http://arxiv.org/abs/2306.16927v1)|**[link](https://github.com/opendrivelab/end-to-end-autonomous-driving)**|\n", "2306.16862": "|**2023-06-29**|**Sustainable Palm Tree Farming: Leveraging IoT and Multi-Modal Data for Early Detection and Mapping of Red Palm Weevil**|Yosra Hajjaji et.al.|[2306.16862v1](http://arxiv.org/abs/2306.16862v1)|null|\n", "2306.16762": "|**2023-06-29**|**Unified Language Representation for Question Answering over Text, Tables, and Images**|Bowen Yu et.al.|[2306.16762v1](http://arxiv.org/abs/2306.16762v1)|null|\n", "2306.16478": "|**2023-06-28**|**Pre-Training Multi-Modal Dense Retrievers for Outside-Knowledge Visual Question Answering**|Alireza Salemi et.al.|[2306.16478v1](http://arxiv.org/abs/2306.16478v1)|**[link](https://github.com/alirezasalemi7/pretraining-multimodal-dense-retriever-for-okvqa)**|\n", "2306.17525": "|**2023-06-30**|**MeLM, a generative pretrained language modeling framework that solves forward and inverse mechanics problems**|Markus J. Buehler et.al.|[2306.17525v1](http://arxiv.org/abs/2306.17525v1)|null|\n", "2306.17400": "|**2023-06-30**|**Topological Data Analysis Guided Segment Anything Model Prompt Optimization for Zero-Shot Segmentation in Biological Imaging**|Ruben Glatt et.al.|[2306.17400v1](http://arxiv.org/abs/2306.17400v1)|null|\n", "2306.17371": "|**2023-06-30**|**Capturing functional connectomics using Riemannian partial least squares**|Matt Ryan et.al.|[2306.17371v1](http://arxiv.org/abs/2306.17371v1)|null|\n", "2307.01146": "|**2023-07-05**|**AVSegFormer: Audio-Visual Segmentation with Transformer**|Shengyi Gao et.al.|[2307.01146v2](http://arxiv.org/abs/2307.01146v2)|**[link](https://github.com/vvvb-github/avsegformer)**|\n", "2307.01124": "|**2023-07-03**|**Cross-modality Attention Adapter: A Glioma Segmentation Fine-tuning Method for SAM Using Multimodal Brain MR Images**|Xiaoyu Shi et.al.|[2307.01124v1](http://arxiv.org/abs/2307.01124v1)|null|\n", "2307.01121": "|**2023-07-03**|**Artifacts Mapping: Multi-Modal Semantic Mapping for Object Detection and 3D Localization**|Federico Rollo et.al.|[2307.01121v1](http://arxiv.org/abs/2307.01121v1)|null|\n", "2307.01047": "|**2023-07-03**|**Cross-modal Place Recognition in Image Databases using Event-based Sensors**|Xiang Ji et.al.|[2307.01047v1](http://arxiv.org/abs/2307.01047v1)|null|\n", "2307.01003": "|**2023-07-03**|**Visual Instruction Tuning with Polite Flamingo**|Delong Chen et.al.|[2307.01003v1](http://arxiv.org/abs/2307.01003v1)|**[link](https://github.com/chendelong1999/polite_flamingo)**|\n", "2307.00997": "|**2023-07-03**|**RefSAM: Efficiently Adapting Segmenting Anything Model for Referring Video Object Segmentation**|Yonglin Li et.al.|[2307.00997v1](http://arxiv.org/abs/2307.00997v1)|**[link](https://github.com/lancasterli/refsam)**|\n", "2307.00954": "|**2023-07-03**|**HODINet: High-Order Discrepant Interaction Network for RGB-D Salient Object Detection**|Kang Yi et.al.|[2307.00954v1](http://arxiv.org/abs/2307.00954v1)|null|\n", "2307.00877": "|**2023-07-03**|**Exploring the Multi-modal Demand Dynamics During Transport System Disruptions**|Ali Shateri Benam et.al.|[2307.00877v1](http://arxiv.org/abs/2307.00877v1)|null|\n", "2307.00873": "|**2023-07-03**|**End-To-End Prediction of Knee Osteoarthritis Progression With Multi-Modal Transformers**|Egor Panfilov et.al.|[2307.00873v1](http://arxiv.org/abs/2307.00873v1)|null|\n", "2307.00716": "|**2023-07-03**|**JourneyDB: A Benchmark for Generative Image Understanding**|Junting Pan et.al.|[2307.00716v1](http://arxiv.org/abs/2307.00716v1)|null|\n", "2307.00671": "|**2023-07-02**|**Leveraging Multi-modal Sensing for Robotic Insertion Tasks in R&D Laboratories**|Aaron Butterworth et.al.|[2307.00671v1](http://arxiv.org/abs/2307.00671v1)|null|\n", "2307.00610": "|**2023-07-02**|**Fraunhofer SIT at CheckThat! 2023: Mixing Single-Modal Classifiers to Estimate the Check-Worthiness of Multi-Modal Tweets**|Raphael Frick et.al.|[2307.00610v1](http://arxiv.org/abs/2307.00610v1)|null|\n", "2307.00595": "|**2023-07-02**|**RH20T: A Robotic Dataset for Learning Diverse Skills in One-Shot**|Hao-Shu Fang et.al.|[2307.00595v1](http://arxiv.org/abs/2307.00595v1)|null|\n", "2307.00536": "|**2023-07-02**|**Referring Video Object Segmentation with Inter-Frame Interaction and Cross-Modal Correlation**|Meng Lan et.al.|[2307.00536v1](http://arxiv.org/abs/2307.00536v1)|null|\n", "2307.00398": "|**2023-07-01**|**ProbVLM: Probabilistic Adapter for Frozen Vison-Language Models**|Uddeshya Upadhyay et.al.|[2307.00398v1](http://arxiv.org/abs/2307.00398v1)|**[link](https://github.com/explainableml/probvlm)**|\n", "2307.02469": "|**2023-07-05**|**What Matters in Training a GPT4-Style Language Model with Multimodal Inputs?**|Yan Zeng et.al.|[2307.02469v1](http://arxiv.org/abs/2307.02469v1)|null|\n", "2307.02280": "|**2023-07-05**|**Interactive Image Segmentation with Cross-Modality Vision Transformers**|Kun Li et.al.|[2307.02280v1](http://arxiv.org/abs/2307.02280v1)|**[link](https://github.com/lik1996/icmformer)**|\n", "2307.02041": "|**2023-07-05**|**Multimodal Imbalance-Aware Gradient Modulation for Weakly-supervised Audio-Visual Video Parsing**|Jie Fu et.al.|[2307.02041v1](http://arxiv.org/abs/2307.02041v1)|null|\n", "2307.02003": "|**2023-07-05**|**Multi-Modal Prototypes for Open-Set Semantic Segmentation**|Yuhuan Yang et.al.|[2307.02003v1](http://arxiv.org/abs/2307.02003v1)|null|\n", "2307.01947": "|**2023-07-04**|**Causal Video Summarizer for Video Exploration**|Jia-Hong Huang et.al.|[2307.01947v1](http://arxiv.org/abs/2307.01947v1)|null|\n", "2307.01824": "|**2023-07-04**|**Multi-Channel Feature Extraction for Virtual Histological Staining of Photon Absorption Remote Sensing Images**|Marian Boktor et.al.|[2307.01824v1](http://arxiv.org/abs/2307.01824v1)|null|\n", "2307.01798": "|**2023-07-04**|**Edge-aware Multi-task Network for Integrating Quantification Segmentation and Uncertainty Prediction of Liver Tumor on Multi-modality Non-contrast MRI**|Xiaojiao Xiao et.al.|[2307.01798v1](http://arxiv.org/abs/2307.01798v1)|null|\n", "2307.01741": "|**2023-07-04**|**Ben-ge: Extending BigEarthNet with Geographical and Environmental Data**|Michael Mommert et.al.|[2307.01741v1](http://arxiv.org/abs/2307.01741v1)|**[link](https://github.com/hsg-aiml/ben-ge)**|\n", "2307.01704": "|**2023-07-04**|**Graph-Ensemble Learning Model for Multi-label Skin Lesion Classification using Dermoscopy and Clinical Images**|Peng Tang et.al.|[2307.01704v1](http://arxiv.org/abs/2307.01704v1)|null|\n", "2307.01691": "|**2023-07-06**|**SeePrivacy: Automated Contextual Privacy Policy Generation for Mobile Applications**|Shidong Pan et.al.|[2307.01691v2](http://arxiv.org/abs/2307.01691v2)|**[link](https://github.com/cpp4app/cpp4app)**|\n", "2307.01577": "|**2023-07-04**|**Conceptual Cognitive Maps Formation with Neural Successor Networks and Word Embeddings**|Paul Stoewer et.al.|[2307.01577v1](http://arxiv.org/abs/2307.01577v1)|null|\n", "2307.01515": "|**2023-07-04**|**LPN: Language-guided Prototypical Network for few-shot classification**|Kaihui Cheng et.al.|[2307.01515v1](http://arxiv.org/abs/2307.01515v1)|null|\n", "2307.01425": "|**2023-07-04**|**Consistent Multimodal Generation via A Unified GAN Framework**|Zhen Zhu et.al.|[2307.01425v1](http://arxiv.org/abs/2307.01425v1)|null|\n", "2307.01422": "|**2023-07-04**|**Generative Flow Networks: a Markov Chain Perspective**|Tristan Deleu et.al.|[2307.01422v1](http://arxiv.org/abs/2307.01422v1)|null|\n", "2307.03068": "|**2023-07-06**|**A Hybrid End-to-End Spatio-Temporal Attention Neural Network with Graph-Smooth Signals for EEG Emotion Recognition**|Shadi Sartipi et.al.|[2307.03068v1](http://arxiv.org/abs/2307.03068v1)|null|\n", "2307.02978": "|**2023-07-06**|**Multi-modal multi-class Parkinson disease classification using CNN and decision level fusion**|Sushanta Kumar Sahu et.al.|[2307.02978v1](http://arxiv.org/abs/2307.02978v1)|null|\n", "2307.02971": "|**2023-07-06**|**On the Cultural Gap in Text-to-Image Generation**|Bingshuai Liu et.al.|[2307.02971v1](http://arxiv.org/abs/2307.02971v1)|null|\n", "2307.02862": "|**2023-07-06**|**A Critical Look at the Current Usage of Foundation Model for Dense Recognition Task**|Shiqi Yang et.al.|[2307.02862v1](http://arxiv.org/abs/2307.02862v1)|null|\n", "2307.02796": "|**2023-07-06**|**VerifAI: Verified Generative AI**|Nan Tang et.al.|[2307.02796v1](http://arxiv.org/abs/2307.02796v1)|null|\n", "2307.02761": "|**2023-07-06**|**Cross-Modal Content Inference and Feature Enrichment for Cold-Start Recommendation**|Haokai Ma et.al.|[2307.02761v1](http://arxiv.org/abs/2307.02761v1)|null|\n", "2307.02730": "|**2023-07-06**|**Fine-grained Action Analysis: A Multi-modality and Multi-task Dataset of Figure Skating**|Sheng-Lan Liu et.al.|[2307.02730v1](http://arxiv.org/abs/2307.02730v1)|null|\n", "2307.03706": "|**2023-07-07**|**Counterion-controlled phase equilibria in a charge-regulated polymer solution**|Giulia L. Celora et.al.|[2307.03706v1](http://arxiv.org/abs/2307.03706v1)|null|\n", "2307.03638": "|**2023-07-07**|**Physical-aware Cross-modal Adversarial Network for Wearable Sensor-based Human Action Recognition**|Jianyuan Ni et.al.|[2307.03638v1](http://arxiv.org/abs/2307.03638v1)|null|\n", "2307.03623": "|**2023-07-07**|**Robust Human Detection under Visual Degradation via Thermal and mmWave Radar Fusion**|Kaiwen Cai et.al.|[2307.03623v1](http://arxiv.org/abs/2307.03623v1)|**[link](https://github.com/ramdrop/utm)**|\n", "2307.03535": "|**2023-07-07**|**Matching in the Wild: Learning Anatomical Embeddings for Multi-Modality Images**|Xiaoyu Bai et.al.|[2307.03535v1](http://arxiv.org/abs/2307.03535v1)|null|\n", "2307.03427": "|**2023-07-07**|**Merging-Diverging Hybrid Transformer Networks for Survival Prediction in Head and Neck Cancer**|Mingyuan Meng et.al.|[2307.03427v1](http://arxiv.org/abs/2307.03427v1)|**[link](https://github.com/mungomeng/survival-xsurv)**|\n", "2307.03388": "|**2023-07-07**|**General-Purpose Multimodal Transformer meets Remote Sensing Semantic Segmentation**|Nhi Kieu et.al.|[2307.03388v1](http://arxiv.org/abs/2307.03388v1)|**[link](https://github.com/nhikieu/spatialvolumetricmultimodal)**|\n", "2307.03373": "|**2023-07-07**|**All in One: Exploring Unified Vision-Language Tracking with Multi-Modal Alignment**|Chunhui Zhang et.al.|[2307.03373v1](http://arxiv.org/abs/2307.03373v1)|null|\n", "2307.03339": "|**2023-07-07**|**Open-Vocabulary Object Detection via Scene Graph Discovery**|Hengcan Shi et.al.|[2307.03339v1](http://arxiv.org/abs/2307.03339v1)|null|\n", "2307.03274": "|**2023-07-06**|**It is not Sexually Suggestive, It is Educative. Separating Sex Education from Suggestive Content on TikTok Videos**|Enfa George et.al.|[2307.03274v1](http://arxiv.org/abs/2307.03274v1)|null|\n", "2307.03240": "|**2023-07-06**|**Adaptive Generation of Privileged Intermediate Information for Visible-Infrared Person Re-Identification**|Mahdi Alehdaghi et.al.|[2307.03240v1](http://arxiv.org/abs/2307.03240v1)|null|\n", "2307.03591": "|**2023-07-06**|**Structure Guided Multi-modal Pre-trained Transformer for Knowledge Graph Reasoning**|Ke Liang et.al.|[2307.03591v1](http://arxiv.org/abs/2307.03591v1)|null|\n", "2307.04751": "|**2023-07-10**|**Shelving, Stacking, Hanging: Relational Pose Diffusion for Multi-modal Rearrangement**|Anthony Simeonov et.al.|[2307.04751v1](http://arxiv.org/abs/2307.04751v1)|null|\n", "2307.04749": "|**2023-07-10**|**Divide, Evaluate, and Refine: Evaluating and Improving Text-to-Image Alignment with Iterative VQA Feedback**|Jaskirat Singh et.al.|[2307.04749v1](http://arxiv.org/abs/2307.04749v1)|null|\n", "2307.04722": "|**2023-07-10**|**Advances and Challenges in Meta-Learning: A Technical Review**|Anna Vettoruzzo et.al.|[2307.04722v1](http://arxiv.org/abs/2307.04722v1)|null|\n", "2307.04470": "|**2023-07-10**|**Test-Time Adaptation for Nighttime Color-Thermal Semantic Segmentation**|Yexin Liu et.al.|[2307.04470v1](http://arxiv.org/abs/2307.04470v1)|null|\n", "2307.04461": "|**2023-07-10**|**Multi-modal Graph Learning over UMLS Knowledge Graphs**|Manuel Burger et.al.|[2307.04461v1](http://arxiv.org/abs/2307.04461v1)|**[link](https://github.com/ratschlab/mmugl)**|\n", "2307.04421": "|**2023-07-13**|**Towards Enabling Cardiac Digital Twins of Myocardial Infarction Using Deep Computational Models for Inverse Inference**|Lei Li et.al.|[2307.04421v2](http://arxiv.org/abs/2307.04421v2)|null|\n", "2307.04361": "|**2023-07-10**|**Enhancing Cross-lingual Transfer via Phonemic Transcription Integration**|Hoang H. Nguyen et.al.|[2307.04361v1](http://arxiv.org/abs/2307.04361v1)|**[link](https://github.com/nhhoang96/phonemic_xlingual)**|\n", "2307.04296": "|**2023-07-10**|**K-Space-Aware Cross-Modality Score for Synthesized Neuroimage Quality Assessment**|Jinbao Wang et.al.|[2307.04296v1](http://arxiv.org/abs/2307.04296v1)|null|\n", "2307.04231": "|**2023-07-09**|**Mx2M: Masked Cross-Modality Modeling in Domain Adaptation for 3D Semantic Segmentation**|Boxiang Zhang et.al.|[2307.04231v1](http://arxiv.org/abs/2307.04231v1)|null|\n", "2307.04129": "|**2023-07-09**|**Cross-modal Orthogonal High-rank Augmentation for RGB-Event Transformer-trackers**|Zhiyu Zhu et.al.|[2307.04129v1](http://arxiv.org/abs/2307.04129v1)|**[link](https://github.com/ZHU-Zhiyu/High-Rank_RGB-Event_Tracker)**|\n", "2307.04091": "|**2023-07-09**|**CMDFusion: Bidirectional Fusion Network with Cross-modality Knowledge Distillation for LIDAR Semantic Segmentation**|Jun Cen et.al.|[2307.04091v1](http://arxiv.org/abs/2307.04091v1)|null|\n", "2307.03990": "|**2023-07-08**|**FTFDNet: Learning to Detect Talking Face Video Manipulation with Tri-Modality Interaction**|Ganglai Wang et.al.|[2307.03990v1](http://arxiv.org/abs/2307.03990v1)|null|\n", "2307.03942": "|**2023-07-08**|**Ariadne's Thread:Using Text Prompts to Improve Segmentation of Infected Areas from Chest X-ray images**|Yi Zhong et.al.|[2307.03942v1](http://arxiv.org/abs/2307.03942v1)|**[link](https://github.com/junelin2333/languidemedseg-miccai2023)**|\n", "2307.03903": "|**2023-07-08**|**Adversarial Self-Attack Defense and Spatial-Temporal Relation Mining for Visible-Infrared Video Person Re-Identification**|Huafeng Li et.al.|[2307.03903v1](http://arxiv.org/abs/2307.03903v1)|null|\n", "2307.03798": "|**2023-07-07**|**CLIPMasterPrints: Fooling Contrastive Language-Image Pre-training Using Latent Variable Evolution**|Matthias Freiberger et.al.|[2307.03798v1](http://arxiv.org/abs/2307.03798v1)|**[link](https://github.com/matfrei/clipmasterprints)**|\n", "2307.05463": "|**2023-07-11**|**EgoVLPv2: Egocentric Video-Language Pre-training with Fusion in the Backbone**|Shraman Pramanick et.al.|[2307.05463v1](http://arxiv.org/abs/2307.05463v1)|null|\n", "2307.05435": "|**2023-07-11**|**One-Versus-Others Attention: Scalable Multimodal Integration**|Michal Golovanevsky et.al.|[2307.05435v1](http://arxiv.org/abs/2307.05435v1)|**[link](https://github.com/rsinghlab/ovo)**|\n", "2307.04978": "|**2023-07-11**|**Diffusion idea exploration for art generation**|Nikhil Verma et.al.|[2307.04978v1](http://arxiv.org/abs/2307.04978v1)|null|\n", "2307.06281": "|**2023-07-12**|**MMBench: Is Your Multi-modal Model an All-around Player?**|Yuan Liu et.al.|[2307.06281v1](http://arxiv.org/abs/2307.06281v1)|**[link](https://github.com/InternLM/opencompass)**|\n", "2307.06174": "|**2023-07-12**|**Identification in Multiple Treatment Models under Discrete Variation**|Vishal Kamat et.al.|[2307.06174v1](http://arxiv.org/abs/2307.06174v1)|null|\n", "2307.05591": "|**2023-07-10**|**SITTA: A Semantic Image-Text Alignment for Image Captioning**|Fabian Paischer et.al.|[2307.05591v1](http://arxiv.org/abs/2307.05591v1)|**[link](https://github.com/ml-jku/semantic-image-text-alignment)**|\n", "2307.06505": "|**2023-07-13**|**WaterScenes: A Multi-Task 4D Radar-Camera Fusion Dataset and Benchmark for Autonomous Driving on Water Surfaces**|Shanliang Yao et.al.|[2307.06505v1](http://arxiv.org/abs/2307.06505v1)|**[link](https://github.com/waterscenes/waterscenes)**|\n", "2307.06424": "|**2023-07-12**|**Robust scalable initialization for Bayesian variational inference with multi-modal Laplace approximations**|Wyatt Bridgman et.al.|[2307.06424v1](http://arxiv.org/abs/2307.06424v1)|null|\n", "2307.07453": "|**2023-07-14**|**Investigation of Deep Learning-Based Filtered Density Function for Large Eddy Simulation of Turbulent Scalar Mixing**|Shubhangi Bansude et.al.|[2307.07453v1](http://arxiv.org/abs/2307.07453v1)|null|\n", "2307.07362": "|**2023-07-14**|**A scoping review on multimodal deep learning in biomedical images and texts**|Zhaoyi Sun et.al.|[2307.07362v1](http://arxiv.org/abs/2307.07362v1)|null|\n", "2307.07341": "|**2023-07-14**|**PiTL: Cross-modal Retrieval with Weakly-supervised Vision-language Pre-training via Prompting**|Zixin Guo et.al.|[2307.07341v1](http://arxiv.org/abs/2307.07341v1)|null|\n", "2307.07184": "|**2023-07-14**|**TVPR: Text-to-Video Person Retrieval and a New Benchmark**|Fan Ni et.al.|[2307.07184v1](http://arxiv.org/abs/2307.07184v1)|null|\n", "2307.07177": "|**2023-07-14**|**TriFormer: A Multi-modal Transformer Framework For Mild Cognitive Impairment Conversion Prediction**|Linfeng Liu et.al.|[2307.07177v1](http://arxiv.org/abs/2307.07177v1)|null|\n", "2307.07142": "|**2023-07-14**|**CFI2P: Coarse-to-Fine Cross-Modal Correspondence Learning for Image-to-Point Cloud Registration**|Gongxin Yao et.al.|[2307.07142v1](http://arxiv.org/abs/2307.07142v1)|null|\n", "2307.07135": "|**2023-07-14**|**MMSD2.0: Towards a Reliable Multi-modal Sarcasm Detection System**|Libo Qin et.al.|[2307.07135v1](http://arxiv.org/abs/2307.07135v1)|**[link](https://github.com/joeying1019/mmsd2.0)**|\n", "2307.08581": "|**2023-07-17**|**BuboGPT: Enabling Visual Grounding in Multi-Modal LLMs**|Yang Zhao et.al.|[2307.08581v1](http://arxiv.org/abs/2307.08581v1)|null|\n", "2307.08492": "|**2023-07-17**|**SVDFormer: Complementing Point Cloud via Self-view Augmentation and Self-structure Dual-generator**|Zhe Zhu et.al.|[2307.08492v1](http://arxiv.org/abs/2307.08492v1)|**[link](https://github.com/czvvd/svdformer)**|\n", "2307.08415": "|**2023-07-17**|**Monocular 3D Object Detection with LiDAR Guided Semi Supervised Active Learning**|Aral Hekimoglu et.al.|[2307.08415v1](http://arxiv.org/abs/2307.08415v1)|null|\n", "2307.08339": "|**2023-07-17**|**Multi-Task Cross-Modality Attention-Fusion for 2D Object Detection**|Huawei Sun et.al.|[2307.08339v1](http://arxiv.org/abs/2307.08339v1)|null|\n", "2307.08316": "|**2023-07-17**|**Bridging the Gap: Multi-Level Cross-Modality Joint Alignment for Visible-Infrared Person Re-Identification**|Tengfei Liang et.al.|[2307.08316v1](http://arxiv.org/abs/2307.08316v1)|null|\n", "2307.08238": "|**2023-07-17**|**Unified Open-Vocabulary Dense Visual Prediction**|Hengcan Shi et.al.|[2307.08238v1](http://arxiv.org/abs/2307.08238v1)|null|\n", "2307.08233": "|**2023-07-17**|**ROFusion: Efficient Object Detection using Hybrid Point-wise Radar-Optical Fusion**|Liu Liu et.al.|[2307.08233v1](http://arxiv.org/abs/2307.08233v1)|**[link](https://github.com/liuliu-55/rofusion)**|\n", "2307.08228": "|**2023-07-17**|**Video Frame Interpolation with Stereo Event and Intensity Camera**|Chao Ding et.al.|[2307.08228v1](http://arxiv.org/abs/2307.08228v1)|null|\n", "2307.08098": "|**2023-07-16**|**CalibNet: Dual-branch Cross-modal Calibration for RGB-D Salient Instance Segmentation**|Jialun Pei et.al.|[2307.08098v1](http://arxiv.org/abs/2307.08098v1)|**[link](https://github.com/pjlallen/calibnet)**|\n", "2307.08019": "|**2023-07-16**|**A Multi-model and Multi-scenario Assessment of the Impact of Climate Change on the Heating and Cooling Load Components of an Archetypical Residential Room in Major Indian Cities**|Raj S. Srivastava et.al.|[2307.08019v1](http://arxiv.org/abs/2307.08019v1)|null|\n", "2307.08016": "|**2023-07-16**|**Breaking Down the Task: A Unit-Grained Hybrid Training Framework for Vision and Language Decision Making**|Ruipu Luo et.al.|[2307.08016v1](http://arxiv.org/abs/2307.08016v1)|null|\n", "2307.07859": "|**2023-07-15**|**Unified Adversarial Patch for Cross-modal Attacks in the Physical World**|Xingxing Wei et.al.|[2307.07859v1](http://arxiv.org/abs/2307.07859v1)|null|\n", "2307.07807": "|**2023-07-15**|**MUVF-YOLOX: A Multi-modal Ultrasound Video Fusion Network for Renal Tumor Diagnosis**|Junyu Li et.al.|[2307.07807v1](http://arxiv.org/abs/2307.07807v1)|**[link](https://github.com/jeunyuli/muaf)**|\n", "2307.07791": "|**2023-07-15**|**Joint Adversarial and Collaborative Learning for Self-Supervised Action Recognition**|Tianyu Guo et.al.|[2307.07791v1](http://arxiv.org/abs/2307.07791v1)|**[link](https://github.com/levigty/acl)**|\n", "2307.07763": "|**2023-07-15**|**Tightly-Coupled LiDAR-Visual SLAM Based on Geometric Features for Mobile Agents**|Ke Cao et.al.|[2307.07763v1](http://arxiv.org/abs/2307.07763v1)|null|\n", "2307.09356": "|**2023-07-18**|**OnlineRefer: A Simple Online Baseline for Referring Video Object Segmentation**|Dongming Wu et.al.|[2307.09356v1](http://arxiv.org/abs/2307.09356v1)|**[link](https://github.com/wudongming97/onlinerefer)**|\n", "2307.09329": "|**2023-07-18**|**Towards a performance analysis on pre-trained Visual Question Answering models for autonomous driving**|Kaavya Rekanar et.al.|[2307.09329v1](http://arxiv.org/abs/2307.09329v1)|**[link](https://github.com/kaavyarekanar/towards-a-performance-analysis-on-pre-trained-vqa-models-for-autonomous-driving)**|\n", "2307.09323": "|**2023-07-18**|**Efficient Region-Aware Neural Radiance Fields for High-Fidelity Talking Portrait Synthesis**|Jiahe Li et.al.|[2307.09323v1](http://arxiv.org/abs/2307.09323v1)|**[link](https://github.com/fictionarry/er-nerf)**|\n", "2307.09312": "|**2023-07-18**|**Multi-Modal Discussion Transformer: Integrating Text, Images and Graph Transformers to Detect Hate Speech on Social Media**|Liam Hebert et.al.|[2307.09312v1](http://arxiv.org/abs/2307.09312v1)|**[link](https://github.com/liamhebert/multimodaldiscussiontransformer)**|\n", "2307.09306": "|**2023-07-18**|**EigenTrajectory: Low-Rank Descriptors for Multi-Modal Trajectory Forecasting**|Inhwan Bae et.al.|[2307.09306v1](http://arxiv.org/abs/2307.09306v1)|**[link](https://github.com/inhwanbae/eigentrajectory)**|\n", "2307.09184": "|**2023-07-18**|**You've Got Two Teachers: Co-evolutionary Image and Report Distillation for Semi-supervised Anatomical Abnormality Detection in Chest X-ray**|Jinghan Sun et.al.|[2307.09184v1](http://arxiv.org/abs/2307.09184v1)|null|\n", "2307.09155": "|**2023-07-18**|**MLF-DET: Multi-Level Fusion for Cross-Modal 3D Object Detection**|Zewei Lin et.al.|[2307.09155v1](http://arxiv.org/abs/2307.09155v1)|null|\n", "2307.09066": "|**2023-07-18**|**PatchCT: Aligning Patch Set and Label Set with Conditional Transport for Multi-Label Image Classification**|Miaoge Li et.al.|[2307.09066v1](http://arxiv.org/abs/2307.09066v1)|**[link](https://github.com/keepgoingjkg/patchct)**|\n", "2307.09059": "|**2023-07-18**|**Unleashing the Imagination of Text: A Novel Framework for Text-to-image Person Retrieval via Exploring the Power of Words**|Delong Liu et.al.|[2307.09059v1](http://arxiv.org/abs/2307.09059v1)|null|\n", "2307.09050": "|**2023-07-18**|**R-Cut: Enhancing Explainability in Vision Transformers with Relationship Weighted Out and Cut**|Yingjie Niu et.al.|[2307.09050v1](http://arxiv.org/abs/2307.09050v1)|null|\n", "2307.09036": "|**2023-07-18**|**PromptMagician: Interactive Prompt Engineering for Text-to-Image Creation**|Yingchaojie Feng et.al.|[2307.09036v1](http://arxiv.org/abs/2307.09036v1)|**[link](https://github.com/yingchaojiefeng/promptmagician)**|\n", "2307.08991": "|**2023-07-18**|**EgoVM: Achieving Precise Ego-Localization using Lightweight Vectorized Maps**|Yuzhe He et.al.|[2307.08991v1](http://arxiv.org/abs/2307.08991v1)|null|\n", "2307.08788": "|**2023-07-17**|**Uncovering Load-Altering Attacks Against N-1 Secure Power Grids: A Rare-Event Sampling Approach**|Maldon Patrice Goodridge et.al.|[2307.08788v1](http://arxiv.org/abs/2307.08788v1)|null|\n", "2307.08752": "|**2023-07-17**|**A Re-Appraisal of CO/O$_2$ Runaway on Habitable Planets Orbiting Low-Mass Stars**|Sukrit Ranjan et.al.|[2307.08752v1](http://arxiv.org/abs/2307.08752v1)|null|\n", "2307.10094": "|**2023-07-19**|**Make-A-Volume: Leveraging Latent Diffusion Models for Cross-Modality 3D Brain MRI Synthesis**|Lingting Zhu et.al.|[2307.10094v1](http://arxiv.org/abs/2307.10094v1)|null|\n", "2307.09931": "|**2023-07-19**|**DISA: DIfferentiable Similarity Approximation for Universal Multimodal Registration**|Matteo Ronchetti et.al.|[2307.09931v1](http://arxiv.org/abs/2307.09931v1)|**[link](https://github.com/imfusiongmbh/disa-universal-multimodal-registration)**|\n", "2307.09915": "|**2023-07-19**|**Embedded Heterogeneous Attention Transformer for Cross-lingual Image Captioning**|Zijie Song et.al.|[2307.09915v1](http://arxiv.org/abs/2307.09915v1)|null|\n", "2307.09823": "|**2023-07-19**|**Multi-modal Learning based Prediction for Disease**|Yaran Chen et.al.|[2307.09823v1](http://arxiv.org/abs/2307.09823v1)|null|\n", "2307.09769": "|**2023-07-19**|**Source-Free Domain Adaptation for Medical Image Segmentation via Prototype-Anchored Feature Alignment and Contrastive Learning**|Qinji Yu et.al.|[2307.09769v1](http://arxiv.org/abs/2307.09769v1)|**[link](https://github.com/cscyqj/miccai23-protocontra-sfda)**|\n", "2307.09749": "|**2023-07-19**|**Towards Robust Scene Text Image Super-resolution via Explicit Location Enhancement**|Hang Guo et.al.|[2307.09749v1](http://arxiv.org/abs/2307.09749v1)|**[link](https://github.com/csguoh/lemma)**|\n", "2307.09721": "|**2023-07-19**|**Multi-Grained Multimodal Interaction Network for Entity Linking**|Pengfei Luo et.al.|[2307.09721v1](http://arxiv.org/abs/2307.09721v1)|**[link](https://github.com/pengfei-luo/mimic)**|\n", "2307.10810": "|**2023-07-20**|**On Combining Expert Demonstrations in Imitation Learning via Optimal Transport**|Ilana Sebag et.al.|[2307.10810v1](http://arxiv.org/abs/2307.10810v1)|null|\n", "2307.10782": "|**2023-07-20**|**See More and Know More: Zero-shot Point Cloud Segmentation via Multi-modal Visual Data**|Yuhang Lu et.al.|[2307.10782v1](http://arxiv.org/abs/2307.10782v1)|null|\n", "2307.10763": "|**2023-07-20**|**MSQNet: Actor-agnostic Action Recognition with Multi-modal Query**|Anindya Mondal et.al.|[2307.10763v1](http://arxiv.org/abs/2307.10763v1)|**[link](https://github.com/mondalanindya/msqnet)**|\n", "2307.10685": "|**2023-07-20**|**Pre-train, Adapt and Detect: Multi-Task Adapter Tuning for Camouflaged Object Detection**|Yinghui Xing et.al.|[2307.10685v1](http://arxiv.org/abs/2307.10685v1)|null|\n", "2307.10601": "|**2023-07-20**|**SCA-PVNet: Self-and-Cross Attention Based Aggregation of Point Cloud and Multi-View for 3D Object Retrieval**|Dongyun Lin et.al.|[2307.10601v1](http://arxiv.org/abs/2307.10601v1)|null|\n", "2307.10577": "|**2023-07-21**|**Ethosight: A Reasoning-Guided Iterative Learning System for Nuanced Perception based on Joint-Embedding & Contextual Label Affinity**|Hugo Latapie et.al.|[2307.10577v2](http://arxiv.org/abs/2307.10577v2)|null|\n", "2307.10519": "|**2023-07-20**|**Probabilistic Multimodal Depth Estimation Based on Camera-LiDAR Sensor Fusion**|Johan S. Obando-Ceron et.al.|[2307.10519v1](http://arxiv.org/abs/2307.10519v1)|null|\n", "2307.10490": "|**2023-07-24**|**(Ab)using Images and Sounds for Indirect Instruction Injection in Multi-Modal LLMs**|Eugene Bagdasaryan et.al.|[2307.10490v3](http://arxiv.org/abs/2307.10490v3)|**[link](https://github.com/ebagdasa/multimodal_injection)**|\n", "2307.10475": "|**2023-07-19**|**Findings of Factify 2: Multimodal Fake News Detection**|S Suryavardan et.al.|[2307.10475v1](http://arxiv.org/abs/2307.10475v1)|null|\n", "2307.11552": "|**2023-07-21**|**A multi-modal representation of El Ni\u00f1o Southern Oscillation Diversity**|Jakob Schl\u00f6r et.al.|[2307.11552v1](http://arxiv.org/abs/2307.11552v1)|**[link](https://github.com/jakob-schloer/latentgmm)**|\n", "2307.11545": "|**2023-07-21**|**Bridging Vision and Language Encoders: Parameter-Efficient Tuning for Referring Image Segmentation**|Zunnan Xu et.al.|[2307.11545v1](http://arxiv.org/abs/2307.11545v1)|**[link](https://github.com/kkakkkka/etris)**|\n", "2307.11530": "|**2023-07-21**|**UWAT-GAN: Fundus Fluorescein Angiography Synthesis via Ultra-wide-angle Transformation Multi-scale GAN**|Zhaojie Fang et.al.|[2307.11530v1](http://arxiv.org/abs/2307.11530v1)|**[link](https://github.com/Tinysqua/UWAT-GAN)**|\n", "2307.11450": "|**2023-07-21**|**Topic Identification For Spontaneous Speech: Enriching Audio Features With Embedded Linguistic Information**|Dejan Porjazovski et.al.|[2307.11450v1](http://arxiv.org/abs/2307.11450v1)|**[link](https://github.com/aalto-speech/Topic-identification-for-spontaneous-Finnish-speech)**|\n", "2307.11323": "|**2023-07-21**|**HVDetFusion: A Simple and Robust Camera-Radar Fusion Framework**|Kai Lei et.al.|[2307.11323v1](http://arxiv.org/abs/2307.11323v1)|**[link](https://github.com/hvxlab/hvdetfusion)**|\n", "2307.12964": "|**2023-07-24**|**Audio-Enhanced Text-to-Video Retrieval using Text-Conditioned Feature Alignment**|Sarah Ibrahimi et.al.|[2307.12964v1](http://arxiv.org/abs/2307.12964v1)|null|\n", "2307.12853": "|**2023-07-25**|**Spatiotemporal Modeling Encounters 3D Medical Image Analysis: Slice-Shift UNet with Multi-View Fusion**|C. I. Ugwu et.al.|[2307.12853v2](http://arxiv.org/abs/2307.12853v2)|null|\n", "2307.12732": "|**2023-07-24**|**CLIP-KD: An Empirical Study of Distilling CLIP Models**|Chuanguang Yang et.al.|[2307.12732v1](http://arxiv.org/abs/2307.12732v1)|null|\n", "2307.12626": "|**2023-07-24**|**Enhancing Human-like Multi-Modal Reasoning: A New Challenging Dataset and Comprehensive Framework**|Jingxuan Wei et.al.|[2307.12626v1](http://arxiv.org/abs/2307.12626v1)|**[link](https://github.com/weijingxuan/COCO-MMR)**|\n", "2307.12577": "|**2023-07-24**|**PRIOR: Prototype Representation Joint Learning from Medical Images and Reports**|Pujin Cheng et.al.|[2307.12577v1](http://arxiv.org/abs/2307.12577v1)|**[link](https://github.com/qtacierp/prior)**|\n", "2307.12545": "|**2023-07-24**|**Towards Video Anomaly Retrieval from Video Anomaly Detection: New Benchmarks and Model**|Peng Wu et.al.|[2307.12545v1](http://arxiv.org/abs/2307.12545v1)|null|\n", "2307.12242": "|**2023-07-23**|**HealthPrism: A Visual Analytics System for Exploring Children's Physical and Mental Health Profiles with Multimodal Data**|Zhihan Jiang et.al.|[2307.12242v1](http://arxiv.org/abs/2307.12242v1)|null|\n", "2307.12236": "|**2023-07-23**|**Multi-Modal Machine Learning for Assessing Gaming Skills in Online Streaming: A Case Study with CS:GO**|Longxiang Zhang et.al.|[2307.12236v1](http://arxiv.org/abs/2307.12236v1)|null|\n", "2307.12180": "|**2023-07-22**|**Prototype-Driven and Multi-Expert Integrated Multi-Modal MR Brain Tumor Image Segmentation**|Yafei Zhang et.al.|[2307.12180v1](http://arxiv.org/abs/2307.12180v1)|**[link](https://github.com/linzy0227/pdminet)**|\n", "2307.12067": "|**2023-07-22**|**Replay: Multi-modal Multi-view Acted Videos for Casual Holography**|Roman Shapovalov et.al.|[2307.12067v1](http://arxiv.org/abs/2307.12067v1)|**[link](https://github.com/facebookresearch/replay_dataset)**|\n", "2307.12058": "|**2023-07-22**|**Discovering Spatio-Temporal Rationales for Video Question Answering**|Yicong Li et.al.|[2307.12058v1](http://arxiv.org/abs/2307.12058v1)|null|\n", "2307.11921": "|**2023-07-21**|**Poverty rate prediction using multi-modal survey and earth observation data**|Simone Fobi et.al.|[2307.11921v1](http://arxiv.org/abs/2307.11921v1)|null|\n", "2307.13600": "|**2023-07-25**|**Decisive Data using Multi-Modality Optical Sensors for Advanced Vehicular Systems**|Muhammad Ali Farooq et.al.|[2307.13600v1](http://arxiv.org/abs/2307.13600v1)|null|\n", "2307.13537": "|**2023-07-25**|**Spectrum-guided Multi-granularity Referring Video Object Segmentation**|Bo Miao et.al.|[2307.13537v1](http://arxiv.org/abs/2307.13537v1)|**[link](https://github.com/bo-miao/sgmg)**|\n", "2307.13529": "|**2023-07-25**|**Re-mine, Learn and Reason: Exploring the Cross-modal Semantic Correlations for Language-guided HOI detection**|Yichao Cao et.al.|[2307.13529v1](http://arxiv.org/abs/2307.13529v1)|null|\n", "2307.13205": "|**2023-07-25**|**Text-oriented Modality Reinforcement Network for Multimodal Sentiment Analysis from Unaligned Multimodal Sequences**|Yuxuan Lei et.al.|[2307.13205v1](http://arxiv.org/abs/2307.13205v1)|null|\n", "2307.13125": "|**2023-07-24**|**Deep Learning Approaches for Data Augmentation in Medical Imaging: A Review**|Aghiles Kebaili et.al.|[2307.13125v1](http://arxiv.org/abs/2307.13125v1)|null|\n", "2307.13069": "|**2023-07-24**|**General-Purpose Multi-Modal OOD Detection Framework**|Viet Duong et.al.|[2307.13069v1](http://arxiv.org/abs/2307.13069v1)|null|\n", "2307.14277": "|**2023-07-26**|**G2L: Semantically Aligned and Uniform Video Grounding via Geodesic and Game Theory**|Hongxiang Li et.al.|[2307.14277v1](http://arxiv.org/abs/2307.14277v1)|null|\n", "2307.14273": "|**2023-07-26**|**Deepfake Image Generation for Improved Brain Tumor Segmentation**|Roa'a Al-Emaryeen et.al.|[2307.14273v1](http://arxiv.org/abs/2307.14273v1)|null|\n", "2307.14244": "|**2023-07-26**|**Neural-based Cross-modal Search and Retrieval of Artwork**|Yan Gong et.al.|[2307.14244v1](http://arxiv.org/abs/2307.14244v1)|null|\n", "2307.14240": "|**2023-07-26**|**Boon: A Neural Search Engine for Cross-Modal Information Retrieval**|Yan Gong et.al.|[2307.14240v1](http://arxiv.org/abs/2307.14240v1)|null|\n", "2307.14185": "|**2023-07-26**|**A comparison of machine learning surrogate models of street-scale flooding in Norfolk, Virginia**|Diana McSpadden et.al.|[2307.14185v1](http://arxiv.org/abs/2307.14185v1)|null|\n", "2307.14126": "|**2023-07-26**|**Multi-modal Learning with Missing Modality via Shared-Specific Feature Modelling**|Hu Wang et.al.|[2307.14126v1](http://arxiv.org/abs/2307.14126v1)|null|\n", "2307.14061": "|**2023-07-26**|**Set-level Guidance Attack: Boosting Adversarial Transferability of Vision-Language Pre-training Models**|Dong Lu et.al.|[2307.14061v1](http://arxiv.org/abs/2307.14061v1)|**[link](https://github.com/Zoky-2020/Set-level_Guidance_Attack)**|\n", "2307.13950": "|**2023-07-26**|**Deep Robust Multi-Robot Re-localisation in Natural Environments**|Milad Ramezani et.al.|[2307.13950v1](http://arxiv.org/abs/2307.13950v1)|null|\n", "2307.13933": "|**2023-07-26**|**AIDE: A Vision-Driven Multi-View, Multi-Modal, Multi-Tasking Dataset for Assistive Driving Perception**|Dingkang Yang et.al.|[2307.13933v1](http://arxiv.org/abs/2307.13933v1)|**[link](https://github.com/ydk122024/aide)**|\n", "2307.13925": "|**2023-07-27**|**EasyNet: An Easy Network for 3D Industrial Anomaly Detection**|Ruitao Chen et.al.|[2307.13925v2](http://arxiv.org/abs/2307.13925v2)|null|\n", "2307.13871": "|**2023-07-26**|**Emulating Expert Insight: A Robust Strategy for Optimal Experimental Design**|Matthew R. Carbone et.al.|[2307.13871v1](http://arxiv.org/abs/2307.13871v1)|**[link](https://github.com/matthewcarbone/scientificvalueagent)**|\n", "2307.15016": "|**2023-07-27**|**How Good is Google Bard's Visual Understanding? An Empirical Study on Open Challenges**|Haotong Qin et.al.|[2307.15016v1](http://arxiv.org/abs/2307.15016v1)|**[link](https://github.com/htqin/googlebard-visunderstand)**|\n", "2307.14901": "|**2023-07-27**|**Text-guided Foundation Model Adaptation for Pathological Image Classification**|Yunkun Zhang et.al.|[2307.14901v1](http://arxiv.org/abs/2307.14901v1)|**[link](https://github.com/yunkun-zhang/cite)**|\n", "2307.14889": "|**2023-07-27**|**Weakly Supervised Multi-Modal 3D Human Body Pose Estimation for Autonomous Driving**|Peter Bauer et.al.|[2307.14889v1](http://arxiv.org/abs/2307.14889v1)|null|\n", "2307.14878": "|**2023-07-27**|**MESED: A Multi-modal Entity Set Expansion Dataset with Fine-grained Semantic Classes and Hard Negative Entities**|Yangning Li et.al.|[2307.14878v1](http://arxiv.org/abs/2307.14878v1)|**[link](https://github.com/thukelab/mesed)**|\n", "2307.14682": "|**2023-07-27**|**Unified Adversarial Patch for Visible-Infrared Cross-modal Attacks in the Physical World**|Xingxing Wei et.al.|[2307.14682v1](http://arxiv.org/abs/2307.14682v1)|**[link](https://github.com/aries-iai/cross-modal_patch_attack)**|\n", "2307.14619": "|**2023-07-29**|**Imitating Complex Trajectories: Bridging Low-Level Stability and High-Level Behavior**|Adam Block et.al.|[2307.14619v2](http://arxiv.org/abs/2307.14619v2)|null|\n", "2307.14572": "|**2023-07-27**|**Non-invasive Deep-Brain Imaging with 3D Integrated Photoacoustic Tomography and Ultrasound Localization Microscopy (3D-PAULM)**|Yuqi Tang et.al.|[2307.14572v1](http://arxiv.org/abs/2307.14572v1)|null|\n", "2307.14539": "|**2023-07-26**|**Plug and Pray: Exploiting off-the-shelf components of Multi-Modal Models**|Erfan Shayegani et.al.|[2307.14539v1](http://arxiv.org/abs/2307.14539v1)|null|\n", "2307.14523": "|**2023-07-26**|**Towards multi-modal anatomical landmark detection for ultrasound-guided brain tumor resection with contrastive learning**|Soorena Salari et.al.|[2307.14523v1](http://arxiv.org/abs/2307.14523v1)|null|\n", "2307.14491": "|**2023-07-26**|**Modality-Agnostic Audio-Visual Deepfake Detection**|Cai Yu et.al.|[2307.14491v1](http://arxiv.org/abs/2307.14491v1)|null|\n", "2307.15554": "|**2023-07-28**|**'What are you referring to?' Evaluating the Ability of Multi-Modal Dialogue Models to Process Clarificational Exchanges**|Javier Chiyah-Garcia et.al.|[2307.15554v1](http://arxiv.org/abs/2307.15554v1)|**[link](https://github.com/jchiyah/what-are-you-referring-to)**|\n", "2307.15460": "|**2023-07-28**|**Cross-Modal Concept Learning and Inference for Vision-Language Models**|Yi Zhang et.al.|[2307.15460v1](http://arxiv.org/abs/2307.15460v1)|null|\n", "2307.15432": "|**2023-07-28**|**CFN-ESA: A Cross-Modal Fusion Network with Emotion-Shift Awareness for Dialogue Emotion Recognition**|Jiang Li et.al.|[2307.15432v1](http://arxiv.org/abs/2307.15432v1)|null|\n", "2307.15344": "|**2023-07-28**|**Improving Audio-Text Retrieval via Hierarchical Cross-Modal Interaction and Auxiliary Captions**|Yifei Xin et.al.|[2307.15344v1](http://arxiv.org/abs/2307.15344v1)|null|\n", "2307.15220": "|**2023-07-27**|**Learning Multi-modal Representations by Watching Hundreds of Surgical Video Lectures**|Kun Yuan et.al.|[2307.15220v1](http://arxiv.org/abs/2307.15220v1)|**[link](https://github.com/camma-public/surgvlp)**|\n", "2307.15167": "|**2023-07-27**|**PEANUT: A Human-AI Collaborative Tool for Annotating Audio-Visual Data**|Zheng Zhang et.al.|[2307.15167v1](http://arxiv.org/abs/2307.15167v1)|null|\n", "2307.15097": "|**2023-07-27**|**Cascaded Cross-Modal Transformer for Request and Complaint Detection**|Nicolae-Catalin Ristea et.al.|[2307.15097v1](http://arxiv.org/abs/2307.15097v1)|null|\n", "2307.16896": "|**2023-07-31**|**Disruptive Autoencoders: Leveraging Low-level features for 3D Medical Image Pre-training**|Jeya Maria Jose Valanarasu et.al.|[2307.16896v1](http://arxiv.org/abs/2307.16896v1)|null|\n", "2307.16847": "|**2023-07-31**|**Latent Masking for Multimodal Self-supervised Learning in Health Timeseries**|Shohreh Deldari et.al.|[2307.16847v1](http://arxiv.org/abs/2307.16847v1)|null|\n", "2307.16745": "|**2023-07-31**|**Advancing Smart Malnutrition Monitoring: A Multi-Modal Learning Approach for Vital Health Parameter Estimation**|Ashish Marisetty et.al.|[2307.16745v1](http://arxiv.org/abs/2307.16745v1)|null|\n", "2307.16617": "|**2023-07-31**|**FULLER: Unified Multi-modality Multi-task 3D Perception via Multi-level Gradient Calibration**|Zhijian Huang et.al.|[2307.16617v1](http://arxiv.org/abs/2307.16617v1)|null|\n", "2307.16532": "|**2023-07-31**|**Echoes Beyond Points: Unleashing the Power of Raw Radar Data in Multi-modality Fusion**|Yang Liu et.al.|[2307.16532v1](http://arxiv.org/abs/2307.16532v1)|null|\n", "2307.16395": "|**2023-07-31**|**Bridging the Gap: Exploring the Capabilities of Bridge-Architectures for Complex Visual Reasoning Tasks**|Kousik Rajesh et.al.|[2307.16395v1](http://arxiv.org/abs/2307.16395v1)|null|\n", "2307.16366": "|**2023-07-31**|**Multi-modal Graph Neural Network for Early Diagnosis of Alzheimer's Disease from sMRI and PET Scans**|Yanteng Zhanga et.al.|[2307.16366v1](http://arxiv.org/abs/2307.16366v1)|null|\n", "2307.16210": "|**2023-08-01**|**Rethinking Uncertainly Missing and Ambiguous Visual Modality in Multi-Modal Entity Alignment**|Zhuo Chen et.al.|[2307.16210v2](http://arxiv.org/abs/2307.16210v2)|**[link](https://github.com/zjukg/umaea)**|\n", "2307.16142": "|**2023-07-30**|**Implicit Neural Representation in Medical Imaging: A Comparative Survey**|Amirali Molaei et.al.|[2307.16142v1](http://arxiv.org/abs/2307.16142v1)|**[link](https://github.com/mindflow-institue/awesome-implicit-neural-representations-in-medical-imaging)**|\n", "2307.16121": "|**2023-07-30**|**Uncertainty-Encoded Multi-Modal Fusion for Robust Object Detection in Autonomous Driving**|Yang Lou et.al.|[2307.16121v1](http://arxiv.org/abs/2307.16121v1)|null|\n", "2307.16106": "|**2023-07-30**|**TransFusion: A Practical and Effective Transformer-based Diffusion Model for 3D Human Motion Prediction**|Sibo Tian et.al.|[2307.16106v1](http://arxiv.org/abs/2307.16106v1)|null|\n", "2307.16013": "|**2023-07-29**|**Marrying Dialogue Systems with Data Visualization: Interactive Data Visualization Generation from Natural Language Conversations**|Yuanfeng Song et.al.|[2307.16013v1](http://arxiv.org/abs/2307.16013v1)|null|\n", "2307.15988": "|**2023-07-29**|**RGB-D-Fusion: Image Conditioned Depth Diffusion of Humanoid Subjects**|Sascha Kirch et.al.|[2307.15988v1](http://arxiv.org/abs/2307.15988v1)|**[link](https://github.com/sascha-kirch/rgb-d-fusion)**|\n", "2307.15942": "|**2023-07-29**|**CMDA: Cross-Modality Domain Adaptation for Nighttime Semantic Segmentation**|Ruihao Xia et.al.|[2307.15942v1](http://arxiv.org/abs/2307.15942v1)|**[link](https://github.com/xiarho/cmda)**|\n", "2307.15872": "|**2023-07-29**|**Cross-dimensional transfer learning in medical image segmentation with deep learning**|Hicham Messaoudi et.al.|[2307.15872v1](http://arxiv.org/abs/2307.15872v1)|**[link](https://github.com/hic-messaoudi/cross-dimensional-transfer-learning-in-medical-image-segmentation-with-deep-learning)**|\n", "2308.00692": "|**2023-08-03**|**LISA: Reasoning Segmentation via Large Language Model**|Xin Lai et.al.|[2308.00692v2](http://arxiv.org/abs/2308.00692v2)|**[link](https://github.com/dvlab-research/lisa)**|\n", "2308.00628": "|**2023-08-01**|**Human-M3: A Multi-view Multi-modal Dataset for 3D Human Pose Estimation in Outdoor Scenes**|Bohao Fan et.al.|[2308.00628v1](http://arxiv.org/abs/2308.00628v1)|**[link](https://github.com/soullessrobot/human-m3-dataset)**|\n", "2308.00588": "|**2023-08-01**|**Relation-Aware Distribution Representation Network for Person Clustering with Multiple Modalities**|Kaijian Liu et.al.|[2308.00588v1](http://arxiv.org/abs/2308.00588v1)|null|\n", "2308.00330": "|**2023-08-01**|**Advancing Frame-Dropping in Multi-Object Tracking-by-Detection Systems Through Event-Based Detection Triggering**|Matti Henning et.al.|[2308.00330v1](http://arxiv.org/abs/2308.00330v1)|null|\n", "2308.00295": "|**2023-08-01**|**Making the V in Text-VQA Matter**|Shamanthak Hegde et.al.|[2308.00295v1](http://arxiv.org/abs/2308.00295v1)|null|\n", "2308.00291": "|**2023-08-01**|**Fundus-Enhanced Disease-Aware Distillation Model for Retinal Disease Classification from OCT Images**|Lehan Wang et.al.|[2308.00291v1](http://arxiv.org/abs/2308.00291v1)|**[link](https://github.com/xmed-lab/fddm)**|\n", "2308.00264": "|**2023-08-01**|**Multi-Modality Multi-Loss Fusion Network**|Zehui Wu et.al.|[2308.00264v1](http://arxiv.org/abs/2308.00264v1)|null|\n", "2308.00235": "|**2023-08-01**|**Demonstrating Autonomous 3D Path Planning on a Novel Scalable UGV-UAV Morphing Robot**|Eric Sihite et.al.|[2308.00235v1](http://arxiv.org/abs/2308.00235v1)|null|\n", "2308.00228": "|**2023-08-01**|**Using Scene and Semantic Features for Multi-modal Emotion Recognition**|Zhifeng Wang et.al.|[2308.00228v1](http://arxiv.org/abs/2308.00228v1)|null|\n", "2307.16620": "|**2023-08-01**|**Audio-Visual Segmentation by Exploring Cross-Modal Mutual Semantics**|Chen Liu et.al.|[2307.16620v2](http://arxiv.org/abs/2307.16620v2)|null|\n", "2308.01217": "|**2023-08-02**|**TeachCLIP: Multi-Grained Teaching for Efficient Text-to-Video Retrieval**|Kaibin Tian et.al.|[2308.01217v1](http://arxiv.org/abs/2308.01217v1)|null|\n", "2308.01147": "|**2023-08-02**|**Contrast-augmented Diffusion Model with Fine-grained Sequence Alignment for Markup-to-Image Generation**|Guojin Zhong et.al.|[2308.01147v1](http://arxiv.org/abs/2308.01147v1)|**[link](https://github.com/zgj77/fsacdm)**|\n", "2308.01006": "|**2023-08-03**|**FusionAD: Multi-modality Fusion for Prediction and Planning Tasks of Autonomous Driving**|Tengju Ye et.al.|[2308.01006v2](http://arxiv.org/abs/2308.01006v2)|**[link](https://github.com/westlake-autolab/fusionad)**|\n", "2308.00980": "|**2023-08-02**|**Grasp Stability Assessment Through Attention-Guided Cross-Modality Fusion and Transfer Learning**|Zhuangzhuang Zhang et.al.|[2308.00980v1](http://arxiv.org/abs/2308.00980v1)|null|\n", "2308.00906": "|**2023-08-02**|**ImageBrush: Learning Visual In-Context Instructions for Exemplar-Based Image Manipulation**|Yasheng Sun et.al.|[2308.00906v1](http://arxiv.org/abs/2308.00906v1)|null|\n", "2308.00856": "|**2023-08-01**|**Differential Privacy for Adaptive Weight Aggregation in Federated Tumor Segmentation**|Muhammad Irfan Khan et.al.|[2308.00856v1](http://arxiv.org/abs/2308.00856v1)|null|\n", "2308.01731": "|**2023-08-03**|**Quantification of Predictive Uncertainty via Inference-Time Sampling**|Katar\u00edna T\u00f3thov\u00e1 et.al.|[2308.01731v1](http://arxiv.org/abs/2308.01731v1)|null|\n", "2308.01546": "|**2023-08-03**|**MusicLDM: Enhancing Novelty in Text-to-Music Generation Using Beat-Synchronous Mixup Strategies**|Ke Chen et.al.|[2308.01546v1](http://arxiv.org/abs/2308.01546v1)|**[link](https://github.com/retrocirce/musicldm)**|\n", "2308.01526": "|**2023-08-03**|**Data Augmentation for Human Behavior Analysis in Multi-Person Conversations**|Kun Li et.al.|[2308.01526v1](http://arxiv.org/abs/2308.01526v1)|null|\n", "2308.01328": "|**2023-08-02**|**A vision transformer-based framework for knowledge transfer from multi-modal to mono-modal lymphoma subtyping models**|Bilel Guetarni et.al.|[2308.01328v1](http://arxiv.org/abs/2308.01328v1)|null|\n", "2308.02487": "|**2023-08-04**|**Convolutions Die Hard: Open-Vocabulary Segmentation with Single Frozen Convolutional CLIP**|Qihang Yu et.al.|[2308.02487v1](http://arxiv.org/abs/2308.02487v1)|**[link](https://github.com/bytedance/fc-clip)**|\n", "2308.02463": "|**2023-08-04**|**Towards Generalist Foundation Model for Radiology**|Chaoyi Wu et.al.|[2308.02463v1](http://arxiv.org/abs/2308.02463v1)|**[link](https://github.com/chaoyi-wu/radfm)**|\n", "2308.02239": "|**2023-08-04**|**DTF-Net: Category-Level Pose Estimation and Shape Reconstruction via Deformable Template Field**|Haowen Wang et.al.|[2308.02239v1](http://arxiv.org/abs/2308.02239v1)|null|\n", "2308.02097": "|**2023-08-04**|**Multi-interactive Feature Learning and a Full-time Multi-modality Benchmark for Image Fusion and Segmentation**|Jinyuan Liu et.al.|[2308.02097v1](http://arxiv.org/abs/2308.02097v1)|**[link](https://github.com/jinyuanliu-cv/segmif)**|\n", "2308.01994": "|**2023-08-03**|**Explainable unsupervised multi-modal image registration using deep networks**|Chengjia Wang et.al.|[2308.01994v1](http://arxiv.org/abs/2308.01994v1)|null|\n", "2308.02299": "|**2023-08-03**|**RegionBLIP: A Unified Multi-modal Pre-training Framework for Holistic and Regional Comprehension**|Qiang Zhou et.al.|[2308.02299v1](http://arxiv.org/abs/2308.02299v1)|**[link](https://github.com/mightyzau/regionblip)**|\n", "2308.03729": "|**2023-08-07**|**Tiny LVLM-eHub: Early Multimodal Experiments with Bard**|Wenqi Shao et.al.|[2308.03729v1](http://arxiv.org/abs/2308.03729v1)|**[link](https://github.com/opengvlab/multi-modality-arena)**|\n", "2308.03666": "|**2023-08-07**|**Bridging Trustworthiness and Open-World Learning: An Exploratory Neural Approach for Enhancing Interpretability, Generalization, and Robustness**|Shide Du et.al.|[2308.03666v1](http://arxiv.org/abs/2308.03666v1)|null|\n", "2308.03475": "|**2023-08-07**|**COPA: Efficient Vision-Language Pre-training Through Collaborative Object- and Patch-Text Alignment**|Chaoya Jiang et.al.|[2308.03475v1](http://arxiv.org/abs/2308.03475v1)|null|\n", "2308.03432": "|**2023-08-07**|**Cuing Without Sharing: A Federated Cued Speech Recognition Framework via Mutual Knowledge Distillation**|Yuxuan Zhang et.al.|[2308.03432v1](http://arxiv.org/abs/2308.03432v1)|**[link](https://github.com/yuxuanzhang0713/fedcsr)**|\n", "2308.03424": "|**2023-08-07**|**CAESURA: Language Models as Multi-Modal Query Planners**|Matthias Urban et.al.|[2308.03424v1](http://arxiv.org/abs/2308.03424v1)|null|\n", "2308.03267": "|**2023-08-07**|**Redundancy-aware Transformer for Video Question Answering**|Yicong Li et.al.|[2308.03267v1](http://arxiv.org/abs/2308.03267v1)|null|\n", "2308.03256": "|**2023-08-07**|**Learning a Graph Neural Network with Cross Modality Interaction for Image Fusion**|Jiawei Li et.al.|[2308.03256v1](http://arxiv.org/abs/2308.03256v1)|**[link](https://github.com/lok-18/ignet)**|\n", "2308.03151": "|**2023-08-06**|**Food-500 Cap: A Fine-Grained Food Caption Benchmark for Evaluating Vision-Language Models**|Zheng Ma et.al.|[2308.03151v1](http://arxiv.org/abs/2308.03151v1)|**[link](https://github.com/aaronma2020/Food500-Cap)**|\n", "2308.03135": "|**2023-08-06**|**E-CLIP: Towards Label-efficient Event-based Open-world Understanding by CLIP**|Jiazhou Zhou et.al.|[2308.03135v1](http://arxiv.org/abs/2308.03135v1)|null|\n", "2308.02982": "|**2023-08-06**|**Beyond First Impressions: Integrating Joint Multi-modal Cues for Comprehensive 3D Representation**|Haowei Wang et.al.|[2308.02982v1](http://arxiv.org/abs/2308.02982v1)|**[link](https://github.com/mr-neko/jm3d)**|\n", "2308.02883": "|**2023-08-05**|**Cross-modal & Cross-domain Learning for Unsupervised LiDAR Semantic Segmentation**|Yiyang Chen et.al.|[2308.02883v1](http://arxiv.org/abs/2308.02883v1)|null|\n", "2308.02872": "|**2023-08-05**|**Data-Based Design of Multi-Model Inferential Sensors**|Martin Mojto et.al.|[2308.02872v1](http://arxiv.org/abs/2308.02872v1)|null|\n", "2308.02823": "|**2023-08-05**|**A Symbolic Character-Aware Model for Solving Geometry Problems**|Maizhen Ning et.al.|[2308.02823v1](http://arxiv.org/abs/2308.02823v1)|**[link](https://github.com/ning-mz/sca-gps)**|\n", "2308.04369": "|**2023-08-08**|**SSTFormer: Bridging Spiking Neural Network and Memory Support Transformer for Frame-Event based Recognition**|Xiao Wang et.al.|[2308.04369v1](http://arxiv.org/abs/2308.04369v1)|**[link](https://github.com/event-ahu/sstformer)**|\n", "2308.04352": "|**2023-08-08**|**3D-VisTA: Pre-trained Transformer for 3D Vision and Text Alignment**|Ziyu Zhu et.al.|[2308.04352v1](http://arxiv.org/abs/2308.04352v1)|null|\n", "2308.04343": "|**2023-08-08**|**Unifying Two-Stream Encoders with Transformers for Cross-Modal Retrieval**|Yi Bin et.al.|[2308.04343v1](http://arxiv.org/abs/2308.04343v1)|**[link](https://github.com/luminosityx/hat)**|\n", "2308.04126": "|**2023-08-08**|**OmniDataComposer: A Unified Data Structure for Multimodal Data Fusion and Infinite Data Generation**|Dongyang Yu et.al.|[2308.04126v1](http://arxiv.org/abs/2308.04126v1)|**[link](https://github.com/shajiayu1/OmniDataComposer)**|\n", "2308.04067": "|**2023-08-08**|**Online Distillation-enhanced Multi-modal Transformer for Sequential Recommendation**|Wei Ji et.al.|[2308.04067v1](http://arxiv.org/abs/2308.04067v1)|**[link](https://github.com/xyliugo/odmt)**|\n", "2308.03908": "|**2023-08-07**|**ViLP: Knowledge Exploration using Vision, Language, and Pose Embeddings for Video Action Recognition**|Soumyabrata Chaudhuri et.al.|[2308.03908v1](http://arxiv.org/abs/2308.03908v1)|null|\n", "2308.05061": "|**2023-08-09**|**Prompting In-Context Operator Learning with Sensor Data, Equations, and Natural Language**|Liu Yang et.al.|[2308.05061v1](http://arxiv.org/abs/2308.05061v1)|**[link](https://github.com/liuyangmage/in-context-operator-networks)**|\n", "2308.04992": "|**2023-08-09**|**AspectMMKG: A Multi-modal Knowledge Graph with Aspect-aware Entities**|Jingdan Zhang et.al.|[2308.04992v1](http://arxiv.org/abs/2308.04992v1)|**[link](https://github.com/thezjd/aspectmmkg)**|\n", "2308.04829": "|**2023-08-09**|**MixReorg: Cross-Modal Mixed Patch Reorganization is a Good Mask Learner for Open-World Semantic Segmentation**|Kaixin Cai et.al.|[2308.04829v1](http://arxiv.org/abs/2308.04829v1)|null|\n", "2308.04820": "|**2023-08-09**|**Strategic Interactions in Multi-modal Mobility Systems: A Game-Theoretic Perspective**|Gioele Zardini et.al.|[2308.04820v1](http://arxiv.org/abs/2308.04820v1)|null|\n", "2308.04779": "|**2023-08-09**|**Multi-View Fusion and Distillation for Subgrade Distresses Detection based on 3D-GPR**|Chunpeng Zhou et.al.|[2308.04779v1](http://arxiv.org/abs/2308.04779v1)|null|\n", "2308.04778": "|**2023-08-09**|**Multi-modal Multi-view Clustering based on Non-negative Matrix Factorization**|Yasser Khalafaoui et.al.|[2308.04778v1](http://arxiv.org/abs/2308.04778v1)|null|\n", "2308.04706": "|**2023-08-09**|**Pareto Invariant Representation Learning for Multimedia Recommendation**|Shanshan Huang et.al.|[2308.04706v1](http://arxiv.org/abs/2308.04706v1)|null|\n", "2308.04702": "|**2023-08-09**|**Continual Road-Scene Semantic Segmentation via Feature-Aligned Symmetric Multi-Modal Network**|Francesco Barbato et.al.|[2308.04702v1](http://arxiv.org/abs/2308.04702v1)|null|\n", "2308.04663": "|**2023-08-09**|**Classification of lung cancer subtypes on CT images with synthetic pathological priors**|Wentao Zhu et.al.|[2308.04663v1](http://arxiv.org/abs/2308.04663v1)|null|\n", "2308.04579": "|**2023-08-08**|**RECipe: Does a Multi-Modal Recipe Knowledge Graph Fit a Multi-Purpose Recommendation System?**|Ali Pesaranghader et.al.|[2308.04579v1](http://arxiv.org/abs/2308.04579v1)|null|\n", "2308.04556": "|**2023-08-08**|**FocalFormer3D : Focusing on Hard Instance for 3D Object Detection**|Yilun Chen et.al.|[2308.04556v1](http://arxiv.org/abs/2308.04556v1)|**[link](https://github.com/NVlabs/FocalFormer3D)**|\n", "2308.05667": "|**2023-08-14**|**2D3D-MATR: 2D-3D Matching Transformer for Detection-free Registration between Images and Point Clouds**|Minhao Li et.al.|[2308.05667v2](http://arxiv.org/abs/2308.05667v2)|**[link](https://github.com/minhaolee/2d3dmatr)**|\n", "2308.05648": "|**2023-08-10**|**Counterfactual Cross-modality Reasoning for Weakly Supervised Video Moment Localization**|Zezhong Lv et.al.|[2308.05648v1](http://arxiv.org/abs/2308.05648v1)|**[link](https://github.com/sldz0306/ccr)**|\n", "2308.05478": "|**2023-08-10**|**Reviewing 3D Object Detectors in the Context of High-Resolution 3+1D Radar**|Patrick Palmer et.al.|[2308.05478v1](http://arxiv.org/abs/2308.05478v1)|null|\n", "2308.05438": "|**2023-08-10**|**Deep Fusion Transformer Network with Weighted Vector-Wise Keypoints Voting for Robust 6D Object Pose Estimation**|Jun Zhou et.al.|[2308.05438v1](http://arxiv.org/abs/2308.05438v1)|**[link](https://github.com/junzastar/dftr_voting)**|\n", "2308.05421": "|**2023-08-10**|**Progressive Spatio-temporal Perception for Audio-Visual Question Answering**|Guangyao Li et.al.|[2308.05421v1](http://arxiv.org/abs/2308.05421v1)|**[link](https://github.com/gewu-lab/pstp-net)**|\n", "2308.05128": "|**2023-08-09**|**High-Level Features Parallelization for Inference Cost Reduction Through Selective Attention**|Andr\u00e9 Peter Kelm et.al.|[2308.05128v1](http://arxiv.org/abs/2308.05128v1)|null|\n", "2308.06262": "|**2023-08-11**|**Foundation Model is Efficient Multimodal Multitask Model Selector**|Fanqing Meng et.al.|[2308.06262v1](http://arxiv.org/abs/2308.06262v1)|**[link](https://github.com/opengvlab/multitask-model-selector)**|\n", "2308.06207": "|**2023-08-11**|**Thinking Like an Expert:Multimodal Hypergraph-of-Thought (HoT) Reasoning to boost Foundation Modals**|Fanglong Yao et.al.|[2308.06207v1](http://arxiv.org/abs/2308.06207v1)|null|\n", "2308.06125": "|**2023-08-11**|**Improving Joint Speech-Text Representations Without Alignment**|Cal Peyser et.al.|[2308.06125v1](http://arxiv.org/abs/2308.06125v1)|null|\n", "2308.06024": "|**2023-08-11**|**Spatial-information Guided Adaptive Context-aware Network for Efficient RGB-D Semantic Segmentation**|Yang Zhang et.al.|[2308.06024v1](http://arxiv.org/abs/2308.06024v1)|**[link](https://github.com/mvme-hbut/sgacnet)**|\n", "2308.06009": "|**2023-08-11**|**ViGT: Proposal-free Video Grounding with Learnable Token in Transformer**|Kun Li et.al.|[2308.06009v1](http://arxiv.org/abs/2308.06009v1)|null|\n", "2308.05993": "|**2023-08-11**|**Image-based Geolocalization by Ground-to-2.5D Map Matching**|Mengjie Zhou et.al.|[2308.05993v1](http://arxiv.org/abs/2308.05993v1)|**[link](https://github.com/zhoumengjie/2-5dmap-dataset)**|\n", "2308.05948": "|**2023-08-11**|**Uncertainty-Aware Cross-Modal Transfer Network for Sketch-Based 3D Shape Retrieval**|Yiyang Cai et.al.|[2308.05948v1](http://arxiv.org/abs/2308.05948v1)|null|\n", "2308.05864": "|**2023-08-10**|**The Multi-modality Cell Segmentation Challenge: Towards Universal Solutions**|Jun Ma et.al.|[2308.05864v1](http://arxiv.org/abs/2308.05864v1)|null|\n", "2308.07222": "|**2023-08-14**|**MM-GEF: Multi-modal representation meet collaborative filtering**|Hao Wu et.al.|[2308.07222v1](http://arxiv.org/abs/2308.07222v1)|null|\n", "2308.07214": "|**2023-08-14**|**Automated Ensemble-Based Segmentation of Adult Brain Tumors: A Novel Approach Using the BraTS AFRICA Challenge Data**|Chiranjeewee Prasad Koirala et.al.|[2308.07214v1](http://arxiv.org/abs/2308.07214v1)|null|\n", "2308.07173": "|**2023-08-14**|**Enhancing State Estimator for Autonomous Race Car : Leveraging Multi-modal System and Managing Computing Resources**|Daegyu Lee et.al.|[2308.07173v1](http://arxiv.org/abs/2308.07173v1)|null|\n", "2308.07146": "|**2023-08-14**|**CTP: Towards Vision-Language Continual Pretraining via Compatible Momentum Contrast and Topology Preservation**|Hongguang Zhu et.al.|[2308.07146v1](http://arxiv.org/abs/2308.07146v1)|**[link](https://github.com/kevinlight831/ctp)**|\n", "2308.07026": "|**2023-08-14**|**AdvCLIP: Downstream-agnostic Adversarial Examples in Multimodal Contrastive Learning**|Ziqi Zhou et.al.|[2308.07026v1](http://arxiv.org/abs/2308.07026v1)|**[link](https://github.com/cgcl-codes/advclip)**|\n", "2308.06911": "|**2023-08-14**|**GIT-Mol: A Multi-modal Large Language Model for Molecular Science with Graph, Image, and Text**|Pengfei Liu et.al.|[2308.06911v1](http://arxiv.org/abs/2308.06911v1)|null|\n", "2308.06866": "|**2023-08-13**|**Improving Face Recognition from Caption Supervision with Multi-Granular Contextual Feature Aggregation**|Md Mahedi Hasan et.al.|[2308.06866v1](http://arxiv.org/abs/2308.06866v1)|null|\n", "2308.06735": "|**2023-08-13**|**AerialVLN: Vision-and-Language Navigation for UAVs**|Shubo Liu et.al.|[2308.06735v1](http://arxiv.org/abs/2308.06735v1)|**[link](https://github.com/airvln/airvln)**|\n", "2308.06696": "|**2023-08-13**|**MACO: A Modality Adversarial and Contrastive Framework for Modality-missing Multi-modal Knowledge Graph Completion**|Yichi Zhang et.al.|[2308.06696v1](http://arxiv.org/abs/2308.06696v1)|**[link](https://github.com/zjukg/maco)**|\n", "2308.06573": "|**2023-08-12**|**4DRVO-Net: Deep 4D Radar-Visual Odometry Using Multi-Modal and Multi-Scale Adaptive Fusion**|Guirong Zhuo et.al.|[2308.06573v1](http://arxiv.org/abs/2308.06573v1)|null|\n", "2308.06556": "|**2023-08-12**|**Contrastive Learning for Cross-modal Artist Retrieval**|Andres Ferraro et.al.|[2308.06556v1](http://arxiv.org/abs/2308.06556v1)|null|\n", "2308.06530": "|**2023-08-12**|**BEV-DG: Cross-Modal Learning under Bird's-Eye View for Domain Generalization of 3D Semantic Segmentation**|Miaoyu Li et.al.|[2308.06530v1](http://arxiv.org/abs/2308.06530v1)|null|\n", "2308.06498": "|**2023-08-12**|**Latent Emission-Augmented Perspective-Taking (LEAPT) for Human-Robot Interaction**|Kaiqi Chen et.al.|[2308.06498v1](http://arxiv.org/abs/2308.06498v1)|null|\n", "2308.06394": "|**2023-08-11**|**Detecting and Preventing Hallucinations in Large Vision Language Models**|Anisha Gunjal et.al.|[2308.06394v1](http://arxiv.org/abs/2308.06394v1)|null|\n", "2308.06377": "|**2023-08-11**|**CATS v2: Hybrid encoders for robust medical segmentation**|Hao Li et.al.|[2308.06377v1](http://arxiv.org/abs/2308.06377v1)|**[link](https://github.com/haoli12345/cats)**|\n", "2308.07907": "|**2023-08-15**|**Sequential Monte Carlo with Cross-validated Neural Networks for Complexity of Hyperbolic Black Hole Solutions in 4D**|Armin Hatefi et.al.|[2308.07907v1](http://arxiv.org/abs/2308.07907v1)|null|\n", "2308.07777": "|**2023-08-15**|**Enhancing Visually-Rich Document Understanding via Layout Structure Modeling**|Qiwei Li et.al.|[2308.07777v1](http://arxiv.org/abs/2308.07777v1)|null|\n", "2308.07751": "|**2023-08-15**|**CASPNet++: Joint Multi-Agent Motion Prediction**|Maximilian Sch\u00e4fer et.al.|[2308.07751v1](http://arxiv.org/abs/2308.07751v1)|null|\n", "2308.07732": "|**2023-08-15**|**UniTR: A Unified and Efficient Multi-Modal Transformer for Bird's-Eye-View Representation**|Haiyang Wang et.al.|[2308.07732v1](http://arxiv.org/abs/2308.07732v1)|**[link](https://github.com/haiyang-w/unitr)**|\n", "2308.07686": "|**2023-08-15**|**Boosting Multi-modal Model Performance with Adaptive Gradient Modulation**|Hong Li et.al.|[2308.07686v1](http://arxiv.org/abs/2308.07686v1)|**[link](https://github.com/lihong2303/agm_iccv2023)**|\n", "2308.07648": "|**2023-08-15**|**Prompt Switch: Efficient CLIP Adaptation for Text-Video Retrieval**|Chaorui Deng et.al.|[2308.07648v1](http://arxiv.org/abs/2308.07648v1)|**[link](https://github.com/bladewaltz1/promptswitch)**|\n", "2308.07622": "|**2023-08-15**|**EMID: An Emotional Aligned Dataset in Audio-Visual Modality**|Jialing Zou et.al.|[2308.07622v1](http://arxiv.org/abs/2308.07622v1)|**[link](https://github.com/ecnu-aigc/emid)**|\n", "2308.07605": "|**2023-08-15**|**SGDiff: A Style Guided Diffusion Model for Fashion Synthesis**|Zhengwentai Sun et.al.|[2308.07605v1](http://arxiv.org/abs/2308.07605v1)|**[link](https://github.com/taited/sgdiff)**|\n", "2308.08546": "|**2023-08-16**|**What is the source of the PTA GW signal?**|John Ellis et.al.|[2308.08546v1](http://arxiv.org/abs/2308.08546v1)|null|\n", "2308.08409": "|**2023-08-16**|**X-PSI Parameter Recovery for Temperature Map Configurations Inspired by PSR J0030+0451**|Serena Vinciguerra et.al.|[2308.08409v1](http://arxiv.org/abs/2308.08409v1)|null|\n", "2308.08303": "|**2023-08-16**|**Leveraging Next-Active Objects for Context-Aware Anticipation in Egocentric Videos**|Sanket Thakur et.al.|[2308.08303v1](http://arxiv.org/abs/2308.08303v1)|null|\n", "2308.08157": "|**2023-08-16**|**Learning to Generate Semantic Layouts for Higher Text-Image Correspondence in Text-to-Image Synthesis**|Minho Park et.al.|[2308.08157v1](http://arxiv.org/abs/2308.08157v1)|**[link](https://github.com/pmh9960/GCDP)**|\n", "2308.08143": "|**2023-08-16**|**SCANet: A Self- and Cross-Attention Network for Audio-Visual Speech Separation**|Kai Li et.al.|[2308.08143v1](http://arxiv.org/abs/2308.08143v1)|null|\n", "2308.08125": "|**2023-08-16**|**Radio2Text: Streaming Speech Recognition Using mmWave Radio Signals**|Running Zhao et.al.|[2308.08125v1](http://arxiv.org/abs/2308.08125v1)|null|\n", "2308.08088": "|**2023-08-16**|**Pro-Cap: Leveraging a Frozen Vision-Language Model for Hateful Meme Detection**|Rui Cao et.al.|[2308.08088v1](http://arxiv.org/abs/2308.08088v1)|**[link](https://github.com/social-ai-studio/pro-cap)**|\n", "2308.09622": "|**2023-08-18**|**Is context all you need? Scaling Neural Sign Language Translation to Large Domains of Discourse**|Ozge Mercanoglu Sincan et.al.|[2308.09622v1](http://arxiv.org/abs/2308.09622v1)|null|\n", "2308.09599": "|**2023-08-18**|**Language-Guided Diffusion Model for Visual Grounding**|Sijia Chen et.al.|[2308.09599v1](http://arxiv.org/abs/2308.09599v1)|null|\n", "2308.09568": "|**2023-08-18**|**PUMGPT: A Large Vision-Language Model for Product Understanding**|Shuhui Wu et.al.|[2308.09568v1](http://arxiv.org/abs/2308.09568v1)|null|\n", "2308.09475": "|**2023-08-18**|**Video-Instrument Synergistic Network for Referring Video Instrument Segmentation in Robotic Surgery**|Hongqiu Wang et.al.|[2308.09475v1](http://arxiv.org/abs/2308.09475v1)|null|\n", "2308.09469": "|**2023-08-18**|**An updated mass-radius analysis of the 2017-2018 NICER data set of PSR J0030+0451**|Serena Vinciguerra et.al.|[2308.09469v1](http://arxiv.org/abs/2308.09469v1)|null|\n", "2308.09442": "|**2023-08-21**|**BioMedGPT: Open Multimodal Generative Pre-trained Transformer for BioMedicine**|Yizhen Luo et.al.|[2308.09442v2](http://arxiv.org/abs/2308.09442v2)|**[link](https://github.com/pharmolix/openbiomed)**|\n", "2308.09369": "|**2023-08-18**|**Single Frame Semantic Segmentation Using Multi-Modal Spherical Images**|Suresh Guttikonda et.al.|[2308.09369v1](http://arxiv.org/abs/2308.09369v1)|**[link](https://github.com/sguttikon/SFSS-MMSI)**|\n", "2308.09363": "|**2023-08-18**|**Open-vocabulary Video Question Answering: A New Benchmark for Evaluating the Generalizability of Video Question Answering Models**|Dohwan Ko et.al.|[2308.09363v1](http://arxiv.org/abs/2308.09363v1)|**[link](https://github.com/mlvlab/ovqa)**|\n", "2308.09351": "|**2023-08-18**|**RLIPv2: Fast Scaling of Relational Language-Image Pre-training**|Hangjie Yuan et.al.|[2308.09351v1](http://arxiv.org/abs/2308.09351v1)|**[link](https://github.com/jacobyuan7/rlipv2)**|\n", "2308.09322": "|**2023-08-18**|**Audio-Visual Glance Network for Efficient Video Recognition**|Muhammad Adi Nugroho et.al.|[2308.09322v1](http://arxiv.org/abs/2308.09322v1)|null|\n", "2308.09306": "|**2023-08-18**|**DiffDis: Empowering Generative Diffusion Model with Cross-Modal Discrimination Capability**|Runhui Huang et.al.|[2308.09306v1](http://arxiv.org/abs/2308.09306v1)|null|\n", "2308.09300": "|**2023-08-21**|**V2A-Mapper: A Lightweight Solution for Vision-to-Audio Generation by Connecting Foundation Models**|Heng Wang et.al.|[2308.09300v2](http://arxiv.org/abs/2308.09300v2)|**[link](https://github.com/heng-hw/V2A-Mapper)**|\n", "2308.09234": "|**2023-08-18**|**Deep Boosting Multi-Modal Ensemble Face Recognition with Sample-Level Weighting**|Sahar Rahimi Malakshan et.al.|[2308.09234v1](http://arxiv.org/abs/2308.09234v1)|null|\n", "2308.09179": "|**2023-08-17**|**Versatile Multi-Contact Planning and Control for Legged Loco-Manipulation**|Jean-Pierre Sleiman et.al.|[2308.09179v1](http://arxiv.org/abs/2308.09179v1)|null|\n", "2308.08930": "|**2023-08-17**|**Point-aware Interaction and CNN-induced Refinement Network for RGB-D Salient Object Detection**|Runmin Cong et.al.|[2308.08930v1](http://arxiv.org/abs/2308.08930v1)|**[link](https://github.com/rmcong/picr-net_acmmm23)**|\n", "2308.10777": "|**2023-08-21**|**I-BaR: Integrated Balance Rehabilitation Framework**|Tugce Ersoy et.al.|[2308.10777v1](http://arxiv.org/abs/2308.10777v1)|null|\n", "2308.10741": "|**2023-08-21**|**On the Adversarial Robustness of Multi-Modal Foundation Models**|Christian Schlarmann et.al.|[2308.10741v1](http://arxiv.org/abs/2308.10741v1)|null|\n", "2308.10631": "|**2023-08-21**|**PsyMo: A Dataset for Estimating Self-Reported Psychological Traits from Gait**|Adrian Cosma et.al.|[2308.10631v1](http://arxiv.org/abs/2308.10631v1)|null|\n", "2308.10627": "|**2023-08-21**|**Polarimetric Information for Multi-Modal 6D Pose Estimation of Photometrically Challenging Objects with Limited Data**|Patrick Ruhkamp et.al.|[2308.10627v1](http://arxiv.org/abs/2308.10627v1)|null|\n", "2308.10621": "|**2023-08-21**|**Multi-Modal Dataset Acquisition for Photometrically Challenging Object**|HyunJun Jung et.al.|[2308.10621v1](http://arxiv.org/abs/2308.10621v1)|null|\n", "2308.10491": "|**2023-08-21**|**SynDrone -- Multi-modal UAV Dataset for Urban Scenarios**|Giulia Rizzoli et.al.|[2308.10491v1](http://arxiv.org/abs/2308.10491v1)|**[link](https://github.com/lttm/syndrone)**|\n", "2308.10486": "|**2023-08-21**|**Deep Metric Loss for Multimodal Learning**|Sehwan Moon et.al.|[2308.10486v1](http://arxiv.org/abs/2308.10486v1)|**[link](https://github.com/sehwanmoon/multimodalloss)**|\n", "2308.10454": "|**2023-08-21**|**Elucidating STEM Concepts through Generative AI: A Multi-modal Exploration of Analogical Reasoning**|Chen Cao et.al.|[2308.10454v1](http://arxiv.org/abs/2308.10454v1)|null|\n", "2308.10421": "|**2023-08-21**|**UniM$^2$AE: Multi-modal Masked Autoencoders with Unified 3D Representation for 3D Perception in Autonomous Driving**|Jian Zou et.al.|[2308.10421v1](http://arxiv.org/abs/2308.10421v1)|**[link](https://github.com/hollow-503/unim2ae)**|\n", "2308.10362": "|**2023-08-20**|**Vehicle Cameras Guide mmWave Beams: Approach and Real-World V2V Demonstration**|Tawfik Osman et.al.|[2308.10362v1](http://arxiv.org/abs/2308.10362v1)|null|\n", "2308.10240": "|**2023-08-20**|**Generic Attention-model Explainability by Weighted Relevance Accumulation**|Yiming Huang et.al.|[2308.10240v1](http://arxiv.org/abs/2308.10240v1)|null|\n", "2308.10175": "|**2023-08-20**|**BAVS: Bootstrapping Audio-Visual Segmentation by Integrating Foundation Knowledge**|Chen Liu et.al.|[2308.10175v1](http://arxiv.org/abs/2308.10175v1)|null|\n", "2308.10172": "|**2023-08-20**|**VLN-PETL: Parameter-Efficient Transfer Learning for Vision-and-Language Navigation**|Yanyuan Qiao et.al.|[2308.10172v1](http://arxiv.org/abs/2308.10172v1)|**[link](https://github.com/yanyuanqiao/vln-petl)**|\n", "2308.10161": "|**2023-08-20**|**ThermRad: A Multi-modal Dataset for Robust 3D Object Detection under Challenging Conditions**|Qiao Yan et.al.|[2308.10161v1](http://arxiv.org/abs/2308.10161v1)|null|\n", "2308.10146": "|**2023-08-20**|**OCHID-Fi: Occlusion-Robust Hand Pose Estimation in 3D via RF-Vision**|Shujie Zhang et.al.|[2308.10146v1](http://arxiv.org/abs/2308.10146v1)|null|\n", "2308.11601": "|**2023-08-23**|**Tryage: Real-time, intelligent Routing of User Prompts to Large Language Models**|Surya Narayanan Hari et.al.|[2308.11601v2](http://arxiv.org/abs/2308.11601v2)|null|\n", "2308.11561": "|**2023-08-23**|**Target-Grounded Graph-Aware Transformer for Aerial Vision-and-Dialog Navigation**|Yifei Su et.al.|[2308.11561v2](http://arxiv.org/abs/2308.11561v2)|**[link](https://github.com/yifeisu/avdn-challenge)**|\n", "2308.11551": "|**2023-08-22**|**Multi-event Video-Text Retrieval**|Gengyuan Zhang et.al.|[2308.11551v1](http://arxiv.org/abs/2308.11551v1)|**[link](https://github.com/gengyuanmax/mevtr)**|\n", "2308.11530": "|**2023-08-22**|**Furnishing Sound Event Detection with Language Model Abilities**|Hualei Wang et.al.|[2308.11530v1](http://arxiv.org/abs/2308.11530v1)|null|\n", "2308.11513": "|**2023-08-22**|**TrackFlow: Multi-Object Tracking with Normalizing Flows**|Gianluca Mancusi et.al.|[2308.11513v1](http://arxiv.org/abs/2308.11513v1)|null|\n", "2308.11501": "|**2023-08-22**|**Four years of multi-modal odometry and mapping on the rail vehicles**|Yusheng Wang et.al.|[2308.11501v1](http://arxiv.org/abs/2308.11501v1)|null|\n", "2308.11492": "|**2023-08-22**|**A LiDAR-Inertial SLAM Tightly-Coupled with Dropout-Tolerant GNSS Fusion for Autonomous Mine Service Vehicles**|Yusheng Wang et.al.|[2308.11492v1](http://arxiv.org/abs/2308.11492v1)|null|\n", "2308.11356": "|**2023-08-22**|**Semantic RGB-D Image Synthesis**|Shijie Li et.al.|[2308.11356v1](http://arxiv.org/abs/2308.11356v1)|null|\n", "2308.11351": "|**2023-08-22**|**M3PS: End-to-End Multi-Grained Multi-Modal Attribute-Aware Product Summarization in E-commerce**|Tao Chen et.al.|[2308.11351v1](http://arxiv.org/abs/2308.11351v1)|null|\n", "2308.11331": "|**2023-08-22**|**GrowCLIP: Data-aware Automatic Model Growing for Large-scale Contrastive Language-Image Pre-training**|Xinchi Deng et.al.|[2308.11331v1](http://arxiv.org/abs/2308.11331v1)|null|\n", "2308.11206": "|**2023-08-22**|**DiffCloth: Diffusion Based Garment Synthesis and Manipulation via Structural Cross-modal Semantic Alignment**|Xujie Zhang et.al.|[2308.11206v1](http://arxiv.org/abs/2308.11206v1)|null|\n", "2308.11175": "|**2023-08-22**|**MISSRec: Pre-training and Transferring Multi-modal Interest-aware Sequence Representation for Recommendation**|Jinpeng Wang et.al.|[2308.11175v1](http://arxiv.org/abs/2308.11175v1)|**[link](https://github.com/gimpong/MM23-MISSRec)**|\n", "2308.11165": "|**2023-08-22**|**Improving Misaligned Multi-modality Image Fusion with One-stage Progressive Dense Registration**|Di Wang et.al.|[2308.11165v1](http://arxiv.org/abs/2308.11165v1)|null|\n", "2308.12199": "|**2023-08-23**|**Towards Real-Time Analysis of Broadcast Badminton Videos**|Nitin Nilesh et.al.|[2308.12199v1](http://arxiv.org/abs/2308.12199v1)|**[link](https://gitlab.com/nitin.nilesh/badminton-analysis-star)**|\n", "2308.12163": "|**2023-08-23**|**NPF-200: A Multi-Modal Eye Fixation Dataset and Method for Non-Photorealistic Videos**|Ziyu Yang et.al.|[2308.12163v1](http://arxiv.org/abs/2308.12163v1)|**[link](https://github.com/yangziyu/npf200)**|\n", "2308.12111": "|**2023-08-23**|**Cross-Modality Proposal-guided Feature Mining for Unregistered RGB-Thermal Pedestrian Detection**|Chao Tian et.al.|[2308.12111v1](http://arxiv.org/abs/2308.12111v1)|null|\n", "2308.12049": "|**2023-08-23**|**Towards Privacy-Supporting Fall Detection via Deep Unsupervised RGB2Depth Adaptation**|Hejun Xiao et.al.|[2308.12049v1](http://arxiv.org/abs/2308.12049v1)|**[link](https://github.com/1015206533/privacy_supporting_fall_detection)**|\n", "2308.11994": "|**2023-08-23**|**Progressive Feature Mining and External Knowledge-Assisted Text-Pedestrian Image Retrieval**|Huafeng Li et.al.|[2308.11994v1](http://arxiv.org/abs/2308.11994v1)|null|\n", "2308.11983": "|**2023-08-23**|**Multi-Modal Multi-Task (3MT) Road Segmentation**|Erkan Milli et.al.|[2308.11983v1](http://arxiv.org/abs/2308.11983v1)|**[link](https://github.com/erkanmilli/3mt-roadseg)**|\n", "2308.11880": "|**2023-08-23**|**SUMMIT: Source-Free Adaptation of Uni-Modal Models to Multi-Modal Targets**|Cody Simons et.al.|[2308.11880v1](http://arxiv.org/abs/2308.11880v1)|**[link](https://github.com/csimo005/summit)**|\n", "2308.11877": "|**2023-08-24**|**Integrated Image and Location Analysis for Wound Classification: A Deep Learning Approach**|Yash Patel et.al.|[2308.11877v2](http://arxiv.org/abs/2308.11877v2)|null|\n", "2308.11804": "|**2023-08-22**|**Ceci n'est pas une pomme: Adversarial Illusions in Multi-Modal Embeddings**|Eugene Bagdasaryan et.al.|[2308.11804v1](http://arxiv.org/abs/2308.11804v1)|**[link](https://github.com/ebagdasa/adversarial_illusions)**|\n", "2308.11797": "|**2023-08-22**|**CLIP Multi-modal Hashing: A new baseline CLIPMH**|Jian Zhu et.al.|[2308.11797v1](http://arxiv.org/abs/2308.11797v1)|null|\n", "2308.12956": "|**2023-08-24**|**DLIP: Distilling Language-Image Pre-training**|Huafeng Kuang et.al.|[2308.12956v1](http://arxiv.org/abs/2308.12956v1)|null|\n", "2308.12871": "|**2023-08-24**|**IPA: Inference Pipeline Adaptation to Achieve High Accuracy and Cost-Efficiency**|Saeid Ghafouri et.al.|[2308.12871v1](http://arxiv.org/abs/2308.12871v1)|null|\n", "2308.12863": "|**2023-08-24**|**SkipcrossNets: Adaptive Skip-cross Fusion for Road Detection**|Xinyu Zhang et.al.|[2308.12863v1](http://arxiv.org/abs/2308.12863v1)|null|\n", "2308.12755": "|**2023-08-24**|**Acquiring Qualitative Explainable Graphs for Automated Driving Scene Interpretation**|Nassim Belmecheri et.al.|[2308.12755v1](http://arxiv.org/abs/2308.12755v1)|**[link](https://github.com/simula-vias/qxg-builder)**|\n", "2308.12736": "|**2023-08-24**|**FastSurfer-HypVINN: Automated sub-segmentation of the hypothalamus and adjacent structures on high-resolutional brain MRI**|Santiago Estrada et.al.|[2308.12736v1](http://arxiv.org/abs/2308.12736v1)|**[link](https://github.com/Deep-MI/FastSurfer)**|\n", "2308.12610": "|**2023-08-24**|**Emotion-Aligned Contrastive Learning Between Images and Music**|Shanti Stewart et.al.|[2308.12610v1](http://arxiv.org/abs/2308.12610v1)|null|\n", "2308.12604": "|**2023-08-24**|**PromptMRG: Diagnosis-Driven Prompts for Medical Report Generation**|Haibo Jin et.al.|[2308.12604v1](http://arxiv.org/abs/2308.12604v1)|null|\n", "2308.12587": "|**2023-08-24**|**Grounded Entity-Landmark Adaptive Pre-training for Vision-and-Language Navigation**|Yibo Cui et.al.|[2308.12587v1](http://arxiv.org/abs/2308.12587v1)|**[link](https://github.com/csir1996/vln-gela)**|\n", "2308.12558": "|**2023-08-24**|**Hyperbolic Audio-visual Zero-shot Learning**|Jie Hong et.al.|[2308.12558v1](http://arxiv.org/abs/2308.12558v1)|null|\n", "2308.12509": "|**2023-08-24**|**Parameter-Efficient Transfer Learning for Remote Sensing Image-Text Retrieval**|Yuan Yuan et.al.|[2308.12509v1](http://arxiv.org/abs/2308.12509v1)|**[link](https://github.com/ZhanYang-nwpu/PE-RSITR)**|\n", "2308.12370": "|**2023-08-23**|**AdVerb: Visually Guided Audio Dereverberation**|Sanjoy Chowdhury et.al.|[2308.12370v1](http://arxiv.org/abs/2308.12370v1)|null|\n", "2308.12320": "|**2023-08-23**|**Understanding Dark Scenes by Contrasting Multi-Modal Observations**|Xiaoyu Dong et.al.|[2308.12320v1](http://arxiv.org/abs/2308.12320v1)|**[link](https://github.com/palmdong/smmcl)**|\n", "2308.13437": "|**2023-08-25**|**Position-Enhanced Visual Instruction Tuning for Multimodal Large Language Models**|Chi Chen et.al.|[2308.13437v1](http://arxiv.org/abs/2308.13437v1)|**[link](https://github.com/pvit-official/pvit)**|\n", "2308.13392": "|**2023-08-25**|**Self-Supervised Representation Learning with Cross-Context Learning between Global and Hypercolumn Features**|Zheng Gao et.al.|[2308.13392v1](http://arxiv.org/abs/2308.13392v1)|null|\n", "2308.13355": "|**2023-08-25**|**WorldSmith: Iterative and Expressive Prompting for World Building with a Generative AI**|Hai Dang et.al.|[2308.13355v1](http://arxiv.org/abs/2308.13355v1)|null|\n", "2308.13340": "|**2023-08-25**|**TriGait: Aligning and Fusing Skeleton and Silhouette Gait Data via a Tri-Branch Network**|Yan Sun et.al.|[2308.13340v1](http://arxiv.org/abs/2308.13340v1)|**[link](https://github.com/feng-xueling/trigait)**|\n", "2308.13077": "|**2023-08-24**|**Preserving Modality Structure Improves Multi-Modal Learning**|Swetha Sirnam et.al.|[2308.13077v1](http://arxiv.org/abs/2308.13077v1)|null|\n", "2308.14713": "|**2023-08-28**|**R3D3: Dense 3D Reconstruction of Dynamic Scenes from Multiple Cameras**|Aron Schmied et.al.|[2308.14713v1](http://arxiv.org/abs/2308.14713v1)|null|\n", "2308.14619": "|**2023-08-29**|**Compositional Semantic Mix for Domain Adaptation in Point Cloud Segmentation**|Cristiano Saltori et.al.|[2308.14619v2](http://arxiv.org/abs/2308.14619v2)|**[link](https://github.com/saltoricristiano/cosmix-uda)**|\n", "2308.14613": "|**2023-08-28**|**MS-Net: A Multi-modal Self-supervised Network for Fine-Grained Classification of Aircraft in SAR Images**|Bingying Yue et.al.|[2308.14613v1](http://arxiv.org/abs/2308.14613v1)|null|\n", "2308.14482": "|**2023-08-28**|**An Empirical Study of Consistency Regularization for End-to-End Speech-to-Text Translation**|Pengzhi Gao et.al.|[2308.14482v1](http://arxiv.org/abs/2308.14482v1)|**[link](https://github.com/gpengzhi/simcr)**|\n", "2308.14383": "|**2023-08-28**|**Multi-Modal Neural Radiance Field for Monocular Dense SLAM with a Light-Weight ToF Sensor**|Xinyang Liu et.al.|[2308.14383v1](http://arxiv.org/abs/2308.14383v1)|null|\n", "2308.14263": "|**2023-08-28**|**Cross-Modal Retrieval: A Systematic Review of Methods and Future Directions**|Lei Zhu et.al.|[2308.14263v1](http://arxiv.org/abs/2308.14263v1)|**[link](https://github.com/bmc-sdnu/cross-modal-retrieval)**|\n", "2308.14212": "|**2023-08-27**|**Exploring the Transfer Learning Capabilities of CLIP in Domain Generalization for Diabetic Retinopathy**|Sanoojan Baliah et.al.|[2308.14212v1](http://arxiv.org/abs/2308.14212v1)|**[link](https://github.com/sanoojan/clip-drdg)**|\n", "2308.14177": "|**2023-08-27**|**AIGC for Various Data Modalities: A Survey**|Lin Geng Foo et.al.|[2308.14177v1](http://arxiv.org/abs/2308.14177v1)|null|\n", "2308.14160": "|**2023-08-27**|**A Unified Transformer-based Network for multimodal Emotion Recognition**|Kamran Ali et.al.|[2308.14160v1](http://arxiv.org/abs/2308.14160v1)|null|\n", "2308.14105": "|**2023-08-29**|**Unified and Dynamic Graph for Temporal Character Grouping in Long Videos**|Xiujun Shu et.al.|[2308.14105v2](http://arxiv.org/abs/2308.14105v2)|null|\n", "2308.14083": "|**2023-08-27**|**4D Myocardium Reconstruction with Decoupled Motion and Shape Model**|Xiaohan Yuan et.al.|[2308.14083v1](http://arxiv.org/abs/2308.14083v1)|**[link](https://github.com/yuan-xiaohan/4d-myocardium-reconstruction-with-decoupled-motion-and-shape-model)**|\n", "2308.14064": "|**2023-08-27**|**Multi-model fusion for Aerial Vision and Dialog Navigation based on human attention aids**|Xinyi Wang et.al.|[2308.14064v1](http://arxiv.org/abs/2308.14064v1)|null|\n", "2308.14023": "|**2023-08-27**|**Domain-Specificity Inducing Transformers for Source-Free Domain Adaptation**|Sunandini Sanyal et.al.|[2308.14023v1](http://arxiv.org/abs/2308.14023v1)|null|\n", "2308.14009": "|**2023-08-27**|**Towards Fast and Accurate Image-Text Retrieval with Self-Supervised Fine-Grained Alignment**|Jiamin Zhuang et.al.|[2308.14009v1](http://arxiv.org/abs/2308.14009v1)|**[link](https://github.com/zjamie813/selfalign)**|\n", "2308.13976": "|**2023-08-27**|**Label Denoising through Cross-Model Agreement**|Yu Wang et.al.|[2308.13976v1](http://arxiv.org/abs/2308.13976v1)|null|\n", "2308.15273": "|**2023-08-29**|**Cross-Modal Retrieval Meets Inference:Improving Zero-Shot Classification with Cross-Modal Retrieval**|Seongha Eom et.al.|[2308.15273v1](http://arxiv.org/abs/2308.15273v1)|null|\n", "2308.15063": "|**2023-08-29**|**Learning Cross-modality Information Bottleneck Representation for Heterogeneous Person Re-Identification**|Haichao Shi et.al.|[2308.15063v1](http://arxiv.org/abs/2308.15063v1)|null|\n", "2308.14978": "|**2023-08-29**|**Vision Grid Transformer for Document Layout Analysis**|Cheng Da et.al.|[2308.14978v1](http://arxiv.org/abs/2308.14978v1)|**[link](https://github.com/alibabaresearch/advancedliteratemachinery)**|\n", "2308.14786": "|**2023-08-28**|**Extending Cross-Modal Retrieval with Interactive Learning to Improve Image Retrieval Performance in Forensics**|Nils B\u00f6hne et.al.|[2308.14786v1](http://arxiv.org/abs/2308.14786v1)|null|\n", "2308.16150": "|**2023-08-30**|**Modality Cycles with Masked Conditional Diffusion for Unsupervised Anomaly Segmentation in MRI**|Ziyun Liang et.al.|[2308.16150v1](http://arxiv.org/abs/2308.16150v1)|**[link](https://github.com/ziyunliang/mmccd)**|\n", "2308.16071": "|**2023-08-30**|**Semantic Image Synthesis via Class-Adaptive Cross-Attention**|Tomaso Fontanini et.al.|[2308.16071v1](http://arxiv.org/abs/2308.16071v1)|null|\n", "2308.16021": "|**2023-08-30**|**CALM: Contrastive Cross-modal Speaking Style Modeling for Expressive Text-to-Speech Synthesis**|Yi Meng et.al.|[2308.16021v1](http://arxiv.org/abs/2308.16021v1)|null|\n", "2308.15980": "|**2023-08-30**|**Adaptive Multi-Modalities Fusion in Sequential Recommendation Systems**|Hengchang Hu et.al.|[2308.15980v1](http://arxiv.org/abs/2308.15980v1)|**[link](https://github.com/holdenhu/mmsr)**|\n", "2308.15930": "|**2023-08-30**|**LLaSM: Large Language and Speech Model**|Yu Shu et.al.|[2308.15930v1](http://arxiv.org/abs/2308.15930v1)|**[link](https://github.com/linksoul-ai/llasm)**|\n", "2308.15846": "|**2023-08-30**|**Exploring Multi-Modal Contextual Knowledge for Open-Vocabulary Object Detection**|Yifan Xu et.al.|[2308.15846v1](http://arxiv.org/abs/2308.15846v1)|null|\n", "2308.15670": "|**2023-08-29**|**Multimodal Foundation Models For Echocardiogram Interpretation**|Matthew Christensen et.al.|[2308.15670v1](http://arxiv.org/abs/2308.15670v1)|**[link](https://github.com/echonet/echo_CLIP)**|\n", "2308.15640": "|**2023-08-29**|**Identifying Constitutive Parameters for Complex Hyperelastic Solids using Physics-Informed Neural Networks**|Siyuan Song et.al.|[2308.15640v1](http://arxiv.org/abs/2308.15640v1)|null|\n", "2308.15609": "|**2023-08-29**|**InstaTune: Instantaneous Neural Architecture Search During Fine-Tuning**|Sharath Nittur Sridhar et.al.|[2308.15609v1](http://arxiv.org/abs/2308.15609v1)|null|\n", "2308.15592": "|**2023-08-29**|**Non-local Interactions are Essential Elements for Dark Matter Halo Stability: A Cross-Model Study**|Ahmad Borzou et.al.|[2308.15592v1](http://arxiv.org/abs/2308.15592v1)|null|\n", "2308.16896": "|**2023-08-31**|**PointOcc: Cylindrical Tri-Perspective View for Point-based 3D Semantic Occupancy Prediction**|Sicheng Zuo et.al.|[2308.16896v1](http://arxiv.org/abs/2308.16896v1)|**[link](https://github.com/wzzheng/pointocc)**|\n", "2308.16777": "|**2023-09-01**|**Ref-Diff: Zero-shot Referring Image Segmentation with Generative Models**|Minheng Ni et.al.|[2308.16777v2](http://arxiv.org/abs/2308.16777v2)|null|\n", "2308.16758": "|**2023-08-31**|**Towards High-Fidelity Text-Guided 3D Face Generation and Manipulation Using only Images**|Cuican Yu et.al.|[2308.16758v1](http://arxiv.org/abs/2308.16758v1)|null|\n", "2308.16649": "|**2023-08-31**|**Learning with Multi-modal Gradient Attention for Explainable Composed Image Retrieval**|Prateksha Udhayanan et.al.|[2308.16649v1](http://arxiv.org/abs/2308.16649v1)|null|\n", "2308.16632": "|**2023-08-31**|**3D-STMN: Dependency-Driven Superpoint-Text Matching Network for End-to-End 3D Referring Expression Segmentation**|Changli Wu et.al.|[2308.16632v1](http://arxiv.org/abs/2308.16632v1)|**[link](https://github.com/sosppxo/3d-stmn)**|\n", "2308.16493": "|**2023-08-31**|**Expanding Frozen Vision-Language Models without Retraining: Towards Improved Robot Perception**|Riley Tavassoli et.al.|[2308.16493v1](http://arxiv.org/abs/2308.16493v1)|null|\n", "2308.16474": "|**2023-08-31**|**Enhancing Subtask Performance of Multi-modal Large Language Model**|Yongqiang Zhao et.al.|[2308.16474v1](http://arxiv.org/abs/2308.16474v1)|null|\n", "2308.16437": "|**2023-08-31**|**AntM$^{2}$C: A Large Scale Dataset For Multi-Scenario Multi-Modal CTR Prediction**|Zhaoxin Huan et.al.|[2308.16437v1](http://arxiv.org/abs/2308.16437v1)|null|\n", "2308.16386": "|**2023-08-31**|**RGB-T Tracking via Multi-Modal Mutual Prompt Learning**|Yang Luo et.al.|[2308.16386v1](http://arxiv.org/abs/2308.16386v1)|**[link](https://github.com/husteryoung/mplt)**|\n", "2309.00615": "|**2023-09-01**|**Point-Bind & Point-LLM: Aligning Point Cloud with Multi-modality for 3D Understanding, Generation, and Instruction Following**|Ziyu Guo et.al.|[2309.00615v1](http://arxiv.org/abs/2309.00615v1)|**[link](https://github.com/ziyuguo99/point-bind_point-llm)**|\n", "2309.00406": "|**2023-09-01**|**Constraining X-ray variability of the blazar 3C 273 using XMM-Newton observations over two decades**|Adithiya Dinesh et.al.|[2309.00406v1](http://arxiv.org/abs/2309.00406v1)|null|\n", "2309.00380": "|**2023-09-01**|**Learning multi-modal generative models with permutation-invariant encoders and tighter variational bounds**|Marcel Hirt et.al.|[2309.00380v1](http://arxiv.org/abs/2309.00380v1)|null|\n", "2309.00372": "|**2023-09-01**|**On the Localization of Ultrasound Image Slices within Point Distribution Models**|Lennart Bastian et.al.|[2309.00372v1](http://arxiv.org/abs/2309.00372v1)|**[link](https://github.com/vuenc/slice-to-shape)**|\n", "2309.00227": "|**2023-09-01**|**What Makes Good Open-Vocabulary Detector: A Disassembling Perspective**|Jincheng Li et.al.|[2309.00227v1](http://arxiv.org/abs/2309.00227v1)|null|\n", "2309.00133": "|**2023-08-31**|**Distraction-free Embeddings for Robust VQA**|Atharvan Dogra et.al.|[2309.00133v1](http://arxiv.org/abs/2309.00133v1)|null|\n", "2309.00030": "|**2023-08-31**|**Audio-Driven Dubbing for User Generated Contents via Style-Aware Semi-Parametric Synthesis**|Linsen Song et.al.|[2309.00030v1](http://arxiv.org/abs/2309.00030v1)|null|\n", "2309.02320": "|**2023-09-05**|**SeisCLIP: A seismology foundation model pre-trained by multi-modal data for multi-purpose seismic feature extraction**|Xu Si et.al.|[2309.02320v1](http://arxiv.org/abs/2309.02320v1)|**[link](https://github.com/sixu0/SeisCLIP)**|\n", "2309.02169": "|**2023-09-05**|**Dual Relation Alignment for Composed Image Retrieval**|Xintong Jiang et.al.|[2309.02169v1](http://arxiv.org/abs/2309.02169v1)|null|\n", "2309.02124": "|**2023-09-05**|**Exploiting Spatial-temporal Data for Sleep Stage Classification via Hypergraph Learning**|Yuze Liu et.al.|[2309.02124v1](http://arxiv.org/abs/2309.02124v1)|null|\n", "2309.02043": "|**2023-09-05**|**Decomposed Guided Dynamic Filters for Efficient RGB-Guided Depth Completion**|Yufei Wang et.al.|[2309.02043v1](http://arxiv.org/abs/2309.02043v1)|null|\n", "2309.02041": "|**2023-09-05**|**Learning Cross-Modal Affinity for Referring Video Object Segmentation Targeting Limited Samples**|Guanghui Li et.al.|[2309.02041v1](http://arxiv.org/abs/2309.02041v1)|**[link](https://github.com/hengliusky/few_shot_rvos)**|\n", "2309.01981": "|**2023-09-05**|**Graph-Based Interaction-Aware Multimodal 2D Vehicle Trajectory Prediction using Diffusion Graph Convolutional Networks**|Keshu Wu et.al.|[2309.01981v1](http://arxiv.org/abs/2309.01981v1)|null|\n", "2309.01955": "|**2023-09-05**|**A Survey on Interpretable Cross-modal Reasoning**|Dizhan Xue et.al.|[2309.01955v1](http://arxiv.org/abs/2309.01955v1)|**[link](https://github.com/ZuyiZhou/Awesome-Interpretable-Cross-modal-Reasoning)**|\n", "2309.01918": "|**2023-09-05**|**RoboAgent: Generalization and Efficiency in Robot Manipulation via Semantic Augmentations and Action Chunking**|Homanga Bharadhwaj et.al.|[2309.01918v1](http://arxiv.org/abs/2309.01918v1)|null|\n", "2309.01860": "|**2023-09-06**|**Attention-Driven Multi-Modal Fusion: Enhancing Sign Language Recognition and Translation**|Zaber Ibn Abdul Hakim et.al.|[2309.01860v2](http://arxiv.org/abs/2309.01860v2)|null|\n", "2309.01728": "|**2023-09-04**|**Generative-based Fusion Mechanism for Multi-Modal Tracking**|Zhangyong Tang et.al.|[2309.01728v1](http://arxiv.org/abs/2309.01728v1)|**[link](https://github.com/zhangyong-tang/gmmt)**|\n", "2309.01516": "|**2023-09-04**|**MultiWay-Adapater: Adapting large-scale multi-modal models for scalable image-text retrieval**|Zijun Long et.al.|[2309.01516v1](http://arxiv.org/abs/2309.01516v1)|**[link](https://github.com/longkukuhi/multiway-adapter)**|\n", "2309.01420": "|**2023-09-04**|**Unified Pre-training with Pseudo Texts for Text-To-Image Person Re-identification**|Zhiyin Shao et.al.|[2309.01420v1](http://arxiv.org/abs/2309.01420v1)|**[link](https://github.com/zhiyinshao-h/unipt)**|\n", "2309.01327": "|**2023-09-04**|**Can I Trust Your Answer? Visually Grounded Video Question Answering**|Junbin Xiao et.al.|[2309.01327v1](http://arxiv.org/abs/2309.01327v1)|**[link](https://github.com/doc-doc/next-gqa)**|\n", "2309.01256": "|**2023-09-03**|**BDC-Adapter: Brownian Distance Covariance for Better Vision-Language Reasoning**|Yi Zhang et.al.|[2309.01256v1](http://arxiv.org/abs/2309.01256v1)|null|\n", "2309.01073": "|**2023-09-03**|**Spatial and Visual Perspective-Taking via View Rotation and Relation Reasoning for Embodied Reference Understanding**|Cheng Shi et.al.|[2309.01073v1](http://arxiv.org/abs/2309.01073v1)|null|\n", "2309.03177": "|**2023-09-06**|**3D Object Positioning Using Differentiable Multimodal Learning**|Sean Zanyk-McLean et.al.|[2309.03177v1](http://arxiv.org/abs/2309.03177v1)|null|\n", "2309.03147": "|**2023-09-06**|**Real-Time Non-Invasive Imaging and Detection of Spreading Depolarizations through EEG: An Ultra-Light Explainable Deep Learning Approach**|Yinzhe Wu et.al.|[2309.03147v1](http://arxiv.org/abs/2309.03147v1)|null|\n", "2309.03100": "|**2023-09-06**|**FArMARe: a Furniture-Aware Multi-task methodology for Recommending Apartments based on the user interests**|Ali Abdari et.al.|[2309.03100v1](http://arxiv.org/abs/2309.03100v1)|**[link](https://github.com/aliabdari/farmare)**|\n", "2309.02965": "|**2023-09-06**|**Dynamic Hyperbolic Attention Network for Fine Hand-object Reconstruction**|Zhiying Leng et.al.|[2309.02965v1](http://arxiv.org/abs/2309.02965v1)|null|\n", "2309.02875": "|**2023-09-06**|**MAD: Modality Agnostic Distance Measure for Image Registration**|Vasiliki Sideri-Lampretsa et.al.|[2309.02875v1](http://arxiv.org/abs/2309.02875v1)|null|\n", "2309.02702": "|**2023-09-06**|**Gene-induced Multimodal Pre-training for Image-omic Classification**|Ting Jin et.al.|[2309.02702v1](http://arxiv.org/abs/2309.02702v1)|null|\n", "2309.02616": "|**2023-09-05**|**Generative AI-aided Joint Training-free Secure Semantic Communications via Multi-modal Prompts**|Hongyang Du et.al.|[2309.02616v1](http://arxiv.org/abs/2309.02616v1)|null|\n", "2309.02591": "|**2023-09-05**|**Scaling Autoregressive Multi-Modal Models: Pretraining and Instruction Tuning**|Lili Yu et.al.|[2309.02591v1](http://arxiv.org/abs/2309.02591v1)|null|\n", "2309.03905": "|**2023-09-07**|**ImageBind-LLM: Multi-modality Instruction Tuning**|Jiaming Han et.al.|[2309.03905v1](http://arxiv.org/abs/2309.03905v1)|**[link](https://github.com/opengvlab/llama-adapter)**|\n", "2309.03869": "|**2023-09-07**|**Text-to-feature diffusion for audio-visual few-shot learning**|Otniel-Bogdan Mercea et.al.|[2309.03869v1](http://arxiv.org/abs/2309.03869v1)|**[link](https://github.com/explainableml/avdiff-gfsl)**|\n", "2309.03734": "|**2023-09-07**|**ClusterFusion: Leveraging Radar Spatial Features for Radar-Camera 3D Object Detection in Autonomous Vehicles**|Irfan Tito Kurniawan et.al.|[2309.03734v1](http://arxiv.org/abs/2309.03734v1)|null|\n", "2309.03661": "|**2023-09-07**|**Prompt-based Context- and Domain-aware Pretraining for Vision and Language Navigation**|Ting Liu et.al.|[2309.03661v1](http://arxiv.org/abs/2309.03661v1)|null|\n", "2309.03473": "|**2023-09-07**|**Temporal Collection and Distribution for Referring Video Object Segmentation**|Jiajin Tang et.al.|[2309.03473v1](http://arxiv.org/abs/2309.03473v1)|null|\n", "2309.03452": "|**2023-09-07**|**Multi-Modality Guidance Network For Missing Modality Inference**|Zhuokai Zhao et.al.|[2309.03452v1](http://arxiv.org/abs/2309.03452v1)|null|\n", "2309.04453": "|**2023-09-08**|**WiSARD: A Labeled Visual and Thermal Image Dataset for Wilderness Search and Rescue**|Daniel Broyles et.al.|[2309.04453v1](http://arxiv.org/abs/2309.04453v1)|null|\n", "2309.04399": "|**2023-09-08**|**MaskDiffusion: Boosting Text-to-Image Consistency with Conditional Mask**|Yupeng Zhou et.al.|[2309.04399v1](http://arxiv.org/abs/2309.04399v1)|null|\n", "2309.04302": "|**2023-09-08**|**Have We Ever Encountered This Before? Retrieving Out-of-Distribution Road Obstacles from Driving Scenes**|Youssef Shoeb et.al.|[2309.04302v1](http://arxiv.org/abs/2309.04302v1)|null|\n", "2309.04287": "|**2023-09-08**|**Sequential Semantic Generative Communication for Progressive Text-to-Image Generation**|Hyelin Nam et.al.|[2309.04287v1](http://arxiv.org/abs/2309.04287v1)|null|\n", "2309.04109": "|**2023-09-08**|**From Text to Mask: Localizing Entities Using the Attention of Text-to-Image Diffusion Models**|Changming Xiao et.al.|[2309.04109v1](http://arxiv.org/abs/2309.04109v1)|null|\n", "2309.04062": "|**2023-09-08**|**3D Denoisers are Good 2D Teachers: Molecular Pretraining via Denoising and Cross-Modal Distillation**|Sungjun Cho et.al.|[2309.04062v1](http://arxiv.org/abs/2309.04062v1)|null|\n", "2309.04001": "|**2023-09-07**|**Multimodal Transformer for Material Segmentation**|Md Kaykobad Reza et.al.|[2309.04001v1](http://arxiv.org/abs/2309.04001v1)|**[link](https://github.com/csiplab/mmsformer)**|\n", "2309.05644": "|**2023-09-11**|**Grid-based Hybrid 3DMA GNSS and Terrestrial Positioning**|Paul Schwarzbach et.al.|[2309.05644v1](http://arxiv.org/abs/2309.05644v1)|null|\n", "2309.05608": "|**2023-09-11**|**Incorporating Pre-trained Model Prompting in Multimodal Stock Volume Movement Prediction**|Ruibo Chen et.al.|[2309.05608v1](http://arxiv.org/abs/2309.05608v1)|**[link](https://github.com/rayruibochen/promuse)**|\n", "2309.05573": "|**2023-09-11**|**UniSeg: A Unified Multi-Modal LiDAR Segmentation Network and the OpenPCSeg Codebase**|Youquan Liu et.al.|[2309.05573v1](http://arxiv.org/abs/2309.05573v1)|**[link](https://github.com/pjlab-adg/pcseg)**|\n", "2309.05519": "|**2023-09-13**|**NExT-GPT: Any-to-Any Multimodal LLM**|Shengqiong Wu et.al.|[2309.05519v2](http://arxiv.org/abs/2309.05519v2)|**[link](https://github.com/NExT-GPT/NExT-GPT)**|\n", "2309.05503": "|**2023-09-11**|**Long-Range Transformer Architectures for Document Understanding**|Thibault Douzon et.al.|[2309.05503v1](http://arxiv.org/abs/2309.05503v1)|**[link](https://github.com/thibaultdouzon/long-range-document-transformer)**|\n", "2309.05451": "|**2023-09-11**|**Dual-view Curricular Optimal Transport for Cross-lingual Cross-modal Retrieval**|Yabing Wang et.al.|[2309.05451v1](http://arxiv.org/abs/2309.05451v1)|null|\n", "2309.05423": "|**2023-09-11**|**Multi-Modal Automatic Prosody Annotation with Contrastive Pretraining of SSWP**|Jinzuomu Zhong et.al.|[2309.05423v1](http://arxiv.org/abs/2309.05423v1)|null|\n", "2309.05396": "|**2023-09-12**|**SlideSpeech: A Large-Scale Slide-Enriched Audio-Visual Corpus**|Haoxu Wang et.al.|[2309.05396v2](http://arxiv.org/abs/2309.05396v2)|null|\n", "2309.05298": "|**2023-09-11**|**Real-Time Parallel Trajectory Optimization with Spatiotemporal Safety Constraints for Autonomous Driving in Congested Traffic**|Lei Zheng et.al.|[2309.05298v1](http://arxiv.org/abs/2309.05298v1)|null|\n", "2309.05281": "|**2023-09-11**|**Class-Incremental Grouping Network for Continual Audio-Visual Learning**|Shentong Mo et.al.|[2309.05281v1](http://arxiv.org/abs/2309.05281v1)|**[link](https://github.com/stonemo/cign)**|\n", "2309.05257": "|**2023-09-11**|**FusionFormer: A Multi-sensory Fusion in Bird's-Eye-View and Temporal Consistent Transformer for 3D Objection**|Chunyong Hu et.al.|[2309.05257v1](http://arxiv.org/abs/2309.05257v1)|null|\n", "2309.05251": "|**2023-09-11**|**Multi3DRefer: Grounding Text Description to Multiple 3D Objects**|Yiming Zhang et.al.|[2309.05251v1](http://arxiv.org/abs/2309.05251v1)|null|\n", "2309.05248": "|**2023-09-11**|**Enhancing Speaker Diarization with Large Language Models: A Contextual Beam Search Approach**|Tae Jin Park et.al.|[2309.05248v1](http://arxiv.org/abs/2309.05248v1)|null|\n", "2309.05203": "|**2023-09-11**|**From Artificially Real to Real: Leveraging Pseudo Data from Large Language Models for Low-Resource Molecule Discovery**|Yuhan Chen et.al.|[2309.05203v1](http://arxiv.org/abs/2309.05203v1)|null|\n", "2309.05090": "|**2023-09-10**|**Sculpting Efficiency: Pruning Medical Imaging Models for On-Device Inference**|Sudarshan Sreeram et.al.|[2309.05090v1](http://arxiv.org/abs/2309.05090v1)|null|\n", "2309.06262": "|**2023-09-12**|**Modality Unifying Network for Visible-Infrared Person Re-Identification**|Hao Yu et.al.|[2309.06262v1](http://arxiv.org/abs/2309.06262v1)|null|\n", "2309.06255": "|**2023-09-12**|**Enhancing Multi-modal Cooperation via Fine-grained Modality Valuation**|Yake Wei et.al.|[2309.06255v1](http://arxiv.org/abs/2309.06255v1)|null|\n", "2309.06176": "|**2023-09-12**|**Dual-Path Temporal Map Optimization for Make-up Temporal Video Grounding**|Jiaxiu Li et.al.|[2309.06176v1](http://arxiv.org/abs/2309.06176v1)|null|\n", "2309.06102": "|**2023-09-12**|**Can we predict the Most Replayed data of video streaming platforms?**|Alessandro Duico et.al.|[2309.06102v1](http://arxiv.org/abs/2309.06102v1)|**[link](https://github.com/ombretta/most-replayed-data)**|\n", "2309.06081": "|**2023-09-12**|**Information Flow in Graph Neural Networks: A Clinical Triage Use Case**|V\u00edctor Valls et.al.|[2309.06081v1](http://arxiv.org/abs/2309.06081v1)|null|\n", "2309.05904": "|**2023-09-12**|**Enhancing Representation in Radiography-Reports Foundation Model: A Granular Alignment Algorithm Using Masked Contrastive Learning**|Weijian Huang et.al.|[2309.05904v1](http://arxiv.org/abs/2309.05904v1)|null|\n", "2309.05818": "|**2023-09-11**|**Rice Plant Disease Detection and Diagnosis using Deep Convolutional Neural Networks and Multispectral Imaging**|Yara Ali Alnaggar et.al.|[2309.05818v1](http://arxiv.org/abs/2309.05818v1)|null|\n", "2309.05803": "|**2023-09-11**|**Revisiting Energy Based Models as Policies: Ranking Noise Contrastive Estimation and Interpolating Energy Models**|Sumeet Singh et.al.|[2309.05803v1](http://arxiv.org/abs/2309.05803v1)|null|\n", "2309.05756": "|**2023-09-11**|**TransferDoc: A Self-Supervised Transferable Document Representation Learning Model Unifying Vision and Language**|Souhail Bakkali et.al.|[2309.05756v1](http://arxiv.org/abs/2309.05756v1)|null|\n", "2309.07120": "|**2023-09-13**|**Sight Beyond Text: Multi-Modal Training Enhances LLMs in Truthfulness and Ethics**|Haoqin Tu et.al.|[2309.07120v1](http://arxiv.org/abs/2309.07120v1)|**[link](https://github.com/ucsc-vlaa/sight-beyond-text)**|\n", "2309.07066": "|**2023-09-13**|**CLiFF-LHMP: Using Spatial Dynamics Patterns for Long-Term Human Motion Prediction**|Yufei Zhu et.al.|[2309.07066v1](http://arxiv.org/abs/2309.07066v1)|null|\n", "2309.06799": "|**2023-09-13**|**When Geoscience Meets Foundation Models: Towards General Geoscience Artificial Intelligence System**|Hao Zhang et.al.|[2309.06799v1](http://arxiv.org/abs/2309.06799v1)|null|\n", "2309.06735": "|**2023-09-13**|**GelFlow: Self-supervised Learning of Optical Flow for Vision-Based Tactile Sensor Displacement Measurement**|Zhiyuan Zhang et.al.|[2309.06735v1](http://arxiv.org/abs/2309.06735v1)|null|\n", "2309.06728": "|**2023-09-13**|**Leveraging Foundation models for Unsupervised Audio-Visual Segmentation**|Swapnil Bhosale et.al.|[2309.06728v1](http://arxiv.org/abs/2309.06728v1)|null|\n", "2309.06599": "|**2023-09-12**|**Reasoning with Latent Diffusion in Offline Reinforcement Learning**|Siddarth Venkatraman et.al.|[2309.06599v1](http://arxiv.org/abs/2309.06599v1)|**[link](https://github.com/ldcq/ldcq)**|\n", "2309.06597": "|**2023-09-12**|**Rank2Tell: A Multimodal Driving Dataset for Joint Importance Ranking and Reasoning**|Enna Sachdeva et.al.|[2309.06597v1](http://arxiv.org/abs/2309.06597v1)|null|\n", "2309.06547": "|**2023-09-12**|**AmodalSynthDrive: A Synthetic Amodal Perception Dataset for Autonomous Driving**|Ahmed Rida Sekkat et.al.|[2309.06547v1](http://arxiv.org/abs/2309.06547v1)|null|\n", "2309.06517": "|**2023-09-12**|**Overview of Memotion 3: Sentiment and Emotion Analysis of Codemixed Hinglish Memes**|Shreyash Mishra et.al.|[2309.06517v1](http://arxiv.org/abs/2309.06517v1)|null|\n", "2309.06511": "|**2023-09-12**|**DF-TransFusion: Multimodal Deepfake Detection via Lip-Audio Cross-Attention and Facial Self-Attention**|Aaditya Kharel et.al.|[2309.06511v1](http://arxiv.org/abs/2309.06511v1)|null|\n", "2309.07915": "|**2023-09-14**|**MMICL: Empowering Vision-language Model with Multi-Modal In-Context Learning**|Haozhe Zhao et.al.|[2309.07915v1](http://arxiv.org/abs/2309.07915v1)|**[link](https://github.com/haozhezhao/mic)**|\n", "2309.07794": "|**2023-09-14**|**Improving Multimodal Classification of Social Media Posts by Leveraging Image-Text Auxiliary tasks**|Danae S\u00e1nchez Villegas et.al.|[2309.07794v1](http://arxiv.org/abs/2309.07794v1)|null|\n", "2309.07759": "|**2023-09-14**|**PROGrasp: Pragmatic Human-Robot Communication for Object Grasping**|Gi-Cheon Kang et.al.|[2309.07759v1](http://arxiv.org/abs/2309.07759v1)|null|\n", "2309.07623": "|**2023-09-14**|**SwitchGPT: Adapting Large Language Models for Non-Text Outputs**|Xinyu Wang et.al.|[2309.07623v1](http://arxiv.org/abs/2309.07623v1)|null|\n", "2309.07495": "|**2023-09-14**|**HDTR-Net: A Real-Time High-Definition Teeth Restoration Network for Arbitrary Talking Face Generation Methods**|Yongyuan Li et.al.|[2309.07495v1](http://arxiv.org/abs/2309.07495v1)|**[link](https://github.com/yylgoodlucky/hdtr)**|\n", "2309.07387": "|**2023-09-14**|**VDialogUE: A Unified Evaluation Benchmark for Visually-grounded Dialogue**|Yunshui Li et.al.|[2309.07387v1](http://arxiv.org/abs/2309.07387v1)|null|\n", "2309.07332": "|**2023-09-13**|**Reliability-based cleaning of noisy training labels with inductive conformal prediction in multi-modal biomedical data mining**|Xianghao Zhan et.al.|[2309.07332v1](http://arxiv.org/abs/2309.07332v1)|**[link](https://github.com/xzhan96-stf/icp_train_clean)**|\n", "2309.07297": "|**2023-09-13**|**Multi-Modal Hybrid Learning and Sequential Training for RGB-T Saliency Detection**|Guangyu Ren et.al.|[2309.07297v1](http://arxiv.org/abs/2309.07297v1)|null|\n", "2309.08531": "|**2023-09-15**|**Towards Practical and Efficient Image-to-Speech Captioning with Vision-Language Pre-training and Multi-modal Tokens**|Minsu Kim et.al.|[2309.08531v1](http://arxiv.org/abs/2309.08531v1)|null|\n", "2309.08508": "|**2023-09-15**|**MOSAIC: Learning Unified Multi-Sensory Object Property Representations for Robot Perception**|Gyan Tatiya et.al.|[2309.08508v1](http://arxiv.org/abs/2309.08508v1)|**[link](https://github.com/gtatiya/MOSAIC)**|\n", "2309.08229": "|**2023-09-15**|**Automated Multi-Drugs Administration During Total Intravenous Anesthesia Using Multi-Model Predictive Control**|Bob Aubouin-Pairault et.al.|[2309.08229v1](http://arxiv.org/abs/2309.08229v1)|**[link](https://github.com/bobaubouin/tiva_drug_control)**|\n", "2309.08204": "|**2023-09-15**|**One-stage Modality Distillation for Incomplete Multimodal Learning**|Shicai Wei et.al.|[2309.08204v1](http://arxiv.org/abs/2309.08204v1)|null|\n", "2309.08160": "|**2023-09-15**|**Cross-Modal Synthesis of Structural MRI and Functional Connectivity Networks via Conditional ViT-GANs**|Yuda Bi et.al.|[2309.08160v1](http://arxiv.org/abs/2309.08160v1)|null|\n", "2309.08154": "|**2023-09-15**|**Uncertainty-Aware Multi-View Visual Semantic Embedding**|Wenzhang Wei et.al.|[2309.08154v1](http://arxiv.org/abs/2309.08154v1)|null|\n", "2309.08096": "|**2023-09-15**|**GelSplitter: Tactile Reconstruction from Near Infrared and Visible Images**|Yuankai Lin et.al.|[2309.08096v1](http://arxiv.org/abs/2309.08096v1)|null|\n", "2309.08088": "|**2023-09-15**|**Interactive Model Fusion-Based GM-PHD Filter**|Jiacheng He et.al.|[2309.08088v1](http://arxiv.org/abs/2309.08088v1)|null|\n", "2309.08021": "|**2023-09-14**|**Vision-based Analysis of Driver Activity and Driving Performance Under the Influence of Alcohol**|Ross Greer et.al.|[2309.08021v1](http://arxiv.org/abs/2309.08021v1)|null|\n", "2309.09958": "|**2023-09-18**|**An Empirical Study of Scaling Instruct-Tuned Large Multimodal Models**|Yadong Lu et.al.|[2309.09958v1](http://arxiv.org/abs/2309.09958v1)|**[link](https://github.com/haotian-liu/LLaVA)**|\n", "2309.09875": "|**2023-09-18**|**RaLF: Flow-based Global and Metric Radar Localization in LiDAR Maps**|Abhijeet Nayak et.al.|[2309.09875v1](http://arxiv.org/abs/2309.09875v1)|null|\n", "2309.09867": "|**2023-09-18**|**EGFE: End-to-end Grouping of Fragmented Elements in UI Designs with Multimodal Learning**|Liuqing Chen et.al.|[2309.09867v1](http://arxiv.org/abs/2309.09867v1)|**[link](https://github.com/test2975/egfe)**|\n", "2309.09832": "|**2023-09-18**|**Task Selection and Assignment for Multi-modal Multi-task Dialogue Act Classification with Non-stationary Multi-armed Bandits**|Xiangheng He et.al.|[2309.09832v1](http://arxiv.org/abs/2309.09832v1)|null|\n", "2309.09667": "|**2023-09-18**|**Unified Frequency-Assisted Transformer Framework for Detecting and Grounding Multi-Modal Manipulation**|Huan Liu et.al.|[2309.09667v1](http://arxiv.org/abs/2309.09667v1)|null|\n", "2309.09646": "|**2023-09-18**|**Concurrent Haptic, Audio, and Visual Data Set During Bare Finger Interaction with Textured Surfaces**|Alexis W. M. Devillard et.al.|[2309.09646v1](http://arxiv.org/abs/2309.09646v1)|null|\n", "2309.09592": "|**2023-09-18**|**Multi-Semantic Fusion Model for Generalized Zero-Shot Skeleton-Based Action Recognition**|Ming-Zhe Li et.al.|[2309.09592v1](http://arxiv.org/abs/2309.09592v1)|**[link](https://github.com/EHZ9NIWI7/MSF-GZSSAR)**|\n", "2309.09513": "|**2023-09-18**|**Learning Parallax for Stereo Event-based Motion Deblurring**|Mingyuan Lin et.al.|[2309.09513v1](http://arxiv.org/abs/2309.09513v1)|null|\n", "2309.09501": "|**2023-09-18**|**Discovering Sounding Objects by Audio Queries for Audio Visual Segmentation**|Shaofei Huang et.al.|[2309.09501v1](http://arxiv.org/abs/2309.09501v1)|null|\n", "2309.09473": "|**2023-09-18**|**Self-supervised Multi-view Clustering in Computer Vision: A Survey**|Jiatai Wang et.al.|[2309.09473v1](http://arxiv.org/abs/2309.09473v1)|null|\n", "2309.09421": "|**2023-09-18**|**Unified Pretraining Target Based Video-music Retrieval With Music Rhythm And Video Optical Flow Information**|Tianjun Mao et.al.|[2309.09421v1](http://arxiv.org/abs/2309.09421v1)|null|\n", "2309.09246": "|**2023-09-17**|**Image-level supervision and self-training for transformer-based cross-modality tumor segmentation**|Malo de Boisredon et.al.|[2309.09246v1](http://arxiv.org/abs/2309.09246v1)|null|\n", "2309.09088": "|**2023-09-16**|**Enhancing GAN-Based Vocoders with Contrastive Learning Under Data-limited Condition**|Haoming Guo et.al.|[2309.09088v1](http://arxiv.org/abs/2309.09088v1)|null|\n", "2309.09067": "|**2023-09-19**|**MMST-ViT: Climate Change-aware Crop Yield Prediction via Multi-Modal Spatial-Temporal Vision Transformer**|Fudong Lin et.al.|[2309.09067v2](http://arxiv.org/abs/2309.09067v2)|**[link](https://github.com/fudong03/mmst-vit)**|\n", "2309.08966": "|**2023-09-16**|**FF-LOGO: Cross-Modality Point Cloud Registration with Feature Filtering and Local to Global Optimization**|Nan Ma et.al.|[2309.08966v1](http://arxiv.org/abs/2309.08966v1)|null|\n", "2309.10724": "|**2023-09-19**|**Sound Source Localization is All about Cross-Modal Alignment**|Arda Senocak et.al.|[2309.10724v1](http://arxiv.org/abs/2309.10724v1)|null|\n", "2309.10649": "|**2023-09-19**|**Cross-modal and Cross-domain Knowledge Transfer for Label-free 3D Segmentation**|Jingyu Zhang et.al.|[2309.10649v1](http://arxiv.org/abs/2309.10649v1)|null|\n", "2309.10606": "|**2023-09-19**|**A Novel Hybrid Algorithm for Optimized Solutions in Ocean Renewable Energy Industry: Enhancing Power Take-Off Parameters and Site Selection Procedure of Wave Energy Converters**|Hossein Mehdipour et.al.|[2309.10606v1](http://arxiv.org/abs/2309.10606v1)|null|\n", "2309.10537": "|**2023-09-19**|**FoleyGen: Visually-Guided Audio Generation**|Xinhao Mei et.al.|[2309.10537v1](http://arxiv.org/abs/2309.10537v1)|null|\n", "2309.10365": "|**2023-09-19**|**Testable Likelihoods for Beyond-the-Standard Model Fits**|Anja Beck et.al.|[2309.10365v1](http://arxiv.org/abs/2309.10365v1)|null|\n", "2309.10361": "|**2023-09-19**|**Improving CLIP Robustness with Knowledge Distillation and Self-Training**|Clement Laroudie et.al.|[2309.10361v1](http://arxiv.org/abs/2309.10361v1)|null|\n", "2309.10283": "|**2023-09-19**|**FRAMU: Attention-based Machine Unlearning using Federated Reinforcement Learning**|Thanveer Shaik et.al.|[2309.10283v1](http://arxiv.org/abs/2309.10283v1)|null|\n", "2309.10244": "|**2023-09-19**|**UPL-SFDA: Uncertainty-aware Pseudo Label Guided Source-Free Domain Adaptation for Medical Image Segmentation**|Jianghao Wu et.al.|[2309.10244v1](http://arxiv.org/abs/2309.10244v1)|**[link](https://github.com/hilab-git/upl-sfda)**|\n", "2309.10195": "|**2023-09-20**|**Multi-modality Meets Re-learning: Mitigating Negative Transfer in Sequential Recommendation**|Bo Peng et.al.|[2309.10195v2](http://arxiv.org/abs/2309.10195v2)|null|\n", "2309.10091": "|**2023-09-18**|**Unified Coarse-to-Fine Alignment for Video-Text Retrieval**|Ziyang Wang et.al.|[2309.10091v1](http://arxiv.org/abs/2309.10091v1)|**[link](https://github.com/ziyang412/ucofia)**|\n", "2309.10077": "|**2023-09-18**|**GAME: Generalized deep learning model towards multimodal data integration for early screening of adolescent mental disorders**|Zhicheng Du et.al.|[2309.10077v1](http://arxiv.org/abs/2309.10077v1)|null|\n", "2309.11335": "|**2023-09-20**|**2D-3D Pose Tracking with Multi-View Constraints**|Huai Yu et.al.|[2309.11335v1](http://arxiv.org/abs/2309.11335v1)|null|\n", "2309.11119": "|**2023-09-21**|**BroadBEV: Collaborative LiDAR-camera Fusion for Broad-sighted Bird's Eye View Map Construction**|Minsu Kim et.al.|[2309.11119v2](http://arxiv.org/abs/2309.11119v2)|null|\n", "2309.11082": "|**2023-09-20**|**Dual-Modal Attention-Enhanced Text-Video Retrieval with Triplet Partial Margin Contrastive Learning**|Chen Jiang et.al.|[2309.11082v1](http://arxiv.org/abs/2309.11082v1)|null|\n", "2309.11081": "|**2023-09-20**|**Dense 2D-3D Indoor Prediction with Sound via Aligned Cross-Modal Distillation**|Heeseung Yun et.al.|[2309.11081v1](http://arxiv.org/abs/2309.11081v1)|**[link](https://github.com/hs-yn/daps)**|\n", "2309.12314": "|**2023-09-21**|**TinyCLIP: CLIP Distillation via Affinity Mimicking and Weight Inheritance**|Kan Wu et.al.|[2309.12314v1](http://arxiv.org/abs/2309.12314v1)|**[link](https://github.com/microsoft/Cream/tree/main/TinyCLIP)**|\n", "2309.12224": "|**2023-09-21**|**Towards Answering Health-related Questions from Medical Videos: Datasets and Approaches**|Deepak Gupta et.al.|[2309.12224v1](http://arxiv.org/abs/2309.12224v1)|null|\n", "2309.12158": "|**2023-09-21**|**Towards Robust and Truly Large-Scale Audio-Sheet Music Retrieval**|Luis Carvalho et.al.|[2309.12158v1](http://arxiv.org/abs/2309.12158v1)|null|\n", "2309.12134": "|**2023-09-21**|**Self-Supervised Contrastive Learning for Robust Audio-Sheet Music Retrieval Systems**|Luis Carvalho et.al.|[2309.12134v1](http://arxiv.org/abs/2309.12134v1)|null|\n", "2309.12111": "|**2023-09-21**|**Passage Summarization with Recurrent Models for Audio-Sheet Music Retrieval**|Luis Carvalho et.al.|[2309.12111v1](http://arxiv.org/abs/2309.12111v1)|null|\n", "2309.12110": "|**2023-09-21**|**Exploiting CLIP-based Multi-modal Approach for Artwork Classification and Retrieval**|Alberto Baldrati et.al.|[2309.12110v1](http://arxiv.org/abs/2309.12110v1)|null|\n", "2309.12030": "|**2023-09-21**|**CAMERA: A Multimodal Dataset and Benchmark for Ad Text Generation**|Masato Mita et.al.|[2309.12030v1](http://arxiv.org/abs/2309.12030v1)|**[link](https://github.com/cyberagentailab/camera)**|\n", "2309.12009": "|**2023-09-21**|**Elevating Skeleton-Based Action Recognition with Efficient Multi-Modality Self-Supervision**|Yiping Wei et.al.|[2309.12009v1](http://arxiv.org/abs/2309.12009v1)|**[link](https://github.com/desehuileng0o0/ikem)**|\n", "2309.11933": "|**2023-09-21**|**Fully Transformer-Equipped Architecture for End-to-End Referring Video Object Segmentation**|Ping Li et.al.|[2309.11933v1](http://arxiv.org/abs/2309.11933v1)|null|\n", "2309.11923": "|**2023-09-21**|**TextCLIP: Text-Guided Face Image Generation And Manipulation Without Adversarial Training**|Xiaozhou You et.al.|[2309.11923v1](http://arxiv.org/abs/2309.11923v1)|null|\n", "2309.11860": "|**2023-09-21**|**QUEST: An Efficient Query Evaluation Scheme Towards Scan-Intensive Cross-Model Analysis**|Jianfeng Huang et.al.|[2309.11860v1](http://arxiv.org/abs/2309.11860v1)|null|\n", "2309.11845": "|**2023-09-21**|**TMac: Temporal Multi-Modal Graph Learning for Acoustic Event Classification**|Meng Liu et.al.|[2309.11845v1](http://arxiv.org/abs/2309.11845v1)|**[link](https://github.com/mgithubl/tmac)**|\n", "2309.11839": "|**2023-09-21**|**MoPA: Multi-Modal Prior Aided Domain Adaptation for 3D Semantic Segmentation**|Haozhi Cao et.al.|[2309.11839v1](http://arxiv.org/abs/2309.11839v1)|null|\n", "2309.11837": "|**2023-09-21**|**Stellar model calibrations with the Ai Phe binary system. Open questions about the robustness of the fit**|G. Valle et.al.|[2309.11837v1](http://arxiv.org/abs/2309.11837v1)|null|\n", "2309.11755": "|**2023-09-21**|**2DDATA: 2D Detection Annotations Transmittable Aggregation for Semantic Segmentation on Point Cloud**|Guan-Cheng Lee et.al.|[2309.11755v1](http://arxiv.org/abs/2309.11755v1)|null|\n", "2309.13007": "|**2023-09-22**|**ReConcile: Round-Table Conference Improves Reasoning via Consensus among Diverse LLMs**|Justin Chih-Yao Chen et.al.|[2309.13007v1](http://arxiv.org/abs/2309.13007v1)|**[link](https://github.com/dinobby/reconcile)**|\n", "2309.12865": "|**2023-09-22**|**Bridging Sensor Gaps via Single-Direction Tuning for Hyperspectral Image Classification**|Xizhe Xue et.al.|[2309.12865v1](http://arxiv.org/abs/2309.12865v1)|**[link](https://github.com/cecilia-xue/hyt-nas)**|\n", "2309.12855": "|**2023-09-22**|**Cross-Modal Translation and Alignment for Survival Analysis**|Fengtao Zhou et.al.|[2309.12855v1](http://arxiv.org/abs/2309.12855v1)|**[link](https://github.com/ft-zhou-zzz/cmta)**|\n", "2309.12764": "|**2023-09-22**|**Multi-Modal Embeddings for Isolating Cross-Platform Coordinated Information Campaigns on Social Media**|Fabio Barbero et.al.|[2309.12764v1](http://arxiv.org/abs/2309.12764v1)|null|\n", "2309.12657": "|**2023-09-22**|**Exploiting Modality-Specific Features For Multi-Modal Manipulation Detection And Grounding**|Jiazhen Wang et.al.|[2309.12657v1](http://arxiv.org/abs/2309.12657v1)|null|\n", "2309.12572": "|**2023-09-22**|**Interpretable 3D Multi-Modal Residual Convolutional Neural Network for Mild Traumatic Brain Injury Diagnosis**|Hanem Ellethy et.al.|[2309.12572v1](http://arxiv.org/abs/2309.12572v1)|null|\n", "2309.14327": "|**2023-09-25**|**DeepSpeed-VisualChat: Multi-Round Multi-Image Interleave Chat via Multi-Modal Causal Attention**|Zhewei Yao et.al.|[2309.14327v1](http://arxiv.org/abs/2309.14327v1)|**[link](https://github.com/microsoft/deepspeedexamples)**|\n", "2309.14320": "|**2023-09-25**|**MUTEX: Learning Unified Policies from Multimodal Task Specifications**|Rutav Shah et.al.|[2309.14320v1](http://arxiv.org/abs/2309.14320v1)|null|\n", "2309.14203": "|**2023-09-25**|**Detecting and Grounding Multi-Modal Media Manipulation and Beyond**|Rui Shao et.al.|[2309.14203v1](http://arxiv.org/abs/2309.14203v1)|**[link](https://github.com/rshaojimmy/multimodal-deepfake)**|\n", "2309.14183": "|**2023-09-26**|**Species196: A One-Million Semi-supervised Dataset for Fine-grained Species Recognition**|Wei He et.al.|[2309.14183v2](http://arxiv.org/abs/2309.14183v2)|**[link](https://github.com/Species-Dataset/species-dataset.github.io)**|\n", "2309.14181": "|**2023-09-25**|**Q-Bench: A Benchmark for General-Purpose Foundation Models on Low-level Vision**|Haoning Wu et.al.|[2309.14181v1](http://arxiv.org/abs/2309.14181v1)|**[link](https://github.com/VQAssessment/Q-Bench)**|\n", "2309.14065": "|**2023-09-26**|**AsymFormer: Asymmetrical Cross-Modal Representation Learning for Mobile Platform Real-Time RGB-D Semantic Segmentation**|Siqi Du et.al.|[2309.14065v2](http://arxiv.org/abs/2309.14065v2)|**[link](https://github.com/Fourier7754/AsymFormer)**|\n", "2309.14050": "|**2023-09-26**|**NNgTL: Neural Network Guided Optimal Temporal Logic Task Planning for Mobile Robots**|Ruijia Liu et.al.|[2309.14050v2](http://arxiv.org/abs/2309.14050v2)|null|\n", "2309.14003": "|**2023-09-25**|**Hierarchical Imitation Learning for Stochastic Environments**|Maximilian Igl et.al.|[2309.14003v1](http://arxiv.org/abs/2309.14003v1)|null|\n", "2309.13770": "|**2023-09-24**|**Devil in the Number: Towards Robust Multi-modality Data Filter**|Yichen Xu et.al.|[2309.13770v1](http://arxiv.org/abs/2309.13770v1)|null|\n", "2309.13650": "|**2023-09-24**|**Cross-modal Alignment with Optimal Transport for CTC-based ASR**|Xugang Lu et.al.|[2309.13650v1](http://arxiv.org/abs/2309.13650v1)|null|\n", "2309.13554": "|**2023-09-24**|**A Novel Stochastic Interacting Particle-Field Algorithm for 3D Parabolic-Parabolic Keller-Segel Chemotaxis System**|Zhongjian Wang et.al.|[2309.13554v1](http://arxiv.org/abs/2309.13554v1)|null|\n", "2309.13504": "|**2023-09-23**|**Attention Is All You Need For Blind Room Volume Estimation**|Chunxi Wang et.al.|[2309.13504v1](http://arxiv.org/abs/2309.13504v1)|null|\n", "2309.13470": "|**2023-09-23**|**HAVE-Net: Hallucinated Audio-Visual Embeddings for Few-Shot Classification with Unimodal Cues**|Ankit Jha et.al.|[2309.13470v1](http://arxiv.org/abs/2309.13470v1)|null|\n", "2309.13322": "|**2023-09-23**|**From Text to Source: Results in Detecting Large Language Model-Generated Content**|Wissam Antoun et.al.|[2309.13322v1](http://arxiv.org/abs/2309.13322v1)|null|\n", "2309.13266": "|**2023-09-23**|**Robust Navigation with Cross-Modal Fusion and Knowledge Transfer**|Wenzhe Cai et.al.|[2309.13266v1](http://arxiv.org/abs/2309.13266v1)|**[link](https://github.com/wzcai99/Distill-Navigator)**|\n", "2309.15117": "|**2023-09-26**|**Generating Visual Scenes from Touch**|Fengyu Yang et.al.|[2309.15117v1](http://arxiv.org/abs/2309.15117v1)|null|\n", "2309.15112": "|**2023-09-27**|**InternLM-XComposer: A Vision-Language Large Model for Advanced Text-image Comprehension and Composition**|Pan Zhang et.al.|[2309.15112v2](http://arxiv.org/abs/2309.15112v2)|**[link](https://github.com/internlm/internlm-xcomposer)**|\n", "2309.15109": "|**2023-09-26**|**DistillBEV: Boosting Multi-Camera 3D Object Detection with Cross-Modal Knowledge Distillation**|Zeyu Wang et.al.|[2309.15109v1](http://arxiv.org/abs/2309.15109v1)|**[link](https://github.com/qcraftai/distill-bev)**|\n", "2309.15082": "|**2023-09-26**|**RPEFlow: Multimodal Fusion of RGB-PointCloud-Event for Joint Optical Flow and Scene Flow Estimation**|Zhexiong Wan et.al.|[2309.15082v1](http://arxiv.org/abs/2309.15082v1)|**[link](https://github.com/danqu130/RPEFlow)**|\n", "2309.14704": "|**2023-09-26**|**Tile Classification Based Viewport Prediction with Multi-modal Fusion Transformer**|Zhihao Zhang et.al.|[2309.14704v1](http://arxiv.org/abs/2309.14704v1)|null|\n", "2309.14673": "|**2023-09-26**|**ALEX: Towards Effective Graph Transfer Learning with Noisy Labels**|Jingyang Yuan et.al.|[2309.14673v1](http://arxiv.org/abs/2309.14673v1)|null|\n", "2309.14611": "|**2023-09-26**|**Event Stream-based Visual Object Tracking: A High-Resolution Benchmark Dataset and A Novel Baseline**|Xiao Wang et.al.|[2309.14611v1](http://arxiv.org/abs/2309.14611v1)|**[link](https://github.com/event-ahu/eventvot_benchmark)**|\n", "2309.14580": "|**2023-09-26**|**CWCL: Cross-Modal Transfer with Continuously Weighted Contrastive Loss**|Rakshith Sharma Srinivasa et.al.|[2309.14580v1](http://arxiv.org/abs/2309.14580v1)|null|\n", "2309.14516": "|**2023-09-25**|**UniBEV: Multi-modal 3D Object Detection with Uniform BEV Encoders for Robustness against Missing Sensor Modalities**|Shiming Wang et.al.|[2309.14516v1](http://arxiv.org/abs/2309.14516v1)|null|\n", "2309.14491": "|**2023-09-25**|**Unsupervised 3D Perception with 2D Vision-Language Distillation for Autonomous Driving**|Mahyar Najibi et.al.|[2309.14491v1](http://arxiv.org/abs/2309.14491v1)|null|\n", "2309.15826": "|**2023-09-27**|**Cross-Modal Multi-Tasking for Speech-to-Text Translation via Hard Parameter Sharing**|Brian Yan et.al.|[2309.15826v1](http://arxiv.org/abs/2309.15826v1)|null|\n", "2309.15751": "|**2023-09-27**|**InfraParis: A multi-modal and multi-task autonomous driving dataset**|Gianni Franchi et.al.|[2309.15751v1](http://arxiv.org/abs/2309.15751v1)|null|\n", "2309.15739": "|**2023-09-27**|**Experience and Evidence are the eyes of an excellent summarizer! Towards Knowledge Infused Multi-modal Clinical Conversation Summarization**|Abhisek Tiwari et.al.|[2309.15739v1](http://arxiv.org/abs/2309.15739v1)|**[link](https://github.com/nlp-rl/mm-cliconsummation)**|\n", "2309.15683": "|**2023-09-27**|**End-to-End Streaming Video Temporal Action Segmentation with Reinforce Learning**|Wujun Wen et.al.|[2309.15683v1](http://arxiv.org/abs/2309.15683v1)|**[link](https://github.com/Thinksky5124/SVTAS)**|\n", "2309.15599": "|**2023-09-27**|**OceanBench: The Sea Surface Height Edition**|J. Emmanuel Johnson et.al.|[2309.15599v1](http://arxiv.org/abs/2309.15599v1)|**[link](https://github.com/jejjohnson/oceanbench)**|\n", "2309.15529": "|**2023-09-27**|**Missing-modality Enabled Multi-modal Fusion Architecture for Medical Data**|Muyu Wang et.al.|[2309.15529v1](http://arxiv.org/abs/2309.15529v1)|null|\n", "2309.15427": "|**2023-09-27**|**Graph Neural Prompting with Large Language Models**|Yijun Tian et.al.|[2309.15427v1](http://arxiv.org/abs/2309.15427v1)|null|\n", "2309.15402": "|**2023-09-27**|**A Survey of Chain of Thought Reasoning: Advances, Frontiers and Future**|Zheng Chu et.al.|[2309.15402v1](http://arxiv.org/abs/2309.15402v1)|**[link](https://github.com/zchuz/cot-reasoning-survey)**|\n", "2309.15390": "|**2023-09-27**|**MINS: Efficient and Robust Multisensor-aided Inertial Navigation System**|Woosik Lee et.al.|[2309.15390v1](http://arxiv.org/abs/2309.15390v1)|**[link](https://github.com/rpng/mins)**|\n", "2309.15313": "|**2023-09-26**|**M$^{3}$3D: Learning 3D priors using Multi-Modal Masked Autoencoders for 2D image and video understanding**|Muhammad Abdullah Jamal et.al.|[2309.15313v1](http://arxiv.org/abs/2309.15313v1)|null|\n", "2309.15302": "|**2023-09-26**|**Self-Supervised Terrain Representation Learning from Unconstrained Robot Experience**|Haresh Karnan et.al.|[2309.15302v1](http://arxiv.org/abs/2309.15302v1)|null|\n", "2309.15283": "|**2023-09-26**|**Multi-Modal Planning on Regrasping for Stable Manipulation**|Jiaming Hu et.al.|[2309.15283v1](http://arxiv.org/abs/2309.15283v1)|null|\n", "2309.16592": "|**2023-09-28**|**Tensor Factorization for Leveraging Cross-Modal Knowledge in Data-Constrained Infrared Object Detection**|Manish Sharma et.al.|[2309.16592v1](http://arxiv.org/abs/2309.16592v1)|null|\n", "2309.16569": "|**2023-09-28**|**Audio-Visual Speaker Verification via Joint Cross-Attention**|R. Gnana Praveen et.al.|[2309.16569v1](http://arxiv.org/abs/2309.16569v1)|null|\n", "2309.16283": "|**2023-09-28**|**Self-supervised Cross-view Representation Reconstruction for Change Captioning**|Yunbin Tu et.al.|[2309.16283v1](http://arxiv.org/abs/2309.16283v1)|null|\n", "2309.16211": "|**2023-09-28**|**VDC: Versatile Data Cleanser for Detecting Dirty Samples via Visual-Linguistic Inconsistency**|Zihao Zhu et.al.|[2309.16211v1](http://arxiv.org/abs/2309.16211v1)|null|\n", "2309.16206": "|**2023-09-28**|**Cross-Modal Transformer GAN: Brain Structural-Functional Deep Fusing Network for Alzheimer's Disease Analysis**|Qiankun Zuo et.al.|[2309.16206v1](http://arxiv.org/abs/2309.16206v1)|null|\n", "2309.16203": "|**2023-09-28**|**The Cloud Strikes Back: Investigating the Decentralization of IPFS**|Leonhard Balduf et.al.|[2309.16203v1](http://arxiv.org/abs/2309.16203v1)|null|\n", "2309.16141": "|**2023-09-28**|**Align before Search: Aligning Ads Image to Text for Accurate Cross-Modal Sponsored Search**|Yuanmin Tang et.al.|[2309.16141v1](http://arxiv.org/abs/2309.16141v1)|**[link](https://github.com/pter61/aligncmss)**|\n", "2309.16093": "|**2023-09-28**|**Hierarchical Cross-Modality Knowledge Transfer with Sinkhorn Attention for CTC-based ASR**|Xugang Lu et.al.|[2309.16093v1](http://arxiv.org/abs/2309.16093v1)|null|\n", "2309.15954": "|**2023-09-27**|**The Devil is in the Details: A Deep Dive into the Rabbit Hole of Data Filtering**|Haichao Yu et.al.|[2309.15954v1](http://arxiv.org/abs/2309.15954v1)|null|\n", "2309.15915": "|**2023-09-27**|**Zero-Shot and Few-Shot Video Question Answering with Multi-Modal Prompts**|Deniz Engin et.al.|[2309.15915v1](http://arxiv.org/abs/2309.15915v1)|**[link](https://github.com/engindeniz/vitis)**|\n", "2309.17395": "|**2023-09-29**|**AV-CPL: Continuous Pseudo-Labeling for Audio-Visual Speech Recognition**|Andrew Rouditchenko et.al.|[2309.17395v1](http://arxiv.org/abs/2309.17395v1)|null|\n", "2309.17336": "|**2023-09-29**|**See Beyond Seeing: Robust 3D Object Detection from Point Clouds via Cross-Modal Hallucination**|Jianning Deng et.al.|[2309.17336v1](http://arxiv.org/abs/2309.17336v1)|null|\n", "2309.17264": "|**2023-09-29**|**A Foundation Model for General Moving Object Segmentation in Medical Images**|Zhongnuo Yan et.al.|[2309.17264v1](http://arxiv.org/abs/2309.17264v1)|null|\n", "2309.17239": "|**2023-09-29**|**EGVD: Event-Guided Video Deraining**|Yueyi Zhang et.al.|[2309.17239v1](http://arxiv.org/abs/2309.17239v1)|**[link](https://github.com/booker-max/egvd)**|\n", "2309.17175": "|**2023-09-29**|**TextField3D: Towards Enhancing Open-Vocabulary 3D Generation with Noisy Text Fields**|Tianyu Huang et.al.|[2309.17175v1](http://arxiv.org/abs/2309.17175v1)|null|\n", "2309.17133": "|**2023-09-29**|**Fine-grained Late-interaction Multi-modal Retrieval for Retrieval Augmented Visual Question Answering**|Weizhe Lin et.al.|[2309.17133v1](http://arxiv.org/abs/2309.17133v1)|**[link](https://github.com/linweizhedragon/retrieval-augmented-visual-question-answering)**|\n", "2309.17104": "|**2023-10-03**|**Prototype-guided Cross-modal Completion and Alignment for Incomplete Text-based Person Re-identification**|Tiantian Gong et.al.|[2309.17104v2](http://arxiv.org/abs/2309.17104v2)|null|\n", "2309.17102": "|**2023-09-29**|**Guiding Instruction-based Image Editing via Multimodal Large Language Models**|Tsu-Jui Fu et.al.|[2309.17102v1](http://arxiv.org/abs/2309.17102v1)|**[link](https://github.com/tsujuifu/pytorch_mgie)**|\n", "2309.17093": "|**2023-09-29**|**Prototype-based Aleatoric Uncertainty Quantification for Cross-modal Retrieval**|Hao Li et.al.|[2309.17093v1](http://arxiv.org/abs/2309.17093v1)|**[link](https://github.com/leolee99/pau)**|\n", "2309.17037": "|**2023-09-29**|**Beyond Co-occurrence: Multi-modal Session-based Recommendation**|Xiaokun Zhang et.al.|[2309.17037v1](http://arxiv.org/abs/2309.17037v1)|**[link](https://github.com/zhang-xiaokun/mmsbr)**|\n", "2309.16984": "|**2023-09-29**|**Consistency Models as a Rich and Efficient Policy Class for Reinforcement Learning**|Zihan Ding et.al.|[2309.16984v1](http://arxiv.org/abs/2309.16984v1)|null|\n", "2309.16949": "|**2023-09-29**|**CrossZoom: Simultaneously Motion Deblurring and Event Super-Resolving**|Chi Zhang et.al.|[2309.16949v1](http://arxiv.org/abs/2309.16949v1)|**[link](https://github.com/bestrivenzc/CZ-Net)**|\n", "2309.16830": "|**2023-09-28**|**Robust Safe Control with Multi-Modal Uncertainty**|Tianhao Wei et.al.|[2309.16830v1](http://arxiv.org/abs/2309.16830v1)|null|\n", "2309.16818": "|**2023-09-28**|**MEM: Multi-Modal Elevation Mapping for Robotics and Learning**|Gian Erni et.al.|[2309.16818v1](http://arxiv.org/abs/2309.16818v1)|**[link](https://github.com/leggedrobotics/elevation_mapping_cupy)**|\n", "2309.16772": "|**2023-10-02**|**XVO: Generalized Visual Odometry via Cross-Modal Self-Training**|Lei Lai et.al.|[2309.16772v2](http://arxiv.org/abs/2309.16772v2)|null|\n", "2310.02071": "|**2023-10-03**|**Towards End-to-End Embodied Decision Making via Multi-modal Large Language Model: Explorations with GPT4-Vision and Beyond**|Liang Chen et.al.|[2310.02071v1](http://arxiv.org/abs/2310.02071v1)|**[link](https://github.com/pkunlp-icler/pca-eval)**|\n", "2310.02050": "|**2023-10-03**|**Tuning Large language model for End-to-end Speech Translation**|Hao Zhang et.al.|[2310.02050v1](http://arxiv.org/abs/2310.02050v1)|null|\n", "2310.01852": "|**2023-10-04**|**LanguageBind: Extending Video-Language Pretraining to N-modality by Language-based Semantic Alignment**|Bin Zhu et.al.|[2310.01852v2](http://arxiv.org/abs/2310.01852v2)|**[link](https://github.com/pku-yuangroup/languagebind)**|\n", "2310.01733": "|**2023-10-03**|**Health Guardian: Using Multi-modal Data to Understand Individual Health**|Vince S. Siu et.al.|[2310.01733v1](http://arxiv.org/abs/2310.01733v1)|null|\n", "2310.01358": "|**2023-10-02**|**NEUCORE: Neural Concept Reasoning for Composed Image Retrieval**|Shu Zhao et.al.|[2310.01358v1](http://arxiv.org/abs/2310.01358v1)|null|\n", "2310.01351": "|**2023-10-02**|**Streaming Motion Forecasting for Autonomous Driving**|Ziqi Pang et.al.|[2310.01351v1](http://arxiv.org/abs/2310.01351v1)|**[link](https://github.com/ziqipang/streamingforecasting)**|\n", "2310.01330": "|**2023-10-02**|**Towards reporting bias in visual-language datasets: bimodal augmentation by decoupling object-attribute association**|Qiyu Wu et.al.|[2310.01330v1](http://arxiv.org/abs/2310.01330v1)|null|\n", "2310.01286": "|**2023-10-02**|**A Dynamic Macroscopic Framework for Pricing of Ride-hailing Services with an Optional Bus Lane Access for Pool Vehicles**|Lynn Fayed et.al.|[2310.01286v1](http://arxiv.org/abs/2310.01286v1)|null|\n", "2310.01232": "|**2023-10-02**|**Modality-aware Transformer for Time series Forecasting**|Hajar Emami et.al.|[2310.01232v1](http://arxiv.org/abs/2310.01232v1)|null|\n", "2310.01035": "|**2023-10-02**|**Learnable Cross-modal Knowledge Distillation for Multi-modal Learning with Missing Modality**|Hu Wang et.al.|[2310.01035v1](http://arxiv.org/abs/2310.01035v1)|null|\n", "2310.00927": "|**2023-10-02**|**Understanding Transferable Representation Learning and Zero-shot Transfer in CLIP**|Zixiang Chen et.al.|[2310.00927v1](http://arxiv.org/abs/2310.00927v1)|null|\n", "2310.00862": "|**2023-10-02**|**Shack-Hartmann wavefront sensing: A new approach to time-resolved measurement of stress intensity during dynamic fracture of small brittle specimens**|Liuchi Li et.al.|[2310.00862v1](http://arxiv.org/abs/2310.00862v1)|null|\n", "2310.00745": "|**2023-10-01**|**Deterministic Langevin Unconstrained Optimization with Normalizing Flows**|James M. Sullivan et.al.|[2310.00745v1](http://arxiv.org/abs/2310.00745v1)|null|\n", "2310.00740": "|**2023-10-01**|**Top-down Green-ups: Satellite Sensing and Deep Models to Predict Buffelgrass Phenology**|Lucas Rosenblatt et.al.|[2310.00740v1](http://arxiv.org/abs/2310.00740v1)|**[link](https://github.com/lurosenb/phenology_projects)**|\n", "2310.00672": "|**2023-10-01**|**GeRA: Label-Efficient Geometrically Regularized Alignment**|Dustin Klebe et.al.|[2310.00672v1](http://arxiv.org/abs/2310.00672v1)|null|\n", "2310.03024": "|**2023-10-04**|**AstroCLIP: Cross-Modal Pre-Training for Astronomical Foundation Models**|Francois Lanusse et.al.|[2310.03024v1](http://arxiv.org/abs/2310.03024v1)|**[link](https://github.com/PolymathicAI/AstroCLIP)**|\n", "2310.02960": "|**2023-10-04**|**CoDA: Collaborative Novel Box Discovery and Cross-modal Alignment for Open-vocabulary 3D Object Detection**|Yang Cao et.al.|[2310.02960v1](http://arxiv.org/abs/2310.02960v1)|**[link](https://github.com/yangcaoai/CoDA_NeurIPS2023)**|\n", "2310.02821": "|**2023-10-04**|**Improving Vision Anomaly Detection with the Guidance of Language Modality**|Dong Chen et.al.|[2310.02821v1](http://arxiv.org/abs/2310.02821v1)|**[link](https://github.com/Anfeather/CMG)**|\n", "2310.02777": "|**2023-10-04**|**The Role of Linguistic Priors in Measuring Compositional Generalization of Vision-Language Models**|Chenwei Wu et.al.|[2310.02777v1](http://arxiv.org/abs/2310.02777v1)|null|\n", "2310.02690": "|**2023-10-04**|**Multi-Dimension-Embedding-Aware Modality Fusion Transformer for Psychiatric Disorder Clasification**|Guoxin Wang et.al.|[2310.02690v1](http://arxiv.org/abs/2310.02690v1)|null|\n", "2310.02663": "|**2023-10-04**|**MedPrompt: Cross-Modal Prompting for Multi-Task Medical Image Translation**|Xuhang Chen et.al.|[2310.02663v1](http://arxiv.org/abs/2310.02663v1)|null|\n", "2310.02569": "|**2023-10-04**|**ReForm-Eval: Evaluating Large Vision Language Models via Unified Re-Formulation of Task-Oriented Benchmarks**|Zejun Li et.al.|[2310.02569v1](http://arxiv.org/abs/2310.02569v1)|**[link](https://github.com/fudandisc/reform-eval)**|\n", "2310.02561": "|**2023-10-04**|**Integrated Sensing and Communications towards Proactive Beamforming in mmWave V2I via Multi-Modal Feature Fusion (MMFF)**|Haotian Zhang et.al.|[2310.02561v1](http://arxiv.org/abs/2310.02561v1)|null|\n", "2310.02528": "|**2023-10-04**|**On the Cognition of Visual Question Answering Models and Human Intelligence: A Comparative Study**|Liben Chen et.al.|[2310.02528v1](http://arxiv.org/abs/2310.02528v1)|null|\n", "2310.02361": "|**2023-10-03**|**Event-Enhanced Multi-Modal Spiking Neural Network for Dynamic Obstacle Avoidance**|Yang Wang et.al.|[2310.02361v1](http://arxiv.org/abs/2310.02361v1)|null|\n", "2310.03744": "|**2023-10-05**|**Improved Baselines with Visual Instruction Tuning**|Haotian Liu et.al.|[2310.03744v1](http://arxiv.org/abs/2310.03744v1)|null|\n", "2310.03724": "|**2023-10-05**|**Modular Speech-to-Text Translation for Zero-Shot Cross-Modal Transfer**|Paul-Ambroise Duquenne et.al.|[2310.03724v1](http://arxiv.org/abs/2310.03724v1)|null|\n", "2310.03485": "|**2023-10-07**|**BTDNet: a Multi-Modal Approach for Brain Tumor Radiogenomic Classification**|Dimitrios Kollias et.al.|[2310.03485v2](http://arxiv.org/abs/2310.03485v2)|null|\n", "2310.03420": "|**2023-10-05**|**FreeReg: Image-to-Point Cloud Registration Leveraging Pretrained Diffusion Models and Monocular Depth Estimators**|Haiping Wang et.al.|[2310.03420v1](http://arxiv.org/abs/2310.03420v1)|**[link](https://github.com/WHU-USI3DV/FreeReg)**|\n", "2310.03333": "|**2023-10-05**|**Real-time Multi-modal Object Detection and Tracking on Edge for Regulatory Compliance Monitoring**|Jia Syuen Lim et.al.|[2310.03333v1](http://arxiv.org/abs/2310.03333v1)|null|\n", "2310.03320": "|**2023-10-05**|**BioBridge: Bridging Biomedical Foundation Models via Knowledge Graph**|Zifeng Wang et.al.|[2310.03320v1](http://arxiv.org/abs/2310.03320v1)|null|\n", "2310.03221": "|**2023-10-05**|**Know2BIO: A Comprehensive Dual-View Benchmark for Evolving Biomedical Knowledge Graphs**|Yijia Xiao et.al.|[2310.03221v1](http://arxiv.org/abs/2310.03221v1)|**[link](https://github.com/yijia-xiao/know2bio)**|\n", "2310.03218": "|**2023-10-05**|**Learning Energy-Based Prior Model with Diffusion-Amortized MCMC**|Peiyu Yu et.al.|[2310.03218v1](http://arxiv.org/abs/2310.03218v1)|**[link](https://github.com/yupeiyu98/diffusion-amortized-mcmc)**|\n", "2310.03140": "|**2023-10-04**|**ViFiT: Reconstructing Vision Trajectories from IMU and Wi-Fi Fine Time Measurements**|Bryan Bo Cao et.al.|[2310.03140v1](http://arxiv.org/abs/2310.03140v1)|**[link](https://github.com/bryanbocao/vifit)**|\n", "2310.03111": "|**2023-10-04**|**Multi-modal Gaussian Process Variational Autoencoders for Neural and Behavioral Data**|Rabia Gondur et.al.|[2310.03111v1](http://arxiv.org/abs/2310.03111v1)|null|\n", "2310.03059": "|**2023-10-04**|**Point-PEFT: Parameter-Efficient Fine-Tuning for 3D Pre-trained Models**|Ivan Tang et.al.|[2310.03059v1](http://arxiv.org/abs/2310.03059v1)|**[link](https://github.com/EvenJoker/Point-PEFT)**|\n", "2310.04122": "|**2023-10-06**|**VI-Diff: Unpaired Visible-Infrared Translation Diffusion Model for Single Modality Labeled Visible-Infrared Person Re-identification**|Han Huang et.al.|[2310.04122v1](http://arxiv.org/abs/2310.04122v1)|null|\n", "2310.03958": "|**2023-10-06**|**The \"Seen but Unnoticed\" Vocabulary of Natural Touch: Revolutionizing Direct Interaction with Our Devices and One Another (UIST 2021 Vision)**|Ken Hinckley et.al.|[2310.03958v1](http://arxiv.org/abs/2310.03958v1)|null|\n", "2310.05863": "|**2023-10-10**|**Fine-grained Audio-Visual Joint Representations for Multimodal Large Language Models**|Guangzhi Sun et.al.|[2310.05863v2](http://arxiv.org/abs/2310.05863v2)|**[link](https://github.com/briansidp/audiovisualllm)**|\n", "2310.05628": "|**2023-10-09**|**Glitter or Gold? Deriving Structured Insights from Sustainability Reports via Large Language Models**|Marco Bronzini et.al.|[2310.05628v1](http://arxiv.org/abs/2310.05628v1)|**[link](https://github.com/saturnmars/derivingstructuredinsightsfromsustainabilityreportsvialargelanguagemodels)**|\n", "2310.05608": "|**2023-10-09**|**FlexKnot and Gaussian Process for 21 cm global signal analysis and foreground separation**|Stefan Heimersheim et.al.|[2310.05608v1](http://arxiv.org/abs/2310.05608v1)|null|\n", "2310.05572": "|**2023-10-09**|**A Simple and Robust Framework for Cross-Modality Medical Image Segmentation applied to Vision Transformers**|Matteo Bastico et.al.|[2310.05572v1](http://arxiv.org/abs/2310.05572v1)|**[link](https://github.com/matteo-bastico/mi-seg)**|\n", "2310.05462": "|**2023-10-09**|**AdaFuse: Adaptive Medical Image Fusion Based on Spatial-Frequential Cross Attention**|Xianming Gu et.al.|[2310.05462v1](http://arxiv.org/abs/2310.05462v1)|**[link](https://github.com/xianming-gu/adafuse)**|\n", "2310.05401": "|**2023-10-09**|**Entropy-MCMC: Sampling from Flat Basins with Ease**|Bolian Li et.al.|[2310.05401v1](http://arxiv.org/abs/2310.05401v1)|null|\n", "2310.05364": "|**2023-10-10**|**Universal Multi-modal Entity Alignment via Iteratively Fusing Modality Similarity Paths**|Bolin Zhu et.al.|[2310.05364v2](http://arxiv.org/abs/2310.05364v2)|**[link](https://github.com/blzhu0823/pathfusion)**|\n", "2310.05355": "|**2023-10-09**|**C^2M-DoT: Cross-modal consistent multi-view medical report generation with domain transfer network**|Ruizhi Wang et.al.|[2310.05355v1](http://arxiv.org/abs/2310.05355v1)|null|\n", "2310.05245": "|**2023-10-08**|**Influence of Camera-LiDAR Configuration on 3D Object Detection for Autonomous Driving**|Ye Li et.al.|[2310.05245v1](http://arxiv.org/abs/2310.05245v1)|**[link](https://github.com/safeai-lab/lidar-camera-placement)**|\n", "2310.05193": "|**2023-10-08**|**Improving Discriminative Multi-Modal Learning with Large-Scale Pre-Trained Models**|Chenzhuang Du et.al.|[2310.05193v1](http://arxiv.org/abs/2310.05193v1)|null|\n", "2310.05181": "|**2023-10-08**|**Unified speech and gesture synthesis using flow matching**|Shivam Mehta et.al.|[2310.05181v1](http://arxiv.org/abs/2310.05181v1)|null|\n", "2310.05060": "|**2023-10-08**|**Video-CSR: Complex Video Digest Creation for Visual-Language Models**|Tingkai Liu et.al.|[2310.05060v1](http://arxiv.org/abs/2310.05060v1)|null|\n", "2310.04992": "|**2023-10-08**|**VisionFM: a Multi-Modal Multi-Task Vision Foundation Model for Generalist Ophthalmic Artificial Intelligence**|Jianing Qiu et.al.|[2310.04992v1](http://arxiv.org/abs/2310.04992v1)|null|\n", "2310.04991": "|**2023-10-10**|**Video-Teller: Enhancing Cross-Modal Generation with Fusion and Decoupling**|Haogeng Liu et.al.|[2310.04991v2](http://arxiv.org/abs/2310.04991v2)|null|\n", "2310.04971": "|**2023-10-08**|**Understanding the Robustness of Multi-modal Contrastive Learning to Distribution Shift**|Yihao Xue et.al.|[2310.04971v1](http://arxiv.org/abs/2310.04971v1)|null|\n", "2310.06633": "|**2023-10-10**|**Blind Dates: Examining the Expression of Temporality in Historical Photographs**|Alexandra Barancov\u00e1 et.al.|[2310.06633v1](http://arxiv.org/abs/2310.06633v1)|null|\n", "2310.06627": "|**2023-10-10**|**What If the TV Was Off? Examining Counterfactual Reasoning Abilities of Multi-modal Language Models**|Letian Zhang et.al.|[2310.06627v1](http://arxiv.org/abs/2310.06627v1)|**[link](https://github.com/letian2003/c-vqa)**|\n", "2310.06440": "|**2023-10-10**|**Solution for SMART-101 Challenge of ICCV Multi-modal Algorithmic Reasoning Task 2023**|Xiangyu Wu et.al.|[2310.06440v1](http://arxiv.org/abs/2310.06440v1)|null|\n", "2310.06434": "|**2023-10-10**|**Whispering LLaMA: A Cross-Modal Generative Error Correction Framework for Speech Recognition**|Srijith Radhakrishnan et.al.|[2310.06434v1](http://arxiv.org/abs/2310.06434v1)|**[link](https://github.com/srijith-rkr/whispering-llama)**|\n", "2310.06383": "|**2023-10-10**|**What Makes for Robust Multi-Modal Models in the Face of Missing Modalities?**|Siting Li et.al.|[2310.06383v1](http://arxiv.org/abs/2310.06383v1)|null|\n", "2310.06365": "|**2023-10-10**|**Multi-Modal Knowledge Graph Transformer Framework for Multi-Modal Entity Alignment**|Qian Li et.al.|[2310.06365v1](http://arxiv.org/abs/2310.06365v1)|**[link](https://github.com/xiaoqian19940510/moalign)**|\n", "2310.06342": "|**2023-10-10**|**Contrastive Prompt Learning-based Code Search based on Interaction Matrix**|Yubo Zhang et.al.|[2310.06342v1](http://arxiv.org/abs/2310.06342v1)|null|\n", "2310.06282": "|**2023-10-11**|**MuseChat: A Conversational Music Recommendation System for Videos**|Zhikang Dong et.al.|[2310.06282v2](http://arxiv.org/abs/2310.06282v2)|null|\n", "2310.06259": "|**2023-10-10**|**Cross-modal Cognitive Consensus guided Audio-Visual Segmentation**|Zhaofeng Shi et.al.|[2310.06259v1](http://arxiv.org/abs/2310.06259v1)|null|\n", "2310.06212": "|**2023-10-09**|**Comparison of deep-learning data fusion strategies in mandibular osteoradionecrosis prediction modelling using clinical variables and radiation dose distribution volumes**|Laia Humbert-Vidan et.al.|[2310.06212v1](http://arxiv.org/abs/2310.06212v1)|null|\n", "2310.06008": "|**2023-10-09**|**CoBEVFusion: Cooperative Perception with LiDAR-Camera Bird's-Eye View Fusion**|Donghao Qiao et.al.|[2310.06008v1](http://arxiv.org/abs/2310.06008v1)|null|\n", "2310.07706": "|**2023-10-11**|**Pixel State Value Network for Combined Prediction and Planning in Interactive Environments**|Sascha Rosbach et.al.|[2310.07706v1](http://arxiv.org/abs/2310.07706v1)|null|\n", "2310.07668": "|**2023-10-11**|**GRaMuFeN: Graph-based Multi-modal Fake News Detection in Social Media**|Makan Kananian et.al.|[2310.07668v1](http://arxiv.org/abs/2310.07668v1)|null|\n", "2310.07602": "|**2023-10-11**|**Dual Radar: A Multi-modal Dataset with Dual 4D Radar for Autononous Driving**|Xinyu Zhang et.al.|[2310.07602v1](http://arxiv.org/abs/2310.07602v1)|**[link](https://github.com/adept-thu/dual-radar)**|\n", "2310.07591": "|**2023-10-11**|**PeP: a Point enhanced Painting method for unified point cloud tasks**|Zichao Dong et.al.|[2310.07591v1](http://arxiv.org/abs/2310.07591v1)|null|\n", "2310.07552": "|**2023-10-11**|**ProtoHPE: Prototype-guided High-frequency Patch Enhancement for Visible-Infrared Person Re-identification**|Guiwei Zhang et.al.|[2310.07552v1](http://arxiv.org/abs/2310.07552v1)|null|\n", "2310.07517": "|**2023-10-11**|**CM-PIE: Cross-modal perception for interactive-enhanced audio-visual video parsing**|Yaru Chen et.al.|[2310.07517v1](http://arxiv.org/abs/2310.07517v1)|null|\n", "2310.07355": "|**2023-10-11**|**IMITATE: Clinical Prior Guided Hierarchical Vision-Language Pre-training**|Che Liu et.al.|[2310.07355v1](http://arxiv.org/abs/2310.07355v1)|null|\n", "2310.07276": "|**2023-10-11**|**BioT5: Enriching Cross-modal Integration in Biology with Chemical Knowledge and Natural Language Associations**|Qizhi Pei et.al.|[2310.07276v1](http://arxiv.org/abs/2310.07276v1)|**[link](https://github.com/QizhiPei/BioT5)**|\n", "2310.07265": "|**2023-10-11**|**Distilling Efficient Vision Transformers from CNNs for Semantic Segmentation**|Xu Zheng et.al.|[2310.07265v1](http://arxiv.org/abs/2310.07265v1)|null|\n", "2310.07222": "|**2023-10-11**|**Uni-paint: A Unified Framework for Multimodal Image Inpainting with Pretrained Diffusion Model**|Shiyuan Yang et.al.|[2310.07222v1](http://arxiv.org/abs/2310.07222v1)|**[link](https://github.com/ysy31415/unipaint)**|\n", "2310.07005": "|**2023-10-10**|**Sound-skwatter (Did You Mean: Sound-squatter?) AI-powered Generator for Phishing Prevention**|Rodolfo Valentim et.al.|[2310.07005v1](http://arxiv.org/abs/2310.07005v1)|null|\n", "2310.08530": "|**2023-10-12**|**UniPose: Detecting Any Keypoints**|Jie Yang et.al.|[2310.08530v1](http://arxiv.org/abs/2310.08530v1)|**[link](https://github.com/IDEA-Research/UniPose)**|\n", "2310.08487": "|**2023-10-12**|**GraphextQA: A Benchmark for Evaluating Graph-Enhanced Large Language Models**|Yuanchun Shen et.al.|[2310.08487v1](http://arxiv.org/abs/2310.08487v1)|**[link](https://github.com/happen2me/cross-gnn)**|\n", "2310.08446": "|**2023-10-12**|**Towards Robust Multi-Modal Reasoning via Model Selection**|Xiangyan Liu et.al.|[2310.08446v1](http://arxiv.org/abs/2310.08446v1)|null|\n", "2310.08303": "|**2023-10-12**|**Multimodal Variational Auto-encoder based Audio-Visual Segmentation**|Yuxin Mao et.al.|[2310.08303v1](http://arxiv.org/abs/2310.08303v1)|**[link](https://github.com/opennlplab/mmvae-avs)**|\n", "2310.08285": "|**2023-10-12**|**How would mobility-as-a-service (MaaS) platform survive as an intermediary? From the viewpoint of stability in many-to-many matching**|Rui Yao et.al.|[2310.08285v1](http://arxiv.org/abs/2310.08285v1)|null|\n", "2310.08270": "|**2023-10-12**|**Hilbert Space Embedding-based Trajectory Optimization for Multi-Modal Uncertain Obstacle Trajectory Prediction**|Basant Sharma et.al.|[2310.08270v1](http://arxiv.org/abs/2310.08270v1)|null|\n", "2310.08261": "|**2023-10-12**|**GraphAlign: Enhancing Accurate Feature Alignment by Graph matching for Multi-Modal 3D Object Detection**|Ziying Song et.al.|[2310.08261v1](http://arxiv.org/abs/2310.08261v1)|null|\n", "2310.08166": "|**2023-10-12**|**Ziya-VL: Bilingual Large Vision-Language Model via Multi-Task Instruction Tuning**|Junyu Lu et.al.|[2310.08166v1](http://arxiv.org/abs/2310.08166v1)|null|\n", "2310.08114": "|**2023-10-12**|**Multi-Modal Sensor Fusion and Object Tracking for Autonomous Racing**|Phillip Karle et.al.|[2310.08114v1](http://arxiv.org/abs/2310.08114v1)|**[link](https://github.com/tumftm/fusiontracking)**|\n", "2310.08103": "|**2023-10-12**|**Radio Galaxy Zoo: tagging radio subjects using text**|Dawei Chen et.al.|[2310.08103v1](http://arxiv.org/abs/2310.08103v1)|null|\n", "2310.08027": "|**2023-10-12**|**Exploring Large Language Models for Multi-Modal Out-of-Distribution Detection**|Yi Dai et.al.|[2310.08027v1](http://arxiv.org/abs/2310.08027v1)|null|\n", "2310.08026": "|**2023-10-12**|**Beyond Sharing Weights in Decoupling Feature Learning Network for UAV RGB-Infrared Vehicle Re-Identification**|Xingyue Liu et.al.|[2310.08026v1](http://arxiv.org/abs/2310.08026v1)|null|\n", "2310.07990": "|**2023-10-12**|**Multi-View Variational Autoencoder for Missing Value Imputation in Untargeted Metabolomics**|Chen Zhao et.al.|[2310.07990v1](http://arxiv.org/abs/2310.07990v1)|null|\n", "2310.07944": "|**2023-10-11**|**AutoRepo: A general framework for multi-modal LLM-based automated construction reporting**|Hongxu Pu et.al.|[2310.07944v1](http://arxiv.org/abs/2310.07944v1)|null|\n", "2310.07940": "|**2023-10-11**|**Cost-Driven Hardware-Software Co-Optimization of Machine Learning Pipelines**|Ravit Sharma et.al.|[2310.07940v1](http://arxiv.org/abs/2310.07940v1)|null|\n", "2310.10651": "|**2023-10-16**|**HairCLIPv2: Unifying Hair Editing via Proxy Feature Blending**|Tianyi Wei et.al.|[2310.10651v1](http://arxiv.org/abs/2310.10651v1)|**[link](https://github.com/wty-ustc/hairclipv2)**|\n", "2310.10414": "|**2023-10-16**|**Style transfer between Microscopy and Magnetic Resonance Imaging via Generative Adversarial Network in small sample size settings**|Monika Pytlarz et.al.|[2310.10414v1](http://arxiv.org/abs/2310.10414v1)|null|\n", "2310.10371": "|**2023-10-16**|**Camera-LiDAR Fusion with Latent Contact for Place Recognition in Challenging Cross-Scenes**|Yan Pan et.al.|[2310.10371v1](http://arxiv.org/abs/2310.10371v1)|null|\n", "2310.10347": "|**2023-10-16**|**Editable-DeepSC: Cross-Modal Editable Semantic Communication Systems**|Wenbo Yu et.al.|[2310.10347v1](http://arxiv.org/abs/2310.10347v1)|null|\n", "2310.10290": "|**2023-10-16**|**Autonomous Mapping and Navigation using Fiducial Markers and Pan-Tilt Camera for Assisting Indoor Mobility of Blind and Visually Impaired People**|Dharmateja Adapa et.al.|[2310.10290v1](http://arxiv.org/abs/2310.10290v1)|null|\n", "2310.10125": "|**2023-10-16**|**Few-shot Action Recognition with Captioning Foundation Models**|Xiang Wang et.al.|[2310.10125v1](http://arxiv.org/abs/2310.10125v1)|null|\n", "2310.10010": "|**2023-10-16**|**Black-box Targeted Adversarial Attack on Segment Anything (SAM)**|Sheng Zheng et.al.|[2310.10010v1](http://arxiv.org/abs/2310.10010v1)|null|\n", "2310.09761": "|**2023-10-15**|**CAPro: Webly Supervised Learning with Cross-Modality Aligned Prototypes**|Yulei Qin et.al.|[2310.09761v1](http://arxiv.org/abs/2310.09761v1)|**[link](https://github.com/yuleiqin/capro)**|\n", "2310.09755": "|**2023-10-15**|**Beyond Segmentation: Road Network Generation with Multi-Modal LLMs**|Sumedh Rasal et.al.|[2310.09755v1](http://arxiv.org/abs/2310.09755v1)|null|\n", "2310.09714": "|**2023-10-15**|**Enhancing Task Performance of Learned Simplified Models via Reinforcement Learning**|Hien Bui et.al.|[2310.09714v1](http://arxiv.org/abs/2310.09714v1)|null|\n", "2310.09696": "|**2023-10-15**|**Progressive Evidence Refinement for Open-domain Multimodal Retrieval Question Answering**|Shuwen Yang et.al.|[2310.09696v1](http://arxiv.org/abs/2310.09696v1)|null|\n", "2310.09503": "|**2023-10-14**|**JM3D & JM3D-LLM: Elevating 3D Representation with Joint Multi-modal Cues**|Jiayi Ji et.al.|[2310.09503v1](http://arxiv.org/abs/2310.09503v1)|**[link](https://github.com/mr-neko/jm3d)**|\n", "2310.09478": "|**2023-10-14**|**MiniGPT-v2: large language model as a unified interface for vision-language multi-task learning**|Jun Chen et.al.|[2310.09478v1](http://arxiv.org/abs/2310.09478v1)|null|\n", "2310.09199": "|**2023-10-13**|**PaLI-3 Vision Language Models: Smaller, Faster, Stronger**|Xi Chen et.al.|[2310.09199v1](http://arxiv.org/abs/2310.09199v1)|null|\n", "2310.09165": "|**2023-10-13**|**Towards Robust UAV Tracking in GNSS-Denied Environments: A Multi-LiDAR Multi-UAV Dataset**|Iacopo Catalano et.al.|[2310.09165v1](http://arxiv.org/abs/2310.09165v1)|**[link](https://github.com/tiers/uav_multi_lidar_dataset)**|\n", "2310.11374": "|**2023-10-17**|**DialogueLLM: Context and Emotion Knowledge-Tuned LLaMA Models for Emotion Recognition in Conversations**|Yazhou Zhang et.al.|[2310.11374v1](http://arxiv.org/abs/2310.11374v1)|null|\n", "2310.11316": "|**2023-10-17**|**MonoSKD: General Distillation Framework for Monocular 3D Object Detection via Spearman Correlation Coefficient**|Sen Wang et.al.|[2310.11316v1](http://arxiv.org/abs/2310.11316v1)|**[link](https://github.com/senwang98/monoskd)**|\n", "2310.11307": "|**2023-10-17**|**Multi Self-supervised Pre-fine-tuned Transformer Fusion for Better Intelligent Transportation Detection**|Juwu Zheng et.al.|[2310.11307v1](http://arxiv.org/abs/2310.11307v1)|null|\n", "2310.11295": "|**2023-10-17**|**CorrTalk: Correlation Between Hierarchical Speech and Facial Activity Variances for 3D Animation**|Zhaojie Chu et.al.|[2310.11295v1](http://arxiv.org/abs/2310.11295v1)|null|\n", "2310.10942": "|**2023-10-17**|**Unanswerable Visual Question Answering**|Yanyang Guo et.al.|[2310.10942v1](http://arxiv.org/abs/2310.10942v1)|**[link](https://github.com/guoyang9/unk-vqa)**|\n", "2310.10844": "|**2023-10-16**|**Survey of Vulnerabilities in Large Language Models Revealed by Adversarial Attacks**|Erfan Shayegani et.al.|[2310.10844v1](http://arxiv.org/abs/2310.10844v1)|null|\n", "2310.12081": "|**2023-10-18**|**DHOT-GM: Robust Graph Matching Using A Differentiable Hierarchical Optimal Transport Framework**|Haoran Cheng et.al.|[2310.12081v1](http://arxiv.org/abs/2310.12081v1)|null|\n", "2310.11989": "|**2023-10-18**|**Image Clustering with External Guidance**|Yunfan Li et.al.|[2310.11989v1](http://arxiv.org/abs/2310.11989v1)|null|\n", "2310.11939": "|**2023-10-18**|**Mixture distributions for probabilistic forecasts of disease outbreaks**|Spencer Wadsworth et.al.|[2310.11939v1](http://arxiv.org/abs/2310.11939v1)|null|\n", "2310.11938": "|**2023-10-18**|**Grounded and Well-rounded: A Methodological Approach to the Study of Cross-modal and Cross-lingual Grounding**|Timothee Mickus et.al.|[2310.11938v1](http://arxiv.org/abs/2310.11938v1)|null|\n", "2310.11910": "|**2023-10-18**|**Multi-modal Medical Neurological Image Fusion using Wavelet Pooled Edge Preserving Autoencoder**|Manisha Das et.al.|[2310.11910v1](http://arxiv.org/abs/2310.11910v1)|null|\n", "2310.11713": "|**2023-10-18**|**Separating Invisible Sounds Toward Universal Audiovisual Scene-Aware Sound Separation**|Yiyang Su et.al.|[2310.11713v1](http://arxiv.org/abs/2310.11713v1)|null|\n", "2310.11612": "|**2023-10-17**|**Balance Act: Mitigating Hubness in Cross-Modal Retrieval with Query and Gallery Banks**|Yimu Wang et.al.|[2310.11612v1](http://arxiv.org/abs/2310.11612v1)|**[link](https://github.com/yimuwangcs/Better_Cross_Modal_Retrieval)**|\n", "2310.12973": "|**2023-10-19**|**Frozen Transformers in Language Models Are Effective Visual Encoder Layers**|Ziqi Pang et.al.|[2310.12973v1](http://arxiv.org/abs/2310.12973v1)|**[link](https://github.com/ziqipang/lm4visualencoding)**|\n", "2310.12798": "|**2023-10-19**|**MolCA: Molecular Graph-Language Modeling with Cross-Modal Projector and Uni-Modal Adapter**|Zhiyuan Liu et.al.|[2310.12798v1](http://arxiv.org/abs/2310.12798v1)|**[link](https://github.com/acharkq/molca)**|\n", "2310.12609": "|**2023-10-19**|**Denoising Heat-inspired Diffusion with Insulators for Collision Free Motion Planning**|Junwoo Chang et.al.|[2310.12609v1](http://arxiv.org/abs/2310.12609v1)|null|\n", "2310.12520": "|**2023-10-19**|**Lost in Translation: When GPT-4V(ision) Can't See Eye to Eye with Text. A Vision-Language-Consistency Analysis of VLLMs and Beyond**|Xiang Zhang et.al.|[2310.12520v1](http://arxiv.org/abs/2310.12520v1)|null|\n", "2310.12518": "|**2023-10-19**|**Light-enhanced van der Waals force microscopy**|Han Yu-Xiao et.al.|[2310.12518v1](http://arxiv.org/abs/2310.12518v1)|null|\n", "2310.12344": "|**2023-10-18**|**LACMA: Language-Aligning Contrastive Learning with Meta-Actions for Embodied Instruction Following**|Cheng-Fu Yang et.al.|[2310.12344v1](http://arxiv.org/abs/2310.12344v1)|**[link](https://github.com/joeyy5588/lacma)**|\n", "2310.13619": "|**2023-10-20**|**Semi-supervised multimodal coreference resolution in image narrations**|Arushi Goel et.al.|[2310.13619v1](http://arxiv.org/abs/2310.13619v1)|**[link](https://github.com/vico-uoe/cin-ssl)**|\n", "2310.13596": "|**2023-10-20**|**MarineGPT: Unlocking Secrets of Ocean to the Public**|Ziqiang Zheng et.al.|[2310.13596v1](http://arxiv.org/abs/2310.13596v1)|**[link](https://github.com/hkust-vgd/marinegpt)**|\n", "2310.13451": "|**2023-10-20**|**Two-Stage Triplet Loss Training with Curriculum Augmentation for Audio-Visual Retrieval**|Donghuo Zeng et.al.|[2310.13451v1](http://arxiv.org/abs/2310.13451v1)|null|\n", "2310.13398": "|**2023-10-20**|**OpenAnnotate3D: Open-Vocabulary Auto-Labeling System for Multi-modal 3D Data**|Yijie Zhou et.al.|[2310.13398v1](http://arxiv.org/abs/2310.13398v1)|null|\n", "2310.13289": "|**2023-10-20**|**SALMONN: Towards Generic Hearing Abilities for Large Language Models**|Changli Tang et.al.|[2310.13289v1](http://arxiv.org/abs/2310.13289v1)|**[link](https://github.com/bytedance/salmonn)**|\n", "2310.13276": "|**2023-10-20**|**InvGC: Robust Cross-Modal Retrieval by Inverse Graph Convolution**|Xiangru Jian et.al.|[2310.13276v1](http://arxiv.org/abs/2310.13276v1)|**[link](https://github.com/yimuwangcs/Better_Cross_Modal_Retrieval)**|\n", "2310.13267": "|**2023-10-20**|**On the Language Encoder of Contrastive Cross-modal Models**|Mengjie Zhao et.al.|[2310.13267v1](http://arxiv.org/abs/2310.13267v1)|null|\n", "2310.13265": "|**2023-10-20**|**MoqaGPT : Zero-Shot Multi-modal Open-domain Question Answering with Large Language Model**|Le Zhang et.al.|[2310.13265v1](http://arxiv.org/abs/2310.13265v1)|**[link](https://github.com/lezhang7/moqagpt)**|\n", "2310.13257": "|**2023-10-20**|**Visual Grounding Helps Learn Word Meanings in Low-Data Regimes**|Chengxu Zhuang et.al.|[2310.13257v1](http://arxiv.org/abs/2310.13257v1)|null|\n", "2310.13235": "|**2023-10-20**|**Auxiliary Features-Guided Super Resolution for Monte Carlo Rendering**|Qiqi Hou et.al.|[2310.13235v1](http://arxiv.org/abs/2310.13235v1)|null|\n", "2310.13103": "|**2023-10-19**|**AVTENet: Audio-Visual Transformer-based Ensemble Network Exploiting Multiple Experts for Video Deepfake Detection**|Ammarah Hashmi et.al.|[2310.13103v1](http://arxiv.org/abs/2310.13103v1)|null|\n", "2310.14924": "|**2023-10-23**|**Converting Depth Images and Point Clouds for Feature-based Pose Estimation**|Robert L\u00f6sch et.al.|[2310.14924v1](http://arxiv.org/abs/2310.14924v1)|**[link](https://github.com/rlsch/depth-conversions)**|\n", "2310.14805": "|**2023-10-23**|**Cross-Modal Conceptualization in Bottleneck Models**|Danis Alukaev et.al.|[2310.14805v1](http://arxiv.org/abs/2310.14805v1)|**[link](https://github.com/danisalukaev/xcbs)**|\n", "2310.14785": "|**2023-10-23**|**Vision-Enhanced Semantic Entity Recognition in Document Images via Visually-Asymmetric Consistency Learning**|Hao Wang et.al.|[2310.14785v1](http://arxiv.org/abs/2310.14785v1)|null|\n", "2310.14720": "|**2023-10-23**|**Extended Deep Adaptive Input Normalization for Preprocessing Time Series Data for Neural Networks**|Marcus A. K. September et.al.|[2310.14720v1](http://arxiv.org/abs/2310.14720v1)|**[link](https://github.com/marcusgh/edain_paper)**|\n", "2310.14702": "|**2023-10-23**|**BM2CP: Efficient Collaborative Perception with LiDAR-Camera Modalities**|Binyu Zhao et.al.|[2310.14702v1](http://arxiv.org/abs/2310.14702v1)|**[link](https://github.com/byzhaoai/bm2cp)**|\n", "2310.14643": "|**2023-10-23**|**Dynamic gain and frequency comb formation in exceptional-point lasers**|Xingwei Gao et.al.|[2310.14643v1](http://arxiv.org/abs/2310.14643v1)|null|\n", "2310.14566": "|**2023-10-23**|**HallusionBench: You See What You Think? Or You Think What You See? An Image-Context Reasoning Benchmark Challenging for GPT-4V(ision), LLaVA-1.5, and Other Multi-modality Models**|Fuxiao Liu et.al.|[2310.14566v1](http://arxiv.org/abs/2310.14566v1)|**[link](https://github.com/tianyi-lab/hallusionbench)**|\n", "2310.14549": "|**2023-10-23**|**Multimodal Graph Learning for Modeling Emerging Pandemics with Big Data**|Khanh-Tung Tran et.al.|[2310.14549v1](http://arxiv.org/abs/2310.14549v1)|**[link](https://github.com/khanhtungtran/mgl4mep)**|\n", "2310.14278": "|**2023-10-22**|**Conversational Speech Recognition by Learning Audio-textual Cross-modal Contextual Representation**|Kun Wei et.al.|[2310.14278v1](http://arxiv.org/abs/2310.14278v1)|null|\n", "2310.14226": "|**2023-10-22**|**Multi-stream Cell Segmentation with Low-level Cues for Multi-modality Images**|Wei Lou et.al.|[2310.14226v1](http://arxiv.org/abs/2310.14226v1)|**[link](https://github.com/lhaof/cellseg)**|\n", "2310.14216": "|**2023-10-22**|**UniMAP: Universal SMILES-Graph Representation Learning**|Shikun Feng et.al.|[2310.14216v1](http://arxiv.org/abs/2310.14216v1)|**[link](https://github.com/fengshikun/unimap)**|\n", "2310.14158": "|**2023-10-22**|**Visual-Attribute Prompt Learning for Progressive Mild Cognitive Impairment Prediction**|Luoyao Kang et.al.|[2310.14158v1](http://arxiv.org/abs/2310.14158v1)|**[link](https://github.com/lhaof/vapl)**|\n", "2310.14075": "|**2023-10-21**|**Unsupervised Sim-to-Real Adaptation of Soft Robot Proprioception using a Dual Cross-modal Autoencoder**|Chaeree Park et.al.|[2310.14075v1](http://arxiv.org/abs/2310.14075v1)|null|\n", "2310.14037": "|**2023-10-21**|**Unlock Multi-Modal Capability of Dense Retrieval via Visual Module Plugin**|Tianshuo Zhou et.al.|[2310.14037v1](http://arxiv.org/abs/2310.14037v1)|**[link](https://github.com/openmatch/marvel)**|\n", "2310.13898": "|**2023-10-21**|**Computational and Systems Biology Advances to Enable for Bioagent Agnostic Signatures**|Andy Lin et.al.|[2310.13898v1](http://arxiv.org/abs/2310.13898v1)|null|\n", "2310.15887": "|**2023-10-24**|**AdaptiX -- A Transitional XR Framework for Development and Evaluation of Shared Control Applications in Assistive Robotics**|Max Pascher et.al.|[2310.15887v1](http://arxiv.org/abs/2310.15887v1)|**[link](https://github.com/maxpascher/AdaptiX)**|\n", "2310.15676": "|**2023-10-24**|**Recent Advances in Multi-modal 3D Scene Understanding: A Comprehensive Survey and Evaluation**|Yinjie Lei et.al.|[2310.15676v1](http://arxiv.org/abs/2310.15676v1)|null|\n", "2310.15670": "|**2023-10-24**|**Leveraging Vision-Centric Multi-Modal Expertise for 3D Object Detection**|Linyan Huang et.al.|[2310.15670v1](http://arxiv.org/abs/2310.15670v1)|**[link](https://github.com/opendrivelab/birds-eye-view-perception)**|\n", "2310.15587": "|**2023-10-24**|**ScanDL: A Diffusion Model for Generating Synthetic Scanpaths on Texts**|Lena S. Bolliger et.al.|[2310.15587v1](http://arxiv.org/abs/2310.15587v1)|**[link](https://github.com/dili-lab/scandl)**|\n", "2310.15585": "|**2023-10-24**|**Multimodal Representations for Teacher-Guided Compositional Visual Reasoning**|Wafa Aissa et.al.|[2310.15585v1](http://arxiv.org/abs/2310.15585v1)|null|\n", "2310.15568": "|**2023-10-24**|**I$^2$MD: 3D Action Representation Learning with Inter- and Intra-modal Mutual Distillation**|Yunyao Mao et.al.|[2310.15568v1](http://arxiv.org/abs/2310.15568v1)|null|\n", "2310.15482": "|**2023-10-24**|**Salient Object Detection in RGB-D Videos**|Ao Mou et.al.|[2310.15482v1](http://arxiv.org/abs/2310.15482v1)|**[link](https://github.com/kerenfu/rdvs)**|\n", "2310.15325": "|**2023-10-23**|**LXMERT Model Compression for Visual Question Answering**|Maryam Hashemi et.al.|[2310.15325v1](http://arxiv.org/abs/2310.15325v1)|**[link](https://github.com/ghazaleh-mahmoodi/lxmert_compression)**|\n", "2310.15301": "|**2023-10-23**|**ADMarker: A Multi-Modal Federated Learning System for Monitoring Digital Biomarkers of Alzheimer's Disease**|Xiaomin Ouyang et.al.|[2310.15301v1](http://arxiv.org/abs/2310.15301v1)|null|\n", "2310.15281": "|**2023-10-23**|**UncertaintyPlayground: A Fast and Simplified Python Library for Uncertainty Estimation**|Ilia Azizi et.al.|[2310.15281v1](http://arxiv.org/abs/2310.15281v1)|**[link](https://github.com/Unco3892/UncertaintyPlayground)**|\n", "2310.16781": "|**2023-10-25**|**Kiki or Bouba? Sound Symbolism in Vision-and-Language Models**|Morris Alper et.al.|[2310.16781v1](http://arxiv.org/abs/2310.16781v1)|null|\n", "2310.16754": "|**2023-10-25**|**CAD -- Contextual Multi-modal Alignment for Dynamic AVQA**|Asmar Nadeem et.al.|[2310.16754v1](http://arxiv.org/abs/2310.16754v1)|null|\n", "2310.16641": "|**2023-10-25**|**The Next Evolution of Artificial Sense of Touch**|Sonja Gro\u00df et.al.|[2310.16641v1](http://arxiv.org/abs/2310.16641v1)|null|\n", "2310.16629": "|**2023-10-25**|**EdgeCalib: Multi-Frame Weighted Edge Features for Automatic Targetless LiDAR-Camera Calibration**|Xingchen Li et.al.|[2310.16629v1](http://arxiv.org/abs/2310.16629v1)|null|\n", "2310.16590": "|**2023-10-25**|**$\\mathbb{VD}$-$\\mathbb{GR}$: Boosting $\\mathbb{V}$isual $\\mathbb{D}$ialog with Cascaded Spatial-Temporal Multi-Modal $\\mathbb{GR}$aphs**|Adnen Abdessaied et.al.|[2310.16590v1](http://arxiv.org/abs/2310.16590v1)|null|\n", "2310.16477": "|**2023-10-25**|**Show from Tell: Audio-Visual Modelling in Clinical Settings**|Jianbo Jiao et.al.|[2310.16477v1](http://arxiv.org/abs/2310.16477v1)|null|\n", "2310.16402": "|**2023-10-25**|**Video Referring Expression Comprehension via Transformer with Content-conditioned Query**|Ji Jiang et.al.|[2310.16402v1](http://arxiv.org/abs/2310.16402v1)|null|\n", "2310.16380": "|**2023-10-25**|**A model for multi-attack classification to improve intrusion detection performance using deep learning approaches**|Arun Kumar Silivery et.al.|[2310.16380v1](http://arxiv.org/abs/2310.16380v1)|null|\n", "2310.16356": "|**2023-10-25**|**A Multi-Modal Multilingual Benchmark for Document Image Classification**|Yoshinari Fujinuma et.al.|[2310.16356v1](http://arxiv.org/abs/2310.16356v1)|null|\n", "2310.16273": "|**2023-10-25**|**Deep Learning for Plant Identification and Disease Classification from Leaf Images: Multi-prediction Approaches**|Jianping Yao et.al.|[2310.16273v1](http://arxiv.org/abs/2310.16273v1)|**[link](https://github.com/funzi-son/plant_pathology_dl)**|\n", "2310.17642": "|**2023-10-26**|**Drive Anywhere: Generalizable End-to-end Autonomous Driving with Multi-modal Foundation Models**|Tsun-Hsuan Wang et.al.|[2310.17642v1](http://arxiv.org/abs/2310.17642v1)|null|\n", "2310.17568": "|**2023-10-26**|**Navigating to Success in Multi-Modal Human-Robot Collaboration: Analysis and Corpus Release**|Stephanie M. Lukin et.al.|[2310.17568v1](http://arxiv.org/abs/2310.17568v1)|null|\n", "2310.17540": "|**2023-10-26**|**EqDrive: Efficient Equivariant Motion Forecasting with Multi-Modality for Autonomous Driving**|Yuping Wang et.al.|[2310.17540v1](http://arxiv.org/abs/2310.17540v1)|null|\n", "2310.17468": "|**2023-10-26**|**Cross-modal Active Complementary Learning with Self-refining Correspondence**|Yang Qin et.al.|[2310.17468v1](http://arxiv.org/abs/2310.17468v1)|**[link](https://github.com/qinyang79/crcl)**|\n", "2310.17323": "|**2023-10-26**|**IndustReal: A Dataset for Procedure Step Recognition Handling Execution Errors in Egocentric Videos in an Industrial-Like Setting**|Tim J. Schoonbeek et.al.|[2310.17323v1](http://arxiv.org/abs/2310.17323v1)|**[link](https://github.com/timschoonbeek/industreal)**|\n", "2310.17133": "|**2023-10-26**|**Incorporating Probing Signals into Multimodal Machine Translation via Visual Question-Answering Pairs**|Yuxin Zuo et.al.|[2310.17133v1](http://arxiv.org/abs/2310.17133v1)|**[link](https://github.com/libeineu/mmt-vqa)**|\n", "2310.17025": "|**2023-10-25**|**netFound: Foundation Model for Network Security**|Satyandra Guthula et.al.|[2310.17025v1](http://arxiv.org/abs/2310.17025v1)|null|\n", "2310.16917": "|**2023-10-25**|**MimicTouch: Learning Human's Control Strategy with Multi-Modal Tactile Feedback**|Kelin Yu et.al.|[2310.16917v1](http://arxiv.org/abs/2310.16917v1)|null|\n", "2310.18049": "|**2023-10-27**|**Text Augmented Spatial-aware Zero-shot Referring Image Segmentation**|Yucheng Suo et.al.|[2310.18049v1](http://arxiv.org/abs/2310.18049v1)|null|\n", "2310.17956": "|**2023-10-27**|**Qilin-Med-VL: Towards Chinese Large Vision-Language Model for General Healthcare**|Junling Liu et.al.|[2310.17956v1](http://arxiv.org/abs/2310.17956v1)|**[link](https://github.com/williamliujl/qilin-med-vl)**|\n", "2310.17933": "|**2023-10-27**|**A barycenter-based approach for the multi-model ensembling of subseasonal forecasts**|Camille Le Coz et.al.|[2310.17933v1](http://arxiv.org/abs/2310.17933v1)|null|\n", "2310.17852": "|**2023-10-27**|**Function Space Bayesian Pseudocoreset for Bayesian Neural Networks**|Balhae Kim et.al.|[2310.17852v1](http://arxiv.org/abs/2310.17852v1)|null|\n", "2310.17796": "|**2023-10-26**|**ControlLLM: Augment Language Models with Tools by Searching on Graphs**|Zhaoyang Liu et.al.|[2310.17796v1](http://arxiv.org/abs/2310.17796v1)|**[link](https://github.com/opengvlab/controlllm)**|\n", "2310.17770": "|**2023-10-26**|**GROOViST: A Metric for Grounding Objects in Visual Storytelling**|Aditya K Surikuchi et.al.|[2310.17770v1](http://arxiv.org/abs/2310.17770v1)|**[link](https://github.com/akskuchi/groovist)**|\n", "2310.17737": "|**2023-10-26**|**ArchBERT: Bi-Modal Understanding of Neural Architectures and Natural Languages**|Mohammad Akbari et.al.|[2310.17737v1](http://arxiv.org/abs/2310.17737v1)|null|\n", "2310.19168": "|**2023-10-29**|**BirdSAT: Cross-View Contrastive Masked Autoencoders for Bird Species Classification and Mapping**|Srikumar Sastry et.al.|[2310.19168v1](http://arxiv.org/abs/2310.19168v1)|**[link](https://github.com/mvrl/birdsat)**|\n", "2310.19070": "|**2023-10-29**|**Myriad: Large Multimodal Model by Applying Vision Experts for Industrial Anomaly Detection**|Yuanze Li et.al.|[2310.19070v1](http://arxiv.org/abs/2310.19070v1)|null|\n", "2310.19062": "|**2023-10-29**|**A multi-modal table tennis robot system**|Andreas Ziegler et.al.|[2310.19062v1](http://arxiv.org/abs/2310.19062v1)|null|\n", "2310.19001": "|**2023-10-29**|**Uncovering Prototypical Knowledge for Weakly Open-Vocabulary Semantic Segmentation**|Fei Zhang et.al.|[2310.19001v1](http://arxiv.org/abs/2310.19001v1)|null|\n", "2310.18949": "|**2023-10-29**|**Customize StyleGAN with One Hand Sketch**|Shaocong Zhang et.al.|[2310.18949v1](http://arxiv.org/abs/2310.18949v1)|null|\n", "2310.18890": "|**2023-10-29**|**Towards Generalized Multi-stage Clustering: Multi-view Self-distillation**|Jiatai Wang et.al.|[2310.18890v1](http://arxiv.org/abs/2310.18890v1)|null|\n", "2310.18728": "|**2023-10-28**|**Online Multi-view Anomaly Detection with Disentangled Product-of-Experts Modeling**|Hao Wang et.al.|[2310.18728v1](http://arxiv.org/abs/2310.18728v1)|null|\n", "2310.18709": "|**2023-10-28**|**Audio-Visual Instance Segmentation**|Ruohao Guo et.al.|[2310.18709v1](http://arxiv.org/abs/2310.18709v1)|null|\n", "2310.18652": "|**2023-10-28**|**EHRXQA: A Multi-Modal Question Answering Dataset for Electronic Health Records with Chest X-ray Images**|Seongsu Bae et.al.|[2310.18652v1](http://arxiv.org/abs/2310.18652v1)|**[link](https://github.com/baeseongsu/ehrxqa)**|\n", "2310.18620": "|**2023-10-28**|**ODM3D: Alleviating Foreground Sparsity for Enhanced Semi-Supervised Monocular 3D Object Detection**|Weijia Zhang et.al.|[2310.18620v1](http://arxiv.org/abs/2310.18620v1)|null|\n", "2310.18583": "|**2023-10-28**|**Self-Supervised Multi-Modality Learning for Multi-Label Skin Lesion Classification**|Hao Wang et.al.|[2310.18583v1](http://arxiv.org/abs/2310.18583v1)|**[link](https://github.com/dylan-h-wang/skin-sm3)**|\n", "2310.18481": "|**2023-10-27**|**MOSEL: Inference Serving Using Dynamic Modality Selection**|Bodun Hu et.al.|[2310.18481v1](http://arxiv.org/abs/2310.18481v1)|null|\n", "2310.18438": "|**2023-10-27**|**Exploring Shape Embedding for Cloth-Changing Person Re-Identification via 2D-3D Correspondences**|Yubin Wang et.al.|[2310.18438v1](http://arxiv.org/abs/2310.18438v1)|null|\n", "2310.20561": "|**2023-10-31**|**Predictive Control for Autonomous Driving with Uncertain, Multi-modal Predictions**|Siddharth H. Nair et.al.|[2310.20561v1](http://arxiv.org/abs/2310.20561v1)|null|\n", "2310.20446": "|**2023-10-31**|**LAVSS: Location-Guided Audio-Visual Spatial Audio Separation**|Yuxin Ye et.al.|[2310.20446v1](http://arxiv.org/abs/2310.20446v1)|null|\n", "2310.20357": "|**2023-11-01**|**Enhancing the Spatial Awareness Capability of Multi-Modal Large Language Model**|Yongqiang Zhao et.al.|[2310.20357v2](http://arxiv.org/abs/2310.20357v2)|null|\n", "2310.20343": "|**2023-10-31**|**Large Multi-modal Encoders for Recommendation**|Zixuan Yi et.al.|[2310.20343v1](http://arxiv.org/abs/2310.20343v1)|null|\n", "2310.20025": "|**2023-10-30**|**GOPlan: Goal-conditioned Offline Reinforcement Learning by Planning with Learned Models**|Mianchu Wang et.al.|[2310.20025v1](http://arxiv.org/abs/2310.20025v1)|null|\n", "2310.19795": "|**2023-10-30**|**SimMMDG: A Simple and Effective Framework for Multi-modal Domain Generalization**|Hao Dong et.al.|[2310.19795v1](http://arxiv.org/abs/2310.19795v1)|**[link](https://github.com/donghao51/simmmdg)**|\n", "2310.19743": "|**2023-10-30**|**Tell Me What Is Good About This Property: Leveraging Reviews For Segment-Personalized Image Collection Summarization**|Monika Wysoczanska et.al.|[2310.19743v1](http://arxiv.org/abs/2310.19743v1)|null|\n", "2310.19654": "|**2023-10-30**|**MCAD: Multi-teacher Cross-modal Alignment Distillation for efficient image-text retrieval**|Youbo Lei et.al.|[2310.19654v1](http://arxiv.org/abs/2310.19654v1)|null|\n", "2310.19635": "|**2023-10-30**|**Bidirectional Captioning for Clinically Accurate and Interpretable Models**|Keegan Quigley et.al.|[2310.19635v1](http://arxiv.org/abs/2310.19635v1)|null|\n", "2310.19608": "|**2023-10-30**|**On Feynman--Kac training of partial Bayesian neural networks**|Zheng Zhao et.al.|[2310.19608v1](http://arxiv.org/abs/2310.19608v1)|null|\n", "2310.19559": "|**2023-10-30**|**Disentangled Counterfactual Learning for Physical Audiovisual Commonsense Reasoning**|Changsheng Lv et.al.|[2310.19559v1](http://arxiv.org/abs/2310.19559v1)|null|\n", "2310.19554": "|**2023-10-30**|**Harvest Video Foundation Models via Efficient Post-Pretraining**|Yizhuo Li et.al.|[2310.19554v1](http://arxiv.org/abs/2310.19554v1)|**[link](https://github.com/opengvlab/internvideo)**|\n", "2310.19432": "|**2023-10-30**|**Explaining the Decisions of Deep Policy Networks for Robotic Manipulations**|Seongun Kim et.al.|[2310.19432v1](http://arxiv.org/abs/2310.19432v1)|null|\n", "2310.19264": "|**2023-10-30**|**Sound of Story: Multi-modal Storytelling with Audio**|Jaeyeon Bae et.al.|[2310.19264v1](http://arxiv.org/abs/2310.19264v1)|null|\n", "2311.00618": "|**2023-11-01**|**De-Diffusion Makes Text a Strong Cross-Modal Interface**|Chen Wei et.al.|[2311.00618v1](http://arxiv.org/abs/2311.00618v1)|null|\n", "2311.00566": "|**2023-11-01**|**CROMA: Remote Sensing Representations with Contrastive Radar-Optical Masked Autoencoders**|Anthony Fuller et.al.|[2311.00566v1](http://arxiv.org/abs/2311.00566v1)|**[link](https://github.com/antofuller/croma)**|\n", "2311.00436": "|**2023-11-01**|**Enhancing Traffic Object Detection in Variable Illumination with RGB-Event Fusion**|Zhanwen Liu et.al.|[2311.00436v1](http://arxiv.org/abs/2311.00436v1)|null|\n", "2311.00265": "|**2023-11-01**|**Adaptive Latent Diffusion Model for 3D Medical Image to Image Translation: Multi-modal Magnetic Resonance Imaging Study**|Jonghun Kim et.al.|[2311.00265v1](http://arxiv.org/abs/2311.00265v1)|**[link](https://github.com/jongdory/aldm)**|\n", "2311.00207": "|**2023-11-01**|**Magmaw: Modality-Agnostic Adversarial Attacks on Machine Learning-Based Wireless Communication Systems**|Jung-Woo Chang et.al.|[2311.00207v1](http://arxiv.org/abs/2311.00207v1)|null|\n", "2311.01459": "|**2023-11-02**|**Align Your Prompts: Test-Time Prompting with Distribution Alignment for Zero-Shot Generalization**|Jameel Hassan et.al.|[2311.01459v1](http://arxiv.org/abs/2311.01459v1)|null|\n", "2311.01361": "|**2023-11-02**|**GPT-4V(ision) as a Generalist Evaluator for Vision-Language Tasks**|Xinlu Zhang et.al.|[2311.01361v1](http://arxiv.org/abs/2311.01361v1)|null|\n", "2311.01202": "|**2023-11-02**|**Cross-Modal Information-Guided Network using Contrastive Learning for Point Cloud Registration**|Yifan Xie et.al.|[2311.01202v1](http://arxiv.org/abs/2311.01202v1)|**[link](https://github.com/ivanxie416/cmignet)**|\n", "2311.01092": "|**2023-11-02**|**Learning A Multi-Task Transformer Via Unified And Customized Instruction Tuning For Chest Radiograph Interpretation**|Lijian Xu et.al.|[2311.01092v1](http://arxiv.org/abs/2311.01092v1)|**[link](https://github.com/medhk23/omnifm-dr)**|\n", "2311.01066": "|**2023-11-02**|**Dynamic Multimodal Information Bottleneck for Multimodality Classification**|Yingying Fang et.al.|[2311.01066v1](http://arxiv.org/abs/2311.01066v1)|**[link](https://github.com/bii-wushuang/dmib)**|\n", "2311.00807": "|**2023-11-01**|**VQA-GEN: A Visual Question Answering Benchmark for Domain Generalization**|Suraj Jyothi Unni et.al.|[2311.00807v1](http://arxiv.org/abs/2311.00807v1)|null|\n", "2311.00737": "|**2023-11-01**|**Real-Time Magnetic Tracking and Diagnosis of COVID-19 via Machine Learning**|Dang Nguyen et.al.|[2311.00737v1](http://arxiv.org/abs/2311.00737v1)|null|\n", "2311.01908": "|**2023-11-03**|**LLM-driven Multimodal Target Volume Contouring in Radiation Oncology**|Yujin Oh et.al.|[2311.01908v1](http://arxiv.org/abs/2311.01908v1)|null|\n", "2311.01886": "|**2023-11-03**|**Bridging the Gap between Multi-focus and Multi-modal: A Focused Integration Framework for Multi-modal Image Fusion**|Xilai Li et.al.|[2311.01886v1](http://arxiv.org/abs/2311.01886v1)|null|\n", "2311.01881": "|**2023-11-03**|**Quantitative Evaluation of a Multi-Modal Camera Setup for Fusing Event Data with RGB Images**|Julian Moosmann et.al.|[2311.01881v1](http://arxiv.org/abs/2311.01881v1)|null|\n", "2311.01831": "|**2023-11-03**|**Universal Multi-modal Multi-domain Pre-trained Recommendation**|Wenqi Sun et.al.|[2311.01831v1](http://arxiv.org/abs/2311.01831v1)|null|\n", "2311.01807": "|**2023-11-03**|**Cross-modal Consistency Learning with Fine-grained Fusion Network for Multimodal Fake News Detection**|Jun Li et.al.|[2311.01807v1](http://arxiv.org/abs/2311.01807v1)|**[link](https://github.com/uestc-lj/cffn)**|\n", "2311.01767": "|**2023-11-03**|**PPTC Benchmark: Evaluating Large Language Models for PowerPoint Task Completion**|Yiduo Guo et.al.|[2311.01767v1](http://arxiv.org/abs/2311.01767v1)|**[link](https://github.com/gydpku/pptc)**|\n", "2311.01766": "|**2023-11-03**|**Support or Refute: Analyzing the Stance of Evidence to Detect Out-of-Context Mis- and Disinformation**|Xin Yuan et.al.|[2311.01766v1](http://arxiv.org/abs/2311.01766v1)|null|\n", "2311.01740": "|**2023-11-03**|**SAC$^3$: Reliable Hallucination Detection in Black-Box Language Models via Semantic-aware Cross-check Consistency**|Jiaxin Zhang et.al.|[2311.01740v1](http://arxiv.org/abs/2311.01740v1)|null|\n", "2311.01734": "|**2023-11-03**|**MixCon3D: Synergizing Multi-View and Cross-Modal Contrastive Learning for Enhancing 3D Representation**|Yipeng Gao et.al.|[2311.01734v1](http://arxiv.org/abs/2311.01734v1)|**[link](https://github.com/ucsc-vlaa/mixcon3d)**|\n", "2311.01487": "|**2023-11-02**|**What Makes for Good Visual Instructions? Synthesizing Complex Visual Reasoning Instructions for Visual Instruction Tuning**|Yifan Du et.al.|[2311.01487v1](http://arxiv.org/abs/2311.01487v1)|**[link](https://github.com/rucaibox/comvint)**|\n", "2311.03328": "|**2023-11-06**|**On Asynchrony, Memory, and Communication: Separations and Landscapes**|Paola Flocchini et.al.|[2311.03328v1](http://arxiv.org/abs/2311.03328v1)|null|\n", "2311.03217": "|**2023-11-06**|**Leveraging Transformers to Improve Breast Cancer Classification and Risk Assessment with Multi-modal and Longitudinal Data**|Yiqiu Shen et.al.|[2311.03217v1](http://arxiv.org/abs/2311.03217v1)|null|\n", "2311.03106": "|**2023-11-06**|**Unified Multi-modal Unsupervised Representation Learning for Skeleton-based Action Understanding**|Shengkai Sun et.al.|[2311.03106v1](http://arxiv.org/abs/2311.03106v1)|**[link](https://github.com/huiguanlab/umurl)**|\n", "2311.03090": "|**2023-11-06**|**A multi-modal approach to continuous material identification through tactile sensing**|Augusto G\u00f3mez Egu\u00edluz et.al.|[2311.03090v1](http://arxiv.org/abs/2311.03090v1)|null|\n", "2311.03079": "|**2023-11-06**|**CogVLM: Visual Expert for Pretrained Language Models**|Weihan Wang et.al.|[2311.03079v1](http://arxiv.org/abs/2311.03079v1)|**[link](https://github.com/thudm/cogvlm)**|\n", "2311.02863": "|**2023-11-06**|**Temporal Shift -- Multi-Objective Loss Function for Improved Anomaly Fall Detection**|Stefan Denkovski et.al.|[2311.02863v1](http://arxiv.org/abs/2311.02863v1)|null|\n", "2311.02850": "|**2023-11-06**|**IR-STP: Enhancing Autonomous Driving with Interaction Reasoning in Spatio-Temporal Planning**|Yingbing Chen et.al.|[2311.02850v1](http://arxiv.org/abs/2311.02850v1)|**[link](https://github.com/chenyingbing/ir-stp-planner)**|\n", "2311.02842": "|**2023-11-06**|**An invariant feature extraction for multi-modal images matching**|Chenzhong Gao et.al.|[2311.02842v1](http://arxiv.org/abs/2311.02842v1)|null|\n", "2311.02820": "|**2023-11-06**|**Mesh Neural Cellular Automata**|Ehsan Pajouheshgar et.al.|[2311.02820v1](http://arxiv.org/abs/2311.02820v1)|null|\n", "2311.02782": "|**2023-11-05**|**Towards Generic Anomaly Detection and Understanding: Large-scale Visual-linguistic Model (GPT-4V) Takes the Lead**|Yunkang Cao et.al.|[2311.02782v1](http://arxiv.org/abs/2311.02782v1)|**[link](https://github.com/caoyunkang/gpt4v-for-generic-anomaly-detection)**|\n", "2311.02733": "|**2023-11-05**|**AV-Lip-Sync+: Leveraging AV-HuBERT to Exploit Multimodal Inconsistency for Video Deepfake Detection**|Sahibzada Adil Shahzad et.al.|[2311.02733v1](http://arxiv.org/abs/2311.02733v1)|null|\n", "2311.02559": "|**2023-11-05**|**Rotation Invariant Transformer for Recognizing Object in UAVs**|Shuoyi Chen et.al.|[2311.02559v1](http://arxiv.org/abs/2311.02559v1)|null|\n", "2311.02329": "|**2023-11-04**|**Complex Organ Mask Guided Radiology Report Generation**|Gu Tiancheng et.al.|[2311.02329v1](http://arxiv.org/abs/2311.02329v1)|**[link](https://github.com/garygutc/comg_model)**|\n", "2311.02282": "|**2023-11-04**|**Contrastive Multi-Modal Representation Learning for Spark Plug Fault Diagnosis**|Ardavan Modarres et.al.|[2311.02282v1](http://arxiv.org/abs/2311.02282v1)|null|\n", "2311.02248": "|**2023-11-03**|**COSMIC: Data Efficient Instruction-tuning For Speech In-Context Learning**|Jing Pan et.al.|[2311.02248v1](http://arxiv.org/abs/2311.02248v1)|null|\n", "2311.04219": "|**2023-11-07**|**OtterHD: A High-Resolution Multi-modality Model**|Bo Li et.al.|[2311.04219v1](http://arxiv.org/abs/2311.04219v1)|null|\n", "2311.04160": "|**2023-11-07**|**\"Tell me about that church\": Exploring the Design and User Experience of In-Vehicle Multi-modal Intuitive Interface in the Context of Driving Scenario**|Yueteng Yu et.al.|[2311.04160v1](http://arxiv.org/abs/2311.04160v1)|null|\n", "2311.04091": "|**2023-11-07**|**Proceedings of the 5th International Workshop on Reading Music Systems**|Jorge Calvo-Zaragoza et.al.|[2311.04091v1](http://arxiv.org/abs/2311.04091v1)|**[link](https://github.com/suziai/gui-tools)**|\n", "2311.04058": "|**2023-11-07**|**mmFUSION: Multimodal Fusion for 3D Objects Detection**|Javed Ahmad et.al.|[2311.04058v1](http://arxiv.org/abs/2311.04058v1)|null|\n", "2311.04056": "|**2023-11-07**|**Multi-View Causal Representation Learning with Partial Observability**|Dingling Yao et.al.|[2311.04056v1](http://arxiv.org/abs/2311.04056v1)|null|\n", "2311.03810": "|**2023-11-07**|**Rethinking and Improving Multi-task Learning for End-to-end Speech Translation**|Yuhao Zhang et.al.|[2311.03810v1](http://arxiv.org/abs/2311.03810v1)|**[link](https://github.com/xiaozhang521/imtl)**|\n", "2311.03620": "|**2023-11-07**|**FusionViT: Hierarchical 3D Object Detection via LiDAR-Camera Vision Transformer Fusion**|Xinhao Xiang et.al.|[2311.03620v1](http://arxiv.org/abs/2311.03620v1)|null|\n", "2311.03606": "|**2023-11-06**|**Multimodal Stress Detection Using Facial Landmarks and Biometric Signals**|Majid Hosseini et.al.|[2311.03606v1](http://arxiv.org/abs/2311.03606v1)|null|\n", "2311.03413": "|**2023-11-06**|**Discret2Di -- Deep Learning based Discretization for Model-based Diagnosis**|Lukas Moddemann et.al.|[2311.03413v1](http://arxiv.org/abs/2311.03413v1)|null|\n", "2311.04766": "|**2023-11-08**|**DualTalker: A Cross-Modal Dual Learning Approach for Speech-Driven 3D Facial Animation**|Guinan Su et.al.|[2311.04766v1](http://arxiv.org/abs/2311.04766v1)|null|\n", "2311.04678": "|**2023-11-08**|**Weakly supervised cross-model learning in high-content screening**|Watkinson Gabriel et.al.|[2311.04678v1](http://arxiv.org/abs/2311.04678v1)|null|\n", "2311.04589": "|**2023-11-08**|**TEAL: Tokenize and Embed ALL for Multi-modal Large Language Models**|Zhen Yang et.al.|[2311.04589v1](http://arxiv.org/abs/2311.04589v1)|null|\n", "2311.04563": "|**2023-11-08**|**Investigating the Nature of Disagreements on Mid-Scale Ratings: A Case Study on the Abstractness-Concreteness Continuum**|Urban Knuple\u0161 et.al.|[2311.04563v1](http://arxiv.org/abs/2311.04563v1)|null|\n", "2311.04552": "|**2023-11-08**|**A 3D generative model of pathological multi-modal MR images and segmentations**|Virginia Fernandez et.al.|[2311.04552v1](http://arxiv.org/abs/2311.04552v1)|**[link](https://github.com/virginiafdez/brainspade3d_rel)**|\n", "2311.04512": "|**2023-11-08**|**FFINet: Future Feedback Interaction Network for Motion Forecasting**|Miao Kang et.al.|[2311.04512v1](http://arxiv.org/abs/2311.04512v1)|null|\n", "2311.04507": "|**2023-11-08**|**Conversation Understanding using Relational Temporal Graph Neural Networks with Auxiliary Cross-Modality Interaction**|Cam-Van Thi Nguyen et.al.|[2311.04507v1](http://arxiv.org/abs/2311.04507v1)|null|\n", "2311.04390": "|**2023-11-07**|**Force-Constrained Visual Policy: Safe Robot-Assisted Dressing via Multi-Modal Sensing**|Zhanyi Sun et.al.|[2311.04390v1](http://arxiv.org/abs/2311.04390v1)|null|\n", "2311.04257": "|**2023-11-07**|**mPLUG-Owl2: Revolutionizing Multi-modal Large Language Model with Modality Collaboration**|Qinghao Ye et.al.|[2311.04257v1](http://arxiv.org/abs/2311.04257v1)|**[link](https://github.com/x-plug/mplug-owl)**|\n", "2311.05494": "|**2023-11-09**|**Object-centric Cross-modal Feature Distillation for Event-based Object Detection**|Lei Li et.al.|[2311.05494v1](http://arxiv.org/abs/2311.05494v1)|null|\n", "2311.05464": "|**2023-11-09**|**3DStyle-Diffusion: Pursuing Fine-grained Text-driven 3D Stylization with 2D Diffusion Models**|Haibo Yang et.al.|[2311.05464v1](http://arxiv.org/abs/2311.05464v1)|**[link](https://github.com/yanghb22-fdu/3dstyle-diffusion-official)**|\n", "2311.05463": "|**2023-11-09**|**ControlStyle: Text-Driven Stylized Image Generation Using Diffusion Priors**|Jingwen Chen et.al.|[2311.05463v1](http://arxiv.org/abs/2311.05463v1)|null|\n", "2311.05348": "|**2023-11-09**|**u-LLaVA: Unifying Multi-Modal Tasks via Large Language Model**|Jinjin Xu et.al.|[2311.05348v1](http://arxiv.org/abs/2311.05348v1)|null|\n", "2311.05319": "|**2023-11-09**|**TLCFuse: Temporal Multi-Modality Fusion Towards Occlusion-Aware Semantic Segmentation-Aided Motion Planning**|Gustavo Salazar-Gomez et.al.|[2311.05319v1](http://arxiv.org/abs/2311.05319v1)|null|\n", "2311.05298": "|**2023-11-09**|**Improving Vision-and-Language Reasoning via Spatial Relations Modeling**|Cheng Yang et.al.|[2311.05298v1](http://arxiv.org/abs/2311.05298v1)|null|\n", "2311.05152": "|**2023-11-09**|**Cross-modal Prompts: Adapting Large Pre-trained Models for Audio-Visual Downstream Tasks**|Haoyi Duan et.al.|[2311.05152v1](http://arxiv.org/abs/2311.05152v1)|**[link](https://github.com/haoyi-duan/dg-sct)**|\n", "2311.05032": "|**2023-11-08**|**Transfer learning from a sparsely annotated dataset of 3D medical images**|Gabriel Efrain Humpire-Mamani et.al.|[2311.05032v1](http://arxiv.org/abs/2311.05032v1)|**[link](https://github.com/diagnijmegen/medicaltransferlearning3d-unet)**|\n", "2311.05870": "|**2023-11-10**|**Automated Heterogeneous Low-Bit Quantization of Multi-Model Deep Learning Inference Pipeline**|Jayeeta Mondal et.al.|[2311.05870v1](http://arxiv.org/abs/2311.05870v1)|null|\n", "2311.05863": "|**2023-11-10**|**Watermarking Vision-Language Pre-trained Models for Multi-modal Embedding as a Service**|Yuanmin Tang et.al.|[2311.05863v1](http://arxiv.org/abs/2311.05863v1)|**[link](https://github.com/Pter61/vlpmarker)**|\n", "2311.05699": "|**2023-11-09**|**Cosmological parameter estimation with Genetic Algorithms**|Ricardo Medel-Esquivel et.al.|[2311.05699v1](http://arxiv.org/abs/2311.05699v1)|null|\n", "2311.05669": "|**2023-11-09**|**Multi-Modal Gaze Following in Conversational Scenarios**|Yuqi Hou et.al.|[2311.05669v1](http://arxiv.org/abs/2311.05669v1)|null|\n"}, "Point Cloud Localization": {"2301.05372": "|**2023-01-13**|**Text to Point Cloud Localization with Relation-Enhanced Transformer**|Guangzhi Wang et.al.|[2301.05372v1](http://arxiv.org/abs/2301.05372v1)|null|\n", "2209.15475": "|**2022-09-30**|**Point Cloud Quality Assessment using 3D Saliency Maps**|Zhengyu Wang et.al.|[2209.15475v1](http://arxiv.org/abs/2209.15475v1)|null|\n", "2207.05317": "|**2022-07-12**|**CPO: Change Robust Panorama to Point Cloud Localization**|Junho Kim et.al.|[2207.05317v1](http://arxiv.org/abs/2207.05317v1)|null|\n", "2205.14965": "|**2022-05-31**|**PSNet: Fast Data Structuring for Hierarchical Deep Learning on Point Cloud**|Luyang Li et.al.|[2205.14965v2](http://arxiv.org/abs/2205.14965v2)|**[link](https://github.com/lly007/pointstructuringnet)**|\n", "2203.15125": "|**2022-04-05**|**Text2Pos: Text-to-Point-Cloud Cross-Modal Localization**|Manuel Kolmet et.al.|[2203.15125v2](http://arxiv.org/abs/2203.15125v2)|null|\n", "2003.02392": "|**2021-11-22**|**PointLoc: Deep Pose Regressor for LiDAR Point Cloud Localization**|Wei Wang et.al.|[2003.02392v3](http://arxiv.org/abs/2003.02392v3)|**[link](https://github.com/loveoxford/vreloc)**|\n", "1812.01711": "|**2018-11-28**|**A Graph-CNN for 3D Point Cloud Classification**|Yingxue Zhang et.al.|[1812.01711v1](http://arxiv.org/abs/1812.01711v1)|**[link](https://github.com/maggie0106/Graph-CNN-in-3D-Point-Cloud-Classification)**|\n", "1712.06760": "|**2018-04-03**|**Mining Point Cloud Local Structures by Kernel Correlation and Graph Pooling**|Yiru Shen et.al.|[1712.06760v2](http://arxiv.org/abs/1712.06760v2)|null|\n", "1702.04114": "|**2017-02-14**|**Graph Based Over-Segmentation Methods for 3D Point Clouds**|Yizhak Ben-Shabat et.al.|[1702.04114v1](http://arxiv.org/abs/1702.04114v1)|null|\n"}, "Place Recognization": {"2302.06149": "|**2023-02-13**|**Contour Context: Abstract Structural Distribution for 3D LiDAR Loop Detection and Metric Pose Estimation**|Binqian Jiang et.al.|[2302.06149v1](http://arxiv.org/abs/2302.06149v1)|**[link](https://github.com/lewisjiang/contour-context)**|\n", "2301.05604": "|**2023-01-13**|**A LiDAR-Inertial-Visual SLAM System with Loop Detection**|Kangcheng Liu et.al.|[2301.05604v1](http://arxiv.org/abs/2301.05604v1)|null|\n", "2212.12745": "|**2022-12-24**|**GraffMatch: Global Matching of 3D Lines and Planes for Wide Baseline LiDAR Registration**|Parker C. Lusk et.al.|[2212.12745v1](http://arxiv.org/abs/2212.12745v1)|null|\n", "2211.14864": "|**2022-11-27**|**A Faster, Lighter and Stronger Deep Learning-Based Approach for Place Recognition**|Rui Huang et.al.|[2211.14864v1](http://arxiv.org/abs/2211.14864v1)|null|\n", "2211.12732": "|**2023-03-02**|**Wild-Places: A Large-Scale Dataset for Lidar Place Recognition in Unstructured Natural Environments**|Joshua Knights et.al.|[2211.12732v3](http://arxiv.org/abs/2211.12732v3)|**[link](https://github.com/csiro-robotics/Wild-Places)**|\n", "2210.13856": "|**2022-11-02**|**A Framework for Collaborative Multi-Robot Mapping using Spectral Graph Wavelets**|Lukas Bernreiter et.al.|[2210.13856v2](http://arxiv.org/abs/2210.13856v2)|null|\n", "2210.11029": "|**2022-10-20**|**DeepRING: Learning Roto-translation Invariant Representation for LiDAR based Place Recognition**|Sha Lu et.al.|[2210.11029v1](http://arxiv.org/abs/2210.11029v1)|null|\n", "2210.04432": "|**2023-03-06**|**Spectral Geometric Verification: Re-Ranking Point Cloud Retrieval for Metric Localization**|Kavisha Vidanapathirana et.al.|[2210.04432v2](http://arxiv.org/abs/2210.04432v2)|**[link](https://github.com/csiro-robotics/spectralgv)**|\n", "2210.04236": "|**2022-10-09**|**Fusing Event-based Camera and Radar for SLAM Using Spiking Neural Networks with Continual STDP Learning**|Ali Safa et.al.|[2210.04236v1](http://arxiv.org/abs/2210.04236v1)|null|\n", "2210.01320": "|**2022-11-23**|**Wi-Closure: Reliable and Efficient Search of Inter-robot Loop Closures Using Wireless Sensing**|Weiying Wang et.al.|[2210.01320v2](http://arxiv.org/abs/2210.01320v2)|null|\n", "2209.12513": "|**2022-09-26**|**NDD: A 3D Point Cloud Descriptor Based on Normal Distribution for Loop Closure Detection**|Ruihao Zhou et.al.|[2209.12513v1](http://arxiv.org/abs/2209.12513v1)|**[link](https://github.com/zhouruihao1001/ndd)**|\n", "2209.11894": "|**2022-09-24**|**Closing the Loop: Graph Networks to Unify Semantic Objects and Visual Features for Multi-object Scenes**|Jonathan J. Y. Kim et.al.|[2209.11894v1](http://arxiv.org/abs/2209.11894v1)|null|\n", "2209.09699": "|**2023-03-28**|**PADLoC: LiDAR-Based Deep Loop Closure Detection and Registration Using Panoptic Attention**|Jos\u00e9 Arce et.al.|[2209.09699v3](http://arxiv.org/abs/2209.09699v3)|**[link](https://github.com/robot-learning-freiburg/PADLoC)**|\n", "2209.08608": "|**2022-09-18**|**HGI-SLAM: Loop Closure With Human and Geometric Importance Features**|Shuhul Mujoo et.al.|[2209.08608v1](http://arxiv.org/abs/2209.08608v1)|null|\n", "2209.08578": "|**2022-09-18**|**Data-driven Loop Closure Detection in Bathymetric Point Clouds for Underwater SLAM**|Jiarui Tan et.al.|[2209.08578v1](http://arxiv.org/abs/2209.08578v1)|**[link](https://github.com/tjr16/bathy_nn_learning)**|\n", "2209.06779": "|**2022-10-15**|**Efficient Planar Pose Estimation via UWB Measurements**|Haodong Jiang et.al.|[2209.06779v3](http://arxiv.org/abs/2209.06779v3)|**[link](https://github.com/SLAMLab-CUHKSZ/Efficient-Pose-Estimation-via-UWB-measurements)**|\n", "2209.06545": "|**2023-01-12**|**Tac2Structure: Object Surface Reconstruction Only through Multi Times Touch**|Junyuan Lu et.al.|[2209.06545v3](http://arxiv.org/abs/2209.06545v3)|**[link](https://github.com/ljy-zju/tac2structure)**|\n", "2209.04497": "|**2022-09-09**|**General Place Recognition Survey: Towards the Real-world Autonomy Age**|Peng Yin et.al.|[2209.04497v1](http://arxiv.org/abs/2209.04497v1)|**[link](https://github.com/MetaSLAM/GPRS)**|\n", "2207.10916": "|**2022-07-22**|**PLD-SLAM: A Real-Time Visual SLAM Using Points and Line Segments in Dynamic Scenes**|BaoSheng Zhang et.al.|[2207.10916v1](http://arxiv.org/abs/2207.10916v1)|null|\n", "2207.06965": "|**2022-09-28**|**AutoMerge: A Framework for Map Assembling and Smoothing in City-scale Environments**|Peng Yin et.al.|[2207.06965v3](http://arxiv.org/abs/2207.06965v3)|null|\n", "2207.06738": "|**2022-07-14**|**Semi-supervised Vector-Quantization in Visual SLAM using HGCN**|Amir Zarringhalam et.al.|[2207.06738v1](http://arxiv.org/abs/2207.06738v1)|null|\n", "2207.06732": "|**2022-07-14**|**Self-supervised Vector-Quantization in Visual SLAM using Deep Convolutional Autoencoders**|Amir Zarringhalam et.al.|[2207.06732v1](http://arxiv.org/abs/2207.06732v1)|null|\n", "2206.12628": "|**2022-09-27**|**FreSCo: Frequency-Domain Scan Context for LiDAR-based Place Recognition with Translation and Rotation Invariance**|Yongzhi Fan et.al.|[2206.12628v2](http://arxiv.org/abs/2206.12628v2)|**[link](https://github.com/soytony/fresco)**|\n", "2206.08733": "|**2022-06-17**|**Efficient WiFi LiDAR SLAM for Autonomous Robots in Large Environments**|Khairuldanial Ismail et.al.|[2206.08733v1](http://arxiv.org/abs/2206.08733v1)|null|\n", "2205.13135": "|**2022-07-09**|**LAMP 2.0: A Robust Multi-Robot SLAM System for Operation in Challenging Large-Scale Underground Environments**|Yun Chang et.al.|[2205.13135v3](http://arxiv.org/abs/2205.13135v3)|**[link](https://github.com/nebula-autonomy/nebula-multirobot-dataset)**|\n", "2204.12831": "|**2022-11-09**|**The Revisiting Problem in Simultaneous Localization and Mapping: A Survey on Visual Loop Closure Detection**|Konstantinos A. Tsintotas et.al.|[2204.12831v3](http://arxiv.org/abs/2204.12831v3)|null|\n", "2204.05481": "|**2022-04-12**|**HiTPR: Hierarchical Transformer for Place Recognition in Point Cloud**|Zhixing Hou et.al.|[2204.05481v1](http://arxiv.org/abs/2204.05481v1)|null|\n", "2204.04932": "|**2022-04-11**|**Optimized SC-F-LOAM: Optimized Fast LiDAR Odometry and Mapping Using Scan Context**|Lizhou Liao et.al.|[2204.04932v1](http://arxiv.org/abs/2204.04932v1)|**[link](https://github.com/SlamCabbage/Optimized-SC-F-LOAM)**|\n", "2204.01524": "|**2022-04-01**|**Bi-directional Loop Closure for Visual SLAM**|Ihtisham Ali et.al.|[2204.01524v1](http://arxiv.org/abs/2204.01524v1)|null|\n", "2203.03454": "|**2022-03-07**|**Multi-Modal Lidar Dataset for Benchmarking General-Purpose Localization and Mapping Algorithms**|Qingqing Li et.al.|[2203.03454v1](http://arxiv.org/abs/2203.03454v1)|**[link](https://github.com/tiers/tiers-lidars-dataset)**|\n", "2201.13360": "|**2022-06-20**|**Hydra: A Real-time Spatial Perception System for 3D Scene Graph Construction and Optimization**|Nathan Hughes et.al.|[2201.13360v2](http://arxiv.org/abs/2201.13360v2)|null|\n", "2201.09048": "|**2022-01-22**|**Phase-SLAM: Phase Based Simultaneous Localization and Mapping for Mobile Structured Light Illumination Systems**|Xi Zheng et.al.|[2201.09048v1](http://arxiv.org/abs/2201.09048v1)|**[link](https://github.com/zhengxi-git/phase-slam)**|\n", "2201.03212": "|**2022-01-10**|**Why-So-Deep: Towards Boosting Previously Trained Models for Visual Place Recognition**|M. Usman Maqbool Bhutta et.al.|[2201.03212v1](http://arxiv.org/abs/2201.03212v1)|**[link](https://github.com/UsmanMaqbool/why-so-deep)**|\n", "2111.14990": "|**2021-11-29**|**MIXER: A Principled Framework for Multimodal, Multiway Data Association**|Parker C. Lusk et.al.|[2111.14990v1](http://arxiv.org/abs/2111.14990v1)|null|\n", "2111.13838": "|**2021-11-27**|**DSC: Deep Scan Context Descriptor for Large-Scale Place Recognition**|Jiafeng Cui et.al.|[2111.13838v1](http://arxiv.org/abs/2111.13838v1)|null|\n", "2111.13826": "|**2021-11-27**|**Average Outward Flux Skeletons for Environment Mapping and Topology Matching**|Morteza Rezanejad et.al.|[2111.13826v1](http://arxiv.org/abs/2111.13826v1)|null|\n", "2111.00440": "|**2022-02-27**|**Loop closure detection using local 3D deep descriptors**|Youjie Zhou et.al.|[2111.00440v2](http://arxiv.org/abs/2111.00440v2)|**[link](https://github.com/yiming107/l3d_loop_closure)**|\n", "2110.11491": "|**2021-10-21**|**SymbioLCD: Ensemble-Based Loop Closure Detection using CNN-Extracted Objects and Visual Bag-of-Words**|Jonathan J. Y. Kim et.al.|[2110.11491v1](http://arxiv.org/abs/2110.11491v1)|null|\n", "2109.08975": "|**2022-03-09**|**AirLoop: Lifelong Loop Closure Detection**|Dasong Gao et.al.|[2109.08975v3](http://arxiv.org/abs/2109.08975v3)|**[link](https://github.com/wang-chen/airloop)**|\n", "2109.06596": "|**2021-09-14**|**GPGM-SLAM: a Robust SLAM System for Unstructured Planetary Environments with Gaussian Process Gradient Maps**|Riccardo Giubilato et.al.|[2109.06596v1](http://arxiv.org/abs/2109.06596v1)|null|\n", "2108.12790": "|**2022-08-28**|**RPR-Net: A Point Cloud-based Rotation-aware Large Scale Place Recognition Network**|Zhaoxin Fan et.al.|[2108.12790v3](http://arxiv.org/abs/2108.12790v3)|null|\n", "2108.02028": "|**2021-08-04**|**Incorporating Learnt Local and Global Embeddings into Monocular Visual SLAM**|Huaiyang Huang et.al.|[2108.02028v1](http://arxiv.org/abs/2108.02028v1)|null|\n", "2108.01383": "|**2021-08-03**|**On the descriptive power of LiDAR intensity images for segment-based loop closing in 3-D SLAM**|Jan Wietrzykowski et.al.|[2108.01383v1](http://arxiv.org/abs/2108.01383v1)|**[link](https://github.com/LRMPUT/segmap_vis_views)**|\n", "2107.14611": "|**2021-07-30**|**Automatic Vocabulary and Graph Verification for Accurate Loop Closure Detection**|Haosong Yue et.al.|[2107.14611v1](http://arxiv.org/abs/2107.14611v1)|null|\n", "2107.07707": "|**2021-07-16**|**Probabilistic Appearance-Invariant Topometric Localization with New Place Awareness**|Ming Xu et.al.|[2107.07707v1](http://arxiv.org/abs/2107.07707v1)|**[link](https://github.com/mingu6/TopometricLoc)**|\n", "2107.07133": "|**2021-07-15**|**A life-long SLAM approach using adaptable local maps based on rasterized LIDAR images**|Waqas Ali et.al.|[2107.07133v1](http://arxiv.org/abs/2107.07133v1)|null|\n", "2106.11516": "|**2021-07-01**|**SA-LOAM: Semantic-aided LiDAR SLAM with Loop Closure**|Lin Li et.al.|[2106.11516v2](http://arxiv.org/abs/2106.11516v2)|null|\n", "2106.09637": "|**2023-01-04**|**AttDLNet: Attention-based DL Network for 3D LiDAR Place Recognition**|Tiago Barros et.al.|[2106.09637v4](http://arxiv.org/abs/2106.09637v4)|**[link](https://github.com/cybonic/attdlnet)**|\n", "2105.11344": "|**2021-05-24**|**OverlapNet: Loop Closing for LiDAR-based SLAM**|Xieyuanli Chen et.al.|[2105.11344v1](http://arxiv.org/abs/2105.11344v1)|**[link](https://github.com/PRBonn/OverlapNet)**|\n", "2103.12292": "|**2021-03-23**|**NDT-Transformer: Large-Scale 3D Point Cloud Localisation using the Normal Distribution Transform Representation**|Zhicheng Zhou et.al.|[2103.12292v1](http://arxiv.org/abs/2103.12292v1)|**[link](https://github.com/dachengxiaocheng/NDT-Transformer)**|\n", "2303.00477": "|**2023-03-01**|**ORCHNet: A Robust Global Feature Aggregation approach for 3D LiDAR-based Place recognition in Orchards**|T. Barros et.al.|[2303.00477v1](http://arxiv.org/abs/2303.00477v1)|**[link](https://github.com/cybonic/orchnet)**|\n", "2303.00295": "|**2023-03-01**|**Region Prediction for Efficient Robot Localization on Large Maps**|Matteo Scucchia et.al.|[2303.00295v1](http://arxiv.org/abs/2303.00295v1)|null|\n", "2304.03872": "|**2023-06-24**|**LSGDDN-LCD: An Appearance-based Loop Closure Detection using Local Superpixel Grid Descriptors and Incremental Dynamic Nodes**|Baosheng Zhang et.al.|[2304.03872v2](http://arxiv.org/abs/2304.03872v2)|null|\n", "2304.05146": "|**2023-04-14**|**Loop Closure Detection Based on Object-level Spatial Layout and Semantic Consistency**|Xingwu Ji et.al.|[2304.05146v2](http://arxiv.org/abs/2304.05146v2)|**[link](https://github.com/jixingwu/ss-lcd)**|\n", "2304.13487": "|**2023-04-26**|**Hydra-Multi: Collaborative Online Construction of 3D Scene Graphs with Multi-Robot Teams**|Yun Chang et.al.|[2304.13487v1](http://arxiv.org/abs/2304.13487v1)|null|\n", "2305.07154": "|**2023-05-11**|**Foundations of Spatial Perception for Robotics: Hierarchical Representations and Real-time Systems**|Nathan Hughes et.al.|[2305.07154v1](http://arxiv.org/abs/2305.07154v1)|**[link](https://github.com/mit-spark/hydra)**|\n", "2305.18013": "|**2023-05-29**|**TReR: A Lightweight Transformer Re-Ranking Approach for 3D LiDAR Place Recognition**|Tiago Barros et.al.|[2305.18013v1](http://arxiv.org/abs/2305.18013v1)|null|\n", "2307.04321": "|**2023-07-10**|**RaPlace: Place Recognition for Imaging Radar using Radon Transform and Mutable Threshold**|Hyesu Jang et.al.|[2307.04321v1](http://arxiv.org/abs/2307.04321v1)|**[link](https://github.com/hyesu-jang/raplace)**|\n", "2307.08221": "|**2023-07-17**|**NDT-Map-Code: A 3D global descriptor for real-time loop closure detection in lidar SLAM**|Lizhou Liao et.al.|[2307.08221v1](http://arxiv.org/abs/2307.08221v1)|**[link](https://github.com/SlamCabbage/NDTMC)**|\n", "2307.09044": "|**2023-07-18**|**3D-SeqMOS: A Novel Sequential 3D Moving Object Segmentation in Autonomous Driving**|Qipeng Li et.al.|[2307.09044v1](http://arxiv.org/abs/2307.09044v1)|null|\n", "2309.02394": "|**2023-09-05**|**Magnetic Navigation using Attitude-Invariant Magnetic Field Information for Loop Closure Detection**|Natalia Pavlasek et.al.|[2309.02394v1](http://arxiv.org/abs/2309.02394v1)|null|\n", "2309.07094": "|**2023-09-13**|**RadarLCD: Learnable Radar-based Loop Closure Detection Pipeline**|Mirko Usuelli et.al.|[2309.07094v1](http://arxiv.org/abs/2309.07094v1)|null|\n", "2309.09879": "|**2023-09-18**|**DynaPix SLAM: A Pixel-Based Dynamic SLAM Approach**|Chenghao Xu et.al.|[2309.09879v1](http://arxiv.org/abs/2309.09879v1)|null|\n", "2309.08914": "|**2023-09-16**|**Outram: One-shot Global Localization via Triangulated Scene Graph and Global Outlier Pruning**|Pengyu Yin et.al.|[2309.08914v1](http://arxiv.org/abs/2309.08914v1)|**[link](https://github.com/pamphlett/outram)**|\n"}, "LiDAR SLAM": {"2212.14209": "|**2022-12-29**|**An Enhanced LiDAR-Inertial SLAM System for Robotics Localization and Mapping**|Kangcheng Liu et.al.|[2212.14209v1](http://arxiv.org/abs/2212.14209v1)|**[link](https://github.com/KangchengLiu/slam_resources)**|\n", "2212.05705": "|**2022-12-12**|**An Integrated LiDAR-SLAM System for Complex Environment with Noisy Point Clouds**|Kangcheng Liu et.al.|[2212.05705v1](http://arxiv.org/abs/2212.05705v1)|**[link](https://github.com/KangchengLiu/DLC_LiDAR_SLAM)**|\n", "2212.02077": "|**2022-12-05**|**DL-SLOT: Dynamic LiDAR SLAM and object tracking based on collaborative graph optimization**|Xuebo Tian et.al.|[2212.02077v1](http://arxiv.org/abs/2212.02077v1)|null|\n", "2211.03484": "|**2022-11-07**|**When Geometry is not Enough: Using Reflector Markers in Lidar SLAM**|Gerhard Kurz et.al.|[2211.03484v1](http://arxiv.org/abs/2211.03484v1)|null|\n", "2211.02445": "|**2023-04-14**|**Lidar-level localization with radar? The CFEAR approach to accurate, fast and robust large-scale radar odometry in diverse environments**|Daniel Adolfsson et.al.|[2211.02445v3](http://arxiv.org/abs/2211.02445v3)|**[link](https://github.com/dan11003/CFEAR_Radarodometry_code_public)**|\n", "2210.11978": "|**2023-04-13**|**DCL-SLAM: A Distributed Collaborative LiDAR SLAM Framework for a Robotic Swarm**|Shipeng Zhong et.al.|[2210.11978v2](http://arxiv.org/abs/2210.11978v2)|**[link](https://github.com/pengyu-team/dcl-slam)**|\n", "2210.00812": "|**2022-10-03**|**A Benchmark for Multi-Modal Lidar SLAM with Ground Truth in GNSS-Denied Environments**|Ha Sier et.al.|[2210.00812v1](http://arxiv.org/abs/2210.00812v1)|**[link](https://github.com/tiers/tiers-lidars-dataset-enhanced)**|\n", "2209.08810": "|**2022-09-19**|**LMBAO: A Landmark Map for Bundle Adjustment Odometry in LiDAR SLAM**|Letian Zhang et.al.|[2209.08810v1](http://arxiv.org/abs/2209.08810v1)|null|\n", "2209.08248": "|**2022-09-29**|**PlaneSLAM: Plane-based LiDAR SLAM for Motion Planning in Structured 3D Environments**|Adam Dai et.al.|[2209.08248v2](http://arxiv.org/abs/2209.08248v2)|**[link](https://github.com/stanford-navlab/planeslam)**|\n", "2209.08091": "|**2022-09-16**|**ViWiD: Leveraging WiFi for Robust and Resource-Efficient SLAM**|Aditya Arun et.al.|[2209.08091v1](http://arxiv.org/abs/2209.08091v1)|null|\n", "2208.11855": "|**2022-08-25**|**Lidar SLAM for Autonomous Driving Vehicles**|Farhad Aghili et.al.|[2208.11855v1](http://arxiv.org/abs/2208.11855v1)|null|\n", "2208.09777": "|**2022-09-08**|**JVLDLoc: a Joint Optimization of Visual-LiDAR Constraints and Direction Priors for Localization in Driving Scenario**|Longrui Dong et.al.|[2208.09777v3](http://arxiv.org/abs/2208.09777v3)|null|\n", "2208.07473": "|**2022-11-18**|**BoW3D: Bag of Words for Real-Time Loop Closing in 3D LiDAR SLAM**|Yunge Cui et.al.|[2208.07473v2](http://arxiv.org/abs/2208.07473v2)|**[link](https://github.com/yungecui/bow3d)**|\n", "2207.06815": "|**2022-07-14**|**Challenges of SLAM in extremely unstructured environments: the DLR Planetary Stereo, Solid-State LiDAR, Inertial Dataset**|Riccardo Giubilato et.al.|[2207.06815v1](http://arxiv.org/abs/2207.06815v1)|null|\n", "2206.09463": "|**2022-06-19**|**RF-LIO: Removal-First Tightly-coupled Lidar Inertial Odometry in High Dynamic Environments**|Chenglong Qian et.al.|[2206.09463v1](http://arxiv.org/abs/2206.09463v1)|null|\n", "2206.08733": "|**2022-06-17**|**Efficient WiFi LiDAR SLAM for Autonomous Robots in Large Environments**|Khairuldanial Ismail et.al.|[2206.08733v1](http://arxiv.org/abs/2206.08733v1)|null|\n", "2206.00266": "|**2022-06-01**|**PaGO-LOAM: Robust Ground-Optimized LiDAR Odometry**|Dong-Uk Seo et.al.|[2206.00266v1](http://arxiv.org/abs/2206.00266v1)|**[link](https://github.com/url-kaist/alterground-lego-loam)**|\n", "2205.08556": "|**2022-05-17**|**Global Data Association for SLAM with 3D Grassmannian Manifold Objects**|Parker C. Lusk et.al.|[2205.08556v1](http://arxiv.org/abs/2205.08556v1)|null|\n", "2204.12769": "|**2022-04-27**|**Dynamic Registration: Joint Ego Motion Estimation and 3D Moving Object Detection in Dynamic Environment**|Wenyu Li et.al.|[2204.12769v1](http://arxiv.org/abs/2204.12769v1)|null|\n", "2204.08163": "|**2022-04-18**|**Mapping While Following: 2D LiDAR SLAM in Indoor Dynamic Environments with a Person Tracker**|Hanjing Ye et.al.|[2204.08163v1](http://arxiv.org/abs/2204.08163v1)|null|\n", "2203.13799": "|**2022-03-25**|**Gravity-constrained point cloud registration**|Vladim\u00edr Kubelka et.al.|[2203.13799v1](http://arxiv.org/abs/2203.13799v1)|null|\n", "2202.11431": "|**2022-02-23**|**DL-SLOT: Dynamic Lidar SLAM and Object Tracking Based On Graph Optimization**|Xuebo Tian et.al.|[2202.11431v1](http://arxiv.org/abs/2202.11431v1)|null|\n", "2201.06423": "|**2022-01-17**|**SC-LiDAR-SLAM: a Front-end Agnostic Versatile LiDAR SLAM System**|Giseop Kim et.al.|[2201.06423v1](http://arxiv.org/abs/2201.06423v1)|null|\n", "2110.11517": "|**2021-10-21**|**Real-Time Ground-Plane Refined LiDAR SLAM**|Fan Yang et.al.|[2110.11517v1](http://arxiv.org/abs/2110.11517v1)|null|\n", "2110.02018": "|**2021-10-03**|**AEROS: Adaptive RObust least-Squares for Graph-Based SLAM**|Milad Ramezani et.al.|[2110.02018v1](http://arxiv.org/abs/2110.02018v1)|null|\n", "2109.05483": "|**2021-09-12**|**ART-SLAM: Accurate Real-Time 6DoF LiDAR SLAM**|Matteo Frosi et.al.|[2109.05483v1](http://arxiv.org/abs/2109.05483v1)|**[link](https://github.com/matteof94/artslam)**|\n", "2109.00200": "|**2021-09-01**|**A real-time global re-localization framework for 3D LiDAR SLAM**|Ziqi Chai et.al.|[2109.00200v1](http://arxiv.org/abs/2109.00200v1)|null|\n", "2108.01383": "|**2021-08-03**|**On the descriptive power of LiDAR intensity images for segment-based loop closing in 3-D SLAM**|Jan Wietrzykowski et.al.|[2108.01383v1](http://arxiv.org/abs/2108.01383v1)|**[link](https://github.com/LRMPUT/segmap_vis_views)**|\n", "2107.05283": "|**2021-07-12**|**Benchmark of visual and 3D lidar SLAM systems in simulation environment for vineyards**|Ibrahim Hroob et.al.|[2107.05283v1](http://arxiv.org/abs/2107.05283v1)|null|\n", "2106.11516": "|**2021-07-01**|**SA-LOAM: Semantic-aided LiDAR SLAM with Loop Closure**|Lin Li et.al.|[2106.11516v2](http://arxiv.org/abs/2106.11516v2)|null|\n", "2105.08941": "|**2021-05-19**|**Large-scale Localization Datasets in Crowded Indoor Spaces**|Donghwan Lee et.al.|[2105.08941v1](http://arxiv.org/abs/2105.08941v1)|null|\n", "2105.03296": "|**2021-10-05**|**VIRAL SLAM: Tightly Coupled Camera-IMU-UWB-Lidar SLAM**|Thien-Minh Nguyen et.al.|[2105.03296v3](http://arxiv.org/abs/2105.03296v3)|null|\n", "2104.05347": "|**2021-04-12**|**Radar SLAM: A Robust SLAM System for All Weather Conditions**|Ziyang Hong et.al.|[2104.05347v1](http://arxiv.org/abs/2104.05347v1)|null|\n", "2104.03657": "|**2021-04-08**|**Dynamic Object Aware LiDAR SLAM based on Automatic Generation of Training Data**|Patrick Pfreundschuh et.al.|[2104.03657v1](http://arxiv.org/abs/2104.03657v1)|null|\n", "2103.13090": "|**2021-03-24**|**Greedy-Based Feature Selection for Efficient LiDAR SLAM**|Jianhao Jiao et.al.|[2103.13090v1](http://arxiv.org/abs/2103.13090v1)|null|\n", "2103.10678": "|**2021-03-19**|**6-DOF Feature based LIDAR SLAM using ORB Features from Rasterized Images of 3D LIDAR Point Cloud**|Waqas Ali et.al.|[2103.10678v1](http://arxiv.org/abs/2103.10678v1)|null|\n", "2103.09523": "|**2021-12-30**|**A Universal LiDAR SLAM Accelerator System on Low-cost FPGA**|Keisuke Sugiura et.al.|[2103.09523v2](http://arxiv.org/abs/2103.09523v2)|null|\n", "2103.05056": "|**2022-02-08**|**LCDNet: Deep Loop Closure Detection and Point Cloud Registration for LiDAR SLAM**|Daniele Cattaneo et.al.|[2103.05056v4](http://arxiv.org/abs/2103.05056v4)|**[link](https://github.com/robot-learning-freiburg/LCDNet)**|\n", "2103.03713": "|**2021-03-05**|**Ground-SLAM: Ground Constrained LiDAR SLAM for Structured Multi-Floor Environments**|Xin Wei et.al.|[2103.03713v1](http://arxiv.org/abs/2103.03713v1)|null|\n", "2102.03800": "|**2021-02-17**|**Lightweight 3-D Localization and Mapping for Solid-State LiDAR**|Han Wang et.al.|[2102.03800v2](http://arxiv.org/abs/2102.03800v2)|**[link](https://github.com/wh200720041/SSL_SLAM)**|\n", "2102.03798": "|**2021-02-17**|**Intensity-SLAM: Intensity Assisted Localization and Mapping for Large Scale Environment**|Han Wang et.al.|[2102.03798v2](http://arxiv.org/abs/2102.03798v2)|**[link](https://github.com/wh200720041/intensity_slam)**|\n", "2102.03771": "|**2021-04-27**|**MULLS: Versatile LiDAR SLAM via Multi-metric Linear Least Square**|Yue Pan et.al.|[2102.03771v3](http://arxiv.org/abs/2102.03771v3)|**[link](https://github.com/YuePanEdward/MULLS)**|\n", "2101.06615": "|**2021-05-31**|**Online Robust Sliding-Windowed LiDAR SLAM in Natural Environments**|Quang-Ha Pham et.al.|[2101.06615v6](http://arxiv.org/abs/2101.06615v6)|null|\n", "2012.03455": "|**2020-12-07**|**TP-TIO: A Robust Thermal-Inertial Odometry with Deep ThermalPoint**|Shibo Zhao et.al.|[2012.03455v1](http://arxiv.org/abs/2012.03455v1)|null|\n", "2012.02399": "|**2020-12-04**|**P3-LOAM: PPP/LiDAR Loosely Coupled SLAM with Accurate Covariance Estimation and Robust RAIM in Urban Canyon Environment**|Tao Li et.al.|[2012.02399v1](http://arxiv.org/abs/2012.02399v1)|null|\n", "2011.11357": "|**2020-11-23**|**CamVox: A Low-cost and Accurate Lidar-assisted Visual SLAM System**|Yuewen Zhu et.al.|[2011.11357v1](http://arxiv.org/abs/2011.11357v1)|**[link](https://github.com/ISEE-Technology/CamVox)**|\n", "2011.02306": "|**2021-09-11**|**A Comparison of LiDAR-based SLAM Systems for Control of Unmanned Aerial Vehicles**|Robert Milijas et.al.|[2011.02306v3](http://arxiv.org/abs/2011.02306v3)|null|\n", "2010.08215": "|**2021-01-13**|**BALM: Bundle Adjustment for Lidar Mapping**|Zheng Liu et.al.|[2010.08215v2](http://arxiv.org/abs/2010.08215v2)|**[link](https://github.com/hku-mars/BALM)**|\n", "2008.03694": "|**2020-08-09**|**LiDAR Data Enrichment Using Deep Learning Based on High-Resolution Image: An Approach to Achieve High-Performance LiDAR SLAM Using Low-cost LiDAR**|Jiang Yue et.al.|[2008.03694v1](http://arxiv.org/abs/2008.03694v1)|null|\n", "2008.02274": "|**2020-08-05**|**Elasticity Meets Continuous-Time: Map-Centric Dense 3D LiDAR SLAM**|Chanoh Park et.al.|[2008.02274v1](http://arxiv.org/abs/2008.02274v1)|null|\n", "2302.13613": "|**2023-03-13**|**Evaluation of Lidar-based 3D SLAM algorithms in SubT environment**|Anton Koval et.al.|[2302.13613v2](http://arxiv.org/abs/2302.13613v2)|null|\n", "2303.01155": "|**2023-04-07**|**Marker-based Visual SLAM leveraging Hierarchical Representations**|Ali Tourani et.al.|[2303.01155v2](http://arxiv.org/abs/2303.01155v2)|null|\n", "2303.05252": "|**2023-03-09**|**SLAMesh: Real-time LiDAR Simultaneous Localization and Meshing**|Jianyuan Ruan et.al.|[2303.05252v1](http://arxiv.org/abs/2303.05252v1)|**[link](https://github.com/RuanJY/SLAMesh)**|\n", "2305.01843": "|**2023-05-03**|**Direct LiDAR-Inertial Odometry and Mapping: Perceptive and Connective SLAM**|Kenny Chen et.al.|[2305.01843v1](http://arxiv.org/abs/2305.01843v1)|null|\n", "2306.03660": "|**2023-06-06**|**PQM: A Point Quality Evaluation Metric for Dense Maps**|Yash Turkar et.al.|[2306.03660v1](http://arxiv.org/abs/2306.03660v1)|**[link](https://github.com/droneslab/pqm-sim)**|\n", "2307.08221": "|**2023-07-17**|**NDT-Map-Code: A 3D global descriptor for real-time loop closure detection in lidar SLAM**|Lizhou Liao et.al.|[2307.08221v1](http://arxiv.org/abs/2307.08221v1)|**[link](https://github.com/SlamCabbage/NDTMC)**|\n", "2307.09044": "|**2023-07-18**|**3D-SeqMOS: A Novel Sequential 3D Moving Object Segmentation in Autonomous Driving**|Qipeng Li et.al.|[2307.09044v1](http://arxiv.org/abs/2307.09044v1)|null|\n", "2307.15005": "|**2023-07-27**|**FLiCR: A Fast and Lightweight LiDAR Point Cloud Compression Based on Lossy RI**|Jin Heo et.al.|[2307.15005v1](http://arxiv.org/abs/2307.15005v1)|null|\n", "2309.04937": "|**2023-09-12**|**LONER: LiDAR Only Neural Representations for Real-Time SLAM**|Seth Isaacson et.al.|[2309.04937v2](http://arxiv.org/abs/2309.04937v2)|null|\n", "2309.08086": "|**2023-09-15**|**Fast and Accurate Deep Loop Closing and Relocalization for Reliable LiDAR SLAM**|Chenghao Shi et.al.|[2309.08086v1](http://arxiv.org/abs/2309.08086v1)|null|\n", "2311.00928": "|**2023-11-02**|**Quatro++: Robust Global Registration Exploiting Ground Segmentation for Loop Closing in LiDAR SLAM**|Hyungtae Lim et.al.|[2311.00928v1](http://arxiv.org/abs/2311.00928v1)|null|\n", "2311.02327": "|**2023-11-04**|**ECMD: An Event-Centric Multisensory Driving Dataset for SLAM**|Peiyu Chen et.al.|[2311.02327v1](http://arxiv.org/abs/2311.02327v1)|null|\n"}, "Transformer": {"2302.08104": "|**2023-02-16**|**Multiscalar field cosmological model and possible solutions using Noether symmetry approach**|Santu Mondal et.al.|[2302.08104v1](http://arxiv.org/abs/2302.08104v1)|null|\n", "2301.11622": "|**2023-01-30**|**Darboux transformations for Dunkl-Schroedinger equations with energy dependent potential and position dependent mass**|Axel Schulze-Halberg et.al.|[2301.11622v2](http://arxiv.org/abs/2301.11622v2)|null|\n", "2301.09364": "|**2023-04-06**|**On uniqueness of submaximally symmetric vector ordinary differential equations of C-class**|Johnson Allen Kessy et.al.|[2301.09364v2](http://arxiv.org/abs/2301.09364v2)|null|\n", "2301.08739": "|**2023-03-30**|**FlatFormer: Flattened Window Attention for Efficient Point Cloud Transformer**|Zhijian Liu et.al.|[2301.08739v2](http://arxiv.org/abs/2301.08739v2)|null|\n", "2301.07301": "|**2023-01-18**|**PTA-Det: Point Transformer Associating Point cloud and Image for 3D Object Detection**|Rui Wan et.al.|[2301.07301v1](http://arxiv.org/abs/2301.07301v1)|null|\n", "2301.02650": "|**2023-01-06**|**Model-Agnostic Hierarchical Attention for 3D Object Detection**|Manli Shu et.al.|[2301.02650v1](http://arxiv.org/abs/2301.02650v1)|null|\n", "2212.13736": "|**2022-12-28**|**Hermitian Topologies originating from non-Hermitian braidings**|W. B. Rui et.al.|[2212.13736v1](http://arxiv.org/abs/2212.13736v1)|null|\n", "2212.13276": "|**2022-12-26**|**Generalization of non-Cartan Symmetries to arbitrary dimensions**|J. C. Ndogmo et.al.|[2212.13276v1](http://arxiv.org/abs/2212.13276v1)|null|\n", "2212.13244": "|**2022-12-26**|**Equivalence classes and Linearization of the Riccati and Abel chain**|J. C. Ndogmo et.al.|[2212.13244v1](http://arxiv.org/abs/2212.13244v1)|null|\n", "2211.12510": "|**2022-11-22**|**Reconstructing the Image Scanning Microscopy Dataset: an Inverse Problem**|Alessandro Zunino et.al.|[2211.12510v1](http://arxiv.org/abs/2211.12510v1)|null|\n", "2211.02079": "|**2022-11-03**|**On Darboux non-integrability of the Hietarinta equation**|S. Ya. Startsev et.al.|[2211.02079v1](http://arxiv.org/abs/2211.02079v1)|null|\n", "2210.15933": "|**2022-10-28**|**PSFormer: Point Transformer for 3D Salient Object Detection**|Baian Chen et.al.|[2210.15933v1](http://arxiv.org/abs/2210.15933v1)|null|\n", "2210.06668": "|**2022-11-05**|**Aspects of the Equivalence Between the $f^\u03bc$ and $c^{\u03bd\u03bc}$ Terms in Lorentz-Violating Quantum Field Theory**|Sapan Karki et.al.|[2210.06668v2](http://arxiv.org/abs/2210.06668v2)|null|\n", "2210.05666": "|**2022-10-12**|**Point Transformer V2: Grouped Vector Attention and Partition-based Pooling**|Xiaoyang Wu et.al.|[2210.05666v2](http://arxiv.org/abs/2210.05666v2)|**[link](https://github.com/gofinge/pointtransformerv2)**|\n", "2209.11255": "|**2022-09-21**|**3DPCT: 3D Point Cloud Transformer with Dual Self-attention**|Dening Lu et.al.|[2209.11255v1](http://arxiv.org/abs/2209.11255v1)|null|\n", "2208.10395": "|**2022-08-22**|**Symmetry Classification of Scalar $n$th Order Ordinary Differential Equations**|Said Waqas Shah et.al.|[2208.10395v1](http://arxiv.org/abs/2208.10395v1)|null|\n", "2208.00281": "|**2022-12-20**|**Point Primitive Transformer for Long-Term 4D Point Cloud Video Understanding**|Hao Wen et.al.|[2208.00281v2](http://arxiv.org/abs/2208.00281v2)|**[link](https://github.com/hoi4d/PPTr)**|\n", "2207.13226": "|**2022-08-15**|**Boosting Point-BERT by Multi-choice Tokens**|Kexue Fu et.al.|[2207.13226v2](http://arxiv.org/abs/2207.13226v2)|**[link](https://github.com/fukexue/mcp-bert)**|\n", "2207.11995": "|**2022-07-26**|**3D Siamese Transformer Network for Single Object Tracking on Point Clouds**|Le Hui et.al.|[2207.11995v2](http://arxiv.org/abs/2207.11995v2)|**[link](https://github.com/fpthink/stnet)**|\n", "2207.10994": "|**2022-07-22**|**Learning Generalized Non-Rigid Multimodal Biomedical Image Registration from Generic Point Set Data**|Zachary MC Baum et.al.|[2207.10994v1](http://arxiv.org/abs/2207.10994v1)|null|\n", "2207.08575": "|**2022-07-18**|**Anisotropic spacetimes in $f(T,B)$ theory IV: Noether symmetry analysis**|Andronikos Paliathanasis et.al.|[2207.08575v1](http://arxiv.org/abs/2207.08575v1)|null|\n", "2206.15191": "|**2022-06-30**|**Lewis-Riesenfeld invariants for PT-symmetrically coupled oscillators from two dimensional point transformations and Lie algebraic expansions**|Andreas Fring et.al.|[2206.15191v1](http://arxiv.org/abs/2206.15191v1)|null|\n", "2206.04670": "|**2022-10-12**|**PointNeXt: Revisiting PointNet++ with Improved Training and Scaling Strategies**|Guocheng Qian et.al.|[2206.04670v2](http://arxiv.org/abs/2206.04670v2)|**[link](https://github.com/guochengqian/pointnext)**|\n", "2206.04511": "|**2022-08-29**|**Efficient Human Pose Estimation via 3D Event Point Cloud**|Jiaan Chen et.al.|[2206.04511v2](http://arxiv.org/abs/2206.04511v2)|**[link](https://github.com/masterhow/eventpointpose)**|\n", "2205.08886": "|**2022-05-18**|**GeoPointGAN: Synthetic Spatial Data with Local Label Differential Privacy**|Teddy Cunningham et.al.|[2205.08886v1](http://arxiv.org/abs/2205.08886v1)|**[link](https://github.com/konstantinklemmer/geopointgan)**|\n", "2204.03957": "|**2022-04-08**|**Points to Patches: Enabling the Use of Self-Attention for 3D Shape Recognition**|Axel Berg et.al.|[2204.03957v1](http://arxiv.org/abs/2204.03957v1)|**[link](https://github.com/axeber01/point-tnt)**|\n", "2203.12758": "|**2022-03-23**|**Mokey: Enabling Narrow Fixed-Point Inference for Out-of-the-Box Floating-Point Transformer Models**|Ali Hadi Zadeh et.al.|[2203.12758v1](http://arxiv.org/abs/2203.12758v1)|null|\n", "2203.04007": "|**2022-08-31**|**DuMLP-Pin: A Dual-MLP-dot-product Permutation-invariant Network for Set Feature Extraction**|Jiajun Fei et.al.|[2203.04007v2](http://arxiv.org/abs/2203.04007v2)|**[link](https://github.com/jaronthu/dumlp-pin)**|\n", "2203.00972": "|**2022-04-07**|**Improving Point Cloud Based Place Recognition with Ranking-based Loss and Large Batch Training**|Jacek Komorowski et.al.|[2203.00972v2](http://arxiv.org/abs/2203.00972v2)|**[link](https://github.com/jac99/minkloc3dv2)**|\n", "2201.05140": "|**2022-01-13**|**An introduction to PT-symmetric quantum mechanics -- time-dependent systems**|Andreas Fring et.al.|[2201.05140v1](http://arxiv.org/abs/2201.05140v1)|null|\n", "2112.13725": "|**2021-12-27**|**Near-Optimal Bounds for Generalized Orthogonal Procrustes Problem via Generalized Power Method**|Shuyang Ling et.al.|[2112.13725v1](http://arxiv.org/abs/2112.13725v1)|null|\n", "2112.11959": "|**2021-12-22**|**Dynamics of a symmetrically decoupled three-dimensional point transformation**|Hacene Gharout et.al.|[2112.11959v1](http://arxiv.org/abs/2112.11959v1)|null|\n", "2112.05635": "|**2021-12-10**|**Geometry of inhomogeneous Poisson brackets, multicomponent Harry Dym hierarchies and multicomponent Hunter-Saxton equations**|Andrey Yu. Konyaev et.al.|[2112.05635v1](http://arxiv.org/abs/2112.05635v1)|null|\n", "2112.04863": "|**2021-12-17**|**3D Medical Point Transformer: Introducing Convolution to Attention Networks for Medical Point Cloud Analysis**|Jianhui Yu et.al.|[2112.04863v2](http://arxiv.org/abs/2112.04863v2)|**[link](https://github.com/crane-papercode/3dmedpt)**|\n", "2112.04702": "|**2022-04-04**|**Fast Point Transformer**|Chunghyun Park et.al.|[2112.04702v2](http://arxiv.org/abs/2112.04702v2)|**[link](https://github.com/POSTECH-CVLab/FastPointTransformer)**|\n", "2111.14819": "|**2022-06-06**|**Point-BERT: Pre-training 3D Point Cloud Transformers with Masked Point Modeling**|Xumin Yu et.al.|[2111.14819v2](http://arxiv.org/abs/2111.14819v2)|**[link](https://github.com/lulutang0608/Point-BERT)**|\n", "2111.14451": "|**2022-03-31**|**HDR-NeRF: High Dynamic Range Neural Radiance Fields**|Xin Huang et.al.|[2111.14451v3](http://arxiv.org/abs/2111.14451v3)|null|\n", "2111.13702": "|**2022-12-12**|**The Information Content of Projected Galaxy Fields**|Lucas Porth et.al.|[2111.13702v2](http://arxiv.org/abs/2111.13702v2)|null|\n", "2111.10866": "|**2021-11-21**|**CpT: Convolutional Point Transformer for 3D Point Cloud Processing**|Chaitanya Kaul et.al.|[2111.10866v1](http://arxiv.org/abs/2111.10866v1)|null|\n", "2111.08973": "|**2021-11-19**|**Generating Unrestricted 3D Adversarial Point Clouds**|Xuelong Dai et.al.|[2111.08973v2](http://arxiv.org/abs/2111.08973v2)|**[link](https://github.com/EricDai0/AdvGCGAN)**|\n", "2111.00207": "|**2022-03-24**|**PatchFormer: An Efficient Point Transformer with Patch Attention**|Zhang Cheng et.al.|[2111.00207v3](http://arxiv.org/abs/2111.00207v3)|null|\n", "2110.05609": "|**2021-11-03**|**Comparison between time-independent and time-dependent quantum systems in the context of energy, Heisenberg uncertainty, average energy, force, average force and thermodynamic quantities**|Debraj Nath et.al.|[2110.05609v2](http://arxiv.org/abs/2110.05609v2)|null|\n", "2110.09230": "|**2021-10-07**|**A study on the Friedmann like Universe with Torsion using Noether Symmetry**|Ramkumar Radhakrishnan et.al.|[2110.09230v1](http://arxiv.org/abs/2110.09230v1)|null|\n", "2109.05023": "|**2021-09-20**|**Real-time multimodal image registration with partial intraoperative point-set data**|Zachary M C Baum et.al.|[2109.05023v2](http://arxiv.org/abs/2109.05023v2)|null|\n", "2109.02107": "|**2021-09-05**|**Normal Forms of second order Ordinary Differential Equations $y_{xx}=J(x,y,y_{x})$ under Fibre-Preserving Maps**|Wei Guo Foo et.al.|[2109.02107v1](http://arxiv.org/abs/2109.02107v1)|null|\n", "2108.08958": "|**2021-08-20**|**Exact solutions for time-dependent non-Hermitian oscillators: classical and quantum pictures**|Kevin Zelaya et.al.|[2108.08958v1](http://arxiv.org/abs/2108.08958v1)|null|\n", "2108.08891": "|**2021-08-19**|**Neural TMDlayer: Modeling Instantaneous flow of features via SDE Generators**|Zihang Meng et.al.|[2108.08891v1](http://arxiv.org/abs/2108.08891v1)|**[link](https://github.com/zihangm/neural-tmd-layer)**|\n", "2108.06076": "|**2022-05-25**|**PVT: Point-Voxel Transformer for Point Cloud Learning**|Cheng Zhang et.al.|[2108.06076v4](http://arxiv.org/abs/2108.06076v4)|**[link](https://github.com/HaochengWan/PVT)**|\n", "2108.00620": "|**2021-10-14**|**Investigating Attention Mechanism in 3D Point Cloud Object Detection**|Shi Qiu et.al.|[2108.00620v2](http://arxiv.org/abs/2108.00620v2)|**[link](https://github.com/ShiQiu0419/attentions_in_3D_detection)**|\n", "2107.14144": "|**2021-07-29**|**Reduction of balance laws in (3+1)--dimensions to autonomous conservation laws by means of equivalence transformations**|Matteo Gorgone et.al.|[2107.14144v1](http://arxiv.org/abs/2107.14144v1)|null|\n", "2303.01166": "|**2023-03-02**|**BPT: Binary Point Cloud Transformer for Place Recognition**|Zhixing Hou et.al.|[2303.01166v1](http://arxiv.org/abs/2303.01166v1)|null|\n", "2303.04458": "|**2023-03-08**|**Full Point Encoding for Local Feature Aggregation in 3D Point Clouds**|Yong He et.al.|[2303.04458v1](http://arxiv.org/abs/2303.04458v1)|null|\n", "2303.07766": "|**2023-03-14**|**Classical and quantum cosmology in $f(T)$-gravity theory: A Noether symmetry approach**|Roshni Bhaumik et.al.|[2303.07766v1](http://arxiv.org/abs/2303.07766v1)|null|\n", "2303.08274": "|**2023-03-14**|**GeoSpark: Sparking up Point Cloud Segmentation with Geometry Clue**|Zhening Huang et.al.|[2303.08274v1](http://arxiv.org/abs/2303.08274v1)|null|\n", "2303.15320": "|**2023-03-22**|**Noether's theorem and Lie symmetries for time-dependent Hamilton-Lagrange systems**|J\u00fcrgen Struckmeier et.al.|[2303.15320v1](http://arxiv.org/abs/2303.15320v1)|null|\n", "2303.17815": "|**2023-03-31**|**APPT : Asymmetric Parallel Point Transformer for 3D Point Cloud Understanding**|Hengjia Li et.al.|[2303.17815v1](http://arxiv.org/abs/2303.17815v1)|null|\n", "2304.02013": "|**2023-09-01**|**NPC: Neural Point Characters from Video**|Shih-Yang Su et.al.|[2304.02013v2](http://arxiv.org/abs/2304.02013v2)|null|\n", "2304.08279": "|**2023-05-27**|**MoDA: Modeling Deformable 3D Objects from Casual Videos**|Chaoyue Song et.al.|[2304.08279v2](http://arxiv.org/abs/2304.08279v2)|**[link](https://github.com/chaoyuesong/moda)**|\n", "2304.08681": "|**2023-09-07**|**The integer point transform as a complete invariant**|Sinai Robins et.al.|[2304.08681v4](http://arxiv.org/abs/2304.08681v4)|null|\n", "2304.14132": "|**2023-04-28**|**Human Semantic Segmentation using Millimeter-Wave Radar Sparse Point Clouds**|Pengfei Song et.al.|[2304.14132v2](http://arxiv.org/abs/2304.14132v2)|null|\n", "2305.00773": "|**2023-05-01**|**Point Cloud Semantic Segmentation**|Ivan Martinovi\u0107 et.al.|[2305.00773v1](http://arxiv.org/abs/2305.00773v1)|null|\n", "2305.03045": "|**2023-05-08**|**OctFormer: Octree-based Transformers for 3D Point Clouds**|Peng-Shuai Wang et.al.|[2305.03045v2](http://arxiv.org/abs/2305.03045v2)|**[link](https://github.com/octree-nn/octformer)**|\n", "2305.02533": "|**2023-05-04**|**Point Transformer For Coronary Artery Labeling**|Xu Wang et.al.|[2305.02533v1](http://arxiv.org/abs/2305.02533v1)|null|\n", "2306.10759": "|**2023-10-31**|**Simplifying and Empowering Transformers for Large-Graph Representations**|Qitian Wu et.al.|[2306.10759v3](http://arxiv.org/abs/2306.10759v3)|**[link](https://github.com/qitianwu/sgformer)**|\n", "2306.12361": "|**2023-06-21**|**Sigma-point Kalman Filter with Nonlinear Unknown Input Estimation via Optimization and Data-driven Approach for Dynamic Systems**|Junn Yong Loo et.al.|[2306.12361v1](http://arxiv.org/abs/2306.12361v1)|null|\n", "2306.10798": "|**2023-06-23**|**ExpPoint-MAE: Better interpretability and performance for self-supervised point cloud transformers**|Ioannis Romanelis et.al.|[2306.10798v2](http://arxiv.org/abs/2306.10798v2)|**[link](https://github.com/vvrpanda/exppoint-mae)**|\n", "2307.04723": "|**2023-07-18**|**Quark/Gluon Discrimination and Top Tagging with Dual Attention Transformer**|Minxuan He et.al.|[2307.04723v2](http://arxiv.org/abs/2307.04723v2)|null|\n", "2307.11973": "|**2023-07-22**|**Two-stream Multi-level Dynamic Point Transformer for Two-person Interaction Recognition**|Yao Liu et.al.|[2307.11973v1](http://arxiv.org/abs/2307.11973v1)|null|\n", "2308.04637": "|**2023-08-09**|**Sparse Binary Transformers for Multivariate Time Series Modeling**|Matt Gorbett et.al.|[2308.04637v1](http://arxiv.org/abs/2308.04637v1)|null|\n", "2308.09403": "|**2023-08-18**|**Target Clustering Based Multi-Bernoulli Filter for Superpositional Sensors**|Wang Sen et.al.|[2308.09403v1](http://arxiv.org/abs/2308.09403v1)|null|\n", "2309.00339": "|**2023-09-01**|**Robust Point Cloud Processing through Positional Embedding**|Jianqiao Zheng et.al.|[2309.00339v1](http://arxiv.org/abs/2309.00339v1)|null|\n", "2309.04105": "|**2023-09-08**|**Weakly Supervised Point Clouds Transformer for 3D Object Detection**|Zuojin Tang et.al.|[2309.04105v1](http://arxiv.org/abs/2309.04105v1)|null|\n", "2310.01545": "|**2023-10-02**|**RF-ULM: Deep Learning for Radio-Frequency Ultrasound Localization Microscopy**|Christopher Hahne et.al.|[2310.01545v1](http://arxiv.org/abs/2310.01545v1)|**[link](https://github.com/hahnec/rf-ulm)**|\n", "2310.05780": "|**2023-10-09**|**Lie symmetries for the cosmological field equations in brane-world gravity with bulk scalar field**|Andronikos Paliathanasis et.al.|[2310.05780v1](http://arxiv.org/abs/2310.05780v1)|null|\n", "2310.16861": "|**2023-10-25**|**General Point Model with Autoencoding and Autoregressive**|Zhe Li et.al.|[2310.16861v1](http://arxiv.org/abs/2310.16861v1)|null|\n", "2310.19772": "|**2023-10-22**|**Exact FLRW cosmological solutions via invariants of the symmetry groups**|E. Ahmadi Azar et.al.|[2310.19772v1](http://arxiv.org/abs/2310.19772v1)|null|\n", "2311.04081": "|**2023-11-07**|**Learning Super-Resolution Ultrasound Localization Microscopy from Radio-Frequency Data**|Christopher Hahne et.al.|[2311.04081v1](http://arxiv.org/abs/2311.04081v1)|null|\n"}, "NeRF": {"2302.12237": "|**2023-02-24**|**Learning Neural Volumetric Representations of Dynamic Humans in Minutes**|Chen Geng et.al.|[2302.12237v2](http://arxiv.org/abs/2302.12237v2)|**[link](https://github.com/zju3dv/instant-nvr)**|\n", "2302.12231": "|**2023-02-23**|**DiffusioNeRF: Regularizing Neural Radiance Fields with Denoising Diffusion Models**|Jamie Wynn et.al.|[2302.12231v1](http://arxiv.org/abs/2302.12231v1)|**[link](https://github.com/nianticlabs/diffusionerf)**|\n", "2302.10109": "|**2023-02-20**|**NerfDiff: Single-image View Synthesis with NeRF-guided Distillation from 3D-aware Diffusion**|Jiatao Gu et.al.|[2302.10109v1](http://arxiv.org/abs/2302.10109v1)|null|\n", "2302.09486": "|**2023-02-19**|**LC-NeRF: Local Controllable Face Generation in Neural Randiance Field**|Wenyang Zhou et.al.|[2302.09486v1](http://arxiv.org/abs/2302.09486v1)|null|\n", "2302.08788": "|**2023-02-17**|**MixNeRF: Modeling a Ray with Mixture Density for Novel View Synthesis from Sparse Inputs**|Seunghyeon Seo et.al.|[2302.08788v1](http://arxiv.org/abs/2302.08788v1)|**[link](https://github.com/shawn615/MixNeRF)**|\n", "2302.06833": "|**2023-02-14**|**VQ3D: Learning a 3D-Aware Generative Model on ImageNet**|Kyle Sargent et.al.|[2302.06833v1](http://arxiv.org/abs/2302.06833v1)|null|\n", "2302.06608": "|**2023-02-13**|**3D-aware Blending with Generative NeRFs**|Hyunsu Kim et.al.|[2302.06608v1](http://arxiv.org/abs/2302.06608v1)|**[link](https://github.com/naver-ai/BlendNeRF)**|\n", "2302.05573": "|**2023-02-11**|**3D Colored Shape Reconstruction from a Single RGB Image through Diffusion**|Bo Li et.al.|[2302.05573v1](http://arxiv.org/abs/2302.05573v1)|null|\n", "2302.04264": "|**2023-02-08**|**Nerfstudio: A Modular Framework for Neural Radiance Field Development**|Matthew Tancik et.al.|[2302.04264v1](http://arxiv.org/abs/2302.04264v1)|null|\n", "2302.02088": "|**2023-02-07**|**AV-NeRF: Learning Neural Fields for Real-World Audio-Visual Scene Synthesis**|Susan Liang et.al.|[2302.02088v2](http://arxiv.org/abs/2302.02088v2)|null|\n", "2302.01579": "|**2023-02-03**|**Semantic 3D-aware Portrait Synthesis and Manipulation Based on Compositional Neural Radiance Field**|Tianxiang Ma et.al.|[2302.01579v1](http://arxiv.org/abs/2302.01579v1)|**[link](https://github.com/tianxiangma/cnerf)**|\n", "2302.01571": "|**2023-02-03**|**Robust Camera Pose Refinement for Multi-Resolution Hash Encoding**|Hwan Heo et.al.|[2302.01571v1](http://arxiv.org/abs/2302.01571v1)|null|\n", "2302.01532": "|**2023-02-03**|**INV: Towards Streaming Incremental Neural Videos**|Shengze Wang et.al.|[2302.01532v1](http://arxiv.org/abs/2302.01532v1)|null|\n", "2302.01226": "|**2023-02-02**|**Factor Fields: A Unified Framework for Neural Fields and Beyond**|Anpei Chen et.al.|[2302.01226v1](http://arxiv.org/abs/2302.01226v1)|null|\n", "2302.00833": "|**2023-02-02**|**RobustNeRF: Ignoring Distractors with Robust Losses**|Sara Sabour et.al.|[2302.00833v1](http://arxiv.org/abs/2302.00833v1)|null|\n", "2301.13430": "|**2023-01-31**|**GeneFace: Generalized and High-Fidelity Audio-Driven 3D Talking Face Synthesis**|Zhenhui Ye et.al.|[2301.13430v1](http://arxiv.org/abs/2301.13430v1)|null|\n", "2301.12780": "|**2023-01-30**|**Equivariant Architectures for Learning in Deep Weight Spaces**|Aviv Navon et.al.|[2301.12780v1](http://arxiv.org/abs/2301.12780v1)|**[link](https://github.com/AvivNavon/DWSNets)**|\n", "2301.11631": "|**2023-01-27**|**HyperNeRFGAN: Hypernetwork approach to 3D NeRF GAN**|Adam Kania et.al.|[2301.11631v1](http://arxiv.org/abs/2301.11631v1)|**[link](https://github.com/gmum/hypernerfgan)**|\n", "2301.11522": "|**2023-01-27**|**A Comparison of Tiny-nerf versus Spatial Representations for 3d Reconstruction**|Saulo Abraham Gante et.al.|[2301.11522v1](http://arxiv.org/abs/2301.11522v1)|null|\n", "2301.11520": "|**2023-01-27**|**SNeRL: Semantic-aware Neural Radiance Fields for Reinforcement Learning**|Dongseok Shim et.al.|[2301.11520v1](http://arxiv.org/abs/2301.11520v1)|null|\n", "2301.11280": "|**2023-01-26**|**Text-To-4D Dynamic Scene Generation**|Uriel Singer et.al.|[2301.11280v1](http://arxiv.org/abs/2301.11280v1)|null|\n", "2301.10941": "|**2023-01-26**|**GeCoNeRF: Few-shot Neural Radiance Fields via Geometric Consistency**|Minseop Kwak et.al.|[2301.10941v1](http://arxiv.org/abs/2301.10941v1)|**[link](https://github.com/KU-CVLAB/GeCoNeRF)**|\n", "2301.09632": "|**2023-01-23**|**HexPlane: A Fast Representation for Dynamic Scenes**|Ang Cao et.al.|[2301.09632v1](http://arxiv.org/abs/2301.09632v1)|**[link](https://github.com/Caoang327/HexPlane)**|\n", "2301.09060": "|**2023-02-02**|**3D Reconstruction of Non-cooperative Resident Space Objects using Instant NGP-accelerated NeRF and D-NeRF**|Trupti Mahendrakar et.al.|[2301.09060v2](http://arxiv.org/abs/2301.09060v2)|null|\n", "2301.07958": "|**2023-02-05**|**RecolorNeRF: Layer Decomposed Radiance Fields for Efficient Color Editing of 3D Scenes**|Bingchen Gong et.al.|[2301.07958v2](http://arxiv.org/abs/2301.07958v2)|null|\n", "2301.08556": "|**2023-01-18**|**NeRF in the Palm of Your Hand: Corrective Augmentation for Robotics via Novel-View Synthesis**|Allan Zhou et.al.|[2301.08556v1](http://arxiv.org/abs/2301.08556v1)|null|\n", "2301.07668": "|**2023-01-18**|**Behind the Scenes: Density Fields for Single View Reconstruction**|Felix Wimbauer et.al.|[2301.07668v1](http://arxiv.org/abs/2301.07668v1)|**[link](https://github.com/Brummi/BehindTheScenes)**|\n", "2301.06782": "|**2023-01-17**|**A Large-Scale Outdoor Multi-modal Dataset and Benchmark for Novel View Synthesis and Implicit Scene Reconstruction**|Chongshan Lu et.al.|[2301.06782v1](http://arxiv.org/abs/2301.06782v1)|null|\n", "2301.05747": "|**2023-01-13**|**Laser: Latent Set Representations for 3D Generative Modeling**|Pol Moreno et.al.|[2301.05747v1](http://arxiv.org/abs/2301.05747v1)|null|\n", "2301.04075": "|**2023-01-10**|**Benchmarking Robustness in Neural Radiance Fields**|Chen Wang et.al.|[2301.04075v1](http://arxiv.org/abs/2301.04075v1)|null|\n", "2301.03102": "|**2023-01-08**|**Towards Open World NeRF-Based SLAM**|Daniil Lisus et.al.|[2301.03102v1](http://arxiv.org/abs/2301.03102v1)|null|\n", "2301.02975": "|**2023-01-10**|**Traditional Readability Formulas Compared for English**|Bruce W. Lee et.al.|[2301.02975v2](http://arxiv.org/abs/2301.02975v2)|null|\n", "2301.00950": "|**2023-01-09**|**Class-Continuous Conditional Generative Neural Radiance Field**|Jiwook Kim et.al.|[2301.00950v2](http://arxiv.org/abs/2301.00950v2)|**[link](https://github.com/tom919654/C3G-NeRF)**|\n", "2301.00411": "|**2023-01-11**|**Detachable Novel Views Synthesis of Dynamic Scenes Using Distribution-Driven Neural Radiance Fields**|Boyu Zhang et.al.|[2301.00411v2](http://arxiv.org/abs/2301.00411v2)|**[link](https://github.com/luciferbobo/d4nerf)**|\n", "2212.13056": "|**2022-12-26**|**MonoNeRF: Learning a Generalizable Dynamic Radiance Field from Monocular Videos**|Fengrui Tian et.al.|[2212.13056v1](http://arxiv.org/abs/2212.13056v1)|**[link](https://github.com/tianfr/mononerf)**|\n", "2212.12871": "|**2022-12-25**|**PaletteNeRF: Palette-based Color Editing for NeRFs**|Qiling Wu et.al.|[2212.12871v1](http://arxiv.org/abs/2212.12871v1)|null|\n", "2212.11966": "|**2022-12-22**|**Removing Objects From Neural Radiance Fields**|Silvan Weder et.al.|[2212.11966v1](http://arxiv.org/abs/2212.11966v1)|null|\n", "2212.10950": "|**2022-12-21**|**Incremental Learning for Neural Radiance Field with Uncertainty-Filtered Knowledge Distillation**|Mengqi Guo et.al.|[2212.10950v1](http://arxiv.org/abs/2212.10950v1)|null|\n", "2212.10699": "|**2023-01-24**|**PaletteNeRF: Palette-based Appearance Editing of Neural Radiance Fields**|Zhengfei Kuang et.al.|[2212.10699v2](http://arxiv.org/abs/2212.10699v2)|null|\n", "2212.09735": "|**2022-12-20**|**Correspondence Distillation from NeRF-based GAN**|Yushi Lan et.al.|[2212.09735v2](http://arxiv.org/abs/2212.09735v2)|null|\n", "2212.09330": "|**2022-12-19**|**StyleTRF: Stylizing Tensorial Radiance Fields**|Rahul Goel et.al.|[2212.09330v1](http://arxiv.org/abs/2212.09330v1)|null|\n", "2212.09100": "|**2022-12-18**|**SPARF: Large-Scale Learning of 3D Sparse Radiance Fields from Few Input Images**|Abdullah Hamdi et.al.|[2212.09100v1](http://arxiv.org/abs/2212.09100v1)|**[link](https://github.com/ajhamdi/sparf_pytorch)**|\n", "2212.09069": "|**2022-12-18**|**Masked Wavelet Representation for Compact Neural Radiance Fields**|Daniel Rho et.al.|[2212.09069v1](http://arxiv.org/abs/2212.09069v1)|**[link](https://github.com/daniel03c1/masked_wavelet_nerf)**|\n", "2212.08328": "|**2022-12-31**|**MEIL-NeRF: Memory-Efficient Incremental Learning of Neural Radiance Fields**|Jaeyoung Chung et.al.|[2212.08328v2](http://arxiv.org/abs/2212.08328v2)|null|\n", "2212.08070": "|**2022-12-15**|**NeRF-Art: Text-Driven Neural Radiance Fields Stylization**|Can Wang et.al.|[2212.08070v1](http://arxiv.org/abs/2212.08070v1)|**[link](https://github.com/cassiePython/NeRF-Art)**|\n", "2212.08057": "|**2022-12-15**|**Real-Time Neural Light Field on Mobile Devices**|Junli Cao et.al.|[2212.08057v1](http://arxiv.org/abs/2212.08057v1)|**[link](https://github.com/snap-research/mobiler2l)**|\n", "2212.08476": "|**2022-12-15**|**SteerNeRF: Accelerating NeRF Rendering via Smooth Viewpoint Trajectory**|Sicheng Li et.al.|[2212.08476v1](http://arxiv.org/abs/2212.08476v1)|null|\n", "2212.07388": "|**2022-12-14**|**NoPe-NeRF: Optimising Neural Radiance Field with No Pose Prior**|Wenjing Bian et.al.|[2212.07388v1](http://arxiv.org/abs/2212.07388v1)|**[link](https://github.com/ActiveVisionLab/nope-nerf)**|\n", "2212.04701": "|**2022-12-09**|**4K-NeRF: High Fidelity Neural Radiance Fields at Ultra High Resolutions**|Zhongshu Wang et.al.|[2212.04701v1](http://arxiv.org/abs/2212.04701v1)|**[link](https://github.com/frozoul/4k-nerf)**|\n", "2212.04823": "|**2022-12-08**|**GazeNeRF: 3D-Aware Gaze Redirection with Neural Radiance Fields**|Alessandro Ruzzi et.al.|[2212.04823v1](http://arxiv.org/abs/2212.04823v1)|**[link](https://github.com/alessandroruzzi/gazenerf)**|\n", "2302.13543": "|**2023-02-27**|**BaLi-RF: Bandlimited Radiance Fields for Dynamic Scene Modeling**|Sameera Ramasinghe et.al.|[2302.13543v1](http://arxiv.org/abs/2302.13543v1)|null|\n", "2302.13397": "|**2023-02-26**|**Efficient physics-informed neural networks using hash encoding**|Xinquan Huang et.al.|[2302.13397v1](http://arxiv.org/abs/2302.13397v1)|null|\n", "2302.12931": "|**2023-02-24**|**CATNIPS: Collision Avoidance Through Neural Implicit Probabilistic Scenes**|Timothy Chen et.al.|[2302.12931v1](http://arxiv.org/abs/2302.12931v1)|null|\n", "2302.14683": "|**2023-03-09**|**IntrinsicNGP: Intrinsic Coordinate based Hash Encoding for Human NeRF**|Bo Peng et.al.|[2302.14683v2](http://arxiv.org/abs/2302.14683v2)|null|\n", "2303.00749": "|**2023-03-01**|**S-NeRF: Neural Radiance Fields for Street Views**|Ziyang Xie et.al.|[2303.00749v1](http://arxiv.org/abs/2303.00749v1)|null|\n", "2303.02091": "|**2023-03-03**|**Delicate Textured Mesh Recovery from NeRF via Adaptive Surface Refinement**|Jiaxiang Tang et.al.|[2303.02091v1](http://arxiv.org/abs/2303.02091v1)|**[link](https://github.com/ashawkey/nerf2mesh)**|\n", "2303.01736": "|**2023-03-03**|**Multi-Plane Neural Radiance Fields for Novel View Synthesis**|Youssef Abdelkareem et.al.|[2303.01736v1](http://arxiv.org/abs/2303.01736v1)|null|\n", "2303.03361": "|**2023-03-10**|**Nerflets: Local Radiance Fields for Efficient Structure-Aware 3D Scene Representation from 2D Supervision**|Xiaoshuai Zhang et.al.|[2303.03361v2](http://arxiv.org/abs/2303.03361v2)|null|\n", "2303.03003": "|**2023-03-07**|**Efficient Large-scale Scene Representation with a Hybrid of High-resolution Grid and Plane Features**|Yuqi Zhang et.al.|[2303.03003v2](http://arxiv.org/abs/2303.03003v2)|**[link](https://github.com/zyqz97/gp-nerf)**|\n", "2303.04086": "|**2023-03-07**|**NEPHELE: A Neural Platform for Highly Realistic Cloud Radiance Rendering**|Haimin Luo et.al.|[2303.04086v1](http://arxiv.org/abs/2303.04086v1)|null|\n", "2303.03808": "|**2023-03-07**|**Multiscale Tensor Decomposition and Rendering Equation Encoding for View Synthesis**|Kang Han et.al.|[2303.03808v1](http://arxiv.org/abs/2303.03808v1)|**[link](https://github.com/imkanghan/nrff)**|\n", "2303.03966": "|**2023-03-05**|**Semantic-aware Occlusion Filtering Neural Radiance Fields in the Wild**|Jaewon Lee et.al.|[2303.03966v1](http://arxiv.org/abs/2303.03966v1)|null|\n", "2303.04508": "|**2023-03-08**|**FastSurf: Fast Neural RGB-D Surface Reconstruction using Per-Frame Intrinsic Refinement and TSDF Fusion Prior Learning**|Seunghwan Lee et.al.|[2303.04508v1](http://arxiv.org/abs/2303.04508v1)|**[link](https://github.com/ROKIT-Healthcare/FastSurf)**|\n", "2303.04322": "|**2023-03-08**|**DroNeRF: Real-time Multi-agent Drone Pose Optimization for Computing Neural Radiance Fields**|Dipam Patel et.al.|[2303.04322v1](http://arxiv.org/abs/2303.04322v1)|null|\n", "2303.05512": "|**2023-03-09**|**PAC-NeRF: Physics Augmented Continuum Neural Radiance Fields for Geometry-Agnostic System Identification**|Xuan Li et.al.|[2303.05512v1](http://arxiv.org/abs/2303.05512v1)|null|\n", "2303.05835": "|**2023-03-10**|**You Only Train Once: Multi-Identity Free-Viewpoint Neural Human Rendering from Monocular Videos**|Jaehyeok Kim et.al.|[2303.05835v1](http://arxiv.org/abs/2303.05835v1)|null|\n", "2303.05807": "|**2023-03-10**|**Aleth-NeRF: Low-light Condition View Synthesis with Concealing Fields**|Ziteng Cui et.al.|[2303.05807v1](http://arxiv.org/abs/2303.05807v1)|null|\n", "2303.05775": "|**2023-03-10**|**Self-NeRF: A Self-Training Pipeline for Few-Shot Neural Radiance Fields**|Jiayang Bai et.al.|[2303.05775v1](http://arxiv.org/abs/2303.05775v1)|null|\n", "2303.05735": "|**2023-03-14**|**Hardware Acceleration of Neural Graphics**|Muhammad Husnain Mubarik et.al.|[2303.05735v2](http://arxiv.org/abs/2303.05735v2)|null|\n", "2303.05703": "|**2023-03-10**|**MovingParts: Motion-based 3D Part Discovery in Dynamic Radiance Field**|Kaizhi Yang et.al.|[2303.05703v1](http://arxiv.org/abs/2303.05703v1)|null|\n", "2303.06919": "|**2023-03-13**|**NeRFLiX: High-Quality Neural View Synthesis by Learning a Degradation-Driven Inter-viewpoint MiXer**|Kun Zhou et.al.|[2303.06919v1](http://arxiv.org/abs/2303.06919v1)|**[link](https://github.com/redrock303/NeRFLiX_CPVR2023)**|\n", "2303.06335": "|**2023-03-11**|**Just Flip: Flipped Observation Generation and Optimization for Neural Radiance Fields to Cover Unobserved View**|Minjae Lee et.al.|[2303.06335v1](http://arxiv.org/abs/2303.06335v1)|**[link](https://github.com/minjae-lulu/just-flip)**|\n", "2303.06226": "|**2023-03-10**|**NeRFlame: FLAME-based conditioning of NeRF for 3D face rendering**|Wojciech Zaj\u0105c et.al.|[2303.06226v1](http://arxiv.org/abs/2303.06226v1)|**[link](https://github.com/wojtekz4/nerflame)**|\n", "2303.08096": "|**2023-03-14**|**MELON: NeRF with Unposed Images Using Equivalence Class Estimation**|Axel Levy et.al.|[2303.08096v1](http://arxiv.org/abs/2303.08096v1)|null|\n", "2303.07937": "|**2023-03-16**|**Let 2D Diffusion Model Know 3D-Consistency for Robust Text-to-3D Generation**|Junyoung Seo et.al.|[2303.07937v3](http://arxiv.org/abs/2303.07937v3)|**[link](https://github.com/KU-CVLAB/3DFuse)**|\n", "2303.07653": "|**2023-03-16**|**NEF: Neural Edge Fields for 3D Parametric Curve Reconstruction from Multi-view Images**|Yunfan Ye et.al.|[2303.07653v2](http://arxiv.org/abs/2303.07653v2)|**[link](https://github.com/yunfan1202/NEF_code)**|\n", "2303.07596": "|**2023-03-18**|**Frequency-Modulated Point Cloud Rendering with Easy Editing**|Yi Zhang et.al.|[2303.07596v2](http://arxiv.org/abs/2303.07596v2)|**[link](https://github.com/yizhangphd/freqpcr)**|\n", "2303.07418": "|**2023-03-13**|**FreeNeRF: Improving Few-shot Neural Rendering with Free Frequency Regularization**|Jiawei Yang et.al.|[2303.07418v1](http://arxiv.org/abs/2303.07418v1)|**[link](https://github.com/jiawei-yang/freenerf)**|\n", "2303.08808": "|**2023-03-15**|**Mesh Strikes Back: Fast and Efficient Human Reconstruction from RGB videos**|Rohit Jena et.al.|[2303.08808v1](http://arxiv.org/abs/2303.08808v1)|null|\n", "2303.08717": "|**2023-03-15**|**Re-ReND: Real-time Rendering of NeRFs across Devices**|Sara Rojas et.al.|[2303.08717v1](http://arxiv.org/abs/2303.08717v1)|**[link](https://github.com/sararoma95/Re-ReND)**|\n", "2303.08695": "|**2023-03-15**|**RefiNeRF: Modelling dynamic neural radiance fields with inconsistent or missing camera parameters**|Shuja Khalid et.al.|[2303.08695v1](http://arxiv.org/abs/2303.08695v1)|null|\n", "2303.08370": "|**2023-03-15**|**Harnessing Low-Frequency Neural Fields for Few-Shot View Synthesis**|Liangchen Song et.al.|[2303.08370v1](http://arxiv.org/abs/2303.08370v1)|**[link](https://github.com/lsongx/halo)**|\n", "2303.09554": "|**2023-03-21**|**PartNeRF: Generating Part-Aware Editable 3D Shapes without 3D Supervision**|Konstantinos Tertikas et.al.|[2303.09554v3](http://arxiv.org/abs/2303.09554v3)|null|\n", "2303.09553": "|**2023-03-16**|**LERF: Language Embedded Radiance Fields**|Justin Kerr et.al.|[2303.09553v1](http://arxiv.org/abs/2303.09553v1)|null|\n", "2303.09431": "|**2023-03-16**|**NeRFMeshing: Distilling Neural Radiance Fields into Geometrically-Accurate 3D Meshes**|Marie-Julie Rakotosaona et.al.|[2303.09431v1](http://arxiv.org/abs/2303.09431v1)|null|\n", "2303.09412": "|**2023-03-17**|**NeRFtrinsic Four: An End-To-End Trainable NeRF Jointly Optimizing Diverse Intrinsic and Extrinsic Camera Parameters**|Hannah Schieber et.al.|[2303.09412v2](http://arxiv.org/abs/2303.09412v2)|**[link](https://github.com/hannahhaensen/nerftrinsic_four)**|\n", "2303.09153": "|**2023-03-16**|**Reliable Image Dehazing by NeRF**|Zheyan Jin et.al.|[2303.09153v1](http://arxiv.org/abs/2303.09153v1)|null|\n", "2303.10083": "|**2023-03-17**|**$\u03b1$Surf: Implicit Surface Reconstruction for Semi-Transparent and Thin Objects with Decoupled Geometry and Opacity**|Tianhao Wu et.al.|[2303.10083v1](http://arxiv.org/abs/2303.10083v1)|null|\n", "2303.09952": "|**2023-03-17**|**Single-view Neural Radiance Fields with Depth Teacher**|Yurui Chen et.al.|[2303.09952v1](http://arxiv.org/abs/2303.09952v1)|null|\n", "2303.11052": "|**2023-03-20**|**ContraNeRF: Generalizable Neural Radiance Fields for Synthetic-to-real Novel View Synthesis via Contrastive Learning**|Hao Yang et.al.|[2303.11052v1](http://arxiv.org/abs/2303.11052v1)|null|\n", "2303.10735": "|**2023-03-19**|**SKED: Sketch-guided Text-based 3D Editing**|Aryan Mikaeili et.al.|[2303.10735v1](http://arxiv.org/abs/2303.10735v1)|null|\n", "2303.10709": "|**2023-03-19**|**NeRF-LOAM: Neural Implicit Representation for Large-Scale Incremental LiDAR Odometry and Mapping**|Junyuan Deng et.al.|[2303.10709v1](http://arxiv.org/abs/2303.10709v1)|**[link](https://github.com/junyuandeng/nerf-loam)**|\n", "2303.10340": "|**2023-03-18**|**3D Data Augmentation for Driving Scenes on Camera**|Wenwen Tong et.al.|[2303.10340v1](http://arxiv.org/abs/2303.10340v1)|null|\n", "2303.11938": "|**2023-03-21**|**3D-CLFusion: Fast Text-to-3D Rendering with Contrastive Latent Diffusion**|Yu-Jhe Li et.al.|[2303.11938v1](http://arxiv.org/abs/2303.11938v1)|null|\n", "2303.11728": "|**2023-03-22**|**ExtremeNeRF: Few-shot Neural Radiance Fields Under Unconstrained Illumination**|SeokYeong Lee et.al.|[2303.11728v2](http://arxiv.org/abs/2303.11728v2)|null|\n", "2303.11364": "|**2023-03-20**|**DehazeNeRF: Multiple Image Haze Removal and 3D Shape Reconstruction using Neural Radiance Fields**|Wei-Ting Chen et.al.|[2303.11364v1](http://arxiv.org/abs/2303.11364v1)|null|\n", "2303.12791": "|**2023-03-22**|**SHERF: Generalizable Human NeRF from a Single Image**|Shoukang Hu et.al.|[2303.12791v1](http://arxiv.org/abs/2303.12791v1)|**[link](https://github.com/skhu101/sherf)**|\n", "2303.12789": "|**2023-03-22**|**Instruct-NeRF2NeRF: Editing 3D Scenes with Instructions**|Ayaan Haque et.al.|[2303.12789v1](http://arxiv.org/abs/2303.12789v1)|null|\n", "2303.12786": "|**2023-03-22**|**FeatureNeRF: Learning Generalizable NeRFs by Distilling Foundation Models**|Jianglong Ye et.al.|[2303.12786v1](http://arxiv.org/abs/2303.12786v1)|null|\n", "2303.12408": "|**2023-03-24**|**Balanced Spherical Grid for Egocentric View Synthesis**|Changwoon Choi et.al.|[2303.12408v2](http://arxiv.org/abs/2303.12408v2)|**[link](https://github.com/changwoonchoi/EgoNeRF)**|\n", "2303.12234": "|**2023-03-21**|**Pre-NeRF 360: Enriching Unbounded Appearances for Neural Radiance Fields**|Ahmad AlMughrabi et.al.|[2303.12234v1](http://arxiv.org/abs/2303.12234v1)|**[link](https://github.com/amughrabi/pre-nerf)**|\n", "2303.13497": "|**2023-03-23**|**TriPlaneNet: An Encoder for EG3D Inversion**|Ananta R. Bhattarai et.al.|[2303.13497v1](http://arxiv.org/abs/2303.13497v1)|null|\n", "2303.13472": "|**2023-03-23**|**Plotting Behind the Scenes: Towards Learnable Game Engines**|Willi Menapace et.al.|[2303.13472v1](http://arxiv.org/abs/2303.13472v1)|null|\n", "2303.13450": "|**2023-03-23**|**Set-the-Scene: Global-Local Training for Generating Controllable NeRF Scenes**|Dana Cohen-Bar et.al.|[2303.13450v1](http://arxiv.org/abs/2303.13450v1)|**[link](https://github.com/DanaCohen95/Set-the-Scene)**|\n", "2303.13277": "|**2023-03-25**|**SINE: Semantic-driven Image-based NeRF Editing with Prior-guided Editing Field**|Chong Bao et.al.|[2303.13277v2](http://arxiv.org/abs/2303.13277v2)|null|\n", "2303.13232": "|**2023-03-23**|**Transforming Radiance Field with Lipschitz Network for Photorealistic 3D Scene Stylization**|Zicheng Zhang et.al.|[2303.13232v1](http://arxiv.org/abs/2303.13232v1)|null|\n", "2303.13014": "|**2023-03-23**|**Semantic Ray: Learning a Generalizable Semantic Field with Cross-Reprojection Attention**|Fangfu Liu et.al.|[2303.13014v1](http://arxiv.org/abs/2303.13014v1)|**[link](https://github.com/liuff19/Semantic-Ray)**|\n", "2303.12865": "|**2023-03-22**|**NeRF-GAN Distillation for Efficient 3D-Aware Generation with Convolutions**|Mohamad Shahbazi et.al.|[2303.12865v1](http://arxiv.org/abs/2303.12865v1)|**[link](https://github.com/mshahbazi72/nerf-gan-distillation)**|\n", "2303.14001": "|**2023-03-24**|**Grid-guided Neural Radiance Fields for Large Urban Scenes**|Linning Xu et.al.|[2303.14001v1](http://arxiv.org/abs/2303.14001v1)|null|\n", "2303.13843": "|**2023-03-24**|**CompoNeRF: Text-guided Multi-object Compositional NeRF with Editable 3D Scene Layout**|Yiqi Lin et.al.|[2303.13843v1](http://arxiv.org/abs/2303.13843v1)|null|\n", "2303.13825": "|**2023-03-24**|**HandNeRF: Neural Radiance Fields for Animatable Interacting Hands**|Zhiyang Guo et.al.|[2303.13825v1](http://arxiv.org/abs/2303.13825v1)|null|\n", "2303.13817": "|**2023-03-24**|**ABLE-NeRF: Attention-Based Rendering with Learnable Embeddings for Neural Radiance Field**|Zhe Jun Tang et.al.|[2303.13817v1](http://arxiv.org/abs/2303.13817v1)|**[link](https://github.com/tangzj/able-nerf)**|\n", "2303.13777": "|**2023-03-24**|**GM-NeRF: Learning Generalizable Model-based Neural Radiance Fields from Multi-view Images**|Jianchuan Chen et.al.|[2303.13777v1](http://arxiv.org/abs/2303.13777v1)|null|\n", "2303.13743": "|**2023-03-24**|**TEGLO: High Fidelity Canonical Texture Mapping from Single-View Images**|Vishal Vinod et.al.|[2303.13743v1](http://arxiv.org/abs/2303.13743v1)|null|\n", "2303.13582": "|**2023-03-23**|**SCADE: NeRFs from Space Carving with Ambiguity-Aware Depth Estimates**|Mikaela Angelina Uy et.al.|[2303.13582v1](http://arxiv.org/abs/2303.13582v1)|null|\n", "2303.15427": "|**2023-03-27**|**JAWS: Just A Wild Shot for Cinematic Transfer in Neural Radiance Fields**|Xi Wang et.al.|[2303.15427v1](http://arxiv.org/abs/2303.15427v1)|**[link](https://github.com/robincourant/jaws)**|\n", "2303.15387": "|**2023-03-27**|**Generalizable Neural Voxels for Fast Human Radiance Fields**|Taoran Yi et.al.|[2303.15387v1](http://arxiv.org/abs/2303.15387v1)|null|\n", "2303.15368": "|**2023-03-27**|**NeUDF: Learning Unsigned Distance Fields from Multi-view Images for Reconstructing Non-watertight Models**|Fei Hou et.al.|[2303.15368v1](http://arxiv.org/abs/2303.15368v1)|null|\n", "2303.15012": "|**2023-03-27**|**3D-Aware Multi-Class Image-to-Image Translation with NeRFs**|Senmao Li et.al.|[2303.15012v1](http://arxiv.org/abs/2303.15012v1)|**[link](https://github.com/sen-mao/3di2i-translation)**|\n", "2303.14707": "|**2023-03-26**|**Clean-NeRF: Reformulating NeRF to account for View-Dependent Observations**|Xinhang Liu et.al.|[2303.14707v1](http://arxiv.org/abs/2303.14707v1)|null|\n", "2303.14536": "|**2023-03-25**|**SUDS: Scalable Urban Dynamic Scenes**|Haithem Turki et.al.|[2303.14536v1](http://arxiv.org/abs/2303.14536v1)|null|\n", "2303.14478": "|**2023-03-25**|**DBARF: Deep Bundle-Adjusting Generalizable Neural Radiance Fields**|Yu Chen et.al.|[2303.14478v1](http://arxiv.org/abs/2303.14478v1)|null|\n", "2303.14435": "|**2023-03-25**|**NeRF-DS: Neural Radiance Fields for Dynamic Specular Objects**|Zhiwen Yan et.al.|[2303.14435v1](http://arxiv.org/abs/2303.14435v1)|**[link](https://github.com/jokeryan/nerf-ds)**|\n", "2303.15206": "|**2023-03-24**|**Perceptual Quality Assessment of NeRF and Neural View Synthesis Methods for Front-Facing Views**|Hanxue Liang et.al.|[2303.15206v1](http://arxiv.org/abs/2303.15206v1)|null|\n", "2303.16196": "|**2023-03-28**|**SparseNeRF: Distilling Depth Ranking for Few-shot Novel View Synthesis**|Guangcong Wang et.al.|[2303.16196v1](http://arxiv.org/abs/2303.16196v1)|null|\n", "2303.16184": "|**2023-03-28**|**VMesh: Hybrid Volume-Mesh Representation for Efficient View Synthesis**|Yuan-Chen Guo et.al.|[2303.16184v1](http://arxiv.org/abs/2303.16184v1)|null|\n", "2303.16001": "|**2023-03-30**|**Adaptive Voronoi NeRFs**|Tim Elsner et.al.|[2303.16001v2](http://arxiv.org/abs/2303.16001v2)|null|\n", "2303.15951": "|**2023-03-28**|**F$^{2}$-NeRF: Fast Neural Radiance Field Training with Free Camera Trajectories**|Peng Wang et.al.|[2303.15951v1](http://arxiv.org/abs/2303.15951v1)|**[link](https://github.com/Totoro97/f2-nerf)**|\n", "2303.16485": "|**2023-03-29**|**TriVol: Point Cloud Rendering via Triple Volumes**|Tao Hu et.al.|[2303.16485v1](http://arxiv.org/abs/2303.16485v1)|**[link](https://github.com/dvlab-research/trivol)**|\n", "2303.16482": "|**2023-03-29**|**Point2Pix: Photo-Realistic Point Cloud Rendering via Neural Radiance Fields**|Tao Hu et.al.|[2303.16482v1](http://arxiv.org/abs/2303.16482v1)|null|\n", "2303.16333": "|**2023-03-28**|**Flow supervision for Deformable NeRF**|Chaoyang Wang et.al.|[2303.16333v1](http://arxiv.org/abs/2303.16333v1)|null|\n", "2303.17603": "|**2023-03-30**|**NeRF-Supervised Deep Stereo**|Fabio Tosi et.al.|[2303.17603v1](http://arxiv.org/abs/2303.17603v1)|**[link](https://github.com/fabiotosi92/nerf-supervised-deep-stereo)**|\n", "2303.17368": "|**2023-03-30**|**SynBody: Synthetic Dataset with Layered Human Models for 3D Human Perception and Modeling**|Zhitao Yang et.al.|[2303.17368v1](http://arxiv.org/abs/2303.17368v1)|**[link](https://github.com/openxrlab/xrfeitoria)**|\n", "2303.17147": "|**2023-03-30**|**NeILF++: Inter-Reflectable Light Fields for Geometry and Material Estimation**|Jingyang Zhang et.al.|[2303.17147v1](http://arxiv.org/abs/2303.17147v1)|null|\n", "2303.17094": "|**2023-03-30**|**Enhanced Stable View Synthesis**|Nishant Jain et.al.|[2303.17094v1](http://arxiv.org/abs/2303.17094v1)|null|\n", "2303.17968": "|**2023-03-31**|**VDN-NeRF: Resolving Shape-Radiance Ambiguity via View-Dependence Normalization**|Bingfan Zhu et.al.|[2303.17968v1](http://arxiv.org/abs/2303.17968v1)|**[link](https://github.com/boifz/vdn-nerf)**|\n", "2304.00916": "|**2023-04-06**|**DreamAvatar: Text-and-Shape Guided 3D Human Avatar Generation via Diffusion Models**|Yukang Cao et.al.|[2304.00916v2](http://arxiv.org/abs/2304.00916v2)|null|\n", "2304.00341": "|**2023-04-01**|**JacobiNeRF: NeRF Shaping with Mutual Information Gradients**|Xiaomeng Xu et.al.|[2304.00341v1](http://arxiv.org/abs/2304.00341v1)|**[link](https://github.com/xxm19/jacobinerf)**|\n", "2304.02001": "|**2023-04-04**|**MonoHuman: Animatable Human Neural Field from Monocular Video**|Zhengming Yu et.al.|[2304.02001v1](http://arxiv.org/abs/2304.02001v1)|null|\n", "2304.02061": "|**2023-04-11**|**Generating Continual Human Motion in Diverse 3D Scenes**|Aymen Mir et.al.|[2304.02061v2](http://arxiv.org/abs/2304.02061v2)|null|\n", "2304.03280": "|**2023-04-06**|**LANe: Lighting-Aware Neural Fields for Compositional Scene Synthesis**|Akshay Krishnan et.al.|[2304.03280v1](http://arxiv.org/abs/2304.03280v1)|null|\n", "2304.03266": "|**2023-04-06**|**Neural Fields meet Explicit Geometric Representation for Inverse Rendering of Urban Scenes**|Zian Wang et.al.|[2304.03266v1](http://arxiv.org/abs/2304.03266v1)|null|\n", "2304.02827": "|**2023-04-06**|**DITTO-NeRF: Diffusion-based Iterative Text To Omni-directional 3D Model**|Hoigi Seo et.al.|[2304.02827v1](http://arxiv.org/abs/2304.02827v1)|null|\n", "2304.02736": "|**2023-04-05**|**Image Stabilization for Hololens Camera in Remote Collaboration**|Gowtham Senthil et.al.|[2304.02736v1](http://arxiv.org/abs/2304.02736v1)|null|\n", "2304.03526": "|**2023-04-07**|**Lift3D: Synthesize 3D Training Data by Lifting 2D GAN to 3D Generative Radiance Field**|Leheng Li et.al.|[2304.03526v1](http://arxiv.org/abs/2304.03526v1)|null|\n", "2304.03384": "|**2023-04-06**|**Beyond NeRF Underwater: Learning Neural Reflectance Fields for True Color Correction of Marine Imagery**|Tianyi Zhang et.al.|[2304.03384v1](http://arxiv.org/abs/2304.03384v1)|**[link](https://github.com/tyz1030/neuralsea)**|\n", "2304.04452": "|**2023-04-10**|**Neural Residual Radiance Fields for Streamably Free-Viewpoint Videos**|Liao Wang et.al.|[2304.04452v1](http://arxiv.org/abs/2304.04452v1)|null|\n", "2304.04446": "|**2023-04-10**|**Inferring Fluid Dynamics via Inverse Rendering**|Jinxian Liu et.al.|[2304.04446v1](http://arxiv.org/abs/2304.04446v1)|null|\n", "2304.04395": "|**2023-04-10**|**Instance Neural Radiance Field**|Benran Hu et.al.|[2304.04395v1](http://arxiv.org/abs/2304.04395v1)|**[link](https://github.com/lyclyc52/instance_nerf)**|\n", "2304.04133": "|**2023-04-12**|**NeRF applied to satellite imagery for surface reconstruction**|Federico Semeraro et.al.|[2304.04133v3](http://arxiv.org/abs/2304.04133v3)|**[link](https://github.com/fsemerar/satnerf)**|\n", "2304.04012": "|**2023-04-08**|**PVD-AL: Progressive Volume Distillation with Active Learning for Efficient Conversion Between Different NeRF Architectures**|Shuangkang Fang et.al.|[2304.04012v1](http://arxiv.org/abs/2304.04012v1)|**[link](https://github.com/megvii-research/AAAI2023-PVD)**|\n", "2304.04559": "|**2023-04-07**|**Event-based Camera Tracker by $\\nabla$t NeRF**|Mana Masuda et.al.|[2304.04559v1](http://arxiv.org/abs/2304.04559v1)|null|\n", "2304.05218": "|**2023-04-11**|**Improving Neural Radiance Fields with Depth-aware Optimization for Novel View Synthesis**|Shu Chen et.al.|[2304.05218v1](http://arxiv.org/abs/2304.05218v1)|**[link](https://github.com/xtu-pr-lab/sfmnerf)**|\n", "2304.05097": "|**2023-04-11**|**One-Shot High-Fidelity Talking-Head Synthesis with Deformable Neural Radiance Field**|Weichuang Li et.al.|[2304.05097v1](http://arxiv.org/abs/2304.05097v1)|null|\n", "2304.04962": "|**2023-04-11**|**MRVM-NeRF: Mask-Based Pretraining for Neural Radiance Fields**|Ganlin Yang et.al.|[2304.04962v1](http://arxiv.org/abs/2304.04962v1)|null|\n", "2304.04897": "|**2023-04-10**|**Neural Image-based Avatars: Generalizable Radiance Fields for Human Avatar Modeling**|Youngjoong Kwon et.al.|[2304.04897v1](http://arxiv.org/abs/2304.04897v1)|null|\n", "2304.05620": "|**2023-04-12**|**NutritionVerse-Thin: An Optimized Strategy for Enabling Improved Rendering of 3D Thin Food Models**|Chi-en Amy Tai et.al.|[2304.05620v1](http://arxiv.org/abs/2304.05620v1)|null|\n", "2304.06714": "|**2023-04-17**|**Single-Stage Diffusion NeRF: A Unified Approach to 3D Generation and Reconstruction**|Hansheng Chen et.al.|[2304.06714v2](http://arxiv.org/abs/2304.06714v2)|**[link](https://github.com/Lakonik/SSDNeRF)**|\n", "2304.06706": "|**2023-04-13**|**Zip-NeRF: Anti-Aliased Grid-Based Neural Radiance Fields**|Jonathan T. Barron et.al.|[2304.06706v1](http://arxiv.org/abs/2304.06706v1)|null|\n", "2304.06287": "|**2023-04-13**|**NeRFVS: Neural Radiance Fields for Free View Synthesis via Geometry Scaffolds**|Chen Yang et.al.|[2304.06287v1](http://arxiv.org/abs/2304.06287v1)|null|\n", "2304.06969": "|**2023-04-14**|**UVA: Towards Unified Volumetric Avatar for View Synthesis, Pose rendering, Geometry and Texture Editing**|Jinlong Fan et.al.|[2304.06969v1](http://arxiv.org/abs/2304.06969v1)|null|\n", "2304.08279": "|**2023-04-17**|**MoDA: Modeling Deformable 3D Objects from Casual Videos**|Chaoyue Song et.al.|[2304.08279v1](http://arxiv.org/abs/2304.08279v1)|**[link](https://github.com/chaoyuesong/moda)**|\n", "2304.07979": "|**2023-04-17**|**NeRF-Loc: Visual Localization with Conditional Neural Radiance Field**|Jianlin Liu et.al.|[2304.07979v1](http://arxiv.org/abs/2304.07979v1)|**[link](https://github.com/jenningsl/nerf-loc)**|\n", "2304.07918": "|**2023-04-16**|**Likelihood-Based Generative Radiance Field with Latent Space Energy-Based Model for 3D-Aware Disentangled Image Representation**|Yaxuan Zhu et.al.|[2304.07918v1](http://arxiv.org/abs/2304.07918v1)|null|\n", "2304.07915": "|**2023-04-16**|**CAT-NeRF: Constancy-Aware Tx$^2$Former for Dynamic Body Modeling**|Haidong Zhu et.al.|[2304.07915v1](http://arxiv.org/abs/2304.07915v1)|**[link](https://github.com/haidongz-usc/CAT-NeRF)**|\n", "2304.07743": "|**2023-04-16**|**SeaThru-NeRF: Neural Radiance Fields in Scattering Media**|Deborah Levy et.al.|[2304.07743v1](http://arxiv.org/abs/2304.07743v1)|**[link](https://github.com/deborahLevy130/seathru_NeRF)**|\n", "2304.08971": "|**2023-04-18**|**SurfelNeRF: Neural Surfel Radiance Fields for Online Photorealistic Reconstruction of Indoor Scenes**|Yiming Gao et.al.|[2304.08971v1](http://arxiv.org/abs/2304.08971v1)|null|\n", "2304.08757": "|**2023-04-18**|**NeAI: A Pre-convoluted Representation for Plug-and-Play Neural Ambient Illumination**|Yiyu Zhuang et.al.|[2304.08757v1](http://arxiv.org/abs/2304.08757v1)|null|\n", "2304.09677": "|**2023-04-20**|**Reference-guided Controllable Inpainting of Neural Radiance Fields**|Ashkan Mirzaei et.al.|[2304.09677v2](http://arxiv.org/abs/2304.09677v2)|null|\n", "2304.10537": "|**2023-04-20**|**Learning Neural Duplex Radiance Fields for Real-Time View Synthesis**|Ziyu Wan et.al.|[2304.10537v1](http://arxiv.org/abs/2304.10537v1)|null|\n", "2304.10532": "|**2023-04-21**|**Nerfbusters: Removing Ghostly Artifacts from Casually Captured NeRFs**|Frederik Warburg et.al.|[2304.10532v2](http://arxiv.org/abs/2304.10532v2)|**[link](https://github.com/ethanweber/nerfbusters)**|\n", "2304.10448": "|**2023-04-20**|**ReLight My NeRF: A Dataset for Novel View Synthesis and Relighting of Real World Objects**|Marco Toschi et.al.|[2304.10448v1](http://arxiv.org/abs/2304.10448v1)|null|\n", "2304.10406": "|**2023-04-20**|**LiDAR-NeRF: Novel LiDAR View Synthesis via Neural Radiance Fields**|Tang Tao et.al.|[2304.10406v1](http://arxiv.org/abs/2304.10406v1)|**[link](https://github.com/tangtaogo/lidar-nerf)**|\n", "2304.10250": "|**2023-04-20**|**Revisiting Implicit Neural Representations in Low-Level Vision**|Wentian Xu et.al.|[2304.10250v1](http://arxiv.org/abs/2304.10250v1)|**[link](https://github.com/wentxul/linr)**|\n", "2304.10075": "|**2023-04-20**|**Multiscale Representation for Real-Time Anti-Aliasing Neural Rendering**|Dongting Hu et.al.|[2304.10075v1](http://arxiv.org/abs/2304.10075v1)|null|\n", "2304.10050": "|**2023-04-20**|**Neural Radiance Fields: Past, Present, and Future**|Ansh Mittal et.al.|[2304.10050v1](http://arxiv.org/abs/2304.10050v1)|null|\n", "2304.09987": "|**2023-04-19**|**Tetra-NeRF: Representing Neural Radiance Fields Using Tetrahedra**|Jonas Kulhanek et.al.|[2304.09987v1](http://arxiv.org/abs/2304.09987v1)|**[link](https://github.com/jkulhanek/tetra-nerf)**|\n", "2304.10780": "|**2023-04-21**|**Omni-Line-of-Sight Imaging for Holistic Shape Reconstruction**|Binbin Huang et.al.|[2304.10780v1](http://arxiv.org/abs/2304.10780v1)|null|\n", "2304.10664": "|**2023-04-20**|**A Comparative Neural Radiance Field (NeRF) 3D Analysis of Camera Poses from HoloLens Trajectories and Structure from Motion**|Miriam J\u00e4ger et.al.|[2304.10664v1](http://arxiv.org/abs/2304.10664v1)|null|\n", "2304.12308": "|**2023-04-26**|**Segment Anything in 3D with NeRFs**|Jiazhong Cen et.al.|[2304.12308v2](http://arxiv.org/abs/2304.12308v2)|null|\n", "2304.12294": "|**2023-04-24**|**Explicit Correspondence Matching for Generalizable Neural Radiance Fields**|Yuedong Chen et.al.|[2304.12294v1](http://arxiv.org/abs/2304.12294v1)|**[link](https://github.com/donydchen/matchnerf)**|\n", "2304.11842": "|**2023-04-25**|**Gen-NeRF: Efficient and Generalizable Neural Radiance Fields via Algorithm-Hardware Co-Design**|Yonggan Fu et.al.|[2304.11842v2](http://arxiv.org/abs/2304.11842v2)|null|\n", "2304.11470": "|**2023-04-22**|**3D-IntPhys: Towards More Generalized 3D-grounded Visual Intuitive Physics under Challenging Scenes**|Haotian Xue et.al.|[2304.11470v1](http://arxiv.org/abs/2304.11470v1)|null|\n", "2304.11448": "|**2023-04-22**|**Dehazing-NeRF: Neural Radiance Fields from Hazy Images**|Tian Li et.al.|[2304.11448v1](http://arxiv.org/abs/2304.11448v1)|null|\n", "2304.11342": "|**2023-04-22**|**NaviNeRF: NeRF-based 3D Representation Disentanglement by Latent Semantic Navigation**|Baao Xie et.al.|[2304.11342v1](http://arxiv.org/abs/2304.11342v1)|null|\n", "2304.11241": "|**2023-04-21**|**AutoNeRF: Training Implicit Scene Representations with Autonomous Agents**|Pierre Marza et.al.|[2304.11241v1](http://arxiv.org/abs/2304.11241v1)|null|\n", "2304.12746": "|**2023-04-25**|**Local Implicit Ray Function for Generalizable Radiance Field Representation**|Xin Huang et.al.|[2304.12746v1](http://arxiv.org/abs/2304.12746v1)|null|\n", "2304.12587": "|**2023-04-27**|**MF-NeRF: Memory Efficient NeRF with Mixed-Feature Hash Table**|Yongjae Lee et.al.|[2304.12587v3](http://arxiv.org/abs/2304.12587v3)|**[link](https://github.com/nfyfamr/mf-nerf)**|\n", "2304.12467": "|**2023-04-24**|**Instant-3D: Instant Neural Radiance Field Training Towards On-Device AR/VR 3D Reconstruction**|Sixu Li et.al.|[2304.12467v1](http://arxiv.org/abs/2304.12467v1)|null|\n", "2304.12439": "|**2023-04-24**|**TextMesh: Generation of Realistic 3D Meshes From Text Prompts**|Christina Tsalicoglou et.al.|[2304.12439v1](http://arxiv.org/abs/2304.12439v1)|null|\n", "2304.13518": "|**2023-04-26**|**Super-NeRF: View-consistent Detail Generation for NeRF super-resolution**|Yuqi Han et.al.|[2304.13518v1](http://arxiv.org/abs/2304.13518v1)|null|\n", "2304.13386": "|**2023-04-26**|**VGOS: Voxel Grid Optimization for View Synthesis from Sparse Inputs**|Jiakai Sun et.al.|[2304.13386v1](http://arxiv.org/abs/2304.13386v1)|**[link](https://github.com/sjojok/vgos)**|\n", "2304.14401": "|**2023-04-27**|**ActorsNeRF: Animatable Few-shot Human Rendering with Generalizable NeRFs**|Jiteng Mu et.al.|[2304.14401v1](http://arxiv.org/abs/2304.14401v1)|null|\n", "2304.14301": "|**2023-05-03**|**Combining HoloLens with Instant-NeRFs: Advanced Real-Time 3D Mobile Mapping**|Dennis Haitz et.al.|[2304.14301v2](http://arxiv.org/abs/2304.14301v2)|null|\n", "2304.14070": "|**2023-04-27**|**Compositional 3D Human-Object Neural Animation**|Zhi Hou et.al.|[2304.14070v1](http://arxiv.org/abs/2304.14070v1)|null|\n", "2304.14811": "|**2023-04-28**|**NeRF-LiDAR: Generating Realistic LiDAR Point Clouds with Neural Radiance Fields**|Junge Zhang et.al.|[2304.14811v1](http://arxiv.org/abs/2304.14811v1)|null|\n", "2304.14473": "|**2023-04-27**|**Learning a Diffusion Prior for NeRFs**|Guandao Yang et.al.|[2304.14473v1](http://arxiv.org/abs/2304.14473v1)|null|\n", "2305.00787": "|**2023-05-01**|**GeneFace++: Generalized and Stable Real-Time Audio-Driven 3D Talking Face Generation**|Zhenhui Ye et.al.|[2305.00787v1](http://arxiv.org/abs/2305.00787v1)|null|\n", "2305.00375": "|**2023-04-30**|**Neural Radiance Fields (NeRFs): A Review and Some Recent Developments**|Mohamed Debbagh et.al.|[2305.00375v1](http://arxiv.org/abs/2305.00375v1)|null|\n", "2305.00041": "|**2023-04-28**|**ViP-NeRF: Visibility Prior for Sparse Input Neural Radiance Fields**|Nagabhushan Somraj et.al.|[2305.00041v1](http://arxiv.org/abs/2305.00041v1)|**[link](https://github.com/NagabhushanSN95/ViP-NeRF)**|\n", "2305.01643": "|**2023-05-02**|**Neural LiDAR Fields for Novel View Synthesis**|Shengyu Huang et.al.|[2305.01643v1](http://arxiv.org/abs/2305.01643v1)|null|\n", "2305.01190": "|**2023-05-03**|**LatentAvatar: Learning Latent Expression Code for Expressive Neural Head Avatar**|Yuelang Xu et.al.|[2305.01190v2](http://arxiv.org/abs/2305.01190v2)|null|\n", "2305.01163": "|**2023-05-02**|**Federated Neural Radiance Fields**|Lachlan Holden et.al.|[2305.01163v1](http://arxiv.org/abs/2305.01163v1)|**[link](https://github.com/lachholden/fednerf-pytorch)**|\n", "2305.03049": "|**2023-05-04**|**NeuralEditor: Editing Neural Radiance Fields via Manipulating Point Clouds**|Jun-Kun Chen et.al.|[2305.03049v1](http://arxiv.org/abs/2305.03049v1)|null|\n", "2305.02756": "|**2023-05-04**|**Radiance Field Gradient Scaling for Unbiased Near-Camera Training**|Julien Philip et.al.|[2305.02756v1](http://arxiv.org/abs/2305.02756v1)|**[link](https://github.com/gradient-scaling/gradient-scaling.github.io)**|\n", "2305.02618": "|**2023-05-04**|**Semantic-aware Generation of Multi-view Portrait Drawings**|Biao Ma et.al.|[2305.02618v1](http://arxiv.org/abs/2305.02618v1)|**[link](https://github.com/aiart-hdu/sage)**|\n", "2305.03176": "|**2023-05-04**|**NeRF-QA: Neural Radiance Fields Quality Assessment Database**|Pedro Martin et.al.|[2305.03176v1](http://arxiv.org/abs/2305.03176v1)|null|\n", "2305.04789": "|**2023-05-08**|**AvatarReX: Real-time Expressive Full-body Avatars**|Zerong Zheng et.al.|[2305.04789v1](http://arxiv.org/abs/2305.04789v1)|null|\n", "2305.04296": "|**2023-05-07**|**HashCC: Lightweight Method to Improve the Quality of the Camera-less NeRF Scene Generation**|Jan Olszewski et.al.|[2305.04296v1](http://arxiv.org/abs/2305.04296v1)|null|\n", "2305.04268": "|**2023-05-07**|**Multi-Space Neural Radiance Fields**|Ze-Xin Yin et.al.|[2305.04268v1](http://arxiv.org/abs/2305.04268v1)|null|\n", "2305.05594": "|**2023-05-09**|**PET-NeuS: Positional Encoding Tri-Planes for Neural Surfaces**|Yiqun Wang et.al.|[2305.05594v1](http://arxiv.org/abs/2305.05594v1)|**[link](https://github.com/yiqun-wang/pet-neus)**|\n", "2305.04966": "|**2023-05-08**|**NerfAcc: Efficient Sampling Accelerates NeRFs**|Ruilong Li et.al.|[2305.04966v1](http://arxiv.org/abs/2305.04966v1)|null|\n", "2305.06131": "|**2023-05-10**|**Generative AI meets 3D: A Survey on Text-to-3D in AIGC Era**|Chenghao Li et.al.|[2305.06131v1](http://arxiv.org/abs/2305.06131v1)|null|\n", "2305.06118": "|**2023-05-10**|**NeRF$^\\textbf{2}$: Neural Radio-Frequency Radiance Fields**|Xiaopeng Zhao et.al.|[2305.06118v1](http://arxiv.org/abs/2305.06118v1)|null|\n", "2305.05766": "|**2023-05-09**|**Instant-NeRF: Instant On-Device Neural Radiance Field Training via Algorithm-Accelerator Co-Designed Near-Memory Processing**|Yang Zhao et.al.|[2305.05766v1](http://arxiv.org/abs/2305.05766v1)|null|\n", "2305.07342": "|**2023-05-12**|**BundleRecon: Ray Bundle-Based 3D Neural Reconstruction**|Weikun Zhang et.al.|[2305.07342v1](http://arxiv.org/abs/2305.07342v1)|null|\n", "2305.08851": "|**2023-05-15**|**MV-Map: Offboard HD-Map Generation with Multi-view Consistency**|Ziyang Xie et.al.|[2305.08851v1](http://arxiv.org/abs/2305.08851v1)|**[link](https://github.com/ziyang-xie/mv-map)**|\n", "2305.09761": "|**2023-05-16**|**NerfBridge: Bringing Real-time, Online Neural Radiance Field Training to Robotics**|Javier Yu et.al.|[2305.09761v1](http://arxiv.org/abs/2305.09761v1)|**[link](https://github.com/javieryu/nerf_bridge)**|\n", "2305.11167": "|**2023-05-18**|**MVPSNet: Fast Generalizable Multi-view Photometric Stereo**|Dongxu Zhao et.al.|[2305.11167v1](http://arxiv.org/abs/2305.11167v1)|null|\n", "2305.11031": "|**2023-05-18**|**ConsistentNeRF: Enhancing Neural Radiance Fields with 3D Consistency for Sparse View Synthesis**|Shoukang Hu et.al.|[2305.11031v1](http://arxiv.org/abs/2305.11031v1)|**[link](https://github.com/skhu101/consistentnerf)**|\n", "2305.10579": "|**2023-05-17**|**MultiPlaneNeRF: Neural Radiance Field with Non-Trainable Representation**|Dominik Zimny et.al.|[2305.10579v1](http://arxiv.org/abs/2305.10579v1)|**[link](https://github.com/gmum/multiplanenerf)**|\n", "2305.10503": "|**2023-05-24**|**OR-NeRF: Object Removing from 3D Scenes Guided by Multiview Segmentation with Neural Radiance Fields**|Youtan Yin et.al.|[2305.10503v2](http://arxiv.org/abs/2305.10503v2)|**[link](https://github.com/cuteyyt/or-nerf)**|\n", "2305.11588": "|**2023-05-19**|**Text2NeRF: Text-Driven 3D Scene Generation with Neural Radiance Fields**|Jingbo Zhang et.al.|[2305.11588v1](http://arxiv.org/abs/2305.11588v1)|null|\n", "2305.13307": "|**2023-05-22**|**NeRFuser: Large-Scale Scene Representation by NeRF Fusion**|Jiading Fang et.al.|[2305.13307v1](http://arxiv.org/abs/2305.13307v1)|**[link](https://github.com/ripl/nerfuser)**|\n", "2305.12843": "|**2023-05-22**|**Registering Neural Radiance Fields as 3D Density Images**|Han Jiang et.al.|[2305.12843v1](http://arxiv.org/abs/2305.12843v1)|null|\n", "2305.14093": "|**2023-05-24**|**3D Open-vocabulary Segmentation with Foundation Models**|Kunhao Liu et.al.|[2305.14093v2](http://arxiv.org/abs/2305.14093v2)|**[link](https://github.com/kunhao-liu/3d-ovs)**|\n", "2305.15171": "|**2023-05-31**|**Deceptive-NeRF: Enhancing NeRF Reconstruction using Pseudo-Observations from Diffusion Models**|Xinhang Liu et.al.|[2305.15171v2](http://arxiv.org/abs/2305.15171v2)|null|\n", "2305.15094": "|**2023-05-24**|**InpaintNeRF360: Text-Guided 3D Inpainting on Unbounded Neural Radiance Fields**|Dongqing Wang et.al.|[2305.15094v1](http://arxiv.org/abs/2305.15094v1)|null|\n", "2305.14831": "|**2023-05-24**|**OD-NeRF: Efficient Training of On-the-Fly Dynamic Neural Radiance Fields**|Zhiwen Yan et.al.|[2305.14831v1](http://arxiv.org/abs/2305.14831v1)|null|\n", "2305.16233": "|**2023-05-25**|**Interactive Segment Anything NeRF with Feature Imitation**|Xiaokang Chen et.al.|[2305.16233v1](http://arxiv.org/abs/2305.16233v1)|null|\n", "2305.16213": "|**2023-05-25**|**ProlificDreamer: High-Fidelity and Diverse Text-to-3D Generation with Variational Score Distillation**|Zhengyi Wang et.al.|[2305.16213v1](http://arxiv.org/abs/2305.16213v1)|**[link](https://github.com/thu-ml/prolificdreamer)**|\n", "2305.16914": "|**2023-06-06**|**PlaNeRF: SVD Unsupervised 3D Plane Regularization for NeRF Large-Scale Scene Reconstruction**|Fusang Wang et.al.|[2305.16914v3](http://arxiv.org/abs/2305.16914v3)|null|\n", "2305.16411": "|**2023-05-25**|**ZeroAvatar: Zero-shot 3D Avatar Generation from a Single Image**|Zhenzhen Weng et.al.|[2305.16411v1](http://arxiv.org/abs/2305.16411v1)|null|\n", "2305.18079": "|**2023-05-31**|**Towards a Robust Framework for NeRF Evaluation**|Adrian Azzarelli et.al.|[2305.18079v3](http://arxiv.org/abs/2305.18079v3)|**[link](https://github.com/azzarelli/wape)**|\n", "2305.17916": "|**2023-05-31**|**Volume Feature Rendering for Fast Neural Radiance Field Reconstruction**|Kang Han et.al.|[2305.17916v2](http://arxiv.org/abs/2305.17916v2)|null|\n", "2305.19201": "|**2023-05-30**|**D\u00e4RF: Boosting Radiance Fields from Sparse Inputs with Monocular Depth Adaptation**|Jiuhn Song et.al.|[2305.19201v1](http://arxiv.org/abs/2305.19201v1)|**[link](https://github.com/KU-CVLAB/DaRF)**|\n", "2305.19065": "|**2023-05-30**|**Template-free Articulated Neural Point Clouds for Reposable View Synthesis**|Lukas Uzolas et.al.|[2305.19065v1](http://arxiv.org/abs/2305.19065v1)|**[link](https://github.com/lukasuz/articulated-point-nerf)**|\n", "2305.18766": "|**2023-05-31**|**HiFA: High-fidelity Text-to-3D with Advanced Diffusion Guidance**|Junzhe Zhu et.al.|[2305.18766v2](http://arxiv.org/abs/2305.18766v2)|null|\n", "2306.00783": "|**2023-06-01**|**FDNeRF: Semantics-Driven Face Reconstruction, Prompt Editing and Relighting with Diffusion Models**|Hao Zhang et.al.|[2306.00783v1](http://arxiv.org/abs/2306.00783v1)|**[link](https://github.com/billyxyb/fdnerf)**|\n", "2306.00696": "|**2023-06-01**|**Analyzing the Internals of Neural Radiance Fields**|Lukas Radl et.al.|[2306.00696v1](http://arxiv.org/abs/2306.00696v1)|**[link](https://github.com/r4dl/nerfinternals)**|\n", "2306.00547": "|**2023-06-02**|**AvatarStudio: Text-driven Editing of 3D Dynamic Human Head Avatars**|Mohit Mendiratta et.al.|[2306.00547v2](http://arxiv.org/abs/2306.00547v2)|null|\n", "2306.03000": "|**2023-06-05**|**BeyondPixels: A Comprehensive Review of the Evolution of Neural Radiance Fields**|AKM Shahariar Azad Rabby et.al.|[2306.03000v1](http://arxiv.org/abs/2306.03000v1)|null|\n", "2306.02741": "|**2023-06-05**|**ZIGNeRF: Zero-shot 3D Scene Representation with Invertible Generative Neural Radiance Fields**|Kanghyeok Ko et.al.|[2306.02741v1](http://arxiv.org/abs/2306.02741v1)|null|\n", "2306.03727": "|**2023-06-06**|**Towards Visual Foundational Models of Physical Scenes**|Chethan Parameshwara et.al.|[2306.03727v1](http://arxiv.org/abs/2306.03727v1)|null|\n", "2306.03576": "|**2023-06-06**|**Human 3D Avatar Modeling with Implicit Neural Representation: A Brief Survey**|Mingyang Sun et.al.|[2306.03576v1](http://arxiv.org/abs/2306.03576v1)|null|\n", "2306.03207": "|**2023-06-05**|**H2-Mapping: Real-time Dense Mapping Using Hierarchical Hybrid Representation**|Chenxing Jiang et.al.|[2306.03207v1](http://arxiv.org/abs/2306.03207v1)|**[link](https://github.com/sysu-star/h2-mapping)**|\n", "2306.05410": "|**2023-06-08**|**LU-NeRF: Scene and Pose Estimation by Synchronizing Local Unposed NeRFs**|Zezhou Cheng et.al.|[2306.05410v1](http://arxiv.org/abs/2306.05410v1)|null|\n", "2306.05303": "|**2023-06-08**|**Enhance-NeRF: Multiple Performance Evaluation for Neural Radiance Fields**|Qianqiu Tan et.al.|[2306.05303v1](http://arxiv.org/abs/2306.05303v1)|**[link](https://github.com/tanqianq/enhance-nerf)**|\n", "2306.06093": "|**2023-06-09**|**HyP-NeRF: Learning Improved NeRF Priors using a HyperNetwork**|Bipasha Sen et.al.|[2306.06093v1](http://arxiv.org/abs/2306.06093v1)|null|\n", "2306.06044": "|**2023-06-09**|**GANeRF: Leveraging Discriminators to Optimize Neural Radiance Fields**|Barbara Roessle et.al.|[2306.06044v1](http://arxiv.org/abs/2306.06044v1)|null|\n", "2306.05668": "|**2023-06-09**|**RePaint-NeRF: NeRF Editting via Semantic Masks and Diffusion Models**|Xingchen Zhou et.al.|[2306.05668v1](http://arxiv.org/abs/2306.05668v1)|null|\n", "2306.06388": "|**2023-06-10**|**From NeRFLiX to NeRFLiX++: A General NeRF-Agnostic Restorer Paradigm**|Kun Zhou et.al.|[2306.06388v1](http://arxiv.org/abs/2306.06388v1)|null|\n", "2306.06300": "|**2023-06-15**|**NERFBK: A High-Quality Benchmark for NERF-Based 3D Reconstruction**|Ali Karami et.al.|[2306.06300v2](http://arxiv.org/abs/2306.06300v2)|**[link](https://github.com/3dom-fbk/nerfbk)**|\n", "2306.07581": "|**2023-06-13**|**Binary Radiance Fields**|Seungjoo Shin et.al.|[2306.07581v1](http://arxiv.org/abs/2306.07581v1)|null|\n", "2306.09349": "|**2023-06-16**|**UrbanIR: Large-Scale Urban Scene Inverse Rendering from a Single Video**|Zhi-Hao Lin et.al.|[2306.09349v2](http://arxiv.org/abs/2306.09349v2)|null|\n", "2306.08068": "|**2023-06-13**|**DORSal: Diffusion for Object-centric Representations of Scenes $\\textit{et al.}$**|Allan Jabri et.al.|[2306.08068v1](http://arxiv.org/abs/2306.08068v1)|null|\n", "2306.09551": "|**2023-06-15**|**Edit-DiffNeRF: Editing 3D Neural Radiance Fields using 2D Diffusion Model**|Lu Yu et.al.|[2306.09551v1](http://arxiv.org/abs/2306.09551v1)|null|\n", "2306.11556": "|**2023-06-20**|**NeRF synthesis with shading guidance**|Chenbin Li et.al.|[2306.11556v1](http://arxiv.org/abs/2306.11556v1)|null|\n", "2306.10350": "|**2023-06-24**|**MA-NeRF: Motion-Assisted Neural Radiance Fields for Face Synthesis from Sparse Images**|Weichen Zhang et.al.|[2306.10350v2](http://arxiv.org/abs/2306.10350v2)|null|\n", "2306.12423": "|**2023-06-21**|**Benchmarking and Analyzing 3D-aware Image Synthesis with a Modularized Codebase**|Qiuyu Wang et.al.|[2306.12423v1](http://arxiv.org/abs/2306.12423v1)|**[link](https://github.com/qiuyu96/carver)**|\n", "2306.12422": "|**2023-06-21**|**DreamTime: An Improved Optimization Strategy for Text-to-3D Content Creation**|Yukun Huang et.al.|[2306.12422v1](http://arxiv.org/abs/2306.12422v1)|null|\n", "2306.12760": "|**2023-06-22**|**Blended-NeRF: Zero-Shot Object Generation and Blending in Existing Neural Radiance Fields**|Ori Gordon et.al.|[2306.12760v1](http://arxiv.org/abs/2306.12760v1)|**[link](https://github.com/orig333/Blended-NeRF)**|\n", "2306.12570": "|**2023-06-21**|**Local 3D Editing via 3D Distillation of CLIP Knowledge**|Junha Hyung et.al.|[2306.12570v1](http://arxiv.org/abs/2306.12570v1)|null|\n", "2306.15203": "|**2023-06-27**|**Unsupervised Polychromatic Neural Representation for CT Metal Artifact Reduction**|Qing Wu et.al.|[2306.15203v1](http://arxiv.org/abs/2306.15203v1)|**[link](https://github.com/iwuqing/polyner)**|\n", "2306.16541": "|**2023-06-28**|**Envisioning a Next Generation Extended Reality Conferencing System with Efficient Photorealistic Human Rendering**|Chuanyue Shen et.al.|[2306.16541v1](http://arxiv.org/abs/2306.16541v1)|null|\n", "2306.17723": "|**2023-07-16**|**FlipNeRF: Flipped Reflection Rays for Few-shot Novel View Synthesis**|Seunghyeon Seo et.al.|[2306.17723v2](http://arxiv.org/abs/2306.17723v2)|**[link](https://github.com/shawn615/FlipNeRF)**|\n", "2306.17624": "|**2023-07-03**|**Sphere2Vec: A General-Purpose Location Representation Learning over a Spherical Surface for Large-Scale Geospatial Predictions**|Gengchen Mai et.al.|[2306.17624v2](http://arxiv.org/abs/2306.17624v2)|null|\n", "2307.03441": "|**2023-07-07**|**NOFA: NeRF-based One-shot Facial Avatar Reconstruction**|Wangbo Yu et.al.|[2307.03441v1](http://arxiv.org/abs/2307.03441v1)|null|\n", "2307.03404": "|**2023-07-07**|**RGB-D Mapping and Tracking in a Plenoxel Radiance Field**|Andreas L. Teigen et.al.|[2307.03404v1](http://arxiv.org/abs/2307.03404v1)|**[link](https://github.com/ysus33/rgb-d_plenoxel_mapping_tracking)**|\n", "2307.05087": "|**2023-07-11**|**SAR-NeRF: Neural Radiance Fields for Synthetic Aperture Radar Multi-View Representation**|Zhengxin Lei et.al.|[2307.05087v1](http://arxiv.org/abs/2307.05087v1)|null|\n", "2307.08093": "|**2023-07-16**|**Cross-Ray Neural Radiance Fields for Novel-view Synthesis from Unconstrained Image Collections**|Yifan Yang et.al.|[2307.08093v1](http://arxiv.org/abs/2307.08093v1)|**[link](https://github.com/yifyang993/cr-nerf-pytorch)**|\n", "2307.07729": "|**2023-07-15**|**Improving NeRF with Height Data for Utilization of GIS Data**|Hinata Aoki et.al.|[2307.07729v1](http://arxiv.org/abs/2307.07729v1)|null|\n", "2307.09323": "|**2023-07-18**|**Efficient Region-Aware Neural Radiance Fields for High-Fidelity Talking Portrait Synthesis**|Jiahe Li et.al.|[2307.09323v1](http://arxiv.org/abs/2307.09323v1)|**[link](https://github.com/fictionarry/er-nerf)**|\n", "2307.10135": "|**2023-07-19**|**An Improved NeuMIP with Better Accuracy**|Bowen Xue et.al.|[2307.10135v1](http://arxiv.org/abs/2307.10135v1)|null|\n", "2307.09860": "|**2023-07-19**|**Magic NeRF Lens: Interactive Fusion of Neural Radiance Fields for Virtual Facility Inspection**|Ke Li et.al.|[2307.09860v1](http://arxiv.org/abs/2307.09860v1)|**[link](https://github.com/uhhhci/immersive-ngp)**|\n", "2307.09555": "|**2023-07-14**|**Transient Neural Radiance Fields for Lidar View Synthesis and 3D Reconstruction**|Anagh Malik et.al.|[2307.09555v1](http://arxiv.org/abs/2307.09555v1)|null|\n", "2307.10776": "|**2023-07-20**|**Urban Radiance Field Representation with Deformable Neural Mesh Primitives**|Fan Lu et.al.|[2307.10776v1](http://arxiv.org/abs/2307.10776v1)|null|\n", "2307.10664": "|**2023-07-20**|**Lighting up NeRF via Unsupervised Decomposition and Enhancement**|Haoyuan Wang et.al.|[2307.10664v1](http://arxiv.org/abs/2307.10664v1)|**[link](https://github.com/onpix/LLNeRF)**|\n", "2307.11526": "|**2023-07-29**|**CopyRNeRF: Protecting the CopyRight of Neural Radiance Fields**|Ziyuan Luo et.al.|[2307.11526v2](http://arxiv.org/abs/2307.11526v2)|null|\n", "2307.11418": "|**2023-08-07**|**FaceCLIPNeRF: Text-driven 3D Face Manipulation using Deformable Neural Radiance Fields**|Sungwon Hwang et.al.|[2307.11418v2](http://arxiv.org/abs/2307.11418v2)|null|\n", "2307.11335": "|**2023-07-21**|**Tri-MipRF: Tri-Mip Representation for Efficient Anti-Aliasing Neural Radiance Fields**|Wenbo Hu et.al.|[2307.11335v1](http://arxiv.org/abs/2307.11335v1)|null|\n", "2307.12909": "|**2023-07-24**|**Dyn-E: Local Appearance Editing of Dynamic Neural Radiance Fields**|Shangzhan Zhang et.al.|[2307.12909v1](http://arxiv.org/abs/2307.12909v1)|null|\n", "2307.12718": "|**2023-07-24**|**CarPatch: A Synthetic Benchmark for Radiance Field Evaluation on Vehicle Components**|Davide Di Nucci et.al.|[2307.12718v1](http://arxiv.org/abs/2307.12718v1)|null|\n", "2307.12291": "|**2023-07-23**|**TransHuman: A Transformer-based Human Representation for Generalizable Neural Human Rendering**|Xiao Pan et.al.|[2307.12291v1](http://arxiv.org/abs/2307.12291v1)|null|\n", "2307.13908": "|**2023-07-26**|**Points-to-3D: Bridging the Gap between Sparse Points and Shape-Controllable Text-to-3D Generation**|Chaohui Yu et.al.|[2307.13908v1](http://arxiv.org/abs/2307.13908v1)|null|\n", "2307.15058": "|**2023-07-27**|**MARS: An Instance-aware, Modular and Realistic Simulator for Autonomous Driving**|Zirui Wu et.al.|[2307.15058v1](http://arxiv.org/abs/2307.15058v1)|**[link](https://github.com/open-air-sun/mars)**|\n", "2307.14620": "|**2023-07-27**|**NeRF-Det: Learning Geometry-Aware Volumetric Representation for Multi-View 3D Object Detection**|Chenfeng Xu et.al.|[2307.14620v1](http://arxiv.org/abs/2307.14620v1)|**[link](https://github.com/facebookresearch/nerf-det)**|\n", "2307.15333": "|**2023-07-28**|**Dynamic PlenOctree for Adaptive Sampling Refinement in Explicit NeRF**|Haotian Bai et.al.|[2307.15333v1](http://arxiv.org/abs/2307.15333v1)|null|\n", "2307.15131": "|**2023-07-27**|**Seal-3D: Interactive Pixel-Level Editing for Neural Radiance Fields**|Xiangyu Wang et.al.|[2307.15131v1](http://arxiv.org/abs/2307.15131v1)|**[link](https://github.com/windingwind/seal-3d)**|\n", "2308.00462": "|**2023-08-01**|**Context-Aware Talking-Head Video Editing**|Songlin Yang et.al.|[2308.00462v1](http://arxiv.org/abs/2308.00462v1)|null|\n", "2308.01262": "|**2023-08-02**|**Incorporating Season and Solar Specificity into Renderings made by a NeRF Architecture using Satellite Images**|Michael Gableman et.al.|[2308.01262v1](http://arxiv.org/abs/2308.01262v1)|**[link](https://github.com/enterprisecv-6/season-nerf)**|\n", "2308.00773": "|**2023-08-01**|**High-Fidelity Eye Animatable Neural Radiance Fields for Human Face**|Hengfei Wang et.al.|[2308.00773v1](http://arxiv.org/abs/2308.00773v1)|null|\n", "2308.02191": "|**2023-08-04**|**ES-MVSNet: Efficient Framework for End-to-end Self-supervised Multi-View Stereo**|Qiang Zhou et.al.|[2308.02191v1](http://arxiv.org/abs/2308.02191v1)|null|\n", "2308.03280": "|**2023-08-07**|**Mirror-NeRF: Learning Neural Radiance Fields for Mirrors with Whitted-Style Ray Tracing**|Junyi Zeng et.al.|[2308.03280v1](http://arxiv.org/abs/2308.03280v1)|null|\n", "2308.02908": "|**2023-08-05**|**Where and How: Mitigating Confusion in Neural Radiance Fields from Sparse Inputs**|Yanqi Bao et.al.|[2308.02908v1](http://arxiv.org/abs/2308.02908v1)|**[link](https://github.com/bbbbby-99/wah-nerf)**|\n", "2308.02840": "|**2023-08-05**|**Learning Unified Decompositional and Compositional NeRF for Editable Novel View Synthesis**|Yuxin Wang et.al.|[2308.02840v1](http://arxiv.org/abs/2308.02840v1)|null|\n", "2308.02751": "|**2023-08-05**|**NeRFs: The Search for the Best 3D Representation**|Ravi Ramamoorthi et.al.|[2308.02751v1](http://arxiv.org/abs/2308.02751v1)|null|\n", "2308.04413": "|**2023-08-08**|**Digging into Depth Priors for Outdoor Neural Radiance Fields**|Chen Wang et.al.|[2308.04413v1](http://arxiv.org/abs/2308.04413v1)|null|\n", "2308.03772": "|**2023-07-27**|**Improved Neural Radiance Fields Using Pseudo-depth and Fusion**|Jingliang Li et.al.|[2308.03772v1](http://arxiv.org/abs/2308.03772v1)|null|\n", "2308.04826": "|**2023-08-09**|**WaveNeRF: Wavelet-based Generalizable Neural Radiance Fields**|Muyu Xu et.al.|[2308.04826v1](http://arxiv.org/abs/2308.04826v1)|null|\n", "2308.04669": "|**2023-08-14**|**A General Implicit Framework for Fast NeRF Composition and Rendering**|Xinyu Gao et.al.|[2308.04669v2](http://arxiv.org/abs/2308.04669v2)|null|\n", "2308.05970": "|**2023-08-11**|**Focused Specific Objects NeRF**|Yuesong Li et.al.|[2308.05970v1](http://arxiv.org/abs/2308.05970v1)|null|\n", "2308.05939": "|**2023-08-11**|**VERF: Runtime Monitoring of Pose Estimation with Neural Radiance Fields**|Dominic Maggio et.al.|[2308.05939v1](http://arxiv.org/abs/2308.05939v1)|null|\n", "2308.07118": "|**2023-08-16**|**Neural radiance fields in the industrial and robotics domain: applications, research opportunities and use cases**|Eugen \u0160lapak et.al.|[2308.07118v2](http://arxiv.org/abs/2308.07118v2)|**[link](https://github.com/maftej/iisnerf)**|\n", "2308.07032": "|**2023-08-14**|**S3IM: Stochastic Structural SIMilarity and Its Unreasonable Effectiveness for Neural Fields**|Zeke Xie et.al.|[2308.07032v1](http://arxiv.org/abs/2308.07032v1)|**[link](https://github.com/madaoer/s3im_nerf)**|\n", "2308.08530": "|**2023-08-21**|**Ref-DVGO: Reflection-Aware Direct Voxel Grid Optimization for an Improved Quality-Efficiency Trade-Off in Reflective Scene Reconstruction**|Georgios Kouros et.al.|[2308.08530v3](http://arxiv.org/abs/2308.08530v3)|**[link](https://github.com/gkouros/ref-dvgo)**|\n", "2308.08258": "|**2023-08-16**|**SceNeRFlow: Time-Consistent Reconstruction of General Dynamic Scenes**|Edith Tretschk et.al.|[2308.08258v1](http://arxiv.org/abs/2308.08258v1)|null|\n", "2308.09421": "|**2023-08-18**|**MonoNeRD: NeRF-like Representations for Monocular 3D Object Detection**|Junkai Xu et.al.|[2308.09421v1](http://arxiv.org/abs/2308.09421v1)|**[link](https://github.com/cskkxjk/mononerd)**|\n", "2308.09386": "|**2023-08-18**|**DReg-NeRF: Deep Registration for Neural Radiance Fields**|Yu Chen et.al.|[2308.09386v1](http://arxiv.org/abs/2308.09386v1)|**[link](https://github.com/aibluefisher/dreg-nerf)**|\n", "2308.08947": "|**2023-08-17**|**Watch Your Steps: Local Image and Scene Editing by Text Instructions**|Ashkan Mirzaei et.al.|[2308.08947v1](http://arxiv.org/abs/2308.08947v1)|null|\n", "2308.10902": "|**2023-08-30**|**CamP: Camera Preconditioning for Neural Radiance Fields**|Keunhong Park et.al.|[2308.10902v2](http://arxiv.org/abs/2308.10902v2)|null|\n", "2308.10337": "|**2023-08-20**|**Strata-NeRF : Neural Radiance Fields for Stratified Scenes**|Ankit Dhiman et.al.|[2308.10337v1](http://arxiv.org/abs/2308.10337v1)|null|\n", "2308.10122": "|**2023-08-19**|**HollowNeRF: Pruning Hashgrid-Based NeRFs with Trainable Collision Mitigation**|Xiufeng Xie et.al.|[2308.10122v1](http://arxiv.org/abs/2308.10122v1)|null|\n", "2308.10001": "|**2023-08-19**|**AltNeRF: Learning Robust Neural Radiance Field via Alternating Depth-Pose Optimization**|Kun Wang et.al.|[2308.10001v1](http://arxiv.org/abs/2308.10001v1)|null|\n", "2308.09894": "|**2023-08-19**|**Semantic-Human: Neural Rendering of Humans from Monocular Video with Human Parsing**|Jie Zhang et.al.|[2308.09894v1](http://arxiv.org/abs/2308.09894v1)|null|\n", "2308.11198": "|**2023-08-22**|**Novel-view Synthesis and Pose Estimation for Hand-Object Interaction from Sparse Views**|Wentian Qu et.al.|[2308.11198v1](http://arxiv.org/abs/2308.11198v1)|null|\n", "2308.11130": "|**2023-08-22**|**Efficient View Synthesis with Neural Radiance Distribution Field**|Yushuang Wu et.al.|[2308.11130v1](http://arxiv.org/abs/2308.11130v1)|null|\n", "2308.11974": "|**2023-08-23**|**Blending-NeRF: Text-Driven Localized Editing in Neural Radiance Fields**|Hyeonseop Song et.al.|[2308.11974v1](http://arxiv.org/abs/2308.11974v1)|null|\n", "2308.11951": "|**2023-08-25**|**Pose Modulated Avatars from Video**|Chunjin Song et.al.|[2308.11951v2](http://arxiv.org/abs/2308.11951v2)|null|\n", "2308.11793": "|**2023-08-22**|**Enhancing NeRF akin to Enhancing LLMs: Generalizable NeRF Transformer with Mixture-of-View-Experts**|Wenyan Cong et.al.|[2308.11793v1](http://arxiv.org/abs/2308.11793v1)|**[link](https://github.com/vita-group/gnt-move)**|\n", "2308.11774": "|**2023-08-22**|**SAMSNeRF: Segment Anything Model (SAM) Guides Dynamic Surgical Scene Reconstruction by Neural Radiance Field (NeRF)**|Ange Lou et.al.|[2308.11774v1](http://arxiv.org/abs/2308.11774v1)|null|\n", "2308.12560": "|**2023-08-24**|**NOVA: NOvel View Augmentation for Neural Composition of Dynamic Objects**|Dakshit Agrawal et.al.|[2308.12560v1](http://arxiv.org/abs/2308.12560v1)|**[link](https://github.com/dakshitagrawal/nova)**|\n", "2308.13897": "|**2023-08-26**|**InsertNeRF: Instilling Generalizability into NeRF with HyperNet Modules**|Yanqi Bao et.al.|[2308.13897v1](http://arxiv.org/abs/2308.13897v1)|**[link](https://github.com/bbbbby-99/insertnerf)**|\n", "2308.15049": "|**2023-08-29**|**Pose-Free Neural Radiance Fields via Implicit Pose Regularization**|Jiahui Zhang et.al.|[2308.15049v1](http://arxiv.org/abs/2308.15049v1)|null|\n", "2308.14816": "|**2023-08-28**|**CLNeRF: Continual Learning Meets NeRF**|Zhipeng Cai et.al.|[2308.14816v1](http://arxiv.org/abs/2308.14816v1)|**[link](https://github.com/intellabs/clnerf)**|\n", "2308.16041": "|**2023-08-30**|**From Pixels to Portraits: A Comprehensive Survey of Talking Head Generation Techniques and Applications**|Shreyank N Gowda et.al.|[2308.16041v1](http://arxiv.org/abs/2308.16041v1)|null|\n", "2308.15733": "|**2023-08-30**|**Drone-NeRF: Efficient NeRF Based 3D Scene Reconstruction for Large-Scale Drone Survey**|Zhihao Jia et.al.|[2308.15733v1](http://arxiv.org/abs/2308.15733v1)|null|\n", "2308.15547": "|**2023-08-29**|**Efficient Ray Sampling for Radiance Fields Reconstruction**|Shilei Sun et.al.|[2308.15547v1](http://arxiv.org/abs/2308.15547v1)|null|\n", "2308.16576": "|**2023-09-03**|**GHuNeRF: Generalizable Human NeRF from a Monocular Video**|Chen Li et.al.|[2308.16576v2](http://arxiv.org/abs/2308.16576v2)|null|\n", "2309.00277": "|**2023-09-01**|**SparseSat-NeRF: Dense Depth Supervised Neural Radiance Fields for Sparse Satellite Images**|Lulin Zhang et.al.|[2309.00277v1](http://arxiv.org/abs/2309.00277v1)|**[link](https://github.com/lulinzhang/sps-nerf)**|\n", "2309.00014": "|**2023-09-04**|**Improving NeRF Quality by Progressive Camera Placement for Unrestricted Navigation in Complex Environments**|Georgios Kopanas et.al.|[2309.00014v2](http://arxiv.org/abs/2309.00014v2)|null|\n", "2309.01811": "|**2023-09-06**|**Instant Continual Learning of Neural Radiance Fields**|Ryan Po et.al.|[2309.01811v2](http://arxiv.org/abs/2309.01811v2)|null|\n", "2309.01351": "|**2023-09-04**|**Adv3D: Generating 3D Adversarial Examples in Driving Scenarios with NeRF**|Leheng Li et.al.|[2309.01351v1](http://arxiv.org/abs/2309.01351v1)|null|\n", "2309.03185": "|**2023-09-06**|**Bayes' Rays: Uncertainty Quantification for Neural Radiance Fields**|Lily Goli et.al.|[2309.03185v1](http://arxiv.org/abs/2309.03185v1)|**[link](https://github.com/BayesRays/BayesRays)**|\n", "2309.03160": "|**2023-09-06**|**ResFields: Residual Neural Fields for Spatiotemporal Signals**|Marko Mihajlovic et.al.|[2309.03160v1](http://arxiv.org/abs/2309.03160v1)|**[link](https://github.com/markomih/ResFields)**|\n", "2309.03550": "|**2023-09-07**|**Text2Control3D: Controllable 3D Avatar Generation in Neural Radiance Fields using Geometry-Guided Text-to-Image Diffusion Model**|Sungwon Hwang et.al.|[2309.03550v1](http://arxiv.org/abs/2309.03550v1)|null|\n", "2309.04410": "|**2023-09-08**|**DeformToon3D: Deformable 3D Toonification from Neural Radiance Fields**|Junzhe Zhang et.al.|[2309.04410v1](http://arxiv.org/abs/2309.04410v1)|**[link](https://github.com/junzhezhang/deformtoon3d)**|\n", "2309.03955": "|**2023-09-14**|**SimpleNeRF: Regularizing Sparse Input Neural Radiance Fields with Simpler Solutions**|Nagabhushan Somraj et.al.|[2309.03955v2](http://arxiv.org/abs/2309.03955v2)|null|\n", "2309.03933": "|**2023-09-07**|**BluNF: Blueprint Neural Field**|Robin Courant et.al.|[2309.03933v1](http://arxiv.org/abs/2309.03933v1)|null|\n", "2309.05339": "|**2023-09-11**|**PAg-NeRF: Towards fast and efficient end-to-end panoptic 3D representations for agricultural robotics**|Claus Smitt et.al.|[2309.05339v1](http://arxiv.org/abs/2309.05339v1)|null|\n", "2309.04917": "|**2023-09-10**|**Text-driven Editing of 3D Scenes without Retraining**|Shuangkang Fang et.al.|[2309.04917v1](http://arxiv.org/abs/2309.04917v1)|**[link](https://github.com/Fangkang515/DN2N)**|\n", "2309.04750": "|**2023-09-09**|**Mirror-Aware Neural Humans**|Daniel Ajisafe et.al.|[2309.04750v1](http://arxiv.org/abs/2309.04750v1)|null|\n", "2309.04581": "|**2023-09-08**|**Dynamic Mesh-Aware Radiance Fields**|Yi-Ling Qiao et.al.|[2309.04581v1](http://arxiv.org/abs/2309.04581v1)|null|\n", "2309.06030": "|**2023-09-12**|**Federated Learning for Large-Scale Scene Modeling with Neural Radiance Fields**|Teppei Suzuki et.al.|[2309.06030v1](http://arxiv.org/abs/2309.06030v1)|null|\n", "2309.07125": "|**2023-09-13**|**Text-Guided Generation and Editing of Compositional 3D Avatars**|Hao Zhang et.al.|[2309.07125v1](http://arxiv.org/abs/2309.07125v1)|null|\n", "2309.06802": "|**2023-09-13**|**Dynamic NeRFs for Soccer Scenes**|Sacha Lewin et.al.|[2309.06802v1](http://arxiv.org/abs/2309.06802v1)|null|\n", "2309.07846": "|**2023-09-14**|**MC-NeRF: Muti-Camera Neural Radiance Fields for Muti-Camera Image Acquisition Systems**|Yu Gao et.al.|[2309.07846v1](http://arxiv.org/abs/2309.07846v1)|null|\n", "2309.07752": "|**2023-09-14**|**DT-NeRF: Decomposed Triplane-Hash Neural Radiance Fields for High-Fidelity Talking Portrait Synthesis**|Yaoyu Su et.al.|[2309.07752v1](http://arxiv.org/abs/2309.07752v1)|null|\n", "2309.07668": "|**2023-09-14**|**CoRF : Colorizing Radiance Fields using Knowledge Distillation**|Ankit Dhiman et.al.|[2309.07668v1](http://arxiv.org/abs/2309.07668v1)|null|\n", "2309.08596": "|**2023-09-15**|**Robust e-NeRF: NeRF from Sparse & Noisy Events under Non-Uniform Motion**|Weng Fei Low et.al.|[2309.08596v1](http://arxiv.org/abs/2309.08596v1)|**[link](https://github.com/wengflow/robust-e-nerf)**|\n", "2309.08040": "|**2023-09-14**|**Gradient based Grasp Pose Optimization on a NeRF that Approximates Grasp Success**|Gergely S\u00f3ti et.al.|[2309.08040v1](http://arxiv.org/abs/2309.08040v1)|null|\n", "2309.09502": "|**2023-09-18**|**RenderOcc: Vision-Centric 3D Occupancy Prediction with 2D Rendering Supervision**|Mingjie Pan et.al.|[2309.09502v1](http://arxiv.org/abs/2309.09502v1)|**[link](https://github.com/pmj110119/renderocc)**|\n", "2309.09295": "|**2023-09-17**|**NeRF-VINS: A Real-time Neural Radiance Field Map-based Visual-Inertial Navigation System**|Saimouli Katragadda et.al.|[2309.09295v1](http://arxiv.org/abs/2309.09295v1)|null|\n", "2309.08927": "|**2023-09-16**|**DynaMoN: Motion-Aware Fast And Robust Camera Localization for Dynamic NeRF**|Mert Asim Karaoglu et.al.|[2309.08927v1](http://arxiv.org/abs/2309.08927v1)|null|\n", "2309.10684": "|**2023-09-19**|**Locally Stylized Neural Radiance Fields**|Hong-Wing Pang et.al.|[2309.10684v1](http://arxiv.org/abs/2309.10684v1)|null|\n", "2309.10503": "|**2023-09-19**|**Steganography for Neural Radiance Fields by Backdooring**|Weina Dong et.al.|[2309.10503v1](http://arxiv.org/abs/2309.10503v1)|null|\n", "2309.10011": "|**2023-09-18**|**Instant Photorealistic Style Transfer: A Lightweight and Adaptive Approach**|Rong Liu et.al.|[2309.10011v1](http://arxiv.org/abs/2309.10011v1)|null|\n", "2309.11009": "|**2023-09-21**|**Controllable Dynamic Appearance for Neural 3D Portraits**|ShahRukh Athar et.al.|[2309.11009v2](http://arxiv.org/abs/2309.11009v2)|null|\n", "2309.10987": "|**2023-09-20**|**Spiking NeRF: Making Bio-inspired Neural Networks See through the Real World**|Xingting Yao et.al.|[2309.10987v1](http://arxiv.org/abs/2309.10987v1)|null|\n", "2309.12183": "|**2023-09-21**|**ORTexME: Occlusion-Robust Human Shape and Pose via Temporal Average Texture and Mesh Encoding**|Yu Cheng et.al.|[2309.12183v1](http://arxiv.org/abs/2309.12183v1)|null|\n", "2309.11966": "|**2023-09-21**|**NeuralLabeling: A versatile toolset for labeling vision datasets using Neural Radiance Fields**|Floris Erich et.al.|[2309.11966v1](http://arxiv.org/abs/2309.11966v1)|**[link](https://github.com/FlorisE/neural-labeling)**|\n", "2309.11767": "|**2023-09-21**|**Fast Satellite Tensorial Radiance Field for Multi-date Satellite Imagery of Large Size**|Tongtong Zhang et.al.|[2309.11767v1](http://arxiv.org/abs/2309.11767v1)|null|\n", "2309.11747": "|**2023-09-21**|**MarkNerf:Watermarking for Neural Radiance Field**|Lifeng Chen et.al.|[2309.11747v1](http://arxiv.org/abs/2309.11747v1)|null|\n", "2309.11698": "|**2023-09-21**|**Rendering stable features improves sampling-based localisation with Neural radiance fields**|Boxuan Zhang et.al.|[2309.11698v1](http://arxiv.org/abs/2309.11698v1)|null|\n", "2309.11627": "|**2023-09-20**|**GenLayNeRF: Generalizable Layered Representations with 3D Model Alignment for Multi-Human View Synthesis**|Youssef Abdelkareem et.al.|[2309.11627v1](http://arxiv.org/abs/2309.11627v1)|null|\n", "2309.11525": "|**2023-09-23**|**Light Field Diffusion for Single-View Novel View Synthesis**|Yifeng Xiong et.al.|[2309.11525v2](http://arxiv.org/abs/2309.11525v2)|null|\n", "2309.13039": "|**2023-09-22**|**NeRRF: 3D Reconstruction and View Synthesis for Transparent and Specular Objects with Neural Refractive-Reflective Fields**|Xiaoxue Chen et.al.|[2309.13039v1](http://arxiv.org/abs/2309.13039v1)|**[link](https://github.com/dawning77/nerrf)**|\n", "2309.14293": "|**2023-09-25**|**NAS-NeRF: Generative Neural Architecture Search for Neural Radiance Fields**|Saeejith Nair et.al.|[2309.14293v1](http://arxiv.org/abs/2309.14293v1)|null|\n", "2309.14010": "|**2023-09-25**|**Variational Inference for Scalable 3D Object-centric Learning**|Tianyu Wang et.al.|[2309.14010v1](http://arxiv.org/abs/2309.14010v1)|null|\n", "2309.13607": "|**2023-09-24**|**MM-NeRF: Multimodal-Guided 3D Multi-Style Transfer of Neural Radiance Field**|Zijiang Yang et.al.|[2309.13607v1](http://arxiv.org/abs/2309.13607v1)|null|\n", "2309.13240": "|**2023-09-23**|**NeRF-Enhanced Outpainting for Faithful Field-of-View Extrapolation**|Rui Yu et.al.|[2309.13240v1](http://arxiv.org/abs/2309.13240v1)|null|\n", "2309.14800": "|**2023-09-26**|**3D Density-Gradient based Edge Detection on Neural Radiance Fields (NeRFs) for Geometric Reconstruction**|Miriam J\u00e4ger et.al.|[2309.14800v1](http://arxiv.org/abs/2309.14800v1)|null|\n", "2309.15526": "|**2023-09-27**|**P2I-NET: Mapping Camera Pose to Image via Adversarial Learning for New View Synthesis in Real Indoor Environments**|Xujie Kang et.al.|[2309.15526v1](http://arxiv.org/abs/2309.15526v1)|null|\n", "2309.15329": "|**2023-09-27**|**BASED: Bundle-Adjusting Surgical Endoscopic Dynamic Video Reconstruction using Neural Radiance Fields**|Shreya Saha et.al.|[2309.15329v1](http://arxiv.org/abs/2309.15329v1)|null|\n", "2309.16553": "|**2023-09-28**|**MatrixCity: A Large-scale City Dataset for City-scale Neural Rendering and Beyond**|Yixuan Li et.al.|[2309.16553v1](http://arxiv.org/abs/2309.16553v1)|null|\n", "2309.16364": "|**2023-10-04**|**FG-NeRF: Flow-GAN based Probabilistic Neural Radiance Field for Independence-Assumption-Free Uncertainty Estimation**|Songlin Wei et.al.|[2309.16364v2](http://arxiv.org/abs/2309.16364v2)|null|\n", "2309.16110": "|**2023-09-28**|**Learning Effective NeRFs and SDFs Representations with 3D Generative Adversarial Networks for 3D Object Generation: Technical Report for ICCV 2023 OmniObject3D Challenge**|Zheyuan Yang et.al.|[2309.16110v1](http://arxiv.org/abs/2309.16110v1)|null|\n", "2309.17450": "|**2023-09-29**|**Multi-task View Synthesis with Neural Radiance Fields**|Shuhong Zheng et.al.|[2309.17450v1](http://arxiv.org/abs/2309.17450v1)|**[link](https://github.com/zsh2000/muvienerf)**|\n", "2309.17390": "|**2023-09-29**|**Forward Flow for Novel View Synthesis of Dynamic Scenes**|Xiang Guo et.al.|[2309.17390v1](http://arxiv.org/abs/2309.17390v1)|null|\n", "2309.17128": "|**2023-09-29**|**HAvatar: High-fidelity Head Avatar via Facial Model Conditioned Neural Radiance Field**|Xiaochen Zhao et.al.|[2309.17128v1](http://arxiv.org/abs/2309.17128v1)|null|\n", "2309.16859": "|**2023-09-28**|**Preface: A Data-driven Volumetric Prior for Few-shot Ultra High-resolution Face Synthesis**|Marcel C. B\u00fchler et.al.|[2309.16859v1](http://arxiv.org/abs/2309.16859v1)|null|\n", "2310.01881": "|**2023-10-03**|**Adaptive Multi-NeRF: Exploit Efficient Parallelism in Adaptive Multiple Scale Neural Radiance Field Rendering**|Tong Wang et.al.|[2310.01881v1](http://arxiv.org/abs/2310.01881v1)|null|\n", "2310.01821": "|**2023-10-03**|**MIMO-NeRF: Fast Neural Rendering with Multi-input Multi-output Neural Radiance Fields**|Takuhiro Kaneko et.al.|[2310.01821v1](http://arxiv.org/abs/2310.01821v1)|null|\n", "2310.00874": "|**2023-10-02**|**PC-NeRF: Parent-Child Neural Radiance Fields under Partial Sensor Data Loss in Autonomous Driving Environments**|Xiuzhong Hu et.al.|[2310.00874v1](http://arxiv.org/abs/2310.00874v1)|**[link](https://github.com/biter0088/pc-nerf)**|\n", "2310.00684": "|**2023-10-01**|**How Many Views Are Needed to Reconstruct an Unknown Object Using NeRF?**|Sicong Pan et.al.|[2310.00684v1](http://arxiv.org/abs/2310.00684v1)|**[link](https://github.com/psc0628/nerf-prv)**|\n", "2310.00530": "|**2023-10-01**|**Enabling Neural Radiance Fields (NeRF) for Large-scale Aerial Images -- A Multi-tiling Approaching and the Geometry Assessment of NeRF**|Ningli Xu et.al.|[2310.00530v1](http://arxiv.org/abs/2310.00530v1)|null|\n", "2310.00249": "|**2023-09-30**|**MMPI: a Flexible Radiance Field Representation by Multiple Multi-plane Images Blending**|Yuze He et.al.|[2310.00249v1](http://arxiv.org/abs/2310.00249v1)|null|\n", "2310.02977": "|**2023-10-04**|**T$^3$Bench: Benchmarking Current Progress in Text-to-3D Generation**|Yuze He et.al.|[2310.02977v1](http://arxiv.org/abs/2310.02977v1)|**[link](https://github.com/THU-LYJ-Lab/T3Bench)**|\n", "2310.02712": "|**2023-10-04**|**ED-NeRF: Efficient Text-Guided Editing of 3D Scene using Latent Space NeRF**|Jangho Park et.al.|[2310.02712v1](http://arxiv.org/abs/2310.02712v1)|null|\n", "2310.02687": "|**2023-10-05**|**USB-NeRF: Unrolling Shutter Bundle Adjusted Neural Radiance Fields**|Moyang Li et.al.|[2310.02687v2](http://arxiv.org/abs/2310.02687v2)|null|\n", "2310.02437": "|**2023-10-03**|**EvDNeRF: Reconstructing Event Data with Dynamic Neural Radiance Fields**|Anish Bhattacharya et.al.|[2310.02437v1](http://arxiv.org/abs/2310.02437v1)|**[link](https://github.com/anish-bhattacharya/evdnerf)**|\n", "2310.03704": "|**2023-10-05**|**Drag View: Generalizable Novel View Synthesis with Unposed Imagery**|Zhiwen Fan et.al.|[2310.03704v1](http://arxiv.org/abs/2310.03704v1)|**[link](https://github.com/zhiwenfan/DragView)**|\n", "2310.03578": "|**2023-10-05**|**Targeted Adversarial Attacks on Generalizable Neural Radiance Fields**|Andras Horvath et.al.|[2310.03578v1](http://arxiv.org/abs/2310.03578v1)|null|\n", "2310.03563": "|**2023-10-05**|**BID-NeRF: RGB-D image pose estimation with inverted Neural Radiance Fields**|\u00c1goston Istv\u00e1n Csehi et.al.|[2310.03563v1](http://arxiv.org/abs/2310.03563v1)|null|\n", "2310.03125": "|**2023-10-04**|**Shielding the Unseen: Privacy Protection through Poisoning NeRF with Spatial Deformation**|Yihan Wu et.al.|[2310.03125v1](http://arxiv.org/abs/2310.03125v1)|null|\n", "2310.04152": "|**2023-10-06**|**Improving Neural Radiance Field using Near-Surface Sampling with Point Cloud Generation**|Hye Bin Yoo et.al.|[2310.04152v1](http://arxiv.org/abs/2310.04152v1)|null|\n", "2310.05837": "|**2023-10-09**|**A Real-time Method for Inserting Virtual Objects into Neural Radiance Fields**|Keyang Ye et.al.|[2310.05837v1](http://arxiv.org/abs/2310.05837v1)|null|\n", "2310.05391": "|**2023-10-09**|**Neural Impostor: Editing Neural Radiance Fields with Explicit Shape Manipulation**|Ruiyang Liu et.al.|[2310.05391v1](http://arxiv.org/abs/2310.05391v1)|null|\n", "2310.05134": "|**2023-10-08**|**LocoNeRF: A NeRF-based Approach for Local Structure from Motion for Precise Localization**|Artem Nenashev et.al.|[2310.05134v1](http://arxiv.org/abs/2310.05134v1)|null|\n", "2310.05133": "|**2023-10-08**|**Geometry Aware Field-to-field Transformations for 3D Semantic Segmentation**|Dominik Hollidt et.al.|[2310.05133v1](http://arxiv.org/abs/2310.05133v1)|null|\n", "2310.06275": "|**2023-10-10**|**High-Fidelity 3D Head Avatars Reconstruction through Spatially-Varying Expression Conditioned Neural Radiance Field**|Minghan Qin et.al.|[2310.06275v1](http://arxiv.org/abs/2310.06275v1)|null|\n", "2310.07449": "|**2023-10-12**|**PoRF: Pose Residual Field for Accurate Neural Surface Reconstruction**|Jia-Wang Bian et.al.|[2310.07449v2](http://arxiv.org/abs/2310.07449v2)|null|\n", "2310.07179": "|**2023-10-11**|**rpcPRF: Generalizable MPI Neural Radiance Field for Satellite Camera**|Tongtong Zhang et.al.|[2310.07179v1](http://arxiv.org/abs/2310.07179v1)|null|\n", "2310.06984": "|**2023-10-10**|**Leveraging Neural Radiance Fields for Uncertainty-Aware Visual Localization**|Le Chen et.al.|[2310.06984v1](http://arxiv.org/abs/2310.06984v1)|null|\n", "2310.07916": "|**2023-10-11**|**Dynamic Appearance Particle Neural Radiance Field**|Ancheng Lin et.al.|[2310.07916v1](http://arxiv.org/abs/2310.07916v1)|null|\n", "2310.10650": "|**2023-10-16**|**TraM-NeRF: Tracing Mirror and Near-Perfect Specular Reflections through Neural Radiance Fields**|Leif Van Holland et.al.|[2310.10650v1](http://arxiv.org/abs/2310.10650v1)|**[link](https://github.com/Rubikalubi/TraM-NeRF)**|\n", "2310.10624": "|**2023-10-16**|**DynVideo-E: Harnessing Dynamic NeRF for Large-Scale Motion- and View-Change Human-Centric Video Editing**|Jia-Wei Liu et.al.|[2310.10624v1](http://arxiv.org/abs/2310.10624v1)|null|\n", "2310.10209": "|**2023-10-16**|**Self-supervised Fetal MRI 3D Reconstruction Based on Radiation Diffusion Generation Model**|Junpeng Tan et.al.|[2310.10209v1](http://arxiv.org/abs/2310.10209v1)|null|\n", "2310.09965": "|**2023-10-15**|**ProteusNeRF: Fast Lightweight NeRF Editing using 3D-Aware Image Context**|Binglun Wang et.al.|[2310.09965v1](http://arxiv.org/abs/2310.09965v1)|null|\n", "2310.09892": "|**2023-10-15**|**Active Perception using Neural Radiance Fields**|Siming He et.al.|[2310.09892v1](http://arxiv.org/abs/2310.09892v1)|**[link](https://github.com/grasp-lyrl/active-perception-using-neural-radiance-fields)**|\n", "2310.09776": "|**2023-10-15**|**CBARF: Cascaded Bundle-Adjusting Neural Radiance Fields from Imperfect Camera Poses**|Hongyu Fu et.al.|[2310.09776v1](http://arxiv.org/abs/2310.09776v1)|null|\n", "2310.11864": "|**2023-10-18**|**VQ-NeRF: Neural Reflectance Decomposition and Editing with Vector Quantization**|Hongliang Zhong et.al.|[2310.11864v1](http://arxiv.org/abs/2310.11864v1)|null|\n", "2310.11645": "|**2023-10-18**|**Towards Abdominal 3-D Scene Rendering from Laparoscopy Surgical Videos using NeRFs**|Khoa Tuan Nguyen et.al.|[2310.11645v1](http://arxiv.org/abs/2310.11645v1)|null|\n", "2310.13670": "|**2023-10-20**|**ManifoldNeRF: View-dependent Image Feature Supervision for Few-shot Neural Radiance Fields**|Daiju Kanaoka et.al.|[2310.13670v1](http://arxiv.org/abs/2310.13670v1)|null|\n", "2310.13356": "|**2023-10-20**|**Sync-NeRF: Generalizing Dynamic NeRFs to Unsynchronized Videos**|Seoha Kim et.al.|[2310.13356v1](http://arxiv.org/abs/2310.13356v1)|**[link](https://github.com/seoha-kim/Sync-NeRF)**|\n", "2310.13263": "|**2023-10-20**|**UE4-NeRF:Neural Radiance Field for Real-Time Rendering of Large-Scale Scene**|Jiaming Gu et.al.|[2310.13263v1](http://arxiv.org/abs/2310.13263v1)|null|\n", "2310.14695": "|**2023-10-23**|**CAwa-NeRF: Instant Learning of Compression-Aware NeRF Features**|Omnia Mahmoud et.al.|[2310.14695v1](http://arxiv.org/abs/2310.14695v1)|null|\n", "2310.14487": "|**2023-10-23**|**VQ-NeRF: Vector Quantization Enhances Implicit Neural Representations**|Yiying Yang et.al.|[2310.14487v1](http://arxiv.org/abs/2310.14487v1)|null|\n", "2310.15504": "|**2023-10-24**|**Cross-view Self-localization from Synthesized Scene-graphs**|Ryogo Yamamoto et.al.|[2310.15504v1](http://arxiv.org/abs/2310.15504v1)|null|\n", "2310.16832": "|**2023-10-26**|**LightSpeed: Light and Fast Neural Light Fields on Mobile Devices**|Aarush Gupta et.al.|[2310.16832v2](http://arxiv.org/abs/2310.16832v2)|**[link](https://github.com/lightspeed-r2l/lightspeed)**|\n", "2310.16831": "|**2023-10-28**|**PERF: Panoramic Neural Radiance Field from a Single Panorama**|Guangcong Wang et.al.|[2310.16831v2](http://arxiv.org/abs/2310.16831v2)|**[link](https://github.com/perf-project/PeRF)**|\n", "2310.16383": "|**2023-10-25**|**Open-NeRF: Towards Open Vocabulary NeRF Decomposition**|Hao Zhang et.al.|[2310.16383v1](http://arxiv.org/abs/2310.16383v1)|null|\n", "2310.16255": "|**2023-10-25**|**UAV-Sim: NeRF-based Synthetic Data Generation for UAV-based Perception**|Christopher Maxey et.al.|[2310.16255v1](http://arxiv.org/abs/2310.16255v1)|null|\n", "2310.17075": "|**2023-10-27**|**HyperFields: Towards Zero-Shot Generation of NeRFs from Text**|Sudarshan Babu et.al.|[2310.17075v2](http://arxiv.org/abs/2310.17075v2)|null|\n", "2310.16858": "|**2023-10-25**|**4D-Editor: Interactive Object-level Editing in Dynamic Neural Radiance Fields via 4D Semantic Segmentation**|Dadong Jiang et.al.|[2310.16858v1](http://arxiv.org/abs/2310.16858v1)|null|\n", "2310.17994": "|**2023-10-27**|**ZeroNVS: Zero-Shot 360-Degree View Synthesis from a Single Real Image**|Kyle Sargent et.al.|[2310.17994v1](http://arxiv.org/abs/2310.17994v1)|null|\n", "2310.17880": "|**2023-10-27**|**Reconstructive Latent-Space Neural Radiance Fields for Efficient 3D Scene Representations**|Tristan Aumentado-Armstrong et.al.|[2310.17880v1](http://arxiv.org/abs/2310.17880v1)|null|\n", "2310.18917": "|**2023-11-04**|**TiV-NeRF: Tracking and Mapping via Time-Varying Representation with Dynamic Neural Radiance Fields**|Chengyao Duan et.al.|[2310.18917v2](http://arxiv.org/abs/2310.18917v2)|null|\n", "2310.18846": "|**2023-10-28**|**INCODE: Implicit Neural Conditioning with Prior Knowledge Embeddings**|Amirhossein Kazerouni et.al.|[2310.18846v1](http://arxiv.org/abs/2310.18846v1)|**[link](https://github.com/xmindflow/INCODE)**|\n", "2310.20710": "|**2023-10-31**|**FPO++: Efficient Encoding and Rendering of Dynamic Neural Radiance Fields by Analyzing and Enhancing Fourier PlenOctrees**|Saskia Rabich et.al.|[2310.20710v1](http://arxiv.org/abs/2310.20710v1)|null|\n", "2310.20685": "|**2023-10-31**|**NeRF Revisited: Fixing Quadrature Instability in Volume Rendering**|Mikaela Angelina Uy et.al.|[2310.20685v1](http://arxiv.org/abs/2310.20685v1)|null|\n", "2310.19464": "|**2023-10-30**|**Generative Neural Fields by Mixtures of Neural Implicit Functions**|Tackgeun You et.al.|[2310.19464v1](http://arxiv.org/abs/2310.19464v1)|null|\n", "2311.01065": "|**2023-11-02**|**Novel View Synthesis from a Single RGBD Image for Indoor Scenes**|Congrui Hetang et.al.|[2311.01065v1](http://arxiv.org/abs/2311.01065v1)|null|\n", "2311.01815": "|**2023-11-03**|**Estimating 3D Uncertainty Field: Quantifying Uncertainty for Neural Radiance Fields**|Jianxiong Shen et.al.|[2311.01815v1](http://arxiv.org/abs/2311.01815v1)|null|\n", "2311.01773": "|**2023-11-03**|**PDF: Point Diffusion Implicit Function for Large-scale Scene Neural Representation**|Yuhan Ding et.al.|[2311.01773v1](http://arxiv.org/abs/2311.01773v1)|null|\n", "2311.01659": "|**2023-11-03**|**Efficient Cloud Pipelines for Neural Radiance Fields**|Derek Jacoby et.al.|[2311.01659v1](http://arxiv.org/abs/2311.01659v1)|null|\n", "2311.03140": "|**2023-11-06**|**Animating NeRFs from Texture Space: A Framework for Pose-Dependent Rendering of Human Performances**|Paul Knoll et.al.|[2311.03140v1](http://arxiv.org/abs/2311.03140v1)|null|\n", "2311.02826": "|**2023-11-06**|**InstructPix2NeRF: Instructed 3D Portrait Editing from a Single Image**|Jianhui Li et.al.|[2311.02826v1](http://arxiv.org/abs/2311.02826v1)|**[link](https://github.com/mybabyyh/instructpix2nerf)**|\n", "2311.04154": "|**2023-11-07**|**High-fidelity 3D Reconstruction of Plants using Neural Radiance Field**|Kewei Hu et.al.|[2311.04154v1](http://arxiv.org/abs/2311.04154v1)|null|\n", "2311.03965": "|**2023-11-07**|**Fast Sun-aligned Outdoor Scene Relighting based on TensoRF**|Yeonjin Chang et.al.|[2311.03965v1](http://arxiv.org/abs/2311.03965v1)|null|\n", "2311.03784": "|**2023-11-08**|**UP-NeRF: Unconstrained Pose-Prior-Free Neural Radiance Fields**|Injae Kim et.al.|[2311.03784v2](http://arxiv.org/abs/2311.03784v2)|**[link](https://github.com/mlvlab/upnerf)**|\n", "2311.03484": "|**2023-11-06**|**Osprey: Multi-Session Autonomous Aerial Mapping with LiDAR-based SLAM and Next Best View Planning**|Rowan Border et.al.|[2311.03484v1](http://arxiv.org/abs/2311.03484v1)|null|\n", "2311.04400": "|**2023-11-08**|**LRM: Large Reconstruction Model for Single Image to 3D**|Yicong Hong et.al.|[2311.04400v1](http://arxiv.org/abs/2311.04400v1)|null|\n", "2311.04246": "|**2023-11-07**|**ADFactory: Automated Data Factory for Optical Flow Tasks**|Han Ling et.al.|[2311.04246v1](http://arxiv.org/abs/2311.04246v1)|null|\n", "2311.05521": "|**2023-11-09**|**BakedAvatar: Baking Neural Fields for Real-Time Head Avatar Synthesis**|Hao-Bin Duan et.al.|[2311.05521v1](http://arxiv.org/abs/2311.05521v1)|null|\n", "2311.05461": "|**2023-11-09**|**Control3D: Towards Controllable Text-to-3D Generation**|Yang Chen et.al.|[2311.05461v1](http://arxiv.org/abs/2311.05461v1)|null|\n", "2311.06214": "|**2023-11-10**|**Instant3D: Fast Text-to-3D with Sparse-View Generation and Large Reconstruction Model**|Jiahao Li et.al.|[2311.06214v1](http://arxiv.org/abs/2311.06214v1)|null|\n", "2311.05958": "|**2023-11-10**|**A Neural Height-Map Approach for the Binocular Photometric Stereo Problem**|Fotios Logothetis et.al.|[2311.05958v1](http://arxiv.org/abs/2311.05958v1)|null|\n"}}
\ No newline at end of file
diff --git a/docs/cv-arxiv-daily.json b/docs/cv-arxiv-daily.json
index 4b83168b71..9f21f1dfce 100755
--- a/docs/cv-arxiv-daily.json
+++ b/docs/cv-arxiv-daily.json
@@ -1 +1 @@
-{"Kinematic Mapping": {"2302.11988": "|**2023-02-23**|**Time Complexity of Broadcast and Consensus for Randomized Oblivious Message Adversaries**|Antoine El-Hayek et.al.|[2302.11988v1](http://arxiv.org/abs/2302.11988v1)|null|\n", "2302.09743": "|**2023-02-20**|**Dynamic Optimal Control: A Real-Time Control Optimization Algorithm for Dynamic Networks**|Chunyu Pan et.al.|[2302.09743v1](http://arxiv.org/abs/2302.09743v1)|null|\n", "2302.09382": "|**2023-02-18**|**Co-trading networks for modeling dynamic interdependency structures and estimating high-dimensional covariances in US equity markets**|Yutong Lu et.al.|[2302.09382v1](http://arxiv.org/abs/2302.09382v1)|null|\n", "2302.07657": "|**2023-02-15**|**Dynamic Flows with Time-Dependent Capacities**|Thomas Bl\u00e4sius et.al.|[2302.07657v1](http://arxiv.org/abs/2302.07657v1)|null|\n", "2302.04377": "|**2023-02-08**|**ER network heterogeneity guides diffusive transport and kinetics**|Zubenelgenubi C. Scott et.al.|[2302.04377v1](http://arxiv.org/abs/2302.04377v1)|null|\n", "2302.03677": "|**2023-02-24**|**Wealth distribution on a dynamic complex network**|Gustavo Kohlrausch et.al.|[2302.03677v2](http://arxiv.org/abs/2302.03677v2)|null|\n", "2302.03039": "|**2023-02-06**|**SUPER VII. Morphology and kinematics of H$\u03b1$ emission in AGN host galaxies at Cosmic noon using SINFONI**|D. Kakkad et.al.|[2302.03039v1](http://arxiv.org/abs/2302.03039v1)|null|\n", "2302.02313": "|**2023-02-05**|**A Game-Theoretic Approach to Solving the Roman Domination Problem**|Xiuyang Chen et.al.|[2302.02313v1](http://arxiv.org/abs/2302.02313v1)|null|\n", "2302.01694": "|**2023-02-03**|**Coevolving Boolean and Multi-Valued Regulatory Networks**|Larry Bull et.al.|[2302.01694v1](http://arxiv.org/abs/2302.01694v1)|null|\n", "2301.12892": "|**2023-01-30**|**Quantifying and maximizing the information flux in recurrent neural networks**|Claus Metzner et.al.|[2301.12892v1](http://arxiv.org/abs/2301.12892v1)|null|\n", "2301.12156": "|**2023-03-23**|**Perspective: How to overcome dynamical density functional theory**|Daniel de las Heras et.al.|[2301.12156v2](http://arxiv.org/abs/2301.12156v2)|null|\n", "2301.11982": "|**2023-02-01**|**Strategy evolution on dynamic networks**|Qi Su et.al.|[2301.11982v2](http://arxiv.org/abs/2301.11982v2)|null|\n", "2301.10962": "|**2023-01-26**|**Scheduling Policy for Value-of-Information (VoI) in Trajectory Estimation for Digital Twins**|Van-Phuc Bui et.al.|[2301.10962v1](http://arxiv.org/abs/2301.10962v1)|null|\n", "2301.07849": "|**2023-01-19**|**Efficient Computation in Congested Anonymous Dynamic Networks**|Giuseppe A. Di Luna et.al.|[2301.07849v1](http://arxiv.org/abs/2301.07849v1)|null|\n", "2301.07515": "|**2023-01-15**|**Towards the development of Dynamic Networked Psychology Hypotheses**|Liaquat Hossain et.al.|[2301.07515v1](http://arxiv.org/abs/2301.07515v1)|null|\n", "2301.04904": "|**2023-01-12**|**Lesion-aware Dynamic Kernel for Polyp Segmentation**|Ruifei Zhang et.al.|[2301.04904v1](http://arxiv.org/abs/2301.04904v1)|**[link](https://github.com/reafly/ldnet)**|\n", "2301.04296": "|**2023-01-11**|**A degree-corrected Cox model for dynamic networks**|Yuguo Chen et.al.|[2301.04296v1](http://arxiv.org/abs/2301.04296v1)|null|\n", "2301.03965": "|**2023-01-10**|**BiCurNet: Pre-Movement EEG based Neural Decoder for Biceps Curl Trajectory Estimation**|Manali Saini et.al.|[2301.03965v1](http://arxiv.org/abs/2301.03965v1)|null|\n", "2301.01314": "|**2023-01-03**|**Network-theoretic modeling of fluid-structure interactions**|Aditya G. Nair et.al.|[2301.01314v1](http://arxiv.org/abs/2301.01314v1)|null|\n", "2212.12843": "|**2022-12-25**|**A Note on Improved Results for One Round Distributed Clique Listing**|Quanquan C. Liu et.al.|[2212.12843v1](http://arxiv.org/abs/2212.12843v1)|null|\n", "2212.12345": "|**2022-12-23**|**Piecewise-Velocity Model for Learning Continuous-time Dynamic Node Representations**|Abdulkadir \u00c7elikkanat et.al.|[2212.12345v1](http://arxiv.org/abs/2212.12345v1)|null|\n", "2212.12130": "|**2023-02-04**|**Learning to Detect and Segment for Open Vocabulary Object Detection**|Tao Wang et.al.|[2212.12130v2](http://arxiv.org/abs/2212.12130v2)|null|\n", "2212.09483": "|**2022-12-19**|**Adaptive Control of Client Selection and Gradient Compression for Efficient Federated Learning**|Zhida Jiang et.al.|[2212.09483v1](http://arxiv.org/abs/2212.09483v1)|null|\n", "2212.08358": "|**2022-12-16**|**Some recent trends in embeddings of time series and dynamic networks**|Dag Tj\u00f8stheim et.al.|[2212.08358v1](http://arxiv.org/abs/2212.08358v1)|null|\n", "2212.08314": "|**2023-01-30**|**Synchronization-preserving clusters in hypergraphs**|Anirban Banerjee et.al.|[2212.08314v2](http://arxiv.org/abs/2212.08314v2)|null|\n", "2212.08239": "|**2022-12-16**|**Discovering Structural Hole Spanners in Dynamic Networks via Graph Neural Networks**|Diksha Goel et.al.|[2212.08239v1](http://arxiv.org/abs/2212.08239v1)|null|\n", "2212.07961": "|**2022-12-15**|**Topological Data Analysis Detects Percolation Thresholds in Arctic Melt-Pond Evolution**|Wilfred Offord et.al.|[2212.07961v1](http://arxiv.org/abs/2212.07961v1)|**[link](https://github.com/wilfofford/tda-for-sea-ice-percolation)**|\n", "2212.05980": "|**2022-12-12**|**Evaluation of RGB-D SLAM in Large Indoor Environments**|Kirill Muravyev et.al.|[2212.05980v1](http://arxiv.org/abs/2212.05980v1)|null|\n", "2212.03999": "|**2022-12-07**|**On the application of dimensionality reduction and clustering algorithms for the classification of kinematic morphologies of galaxies**|M. S. Rosito et.al.|[2212.03999v1](http://arxiv.org/abs/2212.03999v1)|null|\n", "2212.02410": "|**2023-03-17**|**Antipodal Self-Duality for a Four-Particle Form Factor**|Lance J. Dixon et.al.|[2212.02410v2](http://arxiv.org/abs/2212.02410v2)|null|\n", "2212.02383": "|**2022-12-05**|**An Approach for Detecting Dynamic Communities in Social Networks**|Souaad Boudebza et.al.|[2212.02383v1](http://arxiv.org/abs/2212.02383v1)|**[link](https://github.com/Yquetzal/ECML_PKDD_2019)**|\n", "2212.01594": "|**2022-12-03**|**Parameterized temporal exploration problems**|Thomas Erlebach et.al.|[2212.01594v1](http://arxiv.org/abs/2212.01594v1)|null|\n", "2211.16726": "|**2022-11-30**|**Boosted Dynamic Neural Networks**|Haichao Yu et.al.|[2211.16726v1](http://arxiv.org/abs/2211.16726v1)|**[link](https://github.com/SHI-Labs/Boosted-Dynamic-Networks)**|\n", "2211.15301": "|**2022-11-28**|**Learning Coherent Clusters in Weakly-Connected Network Systems**|Hancheng Min et.al.|[2211.15301v1](http://arxiv.org/abs/2211.15301v1)|null|\n", "2211.15043": "|**2022-11-28**|**Higher-order Knowledge Transfer for Dynamic Community Detection with Great Changes**|Huixin Ma et.al.|[2211.15043v1](http://arxiv.org/abs/2211.15043v1)|null|\n", "2211.14560": "|**2023-01-24**|**A dynamic multi-region MFD model for ride-sourcing with ridesplitting**|Caio Vitor Beojone et.al.|[2211.14560v2](http://arxiv.org/abs/2211.14560v2)|null|\n", "2211.12589": "|**2022-11-22**|**Building Squares with Optimal State Complexity in Restricted Active Self-Assembly**|Robert M. Alaniz et.al.|[2211.12589v1](http://arxiv.org/abs/2211.12589v1)|**[link](https://github.com/asarg/autotile)**|\n", "2211.11876": "|**2022-11-21**|**Structural Modelling of Dynamic Networks and Identifying Maximum Likelihood**|Christian Gourieroux et.al.|[2211.11876v1](http://arxiv.org/abs/2211.11876v1)|null|\n", "2211.11352": "|**2023-01-30**|**Brief Announcement: Broadcasting Time in Dynamic Rooted Trees is Linear**|Antoine El-Hayek et.al.|[2211.11352v3](http://arxiv.org/abs/2211.11352v3)|null|\n", "2211.11069": "|**2022-11-20**|**Learning Nonlinear Couplings in Network of Agents from a Single Sample Trajectory**|Arash Amini et.al.|[2211.11069v1](http://arxiv.org/abs/2211.11069v1)|null|\n", "2211.10825": "|**2022-11-20**|**Identifiability of dynamic networks: the essential r\u00f4le of dources and dinks**|Eduardo Mapurunga et.al.|[2211.10825v1](http://arxiv.org/abs/2211.10825v1)|null|\n", "2211.10151": "|**2023-01-27**|**Asymptotically Tight Bounds on the Time Complexity of Broadcast and its Variants in Dynamic Networks**|Antoine El-Hayek et.al.|[2211.10151v2](http://arxiv.org/abs/2211.10151v2)|null|\n", "2211.09139": "|**2022-11-16**|**The Pandora project. I: the impact of radiation and cosmic rays on baryonic and dark matter properties of dwarf galaxies**|Sergio Martin-Alvarez et.al.|[2211.09139v1](http://arxiv.org/abs/2211.09139v1)|null|\n", "2211.08820": "|**2022-11-16**|**Computing-Aware Routing for LEO Satellite Networks: A Transmission and Computation Integration Approach**|Jiaqi Cao et.al.|[2211.08820v1](http://arxiv.org/abs/2211.08820v1)|null|\n", "2211.08700": "|**2023-02-14**|**Bi-directional Digital Twin and Edge Computing in the Metaverse**|Jiadong Yu et.al.|[2211.08700v2](http://arxiv.org/abs/2211.08700v2)|null|\n", "2211.08639": "|**2022-11-16**|**Hierarchical Dynamic Image Harmonization**|Haoxing Chen et.al.|[2211.08639v1](http://arxiv.org/abs/2211.08639v1)|**[link](https://github.com/chenhaoxing/hdnet)**|\n", "2211.08378": "|**2022-11-15**|**Anomaly Detection in Multiplex Dynamic Networks: from Blockchain Security to Brain Disease Prediction**|Ali Behrouz et.al.|[2211.08378v1](http://arxiv.org/abs/2211.08378v1)|**[link](https://github.com/ubc-systopia/anomuly)**|\n", "2211.09664": "|**2022-11-15**|**Influencer Detection with Dynamic Graph Neural Networks**|Elena Tiukhova et.al.|[2211.09664v1](http://arxiv.org/abs/2211.09664v1)|**[link](https://github.com/banking-analytics-lab/dynamicgraphlearning)**|\n", "2211.07570": "|**2022-11-14**|**Tides Need STEMMED: A Locally Operating Spatio-Temporal Mutually Exciting Point Process with Dynamic Network for Improving Opioid Overdose Death Prediction**|Che-Yi Liao et.al.|[2211.07570v1](http://arxiv.org/abs/2211.07570v1)|null|\n", "2211.07449": "|**2022-11-14**|**Dual-based Online Learning of Dynamic Network Topologies**|Seyed Saman Saboksayr et.al.|[2211.07449v1](http://arxiv.org/abs/2211.07449v1)|null|\n", "2302.12759": "|**2023-02-24**|**Modularity-based approach for tracking communities in dynamic social networks**|Michele Mazza et.al.|[2302.12759v1](http://arxiv.org/abs/2302.12759v1)|null|\n", "2302.13629": "|**2023-02-27**|**Estimation of continuous environments by robot swarms: Correlated networks and decision-making**|Mohsen Raoufi et.al.|[2302.13629v1](http://arxiv.org/abs/2302.13629v1)|null|\n", "2302.13292": "|**2023-02-26**|**Discovering Top-k Structural Hole Spanners in Dynamic Networks**|Diksha Goel et.al.|[2302.13292v1](http://arxiv.org/abs/2302.13292v1)|null|\n", "2211.05668": "|**2022-12-07**|**Mapping the Milky Way Disk with Gaia DR3: 3D extended kinematic maps and rotation curve to $\\approx 30$ kpc**|Hai-Feng Wang et.al.|[2211.05668v2](http://arxiv.org/abs/2211.05668v2)|null|\n", "2211.01538": "|**2023-03-12**|**$D^2$SLAM: Decentralized and Distributed Collaborative Visual-inertial SLAM System for Aerial Swarm**|Hao Xu et.al.|[2211.01538v3](http://arxiv.org/abs/2211.01538v3)|**[link](https://github.com/hkust-aerial-robotics/d2slam)**|\n", "2210.14842": "|**2022-10-26**|**Continuum Robot State Estimation Using Gaussian Process Regression on $SE(3)$**|Sven Lilge et.al.|[2210.14842v1](http://arxiv.org/abs/2210.14842v1)|null|\n", "2210.04572": "|**2022-10-10**|**Floorplan-Aware Camera Poses Refinement**|Anna Sokolova et.al.|[2210.04572v1](http://arxiv.org/abs/2210.04572v1)|null|\n", "2210.03412": "|**2022-10-07**|**The Trajectory PHD Filter for Coexisting Point and Extended Target Tracking**|Shaoxiu Wei et.al.|[2210.03412v1](http://arxiv.org/abs/2210.03412v1)|null|\n", "2210.01320": "|**2022-11-23**|**Wi-Closure: Reliable and Efficient Search of Inter-robot Loop Closures Using Wireless Sensing**|Weiying Wang et.al.|[2210.01320v2](http://arxiv.org/abs/2210.01320v2)|null|\n", "2209.09723": "|**2023-02-22**|**GANet: Goal Area Network for Motion Forecasting**|Mingkun Wang et.al.|[2209.09723v3](http://arxiv.org/abs/2209.09723v3)|**[link](https://github.com/kingwmk/ganet)**|\n", "2212.03441": "|**2023-03-23**|**Higher topological complexity of a map**|Cesar A. Ipanaque Zapata et.al.|[2212.03441v2](http://arxiv.org/abs/2212.03441v2)|null|\n", "2304.09043": "|**2023-05-16**|**Continuous-Time Range-Only Pose Estimation**|Abhishek Goudar et.al.|[2304.09043v2](http://arxiv.org/abs/2304.09043v2)|null|\n", "2304.11694": "|**2023-04-25**|**Vehicle State Estimation and Prediction**|Xinchen Li et.al.|[2304.11694v2](http://arxiv.org/abs/2304.11694v2)|null|\n", "2306.01188": "|**2023-09-12**|**Event-based Stereo Visual Odometry with Native Temporal Resolution via Continuous-time Gaussian Process Regression**|Jianeng Wang et.al.|[2306.01188v5](http://arxiv.org/abs/2306.01188v5)|null|\n", "2306.01056": "|**2023-06-01**|**ERGO-ML: Towards a robust machine learning model for inferring the fraction of accreted stars in galaxies from integral-field spectroscopic maps**|Eirini Angeloudi et.al.|[2306.01056v1](http://arxiv.org/abs/2306.01056v1)|null|\n", "2306.11091": "|**2023-06-19**|**Composite Bulges -- IV. Detecting Signatures of Gas Inflows in the IFU data: The MUSE View of Ionized Gas Kinematics in NGC 1097**|Tutku Kolcu et.al.|[2306.11091v1](http://arxiv.org/abs/2306.11091v1)|null|\n", "2306.14573": "|**2023-06-26**|**Hydrodynamic simulations of the Disk of Gas Around Supermassive black holes (HDGAS) -I; Molecular Gas Dynamics**|Mojtaba Raouf et.al.|[2306.14573v1](http://arxiv.org/abs/2306.14573v1)|null|\n", "2307.00728": "|**2023-07-03**|**A new approach to QCD evolution in processes with massive partons**|Benoit Assi et.al.|[2307.00728v1](http://arxiv.org/abs/2307.00728v1)|null|\n", "2307.03207": "|**2023-07-06**|**H$\u03b1$ Kinematics of Superbubbles and Supernova Remnants of the Dwarf galaxy NGC 4214**|M. S\u00e1nchez-Cruces et.al.|[2307.03207v1](http://arxiv.org/abs/2307.03207v1)|null|\n", "2307.10381": "|**2023-07-19**|**Accelerating galaxy dynamical modeling using a neural network for joint lensing and kinematics analyses**|Matthew R. Gomer et.al.|[2307.10381v1](http://arxiv.org/abs/2307.10381v1)|null|\n", "2307.14125": "|**2023-07-26**|**Multi-IMU Proprioceptive State Estimator for Humanoid Robots**|Fabio Elnecave Xavier et.al.|[2307.14125v1](http://arxiv.org/abs/2307.14125v1)|null|\n", "2308.04071": "|**2023-08-08**|**Path Signatures for Diversity in Probabilistic Trajectory Optimisation**|Lucas Barcelos et.al.|[2308.04071v1](http://arxiv.org/abs/2308.04071v1)|null|\n", "2308.08654": "|**2023-08-16**|**Advancing Brain-Computer Interface System Performance in Hand Trajectory Estimation with NeuroKinect**|Sidharth Pancholi et.al.|[2308.08654v1](http://arxiv.org/abs/2308.08654v1)|null|\n", "2308.11493": "|**2023-08-22**|**Looking into the faintEst WIth MUSE (LEWIS): on the nature of ultra-diffuse galaxies in the Hydra-I cluster.I. Project description and preliminary results**|Enrichetta Iodice et.al.|[2308.11493v1](http://arxiv.org/abs/2308.11493v1)|null|\n", "2308.12418": "|**2023-08-23**|**Certifiably Optimal Rotation and Pose Estimation Based on the Cayley Map**|Timothy D Barfoot et.al.|[2308.12418v1](http://arxiv.org/abs/2308.12418v1)|null|\n", "2308.16620": "|**2023-08-31**|**GA-NIFS: JWST/NIRSpec IFU observations of HFLS3 reveal a dense galaxy group at z~6.3**|G. C. Jones et.al.|[2308.16620v1](http://arxiv.org/abs/2308.16620v1)|null|\n", "2309.03396": "|**2023-09-06**|**Detection of open cluster rotation fields from Gaia EDR3 proper motions**|Pedro Guilherme-Garcia et.al.|[2309.03396v1](http://arxiv.org/abs/2309.03396v1)|null|\n", "2309.06792": "|**2023-09-13**|**Motion-Bias-Free Feature-Based SLAM**|Alejandro Fontan et.al.|[2309.06792v1](http://arxiv.org/abs/2309.06792v1)|null|\n", "2309.09808": "|**2023-09-18**|**Coco-LIC: Continuous-Time Tightly-Coupled LiDAR-Inertial-Camera Odometry using Non-Uniform B-spline**|Xiaolei Lang et.al.|[2309.09808v1](http://arxiv.org/abs/2309.09808v1)|**[link](https://github.com/april-zju/coco-lic)**|\n", "2309.09011": "|**2023-09-16**|**Optimal Initialization Strategies for Range-Only Trajectory Estimation**|Abhishek Goudar et.al.|[2309.09011v1](http://arxiv.org/abs/2309.09011v1)|null|\n", "2309.08780": "|**2023-09-15**|**Simultaneous Trajectory Estimation and Mapping for Autonomous Underwater Proximity Operations**|Aldo Ter\u00e1n Espinoza et.al.|[2309.08780v1](http://arxiv.org/abs/2309.08780v1)|null|\n", "2309.11134": "|**2023-09-20**|**GNSS/Multi-Sensor Fusion Using Continuous-Time Factor Graph Optimization for Robust Localization**|Haoming Zhang et.al.|[2309.11134v1](http://arxiv.org/abs/2309.11134v1)|**[link](https://github.com/rwth-irt/gnssfgo)**|\n", "2309.15065": "|**2023-09-26**|**Language-EXtended Indoor SLAM (LEXIS): A Versatile System for Real-time Visual Scene Understanding**|Christina Kassab et.al.|[2309.15065v1](http://arxiv.org/abs/2309.15065v1)|null|\n", "2310.03353": "|**2023-10-05**|**Deep Geometric Learning with Monotonicity Constraints for Alzheimer's Disease Progression**|Seungwoo Jeong et.al.|[2310.03353v1](http://arxiv.org/abs/2310.03353v1)|null|\n", "2310.06249": "|**2023-10-10**|**l-dyno: framework to learn consistent visual features using robot's motion**|Kartikeya Singh et.al.|[2310.06249v1](http://arxiv.org/abs/2310.06249v1)|null|\n", "2310.10723": "|**2023-10-16**|**Kinematical coherence between satellite galaxies and host stellar discs for MaNGA & SAMI galaxies**|Sen Wang et.al.|[2310.10723v1](http://arxiv.org/abs/2310.10723v1)|null|\n", "2310.12776": "|**2023-10-19**|**First holistic modelling of meteoroid ablation and fragmentation: A case study of the Orionids recorded by the Canadian Automated Meteor Observatory**|Denis Vida et.al.|[2310.12776v1](http://arxiv.org/abs/2310.12776v1)|null|\n", "2310.14506": "|**2023-10-23**|**Label Space Partition Selection for Multi-Object Tracking Using Two-Layer Partitioning**|Ji Youn Lee et.al.|[2310.14506v1](http://arxiv.org/abs/2310.14506v1)|null|\n"}, "Map fusion": {"2302.11106": "|**2023-02-22**|**Multi-Head Feature Pyramid Networks for Breast Mass Detection**|Hexiang Zhang et.al.|[2302.11106v1](http://arxiv.org/abs/2302.11106v1)|null|\n", "2301.09213": "|**2023-01-24**|**FRAME: Fast and Robust Autonomous 3D point cloud Map-merging for Egocentric multi-robot exploration**|Nikolaos Stathoulopoulos et.al.|[2301.09213v2](http://arxiv.org/abs/2301.09213v2)|null|\n", "2212.01538": "|**2022-12-03**|**Multi-resolution Monocular Depth Map Fusion by Self-supervised Gradient-based Composition**|Yaqiao Dai et.al.|[2212.01538v1](http://arxiv.org/abs/2212.01538v1)|**[link](https://github.com/yuinsky/gradient-based-depth-map-fusion)**|\n", "2211.03423": "|**2022-11-07**|**Detecting Invalid Map Merges in Lifelong SLAM**|Matthias Holoch et.al.|[2211.03423v1](http://arxiv.org/abs/2211.03423v1)|null|\n", "2209.10775": "|**2022-09-22**|**MUI-TARE: Multi-Agent Cooperative Exploration with Unknown Initial Position**|Jingtian Yan et.al.|[2209.10775v1](http://arxiv.org/abs/2209.10775v1)|null|\n", "2209.08988": "|**2022-09-19**|**MSA-GCN:Multiscale Adaptive Graph Convolution Network for Gait Emotion Recognition**|Yunfei Yin et.al.|[2209.08988v1](http://arxiv.org/abs/2209.08988v1)|null|\n", "2209.03096": "|**2022-09-07**|**Spherical wedge billiard: from chaos to fractals and Talbot carpets**|Tom\u00e1\u0161 Tyc et.al.|[2209.03096v1](http://arxiv.org/abs/2209.03096v1)|null|\n", "2208.06293": "|**2022-08-12**|**dual unet:a novel siamese network for change detection with cascade differential fusion**|Kaixuan Jiang et.al.|[2208.06293v1](http://arxiv.org/abs/2208.06293v1)|null|\n", "2207.09210": "|**2023-10-23**|**KinD-LCE Curve Estimation And Retinex Fusion On Low-Light Image**|Xiaochun Lei et.al.|[2207.09210v3](http://arxiv.org/abs/2207.09210v3)|null|\n", "2207.06965": "|**2023-06-27**|**AutoMerge: A Framework for Map Assembling and Smoothing in City-scale Environments**|Peng Yin et.al.|[2207.06965v4](http://arxiv.org/abs/2207.06965v4)|null|\n", "2203.00436": "|**2022-03-01**|**Boundary Corrected Multi-scale Fusion Network for Real-time Semantic Segmentation**|Tianjiao Jiang et.al.|[2203.00436v1](http://arxiv.org/abs/2203.00436v1)|null|\n", "2202.08498": "|**2022-02-17**|**Mirror-Yolo: An attention-based instance segmentation and detection model for mirrors**|Fengze Li et.al.|[2202.08498v1](http://arxiv.org/abs/2202.08498v1)|null|\n", "2201.11937": "|**2022-01-28**|**Stereo Matching with Cost Volume based Sparse Disparity Propagation**|Wei Xue et.al.|[2201.11937v1](http://arxiv.org/abs/2201.11937v1)|null|\n", "2201.10152": "|**2022-01-29**|**Unsupervised Image Fusion Method based on Feature Mutual Mapping**|Dongyu Rao et.al.|[2201.10152v2](http://arxiv.org/abs/2201.10152v2)|null|\n", "2112.13222": "|**2022-01-24**|**Edge Robotics: Edge-Computing-Accelerated Multi-Robot Simultaneous Localization and Mapping**|Peng Huang et.al.|[2112.13222v2](http://arxiv.org/abs/2112.13222v2)|null|\n", "2112.11044": "|**2021-12-21**|**Extending Merge Resolution to a Family of Proof Systems**|Sravanthi Chede et.al.|[2112.11044v1](http://arxiv.org/abs/2112.11044v1)|null|\n", "2111.14990": "|**2021-11-29**|**MIXER: A Principled Framework for Multimodal, Multiway Data Association**|Parker C. Lusk et.al.|[2111.14990v1](http://arxiv.org/abs/2111.14990v1)|null|\n", "2110.12338": "|**2021-10-24**|**Quality Map Fusion for Adversarial Learning**|Uche Osahor et.al.|[2110.12338v1](http://arxiv.org/abs/2110.12338v1)|null|\n", "2110.08172": "|**2021-10-18**|**MLFC: From 10 to 50 Planners in the Multi-Agent Programming Contest**|Rafael C. Cardoso et.al.|[2110.08172v2](http://arxiv.org/abs/2110.08172v2)|null|\n", "2110.06697": "|**2021-10-13**|**Semantic Image Fusion**|P. R. Hill et.al.|[2110.06697v1](http://arxiv.org/abs/2110.06697v1)|null|\n", "2110.06436": "|**2021-10-13**|**Non-local Recurrent Regularization Networks for Multi-view Stereo**|Qingshan Xu et.al.|[2110.06436v1](http://arxiv.org/abs/2110.06436v1)|null|\n", "2108.08623": "|**2021-08-19**|**VolumeFusion: Deep Depth Fusion for 3D Scene Reconstruction**|Jaesung Choe et.al.|[2108.08623v1](http://arxiv.org/abs/2108.08623v1)|null|\n", "2106.11515": "|**2021-06-23**|**Cooperative mmWave PHD-SLAM with Moving Scatterers**|Hyowon Kim et.al.|[2106.11515v2](http://arxiv.org/abs/2106.11515v2)|null|\n", "2106.10220": "|**2021-06-18**|**Semantic navigation with domain knowledge**|Rafael Gomes Braga et.al.|[2106.10220v1](http://arxiv.org/abs/2106.10220v1)|null|\n", "2106.04512": "|**2021-06-22**|**Formal Verification of a Map Merging Protocol in the Multi-Agent Programming Contest**|Matt Luckcuck et.al.|[2106.04512v2](http://arxiv.org/abs/2106.04512v2)|null|\n", "2105.14994": "|**2021-05-31**|**MAOMaps: A Photo-Realistic Benchmark For vSLAM and Map Merging Quality Assessment**|Andrey Bokovoy et.al.|[2105.14994v1](http://arxiv.org/abs/2105.14994v1)|**[link](https://github.com/CnnDepth/MAOMaps)**|\n", "2103.13246": "|**2021-03-24**|**Generic Merging of Structure from Motion Maps with a Low Memory Footprint**|Gabrielle Flood et.al.|[2103.13246v1](http://arxiv.org/abs/2103.13246v1)|null|\n", "2103.03786": "|**2022-09-22**|**Distributed Dynamic Map Fusion via Federated Learning for Intelligent Networked Vehicles**|Zijian Zhang et.al.|[2103.03786v3](http://arxiv.org/abs/2103.03786v3)|**[link](https://github.com/zijianzhang/CARLA_INVS)**|\n", "2102.10929": "|**2021-02-22**|**Deep Learning for Robust Motion Segmentation with Non-Static Cameras**|Markus Bosch et.al.|[2102.10929v1](http://arxiv.org/abs/2102.10929v1)|null|\n", "2012.10658": "|**2021-02-24**|**Generalize a Small Pre-trained Model to Arbitrarily Large TSP Instances**|Zhang-Hua Fu et.al.|[2012.10658v2](http://arxiv.org/abs/2012.10658v2)|**[link](https://github.com/Spider-scnu/TSP)**|\n", "2011.14791": "|**2021-06-08**|**NeuralFusion: Online Depth Fusion in Latent Space**|Silvan Weder et.al.|[2011.14791v2](http://arxiv.org/abs/2011.14791v2)|**[link](https://github.com/weders/NeuralFusion)**|\n", "2011.03975": "|**2020-11-11**|**Mapless-Planner: A Robust and Fast Planning Framework for Aggressive Autonomous Flight without Map Fusion**|Jialin Ji et.al.|[2011.03975v2](http://arxiv.org/abs/2011.03975v2)|null|\n", "2010.03026": "|**2020-11-16**|**Place Recognition in Forests with Urquhart Tessellations**|Guilherme V. Nardari et.al.|[2010.03026v2](http://arxiv.org/abs/2010.03026v2)|**[link](https://github.com/gnardari/urquhart)**|\n", "2009.05819": "|**2020-09-12**|**Map-merging Algorithms for Visual SLAM: Feasibility Study and Empirical Evaluation**|Andrey Bokovoy et.al.|[2009.05819v1](http://arxiv.org/abs/2009.05819v1)|null|\n", "2007.14177": "|**2020-07-28**|**Generative networks as inverse problems with fractional wavelet scattering networks**|Jiasong Wu et.al.|[2007.14177v1](http://arxiv.org/abs/2007.14177v1)|null|\n", "2007.02295": "|**2020-07-05**|**Multi view stereo with semantic priors**|Elisavet Konstantina Stathopoulou et.al.|[2007.02295v1](http://arxiv.org/abs/2007.02295v1)|null|\n", "2007.02108": "|**2020-07-04**|**SplitFusion: Simultaneous Tracking and Mapping for Non-Rigid Scenes**|Yang Li et.al.|[2007.02108v1](http://arxiv.org/abs/2007.02108v1)|null|\n", "2006.00420": "|**2020-05-31**|**VIR-SLAM: Visual, Inertial, and Ranging SLAM for single and multi-robot systems**|Yanjun Cao et.al.|[2006.00420v1](http://arxiv.org/abs/2006.00420v1)|null|\n", "2002.10342": "|**2020-02-24**|**Comparing View-Based and Map-Based Semantic Labelling in Real-Time SLAM**|Zoe Landgraf et.al.|[2002.10342v1](http://arxiv.org/abs/2002.10342v1)|null|\n", "2001.09796": "|**2020-01-16**|**Knowledge Integration of Collaborative Product Design Using Cloud Computing Infrastructure**|Mahdi Bohlouli et.al.|[2001.09796v1](http://arxiv.org/abs/2001.09796v1)|null|\n", "2001.04388": "|**2020-04-03**|**RoutedFusion: Learning Real-time Depth Map Fusion**|Silvan Weder et.al.|[2001.04388v2](http://arxiv.org/abs/2001.04388v2)|**[link](https://github.com/weders/RoutedFusion)**|\n", "1909.00703": "|**2019-09-02**|**Learned Semantic Multi-Sensor Depth Map Fusion**|Denys Rozumnyi et.al.|[1909.00703v1](http://arxiv.org/abs/1909.00703v1)|null|\n", "1908.11585": "|**2019-08-30**|**ORBSLAM-Atlas: a robust and accurate multi-map system**|Richard Elvira et.al.|[1908.11585v1](http://arxiv.org/abs/1908.11585v1)|null|\n", "1908.10541": "|**2020-06-07**|**Search and Rescue under the Forest Canopy using Multiple UAVs**|Yulun Tian et.al.|[1908.10541v2](http://arxiv.org/abs/1908.10541v2)|null|\n", "1908.09806": "|**2020-02-26**|**5G mmWave Cooperative Positioning and Mapping using Multi-Model PHD Filter and Map Fusion**|Hyowon Kim et.al.|[1908.09806v3](http://arxiv.org/abs/1908.09806v3)|**[link](https://github.com/HyowonKim-P1/5GmmWavePHDFilterMapFusion)**|\n", "1905.11257": "|**2019-05-27**|**IRAS23385+6053: An embedded massive cluster in the making**|R. Cesaroni et.al.|[1905.11257v1](http://arxiv.org/abs/1905.11257v1)|null|\n", "1812.08402": "|**2018-12-20**|**SFA: Small Faces Attention Face Detector**|Shi Luo et.al.|[1812.08402v1](http://arxiv.org/abs/1812.08402v1)|**[link](https://github.com/shiluo1990/SFA)**|\n", "1811.07632": "|**2018-11-21**|**Collaborative Dense SLAM**|Louis Gallagher et.al.|[1811.07632v2](http://arxiv.org/abs/1811.07632v2)|null|\n", "1810.00457": "|**2019-03-14**|**AgriColMap: Aerial-Ground Collaborative 3D Mapping for Precision Farming**|Ciro Potena et.al.|[1810.00457v2](http://arxiv.org/abs/1810.00457v2)|null|\n", "1809.09646": "|**2019-03-05**|**Efficient Constellation-Based Map-Merging for Semantic SLAM**|Kristoffer M. Frey et.al.|[1809.09646v2](http://arxiv.org/abs/1809.09646v2)|null|\n", "2306.15416": "|**2023-07-04**|**Irregular Change Detection in Sparse Bi-Temporal Point Clouds using Learned Place Recognition Descriptors and Point-to-Voxel Comparison**|Nikolaos Stathoulopoulos et.al.|[2306.15416v2](http://arxiv.org/abs/2306.15416v2)|null|\n", "2307.00500": "|**2023-07-02**|**CQLite: Communication-Efficient Multi-Robot Exploration Using Coverage-biased Distributed Q-Learning**|Ehsan Latif et.al.|[2307.00500v1](http://arxiv.org/abs/2307.00500v1)|null|\n", "2212.08334": "|**2023-07-10**|**Lightweight integration of 3D features to improve 2D image segmentation**|Olivier Pradelle et.al.|[2212.08334v2](http://arxiv.org/abs/2212.08334v2)|**[link](https://github.com/opradelle/2dguidedlight3d)**|\n", "2307.07126": "|**2023-07-14**|**Multi-Session, Localization-oriented and Lightweight LiDAR Mapping Using Semantic Lines and Planes**|Zehuan Yu et.al.|[2307.07126v1](http://arxiv.org/abs/2307.07126v1)|null|\n", "2308.02674": "|**2023-08-04**|**Group-$k$ consistent measurement set maximization via maximum clique over k-Uniform hypergraphs for robust multi-robot map merging**|Brendon Forsgren et.al.|[2308.02674v1](http://arxiv.org/abs/2308.02674v1)|**[link](https://bitbucket.org/jmangelson/gkcm)**|\n", "2308.08715": "|**2023-08-17**|**V-FUSE: Volumetric Depth Map Fusion with Long-Range Constraints**|Nathaniel Burgdorfer et.al.|[2308.08715v1](http://arxiv.org/abs/2308.08715v1)|**[link](https://github.com/nburgdorfer/v-fuse)**|\n", "2311.03146": "|**2023-11-06**|**Enabling In-Situ Resources Utilisation by leveraging collaborative robotics and astronaut-robot interaction**|Silvia Romero-Azpitarte et.al.|[2311.03146v1](http://arxiv.org/abs/2311.03146v1)|null|\n"}, "MultiModality": {"2302.12248": "|**2023-02-23**|**Learning Visual Representations via Language-Guided Sampling**|Mohamed El Banani et.al.|[2302.12248v1](http://arxiv.org/abs/2302.12248v1)|**[link](https://github.com/mbanani/lgssl)**|\n", "2302.11939": "|**2023-02-23**|**Power Time Series Forecasting by Pretrained LM**|Tian Zhou et.al.|[2302.11939v1](http://arxiv.org/abs/2302.11939v1)|**[link](https://github.com/damo-di-ml/one_fits_all)**|\n", "2302.11713": "|**2023-02-24**|**Can Pre-trained Vision and Language Models Answer Visual Information-Seeking Questions?**|Yang Chen et.al.|[2302.11713v2](http://arxiv.org/abs/2302.11713v2)|**[link](https://github.com/edchengg/infoseek_eval)**|\n", "2302.11529": "|**2023-02-22**|**Modular Deep Learning**|Jonas Pfeiffer et.al.|[2302.11529v1](http://arxiv.org/abs/2302.11529v1)|null|\n", "2302.11458": "|**2023-02-22**|**Fusing Visual Appearance and Geometry for Multi-modality 6DoF Object Tracking**|Manuel Stoiber et.al.|[2302.11458v1](http://arxiv.org/abs/2302.11458v1)|**[link](https://github.com/dlr-rm/3dobjecttracking)**|\n", "2302.11352": "|**2023-02-22**|**X-TRA: Improving Chest X-ray Tasks with Cross-Modal Retrieval Augmentation**|Tom van Sonsbeek et.al.|[2302.11352v1](http://arxiv.org/abs/2302.11352v1)|null|\n", "2302.11254": "|**2023-02-22**|**Cross-modal Audio-visual Co-learning for Text-independent Speaker Verification**|Meng Liu et.al.|[2302.11254v1](http://arxiv.org/abs/2302.11254v1)|**[link](https://github.com/danielmengliu/audiovisuallip)**|\n", "2302.11154": "|**2023-02-24**|**Open-domain Visual Entity Recognition: Towards Recognizing Millions of Wikipedia Entities**|Hexiang Hu et.al.|[2302.11154v2](http://arxiv.org/abs/2302.11154v2)|**[link](https://github.com/edchengg/oven_eval)**|\n", "2302.11097": "|**2023-02-22**|**A Multi-Modal Neural Geometric Solver with Textual Clauses Parsed from Diagram**|Ming-Liang Zhang et.al.|[2302.11097v1](http://arxiv.org/abs/2302.11097v1)|**[link](https://github.com/mingliangzhang2018/pgps)**|\n", "2302.11082": "|**2023-02-22**|**BB-GCN: A Bi-modal Bridged Graph Convolutional Network for Multi-label Chest X-Ray Recognition**|Guoli Wang et.al.|[2302.11082v1](http://arxiv.org/abs/2302.11082v1)|null|\n", "2302.11025": "|**2023-02-21**|**Asteroseismology of $\u03b4$ Scuti stars: emulating model grids using a neural network**|Owen J. Scutt et.al.|[2302.11025v1](http://arxiv.org/abs/2302.11025v1)|null|\n", "2302.11021": "|**2023-02-21**|**MVMTnet: A Multi-variate Multi-modal Transformer for Multi-class Classification of Cardiac Irregularities Using ECG Waveforms and Clinical Notes**|Ankur Samanta et.al.|[2302.11021v1](http://arxiv.org/abs/2302.11021v1)|null|\n", "2302.10873": "|**2023-02-21**|**Context-Aware Timewise VAEs for Real-Time Vehicle Trajectory Prediction**|Pei Xu et.al.|[2302.10873v1](http://arxiv.org/abs/2302.10873v1)|**[link](https://github.com/xupei0610/contextvae)**|\n", "2302.10859": "|**2023-02-21**|**SF2Former: Amyotrophic Lateral Sclerosis Identification From Multi-center MRI Data Using Spatial and Frequency Fusion Transformer**|Rafsanjany Kushol et.al.|[2302.10859v1](http://arxiv.org/abs/2302.10859v1)|**[link](https://github.com/raoyongming/GFNet)**|\n", "2302.10813": "|**2023-02-21**|**Tracking Objects and Activities with Attention for Temporal Sentence Grounding**|Zeyu Xiong et.al.|[2302.10813v1](http://arxiv.org/abs/2302.10813v1)|null|\n", "2302.10632": "|**2023-02-23**|**Multi-Modal Self-Supervised Learning for Recommendation**|Wei Wei et.al.|[2302.10632v2](http://arxiv.org/abs/2302.10632v2)|**[link](https://github.com/hkuds/mmssl)**|\n", "2302.10511": "|**2023-02-21**|**MVFusion: Multi-View 3D Object Detection with Semantic-aligned Radar and Camera Fusion**|Zizhang Wu et.al.|[2302.10511v1](http://arxiv.org/abs/2302.10511v1)|null|\n", "2302.10465": "|**2023-02-21**|**A Flexible Multi-view Multi-modal Imaging System for Outdoor Scenes**|Meng Zhang et.al.|[2302.10465v1](http://arxiv.org/abs/2302.10465v1)|null|\n", "2302.10035": "|**2023-02-20**|**Large-scale Multi-Modal Pre-trained Models: A Comprehensive Survey**|Xiao Wang et.al.|[2302.10035v1](http://arxiv.org/abs/2302.10035v1)|**[link](https://github.com/wangxiao5791509/multimodal_bigmodels_survey)**|\n", "2302.09934": "|**2023-02-20**|**CISum: Learning Cross-modality Interaction to Enhance Multimodal Semantic Coverage for Multimodal Summarization**|Litian Zhang et.al.|[2302.09934v1](http://arxiv.org/abs/2302.09934v1)|null|\n", "2302.09850": "|**2023-02-20**|**Constraint and Union for Partially-Supervised Temporal Sentence Grounding**|Chen Ju et.al.|[2302.09850v1](http://arxiv.org/abs/2302.09850v1)|null|\n", "2302.09636": "|**2023-02-19**|**Interpretable Medical Image Visual Question Answering via Multi-Modal Relationship Graph Learning**|Xinyue Hu et.al.|[2302.09636v1](http://arxiv.org/abs/2302.09636v1)|null|\n", "2302.09328": "|**2023-02-18**|**SSVMR: Saliency-based Self-training for Video-Music Retrieval**|Xuxin Cheng et.al.|[2302.09328v1](http://arxiv.org/abs/2302.09328v1)|null|\n", "2302.08958": "|**2023-02-17**|**Towards Unifying Medical Vision-and-Language Pre-training via Soft Prompts**|Zhihong Chen et.al.|[2302.08958v1](http://arxiv.org/abs/2302.08958v1)|**[link](https://github.com/zhjohnchan/ptunifier)**|\n", "2302.08888": "|**2023-02-17**|**Multimodal Federated Learning via Contrastive Representation Ensemble**|Qiying Yu et.al.|[2302.08888v1](http://arxiv.org/abs/2302.08888v1)|**[link](https://github.com/flair-thu/creamfl)**|\n", "2302.08820": "|**2023-02-17**|**Understanding Stationary and Moving Direct Skin Vibrotactile Stimulation on the Palm**|Hesham Elsayed et.al.|[2302.08820v1](http://arxiv.org/abs/2302.08820v1)|null|\n", "2302.08774": "|**2023-02-17**|**Vision, Deduction and Alignment: An Empirical Study on Multi-modal Knowledge Graph Alignment**|Yangning Li et.al.|[2302.08774v1](http://arxiv.org/abs/2302.08774v1)|null|\n", "2302.08706": "|**2023-02-20**|**Fine-grained Cross-modal Fusion based Refinement for Text-to-Image Synthesis**|Haoran Sun et.al.|[2302.08706v2](http://arxiv.org/abs/2302.08706v2)|**[link](https://github.com/haoranhfut/ff-gan)**|\n", "2302.08670": "|**2023-02-17**|**Cascaded information enhancement and cross-modal attention feature fusion for multispectral pedestrian detection**|Yang Yang et.al.|[2302.08670v1](http://arxiv.org/abs/2302.08670v1)|null|\n", "2302.09302": "|**2023-02-16**|**Bridge the Gap between Language models and Tabular Understanding**|Nuo Chen et.al.|[2302.09302v1](http://arxiv.org/abs/2302.09302v1)|null|\n", "2302.08326": "|**2023-02-16**|**NUAA-QMUL-AIIT at Memotion 3: Multi-modal Fusion with Squeeze-and-Excitation for Internet Meme Emotion Analysis**|Xiaoyu Guo et.al.|[2302.08326v1](http://arxiv.org/abs/2302.08326v1)|**[link](https://github.com/xxxxxxxxy/memotion3-SEFusion)**|\n", "2302.08212": "|**2023-02-16**|**Visible-Infrared Person Re-Identification via Patch-Mixed Cross-Modality Learning**|Zhihao Qian et.al.|[2302.08212v1](http://arxiv.org/abs/2302.08212v1)|null|\n", "2302.08180": "|**2023-02-16**|**Cross Modal Distillation for Flood Extent Mapping**|Shubhika Garg et.al.|[2302.08180v1](http://arxiv.org/abs/2302.08180v1)|null|\n", "2302.08052": "|**2023-02-16**|**Hierarchical Cross-modal Transformer for RGB-D Salient Object Detection**|Hao Chen et.al.|[2302.08052v1](http://arxiv.org/abs/2302.08052v1)|null|\n", "2302.08020": "|**2023-02-16**|**All-Electrical Skyrmionic Bits in a Chiral Magnetic Tunnel Junction**|Shaohai Chen et.al.|[2302.08020v1](http://arxiv.org/abs/2302.08020v1)|null|\n", "2302.08016": "|**2023-02-16**|**Unsupervised Domain Adaptation for MRI Volume Segmentation and Classification Using Image-to-Image Translation**|Satoshi Kondo et.al.|[2302.08016v1](http://arxiv.org/abs/2302.08016v1)|null|\n", "2302.07919": "|**2023-02-15**|**COVID-VTS: Fact Extraction and Verification on Short Video Platforms**|Fuxiao Liu et.al.|[2302.07919v1](http://arxiv.org/abs/2302.07919v1)|**[link](https://github.com/fuxiaoliu/twitter-video-dataset)**|\n", "2302.07702": "|**2023-02-15**|**Audio-Visual Contrastive Learning with Temporal Self-Supervision**|Simon Jenni et.al.|[2302.07702v1](http://arxiv.org/abs/2302.07702v1)|null|\n", "2302.07693": "|**2023-02-16**|**Fine-tuning of sign language recognition models: a technical report**|Maxim Novopoltsev et.al.|[2302.07693v2](http://arxiv.org/abs/2302.07693v2)|**[link](https://github.com/ds-hub-sochi/sl-techreport)**|\n", "2302.07661": "|**2023-02-15**|**Depth- and Semantics-aware Multi-modal Domain Translation: Generating 3D Panoramic Color Images from LiDAR Point Clouds**|Tiago Cortinhal et.al.|[2302.07661v1](http://arxiv.org/abs/2302.07661v1)|**[link](https://github.com/tiagocortinhal/titan-next)**|\n", "2302.07456": "|**2023-02-15**|**Continuous-Time Fixed-Lag Smoothing for LiDAR-Inertial-Camera SLAM**|Jiajun Lv et.al.|[2302.07456v1](http://arxiv.org/abs/2302.07456v1)|**[link](https://github.com/april-zju/clic)**|\n", "2302.07269": "|**2023-02-14**|**Dual-mode adaptive-SVD ghost imaging**|Dajing Wang et.al.|[2302.07269v1](http://arxiv.org/abs/2302.07269v1)|null|\n", "2302.06914": "|**2023-02-14**|**Heterogeneous Anomaly Detection for Software Systems via Semi-supervised Cross-modal Attention**|Cheryl Lee et.al.|[2302.06914v1](http://arxiv.org/abs/2302.06914v1)|**[link](https://github.com/bebillionaireusd/hades)**|\n", "2302.10909": "|**2023-02-14**|**Multi-modal Machine Learning in Engineering Design: A Review and Future Directions**|Binyang Song et.al.|[2302.10909v1](http://arxiv.org/abs/2302.10909v1)|null|\n", "2302.06643": "|**2023-02-13**|**Vision-RADAR fusion for Robotics BEV Detections: A Survey**|Apoorv Singh et.al.|[2302.06643v1](http://arxiv.org/abs/2302.06643v1)|null|\n", "2302.06605": "|**2023-02-13**|**UniAdapter: Unified Parameter-Efficient Transfer Learning for Cross-modal Modeling**|Haoyu Lu et.al.|[2302.06605v1](http://arxiv.org/abs/2302.06605v1)|**[link](https://github.com/rerv/uniadapter)**|\n", "2302.06560": "|**2023-02-13**|**Large Scale Multi-Lingual Multi-Modal Summarization Dataset**|Yash Verma et.al.|[2302.06560v1](http://arxiv.org/abs/2302.06560v1)|**[link](https://github.com/anubhav-jangra/m3ls)**|\n", "2302.06452": "|**2023-02-13**|**Mixed Multi-Model Semantic Interaction for Graph-based Narrative Visualizations**|Brian Keith Norambuena et.al.|[2302.06452v1](http://arxiv.org/abs/2302.06452v1)|null|\n", "2302.06350": "|**2023-02-13**|**CLIP-RR: Improved CLIP Network for Relation-Focused Cross-Modal Information Retrieval**|Yan Gong et.al.|[2302.06350v1](http://arxiv.org/abs/2302.06350v1)|null|\n", "2302.06148": "|**2023-02-13**|**CoMAE: Single Model Hybrid Pre-training on Small-Scale RGB-D Datasets**|Jiange Yang et.al.|[2302.06148v1](http://arxiv.org/abs/2302.06148v1)|**[link](https://github.com/mcg-nju/comae)**|\n", "2302.12816": "|**2023-02-24**|**Floquet Analysis of Frequency Collisions**|Kentaro Heya et.al.|[2302.12816v1](http://arxiv.org/abs/2302.12816v1)|null|\n", "2302.12610": "|**2023-02-24**|**A Joint Modeling of Vision-Language-Action for Target-oriented Grasping in Clutter**|Kechun Xu et.al.|[2302.12610v1](http://arxiv.org/abs/2302.12610v1)|**[link](https://github.com/xukechun/Vision-Language-Grasping)**|\n", "2302.12552": "|**2023-02-24**|**Deep Learning for Video-Text Retrieval: a Review**|Cunjuan Zhu et.al.|[2302.12552v1](http://arxiv.org/abs/2302.12552v1)|null|\n", "2302.12258": "|**2023-02-23**|**Data leakage in cross-modal retrieval training: A case study**|Benno Weck et.al.|[2302.12258v1](http://arxiv.org/abs/2302.12258v1)|null|\n", "2302.14045": "|**2023-02-27**|**Language Is Not All You Need: Aligning Perception with Language Models**|Shaohan Huang et.al.|[2302.14045v1](http://arxiv.org/abs/2302.14045v1)|**[link](https://github.com/microsoft/unilm)**|\n", "2302.14042": "|**2023-02-27**|**Knowledge-enhanced Pre-training for Auto-diagnosis of Chest Radiology Images**|Xiaoman Zhang et.al.|[2302.14042v1](http://arxiv.org/abs/2302.14042v1)|null|\n", "2302.14007": "|**2023-02-27**|**Joint-MAE: 2D-3D Joint Masked Autoencoders for 3D Point Cloud Pre-training**|Ziyu Guo et.al.|[2302.14007v1](http://arxiv.org/abs/2302.14007v1)|null|\n", "2302.13838": "|**2023-02-27**|**Cross-modal Face- and Voice-style Transfer**|Naoya Takahashi et.al.|[2302.13838v1](http://arxiv.org/abs/2302.13838v1)|null|\n", "2302.13668": "|**2023-02-27**|**Contrastive Video Question Answering via Video Graph Transformer**|Junbin Xiao et.al.|[2302.13668v1](http://arxiv.org/abs/2302.13668v1)|**[link](https://github.com/doc-doc/covgt)**|\n", "2302.13321": "|**2023-02-26**|**Multi-Modality in Music: Predicting Emotion in Music from High-Level Audio Features and Lyrics**|Tibor Krols et.al.|[2302.13321v1](http://arxiv.org/abs/2302.13321v1)|**[link](https://github.com/tibor-krols/cogsci2-spotify)**|\n", "2302.13311": "|**2023-02-26**|**Understanding Social Media Cross-Modality Discourse in Linguistic Space**|Chunpu Xu et.al.|[2302.13311v1](http://arxiv.org/abs/2302.13311v1)|**[link](https://github.com/cpaaax/multimodal_discourse)**|\n", "2302.13187": "|**2023-02-25**|**Tractable Diversity: Scalable Multiperspective Ontology Management via Standpoint EL**|Luc\u00eda G\u00f3mez \u00c1lvarez et.al.|[2302.13187v1](http://arxiv.org/abs/2302.13187v1)|null|\n", "2302.13094": "|**2023-02-25**|**Knowledge-infused Contrastive Learning for Urban Imagery-based Socioeconomic Prediction**|Yu Liu et.al.|[2302.13094v1](http://arxiv.org/abs/2302.13094v1)|**[link](https://github.com/tsinghua-fib-lab/urbankg-knowcl)**|\n", "2302.12971": "|**2023-02-25**|**BrainCLIP: Bridging Brain and Visual-Linguistic Representation via CLIP for Generic Natural Visual Stimulus Decoding from fMRI**|Yulong Liu et.al.|[2302.12971v1](http://arxiv.org/abs/2302.12971v1)|**[link](https://github.com/YulongBonjour/BrainCLIP)**|\n", "2302.14785": "|**2023-02-28**|**Joint Representations of Text and Knowledge Graphs for Retrieval and Evaluation**|Teven Le Scao et.al.|[2302.14785v1](http://arxiv.org/abs/2302.14785v1)|null|\n", "2302.14777": "|**2023-02-28**|**VQA with Cascade of Self- and Co-Attention Blocks**|Aakansha Mishra et.al.|[2302.14777v1](http://arxiv.org/abs/2302.14777v1)|null|\n", "2302.14564": "|**2023-02-28**|**Exploring Self-supervised Pre-trained ASR Models For Dysarthric and Elderly Speech Recognition**|Shujie Hu et.al.|[2302.14564v1](http://arxiv.org/abs/2302.14564v1)|null|\n", "2302.14418": "|**2023-02-28**|**PCR-CG: Point Cloud Registration via Deep Color and Geometry**|Yu Zhang et.al.|[2302.14418v1](http://arxiv.org/abs/2302.14418v1)|**[link](https://github.com/gardlin/pcr-cg)**|\n", "2302.14264": "|**2023-02-28**|**RGB-D Grasp Detection via Depth Guided Learning with Cross-modal Attention**|Ran Qin et.al.|[2302.14264v1](http://arxiv.org/abs/2302.14264v1)|null|\n", "2302.14115": "|**2023-02-27**|**Vid2Seq: Large-Scale Pretraining of a Visual Language Model for Dense Video Captioning**|Antoine Yang et.al.|[2302.14115v1](http://arxiv.org/abs/2302.14115v1)|**[link](https://github.com/google-research/scenic/tree/main/scenic/projects/vid2seq)**|\n", "2302.14082": "|**2023-02-27**|**Detecting and Mitigating Mode-Collapse for Flow-based Sampling of Lattice Field Theories**|Kim A. Nicoli et.al.|[2302.14082v1](http://arxiv.org/abs/2302.14082v1)|null|\n", "2303.00720": "|**2023-03-01**|**Cross-Modal Entity Matching for Visually Rich Documents**|Ritesh Sarkhel et.al.|[2303.00720v1](http://arxiv.org/abs/2303.00720v1)|null|\n", "2303.00534": "|**2023-03-01**|**RAMM: Retrieval-augmented Biomedical Visual Question Answering with Multi-modal Pre-training**|Zheng Yuan et.al.|[2303.00534v1](http://arxiv.org/abs/2303.00534v1)|**[link](https://github.com/GanjinZero/RAMM)**|\n", "2303.00462": "|**2023-03-02**|**Hidden Gems: 4D Radar Scene Flow Learning Using Cross-Modal Supervision**|Fangqiang Ding et.al.|[2303.00462v2](http://arxiv.org/abs/2303.00462v2)|**[link](https://github.com/toytiny/cmflow)**|\n", "2303.00448": "|**2023-03-01**|**The style transformer with common knowledge optimization for image-text retrieval**|Wenrui Li et.al.|[2303.00448v1](http://arxiv.org/abs/2303.00448v1)|null|\n", "2303.00369": "|**2023-03-02**|**Indescribable Multi-modal Spatial Evaluator**|Lingke Kong et.al.|[2303.00369v2](http://arxiv.org/abs/2303.00369v2)|**[link](https://github.com/kid-liet/imse)**|\n", "2303.00289": "|**2023-03-01**|**StrucTexTv2: Masked Visual-Textual Prediction for Document Image Pre-training**|Yuechen Yu et.al.|[2303.00289v1](http://arxiv.org/abs/2303.00289v1)|**[link](https://github.com/PaddlePaddle/VIMER/tree/main/StrucTexT/v2)**|\n", "2303.00277": "|**2023-03-01**|**UAV Tracking with Lidar as a Camera Sensors in GNSS-Denied Environments**|Ha Sier et.al.|[2303.00277v1](http://arxiv.org/abs/2303.00277v1)|**[link](https://github.com/tiers/uav-tracking-based-on-lidar-as-a-camera)**|\n", "2303.00233": "|**2023-03-01**|**Single-Cell Multimodal Prediction via Transformers**|Wenzhuo Tang et.al.|[2303.00233v1](http://arxiv.org/abs/2303.00233v1)|**[link](https://github.com/omicsml/scmoformer)**|\n", "2303.00200": "|**2023-03-01**|**Feature Extraction Matters More: Universal Deepfake Disruption through Attacking Ensemble Feature Extractors**|Long Tang et.al.|[2303.00200v1](http://arxiv.org/abs/2303.00200v1)|null|\n", "2303.00073": "|**2023-02-28**|**Cross-correlated quantum thermometry using diamond containing dual-defect centers**|Madhav Gupta et.al.|[2303.00073v1](http://arxiv.org/abs/2303.00073v1)|null|\n", "2303.00040": "|**2023-02-28**|**Towards Generalisable Video Moment Retrieval: Visual-Dynamic Injection to Image-Text Pre-Training**|Dezhao Luo et.al.|[2303.00040v1](http://arxiv.org/abs/2303.00040v1)|null|\n", "2303.01480": "|**2023-03-02**|**Delivering Arbitrary-Modal Semantic Segmentation**|Jiaming Zhang et.al.|[2303.01480v1](http://arxiv.org/abs/2303.01480v1)|**[link](https://github.com/jamycheung/DELIVER)**|\n", "2303.01311": "|**2023-03-02**|**Zero-Shot Text-to-Parameter Translation for Game Character Auto-Creation**|Rui Zhao et.al.|[2303.01311v1](http://arxiv.org/abs/2303.01311v1)|null|\n", "2303.01310": "|**2023-03-02**|**Learning Language-Conditioned Deformable Object Manipulation with Graph Dynamics**|Kai Mo et.al.|[2303.01310v1](http://arxiv.org/abs/2303.01310v1)|null|\n", "2303.01217": "|**2023-03-02**|**Synthetic Misinformers: Generating and Combating Multimodal Misinformation**|Stefanos-Iordanis Papadopoulos et.al.|[2303.01217v1](http://arxiv.org/abs/2303.01217v1)|null|\n", "2303.01043": "|**2023-03-02**|**I2P-Rec: Recognizing Images on Large-scale Point Cloud Maps through Bird's Eye View Projections**|Yixuan Li et.al.|[2303.01043v1](http://arxiv.org/abs/2303.01043v1)|null|\n", "2303.00882": "|**2023-03-02**|**X-Ray2EM: Uncertainty-Aware Cross-Modality Image Reconstruction from X-Ray to Electron Microscopy in Connectomics**|Yicong Li et.al.|[2303.00882v1](http://arxiv.org/abs/2303.00882v1)|null|\n", "2303.00865": "|**2023-03-01**|**AMIGO: Sparse Multi-Modal Graph Transformer with Shared-Context Processing for Representation Learning of Giga-pixel Images**|Ramin Nakhli et.al.|[2303.00865v1](http://arxiv.org/abs/2303.00865v1)|**[link](https://github.com/raminnakhli/amigo)**|\n", "2303.00806": "|**2023-03-01**|**Survival modelling of smartphone trigger data for earthquake parameter estimation in early warning. With applications to 2023 Turkish-Syrian and 2019 Ridgecrest events**|Luca Aiello et.al.|[2303.00806v1](http://arxiv.org/abs/2303.00806v1)|null|\n", "2303.02139": "|**2023-03-03**|**Data Association Aware POMDP Planning with Hypothesis Pruning Performance Guarantees**|Moran Barenboim et.al.|[2303.02139v1](http://arxiv.org/abs/2303.02139v1)|null|\n", "2303.01933": "|**2023-03-03**|**BogieCopter: A Multi-Modal Aerial-Ground Vehicle for Long-Endurance Inspection Applications**|Teodoro Dias et.al.|[2303.01933v1](http://arxiv.org/abs/2303.01933v1)|null|\n", "2303.01510": "|**2023-03-02**|**INO at Factify 2: Structure Coherence based Multi-Modal Fact Verification**|Yinuo Zhang et.al.|[2303.01510v1](http://arxiv.org/abs/2303.01510v1)|**[link](https://github.com/catrin-baze/ino-of-factify)**|\n", "2303.03378": "|**2023-03-06**|**PaLM-E: An Embodied Multimodal Language Model**|Danny Driess et.al.|[2303.03378v1](http://arxiv.org/abs/2303.03378v1)|null|\n", "2303.03131": "|**2023-03-08**|**Video Question Answering Using CLIP-Guided Visual-Text Attention**|Shuhong Ye et.al.|[2303.03131v2](http://arxiv.org/abs/2303.03131v2)|null|\n", "2303.03093": "|**2023-03-06**|**A Miniaturised Camera-based Multi-Modal Tactile Sensor**|Kaspar Althoefer et.al.|[2303.03093v1](http://arxiv.org/abs/2303.03093v1)|null|\n", "2303.03056": "|**2023-03-07**|**MOISST: Multi-modal Optimization of Implicit Scene for SpatioTemporal calibration**|Quentin Herau et.al.|[2303.03056v2](http://arxiv.org/abs/2303.03056v2)|null|\n", "2303.03032": "|**2023-03-06**|**DeCap: Decoding CLIP Latents for Zero-Shot Captioning via Text-Only Training**|Wei Li et.al.|[2303.03032v1](http://arxiv.org/abs/2303.03032v1)|**[link](https://github.com/dhg-wei/decap)**|\n", "2303.02995": "|**2023-03-06**|**HiCLIP: Contrastive Language-Image Pretraining with Hierarchy-aware Attention**|Shijie Geng et.al.|[2303.02995v1](http://arxiv.org/abs/2303.02995v1)|**[link](https://github.com/jeykigung/hiclip)**|\n", "2303.02976": "|**2023-03-06**|**Dronument: System for Reliable Deployment of Micro Aerial Vehicles in Dark Areas of Large Historical Monuments**|Pavel Petracek et.al.|[2303.02976v1](http://arxiv.org/abs/2303.02976v1)|null|\n", "2303.02688": "|**2023-03-05**|**Text2Face: A Multi-Modal 3D Face Model**|Will Rowan et.al.|[2303.02688v1](http://arxiv.org/abs/2303.02688v1)|null|\n", "2303.02684": "|**2023-03-05**|**Robust Multi-Modal Multi-LiDAR-Inertial Odometry and Mapping for Indoor Environments**|Li Qingqing et.al.|[2303.02684v1](http://arxiv.org/abs/2303.02684v1)|**[link](https://github.com/tiers/multi-modal-loam)**|\n", "2303.02506": "|**2023-03-04**|**Prismer: A Vision-Language Model with An Ensemble of Experts**|Shikun Liu et.al.|[2303.02506v1](http://arxiv.org/abs/2303.02506v1)|**[link](https://github.com/nvlabs/prismer)**|\n", "2303.02483": "|**2023-03-04**|**FAME-ViL: Multi-Tasking Vision-Language Model for Heterogeneous Fashion Tasks**|Xiao Han et.al.|[2303.02483v1](http://arxiv.org/abs/2303.02483v1)|**[link](https://github.com/brandonhanx/fame-vil)**|\n", "2303.02479": "|**2023-03-04**|**Chronic Kidney Disease of Unknown Aetiolgy (CKDu)-the search for causes and the impact of its politicization**|Chandre Dharma-wardana et.al.|[2303.02479v1](http://arxiv.org/abs/2303.02479v1)|null|\n", "2303.02407": "|**2023-03-04**|**Local Navigation Among Movable Obstacles with Deep Reinforcement Learning**|Linghong Yao et.al.|[2303.02407v1](http://arxiv.org/abs/2303.02407v1)|null|\n", "2303.02323": "|**2023-03-04**|**APE: An Open and Shared Annotated Dataset for Learning Urban Pedestrian Path Networks**|Yuxiang Zhang et.al.|[2303.02323v1](http://arxiv.org/abs/2303.02323v1)|null|\n", "2303.02203": "|**2023-03-03**|**X$^3$KD: Knowledge Distillation Across Modalities, Tasks and Stages for Multi-Camera 3D Object Detection**|Marvin Klingner et.al.|[2303.02203v1](http://arxiv.org/abs/2303.02203v1)|null|\n", "2303.03991": "|**2023-03-07**|**OpenOccupancy: A Large Scale Benchmark for Surrounding Semantic Occupancy Perception**|Xiaofeng Wang et.al.|[2303.03991v1](http://arxiv.org/abs/2303.03991v1)|**[link](https://github.com/jeffwang987/openoccupancy)**|\n", "2303.03878": "|**2023-03-07**|**A convergence analysis of a structure-preserving gradient flow method for the all-electron Kohn-Sham model**|Yedan Shen et.al.|[2303.03878v1](http://arxiv.org/abs/2303.03878v1)|null|\n", "2303.03595": "|**2023-03-07**|**LoGoNet: Towards Accurate 3D Object Detection with Local-to-Global Cross-Modal Fusion**|Xin Li et.al.|[2303.03595v1](http://arxiv.org/abs/2303.03595v1)|**[link](https://github.com/sankin97/logonet)**|\n", "2303.03449": "|**2023-03-06**|**Dual-encoded magnetization transfer and diffusion imaging and its application to tract-specific microstructure mapping**|Ilana R Leppert et.al.|[2303.03449v1](http://arxiv.org/abs/2303.03449v1)|**[link](https://github.com/tardiflab/mt-diff)**|\n", "2303.04748": "|**2023-03-08**|**CLIP-FO3D: Learning Free Open-world 3D Scene Representations from 2D Dense CLIP**|Junbo Zhang et.al.|[2303.04748v1](http://arxiv.org/abs/2303.04748v1)|null|\n", "2303.04585": "|**2023-03-08**|**New Audio Representations Image Gan Generation from BriVL**|Sen Fang et.al.|[2303.04585v1](http://arxiv.org/abs/2303.04585v1)|**[link](https://github.com/fangsen9000/brivl-generation)**|\n", "2303.04439": "|**2023-03-08**|**A Light Weight Model for Active Speaker Detection**|Junhua Liao et.al.|[2303.04439v1](http://arxiv.org/abs/2303.04439v1)|**[link](https://github.com/junhua-liao/light-asd)**|\n", "2303.04398": "|**2023-03-08**|**Implications of Personality on Cognitive Workload, Affect, and Task Performance in Robot Remote Control**|Go-Eum Cha et.al.|[2303.04398v1](http://arxiv.org/abs/2303.04398v1)|null|\n", "2303.04364": "|**2023-03-08**|**Dynamic Scenario Representation Learning for Motion Forecasting with Heterogeneous Graph Convolutional Recurrent Networks**|Xing Gao et.al.|[2303.04364v1](http://arxiv.org/abs/2303.04364v1)|null|\n", "2303.05499": "|**2023-03-10**|**Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection**|Shilong Liu et.al.|[2303.05499v2](http://arxiv.org/abs/2303.05499v2)|**[link](https://github.com/idea-research/groundingdino)**|\n", "2303.05338": "|**2023-03-11**|**MMCosine: Multi-Modal Cosine Loss Towards Balanced Audio-Visual Fine-Grained Learning**|Ruize Xu et.al.|[2303.05338v2](http://arxiv.org/abs/2303.05338v2)|null|\n", "2303.05313": "|**2023-03-09**|**Replacement as a Self-supervision for Fine-grained Vision-language Pre-training**|Lisai Zhang et.al.|[2303.05313v1](http://arxiv.org/abs/2303.05313v1)|null|\n", "2303.05309": "|**2023-03-09**|**MixSpeech: Cross-Modality Self-Learning with Audio-Visual Stream Mixup for Visual Speech Translation and Recognition**|Xize Cheng et.al.|[2303.05309v1](http://arxiv.org/abs/2303.05309v1)|**[link](https://github.com/exgc/avmust-ted)**|\n", "2303.05193": "|**2023-03-09**|**GOATS: Goal Sampling Adaptation for Scooping with Curriculum Reinforcement Learning**|Yaru Niu et.al.|[2303.05193v1](http://arxiv.org/abs/2303.05193v1)|null|\n", "2303.05093": "|**2023-03-09**|**Improving Video Retrieval by Adaptive Margin**|Feng He et.al.|[2303.05093v1](http://arxiv.org/abs/2303.05093v1)|null|\n", "2303.05026": "|**2023-03-09**|**SSL^2: Self-Supervised Learning meets Semi-Supervised Learning: Multiple Sclerosis Segmentation in 7T-MRI from large-scale 3T-MRI**|Jiacheng Wang et.al.|[2303.05026v1](http://arxiv.org/abs/2303.05026v1)|null|\n", "2303.04955": "|**2023-03-09**|**Exploring Smart Commercial Building Occupants' Perceptions and Notification Preferences of Internet of Things Data Collection in the United States**|Tu Le et.al.|[2303.04955v1](http://arxiv.org/abs/2303.04955v1)|null|\n", "2303.06129": "|**2023-03-10**|**Single-branch Network for Multimodal Training**|Muhammad Saad Saeed et.al.|[2303.06129v1](http://arxiv.org/abs/2303.06129v1)|**[link](https://github.com/msaadsaeed/sbnet)**|\n", "2303.05952": "|**2023-03-10**|**Understanding and Constructing Latent Modality Structures in Multi-modal Representation Learning**|Qian Jiang et.al.|[2303.05952v1](http://arxiv.org/abs/2303.05952v1)|null|\n", "2303.05936": "|**2023-03-10**|**Learning Decoupled Multi-touch Force Estimation, Localization and Stretch for Soft Capacitive E-skin**|Abu Bakar Dawood et.al.|[2303.05936v1](http://arxiv.org/abs/2303.05936v1)|null|\n", "2303.05793": "|**2023-03-10**|**Analyzing covariate clustering effects in healthcare cost subgroups: insights and applications for prediction**|Zhengxiao Li et.al.|[2303.05793v1](http://arxiv.org/abs/2303.05793v1)|**[link](https://github.com/huangyf2217/fmr-covariates-clustering)**|\n", "2303.05725": "|**2023-03-10**|**CVT-SLR: Contrastive Visual-Textual Transformation for Sign Language Recognition with Variational Alignment**|Jiangbin Zheng et.al.|[2303.05725v1](http://arxiv.org/abs/2303.05725v1)|**[link](https://github.com/binbinjiang/cvt-slr)**|\n", "2303.05714": "|**2023-03-10**|**Simultaneous estimation of multiple eigenvalues with short-depth quantum circuit on early fault-tolerant quantum computers**|Zhiyan Ding et.al.|[2303.05714v1](http://arxiv.org/abs/2303.05714v1)|null|\n", "2303.05707": "|**2023-03-10**|**MuLTI: Efficient Video-and-Language Understanding with MultiWay-Sampler and Multiple Choice Modeling**|Jiaqi Xu et.al.|[2303.05707v1](http://arxiv.org/abs/2303.05707v1)|null|\n", "2303.07284": "|**2023-03-13**|**Align and Attend: Multimodal Summarization with Dual Contrastive Losses**|Bo He et.al.|[2303.07284v1](http://arxiv.org/abs/2303.07284v1)|**[link](https://github.com/boheumd/A2Summ)**|\n", "2303.07274": "|**2023-03-14**|**Breaking Common Sense: WHOOPS! A Vision-and-Language Benchmark of Synthetic and Compositional Images**|Nitzan Bitton-Guetta et.al.|[2303.07274v2](http://arxiv.org/abs/2303.07274v2)|null|\n", "2303.07265": "|**2023-03-13**|**Multimodal Reinforcement Learning for Robots Collaborating with Humans**|Afagh Mehri Shervedani et.al.|[2303.07265v1](http://arxiv.org/abs/2303.07265v1)|null|\n", "2303.07064": "|**2023-03-13**|**A Generalized Multi-Modal Fusion Detection Framework**|Leichao Cui et.al.|[2303.07064v1](http://arxiv.org/abs/2303.07064v1)|null|\n", "2303.07000": "|**2023-03-13**|**Predicting Density of States via Multi-modal Transformer**|Namkyeong Lee et.al.|[2303.07000v1](http://arxiv.org/abs/2303.07000v1)|**[link](https://github.com/heewoongnoh/dostransformer)**|\n", "2303.06947": "|**2023-03-13**|**A Multi-Modal Simulation Framework to Enable Digital Twin-based V2X Communications in Dynamic Environments**|Lorenzo Cazzella et.al.|[2303.06947v1](http://arxiv.org/abs/2303.06947v1)|null|\n", "2303.06840": "|**2023-03-13**|**DDFM: Denoising Diffusion Model for Multi-Modality Image Fusion**|Zixiang Zhao et.al.|[2303.06840v1](http://arxiv.org/abs/2303.06840v1)|**[link](https://github.com/zhaozixiang1228/mmif-ddfm)**|\n", "2303.06662": "|**2023-03-12**|**Fuzzy Alignments in Directed Acyclic Graph for Non-Autoregressive Machine Translation**|Zhengrui Ma et.al.|[2303.06662v1](http://arxiv.org/abs/2303.06662v1)|**[link](https://github.com/ictnlp/fa-dat)**|\n", "2303.06555": "|**2023-03-12**|**One Transformer Fits All Distributions in Multi-Modal Diffusion at Scale**|Fan Bao et.al.|[2303.06555v1](http://arxiv.org/abs/2303.06555v1)|**[link](https://github.com/thu-ml/unidiffuser)**|\n", "2303.06536": "|**2023-03-12**|**AutoOptLib: A Library of Automatically Designing Metaheuristic Optimization Algorithms in MATLAB**|Qi Zhao et.al.|[2303.06536v1](http://arxiv.org/abs/2303.06536v1)|**[link](https://github.com/qz89/AutoOpt)**|\n", "2303.06464": "|**2023-03-11**|**PARASOL: Parametric Style Control for Diffusion Image Synthesis**|Gemma Canet Tarr\u00e9s et.al.|[2303.06464v1](http://arxiv.org/abs/2303.06464v1)|null|\n", "2303.06422": "|**2023-03-11**|**An approximate control variates approach to multifidelity distribution estimation**|Ruijian Han et.al.|[2303.06422v1](http://arxiv.org/abs/2303.06422v1)|null|\n", "2303.06398": "|**2023-03-11**|**Variational Gaussian filtering via Wasserstein gradient flows**|Adrie Corenflos et.al.|[2303.06398v1](http://arxiv.org/abs/2303.06398v1)|**[link](https://github.com/hanyas/wasserstein-flow-filter)**|\n", "2303.06378": "|**2023-03-11**|**Learning Grounded Vision-Language Representation for Versatile Understanding in Untrimmed Videos**|Teng Wang et.al.|[2303.06378v1](http://arxiv.org/abs/2303.06378v1)|**[link](https://github.com/zjr2000/gvl)**|\n", "2303.06345": "|**2023-03-11**|**Semantics-Aware Dynamic Localization and Refinement for Referring Image Segmentation**|Zhao Yang et.al.|[2303.06345v1](http://arxiv.org/abs/2303.06345v1)|null|\n", "2303.08129": "|**2023-03-14**|**PiMAE: Point Cloud and Image Interactive Masked Autoencoders for 3D Object Detection**|Anthony Chen et.al.|[2303.08129v1](http://arxiv.org/abs/2303.08129v1)|**[link](https://github.com/blvlab/pimae)**|\n", "2303.08054": "|**2023-03-15**|**Statistical Hardware Design With Multi-model Active Learning**|Alireza Ghaffari et.al.|[2303.08054v2](http://arxiv.org/abs/2303.08054v2)|null|\n", "2303.08017": "|**2023-03-14**|**Reliable Beamforming at Terahertz Bands: Are Causal Representations the Way Forward?**|Christo Kurisummoottil Thomas et.al.|[2303.08017v1](http://arxiv.org/abs/2303.08017v1)|null|\n", "2303.07896": "|**2023-03-16**|**Exploring Weakly Supervised Semantic Segmentation Ensembles for Medical Imaging Systems**|Erik Ostrowski et.al.|[2303.07896v2](http://arxiv.org/abs/2303.07896v2)|**[link](https://github.com/erikostrowski/automated_ensemble)**|\n", "2303.07775": "|**2023-03-14**|**Data-Free Sketch-Based Image Retrieval**|Abhra Chaudhuri et.al.|[2303.07775v1](http://arxiv.org/abs/2303.07775v1)|**[link](https://github.com/abhrac/data-free-sbir)**|\n", "2303.07748": "|**2023-03-14**|**Generation-Guided Multi-Level Unified Network for Video Grounding**|Xing Cheng et.al.|[2303.07748v1](http://arxiv.org/abs/2303.07748v1)|null|\n", "2303.07742": "|**2023-03-14**|**ForDigitStress: A multi-modal stress dataset employing a digital job interview scenario**|Alexander Heimerl et.al.|[2303.07742v1](http://arxiv.org/abs/2303.07742v1)|null|\n", "2303.07674": "|**2023-03-14**|**Koos Classification of Vestibular Schwannoma via Image Translation-Based Unsupervised Cross-Modality Domain Adaptation**|Tao Yang et.al.|[2303.07674v1](http://arxiv.org/abs/2303.07674v1)|null|\n", "2303.07667": "|**2023-03-14**|**Improving Music Genre Classification from multi-modal properties of music and genre correlations Perspective**|Ganghui Ru et.al.|[2303.07667v1](http://arxiv.org/abs/2303.07667v1)|null|\n", "2303.07647": "|**2023-03-15**|**Recent Advances and Applications of Machine Learning in Experimental Solid Mechanics: A Review**|Hanxun Jin et.al.|[2303.07647v2](http://arxiv.org/abs/2303.07647v2)|null|\n", "2303.07601": "|**2023-03-14**|**V2V4Real: A Real-world Large-scale Dataset for Vehicle-to-Vehicle Cooperative Perception**|Runsheng Xu et.al.|[2303.07601v1](http://arxiv.org/abs/2303.07601v1)|**[link](https://github.com/ucla-mobility/v2v4real)**|\n", "2303.07543": "|**2023-03-14**|**WDiscOOD: Out-of-Distribution Detection via Whitened Linear Discriminative Analysis**|Yiye Chen et.al.|[2303.07543v1](http://arxiv.org/abs/2303.07543v1)|**[link](https://github.com/ivalab/wdiscood)**|\n", "2303.07522": "|**2023-03-13**|**Audio Visual Language Maps for Robot Navigation**|Chenguang Huang et.al.|[2303.07522v1](http://arxiv.org/abs/2303.07522v1)|null|\n", "2303.08692": "|**2023-03-15**|**SpiderMesh: Spatial-aware Demand-guided Recursive Meshing for RGB-T Semantic Segmentation**|Siqi Fan et.al.|[2303.08692v1](http://arxiv.org/abs/2303.08692v1)|**[link](https://github.com/leofansq/spidermesh)**|\n", "2303.08600": "|**2023-03-15**|**MSeg3D: Multi-modal 3D Semantic Segmentation for Autonomous Driving**|Jiale Li et.al.|[2303.08600v1](http://arxiv.org/abs/2303.08600v1)|**[link](https://github.com/jialeli1/lidarseg3d)**|\n", "2303.08562": "|**2023-03-15**|**MGA: Medical generalist agent through text-guided knowledge transformation**|Weijian Huang et.al.|[2303.08562v1](http://arxiv.org/abs/2303.08562v1)|null|\n", "2303.08518": "|**2023-03-15**|**UPRISE: Universal Prompt Retrieval for Improving Zero-Shot Evaluation**|Daixuan Cheng et.al.|[2303.08518v1](http://arxiv.org/abs/2303.08518v1)|**[link](https://github.com/microsoft/lmops)**|\n", "2303.08419": "|**2023-03-15**|**Multi-Modal Facial Expression Recognition with Transformer-Based Fusion Networks and Dynamic Sampling**|Jun-Hwa Kim et.al.|[2303.08419v1](http://arxiv.org/abs/2303.08419v1)|null|\n", "2303.08372": "|**2023-03-15**|**Target Sound Extraction with Variable Cross-modality Clues**|Chenda Li et.al.|[2303.08372v1](http://arxiv.org/abs/2303.08372v1)|**[link](https://github.com/lichenda/multi-clue-tse-data)**|\n", "2303.08367": "|**2023-03-15**|**Uncertainty-Aware Pedestrian Trajectory Prediction via Distributional Diffusion**|Yao Liu et.al.|[2303.08367v1](http://arxiv.org/abs/2303.08367v1)|null|\n", "2303.08359": "|**2023-03-15**|**Haptics-Enabled Forceps with Multi-Modal Force Sensing: Towards Task-Autonomous Robotic Surgery**|Tangyou Liu et.al.|[2303.08359v1](http://arxiv.org/abs/2303.08359v1)|null|\n", "2303.08356": "|**2023-03-15**|**Continuous emotion recognition based on TCN and Transformer**|Weiwei Zhou et.al.|[2303.08356v1](http://arxiv.org/abs/2303.08356v1)|**[link](https://github.com/upczww/abaw5)**|\n", "2303.09463": "|**2023-03-16**|**An Autonomous System for Head-to-Head Race: Design, Implementation and Analysis; Team KAIST at the Indy Autonomous Challenge**|Chanyoung Jung et.al.|[2303.09463v1](http://arxiv.org/abs/2303.09463v1)|null|\n", "2303.09381": "|**2023-03-16**|**Multi-modal Differentiable Unsupervised Feature Selection**|Junchen Yang et.al.|[2303.09381v1](http://arxiv.org/abs/2303.09381v1)|**[link](https://github.com/jcyang34/mmdufs)**|\n", "2303.09373": "|**2023-03-16**|**3D Masked Autoencoding and Pseudo-labeling for Domain Adaptive Segmentation of Heterogeneous Infant Brain MRI**|Xuzhe Zhang et.al.|[2303.09373v1](http://arxiv.org/abs/2303.09373v1)|null|\n", "2303.09367": "|**2023-03-16**|**Goal-conditioned Offline Reinforcement Learning through State Space Partitioning**|Mianchu Wang et.al.|[2303.09367v1](http://arxiv.org/abs/2303.09367v1)|null|\n", "2303.09319": "|**2023-03-16**|**Unified Multi-Modal Latent Diffusion for Joint Subject and Text Conditional Image Generation**|Yiyang Ma et.al.|[2303.09319v1](http://arxiv.org/abs/2303.09319v1)|null|\n", "2303.09270": "|**2023-03-16**|**SpectralCLIP: Preventing Artifacts in Text-Guided Style Transfer from a Spectral Perspective**|Zipeng Xu et.al.|[2303.09270v1](http://arxiv.org/abs/2303.09270v1)|**[link](https://github.com/zipengxuc/spectralclip)**|\n", "2303.09167": "|**2023-03-16**|**Emotional Reaction Intensity Estimation Based on Multimodal Data**|Shangfei Wang et.al.|[2303.09167v1](http://arxiv.org/abs/2303.09167v1)|null|\n", "2303.09119": "|**2023-03-16**|**Taming Diffusion Models for Audio-Driven Co-Speech Gesture Generation**|Lingting Zhu et.al.|[2303.09119v1](http://arxiv.org/abs/2303.09119v1)|**[link](https://github.com/advocate99/diffgesture)**|\n", "2303.09117": "|**2023-03-16**|**Visual-Linguistic Causal Intervention for Radiology Report Generation**|Weixing Chen et.al.|[2303.09117v1](http://arxiv.org/abs/2303.09117v1)|**[link](https://github.com/wissingchen/vlci)**|\n", "2303.08942": "|**2023-03-15**|**Spherical Space Feature Decomposition for Guided Depth Map Super-Resolution**|Zixiang Zhao et.al.|[2303.08942v1](http://arxiv.org/abs/2303.08942v1)|null|\n", "2303.10056": "|**2023-03-17**|**GlueGen: Plug and Play Multi-modal Encoders for X-to-image Generation**|Can Qin et.al.|[2303.10056v1](http://arxiv.org/abs/2303.10056v1)|**[link](https://github.com/salesforce/gluegen)**|\n", "2303.10033": "|**2023-03-17**|**Multi-modal Expression Recognition with Ensemble Method**|Chuanhe Liu et.al.|[2303.10033v1](http://arxiv.org/abs/2303.10033v1)|null|\n", "2303.09858": "|**2023-03-20**|**MedLocker: A Transferable Adversarial Watermarking for Preventing Unauthorized Analysis of Medical Image Dataset**|Bangzheng Pu et.al.|[2303.09858v2](http://arxiv.org/abs/2303.09858v2)|null|\n", "2303.09830": "|**2023-03-17**|**Prototype Knowledge Distillation for Medical Segmentation with Missing Modality**|Shuai Wang et.al.|[2303.09830v1](http://arxiv.org/abs/2303.09830v1)|**[link](https://github.com/sakurajimamaiii/protokd)**|\n", "2303.09825": "|**2023-03-17**|**LCE-Calib: Automatic LiDAR-Frame/Event Camera Extrinsic Calibration With A Globally Optimal Solution**|Jianhao Jiao et.al.|[2303.09825v1](http://arxiv.org/abs/2303.09825v1)|**[link](https://github.com/hkustgz-iadc/lcecalib)**|\n", "2303.09817": "|**2023-03-17**|**Hospital Length of Stay Prediction Based on Multi-modal Data towards Trustworthy Human-AI Collaboration in Radiomics**|Hubert Baniecki et.al.|[2303.09817v1](http://arxiv.org/abs/2303.09817v1)|**[link](https://github.com/modeloriented/survex)**|\n", "2303.09800": "|**2023-03-17**|**GOOD: General Optimization-based Fusion for 3D Object Detection via LiDAR-Camera Object Candidates**|Bingqi Shen et.al.|[2303.09800v1](http://arxiv.org/abs/2303.09800v1)|null|\n", "2303.09797": "|**2023-03-17**|**MMFace4D: A Large-Scale Multi-Modal 4D Face Dataset for Audio-Driven 3D Face Animation**|Haozhe Wu et.al.|[2303.09797v1](http://arxiv.org/abs/2303.09797v1)|null|\n", "2303.09756": "|**2023-03-17**|**Video Action Recognition with Attentive Semantic Units**|Yifei Chen et.al.|[2303.09756v1](http://arxiv.org/abs/2303.09756v1)|null|\n", "2303.09733": "|**2023-03-17**|**Scribble-Supervised RGB-T Salient Object Detection**|Zhengyi Liu et.al.|[2303.09733v1](http://arxiv.org/abs/2303.09733v1)|**[link](https://github.com/liuzywen/rgbtscribble-icme2023)**|\n", "2303.09695": "|**2023-03-17**|**PersonalTailor: Personalizing 2D Pattern Design from 3D Garment Point Clouds**|Anran Qi et.al.|[2303.09695v1](http://arxiv.org/abs/2303.09695v1)|null|\n", "2303.11181": "|**2023-03-20**|**Non-Markovian paths and cycles in NFT trades**|Haaroon Yousaf et.al.|[2303.11181v1](http://arxiv.org/abs/2303.11181v1)|null|\n", "2303.11090": "|**2023-03-20**|**Scene Graph Based Fusion Network For Image-Text Retrieval**|Guoliang Wang et.al.|[2303.11090v1](http://arxiv.org/abs/2303.11090v1)|null|\n", "2303.10895": "|**2023-03-20**|**Leapfrog Diffusion Model for Stochastic Trajectory Prediction**|Weibo Mao et.al.|[2303.10895v1](http://arxiv.org/abs/2303.10895v1)|**[link](https://github.com/mediabrain-sjtu/led)**|\n", "2303.10865": "|**2023-03-21**|**Rotating Objects via In-Hand Pivoting using Vision, Force and Touch**|Shiyu Xu et.al.|[2303.10865v2](http://arxiv.org/abs/2303.10865v2)|null|\n", "2303.10849": "|**2023-03-20**|**Facial Affective Analysis based on MAE and Multi-modal Information for 5th ABAW Competition**|Wei Zhang et.al.|[2303.10849v1](http://arxiv.org/abs/2303.10849v1)|null|\n", "2303.10839": "|**2023-03-21**|**MXM-CLR: A Unified Framework for Contrastive Learning of Multifold Cross-Modal Representations**|Ye Wang et.al.|[2303.10839v2](http://arxiv.org/abs/2303.10839v2)|null|\n", "2303.10835": "|**2023-03-20**|**Bifurcation analysis of the Keynesian cross model**|Xinyu Li et.al.|[2303.10835v1](http://arxiv.org/abs/2303.10835v1)|null|\n", "2303.10826": "|**2023-03-20**|**Visual Prompt Multi-Modal Tracking**|Jiawen Zhu et.al.|[2303.10826v1](http://arxiv.org/abs/2303.10826v1)|**[link](https://github.com/jiawen-zhu/vipt)**|\n", "2303.10794": "|**2023-03-19**|**PheME: A deep ensemble framework for improving phenotype prediction from multi-modal data**|Shenghan Zhang et.al.|[2303.10794v1](http://arxiv.org/abs/2303.10794v1)|null|\n", "2303.10766": "|**2023-03-21**|**Multi-modal reward for visual relationships-based image captioning**|Ali Abedi et.al.|[2303.10766v2](http://arxiv.org/abs/2303.10766v2)|null|\n", "2303.10667": "|**2023-03-19**|**Audio-Text Models Do Not Yet Leverage Natural Language**|Ho-Hsiang Wu et.al.|[2303.10667v1](http://arxiv.org/abs/2303.10667v1)|**[link](https://github.com/hohsiangwu/preposition-synthesis)**|\n", "2303.10590": "|**2023-03-19**|**Multi-modal Facial Action Unit Detection with Large Pre-trained Models for the 5th Competition on Affective Behavior Analysis in-the-wild**|Yufeng Yin et.al.|[2303.10590v1](http://arxiv.org/abs/2303.10590v1)|null|\n", "2303.10571": "|**2023-03-19**|**CLIP4MC: An RL-Friendly Vision-Language Model for Minecraft**|Ziluo Ding et.al.|[2303.10571v1](http://arxiv.org/abs/2303.10571v1)|**[link](https://github.com/PKU-RL/CLIP4MC)**|\n", "2303.10457": "|**2023-03-18**|**Multi-Modal Continual Test-Time Adaptation for 3D Semantic Segmentation**|Haozhi Cao et.al.|[2303.10457v1](http://arxiv.org/abs/2303.10457v1)|null|\n", "2303.10406": "|**2023-03-18**|**3DQD: Generalized Deep 3D Shape Prior via Part-Discretized Diffusion Process**|Yuhan Li et.al.|[2303.10406v1](http://arxiv.org/abs/2303.10406v1)|**[link](https://github.com/colorful-liyu/3dqd)**|\n", "2303.12060": "|**2023-03-21**|**VideoXum: Cross-modal Visual and Textural Summarization of Videos**|Jingyang Lin et.al.|[2303.12060v1](http://arxiv.org/abs/2303.12060v1)|null|\n", "2303.11771": "|**2023-03-21**|**Self-Sufficient Framework for Continuous Sign Language Recognition**|Youngjoon Jang et.al.|[2303.11771v1](http://arxiv.org/abs/2303.11771v1)|null|\n", "2303.11732": "|**2023-03-21**|**Multi-modal Prompting for Low-Shot Temporal Action Localization**|Chen Ju et.al.|[2303.11732v1](http://arxiv.org/abs/2303.11732v1)|null|\n", "2303.11625": "|**2023-03-21**|**Information-containing Adversarial Perturbation for Combating Facial Manipulation Systems**|Yao Zhu et.al.|[2303.11625v1](http://arxiv.org/abs/2303.11625v1)|null|\n", "2303.12501": "|**2023-03-22**|**Cross-Modal Implicit Relation Reasoning and Aligning for Text-to-Image Person Retrieval**|Ding Jiang et.al.|[2303.12501v1](http://arxiv.org/abs/2303.12501v1)|**[link](https://github.com/anosorae/irra)**|\n", "2303.12445": "|**2023-03-22**|**MEDIMP: Medical Images and Prompts for renal transplant representation learning**|Leo Milecki et.al.|[2303.12445v1](http://arxiv.org/abs/2303.12445v1)|**[link](https://github.com/leomlck/medimp)**|\n", "2303.12423": "|**2023-03-22**|**Text with Knowledge Graph Augmented Transformer for Video Captioning**|Xin Gu et.al.|[2303.12423v1](http://arxiv.org/abs/2303.12423v1)|null|\n", "2303.12419": "|**2023-03-22**|**BiCro: Noisy Correspondence Rectification for Multi-modality Data via Bi-directional Cross-modal Similarity Consistency**|Shuo Yang et.al.|[2303.12419v1](http://arxiv.org/abs/2303.12419v1)|**[link](https://github.com/xu5zhao/bicro)**|\n", "2303.12417": "|**2023-03-22**|**CLIP^2: Contrastive Language-Image-Point Pretraining from Real-World Point Cloud Data**|Yihan Zeng et.al.|[2303.12417v1](http://arxiv.org/abs/2303.12417v1)|null|\n", "2303.12379": "|**2023-03-22**|**VMCML: Video and Music Matching via Cross-Modality Lifting**|Yi-Shan Lee et.al.|[2303.12379v1](http://arxiv.org/abs/2303.12379v1)|null|\n", "2303.12112": "|**2023-03-21**|**Positive-Augmented Constrastive Learning for Image and Video Captioning Evaluation**|Sara Sarto et.al.|[2303.12112v1](http://arxiv.org/abs/2303.12112v1)|**[link](https://github.com/aimagelab/pacscore)**|\n", "2303.13471": "|**2023-03-23**|**Egocentric Audio-Visual Object Localization**|Chao Huang et.al.|[2303.13471v1](http://arxiv.org/abs/2303.13471v1)|**[link](https://github.com/wikichao/ego-av-loc)**|\n", "2303.13455": "|**2023-03-23**|**CoBIT: A Contrastive Bi-directional Image-Text Generation Model**|Haoxuan You et.al.|[2303.13455v1](http://arxiv.org/abs/2303.13455v1)|null|\n", "2303.13430": "|**2023-03-23**|**Medical diffusion on a budget: textual inversion for medical image generation**|Bram de Wilde et.al.|[2303.13430v1](http://arxiv.org/abs/2303.13430v1)|null|\n", "2303.13371": "|**2023-03-23**|**Plug-and-Play Regulators for Image-Text Matching**|Haiwen Diao et.al.|[2303.13371v1](http://arxiv.org/abs/2303.13371v1)|**[link](https://github.com/paranioar/rcar)**|\n", "2303.13233": "|**2023-03-23**|**Visually-Prompted Language Model for Fine-Grained Scene Graph Generation in an Open World**|Qifan Yu et.al.|[2303.13233v1](http://arxiv.org/abs/2303.13233v1)|**[link](https://github.com/Yuqifan1117/CaCao)**|\n", "2303.13095": "|**2023-03-23**|**Modeling Entities as Semantic Points for Visual Information Extraction in the Wild**|Zhibo Yang et.al.|[2303.13095v1](http://arxiv.org/abs/2303.13095v1)|null|\n", "2303.13041": "|**2023-03-23**|**gDoc: Automatic Generation of Structured API Documentation**|Shujun Wang et.al.|[2303.13041v1](http://arxiv.org/abs/2303.13041v1)|null|\n", "2303.13009": "|**2023-03-23**|**MELTR: Meta Loss Transformer for Learning to Fine-tune Video Foundation Models**|Dohwan Ko et.al.|[2303.13009v1](http://arxiv.org/abs/2303.13009v1)|**[link](https://github.com/mlvlab/MELTR)**|\n", "2303.12997": "|**2023-03-23**|**FER-former: Multi-modal Transformer for Facial Expression Recognition**|Yande Li et.al.|[2303.12997v1](http://arxiv.org/abs/2303.12997v1)|null|\n", "2303.12930": "|**2023-03-24**|**Dense-Localizing Audio-Visual Events in Untrimmed Videos: A Large-Scale Benchmark and Baseline**|Tiantian Geng et.al.|[2303.12930v2](http://arxiv.org/abs/2303.12930v2)|**[link](https://github.com/ttgeng233/UnAV)**|\n", "2303.14153": "|**2023-03-24**|**Local Contrastive Learning for Medical Image Recognition**|S. A. Rizvi et.al.|[2303.14153v1](http://arxiv.org/abs/2303.14153v1)|null|\n", "2303.14139": "|**2023-03-24**|**MindDiffuser: Controlled Image Reconstruction from Human Brain Activity with Semantic and Structural Diffusion**|Yizhuo Lu et.al.|[2303.14139v1](http://arxiv.org/abs/2303.14139v1)|null|\n", "2303.14081": "|**2023-03-24**|**CoLa-Diff: Conditional Latent Diffusion Model for Multi-Modal MRI Synthesis**|Lan Jiang et.al.|[2303.14081v1](http://arxiv.org/abs/2303.14081v1)|null|\n", "2303.13885": "|**2023-03-24**|**ARKitTrack: A New Diverse Dataset for Tracking Using Mobile RGB-D Data**|Haojie Zhao et.al.|[2303.13885v1](http://arxiv.org/abs/2303.13885v1)|**[link](https://github.com/lawrence-cj/ARKitTrack)**|\n", "2303.13839": "|**2023-03-24**|**HRDoc: Dataset and Baseline Method Toward Hierarchical Reconstruction of Document Structures**|Jiefeng Ma et.al.|[2303.13839v1](http://arxiv.org/abs/2303.13839v1)|**[link](https://github.com/jfma-ustc/hrdoc)**|\n", "2303.13810": "|**2023-03-24**|**Evidence-aware multi-modal data fusion and its application to total knee replacement prediction**|Xinwen Liu et.al.|[2303.13810v1](http://arxiv.org/abs/2303.13810v1)|null|\n", "2303.15444": "|**2023-03-27**|**Quantum Multi-Model Fitting**|Matteo Farina et.al.|[2303.15444v1](http://arxiv.org/abs/2303.15444v1)|**[link](https://github.com/farinamatteo/qmmf)**|\n", "2303.15230": "|**2023-03-27**|**Troika: Multi-Path Cross-Modal Traction for Compositional Zero-Shot Learning**|Siteng Huang et.al.|[2303.15230v1](http://arxiv.org/abs/2303.15230v1)|null|\n", "2303.15219": "|**2023-03-27**|**Knowing the Distance: Understanding the Gap Between Synthetic and Real Data For Face Parsing**|Eli Friedman et.al.|[2303.15219v1](http://arxiv.org/abs/2303.15219v1)|null|\n", "2303.15103": "|**2023-03-27**|**Contrastive Learning Is Spectral Clustering On Similarity Graph**|Zhiquan Tan et.al.|[2303.15103v1](http://arxiv.org/abs/2303.15103v1)|**[link](https://github.com/yifanzhang-pro/kernel-infonce)**|\n", "2303.15083": "|**2023-03-27**|**UniDistill: A Universal Cross-Modality Knowledge Distillation Framework for 3D Object Detection in Bird's-Eye View**|Shengchao Zhou et.al.|[2303.15083v1](http://arxiv.org/abs/2303.15083v1)|**[link](https://github.com/megvii-research/cvpr2023-unidistill)**|\n", "2303.15016": "|**2023-03-27**|**Borrowing Human Senses: Comment-Aware Self-Training for Social Media Multimodal Classification**|Chunpu Xu et.al.|[2303.15016v1](http://arxiv.org/abs/2303.15016v1)|**[link](https://github.com/cpaaax/multimodal_cast)**|\n", "2303.15006": "|**2023-03-27**|**Curriculum Learning for Compositional Visual Reasoning**|Wafa Aissa et.al.|[2303.15006v1](http://arxiv.org/abs/2303.15006v1)|null|\n", "2303.14998": "|**2023-03-27**|**Multi-view Cross-Modality MR Image Translation for Vestibular Schwannoma and Cochlea Segmentation**|Bogyeong Kang et.al.|[2303.14998v1](http://arxiv.org/abs/2303.14998v1)|null|\n", "2303.14880": "|**2023-03-27**|**Toward Human-Like Social Robot Navigation: A Large-Scale, Multi-Modal, Social Human Navigation Dataset**|Duc M. Nguyen et.al.|[2303.14880v1](http://arxiv.org/abs/2303.14880v1)|null|\n", "2303.14865": "|**2023-03-27**|**Revisiting Multimodal Representation in Contrastive Learning: From Patch and Token Embeddings to Finite Discrete Tokens**|Yuxiao Chen et.al.|[2303.14865v1](http://arxiv.org/abs/2303.14865v1)|**[link](https://github.com/yuxiaochen1103/fdt)**|\n", "2303.14840": "|**2023-03-26**|**On the Importance of Accurate Geometry Data for Dense 3D Vision Tasks**|HyunJun Jung et.al.|[2303.14840v1](http://arxiv.org/abs/2303.14840v1)|**[link](https://github.com/junggy/hammer-dataset)**|\n", "2303.14768": "|**2023-03-26**|**Collaborative Noisy Label Cleaner: Learning Scene-aware Trailers for Multi-modal Highlight Detection in Movies**|Bei Gan et.al.|[2303.14768v1](http://arxiv.org/abs/2303.14768v1)|**[link](https://github.com/tencentyouturesearch/highlightdetection-clc)**|\n", "2303.14730": "|**2023-03-26**|**Semantic Neural Decoding via Cross-Modal Generation**|Xuelin Qian et.al.|[2303.14730v1](http://arxiv.org/abs/2303.14730v1)|null|\n", "2303.14666": "|**2023-03-26**|**Generalization Matters: Loss Minima Flattening via Parameter Hybridization for Efficient Online Knowledge Distillation**|Tianli Zhang et.al.|[2303.14666v1](http://arxiv.org/abs/2303.14666v1)|null|\n", "2303.14626": "|**2023-03-26**|**MRCN: A Novel Modality Restitution and Compensation Network for Visible-Infrared Person Re-identification**|Yukang Zhang et.al.|[2303.14626v1](http://arxiv.org/abs/2303.14626v1)|null|\n", "2303.16199": "|**2023-03-28**|**LLaMA-Adapter: Efficient Fine-tuning of Language Models with Zero-init Attention**|Renrui Zhang et.al.|[2303.16199v1](http://arxiv.org/abs/2303.16199v1)|**[link](https://github.com/zrrskywalker/llama-adapter)**|\n", "2303.16099": "|**2023-03-28**|**Medical Image Analysis using Deep Relational Learning**|Zhihua Liu et.al.|[2303.16099v1](http://arxiv.org/abs/2303.16099v1)|null|\n", "2303.16058": "|**2023-03-28**|**Unmasked Teacher: Towards Training-Efficient Video Foundation Models**|Kunchang Li et.al.|[2303.16058v1](http://arxiv.org/abs/2303.16058v1)|**[link](https://github.com/opengvlab/unmasked_teacher)**|\n", "2303.15932": "|**2023-03-29**|**Unify, Align and Refine: Multi-Level Semantic Alignment for Radiology Report Generation**|Yaowei Li et.al.|[2303.15932v2](http://arxiv.org/abs/2303.15932v2)|null|\n", "2303.15826": "|**2023-03-28**|**MS-MT: Multi-Scale Mean Teacher with Contrastive Unpaired Translation for Cross-Modality Vestibular Schwannoma and Cochlea Segmentation**|Ziyuan Zhao et.al.|[2303.15826v1](http://arxiv.org/abs/2303.15826v1)|null|\n", "2303.15777": "|**2023-03-28**|**Imbalance Knowledge-Driven Multi-modal Network for Land-Cover Semantic Segmentation Using Images and LiDAR Point Clouds**|Yameng Wang et.al.|[2303.15777v1](http://arxiv.org/abs/2303.15777v1)|null|\n", "2303.15770": "|**2023-03-28**|**DDMM-Synth: A Denoising Diffusion Model for Cross-modal Medical Image Synthesis with Sparse-view Measurement Embedding**|Xiaoyue Li et.al.|[2303.15770v1](http://arxiv.org/abs/2303.15770v1)|null|\n", "2303.15710": "|**2023-03-28**|**Explicit Attention-Enhanced Fusion for RGB-Thermal Perception Tasks**|Mingjian Liang et.al.|[2303.15710v1](http://arxiv.org/abs/2303.15710v1)|**[link](https://github.com/freeformrobotics/eaefnet)**|\n", "2303.16818": "|**2023-03-30**|**BEVSimDet: Simulated Multi-modal Distillation in Bird's-Eye View for Multi-view 3D Object Detection**|Haimei Zhao et.al.|[2303.16818v2](http://arxiv.org/abs/2303.16818v2)|**[link](https://github.com/vitae-transformer/bevsimdet)**|\n", "2303.16604": "|**2023-03-29**|**Bi-directional Training for Composed Image Retrieval via Text Prompt Learning**|Zheyuan Liu et.al.|[2303.16604v1](http://arxiv.org/abs/2303.16604v1)|**[link](https://github.com/Cuberick-Orion/Bi-Blip4CIR)**|\n", "2303.16541": "|**2023-03-29**|**Sounding Video Generator: A Unified Framework for Text-guided Sounding Video Generation**|Jiawei Liu et.al.|[2303.16541v1](http://arxiv.org/abs/2303.16541v1)|**[link](https://github.com/jwliu-cc/svg)**|\n", "2303.16443": "|**2023-03-29**|**A tensor based varying-coefficient model for multi-modal neuroimaging data analysis**|Pratim Guha Niyogi et.al.|[2303.16443v1](http://arxiv.org/abs/2303.16443v1)|null|\n", "2303.17561": "|**2023-03-30**|**SoftCLIP: Softer Cross-modal Alignment Makes CLIP Stronger**|Yuting Gao et.al.|[2303.17561v1](http://arxiv.org/abs/2303.17561v1)|null|\n", "2303.17531": "|**2023-03-30**|**Asymmetric Face Recognition with Cross Model Compatible Ensembles**|Ori Linial et.al.|[2303.17531v1](http://arxiv.org/abs/2303.17531v1)|null|\n", "2303.17517": "|**2023-03-30**|**Hindi as a Second Language: Improving Visually Grounded Speech with Semantically Similar Samples**|Hyeonggon Ryu et.al.|[2303.17517v1](http://arxiv.org/abs/2303.17517v1)|null|\n", "2303.17490": "|**2023-03-30**|**Sound to Visual Scene Generation by Audio-to-Visual Latent Alignment**|Kim Sung-Bin et.al.|[2303.17490v1](http://arxiv.org/abs/2303.17490v1)|null|\n", "2303.17409": "|**2023-03-30**|**Steered Mixture of Experts Regression for Image Denoising with Multi-Model-Inference**|Aytac \u00d6zkan et.al.|[2303.17409v1](http://arxiv.org/abs/2303.17409v1)|null|\n", "2303.17386": "|**2023-03-30**|**Complementary Random Masking for RGB-Thermal Semantic Segmentation**|Ukcheol Shin et.al.|[2303.17386v1](http://arxiv.org/abs/2303.17386v1)|**[link](https://github.com/UkcheolShin/CRM_RGBTSeg)**|\n", "2303.17297": "|**2023-03-30**|**Understanding the Robustness of 3D Object Detection with Bird's-Eye-View Representations in Autonomous Driving**|Zijian Zhu et.al.|[2303.17297v1](http://arxiv.org/abs/2303.17297v1)|**[link](https://github.com/zzj403/BEV_Robust)**|\n", "2303.17285": "|**2023-03-30**|**Decomposed Cross-modal Distillation for RGB-based Temporal Action Detection**|Pilhyeon Lee et.al.|[2303.17285v1](http://arxiv.org/abs/2303.17285v1)|null|\n", "2303.17169": "|**2023-03-30**|**Task-Oriented Multi-Modal Mutual Leaning for Vision-Language Models**|Sifan Long et.al.|[2303.17169v1](http://arxiv.org/abs/2303.17169v1)|null|\n", "2303.17099": "|**2023-03-30**|**BEVFusion4D: Learning LiDAR-Camera Fusion Under Bird's-Eye-View via Cross-Modality Guidance and Temporal Aggregation**|Hongxiang Cai et.al.|[2303.17099v1](http://arxiv.org/abs/2303.17099v1)|null|\n", "2303.18248": "|**2023-03-31**|**Towards Flexible Multi-modal Document Models**|Naoto Inoue et.al.|[2303.18248v1](http://arxiv.org/abs/2303.18248v1)|**[link](https://github.com/CyberAgentAILab/flex-dm)**|\n", "2303.17981": "|**2023-03-31**|**Knowledge Distillation for Feature Extraction in Underwater VSLAM**|Jinghe Yang et.al.|[2303.17981v1](http://arxiv.org/abs/2303.17981v1)|**[link](https://github.com/jinghe-mel/ufen-slam)**|\n", "2303.17859": "|**2023-03-31**|**MapFormer: Boosting Change Detection by Using Pre-change Information**|Maximilian Bernhard et.al.|[2303.17859v1](http://arxiv.org/abs/2303.17859v1)|**[link](https://github.com/mxbh/mapformer)**|\n", "2303.17811": "|**2023-04-03**|**Zero-shot Referring Image Segmentation with Global-Local Context Features**|Seonghoon Yu et.al.|[2303.17811v2](http://arxiv.org/abs/2303.17811v2)|**[link](https://github.com/seonghoon-yu/zero-shot-ris)**|\n", "2304.00932": "|**2023-04-03**|**HypLiLoc: Towards Effective LiDAR Pose Regression with Hyperbolic Fusion**|Sijie Wang et.al.|[2304.00932v1](http://arxiv.org/abs/2304.00932v1)|**[link](https://github.com/sijieaaa/hypliloc)**|\n", "2304.00827": "|**2023-04-03**|**Multi-modal Fake News Detection on Social Media via Multi-grained Information Fusion**|Yangming Zhou et.al.|[2304.00827v1](http://arxiv.org/abs/2304.00827v1)|null|\n", "2304.00788": "|**2023-04-03**|**Open-Vocabulary Point-Cloud Object Detection without 3D Annotation**|Yuheng Lu et.al.|[2304.00788v1](http://arxiv.org/abs/2304.00788v1)|**[link](https://github.com/lyhdet/ov-3det)**|\n", "2304.00719": "|**2023-04-03**|**Multi-Modal Representation Learning with Text-Driven Soft Masks**|Jaeyoo Park et.al.|[2304.00719v1](http://arxiv.org/abs/2304.00719v1)|null|\n", "2304.00670": "|**2023-04-03**|**CRN: Camera Radar Net for Accurate, Robust, Efficient 3D Perception**|Youngseok Kim et.al.|[2304.00670v1](http://arxiv.org/abs/2304.00670v1)|null|\n", "2304.00495": "|**2023-04-02**|**Multimodal Hyperspectral Image Classification via Interconnected Fusion**|Lu Huo et.al.|[2304.00495v1](http://arxiv.org/abs/2304.00495v1)|null|\n", "2304.00450": "|**2023-04-02**|**Sketch-based Video Object Localization**|Sangmin Woo et.al.|[2304.00450v1](http://arxiv.org/abs/2304.00450v1)|null|\n", "2304.00379": "|**2023-04-01**|**Improved Multimodal Fusion for Small Datasets with Auxiliary Supervision**|Gregory Holste et.al.|[2304.00379v1](http://arxiv.org/abs/2304.00379v1)|null|\n", "2304.00157": "|**2023-03-31**|**Robotic Perception of Transparent Objects: A Review**|Jiaqi Jiang et.al.|[2304.00157v1](http://arxiv.org/abs/2304.00157v1)|null|\n", "2304.01961": "|**2023-04-04**|**AToMiC: An Image/Text Retrieval Test Collection to Support Multimedia Content Creation**|Jheng-Hong Yang et.al.|[2304.01961v1](http://arxiv.org/abs/2304.01961v1)|**[link](https://github.com/trec-atomic/atomic)**|\n", "2304.01799": "|**2023-04-04**|**naplib-python: Neural Acoustic Data Processing and Analysis Tools in Python**|Gavin Mischler et.al.|[2304.01799v1](http://arxiv.org/abs/2304.01799v1)|**[link](https://github.com/naplab/naplib-python)**|\n", "2304.01705": "|**2023-04-04**|**Cross-modal tumor segmentation using generative blending augmentation and self training**|Guillaume Sall\u00e9 et.al.|[2304.01705v1](http://arxiv.org/abs/2304.01705v1)|null|\n", "2304.01603": "|**2023-04-04**|**Locate Then Generate: Bridging Vision and Language with Bounding Box for Scene-Text VQA**|Yongxin Zhu et.al.|[2304.01603v1](http://arxiv.org/abs/2304.01603v1)|null|\n", "2304.01601": "|**2023-04-04**|**Primitive Simultaneous Optimization of Similarity Metrics for Image Registration**|Diana Waldmannstetter et.al.|[2304.01601v1](http://arxiv.org/abs/2304.01601v1)|null|\n", "2304.01563": "|**2023-04-04**|**Attribute-Consistent Knowledge Graph Representation Learning for Multi-Modal Entity Alignment**|Qian Li et.al.|[2304.01563v1](http://arxiv.org/abs/2304.01563v1)|null|\n", "2304.01491": "|**2023-04-04**|**Multi model LSTM architecture for Track Association based on Automatic Identification System Data**|Md Asif Bin Syed et.al.|[2304.01491v1](http://arxiv.org/abs/2304.01491v1)|null|\n", "2304.01440": "|**2023-04-04**|**A Deep Multi-Modal Cyber-Attack Detection in Industrial Control Systems**|Sepideh Bahadoripour et.al.|[2304.01440v1](http://arxiv.org/abs/2304.01440v1)|null|\n", "2304.01430": "|**2023-04-04**|**Divided Attention: Unsupervised Multi-Object Discovery with Contextually Separated Slots**|Dong Lao et.al.|[2304.01430v1](http://arxiv.org/abs/2304.01430v1)|null|\n", "2304.01233": "|**2023-04-03**|**Multi-Modal Perceiver Language Model for Outcome Prediction in Emergency Department**|Sabri Boughorbel et.al.|[2304.01233v1](http://arxiv.org/abs/2304.01233v1)|null|\n", "2304.02556": "|**2023-04-05**|**Detecting and Grounding Multi-Modal Media Manipulation**|Rui Shao et.al.|[2304.02556v1](http://arxiv.org/abs/2304.02556v1)|**[link](https://github.com/rshaojimmy/multimodal-deepfake)**|\n", "2304.02532": "|**2023-04-05**|**Goal-Conditioned Imitation Learning using Score-based Diffusion Policies**|Moritz Reuss et.al.|[2304.02532v1](http://arxiv.org/abs/2304.02532v1)|null|\n", "2304.02419": "|**2023-04-05**|**TM2D: Bimodality Driven 3D Dance Generation via Music-Text Integration**|Kehong Gong et.al.|[2304.02419v1](http://arxiv.org/abs/2304.02419v1)|**[link](https://github.com/Garfield-kh/TM2D)**|\n", "2304.02407": "|**2023-04-05**|**Explaining Multimodal Data Fusion: Occlusion Analysis for Wilderness Mapping**|Burak Ekim et.al.|[2304.02407v1](http://arxiv.org/abs/2304.02407v1)|null|\n", "2304.02328": "|**2023-04-05**|**Enhancing Multimodal Entity and Relation Extraction with Variational Information Bottleneck**|Shiyao Cui et.al.|[2304.02328v1](http://arxiv.org/abs/2304.02328v1)|null|\n", "2304.02278": "|**2023-04-05**|**Calibrating Cross-modal Feature for Text-Based Person Searching**|Donglai Wei et.al.|[2304.02278v1](http://arxiv.org/abs/2304.02278v1)|null|\n", "2304.03047": "|**2023-04-07**|**ETPNav: Evolving Topological Planning for Vision-Language Navigation in Continuous Environments**|Dong An et.al.|[2304.03047v2](http://arxiv.org/abs/2304.03047v2)|**[link](https://github.com/marsaki/etpnav)**|\n", "2304.02991": "|**2023-04-06**|**Exploiting the Complementarity of 2D and 3D Networks to Address Domain-Shift in 3D Semantic Segmentation**|Adriano Cardace et.al.|[2304.02991v1](http://arxiv.org/abs/2304.02991v1)|**[link](https://github.com/cvlab-unibo/mm2d3d)**|\n", "2304.02948": "|**2023-04-06**|**FengWu: Pushing the Skillful Global Medium-range Weather Forecast beyond 10 Days Lead**|Kang Chen et.al.|[2304.02948v1](http://arxiv.org/abs/2304.02948v1)|null|\n", "2304.02916": "|**2023-04-06**|**Efficient Audio Captioning Transformer with Patchout and Text Guidance**|Thodoris Kouzelis et.al.|[2304.02916v1](http://arxiv.org/abs/2304.02916v1)|null|\n", "2304.02902": "|**2023-04-06**|**Towards Efficient MCMC Sampling in Bayesian Neural Networks by Exploiting Symmetry**|Jonas Gregor Wiese et.al.|[2304.02902v1](http://arxiv.org/abs/2304.02902v1)|null|\n", "2304.02853": "|**2023-04-06**|**Learning Instance-Level Representation for Large-Scale Multi-Modal Pretraining in E-commerce**|Yang Jin et.al.|[2304.02853v1](http://arxiv.org/abs/2304.02853v1)|null|\n", "2304.03669": "|**2023-04-07**|**DATE: Domain Adaptive Product Seeker for E-commerce**|Haoyuan Li et.al.|[2304.03669v1](http://arxiv.org/abs/2304.03669v1)|null|\n", "2304.03542": "|**2023-04-07**|**Better \"CMOS\" Produces Clearer Images: Learning Space-Variant Blur Estimation for Blind Image Super-Resolution**|Xuhai Chen et.al.|[2304.03542v1](http://arxiv.org/abs/2304.03542v1)|null|\n", "2304.03391": "|**2023-04-06**|**Exposing and Mitigating Spurious Correlations for Cross-Modal Retrieval**|Jae Myung Kim et.al.|[2304.03391v1](http://arxiv.org/abs/2304.03391v1)|null|\n", "2304.04523": "|**2023-04-10**|**PoseFusion: Robust Object-in-Hand Pose Estimation with SelectLSTM**|Yuyang Tu et.al.|[2304.04523v1](http://arxiv.org/abs/2304.04523v1)|null|\n", "2304.04302": "|**2023-04-09**|**Bionic Collapsible Wings in Aquatic-aerial Robot**|Xiao Xiong et.al.|[2304.04302v1](http://arxiv.org/abs/2304.04302v1)|null|\n", "2304.04298": "|**2023-04-09**|**Unsupervised Sampling Promoting for Stochastic Human Trajectory Prediction**|Guangyi Chen et.al.|[2304.04298v1](http://arxiv.org/abs/2304.04298v1)|**[link](https://github.com/viewsetting/unsupervised_sampling_promoting)**|\n", "2304.04290": "|**2023-04-09**|**Distributed Conditional GAN (discGAN) For Synthetic Healthcare Data Generation**|David Fuentes et.al.|[2304.04290v1](http://arxiv.org/abs/2304.04290v1)|null|\n", "2304.04231": "|**2023-04-09**|**CrowdCLIP: Unsupervised Crowd Counting via Vision-Language Model**|Dingkang Liang et.al.|[2304.04231v1](http://arxiv.org/abs/2304.04231v1)|**[link](https://github.com/dk-liang/crowdclip)**|\n", "2304.04187": "|**2023-04-09**|**Similarity-Aware Multimodal Prompt Learning for Fake News Detection**|Ye Jiang et.al.|[2304.04187v1](http://arxiv.org/abs/2304.04187v1)|null|\n", "2304.04113": "|**2023-04-08**|**An Automated Fully-Computational Framework to Construct Printability Maps for Additively Manufactured Metal Alloys**|Sofia Sheikh et.al.|[2304.04113v1](http://arxiv.org/abs/2304.04113v1)|null|\n", "2304.04062": "|**2023-04-08**|**Predicting multiple sclerosis disease severity with multimodal deep neural networks**|Kai Zhang et.al.|[2304.04062v1](http://arxiv.org/abs/2304.04062v1)|**[link](https://github.com/anotherkaizhang/ms)**|\n", "2304.03916": "|**2023-04-08**|**Mitigating Spurious Correlations in Multi-modal Models during Fine-tuning**|Yu Yang et.al.|[2304.03916v1](http://arxiv.org/abs/2304.03916v1)|null|\n", "2304.03910": "|**2023-04-08**|**Co-attention Propagation Network for Zero-Shot Video Object Segmentation**|Gensheng Pei et.al.|[2304.03910v1](http://arxiv.org/abs/2304.03910v1)|**[link](https://github.com/nust-machine-intelligence-laboratory/hcpn)**|\n", "2304.03897": "|**2023-04-08**|**Factify 2: A Multimodal Fake News and Satire News Dataset**|S Suryavardan et.al.|[2304.03897v1](http://arxiv.org/abs/2304.03897v1)|**[link](https://github.com/surya1701/factify-2.0)**|\n", "2304.05340": "|**2023-04-11**|**Unified Multi-Modal Image Synthesis for Missing Modality Imputation**|Yue Zhang et.al.|[2304.05340v1](http://arxiv.org/abs/2304.05340v1)|null|\n", "2304.05171": "|**2023-04-11**|**Curriculum-Based Imitation of Versatile Skills**|Maximilian Xiling Li et.al.|[2304.05171v1](http://arxiv.org/abs/2304.05171v1)|**[link](https://github.com/intuitive-robots/ml-cur)**|\n", "2304.05166": "|**2023-04-11**|**TrajFlow: Learning the Distribution over Trajectories**|Anna M\u00e9sz\u00e1ros et.al.|[2304.05166v1](http://arxiv.org/abs/2304.05166v1)|null|\n", "2304.05080": "|**2023-04-11**|**Investigating Imbalances Between SAR and Optical Utilization for Multi-Modal Urban Mapping**|Sebastian Hafner et.al.|[2304.05080v1](http://arxiv.org/abs/2304.05080v1)|null|\n", "2304.05051": "|**2023-04-11**|**FashionSAP: Symbols and Attributes Prompt for Fine-grained Fashion Vision-Language Pre-training**|Yunpeng Han et.al.|[2304.05051v1](http://arxiv.org/abs/2304.05051v1)|**[link](https://github.com/hssip/fashionsap)**|\n", "2304.05979": "|**2023-04-12**|**NaviSTAR: Socially Aware Robot Navigation with Hybrid Spatio-Temporal Graph Transformer and Preference Learning**|Weizheng Wang et.al.|[2304.05979v1](http://arxiv.org/abs/2304.05979v1)|null|\n", "2304.05754": "|**2023-04-12**|**Self-Supervised Learning with Cluster-Aware-DINO for High-Performance Robust Speaker Verification**|Bing Han et.al.|[2304.05754v1](http://arxiv.org/abs/2304.05754v1)|null|\n", "2304.05720": "|**2023-04-12**|**Towards a more comprehensive open-source model for interdisciplinary smart integrated energy systems**|B\u00e9la Wiegel et.al.|[2304.05720v1](http://arxiv.org/abs/2304.05720v1)|null|\n", "2304.05646": "|**2023-04-12**|**Modality-Invariant Representation for Infrared and Visible Image Registration**|Zhiying Jiang et.al.|[2304.05646v1](http://arxiv.org/abs/2304.05646v1)|null|\n", "2304.05645": "|**2023-04-12**|**WildRefer: 3D Object Localization in Large-scale Dynamic Scenes with Multi-modal Visual Data and Natural Language**|Zhenxiang Lin et.al.|[2304.05645v1](http://arxiv.org/abs/2304.05645v1)|null|\n", "2304.05600": "|**2023-04-12**|**Looking Similar, Sounding Different: Leveraging Counterfactual Cross-Modal Pairs for Audiovisual Representation Learning**|Nikhil Singh et.al.|[2304.05600v1](http://arxiv.org/abs/2304.05600v1)|null|\n", "2304.05523": "|**2023-04-11**|**MoMo: A shared encoder Model for text, image and multi-Modal representations**|Rakesh Chada et.al.|[2304.05523v1](http://arxiv.org/abs/2304.05523v1)|null|\n", "2304.05402": "|**2023-04-11**|**Boosting Cross-task Transferability of Adversarial Patches with Visual Relations**|Tony Ma et.al.|[2304.05402v1](http://arxiv.org/abs/2304.05402v1)|null|\n", "2304.06708": "|**2023-04-13**|**Verbs in Action: Improving verb understanding in video-language models**|Liliane Momeni et.al.|[2304.06708v1](http://arxiv.org/abs/2304.06708v1)|null|\n", "2304.06306": "|**2023-04-13**|**Efficient Multimodal Fusion via Interactive Prompting**|Yaowei Li et.al.|[2304.06306v1](http://arxiv.org/abs/2304.06306v1)|null|\n", "2304.06275": "|**2023-04-13**|**Noisy Correspondence Learning with Meta Similarity Correction**|Haochen Han et.al.|[2304.06275v1](http://arxiv.org/abs/2304.06275v1)|**[link](https://github.com/hhc1997/mscn)**|\n", "2304.06264": "|**2023-04-13**|**Loosely Coupled Odometry, UWB Ranging, and Cooperative Spatial Detection for Relative Monte-Carlo Multi-Robot Localization**|Xianjia Yu et.al.|[2304.06264v1](http://arxiv.org/abs/2304.06264v1)|**[link](https://github.com/tiers/uwb-cooperative-mrs-localization)**|\n", "2304.06051": "|**2023-04-12**|**Open-TransMind: A New Baseline and Benchmark for 1st Foundation Model Challenge of Intelligent Transportation**|Yifeng Shi et.al.|[2304.06051v1](http://arxiv.org/abs/2304.06051v1)|**[link](https://github.com/Traffic-X/Open-TransMind)**|\n", "2304.07199": "|**2023-04-14**|**CROVIA: Seeing Drone Scenes from Car Perspective via Cross-View Adaptation**|Thanh-Dat Truong et.al.|[2304.07199v1](http://arxiv.org/abs/2304.07199v1)|null|\n", "2304.07151": "|**2023-04-14**|**End-to-End Learning with Multiple Modalities for System-Optimised Renewables Nowcasting**|Rushil Vohra et.al.|[2304.07151v1](http://arxiv.org/abs/2304.07151v1)|null|\n", "2304.07147": "|**2023-04-14**|**Cross Attention Transformers for Multi-modal Unsupervised Whole-Body PET Anomaly Detection**|Ashay Patel et.al.|[2304.07147v1](http://arxiv.org/abs/2304.07147v1)|null|\n", "2304.06991": "|**2023-04-14**|**WYTIWYR: A User Intent-Aware Framework with Multi-modal Inputs for Visualization Retrieval**|Shishi Xiao et.al.|[2304.06991v1](http://arxiv.org/abs/2304.06991v1)|**[link](https://github.com/serendipitysx/wytiwyr)**|\n", "2304.06910": "|**2023-04-14**|**HCAM -- Hierarchical Cross Attention Model for Multi-modal Emotion Recognition**|Soumya Dutta et.al.|[2304.06910v1](http://arxiv.org/abs/2304.06910v1)|null|\n", "2304.06786": "|**2023-04-13**|**The future of hearing aid technology**|Volker Hohmann et.al.|[2304.06786v1](http://arxiv.org/abs/2304.06786v1)|null|\n", "2304.08345": "|**2023-04-17**|**VALOR: Vision-Audio-Language Omni-Perception Pretraining Model and Dataset**|Sihan Chen et.al.|[2304.08345v1](http://arxiv.org/abs/2304.08345v1)|**[link](https://github.com/TXH-mercury/VALOR)**|\n", "2304.08304": "|**2023-04-17**|**SDVRF: Sparse-to-Dense Voxel Region Fusion for Multi-modal 3D Object Detection**|Binglu Ren et.al.|[2304.08304v1](http://arxiv.org/abs/2304.08304v1)|null|\n", "2304.08083": "|**2023-04-17**|**Causality-aware Visual Scene Discovery for Cross-Modal Question Reasoning**|Yang Liu et.al.|[2304.08083v1](http://arxiv.org/abs/2304.08083v1)|null|\n", "2304.08072": "|**2023-04-17**|**Two-stage MR Image Segmentation Method for Brain Tumors based on Attention Mechanism**|Li Zhu et.al.|[2304.08072v1](http://arxiv.org/abs/2304.08072v1)|null|\n", "2304.08058": "|**2023-04-17**|**One-Class SVM on siamese neural network latent space for Unsupervised Anomaly Detection on brain MRI White Matter Hyperintensities**|Nicolas Pinon et.al.|[2304.08058v1](http://arxiv.org/abs/2304.08058v1)|null|\n", "2304.08054": "|**2023-04-17**|**Fed-MIWAE: Federated Imputation of Incomplete Data via Deep Generative Models**|Irene Balelli et.al.|[2304.08054v1](http://arxiv.org/abs/2304.08054v1)|null|\n", "2304.07775": "|**2023-04-16**|**Robust Cross-Modal Knowledge Distillation for Unconstrained Videos**|Wenke Xia et.al.|[2304.07775v1](http://arxiv.org/abs/2304.07775v1)|**[link](https://github.com/gewu-lab/cross-modal-distillation)**|\n", "2304.07728": "|**2023-04-16**|**TransFusionOdom: Interpretable Transformer-based LiDAR-Inertial Fusion Odometry Estimation**|Leyuan Sun et.al.|[2304.07728v1](http://arxiv.org/abs/2304.07728v1)|**[link](https://github.com/rakugenson/multi-modal-dataset-for-odometry-estimation)**|\n", "2304.07633": "|**2023-04-15**|**Detecting Out-of-Context Multimodal Misinformation with interpretable neural-symbolic model**|Yizhou Zhang et.al.|[2304.07633v1](http://arxiv.org/abs/2304.07633v1)|null|\n", "2304.07567": "|**2023-04-15**|**CoVLR: Coordinating Cross-Modal Consistency and Intra-Modal Structure for Vision-Language Retrieval**|Yang Yang et.al.|[2304.07567v1](http://arxiv.org/abs/2304.07567v1)|null|\n", "2304.07549": "|**2023-04-15**|**MA-ViT: Modality-Agnostic Vision Transformers for Face Anti-Spoofing**|Ajian Liu et.al.|[2304.07549v1](http://arxiv.org/abs/2304.07549v1)|null|\n", "2304.07387": "|**2023-04-14**|**Cross-domain Food Image-to-Recipe Retrieval by Weighted Adversarial Learning**|Bin Zhu et.al.|[2304.07387v1](http://arxiv.org/abs/2304.07387v1)|null|\n", "2304.09172": "|**2023-04-18**|**Hyperbolic Image-Text Representations**|Karan Desai et.al.|[2304.09172v1](http://arxiv.org/abs/2304.09172v1)|null|\n", "2304.09164": "|**2023-04-18**|**Structure Preserving Cycle-GAN for Unsupervised Medical Image Domain Adaptation**|Paolo Iacono et.al.|[2304.09164v1](http://arxiv.org/abs/2304.09164v1)|null|\n", "2304.08965": "|**2023-04-18**|**Unsupervised Semantic Segmentation of 3D Point Clouds via Cross-modal Distillation and Super-Voxel Clustering**|Zisheng Chen et.al.|[2304.08965v1](http://arxiv.org/abs/2304.08965v1)|**[link](https://github.com/scut-bip-lab/pointdc)**|\n", "2304.08881": "|**2023-04-18**|**Segmentation of glioblastomas in early post-operative multi-modal MRI with deep neural networks**|Ragnhild Holden Helland et.al.|[2304.08881v1](http://arxiv.org/abs/2304.08881v1)|**[link](https://github.com/dbouget/validation_metrics_computation)**|\n", "2304.08709": "|**2023-04-18**|**You Only Need Two Detectors to Achieve Multi-Modal 3D Multi-Object Tracking**|Xiyang Wang et.al.|[2304.08709v1](http://arxiv.org/abs/2304.08709v1)|**[link](https://github.com/wangxiyang2022/YONTD-MOT)**|\n", "2304.08660": "|**2023-04-17**|**(LC)$^2$: LiDAR-Camera Loop Constraints For Cross-Modal Place Recognition**|Alex Junho Lee et.al.|[2304.08660v1](http://arxiv.org/abs/2304.08660v1)|null|\n", "2304.08658": "|**2023-04-20**|**In-situ surface porosity prediction in DED (directed energy deposition) printed SS316L parts using multimodal sensor fusion**|Adithyaa Karthikeyan et.al.|[2304.08658v2](http://arxiv.org/abs/2304.08658v2)|null|\n", "2304.09801": "|**2023-04-19**|**MetaBEV: Solving Sensor Failures for BEV Detection and Map Segmentation**|Chongjian Ge et.al.|[2304.09801v1](http://arxiv.org/abs/2304.09801v1)|**[link](https://github.com/ChongjianGE/MetaBEV)**|\n", "2304.09694": "|**2023-04-19**|**CrossFusion: Interleaving Cross-modal Complementation for Noise-resistant 3D Object Detection**|Yang Yang et.al.|[2304.09694v1](http://arxiv.org/abs/2304.09694v1)|null|\n", "2304.09609": "|**2023-04-19**|**MMDR: A Result Feature Fusion Object Detection Approach for Autonomous System**|Wendong Zhang et.al.|[2304.09609v1](http://arxiv.org/abs/2304.09609v1)|null|\n", "2304.09498": "|**2023-04-19**|**Learning Robust Visual-Semantic Embedding for Generalizable Person Re-identification**|Suncheng Xiang et.al.|[2304.09498v1](http://arxiv.org/abs/2304.09498v1)|**[link](https://github.com/jeremyxsc/mmet)**|\n", "2304.09448": "|**2023-04-19**|**EC^2: Emergent Communication for Embodied Control**|Yao Mu et.al.|[2304.09448v1](http://arxiv.org/abs/2304.09448v1)|null|\n", "2304.09421": "|**2023-04-19**|**TieFake: Title-Text Similarity and Emotion-Aware Fake News Detection**|Quanjiang Guo et.al.|[2304.09421v1](http://arxiv.org/abs/2304.09421v1)|**[link](https://github.com/uestc-gqj/tiefake)**|\n", "2304.09370": "|**2023-04-19**|**Integrating Reconfigurable Foot Design, Multi-modal Contact Sensing, and Terrain Classification for Bipedal Locomotion**|Ted Tyler et.al.|[2304.09370v1](http://arxiv.org/abs/2304.09370v1)|null|\n", "2304.09322": "|**2023-04-18**|**Multi-Modality Multi-Scale Cardiovascular Disease Subtypes Classification Using Raman Image and Medical History**|Bo Yu et.al.|[2304.09322v1](http://arxiv.org/abs/2304.09322v1)|null|\n", "2304.10530": "|**2023-04-20**|**Collaborative Diffusion for Multi-Modal Face Generation and Editing**|Ziqi Huang et.al.|[2304.10530v1](http://arxiv.org/abs/2304.10530v1)|**[link](https://github.com/ziqihuangg/collaborative-diffusion)**|\n", "2304.10309": "|**2023-04-20**|**Improving Speech Translation by Cross-Modal Multi-Grained Contrastive Learning**|Hao Zhang et.al.|[2304.10309v1](http://arxiv.org/abs/2304.10309v1)|null|\n", "2304.10254": "|**2023-04-20**|**Image-text Retrieval via preserving main Semantics of Vision**|Xu Zhang et.al.|[2304.10254v1](http://arxiv.org/abs/2304.10254v1)|**[link](https://github.com/zhangxu0963/vsl)**|\n", "2304.10091": "|**2023-04-20**|**Learning CLIP Guided Visual-Text Fusion Transformer for Video-based Pedestrian Attribute Recognition**|Jun Zhu et.al.|[2304.10091v1](http://arxiv.org/abs/2304.10091v1)|**[link](https://github.com/event-ahu/vtf_par)**|\n", "2304.09941": "|**2023-04-19**|**A robust and interpretable deep learning framework for multi-modal registration via keypoints**|Alan Q. Wang et.al.|[2304.09941v1](http://arxiv.org/abs/2304.09941v1)|**[link](https://github.com/evanmy/keymorph)**|\n", "2304.09921": "|**2023-04-19**|**Regularization for distributionally robust state estimation and prediction**|Jean-S\u00e9bastien Brouillon et.al.|[2304.09921v1](http://arxiv.org/abs/2304.09921v1)|null|\n", "2304.10382": "|**2023-04-21**|**Conditional Generative Models for Learning Stochastic Processes**|Salvatore Certo et.al.|[2304.10382v2](http://arxiv.org/abs/2304.10382v2)|null|\n", "2304.11098": "|**2023-04-21**|**Generative AI-enabled Vehicular Networks: Fundamentals, Framework, and Case Study**|Ruichen Zhang et.al.|[2304.11098v1](http://arxiv.org/abs/2304.11098v1)|null|\n", "2304.11029": "|**2023-04-24**|**CLaMP: Contrastive Language-Music Pre-training for Cross-Modal Symbolic Music Information Retrieval**|Shangda Wu et.al.|[2304.11029v2](http://arxiv.org/abs/2304.11029v2)|**[link](https://github.com/microsoft/muzic/tree/main/clamp)**|\n", "2304.10893": "|**2023-04-21**|**FindVehicle and VehicleFinder: A NER dataset for natural language-based vehicle retrieval and a keyword-based cross-modal vehicle retrieval system**|Runwei Guan et.al.|[2304.10893v1](http://arxiv.org/abs/2304.10893v1)|**[link](https://github.com/guanrunwei/vehiclefinder-ctim)**|\n", "2304.10824": "|**2023-04-21**|**Rethinking Benchmarks for Cross-modal Image-text Retrieval**|Weijing Chen et.al.|[2304.10824v1](http://arxiv.org/abs/2304.10824v1)|**[link](https://github.com/cwj1412/mscoco-flikcr30k_fg)**|\n", "2304.10759": "|**2023-04-21**|**GeoLayoutLM: Geometric Pre-training for Visual Information Extraction**|Chuwei Luo et.al.|[2304.10759v1](http://arxiv.org/abs/2304.10759v1)|**[link](https://github.com/alibabaresearch/advancedliteratemachinery)**|\n", "2304.10756": "|**2023-04-21**|**Missing Modality Robustness in Semi-Supervised Multi-Modal Semantic Segmentation**|Harsh Maheshwari et.al.|[2304.10756v1](http://arxiv.org/abs/2304.10756v1)|**[link](https://github.com/harshm121/m3l)**|\n", "2304.10740": "|**2023-04-21**|**Multi-Modal Deep Learning for Credit Rating Prediction Using Text and Numerical Data Streams**|Mahsa Tavakoli et.al.|[2304.10740v1](http://arxiv.org/abs/2304.10740v1)|**[link](https://github.com/banking-analytics-lab/multimodalfusionratings)**|\n", "2304.10727": "|**2023-04-21**|**RoCOCO: Robust Benchmark MS-COCO to Stress-test Robustness of Image-Text Matching Models**|Seulki Park et.al.|[2304.10727v1](http://arxiv.org/abs/2304.10727v1)|**[link](https://github.com/pseulki/rococo)**|\n", "2304.10658": "|**2023-04-20**|**Linear to multi-linear algebra and systems using tensors**|Divyanshu Pandey et.al.|[2304.10658v1](http://arxiv.org/abs/2304.10658v1)|null|\n", "2304.10628": "|**2023-04-20**|**HM-ViT: Hetero-modal Vehicle-to-Vehicle Cooperative perception with vision transformer**|Hao Xiang et.al.|[2304.10628v1](http://arxiv.org/abs/2304.10628v1)|null|\n", "2304.10592": "|**2023-04-20**|**MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models**|Deyao Zhu et.al.|[2304.10592v1](http://arxiv.org/abs/2304.10592v1)|**[link](https://github.com/vision-cair/minigpt-4)**|\n", "2304.12269": "|**2023-04-24**|**Enriching Source Code with Contextual Data for Code Completion Models: An Empirical Study**|Tim van Dam et.al.|[2304.12269v1](http://arxiv.org/abs/2304.12269v1)|**[link](https://github.com/aise-tudelft/contextualdatacodecompletion)**|\n", "2304.12259": "|**2023-04-24**|**Imaging 3D Chemistry at 1 nm Resolution with Fused Multi-Modal Electron Tomography**|Jonathan Schwartz et.al.|[2304.12259v1](http://arxiv.org/abs/2304.12259v1)|**[link](https://github.com/jtschwar/projection_refinement)**|\n", "2304.11993": "|**2023-04-25**|**MMC: Multi-Modal Colorization of Images using Textual Descriptions**|Subhankar Ghosh et.al.|[2304.11993v2](http://arxiv.org/abs/2304.11993v2)|null|\n", "2304.11875": "|**2023-04-24**|**Underwater object classification combining SAS and transferred optical-to-SAS Imagery**|Avi Abu et.al.|[2304.11875v1](http://arxiv.org/abs/2304.11875v1)|null|\n", "2304.11829": "|**2023-04-25**|**Hierarchical Diffusion Autoencoders and Disentangled Image Manipulation**|Zeyu Lu et.al.|[2304.11829v2](http://arxiv.org/abs/2304.11829v2)|null|\n", "2304.11764": "|**2023-04-23**|**Learning-enabled multi-modal motion prediction in urban environments**|Vinicius Trentin et.al.|[2304.11764v1](http://arxiv.org/abs/2304.11764v1)|null|\n", "2304.11697": "|**2023-04-23**|**Informative Data Selection with Uncertainty for Multi-modal Object Detection**|Xinyu Zhang et.al.|[2304.11697v1](http://arxiv.org/abs/2304.11697v1)|null|\n", "2304.11618": "|**2023-04-23**|**Modality-Aware Negative Sampling for Multi-modal Knowledge Graph Embedding**|Yichi Zhang et.al.|[2304.11618v1](http://arxiv.org/abs/2304.11618v1)|**[link](https://github.com/zjukg/mans)**|\n", "2304.11603": "|**2023-04-23**|**LaMD: Latent Motion Diffusion for Video Generation**|Yaosi Hu et.al.|[2304.11603v1](http://arxiv.org/abs/2304.11603v1)|null|\n", "2304.11193": "|**2023-04-21**|**Combining Vision and Tactile Sensation for Video Prediction**|Willow Mandil et.al.|[2304.11193v1](http://arxiv.org/abs/2304.11193v1)|null|\n", "2304.12995": "|**2023-04-25**|**AudioGPT: Understanding and Generating Speech, Music, Sound, and Talking Head**|Rongjie Huang et.al.|[2304.12995v1](http://arxiv.org/abs/2304.12995v1)|**[link](https://github.com/aigc-audio/audiogpt)**|\n", "2304.12725": "|**2023-04-25**|**Quantitative analysis of collagen remodeling in pancreatic lesions using computationally translated collagen images derived from brightfield microscopy images**|Varun Nair et.al.|[2304.12725v1](http://arxiv.org/abs/2304.12725v1)|null|\n", "2304.12570": "|**2023-04-25**|**Learnable Pillar-based Re-ranking for Image-Text Retrieval**|Leigang Qu et.al.|[2304.12570v1](http://arxiv.org/abs/2304.12570v1)|**[link](https://github.com/lgqu/leaprr)**|\n", "2304.12412": "|**2023-04-24**|**End-to-End Lidar-Camera Self-Calibration for Autonomous Vehicles**|Arya Rachman et.al.|[2304.12412v1](http://arxiv.org/abs/2304.12412v1)|null|\n", "2304.13649": "|**2023-04-26**|**A Symmetric Dual Encoding Dense Retrieval Framework for Knowledge-Intensive Visual Question Answering**|Alireza Salemi et.al.|[2304.13649v1](http://arxiv.org/abs/2304.13649v1)|**[link](https://github.com/alirezasalemi7/dedr-mm-fid)**|\n", "2304.13583": "|**2023-04-26**|**Multi-Modality Deep Network for Extreme Learned Image Compression**|Xuhao Jiang et.al.|[2304.13583v1](http://arxiv.org/abs/2304.13583v1)|null|\n", "2304.13559": "|**2023-04-28**|**Towards Multi-Modal DBMSs for Seamless Querying of Texts and Tables**|Matthias Urban et.al.|[2304.13559v2](http://arxiv.org/abs/2304.13559v2)|null|\n", "2304.13425": "|**2023-04-26**|**Learnable Ophthalmology SAM**|Zhongxi Qiu et.al.|[2304.13425v1](http://arxiv.org/abs/2304.13425v1)|**[link](https://github.com/qsingle/learnablepromptsam)**|\n", "2304.13357": "|**2023-04-26**|**Deep Lifelong Cross-modal Hashing**|Liming Xu et.al.|[2304.13357v1](http://arxiv.org/abs/2304.13357v1)|null|\n", "2304.13277": "|**2023-04-26**|**Self-Supervised Multi-Modal Sequential Recommendation**|Kunzhe Song et.al.|[2304.13277v1](http://arxiv.org/abs/2304.13277v1)|**[link](https://github.com/kz-song/mmsrec)**|\n", "2304.13273": "|**2023-04-27**|**From Association to Generation: Text-only Captioning by Unsupervised Cross-modal Mapping**|Junyang Wang et.al.|[2304.13273v2](http://arxiv.org/abs/2304.13273v2)|**[link](https://github.com/junyangwang0410/knight)**|\n", "2304.13181": "|**2023-04-25**|**Sample-Specific Debiasing for Better Image-Text Models**|Peiqi Wang et.al.|[2304.13181v1](http://arxiv.org/abs/2304.13181v1)|null|\n", "2304.13172": "|**2023-04-25**|**Generating Procedural Materials from Text or Image Prompts**|Yiwei Hu et.al.|[2304.13172v1](http://arxiv.org/abs/2304.13172v1)|null|\n", "2304.13130": "|**2023-04-25**|**Hypernymization of named entity-rich captions for grounding-based multi-modal pretraining**|Giacomo Nebbia et.al.|[2304.13130v1](http://arxiv.org/abs/2304.13130v1)|null|\n", "2304.13103": "|**2023-04-25**|**HyMo: Vulnerability Detection in Smart Contracts using a Novel Multi-Modal Hybrid Model**|Mohammad Khodadadi et.al.|[2304.13103v1](http://arxiv.org/abs/2304.13103v1)|null|\n", "2304.13097": "|**2023-04-25**|**Bridging graph data models: RDF, RDF-star, and property graphs as directed acyclic graphs**|Ewout Gelling et.al.|[2304.13097v1](http://arxiv.org/abs/2304.13097v1)|**[link](https://github.com/ewoutgelling/bridging-data-models)**|\n", "2304.14340": "|**2023-04-27**|**SparseFusion: Fusing Multi-Modal Sparse Representations for Multi-Sensor 3D Object Detection**|Yichen Xie et.al.|[2304.14340v1](http://arxiv.org/abs/2304.14340v1)|**[link](https://github.com/yichen928/sparsefusion)**|\n", "2304.14323": "|**2023-04-27**|**Pushing the Boundaries of Tractable Multiperspective Reasoning: A Deduction Calculus for Standpoint EL+**|Luc\u00eda {G\u00f3mez \u00c1lvarez} et.al.|[2304.14323v1](http://arxiv.org/abs/2304.14323v1)|**[link](https://github.com/cl-tud/standpoint-el-souffle-reasoner)**|\n", "2304.14243": "|**2023-04-27**|**Standpoint Linear Temporal Logic**|Nicola Gigante et.al.|[2304.14243v1](http://arxiv.org/abs/2304.14243v1)|null|\n", "2304.14178": "|**2023-04-27**|**mPLUG-Owl: Modularization Empowers Large Language Models with Multimodality**|Qinghao Ye et.al.|[2304.14178v1](http://arxiv.org/abs/2304.14178v1)|**[link](https://github.com/x-plug/mplug-owl)**|\n", "2304.13979": "|**2023-04-27**|**Adaptive-Mask Fusion Network for Segmentation of Drivable Road and Negative Obstacle With Untrustworthy Features**|Zhen Feng et.al.|[2304.13979v1](http://arxiv.org/abs/2304.13979v1)|**[link](https://github.com/lab-sun/amfnet)**|\n", "2304.13923": "|**2023-04-27**|**Retrieval-based Knowledge Augmented Vision Language Pre-training**|Jiahua Rao et.al.|[2304.13923v1](http://arxiv.org/abs/2304.13923v1)|null|\n", "2304.13833": "|**2023-04-26**|**Mixtures of Gaussian process experts based on kernel stick-breaking processes**|Yuji Saikai et.al.|[2304.13833v1](http://arxiv.org/abs/2304.13833v1)|**[link](https://github.com/ysaikai/gpksbp)**|\n", "2304.14204": "|**2023-04-26**|**Towards Medical Artificial General Intelligence via Knowledge-Enhanced Multimodal Pretraining**|Bingqian Lin et.al.|[2304.14204v1](http://arxiv.org/abs/2304.14204v1)|**[link](https://github.com/chenzcv7/motor)**|\n", "2304.15010": "|**2023-04-28**|**LLaMA-Adapter V2: Parameter-Efficient Visual Instruction Model**|Peng Gao et.al.|[2304.15010v1](http://arxiv.org/abs/2304.15010v1)|**[link](https://github.com/zrrskywalker/llama-adapter)**|\n", "2304.14942": "|**2023-04-28**|**The Emotions of the Crowd: Learning Image Sentiment from Tweets via Cross-modal Distillation**|Alessio Serra et.al.|[2304.14942v1](http://arxiv.org/abs/2304.14942v1)|null|\n", "2304.14936": "|**2023-04-28**|**Information Redundancy and Biases in Public Document Information Extraction Benchmarks**|Seif Laatiri et.al.|[2304.14936v1](http://arxiv.org/abs/2304.14936v1)|**[link](https://github.com/seif-lat/bias-study-funsd-sroie)**|\n", "2304.14933": "|**2023-04-28**|**An Empirical Study of Multimodal Model Merging**|Yi-Lin Sung et.al.|[2304.14933v1](http://arxiv.org/abs/2304.14933v1)|**[link](https://github.com/ylsung/vl-merging)**|\n", "2304.14880": "|**2023-04-28**|**SGAligner : 3D Scene Alignment with Scene Graphs**|Sayan Deb Sarkar et.al.|[2304.14880v1](http://arxiv.org/abs/2304.14880v1)|**[link](https://github.com/sayands/sgaligner)**|\n", "2305.00970": "|**2023-05-01**|**ArK: Augmented Reality with Knowledge Interactive Emergent Ability**|Qiuyuan Huang et.al.|[2305.00970v1](http://arxiv.org/abs/2305.00970v1)|null|\n", "2305.00769": "|**2023-05-01**|**Multi-scale Transformer-based Network for Emotion Recognition from Multi Physiological Signals**|Tu Vu et.al.|[2305.00769v1](http://arxiv.org/abs/2305.00769v1)|**[link](https://github.com/vsl-team/EPiC-2023-ACII)**|\n", "2305.00537": "|**2023-04-30**|**Interpretability of Machine Learning: Recent Advances and Future Prospects**|Lei Gao et.al.|[2305.00537v1](http://arxiv.org/abs/2305.00537v1)|null|\n", "2305.00355": "|**2023-04-29**|**MH-DETR: Video Moment and Highlight Detection with Cross-modal Transformer**|Yifang Xu et.al.|[2305.00355v1](http://arxiv.org/abs/2305.00355v1)|null|\n", "2305.00320": "|**2023-04-29**|**Fusion for Visual-Infrared Person ReID in Real-World Surveillance Using Corrupted Multimodal Data**|Arthur Josi et.al.|[2305.00320v1](http://arxiv.org/abs/2305.00320v1)|**[link](https://github.com/art2611/mreid-ucd-ccd)**|\n", "2305.00314": "|**2023-04-29**|**InfraDet3D: Multi-Modal 3D Object Detection based on Roadside Infrastructure Camera and LiDAR Sensors**|Walter Zimmer et.al.|[2305.00314v1](http://arxiv.org/abs/2305.00314v1)|null|\n", "2305.00207": "|**2023-04-29**|**Mixed-Response State-Space Model for Analyzing Multi-Dimensional Digital Phenotypes**|Tianchen Xu et.al.|[2305.00207v1](http://arxiv.org/abs/2305.00207v1)|**[link](https://github.com/zjph602xtc/MRSS)**|\n", "2305.00201": "|**2023-04-29**|**Instruction-ViT: Multi-Modal Prompts for Instruction Learning in ViT**|Zhenxiang Xiao et.al.|[2305.00201v1](http://arxiv.org/abs/2305.00201v1)|null|\n", "2305.00042": "|**2023-04-28**|**Cycle-guided Denoising Diffusion Probability Model for 3D Cross-modality MRI Synthesis**|Shaoyan Pan et.al.|[2305.00042v1](http://arxiv.org/abs/2305.00042v1)|null|\n", "2305.00976": "|**2023-05-02**|**TMR: Text-to-Motion Retrieval Using Contrastive 3D Human Motion Synthesis**|Mathis Petrovich et.al.|[2305.00976v1](http://arxiv.org/abs/2305.00976v1)|null|\n", "2305.01412": "|**2023-05-02**|**A Computational Approach for the Characterization of Airborne Pathogen Transmission in Turbulent Molecular Communication Channels**|Fatih Gulec et.al.|[2305.01412v1](http://arxiv.org/abs/2305.01412v1)|null|\n", "2305.01366": "|**2023-05-02**|**Establishing a Learning Model for Correct Hand Hygiene Technique in a NICU**|Ir\u00e9n A. Kopcs\u00f3n\u00e9 N\u00e9meth et.al.|[2305.01366v1](http://arxiv.org/abs/2305.01366v1)|null|\n", "2305.01245": "|**2023-05-02**|**MDENet: Multi-modal Dual-embedding Networks for Malware Open-set Recognition**|Jingcai Guo et.al.|[2305.01245v1](http://arxiv.org/abs/2305.01245v1)|null|\n", "2305.01233": "|**2023-05-03**|**On Uni-Modal Feature Learning in Supervised Multi-Modal Learning**|Chenzhuang Du et.al.|[2305.01233v2](http://arxiv.org/abs/2305.01233v2)|**[link](https://github.com/gewu-lab/ogm-ge_cvpr2022)**|\n", "2305.01111": "|**2023-05-01**|**Local and Global Contextual Features Fusion for Pedestrian Intention Prediction**|Mohsen Azarmi et.al.|[2305.01111v1](http://arxiv.org/abs/2305.01111v1)|null|\n", "2305.02269": "|**2023-05-03**|**M2-CTTS: End-to-End Multi-scale Multi-modal Conversational Text-to-Speech Synthesis**|Jinlong Xue et.al.|[2305.02269v1](http://arxiv.org/abs/2305.02269v1)|null|\n", "2305.01971": "|**2023-05-03**|**District-scale surface temperatures generated from high-resolution longitudinal thermal infrared images**|Subin Lin et.al.|[2305.01971v1](http://arxiv.org/abs/2305.01971v1)|**[link](https://github.com/buds-lab/project-iris-dataset)**|\n", "2305.01915": "|**2023-05-03**|**Denoising Multi-modal Sequential Recommenders with Contrastive Learning**|Dong Yao et.al.|[2305.01915v1](http://arxiv.org/abs/2305.01915v1)|null|\n", "2305.01912": "|**2023-05-03**|**MolKD: Distilling Cross-Modal Knowledge in Chemical Reactions for Molecular Property Prediction**|Liang Zeng et.al.|[2305.01912v1](http://arxiv.org/abs/2305.01912v1)|null|\n", "2305.01877": "|**2023-05-04**|**The Impacts of Dimensionality, Diffusion, and Directedness on Intrinsic Cross-Model Simulation in Tile-Based Self-Assembly**|Daniel Hader et.al.|[2305.01877v2](http://arxiv.org/abs/2305.01877v2)|null|\n", "2305.01864": "|**2023-05-05**|**Unsupervised Improvement of Audio-Text Cross-Modal Representations**|Zhepei Wang et.al.|[2305.01864v2](http://arxiv.org/abs/2305.01864v2)|**[link](https://github.com/zhepeiw/clap_curation)**|\n", "2305.01836": "|**2023-05-03**|**AV-SAM: Segment Anything Model Meets Audio-Visual Localization and Segmentation**|Shentong Mo et.al.|[2305.01836v1](http://arxiv.org/abs/2305.01836v1)|null|\n", "2305.01778": "|**2023-05-02**|**SLTUNET: A Simple Unified Model for Sign Language Translation**|Biao Zhang et.al.|[2305.01778v1](http://arxiv.org/abs/2305.01778v1)|**[link](https://github.com/bzhangGo/sltunet)**|\n", "2305.01661": "|**2023-05-02**|**SIA-FTP: A Spoken Instruction Aware Flight Trajectory Prediction Framework**|Dongyue Guo et.al.|[2305.01661v1](http://arxiv.org/abs/2305.01661v1)|null|\n", "2305.02930": "|**2023-05-04**|**Piecewise Normalizing Flows**|Harry Bevins et.al.|[2305.02930v1](http://arxiv.org/abs/2305.02930v1)|**[link](https://github.com/htjb/margarine)**|\n", "2305.02774": "|**2023-05-04**|**Spatial and Modal Optimal Transport for Fast Cross-Modal MRI Reconstruction**|Qi Wang et.al.|[2305.02774v1](http://arxiv.org/abs/2305.02774v1)|null|\n", "2305.02760": "|**2023-05-04**|**Multi-Modality Deep Network for JPEG Artifacts Reduction**|Xuhao Jiang et.al.|[2305.02760v1](http://arxiv.org/abs/2305.02760v1)|null|\n", "2305.02577": "|**2023-05-04**|**Text Reading Order in Uncontrolled Conditions by Sparse Graph Segmentation**|Renshen Wang et.al.|[2305.02577v1](http://arxiv.org/abs/2305.02577v1)|null|\n", "2305.02572": "|**2023-05-04**|**High-fidelity Generalized Emotional Talking Face Generation with Multi-modal Emotion Space Learning**|Chao Xu et.al.|[2305.02572v1](http://arxiv.org/abs/2305.02572v1)|null|\n", "2305.02504": "|**2023-05-04**|**Learning Missing Modal Electronic Health Records with Unified Multi-modal Data Embedding and Modality-Aware Attention**|Kwanhyung Lee et.al.|[2305.02504v1](http://arxiv.org/abs/2305.02504v1)|null|\n", "2305.03726": "|**2023-05-05**|**Otter: A Multi-Modal Model with In-Context Instruction Tuning**|Bo Li et.al.|[2305.03726v1](http://arxiv.org/abs/2305.03726v1)|**[link](https://github.com/luodian/otter)**|\n", "2305.03724": "|**2023-05-05**|**DualCross: Cross-Modality Cross-Domain Adaptation for Monocular BEV Perception**|Yunze Man et.al.|[2305.03724v1](http://arxiv.org/abs/2305.03724v1)|null|\n", "2305.03689": "|**2023-05-05**|**COLA: How to adapt vision-language models to Compose Objects Localized with Attributes?**|Arijit Ray et.al.|[2305.03689v1](http://arxiv.org/abs/2305.03689v1)|**[link](https://github.com/arijitray1993/COLA)**|\n", "2305.03347": "|**2023-05-05**|**A Large Cross-Modal Video Retrieval Dataset with Reading Comprehension**|Weijia Wu et.al.|[2305.03347v1](http://arxiv.org/abs/2305.03347v1)|**[link](https://github.com/callsys/textvr)**|\n", "2305.03314": "|**2023-05-05**|**Block the Label and Noise: An N-Gram Masked Speller for Chinese Spell Checking**|Haiyun Yang et.al.|[2305.03314v1](http://arxiv.org/abs/2305.03314v1)|null|\n", "2305.03277": "|**2023-05-05**|**FM-ViT: Flexible Modal Vision Transformers for Face Anti-Spoofing**|Ajian Liu et.al.|[2305.03277v1](http://arxiv.org/abs/2305.03277v1)|null|\n", "2305.03252": "|**2023-05-05**|**HeteroEdge: Addressing Asymmetry in Heterogeneous Collaborative Autonomous Systems**|Mohammad Saeid Anwar et.al.|[2305.03252v1](http://arxiv.org/abs/2305.03252v1)|null|\n", "2305.03212": "|**2023-05-04**|**LLM2Loss: Leveraging Language Models for Explainable Model Diagnostics**|Shervin Ardeshir et.al.|[2305.03212v1](http://arxiv.org/abs/2305.03212v1)|null|\n", "2305.03187": "|**2023-05-04**|**Generating Virtual On-body Accelerometer Data from Virtual Textual Descriptions for Human Activity Recognition**|Zikang Leng et.al.|[2305.03187v1](http://arxiv.org/abs/2305.03187v1)|**[link](https://github.com/ZikangLeng/IMUGPT)**|\n", "2305.03506": "|**2023-05-04**|**SI-LSTM: Speaker Hybrid Long-short Term Memory and Cross Modal Attention for Emotion Recognition in Conversation**|Xingwei Liang et.al.|[2305.03506v1](http://arxiv.org/abs/2305.03506v1)|null|\n", "2305.04824": "|**2023-05-08**|**Learning Summary-Worthy Visual Representation for Abstractive Summarization in Video**|Zenan Xu et.al.|[2305.04824v1](http://arxiv.org/abs/2305.04824v1)|null|\n", "2305.04790": "|**2023-05-09**|**MultiModal-GPT: A Vision and Language Model for Dialogue with Humans**|Tao Gong et.al.|[2305.04790v2](http://arxiv.org/abs/2305.04790v2)|**[link](https://github.com/open-mmlab/multimodal-gpt)**|\n", "2305.04685": "|**2023-05-08**|**ARDIE: AR, Dialogue, and Eye Gaze Policies for Human-Robot Collaboration**|Chelsea Zou et.al.|[2305.04685v1](http://arxiv.org/abs/2305.04685v1)|null|\n", "2305.04530": "|**2023-05-08**|**A Multi-Modal Context Reasoning Approach for Conditional Inference on Joint Textual and Visual Clues**|Yunxin Li et.al.|[2305.04530v1](http://arxiv.org/abs/2305.04530v1)|**[link](https://github.com/yunxinli/multimodal-context-reasoning)**|\n", "2305.04476": "|**2023-05-09**|**AlignSTS: Speech-to-Singing Conversion via Cross-Modal Alignment**|Ruiqi Li et.al.|[2305.04476v2](http://arxiv.org/abs/2305.04476v2)|null|\n", "2305.04474": "|**2023-05-09**|**Vision Langauge Pre-training by Contrastive Learning with Cross-Modal Similarity Regulation**|Chaoya Jiang et.al.|[2305.04474v2](http://arxiv.org/abs/2305.04474v2)|null|\n", "2305.04469": "|**2023-05-08**|**HACK: Learning a Parametric Head and Neck Model for High-fidelity Animation**|Longwen Zhang et.al.|[2305.04469v1](http://arxiv.org/abs/2305.04469v1)|**[link](https://github.com/zonelikewonderland/hack-model)**|\n", "2305.04451": "|**2023-05-08**|**FashionTex: Controllable Virtual Try-on with Text and Texture**|Anran Lin et.al.|[2305.04451v1](http://arxiv.org/abs/2305.04451v1)|**[link](https://github.com/picksh/fashiontex)**|\n", "2305.04298": "|**2023-05-07**|**Poses as Queries: Image-to-LiDAR Map Localization with Transformers**|Jinyu Miao et.al.|[2305.04298v1](http://arxiv.org/abs/2305.04298v1)|null|\n", "2305.04239": "|**2023-05-07**|**Instance-Variant Loss with Gaussian RBF Kernel for 3D Cross-modal Retriveal**|Zhitao Liu et.al.|[2305.04239v1](http://arxiv.org/abs/2305.04239v1)|null|\n", "2305.04224": "|**2023-05-07**|**Visual Causal Scene Refinement for Video Question Answering**|Yushen Wei et.al.|[2305.04224v1](http://arxiv.org/abs/2305.04224v1)|**[link](https://github.com/yangliu9208/vcsr)**|\n", "2305.04195": "|**2023-05-07**|**Cross-Modal Retrieval for Motion and Text via MildTriple Loss**|Sheng Yan et.al.|[2305.04195v1](http://arxiv.org/abs/2305.04195v1)|**[link](https://github.com/eanson023/rehamot)**|\n", "2305.04160": "|**2023-05-07**|**X-LLM: Bootstrapping Advanced Large Language Models by Treating Multi-Modalities as Foreign Languages**|Feilong Chen et.al.|[2305.04160v1](http://arxiv.org/abs/2305.04160v1)|null|\n", "2305.04156": "|**2023-05-07**|**SynthMix: Mixing up Aligned Synthesis for Medical Cross-Modality Domain Adaptation**|Xinwen Zhang et.al.|[2305.04156v1](http://arxiv.org/abs/2305.04156v1)|null|\n", "2305.04072": "|**2023-05-06**|**Keyword-Based Diverse Image Retrieval by Semantics-aware Contrastive Learning and Transformer**|Minyi Zhao et.al.|[2305.04072v1](http://arxiv.org/abs/2305.04072v1)|null|\n", "2305.05665": "|**2023-05-09**|**ImageBind: One Embedding Space To Bind Them All**|Rohit Girdhar et.al.|[2305.05665v1](http://arxiv.org/abs/2305.05665v1)|**[link](https://github.com/facebookresearch/imagebind)**|\n", "2305.05662": "|**2023-05-11**|**InternGPT: Solving Vision-Centric Tasks by Interacting with ChatGPT Beyond Language**|Zhaoyang Liu et.al.|[2305.05662v3](http://arxiv.org/abs/2305.05662v3)|**[link](https://github.com/opengvlab/interngpt)**|\n", "2305.05534": "|**2023-05-09**|**Integrating Holistic and Local Information to Estimate Emotional Reaction Intensity**|Yini Fang et.al.|[2305.05534v1](http://arxiv.org/abs/2305.05534v1)|**[link](https://github.com/hkust-nisl/abaw5)**|\n", "2305.05496": "|**2023-05-09**|**Exploiting Pseudo Image Captions for Multimodal Summarization**|Chaoya Jiang et.al.|[2305.05496v1](http://arxiv.org/abs/2305.05496v1)|**[link](https://github.com/sitaproject/sita)**|\n", "2305.05260": "|**2023-05-09**|**Guided Focal Stack Refinement Network for Light Field Salient Object Detection**|Bo Yuan et.al.|[2305.05260v1](http://arxiv.org/abs/2305.05260v1)|null|\n", "2305.05189": "|**2023-05-09**|**SUR-adapter: Enhancing Text-to-Image Pre-trained Diffusion Models with Large Language Models**|Shanshan Zhong et.al.|[2305.05189v1](http://arxiv.org/abs/2305.05189v1)|**[link](https://github.com/Qrange-group/SUR-adapter)**|\n", "2305.05166": "|**2023-05-10**|**E2TIMT: Efficient and Effective Modal Adapter for Text Image Machine Translation**|Cong Ma et.al.|[2305.05166v2](http://arxiv.org/abs/2305.05166v2)|**[link](https://github.com/ericongma/e2timt)**|\n", "2305.05126": "|**2023-05-09**|**Comparing Foundation Models using Data Kernels**|Brandon Duderstadt et.al.|[2305.05126v1](http://arxiv.org/abs/2305.05126v1)|null|\n", "2305.04961": "|**2023-05-08**|**Joint Moment Retrieval and Highlight Detection Via Natural Language Queries**|Richard Luo et.al.|[2305.04961v1](http://arxiv.org/abs/2305.04961v1)|**[link](https://github.com/skyline-9/visionary-vids)**|\n", "2305.06292": "|**2023-05-10**|**Joint Metrics Matter: A Better Standard for Trajectory Forecasting**|Erica Weng et.al.|[2305.06292v1](http://arxiv.org/abs/2305.06292v1)|**[link](https://github.com/ericaweng/joint-metrics-matter)**|\n", "2305.06278": "|**2023-05-10**|**A Multi-modal Garden Dataset and Hybrid 3D Dense Reconstruction Framework Based on Panoramic Stereo Images for a Trimming Robot**|Can Pu et.al.|[2305.06278v1](http://arxiv.org/abs/2305.06278v1)|**[link](https://github.com/canpu999/trimbot-wageningen-slam-dataset)**|\n", "2305.06225": "|**2023-05-10**|**DaGAN++: Depth-Aware Generative Adversarial Network for Talking Head Video Generation**|Fa-Ting Hong et.al.|[2305.06225v1](http://arxiv.org/abs/2305.06225v1)|**[link](https://github.com/harlanhong/cvpr2022-dagan)**|\n", "2305.06221": "|**2023-05-10**|**Multi-Prompt with Depth Partitioned Cross-Modal Learning**|Yiqi Wang et.al.|[2305.06221v1](http://arxiv.org/abs/2305.06221v1)|**[link](https://github.com/wangyiqi/pmpo)**|\n", "2305.06203": "|**2023-05-10**|**Multiclass MRI Brain Tumor Segmentation using 3D Attention-based U-Net**|Maryann M. Gitonga et.al.|[2305.06203v1](http://arxiv.org/abs/2305.06203v1)|null|\n", "2305.06179": "|**2023-05-11**|**A Multi-modal Approach to Single-modal Visual Place Classification**|Tomoya Iwasaki et.al.|[2305.06179v2](http://arxiv.org/abs/2305.06179v2)|null|\n", "2305.05992": "|**2023-05-10**|**MMoT: Mixture-of-Modality-Tokens Transformer for Composed Multimodal Conditional Image Synthesis**|Jianbin Zheng et.al.|[2305.05992v1](http://arxiv.org/abs/2305.05992v1)|null|\n", "2305.05880": "|**2023-05-10**|**ChinaOpen: A Dataset for Open-world Multimodal Learning**|Aozhu Chen et.al.|[2305.05880v1](http://arxiv.org/abs/2305.05880v1)|**[link](https://github.com/dong03/GenerativeVideo2Text)**|\n", "2305.06978": "|**2023-05-11**|**Meta-hallucinator: Towards Few-Shot Cross-Modality Cardiac Image Segmentation**|Ziyuan Zhao et.al.|[2305.06978v1](http://arxiv.org/abs/2305.06978v1)|null|\n", "2305.06923": "|**2023-05-11**|**EAML: Ensemble Self-Attention-based Mutual Learning Network for Document Image Classification**|Souhail Bakkali et.al.|[2305.06923v1](http://arxiv.org/abs/2305.06923v1)|null|\n", "2305.06794": "|**2023-05-11**|**Multi-modal Multi-level Fusion for 3D Single Object Tracking**|Zhiheng Li et.al.|[2305.06794v1](http://arxiv.org/abs/2305.06794v1)|null|\n", "2305.06720": "|**2023-05-11**|**Bi-level Dynamic Learning for Jointly Multi-modality Image Fusion and Beyond**|Zhu Liu et.al.|[2305.06720v1](http://arxiv.org/abs/2305.06720v1)|**[link](https://github.com/LiuZhu-CV/BDLFusion)**|\n", "2305.06472": "|**2023-05-12**|**ChatGPT-Like Large-Scale Foundation Models for Prognostics and Health Management: A Survey and Roadmaps**|Yan-Fu Li et.al.|[2305.06472v2](http://arxiv.org/abs/2305.06472v2)|null|\n", "2305.06407": "|**2023-05-10**|**Combo of Thinking and Observing for Outside-Knowledge VQA**|Qingyi Si et.al.|[2305.06407v1](http://arxiv.org/abs/2305.06407v1)|**[link](https://github.com/phoebussi/thinking-while-observing)**|\n", "2305.06386": "|**2023-05-10**|**Text-To-Concept (and Back) via Cross-Model Alignment**|Mazda Moayeri et.al.|[2305.06386v1](http://arxiv.org/abs/2305.06386v1)|null|\n", "2305.07358": "|**2023-05-12**|**Towards Versatile and Efficient Visual Knowledge Injection into Pre-trained Language Models with Cross-Modal Adapters**|Xinyun Zhang et.al.|[2305.07358v1](http://arxiv.org/abs/2305.07358v1)|null|\n", "2305.07334": "|**2023-05-12**|**Locking and Quacking: Stacking Bayesian model predictions by log-pooling and superposition**|Yuling Yao et.al.|[2305.07334v1](http://arxiv.org/abs/2305.07334v1)|null|\n", "2305.07216": "|**2023-05-12**|**Versatile Audio-Visual Learning for Handling Single and Multi Modalities in Emotion Regression and Classification Tasks**|Lucas Goncalves et.al.|[2305.07216v1](http://arxiv.org/abs/2305.07216v1)|**[link](https://github.com/ilucasgoncalves/vavl)**|\n", "2305.07214": "|**2023-05-12**|**MMG-Ego4D: Multi-Modal Generalization in Egocentric Action Recognition**|Xinyu Gong et.al.|[2305.07214v1](http://arxiv.org/abs/2305.07214v1)|null|\n", "2305.07437": "|**2023-05-15**|**Continual Vision-Language Representation Learning with Off-Diagonal Information**|Zixuan Ni et.al.|[2305.07437v2](http://arxiv.org/abs/2305.07437v2)|null|\n", "2305.08706": "|**2023-05-15**|**Understanding and Bridging the Modality Gap for Speech Translation**|Qingkai Fang et.al.|[2305.08706v1](http://arxiv.org/abs/2305.08706v1)|**[link](https://github.com/ictnlp/cress)**|\n", "2305.08698": "|**2023-05-15**|**Continual Multimodal Knowledge Graph Construction**|Xiang Chen et.al.|[2305.08698v1](http://arxiv.org/abs/2305.08698v1)|**[link](https://github.com/zjunlp/ContinueMKGC)**|\n", "2305.08685": "|**2023-05-15**|**CLIP-VG: Self-paced Curriculum Adapting of CLIP via Exploiting Pseudo-Language Labels for Visual Grounding**|Linhui Xiao et.al.|[2305.08685v1](http://arxiv.org/abs/2305.08685v1)|**[link](https://github.com/linhuixiao/clip-vg)**|\n", "2305.08532": "|**2023-05-15**|**Benchmarking UWB-Based Infrastructure-Free Positioning and Multi-Robot Relative Localization: Dataset and Characterization**|Paola Torrico Mor\u00f3n et.al.|[2305.08532v1](http://arxiv.org/abs/2305.08532v1)|null|\n", "2305.08522": "|**2023-05-15**|**Cross-Modality Time-Variant Relation Learning for Generating Dynamic Scene Graphs**|Jingyi Wang et.al.|[2305.08522v1](http://arxiv.org/abs/2305.08522v1)|**[link](https://github.com/qncsn2016/TR2)**|\n", "2305.08386": "|**2023-05-15**|**PLIP: Language-Image Pre-training for Person Representation Learning**|Jialong Zuo et.al.|[2305.08386v1](http://arxiv.org/abs/2305.08386v1)|**[link](https://github.com/zplusdragon/plip)**|\n", "2305.08381": "|**2023-05-15**|**Mode Approximation Makes Good Vision-Language Prompts**|Haixin Wang et.al.|[2305.08381v1](http://arxiv.org/abs/2305.08381v1)|**[link](https://github.com/willdreamer/aurora)**|\n", "2305.08372": "|**2023-05-15**|**A Novel Framework for Multimodal Named Entity Recognition with Multi-level Alignments**|Peipei Liu et.al.|[2305.08372v1](http://arxiv.org/abs/2305.08372v1)|null|\n", "2305.08252": "|**2023-05-14**|**Parameter-Efficient Fine-Tuning for Medical Image Analysis: The Missed Opportunity**|Raman Dutt et.al.|[2305.08252v1](http://arxiv.org/abs/2305.08252v1)|null|\n", "2305.08120": "|**2023-05-14**|**Unraveling Cold Start Enigmas in Predictive Analytics for OTT Media: Synergistic Meta-Insights and Multimodal Ensemble Mastery**|K. Ganguly et.al.|[2305.08120v1](http://arxiv.org/abs/2305.08120v1)|null|\n", "2305.07927": "|**2023-05-13**|**RC3: Regularized Contrastive Cross-lingual Cross-modal Pre-training**|Chulun Zhou et.al.|[2305.07927v1](http://arxiv.org/abs/2305.07927v1)|null|\n", "2305.07920": "|**2023-05-13**|**Multi-task Paired Masking with Alignment Modeling for Medical Vision-Language Pre-training**|Ke Zhang et.al.|[2305.07920v1](http://arxiv.org/abs/2305.07920v1)|null|\n", "2305.07910": "|**2023-05-13**|**Mask to reconstruct: Cooperative Semantics Completion for Video-text Retrieval**|Han Fang et.al.|[2305.07910v1](http://arxiv.org/abs/2305.07910v1)|null|\n", "2305.07825": "|**2023-05-13**|**Student Classroom Behavior Detection based on YOLOv7-BRA and Multi-Model Fusion**|Fan Yang et.al.|[2305.07825v1](http://arxiv.org/abs/2305.07825v1)|**[link](https://github.com/whiffe/scb-dataset)**|\n", "2305.07792": "|**2023-05-12**|**Contextuality in multi-agent paradoxes**|Sidiney B. Montanhano et.al.|[2305.07792v1](http://arxiv.org/abs/2305.07792v1)|null|\n", "2305.09641": "|**2023-05-16**|**FitMe: Deep Photorealistic 3D Morphable Model Avatars**|Alexandros Lattas et.al.|[2305.09641v1](http://arxiv.org/abs/2305.09641v1)|null|\n", "2305.09600": "|**2023-05-16**|**Deep Reinforcement Learning to Maximize Arterial Usage during Extreme Congestion**|Ashutosh Dutta et.al.|[2305.09600v1](http://arxiv.org/abs/2305.09600v1)|null|\n", "2305.09333": "|**2023-05-16**|**Multi-modal Visual Understanding with Prompts for Semantic Information Disentanglement of Image**|Yuzhou Peng et.al.|[2305.09333v1](http://arxiv.org/abs/2305.09333v1)|null|\n", "2305.09272": "|**2023-05-16**|**Age of Incorrect Information in Semantic Communications for NOMA Aided XR Applications**|Jianrui Chen et.al.|[2305.09272v1](http://arxiv.org/abs/2305.09272v1)|null|\n", "2305.09255": "|**2023-05-16**|**Trust-Worthy Semantic Communications for the Metaverse Relying on Federated Learning**|Jianrui Chen et.al.|[2305.09255v1](http://arxiv.org/abs/2305.09255v1)|null|\n", "2305.09212": "|**2023-05-16**|**Cross-Modal Global Interaction and Local Alignment for Audio-Visual Speech Recognition**|Yuchen Hu et.al.|[2305.09212v1](http://arxiv.org/abs/2305.09212v1)|**[link](https://github.com/yuchen005/gila)**|\n", "2305.09011": "|**2023-05-18**|**The Brain Tumor Segmentation (BraTS) Challenge 2023: Brain MR Image Synthesis for Tumor Segmentation (BraSyn)**|Hongwei Bran Li et.al.|[2305.09011v2](http://arxiv.org/abs/2305.09011v2)|null|\n", "2305.10420": "|**2023-05-17**|**CLIP-GCD: Simple Language Guided Generalized Category Discovery**|Rabah Ouldnoughi et.al.|[2305.10420v1](http://arxiv.org/abs/2305.10420v1)|null|\n", "2305.10046": "|**2023-05-17**|**Probing the Role of Positional Information in Vision-Language Models**|Philipp J. R\u00f6sch et.al.|[2305.10046v1](http://arxiv.org/abs/2305.10046v1)|null|\n", "2305.09946": "|**2023-05-17**|**DeepMSS: Deep Multi-Modality Segmentation-to-Survival Learning for Survival Outcome Prediction from PET/CT Images**|Mingyuan Meng et.al.|[2305.09946v1](http://arxiv.org/abs/2305.09946v1)|**[link](https://github.com/mungomeng/survival-deepmss)**|\n", "2305.11176": "|**2023-05-18**|**Instruct2Act: Mapping Multi-modality Instructions to Robotic Actions with Large Language Model**|Siyuan Huang et.al.|[2305.11176v1](http://arxiv.org/abs/2305.11176v1)|**[link](https://github.com/opengvlab/instruct2act)**|\n", "2305.11172": "|**2023-05-18**|**ONE-PEACE: Exploring One General Representation Model Toward Unlimited Modalities**|Peng Wang et.al.|[2305.11172v1](http://arxiv.org/abs/2305.11172v1)|**[link](https://github.com/OFA-Sys/ONE-PEACE)**|\n", "2305.11101": "|**2023-05-18**|**XFormer: Fast and Accurate Monocular 3D Body Capture**|Lihui Qian et.al.|[2305.11101v1](http://arxiv.org/abs/2305.11101v1)|null|\n", "2305.11096": "|**2023-05-22**|**Cross-modality Data Augmentation for End-to-End Sign Language Translation**|Jinhui Ye et.al.|[2305.11096v2](http://arxiv.org/abs/2305.11096v2)|**[link](https://github.com/atrewin/signxmda)**|\n", "2305.11012": "|**2023-05-18**|**SDC-UDA: Volumetric Unsupervised Domain Adaptation Framework for Slice-Direction Continuous Cross-Modality Medical Image Segmentation**|Hyungseob Shin et.al.|[2305.11012v1](http://arxiv.org/abs/2305.11012v1)|null|\n", "2305.11000": "|**2023-05-19**|**SpeechGPT: Empowering Large Language Models with Intrinsic Cross-Modal Conversational Abilities**|Dong Zhang et.al.|[2305.11000v2](http://arxiv.org/abs/2305.11000v2)|**[link](https://github.com/0nutation/speechgpt)**|\n", "2305.10920": "|**2023-05-18**|**Emergent Communication with Attention**|Ryokan Ri et.al.|[2305.10920v1](http://arxiv.org/abs/2305.10920v1)|null|\n", "2305.10838": "|**2023-05-18**|**ProgSG: Cross-Modality Representation Learning for Programs in Electronic Design Automation**|Yunsheng Bai et.al.|[2305.10838v1](http://arxiv.org/abs/2305.10838v1)|null|\n", "2305.10783": "|**2023-05-18**|**Transforming Human-Centered AI Collaboration: Redefining Embodied Agents Capabilities through Interactive Grounded Language Instructions**|Shrestha Mohanty et.al.|[2305.10783v1](http://arxiv.org/abs/2305.10783v1)|**[link](https://github.com/iglu-contest/nlp-baselines-2022)**|\n", "2305.10773": "|**2023-05-18**|**Rate-Adaptive Coding Mechanism for Semantic Communications With Multi-Modal Data**|Yangshuo He et.al.|[2305.10773v1](http://arxiv.org/abs/2305.10773v1)|null|\n", "2305.10764": "|**2023-05-18**|**OpenShape: Scaling Up 3D Shape Representation Towards Open-World Understanding**|Minghua Liu et.al.|[2305.10764v1](http://arxiv.org/abs/2305.10764v1)|null|\n", "2305.10763": "|**2023-05-18**|**CLAPSpeech: Learning Prosody from Text Context with Contrastive Language-Audio Pre-training**|Zhenhui Ye et.al.|[2305.10763v1](http://arxiv.org/abs/2305.10763v1)|null|\n", "2305.10724": "|**2023-05-18**|**Segment Any Anomaly without Training via Hybrid Prompt Regularization**|Yunkang Cao et.al.|[2305.10724v1](http://arxiv.org/abs/2305.10724v1)|**[link](https://github.com/caoyunkang/segment-any-anomaly)**|\n", "2305.10547": "|**2023-05-17**|**Rethinking Multimodal Content Moderation from an Asymmetric Angle with Mixed-modality**|Jialin Yuan et.al.|[2305.10547v1](http://arxiv.org/abs/2305.10547v1)|null|\n", "2305.10512": "|**2023-05-17**|**IMAD: IMage-Augmented multi-modal Dialogue**|Moskvoretskii Viktor et.al.|[2305.10512v1](http://arxiv.org/abs/2305.10512v1)|**[link](https://github.com/vityavitalich/imad)**|\n", "2305.11832": "|**2023-05-19**|**Improving Multimodal Joint Variational Autoencoders through Normalizing Flows and Correlation Analysis**|Agathe Senellart et.al.|[2305.11832v1](http://arxiv.org/abs/2305.11832v1)|null|\n", "2305.11818": "|**2023-05-19**|**MaGIC: Multi-modality Guided Image Completion**|Yongsheng Yu et.al.|[2305.11818v1](http://arxiv.org/abs/2305.11818v1)|null|\n", "2305.11719": "|**2023-05-19**|**Information Screening whilst Exploiting! Multimodal Relation Extraction with Feature Denoising and Multimodal Topic Modeling**|Shengqiong Wu et.al.|[2305.11719v1](http://arxiv.org/abs/2305.11719v1)|**[link](https://github.com/chocowu/mre-ise)**|\n", "2305.11579": "|**2023-05-19**|**Speech-Text Dialog Pre-training for Spoken Dialog Understanding with Explicit Cross-Modal Alignment**|Tianshu Yu et.al.|[2305.11579v1](http://arxiv.org/abs/2305.11579v1)|**[link](https://github.com/alibabaresearch/damo-convai)**|\n", "2305.11503": "|**2023-05-19**|**A Topic-aware Summarization Framework with Different Modal Side Information**|Xiuying Chen et.al.|[2305.11503v1](http://arxiv.org/abs/2305.11503v1)|null|\n", "2305.11481": "|**2023-05-22**|**CM-MaskSD: Cross-Modality Masked Self-Distillation for Referring Image Segmentation**|Wenxuan Wang et.al.|[2305.11481v2](http://arxiv.org/abs/2305.11481v2)|null|\n", "2305.11443": "|**2023-05-19**|**Equivariant Multi-Modality Image Fusion**|Zixiang Zhao et.al.|[2305.11443v1](http://arxiv.org/abs/2305.11443v1)|null|\n", "2305.11439": "|**2023-05-19**|**Few-Shot Learning with Visual Distribution Calibration and Cross-Modal Distribution Alignment**|Runqi Wang et.al.|[2305.11439v1](http://arxiv.org/abs/2305.11439v1)|**[link](https://github.com/bhrqw/sada)**|\n", "2305.11392": "|**2023-05-19**|**Fast-StrucTexT: An Efficient Hourglass Transformer with Modality-guided Dynamic Token Merge for Document Understanding**|Mingliang Zhai et.al.|[2305.11392v1](http://arxiv.org/abs/2305.11392v1)|null|\n", "2305.11349": "|**2023-05-18**|**Unsupervised Domain-agnostic Fake News Detection using Multi-modal Weak Signals**|Amila Silva et.al.|[2305.11349v1](http://arxiv.org/abs/2305.11349v1)|null|\n", "2305.11327": "|**2023-05-18**|**MALM: Mask Augmentation based Local Matching for Food-Recipe Retrieval**|Bhanu Prakash Voutharoja et.al.|[2305.11327v1](http://arxiv.org/abs/2305.11327v1)|**[link](https://github.com/myfoodchoice/malm_mask_augmentation_based_local_matching-_for-_food_recipe_retrieval)**|\n", "2305.13220": "|**2023-05-22**|**Fast Monocular Scene Reconstruction with Global-Sparse Local-Dense Grids**|Wei Dong et.al.|[2305.13220v1](http://arxiv.org/abs/2305.13220v1)|null|\n", "2305.12953": "|**2023-05-22**|**Enhancing Next Active Object-based Egocentric Action Anticipation with Guided Attention**|Sanket Thakur et.al.|[2305.12953v1](http://arxiv.org/abs/2305.12953v1)|**[link](https://github.com/sanketsans/ganov2)**|\n", "2305.12903": "|**2023-05-22**|**DiffAVA: Personalized Text-to-Audio Generation with Visual Alignment**|Shentong Mo et.al.|[2305.12903v1](http://arxiv.org/abs/2305.12903v1)|null|\n", "2305.12878": "|**2023-05-22**|**Non-Autoregressive Document-Level Machine Translation (NA-DMT): Exploring Effective Approaches, Challenges, and Opportunities**|Guangsheng Bao et.al.|[2305.12878v1](http://arxiv.org/abs/2305.12878v1)|**[link](https://github.com/baoguangsheng/nat-on-doc)**|\n", "2305.12807": "|**2023-05-22**|**Multi-task Combinatorial Optimization: Adaptive Multi-modality Knowledge Transfer by an Explicit Inter-task Distance**|Peng Li et.al.|[2305.12807v1](http://arxiv.org/abs/2305.12807v1)|null|\n", "2305.12793": "|**2023-05-22**|**Zero-Shot End-to-End Spoken Language Understanding via Cross-Modal Selective Self-Training**|Jianfeng He et.al.|[2305.12793v1](http://arxiv.org/abs/2305.12793v1)|null|\n", "2305.12711": "|**2023-05-22**|**Unsupervised Visible-Infrared Person ReID by Collaborative Learning with Neighbor-Guided Label Refinement**|De Cheng et.al.|[2305.12711v1](http://arxiv.org/abs/2305.12711v1)|null|\n", "2305.12703": "|**2023-05-22**|**Progressive Sub-Graph Clustering Algorithm for Semi-Supervised Domain Adaptation Speaker Verification**|Zhuo Li et.al.|[2305.12703v1](http://arxiv.org/abs/2305.12703v1)|null|\n", "2305.12673": "|**2023-05-22**|**Efficient Bilateral Cross-Modality Cluster Matching for Unsupervised Visible-Infrared Person ReID**|De cheng et.al.|[2305.12673v1](http://arxiv.org/abs/2305.12673v1)|null|\n", "2305.12530": "|**2023-05-21**|**Towards Robust Family-Infant Audio Analysis Based on Unsupervised Pretraining of Wav2vec 2.0 on Large-Scale Unlabeled Family Audio**|Jialu Li et.al.|[2305.12530v1](http://arxiv.org/abs/2305.12530v1)|null|\n", "2305.12452": "|**2023-05-21**|**Advancing Referring Expression Segmentation Beyond Single Image**|Yixuan Wu et.al.|[2305.12452v1](http://arxiv.org/abs/2305.12452v1)|null|\n", "2305.12369": "|**2023-05-21**|**HIINT: Historical, Intra- and Inter- personal Dynamics Modeling with Cross-person Memory Transformer**|Yubin Kim et.al.|[2305.12369v1](http://arxiv.org/abs/2305.12369v1)|null|\n", "2305.12260": "|**2023-05-20**|**Cross2StrA: Unpaired Cross-lingual Image Captioning with Cross-lingual Cross-modal Structure-pivoted Alignment**|Shengqiong Wu et.al.|[2305.12260v1](http://arxiv.org/abs/2305.12260v1)|null|\n", "2305.12218": "|**2023-05-20**|**Text-Video Retrieval with Disentangled Conceptualization and Set-to-Set Alignment**|Peng Jin et.al.|[2305.12218v1](http://arxiv.org/abs/2305.12218v1)|**[link](https://github.com/jpthu17/dicosa)**|\n", "2305.12011": "|**2023-05-19**|**Boosting Crop Classification by Hierarchically Fusing Satellite, Rotational, and Contextual Data**|Barriere Valentin et.al.|[2305.12011v1](http://arxiv.org/abs/2305.12011v1)|null|\n", "2305.14312": "|**2023-05-23**|**Text-guided 3D Human Generation from 2D Collections**|Tsu-Jui Fu et.al.|[2305.14312v1](http://arxiv.org/abs/2305.14312v1)|null|\n", "2305.14167": "|**2023-05-24**|**DetGPT: Detect What You Need via Reasoning**|Renjie Pi et.al.|[2305.14167v2](http://arxiv.org/abs/2305.14167v2)|null|\n", "2305.14042": "|**2023-05-23**|**Improving speech translation by fusing speech and text**|Wenbiao Yin et.al.|[2305.14042v1](http://arxiv.org/abs/2305.14042v1)|null|\n", "2305.14017": "|**2023-05-23**|**Faster Video Moment Retrieval with Point-Level Supervision**|Xun Jiang et.al.|[2305.14017v1](http://arxiv.org/abs/2305.14017v1)|null|\n", "2305.14014": "|**2023-05-23**|**CLIP4STR: A Simple Baseline for Scene Text Recognition with Pre-trained Vision-Language Model**|Shuai Zhao et.al.|[2305.14014v1](http://arxiv.org/abs/2305.14014v1)|null|\n", "2305.13986": "|**2023-05-23**|**A Multi-Modal Network Equilibrium Model with Interacting Mobility Service Providers'Strategies**|Claudia Bandiera et.al.|[2305.13986v1](http://arxiv.org/abs/2305.13986v1)|null|\n", "2305.13705": "|**2023-05-23**|**DiffHand: End-to-End Hand Mesh Reconstruction via Diffusion Models**|Lijun Li et.al.|[2305.13705v1](http://arxiv.org/abs/2305.13705v1)|null|\n", "2305.13697": "|**2023-05-23**|**UNIMO-3: Multi-granularity Interaction for Vision-Language Representation Learning**|Hao Yang et.al.|[2305.13697v1](http://arxiv.org/abs/2305.13697v1)|null|\n", "2305.13667": "|**2023-05-23**|**Optimizing Non-Autoregressive Transformers with Contrastive Learning**|Chenxin An et.al.|[2305.13667v1](http://arxiv.org/abs/2305.13667v1)|null|\n", "2305.13659": "|**2023-05-23**|**Flare-Aware Cross-modal Enhancement Network for Multi-spectral Vehicle Re-identification**|Aihua Zheng et.al.|[2305.13659v1](http://arxiv.org/abs/2305.13659v1)|**[link](https://github.com/Mzq12138/Official-Implementation-for-Flare-Aware-Cross-modal-Enhancement-for-Multi-spectral-Vehicle-ReID)**|\n", "2305.13653": "|**2023-05-23**|**RaSa: Relation and Sensitivity Aware Representation Learning for Text-based Person Search**|Yang Bai et.al.|[2305.13653v1](http://arxiv.org/abs/2305.13653v1)|**[link](https://github.com/flame-chasers/rasa)**|\n", "2305.13631": "|**2023-05-23**|**EDIS: Entity-Driven Image Search over Multimodal Web Content**|Siqi Liu et.al.|[2305.13631v1](http://arxiv.org/abs/2305.13631v1)|**[link](https://github.com/emerisly/edis)**|\n", "2305.13503": "|**2023-05-22**|**Asynchronous Multi-Model Federated Learning over Wireless Networks: Theory, Modeling, and Optimization**|Zhan-Lun Chang et.al.|[2305.13503v1](http://arxiv.org/abs/2305.13503v1)|null|\n", "2305.15403": "|**2023-05-24**|**AV-TranSpeech: Audio-Visual Robust Speech-to-Speech Translation**|Rongjie Huang et.al.|[2305.15403v1](http://arxiv.org/abs/2305.15403v1)|null|\n", "2305.15302": "|**2023-05-24**|**Multi-Modal Mutual Attention and Iterative Interaction for Referring Image Segmentation**|Chang Liu et.al.|[2305.15302v1](http://arxiv.org/abs/2305.15302v1)|null|\n", "2305.15296": "|**2023-05-24**|**MultiFusion: Fusing Pre-Trained Models for Multi-Lingual, Multi-Modal Image Generation**|Marco Bellagente et.al.|[2305.15296v1](http://arxiv.org/abs/2305.15296v1)|null|\n", "2305.15218": "|**2023-05-24**|**Multi-modal Machine Learning for Vehicle Rating Predictions Using Image, Text, and Parametric Data**|Hanqi Su et.al.|[2305.15218v1](http://arxiv.org/abs/2305.15218v1)|null|\n", "2305.15217": "|**2023-05-24**|**L-CAD: Language-based Colorization with Any-level Descriptions**|Zheng Chang et.al.|[2305.15217v1](http://arxiv.org/abs/2305.15217v1)|null|\n", "2305.15159": "|**2023-05-24**|**Collaborative Recommendation Model Based on Multi-modal Multi-view Attention Network: Movie and literature cases**|Zheng Hu et.al.|[2305.15159v1](http://arxiv.org/abs/2305.15159v1)|null|\n", "2305.15094": "|**2023-05-24**|**InpaintNeRF360: Text-Guided 3D Inpainting on Unbounded Neural Radiance Fields**|Dongqing Wang et.al.|[2305.15094v1](http://arxiv.org/abs/2305.15094v1)|null|\n", "2305.15033": "|**2023-05-24**|**SmartTrim: Adaptive Tokens and Parameters Pruning for Efficient Vision-Language Models**|Zekun Wang et.al.|[2305.15033v1](http://arxiv.org/abs/2305.15033v1)|null|\n", "2305.15023": "|**2023-05-24**|**Cheap and Quick: Efficient Vision-Language Instruction Tuning for Large Language Models**|Gen Luo et.al.|[2305.15023v1](http://arxiv.org/abs/2305.15023v1)|null|\n", "2305.15021": "|**2023-05-24**|**EmbodiedGPT: Vision-Language Pre-Training via Embodied Chain of Thought**|Yao Mu et.al.|[2305.15021v1](http://arxiv.org/abs/2305.15021v1)|**[link](https://github.com/EmbodiedGPT/EmbodiedGPT_Pytorch)**|\n", "2305.14969": "|**2023-05-24**|**MMNet: Multi-Mask Network for Referring Image Segmentation**|Yichen Yan et.al.|[2305.14969v1](http://arxiv.org/abs/2305.14969v1)|null|\n", "2305.14914": "|**2023-05-24**|**GAMUS: A Geometry-aware Multi-modal Semantic Segmentation Benchmark for Remote Sensing Data**|Zhitong Xiong et.al.|[2305.14914v1](http://arxiv.org/abs/2305.14914v1)|**[link](https://github.com/earthnets/rsi-mmsegmentation)**|\n", "2305.14897": "|**2023-05-24**|**Text encoders are performance bottlenecks in contrastive vision-language models**|Amita Kamath et.al.|[2305.14897v1](http://arxiv.org/abs/2305.14897v1)|**[link](https://github.com/amitakamath/vl_text_encoders_are_bottlenecks)**|\n", "2305.14843": "|**2023-05-24**|**Meta-Learning For Vision-and-Language Cross-lingual Transfer**|Hanxu Hu et.al.|[2305.14843v1](http://arxiv.org/abs/2305.14843v1)|null|\n", "2305.14839": "|**2023-05-24**|**PaCE: Unified Multi-modal Dialogue Pre-training with Progressive and Compositional Experts**|Yunshui Li et.al.|[2305.14839v1](http://arxiv.org/abs/2305.14839v1)|**[link](https://github.com/AlibabaResearch/DAMO-ConvAI/tree/main/pace)**|\n", "2305.16318": "|**2023-05-25**|**Referred by Multi-Modality: A Unified Temporal Transformer for Video Object Segmentation**|Shilin Yan et.al.|[2305.16318v1](http://arxiv.org/abs/2305.16318v1)|**[link](https://github.com/opengvlab/mutr)**|\n", "2305.16304": "|**2023-05-25**|**Candidate Set Re-ranking for Composed Image Retrieval with Dual Multi-modal Encoder**|Zheyuan Liu et.al.|[2305.16304v1](http://arxiv.org/abs/2305.16304v1)|null|\n", "2305.16166": "|**2023-05-25**|**Multimodal Relation Extraction with Cross-Modal Retrieval and Synthesis**|Xuming Hu et.al.|[2305.16166v1](http://arxiv.org/abs/2305.16166v1)|null|\n", "2305.16107": "|**2023-05-25**|**VioLA: Unified Codec Language Models for Speech Recognition, Synthesis, and Translation**|Tianrui Wang et.al.|[2305.16107v1](http://arxiv.org/abs/2305.16107v1)|null|\n", "2305.15957": "|**2023-05-25**|**DiffCLIP: Leveraging Stable Diffusion for Language Grounded 3D Classification**|Sitian Shen et.al.|[2305.15957v1](http://arxiv.org/abs/2305.15957v1)|null|\n", "2305.15920": "|**2023-05-25**|**Learning and accurate generation of stochastic dynamics based on multi-model Generative Adversarial Networks**|Daniele Lanzoni et.al.|[2305.15920v1](http://arxiv.org/abs/2305.15920v1)|null|\n", "2305.15913": "|**2023-05-27**|**MEMEX: Detecting Explanatory Evidence for Memes via Knowledge-Enriched Contextualization**|Shivam Sharma et.al.|[2305.15913v2](http://arxiv.org/abs/2305.15913v2)|**[link](https://github.com/lcs2-iiitd/memex_meme_evidence)**|\n", "2305.15765": "|**2023-05-25**|**Language-Guided 3D Object Detection in Point Cloud for Autonomous Driving**|Wenhao Cheng et.al.|[2305.15765v1](http://arxiv.org/abs/2305.15765v1)|null|\n", "2305.15762": "|**2023-05-25**|**Dynamic Enhancement Network for Partial Multi-modality Person Re-identification**|Aihua Zheng et.al.|[2305.15762v1](http://arxiv.org/abs/2305.15762v1)|null|\n", "2305.15753": "|**2023-05-25**|**T2TD: Text-3D Generation Model based on Prior Knowledge Guidance**|Weizhi Nie et.al.|[2305.15753v1](http://arxiv.org/abs/2305.15753v1)|null|\n", "2305.15732": "|**2023-05-26**|**CLIP3Dstyler: Language Guided 3D Arbitrary Neural Style Transfer**|Ming Gao et.al.|[2305.15732v2](http://arxiv.org/abs/2305.15732v2)|null|\n", "2305.15688": "|**2023-05-25**|**Frame-Event Alignment and Fusion Network for High Frame Rate Tracking**|Jiqing Zhang et.al.|[2305.15688v1](http://arxiv.org/abs/2305.15688v1)|null|\n", "2305.15483": "|**2023-05-24**|**Weakly Supervised Vision-and-Language Pre-training with Relative Representations**|Chi Chen et.al.|[2305.15483v1](http://arxiv.org/abs/2305.15483v1)|null|\n", "2305.17102": "|**2023-05-26**|**GeoVLN: Learning Geometry-Enhanced Visual Representation with Slot Attention for Vision-and-Language Navigation**|Jingyang Huo et.al.|[2305.17102v1](http://arxiv.org/abs/2305.17102v1)|**[link](https://github.com/jingyanghuo/GeoVLN)**|\n", "2305.17100": "|**2023-05-26**|**BiomedGPT: A Unified and Generalist Biomedical Generative Pre-trained Transformer for Vision, Language, and Multimodal Tasks**|Kai Zhang et.al.|[2305.17100v1](http://arxiv.org/abs/2305.17100v1)|**[link](https://github.com/taokz/biomedgpt)**|\n", "2305.17011": "|**2023-05-26**|**SOC: Semantic-Assisted Object Cluster for Referring Video Object Segmentation**|Zhuoyan Luo et.al.|[2305.17011v1](http://arxiv.org/abs/2305.17011v1)|null|\n", "2305.16986": "|**2023-05-29**|**NavGPT: Explicit Reasoning in Vision-and-Language Navigation with Large Language Models**|Gengze Zhou et.al.|[2305.16986v2](http://arxiv.org/abs/2305.16986v2)|**[link](https://github.com/gengzezhou/navgpt)**|\n", "2305.16685": "|**2023-05-26**|**S4M: Generating Radiology Reports by A Single Model for Multiple Body Parts**|Qi Chen et.al.|[2305.16685v1](http://arxiv.org/abs/2305.16685v1)|**[link](https://github.com/ytongxie/s4m)**|\n", "2305.16556": "|**2023-05-26**|**LANISTR: Multimodal Learning from Structured and Unstructured Data**|Sayna Ebrahimi et.al.|[2305.16556v1](http://arxiv.org/abs/2305.16556v1)|null|\n", "2305.16434": "|**2023-05-25**|**Credit Valuation Adjustment in Financial Networks**|Irena Barja\u0161i\u0107 et.al.|[2305.16434v1](http://arxiv.org/abs/2305.16434v1)|null|\n", "2305.16406": "|**2023-05-25**|**Context-Aware Attention Layers coupled with Optimal Transport Domain Adaptation methods for recognizing dementia from spontaneous speech**|Loukas Ilias et.al.|[2305.16406v1](http://arxiv.org/abs/2305.16406v1)|null|\n", "2305.18171": "|**2023-05-29**|**Improved Probabilistic Image-Text Representations**|Sanghyuk Chun et.al.|[2305.18171v1](http://arxiv.org/abs/2305.18171v1)|**[link](https://github.com/naver-ai/pcmepp)**|\n", "2305.18009": "|**2023-05-29**|**Multi-Modal Face Stylization with a Generative Prior**|Mengtian Li et.al.|[2305.18009v1](http://arxiv.org/abs/2305.18009v1)|null|\n", "2305.17993": "|**2023-05-29**|**Multi-Scale Attention for Audio Question Answering**|Guangyao Li et.al.|[2305.17993v1](http://arxiv.org/abs/2305.17993v1)|**[link](https://github.com/gewu-lab/mwafm)**|\n", "2305.17941": "|**2023-05-29**|**Safety of autonomous vehicles: A survey on Model-based vs. AI-based approaches**|Dimia Iberraken et.al.|[2305.17941v1](http://arxiv.org/abs/2305.17941v1)|null|\n", "2305.17925": "|**2023-05-29**|**Identifying shifts in multi-modal travel patterns during special events using mobile data: Celebrating Vappu in Helsinki**|Zhiren Huang et.al.|[2305.17925v1](http://arxiv.org/abs/2305.17925v1)|null|\n", "2305.17911": "|**2023-05-29**|**TotalDefMeme: A Multi-Attribute Meme dataset on Total Defence in Singapore**|Nirmalendu Prakash et.al.|[2305.17911v1](http://arxiv.org/abs/2305.17911v1)|null|\n", "2305.17903": "|**2023-05-30**|**Deeply Coupled Cross-Modal Prompt Learning**|Xuejing Liu et.al.|[2305.17903v2](http://arxiv.org/abs/2305.17903v2)|**[link](https://github.com/gingl/cmpa)**|\n", "2305.17652": "|**2023-05-28**|**ConaCLIP: Exploring Distillation of Fully-Connected Knowledge Interaction Graph for Lightweight Text-Image Retrieval**|Jiapeng Wang et.al.|[2305.17652v1](http://arxiv.org/abs/2305.17652v1)|null|\n", "2305.17629": "|**2023-05-28**|**Multi-Modal Wireless Flexible Gel-Free Sensors with Edge Deep Learning for Detecting and Alerting Freezing of Gait in Parkinson's Patients**|Yuhan Hou et.al.|[2305.17629v1](http://arxiv.org/abs/2305.17629v1)|null|\n", "2305.17600": "|**2023-05-28**|**GAME-UP: Game-Aware Mode Enumeration and Understanding for Trajectory Prediction**|Justin Lidard et.al.|[2305.17600v1](http://arxiv.org/abs/2305.17600v1)|null|\n", "2305.17530": "|**2023-05-27**|**PuMer: Pruning and Merging Tokens for Efficient Vision Language Models**|Qingqing Cao et.al.|[2305.17530v1](http://arxiv.org/abs/2305.17530v1)|**[link](https://github.com/csarron/pumer)**|\n", "2305.17499": "|**2023-05-27**|**CIF-PT: Bridging Speech and Text Representations for Spoken Language Understanding via Continuous Integrate-and-Fire Pre-Training**|Linhao Dong et.al.|[2305.17499v1](http://arxiv.org/abs/2305.17499v1)|null|\n", "2305.17455": "|**2023-05-27**|**CrossGET: Cross-Guided Ensemble of Tokens for Accelerating Vision-Language Transformers**|Dachuan Shi et.al.|[2305.17455v1](http://arxiv.org/abs/2305.17455v1)|**[link](https://github.com/sdc17/crossget)**|\n", "2305.17343": "|**2023-05-27**|**Modality-Independent Teachers Meet Weakly-Supervised Audio-Visual Event Parser**|Yung-Hsuan Lai et.al.|[2305.17343v1](http://arxiv.org/abs/2305.17343v1)|**[link](https://github.com/franklin905/valor)**|\n", "2305.17219": "|**2023-05-26**|**GVdoc: Graph-based Visual Document Classification**|Fnu Mohbat et.al.|[2305.17219v1](http://arxiv.org/abs/2305.17219v1)|**[link](https://github.com/mohbattharani/GVdoc)**|\n", "2305.19270": "|**2023-05-30**|**Learning without Forgetting for Vision-Language Models**|Da-Wei Zhou et.al.|[2305.19270v1](http://arxiv.org/abs/2305.19270v1)|null|\n", "2305.19240": "|**2023-05-30**|**NetHack is Hard to Hack**|Ulyana Piterbarg et.al.|[2305.19240v1](http://arxiv.org/abs/2305.19240v1)|**[link](https://github.com/upiterbarg/hihack)**|\n", "2305.19228": "|**2023-05-30**|**Unsupervised Melody-to-Lyric Generation**|Yufei Tian et.al.|[2305.19228v1](http://arxiv.org/abs/2305.19228v1)|**[link](https://github.com/amazon-science/unsupervised-melody-to-lyrics-generation)**|\n", "2305.19216": "|**2023-05-30**|**Translation-Enhanced Multilingual Text-to-Image Generation**|Yaoyiran Li et.al.|[2305.19216v1](http://arxiv.org/abs/2305.19216v1)|null|\n", "2305.18980": "|**2023-05-30**|**Multi-modal Queried Object Detection in the Wild**|Yifan Xu et.al.|[2305.18980v1](http://arxiv.org/abs/2305.18980v1)|**[link](https://github.com/yifanxu74/mq-det)**|\n", "2305.18969": "|**2023-05-30**|**MS-DETR: Natural Language Video Localization with Sampling Moment-Moment Interaction**|Jing Wang et.al.|[2305.18969v1](http://arxiv.org/abs/2305.18969v1)|**[link](https://github.com/k-nick/ms-detr)**|\n", "2305.18898": "|**2023-05-30**|**AlphaBlock: Embodied Finetuning for Vision-Language Reasoning in Robot Manipulation**|Chuhao Jin et.al.|[2305.18898v1](http://arxiv.org/abs/2305.18898v1)|null|\n", "2305.18842": "|**2023-05-30**|**Generate then Select: Open-ended Visual Question Answering Guided by World Knowledge**|Xingyu Fu et.al.|[2305.18842v1](http://arxiv.org/abs/2305.18842v1)|null|\n", "2305.18752": "|**2023-05-30**|**GPT4Tools: Teaching Large Language Model to Use Tools via Self-instruction**|Rui Yang et.al.|[2305.18752v1](http://arxiv.org/abs/2305.18752v1)|**[link](https://github.com/stevengrove/gpt4tools)**|\n", "2305.18721": "|**2023-05-30**|**LayoutMask: Enhance Text-Layout Interaction in Multi-modal Pre-training for Document Understanding**|Yi Tu et.al.|[2305.18721v1](http://arxiv.org/abs/2305.18721v1)|null|\n", "2305.18641": "|**2023-05-29**|**Enhanced Chart Understanding in Vision and Language Task via Cross-modal Pre-training on Plot Table Pairs**|Mingyang Zhou et.al.|[2305.18641v1](http://arxiv.org/abs/2305.18641v1)|null|\n", "2305.18500": "|**2023-05-29**|**VAST: A Vision-Audio-Subtitle-Text Omni-Modality Foundation Model and Dataset**|Sihan Chen et.al.|[2305.18500v1](http://arxiv.org/abs/2305.18500v1)|**[link](https://github.com/txh-mercury/vast)**|\n", "2305.19972": "|**2023-05-31**|**ViLaS: Integrating Vision and Language into Automatic Speech Recognition**|Minglun Han et.al.|[2305.19972v1](http://arxiv.org/abs/2305.19972v1)|null|\n", "2305.19924": "|**2023-06-01**|**Joint Adaptive Representations for Image-Language Learning**|AJ Piergiovanni et.al.|[2305.19924v2](http://arxiv.org/abs/2305.19924v2)|null|\n", "2305.19912": "|**2023-05-31**|**Structure-Aware Language Model Pretraining Improves Dense Retrieval on Structured Data**|Xinze Li et.al.|[2305.19912v1](http://arxiv.org/abs/2305.19912v1)|**[link](https://github.com/openmatch/openmatch)**|\n", "2305.19894": "|**2023-05-31**|**Med-UniC: Unifying Cross-Lingual Medical Vision-Language Pre-Training by Diminishing Bias**|Zhongwei Wan et.al.|[2305.19894v1](http://arxiv.org/abs/2305.19894v1)|**[link](https://github.com/SUSTechBruce/Med-UniC)**|\n", "2305.19664": "|**2023-05-31**|**Unveiling Cross Modality Bias in Visual Question Answering: A Causal View with Possible Worlds VQA**|Ali Vosoughi et.al.|[2305.19664v1](http://arxiv.org/abs/2305.19664v1)|null|\n", "2305.19624": "|**2023-05-31**|**A Multi-Modal Transformer Network for Action Detection**|Matthew Korban et.al.|[2305.19624v1](http://arxiv.org/abs/2305.19624v1)|null|\n", "2305.19595": "|**2023-06-01**|**Dense and Aligned Captions (DAC) Promote Compositional Reasoning in VL Models**|Sivan Doveh et.al.|[2305.19595v2](http://arxiv.org/abs/2305.19595v2)|null|\n", "2305.19522": "|**2023-06-01**|**PromptStyle: Controllable Style Transfer for Text-to-Speech with Natural Language Descriptions**|Guanghou Liu et.al.|[2305.19522v2](http://arxiv.org/abs/2305.19522v2)|null|\n", "2306.00978": "|**2023-06-01**|**AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration**|Ji Lin et.al.|[2306.00978v1](http://arxiv.org/abs/2306.00978v1)|**[link](https://github.com/mit-han-lab/llm-awq)**|\n", "2306.00964": "|**2023-06-01**|**Cocktail: Mixing Multi-Modality Controls for Text-Conditional Image Generation**|Minghui Hu et.al.|[2306.00964v1](http://arxiv.org/abs/2306.00964v1)|null|\n", "2306.00958": "|**2023-06-01**|**LIV: Language-Image Representations and Rewards for Robotic Control**|Yecheng Jason Ma et.al.|[2306.00958v1](http://arxiv.org/abs/2306.00958v1)|**[link](https://github.com/penn-pal-lab/liv)**|\n", "2306.00932": "|**2023-06-01**|**Cross Modal Data Discovery over Structured and Unstructured Data Lakes**|Mohamed Y. Eltabakh et.al.|[2306.00932v1](http://arxiv.org/abs/2306.00932v1)|**[link](https://github.com/qcri/cmdl)**|\n", "2306.00813": "|**2023-06-01**|**UniDiff: Advancing Vision-Language Models with Generative and Discriminative Learning**|Xiao Dong et.al.|[2306.00813v1](http://arxiv.org/abs/2306.00813v1)|null|\n", "2306.00792": "|**2023-06-01**|**Learning Across Decentralized Multi-Modal Remote Sensing Archives with Federated Learning**|Bar\u0131\u015f B\u00fcy\u00fckta\u015f et.al.|[2306.00792v1](http://arxiv.org/abs/2306.00792v1)|null|\n", "2306.00789": "|**2023-06-01**|**Improved Cross-Lingual Transfer Learning For Automatic Speech Translation**|Sameer Khurana et.al.|[2306.00789v1](http://arxiv.org/abs/2306.00789v1)|null|\n", "2306.00783": "|**2023-06-01**|**FDNeRF: Semantics-Driven Face Reconstruction, Prompt Editing and Relighting with Diffusion Models**|Hao Zhang et.al.|[2306.00783v1](http://arxiv.org/abs/2306.00783v1)|**[link](https://github.com/billyxyb/fdnerf)**|\n", "2306.00640": "|**2023-06-01**|**Multi-Modal Deep Learning for Multi-Temporal Urban Mapping With a Partly Missing Optical Modality**|Sebastian Hafner et.al.|[2306.00640v1](http://arxiv.org/abs/2306.00640v1)|null|\n", "2306.00424": "|**2023-06-01**|**End-to-end Knowledge Retrieval with Multi-modal Queries**|Man Luo et.al.|[2306.00424v1](http://arxiv.org/abs/2306.00424v1)|**[link](https://github.com/luomancs/remuq)**|\n", "2306.00409": "|**2023-06-01**|**Adapting Pre-trained Language Models to Vision-Language Tasks via Dynamic Visual Prompting**|Shubin Huang et.al.|[2306.00409v1](http://arxiv.org/abs/2306.00409v1)|**[link](https://github.com/hsb1357173526/dynamic_visual_prompting)**|\n", "2306.00386": "|**2023-06-01**|**Symmetric Uncertainty-Aware Feature Transmission for Depth Super-Resolution**|Wuxuan Shi et.al.|[2306.00386v1](http://arxiv.org/abs/2306.00386v1)|**[link](https://github.com/shiwuxuan/suft)**|\n", "2306.00228": "|**2023-05-31**|**Using Visual Cropping to Enhance Fine-Detail Question Answering of BLIP-Family Models**|Jiarui Zhang et.al.|[2306.00228v1](http://arxiv.org/abs/2306.00228v1)|null|\n", "2306.00179": "|**2023-05-31**|**LeggedWalking on Inclined Surfaces**|Chenghao Wang et.al.|[2306.00179v1](http://arxiv.org/abs/2306.00179v1)|null|\n", "2306.00103": "|**2023-05-31**|**ManagerTower: Aggregating the Insights of Uni-Modal Experts for Vision-Language Representation Learning**|Xiao Xu et.al.|[2306.00103v1](http://arxiv.org/abs/2306.00103v1)|**[link](https://github.com/looperxx/managertower)**|\n", "2306.01733": "|**2023-06-02**|**DocFormerv2: Local Features for Document Understanding**|Srikar Appalaraju et.al.|[2306.01733v1](http://arxiv.org/abs/2306.01733v1)|null|\n", "2306.01675": "|**2023-06-02**|**Bayesian Segmentation Modeling of Epidemic Growth**|Tejasv Bedi et.al.|[2306.01675v1](http://arxiv.org/abs/2306.01675v1)|null|\n", "2306.01656": "|**2023-06-02**|**Backchannel Detection and Agreement Estimation from Video with Transformer Networks**|Ahmed Amer et.al.|[2306.01656v1](http://arxiv.org/abs/2306.01656v1)|**[link](https://git.opendfki.de/body_language/ijcnn23-backchannel-detection)**|\n", "2306.01523": "|**2023-06-02**|**Transformer-based Multi-Modal Learning for Multi Label Remote Sensing Image Classification**|David Hoffmann et.al.|[2306.01523v1](http://arxiv.org/abs/2306.01523v1)|null|\n", "2306.01492": "|**2023-06-02**|**Multi-Modal Emotion Recognition for Enhanced Requirements Engineering: A Novel Approach**|Ben Cheng et.al.|[2306.01492v1](http://arxiv.org/abs/2306.01492v1)|null|\n", "2306.01312": "|**2023-06-02**|**Syntax-aware Hybrid prompt model for Few-shot multi-modal sentiment analysis**|Zikai Zhou et.al.|[2306.01312v1](http://arxiv.org/abs/2306.01312v1)|null|\n", "2306.01311": "|**2023-06-02**|**MetaVL: Transferring In-Context Learning Ability From Language Models to Vision-Language Models**|Masoud Monajatipoor et.al.|[2306.01311v1](http://arxiv.org/abs/2306.01311v1)|null|\n", "2306.01163": "|**2023-06-01**|**A Multi-Modal Latent-Features based Service Recommendation System for the Social Internet of Things**|Amar Khelloufi et.al.|[2306.01163v1](http://arxiv.org/abs/2306.01163v1)|null|\n", "2306.01144": "|**2023-06-01**|**Evaluating the Capabilities of Multi-modal Reasoning Models with Synthetic Task Data**|Nathan Vaska et.al.|[2306.01144v1](http://arxiv.org/abs/2306.01144v1)|null|\n", "2306.01112": "|**2023-06-01**|**What if We Enrich day-ahead Solar Irradiance Time Series Forecasting with Spatio-Temporal Context?**|Oussama Boussif et.al.|[2306.01112v1](http://arxiv.org/abs/2306.01112v1)|**[link](https://github.com/gitbooo/CrossViVit)**|\n", "2306.02972": "|**2023-06-05**|**Simultaneous or Sequential Training? How Speech Representations Cooperate in a Multi-Task Self-Supervised Learning System**|Khazar Khorrami et.al.|[2306.02972v1](http://arxiv.org/abs/2306.02972v1)|null|\n", "2306.02901": "|**2023-06-05**|**A Vessel-Segmentation-Based CycleGAN for Unpaired Multi-modal Retinal Image Synthesis**|Aline Sindel et.al.|[2306.02901v1](http://arxiv.org/abs/2306.02901v1)|null|\n", "2306.02894": "|**2023-06-05**|**Recyclable Semi-supervised Method Based on Multi-model Ensemble for Video Scene Parsing**|Biao Wu et.al.|[2306.02894v1](http://arxiv.org/abs/2306.02894v1)|null|\n", "2306.02858": "|**2023-06-06**|**Video-LLaMA: An Instruction-tuned Audio-Visual Language Model for Video Understanding**|Hang Zhang et.al.|[2306.02858v2](http://arxiv.org/abs/2306.02858v2)|**[link](https://github.com/damo-nlp-sg/video-llama)**|\n", "2306.02841": "|**2023-06-05**|**CTRL: Connect Tabular and Language Model for CTR Prediction**|Xiangyang Li et.al.|[2306.02841v1](http://arxiv.org/abs/2306.02841v1)|null|\n", "2306.02831": "|**2023-06-05**|**MM-DAG: Multi-task DAG Learning for Multi-modal Data -- with Application for Traffic Congestion Analysis**|Tian Lan et.al.|[2306.02831v1](http://arxiv.org/abs/2306.02831v1)|**[link](https://github.com/lantian72/mm-dag)**|\n", "2306.02673": "|**2023-06-05**|**Cross-Modal Vertical Federated Learning for MRI Reconstruction**|Yunlu Yan et.al.|[2306.02673v1](http://arxiv.org/abs/2306.02673v1)|null|\n", "2306.02596": "|**2023-06-05**|**A Novel Interpretable and Generalizable Re-synchronization Model for Cued Speech based on a Multi-Cuer Corpus**|Lufei Gao et.al.|[2306.02596v1](http://arxiv.org/abs/2306.02596v1)|**[link](https://github.com/lufei321/resync-cs)**|\n", "2306.02546": "|**2023-06-05**|**LmPa: Improving Decompilation by Synergy of Large Language Model and Program Analysis**|Xiangzhe Xu et.al.|[2306.02546v1](http://arxiv.org/abs/2306.02546v1)|null|\n", "2306.02329": "|**2023-06-04**|**Multi-CLIP: Contrastive Vision-Language Pre-training for Question Answering tasks in 3D Scenes**|Alexandros Delitzas et.al.|[2306.02329v1](http://arxiv.org/abs/2306.02329v1)|null|\n", "2306.02307": "|**2023-06-04**|**Finding the SWEET Spot: Analysis and Improvement of Adaptive Inference in Low Resource Settings**|Daniel Rotem et.al.|[2306.02307v1](http://arxiv.org/abs/2306.02307v1)|null|\n", "2306.02259": "|**2023-06-04**|**Predicting Information Pathways Across Online Communities**|Yiqiao Jin et.al.|[2306.02259v1](http://arxiv.org/abs/2306.02259v1)|**[link](https://github.com/claws-lab/inpac)**|\n", "2306.02137": "|**2023-06-03**|**Inconsistent Matters: A Knowledge-guided Dual-consistency Network for Multi-modal Rumor Detection**|Mengzhu Sun et.al.|[2306.02137v1](http://arxiv.org/abs/2306.02137v1)|**[link](https://github.com/mengzsun/kdcn)**|\n", "2306.02050": "|**2023-06-06**|**Provable Dynamic Fusion for Low-Quality Multimodal Data**|Qingyang Zhang et.al.|[2306.02050v2](http://arxiv.org/abs/2306.02050v2)|**[link](https://github.com/qingyangzhang/qmf)**|\n", "2306.01929": "|**2023-06-02**|**Recent Advances of Local Mechanisms in Computer Vision: A Survey and Outlook of Recent Work**|Qiangchang Wang et.al.|[2306.01929v1](http://arxiv.org/abs/2306.01929v1)|null|\n", "2306.03899": "|**2023-06-06**|**Towards Label-free Scene Understanding by Vision Foundation Models**|Runnan Chen et.al.|[2306.03899v1](http://arxiv.org/abs/2306.03899v1)|**[link](https://github.com/runnanchen/label-free-scene-understanding)**|\n", "2306.03810": "|**2023-06-06**|**X-Align++: cross-modal cross-view alignment for Bird's-eye-view segmentation**|Shubhankar Borse et.al.|[2306.03810v1](http://arxiv.org/abs/2306.03810v1)|null|\n", "2306.03802": "|**2023-06-06**|**Learning to Ground Instructional Articles in Videos through Narrations**|Effrosyni Mavroudi et.al.|[2306.03802v1](http://arxiv.org/abs/2306.03802v1)|null|\n", "2306.03730": "|**2023-06-06**|**Modality-Agnostic Learning for Medical Image Segmentation Using Multi-modality Self-distillation**|Qisheng He et.al.|[2306.03730v1](http://arxiv.org/abs/2306.03730v1)|null|\n", "2306.03678": "|**2023-06-06**|**On the Difference of BERT-style and CLIP-style Text Encoders**|Zhihong Chen et.al.|[2306.03678v1](http://arxiv.org/abs/2306.03678v1)|**[link](https://github.com/zhjohnchan/bert-clip-synesthesia)**|\n", "2306.03650": "|**2023-06-06**|**A Quantum Probability Driven Framework for Joint Multi-Modal Sarcasm, Sentiment and Emotion Analysis**|Yaochen Liu et.al.|[2306.03650v1](http://arxiv.org/abs/2306.03650v1)|null|\n", "2306.03617": "|**2023-06-06**|**A Data-Efficient Approach for Long-Term Human Motion Prediction Using Maps of Dynamics**|Yufei Zhu et.al.|[2306.03617v1](http://arxiv.org/abs/2306.03617v1)|null|\n", "2306.03367": "|**2023-06-06**|**Bridging the Gap Between Multi-Step and One-Shot Trajectory Prediction via Self-Supervision**|Faris Janjo\u0161 et.al.|[2306.03367v1](http://arxiv.org/abs/2306.03367v1)|null|\n", "2306.03252": "|**2023-06-05**|**RACECAR -- The Dataset for High-Speed Autonomous Racing**|Amar Kulkarni et.al.|[2306.03252v1](http://arxiv.org/abs/2306.03252v1)|**[link](https://github.com/linklab-uva/racecar_data)**|\n", "2306.04445": "|**2023-06-07**|**Multi-modal Latent Diffusion**|Mustapha Bounoua et.al.|[2306.04445v1](http://arxiv.org/abs/2306.04445v1)|null|\n", "2306.04387": "|**2023-06-08**|**M$^3$IT: A Large-Scale Dataset towards Multi-Modal Multilingual Instruction Tuning**|Lei Li et.al.|[2306.04387v2](http://arxiv.org/abs/2306.04387v2)|null|\n", "2306.04362": "|**2023-06-07**|**Youku-mPLUG: A 10 Million Large-scale Chinese Video-Language Dataset for Pre-training and Benchmarks**|Haiyang Xu et.al.|[2306.04362v1](http://arxiv.org/abs/2306.04362v1)|**[link](https://github.com/x-plug/youku-mplug)**|\n", "2306.04272": "|**2023-06-07**|**On the Generalization of Multi-modal Contrastive Learning**|Qi Zhang et.al.|[2306.04272v1](http://arxiv.org/abs/2306.04272v1)|**[link](https://github.com/pku-ml/clip-help-simclr)**|\n", "2306.04163": "|**2023-06-07**|**Enhancing Virtual Assistant Intelligence: Precise Area Targeting for Instance-level User Intents beyond Metadata**|Mengyu Chen et.al.|[2306.04163v1](http://arxiv.org/abs/2306.04163v1)|null|\n", "2306.04083": "|**2023-06-07**|**Coverage Path Planning with Budget Constraints for Multiple Unmanned Ground Vehicles**|Vu Phi Tran et.al.|[2306.04083v1](http://arxiv.org/abs/2306.04083v1)|null|\n", "2306.04021": "|**2023-06-06**|**Energy-Based Models for Cross-Modal Localization using Convolutional Transformers**|Alan Wu et.al.|[2306.04021v1](http://arxiv.org/abs/2306.04021v1)|null|\n", "2306.05425": "|**2023-06-08**|**MIMIC-IT: Multi-Modal In-Context Instruction Tuning**|Bo Li et.al.|[2306.05425v1](http://arxiv.org/abs/2306.05425v1)|**[link](https://github.com/luodian/otter)**|\n", "2306.04928": "|**2023-06-08**|**Underwater Intention Recognition using Head Motion and Throat Vibration for Supernumerary Robotic Assistance**|Yuqin Guo et.al.|[2306.04928v1](http://arxiv.org/abs/2306.04928v1)|null|\n", "2306.06048": "|**2023-06-09**|**How Does Fine-Tuning Impact Out-of-Distribution Detection for Vision-Language Models?**|Yifei Ming et.al.|[2306.06048v1](http://arxiv.org/abs/2306.06048v1)|null|\n", "2306.05716": "|**2023-06-09**|**Pave the Way to Grasp Anything: Transferring Foundation Models for Universal Pick-Place Robots**|Jiange Yang et.al.|[2306.05716v1](http://arxiv.org/abs/2306.05716v1)|null|\n", "2306.05493": "|**2023-06-08**|**Multi-Modal Classifiers for Open-Vocabulary Object Detection**|Prannay Kaul et.al.|[2306.05493v1](http://arxiv.org/abs/2306.05493v1)|null|\n", "2306.07272": "|**2023-06-12**|**Zero-shot Composed Text-Image Retrieval**|Yikun Liu et.al.|[2306.07272v1](http://arxiv.org/abs/2306.07272v1)|**[link](https://github.com/Code-kunkun/ZS-CIR)**|\n", "2306.07257": "|**2023-06-12**|**MovieFactory: Automatic Movie Creation from Text using Large Generative Models for Language and Images**|Junchen Zhu et.al.|[2306.07257v1](http://arxiv.org/abs/2306.07257v1)|null|\n", "2306.07207": "|**2023-06-12**|**Valley: Video Assistant with Large Language model Enhanced abilitY**|Ruipu Luo et.al.|[2306.07207v1](http://arxiv.org/abs/2306.07207v1)|**[link](https://github.com/rupertluo/valley)**|\n", "2306.07196": "|**2023-06-12**|**Retrieval-Enhanced Contrastive Vision-Text Models**|Ahmet Iscen et.al.|[2306.07196v1](http://arxiv.org/abs/2306.07196v1)|null|\n", "2306.07187": "|**2023-06-12**|**Video-to-Music Recommendation using Temporal Alignment of Segments**|Laure Pr\u00e9tet et.al.|[2306.07187v1](http://arxiv.org/abs/2306.07187v1)|null|\n", "2306.07096": "|**2023-06-12**|**Global and Local Semantic Completion Learning for Vision-Language Pre-training**|Rong-Cheng Tu et.al.|[2306.07096v1](http://arxiv.org/abs/2306.07096v1)|**[link](https://github.com/iigroup/scl)**|\n", "2306.06885": "|**2023-06-12**|**NPVForensics: Jointing Non-critical Phonemes and Visemes for Deepfake Detection**|Yu Chen et.al.|[2306.06885v1](http://arxiv.org/abs/2306.06885v1)|null|\n", "2306.06691": "|**2023-06-11**|**Self-Enhancement Improves Text-Image Retrieval in Foundation Visual-Language Models**|Yuguang Yang et.al.|[2306.06691v1](http://arxiv.org/abs/2306.06691v1)|null|\n", "2306.06687": "|**2023-06-11**|**LAMM: Language-Assisted Multi-Modal Instruction-Tuning Dataset, Framework, and Benchmark**|Zhenfei Yin et.al.|[2306.06687v1](http://arxiv.org/abs/2306.06687v1)|**[link](https://github.com/openlamm/lamm)**|\n", "2306.06615": "|**2023-06-11**|**Empowering Molecule Discovery for Molecule-Caption Translation with Large Language Models: A ChatGPT Perspective**|Jiatong Li et.al.|[2306.06615v1](http://arxiv.org/abs/2306.06615v1)|**[link](https://github.com/phenixace/molregpt)**|\n", "2306.06583": "|**2023-06-11**|**REACT2023: the first Multi-modal Multiple Appropriate Facial Reaction Generation Challenge**|Siyang Song et.al.|[2306.06583v1](http://arxiv.org/abs/2306.06583v1)|**[link](https://github.com/reactmultimodalchallenge/baseline_react2023)**|\n", "2306.06494": "|**2023-06-10**|**Multi-modal Pre-training for Medical Vision-language Understanding and Generation: An Empirical Study with A New Benchmark**|Li Xu et.al.|[2306.06494v1](http://arxiv.org/abs/2306.06494v1)|**[link](https://github.com/control-xl/medical-vision-langauge-transformer)**|\n", "2306.06476": "|**2023-06-10**|**Modality Influence in Multimodal Machine Learning**|Abdelhamid Haouhat et.al.|[2306.06476v1](http://arxiv.org/abs/2306.06476v1)|null|\n", "2306.06465": "|**2023-06-10**|**Simultaneous Trajectory Optimization and Contact Selection for Multi-Modal Manipulation Planning**|Mengchao Zhang et.al.|[2306.06465v1](http://arxiv.org/abs/2306.06465v1)|null|\n", "2306.06410": "|**2023-06-10**|**OpenSR: Open-Modality Speech Recognition via Maintaining Multi-Modality Alignment**|Xize Cheng et.al.|[2306.06410v1](http://arxiv.org/abs/2306.06410v1)|**[link](https://github.com/exgc/opensr)**|\n", "2306.07744": "|**2023-06-13**|**Contrastive Learning-Based Audio to Lyrics Alignment for Multiple Languages**|Simon Durand et.al.|[2306.07744v1](http://arxiv.org/abs/2306.07744v1)|**[link](https://github.com/f90/jamendolyrics)**|\n", "2306.07646": "|**2023-06-13**|**Enhanced Multimodal Representation Learning with Cross-modal KD**|Mengxi Chen et.al.|[2306.07646v1](http://arxiv.org/abs/2306.07646v1)|null|\n", "2306.07505": "|**2023-06-13**|**Deep learning radiomics for assessment of gastroesophageal varices in people with compensated advanced chronic liver disease**|Lan Wang et.al.|[2306.07505v1](http://arxiv.org/abs/2306.07505v1)|null|\n", "2306.07303": "|**2023-06-11**|**A Comprehensive Survey on Applications of Transformers for Deep Learning Tasks**|Saidul Islam et.al.|[2306.07303v1](http://arxiv.org/abs/2306.07303v1)|null|\n", "2306.09347": "|**2023-06-15**|**Segment Any Point Cloud Sequences by Distilling Vision Foundation Models**|Youquan Liu et.al.|[2306.09347v1](http://arxiv.org/abs/2306.09347v1)|**[link](https://github.com/youquanl/segment-any-point-cloud)**|\n", "2306.09265": "|**2023-06-15**|**LVLM-eHub: A Comprehensive Evaluation Benchmark for Large Vision-Language Models**|Peng Xu et.al.|[2306.09265v1](http://arxiv.org/abs/2306.09265v1)|**[link](https://github.com/opengvlab/multi-modality-arena)**|\n", "2306.09093": "|**2023-06-15**|**Macaw-LLM: Multi-Modal Language Modeling with Image, Audio, Video, and Text Integration**|Chenyang Lyu et.al.|[2306.09093v1](http://arxiv.org/abs/2306.09093v1)|**[link](https://github.com/lyuchenyang/macaw-llm)**|\n", "2306.09067": "|**2023-06-15**|**Winning Solution for the CVPR2023 Visual Anomaly and Novelty Detection Challenge: Multimodal Prompting for Data-centric Anomaly Detection**|Yunkang Cao et.al.|[2306.09067v1](http://arxiv.org/abs/2306.09067v1)|**[link](https://github.com/caoyunkang/segment-any-anomaly)**|\n", "2306.08966": "|**2023-06-15**|**Training Multimedia Event Extraction With Generated Images and Captions**|Zilin Du et.al.|[2306.08966v1](http://arxiv.org/abs/2306.08966v1)|null|\n", "2306.08893": "|**2023-06-15**|**LOVM: Language-Only Vision Model Selection**|Orr Zohar et.al.|[2306.08893v1](http://arxiv.org/abs/2306.08893v1)|**[link](https://github.com/orrzohar/lovm)**|\n", "2306.08871": "|**2023-06-15**|**Med-MMHL: A Multi-Modal Dataset for Detecting Human- and LLM-Generated Misinformation in the Medical Domain**|Yanshen Sun et.al.|[2306.08871v1](http://arxiv.org/abs/2306.08871v1)|**[link](https://github.com/styxsys0927/med-mmhl)**|\n", "2306.08832": "|**2023-06-15**|**Contrasting Intra-Modal and Ranking Cross-Modal Hard Negatives to Enhance Visio-Linguistic Fine-grained Understanding**|Le Zhang et.al.|[2306.08832v1](http://arxiv.org/abs/2306.08832v1)|**[link](https://github.com/magiccircuit/enhance-finegrained)**|\n", "2306.08789": "|**2023-06-15**|**Efficient Token-Guided Image-Text Retrieval with Consistent Multimodal Contrastive Training**|Chong Liu et.al.|[2306.08789v1](http://arxiv.org/abs/2306.08789v1)|null|\n", "2306.08749": "|**2023-06-14**|**Utilizing Longitudinal Chest X-Rays and Reports to Pre-Fill Radiology Reports**|Qingqing Zhu et.al.|[2306.08749v1](http://arxiv.org/abs/2306.08749v1)|null|\n", "2306.08657": "|**2023-06-14**|**EMERSK -- Explainable Multimodal Emotion Recognition with Situational Knowledge**|Mijanur Palash et.al.|[2306.08657v1](http://arxiv.org/abs/2306.08657v1)|null|\n", "2306.08640": "|**2023-06-14**|**AssistGPT: A General Multi-modal Assistant that can Plan, Execute, Inspect, and Learn**|Difei Gao et.al.|[2306.08640v1](http://arxiv.org/abs/2306.08640v1)|null|\n", "2306.08522": "|**2023-06-14**|**Challenges of Indoor SLAM: A multi-modal multi-floor dataset for SLAM evaluation**|Pushyami Kaveti et.al.|[2306.08522v1](http://arxiv.org/abs/2306.08522v1)|**[link](https://github.com/neufieldrobotics/nufr-m3f)**|\n", "2306.08498": "|**2023-06-14**|**RISCLIP: Referring Image Segmentation Framework using CLIP**|Seoyeon Kim et.al.|[2306.08498v1](http://arxiv.org/abs/2306.08498v1)|**[link](https://github.com/Yeon07/RISCLIP)**|\n", "2306.08247": "|**2023-06-14**|**Diffusion in Diffusion: Cyclic One-Way Diffusion for Text-Vision-Conditioned Generation**|Yongqi Yang et.al.|[2306.08247v1](http://arxiv.org/abs/2306.08247v1)|null|\n", "2306.09851": "|**2023-06-16**|**Joint multi-modal Self-Supervised pre-training in Remote Sensing: Application to Methane Source Classification**|Paul Berg et.al.|[2306.09851v1](http://arxiv.org/abs/2306.09851v1)|null|\n", "2306.09546": "|**2023-06-15**|**Cross-Modal Video to Body-joints Augmentation for Rehabilitation Exercise Quality Assessment**|Ali Abedi et.al.|[2306.09546v1](http://arxiv.org/abs/2306.09546v1)|null|\n", "2306.09523": "|**2023-06-19**|**Tell Me Where to Go: A Composable Framework for Context-Aware Embodied Robot Navigation**|Harel Biggie et.al.|[2306.09523v2](http://arxiv.org/abs/2306.09523v2)|null|\n", "2306.09417": "|**2023-06-15**|**Diff-TTSG: Denoising probabilistic integrated speech and gesture synthesis**|Shivam Mehta et.al.|[2306.09417v1](http://arxiv.org/abs/2306.09417v1)|null|\n", "2306.11510": "|**2023-06-20**|**Pushing the Limits of 3D Shape Generation at Scale**|Wang Yu et.al.|[2306.11510v1](http://arxiv.org/abs/2306.11510v1)|null|\n", "2306.11504": "|**2023-06-20**|**Align, Adapt and Inject: Sound-guided Unified Image Generation**|Yue Yang et.al.|[2306.11504v1](http://arxiv.org/abs/2306.11504v1)|null|\n", "2306.11400": "|**2023-06-20**|**MuDPT: Multi-modal Deep-symphysis Prompt Tuning for Large Pre-trained Vision-Language Models**|Yongzhu Miao et.al.|[2306.11400v1](http://arxiv.org/abs/2306.11400v1)|**[link](https://github.com/mechrev0/mudpt)**|\n", "2306.11207": "|**2023-06-22**|**Quilt-1M: One Million Image-Text Pairs for Histopathology**|Wisdom Oluchi Ikezogwo et.al.|[2306.11207v2](http://arxiv.org/abs/2306.11207v2)|**[link](https://github.com/wisdomikezogwo/quilt1m)**|\n", "2306.11137": "|**2023-06-19**|**Deep Learning Framework with Multi-Head Dilated Encoders for Enhanced Segmentation of Cervical Cancer on Multiparametric Magnetic Resonance Imaging**|Reza Kalantar et.al.|[2306.11137v1](http://arxiv.org/abs/2306.11137v1)|null|\n", "2306.11065": "|**2023-06-19**|**Cross-Modal Attribute Insertions for Assessing the Robustness of Vision-and-Language Learning**|Shivaen Ramshetty et.al.|[2306.11065v1](http://arxiv.org/abs/2306.11065v1)|**[link](https://github.com/claws-lab/multimodal-robustness-xmai)**|\n", "2306.11025": "|**2023-06-19**|**Temporal Data Meets LLM -- Explainable Financial Time Series Forecasting**|Xinli Yu et.al.|[2306.11025v1](http://arxiv.org/abs/2306.11025v1)|null|\n", "2306.11020": "|**2023-06-19**|**Dual-Gated Fusion with Prefix-Tuning for Multi-Modal Relation Extraction**|Qian Li et.al.|[2306.11020v1](http://arxiv.org/abs/2306.11020v1)|null|\n", "2306.10830": "|**2023-06-19**|**3D VR Sketch Guided 3D Shape Prototyping and Exploration**|Ling Luo et.al.|[2306.10830v1](http://arxiv.org/abs/2306.10830v1)|**[link](https://github.com/rowl1ng/3dsketch2shape)**|\n", "2306.10799": "|**2023-06-19**|**SelfTalk: A Self-Supervised Commutative Training Diagram to Comprehend 3D Talking Faces**|Ziqiao Peng et.al.|[2306.10799v1](http://arxiv.org/abs/2306.10799v1)|**[link](https://github.com/psyai-net/SelfTalk_release)**|\n", "2306.10772": "|**2023-06-19**|**Learning an Interpretable End-to-End Network for Real-Time Acoustic Beamforming**|Hao Liang et.al.|[2306.10772v1](http://arxiv.org/abs/2306.10772v1)|null|\n", "2306.10750": "|**2023-06-19**|**WiCo: Win-win Cooperation of Bottom-up and Top-down Referring Image Segmentation**|Zesen Cheng et.al.|[2306.10750v1](http://arxiv.org/abs/2306.10750v1)|null|\n", "2306.10730": "|**2023-06-19**|**UniG3D: A Unified 3D Object Generation Dataset**|Qinghong Sun et.al.|[2306.10730v1](http://arxiv.org/abs/2306.10730v1)|null|\n", "2306.10687": "|**2023-06-19**|**Categories of Response-Based, Feature-Based, and Relation-Based Knowledge Distillation**|Chuanguang Yang et.al.|[2306.10687v1](http://arxiv.org/abs/2306.10687v1)|null|\n", "2306.10567": "|**2023-06-18**|**MIR-GAN: Refining Frame-Level Modality-Invariant Representations with Adversarial Network for Audio-Visual Speech Recognition**|Yuchen Hu et.al.|[2306.10567v1](http://arxiv.org/abs/2306.10567v1)|**[link](https://github.com/yuchen005/mir-gan)**|\n", "2306.12387": "|**2023-06-21**|**Solving Dialogue Grounding Embodied Task in a Simulated Environment using Further Masked Language Modeling**|Weijie Jack Zhang et.al.|[2306.12387v1](http://arxiv.org/abs/2306.12387v1)|null|\n", "2306.11762": "|**2023-06-20**|**MultiEarth 2023 Deforestation Challenge -- Team FOREVER**|Seunghan Park et.al.|[2306.11762v1](http://arxiv.org/abs/2306.11762v1)|null|\n", "2306.13076": "|**2023-06-22**|**A Comparison of Time-based Models for Multimodal Emotion Recognition**|Ege Kesim et.al.|[2306.13076v1](http://arxiv.org/abs/2306.13076v1)|null|\n", "2306.12819": "|**2023-06-22**|**XACML Extension for Graphs: Flexible Authorization Policy Specification and Datastore-independent Enforcement**|Aya Mohamed et.al.|[2306.12819v1](http://arxiv.org/abs/2306.12819v1)|null|\n", "2306.12795": "|**2023-06-22**|**Learning Unseen Modality Interaction**|Yunhua Zhang et.al.|[2306.12795v1](http://arxiv.org/abs/2306.12795v1)|null|\n", "2306.12725": "|**2023-06-22**|**Generative Multimodal Entity Linking**|Senbao Shi et.al.|[2306.12725v1](http://arxiv.org/abs/2306.12725v1)|**[link](https://github.com/hitsz-tmg/gemel)**|\n", "2306.12559": "|**2023-06-21**|**Exploring the Role of Audio in Video Captioning**|Yuhan Shen et.al.|[2306.12559v1](http://arxiv.org/abs/2306.12559v1)|null|\n", "2306.12525": "|**2023-06-21**|**LPFormer: LiDAR Pose Estimation Transformer with Multi-Task Network**|Dongqiangzi Ye et.al.|[2306.12525v1](http://arxiv.org/abs/2306.12525v1)|null|\n", "2306.13592": "|**2023-06-23**|**TACOformer:Token-channel compounded Cross Attention for Multimodal Emotion Recognition**|Xinda Li et.al.|[2306.13592v1](http://arxiv.org/abs/2306.13592v1)|null|\n", "2306.13285": "|**2023-06-23**|**Learning Scene Flow With Skeleton Guidance For 3D Action Recognition**|Vasileios Magoulianitis et.al.|[2306.13285v1](http://arxiv.org/abs/2306.13285v1)|null|\n", "2306.13240": "|**2023-06-22**|**Continuous Online Extrinsic Calibration of Fisheye Camera and LiDAR**|Jack Borer et.al.|[2306.13240v1](http://arxiv.org/abs/2306.13240v1)|null|\n", "2306.14795": "|**2023-06-26**|**MotionGPT: Human Motion as a Foreign Language**|Biao Jiang et.al.|[2306.14795v1](http://arxiv.org/abs/2306.14795v1)|**[link](https://github.com/openmotionlab/motiongpt)**|\n", "2306.14565": "|**2023-06-26**|**Aligning Large Multi-Modal Model with Robust Instruction Tuning**|Fuxiao Liu et.al.|[2306.14565v1](http://arxiv.org/abs/2306.14565v1)|**[link](https://github.com/FuxiaoLiu/LRV-Instruction)**|\n", "2306.14406": "|**2023-06-26**|**TCEIP: Text Condition Embedded Regression Network for Dental Implant Position Prediction**|Xinquan Yang et.al.|[2306.14406v1](http://arxiv.org/abs/2306.14406v1)|null|\n", "2306.14399": "|**2023-06-26**|**Mutual Query Network for Multi-Modal Product Image Segmentation**|Yun Guo et.al.|[2306.14399v1](http://arxiv.org/abs/2306.14399v1)|**[link](https://github.com/weifeng-github/mqn)**|\n", "2306.14177": "|**2023-06-25**|**Enhancing Mapless Trajectory Prediction through Knowledge Distillation**|Yuning Wang et.al.|[2306.14177v1](http://arxiv.org/abs/2306.14177v1)|null|\n", "2306.14170": "|**2023-06-25**|**AV-SepFormer: Cross-Attention SepFormer for Audio-Visual Target Speaker Extraction**|Jiuxin Lin et.al.|[2306.14170v1](http://arxiv.org/abs/2306.14170v1)|**[link](https://github.com/lin9x/av-sepformer)**|\n", "2306.14143": "|**2023-06-25**|**Intelligent Multi-Modal Sensing-Communication Integration: Synesthesia of Machines**|Xiang Cheng et.al.|[2306.14143v1](http://arxiv.org/abs/2306.14143v1)|null|\n", "2306.14125": "|**2023-06-25**|**M$^3$SC: A Generic Dataset for Mixed Multi-Modal (MMM) Sensing and Communication Integration**|Xiang Cheng et.al.|[2306.14125v1](http://arxiv.org/abs/2306.14125v1)|null|\n", "2306.14112": "|**2023-06-25**|**Enhancing Dynamic Image Advertising with Vision-Language Pre-training**|Zhoufutu Wen et.al.|[2306.14112v1](http://arxiv.org/abs/2306.14112v1)|null|\n", "2306.13856": "|**2023-06-24**|**Learning-to-Rank Meets Language: Boosting Language-Driven Ordering Alignment for Ordinal Classification**|Rui Wang et.al.|[2306.13856v1](http://arxiv.org/abs/2306.13856v1)|**[link](https://github.com/raywang335/l2rclip)**|\n", "2306.13804": "|**2023-06-27**|**Cross-Language Speech Emotion Recognition Using Multimodal Dual Attention Transformers**|Syed Aun Muhammad Zaidi et.al.|[2306.13804v2](http://arxiv.org/abs/2306.13804v2)|null|\n", "2306.15644": "|**2023-06-27**|**Style-transfer based Speech and Audio-visual Scene Understanding for Robot Action Sequence Acquisition from Videos**|Chiori Hori et.al.|[2306.15644v1](http://arxiv.org/abs/2306.15644v1)|null|\n", "2306.15612": "|**2023-06-27**|**Rethinking Cross-Entropy Loss for Stereo Matching Networks**|Peng Xu et.al.|[2306.15612v1](http://arxiv.org/abs/2306.15612v1)|null|\n", "2306.15605": "|**2023-06-27**|**Deep Normalizing Flows for State Estimation**|Harrison Delecki et.al.|[2306.15605v1](http://arxiv.org/abs/2306.15605v1)|**[link](https://github.com/sisl/deepnfstateestimation)**|\n", "2306.15464": "|**2023-06-27**|**Large-scale unsupervised audio pre-training for video-to-speech synthesis**|Triantafyllos Kefalas et.al.|[2306.15464v1](http://arxiv.org/abs/2306.15464v1)|null|\n", "2306.15255": "|**2023-06-27**|**GroundNLQ @ Ego4D Natural Language Queries Challenge 2023**|Zhijian Hou et.al.|[2306.15255v1](http://arxiv.org/abs/2306.15255v1)|**[link](https://github.com/houzhijian/groundnlq)**|\n", "2306.15231": "|**2023-06-27**|**Emulating Reader Behaviors for Fake News Detection**|Junwei Yin et.al.|[2306.15231v1](http://arxiv.org/abs/2306.15231v1)|null|\n", "2306.15114": "|**2023-06-26**|**Transfer: Cross Modality Knowledge Transfer using Adversarial Networks -- A Study on Gesture Recognition**|Payal Kamboj et.al.|[2306.15114v1](http://arxiv.org/abs/2306.15114v1)|null|\n", "2306.16349": "|**2023-06-28**|**Accurate, uncertainty-aware classification of molecular chemical motifs from multi-modal X-ray absorption spectroscopy**|Matthew R. Carbone et.al.|[2306.16349v1](http://arxiv.org/abs/2306.16349v1)|null|\n", "2306.16329": "|**2023-06-28**|**DiffComplete: Diffusion-based Generative 3D Shape Completion**|Ruihang Chu et.al.|[2306.16329v1](http://arxiv.org/abs/2306.16329v1)|null|\n", "2306.16207": "|**2023-06-28**|**Inferring the Goals of Communicating Agents from Actions and Instructions**|Lance Ying et.al.|[2306.16207v1](http://arxiv.org/abs/2306.16207v1)|null|\n", "2306.16034": "|**2023-06-28**|**Stone Needle: A General Multimodal Large-scale Model Framework towards Healthcare**|Weihua Liu et.al.|[2306.16034v1](http://arxiv.org/abs/2306.16034v1)|null|\n", "2306.15977": "|**2023-06-28**|**A Dimensional Structure based Knowledge Distillation Method for Cross-Modal Learning**|Lingyu Si et.al.|[2306.15977v1](http://arxiv.org/abs/2306.15977v1)|null|\n", "2306.15955": "|**2023-06-29**|**Bridging the Gap: Neural Collapse Inspired Prompt Tuning for Generalization under Class Imbalance**|Didi Zhu et.al.|[2306.15955v2](http://arxiv.org/abs/2306.15955v2)|null|\n", "2306.15946": "|**2023-06-28**|**Knowledge-Enhanced Hierarchical Information Correlation Learning for Multi-Modal Rumor Detection**|Jiawei Liu et.al.|[2306.15946v1](http://arxiv.org/abs/2306.15946v1)|null|\n", "2306.15943": "|**2023-06-28**|**No Transfers Required: Integrating Last Mile with Public Transit Using Opti-Mile**|Raashid Altaf et.al.|[2306.15943v1](http://arxiv.org/abs/2306.15943v1)|null|\n", "2306.15837": "|**2023-06-27**|**Symbol emergence as interpersonal cross-situational learning: the emergence of lexical knowledge with combinatoriality**|Yoshinobu Hagiwara et.al.|[2306.15837v1](http://arxiv.org/abs/2306.15837v1)|null|\n", "2306.15808": "|**2023-06-27**|**Classification of Infant Sleep/Wake States: Cross-Attention among Large Scale Pretrained Transformer Networks using Audio, ECG, and IMU Data**|Kai Chieh Chang et.al.|[2306.15808v1](http://arxiv.org/abs/2306.15808v1)|null|\n", "2306.15711": "|**2023-06-27**|**Semi-supervised Multimodal Representation Learning through a Global Workspace**|Benjamin Devillers et.al.|[2306.15711v1](http://arxiv.org/abs/2306.15711v1)|**[link](https://github.com/bdvllrs/bimgw)**|\n", "2306.17115": "|**2023-07-03**|**Michelangelo: Conditional 3D Shape Generation based on Shape-Image-Text Aligned Latent Representation**|Zibo Zhao et.al.|[2306.17115v2](http://arxiv.org/abs/2306.17115v2)|**[link](https://github.com/neuralcarver/michelangelo)**|\n", "2306.17107": "|**2023-06-29**|**LLaVAR: Enhanced Visual Instruction Tuning for Text-Rich Image Understanding**|Yanzhe Zhang et.al.|[2306.17107v1](http://arxiv.org/abs/2306.17107v1)|**[link](https://github.com/SALT-NLP/LLaVAR)**|\n", "2306.17000": "|**2023-06-29**|**MotionTrack: End-to-End Transformer-based Multi-Object Tracing with LiDAR-Camera Fusion**|Ce Zhang et.al.|[2306.17000v1](http://arxiv.org/abs/2306.17000v1)|null|\n", "2306.16927": "|**2023-06-29**|**End-to-end Autonomous Driving: Challenges and Frontiers**|Li Chen et.al.|[2306.16927v1](http://arxiv.org/abs/2306.16927v1)|**[link](https://github.com/opendrivelab/end-to-end-autonomous-driving)**|\n", "2306.16862": "|**2023-06-29**|**Sustainable Palm Tree Farming: Leveraging IoT and Multi-Modal Data for Early Detection and Mapping of Red Palm Weevil**|Yosra Hajjaji et.al.|[2306.16862v1](http://arxiv.org/abs/2306.16862v1)|null|\n", "2306.16762": "|**2023-06-29**|**Unified Language Representation for Question Answering over Text, Tables, and Images**|Bowen Yu et.al.|[2306.16762v1](http://arxiv.org/abs/2306.16762v1)|null|\n", "2306.16478": "|**2023-06-28**|**Pre-Training Multi-Modal Dense Retrievers for Outside-Knowledge Visual Question Answering**|Alireza Salemi et.al.|[2306.16478v1](http://arxiv.org/abs/2306.16478v1)|**[link](https://github.com/alirezasalemi7/pretraining-multimodal-dense-retriever-for-okvqa)**|\n", "2306.17525": "|**2023-06-30**|**MeLM, a generative pretrained language modeling framework that solves forward and inverse mechanics problems**|Markus J. Buehler et.al.|[2306.17525v1](http://arxiv.org/abs/2306.17525v1)|null|\n", "2306.17400": "|**2023-06-30**|**Topological Data Analysis Guided Segment Anything Model Prompt Optimization for Zero-Shot Segmentation in Biological Imaging**|Ruben Glatt et.al.|[2306.17400v1](http://arxiv.org/abs/2306.17400v1)|null|\n", "2306.17371": "|**2023-06-30**|**Capturing functional connectomics using Riemannian partial least squares**|Matt Ryan et.al.|[2306.17371v1](http://arxiv.org/abs/2306.17371v1)|null|\n", "2307.01146": "|**2023-07-05**|**AVSegFormer: Audio-Visual Segmentation with Transformer**|Shengyi Gao et.al.|[2307.01146v2](http://arxiv.org/abs/2307.01146v2)|**[link](https://github.com/vvvb-github/avsegformer)**|\n", "2307.01124": "|**2023-07-03**|**Cross-modality Attention Adapter: A Glioma Segmentation Fine-tuning Method for SAM Using Multimodal Brain MR Images**|Xiaoyu Shi et.al.|[2307.01124v1](http://arxiv.org/abs/2307.01124v1)|null|\n", "2307.01121": "|**2023-07-03**|**Artifacts Mapping: Multi-Modal Semantic Mapping for Object Detection and 3D Localization**|Federico Rollo et.al.|[2307.01121v1](http://arxiv.org/abs/2307.01121v1)|null|\n", "2307.01047": "|**2023-07-03**|**Cross-modal Place Recognition in Image Databases using Event-based Sensors**|Xiang Ji et.al.|[2307.01047v1](http://arxiv.org/abs/2307.01047v1)|null|\n", "2307.01003": "|**2023-07-03**|**Visual Instruction Tuning with Polite Flamingo**|Delong Chen et.al.|[2307.01003v1](http://arxiv.org/abs/2307.01003v1)|**[link](https://github.com/chendelong1999/polite_flamingo)**|\n", "2307.00997": "|**2023-07-03**|**RefSAM: Efficiently Adapting Segmenting Anything Model for Referring Video Object Segmentation**|Yonglin Li et.al.|[2307.00997v1](http://arxiv.org/abs/2307.00997v1)|**[link](https://github.com/lancasterli/refsam)**|\n", "2307.00954": "|**2023-07-03**|**HODINet: High-Order Discrepant Interaction Network for RGB-D Salient Object Detection**|Kang Yi et.al.|[2307.00954v1](http://arxiv.org/abs/2307.00954v1)|null|\n", "2307.00877": "|**2023-07-03**|**Exploring the Multi-modal Demand Dynamics During Transport System Disruptions**|Ali Shateri Benam et.al.|[2307.00877v1](http://arxiv.org/abs/2307.00877v1)|null|\n", "2307.00873": "|**2023-07-03**|**End-To-End Prediction of Knee Osteoarthritis Progression With Multi-Modal Transformers**|Egor Panfilov et.al.|[2307.00873v1](http://arxiv.org/abs/2307.00873v1)|null|\n", "2307.00716": "|**2023-07-03**|**JourneyDB: A Benchmark for Generative Image Understanding**|Junting Pan et.al.|[2307.00716v1](http://arxiv.org/abs/2307.00716v1)|null|\n", "2307.00671": "|**2023-07-02**|**Leveraging Multi-modal Sensing for Robotic Insertion Tasks in R&D Laboratories**|Aaron Butterworth et.al.|[2307.00671v1](http://arxiv.org/abs/2307.00671v1)|null|\n", "2307.00610": "|**2023-07-02**|**Fraunhofer SIT at CheckThat! 2023: Mixing Single-Modal Classifiers to Estimate the Check-Worthiness of Multi-Modal Tweets**|Raphael Frick et.al.|[2307.00610v1](http://arxiv.org/abs/2307.00610v1)|null|\n", "2307.00595": "|**2023-07-02**|**RH20T: A Robotic Dataset for Learning Diverse Skills in One-Shot**|Hao-Shu Fang et.al.|[2307.00595v1](http://arxiv.org/abs/2307.00595v1)|null|\n", "2307.00536": "|**2023-07-02**|**Referring Video Object Segmentation with Inter-Frame Interaction and Cross-Modal Correlation**|Meng Lan et.al.|[2307.00536v1](http://arxiv.org/abs/2307.00536v1)|null|\n", "2307.00398": "|**2023-07-01**|**ProbVLM: Probabilistic Adapter for Frozen Vison-Language Models**|Uddeshya Upadhyay et.al.|[2307.00398v1](http://arxiv.org/abs/2307.00398v1)|**[link](https://github.com/explainableml/probvlm)**|\n", "2307.02469": "|**2023-07-05**|**What Matters in Training a GPT4-Style Language Model with Multimodal Inputs?**|Yan Zeng et.al.|[2307.02469v1](http://arxiv.org/abs/2307.02469v1)|null|\n", "2307.02280": "|**2023-07-05**|**Interactive Image Segmentation with Cross-Modality Vision Transformers**|Kun Li et.al.|[2307.02280v1](http://arxiv.org/abs/2307.02280v1)|**[link](https://github.com/lik1996/icmformer)**|\n", "2307.02041": "|**2023-07-05**|**Multimodal Imbalance-Aware Gradient Modulation for Weakly-supervised Audio-Visual Video Parsing**|Jie Fu et.al.|[2307.02041v1](http://arxiv.org/abs/2307.02041v1)|null|\n", "2307.02003": "|**2023-07-05**|**Multi-Modal Prototypes for Open-Set Semantic Segmentation**|Yuhuan Yang et.al.|[2307.02003v1](http://arxiv.org/abs/2307.02003v1)|null|\n", "2307.01947": "|**2023-07-04**|**Causal Video Summarizer for Video Exploration**|Jia-Hong Huang et.al.|[2307.01947v1](http://arxiv.org/abs/2307.01947v1)|null|\n", "2307.01824": "|**2023-07-04**|**Multi-Channel Feature Extraction for Virtual Histological Staining of Photon Absorption Remote Sensing Images**|Marian Boktor et.al.|[2307.01824v1](http://arxiv.org/abs/2307.01824v1)|null|\n", "2307.01798": "|**2023-07-04**|**Edge-aware Multi-task Network for Integrating Quantification Segmentation and Uncertainty Prediction of Liver Tumor on Multi-modality Non-contrast MRI**|Xiaojiao Xiao et.al.|[2307.01798v1](http://arxiv.org/abs/2307.01798v1)|null|\n", "2307.01741": "|**2023-07-04**|**Ben-ge: Extending BigEarthNet with Geographical and Environmental Data**|Michael Mommert et.al.|[2307.01741v1](http://arxiv.org/abs/2307.01741v1)|**[link](https://github.com/hsg-aiml/ben-ge)**|\n", "2307.01704": "|**2023-07-04**|**Graph-Ensemble Learning Model for Multi-label Skin Lesion Classification using Dermoscopy and Clinical Images**|Peng Tang et.al.|[2307.01704v1](http://arxiv.org/abs/2307.01704v1)|null|\n", "2307.01691": "|**2023-07-06**|**SeePrivacy: Automated Contextual Privacy Policy Generation for Mobile Applications**|Shidong Pan et.al.|[2307.01691v2](http://arxiv.org/abs/2307.01691v2)|**[link](https://github.com/cpp4app/cpp4app)**|\n", "2307.01577": "|**2023-07-04**|**Conceptual Cognitive Maps Formation with Neural Successor Networks and Word Embeddings**|Paul Stoewer et.al.|[2307.01577v1](http://arxiv.org/abs/2307.01577v1)|null|\n", "2307.01515": "|**2023-07-04**|**LPN: Language-guided Prototypical Network for few-shot classification**|Kaihui Cheng et.al.|[2307.01515v1](http://arxiv.org/abs/2307.01515v1)|null|\n", "2307.01425": "|**2023-07-04**|**Consistent Multimodal Generation via A Unified GAN Framework**|Zhen Zhu et.al.|[2307.01425v1](http://arxiv.org/abs/2307.01425v1)|null|\n", "2307.01422": "|**2023-07-04**|**Generative Flow Networks: a Markov Chain Perspective**|Tristan Deleu et.al.|[2307.01422v1](http://arxiv.org/abs/2307.01422v1)|null|\n", "2307.03068": "|**2023-07-06**|**A Hybrid End-to-End Spatio-Temporal Attention Neural Network with Graph-Smooth Signals for EEG Emotion Recognition**|Shadi Sartipi et.al.|[2307.03068v1](http://arxiv.org/abs/2307.03068v1)|null|\n", "2307.02978": "|**2023-07-06**|**Multi-modal multi-class Parkinson disease classification using CNN and decision level fusion**|Sushanta Kumar Sahu et.al.|[2307.02978v1](http://arxiv.org/abs/2307.02978v1)|null|\n", "2307.02971": "|**2023-07-06**|**On the Cultural Gap in Text-to-Image Generation**|Bingshuai Liu et.al.|[2307.02971v1](http://arxiv.org/abs/2307.02971v1)|null|\n", "2307.02862": "|**2023-07-06**|**A Critical Look at the Current Usage of Foundation Model for Dense Recognition Task**|Shiqi Yang et.al.|[2307.02862v1](http://arxiv.org/abs/2307.02862v1)|null|\n", "2307.02796": "|**2023-07-06**|**VerifAI: Verified Generative AI**|Nan Tang et.al.|[2307.02796v1](http://arxiv.org/abs/2307.02796v1)|null|\n", "2307.02761": "|**2023-07-06**|**Cross-Modal Content Inference and Feature Enrichment for Cold-Start Recommendation**|Haokai Ma et.al.|[2307.02761v1](http://arxiv.org/abs/2307.02761v1)|null|\n", "2307.02730": "|**2023-07-06**|**Fine-grained Action Analysis: A Multi-modality and Multi-task Dataset of Figure Skating**|Sheng-Lan Liu et.al.|[2307.02730v1](http://arxiv.org/abs/2307.02730v1)|null|\n", "2307.03706": "|**2023-07-07**|**Counterion-controlled phase equilibria in a charge-regulated polymer solution**|Giulia L. Celora et.al.|[2307.03706v1](http://arxiv.org/abs/2307.03706v1)|null|\n", "2307.03638": "|**2023-07-07**|**Physical-aware Cross-modal Adversarial Network for Wearable Sensor-based Human Action Recognition**|Jianyuan Ni et.al.|[2307.03638v1](http://arxiv.org/abs/2307.03638v1)|null|\n", "2307.03623": "|**2023-07-07**|**Robust Human Detection under Visual Degradation via Thermal and mmWave Radar Fusion**|Kaiwen Cai et.al.|[2307.03623v1](http://arxiv.org/abs/2307.03623v1)|**[link](https://github.com/ramdrop/utm)**|\n", "2307.03535": "|**2023-07-07**|**Matching in the Wild: Learning Anatomical Embeddings for Multi-Modality Images**|Xiaoyu Bai et.al.|[2307.03535v1](http://arxiv.org/abs/2307.03535v1)|null|\n", "2307.03427": "|**2023-07-07**|**Merging-Diverging Hybrid Transformer Networks for Survival Prediction in Head and Neck Cancer**|Mingyuan Meng et.al.|[2307.03427v1](http://arxiv.org/abs/2307.03427v1)|**[link](https://github.com/mungomeng/survival-xsurv)**|\n", "2307.03388": "|**2023-07-07**|**General-Purpose Multimodal Transformer meets Remote Sensing Semantic Segmentation**|Nhi Kieu et.al.|[2307.03388v1](http://arxiv.org/abs/2307.03388v1)|**[link](https://github.com/nhikieu/spatialvolumetricmultimodal)**|\n", "2307.03373": "|**2023-07-07**|**All in One: Exploring Unified Vision-Language Tracking with Multi-Modal Alignment**|Chunhui Zhang et.al.|[2307.03373v1](http://arxiv.org/abs/2307.03373v1)|null|\n", "2307.03339": "|**2023-07-07**|**Open-Vocabulary Object Detection via Scene Graph Discovery**|Hengcan Shi et.al.|[2307.03339v1](http://arxiv.org/abs/2307.03339v1)|null|\n", "2307.03274": "|**2023-07-06**|**It is not Sexually Suggestive, It is Educative. Separating Sex Education from Suggestive Content on TikTok Videos**|Enfa George et.al.|[2307.03274v1](http://arxiv.org/abs/2307.03274v1)|null|\n", "2307.03240": "|**2023-07-06**|**Adaptive Generation of Privileged Intermediate Information for Visible-Infrared Person Re-Identification**|Mahdi Alehdaghi et.al.|[2307.03240v1](http://arxiv.org/abs/2307.03240v1)|null|\n", "2307.03591": "|**2023-07-06**|**Structure Guided Multi-modal Pre-trained Transformer for Knowledge Graph Reasoning**|Ke Liang et.al.|[2307.03591v1](http://arxiv.org/abs/2307.03591v1)|null|\n", "2307.04751": "|**2023-07-10**|**Shelving, Stacking, Hanging: Relational Pose Diffusion for Multi-modal Rearrangement**|Anthony Simeonov et.al.|[2307.04751v1](http://arxiv.org/abs/2307.04751v1)|null|\n", "2307.04749": "|**2023-07-10**|**Divide, Evaluate, and Refine: Evaluating and Improving Text-to-Image Alignment with Iterative VQA Feedback**|Jaskirat Singh et.al.|[2307.04749v1](http://arxiv.org/abs/2307.04749v1)|null|\n", "2307.04722": "|**2023-07-10**|**Advances and Challenges in Meta-Learning: A Technical Review**|Anna Vettoruzzo et.al.|[2307.04722v1](http://arxiv.org/abs/2307.04722v1)|null|\n", "2307.04470": "|**2023-07-10**|**Test-Time Adaptation for Nighttime Color-Thermal Semantic Segmentation**|Yexin Liu et.al.|[2307.04470v1](http://arxiv.org/abs/2307.04470v1)|null|\n", "2307.04461": "|**2023-07-10**|**Multi-modal Graph Learning over UMLS Knowledge Graphs**|Manuel Burger et.al.|[2307.04461v1](http://arxiv.org/abs/2307.04461v1)|**[link](https://github.com/ratschlab/mmugl)**|\n", "2307.04421": "|**2023-07-13**|**Towards Enabling Cardiac Digital Twins of Myocardial Infarction Using Deep Computational Models for Inverse Inference**|Lei Li et.al.|[2307.04421v2](http://arxiv.org/abs/2307.04421v2)|null|\n", "2307.04361": "|**2023-07-10**|**Enhancing Cross-lingual Transfer via Phonemic Transcription Integration**|Hoang H. Nguyen et.al.|[2307.04361v1](http://arxiv.org/abs/2307.04361v1)|**[link](https://github.com/nhhoang96/phonemic_xlingual)**|\n", "2307.04296": "|**2023-07-10**|**K-Space-Aware Cross-Modality Score for Synthesized Neuroimage Quality Assessment**|Jinbao Wang et.al.|[2307.04296v1](http://arxiv.org/abs/2307.04296v1)|null|\n", "2307.04231": "|**2023-07-09**|**Mx2M: Masked Cross-Modality Modeling in Domain Adaptation for 3D Semantic Segmentation**|Boxiang Zhang et.al.|[2307.04231v1](http://arxiv.org/abs/2307.04231v1)|null|\n", "2307.04129": "|**2023-07-09**|**Cross-modal Orthogonal High-rank Augmentation for RGB-Event Transformer-trackers**|Zhiyu Zhu et.al.|[2307.04129v1](http://arxiv.org/abs/2307.04129v1)|**[link](https://github.com/ZHU-Zhiyu/High-Rank_RGB-Event_Tracker)**|\n", "2307.04091": "|**2023-07-09**|**CMDFusion: Bidirectional Fusion Network with Cross-modality Knowledge Distillation for LIDAR Semantic Segmentation**|Jun Cen et.al.|[2307.04091v1](http://arxiv.org/abs/2307.04091v1)|null|\n", "2307.03990": "|**2023-07-08**|**FTFDNet: Learning to Detect Talking Face Video Manipulation with Tri-Modality Interaction**|Ganglai Wang et.al.|[2307.03990v1](http://arxiv.org/abs/2307.03990v1)|null|\n", "2307.03942": "|**2023-07-08**|**Ariadne's Thread:Using Text Prompts to Improve Segmentation of Infected Areas from Chest X-ray images**|Yi Zhong et.al.|[2307.03942v1](http://arxiv.org/abs/2307.03942v1)|**[link](https://github.com/junelin2333/languidemedseg-miccai2023)**|\n", "2307.03903": "|**2023-07-08**|**Adversarial Self-Attack Defense and Spatial-Temporal Relation Mining for Visible-Infrared Video Person Re-Identification**|Huafeng Li et.al.|[2307.03903v1](http://arxiv.org/abs/2307.03903v1)|null|\n", "2307.03798": "|**2023-07-07**|**CLIPMasterPrints: Fooling Contrastive Language-Image Pre-training Using Latent Variable Evolution**|Matthias Freiberger et.al.|[2307.03798v1](http://arxiv.org/abs/2307.03798v1)|**[link](https://github.com/matfrei/clipmasterprints)**|\n", "2307.05463": "|**2023-07-11**|**EgoVLPv2: Egocentric Video-Language Pre-training with Fusion in the Backbone**|Shraman Pramanick et.al.|[2307.05463v1](http://arxiv.org/abs/2307.05463v1)|null|\n", "2307.05435": "|**2023-07-11**|**One-Versus-Others Attention: Scalable Multimodal Integration**|Michal Golovanevsky et.al.|[2307.05435v1](http://arxiv.org/abs/2307.05435v1)|**[link](https://github.com/rsinghlab/ovo)**|\n", "2307.04978": "|**2023-07-11**|**Diffusion idea exploration for art generation**|Nikhil Verma et.al.|[2307.04978v1](http://arxiv.org/abs/2307.04978v1)|null|\n", "2307.06281": "|**2023-07-12**|**MMBench: Is Your Multi-modal Model an All-around Player?**|Yuan Liu et.al.|[2307.06281v1](http://arxiv.org/abs/2307.06281v1)|**[link](https://github.com/InternLM/opencompass)**|\n", "2307.06174": "|**2023-07-12**|**Identification in Multiple Treatment Models under Discrete Variation**|Vishal Kamat et.al.|[2307.06174v1](http://arxiv.org/abs/2307.06174v1)|null|\n", "2307.05591": "|**2023-07-10**|**SITTA: A Semantic Image-Text Alignment for Image Captioning**|Fabian Paischer et.al.|[2307.05591v1](http://arxiv.org/abs/2307.05591v1)|**[link](https://github.com/ml-jku/semantic-image-text-alignment)**|\n", "2307.06505": "|**2023-07-13**|**WaterScenes: A Multi-Task 4D Radar-Camera Fusion Dataset and Benchmark for Autonomous Driving on Water Surfaces**|Shanliang Yao et.al.|[2307.06505v1](http://arxiv.org/abs/2307.06505v1)|**[link](https://github.com/waterscenes/waterscenes)**|\n", "2307.06424": "|**2023-07-12**|**Robust scalable initialization for Bayesian variational inference with multi-modal Laplace approximations**|Wyatt Bridgman et.al.|[2307.06424v1](http://arxiv.org/abs/2307.06424v1)|null|\n", "2307.07453": "|**2023-07-14**|**Investigation of Deep Learning-Based Filtered Density Function for Large Eddy Simulation of Turbulent Scalar Mixing**|Shubhangi Bansude et.al.|[2307.07453v1](http://arxiv.org/abs/2307.07453v1)|null|\n", "2307.07362": "|**2023-07-14**|**A scoping review on multimodal deep learning in biomedical images and texts**|Zhaoyi Sun et.al.|[2307.07362v1](http://arxiv.org/abs/2307.07362v1)|null|\n", "2307.07341": "|**2023-07-14**|**PiTL: Cross-modal Retrieval with Weakly-supervised Vision-language Pre-training via Prompting**|Zixin Guo et.al.|[2307.07341v1](http://arxiv.org/abs/2307.07341v1)|null|\n", "2307.07184": "|**2023-07-14**|**TVPR: Text-to-Video Person Retrieval and a New Benchmark**|Fan Ni et.al.|[2307.07184v1](http://arxiv.org/abs/2307.07184v1)|null|\n", "2307.07177": "|**2023-07-14**|**TriFormer: A Multi-modal Transformer Framework For Mild Cognitive Impairment Conversion Prediction**|Linfeng Liu et.al.|[2307.07177v1](http://arxiv.org/abs/2307.07177v1)|null|\n", "2307.07142": "|**2023-07-14**|**CFI2P: Coarse-to-Fine Cross-Modal Correspondence Learning for Image-to-Point Cloud Registration**|Gongxin Yao et.al.|[2307.07142v1](http://arxiv.org/abs/2307.07142v1)|null|\n", "2307.07135": "|**2023-07-14**|**MMSD2.0: Towards a Reliable Multi-modal Sarcasm Detection System**|Libo Qin et.al.|[2307.07135v1](http://arxiv.org/abs/2307.07135v1)|**[link](https://github.com/joeying1019/mmsd2.0)**|\n", "2307.08581": "|**2023-07-17**|**BuboGPT: Enabling Visual Grounding in Multi-Modal LLMs**|Yang Zhao et.al.|[2307.08581v1](http://arxiv.org/abs/2307.08581v1)|null|\n", "2307.08492": "|**2023-07-17**|**SVDFormer: Complementing Point Cloud via Self-view Augmentation and Self-structure Dual-generator**|Zhe Zhu et.al.|[2307.08492v1](http://arxiv.org/abs/2307.08492v1)|**[link](https://github.com/czvvd/svdformer)**|\n", "2307.08415": "|**2023-07-17**|**Monocular 3D Object Detection with LiDAR Guided Semi Supervised Active Learning**|Aral Hekimoglu et.al.|[2307.08415v1](http://arxiv.org/abs/2307.08415v1)|null|\n", "2307.08339": "|**2023-07-17**|**Multi-Task Cross-Modality Attention-Fusion for 2D Object Detection**|Huawei Sun et.al.|[2307.08339v1](http://arxiv.org/abs/2307.08339v1)|null|\n", "2307.08316": "|**2023-07-17**|**Bridging the Gap: Multi-Level Cross-Modality Joint Alignment for Visible-Infrared Person Re-Identification**|Tengfei Liang et.al.|[2307.08316v1](http://arxiv.org/abs/2307.08316v1)|null|\n", "2307.08238": "|**2023-07-17**|**Unified Open-Vocabulary Dense Visual Prediction**|Hengcan Shi et.al.|[2307.08238v1](http://arxiv.org/abs/2307.08238v1)|null|\n", "2307.08233": "|**2023-07-17**|**ROFusion: Efficient Object Detection using Hybrid Point-wise Radar-Optical Fusion**|Liu Liu et.al.|[2307.08233v1](http://arxiv.org/abs/2307.08233v1)|**[link](https://github.com/liuliu-55/rofusion)**|\n", "2307.08228": "|**2023-07-17**|**Video Frame Interpolation with Stereo Event and Intensity Camera**|Chao Ding et.al.|[2307.08228v1](http://arxiv.org/abs/2307.08228v1)|null|\n", "2307.08098": "|**2023-07-16**|**CalibNet: Dual-branch Cross-modal Calibration for RGB-D Salient Instance Segmentation**|Jialun Pei et.al.|[2307.08098v1](http://arxiv.org/abs/2307.08098v1)|**[link](https://github.com/pjlallen/calibnet)**|\n", "2307.08019": "|**2023-07-16**|**A Multi-model and Multi-scenario Assessment of the Impact of Climate Change on the Heating and Cooling Load Components of an Archetypical Residential Room in Major Indian Cities**|Raj S. Srivastava et.al.|[2307.08019v1](http://arxiv.org/abs/2307.08019v1)|null|\n", "2307.08016": "|**2023-07-16**|**Breaking Down the Task: A Unit-Grained Hybrid Training Framework for Vision and Language Decision Making**|Ruipu Luo et.al.|[2307.08016v1](http://arxiv.org/abs/2307.08016v1)|null|\n", "2307.07859": "|**2023-07-15**|**Unified Adversarial Patch for Cross-modal Attacks in the Physical World**|Xingxing Wei et.al.|[2307.07859v1](http://arxiv.org/abs/2307.07859v1)|null|\n", "2307.07807": "|**2023-07-15**|**MUVF-YOLOX: A Multi-modal Ultrasound Video Fusion Network for Renal Tumor Diagnosis**|Junyu Li et.al.|[2307.07807v1](http://arxiv.org/abs/2307.07807v1)|**[link](https://github.com/jeunyuli/muaf)**|\n", "2307.07791": "|**2023-07-15**|**Joint Adversarial and Collaborative Learning for Self-Supervised Action Recognition**|Tianyu Guo et.al.|[2307.07791v1](http://arxiv.org/abs/2307.07791v1)|**[link](https://github.com/levigty/acl)**|\n", "2307.07763": "|**2023-07-15**|**Tightly-Coupled LiDAR-Visual SLAM Based on Geometric Features for Mobile Agents**|Ke Cao et.al.|[2307.07763v1](http://arxiv.org/abs/2307.07763v1)|null|\n", "2307.09356": "|**2023-07-18**|**OnlineRefer: A Simple Online Baseline for Referring Video Object Segmentation**|Dongming Wu et.al.|[2307.09356v1](http://arxiv.org/abs/2307.09356v1)|**[link](https://github.com/wudongming97/onlinerefer)**|\n", "2307.09329": "|**2023-07-18**|**Towards a performance analysis on pre-trained Visual Question Answering models for autonomous driving**|Kaavya Rekanar et.al.|[2307.09329v1](http://arxiv.org/abs/2307.09329v1)|**[link](https://github.com/kaavyarekanar/towards-a-performance-analysis-on-pre-trained-vqa-models-for-autonomous-driving)**|\n", "2307.09323": "|**2023-07-18**|**Efficient Region-Aware Neural Radiance Fields for High-Fidelity Talking Portrait Synthesis**|Jiahe Li et.al.|[2307.09323v1](http://arxiv.org/abs/2307.09323v1)|**[link](https://github.com/fictionarry/er-nerf)**|\n", "2307.09312": "|**2023-07-18**|**Multi-Modal Discussion Transformer: Integrating Text, Images and Graph Transformers to Detect Hate Speech on Social Media**|Liam Hebert et.al.|[2307.09312v1](http://arxiv.org/abs/2307.09312v1)|**[link](https://github.com/liamhebert/multimodaldiscussiontransformer)**|\n", "2307.09306": "|**2023-07-18**|**EigenTrajectory: Low-Rank Descriptors for Multi-Modal Trajectory Forecasting**|Inhwan Bae et.al.|[2307.09306v1](http://arxiv.org/abs/2307.09306v1)|**[link](https://github.com/inhwanbae/eigentrajectory)**|\n", "2307.09184": "|**2023-07-18**|**You've Got Two Teachers: Co-evolutionary Image and Report Distillation for Semi-supervised Anatomical Abnormality Detection in Chest X-ray**|Jinghan Sun et.al.|[2307.09184v1](http://arxiv.org/abs/2307.09184v1)|null|\n", "2307.09155": "|**2023-07-18**|**MLF-DET: Multi-Level Fusion for Cross-Modal 3D Object Detection**|Zewei Lin et.al.|[2307.09155v1](http://arxiv.org/abs/2307.09155v1)|null|\n", "2307.09066": "|**2023-07-18**|**PatchCT: Aligning Patch Set and Label Set with Conditional Transport for Multi-Label Image Classification**|Miaoge Li et.al.|[2307.09066v1](http://arxiv.org/abs/2307.09066v1)|**[link](https://github.com/keepgoingjkg/patchct)**|\n", "2307.09059": "|**2023-07-18**|**Unleashing the Imagination of Text: A Novel Framework for Text-to-image Person Retrieval via Exploring the Power of Words**|Delong Liu et.al.|[2307.09059v1](http://arxiv.org/abs/2307.09059v1)|null|\n", "2307.09050": "|**2023-07-18**|**R-Cut: Enhancing Explainability in Vision Transformers with Relationship Weighted Out and Cut**|Yingjie Niu et.al.|[2307.09050v1](http://arxiv.org/abs/2307.09050v1)|null|\n", "2307.09036": "|**2023-07-18**|**PromptMagician: Interactive Prompt Engineering for Text-to-Image Creation**|Yingchaojie Feng et.al.|[2307.09036v1](http://arxiv.org/abs/2307.09036v1)|**[link](https://github.com/yingchaojiefeng/promptmagician)**|\n", "2307.08991": "|**2023-07-18**|**EgoVM: Achieving Precise Ego-Localization using Lightweight Vectorized Maps**|Yuzhe He et.al.|[2307.08991v1](http://arxiv.org/abs/2307.08991v1)|null|\n", "2307.08788": "|**2023-07-17**|**Uncovering Load-Altering Attacks Against N-1 Secure Power Grids: A Rare-Event Sampling Approach**|Maldon Patrice Goodridge et.al.|[2307.08788v1](http://arxiv.org/abs/2307.08788v1)|null|\n", "2307.08752": "|**2023-07-17**|**A Re-Appraisal of CO/O$_2$ Runaway on Habitable Planets Orbiting Low-Mass Stars**|Sukrit Ranjan et.al.|[2307.08752v1](http://arxiv.org/abs/2307.08752v1)|null|\n", "2307.10094": "|**2023-07-19**|**Make-A-Volume: Leveraging Latent Diffusion Models for Cross-Modality 3D Brain MRI Synthesis**|Lingting Zhu et.al.|[2307.10094v1](http://arxiv.org/abs/2307.10094v1)|null|\n", "2307.09931": "|**2023-07-19**|**DISA: DIfferentiable Similarity Approximation for Universal Multimodal Registration**|Matteo Ronchetti et.al.|[2307.09931v1](http://arxiv.org/abs/2307.09931v1)|**[link](https://github.com/imfusiongmbh/disa-universal-multimodal-registration)**|\n", "2307.09915": "|**2023-07-19**|**Embedded Heterogeneous Attention Transformer for Cross-lingual Image Captioning**|Zijie Song et.al.|[2307.09915v1](http://arxiv.org/abs/2307.09915v1)|null|\n", "2307.09823": "|**2023-07-19**|**Multi-modal Learning based Prediction for Disease**|Yaran Chen et.al.|[2307.09823v1](http://arxiv.org/abs/2307.09823v1)|null|\n", "2307.09769": "|**2023-07-19**|**Source-Free Domain Adaptation for Medical Image Segmentation via Prototype-Anchored Feature Alignment and Contrastive Learning**|Qinji Yu et.al.|[2307.09769v1](http://arxiv.org/abs/2307.09769v1)|**[link](https://github.com/cscyqj/miccai23-protocontra-sfda)**|\n", "2307.09749": "|**2023-07-19**|**Towards Robust Scene Text Image Super-resolution via Explicit Location Enhancement**|Hang Guo et.al.|[2307.09749v1](http://arxiv.org/abs/2307.09749v1)|**[link](https://github.com/csguoh/lemma)**|\n", "2307.09721": "|**2023-07-19**|**Multi-Grained Multimodal Interaction Network for Entity Linking**|Pengfei Luo et.al.|[2307.09721v1](http://arxiv.org/abs/2307.09721v1)|**[link](https://github.com/pengfei-luo/mimic)**|\n", "2307.10810": "|**2023-07-20**|**On Combining Expert Demonstrations in Imitation Learning via Optimal Transport**|Ilana Sebag et.al.|[2307.10810v1](http://arxiv.org/abs/2307.10810v1)|null|\n", "2307.10782": "|**2023-07-20**|**See More and Know More: Zero-shot Point Cloud Segmentation via Multi-modal Visual Data**|Yuhang Lu et.al.|[2307.10782v1](http://arxiv.org/abs/2307.10782v1)|null|\n", "2307.10763": "|**2023-07-20**|**MSQNet: Actor-agnostic Action Recognition with Multi-modal Query**|Anindya Mondal et.al.|[2307.10763v1](http://arxiv.org/abs/2307.10763v1)|**[link](https://github.com/mondalanindya/msqnet)**|\n", "2307.10685": "|**2023-07-20**|**Pre-train, Adapt and Detect: Multi-Task Adapter Tuning for Camouflaged Object Detection**|Yinghui Xing et.al.|[2307.10685v1](http://arxiv.org/abs/2307.10685v1)|null|\n", "2307.10601": "|**2023-07-20**|**SCA-PVNet: Self-and-Cross Attention Based Aggregation of Point Cloud and Multi-View for 3D Object Retrieval**|Dongyun Lin et.al.|[2307.10601v1](http://arxiv.org/abs/2307.10601v1)|null|\n", "2307.10577": "|**2023-07-21**|**Ethosight: A Reasoning-Guided Iterative Learning System for Nuanced Perception based on Joint-Embedding & Contextual Label Affinity**|Hugo Latapie et.al.|[2307.10577v2](http://arxiv.org/abs/2307.10577v2)|null|\n", "2307.10519": "|**2023-07-20**|**Probabilistic Multimodal Depth Estimation Based on Camera-LiDAR Sensor Fusion**|Johan S. Obando-Ceron et.al.|[2307.10519v1](http://arxiv.org/abs/2307.10519v1)|null|\n", "2307.10490": "|**2023-07-24**|**(Ab)using Images and Sounds for Indirect Instruction Injection in Multi-Modal LLMs**|Eugene Bagdasaryan et.al.|[2307.10490v3](http://arxiv.org/abs/2307.10490v3)|**[link](https://github.com/ebagdasa/multimodal_injection)**|\n", "2307.10475": "|**2023-07-19**|**Findings of Factify 2: Multimodal Fake News Detection**|S Suryavardan et.al.|[2307.10475v1](http://arxiv.org/abs/2307.10475v1)|null|\n", "2307.11552": "|**2023-07-21**|**A multi-modal representation of El Ni\u00f1o Southern Oscillation Diversity**|Jakob Schl\u00f6r et.al.|[2307.11552v1](http://arxiv.org/abs/2307.11552v1)|**[link](https://github.com/jakob-schloer/latentgmm)**|\n", "2307.11545": "|**2023-07-21**|**Bridging Vision and Language Encoders: Parameter-Efficient Tuning for Referring Image Segmentation**|Zunnan Xu et.al.|[2307.11545v1](http://arxiv.org/abs/2307.11545v1)|**[link](https://github.com/kkakkkka/etris)**|\n", "2307.11530": "|**2023-07-21**|**UWAT-GAN: Fundus Fluorescein Angiography Synthesis via Ultra-wide-angle Transformation Multi-scale GAN**|Zhaojie Fang et.al.|[2307.11530v1](http://arxiv.org/abs/2307.11530v1)|**[link](https://github.com/Tinysqua/UWAT-GAN)**|\n", "2307.11450": "|**2023-07-21**|**Topic Identification For Spontaneous Speech: Enriching Audio Features With Embedded Linguistic Information**|Dejan Porjazovski et.al.|[2307.11450v1](http://arxiv.org/abs/2307.11450v1)|**[link](https://github.com/aalto-speech/Topic-identification-for-spontaneous-Finnish-speech)**|\n", "2307.11323": "|**2023-07-21**|**HVDetFusion: A Simple and Robust Camera-Radar Fusion Framework**|Kai Lei et.al.|[2307.11323v1](http://arxiv.org/abs/2307.11323v1)|**[link](https://github.com/hvxlab/hvdetfusion)**|\n", "2307.12964": "|**2023-07-24**|**Audio-Enhanced Text-to-Video Retrieval using Text-Conditioned Feature Alignment**|Sarah Ibrahimi et.al.|[2307.12964v1](http://arxiv.org/abs/2307.12964v1)|null|\n", "2307.12853": "|**2023-07-25**|**Spatiotemporal Modeling Encounters 3D Medical Image Analysis: Slice-Shift UNet with Multi-View Fusion**|C. I. Ugwu et.al.|[2307.12853v2](http://arxiv.org/abs/2307.12853v2)|null|\n", "2307.12732": "|**2023-07-24**|**CLIP-KD: An Empirical Study of Distilling CLIP Models**|Chuanguang Yang et.al.|[2307.12732v1](http://arxiv.org/abs/2307.12732v1)|null|\n", "2307.12626": "|**2023-07-24**|**Enhancing Human-like Multi-Modal Reasoning: A New Challenging Dataset and Comprehensive Framework**|Jingxuan Wei et.al.|[2307.12626v1](http://arxiv.org/abs/2307.12626v1)|**[link](https://github.com/weijingxuan/COCO-MMR)**|\n", "2307.12577": "|**2023-07-24**|**PRIOR: Prototype Representation Joint Learning from Medical Images and Reports**|Pujin Cheng et.al.|[2307.12577v1](http://arxiv.org/abs/2307.12577v1)|**[link](https://github.com/qtacierp/prior)**|\n", "2307.12545": "|**2023-07-24**|**Towards Video Anomaly Retrieval from Video Anomaly Detection: New Benchmarks and Model**|Peng Wu et.al.|[2307.12545v1](http://arxiv.org/abs/2307.12545v1)|null|\n", "2307.12242": "|**2023-07-23**|**HealthPrism: A Visual Analytics System for Exploring Children's Physical and Mental Health Profiles with Multimodal Data**|Zhihan Jiang et.al.|[2307.12242v1](http://arxiv.org/abs/2307.12242v1)|null|\n", "2307.12236": "|**2023-07-23**|**Multi-Modal Machine Learning for Assessing Gaming Skills in Online Streaming: A Case Study with CS:GO**|Longxiang Zhang et.al.|[2307.12236v1](http://arxiv.org/abs/2307.12236v1)|null|\n", "2307.12180": "|**2023-07-22**|**Prototype-Driven and Multi-Expert Integrated Multi-Modal MR Brain Tumor Image Segmentation**|Yafei Zhang et.al.|[2307.12180v1](http://arxiv.org/abs/2307.12180v1)|**[link](https://github.com/linzy0227/pdminet)**|\n", "2307.12067": "|**2023-07-22**|**Replay: Multi-modal Multi-view Acted Videos for Casual Holography**|Roman Shapovalov et.al.|[2307.12067v1](http://arxiv.org/abs/2307.12067v1)|**[link](https://github.com/facebookresearch/replay_dataset)**|\n", "2307.12058": "|**2023-07-22**|**Discovering Spatio-Temporal Rationales for Video Question Answering**|Yicong Li et.al.|[2307.12058v1](http://arxiv.org/abs/2307.12058v1)|null|\n", "2307.11921": "|**2023-07-21**|**Poverty rate prediction using multi-modal survey and earth observation data**|Simone Fobi et.al.|[2307.11921v1](http://arxiv.org/abs/2307.11921v1)|null|\n", "2307.13600": "|**2023-07-25**|**Decisive Data using Multi-Modality Optical Sensors for Advanced Vehicular Systems**|Muhammad Ali Farooq et.al.|[2307.13600v1](http://arxiv.org/abs/2307.13600v1)|null|\n", "2307.13537": "|**2023-07-25**|**Spectrum-guided Multi-granularity Referring Video Object Segmentation**|Bo Miao et.al.|[2307.13537v1](http://arxiv.org/abs/2307.13537v1)|**[link](https://github.com/bo-miao/sgmg)**|\n", "2307.13529": "|**2023-07-25**|**Re-mine, Learn and Reason: Exploring the Cross-modal Semantic Correlations for Language-guided HOI detection**|Yichao Cao et.al.|[2307.13529v1](http://arxiv.org/abs/2307.13529v1)|null|\n", "2307.13205": "|**2023-07-25**|**Text-oriented Modality Reinforcement Network for Multimodal Sentiment Analysis from Unaligned Multimodal Sequences**|Yuxuan Lei et.al.|[2307.13205v1](http://arxiv.org/abs/2307.13205v1)|null|\n", "2307.13125": "|**2023-07-24**|**Deep Learning Approaches for Data Augmentation in Medical Imaging: A Review**|Aghiles Kebaili et.al.|[2307.13125v1](http://arxiv.org/abs/2307.13125v1)|null|\n", "2307.13069": "|**2023-07-24**|**General-Purpose Multi-Modal OOD Detection Framework**|Viet Duong et.al.|[2307.13069v1](http://arxiv.org/abs/2307.13069v1)|null|\n", "2307.14277": "|**2023-07-26**|**G2L: Semantically Aligned and Uniform Video Grounding via Geodesic and Game Theory**|Hongxiang Li et.al.|[2307.14277v1](http://arxiv.org/abs/2307.14277v1)|null|\n", "2307.14273": "|**2023-07-26**|**Deepfake Image Generation for Improved Brain Tumor Segmentation**|Roa'a Al-Emaryeen et.al.|[2307.14273v1](http://arxiv.org/abs/2307.14273v1)|null|\n", "2307.14244": "|**2023-07-26**|**Neural-based Cross-modal Search and Retrieval of Artwork**|Yan Gong et.al.|[2307.14244v1](http://arxiv.org/abs/2307.14244v1)|null|\n", "2307.14240": "|**2023-07-26**|**Boon: A Neural Search Engine for Cross-Modal Information Retrieval**|Yan Gong et.al.|[2307.14240v1](http://arxiv.org/abs/2307.14240v1)|null|\n", "2307.14185": "|**2023-07-26**|**A comparison of machine learning surrogate models of street-scale flooding in Norfolk, Virginia**|Diana McSpadden et.al.|[2307.14185v1](http://arxiv.org/abs/2307.14185v1)|null|\n", "2307.14126": "|**2023-07-26**|**Multi-modal Learning with Missing Modality via Shared-Specific Feature Modelling**|Hu Wang et.al.|[2307.14126v1](http://arxiv.org/abs/2307.14126v1)|null|\n", "2307.14061": "|**2023-07-26**|**Set-level Guidance Attack: Boosting Adversarial Transferability of Vision-Language Pre-training Models**|Dong Lu et.al.|[2307.14061v1](http://arxiv.org/abs/2307.14061v1)|**[link](https://github.com/Zoky-2020/Set-level_Guidance_Attack)**|\n", "2307.13950": "|**2023-07-26**|**Deep Robust Multi-Robot Re-localisation in Natural Environments**|Milad Ramezani et.al.|[2307.13950v1](http://arxiv.org/abs/2307.13950v1)|null|\n", "2307.13933": "|**2023-07-26**|**AIDE: A Vision-Driven Multi-View, Multi-Modal, Multi-Tasking Dataset for Assistive Driving Perception**|Dingkang Yang et.al.|[2307.13933v1](http://arxiv.org/abs/2307.13933v1)|**[link](https://github.com/ydk122024/aide)**|\n", "2307.13925": "|**2023-07-27**|**EasyNet: An Easy Network for 3D Industrial Anomaly Detection**|Ruitao Chen et.al.|[2307.13925v2](http://arxiv.org/abs/2307.13925v2)|null|\n", "2307.13871": "|**2023-07-26**|**Emulating Expert Insight: A Robust Strategy for Optimal Experimental Design**|Matthew R. Carbone et.al.|[2307.13871v1](http://arxiv.org/abs/2307.13871v1)|**[link](https://github.com/matthewcarbone/scientificvalueagent)**|\n", "2307.15016": "|**2023-07-27**|**How Good is Google Bard's Visual Understanding? An Empirical Study on Open Challenges**|Haotong Qin et.al.|[2307.15016v1](http://arxiv.org/abs/2307.15016v1)|**[link](https://github.com/htqin/googlebard-visunderstand)**|\n", "2307.14901": "|**2023-07-27**|**Text-guided Foundation Model Adaptation for Pathological Image Classification**|Yunkun Zhang et.al.|[2307.14901v1](http://arxiv.org/abs/2307.14901v1)|**[link](https://github.com/yunkun-zhang/cite)**|\n", "2307.14889": "|**2023-07-27**|**Weakly Supervised Multi-Modal 3D Human Body Pose Estimation for Autonomous Driving**|Peter Bauer et.al.|[2307.14889v1](http://arxiv.org/abs/2307.14889v1)|null|\n", "2307.14878": "|**2023-07-27**|**MESED: A Multi-modal Entity Set Expansion Dataset with Fine-grained Semantic Classes and Hard Negative Entities**|Yangning Li et.al.|[2307.14878v1](http://arxiv.org/abs/2307.14878v1)|**[link](https://github.com/thukelab/mesed)**|\n", "2307.14682": "|**2023-07-27**|**Unified Adversarial Patch for Visible-Infrared Cross-modal Attacks in the Physical World**|Xingxing Wei et.al.|[2307.14682v1](http://arxiv.org/abs/2307.14682v1)|**[link](https://github.com/aries-iai/cross-modal_patch_attack)**|\n", "2307.14619": "|**2023-07-29**|**Imitating Complex Trajectories: Bridging Low-Level Stability and High-Level Behavior**|Adam Block et.al.|[2307.14619v2](http://arxiv.org/abs/2307.14619v2)|null|\n", "2307.14572": "|**2023-07-27**|**Non-invasive Deep-Brain Imaging with 3D Integrated Photoacoustic Tomography and Ultrasound Localization Microscopy (3D-PAULM)**|Yuqi Tang et.al.|[2307.14572v1](http://arxiv.org/abs/2307.14572v1)|null|\n", "2307.14539": "|**2023-07-26**|**Plug and Pray: Exploiting off-the-shelf components of Multi-Modal Models**|Erfan Shayegani et.al.|[2307.14539v1](http://arxiv.org/abs/2307.14539v1)|null|\n", "2307.14523": "|**2023-07-26**|**Towards multi-modal anatomical landmark detection for ultrasound-guided brain tumor resection with contrastive learning**|Soorena Salari et.al.|[2307.14523v1](http://arxiv.org/abs/2307.14523v1)|null|\n", "2307.14491": "|**2023-07-26**|**Modality-Agnostic Audio-Visual Deepfake Detection**|Cai Yu et.al.|[2307.14491v1](http://arxiv.org/abs/2307.14491v1)|null|\n", "2307.15554": "|**2023-07-28**|**'What are you referring to?' Evaluating the Ability of Multi-Modal Dialogue Models to Process Clarificational Exchanges**|Javier Chiyah-Garcia et.al.|[2307.15554v1](http://arxiv.org/abs/2307.15554v1)|**[link](https://github.com/jchiyah/what-are-you-referring-to)**|\n", "2307.15460": "|**2023-07-28**|**Cross-Modal Concept Learning and Inference for Vision-Language Models**|Yi Zhang et.al.|[2307.15460v1](http://arxiv.org/abs/2307.15460v1)|null|\n", "2307.15432": "|**2023-07-28**|**CFN-ESA: A Cross-Modal Fusion Network with Emotion-Shift Awareness for Dialogue Emotion Recognition**|Jiang Li et.al.|[2307.15432v1](http://arxiv.org/abs/2307.15432v1)|null|\n", "2307.15344": "|**2023-07-28**|**Improving Audio-Text Retrieval via Hierarchical Cross-Modal Interaction and Auxiliary Captions**|Yifei Xin et.al.|[2307.15344v1](http://arxiv.org/abs/2307.15344v1)|null|\n", "2307.15220": "|**2023-07-27**|**Learning Multi-modal Representations by Watching Hundreds of Surgical Video Lectures**|Kun Yuan et.al.|[2307.15220v1](http://arxiv.org/abs/2307.15220v1)|**[link](https://github.com/camma-public/surgvlp)**|\n", "2307.15167": "|**2023-07-27**|**PEANUT: A Human-AI Collaborative Tool for Annotating Audio-Visual Data**|Zheng Zhang et.al.|[2307.15167v1](http://arxiv.org/abs/2307.15167v1)|null|\n", "2307.15097": "|**2023-07-27**|**Cascaded Cross-Modal Transformer for Request and Complaint Detection**|Nicolae-Catalin Ristea et.al.|[2307.15097v1](http://arxiv.org/abs/2307.15097v1)|null|\n", "2307.16896": "|**2023-07-31**|**Disruptive Autoencoders: Leveraging Low-level features for 3D Medical Image Pre-training**|Jeya Maria Jose Valanarasu et.al.|[2307.16896v1](http://arxiv.org/abs/2307.16896v1)|null|\n", "2307.16847": "|**2023-07-31**|**Latent Masking for Multimodal Self-supervised Learning in Health Timeseries**|Shohreh Deldari et.al.|[2307.16847v1](http://arxiv.org/abs/2307.16847v1)|null|\n", "2307.16745": "|**2023-07-31**|**Advancing Smart Malnutrition Monitoring: A Multi-Modal Learning Approach for Vital Health Parameter Estimation**|Ashish Marisetty et.al.|[2307.16745v1](http://arxiv.org/abs/2307.16745v1)|null|\n", "2307.16617": "|**2023-07-31**|**FULLER: Unified Multi-modality Multi-task 3D Perception via Multi-level Gradient Calibration**|Zhijian Huang et.al.|[2307.16617v1](http://arxiv.org/abs/2307.16617v1)|null|\n", "2307.16532": "|**2023-07-31**|**Echoes Beyond Points: Unleashing the Power of Raw Radar Data in Multi-modality Fusion**|Yang Liu et.al.|[2307.16532v1](http://arxiv.org/abs/2307.16532v1)|null|\n", "2307.16395": "|**2023-07-31**|**Bridging the Gap: Exploring the Capabilities of Bridge-Architectures for Complex Visual Reasoning Tasks**|Kousik Rajesh et.al.|[2307.16395v1](http://arxiv.org/abs/2307.16395v1)|null|\n", "2307.16366": "|**2023-07-31**|**Multi-modal Graph Neural Network for Early Diagnosis of Alzheimer's Disease from sMRI and PET Scans**|Yanteng Zhanga et.al.|[2307.16366v1](http://arxiv.org/abs/2307.16366v1)|null|\n", "2307.16210": "|**2023-08-01**|**Rethinking Uncertainly Missing and Ambiguous Visual Modality in Multi-Modal Entity Alignment**|Zhuo Chen et.al.|[2307.16210v2](http://arxiv.org/abs/2307.16210v2)|**[link](https://github.com/zjukg/umaea)**|\n", "2307.16142": "|**2023-07-30**|**Implicit Neural Representation in Medical Imaging: A Comparative Survey**|Amirali Molaei et.al.|[2307.16142v1](http://arxiv.org/abs/2307.16142v1)|**[link](https://github.com/mindflow-institue/awesome-implicit-neural-representations-in-medical-imaging)**|\n", "2307.16121": "|**2023-07-30**|**Uncertainty-Encoded Multi-Modal Fusion for Robust Object Detection in Autonomous Driving**|Yang Lou et.al.|[2307.16121v1](http://arxiv.org/abs/2307.16121v1)|null|\n", "2307.16106": "|**2023-07-30**|**TransFusion: A Practical and Effective Transformer-based Diffusion Model for 3D Human Motion Prediction**|Sibo Tian et.al.|[2307.16106v1](http://arxiv.org/abs/2307.16106v1)|null|\n", "2307.16013": "|**2023-07-29**|**Marrying Dialogue Systems with Data Visualization: Interactive Data Visualization Generation from Natural Language Conversations**|Yuanfeng Song et.al.|[2307.16013v1](http://arxiv.org/abs/2307.16013v1)|null|\n", "2307.15988": "|**2023-07-29**|**RGB-D-Fusion: Image Conditioned Depth Diffusion of Humanoid Subjects**|Sascha Kirch et.al.|[2307.15988v1](http://arxiv.org/abs/2307.15988v1)|**[link](https://github.com/sascha-kirch/rgb-d-fusion)**|\n", "2307.15942": "|**2023-07-29**|**CMDA: Cross-Modality Domain Adaptation for Nighttime Semantic Segmentation**|Ruihao Xia et.al.|[2307.15942v1](http://arxiv.org/abs/2307.15942v1)|**[link](https://github.com/xiarho/cmda)**|\n", "2307.15872": "|**2023-07-29**|**Cross-dimensional transfer learning in medical image segmentation with deep learning**|Hicham Messaoudi et.al.|[2307.15872v1](http://arxiv.org/abs/2307.15872v1)|**[link](https://github.com/hic-messaoudi/cross-dimensional-transfer-learning-in-medical-image-segmentation-with-deep-learning)**|\n", "2308.00692": "|**2023-08-03**|**LISA: Reasoning Segmentation via Large Language Model**|Xin Lai et.al.|[2308.00692v2](http://arxiv.org/abs/2308.00692v2)|**[link](https://github.com/dvlab-research/lisa)**|\n", "2308.00628": "|**2023-08-01**|**Human-M3: A Multi-view Multi-modal Dataset for 3D Human Pose Estimation in Outdoor Scenes**|Bohao Fan et.al.|[2308.00628v1](http://arxiv.org/abs/2308.00628v1)|**[link](https://github.com/soullessrobot/human-m3-dataset)**|\n", "2308.00588": "|**2023-08-01**|**Relation-Aware Distribution Representation Network for Person Clustering with Multiple Modalities**|Kaijian Liu et.al.|[2308.00588v1](http://arxiv.org/abs/2308.00588v1)|null|\n", "2308.00330": "|**2023-08-01**|**Advancing Frame-Dropping in Multi-Object Tracking-by-Detection Systems Through Event-Based Detection Triggering**|Matti Henning et.al.|[2308.00330v1](http://arxiv.org/abs/2308.00330v1)|null|\n", "2308.00295": "|**2023-08-01**|**Making the V in Text-VQA Matter**|Shamanthak Hegde et.al.|[2308.00295v1](http://arxiv.org/abs/2308.00295v1)|null|\n", "2308.00291": "|**2023-08-01**|**Fundus-Enhanced Disease-Aware Distillation Model for Retinal Disease Classification from OCT Images**|Lehan Wang et.al.|[2308.00291v1](http://arxiv.org/abs/2308.00291v1)|**[link](https://github.com/xmed-lab/fddm)**|\n", "2308.00264": "|**2023-08-01**|**Multi-Modality Multi-Loss Fusion Network**|Zehui Wu et.al.|[2308.00264v1](http://arxiv.org/abs/2308.00264v1)|null|\n", "2308.00235": "|**2023-08-01**|**Demonstrating Autonomous 3D Path Planning on a Novel Scalable UGV-UAV Morphing Robot**|Eric Sihite et.al.|[2308.00235v1](http://arxiv.org/abs/2308.00235v1)|null|\n", "2308.00228": "|**2023-08-01**|**Using Scene and Semantic Features for Multi-modal Emotion Recognition**|Zhifeng Wang et.al.|[2308.00228v1](http://arxiv.org/abs/2308.00228v1)|null|\n", "2307.16620": "|**2023-08-01**|**Audio-Visual Segmentation by Exploring Cross-Modal Mutual Semantics**|Chen Liu et.al.|[2307.16620v2](http://arxiv.org/abs/2307.16620v2)|null|\n", "2308.01217": "|**2023-08-02**|**TeachCLIP: Multi-Grained Teaching for Efficient Text-to-Video Retrieval**|Kaibin Tian et.al.|[2308.01217v1](http://arxiv.org/abs/2308.01217v1)|null|\n", "2308.01147": "|**2023-08-02**|**Contrast-augmented Diffusion Model with Fine-grained Sequence Alignment for Markup-to-Image Generation**|Guojin Zhong et.al.|[2308.01147v1](http://arxiv.org/abs/2308.01147v1)|**[link](https://github.com/zgj77/fsacdm)**|\n", "2308.01006": "|**2023-08-03**|**FusionAD: Multi-modality Fusion for Prediction and Planning Tasks of Autonomous Driving**|Tengju Ye et.al.|[2308.01006v2](http://arxiv.org/abs/2308.01006v2)|**[link](https://github.com/westlake-autolab/fusionad)**|\n", "2308.00980": "|**2023-08-02**|**Grasp Stability Assessment Through Attention-Guided Cross-Modality Fusion and Transfer Learning**|Zhuangzhuang Zhang et.al.|[2308.00980v1](http://arxiv.org/abs/2308.00980v1)|null|\n", "2308.00906": "|**2023-08-02**|**ImageBrush: Learning Visual In-Context Instructions for Exemplar-Based Image Manipulation**|Yasheng Sun et.al.|[2308.00906v1](http://arxiv.org/abs/2308.00906v1)|null|\n", "2308.00856": "|**2023-08-01**|**Differential Privacy for Adaptive Weight Aggregation in Federated Tumor Segmentation**|Muhammad Irfan Khan et.al.|[2308.00856v1](http://arxiv.org/abs/2308.00856v1)|null|\n", "2308.01731": "|**2023-08-03**|**Quantification of Predictive Uncertainty via Inference-Time Sampling**|Katar\u00edna T\u00f3thov\u00e1 et.al.|[2308.01731v1](http://arxiv.org/abs/2308.01731v1)|null|\n", "2308.01546": "|**2023-08-03**|**MusicLDM: Enhancing Novelty in Text-to-Music Generation Using Beat-Synchronous Mixup Strategies**|Ke Chen et.al.|[2308.01546v1](http://arxiv.org/abs/2308.01546v1)|**[link](https://github.com/retrocirce/musicldm)**|\n", "2308.01526": "|**2023-08-03**|**Data Augmentation for Human Behavior Analysis in Multi-Person Conversations**|Kun Li et.al.|[2308.01526v1](http://arxiv.org/abs/2308.01526v1)|null|\n", "2308.01328": "|**2023-08-02**|**A vision transformer-based framework for knowledge transfer from multi-modal to mono-modal lymphoma subtyping models**|Bilel Guetarni et.al.|[2308.01328v1](http://arxiv.org/abs/2308.01328v1)|null|\n", "2308.02487": "|**2023-08-04**|**Convolutions Die Hard: Open-Vocabulary Segmentation with Single Frozen Convolutional CLIP**|Qihang Yu et.al.|[2308.02487v1](http://arxiv.org/abs/2308.02487v1)|**[link](https://github.com/bytedance/fc-clip)**|\n", "2308.02463": "|**2023-08-04**|**Towards Generalist Foundation Model for Radiology**|Chaoyi Wu et.al.|[2308.02463v1](http://arxiv.org/abs/2308.02463v1)|**[link](https://github.com/chaoyi-wu/radfm)**|\n", "2308.02239": "|**2023-08-04**|**DTF-Net: Category-Level Pose Estimation and Shape Reconstruction via Deformable Template Field**|Haowen Wang et.al.|[2308.02239v1](http://arxiv.org/abs/2308.02239v1)|null|\n", "2308.02097": "|**2023-08-04**|**Multi-interactive Feature Learning and a Full-time Multi-modality Benchmark for Image Fusion and Segmentation**|Jinyuan Liu et.al.|[2308.02097v1](http://arxiv.org/abs/2308.02097v1)|**[link](https://github.com/jinyuanliu-cv/segmif)**|\n", "2308.01994": "|**2023-08-03**|**Explainable unsupervised multi-modal image registration using deep networks**|Chengjia Wang et.al.|[2308.01994v1](http://arxiv.org/abs/2308.01994v1)|null|\n", "2308.02299": "|**2023-08-03**|**RegionBLIP: A Unified Multi-modal Pre-training Framework for Holistic and Regional Comprehension**|Qiang Zhou et.al.|[2308.02299v1](http://arxiv.org/abs/2308.02299v1)|**[link](https://github.com/mightyzau/regionblip)**|\n", "2308.03729": "|**2023-08-07**|**Tiny LVLM-eHub: Early Multimodal Experiments with Bard**|Wenqi Shao et.al.|[2308.03729v1](http://arxiv.org/abs/2308.03729v1)|**[link](https://github.com/opengvlab/multi-modality-arena)**|\n", "2308.03666": "|**2023-08-07**|**Bridging Trustworthiness and Open-World Learning: An Exploratory Neural Approach for Enhancing Interpretability, Generalization, and Robustness**|Shide Du et.al.|[2308.03666v1](http://arxiv.org/abs/2308.03666v1)|null|\n", "2308.03475": "|**2023-08-07**|**COPA: Efficient Vision-Language Pre-training Through Collaborative Object- and Patch-Text Alignment**|Chaoya Jiang et.al.|[2308.03475v1](http://arxiv.org/abs/2308.03475v1)|null|\n", "2308.03432": "|**2023-08-07**|**Cuing Without Sharing: A Federated Cued Speech Recognition Framework via Mutual Knowledge Distillation**|Yuxuan Zhang et.al.|[2308.03432v1](http://arxiv.org/abs/2308.03432v1)|**[link](https://github.com/yuxuanzhang0713/fedcsr)**|\n", "2308.03424": "|**2023-08-07**|**CAESURA: Language Models as Multi-Modal Query Planners**|Matthias Urban et.al.|[2308.03424v1](http://arxiv.org/abs/2308.03424v1)|null|\n", "2308.03267": "|**2023-08-07**|**Redundancy-aware Transformer for Video Question Answering**|Yicong Li et.al.|[2308.03267v1](http://arxiv.org/abs/2308.03267v1)|null|\n", "2308.03256": "|**2023-08-07**|**Learning a Graph Neural Network with Cross Modality Interaction for Image Fusion**|Jiawei Li et.al.|[2308.03256v1](http://arxiv.org/abs/2308.03256v1)|**[link](https://github.com/lok-18/ignet)**|\n", "2308.03151": "|**2023-08-06**|**Food-500 Cap: A Fine-Grained Food Caption Benchmark for Evaluating Vision-Language Models**|Zheng Ma et.al.|[2308.03151v1](http://arxiv.org/abs/2308.03151v1)|**[link](https://github.com/aaronma2020/Food500-Cap)**|\n", "2308.03135": "|**2023-08-06**|**E-CLIP: Towards Label-efficient Event-based Open-world Understanding by CLIP**|Jiazhou Zhou et.al.|[2308.03135v1](http://arxiv.org/abs/2308.03135v1)|null|\n", "2308.02982": "|**2023-08-06**|**Beyond First Impressions: Integrating Joint Multi-modal Cues for Comprehensive 3D Representation**|Haowei Wang et.al.|[2308.02982v1](http://arxiv.org/abs/2308.02982v1)|**[link](https://github.com/mr-neko/jm3d)**|\n", "2308.02883": "|**2023-08-05**|**Cross-modal & Cross-domain Learning for Unsupervised LiDAR Semantic Segmentation**|Yiyang Chen et.al.|[2308.02883v1](http://arxiv.org/abs/2308.02883v1)|null|\n", "2308.02872": "|**2023-08-05**|**Data-Based Design of Multi-Model Inferential Sensors**|Martin Mojto et.al.|[2308.02872v1](http://arxiv.org/abs/2308.02872v1)|null|\n", "2308.02823": "|**2023-08-05**|**A Symbolic Character-Aware Model for Solving Geometry Problems**|Maizhen Ning et.al.|[2308.02823v1](http://arxiv.org/abs/2308.02823v1)|**[link](https://github.com/ning-mz/sca-gps)**|\n", "2308.04369": "|**2023-08-08**|**SSTFormer: Bridging Spiking Neural Network and Memory Support Transformer for Frame-Event based Recognition**|Xiao Wang et.al.|[2308.04369v1](http://arxiv.org/abs/2308.04369v1)|**[link](https://github.com/event-ahu/sstformer)**|\n", "2308.04352": "|**2023-08-08**|**3D-VisTA: Pre-trained Transformer for 3D Vision and Text Alignment**|Ziyu Zhu et.al.|[2308.04352v1](http://arxiv.org/abs/2308.04352v1)|null|\n", "2308.04343": "|**2023-08-08**|**Unifying Two-Stream Encoders with Transformers for Cross-Modal Retrieval**|Yi Bin et.al.|[2308.04343v1](http://arxiv.org/abs/2308.04343v1)|**[link](https://github.com/luminosityx/hat)**|\n", "2308.04126": "|**2023-08-08**|**OmniDataComposer: A Unified Data Structure for Multimodal Data Fusion and Infinite Data Generation**|Dongyang Yu et.al.|[2308.04126v1](http://arxiv.org/abs/2308.04126v1)|**[link](https://github.com/shajiayu1/OmniDataComposer)**|\n", "2308.04067": "|**2023-08-08**|**Online Distillation-enhanced Multi-modal Transformer for Sequential Recommendation**|Wei Ji et.al.|[2308.04067v1](http://arxiv.org/abs/2308.04067v1)|**[link](https://github.com/xyliugo/odmt)**|\n", "2308.03908": "|**2023-08-07**|**ViLP: Knowledge Exploration using Vision, Language, and Pose Embeddings for Video Action Recognition**|Soumyabrata Chaudhuri et.al.|[2308.03908v1](http://arxiv.org/abs/2308.03908v1)|null|\n", "2308.05061": "|**2023-08-09**|**Prompting In-Context Operator Learning with Sensor Data, Equations, and Natural Language**|Liu Yang et.al.|[2308.05061v1](http://arxiv.org/abs/2308.05061v1)|**[link](https://github.com/liuyangmage/in-context-operator-networks)**|\n", "2308.04992": "|**2023-08-09**|**AspectMMKG: A Multi-modal Knowledge Graph with Aspect-aware Entities**|Jingdan Zhang et.al.|[2308.04992v1](http://arxiv.org/abs/2308.04992v1)|**[link](https://github.com/thezjd/aspectmmkg)**|\n", "2308.04829": "|**2023-08-09**|**MixReorg: Cross-Modal Mixed Patch Reorganization is a Good Mask Learner for Open-World Semantic Segmentation**|Kaixin Cai et.al.|[2308.04829v1](http://arxiv.org/abs/2308.04829v1)|null|\n", "2308.04820": "|**2023-08-09**|**Strategic Interactions in Multi-modal Mobility Systems: A Game-Theoretic Perspective**|Gioele Zardini et.al.|[2308.04820v1](http://arxiv.org/abs/2308.04820v1)|null|\n", "2308.04779": "|**2023-08-09**|**Multi-View Fusion and Distillation for Subgrade Distresses Detection based on 3D-GPR**|Chunpeng Zhou et.al.|[2308.04779v1](http://arxiv.org/abs/2308.04779v1)|null|\n", "2308.04778": "|**2023-08-09**|**Multi-modal Multi-view Clustering based on Non-negative Matrix Factorization**|Yasser Khalafaoui et.al.|[2308.04778v1](http://arxiv.org/abs/2308.04778v1)|null|\n", "2308.04706": "|**2023-08-09**|**Pareto Invariant Representation Learning for Multimedia Recommendation**|Shanshan Huang et.al.|[2308.04706v1](http://arxiv.org/abs/2308.04706v1)|null|\n", "2308.04702": "|**2023-08-09**|**Continual Road-Scene Semantic Segmentation via Feature-Aligned Symmetric Multi-Modal Network**|Francesco Barbato et.al.|[2308.04702v1](http://arxiv.org/abs/2308.04702v1)|null|\n", "2308.04663": "|**2023-08-09**|**Classification of lung cancer subtypes on CT images with synthetic pathological priors**|Wentao Zhu et.al.|[2308.04663v1](http://arxiv.org/abs/2308.04663v1)|null|\n", "2308.04579": "|**2023-08-08**|**RECipe: Does a Multi-Modal Recipe Knowledge Graph Fit a Multi-Purpose Recommendation System?**|Ali Pesaranghader et.al.|[2308.04579v1](http://arxiv.org/abs/2308.04579v1)|null|\n", "2308.04556": "|**2023-08-08**|**FocalFormer3D : Focusing on Hard Instance for 3D Object Detection**|Yilun Chen et.al.|[2308.04556v1](http://arxiv.org/abs/2308.04556v1)|**[link](https://github.com/NVlabs/FocalFormer3D)**|\n", "2308.05667": "|**2023-08-14**|**2D3D-MATR: 2D-3D Matching Transformer for Detection-free Registration between Images and Point Clouds**|Minhao Li et.al.|[2308.05667v2](http://arxiv.org/abs/2308.05667v2)|**[link](https://github.com/minhaolee/2d3dmatr)**|\n", "2308.05648": "|**2023-08-10**|**Counterfactual Cross-modality Reasoning for Weakly Supervised Video Moment Localization**|Zezhong Lv et.al.|[2308.05648v1](http://arxiv.org/abs/2308.05648v1)|**[link](https://github.com/sldz0306/ccr)**|\n", "2308.05478": "|**2023-08-10**|**Reviewing 3D Object Detectors in the Context of High-Resolution 3+1D Radar**|Patrick Palmer et.al.|[2308.05478v1](http://arxiv.org/abs/2308.05478v1)|null|\n", "2308.05438": "|**2023-08-10**|**Deep Fusion Transformer Network with Weighted Vector-Wise Keypoints Voting for Robust 6D Object Pose Estimation**|Jun Zhou et.al.|[2308.05438v1](http://arxiv.org/abs/2308.05438v1)|**[link](https://github.com/junzastar/dftr_voting)**|\n", "2308.05421": "|**2023-08-10**|**Progressive Spatio-temporal Perception for Audio-Visual Question Answering**|Guangyao Li et.al.|[2308.05421v1](http://arxiv.org/abs/2308.05421v1)|**[link](https://github.com/gewu-lab/pstp-net)**|\n", "2308.05128": "|**2023-08-09**|**High-Level Features Parallelization for Inference Cost Reduction Through Selective Attention**|Andr\u00e9 Peter Kelm et.al.|[2308.05128v1](http://arxiv.org/abs/2308.05128v1)|null|\n", "2308.06262": "|**2023-08-11**|**Foundation Model is Efficient Multimodal Multitask Model Selector**|Fanqing Meng et.al.|[2308.06262v1](http://arxiv.org/abs/2308.06262v1)|**[link](https://github.com/opengvlab/multitask-model-selector)**|\n", "2308.06207": "|**2023-08-11**|**Thinking Like an Expert:Multimodal Hypergraph-of-Thought (HoT) Reasoning to boost Foundation Modals**|Fanglong Yao et.al.|[2308.06207v1](http://arxiv.org/abs/2308.06207v1)|null|\n", "2308.06125": "|**2023-08-11**|**Improving Joint Speech-Text Representations Without Alignment**|Cal Peyser et.al.|[2308.06125v1](http://arxiv.org/abs/2308.06125v1)|null|\n", "2308.06024": "|**2023-08-11**|**Spatial-information Guided Adaptive Context-aware Network for Efficient RGB-D Semantic Segmentation**|Yang Zhang et.al.|[2308.06024v1](http://arxiv.org/abs/2308.06024v1)|**[link](https://github.com/mvme-hbut/sgacnet)**|\n", "2308.06009": "|**2023-08-11**|**ViGT: Proposal-free Video Grounding with Learnable Token in Transformer**|Kun Li et.al.|[2308.06009v1](http://arxiv.org/abs/2308.06009v1)|null|\n", "2308.05993": "|**2023-08-11**|**Image-based Geolocalization by Ground-to-2.5D Map Matching**|Mengjie Zhou et.al.|[2308.05993v1](http://arxiv.org/abs/2308.05993v1)|**[link](https://github.com/zhoumengjie/2-5dmap-dataset)**|\n", "2308.05948": "|**2023-08-11**|**Uncertainty-Aware Cross-Modal Transfer Network for Sketch-Based 3D Shape Retrieval**|Yiyang Cai et.al.|[2308.05948v1](http://arxiv.org/abs/2308.05948v1)|null|\n", "2308.05864": "|**2023-08-10**|**The Multi-modality Cell Segmentation Challenge: Towards Universal Solutions**|Jun Ma et.al.|[2308.05864v1](http://arxiv.org/abs/2308.05864v1)|null|\n", "2308.07222": "|**2023-08-14**|**MM-GEF: Multi-modal representation meet collaborative filtering**|Hao Wu et.al.|[2308.07222v1](http://arxiv.org/abs/2308.07222v1)|null|\n", "2308.07214": "|**2023-08-14**|**Automated Ensemble-Based Segmentation of Adult Brain Tumors: A Novel Approach Using the BraTS AFRICA Challenge Data**|Chiranjeewee Prasad Koirala et.al.|[2308.07214v1](http://arxiv.org/abs/2308.07214v1)|null|\n", "2308.07173": "|**2023-08-14**|**Enhancing State Estimator for Autonomous Race Car : Leveraging Multi-modal System and Managing Computing Resources**|Daegyu Lee et.al.|[2308.07173v1](http://arxiv.org/abs/2308.07173v1)|null|\n", "2308.07146": "|**2023-08-14**|**CTP: Towards Vision-Language Continual Pretraining via Compatible Momentum Contrast and Topology Preservation**|Hongguang Zhu et.al.|[2308.07146v1](http://arxiv.org/abs/2308.07146v1)|**[link](https://github.com/kevinlight831/ctp)**|\n", "2308.07026": "|**2023-08-14**|**AdvCLIP: Downstream-agnostic Adversarial Examples in Multimodal Contrastive Learning**|Ziqi Zhou et.al.|[2308.07026v1](http://arxiv.org/abs/2308.07026v1)|**[link](https://github.com/cgcl-codes/advclip)**|\n", "2308.06911": "|**2023-08-14**|**GIT-Mol: A Multi-modal Large Language Model for Molecular Science with Graph, Image, and Text**|Pengfei Liu et.al.|[2308.06911v1](http://arxiv.org/abs/2308.06911v1)|null|\n", "2308.06866": "|**2023-08-13**|**Improving Face Recognition from Caption Supervision with Multi-Granular Contextual Feature Aggregation**|Md Mahedi Hasan et.al.|[2308.06866v1](http://arxiv.org/abs/2308.06866v1)|null|\n", "2308.06735": "|**2023-08-13**|**AerialVLN: Vision-and-Language Navigation for UAVs**|Shubo Liu et.al.|[2308.06735v1](http://arxiv.org/abs/2308.06735v1)|**[link](https://github.com/airvln/airvln)**|\n", "2308.06696": "|**2023-08-13**|**MACO: A Modality Adversarial and Contrastive Framework for Modality-missing Multi-modal Knowledge Graph Completion**|Yichi Zhang et.al.|[2308.06696v1](http://arxiv.org/abs/2308.06696v1)|**[link](https://github.com/zjukg/maco)**|\n", "2308.06573": "|**2023-08-12**|**4DRVO-Net: Deep 4D Radar-Visual Odometry Using Multi-Modal and Multi-Scale Adaptive Fusion**|Guirong Zhuo et.al.|[2308.06573v1](http://arxiv.org/abs/2308.06573v1)|null|\n", "2308.06556": "|**2023-08-12**|**Contrastive Learning for Cross-modal Artist Retrieval**|Andres Ferraro et.al.|[2308.06556v1](http://arxiv.org/abs/2308.06556v1)|null|\n", "2308.06530": "|**2023-08-12**|**BEV-DG: Cross-Modal Learning under Bird's-Eye View for Domain Generalization of 3D Semantic Segmentation**|Miaoyu Li et.al.|[2308.06530v1](http://arxiv.org/abs/2308.06530v1)|null|\n", "2308.06498": "|**2023-08-12**|**Latent Emission-Augmented Perspective-Taking (LEAPT) for Human-Robot Interaction**|Kaiqi Chen et.al.|[2308.06498v1](http://arxiv.org/abs/2308.06498v1)|null|\n", "2308.06394": "|**2023-08-11**|**Detecting and Preventing Hallucinations in Large Vision Language Models**|Anisha Gunjal et.al.|[2308.06394v1](http://arxiv.org/abs/2308.06394v1)|null|\n", "2308.06377": "|**2023-08-11**|**CATS v2: Hybrid encoders for robust medical segmentation**|Hao Li et.al.|[2308.06377v1](http://arxiv.org/abs/2308.06377v1)|**[link](https://github.com/haoli12345/cats)**|\n", "2308.07907": "|**2023-08-15**|**Sequential Monte Carlo with Cross-validated Neural Networks for Complexity of Hyperbolic Black Hole Solutions in 4D**|Armin Hatefi et.al.|[2308.07907v1](http://arxiv.org/abs/2308.07907v1)|null|\n", "2308.07777": "|**2023-08-15**|**Enhancing Visually-Rich Document Understanding via Layout Structure Modeling**|Qiwei Li et.al.|[2308.07777v1](http://arxiv.org/abs/2308.07777v1)|null|\n", "2308.07751": "|**2023-08-15**|**CASPNet++: Joint Multi-Agent Motion Prediction**|Maximilian Sch\u00e4fer et.al.|[2308.07751v1](http://arxiv.org/abs/2308.07751v1)|null|\n", "2308.07732": "|**2023-08-15**|**UniTR: A Unified and Efficient Multi-Modal Transformer for Bird's-Eye-View Representation**|Haiyang Wang et.al.|[2308.07732v1](http://arxiv.org/abs/2308.07732v1)|**[link](https://github.com/haiyang-w/unitr)**|\n", "2308.07686": "|**2023-08-15**|**Boosting Multi-modal Model Performance with Adaptive Gradient Modulation**|Hong Li et.al.|[2308.07686v1](http://arxiv.org/abs/2308.07686v1)|**[link](https://github.com/lihong2303/agm_iccv2023)**|\n", "2308.07648": "|**2023-08-15**|**Prompt Switch: Efficient CLIP Adaptation for Text-Video Retrieval**|Chaorui Deng et.al.|[2308.07648v1](http://arxiv.org/abs/2308.07648v1)|**[link](https://github.com/bladewaltz1/promptswitch)**|\n", "2308.07622": "|**2023-08-15**|**EMID: An Emotional Aligned Dataset in Audio-Visual Modality**|Jialing Zou et.al.|[2308.07622v1](http://arxiv.org/abs/2308.07622v1)|**[link](https://github.com/ecnu-aigc/emid)**|\n", "2308.07605": "|**2023-08-15**|**SGDiff: A Style Guided Diffusion Model for Fashion Synthesis**|Zhengwentai Sun et.al.|[2308.07605v1](http://arxiv.org/abs/2308.07605v1)|**[link](https://github.com/taited/sgdiff)**|\n", "2308.08546": "|**2023-08-16**|**What is the source of the PTA GW signal?**|John Ellis et.al.|[2308.08546v1](http://arxiv.org/abs/2308.08546v1)|null|\n", "2308.08409": "|**2023-08-16**|**X-PSI Parameter Recovery for Temperature Map Configurations Inspired by PSR J0030+0451**|Serena Vinciguerra et.al.|[2308.08409v1](http://arxiv.org/abs/2308.08409v1)|null|\n", "2308.08303": "|**2023-08-16**|**Leveraging Next-Active Objects for Context-Aware Anticipation in Egocentric Videos**|Sanket Thakur et.al.|[2308.08303v1](http://arxiv.org/abs/2308.08303v1)|null|\n", "2308.08157": "|**2023-08-16**|**Learning to Generate Semantic Layouts for Higher Text-Image Correspondence in Text-to-Image Synthesis**|Minho Park et.al.|[2308.08157v1](http://arxiv.org/abs/2308.08157v1)|**[link](https://github.com/pmh9960/GCDP)**|\n", "2308.08143": "|**2023-08-16**|**SCANet: A Self- and Cross-Attention Network for Audio-Visual Speech Separation**|Kai Li et.al.|[2308.08143v1](http://arxiv.org/abs/2308.08143v1)|null|\n", "2308.08125": "|**2023-08-16**|**Radio2Text: Streaming Speech Recognition Using mmWave Radio Signals**|Running Zhao et.al.|[2308.08125v1](http://arxiv.org/abs/2308.08125v1)|null|\n", "2308.08088": "|**2023-08-16**|**Pro-Cap: Leveraging a Frozen Vision-Language Model for Hateful Meme Detection**|Rui Cao et.al.|[2308.08088v1](http://arxiv.org/abs/2308.08088v1)|**[link](https://github.com/social-ai-studio/pro-cap)**|\n", "2308.09622": "|**2023-08-18**|**Is context all you need? Scaling Neural Sign Language Translation to Large Domains of Discourse**|Ozge Mercanoglu Sincan et.al.|[2308.09622v1](http://arxiv.org/abs/2308.09622v1)|null|\n", "2308.09599": "|**2023-08-18**|**Language-Guided Diffusion Model for Visual Grounding**|Sijia Chen et.al.|[2308.09599v1](http://arxiv.org/abs/2308.09599v1)|null|\n", "2308.09568": "|**2023-08-18**|**PUMGPT: A Large Vision-Language Model for Product Understanding**|Shuhui Wu et.al.|[2308.09568v1](http://arxiv.org/abs/2308.09568v1)|null|\n", "2308.09475": "|**2023-08-18**|**Video-Instrument Synergistic Network for Referring Video Instrument Segmentation in Robotic Surgery**|Hongqiu Wang et.al.|[2308.09475v1](http://arxiv.org/abs/2308.09475v1)|null|\n", "2308.09469": "|**2023-08-18**|**An updated mass-radius analysis of the 2017-2018 NICER data set of PSR J0030+0451**|Serena Vinciguerra et.al.|[2308.09469v1](http://arxiv.org/abs/2308.09469v1)|null|\n", "2308.09442": "|**2023-08-21**|**BioMedGPT: Open Multimodal Generative Pre-trained Transformer for BioMedicine**|Yizhen Luo et.al.|[2308.09442v2](http://arxiv.org/abs/2308.09442v2)|**[link](https://github.com/pharmolix/openbiomed)**|\n", "2308.09369": "|**2023-08-18**|**Single Frame Semantic Segmentation Using Multi-Modal Spherical Images**|Suresh Guttikonda et.al.|[2308.09369v1](http://arxiv.org/abs/2308.09369v1)|**[link](https://github.com/sguttikon/SFSS-MMSI)**|\n", "2308.09363": "|**2023-08-18**|**Open-vocabulary Video Question Answering: A New Benchmark for Evaluating the Generalizability of Video Question Answering Models**|Dohwan Ko et.al.|[2308.09363v1](http://arxiv.org/abs/2308.09363v1)|**[link](https://github.com/mlvlab/ovqa)**|\n", "2308.09351": "|**2023-08-18**|**RLIPv2: Fast Scaling of Relational Language-Image Pre-training**|Hangjie Yuan et.al.|[2308.09351v1](http://arxiv.org/abs/2308.09351v1)|**[link](https://github.com/jacobyuan7/rlipv2)**|\n", "2308.09322": "|**2023-08-18**|**Audio-Visual Glance Network for Efficient Video Recognition**|Muhammad Adi Nugroho et.al.|[2308.09322v1](http://arxiv.org/abs/2308.09322v1)|null|\n", "2308.09306": "|**2023-08-18**|**DiffDis: Empowering Generative Diffusion Model with Cross-Modal Discrimination Capability**|Runhui Huang et.al.|[2308.09306v1](http://arxiv.org/abs/2308.09306v1)|null|\n", "2308.09300": "|**2023-08-21**|**V2A-Mapper: A Lightweight Solution for Vision-to-Audio Generation by Connecting Foundation Models**|Heng Wang et.al.|[2308.09300v2](http://arxiv.org/abs/2308.09300v2)|**[link](https://github.com/heng-hw/V2A-Mapper)**|\n", "2308.09234": "|**2023-08-18**|**Deep Boosting Multi-Modal Ensemble Face Recognition with Sample-Level Weighting**|Sahar Rahimi Malakshan et.al.|[2308.09234v1](http://arxiv.org/abs/2308.09234v1)|null|\n", "2308.09179": "|**2023-08-17**|**Versatile Multi-Contact Planning and Control for Legged Loco-Manipulation**|Jean-Pierre Sleiman et.al.|[2308.09179v1](http://arxiv.org/abs/2308.09179v1)|null|\n", "2308.08930": "|**2023-08-17**|**Point-aware Interaction and CNN-induced Refinement Network for RGB-D Salient Object Detection**|Runmin Cong et.al.|[2308.08930v1](http://arxiv.org/abs/2308.08930v1)|**[link](https://github.com/rmcong/picr-net_acmmm23)**|\n", "2308.10777": "|**2023-08-21**|**I-BaR: Integrated Balance Rehabilitation Framework**|Tugce Ersoy et.al.|[2308.10777v1](http://arxiv.org/abs/2308.10777v1)|null|\n", "2308.10741": "|**2023-08-21**|**On the Adversarial Robustness of Multi-Modal Foundation Models**|Christian Schlarmann et.al.|[2308.10741v1](http://arxiv.org/abs/2308.10741v1)|null|\n", "2308.10631": "|**2023-08-21**|**PsyMo: A Dataset for Estimating Self-Reported Psychological Traits from Gait**|Adrian Cosma et.al.|[2308.10631v1](http://arxiv.org/abs/2308.10631v1)|null|\n", "2308.10627": "|**2023-08-21**|**Polarimetric Information for Multi-Modal 6D Pose Estimation of Photometrically Challenging Objects with Limited Data**|Patrick Ruhkamp et.al.|[2308.10627v1](http://arxiv.org/abs/2308.10627v1)|null|\n", "2308.10621": "|**2023-08-21**|**Multi-Modal Dataset Acquisition for Photometrically Challenging Object**|HyunJun Jung et.al.|[2308.10621v1](http://arxiv.org/abs/2308.10621v1)|null|\n", "2308.10491": "|**2023-08-21**|**SynDrone -- Multi-modal UAV Dataset for Urban Scenarios**|Giulia Rizzoli et.al.|[2308.10491v1](http://arxiv.org/abs/2308.10491v1)|**[link](https://github.com/lttm/syndrone)**|\n", "2308.10486": "|**2023-08-21**|**Deep Metric Loss for Multimodal Learning**|Sehwan Moon et.al.|[2308.10486v1](http://arxiv.org/abs/2308.10486v1)|**[link](https://github.com/sehwanmoon/multimodalloss)**|\n", "2308.10454": "|**2023-08-21**|**Elucidating STEM Concepts through Generative AI: A Multi-modal Exploration of Analogical Reasoning**|Chen Cao et.al.|[2308.10454v1](http://arxiv.org/abs/2308.10454v1)|null|\n", "2308.10421": "|**2023-08-21**|**UniM$^2$AE: Multi-modal Masked Autoencoders with Unified 3D Representation for 3D Perception in Autonomous Driving**|Jian Zou et.al.|[2308.10421v1](http://arxiv.org/abs/2308.10421v1)|**[link](https://github.com/hollow-503/unim2ae)**|\n", "2308.10362": "|**2023-08-20**|**Vehicle Cameras Guide mmWave Beams: Approach and Real-World V2V Demonstration**|Tawfik Osman et.al.|[2308.10362v1](http://arxiv.org/abs/2308.10362v1)|null|\n", "2308.10240": "|**2023-08-20**|**Generic Attention-model Explainability by Weighted Relevance Accumulation**|Yiming Huang et.al.|[2308.10240v1](http://arxiv.org/abs/2308.10240v1)|null|\n", "2308.10175": "|**2023-08-20**|**BAVS: Bootstrapping Audio-Visual Segmentation by Integrating Foundation Knowledge**|Chen Liu et.al.|[2308.10175v1](http://arxiv.org/abs/2308.10175v1)|null|\n", "2308.10172": "|**2023-08-20**|**VLN-PETL: Parameter-Efficient Transfer Learning for Vision-and-Language Navigation**|Yanyuan Qiao et.al.|[2308.10172v1](http://arxiv.org/abs/2308.10172v1)|**[link](https://github.com/yanyuanqiao/vln-petl)**|\n", "2308.10161": "|**2023-08-20**|**ThermRad: A Multi-modal Dataset for Robust 3D Object Detection under Challenging Conditions**|Qiao Yan et.al.|[2308.10161v1](http://arxiv.org/abs/2308.10161v1)|null|\n", "2308.10146": "|**2023-08-20**|**OCHID-Fi: Occlusion-Robust Hand Pose Estimation in 3D via RF-Vision**|Shujie Zhang et.al.|[2308.10146v1](http://arxiv.org/abs/2308.10146v1)|null|\n", "2308.11601": "|**2023-08-23**|**Tryage: Real-time, intelligent Routing of User Prompts to Large Language Models**|Surya Narayanan Hari et.al.|[2308.11601v2](http://arxiv.org/abs/2308.11601v2)|null|\n", "2308.11561": "|**2023-08-23**|**Target-Grounded Graph-Aware Transformer for Aerial Vision-and-Dialog Navigation**|Yifei Su et.al.|[2308.11561v2](http://arxiv.org/abs/2308.11561v2)|**[link](https://github.com/yifeisu/avdn-challenge)**|\n", "2308.11551": "|**2023-08-22**|**Multi-event Video-Text Retrieval**|Gengyuan Zhang et.al.|[2308.11551v1](http://arxiv.org/abs/2308.11551v1)|**[link](https://github.com/gengyuanmax/mevtr)**|\n", "2308.11530": "|**2023-08-22**|**Furnishing Sound Event Detection with Language Model Abilities**|Hualei Wang et.al.|[2308.11530v1](http://arxiv.org/abs/2308.11530v1)|null|\n", "2308.11513": "|**2023-08-22**|**TrackFlow: Multi-Object Tracking with Normalizing Flows**|Gianluca Mancusi et.al.|[2308.11513v1](http://arxiv.org/abs/2308.11513v1)|null|\n", "2308.11501": "|**2023-08-22**|**Four years of multi-modal odometry and mapping on the rail vehicles**|Yusheng Wang et.al.|[2308.11501v1](http://arxiv.org/abs/2308.11501v1)|null|\n", "2308.11492": "|**2023-08-22**|**A LiDAR-Inertial SLAM Tightly-Coupled with Dropout-Tolerant GNSS Fusion for Autonomous Mine Service Vehicles**|Yusheng Wang et.al.|[2308.11492v1](http://arxiv.org/abs/2308.11492v1)|null|\n", "2308.11356": "|**2023-08-22**|**Semantic RGB-D Image Synthesis**|Shijie Li et.al.|[2308.11356v1](http://arxiv.org/abs/2308.11356v1)|null|\n", "2308.11351": "|**2023-08-22**|**M3PS: End-to-End Multi-Grained Multi-Modal Attribute-Aware Product Summarization in E-commerce**|Tao Chen et.al.|[2308.11351v1](http://arxiv.org/abs/2308.11351v1)|null|\n", "2308.11331": "|**2023-08-22**|**GrowCLIP: Data-aware Automatic Model Growing for Large-scale Contrastive Language-Image Pre-training**|Xinchi Deng et.al.|[2308.11331v1](http://arxiv.org/abs/2308.11331v1)|null|\n", "2308.11206": "|**2023-08-22**|**DiffCloth: Diffusion Based Garment Synthesis and Manipulation via Structural Cross-modal Semantic Alignment**|Xujie Zhang et.al.|[2308.11206v1](http://arxiv.org/abs/2308.11206v1)|null|\n", "2308.11175": "|**2023-08-22**|**MISSRec: Pre-training and Transferring Multi-modal Interest-aware Sequence Representation for Recommendation**|Jinpeng Wang et.al.|[2308.11175v1](http://arxiv.org/abs/2308.11175v1)|**[link](https://github.com/gimpong/MM23-MISSRec)**|\n", "2308.11165": "|**2023-08-22**|**Improving Misaligned Multi-modality Image Fusion with One-stage Progressive Dense Registration**|Di Wang et.al.|[2308.11165v1](http://arxiv.org/abs/2308.11165v1)|null|\n", "2308.12199": "|**2023-08-23**|**Towards Real-Time Analysis of Broadcast Badminton Videos**|Nitin Nilesh et.al.|[2308.12199v1](http://arxiv.org/abs/2308.12199v1)|**[link](https://gitlab.com/nitin.nilesh/badminton-analysis-star)**|\n", "2308.12163": "|**2023-08-23**|**NPF-200: A Multi-Modal Eye Fixation Dataset and Method for Non-Photorealistic Videos**|Ziyu Yang et.al.|[2308.12163v1](http://arxiv.org/abs/2308.12163v1)|**[link](https://github.com/yangziyu/npf200)**|\n", "2308.12111": "|**2023-08-23**|**Cross-Modality Proposal-guided Feature Mining for Unregistered RGB-Thermal Pedestrian Detection**|Chao Tian et.al.|[2308.12111v1](http://arxiv.org/abs/2308.12111v1)|null|\n", "2308.12049": "|**2023-08-23**|**Towards Privacy-Supporting Fall Detection via Deep Unsupervised RGB2Depth Adaptation**|Hejun Xiao et.al.|[2308.12049v1](http://arxiv.org/abs/2308.12049v1)|**[link](https://github.com/1015206533/privacy_supporting_fall_detection)**|\n", "2308.11994": "|**2023-08-23**|**Progressive Feature Mining and External Knowledge-Assisted Text-Pedestrian Image Retrieval**|Huafeng Li et.al.|[2308.11994v1](http://arxiv.org/abs/2308.11994v1)|null|\n", "2308.11983": "|**2023-08-23**|**Multi-Modal Multi-Task (3MT) Road Segmentation**|Erkan Milli et.al.|[2308.11983v1](http://arxiv.org/abs/2308.11983v1)|**[link](https://github.com/erkanmilli/3mt-roadseg)**|\n", "2308.11880": "|**2023-08-23**|**SUMMIT: Source-Free Adaptation of Uni-Modal Models to Multi-Modal Targets**|Cody Simons et.al.|[2308.11880v1](http://arxiv.org/abs/2308.11880v1)|**[link](https://github.com/csimo005/summit)**|\n", "2308.11877": "|**2023-08-24**|**Integrated Image and Location Analysis for Wound Classification: A Deep Learning Approach**|Yash Patel et.al.|[2308.11877v2](http://arxiv.org/abs/2308.11877v2)|null|\n", "2308.11804": "|**2023-08-22**|**Ceci n'est pas une pomme: Adversarial Illusions in Multi-Modal Embeddings**|Eugene Bagdasaryan et.al.|[2308.11804v1](http://arxiv.org/abs/2308.11804v1)|**[link](https://github.com/ebagdasa/adversarial_illusions)**|\n", "2308.11797": "|**2023-08-22**|**CLIP Multi-modal Hashing: A new baseline CLIPMH**|Jian Zhu et.al.|[2308.11797v1](http://arxiv.org/abs/2308.11797v1)|null|\n", "2308.12956": "|**2023-08-24**|**DLIP: Distilling Language-Image Pre-training**|Huafeng Kuang et.al.|[2308.12956v1](http://arxiv.org/abs/2308.12956v1)|null|\n", "2308.12871": "|**2023-08-24**|**IPA: Inference Pipeline Adaptation to Achieve High Accuracy and Cost-Efficiency**|Saeid Ghafouri et.al.|[2308.12871v1](http://arxiv.org/abs/2308.12871v1)|null|\n", "2308.12863": "|**2023-08-24**|**SkipcrossNets: Adaptive Skip-cross Fusion for Road Detection**|Xinyu Zhang et.al.|[2308.12863v1](http://arxiv.org/abs/2308.12863v1)|null|\n", "2308.12755": "|**2023-08-24**|**Acquiring Qualitative Explainable Graphs for Automated Driving Scene Interpretation**|Nassim Belmecheri et.al.|[2308.12755v1](http://arxiv.org/abs/2308.12755v1)|**[link](https://github.com/simula-vias/qxg-builder)**|\n", "2308.12736": "|**2023-08-24**|**FastSurfer-HypVINN: Automated sub-segmentation of the hypothalamus and adjacent structures on high-resolutional brain MRI**|Santiago Estrada et.al.|[2308.12736v1](http://arxiv.org/abs/2308.12736v1)|**[link](https://github.com/Deep-MI/FastSurfer)**|\n", "2308.12610": "|**2023-08-24**|**Emotion-Aligned Contrastive Learning Between Images and Music**|Shanti Stewart et.al.|[2308.12610v1](http://arxiv.org/abs/2308.12610v1)|null|\n", "2308.12604": "|**2023-08-24**|**PromptMRG: Diagnosis-Driven Prompts for Medical Report Generation**|Haibo Jin et.al.|[2308.12604v1](http://arxiv.org/abs/2308.12604v1)|null|\n", "2308.12587": "|**2023-08-24**|**Grounded Entity-Landmark Adaptive Pre-training for Vision-and-Language Navigation**|Yibo Cui et.al.|[2308.12587v1](http://arxiv.org/abs/2308.12587v1)|**[link](https://github.com/csir1996/vln-gela)**|\n", "2308.12558": "|**2023-08-24**|**Hyperbolic Audio-visual Zero-shot Learning**|Jie Hong et.al.|[2308.12558v1](http://arxiv.org/abs/2308.12558v1)|null|\n", "2308.12509": "|**2023-08-24**|**Parameter-Efficient Transfer Learning for Remote Sensing Image-Text Retrieval**|Yuan Yuan et.al.|[2308.12509v1](http://arxiv.org/abs/2308.12509v1)|**[link](https://github.com/ZhanYang-nwpu/PE-RSITR)**|\n", "2308.12370": "|**2023-08-23**|**AdVerb: Visually Guided Audio Dereverberation**|Sanjoy Chowdhury et.al.|[2308.12370v1](http://arxiv.org/abs/2308.12370v1)|null|\n", "2308.12320": "|**2023-08-23**|**Understanding Dark Scenes by Contrasting Multi-Modal Observations**|Xiaoyu Dong et.al.|[2308.12320v1](http://arxiv.org/abs/2308.12320v1)|**[link](https://github.com/palmdong/smmcl)**|\n", "2308.13437": "|**2023-08-25**|**Position-Enhanced Visual Instruction Tuning for Multimodal Large Language Models**|Chi Chen et.al.|[2308.13437v1](http://arxiv.org/abs/2308.13437v1)|**[link](https://github.com/pvit-official/pvit)**|\n", "2308.13392": "|**2023-08-25**|**Self-Supervised Representation Learning with Cross-Context Learning between Global and Hypercolumn Features**|Zheng Gao et.al.|[2308.13392v1](http://arxiv.org/abs/2308.13392v1)|null|\n", "2308.13355": "|**2023-08-25**|**WorldSmith: Iterative and Expressive Prompting for World Building with a Generative AI**|Hai Dang et.al.|[2308.13355v1](http://arxiv.org/abs/2308.13355v1)|null|\n", "2308.13340": "|**2023-08-25**|**TriGait: Aligning and Fusing Skeleton and Silhouette Gait Data via a Tri-Branch Network**|Yan Sun et.al.|[2308.13340v1](http://arxiv.org/abs/2308.13340v1)|**[link](https://github.com/feng-xueling/trigait)**|\n", "2308.13077": "|**2023-08-24**|**Preserving Modality Structure Improves Multi-Modal Learning**|Swetha Sirnam et.al.|[2308.13077v1](http://arxiv.org/abs/2308.13077v1)|null|\n", "2308.14713": "|**2023-08-28**|**R3D3: Dense 3D Reconstruction of Dynamic Scenes from Multiple Cameras**|Aron Schmied et.al.|[2308.14713v1](http://arxiv.org/abs/2308.14713v1)|null|\n", "2308.14619": "|**2023-08-29**|**Compositional Semantic Mix for Domain Adaptation in Point Cloud Segmentation**|Cristiano Saltori et.al.|[2308.14619v2](http://arxiv.org/abs/2308.14619v2)|**[link](https://github.com/saltoricristiano/cosmix-uda)**|\n", "2308.14613": "|**2023-08-28**|**MS-Net: A Multi-modal Self-supervised Network for Fine-Grained Classification of Aircraft in SAR Images**|Bingying Yue et.al.|[2308.14613v1](http://arxiv.org/abs/2308.14613v1)|null|\n", "2308.14482": "|**2023-08-28**|**An Empirical Study of Consistency Regularization for End-to-End Speech-to-Text Translation**|Pengzhi Gao et.al.|[2308.14482v1](http://arxiv.org/abs/2308.14482v1)|**[link](https://github.com/gpengzhi/simcr)**|\n", "2308.14383": "|**2023-08-28**|**Multi-Modal Neural Radiance Field for Monocular Dense SLAM with a Light-Weight ToF Sensor**|Xinyang Liu et.al.|[2308.14383v1](http://arxiv.org/abs/2308.14383v1)|null|\n", "2308.14263": "|**2023-08-28**|**Cross-Modal Retrieval: A Systematic Review of Methods and Future Directions**|Lei Zhu et.al.|[2308.14263v1](http://arxiv.org/abs/2308.14263v1)|**[link](https://github.com/bmc-sdnu/cross-modal-retrieval)**|\n", "2308.14212": "|**2023-08-27**|**Exploring the Transfer Learning Capabilities of CLIP in Domain Generalization for Diabetic Retinopathy**|Sanoojan Baliah et.al.|[2308.14212v1](http://arxiv.org/abs/2308.14212v1)|**[link](https://github.com/sanoojan/clip-drdg)**|\n", "2308.14177": "|**2023-08-27**|**AIGC for Various Data Modalities: A Survey**|Lin Geng Foo et.al.|[2308.14177v1](http://arxiv.org/abs/2308.14177v1)|null|\n", "2308.14160": "|**2023-08-27**|**A Unified Transformer-based Network for multimodal Emotion Recognition**|Kamran Ali et.al.|[2308.14160v1](http://arxiv.org/abs/2308.14160v1)|null|\n", "2308.14105": "|**2023-08-29**|**Unified and Dynamic Graph for Temporal Character Grouping in Long Videos**|Xiujun Shu et.al.|[2308.14105v2](http://arxiv.org/abs/2308.14105v2)|null|\n", "2308.14083": "|**2023-08-27**|**4D Myocardium Reconstruction with Decoupled Motion and Shape Model**|Xiaohan Yuan et.al.|[2308.14083v1](http://arxiv.org/abs/2308.14083v1)|**[link](https://github.com/yuan-xiaohan/4d-myocardium-reconstruction-with-decoupled-motion-and-shape-model)**|\n", "2308.14064": "|**2023-08-27**|**Multi-model fusion for Aerial Vision and Dialog Navigation based on human attention aids**|Xinyi Wang et.al.|[2308.14064v1](http://arxiv.org/abs/2308.14064v1)|null|\n", "2308.14023": "|**2023-08-27**|**Domain-Specificity Inducing Transformers for Source-Free Domain Adaptation**|Sunandini Sanyal et.al.|[2308.14023v1](http://arxiv.org/abs/2308.14023v1)|null|\n", "2308.14009": "|**2023-08-27**|**Towards Fast and Accurate Image-Text Retrieval with Self-Supervised Fine-Grained Alignment**|Jiamin Zhuang et.al.|[2308.14009v1](http://arxiv.org/abs/2308.14009v1)|**[link](https://github.com/zjamie813/selfalign)**|\n", "2308.13976": "|**2023-08-27**|**Label Denoising through Cross-Model Agreement**|Yu Wang et.al.|[2308.13976v1](http://arxiv.org/abs/2308.13976v1)|null|\n", "2308.15273": "|**2023-08-29**|**Cross-Modal Retrieval Meets Inference:Improving Zero-Shot Classification with Cross-Modal Retrieval**|Seongha Eom et.al.|[2308.15273v1](http://arxiv.org/abs/2308.15273v1)|null|\n", "2308.15063": "|**2023-08-29**|**Learning Cross-modality Information Bottleneck Representation for Heterogeneous Person Re-Identification**|Haichao Shi et.al.|[2308.15063v1](http://arxiv.org/abs/2308.15063v1)|null|\n", "2308.14978": "|**2023-08-29**|**Vision Grid Transformer for Document Layout Analysis**|Cheng Da et.al.|[2308.14978v1](http://arxiv.org/abs/2308.14978v1)|**[link](https://github.com/alibabaresearch/advancedliteratemachinery)**|\n", "2308.14786": "|**2023-08-28**|**Extending Cross-Modal Retrieval with Interactive Learning to Improve Image Retrieval Performance in Forensics**|Nils B\u00f6hne et.al.|[2308.14786v1](http://arxiv.org/abs/2308.14786v1)|null|\n", "2308.16150": "|**2023-08-30**|**Modality Cycles with Masked Conditional Diffusion for Unsupervised Anomaly Segmentation in MRI**|Ziyun Liang et.al.|[2308.16150v1](http://arxiv.org/abs/2308.16150v1)|**[link](https://github.com/ziyunliang/mmccd)**|\n", "2308.16071": "|**2023-08-30**|**Semantic Image Synthesis via Class-Adaptive Cross-Attention**|Tomaso Fontanini et.al.|[2308.16071v1](http://arxiv.org/abs/2308.16071v1)|null|\n", "2308.16021": "|**2023-08-30**|**CALM: Contrastive Cross-modal Speaking Style Modeling for Expressive Text-to-Speech Synthesis**|Yi Meng et.al.|[2308.16021v1](http://arxiv.org/abs/2308.16021v1)|null|\n", "2308.15980": "|**2023-08-30**|**Adaptive Multi-Modalities Fusion in Sequential Recommendation Systems**|Hengchang Hu et.al.|[2308.15980v1](http://arxiv.org/abs/2308.15980v1)|**[link](https://github.com/holdenhu/mmsr)**|\n", "2308.15930": "|**2023-08-30**|**LLaSM: Large Language and Speech Model**|Yu Shu et.al.|[2308.15930v1](http://arxiv.org/abs/2308.15930v1)|**[link](https://github.com/linksoul-ai/llasm)**|\n", "2308.15846": "|**2023-08-30**|**Exploring Multi-Modal Contextual Knowledge for Open-Vocabulary Object Detection**|Yifan Xu et.al.|[2308.15846v1](http://arxiv.org/abs/2308.15846v1)|null|\n", "2308.15670": "|**2023-08-29**|**Multimodal Foundation Models For Echocardiogram Interpretation**|Matthew Christensen et.al.|[2308.15670v1](http://arxiv.org/abs/2308.15670v1)|**[link](https://github.com/echonet/echo_CLIP)**|\n", "2308.15640": "|**2023-08-29**|**Identifying Constitutive Parameters for Complex Hyperelastic Solids using Physics-Informed Neural Networks**|Siyuan Song et.al.|[2308.15640v1](http://arxiv.org/abs/2308.15640v1)|null|\n", "2308.15609": "|**2023-08-29**|**InstaTune: Instantaneous Neural Architecture Search During Fine-Tuning**|Sharath Nittur Sridhar et.al.|[2308.15609v1](http://arxiv.org/abs/2308.15609v1)|null|\n", "2308.15592": "|**2023-08-29**|**Non-local Interactions are Essential Elements for Dark Matter Halo Stability: A Cross-Model Study**|Ahmad Borzou et.al.|[2308.15592v1](http://arxiv.org/abs/2308.15592v1)|null|\n", "2308.16896": "|**2023-08-31**|**PointOcc: Cylindrical Tri-Perspective View for Point-based 3D Semantic Occupancy Prediction**|Sicheng Zuo et.al.|[2308.16896v1](http://arxiv.org/abs/2308.16896v1)|**[link](https://github.com/wzzheng/pointocc)**|\n", "2308.16777": "|**2023-09-01**|**Ref-Diff: Zero-shot Referring Image Segmentation with Generative Models**|Minheng Ni et.al.|[2308.16777v2](http://arxiv.org/abs/2308.16777v2)|null|\n", "2308.16758": "|**2023-08-31**|**Towards High-Fidelity Text-Guided 3D Face Generation and Manipulation Using only Images**|Cuican Yu et.al.|[2308.16758v1](http://arxiv.org/abs/2308.16758v1)|null|\n", "2308.16649": "|**2023-08-31**|**Learning with Multi-modal Gradient Attention for Explainable Composed Image Retrieval**|Prateksha Udhayanan et.al.|[2308.16649v1](http://arxiv.org/abs/2308.16649v1)|null|\n", "2308.16632": "|**2023-08-31**|**3D-STMN: Dependency-Driven Superpoint-Text Matching Network for End-to-End 3D Referring Expression Segmentation**|Changli Wu et.al.|[2308.16632v1](http://arxiv.org/abs/2308.16632v1)|**[link](https://github.com/sosppxo/3d-stmn)**|\n", "2308.16493": "|**2023-08-31**|**Expanding Frozen Vision-Language Models without Retraining: Towards Improved Robot Perception**|Riley Tavassoli et.al.|[2308.16493v1](http://arxiv.org/abs/2308.16493v1)|null|\n", "2308.16474": "|**2023-08-31**|**Enhancing Subtask Performance of Multi-modal Large Language Model**|Yongqiang Zhao et.al.|[2308.16474v1](http://arxiv.org/abs/2308.16474v1)|null|\n", "2308.16437": "|**2023-08-31**|**AntM$^{2}$C: A Large Scale Dataset For Multi-Scenario Multi-Modal CTR Prediction**|Zhaoxin Huan et.al.|[2308.16437v1](http://arxiv.org/abs/2308.16437v1)|null|\n", "2308.16386": "|**2023-08-31**|**RGB-T Tracking via Multi-Modal Mutual Prompt Learning**|Yang Luo et.al.|[2308.16386v1](http://arxiv.org/abs/2308.16386v1)|**[link](https://github.com/husteryoung/mplt)**|\n", "2309.00615": "|**2023-09-01**|**Point-Bind & Point-LLM: Aligning Point Cloud with Multi-modality for 3D Understanding, Generation, and Instruction Following**|Ziyu Guo et.al.|[2309.00615v1](http://arxiv.org/abs/2309.00615v1)|**[link](https://github.com/ziyuguo99/point-bind_point-llm)**|\n", "2309.00406": "|**2023-09-01**|**Constraining X-ray variability of the blazar 3C 273 using XMM-Newton observations over two decades**|Adithiya Dinesh et.al.|[2309.00406v1](http://arxiv.org/abs/2309.00406v1)|null|\n", "2309.00380": "|**2023-09-01**|**Learning multi-modal generative models with permutation-invariant encoders and tighter variational bounds**|Marcel Hirt et.al.|[2309.00380v1](http://arxiv.org/abs/2309.00380v1)|null|\n", "2309.00372": "|**2023-09-01**|**On the Localization of Ultrasound Image Slices within Point Distribution Models**|Lennart Bastian et.al.|[2309.00372v1](http://arxiv.org/abs/2309.00372v1)|**[link](https://github.com/vuenc/slice-to-shape)**|\n", "2309.00227": "|**2023-09-01**|**What Makes Good Open-Vocabulary Detector: A Disassembling Perspective**|Jincheng Li et.al.|[2309.00227v1](http://arxiv.org/abs/2309.00227v1)|null|\n", "2309.00133": "|**2023-08-31**|**Distraction-free Embeddings for Robust VQA**|Atharvan Dogra et.al.|[2309.00133v1](http://arxiv.org/abs/2309.00133v1)|null|\n", "2309.00030": "|**2023-08-31**|**Audio-Driven Dubbing for User Generated Contents via Style-Aware Semi-Parametric Synthesis**|Linsen Song et.al.|[2309.00030v1](http://arxiv.org/abs/2309.00030v1)|null|\n", "2309.02320": "|**2023-09-05**|**SeisCLIP: A seismology foundation model pre-trained by multi-modal data for multi-purpose seismic feature extraction**|Xu Si et.al.|[2309.02320v1](http://arxiv.org/abs/2309.02320v1)|**[link](https://github.com/sixu0/SeisCLIP)**|\n", "2309.02169": "|**2023-09-05**|**Dual Relation Alignment for Composed Image Retrieval**|Xintong Jiang et.al.|[2309.02169v1](http://arxiv.org/abs/2309.02169v1)|null|\n", "2309.02124": "|**2023-09-05**|**Exploiting Spatial-temporal Data for Sleep Stage Classification via Hypergraph Learning**|Yuze Liu et.al.|[2309.02124v1](http://arxiv.org/abs/2309.02124v1)|null|\n", "2309.02043": "|**2023-09-05**|**Decomposed Guided Dynamic Filters for Efficient RGB-Guided Depth Completion**|Yufei Wang et.al.|[2309.02043v1](http://arxiv.org/abs/2309.02043v1)|null|\n", "2309.02041": "|**2023-09-05**|**Learning Cross-Modal Affinity for Referring Video Object Segmentation Targeting Limited Samples**|Guanghui Li et.al.|[2309.02041v1](http://arxiv.org/abs/2309.02041v1)|**[link](https://github.com/hengliusky/few_shot_rvos)**|\n", "2309.01981": "|**2023-09-05**|**Graph-Based Interaction-Aware Multimodal 2D Vehicle Trajectory Prediction using Diffusion Graph Convolutional Networks**|Keshu Wu et.al.|[2309.01981v1](http://arxiv.org/abs/2309.01981v1)|null|\n", "2309.01955": "|**2023-09-05**|**A Survey on Interpretable Cross-modal Reasoning**|Dizhan Xue et.al.|[2309.01955v1](http://arxiv.org/abs/2309.01955v1)|**[link](https://github.com/ZuyiZhou/Awesome-Interpretable-Cross-modal-Reasoning)**|\n", "2309.01918": "|**2023-09-05**|**RoboAgent: Generalization and Efficiency in Robot Manipulation via Semantic Augmentations and Action Chunking**|Homanga Bharadhwaj et.al.|[2309.01918v1](http://arxiv.org/abs/2309.01918v1)|null|\n", "2309.01860": "|**2023-09-06**|**Attention-Driven Multi-Modal Fusion: Enhancing Sign Language Recognition and Translation**|Zaber Ibn Abdul Hakim et.al.|[2309.01860v2](http://arxiv.org/abs/2309.01860v2)|null|\n", "2309.01728": "|**2023-09-04**|**Generative-based Fusion Mechanism for Multi-Modal Tracking**|Zhangyong Tang et.al.|[2309.01728v1](http://arxiv.org/abs/2309.01728v1)|**[link](https://github.com/zhangyong-tang/gmmt)**|\n", "2309.01516": "|**2023-09-04**|**MultiWay-Adapater: Adapting large-scale multi-modal models for scalable image-text retrieval**|Zijun Long et.al.|[2309.01516v1](http://arxiv.org/abs/2309.01516v1)|**[link](https://github.com/longkukuhi/multiway-adapter)**|\n", "2309.01420": "|**2023-09-04**|**Unified Pre-training with Pseudo Texts for Text-To-Image Person Re-identification**|Zhiyin Shao et.al.|[2309.01420v1](http://arxiv.org/abs/2309.01420v1)|**[link](https://github.com/zhiyinshao-h/unipt)**|\n", "2309.01327": "|**2023-09-04**|**Can I Trust Your Answer? Visually Grounded Video Question Answering**|Junbin Xiao et.al.|[2309.01327v1](http://arxiv.org/abs/2309.01327v1)|**[link](https://github.com/doc-doc/next-gqa)**|\n", "2309.01256": "|**2023-09-03**|**BDC-Adapter: Brownian Distance Covariance for Better Vision-Language Reasoning**|Yi Zhang et.al.|[2309.01256v1](http://arxiv.org/abs/2309.01256v1)|null|\n", "2309.01073": "|**2023-09-03**|**Spatial and Visual Perspective-Taking via View Rotation and Relation Reasoning for Embodied Reference Understanding**|Cheng Shi et.al.|[2309.01073v1](http://arxiv.org/abs/2309.01073v1)|null|\n", "2309.03177": "|**2023-09-06**|**3D Object Positioning Using Differentiable Multimodal Learning**|Sean Zanyk-McLean et.al.|[2309.03177v1](http://arxiv.org/abs/2309.03177v1)|null|\n", "2309.03147": "|**2023-09-06**|**Real-Time Non-Invasive Imaging and Detection of Spreading Depolarizations through EEG: An Ultra-Light Explainable Deep Learning Approach**|Yinzhe Wu et.al.|[2309.03147v1](http://arxiv.org/abs/2309.03147v1)|null|\n", "2309.03100": "|**2023-09-06**|**FArMARe: a Furniture-Aware Multi-task methodology for Recommending Apartments based on the user interests**|Ali Abdari et.al.|[2309.03100v1](http://arxiv.org/abs/2309.03100v1)|**[link](https://github.com/aliabdari/farmare)**|\n", "2309.02965": "|**2023-09-06**|**Dynamic Hyperbolic Attention Network for Fine Hand-object Reconstruction**|Zhiying Leng et.al.|[2309.02965v1](http://arxiv.org/abs/2309.02965v1)|null|\n", "2309.02875": "|**2023-09-06**|**MAD: Modality Agnostic Distance Measure for Image Registration**|Vasiliki Sideri-Lampretsa et.al.|[2309.02875v1](http://arxiv.org/abs/2309.02875v1)|null|\n", "2309.02702": "|**2023-09-06**|**Gene-induced Multimodal Pre-training for Image-omic Classification**|Ting Jin et.al.|[2309.02702v1](http://arxiv.org/abs/2309.02702v1)|null|\n", "2309.02616": "|**2023-09-05**|**Generative AI-aided Joint Training-free Secure Semantic Communications via Multi-modal Prompts**|Hongyang Du et.al.|[2309.02616v1](http://arxiv.org/abs/2309.02616v1)|null|\n", "2309.02591": "|**2023-09-05**|**Scaling Autoregressive Multi-Modal Models: Pretraining and Instruction Tuning**|Lili Yu et.al.|[2309.02591v1](http://arxiv.org/abs/2309.02591v1)|null|\n", "2309.03905": "|**2023-09-07**|**ImageBind-LLM: Multi-modality Instruction Tuning**|Jiaming Han et.al.|[2309.03905v1](http://arxiv.org/abs/2309.03905v1)|**[link](https://github.com/opengvlab/llama-adapter)**|\n", "2309.03869": "|**2023-09-07**|**Text-to-feature diffusion for audio-visual few-shot learning**|Otniel-Bogdan Mercea et.al.|[2309.03869v1](http://arxiv.org/abs/2309.03869v1)|**[link](https://github.com/explainableml/avdiff-gfsl)**|\n", "2309.03734": "|**2023-09-07**|**ClusterFusion: Leveraging Radar Spatial Features for Radar-Camera 3D Object Detection in Autonomous Vehicles**|Irfan Tito Kurniawan et.al.|[2309.03734v1](http://arxiv.org/abs/2309.03734v1)|null|\n", "2309.03661": "|**2023-09-07**|**Prompt-based Context- and Domain-aware Pretraining for Vision and Language Navigation**|Ting Liu et.al.|[2309.03661v1](http://arxiv.org/abs/2309.03661v1)|null|\n", "2309.03473": "|**2023-09-07**|**Temporal Collection and Distribution for Referring Video Object Segmentation**|Jiajin Tang et.al.|[2309.03473v1](http://arxiv.org/abs/2309.03473v1)|null|\n", "2309.03452": "|**2023-09-07**|**Multi-Modality Guidance Network For Missing Modality Inference**|Zhuokai Zhao et.al.|[2309.03452v1](http://arxiv.org/abs/2309.03452v1)|null|\n", "2309.04453": "|**2023-09-08**|**WiSARD: A Labeled Visual and Thermal Image Dataset for Wilderness Search and Rescue**|Daniel Broyles et.al.|[2309.04453v1](http://arxiv.org/abs/2309.04453v1)|null|\n", "2309.04399": "|**2023-09-08**|**MaskDiffusion: Boosting Text-to-Image Consistency with Conditional Mask**|Yupeng Zhou et.al.|[2309.04399v1](http://arxiv.org/abs/2309.04399v1)|null|\n", "2309.04302": "|**2023-09-08**|**Have We Ever Encountered This Before? Retrieving Out-of-Distribution Road Obstacles from Driving Scenes**|Youssef Shoeb et.al.|[2309.04302v1](http://arxiv.org/abs/2309.04302v1)|null|\n", "2309.04287": "|**2023-09-08**|**Sequential Semantic Generative Communication for Progressive Text-to-Image Generation**|Hyelin Nam et.al.|[2309.04287v1](http://arxiv.org/abs/2309.04287v1)|null|\n", "2309.04109": "|**2023-09-08**|**From Text to Mask: Localizing Entities Using the Attention of Text-to-Image Diffusion Models**|Changming Xiao et.al.|[2309.04109v1](http://arxiv.org/abs/2309.04109v1)|null|\n", "2309.04062": "|**2023-09-08**|**3D Denoisers are Good 2D Teachers: Molecular Pretraining via Denoising and Cross-Modal Distillation**|Sungjun Cho et.al.|[2309.04062v1](http://arxiv.org/abs/2309.04062v1)|null|\n", "2309.04001": "|**2023-09-07**|**Multimodal Transformer for Material Segmentation**|Md Kaykobad Reza et.al.|[2309.04001v1](http://arxiv.org/abs/2309.04001v1)|**[link](https://github.com/csiplab/mmsformer)**|\n", "2309.05644": "|**2023-09-11**|**Grid-based Hybrid 3DMA GNSS and Terrestrial Positioning**|Paul Schwarzbach et.al.|[2309.05644v1](http://arxiv.org/abs/2309.05644v1)|null|\n", "2309.05608": "|**2023-09-11**|**Incorporating Pre-trained Model Prompting in Multimodal Stock Volume Movement Prediction**|Ruibo Chen et.al.|[2309.05608v1](http://arxiv.org/abs/2309.05608v1)|**[link](https://github.com/rayruibochen/promuse)**|\n", "2309.05573": "|**2023-09-11**|**UniSeg: A Unified Multi-Modal LiDAR Segmentation Network and the OpenPCSeg Codebase**|Youquan Liu et.al.|[2309.05573v1](http://arxiv.org/abs/2309.05573v1)|**[link](https://github.com/pjlab-adg/pcseg)**|\n", "2309.05519": "|**2023-09-13**|**NExT-GPT: Any-to-Any Multimodal LLM**|Shengqiong Wu et.al.|[2309.05519v2](http://arxiv.org/abs/2309.05519v2)|**[link](https://github.com/NExT-GPT/NExT-GPT)**|\n", "2309.05503": "|**2023-09-11**|**Long-Range Transformer Architectures for Document Understanding**|Thibault Douzon et.al.|[2309.05503v1](http://arxiv.org/abs/2309.05503v1)|**[link](https://github.com/thibaultdouzon/long-range-document-transformer)**|\n", "2309.05451": "|**2023-09-11**|**Dual-view Curricular Optimal Transport for Cross-lingual Cross-modal Retrieval**|Yabing Wang et.al.|[2309.05451v1](http://arxiv.org/abs/2309.05451v1)|null|\n", "2309.05423": "|**2023-09-11**|**Multi-Modal Automatic Prosody Annotation with Contrastive Pretraining of SSWP**|Jinzuomu Zhong et.al.|[2309.05423v1](http://arxiv.org/abs/2309.05423v1)|null|\n", "2309.05396": "|**2023-09-12**|**SlideSpeech: A Large-Scale Slide-Enriched Audio-Visual Corpus**|Haoxu Wang et.al.|[2309.05396v2](http://arxiv.org/abs/2309.05396v2)|null|\n", "2309.05298": "|**2023-09-11**|**Real-Time Parallel Trajectory Optimization with Spatiotemporal Safety Constraints for Autonomous Driving in Congested Traffic**|Lei Zheng et.al.|[2309.05298v1](http://arxiv.org/abs/2309.05298v1)|null|\n", "2309.05281": "|**2023-09-11**|**Class-Incremental Grouping Network for Continual Audio-Visual Learning**|Shentong Mo et.al.|[2309.05281v1](http://arxiv.org/abs/2309.05281v1)|**[link](https://github.com/stonemo/cign)**|\n", "2309.05257": "|**2023-09-11**|**FusionFormer: A Multi-sensory Fusion in Bird's-Eye-View and Temporal Consistent Transformer for 3D Objection**|Chunyong Hu et.al.|[2309.05257v1](http://arxiv.org/abs/2309.05257v1)|null|\n", "2309.05251": "|**2023-09-11**|**Multi3DRefer: Grounding Text Description to Multiple 3D Objects**|Yiming Zhang et.al.|[2309.05251v1](http://arxiv.org/abs/2309.05251v1)|null|\n", "2309.05248": "|**2023-09-11**|**Enhancing Speaker Diarization with Large Language Models: A Contextual Beam Search Approach**|Tae Jin Park et.al.|[2309.05248v1](http://arxiv.org/abs/2309.05248v1)|null|\n", "2309.05203": "|**2023-09-11**|**From Artificially Real to Real: Leveraging Pseudo Data from Large Language Models for Low-Resource Molecule Discovery**|Yuhan Chen et.al.|[2309.05203v1](http://arxiv.org/abs/2309.05203v1)|null|\n", "2309.05090": "|**2023-09-10**|**Sculpting Efficiency: Pruning Medical Imaging Models for On-Device Inference**|Sudarshan Sreeram et.al.|[2309.05090v1](http://arxiv.org/abs/2309.05090v1)|null|\n", "2309.06262": "|**2023-09-12**|**Modality Unifying Network for Visible-Infrared Person Re-Identification**|Hao Yu et.al.|[2309.06262v1](http://arxiv.org/abs/2309.06262v1)|null|\n", "2309.06255": "|**2023-09-12**|**Enhancing Multi-modal Cooperation via Fine-grained Modality Valuation**|Yake Wei et.al.|[2309.06255v1](http://arxiv.org/abs/2309.06255v1)|null|\n", "2309.06176": "|**2023-09-12**|**Dual-Path Temporal Map Optimization for Make-up Temporal Video Grounding**|Jiaxiu Li et.al.|[2309.06176v1](http://arxiv.org/abs/2309.06176v1)|null|\n", "2309.06102": "|**2023-09-12**|**Can we predict the Most Replayed data of video streaming platforms?**|Alessandro Duico et.al.|[2309.06102v1](http://arxiv.org/abs/2309.06102v1)|**[link](https://github.com/ombretta/most-replayed-data)**|\n", "2309.06081": "|**2023-09-12**|**Information Flow in Graph Neural Networks: A Clinical Triage Use Case**|V\u00edctor Valls et.al.|[2309.06081v1](http://arxiv.org/abs/2309.06081v1)|null|\n", "2309.05904": "|**2023-09-12**|**Enhancing Representation in Radiography-Reports Foundation Model: A Granular Alignment Algorithm Using Masked Contrastive Learning**|Weijian Huang et.al.|[2309.05904v1](http://arxiv.org/abs/2309.05904v1)|null|\n", "2309.05818": "|**2023-09-11**|**Rice Plant Disease Detection and Diagnosis using Deep Convolutional Neural Networks and Multispectral Imaging**|Yara Ali Alnaggar et.al.|[2309.05818v1](http://arxiv.org/abs/2309.05818v1)|null|\n", "2309.05803": "|**2023-09-11**|**Revisiting Energy Based Models as Policies: Ranking Noise Contrastive Estimation and Interpolating Energy Models**|Sumeet Singh et.al.|[2309.05803v1](http://arxiv.org/abs/2309.05803v1)|null|\n", "2309.05756": "|**2023-09-11**|**TransferDoc: A Self-Supervised Transferable Document Representation Learning Model Unifying Vision and Language**|Souhail Bakkali et.al.|[2309.05756v1](http://arxiv.org/abs/2309.05756v1)|null|\n", "2309.07120": "|**2023-09-13**|**Sight Beyond Text: Multi-Modal Training Enhances LLMs in Truthfulness and Ethics**|Haoqin Tu et.al.|[2309.07120v1](http://arxiv.org/abs/2309.07120v1)|**[link](https://github.com/ucsc-vlaa/sight-beyond-text)**|\n", "2309.07066": "|**2023-09-13**|**CLiFF-LHMP: Using Spatial Dynamics Patterns for Long-Term Human Motion Prediction**|Yufei Zhu et.al.|[2309.07066v1](http://arxiv.org/abs/2309.07066v1)|null|\n", "2309.06799": "|**2023-09-13**|**When Geoscience Meets Foundation Models: Towards General Geoscience Artificial Intelligence System**|Hao Zhang et.al.|[2309.06799v1](http://arxiv.org/abs/2309.06799v1)|null|\n", "2309.06735": "|**2023-09-13**|**GelFlow: Self-supervised Learning of Optical Flow for Vision-Based Tactile Sensor Displacement Measurement**|Zhiyuan Zhang et.al.|[2309.06735v1](http://arxiv.org/abs/2309.06735v1)|null|\n", "2309.06728": "|**2023-09-13**|**Leveraging Foundation models for Unsupervised Audio-Visual Segmentation**|Swapnil Bhosale et.al.|[2309.06728v1](http://arxiv.org/abs/2309.06728v1)|null|\n", "2309.06599": "|**2023-09-12**|**Reasoning with Latent Diffusion in Offline Reinforcement Learning**|Siddarth Venkatraman et.al.|[2309.06599v1](http://arxiv.org/abs/2309.06599v1)|**[link](https://github.com/ldcq/ldcq)**|\n", "2309.06597": "|**2023-09-12**|**Rank2Tell: A Multimodal Driving Dataset for Joint Importance Ranking and Reasoning**|Enna Sachdeva et.al.|[2309.06597v1](http://arxiv.org/abs/2309.06597v1)|null|\n", "2309.06547": "|**2023-09-12**|**AmodalSynthDrive: A Synthetic Amodal Perception Dataset for Autonomous Driving**|Ahmed Rida Sekkat et.al.|[2309.06547v1](http://arxiv.org/abs/2309.06547v1)|null|\n", "2309.06517": "|**2023-09-12**|**Overview of Memotion 3: Sentiment and Emotion Analysis of Codemixed Hinglish Memes**|Shreyash Mishra et.al.|[2309.06517v1](http://arxiv.org/abs/2309.06517v1)|null|\n", "2309.06511": "|**2023-09-12**|**DF-TransFusion: Multimodal Deepfake Detection via Lip-Audio Cross-Attention and Facial Self-Attention**|Aaditya Kharel et.al.|[2309.06511v1](http://arxiv.org/abs/2309.06511v1)|null|\n", "2309.07915": "|**2023-09-14**|**MMICL: Empowering Vision-language Model with Multi-Modal In-Context Learning**|Haozhe Zhao et.al.|[2309.07915v1](http://arxiv.org/abs/2309.07915v1)|**[link](https://github.com/haozhezhao/mic)**|\n", "2309.07794": "|**2023-09-14**|**Improving Multimodal Classification of Social Media Posts by Leveraging Image-Text Auxiliary tasks**|Danae S\u00e1nchez Villegas et.al.|[2309.07794v1](http://arxiv.org/abs/2309.07794v1)|null|\n", "2309.07759": "|**2023-09-14**|**PROGrasp: Pragmatic Human-Robot Communication for Object Grasping**|Gi-Cheon Kang et.al.|[2309.07759v1](http://arxiv.org/abs/2309.07759v1)|null|\n", "2309.07623": "|**2023-09-14**|**SwitchGPT: Adapting Large Language Models for Non-Text Outputs**|Xinyu Wang et.al.|[2309.07623v1](http://arxiv.org/abs/2309.07623v1)|null|\n", "2309.07495": "|**2023-09-14**|**HDTR-Net: A Real-Time High-Definition Teeth Restoration Network for Arbitrary Talking Face Generation Methods**|Yongyuan Li et.al.|[2309.07495v1](http://arxiv.org/abs/2309.07495v1)|**[link](https://github.com/yylgoodlucky/hdtr)**|\n", "2309.07387": "|**2023-09-14**|**VDialogUE: A Unified Evaluation Benchmark for Visually-grounded Dialogue**|Yunshui Li et.al.|[2309.07387v1](http://arxiv.org/abs/2309.07387v1)|null|\n", "2309.07332": "|**2023-09-13**|**Reliability-based cleaning of noisy training labels with inductive conformal prediction in multi-modal biomedical data mining**|Xianghao Zhan et.al.|[2309.07332v1](http://arxiv.org/abs/2309.07332v1)|**[link](https://github.com/xzhan96-stf/icp_train_clean)**|\n", "2309.07297": "|**2023-09-13**|**Multi-Modal Hybrid Learning and Sequential Training for RGB-T Saliency Detection**|Guangyu Ren et.al.|[2309.07297v1](http://arxiv.org/abs/2309.07297v1)|null|\n", "2309.08531": "|**2023-09-15**|**Towards Practical and Efficient Image-to-Speech Captioning with Vision-Language Pre-training and Multi-modal Tokens**|Minsu Kim et.al.|[2309.08531v1](http://arxiv.org/abs/2309.08531v1)|null|\n", "2309.08508": "|**2023-09-15**|**MOSAIC: Learning Unified Multi-Sensory Object Property Representations for Robot Perception**|Gyan Tatiya et.al.|[2309.08508v1](http://arxiv.org/abs/2309.08508v1)|**[link](https://github.com/gtatiya/MOSAIC)**|\n", "2309.08229": "|**2023-09-15**|**Automated Multi-Drugs Administration During Total Intravenous Anesthesia Using Multi-Model Predictive Control**|Bob Aubouin-Pairault et.al.|[2309.08229v1](http://arxiv.org/abs/2309.08229v1)|**[link](https://github.com/bobaubouin/tiva_drug_control)**|\n", "2309.08204": "|**2023-09-15**|**One-stage Modality Distillation for Incomplete Multimodal Learning**|Shicai Wei et.al.|[2309.08204v1](http://arxiv.org/abs/2309.08204v1)|null|\n", "2309.08160": "|**2023-09-15**|**Cross-Modal Synthesis of Structural MRI and Functional Connectivity Networks via Conditional ViT-GANs**|Yuda Bi et.al.|[2309.08160v1](http://arxiv.org/abs/2309.08160v1)|null|\n", "2309.08154": "|**2023-09-15**|**Uncertainty-Aware Multi-View Visual Semantic Embedding**|Wenzhang Wei et.al.|[2309.08154v1](http://arxiv.org/abs/2309.08154v1)|null|\n", "2309.08096": "|**2023-09-15**|**GelSplitter: Tactile Reconstruction from Near Infrared and Visible Images**|Yuankai Lin et.al.|[2309.08096v1](http://arxiv.org/abs/2309.08096v1)|null|\n", "2309.08088": "|**2023-09-15**|**Interactive Model Fusion-Based GM-PHD Filter**|Jiacheng He et.al.|[2309.08088v1](http://arxiv.org/abs/2309.08088v1)|null|\n", "2309.08021": "|**2023-09-14**|**Vision-based Analysis of Driver Activity and Driving Performance Under the Influence of Alcohol**|Ross Greer et.al.|[2309.08021v1](http://arxiv.org/abs/2309.08021v1)|null|\n", "2309.09958": "|**2023-09-18**|**An Empirical Study of Scaling Instruct-Tuned Large Multimodal Models**|Yadong Lu et.al.|[2309.09958v1](http://arxiv.org/abs/2309.09958v1)|**[link](https://github.com/haotian-liu/LLaVA)**|\n", "2309.09875": "|**2023-09-18**|**RaLF: Flow-based Global and Metric Radar Localization in LiDAR Maps**|Abhijeet Nayak et.al.|[2309.09875v1](http://arxiv.org/abs/2309.09875v1)|null|\n", "2309.09867": "|**2023-09-18**|**EGFE: End-to-end Grouping of Fragmented Elements in UI Designs with Multimodal Learning**|Liuqing Chen et.al.|[2309.09867v1](http://arxiv.org/abs/2309.09867v1)|**[link](https://github.com/test2975/egfe)**|\n", "2309.09832": "|**2023-09-18**|**Task Selection and Assignment for Multi-modal Multi-task Dialogue Act Classification with Non-stationary Multi-armed Bandits**|Xiangheng He et.al.|[2309.09832v1](http://arxiv.org/abs/2309.09832v1)|null|\n", "2309.09667": "|**2023-09-18**|**Unified Frequency-Assisted Transformer Framework for Detecting and Grounding Multi-Modal Manipulation**|Huan Liu et.al.|[2309.09667v1](http://arxiv.org/abs/2309.09667v1)|null|\n", "2309.09646": "|**2023-09-18**|**Concurrent Haptic, Audio, and Visual Data Set During Bare Finger Interaction with Textured Surfaces**|Alexis W. M. Devillard et.al.|[2309.09646v1](http://arxiv.org/abs/2309.09646v1)|null|\n", "2309.09592": "|**2023-09-18**|**Multi-Semantic Fusion Model for Generalized Zero-Shot Skeleton-Based Action Recognition**|Ming-Zhe Li et.al.|[2309.09592v1](http://arxiv.org/abs/2309.09592v1)|**[link](https://github.com/EHZ9NIWI7/MSF-GZSSAR)**|\n", "2309.09513": "|**2023-09-18**|**Learning Parallax for Stereo Event-based Motion Deblurring**|Mingyuan Lin et.al.|[2309.09513v1](http://arxiv.org/abs/2309.09513v1)|null|\n", "2309.09501": "|**2023-09-18**|**Discovering Sounding Objects by Audio Queries for Audio Visual Segmentation**|Shaofei Huang et.al.|[2309.09501v1](http://arxiv.org/abs/2309.09501v1)|null|\n", "2309.09473": "|**2023-09-18**|**Self-supervised Multi-view Clustering in Computer Vision: A Survey**|Jiatai Wang et.al.|[2309.09473v1](http://arxiv.org/abs/2309.09473v1)|null|\n", "2309.09421": "|**2023-09-18**|**Unified Pretraining Target Based Video-music Retrieval With Music Rhythm And Video Optical Flow Information**|Tianjun Mao et.al.|[2309.09421v1](http://arxiv.org/abs/2309.09421v1)|null|\n", "2309.09246": "|**2023-09-17**|**Image-level supervision and self-training for transformer-based cross-modality tumor segmentation**|Malo de Boisredon et.al.|[2309.09246v1](http://arxiv.org/abs/2309.09246v1)|null|\n", "2309.09088": "|**2023-09-16**|**Enhancing GAN-Based Vocoders with Contrastive Learning Under Data-limited Condition**|Haoming Guo et.al.|[2309.09088v1](http://arxiv.org/abs/2309.09088v1)|null|\n", "2309.09067": "|**2023-09-19**|**MMST-ViT: Climate Change-aware Crop Yield Prediction via Multi-Modal Spatial-Temporal Vision Transformer**|Fudong Lin et.al.|[2309.09067v2](http://arxiv.org/abs/2309.09067v2)|**[link](https://github.com/fudong03/mmst-vit)**|\n", "2309.08966": "|**2023-09-16**|**FF-LOGO: Cross-Modality Point Cloud Registration with Feature Filtering and Local to Global Optimization**|Nan Ma et.al.|[2309.08966v1](http://arxiv.org/abs/2309.08966v1)|null|\n", "2309.10724": "|**2023-09-19**|**Sound Source Localization is All about Cross-Modal Alignment**|Arda Senocak et.al.|[2309.10724v1](http://arxiv.org/abs/2309.10724v1)|null|\n", "2309.10649": "|**2023-09-19**|**Cross-modal and Cross-domain Knowledge Transfer for Label-free 3D Segmentation**|Jingyu Zhang et.al.|[2309.10649v1](http://arxiv.org/abs/2309.10649v1)|null|\n", "2309.10606": "|**2023-09-19**|**A Novel Hybrid Algorithm for Optimized Solutions in Ocean Renewable Energy Industry: Enhancing Power Take-Off Parameters and Site Selection Procedure of Wave Energy Converters**|Hossein Mehdipour et.al.|[2309.10606v1](http://arxiv.org/abs/2309.10606v1)|null|\n", "2309.10537": "|**2023-09-19**|**FoleyGen: Visually-Guided Audio Generation**|Xinhao Mei et.al.|[2309.10537v1](http://arxiv.org/abs/2309.10537v1)|null|\n", "2309.10365": "|**2023-09-19**|**Testable Likelihoods for Beyond-the-Standard Model Fits**|Anja Beck et.al.|[2309.10365v1](http://arxiv.org/abs/2309.10365v1)|null|\n", "2309.10361": "|**2023-09-19**|**Improving CLIP Robustness with Knowledge Distillation and Self-Training**|Clement Laroudie et.al.|[2309.10361v1](http://arxiv.org/abs/2309.10361v1)|null|\n", "2309.10283": "|**2023-09-19**|**FRAMU: Attention-based Machine Unlearning using Federated Reinforcement Learning**|Thanveer Shaik et.al.|[2309.10283v1](http://arxiv.org/abs/2309.10283v1)|null|\n", "2309.10244": "|**2023-09-19**|**UPL-SFDA: Uncertainty-aware Pseudo Label Guided Source-Free Domain Adaptation for Medical Image Segmentation**|Jianghao Wu et.al.|[2309.10244v1](http://arxiv.org/abs/2309.10244v1)|**[link](https://github.com/hilab-git/upl-sfda)**|\n", "2309.10195": "|**2023-09-20**|**Multi-modality Meets Re-learning: Mitigating Negative Transfer in Sequential Recommendation**|Bo Peng et.al.|[2309.10195v2](http://arxiv.org/abs/2309.10195v2)|null|\n", "2309.10091": "|**2023-09-18**|**Unified Coarse-to-Fine Alignment for Video-Text Retrieval**|Ziyang Wang et.al.|[2309.10091v1](http://arxiv.org/abs/2309.10091v1)|**[link](https://github.com/ziyang412/ucofia)**|\n", "2309.10077": "|**2023-09-18**|**GAME: Generalized deep learning model towards multimodal data integration for early screening of adolescent mental disorders**|Zhicheng Du et.al.|[2309.10077v1](http://arxiv.org/abs/2309.10077v1)|null|\n", "2309.11335": "|**2023-09-20**|**2D-3D Pose Tracking with Multi-View Constraints**|Huai Yu et.al.|[2309.11335v1](http://arxiv.org/abs/2309.11335v1)|null|\n", "2309.11119": "|**2023-09-21**|**BroadBEV: Collaborative LiDAR-camera Fusion for Broad-sighted Bird's Eye View Map Construction**|Minsu Kim et.al.|[2309.11119v2](http://arxiv.org/abs/2309.11119v2)|null|\n", "2309.11082": "|**2023-09-20**|**Dual-Modal Attention-Enhanced Text-Video Retrieval with Triplet Partial Margin Contrastive Learning**|Chen Jiang et.al.|[2309.11082v1](http://arxiv.org/abs/2309.11082v1)|null|\n", "2309.11081": "|**2023-09-20**|**Dense 2D-3D Indoor Prediction with Sound via Aligned Cross-Modal Distillation**|Heeseung Yun et.al.|[2309.11081v1](http://arxiv.org/abs/2309.11081v1)|**[link](https://github.com/hs-yn/daps)**|\n", "2309.12314": "|**2023-09-21**|**TinyCLIP: CLIP Distillation via Affinity Mimicking and Weight Inheritance**|Kan Wu et.al.|[2309.12314v1](http://arxiv.org/abs/2309.12314v1)|**[link](https://github.com/microsoft/Cream/tree/main/TinyCLIP)**|\n", "2309.12224": "|**2023-09-21**|**Towards Answering Health-related Questions from Medical Videos: Datasets and Approaches**|Deepak Gupta et.al.|[2309.12224v1](http://arxiv.org/abs/2309.12224v1)|null|\n", "2309.12158": "|**2023-09-21**|**Towards Robust and Truly Large-Scale Audio-Sheet Music Retrieval**|Luis Carvalho et.al.|[2309.12158v1](http://arxiv.org/abs/2309.12158v1)|null|\n", "2309.12134": "|**2023-09-21**|**Self-Supervised Contrastive Learning for Robust Audio-Sheet Music Retrieval Systems**|Luis Carvalho et.al.|[2309.12134v1](http://arxiv.org/abs/2309.12134v1)|null|\n", "2309.12111": "|**2023-09-21**|**Passage Summarization with Recurrent Models for Audio-Sheet Music Retrieval**|Luis Carvalho et.al.|[2309.12111v1](http://arxiv.org/abs/2309.12111v1)|null|\n", "2309.12110": "|**2023-09-21**|**Exploiting CLIP-based Multi-modal Approach for Artwork Classification and Retrieval**|Alberto Baldrati et.al.|[2309.12110v1](http://arxiv.org/abs/2309.12110v1)|null|\n", "2309.12030": "|**2023-09-21**|**CAMERA: A Multimodal Dataset and Benchmark for Ad Text Generation**|Masato Mita et.al.|[2309.12030v1](http://arxiv.org/abs/2309.12030v1)|**[link](https://github.com/cyberagentailab/camera)**|\n", "2309.12009": "|**2023-09-21**|**Elevating Skeleton-Based Action Recognition with Efficient Multi-Modality Self-Supervision**|Yiping Wei et.al.|[2309.12009v1](http://arxiv.org/abs/2309.12009v1)|**[link](https://github.com/desehuileng0o0/ikem)**|\n", "2309.11933": "|**2023-09-21**|**Fully Transformer-Equipped Architecture for End-to-End Referring Video Object Segmentation**|Ping Li et.al.|[2309.11933v1](http://arxiv.org/abs/2309.11933v1)|null|\n", "2309.11923": "|**2023-09-21**|**TextCLIP: Text-Guided Face Image Generation And Manipulation Without Adversarial Training**|Xiaozhou You et.al.|[2309.11923v1](http://arxiv.org/abs/2309.11923v1)|null|\n", "2309.11860": "|**2023-09-21**|**QUEST: An Efficient Query Evaluation Scheme Towards Scan-Intensive Cross-Model Analysis**|Jianfeng Huang et.al.|[2309.11860v1](http://arxiv.org/abs/2309.11860v1)|null|\n", "2309.11845": "|**2023-09-21**|**TMac: Temporal Multi-Modal Graph Learning for Acoustic Event Classification**|Meng Liu et.al.|[2309.11845v1](http://arxiv.org/abs/2309.11845v1)|**[link](https://github.com/mgithubl/tmac)**|\n", "2309.11839": "|**2023-09-21**|**MoPA: Multi-Modal Prior Aided Domain Adaptation for 3D Semantic Segmentation**|Haozhi Cao et.al.|[2309.11839v1](http://arxiv.org/abs/2309.11839v1)|null|\n", "2309.11837": "|**2023-09-21**|**Stellar model calibrations with the Ai Phe binary system. Open questions about the robustness of the fit**|G. Valle et.al.|[2309.11837v1](http://arxiv.org/abs/2309.11837v1)|null|\n", "2309.11755": "|**2023-09-21**|**2DDATA: 2D Detection Annotations Transmittable Aggregation for Semantic Segmentation on Point Cloud**|Guan-Cheng Lee et.al.|[2309.11755v1](http://arxiv.org/abs/2309.11755v1)|null|\n", "2309.13007": "|**2023-09-22**|**ReConcile: Round-Table Conference Improves Reasoning via Consensus among Diverse LLMs**|Justin Chih-Yao Chen et.al.|[2309.13007v1](http://arxiv.org/abs/2309.13007v1)|**[link](https://github.com/dinobby/reconcile)**|\n", "2309.12865": "|**2023-09-22**|**Bridging Sensor Gaps via Single-Direction Tuning for Hyperspectral Image Classification**|Xizhe Xue et.al.|[2309.12865v1](http://arxiv.org/abs/2309.12865v1)|**[link](https://github.com/cecilia-xue/hyt-nas)**|\n", "2309.12855": "|**2023-09-22**|**Cross-Modal Translation and Alignment for Survival Analysis**|Fengtao Zhou et.al.|[2309.12855v1](http://arxiv.org/abs/2309.12855v1)|**[link](https://github.com/ft-zhou-zzz/cmta)**|\n", "2309.12764": "|**2023-09-22**|**Multi-Modal Embeddings for Isolating Cross-Platform Coordinated Information Campaigns on Social Media**|Fabio Barbero et.al.|[2309.12764v1](http://arxiv.org/abs/2309.12764v1)|null|\n", "2309.12657": "|**2023-09-22**|**Exploiting Modality-Specific Features For Multi-Modal Manipulation Detection And Grounding**|Jiazhen Wang et.al.|[2309.12657v1](http://arxiv.org/abs/2309.12657v1)|null|\n", "2309.12572": "|**2023-09-22**|**Interpretable 3D Multi-Modal Residual Convolutional Neural Network for Mild Traumatic Brain Injury Diagnosis**|Hanem Ellethy et.al.|[2309.12572v1](http://arxiv.org/abs/2309.12572v1)|null|\n", "2309.14327": "|**2023-09-25**|**DeepSpeed-VisualChat: Multi-Round Multi-Image Interleave Chat via Multi-Modal Causal Attention**|Zhewei Yao et.al.|[2309.14327v1](http://arxiv.org/abs/2309.14327v1)|**[link](https://github.com/microsoft/deepspeedexamples)**|\n", "2309.14320": "|**2023-09-25**|**MUTEX: Learning Unified Policies from Multimodal Task Specifications**|Rutav Shah et.al.|[2309.14320v1](http://arxiv.org/abs/2309.14320v1)|null|\n", "2309.14203": "|**2023-09-25**|**Detecting and Grounding Multi-Modal Media Manipulation and Beyond**|Rui Shao et.al.|[2309.14203v1](http://arxiv.org/abs/2309.14203v1)|**[link](https://github.com/rshaojimmy/multimodal-deepfake)**|\n", "2309.14183": "|**2023-09-26**|**Species196: A One-Million Semi-supervised Dataset for Fine-grained Species Recognition**|Wei He et.al.|[2309.14183v2](http://arxiv.org/abs/2309.14183v2)|**[link](https://github.com/Species-Dataset/species-dataset.github.io)**|\n", "2309.14181": "|**2023-09-25**|**Q-Bench: A Benchmark for General-Purpose Foundation Models on Low-level Vision**|Haoning Wu et.al.|[2309.14181v1](http://arxiv.org/abs/2309.14181v1)|**[link](https://github.com/VQAssessment/Q-Bench)**|\n", "2309.14065": "|**2023-09-26**|**AsymFormer: Asymmetrical Cross-Modal Representation Learning for Mobile Platform Real-Time RGB-D Semantic Segmentation**|Siqi Du et.al.|[2309.14065v2](http://arxiv.org/abs/2309.14065v2)|**[link](https://github.com/Fourier7754/AsymFormer)**|\n", "2309.14050": "|**2023-09-26**|**NNgTL: Neural Network Guided Optimal Temporal Logic Task Planning for Mobile Robots**|Ruijia Liu et.al.|[2309.14050v2](http://arxiv.org/abs/2309.14050v2)|null|\n", "2309.14003": "|**2023-09-25**|**Hierarchical Imitation Learning for Stochastic Environments**|Maximilian Igl et.al.|[2309.14003v1](http://arxiv.org/abs/2309.14003v1)|null|\n", "2309.13770": "|**2023-09-24**|**Devil in the Number: Towards Robust Multi-modality Data Filter**|Yichen Xu et.al.|[2309.13770v1](http://arxiv.org/abs/2309.13770v1)|null|\n", "2309.13650": "|**2023-09-24**|**Cross-modal Alignment with Optimal Transport for CTC-based ASR**|Xugang Lu et.al.|[2309.13650v1](http://arxiv.org/abs/2309.13650v1)|null|\n", "2309.13554": "|**2023-09-24**|**A Novel Stochastic Interacting Particle-Field Algorithm for 3D Parabolic-Parabolic Keller-Segel Chemotaxis System**|Zhongjian Wang et.al.|[2309.13554v1](http://arxiv.org/abs/2309.13554v1)|null|\n", "2309.13504": "|**2023-09-23**|**Attention Is All You Need For Blind Room Volume Estimation**|Chunxi Wang et.al.|[2309.13504v1](http://arxiv.org/abs/2309.13504v1)|null|\n", "2309.13470": "|**2023-09-23**|**HAVE-Net: Hallucinated Audio-Visual Embeddings for Few-Shot Classification with Unimodal Cues**|Ankit Jha et.al.|[2309.13470v1](http://arxiv.org/abs/2309.13470v1)|null|\n", "2309.13322": "|**2023-09-23**|**From Text to Source: Results in Detecting Large Language Model-Generated Content**|Wissam Antoun et.al.|[2309.13322v1](http://arxiv.org/abs/2309.13322v1)|null|\n", "2309.13266": "|**2023-09-23**|**Robust Navigation with Cross-Modal Fusion and Knowledge Transfer**|Wenzhe Cai et.al.|[2309.13266v1](http://arxiv.org/abs/2309.13266v1)|**[link](https://github.com/wzcai99/Distill-Navigator)**|\n", "2309.15117": "|**2023-09-26**|**Generating Visual Scenes from Touch**|Fengyu Yang et.al.|[2309.15117v1](http://arxiv.org/abs/2309.15117v1)|null|\n", "2309.15112": "|**2023-09-27**|**InternLM-XComposer: A Vision-Language Large Model for Advanced Text-image Comprehension and Composition**|Pan Zhang et.al.|[2309.15112v2](http://arxiv.org/abs/2309.15112v2)|**[link](https://github.com/internlm/internlm-xcomposer)**|\n", "2309.15109": "|**2023-09-26**|**DistillBEV: Boosting Multi-Camera 3D Object Detection with Cross-Modal Knowledge Distillation**|Zeyu Wang et.al.|[2309.15109v1](http://arxiv.org/abs/2309.15109v1)|**[link](https://github.com/qcraftai/distill-bev)**|\n", "2309.15082": "|**2023-09-26**|**RPEFlow: Multimodal Fusion of RGB-PointCloud-Event for Joint Optical Flow and Scene Flow Estimation**|Zhexiong Wan et.al.|[2309.15082v1](http://arxiv.org/abs/2309.15082v1)|**[link](https://github.com/danqu130/RPEFlow)**|\n", "2309.14704": "|**2023-09-26**|**Tile Classification Based Viewport Prediction with Multi-modal Fusion Transformer**|Zhihao Zhang et.al.|[2309.14704v1](http://arxiv.org/abs/2309.14704v1)|null|\n", "2309.14673": "|**2023-09-26**|**ALEX: Towards Effective Graph Transfer Learning with Noisy Labels**|Jingyang Yuan et.al.|[2309.14673v1](http://arxiv.org/abs/2309.14673v1)|null|\n", "2309.14611": "|**2023-09-26**|**Event Stream-based Visual Object Tracking: A High-Resolution Benchmark Dataset and A Novel Baseline**|Xiao Wang et.al.|[2309.14611v1](http://arxiv.org/abs/2309.14611v1)|**[link](https://github.com/event-ahu/eventvot_benchmark)**|\n", "2309.14580": "|**2023-09-26**|**CWCL: Cross-Modal Transfer with Continuously Weighted Contrastive Loss**|Rakshith Sharma Srinivasa et.al.|[2309.14580v1](http://arxiv.org/abs/2309.14580v1)|null|\n", "2309.14516": "|**2023-09-25**|**UniBEV: Multi-modal 3D Object Detection with Uniform BEV Encoders for Robustness against Missing Sensor Modalities**|Shiming Wang et.al.|[2309.14516v1](http://arxiv.org/abs/2309.14516v1)|null|\n", "2309.14491": "|**2023-09-25**|**Unsupervised 3D Perception with 2D Vision-Language Distillation for Autonomous Driving**|Mahyar Najibi et.al.|[2309.14491v1](http://arxiv.org/abs/2309.14491v1)|null|\n", "2309.15826": "|**2023-09-27**|**Cross-Modal Multi-Tasking for Speech-to-Text Translation via Hard Parameter Sharing**|Brian Yan et.al.|[2309.15826v1](http://arxiv.org/abs/2309.15826v1)|null|\n", "2309.15751": "|**2023-09-27**|**InfraParis: A multi-modal and multi-task autonomous driving dataset**|Gianni Franchi et.al.|[2309.15751v1](http://arxiv.org/abs/2309.15751v1)|null|\n", "2309.15739": "|**2023-09-27**|**Experience and Evidence are the eyes of an excellent summarizer! Towards Knowledge Infused Multi-modal Clinical Conversation Summarization**|Abhisek Tiwari et.al.|[2309.15739v1](http://arxiv.org/abs/2309.15739v1)|**[link](https://github.com/nlp-rl/mm-cliconsummation)**|\n", "2309.15683": "|**2023-09-27**|**End-to-End Streaming Video Temporal Action Segmentation with Reinforce Learning**|Wujun Wen et.al.|[2309.15683v1](http://arxiv.org/abs/2309.15683v1)|**[link](https://github.com/Thinksky5124/SVTAS)**|\n", "2309.15599": "|**2023-09-27**|**OceanBench: The Sea Surface Height Edition**|J. Emmanuel Johnson et.al.|[2309.15599v1](http://arxiv.org/abs/2309.15599v1)|**[link](https://github.com/jejjohnson/oceanbench)**|\n", "2309.15529": "|**2023-09-27**|**Missing-modality Enabled Multi-modal Fusion Architecture for Medical Data**|Muyu Wang et.al.|[2309.15529v1](http://arxiv.org/abs/2309.15529v1)|null|\n", "2309.15427": "|**2023-09-27**|**Graph Neural Prompting with Large Language Models**|Yijun Tian et.al.|[2309.15427v1](http://arxiv.org/abs/2309.15427v1)|null|\n", "2309.15402": "|**2023-09-27**|**A Survey of Chain of Thought Reasoning: Advances, Frontiers and Future**|Zheng Chu et.al.|[2309.15402v1](http://arxiv.org/abs/2309.15402v1)|**[link](https://github.com/zchuz/cot-reasoning-survey)**|\n", "2309.15390": "|**2023-09-27**|**MINS: Efficient and Robust Multisensor-aided Inertial Navigation System**|Woosik Lee et.al.|[2309.15390v1](http://arxiv.org/abs/2309.15390v1)|**[link](https://github.com/rpng/mins)**|\n", "2309.15313": "|**2023-09-26**|**M$^{3}$3D: Learning 3D priors using Multi-Modal Masked Autoencoders for 2D image and video understanding**|Muhammad Abdullah Jamal et.al.|[2309.15313v1](http://arxiv.org/abs/2309.15313v1)|null|\n", "2309.15302": "|**2023-09-26**|**Self-Supervised Terrain Representation Learning from Unconstrained Robot Experience**|Haresh Karnan et.al.|[2309.15302v1](http://arxiv.org/abs/2309.15302v1)|null|\n", "2309.15283": "|**2023-09-26**|**Multi-Modal Planning on Regrasping for Stable Manipulation**|Jiaming Hu et.al.|[2309.15283v1](http://arxiv.org/abs/2309.15283v1)|null|\n", "2309.16592": "|**2023-09-28**|**Tensor Factorization for Leveraging Cross-Modal Knowledge in Data-Constrained Infrared Object Detection**|Manish Sharma et.al.|[2309.16592v1](http://arxiv.org/abs/2309.16592v1)|null|\n", "2309.16569": "|**2023-09-28**|**Audio-Visual Speaker Verification via Joint Cross-Attention**|R. Gnana Praveen et.al.|[2309.16569v1](http://arxiv.org/abs/2309.16569v1)|null|\n", "2309.16283": "|**2023-09-28**|**Self-supervised Cross-view Representation Reconstruction for Change Captioning**|Yunbin Tu et.al.|[2309.16283v1](http://arxiv.org/abs/2309.16283v1)|null|\n", "2309.16211": "|**2023-09-28**|**VDC: Versatile Data Cleanser for Detecting Dirty Samples via Visual-Linguistic Inconsistency**|Zihao Zhu et.al.|[2309.16211v1](http://arxiv.org/abs/2309.16211v1)|null|\n", "2309.16206": "|**2023-09-28**|**Cross-Modal Transformer GAN: Brain Structural-Functional Deep Fusing Network for Alzheimer's Disease Analysis**|Qiankun Zuo et.al.|[2309.16206v1](http://arxiv.org/abs/2309.16206v1)|null|\n", "2309.16203": "|**2023-09-28**|**The Cloud Strikes Back: Investigating the Decentralization of IPFS**|Leonhard Balduf et.al.|[2309.16203v1](http://arxiv.org/abs/2309.16203v1)|null|\n", "2309.16141": "|**2023-09-28**|**Align before Search: Aligning Ads Image to Text for Accurate Cross-Modal Sponsored Search**|Yuanmin Tang et.al.|[2309.16141v1](http://arxiv.org/abs/2309.16141v1)|**[link](https://github.com/pter61/aligncmss)**|\n", "2309.16093": "|**2023-09-28**|**Hierarchical Cross-Modality Knowledge Transfer with Sinkhorn Attention for CTC-based ASR**|Xugang Lu et.al.|[2309.16093v1](http://arxiv.org/abs/2309.16093v1)|null|\n", "2309.15954": "|**2023-09-27**|**The Devil is in the Details: A Deep Dive into the Rabbit Hole of Data Filtering**|Haichao Yu et.al.|[2309.15954v1](http://arxiv.org/abs/2309.15954v1)|null|\n", "2309.15915": "|**2023-09-27**|**Zero-Shot and Few-Shot Video Question Answering with Multi-Modal Prompts**|Deniz Engin et.al.|[2309.15915v1](http://arxiv.org/abs/2309.15915v1)|**[link](https://github.com/engindeniz/vitis)**|\n", "2309.17395": "|**2023-09-29**|**AV-CPL: Continuous Pseudo-Labeling for Audio-Visual Speech Recognition**|Andrew Rouditchenko et.al.|[2309.17395v1](http://arxiv.org/abs/2309.17395v1)|null|\n", "2309.17336": "|**2023-09-29**|**See Beyond Seeing: Robust 3D Object Detection from Point Clouds via Cross-Modal Hallucination**|Jianning Deng et.al.|[2309.17336v1](http://arxiv.org/abs/2309.17336v1)|null|\n", "2309.17264": "|**2023-09-29**|**A Foundation Model for General Moving Object Segmentation in Medical Images**|Zhongnuo Yan et.al.|[2309.17264v1](http://arxiv.org/abs/2309.17264v1)|null|\n", "2309.17239": "|**2023-09-29**|**EGVD: Event-Guided Video Deraining**|Yueyi Zhang et.al.|[2309.17239v1](http://arxiv.org/abs/2309.17239v1)|**[link](https://github.com/booker-max/egvd)**|\n", "2309.17175": "|**2023-09-29**|**TextField3D: Towards Enhancing Open-Vocabulary 3D Generation with Noisy Text Fields**|Tianyu Huang et.al.|[2309.17175v1](http://arxiv.org/abs/2309.17175v1)|null|\n", "2309.17133": "|**2023-09-29**|**Fine-grained Late-interaction Multi-modal Retrieval for Retrieval Augmented Visual Question Answering**|Weizhe Lin et.al.|[2309.17133v1](http://arxiv.org/abs/2309.17133v1)|**[link](https://github.com/linweizhedragon/retrieval-augmented-visual-question-answering)**|\n", "2309.17104": "|**2023-10-03**|**Prototype-guided Cross-modal Completion and Alignment for Incomplete Text-based Person Re-identification**|Tiantian Gong et.al.|[2309.17104v2](http://arxiv.org/abs/2309.17104v2)|null|\n", "2309.17102": "|**2023-09-29**|**Guiding Instruction-based Image Editing via Multimodal Large Language Models**|Tsu-Jui Fu et.al.|[2309.17102v1](http://arxiv.org/abs/2309.17102v1)|**[link](https://github.com/tsujuifu/pytorch_mgie)**|\n", "2309.17093": "|**2023-09-29**|**Prototype-based Aleatoric Uncertainty Quantification for Cross-modal Retrieval**|Hao Li et.al.|[2309.17093v1](http://arxiv.org/abs/2309.17093v1)|**[link](https://github.com/leolee99/pau)**|\n", "2309.17037": "|**2023-09-29**|**Beyond Co-occurrence: Multi-modal Session-based Recommendation**|Xiaokun Zhang et.al.|[2309.17037v1](http://arxiv.org/abs/2309.17037v1)|**[link](https://github.com/zhang-xiaokun/mmsbr)**|\n", "2309.16984": "|**2023-09-29**|**Consistency Models as a Rich and Efficient Policy Class for Reinforcement Learning**|Zihan Ding et.al.|[2309.16984v1](http://arxiv.org/abs/2309.16984v1)|null|\n", "2309.16949": "|**2023-09-29**|**CrossZoom: Simultaneously Motion Deblurring and Event Super-Resolving**|Chi Zhang et.al.|[2309.16949v1](http://arxiv.org/abs/2309.16949v1)|**[link](https://github.com/bestrivenzc/CZ-Net)**|\n", "2309.16830": "|**2023-09-28**|**Robust Safe Control with Multi-Modal Uncertainty**|Tianhao Wei et.al.|[2309.16830v1](http://arxiv.org/abs/2309.16830v1)|null|\n", "2309.16818": "|**2023-09-28**|**MEM: Multi-Modal Elevation Mapping for Robotics and Learning**|Gian Erni et.al.|[2309.16818v1](http://arxiv.org/abs/2309.16818v1)|**[link](https://github.com/leggedrobotics/elevation_mapping_cupy)**|\n", "2309.16772": "|**2023-10-02**|**XVO: Generalized Visual Odometry via Cross-Modal Self-Training**|Lei Lai et.al.|[2309.16772v2](http://arxiv.org/abs/2309.16772v2)|null|\n", "2310.02071": "|**2023-10-03**|**Towards End-to-End Embodied Decision Making via Multi-modal Large Language Model: Explorations with GPT4-Vision and Beyond**|Liang Chen et.al.|[2310.02071v1](http://arxiv.org/abs/2310.02071v1)|**[link](https://github.com/pkunlp-icler/pca-eval)**|\n", "2310.02050": "|**2023-10-03**|**Tuning Large language model for End-to-end Speech Translation**|Hao Zhang et.al.|[2310.02050v1](http://arxiv.org/abs/2310.02050v1)|null|\n", "2310.01852": "|**2023-10-04**|**LanguageBind: Extending Video-Language Pretraining to N-modality by Language-based Semantic Alignment**|Bin Zhu et.al.|[2310.01852v2](http://arxiv.org/abs/2310.01852v2)|**[link](https://github.com/pku-yuangroup/languagebind)**|\n", "2310.01733": "|**2023-10-03**|**Health Guardian: Using Multi-modal Data to Understand Individual Health**|Vince S. Siu et.al.|[2310.01733v1](http://arxiv.org/abs/2310.01733v1)|null|\n", "2310.01358": "|**2023-10-02**|**NEUCORE: Neural Concept Reasoning for Composed Image Retrieval**|Shu Zhao et.al.|[2310.01358v1](http://arxiv.org/abs/2310.01358v1)|null|\n", "2310.01351": "|**2023-10-02**|**Streaming Motion Forecasting for Autonomous Driving**|Ziqi Pang et.al.|[2310.01351v1](http://arxiv.org/abs/2310.01351v1)|**[link](https://github.com/ziqipang/streamingforecasting)**|\n", "2310.01330": "|**2023-10-02**|**Towards reporting bias in visual-language datasets: bimodal augmentation by decoupling object-attribute association**|Qiyu Wu et.al.|[2310.01330v1](http://arxiv.org/abs/2310.01330v1)|null|\n", "2310.01286": "|**2023-10-02**|**A Dynamic Macroscopic Framework for Pricing of Ride-hailing Services with an Optional Bus Lane Access for Pool Vehicles**|Lynn Fayed et.al.|[2310.01286v1](http://arxiv.org/abs/2310.01286v1)|null|\n", "2310.01232": "|**2023-10-02**|**Modality-aware Transformer for Time series Forecasting**|Hajar Emami et.al.|[2310.01232v1](http://arxiv.org/abs/2310.01232v1)|null|\n", "2310.01035": "|**2023-10-02**|**Learnable Cross-modal Knowledge Distillation for Multi-modal Learning with Missing Modality**|Hu Wang et.al.|[2310.01035v1](http://arxiv.org/abs/2310.01035v1)|null|\n", "2310.00927": "|**2023-10-02**|**Understanding Transferable Representation Learning and Zero-shot Transfer in CLIP**|Zixiang Chen et.al.|[2310.00927v1](http://arxiv.org/abs/2310.00927v1)|null|\n", "2310.00862": "|**2023-10-02**|**Shack-Hartmann wavefront sensing: A new approach to time-resolved measurement of stress intensity during dynamic fracture of small brittle specimens**|Liuchi Li et.al.|[2310.00862v1](http://arxiv.org/abs/2310.00862v1)|null|\n", "2310.00745": "|**2023-10-01**|**Deterministic Langevin Unconstrained Optimization with Normalizing Flows**|James M. Sullivan et.al.|[2310.00745v1](http://arxiv.org/abs/2310.00745v1)|null|\n", "2310.00740": "|**2023-10-01**|**Top-down Green-ups: Satellite Sensing and Deep Models to Predict Buffelgrass Phenology**|Lucas Rosenblatt et.al.|[2310.00740v1](http://arxiv.org/abs/2310.00740v1)|**[link](https://github.com/lurosenb/phenology_projects)**|\n", "2310.00672": "|**2023-10-01**|**GeRA: Label-Efficient Geometrically Regularized Alignment**|Dustin Klebe et.al.|[2310.00672v1](http://arxiv.org/abs/2310.00672v1)|null|\n", "2310.03024": "|**2023-10-04**|**AstroCLIP: Cross-Modal Pre-Training for Astronomical Foundation Models**|Francois Lanusse et.al.|[2310.03024v1](http://arxiv.org/abs/2310.03024v1)|**[link](https://github.com/PolymathicAI/AstroCLIP)**|\n", "2310.02960": "|**2023-10-04**|**CoDA: Collaborative Novel Box Discovery and Cross-modal Alignment for Open-vocabulary 3D Object Detection**|Yang Cao et.al.|[2310.02960v1](http://arxiv.org/abs/2310.02960v1)|**[link](https://github.com/yangcaoai/CoDA_NeurIPS2023)**|\n", "2310.02821": "|**2023-10-04**|**Improving Vision Anomaly Detection with the Guidance of Language Modality**|Dong Chen et.al.|[2310.02821v1](http://arxiv.org/abs/2310.02821v1)|**[link](https://github.com/Anfeather/CMG)**|\n", "2310.02777": "|**2023-10-04**|**The Role of Linguistic Priors in Measuring Compositional Generalization of Vision-Language Models**|Chenwei Wu et.al.|[2310.02777v1](http://arxiv.org/abs/2310.02777v1)|null|\n", "2310.02690": "|**2023-10-04**|**Multi-Dimension-Embedding-Aware Modality Fusion Transformer for Psychiatric Disorder Clasification**|Guoxin Wang et.al.|[2310.02690v1](http://arxiv.org/abs/2310.02690v1)|null|\n", "2310.02663": "|**2023-10-04**|**MedPrompt: Cross-Modal Prompting for Multi-Task Medical Image Translation**|Xuhang Chen et.al.|[2310.02663v1](http://arxiv.org/abs/2310.02663v1)|null|\n", "2310.02569": "|**2023-10-04**|**ReForm-Eval: Evaluating Large Vision Language Models via Unified Re-Formulation of Task-Oriented Benchmarks**|Zejun Li et.al.|[2310.02569v1](http://arxiv.org/abs/2310.02569v1)|**[link](https://github.com/fudandisc/reform-eval)**|\n", "2310.02561": "|**2023-10-04**|**Integrated Sensing and Communications towards Proactive Beamforming in mmWave V2I via Multi-Modal Feature Fusion (MMFF)**|Haotian Zhang et.al.|[2310.02561v1](http://arxiv.org/abs/2310.02561v1)|null|\n", "2310.02528": "|**2023-10-04**|**On the Cognition of Visual Question Answering Models and Human Intelligence: A Comparative Study**|Liben Chen et.al.|[2310.02528v1](http://arxiv.org/abs/2310.02528v1)|null|\n", "2310.02361": "|**2023-10-03**|**Event-Enhanced Multi-Modal Spiking Neural Network for Dynamic Obstacle Avoidance**|Yang Wang et.al.|[2310.02361v1](http://arxiv.org/abs/2310.02361v1)|null|\n", "2310.03744": "|**2023-10-05**|**Improved Baselines with Visual Instruction Tuning**|Haotian Liu et.al.|[2310.03744v1](http://arxiv.org/abs/2310.03744v1)|null|\n", "2310.03724": "|**2023-10-05**|**Modular Speech-to-Text Translation for Zero-Shot Cross-Modal Transfer**|Paul-Ambroise Duquenne et.al.|[2310.03724v1](http://arxiv.org/abs/2310.03724v1)|null|\n", "2310.03485": "|**2023-10-07**|**BTDNet: a Multi-Modal Approach for Brain Tumor Radiogenomic Classification**|Dimitrios Kollias et.al.|[2310.03485v2](http://arxiv.org/abs/2310.03485v2)|null|\n", "2310.03420": "|**2023-10-05**|**FreeReg: Image-to-Point Cloud Registration Leveraging Pretrained Diffusion Models and Monocular Depth Estimators**|Haiping Wang et.al.|[2310.03420v1](http://arxiv.org/abs/2310.03420v1)|**[link](https://github.com/WHU-USI3DV/FreeReg)**|\n", "2310.03333": "|**2023-10-05**|**Real-time Multi-modal Object Detection and Tracking on Edge for Regulatory Compliance Monitoring**|Jia Syuen Lim et.al.|[2310.03333v1](http://arxiv.org/abs/2310.03333v1)|null|\n", "2310.03320": "|**2023-10-05**|**BioBridge: Bridging Biomedical Foundation Models via Knowledge Graph**|Zifeng Wang et.al.|[2310.03320v1](http://arxiv.org/abs/2310.03320v1)|null|\n", "2310.03221": "|**2023-10-05**|**Know2BIO: A Comprehensive Dual-View Benchmark for Evolving Biomedical Knowledge Graphs**|Yijia Xiao et.al.|[2310.03221v1](http://arxiv.org/abs/2310.03221v1)|**[link](https://github.com/yijia-xiao/know2bio)**|\n", "2310.03218": "|**2023-10-05**|**Learning Energy-Based Prior Model with Diffusion-Amortized MCMC**|Peiyu Yu et.al.|[2310.03218v1](http://arxiv.org/abs/2310.03218v1)|**[link](https://github.com/yupeiyu98/diffusion-amortized-mcmc)**|\n", "2310.03140": "|**2023-10-04**|**ViFiT: Reconstructing Vision Trajectories from IMU and Wi-Fi Fine Time Measurements**|Bryan Bo Cao et.al.|[2310.03140v1](http://arxiv.org/abs/2310.03140v1)|**[link](https://github.com/bryanbocao/vifit)**|\n", "2310.03111": "|**2023-10-04**|**Multi-modal Gaussian Process Variational Autoencoders for Neural and Behavioral Data**|Rabia Gondur et.al.|[2310.03111v1](http://arxiv.org/abs/2310.03111v1)|null|\n", "2310.03059": "|**2023-10-04**|**Point-PEFT: Parameter-Efficient Fine-Tuning for 3D Pre-trained Models**|Ivan Tang et.al.|[2310.03059v1](http://arxiv.org/abs/2310.03059v1)|**[link](https://github.com/EvenJoker/Point-PEFT)**|\n", "2310.04122": "|**2023-10-06**|**VI-Diff: Unpaired Visible-Infrared Translation Diffusion Model for Single Modality Labeled Visible-Infrared Person Re-identification**|Han Huang et.al.|[2310.04122v1](http://arxiv.org/abs/2310.04122v1)|null|\n", "2310.03958": "|**2023-10-06**|**The \"Seen but Unnoticed\" Vocabulary of Natural Touch: Revolutionizing Direct Interaction with Our Devices and One Another (UIST 2021 Vision)**|Ken Hinckley et.al.|[2310.03958v1](http://arxiv.org/abs/2310.03958v1)|null|\n", "2310.05863": "|**2023-10-10**|**Fine-grained Audio-Visual Joint Representations for Multimodal Large Language Models**|Guangzhi Sun et.al.|[2310.05863v2](http://arxiv.org/abs/2310.05863v2)|**[link](https://github.com/briansidp/audiovisualllm)**|\n", "2310.05628": "|**2023-10-09**|**Glitter or Gold? Deriving Structured Insights from Sustainability Reports via Large Language Models**|Marco Bronzini et.al.|[2310.05628v1](http://arxiv.org/abs/2310.05628v1)|**[link](https://github.com/saturnmars/derivingstructuredinsightsfromsustainabilityreportsvialargelanguagemodels)**|\n", "2310.05608": "|**2023-10-09**|**FlexKnot and Gaussian Process for 21 cm global signal analysis and foreground separation**|Stefan Heimersheim et.al.|[2310.05608v1](http://arxiv.org/abs/2310.05608v1)|null|\n", "2310.05572": "|**2023-10-09**|**A Simple and Robust Framework for Cross-Modality Medical Image Segmentation applied to Vision Transformers**|Matteo Bastico et.al.|[2310.05572v1](http://arxiv.org/abs/2310.05572v1)|**[link](https://github.com/matteo-bastico/mi-seg)**|\n", "2310.05462": "|**2023-10-09**|**AdaFuse: Adaptive Medical Image Fusion Based on Spatial-Frequential Cross Attention**|Xianming Gu et.al.|[2310.05462v1](http://arxiv.org/abs/2310.05462v1)|**[link](https://github.com/xianming-gu/adafuse)**|\n", "2310.05401": "|**2023-10-09**|**Entropy-MCMC: Sampling from Flat Basins with Ease**|Bolian Li et.al.|[2310.05401v1](http://arxiv.org/abs/2310.05401v1)|null|\n", "2310.05364": "|**2023-10-10**|**Universal Multi-modal Entity Alignment via Iteratively Fusing Modality Similarity Paths**|Bolin Zhu et.al.|[2310.05364v2](http://arxiv.org/abs/2310.05364v2)|**[link](https://github.com/blzhu0823/pathfusion)**|\n", "2310.05355": "|**2023-10-09**|**C^2M-DoT: Cross-modal consistent multi-view medical report generation with domain transfer network**|Ruizhi Wang et.al.|[2310.05355v1](http://arxiv.org/abs/2310.05355v1)|null|\n", "2310.05245": "|**2023-10-08**|**Influence of Camera-LiDAR Configuration on 3D Object Detection for Autonomous Driving**|Ye Li et.al.|[2310.05245v1](http://arxiv.org/abs/2310.05245v1)|**[link](https://github.com/safeai-lab/lidar-camera-placement)**|\n", "2310.05193": "|**2023-10-08**|**Improving Discriminative Multi-Modal Learning with Large-Scale Pre-Trained Models**|Chenzhuang Du et.al.|[2310.05193v1](http://arxiv.org/abs/2310.05193v1)|null|\n", "2310.05181": "|**2023-10-08**|**Unified speech and gesture synthesis using flow matching**|Shivam Mehta et.al.|[2310.05181v1](http://arxiv.org/abs/2310.05181v1)|null|\n", "2310.05060": "|**2023-10-08**|**Video-CSR: Complex Video Digest Creation for Visual-Language Models**|Tingkai Liu et.al.|[2310.05060v1](http://arxiv.org/abs/2310.05060v1)|null|\n", "2310.04992": "|**2023-10-08**|**VisionFM: a Multi-Modal Multi-Task Vision Foundation Model for Generalist Ophthalmic Artificial Intelligence**|Jianing Qiu et.al.|[2310.04992v1](http://arxiv.org/abs/2310.04992v1)|null|\n", "2310.04991": "|**2023-10-10**|**Video-Teller: Enhancing Cross-Modal Generation with Fusion and Decoupling**|Haogeng Liu et.al.|[2310.04991v2](http://arxiv.org/abs/2310.04991v2)|null|\n", "2310.04971": "|**2023-10-08**|**Understanding the Robustness of Multi-modal Contrastive Learning to Distribution Shift**|Yihao Xue et.al.|[2310.04971v1](http://arxiv.org/abs/2310.04971v1)|null|\n", "2310.06633": "|**2023-10-10**|**Blind Dates: Examining the Expression of Temporality in Historical Photographs**|Alexandra Barancov\u00e1 et.al.|[2310.06633v1](http://arxiv.org/abs/2310.06633v1)|null|\n", "2310.06627": "|**2023-10-10**|**What If the TV Was Off? Examining Counterfactual Reasoning Abilities of Multi-modal Language Models**|Letian Zhang et.al.|[2310.06627v1](http://arxiv.org/abs/2310.06627v1)|**[link](https://github.com/letian2003/c-vqa)**|\n", "2310.06440": "|**2023-10-10**|**Solution for SMART-101 Challenge of ICCV Multi-modal Algorithmic Reasoning Task 2023**|Xiangyu Wu et.al.|[2310.06440v1](http://arxiv.org/abs/2310.06440v1)|null|\n", "2310.06434": "|**2023-10-10**|**Whispering LLaMA: A Cross-Modal Generative Error Correction Framework for Speech Recognition**|Srijith Radhakrishnan et.al.|[2310.06434v1](http://arxiv.org/abs/2310.06434v1)|**[link](https://github.com/srijith-rkr/whispering-llama)**|\n", "2310.06383": "|**2023-10-10**|**What Makes for Robust Multi-Modal Models in the Face of Missing Modalities?**|Siting Li et.al.|[2310.06383v1](http://arxiv.org/abs/2310.06383v1)|null|\n", "2310.06365": "|**2023-10-10**|**Multi-Modal Knowledge Graph Transformer Framework for Multi-Modal Entity Alignment**|Qian Li et.al.|[2310.06365v1](http://arxiv.org/abs/2310.06365v1)|**[link](https://github.com/xiaoqian19940510/moalign)**|\n", "2310.06342": "|**2023-10-10**|**Contrastive Prompt Learning-based Code Search based on Interaction Matrix**|Yubo Zhang et.al.|[2310.06342v1](http://arxiv.org/abs/2310.06342v1)|null|\n", "2310.06282": "|**2023-10-11**|**MuseChat: A Conversational Music Recommendation System for Videos**|Zhikang Dong et.al.|[2310.06282v2](http://arxiv.org/abs/2310.06282v2)|null|\n", "2310.06259": "|**2023-10-10**|**Cross-modal Cognitive Consensus guided Audio-Visual Segmentation**|Zhaofeng Shi et.al.|[2310.06259v1](http://arxiv.org/abs/2310.06259v1)|null|\n", "2310.06212": "|**2023-10-09**|**Comparison of deep-learning data fusion strategies in mandibular osteoradionecrosis prediction modelling using clinical variables and radiation dose distribution volumes**|Laia Humbert-Vidan et.al.|[2310.06212v1](http://arxiv.org/abs/2310.06212v1)|null|\n", "2310.06008": "|**2023-10-09**|**CoBEVFusion: Cooperative Perception with LiDAR-Camera Bird's-Eye View Fusion**|Donghao Qiao et.al.|[2310.06008v1](http://arxiv.org/abs/2310.06008v1)|null|\n", "2310.07706": "|**2023-10-11**|**Pixel State Value Network for Combined Prediction and Planning in Interactive Environments**|Sascha Rosbach et.al.|[2310.07706v1](http://arxiv.org/abs/2310.07706v1)|null|\n", "2310.07668": "|**2023-10-11**|**GRaMuFeN: Graph-based Multi-modal Fake News Detection in Social Media**|Makan Kananian et.al.|[2310.07668v1](http://arxiv.org/abs/2310.07668v1)|null|\n", "2310.07602": "|**2023-10-11**|**Dual Radar: A Multi-modal Dataset with Dual 4D Radar for Autononous Driving**|Xinyu Zhang et.al.|[2310.07602v1](http://arxiv.org/abs/2310.07602v1)|**[link](https://github.com/adept-thu/dual-radar)**|\n", "2310.07591": "|**2023-10-11**|**PeP: a Point enhanced Painting method for unified point cloud tasks**|Zichao Dong et.al.|[2310.07591v1](http://arxiv.org/abs/2310.07591v1)|null|\n", "2310.07552": "|**2023-10-11**|**ProtoHPE: Prototype-guided High-frequency Patch Enhancement for Visible-Infrared Person Re-identification**|Guiwei Zhang et.al.|[2310.07552v1](http://arxiv.org/abs/2310.07552v1)|null|\n", "2310.07517": "|**2023-10-11**|**CM-PIE: Cross-modal perception for interactive-enhanced audio-visual video parsing**|Yaru Chen et.al.|[2310.07517v1](http://arxiv.org/abs/2310.07517v1)|null|\n", "2310.07355": "|**2023-10-11**|**IMITATE: Clinical Prior Guided Hierarchical Vision-Language Pre-training**|Che Liu et.al.|[2310.07355v1](http://arxiv.org/abs/2310.07355v1)|null|\n", "2310.07276": "|**2023-10-11**|**BioT5: Enriching Cross-modal Integration in Biology with Chemical Knowledge and Natural Language Associations**|Qizhi Pei et.al.|[2310.07276v1](http://arxiv.org/abs/2310.07276v1)|**[link](https://github.com/QizhiPei/BioT5)**|\n", "2310.07265": "|**2023-10-11**|**Distilling Efficient Vision Transformers from CNNs for Semantic Segmentation**|Xu Zheng et.al.|[2310.07265v1](http://arxiv.org/abs/2310.07265v1)|null|\n", "2310.07222": "|**2023-10-11**|**Uni-paint: A Unified Framework for Multimodal Image Inpainting with Pretrained Diffusion Model**|Shiyuan Yang et.al.|[2310.07222v1](http://arxiv.org/abs/2310.07222v1)|**[link](https://github.com/ysy31415/unipaint)**|\n", "2310.07005": "|**2023-10-10**|**Sound-skwatter (Did You Mean: Sound-squatter?) AI-powered Generator for Phishing Prevention**|Rodolfo Valentim et.al.|[2310.07005v1](http://arxiv.org/abs/2310.07005v1)|null|\n", "2310.08530": "|**2023-10-12**|**UniPose: Detecting Any Keypoints**|Jie Yang et.al.|[2310.08530v1](http://arxiv.org/abs/2310.08530v1)|**[link](https://github.com/IDEA-Research/UniPose)**|\n", "2310.08487": "|**2023-10-12**|**GraphextQA: A Benchmark for Evaluating Graph-Enhanced Large Language Models**|Yuanchun Shen et.al.|[2310.08487v1](http://arxiv.org/abs/2310.08487v1)|**[link](https://github.com/happen2me/cross-gnn)**|\n", "2310.08446": "|**2023-10-12**|**Towards Robust Multi-Modal Reasoning via Model Selection**|Xiangyan Liu et.al.|[2310.08446v1](http://arxiv.org/abs/2310.08446v1)|null|\n", "2310.08303": "|**2023-10-12**|**Multimodal Variational Auto-encoder based Audio-Visual Segmentation**|Yuxin Mao et.al.|[2310.08303v1](http://arxiv.org/abs/2310.08303v1)|**[link](https://github.com/opennlplab/mmvae-avs)**|\n", "2310.08285": "|**2023-10-12**|**How would mobility-as-a-service (MaaS) platform survive as an intermediary? From the viewpoint of stability in many-to-many matching**|Rui Yao et.al.|[2310.08285v1](http://arxiv.org/abs/2310.08285v1)|null|\n", "2310.08270": "|**2023-10-12**|**Hilbert Space Embedding-based Trajectory Optimization for Multi-Modal Uncertain Obstacle Trajectory Prediction**|Basant Sharma et.al.|[2310.08270v1](http://arxiv.org/abs/2310.08270v1)|null|\n", "2310.08261": "|**2023-10-12**|**GraphAlign: Enhancing Accurate Feature Alignment by Graph matching for Multi-Modal 3D Object Detection**|Ziying Song et.al.|[2310.08261v1](http://arxiv.org/abs/2310.08261v1)|null|\n", "2310.08166": "|**2023-10-12**|**Ziya-VL: Bilingual Large Vision-Language Model via Multi-Task Instruction Tuning**|Junyu Lu et.al.|[2310.08166v1](http://arxiv.org/abs/2310.08166v1)|null|\n", "2310.08114": "|**2023-10-12**|**Multi-Modal Sensor Fusion and Object Tracking for Autonomous Racing**|Phillip Karle et.al.|[2310.08114v1](http://arxiv.org/abs/2310.08114v1)|**[link](https://github.com/tumftm/fusiontracking)**|\n", "2310.08103": "|**2023-10-12**|**Radio Galaxy Zoo: tagging radio subjects using text**|Dawei Chen et.al.|[2310.08103v1](http://arxiv.org/abs/2310.08103v1)|null|\n", "2310.08027": "|**2023-10-12**|**Exploring Large Language Models for Multi-Modal Out-of-Distribution Detection**|Yi Dai et.al.|[2310.08027v1](http://arxiv.org/abs/2310.08027v1)|null|\n", "2310.08026": "|**2023-10-12**|**Beyond Sharing Weights in Decoupling Feature Learning Network for UAV RGB-Infrared Vehicle Re-Identification**|Xingyue Liu et.al.|[2310.08026v1](http://arxiv.org/abs/2310.08026v1)|null|\n", "2310.07990": "|**2023-10-12**|**Multi-View Variational Autoencoder for Missing Value Imputation in Untargeted Metabolomics**|Chen Zhao et.al.|[2310.07990v1](http://arxiv.org/abs/2310.07990v1)|null|\n", "2310.07944": "|**2023-10-11**|**AutoRepo: A general framework for multi-modal LLM-based automated construction reporting**|Hongxu Pu et.al.|[2310.07944v1](http://arxiv.org/abs/2310.07944v1)|null|\n", "2310.07940": "|**2023-10-11**|**Cost-Driven Hardware-Software Co-Optimization of Machine Learning Pipelines**|Ravit Sharma et.al.|[2310.07940v1](http://arxiv.org/abs/2310.07940v1)|null|\n", "2310.10651": "|**2023-10-16**|**HairCLIPv2: Unifying Hair Editing via Proxy Feature Blending**|Tianyi Wei et.al.|[2310.10651v1](http://arxiv.org/abs/2310.10651v1)|**[link](https://github.com/wty-ustc/hairclipv2)**|\n", "2310.10414": "|**2023-10-16**|**Style transfer between Microscopy and Magnetic Resonance Imaging via Generative Adversarial Network in small sample size settings**|Monika Pytlarz et.al.|[2310.10414v1](http://arxiv.org/abs/2310.10414v1)|null|\n", "2310.10371": "|**2023-10-16**|**Camera-LiDAR Fusion with Latent Contact for Place Recognition in Challenging Cross-Scenes**|Yan Pan et.al.|[2310.10371v1](http://arxiv.org/abs/2310.10371v1)|null|\n", "2310.10347": "|**2023-10-16**|**Editable-DeepSC: Cross-Modal Editable Semantic Communication Systems**|Wenbo Yu et.al.|[2310.10347v1](http://arxiv.org/abs/2310.10347v1)|null|\n", "2310.10290": "|**2023-10-16**|**Autonomous Mapping and Navigation using Fiducial Markers and Pan-Tilt Camera for Assisting Indoor Mobility of Blind and Visually Impaired People**|Dharmateja Adapa et.al.|[2310.10290v1](http://arxiv.org/abs/2310.10290v1)|null|\n", "2310.10125": "|**2023-10-16**|**Few-shot Action Recognition with Captioning Foundation Models**|Xiang Wang et.al.|[2310.10125v1](http://arxiv.org/abs/2310.10125v1)|null|\n", "2310.10010": "|**2023-10-16**|**Black-box Targeted Adversarial Attack on Segment Anything (SAM)**|Sheng Zheng et.al.|[2310.10010v1](http://arxiv.org/abs/2310.10010v1)|null|\n", "2310.09761": "|**2023-10-15**|**CAPro: Webly Supervised Learning with Cross-Modality Aligned Prototypes**|Yulei Qin et.al.|[2310.09761v1](http://arxiv.org/abs/2310.09761v1)|**[link](https://github.com/yuleiqin/capro)**|\n", "2310.09755": "|**2023-10-15**|**Beyond Segmentation: Road Network Generation with Multi-Modal LLMs**|Sumedh Rasal et.al.|[2310.09755v1](http://arxiv.org/abs/2310.09755v1)|null|\n", "2310.09714": "|**2023-10-15**|**Enhancing Task Performance of Learned Simplified Models via Reinforcement Learning**|Hien Bui et.al.|[2310.09714v1](http://arxiv.org/abs/2310.09714v1)|null|\n", "2310.09696": "|**2023-10-15**|**Progressive Evidence Refinement for Open-domain Multimodal Retrieval Question Answering**|Shuwen Yang et.al.|[2310.09696v1](http://arxiv.org/abs/2310.09696v1)|null|\n", "2310.09503": "|**2023-10-14**|**JM3D & JM3D-LLM: Elevating 3D Representation with Joint Multi-modal Cues**|Jiayi Ji et.al.|[2310.09503v1](http://arxiv.org/abs/2310.09503v1)|**[link](https://github.com/mr-neko/jm3d)**|\n", "2310.09478": "|**2023-10-14**|**MiniGPT-v2: large language model as a unified interface for vision-language multi-task learning**|Jun Chen et.al.|[2310.09478v1](http://arxiv.org/abs/2310.09478v1)|null|\n", "2310.09199": "|**2023-10-13**|**PaLI-3 Vision Language Models: Smaller, Faster, Stronger**|Xi Chen et.al.|[2310.09199v1](http://arxiv.org/abs/2310.09199v1)|null|\n", "2310.09165": "|**2023-10-13**|**Towards Robust UAV Tracking in GNSS-Denied Environments: A Multi-LiDAR Multi-UAV Dataset**|Iacopo Catalano et.al.|[2310.09165v1](http://arxiv.org/abs/2310.09165v1)|**[link](https://github.com/tiers/uav_multi_lidar_dataset)**|\n", "2310.11374": "|**2023-10-17**|**DialogueLLM: Context and Emotion Knowledge-Tuned LLaMA Models for Emotion Recognition in Conversations**|Yazhou Zhang et.al.|[2310.11374v1](http://arxiv.org/abs/2310.11374v1)|null|\n", "2310.11316": "|**2023-10-17**|**MonoSKD: General Distillation Framework for Monocular 3D Object Detection via Spearman Correlation Coefficient**|Sen Wang et.al.|[2310.11316v1](http://arxiv.org/abs/2310.11316v1)|**[link](https://github.com/senwang98/monoskd)**|\n", "2310.11307": "|**2023-10-17**|**Multi Self-supervised Pre-fine-tuned Transformer Fusion for Better Intelligent Transportation Detection**|Juwu Zheng et.al.|[2310.11307v1](http://arxiv.org/abs/2310.11307v1)|null|\n", "2310.11295": "|**2023-10-17**|**CorrTalk: Correlation Between Hierarchical Speech and Facial Activity Variances for 3D Animation**|Zhaojie Chu et.al.|[2310.11295v1](http://arxiv.org/abs/2310.11295v1)|null|\n", "2310.10942": "|**2023-10-17**|**Unanswerable Visual Question Answering**|Yanyang Guo et.al.|[2310.10942v1](http://arxiv.org/abs/2310.10942v1)|**[link](https://github.com/guoyang9/unk-vqa)**|\n", "2310.10844": "|**2023-10-16**|**Survey of Vulnerabilities in Large Language Models Revealed by Adversarial Attacks**|Erfan Shayegani et.al.|[2310.10844v1](http://arxiv.org/abs/2310.10844v1)|null|\n", "2310.12081": "|**2023-10-18**|**DHOT-GM: Robust Graph Matching Using A Differentiable Hierarchical Optimal Transport Framework**|Haoran Cheng et.al.|[2310.12081v1](http://arxiv.org/abs/2310.12081v1)|null|\n", "2310.11989": "|**2023-10-18**|**Image Clustering with External Guidance**|Yunfan Li et.al.|[2310.11989v1](http://arxiv.org/abs/2310.11989v1)|null|\n", "2310.11939": "|**2023-10-18**|**Mixture distributions for probabilistic forecasts of disease outbreaks**|Spencer Wadsworth et.al.|[2310.11939v1](http://arxiv.org/abs/2310.11939v1)|null|\n", "2310.11938": "|**2023-10-18**|**Grounded and Well-rounded: A Methodological Approach to the Study of Cross-modal and Cross-lingual Grounding**|Timothee Mickus et.al.|[2310.11938v1](http://arxiv.org/abs/2310.11938v1)|null|\n", "2310.11910": "|**2023-10-18**|**Multi-modal Medical Neurological Image Fusion using Wavelet Pooled Edge Preserving Autoencoder**|Manisha Das et.al.|[2310.11910v1](http://arxiv.org/abs/2310.11910v1)|null|\n", "2310.11713": "|**2023-10-18**|**Separating Invisible Sounds Toward Universal Audiovisual Scene-Aware Sound Separation**|Yiyang Su et.al.|[2310.11713v1](http://arxiv.org/abs/2310.11713v1)|null|\n", "2310.11612": "|**2023-10-17**|**Balance Act: Mitigating Hubness in Cross-Modal Retrieval with Query and Gallery Banks**|Yimu Wang et.al.|[2310.11612v1](http://arxiv.org/abs/2310.11612v1)|**[link](https://github.com/yimuwangcs/Better_Cross_Modal_Retrieval)**|\n", "2310.12973": "|**2023-10-19**|**Frozen Transformers in Language Models Are Effective Visual Encoder Layers**|Ziqi Pang et.al.|[2310.12973v1](http://arxiv.org/abs/2310.12973v1)|**[link](https://github.com/ziqipang/lm4visualencoding)**|\n", "2310.12798": "|**2023-10-19**|**MolCA: Molecular Graph-Language Modeling with Cross-Modal Projector and Uni-Modal Adapter**|Zhiyuan Liu et.al.|[2310.12798v1](http://arxiv.org/abs/2310.12798v1)|**[link](https://github.com/acharkq/molca)**|\n", "2310.12609": "|**2023-10-19**|**Denoising Heat-inspired Diffusion with Insulators for Collision Free Motion Planning**|Junwoo Chang et.al.|[2310.12609v1](http://arxiv.org/abs/2310.12609v1)|null|\n", "2310.12520": "|**2023-10-19**|**Lost in Translation: When GPT-4V(ision) Can't See Eye to Eye with Text. A Vision-Language-Consistency Analysis of VLLMs and Beyond**|Xiang Zhang et.al.|[2310.12520v1](http://arxiv.org/abs/2310.12520v1)|null|\n", "2310.12518": "|**2023-10-19**|**Light-enhanced van der Waals force microscopy**|Han Yu-Xiao et.al.|[2310.12518v1](http://arxiv.org/abs/2310.12518v1)|null|\n", "2310.12344": "|**2023-10-18**|**LACMA: Language-Aligning Contrastive Learning with Meta-Actions for Embodied Instruction Following**|Cheng-Fu Yang et.al.|[2310.12344v1](http://arxiv.org/abs/2310.12344v1)|**[link](https://github.com/joeyy5588/lacma)**|\n", "2310.13619": "|**2023-10-20**|**Semi-supervised multimodal coreference resolution in image narrations**|Arushi Goel et.al.|[2310.13619v1](http://arxiv.org/abs/2310.13619v1)|**[link](https://github.com/vico-uoe/cin-ssl)**|\n", "2310.13596": "|**2023-10-20**|**MarineGPT: Unlocking Secrets of Ocean to the Public**|Ziqiang Zheng et.al.|[2310.13596v1](http://arxiv.org/abs/2310.13596v1)|**[link](https://github.com/hkust-vgd/marinegpt)**|\n", "2310.13451": "|**2023-10-20**|**Two-Stage Triplet Loss Training with Curriculum Augmentation for Audio-Visual Retrieval**|Donghuo Zeng et.al.|[2310.13451v1](http://arxiv.org/abs/2310.13451v1)|null|\n", "2310.13398": "|**2023-10-20**|**OpenAnnotate3D: Open-Vocabulary Auto-Labeling System for Multi-modal 3D Data**|Yijie Zhou et.al.|[2310.13398v1](http://arxiv.org/abs/2310.13398v1)|null|\n", "2310.13289": "|**2023-10-20**|**SALMONN: Towards Generic Hearing Abilities for Large Language Models**|Changli Tang et.al.|[2310.13289v1](http://arxiv.org/abs/2310.13289v1)|**[link](https://github.com/bytedance/salmonn)**|\n", "2310.13276": "|**2023-10-20**|**InvGC: Robust Cross-Modal Retrieval by Inverse Graph Convolution**|Xiangru Jian et.al.|[2310.13276v1](http://arxiv.org/abs/2310.13276v1)|**[link](https://github.com/yimuwangcs/Better_Cross_Modal_Retrieval)**|\n", "2310.13267": "|**2023-10-20**|**On the Language Encoder of Contrastive Cross-modal Models**|Mengjie Zhao et.al.|[2310.13267v1](http://arxiv.org/abs/2310.13267v1)|null|\n", "2310.13265": "|**2023-10-20**|**MoqaGPT : Zero-Shot Multi-modal Open-domain Question Answering with Large Language Model**|Le Zhang et.al.|[2310.13265v1](http://arxiv.org/abs/2310.13265v1)|**[link](https://github.com/lezhang7/moqagpt)**|\n", "2310.13257": "|**2023-10-20**|**Visual Grounding Helps Learn Word Meanings in Low-Data Regimes**|Chengxu Zhuang et.al.|[2310.13257v1](http://arxiv.org/abs/2310.13257v1)|null|\n", "2310.13235": "|**2023-10-20**|**Auxiliary Features-Guided Super Resolution for Monte Carlo Rendering**|Qiqi Hou et.al.|[2310.13235v1](http://arxiv.org/abs/2310.13235v1)|null|\n", "2310.13103": "|**2023-10-19**|**AVTENet: Audio-Visual Transformer-based Ensemble Network Exploiting Multiple Experts for Video Deepfake Detection**|Ammarah Hashmi et.al.|[2310.13103v1](http://arxiv.org/abs/2310.13103v1)|null|\n", "2310.14924": "|**2023-10-23**|**Converting Depth Images and Point Clouds for Feature-based Pose Estimation**|Robert L\u00f6sch et.al.|[2310.14924v1](http://arxiv.org/abs/2310.14924v1)|**[link](https://github.com/rlsch/depth-conversions)**|\n", "2310.14805": "|**2023-10-23**|**Cross-Modal Conceptualization in Bottleneck Models**|Danis Alukaev et.al.|[2310.14805v1](http://arxiv.org/abs/2310.14805v1)|**[link](https://github.com/danisalukaev/xcbs)**|\n", "2310.14785": "|**2023-10-23**|**Vision-Enhanced Semantic Entity Recognition in Document Images via Visually-Asymmetric Consistency Learning**|Hao Wang et.al.|[2310.14785v1](http://arxiv.org/abs/2310.14785v1)|null|\n", "2310.14720": "|**2023-10-23**|**Extended Deep Adaptive Input Normalization for Preprocessing Time Series Data for Neural Networks**|Marcus A. K. September et.al.|[2310.14720v1](http://arxiv.org/abs/2310.14720v1)|**[link](https://github.com/marcusgh/edain_paper)**|\n", "2310.14702": "|**2023-10-23**|**BM2CP: Efficient Collaborative Perception with LiDAR-Camera Modalities**|Binyu Zhao et.al.|[2310.14702v1](http://arxiv.org/abs/2310.14702v1)|**[link](https://github.com/byzhaoai/bm2cp)**|\n", "2310.14643": "|**2023-10-23**|**Dynamic gain and frequency comb formation in exceptional-point lasers**|Xingwei Gao et.al.|[2310.14643v1](http://arxiv.org/abs/2310.14643v1)|null|\n", "2310.14566": "|**2023-10-23**|**HallusionBench: You See What You Think? Or You Think What You See? An Image-Context Reasoning Benchmark Challenging for GPT-4V(ision), LLaVA-1.5, and Other Multi-modality Models**|Fuxiao Liu et.al.|[2310.14566v1](http://arxiv.org/abs/2310.14566v1)|**[link](https://github.com/tianyi-lab/hallusionbench)**|\n", "2310.14549": "|**2023-10-23**|**Multimodal Graph Learning for Modeling Emerging Pandemics with Big Data**|Khanh-Tung Tran et.al.|[2310.14549v1](http://arxiv.org/abs/2310.14549v1)|**[link](https://github.com/khanhtungtran/mgl4mep)**|\n", "2310.14278": "|**2023-10-22**|**Conversational Speech Recognition by Learning Audio-textual Cross-modal Contextual Representation**|Kun Wei et.al.|[2310.14278v1](http://arxiv.org/abs/2310.14278v1)|null|\n", "2310.14226": "|**2023-10-22**|**Multi-stream Cell Segmentation with Low-level Cues for Multi-modality Images**|Wei Lou et.al.|[2310.14226v1](http://arxiv.org/abs/2310.14226v1)|**[link](https://github.com/lhaof/cellseg)**|\n", "2310.14216": "|**2023-10-22**|**UniMAP: Universal SMILES-Graph Representation Learning**|Shikun Feng et.al.|[2310.14216v1](http://arxiv.org/abs/2310.14216v1)|**[link](https://github.com/fengshikun/unimap)**|\n", "2310.14158": "|**2023-10-22**|**Visual-Attribute Prompt Learning for Progressive Mild Cognitive Impairment Prediction**|Luoyao Kang et.al.|[2310.14158v1](http://arxiv.org/abs/2310.14158v1)|**[link](https://github.com/lhaof/vapl)**|\n", "2310.14075": "|**2023-10-21**|**Unsupervised Sim-to-Real Adaptation of Soft Robot Proprioception using a Dual Cross-modal Autoencoder**|Chaeree Park et.al.|[2310.14075v1](http://arxiv.org/abs/2310.14075v1)|null|\n", "2310.14037": "|**2023-10-21**|**Unlock Multi-Modal Capability of Dense Retrieval via Visual Module Plugin**|Tianshuo Zhou et.al.|[2310.14037v1](http://arxiv.org/abs/2310.14037v1)|**[link](https://github.com/openmatch/marvel)**|\n", "2310.13898": "|**2023-10-21**|**Computational and Systems Biology Advances to Enable for Bioagent Agnostic Signatures**|Andy Lin et.al.|[2310.13898v1](http://arxiv.org/abs/2310.13898v1)|null|\n", "2310.15887": "|**2023-10-24**|**AdaptiX -- A Transitional XR Framework for Development and Evaluation of Shared Control Applications in Assistive Robotics**|Max Pascher et.al.|[2310.15887v1](http://arxiv.org/abs/2310.15887v1)|**[link](https://github.com/maxpascher/AdaptiX)**|\n", "2310.15676": "|**2023-10-24**|**Recent Advances in Multi-modal 3D Scene Understanding: A Comprehensive Survey and Evaluation**|Yinjie Lei et.al.|[2310.15676v1](http://arxiv.org/abs/2310.15676v1)|null|\n", "2310.15670": "|**2023-10-24**|**Leveraging Vision-Centric Multi-Modal Expertise for 3D Object Detection**|Linyan Huang et.al.|[2310.15670v1](http://arxiv.org/abs/2310.15670v1)|**[link](https://github.com/opendrivelab/birds-eye-view-perception)**|\n", "2310.15587": "|**2023-10-24**|**ScanDL: A Diffusion Model for Generating Synthetic Scanpaths on Texts**|Lena S. Bolliger et.al.|[2310.15587v1](http://arxiv.org/abs/2310.15587v1)|**[link](https://github.com/dili-lab/scandl)**|\n", "2310.15585": "|**2023-10-24**|**Multimodal Representations for Teacher-Guided Compositional Visual Reasoning**|Wafa Aissa et.al.|[2310.15585v1](http://arxiv.org/abs/2310.15585v1)|null|\n", "2310.15568": "|**2023-10-24**|**I$^2$MD: 3D Action Representation Learning with Inter- and Intra-modal Mutual Distillation**|Yunyao Mao et.al.|[2310.15568v1](http://arxiv.org/abs/2310.15568v1)|null|\n", "2310.15482": "|**2023-10-24**|**Salient Object Detection in RGB-D Videos**|Ao Mou et.al.|[2310.15482v1](http://arxiv.org/abs/2310.15482v1)|**[link](https://github.com/kerenfu/rdvs)**|\n", "2310.15325": "|**2023-10-23**|**LXMERT Model Compression for Visual Question Answering**|Maryam Hashemi et.al.|[2310.15325v1](http://arxiv.org/abs/2310.15325v1)|**[link](https://github.com/ghazaleh-mahmoodi/lxmert_compression)**|\n", "2310.15301": "|**2023-10-23**|**ADMarker: A Multi-Modal Federated Learning System for Monitoring Digital Biomarkers of Alzheimer's Disease**|Xiaomin Ouyang et.al.|[2310.15301v1](http://arxiv.org/abs/2310.15301v1)|null|\n", "2310.15281": "|**2023-10-23**|**UncertaintyPlayground: A Fast and Simplified Python Library for Uncertainty Estimation**|Ilia Azizi et.al.|[2310.15281v1](http://arxiv.org/abs/2310.15281v1)|**[link](https://github.com/Unco3892/UncertaintyPlayground)**|\n", "2310.16781": "|**2023-10-25**|**Kiki or Bouba? Sound Symbolism in Vision-and-Language Models**|Morris Alper et.al.|[2310.16781v1](http://arxiv.org/abs/2310.16781v1)|null|\n", "2310.16754": "|**2023-10-25**|**CAD -- Contextual Multi-modal Alignment for Dynamic AVQA**|Asmar Nadeem et.al.|[2310.16754v1](http://arxiv.org/abs/2310.16754v1)|null|\n", "2310.16641": "|**2023-10-25**|**The Next Evolution of Artificial Sense of Touch**|Sonja Gro\u00df et.al.|[2310.16641v1](http://arxiv.org/abs/2310.16641v1)|null|\n", "2310.16629": "|**2023-10-25**|**EdgeCalib: Multi-Frame Weighted Edge Features for Automatic Targetless LiDAR-Camera Calibration**|Xingchen Li et.al.|[2310.16629v1](http://arxiv.org/abs/2310.16629v1)|null|\n", "2310.16590": "|**2023-10-25**|**$\\mathbb{VD}$-$\\mathbb{GR}$: Boosting $\\mathbb{V}$isual $\\mathbb{D}$ialog with Cascaded Spatial-Temporal Multi-Modal $\\mathbb{GR}$aphs**|Adnen Abdessaied et.al.|[2310.16590v1](http://arxiv.org/abs/2310.16590v1)|null|\n", "2310.16477": "|**2023-10-25**|**Show from Tell: Audio-Visual Modelling in Clinical Settings**|Jianbo Jiao et.al.|[2310.16477v1](http://arxiv.org/abs/2310.16477v1)|null|\n", "2310.16402": "|**2023-10-25**|**Video Referring Expression Comprehension via Transformer with Content-conditioned Query**|Ji Jiang et.al.|[2310.16402v1](http://arxiv.org/abs/2310.16402v1)|null|\n", "2310.16380": "|**2023-10-25**|**A model for multi-attack classification to improve intrusion detection performance using deep learning approaches**|Arun Kumar Silivery et.al.|[2310.16380v1](http://arxiv.org/abs/2310.16380v1)|null|\n", "2310.16356": "|**2023-10-25**|**A Multi-Modal Multilingual Benchmark for Document Image Classification**|Yoshinari Fujinuma et.al.|[2310.16356v1](http://arxiv.org/abs/2310.16356v1)|null|\n", "2310.16273": "|**2023-10-25**|**Deep Learning for Plant Identification and Disease Classification from Leaf Images: Multi-prediction Approaches**|Jianping Yao et.al.|[2310.16273v1](http://arxiv.org/abs/2310.16273v1)|**[link](https://github.com/funzi-son/plant_pathology_dl)**|\n", "2310.17642": "|**2023-10-26**|**Drive Anywhere: Generalizable End-to-end Autonomous Driving with Multi-modal Foundation Models**|Tsun-Hsuan Wang et.al.|[2310.17642v1](http://arxiv.org/abs/2310.17642v1)|null|\n", "2310.17568": "|**2023-10-26**|**Navigating to Success in Multi-Modal Human-Robot Collaboration: Analysis and Corpus Release**|Stephanie M. Lukin et.al.|[2310.17568v1](http://arxiv.org/abs/2310.17568v1)|null|\n", "2310.17540": "|**2023-10-26**|**EqDrive: Efficient Equivariant Motion Forecasting with Multi-Modality for Autonomous Driving**|Yuping Wang et.al.|[2310.17540v1](http://arxiv.org/abs/2310.17540v1)|null|\n", "2310.17468": "|**2023-10-26**|**Cross-modal Active Complementary Learning with Self-refining Correspondence**|Yang Qin et.al.|[2310.17468v1](http://arxiv.org/abs/2310.17468v1)|**[link](https://github.com/qinyang79/crcl)**|\n", "2310.17323": "|**2023-10-26**|**IndustReal: A Dataset for Procedure Step Recognition Handling Execution Errors in Egocentric Videos in an Industrial-Like Setting**|Tim J. Schoonbeek et.al.|[2310.17323v1](http://arxiv.org/abs/2310.17323v1)|**[link](https://github.com/timschoonbeek/industreal)**|\n", "2310.17133": "|**2023-10-26**|**Incorporating Probing Signals into Multimodal Machine Translation via Visual Question-Answering Pairs**|Yuxin Zuo et.al.|[2310.17133v1](http://arxiv.org/abs/2310.17133v1)|**[link](https://github.com/libeineu/mmt-vqa)**|\n", "2310.17025": "|**2023-10-25**|**netFound: Foundation Model for Network Security**|Satyandra Guthula et.al.|[2310.17025v1](http://arxiv.org/abs/2310.17025v1)|null|\n", "2310.16917": "|**2023-10-25**|**MimicTouch: Learning Human's Control Strategy with Multi-Modal Tactile Feedback**|Kelin Yu et.al.|[2310.16917v1](http://arxiv.org/abs/2310.16917v1)|null|\n", "2310.18049": "|**2023-10-27**|**Text Augmented Spatial-aware Zero-shot Referring Image Segmentation**|Yucheng Suo et.al.|[2310.18049v1](http://arxiv.org/abs/2310.18049v1)|null|\n", "2310.17956": "|**2023-10-27**|**Qilin-Med-VL: Towards Chinese Large Vision-Language Model for General Healthcare**|Junling Liu et.al.|[2310.17956v1](http://arxiv.org/abs/2310.17956v1)|**[link](https://github.com/williamliujl/qilin-med-vl)**|\n", "2310.17933": "|**2023-10-27**|**A barycenter-based approach for the multi-model ensembling of subseasonal forecasts**|Camille Le Coz et.al.|[2310.17933v1](http://arxiv.org/abs/2310.17933v1)|null|\n", "2310.17852": "|**2023-10-27**|**Function Space Bayesian Pseudocoreset for Bayesian Neural Networks**|Balhae Kim et.al.|[2310.17852v1](http://arxiv.org/abs/2310.17852v1)|null|\n", "2310.17796": "|**2023-10-26**|**ControlLLM: Augment Language Models with Tools by Searching on Graphs**|Zhaoyang Liu et.al.|[2310.17796v1](http://arxiv.org/abs/2310.17796v1)|**[link](https://github.com/opengvlab/controlllm)**|\n", "2310.17770": "|**2023-10-26**|**GROOViST: A Metric for Grounding Objects in Visual Storytelling**|Aditya K Surikuchi et.al.|[2310.17770v1](http://arxiv.org/abs/2310.17770v1)|**[link](https://github.com/akskuchi/groovist)**|\n", "2310.17737": "|**2023-10-26**|**ArchBERT: Bi-Modal Understanding of Neural Architectures and Natural Languages**|Mohammad Akbari et.al.|[2310.17737v1](http://arxiv.org/abs/2310.17737v1)|null|\n", "2310.19168": "|**2023-10-29**|**BirdSAT: Cross-View Contrastive Masked Autoencoders for Bird Species Classification and Mapping**|Srikumar Sastry et.al.|[2310.19168v1](http://arxiv.org/abs/2310.19168v1)|**[link](https://github.com/mvrl/birdsat)**|\n", "2310.19070": "|**2023-10-29**|**Myriad: Large Multimodal Model by Applying Vision Experts for Industrial Anomaly Detection**|Yuanze Li et.al.|[2310.19070v1](http://arxiv.org/abs/2310.19070v1)|null|\n", "2310.19062": "|**2023-10-29**|**A multi-modal table tennis robot system**|Andreas Ziegler et.al.|[2310.19062v1](http://arxiv.org/abs/2310.19062v1)|null|\n", "2310.19001": "|**2023-10-29**|**Uncovering Prototypical Knowledge for Weakly Open-Vocabulary Semantic Segmentation**|Fei Zhang et.al.|[2310.19001v1](http://arxiv.org/abs/2310.19001v1)|null|\n", "2310.18949": "|**2023-10-29**|**Customize StyleGAN with One Hand Sketch**|Shaocong Zhang et.al.|[2310.18949v1](http://arxiv.org/abs/2310.18949v1)|null|\n", "2310.18890": "|**2023-10-29**|**Towards Generalized Multi-stage Clustering: Multi-view Self-distillation**|Jiatai Wang et.al.|[2310.18890v1](http://arxiv.org/abs/2310.18890v1)|null|\n", "2310.18728": "|**2023-10-28**|**Online Multi-view Anomaly Detection with Disentangled Product-of-Experts Modeling**|Hao Wang et.al.|[2310.18728v1](http://arxiv.org/abs/2310.18728v1)|null|\n", "2310.18709": "|**2023-10-28**|**Audio-Visual Instance Segmentation**|Ruohao Guo et.al.|[2310.18709v1](http://arxiv.org/abs/2310.18709v1)|null|\n", "2310.18652": "|**2023-10-28**|**EHRXQA: A Multi-Modal Question Answering Dataset for Electronic Health Records with Chest X-ray Images**|Seongsu Bae et.al.|[2310.18652v1](http://arxiv.org/abs/2310.18652v1)|**[link](https://github.com/baeseongsu/ehrxqa)**|\n", "2310.18620": "|**2023-10-28**|**ODM3D: Alleviating Foreground Sparsity for Enhanced Semi-Supervised Monocular 3D Object Detection**|Weijia Zhang et.al.|[2310.18620v1](http://arxiv.org/abs/2310.18620v1)|null|\n", "2310.18583": "|**2023-10-28**|**Self-Supervised Multi-Modality Learning for Multi-Label Skin Lesion Classification**|Hao Wang et.al.|[2310.18583v1](http://arxiv.org/abs/2310.18583v1)|**[link](https://github.com/dylan-h-wang/skin-sm3)**|\n", "2310.18481": "|**2023-10-27**|**MOSEL: Inference Serving Using Dynamic Modality Selection**|Bodun Hu et.al.|[2310.18481v1](http://arxiv.org/abs/2310.18481v1)|null|\n", "2310.18438": "|**2023-10-27**|**Exploring Shape Embedding for Cloth-Changing Person Re-Identification via 2D-3D Correspondences**|Yubin Wang et.al.|[2310.18438v1](http://arxiv.org/abs/2310.18438v1)|null|\n", "2310.20561": "|**2023-10-31**|**Predictive Control for Autonomous Driving with Uncertain, Multi-modal Predictions**|Siddharth H. Nair et.al.|[2310.20561v1](http://arxiv.org/abs/2310.20561v1)|null|\n", "2310.20446": "|**2023-10-31**|**LAVSS: Location-Guided Audio-Visual Spatial Audio Separation**|Yuxin Ye et.al.|[2310.20446v1](http://arxiv.org/abs/2310.20446v1)|null|\n", "2310.20357": "|**2023-11-01**|**Enhancing the Spatial Awareness Capability of Multi-Modal Large Language Model**|Yongqiang Zhao et.al.|[2310.20357v2](http://arxiv.org/abs/2310.20357v2)|null|\n", "2310.20343": "|**2023-10-31**|**Large Multi-modal Encoders for Recommendation**|Zixuan Yi et.al.|[2310.20343v1](http://arxiv.org/abs/2310.20343v1)|null|\n", "2310.20025": "|**2023-10-30**|**GOPlan: Goal-conditioned Offline Reinforcement Learning by Planning with Learned Models**|Mianchu Wang et.al.|[2310.20025v1](http://arxiv.org/abs/2310.20025v1)|null|\n", "2310.19795": "|**2023-10-30**|**SimMMDG: A Simple and Effective Framework for Multi-modal Domain Generalization**|Hao Dong et.al.|[2310.19795v1](http://arxiv.org/abs/2310.19795v1)|**[link](https://github.com/donghao51/simmmdg)**|\n", "2310.19743": "|**2023-10-30**|**Tell Me What Is Good About This Property: Leveraging Reviews For Segment-Personalized Image Collection Summarization**|Monika Wysoczanska et.al.|[2310.19743v1](http://arxiv.org/abs/2310.19743v1)|null|\n", "2310.19654": "|**2023-10-30**|**MCAD: Multi-teacher Cross-modal Alignment Distillation for efficient image-text retrieval**|Youbo Lei et.al.|[2310.19654v1](http://arxiv.org/abs/2310.19654v1)|null|\n", "2310.19635": "|**2023-10-30**|**Bidirectional Captioning for Clinically Accurate and Interpretable Models**|Keegan Quigley et.al.|[2310.19635v1](http://arxiv.org/abs/2310.19635v1)|null|\n", "2310.19608": "|**2023-10-30**|**On Feynman--Kac training of partial Bayesian neural networks**|Zheng Zhao et.al.|[2310.19608v1](http://arxiv.org/abs/2310.19608v1)|null|\n", "2310.19559": "|**2023-10-30**|**Disentangled Counterfactual Learning for Physical Audiovisual Commonsense Reasoning**|Changsheng Lv et.al.|[2310.19559v1](http://arxiv.org/abs/2310.19559v1)|null|\n", "2310.19554": "|**2023-10-30**|**Harvest Video Foundation Models via Efficient Post-Pretraining**|Yizhuo Li et.al.|[2310.19554v1](http://arxiv.org/abs/2310.19554v1)|**[link](https://github.com/opengvlab/internvideo)**|\n", "2310.19432": "|**2023-10-30**|**Explaining the Decisions of Deep Policy Networks for Robotic Manipulations**|Seongun Kim et.al.|[2310.19432v1](http://arxiv.org/abs/2310.19432v1)|null|\n", "2310.19264": "|**2023-10-30**|**Sound of Story: Multi-modal Storytelling with Audio**|Jaeyeon Bae et.al.|[2310.19264v1](http://arxiv.org/abs/2310.19264v1)|null|\n", "2311.00618": "|**2023-11-01**|**De-Diffusion Makes Text a Strong Cross-Modal Interface**|Chen Wei et.al.|[2311.00618v1](http://arxiv.org/abs/2311.00618v1)|null|\n", "2311.00566": "|**2023-11-01**|**CROMA: Remote Sensing Representations with Contrastive Radar-Optical Masked Autoencoders**|Anthony Fuller et.al.|[2311.00566v1](http://arxiv.org/abs/2311.00566v1)|**[link](https://github.com/antofuller/croma)**|\n", "2311.00436": "|**2023-11-01**|**Enhancing Traffic Object Detection in Variable Illumination with RGB-Event Fusion**|Zhanwen Liu et.al.|[2311.00436v1](http://arxiv.org/abs/2311.00436v1)|null|\n", "2311.00265": "|**2023-11-01**|**Adaptive Latent Diffusion Model for 3D Medical Image to Image Translation: Multi-modal Magnetic Resonance Imaging Study**|Jonghun Kim et.al.|[2311.00265v1](http://arxiv.org/abs/2311.00265v1)|**[link](https://github.com/jongdory/aldm)**|\n", "2311.00207": "|**2023-11-01**|**Magmaw: Modality-Agnostic Adversarial Attacks on Machine Learning-Based Wireless Communication Systems**|Jung-Woo Chang et.al.|[2311.00207v1](http://arxiv.org/abs/2311.00207v1)|null|\n", "2311.01459": "|**2023-11-02**|**Align Your Prompts: Test-Time Prompting with Distribution Alignment for Zero-Shot Generalization**|Jameel Hassan et.al.|[2311.01459v1](http://arxiv.org/abs/2311.01459v1)|null|\n", "2311.01361": "|**2023-11-02**|**GPT-4V(ision) as a Generalist Evaluator for Vision-Language Tasks**|Xinlu Zhang et.al.|[2311.01361v1](http://arxiv.org/abs/2311.01361v1)|null|\n", "2311.01202": "|**2023-11-02**|**Cross-Modal Information-Guided Network using Contrastive Learning for Point Cloud Registration**|Yifan Xie et.al.|[2311.01202v1](http://arxiv.org/abs/2311.01202v1)|**[link](https://github.com/ivanxie416/cmignet)**|\n", "2311.01092": "|**2023-11-02**|**Learning A Multi-Task Transformer Via Unified And Customized Instruction Tuning For Chest Radiograph Interpretation**|Lijian Xu et.al.|[2311.01092v1](http://arxiv.org/abs/2311.01092v1)|**[link](https://github.com/medhk23/omnifm-dr)**|\n", "2311.01066": "|**2023-11-02**|**Dynamic Multimodal Information Bottleneck for Multimodality Classification**|Yingying Fang et.al.|[2311.01066v1](http://arxiv.org/abs/2311.01066v1)|**[link](https://github.com/bii-wushuang/dmib)**|\n", "2311.00807": "|**2023-11-01**|**VQA-GEN: A Visual Question Answering Benchmark for Domain Generalization**|Suraj Jyothi Unni et.al.|[2311.00807v1](http://arxiv.org/abs/2311.00807v1)|null|\n", "2311.00737": "|**2023-11-01**|**Real-Time Magnetic Tracking and Diagnosis of COVID-19 via Machine Learning**|Dang Nguyen et.al.|[2311.00737v1](http://arxiv.org/abs/2311.00737v1)|null|\n", "2311.01908": "|**2023-11-03**|**LLM-driven Multimodal Target Volume Contouring in Radiation Oncology**|Yujin Oh et.al.|[2311.01908v1](http://arxiv.org/abs/2311.01908v1)|null|\n", "2311.01886": "|**2023-11-03**|**Bridging the Gap between Multi-focus and Multi-modal: A Focused Integration Framework for Multi-modal Image Fusion**|Xilai Li et.al.|[2311.01886v1](http://arxiv.org/abs/2311.01886v1)|null|\n", "2311.01881": "|**2023-11-03**|**Quantitative Evaluation of a Multi-Modal Camera Setup for Fusing Event Data with RGB Images**|Julian Moosmann et.al.|[2311.01881v1](http://arxiv.org/abs/2311.01881v1)|null|\n", "2311.01831": "|**2023-11-03**|**Universal Multi-modal Multi-domain Pre-trained Recommendation**|Wenqi Sun et.al.|[2311.01831v1](http://arxiv.org/abs/2311.01831v1)|null|\n", "2311.01807": "|**2023-11-03**|**Cross-modal Consistency Learning with Fine-grained Fusion Network for Multimodal Fake News Detection**|Jun Li et.al.|[2311.01807v1](http://arxiv.org/abs/2311.01807v1)|**[link](https://github.com/uestc-lj/cffn)**|\n", "2311.01767": "|**2023-11-03**|**PPTC Benchmark: Evaluating Large Language Models for PowerPoint Task Completion**|Yiduo Guo et.al.|[2311.01767v1](http://arxiv.org/abs/2311.01767v1)|**[link](https://github.com/gydpku/pptc)**|\n", "2311.01766": "|**2023-11-03**|**Support or Refute: Analyzing the Stance of Evidence to Detect Out-of-Context Mis- and Disinformation**|Xin Yuan et.al.|[2311.01766v1](http://arxiv.org/abs/2311.01766v1)|null|\n", "2311.01740": "|**2023-11-03**|**SAC$^3$: Reliable Hallucination Detection in Black-Box Language Models via Semantic-aware Cross-check Consistency**|Jiaxin Zhang et.al.|[2311.01740v1](http://arxiv.org/abs/2311.01740v1)|null|\n", "2311.01734": "|**2023-11-03**|**MixCon3D: Synergizing Multi-View and Cross-Modal Contrastive Learning for Enhancing 3D Representation**|Yipeng Gao et.al.|[2311.01734v1](http://arxiv.org/abs/2311.01734v1)|**[link](https://github.com/ucsc-vlaa/mixcon3d)**|\n", "2311.01487": "|**2023-11-02**|**What Makes for Good Visual Instructions? Synthesizing Complex Visual Reasoning Instructions for Visual Instruction Tuning**|Yifan Du et.al.|[2311.01487v1](http://arxiv.org/abs/2311.01487v1)|**[link](https://github.com/rucaibox/comvint)**|\n", "2311.03328": "|**2023-11-06**|**On Asynchrony, Memory, and Communication: Separations and Landscapes**|Paola Flocchini et.al.|[2311.03328v1](http://arxiv.org/abs/2311.03328v1)|null|\n", "2311.03217": "|**2023-11-06**|**Leveraging Transformers to Improve Breast Cancer Classification and Risk Assessment with Multi-modal and Longitudinal Data**|Yiqiu Shen et.al.|[2311.03217v1](http://arxiv.org/abs/2311.03217v1)|null|\n", "2311.03106": "|**2023-11-06**|**Unified Multi-modal Unsupervised Representation Learning for Skeleton-based Action Understanding**|Shengkai Sun et.al.|[2311.03106v1](http://arxiv.org/abs/2311.03106v1)|**[link](https://github.com/huiguanlab/umurl)**|\n", "2311.03090": "|**2023-11-06**|**A multi-modal approach to continuous material identification through tactile sensing**|Augusto G\u00f3mez Egu\u00edluz et.al.|[2311.03090v1](http://arxiv.org/abs/2311.03090v1)|null|\n", "2311.03079": "|**2023-11-06**|**CogVLM: Visual Expert for Pretrained Language Models**|Weihan Wang et.al.|[2311.03079v1](http://arxiv.org/abs/2311.03079v1)|**[link](https://github.com/thudm/cogvlm)**|\n", "2311.02863": "|**2023-11-06**|**Temporal Shift -- Multi-Objective Loss Function for Improved Anomaly Fall Detection**|Stefan Denkovski et.al.|[2311.02863v1](http://arxiv.org/abs/2311.02863v1)|null|\n", "2311.02850": "|**2023-11-06**|**IR-STP: Enhancing Autonomous Driving with Interaction Reasoning in Spatio-Temporal Planning**|Yingbing Chen et.al.|[2311.02850v1](http://arxiv.org/abs/2311.02850v1)|**[link](https://github.com/chenyingbing/ir-stp-planner)**|\n", "2311.02842": "|**2023-11-06**|**An invariant feature extraction for multi-modal images matching**|Chenzhong Gao et.al.|[2311.02842v1](http://arxiv.org/abs/2311.02842v1)|null|\n", "2311.02820": "|**2023-11-06**|**Mesh Neural Cellular Automata**|Ehsan Pajouheshgar et.al.|[2311.02820v1](http://arxiv.org/abs/2311.02820v1)|null|\n", "2311.02782": "|**2023-11-05**|**Towards Generic Anomaly Detection and Understanding: Large-scale Visual-linguistic Model (GPT-4V) Takes the Lead**|Yunkang Cao et.al.|[2311.02782v1](http://arxiv.org/abs/2311.02782v1)|**[link](https://github.com/caoyunkang/gpt4v-for-generic-anomaly-detection)**|\n", "2311.02733": "|**2023-11-05**|**AV-Lip-Sync+: Leveraging AV-HuBERT to Exploit Multimodal Inconsistency for Video Deepfake Detection**|Sahibzada Adil Shahzad et.al.|[2311.02733v1](http://arxiv.org/abs/2311.02733v1)|null|\n", "2311.02559": "|**2023-11-05**|**Rotation Invariant Transformer for Recognizing Object in UAVs**|Shuoyi Chen et.al.|[2311.02559v1](http://arxiv.org/abs/2311.02559v1)|null|\n", "2311.02329": "|**2023-11-04**|**Complex Organ Mask Guided Radiology Report Generation**|Gu Tiancheng et.al.|[2311.02329v1](http://arxiv.org/abs/2311.02329v1)|**[link](https://github.com/garygutc/comg_model)**|\n", "2311.02282": "|**2023-11-04**|**Contrastive Multi-Modal Representation Learning for Spark Plug Fault Diagnosis**|Ardavan Modarres et.al.|[2311.02282v1](http://arxiv.org/abs/2311.02282v1)|null|\n", "2311.02248": "|**2023-11-03**|**COSMIC: Data Efficient Instruction-tuning For Speech In-Context Learning**|Jing Pan et.al.|[2311.02248v1](http://arxiv.org/abs/2311.02248v1)|null|\n", "2311.04219": "|**2023-11-07**|**OtterHD: A High-Resolution Multi-modality Model**|Bo Li et.al.|[2311.04219v1](http://arxiv.org/abs/2311.04219v1)|null|\n", "2311.04160": "|**2023-11-07**|**\"Tell me about that church\": Exploring the Design and User Experience of In-Vehicle Multi-modal Intuitive Interface in the Context of Driving Scenario**|Yueteng Yu et.al.|[2311.04160v1](http://arxiv.org/abs/2311.04160v1)|null|\n", "2311.04091": "|**2023-11-07**|**Proceedings of the 5th International Workshop on Reading Music Systems**|Jorge Calvo-Zaragoza et.al.|[2311.04091v1](http://arxiv.org/abs/2311.04091v1)|**[link](https://github.com/suziai/gui-tools)**|\n", "2311.04058": "|**2023-11-07**|**mmFUSION: Multimodal Fusion for 3D Objects Detection**|Javed Ahmad et.al.|[2311.04058v1](http://arxiv.org/abs/2311.04058v1)|null|\n", "2311.04056": "|**2023-11-07**|**Multi-View Causal Representation Learning with Partial Observability**|Dingling Yao et.al.|[2311.04056v1](http://arxiv.org/abs/2311.04056v1)|null|\n", "2311.03810": "|**2023-11-07**|**Rethinking and Improving Multi-task Learning for End-to-end Speech Translation**|Yuhao Zhang et.al.|[2311.03810v1](http://arxiv.org/abs/2311.03810v1)|**[link](https://github.com/xiaozhang521/imtl)**|\n", "2311.03620": "|**2023-11-07**|**FusionViT: Hierarchical 3D Object Detection via LiDAR-Camera Vision Transformer Fusion**|Xinhao Xiang et.al.|[2311.03620v1](http://arxiv.org/abs/2311.03620v1)|null|\n", "2311.03606": "|**2023-11-06**|**Multimodal Stress Detection Using Facial Landmarks and Biometric Signals**|Majid Hosseini et.al.|[2311.03606v1](http://arxiv.org/abs/2311.03606v1)|null|\n", "2311.03413": "|**2023-11-06**|**Discret2Di -- Deep Learning based Discretization for Model-based Diagnosis**|Lukas Moddemann et.al.|[2311.03413v1](http://arxiv.org/abs/2311.03413v1)|null|\n", "2311.04766": "|**2023-11-08**|**DualTalker: A Cross-Modal Dual Learning Approach for Speech-Driven 3D Facial Animation**|Guinan Su et.al.|[2311.04766v1](http://arxiv.org/abs/2311.04766v1)|null|\n", "2311.04678": "|**2023-11-08**|**Weakly supervised cross-model learning in high-content screening**|Watkinson Gabriel et.al.|[2311.04678v1](http://arxiv.org/abs/2311.04678v1)|null|\n", "2311.04589": "|**2023-11-08**|**TEAL: Tokenize and Embed ALL for Multi-modal Large Language Models**|Zhen Yang et.al.|[2311.04589v1](http://arxiv.org/abs/2311.04589v1)|null|\n", "2311.04563": "|**2023-11-08**|**Investigating the Nature of Disagreements on Mid-Scale Ratings: A Case Study on the Abstractness-Concreteness Continuum**|Urban Knuple\u0161 et.al.|[2311.04563v1](http://arxiv.org/abs/2311.04563v1)|null|\n", "2311.04552": "|**2023-11-08**|**A 3D generative model of pathological multi-modal MR images and segmentations**|Virginia Fernandez et.al.|[2311.04552v1](http://arxiv.org/abs/2311.04552v1)|**[link](https://github.com/virginiafdez/brainspade3d_rel)**|\n", "2311.04512": "|**2023-11-08**|**FFINet: Future Feedback Interaction Network for Motion Forecasting**|Miao Kang et.al.|[2311.04512v1](http://arxiv.org/abs/2311.04512v1)|null|\n", "2311.04507": "|**2023-11-08**|**Conversation Understanding using Relational Temporal Graph Neural Networks with Auxiliary Cross-Modality Interaction**|Cam-Van Thi Nguyen et.al.|[2311.04507v1](http://arxiv.org/abs/2311.04507v1)|null|\n", "2311.04390": "|**2023-11-07**|**Force-Constrained Visual Policy: Safe Robot-Assisted Dressing via Multi-Modal Sensing**|Zhanyi Sun et.al.|[2311.04390v1](http://arxiv.org/abs/2311.04390v1)|null|\n", "2311.04257": "|**2023-11-07**|**mPLUG-Owl2: Revolutionizing Multi-modal Large Language Model with Modality Collaboration**|Qinghao Ye et.al.|[2311.04257v1](http://arxiv.org/abs/2311.04257v1)|**[link](https://github.com/x-plug/mplug-owl)**|\n", "2311.05494": "|**2023-11-09**|**Object-centric Cross-modal Feature Distillation for Event-based Object Detection**|Lei Li et.al.|[2311.05494v1](http://arxiv.org/abs/2311.05494v1)|null|\n", "2311.05464": "|**2023-11-09**|**3DStyle-Diffusion: Pursuing Fine-grained Text-driven 3D Stylization with 2D Diffusion Models**|Haibo Yang et.al.|[2311.05464v1](http://arxiv.org/abs/2311.05464v1)|**[link](https://github.com/yanghb22-fdu/3dstyle-diffusion-official)**|\n", "2311.05463": "|**2023-11-09**|**ControlStyle: Text-Driven Stylized Image Generation Using Diffusion Priors**|Jingwen Chen et.al.|[2311.05463v1](http://arxiv.org/abs/2311.05463v1)|null|\n", "2311.05348": "|**2023-11-09**|**u-LLaVA: Unifying Multi-Modal Tasks via Large Language Model**|Jinjin Xu et.al.|[2311.05348v1](http://arxiv.org/abs/2311.05348v1)|null|\n", "2311.05319": "|**2023-11-09**|**TLCFuse: Temporal Multi-Modality Fusion Towards Occlusion-Aware Semantic Segmentation-Aided Motion Planning**|Gustavo Salazar-Gomez et.al.|[2311.05319v1](http://arxiv.org/abs/2311.05319v1)|null|\n", "2311.05298": "|**2023-11-09**|**Improving Vision-and-Language Reasoning via Spatial Relations Modeling**|Cheng Yang et.al.|[2311.05298v1](http://arxiv.org/abs/2311.05298v1)|null|\n", "2311.05152": "|**2023-11-09**|**Cross-modal Prompts: Adapting Large Pre-trained Models for Audio-Visual Downstream Tasks**|Haoyi Duan et.al.|[2311.05152v1](http://arxiv.org/abs/2311.05152v1)|**[link](https://github.com/haoyi-duan/dg-sct)**|\n", "2311.05032": "|**2023-11-08**|**Transfer learning from a sparsely annotated dataset of 3D medical images**|Gabriel Efrain Humpire-Mamani et.al.|[2311.05032v1](http://arxiv.org/abs/2311.05032v1)|**[link](https://github.com/diagnijmegen/medicaltransferlearning3d-unet)**|\n"}, "Point Cloud Localization": {"2301.05372": "|**2023-01-13**|**Text to Point Cloud Localization with Relation-Enhanced Transformer**|Guangzhi Wang et.al.|[2301.05372v1](http://arxiv.org/abs/2301.05372v1)|null|\n", "2209.15475": "|**2022-09-30**|**Point Cloud Quality Assessment using 3D Saliency Maps**|Zhengyu Wang et.al.|[2209.15475v1](http://arxiv.org/abs/2209.15475v1)|null|\n", "2207.05317": "|**2022-07-12**|**CPO: Change Robust Panorama to Point Cloud Localization**|Junho Kim et.al.|[2207.05317v1](http://arxiv.org/abs/2207.05317v1)|null|\n", "2205.14965": "|**2022-05-31**|**PSNet: Fast Data Structuring for Hierarchical Deep Learning on Point Cloud**|Luyang Li et.al.|[2205.14965v2](http://arxiv.org/abs/2205.14965v2)|**[link](https://github.com/lly007/pointstructuringnet)**|\n", "2203.15125": "|**2022-04-05**|**Text2Pos: Text-to-Point-Cloud Cross-Modal Localization**|Manuel Kolmet et.al.|[2203.15125v2](http://arxiv.org/abs/2203.15125v2)|null|\n", "2003.02392": "|**2021-11-22**|**PointLoc: Deep Pose Regressor for LiDAR Point Cloud Localization**|Wei Wang et.al.|[2003.02392v3](http://arxiv.org/abs/2003.02392v3)|**[link](https://github.com/loveoxford/vreloc)**|\n", "1812.01711": "|**2018-11-28**|**A Graph-CNN for 3D Point Cloud Classification**|Yingxue Zhang et.al.|[1812.01711v1](http://arxiv.org/abs/1812.01711v1)|**[link](https://github.com/maggie0106/Graph-CNN-in-3D-Point-Cloud-Classification)**|\n", "1712.06760": "|**2018-04-03**|**Mining Point Cloud Local Structures by Kernel Correlation and Graph Pooling**|Yiru Shen et.al.|[1712.06760v2](http://arxiv.org/abs/1712.06760v2)|null|\n", "1702.04114": "|**2017-02-14**|**Graph Based Over-Segmentation Methods for 3D Point Clouds**|Yizhak Ben-Shabat et.al.|[1702.04114v1](http://arxiv.org/abs/1702.04114v1)|null|\n"}, "Place Recognization": {"2302.06149": "|**2023-02-13**|**Contour Context: Abstract Structural Distribution for 3D LiDAR Loop Detection and Metric Pose Estimation**|Binqian Jiang et.al.|[2302.06149v1](http://arxiv.org/abs/2302.06149v1)|**[link](https://github.com/lewisjiang/contour-context)**|\n", "2301.05604": "|**2023-01-13**|**A LiDAR-Inertial-Visual SLAM System with Loop Detection**|Kangcheng Liu et.al.|[2301.05604v1](http://arxiv.org/abs/2301.05604v1)|null|\n", "2212.12745": "|**2022-12-24**|**GraffMatch: Global Matching of 3D Lines and Planes for Wide Baseline LiDAR Registration**|Parker C. Lusk et.al.|[2212.12745v1](http://arxiv.org/abs/2212.12745v1)|null|\n", "2211.14864": "|**2022-11-27**|**A Faster, Lighter and Stronger Deep Learning-Based Approach for Place Recognition**|Rui Huang et.al.|[2211.14864v1](http://arxiv.org/abs/2211.14864v1)|null|\n", "2211.12732": "|**2023-03-02**|**Wild-Places: A Large-Scale Dataset for Lidar Place Recognition in Unstructured Natural Environments**|Joshua Knights et.al.|[2211.12732v3](http://arxiv.org/abs/2211.12732v3)|**[link](https://github.com/csiro-robotics/Wild-Places)**|\n", "2210.13856": "|**2022-11-02**|**A Framework for Collaborative Multi-Robot Mapping using Spectral Graph Wavelets**|Lukas Bernreiter et.al.|[2210.13856v2](http://arxiv.org/abs/2210.13856v2)|null|\n", "2210.11029": "|**2022-10-20**|**DeepRING: Learning Roto-translation Invariant Representation for LiDAR based Place Recognition**|Sha Lu et.al.|[2210.11029v1](http://arxiv.org/abs/2210.11029v1)|null|\n", "2210.04432": "|**2023-03-06**|**Spectral Geometric Verification: Re-Ranking Point Cloud Retrieval for Metric Localization**|Kavisha Vidanapathirana et.al.|[2210.04432v2](http://arxiv.org/abs/2210.04432v2)|**[link](https://github.com/csiro-robotics/spectralgv)**|\n", "2210.04236": "|**2022-10-09**|**Fusing Event-based Camera and Radar for SLAM Using Spiking Neural Networks with Continual STDP Learning**|Ali Safa et.al.|[2210.04236v1](http://arxiv.org/abs/2210.04236v1)|null|\n", "2210.01320": "|**2022-11-23**|**Wi-Closure: Reliable and Efficient Search of Inter-robot Loop Closures Using Wireless Sensing**|Weiying Wang et.al.|[2210.01320v2](http://arxiv.org/abs/2210.01320v2)|null|\n", "2209.12513": "|**2022-09-26**|**NDD: A 3D Point Cloud Descriptor Based on Normal Distribution for Loop Closure Detection**|Ruihao Zhou et.al.|[2209.12513v1](http://arxiv.org/abs/2209.12513v1)|**[link](https://github.com/zhouruihao1001/ndd)**|\n", "2209.11894": "|**2022-09-24**|**Closing the Loop: Graph Networks to Unify Semantic Objects and Visual Features for Multi-object Scenes**|Jonathan J. Y. Kim et.al.|[2209.11894v1](http://arxiv.org/abs/2209.11894v1)|null|\n", "2209.09699": "|**2023-03-28**|**PADLoC: LiDAR-Based Deep Loop Closure Detection and Registration Using Panoptic Attention**|Jos\u00e9 Arce et.al.|[2209.09699v3](http://arxiv.org/abs/2209.09699v3)|**[link](https://github.com/robot-learning-freiburg/PADLoC)**|\n", "2209.08608": "|**2022-09-18**|**HGI-SLAM: Loop Closure With Human and Geometric Importance Features**|Shuhul Mujoo et.al.|[2209.08608v1](http://arxiv.org/abs/2209.08608v1)|null|\n", "2209.08578": "|**2022-09-18**|**Data-driven Loop Closure Detection in Bathymetric Point Clouds for Underwater SLAM**|Jiarui Tan et.al.|[2209.08578v1](http://arxiv.org/abs/2209.08578v1)|**[link](https://github.com/tjr16/bathy_nn_learning)**|\n", "2209.06779": "|**2022-10-15**|**Efficient Planar Pose Estimation via UWB Measurements**|Haodong Jiang et.al.|[2209.06779v3](http://arxiv.org/abs/2209.06779v3)|**[link](https://github.com/SLAMLab-CUHKSZ/Efficient-Pose-Estimation-via-UWB-measurements)**|\n", "2209.06545": "|**2023-01-12**|**Tac2Structure: Object Surface Reconstruction Only through Multi Times Touch**|Junyuan Lu et.al.|[2209.06545v3](http://arxiv.org/abs/2209.06545v3)|**[link](https://github.com/ljy-zju/tac2structure)**|\n", "2209.04497": "|**2022-09-09**|**General Place Recognition Survey: Towards the Real-world Autonomy Age**|Peng Yin et.al.|[2209.04497v1](http://arxiv.org/abs/2209.04497v1)|**[link](https://github.com/MetaSLAM/GPRS)**|\n", "2207.10916": "|**2022-07-22**|**PLD-SLAM: A Real-Time Visual SLAM Using Points and Line Segments in Dynamic Scenes**|BaoSheng Zhang et.al.|[2207.10916v1](http://arxiv.org/abs/2207.10916v1)|null|\n", "2207.06965": "|**2022-09-28**|**AutoMerge: A Framework for Map Assembling and Smoothing in City-scale Environments**|Peng Yin et.al.|[2207.06965v3](http://arxiv.org/abs/2207.06965v3)|null|\n", "2207.06738": "|**2022-07-14**|**Semi-supervised Vector-Quantization in Visual SLAM using HGCN**|Amir Zarringhalam et.al.|[2207.06738v1](http://arxiv.org/abs/2207.06738v1)|null|\n", "2207.06732": "|**2022-07-14**|**Self-supervised Vector-Quantization in Visual SLAM using Deep Convolutional Autoencoders**|Amir Zarringhalam et.al.|[2207.06732v1](http://arxiv.org/abs/2207.06732v1)|null|\n", "2206.12628": "|**2022-09-27**|**FreSCo: Frequency-Domain Scan Context for LiDAR-based Place Recognition with Translation and Rotation Invariance**|Yongzhi Fan et.al.|[2206.12628v2](http://arxiv.org/abs/2206.12628v2)|**[link](https://github.com/soytony/fresco)**|\n", "2206.08733": "|**2022-06-17**|**Efficient WiFi LiDAR SLAM for Autonomous Robots in Large Environments**|Khairuldanial Ismail et.al.|[2206.08733v1](http://arxiv.org/abs/2206.08733v1)|null|\n", "2205.13135": "|**2022-07-09**|**LAMP 2.0: A Robust Multi-Robot SLAM System for Operation in Challenging Large-Scale Underground Environments**|Yun Chang et.al.|[2205.13135v3](http://arxiv.org/abs/2205.13135v3)|**[link](https://github.com/nebula-autonomy/nebula-multirobot-dataset)**|\n", "2204.12831": "|**2022-11-09**|**The Revisiting Problem in Simultaneous Localization and Mapping: A Survey on Visual Loop Closure Detection**|Konstantinos A. Tsintotas et.al.|[2204.12831v3](http://arxiv.org/abs/2204.12831v3)|null|\n", "2204.05481": "|**2022-04-12**|**HiTPR: Hierarchical Transformer for Place Recognition in Point Cloud**|Zhixing Hou et.al.|[2204.05481v1](http://arxiv.org/abs/2204.05481v1)|null|\n", "2204.04932": "|**2022-04-11**|**Optimized SC-F-LOAM: Optimized Fast LiDAR Odometry and Mapping Using Scan Context**|Lizhou Liao et.al.|[2204.04932v1](http://arxiv.org/abs/2204.04932v1)|**[link](https://github.com/SlamCabbage/Optimized-SC-F-LOAM)**|\n", "2204.01524": "|**2022-04-01**|**Bi-directional Loop Closure for Visual SLAM**|Ihtisham Ali et.al.|[2204.01524v1](http://arxiv.org/abs/2204.01524v1)|null|\n", "2203.03454": "|**2022-03-07**|**Multi-Modal Lidar Dataset for Benchmarking General-Purpose Localization and Mapping Algorithms**|Qingqing Li et.al.|[2203.03454v1](http://arxiv.org/abs/2203.03454v1)|**[link](https://github.com/tiers/tiers-lidars-dataset)**|\n", "2201.13360": "|**2022-06-20**|**Hydra: A Real-time Spatial Perception System for 3D Scene Graph Construction and Optimization**|Nathan Hughes et.al.|[2201.13360v2](http://arxiv.org/abs/2201.13360v2)|null|\n", "2201.09048": "|**2022-01-22**|**Phase-SLAM: Phase Based Simultaneous Localization and Mapping for Mobile Structured Light Illumination Systems**|Xi Zheng et.al.|[2201.09048v1](http://arxiv.org/abs/2201.09048v1)|**[link](https://github.com/zhengxi-git/phase-slam)**|\n", "2201.03212": "|**2022-01-10**|**Why-So-Deep: Towards Boosting Previously Trained Models for Visual Place Recognition**|M. Usman Maqbool Bhutta et.al.|[2201.03212v1](http://arxiv.org/abs/2201.03212v1)|**[link](https://github.com/UsmanMaqbool/why-so-deep)**|\n", "2111.14990": "|**2021-11-29**|**MIXER: A Principled Framework for Multimodal, Multiway Data Association**|Parker C. Lusk et.al.|[2111.14990v1](http://arxiv.org/abs/2111.14990v1)|null|\n", "2111.13838": "|**2021-11-27**|**DSC: Deep Scan Context Descriptor for Large-Scale Place Recognition**|Jiafeng Cui et.al.|[2111.13838v1](http://arxiv.org/abs/2111.13838v1)|null|\n", "2111.13826": "|**2021-11-27**|**Average Outward Flux Skeletons for Environment Mapping and Topology Matching**|Morteza Rezanejad et.al.|[2111.13826v1](http://arxiv.org/abs/2111.13826v1)|null|\n", "2111.00440": "|**2022-02-27**|**Loop closure detection using local 3D deep descriptors**|Youjie Zhou et.al.|[2111.00440v2](http://arxiv.org/abs/2111.00440v2)|**[link](https://github.com/yiming107/l3d_loop_closure)**|\n", "2110.11491": "|**2021-10-21**|**SymbioLCD: Ensemble-Based Loop Closure Detection using CNN-Extracted Objects and Visual Bag-of-Words**|Jonathan J. Y. Kim et.al.|[2110.11491v1](http://arxiv.org/abs/2110.11491v1)|null|\n", "2109.08975": "|**2022-03-09**|**AirLoop: Lifelong Loop Closure Detection**|Dasong Gao et.al.|[2109.08975v3](http://arxiv.org/abs/2109.08975v3)|**[link](https://github.com/wang-chen/airloop)**|\n", "2109.06596": "|**2021-09-14**|**GPGM-SLAM: a Robust SLAM System for Unstructured Planetary Environments with Gaussian Process Gradient Maps**|Riccardo Giubilato et.al.|[2109.06596v1](http://arxiv.org/abs/2109.06596v1)|null|\n", "2108.12790": "|**2022-08-28**|**RPR-Net: A Point Cloud-based Rotation-aware Large Scale Place Recognition Network**|Zhaoxin Fan et.al.|[2108.12790v3](http://arxiv.org/abs/2108.12790v3)|null|\n", "2108.02028": "|**2021-08-04**|**Incorporating Learnt Local and Global Embeddings into Monocular Visual SLAM**|Huaiyang Huang et.al.|[2108.02028v1](http://arxiv.org/abs/2108.02028v1)|null|\n", "2108.01383": "|**2021-08-03**|**On the descriptive power of LiDAR intensity images for segment-based loop closing in 3-D SLAM**|Jan Wietrzykowski et.al.|[2108.01383v1](http://arxiv.org/abs/2108.01383v1)|**[link](https://github.com/LRMPUT/segmap_vis_views)**|\n", "2107.14611": "|**2021-07-30**|**Automatic Vocabulary and Graph Verification for Accurate Loop Closure Detection**|Haosong Yue et.al.|[2107.14611v1](http://arxiv.org/abs/2107.14611v1)|null|\n", "2107.07707": "|**2021-07-16**|**Probabilistic Appearance-Invariant Topometric Localization with New Place Awareness**|Ming Xu et.al.|[2107.07707v1](http://arxiv.org/abs/2107.07707v1)|**[link](https://github.com/mingu6/TopometricLoc)**|\n", "2107.07133": "|**2021-07-15**|**A life-long SLAM approach using adaptable local maps based on rasterized LIDAR images**|Waqas Ali et.al.|[2107.07133v1](http://arxiv.org/abs/2107.07133v1)|null|\n", "2106.11516": "|**2021-07-01**|**SA-LOAM: Semantic-aided LiDAR SLAM with Loop Closure**|Lin Li et.al.|[2106.11516v2](http://arxiv.org/abs/2106.11516v2)|null|\n", "2106.09637": "|**2023-01-04**|**AttDLNet: Attention-based DL Network for 3D LiDAR Place Recognition**|Tiago Barros et.al.|[2106.09637v4](http://arxiv.org/abs/2106.09637v4)|**[link](https://github.com/cybonic/attdlnet)**|\n", "2105.11344": "|**2021-05-24**|**OverlapNet: Loop Closing for LiDAR-based SLAM**|Xieyuanli Chen et.al.|[2105.11344v1](http://arxiv.org/abs/2105.11344v1)|**[link](https://github.com/PRBonn/OverlapNet)**|\n", "2103.12292": "|**2021-03-23**|**NDT-Transformer: Large-Scale 3D Point Cloud Localisation using the Normal Distribution Transform Representation**|Zhicheng Zhou et.al.|[2103.12292v1](http://arxiv.org/abs/2103.12292v1)|**[link](https://github.com/dachengxiaocheng/NDT-Transformer)**|\n", "2303.00477": "|**2023-03-01**|**ORCHNet: A Robust Global Feature Aggregation approach for 3D LiDAR-based Place recognition in Orchards**|T. Barros et.al.|[2303.00477v1](http://arxiv.org/abs/2303.00477v1)|**[link](https://github.com/cybonic/orchnet)**|\n", "2303.00295": "|**2023-03-01**|**Region Prediction for Efficient Robot Localization on Large Maps**|Matteo Scucchia et.al.|[2303.00295v1](http://arxiv.org/abs/2303.00295v1)|null|\n", "2304.03872": "|**2023-06-24**|**LSGDDN-LCD: An Appearance-based Loop Closure Detection using Local Superpixel Grid Descriptors and Incremental Dynamic Nodes**|Baosheng Zhang et.al.|[2304.03872v2](http://arxiv.org/abs/2304.03872v2)|null|\n", "2304.05146": "|**2023-04-14**|**Loop Closure Detection Based on Object-level Spatial Layout and Semantic Consistency**|Xingwu Ji et.al.|[2304.05146v2](http://arxiv.org/abs/2304.05146v2)|**[link](https://github.com/jixingwu/ss-lcd)**|\n", "2304.13487": "|**2023-04-26**|**Hydra-Multi: Collaborative Online Construction of 3D Scene Graphs with Multi-Robot Teams**|Yun Chang et.al.|[2304.13487v1](http://arxiv.org/abs/2304.13487v1)|null|\n", "2305.07154": "|**2023-05-11**|**Foundations of Spatial Perception for Robotics: Hierarchical Representations and Real-time Systems**|Nathan Hughes et.al.|[2305.07154v1](http://arxiv.org/abs/2305.07154v1)|**[link](https://github.com/mit-spark/hydra)**|\n", "2305.18013": "|**2023-05-29**|**TReR: A Lightweight Transformer Re-Ranking Approach for 3D LiDAR Place Recognition**|Tiago Barros et.al.|[2305.18013v1](http://arxiv.org/abs/2305.18013v1)|null|\n", "2307.04321": "|**2023-07-10**|**RaPlace: Place Recognition for Imaging Radar using Radon Transform and Mutable Threshold**|Hyesu Jang et.al.|[2307.04321v1](http://arxiv.org/abs/2307.04321v1)|**[link](https://github.com/hyesu-jang/raplace)**|\n", "2307.08221": "|**2023-07-17**|**NDT-Map-Code: A 3D global descriptor for real-time loop closure detection in lidar SLAM**|Lizhou Liao et.al.|[2307.08221v1](http://arxiv.org/abs/2307.08221v1)|**[link](https://github.com/SlamCabbage/NDTMC)**|\n", "2307.09044": "|**2023-07-18**|**3D-SeqMOS: A Novel Sequential 3D Moving Object Segmentation in Autonomous Driving**|Qipeng Li et.al.|[2307.09044v1](http://arxiv.org/abs/2307.09044v1)|null|\n", "2309.02394": "|**2023-09-05**|**Magnetic Navigation using Attitude-Invariant Magnetic Field Information for Loop Closure Detection**|Natalia Pavlasek et.al.|[2309.02394v1](http://arxiv.org/abs/2309.02394v1)|null|\n", "2309.07094": "|**2023-09-13**|**RadarLCD: Learnable Radar-based Loop Closure Detection Pipeline**|Mirko Usuelli et.al.|[2309.07094v1](http://arxiv.org/abs/2309.07094v1)|null|\n", "2309.09879": "|**2023-09-18**|**DynaPix SLAM: A Pixel-Based Dynamic SLAM Approach**|Chenghao Xu et.al.|[2309.09879v1](http://arxiv.org/abs/2309.09879v1)|null|\n", "2309.08914": "|**2023-09-16**|**Outram: One-shot Global Localization via Triangulated Scene Graph and Global Outlier Pruning**|Pengyu Yin et.al.|[2309.08914v1](http://arxiv.org/abs/2309.08914v1)|**[link](https://github.com/pamphlett/outram)**|\n"}, "LiDAR SLAM": {"2212.14209": "|**2022-12-29**|**An Enhanced LiDAR-Inertial SLAM System for Robotics Localization and Mapping**|Kangcheng Liu et.al.|[2212.14209v1](http://arxiv.org/abs/2212.14209v1)|**[link](https://github.com/KangchengLiu/slam_resources)**|\n", "2212.05705": "|**2022-12-12**|**An Integrated LiDAR-SLAM System for Complex Environment with Noisy Point Clouds**|Kangcheng Liu et.al.|[2212.05705v1](http://arxiv.org/abs/2212.05705v1)|**[link](https://github.com/KangchengLiu/DLC_LiDAR_SLAM)**|\n", "2212.02077": "|**2022-12-05**|**DL-SLOT: Dynamic LiDAR SLAM and object tracking based on collaborative graph optimization**|Xuebo Tian et.al.|[2212.02077v1](http://arxiv.org/abs/2212.02077v1)|null|\n", "2211.03484": "|**2022-11-07**|**When Geometry is not Enough: Using Reflector Markers in Lidar SLAM**|Gerhard Kurz et.al.|[2211.03484v1](http://arxiv.org/abs/2211.03484v1)|null|\n", "2211.02445": "|**2023-04-14**|**Lidar-level localization with radar? The CFEAR approach to accurate, fast and robust large-scale radar odometry in diverse environments**|Daniel Adolfsson et.al.|[2211.02445v3](http://arxiv.org/abs/2211.02445v3)|**[link](https://github.com/dan11003/CFEAR_Radarodometry_code_public)**|\n", "2210.11978": "|**2023-04-13**|**DCL-SLAM: A Distributed Collaborative LiDAR SLAM Framework for a Robotic Swarm**|Shipeng Zhong et.al.|[2210.11978v2](http://arxiv.org/abs/2210.11978v2)|**[link](https://github.com/pengyu-team/dcl-slam)**|\n", "2210.00812": "|**2022-10-03**|**A Benchmark for Multi-Modal Lidar SLAM with Ground Truth in GNSS-Denied Environments**|Ha Sier et.al.|[2210.00812v1](http://arxiv.org/abs/2210.00812v1)|**[link](https://github.com/tiers/tiers-lidars-dataset-enhanced)**|\n", "2209.08810": "|**2022-09-19**|**LMBAO: A Landmark Map for Bundle Adjustment Odometry in LiDAR SLAM**|Letian Zhang et.al.|[2209.08810v1](http://arxiv.org/abs/2209.08810v1)|null|\n", "2209.08248": "|**2022-09-29**|**PlaneSLAM: Plane-based LiDAR SLAM for Motion Planning in Structured 3D Environments**|Adam Dai et.al.|[2209.08248v2](http://arxiv.org/abs/2209.08248v2)|**[link](https://github.com/stanford-navlab/planeslam)**|\n", "2209.08091": "|**2022-09-16**|**ViWiD: Leveraging WiFi for Robust and Resource-Efficient SLAM**|Aditya Arun et.al.|[2209.08091v1](http://arxiv.org/abs/2209.08091v1)|null|\n", "2208.11855": "|**2022-08-25**|**Lidar SLAM for Autonomous Driving Vehicles**|Farhad Aghili et.al.|[2208.11855v1](http://arxiv.org/abs/2208.11855v1)|null|\n", "2208.09777": "|**2022-09-08**|**JVLDLoc: a Joint Optimization of Visual-LiDAR Constraints and Direction Priors for Localization in Driving Scenario**|Longrui Dong et.al.|[2208.09777v3](http://arxiv.org/abs/2208.09777v3)|null|\n", "2208.07473": "|**2022-11-18**|**BoW3D: Bag of Words for Real-Time Loop Closing in 3D LiDAR SLAM**|Yunge Cui et.al.|[2208.07473v2](http://arxiv.org/abs/2208.07473v2)|**[link](https://github.com/yungecui/bow3d)**|\n", "2207.06815": "|**2022-07-14**|**Challenges of SLAM in extremely unstructured environments: the DLR Planetary Stereo, Solid-State LiDAR, Inertial Dataset**|Riccardo Giubilato et.al.|[2207.06815v1](http://arxiv.org/abs/2207.06815v1)|null|\n", "2206.09463": "|**2022-06-19**|**RF-LIO: Removal-First Tightly-coupled Lidar Inertial Odometry in High Dynamic Environments**|Chenglong Qian et.al.|[2206.09463v1](http://arxiv.org/abs/2206.09463v1)|null|\n", "2206.08733": "|**2022-06-17**|**Efficient WiFi LiDAR SLAM for Autonomous Robots in Large Environments**|Khairuldanial Ismail et.al.|[2206.08733v1](http://arxiv.org/abs/2206.08733v1)|null|\n", "2206.00266": "|**2022-06-01**|**PaGO-LOAM: Robust Ground-Optimized LiDAR Odometry**|Dong-Uk Seo et.al.|[2206.00266v1](http://arxiv.org/abs/2206.00266v1)|**[link](https://github.com/url-kaist/alterground-lego-loam)**|\n", "2205.08556": "|**2022-05-17**|**Global Data Association for SLAM with 3D Grassmannian Manifold Objects**|Parker C. Lusk et.al.|[2205.08556v1](http://arxiv.org/abs/2205.08556v1)|null|\n", "2204.12769": "|**2022-04-27**|**Dynamic Registration: Joint Ego Motion Estimation and 3D Moving Object Detection in Dynamic Environment**|Wenyu Li et.al.|[2204.12769v1](http://arxiv.org/abs/2204.12769v1)|null|\n", "2204.08163": "|**2022-04-18**|**Mapping While Following: 2D LiDAR SLAM in Indoor Dynamic Environments with a Person Tracker**|Hanjing Ye et.al.|[2204.08163v1](http://arxiv.org/abs/2204.08163v1)|null|\n", "2203.13799": "|**2022-03-25**|**Gravity-constrained point cloud registration**|Vladim\u00edr Kubelka et.al.|[2203.13799v1](http://arxiv.org/abs/2203.13799v1)|null|\n", "2202.11431": "|**2022-02-23**|**DL-SLOT: Dynamic Lidar SLAM and Object Tracking Based On Graph Optimization**|Xuebo Tian et.al.|[2202.11431v1](http://arxiv.org/abs/2202.11431v1)|null|\n", "2201.06423": "|**2022-01-17**|**SC-LiDAR-SLAM: a Front-end Agnostic Versatile LiDAR SLAM System**|Giseop Kim et.al.|[2201.06423v1](http://arxiv.org/abs/2201.06423v1)|null|\n", "2110.11517": "|**2021-10-21**|**Real-Time Ground-Plane Refined LiDAR SLAM**|Fan Yang et.al.|[2110.11517v1](http://arxiv.org/abs/2110.11517v1)|null|\n", "2110.02018": "|**2021-10-03**|**AEROS: Adaptive RObust least-Squares for Graph-Based SLAM**|Milad Ramezani et.al.|[2110.02018v1](http://arxiv.org/abs/2110.02018v1)|null|\n", "2109.05483": "|**2021-09-12**|**ART-SLAM: Accurate Real-Time 6DoF LiDAR SLAM**|Matteo Frosi et.al.|[2109.05483v1](http://arxiv.org/abs/2109.05483v1)|**[link](https://github.com/matteof94/artslam)**|\n", "2109.00200": "|**2021-09-01**|**A real-time global re-localization framework for 3D LiDAR SLAM**|Ziqi Chai et.al.|[2109.00200v1](http://arxiv.org/abs/2109.00200v1)|null|\n", "2108.01383": "|**2021-08-03**|**On the descriptive power of LiDAR intensity images for segment-based loop closing in 3-D SLAM**|Jan Wietrzykowski et.al.|[2108.01383v1](http://arxiv.org/abs/2108.01383v1)|**[link](https://github.com/LRMPUT/segmap_vis_views)**|\n", "2107.05283": "|**2021-07-12**|**Benchmark of visual and 3D lidar SLAM systems in simulation environment for vineyards**|Ibrahim Hroob et.al.|[2107.05283v1](http://arxiv.org/abs/2107.05283v1)|null|\n", "2106.11516": "|**2021-07-01**|**SA-LOAM: Semantic-aided LiDAR SLAM with Loop Closure**|Lin Li et.al.|[2106.11516v2](http://arxiv.org/abs/2106.11516v2)|null|\n", "2105.08941": "|**2021-05-19**|**Large-scale Localization Datasets in Crowded Indoor Spaces**|Donghwan Lee et.al.|[2105.08941v1](http://arxiv.org/abs/2105.08941v1)|null|\n", "2105.03296": "|**2021-10-05**|**VIRAL SLAM: Tightly Coupled Camera-IMU-UWB-Lidar SLAM**|Thien-Minh Nguyen et.al.|[2105.03296v3](http://arxiv.org/abs/2105.03296v3)|null|\n", "2104.05347": "|**2021-04-12**|**Radar SLAM: A Robust SLAM System for All Weather Conditions**|Ziyang Hong et.al.|[2104.05347v1](http://arxiv.org/abs/2104.05347v1)|null|\n", "2104.03657": "|**2021-04-08**|**Dynamic Object Aware LiDAR SLAM based on Automatic Generation of Training Data**|Patrick Pfreundschuh et.al.|[2104.03657v1](http://arxiv.org/abs/2104.03657v1)|null|\n", "2103.13090": "|**2021-03-24**|**Greedy-Based Feature Selection for Efficient LiDAR SLAM**|Jianhao Jiao et.al.|[2103.13090v1](http://arxiv.org/abs/2103.13090v1)|null|\n", "2103.10678": "|**2021-03-19**|**6-DOF Feature based LIDAR SLAM using ORB Features from Rasterized Images of 3D LIDAR Point Cloud**|Waqas Ali et.al.|[2103.10678v1](http://arxiv.org/abs/2103.10678v1)|null|\n", "2103.09523": "|**2021-12-30**|**A Universal LiDAR SLAM Accelerator System on Low-cost FPGA**|Keisuke Sugiura et.al.|[2103.09523v2](http://arxiv.org/abs/2103.09523v2)|null|\n", "2103.05056": "|**2022-02-08**|**LCDNet: Deep Loop Closure Detection and Point Cloud Registration for LiDAR SLAM**|Daniele Cattaneo et.al.|[2103.05056v4](http://arxiv.org/abs/2103.05056v4)|**[link](https://github.com/robot-learning-freiburg/LCDNet)**|\n", "2103.03713": "|**2021-03-05**|**Ground-SLAM: Ground Constrained LiDAR SLAM for Structured Multi-Floor Environments**|Xin Wei et.al.|[2103.03713v1](http://arxiv.org/abs/2103.03713v1)|null|\n", "2102.03800": "|**2021-02-17**|**Lightweight 3-D Localization and Mapping for Solid-State LiDAR**|Han Wang et.al.|[2102.03800v2](http://arxiv.org/abs/2102.03800v2)|**[link](https://github.com/wh200720041/SSL_SLAM)**|\n", "2102.03798": "|**2021-02-17**|**Intensity-SLAM: Intensity Assisted Localization and Mapping for Large Scale Environment**|Han Wang et.al.|[2102.03798v2](http://arxiv.org/abs/2102.03798v2)|**[link](https://github.com/wh200720041/intensity_slam)**|\n", "2102.03771": "|**2021-04-27**|**MULLS: Versatile LiDAR SLAM via Multi-metric Linear Least Square**|Yue Pan et.al.|[2102.03771v3](http://arxiv.org/abs/2102.03771v3)|**[link](https://github.com/YuePanEdward/MULLS)**|\n", "2101.06615": "|**2021-05-31**|**Online Robust Sliding-Windowed LiDAR SLAM in Natural Environments**|Quang-Ha Pham et.al.|[2101.06615v6](http://arxiv.org/abs/2101.06615v6)|null|\n", "2012.03455": "|**2020-12-07**|**TP-TIO: A Robust Thermal-Inertial Odometry with Deep ThermalPoint**|Shibo Zhao et.al.|[2012.03455v1](http://arxiv.org/abs/2012.03455v1)|null|\n", "2012.02399": "|**2020-12-04**|**P3-LOAM: PPP/LiDAR Loosely Coupled SLAM with Accurate Covariance Estimation and Robust RAIM in Urban Canyon Environment**|Tao Li et.al.|[2012.02399v1](http://arxiv.org/abs/2012.02399v1)|null|\n", "2011.11357": "|**2020-11-23**|**CamVox: A Low-cost and Accurate Lidar-assisted Visual SLAM System**|Yuewen Zhu et.al.|[2011.11357v1](http://arxiv.org/abs/2011.11357v1)|**[link](https://github.com/ISEE-Technology/CamVox)**|\n", "2011.02306": "|**2021-09-11**|**A Comparison of LiDAR-based SLAM Systems for Control of Unmanned Aerial Vehicles**|Robert Milijas et.al.|[2011.02306v3](http://arxiv.org/abs/2011.02306v3)|null|\n", "2010.08215": "|**2021-01-13**|**BALM: Bundle Adjustment for Lidar Mapping**|Zheng Liu et.al.|[2010.08215v2](http://arxiv.org/abs/2010.08215v2)|**[link](https://github.com/hku-mars/BALM)**|\n", "2008.03694": "|**2020-08-09**|**LiDAR Data Enrichment Using Deep Learning Based on High-Resolution Image: An Approach to Achieve High-Performance LiDAR SLAM Using Low-cost LiDAR**|Jiang Yue et.al.|[2008.03694v1](http://arxiv.org/abs/2008.03694v1)|null|\n", "2008.02274": "|**2020-08-05**|**Elasticity Meets Continuous-Time: Map-Centric Dense 3D LiDAR SLAM**|Chanoh Park et.al.|[2008.02274v1](http://arxiv.org/abs/2008.02274v1)|null|\n", "2302.13613": "|**2023-03-13**|**Evaluation of Lidar-based 3D SLAM algorithms in SubT environment**|Anton Koval et.al.|[2302.13613v2](http://arxiv.org/abs/2302.13613v2)|null|\n", "2303.01155": "|**2023-04-07**|**Marker-based Visual SLAM leveraging Hierarchical Representations**|Ali Tourani et.al.|[2303.01155v2](http://arxiv.org/abs/2303.01155v2)|null|\n", "2303.05252": "|**2023-03-09**|**SLAMesh: Real-time LiDAR Simultaneous Localization and Meshing**|Jianyuan Ruan et.al.|[2303.05252v1](http://arxiv.org/abs/2303.05252v1)|**[link](https://github.com/RuanJY/SLAMesh)**|\n", "2305.01843": "|**2023-05-03**|**Direct LiDAR-Inertial Odometry and Mapping: Perceptive and Connective SLAM**|Kenny Chen et.al.|[2305.01843v1](http://arxiv.org/abs/2305.01843v1)|null|\n", "2306.03660": "|**2023-06-06**|**PQM: A Point Quality Evaluation Metric for Dense Maps**|Yash Turkar et.al.|[2306.03660v1](http://arxiv.org/abs/2306.03660v1)|**[link](https://github.com/droneslab/pqm-sim)**|\n", "2307.08221": "|**2023-07-17**|**NDT-Map-Code: A 3D global descriptor for real-time loop closure detection in lidar SLAM**|Lizhou Liao et.al.|[2307.08221v1](http://arxiv.org/abs/2307.08221v1)|**[link](https://github.com/SlamCabbage/NDTMC)**|\n", "2307.09044": "|**2023-07-18**|**3D-SeqMOS: A Novel Sequential 3D Moving Object Segmentation in Autonomous Driving**|Qipeng Li et.al.|[2307.09044v1](http://arxiv.org/abs/2307.09044v1)|null|\n", "2307.15005": "|**2023-07-27**|**FLiCR: A Fast and Lightweight LiDAR Point Cloud Compression Based on Lossy RI**|Jin Heo et.al.|[2307.15005v1](http://arxiv.org/abs/2307.15005v1)|null|\n", "2309.04937": "|**2023-09-12**|**LONER: LiDAR Only Neural Representations for Real-Time SLAM**|Seth Isaacson et.al.|[2309.04937v2](http://arxiv.org/abs/2309.04937v2)|null|\n", "2309.08086": "|**2023-09-15**|**Fast and Accurate Deep Loop Closing and Relocalization for Reliable LiDAR SLAM**|Chenghao Shi et.al.|[2309.08086v1](http://arxiv.org/abs/2309.08086v1)|null|\n", "2311.00928": "|**2023-11-02**|**Quatro++: Robust Global Registration Exploiting Ground Segmentation for Loop Closing in LiDAR SLAM**|Hyungtae Lim et.al.|[2311.00928v1](http://arxiv.org/abs/2311.00928v1)|null|\n", "2311.02327": "|**2023-11-04**|**ECMD: An Event-Centric Multisensory Driving Dataset for SLAM**|Peiyu Chen et.al.|[2311.02327v1](http://arxiv.org/abs/2311.02327v1)|null|\n"}, "Transformer": {"2302.08104": "|**2023-02-16**|**Multiscalar field cosmological model and possible solutions using Noether symmetry approach**|Santu Mondal et.al.|[2302.08104v1](http://arxiv.org/abs/2302.08104v1)|null|\n", "2301.11622": "|**2023-01-30**|**Darboux transformations for Dunkl-Schroedinger equations with energy dependent potential and position dependent mass**|Axel Schulze-Halberg et.al.|[2301.11622v2](http://arxiv.org/abs/2301.11622v2)|null|\n", "2301.09364": "|**2023-04-06**|**On uniqueness of submaximally symmetric vector ordinary differential equations of C-class**|Johnson Allen Kessy et.al.|[2301.09364v2](http://arxiv.org/abs/2301.09364v2)|null|\n", "2301.08739": "|**2023-03-30**|**FlatFormer: Flattened Window Attention for Efficient Point Cloud Transformer**|Zhijian Liu et.al.|[2301.08739v2](http://arxiv.org/abs/2301.08739v2)|null|\n", "2301.07301": "|**2023-01-18**|**PTA-Det: Point Transformer Associating Point cloud and Image for 3D Object Detection**|Rui Wan et.al.|[2301.07301v1](http://arxiv.org/abs/2301.07301v1)|null|\n", "2301.02650": "|**2023-01-06**|**Model-Agnostic Hierarchical Attention for 3D Object Detection**|Manli Shu et.al.|[2301.02650v1](http://arxiv.org/abs/2301.02650v1)|null|\n", "2212.13736": "|**2022-12-28**|**Hermitian Topologies originating from non-Hermitian braidings**|W. B. Rui et.al.|[2212.13736v1](http://arxiv.org/abs/2212.13736v1)|null|\n", "2212.13276": "|**2022-12-26**|**Generalization of non-Cartan Symmetries to arbitrary dimensions**|J. C. Ndogmo et.al.|[2212.13276v1](http://arxiv.org/abs/2212.13276v1)|null|\n", "2212.13244": "|**2022-12-26**|**Equivalence classes and Linearization of the Riccati and Abel chain**|J. C. Ndogmo et.al.|[2212.13244v1](http://arxiv.org/abs/2212.13244v1)|null|\n", "2211.12510": "|**2022-11-22**|**Reconstructing the Image Scanning Microscopy Dataset: an Inverse Problem**|Alessandro Zunino et.al.|[2211.12510v1](http://arxiv.org/abs/2211.12510v1)|null|\n", "2211.02079": "|**2022-11-03**|**On Darboux non-integrability of the Hietarinta equation**|S. Ya. Startsev et.al.|[2211.02079v1](http://arxiv.org/abs/2211.02079v1)|null|\n", "2210.15933": "|**2022-10-28**|**PSFormer: Point Transformer for 3D Salient Object Detection**|Baian Chen et.al.|[2210.15933v1](http://arxiv.org/abs/2210.15933v1)|null|\n", "2210.06668": "|**2022-11-05**|**Aspects of the Equivalence Between the $f^\u03bc$ and $c^{\u03bd\u03bc}$ Terms in Lorentz-Violating Quantum Field Theory**|Sapan Karki et.al.|[2210.06668v2](http://arxiv.org/abs/2210.06668v2)|null|\n", "2210.05666": "|**2022-10-12**|**Point Transformer V2: Grouped Vector Attention and Partition-based Pooling**|Xiaoyang Wu et.al.|[2210.05666v2](http://arxiv.org/abs/2210.05666v2)|**[link](https://github.com/gofinge/pointtransformerv2)**|\n", "2209.11255": "|**2022-09-21**|**3DPCT: 3D Point Cloud Transformer with Dual Self-attention**|Dening Lu et.al.|[2209.11255v1](http://arxiv.org/abs/2209.11255v1)|null|\n", "2208.10395": "|**2022-08-22**|**Symmetry Classification of Scalar $n$th Order Ordinary Differential Equations**|Said Waqas Shah et.al.|[2208.10395v1](http://arxiv.org/abs/2208.10395v1)|null|\n", "2208.00281": "|**2022-12-20**|**Point Primitive Transformer for Long-Term 4D Point Cloud Video Understanding**|Hao Wen et.al.|[2208.00281v2](http://arxiv.org/abs/2208.00281v2)|**[link](https://github.com/hoi4d/PPTr)**|\n", "2207.13226": "|**2022-08-15**|**Boosting Point-BERT by Multi-choice Tokens**|Kexue Fu et.al.|[2207.13226v2](http://arxiv.org/abs/2207.13226v2)|**[link](https://github.com/fukexue/mcp-bert)**|\n", "2207.11995": "|**2022-07-26**|**3D Siamese Transformer Network for Single Object Tracking on Point Clouds**|Le Hui et.al.|[2207.11995v2](http://arxiv.org/abs/2207.11995v2)|**[link](https://github.com/fpthink/stnet)**|\n", "2207.10994": "|**2022-07-22**|**Learning Generalized Non-Rigid Multimodal Biomedical Image Registration from Generic Point Set Data**|Zachary MC Baum et.al.|[2207.10994v1](http://arxiv.org/abs/2207.10994v1)|null|\n", "2207.08575": "|**2022-07-18**|**Anisotropic spacetimes in $f(T,B)$ theory IV: Noether symmetry analysis**|Andronikos Paliathanasis et.al.|[2207.08575v1](http://arxiv.org/abs/2207.08575v1)|null|\n", "2206.15191": "|**2022-06-30**|**Lewis-Riesenfeld invariants for PT-symmetrically coupled oscillators from two dimensional point transformations and Lie algebraic expansions**|Andreas Fring et.al.|[2206.15191v1](http://arxiv.org/abs/2206.15191v1)|null|\n", "2206.04670": "|**2022-10-12**|**PointNeXt: Revisiting PointNet++ with Improved Training and Scaling Strategies**|Guocheng Qian et.al.|[2206.04670v2](http://arxiv.org/abs/2206.04670v2)|**[link](https://github.com/guochengqian/pointnext)**|\n", "2206.04511": "|**2022-08-29**|**Efficient Human Pose Estimation via 3D Event Point Cloud**|Jiaan Chen et.al.|[2206.04511v2](http://arxiv.org/abs/2206.04511v2)|**[link](https://github.com/masterhow/eventpointpose)**|\n", "2205.08886": "|**2022-05-18**|**GeoPointGAN: Synthetic Spatial Data with Local Label Differential Privacy**|Teddy Cunningham et.al.|[2205.08886v1](http://arxiv.org/abs/2205.08886v1)|**[link](https://github.com/konstantinklemmer/geopointgan)**|\n", "2204.03957": "|**2022-04-08**|**Points to Patches: Enabling the Use of Self-Attention for 3D Shape Recognition**|Axel Berg et.al.|[2204.03957v1](http://arxiv.org/abs/2204.03957v1)|**[link](https://github.com/axeber01/point-tnt)**|\n", "2203.12758": "|**2022-03-23**|**Mokey: Enabling Narrow Fixed-Point Inference for Out-of-the-Box Floating-Point Transformer Models**|Ali Hadi Zadeh et.al.|[2203.12758v1](http://arxiv.org/abs/2203.12758v1)|null|\n", "2203.04007": "|**2022-08-31**|**DuMLP-Pin: A Dual-MLP-dot-product Permutation-invariant Network for Set Feature Extraction**|Jiajun Fei et.al.|[2203.04007v2](http://arxiv.org/abs/2203.04007v2)|**[link](https://github.com/jaronthu/dumlp-pin)**|\n", "2203.00972": "|**2022-04-07**|**Improving Point Cloud Based Place Recognition with Ranking-based Loss and Large Batch Training**|Jacek Komorowski et.al.|[2203.00972v2](http://arxiv.org/abs/2203.00972v2)|**[link](https://github.com/jac99/minkloc3dv2)**|\n", "2201.05140": "|**2022-01-13**|**An introduction to PT-symmetric quantum mechanics -- time-dependent systems**|Andreas Fring et.al.|[2201.05140v1](http://arxiv.org/abs/2201.05140v1)|null|\n", "2112.13725": "|**2021-12-27**|**Near-Optimal Bounds for Generalized Orthogonal Procrustes Problem via Generalized Power Method**|Shuyang Ling et.al.|[2112.13725v1](http://arxiv.org/abs/2112.13725v1)|null|\n", "2112.11959": "|**2021-12-22**|**Dynamics of a symmetrically decoupled three-dimensional point transformation**|Hacene Gharout et.al.|[2112.11959v1](http://arxiv.org/abs/2112.11959v1)|null|\n", "2112.05635": "|**2021-12-10**|**Geometry of inhomogeneous Poisson brackets, multicomponent Harry Dym hierarchies and multicomponent Hunter-Saxton equations**|Andrey Yu. Konyaev et.al.|[2112.05635v1](http://arxiv.org/abs/2112.05635v1)|null|\n", "2112.04863": "|**2021-12-17**|**3D Medical Point Transformer: Introducing Convolution to Attention Networks for Medical Point Cloud Analysis**|Jianhui Yu et.al.|[2112.04863v2](http://arxiv.org/abs/2112.04863v2)|**[link](https://github.com/crane-papercode/3dmedpt)**|\n", "2112.04702": "|**2022-04-04**|**Fast Point Transformer**|Chunghyun Park et.al.|[2112.04702v2](http://arxiv.org/abs/2112.04702v2)|**[link](https://github.com/POSTECH-CVLab/FastPointTransformer)**|\n", "2111.14819": "|**2022-06-06**|**Point-BERT: Pre-training 3D Point Cloud Transformers with Masked Point Modeling**|Xumin Yu et.al.|[2111.14819v2](http://arxiv.org/abs/2111.14819v2)|**[link](https://github.com/lulutang0608/Point-BERT)**|\n", "2111.14451": "|**2022-03-31**|**HDR-NeRF: High Dynamic Range Neural Radiance Fields**|Xin Huang et.al.|[2111.14451v3](http://arxiv.org/abs/2111.14451v3)|null|\n", "2111.13702": "|**2022-12-12**|**The Information Content of Projected Galaxy Fields**|Lucas Porth et.al.|[2111.13702v2](http://arxiv.org/abs/2111.13702v2)|null|\n", "2111.10866": "|**2021-11-21**|**CpT: Convolutional Point Transformer for 3D Point Cloud Processing**|Chaitanya Kaul et.al.|[2111.10866v1](http://arxiv.org/abs/2111.10866v1)|null|\n", "2111.08973": "|**2021-11-19**|**Generating Unrestricted 3D Adversarial Point Clouds**|Xuelong Dai et.al.|[2111.08973v2](http://arxiv.org/abs/2111.08973v2)|**[link](https://github.com/EricDai0/AdvGCGAN)**|\n", "2111.00207": "|**2022-03-24**|**PatchFormer: An Efficient Point Transformer with Patch Attention**|Zhang Cheng et.al.|[2111.00207v3](http://arxiv.org/abs/2111.00207v3)|null|\n", "2110.05609": "|**2021-11-03**|**Comparison between time-independent and time-dependent quantum systems in the context of energy, Heisenberg uncertainty, average energy, force, average force and thermodynamic quantities**|Debraj Nath et.al.|[2110.05609v2](http://arxiv.org/abs/2110.05609v2)|null|\n", "2110.09230": "|**2021-10-07**|**A study on the Friedmann like Universe with Torsion using Noether Symmetry**|Ramkumar Radhakrishnan et.al.|[2110.09230v1](http://arxiv.org/abs/2110.09230v1)|null|\n", "2109.05023": "|**2021-09-20**|**Real-time multimodal image registration with partial intraoperative point-set data**|Zachary M C Baum et.al.|[2109.05023v2](http://arxiv.org/abs/2109.05023v2)|null|\n", "2109.02107": "|**2021-09-05**|**Normal Forms of second order Ordinary Differential Equations $y_{xx}=J(x,y,y_{x})$ under Fibre-Preserving Maps**|Wei Guo Foo et.al.|[2109.02107v1](http://arxiv.org/abs/2109.02107v1)|null|\n", "2108.08958": "|**2021-08-20**|**Exact solutions for time-dependent non-Hermitian oscillators: classical and quantum pictures**|Kevin Zelaya et.al.|[2108.08958v1](http://arxiv.org/abs/2108.08958v1)|null|\n", "2108.08891": "|**2021-08-19**|**Neural TMDlayer: Modeling Instantaneous flow of features via SDE Generators**|Zihang Meng et.al.|[2108.08891v1](http://arxiv.org/abs/2108.08891v1)|**[link](https://github.com/zihangm/neural-tmd-layer)**|\n", "2108.06076": "|**2022-05-25**|**PVT: Point-Voxel Transformer for Point Cloud Learning**|Cheng Zhang et.al.|[2108.06076v4](http://arxiv.org/abs/2108.06076v4)|**[link](https://github.com/HaochengWan/PVT)**|\n", "2108.00620": "|**2021-10-14**|**Investigating Attention Mechanism in 3D Point Cloud Object Detection**|Shi Qiu et.al.|[2108.00620v2](http://arxiv.org/abs/2108.00620v2)|**[link](https://github.com/ShiQiu0419/attentions_in_3D_detection)**|\n", "2107.14144": "|**2021-07-29**|**Reduction of balance laws in (3+1)--dimensions to autonomous conservation laws by means of equivalence transformations**|Matteo Gorgone et.al.|[2107.14144v1](http://arxiv.org/abs/2107.14144v1)|null|\n", "2303.01166": "|**2023-03-02**|**BPT: Binary Point Cloud Transformer for Place Recognition**|Zhixing Hou et.al.|[2303.01166v1](http://arxiv.org/abs/2303.01166v1)|null|\n", "2303.04458": "|**2023-03-08**|**Full Point Encoding for Local Feature Aggregation in 3D Point Clouds**|Yong He et.al.|[2303.04458v1](http://arxiv.org/abs/2303.04458v1)|null|\n", "2303.07766": "|**2023-03-14**|**Classical and quantum cosmology in $f(T)$-gravity theory: A Noether symmetry approach**|Roshni Bhaumik et.al.|[2303.07766v1](http://arxiv.org/abs/2303.07766v1)|null|\n", "2303.08274": "|**2023-03-14**|**GeoSpark: Sparking up Point Cloud Segmentation with Geometry Clue**|Zhening Huang et.al.|[2303.08274v1](http://arxiv.org/abs/2303.08274v1)|null|\n", "2303.15320": "|**2023-03-22**|**Noether's theorem and Lie symmetries for time-dependent Hamilton-Lagrange systems**|J\u00fcrgen Struckmeier et.al.|[2303.15320v1](http://arxiv.org/abs/2303.15320v1)|null|\n", "2303.17815": "|**2023-03-31**|**APPT : Asymmetric Parallel Point Transformer for 3D Point Cloud Understanding**|Hengjia Li et.al.|[2303.17815v1](http://arxiv.org/abs/2303.17815v1)|null|\n", "2304.02013": "|**2023-09-01**|**NPC: Neural Point Characters from Video**|Shih-Yang Su et.al.|[2304.02013v2](http://arxiv.org/abs/2304.02013v2)|null|\n", "2304.08279": "|**2023-05-27**|**MoDA: Modeling Deformable 3D Objects from Casual Videos**|Chaoyue Song et.al.|[2304.08279v2](http://arxiv.org/abs/2304.08279v2)|**[link](https://github.com/chaoyuesong/moda)**|\n", "2304.08681": "|**2023-09-07**|**The integer point transform as a complete invariant**|Sinai Robins et.al.|[2304.08681v4](http://arxiv.org/abs/2304.08681v4)|null|\n", "2304.14132": "|**2023-04-28**|**Human Semantic Segmentation using Millimeter-Wave Radar Sparse Point Clouds**|Pengfei Song et.al.|[2304.14132v2](http://arxiv.org/abs/2304.14132v2)|null|\n", "2305.00773": "|**2023-05-01**|**Point Cloud Semantic Segmentation**|Ivan Martinovi\u0107 et.al.|[2305.00773v1](http://arxiv.org/abs/2305.00773v1)|null|\n", "2305.03045": "|**2023-05-08**|**OctFormer: Octree-based Transformers for 3D Point Clouds**|Peng-Shuai Wang et.al.|[2305.03045v2](http://arxiv.org/abs/2305.03045v2)|**[link](https://github.com/octree-nn/octformer)**|\n", "2305.02533": "|**2023-05-04**|**Point Transformer For Coronary Artery Labeling**|Xu Wang et.al.|[2305.02533v1](http://arxiv.org/abs/2305.02533v1)|null|\n", "2306.10759": "|**2023-10-31**|**Simplifying and Empowering Transformers for Large-Graph Representations**|Qitian Wu et.al.|[2306.10759v3](http://arxiv.org/abs/2306.10759v3)|**[link](https://github.com/qitianwu/sgformer)**|\n", "2306.12361": "|**2023-06-21**|**Sigma-point Kalman Filter with Nonlinear Unknown Input Estimation via Optimization and Data-driven Approach for Dynamic Systems**|Junn Yong Loo et.al.|[2306.12361v1](http://arxiv.org/abs/2306.12361v1)|null|\n", "2306.10798": "|**2023-06-23**|**ExpPoint-MAE: Better interpretability and performance for self-supervised point cloud transformers**|Ioannis Romanelis et.al.|[2306.10798v2](http://arxiv.org/abs/2306.10798v2)|**[link](https://github.com/vvrpanda/exppoint-mae)**|\n", "2307.04723": "|**2023-07-18**|**Quark/Gluon Discrimination and Top Tagging with Dual Attention Transformer**|Minxuan He et.al.|[2307.04723v2](http://arxiv.org/abs/2307.04723v2)|null|\n", "2307.11973": "|**2023-07-22**|**Two-stream Multi-level Dynamic Point Transformer for Two-person Interaction Recognition**|Yao Liu et.al.|[2307.11973v1](http://arxiv.org/abs/2307.11973v1)|null|\n", "2308.04637": "|**2023-08-09**|**Sparse Binary Transformers for Multivariate Time Series Modeling**|Matt Gorbett et.al.|[2308.04637v1](http://arxiv.org/abs/2308.04637v1)|null|\n", "2308.09403": "|**2023-08-18**|**Target Clustering Based Multi-Bernoulli Filter for Superpositional Sensors**|Wang Sen et.al.|[2308.09403v1](http://arxiv.org/abs/2308.09403v1)|null|\n", "2309.00339": "|**2023-09-01**|**Robust Point Cloud Processing through Positional Embedding**|Jianqiao Zheng et.al.|[2309.00339v1](http://arxiv.org/abs/2309.00339v1)|null|\n", "2309.04105": "|**2023-09-08**|**Weakly Supervised Point Clouds Transformer for 3D Object Detection**|Zuojin Tang et.al.|[2309.04105v1](http://arxiv.org/abs/2309.04105v1)|null|\n", "2310.01545": "|**2023-10-02**|**RF-ULM: Deep Learning for Radio-Frequency Ultrasound Localization Microscopy**|Christopher Hahne et.al.|[2310.01545v1](http://arxiv.org/abs/2310.01545v1)|**[link](https://github.com/hahnec/rf-ulm)**|\n", "2310.05780": "|**2023-10-09**|**Lie symmetries for the cosmological field equations in brane-world gravity with bulk scalar field**|Andronikos Paliathanasis et.al.|[2310.05780v1](http://arxiv.org/abs/2310.05780v1)|null|\n", "2310.16861": "|**2023-10-25**|**General Point Model with Autoencoding and Autoregressive**|Zhe Li et.al.|[2310.16861v1](http://arxiv.org/abs/2310.16861v1)|null|\n", "2310.19772": "|**2023-10-22**|**Exact FLRW cosmological solutions via invariants of the symmetry groups**|E. Ahmadi Azar et.al.|[2310.19772v1](http://arxiv.org/abs/2310.19772v1)|null|\n", "2311.04081": "|**2023-11-07**|**Learning Super-Resolution Ultrasound Localization Microscopy from Radio-Frequency Data**|Christopher Hahne et.al.|[2311.04081v1](http://arxiv.org/abs/2311.04081v1)|null|\n"}, "NeRF": {"2302.12237": "|**2023-02-24**|**Learning Neural Volumetric Representations of Dynamic Humans in Minutes**|Chen Geng et.al.|[2302.12237v2](http://arxiv.org/abs/2302.12237v2)|**[link](https://github.com/zju3dv/instant-nvr)**|\n", "2302.12231": "|**2023-02-23**|**DiffusioNeRF: Regularizing Neural Radiance Fields with Denoising Diffusion Models**|Jamie Wynn et.al.|[2302.12231v1](http://arxiv.org/abs/2302.12231v1)|**[link](https://github.com/nianticlabs/diffusionerf)**|\n", "2302.10109": "|**2023-02-20**|**NerfDiff: Single-image View Synthesis with NeRF-guided Distillation from 3D-aware Diffusion**|Jiatao Gu et.al.|[2302.10109v1](http://arxiv.org/abs/2302.10109v1)|null|\n", "2302.09486": "|**2023-02-19**|**LC-NeRF: Local Controllable Face Generation in Neural Randiance Field**|Wenyang Zhou et.al.|[2302.09486v1](http://arxiv.org/abs/2302.09486v1)|null|\n", "2302.08788": "|**2023-02-17**|**MixNeRF: Modeling a Ray with Mixture Density for Novel View Synthesis from Sparse Inputs**|Seunghyeon Seo et.al.|[2302.08788v1](http://arxiv.org/abs/2302.08788v1)|**[link](https://github.com/shawn615/MixNeRF)**|\n", "2302.06833": "|**2023-02-14**|**VQ3D: Learning a 3D-Aware Generative Model on ImageNet**|Kyle Sargent et.al.|[2302.06833v1](http://arxiv.org/abs/2302.06833v1)|null|\n", "2302.06608": "|**2023-02-13**|**3D-aware Blending with Generative NeRFs**|Hyunsu Kim et.al.|[2302.06608v1](http://arxiv.org/abs/2302.06608v1)|**[link](https://github.com/naver-ai/BlendNeRF)**|\n", "2302.05573": "|**2023-02-11**|**3D Colored Shape Reconstruction from a Single RGB Image through Diffusion**|Bo Li et.al.|[2302.05573v1](http://arxiv.org/abs/2302.05573v1)|null|\n", "2302.04264": "|**2023-02-08**|**Nerfstudio: A Modular Framework for Neural Radiance Field Development**|Matthew Tancik et.al.|[2302.04264v1](http://arxiv.org/abs/2302.04264v1)|null|\n", "2302.02088": "|**2023-02-07**|**AV-NeRF: Learning Neural Fields for Real-World Audio-Visual Scene Synthesis**|Susan Liang et.al.|[2302.02088v2](http://arxiv.org/abs/2302.02088v2)|null|\n", "2302.01579": "|**2023-02-03**|**Semantic 3D-aware Portrait Synthesis and Manipulation Based on Compositional Neural Radiance Field**|Tianxiang Ma et.al.|[2302.01579v1](http://arxiv.org/abs/2302.01579v1)|**[link](https://github.com/tianxiangma/cnerf)**|\n", "2302.01571": "|**2023-02-03**|**Robust Camera Pose Refinement for Multi-Resolution Hash Encoding**|Hwan Heo et.al.|[2302.01571v1](http://arxiv.org/abs/2302.01571v1)|null|\n", "2302.01532": "|**2023-02-03**|**INV: Towards Streaming Incremental Neural Videos**|Shengze Wang et.al.|[2302.01532v1](http://arxiv.org/abs/2302.01532v1)|null|\n", "2302.01226": "|**2023-02-02**|**Factor Fields: A Unified Framework for Neural Fields and Beyond**|Anpei Chen et.al.|[2302.01226v1](http://arxiv.org/abs/2302.01226v1)|null|\n", "2302.00833": "|**2023-02-02**|**RobustNeRF: Ignoring Distractors with Robust Losses**|Sara Sabour et.al.|[2302.00833v1](http://arxiv.org/abs/2302.00833v1)|null|\n", "2301.13430": "|**2023-01-31**|**GeneFace: Generalized and High-Fidelity Audio-Driven 3D Talking Face Synthesis**|Zhenhui Ye et.al.|[2301.13430v1](http://arxiv.org/abs/2301.13430v1)|null|\n", "2301.12780": "|**2023-01-30**|**Equivariant Architectures for Learning in Deep Weight Spaces**|Aviv Navon et.al.|[2301.12780v1](http://arxiv.org/abs/2301.12780v1)|**[link](https://github.com/AvivNavon/DWSNets)**|\n", "2301.11631": "|**2023-01-27**|**HyperNeRFGAN: Hypernetwork approach to 3D NeRF GAN**|Adam Kania et.al.|[2301.11631v1](http://arxiv.org/abs/2301.11631v1)|**[link](https://github.com/gmum/hypernerfgan)**|\n", "2301.11522": "|**2023-01-27**|**A Comparison of Tiny-nerf versus Spatial Representations for 3d Reconstruction**|Saulo Abraham Gante et.al.|[2301.11522v1](http://arxiv.org/abs/2301.11522v1)|null|\n", "2301.11520": "|**2023-01-27**|**SNeRL: Semantic-aware Neural Radiance Fields for Reinforcement Learning**|Dongseok Shim et.al.|[2301.11520v1](http://arxiv.org/abs/2301.11520v1)|null|\n", "2301.11280": "|**2023-01-26**|**Text-To-4D Dynamic Scene Generation**|Uriel Singer et.al.|[2301.11280v1](http://arxiv.org/abs/2301.11280v1)|null|\n", "2301.10941": "|**2023-01-26**|**GeCoNeRF: Few-shot Neural Radiance Fields via Geometric Consistency**|Minseop Kwak et.al.|[2301.10941v1](http://arxiv.org/abs/2301.10941v1)|**[link](https://github.com/KU-CVLAB/GeCoNeRF)**|\n", "2301.09632": "|**2023-01-23**|**HexPlane: A Fast Representation for Dynamic Scenes**|Ang Cao et.al.|[2301.09632v1](http://arxiv.org/abs/2301.09632v1)|**[link](https://github.com/Caoang327/HexPlane)**|\n", "2301.09060": "|**2023-02-02**|**3D Reconstruction of Non-cooperative Resident Space Objects using Instant NGP-accelerated NeRF and D-NeRF**|Trupti Mahendrakar et.al.|[2301.09060v2](http://arxiv.org/abs/2301.09060v2)|null|\n", "2301.07958": "|**2023-02-05**|**RecolorNeRF: Layer Decomposed Radiance Fields for Efficient Color Editing of 3D Scenes**|Bingchen Gong et.al.|[2301.07958v2](http://arxiv.org/abs/2301.07958v2)|null|\n", "2301.08556": "|**2023-01-18**|**NeRF in the Palm of Your Hand: Corrective Augmentation for Robotics via Novel-View Synthesis**|Allan Zhou et.al.|[2301.08556v1](http://arxiv.org/abs/2301.08556v1)|null|\n", "2301.07668": "|**2023-01-18**|**Behind the Scenes: Density Fields for Single View Reconstruction**|Felix Wimbauer et.al.|[2301.07668v1](http://arxiv.org/abs/2301.07668v1)|**[link](https://github.com/Brummi/BehindTheScenes)**|\n", "2301.06782": "|**2023-01-17**|**A Large-Scale Outdoor Multi-modal Dataset and Benchmark for Novel View Synthesis and Implicit Scene Reconstruction**|Chongshan Lu et.al.|[2301.06782v1](http://arxiv.org/abs/2301.06782v1)|null|\n", "2301.05747": "|**2023-01-13**|**Laser: Latent Set Representations for 3D Generative Modeling**|Pol Moreno et.al.|[2301.05747v1](http://arxiv.org/abs/2301.05747v1)|null|\n", "2301.04075": "|**2023-01-10**|**Benchmarking Robustness in Neural Radiance Fields**|Chen Wang et.al.|[2301.04075v1](http://arxiv.org/abs/2301.04075v1)|null|\n", "2301.03102": "|**2023-01-08**|**Towards Open World NeRF-Based SLAM**|Daniil Lisus et.al.|[2301.03102v1](http://arxiv.org/abs/2301.03102v1)|null|\n", "2301.02975": "|**2023-01-10**|**Traditional Readability Formulas Compared for English**|Bruce W. Lee et.al.|[2301.02975v2](http://arxiv.org/abs/2301.02975v2)|null|\n", "2301.00950": "|**2023-01-09**|**Class-Continuous Conditional Generative Neural Radiance Field**|Jiwook Kim et.al.|[2301.00950v2](http://arxiv.org/abs/2301.00950v2)|**[link](https://github.com/tom919654/C3G-NeRF)**|\n", "2301.00411": "|**2023-01-11**|**Detachable Novel Views Synthesis of Dynamic Scenes Using Distribution-Driven Neural Radiance Fields**|Boyu Zhang et.al.|[2301.00411v2](http://arxiv.org/abs/2301.00411v2)|**[link](https://github.com/luciferbobo/d4nerf)**|\n", "2212.13056": "|**2022-12-26**|**MonoNeRF: Learning a Generalizable Dynamic Radiance Field from Monocular Videos**|Fengrui Tian et.al.|[2212.13056v1](http://arxiv.org/abs/2212.13056v1)|**[link](https://github.com/tianfr/mononerf)**|\n", "2212.12871": "|**2022-12-25**|**PaletteNeRF: Palette-based Color Editing for NeRFs**|Qiling Wu et.al.|[2212.12871v1](http://arxiv.org/abs/2212.12871v1)|null|\n", "2212.11966": "|**2022-12-22**|**Removing Objects From Neural Radiance Fields**|Silvan Weder et.al.|[2212.11966v1](http://arxiv.org/abs/2212.11966v1)|null|\n", "2212.10950": "|**2022-12-21**|**Incremental Learning for Neural Radiance Field with Uncertainty-Filtered Knowledge Distillation**|Mengqi Guo et.al.|[2212.10950v1](http://arxiv.org/abs/2212.10950v1)|null|\n", "2212.10699": "|**2023-01-24**|**PaletteNeRF: Palette-based Appearance Editing of Neural Radiance Fields**|Zhengfei Kuang et.al.|[2212.10699v2](http://arxiv.org/abs/2212.10699v2)|null|\n", "2212.09735": "|**2022-12-20**|**Correspondence Distillation from NeRF-based GAN**|Yushi Lan et.al.|[2212.09735v2](http://arxiv.org/abs/2212.09735v2)|null|\n", "2212.09330": "|**2022-12-19**|**StyleTRF: Stylizing Tensorial Radiance Fields**|Rahul Goel et.al.|[2212.09330v1](http://arxiv.org/abs/2212.09330v1)|null|\n", "2212.09100": "|**2022-12-18**|**SPARF: Large-Scale Learning of 3D Sparse Radiance Fields from Few Input Images**|Abdullah Hamdi et.al.|[2212.09100v1](http://arxiv.org/abs/2212.09100v1)|**[link](https://github.com/ajhamdi/sparf_pytorch)**|\n", "2212.09069": "|**2022-12-18**|**Masked Wavelet Representation for Compact Neural Radiance Fields**|Daniel Rho et.al.|[2212.09069v1](http://arxiv.org/abs/2212.09069v1)|**[link](https://github.com/daniel03c1/masked_wavelet_nerf)**|\n", "2212.08328": "|**2022-12-31**|**MEIL-NeRF: Memory-Efficient Incremental Learning of Neural Radiance Fields**|Jaeyoung Chung et.al.|[2212.08328v2](http://arxiv.org/abs/2212.08328v2)|null|\n", "2212.08070": "|**2022-12-15**|**NeRF-Art: Text-Driven Neural Radiance Fields Stylization**|Can Wang et.al.|[2212.08070v1](http://arxiv.org/abs/2212.08070v1)|**[link](https://github.com/cassiePython/NeRF-Art)**|\n", "2212.08057": "|**2022-12-15**|**Real-Time Neural Light Field on Mobile Devices**|Junli Cao et.al.|[2212.08057v1](http://arxiv.org/abs/2212.08057v1)|**[link](https://github.com/snap-research/mobiler2l)**|\n", "2212.08476": "|**2022-12-15**|**SteerNeRF: Accelerating NeRF Rendering via Smooth Viewpoint Trajectory**|Sicheng Li et.al.|[2212.08476v1](http://arxiv.org/abs/2212.08476v1)|null|\n", "2212.07388": "|**2022-12-14**|**NoPe-NeRF: Optimising Neural Radiance Field with No Pose Prior**|Wenjing Bian et.al.|[2212.07388v1](http://arxiv.org/abs/2212.07388v1)|**[link](https://github.com/ActiveVisionLab/nope-nerf)**|\n", "2212.04701": "|**2022-12-09**|**4K-NeRF: High Fidelity Neural Radiance Fields at Ultra High Resolutions**|Zhongshu Wang et.al.|[2212.04701v1](http://arxiv.org/abs/2212.04701v1)|**[link](https://github.com/frozoul/4k-nerf)**|\n", "2212.04823": "|**2022-12-08**|**GazeNeRF: 3D-Aware Gaze Redirection with Neural Radiance Fields**|Alessandro Ruzzi et.al.|[2212.04823v1](http://arxiv.org/abs/2212.04823v1)|**[link](https://github.com/alessandroruzzi/gazenerf)**|\n", "2302.13543": "|**2023-02-27**|**BaLi-RF: Bandlimited Radiance Fields for Dynamic Scene Modeling**|Sameera Ramasinghe et.al.|[2302.13543v1](http://arxiv.org/abs/2302.13543v1)|null|\n", "2302.13397": "|**2023-02-26**|**Efficient physics-informed neural networks using hash encoding**|Xinquan Huang et.al.|[2302.13397v1](http://arxiv.org/abs/2302.13397v1)|null|\n", "2302.12931": "|**2023-02-24**|**CATNIPS: Collision Avoidance Through Neural Implicit Probabilistic Scenes**|Timothy Chen et.al.|[2302.12931v1](http://arxiv.org/abs/2302.12931v1)|null|\n", "2302.14683": "|**2023-03-09**|**IntrinsicNGP: Intrinsic Coordinate based Hash Encoding for Human NeRF**|Bo Peng et.al.|[2302.14683v2](http://arxiv.org/abs/2302.14683v2)|null|\n", "2303.00749": "|**2023-03-01**|**S-NeRF: Neural Radiance Fields for Street Views**|Ziyang Xie et.al.|[2303.00749v1](http://arxiv.org/abs/2303.00749v1)|null|\n", "2303.02091": "|**2023-03-03**|**Delicate Textured Mesh Recovery from NeRF via Adaptive Surface Refinement**|Jiaxiang Tang et.al.|[2303.02091v1](http://arxiv.org/abs/2303.02091v1)|**[link](https://github.com/ashawkey/nerf2mesh)**|\n", "2303.01736": "|**2023-03-03**|**Multi-Plane Neural Radiance Fields for Novel View Synthesis**|Youssef Abdelkareem et.al.|[2303.01736v1](http://arxiv.org/abs/2303.01736v1)|null|\n", "2303.03361": "|**2023-03-10**|**Nerflets: Local Radiance Fields for Efficient Structure-Aware 3D Scene Representation from 2D Supervision**|Xiaoshuai Zhang et.al.|[2303.03361v2](http://arxiv.org/abs/2303.03361v2)|null|\n", "2303.03003": "|**2023-03-07**|**Efficient Large-scale Scene Representation with a Hybrid of High-resolution Grid and Plane Features**|Yuqi Zhang et.al.|[2303.03003v2](http://arxiv.org/abs/2303.03003v2)|**[link](https://github.com/zyqz97/gp-nerf)**|\n", "2303.04086": "|**2023-03-07**|**NEPHELE: A Neural Platform for Highly Realistic Cloud Radiance Rendering**|Haimin Luo et.al.|[2303.04086v1](http://arxiv.org/abs/2303.04086v1)|null|\n", "2303.03808": "|**2023-03-07**|**Multiscale Tensor Decomposition and Rendering Equation Encoding for View Synthesis**|Kang Han et.al.|[2303.03808v1](http://arxiv.org/abs/2303.03808v1)|**[link](https://github.com/imkanghan/nrff)**|\n", "2303.03966": "|**2023-03-05**|**Semantic-aware Occlusion Filtering Neural Radiance Fields in the Wild**|Jaewon Lee et.al.|[2303.03966v1](http://arxiv.org/abs/2303.03966v1)|null|\n", "2303.04508": "|**2023-03-08**|**FastSurf: Fast Neural RGB-D Surface Reconstruction using Per-Frame Intrinsic Refinement and TSDF Fusion Prior Learning**|Seunghwan Lee et.al.|[2303.04508v1](http://arxiv.org/abs/2303.04508v1)|**[link](https://github.com/ROKIT-Healthcare/FastSurf)**|\n", "2303.04322": "|**2023-03-08**|**DroNeRF: Real-time Multi-agent Drone Pose Optimization for Computing Neural Radiance Fields**|Dipam Patel et.al.|[2303.04322v1](http://arxiv.org/abs/2303.04322v1)|null|\n", "2303.05512": "|**2023-03-09**|**PAC-NeRF: Physics Augmented Continuum Neural Radiance Fields for Geometry-Agnostic System Identification**|Xuan Li et.al.|[2303.05512v1](http://arxiv.org/abs/2303.05512v1)|null|\n", "2303.05835": "|**2023-03-10**|**You Only Train Once: Multi-Identity Free-Viewpoint Neural Human Rendering from Monocular Videos**|Jaehyeok Kim et.al.|[2303.05835v1](http://arxiv.org/abs/2303.05835v1)|null|\n", "2303.05807": "|**2023-03-10**|**Aleth-NeRF: Low-light Condition View Synthesis with Concealing Fields**|Ziteng Cui et.al.|[2303.05807v1](http://arxiv.org/abs/2303.05807v1)|null|\n", "2303.05775": "|**2023-03-10**|**Self-NeRF: A Self-Training Pipeline for Few-Shot Neural Radiance Fields**|Jiayang Bai et.al.|[2303.05775v1](http://arxiv.org/abs/2303.05775v1)|null|\n", "2303.05735": "|**2023-03-14**|**Hardware Acceleration of Neural Graphics**|Muhammad Husnain Mubarik et.al.|[2303.05735v2](http://arxiv.org/abs/2303.05735v2)|null|\n", "2303.05703": "|**2023-03-10**|**MovingParts: Motion-based 3D Part Discovery in Dynamic Radiance Field**|Kaizhi Yang et.al.|[2303.05703v1](http://arxiv.org/abs/2303.05703v1)|null|\n", "2303.06919": "|**2023-03-13**|**NeRFLiX: High-Quality Neural View Synthesis by Learning a Degradation-Driven Inter-viewpoint MiXer**|Kun Zhou et.al.|[2303.06919v1](http://arxiv.org/abs/2303.06919v1)|**[link](https://github.com/redrock303/NeRFLiX_CPVR2023)**|\n", "2303.06335": "|**2023-03-11**|**Just Flip: Flipped Observation Generation and Optimization for Neural Radiance Fields to Cover Unobserved View**|Minjae Lee et.al.|[2303.06335v1](http://arxiv.org/abs/2303.06335v1)|**[link](https://github.com/minjae-lulu/just-flip)**|\n", "2303.06226": "|**2023-03-10**|**NeRFlame: FLAME-based conditioning of NeRF for 3D face rendering**|Wojciech Zaj\u0105c et.al.|[2303.06226v1](http://arxiv.org/abs/2303.06226v1)|**[link](https://github.com/wojtekz4/nerflame)**|\n", "2303.08096": "|**2023-03-14**|**MELON: NeRF with Unposed Images Using Equivalence Class Estimation**|Axel Levy et.al.|[2303.08096v1](http://arxiv.org/abs/2303.08096v1)|null|\n", "2303.07937": "|**2023-03-16**|**Let 2D Diffusion Model Know 3D-Consistency for Robust Text-to-3D Generation**|Junyoung Seo et.al.|[2303.07937v3](http://arxiv.org/abs/2303.07937v3)|**[link](https://github.com/KU-CVLAB/3DFuse)**|\n", "2303.07653": "|**2023-03-16**|**NEF: Neural Edge Fields for 3D Parametric Curve Reconstruction from Multi-view Images**|Yunfan Ye et.al.|[2303.07653v2](http://arxiv.org/abs/2303.07653v2)|**[link](https://github.com/yunfan1202/NEF_code)**|\n", "2303.07596": "|**2023-03-18**|**Frequency-Modulated Point Cloud Rendering with Easy Editing**|Yi Zhang et.al.|[2303.07596v2](http://arxiv.org/abs/2303.07596v2)|**[link](https://github.com/yizhangphd/freqpcr)**|\n", "2303.07418": "|**2023-03-13**|**FreeNeRF: Improving Few-shot Neural Rendering with Free Frequency Regularization**|Jiawei Yang et.al.|[2303.07418v1](http://arxiv.org/abs/2303.07418v1)|**[link](https://github.com/jiawei-yang/freenerf)**|\n", "2303.08808": "|**2023-03-15**|**Mesh Strikes Back: Fast and Efficient Human Reconstruction from RGB videos**|Rohit Jena et.al.|[2303.08808v1](http://arxiv.org/abs/2303.08808v1)|null|\n", "2303.08717": "|**2023-03-15**|**Re-ReND: Real-time Rendering of NeRFs across Devices**|Sara Rojas et.al.|[2303.08717v1](http://arxiv.org/abs/2303.08717v1)|**[link](https://github.com/sararoma95/Re-ReND)**|\n", "2303.08695": "|**2023-03-15**|**RefiNeRF: Modelling dynamic neural radiance fields with inconsistent or missing camera parameters**|Shuja Khalid et.al.|[2303.08695v1](http://arxiv.org/abs/2303.08695v1)|null|\n", "2303.08370": "|**2023-03-15**|**Harnessing Low-Frequency Neural Fields for Few-Shot View Synthesis**|Liangchen Song et.al.|[2303.08370v1](http://arxiv.org/abs/2303.08370v1)|**[link](https://github.com/lsongx/halo)**|\n", "2303.09554": "|**2023-03-21**|**PartNeRF: Generating Part-Aware Editable 3D Shapes without 3D Supervision**|Konstantinos Tertikas et.al.|[2303.09554v3](http://arxiv.org/abs/2303.09554v3)|null|\n", "2303.09553": "|**2023-03-16**|**LERF: Language Embedded Radiance Fields**|Justin Kerr et.al.|[2303.09553v1](http://arxiv.org/abs/2303.09553v1)|null|\n", "2303.09431": "|**2023-03-16**|**NeRFMeshing: Distilling Neural Radiance Fields into Geometrically-Accurate 3D Meshes**|Marie-Julie Rakotosaona et.al.|[2303.09431v1](http://arxiv.org/abs/2303.09431v1)|null|\n", "2303.09412": "|**2023-03-17**|**NeRFtrinsic Four: An End-To-End Trainable NeRF Jointly Optimizing Diverse Intrinsic and Extrinsic Camera Parameters**|Hannah Schieber et.al.|[2303.09412v2](http://arxiv.org/abs/2303.09412v2)|**[link](https://github.com/hannahhaensen/nerftrinsic_four)**|\n", "2303.09153": "|**2023-03-16**|**Reliable Image Dehazing by NeRF**|Zheyan Jin et.al.|[2303.09153v1](http://arxiv.org/abs/2303.09153v1)|null|\n", "2303.10083": "|**2023-03-17**|**$\u03b1$Surf: Implicit Surface Reconstruction for Semi-Transparent and Thin Objects with Decoupled Geometry and Opacity**|Tianhao Wu et.al.|[2303.10083v1](http://arxiv.org/abs/2303.10083v1)|null|\n", "2303.09952": "|**2023-03-17**|**Single-view Neural Radiance Fields with Depth Teacher**|Yurui Chen et.al.|[2303.09952v1](http://arxiv.org/abs/2303.09952v1)|null|\n", "2303.11052": "|**2023-03-20**|**ContraNeRF: Generalizable Neural Radiance Fields for Synthetic-to-real Novel View Synthesis via Contrastive Learning**|Hao Yang et.al.|[2303.11052v1](http://arxiv.org/abs/2303.11052v1)|null|\n", "2303.10735": "|**2023-03-19**|**SKED: Sketch-guided Text-based 3D Editing**|Aryan Mikaeili et.al.|[2303.10735v1](http://arxiv.org/abs/2303.10735v1)|null|\n", "2303.10709": "|**2023-03-19**|**NeRF-LOAM: Neural Implicit Representation for Large-Scale Incremental LiDAR Odometry and Mapping**|Junyuan Deng et.al.|[2303.10709v1](http://arxiv.org/abs/2303.10709v1)|**[link](https://github.com/junyuandeng/nerf-loam)**|\n", "2303.10340": "|**2023-03-18**|**3D Data Augmentation for Driving Scenes on Camera**|Wenwen Tong et.al.|[2303.10340v1](http://arxiv.org/abs/2303.10340v1)|null|\n", "2303.11938": "|**2023-03-21**|**3D-CLFusion: Fast Text-to-3D Rendering with Contrastive Latent Diffusion**|Yu-Jhe Li et.al.|[2303.11938v1](http://arxiv.org/abs/2303.11938v1)|null|\n", "2303.11728": "|**2023-03-22**|**ExtremeNeRF: Few-shot Neural Radiance Fields Under Unconstrained Illumination**|SeokYeong Lee et.al.|[2303.11728v2](http://arxiv.org/abs/2303.11728v2)|null|\n", "2303.11364": "|**2023-03-20**|**DehazeNeRF: Multiple Image Haze Removal and 3D Shape Reconstruction using Neural Radiance Fields**|Wei-Ting Chen et.al.|[2303.11364v1](http://arxiv.org/abs/2303.11364v1)|null|\n", "2303.12791": "|**2023-03-22**|**SHERF: Generalizable Human NeRF from a Single Image**|Shoukang Hu et.al.|[2303.12791v1](http://arxiv.org/abs/2303.12791v1)|**[link](https://github.com/skhu101/sherf)**|\n", "2303.12789": "|**2023-03-22**|**Instruct-NeRF2NeRF: Editing 3D Scenes with Instructions**|Ayaan Haque et.al.|[2303.12789v1](http://arxiv.org/abs/2303.12789v1)|null|\n", "2303.12786": "|**2023-03-22**|**FeatureNeRF: Learning Generalizable NeRFs by Distilling Foundation Models**|Jianglong Ye et.al.|[2303.12786v1](http://arxiv.org/abs/2303.12786v1)|null|\n", "2303.12408": "|**2023-03-24**|**Balanced Spherical Grid for Egocentric View Synthesis**|Changwoon Choi et.al.|[2303.12408v2](http://arxiv.org/abs/2303.12408v2)|**[link](https://github.com/changwoonchoi/EgoNeRF)**|\n", "2303.12234": "|**2023-03-21**|**Pre-NeRF 360: Enriching Unbounded Appearances for Neural Radiance Fields**|Ahmad AlMughrabi et.al.|[2303.12234v1](http://arxiv.org/abs/2303.12234v1)|**[link](https://github.com/amughrabi/pre-nerf)**|\n", "2303.13497": "|**2023-03-23**|**TriPlaneNet: An Encoder for EG3D Inversion**|Ananta R. Bhattarai et.al.|[2303.13497v1](http://arxiv.org/abs/2303.13497v1)|null|\n", "2303.13472": "|**2023-03-23**|**Plotting Behind the Scenes: Towards Learnable Game Engines**|Willi Menapace et.al.|[2303.13472v1](http://arxiv.org/abs/2303.13472v1)|null|\n", "2303.13450": "|**2023-03-23**|**Set-the-Scene: Global-Local Training for Generating Controllable NeRF Scenes**|Dana Cohen-Bar et.al.|[2303.13450v1](http://arxiv.org/abs/2303.13450v1)|**[link](https://github.com/DanaCohen95/Set-the-Scene)**|\n", "2303.13277": "|**2023-03-25**|**SINE: Semantic-driven Image-based NeRF Editing with Prior-guided Editing Field**|Chong Bao et.al.|[2303.13277v2](http://arxiv.org/abs/2303.13277v2)|null|\n", "2303.13232": "|**2023-03-23**|**Transforming Radiance Field with Lipschitz Network for Photorealistic 3D Scene Stylization**|Zicheng Zhang et.al.|[2303.13232v1](http://arxiv.org/abs/2303.13232v1)|null|\n", "2303.13014": "|**2023-03-23**|**Semantic Ray: Learning a Generalizable Semantic Field with Cross-Reprojection Attention**|Fangfu Liu et.al.|[2303.13014v1](http://arxiv.org/abs/2303.13014v1)|**[link](https://github.com/liuff19/Semantic-Ray)**|\n", "2303.12865": "|**2023-03-22**|**NeRF-GAN Distillation for Efficient 3D-Aware Generation with Convolutions**|Mohamad Shahbazi et.al.|[2303.12865v1](http://arxiv.org/abs/2303.12865v1)|**[link](https://github.com/mshahbazi72/nerf-gan-distillation)**|\n", "2303.14001": "|**2023-03-24**|**Grid-guided Neural Radiance Fields for Large Urban Scenes**|Linning Xu et.al.|[2303.14001v1](http://arxiv.org/abs/2303.14001v1)|null|\n", "2303.13843": "|**2023-03-24**|**CompoNeRF: Text-guided Multi-object Compositional NeRF with Editable 3D Scene Layout**|Yiqi Lin et.al.|[2303.13843v1](http://arxiv.org/abs/2303.13843v1)|null|\n", "2303.13825": "|**2023-03-24**|**HandNeRF: Neural Radiance Fields for Animatable Interacting Hands**|Zhiyang Guo et.al.|[2303.13825v1](http://arxiv.org/abs/2303.13825v1)|null|\n", "2303.13817": "|**2023-03-24**|**ABLE-NeRF: Attention-Based Rendering with Learnable Embeddings for Neural Radiance Field**|Zhe Jun Tang et.al.|[2303.13817v1](http://arxiv.org/abs/2303.13817v1)|**[link](https://github.com/tangzj/able-nerf)**|\n", "2303.13777": "|**2023-03-24**|**GM-NeRF: Learning Generalizable Model-based Neural Radiance Fields from Multi-view Images**|Jianchuan Chen et.al.|[2303.13777v1](http://arxiv.org/abs/2303.13777v1)|null|\n", "2303.13743": "|**2023-03-24**|**TEGLO: High Fidelity Canonical Texture Mapping from Single-View Images**|Vishal Vinod et.al.|[2303.13743v1](http://arxiv.org/abs/2303.13743v1)|null|\n", "2303.13582": "|**2023-03-23**|**SCADE: NeRFs from Space Carving with Ambiguity-Aware Depth Estimates**|Mikaela Angelina Uy et.al.|[2303.13582v1](http://arxiv.org/abs/2303.13582v1)|null|\n", "2303.15427": "|**2023-03-27**|**JAWS: Just A Wild Shot for Cinematic Transfer in Neural Radiance Fields**|Xi Wang et.al.|[2303.15427v1](http://arxiv.org/abs/2303.15427v1)|**[link](https://github.com/robincourant/jaws)**|\n", "2303.15387": "|**2023-03-27**|**Generalizable Neural Voxels for Fast Human Radiance Fields**|Taoran Yi et.al.|[2303.15387v1](http://arxiv.org/abs/2303.15387v1)|null|\n", "2303.15368": "|**2023-03-27**|**NeUDF: Learning Unsigned Distance Fields from Multi-view Images for Reconstructing Non-watertight Models**|Fei Hou et.al.|[2303.15368v1](http://arxiv.org/abs/2303.15368v1)|null|\n", "2303.15012": "|**2023-03-27**|**3D-Aware Multi-Class Image-to-Image Translation with NeRFs**|Senmao Li et.al.|[2303.15012v1](http://arxiv.org/abs/2303.15012v1)|**[link](https://github.com/sen-mao/3di2i-translation)**|\n", "2303.14707": "|**2023-03-26**|**Clean-NeRF: Reformulating NeRF to account for View-Dependent Observations**|Xinhang Liu et.al.|[2303.14707v1](http://arxiv.org/abs/2303.14707v1)|null|\n", "2303.14536": "|**2023-03-25**|**SUDS: Scalable Urban Dynamic Scenes**|Haithem Turki et.al.|[2303.14536v1](http://arxiv.org/abs/2303.14536v1)|null|\n", "2303.14478": "|**2023-03-25**|**DBARF: Deep Bundle-Adjusting Generalizable Neural Radiance Fields**|Yu Chen et.al.|[2303.14478v1](http://arxiv.org/abs/2303.14478v1)|null|\n", "2303.14435": "|**2023-03-25**|**NeRF-DS: Neural Radiance Fields for Dynamic Specular Objects**|Zhiwen Yan et.al.|[2303.14435v1](http://arxiv.org/abs/2303.14435v1)|**[link](https://github.com/jokeryan/nerf-ds)**|\n", "2303.15206": "|**2023-03-24**|**Perceptual Quality Assessment of NeRF and Neural View Synthesis Methods for Front-Facing Views**|Hanxue Liang et.al.|[2303.15206v1](http://arxiv.org/abs/2303.15206v1)|null|\n", "2303.16196": "|**2023-03-28**|**SparseNeRF: Distilling Depth Ranking for Few-shot Novel View Synthesis**|Guangcong Wang et.al.|[2303.16196v1](http://arxiv.org/abs/2303.16196v1)|null|\n", "2303.16184": "|**2023-03-28**|**VMesh: Hybrid Volume-Mesh Representation for Efficient View Synthesis**|Yuan-Chen Guo et.al.|[2303.16184v1](http://arxiv.org/abs/2303.16184v1)|null|\n", "2303.16001": "|**2023-03-30**|**Adaptive Voronoi NeRFs**|Tim Elsner et.al.|[2303.16001v2](http://arxiv.org/abs/2303.16001v2)|null|\n", "2303.15951": "|**2023-03-28**|**F$^{2}$-NeRF: Fast Neural Radiance Field Training with Free Camera Trajectories**|Peng Wang et.al.|[2303.15951v1](http://arxiv.org/abs/2303.15951v1)|**[link](https://github.com/Totoro97/f2-nerf)**|\n", "2303.16485": "|**2023-03-29**|**TriVol: Point Cloud Rendering via Triple Volumes**|Tao Hu et.al.|[2303.16485v1](http://arxiv.org/abs/2303.16485v1)|**[link](https://github.com/dvlab-research/trivol)**|\n", "2303.16482": "|**2023-03-29**|**Point2Pix: Photo-Realistic Point Cloud Rendering via Neural Radiance Fields**|Tao Hu et.al.|[2303.16482v1](http://arxiv.org/abs/2303.16482v1)|null|\n", "2303.16333": "|**2023-03-28**|**Flow supervision for Deformable NeRF**|Chaoyang Wang et.al.|[2303.16333v1](http://arxiv.org/abs/2303.16333v1)|null|\n", "2303.17603": "|**2023-03-30**|**NeRF-Supervised Deep Stereo**|Fabio Tosi et.al.|[2303.17603v1](http://arxiv.org/abs/2303.17603v1)|**[link](https://github.com/fabiotosi92/nerf-supervised-deep-stereo)**|\n", "2303.17368": "|**2023-03-30**|**SynBody: Synthetic Dataset with Layered Human Models for 3D Human Perception and Modeling**|Zhitao Yang et.al.|[2303.17368v1](http://arxiv.org/abs/2303.17368v1)|**[link](https://github.com/openxrlab/xrfeitoria)**|\n", "2303.17147": "|**2023-03-30**|**NeILF++: Inter-Reflectable Light Fields for Geometry and Material Estimation**|Jingyang Zhang et.al.|[2303.17147v1](http://arxiv.org/abs/2303.17147v1)|null|\n", "2303.17094": "|**2023-03-30**|**Enhanced Stable View Synthesis**|Nishant Jain et.al.|[2303.17094v1](http://arxiv.org/abs/2303.17094v1)|null|\n", "2303.17968": "|**2023-03-31**|**VDN-NeRF: Resolving Shape-Radiance Ambiguity via View-Dependence Normalization**|Bingfan Zhu et.al.|[2303.17968v1](http://arxiv.org/abs/2303.17968v1)|**[link](https://github.com/boifz/vdn-nerf)**|\n", "2304.00916": "|**2023-04-06**|**DreamAvatar: Text-and-Shape Guided 3D Human Avatar Generation via Diffusion Models**|Yukang Cao et.al.|[2304.00916v2](http://arxiv.org/abs/2304.00916v2)|null|\n", "2304.00341": "|**2023-04-01**|**JacobiNeRF: NeRF Shaping with Mutual Information Gradients**|Xiaomeng Xu et.al.|[2304.00341v1](http://arxiv.org/abs/2304.00341v1)|**[link](https://github.com/xxm19/jacobinerf)**|\n", "2304.02001": "|**2023-04-04**|**MonoHuman: Animatable Human Neural Field from Monocular Video**|Zhengming Yu et.al.|[2304.02001v1](http://arxiv.org/abs/2304.02001v1)|null|\n", "2304.02061": "|**2023-04-11**|**Generating Continual Human Motion in Diverse 3D Scenes**|Aymen Mir et.al.|[2304.02061v2](http://arxiv.org/abs/2304.02061v2)|null|\n", "2304.03280": "|**2023-04-06**|**LANe: Lighting-Aware Neural Fields for Compositional Scene Synthesis**|Akshay Krishnan et.al.|[2304.03280v1](http://arxiv.org/abs/2304.03280v1)|null|\n", "2304.03266": "|**2023-04-06**|**Neural Fields meet Explicit Geometric Representation for Inverse Rendering of Urban Scenes**|Zian Wang et.al.|[2304.03266v1](http://arxiv.org/abs/2304.03266v1)|null|\n", "2304.02827": "|**2023-04-06**|**DITTO-NeRF: Diffusion-based Iterative Text To Omni-directional 3D Model**|Hoigi Seo et.al.|[2304.02827v1](http://arxiv.org/abs/2304.02827v1)|null|\n", "2304.02736": "|**2023-04-05**|**Image Stabilization for Hololens Camera in Remote Collaboration**|Gowtham Senthil et.al.|[2304.02736v1](http://arxiv.org/abs/2304.02736v1)|null|\n", "2304.03526": "|**2023-04-07**|**Lift3D: Synthesize 3D Training Data by Lifting 2D GAN to 3D Generative Radiance Field**|Leheng Li et.al.|[2304.03526v1](http://arxiv.org/abs/2304.03526v1)|null|\n", "2304.03384": "|**2023-04-06**|**Beyond NeRF Underwater: Learning Neural Reflectance Fields for True Color Correction of Marine Imagery**|Tianyi Zhang et.al.|[2304.03384v1](http://arxiv.org/abs/2304.03384v1)|**[link](https://github.com/tyz1030/neuralsea)**|\n", "2304.04452": "|**2023-04-10**|**Neural Residual Radiance Fields for Streamably Free-Viewpoint Videos**|Liao Wang et.al.|[2304.04452v1](http://arxiv.org/abs/2304.04452v1)|null|\n", "2304.04446": "|**2023-04-10**|**Inferring Fluid Dynamics via Inverse Rendering**|Jinxian Liu et.al.|[2304.04446v1](http://arxiv.org/abs/2304.04446v1)|null|\n", "2304.04395": "|**2023-04-10**|**Instance Neural Radiance Field**|Benran Hu et.al.|[2304.04395v1](http://arxiv.org/abs/2304.04395v1)|**[link](https://github.com/lyclyc52/instance_nerf)**|\n", "2304.04133": "|**2023-04-12**|**NeRF applied to satellite imagery for surface reconstruction**|Federico Semeraro et.al.|[2304.04133v3](http://arxiv.org/abs/2304.04133v3)|**[link](https://github.com/fsemerar/satnerf)**|\n", "2304.04012": "|**2023-04-08**|**PVD-AL: Progressive Volume Distillation with Active Learning for Efficient Conversion Between Different NeRF Architectures**|Shuangkang Fang et.al.|[2304.04012v1](http://arxiv.org/abs/2304.04012v1)|**[link](https://github.com/megvii-research/AAAI2023-PVD)**|\n", "2304.04559": "|**2023-04-07**|**Event-based Camera Tracker by $\\nabla$t NeRF**|Mana Masuda et.al.|[2304.04559v1](http://arxiv.org/abs/2304.04559v1)|null|\n", "2304.05218": "|**2023-04-11**|**Improving Neural Radiance Fields with Depth-aware Optimization for Novel View Synthesis**|Shu Chen et.al.|[2304.05218v1](http://arxiv.org/abs/2304.05218v1)|**[link](https://github.com/xtu-pr-lab/sfmnerf)**|\n", "2304.05097": "|**2023-04-11**|**One-Shot High-Fidelity Talking-Head Synthesis with Deformable Neural Radiance Field**|Weichuang Li et.al.|[2304.05097v1](http://arxiv.org/abs/2304.05097v1)|null|\n", "2304.04962": "|**2023-04-11**|**MRVM-NeRF: Mask-Based Pretraining for Neural Radiance Fields**|Ganlin Yang et.al.|[2304.04962v1](http://arxiv.org/abs/2304.04962v1)|null|\n", "2304.04897": "|**2023-04-10**|**Neural Image-based Avatars: Generalizable Radiance Fields for Human Avatar Modeling**|Youngjoong Kwon et.al.|[2304.04897v1](http://arxiv.org/abs/2304.04897v1)|null|\n", "2304.05620": "|**2023-04-12**|**NutritionVerse-Thin: An Optimized Strategy for Enabling Improved Rendering of 3D Thin Food Models**|Chi-en Amy Tai et.al.|[2304.05620v1](http://arxiv.org/abs/2304.05620v1)|null|\n", "2304.06714": "|**2023-04-17**|**Single-Stage Diffusion NeRF: A Unified Approach to 3D Generation and Reconstruction**|Hansheng Chen et.al.|[2304.06714v2](http://arxiv.org/abs/2304.06714v2)|**[link](https://github.com/Lakonik/SSDNeRF)**|\n", "2304.06706": "|**2023-04-13**|**Zip-NeRF: Anti-Aliased Grid-Based Neural Radiance Fields**|Jonathan T. Barron et.al.|[2304.06706v1](http://arxiv.org/abs/2304.06706v1)|null|\n", "2304.06287": "|**2023-04-13**|**NeRFVS: Neural Radiance Fields for Free View Synthesis via Geometry Scaffolds**|Chen Yang et.al.|[2304.06287v1](http://arxiv.org/abs/2304.06287v1)|null|\n", "2304.06969": "|**2023-04-14**|**UVA: Towards Unified Volumetric Avatar for View Synthesis, Pose rendering, Geometry and Texture Editing**|Jinlong Fan et.al.|[2304.06969v1](http://arxiv.org/abs/2304.06969v1)|null|\n", "2304.08279": "|**2023-04-17**|**MoDA: Modeling Deformable 3D Objects from Casual Videos**|Chaoyue Song et.al.|[2304.08279v1](http://arxiv.org/abs/2304.08279v1)|**[link](https://github.com/chaoyuesong/moda)**|\n", "2304.07979": "|**2023-04-17**|**NeRF-Loc: Visual Localization with Conditional Neural Radiance Field**|Jianlin Liu et.al.|[2304.07979v1](http://arxiv.org/abs/2304.07979v1)|**[link](https://github.com/jenningsl/nerf-loc)**|\n", "2304.07918": "|**2023-04-16**|**Likelihood-Based Generative Radiance Field with Latent Space Energy-Based Model for 3D-Aware Disentangled Image Representation**|Yaxuan Zhu et.al.|[2304.07918v1](http://arxiv.org/abs/2304.07918v1)|null|\n", "2304.07915": "|**2023-04-16**|**CAT-NeRF: Constancy-Aware Tx$^2$Former for Dynamic Body Modeling**|Haidong Zhu et.al.|[2304.07915v1](http://arxiv.org/abs/2304.07915v1)|**[link](https://github.com/haidongz-usc/CAT-NeRF)**|\n", "2304.07743": "|**2023-04-16**|**SeaThru-NeRF: Neural Radiance Fields in Scattering Media**|Deborah Levy et.al.|[2304.07743v1](http://arxiv.org/abs/2304.07743v1)|**[link](https://github.com/deborahLevy130/seathru_NeRF)**|\n", "2304.08971": "|**2023-04-18**|**SurfelNeRF: Neural Surfel Radiance Fields for Online Photorealistic Reconstruction of Indoor Scenes**|Yiming Gao et.al.|[2304.08971v1](http://arxiv.org/abs/2304.08971v1)|null|\n", "2304.08757": "|**2023-04-18**|**NeAI: A Pre-convoluted Representation for Plug-and-Play Neural Ambient Illumination**|Yiyu Zhuang et.al.|[2304.08757v1](http://arxiv.org/abs/2304.08757v1)|null|\n", "2304.09677": "|**2023-04-20**|**Reference-guided Controllable Inpainting of Neural Radiance Fields**|Ashkan Mirzaei et.al.|[2304.09677v2](http://arxiv.org/abs/2304.09677v2)|null|\n", "2304.10537": "|**2023-04-20**|**Learning Neural Duplex Radiance Fields for Real-Time View Synthesis**|Ziyu Wan et.al.|[2304.10537v1](http://arxiv.org/abs/2304.10537v1)|null|\n", "2304.10532": "|**2023-04-21**|**Nerfbusters: Removing Ghostly Artifacts from Casually Captured NeRFs**|Frederik Warburg et.al.|[2304.10532v2](http://arxiv.org/abs/2304.10532v2)|**[link](https://github.com/ethanweber/nerfbusters)**|\n", "2304.10448": "|**2023-04-20**|**ReLight My NeRF: A Dataset for Novel View Synthesis and Relighting of Real World Objects**|Marco Toschi et.al.|[2304.10448v1](http://arxiv.org/abs/2304.10448v1)|null|\n", "2304.10406": "|**2023-04-20**|**LiDAR-NeRF: Novel LiDAR View Synthesis via Neural Radiance Fields**|Tang Tao et.al.|[2304.10406v1](http://arxiv.org/abs/2304.10406v1)|**[link](https://github.com/tangtaogo/lidar-nerf)**|\n", "2304.10250": "|**2023-04-20**|**Revisiting Implicit Neural Representations in Low-Level Vision**|Wentian Xu et.al.|[2304.10250v1](http://arxiv.org/abs/2304.10250v1)|**[link](https://github.com/wentxul/linr)**|\n", "2304.10075": "|**2023-04-20**|**Multiscale Representation for Real-Time Anti-Aliasing Neural Rendering**|Dongting Hu et.al.|[2304.10075v1](http://arxiv.org/abs/2304.10075v1)|null|\n", "2304.10050": "|**2023-04-20**|**Neural Radiance Fields: Past, Present, and Future**|Ansh Mittal et.al.|[2304.10050v1](http://arxiv.org/abs/2304.10050v1)|null|\n", "2304.09987": "|**2023-04-19**|**Tetra-NeRF: Representing Neural Radiance Fields Using Tetrahedra**|Jonas Kulhanek et.al.|[2304.09987v1](http://arxiv.org/abs/2304.09987v1)|**[link](https://github.com/jkulhanek/tetra-nerf)**|\n", "2304.10780": "|**2023-04-21**|**Omni-Line-of-Sight Imaging for Holistic Shape Reconstruction**|Binbin Huang et.al.|[2304.10780v1](http://arxiv.org/abs/2304.10780v1)|null|\n", "2304.10664": "|**2023-04-20**|**A Comparative Neural Radiance Field (NeRF) 3D Analysis of Camera Poses from HoloLens Trajectories and Structure from Motion**|Miriam J\u00e4ger et.al.|[2304.10664v1](http://arxiv.org/abs/2304.10664v1)|null|\n", "2304.12308": "|**2023-04-26**|**Segment Anything in 3D with NeRFs**|Jiazhong Cen et.al.|[2304.12308v2](http://arxiv.org/abs/2304.12308v2)|null|\n", "2304.12294": "|**2023-04-24**|**Explicit Correspondence Matching for Generalizable Neural Radiance Fields**|Yuedong Chen et.al.|[2304.12294v1](http://arxiv.org/abs/2304.12294v1)|**[link](https://github.com/donydchen/matchnerf)**|\n", "2304.11842": "|**2023-04-25**|**Gen-NeRF: Efficient and Generalizable Neural Radiance Fields via Algorithm-Hardware Co-Design**|Yonggan Fu et.al.|[2304.11842v2](http://arxiv.org/abs/2304.11842v2)|null|\n", "2304.11470": "|**2023-04-22**|**3D-IntPhys: Towards More Generalized 3D-grounded Visual Intuitive Physics under Challenging Scenes**|Haotian Xue et.al.|[2304.11470v1](http://arxiv.org/abs/2304.11470v1)|null|\n", "2304.11448": "|**2023-04-22**|**Dehazing-NeRF: Neural Radiance Fields from Hazy Images**|Tian Li et.al.|[2304.11448v1](http://arxiv.org/abs/2304.11448v1)|null|\n", "2304.11342": "|**2023-04-22**|**NaviNeRF: NeRF-based 3D Representation Disentanglement by Latent Semantic Navigation**|Baao Xie et.al.|[2304.11342v1](http://arxiv.org/abs/2304.11342v1)|null|\n", "2304.11241": "|**2023-04-21**|**AutoNeRF: Training Implicit Scene Representations with Autonomous Agents**|Pierre Marza et.al.|[2304.11241v1](http://arxiv.org/abs/2304.11241v1)|null|\n", "2304.12746": "|**2023-04-25**|**Local Implicit Ray Function for Generalizable Radiance Field Representation**|Xin Huang et.al.|[2304.12746v1](http://arxiv.org/abs/2304.12746v1)|null|\n", "2304.12587": "|**2023-04-27**|**MF-NeRF: Memory Efficient NeRF with Mixed-Feature Hash Table**|Yongjae Lee et.al.|[2304.12587v3](http://arxiv.org/abs/2304.12587v3)|**[link](https://github.com/nfyfamr/mf-nerf)**|\n", "2304.12467": "|**2023-04-24**|**Instant-3D: Instant Neural Radiance Field Training Towards On-Device AR/VR 3D Reconstruction**|Sixu Li et.al.|[2304.12467v1](http://arxiv.org/abs/2304.12467v1)|null|\n", "2304.12439": "|**2023-04-24**|**TextMesh: Generation of Realistic 3D Meshes From Text Prompts**|Christina Tsalicoglou et.al.|[2304.12439v1](http://arxiv.org/abs/2304.12439v1)|null|\n", "2304.13518": "|**2023-04-26**|**Super-NeRF: View-consistent Detail Generation for NeRF super-resolution**|Yuqi Han et.al.|[2304.13518v1](http://arxiv.org/abs/2304.13518v1)|null|\n", "2304.13386": "|**2023-04-26**|**VGOS: Voxel Grid Optimization for View Synthesis from Sparse Inputs**|Jiakai Sun et.al.|[2304.13386v1](http://arxiv.org/abs/2304.13386v1)|**[link](https://github.com/sjojok/vgos)**|\n", "2304.14401": "|**2023-04-27**|**ActorsNeRF: Animatable Few-shot Human Rendering with Generalizable NeRFs**|Jiteng Mu et.al.|[2304.14401v1](http://arxiv.org/abs/2304.14401v1)|null|\n", "2304.14301": "|**2023-05-03**|**Combining HoloLens with Instant-NeRFs: Advanced Real-Time 3D Mobile Mapping**|Dennis Haitz et.al.|[2304.14301v2](http://arxiv.org/abs/2304.14301v2)|null|\n", "2304.14070": "|**2023-04-27**|**Compositional 3D Human-Object Neural Animation**|Zhi Hou et.al.|[2304.14070v1](http://arxiv.org/abs/2304.14070v1)|null|\n", "2304.14811": "|**2023-04-28**|**NeRF-LiDAR: Generating Realistic LiDAR Point Clouds with Neural Radiance Fields**|Junge Zhang et.al.|[2304.14811v1](http://arxiv.org/abs/2304.14811v1)|null|\n", "2304.14473": "|**2023-04-27**|**Learning a Diffusion Prior for NeRFs**|Guandao Yang et.al.|[2304.14473v1](http://arxiv.org/abs/2304.14473v1)|null|\n", "2305.00787": "|**2023-05-01**|**GeneFace++: Generalized and Stable Real-Time Audio-Driven 3D Talking Face Generation**|Zhenhui Ye et.al.|[2305.00787v1](http://arxiv.org/abs/2305.00787v1)|null|\n", "2305.00375": "|**2023-04-30**|**Neural Radiance Fields (NeRFs): A Review and Some Recent Developments**|Mohamed Debbagh et.al.|[2305.00375v1](http://arxiv.org/abs/2305.00375v1)|null|\n", "2305.00041": "|**2023-04-28**|**ViP-NeRF: Visibility Prior for Sparse Input Neural Radiance Fields**|Nagabhushan Somraj et.al.|[2305.00041v1](http://arxiv.org/abs/2305.00041v1)|**[link](https://github.com/NagabhushanSN95/ViP-NeRF)**|\n", "2305.01643": "|**2023-05-02**|**Neural LiDAR Fields for Novel View Synthesis**|Shengyu Huang et.al.|[2305.01643v1](http://arxiv.org/abs/2305.01643v1)|null|\n", "2305.01190": "|**2023-05-03**|**LatentAvatar: Learning Latent Expression Code for Expressive Neural Head Avatar**|Yuelang Xu et.al.|[2305.01190v2](http://arxiv.org/abs/2305.01190v2)|null|\n", "2305.01163": "|**2023-05-02**|**Federated Neural Radiance Fields**|Lachlan Holden et.al.|[2305.01163v1](http://arxiv.org/abs/2305.01163v1)|**[link](https://github.com/lachholden/fednerf-pytorch)**|\n", "2305.03049": "|**2023-05-04**|**NeuralEditor: Editing Neural Radiance Fields via Manipulating Point Clouds**|Jun-Kun Chen et.al.|[2305.03049v1](http://arxiv.org/abs/2305.03049v1)|null|\n", "2305.02756": "|**2023-05-04**|**Radiance Field Gradient Scaling for Unbiased Near-Camera Training**|Julien Philip et.al.|[2305.02756v1](http://arxiv.org/abs/2305.02756v1)|**[link](https://github.com/gradient-scaling/gradient-scaling.github.io)**|\n", "2305.02618": "|**2023-05-04**|**Semantic-aware Generation of Multi-view Portrait Drawings**|Biao Ma et.al.|[2305.02618v1](http://arxiv.org/abs/2305.02618v1)|**[link](https://github.com/aiart-hdu/sage)**|\n", "2305.03176": "|**2023-05-04**|**NeRF-QA: Neural Radiance Fields Quality Assessment Database**|Pedro Martin et.al.|[2305.03176v1](http://arxiv.org/abs/2305.03176v1)|null|\n", "2305.04789": "|**2023-05-08**|**AvatarReX: Real-time Expressive Full-body Avatars**|Zerong Zheng et.al.|[2305.04789v1](http://arxiv.org/abs/2305.04789v1)|null|\n", "2305.04296": "|**2023-05-07**|**HashCC: Lightweight Method to Improve the Quality of the Camera-less NeRF Scene Generation**|Jan Olszewski et.al.|[2305.04296v1](http://arxiv.org/abs/2305.04296v1)|null|\n", "2305.04268": "|**2023-05-07**|**Multi-Space Neural Radiance Fields**|Ze-Xin Yin et.al.|[2305.04268v1](http://arxiv.org/abs/2305.04268v1)|null|\n", "2305.05594": "|**2023-05-09**|**PET-NeuS: Positional Encoding Tri-Planes for Neural Surfaces**|Yiqun Wang et.al.|[2305.05594v1](http://arxiv.org/abs/2305.05594v1)|**[link](https://github.com/yiqun-wang/pet-neus)**|\n", "2305.04966": "|**2023-05-08**|**NerfAcc: Efficient Sampling Accelerates NeRFs**|Ruilong Li et.al.|[2305.04966v1](http://arxiv.org/abs/2305.04966v1)|null|\n", "2305.06131": "|**2023-05-10**|**Generative AI meets 3D: A Survey on Text-to-3D in AIGC Era**|Chenghao Li et.al.|[2305.06131v1](http://arxiv.org/abs/2305.06131v1)|null|\n", "2305.06118": "|**2023-05-10**|**NeRF$^\\textbf{2}$: Neural Radio-Frequency Radiance Fields**|Xiaopeng Zhao et.al.|[2305.06118v1](http://arxiv.org/abs/2305.06118v1)|null|\n", "2305.05766": "|**2023-05-09**|**Instant-NeRF: Instant On-Device Neural Radiance Field Training via Algorithm-Accelerator Co-Designed Near-Memory Processing**|Yang Zhao et.al.|[2305.05766v1](http://arxiv.org/abs/2305.05766v1)|null|\n", "2305.07342": "|**2023-05-12**|**BundleRecon: Ray Bundle-Based 3D Neural Reconstruction**|Weikun Zhang et.al.|[2305.07342v1](http://arxiv.org/abs/2305.07342v1)|null|\n", "2305.08851": "|**2023-05-15**|**MV-Map: Offboard HD-Map Generation with Multi-view Consistency**|Ziyang Xie et.al.|[2305.08851v1](http://arxiv.org/abs/2305.08851v1)|**[link](https://github.com/ziyang-xie/mv-map)**|\n", "2305.09761": "|**2023-05-16**|**NerfBridge: Bringing Real-time, Online Neural Radiance Field Training to Robotics**|Javier Yu et.al.|[2305.09761v1](http://arxiv.org/abs/2305.09761v1)|**[link](https://github.com/javieryu/nerf_bridge)**|\n", "2305.11167": "|**2023-05-18**|**MVPSNet: Fast Generalizable Multi-view Photometric Stereo**|Dongxu Zhao et.al.|[2305.11167v1](http://arxiv.org/abs/2305.11167v1)|null|\n", "2305.11031": "|**2023-05-18**|**ConsistentNeRF: Enhancing Neural Radiance Fields with 3D Consistency for Sparse View Synthesis**|Shoukang Hu et.al.|[2305.11031v1](http://arxiv.org/abs/2305.11031v1)|**[link](https://github.com/skhu101/consistentnerf)**|\n", "2305.10579": "|**2023-05-17**|**MultiPlaneNeRF: Neural Radiance Field with Non-Trainable Representation**|Dominik Zimny et.al.|[2305.10579v1](http://arxiv.org/abs/2305.10579v1)|**[link](https://github.com/gmum/multiplanenerf)**|\n", "2305.10503": "|**2023-05-24**|**OR-NeRF: Object Removing from 3D Scenes Guided by Multiview Segmentation with Neural Radiance Fields**|Youtan Yin et.al.|[2305.10503v2](http://arxiv.org/abs/2305.10503v2)|**[link](https://github.com/cuteyyt/or-nerf)**|\n", "2305.11588": "|**2023-05-19**|**Text2NeRF: Text-Driven 3D Scene Generation with Neural Radiance Fields**|Jingbo Zhang et.al.|[2305.11588v1](http://arxiv.org/abs/2305.11588v1)|null|\n", "2305.13307": "|**2023-05-22**|**NeRFuser: Large-Scale Scene Representation by NeRF Fusion**|Jiading Fang et.al.|[2305.13307v1](http://arxiv.org/abs/2305.13307v1)|**[link](https://github.com/ripl/nerfuser)**|\n", "2305.12843": "|**2023-05-22**|**Registering Neural Radiance Fields as 3D Density Images**|Han Jiang et.al.|[2305.12843v1](http://arxiv.org/abs/2305.12843v1)|null|\n", "2305.14093": "|**2023-05-24**|**3D Open-vocabulary Segmentation with Foundation Models**|Kunhao Liu et.al.|[2305.14093v2](http://arxiv.org/abs/2305.14093v2)|**[link](https://github.com/kunhao-liu/3d-ovs)**|\n", "2305.15171": "|**2023-05-31**|**Deceptive-NeRF: Enhancing NeRF Reconstruction using Pseudo-Observations from Diffusion Models**|Xinhang Liu et.al.|[2305.15171v2](http://arxiv.org/abs/2305.15171v2)|null|\n", "2305.15094": "|**2023-05-24**|**InpaintNeRF360: Text-Guided 3D Inpainting on Unbounded Neural Radiance Fields**|Dongqing Wang et.al.|[2305.15094v1](http://arxiv.org/abs/2305.15094v1)|null|\n", "2305.14831": "|**2023-05-24**|**OD-NeRF: Efficient Training of On-the-Fly Dynamic Neural Radiance Fields**|Zhiwen Yan et.al.|[2305.14831v1](http://arxiv.org/abs/2305.14831v1)|null|\n", "2305.16233": "|**2023-05-25**|**Interactive Segment Anything NeRF with Feature Imitation**|Xiaokang Chen et.al.|[2305.16233v1](http://arxiv.org/abs/2305.16233v1)|null|\n", "2305.16213": "|**2023-05-25**|**ProlificDreamer: High-Fidelity and Diverse Text-to-3D Generation with Variational Score Distillation**|Zhengyi Wang et.al.|[2305.16213v1](http://arxiv.org/abs/2305.16213v1)|**[link](https://github.com/thu-ml/prolificdreamer)**|\n", "2305.16914": "|**2023-06-06**|**PlaNeRF: SVD Unsupervised 3D Plane Regularization for NeRF Large-Scale Scene Reconstruction**|Fusang Wang et.al.|[2305.16914v3](http://arxiv.org/abs/2305.16914v3)|null|\n", "2305.16411": "|**2023-05-25**|**ZeroAvatar: Zero-shot 3D Avatar Generation from a Single Image**|Zhenzhen Weng et.al.|[2305.16411v1](http://arxiv.org/abs/2305.16411v1)|null|\n", "2305.18079": "|**2023-05-31**|**Towards a Robust Framework for NeRF Evaluation**|Adrian Azzarelli et.al.|[2305.18079v3](http://arxiv.org/abs/2305.18079v3)|**[link](https://github.com/azzarelli/wape)**|\n", "2305.17916": "|**2023-05-31**|**Volume Feature Rendering for Fast Neural Radiance Field Reconstruction**|Kang Han et.al.|[2305.17916v2](http://arxiv.org/abs/2305.17916v2)|null|\n", "2305.19201": "|**2023-05-30**|**D\u00e4RF: Boosting Radiance Fields from Sparse Inputs with Monocular Depth Adaptation**|Jiuhn Song et.al.|[2305.19201v1](http://arxiv.org/abs/2305.19201v1)|**[link](https://github.com/KU-CVLAB/DaRF)**|\n", "2305.19065": "|**2023-05-30**|**Template-free Articulated Neural Point Clouds for Reposable View Synthesis**|Lukas Uzolas et.al.|[2305.19065v1](http://arxiv.org/abs/2305.19065v1)|**[link](https://github.com/lukasuz/articulated-point-nerf)**|\n", "2305.18766": "|**2023-05-31**|**HiFA: High-fidelity Text-to-3D with Advanced Diffusion Guidance**|Junzhe Zhu et.al.|[2305.18766v2](http://arxiv.org/abs/2305.18766v2)|null|\n", "2306.00783": "|**2023-06-01**|**FDNeRF: Semantics-Driven Face Reconstruction, Prompt Editing and Relighting with Diffusion Models**|Hao Zhang et.al.|[2306.00783v1](http://arxiv.org/abs/2306.00783v1)|**[link](https://github.com/billyxyb/fdnerf)**|\n", "2306.00696": "|**2023-06-01**|**Analyzing the Internals of Neural Radiance Fields**|Lukas Radl et.al.|[2306.00696v1](http://arxiv.org/abs/2306.00696v1)|**[link](https://github.com/r4dl/nerfinternals)**|\n", "2306.00547": "|**2023-06-02**|**AvatarStudio: Text-driven Editing of 3D Dynamic Human Head Avatars**|Mohit Mendiratta et.al.|[2306.00547v2](http://arxiv.org/abs/2306.00547v2)|null|\n", "2306.03000": "|**2023-06-05**|**BeyondPixels: A Comprehensive Review of the Evolution of Neural Radiance Fields**|AKM Shahariar Azad Rabby et.al.|[2306.03000v1](http://arxiv.org/abs/2306.03000v1)|null|\n", "2306.02741": "|**2023-06-05**|**ZIGNeRF: Zero-shot 3D Scene Representation with Invertible Generative Neural Radiance Fields**|Kanghyeok Ko et.al.|[2306.02741v1](http://arxiv.org/abs/2306.02741v1)|null|\n", "2306.03727": "|**2023-06-06**|**Towards Visual Foundational Models of Physical Scenes**|Chethan Parameshwara et.al.|[2306.03727v1](http://arxiv.org/abs/2306.03727v1)|null|\n", "2306.03576": "|**2023-06-06**|**Human 3D Avatar Modeling with Implicit Neural Representation: A Brief Survey**|Mingyang Sun et.al.|[2306.03576v1](http://arxiv.org/abs/2306.03576v1)|null|\n", "2306.03207": "|**2023-06-05**|**H2-Mapping: Real-time Dense Mapping Using Hierarchical Hybrid Representation**|Chenxing Jiang et.al.|[2306.03207v1](http://arxiv.org/abs/2306.03207v1)|**[link](https://github.com/sysu-star/h2-mapping)**|\n", "2306.05410": "|**2023-06-08**|**LU-NeRF: Scene and Pose Estimation by Synchronizing Local Unposed NeRFs**|Zezhou Cheng et.al.|[2306.05410v1](http://arxiv.org/abs/2306.05410v1)|null|\n", "2306.05303": "|**2023-06-08**|**Enhance-NeRF: Multiple Performance Evaluation for Neural Radiance Fields**|Qianqiu Tan et.al.|[2306.05303v1](http://arxiv.org/abs/2306.05303v1)|**[link](https://github.com/tanqianq/enhance-nerf)**|\n", "2306.06093": "|**2023-06-09**|**HyP-NeRF: Learning Improved NeRF Priors using a HyperNetwork**|Bipasha Sen et.al.|[2306.06093v1](http://arxiv.org/abs/2306.06093v1)|null|\n", "2306.06044": "|**2023-06-09**|**GANeRF: Leveraging Discriminators to Optimize Neural Radiance Fields**|Barbara Roessle et.al.|[2306.06044v1](http://arxiv.org/abs/2306.06044v1)|null|\n", "2306.05668": "|**2023-06-09**|**RePaint-NeRF: NeRF Editting via Semantic Masks and Diffusion Models**|Xingchen Zhou et.al.|[2306.05668v1](http://arxiv.org/abs/2306.05668v1)|null|\n", "2306.06388": "|**2023-06-10**|**From NeRFLiX to NeRFLiX++: A General NeRF-Agnostic Restorer Paradigm**|Kun Zhou et.al.|[2306.06388v1](http://arxiv.org/abs/2306.06388v1)|null|\n", "2306.06300": "|**2023-06-15**|**NERFBK: A High-Quality Benchmark for NERF-Based 3D Reconstruction**|Ali Karami et.al.|[2306.06300v2](http://arxiv.org/abs/2306.06300v2)|**[link](https://github.com/3dom-fbk/nerfbk)**|\n", "2306.07581": "|**2023-06-13**|**Binary Radiance Fields**|Seungjoo Shin et.al.|[2306.07581v1](http://arxiv.org/abs/2306.07581v1)|null|\n", "2306.09349": "|**2023-06-16**|**UrbanIR: Large-Scale Urban Scene Inverse Rendering from a Single Video**|Zhi-Hao Lin et.al.|[2306.09349v2](http://arxiv.org/abs/2306.09349v2)|null|\n", "2306.08068": "|**2023-06-13**|**DORSal: Diffusion for Object-centric Representations of Scenes $\\textit{et al.}$**|Allan Jabri et.al.|[2306.08068v1](http://arxiv.org/abs/2306.08068v1)|null|\n", "2306.09551": "|**2023-06-15**|**Edit-DiffNeRF: Editing 3D Neural Radiance Fields using 2D Diffusion Model**|Lu Yu et.al.|[2306.09551v1](http://arxiv.org/abs/2306.09551v1)|null|\n", "2306.11556": "|**2023-06-20**|**NeRF synthesis with shading guidance**|Chenbin Li et.al.|[2306.11556v1](http://arxiv.org/abs/2306.11556v1)|null|\n", "2306.10350": "|**2023-06-24**|**MA-NeRF: Motion-Assisted Neural Radiance Fields for Face Synthesis from Sparse Images**|Weichen Zhang et.al.|[2306.10350v2](http://arxiv.org/abs/2306.10350v2)|null|\n", "2306.12423": "|**2023-06-21**|**Benchmarking and Analyzing 3D-aware Image Synthesis with a Modularized Codebase**|Qiuyu Wang et.al.|[2306.12423v1](http://arxiv.org/abs/2306.12423v1)|**[link](https://github.com/qiuyu96/carver)**|\n", "2306.12422": "|**2023-06-21**|**DreamTime: An Improved Optimization Strategy for Text-to-3D Content Creation**|Yukun Huang et.al.|[2306.12422v1](http://arxiv.org/abs/2306.12422v1)|null|\n", "2306.12760": "|**2023-06-22**|**Blended-NeRF: Zero-Shot Object Generation and Blending in Existing Neural Radiance Fields**|Ori Gordon et.al.|[2306.12760v1](http://arxiv.org/abs/2306.12760v1)|**[link](https://github.com/orig333/Blended-NeRF)**|\n", "2306.12570": "|**2023-06-21**|**Local 3D Editing via 3D Distillation of CLIP Knowledge**|Junha Hyung et.al.|[2306.12570v1](http://arxiv.org/abs/2306.12570v1)|null|\n", "2306.15203": "|**2023-06-27**|**Unsupervised Polychromatic Neural Representation for CT Metal Artifact Reduction**|Qing Wu et.al.|[2306.15203v1](http://arxiv.org/abs/2306.15203v1)|**[link](https://github.com/iwuqing/polyner)**|\n", "2306.16541": "|**2023-06-28**|**Envisioning a Next Generation Extended Reality Conferencing System with Efficient Photorealistic Human Rendering**|Chuanyue Shen et.al.|[2306.16541v1](http://arxiv.org/abs/2306.16541v1)|null|\n", "2306.17723": "|**2023-07-16**|**FlipNeRF: Flipped Reflection Rays for Few-shot Novel View Synthesis**|Seunghyeon Seo et.al.|[2306.17723v2](http://arxiv.org/abs/2306.17723v2)|**[link](https://github.com/shawn615/FlipNeRF)**|\n", "2306.17624": "|**2023-07-03**|**Sphere2Vec: A General-Purpose Location Representation Learning over a Spherical Surface for Large-Scale Geospatial Predictions**|Gengchen Mai et.al.|[2306.17624v2](http://arxiv.org/abs/2306.17624v2)|null|\n", "2307.03441": "|**2023-07-07**|**NOFA: NeRF-based One-shot Facial Avatar Reconstruction**|Wangbo Yu et.al.|[2307.03441v1](http://arxiv.org/abs/2307.03441v1)|null|\n", "2307.03404": "|**2023-07-07**|**RGB-D Mapping and Tracking in a Plenoxel Radiance Field**|Andreas L. Teigen et.al.|[2307.03404v1](http://arxiv.org/abs/2307.03404v1)|**[link](https://github.com/ysus33/rgb-d_plenoxel_mapping_tracking)**|\n", "2307.05087": "|**2023-07-11**|**SAR-NeRF: Neural Radiance Fields for Synthetic Aperture Radar Multi-View Representation**|Zhengxin Lei et.al.|[2307.05087v1](http://arxiv.org/abs/2307.05087v1)|null|\n", "2307.08093": "|**2023-07-16**|**Cross-Ray Neural Radiance Fields for Novel-view Synthesis from Unconstrained Image Collections**|Yifan Yang et.al.|[2307.08093v1](http://arxiv.org/abs/2307.08093v1)|**[link](https://github.com/yifyang993/cr-nerf-pytorch)**|\n", "2307.07729": "|**2023-07-15**|**Improving NeRF with Height Data for Utilization of GIS Data**|Hinata Aoki et.al.|[2307.07729v1](http://arxiv.org/abs/2307.07729v1)|null|\n", "2307.09323": "|**2023-07-18**|**Efficient Region-Aware Neural Radiance Fields for High-Fidelity Talking Portrait Synthesis**|Jiahe Li et.al.|[2307.09323v1](http://arxiv.org/abs/2307.09323v1)|**[link](https://github.com/fictionarry/er-nerf)**|\n", "2307.10135": "|**2023-07-19**|**An Improved NeuMIP with Better Accuracy**|Bowen Xue et.al.|[2307.10135v1](http://arxiv.org/abs/2307.10135v1)|null|\n", "2307.09860": "|**2023-07-19**|**Magic NeRF Lens: Interactive Fusion of Neural Radiance Fields for Virtual Facility Inspection**|Ke Li et.al.|[2307.09860v1](http://arxiv.org/abs/2307.09860v1)|**[link](https://github.com/uhhhci/immersive-ngp)**|\n", "2307.09555": "|**2023-07-14**|**Transient Neural Radiance Fields for Lidar View Synthesis and 3D Reconstruction**|Anagh Malik et.al.|[2307.09555v1](http://arxiv.org/abs/2307.09555v1)|null|\n", "2307.10776": "|**2023-07-20**|**Urban Radiance Field Representation with Deformable Neural Mesh Primitives**|Fan Lu et.al.|[2307.10776v1](http://arxiv.org/abs/2307.10776v1)|null|\n", "2307.10664": "|**2023-07-20**|**Lighting up NeRF via Unsupervised Decomposition and Enhancement**|Haoyuan Wang et.al.|[2307.10664v1](http://arxiv.org/abs/2307.10664v1)|**[link](https://github.com/onpix/LLNeRF)**|\n", "2307.11526": "|**2023-07-29**|**CopyRNeRF: Protecting the CopyRight of Neural Radiance Fields**|Ziyuan Luo et.al.|[2307.11526v2](http://arxiv.org/abs/2307.11526v2)|null|\n", "2307.11418": "|**2023-08-07**|**FaceCLIPNeRF: Text-driven 3D Face Manipulation using Deformable Neural Radiance Fields**|Sungwon Hwang et.al.|[2307.11418v2](http://arxiv.org/abs/2307.11418v2)|null|\n", "2307.11335": "|**2023-07-21**|**Tri-MipRF: Tri-Mip Representation for Efficient Anti-Aliasing Neural Radiance Fields**|Wenbo Hu et.al.|[2307.11335v1](http://arxiv.org/abs/2307.11335v1)|null|\n", "2307.12909": "|**2023-07-24**|**Dyn-E: Local Appearance Editing of Dynamic Neural Radiance Fields**|Shangzhan Zhang et.al.|[2307.12909v1](http://arxiv.org/abs/2307.12909v1)|null|\n", "2307.12718": "|**2023-07-24**|**CarPatch: A Synthetic Benchmark for Radiance Field Evaluation on Vehicle Components**|Davide Di Nucci et.al.|[2307.12718v1](http://arxiv.org/abs/2307.12718v1)|null|\n", "2307.12291": "|**2023-07-23**|**TransHuman: A Transformer-based Human Representation for Generalizable Neural Human Rendering**|Xiao Pan et.al.|[2307.12291v1](http://arxiv.org/abs/2307.12291v1)|null|\n", "2307.13908": "|**2023-07-26**|**Points-to-3D: Bridging the Gap between Sparse Points and Shape-Controllable Text-to-3D Generation**|Chaohui Yu et.al.|[2307.13908v1](http://arxiv.org/abs/2307.13908v1)|null|\n", "2307.15058": "|**2023-07-27**|**MARS: An Instance-aware, Modular and Realistic Simulator for Autonomous Driving**|Zirui Wu et.al.|[2307.15058v1](http://arxiv.org/abs/2307.15058v1)|**[link](https://github.com/open-air-sun/mars)**|\n", "2307.14620": "|**2023-07-27**|**NeRF-Det: Learning Geometry-Aware Volumetric Representation for Multi-View 3D Object Detection**|Chenfeng Xu et.al.|[2307.14620v1](http://arxiv.org/abs/2307.14620v1)|**[link](https://github.com/facebookresearch/nerf-det)**|\n", "2307.15333": "|**2023-07-28**|**Dynamic PlenOctree for Adaptive Sampling Refinement in Explicit NeRF**|Haotian Bai et.al.|[2307.15333v1](http://arxiv.org/abs/2307.15333v1)|null|\n", "2307.15131": "|**2023-07-27**|**Seal-3D: Interactive Pixel-Level Editing for Neural Radiance Fields**|Xiangyu Wang et.al.|[2307.15131v1](http://arxiv.org/abs/2307.15131v1)|**[link](https://github.com/windingwind/seal-3d)**|\n", "2308.00462": "|**2023-08-01**|**Context-Aware Talking-Head Video Editing**|Songlin Yang et.al.|[2308.00462v1](http://arxiv.org/abs/2308.00462v1)|null|\n", "2308.01262": "|**2023-08-02**|**Incorporating Season and Solar Specificity into Renderings made by a NeRF Architecture using Satellite Images**|Michael Gableman et.al.|[2308.01262v1](http://arxiv.org/abs/2308.01262v1)|**[link](https://github.com/enterprisecv-6/season-nerf)**|\n", "2308.00773": "|**2023-08-01**|**High-Fidelity Eye Animatable Neural Radiance Fields for Human Face**|Hengfei Wang et.al.|[2308.00773v1](http://arxiv.org/abs/2308.00773v1)|null|\n", "2308.02191": "|**2023-08-04**|**ES-MVSNet: Efficient Framework for End-to-end Self-supervised Multi-View Stereo**|Qiang Zhou et.al.|[2308.02191v1](http://arxiv.org/abs/2308.02191v1)|null|\n", "2308.03280": "|**2023-08-07**|**Mirror-NeRF: Learning Neural Radiance Fields for Mirrors with Whitted-Style Ray Tracing**|Junyi Zeng et.al.|[2308.03280v1](http://arxiv.org/abs/2308.03280v1)|null|\n", "2308.02908": "|**2023-08-05**|**Where and How: Mitigating Confusion in Neural Radiance Fields from Sparse Inputs**|Yanqi Bao et.al.|[2308.02908v1](http://arxiv.org/abs/2308.02908v1)|**[link](https://github.com/bbbbby-99/wah-nerf)**|\n", "2308.02840": "|**2023-08-05**|**Learning Unified Decompositional and Compositional NeRF for Editable Novel View Synthesis**|Yuxin Wang et.al.|[2308.02840v1](http://arxiv.org/abs/2308.02840v1)|null|\n", "2308.02751": "|**2023-08-05**|**NeRFs: The Search for the Best 3D Representation**|Ravi Ramamoorthi et.al.|[2308.02751v1](http://arxiv.org/abs/2308.02751v1)|null|\n", "2308.04413": "|**2023-08-08**|**Digging into Depth Priors for Outdoor Neural Radiance Fields**|Chen Wang et.al.|[2308.04413v1](http://arxiv.org/abs/2308.04413v1)|null|\n", "2308.03772": "|**2023-07-27**|**Improved Neural Radiance Fields Using Pseudo-depth and Fusion**|Jingliang Li et.al.|[2308.03772v1](http://arxiv.org/abs/2308.03772v1)|null|\n", "2308.04826": "|**2023-08-09**|**WaveNeRF: Wavelet-based Generalizable Neural Radiance Fields**|Muyu Xu et.al.|[2308.04826v1](http://arxiv.org/abs/2308.04826v1)|null|\n", "2308.04669": "|**2023-08-14**|**A General Implicit Framework for Fast NeRF Composition and Rendering**|Xinyu Gao et.al.|[2308.04669v2](http://arxiv.org/abs/2308.04669v2)|null|\n", "2308.05970": "|**2023-08-11**|**Focused Specific Objects NeRF**|Yuesong Li et.al.|[2308.05970v1](http://arxiv.org/abs/2308.05970v1)|null|\n", "2308.05939": "|**2023-08-11**|**VERF: Runtime Monitoring of Pose Estimation with Neural Radiance Fields**|Dominic Maggio et.al.|[2308.05939v1](http://arxiv.org/abs/2308.05939v1)|null|\n", "2308.07118": "|**2023-08-16**|**Neural radiance fields in the industrial and robotics domain: applications, research opportunities and use cases**|Eugen \u0160lapak et.al.|[2308.07118v2](http://arxiv.org/abs/2308.07118v2)|**[link](https://github.com/maftej/iisnerf)**|\n", "2308.07032": "|**2023-08-14**|**S3IM: Stochastic Structural SIMilarity and Its Unreasonable Effectiveness for Neural Fields**|Zeke Xie et.al.|[2308.07032v1](http://arxiv.org/abs/2308.07032v1)|**[link](https://github.com/madaoer/s3im_nerf)**|\n", "2308.08530": "|**2023-08-21**|**Ref-DVGO: Reflection-Aware Direct Voxel Grid Optimization for an Improved Quality-Efficiency Trade-Off in Reflective Scene Reconstruction**|Georgios Kouros et.al.|[2308.08530v3](http://arxiv.org/abs/2308.08530v3)|**[link](https://github.com/gkouros/ref-dvgo)**|\n", "2308.08258": "|**2023-08-16**|**SceNeRFlow: Time-Consistent Reconstruction of General Dynamic Scenes**|Edith Tretschk et.al.|[2308.08258v1](http://arxiv.org/abs/2308.08258v1)|null|\n", "2308.09421": "|**2023-08-18**|**MonoNeRD: NeRF-like Representations for Monocular 3D Object Detection**|Junkai Xu et.al.|[2308.09421v1](http://arxiv.org/abs/2308.09421v1)|**[link](https://github.com/cskkxjk/mononerd)**|\n", "2308.09386": "|**2023-08-18**|**DReg-NeRF: Deep Registration for Neural Radiance Fields**|Yu Chen et.al.|[2308.09386v1](http://arxiv.org/abs/2308.09386v1)|**[link](https://github.com/aibluefisher/dreg-nerf)**|\n", "2308.08947": "|**2023-08-17**|**Watch Your Steps: Local Image and Scene Editing by Text Instructions**|Ashkan Mirzaei et.al.|[2308.08947v1](http://arxiv.org/abs/2308.08947v1)|null|\n", "2308.10902": "|**2023-08-30**|**CamP: Camera Preconditioning for Neural Radiance Fields**|Keunhong Park et.al.|[2308.10902v2](http://arxiv.org/abs/2308.10902v2)|null|\n", "2308.10337": "|**2023-08-20**|**Strata-NeRF : Neural Radiance Fields for Stratified Scenes**|Ankit Dhiman et.al.|[2308.10337v1](http://arxiv.org/abs/2308.10337v1)|null|\n", "2308.10122": "|**2023-08-19**|**HollowNeRF: Pruning Hashgrid-Based NeRFs with Trainable Collision Mitigation**|Xiufeng Xie et.al.|[2308.10122v1](http://arxiv.org/abs/2308.10122v1)|null|\n", "2308.10001": "|**2023-08-19**|**AltNeRF: Learning Robust Neural Radiance Field via Alternating Depth-Pose Optimization**|Kun Wang et.al.|[2308.10001v1](http://arxiv.org/abs/2308.10001v1)|null|\n", "2308.09894": "|**2023-08-19**|**Semantic-Human: Neural Rendering of Humans from Monocular Video with Human Parsing**|Jie Zhang et.al.|[2308.09894v1](http://arxiv.org/abs/2308.09894v1)|null|\n", "2308.11198": "|**2023-08-22**|**Novel-view Synthesis and Pose Estimation for Hand-Object Interaction from Sparse Views**|Wentian Qu et.al.|[2308.11198v1](http://arxiv.org/abs/2308.11198v1)|null|\n", "2308.11130": "|**2023-08-22**|**Efficient View Synthesis with Neural Radiance Distribution Field**|Yushuang Wu et.al.|[2308.11130v1](http://arxiv.org/abs/2308.11130v1)|null|\n", "2308.11974": "|**2023-08-23**|**Blending-NeRF: Text-Driven Localized Editing in Neural Radiance Fields**|Hyeonseop Song et.al.|[2308.11974v1](http://arxiv.org/abs/2308.11974v1)|null|\n", "2308.11951": "|**2023-08-25**|**Pose Modulated Avatars from Video**|Chunjin Song et.al.|[2308.11951v2](http://arxiv.org/abs/2308.11951v2)|null|\n", "2308.11793": "|**2023-08-22**|**Enhancing NeRF akin to Enhancing LLMs: Generalizable NeRF Transformer with Mixture-of-View-Experts**|Wenyan Cong et.al.|[2308.11793v1](http://arxiv.org/abs/2308.11793v1)|**[link](https://github.com/vita-group/gnt-move)**|\n", "2308.11774": "|**2023-08-22**|**SAMSNeRF: Segment Anything Model (SAM) Guides Dynamic Surgical Scene Reconstruction by Neural Radiance Field (NeRF)**|Ange Lou et.al.|[2308.11774v1](http://arxiv.org/abs/2308.11774v1)|null|\n", "2308.12560": "|**2023-08-24**|**NOVA: NOvel View Augmentation for Neural Composition of Dynamic Objects**|Dakshit Agrawal et.al.|[2308.12560v1](http://arxiv.org/abs/2308.12560v1)|**[link](https://github.com/dakshitagrawal/nova)**|\n", "2308.13897": "|**2023-08-26**|**InsertNeRF: Instilling Generalizability into NeRF with HyperNet Modules**|Yanqi Bao et.al.|[2308.13897v1](http://arxiv.org/abs/2308.13897v1)|**[link](https://github.com/bbbbby-99/insertnerf)**|\n", "2308.15049": "|**2023-08-29**|**Pose-Free Neural Radiance Fields via Implicit Pose Regularization**|Jiahui Zhang et.al.|[2308.15049v1](http://arxiv.org/abs/2308.15049v1)|null|\n", "2308.14816": "|**2023-08-28**|**CLNeRF: Continual Learning Meets NeRF**|Zhipeng Cai et.al.|[2308.14816v1](http://arxiv.org/abs/2308.14816v1)|**[link](https://github.com/intellabs/clnerf)**|\n", "2308.16041": "|**2023-08-30**|**From Pixels to Portraits: A Comprehensive Survey of Talking Head Generation Techniques and Applications**|Shreyank N Gowda et.al.|[2308.16041v1](http://arxiv.org/abs/2308.16041v1)|null|\n", "2308.15733": "|**2023-08-30**|**Drone-NeRF: Efficient NeRF Based 3D Scene Reconstruction for Large-Scale Drone Survey**|Zhihao Jia et.al.|[2308.15733v1](http://arxiv.org/abs/2308.15733v1)|null|\n", "2308.15547": "|**2023-08-29**|**Efficient Ray Sampling for Radiance Fields Reconstruction**|Shilei Sun et.al.|[2308.15547v1](http://arxiv.org/abs/2308.15547v1)|null|\n", "2308.16576": "|**2023-09-03**|**GHuNeRF: Generalizable Human NeRF from a Monocular Video**|Chen Li et.al.|[2308.16576v2](http://arxiv.org/abs/2308.16576v2)|null|\n", "2309.00277": "|**2023-09-01**|**SparseSat-NeRF: Dense Depth Supervised Neural Radiance Fields for Sparse Satellite Images**|Lulin Zhang et.al.|[2309.00277v1](http://arxiv.org/abs/2309.00277v1)|**[link](https://github.com/lulinzhang/sps-nerf)**|\n", "2309.00014": "|**2023-09-04**|**Improving NeRF Quality by Progressive Camera Placement for Unrestricted Navigation in Complex Environments**|Georgios Kopanas et.al.|[2309.00014v2](http://arxiv.org/abs/2309.00014v2)|null|\n", "2309.01811": "|**2023-09-06**|**Instant Continual Learning of Neural Radiance Fields**|Ryan Po et.al.|[2309.01811v2](http://arxiv.org/abs/2309.01811v2)|null|\n", "2309.01351": "|**2023-09-04**|**Adv3D: Generating 3D Adversarial Examples in Driving Scenarios with NeRF**|Leheng Li et.al.|[2309.01351v1](http://arxiv.org/abs/2309.01351v1)|null|\n", "2309.03185": "|**2023-09-06**|**Bayes' Rays: Uncertainty Quantification for Neural Radiance Fields**|Lily Goli et.al.|[2309.03185v1](http://arxiv.org/abs/2309.03185v1)|**[link](https://github.com/BayesRays/BayesRays)**|\n", "2309.03160": "|**2023-09-06**|**ResFields: Residual Neural Fields for Spatiotemporal Signals**|Marko Mihajlovic et.al.|[2309.03160v1](http://arxiv.org/abs/2309.03160v1)|**[link](https://github.com/markomih/ResFields)**|\n", "2309.03550": "|**2023-09-07**|**Text2Control3D: Controllable 3D Avatar Generation in Neural Radiance Fields using Geometry-Guided Text-to-Image Diffusion Model**|Sungwon Hwang et.al.|[2309.03550v1](http://arxiv.org/abs/2309.03550v1)|null|\n", "2309.04410": "|**2023-09-08**|**DeformToon3D: Deformable 3D Toonification from Neural Radiance Fields**|Junzhe Zhang et.al.|[2309.04410v1](http://arxiv.org/abs/2309.04410v1)|**[link](https://github.com/junzhezhang/deformtoon3d)**|\n", "2309.03955": "|**2023-09-14**|**SimpleNeRF: Regularizing Sparse Input Neural Radiance Fields with Simpler Solutions**|Nagabhushan Somraj et.al.|[2309.03955v2](http://arxiv.org/abs/2309.03955v2)|null|\n", "2309.03933": "|**2023-09-07**|**BluNF: Blueprint Neural Field**|Robin Courant et.al.|[2309.03933v1](http://arxiv.org/abs/2309.03933v1)|null|\n", "2309.05339": "|**2023-09-11**|**PAg-NeRF: Towards fast and efficient end-to-end panoptic 3D representations for agricultural robotics**|Claus Smitt et.al.|[2309.05339v1](http://arxiv.org/abs/2309.05339v1)|null|\n", "2309.04917": "|**2023-09-10**|**Text-driven Editing of 3D Scenes without Retraining**|Shuangkang Fang et.al.|[2309.04917v1](http://arxiv.org/abs/2309.04917v1)|**[link](https://github.com/Fangkang515/DN2N)**|\n", "2309.04750": "|**2023-09-09**|**Mirror-Aware Neural Humans**|Daniel Ajisafe et.al.|[2309.04750v1](http://arxiv.org/abs/2309.04750v1)|null|\n", "2309.04581": "|**2023-09-08**|**Dynamic Mesh-Aware Radiance Fields**|Yi-Ling Qiao et.al.|[2309.04581v1](http://arxiv.org/abs/2309.04581v1)|null|\n", "2309.06030": "|**2023-09-12**|**Federated Learning for Large-Scale Scene Modeling with Neural Radiance Fields**|Teppei Suzuki et.al.|[2309.06030v1](http://arxiv.org/abs/2309.06030v1)|null|\n", "2309.07125": "|**2023-09-13**|**Text-Guided Generation and Editing of Compositional 3D Avatars**|Hao Zhang et.al.|[2309.07125v1](http://arxiv.org/abs/2309.07125v1)|null|\n", "2309.06802": "|**2023-09-13**|**Dynamic NeRFs for Soccer Scenes**|Sacha Lewin et.al.|[2309.06802v1](http://arxiv.org/abs/2309.06802v1)|null|\n", "2309.07846": "|**2023-09-14**|**MC-NeRF: Muti-Camera Neural Radiance Fields for Muti-Camera Image Acquisition Systems**|Yu Gao et.al.|[2309.07846v1](http://arxiv.org/abs/2309.07846v1)|null|\n", "2309.07752": "|**2023-09-14**|**DT-NeRF: Decomposed Triplane-Hash Neural Radiance Fields for High-Fidelity Talking Portrait Synthesis**|Yaoyu Su et.al.|[2309.07752v1](http://arxiv.org/abs/2309.07752v1)|null|\n", "2309.07668": "|**2023-09-14**|**CoRF : Colorizing Radiance Fields using Knowledge Distillation**|Ankit Dhiman et.al.|[2309.07668v1](http://arxiv.org/abs/2309.07668v1)|null|\n", "2309.08596": "|**2023-09-15**|**Robust e-NeRF: NeRF from Sparse & Noisy Events under Non-Uniform Motion**|Weng Fei Low et.al.|[2309.08596v1](http://arxiv.org/abs/2309.08596v1)|**[link](https://github.com/wengflow/robust-e-nerf)**|\n", "2309.08040": "|**2023-09-14**|**Gradient based Grasp Pose Optimization on a NeRF that Approximates Grasp Success**|Gergely S\u00f3ti et.al.|[2309.08040v1](http://arxiv.org/abs/2309.08040v1)|null|\n", "2309.09502": "|**2023-09-18**|**RenderOcc: Vision-Centric 3D Occupancy Prediction with 2D Rendering Supervision**|Mingjie Pan et.al.|[2309.09502v1](http://arxiv.org/abs/2309.09502v1)|**[link](https://github.com/pmj110119/renderocc)**|\n", "2309.09295": "|**2023-09-17**|**NeRF-VINS: A Real-time Neural Radiance Field Map-based Visual-Inertial Navigation System**|Saimouli Katragadda et.al.|[2309.09295v1](http://arxiv.org/abs/2309.09295v1)|null|\n", "2309.08927": "|**2023-09-16**|**DynaMoN: Motion-Aware Fast And Robust Camera Localization for Dynamic NeRF**|Mert Asim Karaoglu et.al.|[2309.08927v1](http://arxiv.org/abs/2309.08927v1)|null|\n", "2309.10684": "|**2023-09-19**|**Locally Stylized Neural Radiance Fields**|Hong-Wing Pang et.al.|[2309.10684v1](http://arxiv.org/abs/2309.10684v1)|null|\n", "2309.10503": "|**2023-09-19**|**Steganography for Neural Radiance Fields by Backdooring**|Weina Dong et.al.|[2309.10503v1](http://arxiv.org/abs/2309.10503v1)|null|\n", "2309.10011": "|**2023-09-18**|**Instant Photorealistic Style Transfer: A Lightweight and Adaptive Approach**|Rong Liu et.al.|[2309.10011v1](http://arxiv.org/abs/2309.10011v1)|null|\n", "2309.11009": "|**2023-09-21**|**Controllable Dynamic Appearance for Neural 3D Portraits**|ShahRukh Athar et.al.|[2309.11009v2](http://arxiv.org/abs/2309.11009v2)|null|\n", "2309.10987": "|**2023-09-20**|**Spiking NeRF: Making Bio-inspired Neural Networks See through the Real World**|Xingting Yao et.al.|[2309.10987v1](http://arxiv.org/abs/2309.10987v1)|null|\n", "2309.12183": "|**2023-09-21**|**ORTexME: Occlusion-Robust Human Shape and Pose via Temporal Average Texture and Mesh Encoding**|Yu Cheng et.al.|[2309.12183v1](http://arxiv.org/abs/2309.12183v1)|null|\n", "2309.11966": "|**2023-09-21**|**NeuralLabeling: A versatile toolset for labeling vision datasets using Neural Radiance Fields**|Floris Erich et.al.|[2309.11966v1](http://arxiv.org/abs/2309.11966v1)|**[link](https://github.com/FlorisE/neural-labeling)**|\n", "2309.11767": "|**2023-09-21**|**Fast Satellite Tensorial Radiance Field for Multi-date Satellite Imagery of Large Size**|Tongtong Zhang et.al.|[2309.11767v1](http://arxiv.org/abs/2309.11767v1)|null|\n", "2309.11747": "|**2023-09-21**|**MarkNerf:Watermarking for Neural Radiance Field**|Lifeng Chen et.al.|[2309.11747v1](http://arxiv.org/abs/2309.11747v1)|null|\n", "2309.11698": "|**2023-09-21**|**Rendering stable features improves sampling-based localisation with Neural radiance fields**|Boxuan Zhang et.al.|[2309.11698v1](http://arxiv.org/abs/2309.11698v1)|null|\n", "2309.11627": "|**2023-09-20**|**GenLayNeRF: Generalizable Layered Representations with 3D Model Alignment for Multi-Human View Synthesis**|Youssef Abdelkareem et.al.|[2309.11627v1](http://arxiv.org/abs/2309.11627v1)|null|\n", "2309.11525": "|**2023-09-23**|**Light Field Diffusion for Single-View Novel View Synthesis**|Yifeng Xiong et.al.|[2309.11525v2](http://arxiv.org/abs/2309.11525v2)|null|\n", "2309.13039": "|**2023-09-22**|**NeRRF: 3D Reconstruction and View Synthesis for Transparent and Specular Objects with Neural Refractive-Reflective Fields**|Xiaoxue Chen et.al.|[2309.13039v1](http://arxiv.org/abs/2309.13039v1)|**[link](https://github.com/dawning77/nerrf)**|\n", "2309.14293": "|**2023-09-25**|**NAS-NeRF: Generative Neural Architecture Search for Neural Radiance Fields**|Saeejith Nair et.al.|[2309.14293v1](http://arxiv.org/abs/2309.14293v1)|null|\n", "2309.14010": "|**2023-09-25**|**Variational Inference for Scalable 3D Object-centric Learning**|Tianyu Wang et.al.|[2309.14010v1](http://arxiv.org/abs/2309.14010v1)|null|\n", "2309.13607": "|**2023-09-24**|**MM-NeRF: Multimodal-Guided 3D Multi-Style Transfer of Neural Radiance Field**|Zijiang Yang et.al.|[2309.13607v1](http://arxiv.org/abs/2309.13607v1)|null|\n", "2309.13240": "|**2023-09-23**|**NeRF-Enhanced Outpainting for Faithful Field-of-View Extrapolation**|Rui Yu et.al.|[2309.13240v1](http://arxiv.org/abs/2309.13240v1)|null|\n", "2309.14800": "|**2023-09-26**|**3D Density-Gradient based Edge Detection on Neural Radiance Fields (NeRFs) for Geometric Reconstruction**|Miriam J\u00e4ger et.al.|[2309.14800v1](http://arxiv.org/abs/2309.14800v1)|null|\n", "2309.15526": "|**2023-09-27**|**P2I-NET: Mapping Camera Pose to Image via Adversarial Learning for New View Synthesis in Real Indoor Environments**|Xujie Kang et.al.|[2309.15526v1](http://arxiv.org/abs/2309.15526v1)|null|\n", "2309.15329": "|**2023-09-27**|**BASED: Bundle-Adjusting Surgical Endoscopic Dynamic Video Reconstruction using Neural Radiance Fields**|Shreya Saha et.al.|[2309.15329v1](http://arxiv.org/abs/2309.15329v1)|null|\n", "2309.16553": "|**2023-09-28**|**MatrixCity: A Large-scale City Dataset for City-scale Neural Rendering and Beyond**|Yixuan Li et.al.|[2309.16553v1](http://arxiv.org/abs/2309.16553v1)|null|\n", "2309.16364": "|**2023-10-04**|**FG-NeRF: Flow-GAN based Probabilistic Neural Radiance Field for Independence-Assumption-Free Uncertainty Estimation**|Songlin Wei et.al.|[2309.16364v2](http://arxiv.org/abs/2309.16364v2)|null|\n", "2309.16110": "|**2023-09-28**|**Learning Effective NeRFs and SDFs Representations with 3D Generative Adversarial Networks for 3D Object Generation: Technical Report for ICCV 2023 OmniObject3D Challenge**|Zheyuan Yang et.al.|[2309.16110v1](http://arxiv.org/abs/2309.16110v1)|null|\n", "2309.17450": "|**2023-09-29**|**Multi-task View Synthesis with Neural Radiance Fields**|Shuhong Zheng et.al.|[2309.17450v1](http://arxiv.org/abs/2309.17450v1)|**[link](https://github.com/zsh2000/muvienerf)**|\n", "2309.17390": "|**2023-09-29**|**Forward Flow for Novel View Synthesis of Dynamic Scenes**|Xiang Guo et.al.|[2309.17390v1](http://arxiv.org/abs/2309.17390v1)|null|\n", "2309.17128": "|**2023-09-29**|**HAvatar: High-fidelity Head Avatar via Facial Model Conditioned Neural Radiance Field**|Xiaochen Zhao et.al.|[2309.17128v1](http://arxiv.org/abs/2309.17128v1)|null|\n", "2309.16859": "|**2023-09-28**|**Preface: A Data-driven Volumetric Prior for Few-shot Ultra High-resolution Face Synthesis**|Marcel C. B\u00fchler et.al.|[2309.16859v1](http://arxiv.org/abs/2309.16859v1)|null|\n", "2310.01881": "|**2023-10-03**|**Adaptive Multi-NeRF: Exploit Efficient Parallelism in Adaptive Multiple Scale Neural Radiance Field Rendering**|Tong Wang et.al.|[2310.01881v1](http://arxiv.org/abs/2310.01881v1)|null|\n", "2310.01821": "|**2023-10-03**|**MIMO-NeRF: Fast Neural Rendering with Multi-input Multi-output Neural Radiance Fields**|Takuhiro Kaneko et.al.|[2310.01821v1](http://arxiv.org/abs/2310.01821v1)|null|\n", "2310.00874": "|**2023-10-02**|**PC-NeRF: Parent-Child Neural Radiance Fields under Partial Sensor Data Loss in Autonomous Driving Environments**|Xiuzhong Hu et.al.|[2310.00874v1](http://arxiv.org/abs/2310.00874v1)|**[link](https://github.com/biter0088/pc-nerf)**|\n", "2310.00684": "|**2023-10-01**|**How Many Views Are Needed to Reconstruct an Unknown Object Using NeRF?**|Sicong Pan et.al.|[2310.00684v1](http://arxiv.org/abs/2310.00684v1)|**[link](https://github.com/psc0628/nerf-prv)**|\n", "2310.00530": "|**2023-10-01**|**Enabling Neural Radiance Fields (NeRF) for Large-scale Aerial Images -- A Multi-tiling Approaching and the Geometry Assessment of NeRF**|Ningli Xu et.al.|[2310.00530v1](http://arxiv.org/abs/2310.00530v1)|null|\n", "2310.00249": "|**2023-09-30**|**MMPI: a Flexible Radiance Field Representation by Multiple Multi-plane Images Blending**|Yuze He et.al.|[2310.00249v1](http://arxiv.org/abs/2310.00249v1)|null|\n", "2310.02977": "|**2023-10-04**|**T$^3$Bench: Benchmarking Current Progress in Text-to-3D Generation**|Yuze He et.al.|[2310.02977v1](http://arxiv.org/abs/2310.02977v1)|**[link](https://github.com/THU-LYJ-Lab/T3Bench)**|\n", "2310.02712": "|**2023-10-04**|**ED-NeRF: Efficient Text-Guided Editing of 3D Scene using Latent Space NeRF**|Jangho Park et.al.|[2310.02712v1](http://arxiv.org/abs/2310.02712v1)|null|\n", "2310.02687": "|**2023-10-05**|**USB-NeRF: Unrolling Shutter Bundle Adjusted Neural Radiance Fields**|Moyang Li et.al.|[2310.02687v2](http://arxiv.org/abs/2310.02687v2)|null|\n", "2310.02437": "|**2023-10-03**|**EvDNeRF: Reconstructing Event Data with Dynamic Neural Radiance Fields**|Anish Bhattacharya et.al.|[2310.02437v1](http://arxiv.org/abs/2310.02437v1)|**[link](https://github.com/anish-bhattacharya/evdnerf)**|\n", "2310.03704": "|**2023-10-05**|**Drag View: Generalizable Novel View Synthesis with Unposed Imagery**|Zhiwen Fan et.al.|[2310.03704v1](http://arxiv.org/abs/2310.03704v1)|**[link](https://github.com/zhiwenfan/DragView)**|\n", "2310.03578": "|**2023-10-05**|**Targeted Adversarial Attacks on Generalizable Neural Radiance Fields**|Andras Horvath et.al.|[2310.03578v1](http://arxiv.org/abs/2310.03578v1)|null|\n", "2310.03563": "|**2023-10-05**|**BID-NeRF: RGB-D image pose estimation with inverted Neural Radiance Fields**|\u00c1goston Istv\u00e1n Csehi et.al.|[2310.03563v1](http://arxiv.org/abs/2310.03563v1)|null|\n", "2310.03125": "|**2023-10-04**|**Shielding the Unseen: Privacy Protection through Poisoning NeRF with Spatial Deformation**|Yihan Wu et.al.|[2310.03125v1](http://arxiv.org/abs/2310.03125v1)|null|\n", "2310.04152": "|**2023-10-06**|**Improving Neural Radiance Field using Near-Surface Sampling with Point Cloud Generation**|Hye Bin Yoo et.al.|[2310.04152v1](http://arxiv.org/abs/2310.04152v1)|null|\n", "2310.05837": "|**2023-10-09**|**A Real-time Method for Inserting Virtual Objects into Neural Radiance Fields**|Keyang Ye et.al.|[2310.05837v1](http://arxiv.org/abs/2310.05837v1)|null|\n", "2310.05391": "|**2023-10-09**|**Neural Impostor: Editing Neural Radiance Fields with Explicit Shape Manipulation**|Ruiyang Liu et.al.|[2310.05391v1](http://arxiv.org/abs/2310.05391v1)|null|\n", "2310.05134": "|**2023-10-08**|**LocoNeRF: A NeRF-based Approach for Local Structure from Motion for Precise Localization**|Artem Nenashev et.al.|[2310.05134v1](http://arxiv.org/abs/2310.05134v1)|null|\n", "2310.05133": "|**2023-10-08**|**Geometry Aware Field-to-field Transformations for 3D Semantic Segmentation**|Dominik Hollidt et.al.|[2310.05133v1](http://arxiv.org/abs/2310.05133v1)|null|\n", "2310.06275": "|**2023-10-10**|**High-Fidelity 3D Head Avatars Reconstruction through Spatially-Varying Expression Conditioned Neural Radiance Field**|Minghan Qin et.al.|[2310.06275v1](http://arxiv.org/abs/2310.06275v1)|null|\n", "2310.07449": "|**2023-10-12**|**PoRF: Pose Residual Field for Accurate Neural Surface Reconstruction**|Jia-Wang Bian et.al.|[2310.07449v2](http://arxiv.org/abs/2310.07449v2)|null|\n", "2310.07179": "|**2023-10-11**|**rpcPRF: Generalizable MPI Neural Radiance Field for Satellite Camera**|Tongtong Zhang et.al.|[2310.07179v1](http://arxiv.org/abs/2310.07179v1)|null|\n", "2310.06984": "|**2023-10-10**|**Leveraging Neural Radiance Fields for Uncertainty-Aware Visual Localization**|Le Chen et.al.|[2310.06984v1](http://arxiv.org/abs/2310.06984v1)|null|\n", "2310.07916": "|**2023-10-11**|**Dynamic Appearance Particle Neural Radiance Field**|Ancheng Lin et.al.|[2310.07916v1](http://arxiv.org/abs/2310.07916v1)|null|\n", "2310.10650": "|**2023-10-16**|**TraM-NeRF: Tracing Mirror and Near-Perfect Specular Reflections through Neural Radiance Fields**|Leif Van Holland et.al.|[2310.10650v1](http://arxiv.org/abs/2310.10650v1)|**[link](https://github.com/Rubikalubi/TraM-NeRF)**|\n", "2310.10624": "|**2023-10-16**|**DynVideo-E: Harnessing Dynamic NeRF for Large-Scale Motion- and View-Change Human-Centric Video Editing**|Jia-Wei Liu et.al.|[2310.10624v1](http://arxiv.org/abs/2310.10624v1)|null|\n", "2310.10209": "|**2023-10-16**|**Self-supervised Fetal MRI 3D Reconstruction Based on Radiation Diffusion Generation Model**|Junpeng Tan et.al.|[2310.10209v1](http://arxiv.org/abs/2310.10209v1)|null|\n", "2310.09965": "|**2023-10-15**|**ProteusNeRF: Fast Lightweight NeRF Editing using 3D-Aware Image Context**|Binglun Wang et.al.|[2310.09965v1](http://arxiv.org/abs/2310.09965v1)|null|\n", "2310.09892": "|**2023-10-15**|**Active Perception using Neural Radiance Fields**|Siming He et.al.|[2310.09892v1](http://arxiv.org/abs/2310.09892v1)|**[link](https://github.com/grasp-lyrl/active-perception-using-neural-radiance-fields)**|\n", "2310.09776": "|**2023-10-15**|**CBARF: Cascaded Bundle-Adjusting Neural Radiance Fields from Imperfect Camera Poses**|Hongyu Fu et.al.|[2310.09776v1](http://arxiv.org/abs/2310.09776v1)|null|\n", "2310.11864": "|**2023-10-18**|**VQ-NeRF: Neural Reflectance Decomposition and Editing with Vector Quantization**|Hongliang Zhong et.al.|[2310.11864v1](http://arxiv.org/abs/2310.11864v1)|null|\n", "2310.11645": "|**2023-10-18**|**Towards Abdominal 3-D Scene Rendering from Laparoscopy Surgical Videos using NeRFs**|Khoa Tuan Nguyen et.al.|[2310.11645v1](http://arxiv.org/abs/2310.11645v1)|null|\n", "2310.13670": "|**2023-10-20**|**ManifoldNeRF: View-dependent Image Feature Supervision for Few-shot Neural Radiance Fields**|Daiju Kanaoka et.al.|[2310.13670v1](http://arxiv.org/abs/2310.13670v1)|null|\n", "2310.13356": "|**2023-10-20**|**Sync-NeRF: Generalizing Dynamic NeRFs to Unsynchronized Videos**|Seoha Kim et.al.|[2310.13356v1](http://arxiv.org/abs/2310.13356v1)|**[link](https://github.com/seoha-kim/Sync-NeRF)**|\n", "2310.13263": "|**2023-10-20**|**UE4-NeRF:Neural Radiance Field for Real-Time Rendering of Large-Scale Scene**|Jiaming Gu et.al.|[2310.13263v1](http://arxiv.org/abs/2310.13263v1)|null|\n", "2310.14695": "|**2023-10-23**|**CAwa-NeRF: Instant Learning of Compression-Aware NeRF Features**|Omnia Mahmoud et.al.|[2310.14695v1](http://arxiv.org/abs/2310.14695v1)|null|\n", "2310.14487": "|**2023-10-23**|**VQ-NeRF: Vector Quantization Enhances Implicit Neural Representations**|Yiying Yang et.al.|[2310.14487v1](http://arxiv.org/abs/2310.14487v1)|null|\n", "2310.15504": "|**2023-10-24**|**Cross-view Self-localization from Synthesized Scene-graphs**|Ryogo Yamamoto et.al.|[2310.15504v1](http://arxiv.org/abs/2310.15504v1)|null|\n", "2310.16832": "|**2023-10-26**|**LightSpeed: Light and Fast Neural Light Fields on Mobile Devices**|Aarush Gupta et.al.|[2310.16832v2](http://arxiv.org/abs/2310.16832v2)|**[link](https://github.com/lightspeed-r2l/lightspeed)**|\n", "2310.16831": "|**2023-10-28**|**PERF: Panoramic Neural Radiance Field from a Single Panorama**|Guangcong Wang et.al.|[2310.16831v2](http://arxiv.org/abs/2310.16831v2)|**[link](https://github.com/perf-project/PeRF)**|\n", "2310.16383": "|**2023-10-25**|**Open-NeRF: Towards Open Vocabulary NeRF Decomposition**|Hao Zhang et.al.|[2310.16383v1](http://arxiv.org/abs/2310.16383v1)|null|\n", "2310.16255": "|**2023-10-25**|**UAV-Sim: NeRF-based Synthetic Data Generation for UAV-based Perception**|Christopher Maxey et.al.|[2310.16255v1](http://arxiv.org/abs/2310.16255v1)|null|\n", "2310.17075": "|**2023-10-27**|**HyperFields: Towards Zero-Shot Generation of NeRFs from Text**|Sudarshan Babu et.al.|[2310.17075v2](http://arxiv.org/abs/2310.17075v2)|null|\n", "2310.16858": "|**2023-10-25**|**4D-Editor: Interactive Object-level Editing in Dynamic Neural Radiance Fields via 4D Semantic Segmentation**|Dadong Jiang et.al.|[2310.16858v1](http://arxiv.org/abs/2310.16858v1)|null|\n", "2310.17994": "|**2023-10-27**|**ZeroNVS: Zero-Shot 360-Degree View Synthesis from a Single Real Image**|Kyle Sargent et.al.|[2310.17994v1](http://arxiv.org/abs/2310.17994v1)|null|\n", "2310.17880": "|**2023-10-27**|**Reconstructive Latent-Space Neural Radiance Fields for Efficient 3D Scene Representations**|Tristan Aumentado-Armstrong et.al.|[2310.17880v1](http://arxiv.org/abs/2310.17880v1)|null|\n", "2310.18917": "|**2023-11-04**|**TiV-NeRF: Tracking and Mapping via Time-Varying Representation with Dynamic Neural Radiance Fields**|Chengyao Duan et.al.|[2310.18917v2](http://arxiv.org/abs/2310.18917v2)|null|\n", "2310.18846": "|**2023-10-28**|**INCODE: Implicit Neural Conditioning with Prior Knowledge Embeddings**|Amirhossein Kazerouni et.al.|[2310.18846v1](http://arxiv.org/abs/2310.18846v1)|**[link](https://github.com/xmindflow/INCODE)**|\n", "2310.20710": "|**2023-10-31**|**FPO++: Efficient Encoding and Rendering of Dynamic Neural Radiance Fields by Analyzing and Enhancing Fourier PlenOctrees**|Saskia Rabich et.al.|[2310.20710v1](http://arxiv.org/abs/2310.20710v1)|null|\n", "2310.20685": "|**2023-10-31**|**NeRF Revisited: Fixing Quadrature Instability in Volume Rendering**|Mikaela Angelina Uy et.al.|[2310.20685v1](http://arxiv.org/abs/2310.20685v1)|null|\n", "2310.19464": "|**2023-10-30**|**Generative Neural Fields by Mixtures of Neural Implicit Functions**|Tackgeun You et.al.|[2310.19464v1](http://arxiv.org/abs/2310.19464v1)|null|\n", "2311.01065": "|**2023-11-02**|**Novel View Synthesis from a Single RGBD Image for Indoor Scenes**|Congrui Hetang et.al.|[2311.01065v1](http://arxiv.org/abs/2311.01065v1)|null|\n", "2311.01815": "|**2023-11-03**|**Estimating 3D Uncertainty Field: Quantifying Uncertainty for Neural Radiance Fields**|Jianxiong Shen et.al.|[2311.01815v1](http://arxiv.org/abs/2311.01815v1)|null|\n", "2311.01773": "|**2023-11-03**|**PDF: Point Diffusion Implicit Function for Large-scale Scene Neural Representation**|Yuhan Ding et.al.|[2311.01773v1](http://arxiv.org/abs/2311.01773v1)|null|\n", "2311.01659": "|**2023-11-03**|**Efficient Cloud Pipelines for Neural Radiance Fields**|Derek Jacoby et.al.|[2311.01659v1](http://arxiv.org/abs/2311.01659v1)|null|\n", "2311.03140": "|**2023-11-06**|**Animating NeRFs from Texture Space: A Framework for Pose-Dependent Rendering of Human Performances**|Paul Knoll et.al.|[2311.03140v1](http://arxiv.org/abs/2311.03140v1)|null|\n", "2311.02826": "|**2023-11-06**|**InstructPix2NeRF: Instructed 3D Portrait Editing from a Single Image**|Jianhui Li et.al.|[2311.02826v1](http://arxiv.org/abs/2311.02826v1)|**[link](https://github.com/mybabyyh/instructpix2nerf)**|\n", "2311.04154": "|**2023-11-07**|**High-fidelity 3D Reconstruction of Plants using Neural Radiance Field**|Kewei Hu et.al.|[2311.04154v1](http://arxiv.org/abs/2311.04154v1)|null|\n", "2311.03965": "|**2023-11-07**|**Fast Sun-aligned Outdoor Scene Relighting based on TensoRF**|Yeonjin Chang et.al.|[2311.03965v1](http://arxiv.org/abs/2311.03965v1)|null|\n", "2311.03784": "|**2023-11-08**|**UP-NeRF: Unconstrained Pose-Prior-Free Neural Radiance Fields**|Injae Kim et.al.|[2311.03784v2](http://arxiv.org/abs/2311.03784v2)|**[link](https://github.com/mlvlab/upnerf)**|\n", "2311.03484": "|**2023-11-06**|**Osprey: Multi-Session Autonomous Aerial Mapping with LiDAR-based SLAM and Next Best View Planning**|Rowan Border et.al.|[2311.03484v1](http://arxiv.org/abs/2311.03484v1)|null|\n", "2311.04400": "|**2023-11-08**|**LRM: Large Reconstruction Model for Single Image to 3D**|Yicong Hong et.al.|[2311.04400v1](http://arxiv.org/abs/2311.04400v1)|null|\n", "2311.04246": "|**2023-11-07**|**ADFactory: Automated Data Factory for Optical Flow Tasks**|Han Ling et.al.|[2311.04246v1](http://arxiv.org/abs/2311.04246v1)|null|\n", "2311.05521": "|**2023-11-09**|**BakedAvatar: Baking Neural Fields for Real-Time Head Avatar Synthesis**|Hao-Bin Duan et.al.|[2311.05521v1](http://arxiv.org/abs/2311.05521v1)|null|\n", "2311.05461": "|**2023-11-09**|**Control3D: Towards Controllable Text-to-3D Generation**|Yang Chen et.al.|[2311.05461v1](http://arxiv.org/abs/2311.05461v1)|null|\n"}}
\ No newline at end of file
+{"Kinematic Mapping": {"2302.11988": "|**2023-02-23**|**Time Complexity of Broadcast and Consensus for Randomized Oblivious Message Adversaries**|Antoine El-Hayek et.al.|[2302.11988v1](http://arxiv.org/abs/2302.11988v1)|null|\n", "2302.09743": "|**2023-02-20**|**Dynamic Optimal Control: A Real-Time Control Optimization Algorithm for Dynamic Networks**|Chunyu Pan et.al.|[2302.09743v1](http://arxiv.org/abs/2302.09743v1)|null|\n", "2302.09382": "|**2023-02-18**|**Co-trading networks for modeling dynamic interdependency structures and estimating high-dimensional covariances in US equity markets**|Yutong Lu et.al.|[2302.09382v1](http://arxiv.org/abs/2302.09382v1)|null|\n", "2302.07657": "|**2023-02-15**|**Dynamic Flows with Time-Dependent Capacities**|Thomas Bl\u00e4sius et.al.|[2302.07657v1](http://arxiv.org/abs/2302.07657v1)|null|\n", "2302.04377": "|**2023-02-08**|**ER network heterogeneity guides diffusive transport and kinetics**|Zubenelgenubi C. Scott et.al.|[2302.04377v1](http://arxiv.org/abs/2302.04377v1)|null|\n", "2302.03677": "|**2023-02-24**|**Wealth distribution on a dynamic complex network**|Gustavo Kohlrausch et.al.|[2302.03677v2](http://arxiv.org/abs/2302.03677v2)|null|\n", "2302.03039": "|**2023-02-06**|**SUPER VII. Morphology and kinematics of H$\u03b1$ emission in AGN host galaxies at Cosmic noon using SINFONI**|D. Kakkad et.al.|[2302.03039v1](http://arxiv.org/abs/2302.03039v1)|null|\n", "2302.02313": "|**2023-02-05**|**A Game-Theoretic Approach to Solving the Roman Domination Problem**|Xiuyang Chen et.al.|[2302.02313v1](http://arxiv.org/abs/2302.02313v1)|null|\n", "2302.01694": "|**2023-02-03**|**Coevolving Boolean and Multi-Valued Regulatory Networks**|Larry Bull et.al.|[2302.01694v1](http://arxiv.org/abs/2302.01694v1)|null|\n", "2301.12892": "|**2023-01-30**|**Quantifying and maximizing the information flux in recurrent neural networks**|Claus Metzner et.al.|[2301.12892v1](http://arxiv.org/abs/2301.12892v1)|null|\n", "2301.12156": "|**2023-03-23**|**Perspective: How to overcome dynamical density functional theory**|Daniel de las Heras et.al.|[2301.12156v2](http://arxiv.org/abs/2301.12156v2)|null|\n", "2301.11982": "|**2023-02-01**|**Strategy evolution on dynamic networks**|Qi Su et.al.|[2301.11982v2](http://arxiv.org/abs/2301.11982v2)|null|\n", "2301.10962": "|**2023-01-26**|**Scheduling Policy for Value-of-Information (VoI) in Trajectory Estimation for Digital Twins**|Van-Phuc Bui et.al.|[2301.10962v1](http://arxiv.org/abs/2301.10962v1)|null|\n", "2301.07849": "|**2023-01-19**|**Efficient Computation in Congested Anonymous Dynamic Networks**|Giuseppe A. Di Luna et.al.|[2301.07849v1](http://arxiv.org/abs/2301.07849v1)|null|\n", "2301.07515": "|**2023-01-15**|**Towards the development of Dynamic Networked Psychology Hypotheses**|Liaquat Hossain et.al.|[2301.07515v1](http://arxiv.org/abs/2301.07515v1)|null|\n", "2301.04904": "|**2023-01-12**|**Lesion-aware Dynamic Kernel for Polyp Segmentation**|Ruifei Zhang et.al.|[2301.04904v1](http://arxiv.org/abs/2301.04904v1)|**[link](https://github.com/reafly/ldnet)**|\n", "2301.04296": "|**2023-01-11**|**A degree-corrected Cox model for dynamic networks**|Yuguo Chen et.al.|[2301.04296v1](http://arxiv.org/abs/2301.04296v1)|null|\n", "2301.03965": "|**2023-01-10**|**BiCurNet: Pre-Movement EEG based Neural Decoder for Biceps Curl Trajectory Estimation**|Manali Saini et.al.|[2301.03965v1](http://arxiv.org/abs/2301.03965v1)|null|\n", "2301.01314": "|**2023-01-03**|**Network-theoretic modeling of fluid-structure interactions**|Aditya G. Nair et.al.|[2301.01314v1](http://arxiv.org/abs/2301.01314v1)|null|\n", "2212.12843": "|**2022-12-25**|**A Note on Improved Results for One Round Distributed Clique Listing**|Quanquan C. Liu et.al.|[2212.12843v1](http://arxiv.org/abs/2212.12843v1)|null|\n", "2212.12345": "|**2022-12-23**|**Piecewise-Velocity Model for Learning Continuous-time Dynamic Node Representations**|Abdulkadir \u00c7elikkanat et.al.|[2212.12345v1](http://arxiv.org/abs/2212.12345v1)|null|\n", "2212.12130": "|**2023-02-04**|**Learning to Detect and Segment for Open Vocabulary Object Detection**|Tao Wang et.al.|[2212.12130v2](http://arxiv.org/abs/2212.12130v2)|null|\n", "2212.09483": "|**2022-12-19**|**Adaptive Control of Client Selection and Gradient Compression for Efficient Federated Learning**|Zhida Jiang et.al.|[2212.09483v1](http://arxiv.org/abs/2212.09483v1)|null|\n", "2212.08358": "|**2022-12-16**|**Some recent trends in embeddings of time series and dynamic networks**|Dag Tj\u00f8stheim et.al.|[2212.08358v1](http://arxiv.org/abs/2212.08358v1)|null|\n", "2212.08314": "|**2023-01-30**|**Synchronization-preserving clusters in hypergraphs**|Anirban Banerjee et.al.|[2212.08314v2](http://arxiv.org/abs/2212.08314v2)|null|\n", "2212.08239": "|**2022-12-16**|**Discovering Structural Hole Spanners in Dynamic Networks via Graph Neural Networks**|Diksha Goel et.al.|[2212.08239v1](http://arxiv.org/abs/2212.08239v1)|null|\n", "2212.07961": "|**2022-12-15**|**Topological Data Analysis Detects Percolation Thresholds in Arctic Melt-Pond Evolution**|Wilfred Offord et.al.|[2212.07961v1](http://arxiv.org/abs/2212.07961v1)|**[link](https://github.com/wilfofford/tda-for-sea-ice-percolation)**|\n", "2212.05980": "|**2022-12-12**|**Evaluation of RGB-D SLAM in Large Indoor Environments**|Kirill Muravyev et.al.|[2212.05980v1](http://arxiv.org/abs/2212.05980v1)|null|\n", "2212.03999": "|**2022-12-07**|**On the application of dimensionality reduction and clustering algorithms for the classification of kinematic morphologies of galaxies**|M. S. Rosito et.al.|[2212.03999v1](http://arxiv.org/abs/2212.03999v1)|null|\n", "2212.02410": "|**2023-03-17**|**Antipodal Self-Duality for a Four-Particle Form Factor**|Lance J. Dixon et.al.|[2212.02410v2](http://arxiv.org/abs/2212.02410v2)|null|\n", "2212.02383": "|**2022-12-05**|**An Approach for Detecting Dynamic Communities in Social Networks**|Souaad Boudebza et.al.|[2212.02383v1](http://arxiv.org/abs/2212.02383v1)|**[link](https://github.com/Yquetzal/ECML_PKDD_2019)**|\n", "2212.01594": "|**2022-12-03**|**Parameterized temporal exploration problems**|Thomas Erlebach et.al.|[2212.01594v1](http://arxiv.org/abs/2212.01594v1)|null|\n", "2211.16726": "|**2022-11-30**|**Boosted Dynamic Neural Networks**|Haichao Yu et.al.|[2211.16726v1](http://arxiv.org/abs/2211.16726v1)|**[link](https://github.com/SHI-Labs/Boosted-Dynamic-Networks)**|\n", "2211.15301": "|**2022-11-28**|**Learning Coherent Clusters in Weakly-Connected Network Systems**|Hancheng Min et.al.|[2211.15301v1](http://arxiv.org/abs/2211.15301v1)|null|\n", "2211.15043": "|**2022-11-28**|**Higher-order Knowledge Transfer for Dynamic Community Detection with Great Changes**|Huixin Ma et.al.|[2211.15043v1](http://arxiv.org/abs/2211.15043v1)|null|\n", "2211.14560": "|**2023-01-24**|**A dynamic multi-region MFD model for ride-sourcing with ridesplitting**|Caio Vitor Beojone et.al.|[2211.14560v2](http://arxiv.org/abs/2211.14560v2)|null|\n", "2211.12589": "|**2022-11-22**|**Building Squares with Optimal State Complexity in Restricted Active Self-Assembly**|Robert M. Alaniz et.al.|[2211.12589v1](http://arxiv.org/abs/2211.12589v1)|**[link](https://github.com/asarg/autotile)**|\n", "2211.11876": "|**2022-11-21**|**Structural Modelling of Dynamic Networks and Identifying Maximum Likelihood**|Christian Gourieroux et.al.|[2211.11876v1](http://arxiv.org/abs/2211.11876v1)|null|\n", "2211.11352": "|**2023-01-30**|**Brief Announcement: Broadcasting Time in Dynamic Rooted Trees is Linear**|Antoine El-Hayek et.al.|[2211.11352v3](http://arxiv.org/abs/2211.11352v3)|null|\n", "2211.11069": "|**2022-11-20**|**Learning Nonlinear Couplings in Network of Agents from a Single Sample Trajectory**|Arash Amini et.al.|[2211.11069v1](http://arxiv.org/abs/2211.11069v1)|null|\n", "2211.10825": "|**2022-11-20**|**Identifiability of dynamic networks: the essential r\u00f4le of dources and dinks**|Eduardo Mapurunga et.al.|[2211.10825v1](http://arxiv.org/abs/2211.10825v1)|null|\n", "2211.10151": "|**2023-01-27**|**Asymptotically Tight Bounds on the Time Complexity of Broadcast and its Variants in Dynamic Networks**|Antoine El-Hayek et.al.|[2211.10151v2](http://arxiv.org/abs/2211.10151v2)|null|\n", "2211.09139": "|**2022-11-16**|**The Pandora project. I: the impact of radiation and cosmic rays on baryonic and dark matter properties of dwarf galaxies**|Sergio Martin-Alvarez et.al.|[2211.09139v1](http://arxiv.org/abs/2211.09139v1)|null|\n", "2211.08820": "|**2022-11-16**|**Computing-Aware Routing for LEO Satellite Networks: A Transmission and Computation Integration Approach**|Jiaqi Cao et.al.|[2211.08820v1](http://arxiv.org/abs/2211.08820v1)|null|\n", "2211.08700": "|**2023-02-14**|**Bi-directional Digital Twin and Edge Computing in the Metaverse**|Jiadong Yu et.al.|[2211.08700v2](http://arxiv.org/abs/2211.08700v2)|null|\n", "2211.08639": "|**2022-11-16**|**Hierarchical Dynamic Image Harmonization**|Haoxing Chen et.al.|[2211.08639v1](http://arxiv.org/abs/2211.08639v1)|**[link](https://github.com/chenhaoxing/hdnet)**|\n", "2211.08378": "|**2022-11-15**|**Anomaly Detection in Multiplex Dynamic Networks: from Blockchain Security to Brain Disease Prediction**|Ali Behrouz et.al.|[2211.08378v1](http://arxiv.org/abs/2211.08378v1)|**[link](https://github.com/ubc-systopia/anomuly)**|\n", "2211.09664": "|**2022-11-15**|**Influencer Detection with Dynamic Graph Neural Networks**|Elena Tiukhova et.al.|[2211.09664v1](http://arxiv.org/abs/2211.09664v1)|**[link](https://github.com/banking-analytics-lab/dynamicgraphlearning)**|\n", "2211.07570": "|**2022-11-14**|**Tides Need STEMMED: A Locally Operating Spatio-Temporal Mutually Exciting Point Process with Dynamic Network for Improving Opioid Overdose Death Prediction**|Che-Yi Liao et.al.|[2211.07570v1](http://arxiv.org/abs/2211.07570v1)|null|\n", "2211.07449": "|**2022-11-14**|**Dual-based Online Learning of Dynamic Network Topologies**|Seyed Saman Saboksayr et.al.|[2211.07449v1](http://arxiv.org/abs/2211.07449v1)|null|\n", "2302.12759": "|**2023-02-24**|**Modularity-based approach for tracking communities in dynamic social networks**|Michele Mazza et.al.|[2302.12759v1](http://arxiv.org/abs/2302.12759v1)|null|\n", "2302.13629": "|**2023-02-27**|**Estimation of continuous environments by robot swarms: Correlated networks and decision-making**|Mohsen Raoufi et.al.|[2302.13629v1](http://arxiv.org/abs/2302.13629v1)|null|\n", "2302.13292": "|**2023-02-26**|**Discovering Top-k Structural Hole Spanners in Dynamic Networks**|Diksha Goel et.al.|[2302.13292v1](http://arxiv.org/abs/2302.13292v1)|null|\n", "2211.05668": "|**2022-12-07**|**Mapping the Milky Way Disk with Gaia DR3: 3D extended kinematic maps and rotation curve to $\\approx 30$ kpc**|Hai-Feng Wang et.al.|[2211.05668v2](http://arxiv.org/abs/2211.05668v2)|null|\n", "2211.01538": "|**2023-03-12**|**$D^2$SLAM: Decentralized and Distributed Collaborative Visual-inertial SLAM System for Aerial Swarm**|Hao Xu et.al.|[2211.01538v3](http://arxiv.org/abs/2211.01538v3)|**[link](https://github.com/hkust-aerial-robotics/d2slam)**|\n", "2210.14842": "|**2022-10-26**|**Continuum Robot State Estimation Using Gaussian Process Regression on $SE(3)$**|Sven Lilge et.al.|[2210.14842v1](http://arxiv.org/abs/2210.14842v1)|null|\n", "2210.04572": "|**2022-10-10**|**Floorplan-Aware Camera Poses Refinement**|Anna Sokolova et.al.|[2210.04572v1](http://arxiv.org/abs/2210.04572v1)|null|\n", "2210.03412": "|**2022-10-07**|**The Trajectory PHD Filter for Coexisting Point and Extended Target Tracking**|Shaoxiu Wei et.al.|[2210.03412v1](http://arxiv.org/abs/2210.03412v1)|null|\n", "2210.01320": "|**2022-11-23**|**Wi-Closure: Reliable and Efficient Search of Inter-robot Loop Closures Using Wireless Sensing**|Weiying Wang et.al.|[2210.01320v2](http://arxiv.org/abs/2210.01320v2)|null|\n", "2209.09723": "|**2023-02-22**|**GANet: Goal Area Network for Motion Forecasting**|Mingkun Wang et.al.|[2209.09723v3](http://arxiv.org/abs/2209.09723v3)|**[link](https://github.com/kingwmk/ganet)**|\n", "2212.03441": "|**2023-03-23**|**Higher topological complexity of a map**|Cesar A. Ipanaque Zapata et.al.|[2212.03441v2](http://arxiv.org/abs/2212.03441v2)|null|\n", "2304.09043": "|**2023-05-16**|**Continuous-Time Range-Only Pose Estimation**|Abhishek Goudar et.al.|[2304.09043v2](http://arxiv.org/abs/2304.09043v2)|null|\n", "2304.11694": "|**2023-04-25**|**Vehicle State Estimation and Prediction**|Xinchen Li et.al.|[2304.11694v2](http://arxiv.org/abs/2304.11694v2)|null|\n", "2306.01188": "|**2023-09-12**|**Event-based Stereo Visual Odometry with Native Temporal Resolution via Continuous-time Gaussian Process Regression**|Jianeng Wang et.al.|[2306.01188v5](http://arxiv.org/abs/2306.01188v5)|null|\n", "2306.01056": "|**2023-06-01**|**ERGO-ML: Towards a robust machine learning model for inferring the fraction of accreted stars in galaxies from integral-field spectroscopic maps**|Eirini Angeloudi et.al.|[2306.01056v1](http://arxiv.org/abs/2306.01056v1)|null|\n", "2306.11091": "|**2023-06-19**|**Composite Bulges -- IV. Detecting Signatures of Gas Inflows in the IFU data: The MUSE View of Ionized Gas Kinematics in NGC 1097**|Tutku Kolcu et.al.|[2306.11091v1](http://arxiv.org/abs/2306.11091v1)|null|\n", "2306.14573": "|**2023-06-26**|**Hydrodynamic simulations of the Disk of Gas Around Supermassive black holes (HDGAS) -I; Molecular Gas Dynamics**|Mojtaba Raouf et.al.|[2306.14573v1](http://arxiv.org/abs/2306.14573v1)|null|\n", "2307.00728": "|**2023-07-03**|**A new approach to QCD evolution in processes with massive partons**|Benoit Assi et.al.|[2307.00728v1](http://arxiv.org/abs/2307.00728v1)|null|\n", "2307.03207": "|**2023-07-06**|**H$\u03b1$ Kinematics of Superbubbles and Supernova Remnants of the Dwarf galaxy NGC 4214**|M. S\u00e1nchez-Cruces et.al.|[2307.03207v1](http://arxiv.org/abs/2307.03207v1)|null|\n", "2307.10381": "|**2023-07-19**|**Accelerating galaxy dynamical modeling using a neural network for joint lensing and kinematics analyses**|Matthew R. Gomer et.al.|[2307.10381v1](http://arxiv.org/abs/2307.10381v1)|null|\n", "2307.14125": "|**2023-07-26**|**Multi-IMU Proprioceptive State Estimator for Humanoid Robots**|Fabio Elnecave Xavier et.al.|[2307.14125v1](http://arxiv.org/abs/2307.14125v1)|null|\n", "2308.04071": "|**2023-08-08**|**Path Signatures for Diversity in Probabilistic Trajectory Optimisation**|Lucas Barcelos et.al.|[2308.04071v1](http://arxiv.org/abs/2308.04071v1)|null|\n", "2308.08654": "|**2023-08-16**|**Advancing Brain-Computer Interface System Performance in Hand Trajectory Estimation with NeuroKinect**|Sidharth Pancholi et.al.|[2308.08654v1](http://arxiv.org/abs/2308.08654v1)|null|\n", "2308.11493": "|**2023-08-22**|**Looking into the faintEst WIth MUSE (LEWIS): on the nature of ultra-diffuse galaxies in the Hydra-I cluster.I. Project description and preliminary results**|Enrichetta Iodice et.al.|[2308.11493v1](http://arxiv.org/abs/2308.11493v1)|null|\n", "2308.12418": "|**2023-08-23**|**Certifiably Optimal Rotation and Pose Estimation Based on the Cayley Map**|Timothy D Barfoot et.al.|[2308.12418v1](http://arxiv.org/abs/2308.12418v1)|null|\n", "2308.16620": "|**2023-08-31**|**GA-NIFS: JWST/NIRSpec IFU observations of HFLS3 reveal a dense galaxy group at z~6.3**|G. C. Jones et.al.|[2308.16620v1](http://arxiv.org/abs/2308.16620v1)|null|\n", "2309.03396": "|**2023-09-06**|**Detection of open cluster rotation fields from Gaia EDR3 proper motions**|Pedro Guilherme-Garcia et.al.|[2309.03396v1](http://arxiv.org/abs/2309.03396v1)|null|\n", "2309.06792": "|**2023-09-13**|**Motion-Bias-Free Feature-Based SLAM**|Alejandro Fontan et.al.|[2309.06792v1](http://arxiv.org/abs/2309.06792v1)|null|\n", "2309.09808": "|**2023-09-18**|**Coco-LIC: Continuous-Time Tightly-Coupled LiDAR-Inertial-Camera Odometry using Non-Uniform B-spline**|Xiaolei Lang et.al.|[2309.09808v1](http://arxiv.org/abs/2309.09808v1)|**[link](https://github.com/april-zju/coco-lic)**|\n", "2309.09011": "|**2023-09-16**|**Optimal Initialization Strategies for Range-Only Trajectory Estimation**|Abhishek Goudar et.al.|[2309.09011v1](http://arxiv.org/abs/2309.09011v1)|null|\n", "2309.08780": "|**2023-09-15**|**Simultaneous Trajectory Estimation and Mapping for Autonomous Underwater Proximity Operations**|Aldo Ter\u00e1n Espinoza et.al.|[2309.08780v1](http://arxiv.org/abs/2309.08780v1)|null|\n", "2309.11134": "|**2023-09-20**|**GNSS/Multi-Sensor Fusion Using Continuous-Time Factor Graph Optimization for Robust Localization**|Haoming Zhang et.al.|[2309.11134v1](http://arxiv.org/abs/2309.11134v1)|**[link](https://github.com/rwth-irt/gnssfgo)**|\n", "2309.15065": "|**2023-09-26**|**Language-EXtended Indoor SLAM (LEXIS): A Versatile System for Real-time Visual Scene Understanding**|Christina Kassab et.al.|[2309.15065v1](http://arxiv.org/abs/2309.15065v1)|null|\n", "2310.03353": "|**2023-10-05**|**Deep Geometric Learning with Monotonicity Constraints for Alzheimer's Disease Progression**|Seungwoo Jeong et.al.|[2310.03353v1](http://arxiv.org/abs/2310.03353v1)|null|\n", "2310.06249": "|**2023-10-10**|**l-dyno: framework to learn consistent visual features using robot's motion**|Kartikeya Singh et.al.|[2310.06249v1](http://arxiv.org/abs/2310.06249v1)|null|\n", "2310.10723": "|**2023-10-16**|**Kinematical coherence between satellite galaxies and host stellar discs for MaNGA & SAMI galaxies**|Sen Wang et.al.|[2310.10723v1](http://arxiv.org/abs/2310.10723v1)|null|\n", "2310.12776": "|**2023-10-19**|**First holistic modelling of meteoroid ablation and fragmentation: A case study of the Orionids recorded by the Canadian Automated Meteor Observatory**|Denis Vida et.al.|[2310.12776v1](http://arxiv.org/abs/2310.12776v1)|null|\n", "2310.14506": "|**2023-10-23**|**Label Space Partition Selection for Multi-Object Tracking Using Two-Layer Partitioning**|Ji Youn Lee et.al.|[2310.14506v1](http://arxiv.org/abs/2310.14506v1)|null|\n"}, "Map fusion": {"2302.11106": "|**2023-02-22**|**Multi-Head Feature Pyramid Networks for Breast Mass Detection**|Hexiang Zhang et.al.|[2302.11106v1](http://arxiv.org/abs/2302.11106v1)|null|\n", "2301.09213": "|**2023-01-24**|**FRAME: Fast and Robust Autonomous 3D point cloud Map-merging for Egocentric multi-robot exploration**|Nikolaos Stathoulopoulos et.al.|[2301.09213v2](http://arxiv.org/abs/2301.09213v2)|null|\n", "2212.01538": "|**2022-12-03**|**Multi-resolution Monocular Depth Map Fusion by Self-supervised Gradient-based Composition**|Yaqiao Dai et.al.|[2212.01538v1](http://arxiv.org/abs/2212.01538v1)|**[link](https://github.com/yuinsky/gradient-based-depth-map-fusion)**|\n", "2211.03423": "|**2022-11-07**|**Detecting Invalid Map Merges in Lifelong SLAM**|Matthias Holoch et.al.|[2211.03423v1](http://arxiv.org/abs/2211.03423v1)|null|\n", "2209.10775": "|**2022-09-22**|**MUI-TARE: Multi-Agent Cooperative Exploration with Unknown Initial Position**|Jingtian Yan et.al.|[2209.10775v1](http://arxiv.org/abs/2209.10775v1)|null|\n", "2209.08988": "|**2022-09-19**|**MSA-GCN:Multiscale Adaptive Graph Convolution Network for Gait Emotion Recognition**|Yunfei Yin et.al.|[2209.08988v1](http://arxiv.org/abs/2209.08988v1)|null|\n", "2209.03096": "|**2022-09-07**|**Spherical wedge billiard: from chaos to fractals and Talbot carpets**|Tom\u00e1\u0161 Tyc et.al.|[2209.03096v1](http://arxiv.org/abs/2209.03096v1)|null|\n", "2208.06293": "|**2022-08-12**|**dual unet:a novel siamese network for change detection with cascade differential fusion**|Kaixuan Jiang et.al.|[2208.06293v1](http://arxiv.org/abs/2208.06293v1)|null|\n", "2207.09210": "|**2023-10-23**|**KinD-LCE Curve Estimation And Retinex Fusion On Low-Light Image**|Xiaochun Lei et.al.|[2207.09210v3](http://arxiv.org/abs/2207.09210v3)|null|\n", "2207.06965": "|**2023-06-27**|**AutoMerge: A Framework for Map Assembling and Smoothing in City-scale Environments**|Peng Yin et.al.|[2207.06965v4](http://arxiv.org/abs/2207.06965v4)|null|\n", "2203.00436": "|**2022-03-01**|**Boundary Corrected Multi-scale Fusion Network for Real-time Semantic Segmentation**|Tianjiao Jiang et.al.|[2203.00436v1](http://arxiv.org/abs/2203.00436v1)|null|\n", "2202.08498": "|**2022-02-17**|**Mirror-Yolo: An attention-based instance segmentation and detection model for mirrors**|Fengze Li et.al.|[2202.08498v1](http://arxiv.org/abs/2202.08498v1)|null|\n", "2201.11937": "|**2022-01-28**|**Stereo Matching with Cost Volume based Sparse Disparity Propagation**|Wei Xue et.al.|[2201.11937v1](http://arxiv.org/abs/2201.11937v1)|null|\n", "2201.10152": "|**2022-01-29**|**Unsupervised Image Fusion Method based on Feature Mutual Mapping**|Dongyu Rao et.al.|[2201.10152v2](http://arxiv.org/abs/2201.10152v2)|null|\n", "2112.13222": "|**2022-01-24**|**Edge Robotics: Edge-Computing-Accelerated Multi-Robot Simultaneous Localization and Mapping**|Peng Huang et.al.|[2112.13222v2](http://arxiv.org/abs/2112.13222v2)|null|\n", "2112.11044": "|**2021-12-21**|**Extending Merge Resolution to a Family of Proof Systems**|Sravanthi Chede et.al.|[2112.11044v1](http://arxiv.org/abs/2112.11044v1)|null|\n", "2111.14990": "|**2021-11-29**|**MIXER: A Principled Framework for Multimodal, Multiway Data Association**|Parker C. Lusk et.al.|[2111.14990v1](http://arxiv.org/abs/2111.14990v1)|null|\n", "2110.12338": "|**2021-10-24**|**Quality Map Fusion for Adversarial Learning**|Uche Osahor et.al.|[2110.12338v1](http://arxiv.org/abs/2110.12338v1)|null|\n", "2110.08172": "|**2021-10-18**|**MLFC: From 10 to 50 Planners in the Multi-Agent Programming Contest**|Rafael C. Cardoso et.al.|[2110.08172v2](http://arxiv.org/abs/2110.08172v2)|null|\n", "2110.06697": "|**2021-10-13**|**Semantic Image Fusion**|P. R. Hill et.al.|[2110.06697v1](http://arxiv.org/abs/2110.06697v1)|null|\n", "2110.06436": "|**2021-10-13**|**Non-local Recurrent Regularization Networks for Multi-view Stereo**|Qingshan Xu et.al.|[2110.06436v1](http://arxiv.org/abs/2110.06436v1)|null|\n", "2108.08623": "|**2021-08-19**|**VolumeFusion: Deep Depth Fusion for 3D Scene Reconstruction**|Jaesung Choe et.al.|[2108.08623v1](http://arxiv.org/abs/2108.08623v1)|null|\n", "2106.11515": "|**2021-06-23**|**Cooperative mmWave PHD-SLAM with Moving Scatterers**|Hyowon Kim et.al.|[2106.11515v2](http://arxiv.org/abs/2106.11515v2)|null|\n", "2106.10220": "|**2021-06-18**|**Semantic navigation with domain knowledge**|Rafael Gomes Braga et.al.|[2106.10220v1](http://arxiv.org/abs/2106.10220v1)|null|\n", "2106.04512": "|**2021-06-22**|**Formal Verification of a Map Merging Protocol in the Multi-Agent Programming Contest**|Matt Luckcuck et.al.|[2106.04512v2](http://arxiv.org/abs/2106.04512v2)|null|\n", "2105.14994": "|**2021-05-31**|**MAOMaps: A Photo-Realistic Benchmark For vSLAM and Map Merging Quality Assessment**|Andrey Bokovoy et.al.|[2105.14994v1](http://arxiv.org/abs/2105.14994v1)|**[link](https://github.com/CnnDepth/MAOMaps)**|\n", "2103.13246": "|**2021-03-24**|**Generic Merging of Structure from Motion Maps with a Low Memory Footprint**|Gabrielle Flood et.al.|[2103.13246v1](http://arxiv.org/abs/2103.13246v1)|null|\n", "2103.03786": "|**2022-09-22**|**Distributed Dynamic Map Fusion via Federated Learning for Intelligent Networked Vehicles**|Zijian Zhang et.al.|[2103.03786v3](http://arxiv.org/abs/2103.03786v3)|**[link](https://github.com/zijianzhang/CARLA_INVS)**|\n", "2102.10929": "|**2021-02-22**|**Deep Learning for Robust Motion Segmentation with Non-Static Cameras**|Markus Bosch et.al.|[2102.10929v1](http://arxiv.org/abs/2102.10929v1)|null|\n", "2012.10658": "|**2021-02-24**|**Generalize a Small Pre-trained Model to Arbitrarily Large TSP Instances**|Zhang-Hua Fu et.al.|[2012.10658v2](http://arxiv.org/abs/2012.10658v2)|**[link](https://github.com/Spider-scnu/TSP)**|\n", "2011.14791": "|**2021-06-08**|**NeuralFusion: Online Depth Fusion in Latent Space**|Silvan Weder et.al.|[2011.14791v2](http://arxiv.org/abs/2011.14791v2)|**[link](https://github.com/weders/NeuralFusion)**|\n", "2011.03975": "|**2020-11-11**|**Mapless-Planner: A Robust and Fast Planning Framework for Aggressive Autonomous Flight without Map Fusion**|Jialin Ji et.al.|[2011.03975v2](http://arxiv.org/abs/2011.03975v2)|null|\n", "2010.03026": "|**2020-11-16**|**Place Recognition in Forests with Urquhart Tessellations**|Guilherme V. Nardari et.al.|[2010.03026v2](http://arxiv.org/abs/2010.03026v2)|**[link](https://github.com/gnardari/urquhart)**|\n", "2009.05819": "|**2020-09-12**|**Map-merging Algorithms for Visual SLAM: Feasibility Study and Empirical Evaluation**|Andrey Bokovoy et.al.|[2009.05819v1](http://arxiv.org/abs/2009.05819v1)|null|\n", "2007.14177": "|**2020-07-28**|**Generative networks as inverse problems with fractional wavelet scattering networks**|Jiasong Wu et.al.|[2007.14177v1](http://arxiv.org/abs/2007.14177v1)|null|\n", "2007.02295": "|**2020-07-05**|**Multi view stereo with semantic priors**|Elisavet Konstantina Stathopoulou et.al.|[2007.02295v1](http://arxiv.org/abs/2007.02295v1)|null|\n", "2007.02108": "|**2020-07-04**|**SplitFusion: Simultaneous Tracking and Mapping for Non-Rigid Scenes**|Yang Li et.al.|[2007.02108v1](http://arxiv.org/abs/2007.02108v1)|null|\n", "2006.00420": "|**2020-05-31**|**VIR-SLAM: Visual, Inertial, and Ranging SLAM for single and multi-robot systems**|Yanjun Cao et.al.|[2006.00420v1](http://arxiv.org/abs/2006.00420v1)|null|\n", "2002.10342": "|**2020-02-24**|**Comparing View-Based and Map-Based Semantic Labelling in Real-Time SLAM**|Zoe Landgraf et.al.|[2002.10342v1](http://arxiv.org/abs/2002.10342v1)|null|\n", "2001.09796": "|**2020-01-16**|**Knowledge Integration of Collaborative Product Design Using Cloud Computing Infrastructure**|Mahdi Bohlouli et.al.|[2001.09796v1](http://arxiv.org/abs/2001.09796v1)|null|\n", "2001.04388": "|**2020-04-03**|**RoutedFusion: Learning Real-time Depth Map Fusion**|Silvan Weder et.al.|[2001.04388v2](http://arxiv.org/abs/2001.04388v2)|**[link](https://github.com/weders/RoutedFusion)**|\n", "1909.00703": "|**2019-09-02**|**Learned Semantic Multi-Sensor Depth Map Fusion**|Denys Rozumnyi et.al.|[1909.00703v1](http://arxiv.org/abs/1909.00703v1)|null|\n", "1908.11585": "|**2019-08-30**|**ORBSLAM-Atlas: a robust and accurate multi-map system**|Richard Elvira et.al.|[1908.11585v1](http://arxiv.org/abs/1908.11585v1)|null|\n", "1908.10541": "|**2020-06-07**|**Search and Rescue under the Forest Canopy using Multiple UAVs**|Yulun Tian et.al.|[1908.10541v2](http://arxiv.org/abs/1908.10541v2)|null|\n", "1908.09806": "|**2020-02-26**|**5G mmWave Cooperative Positioning and Mapping using Multi-Model PHD Filter and Map Fusion**|Hyowon Kim et.al.|[1908.09806v3](http://arxiv.org/abs/1908.09806v3)|**[link](https://github.com/HyowonKim-P1/5GmmWavePHDFilterMapFusion)**|\n", "1905.11257": "|**2019-05-27**|**IRAS23385+6053: An embedded massive cluster in the making**|R. Cesaroni et.al.|[1905.11257v1](http://arxiv.org/abs/1905.11257v1)|null|\n", "1812.08402": "|**2018-12-20**|**SFA: Small Faces Attention Face Detector**|Shi Luo et.al.|[1812.08402v1](http://arxiv.org/abs/1812.08402v1)|**[link](https://github.com/shiluo1990/SFA)**|\n", "1811.07632": "|**2018-11-21**|**Collaborative Dense SLAM**|Louis Gallagher et.al.|[1811.07632v2](http://arxiv.org/abs/1811.07632v2)|null|\n", "1810.00457": "|**2019-03-14**|**AgriColMap: Aerial-Ground Collaborative 3D Mapping for Precision Farming**|Ciro Potena et.al.|[1810.00457v2](http://arxiv.org/abs/1810.00457v2)|null|\n", "1809.09646": "|**2019-03-05**|**Efficient Constellation-Based Map-Merging for Semantic SLAM**|Kristoffer M. Frey et.al.|[1809.09646v2](http://arxiv.org/abs/1809.09646v2)|null|\n", "2306.15416": "|**2023-07-04**|**Irregular Change Detection in Sparse Bi-Temporal Point Clouds using Learned Place Recognition Descriptors and Point-to-Voxel Comparison**|Nikolaos Stathoulopoulos et.al.|[2306.15416v2](http://arxiv.org/abs/2306.15416v2)|null|\n", "2307.00500": "|**2023-07-02**|**CQLite: Communication-Efficient Multi-Robot Exploration Using Coverage-biased Distributed Q-Learning**|Ehsan Latif et.al.|[2307.00500v1](http://arxiv.org/abs/2307.00500v1)|null|\n", "2212.08334": "|**2023-07-10**|**Lightweight integration of 3D features to improve 2D image segmentation**|Olivier Pradelle et.al.|[2212.08334v2](http://arxiv.org/abs/2212.08334v2)|**[link](https://github.com/opradelle/2dguidedlight3d)**|\n", "2307.07126": "|**2023-07-14**|**Multi-Session, Localization-oriented and Lightweight LiDAR Mapping Using Semantic Lines and Planes**|Zehuan Yu et.al.|[2307.07126v1](http://arxiv.org/abs/2307.07126v1)|null|\n", "2308.02674": "|**2023-08-04**|**Group-$k$ consistent measurement set maximization via maximum clique over k-Uniform hypergraphs for robust multi-robot map merging**|Brendon Forsgren et.al.|[2308.02674v1](http://arxiv.org/abs/2308.02674v1)|**[link](https://bitbucket.org/jmangelson/gkcm)**|\n", "2308.08715": "|**2023-08-17**|**V-FUSE: Volumetric Depth Map Fusion with Long-Range Constraints**|Nathaniel Burgdorfer et.al.|[2308.08715v1](http://arxiv.org/abs/2308.08715v1)|**[link](https://github.com/nburgdorfer/v-fuse)**|\n", "2311.03146": "|**2023-11-06**|**Enabling In-Situ Resources Utilisation by leveraging collaborative robotics and astronaut-robot interaction**|Silvia Romero-Azpitarte et.al.|[2311.03146v1](http://arxiv.org/abs/2311.03146v1)|null|\n"}, "MultiModality": {"2302.12248": "|**2023-02-23**|**Learning Visual Representations via Language-Guided Sampling**|Mohamed El Banani et.al.|[2302.12248v1](http://arxiv.org/abs/2302.12248v1)|**[link](https://github.com/mbanani/lgssl)**|\n", "2302.11939": "|**2023-02-23**|**Power Time Series Forecasting by Pretrained LM**|Tian Zhou et.al.|[2302.11939v1](http://arxiv.org/abs/2302.11939v1)|**[link](https://github.com/damo-di-ml/one_fits_all)**|\n", "2302.11713": "|**2023-02-24**|**Can Pre-trained Vision and Language Models Answer Visual Information-Seeking Questions?**|Yang Chen et.al.|[2302.11713v2](http://arxiv.org/abs/2302.11713v2)|**[link](https://github.com/edchengg/infoseek_eval)**|\n", "2302.11529": "|**2023-02-22**|**Modular Deep Learning**|Jonas Pfeiffer et.al.|[2302.11529v1](http://arxiv.org/abs/2302.11529v1)|null|\n", "2302.11458": "|**2023-02-22**|**Fusing Visual Appearance and Geometry for Multi-modality 6DoF Object Tracking**|Manuel Stoiber et.al.|[2302.11458v1](http://arxiv.org/abs/2302.11458v1)|**[link](https://github.com/dlr-rm/3dobjecttracking)**|\n", "2302.11352": "|**2023-02-22**|**X-TRA: Improving Chest X-ray Tasks with Cross-Modal Retrieval Augmentation**|Tom van Sonsbeek et.al.|[2302.11352v1](http://arxiv.org/abs/2302.11352v1)|null|\n", "2302.11254": "|**2023-02-22**|**Cross-modal Audio-visual Co-learning for Text-independent Speaker Verification**|Meng Liu et.al.|[2302.11254v1](http://arxiv.org/abs/2302.11254v1)|**[link](https://github.com/danielmengliu/audiovisuallip)**|\n", "2302.11154": "|**2023-02-24**|**Open-domain Visual Entity Recognition: Towards Recognizing Millions of Wikipedia Entities**|Hexiang Hu et.al.|[2302.11154v2](http://arxiv.org/abs/2302.11154v2)|**[link](https://github.com/edchengg/oven_eval)**|\n", "2302.11097": "|**2023-02-22**|**A Multi-Modal Neural Geometric Solver with Textual Clauses Parsed from Diagram**|Ming-Liang Zhang et.al.|[2302.11097v1](http://arxiv.org/abs/2302.11097v1)|**[link](https://github.com/mingliangzhang2018/pgps)**|\n", "2302.11082": "|**2023-02-22**|**BB-GCN: A Bi-modal Bridged Graph Convolutional Network for Multi-label Chest X-Ray Recognition**|Guoli Wang et.al.|[2302.11082v1](http://arxiv.org/abs/2302.11082v1)|null|\n", "2302.11025": "|**2023-02-21**|**Asteroseismology of $\u03b4$ Scuti stars: emulating model grids using a neural network**|Owen J. Scutt et.al.|[2302.11025v1](http://arxiv.org/abs/2302.11025v1)|null|\n", "2302.11021": "|**2023-02-21**|**MVMTnet: A Multi-variate Multi-modal Transformer for Multi-class Classification of Cardiac Irregularities Using ECG Waveforms and Clinical Notes**|Ankur Samanta et.al.|[2302.11021v1](http://arxiv.org/abs/2302.11021v1)|null|\n", "2302.10873": "|**2023-02-21**|**Context-Aware Timewise VAEs for Real-Time Vehicle Trajectory Prediction**|Pei Xu et.al.|[2302.10873v1](http://arxiv.org/abs/2302.10873v1)|**[link](https://github.com/xupei0610/contextvae)**|\n", "2302.10859": "|**2023-02-21**|**SF2Former: Amyotrophic Lateral Sclerosis Identification From Multi-center MRI Data Using Spatial and Frequency Fusion Transformer**|Rafsanjany Kushol et.al.|[2302.10859v1](http://arxiv.org/abs/2302.10859v1)|**[link](https://github.com/raoyongming/GFNet)**|\n", "2302.10813": "|**2023-02-21**|**Tracking Objects and Activities with Attention for Temporal Sentence Grounding**|Zeyu Xiong et.al.|[2302.10813v1](http://arxiv.org/abs/2302.10813v1)|null|\n", "2302.10632": "|**2023-02-23**|**Multi-Modal Self-Supervised Learning for Recommendation**|Wei Wei et.al.|[2302.10632v2](http://arxiv.org/abs/2302.10632v2)|**[link](https://github.com/hkuds/mmssl)**|\n", "2302.10511": "|**2023-02-21**|**MVFusion: Multi-View 3D Object Detection with Semantic-aligned Radar and Camera Fusion**|Zizhang Wu et.al.|[2302.10511v1](http://arxiv.org/abs/2302.10511v1)|null|\n", "2302.10465": "|**2023-02-21**|**A Flexible Multi-view Multi-modal Imaging System for Outdoor Scenes**|Meng Zhang et.al.|[2302.10465v1](http://arxiv.org/abs/2302.10465v1)|null|\n", "2302.10035": "|**2023-02-20**|**Large-scale Multi-Modal Pre-trained Models: A Comprehensive Survey**|Xiao Wang et.al.|[2302.10035v1](http://arxiv.org/abs/2302.10035v1)|**[link](https://github.com/wangxiao5791509/multimodal_bigmodels_survey)**|\n", "2302.09934": "|**2023-02-20**|**CISum: Learning Cross-modality Interaction to Enhance Multimodal Semantic Coverage for Multimodal Summarization**|Litian Zhang et.al.|[2302.09934v1](http://arxiv.org/abs/2302.09934v1)|null|\n", "2302.09850": "|**2023-02-20**|**Constraint and Union for Partially-Supervised Temporal Sentence Grounding**|Chen Ju et.al.|[2302.09850v1](http://arxiv.org/abs/2302.09850v1)|null|\n", "2302.09636": "|**2023-02-19**|**Interpretable Medical Image Visual Question Answering via Multi-Modal Relationship Graph Learning**|Xinyue Hu et.al.|[2302.09636v1](http://arxiv.org/abs/2302.09636v1)|null|\n", "2302.09328": "|**2023-02-18**|**SSVMR: Saliency-based Self-training for Video-Music Retrieval**|Xuxin Cheng et.al.|[2302.09328v1](http://arxiv.org/abs/2302.09328v1)|null|\n", "2302.08958": "|**2023-02-17**|**Towards Unifying Medical Vision-and-Language Pre-training via Soft Prompts**|Zhihong Chen et.al.|[2302.08958v1](http://arxiv.org/abs/2302.08958v1)|**[link](https://github.com/zhjohnchan/ptunifier)**|\n", "2302.08888": "|**2023-02-17**|**Multimodal Federated Learning via Contrastive Representation Ensemble**|Qiying Yu et.al.|[2302.08888v1](http://arxiv.org/abs/2302.08888v1)|**[link](https://github.com/flair-thu/creamfl)**|\n", "2302.08820": "|**2023-02-17**|**Understanding Stationary and Moving Direct Skin Vibrotactile Stimulation on the Palm**|Hesham Elsayed et.al.|[2302.08820v1](http://arxiv.org/abs/2302.08820v1)|null|\n", "2302.08774": "|**2023-02-17**|**Vision, Deduction and Alignment: An Empirical Study on Multi-modal Knowledge Graph Alignment**|Yangning Li et.al.|[2302.08774v1](http://arxiv.org/abs/2302.08774v1)|null|\n", "2302.08706": "|**2023-02-20**|**Fine-grained Cross-modal Fusion based Refinement for Text-to-Image Synthesis**|Haoran Sun et.al.|[2302.08706v2](http://arxiv.org/abs/2302.08706v2)|**[link](https://github.com/haoranhfut/ff-gan)**|\n", "2302.08670": "|**2023-02-17**|**Cascaded information enhancement and cross-modal attention feature fusion for multispectral pedestrian detection**|Yang Yang et.al.|[2302.08670v1](http://arxiv.org/abs/2302.08670v1)|null|\n", "2302.09302": "|**2023-02-16**|**Bridge the Gap between Language models and Tabular Understanding**|Nuo Chen et.al.|[2302.09302v1](http://arxiv.org/abs/2302.09302v1)|null|\n", "2302.08326": "|**2023-02-16**|**NUAA-QMUL-AIIT at Memotion 3: Multi-modal Fusion with Squeeze-and-Excitation for Internet Meme Emotion Analysis**|Xiaoyu Guo et.al.|[2302.08326v1](http://arxiv.org/abs/2302.08326v1)|**[link](https://github.com/xxxxxxxxy/memotion3-SEFusion)**|\n", "2302.08212": "|**2023-02-16**|**Visible-Infrared Person Re-Identification via Patch-Mixed Cross-Modality Learning**|Zhihao Qian et.al.|[2302.08212v1](http://arxiv.org/abs/2302.08212v1)|null|\n", "2302.08180": "|**2023-02-16**|**Cross Modal Distillation for Flood Extent Mapping**|Shubhika Garg et.al.|[2302.08180v1](http://arxiv.org/abs/2302.08180v1)|null|\n", "2302.08052": "|**2023-02-16**|**Hierarchical Cross-modal Transformer for RGB-D Salient Object Detection**|Hao Chen et.al.|[2302.08052v1](http://arxiv.org/abs/2302.08052v1)|null|\n", "2302.08020": "|**2023-02-16**|**All-Electrical Skyrmionic Bits in a Chiral Magnetic Tunnel Junction**|Shaohai Chen et.al.|[2302.08020v1](http://arxiv.org/abs/2302.08020v1)|null|\n", "2302.08016": "|**2023-02-16**|**Unsupervised Domain Adaptation for MRI Volume Segmentation and Classification Using Image-to-Image Translation**|Satoshi Kondo et.al.|[2302.08016v1](http://arxiv.org/abs/2302.08016v1)|null|\n", "2302.07919": "|**2023-02-15**|**COVID-VTS: Fact Extraction and Verification on Short Video Platforms**|Fuxiao Liu et.al.|[2302.07919v1](http://arxiv.org/abs/2302.07919v1)|**[link](https://github.com/fuxiaoliu/twitter-video-dataset)**|\n", "2302.07702": "|**2023-02-15**|**Audio-Visual Contrastive Learning with Temporal Self-Supervision**|Simon Jenni et.al.|[2302.07702v1](http://arxiv.org/abs/2302.07702v1)|null|\n", "2302.07693": "|**2023-02-16**|**Fine-tuning of sign language recognition models: a technical report**|Maxim Novopoltsev et.al.|[2302.07693v2](http://arxiv.org/abs/2302.07693v2)|**[link](https://github.com/ds-hub-sochi/sl-techreport)**|\n", "2302.07661": "|**2023-02-15**|**Depth- and Semantics-aware Multi-modal Domain Translation: Generating 3D Panoramic Color Images from LiDAR Point Clouds**|Tiago Cortinhal et.al.|[2302.07661v1](http://arxiv.org/abs/2302.07661v1)|**[link](https://github.com/tiagocortinhal/titan-next)**|\n", "2302.07456": "|**2023-02-15**|**Continuous-Time Fixed-Lag Smoothing for LiDAR-Inertial-Camera SLAM**|Jiajun Lv et.al.|[2302.07456v1](http://arxiv.org/abs/2302.07456v1)|**[link](https://github.com/april-zju/clic)**|\n", "2302.07269": "|**2023-02-14**|**Dual-mode adaptive-SVD ghost imaging**|Dajing Wang et.al.|[2302.07269v1](http://arxiv.org/abs/2302.07269v1)|null|\n", "2302.06914": "|**2023-02-14**|**Heterogeneous Anomaly Detection for Software Systems via Semi-supervised Cross-modal Attention**|Cheryl Lee et.al.|[2302.06914v1](http://arxiv.org/abs/2302.06914v1)|**[link](https://github.com/bebillionaireusd/hades)**|\n", "2302.10909": "|**2023-02-14**|**Multi-modal Machine Learning in Engineering Design: A Review and Future Directions**|Binyang Song et.al.|[2302.10909v1](http://arxiv.org/abs/2302.10909v1)|null|\n", "2302.06643": "|**2023-02-13**|**Vision-RADAR fusion for Robotics BEV Detections: A Survey**|Apoorv Singh et.al.|[2302.06643v1](http://arxiv.org/abs/2302.06643v1)|null|\n", "2302.06605": "|**2023-02-13**|**UniAdapter: Unified Parameter-Efficient Transfer Learning for Cross-modal Modeling**|Haoyu Lu et.al.|[2302.06605v1](http://arxiv.org/abs/2302.06605v1)|**[link](https://github.com/rerv/uniadapter)**|\n", "2302.06560": "|**2023-02-13**|**Large Scale Multi-Lingual Multi-Modal Summarization Dataset**|Yash Verma et.al.|[2302.06560v1](http://arxiv.org/abs/2302.06560v1)|**[link](https://github.com/anubhav-jangra/m3ls)**|\n", "2302.06452": "|**2023-02-13**|**Mixed Multi-Model Semantic Interaction for Graph-based Narrative Visualizations**|Brian Keith Norambuena et.al.|[2302.06452v1](http://arxiv.org/abs/2302.06452v1)|null|\n", "2302.06350": "|**2023-02-13**|**CLIP-RR: Improved CLIP Network for Relation-Focused Cross-Modal Information Retrieval**|Yan Gong et.al.|[2302.06350v1](http://arxiv.org/abs/2302.06350v1)|null|\n", "2302.06148": "|**2023-02-13**|**CoMAE: Single Model Hybrid Pre-training on Small-Scale RGB-D Datasets**|Jiange Yang et.al.|[2302.06148v1](http://arxiv.org/abs/2302.06148v1)|**[link](https://github.com/mcg-nju/comae)**|\n", "2302.12816": "|**2023-02-24**|**Floquet Analysis of Frequency Collisions**|Kentaro Heya et.al.|[2302.12816v1](http://arxiv.org/abs/2302.12816v1)|null|\n", "2302.12610": "|**2023-02-24**|**A Joint Modeling of Vision-Language-Action for Target-oriented Grasping in Clutter**|Kechun Xu et.al.|[2302.12610v1](http://arxiv.org/abs/2302.12610v1)|**[link](https://github.com/xukechun/Vision-Language-Grasping)**|\n", "2302.12552": "|**2023-02-24**|**Deep Learning for Video-Text Retrieval: a Review**|Cunjuan Zhu et.al.|[2302.12552v1](http://arxiv.org/abs/2302.12552v1)|null|\n", "2302.12258": "|**2023-02-23**|**Data leakage in cross-modal retrieval training: A case study**|Benno Weck et.al.|[2302.12258v1](http://arxiv.org/abs/2302.12258v1)|null|\n", "2302.14045": "|**2023-02-27**|**Language Is Not All You Need: Aligning Perception with Language Models**|Shaohan Huang et.al.|[2302.14045v1](http://arxiv.org/abs/2302.14045v1)|**[link](https://github.com/microsoft/unilm)**|\n", "2302.14042": "|**2023-02-27**|**Knowledge-enhanced Pre-training for Auto-diagnosis of Chest Radiology Images**|Xiaoman Zhang et.al.|[2302.14042v1](http://arxiv.org/abs/2302.14042v1)|null|\n", "2302.14007": "|**2023-02-27**|**Joint-MAE: 2D-3D Joint Masked Autoencoders for 3D Point Cloud Pre-training**|Ziyu Guo et.al.|[2302.14007v1](http://arxiv.org/abs/2302.14007v1)|null|\n", "2302.13838": "|**2023-02-27**|**Cross-modal Face- and Voice-style Transfer**|Naoya Takahashi et.al.|[2302.13838v1](http://arxiv.org/abs/2302.13838v1)|null|\n", "2302.13668": "|**2023-02-27**|**Contrastive Video Question Answering via Video Graph Transformer**|Junbin Xiao et.al.|[2302.13668v1](http://arxiv.org/abs/2302.13668v1)|**[link](https://github.com/doc-doc/covgt)**|\n", "2302.13321": "|**2023-02-26**|**Multi-Modality in Music: Predicting Emotion in Music from High-Level Audio Features and Lyrics**|Tibor Krols et.al.|[2302.13321v1](http://arxiv.org/abs/2302.13321v1)|**[link](https://github.com/tibor-krols/cogsci2-spotify)**|\n", "2302.13311": "|**2023-02-26**|**Understanding Social Media Cross-Modality Discourse in Linguistic Space**|Chunpu Xu et.al.|[2302.13311v1](http://arxiv.org/abs/2302.13311v1)|**[link](https://github.com/cpaaax/multimodal_discourse)**|\n", "2302.13187": "|**2023-02-25**|**Tractable Diversity: Scalable Multiperspective Ontology Management via Standpoint EL**|Luc\u00eda G\u00f3mez \u00c1lvarez et.al.|[2302.13187v1](http://arxiv.org/abs/2302.13187v1)|null|\n", "2302.13094": "|**2023-02-25**|**Knowledge-infused Contrastive Learning for Urban Imagery-based Socioeconomic Prediction**|Yu Liu et.al.|[2302.13094v1](http://arxiv.org/abs/2302.13094v1)|**[link](https://github.com/tsinghua-fib-lab/urbankg-knowcl)**|\n", "2302.12971": "|**2023-02-25**|**BrainCLIP: Bridging Brain and Visual-Linguistic Representation via CLIP for Generic Natural Visual Stimulus Decoding from fMRI**|Yulong Liu et.al.|[2302.12971v1](http://arxiv.org/abs/2302.12971v1)|**[link](https://github.com/YulongBonjour/BrainCLIP)**|\n", "2302.14785": "|**2023-02-28**|**Joint Representations of Text and Knowledge Graphs for Retrieval and Evaluation**|Teven Le Scao et.al.|[2302.14785v1](http://arxiv.org/abs/2302.14785v1)|null|\n", "2302.14777": "|**2023-02-28**|**VQA with Cascade of Self- and Co-Attention Blocks**|Aakansha Mishra et.al.|[2302.14777v1](http://arxiv.org/abs/2302.14777v1)|null|\n", "2302.14564": "|**2023-02-28**|**Exploring Self-supervised Pre-trained ASR Models For Dysarthric and Elderly Speech Recognition**|Shujie Hu et.al.|[2302.14564v1](http://arxiv.org/abs/2302.14564v1)|null|\n", "2302.14418": "|**2023-02-28**|**PCR-CG: Point Cloud Registration via Deep Color and Geometry**|Yu Zhang et.al.|[2302.14418v1](http://arxiv.org/abs/2302.14418v1)|**[link](https://github.com/gardlin/pcr-cg)**|\n", "2302.14264": "|**2023-02-28**|**RGB-D Grasp Detection via Depth Guided Learning with Cross-modal Attention**|Ran Qin et.al.|[2302.14264v1](http://arxiv.org/abs/2302.14264v1)|null|\n", "2302.14115": "|**2023-02-27**|**Vid2Seq: Large-Scale Pretraining of a Visual Language Model for Dense Video Captioning**|Antoine Yang et.al.|[2302.14115v1](http://arxiv.org/abs/2302.14115v1)|**[link](https://github.com/google-research/scenic/tree/main/scenic/projects/vid2seq)**|\n", "2302.14082": "|**2023-02-27**|**Detecting and Mitigating Mode-Collapse for Flow-based Sampling of Lattice Field Theories**|Kim A. Nicoli et.al.|[2302.14082v1](http://arxiv.org/abs/2302.14082v1)|null|\n", "2303.00720": "|**2023-03-01**|**Cross-Modal Entity Matching for Visually Rich Documents**|Ritesh Sarkhel et.al.|[2303.00720v1](http://arxiv.org/abs/2303.00720v1)|null|\n", "2303.00534": "|**2023-03-01**|**RAMM: Retrieval-augmented Biomedical Visual Question Answering with Multi-modal Pre-training**|Zheng Yuan et.al.|[2303.00534v1](http://arxiv.org/abs/2303.00534v1)|**[link](https://github.com/GanjinZero/RAMM)**|\n", "2303.00462": "|**2023-03-02**|**Hidden Gems: 4D Radar Scene Flow Learning Using Cross-Modal Supervision**|Fangqiang Ding et.al.|[2303.00462v2](http://arxiv.org/abs/2303.00462v2)|**[link](https://github.com/toytiny/cmflow)**|\n", "2303.00448": "|**2023-03-01**|**The style transformer with common knowledge optimization for image-text retrieval**|Wenrui Li et.al.|[2303.00448v1](http://arxiv.org/abs/2303.00448v1)|null|\n", "2303.00369": "|**2023-03-02**|**Indescribable Multi-modal Spatial Evaluator**|Lingke Kong et.al.|[2303.00369v2](http://arxiv.org/abs/2303.00369v2)|**[link](https://github.com/kid-liet/imse)**|\n", "2303.00289": "|**2023-03-01**|**StrucTexTv2: Masked Visual-Textual Prediction for Document Image Pre-training**|Yuechen Yu et.al.|[2303.00289v1](http://arxiv.org/abs/2303.00289v1)|**[link](https://github.com/PaddlePaddle/VIMER/tree/main/StrucTexT/v2)**|\n", "2303.00277": "|**2023-03-01**|**UAV Tracking with Lidar as a Camera Sensors in GNSS-Denied Environments**|Ha Sier et.al.|[2303.00277v1](http://arxiv.org/abs/2303.00277v1)|**[link](https://github.com/tiers/uav-tracking-based-on-lidar-as-a-camera)**|\n", "2303.00233": "|**2023-03-01**|**Single-Cell Multimodal Prediction via Transformers**|Wenzhuo Tang et.al.|[2303.00233v1](http://arxiv.org/abs/2303.00233v1)|**[link](https://github.com/omicsml/scmoformer)**|\n", "2303.00200": "|**2023-03-01**|**Feature Extraction Matters More: Universal Deepfake Disruption through Attacking Ensemble Feature Extractors**|Long Tang et.al.|[2303.00200v1](http://arxiv.org/abs/2303.00200v1)|null|\n", "2303.00073": "|**2023-02-28**|**Cross-correlated quantum thermometry using diamond containing dual-defect centers**|Madhav Gupta et.al.|[2303.00073v1](http://arxiv.org/abs/2303.00073v1)|null|\n", "2303.00040": "|**2023-02-28**|**Towards Generalisable Video Moment Retrieval: Visual-Dynamic Injection to Image-Text Pre-Training**|Dezhao Luo et.al.|[2303.00040v1](http://arxiv.org/abs/2303.00040v1)|null|\n", "2303.01480": "|**2023-03-02**|**Delivering Arbitrary-Modal Semantic Segmentation**|Jiaming Zhang et.al.|[2303.01480v1](http://arxiv.org/abs/2303.01480v1)|**[link](https://github.com/jamycheung/DELIVER)**|\n", "2303.01311": "|**2023-03-02**|**Zero-Shot Text-to-Parameter Translation for Game Character Auto-Creation**|Rui Zhao et.al.|[2303.01311v1](http://arxiv.org/abs/2303.01311v1)|null|\n", "2303.01310": "|**2023-03-02**|**Learning Language-Conditioned Deformable Object Manipulation with Graph Dynamics**|Kai Mo et.al.|[2303.01310v1](http://arxiv.org/abs/2303.01310v1)|null|\n", "2303.01217": "|**2023-03-02**|**Synthetic Misinformers: Generating and Combating Multimodal Misinformation**|Stefanos-Iordanis Papadopoulos et.al.|[2303.01217v1](http://arxiv.org/abs/2303.01217v1)|null|\n", "2303.01043": "|**2023-03-02**|**I2P-Rec: Recognizing Images on Large-scale Point Cloud Maps through Bird's Eye View Projections**|Yixuan Li et.al.|[2303.01043v1](http://arxiv.org/abs/2303.01043v1)|null|\n", "2303.00882": "|**2023-03-02**|**X-Ray2EM: Uncertainty-Aware Cross-Modality Image Reconstruction from X-Ray to Electron Microscopy in Connectomics**|Yicong Li et.al.|[2303.00882v1](http://arxiv.org/abs/2303.00882v1)|null|\n", "2303.00865": "|**2023-03-01**|**AMIGO: Sparse Multi-Modal Graph Transformer with Shared-Context Processing for Representation Learning of Giga-pixel Images**|Ramin Nakhli et.al.|[2303.00865v1](http://arxiv.org/abs/2303.00865v1)|**[link](https://github.com/raminnakhli/amigo)**|\n", "2303.00806": "|**2023-03-01**|**Survival modelling of smartphone trigger data for earthquake parameter estimation in early warning. With applications to 2023 Turkish-Syrian and 2019 Ridgecrest events**|Luca Aiello et.al.|[2303.00806v1](http://arxiv.org/abs/2303.00806v1)|null|\n", "2303.02139": "|**2023-03-03**|**Data Association Aware POMDP Planning with Hypothesis Pruning Performance Guarantees**|Moran Barenboim et.al.|[2303.02139v1](http://arxiv.org/abs/2303.02139v1)|null|\n", "2303.01933": "|**2023-03-03**|**BogieCopter: A Multi-Modal Aerial-Ground Vehicle for Long-Endurance Inspection Applications**|Teodoro Dias et.al.|[2303.01933v1](http://arxiv.org/abs/2303.01933v1)|null|\n", "2303.01510": "|**2023-03-02**|**INO at Factify 2: Structure Coherence based Multi-Modal Fact Verification**|Yinuo Zhang et.al.|[2303.01510v1](http://arxiv.org/abs/2303.01510v1)|**[link](https://github.com/catrin-baze/ino-of-factify)**|\n", "2303.03378": "|**2023-03-06**|**PaLM-E: An Embodied Multimodal Language Model**|Danny Driess et.al.|[2303.03378v1](http://arxiv.org/abs/2303.03378v1)|null|\n", "2303.03131": "|**2023-03-08**|**Video Question Answering Using CLIP-Guided Visual-Text Attention**|Shuhong Ye et.al.|[2303.03131v2](http://arxiv.org/abs/2303.03131v2)|null|\n", "2303.03093": "|**2023-03-06**|**A Miniaturised Camera-based Multi-Modal Tactile Sensor**|Kaspar Althoefer et.al.|[2303.03093v1](http://arxiv.org/abs/2303.03093v1)|null|\n", "2303.03056": "|**2023-03-07**|**MOISST: Multi-modal Optimization of Implicit Scene for SpatioTemporal calibration**|Quentin Herau et.al.|[2303.03056v2](http://arxiv.org/abs/2303.03056v2)|null|\n", "2303.03032": "|**2023-03-06**|**DeCap: Decoding CLIP Latents for Zero-Shot Captioning via Text-Only Training**|Wei Li et.al.|[2303.03032v1](http://arxiv.org/abs/2303.03032v1)|**[link](https://github.com/dhg-wei/decap)**|\n", "2303.02995": "|**2023-03-06**|**HiCLIP: Contrastive Language-Image Pretraining with Hierarchy-aware Attention**|Shijie Geng et.al.|[2303.02995v1](http://arxiv.org/abs/2303.02995v1)|**[link](https://github.com/jeykigung/hiclip)**|\n", "2303.02976": "|**2023-03-06**|**Dronument: System for Reliable Deployment of Micro Aerial Vehicles in Dark Areas of Large Historical Monuments**|Pavel Petracek et.al.|[2303.02976v1](http://arxiv.org/abs/2303.02976v1)|null|\n", "2303.02688": "|**2023-03-05**|**Text2Face: A Multi-Modal 3D Face Model**|Will Rowan et.al.|[2303.02688v1](http://arxiv.org/abs/2303.02688v1)|null|\n", "2303.02684": "|**2023-03-05**|**Robust Multi-Modal Multi-LiDAR-Inertial Odometry and Mapping for Indoor Environments**|Li Qingqing et.al.|[2303.02684v1](http://arxiv.org/abs/2303.02684v1)|**[link](https://github.com/tiers/multi-modal-loam)**|\n", "2303.02506": "|**2023-03-04**|**Prismer: A Vision-Language Model with An Ensemble of Experts**|Shikun Liu et.al.|[2303.02506v1](http://arxiv.org/abs/2303.02506v1)|**[link](https://github.com/nvlabs/prismer)**|\n", "2303.02483": "|**2023-03-04**|**FAME-ViL: Multi-Tasking Vision-Language Model for Heterogeneous Fashion Tasks**|Xiao Han et.al.|[2303.02483v1](http://arxiv.org/abs/2303.02483v1)|**[link](https://github.com/brandonhanx/fame-vil)**|\n", "2303.02479": "|**2023-03-04**|**Chronic Kidney Disease of Unknown Aetiolgy (CKDu)-the search for causes and the impact of its politicization**|Chandre Dharma-wardana et.al.|[2303.02479v1](http://arxiv.org/abs/2303.02479v1)|null|\n", "2303.02407": "|**2023-03-04**|**Local Navigation Among Movable Obstacles with Deep Reinforcement Learning**|Linghong Yao et.al.|[2303.02407v1](http://arxiv.org/abs/2303.02407v1)|null|\n", "2303.02323": "|**2023-03-04**|**APE: An Open and Shared Annotated Dataset for Learning Urban Pedestrian Path Networks**|Yuxiang Zhang et.al.|[2303.02323v1](http://arxiv.org/abs/2303.02323v1)|null|\n", "2303.02203": "|**2023-03-03**|**X$^3$KD: Knowledge Distillation Across Modalities, Tasks and Stages for Multi-Camera 3D Object Detection**|Marvin Klingner et.al.|[2303.02203v1](http://arxiv.org/abs/2303.02203v1)|null|\n", "2303.03991": "|**2023-03-07**|**OpenOccupancy: A Large Scale Benchmark for Surrounding Semantic Occupancy Perception**|Xiaofeng Wang et.al.|[2303.03991v1](http://arxiv.org/abs/2303.03991v1)|**[link](https://github.com/jeffwang987/openoccupancy)**|\n", "2303.03878": "|**2023-03-07**|**A convergence analysis of a structure-preserving gradient flow method for the all-electron Kohn-Sham model**|Yedan Shen et.al.|[2303.03878v1](http://arxiv.org/abs/2303.03878v1)|null|\n", "2303.03595": "|**2023-03-07**|**LoGoNet: Towards Accurate 3D Object Detection with Local-to-Global Cross-Modal Fusion**|Xin Li et.al.|[2303.03595v1](http://arxiv.org/abs/2303.03595v1)|**[link](https://github.com/sankin97/logonet)**|\n", "2303.03449": "|**2023-03-06**|**Dual-encoded magnetization transfer and diffusion imaging and its application to tract-specific microstructure mapping**|Ilana R Leppert et.al.|[2303.03449v1](http://arxiv.org/abs/2303.03449v1)|**[link](https://github.com/tardiflab/mt-diff)**|\n", "2303.04748": "|**2023-03-08**|**CLIP-FO3D: Learning Free Open-world 3D Scene Representations from 2D Dense CLIP**|Junbo Zhang et.al.|[2303.04748v1](http://arxiv.org/abs/2303.04748v1)|null|\n", "2303.04585": "|**2023-03-08**|**New Audio Representations Image Gan Generation from BriVL**|Sen Fang et.al.|[2303.04585v1](http://arxiv.org/abs/2303.04585v1)|**[link](https://github.com/fangsen9000/brivl-generation)**|\n", "2303.04439": "|**2023-03-08**|**A Light Weight Model for Active Speaker Detection**|Junhua Liao et.al.|[2303.04439v1](http://arxiv.org/abs/2303.04439v1)|**[link](https://github.com/junhua-liao/light-asd)**|\n", "2303.04398": "|**2023-03-08**|**Implications of Personality on Cognitive Workload, Affect, and Task Performance in Robot Remote Control**|Go-Eum Cha et.al.|[2303.04398v1](http://arxiv.org/abs/2303.04398v1)|null|\n", "2303.04364": "|**2023-03-08**|**Dynamic Scenario Representation Learning for Motion Forecasting with Heterogeneous Graph Convolutional Recurrent Networks**|Xing Gao et.al.|[2303.04364v1](http://arxiv.org/abs/2303.04364v1)|null|\n", "2303.05499": "|**2023-03-10**|**Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection**|Shilong Liu et.al.|[2303.05499v2](http://arxiv.org/abs/2303.05499v2)|**[link](https://github.com/idea-research/groundingdino)**|\n", "2303.05338": "|**2023-03-11**|**MMCosine: Multi-Modal Cosine Loss Towards Balanced Audio-Visual Fine-Grained Learning**|Ruize Xu et.al.|[2303.05338v2](http://arxiv.org/abs/2303.05338v2)|null|\n", "2303.05313": "|**2023-03-09**|**Replacement as a Self-supervision for Fine-grained Vision-language Pre-training**|Lisai Zhang et.al.|[2303.05313v1](http://arxiv.org/abs/2303.05313v1)|null|\n", "2303.05309": "|**2023-03-09**|**MixSpeech: Cross-Modality Self-Learning with Audio-Visual Stream Mixup for Visual Speech Translation and Recognition**|Xize Cheng et.al.|[2303.05309v1](http://arxiv.org/abs/2303.05309v1)|**[link](https://github.com/exgc/avmust-ted)**|\n", "2303.05193": "|**2023-03-09**|**GOATS: Goal Sampling Adaptation for Scooping with Curriculum Reinforcement Learning**|Yaru Niu et.al.|[2303.05193v1](http://arxiv.org/abs/2303.05193v1)|null|\n", "2303.05093": "|**2023-03-09**|**Improving Video Retrieval by Adaptive Margin**|Feng He et.al.|[2303.05093v1](http://arxiv.org/abs/2303.05093v1)|null|\n", "2303.05026": "|**2023-03-09**|**SSL^2: Self-Supervised Learning meets Semi-Supervised Learning: Multiple Sclerosis Segmentation in 7T-MRI from large-scale 3T-MRI**|Jiacheng Wang et.al.|[2303.05026v1](http://arxiv.org/abs/2303.05026v1)|null|\n", "2303.04955": "|**2023-03-09**|**Exploring Smart Commercial Building Occupants' Perceptions and Notification Preferences of Internet of Things Data Collection in the United States**|Tu Le et.al.|[2303.04955v1](http://arxiv.org/abs/2303.04955v1)|null|\n", "2303.06129": "|**2023-03-10**|**Single-branch Network for Multimodal Training**|Muhammad Saad Saeed et.al.|[2303.06129v1](http://arxiv.org/abs/2303.06129v1)|**[link](https://github.com/msaadsaeed/sbnet)**|\n", "2303.05952": "|**2023-03-10**|**Understanding and Constructing Latent Modality Structures in Multi-modal Representation Learning**|Qian Jiang et.al.|[2303.05952v1](http://arxiv.org/abs/2303.05952v1)|null|\n", "2303.05936": "|**2023-03-10**|**Learning Decoupled Multi-touch Force Estimation, Localization and Stretch for Soft Capacitive E-skin**|Abu Bakar Dawood et.al.|[2303.05936v1](http://arxiv.org/abs/2303.05936v1)|null|\n", "2303.05793": "|**2023-03-10**|**Analyzing covariate clustering effects in healthcare cost subgroups: insights and applications for prediction**|Zhengxiao Li et.al.|[2303.05793v1](http://arxiv.org/abs/2303.05793v1)|**[link](https://github.com/huangyf2217/fmr-covariates-clustering)**|\n", "2303.05725": "|**2023-03-10**|**CVT-SLR: Contrastive Visual-Textual Transformation for Sign Language Recognition with Variational Alignment**|Jiangbin Zheng et.al.|[2303.05725v1](http://arxiv.org/abs/2303.05725v1)|**[link](https://github.com/binbinjiang/cvt-slr)**|\n", "2303.05714": "|**2023-03-10**|**Simultaneous estimation of multiple eigenvalues with short-depth quantum circuit on early fault-tolerant quantum computers**|Zhiyan Ding et.al.|[2303.05714v1](http://arxiv.org/abs/2303.05714v1)|null|\n", "2303.05707": "|**2023-03-10**|**MuLTI: Efficient Video-and-Language Understanding with MultiWay-Sampler and Multiple Choice Modeling**|Jiaqi Xu et.al.|[2303.05707v1](http://arxiv.org/abs/2303.05707v1)|null|\n", "2303.07284": "|**2023-03-13**|**Align and Attend: Multimodal Summarization with Dual Contrastive Losses**|Bo He et.al.|[2303.07284v1](http://arxiv.org/abs/2303.07284v1)|**[link](https://github.com/boheumd/A2Summ)**|\n", "2303.07274": "|**2023-03-14**|**Breaking Common Sense: WHOOPS! A Vision-and-Language Benchmark of Synthetic and Compositional Images**|Nitzan Bitton-Guetta et.al.|[2303.07274v2](http://arxiv.org/abs/2303.07274v2)|null|\n", "2303.07265": "|**2023-03-13**|**Multimodal Reinforcement Learning for Robots Collaborating with Humans**|Afagh Mehri Shervedani et.al.|[2303.07265v1](http://arxiv.org/abs/2303.07265v1)|null|\n", "2303.07064": "|**2023-03-13**|**A Generalized Multi-Modal Fusion Detection Framework**|Leichao Cui et.al.|[2303.07064v1](http://arxiv.org/abs/2303.07064v1)|null|\n", "2303.07000": "|**2023-03-13**|**Predicting Density of States via Multi-modal Transformer**|Namkyeong Lee et.al.|[2303.07000v1](http://arxiv.org/abs/2303.07000v1)|**[link](https://github.com/heewoongnoh/dostransformer)**|\n", "2303.06947": "|**2023-03-13**|**A Multi-Modal Simulation Framework to Enable Digital Twin-based V2X Communications in Dynamic Environments**|Lorenzo Cazzella et.al.|[2303.06947v1](http://arxiv.org/abs/2303.06947v1)|null|\n", "2303.06840": "|**2023-03-13**|**DDFM: Denoising Diffusion Model for Multi-Modality Image Fusion**|Zixiang Zhao et.al.|[2303.06840v1](http://arxiv.org/abs/2303.06840v1)|**[link](https://github.com/zhaozixiang1228/mmif-ddfm)**|\n", "2303.06662": "|**2023-03-12**|**Fuzzy Alignments in Directed Acyclic Graph for Non-Autoregressive Machine Translation**|Zhengrui Ma et.al.|[2303.06662v1](http://arxiv.org/abs/2303.06662v1)|**[link](https://github.com/ictnlp/fa-dat)**|\n", "2303.06555": "|**2023-03-12**|**One Transformer Fits All Distributions in Multi-Modal Diffusion at Scale**|Fan Bao et.al.|[2303.06555v1](http://arxiv.org/abs/2303.06555v1)|**[link](https://github.com/thu-ml/unidiffuser)**|\n", "2303.06536": "|**2023-03-12**|**AutoOptLib: A Library of Automatically Designing Metaheuristic Optimization Algorithms in MATLAB**|Qi Zhao et.al.|[2303.06536v1](http://arxiv.org/abs/2303.06536v1)|**[link](https://github.com/qz89/AutoOpt)**|\n", "2303.06464": "|**2023-03-11**|**PARASOL: Parametric Style Control for Diffusion Image Synthesis**|Gemma Canet Tarr\u00e9s et.al.|[2303.06464v1](http://arxiv.org/abs/2303.06464v1)|null|\n", "2303.06422": "|**2023-03-11**|**An approximate control variates approach to multifidelity distribution estimation**|Ruijian Han et.al.|[2303.06422v1](http://arxiv.org/abs/2303.06422v1)|null|\n", "2303.06398": "|**2023-03-11**|**Variational Gaussian filtering via Wasserstein gradient flows**|Adrie Corenflos et.al.|[2303.06398v1](http://arxiv.org/abs/2303.06398v1)|**[link](https://github.com/hanyas/wasserstein-flow-filter)**|\n", "2303.06378": "|**2023-03-11**|**Learning Grounded Vision-Language Representation for Versatile Understanding in Untrimmed Videos**|Teng Wang et.al.|[2303.06378v1](http://arxiv.org/abs/2303.06378v1)|**[link](https://github.com/zjr2000/gvl)**|\n", "2303.06345": "|**2023-03-11**|**Semantics-Aware Dynamic Localization and Refinement for Referring Image Segmentation**|Zhao Yang et.al.|[2303.06345v1](http://arxiv.org/abs/2303.06345v1)|null|\n", "2303.08129": "|**2023-03-14**|**PiMAE: Point Cloud and Image Interactive Masked Autoencoders for 3D Object Detection**|Anthony Chen et.al.|[2303.08129v1](http://arxiv.org/abs/2303.08129v1)|**[link](https://github.com/blvlab/pimae)**|\n", "2303.08054": "|**2023-03-15**|**Statistical Hardware Design With Multi-model Active Learning**|Alireza Ghaffari et.al.|[2303.08054v2](http://arxiv.org/abs/2303.08054v2)|null|\n", "2303.08017": "|**2023-03-14**|**Reliable Beamforming at Terahertz Bands: Are Causal Representations the Way Forward?**|Christo Kurisummoottil Thomas et.al.|[2303.08017v1](http://arxiv.org/abs/2303.08017v1)|null|\n", "2303.07896": "|**2023-03-16**|**Exploring Weakly Supervised Semantic Segmentation Ensembles for Medical Imaging Systems**|Erik Ostrowski et.al.|[2303.07896v2](http://arxiv.org/abs/2303.07896v2)|**[link](https://github.com/erikostrowski/automated_ensemble)**|\n", "2303.07775": "|**2023-03-14**|**Data-Free Sketch-Based Image Retrieval**|Abhra Chaudhuri et.al.|[2303.07775v1](http://arxiv.org/abs/2303.07775v1)|**[link](https://github.com/abhrac/data-free-sbir)**|\n", "2303.07748": "|**2023-03-14**|**Generation-Guided Multi-Level Unified Network for Video Grounding**|Xing Cheng et.al.|[2303.07748v1](http://arxiv.org/abs/2303.07748v1)|null|\n", "2303.07742": "|**2023-03-14**|**ForDigitStress: A multi-modal stress dataset employing a digital job interview scenario**|Alexander Heimerl et.al.|[2303.07742v1](http://arxiv.org/abs/2303.07742v1)|null|\n", "2303.07674": "|**2023-03-14**|**Koos Classification of Vestibular Schwannoma via Image Translation-Based Unsupervised Cross-Modality Domain Adaptation**|Tao Yang et.al.|[2303.07674v1](http://arxiv.org/abs/2303.07674v1)|null|\n", "2303.07667": "|**2023-03-14**|**Improving Music Genre Classification from multi-modal properties of music and genre correlations Perspective**|Ganghui Ru et.al.|[2303.07667v1](http://arxiv.org/abs/2303.07667v1)|null|\n", "2303.07647": "|**2023-03-15**|**Recent Advances and Applications of Machine Learning in Experimental Solid Mechanics: A Review**|Hanxun Jin et.al.|[2303.07647v2](http://arxiv.org/abs/2303.07647v2)|null|\n", "2303.07601": "|**2023-03-14**|**V2V4Real: A Real-world Large-scale Dataset for Vehicle-to-Vehicle Cooperative Perception**|Runsheng Xu et.al.|[2303.07601v1](http://arxiv.org/abs/2303.07601v1)|**[link](https://github.com/ucla-mobility/v2v4real)**|\n", "2303.07543": "|**2023-03-14**|**WDiscOOD: Out-of-Distribution Detection via Whitened Linear Discriminative Analysis**|Yiye Chen et.al.|[2303.07543v1](http://arxiv.org/abs/2303.07543v1)|**[link](https://github.com/ivalab/wdiscood)**|\n", "2303.07522": "|**2023-03-13**|**Audio Visual Language Maps for Robot Navigation**|Chenguang Huang et.al.|[2303.07522v1](http://arxiv.org/abs/2303.07522v1)|null|\n", "2303.08692": "|**2023-03-15**|**SpiderMesh: Spatial-aware Demand-guided Recursive Meshing for RGB-T Semantic Segmentation**|Siqi Fan et.al.|[2303.08692v1](http://arxiv.org/abs/2303.08692v1)|**[link](https://github.com/leofansq/spidermesh)**|\n", "2303.08600": "|**2023-03-15**|**MSeg3D: Multi-modal 3D Semantic Segmentation for Autonomous Driving**|Jiale Li et.al.|[2303.08600v1](http://arxiv.org/abs/2303.08600v1)|**[link](https://github.com/jialeli1/lidarseg3d)**|\n", "2303.08562": "|**2023-03-15**|**MGA: Medical generalist agent through text-guided knowledge transformation**|Weijian Huang et.al.|[2303.08562v1](http://arxiv.org/abs/2303.08562v1)|null|\n", "2303.08518": "|**2023-03-15**|**UPRISE: Universal Prompt Retrieval for Improving Zero-Shot Evaluation**|Daixuan Cheng et.al.|[2303.08518v1](http://arxiv.org/abs/2303.08518v1)|**[link](https://github.com/microsoft/lmops)**|\n", "2303.08419": "|**2023-03-15**|**Multi-Modal Facial Expression Recognition with Transformer-Based Fusion Networks and Dynamic Sampling**|Jun-Hwa Kim et.al.|[2303.08419v1](http://arxiv.org/abs/2303.08419v1)|null|\n", "2303.08372": "|**2023-03-15**|**Target Sound Extraction with Variable Cross-modality Clues**|Chenda Li et.al.|[2303.08372v1](http://arxiv.org/abs/2303.08372v1)|**[link](https://github.com/lichenda/multi-clue-tse-data)**|\n", "2303.08367": "|**2023-03-15**|**Uncertainty-Aware Pedestrian Trajectory Prediction via Distributional Diffusion**|Yao Liu et.al.|[2303.08367v1](http://arxiv.org/abs/2303.08367v1)|null|\n", "2303.08359": "|**2023-03-15**|**Haptics-Enabled Forceps with Multi-Modal Force Sensing: Towards Task-Autonomous Robotic Surgery**|Tangyou Liu et.al.|[2303.08359v1](http://arxiv.org/abs/2303.08359v1)|null|\n", "2303.08356": "|**2023-03-15**|**Continuous emotion recognition based on TCN and Transformer**|Weiwei Zhou et.al.|[2303.08356v1](http://arxiv.org/abs/2303.08356v1)|**[link](https://github.com/upczww/abaw5)**|\n", "2303.09463": "|**2023-03-16**|**An Autonomous System for Head-to-Head Race: Design, Implementation and Analysis; Team KAIST at the Indy Autonomous Challenge**|Chanyoung Jung et.al.|[2303.09463v1](http://arxiv.org/abs/2303.09463v1)|null|\n", "2303.09381": "|**2023-03-16**|**Multi-modal Differentiable Unsupervised Feature Selection**|Junchen Yang et.al.|[2303.09381v1](http://arxiv.org/abs/2303.09381v1)|**[link](https://github.com/jcyang34/mmdufs)**|\n", "2303.09373": "|**2023-03-16**|**3D Masked Autoencoding and Pseudo-labeling for Domain Adaptive Segmentation of Heterogeneous Infant Brain MRI**|Xuzhe Zhang et.al.|[2303.09373v1](http://arxiv.org/abs/2303.09373v1)|null|\n", "2303.09367": "|**2023-03-16**|**Goal-conditioned Offline Reinforcement Learning through State Space Partitioning**|Mianchu Wang et.al.|[2303.09367v1](http://arxiv.org/abs/2303.09367v1)|null|\n", "2303.09319": "|**2023-03-16**|**Unified Multi-Modal Latent Diffusion for Joint Subject and Text Conditional Image Generation**|Yiyang Ma et.al.|[2303.09319v1](http://arxiv.org/abs/2303.09319v1)|null|\n", "2303.09270": "|**2023-03-16**|**SpectralCLIP: Preventing Artifacts in Text-Guided Style Transfer from a Spectral Perspective**|Zipeng Xu et.al.|[2303.09270v1](http://arxiv.org/abs/2303.09270v1)|**[link](https://github.com/zipengxuc/spectralclip)**|\n", "2303.09167": "|**2023-03-16**|**Emotional Reaction Intensity Estimation Based on Multimodal Data**|Shangfei Wang et.al.|[2303.09167v1](http://arxiv.org/abs/2303.09167v1)|null|\n", "2303.09119": "|**2023-03-16**|**Taming Diffusion Models for Audio-Driven Co-Speech Gesture Generation**|Lingting Zhu et.al.|[2303.09119v1](http://arxiv.org/abs/2303.09119v1)|**[link](https://github.com/advocate99/diffgesture)**|\n", "2303.09117": "|**2023-03-16**|**Visual-Linguistic Causal Intervention for Radiology Report Generation**|Weixing Chen et.al.|[2303.09117v1](http://arxiv.org/abs/2303.09117v1)|**[link](https://github.com/wissingchen/vlci)**|\n", "2303.08942": "|**2023-03-15**|**Spherical Space Feature Decomposition for Guided Depth Map Super-Resolution**|Zixiang Zhao et.al.|[2303.08942v1](http://arxiv.org/abs/2303.08942v1)|null|\n", "2303.10056": "|**2023-03-17**|**GlueGen: Plug and Play Multi-modal Encoders for X-to-image Generation**|Can Qin et.al.|[2303.10056v1](http://arxiv.org/abs/2303.10056v1)|**[link](https://github.com/salesforce/gluegen)**|\n", "2303.10033": "|**2023-03-17**|**Multi-modal Expression Recognition with Ensemble Method**|Chuanhe Liu et.al.|[2303.10033v1](http://arxiv.org/abs/2303.10033v1)|null|\n", "2303.09858": "|**2023-03-20**|**MedLocker: A Transferable Adversarial Watermarking for Preventing Unauthorized Analysis of Medical Image Dataset**|Bangzheng Pu et.al.|[2303.09858v2](http://arxiv.org/abs/2303.09858v2)|null|\n", "2303.09830": "|**2023-03-17**|**Prototype Knowledge Distillation for Medical Segmentation with Missing Modality**|Shuai Wang et.al.|[2303.09830v1](http://arxiv.org/abs/2303.09830v1)|**[link](https://github.com/sakurajimamaiii/protokd)**|\n", "2303.09825": "|**2023-03-17**|**LCE-Calib: Automatic LiDAR-Frame/Event Camera Extrinsic Calibration With A Globally Optimal Solution**|Jianhao Jiao et.al.|[2303.09825v1](http://arxiv.org/abs/2303.09825v1)|**[link](https://github.com/hkustgz-iadc/lcecalib)**|\n", "2303.09817": "|**2023-03-17**|**Hospital Length of Stay Prediction Based on Multi-modal Data towards Trustworthy Human-AI Collaboration in Radiomics**|Hubert Baniecki et.al.|[2303.09817v1](http://arxiv.org/abs/2303.09817v1)|**[link](https://github.com/modeloriented/survex)**|\n", "2303.09800": "|**2023-03-17**|**GOOD: General Optimization-based Fusion for 3D Object Detection via LiDAR-Camera Object Candidates**|Bingqi Shen et.al.|[2303.09800v1](http://arxiv.org/abs/2303.09800v1)|null|\n", "2303.09797": "|**2023-03-17**|**MMFace4D: A Large-Scale Multi-Modal 4D Face Dataset for Audio-Driven 3D Face Animation**|Haozhe Wu et.al.|[2303.09797v1](http://arxiv.org/abs/2303.09797v1)|null|\n", "2303.09756": "|**2023-03-17**|**Video Action Recognition with Attentive Semantic Units**|Yifei Chen et.al.|[2303.09756v1](http://arxiv.org/abs/2303.09756v1)|null|\n", "2303.09733": "|**2023-03-17**|**Scribble-Supervised RGB-T Salient Object Detection**|Zhengyi Liu et.al.|[2303.09733v1](http://arxiv.org/abs/2303.09733v1)|**[link](https://github.com/liuzywen/rgbtscribble-icme2023)**|\n", "2303.09695": "|**2023-03-17**|**PersonalTailor: Personalizing 2D Pattern Design from 3D Garment Point Clouds**|Anran Qi et.al.|[2303.09695v1](http://arxiv.org/abs/2303.09695v1)|null|\n", "2303.11181": "|**2023-03-20**|**Non-Markovian paths and cycles in NFT trades**|Haaroon Yousaf et.al.|[2303.11181v1](http://arxiv.org/abs/2303.11181v1)|null|\n", "2303.11090": "|**2023-03-20**|**Scene Graph Based Fusion Network For Image-Text Retrieval**|Guoliang Wang et.al.|[2303.11090v1](http://arxiv.org/abs/2303.11090v1)|null|\n", "2303.10895": "|**2023-03-20**|**Leapfrog Diffusion Model for Stochastic Trajectory Prediction**|Weibo Mao et.al.|[2303.10895v1](http://arxiv.org/abs/2303.10895v1)|**[link](https://github.com/mediabrain-sjtu/led)**|\n", "2303.10865": "|**2023-03-21**|**Rotating Objects via In-Hand Pivoting using Vision, Force and Touch**|Shiyu Xu et.al.|[2303.10865v2](http://arxiv.org/abs/2303.10865v2)|null|\n", "2303.10849": "|**2023-03-20**|**Facial Affective Analysis based on MAE and Multi-modal Information for 5th ABAW Competition**|Wei Zhang et.al.|[2303.10849v1](http://arxiv.org/abs/2303.10849v1)|null|\n", "2303.10839": "|**2023-03-21**|**MXM-CLR: A Unified Framework for Contrastive Learning of Multifold Cross-Modal Representations**|Ye Wang et.al.|[2303.10839v2](http://arxiv.org/abs/2303.10839v2)|null|\n", "2303.10835": "|**2023-03-20**|**Bifurcation analysis of the Keynesian cross model**|Xinyu Li et.al.|[2303.10835v1](http://arxiv.org/abs/2303.10835v1)|null|\n", "2303.10826": "|**2023-03-20**|**Visual Prompt Multi-Modal Tracking**|Jiawen Zhu et.al.|[2303.10826v1](http://arxiv.org/abs/2303.10826v1)|**[link](https://github.com/jiawen-zhu/vipt)**|\n", "2303.10794": "|**2023-03-19**|**PheME: A deep ensemble framework for improving phenotype prediction from multi-modal data**|Shenghan Zhang et.al.|[2303.10794v1](http://arxiv.org/abs/2303.10794v1)|null|\n", "2303.10766": "|**2023-03-21**|**Multi-modal reward for visual relationships-based image captioning**|Ali Abedi et.al.|[2303.10766v2](http://arxiv.org/abs/2303.10766v2)|null|\n", "2303.10667": "|**2023-03-19**|**Audio-Text Models Do Not Yet Leverage Natural Language**|Ho-Hsiang Wu et.al.|[2303.10667v1](http://arxiv.org/abs/2303.10667v1)|**[link](https://github.com/hohsiangwu/preposition-synthesis)**|\n", "2303.10590": "|**2023-03-19**|**Multi-modal Facial Action Unit Detection with Large Pre-trained Models for the 5th Competition on Affective Behavior Analysis in-the-wild**|Yufeng Yin et.al.|[2303.10590v1](http://arxiv.org/abs/2303.10590v1)|null|\n", "2303.10571": "|**2023-03-19**|**CLIP4MC: An RL-Friendly Vision-Language Model for Minecraft**|Ziluo Ding et.al.|[2303.10571v1](http://arxiv.org/abs/2303.10571v1)|**[link](https://github.com/PKU-RL/CLIP4MC)**|\n", "2303.10457": "|**2023-03-18**|**Multi-Modal Continual Test-Time Adaptation for 3D Semantic Segmentation**|Haozhi Cao et.al.|[2303.10457v1](http://arxiv.org/abs/2303.10457v1)|null|\n", "2303.10406": "|**2023-03-18**|**3DQD: Generalized Deep 3D Shape Prior via Part-Discretized Diffusion Process**|Yuhan Li et.al.|[2303.10406v1](http://arxiv.org/abs/2303.10406v1)|**[link](https://github.com/colorful-liyu/3dqd)**|\n", "2303.12060": "|**2023-03-21**|**VideoXum: Cross-modal Visual and Textural Summarization of Videos**|Jingyang Lin et.al.|[2303.12060v1](http://arxiv.org/abs/2303.12060v1)|null|\n", "2303.11771": "|**2023-03-21**|**Self-Sufficient Framework for Continuous Sign Language Recognition**|Youngjoon Jang et.al.|[2303.11771v1](http://arxiv.org/abs/2303.11771v1)|null|\n", "2303.11732": "|**2023-03-21**|**Multi-modal Prompting for Low-Shot Temporal Action Localization**|Chen Ju et.al.|[2303.11732v1](http://arxiv.org/abs/2303.11732v1)|null|\n", "2303.11625": "|**2023-03-21**|**Information-containing Adversarial Perturbation for Combating Facial Manipulation Systems**|Yao Zhu et.al.|[2303.11625v1](http://arxiv.org/abs/2303.11625v1)|null|\n", "2303.12501": "|**2023-03-22**|**Cross-Modal Implicit Relation Reasoning and Aligning for Text-to-Image Person Retrieval**|Ding Jiang et.al.|[2303.12501v1](http://arxiv.org/abs/2303.12501v1)|**[link](https://github.com/anosorae/irra)**|\n", "2303.12445": "|**2023-03-22**|**MEDIMP: Medical Images and Prompts for renal transplant representation learning**|Leo Milecki et.al.|[2303.12445v1](http://arxiv.org/abs/2303.12445v1)|**[link](https://github.com/leomlck/medimp)**|\n", "2303.12423": "|**2023-03-22**|**Text with Knowledge Graph Augmented Transformer for Video Captioning**|Xin Gu et.al.|[2303.12423v1](http://arxiv.org/abs/2303.12423v1)|null|\n", "2303.12419": "|**2023-03-22**|**BiCro: Noisy Correspondence Rectification for Multi-modality Data via Bi-directional Cross-modal Similarity Consistency**|Shuo Yang et.al.|[2303.12419v1](http://arxiv.org/abs/2303.12419v1)|**[link](https://github.com/xu5zhao/bicro)**|\n", "2303.12417": "|**2023-03-22**|**CLIP^2: Contrastive Language-Image-Point Pretraining from Real-World Point Cloud Data**|Yihan Zeng et.al.|[2303.12417v1](http://arxiv.org/abs/2303.12417v1)|null|\n", "2303.12379": "|**2023-03-22**|**VMCML: Video and Music Matching via Cross-Modality Lifting**|Yi-Shan Lee et.al.|[2303.12379v1](http://arxiv.org/abs/2303.12379v1)|null|\n", "2303.12112": "|**2023-03-21**|**Positive-Augmented Constrastive Learning for Image and Video Captioning Evaluation**|Sara Sarto et.al.|[2303.12112v1](http://arxiv.org/abs/2303.12112v1)|**[link](https://github.com/aimagelab/pacscore)**|\n", "2303.13471": "|**2023-03-23**|**Egocentric Audio-Visual Object Localization**|Chao Huang et.al.|[2303.13471v1](http://arxiv.org/abs/2303.13471v1)|**[link](https://github.com/wikichao/ego-av-loc)**|\n", "2303.13455": "|**2023-03-23**|**CoBIT: A Contrastive Bi-directional Image-Text Generation Model**|Haoxuan You et.al.|[2303.13455v1](http://arxiv.org/abs/2303.13455v1)|null|\n", "2303.13430": "|**2023-03-23**|**Medical diffusion on a budget: textual inversion for medical image generation**|Bram de Wilde et.al.|[2303.13430v1](http://arxiv.org/abs/2303.13430v1)|null|\n", "2303.13371": "|**2023-03-23**|**Plug-and-Play Regulators for Image-Text Matching**|Haiwen Diao et.al.|[2303.13371v1](http://arxiv.org/abs/2303.13371v1)|**[link](https://github.com/paranioar/rcar)**|\n", "2303.13233": "|**2023-03-23**|**Visually-Prompted Language Model for Fine-Grained Scene Graph Generation in an Open World**|Qifan Yu et.al.|[2303.13233v1](http://arxiv.org/abs/2303.13233v1)|**[link](https://github.com/Yuqifan1117/CaCao)**|\n", "2303.13095": "|**2023-03-23**|**Modeling Entities as Semantic Points for Visual Information Extraction in the Wild**|Zhibo Yang et.al.|[2303.13095v1](http://arxiv.org/abs/2303.13095v1)|null|\n", "2303.13041": "|**2023-03-23**|**gDoc: Automatic Generation of Structured API Documentation**|Shujun Wang et.al.|[2303.13041v1](http://arxiv.org/abs/2303.13041v1)|null|\n", "2303.13009": "|**2023-03-23**|**MELTR: Meta Loss Transformer for Learning to Fine-tune Video Foundation Models**|Dohwan Ko et.al.|[2303.13009v1](http://arxiv.org/abs/2303.13009v1)|**[link](https://github.com/mlvlab/MELTR)**|\n", "2303.12997": "|**2023-03-23**|**FER-former: Multi-modal Transformer for Facial Expression Recognition**|Yande Li et.al.|[2303.12997v1](http://arxiv.org/abs/2303.12997v1)|null|\n", "2303.12930": "|**2023-03-24**|**Dense-Localizing Audio-Visual Events in Untrimmed Videos: A Large-Scale Benchmark and Baseline**|Tiantian Geng et.al.|[2303.12930v2](http://arxiv.org/abs/2303.12930v2)|**[link](https://github.com/ttgeng233/UnAV)**|\n", "2303.14153": "|**2023-03-24**|**Local Contrastive Learning for Medical Image Recognition**|S. A. Rizvi et.al.|[2303.14153v1](http://arxiv.org/abs/2303.14153v1)|null|\n", "2303.14139": "|**2023-03-24**|**MindDiffuser: Controlled Image Reconstruction from Human Brain Activity with Semantic and Structural Diffusion**|Yizhuo Lu et.al.|[2303.14139v1](http://arxiv.org/abs/2303.14139v1)|null|\n", "2303.14081": "|**2023-03-24**|**CoLa-Diff: Conditional Latent Diffusion Model for Multi-Modal MRI Synthesis**|Lan Jiang et.al.|[2303.14081v1](http://arxiv.org/abs/2303.14081v1)|null|\n", "2303.13885": "|**2023-03-24**|**ARKitTrack: A New Diverse Dataset for Tracking Using Mobile RGB-D Data**|Haojie Zhao et.al.|[2303.13885v1](http://arxiv.org/abs/2303.13885v1)|**[link](https://github.com/lawrence-cj/ARKitTrack)**|\n", "2303.13839": "|**2023-03-24**|**HRDoc: Dataset and Baseline Method Toward Hierarchical Reconstruction of Document Structures**|Jiefeng Ma et.al.|[2303.13839v1](http://arxiv.org/abs/2303.13839v1)|**[link](https://github.com/jfma-ustc/hrdoc)**|\n", "2303.13810": "|**2023-03-24**|**Evidence-aware multi-modal data fusion and its application to total knee replacement prediction**|Xinwen Liu et.al.|[2303.13810v1](http://arxiv.org/abs/2303.13810v1)|null|\n", "2303.15444": "|**2023-03-27**|**Quantum Multi-Model Fitting**|Matteo Farina et.al.|[2303.15444v1](http://arxiv.org/abs/2303.15444v1)|**[link](https://github.com/farinamatteo/qmmf)**|\n", "2303.15230": "|**2023-03-27**|**Troika: Multi-Path Cross-Modal Traction for Compositional Zero-Shot Learning**|Siteng Huang et.al.|[2303.15230v1](http://arxiv.org/abs/2303.15230v1)|null|\n", "2303.15219": "|**2023-03-27**|**Knowing the Distance: Understanding the Gap Between Synthetic and Real Data For Face Parsing**|Eli Friedman et.al.|[2303.15219v1](http://arxiv.org/abs/2303.15219v1)|null|\n", "2303.15103": "|**2023-03-27**|**Contrastive Learning Is Spectral Clustering On Similarity Graph**|Zhiquan Tan et.al.|[2303.15103v1](http://arxiv.org/abs/2303.15103v1)|**[link](https://github.com/yifanzhang-pro/kernel-infonce)**|\n", "2303.15083": "|**2023-03-27**|**UniDistill: A Universal Cross-Modality Knowledge Distillation Framework for 3D Object Detection in Bird's-Eye View**|Shengchao Zhou et.al.|[2303.15083v1](http://arxiv.org/abs/2303.15083v1)|**[link](https://github.com/megvii-research/cvpr2023-unidistill)**|\n", "2303.15016": "|**2023-03-27**|**Borrowing Human Senses: Comment-Aware Self-Training for Social Media Multimodal Classification**|Chunpu Xu et.al.|[2303.15016v1](http://arxiv.org/abs/2303.15016v1)|**[link](https://github.com/cpaaax/multimodal_cast)**|\n", "2303.15006": "|**2023-03-27**|**Curriculum Learning for Compositional Visual Reasoning**|Wafa Aissa et.al.|[2303.15006v1](http://arxiv.org/abs/2303.15006v1)|null|\n", "2303.14998": "|**2023-03-27**|**Multi-view Cross-Modality MR Image Translation for Vestibular Schwannoma and Cochlea Segmentation**|Bogyeong Kang et.al.|[2303.14998v1](http://arxiv.org/abs/2303.14998v1)|null|\n", "2303.14880": "|**2023-03-27**|**Toward Human-Like Social Robot Navigation: A Large-Scale, Multi-Modal, Social Human Navigation Dataset**|Duc M. Nguyen et.al.|[2303.14880v1](http://arxiv.org/abs/2303.14880v1)|null|\n", "2303.14865": "|**2023-03-27**|**Revisiting Multimodal Representation in Contrastive Learning: From Patch and Token Embeddings to Finite Discrete Tokens**|Yuxiao Chen et.al.|[2303.14865v1](http://arxiv.org/abs/2303.14865v1)|**[link](https://github.com/yuxiaochen1103/fdt)**|\n", "2303.14840": "|**2023-03-26**|**On the Importance of Accurate Geometry Data for Dense 3D Vision Tasks**|HyunJun Jung et.al.|[2303.14840v1](http://arxiv.org/abs/2303.14840v1)|**[link](https://github.com/junggy/hammer-dataset)**|\n", "2303.14768": "|**2023-03-26**|**Collaborative Noisy Label Cleaner: Learning Scene-aware Trailers for Multi-modal Highlight Detection in Movies**|Bei Gan et.al.|[2303.14768v1](http://arxiv.org/abs/2303.14768v1)|**[link](https://github.com/tencentyouturesearch/highlightdetection-clc)**|\n", "2303.14730": "|**2023-03-26**|**Semantic Neural Decoding via Cross-Modal Generation**|Xuelin Qian et.al.|[2303.14730v1](http://arxiv.org/abs/2303.14730v1)|null|\n", "2303.14666": "|**2023-03-26**|**Generalization Matters: Loss Minima Flattening via Parameter Hybridization for Efficient Online Knowledge Distillation**|Tianli Zhang et.al.|[2303.14666v1](http://arxiv.org/abs/2303.14666v1)|null|\n", "2303.14626": "|**2023-03-26**|**MRCN: A Novel Modality Restitution and Compensation Network for Visible-Infrared Person Re-identification**|Yukang Zhang et.al.|[2303.14626v1](http://arxiv.org/abs/2303.14626v1)|null|\n", "2303.16199": "|**2023-03-28**|**LLaMA-Adapter: Efficient Fine-tuning of Language Models with Zero-init Attention**|Renrui Zhang et.al.|[2303.16199v1](http://arxiv.org/abs/2303.16199v1)|**[link](https://github.com/zrrskywalker/llama-adapter)**|\n", "2303.16099": "|**2023-03-28**|**Medical Image Analysis using Deep Relational Learning**|Zhihua Liu et.al.|[2303.16099v1](http://arxiv.org/abs/2303.16099v1)|null|\n", "2303.16058": "|**2023-03-28**|**Unmasked Teacher: Towards Training-Efficient Video Foundation Models**|Kunchang Li et.al.|[2303.16058v1](http://arxiv.org/abs/2303.16058v1)|**[link](https://github.com/opengvlab/unmasked_teacher)**|\n", "2303.15932": "|**2023-03-29**|**Unify, Align and Refine: Multi-Level Semantic Alignment for Radiology Report Generation**|Yaowei Li et.al.|[2303.15932v2](http://arxiv.org/abs/2303.15932v2)|null|\n", "2303.15826": "|**2023-03-28**|**MS-MT: Multi-Scale Mean Teacher with Contrastive Unpaired Translation for Cross-Modality Vestibular Schwannoma and Cochlea Segmentation**|Ziyuan Zhao et.al.|[2303.15826v1](http://arxiv.org/abs/2303.15826v1)|null|\n", "2303.15777": "|**2023-03-28**|**Imbalance Knowledge-Driven Multi-modal Network for Land-Cover Semantic Segmentation Using Images and LiDAR Point Clouds**|Yameng Wang et.al.|[2303.15777v1](http://arxiv.org/abs/2303.15777v1)|null|\n", "2303.15770": "|**2023-03-28**|**DDMM-Synth: A Denoising Diffusion Model for Cross-modal Medical Image Synthesis with Sparse-view Measurement Embedding**|Xiaoyue Li et.al.|[2303.15770v1](http://arxiv.org/abs/2303.15770v1)|null|\n", "2303.15710": "|**2023-03-28**|**Explicit Attention-Enhanced Fusion for RGB-Thermal Perception Tasks**|Mingjian Liang et.al.|[2303.15710v1](http://arxiv.org/abs/2303.15710v1)|**[link](https://github.com/freeformrobotics/eaefnet)**|\n", "2303.16818": "|**2023-03-30**|**BEVSimDet: Simulated Multi-modal Distillation in Bird's-Eye View for Multi-view 3D Object Detection**|Haimei Zhao et.al.|[2303.16818v2](http://arxiv.org/abs/2303.16818v2)|**[link](https://github.com/vitae-transformer/bevsimdet)**|\n", "2303.16604": "|**2023-03-29**|**Bi-directional Training for Composed Image Retrieval via Text Prompt Learning**|Zheyuan Liu et.al.|[2303.16604v1](http://arxiv.org/abs/2303.16604v1)|**[link](https://github.com/Cuberick-Orion/Bi-Blip4CIR)**|\n", "2303.16541": "|**2023-03-29**|**Sounding Video Generator: A Unified Framework for Text-guided Sounding Video Generation**|Jiawei Liu et.al.|[2303.16541v1](http://arxiv.org/abs/2303.16541v1)|**[link](https://github.com/jwliu-cc/svg)**|\n", "2303.16443": "|**2023-03-29**|**A tensor based varying-coefficient model for multi-modal neuroimaging data analysis**|Pratim Guha Niyogi et.al.|[2303.16443v1](http://arxiv.org/abs/2303.16443v1)|null|\n", "2303.17561": "|**2023-03-30**|**SoftCLIP: Softer Cross-modal Alignment Makes CLIP Stronger**|Yuting Gao et.al.|[2303.17561v1](http://arxiv.org/abs/2303.17561v1)|null|\n", "2303.17531": "|**2023-03-30**|**Asymmetric Face Recognition with Cross Model Compatible Ensembles**|Ori Linial et.al.|[2303.17531v1](http://arxiv.org/abs/2303.17531v1)|null|\n", "2303.17517": "|**2023-03-30**|**Hindi as a Second Language: Improving Visually Grounded Speech with Semantically Similar Samples**|Hyeonggon Ryu et.al.|[2303.17517v1](http://arxiv.org/abs/2303.17517v1)|null|\n", "2303.17490": "|**2023-03-30**|**Sound to Visual Scene Generation by Audio-to-Visual Latent Alignment**|Kim Sung-Bin et.al.|[2303.17490v1](http://arxiv.org/abs/2303.17490v1)|null|\n", "2303.17409": "|**2023-03-30**|**Steered Mixture of Experts Regression for Image Denoising with Multi-Model-Inference**|Aytac \u00d6zkan et.al.|[2303.17409v1](http://arxiv.org/abs/2303.17409v1)|null|\n", "2303.17386": "|**2023-03-30**|**Complementary Random Masking for RGB-Thermal Semantic Segmentation**|Ukcheol Shin et.al.|[2303.17386v1](http://arxiv.org/abs/2303.17386v1)|**[link](https://github.com/UkcheolShin/CRM_RGBTSeg)**|\n", "2303.17297": "|**2023-03-30**|**Understanding the Robustness of 3D Object Detection with Bird's-Eye-View Representations in Autonomous Driving**|Zijian Zhu et.al.|[2303.17297v1](http://arxiv.org/abs/2303.17297v1)|**[link](https://github.com/zzj403/BEV_Robust)**|\n", "2303.17285": "|**2023-03-30**|**Decomposed Cross-modal Distillation for RGB-based Temporal Action Detection**|Pilhyeon Lee et.al.|[2303.17285v1](http://arxiv.org/abs/2303.17285v1)|null|\n", "2303.17169": "|**2023-03-30**|**Task-Oriented Multi-Modal Mutual Leaning for Vision-Language Models**|Sifan Long et.al.|[2303.17169v1](http://arxiv.org/abs/2303.17169v1)|null|\n", "2303.17099": "|**2023-03-30**|**BEVFusion4D: Learning LiDAR-Camera Fusion Under Bird's-Eye-View via Cross-Modality Guidance and Temporal Aggregation**|Hongxiang Cai et.al.|[2303.17099v1](http://arxiv.org/abs/2303.17099v1)|null|\n", "2303.18248": "|**2023-03-31**|**Towards Flexible Multi-modal Document Models**|Naoto Inoue et.al.|[2303.18248v1](http://arxiv.org/abs/2303.18248v1)|**[link](https://github.com/CyberAgentAILab/flex-dm)**|\n", "2303.17981": "|**2023-03-31**|**Knowledge Distillation for Feature Extraction in Underwater VSLAM**|Jinghe Yang et.al.|[2303.17981v1](http://arxiv.org/abs/2303.17981v1)|**[link](https://github.com/jinghe-mel/ufen-slam)**|\n", "2303.17859": "|**2023-03-31**|**MapFormer: Boosting Change Detection by Using Pre-change Information**|Maximilian Bernhard et.al.|[2303.17859v1](http://arxiv.org/abs/2303.17859v1)|**[link](https://github.com/mxbh/mapformer)**|\n", "2303.17811": "|**2023-04-03**|**Zero-shot Referring Image Segmentation with Global-Local Context Features**|Seonghoon Yu et.al.|[2303.17811v2](http://arxiv.org/abs/2303.17811v2)|**[link](https://github.com/seonghoon-yu/zero-shot-ris)**|\n", "2304.00932": "|**2023-04-03**|**HypLiLoc: Towards Effective LiDAR Pose Regression with Hyperbolic Fusion**|Sijie Wang et.al.|[2304.00932v1](http://arxiv.org/abs/2304.00932v1)|**[link](https://github.com/sijieaaa/hypliloc)**|\n", "2304.00827": "|**2023-04-03**|**Multi-modal Fake News Detection on Social Media via Multi-grained Information Fusion**|Yangming Zhou et.al.|[2304.00827v1](http://arxiv.org/abs/2304.00827v1)|null|\n", "2304.00788": "|**2023-04-03**|**Open-Vocabulary Point-Cloud Object Detection without 3D Annotation**|Yuheng Lu et.al.|[2304.00788v1](http://arxiv.org/abs/2304.00788v1)|**[link](https://github.com/lyhdet/ov-3det)**|\n", "2304.00719": "|**2023-04-03**|**Multi-Modal Representation Learning with Text-Driven Soft Masks**|Jaeyoo Park et.al.|[2304.00719v1](http://arxiv.org/abs/2304.00719v1)|null|\n", "2304.00670": "|**2023-04-03**|**CRN: Camera Radar Net for Accurate, Robust, Efficient 3D Perception**|Youngseok Kim et.al.|[2304.00670v1](http://arxiv.org/abs/2304.00670v1)|null|\n", "2304.00495": "|**2023-04-02**|**Multimodal Hyperspectral Image Classification via Interconnected Fusion**|Lu Huo et.al.|[2304.00495v1](http://arxiv.org/abs/2304.00495v1)|null|\n", "2304.00450": "|**2023-04-02**|**Sketch-based Video Object Localization**|Sangmin Woo et.al.|[2304.00450v1](http://arxiv.org/abs/2304.00450v1)|null|\n", "2304.00379": "|**2023-04-01**|**Improved Multimodal Fusion for Small Datasets with Auxiliary Supervision**|Gregory Holste et.al.|[2304.00379v1](http://arxiv.org/abs/2304.00379v1)|null|\n", "2304.00157": "|**2023-03-31**|**Robotic Perception of Transparent Objects: A Review**|Jiaqi Jiang et.al.|[2304.00157v1](http://arxiv.org/abs/2304.00157v1)|null|\n", "2304.01961": "|**2023-04-04**|**AToMiC: An Image/Text Retrieval Test Collection to Support Multimedia Content Creation**|Jheng-Hong Yang et.al.|[2304.01961v1](http://arxiv.org/abs/2304.01961v1)|**[link](https://github.com/trec-atomic/atomic)**|\n", "2304.01799": "|**2023-04-04**|**naplib-python: Neural Acoustic Data Processing and Analysis Tools in Python**|Gavin Mischler et.al.|[2304.01799v1](http://arxiv.org/abs/2304.01799v1)|**[link](https://github.com/naplab/naplib-python)**|\n", "2304.01705": "|**2023-04-04**|**Cross-modal tumor segmentation using generative blending augmentation and self training**|Guillaume Sall\u00e9 et.al.|[2304.01705v1](http://arxiv.org/abs/2304.01705v1)|null|\n", "2304.01603": "|**2023-04-04**|**Locate Then Generate: Bridging Vision and Language with Bounding Box for Scene-Text VQA**|Yongxin Zhu et.al.|[2304.01603v1](http://arxiv.org/abs/2304.01603v1)|null|\n", "2304.01601": "|**2023-04-04**|**Primitive Simultaneous Optimization of Similarity Metrics for Image Registration**|Diana Waldmannstetter et.al.|[2304.01601v1](http://arxiv.org/abs/2304.01601v1)|null|\n", "2304.01563": "|**2023-04-04**|**Attribute-Consistent Knowledge Graph Representation Learning for Multi-Modal Entity Alignment**|Qian Li et.al.|[2304.01563v1](http://arxiv.org/abs/2304.01563v1)|null|\n", "2304.01491": "|**2023-04-04**|**Multi model LSTM architecture for Track Association based on Automatic Identification System Data**|Md Asif Bin Syed et.al.|[2304.01491v1](http://arxiv.org/abs/2304.01491v1)|null|\n", "2304.01440": "|**2023-04-04**|**A Deep Multi-Modal Cyber-Attack Detection in Industrial Control Systems**|Sepideh Bahadoripour et.al.|[2304.01440v1](http://arxiv.org/abs/2304.01440v1)|null|\n", "2304.01430": "|**2023-04-04**|**Divided Attention: Unsupervised Multi-Object Discovery with Contextually Separated Slots**|Dong Lao et.al.|[2304.01430v1](http://arxiv.org/abs/2304.01430v1)|null|\n", "2304.01233": "|**2023-04-03**|**Multi-Modal Perceiver Language Model for Outcome Prediction in Emergency Department**|Sabri Boughorbel et.al.|[2304.01233v1](http://arxiv.org/abs/2304.01233v1)|null|\n", "2304.02556": "|**2023-04-05**|**Detecting and Grounding Multi-Modal Media Manipulation**|Rui Shao et.al.|[2304.02556v1](http://arxiv.org/abs/2304.02556v1)|**[link](https://github.com/rshaojimmy/multimodal-deepfake)**|\n", "2304.02532": "|**2023-04-05**|**Goal-Conditioned Imitation Learning using Score-based Diffusion Policies**|Moritz Reuss et.al.|[2304.02532v1](http://arxiv.org/abs/2304.02532v1)|null|\n", "2304.02419": "|**2023-04-05**|**TM2D: Bimodality Driven 3D Dance Generation via Music-Text Integration**|Kehong Gong et.al.|[2304.02419v1](http://arxiv.org/abs/2304.02419v1)|**[link](https://github.com/Garfield-kh/TM2D)**|\n", "2304.02407": "|**2023-04-05**|**Explaining Multimodal Data Fusion: Occlusion Analysis for Wilderness Mapping**|Burak Ekim et.al.|[2304.02407v1](http://arxiv.org/abs/2304.02407v1)|null|\n", "2304.02328": "|**2023-04-05**|**Enhancing Multimodal Entity and Relation Extraction with Variational Information Bottleneck**|Shiyao Cui et.al.|[2304.02328v1](http://arxiv.org/abs/2304.02328v1)|null|\n", "2304.02278": "|**2023-04-05**|**Calibrating Cross-modal Feature for Text-Based Person Searching**|Donglai Wei et.al.|[2304.02278v1](http://arxiv.org/abs/2304.02278v1)|null|\n", "2304.03047": "|**2023-04-07**|**ETPNav: Evolving Topological Planning for Vision-Language Navigation in Continuous Environments**|Dong An et.al.|[2304.03047v2](http://arxiv.org/abs/2304.03047v2)|**[link](https://github.com/marsaki/etpnav)**|\n", "2304.02991": "|**2023-04-06**|**Exploiting the Complementarity of 2D and 3D Networks to Address Domain-Shift in 3D Semantic Segmentation**|Adriano Cardace et.al.|[2304.02991v1](http://arxiv.org/abs/2304.02991v1)|**[link](https://github.com/cvlab-unibo/mm2d3d)**|\n", "2304.02948": "|**2023-04-06**|**FengWu: Pushing the Skillful Global Medium-range Weather Forecast beyond 10 Days Lead**|Kang Chen et.al.|[2304.02948v1](http://arxiv.org/abs/2304.02948v1)|null|\n", "2304.02916": "|**2023-04-06**|**Efficient Audio Captioning Transformer with Patchout and Text Guidance**|Thodoris Kouzelis et.al.|[2304.02916v1](http://arxiv.org/abs/2304.02916v1)|null|\n", "2304.02902": "|**2023-04-06**|**Towards Efficient MCMC Sampling in Bayesian Neural Networks by Exploiting Symmetry**|Jonas Gregor Wiese et.al.|[2304.02902v1](http://arxiv.org/abs/2304.02902v1)|null|\n", "2304.02853": "|**2023-04-06**|**Learning Instance-Level Representation for Large-Scale Multi-Modal Pretraining in E-commerce**|Yang Jin et.al.|[2304.02853v1](http://arxiv.org/abs/2304.02853v1)|null|\n", "2304.03669": "|**2023-04-07**|**DATE: Domain Adaptive Product Seeker for E-commerce**|Haoyuan Li et.al.|[2304.03669v1](http://arxiv.org/abs/2304.03669v1)|null|\n", "2304.03542": "|**2023-04-07**|**Better \"CMOS\" Produces Clearer Images: Learning Space-Variant Blur Estimation for Blind Image Super-Resolution**|Xuhai Chen et.al.|[2304.03542v1](http://arxiv.org/abs/2304.03542v1)|null|\n", "2304.03391": "|**2023-04-06**|**Exposing and Mitigating Spurious Correlations for Cross-Modal Retrieval**|Jae Myung Kim et.al.|[2304.03391v1](http://arxiv.org/abs/2304.03391v1)|null|\n", "2304.04523": "|**2023-04-10**|**PoseFusion: Robust Object-in-Hand Pose Estimation with SelectLSTM**|Yuyang Tu et.al.|[2304.04523v1](http://arxiv.org/abs/2304.04523v1)|null|\n", "2304.04302": "|**2023-04-09**|**Bionic Collapsible Wings in Aquatic-aerial Robot**|Xiao Xiong et.al.|[2304.04302v1](http://arxiv.org/abs/2304.04302v1)|null|\n", "2304.04298": "|**2023-04-09**|**Unsupervised Sampling Promoting for Stochastic Human Trajectory Prediction**|Guangyi Chen et.al.|[2304.04298v1](http://arxiv.org/abs/2304.04298v1)|**[link](https://github.com/viewsetting/unsupervised_sampling_promoting)**|\n", "2304.04290": "|**2023-04-09**|**Distributed Conditional GAN (discGAN) For Synthetic Healthcare Data Generation**|David Fuentes et.al.|[2304.04290v1](http://arxiv.org/abs/2304.04290v1)|null|\n", "2304.04231": "|**2023-04-09**|**CrowdCLIP: Unsupervised Crowd Counting via Vision-Language Model**|Dingkang Liang et.al.|[2304.04231v1](http://arxiv.org/abs/2304.04231v1)|**[link](https://github.com/dk-liang/crowdclip)**|\n", "2304.04187": "|**2023-04-09**|**Similarity-Aware Multimodal Prompt Learning for Fake News Detection**|Ye Jiang et.al.|[2304.04187v1](http://arxiv.org/abs/2304.04187v1)|null|\n", "2304.04113": "|**2023-04-08**|**An Automated Fully-Computational Framework to Construct Printability Maps for Additively Manufactured Metal Alloys**|Sofia Sheikh et.al.|[2304.04113v1](http://arxiv.org/abs/2304.04113v1)|null|\n", "2304.04062": "|**2023-04-08**|**Predicting multiple sclerosis disease severity with multimodal deep neural networks**|Kai Zhang et.al.|[2304.04062v1](http://arxiv.org/abs/2304.04062v1)|**[link](https://github.com/anotherkaizhang/ms)**|\n", "2304.03916": "|**2023-04-08**|**Mitigating Spurious Correlations in Multi-modal Models during Fine-tuning**|Yu Yang et.al.|[2304.03916v1](http://arxiv.org/abs/2304.03916v1)|null|\n", "2304.03910": "|**2023-04-08**|**Co-attention Propagation Network for Zero-Shot Video Object Segmentation**|Gensheng Pei et.al.|[2304.03910v1](http://arxiv.org/abs/2304.03910v1)|**[link](https://github.com/nust-machine-intelligence-laboratory/hcpn)**|\n", "2304.03897": "|**2023-04-08**|**Factify 2: A Multimodal Fake News and Satire News Dataset**|S Suryavardan et.al.|[2304.03897v1](http://arxiv.org/abs/2304.03897v1)|**[link](https://github.com/surya1701/factify-2.0)**|\n", "2304.05340": "|**2023-04-11**|**Unified Multi-Modal Image Synthesis for Missing Modality Imputation**|Yue Zhang et.al.|[2304.05340v1](http://arxiv.org/abs/2304.05340v1)|null|\n", "2304.05171": "|**2023-04-11**|**Curriculum-Based Imitation of Versatile Skills**|Maximilian Xiling Li et.al.|[2304.05171v1](http://arxiv.org/abs/2304.05171v1)|**[link](https://github.com/intuitive-robots/ml-cur)**|\n", "2304.05166": "|**2023-04-11**|**TrajFlow: Learning the Distribution over Trajectories**|Anna M\u00e9sz\u00e1ros et.al.|[2304.05166v1](http://arxiv.org/abs/2304.05166v1)|null|\n", "2304.05080": "|**2023-04-11**|**Investigating Imbalances Between SAR and Optical Utilization for Multi-Modal Urban Mapping**|Sebastian Hafner et.al.|[2304.05080v1](http://arxiv.org/abs/2304.05080v1)|null|\n", "2304.05051": "|**2023-04-11**|**FashionSAP: Symbols and Attributes Prompt for Fine-grained Fashion Vision-Language Pre-training**|Yunpeng Han et.al.|[2304.05051v1](http://arxiv.org/abs/2304.05051v1)|**[link](https://github.com/hssip/fashionsap)**|\n", "2304.05979": "|**2023-04-12**|**NaviSTAR: Socially Aware Robot Navigation with Hybrid Spatio-Temporal Graph Transformer and Preference Learning**|Weizheng Wang et.al.|[2304.05979v1](http://arxiv.org/abs/2304.05979v1)|null|\n", "2304.05754": "|**2023-04-12**|**Self-Supervised Learning with Cluster-Aware-DINO for High-Performance Robust Speaker Verification**|Bing Han et.al.|[2304.05754v1](http://arxiv.org/abs/2304.05754v1)|null|\n", "2304.05720": "|**2023-04-12**|**Towards a more comprehensive open-source model for interdisciplinary smart integrated energy systems**|B\u00e9la Wiegel et.al.|[2304.05720v1](http://arxiv.org/abs/2304.05720v1)|null|\n", "2304.05646": "|**2023-04-12**|**Modality-Invariant Representation for Infrared and Visible Image Registration**|Zhiying Jiang et.al.|[2304.05646v1](http://arxiv.org/abs/2304.05646v1)|null|\n", "2304.05645": "|**2023-04-12**|**WildRefer: 3D Object Localization in Large-scale Dynamic Scenes with Multi-modal Visual Data and Natural Language**|Zhenxiang Lin et.al.|[2304.05645v1](http://arxiv.org/abs/2304.05645v1)|null|\n", "2304.05600": "|**2023-04-12**|**Looking Similar, Sounding Different: Leveraging Counterfactual Cross-Modal Pairs for Audiovisual Representation Learning**|Nikhil Singh et.al.|[2304.05600v1](http://arxiv.org/abs/2304.05600v1)|null|\n", "2304.05523": "|**2023-04-11**|**MoMo: A shared encoder Model for text, image and multi-Modal representations**|Rakesh Chada et.al.|[2304.05523v1](http://arxiv.org/abs/2304.05523v1)|null|\n", "2304.05402": "|**2023-04-11**|**Boosting Cross-task Transferability of Adversarial Patches with Visual Relations**|Tony Ma et.al.|[2304.05402v1](http://arxiv.org/abs/2304.05402v1)|null|\n", "2304.06708": "|**2023-04-13**|**Verbs in Action: Improving verb understanding in video-language models**|Liliane Momeni et.al.|[2304.06708v1](http://arxiv.org/abs/2304.06708v1)|null|\n", "2304.06306": "|**2023-04-13**|**Efficient Multimodal Fusion via Interactive Prompting**|Yaowei Li et.al.|[2304.06306v1](http://arxiv.org/abs/2304.06306v1)|null|\n", "2304.06275": "|**2023-04-13**|**Noisy Correspondence Learning with Meta Similarity Correction**|Haochen Han et.al.|[2304.06275v1](http://arxiv.org/abs/2304.06275v1)|**[link](https://github.com/hhc1997/mscn)**|\n", "2304.06264": "|**2023-04-13**|**Loosely Coupled Odometry, UWB Ranging, and Cooperative Spatial Detection for Relative Monte-Carlo Multi-Robot Localization**|Xianjia Yu et.al.|[2304.06264v1](http://arxiv.org/abs/2304.06264v1)|**[link](https://github.com/tiers/uwb-cooperative-mrs-localization)**|\n", "2304.06051": "|**2023-04-12**|**Open-TransMind: A New Baseline and Benchmark for 1st Foundation Model Challenge of Intelligent Transportation**|Yifeng Shi et.al.|[2304.06051v1](http://arxiv.org/abs/2304.06051v1)|**[link](https://github.com/Traffic-X/Open-TransMind)**|\n", "2304.07199": "|**2023-04-14**|**CROVIA: Seeing Drone Scenes from Car Perspective via Cross-View Adaptation**|Thanh-Dat Truong et.al.|[2304.07199v1](http://arxiv.org/abs/2304.07199v1)|null|\n", "2304.07151": "|**2023-04-14**|**End-to-End Learning with Multiple Modalities for System-Optimised Renewables Nowcasting**|Rushil Vohra et.al.|[2304.07151v1](http://arxiv.org/abs/2304.07151v1)|null|\n", "2304.07147": "|**2023-04-14**|**Cross Attention Transformers for Multi-modal Unsupervised Whole-Body PET Anomaly Detection**|Ashay Patel et.al.|[2304.07147v1](http://arxiv.org/abs/2304.07147v1)|null|\n", "2304.06991": "|**2023-04-14**|**WYTIWYR: A User Intent-Aware Framework with Multi-modal Inputs for Visualization Retrieval**|Shishi Xiao et.al.|[2304.06991v1](http://arxiv.org/abs/2304.06991v1)|**[link](https://github.com/serendipitysx/wytiwyr)**|\n", "2304.06910": "|**2023-04-14**|**HCAM -- Hierarchical Cross Attention Model for Multi-modal Emotion Recognition**|Soumya Dutta et.al.|[2304.06910v1](http://arxiv.org/abs/2304.06910v1)|null|\n", "2304.06786": "|**2023-04-13**|**The future of hearing aid technology**|Volker Hohmann et.al.|[2304.06786v1](http://arxiv.org/abs/2304.06786v1)|null|\n", "2304.08345": "|**2023-04-17**|**VALOR: Vision-Audio-Language Omni-Perception Pretraining Model and Dataset**|Sihan Chen et.al.|[2304.08345v1](http://arxiv.org/abs/2304.08345v1)|**[link](https://github.com/TXH-mercury/VALOR)**|\n", "2304.08304": "|**2023-04-17**|**SDVRF: Sparse-to-Dense Voxel Region Fusion for Multi-modal 3D Object Detection**|Binglu Ren et.al.|[2304.08304v1](http://arxiv.org/abs/2304.08304v1)|null|\n", "2304.08083": "|**2023-04-17**|**Causality-aware Visual Scene Discovery for Cross-Modal Question Reasoning**|Yang Liu et.al.|[2304.08083v1](http://arxiv.org/abs/2304.08083v1)|null|\n", "2304.08072": "|**2023-04-17**|**Two-stage MR Image Segmentation Method for Brain Tumors based on Attention Mechanism**|Li Zhu et.al.|[2304.08072v1](http://arxiv.org/abs/2304.08072v1)|null|\n", "2304.08058": "|**2023-04-17**|**One-Class SVM on siamese neural network latent space for Unsupervised Anomaly Detection on brain MRI White Matter Hyperintensities**|Nicolas Pinon et.al.|[2304.08058v1](http://arxiv.org/abs/2304.08058v1)|null|\n", "2304.08054": "|**2023-04-17**|**Fed-MIWAE: Federated Imputation of Incomplete Data via Deep Generative Models**|Irene Balelli et.al.|[2304.08054v1](http://arxiv.org/abs/2304.08054v1)|null|\n", "2304.07775": "|**2023-04-16**|**Robust Cross-Modal Knowledge Distillation for Unconstrained Videos**|Wenke Xia et.al.|[2304.07775v1](http://arxiv.org/abs/2304.07775v1)|**[link](https://github.com/gewu-lab/cross-modal-distillation)**|\n", "2304.07728": "|**2023-04-16**|**TransFusionOdom: Interpretable Transformer-based LiDAR-Inertial Fusion Odometry Estimation**|Leyuan Sun et.al.|[2304.07728v1](http://arxiv.org/abs/2304.07728v1)|**[link](https://github.com/rakugenson/multi-modal-dataset-for-odometry-estimation)**|\n", "2304.07633": "|**2023-04-15**|**Detecting Out-of-Context Multimodal Misinformation with interpretable neural-symbolic model**|Yizhou Zhang et.al.|[2304.07633v1](http://arxiv.org/abs/2304.07633v1)|null|\n", "2304.07567": "|**2023-04-15**|**CoVLR: Coordinating Cross-Modal Consistency and Intra-Modal Structure for Vision-Language Retrieval**|Yang Yang et.al.|[2304.07567v1](http://arxiv.org/abs/2304.07567v1)|null|\n", "2304.07549": "|**2023-04-15**|**MA-ViT: Modality-Agnostic Vision Transformers for Face Anti-Spoofing**|Ajian Liu et.al.|[2304.07549v1](http://arxiv.org/abs/2304.07549v1)|null|\n", "2304.07387": "|**2023-04-14**|**Cross-domain Food Image-to-Recipe Retrieval by Weighted Adversarial Learning**|Bin Zhu et.al.|[2304.07387v1](http://arxiv.org/abs/2304.07387v1)|null|\n", "2304.09172": "|**2023-04-18**|**Hyperbolic Image-Text Representations**|Karan Desai et.al.|[2304.09172v1](http://arxiv.org/abs/2304.09172v1)|null|\n", "2304.09164": "|**2023-04-18**|**Structure Preserving Cycle-GAN for Unsupervised Medical Image Domain Adaptation**|Paolo Iacono et.al.|[2304.09164v1](http://arxiv.org/abs/2304.09164v1)|null|\n", "2304.08965": "|**2023-04-18**|**Unsupervised Semantic Segmentation of 3D Point Clouds via Cross-modal Distillation and Super-Voxel Clustering**|Zisheng Chen et.al.|[2304.08965v1](http://arxiv.org/abs/2304.08965v1)|**[link](https://github.com/scut-bip-lab/pointdc)**|\n", "2304.08881": "|**2023-04-18**|**Segmentation of glioblastomas in early post-operative multi-modal MRI with deep neural networks**|Ragnhild Holden Helland et.al.|[2304.08881v1](http://arxiv.org/abs/2304.08881v1)|**[link](https://github.com/dbouget/validation_metrics_computation)**|\n", "2304.08709": "|**2023-04-18**|**You Only Need Two Detectors to Achieve Multi-Modal 3D Multi-Object Tracking**|Xiyang Wang et.al.|[2304.08709v1](http://arxiv.org/abs/2304.08709v1)|**[link](https://github.com/wangxiyang2022/YONTD-MOT)**|\n", "2304.08660": "|**2023-04-17**|**(LC)$^2$: LiDAR-Camera Loop Constraints For Cross-Modal Place Recognition**|Alex Junho Lee et.al.|[2304.08660v1](http://arxiv.org/abs/2304.08660v1)|null|\n", "2304.08658": "|**2023-04-20**|**In-situ surface porosity prediction in DED (directed energy deposition) printed SS316L parts using multimodal sensor fusion**|Adithyaa Karthikeyan et.al.|[2304.08658v2](http://arxiv.org/abs/2304.08658v2)|null|\n", "2304.09801": "|**2023-04-19**|**MetaBEV: Solving Sensor Failures for BEV Detection and Map Segmentation**|Chongjian Ge et.al.|[2304.09801v1](http://arxiv.org/abs/2304.09801v1)|**[link](https://github.com/ChongjianGE/MetaBEV)**|\n", "2304.09694": "|**2023-04-19**|**CrossFusion: Interleaving Cross-modal Complementation for Noise-resistant 3D Object Detection**|Yang Yang et.al.|[2304.09694v1](http://arxiv.org/abs/2304.09694v1)|null|\n", "2304.09609": "|**2023-04-19**|**MMDR: A Result Feature Fusion Object Detection Approach for Autonomous System**|Wendong Zhang et.al.|[2304.09609v1](http://arxiv.org/abs/2304.09609v1)|null|\n", "2304.09498": "|**2023-04-19**|**Learning Robust Visual-Semantic Embedding for Generalizable Person Re-identification**|Suncheng Xiang et.al.|[2304.09498v1](http://arxiv.org/abs/2304.09498v1)|**[link](https://github.com/jeremyxsc/mmet)**|\n", "2304.09448": "|**2023-04-19**|**EC^2: Emergent Communication for Embodied Control**|Yao Mu et.al.|[2304.09448v1](http://arxiv.org/abs/2304.09448v1)|null|\n", "2304.09421": "|**2023-04-19**|**TieFake: Title-Text Similarity and Emotion-Aware Fake News Detection**|Quanjiang Guo et.al.|[2304.09421v1](http://arxiv.org/abs/2304.09421v1)|**[link](https://github.com/uestc-gqj/tiefake)**|\n", "2304.09370": "|**2023-04-19**|**Integrating Reconfigurable Foot Design, Multi-modal Contact Sensing, and Terrain Classification for Bipedal Locomotion**|Ted Tyler et.al.|[2304.09370v1](http://arxiv.org/abs/2304.09370v1)|null|\n", "2304.09322": "|**2023-04-18**|**Multi-Modality Multi-Scale Cardiovascular Disease Subtypes Classification Using Raman Image and Medical History**|Bo Yu et.al.|[2304.09322v1](http://arxiv.org/abs/2304.09322v1)|null|\n", "2304.10530": "|**2023-04-20**|**Collaborative Diffusion for Multi-Modal Face Generation and Editing**|Ziqi Huang et.al.|[2304.10530v1](http://arxiv.org/abs/2304.10530v1)|**[link](https://github.com/ziqihuangg/collaborative-diffusion)**|\n", "2304.10309": "|**2023-04-20**|**Improving Speech Translation by Cross-Modal Multi-Grained Contrastive Learning**|Hao Zhang et.al.|[2304.10309v1](http://arxiv.org/abs/2304.10309v1)|null|\n", "2304.10254": "|**2023-04-20**|**Image-text Retrieval via preserving main Semantics of Vision**|Xu Zhang et.al.|[2304.10254v1](http://arxiv.org/abs/2304.10254v1)|**[link](https://github.com/zhangxu0963/vsl)**|\n", "2304.10091": "|**2023-04-20**|**Learning CLIP Guided Visual-Text Fusion Transformer for Video-based Pedestrian Attribute Recognition**|Jun Zhu et.al.|[2304.10091v1](http://arxiv.org/abs/2304.10091v1)|**[link](https://github.com/event-ahu/vtf_par)**|\n", "2304.09941": "|**2023-04-19**|**A robust and interpretable deep learning framework for multi-modal registration via keypoints**|Alan Q. Wang et.al.|[2304.09941v1](http://arxiv.org/abs/2304.09941v1)|**[link](https://github.com/evanmy/keymorph)**|\n", "2304.09921": "|**2023-04-19**|**Regularization for distributionally robust state estimation and prediction**|Jean-S\u00e9bastien Brouillon et.al.|[2304.09921v1](http://arxiv.org/abs/2304.09921v1)|null|\n", "2304.10382": "|**2023-04-21**|**Conditional Generative Models for Learning Stochastic Processes**|Salvatore Certo et.al.|[2304.10382v2](http://arxiv.org/abs/2304.10382v2)|null|\n", "2304.11098": "|**2023-04-21**|**Generative AI-enabled Vehicular Networks: Fundamentals, Framework, and Case Study**|Ruichen Zhang et.al.|[2304.11098v1](http://arxiv.org/abs/2304.11098v1)|null|\n", "2304.11029": "|**2023-04-24**|**CLaMP: Contrastive Language-Music Pre-training for Cross-Modal Symbolic Music Information Retrieval**|Shangda Wu et.al.|[2304.11029v2](http://arxiv.org/abs/2304.11029v2)|**[link](https://github.com/microsoft/muzic/tree/main/clamp)**|\n", "2304.10893": "|**2023-04-21**|**FindVehicle and VehicleFinder: A NER dataset for natural language-based vehicle retrieval and a keyword-based cross-modal vehicle retrieval system**|Runwei Guan et.al.|[2304.10893v1](http://arxiv.org/abs/2304.10893v1)|**[link](https://github.com/guanrunwei/vehiclefinder-ctim)**|\n", "2304.10824": "|**2023-04-21**|**Rethinking Benchmarks for Cross-modal Image-text Retrieval**|Weijing Chen et.al.|[2304.10824v1](http://arxiv.org/abs/2304.10824v1)|**[link](https://github.com/cwj1412/mscoco-flikcr30k_fg)**|\n", "2304.10759": "|**2023-04-21**|**GeoLayoutLM: Geometric Pre-training for Visual Information Extraction**|Chuwei Luo et.al.|[2304.10759v1](http://arxiv.org/abs/2304.10759v1)|**[link](https://github.com/alibabaresearch/advancedliteratemachinery)**|\n", "2304.10756": "|**2023-04-21**|**Missing Modality Robustness in Semi-Supervised Multi-Modal Semantic Segmentation**|Harsh Maheshwari et.al.|[2304.10756v1](http://arxiv.org/abs/2304.10756v1)|**[link](https://github.com/harshm121/m3l)**|\n", "2304.10740": "|**2023-04-21**|**Multi-Modal Deep Learning for Credit Rating Prediction Using Text and Numerical Data Streams**|Mahsa Tavakoli et.al.|[2304.10740v1](http://arxiv.org/abs/2304.10740v1)|**[link](https://github.com/banking-analytics-lab/multimodalfusionratings)**|\n", "2304.10727": "|**2023-04-21**|**RoCOCO: Robust Benchmark MS-COCO to Stress-test Robustness of Image-Text Matching Models**|Seulki Park et.al.|[2304.10727v1](http://arxiv.org/abs/2304.10727v1)|**[link](https://github.com/pseulki/rococo)**|\n", "2304.10658": "|**2023-04-20**|**Linear to multi-linear algebra and systems using tensors**|Divyanshu Pandey et.al.|[2304.10658v1](http://arxiv.org/abs/2304.10658v1)|null|\n", "2304.10628": "|**2023-04-20**|**HM-ViT: Hetero-modal Vehicle-to-Vehicle Cooperative perception with vision transformer**|Hao Xiang et.al.|[2304.10628v1](http://arxiv.org/abs/2304.10628v1)|null|\n", "2304.10592": "|**2023-04-20**|**MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models**|Deyao Zhu et.al.|[2304.10592v1](http://arxiv.org/abs/2304.10592v1)|**[link](https://github.com/vision-cair/minigpt-4)**|\n", "2304.12269": "|**2023-04-24**|**Enriching Source Code with Contextual Data for Code Completion Models: An Empirical Study**|Tim van Dam et.al.|[2304.12269v1](http://arxiv.org/abs/2304.12269v1)|**[link](https://github.com/aise-tudelft/contextualdatacodecompletion)**|\n", "2304.12259": "|**2023-04-24**|**Imaging 3D Chemistry at 1 nm Resolution with Fused Multi-Modal Electron Tomography**|Jonathan Schwartz et.al.|[2304.12259v1](http://arxiv.org/abs/2304.12259v1)|**[link](https://github.com/jtschwar/projection_refinement)**|\n", "2304.11993": "|**2023-04-25**|**MMC: Multi-Modal Colorization of Images using Textual Descriptions**|Subhankar Ghosh et.al.|[2304.11993v2](http://arxiv.org/abs/2304.11993v2)|null|\n", "2304.11875": "|**2023-04-24**|**Underwater object classification combining SAS and transferred optical-to-SAS Imagery**|Avi Abu et.al.|[2304.11875v1](http://arxiv.org/abs/2304.11875v1)|null|\n", "2304.11829": "|**2023-04-25**|**Hierarchical Diffusion Autoencoders and Disentangled Image Manipulation**|Zeyu Lu et.al.|[2304.11829v2](http://arxiv.org/abs/2304.11829v2)|null|\n", "2304.11764": "|**2023-04-23**|**Learning-enabled multi-modal motion prediction in urban environments**|Vinicius Trentin et.al.|[2304.11764v1](http://arxiv.org/abs/2304.11764v1)|null|\n", "2304.11697": "|**2023-04-23**|**Informative Data Selection with Uncertainty for Multi-modal Object Detection**|Xinyu Zhang et.al.|[2304.11697v1](http://arxiv.org/abs/2304.11697v1)|null|\n", "2304.11618": "|**2023-04-23**|**Modality-Aware Negative Sampling for Multi-modal Knowledge Graph Embedding**|Yichi Zhang et.al.|[2304.11618v1](http://arxiv.org/abs/2304.11618v1)|**[link](https://github.com/zjukg/mans)**|\n", "2304.11603": "|**2023-04-23**|**LaMD: Latent Motion Diffusion for Video Generation**|Yaosi Hu et.al.|[2304.11603v1](http://arxiv.org/abs/2304.11603v1)|null|\n", "2304.11193": "|**2023-04-21**|**Combining Vision and Tactile Sensation for Video Prediction**|Willow Mandil et.al.|[2304.11193v1](http://arxiv.org/abs/2304.11193v1)|null|\n", "2304.12995": "|**2023-04-25**|**AudioGPT: Understanding and Generating Speech, Music, Sound, and Talking Head**|Rongjie Huang et.al.|[2304.12995v1](http://arxiv.org/abs/2304.12995v1)|**[link](https://github.com/aigc-audio/audiogpt)**|\n", "2304.12725": "|**2023-04-25**|**Quantitative analysis of collagen remodeling in pancreatic lesions using computationally translated collagen images derived from brightfield microscopy images**|Varun Nair et.al.|[2304.12725v1](http://arxiv.org/abs/2304.12725v1)|null|\n", "2304.12570": "|**2023-04-25**|**Learnable Pillar-based Re-ranking for Image-Text Retrieval**|Leigang Qu et.al.|[2304.12570v1](http://arxiv.org/abs/2304.12570v1)|**[link](https://github.com/lgqu/leaprr)**|\n", "2304.12412": "|**2023-04-24**|**End-to-End Lidar-Camera Self-Calibration for Autonomous Vehicles**|Arya Rachman et.al.|[2304.12412v1](http://arxiv.org/abs/2304.12412v1)|null|\n", "2304.13649": "|**2023-04-26**|**A Symmetric Dual Encoding Dense Retrieval Framework for Knowledge-Intensive Visual Question Answering**|Alireza Salemi et.al.|[2304.13649v1](http://arxiv.org/abs/2304.13649v1)|**[link](https://github.com/alirezasalemi7/dedr-mm-fid)**|\n", "2304.13583": "|**2023-04-26**|**Multi-Modality Deep Network for Extreme Learned Image Compression**|Xuhao Jiang et.al.|[2304.13583v1](http://arxiv.org/abs/2304.13583v1)|null|\n", "2304.13559": "|**2023-04-28**|**Towards Multi-Modal DBMSs for Seamless Querying of Texts and Tables**|Matthias Urban et.al.|[2304.13559v2](http://arxiv.org/abs/2304.13559v2)|null|\n", "2304.13425": "|**2023-04-26**|**Learnable Ophthalmology SAM**|Zhongxi Qiu et.al.|[2304.13425v1](http://arxiv.org/abs/2304.13425v1)|**[link](https://github.com/qsingle/learnablepromptsam)**|\n", "2304.13357": "|**2023-04-26**|**Deep Lifelong Cross-modal Hashing**|Liming Xu et.al.|[2304.13357v1](http://arxiv.org/abs/2304.13357v1)|null|\n", "2304.13277": "|**2023-04-26**|**Self-Supervised Multi-Modal Sequential Recommendation**|Kunzhe Song et.al.|[2304.13277v1](http://arxiv.org/abs/2304.13277v1)|**[link](https://github.com/kz-song/mmsrec)**|\n", "2304.13273": "|**2023-04-27**|**From Association to Generation: Text-only Captioning by Unsupervised Cross-modal Mapping**|Junyang Wang et.al.|[2304.13273v2](http://arxiv.org/abs/2304.13273v2)|**[link](https://github.com/junyangwang0410/knight)**|\n", "2304.13181": "|**2023-04-25**|**Sample-Specific Debiasing for Better Image-Text Models**|Peiqi Wang et.al.|[2304.13181v1](http://arxiv.org/abs/2304.13181v1)|null|\n", "2304.13172": "|**2023-04-25**|**Generating Procedural Materials from Text or Image Prompts**|Yiwei Hu et.al.|[2304.13172v1](http://arxiv.org/abs/2304.13172v1)|null|\n", "2304.13130": "|**2023-04-25**|**Hypernymization of named entity-rich captions for grounding-based multi-modal pretraining**|Giacomo Nebbia et.al.|[2304.13130v1](http://arxiv.org/abs/2304.13130v1)|null|\n", "2304.13103": "|**2023-04-25**|**HyMo: Vulnerability Detection in Smart Contracts using a Novel Multi-Modal Hybrid Model**|Mohammad Khodadadi et.al.|[2304.13103v1](http://arxiv.org/abs/2304.13103v1)|null|\n", "2304.13097": "|**2023-04-25**|**Bridging graph data models: RDF, RDF-star, and property graphs as directed acyclic graphs**|Ewout Gelling et.al.|[2304.13097v1](http://arxiv.org/abs/2304.13097v1)|**[link](https://github.com/ewoutgelling/bridging-data-models)**|\n", "2304.14340": "|**2023-04-27**|**SparseFusion: Fusing Multi-Modal Sparse Representations for Multi-Sensor 3D Object Detection**|Yichen Xie et.al.|[2304.14340v1](http://arxiv.org/abs/2304.14340v1)|**[link](https://github.com/yichen928/sparsefusion)**|\n", "2304.14323": "|**2023-04-27**|**Pushing the Boundaries of Tractable Multiperspective Reasoning: A Deduction Calculus for Standpoint EL+**|Luc\u00eda {G\u00f3mez \u00c1lvarez} et.al.|[2304.14323v1](http://arxiv.org/abs/2304.14323v1)|**[link](https://github.com/cl-tud/standpoint-el-souffle-reasoner)**|\n", "2304.14243": "|**2023-04-27**|**Standpoint Linear Temporal Logic**|Nicola Gigante et.al.|[2304.14243v1](http://arxiv.org/abs/2304.14243v1)|null|\n", "2304.14178": "|**2023-04-27**|**mPLUG-Owl: Modularization Empowers Large Language Models with Multimodality**|Qinghao Ye et.al.|[2304.14178v1](http://arxiv.org/abs/2304.14178v1)|**[link](https://github.com/x-plug/mplug-owl)**|\n", "2304.13979": "|**2023-04-27**|**Adaptive-Mask Fusion Network for Segmentation of Drivable Road and Negative Obstacle With Untrustworthy Features**|Zhen Feng et.al.|[2304.13979v1](http://arxiv.org/abs/2304.13979v1)|**[link](https://github.com/lab-sun/amfnet)**|\n", "2304.13923": "|**2023-04-27**|**Retrieval-based Knowledge Augmented Vision Language Pre-training**|Jiahua Rao et.al.|[2304.13923v1](http://arxiv.org/abs/2304.13923v1)|null|\n", "2304.13833": "|**2023-04-26**|**Mixtures of Gaussian process experts based on kernel stick-breaking processes**|Yuji Saikai et.al.|[2304.13833v1](http://arxiv.org/abs/2304.13833v1)|**[link](https://github.com/ysaikai/gpksbp)**|\n", "2304.14204": "|**2023-04-26**|**Towards Medical Artificial General Intelligence via Knowledge-Enhanced Multimodal Pretraining**|Bingqian Lin et.al.|[2304.14204v1](http://arxiv.org/abs/2304.14204v1)|**[link](https://github.com/chenzcv7/motor)**|\n", "2304.15010": "|**2023-04-28**|**LLaMA-Adapter V2: Parameter-Efficient Visual Instruction Model**|Peng Gao et.al.|[2304.15010v1](http://arxiv.org/abs/2304.15010v1)|**[link](https://github.com/zrrskywalker/llama-adapter)**|\n", "2304.14942": "|**2023-04-28**|**The Emotions of the Crowd: Learning Image Sentiment from Tweets via Cross-modal Distillation**|Alessio Serra et.al.|[2304.14942v1](http://arxiv.org/abs/2304.14942v1)|null|\n", "2304.14936": "|**2023-04-28**|**Information Redundancy and Biases in Public Document Information Extraction Benchmarks**|Seif Laatiri et.al.|[2304.14936v1](http://arxiv.org/abs/2304.14936v1)|**[link](https://github.com/seif-lat/bias-study-funsd-sroie)**|\n", "2304.14933": "|**2023-04-28**|**An Empirical Study of Multimodal Model Merging**|Yi-Lin Sung et.al.|[2304.14933v1](http://arxiv.org/abs/2304.14933v1)|**[link](https://github.com/ylsung/vl-merging)**|\n", "2304.14880": "|**2023-04-28**|**SGAligner : 3D Scene Alignment with Scene Graphs**|Sayan Deb Sarkar et.al.|[2304.14880v1](http://arxiv.org/abs/2304.14880v1)|**[link](https://github.com/sayands/sgaligner)**|\n", "2305.00970": "|**2023-05-01**|**ArK: Augmented Reality with Knowledge Interactive Emergent Ability**|Qiuyuan Huang et.al.|[2305.00970v1](http://arxiv.org/abs/2305.00970v1)|null|\n", "2305.00769": "|**2023-05-01**|**Multi-scale Transformer-based Network for Emotion Recognition from Multi Physiological Signals**|Tu Vu et.al.|[2305.00769v1](http://arxiv.org/abs/2305.00769v1)|**[link](https://github.com/vsl-team/EPiC-2023-ACII)**|\n", "2305.00537": "|**2023-04-30**|**Interpretability of Machine Learning: Recent Advances and Future Prospects**|Lei Gao et.al.|[2305.00537v1](http://arxiv.org/abs/2305.00537v1)|null|\n", "2305.00355": "|**2023-04-29**|**MH-DETR: Video Moment and Highlight Detection with Cross-modal Transformer**|Yifang Xu et.al.|[2305.00355v1](http://arxiv.org/abs/2305.00355v1)|null|\n", "2305.00320": "|**2023-04-29**|**Fusion for Visual-Infrared Person ReID in Real-World Surveillance Using Corrupted Multimodal Data**|Arthur Josi et.al.|[2305.00320v1](http://arxiv.org/abs/2305.00320v1)|**[link](https://github.com/art2611/mreid-ucd-ccd)**|\n", "2305.00314": "|**2023-04-29**|**InfraDet3D: Multi-Modal 3D Object Detection based on Roadside Infrastructure Camera and LiDAR Sensors**|Walter Zimmer et.al.|[2305.00314v1](http://arxiv.org/abs/2305.00314v1)|null|\n", "2305.00207": "|**2023-04-29**|**Mixed-Response State-Space Model for Analyzing Multi-Dimensional Digital Phenotypes**|Tianchen Xu et.al.|[2305.00207v1](http://arxiv.org/abs/2305.00207v1)|**[link](https://github.com/zjph602xtc/MRSS)**|\n", "2305.00201": "|**2023-04-29**|**Instruction-ViT: Multi-Modal Prompts for Instruction Learning in ViT**|Zhenxiang Xiao et.al.|[2305.00201v1](http://arxiv.org/abs/2305.00201v1)|null|\n", "2305.00042": "|**2023-04-28**|**Cycle-guided Denoising Diffusion Probability Model for 3D Cross-modality MRI Synthesis**|Shaoyan Pan et.al.|[2305.00042v1](http://arxiv.org/abs/2305.00042v1)|null|\n", "2305.00976": "|**2023-05-02**|**TMR: Text-to-Motion Retrieval Using Contrastive 3D Human Motion Synthesis**|Mathis Petrovich et.al.|[2305.00976v1](http://arxiv.org/abs/2305.00976v1)|null|\n", "2305.01412": "|**2023-05-02**|**A Computational Approach for the Characterization of Airborne Pathogen Transmission in Turbulent Molecular Communication Channels**|Fatih Gulec et.al.|[2305.01412v1](http://arxiv.org/abs/2305.01412v1)|null|\n", "2305.01366": "|**2023-05-02**|**Establishing a Learning Model for Correct Hand Hygiene Technique in a NICU**|Ir\u00e9n A. Kopcs\u00f3n\u00e9 N\u00e9meth et.al.|[2305.01366v1](http://arxiv.org/abs/2305.01366v1)|null|\n", "2305.01245": "|**2023-05-02**|**MDENet: Multi-modal Dual-embedding Networks for Malware Open-set Recognition**|Jingcai Guo et.al.|[2305.01245v1](http://arxiv.org/abs/2305.01245v1)|null|\n", "2305.01233": "|**2023-05-03**|**On Uni-Modal Feature Learning in Supervised Multi-Modal Learning**|Chenzhuang Du et.al.|[2305.01233v2](http://arxiv.org/abs/2305.01233v2)|**[link](https://github.com/gewu-lab/ogm-ge_cvpr2022)**|\n", "2305.01111": "|**2023-05-01**|**Local and Global Contextual Features Fusion for Pedestrian Intention Prediction**|Mohsen Azarmi et.al.|[2305.01111v1](http://arxiv.org/abs/2305.01111v1)|null|\n", "2305.02269": "|**2023-05-03**|**M2-CTTS: End-to-End Multi-scale Multi-modal Conversational Text-to-Speech Synthesis**|Jinlong Xue et.al.|[2305.02269v1](http://arxiv.org/abs/2305.02269v1)|null|\n", "2305.01971": "|**2023-05-03**|**District-scale surface temperatures generated from high-resolution longitudinal thermal infrared images**|Subin Lin et.al.|[2305.01971v1](http://arxiv.org/abs/2305.01971v1)|**[link](https://github.com/buds-lab/project-iris-dataset)**|\n", "2305.01915": "|**2023-05-03**|**Denoising Multi-modal Sequential Recommenders with Contrastive Learning**|Dong Yao et.al.|[2305.01915v1](http://arxiv.org/abs/2305.01915v1)|null|\n", "2305.01912": "|**2023-05-03**|**MolKD: Distilling Cross-Modal Knowledge in Chemical Reactions for Molecular Property Prediction**|Liang Zeng et.al.|[2305.01912v1](http://arxiv.org/abs/2305.01912v1)|null|\n", "2305.01877": "|**2023-05-04**|**The Impacts of Dimensionality, Diffusion, and Directedness on Intrinsic Cross-Model Simulation in Tile-Based Self-Assembly**|Daniel Hader et.al.|[2305.01877v2](http://arxiv.org/abs/2305.01877v2)|null|\n", "2305.01864": "|**2023-05-05**|**Unsupervised Improvement of Audio-Text Cross-Modal Representations**|Zhepei Wang et.al.|[2305.01864v2](http://arxiv.org/abs/2305.01864v2)|**[link](https://github.com/zhepeiw/clap_curation)**|\n", "2305.01836": "|**2023-05-03**|**AV-SAM: Segment Anything Model Meets Audio-Visual Localization and Segmentation**|Shentong Mo et.al.|[2305.01836v1](http://arxiv.org/abs/2305.01836v1)|null|\n", "2305.01778": "|**2023-05-02**|**SLTUNET: A Simple Unified Model for Sign Language Translation**|Biao Zhang et.al.|[2305.01778v1](http://arxiv.org/abs/2305.01778v1)|**[link](https://github.com/bzhangGo/sltunet)**|\n", "2305.01661": "|**2023-05-02**|**SIA-FTP: A Spoken Instruction Aware Flight Trajectory Prediction Framework**|Dongyue Guo et.al.|[2305.01661v1](http://arxiv.org/abs/2305.01661v1)|null|\n", "2305.02930": "|**2023-05-04**|**Piecewise Normalizing Flows**|Harry Bevins et.al.|[2305.02930v1](http://arxiv.org/abs/2305.02930v1)|**[link](https://github.com/htjb/margarine)**|\n", "2305.02774": "|**2023-05-04**|**Spatial and Modal Optimal Transport for Fast Cross-Modal MRI Reconstruction**|Qi Wang et.al.|[2305.02774v1](http://arxiv.org/abs/2305.02774v1)|null|\n", "2305.02760": "|**2023-05-04**|**Multi-Modality Deep Network for JPEG Artifacts Reduction**|Xuhao Jiang et.al.|[2305.02760v1](http://arxiv.org/abs/2305.02760v1)|null|\n", "2305.02577": "|**2023-05-04**|**Text Reading Order in Uncontrolled Conditions by Sparse Graph Segmentation**|Renshen Wang et.al.|[2305.02577v1](http://arxiv.org/abs/2305.02577v1)|null|\n", "2305.02572": "|**2023-05-04**|**High-fidelity Generalized Emotional Talking Face Generation with Multi-modal Emotion Space Learning**|Chao Xu et.al.|[2305.02572v1](http://arxiv.org/abs/2305.02572v1)|null|\n", "2305.02504": "|**2023-05-04**|**Learning Missing Modal Electronic Health Records with Unified Multi-modal Data Embedding and Modality-Aware Attention**|Kwanhyung Lee et.al.|[2305.02504v1](http://arxiv.org/abs/2305.02504v1)|null|\n", "2305.03726": "|**2023-05-05**|**Otter: A Multi-Modal Model with In-Context Instruction Tuning**|Bo Li et.al.|[2305.03726v1](http://arxiv.org/abs/2305.03726v1)|**[link](https://github.com/luodian/otter)**|\n", "2305.03724": "|**2023-05-05**|**DualCross: Cross-Modality Cross-Domain Adaptation for Monocular BEV Perception**|Yunze Man et.al.|[2305.03724v1](http://arxiv.org/abs/2305.03724v1)|null|\n", "2305.03689": "|**2023-05-05**|**COLA: How to adapt vision-language models to Compose Objects Localized with Attributes?**|Arijit Ray et.al.|[2305.03689v1](http://arxiv.org/abs/2305.03689v1)|**[link](https://github.com/arijitray1993/COLA)**|\n", "2305.03347": "|**2023-05-05**|**A Large Cross-Modal Video Retrieval Dataset with Reading Comprehension**|Weijia Wu et.al.|[2305.03347v1](http://arxiv.org/abs/2305.03347v1)|**[link](https://github.com/callsys/textvr)**|\n", "2305.03314": "|**2023-05-05**|**Block the Label and Noise: An N-Gram Masked Speller for Chinese Spell Checking**|Haiyun Yang et.al.|[2305.03314v1](http://arxiv.org/abs/2305.03314v1)|null|\n", "2305.03277": "|**2023-05-05**|**FM-ViT: Flexible Modal Vision Transformers for Face Anti-Spoofing**|Ajian Liu et.al.|[2305.03277v1](http://arxiv.org/abs/2305.03277v1)|null|\n", "2305.03252": "|**2023-05-05**|**HeteroEdge: Addressing Asymmetry in Heterogeneous Collaborative Autonomous Systems**|Mohammad Saeid Anwar et.al.|[2305.03252v1](http://arxiv.org/abs/2305.03252v1)|null|\n", "2305.03212": "|**2023-05-04**|**LLM2Loss: Leveraging Language Models for Explainable Model Diagnostics**|Shervin Ardeshir et.al.|[2305.03212v1](http://arxiv.org/abs/2305.03212v1)|null|\n", "2305.03187": "|**2023-05-04**|**Generating Virtual On-body Accelerometer Data from Virtual Textual Descriptions for Human Activity Recognition**|Zikang Leng et.al.|[2305.03187v1](http://arxiv.org/abs/2305.03187v1)|**[link](https://github.com/ZikangLeng/IMUGPT)**|\n", "2305.03506": "|**2023-05-04**|**SI-LSTM: Speaker Hybrid Long-short Term Memory and Cross Modal Attention for Emotion Recognition in Conversation**|Xingwei Liang et.al.|[2305.03506v1](http://arxiv.org/abs/2305.03506v1)|null|\n", "2305.04824": "|**2023-05-08**|**Learning Summary-Worthy Visual Representation for Abstractive Summarization in Video**|Zenan Xu et.al.|[2305.04824v1](http://arxiv.org/abs/2305.04824v1)|null|\n", "2305.04790": "|**2023-05-09**|**MultiModal-GPT: A Vision and Language Model for Dialogue with Humans**|Tao Gong et.al.|[2305.04790v2](http://arxiv.org/abs/2305.04790v2)|**[link](https://github.com/open-mmlab/multimodal-gpt)**|\n", "2305.04685": "|**2023-05-08**|**ARDIE: AR, Dialogue, and Eye Gaze Policies for Human-Robot Collaboration**|Chelsea Zou et.al.|[2305.04685v1](http://arxiv.org/abs/2305.04685v1)|null|\n", "2305.04530": "|**2023-05-08**|**A Multi-Modal Context Reasoning Approach for Conditional Inference on Joint Textual and Visual Clues**|Yunxin Li et.al.|[2305.04530v1](http://arxiv.org/abs/2305.04530v1)|**[link](https://github.com/yunxinli/multimodal-context-reasoning)**|\n", "2305.04476": "|**2023-05-09**|**AlignSTS: Speech-to-Singing Conversion via Cross-Modal Alignment**|Ruiqi Li et.al.|[2305.04476v2](http://arxiv.org/abs/2305.04476v2)|null|\n", "2305.04474": "|**2023-05-09**|**Vision Langauge Pre-training by Contrastive Learning with Cross-Modal Similarity Regulation**|Chaoya Jiang et.al.|[2305.04474v2](http://arxiv.org/abs/2305.04474v2)|null|\n", "2305.04469": "|**2023-05-08**|**HACK: Learning a Parametric Head and Neck Model for High-fidelity Animation**|Longwen Zhang et.al.|[2305.04469v1](http://arxiv.org/abs/2305.04469v1)|**[link](https://github.com/zonelikewonderland/hack-model)**|\n", "2305.04451": "|**2023-05-08**|**FashionTex: Controllable Virtual Try-on with Text and Texture**|Anran Lin et.al.|[2305.04451v1](http://arxiv.org/abs/2305.04451v1)|**[link](https://github.com/picksh/fashiontex)**|\n", "2305.04298": "|**2023-05-07**|**Poses as Queries: Image-to-LiDAR Map Localization with Transformers**|Jinyu Miao et.al.|[2305.04298v1](http://arxiv.org/abs/2305.04298v1)|null|\n", "2305.04239": "|**2023-05-07**|**Instance-Variant Loss with Gaussian RBF Kernel for 3D Cross-modal Retriveal**|Zhitao Liu et.al.|[2305.04239v1](http://arxiv.org/abs/2305.04239v1)|null|\n", "2305.04224": "|**2023-05-07**|**Visual Causal Scene Refinement for Video Question Answering**|Yushen Wei et.al.|[2305.04224v1](http://arxiv.org/abs/2305.04224v1)|**[link](https://github.com/yangliu9208/vcsr)**|\n", "2305.04195": "|**2023-05-07**|**Cross-Modal Retrieval for Motion and Text via MildTriple Loss**|Sheng Yan et.al.|[2305.04195v1](http://arxiv.org/abs/2305.04195v1)|**[link](https://github.com/eanson023/rehamot)**|\n", "2305.04160": "|**2023-05-07**|**X-LLM: Bootstrapping Advanced Large Language Models by Treating Multi-Modalities as Foreign Languages**|Feilong Chen et.al.|[2305.04160v1](http://arxiv.org/abs/2305.04160v1)|null|\n", "2305.04156": "|**2023-05-07**|**SynthMix: Mixing up Aligned Synthesis for Medical Cross-Modality Domain Adaptation**|Xinwen Zhang et.al.|[2305.04156v1](http://arxiv.org/abs/2305.04156v1)|null|\n", "2305.04072": "|**2023-05-06**|**Keyword-Based Diverse Image Retrieval by Semantics-aware Contrastive Learning and Transformer**|Minyi Zhao et.al.|[2305.04072v1](http://arxiv.org/abs/2305.04072v1)|null|\n", "2305.05665": "|**2023-05-09**|**ImageBind: One Embedding Space To Bind Them All**|Rohit Girdhar et.al.|[2305.05665v1](http://arxiv.org/abs/2305.05665v1)|**[link](https://github.com/facebookresearch/imagebind)**|\n", "2305.05662": "|**2023-05-11**|**InternGPT: Solving Vision-Centric Tasks by Interacting with ChatGPT Beyond Language**|Zhaoyang Liu et.al.|[2305.05662v3](http://arxiv.org/abs/2305.05662v3)|**[link](https://github.com/opengvlab/interngpt)**|\n", "2305.05534": "|**2023-05-09**|**Integrating Holistic and Local Information to Estimate Emotional Reaction Intensity**|Yini Fang et.al.|[2305.05534v1](http://arxiv.org/abs/2305.05534v1)|**[link](https://github.com/hkust-nisl/abaw5)**|\n", "2305.05496": "|**2023-05-09**|**Exploiting Pseudo Image Captions for Multimodal Summarization**|Chaoya Jiang et.al.|[2305.05496v1](http://arxiv.org/abs/2305.05496v1)|**[link](https://github.com/sitaproject/sita)**|\n", "2305.05260": "|**2023-05-09**|**Guided Focal Stack Refinement Network for Light Field Salient Object Detection**|Bo Yuan et.al.|[2305.05260v1](http://arxiv.org/abs/2305.05260v1)|null|\n", "2305.05189": "|**2023-05-09**|**SUR-adapter: Enhancing Text-to-Image Pre-trained Diffusion Models with Large Language Models**|Shanshan Zhong et.al.|[2305.05189v1](http://arxiv.org/abs/2305.05189v1)|**[link](https://github.com/Qrange-group/SUR-adapter)**|\n", "2305.05166": "|**2023-05-10**|**E2TIMT: Efficient and Effective Modal Adapter for Text Image Machine Translation**|Cong Ma et.al.|[2305.05166v2](http://arxiv.org/abs/2305.05166v2)|**[link](https://github.com/ericongma/e2timt)**|\n", "2305.05126": "|**2023-05-09**|**Comparing Foundation Models using Data Kernels**|Brandon Duderstadt et.al.|[2305.05126v1](http://arxiv.org/abs/2305.05126v1)|null|\n", "2305.04961": "|**2023-05-08**|**Joint Moment Retrieval and Highlight Detection Via Natural Language Queries**|Richard Luo et.al.|[2305.04961v1](http://arxiv.org/abs/2305.04961v1)|**[link](https://github.com/skyline-9/visionary-vids)**|\n", "2305.06292": "|**2023-05-10**|**Joint Metrics Matter: A Better Standard for Trajectory Forecasting**|Erica Weng et.al.|[2305.06292v1](http://arxiv.org/abs/2305.06292v1)|**[link](https://github.com/ericaweng/joint-metrics-matter)**|\n", "2305.06278": "|**2023-05-10**|**A Multi-modal Garden Dataset and Hybrid 3D Dense Reconstruction Framework Based on Panoramic Stereo Images for a Trimming Robot**|Can Pu et.al.|[2305.06278v1](http://arxiv.org/abs/2305.06278v1)|**[link](https://github.com/canpu999/trimbot-wageningen-slam-dataset)**|\n", "2305.06225": "|**2023-05-10**|**DaGAN++: Depth-Aware Generative Adversarial Network for Talking Head Video Generation**|Fa-Ting Hong et.al.|[2305.06225v1](http://arxiv.org/abs/2305.06225v1)|**[link](https://github.com/harlanhong/cvpr2022-dagan)**|\n", "2305.06221": "|**2023-05-10**|**Multi-Prompt with Depth Partitioned Cross-Modal Learning**|Yiqi Wang et.al.|[2305.06221v1](http://arxiv.org/abs/2305.06221v1)|**[link](https://github.com/wangyiqi/pmpo)**|\n", "2305.06203": "|**2023-05-10**|**Multiclass MRI Brain Tumor Segmentation using 3D Attention-based U-Net**|Maryann M. Gitonga et.al.|[2305.06203v1](http://arxiv.org/abs/2305.06203v1)|null|\n", "2305.06179": "|**2023-05-11**|**A Multi-modal Approach to Single-modal Visual Place Classification**|Tomoya Iwasaki et.al.|[2305.06179v2](http://arxiv.org/abs/2305.06179v2)|null|\n", "2305.05992": "|**2023-05-10**|**MMoT: Mixture-of-Modality-Tokens Transformer for Composed Multimodal Conditional Image Synthesis**|Jianbin Zheng et.al.|[2305.05992v1](http://arxiv.org/abs/2305.05992v1)|null|\n", "2305.05880": "|**2023-05-10**|**ChinaOpen: A Dataset for Open-world Multimodal Learning**|Aozhu Chen et.al.|[2305.05880v1](http://arxiv.org/abs/2305.05880v1)|**[link](https://github.com/dong03/GenerativeVideo2Text)**|\n", "2305.06978": "|**2023-05-11**|**Meta-hallucinator: Towards Few-Shot Cross-Modality Cardiac Image Segmentation**|Ziyuan Zhao et.al.|[2305.06978v1](http://arxiv.org/abs/2305.06978v1)|null|\n", "2305.06923": "|**2023-05-11**|**EAML: Ensemble Self-Attention-based Mutual Learning Network for Document Image Classification**|Souhail Bakkali et.al.|[2305.06923v1](http://arxiv.org/abs/2305.06923v1)|null|\n", "2305.06794": "|**2023-05-11**|**Multi-modal Multi-level Fusion for 3D Single Object Tracking**|Zhiheng Li et.al.|[2305.06794v1](http://arxiv.org/abs/2305.06794v1)|null|\n", "2305.06720": "|**2023-05-11**|**Bi-level Dynamic Learning for Jointly Multi-modality Image Fusion and Beyond**|Zhu Liu et.al.|[2305.06720v1](http://arxiv.org/abs/2305.06720v1)|**[link](https://github.com/LiuZhu-CV/BDLFusion)**|\n", "2305.06472": "|**2023-05-12**|**ChatGPT-Like Large-Scale Foundation Models for Prognostics and Health Management: A Survey and Roadmaps**|Yan-Fu Li et.al.|[2305.06472v2](http://arxiv.org/abs/2305.06472v2)|null|\n", "2305.06407": "|**2023-05-10**|**Combo of Thinking and Observing for Outside-Knowledge VQA**|Qingyi Si et.al.|[2305.06407v1](http://arxiv.org/abs/2305.06407v1)|**[link](https://github.com/phoebussi/thinking-while-observing)**|\n", "2305.06386": "|**2023-05-10**|**Text-To-Concept (and Back) via Cross-Model Alignment**|Mazda Moayeri et.al.|[2305.06386v1](http://arxiv.org/abs/2305.06386v1)|null|\n", "2305.07358": "|**2023-05-12**|**Towards Versatile and Efficient Visual Knowledge Injection into Pre-trained Language Models with Cross-Modal Adapters**|Xinyun Zhang et.al.|[2305.07358v1](http://arxiv.org/abs/2305.07358v1)|null|\n", "2305.07334": "|**2023-05-12**|**Locking and Quacking: Stacking Bayesian model predictions by log-pooling and superposition**|Yuling Yao et.al.|[2305.07334v1](http://arxiv.org/abs/2305.07334v1)|null|\n", "2305.07216": "|**2023-05-12**|**Versatile Audio-Visual Learning for Handling Single and Multi Modalities in Emotion Regression and Classification Tasks**|Lucas Goncalves et.al.|[2305.07216v1](http://arxiv.org/abs/2305.07216v1)|**[link](https://github.com/ilucasgoncalves/vavl)**|\n", "2305.07214": "|**2023-05-12**|**MMG-Ego4D: Multi-Modal Generalization in Egocentric Action Recognition**|Xinyu Gong et.al.|[2305.07214v1](http://arxiv.org/abs/2305.07214v1)|null|\n", "2305.07437": "|**2023-05-15**|**Continual Vision-Language Representation Learning with Off-Diagonal Information**|Zixuan Ni et.al.|[2305.07437v2](http://arxiv.org/abs/2305.07437v2)|null|\n", "2305.08706": "|**2023-05-15**|**Understanding and Bridging the Modality Gap for Speech Translation**|Qingkai Fang et.al.|[2305.08706v1](http://arxiv.org/abs/2305.08706v1)|**[link](https://github.com/ictnlp/cress)**|\n", "2305.08698": "|**2023-05-15**|**Continual Multimodal Knowledge Graph Construction**|Xiang Chen et.al.|[2305.08698v1](http://arxiv.org/abs/2305.08698v1)|**[link](https://github.com/zjunlp/ContinueMKGC)**|\n", "2305.08685": "|**2023-05-15**|**CLIP-VG: Self-paced Curriculum Adapting of CLIP via Exploiting Pseudo-Language Labels for Visual Grounding**|Linhui Xiao et.al.|[2305.08685v1](http://arxiv.org/abs/2305.08685v1)|**[link](https://github.com/linhuixiao/clip-vg)**|\n", "2305.08532": "|**2023-05-15**|**Benchmarking UWB-Based Infrastructure-Free Positioning and Multi-Robot Relative Localization: Dataset and Characterization**|Paola Torrico Mor\u00f3n et.al.|[2305.08532v1](http://arxiv.org/abs/2305.08532v1)|null|\n", "2305.08522": "|**2023-05-15**|**Cross-Modality Time-Variant Relation Learning for Generating Dynamic Scene Graphs**|Jingyi Wang et.al.|[2305.08522v1](http://arxiv.org/abs/2305.08522v1)|**[link](https://github.com/qncsn2016/TR2)**|\n", "2305.08386": "|**2023-05-15**|**PLIP: Language-Image Pre-training for Person Representation Learning**|Jialong Zuo et.al.|[2305.08386v1](http://arxiv.org/abs/2305.08386v1)|**[link](https://github.com/zplusdragon/plip)**|\n", "2305.08381": "|**2023-05-15**|**Mode Approximation Makes Good Vision-Language Prompts**|Haixin Wang et.al.|[2305.08381v1](http://arxiv.org/abs/2305.08381v1)|**[link](https://github.com/willdreamer/aurora)**|\n", "2305.08372": "|**2023-05-15**|**A Novel Framework for Multimodal Named Entity Recognition with Multi-level Alignments**|Peipei Liu et.al.|[2305.08372v1](http://arxiv.org/abs/2305.08372v1)|null|\n", "2305.08252": "|**2023-05-14**|**Parameter-Efficient Fine-Tuning for Medical Image Analysis: The Missed Opportunity**|Raman Dutt et.al.|[2305.08252v1](http://arxiv.org/abs/2305.08252v1)|null|\n", "2305.08120": "|**2023-05-14**|**Unraveling Cold Start Enigmas in Predictive Analytics for OTT Media: Synergistic Meta-Insights and Multimodal Ensemble Mastery**|K. Ganguly et.al.|[2305.08120v1](http://arxiv.org/abs/2305.08120v1)|null|\n", "2305.07927": "|**2023-05-13**|**RC3: Regularized Contrastive Cross-lingual Cross-modal Pre-training**|Chulun Zhou et.al.|[2305.07927v1](http://arxiv.org/abs/2305.07927v1)|null|\n", "2305.07920": "|**2023-05-13**|**Multi-task Paired Masking with Alignment Modeling for Medical Vision-Language Pre-training**|Ke Zhang et.al.|[2305.07920v1](http://arxiv.org/abs/2305.07920v1)|null|\n", "2305.07910": "|**2023-05-13**|**Mask to reconstruct: Cooperative Semantics Completion for Video-text Retrieval**|Han Fang et.al.|[2305.07910v1](http://arxiv.org/abs/2305.07910v1)|null|\n", "2305.07825": "|**2023-05-13**|**Student Classroom Behavior Detection based on YOLOv7-BRA and Multi-Model Fusion**|Fan Yang et.al.|[2305.07825v1](http://arxiv.org/abs/2305.07825v1)|**[link](https://github.com/whiffe/scb-dataset)**|\n", "2305.07792": "|**2023-05-12**|**Contextuality in multi-agent paradoxes**|Sidiney B. Montanhano et.al.|[2305.07792v1](http://arxiv.org/abs/2305.07792v1)|null|\n", "2305.09641": "|**2023-05-16**|**FitMe: Deep Photorealistic 3D Morphable Model Avatars**|Alexandros Lattas et.al.|[2305.09641v1](http://arxiv.org/abs/2305.09641v1)|null|\n", "2305.09600": "|**2023-05-16**|**Deep Reinforcement Learning to Maximize Arterial Usage during Extreme Congestion**|Ashutosh Dutta et.al.|[2305.09600v1](http://arxiv.org/abs/2305.09600v1)|null|\n", "2305.09333": "|**2023-05-16**|**Multi-modal Visual Understanding with Prompts for Semantic Information Disentanglement of Image**|Yuzhou Peng et.al.|[2305.09333v1](http://arxiv.org/abs/2305.09333v1)|null|\n", "2305.09272": "|**2023-05-16**|**Age of Incorrect Information in Semantic Communications for NOMA Aided XR Applications**|Jianrui Chen et.al.|[2305.09272v1](http://arxiv.org/abs/2305.09272v1)|null|\n", "2305.09255": "|**2023-05-16**|**Trust-Worthy Semantic Communications for the Metaverse Relying on Federated Learning**|Jianrui Chen et.al.|[2305.09255v1](http://arxiv.org/abs/2305.09255v1)|null|\n", "2305.09212": "|**2023-05-16**|**Cross-Modal Global Interaction and Local Alignment for Audio-Visual Speech Recognition**|Yuchen Hu et.al.|[2305.09212v1](http://arxiv.org/abs/2305.09212v1)|**[link](https://github.com/yuchen005/gila)**|\n", "2305.09011": "|**2023-05-18**|**The Brain Tumor Segmentation (BraTS) Challenge 2023: Brain MR Image Synthesis for Tumor Segmentation (BraSyn)**|Hongwei Bran Li et.al.|[2305.09011v2](http://arxiv.org/abs/2305.09011v2)|null|\n", "2305.10420": "|**2023-05-17**|**CLIP-GCD: Simple Language Guided Generalized Category Discovery**|Rabah Ouldnoughi et.al.|[2305.10420v1](http://arxiv.org/abs/2305.10420v1)|null|\n", "2305.10046": "|**2023-05-17**|**Probing the Role of Positional Information in Vision-Language Models**|Philipp J. R\u00f6sch et.al.|[2305.10046v1](http://arxiv.org/abs/2305.10046v1)|null|\n", "2305.09946": "|**2023-05-17**|**DeepMSS: Deep Multi-Modality Segmentation-to-Survival Learning for Survival Outcome Prediction from PET/CT Images**|Mingyuan Meng et.al.|[2305.09946v1](http://arxiv.org/abs/2305.09946v1)|**[link](https://github.com/mungomeng/survival-deepmss)**|\n", "2305.11176": "|**2023-05-18**|**Instruct2Act: Mapping Multi-modality Instructions to Robotic Actions with Large Language Model**|Siyuan Huang et.al.|[2305.11176v1](http://arxiv.org/abs/2305.11176v1)|**[link](https://github.com/opengvlab/instruct2act)**|\n", "2305.11172": "|**2023-05-18**|**ONE-PEACE: Exploring One General Representation Model Toward Unlimited Modalities**|Peng Wang et.al.|[2305.11172v1](http://arxiv.org/abs/2305.11172v1)|**[link](https://github.com/OFA-Sys/ONE-PEACE)**|\n", "2305.11101": "|**2023-05-18**|**XFormer: Fast and Accurate Monocular 3D Body Capture**|Lihui Qian et.al.|[2305.11101v1](http://arxiv.org/abs/2305.11101v1)|null|\n", "2305.11096": "|**2023-05-22**|**Cross-modality Data Augmentation for End-to-End Sign Language Translation**|Jinhui Ye et.al.|[2305.11096v2](http://arxiv.org/abs/2305.11096v2)|**[link](https://github.com/atrewin/signxmda)**|\n", "2305.11012": "|**2023-05-18**|**SDC-UDA: Volumetric Unsupervised Domain Adaptation Framework for Slice-Direction Continuous Cross-Modality Medical Image Segmentation**|Hyungseob Shin et.al.|[2305.11012v1](http://arxiv.org/abs/2305.11012v1)|null|\n", "2305.11000": "|**2023-05-19**|**SpeechGPT: Empowering Large Language Models with Intrinsic Cross-Modal Conversational Abilities**|Dong Zhang et.al.|[2305.11000v2](http://arxiv.org/abs/2305.11000v2)|**[link](https://github.com/0nutation/speechgpt)**|\n", "2305.10920": "|**2023-05-18**|**Emergent Communication with Attention**|Ryokan Ri et.al.|[2305.10920v1](http://arxiv.org/abs/2305.10920v1)|null|\n", "2305.10838": "|**2023-05-18**|**ProgSG: Cross-Modality Representation Learning for Programs in Electronic Design Automation**|Yunsheng Bai et.al.|[2305.10838v1](http://arxiv.org/abs/2305.10838v1)|null|\n", "2305.10783": "|**2023-05-18**|**Transforming Human-Centered AI Collaboration: Redefining Embodied Agents Capabilities through Interactive Grounded Language Instructions**|Shrestha Mohanty et.al.|[2305.10783v1](http://arxiv.org/abs/2305.10783v1)|**[link](https://github.com/iglu-contest/nlp-baselines-2022)**|\n", "2305.10773": "|**2023-05-18**|**Rate-Adaptive Coding Mechanism for Semantic Communications With Multi-Modal Data**|Yangshuo He et.al.|[2305.10773v1](http://arxiv.org/abs/2305.10773v1)|null|\n", "2305.10764": "|**2023-05-18**|**OpenShape: Scaling Up 3D Shape Representation Towards Open-World Understanding**|Minghua Liu et.al.|[2305.10764v1](http://arxiv.org/abs/2305.10764v1)|null|\n", "2305.10763": "|**2023-05-18**|**CLAPSpeech: Learning Prosody from Text Context with Contrastive Language-Audio Pre-training**|Zhenhui Ye et.al.|[2305.10763v1](http://arxiv.org/abs/2305.10763v1)|null|\n", "2305.10724": "|**2023-05-18**|**Segment Any Anomaly without Training via Hybrid Prompt Regularization**|Yunkang Cao et.al.|[2305.10724v1](http://arxiv.org/abs/2305.10724v1)|**[link](https://github.com/caoyunkang/segment-any-anomaly)**|\n", "2305.10547": "|**2023-05-17**|**Rethinking Multimodal Content Moderation from an Asymmetric Angle with Mixed-modality**|Jialin Yuan et.al.|[2305.10547v1](http://arxiv.org/abs/2305.10547v1)|null|\n", "2305.10512": "|**2023-05-17**|**IMAD: IMage-Augmented multi-modal Dialogue**|Moskvoretskii Viktor et.al.|[2305.10512v1](http://arxiv.org/abs/2305.10512v1)|**[link](https://github.com/vityavitalich/imad)**|\n", "2305.11832": "|**2023-05-19**|**Improving Multimodal Joint Variational Autoencoders through Normalizing Flows and Correlation Analysis**|Agathe Senellart et.al.|[2305.11832v1](http://arxiv.org/abs/2305.11832v1)|null|\n", "2305.11818": "|**2023-05-19**|**MaGIC: Multi-modality Guided Image Completion**|Yongsheng Yu et.al.|[2305.11818v1](http://arxiv.org/abs/2305.11818v1)|null|\n", "2305.11719": "|**2023-05-19**|**Information Screening whilst Exploiting! Multimodal Relation Extraction with Feature Denoising and Multimodal Topic Modeling**|Shengqiong Wu et.al.|[2305.11719v1](http://arxiv.org/abs/2305.11719v1)|**[link](https://github.com/chocowu/mre-ise)**|\n", "2305.11579": "|**2023-05-19**|**Speech-Text Dialog Pre-training for Spoken Dialog Understanding with Explicit Cross-Modal Alignment**|Tianshu Yu et.al.|[2305.11579v1](http://arxiv.org/abs/2305.11579v1)|**[link](https://github.com/alibabaresearch/damo-convai)**|\n", "2305.11503": "|**2023-05-19**|**A Topic-aware Summarization Framework with Different Modal Side Information**|Xiuying Chen et.al.|[2305.11503v1](http://arxiv.org/abs/2305.11503v1)|null|\n", "2305.11481": "|**2023-05-22**|**CM-MaskSD: Cross-Modality Masked Self-Distillation for Referring Image Segmentation**|Wenxuan Wang et.al.|[2305.11481v2](http://arxiv.org/abs/2305.11481v2)|null|\n", "2305.11443": "|**2023-05-19**|**Equivariant Multi-Modality Image Fusion**|Zixiang Zhao et.al.|[2305.11443v1](http://arxiv.org/abs/2305.11443v1)|null|\n", "2305.11439": "|**2023-05-19**|**Few-Shot Learning with Visual Distribution Calibration and Cross-Modal Distribution Alignment**|Runqi Wang et.al.|[2305.11439v1](http://arxiv.org/abs/2305.11439v1)|**[link](https://github.com/bhrqw/sada)**|\n", "2305.11392": "|**2023-05-19**|**Fast-StrucTexT: An Efficient Hourglass Transformer with Modality-guided Dynamic Token Merge for Document Understanding**|Mingliang Zhai et.al.|[2305.11392v1](http://arxiv.org/abs/2305.11392v1)|null|\n", "2305.11349": "|**2023-05-18**|**Unsupervised Domain-agnostic Fake News Detection using Multi-modal Weak Signals**|Amila Silva et.al.|[2305.11349v1](http://arxiv.org/abs/2305.11349v1)|null|\n", "2305.11327": "|**2023-05-18**|**MALM: Mask Augmentation based Local Matching for Food-Recipe Retrieval**|Bhanu Prakash Voutharoja et.al.|[2305.11327v1](http://arxiv.org/abs/2305.11327v1)|**[link](https://github.com/myfoodchoice/malm_mask_augmentation_based_local_matching-_for-_food_recipe_retrieval)**|\n", "2305.13220": "|**2023-05-22**|**Fast Monocular Scene Reconstruction with Global-Sparse Local-Dense Grids**|Wei Dong et.al.|[2305.13220v1](http://arxiv.org/abs/2305.13220v1)|null|\n", "2305.12953": "|**2023-05-22**|**Enhancing Next Active Object-based Egocentric Action Anticipation with Guided Attention**|Sanket Thakur et.al.|[2305.12953v1](http://arxiv.org/abs/2305.12953v1)|**[link](https://github.com/sanketsans/ganov2)**|\n", "2305.12903": "|**2023-05-22**|**DiffAVA: Personalized Text-to-Audio Generation with Visual Alignment**|Shentong Mo et.al.|[2305.12903v1](http://arxiv.org/abs/2305.12903v1)|null|\n", "2305.12878": "|**2023-05-22**|**Non-Autoregressive Document-Level Machine Translation (NA-DMT): Exploring Effective Approaches, Challenges, and Opportunities**|Guangsheng Bao et.al.|[2305.12878v1](http://arxiv.org/abs/2305.12878v1)|**[link](https://github.com/baoguangsheng/nat-on-doc)**|\n", "2305.12807": "|**2023-05-22**|**Multi-task Combinatorial Optimization: Adaptive Multi-modality Knowledge Transfer by an Explicit Inter-task Distance**|Peng Li et.al.|[2305.12807v1](http://arxiv.org/abs/2305.12807v1)|null|\n", "2305.12793": "|**2023-05-22**|**Zero-Shot End-to-End Spoken Language Understanding via Cross-Modal Selective Self-Training**|Jianfeng He et.al.|[2305.12793v1](http://arxiv.org/abs/2305.12793v1)|null|\n", "2305.12711": "|**2023-05-22**|**Unsupervised Visible-Infrared Person ReID by Collaborative Learning with Neighbor-Guided Label Refinement**|De Cheng et.al.|[2305.12711v1](http://arxiv.org/abs/2305.12711v1)|null|\n", "2305.12703": "|**2023-05-22**|**Progressive Sub-Graph Clustering Algorithm for Semi-Supervised Domain Adaptation Speaker Verification**|Zhuo Li et.al.|[2305.12703v1](http://arxiv.org/abs/2305.12703v1)|null|\n", "2305.12673": "|**2023-05-22**|**Efficient Bilateral Cross-Modality Cluster Matching for Unsupervised Visible-Infrared Person ReID**|De cheng et.al.|[2305.12673v1](http://arxiv.org/abs/2305.12673v1)|null|\n", "2305.12530": "|**2023-05-21**|**Towards Robust Family-Infant Audio Analysis Based on Unsupervised Pretraining of Wav2vec 2.0 on Large-Scale Unlabeled Family Audio**|Jialu Li et.al.|[2305.12530v1](http://arxiv.org/abs/2305.12530v1)|null|\n", "2305.12452": "|**2023-05-21**|**Advancing Referring Expression Segmentation Beyond Single Image**|Yixuan Wu et.al.|[2305.12452v1](http://arxiv.org/abs/2305.12452v1)|null|\n", "2305.12369": "|**2023-05-21**|**HIINT: Historical, Intra- and Inter- personal Dynamics Modeling with Cross-person Memory Transformer**|Yubin Kim et.al.|[2305.12369v1](http://arxiv.org/abs/2305.12369v1)|null|\n", "2305.12260": "|**2023-05-20**|**Cross2StrA: Unpaired Cross-lingual Image Captioning with Cross-lingual Cross-modal Structure-pivoted Alignment**|Shengqiong Wu et.al.|[2305.12260v1](http://arxiv.org/abs/2305.12260v1)|null|\n", "2305.12218": "|**2023-05-20**|**Text-Video Retrieval with Disentangled Conceptualization and Set-to-Set Alignment**|Peng Jin et.al.|[2305.12218v1](http://arxiv.org/abs/2305.12218v1)|**[link](https://github.com/jpthu17/dicosa)**|\n", "2305.12011": "|**2023-05-19**|**Boosting Crop Classification by Hierarchically Fusing Satellite, Rotational, and Contextual Data**|Barriere Valentin et.al.|[2305.12011v1](http://arxiv.org/abs/2305.12011v1)|null|\n", "2305.14312": "|**2023-05-23**|**Text-guided 3D Human Generation from 2D Collections**|Tsu-Jui Fu et.al.|[2305.14312v1](http://arxiv.org/abs/2305.14312v1)|null|\n", "2305.14167": "|**2023-05-24**|**DetGPT: Detect What You Need via Reasoning**|Renjie Pi et.al.|[2305.14167v2](http://arxiv.org/abs/2305.14167v2)|null|\n", "2305.14042": "|**2023-05-23**|**Improving speech translation by fusing speech and text**|Wenbiao Yin et.al.|[2305.14042v1](http://arxiv.org/abs/2305.14042v1)|null|\n", "2305.14017": "|**2023-05-23**|**Faster Video Moment Retrieval with Point-Level Supervision**|Xun Jiang et.al.|[2305.14017v1](http://arxiv.org/abs/2305.14017v1)|null|\n", "2305.14014": "|**2023-05-23**|**CLIP4STR: A Simple Baseline for Scene Text Recognition with Pre-trained Vision-Language Model**|Shuai Zhao et.al.|[2305.14014v1](http://arxiv.org/abs/2305.14014v1)|null|\n", "2305.13986": "|**2023-05-23**|**A Multi-Modal Network Equilibrium Model with Interacting Mobility Service Providers'Strategies**|Claudia Bandiera et.al.|[2305.13986v1](http://arxiv.org/abs/2305.13986v1)|null|\n", "2305.13705": "|**2023-05-23**|**DiffHand: End-to-End Hand Mesh Reconstruction via Diffusion Models**|Lijun Li et.al.|[2305.13705v1](http://arxiv.org/abs/2305.13705v1)|null|\n", "2305.13697": "|**2023-05-23**|**UNIMO-3: Multi-granularity Interaction for Vision-Language Representation Learning**|Hao Yang et.al.|[2305.13697v1](http://arxiv.org/abs/2305.13697v1)|null|\n", "2305.13667": "|**2023-05-23**|**Optimizing Non-Autoregressive Transformers with Contrastive Learning**|Chenxin An et.al.|[2305.13667v1](http://arxiv.org/abs/2305.13667v1)|null|\n", "2305.13659": "|**2023-05-23**|**Flare-Aware Cross-modal Enhancement Network for Multi-spectral Vehicle Re-identification**|Aihua Zheng et.al.|[2305.13659v1](http://arxiv.org/abs/2305.13659v1)|**[link](https://github.com/Mzq12138/Official-Implementation-for-Flare-Aware-Cross-modal-Enhancement-for-Multi-spectral-Vehicle-ReID)**|\n", "2305.13653": "|**2023-05-23**|**RaSa: Relation and Sensitivity Aware Representation Learning for Text-based Person Search**|Yang Bai et.al.|[2305.13653v1](http://arxiv.org/abs/2305.13653v1)|**[link](https://github.com/flame-chasers/rasa)**|\n", "2305.13631": "|**2023-05-23**|**EDIS: Entity-Driven Image Search over Multimodal Web Content**|Siqi Liu et.al.|[2305.13631v1](http://arxiv.org/abs/2305.13631v1)|**[link](https://github.com/emerisly/edis)**|\n", "2305.13503": "|**2023-05-22**|**Asynchronous Multi-Model Federated Learning over Wireless Networks: Theory, Modeling, and Optimization**|Zhan-Lun Chang et.al.|[2305.13503v1](http://arxiv.org/abs/2305.13503v1)|null|\n", "2305.15403": "|**2023-05-24**|**AV-TranSpeech: Audio-Visual Robust Speech-to-Speech Translation**|Rongjie Huang et.al.|[2305.15403v1](http://arxiv.org/abs/2305.15403v1)|null|\n", "2305.15302": "|**2023-05-24**|**Multi-Modal Mutual Attention and Iterative Interaction for Referring Image Segmentation**|Chang Liu et.al.|[2305.15302v1](http://arxiv.org/abs/2305.15302v1)|null|\n", "2305.15296": "|**2023-05-24**|**MultiFusion: Fusing Pre-Trained Models for Multi-Lingual, Multi-Modal Image Generation**|Marco Bellagente et.al.|[2305.15296v1](http://arxiv.org/abs/2305.15296v1)|null|\n", "2305.15218": "|**2023-05-24**|**Multi-modal Machine Learning for Vehicle Rating Predictions Using Image, Text, and Parametric Data**|Hanqi Su et.al.|[2305.15218v1](http://arxiv.org/abs/2305.15218v1)|null|\n", "2305.15217": "|**2023-05-24**|**L-CAD: Language-based Colorization with Any-level Descriptions**|Zheng Chang et.al.|[2305.15217v1](http://arxiv.org/abs/2305.15217v1)|null|\n", "2305.15159": "|**2023-05-24**|**Collaborative Recommendation Model Based on Multi-modal Multi-view Attention Network: Movie and literature cases**|Zheng Hu et.al.|[2305.15159v1](http://arxiv.org/abs/2305.15159v1)|null|\n", "2305.15094": "|**2023-05-24**|**InpaintNeRF360: Text-Guided 3D Inpainting on Unbounded Neural Radiance Fields**|Dongqing Wang et.al.|[2305.15094v1](http://arxiv.org/abs/2305.15094v1)|null|\n", "2305.15033": "|**2023-05-24**|**SmartTrim: Adaptive Tokens and Parameters Pruning for Efficient Vision-Language Models**|Zekun Wang et.al.|[2305.15033v1](http://arxiv.org/abs/2305.15033v1)|null|\n", "2305.15023": "|**2023-05-24**|**Cheap and Quick: Efficient Vision-Language Instruction Tuning for Large Language Models**|Gen Luo et.al.|[2305.15023v1](http://arxiv.org/abs/2305.15023v1)|null|\n", "2305.15021": "|**2023-05-24**|**EmbodiedGPT: Vision-Language Pre-Training via Embodied Chain of Thought**|Yao Mu et.al.|[2305.15021v1](http://arxiv.org/abs/2305.15021v1)|**[link](https://github.com/EmbodiedGPT/EmbodiedGPT_Pytorch)**|\n", "2305.14969": "|**2023-05-24**|**MMNet: Multi-Mask Network for Referring Image Segmentation**|Yichen Yan et.al.|[2305.14969v1](http://arxiv.org/abs/2305.14969v1)|null|\n", "2305.14914": "|**2023-05-24**|**GAMUS: A Geometry-aware Multi-modal Semantic Segmentation Benchmark for Remote Sensing Data**|Zhitong Xiong et.al.|[2305.14914v1](http://arxiv.org/abs/2305.14914v1)|**[link](https://github.com/earthnets/rsi-mmsegmentation)**|\n", "2305.14897": "|**2023-05-24**|**Text encoders are performance bottlenecks in contrastive vision-language models**|Amita Kamath et.al.|[2305.14897v1](http://arxiv.org/abs/2305.14897v1)|**[link](https://github.com/amitakamath/vl_text_encoders_are_bottlenecks)**|\n", "2305.14843": "|**2023-05-24**|**Meta-Learning For Vision-and-Language Cross-lingual Transfer**|Hanxu Hu et.al.|[2305.14843v1](http://arxiv.org/abs/2305.14843v1)|null|\n", "2305.14839": "|**2023-05-24**|**PaCE: Unified Multi-modal Dialogue Pre-training with Progressive and Compositional Experts**|Yunshui Li et.al.|[2305.14839v1](http://arxiv.org/abs/2305.14839v1)|**[link](https://github.com/AlibabaResearch/DAMO-ConvAI/tree/main/pace)**|\n", "2305.16318": "|**2023-05-25**|**Referred by Multi-Modality: A Unified Temporal Transformer for Video Object Segmentation**|Shilin Yan et.al.|[2305.16318v1](http://arxiv.org/abs/2305.16318v1)|**[link](https://github.com/opengvlab/mutr)**|\n", "2305.16304": "|**2023-05-25**|**Candidate Set Re-ranking for Composed Image Retrieval with Dual Multi-modal Encoder**|Zheyuan Liu et.al.|[2305.16304v1](http://arxiv.org/abs/2305.16304v1)|null|\n", "2305.16166": "|**2023-05-25**|**Multimodal Relation Extraction with Cross-Modal Retrieval and Synthesis**|Xuming Hu et.al.|[2305.16166v1](http://arxiv.org/abs/2305.16166v1)|null|\n", "2305.16107": "|**2023-05-25**|**VioLA: Unified Codec Language Models for Speech Recognition, Synthesis, and Translation**|Tianrui Wang et.al.|[2305.16107v1](http://arxiv.org/abs/2305.16107v1)|null|\n", "2305.15957": "|**2023-05-25**|**DiffCLIP: Leveraging Stable Diffusion for Language Grounded 3D Classification**|Sitian Shen et.al.|[2305.15957v1](http://arxiv.org/abs/2305.15957v1)|null|\n", "2305.15920": "|**2023-05-25**|**Learning and accurate generation of stochastic dynamics based on multi-model Generative Adversarial Networks**|Daniele Lanzoni et.al.|[2305.15920v1](http://arxiv.org/abs/2305.15920v1)|null|\n", "2305.15913": "|**2023-05-27**|**MEMEX: Detecting Explanatory Evidence for Memes via Knowledge-Enriched Contextualization**|Shivam Sharma et.al.|[2305.15913v2](http://arxiv.org/abs/2305.15913v2)|**[link](https://github.com/lcs2-iiitd/memex_meme_evidence)**|\n", "2305.15765": "|**2023-05-25**|**Language-Guided 3D Object Detection in Point Cloud for Autonomous Driving**|Wenhao Cheng et.al.|[2305.15765v1](http://arxiv.org/abs/2305.15765v1)|null|\n", "2305.15762": "|**2023-05-25**|**Dynamic Enhancement Network for Partial Multi-modality Person Re-identification**|Aihua Zheng et.al.|[2305.15762v1](http://arxiv.org/abs/2305.15762v1)|null|\n", "2305.15753": "|**2023-05-25**|**T2TD: Text-3D Generation Model based on Prior Knowledge Guidance**|Weizhi Nie et.al.|[2305.15753v1](http://arxiv.org/abs/2305.15753v1)|null|\n", "2305.15732": "|**2023-05-26**|**CLIP3Dstyler: Language Guided 3D Arbitrary Neural Style Transfer**|Ming Gao et.al.|[2305.15732v2](http://arxiv.org/abs/2305.15732v2)|null|\n", "2305.15688": "|**2023-05-25**|**Frame-Event Alignment and Fusion Network for High Frame Rate Tracking**|Jiqing Zhang et.al.|[2305.15688v1](http://arxiv.org/abs/2305.15688v1)|null|\n", "2305.15483": "|**2023-05-24**|**Weakly Supervised Vision-and-Language Pre-training with Relative Representations**|Chi Chen et.al.|[2305.15483v1](http://arxiv.org/abs/2305.15483v1)|null|\n", "2305.17102": "|**2023-05-26**|**GeoVLN: Learning Geometry-Enhanced Visual Representation with Slot Attention for Vision-and-Language Navigation**|Jingyang Huo et.al.|[2305.17102v1](http://arxiv.org/abs/2305.17102v1)|**[link](https://github.com/jingyanghuo/GeoVLN)**|\n", "2305.17100": "|**2023-05-26**|**BiomedGPT: A Unified and Generalist Biomedical Generative Pre-trained Transformer for Vision, Language, and Multimodal Tasks**|Kai Zhang et.al.|[2305.17100v1](http://arxiv.org/abs/2305.17100v1)|**[link](https://github.com/taokz/biomedgpt)**|\n", "2305.17011": "|**2023-05-26**|**SOC: Semantic-Assisted Object Cluster for Referring Video Object Segmentation**|Zhuoyan Luo et.al.|[2305.17011v1](http://arxiv.org/abs/2305.17011v1)|null|\n", "2305.16986": "|**2023-05-29**|**NavGPT: Explicit Reasoning in Vision-and-Language Navigation with Large Language Models**|Gengze Zhou et.al.|[2305.16986v2](http://arxiv.org/abs/2305.16986v2)|**[link](https://github.com/gengzezhou/navgpt)**|\n", "2305.16685": "|**2023-05-26**|**S4M: Generating Radiology Reports by A Single Model for Multiple Body Parts**|Qi Chen et.al.|[2305.16685v1](http://arxiv.org/abs/2305.16685v1)|**[link](https://github.com/ytongxie/s4m)**|\n", "2305.16556": "|**2023-05-26**|**LANISTR: Multimodal Learning from Structured and Unstructured Data**|Sayna Ebrahimi et.al.|[2305.16556v1](http://arxiv.org/abs/2305.16556v1)|null|\n", "2305.16434": "|**2023-05-25**|**Credit Valuation Adjustment in Financial Networks**|Irena Barja\u0161i\u0107 et.al.|[2305.16434v1](http://arxiv.org/abs/2305.16434v1)|null|\n", "2305.16406": "|**2023-05-25**|**Context-Aware Attention Layers coupled with Optimal Transport Domain Adaptation methods for recognizing dementia from spontaneous speech**|Loukas Ilias et.al.|[2305.16406v1](http://arxiv.org/abs/2305.16406v1)|null|\n", "2305.18171": "|**2023-05-29**|**Improved Probabilistic Image-Text Representations**|Sanghyuk Chun et.al.|[2305.18171v1](http://arxiv.org/abs/2305.18171v1)|**[link](https://github.com/naver-ai/pcmepp)**|\n", "2305.18009": "|**2023-05-29**|**Multi-Modal Face Stylization with a Generative Prior**|Mengtian Li et.al.|[2305.18009v1](http://arxiv.org/abs/2305.18009v1)|null|\n", "2305.17993": "|**2023-05-29**|**Multi-Scale Attention for Audio Question Answering**|Guangyao Li et.al.|[2305.17993v1](http://arxiv.org/abs/2305.17993v1)|**[link](https://github.com/gewu-lab/mwafm)**|\n", "2305.17941": "|**2023-05-29**|**Safety of autonomous vehicles: A survey on Model-based vs. AI-based approaches**|Dimia Iberraken et.al.|[2305.17941v1](http://arxiv.org/abs/2305.17941v1)|null|\n", "2305.17925": "|**2023-05-29**|**Identifying shifts in multi-modal travel patterns during special events using mobile data: Celebrating Vappu in Helsinki**|Zhiren Huang et.al.|[2305.17925v1](http://arxiv.org/abs/2305.17925v1)|null|\n", "2305.17911": "|**2023-05-29**|**TotalDefMeme: A Multi-Attribute Meme dataset on Total Defence in Singapore**|Nirmalendu Prakash et.al.|[2305.17911v1](http://arxiv.org/abs/2305.17911v1)|null|\n", "2305.17903": "|**2023-05-30**|**Deeply Coupled Cross-Modal Prompt Learning**|Xuejing Liu et.al.|[2305.17903v2](http://arxiv.org/abs/2305.17903v2)|**[link](https://github.com/gingl/cmpa)**|\n", "2305.17652": "|**2023-05-28**|**ConaCLIP: Exploring Distillation of Fully-Connected Knowledge Interaction Graph for Lightweight Text-Image Retrieval**|Jiapeng Wang et.al.|[2305.17652v1](http://arxiv.org/abs/2305.17652v1)|null|\n", "2305.17629": "|**2023-05-28**|**Multi-Modal Wireless Flexible Gel-Free Sensors with Edge Deep Learning for Detecting and Alerting Freezing of Gait in Parkinson's Patients**|Yuhan Hou et.al.|[2305.17629v1](http://arxiv.org/abs/2305.17629v1)|null|\n", "2305.17600": "|**2023-05-28**|**GAME-UP: Game-Aware Mode Enumeration and Understanding for Trajectory Prediction**|Justin Lidard et.al.|[2305.17600v1](http://arxiv.org/abs/2305.17600v1)|null|\n", "2305.17530": "|**2023-05-27**|**PuMer: Pruning and Merging Tokens for Efficient Vision Language Models**|Qingqing Cao et.al.|[2305.17530v1](http://arxiv.org/abs/2305.17530v1)|**[link](https://github.com/csarron/pumer)**|\n", "2305.17499": "|**2023-05-27**|**CIF-PT: Bridging Speech and Text Representations for Spoken Language Understanding via Continuous Integrate-and-Fire Pre-Training**|Linhao Dong et.al.|[2305.17499v1](http://arxiv.org/abs/2305.17499v1)|null|\n", "2305.17455": "|**2023-05-27**|**CrossGET: Cross-Guided Ensemble of Tokens for Accelerating Vision-Language Transformers**|Dachuan Shi et.al.|[2305.17455v1](http://arxiv.org/abs/2305.17455v1)|**[link](https://github.com/sdc17/crossget)**|\n", "2305.17343": "|**2023-05-27**|**Modality-Independent Teachers Meet Weakly-Supervised Audio-Visual Event Parser**|Yung-Hsuan Lai et.al.|[2305.17343v1](http://arxiv.org/abs/2305.17343v1)|**[link](https://github.com/franklin905/valor)**|\n", "2305.17219": "|**2023-05-26**|**GVdoc: Graph-based Visual Document Classification**|Fnu Mohbat et.al.|[2305.17219v1](http://arxiv.org/abs/2305.17219v1)|**[link](https://github.com/mohbattharani/GVdoc)**|\n", "2305.19270": "|**2023-05-30**|**Learning without Forgetting for Vision-Language Models**|Da-Wei Zhou et.al.|[2305.19270v1](http://arxiv.org/abs/2305.19270v1)|null|\n", "2305.19240": "|**2023-05-30**|**NetHack is Hard to Hack**|Ulyana Piterbarg et.al.|[2305.19240v1](http://arxiv.org/abs/2305.19240v1)|**[link](https://github.com/upiterbarg/hihack)**|\n", "2305.19228": "|**2023-05-30**|**Unsupervised Melody-to-Lyric Generation**|Yufei Tian et.al.|[2305.19228v1](http://arxiv.org/abs/2305.19228v1)|**[link](https://github.com/amazon-science/unsupervised-melody-to-lyrics-generation)**|\n", "2305.19216": "|**2023-05-30**|**Translation-Enhanced Multilingual Text-to-Image Generation**|Yaoyiran Li et.al.|[2305.19216v1](http://arxiv.org/abs/2305.19216v1)|null|\n", "2305.18980": "|**2023-05-30**|**Multi-modal Queried Object Detection in the Wild**|Yifan Xu et.al.|[2305.18980v1](http://arxiv.org/abs/2305.18980v1)|**[link](https://github.com/yifanxu74/mq-det)**|\n", "2305.18969": "|**2023-05-30**|**MS-DETR: Natural Language Video Localization with Sampling Moment-Moment Interaction**|Jing Wang et.al.|[2305.18969v1](http://arxiv.org/abs/2305.18969v1)|**[link](https://github.com/k-nick/ms-detr)**|\n", "2305.18898": "|**2023-05-30**|**AlphaBlock: Embodied Finetuning for Vision-Language Reasoning in Robot Manipulation**|Chuhao Jin et.al.|[2305.18898v1](http://arxiv.org/abs/2305.18898v1)|null|\n", "2305.18842": "|**2023-05-30**|**Generate then Select: Open-ended Visual Question Answering Guided by World Knowledge**|Xingyu Fu et.al.|[2305.18842v1](http://arxiv.org/abs/2305.18842v1)|null|\n", "2305.18752": "|**2023-05-30**|**GPT4Tools: Teaching Large Language Model to Use Tools via Self-instruction**|Rui Yang et.al.|[2305.18752v1](http://arxiv.org/abs/2305.18752v1)|**[link](https://github.com/stevengrove/gpt4tools)**|\n", "2305.18721": "|**2023-05-30**|**LayoutMask: Enhance Text-Layout Interaction in Multi-modal Pre-training for Document Understanding**|Yi Tu et.al.|[2305.18721v1](http://arxiv.org/abs/2305.18721v1)|null|\n", "2305.18641": "|**2023-05-29**|**Enhanced Chart Understanding in Vision and Language Task via Cross-modal Pre-training on Plot Table Pairs**|Mingyang Zhou et.al.|[2305.18641v1](http://arxiv.org/abs/2305.18641v1)|null|\n", "2305.18500": "|**2023-05-29**|**VAST: A Vision-Audio-Subtitle-Text Omni-Modality Foundation Model and Dataset**|Sihan Chen et.al.|[2305.18500v1](http://arxiv.org/abs/2305.18500v1)|**[link](https://github.com/txh-mercury/vast)**|\n", "2305.19972": "|**2023-05-31**|**ViLaS: Integrating Vision and Language into Automatic Speech Recognition**|Minglun Han et.al.|[2305.19972v1](http://arxiv.org/abs/2305.19972v1)|null|\n", "2305.19924": "|**2023-06-01**|**Joint Adaptive Representations for Image-Language Learning**|AJ Piergiovanni et.al.|[2305.19924v2](http://arxiv.org/abs/2305.19924v2)|null|\n", "2305.19912": "|**2023-05-31**|**Structure-Aware Language Model Pretraining Improves Dense Retrieval on Structured Data**|Xinze Li et.al.|[2305.19912v1](http://arxiv.org/abs/2305.19912v1)|**[link](https://github.com/openmatch/openmatch)**|\n", "2305.19894": "|**2023-05-31**|**Med-UniC: Unifying Cross-Lingual Medical Vision-Language Pre-Training by Diminishing Bias**|Zhongwei Wan et.al.|[2305.19894v1](http://arxiv.org/abs/2305.19894v1)|**[link](https://github.com/SUSTechBruce/Med-UniC)**|\n", "2305.19664": "|**2023-05-31**|**Unveiling Cross Modality Bias in Visual Question Answering: A Causal View with Possible Worlds VQA**|Ali Vosoughi et.al.|[2305.19664v1](http://arxiv.org/abs/2305.19664v1)|null|\n", "2305.19624": "|**2023-05-31**|**A Multi-Modal Transformer Network for Action Detection**|Matthew Korban et.al.|[2305.19624v1](http://arxiv.org/abs/2305.19624v1)|null|\n", "2305.19595": "|**2023-06-01**|**Dense and Aligned Captions (DAC) Promote Compositional Reasoning in VL Models**|Sivan Doveh et.al.|[2305.19595v2](http://arxiv.org/abs/2305.19595v2)|null|\n", "2305.19522": "|**2023-06-01**|**PromptStyle: Controllable Style Transfer for Text-to-Speech with Natural Language Descriptions**|Guanghou Liu et.al.|[2305.19522v2](http://arxiv.org/abs/2305.19522v2)|null|\n", "2306.00978": "|**2023-06-01**|**AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration**|Ji Lin et.al.|[2306.00978v1](http://arxiv.org/abs/2306.00978v1)|**[link](https://github.com/mit-han-lab/llm-awq)**|\n", "2306.00964": "|**2023-06-01**|**Cocktail: Mixing Multi-Modality Controls for Text-Conditional Image Generation**|Minghui Hu et.al.|[2306.00964v1](http://arxiv.org/abs/2306.00964v1)|null|\n", "2306.00958": "|**2023-06-01**|**LIV: Language-Image Representations and Rewards for Robotic Control**|Yecheng Jason Ma et.al.|[2306.00958v1](http://arxiv.org/abs/2306.00958v1)|**[link](https://github.com/penn-pal-lab/liv)**|\n", "2306.00932": "|**2023-06-01**|**Cross Modal Data Discovery over Structured and Unstructured Data Lakes**|Mohamed Y. Eltabakh et.al.|[2306.00932v1](http://arxiv.org/abs/2306.00932v1)|**[link](https://github.com/qcri/cmdl)**|\n", "2306.00813": "|**2023-06-01**|**UniDiff: Advancing Vision-Language Models with Generative and Discriminative Learning**|Xiao Dong et.al.|[2306.00813v1](http://arxiv.org/abs/2306.00813v1)|null|\n", "2306.00792": "|**2023-06-01**|**Learning Across Decentralized Multi-Modal Remote Sensing Archives with Federated Learning**|Bar\u0131\u015f B\u00fcy\u00fckta\u015f et.al.|[2306.00792v1](http://arxiv.org/abs/2306.00792v1)|null|\n", "2306.00789": "|**2023-06-01**|**Improved Cross-Lingual Transfer Learning For Automatic Speech Translation**|Sameer Khurana et.al.|[2306.00789v1](http://arxiv.org/abs/2306.00789v1)|null|\n", "2306.00783": "|**2023-06-01**|**FDNeRF: Semantics-Driven Face Reconstruction, Prompt Editing and Relighting with Diffusion Models**|Hao Zhang et.al.|[2306.00783v1](http://arxiv.org/abs/2306.00783v1)|**[link](https://github.com/billyxyb/fdnerf)**|\n", "2306.00640": "|**2023-06-01**|**Multi-Modal Deep Learning for Multi-Temporal Urban Mapping With a Partly Missing Optical Modality**|Sebastian Hafner et.al.|[2306.00640v1](http://arxiv.org/abs/2306.00640v1)|null|\n", "2306.00424": "|**2023-06-01**|**End-to-end Knowledge Retrieval with Multi-modal Queries**|Man Luo et.al.|[2306.00424v1](http://arxiv.org/abs/2306.00424v1)|**[link](https://github.com/luomancs/remuq)**|\n", "2306.00409": "|**2023-06-01**|**Adapting Pre-trained Language Models to Vision-Language Tasks via Dynamic Visual Prompting**|Shubin Huang et.al.|[2306.00409v1](http://arxiv.org/abs/2306.00409v1)|**[link](https://github.com/hsb1357173526/dynamic_visual_prompting)**|\n", "2306.00386": "|**2023-06-01**|**Symmetric Uncertainty-Aware Feature Transmission for Depth Super-Resolution**|Wuxuan Shi et.al.|[2306.00386v1](http://arxiv.org/abs/2306.00386v1)|**[link](https://github.com/shiwuxuan/suft)**|\n", "2306.00228": "|**2023-05-31**|**Using Visual Cropping to Enhance Fine-Detail Question Answering of BLIP-Family Models**|Jiarui Zhang et.al.|[2306.00228v1](http://arxiv.org/abs/2306.00228v1)|null|\n", "2306.00179": "|**2023-05-31**|**LeggedWalking on Inclined Surfaces**|Chenghao Wang et.al.|[2306.00179v1](http://arxiv.org/abs/2306.00179v1)|null|\n", "2306.00103": "|**2023-05-31**|**ManagerTower: Aggregating the Insights of Uni-Modal Experts for Vision-Language Representation Learning**|Xiao Xu et.al.|[2306.00103v1](http://arxiv.org/abs/2306.00103v1)|**[link](https://github.com/looperxx/managertower)**|\n", "2306.01733": "|**2023-06-02**|**DocFormerv2: Local Features for Document Understanding**|Srikar Appalaraju et.al.|[2306.01733v1](http://arxiv.org/abs/2306.01733v1)|null|\n", "2306.01675": "|**2023-06-02**|**Bayesian Segmentation Modeling of Epidemic Growth**|Tejasv Bedi et.al.|[2306.01675v1](http://arxiv.org/abs/2306.01675v1)|null|\n", "2306.01656": "|**2023-06-02**|**Backchannel Detection and Agreement Estimation from Video with Transformer Networks**|Ahmed Amer et.al.|[2306.01656v1](http://arxiv.org/abs/2306.01656v1)|**[link](https://git.opendfki.de/body_language/ijcnn23-backchannel-detection)**|\n", "2306.01523": "|**2023-06-02**|**Transformer-based Multi-Modal Learning for Multi Label Remote Sensing Image Classification**|David Hoffmann et.al.|[2306.01523v1](http://arxiv.org/abs/2306.01523v1)|null|\n", "2306.01492": "|**2023-06-02**|**Multi-Modal Emotion Recognition for Enhanced Requirements Engineering: A Novel Approach**|Ben Cheng et.al.|[2306.01492v1](http://arxiv.org/abs/2306.01492v1)|null|\n", "2306.01312": "|**2023-06-02**|**Syntax-aware Hybrid prompt model for Few-shot multi-modal sentiment analysis**|Zikai Zhou et.al.|[2306.01312v1](http://arxiv.org/abs/2306.01312v1)|null|\n", "2306.01311": "|**2023-06-02**|**MetaVL: Transferring In-Context Learning Ability From Language Models to Vision-Language Models**|Masoud Monajatipoor et.al.|[2306.01311v1](http://arxiv.org/abs/2306.01311v1)|null|\n", "2306.01163": "|**2023-06-01**|**A Multi-Modal Latent-Features based Service Recommendation System for the Social Internet of Things**|Amar Khelloufi et.al.|[2306.01163v1](http://arxiv.org/abs/2306.01163v1)|null|\n", "2306.01144": "|**2023-06-01**|**Evaluating the Capabilities of Multi-modal Reasoning Models with Synthetic Task Data**|Nathan Vaska et.al.|[2306.01144v1](http://arxiv.org/abs/2306.01144v1)|null|\n", "2306.01112": "|**2023-06-01**|**What if We Enrich day-ahead Solar Irradiance Time Series Forecasting with Spatio-Temporal Context?**|Oussama Boussif et.al.|[2306.01112v1](http://arxiv.org/abs/2306.01112v1)|**[link](https://github.com/gitbooo/CrossViVit)**|\n", "2306.02972": "|**2023-06-05**|**Simultaneous or Sequential Training? How Speech Representations Cooperate in a Multi-Task Self-Supervised Learning System**|Khazar Khorrami et.al.|[2306.02972v1](http://arxiv.org/abs/2306.02972v1)|null|\n", "2306.02901": "|**2023-06-05**|**A Vessel-Segmentation-Based CycleGAN for Unpaired Multi-modal Retinal Image Synthesis**|Aline Sindel et.al.|[2306.02901v1](http://arxiv.org/abs/2306.02901v1)|null|\n", "2306.02894": "|**2023-06-05**|**Recyclable Semi-supervised Method Based on Multi-model Ensemble for Video Scene Parsing**|Biao Wu et.al.|[2306.02894v1](http://arxiv.org/abs/2306.02894v1)|null|\n", "2306.02858": "|**2023-06-06**|**Video-LLaMA: An Instruction-tuned Audio-Visual Language Model for Video Understanding**|Hang Zhang et.al.|[2306.02858v2](http://arxiv.org/abs/2306.02858v2)|**[link](https://github.com/damo-nlp-sg/video-llama)**|\n", "2306.02841": "|**2023-06-05**|**CTRL: Connect Tabular and Language Model for CTR Prediction**|Xiangyang Li et.al.|[2306.02841v1](http://arxiv.org/abs/2306.02841v1)|null|\n", "2306.02831": "|**2023-06-05**|**MM-DAG: Multi-task DAG Learning for Multi-modal Data -- with Application for Traffic Congestion Analysis**|Tian Lan et.al.|[2306.02831v1](http://arxiv.org/abs/2306.02831v1)|**[link](https://github.com/lantian72/mm-dag)**|\n", "2306.02673": "|**2023-06-05**|**Cross-Modal Vertical Federated Learning for MRI Reconstruction**|Yunlu Yan et.al.|[2306.02673v1](http://arxiv.org/abs/2306.02673v1)|null|\n", "2306.02596": "|**2023-06-05**|**A Novel Interpretable and Generalizable Re-synchronization Model for Cued Speech based on a Multi-Cuer Corpus**|Lufei Gao et.al.|[2306.02596v1](http://arxiv.org/abs/2306.02596v1)|**[link](https://github.com/lufei321/resync-cs)**|\n", "2306.02546": "|**2023-06-05**|**LmPa: Improving Decompilation by Synergy of Large Language Model and Program Analysis**|Xiangzhe Xu et.al.|[2306.02546v1](http://arxiv.org/abs/2306.02546v1)|null|\n", "2306.02329": "|**2023-06-04**|**Multi-CLIP: Contrastive Vision-Language Pre-training for Question Answering tasks in 3D Scenes**|Alexandros Delitzas et.al.|[2306.02329v1](http://arxiv.org/abs/2306.02329v1)|null|\n", "2306.02307": "|**2023-06-04**|**Finding the SWEET Spot: Analysis and Improvement of Adaptive Inference in Low Resource Settings**|Daniel Rotem et.al.|[2306.02307v1](http://arxiv.org/abs/2306.02307v1)|null|\n", "2306.02259": "|**2023-06-04**|**Predicting Information Pathways Across Online Communities**|Yiqiao Jin et.al.|[2306.02259v1](http://arxiv.org/abs/2306.02259v1)|**[link](https://github.com/claws-lab/inpac)**|\n", "2306.02137": "|**2023-06-03**|**Inconsistent Matters: A Knowledge-guided Dual-consistency Network for Multi-modal Rumor Detection**|Mengzhu Sun et.al.|[2306.02137v1](http://arxiv.org/abs/2306.02137v1)|**[link](https://github.com/mengzsun/kdcn)**|\n", "2306.02050": "|**2023-06-06**|**Provable Dynamic Fusion for Low-Quality Multimodal Data**|Qingyang Zhang et.al.|[2306.02050v2](http://arxiv.org/abs/2306.02050v2)|**[link](https://github.com/qingyangzhang/qmf)**|\n", "2306.01929": "|**2023-06-02**|**Recent Advances of Local Mechanisms in Computer Vision: A Survey and Outlook of Recent Work**|Qiangchang Wang et.al.|[2306.01929v1](http://arxiv.org/abs/2306.01929v1)|null|\n", "2306.03899": "|**2023-06-06**|**Towards Label-free Scene Understanding by Vision Foundation Models**|Runnan Chen et.al.|[2306.03899v1](http://arxiv.org/abs/2306.03899v1)|**[link](https://github.com/runnanchen/label-free-scene-understanding)**|\n", "2306.03810": "|**2023-06-06**|**X-Align++: cross-modal cross-view alignment for Bird's-eye-view segmentation**|Shubhankar Borse et.al.|[2306.03810v1](http://arxiv.org/abs/2306.03810v1)|null|\n", "2306.03802": "|**2023-06-06**|**Learning to Ground Instructional Articles in Videos through Narrations**|Effrosyni Mavroudi et.al.|[2306.03802v1](http://arxiv.org/abs/2306.03802v1)|null|\n", "2306.03730": "|**2023-06-06**|**Modality-Agnostic Learning for Medical Image Segmentation Using Multi-modality Self-distillation**|Qisheng He et.al.|[2306.03730v1](http://arxiv.org/abs/2306.03730v1)|null|\n", "2306.03678": "|**2023-06-06**|**On the Difference of BERT-style and CLIP-style Text Encoders**|Zhihong Chen et.al.|[2306.03678v1](http://arxiv.org/abs/2306.03678v1)|**[link](https://github.com/zhjohnchan/bert-clip-synesthesia)**|\n", "2306.03650": "|**2023-06-06**|**A Quantum Probability Driven Framework for Joint Multi-Modal Sarcasm, Sentiment and Emotion Analysis**|Yaochen Liu et.al.|[2306.03650v1](http://arxiv.org/abs/2306.03650v1)|null|\n", "2306.03617": "|**2023-06-06**|**A Data-Efficient Approach for Long-Term Human Motion Prediction Using Maps of Dynamics**|Yufei Zhu et.al.|[2306.03617v1](http://arxiv.org/abs/2306.03617v1)|null|\n", "2306.03367": "|**2023-06-06**|**Bridging the Gap Between Multi-Step and One-Shot Trajectory Prediction via Self-Supervision**|Faris Janjo\u0161 et.al.|[2306.03367v1](http://arxiv.org/abs/2306.03367v1)|null|\n", "2306.03252": "|**2023-06-05**|**RACECAR -- The Dataset for High-Speed Autonomous Racing**|Amar Kulkarni et.al.|[2306.03252v1](http://arxiv.org/abs/2306.03252v1)|**[link](https://github.com/linklab-uva/racecar_data)**|\n", "2306.04445": "|**2023-06-07**|**Multi-modal Latent Diffusion**|Mustapha Bounoua et.al.|[2306.04445v1](http://arxiv.org/abs/2306.04445v1)|null|\n", "2306.04387": "|**2023-06-08**|**M$^3$IT: A Large-Scale Dataset towards Multi-Modal Multilingual Instruction Tuning**|Lei Li et.al.|[2306.04387v2](http://arxiv.org/abs/2306.04387v2)|null|\n", "2306.04362": "|**2023-06-07**|**Youku-mPLUG: A 10 Million Large-scale Chinese Video-Language Dataset for Pre-training and Benchmarks**|Haiyang Xu et.al.|[2306.04362v1](http://arxiv.org/abs/2306.04362v1)|**[link](https://github.com/x-plug/youku-mplug)**|\n", "2306.04272": "|**2023-06-07**|**On the Generalization of Multi-modal Contrastive Learning**|Qi Zhang et.al.|[2306.04272v1](http://arxiv.org/abs/2306.04272v1)|**[link](https://github.com/pku-ml/clip-help-simclr)**|\n", "2306.04163": "|**2023-06-07**|**Enhancing Virtual Assistant Intelligence: Precise Area Targeting for Instance-level User Intents beyond Metadata**|Mengyu Chen et.al.|[2306.04163v1](http://arxiv.org/abs/2306.04163v1)|null|\n", "2306.04083": "|**2023-06-07**|**Coverage Path Planning with Budget Constraints for Multiple Unmanned Ground Vehicles**|Vu Phi Tran et.al.|[2306.04083v1](http://arxiv.org/abs/2306.04083v1)|null|\n", "2306.04021": "|**2023-06-06**|**Energy-Based Models for Cross-Modal Localization using Convolutional Transformers**|Alan Wu et.al.|[2306.04021v1](http://arxiv.org/abs/2306.04021v1)|null|\n", "2306.05425": "|**2023-06-08**|**MIMIC-IT: Multi-Modal In-Context Instruction Tuning**|Bo Li et.al.|[2306.05425v1](http://arxiv.org/abs/2306.05425v1)|**[link](https://github.com/luodian/otter)**|\n", "2306.04928": "|**2023-06-08**|**Underwater Intention Recognition using Head Motion and Throat Vibration for Supernumerary Robotic Assistance**|Yuqin Guo et.al.|[2306.04928v1](http://arxiv.org/abs/2306.04928v1)|null|\n", "2306.06048": "|**2023-06-09**|**How Does Fine-Tuning Impact Out-of-Distribution Detection for Vision-Language Models?**|Yifei Ming et.al.|[2306.06048v1](http://arxiv.org/abs/2306.06048v1)|null|\n", "2306.05716": "|**2023-06-09**|**Pave the Way to Grasp Anything: Transferring Foundation Models for Universal Pick-Place Robots**|Jiange Yang et.al.|[2306.05716v1](http://arxiv.org/abs/2306.05716v1)|null|\n", "2306.05493": "|**2023-06-08**|**Multi-Modal Classifiers for Open-Vocabulary Object Detection**|Prannay Kaul et.al.|[2306.05493v1](http://arxiv.org/abs/2306.05493v1)|null|\n", "2306.07272": "|**2023-06-12**|**Zero-shot Composed Text-Image Retrieval**|Yikun Liu et.al.|[2306.07272v1](http://arxiv.org/abs/2306.07272v1)|**[link](https://github.com/Code-kunkun/ZS-CIR)**|\n", "2306.07257": "|**2023-06-12**|**MovieFactory: Automatic Movie Creation from Text using Large Generative Models for Language and Images**|Junchen Zhu et.al.|[2306.07257v1](http://arxiv.org/abs/2306.07257v1)|null|\n", "2306.07207": "|**2023-06-12**|**Valley: Video Assistant with Large Language model Enhanced abilitY**|Ruipu Luo et.al.|[2306.07207v1](http://arxiv.org/abs/2306.07207v1)|**[link](https://github.com/rupertluo/valley)**|\n", "2306.07196": "|**2023-06-12**|**Retrieval-Enhanced Contrastive Vision-Text Models**|Ahmet Iscen et.al.|[2306.07196v1](http://arxiv.org/abs/2306.07196v1)|null|\n", "2306.07187": "|**2023-06-12**|**Video-to-Music Recommendation using Temporal Alignment of Segments**|Laure Pr\u00e9tet et.al.|[2306.07187v1](http://arxiv.org/abs/2306.07187v1)|null|\n", "2306.07096": "|**2023-06-12**|**Global and Local Semantic Completion Learning for Vision-Language Pre-training**|Rong-Cheng Tu et.al.|[2306.07096v1](http://arxiv.org/abs/2306.07096v1)|**[link](https://github.com/iigroup/scl)**|\n", "2306.06885": "|**2023-06-12**|**NPVForensics: Jointing Non-critical Phonemes and Visemes for Deepfake Detection**|Yu Chen et.al.|[2306.06885v1](http://arxiv.org/abs/2306.06885v1)|null|\n", "2306.06691": "|**2023-06-11**|**Self-Enhancement Improves Text-Image Retrieval in Foundation Visual-Language Models**|Yuguang Yang et.al.|[2306.06691v1](http://arxiv.org/abs/2306.06691v1)|null|\n", "2306.06687": "|**2023-06-11**|**LAMM: Language-Assisted Multi-Modal Instruction-Tuning Dataset, Framework, and Benchmark**|Zhenfei Yin et.al.|[2306.06687v1](http://arxiv.org/abs/2306.06687v1)|**[link](https://github.com/openlamm/lamm)**|\n", "2306.06615": "|**2023-06-11**|**Empowering Molecule Discovery for Molecule-Caption Translation with Large Language Models: A ChatGPT Perspective**|Jiatong Li et.al.|[2306.06615v1](http://arxiv.org/abs/2306.06615v1)|**[link](https://github.com/phenixace/molregpt)**|\n", "2306.06583": "|**2023-06-11**|**REACT2023: the first Multi-modal Multiple Appropriate Facial Reaction Generation Challenge**|Siyang Song et.al.|[2306.06583v1](http://arxiv.org/abs/2306.06583v1)|**[link](https://github.com/reactmultimodalchallenge/baseline_react2023)**|\n", "2306.06494": "|**2023-06-10**|**Multi-modal Pre-training for Medical Vision-language Understanding and Generation: An Empirical Study with A New Benchmark**|Li Xu et.al.|[2306.06494v1](http://arxiv.org/abs/2306.06494v1)|**[link](https://github.com/control-xl/medical-vision-langauge-transformer)**|\n", "2306.06476": "|**2023-06-10**|**Modality Influence in Multimodal Machine Learning**|Abdelhamid Haouhat et.al.|[2306.06476v1](http://arxiv.org/abs/2306.06476v1)|null|\n", "2306.06465": "|**2023-06-10**|**Simultaneous Trajectory Optimization and Contact Selection for Multi-Modal Manipulation Planning**|Mengchao Zhang et.al.|[2306.06465v1](http://arxiv.org/abs/2306.06465v1)|null|\n", "2306.06410": "|**2023-06-10**|**OpenSR: Open-Modality Speech Recognition via Maintaining Multi-Modality Alignment**|Xize Cheng et.al.|[2306.06410v1](http://arxiv.org/abs/2306.06410v1)|**[link](https://github.com/exgc/opensr)**|\n", "2306.07744": "|**2023-06-13**|**Contrastive Learning-Based Audio to Lyrics Alignment for Multiple Languages**|Simon Durand et.al.|[2306.07744v1](http://arxiv.org/abs/2306.07744v1)|**[link](https://github.com/f90/jamendolyrics)**|\n", "2306.07646": "|**2023-06-13**|**Enhanced Multimodal Representation Learning with Cross-modal KD**|Mengxi Chen et.al.|[2306.07646v1](http://arxiv.org/abs/2306.07646v1)|null|\n", "2306.07505": "|**2023-06-13**|**Deep learning radiomics for assessment of gastroesophageal varices in people with compensated advanced chronic liver disease**|Lan Wang et.al.|[2306.07505v1](http://arxiv.org/abs/2306.07505v1)|null|\n", "2306.07303": "|**2023-06-11**|**A Comprehensive Survey on Applications of Transformers for Deep Learning Tasks**|Saidul Islam et.al.|[2306.07303v1](http://arxiv.org/abs/2306.07303v1)|null|\n", "2306.09347": "|**2023-06-15**|**Segment Any Point Cloud Sequences by Distilling Vision Foundation Models**|Youquan Liu et.al.|[2306.09347v1](http://arxiv.org/abs/2306.09347v1)|**[link](https://github.com/youquanl/segment-any-point-cloud)**|\n", "2306.09265": "|**2023-06-15**|**LVLM-eHub: A Comprehensive Evaluation Benchmark for Large Vision-Language Models**|Peng Xu et.al.|[2306.09265v1](http://arxiv.org/abs/2306.09265v1)|**[link](https://github.com/opengvlab/multi-modality-arena)**|\n", "2306.09093": "|**2023-06-15**|**Macaw-LLM: Multi-Modal Language Modeling with Image, Audio, Video, and Text Integration**|Chenyang Lyu et.al.|[2306.09093v1](http://arxiv.org/abs/2306.09093v1)|**[link](https://github.com/lyuchenyang/macaw-llm)**|\n", "2306.09067": "|**2023-06-15**|**Winning Solution for the CVPR2023 Visual Anomaly and Novelty Detection Challenge: Multimodal Prompting for Data-centric Anomaly Detection**|Yunkang Cao et.al.|[2306.09067v1](http://arxiv.org/abs/2306.09067v1)|**[link](https://github.com/caoyunkang/segment-any-anomaly)**|\n", "2306.08966": "|**2023-06-15**|**Training Multimedia Event Extraction With Generated Images and Captions**|Zilin Du et.al.|[2306.08966v1](http://arxiv.org/abs/2306.08966v1)|null|\n", "2306.08893": "|**2023-06-15**|**LOVM: Language-Only Vision Model Selection**|Orr Zohar et.al.|[2306.08893v1](http://arxiv.org/abs/2306.08893v1)|**[link](https://github.com/orrzohar/lovm)**|\n", "2306.08871": "|**2023-06-15**|**Med-MMHL: A Multi-Modal Dataset for Detecting Human- and LLM-Generated Misinformation in the Medical Domain**|Yanshen Sun et.al.|[2306.08871v1](http://arxiv.org/abs/2306.08871v1)|**[link](https://github.com/styxsys0927/med-mmhl)**|\n", "2306.08832": "|**2023-06-15**|**Contrasting Intra-Modal and Ranking Cross-Modal Hard Negatives to Enhance Visio-Linguistic Fine-grained Understanding**|Le Zhang et.al.|[2306.08832v1](http://arxiv.org/abs/2306.08832v1)|**[link](https://github.com/magiccircuit/enhance-finegrained)**|\n", "2306.08789": "|**2023-06-15**|**Efficient Token-Guided Image-Text Retrieval with Consistent Multimodal Contrastive Training**|Chong Liu et.al.|[2306.08789v1](http://arxiv.org/abs/2306.08789v1)|null|\n", "2306.08749": "|**2023-06-14**|**Utilizing Longitudinal Chest X-Rays and Reports to Pre-Fill Radiology Reports**|Qingqing Zhu et.al.|[2306.08749v1](http://arxiv.org/abs/2306.08749v1)|null|\n", "2306.08657": "|**2023-06-14**|**EMERSK -- Explainable Multimodal Emotion Recognition with Situational Knowledge**|Mijanur Palash et.al.|[2306.08657v1](http://arxiv.org/abs/2306.08657v1)|null|\n", "2306.08640": "|**2023-06-14**|**AssistGPT: A General Multi-modal Assistant that can Plan, Execute, Inspect, and Learn**|Difei Gao et.al.|[2306.08640v1](http://arxiv.org/abs/2306.08640v1)|null|\n", "2306.08522": "|**2023-06-14**|**Challenges of Indoor SLAM: A multi-modal multi-floor dataset for SLAM evaluation**|Pushyami Kaveti et.al.|[2306.08522v1](http://arxiv.org/abs/2306.08522v1)|**[link](https://github.com/neufieldrobotics/nufr-m3f)**|\n", "2306.08498": "|**2023-06-14**|**RISCLIP: Referring Image Segmentation Framework using CLIP**|Seoyeon Kim et.al.|[2306.08498v1](http://arxiv.org/abs/2306.08498v1)|**[link](https://github.com/Yeon07/RISCLIP)**|\n", "2306.08247": "|**2023-06-14**|**Diffusion in Diffusion: Cyclic One-Way Diffusion for Text-Vision-Conditioned Generation**|Yongqi Yang et.al.|[2306.08247v1](http://arxiv.org/abs/2306.08247v1)|null|\n", "2306.09851": "|**2023-06-16**|**Joint multi-modal Self-Supervised pre-training in Remote Sensing: Application to Methane Source Classification**|Paul Berg et.al.|[2306.09851v1](http://arxiv.org/abs/2306.09851v1)|null|\n", "2306.09546": "|**2023-06-15**|**Cross-Modal Video to Body-joints Augmentation for Rehabilitation Exercise Quality Assessment**|Ali Abedi et.al.|[2306.09546v1](http://arxiv.org/abs/2306.09546v1)|null|\n", "2306.09523": "|**2023-06-19**|**Tell Me Where to Go: A Composable Framework for Context-Aware Embodied Robot Navigation**|Harel Biggie et.al.|[2306.09523v2](http://arxiv.org/abs/2306.09523v2)|null|\n", "2306.09417": "|**2023-06-15**|**Diff-TTSG: Denoising probabilistic integrated speech and gesture synthesis**|Shivam Mehta et.al.|[2306.09417v1](http://arxiv.org/abs/2306.09417v1)|null|\n", "2306.11510": "|**2023-06-20**|**Pushing the Limits of 3D Shape Generation at Scale**|Wang Yu et.al.|[2306.11510v1](http://arxiv.org/abs/2306.11510v1)|null|\n", "2306.11504": "|**2023-06-20**|**Align, Adapt and Inject: Sound-guided Unified Image Generation**|Yue Yang et.al.|[2306.11504v1](http://arxiv.org/abs/2306.11504v1)|null|\n", "2306.11400": "|**2023-06-20**|**MuDPT: Multi-modal Deep-symphysis Prompt Tuning for Large Pre-trained Vision-Language Models**|Yongzhu Miao et.al.|[2306.11400v1](http://arxiv.org/abs/2306.11400v1)|**[link](https://github.com/mechrev0/mudpt)**|\n", "2306.11207": "|**2023-06-22**|**Quilt-1M: One Million Image-Text Pairs for Histopathology**|Wisdom Oluchi Ikezogwo et.al.|[2306.11207v2](http://arxiv.org/abs/2306.11207v2)|**[link](https://github.com/wisdomikezogwo/quilt1m)**|\n", "2306.11137": "|**2023-06-19**|**Deep Learning Framework with Multi-Head Dilated Encoders for Enhanced Segmentation of Cervical Cancer on Multiparametric Magnetic Resonance Imaging**|Reza Kalantar et.al.|[2306.11137v1](http://arxiv.org/abs/2306.11137v1)|null|\n", "2306.11065": "|**2023-06-19**|**Cross-Modal Attribute Insertions for Assessing the Robustness of Vision-and-Language Learning**|Shivaen Ramshetty et.al.|[2306.11065v1](http://arxiv.org/abs/2306.11065v1)|**[link](https://github.com/claws-lab/multimodal-robustness-xmai)**|\n", "2306.11025": "|**2023-06-19**|**Temporal Data Meets LLM -- Explainable Financial Time Series Forecasting**|Xinli Yu et.al.|[2306.11025v1](http://arxiv.org/abs/2306.11025v1)|null|\n", "2306.11020": "|**2023-06-19**|**Dual-Gated Fusion with Prefix-Tuning for Multi-Modal Relation Extraction**|Qian Li et.al.|[2306.11020v1](http://arxiv.org/abs/2306.11020v1)|null|\n", "2306.10830": "|**2023-06-19**|**3D VR Sketch Guided 3D Shape Prototyping and Exploration**|Ling Luo et.al.|[2306.10830v1](http://arxiv.org/abs/2306.10830v1)|**[link](https://github.com/rowl1ng/3dsketch2shape)**|\n", "2306.10799": "|**2023-06-19**|**SelfTalk: A Self-Supervised Commutative Training Diagram to Comprehend 3D Talking Faces**|Ziqiao Peng et.al.|[2306.10799v1](http://arxiv.org/abs/2306.10799v1)|**[link](https://github.com/psyai-net/SelfTalk_release)**|\n", "2306.10772": "|**2023-06-19**|**Learning an Interpretable End-to-End Network for Real-Time Acoustic Beamforming**|Hao Liang et.al.|[2306.10772v1](http://arxiv.org/abs/2306.10772v1)|null|\n", "2306.10750": "|**2023-06-19**|**WiCo: Win-win Cooperation of Bottom-up and Top-down Referring Image Segmentation**|Zesen Cheng et.al.|[2306.10750v1](http://arxiv.org/abs/2306.10750v1)|null|\n", "2306.10730": "|**2023-06-19**|**UniG3D: A Unified 3D Object Generation Dataset**|Qinghong Sun et.al.|[2306.10730v1](http://arxiv.org/abs/2306.10730v1)|null|\n", "2306.10687": "|**2023-06-19**|**Categories of Response-Based, Feature-Based, and Relation-Based Knowledge Distillation**|Chuanguang Yang et.al.|[2306.10687v1](http://arxiv.org/abs/2306.10687v1)|null|\n", "2306.10567": "|**2023-06-18**|**MIR-GAN: Refining Frame-Level Modality-Invariant Representations with Adversarial Network for Audio-Visual Speech Recognition**|Yuchen Hu et.al.|[2306.10567v1](http://arxiv.org/abs/2306.10567v1)|**[link](https://github.com/yuchen005/mir-gan)**|\n", "2306.12387": "|**2023-06-21**|**Solving Dialogue Grounding Embodied Task in a Simulated Environment using Further Masked Language Modeling**|Weijie Jack Zhang et.al.|[2306.12387v1](http://arxiv.org/abs/2306.12387v1)|null|\n", "2306.11762": "|**2023-06-20**|**MultiEarth 2023 Deforestation Challenge -- Team FOREVER**|Seunghan Park et.al.|[2306.11762v1](http://arxiv.org/abs/2306.11762v1)|null|\n", "2306.13076": "|**2023-06-22**|**A Comparison of Time-based Models for Multimodal Emotion Recognition**|Ege Kesim et.al.|[2306.13076v1](http://arxiv.org/abs/2306.13076v1)|null|\n", "2306.12819": "|**2023-06-22**|**XACML Extension for Graphs: Flexible Authorization Policy Specification and Datastore-independent Enforcement**|Aya Mohamed et.al.|[2306.12819v1](http://arxiv.org/abs/2306.12819v1)|null|\n", "2306.12795": "|**2023-06-22**|**Learning Unseen Modality Interaction**|Yunhua Zhang et.al.|[2306.12795v1](http://arxiv.org/abs/2306.12795v1)|null|\n", "2306.12725": "|**2023-06-22**|**Generative Multimodal Entity Linking**|Senbao Shi et.al.|[2306.12725v1](http://arxiv.org/abs/2306.12725v1)|**[link](https://github.com/hitsz-tmg/gemel)**|\n", "2306.12559": "|**2023-06-21**|**Exploring the Role of Audio in Video Captioning**|Yuhan Shen et.al.|[2306.12559v1](http://arxiv.org/abs/2306.12559v1)|null|\n", "2306.12525": "|**2023-06-21**|**LPFormer: LiDAR Pose Estimation Transformer with Multi-Task Network**|Dongqiangzi Ye et.al.|[2306.12525v1](http://arxiv.org/abs/2306.12525v1)|null|\n", "2306.13592": "|**2023-06-23**|**TACOformer:Token-channel compounded Cross Attention for Multimodal Emotion Recognition**|Xinda Li et.al.|[2306.13592v1](http://arxiv.org/abs/2306.13592v1)|null|\n", "2306.13285": "|**2023-06-23**|**Learning Scene Flow With Skeleton Guidance For 3D Action Recognition**|Vasileios Magoulianitis et.al.|[2306.13285v1](http://arxiv.org/abs/2306.13285v1)|null|\n", "2306.13240": "|**2023-06-22**|**Continuous Online Extrinsic Calibration of Fisheye Camera and LiDAR**|Jack Borer et.al.|[2306.13240v1](http://arxiv.org/abs/2306.13240v1)|null|\n", "2306.14795": "|**2023-06-26**|**MotionGPT: Human Motion as a Foreign Language**|Biao Jiang et.al.|[2306.14795v1](http://arxiv.org/abs/2306.14795v1)|**[link](https://github.com/openmotionlab/motiongpt)**|\n", "2306.14565": "|**2023-06-26**|**Aligning Large Multi-Modal Model with Robust Instruction Tuning**|Fuxiao Liu et.al.|[2306.14565v1](http://arxiv.org/abs/2306.14565v1)|**[link](https://github.com/FuxiaoLiu/LRV-Instruction)**|\n", "2306.14406": "|**2023-06-26**|**TCEIP: Text Condition Embedded Regression Network for Dental Implant Position Prediction**|Xinquan Yang et.al.|[2306.14406v1](http://arxiv.org/abs/2306.14406v1)|null|\n", "2306.14399": "|**2023-06-26**|**Mutual Query Network for Multi-Modal Product Image Segmentation**|Yun Guo et.al.|[2306.14399v1](http://arxiv.org/abs/2306.14399v1)|**[link](https://github.com/weifeng-github/mqn)**|\n", "2306.14177": "|**2023-06-25**|**Enhancing Mapless Trajectory Prediction through Knowledge Distillation**|Yuning Wang et.al.|[2306.14177v1](http://arxiv.org/abs/2306.14177v1)|null|\n", "2306.14170": "|**2023-06-25**|**AV-SepFormer: Cross-Attention SepFormer for Audio-Visual Target Speaker Extraction**|Jiuxin Lin et.al.|[2306.14170v1](http://arxiv.org/abs/2306.14170v1)|**[link](https://github.com/lin9x/av-sepformer)**|\n", "2306.14143": "|**2023-06-25**|**Intelligent Multi-Modal Sensing-Communication Integration: Synesthesia of Machines**|Xiang Cheng et.al.|[2306.14143v1](http://arxiv.org/abs/2306.14143v1)|null|\n", "2306.14125": "|**2023-06-25**|**M$^3$SC: A Generic Dataset for Mixed Multi-Modal (MMM) Sensing and Communication Integration**|Xiang Cheng et.al.|[2306.14125v1](http://arxiv.org/abs/2306.14125v1)|null|\n", "2306.14112": "|**2023-06-25**|**Enhancing Dynamic Image Advertising with Vision-Language Pre-training**|Zhoufutu Wen et.al.|[2306.14112v1](http://arxiv.org/abs/2306.14112v1)|null|\n", "2306.13856": "|**2023-06-24**|**Learning-to-Rank Meets Language: Boosting Language-Driven Ordering Alignment for Ordinal Classification**|Rui Wang et.al.|[2306.13856v1](http://arxiv.org/abs/2306.13856v1)|**[link](https://github.com/raywang335/l2rclip)**|\n", "2306.13804": "|**2023-06-27**|**Cross-Language Speech Emotion Recognition Using Multimodal Dual Attention Transformers**|Syed Aun Muhammad Zaidi et.al.|[2306.13804v2](http://arxiv.org/abs/2306.13804v2)|null|\n", "2306.15644": "|**2023-06-27**|**Style-transfer based Speech and Audio-visual Scene Understanding for Robot Action Sequence Acquisition from Videos**|Chiori Hori et.al.|[2306.15644v1](http://arxiv.org/abs/2306.15644v1)|null|\n", "2306.15612": "|**2023-06-27**|**Rethinking Cross-Entropy Loss for Stereo Matching Networks**|Peng Xu et.al.|[2306.15612v1](http://arxiv.org/abs/2306.15612v1)|null|\n", "2306.15605": "|**2023-06-27**|**Deep Normalizing Flows for State Estimation**|Harrison Delecki et.al.|[2306.15605v1](http://arxiv.org/abs/2306.15605v1)|**[link](https://github.com/sisl/deepnfstateestimation)**|\n", "2306.15464": "|**2023-06-27**|**Large-scale unsupervised audio pre-training for video-to-speech synthesis**|Triantafyllos Kefalas et.al.|[2306.15464v1](http://arxiv.org/abs/2306.15464v1)|null|\n", "2306.15255": "|**2023-06-27**|**GroundNLQ @ Ego4D Natural Language Queries Challenge 2023**|Zhijian Hou et.al.|[2306.15255v1](http://arxiv.org/abs/2306.15255v1)|**[link](https://github.com/houzhijian/groundnlq)**|\n", "2306.15231": "|**2023-06-27**|**Emulating Reader Behaviors for Fake News Detection**|Junwei Yin et.al.|[2306.15231v1](http://arxiv.org/abs/2306.15231v1)|null|\n", "2306.15114": "|**2023-06-26**|**Transfer: Cross Modality Knowledge Transfer using Adversarial Networks -- A Study on Gesture Recognition**|Payal Kamboj et.al.|[2306.15114v1](http://arxiv.org/abs/2306.15114v1)|null|\n", "2306.16349": "|**2023-06-28**|**Accurate, uncertainty-aware classification of molecular chemical motifs from multi-modal X-ray absorption spectroscopy**|Matthew R. Carbone et.al.|[2306.16349v1](http://arxiv.org/abs/2306.16349v1)|null|\n", "2306.16329": "|**2023-06-28**|**DiffComplete: Diffusion-based Generative 3D Shape Completion**|Ruihang Chu et.al.|[2306.16329v1](http://arxiv.org/abs/2306.16329v1)|null|\n", "2306.16207": "|**2023-06-28**|**Inferring the Goals of Communicating Agents from Actions and Instructions**|Lance Ying et.al.|[2306.16207v1](http://arxiv.org/abs/2306.16207v1)|null|\n", "2306.16034": "|**2023-06-28**|**Stone Needle: A General Multimodal Large-scale Model Framework towards Healthcare**|Weihua Liu et.al.|[2306.16034v1](http://arxiv.org/abs/2306.16034v1)|null|\n", "2306.15977": "|**2023-06-28**|**A Dimensional Structure based Knowledge Distillation Method for Cross-Modal Learning**|Lingyu Si et.al.|[2306.15977v1](http://arxiv.org/abs/2306.15977v1)|null|\n", "2306.15955": "|**2023-06-29**|**Bridging the Gap: Neural Collapse Inspired Prompt Tuning for Generalization under Class Imbalance**|Didi Zhu et.al.|[2306.15955v2](http://arxiv.org/abs/2306.15955v2)|null|\n", "2306.15946": "|**2023-06-28**|**Knowledge-Enhanced Hierarchical Information Correlation Learning for Multi-Modal Rumor Detection**|Jiawei Liu et.al.|[2306.15946v1](http://arxiv.org/abs/2306.15946v1)|null|\n", "2306.15943": "|**2023-06-28**|**No Transfers Required: Integrating Last Mile with Public Transit Using Opti-Mile**|Raashid Altaf et.al.|[2306.15943v1](http://arxiv.org/abs/2306.15943v1)|null|\n", "2306.15837": "|**2023-06-27**|**Symbol emergence as interpersonal cross-situational learning: the emergence of lexical knowledge with combinatoriality**|Yoshinobu Hagiwara et.al.|[2306.15837v1](http://arxiv.org/abs/2306.15837v1)|null|\n", "2306.15808": "|**2023-06-27**|**Classification of Infant Sleep/Wake States: Cross-Attention among Large Scale Pretrained Transformer Networks using Audio, ECG, and IMU Data**|Kai Chieh Chang et.al.|[2306.15808v1](http://arxiv.org/abs/2306.15808v1)|null|\n", "2306.15711": "|**2023-06-27**|**Semi-supervised Multimodal Representation Learning through a Global Workspace**|Benjamin Devillers et.al.|[2306.15711v1](http://arxiv.org/abs/2306.15711v1)|**[link](https://github.com/bdvllrs/bimgw)**|\n", "2306.17115": "|**2023-07-03**|**Michelangelo: Conditional 3D Shape Generation based on Shape-Image-Text Aligned Latent Representation**|Zibo Zhao et.al.|[2306.17115v2](http://arxiv.org/abs/2306.17115v2)|**[link](https://github.com/neuralcarver/michelangelo)**|\n", "2306.17107": "|**2023-06-29**|**LLaVAR: Enhanced Visual Instruction Tuning for Text-Rich Image Understanding**|Yanzhe Zhang et.al.|[2306.17107v1](http://arxiv.org/abs/2306.17107v1)|**[link](https://github.com/SALT-NLP/LLaVAR)**|\n", "2306.17000": "|**2023-06-29**|**MotionTrack: End-to-End Transformer-based Multi-Object Tracing with LiDAR-Camera Fusion**|Ce Zhang et.al.|[2306.17000v1](http://arxiv.org/abs/2306.17000v1)|null|\n", "2306.16927": "|**2023-06-29**|**End-to-end Autonomous Driving: Challenges and Frontiers**|Li Chen et.al.|[2306.16927v1](http://arxiv.org/abs/2306.16927v1)|**[link](https://github.com/opendrivelab/end-to-end-autonomous-driving)**|\n", "2306.16862": "|**2023-06-29**|**Sustainable Palm Tree Farming: Leveraging IoT and Multi-Modal Data for Early Detection and Mapping of Red Palm Weevil**|Yosra Hajjaji et.al.|[2306.16862v1](http://arxiv.org/abs/2306.16862v1)|null|\n", "2306.16762": "|**2023-06-29**|**Unified Language Representation for Question Answering over Text, Tables, and Images**|Bowen Yu et.al.|[2306.16762v1](http://arxiv.org/abs/2306.16762v1)|null|\n", "2306.16478": "|**2023-06-28**|**Pre-Training Multi-Modal Dense Retrievers for Outside-Knowledge Visual Question Answering**|Alireza Salemi et.al.|[2306.16478v1](http://arxiv.org/abs/2306.16478v1)|**[link](https://github.com/alirezasalemi7/pretraining-multimodal-dense-retriever-for-okvqa)**|\n", "2306.17525": "|**2023-06-30**|**MeLM, a generative pretrained language modeling framework that solves forward and inverse mechanics problems**|Markus J. Buehler et.al.|[2306.17525v1](http://arxiv.org/abs/2306.17525v1)|null|\n", "2306.17400": "|**2023-06-30**|**Topological Data Analysis Guided Segment Anything Model Prompt Optimization for Zero-Shot Segmentation in Biological Imaging**|Ruben Glatt et.al.|[2306.17400v1](http://arxiv.org/abs/2306.17400v1)|null|\n", "2306.17371": "|**2023-06-30**|**Capturing functional connectomics using Riemannian partial least squares**|Matt Ryan et.al.|[2306.17371v1](http://arxiv.org/abs/2306.17371v1)|null|\n", "2307.01146": "|**2023-07-05**|**AVSegFormer: Audio-Visual Segmentation with Transformer**|Shengyi Gao et.al.|[2307.01146v2](http://arxiv.org/abs/2307.01146v2)|**[link](https://github.com/vvvb-github/avsegformer)**|\n", "2307.01124": "|**2023-07-03**|**Cross-modality Attention Adapter: A Glioma Segmentation Fine-tuning Method for SAM Using Multimodal Brain MR Images**|Xiaoyu Shi et.al.|[2307.01124v1](http://arxiv.org/abs/2307.01124v1)|null|\n", "2307.01121": "|**2023-07-03**|**Artifacts Mapping: Multi-Modal Semantic Mapping for Object Detection and 3D Localization**|Federico Rollo et.al.|[2307.01121v1](http://arxiv.org/abs/2307.01121v1)|null|\n", "2307.01047": "|**2023-07-03**|**Cross-modal Place Recognition in Image Databases using Event-based Sensors**|Xiang Ji et.al.|[2307.01047v1](http://arxiv.org/abs/2307.01047v1)|null|\n", "2307.01003": "|**2023-07-03**|**Visual Instruction Tuning with Polite Flamingo**|Delong Chen et.al.|[2307.01003v1](http://arxiv.org/abs/2307.01003v1)|**[link](https://github.com/chendelong1999/polite_flamingo)**|\n", "2307.00997": "|**2023-07-03**|**RefSAM: Efficiently Adapting Segmenting Anything Model for Referring Video Object Segmentation**|Yonglin Li et.al.|[2307.00997v1](http://arxiv.org/abs/2307.00997v1)|**[link](https://github.com/lancasterli/refsam)**|\n", "2307.00954": "|**2023-07-03**|**HODINet: High-Order Discrepant Interaction Network for RGB-D Salient Object Detection**|Kang Yi et.al.|[2307.00954v1](http://arxiv.org/abs/2307.00954v1)|null|\n", "2307.00877": "|**2023-07-03**|**Exploring the Multi-modal Demand Dynamics During Transport System Disruptions**|Ali Shateri Benam et.al.|[2307.00877v1](http://arxiv.org/abs/2307.00877v1)|null|\n", "2307.00873": "|**2023-07-03**|**End-To-End Prediction of Knee Osteoarthritis Progression With Multi-Modal Transformers**|Egor Panfilov et.al.|[2307.00873v1](http://arxiv.org/abs/2307.00873v1)|null|\n", "2307.00716": "|**2023-07-03**|**JourneyDB: A Benchmark for Generative Image Understanding**|Junting Pan et.al.|[2307.00716v1](http://arxiv.org/abs/2307.00716v1)|null|\n", "2307.00671": "|**2023-07-02**|**Leveraging Multi-modal Sensing for Robotic Insertion Tasks in R&D Laboratories**|Aaron Butterworth et.al.|[2307.00671v1](http://arxiv.org/abs/2307.00671v1)|null|\n", "2307.00610": "|**2023-07-02**|**Fraunhofer SIT at CheckThat! 2023: Mixing Single-Modal Classifiers to Estimate the Check-Worthiness of Multi-Modal Tweets**|Raphael Frick et.al.|[2307.00610v1](http://arxiv.org/abs/2307.00610v1)|null|\n", "2307.00595": "|**2023-07-02**|**RH20T: A Robotic Dataset for Learning Diverse Skills in One-Shot**|Hao-Shu Fang et.al.|[2307.00595v1](http://arxiv.org/abs/2307.00595v1)|null|\n", "2307.00536": "|**2023-07-02**|**Referring Video Object Segmentation with Inter-Frame Interaction and Cross-Modal Correlation**|Meng Lan et.al.|[2307.00536v1](http://arxiv.org/abs/2307.00536v1)|null|\n", "2307.00398": "|**2023-07-01**|**ProbVLM: Probabilistic Adapter for Frozen Vison-Language Models**|Uddeshya Upadhyay et.al.|[2307.00398v1](http://arxiv.org/abs/2307.00398v1)|**[link](https://github.com/explainableml/probvlm)**|\n", "2307.02469": "|**2023-07-05**|**What Matters in Training a GPT4-Style Language Model with Multimodal Inputs?**|Yan Zeng et.al.|[2307.02469v1](http://arxiv.org/abs/2307.02469v1)|null|\n", "2307.02280": "|**2023-07-05**|**Interactive Image Segmentation with Cross-Modality Vision Transformers**|Kun Li et.al.|[2307.02280v1](http://arxiv.org/abs/2307.02280v1)|**[link](https://github.com/lik1996/icmformer)**|\n", "2307.02041": "|**2023-07-05**|**Multimodal Imbalance-Aware Gradient Modulation for Weakly-supervised Audio-Visual Video Parsing**|Jie Fu et.al.|[2307.02041v1](http://arxiv.org/abs/2307.02041v1)|null|\n", "2307.02003": "|**2023-07-05**|**Multi-Modal Prototypes for Open-Set Semantic Segmentation**|Yuhuan Yang et.al.|[2307.02003v1](http://arxiv.org/abs/2307.02003v1)|null|\n", "2307.01947": "|**2023-07-04**|**Causal Video Summarizer for Video Exploration**|Jia-Hong Huang et.al.|[2307.01947v1](http://arxiv.org/abs/2307.01947v1)|null|\n", "2307.01824": "|**2023-07-04**|**Multi-Channel Feature Extraction for Virtual Histological Staining of Photon Absorption Remote Sensing Images**|Marian Boktor et.al.|[2307.01824v1](http://arxiv.org/abs/2307.01824v1)|null|\n", "2307.01798": "|**2023-07-04**|**Edge-aware Multi-task Network for Integrating Quantification Segmentation and Uncertainty Prediction of Liver Tumor on Multi-modality Non-contrast MRI**|Xiaojiao Xiao et.al.|[2307.01798v1](http://arxiv.org/abs/2307.01798v1)|null|\n", "2307.01741": "|**2023-07-04**|**Ben-ge: Extending BigEarthNet with Geographical and Environmental Data**|Michael Mommert et.al.|[2307.01741v1](http://arxiv.org/abs/2307.01741v1)|**[link](https://github.com/hsg-aiml/ben-ge)**|\n", "2307.01704": "|**2023-07-04**|**Graph-Ensemble Learning Model for Multi-label Skin Lesion Classification using Dermoscopy and Clinical Images**|Peng Tang et.al.|[2307.01704v1](http://arxiv.org/abs/2307.01704v1)|null|\n", "2307.01691": "|**2023-07-06**|**SeePrivacy: Automated Contextual Privacy Policy Generation for Mobile Applications**|Shidong Pan et.al.|[2307.01691v2](http://arxiv.org/abs/2307.01691v2)|**[link](https://github.com/cpp4app/cpp4app)**|\n", "2307.01577": "|**2023-07-04**|**Conceptual Cognitive Maps Formation with Neural Successor Networks and Word Embeddings**|Paul Stoewer et.al.|[2307.01577v1](http://arxiv.org/abs/2307.01577v1)|null|\n", "2307.01515": "|**2023-07-04**|**LPN: Language-guided Prototypical Network for few-shot classification**|Kaihui Cheng et.al.|[2307.01515v1](http://arxiv.org/abs/2307.01515v1)|null|\n", "2307.01425": "|**2023-07-04**|**Consistent Multimodal Generation via A Unified GAN Framework**|Zhen Zhu et.al.|[2307.01425v1](http://arxiv.org/abs/2307.01425v1)|null|\n", "2307.01422": "|**2023-07-04**|**Generative Flow Networks: a Markov Chain Perspective**|Tristan Deleu et.al.|[2307.01422v1](http://arxiv.org/abs/2307.01422v1)|null|\n", "2307.03068": "|**2023-07-06**|**A Hybrid End-to-End Spatio-Temporal Attention Neural Network with Graph-Smooth Signals for EEG Emotion Recognition**|Shadi Sartipi et.al.|[2307.03068v1](http://arxiv.org/abs/2307.03068v1)|null|\n", "2307.02978": "|**2023-07-06**|**Multi-modal multi-class Parkinson disease classification using CNN and decision level fusion**|Sushanta Kumar Sahu et.al.|[2307.02978v1](http://arxiv.org/abs/2307.02978v1)|null|\n", "2307.02971": "|**2023-07-06**|**On the Cultural Gap in Text-to-Image Generation**|Bingshuai Liu et.al.|[2307.02971v1](http://arxiv.org/abs/2307.02971v1)|null|\n", "2307.02862": "|**2023-07-06**|**A Critical Look at the Current Usage of Foundation Model for Dense Recognition Task**|Shiqi Yang et.al.|[2307.02862v1](http://arxiv.org/abs/2307.02862v1)|null|\n", "2307.02796": "|**2023-07-06**|**VerifAI: Verified Generative AI**|Nan Tang et.al.|[2307.02796v1](http://arxiv.org/abs/2307.02796v1)|null|\n", "2307.02761": "|**2023-07-06**|**Cross-Modal Content Inference and Feature Enrichment for Cold-Start Recommendation**|Haokai Ma et.al.|[2307.02761v1](http://arxiv.org/abs/2307.02761v1)|null|\n", "2307.02730": "|**2023-07-06**|**Fine-grained Action Analysis: A Multi-modality and Multi-task Dataset of Figure Skating**|Sheng-Lan Liu et.al.|[2307.02730v1](http://arxiv.org/abs/2307.02730v1)|null|\n", "2307.03706": "|**2023-07-07**|**Counterion-controlled phase equilibria in a charge-regulated polymer solution**|Giulia L. Celora et.al.|[2307.03706v1](http://arxiv.org/abs/2307.03706v1)|null|\n", "2307.03638": "|**2023-07-07**|**Physical-aware Cross-modal Adversarial Network for Wearable Sensor-based Human Action Recognition**|Jianyuan Ni et.al.|[2307.03638v1](http://arxiv.org/abs/2307.03638v1)|null|\n", "2307.03623": "|**2023-07-07**|**Robust Human Detection under Visual Degradation via Thermal and mmWave Radar Fusion**|Kaiwen Cai et.al.|[2307.03623v1](http://arxiv.org/abs/2307.03623v1)|**[link](https://github.com/ramdrop/utm)**|\n", "2307.03535": "|**2023-07-07**|**Matching in the Wild: Learning Anatomical Embeddings for Multi-Modality Images**|Xiaoyu Bai et.al.|[2307.03535v1](http://arxiv.org/abs/2307.03535v1)|null|\n", "2307.03427": "|**2023-07-07**|**Merging-Diverging Hybrid Transformer Networks for Survival Prediction in Head and Neck Cancer**|Mingyuan Meng et.al.|[2307.03427v1](http://arxiv.org/abs/2307.03427v1)|**[link](https://github.com/mungomeng/survival-xsurv)**|\n", "2307.03388": "|**2023-07-07**|**General-Purpose Multimodal Transformer meets Remote Sensing Semantic Segmentation**|Nhi Kieu et.al.|[2307.03388v1](http://arxiv.org/abs/2307.03388v1)|**[link](https://github.com/nhikieu/spatialvolumetricmultimodal)**|\n", "2307.03373": "|**2023-07-07**|**All in One: Exploring Unified Vision-Language Tracking with Multi-Modal Alignment**|Chunhui Zhang et.al.|[2307.03373v1](http://arxiv.org/abs/2307.03373v1)|null|\n", "2307.03339": "|**2023-07-07**|**Open-Vocabulary Object Detection via Scene Graph Discovery**|Hengcan Shi et.al.|[2307.03339v1](http://arxiv.org/abs/2307.03339v1)|null|\n", "2307.03274": "|**2023-07-06**|**It is not Sexually Suggestive, It is Educative. Separating Sex Education from Suggestive Content on TikTok Videos**|Enfa George et.al.|[2307.03274v1](http://arxiv.org/abs/2307.03274v1)|null|\n", "2307.03240": "|**2023-07-06**|**Adaptive Generation of Privileged Intermediate Information for Visible-Infrared Person Re-Identification**|Mahdi Alehdaghi et.al.|[2307.03240v1](http://arxiv.org/abs/2307.03240v1)|null|\n", "2307.03591": "|**2023-07-06**|**Structure Guided Multi-modal Pre-trained Transformer for Knowledge Graph Reasoning**|Ke Liang et.al.|[2307.03591v1](http://arxiv.org/abs/2307.03591v1)|null|\n", "2307.04751": "|**2023-07-10**|**Shelving, Stacking, Hanging: Relational Pose Diffusion for Multi-modal Rearrangement**|Anthony Simeonov et.al.|[2307.04751v1](http://arxiv.org/abs/2307.04751v1)|null|\n", "2307.04749": "|**2023-07-10**|**Divide, Evaluate, and Refine: Evaluating and Improving Text-to-Image Alignment with Iterative VQA Feedback**|Jaskirat Singh et.al.|[2307.04749v1](http://arxiv.org/abs/2307.04749v1)|null|\n", "2307.04722": "|**2023-07-10**|**Advances and Challenges in Meta-Learning: A Technical Review**|Anna Vettoruzzo et.al.|[2307.04722v1](http://arxiv.org/abs/2307.04722v1)|null|\n", "2307.04470": "|**2023-07-10**|**Test-Time Adaptation for Nighttime Color-Thermal Semantic Segmentation**|Yexin Liu et.al.|[2307.04470v1](http://arxiv.org/abs/2307.04470v1)|null|\n", "2307.04461": "|**2023-07-10**|**Multi-modal Graph Learning over UMLS Knowledge Graphs**|Manuel Burger et.al.|[2307.04461v1](http://arxiv.org/abs/2307.04461v1)|**[link](https://github.com/ratschlab/mmugl)**|\n", "2307.04421": "|**2023-07-13**|**Towards Enabling Cardiac Digital Twins of Myocardial Infarction Using Deep Computational Models for Inverse Inference**|Lei Li et.al.|[2307.04421v2](http://arxiv.org/abs/2307.04421v2)|null|\n", "2307.04361": "|**2023-07-10**|**Enhancing Cross-lingual Transfer via Phonemic Transcription Integration**|Hoang H. Nguyen et.al.|[2307.04361v1](http://arxiv.org/abs/2307.04361v1)|**[link](https://github.com/nhhoang96/phonemic_xlingual)**|\n", "2307.04296": "|**2023-07-10**|**K-Space-Aware Cross-Modality Score for Synthesized Neuroimage Quality Assessment**|Jinbao Wang et.al.|[2307.04296v1](http://arxiv.org/abs/2307.04296v1)|null|\n", "2307.04231": "|**2023-07-09**|**Mx2M: Masked Cross-Modality Modeling in Domain Adaptation for 3D Semantic Segmentation**|Boxiang Zhang et.al.|[2307.04231v1](http://arxiv.org/abs/2307.04231v1)|null|\n", "2307.04129": "|**2023-07-09**|**Cross-modal Orthogonal High-rank Augmentation for RGB-Event Transformer-trackers**|Zhiyu Zhu et.al.|[2307.04129v1](http://arxiv.org/abs/2307.04129v1)|**[link](https://github.com/ZHU-Zhiyu/High-Rank_RGB-Event_Tracker)**|\n", "2307.04091": "|**2023-07-09**|**CMDFusion: Bidirectional Fusion Network with Cross-modality Knowledge Distillation for LIDAR Semantic Segmentation**|Jun Cen et.al.|[2307.04091v1](http://arxiv.org/abs/2307.04091v1)|null|\n", "2307.03990": "|**2023-07-08**|**FTFDNet: Learning to Detect Talking Face Video Manipulation with Tri-Modality Interaction**|Ganglai Wang et.al.|[2307.03990v1](http://arxiv.org/abs/2307.03990v1)|null|\n", "2307.03942": "|**2023-07-08**|**Ariadne's Thread:Using Text Prompts to Improve Segmentation of Infected Areas from Chest X-ray images**|Yi Zhong et.al.|[2307.03942v1](http://arxiv.org/abs/2307.03942v1)|**[link](https://github.com/junelin2333/languidemedseg-miccai2023)**|\n", "2307.03903": "|**2023-07-08**|**Adversarial Self-Attack Defense and Spatial-Temporal Relation Mining for Visible-Infrared Video Person Re-Identification**|Huafeng Li et.al.|[2307.03903v1](http://arxiv.org/abs/2307.03903v1)|null|\n", "2307.03798": "|**2023-07-07**|**CLIPMasterPrints: Fooling Contrastive Language-Image Pre-training Using Latent Variable Evolution**|Matthias Freiberger et.al.|[2307.03798v1](http://arxiv.org/abs/2307.03798v1)|**[link](https://github.com/matfrei/clipmasterprints)**|\n", "2307.05463": "|**2023-07-11**|**EgoVLPv2: Egocentric Video-Language Pre-training with Fusion in the Backbone**|Shraman Pramanick et.al.|[2307.05463v1](http://arxiv.org/abs/2307.05463v1)|null|\n", "2307.05435": "|**2023-07-11**|**One-Versus-Others Attention: Scalable Multimodal Integration**|Michal Golovanevsky et.al.|[2307.05435v1](http://arxiv.org/abs/2307.05435v1)|**[link](https://github.com/rsinghlab/ovo)**|\n", "2307.04978": "|**2023-07-11**|**Diffusion idea exploration for art generation**|Nikhil Verma et.al.|[2307.04978v1](http://arxiv.org/abs/2307.04978v1)|null|\n", "2307.06281": "|**2023-07-12**|**MMBench: Is Your Multi-modal Model an All-around Player?**|Yuan Liu et.al.|[2307.06281v1](http://arxiv.org/abs/2307.06281v1)|**[link](https://github.com/InternLM/opencompass)**|\n", "2307.06174": "|**2023-07-12**|**Identification in Multiple Treatment Models under Discrete Variation**|Vishal Kamat et.al.|[2307.06174v1](http://arxiv.org/abs/2307.06174v1)|null|\n", "2307.05591": "|**2023-07-10**|**SITTA: A Semantic Image-Text Alignment for Image Captioning**|Fabian Paischer et.al.|[2307.05591v1](http://arxiv.org/abs/2307.05591v1)|**[link](https://github.com/ml-jku/semantic-image-text-alignment)**|\n", "2307.06505": "|**2023-07-13**|**WaterScenes: A Multi-Task 4D Radar-Camera Fusion Dataset and Benchmark for Autonomous Driving on Water Surfaces**|Shanliang Yao et.al.|[2307.06505v1](http://arxiv.org/abs/2307.06505v1)|**[link](https://github.com/waterscenes/waterscenes)**|\n", "2307.06424": "|**2023-07-12**|**Robust scalable initialization for Bayesian variational inference with multi-modal Laplace approximations**|Wyatt Bridgman et.al.|[2307.06424v1](http://arxiv.org/abs/2307.06424v1)|null|\n", "2307.07453": "|**2023-07-14**|**Investigation of Deep Learning-Based Filtered Density Function for Large Eddy Simulation of Turbulent Scalar Mixing**|Shubhangi Bansude et.al.|[2307.07453v1](http://arxiv.org/abs/2307.07453v1)|null|\n", "2307.07362": "|**2023-07-14**|**A scoping review on multimodal deep learning in biomedical images and texts**|Zhaoyi Sun et.al.|[2307.07362v1](http://arxiv.org/abs/2307.07362v1)|null|\n", "2307.07341": "|**2023-07-14**|**PiTL: Cross-modal Retrieval with Weakly-supervised Vision-language Pre-training via Prompting**|Zixin Guo et.al.|[2307.07341v1](http://arxiv.org/abs/2307.07341v1)|null|\n", "2307.07184": "|**2023-07-14**|**TVPR: Text-to-Video Person Retrieval and a New Benchmark**|Fan Ni et.al.|[2307.07184v1](http://arxiv.org/abs/2307.07184v1)|null|\n", "2307.07177": "|**2023-07-14**|**TriFormer: A Multi-modal Transformer Framework For Mild Cognitive Impairment Conversion Prediction**|Linfeng Liu et.al.|[2307.07177v1](http://arxiv.org/abs/2307.07177v1)|null|\n", "2307.07142": "|**2023-07-14**|**CFI2P: Coarse-to-Fine Cross-Modal Correspondence Learning for Image-to-Point Cloud Registration**|Gongxin Yao et.al.|[2307.07142v1](http://arxiv.org/abs/2307.07142v1)|null|\n", "2307.07135": "|**2023-07-14**|**MMSD2.0: Towards a Reliable Multi-modal Sarcasm Detection System**|Libo Qin et.al.|[2307.07135v1](http://arxiv.org/abs/2307.07135v1)|**[link](https://github.com/joeying1019/mmsd2.0)**|\n", "2307.08581": "|**2023-07-17**|**BuboGPT: Enabling Visual Grounding in Multi-Modal LLMs**|Yang Zhao et.al.|[2307.08581v1](http://arxiv.org/abs/2307.08581v1)|null|\n", "2307.08492": "|**2023-07-17**|**SVDFormer: Complementing Point Cloud via Self-view Augmentation and Self-structure Dual-generator**|Zhe Zhu et.al.|[2307.08492v1](http://arxiv.org/abs/2307.08492v1)|**[link](https://github.com/czvvd/svdformer)**|\n", "2307.08415": "|**2023-07-17**|**Monocular 3D Object Detection with LiDAR Guided Semi Supervised Active Learning**|Aral Hekimoglu et.al.|[2307.08415v1](http://arxiv.org/abs/2307.08415v1)|null|\n", "2307.08339": "|**2023-07-17**|**Multi-Task Cross-Modality Attention-Fusion for 2D Object Detection**|Huawei Sun et.al.|[2307.08339v1](http://arxiv.org/abs/2307.08339v1)|null|\n", "2307.08316": "|**2023-07-17**|**Bridging the Gap: Multi-Level Cross-Modality Joint Alignment for Visible-Infrared Person Re-Identification**|Tengfei Liang et.al.|[2307.08316v1](http://arxiv.org/abs/2307.08316v1)|null|\n", "2307.08238": "|**2023-07-17**|**Unified Open-Vocabulary Dense Visual Prediction**|Hengcan Shi et.al.|[2307.08238v1](http://arxiv.org/abs/2307.08238v1)|null|\n", "2307.08233": "|**2023-07-17**|**ROFusion: Efficient Object Detection using Hybrid Point-wise Radar-Optical Fusion**|Liu Liu et.al.|[2307.08233v1](http://arxiv.org/abs/2307.08233v1)|**[link](https://github.com/liuliu-55/rofusion)**|\n", "2307.08228": "|**2023-07-17**|**Video Frame Interpolation with Stereo Event and Intensity Camera**|Chao Ding et.al.|[2307.08228v1](http://arxiv.org/abs/2307.08228v1)|null|\n", "2307.08098": "|**2023-07-16**|**CalibNet: Dual-branch Cross-modal Calibration for RGB-D Salient Instance Segmentation**|Jialun Pei et.al.|[2307.08098v1](http://arxiv.org/abs/2307.08098v1)|**[link](https://github.com/pjlallen/calibnet)**|\n", "2307.08019": "|**2023-07-16**|**A Multi-model and Multi-scenario Assessment of the Impact of Climate Change on the Heating and Cooling Load Components of an Archetypical Residential Room in Major Indian Cities**|Raj S. Srivastava et.al.|[2307.08019v1](http://arxiv.org/abs/2307.08019v1)|null|\n", "2307.08016": "|**2023-07-16**|**Breaking Down the Task: A Unit-Grained Hybrid Training Framework for Vision and Language Decision Making**|Ruipu Luo et.al.|[2307.08016v1](http://arxiv.org/abs/2307.08016v1)|null|\n", "2307.07859": "|**2023-07-15**|**Unified Adversarial Patch for Cross-modal Attacks in the Physical World**|Xingxing Wei et.al.|[2307.07859v1](http://arxiv.org/abs/2307.07859v1)|null|\n", "2307.07807": "|**2023-07-15**|**MUVF-YOLOX: A Multi-modal Ultrasound Video Fusion Network for Renal Tumor Diagnosis**|Junyu Li et.al.|[2307.07807v1](http://arxiv.org/abs/2307.07807v1)|**[link](https://github.com/jeunyuli/muaf)**|\n", "2307.07791": "|**2023-07-15**|**Joint Adversarial and Collaborative Learning for Self-Supervised Action Recognition**|Tianyu Guo et.al.|[2307.07791v1](http://arxiv.org/abs/2307.07791v1)|**[link](https://github.com/levigty/acl)**|\n", "2307.07763": "|**2023-07-15**|**Tightly-Coupled LiDAR-Visual SLAM Based on Geometric Features for Mobile Agents**|Ke Cao et.al.|[2307.07763v1](http://arxiv.org/abs/2307.07763v1)|null|\n", "2307.09356": "|**2023-07-18**|**OnlineRefer: A Simple Online Baseline for Referring Video Object Segmentation**|Dongming Wu et.al.|[2307.09356v1](http://arxiv.org/abs/2307.09356v1)|**[link](https://github.com/wudongming97/onlinerefer)**|\n", "2307.09329": "|**2023-07-18**|**Towards a performance analysis on pre-trained Visual Question Answering models for autonomous driving**|Kaavya Rekanar et.al.|[2307.09329v1](http://arxiv.org/abs/2307.09329v1)|**[link](https://github.com/kaavyarekanar/towards-a-performance-analysis-on-pre-trained-vqa-models-for-autonomous-driving)**|\n", "2307.09323": "|**2023-07-18**|**Efficient Region-Aware Neural Radiance Fields for High-Fidelity Talking Portrait Synthesis**|Jiahe Li et.al.|[2307.09323v1](http://arxiv.org/abs/2307.09323v1)|**[link](https://github.com/fictionarry/er-nerf)**|\n", "2307.09312": "|**2023-07-18**|**Multi-Modal Discussion Transformer: Integrating Text, Images and Graph Transformers to Detect Hate Speech on Social Media**|Liam Hebert et.al.|[2307.09312v1](http://arxiv.org/abs/2307.09312v1)|**[link](https://github.com/liamhebert/multimodaldiscussiontransformer)**|\n", "2307.09306": "|**2023-07-18**|**EigenTrajectory: Low-Rank Descriptors for Multi-Modal Trajectory Forecasting**|Inhwan Bae et.al.|[2307.09306v1](http://arxiv.org/abs/2307.09306v1)|**[link](https://github.com/inhwanbae/eigentrajectory)**|\n", "2307.09184": "|**2023-07-18**|**You've Got Two Teachers: Co-evolutionary Image and Report Distillation for Semi-supervised Anatomical Abnormality Detection in Chest X-ray**|Jinghan Sun et.al.|[2307.09184v1](http://arxiv.org/abs/2307.09184v1)|null|\n", "2307.09155": "|**2023-07-18**|**MLF-DET: Multi-Level Fusion for Cross-Modal 3D Object Detection**|Zewei Lin et.al.|[2307.09155v1](http://arxiv.org/abs/2307.09155v1)|null|\n", "2307.09066": "|**2023-07-18**|**PatchCT: Aligning Patch Set and Label Set with Conditional Transport for Multi-Label Image Classification**|Miaoge Li et.al.|[2307.09066v1](http://arxiv.org/abs/2307.09066v1)|**[link](https://github.com/keepgoingjkg/patchct)**|\n", "2307.09059": "|**2023-07-18**|**Unleashing the Imagination of Text: A Novel Framework for Text-to-image Person Retrieval via Exploring the Power of Words**|Delong Liu et.al.|[2307.09059v1](http://arxiv.org/abs/2307.09059v1)|null|\n", "2307.09050": "|**2023-07-18**|**R-Cut: Enhancing Explainability in Vision Transformers with Relationship Weighted Out and Cut**|Yingjie Niu et.al.|[2307.09050v1](http://arxiv.org/abs/2307.09050v1)|null|\n", "2307.09036": "|**2023-07-18**|**PromptMagician: Interactive Prompt Engineering for Text-to-Image Creation**|Yingchaojie Feng et.al.|[2307.09036v1](http://arxiv.org/abs/2307.09036v1)|**[link](https://github.com/yingchaojiefeng/promptmagician)**|\n", "2307.08991": "|**2023-07-18**|**EgoVM: Achieving Precise Ego-Localization using Lightweight Vectorized Maps**|Yuzhe He et.al.|[2307.08991v1](http://arxiv.org/abs/2307.08991v1)|null|\n", "2307.08788": "|**2023-07-17**|**Uncovering Load-Altering Attacks Against N-1 Secure Power Grids: A Rare-Event Sampling Approach**|Maldon Patrice Goodridge et.al.|[2307.08788v1](http://arxiv.org/abs/2307.08788v1)|null|\n", "2307.08752": "|**2023-07-17**|**A Re-Appraisal of CO/O$_2$ Runaway on Habitable Planets Orbiting Low-Mass Stars**|Sukrit Ranjan et.al.|[2307.08752v1](http://arxiv.org/abs/2307.08752v1)|null|\n", "2307.10094": "|**2023-07-19**|**Make-A-Volume: Leveraging Latent Diffusion Models for Cross-Modality 3D Brain MRI Synthesis**|Lingting Zhu et.al.|[2307.10094v1](http://arxiv.org/abs/2307.10094v1)|null|\n", "2307.09931": "|**2023-07-19**|**DISA: DIfferentiable Similarity Approximation for Universal Multimodal Registration**|Matteo Ronchetti et.al.|[2307.09931v1](http://arxiv.org/abs/2307.09931v1)|**[link](https://github.com/imfusiongmbh/disa-universal-multimodal-registration)**|\n", "2307.09915": "|**2023-07-19**|**Embedded Heterogeneous Attention Transformer for Cross-lingual Image Captioning**|Zijie Song et.al.|[2307.09915v1](http://arxiv.org/abs/2307.09915v1)|null|\n", "2307.09823": "|**2023-07-19**|**Multi-modal Learning based Prediction for Disease**|Yaran Chen et.al.|[2307.09823v1](http://arxiv.org/abs/2307.09823v1)|null|\n", "2307.09769": "|**2023-07-19**|**Source-Free Domain Adaptation for Medical Image Segmentation via Prototype-Anchored Feature Alignment and Contrastive Learning**|Qinji Yu et.al.|[2307.09769v1](http://arxiv.org/abs/2307.09769v1)|**[link](https://github.com/cscyqj/miccai23-protocontra-sfda)**|\n", "2307.09749": "|**2023-07-19**|**Towards Robust Scene Text Image Super-resolution via Explicit Location Enhancement**|Hang Guo et.al.|[2307.09749v1](http://arxiv.org/abs/2307.09749v1)|**[link](https://github.com/csguoh/lemma)**|\n", "2307.09721": "|**2023-07-19**|**Multi-Grained Multimodal Interaction Network for Entity Linking**|Pengfei Luo et.al.|[2307.09721v1](http://arxiv.org/abs/2307.09721v1)|**[link](https://github.com/pengfei-luo/mimic)**|\n", "2307.10810": "|**2023-07-20**|**On Combining Expert Demonstrations in Imitation Learning via Optimal Transport**|Ilana Sebag et.al.|[2307.10810v1](http://arxiv.org/abs/2307.10810v1)|null|\n", "2307.10782": "|**2023-07-20**|**See More and Know More: Zero-shot Point Cloud Segmentation via Multi-modal Visual Data**|Yuhang Lu et.al.|[2307.10782v1](http://arxiv.org/abs/2307.10782v1)|null|\n", "2307.10763": "|**2023-07-20**|**MSQNet: Actor-agnostic Action Recognition with Multi-modal Query**|Anindya Mondal et.al.|[2307.10763v1](http://arxiv.org/abs/2307.10763v1)|**[link](https://github.com/mondalanindya/msqnet)**|\n", "2307.10685": "|**2023-07-20**|**Pre-train, Adapt and Detect: Multi-Task Adapter Tuning for Camouflaged Object Detection**|Yinghui Xing et.al.|[2307.10685v1](http://arxiv.org/abs/2307.10685v1)|null|\n", "2307.10601": "|**2023-07-20**|**SCA-PVNet: Self-and-Cross Attention Based Aggregation of Point Cloud and Multi-View for 3D Object Retrieval**|Dongyun Lin et.al.|[2307.10601v1](http://arxiv.org/abs/2307.10601v1)|null|\n", "2307.10577": "|**2023-07-21**|**Ethosight: A Reasoning-Guided Iterative Learning System for Nuanced Perception based on Joint-Embedding & Contextual Label Affinity**|Hugo Latapie et.al.|[2307.10577v2](http://arxiv.org/abs/2307.10577v2)|null|\n", "2307.10519": "|**2023-07-20**|**Probabilistic Multimodal Depth Estimation Based on Camera-LiDAR Sensor Fusion**|Johan S. Obando-Ceron et.al.|[2307.10519v1](http://arxiv.org/abs/2307.10519v1)|null|\n", "2307.10490": "|**2023-07-24**|**(Ab)using Images and Sounds for Indirect Instruction Injection in Multi-Modal LLMs**|Eugene Bagdasaryan et.al.|[2307.10490v3](http://arxiv.org/abs/2307.10490v3)|**[link](https://github.com/ebagdasa/multimodal_injection)**|\n", "2307.10475": "|**2023-07-19**|**Findings of Factify 2: Multimodal Fake News Detection**|S Suryavardan et.al.|[2307.10475v1](http://arxiv.org/abs/2307.10475v1)|null|\n", "2307.11552": "|**2023-07-21**|**A multi-modal representation of El Ni\u00f1o Southern Oscillation Diversity**|Jakob Schl\u00f6r et.al.|[2307.11552v1](http://arxiv.org/abs/2307.11552v1)|**[link](https://github.com/jakob-schloer/latentgmm)**|\n", "2307.11545": "|**2023-07-21**|**Bridging Vision and Language Encoders: Parameter-Efficient Tuning for Referring Image Segmentation**|Zunnan Xu et.al.|[2307.11545v1](http://arxiv.org/abs/2307.11545v1)|**[link](https://github.com/kkakkkka/etris)**|\n", "2307.11530": "|**2023-07-21**|**UWAT-GAN: Fundus Fluorescein Angiography Synthesis via Ultra-wide-angle Transformation Multi-scale GAN**|Zhaojie Fang et.al.|[2307.11530v1](http://arxiv.org/abs/2307.11530v1)|**[link](https://github.com/Tinysqua/UWAT-GAN)**|\n", "2307.11450": "|**2023-07-21**|**Topic Identification For Spontaneous Speech: Enriching Audio Features With Embedded Linguistic Information**|Dejan Porjazovski et.al.|[2307.11450v1](http://arxiv.org/abs/2307.11450v1)|**[link](https://github.com/aalto-speech/Topic-identification-for-spontaneous-Finnish-speech)**|\n", "2307.11323": "|**2023-07-21**|**HVDetFusion: A Simple and Robust Camera-Radar Fusion Framework**|Kai Lei et.al.|[2307.11323v1](http://arxiv.org/abs/2307.11323v1)|**[link](https://github.com/hvxlab/hvdetfusion)**|\n", "2307.12964": "|**2023-07-24**|**Audio-Enhanced Text-to-Video Retrieval using Text-Conditioned Feature Alignment**|Sarah Ibrahimi et.al.|[2307.12964v1](http://arxiv.org/abs/2307.12964v1)|null|\n", "2307.12853": "|**2023-07-25**|**Spatiotemporal Modeling Encounters 3D Medical Image Analysis: Slice-Shift UNet with Multi-View Fusion**|C. I. Ugwu et.al.|[2307.12853v2](http://arxiv.org/abs/2307.12853v2)|null|\n", "2307.12732": "|**2023-07-24**|**CLIP-KD: An Empirical Study of Distilling CLIP Models**|Chuanguang Yang et.al.|[2307.12732v1](http://arxiv.org/abs/2307.12732v1)|null|\n", "2307.12626": "|**2023-07-24**|**Enhancing Human-like Multi-Modal Reasoning: A New Challenging Dataset and Comprehensive Framework**|Jingxuan Wei et.al.|[2307.12626v1](http://arxiv.org/abs/2307.12626v1)|**[link](https://github.com/weijingxuan/COCO-MMR)**|\n", "2307.12577": "|**2023-07-24**|**PRIOR: Prototype Representation Joint Learning from Medical Images and Reports**|Pujin Cheng et.al.|[2307.12577v1](http://arxiv.org/abs/2307.12577v1)|**[link](https://github.com/qtacierp/prior)**|\n", "2307.12545": "|**2023-07-24**|**Towards Video Anomaly Retrieval from Video Anomaly Detection: New Benchmarks and Model**|Peng Wu et.al.|[2307.12545v1](http://arxiv.org/abs/2307.12545v1)|null|\n", "2307.12242": "|**2023-07-23**|**HealthPrism: A Visual Analytics System for Exploring Children's Physical and Mental Health Profiles with Multimodal Data**|Zhihan Jiang et.al.|[2307.12242v1](http://arxiv.org/abs/2307.12242v1)|null|\n", "2307.12236": "|**2023-07-23**|**Multi-Modal Machine Learning for Assessing Gaming Skills in Online Streaming: A Case Study with CS:GO**|Longxiang Zhang et.al.|[2307.12236v1](http://arxiv.org/abs/2307.12236v1)|null|\n", "2307.12180": "|**2023-07-22**|**Prototype-Driven and Multi-Expert Integrated Multi-Modal MR Brain Tumor Image Segmentation**|Yafei Zhang et.al.|[2307.12180v1](http://arxiv.org/abs/2307.12180v1)|**[link](https://github.com/linzy0227/pdminet)**|\n", "2307.12067": "|**2023-07-22**|**Replay: Multi-modal Multi-view Acted Videos for Casual Holography**|Roman Shapovalov et.al.|[2307.12067v1](http://arxiv.org/abs/2307.12067v1)|**[link](https://github.com/facebookresearch/replay_dataset)**|\n", "2307.12058": "|**2023-07-22**|**Discovering Spatio-Temporal Rationales for Video Question Answering**|Yicong Li et.al.|[2307.12058v1](http://arxiv.org/abs/2307.12058v1)|null|\n", "2307.11921": "|**2023-07-21**|**Poverty rate prediction using multi-modal survey and earth observation data**|Simone Fobi et.al.|[2307.11921v1](http://arxiv.org/abs/2307.11921v1)|null|\n", "2307.13600": "|**2023-07-25**|**Decisive Data using Multi-Modality Optical Sensors for Advanced Vehicular Systems**|Muhammad Ali Farooq et.al.|[2307.13600v1](http://arxiv.org/abs/2307.13600v1)|null|\n", "2307.13537": "|**2023-07-25**|**Spectrum-guided Multi-granularity Referring Video Object Segmentation**|Bo Miao et.al.|[2307.13537v1](http://arxiv.org/abs/2307.13537v1)|**[link](https://github.com/bo-miao/sgmg)**|\n", "2307.13529": "|**2023-07-25**|**Re-mine, Learn and Reason: Exploring the Cross-modal Semantic Correlations for Language-guided HOI detection**|Yichao Cao et.al.|[2307.13529v1](http://arxiv.org/abs/2307.13529v1)|null|\n", "2307.13205": "|**2023-07-25**|**Text-oriented Modality Reinforcement Network for Multimodal Sentiment Analysis from Unaligned Multimodal Sequences**|Yuxuan Lei et.al.|[2307.13205v1](http://arxiv.org/abs/2307.13205v1)|null|\n", "2307.13125": "|**2023-07-24**|**Deep Learning Approaches for Data Augmentation in Medical Imaging: A Review**|Aghiles Kebaili et.al.|[2307.13125v1](http://arxiv.org/abs/2307.13125v1)|null|\n", "2307.13069": "|**2023-07-24**|**General-Purpose Multi-Modal OOD Detection Framework**|Viet Duong et.al.|[2307.13069v1](http://arxiv.org/abs/2307.13069v1)|null|\n", "2307.14277": "|**2023-07-26**|**G2L: Semantically Aligned and Uniform Video Grounding via Geodesic and Game Theory**|Hongxiang Li et.al.|[2307.14277v1](http://arxiv.org/abs/2307.14277v1)|null|\n", "2307.14273": "|**2023-07-26**|**Deepfake Image Generation for Improved Brain Tumor Segmentation**|Roa'a Al-Emaryeen et.al.|[2307.14273v1](http://arxiv.org/abs/2307.14273v1)|null|\n", "2307.14244": "|**2023-07-26**|**Neural-based Cross-modal Search and Retrieval of Artwork**|Yan Gong et.al.|[2307.14244v1](http://arxiv.org/abs/2307.14244v1)|null|\n", "2307.14240": "|**2023-07-26**|**Boon: A Neural Search Engine for Cross-Modal Information Retrieval**|Yan Gong et.al.|[2307.14240v1](http://arxiv.org/abs/2307.14240v1)|null|\n", "2307.14185": "|**2023-07-26**|**A comparison of machine learning surrogate models of street-scale flooding in Norfolk, Virginia**|Diana McSpadden et.al.|[2307.14185v1](http://arxiv.org/abs/2307.14185v1)|null|\n", "2307.14126": "|**2023-07-26**|**Multi-modal Learning with Missing Modality via Shared-Specific Feature Modelling**|Hu Wang et.al.|[2307.14126v1](http://arxiv.org/abs/2307.14126v1)|null|\n", "2307.14061": "|**2023-07-26**|**Set-level Guidance Attack: Boosting Adversarial Transferability of Vision-Language Pre-training Models**|Dong Lu et.al.|[2307.14061v1](http://arxiv.org/abs/2307.14061v1)|**[link](https://github.com/Zoky-2020/Set-level_Guidance_Attack)**|\n", "2307.13950": "|**2023-07-26**|**Deep Robust Multi-Robot Re-localisation in Natural Environments**|Milad Ramezani et.al.|[2307.13950v1](http://arxiv.org/abs/2307.13950v1)|null|\n", "2307.13933": "|**2023-07-26**|**AIDE: A Vision-Driven Multi-View, Multi-Modal, Multi-Tasking Dataset for Assistive Driving Perception**|Dingkang Yang et.al.|[2307.13933v1](http://arxiv.org/abs/2307.13933v1)|**[link](https://github.com/ydk122024/aide)**|\n", "2307.13925": "|**2023-07-27**|**EasyNet: An Easy Network for 3D Industrial Anomaly Detection**|Ruitao Chen et.al.|[2307.13925v2](http://arxiv.org/abs/2307.13925v2)|null|\n", "2307.13871": "|**2023-07-26**|**Emulating Expert Insight: A Robust Strategy for Optimal Experimental Design**|Matthew R. Carbone et.al.|[2307.13871v1](http://arxiv.org/abs/2307.13871v1)|**[link](https://github.com/matthewcarbone/scientificvalueagent)**|\n", "2307.15016": "|**2023-07-27**|**How Good is Google Bard's Visual Understanding? An Empirical Study on Open Challenges**|Haotong Qin et.al.|[2307.15016v1](http://arxiv.org/abs/2307.15016v1)|**[link](https://github.com/htqin/googlebard-visunderstand)**|\n", "2307.14901": "|**2023-07-27**|**Text-guided Foundation Model Adaptation for Pathological Image Classification**|Yunkun Zhang et.al.|[2307.14901v1](http://arxiv.org/abs/2307.14901v1)|**[link](https://github.com/yunkun-zhang/cite)**|\n", "2307.14889": "|**2023-07-27**|**Weakly Supervised Multi-Modal 3D Human Body Pose Estimation for Autonomous Driving**|Peter Bauer et.al.|[2307.14889v1](http://arxiv.org/abs/2307.14889v1)|null|\n", "2307.14878": "|**2023-07-27**|**MESED: A Multi-modal Entity Set Expansion Dataset with Fine-grained Semantic Classes and Hard Negative Entities**|Yangning Li et.al.|[2307.14878v1](http://arxiv.org/abs/2307.14878v1)|**[link](https://github.com/thukelab/mesed)**|\n", "2307.14682": "|**2023-07-27**|**Unified Adversarial Patch for Visible-Infrared Cross-modal Attacks in the Physical World**|Xingxing Wei et.al.|[2307.14682v1](http://arxiv.org/abs/2307.14682v1)|**[link](https://github.com/aries-iai/cross-modal_patch_attack)**|\n", "2307.14619": "|**2023-07-29**|**Imitating Complex Trajectories: Bridging Low-Level Stability and High-Level Behavior**|Adam Block et.al.|[2307.14619v2](http://arxiv.org/abs/2307.14619v2)|null|\n", "2307.14572": "|**2023-07-27**|**Non-invasive Deep-Brain Imaging with 3D Integrated Photoacoustic Tomography and Ultrasound Localization Microscopy (3D-PAULM)**|Yuqi Tang et.al.|[2307.14572v1](http://arxiv.org/abs/2307.14572v1)|null|\n", "2307.14539": "|**2023-07-26**|**Plug and Pray: Exploiting off-the-shelf components of Multi-Modal Models**|Erfan Shayegani et.al.|[2307.14539v1](http://arxiv.org/abs/2307.14539v1)|null|\n", "2307.14523": "|**2023-07-26**|**Towards multi-modal anatomical landmark detection for ultrasound-guided brain tumor resection with contrastive learning**|Soorena Salari et.al.|[2307.14523v1](http://arxiv.org/abs/2307.14523v1)|null|\n", "2307.14491": "|**2023-07-26**|**Modality-Agnostic Audio-Visual Deepfake Detection**|Cai Yu et.al.|[2307.14491v1](http://arxiv.org/abs/2307.14491v1)|null|\n", "2307.15554": "|**2023-07-28**|**'What are you referring to?' Evaluating the Ability of Multi-Modal Dialogue Models to Process Clarificational Exchanges**|Javier Chiyah-Garcia et.al.|[2307.15554v1](http://arxiv.org/abs/2307.15554v1)|**[link](https://github.com/jchiyah/what-are-you-referring-to)**|\n", "2307.15460": "|**2023-07-28**|**Cross-Modal Concept Learning and Inference for Vision-Language Models**|Yi Zhang et.al.|[2307.15460v1](http://arxiv.org/abs/2307.15460v1)|null|\n", "2307.15432": "|**2023-07-28**|**CFN-ESA: A Cross-Modal Fusion Network with Emotion-Shift Awareness for Dialogue Emotion Recognition**|Jiang Li et.al.|[2307.15432v1](http://arxiv.org/abs/2307.15432v1)|null|\n", "2307.15344": "|**2023-07-28**|**Improving Audio-Text Retrieval via Hierarchical Cross-Modal Interaction and Auxiliary Captions**|Yifei Xin et.al.|[2307.15344v1](http://arxiv.org/abs/2307.15344v1)|null|\n", "2307.15220": "|**2023-07-27**|**Learning Multi-modal Representations by Watching Hundreds of Surgical Video Lectures**|Kun Yuan et.al.|[2307.15220v1](http://arxiv.org/abs/2307.15220v1)|**[link](https://github.com/camma-public/surgvlp)**|\n", "2307.15167": "|**2023-07-27**|**PEANUT: A Human-AI Collaborative Tool for Annotating Audio-Visual Data**|Zheng Zhang et.al.|[2307.15167v1](http://arxiv.org/abs/2307.15167v1)|null|\n", "2307.15097": "|**2023-07-27**|**Cascaded Cross-Modal Transformer for Request and Complaint Detection**|Nicolae-Catalin Ristea et.al.|[2307.15097v1](http://arxiv.org/abs/2307.15097v1)|null|\n", "2307.16896": "|**2023-07-31**|**Disruptive Autoencoders: Leveraging Low-level features for 3D Medical Image Pre-training**|Jeya Maria Jose Valanarasu et.al.|[2307.16896v1](http://arxiv.org/abs/2307.16896v1)|null|\n", "2307.16847": "|**2023-07-31**|**Latent Masking for Multimodal Self-supervised Learning in Health Timeseries**|Shohreh Deldari et.al.|[2307.16847v1](http://arxiv.org/abs/2307.16847v1)|null|\n", "2307.16745": "|**2023-07-31**|**Advancing Smart Malnutrition Monitoring: A Multi-Modal Learning Approach for Vital Health Parameter Estimation**|Ashish Marisetty et.al.|[2307.16745v1](http://arxiv.org/abs/2307.16745v1)|null|\n", "2307.16617": "|**2023-07-31**|**FULLER: Unified Multi-modality Multi-task 3D Perception via Multi-level Gradient Calibration**|Zhijian Huang et.al.|[2307.16617v1](http://arxiv.org/abs/2307.16617v1)|null|\n", "2307.16532": "|**2023-07-31**|**Echoes Beyond Points: Unleashing the Power of Raw Radar Data in Multi-modality Fusion**|Yang Liu et.al.|[2307.16532v1](http://arxiv.org/abs/2307.16532v1)|null|\n", "2307.16395": "|**2023-07-31**|**Bridging the Gap: Exploring the Capabilities of Bridge-Architectures for Complex Visual Reasoning Tasks**|Kousik Rajesh et.al.|[2307.16395v1](http://arxiv.org/abs/2307.16395v1)|null|\n", "2307.16366": "|**2023-07-31**|**Multi-modal Graph Neural Network for Early Diagnosis of Alzheimer's Disease from sMRI and PET Scans**|Yanteng Zhanga et.al.|[2307.16366v1](http://arxiv.org/abs/2307.16366v1)|null|\n", "2307.16210": "|**2023-08-01**|**Rethinking Uncertainly Missing and Ambiguous Visual Modality in Multi-Modal Entity Alignment**|Zhuo Chen et.al.|[2307.16210v2](http://arxiv.org/abs/2307.16210v2)|**[link](https://github.com/zjukg/umaea)**|\n", "2307.16142": "|**2023-07-30**|**Implicit Neural Representation in Medical Imaging: A Comparative Survey**|Amirali Molaei et.al.|[2307.16142v1](http://arxiv.org/abs/2307.16142v1)|**[link](https://github.com/mindflow-institue/awesome-implicit-neural-representations-in-medical-imaging)**|\n", "2307.16121": "|**2023-07-30**|**Uncertainty-Encoded Multi-Modal Fusion for Robust Object Detection in Autonomous Driving**|Yang Lou et.al.|[2307.16121v1](http://arxiv.org/abs/2307.16121v1)|null|\n", "2307.16106": "|**2023-07-30**|**TransFusion: A Practical and Effective Transformer-based Diffusion Model for 3D Human Motion Prediction**|Sibo Tian et.al.|[2307.16106v1](http://arxiv.org/abs/2307.16106v1)|null|\n", "2307.16013": "|**2023-07-29**|**Marrying Dialogue Systems with Data Visualization: Interactive Data Visualization Generation from Natural Language Conversations**|Yuanfeng Song et.al.|[2307.16013v1](http://arxiv.org/abs/2307.16013v1)|null|\n", "2307.15988": "|**2023-07-29**|**RGB-D-Fusion: Image Conditioned Depth Diffusion of Humanoid Subjects**|Sascha Kirch et.al.|[2307.15988v1](http://arxiv.org/abs/2307.15988v1)|**[link](https://github.com/sascha-kirch/rgb-d-fusion)**|\n", "2307.15942": "|**2023-07-29**|**CMDA: Cross-Modality Domain Adaptation for Nighttime Semantic Segmentation**|Ruihao Xia et.al.|[2307.15942v1](http://arxiv.org/abs/2307.15942v1)|**[link](https://github.com/xiarho/cmda)**|\n", "2307.15872": "|**2023-07-29**|**Cross-dimensional transfer learning in medical image segmentation with deep learning**|Hicham Messaoudi et.al.|[2307.15872v1](http://arxiv.org/abs/2307.15872v1)|**[link](https://github.com/hic-messaoudi/cross-dimensional-transfer-learning-in-medical-image-segmentation-with-deep-learning)**|\n", "2308.00692": "|**2023-08-03**|**LISA: Reasoning Segmentation via Large Language Model**|Xin Lai et.al.|[2308.00692v2](http://arxiv.org/abs/2308.00692v2)|**[link](https://github.com/dvlab-research/lisa)**|\n", "2308.00628": "|**2023-08-01**|**Human-M3: A Multi-view Multi-modal Dataset for 3D Human Pose Estimation in Outdoor Scenes**|Bohao Fan et.al.|[2308.00628v1](http://arxiv.org/abs/2308.00628v1)|**[link](https://github.com/soullessrobot/human-m3-dataset)**|\n", "2308.00588": "|**2023-08-01**|**Relation-Aware Distribution Representation Network for Person Clustering with Multiple Modalities**|Kaijian Liu et.al.|[2308.00588v1](http://arxiv.org/abs/2308.00588v1)|null|\n", "2308.00330": "|**2023-08-01**|**Advancing Frame-Dropping in Multi-Object Tracking-by-Detection Systems Through Event-Based Detection Triggering**|Matti Henning et.al.|[2308.00330v1](http://arxiv.org/abs/2308.00330v1)|null|\n", "2308.00295": "|**2023-08-01**|**Making the V in Text-VQA Matter**|Shamanthak Hegde et.al.|[2308.00295v1](http://arxiv.org/abs/2308.00295v1)|null|\n", "2308.00291": "|**2023-08-01**|**Fundus-Enhanced Disease-Aware Distillation Model for Retinal Disease Classification from OCT Images**|Lehan Wang et.al.|[2308.00291v1](http://arxiv.org/abs/2308.00291v1)|**[link](https://github.com/xmed-lab/fddm)**|\n", "2308.00264": "|**2023-08-01**|**Multi-Modality Multi-Loss Fusion Network**|Zehui Wu et.al.|[2308.00264v1](http://arxiv.org/abs/2308.00264v1)|null|\n", "2308.00235": "|**2023-08-01**|**Demonstrating Autonomous 3D Path Planning on a Novel Scalable UGV-UAV Morphing Robot**|Eric Sihite et.al.|[2308.00235v1](http://arxiv.org/abs/2308.00235v1)|null|\n", "2308.00228": "|**2023-08-01**|**Using Scene and Semantic Features for Multi-modal Emotion Recognition**|Zhifeng Wang et.al.|[2308.00228v1](http://arxiv.org/abs/2308.00228v1)|null|\n", "2307.16620": "|**2023-08-01**|**Audio-Visual Segmentation by Exploring Cross-Modal Mutual Semantics**|Chen Liu et.al.|[2307.16620v2](http://arxiv.org/abs/2307.16620v2)|null|\n", "2308.01217": "|**2023-08-02**|**TeachCLIP: Multi-Grained Teaching for Efficient Text-to-Video Retrieval**|Kaibin Tian et.al.|[2308.01217v1](http://arxiv.org/abs/2308.01217v1)|null|\n", "2308.01147": "|**2023-08-02**|**Contrast-augmented Diffusion Model with Fine-grained Sequence Alignment for Markup-to-Image Generation**|Guojin Zhong et.al.|[2308.01147v1](http://arxiv.org/abs/2308.01147v1)|**[link](https://github.com/zgj77/fsacdm)**|\n", "2308.01006": "|**2023-08-03**|**FusionAD: Multi-modality Fusion for Prediction and Planning Tasks of Autonomous Driving**|Tengju Ye et.al.|[2308.01006v2](http://arxiv.org/abs/2308.01006v2)|**[link](https://github.com/westlake-autolab/fusionad)**|\n", "2308.00980": "|**2023-08-02**|**Grasp Stability Assessment Through Attention-Guided Cross-Modality Fusion and Transfer Learning**|Zhuangzhuang Zhang et.al.|[2308.00980v1](http://arxiv.org/abs/2308.00980v1)|null|\n", "2308.00906": "|**2023-08-02**|**ImageBrush: Learning Visual In-Context Instructions for Exemplar-Based Image Manipulation**|Yasheng Sun et.al.|[2308.00906v1](http://arxiv.org/abs/2308.00906v1)|null|\n", "2308.00856": "|**2023-08-01**|**Differential Privacy for Adaptive Weight Aggregation in Federated Tumor Segmentation**|Muhammad Irfan Khan et.al.|[2308.00856v1](http://arxiv.org/abs/2308.00856v1)|null|\n", "2308.01731": "|**2023-08-03**|**Quantification of Predictive Uncertainty via Inference-Time Sampling**|Katar\u00edna T\u00f3thov\u00e1 et.al.|[2308.01731v1](http://arxiv.org/abs/2308.01731v1)|null|\n", "2308.01546": "|**2023-08-03**|**MusicLDM: Enhancing Novelty in Text-to-Music Generation Using Beat-Synchronous Mixup Strategies**|Ke Chen et.al.|[2308.01546v1](http://arxiv.org/abs/2308.01546v1)|**[link](https://github.com/retrocirce/musicldm)**|\n", "2308.01526": "|**2023-08-03**|**Data Augmentation for Human Behavior Analysis in Multi-Person Conversations**|Kun Li et.al.|[2308.01526v1](http://arxiv.org/abs/2308.01526v1)|null|\n", "2308.01328": "|**2023-08-02**|**A vision transformer-based framework for knowledge transfer from multi-modal to mono-modal lymphoma subtyping models**|Bilel Guetarni et.al.|[2308.01328v1](http://arxiv.org/abs/2308.01328v1)|null|\n", "2308.02487": "|**2023-08-04**|**Convolutions Die Hard: Open-Vocabulary Segmentation with Single Frozen Convolutional CLIP**|Qihang Yu et.al.|[2308.02487v1](http://arxiv.org/abs/2308.02487v1)|**[link](https://github.com/bytedance/fc-clip)**|\n", "2308.02463": "|**2023-08-04**|**Towards Generalist Foundation Model for Radiology**|Chaoyi Wu et.al.|[2308.02463v1](http://arxiv.org/abs/2308.02463v1)|**[link](https://github.com/chaoyi-wu/radfm)**|\n", "2308.02239": "|**2023-08-04**|**DTF-Net: Category-Level Pose Estimation and Shape Reconstruction via Deformable Template Field**|Haowen Wang et.al.|[2308.02239v1](http://arxiv.org/abs/2308.02239v1)|null|\n", "2308.02097": "|**2023-08-04**|**Multi-interactive Feature Learning and a Full-time Multi-modality Benchmark for Image Fusion and Segmentation**|Jinyuan Liu et.al.|[2308.02097v1](http://arxiv.org/abs/2308.02097v1)|**[link](https://github.com/jinyuanliu-cv/segmif)**|\n", "2308.01994": "|**2023-08-03**|**Explainable unsupervised multi-modal image registration using deep networks**|Chengjia Wang et.al.|[2308.01994v1](http://arxiv.org/abs/2308.01994v1)|null|\n", "2308.02299": "|**2023-08-03**|**RegionBLIP: A Unified Multi-modal Pre-training Framework for Holistic and Regional Comprehension**|Qiang Zhou et.al.|[2308.02299v1](http://arxiv.org/abs/2308.02299v1)|**[link](https://github.com/mightyzau/regionblip)**|\n", "2308.03729": "|**2023-08-07**|**Tiny LVLM-eHub: Early Multimodal Experiments with Bard**|Wenqi Shao et.al.|[2308.03729v1](http://arxiv.org/abs/2308.03729v1)|**[link](https://github.com/opengvlab/multi-modality-arena)**|\n", "2308.03666": "|**2023-08-07**|**Bridging Trustworthiness and Open-World Learning: An Exploratory Neural Approach for Enhancing Interpretability, Generalization, and Robustness**|Shide Du et.al.|[2308.03666v1](http://arxiv.org/abs/2308.03666v1)|null|\n", "2308.03475": "|**2023-08-07**|**COPA: Efficient Vision-Language Pre-training Through Collaborative Object- and Patch-Text Alignment**|Chaoya Jiang et.al.|[2308.03475v1](http://arxiv.org/abs/2308.03475v1)|null|\n", "2308.03432": "|**2023-08-07**|**Cuing Without Sharing: A Federated Cued Speech Recognition Framework via Mutual Knowledge Distillation**|Yuxuan Zhang et.al.|[2308.03432v1](http://arxiv.org/abs/2308.03432v1)|**[link](https://github.com/yuxuanzhang0713/fedcsr)**|\n", "2308.03424": "|**2023-08-07**|**CAESURA: Language Models as Multi-Modal Query Planners**|Matthias Urban et.al.|[2308.03424v1](http://arxiv.org/abs/2308.03424v1)|null|\n", "2308.03267": "|**2023-08-07**|**Redundancy-aware Transformer for Video Question Answering**|Yicong Li et.al.|[2308.03267v1](http://arxiv.org/abs/2308.03267v1)|null|\n", "2308.03256": "|**2023-08-07**|**Learning a Graph Neural Network with Cross Modality Interaction for Image Fusion**|Jiawei Li et.al.|[2308.03256v1](http://arxiv.org/abs/2308.03256v1)|**[link](https://github.com/lok-18/ignet)**|\n", "2308.03151": "|**2023-08-06**|**Food-500 Cap: A Fine-Grained Food Caption Benchmark for Evaluating Vision-Language Models**|Zheng Ma et.al.|[2308.03151v1](http://arxiv.org/abs/2308.03151v1)|**[link](https://github.com/aaronma2020/Food500-Cap)**|\n", "2308.03135": "|**2023-08-06**|**E-CLIP: Towards Label-efficient Event-based Open-world Understanding by CLIP**|Jiazhou Zhou et.al.|[2308.03135v1](http://arxiv.org/abs/2308.03135v1)|null|\n", "2308.02982": "|**2023-08-06**|**Beyond First Impressions: Integrating Joint Multi-modal Cues for Comprehensive 3D Representation**|Haowei Wang et.al.|[2308.02982v1](http://arxiv.org/abs/2308.02982v1)|**[link](https://github.com/mr-neko/jm3d)**|\n", "2308.02883": "|**2023-08-05**|**Cross-modal & Cross-domain Learning for Unsupervised LiDAR Semantic Segmentation**|Yiyang Chen et.al.|[2308.02883v1](http://arxiv.org/abs/2308.02883v1)|null|\n", "2308.02872": "|**2023-08-05**|**Data-Based Design of Multi-Model Inferential Sensors**|Martin Mojto et.al.|[2308.02872v1](http://arxiv.org/abs/2308.02872v1)|null|\n", "2308.02823": "|**2023-08-05**|**A Symbolic Character-Aware Model for Solving Geometry Problems**|Maizhen Ning et.al.|[2308.02823v1](http://arxiv.org/abs/2308.02823v1)|**[link](https://github.com/ning-mz/sca-gps)**|\n", "2308.04369": "|**2023-08-08**|**SSTFormer: Bridging Spiking Neural Network and Memory Support Transformer for Frame-Event based Recognition**|Xiao Wang et.al.|[2308.04369v1](http://arxiv.org/abs/2308.04369v1)|**[link](https://github.com/event-ahu/sstformer)**|\n", "2308.04352": "|**2023-08-08**|**3D-VisTA: Pre-trained Transformer for 3D Vision and Text Alignment**|Ziyu Zhu et.al.|[2308.04352v1](http://arxiv.org/abs/2308.04352v1)|null|\n", "2308.04343": "|**2023-08-08**|**Unifying Two-Stream Encoders with Transformers for Cross-Modal Retrieval**|Yi Bin et.al.|[2308.04343v1](http://arxiv.org/abs/2308.04343v1)|**[link](https://github.com/luminosityx/hat)**|\n", "2308.04126": "|**2023-08-08**|**OmniDataComposer: A Unified Data Structure for Multimodal Data Fusion and Infinite Data Generation**|Dongyang Yu et.al.|[2308.04126v1](http://arxiv.org/abs/2308.04126v1)|**[link](https://github.com/shajiayu1/OmniDataComposer)**|\n", "2308.04067": "|**2023-08-08**|**Online Distillation-enhanced Multi-modal Transformer for Sequential Recommendation**|Wei Ji et.al.|[2308.04067v1](http://arxiv.org/abs/2308.04067v1)|**[link](https://github.com/xyliugo/odmt)**|\n", "2308.03908": "|**2023-08-07**|**ViLP: Knowledge Exploration using Vision, Language, and Pose Embeddings for Video Action Recognition**|Soumyabrata Chaudhuri et.al.|[2308.03908v1](http://arxiv.org/abs/2308.03908v1)|null|\n", "2308.05061": "|**2023-08-09**|**Prompting In-Context Operator Learning with Sensor Data, Equations, and Natural Language**|Liu Yang et.al.|[2308.05061v1](http://arxiv.org/abs/2308.05061v1)|**[link](https://github.com/liuyangmage/in-context-operator-networks)**|\n", "2308.04992": "|**2023-08-09**|**AspectMMKG: A Multi-modal Knowledge Graph with Aspect-aware Entities**|Jingdan Zhang et.al.|[2308.04992v1](http://arxiv.org/abs/2308.04992v1)|**[link](https://github.com/thezjd/aspectmmkg)**|\n", "2308.04829": "|**2023-08-09**|**MixReorg: Cross-Modal Mixed Patch Reorganization is a Good Mask Learner for Open-World Semantic Segmentation**|Kaixin Cai et.al.|[2308.04829v1](http://arxiv.org/abs/2308.04829v1)|null|\n", "2308.04820": "|**2023-08-09**|**Strategic Interactions in Multi-modal Mobility Systems: A Game-Theoretic Perspective**|Gioele Zardini et.al.|[2308.04820v1](http://arxiv.org/abs/2308.04820v1)|null|\n", "2308.04779": "|**2023-08-09**|**Multi-View Fusion and Distillation for Subgrade Distresses Detection based on 3D-GPR**|Chunpeng Zhou et.al.|[2308.04779v1](http://arxiv.org/abs/2308.04779v1)|null|\n", "2308.04778": "|**2023-08-09**|**Multi-modal Multi-view Clustering based on Non-negative Matrix Factorization**|Yasser Khalafaoui et.al.|[2308.04778v1](http://arxiv.org/abs/2308.04778v1)|null|\n", "2308.04706": "|**2023-08-09**|**Pareto Invariant Representation Learning for Multimedia Recommendation**|Shanshan Huang et.al.|[2308.04706v1](http://arxiv.org/abs/2308.04706v1)|null|\n", "2308.04702": "|**2023-08-09**|**Continual Road-Scene Semantic Segmentation via Feature-Aligned Symmetric Multi-Modal Network**|Francesco Barbato et.al.|[2308.04702v1](http://arxiv.org/abs/2308.04702v1)|null|\n", "2308.04663": "|**2023-08-09**|**Classification of lung cancer subtypes on CT images with synthetic pathological priors**|Wentao Zhu et.al.|[2308.04663v1](http://arxiv.org/abs/2308.04663v1)|null|\n", "2308.04579": "|**2023-08-08**|**RECipe: Does a Multi-Modal Recipe Knowledge Graph Fit a Multi-Purpose Recommendation System?**|Ali Pesaranghader et.al.|[2308.04579v1](http://arxiv.org/abs/2308.04579v1)|null|\n", "2308.04556": "|**2023-08-08**|**FocalFormer3D : Focusing on Hard Instance for 3D Object Detection**|Yilun Chen et.al.|[2308.04556v1](http://arxiv.org/abs/2308.04556v1)|**[link](https://github.com/NVlabs/FocalFormer3D)**|\n", "2308.05667": "|**2023-08-14**|**2D3D-MATR: 2D-3D Matching Transformer for Detection-free Registration between Images and Point Clouds**|Minhao Li et.al.|[2308.05667v2](http://arxiv.org/abs/2308.05667v2)|**[link](https://github.com/minhaolee/2d3dmatr)**|\n", "2308.05648": "|**2023-08-10**|**Counterfactual Cross-modality Reasoning for Weakly Supervised Video Moment Localization**|Zezhong Lv et.al.|[2308.05648v1](http://arxiv.org/abs/2308.05648v1)|**[link](https://github.com/sldz0306/ccr)**|\n", "2308.05478": "|**2023-08-10**|**Reviewing 3D Object Detectors in the Context of High-Resolution 3+1D Radar**|Patrick Palmer et.al.|[2308.05478v1](http://arxiv.org/abs/2308.05478v1)|null|\n", "2308.05438": "|**2023-08-10**|**Deep Fusion Transformer Network with Weighted Vector-Wise Keypoints Voting for Robust 6D Object Pose Estimation**|Jun Zhou et.al.|[2308.05438v1](http://arxiv.org/abs/2308.05438v1)|**[link](https://github.com/junzastar/dftr_voting)**|\n", "2308.05421": "|**2023-08-10**|**Progressive Spatio-temporal Perception for Audio-Visual Question Answering**|Guangyao Li et.al.|[2308.05421v1](http://arxiv.org/abs/2308.05421v1)|**[link](https://github.com/gewu-lab/pstp-net)**|\n", "2308.05128": "|**2023-08-09**|**High-Level Features Parallelization for Inference Cost Reduction Through Selective Attention**|Andr\u00e9 Peter Kelm et.al.|[2308.05128v1](http://arxiv.org/abs/2308.05128v1)|null|\n", "2308.06262": "|**2023-08-11**|**Foundation Model is Efficient Multimodal Multitask Model Selector**|Fanqing Meng et.al.|[2308.06262v1](http://arxiv.org/abs/2308.06262v1)|**[link](https://github.com/opengvlab/multitask-model-selector)**|\n", "2308.06207": "|**2023-08-11**|**Thinking Like an Expert:Multimodal Hypergraph-of-Thought (HoT) Reasoning to boost Foundation Modals**|Fanglong Yao et.al.|[2308.06207v1](http://arxiv.org/abs/2308.06207v1)|null|\n", "2308.06125": "|**2023-08-11**|**Improving Joint Speech-Text Representations Without Alignment**|Cal Peyser et.al.|[2308.06125v1](http://arxiv.org/abs/2308.06125v1)|null|\n", "2308.06024": "|**2023-08-11**|**Spatial-information Guided Adaptive Context-aware Network for Efficient RGB-D Semantic Segmentation**|Yang Zhang et.al.|[2308.06024v1](http://arxiv.org/abs/2308.06024v1)|**[link](https://github.com/mvme-hbut/sgacnet)**|\n", "2308.06009": "|**2023-08-11**|**ViGT: Proposal-free Video Grounding with Learnable Token in Transformer**|Kun Li et.al.|[2308.06009v1](http://arxiv.org/abs/2308.06009v1)|null|\n", "2308.05993": "|**2023-08-11**|**Image-based Geolocalization by Ground-to-2.5D Map Matching**|Mengjie Zhou et.al.|[2308.05993v1](http://arxiv.org/abs/2308.05993v1)|**[link](https://github.com/zhoumengjie/2-5dmap-dataset)**|\n", "2308.05948": "|**2023-08-11**|**Uncertainty-Aware Cross-Modal Transfer Network for Sketch-Based 3D Shape Retrieval**|Yiyang Cai et.al.|[2308.05948v1](http://arxiv.org/abs/2308.05948v1)|null|\n", "2308.05864": "|**2023-08-10**|**The Multi-modality Cell Segmentation Challenge: Towards Universal Solutions**|Jun Ma et.al.|[2308.05864v1](http://arxiv.org/abs/2308.05864v1)|null|\n", "2308.07222": "|**2023-08-14**|**MM-GEF: Multi-modal representation meet collaborative filtering**|Hao Wu et.al.|[2308.07222v1](http://arxiv.org/abs/2308.07222v1)|null|\n", "2308.07214": "|**2023-08-14**|**Automated Ensemble-Based Segmentation of Adult Brain Tumors: A Novel Approach Using the BraTS AFRICA Challenge Data**|Chiranjeewee Prasad Koirala et.al.|[2308.07214v1](http://arxiv.org/abs/2308.07214v1)|null|\n", "2308.07173": "|**2023-08-14**|**Enhancing State Estimator for Autonomous Race Car : Leveraging Multi-modal System and Managing Computing Resources**|Daegyu Lee et.al.|[2308.07173v1](http://arxiv.org/abs/2308.07173v1)|null|\n", "2308.07146": "|**2023-08-14**|**CTP: Towards Vision-Language Continual Pretraining via Compatible Momentum Contrast and Topology Preservation**|Hongguang Zhu et.al.|[2308.07146v1](http://arxiv.org/abs/2308.07146v1)|**[link](https://github.com/kevinlight831/ctp)**|\n", "2308.07026": "|**2023-08-14**|**AdvCLIP: Downstream-agnostic Adversarial Examples in Multimodal Contrastive Learning**|Ziqi Zhou et.al.|[2308.07026v1](http://arxiv.org/abs/2308.07026v1)|**[link](https://github.com/cgcl-codes/advclip)**|\n", "2308.06911": "|**2023-08-14**|**GIT-Mol: A Multi-modal Large Language Model for Molecular Science with Graph, Image, and Text**|Pengfei Liu et.al.|[2308.06911v1](http://arxiv.org/abs/2308.06911v1)|null|\n", "2308.06866": "|**2023-08-13**|**Improving Face Recognition from Caption Supervision with Multi-Granular Contextual Feature Aggregation**|Md Mahedi Hasan et.al.|[2308.06866v1](http://arxiv.org/abs/2308.06866v1)|null|\n", "2308.06735": "|**2023-08-13**|**AerialVLN: Vision-and-Language Navigation for UAVs**|Shubo Liu et.al.|[2308.06735v1](http://arxiv.org/abs/2308.06735v1)|**[link](https://github.com/airvln/airvln)**|\n", "2308.06696": "|**2023-08-13**|**MACO: A Modality Adversarial and Contrastive Framework for Modality-missing Multi-modal Knowledge Graph Completion**|Yichi Zhang et.al.|[2308.06696v1](http://arxiv.org/abs/2308.06696v1)|**[link](https://github.com/zjukg/maco)**|\n", "2308.06573": "|**2023-08-12**|**4DRVO-Net: Deep 4D Radar-Visual Odometry Using Multi-Modal and Multi-Scale Adaptive Fusion**|Guirong Zhuo et.al.|[2308.06573v1](http://arxiv.org/abs/2308.06573v1)|null|\n", "2308.06556": "|**2023-08-12**|**Contrastive Learning for Cross-modal Artist Retrieval**|Andres Ferraro et.al.|[2308.06556v1](http://arxiv.org/abs/2308.06556v1)|null|\n", "2308.06530": "|**2023-08-12**|**BEV-DG: Cross-Modal Learning under Bird's-Eye View for Domain Generalization of 3D Semantic Segmentation**|Miaoyu Li et.al.|[2308.06530v1](http://arxiv.org/abs/2308.06530v1)|null|\n", "2308.06498": "|**2023-08-12**|**Latent Emission-Augmented Perspective-Taking (LEAPT) for Human-Robot Interaction**|Kaiqi Chen et.al.|[2308.06498v1](http://arxiv.org/abs/2308.06498v1)|null|\n", "2308.06394": "|**2023-08-11**|**Detecting and Preventing Hallucinations in Large Vision Language Models**|Anisha Gunjal et.al.|[2308.06394v1](http://arxiv.org/abs/2308.06394v1)|null|\n", "2308.06377": "|**2023-08-11**|**CATS v2: Hybrid encoders for robust medical segmentation**|Hao Li et.al.|[2308.06377v1](http://arxiv.org/abs/2308.06377v1)|**[link](https://github.com/haoli12345/cats)**|\n", "2308.07907": "|**2023-08-15**|**Sequential Monte Carlo with Cross-validated Neural Networks for Complexity of Hyperbolic Black Hole Solutions in 4D**|Armin Hatefi et.al.|[2308.07907v1](http://arxiv.org/abs/2308.07907v1)|null|\n", "2308.07777": "|**2023-08-15**|**Enhancing Visually-Rich Document Understanding via Layout Structure Modeling**|Qiwei Li et.al.|[2308.07777v1](http://arxiv.org/abs/2308.07777v1)|null|\n", "2308.07751": "|**2023-08-15**|**CASPNet++: Joint Multi-Agent Motion Prediction**|Maximilian Sch\u00e4fer et.al.|[2308.07751v1](http://arxiv.org/abs/2308.07751v1)|null|\n", "2308.07732": "|**2023-08-15**|**UniTR: A Unified and Efficient Multi-Modal Transformer for Bird's-Eye-View Representation**|Haiyang Wang et.al.|[2308.07732v1](http://arxiv.org/abs/2308.07732v1)|**[link](https://github.com/haiyang-w/unitr)**|\n", "2308.07686": "|**2023-08-15**|**Boosting Multi-modal Model Performance with Adaptive Gradient Modulation**|Hong Li et.al.|[2308.07686v1](http://arxiv.org/abs/2308.07686v1)|**[link](https://github.com/lihong2303/agm_iccv2023)**|\n", "2308.07648": "|**2023-08-15**|**Prompt Switch: Efficient CLIP Adaptation for Text-Video Retrieval**|Chaorui Deng et.al.|[2308.07648v1](http://arxiv.org/abs/2308.07648v1)|**[link](https://github.com/bladewaltz1/promptswitch)**|\n", "2308.07622": "|**2023-08-15**|**EMID: An Emotional Aligned Dataset in Audio-Visual Modality**|Jialing Zou et.al.|[2308.07622v1](http://arxiv.org/abs/2308.07622v1)|**[link](https://github.com/ecnu-aigc/emid)**|\n", "2308.07605": "|**2023-08-15**|**SGDiff: A Style Guided Diffusion Model for Fashion Synthesis**|Zhengwentai Sun et.al.|[2308.07605v1](http://arxiv.org/abs/2308.07605v1)|**[link](https://github.com/taited/sgdiff)**|\n", "2308.08546": "|**2023-08-16**|**What is the source of the PTA GW signal?**|John Ellis et.al.|[2308.08546v1](http://arxiv.org/abs/2308.08546v1)|null|\n", "2308.08409": "|**2023-08-16**|**X-PSI Parameter Recovery for Temperature Map Configurations Inspired by PSR J0030+0451**|Serena Vinciguerra et.al.|[2308.08409v1](http://arxiv.org/abs/2308.08409v1)|null|\n", "2308.08303": "|**2023-08-16**|**Leveraging Next-Active Objects for Context-Aware Anticipation in Egocentric Videos**|Sanket Thakur et.al.|[2308.08303v1](http://arxiv.org/abs/2308.08303v1)|null|\n", "2308.08157": "|**2023-08-16**|**Learning to Generate Semantic Layouts for Higher Text-Image Correspondence in Text-to-Image Synthesis**|Minho Park et.al.|[2308.08157v1](http://arxiv.org/abs/2308.08157v1)|**[link](https://github.com/pmh9960/GCDP)**|\n", "2308.08143": "|**2023-08-16**|**SCANet: A Self- and Cross-Attention Network for Audio-Visual Speech Separation**|Kai Li et.al.|[2308.08143v1](http://arxiv.org/abs/2308.08143v1)|null|\n", "2308.08125": "|**2023-08-16**|**Radio2Text: Streaming Speech Recognition Using mmWave Radio Signals**|Running Zhao et.al.|[2308.08125v1](http://arxiv.org/abs/2308.08125v1)|null|\n", "2308.08088": "|**2023-08-16**|**Pro-Cap: Leveraging a Frozen Vision-Language Model for Hateful Meme Detection**|Rui Cao et.al.|[2308.08088v1](http://arxiv.org/abs/2308.08088v1)|**[link](https://github.com/social-ai-studio/pro-cap)**|\n", "2308.09622": "|**2023-08-18**|**Is context all you need? Scaling Neural Sign Language Translation to Large Domains of Discourse**|Ozge Mercanoglu Sincan et.al.|[2308.09622v1](http://arxiv.org/abs/2308.09622v1)|null|\n", "2308.09599": "|**2023-08-18**|**Language-Guided Diffusion Model for Visual Grounding**|Sijia Chen et.al.|[2308.09599v1](http://arxiv.org/abs/2308.09599v1)|null|\n", "2308.09568": "|**2023-08-18**|**PUMGPT: A Large Vision-Language Model for Product Understanding**|Shuhui Wu et.al.|[2308.09568v1](http://arxiv.org/abs/2308.09568v1)|null|\n", "2308.09475": "|**2023-08-18**|**Video-Instrument Synergistic Network for Referring Video Instrument Segmentation in Robotic Surgery**|Hongqiu Wang et.al.|[2308.09475v1](http://arxiv.org/abs/2308.09475v1)|null|\n", "2308.09469": "|**2023-08-18**|**An updated mass-radius analysis of the 2017-2018 NICER data set of PSR J0030+0451**|Serena Vinciguerra et.al.|[2308.09469v1](http://arxiv.org/abs/2308.09469v1)|null|\n", "2308.09442": "|**2023-08-21**|**BioMedGPT: Open Multimodal Generative Pre-trained Transformer for BioMedicine**|Yizhen Luo et.al.|[2308.09442v2](http://arxiv.org/abs/2308.09442v2)|**[link](https://github.com/pharmolix/openbiomed)**|\n", "2308.09369": "|**2023-08-18**|**Single Frame Semantic Segmentation Using Multi-Modal Spherical Images**|Suresh Guttikonda et.al.|[2308.09369v1](http://arxiv.org/abs/2308.09369v1)|**[link](https://github.com/sguttikon/SFSS-MMSI)**|\n", "2308.09363": "|**2023-08-18**|**Open-vocabulary Video Question Answering: A New Benchmark for Evaluating the Generalizability of Video Question Answering Models**|Dohwan Ko et.al.|[2308.09363v1](http://arxiv.org/abs/2308.09363v1)|**[link](https://github.com/mlvlab/ovqa)**|\n", "2308.09351": "|**2023-08-18**|**RLIPv2: Fast Scaling of Relational Language-Image Pre-training**|Hangjie Yuan et.al.|[2308.09351v1](http://arxiv.org/abs/2308.09351v1)|**[link](https://github.com/jacobyuan7/rlipv2)**|\n", "2308.09322": "|**2023-08-18**|**Audio-Visual Glance Network for Efficient Video Recognition**|Muhammad Adi Nugroho et.al.|[2308.09322v1](http://arxiv.org/abs/2308.09322v1)|null|\n", "2308.09306": "|**2023-08-18**|**DiffDis: Empowering Generative Diffusion Model with Cross-Modal Discrimination Capability**|Runhui Huang et.al.|[2308.09306v1](http://arxiv.org/abs/2308.09306v1)|null|\n", "2308.09300": "|**2023-08-21**|**V2A-Mapper: A Lightweight Solution for Vision-to-Audio Generation by Connecting Foundation Models**|Heng Wang et.al.|[2308.09300v2](http://arxiv.org/abs/2308.09300v2)|**[link](https://github.com/heng-hw/V2A-Mapper)**|\n", "2308.09234": "|**2023-08-18**|**Deep Boosting Multi-Modal Ensemble Face Recognition with Sample-Level Weighting**|Sahar Rahimi Malakshan et.al.|[2308.09234v1](http://arxiv.org/abs/2308.09234v1)|null|\n", "2308.09179": "|**2023-08-17**|**Versatile Multi-Contact Planning and Control for Legged Loco-Manipulation**|Jean-Pierre Sleiman et.al.|[2308.09179v1](http://arxiv.org/abs/2308.09179v1)|null|\n", "2308.08930": "|**2023-08-17**|**Point-aware Interaction and CNN-induced Refinement Network for RGB-D Salient Object Detection**|Runmin Cong et.al.|[2308.08930v1](http://arxiv.org/abs/2308.08930v1)|**[link](https://github.com/rmcong/picr-net_acmmm23)**|\n", "2308.10777": "|**2023-08-21**|**I-BaR: Integrated Balance Rehabilitation Framework**|Tugce Ersoy et.al.|[2308.10777v1](http://arxiv.org/abs/2308.10777v1)|null|\n", "2308.10741": "|**2023-08-21**|**On the Adversarial Robustness of Multi-Modal Foundation Models**|Christian Schlarmann et.al.|[2308.10741v1](http://arxiv.org/abs/2308.10741v1)|null|\n", "2308.10631": "|**2023-08-21**|**PsyMo: A Dataset for Estimating Self-Reported Psychological Traits from Gait**|Adrian Cosma et.al.|[2308.10631v1](http://arxiv.org/abs/2308.10631v1)|null|\n", "2308.10627": "|**2023-08-21**|**Polarimetric Information for Multi-Modal 6D Pose Estimation of Photometrically Challenging Objects with Limited Data**|Patrick Ruhkamp et.al.|[2308.10627v1](http://arxiv.org/abs/2308.10627v1)|null|\n", "2308.10621": "|**2023-08-21**|**Multi-Modal Dataset Acquisition for Photometrically Challenging Object**|HyunJun Jung et.al.|[2308.10621v1](http://arxiv.org/abs/2308.10621v1)|null|\n", "2308.10491": "|**2023-08-21**|**SynDrone -- Multi-modal UAV Dataset for Urban Scenarios**|Giulia Rizzoli et.al.|[2308.10491v1](http://arxiv.org/abs/2308.10491v1)|**[link](https://github.com/lttm/syndrone)**|\n", "2308.10486": "|**2023-08-21**|**Deep Metric Loss for Multimodal Learning**|Sehwan Moon et.al.|[2308.10486v1](http://arxiv.org/abs/2308.10486v1)|**[link](https://github.com/sehwanmoon/multimodalloss)**|\n", "2308.10454": "|**2023-08-21**|**Elucidating STEM Concepts through Generative AI: A Multi-modal Exploration of Analogical Reasoning**|Chen Cao et.al.|[2308.10454v1](http://arxiv.org/abs/2308.10454v1)|null|\n", "2308.10421": "|**2023-08-21**|**UniM$^2$AE: Multi-modal Masked Autoencoders with Unified 3D Representation for 3D Perception in Autonomous Driving**|Jian Zou et.al.|[2308.10421v1](http://arxiv.org/abs/2308.10421v1)|**[link](https://github.com/hollow-503/unim2ae)**|\n", "2308.10362": "|**2023-08-20**|**Vehicle Cameras Guide mmWave Beams: Approach and Real-World V2V Demonstration**|Tawfik Osman et.al.|[2308.10362v1](http://arxiv.org/abs/2308.10362v1)|null|\n", "2308.10240": "|**2023-08-20**|**Generic Attention-model Explainability by Weighted Relevance Accumulation**|Yiming Huang et.al.|[2308.10240v1](http://arxiv.org/abs/2308.10240v1)|null|\n", "2308.10175": "|**2023-08-20**|**BAVS: Bootstrapping Audio-Visual Segmentation by Integrating Foundation Knowledge**|Chen Liu et.al.|[2308.10175v1](http://arxiv.org/abs/2308.10175v1)|null|\n", "2308.10172": "|**2023-08-20**|**VLN-PETL: Parameter-Efficient Transfer Learning for Vision-and-Language Navigation**|Yanyuan Qiao et.al.|[2308.10172v1](http://arxiv.org/abs/2308.10172v1)|**[link](https://github.com/yanyuanqiao/vln-petl)**|\n", "2308.10161": "|**2023-08-20**|**ThermRad: A Multi-modal Dataset for Robust 3D Object Detection under Challenging Conditions**|Qiao Yan et.al.|[2308.10161v1](http://arxiv.org/abs/2308.10161v1)|null|\n", "2308.10146": "|**2023-08-20**|**OCHID-Fi: Occlusion-Robust Hand Pose Estimation in 3D via RF-Vision**|Shujie Zhang et.al.|[2308.10146v1](http://arxiv.org/abs/2308.10146v1)|null|\n", "2308.11601": "|**2023-08-23**|**Tryage: Real-time, intelligent Routing of User Prompts to Large Language Models**|Surya Narayanan Hari et.al.|[2308.11601v2](http://arxiv.org/abs/2308.11601v2)|null|\n", "2308.11561": "|**2023-08-23**|**Target-Grounded Graph-Aware Transformer for Aerial Vision-and-Dialog Navigation**|Yifei Su et.al.|[2308.11561v2](http://arxiv.org/abs/2308.11561v2)|**[link](https://github.com/yifeisu/avdn-challenge)**|\n", "2308.11551": "|**2023-08-22**|**Multi-event Video-Text Retrieval**|Gengyuan Zhang et.al.|[2308.11551v1](http://arxiv.org/abs/2308.11551v1)|**[link](https://github.com/gengyuanmax/mevtr)**|\n", "2308.11530": "|**2023-08-22**|**Furnishing Sound Event Detection with Language Model Abilities**|Hualei Wang et.al.|[2308.11530v1](http://arxiv.org/abs/2308.11530v1)|null|\n", "2308.11513": "|**2023-08-22**|**TrackFlow: Multi-Object Tracking with Normalizing Flows**|Gianluca Mancusi et.al.|[2308.11513v1](http://arxiv.org/abs/2308.11513v1)|null|\n", "2308.11501": "|**2023-08-22**|**Four years of multi-modal odometry and mapping on the rail vehicles**|Yusheng Wang et.al.|[2308.11501v1](http://arxiv.org/abs/2308.11501v1)|null|\n", "2308.11492": "|**2023-08-22**|**A LiDAR-Inertial SLAM Tightly-Coupled with Dropout-Tolerant GNSS Fusion for Autonomous Mine Service Vehicles**|Yusheng Wang et.al.|[2308.11492v1](http://arxiv.org/abs/2308.11492v1)|null|\n", "2308.11356": "|**2023-08-22**|**Semantic RGB-D Image Synthesis**|Shijie Li et.al.|[2308.11356v1](http://arxiv.org/abs/2308.11356v1)|null|\n", "2308.11351": "|**2023-08-22**|**M3PS: End-to-End Multi-Grained Multi-Modal Attribute-Aware Product Summarization in E-commerce**|Tao Chen et.al.|[2308.11351v1](http://arxiv.org/abs/2308.11351v1)|null|\n", "2308.11331": "|**2023-08-22**|**GrowCLIP: Data-aware Automatic Model Growing for Large-scale Contrastive Language-Image Pre-training**|Xinchi Deng et.al.|[2308.11331v1](http://arxiv.org/abs/2308.11331v1)|null|\n", "2308.11206": "|**2023-08-22**|**DiffCloth: Diffusion Based Garment Synthesis and Manipulation via Structural Cross-modal Semantic Alignment**|Xujie Zhang et.al.|[2308.11206v1](http://arxiv.org/abs/2308.11206v1)|null|\n", "2308.11175": "|**2023-08-22**|**MISSRec: Pre-training and Transferring Multi-modal Interest-aware Sequence Representation for Recommendation**|Jinpeng Wang et.al.|[2308.11175v1](http://arxiv.org/abs/2308.11175v1)|**[link](https://github.com/gimpong/MM23-MISSRec)**|\n", "2308.11165": "|**2023-08-22**|**Improving Misaligned Multi-modality Image Fusion with One-stage Progressive Dense Registration**|Di Wang et.al.|[2308.11165v1](http://arxiv.org/abs/2308.11165v1)|null|\n", "2308.12199": "|**2023-08-23**|**Towards Real-Time Analysis of Broadcast Badminton Videos**|Nitin Nilesh et.al.|[2308.12199v1](http://arxiv.org/abs/2308.12199v1)|**[link](https://gitlab.com/nitin.nilesh/badminton-analysis-star)**|\n", "2308.12163": "|**2023-08-23**|**NPF-200: A Multi-Modal Eye Fixation Dataset and Method for Non-Photorealistic Videos**|Ziyu Yang et.al.|[2308.12163v1](http://arxiv.org/abs/2308.12163v1)|**[link](https://github.com/yangziyu/npf200)**|\n", "2308.12111": "|**2023-08-23**|**Cross-Modality Proposal-guided Feature Mining for Unregistered RGB-Thermal Pedestrian Detection**|Chao Tian et.al.|[2308.12111v1](http://arxiv.org/abs/2308.12111v1)|null|\n", "2308.12049": "|**2023-08-23**|**Towards Privacy-Supporting Fall Detection via Deep Unsupervised RGB2Depth Adaptation**|Hejun Xiao et.al.|[2308.12049v1](http://arxiv.org/abs/2308.12049v1)|**[link](https://github.com/1015206533/privacy_supporting_fall_detection)**|\n", "2308.11994": "|**2023-08-23**|**Progressive Feature Mining and External Knowledge-Assisted Text-Pedestrian Image Retrieval**|Huafeng Li et.al.|[2308.11994v1](http://arxiv.org/abs/2308.11994v1)|null|\n", "2308.11983": "|**2023-08-23**|**Multi-Modal Multi-Task (3MT) Road Segmentation**|Erkan Milli et.al.|[2308.11983v1](http://arxiv.org/abs/2308.11983v1)|**[link](https://github.com/erkanmilli/3mt-roadseg)**|\n", "2308.11880": "|**2023-08-23**|**SUMMIT: Source-Free Adaptation of Uni-Modal Models to Multi-Modal Targets**|Cody Simons et.al.|[2308.11880v1](http://arxiv.org/abs/2308.11880v1)|**[link](https://github.com/csimo005/summit)**|\n", "2308.11877": "|**2023-08-24**|**Integrated Image and Location Analysis for Wound Classification: A Deep Learning Approach**|Yash Patel et.al.|[2308.11877v2](http://arxiv.org/abs/2308.11877v2)|null|\n", "2308.11804": "|**2023-08-22**|**Ceci n'est pas une pomme: Adversarial Illusions in Multi-Modal Embeddings**|Eugene Bagdasaryan et.al.|[2308.11804v1](http://arxiv.org/abs/2308.11804v1)|**[link](https://github.com/ebagdasa/adversarial_illusions)**|\n", "2308.11797": "|**2023-08-22**|**CLIP Multi-modal Hashing: A new baseline CLIPMH**|Jian Zhu et.al.|[2308.11797v1](http://arxiv.org/abs/2308.11797v1)|null|\n", "2308.12956": "|**2023-08-24**|**DLIP: Distilling Language-Image Pre-training**|Huafeng Kuang et.al.|[2308.12956v1](http://arxiv.org/abs/2308.12956v1)|null|\n", "2308.12871": "|**2023-08-24**|**IPA: Inference Pipeline Adaptation to Achieve High Accuracy and Cost-Efficiency**|Saeid Ghafouri et.al.|[2308.12871v1](http://arxiv.org/abs/2308.12871v1)|null|\n", "2308.12863": "|**2023-08-24**|**SkipcrossNets: Adaptive Skip-cross Fusion for Road Detection**|Xinyu Zhang et.al.|[2308.12863v1](http://arxiv.org/abs/2308.12863v1)|null|\n", "2308.12755": "|**2023-08-24**|**Acquiring Qualitative Explainable Graphs for Automated Driving Scene Interpretation**|Nassim Belmecheri et.al.|[2308.12755v1](http://arxiv.org/abs/2308.12755v1)|**[link](https://github.com/simula-vias/qxg-builder)**|\n", "2308.12736": "|**2023-08-24**|**FastSurfer-HypVINN: Automated sub-segmentation of the hypothalamus and adjacent structures on high-resolutional brain MRI**|Santiago Estrada et.al.|[2308.12736v1](http://arxiv.org/abs/2308.12736v1)|**[link](https://github.com/Deep-MI/FastSurfer)**|\n", "2308.12610": "|**2023-08-24**|**Emotion-Aligned Contrastive Learning Between Images and Music**|Shanti Stewart et.al.|[2308.12610v1](http://arxiv.org/abs/2308.12610v1)|null|\n", "2308.12604": "|**2023-08-24**|**PromptMRG: Diagnosis-Driven Prompts for Medical Report Generation**|Haibo Jin et.al.|[2308.12604v1](http://arxiv.org/abs/2308.12604v1)|null|\n", "2308.12587": "|**2023-08-24**|**Grounded Entity-Landmark Adaptive Pre-training for Vision-and-Language Navigation**|Yibo Cui et.al.|[2308.12587v1](http://arxiv.org/abs/2308.12587v1)|**[link](https://github.com/csir1996/vln-gela)**|\n", "2308.12558": "|**2023-08-24**|**Hyperbolic Audio-visual Zero-shot Learning**|Jie Hong et.al.|[2308.12558v1](http://arxiv.org/abs/2308.12558v1)|null|\n", "2308.12509": "|**2023-08-24**|**Parameter-Efficient Transfer Learning for Remote Sensing Image-Text Retrieval**|Yuan Yuan et.al.|[2308.12509v1](http://arxiv.org/abs/2308.12509v1)|**[link](https://github.com/ZhanYang-nwpu/PE-RSITR)**|\n", "2308.12370": "|**2023-08-23**|**AdVerb: Visually Guided Audio Dereverberation**|Sanjoy Chowdhury et.al.|[2308.12370v1](http://arxiv.org/abs/2308.12370v1)|null|\n", "2308.12320": "|**2023-08-23**|**Understanding Dark Scenes by Contrasting Multi-Modal Observations**|Xiaoyu Dong et.al.|[2308.12320v1](http://arxiv.org/abs/2308.12320v1)|**[link](https://github.com/palmdong/smmcl)**|\n", "2308.13437": "|**2023-08-25**|**Position-Enhanced Visual Instruction Tuning for Multimodal Large Language Models**|Chi Chen et.al.|[2308.13437v1](http://arxiv.org/abs/2308.13437v1)|**[link](https://github.com/pvit-official/pvit)**|\n", "2308.13392": "|**2023-08-25**|**Self-Supervised Representation Learning with Cross-Context Learning between Global and Hypercolumn Features**|Zheng Gao et.al.|[2308.13392v1](http://arxiv.org/abs/2308.13392v1)|null|\n", "2308.13355": "|**2023-08-25**|**WorldSmith: Iterative and Expressive Prompting for World Building with a Generative AI**|Hai Dang et.al.|[2308.13355v1](http://arxiv.org/abs/2308.13355v1)|null|\n", "2308.13340": "|**2023-08-25**|**TriGait: Aligning and Fusing Skeleton and Silhouette Gait Data via a Tri-Branch Network**|Yan Sun et.al.|[2308.13340v1](http://arxiv.org/abs/2308.13340v1)|**[link](https://github.com/feng-xueling/trigait)**|\n", "2308.13077": "|**2023-08-24**|**Preserving Modality Structure Improves Multi-Modal Learning**|Swetha Sirnam et.al.|[2308.13077v1](http://arxiv.org/abs/2308.13077v1)|null|\n", "2308.14713": "|**2023-08-28**|**R3D3: Dense 3D Reconstruction of Dynamic Scenes from Multiple Cameras**|Aron Schmied et.al.|[2308.14713v1](http://arxiv.org/abs/2308.14713v1)|null|\n", "2308.14619": "|**2023-08-29**|**Compositional Semantic Mix for Domain Adaptation in Point Cloud Segmentation**|Cristiano Saltori et.al.|[2308.14619v2](http://arxiv.org/abs/2308.14619v2)|**[link](https://github.com/saltoricristiano/cosmix-uda)**|\n", "2308.14613": "|**2023-08-28**|**MS-Net: A Multi-modal Self-supervised Network for Fine-Grained Classification of Aircraft in SAR Images**|Bingying Yue et.al.|[2308.14613v1](http://arxiv.org/abs/2308.14613v1)|null|\n", "2308.14482": "|**2023-08-28**|**An Empirical Study of Consistency Regularization for End-to-End Speech-to-Text Translation**|Pengzhi Gao et.al.|[2308.14482v1](http://arxiv.org/abs/2308.14482v1)|**[link](https://github.com/gpengzhi/simcr)**|\n", "2308.14383": "|**2023-08-28**|**Multi-Modal Neural Radiance Field for Monocular Dense SLAM with a Light-Weight ToF Sensor**|Xinyang Liu et.al.|[2308.14383v1](http://arxiv.org/abs/2308.14383v1)|null|\n", "2308.14263": "|**2023-08-28**|**Cross-Modal Retrieval: A Systematic Review of Methods and Future Directions**|Lei Zhu et.al.|[2308.14263v1](http://arxiv.org/abs/2308.14263v1)|**[link](https://github.com/bmc-sdnu/cross-modal-retrieval)**|\n", "2308.14212": "|**2023-08-27**|**Exploring the Transfer Learning Capabilities of CLIP in Domain Generalization for Diabetic Retinopathy**|Sanoojan Baliah et.al.|[2308.14212v1](http://arxiv.org/abs/2308.14212v1)|**[link](https://github.com/sanoojan/clip-drdg)**|\n", "2308.14177": "|**2023-08-27**|**AIGC for Various Data Modalities: A Survey**|Lin Geng Foo et.al.|[2308.14177v1](http://arxiv.org/abs/2308.14177v1)|null|\n", "2308.14160": "|**2023-08-27**|**A Unified Transformer-based Network for multimodal Emotion Recognition**|Kamran Ali et.al.|[2308.14160v1](http://arxiv.org/abs/2308.14160v1)|null|\n", "2308.14105": "|**2023-08-29**|**Unified and Dynamic Graph for Temporal Character Grouping in Long Videos**|Xiujun Shu et.al.|[2308.14105v2](http://arxiv.org/abs/2308.14105v2)|null|\n", "2308.14083": "|**2023-08-27**|**4D Myocardium Reconstruction with Decoupled Motion and Shape Model**|Xiaohan Yuan et.al.|[2308.14083v1](http://arxiv.org/abs/2308.14083v1)|**[link](https://github.com/yuan-xiaohan/4d-myocardium-reconstruction-with-decoupled-motion-and-shape-model)**|\n", "2308.14064": "|**2023-08-27**|**Multi-model fusion for Aerial Vision and Dialog Navigation based on human attention aids**|Xinyi Wang et.al.|[2308.14064v1](http://arxiv.org/abs/2308.14064v1)|null|\n", "2308.14023": "|**2023-08-27**|**Domain-Specificity Inducing Transformers for Source-Free Domain Adaptation**|Sunandini Sanyal et.al.|[2308.14023v1](http://arxiv.org/abs/2308.14023v1)|null|\n", "2308.14009": "|**2023-08-27**|**Towards Fast and Accurate Image-Text Retrieval with Self-Supervised Fine-Grained Alignment**|Jiamin Zhuang et.al.|[2308.14009v1](http://arxiv.org/abs/2308.14009v1)|**[link](https://github.com/zjamie813/selfalign)**|\n", "2308.13976": "|**2023-08-27**|**Label Denoising through Cross-Model Agreement**|Yu Wang et.al.|[2308.13976v1](http://arxiv.org/abs/2308.13976v1)|null|\n", "2308.15273": "|**2023-08-29**|**Cross-Modal Retrieval Meets Inference:Improving Zero-Shot Classification with Cross-Modal Retrieval**|Seongha Eom et.al.|[2308.15273v1](http://arxiv.org/abs/2308.15273v1)|null|\n", "2308.15063": "|**2023-08-29**|**Learning Cross-modality Information Bottleneck Representation for Heterogeneous Person Re-Identification**|Haichao Shi et.al.|[2308.15063v1](http://arxiv.org/abs/2308.15063v1)|null|\n", "2308.14978": "|**2023-08-29**|**Vision Grid Transformer for Document Layout Analysis**|Cheng Da et.al.|[2308.14978v1](http://arxiv.org/abs/2308.14978v1)|**[link](https://github.com/alibabaresearch/advancedliteratemachinery)**|\n", "2308.14786": "|**2023-08-28**|**Extending Cross-Modal Retrieval with Interactive Learning to Improve Image Retrieval Performance in Forensics**|Nils B\u00f6hne et.al.|[2308.14786v1](http://arxiv.org/abs/2308.14786v1)|null|\n", "2308.16150": "|**2023-08-30**|**Modality Cycles with Masked Conditional Diffusion for Unsupervised Anomaly Segmentation in MRI**|Ziyun Liang et.al.|[2308.16150v1](http://arxiv.org/abs/2308.16150v1)|**[link](https://github.com/ziyunliang/mmccd)**|\n", "2308.16071": "|**2023-08-30**|**Semantic Image Synthesis via Class-Adaptive Cross-Attention**|Tomaso Fontanini et.al.|[2308.16071v1](http://arxiv.org/abs/2308.16071v1)|null|\n", "2308.16021": "|**2023-08-30**|**CALM: Contrastive Cross-modal Speaking Style Modeling for Expressive Text-to-Speech Synthesis**|Yi Meng et.al.|[2308.16021v1](http://arxiv.org/abs/2308.16021v1)|null|\n", "2308.15980": "|**2023-08-30**|**Adaptive Multi-Modalities Fusion in Sequential Recommendation Systems**|Hengchang Hu et.al.|[2308.15980v1](http://arxiv.org/abs/2308.15980v1)|**[link](https://github.com/holdenhu/mmsr)**|\n", "2308.15930": "|**2023-08-30**|**LLaSM: Large Language and Speech Model**|Yu Shu et.al.|[2308.15930v1](http://arxiv.org/abs/2308.15930v1)|**[link](https://github.com/linksoul-ai/llasm)**|\n", "2308.15846": "|**2023-08-30**|**Exploring Multi-Modal Contextual Knowledge for Open-Vocabulary Object Detection**|Yifan Xu et.al.|[2308.15846v1](http://arxiv.org/abs/2308.15846v1)|null|\n", "2308.15670": "|**2023-08-29**|**Multimodal Foundation Models For Echocardiogram Interpretation**|Matthew Christensen et.al.|[2308.15670v1](http://arxiv.org/abs/2308.15670v1)|**[link](https://github.com/echonet/echo_CLIP)**|\n", "2308.15640": "|**2023-08-29**|**Identifying Constitutive Parameters for Complex Hyperelastic Solids using Physics-Informed Neural Networks**|Siyuan Song et.al.|[2308.15640v1](http://arxiv.org/abs/2308.15640v1)|null|\n", "2308.15609": "|**2023-08-29**|**InstaTune: Instantaneous Neural Architecture Search During Fine-Tuning**|Sharath Nittur Sridhar et.al.|[2308.15609v1](http://arxiv.org/abs/2308.15609v1)|null|\n", "2308.15592": "|**2023-08-29**|**Non-local Interactions are Essential Elements for Dark Matter Halo Stability: A Cross-Model Study**|Ahmad Borzou et.al.|[2308.15592v1](http://arxiv.org/abs/2308.15592v1)|null|\n", "2308.16896": "|**2023-08-31**|**PointOcc: Cylindrical Tri-Perspective View for Point-based 3D Semantic Occupancy Prediction**|Sicheng Zuo et.al.|[2308.16896v1](http://arxiv.org/abs/2308.16896v1)|**[link](https://github.com/wzzheng/pointocc)**|\n", "2308.16777": "|**2023-09-01**|**Ref-Diff: Zero-shot Referring Image Segmentation with Generative Models**|Minheng Ni et.al.|[2308.16777v2](http://arxiv.org/abs/2308.16777v2)|null|\n", "2308.16758": "|**2023-08-31**|**Towards High-Fidelity Text-Guided 3D Face Generation and Manipulation Using only Images**|Cuican Yu et.al.|[2308.16758v1](http://arxiv.org/abs/2308.16758v1)|null|\n", "2308.16649": "|**2023-08-31**|**Learning with Multi-modal Gradient Attention for Explainable Composed Image Retrieval**|Prateksha Udhayanan et.al.|[2308.16649v1](http://arxiv.org/abs/2308.16649v1)|null|\n", "2308.16632": "|**2023-08-31**|**3D-STMN: Dependency-Driven Superpoint-Text Matching Network for End-to-End 3D Referring Expression Segmentation**|Changli Wu et.al.|[2308.16632v1](http://arxiv.org/abs/2308.16632v1)|**[link](https://github.com/sosppxo/3d-stmn)**|\n", "2308.16493": "|**2023-08-31**|**Expanding Frozen Vision-Language Models without Retraining: Towards Improved Robot Perception**|Riley Tavassoli et.al.|[2308.16493v1](http://arxiv.org/abs/2308.16493v1)|null|\n", "2308.16474": "|**2023-08-31**|**Enhancing Subtask Performance of Multi-modal Large Language Model**|Yongqiang Zhao et.al.|[2308.16474v1](http://arxiv.org/abs/2308.16474v1)|null|\n", "2308.16437": "|**2023-08-31**|**AntM$^{2}$C: A Large Scale Dataset For Multi-Scenario Multi-Modal CTR Prediction**|Zhaoxin Huan et.al.|[2308.16437v1](http://arxiv.org/abs/2308.16437v1)|null|\n", "2308.16386": "|**2023-08-31**|**RGB-T Tracking via Multi-Modal Mutual Prompt Learning**|Yang Luo et.al.|[2308.16386v1](http://arxiv.org/abs/2308.16386v1)|**[link](https://github.com/husteryoung/mplt)**|\n", "2309.00615": "|**2023-09-01**|**Point-Bind & Point-LLM: Aligning Point Cloud with Multi-modality for 3D Understanding, Generation, and Instruction Following**|Ziyu Guo et.al.|[2309.00615v1](http://arxiv.org/abs/2309.00615v1)|**[link](https://github.com/ziyuguo99/point-bind_point-llm)**|\n", "2309.00406": "|**2023-09-01**|**Constraining X-ray variability of the blazar 3C 273 using XMM-Newton observations over two decades**|Adithiya Dinesh et.al.|[2309.00406v1](http://arxiv.org/abs/2309.00406v1)|null|\n", "2309.00380": "|**2023-09-01**|**Learning multi-modal generative models with permutation-invariant encoders and tighter variational bounds**|Marcel Hirt et.al.|[2309.00380v1](http://arxiv.org/abs/2309.00380v1)|null|\n", "2309.00372": "|**2023-09-01**|**On the Localization of Ultrasound Image Slices within Point Distribution Models**|Lennart Bastian et.al.|[2309.00372v1](http://arxiv.org/abs/2309.00372v1)|**[link](https://github.com/vuenc/slice-to-shape)**|\n", "2309.00227": "|**2023-09-01**|**What Makes Good Open-Vocabulary Detector: A Disassembling Perspective**|Jincheng Li et.al.|[2309.00227v1](http://arxiv.org/abs/2309.00227v1)|null|\n", "2309.00133": "|**2023-08-31**|**Distraction-free Embeddings for Robust VQA**|Atharvan Dogra et.al.|[2309.00133v1](http://arxiv.org/abs/2309.00133v1)|null|\n", "2309.00030": "|**2023-08-31**|**Audio-Driven Dubbing for User Generated Contents via Style-Aware Semi-Parametric Synthesis**|Linsen Song et.al.|[2309.00030v1](http://arxiv.org/abs/2309.00030v1)|null|\n", "2309.02320": "|**2023-09-05**|**SeisCLIP: A seismology foundation model pre-trained by multi-modal data for multi-purpose seismic feature extraction**|Xu Si et.al.|[2309.02320v1](http://arxiv.org/abs/2309.02320v1)|**[link](https://github.com/sixu0/SeisCLIP)**|\n", "2309.02169": "|**2023-09-05**|**Dual Relation Alignment for Composed Image Retrieval**|Xintong Jiang et.al.|[2309.02169v1](http://arxiv.org/abs/2309.02169v1)|null|\n", "2309.02124": "|**2023-09-05**|**Exploiting Spatial-temporal Data for Sleep Stage Classification via Hypergraph Learning**|Yuze Liu et.al.|[2309.02124v1](http://arxiv.org/abs/2309.02124v1)|null|\n", "2309.02043": "|**2023-09-05**|**Decomposed Guided Dynamic Filters for Efficient RGB-Guided Depth Completion**|Yufei Wang et.al.|[2309.02043v1](http://arxiv.org/abs/2309.02043v1)|null|\n", "2309.02041": "|**2023-09-05**|**Learning Cross-Modal Affinity for Referring Video Object Segmentation Targeting Limited Samples**|Guanghui Li et.al.|[2309.02041v1](http://arxiv.org/abs/2309.02041v1)|**[link](https://github.com/hengliusky/few_shot_rvos)**|\n", "2309.01981": "|**2023-09-05**|**Graph-Based Interaction-Aware Multimodal 2D Vehicle Trajectory Prediction using Diffusion Graph Convolutional Networks**|Keshu Wu et.al.|[2309.01981v1](http://arxiv.org/abs/2309.01981v1)|null|\n", "2309.01955": "|**2023-09-05**|**A Survey on Interpretable Cross-modal Reasoning**|Dizhan Xue et.al.|[2309.01955v1](http://arxiv.org/abs/2309.01955v1)|**[link](https://github.com/ZuyiZhou/Awesome-Interpretable-Cross-modal-Reasoning)**|\n", "2309.01918": "|**2023-09-05**|**RoboAgent: Generalization and Efficiency in Robot Manipulation via Semantic Augmentations and Action Chunking**|Homanga Bharadhwaj et.al.|[2309.01918v1](http://arxiv.org/abs/2309.01918v1)|null|\n", "2309.01860": "|**2023-09-06**|**Attention-Driven Multi-Modal Fusion: Enhancing Sign Language Recognition and Translation**|Zaber Ibn Abdul Hakim et.al.|[2309.01860v2](http://arxiv.org/abs/2309.01860v2)|null|\n", "2309.01728": "|**2023-09-04**|**Generative-based Fusion Mechanism for Multi-Modal Tracking**|Zhangyong Tang et.al.|[2309.01728v1](http://arxiv.org/abs/2309.01728v1)|**[link](https://github.com/zhangyong-tang/gmmt)**|\n", "2309.01516": "|**2023-09-04**|**MultiWay-Adapater: Adapting large-scale multi-modal models for scalable image-text retrieval**|Zijun Long et.al.|[2309.01516v1](http://arxiv.org/abs/2309.01516v1)|**[link](https://github.com/longkukuhi/multiway-adapter)**|\n", "2309.01420": "|**2023-09-04**|**Unified Pre-training with Pseudo Texts for Text-To-Image Person Re-identification**|Zhiyin Shao et.al.|[2309.01420v1](http://arxiv.org/abs/2309.01420v1)|**[link](https://github.com/zhiyinshao-h/unipt)**|\n", "2309.01327": "|**2023-09-04**|**Can I Trust Your Answer? Visually Grounded Video Question Answering**|Junbin Xiao et.al.|[2309.01327v1](http://arxiv.org/abs/2309.01327v1)|**[link](https://github.com/doc-doc/next-gqa)**|\n", "2309.01256": "|**2023-09-03**|**BDC-Adapter: Brownian Distance Covariance for Better Vision-Language Reasoning**|Yi Zhang et.al.|[2309.01256v1](http://arxiv.org/abs/2309.01256v1)|null|\n", "2309.01073": "|**2023-09-03**|**Spatial and Visual Perspective-Taking via View Rotation and Relation Reasoning for Embodied Reference Understanding**|Cheng Shi et.al.|[2309.01073v1](http://arxiv.org/abs/2309.01073v1)|null|\n", "2309.03177": "|**2023-09-06**|**3D Object Positioning Using Differentiable Multimodal Learning**|Sean Zanyk-McLean et.al.|[2309.03177v1](http://arxiv.org/abs/2309.03177v1)|null|\n", "2309.03147": "|**2023-09-06**|**Real-Time Non-Invasive Imaging and Detection of Spreading Depolarizations through EEG: An Ultra-Light Explainable Deep Learning Approach**|Yinzhe Wu et.al.|[2309.03147v1](http://arxiv.org/abs/2309.03147v1)|null|\n", "2309.03100": "|**2023-09-06**|**FArMARe: a Furniture-Aware Multi-task methodology for Recommending Apartments based on the user interests**|Ali Abdari et.al.|[2309.03100v1](http://arxiv.org/abs/2309.03100v1)|**[link](https://github.com/aliabdari/farmare)**|\n", "2309.02965": "|**2023-09-06**|**Dynamic Hyperbolic Attention Network for Fine Hand-object Reconstruction**|Zhiying Leng et.al.|[2309.02965v1](http://arxiv.org/abs/2309.02965v1)|null|\n", "2309.02875": "|**2023-09-06**|**MAD: Modality Agnostic Distance Measure for Image Registration**|Vasiliki Sideri-Lampretsa et.al.|[2309.02875v1](http://arxiv.org/abs/2309.02875v1)|null|\n", "2309.02702": "|**2023-09-06**|**Gene-induced Multimodal Pre-training for Image-omic Classification**|Ting Jin et.al.|[2309.02702v1](http://arxiv.org/abs/2309.02702v1)|null|\n", "2309.02616": "|**2023-09-05**|**Generative AI-aided Joint Training-free Secure Semantic Communications via Multi-modal Prompts**|Hongyang Du et.al.|[2309.02616v1](http://arxiv.org/abs/2309.02616v1)|null|\n", "2309.02591": "|**2023-09-05**|**Scaling Autoregressive Multi-Modal Models: Pretraining and Instruction Tuning**|Lili Yu et.al.|[2309.02591v1](http://arxiv.org/abs/2309.02591v1)|null|\n", "2309.03905": "|**2023-09-07**|**ImageBind-LLM: Multi-modality Instruction Tuning**|Jiaming Han et.al.|[2309.03905v1](http://arxiv.org/abs/2309.03905v1)|**[link](https://github.com/opengvlab/llama-adapter)**|\n", "2309.03869": "|**2023-09-07**|**Text-to-feature diffusion for audio-visual few-shot learning**|Otniel-Bogdan Mercea et.al.|[2309.03869v1](http://arxiv.org/abs/2309.03869v1)|**[link](https://github.com/explainableml/avdiff-gfsl)**|\n", "2309.03734": "|**2023-09-07**|**ClusterFusion: Leveraging Radar Spatial Features for Radar-Camera 3D Object Detection in Autonomous Vehicles**|Irfan Tito Kurniawan et.al.|[2309.03734v1](http://arxiv.org/abs/2309.03734v1)|null|\n", "2309.03661": "|**2023-09-07**|**Prompt-based Context- and Domain-aware Pretraining for Vision and Language Navigation**|Ting Liu et.al.|[2309.03661v1](http://arxiv.org/abs/2309.03661v1)|null|\n", "2309.03473": "|**2023-09-07**|**Temporal Collection and Distribution for Referring Video Object Segmentation**|Jiajin Tang et.al.|[2309.03473v1](http://arxiv.org/abs/2309.03473v1)|null|\n", "2309.03452": "|**2023-09-07**|**Multi-Modality Guidance Network For Missing Modality Inference**|Zhuokai Zhao et.al.|[2309.03452v1](http://arxiv.org/abs/2309.03452v1)|null|\n", "2309.04453": "|**2023-09-08**|**WiSARD: A Labeled Visual and Thermal Image Dataset for Wilderness Search and Rescue**|Daniel Broyles et.al.|[2309.04453v1](http://arxiv.org/abs/2309.04453v1)|null|\n", "2309.04399": "|**2023-09-08**|**MaskDiffusion: Boosting Text-to-Image Consistency with Conditional Mask**|Yupeng Zhou et.al.|[2309.04399v1](http://arxiv.org/abs/2309.04399v1)|null|\n", "2309.04302": "|**2023-09-08**|**Have We Ever Encountered This Before? Retrieving Out-of-Distribution Road Obstacles from Driving Scenes**|Youssef Shoeb et.al.|[2309.04302v1](http://arxiv.org/abs/2309.04302v1)|null|\n", "2309.04287": "|**2023-09-08**|**Sequential Semantic Generative Communication for Progressive Text-to-Image Generation**|Hyelin Nam et.al.|[2309.04287v1](http://arxiv.org/abs/2309.04287v1)|null|\n", "2309.04109": "|**2023-09-08**|**From Text to Mask: Localizing Entities Using the Attention of Text-to-Image Diffusion Models**|Changming Xiao et.al.|[2309.04109v1](http://arxiv.org/abs/2309.04109v1)|null|\n", "2309.04062": "|**2023-09-08**|**3D Denoisers are Good 2D Teachers: Molecular Pretraining via Denoising and Cross-Modal Distillation**|Sungjun Cho et.al.|[2309.04062v1](http://arxiv.org/abs/2309.04062v1)|null|\n", "2309.04001": "|**2023-09-07**|**Multimodal Transformer for Material Segmentation**|Md Kaykobad Reza et.al.|[2309.04001v1](http://arxiv.org/abs/2309.04001v1)|**[link](https://github.com/csiplab/mmsformer)**|\n", "2309.05644": "|**2023-09-11**|**Grid-based Hybrid 3DMA GNSS and Terrestrial Positioning**|Paul Schwarzbach et.al.|[2309.05644v1](http://arxiv.org/abs/2309.05644v1)|null|\n", "2309.05608": "|**2023-09-11**|**Incorporating Pre-trained Model Prompting in Multimodal Stock Volume Movement Prediction**|Ruibo Chen et.al.|[2309.05608v1](http://arxiv.org/abs/2309.05608v1)|**[link](https://github.com/rayruibochen/promuse)**|\n", "2309.05573": "|**2023-09-11**|**UniSeg: A Unified Multi-Modal LiDAR Segmentation Network and the OpenPCSeg Codebase**|Youquan Liu et.al.|[2309.05573v1](http://arxiv.org/abs/2309.05573v1)|**[link](https://github.com/pjlab-adg/pcseg)**|\n", "2309.05519": "|**2023-09-13**|**NExT-GPT: Any-to-Any Multimodal LLM**|Shengqiong Wu et.al.|[2309.05519v2](http://arxiv.org/abs/2309.05519v2)|**[link](https://github.com/NExT-GPT/NExT-GPT)**|\n", "2309.05503": "|**2023-09-11**|**Long-Range Transformer Architectures for Document Understanding**|Thibault Douzon et.al.|[2309.05503v1](http://arxiv.org/abs/2309.05503v1)|**[link](https://github.com/thibaultdouzon/long-range-document-transformer)**|\n", "2309.05451": "|**2023-09-11**|**Dual-view Curricular Optimal Transport for Cross-lingual Cross-modal Retrieval**|Yabing Wang et.al.|[2309.05451v1](http://arxiv.org/abs/2309.05451v1)|null|\n", "2309.05423": "|**2023-09-11**|**Multi-Modal Automatic Prosody Annotation with Contrastive Pretraining of SSWP**|Jinzuomu Zhong et.al.|[2309.05423v1](http://arxiv.org/abs/2309.05423v1)|null|\n", "2309.05396": "|**2023-09-12**|**SlideSpeech: A Large-Scale Slide-Enriched Audio-Visual Corpus**|Haoxu Wang et.al.|[2309.05396v2](http://arxiv.org/abs/2309.05396v2)|null|\n", "2309.05298": "|**2023-09-11**|**Real-Time Parallel Trajectory Optimization with Spatiotemporal Safety Constraints for Autonomous Driving in Congested Traffic**|Lei Zheng et.al.|[2309.05298v1](http://arxiv.org/abs/2309.05298v1)|null|\n", "2309.05281": "|**2023-09-11**|**Class-Incremental Grouping Network for Continual Audio-Visual Learning**|Shentong Mo et.al.|[2309.05281v1](http://arxiv.org/abs/2309.05281v1)|**[link](https://github.com/stonemo/cign)**|\n", "2309.05257": "|**2023-09-11**|**FusionFormer: A Multi-sensory Fusion in Bird's-Eye-View and Temporal Consistent Transformer for 3D Objection**|Chunyong Hu et.al.|[2309.05257v1](http://arxiv.org/abs/2309.05257v1)|null|\n", "2309.05251": "|**2023-09-11**|**Multi3DRefer: Grounding Text Description to Multiple 3D Objects**|Yiming Zhang et.al.|[2309.05251v1](http://arxiv.org/abs/2309.05251v1)|null|\n", "2309.05248": "|**2023-09-11**|**Enhancing Speaker Diarization with Large Language Models: A Contextual Beam Search Approach**|Tae Jin Park et.al.|[2309.05248v1](http://arxiv.org/abs/2309.05248v1)|null|\n", "2309.05203": "|**2023-09-11**|**From Artificially Real to Real: Leveraging Pseudo Data from Large Language Models for Low-Resource Molecule Discovery**|Yuhan Chen et.al.|[2309.05203v1](http://arxiv.org/abs/2309.05203v1)|null|\n", "2309.05090": "|**2023-09-10**|**Sculpting Efficiency: Pruning Medical Imaging Models for On-Device Inference**|Sudarshan Sreeram et.al.|[2309.05090v1](http://arxiv.org/abs/2309.05090v1)|null|\n", "2309.06262": "|**2023-09-12**|**Modality Unifying Network for Visible-Infrared Person Re-Identification**|Hao Yu et.al.|[2309.06262v1](http://arxiv.org/abs/2309.06262v1)|null|\n", "2309.06255": "|**2023-09-12**|**Enhancing Multi-modal Cooperation via Fine-grained Modality Valuation**|Yake Wei et.al.|[2309.06255v1](http://arxiv.org/abs/2309.06255v1)|null|\n", "2309.06176": "|**2023-09-12**|**Dual-Path Temporal Map Optimization for Make-up Temporal Video Grounding**|Jiaxiu Li et.al.|[2309.06176v1](http://arxiv.org/abs/2309.06176v1)|null|\n", "2309.06102": "|**2023-09-12**|**Can we predict the Most Replayed data of video streaming platforms?**|Alessandro Duico et.al.|[2309.06102v1](http://arxiv.org/abs/2309.06102v1)|**[link](https://github.com/ombretta/most-replayed-data)**|\n", "2309.06081": "|**2023-09-12**|**Information Flow in Graph Neural Networks: A Clinical Triage Use Case**|V\u00edctor Valls et.al.|[2309.06081v1](http://arxiv.org/abs/2309.06081v1)|null|\n", "2309.05904": "|**2023-09-12**|**Enhancing Representation in Radiography-Reports Foundation Model: A Granular Alignment Algorithm Using Masked Contrastive Learning**|Weijian Huang et.al.|[2309.05904v1](http://arxiv.org/abs/2309.05904v1)|null|\n", "2309.05818": "|**2023-09-11**|**Rice Plant Disease Detection and Diagnosis using Deep Convolutional Neural Networks and Multispectral Imaging**|Yara Ali Alnaggar et.al.|[2309.05818v1](http://arxiv.org/abs/2309.05818v1)|null|\n", "2309.05803": "|**2023-09-11**|**Revisiting Energy Based Models as Policies: Ranking Noise Contrastive Estimation and Interpolating Energy Models**|Sumeet Singh et.al.|[2309.05803v1](http://arxiv.org/abs/2309.05803v1)|null|\n", "2309.05756": "|**2023-09-11**|**TransferDoc: A Self-Supervised Transferable Document Representation Learning Model Unifying Vision and Language**|Souhail Bakkali et.al.|[2309.05756v1](http://arxiv.org/abs/2309.05756v1)|null|\n", "2309.07120": "|**2023-09-13**|**Sight Beyond Text: Multi-Modal Training Enhances LLMs in Truthfulness and Ethics**|Haoqin Tu et.al.|[2309.07120v1](http://arxiv.org/abs/2309.07120v1)|**[link](https://github.com/ucsc-vlaa/sight-beyond-text)**|\n", "2309.07066": "|**2023-09-13**|**CLiFF-LHMP: Using Spatial Dynamics Patterns for Long-Term Human Motion Prediction**|Yufei Zhu et.al.|[2309.07066v1](http://arxiv.org/abs/2309.07066v1)|null|\n", "2309.06799": "|**2023-09-13**|**When Geoscience Meets Foundation Models: Towards General Geoscience Artificial Intelligence System**|Hao Zhang et.al.|[2309.06799v1](http://arxiv.org/abs/2309.06799v1)|null|\n", "2309.06735": "|**2023-09-13**|**GelFlow: Self-supervised Learning of Optical Flow for Vision-Based Tactile Sensor Displacement Measurement**|Zhiyuan Zhang et.al.|[2309.06735v1](http://arxiv.org/abs/2309.06735v1)|null|\n", "2309.06728": "|**2023-09-13**|**Leveraging Foundation models for Unsupervised Audio-Visual Segmentation**|Swapnil Bhosale et.al.|[2309.06728v1](http://arxiv.org/abs/2309.06728v1)|null|\n", "2309.06599": "|**2023-09-12**|**Reasoning with Latent Diffusion in Offline Reinforcement Learning**|Siddarth Venkatraman et.al.|[2309.06599v1](http://arxiv.org/abs/2309.06599v1)|**[link](https://github.com/ldcq/ldcq)**|\n", "2309.06597": "|**2023-09-12**|**Rank2Tell: A Multimodal Driving Dataset for Joint Importance Ranking and Reasoning**|Enna Sachdeva et.al.|[2309.06597v1](http://arxiv.org/abs/2309.06597v1)|null|\n", "2309.06547": "|**2023-09-12**|**AmodalSynthDrive: A Synthetic Amodal Perception Dataset for Autonomous Driving**|Ahmed Rida Sekkat et.al.|[2309.06547v1](http://arxiv.org/abs/2309.06547v1)|null|\n", "2309.06517": "|**2023-09-12**|**Overview of Memotion 3: Sentiment and Emotion Analysis of Codemixed Hinglish Memes**|Shreyash Mishra et.al.|[2309.06517v1](http://arxiv.org/abs/2309.06517v1)|null|\n", "2309.06511": "|**2023-09-12**|**DF-TransFusion: Multimodal Deepfake Detection via Lip-Audio Cross-Attention and Facial Self-Attention**|Aaditya Kharel et.al.|[2309.06511v1](http://arxiv.org/abs/2309.06511v1)|null|\n", "2309.07915": "|**2023-09-14**|**MMICL: Empowering Vision-language Model with Multi-Modal In-Context Learning**|Haozhe Zhao et.al.|[2309.07915v1](http://arxiv.org/abs/2309.07915v1)|**[link](https://github.com/haozhezhao/mic)**|\n", "2309.07794": "|**2023-09-14**|**Improving Multimodal Classification of Social Media Posts by Leveraging Image-Text Auxiliary tasks**|Danae S\u00e1nchez Villegas et.al.|[2309.07794v1](http://arxiv.org/abs/2309.07794v1)|null|\n", "2309.07759": "|**2023-09-14**|**PROGrasp: Pragmatic Human-Robot Communication for Object Grasping**|Gi-Cheon Kang et.al.|[2309.07759v1](http://arxiv.org/abs/2309.07759v1)|null|\n", "2309.07623": "|**2023-09-14**|**SwitchGPT: Adapting Large Language Models for Non-Text Outputs**|Xinyu Wang et.al.|[2309.07623v1](http://arxiv.org/abs/2309.07623v1)|null|\n", "2309.07495": "|**2023-09-14**|**HDTR-Net: A Real-Time High-Definition Teeth Restoration Network for Arbitrary Talking Face Generation Methods**|Yongyuan Li et.al.|[2309.07495v1](http://arxiv.org/abs/2309.07495v1)|**[link](https://github.com/yylgoodlucky/hdtr)**|\n", "2309.07387": "|**2023-09-14**|**VDialogUE: A Unified Evaluation Benchmark for Visually-grounded Dialogue**|Yunshui Li et.al.|[2309.07387v1](http://arxiv.org/abs/2309.07387v1)|null|\n", "2309.07332": "|**2023-09-13**|**Reliability-based cleaning of noisy training labels with inductive conformal prediction in multi-modal biomedical data mining**|Xianghao Zhan et.al.|[2309.07332v1](http://arxiv.org/abs/2309.07332v1)|**[link](https://github.com/xzhan96-stf/icp_train_clean)**|\n", "2309.07297": "|**2023-09-13**|**Multi-Modal Hybrid Learning and Sequential Training for RGB-T Saliency Detection**|Guangyu Ren et.al.|[2309.07297v1](http://arxiv.org/abs/2309.07297v1)|null|\n", "2309.08531": "|**2023-09-15**|**Towards Practical and Efficient Image-to-Speech Captioning with Vision-Language Pre-training and Multi-modal Tokens**|Minsu Kim et.al.|[2309.08531v1](http://arxiv.org/abs/2309.08531v1)|null|\n", "2309.08508": "|**2023-09-15**|**MOSAIC: Learning Unified Multi-Sensory Object Property Representations for Robot Perception**|Gyan Tatiya et.al.|[2309.08508v1](http://arxiv.org/abs/2309.08508v1)|**[link](https://github.com/gtatiya/MOSAIC)**|\n", "2309.08229": "|**2023-09-15**|**Automated Multi-Drugs Administration During Total Intravenous Anesthesia Using Multi-Model Predictive Control**|Bob Aubouin-Pairault et.al.|[2309.08229v1](http://arxiv.org/abs/2309.08229v1)|**[link](https://github.com/bobaubouin/tiva_drug_control)**|\n", "2309.08204": "|**2023-09-15**|**One-stage Modality Distillation for Incomplete Multimodal Learning**|Shicai Wei et.al.|[2309.08204v1](http://arxiv.org/abs/2309.08204v1)|null|\n", "2309.08160": "|**2023-09-15**|**Cross-Modal Synthesis of Structural MRI and Functional Connectivity Networks via Conditional ViT-GANs**|Yuda Bi et.al.|[2309.08160v1](http://arxiv.org/abs/2309.08160v1)|null|\n", "2309.08154": "|**2023-09-15**|**Uncertainty-Aware Multi-View Visual Semantic Embedding**|Wenzhang Wei et.al.|[2309.08154v1](http://arxiv.org/abs/2309.08154v1)|null|\n", "2309.08096": "|**2023-09-15**|**GelSplitter: Tactile Reconstruction from Near Infrared and Visible Images**|Yuankai Lin et.al.|[2309.08096v1](http://arxiv.org/abs/2309.08096v1)|null|\n", "2309.08088": "|**2023-09-15**|**Interactive Model Fusion-Based GM-PHD Filter**|Jiacheng He et.al.|[2309.08088v1](http://arxiv.org/abs/2309.08088v1)|null|\n", "2309.08021": "|**2023-09-14**|**Vision-based Analysis of Driver Activity and Driving Performance Under the Influence of Alcohol**|Ross Greer et.al.|[2309.08021v1](http://arxiv.org/abs/2309.08021v1)|null|\n", "2309.09958": "|**2023-09-18**|**An Empirical Study of Scaling Instruct-Tuned Large Multimodal Models**|Yadong Lu et.al.|[2309.09958v1](http://arxiv.org/abs/2309.09958v1)|**[link](https://github.com/haotian-liu/LLaVA)**|\n", "2309.09875": "|**2023-09-18**|**RaLF: Flow-based Global and Metric Radar Localization in LiDAR Maps**|Abhijeet Nayak et.al.|[2309.09875v1](http://arxiv.org/abs/2309.09875v1)|null|\n", "2309.09867": "|**2023-09-18**|**EGFE: End-to-end Grouping of Fragmented Elements in UI Designs with Multimodal Learning**|Liuqing Chen et.al.|[2309.09867v1](http://arxiv.org/abs/2309.09867v1)|**[link](https://github.com/test2975/egfe)**|\n", "2309.09832": "|**2023-09-18**|**Task Selection and Assignment for Multi-modal Multi-task Dialogue Act Classification with Non-stationary Multi-armed Bandits**|Xiangheng He et.al.|[2309.09832v1](http://arxiv.org/abs/2309.09832v1)|null|\n", "2309.09667": "|**2023-09-18**|**Unified Frequency-Assisted Transformer Framework for Detecting and Grounding Multi-Modal Manipulation**|Huan Liu et.al.|[2309.09667v1](http://arxiv.org/abs/2309.09667v1)|null|\n", "2309.09646": "|**2023-09-18**|**Concurrent Haptic, Audio, and Visual Data Set During Bare Finger Interaction with Textured Surfaces**|Alexis W. M. Devillard et.al.|[2309.09646v1](http://arxiv.org/abs/2309.09646v1)|null|\n", "2309.09592": "|**2023-09-18**|**Multi-Semantic Fusion Model for Generalized Zero-Shot Skeleton-Based Action Recognition**|Ming-Zhe Li et.al.|[2309.09592v1](http://arxiv.org/abs/2309.09592v1)|**[link](https://github.com/EHZ9NIWI7/MSF-GZSSAR)**|\n", "2309.09513": "|**2023-09-18**|**Learning Parallax for Stereo Event-based Motion Deblurring**|Mingyuan Lin et.al.|[2309.09513v1](http://arxiv.org/abs/2309.09513v1)|null|\n", "2309.09501": "|**2023-09-18**|**Discovering Sounding Objects by Audio Queries for Audio Visual Segmentation**|Shaofei Huang et.al.|[2309.09501v1](http://arxiv.org/abs/2309.09501v1)|null|\n", "2309.09473": "|**2023-09-18**|**Self-supervised Multi-view Clustering in Computer Vision: A Survey**|Jiatai Wang et.al.|[2309.09473v1](http://arxiv.org/abs/2309.09473v1)|null|\n", "2309.09421": "|**2023-09-18**|**Unified Pretraining Target Based Video-music Retrieval With Music Rhythm And Video Optical Flow Information**|Tianjun Mao et.al.|[2309.09421v1](http://arxiv.org/abs/2309.09421v1)|null|\n", "2309.09246": "|**2023-09-17**|**Image-level supervision and self-training for transformer-based cross-modality tumor segmentation**|Malo de Boisredon et.al.|[2309.09246v1](http://arxiv.org/abs/2309.09246v1)|null|\n", "2309.09088": "|**2023-09-16**|**Enhancing GAN-Based Vocoders with Contrastive Learning Under Data-limited Condition**|Haoming Guo et.al.|[2309.09088v1](http://arxiv.org/abs/2309.09088v1)|null|\n", "2309.09067": "|**2023-09-19**|**MMST-ViT: Climate Change-aware Crop Yield Prediction via Multi-Modal Spatial-Temporal Vision Transformer**|Fudong Lin et.al.|[2309.09067v2](http://arxiv.org/abs/2309.09067v2)|**[link](https://github.com/fudong03/mmst-vit)**|\n", "2309.08966": "|**2023-09-16**|**FF-LOGO: Cross-Modality Point Cloud Registration with Feature Filtering and Local to Global Optimization**|Nan Ma et.al.|[2309.08966v1](http://arxiv.org/abs/2309.08966v1)|null|\n", "2309.10724": "|**2023-09-19**|**Sound Source Localization is All about Cross-Modal Alignment**|Arda Senocak et.al.|[2309.10724v1](http://arxiv.org/abs/2309.10724v1)|null|\n", "2309.10649": "|**2023-09-19**|**Cross-modal and Cross-domain Knowledge Transfer for Label-free 3D Segmentation**|Jingyu Zhang et.al.|[2309.10649v1](http://arxiv.org/abs/2309.10649v1)|null|\n", "2309.10606": "|**2023-09-19**|**A Novel Hybrid Algorithm for Optimized Solutions in Ocean Renewable Energy Industry: Enhancing Power Take-Off Parameters and Site Selection Procedure of Wave Energy Converters**|Hossein Mehdipour et.al.|[2309.10606v1](http://arxiv.org/abs/2309.10606v1)|null|\n", "2309.10537": "|**2023-09-19**|**FoleyGen: Visually-Guided Audio Generation**|Xinhao Mei et.al.|[2309.10537v1](http://arxiv.org/abs/2309.10537v1)|null|\n", "2309.10365": "|**2023-09-19**|**Testable Likelihoods for Beyond-the-Standard Model Fits**|Anja Beck et.al.|[2309.10365v1](http://arxiv.org/abs/2309.10365v1)|null|\n", "2309.10361": "|**2023-09-19**|**Improving CLIP Robustness with Knowledge Distillation and Self-Training**|Clement Laroudie et.al.|[2309.10361v1](http://arxiv.org/abs/2309.10361v1)|null|\n", "2309.10283": "|**2023-09-19**|**FRAMU: Attention-based Machine Unlearning using Federated Reinforcement Learning**|Thanveer Shaik et.al.|[2309.10283v1](http://arxiv.org/abs/2309.10283v1)|null|\n", "2309.10244": "|**2023-09-19**|**UPL-SFDA: Uncertainty-aware Pseudo Label Guided Source-Free Domain Adaptation for Medical Image Segmentation**|Jianghao Wu et.al.|[2309.10244v1](http://arxiv.org/abs/2309.10244v1)|**[link](https://github.com/hilab-git/upl-sfda)**|\n", "2309.10195": "|**2023-09-20**|**Multi-modality Meets Re-learning: Mitigating Negative Transfer in Sequential Recommendation**|Bo Peng et.al.|[2309.10195v2](http://arxiv.org/abs/2309.10195v2)|null|\n", "2309.10091": "|**2023-09-18**|**Unified Coarse-to-Fine Alignment for Video-Text Retrieval**|Ziyang Wang et.al.|[2309.10091v1](http://arxiv.org/abs/2309.10091v1)|**[link](https://github.com/ziyang412/ucofia)**|\n", "2309.10077": "|**2023-09-18**|**GAME: Generalized deep learning model towards multimodal data integration for early screening of adolescent mental disorders**|Zhicheng Du et.al.|[2309.10077v1](http://arxiv.org/abs/2309.10077v1)|null|\n", "2309.11335": "|**2023-09-20**|**2D-3D Pose Tracking with Multi-View Constraints**|Huai Yu et.al.|[2309.11335v1](http://arxiv.org/abs/2309.11335v1)|null|\n", "2309.11119": "|**2023-09-21**|**BroadBEV: Collaborative LiDAR-camera Fusion for Broad-sighted Bird's Eye View Map Construction**|Minsu Kim et.al.|[2309.11119v2](http://arxiv.org/abs/2309.11119v2)|null|\n", "2309.11082": "|**2023-09-20**|**Dual-Modal Attention-Enhanced Text-Video Retrieval with Triplet Partial Margin Contrastive Learning**|Chen Jiang et.al.|[2309.11082v1](http://arxiv.org/abs/2309.11082v1)|null|\n", "2309.11081": "|**2023-09-20**|**Dense 2D-3D Indoor Prediction with Sound via Aligned Cross-Modal Distillation**|Heeseung Yun et.al.|[2309.11081v1](http://arxiv.org/abs/2309.11081v1)|**[link](https://github.com/hs-yn/daps)**|\n", "2309.12314": "|**2023-09-21**|**TinyCLIP: CLIP Distillation via Affinity Mimicking and Weight Inheritance**|Kan Wu et.al.|[2309.12314v1](http://arxiv.org/abs/2309.12314v1)|**[link](https://github.com/microsoft/Cream/tree/main/TinyCLIP)**|\n", "2309.12224": "|**2023-09-21**|**Towards Answering Health-related Questions from Medical Videos: Datasets and Approaches**|Deepak Gupta et.al.|[2309.12224v1](http://arxiv.org/abs/2309.12224v1)|null|\n", "2309.12158": "|**2023-09-21**|**Towards Robust and Truly Large-Scale Audio-Sheet Music Retrieval**|Luis Carvalho et.al.|[2309.12158v1](http://arxiv.org/abs/2309.12158v1)|null|\n", "2309.12134": "|**2023-09-21**|**Self-Supervised Contrastive Learning for Robust Audio-Sheet Music Retrieval Systems**|Luis Carvalho et.al.|[2309.12134v1](http://arxiv.org/abs/2309.12134v1)|null|\n", "2309.12111": "|**2023-09-21**|**Passage Summarization with Recurrent Models for Audio-Sheet Music Retrieval**|Luis Carvalho et.al.|[2309.12111v1](http://arxiv.org/abs/2309.12111v1)|null|\n", "2309.12110": "|**2023-09-21**|**Exploiting CLIP-based Multi-modal Approach for Artwork Classification and Retrieval**|Alberto Baldrati et.al.|[2309.12110v1](http://arxiv.org/abs/2309.12110v1)|null|\n", "2309.12030": "|**2023-09-21**|**CAMERA: A Multimodal Dataset and Benchmark for Ad Text Generation**|Masato Mita et.al.|[2309.12030v1](http://arxiv.org/abs/2309.12030v1)|**[link](https://github.com/cyberagentailab/camera)**|\n", "2309.12009": "|**2023-09-21**|**Elevating Skeleton-Based Action Recognition with Efficient Multi-Modality Self-Supervision**|Yiping Wei et.al.|[2309.12009v1](http://arxiv.org/abs/2309.12009v1)|**[link](https://github.com/desehuileng0o0/ikem)**|\n", "2309.11933": "|**2023-09-21**|**Fully Transformer-Equipped Architecture for End-to-End Referring Video Object Segmentation**|Ping Li et.al.|[2309.11933v1](http://arxiv.org/abs/2309.11933v1)|null|\n", "2309.11923": "|**2023-09-21**|**TextCLIP: Text-Guided Face Image Generation And Manipulation Without Adversarial Training**|Xiaozhou You et.al.|[2309.11923v1](http://arxiv.org/abs/2309.11923v1)|null|\n", "2309.11860": "|**2023-09-21**|**QUEST: An Efficient Query Evaluation Scheme Towards Scan-Intensive Cross-Model Analysis**|Jianfeng Huang et.al.|[2309.11860v1](http://arxiv.org/abs/2309.11860v1)|null|\n", "2309.11845": "|**2023-09-21**|**TMac: Temporal Multi-Modal Graph Learning for Acoustic Event Classification**|Meng Liu et.al.|[2309.11845v1](http://arxiv.org/abs/2309.11845v1)|**[link](https://github.com/mgithubl/tmac)**|\n", "2309.11839": "|**2023-09-21**|**MoPA: Multi-Modal Prior Aided Domain Adaptation for 3D Semantic Segmentation**|Haozhi Cao et.al.|[2309.11839v1](http://arxiv.org/abs/2309.11839v1)|null|\n", "2309.11837": "|**2023-09-21**|**Stellar model calibrations with the Ai Phe binary system. Open questions about the robustness of the fit**|G. Valle et.al.|[2309.11837v1](http://arxiv.org/abs/2309.11837v1)|null|\n", "2309.11755": "|**2023-09-21**|**2DDATA: 2D Detection Annotations Transmittable Aggregation for Semantic Segmentation on Point Cloud**|Guan-Cheng Lee et.al.|[2309.11755v1](http://arxiv.org/abs/2309.11755v1)|null|\n", "2309.13007": "|**2023-09-22**|**ReConcile: Round-Table Conference Improves Reasoning via Consensus among Diverse LLMs**|Justin Chih-Yao Chen et.al.|[2309.13007v1](http://arxiv.org/abs/2309.13007v1)|**[link](https://github.com/dinobby/reconcile)**|\n", "2309.12865": "|**2023-09-22**|**Bridging Sensor Gaps via Single-Direction Tuning for Hyperspectral Image Classification**|Xizhe Xue et.al.|[2309.12865v1](http://arxiv.org/abs/2309.12865v1)|**[link](https://github.com/cecilia-xue/hyt-nas)**|\n", "2309.12855": "|**2023-09-22**|**Cross-Modal Translation and Alignment for Survival Analysis**|Fengtao Zhou et.al.|[2309.12855v1](http://arxiv.org/abs/2309.12855v1)|**[link](https://github.com/ft-zhou-zzz/cmta)**|\n", "2309.12764": "|**2023-09-22**|**Multi-Modal Embeddings for Isolating Cross-Platform Coordinated Information Campaigns on Social Media**|Fabio Barbero et.al.|[2309.12764v1](http://arxiv.org/abs/2309.12764v1)|null|\n", "2309.12657": "|**2023-09-22**|**Exploiting Modality-Specific Features For Multi-Modal Manipulation Detection And Grounding**|Jiazhen Wang et.al.|[2309.12657v1](http://arxiv.org/abs/2309.12657v1)|null|\n", "2309.12572": "|**2023-09-22**|**Interpretable 3D Multi-Modal Residual Convolutional Neural Network for Mild Traumatic Brain Injury Diagnosis**|Hanem Ellethy et.al.|[2309.12572v1](http://arxiv.org/abs/2309.12572v1)|null|\n", "2309.14327": "|**2023-09-25**|**DeepSpeed-VisualChat: Multi-Round Multi-Image Interleave Chat via Multi-Modal Causal Attention**|Zhewei Yao et.al.|[2309.14327v1](http://arxiv.org/abs/2309.14327v1)|**[link](https://github.com/microsoft/deepspeedexamples)**|\n", "2309.14320": "|**2023-09-25**|**MUTEX: Learning Unified Policies from Multimodal Task Specifications**|Rutav Shah et.al.|[2309.14320v1](http://arxiv.org/abs/2309.14320v1)|null|\n", "2309.14203": "|**2023-09-25**|**Detecting and Grounding Multi-Modal Media Manipulation and Beyond**|Rui Shao et.al.|[2309.14203v1](http://arxiv.org/abs/2309.14203v1)|**[link](https://github.com/rshaojimmy/multimodal-deepfake)**|\n", "2309.14183": "|**2023-09-26**|**Species196: A One-Million Semi-supervised Dataset for Fine-grained Species Recognition**|Wei He et.al.|[2309.14183v2](http://arxiv.org/abs/2309.14183v2)|**[link](https://github.com/Species-Dataset/species-dataset.github.io)**|\n", "2309.14181": "|**2023-09-25**|**Q-Bench: A Benchmark for General-Purpose Foundation Models on Low-level Vision**|Haoning Wu et.al.|[2309.14181v1](http://arxiv.org/abs/2309.14181v1)|**[link](https://github.com/VQAssessment/Q-Bench)**|\n", "2309.14065": "|**2023-09-26**|**AsymFormer: Asymmetrical Cross-Modal Representation Learning for Mobile Platform Real-Time RGB-D Semantic Segmentation**|Siqi Du et.al.|[2309.14065v2](http://arxiv.org/abs/2309.14065v2)|**[link](https://github.com/Fourier7754/AsymFormer)**|\n", "2309.14050": "|**2023-09-26**|**NNgTL: Neural Network Guided Optimal Temporal Logic Task Planning for Mobile Robots**|Ruijia Liu et.al.|[2309.14050v2](http://arxiv.org/abs/2309.14050v2)|null|\n", "2309.14003": "|**2023-09-25**|**Hierarchical Imitation Learning for Stochastic Environments**|Maximilian Igl et.al.|[2309.14003v1](http://arxiv.org/abs/2309.14003v1)|null|\n", "2309.13770": "|**2023-09-24**|**Devil in the Number: Towards Robust Multi-modality Data Filter**|Yichen Xu et.al.|[2309.13770v1](http://arxiv.org/abs/2309.13770v1)|null|\n", "2309.13650": "|**2023-09-24**|**Cross-modal Alignment with Optimal Transport for CTC-based ASR**|Xugang Lu et.al.|[2309.13650v1](http://arxiv.org/abs/2309.13650v1)|null|\n", "2309.13554": "|**2023-09-24**|**A Novel Stochastic Interacting Particle-Field Algorithm for 3D Parabolic-Parabolic Keller-Segel Chemotaxis System**|Zhongjian Wang et.al.|[2309.13554v1](http://arxiv.org/abs/2309.13554v1)|null|\n", "2309.13504": "|**2023-09-23**|**Attention Is All You Need For Blind Room Volume Estimation**|Chunxi Wang et.al.|[2309.13504v1](http://arxiv.org/abs/2309.13504v1)|null|\n", "2309.13470": "|**2023-09-23**|**HAVE-Net: Hallucinated Audio-Visual Embeddings for Few-Shot Classification with Unimodal Cues**|Ankit Jha et.al.|[2309.13470v1](http://arxiv.org/abs/2309.13470v1)|null|\n", "2309.13322": "|**2023-09-23**|**From Text to Source: Results in Detecting Large Language Model-Generated Content**|Wissam Antoun et.al.|[2309.13322v1](http://arxiv.org/abs/2309.13322v1)|null|\n", "2309.13266": "|**2023-09-23**|**Robust Navigation with Cross-Modal Fusion and Knowledge Transfer**|Wenzhe Cai et.al.|[2309.13266v1](http://arxiv.org/abs/2309.13266v1)|**[link](https://github.com/wzcai99/Distill-Navigator)**|\n", "2309.15117": "|**2023-09-26**|**Generating Visual Scenes from Touch**|Fengyu Yang et.al.|[2309.15117v1](http://arxiv.org/abs/2309.15117v1)|null|\n", "2309.15112": "|**2023-09-27**|**InternLM-XComposer: A Vision-Language Large Model for Advanced Text-image Comprehension and Composition**|Pan Zhang et.al.|[2309.15112v2](http://arxiv.org/abs/2309.15112v2)|**[link](https://github.com/internlm/internlm-xcomposer)**|\n", "2309.15109": "|**2023-09-26**|**DistillBEV: Boosting Multi-Camera 3D Object Detection with Cross-Modal Knowledge Distillation**|Zeyu Wang et.al.|[2309.15109v1](http://arxiv.org/abs/2309.15109v1)|**[link](https://github.com/qcraftai/distill-bev)**|\n", "2309.15082": "|**2023-09-26**|**RPEFlow: Multimodal Fusion of RGB-PointCloud-Event for Joint Optical Flow and Scene Flow Estimation**|Zhexiong Wan et.al.|[2309.15082v1](http://arxiv.org/abs/2309.15082v1)|**[link](https://github.com/danqu130/RPEFlow)**|\n", "2309.14704": "|**2023-09-26**|**Tile Classification Based Viewport Prediction with Multi-modal Fusion Transformer**|Zhihao Zhang et.al.|[2309.14704v1](http://arxiv.org/abs/2309.14704v1)|null|\n", "2309.14673": "|**2023-09-26**|**ALEX: Towards Effective Graph Transfer Learning with Noisy Labels**|Jingyang Yuan et.al.|[2309.14673v1](http://arxiv.org/abs/2309.14673v1)|null|\n", "2309.14611": "|**2023-09-26**|**Event Stream-based Visual Object Tracking: A High-Resolution Benchmark Dataset and A Novel Baseline**|Xiao Wang et.al.|[2309.14611v1](http://arxiv.org/abs/2309.14611v1)|**[link](https://github.com/event-ahu/eventvot_benchmark)**|\n", "2309.14580": "|**2023-09-26**|**CWCL: Cross-Modal Transfer with Continuously Weighted Contrastive Loss**|Rakshith Sharma Srinivasa et.al.|[2309.14580v1](http://arxiv.org/abs/2309.14580v1)|null|\n", "2309.14516": "|**2023-09-25**|**UniBEV: Multi-modal 3D Object Detection with Uniform BEV Encoders for Robustness against Missing Sensor Modalities**|Shiming Wang et.al.|[2309.14516v1](http://arxiv.org/abs/2309.14516v1)|null|\n", "2309.14491": "|**2023-09-25**|**Unsupervised 3D Perception with 2D Vision-Language Distillation for Autonomous Driving**|Mahyar Najibi et.al.|[2309.14491v1](http://arxiv.org/abs/2309.14491v1)|null|\n", "2309.15826": "|**2023-09-27**|**Cross-Modal Multi-Tasking for Speech-to-Text Translation via Hard Parameter Sharing**|Brian Yan et.al.|[2309.15826v1](http://arxiv.org/abs/2309.15826v1)|null|\n", "2309.15751": "|**2023-09-27**|**InfraParis: A multi-modal and multi-task autonomous driving dataset**|Gianni Franchi et.al.|[2309.15751v1](http://arxiv.org/abs/2309.15751v1)|null|\n", "2309.15739": "|**2023-09-27**|**Experience and Evidence are the eyes of an excellent summarizer! Towards Knowledge Infused Multi-modal Clinical Conversation Summarization**|Abhisek Tiwari et.al.|[2309.15739v1](http://arxiv.org/abs/2309.15739v1)|**[link](https://github.com/nlp-rl/mm-cliconsummation)**|\n", "2309.15683": "|**2023-09-27**|**End-to-End Streaming Video Temporal Action Segmentation with Reinforce Learning**|Wujun Wen et.al.|[2309.15683v1](http://arxiv.org/abs/2309.15683v1)|**[link](https://github.com/Thinksky5124/SVTAS)**|\n", "2309.15599": "|**2023-09-27**|**OceanBench: The Sea Surface Height Edition**|J. Emmanuel Johnson et.al.|[2309.15599v1](http://arxiv.org/abs/2309.15599v1)|**[link](https://github.com/jejjohnson/oceanbench)**|\n", "2309.15529": "|**2023-09-27**|**Missing-modality Enabled Multi-modal Fusion Architecture for Medical Data**|Muyu Wang et.al.|[2309.15529v1](http://arxiv.org/abs/2309.15529v1)|null|\n", "2309.15427": "|**2023-09-27**|**Graph Neural Prompting with Large Language Models**|Yijun Tian et.al.|[2309.15427v1](http://arxiv.org/abs/2309.15427v1)|null|\n", "2309.15402": "|**2023-09-27**|**A Survey of Chain of Thought Reasoning: Advances, Frontiers and Future**|Zheng Chu et.al.|[2309.15402v1](http://arxiv.org/abs/2309.15402v1)|**[link](https://github.com/zchuz/cot-reasoning-survey)**|\n", "2309.15390": "|**2023-09-27**|**MINS: Efficient and Robust Multisensor-aided Inertial Navigation System**|Woosik Lee et.al.|[2309.15390v1](http://arxiv.org/abs/2309.15390v1)|**[link](https://github.com/rpng/mins)**|\n", "2309.15313": "|**2023-09-26**|**M$^{3}$3D: Learning 3D priors using Multi-Modal Masked Autoencoders for 2D image and video understanding**|Muhammad Abdullah Jamal et.al.|[2309.15313v1](http://arxiv.org/abs/2309.15313v1)|null|\n", "2309.15302": "|**2023-09-26**|**Self-Supervised Terrain Representation Learning from Unconstrained Robot Experience**|Haresh Karnan et.al.|[2309.15302v1](http://arxiv.org/abs/2309.15302v1)|null|\n", "2309.15283": "|**2023-09-26**|**Multi-Modal Planning on Regrasping for Stable Manipulation**|Jiaming Hu et.al.|[2309.15283v1](http://arxiv.org/abs/2309.15283v1)|null|\n", "2309.16592": "|**2023-09-28**|**Tensor Factorization for Leveraging Cross-Modal Knowledge in Data-Constrained Infrared Object Detection**|Manish Sharma et.al.|[2309.16592v1](http://arxiv.org/abs/2309.16592v1)|null|\n", "2309.16569": "|**2023-09-28**|**Audio-Visual Speaker Verification via Joint Cross-Attention**|R. Gnana Praveen et.al.|[2309.16569v1](http://arxiv.org/abs/2309.16569v1)|null|\n", "2309.16283": "|**2023-09-28**|**Self-supervised Cross-view Representation Reconstruction for Change Captioning**|Yunbin Tu et.al.|[2309.16283v1](http://arxiv.org/abs/2309.16283v1)|null|\n", "2309.16211": "|**2023-09-28**|**VDC: Versatile Data Cleanser for Detecting Dirty Samples via Visual-Linguistic Inconsistency**|Zihao Zhu et.al.|[2309.16211v1](http://arxiv.org/abs/2309.16211v1)|null|\n", "2309.16206": "|**2023-09-28**|**Cross-Modal Transformer GAN: Brain Structural-Functional Deep Fusing Network for Alzheimer's Disease Analysis**|Qiankun Zuo et.al.|[2309.16206v1](http://arxiv.org/abs/2309.16206v1)|null|\n", "2309.16203": "|**2023-09-28**|**The Cloud Strikes Back: Investigating the Decentralization of IPFS**|Leonhard Balduf et.al.|[2309.16203v1](http://arxiv.org/abs/2309.16203v1)|null|\n", "2309.16141": "|**2023-09-28**|**Align before Search: Aligning Ads Image to Text for Accurate Cross-Modal Sponsored Search**|Yuanmin Tang et.al.|[2309.16141v1](http://arxiv.org/abs/2309.16141v1)|**[link](https://github.com/pter61/aligncmss)**|\n", "2309.16093": "|**2023-09-28**|**Hierarchical Cross-Modality Knowledge Transfer with Sinkhorn Attention for CTC-based ASR**|Xugang Lu et.al.|[2309.16093v1](http://arxiv.org/abs/2309.16093v1)|null|\n", "2309.15954": "|**2023-09-27**|**The Devil is in the Details: A Deep Dive into the Rabbit Hole of Data Filtering**|Haichao Yu et.al.|[2309.15954v1](http://arxiv.org/abs/2309.15954v1)|null|\n", "2309.15915": "|**2023-09-27**|**Zero-Shot and Few-Shot Video Question Answering with Multi-Modal Prompts**|Deniz Engin et.al.|[2309.15915v1](http://arxiv.org/abs/2309.15915v1)|**[link](https://github.com/engindeniz/vitis)**|\n", "2309.17395": "|**2023-09-29**|**AV-CPL: Continuous Pseudo-Labeling for Audio-Visual Speech Recognition**|Andrew Rouditchenko et.al.|[2309.17395v1](http://arxiv.org/abs/2309.17395v1)|null|\n", "2309.17336": "|**2023-09-29**|**See Beyond Seeing: Robust 3D Object Detection from Point Clouds via Cross-Modal Hallucination**|Jianning Deng et.al.|[2309.17336v1](http://arxiv.org/abs/2309.17336v1)|null|\n", "2309.17264": "|**2023-09-29**|**A Foundation Model for General Moving Object Segmentation in Medical Images**|Zhongnuo Yan et.al.|[2309.17264v1](http://arxiv.org/abs/2309.17264v1)|null|\n", "2309.17239": "|**2023-09-29**|**EGVD: Event-Guided Video Deraining**|Yueyi Zhang et.al.|[2309.17239v1](http://arxiv.org/abs/2309.17239v1)|**[link](https://github.com/booker-max/egvd)**|\n", "2309.17175": "|**2023-09-29**|**TextField3D: Towards Enhancing Open-Vocabulary 3D Generation with Noisy Text Fields**|Tianyu Huang et.al.|[2309.17175v1](http://arxiv.org/abs/2309.17175v1)|null|\n", "2309.17133": "|**2023-09-29**|**Fine-grained Late-interaction Multi-modal Retrieval for Retrieval Augmented Visual Question Answering**|Weizhe Lin et.al.|[2309.17133v1](http://arxiv.org/abs/2309.17133v1)|**[link](https://github.com/linweizhedragon/retrieval-augmented-visual-question-answering)**|\n", "2309.17104": "|**2023-10-03**|**Prototype-guided Cross-modal Completion and Alignment for Incomplete Text-based Person Re-identification**|Tiantian Gong et.al.|[2309.17104v2](http://arxiv.org/abs/2309.17104v2)|null|\n", "2309.17102": "|**2023-09-29**|**Guiding Instruction-based Image Editing via Multimodal Large Language Models**|Tsu-Jui Fu et.al.|[2309.17102v1](http://arxiv.org/abs/2309.17102v1)|**[link](https://github.com/tsujuifu/pytorch_mgie)**|\n", "2309.17093": "|**2023-09-29**|**Prototype-based Aleatoric Uncertainty Quantification for Cross-modal Retrieval**|Hao Li et.al.|[2309.17093v1](http://arxiv.org/abs/2309.17093v1)|**[link](https://github.com/leolee99/pau)**|\n", "2309.17037": "|**2023-09-29**|**Beyond Co-occurrence: Multi-modal Session-based Recommendation**|Xiaokun Zhang et.al.|[2309.17037v1](http://arxiv.org/abs/2309.17037v1)|**[link](https://github.com/zhang-xiaokun/mmsbr)**|\n", "2309.16984": "|**2023-09-29**|**Consistency Models as a Rich and Efficient Policy Class for Reinforcement Learning**|Zihan Ding et.al.|[2309.16984v1](http://arxiv.org/abs/2309.16984v1)|null|\n", "2309.16949": "|**2023-09-29**|**CrossZoom: Simultaneously Motion Deblurring and Event Super-Resolving**|Chi Zhang et.al.|[2309.16949v1](http://arxiv.org/abs/2309.16949v1)|**[link](https://github.com/bestrivenzc/CZ-Net)**|\n", "2309.16830": "|**2023-09-28**|**Robust Safe Control with Multi-Modal Uncertainty**|Tianhao Wei et.al.|[2309.16830v1](http://arxiv.org/abs/2309.16830v1)|null|\n", "2309.16818": "|**2023-09-28**|**MEM: Multi-Modal Elevation Mapping for Robotics and Learning**|Gian Erni et.al.|[2309.16818v1](http://arxiv.org/abs/2309.16818v1)|**[link](https://github.com/leggedrobotics/elevation_mapping_cupy)**|\n", "2309.16772": "|**2023-10-02**|**XVO: Generalized Visual Odometry via Cross-Modal Self-Training**|Lei Lai et.al.|[2309.16772v2](http://arxiv.org/abs/2309.16772v2)|null|\n", "2310.02071": "|**2023-10-03**|**Towards End-to-End Embodied Decision Making via Multi-modal Large Language Model: Explorations with GPT4-Vision and Beyond**|Liang Chen et.al.|[2310.02071v1](http://arxiv.org/abs/2310.02071v1)|**[link](https://github.com/pkunlp-icler/pca-eval)**|\n", "2310.02050": "|**2023-10-03**|**Tuning Large language model for End-to-end Speech Translation**|Hao Zhang et.al.|[2310.02050v1](http://arxiv.org/abs/2310.02050v1)|null|\n", "2310.01852": "|**2023-10-04**|**LanguageBind: Extending Video-Language Pretraining to N-modality by Language-based Semantic Alignment**|Bin Zhu et.al.|[2310.01852v2](http://arxiv.org/abs/2310.01852v2)|**[link](https://github.com/pku-yuangroup/languagebind)**|\n", "2310.01733": "|**2023-10-03**|**Health Guardian: Using Multi-modal Data to Understand Individual Health**|Vince S. Siu et.al.|[2310.01733v1](http://arxiv.org/abs/2310.01733v1)|null|\n", "2310.01358": "|**2023-10-02**|**NEUCORE: Neural Concept Reasoning for Composed Image Retrieval**|Shu Zhao et.al.|[2310.01358v1](http://arxiv.org/abs/2310.01358v1)|null|\n", "2310.01351": "|**2023-10-02**|**Streaming Motion Forecasting for Autonomous Driving**|Ziqi Pang et.al.|[2310.01351v1](http://arxiv.org/abs/2310.01351v1)|**[link](https://github.com/ziqipang/streamingforecasting)**|\n", "2310.01330": "|**2023-10-02**|**Towards reporting bias in visual-language datasets: bimodal augmentation by decoupling object-attribute association**|Qiyu Wu et.al.|[2310.01330v1](http://arxiv.org/abs/2310.01330v1)|null|\n", "2310.01286": "|**2023-10-02**|**A Dynamic Macroscopic Framework for Pricing of Ride-hailing Services with an Optional Bus Lane Access for Pool Vehicles**|Lynn Fayed et.al.|[2310.01286v1](http://arxiv.org/abs/2310.01286v1)|null|\n", "2310.01232": "|**2023-10-02**|**Modality-aware Transformer for Time series Forecasting**|Hajar Emami et.al.|[2310.01232v1](http://arxiv.org/abs/2310.01232v1)|null|\n", "2310.01035": "|**2023-10-02**|**Learnable Cross-modal Knowledge Distillation for Multi-modal Learning with Missing Modality**|Hu Wang et.al.|[2310.01035v1](http://arxiv.org/abs/2310.01035v1)|null|\n", "2310.00927": "|**2023-10-02**|**Understanding Transferable Representation Learning and Zero-shot Transfer in CLIP**|Zixiang Chen et.al.|[2310.00927v1](http://arxiv.org/abs/2310.00927v1)|null|\n", "2310.00862": "|**2023-10-02**|**Shack-Hartmann wavefront sensing: A new approach to time-resolved measurement of stress intensity during dynamic fracture of small brittle specimens**|Liuchi Li et.al.|[2310.00862v1](http://arxiv.org/abs/2310.00862v1)|null|\n", "2310.00745": "|**2023-10-01**|**Deterministic Langevin Unconstrained Optimization with Normalizing Flows**|James M. Sullivan et.al.|[2310.00745v1](http://arxiv.org/abs/2310.00745v1)|null|\n", "2310.00740": "|**2023-10-01**|**Top-down Green-ups: Satellite Sensing and Deep Models to Predict Buffelgrass Phenology**|Lucas Rosenblatt et.al.|[2310.00740v1](http://arxiv.org/abs/2310.00740v1)|**[link](https://github.com/lurosenb/phenology_projects)**|\n", "2310.00672": "|**2023-10-01**|**GeRA: Label-Efficient Geometrically Regularized Alignment**|Dustin Klebe et.al.|[2310.00672v1](http://arxiv.org/abs/2310.00672v1)|null|\n", "2310.03024": "|**2023-10-04**|**AstroCLIP: Cross-Modal Pre-Training for Astronomical Foundation Models**|Francois Lanusse et.al.|[2310.03024v1](http://arxiv.org/abs/2310.03024v1)|**[link](https://github.com/PolymathicAI/AstroCLIP)**|\n", "2310.02960": "|**2023-10-04**|**CoDA: Collaborative Novel Box Discovery and Cross-modal Alignment for Open-vocabulary 3D Object Detection**|Yang Cao et.al.|[2310.02960v1](http://arxiv.org/abs/2310.02960v1)|**[link](https://github.com/yangcaoai/CoDA_NeurIPS2023)**|\n", "2310.02821": "|**2023-10-04**|**Improving Vision Anomaly Detection with the Guidance of Language Modality**|Dong Chen et.al.|[2310.02821v1](http://arxiv.org/abs/2310.02821v1)|**[link](https://github.com/Anfeather/CMG)**|\n", "2310.02777": "|**2023-10-04**|**The Role of Linguistic Priors in Measuring Compositional Generalization of Vision-Language Models**|Chenwei Wu et.al.|[2310.02777v1](http://arxiv.org/abs/2310.02777v1)|null|\n", "2310.02690": "|**2023-10-04**|**Multi-Dimension-Embedding-Aware Modality Fusion Transformer for Psychiatric Disorder Clasification**|Guoxin Wang et.al.|[2310.02690v1](http://arxiv.org/abs/2310.02690v1)|null|\n", "2310.02663": "|**2023-10-04**|**MedPrompt: Cross-Modal Prompting for Multi-Task Medical Image Translation**|Xuhang Chen et.al.|[2310.02663v1](http://arxiv.org/abs/2310.02663v1)|null|\n", "2310.02569": "|**2023-10-04**|**ReForm-Eval: Evaluating Large Vision Language Models via Unified Re-Formulation of Task-Oriented Benchmarks**|Zejun Li et.al.|[2310.02569v1](http://arxiv.org/abs/2310.02569v1)|**[link](https://github.com/fudandisc/reform-eval)**|\n", "2310.02561": "|**2023-10-04**|**Integrated Sensing and Communications towards Proactive Beamforming in mmWave V2I via Multi-Modal Feature Fusion (MMFF)**|Haotian Zhang et.al.|[2310.02561v1](http://arxiv.org/abs/2310.02561v1)|null|\n", "2310.02528": "|**2023-10-04**|**On the Cognition of Visual Question Answering Models and Human Intelligence: A Comparative Study**|Liben Chen et.al.|[2310.02528v1](http://arxiv.org/abs/2310.02528v1)|null|\n", "2310.02361": "|**2023-10-03**|**Event-Enhanced Multi-Modal Spiking Neural Network for Dynamic Obstacle Avoidance**|Yang Wang et.al.|[2310.02361v1](http://arxiv.org/abs/2310.02361v1)|null|\n", "2310.03744": "|**2023-10-05**|**Improved Baselines with Visual Instruction Tuning**|Haotian Liu et.al.|[2310.03744v1](http://arxiv.org/abs/2310.03744v1)|null|\n", "2310.03724": "|**2023-10-05**|**Modular Speech-to-Text Translation for Zero-Shot Cross-Modal Transfer**|Paul-Ambroise Duquenne et.al.|[2310.03724v1](http://arxiv.org/abs/2310.03724v1)|null|\n", "2310.03485": "|**2023-10-07**|**BTDNet: a Multi-Modal Approach for Brain Tumor Radiogenomic Classification**|Dimitrios Kollias et.al.|[2310.03485v2](http://arxiv.org/abs/2310.03485v2)|null|\n", "2310.03420": "|**2023-10-05**|**FreeReg: Image-to-Point Cloud Registration Leveraging Pretrained Diffusion Models and Monocular Depth Estimators**|Haiping Wang et.al.|[2310.03420v1](http://arxiv.org/abs/2310.03420v1)|**[link](https://github.com/WHU-USI3DV/FreeReg)**|\n", "2310.03333": "|**2023-10-05**|**Real-time Multi-modal Object Detection and Tracking on Edge for Regulatory Compliance Monitoring**|Jia Syuen Lim et.al.|[2310.03333v1](http://arxiv.org/abs/2310.03333v1)|null|\n", "2310.03320": "|**2023-10-05**|**BioBridge: Bridging Biomedical Foundation Models via Knowledge Graph**|Zifeng Wang et.al.|[2310.03320v1](http://arxiv.org/abs/2310.03320v1)|null|\n", "2310.03221": "|**2023-10-05**|**Know2BIO: A Comprehensive Dual-View Benchmark for Evolving Biomedical Knowledge Graphs**|Yijia Xiao et.al.|[2310.03221v1](http://arxiv.org/abs/2310.03221v1)|**[link](https://github.com/yijia-xiao/know2bio)**|\n", "2310.03218": "|**2023-10-05**|**Learning Energy-Based Prior Model with Diffusion-Amortized MCMC**|Peiyu Yu et.al.|[2310.03218v1](http://arxiv.org/abs/2310.03218v1)|**[link](https://github.com/yupeiyu98/diffusion-amortized-mcmc)**|\n", "2310.03140": "|**2023-10-04**|**ViFiT: Reconstructing Vision Trajectories from IMU and Wi-Fi Fine Time Measurements**|Bryan Bo Cao et.al.|[2310.03140v1](http://arxiv.org/abs/2310.03140v1)|**[link](https://github.com/bryanbocao/vifit)**|\n", "2310.03111": "|**2023-10-04**|**Multi-modal Gaussian Process Variational Autoencoders for Neural and Behavioral Data**|Rabia Gondur et.al.|[2310.03111v1](http://arxiv.org/abs/2310.03111v1)|null|\n", "2310.03059": "|**2023-10-04**|**Point-PEFT: Parameter-Efficient Fine-Tuning for 3D Pre-trained Models**|Ivan Tang et.al.|[2310.03059v1](http://arxiv.org/abs/2310.03059v1)|**[link](https://github.com/EvenJoker/Point-PEFT)**|\n", "2310.04122": "|**2023-10-06**|**VI-Diff: Unpaired Visible-Infrared Translation Diffusion Model for Single Modality Labeled Visible-Infrared Person Re-identification**|Han Huang et.al.|[2310.04122v1](http://arxiv.org/abs/2310.04122v1)|null|\n", "2310.03958": "|**2023-10-06**|**The \"Seen but Unnoticed\" Vocabulary of Natural Touch: Revolutionizing Direct Interaction with Our Devices and One Another (UIST 2021 Vision)**|Ken Hinckley et.al.|[2310.03958v1](http://arxiv.org/abs/2310.03958v1)|null|\n", "2310.05863": "|**2023-10-10**|**Fine-grained Audio-Visual Joint Representations for Multimodal Large Language Models**|Guangzhi Sun et.al.|[2310.05863v2](http://arxiv.org/abs/2310.05863v2)|**[link](https://github.com/briansidp/audiovisualllm)**|\n", "2310.05628": "|**2023-10-09**|**Glitter or Gold? Deriving Structured Insights from Sustainability Reports via Large Language Models**|Marco Bronzini et.al.|[2310.05628v1](http://arxiv.org/abs/2310.05628v1)|**[link](https://github.com/saturnmars/derivingstructuredinsightsfromsustainabilityreportsvialargelanguagemodels)**|\n", "2310.05608": "|**2023-10-09**|**FlexKnot and Gaussian Process for 21 cm global signal analysis and foreground separation**|Stefan Heimersheim et.al.|[2310.05608v1](http://arxiv.org/abs/2310.05608v1)|null|\n", "2310.05572": "|**2023-10-09**|**A Simple and Robust Framework for Cross-Modality Medical Image Segmentation applied to Vision Transformers**|Matteo Bastico et.al.|[2310.05572v1](http://arxiv.org/abs/2310.05572v1)|**[link](https://github.com/matteo-bastico/mi-seg)**|\n", "2310.05462": "|**2023-10-09**|**AdaFuse: Adaptive Medical Image Fusion Based on Spatial-Frequential Cross Attention**|Xianming Gu et.al.|[2310.05462v1](http://arxiv.org/abs/2310.05462v1)|**[link](https://github.com/xianming-gu/adafuse)**|\n", "2310.05401": "|**2023-10-09**|**Entropy-MCMC: Sampling from Flat Basins with Ease**|Bolian Li et.al.|[2310.05401v1](http://arxiv.org/abs/2310.05401v1)|null|\n", "2310.05364": "|**2023-10-10**|**Universal Multi-modal Entity Alignment via Iteratively Fusing Modality Similarity Paths**|Bolin Zhu et.al.|[2310.05364v2](http://arxiv.org/abs/2310.05364v2)|**[link](https://github.com/blzhu0823/pathfusion)**|\n", "2310.05355": "|**2023-10-09**|**C^2M-DoT: Cross-modal consistent multi-view medical report generation with domain transfer network**|Ruizhi Wang et.al.|[2310.05355v1](http://arxiv.org/abs/2310.05355v1)|null|\n", "2310.05245": "|**2023-10-08**|**Influence of Camera-LiDAR Configuration on 3D Object Detection for Autonomous Driving**|Ye Li et.al.|[2310.05245v1](http://arxiv.org/abs/2310.05245v1)|**[link](https://github.com/safeai-lab/lidar-camera-placement)**|\n", "2310.05193": "|**2023-10-08**|**Improving Discriminative Multi-Modal Learning with Large-Scale Pre-Trained Models**|Chenzhuang Du et.al.|[2310.05193v1](http://arxiv.org/abs/2310.05193v1)|null|\n", "2310.05181": "|**2023-10-08**|**Unified speech and gesture synthesis using flow matching**|Shivam Mehta et.al.|[2310.05181v1](http://arxiv.org/abs/2310.05181v1)|null|\n", "2310.05060": "|**2023-10-08**|**Video-CSR: Complex Video Digest Creation for Visual-Language Models**|Tingkai Liu et.al.|[2310.05060v1](http://arxiv.org/abs/2310.05060v1)|null|\n", "2310.04992": "|**2023-10-08**|**VisionFM: a Multi-Modal Multi-Task Vision Foundation Model for Generalist Ophthalmic Artificial Intelligence**|Jianing Qiu et.al.|[2310.04992v1](http://arxiv.org/abs/2310.04992v1)|null|\n", "2310.04991": "|**2023-10-10**|**Video-Teller: Enhancing Cross-Modal Generation with Fusion and Decoupling**|Haogeng Liu et.al.|[2310.04991v2](http://arxiv.org/abs/2310.04991v2)|null|\n", "2310.04971": "|**2023-10-08**|**Understanding the Robustness of Multi-modal Contrastive Learning to Distribution Shift**|Yihao Xue et.al.|[2310.04971v1](http://arxiv.org/abs/2310.04971v1)|null|\n", "2310.06633": "|**2023-10-10**|**Blind Dates: Examining the Expression of Temporality in Historical Photographs**|Alexandra Barancov\u00e1 et.al.|[2310.06633v1](http://arxiv.org/abs/2310.06633v1)|null|\n", "2310.06627": "|**2023-10-10**|**What If the TV Was Off? Examining Counterfactual Reasoning Abilities of Multi-modal Language Models**|Letian Zhang et.al.|[2310.06627v1](http://arxiv.org/abs/2310.06627v1)|**[link](https://github.com/letian2003/c-vqa)**|\n", "2310.06440": "|**2023-10-10**|**Solution for SMART-101 Challenge of ICCV Multi-modal Algorithmic Reasoning Task 2023**|Xiangyu Wu et.al.|[2310.06440v1](http://arxiv.org/abs/2310.06440v1)|null|\n", "2310.06434": "|**2023-10-10**|**Whispering LLaMA: A Cross-Modal Generative Error Correction Framework for Speech Recognition**|Srijith Radhakrishnan et.al.|[2310.06434v1](http://arxiv.org/abs/2310.06434v1)|**[link](https://github.com/srijith-rkr/whispering-llama)**|\n", "2310.06383": "|**2023-10-10**|**What Makes for Robust Multi-Modal Models in the Face of Missing Modalities?**|Siting Li et.al.|[2310.06383v1](http://arxiv.org/abs/2310.06383v1)|null|\n", "2310.06365": "|**2023-10-10**|**Multi-Modal Knowledge Graph Transformer Framework for Multi-Modal Entity Alignment**|Qian Li et.al.|[2310.06365v1](http://arxiv.org/abs/2310.06365v1)|**[link](https://github.com/xiaoqian19940510/moalign)**|\n", "2310.06342": "|**2023-10-10**|**Contrastive Prompt Learning-based Code Search based on Interaction Matrix**|Yubo Zhang et.al.|[2310.06342v1](http://arxiv.org/abs/2310.06342v1)|null|\n", "2310.06282": "|**2023-10-11**|**MuseChat: A Conversational Music Recommendation System for Videos**|Zhikang Dong et.al.|[2310.06282v2](http://arxiv.org/abs/2310.06282v2)|null|\n", "2310.06259": "|**2023-10-10**|**Cross-modal Cognitive Consensus guided Audio-Visual Segmentation**|Zhaofeng Shi et.al.|[2310.06259v1](http://arxiv.org/abs/2310.06259v1)|null|\n", "2310.06212": "|**2023-10-09**|**Comparison of deep-learning data fusion strategies in mandibular osteoradionecrosis prediction modelling using clinical variables and radiation dose distribution volumes**|Laia Humbert-Vidan et.al.|[2310.06212v1](http://arxiv.org/abs/2310.06212v1)|null|\n", "2310.06008": "|**2023-10-09**|**CoBEVFusion: Cooperative Perception with LiDAR-Camera Bird's-Eye View Fusion**|Donghao Qiao et.al.|[2310.06008v1](http://arxiv.org/abs/2310.06008v1)|null|\n", "2310.07706": "|**2023-10-11**|**Pixel State Value Network for Combined Prediction and Planning in Interactive Environments**|Sascha Rosbach et.al.|[2310.07706v1](http://arxiv.org/abs/2310.07706v1)|null|\n", "2310.07668": "|**2023-10-11**|**GRaMuFeN: Graph-based Multi-modal Fake News Detection in Social Media**|Makan Kananian et.al.|[2310.07668v1](http://arxiv.org/abs/2310.07668v1)|null|\n", "2310.07602": "|**2023-10-11**|**Dual Radar: A Multi-modal Dataset with Dual 4D Radar for Autononous Driving**|Xinyu Zhang et.al.|[2310.07602v1](http://arxiv.org/abs/2310.07602v1)|**[link](https://github.com/adept-thu/dual-radar)**|\n", "2310.07591": "|**2023-10-11**|**PeP: a Point enhanced Painting method for unified point cloud tasks**|Zichao Dong et.al.|[2310.07591v1](http://arxiv.org/abs/2310.07591v1)|null|\n", "2310.07552": "|**2023-10-11**|**ProtoHPE: Prototype-guided High-frequency Patch Enhancement for Visible-Infrared Person Re-identification**|Guiwei Zhang et.al.|[2310.07552v1](http://arxiv.org/abs/2310.07552v1)|null|\n", "2310.07517": "|**2023-10-11**|**CM-PIE: Cross-modal perception for interactive-enhanced audio-visual video parsing**|Yaru Chen et.al.|[2310.07517v1](http://arxiv.org/abs/2310.07517v1)|null|\n", "2310.07355": "|**2023-10-11**|**IMITATE: Clinical Prior Guided Hierarchical Vision-Language Pre-training**|Che Liu et.al.|[2310.07355v1](http://arxiv.org/abs/2310.07355v1)|null|\n", "2310.07276": "|**2023-10-11**|**BioT5: Enriching Cross-modal Integration in Biology with Chemical Knowledge and Natural Language Associations**|Qizhi Pei et.al.|[2310.07276v1](http://arxiv.org/abs/2310.07276v1)|**[link](https://github.com/QizhiPei/BioT5)**|\n", "2310.07265": "|**2023-10-11**|**Distilling Efficient Vision Transformers from CNNs for Semantic Segmentation**|Xu Zheng et.al.|[2310.07265v1](http://arxiv.org/abs/2310.07265v1)|null|\n", "2310.07222": "|**2023-10-11**|**Uni-paint: A Unified Framework for Multimodal Image Inpainting with Pretrained Diffusion Model**|Shiyuan Yang et.al.|[2310.07222v1](http://arxiv.org/abs/2310.07222v1)|**[link](https://github.com/ysy31415/unipaint)**|\n", "2310.07005": "|**2023-10-10**|**Sound-skwatter (Did You Mean: Sound-squatter?) AI-powered Generator for Phishing Prevention**|Rodolfo Valentim et.al.|[2310.07005v1](http://arxiv.org/abs/2310.07005v1)|null|\n", "2310.08530": "|**2023-10-12**|**UniPose: Detecting Any Keypoints**|Jie Yang et.al.|[2310.08530v1](http://arxiv.org/abs/2310.08530v1)|**[link](https://github.com/IDEA-Research/UniPose)**|\n", "2310.08487": "|**2023-10-12**|**GraphextQA: A Benchmark for Evaluating Graph-Enhanced Large Language Models**|Yuanchun Shen et.al.|[2310.08487v1](http://arxiv.org/abs/2310.08487v1)|**[link](https://github.com/happen2me/cross-gnn)**|\n", "2310.08446": "|**2023-10-12**|**Towards Robust Multi-Modal Reasoning via Model Selection**|Xiangyan Liu et.al.|[2310.08446v1](http://arxiv.org/abs/2310.08446v1)|null|\n", "2310.08303": "|**2023-10-12**|**Multimodal Variational Auto-encoder based Audio-Visual Segmentation**|Yuxin Mao et.al.|[2310.08303v1](http://arxiv.org/abs/2310.08303v1)|**[link](https://github.com/opennlplab/mmvae-avs)**|\n", "2310.08285": "|**2023-10-12**|**How would mobility-as-a-service (MaaS) platform survive as an intermediary? From the viewpoint of stability in many-to-many matching**|Rui Yao et.al.|[2310.08285v1](http://arxiv.org/abs/2310.08285v1)|null|\n", "2310.08270": "|**2023-10-12**|**Hilbert Space Embedding-based Trajectory Optimization for Multi-Modal Uncertain Obstacle Trajectory Prediction**|Basant Sharma et.al.|[2310.08270v1](http://arxiv.org/abs/2310.08270v1)|null|\n", "2310.08261": "|**2023-10-12**|**GraphAlign: Enhancing Accurate Feature Alignment by Graph matching for Multi-Modal 3D Object Detection**|Ziying Song et.al.|[2310.08261v1](http://arxiv.org/abs/2310.08261v1)|null|\n", "2310.08166": "|**2023-10-12**|**Ziya-VL: Bilingual Large Vision-Language Model via Multi-Task Instruction Tuning**|Junyu Lu et.al.|[2310.08166v1](http://arxiv.org/abs/2310.08166v1)|null|\n", "2310.08114": "|**2023-10-12**|**Multi-Modal Sensor Fusion and Object Tracking for Autonomous Racing**|Phillip Karle et.al.|[2310.08114v1](http://arxiv.org/abs/2310.08114v1)|**[link](https://github.com/tumftm/fusiontracking)**|\n", "2310.08103": "|**2023-10-12**|**Radio Galaxy Zoo: tagging radio subjects using text**|Dawei Chen et.al.|[2310.08103v1](http://arxiv.org/abs/2310.08103v1)|null|\n", "2310.08027": "|**2023-10-12**|**Exploring Large Language Models for Multi-Modal Out-of-Distribution Detection**|Yi Dai et.al.|[2310.08027v1](http://arxiv.org/abs/2310.08027v1)|null|\n", "2310.08026": "|**2023-10-12**|**Beyond Sharing Weights in Decoupling Feature Learning Network for UAV RGB-Infrared Vehicle Re-Identification**|Xingyue Liu et.al.|[2310.08026v1](http://arxiv.org/abs/2310.08026v1)|null|\n", "2310.07990": "|**2023-10-12**|**Multi-View Variational Autoencoder for Missing Value Imputation in Untargeted Metabolomics**|Chen Zhao et.al.|[2310.07990v1](http://arxiv.org/abs/2310.07990v1)|null|\n", "2310.07944": "|**2023-10-11**|**AutoRepo: A general framework for multi-modal LLM-based automated construction reporting**|Hongxu Pu et.al.|[2310.07944v1](http://arxiv.org/abs/2310.07944v1)|null|\n", "2310.07940": "|**2023-10-11**|**Cost-Driven Hardware-Software Co-Optimization of Machine Learning Pipelines**|Ravit Sharma et.al.|[2310.07940v1](http://arxiv.org/abs/2310.07940v1)|null|\n", "2310.10651": "|**2023-10-16**|**HairCLIPv2: Unifying Hair Editing via Proxy Feature Blending**|Tianyi Wei et.al.|[2310.10651v1](http://arxiv.org/abs/2310.10651v1)|**[link](https://github.com/wty-ustc/hairclipv2)**|\n", "2310.10414": "|**2023-10-16**|**Style transfer between Microscopy and Magnetic Resonance Imaging via Generative Adversarial Network in small sample size settings**|Monika Pytlarz et.al.|[2310.10414v1](http://arxiv.org/abs/2310.10414v1)|null|\n", "2310.10371": "|**2023-10-16**|**Camera-LiDAR Fusion with Latent Contact for Place Recognition in Challenging Cross-Scenes**|Yan Pan et.al.|[2310.10371v1](http://arxiv.org/abs/2310.10371v1)|null|\n", "2310.10347": "|**2023-10-16**|**Editable-DeepSC: Cross-Modal Editable Semantic Communication Systems**|Wenbo Yu et.al.|[2310.10347v1](http://arxiv.org/abs/2310.10347v1)|null|\n", "2310.10290": "|**2023-10-16**|**Autonomous Mapping and Navigation using Fiducial Markers and Pan-Tilt Camera for Assisting Indoor Mobility of Blind and Visually Impaired People**|Dharmateja Adapa et.al.|[2310.10290v1](http://arxiv.org/abs/2310.10290v1)|null|\n", "2310.10125": "|**2023-10-16**|**Few-shot Action Recognition with Captioning Foundation Models**|Xiang Wang et.al.|[2310.10125v1](http://arxiv.org/abs/2310.10125v1)|null|\n", "2310.10010": "|**2023-10-16**|**Black-box Targeted Adversarial Attack on Segment Anything (SAM)**|Sheng Zheng et.al.|[2310.10010v1](http://arxiv.org/abs/2310.10010v1)|null|\n", "2310.09761": "|**2023-10-15**|**CAPro: Webly Supervised Learning with Cross-Modality Aligned Prototypes**|Yulei Qin et.al.|[2310.09761v1](http://arxiv.org/abs/2310.09761v1)|**[link](https://github.com/yuleiqin/capro)**|\n", "2310.09755": "|**2023-10-15**|**Beyond Segmentation: Road Network Generation with Multi-Modal LLMs**|Sumedh Rasal et.al.|[2310.09755v1](http://arxiv.org/abs/2310.09755v1)|null|\n", "2310.09714": "|**2023-10-15**|**Enhancing Task Performance of Learned Simplified Models via Reinforcement Learning**|Hien Bui et.al.|[2310.09714v1](http://arxiv.org/abs/2310.09714v1)|null|\n", "2310.09696": "|**2023-10-15**|**Progressive Evidence Refinement for Open-domain Multimodal Retrieval Question Answering**|Shuwen Yang et.al.|[2310.09696v1](http://arxiv.org/abs/2310.09696v1)|null|\n", "2310.09503": "|**2023-10-14**|**JM3D & JM3D-LLM: Elevating 3D Representation with Joint Multi-modal Cues**|Jiayi Ji et.al.|[2310.09503v1](http://arxiv.org/abs/2310.09503v1)|**[link](https://github.com/mr-neko/jm3d)**|\n", "2310.09478": "|**2023-10-14**|**MiniGPT-v2: large language model as a unified interface for vision-language multi-task learning**|Jun Chen et.al.|[2310.09478v1](http://arxiv.org/abs/2310.09478v1)|null|\n", "2310.09199": "|**2023-10-13**|**PaLI-3 Vision Language Models: Smaller, Faster, Stronger**|Xi Chen et.al.|[2310.09199v1](http://arxiv.org/abs/2310.09199v1)|null|\n", "2310.09165": "|**2023-10-13**|**Towards Robust UAV Tracking in GNSS-Denied Environments: A Multi-LiDAR Multi-UAV Dataset**|Iacopo Catalano et.al.|[2310.09165v1](http://arxiv.org/abs/2310.09165v1)|**[link](https://github.com/tiers/uav_multi_lidar_dataset)**|\n", "2310.11374": "|**2023-10-17**|**DialogueLLM: Context and Emotion Knowledge-Tuned LLaMA Models for Emotion Recognition in Conversations**|Yazhou Zhang et.al.|[2310.11374v1](http://arxiv.org/abs/2310.11374v1)|null|\n", "2310.11316": "|**2023-10-17**|**MonoSKD: General Distillation Framework for Monocular 3D Object Detection via Spearman Correlation Coefficient**|Sen Wang et.al.|[2310.11316v1](http://arxiv.org/abs/2310.11316v1)|**[link](https://github.com/senwang98/monoskd)**|\n", "2310.11307": "|**2023-10-17**|**Multi Self-supervised Pre-fine-tuned Transformer Fusion for Better Intelligent Transportation Detection**|Juwu Zheng et.al.|[2310.11307v1](http://arxiv.org/abs/2310.11307v1)|null|\n", "2310.11295": "|**2023-10-17**|**CorrTalk: Correlation Between Hierarchical Speech and Facial Activity Variances for 3D Animation**|Zhaojie Chu et.al.|[2310.11295v1](http://arxiv.org/abs/2310.11295v1)|null|\n", "2310.10942": "|**2023-10-17**|**Unanswerable Visual Question Answering**|Yanyang Guo et.al.|[2310.10942v1](http://arxiv.org/abs/2310.10942v1)|**[link](https://github.com/guoyang9/unk-vqa)**|\n", "2310.10844": "|**2023-10-16**|**Survey of Vulnerabilities in Large Language Models Revealed by Adversarial Attacks**|Erfan Shayegani et.al.|[2310.10844v1](http://arxiv.org/abs/2310.10844v1)|null|\n", "2310.12081": "|**2023-10-18**|**DHOT-GM: Robust Graph Matching Using A Differentiable Hierarchical Optimal Transport Framework**|Haoran Cheng et.al.|[2310.12081v1](http://arxiv.org/abs/2310.12081v1)|null|\n", "2310.11989": "|**2023-10-18**|**Image Clustering with External Guidance**|Yunfan Li et.al.|[2310.11989v1](http://arxiv.org/abs/2310.11989v1)|null|\n", "2310.11939": "|**2023-10-18**|**Mixture distributions for probabilistic forecasts of disease outbreaks**|Spencer Wadsworth et.al.|[2310.11939v1](http://arxiv.org/abs/2310.11939v1)|null|\n", "2310.11938": "|**2023-10-18**|**Grounded and Well-rounded: A Methodological Approach to the Study of Cross-modal and Cross-lingual Grounding**|Timothee Mickus et.al.|[2310.11938v1](http://arxiv.org/abs/2310.11938v1)|null|\n", "2310.11910": "|**2023-10-18**|**Multi-modal Medical Neurological Image Fusion using Wavelet Pooled Edge Preserving Autoencoder**|Manisha Das et.al.|[2310.11910v1](http://arxiv.org/abs/2310.11910v1)|null|\n", "2310.11713": "|**2023-10-18**|**Separating Invisible Sounds Toward Universal Audiovisual Scene-Aware Sound Separation**|Yiyang Su et.al.|[2310.11713v1](http://arxiv.org/abs/2310.11713v1)|null|\n", "2310.11612": "|**2023-10-17**|**Balance Act: Mitigating Hubness in Cross-Modal Retrieval with Query and Gallery Banks**|Yimu Wang et.al.|[2310.11612v1](http://arxiv.org/abs/2310.11612v1)|**[link](https://github.com/yimuwangcs/Better_Cross_Modal_Retrieval)**|\n", "2310.12973": "|**2023-10-19**|**Frozen Transformers in Language Models Are Effective Visual Encoder Layers**|Ziqi Pang et.al.|[2310.12973v1](http://arxiv.org/abs/2310.12973v1)|**[link](https://github.com/ziqipang/lm4visualencoding)**|\n", "2310.12798": "|**2023-10-19**|**MolCA: Molecular Graph-Language Modeling with Cross-Modal Projector and Uni-Modal Adapter**|Zhiyuan Liu et.al.|[2310.12798v1](http://arxiv.org/abs/2310.12798v1)|**[link](https://github.com/acharkq/molca)**|\n", "2310.12609": "|**2023-10-19**|**Denoising Heat-inspired Diffusion with Insulators for Collision Free Motion Planning**|Junwoo Chang et.al.|[2310.12609v1](http://arxiv.org/abs/2310.12609v1)|null|\n", "2310.12520": "|**2023-10-19**|**Lost in Translation: When GPT-4V(ision) Can't See Eye to Eye with Text. A Vision-Language-Consistency Analysis of VLLMs and Beyond**|Xiang Zhang et.al.|[2310.12520v1](http://arxiv.org/abs/2310.12520v1)|null|\n", "2310.12518": "|**2023-10-19**|**Light-enhanced van der Waals force microscopy**|Han Yu-Xiao et.al.|[2310.12518v1](http://arxiv.org/abs/2310.12518v1)|null|\n", "2310.12344": "|**2023-10-18**|**LACMA: Language-Aligning Contrastive Learning with Meta-Actions for Embodied Instruction Following**|Cheng-Fu Yang et.al.|[2310.12344v1](http://arxiv.org/abs/2310.12344v1)|**[link](https://github.com/joeyy5588/lacma)**|\n", "2310.13619": "|**2023-10-20**|**Semi-supervised multimodal coreference resolution in image narrations**|Arushi Goel et.al.|[2310.13619v1](http://arxiv.org/abs/2310.13619v1)|**[link](https://github.com/vico-uoe/cin-ssl)**|\n", "2310.13596": "|**2023-10-20**|**MarineGPT: Unlocking Secrets of Ocean to the Public**|Ziqiang Zheng et.al.|[2310.13596v1](http://arxiv.org/abs/2310.13596v1)|**[link](https://github.com/hkust-vgd/marinegpt)**|\n", "2310.13451": "|**2023-10-20**|**Two-Stage Triplet Loss Training with Curriculum Augmentation for Audio-Visual Retrieval**|Donghuo Zeng et.al.|[2310.13451v1](http://arxiv.org/abs/2310.13451v1)|null|\n", "2310.13398": "|**2023-10-20**|**OpenAnnotate3D: Open-Vocabulary Auto-Labeling System for Multi-modal 3D Data**|Yijie Zhou et.al.|[2310.13398v1](http://arxiv.org/abs/2310.13398v1)|null|\n", "2310.13289": "|**2023-10-20**|**SALMONN: Towards Generic Hearing Abilities for Large Language Models**|Changli Tang et.al.|[2310.13289v1](http://arxiv.org/abs/2310.13289v1)|**[link](https://github.com/bytedance/salmonn)**|\n", "2310.13276": "|**2023-10-20**|**InvGC: Robust Cross-Modal Retrieval by Inverse Graph Convolution**|Xiangru Jian et.al.|[2310.13276v1](http://arxiv.org/abs/2310.13276v1)|**[link](https://github.com/yimuwangcs/Better_Cross_Modal_Retrieval)**|\n", "2310.13267": "|**2023-10-20**|**On the Language Encoder of Contrastive Cross-modal Models**|Mengjie Zhao et.al.|[2310.13267v1](http://arxiv.org/abs/2310.13267v1)|null|\n", "2310.13265": "|**2023-10-20**|**MoqaGPT : Zero-Shot Multi-modal Open-domain Question Answering with Large Language Model**|Le Zhang et.al.|[2310.13265v1](http://arxiv.org/abs/2310.13265v1)|**[link](https://github.com/lezhang7/moqagpt)**|\n", "2310.13257": "|**2023-10-20**|**Visual Grounding Helps Learn Word Meanings in Low-Data Regimes**|Chengxu Zhuang et.al.|[2310.13257v1](http://arxiv.org/abs/2310.13257v1)|null|\n", "2310.13235": "|**2023-10-20**|**Auxiliary Features-Guided Super Resolution for Monte Carlo Rendering**|Qiqi Hou et.al.|[2310.13235v1](http://arxiv.org/abs/2310.13235v1)|null|\n", "2310.13103": "|**2023-10-19**|**AVTENet: Audio-Visual Transformer-based Ensemble Network Exploiting Multiple Experts for Video Deepfake Detection**|Ammarah Hashmi et.al.|[2310.13103v1](http://arxiv.org/abs/2310.13103v1)|null|\n", "2310.14924": "|**2023-10-23**|**Converting Depth Images and Point Clouds for Feature-based Pose Estimation**|Robert L\u00f6sch et.al.|[2310.14924v1](http://arxiv.org/abs/2310.14924v1)|**[link](https://github.com/rlsch/depth-conversions)**|\n", "2310.14805": "|**2023-10-23**|**Cross-Modal Conceptualization in Bottleneck Models**|Danis Alukaev et.al.|[2310.14805v1](http://arxiv.org/abs/2310.14805v1)|**[link](https://github.com/danisalukaev/xcbs)**|\n", "2310.14785": "|**2023-10-23**|**Vision-Enhanced Semantic Entity Recognition in Document Images via Visually-Asymmetric Consistency Learning**|Hao Wang et.al.|[2310.14785v1](http://arxiv.org/abs/2310.14785v1)|null|\n", "2310.14720": "|**2023-10-23**|**Extended Deep Adaptive Input Normalization for Preprocessing Time Series Data for Neural Networks**|Marcus A. K. September et.al.|[2310.14720v1](http://arxiv.org/abs/2310.14720v1)|**[link](https://github.com/marcusgh/edain_paper)**|\n", "2310.14702": "|**2023-10-23**|**BM2CP: Efficient Collaborative Perception with LiDAR-Camera Modalities**|Binyu Zhao et.al.|[2310.14702v1](http://arxiv.org/abs/2310.14702v1)|**[link](https://github.com/byzhaoai/bm2cp)**|\n", "2310.14643": "|**2023-10-23**|**Dynamic gain and frequency comb formation in exceptional-point lasers**|Xingwei Gao et.al.|[2310.14643v1](http://arxiv.org/abs/2310.14643v1)|null|\n", "2310.14566": "|**2023-10-23**|**HallusionBench: You See What You Think? Or You Think What You See? An Image-Context Reasoning Benchmark Challenging for GPT-4V(ision), LLaVA-1.5, and Other Multi-modality Models**|Fuxiao Liu et.al.|[2310.14566v1](http://arxiv.org/abs/2310.14566v1)|**[link](https://github.com/tianyi-lab/hallusionbench)**|\n", "2310.14549": "|**2023-10-23**|**Multimodal Graph Learning for Modeling Emerging Pandemics with Big Data**|Khanh-Tung Tran et.al.|[2310.14549v1](http://arxiv.org/abs/2310.14549v1)|**[link](https://github.com/khanhtungtran/mgl4mep)**|\n", "2310.14278": "|**2023-10-22**|**Conversational Speech Recognition by Learning Audio-textual Cross-modal Contextual Representation**|Kun Wei et.al.|[2310.14278v1](http://arxiv.org/abs/2310.14278v1)|null|\n", "2310.14226": "|**2023-10-22**|**Multi-stream Cell Segmentation with Low-level Cues for Multi-modality Images**|Wei Lou et.al.|[2310.14226v1](http://arxiv.org/abs/2310.14226v1)|**[link](https://github.com/lhaof/cellseg)**|\n", "2310.14216": "|**2023-10-22**|**UniMAP: Universal SMILES-Graph Representation Learning**|Shikun Feng et.al.|[2310.14216v1](http://arxiv.org/abs/2310.14216v1)|**[link](https://github.com/fengshikun/unimap)**|\n", "2310.14158": "|**2023-10-22**|**Visual-Attribute Prompt Learning for Progressive Mild Cognitive Impairment Prediction**|Luoyao Kang et.al.|[2310.14158v1](http://arxiv.org/abs/2310.14158v1)|**[link](https://github.com/lhaof/vapl)**|\n", "2310.14075": "|**2023-10-21**|**Unsupervised Sim-to-Real Adaptation of Soft Robot Proprioception using a Dual Cross-modal Autoencoder**|Chaeree Park et.al.|[2310.14075v1](http://arxiv.org/abs/2310.14075v1)|null|\n", "2310.14037": "|**2023-10-21**|**Unlock Multi-Modal Capability of Dense Retrieval via Visual Module Plugin**|Tianshuo Zhou et.al.|[2310.14037v1](http://arxiv.org/abs/2310.14037v1)|**[link](https://github.com/openmatch/marvel)**|\n", "2310.13898": "|**2023-10-21**|**Computational and Systems Biology Advances to Enable for Bioagent Agnostic Signatures**|Andy Lin et.al.|[2310.13898v1](http://arxiv.org/abs/2310.13898v1)|null|\n", "2310.15887": "|**2023-10-24**|**AdaptiX -- A Transitional XR Framework for Development and Evaluation of Shared Control Applications in Assistive Robotics**|Max Pascher et.al.|[2310.15887v1](http://arxiv.org/abs/2310.15887v1)|**[link](https://github.com/maxpascher/AdaptiX)**|\n", "2310.15676": "|**2023-10-24**|**Recent Advances in Multi-modal 3D Scene Understanding: A Comprehensive Survey and Evaluation**|Yinjie Lei et.al.|[2310.15676v1](http://arxiv.org/abs/2310.15676v1)|null|\n", "2310.15670": "|**2023-10-24**|**Leveraging Vision-Centric Multi-Modal Expertise for 3D Object Detection**|Linyan Huang et.al.|[2310.15670v1](http://arxiv.org/abs/2310.15670v1)|**[link](https://github.com/opendrivelab/birds-eye-view-perception)**|\n", "2310.15587": "|**2023-10-24**|**ScanDL: A Diffusion Model for Generating Synthetic Scanpaths on Texts**|Lena S. Bolliger et.al.|[2310.15587v1](http://arxiv.org/abs/2310.15587v1)|**[link](https://github.com/dili-lab/scandl)**|\n", "2310.15585": "|**2023-10-24**|**Multimodal Representations for Teacher-Guided Compositional Visual Reasoning**|Wafa Aissa et.al.|[2310.15585v1](http://arxiv.org/abs/2310.15585v1)|null|\n", "2310.15568": "|**2023-10-24**|**I$^2$MD: 3D Action Representation Learning with Inter- and Intra-modal Mutual Distillation**|Yunyao Mao et.al.|[2310.15568v1](http://arxiv.org/abs/2310.15568v1)|null|\n", "2310.15482": "|**2023-10-24**|**Salient Object Detection in RGB-D Videos**|Ao Mou et.al.|[2310.15482v1](http://arxiv.org/abs/2310.15482v1)|**[link](https://github.com/kerenfu/rdvs)**|\n", "2310.15325": "|**2023-10-23**|**LXMERT Model Compression for Visual Question Answering**|Maryam Hashemi et.al.|[2310.15325v1](http://arxiv.org/abs/2310.15325v1)|**[link](https://github.com/ghazaleh-mahmoodi/lxmert_compression)**|\n", "2310.15301": "|**2023-10-23**|**ADMarker: A Multi-Modal Federated Learning System for Monitoring Digital Biomarkers of Alzheimer's Disease**|Xiaomin Ouyang et.al.|[2310.15301v1](http://arxiv.org/abs/2310.15301v1)|null|\n", "2310.15281": "|**2023-10-23**|**UncertaintyPlayground: A Fast and Simplified Python Library for Uncertainty Estimation**|Ilia Azizi et.al.|[2310.15281v1](http://arxiv.org/abs/2310.15281v1)|**[link](https://github.com/Unco3892/UncertaintyPlayground)**|\n", "2310.16781": "|**2023-10-25**|**Kiki or Bouba? Sound Symbolism in Vision-and-Language Models**|Morris Alper et.al.|[2310.16781v1](http://arxiv.org/abs/2310.16781v1)|null|\n", "2310.16754": "|**2023-10-25**|**CAD -- Contextual Multi-modal Alignment for Dynamic AVQA**|Asmar Nadeem et.al.|[2310.16754v1](http://arxiv.org/abs/2310.16754v1)|null|\n", "2310.16641": "|**2023-10-25**|**The Next Evolution of Artificial Sense of Touch**|Sonja Gro\u00df et.al.|[2310.16641v1](http://arxiv.org/abs/2310.16641v1)|null|\n", "2310.16629": "|**2023-10-25**|**EdgeCalib: Multi-Frame Weighted Edge Features for Automatic Targetless LiDAR-Camera Calibration**|Xingchen Li et.al.|[2310.16629v1](http://arxiv.org/abs/2310.16629v1)|null|\n", "2310.16590": "|**2023-10-25**|**$\\mathbb{VD}$-$\\mathbb{GR}$: Boosting $\\mathbb{V}$isual $\\mathbb{D}$ialog with Cascaded Spatial-Temporal Multi-Modal $\\mathbb{GR}$aphs**|Adnen Abdessaied et.al.|[2310.16590v1](http://arxiv.org/abs/2310.16590v1)|null|\n", "2310.16477": "|**2023-10-25**|**Show from Tell: Audio-Visual Modelling in Clinical Settings**|Jianbo Jiao et.al.|[2310.16477v1](http://arxiv.org/abs/2310.16477v1)|null|\n", "2310.16402": "|**2023-10-25**|**Video Referring Expression Comprehension via Transformer with Content-conditioned Query**|Ji Jiang et.al.|[2310.16402v1](http://arxiv.org/abs/2310.16402v1)|null|\n", "2310.16380": "|**2023-10-25**|**A model for multi-attack classification to improve intrusion detection performance using deep learning approaches**|Arun Kumar Silivery et.al.|[2310.16380v1](http://arxiv.org/abs/2310.16380v1)|null|\n", "2310.16356": "|**2023-10-25**|**A Multi-Modal Multilingual Benchmark for Document Image Classification**|Yoshinari Fujinuma et.al.|[2310.16356v1](http://arxiv.org/abs/2310.16356v1)|null|\n", "2310.16273": "|**2023-10-25**|**Deep Learning for Plant Identification and Disease Classification from Leaf Images: Multi-prediction Approaches**|Jianping Yao et.al.|[2310.16273v1](http://arxiv.org/abs/2310.16273v1)|**[link](https://github.com/funzi-son/plant_pathology_dl)**|\n", "2310.17642": "|**2023-10-26**|**Drive Anywhere: Generalizable End-to-end Autonomous Driving with Multi-modal Foundation Models**|Tsun-Hsuan Wang et.al.|[2310.17642v1](http://arxiv.org/abs/2310.17642v1)|null|\n", "2310.17568": "|**2023-10-26**|**Navigating to Success in Multi-Modal Human-Robot Collaboration: Analysis and Corpus Release**|Stephanie M. Lukin et.al.|[2310.17568v1](http://arxiv.org/abs/2310.17568v1)|null|\n", "2310.17540": "|**2023-10-26**|**EqDrive: Efficient Equivariant Motion Forecasting with Multi-Modality for Autonomous Driving**|Yuping Wang et.al.|[2310.17540v1](http://arxiv.org/abs/2310.17540v1)|null|\n", "2310.17468": "|**2023-10-26**|**Cross-modal Active Complementary Learning with Self-refining Correspondence**|Yang Qin et.al.|[2310.17468v1](http://arxiv.org/abs/2310.17468v1)|**[link](https://github.com/qinyang79/crcl)**|\n", "2310.17323": "|**2023-10-26**|**IndustReal: A Dataset for Procedure Step Recognition Handling Execution Errors in Egocentric Videos in an Industrial-Like Setting**|Tim J. Schoonbeek et.al.|[2310.17323v1](http://arxiv.org/abs/2310.17323v1)|**[link](https://github.com/timschoonbeek/industreal)**|\n", "2310.17133": "|**2023-10-26**|**Incorporating Probing Signals into Multimodal Machine Translation via Visual Question-Answering Pairs**|Yuxin Zuo et.al.|[2310.17133v1](http://arxiv.org/abs/2310.17133v1)|**[link](https://github.com/libeineu/mmt-vqa)**|\n", "2310.17025": "|**2023-10-25**|**netFound: Foundation Model for Network Security**|Satyandra Guthula et.al.|[2310.17025v1](http://arxiv.org/abs/2310.17025v1)|null|\n", "2310.16917": "|**2023-10-25**|**MimicTouch: Learning Human's Control Strategy with Multi-Modal Tactile Feedback**|Kelin Yu et.al.|[2310.16917v1](http://arxiv.org/abs/2310.16917v1)|null|\n", "2310.18049": "|**2023-10-27**|**Text Augmented Spatial-aware Zero-shot Referring Image Segmentation**|Yucheng Suo et.al.|[2310.18049v1](http://arxiv.org/abs/2310.18049v1)|null|\n", "2310.17956": "|**2023-10-27**|**Qilin-Med-VL: Towards Chinese Large Vision-Language Model for General Healthcare**|Junling Liu et.al.|[2310.17956v1](http://arxiv.org/abs/2310.17956v1)|**[link](https://github.com/williamliujl/qilin-med-vl)**|\n", "2310.17933": "|**2023-10-27**|**A barycenter-based approach for the multi-model ensembling of subseasonal forecasts**|Camille Le Coz et.al.|[2310.17933v1](http://arxiv.org/abs/2310.17933v1)|null|\n", "2310.17852": "|**2023-10-27**|**Function Space Bayesian Pseudocoreset for Bayesian Neural Networks**|Balhae Kim et.al.|[2310.17852v1](http://arxiv.org/abs/2310.17852v1)|null|\n", "2310.17796": "|**2023-10-26**|**ControlLLM: Augment Language Models with Tools by Searching on Graphs**|Zhaoyang Liu et.al.|[2310.17796v1](http://arxiv.org/abs/2310.17796v1)|**[link](https://github.com/opengvlab/controlllm)**|\n", "2310.17770": "|**2023-10-26**|**GROOViST: A Metric for Grounding Objects in Visual Storytelling**|Aditya K Surikuchi et.al.|[2310.17770v1](http://arxiv.org/abs/2310.17770v1)|**[link](https://github.com/akskuchi/groovist)**|\n", "2310.17737": "|**2023-10-26**|**ArchBERT: Bi-Modal Understanding of Neural Architectures and Natural Languages**|Mohammad Akbari et.al.|[2310.17737v1](http://arxiv.org/abs/2310.17737v1)|null|\n", "2310.19168": "|**2023-10-29**|**BirdSAT: Cross-View Contrastive Masked Autoencoders for Bird Species Classification and Mapping**|Srikumar Sastry et.al.|[2310.19168v1](http://arxiv.org/abs/2310.19168v1)|**[link](https://github.com/mvrl/birdsat)**|\n", "2310.19070": "|**2023-10-29**|**Myriad: Large Multimodal Model by Applying Vision Experts for Industrial Anomaly Detection**|Yuanze Li et.al.|[2310.19070v1](http://arxiv.org/abs/2310.19070v1)|null|\n", "2310.19062": "|**2023-10-29**|**A multi-modal table tennis robot system**|Andreas Ziegler et.al.|[2310.19062v1](http://arxiv.org/abs/2310.19062v1)|null|\n", "2310.19001": "|**2023-10-29**|**Uncovering Prototypical Knowledge for Weakly Open-Vocabulary Semantic Segmentation**|Fei Zhang et.al.|[2310.19001v1](http://arxiv.org/abs/2310.19001v1)|null|\n", "2310.18949": "|**2023-10-29**|**Customize StyleGAN with One Hand Sketch**|Shaocong Zhang et.al.|[2310.18949v1](http://arxiv.org/abs/2310.18949v1)|null|\n", "2310.18890": "|**2023-10-29**|**Towards Generalized Multi-stage Clustering: Multi-view Self-distillation**|Jiatai Wang et.al.|[2310.18890v1](http://arxiv.org/abs/2310.18890v1)|null|\n", "2310.18728": "|**2023-10-28**|**Online Multi-view Anomaly Detection with Disentangled Product-of-Experts Modeling**|Hao Wang et.al.|[2310.18728v1](http://arxiv.org/abs/2310.18728v1)|null|\n", "2310.18709": "|**2023-10-28**|**Audio-Visual Instance Segmentation**|Ruohao Guo et.al.|[2310.18709v1](http://arxiv.org/abs/2310.18709v1)|null|\n", "2310.18652": "|**2023-10-28**|**EHRXQA: A Multi-Modal Question Answering Dataset for Electronic Health Records with Chest X-ray Images**|Seongsu Bae et.al.|[2310.18652v1](http://arxiv.org/abs/2310.18652v1)|**[link](https://github.com/baeseongsu/ehrxqa)**|\n", "2310.18620": "|**2023-10-28**|**ODM3D: Alleviating Foreground Sparsity for Enhanced Semi-Supervised Monocular 3D Object Detection**|Weijia Zhang et.al.|[2310.18620v1](http://arxiv.org/abs/2310.18620v1)|null|\n", "2310.18583": "|**2023-10-28**|**Self-Supervised Multi-Modality Learning for Multi-Label Skin Lesion Classification**|Hao Wang et.al.|[2310.18583v1](http://arxiv.org/abs/2310.18583v1)|**[link](https://github.com/dylan-h-wang/skin-sm3)**|\n", "2310.18481": "|**2023-10-27**|**MOSEL: Inference Serving Using Dynamic Modality Selection**|Bodun Hu et.al.|[2310.18481v1](http://arxiv.org/abs/2310.18481v1)|null|\n", "2310.18438": "|**2023-10-27**|**Exploring Shape Embedding for Cloth-Changing Person Re-Identification via 2D-3D Correspondences**|Yubin Wang et.al.|[2310.18438v1](http://arxiv.org/abs/2310.18438v1)|null|\n", "2310.20561": "|**2023-10-31**|**Predictive Control for Autonomous Driving with Uncertain, Multi-modal Predictions**|Siddharth H. Nair et.al.|[2310.20561v1](http://arxiv.org/abs/2310.20561v1)|null|\n", "2310.20446": "|**2023-10-31**|**LAVSS: Location-Guided Audio-Visual Spatial Audio Separation**|Yuxin Ye et.al.|[2310.20446v1](http://arxiv.org/abs/2310.20446v1)|null|\n", "2310.20357": "|**2023-11-01**|**Enhancing the Spatial Awareness Capability of Multi-Modal Large Language Model**|Yongqiang Zhao et.al.|[2310.20357v2](http://arxiv.org/abs/2310.20357v2)|null|\n", "2310.20343": "|**2023-10-31**|**Large Multi-modal Encoders for Recommendation**|Zixuan Yi et.al.|[2310.20343v1](http://arxiv.org/abs/2310.20343v1)|null|\n", "2310.20025": "|**2023-10-30**|**GOPlan: Goal-conditioned Offline Reinforcement Learning by Planning with Learned Models**|Mianchu Wang et.al.|[2310.20025v1](http://arxiv.org/abs/2310.20025v1)|null|\n", "2310.19795": "|**2023-10-30**|**SimMMDG: A Simple and Effective Framework for Multi-modal Domain Generalization**|Hao Dong et.al.|[2310.19795v1](http://arxiv.org/abs/2310.19795v1)|**[link](https://github.com/donghao51/simmmdg)**|\n", "2310.19743": "|**2023-10-30**|**Tell Me What Is Good About This Property: Leveraging Reviews For Segment-Personalized Image Collection Summarization**|Monika Wysoczanska et.al.|[2310.19743v1](http://arxiv.org/abs/2310.19743v1)|null|\n", "2310.19654": "|**2023-10-30**|**MCAD: Multi-teacher Cross-modal Alignment Distillation for efficient image-text retrieval**|Youbo Lei et.al.|[2310.19654v1](http://arxiv.org/abs/2310.19654v1)|null|\n", "2310.19635": "|**2023-10-30**|**Bidirectional Captioning for Clinically Accurate and Interpretable Models**|Keegan Quigley et.al.|[2310.19635v1](http://arxiv.org/abs/2310.19635v1)|null|\n", "2310.19608": "|**2023-10-30**|**On Feynman--Kac training of partial Bayesian neural networks**|Zheng Zhao et.al.|[2310.19608v1](http://arxiv.org/abs/2310.19608v1)|null|\n", "2310.19559": "|**2023-10-30**|**Disentangled Counterfactual Learning for Physical Audiovisual Commonsense Reasoning**|Changsheng Lv et.al.|[2310.19559v1](http://arxiv.org/abs/2310.19559v1)|null|\n", "2310.19554": "|**2023-10-30**|**Harvest Video Foundation Models via Efficient Post-Pretraining**|Yizhuo Li et.al.|[2310.19554v1](http://arxiv.org/abs/2310.19554v1)|**[link](https://github.com/opengvlab/internvideo)**|\n", "2310.19432": "|**2023-10-30**|**Explaining the Decisions of Deep Policy Networks for Robotic Manipulations**|Seongun Kim et.al.|[2310.19432v1](http://arxiv.org/abs/2310.19432v1)|null|\n", "2310.19264": "|**2023-10-30**|**Sound of Story: Multi-modal Storytelling with Audio**|Jaeyeon Bae et.al.|[2310.19264v1](http://arxiv.org/abs/2310.19264v1)|null|\n", "2311.00618": "|**2023-11-01**|**De-Diffusion Makes Text a Strong Cross-Modal Interface**|Chen Wei et.al.|[2311.00618v1](http://arxiv.org/abs/2311.00618v1)|null|\n", "2311.00566": "|**2023-11-01**|**CROMA: Remote Sensing Representations with Contrastive Radar-Optical Masked Autoencoders**|Anthony Fuller et.al.|[2311.00566v1](http://arxiv.org/abs/2311.00566v1)|**[link](https://github.com/antofuller/croma)**|\n", "2311.00436": "|**2023-11-01**|**Enhancing Traffic Object Detection in Variable Illumination with RGB-Event Fusion**|Zhanwen Liu et.al.|[2311.00436v1](http://arxiv.org/abs/2311.00436v1)|null|\n", "2311.00265": "|**2023-11-01**|**Adaptive Latent Diffusion Model for 3D Medical Image to Image Translation: Multi-modal Magnetic Resonance Imaging Study**|Jonghun Kim et.al.|[2311.00265v1](http://arxiv.org/abs/2311.00265v1)|**[link](https://github.com/jongdory/aldm)**|\n", "2311.00207": "|**2023-11-01**|**Magmaw: Modality-Agnostic Adversarial Attacks on Machine Learning-Based Wireless Communication Systems**|Jung-Woo Chang et.al.|[2311.00207v1](http://arxiv.org/abs/2311.00207v1)|null|\n", "2311.01459": "|**2023-11-02**|**Align Your Prompts: Test-Time Prompting with Distribution Alignment for Zero-Shot Generalization**|Jameel Hassan et.al.|[2311.01459v1](http://arxiv.org/abs/2311.01459v1)|null|\n", "2311.01361": "|**2023-11-02**|**GPT-4V(ision) as a Generalist Evaluator for Vision-Language Tasks**|Xinlu Zhang et.al.|[2311.01361v1](http://arxiv.org/abs/2311.01361v1)|null|\n", "2311.01202": "|**2023-11-02**|**Cross-Modal Information-Guided Network using Contrastive Learning for Point Cloud Registration**|Yifan Xie et.al.|[2311.01202v1](http://arxiv.org/abs/2311.01202v1)|**[link](https://github.com/ivanxie416/cmignet)**|\n", "2311.01092": "|**2023-11-02**|**Learning A Multi-Task Transformer Via Unified And Customized Instruction Tuning For Chest Radiograph Interpretation**|Lijian Xu et.al.|[2311.01092v1](http://arxiv.org/abs/2311.01092v1)|**[link](https://github.com/medhk23/omnifm-dr)**|\n", "2311.01066": "|**2023-11-02**|**Dynamic Multimodal Information Bottleneck for Multimodality Classification**|Yingying Fang et.al.|[2311.01066v1](http://arxiv.org/abs/2311.01066v1)|**[link](https://github.com/bii-wushuang/dmib)**|\n", "2311.00807": "|**2023-11-01**|**VQA-GEN: A Visual Question Answering Benchmark for Domain Generalization**|Suraj Jyothi Unni et.al.|[2311.00807v1](http://arxiv.org/abs/2311.00807v1)|null|\n", "2311.00737": "|**2023-11-01**|**Real-Time Magnetic Tracking and Diagnosis of COVID-19 via Machine Learning**|Dang Nguyen et.al.|[2311.00737v1](http://arxiv.org/abs/2311.00737v1)|null|\n", "2311.01908": "|**2023-11-03**|**LLM-driven Multimodal Target Volume Contouring in Radiation Oncology**|Yujin Oh et.al.|[2311.01908v1](http://arxiv.org/abs/2311.01908v1)|null|\n", "2311.01886": "|**2023-11-03**|**Bridging the Gap between Multi-focus and Multi-modal: A Focused Integration Framework for Multi-modal Image Fusion**|Xilai Li et.al.|[2311.01886v1](http://arxiv.org/abs/2311.01886v1)|null|\n", "2311.01881": "|**2023-11-03**|**Quantitative Evaluation of a Multi-Modal Camera Setup for Fusing Event Data with RGB Images**|Julian Moosmann et.al.|[2311.01881v1](http://arxiv.org/abs/2311.01881v1)|null|\n", "2311.01831": "|**2023-11-03**|**Universal Multi-modal Multi-domain Pre-trained Recommendation**|Wenqi Sun et.al.|[2311.01831v1](http://arxiv.org/abs/2311.01831v1)|null|\n", "2311.01807": "|**2023-11-03**|**Cross-modal Consistency Learning with Fine-grained Fusion Network for Multimodal Fake News Detection**|Jun Li et.al.|[2311.01807v1](http://arxiv.org/abs/2311.01807v1)|**[link](https://github.com/uestc-lj/cffn)**|\n", "2311.01767": "|**2023-11-03**|**PPTC Benchmark: Evaluating Large Language Models for PowerPoint Task Completion**|Yiduo Guo et.al.|[2311.01767v1](http://arxiv.org/abs/2311.01767v1)|**[link](https://github.com/gydpku/pptc)**|\n", "2311.01766": "|**2023-11-03**|**Support or Refute: Analyzing the Stance of Evidence to Detect Out-of-Context Mis- and Disinformation**|Xin Yuan et.al.|[2311.01766v1](http://arxiv.org/abs/2311.01766v1)|null|\n", "2311.01740": "|**2023-11-03**|**SAC$^3$: Reliable Hallucination Detection in Black-Box Language Models via Semantic-aware Cross-check Consistency**|Jiaxin Zhang et.al.|[2311.01740v1](http://arxiv.org/abs/2311.01740v1)|null|\n", "2311.01734": "|**2023-11-03**|**MixCon3D: Synergizing Multi-View and Cross-Modal Contrastive Learning for Enhancing 3D Representation**|Yipeng Gao et.al.|[2311.01734v1](http://arxiv.org/abs/2311.01734v1)|**[link](https://github.com/ucsc-vlaa/mixcon3d)**|\n", "2311.01487": "|**2023-11-02**|**What Makes for Good Visual Instructions? Synthesizing Complex Visual Reasoning Instructions for Visual Instruction Tuning**|Yifan Du et.al.|[2311.01487v1](http://arxiv.org/abs/2311.01487v1)|**[link](https://github.com/rucaibox/comvint)**|\n", "2311.03328": "|**2023-11-06**|**On Asynchrony, Memory, and Communication: Separations and Landscapes**|Paola Flocchini et.al.|[2311.03328v1](http://arxiv.org/abs/2311.03328v1)|null|\n", "2311.03217": "|**2023-11-06**|**Leveraging Transformers to Improve Breast Cancer Classification and Risk Assessment with Multi-modal and Longitudinal Data**|Yiqiu Shen et.al.|[2311.03217v1](http://arxiv.org/abs/2311.03217v1)|null|\n", "2311.03106": "|**2023-11-06**|**Unified Multi-modal Unsupervised Representation Learning for Skeleton-based Action Understanding**|Shengkai Sun et.al.|[2311.03106v1](http://arxiv.org/abs/2311.03106v1)|**[link](https://github.com/huiguanlab/umurl)**|\n", "2311.03090": "|**2023-11-06**|**A multi-modal approach to continuous material identification through tactile sensing**|Augusto G\u00f3mez Egu\u00edluz et.al.|[2311.03090v1](http://arxiv.org/abs/2311.03090v1)|null|\n", "2311.03079": "|**2023-11-06**|**CogVLM: Visual Expert for Pretrained Language Models**|Weihan Wang et.al.|[2311.03079v1](http://arxiv.org/abs/2311.03079v1)|**[link](https://github.com/thudm/cogvlm)**|\n", "2311.02863": "|**2023-11-06**|**Temporal Shift -- Multi-Objective Loss Function for Improved Anomaly Fall Detection**|Stefan Denkovski et.al.|[2311.02863v1](http://arxiv.org/abs/2311.02863v1)|null|\n", "2311.02850": "|**2023-11-06**|**IR-STP: Enhancing Autonomous Driving with Interaction Reasoning in Spatio-Temporal Planning**|Yingbing Chen et.al.|[2311.02850v1](http://arxiv.org/abs/2311.02850v1)|**[link](https://github.com/chenyingbing/ir-stp-planner)**|\n", "2311.02842": "|**2023-11-06**|**An invariant feature extraction for multi-modal images matching**|Chenzhong Gao et.al.|[2311.02842v1](http://arxiv.org/abs/2311.02842v1)|null|\n", "2311.02820": "|**2023-11-06**|**Mesh Neural Cellular Automata**|Ehsan Pajouheshgar et.al.|[2311.02820v1](http://arxiv.org/abs/2311.02820v1)|null|\n", "2311.02782": "|**2023-11-05**|**Towards Generic Anomaly Detection and Understanding: Large-scale Visual-linguistic Model (GPT-4V) Takes the Lead**|Yunkang Cao et.al.|[2311.02782v1](http://arxiv.org/abs/2311.02782v1)|**[link](https://github.com/caoyunkang/gpt4v-for-generic-anomaly-detection)**|\n", "2311.02733": "|**2023-11-05**|**AV-Lip-Sync+: Leveraging AV-HuBERT to Exploit Multimodal Inconsistency for Video Deepfake Detection**|Sahibzada Adil Shahzad et.al.|[2311.02733v1](http://arxiv.org/abs/2311.02733v1)|null|\n", "2311.02559": "|**2023-11-05**|**Rotation Invariant Transformer for Recognizing Object in UAVs**|Shuoyi Chen et.al.|[2311.02559v1](http://arxiv.org/abs/2311.02559v1)|null|\n", "2311.02329": "|**2023-11-04**|**Complex Organ Mask Guided Radiology Report Generation**|Gu Tiancheng et.al.|[2311.02329v1](http://arxiv.org/abs/2311.02329v1)|**[link](https://github.com/garygutc/comg_model)**|\n", "2311.02282": "|**2023-11-04**|**Contrastive Multi-Modal Representation Learning for Spark Plug Fault Diagnosis**|Ardavan Modarres et.al.|[2311.02282v1](http://arxiv.org/abs/2311.02282v1)|null|\n", "2311.02248": "|**2023-11-03**|**COSMIC: Data Efficient Instruction-tuning For Speech In-Context Learning**|Jing Pan et.al.|[2311.02248v1](http://arxiv.org/abs/2311.02248v1)|null|\n", "2311.04219": "|**2023-11-07**|**OtterHD: A High-Resolution Multi-modality Model**|Bo Li et.al.|[2311.04219v1](http://arxiv.org/abs/2311.04219v1)|null|\n", "2311.04160": "|**2023-11-07**|**\"Tell me about that church\": Exploring the Design and User Experience of In-Vehicle Multi-modal Intuitive Interface in the Context of Driving Scenario**|Yueteng Yu et.al.|[2311.04160v1](http://arxiv.org/abs/2311.04160v1)|null|\n", "2311.04091": "|**2023-11-07**|**Proceedings of the 5th International Workshop on Reading Music Systems**|Jorge Calvo-Zaragoza et.al.|[2311.04091v1](http://arxiv.org/abs/2311.04091v1)|**[link](https://github.com/suziai/gui-tools)**|\n", "2311.04058": "|**2023-11-07**|**mmFUSION: Multimodal Fusion for 3D Objects Detection**|Javed Ahmad et.al.|[2311.04058v1](http://arxiv.org/abs/2311.04058v1)|null|\n", "2311.04056": "|**2023-11-07**|**Multi-View Causal Representation Learning with Partial Observability**|Dingling Yao et.al.|[2311.04056v1](http://arxiv.org/abs/2311.04056v1)|null|\n", "2311.03810": "|**2023-11-07**|**Rethinking and Improving Multi-task Learning for End-to-end Speech Translation**|Yuhao Zhang et.al.|[2311.03810v1](http://arxiv.org/abs/2311.03810v1)|**[link](https://github.com/xiaozhang521/imtl)**|\n", "2311.03620": "|**2023-11-07**|**FusionViT: Hierarchical 3D Object Detection via LiDAR-Camera Vision Transformer Fusion**|Xinhao Xiang et.al.|[2311.03620v1](http://arxiv.org/abs/2311.03620v1)|null|\n", "2311.03606": "|**2023-11-06**|**Multimodal Stress Detection Using Facial Landmarks and Biometric Signals**|Majid Hosseini et.al.|[2311.03606v1](http://arxiv.org/abs/2311.03606v1)|null|\n", "2311.03413": "|**2023-11-06**|**Discret2Di -- Deep Learning based Discretization for Model-based Diagnosis**|Lukas Moddemann et.al.|[2311.03413v1](http://arxiv.org/abs/2311.03413v1)|null|\n", "2311.04766": "|**2023-11-08**|**DualTalker: A Cross-Modal Dual Learning Approach for Speech-Driven 3D Facial Animation**|Guinan Su et.al.|[2311.04766v1](http://arxiv.org/abs/2311.04766v1)|null|\n", "2311.04678": "|**2023-11-08**|**Weakly supervised cross-model learning in high-content screening**|Watkinson Gabriel et.al.|[2311.04678v1](http://arxiv.org/abs/2311.04678v1)|null|\n", "2311.04589": "|**2023-11-08**|**TEAL: Tokenize and Embed ALL for Multi-modal Large Language Models**|Zhen Yang et.al.|[2311.04589v1](http://arxiv.org/abs/2311.04589v1)|null|\n", "2311.04563": "|**2023-11-08**|**Investigating the Nature of Disagreements on Mid-Scale Ratings: A Case Study on the Abstractness-Concreteness Continuum**|Urban Knuple\u0161 et.al.|[2311.04563v1](http://arxiv.org/abs/2311.04563v1)|null|\n", "2311.04552": "|**2023-11-08**|**A 3D generative model of pathological multi-modal MR images and segmentations**|Virginia Fernandez et.al.|[2311.04552v1](http://arxiv.org/abs/2311.04552v1)|**[link](https://github.com/virginiafdez/brainspade3d_rel)**|\n", "2311.04512": "|**2023-11-08**|**FFINet: Future Feedback Interaction Network for Motion Forecasting**|Miao Kang et.al.|[2311.04512v1](http://arxiv.org/abs/2311.04512v1)|null|\n", "2311.04507": "|**2023-11-08**|**Conversation Understanding using Relational Temporal Graph Neural Networks with Auxiliary Cross-Modality Interaction**|Cam-Van Thi Nguyen et.al.|[2311.04507v1](http://arxiv.org/abs/2311.04507v1)|null|\n", "2311.04390": "|**2023-11-07**|**Force-Constrained Visual Policy: Safe Robot-Assisted Dressing via Multi-Modal Sensing**|Zhanyi Sun et.al.|[2311.04390v1](http://arxiv.org/abs/2311.04390v1)|null|\n", "2311.04257": "|**2023-11-07**|**mPLUG-Owl2: Revolutionizing Multi-modal Large Language Model with Modality Collaboration**|Qinghao Ye et.al.|[2311.04257v1](http://arxiv.org/abs/2311.04257v1)|**[link](https://github.com/x-plug/mplug-owl)**|\n", "2311.05494": "|**2023-11-09**|**Object-centric Cross-modal Feature Distillation for Event-based Object Detection**|Lei Li et.al.|[2311.05494v1](http://arxiv.org/abs/2311.05494v1)|null|\n", "2311.05464": "|**2023-11-09**|**3DStyle-Diffusion: Pursuing Fine-grained Text-driven 3D Stylization with 2D Diffusion Models**|Haibo Yang et.al.|[2311.05464v1](http://arxiv.org/abs/2311.05464v1)|**[link](https://github.com/yanghb22-fdu/3dstyle-diffusion-official)**|\n", "2311.05463": "|**2023-11-09**|**ControlStyle: Text-Driven Stylized Image Generation Using Diffusion Priors**|Jingwen Chen et.al.|[2311.05463v1](http://arxiv.org/abs/2311.05463v1)|null|\n", "2311.05348": "|**2023-11-09**|**u-LLaVA: Unifying Multi-Modal Tasks via Large Language Model**|Jinjin Xu et.al.|[2311.05348v1](http://arxiv.org/abs/2311.05348v1)|null|\n", "2311.05319": "|**2023-11-09**|**TLCFuse: Temporal Multi-Modality Fusion Towards Occlusion-Aware Semantic Segmentation-Aided Motion Planning**|Gustavo Salazar-Gomez et.al.|[2311.05319v1](http://arxiv.org/abs/2311.05319v1)|null|\n", "2311.05298": "|**2023-11-09**|**Improving Vision-and-Language Reasoning via Spatial Relations Modeling**|Cheng Yang et.al.|[2311.05298v1](http://arxiv.org/abs/2311.05298v1)|null|\n", "2311.05152": "|**2023-11-09**|**Cross-modal Prompts: Adapting Large Pre-trained Models for Audio-Visual Downstream Tasks**|Haoyi Duan et.al.|[2311.05152v1](http://arxiv.org/abs/2311.05152v1)|**[link](https://github.com/haoyi-duan/dg-sct)**|\n", "2311.05032": "|**2023-11-08**|**Transfer learning from a sparsely annotated dataset of 3D medical images**|Gabriel Efrain Humpire-Mamani et.al.|[2311.05032v1](http://arxiv.org/abs/2311.05032v1)|**[link](https://github.com/diagnijmegen/medicaltransferlearning3d-unet)**|\n", "2311.05870": "|**2023-11-10**|**Automated Heterogeneous Low-Bit Quantization of Multi-Model Deep Learning Inference Pipeline**|Jayeeta Mondal et.al.|[2311.05870v1](http://arxiv.org/abs/2311.05870v1)|null|\n", "2311.05863": "|**2023-11-10**|**Watermarking Vision-Language Pre-trained Models for Multi-modal Embedding as a Service**|Yuanmin Tang et.al.|[2311.05863v1](http://arxiv.org/abs/2311.05863v1)|**[link](https://github.com/Pter61/vlpmarker)**|\n", "2311.05699": "|**2023-11-09**|**Cosmological parameter estimation with Genetic Algorithms**|Ricardo Medel-Esquivel et.al.|[2311.05699v1](http://arxiv.org/abs/2311.05699v1)|null|\n", "2311.05669": "|**2023-11-09**|**Multi-Modal Gaze Following in Conversational Scenarios**|Yuqi Hou et.al.|[2311.05669v1](http://arxiv.org/abs/2311.05669v1)|null|\n"}, "Point Cloud Localization": {"2301.05372": "|**2023-01-13**|**Text to Point Cloud Localization with Relation-Enhanced Transformer**|Guangzhi Wang et.al.|[2301.05372v1](http://arxiv.org/abs/2301.05372v1)|null|\n", "2209.15475": "|**2022-09-30**|**Point Cloud Quality Assessment using 3D Saliency Maps**|Zhengyu Wang et.al.|[2209.15475v1](http://arxiv.org/abs/2209.15475v1)|null|\n", "2207.05317": "|**2022-07-12**|**CPO: Change Robust Panorama to Point Cloud Localization**|Junho Kim et.al.|[2207.05317v1](http://arxiv.org/abs/2207.05317v1)|null|\n", "2205.14965": "|**2022-05-31**|**PSNet: Fast Data Structuring for Hierarchical Deep Learning on Point Cloud**|Luyang Li et.al.|[2205.14965v2](http://arxiv.org/abs/2205.14965v2)|**[link](https://github.com/lly007/pointstructuringnet)**|\n", "2203.15125": "|**2022-04-05**|**Text2Pos: Text-to-Point-Cloud Cross-Modal Localization**|Manuel Kolmet et.al.|[2203.15125v2](http://arxiv.org/abs/2203.15125v2)|null|\n", "2003.02392": "|**2021-11-22**|**PointLoc: Deep Pose Regressor for LiDAR Point Cloud Localization**|Wei Wang et.al.|[2003.02392v3](http://arxiv.org/abs/2003.02392v3)|**[link](https://github.com/loveoxford/vreloc)**|\n", "1812.01711": "|**2018-11-28**|**A Graph-CNN for 3D Point Cloud Classification**|Yingxue Zhang et.al.|[1812.01711v1](http://arxiv.org/abs/1812.01711v1)|**[link](https://github.com/maggie0106/Graph-CNN-in-3D-Point-Cloud-Classification)**|\n", "1712.06760": "|**2018-04-03**|**Mining Point Cloud Local Structures by Kernel Correlation and Graph Pooling**|Yiru Shen et.al.|[1712.06760v2](http://arxiv.org/abs/1712.06760v2)|null|\n", "1702.04114": "|**2017-02-14**|**Graph Based Over-Segmentation Methods for 3D Point Clouds**|Yizhak Ben-Shabat et.al.|[1702.04114v1](http://arxiv.org/abs/1702.04114v1)|null|\n"}, "Place Recognization": {"2302.06149": "|**2023-02-13**|**Contour Context: Abstract Structural Distribution for 3D LiDAR Loop Detection and Metric Pose Estimation**|Binqian Jiang et.al.|[2302.06149v1](http://arxiv.org/abs/2302.06149v1)|**[link](https://github.com/lewisjiang/contour-context)**|\n", "2301.05604": "|**2023-01-13**|**A LiDAR-Inertial-Visual SLAM System with Loop Detection**|Kangcheng Liu et.al.|[2301.05604v1](http://arxiv.org/abs/2301.05604v1)|null|\n", "2212.12745": "|**2022-12-24**|**GraffMatch: Global Matching of 3D Lines and Planes for Wide Baseline LiDAR Registration**|Parker C. Lusk et.al.|[2212.12745v1](http://arxiv.org/abs/2212.12745v1)|null|\n", "2211.14864": "|**2022-11-27**|**A Faster, Lighter and Stronger Deep Learning-Based Approach for Place Recognition**|Rui Huang et.al.|[2211.14864v1](http://arxiv.org/abs/2211.14864v1)|null|\n", "2211.12732": "|**2023-03-02**|**Wild-Places: A Large-Scale Dataset for Lidar Place Recognition in Unstructured Natural Environments**|Joshua Knights et.al.|[2211.12732v3](http://arxiv.org/abs/2211.12732v3)|**[link](https://github.com/csiro-robotics/Wild-Places)**|\n", "2210.13856": "|**2022-11-02**|**A Framework for Collaborative Multi-Robot Mapping using Spectral Graph Wavelets**|Lukas Bernreiter et.al.|[2210.13856v2](http://arxiv.org/abs/2210.13856v2)|null|\n", "2210.11029": "|**2022-10-20**|**DeepRING: Learning Roto-translation Invariant Representation for LiDAR based Place Recognition**|Sha Lu et.al.|[2210.11029v1](http://arxiv.org/abs/2210.11029v1)|null|\n", "2210.04432": "|**2023-03-06**|**Spectral Geometric Verification: Re-Ranking Point Cloud Retrieval for Metric Localization**|Kavisha Vidanapathirana et.al.|[2210.04432v2](http://arxiv.org/abs/2210.04432v2)|**[link](https://github.com/csiro-robotics/spectralgv)**|\n", "2210.04236": "|**2022-10-09**|**Fusing Event-based Camera and Radar for SLAM Using Spiking Neural Networks with Continual STDP Learning**|Ali Safa et.al.|[2210.04236v1](http://arxiv.org/abs/2210.04236v1)|null|\n", "2210.01320": "|**2022-11-23**|**Wi-Closure: Reliable and Efficient Search of Inter-robot Loop Closures Using Wireless Sensing**|Weiying Wang et.al.|[2210.01320v2](http://arxiv.org/abs/2210.01320v2)|null|\n", "2209.12513": "|**2022-09-26**|**NDD: A 3D Point Cloud Descriptor Based on Normal Distribution for Loop Closure Detection**|Ruihao Zhou et.al.|[2209.12513v1](http://arxiv.org/abs/2209.12513v1)|**[link](https://github.com/zhouruihao1001/ndd)**|\n", "2209.11894": "|**2022-09-24**|**Closing the Loop: Graph Networks to Unify Semantic Objects and Visual Features for Multi-object Scenes**|Jonathan J. Y. Kim et.al.|[2209.11894v1](http://arxiv.org/abs/2209.11894v1)|null|\n", "2209.09699": "|**2023-03-28**|**PADLoC: LiDAR-Based Deep Loop Closure Detection and Registration Using Panoptic Attention**|Jos\u00e9 Arce et.al.|[2209.09699v3](http://arxiv.org/abs/2209.09699v3)|**[link](https://github.com/robot-learning-freiburg/PADLoC)**|\n", "2209.08608": "|**2022-09-18**|**HGI-SLAM: Loop Closure With Human and Geometric Importance Features**|Shuhul Mujoo et.al.|[2209.08608v1](http://arxiv.org/abs/2209.08608v1)|null|\n", "2209.08578": "|**2022-09-18**|**Data-driven Loop Closure Detection in Bathymetric Point Clouds for Underwater SLAM**|Jiarui Tan et.al.|[2209.08578v1](http://arxiv.org/abs/2209.08578v1)|**[link](https://github.com/tjr16/bathy_nn_learning)**|\n", "2209.06779": "|**2022-10-15**|**Efficient Planar Pose Estimation via UWB Measurements**|Haodong Jiang et.al.|[2209.06779v3](http://arxiv.org/abs/2209.06779v3)|**[link](https://github.com/SLAMLab-CUHKSZ/Efficient-Pose-Estimation-via-UWB-measurements)**|\n", "2209.06545": "|**2023-01-12**|**Tac2Structure: Object Surface Reconstruction Only through Multi Times Touch**|Junyuan Lu et.al.|[2209.06545v3](http://arxiv.org/abs/2209.06545v3)|**[link](https://github.com/ljy-zju/tac2structure)**|\n", "2209.04497": "|**2022-09-09**|**General Place Recognition Survey: Towards the Real-world Autonomy Age**|Peng Yin et.al.|[2209.04497v1](http://arxiv.org/abs/2209.04497v1)|**[link](https://github.com/MetaSLAM/GPRS)**|\n", "2207.10916": "|**2022-07-22**|**PLD-SLAM: A Real-Time Visual SLAM Using Points and Line Segments in Dynamic Scenes**|BaoSheng Zhang et.al.|[2207.10916v1](http://arxiv.org/abs/2207.10916v1)|null|\n", "2207.06965": "|**2022-09-28**|**AutoMerge: A Framework for Map Assembling and Smoothing in City-scale Environments**|Peng Yin et.al.|[2207.06965v3](http://arxiv.org/abs/2207.06965v3)|null|\n", "2207.06738": "|**2022-07-14**|**Semi-supervised Vector-Quantization in Visual SLAM using HGCN**|Amir Zarringhalam et.al.|[2207.06738v1](http://arxiv.org/abs/2207.06738v1)|null|\n", "2207.06732": "|**2022-07-14**|**Self-supervised Vector-Quantization in Visual SLAM using Deep Convolutional Autoencoders**|Amir Zarringhalam et.al.|[2207.06732v1](http://arxiv.org/abs/2207.06732v1)|null|\n", "2206.12628": "|**2022-09-27**|**FreSCo: Frequency-Domain Scan Context for LiDAR-based Place Recognition with Translation and Rotation Invariance**|Yongzhi Fan et.al.|[2206.12628v2](http://arxiv.org/abs/2206.12628v2)|**[link](https://github.com/soytony/fresco)**|\n", "2206.08733": "|**2022-06-17**|**Efficient WiFi LiDAR SLAM for Autonomous Robots in Large Environments**|Khairuldanial Ismail et.al.|[2206.08733v1](http://arxiv.org/abs/2206.08733v1)|null|\n", "2205.13135": "|**2022-07-09**|**LAMP 2.0: A Robust Multi-Robot SLAM System for Operation in Challenging Large-Scale Underground Environments**|Yun Chang et.al.|[2205.13135v3](http://arxiv.org/abs/2205.13135v3)|**[link](https://github.com/nebula-autonomy/nebula-multirobot-dataset)**|\n", "2204.12831": "|**2022-11-09**|**The Revisiting Problem in Simultaneous Localization and Mapping: A Survey on Visual Loop Closure Detection**|Konstantinos A. Tsintotas et.al.|[2204.12831v3](http://arxiv.org/abs/2204.12831v3)|null|\n", "2204.05481": "|**2022-04-12**|**HiTPR: Hierarchical Transformer for Place Recognition in Point Cloud**|Zhixing Hou et.al.|[2204.05481v1](http://arxiv.org/abs/2204.05481v1)|null|\n", "2204.04932": "|**2022-04-11**|**Optimized SC-F-LOAM: Optimized Fast LiDAR Odometry and Mapping Using Scan Context**|Lizhou Liao et.al.|[2204.04932v1](http://arxiv.org/abs/2204.04932v1)|**[link](https://github.com/SlamCabbage/Optimized-SC-F-LOAM)**|\n", "2204.01524": "|**2022-04-01**|**Bi-directional Loop Closure for Visual SLAM**|Ihtisham Ali et.al.|[2204.01524v1](http://arxiv.org/abs/2204.01524v1)|null|\n", "2203.03454": "|**2022-03-07**|**Multi-Modal Lidar Dataset for Benchmarking General-Purpose Localization and Mapping Algorithms**|Qingqing Li et.al.|[2203.03454v1](http://arxiv.org/abs/2203.03454v1)|**[link](https://github.com/tiers/tiers-lidars-dataset)**|\n", "2201.13360": "|**2022-06-20**|**Hydra: A Real-time Spatial Perception System for 3D Scene Graph Construction and Optimization**|Nathan Hughes et.al.|[2201.13360v2](http://arxiv.org/abs/2201.13360v2)|null|\n", "2201.09048": "|**2022-01-22**|**Phase-SLAM: Phase Based Simultaneous Localization and Mapping for Mobile Structured Light Illumination Systems**|Xi Zheng et.al.|[2201.09048v1](http://arxiv.org/abs/2201.09048v1)|**[link](https://github.com/zhengxi-git/phase-slam)**|\n", "2201.03212": "|**2022-01-10**|**Why-So-Deep: Towards Boosting Previously Trained Models for Visual Place Recognition**|M. Usman Maqbool Bhutta et.al.|[2201.03212v1](http://arxiv.org/abs/2201.03212v1)|**[link](https://github.com/UsmanMaqbool/why-so-deep)**|\n", "2111.14990": "|**2021-11-29**|**MIXER: A Principled Framework for Multimodal, Multiway Data Association**|Parker C. Lusk et.al.|[2111.14990v1](http://arxiv.org/abs/2111.14990v1)|null|\n", "2111.13838": "|**2021-11-27**|**DSC: Deep Scan Context Descriptor for Large-Scale Place Recognition**|Jiafeng Cui et.al.|[2111.13838v1](http://arxiv.org/abs/2111.13838v1)|null|\n", "2111.13826": "|**2021-11-27**|**Average Outward Flux Skeletons for Environment Mapping and Topology Matching**|Morteza Rezanejad et.al.|[2111.13826v1](http://arxiv.org/abs/2111.13826v1)|null|\n", "2111.00440": "|**2022-02-27**|**Loop closure detection using local 3D deep descriptors**|Youjie Zhou et.al.|[2111.00440v2](http://arxiv.org/abs/2111.00440v2)|**[link](https://github.com/yiming107/l3d_loop_closure)**|\n", "2110.11491": "|**2021-10-21**|**SymbioLCD: Ensemble-Based Loop Closure Detection using CNN-Extracted Objects and Visual Bag-of-Words**|Jonathan J. Y. Kim et.al.|[2110.11491v1](http://arxiv.org/abs/2110.11491v1)|null|\n", "2109.08975": "|**2022-03-09**|**AirLoop: Lifelong Loop Closure Detection**|Dasong Gao et.al.|[2109.08975v3](http://arxiv.org/abs/2109.08975v3)|**[link](https://github.com/wang-chen/airloop)**|\n", "2109.06596": "|**2021-09-14**|**GPGM-SLAM: a Robust SLAM System for Unstructured Planetary Environments with Gaussian Process Gradient Maps**|Riccardo Giubilato et.al.|[2109.06596v1](http://arxiv.org/abs/2109.06596v1)|null|\n", "2108.12790": "|**2022-08-28**|**RPR-Net: A Point Cloud-based Rotation-aware Large Scale Place Recognition Network**|Zhaoxin Fan et.al.|[2108.12790v3](http://arxiv.org/abs/2108.12790v3)|null|\n", "2108.02028": "|**2021-08-04**|**Incorporating Learnt Local and Global Embeddings into Monocular Visual SLAM**|Huaiyang Huang et.al.|[2108.02028v1](http://arxiv.org/abs/2108.02028v1)|null|\n", "2108.01383": "|**2021-08-03**|**On the descriptive power of LiDAR intensity images for segment-based loop closing in 3-D SLAM**|Jan Wietrzykowski et.al.|[2108.01383v1](http://arxiv.org/abs/2108.01383v1)|**[link](https://github.com/LRMPUT/segmap_vis_views)**|\n", "2107.14611": "|**2021-07-30**|**Automatic Vocabulary and Graph Verification for Accurate Loop Closure Detection**|Haosong Yue et.al.|[2107.14611v1](http://arxiv.org/abs/2107.14611v1)|null|\n", "2107.07707": "|**2021-07-16**|**Probabilistic Appearance-Invariant Topometric Localization with New Place Awareness**|Ming Xu et.al.|[2107.07707v1](http://arxiv.org/abs/2107.07707v1)|**[link](https://github.com/mingu6/TopometricLoc)**|\n", "2107.07133": "|**2021-07-15**|**A life-long SLAM approach using adaptable local maps based on rasterized LIDAR images**|Waqas Ali et.al.|[2107.07133v1](http://arxiv.org/abs/2107.07133v1)|null|\n", "2106.11516": "|**2021-07-01**|**SA-LOAM: Semantic-aided LiDAR SLAM with Loop Closure**|Lin Li et.al.|[2106.11516v2](http://arxiv.org/abs/2106.11516v2)|null|\n", "2106.09637": "|**2023-01-04**|**AttDLNet: Attention-based DL Network for 3D LiDAR Place Recognition**|Tiago Barros et.al.|[2106.09637v4](http://arxiv.org/abs/2106.09637v4)|**[link](https://github.com/cybonic/attdlnet)**|\n", "2105.11344": "|**2021-05-24**|**OverlapNet: Loop Closing for LiDAR-based SLAM**|Xieyuanli Chen et.al.|[2105.11344v1](http://arxiv.org/abs/2105.11344v1)|**[link](https://github.com/PRBonn/OverlapNet)**|\n", "2103.12292": "|**2021-03-23**|**NDT-Transformer: Large-Scale 3D Point Cloud Localisation using the Normal Distribution Transform Representation**|Zhicheng Zhou et.al.|[2103.12292v1](http://arxiv.org/abs/2103.12292v1)|**[link](https://github.com/dachengxiaocheng/NDT-Transformer)**|\n", "2303.00477": "|**2023-03-01**|**ORCHNet: A Robust Global Feature Aggregation approach for 3D LiDAR-based Place recognition in Orchards**|T. Barros et.al.|[2303.00477v1](http://arxiv.org/abs/2303.00477v1)|**[link](https://github.com/cybonic/orchnet)**|\n", "2303.00295": "|**2023-03-01**|**Region Prediction for Efficient Robot Localization on Large Maps**|Matteo Scucchia et.al.|[2303.00295v1](http://arxiv.org/abs/2303.00295v1)|null|\n", "2304.03872": "|**2023-06-24**|**LSGDDN-LCD: An Appearance-based Loop Closure Detection using Local Superpixel Grid Descriptors and Incremental Dynamic Nodes**|Baosheng Zhang et.al.|[2304.03872v2](http://arxiv.org/abs/2304.03872v2)|null|\n", "2304.05146": "|**2023-04-14**|**Loop Closure Detection Based on Object-level Spatial Layout and Semantic Consistency**|Xingwu Ji et.al.|[2304.05146v2](http://arxiv.org/abs/2304.05146v2)|**[link](https://github.com/jixingwu/ss-lcd)**|\n", "2304.13487": "|**2023-04-26**|**Hydra-Multi: Collaborative Online Construction of 3D Scene Graphs with Multi-Robot Teams**|Yun Chang et.al.|[2304.13487v1](http://arxiv.org/abs/2304.13487v1)|null|\n", "2305.07154": "|**2023-05-11**|**Foundations of Spatial Perception for Robotics: Hierarchical Representations and Real-time Systems**|Nathan Hughes et.al.|[2305.07154v1](http://arxiv.org/abs/2305.07154v1)|**[link](https://github.com/mit-spark/hydra)**|\n", "2305.18013": "|**2023-05-29**|**TReR: A Lightweight Transformer Re-Ranking Approach for 3D LiDAR Place Recognition**|Tiago Barros et.al.|[2305.18013v1](http://arxiv.org/abs/2305.18013v1)|null|\n", "2307.04321": "|**2023-07-10**|**RaPlace: Place Recognition for Imaging Radar using Radon Transform and Mutable Threshold**|Hyesu Jang et.al.|[2307.04321v1](http://arxiv.org/abs/2307.04321v1)|**[link](https://github.com/hyesu-jang/raplace)**|\n", "2307.08221": "|**2023-07-17**|**NDT-Map-Code: A 3D global descriptor for real-time loop closure detection in lidar SLAM**|Lizhou Liao et.al.|[2307.08221v1](http://arxiv.org/abs/2307.08221v1)|**[link](https://github.com/SlamCabbage/NDTMC)**|\n", "2307.09044": "|**2023-07-18**|**3D-SeqMOS: A Novel Sequential 3D Moving Object Segmentation in Autonomous Driving**|Qipeng Li et.al.|[2307.09044v1](http://arxiv.org/abs/2307.09044v1)|null|\n", "2309.02394": "|**2023-09-05**|**Magnetic Navigation using Attitude-Invariant Magnetic Field Information for Loop Closure Detection**|Natalia Pavlasek et.al.|[2309.02394v1](http://arxiv.org/abs/2309.02394v1)|null|\n", "2309.07094": "|**2023-09-13**|**RadarLCD: Learnable Radar-based Loop Closure Detection Pipeline**|Mirko Usuelli et.al.|[2309.07094v1](http://arxiv.org/abs/2309.07094v1)|null|\n", "2309.09879": "|**2023-09-18**|**DynaPix SLAM: A Pixel-Based Dynamic SLAM Approach**|Chenghao Xu et.al.|[2309.09879v1](http://arxiv.org/abs/2309.09879v1)|null|\n", "2309.08914": "|**2023-09-16**|**Outram: One-shot Global Localization via Triangulated Scene Graph and Global Outlier Pruning**|Pengyu Yin et.al.|[2309.08914v1](http://arxiv.org/abs/2309.08914v1)|**[link](https://github.com/pamphlett/outram)**|\n"}, "LiDAR SLAM": {"2212.14209": "|**2022-12-29**|**An Enhanced LiDAR-Inertial SLAM System for Robotics Localization and Mapping**|Kangcheng Liu et.al.|[2212.14209v1](http://arxiv.org/abs/2212.14209v1)|**[link](https://github.com/KangchengLiu/slam_resources)**|\n", "2212.05705": "|**2022-12-12**|**An Integrated LiDAR-SLAM System for Complex Environment with Noisy Point Clouds**|Kangcheng Liu et.al.|[2212.05705v1](http://arxiv.org/abs/2212.05705v1)|**[link](https://github.com/KangchengLiu/DLC_LiDAR_SLAM)**|\n", "2212.02077": "|**2022-12-05**|**DL-SLOT: Dynamic LiDAR SLAM and object tracking based on collaborative graph optimization**|Xuebo Tian et.al.|[2212.02077v1](http://arxiv.org/abs/2212.02077v1)|null|\n", "2211.03484": "|**2022-11-07**|**When Geometry is not Enough: Using Reflector Markers in Lidar SLAM**|Gerhard Kurz et.al.|[2211.03484v1](http://arxiv.org/abs/2211.03484v1)|null|\n", "2211.02445": "|**2023-04-14**|**Lidar-level localization with radar? The CFEAR approach to accurate, fast and robust large-scale radar odometry in diverse environments**|Daniel Adolfsson et.al.|[2211.02445v3](http://arxiv.org/abs/2211.02445v3)|**[link](https://github.com/dan11003/CFEAR_Radarodometry_code_public)**|\n", "2210.11978": "|**2023-04-13**|**DCL-SLAM: A Distributed Collaborative LiDAR SLAM Framework for a Robotic Swarm**|Shipeng Zhong et.al.|[2210.11978v2](http://arxiv.org/abs/2210.11978v2)|**[link](https://github.com/pengyu-team/dcl-slam)**|\n", "2210.00812": "|**2022-10-03**|**A Benchmark for Multi-Modal Lidar SLAM with Ground Truth in GNSS-Denied Environments**|Ha Sier et.al.|[2210.00812v1](http://arxiv.org/abs/2210.00812v1)|**[link](https://github.com/tiers/tiers-lidars-dataset-enhanced)**|\n", "2209.08810": "|**2022-09-19**|**LMBAO: A Landmark Map for Bundle Adjustment Odometry in LiDAR SLAM**|Letian Zhang et.al.|[2209.08810v1](http://arxiv.org/abs/2209.08810v1)|null|\n", "2209.08248": "|**2022-09-29**|**PlaneSLAM: Plane-based LiDAR SLAM for Motion Planning in Structured 3D Environments**|Adam Dai et.al.|[2209.08248v2](http://arxiv.org/abs/2209.08248v2)|**[link](https://github.com/stanford-navlab/planeslam)**|\n", "2209.08091": "|**2022-09-16**|**ViWiD: Leveraging WiFi for Robust and Resource-Efficient SLAM**|Aditya Arun et.al.|[2209.08091v1](http://arxiv.org/abs/2209.08091v1)|null|\n", "2208.11855": "|**2022-08-25**|**Lidar SLAM for Autonomous Driving Vehicles**|Farhad Aghili et.al.|[2208.11855v1](http://arxiv.org/abs/2208.11855v1)|null|\n", "2208.09777": "|**2022-09-08**|**JVLDLoc: a Joint Optimization of Visual-LiDAR Constraints and Direction Priors for Localization in Driving Scenario**|Longrui Dong et.al.|[2208.09777v3](http://arxiv.org/abs/2208.09777v3)|null|\n", "2208.07473": "|**2022-11-18**|**BoW3D: Bag of Words for Real-Time Loop Closing in 3D LiDAR SLAM**|Yunge Cui et.al.|[2208.07473v2](http://arxiv.org/abs/2208.07473v2)|**[link](https://github.com/yungecui/bow3d)**|\n", "2207.06815": "|**2022-07-14**|**Challenges of SLAM in extremely unstructured environments: the DLR Planetary Stereo, Solid-State LiDAR, Inertial Dataset**|Riccardo Giubilato et.al.|[2207.06815v1](http://arxiv.org/abs/2207.06815v1)|null|\n", "2206.09463": "|**2022-06-19**|**RF-LIO: Removal-First Tightly-coupled Lidar Inertial Odometry in High Dynamic Environments**|Chenglong Qian et.al.|[2206.09463v1](http://arxiv.org/abs/2206.09463v1)|null|\n", "2206.08733": "|**2022-06-17**|**Efficient WiFi LiDAR SLAM for Autonomous Robots in Large Environments**|Khairuldanial Ismail et.al.|[2206.08733v1](http://arxiv.org/abs/2206.08733v1)|null|\n", "2206.00266": "|**2022-06-01**|**PaGO-LOAM: Robust Ground-Optimized LiDAR Odometry**|Dong-Uk Seo et.al.|[2206.00266v1](http://arxiv.org/abs/2206.00266v1)|**[link](https://github.com/url-kaist/alterground-lego-loam)**|\n", "2205.08556": "|**2022-05-17**|**Global Data Association for SLAM with 3D Grassmannian Manifold Objects**|Parker C. Lusk et.al.|[2205.08556v1](http://arxiv.org/abs/2205.08556v1)|null|\n", "2204.12769": "|**2022-04-27**|**Dynamic Registration: Joint Ego Motion Estimation and 3D Moving Object Detection in Dynamic Environment**|Wenyu Li et.al.|[2204.12769v1](http://arxiv.org/abs/2204.12769v1)|null|\n", "2204.08163": "|**2022-04-18**|**Mapping While Following: 2D LiDAR SLAM in Indoor Dynamic Environments with a Person Tracker**|Hanjing Ye et.al.|[2204.08163v1](http://arxiv.org/abs/2204.08163v1)|null|\n", "2203.13799": "|**2022-03-25**|**Gravity-constrained point cloud registration**|Vladim\u00edr Kubelka et.al.|[2203.13799v1](http://arxiv.org/abs/2203.13799v1)|null|\n", "2202.11431": "|**2022-02-23**|**DL-SLOT: Dynamic Lidar SLAM and Object Tracking Based On Graph Optimization**|Xuebo Tian et.al.|[2202.11431v1](http://arxiv.org/abs/2202.11431v1)|null|\n", "2201.06423": "|**2022-01-17**|**SC-LiDAR-SLAM: a Front-end Agnostic Versatile LiDAR SLAM System**|Giseop Kim et.al.|[2201.06423v1](http://arxiv.org/abs/2201.06423v1)|null|\n", "2110.11517": "|**2021-10-21**|**Real-Time Ground-Plane Refined LiDAR SLAM**|Fan Yang et.al.|[2110.11517v1](http://arxiv.org/abs/2110.11517v1)|null|\n", "2110.02018": "|**2021-10-03**|**AEROS: Adaptive RObust least-Squares for Graph-Based SLAM**|Milad Ramezani et.al.|[2110.02018v1](http://arxiv.org/abs/2110.02018v1)|null|\n", "2109.05483": "|**2021-09-12**|**ART-SLAM: Accurate Real-Time 6DoF LiDAR SLAM**|Matteo Frosi et.al.|[2109.05483v1](http://arxiv.org/abs/2109.05483v1)|**[link](https://github.com/matteof94/artslam)**|\n", "2109.00200": "|**2021-09-01**|**A real-time global re-localization framework for 3D LiDAR SLAM**|Ziqi Chai et.al.|[2109.00200v1](http://arxiv.org/abs/2109.00200v1)|null|\n", "2108.01383": "|**2021-08-03**|**On the descriptive power of LiDAR intensity images for segment-based loop closing in 3-D SLAM**|Jan Wietrzykowski et.al.|[2108.01383v1](http://arxiv.org/abs/2108.01383v1)|**[link](https://github.com/LRMPUT/segmap_vis_views)**|\n", "2107.05283": "|**2021-07-12**|**Benchmark of visual and 3D lidar SLAM systems in simulation environment for vineyards**|Ibrahim Hroob et.al.|[2107.05283v1](http://arxiv.org/abs/2107.05283v1)|null|\n", "2106.11516": "|**2021-07-01**|**SA-LOAM: Semantic-aided LiDAR SLAM with Loop Closure**|Lin Li et.al.|[2106.11516v2](http://arxiv.org/abs/2106.11516v2)|null|\n", "2105.08941": "|**2021-05-19**|**Large-scale Localization Datasets in Crowded Indoor Spaces**|Donghwan Lee et.al.|[2105.08941v1](http://arxiv.org/abs/2105.08941v1)|null|\n", "2105.03296": "|**2021-10-05**|**VIRAL SLAM: Tightly Coupled Camera-IMU-UWB-Lidar SLAM**|Thien-Minh Nguyen et.al.|[2105.03296v3](http://arxiv.org/abs/2105.03296v3)|null|\n", "2104.05347": "|**2021-04-12**|**Radar SLAM: A Robust SLAM System for All Weather Conditions**|Ziyang Hong et.al.|[2104.05347v1](http://arxiv.org/abs/2104.05347v1)|null|\n", "2104.03657": "|**2021-04-08**|**Dynamic Object Aware LiDAR SLAM based on Automatic Generation of Training Data**|Patrick Pfreundschuh et.al.|[2104.03657v1](http://arxiv.org/abs/2104.03657v1)|null|\n", "2103.13090": "|**2021-03-24**|**Greedy-Based Feature Selection for Efficient LiDAR SLAM**|Jianhao Jiao et.al.|[2103.13090v1](http://arxiv.org/abs/2103.13090v1)|null|\n", "2103.10678": "|**2021-03-19**|**6-DOF Feature based LIDAR SLAM using ORB Features from Rasterized Images of 3D LIDAR Point Cloud**|Waqas Ali et.al.|[2103.10678v1](http://arxiv.org/abs/2103.10678v1)|null|\n", "2103.09523": "|**2021-12-30**|**A Universal LiDAR SLAM Accelerator System on Low-cost FPGA**|Keisuke Sugiura et.al.|[2103.09523v2](http://arxiv.org/abs/2103.09523v2)|null|\n", "2103.05056": "|**2022-02-08**|**LCDNet: Deep Loop Closure Detection and Point Cloud Registration for LiDAR SLAM**|Daniele Cattaneo et.al.|[2103.05056v4](http://arxiv.org/abs/2103.05056v4)|**[link](https://github.com/robot-learning-freiburg/LCDNet)**|\n", "2103.03713": "|**2021-03-05**|**Ground-SLAM: Ground Constrained LiDAR SLAM for Structured Multi-Floor Environments**|Xin Wei et.al.|[2103.03713v1](http://arxiv.org/abs/2103.03713v1)|null|\n", "2102.03800": "|**2021-02-17**|**Lightweight 3-D Localization and Mapping for Solid-State LiDAR**|Han Wang et.al.|[2102.03800v2](http://arxiv.org/abs/2102.03800v2)|**[link](https://github.com/wh200720041/SSL_SLAM)**|\n", "2102.03798": "|**2021-02-17**|**Intensity-SLAM: Intensity Assisted Localization and Mapping for Large Scale Environment**|Han Wang et.al.|[2102.03798v2](http://arxiv.org/abs/2102.03798v2)|**[link](https://github.com/wh200720041/intensity_slam)**|\n", "2102.03771": "|**2021-04-27**|**MULLS: Versatile LiDAR SLAM via Multi-metric Linear Least Square**|Yue Pan et.al.|[2102.03771v3](http://arxiv.org/abs/2102.03771v3)|**[link](https://github.com/YuePanEdward/MULLS)**|\n", "2101.06615": "|**2021-05-31**|**Online Robust Sliding-Windowed LiDAR SLAM in Natural Environments**|Quang-Ha Pham et.al.|[2101.06615v6](http://arxiv.org/abs/2101.06615v6)|null|\n", "2012.03455": "|**2020-12-07**|**TP-TIO: A Robust Thermal-Inertial Odometry with Deep ThermalPoint**|Shibo Zhao et.al.|[2012.03455v1](http://arxiv.org/abs/2012.03455v1)|null|\n", "2012.02399": "|**2020-12-04**|**P3-LOAM: PPP/LiDAR Loosely Coupled SLAM with Accurate Covariance Estimation and Robust RAIM in Urban Canyon Environment**|Tao Li et.al.|[2012.02399v1](http://arxiv.org/abs/2012.02399v1)|null|\n", "2011.11357": "|**2020-11-23**|**CamVox: A Low-cost and Accurate Lidar-assisted Visual SLAM System**|Yuewen Zhu et.al.|[2011.11357v1](http://arxiv.org/abs/2011.11357v1)|**[link](https://github.com/ISEE-Technology/CamVox)**|\n", "2011.02306": "|**2021-09-11**|**A Comparison of LiDAR-based SLAM Systems for Control of Unmanned Aerial Vehicles**|Robert Milijas et.al.|[2011.02306v3](http://arxiv.org/abs/2011.02306v3)|null|\n", "2010.08215": "|**2021-01-13**|**BALM: Bundle Adjustment for Lidar Mapping**|Zheng Liu et.al.|[2010.08215v2](http://arxiv.org/abs/2010.08215v2)|**[link](https://github.com/hku-mars/BALM)**|\n", "2008.03694": "|**2020-08-09**|**LiDAR Data Enrichment Using Deep Learning Based on High-Resolution Image: An Approach to Achieve High-Performance LiDAR SLAM Using Low-cost LiDAR**|Jiang Yue et.al.|[2008.03694v1](http://arxiv.org/abs/2008.03694v1)|null|\n", "2008.02274": "|**2020-08-05**|**Elasticity Meets Continuous-Time: Map-Centric Dense 3D LiDAR SLAM**|Chanoh Park et.al.|[2008.02274v1](http://arxiv.org/abs/2008.02274v1)|null|\n", "2302.13613": "|**2023-03-13**|**Evaluation of Lidar-based 3D SLAM algorithms in SubT environment**|Anton Koval et.al.|[2302.13613v2](http://arxiv.org/abs/2302.13613v2)|null|\n", "2303.01155": "|**2023-04-07**|**Marker-based Visual SLAM leveraging Hierarchical Representations**|Ali Tourani et.al.|[2303.01155v2](http://arxiv.org/abs/2303.01155v2)|null|\n", "2303.05252": "|**2023-03-09**|**SLAMesh: Real-time LiDAR Simultaneous Localization and Meshing**|Jianyuan Ruan et.al.|[2303.05252v1](http://arxiv.org/abs/2303.05252v1)|**[link](https://github.com/RuanJY/SLAMesh)**|\n", "2305.01843": "|**2023-05-03**|**Direct LiDAR-Inertial Odometry and Mapping: Perceptive and Connective SLAM**|Kenny Chen et.al.|[2305.01843v1](http://arxiv.org/abs/2305.01843v1)|null|\n", "2306.03660": "|**2023-06-06**|**PQM: A Point Quality Evaluation Metric for Dense Maps**|Yash Turkar et.al.|[2306.03660v1](http://arxiv.org/abs/2306.03660v1)|**[link](https://github.com/droneslab/pqm-sim)**|\n", "2307.08221": "|**2023-07-17**|**NDT-Map-Code: A 3D global descriptor for real-time loop closure detection in lidar SLAM**|Lizhou Liao et.al.|[2307.08221v1](http://arxiv.org/abs/2307.08221v1)|**[link](https://github.com/SlamCabbage/NDTMC)**|\n", "2307.09044": "|**2023-07-18**|**3D-SeqMOS: A Novel Sequential 3D Moving Object Segmentation in Autonomous Driving**|Qipeng Li et.al.|[2307.09044v1](http://arxiv.org/abs/2307.09044v1)|null|\n", "2307.15005": "|**2023-07-27**|**FLiCR: A Fast and Lightweight LiDAR Point Cloud Compression Based on Lossy RI**|Jin Heo et.al.|[2307.15005v1](http://arxiv.org/abs/2307.15005v1)|null|\n", "2309.04937": "|**2023-09-12**|**LONER: LiDAR Only Neural Representations for Real-Time SLAM**|Seth Isaacson et.al.|[2309.04937v2](http://arxiv.org/abs/2309.04937v2)|null|\n", "2309.08086": "|**2023-09-15**|**Fast and Accurate Deep Loop Closing and Relocalization for Reliable LiDAR SLAM**|Chenghao Shi et.al.|[2309.08086v1](http://arxiv.org/abs/2309.08086v1)|null|\n", "2311.00928": "|**2023-11-02**|**Quatro++: Robust Global Registration Exploiting Ground Segmentation for Loop Closing in LiDAR SLAM**|Hyungtae Lim et.al.|[2311.00928v1](http://arxiv.org/abs/2311.00928v1)|null|\n", "2311.02327": "|**2023-11-04**|**ECMD: An Event-Centric Multisensory Driving Dataset for SLAM**|Peiyu Chen et.al.|[2311.02327v1](http://arxiv.org/abs/2311.02327v1)|null|\n"}, "Transformer": {"2302.08104": "|**2023-02-16**|**Multiscalar field cosmological model and possible solutions using Noether symmetry approach**|Santu Mondal et.al.|[2302.08104v1](http://arxiv.org/abs/2302.08104v1)|null|\n", "2301.11622": "|**2023-01-30**|**Darboux transformations for Dunkl-Schroedinger equations with energy dependent potential and position dependent mass**|Axel Schulze-Halberg et.al.|[2301.11622v2](http://arxiv.org/abs/2301.11622v2)|null|\n", "2301.09364": "|**2023-04-06**|**On uniqueness of submaximally symmetric vector ordinary differential equations of C-class**|Johnson Allen Kessy et.al.|[2301.09364v2](http://arxiv.org/abs/2301.09364v2)|null|\n", "2301.08739": "|**2023-03-30**|**FlatFormer: Flattened Window Attention for Efficient Point Cloud Transformer**|Zhijian Liu et.al.|[2301.08739v2](http://arxiv.org/abs/2301.08739v2)|null|\n", "2301.07301": "|**2023-01-18**|**PTA-Det: Point Transformer Associating Point cloud and Image for 3D Object Detection**|Rui Wan et.al.|[2301.07301v1](http://arxiv.org/abs/2301.07301v1)|null|\n", "2301.02650": "|**2023-01-06**|**Model-Agnostic Hierarchical Attention for 3D Object Detection**|Manli Shu et.al.|[2301.02650v1](http://arxiv.org/abs/2301.02650v1)|null|\n", "2212.13736": "|**2022-12-28**|**Hermitian Topologies originating from non-Hermitian braidings**|W. B. Rui et.al.|[2212.13736v1](http://arxiv.org/abs/2212.13736v1)|null|\n", "2212.13276": "|**2022-12-26**|**Generalization of non-Cartan Symmetries to arbitrary dimensions**|J. C. Ndogmo et.al.|[2212.13276v1](http://arxiv.org/abs/2212.13276v1)|null|\n", "2212.13244": "|**2022-12-26**|**Equivalence classes and Linearization of the Riccati and Abel chain**|J. C. Ndogmo et.al.|[2212.13244v1](http://arxiv.org/abs/2212.13244v1)|null|\n", "2211.12510": "|**2022-11-22**|**Reconstructing the Image Scanning Microscopy Dataset: an Inverse Problem**|Alessandro Zunino et.al.|[2211.12510v1](http://arxiv.org/abs/2211.12510v1)|null|\n", "2211.02079": "|**2022-11-03**|**On Darboux non-integrability of the Hietarinta equation**|S. Ya. Startsev et.al.|[2211.02079v1](http://arxiv.org/abs/2211.02079v1)|null|\n", "2210.15933": "|**2022-10-28**|**PSFormer: Point Transformer for 3D Salient Object Detection**|Baian Chen et.al.|[2210.15933v1](http://arxiv.org/abs/2210.15933v1)|null|\n", "2210.06668": "|**2022-11-05**|**Aspects of the Equivalence Between the $f^\u03bc$ and $c^{\u03bd\u03bc}$ Terms in Lorentz-Violating Quantum Field Theory**|Sapan Karki et.al.|[2210.06668v2](http://arxiv.org/abs/2210.06668v2)|null|\n", "2210.05666": "|**2022-10-12**|**Point Transformer V2: Grouped Vector Attention and Partition-based Pooling**|Xiaoyang Wu et.al.|[2210.05666v2](http://arxiv.org/abs/2210.05666v2)|**[link](https://github.com/gofinge/pointtransformerv2)**|\n", "2209.11255": "|**2022-09-21**|**3DPCT: 3D Point Cloud Transformer with Dual Self-attention**|Dening Lu et.al.|[2209.11255v1](http://arxiv.org/abs/2209.11255v1)|null|\n", "2208.10395": "|**2022-08-22**|**Symmetry Classification of Scalar $n$th Order Ordinary Differential Equations**|Said Waqas Shah et.al.|[2208.10395v1](http://arxiv.org/abs/2208.10395v1)|null|\n", "2208.00281": "|**2022-12-20**|**Point Primitive Transformer for Long-Term 4D Point Cloud Video Understanding**|Hao Wen et.al.|[2208.00281v2](http://arxiv.org/abs/2208.00281v2)|**[link](https://github.com/hoi4d/PPTr)**|\n", "2207.13226": "|**2022-08-15**|**Boosting Point-BERT by Multi-choice Tokens**|Kexue Fu et.al.|[2207.13226v2](http://arxiv.org/abs/2207.13226v2)|**[link](https://github.com/fukexue/mcp-bert)**|\n", "2207.11995": "|**2022-07-26**|**3D Siamese Transformer Network for Single Object Tracking on Point Clouds**|Le Hui et.al.|[2207.11995v2](http://arxiv.org/abs/2207.11995v2)|**[link](https://github.com/fpthink/stnet)**|\n", "2207.10994": "|**2022-07-22**|**Learning Generalized Non-Rigid Multimodal Biomedical Image Registration from Generic Point Set Data**|Zachary MC Baum et.al.|[2207.10994v1](http://arxiv.org/abs/2207.10994v1)|null|\n", "2207.08575": "|**2022-07-18**|**Anisotropic spacetimes in $f(T,B)$ theory IV: Noether symmetry analysis**|Andronikos Paliathanasis et.al.|[2207.08575v1](http://arxiv.org/abs/2207.08575v1)|null|\n", "2206.15191": "|**2022-06-30**|**Lewis-Riesenfeld invariants for PT-symmetrically coupled oscillators from two dimensional point transformations and Lie algebraic expansions**|Andreas Fring et.al.|[2206.15191v1](http://arxiv.org/abs/2206.15191v1)|null|\n", "2206.04670": "|**2022-10-12**|**PointNeXt: Revisiting PointNet++ with Improved Training and Scaling Strategies**|Guocheng Qian et.al.|[2206.04670v2](http://arxiv.org/abs/2206.04670v2)|**[link](https://github.com/guochengqian/pointnext)**|\n", "2206.04511": "|**2022-08-29**|**Efficient Human Pose Estimation via 3D Event Point Cloud**|Jiaan Chen et.al.|[2206.04511v2](http://arxiv.org/abs/2206.04511v2)|**[link](https://github.com/masterhow/eventpointpose)**|\n", "2205.08886": "|**2022-05-18**|**GeoPointGAN: Synthetic Spatial Data with Local Label Differential Privacy**|Teddy Cunningham et.al.|[2205.08886v1](http://arxiv.org/abs/2205.08886v1)|**[link](https://github.com/konstantinklemmer/geopointgan)**|\n", "2204.03957": "|**2022-04-08**|**Points to Patches: Enabling the Use of Self-Attention for 3D Shape Recognition**|Axel Berg et.al.|[2204.03957v1](http://arxiv.org/abs/2204.03957v1)|**[link](https://github.com/axeber01/point-tnt)**|\n", "2203.12758": "|**2022-03-23**|**Mokey: Enabling Narrow Fixed-Point Inference for Out-of-the-Box Floating-Point Transformer Models**|Ali Hadi Zadeh et.al.|[2203.12758v1](http://arxiv.org/abs/2203.12758v1)|null|\n", "2203.04007": "|**2022-08-31**|**DuMLP-Pin: A Dual-MLP-dot-product Permutation-invariant Network for Set Feature Extraction**|Jiajun Fei et.al.|[2203.04007v2](http://arxiv.org/abs/2203.04007v2)|**[link](https://github.com/jaronthu/dumlp-pin)**|\n", "2203.00972": "|**2022-04-07**|**Improving Point Cloud Based Place Recognition with Ranking-based Loss and Large Batch Training**|Jacek Komorowski et.al.|[2203.00972v2](http://arxiv.org/abs/2203.00972v2)|**[link](https://github.com/jac99/minkloc3dv2)**|\n", "2201.05140": "|**2022-01-13**|**An introduction to PT-symmetric quantum mechanics -- time-dependent systems**|Andreas Fring et.al.|[2201.05140v1](http://arxiv.org/abs/2201.05140v1)|null|\n", "2112.13725": "|**2021-12-27**|**Near-Optimal Bounds for Generalized Orthogonal Procrustes Problem via Generalized Power Method**|Shuyang Ling et.al.|[2112.13725v1](http://arxiv.org/abs/2112.13725v1)|null|\n", "2112.11959": "|**2021-12-22**|**Dynamics of a symmetrically decoupled three-dimensional point transformation**|Hacene Gharout et.al.|[2112.11959v1](http://arxiv.org/abs/2112.11959v1)|null|\n", "2112.05635": "|**2021-12-10**|**Geometry of inhomogeneous Poisson brackets, multicomponent Harry Dym hierarchies and multicomponent Hunter-Saxton equations**|Andrey Yu. Konyaev et.al.|[2112.05635v1](http://arxiv.org/abs/2112.05635v1)|null|\n", "2112.04863": "|**2021-12-17**|**3D Medical Point Transformer: Introducing Convolution to Attention Networks for Medical Point Cloud Analysis**|Jianhui Yu et.al.|[2112.04863v2](http://arxiv.org/abs/2112.04863v2)|**[link](https://github.com/crane-papercode/3dmedpt)**|\n", "2112.04702": "|**2022-04-04**|**Fast Point Transformer**|Chunghyun Park et.al.|[2112.04702v2](http://arxiv.org/abs/2112.04702v2)|**[link](https://github.com/POSTECH-CVLab/FastPointTransformer)**|\n", "2111.14819": "|**2022-06-06**|**Point-BERT: Pre-training 3D Point Cloud Transformers with Masked Point Modeling**|Xumin Yu et.al.|[2111.14819v2](http://arxiv.org/abs/2111.14819v2)|**[link](https://github.com/lulutang0608/Point-BERT)**|\n", "2111.14451": "|**2022-03-31**|**HDR-NeRF: High Dynamic Range Neural Radiance Fields**|Xin Huang et.al.|[2111.14451v3](http://arxiv.org/abs/2111.14451v3)|null|\n", "2111.13702": "|**2022-12-12**|**The Information Content of Projected Galaxy Fields**|Lucas Porth et.al.|[2111.13702v2](http://arxiv.org/abs/2111.13702v2)|null|\n", "2111.10866": "|**2021-11-21**|**CpT: Convolutional Point Transformer for 3D Point Cloud Processing**|Chaitanya Kaul et.al.|[2111.10866v1](http://arxiv.org/abs/2111.10866v1)|null|\n", "2111.08973": "|**2021-11-19**|**Generating Unrestricted 3D Adversarial Point Clouds**|Xuelong Dai et.al.|[2111.08973v2](http://arxiv.org/abs/2111.08973v2)|**[link](https://github.com/EricDai0/AdvGCGAN)**|\n", "2111.00207": "|**2022-03-24**|**PatchFormer: An Efficient Point Transformer with Patch Attention**|Zhang Cheng et.al.|[2111.00207v3](http://arxiv.org/abs/2111.00207v3)|null|\n", "2110.05609": "|**2021-11-03**|**Comparison between time-independent and time-dependent quantum systems in the context of energy, Heisenberg uncertainty, average energy, force, average force and thermodynamic quantities**|Debraj Nath et.al.|[2110.05609v2](http://arxiv.org/abs/2110.05609v2)|null|\n", "2110.09230": "|**2021-10-07**|**A study on the Friedmann like Universe with Torsion using Noether Symmetry**|Ramkumar Radhakrishnan et.al.|[2110.09230v1](http://arxiv.org/abs/2110.09230v1)|null|\n", "2109.05023": "|**2021-09-20**|**Real-time multimodal image registration with partial intraoperative point-set data**|Zachary M C Baum et.al.|[2109.05023v2](http://arxiv.org/abs/2109.05023v2)|null|\n", "2109.02107": "|**2021-09-05**|**Normal Forms of second order Ordinary Differential Equations $y_{xx}=J(x,y,y_{x})$ under Fibre-Preserving Maps**|Wei Guo Foo et.al.|[2109.02107v1](http://arxiv.org/abs/2109.02107v1)|null|\n", "2108.08958": "|**2021-08-20**|**Exact solutions for time-dependent non-Hermitian oscillators: classical and quantum pictures**|Kevin Zelaya et.al.|[2108.08958v1](http://arxiv.org/abs/2108.08958v1)|null|\n", "2108.08891": "|**2021-08-19**|**Neural TMDlayer: Modeling Instantaneous flow of features via SDE Generators**|Zihang Meng et.al.|[2108.08891v1](http://arxiv.org/abs/2108.08891v1)|**[link](https://github.com/zihangm/neural-tmd-layer)**|\n", "2108.06076": "|**2022-05-25**|**PVT: Point-Voxel Transformer for Point Cloud Learning**|Cheng Zhang et.al.|[2108.06076v4](http://arxiv.org/abs/2108.06076v4)|**[link](https://github.com/HaochengWan/PVT)**|\n", "2108.00620": "|**2021-10-14**|**Investigating Attention Mechanism in 3D Point Cloud Object Detection**|Shi Qiu et.al.|[2108.00620v2](http://arxiv.org/abs/2108.00620v2)|**[link](https://github.com/ShiQiu0419/attentions_in_3D_detection)**|\n", "2107.14144": "|**2021-07-29**|**Reduction of balance laws in (3+1)--dimensions to autonomous conservation laws by means of equivalence transformations**|Matteo Gorgone et.al.|[2107.14144v1](http://arxiv.org/abs/2107.14144v1)|null|\n", "2303.01166": "|**2023-03-02**|**BPT: Binary Point Cloud Transformer for Place Recognition**|Zhixing Hou et.al.|[2303.01166v1](http://arxiv.org/abs/2303.01166v1)|null|\n", "2303.04458": "|**2023-03-08**|**Full Point Encoding for Local Feature Aggregation in 3D Point Clouds**|Yong He et.al.|[2303.04458v1](http://arxiv.org/abs/2303.04458v1)|null|\n", "2303.07766": "|**2023-03-14**|**Classical and quantum cosmology in $f(T)$-gravity theory: A Noether symmetry approach**|Roshni Bhaumik et.al.|[2303.07766v1](http://arxiv.org/abs/2303.07766v1)|null|\n", "2303.08274": "|**2023-03-14**|**GeoSpark: Sparking up Point Cloud Segmentation with Geometry Clue**|Zhening Huang et.al.|[2303.08274v1](http://arxiv.org/abs/2303.08274v1)|null|\n", "2303.15320": "|**2023-03-22**|**Noether's theorem and Lie symmetries for time-dependent Hamilton-Lagrange systems**|J\u00fcrgen Struckmeier et.al.|[2303.15320v1](http://arxiv.org/abs/2303.15320v1)|null|\n", "2303.17815": "|**2023-03-31**|**APPT : Asymmetric Parallel Point Transformer for 3D Point Cloud Understanding**|Hengjia Li et.al.|[2303.17815v1](http://arxiv.org/abs/2303.17815v1)|null|\n", "2304.02013": "|**2023-09-01**|**NPC: Neural Point Characters from Video**|Shih-Yang Su et.al.|[2304.02013v2](http://arxiv.org/abs/2304.02013v2)|null|\n", "2304.08279": "|**2023-05-27**|**MoDA: Modeling Deformable 3D Objects from Casual Videos**|Chaoyue Song et.al.|[2304.08279v2](http://arxiv.org/abs/2304.08279v2)|**[link](https://github.com/chaoyuesong/moda)**|\n", "2304.08681": "|**2023-09-07**|**The integer point transform as a complete invariant**|Sinai Robins et.al.|[2304.08681v4](http://arxiv.org/abs/2304.08681v4)|null|\n", "2304.14132": "|**2023-04-28**|**Human Semantic Segmentation using Millimeter-Wave Radar Sparse Point Clouds**|Pengfei Song et.al.|[2304.14132v2](http://arxiv.org/abs/2304.14132v2)|null|\n", "2305.00773": "|**2023-05-01**|**Point Cloud Semantic Segmentation**|Ivan Martinovi\u0107 et.al.|[2305.00773v1](http://arxiv.org/abs/2305.00773v1)|null|\n", "2305.03045": "|**2023-05-08**|**OctFormer: Octree-based Transformers for 3D Point Clouds**|Peng-Shuai Wang et.al.|[2305.03045v2](http://arxiv.org/abs/2305.03045v2)|**[link](https://github.com/octree-nn/octformer)**|\n", "2305.02533": "|**2023-05-04**|**Point Transformer For Coronary Artery Labeling**|Xu Wang et.al.|[2305.02533v1](http://arxiv.org/abs/2305.02533v1)|null|\n", "2306.10759": "|**2023-10-31**|**Simplifying and Empowering Transformers for Large-Graph Representations**|Qitian Wu et.al.|[2306.10759v3](http://arxiv.org/abs/2306.10759v3)|**[link](https://github.com/qitianwu/sgformer)**|\n", "2306.12361": "|**2023-06-21**|**Sigma-point Kalman Filter with Nonlinear Unknown Input Estimation via Optimization and Data-driven Approach for Dynamic Systems**|Junn Yong Loo et.al.|[2306.12361v1](http://arxiv.org/abs/2306.12361v1)|null|\n", "2306.10798": "|**2023-06-23**|**ExpPoint-MAE: Better interpretability and performance for self-supervised point cloud transformers**|Ioannis Romanelis et.al.|[2306.10798v2](http://arxiv.org/abs/2306.10798v2)|**[link](https://github.com/vvrpanda/exppoint-mae)**|\n", "2307.04723": "|**2023-07-18**|**Quark/Gluon Discrimination and Top Tagging with Dual Attention Transformer**|Minxuan He et.al.|[2307.04723v2](http://arxiv.org/abs/2307.04723v2)|null|\n", "2307.11973": "|**2023-07-22**|**Two-stream Multi-level Dynamic Point Transformer for Two-person Interaction Recognition**|Yao Liu et.al.|[2307.11973v1](http://arxiv.org/abs/2307.11973v1)|null|\n", "2308.04637": "|**2023-08-09**|**Sparse Binary Transformers for Multivariate Time Series Modeling**|Matt Gorbett et.al.|[2308.04637v1](http://arxiv.org/abs/2308.04637v1)|null|\n", "2308.09403": "|**2023-08-18**|**Target Clustering Based Multi-Bernoulli Filter for Superpositional Sensors**|Wang Sen et.al.|[2308.09403v1](http://arxiv.org/abs/2308.09403v1)|null|\n", "2309.00339": "|**2023-09-01**|**Robust Point Cloud Processing through Positional Embedding**|Jianqiao Zheng et.al.|[2309.00339v1](http://arxiv.org/abs/2309.00339v1)|null|\n", "2309.04105": "|**2023-09-08**|**Weakly Supervised Point Clouds Transformer for 3D Object Detection**|Zuojin Tang et.al.|[2309.04105v1](http://arxiv.org/abs/2309.04105v1)|null|\n", "2310.01545": "|**2023-10-02**|**RF-ULM: Deep Learning for Radio-Frequency Ultrasound Localization Microscopy**|Christopher Hahne et.al.|[2310.01545v1](http://arxiv.org/abs/2310.01545v1)|**[link](https://github.com/hahnec/rf-ulm)**|\n", "2310.05780": "|**2023-10-09**|**Lie symmetries for the cosmological field equations in brane-world gravity with bulk scalar field**|Andronikos Paliathanasis et.al.|[2310.05780v1](http://arxiv.org/abs/2310.05780v1)|null|\n", "2310.16861": "|**2023-10-25**|**General Point Model with Autoencoding and Autoregressive**|Zhe Li et.al.|[2310.16861v1](http://arxiv.org/abs/2310.16861v1)|null|\n", "2310.19772": "|**2023-10-22**|**Exact FLRW cosmological solutions via invariants of the symmetry groups**|E. Ahmadi Azar et.al.|[2310.19772v1](http://arxiv.org/abs/2310.19772v1)|null|\n", "2311.04081": "|**2023-11-07**|**Learning Super-Resolution Ultrasound Localization Microscopy from Radio-Frequency Data**|Christopher Hahne et.al.|[2311.04081v1](http://arxiv.org/abs/2311.04081v1)|null|\n"}, "NeRF": {"2302.12237": "|**2023-02-24**|**Learning Neural Volumetric Representations of Dynamic Humans in Minutes**|Chen Geng et.al.|[2302.12237v2](http://arxiv.org/abs/2302.12237v2)|**[link](https://github.com/zju3dv/instant-nvr)**|\n", "2302.12231": "|**2023-02-23**|**DiffusioNeRF: Regularizing Neural Radiance Fields with Denoising Diffusion Models**|Jamie Wynn et.al.|[2302.12231v1](http://arxiv.org/abs/2302.12231v1)|**[link](https://github.com/nianticlabs/diffusionerf)**|\n", "2302.10109": "|**2023-02-20**|**NerfDiff: Single-image View Synthesis with NeRF-guided Distillation from 3D-aware Diffusion**|Jiatao Gu et.al.|[2302.10109v1](http://arxiv.org/abs/2302.10109v1)|null|\n", "2302.09486": "|**2023-02-19**|**LC-NeRF: Local Controllable Face Generation in Neural Randiance Field**|Wenyang Zhou et.al.|[2302.09486v1](http://arxiv.org/abs/2302.09486v1)|null|\n", "2302.08788": "|**2023-02-17**|**MixNeRF: Modeling a Ray with Mixture Density for Novel View Synthesis from Sparse Inputs**|Seunghyeon Seo et.al.|[2302.08788v1](http://arxiv.org/abs/2302.08788v1)|**[link](https://github.com/shawn615/MixNeRF)**|\n", "2302.06833": "|**2023-02-14**|**VQ3D: Learning a 3D-Aware Generative Model on ImageNet**|Kyle Sargent et.al.|[2302.06833v1](http://arxiv.org/abs/2302.06833v1)|null|\n", "2302.06608": "|**2023-02-13**|**3D-aware Blending with Generative NeRFs**|Hyunsu Kim et.al.|[2302.06608v1](http://arxiv.org/abs/2302.06608v1)|**[link](https://github.com/naver-ai/BlendNeRF)**|\n", "2302.05573": "|**2023-02-11**|**3D Colored Shape Reconstruction from a Single RGB Image through Diffusion**|Bo Li et.al.|[2302.05573v1](http://arxiv.org/abs/2302.05573v1)|null|\n", "2302.04264": "|**2023-02-08**|**Nerfstudio: A Modular Framework for Neural Radiance Field Development**|Matthew Tancik et.al.|[2302.04264v1](http://arxiv.org/abs/2302.04264v1)|null|\n", "2302.02088": "|**2023-02-07**|**AV-NeRF: Learning Neural Fields for Real-World Audio-Visual Scene Synthesis**|Susan Liang et.al.|[2302.02088v2](http://arxiv.org/abs/2302.02088v2)|null|\n", "2302.01579": "|**2023-02-03**|**Semantic 3D-aware Portrait Synthesis and Manipulation Based on Compositional Neural Radiance Field**|Tianxiang Ma et.al.|[2302.01579v1](http://arxiv.org/abs/2302.01579v1)|**[link](https://github.com/tianxiangma/cnerf)**|\n", "2302.01571": "|**2023-02-03**|**Robust Camera Pose Refinement for Multi-Resolution Hash Encoding**|Hwan Heo et.al.|[2302.01571v1](http://arxiv.org/abs/2302.01571v1)|null|\n", "2302.01532": "|**2023-02-03**|**INV: Towards Streaming Incremental Neural Videos**|Shengze Wang et.al.|[2302.01532v1](http://arxiv.org/abs/2302.01532v1)|null|\n", "2302.01226": "|**2023-02-02**|**Factor Fields: A Unified Framework for Neural Fields and Beyond**|Anpei Chen et.al.|[2302.01226v1](http://arxiv.org/abs/2302.01226v1)|null|\n", "2302.00833": "|**2023-02-02**|**RobustNeRF: Ignoring Distractors with Robust Losses**|Sara Sabour et.al.|[2302.00833v1](http://arxiv.org/abs/2302.00833v1)|null|\n", "2301.13430": "|**2023-01-31**|**GeneFace: Generalized and High-Fidelity Audio-Driven 3D Talking Face Synthesis**|Zhenhui Ye et.al.|[2301.13430v1](http://arxiv.org/abs/2301.13430v1)|null|\n", "2301.12780": "|**2023-01-30**|**Equivariant Architectures for Learning in Deep Weight Spaces**|Aviv Navon et.al.|[2301.12780v1](http://arxiv.org/abs/2301.12780v1)|**[link](https://github.com/AvivNavon/DWSNets)**|\n", "2301.11631": "|**2023-01-27**|**HyperNeRFGAN: Hypernetwork approach to 3D NeRF GAN**|Adam Kania et.al.|[2301.11631v1](http://arxiv.org/abs/2301.11631v1)|**[link](https://github.com/gmum/hypernerfgan)**|\n", "2301.11522": "|**2023-01-27**|**A Comparison of Tiny-nerf versus Spatial Representations for 3d Reconstruction**|Saulo Abraham Gante et.al.|[2301.11522v1](http://arxiv.org/abs/2301.11522v1)|null|\n", "2301.11520": "|**2023-01-27**|**SNeRL: Semantic-aware Neural Radiance Fields for Reinforcement Learning**|Dongseok Shim et.al.|[2301.11520v1](http://arxiv.org/abs/2301.11520v1)|null|\n", "2301.11280": "|**2023-01-26**|**Text-To-4D Dynamic Scene Generation**|Uriel Singer et.al.|[2301.11280v1](http://arxiv.org/abs/2301.11280v1)|null|\n", "2301.10941": "|**2023-01-26**|**GeCoNeRF: Few-shot Neural Radiance Fields via Geometric Consistency**|Minseop Kwak et.al.|[2301.10941v1](http://arxiv.org/abs/2301.10941v1)|**[link](https://github.com/KU-CVLAB/GeCoNeRF)**|\n", "2301.09632": "|**2023-01-23**|**HexPlane: A Fast Representation for Dynamic Scenes**|Ang Cao et.al.|[2301.09632v1](http://arxiv.org/abs/2301.09632v1)|**[link](https://github.com/Caoang327/HexPlane)**|\n", "2301.09060": "|**2023-02-02**|**3D Reconstruction of Non-cooperative Resident Space Objects using Instant NGP-accelerated NeRF and D-NeRF**|Trupti Mahendrakar et.al.|[2301.09060v2](http://arxiv.org/abs/2301.09060v2)|null|\n", "2301.07958": "|**2023-02-05**|**RecolorNeRF: Layer Decomposed Radiance Fields for Efficient Color Editing of 3D Scenes**|Bingchen Gong et.al.|[2301.07958v2](http://arxiv.org/abs/2301.07958v2)|null|\n", "2301.08556": "|**2023-01-18**|**NeRF in the Palm of Your Hand: Corrective Augmentation for Robotics via Novel-View Synthesis**|Allan Zhou et.al.|[2301.08556v1](http://arxiv.org/abs/2301.08556v1)|null|\n", "2301.07668": "|**2023-01-18**|**Behind the Scenes: Density Fields for Single View Reconstruction**|Felix Wimbauer et.al.|[2301.07668v1](http://arxiv.org/abs/2301.07668v1)|**[link](https://github.com/Brummi/BehindTheScenes)**|\n", "2301.06782": "|**2023-01-17**|**A Large-Scale Outdoor Multi-modal Dataset and Benchmark for Novel View Synthesis and Implicit Scene Reconstruction**|Chongshan Lu et.al.|[2301.06782v1](http://arxiv.org/abs/2301.06782v1)|null|\n", "2301.05747": "|**2023-01-13**|**Laser: Latent Set Representations for 3D Generative Modeling**|Pol Moreno et.al.|[2301.05747v1](http://arxiv.org/abs/2301.05747v1)|null|\n", "2301.04075": "|**2023-01-10**|**Benchmarking Robustness in Neural Radiance Fields**|Chen Wang et.al.|[2301.04075v1](http://arxiv.org/abs/2301.04075v1)|null|\n", "2301.03102": "|**2023-01-08**|**Towards Open World NeRF-Based SLAM**|Daniil Lisus et.al.|[2301.03102v1](http://arxiv.org/abs/2301.03102v1)|null|\n", "2301.02975": "|**2023-01-10**|**Traditional Readability Formulas Compared for English**|Bruce W. Lee et.al.|[2301.02975v2](http://arxiv.org/abs/2301.02975v2)|null|\n", "2301.00950": "|**2023-01-09**|**Class-Continuous Conditional Generative Neural Radiance Field**|Jiwook Kim et.al.|[2301.00950v2](http://arxiv.org/abs/2301.00950v2)|**[link](https://github.com/tom919654/C3G-NeRF)**|\n", "2301.00411": "|**2023-01-11**|**Detachable Novel Views Synthesis of Dynamic Scenes Using Distribution-Driven Neural Radiance Fields**|Boyu Zhang et.al.|[2301.00411v2](http://arxiv.org/abs/2301.00411v2)|**[link](https://github.com/luciferbobo/d4nerf)**|\n", "2212.13056": "|**2022-12-26**|**MonoNeRF: Learning a Generalizable Dynamic Radiance Field from Monocular Videos**|Fengrui Tian et.al.|[2212.13056v1](http://arxiv.org/abs/2212.13056v1)|**[link](https://github.com/tianfr/mononerf)**|\n", "2212.12871": "|**2022-12-25**|**PaletteNeRF: Palette-based Color Editing for NeRFs**|Qiling Wu et.al.|[2212.12871v1](http://arxiv.org/abs/2212.12871v1)|null|\n", "2212.11966": "|**2022-12-22**|**Removing Objects From Neural Radiance Fields**|Silvan Weder et.al.|[2212.11966v1](http://arxiv.org/abs/2212.11966v1)|null|\n", "2212.10950": "|**2022-12-21**|**Incremental Learning for Neural Radiance Field with Uncertainty-Filtered Knowledge Distillation**|Mengqi Guo et.al.|[2212.10950v1](http://arxiv.org/abs/2212.10950v1)|null|\n", "2212.10699": "|**2023-01-24**|**PaletteNeRF: Palette-based Appearance Editing of Neural Radiance Fields**|Zhengfei Kuang et.al.|[2212.10699v2](http://arxiv.org/abs/2212.10699v2)|null|\n", "2212.09735": "|**2022-12-20**|**Correspondence Distillation from NeRF-based GAN**|Yushi Lan et.al.|[2212.09735v2](http://arxiv.org/abs/2212.09735v2)|null|\n", "2212.09330": "|**2022-12-19**|**StyleTRF: Stylizing Tensorial Radiance Fields**|Rahul Goel et.al.|[2212.09330v1](http://arxiv.org/abs/2212.09330v1)|null|\n", "2212.09100": "|**2022-12-18**|**SPARF: Large-Scale Learning of 3D Sparse Radiance Fields from Few Input Images**|Abdullah Hamdi et.al.|[2212.09100v1](http://arxiv.org/abs/2212.09100v1)|**[link](https://github.com/ajhamdi/sparf_pytorch)**|\n", "2212.09069": "|**2022-12-18**|**Masked Wavelet Representation for Compact Neural Radiance Fields**|Daniel Rho et.al.|[2212.09069v1](http://arxiv.org/abs/2212.09069v1)|**[link](https://github.com/daniel03c1/masked_wavelet_nerf)**|\n", "2212.08328": "|**2022-12-31**|**MEIL-NeRF: Memory-Efficient Incremental Learning of Neural Radiance Fields**|Jaeyoung Chung et.al.|[2212.08328v2](http://arxiv.org/abs/2212.08328v2)|null|\n", "2212.08070": "|**2022-12-15**|**NeRF-Art: Text-Driven Neural Radiance Fields Stylization**|Can Wang et.al.|[2212.08070v1](http://arxiv.org/abs/2212.08070v1)|**[link](https://github.com/cassiePython/NeRF-Art)**|\n", "2212.08057": "|**2022-12-15**|**Real-Time Neural Light Field on Mobile Devices**|Junli Cao et.al.|[2212.08057v1](http://arxiv.org/abs/2212.08057v1)|**[link](https://github.com/snap-research/mobiler2l)**|\n", "2212.08476": "|**2022-12-15**|**SteerNeRF: Accelerating NeRF Rendering via Smooth Viewpoint Trajectory**|Sicheng Li et.al.|[2212.08476v1](http://arxiv.org/abs/2212.08476v1)|null|\n", "2212.07388": "|**2022-12-14**|**NoPe-NeRF: Optimising Neural Radiance Field with No Pose Prior**|Wenjing Bian et.al.|[2212.07388v1](http://arxiv.org/abs/2212.07388v1)|**[link](https://github.com/ActiveVisionLab/nope-nerf)**|\n", "2212.04701": "|**2022-12-09**|**4K-NeRF: High Fidelity Neural Radiance Fields at Ultra High Resolutions**|Zhongshu Wang et.al.|[2212.04701v1](http://arxiv.org/abs/2212.04701v1)|**[link](https://github.com/frozoul/4k-nerf)**|\n", "2212.04823": "|**2022-12-08**|**GazeNeRF: 3D-Aware Gaze Redirection with Neural Radiance Fields**|Alessandro Ruzzi et.al.|[2212.04823v1](http://arxiv.org/abs/2212.04823v1)|**[link](https://github.com/alessandroruzzi/gazenerf)**|\n", "2302.13543": "|**2023-02-27**|**BaLi-RF: Bandlimited Radiance Fields for Dynamic Scene Modeling**|Sameera Ramasinghe et.al.|[2302.13543v1](http://arxiv.org/abs/2302.13543v1)|null|\n", "2302.13397": "|**2023-02-26**|**Efficient physics-informed neural networks using hash encoding**|Xinquan Huang et.al.|[2302.13397v1](http://arxiv.org/abs/2302.13397v1)|null|\n", "2302.12931": "|**2023-02-24**|**CATNIPS: Collision Avoidance Through Neural Implicit Probabilistic Scenes**|Timothy Chen et.al.|[2302.12931v1](http://arxiv.org/abs/2302.12931v1)|null|\n", "2302.14683": "|**2023-03-09**|**IntrinsicNGP: Intrinsic Coordinate based Hash Encoding for Human NeRF**|Bo Peng et.al.|[2302.14683v2](http://arxiv.org/abs/2302.14683v2)|null|\n", "2303.00749": "|**2023-03-01**|**S-NeRF: Neural Radiance Fields for Street Views**|Ziyang Xie et.al.|[2303.00749v1](http://arxiv.org/abs/2303.00749v1)|null|\n", "2303.02091": "|**2023-03-03**|**Delicate Textured Mesh Recovery from NeRF via Adaptive Surface Refinement**|Jiaxiang Tang et.al.|[2303.02091v1](http://arxiv.org/abs/2303.02091v1)|**[link](https://github.com/ashawkey/nerf2mesh)**|\n", "2303.01736": "|**2023-03-03**|**Multi-Plane Neural Radiance Fields for Novel View Synthesis**|Youssef Abdelkareem et.al.|[2303.01736v1](http://arxiv.org/abs/2303.01736v1)|null|\n", "2303.03361": "|**2023-03-10**|**Nerflets: Local Radiance Fields for Efficient Structure-Aware 3D Scene Representation from 2D Supervision**|Xiaoshuai Zhang et.al.|[2303.03361v2](http://arxiv.org/abs/2303.03361v2)|null|\n", "2303.03003": "|**2023-03-07**|**Efficient Large-scale Scene Representation with a Hybrid of High-resolution Grid and Plane Features**|Yuqi Zhang et.al.|[2303.03003v2](http://arxiv.org/abs/2303.03003v2)|**[link](https://github.com/zyqz97/gp-nerf)**|\n", "2303.04086": "|**2023-03-07**|**NEPHELE: A Neural Platform for Highly Realistic Cloud Radiance Rendering**|Haimin Luo et.al.|[2303.04086v1](http://arxiv.org/abs/2303.04086v1)|null|\n", "2303.03808": "|**2023-03-07**|**Multiscale Tensor Decomposition and Rendering Equation Encoding for View Synthesis**|Kang Han et.al.|[2303.03808v1](http://arxiv.org/abs/2303.03808v1)|**[link](https://github.com/imkanghan/nrff)**|\n", "2303.03966": "|**2023-03-05**|**Semantic-aware Occlusion Filtering Neural Radiance Fields in the Wild**|Jaewon Lee et.al.|[2303.03966v1](http://arxiv.org/abs/2303.03966v1)|null|\n", "2303.04508": "|**2023-03-08**|**FastSurf: Fast Neural RGB-D Surface Reconstruction using Per-Frame Intrinsic Refinement and TSDF Fusion Prior Learning**|Seunghwan Lee et.al.|[2303.04508v1](http://arxiv.org/abs/2303.04508v1)|**[link](https://github.com/ROKIT-Healthcare/FastSurf)**|\n", "2303.04322": "|**2023-03-08**|**DroNeRF: Real-time Multi-agent Drone Pose Optimization for Computing Neural Radiance Fields**|Dipam Patel et.al.|[2303.04322v1](http://arxiv.org/abs/2303.04322v1)|null|\n", "2303.05512": "|**2023-03-09**|**PAC-NeRF: Physics Augmented Continuum Neural Radiance Fields for Geometry-Agnostic System Identification**|Xuan Li et.al.|[2303.05512v1](http://arxiv.org/abs/2303.05512v1)|null|\n", "2303.05835": "|**2023-03-10**|**You Only Train Once: Multi-Identity Free-Viewpoint Neural Human Rendering from Monocular Videos**|Jaehyeok Kim et.al.|[2303.05835v1](http://arxiv.org/abs/2303.05835v1)|null|\n", "2303.05807": "|**2023-03-10**|**Aleth-NeRF: Low-light Condition View Synthesis with Concealing Fields**|Ziteng Cui et.al.|[2303.05807v1](http://arxiv.org/abs/2303.05807v1)|null|\n", "2303.05775": "|**2023-03-10**|**Self-NeRF: A Self-Training Pipeline for Few-Shot Neural Radiance Fields**|Jiayang Bai et.al.|[2303.05775v1](http://arxiv.org/abs/2303.05775v1)|null|\n", "2303.05735": "|**2023-03-14**|**Hardware Acceleration of Neural Graphics**|Muhammad Husnain Mubarik et.al.|[2303.05735v2](http://arxiv.org/abs/2303.05735v2)|null|\n", "2303.05703": "|**2023-03-10**|**MovingParts: Motion-based 3D Part Discovery in Dynamic Radiance Field**|Kaizhi Yang et.al.|[2303.05703v1](http://arxiv.org/abs/2303.05703v1)|null|\n", "2303.06919": "|**2023-03-13**|**NeRFLiX: High-Quality Neural View Synthesis by Learning a Degradation-Driven Inter-viewpoint MiXer**|Kun Zhou et.al.|[2303.06919v1](http://arxiv.org/abs/2303.06919v1)|**[link](https://github.com/redrock303/NeRFLiX_CPVR2023)**|\n", "2303.06335": "|**2023-03-11**|**Just Flip: Flipped Observation Generation and Optimization for Neural Radiance Fields to Cover Unobserved View**|Minjae Lee et.al.|[2303.06335v1](http://arxiv.org/abs/2303.06335v1)|**[link](https://github.com/minjae-lulu/just-flip)**|\n", "2303.06226": "|**2023-03-10**|**NeRFlame: FLAME-based conditioning of NeRF for 3D face rendering**|Wojciech Zaj\u0105c et.al.|[2303.06226v1](http://arxiv.org/abs/2303.06226v1)|**[link](https://github.com/wojtekz4/nerflame)**|\n", "2303.08096": "|**2023-03-14**|**MELON: NeRF with Unposed Images Using Equivalence Class Estimation**|Axel Levy et.al.|[2303.08096v1](http://arxiv.org/abs/2303.08096v1)|null|\n", "2303.07937": "|**2023-03-16**|**Let 2D Diffusion Model Know 3D-Consistency for Robust Text-to-3D Generation**|Junyoung Seo et.al.|[2303.07937v3](http://arxiv.org/abs/2303.07937v3)|**[link](https://github.com/KU-CVLAB/3DFuse)**|\n", "2303.07653": "|**2023-03-16**|**NEF: Neural Edge Fields for 3D Parametric Curve Reconstruction from Multi-view Images**|Yunfan Ye et.al.|[2303.07653v2](http://arxiv.org/abs/2303.07653v2)|**[link](https://github.com/yunfan1202/NEF_code)**|\n", "2303.07596": "|**2023-03-18**|**Frequency-Modulated Point Cloud Rendering with Easy Editing**|Yi Zhang et.al.|[2303.07596v2](http://arxiv.org/abs/2303.07596v2)|**[link](https://github.com/yizhangphd/freqpcr)**|\n", "2303.07418": "|**2023-03-13**|**FreeNeRF: Improving Few-shot Neural Rendering with Free Frequency Regularization**|Jiawei Yang et.al.|[2303.07418v1](http://arxiv.org/abs/2303.07418v1)|**[link](https://github.com/jiawei-yang/freenerf)**|\n", "2303.08808": "|**2023-03-15**|**Mesh Strikes Back: Fast and Efficient Human Reconstruction from RGB videos**|Rohit Jena et.al.|[2303.08808v1](http://arxiv.org/abs/2303.08808v1)|null|\n", "2303.08717": "|**2023-03-15**|**Re-ReND: Real-time Rendering of NeRFs across Devices**|Sara Rojas et.al.|[2303.08717v1](http://arxiv.org/abs/2303.08717v1)|**[link](https://github.com/sararoma95/Re-ReND)**|\n", "2303.08695": "|**2023-03-15**|**RefiNeRF: Modelling dynamic neural radiance fields with inconsistent or missing camera parameters**|Shuja Khalid et.al.|[2303.08695v1](http://arxiv.org/abs/2303.08695v1)|null|\n", "2303.08370": "|**2023-03-15**|**Harnessing Low-Frequency Neural Fields for Few-Shot View Synthesis**|Liangchen Song et.al.|[2303.08370v1](http://arxiv.org/abs/2303.08370v1)|**[link](https://github.com/lsongx/halo)**|\n", "2303.09554": "|**2023-03-21**|**PartNeRF: Generating Part-Aware Editable 3D Shapes without 3D Supervision**|Konstantinos Tertikas et.al.|[2303.09554v3](http://arxiv.org/abs/2303.09554v3)|null|\n", "2303.09553": "|**2023-03-16**|**LERF: Language Embedded Radiance Fields**|Justin Kerr et.al.|[2303.09553v1](http://arxiv.org/abs/2303.09553v1)|null|\n", "2303.09431": "|**2023-03-16**|**NeRFMeshing: Distilling Neural Radiance Fields into Geometrically-Accurate 3D Meshes**|Marie-Julie Rakotosaona et.al.|[2303.09431v1](http://arxiv.org/abs/2303.09431v1)|null|\n", "2303.09412": "|**2023-03-17**|**NeRFtrinsic Four: An End-To-End Trainable NeRF Jointly Optimizing Diverse Intrinsic and Extrinsic Camera Parameters**|Hannah Schieber et.al.|[2303.09412v2](http://arxiv.org/abs/2303.09412v2)|**[link](https://github.com/hannahhaensen/nerftrinsic_four)**|\n", "2303.09153": "|**2023-03-16**|**Reliable Image Dehazing by NeRF**|Zheyan Jin et.al.|[2303.09153v1](http://arxiv.org/abs/2303.09153v1)|null|\n", "2303.10083": "|**2023-03-17**|**$\u03b1$Surf: Implicit Surface Reconstruction for Semi-Transparent and Thin Objects with Decoupled Geometry and Opacity**|Tianhao Wu et.al.|[2303.10083v1](http://arxiv.org/abs/2303.10083v1)|null|\n", "2303.09952": "|**2023-03-17**|**Single-view Neural Radiance Fields with Depth Teacher**|Yurui Chen et.al.|[2303.09952v1](http://arxiv.org/abs/2303.09952v1)|null|\n", "2303.11052": "|**2023-03-20**|**ContraNeRF: Generalizable Neural Radiance Fields for Synthetic-to-real Novel View Synthesis via Contrastive Learning**|Hao Yang et.al.|[2303.11052v1](http://arxiv.org/abs/2303.11052v1)|null|\n", "2303.10735": "|**2023-03-19**|**SKED: Sketch-guided Text-based 3D Editing**|Aryan Mikaeili et.al.|[2303.10735v1](http://arxiv.org/abs/2303.10735v1)|null|\n", "2303.10709": "|**2023-03-19**|**NeRF-LOAM: Neural Implicit Representation for Large-Scale Incremental LiDAR Odometry and Mapping**|Junyuan Deng et.al.|[2303.10709v1](http://arxiv.org/abs/2303.10709v1)|**[link](https://github.com/junyuandeng/nerf-loam)**|\n", "2303.10340": "|**2023-03-18**|**3D Data Augmentation for Driving Scenes on Camera**|Wenwen Tong et.al.|[2303.10340v1](http://arxiv.org/abs/2303.10340v1)|null|\n", "2303.11938": "|**2023-03-21**|**3D-CLFusion: Fast Text-to-3D Rendering with Contrastive Latent Diffusion**|Yu-Jhe Li et.al.|[2303.11938v1](http://arxiv.org/abs/2303.11938v1)|null|\n", "2303.11728": "|**2023-03-22**|**ExtremeNeRF: Few-shot Neural Radiance Fields Under Unconstrained Illumination**|SeokYeong Lee et.al.|[2303.11728v2](http://arxiv.org/abs/2303.11728v2)|null|\n", "2303.11364": "|**2023-03-20**|**DehazeNeRF: Multiple Image Haze Removal and 3D Shape Reconstruction using Neural Radiance Fields**|Wei-Ting Chen et.al.|[2303.11364v1](http://arxiv.org/abs/2303.11364v1)|null|\n", "2303.12791": "|**2023-03-22**|**SHERF: Generalizable Human NeRF from a Single Image**|Shoukang Hu et.al.|[2303.12791v1](http://arxiv.org/abs/2303.12791v1)|**[link](https://github.com/skhu101/sherf)**|\n", "2303.12789": "|**2023-03-22**|**Instruct-NeRF2NeRF: Editing 3D Scenes with Instructions**|Ayaan Haque et.al.|[2303.12789v1](http://arxiv.org/abs/2303.12789v1)|null|\n", "2303.12786": "|**2023-03-22**|**FeatureNeRF: Learning Generalizable NeRFs by Distilling Foundation Models**|Jianglong Ye et.al.|[2303.12786v1](http://arxiv.org/abs/2303.12786v1)|null|\n", "2303.12408": "|**2023-03-24**|**Balanced Spherical Grid for Egocentric View Synthesis**|Changwoon Choi et.al.|[2303.12408v2](http://arxiv.org/abs/2303.12408v2)|**[link](https://github.com/changwoonchoi/EgoNeRF)**|\n", "2303.12234": "|**2023-03-21**|**Pre-NeRF 360: Enriching Unbounded Appearances for Neural Radiance Fields**|Ahmad AlMughrabi et.al.|[2303.12234v1](http://arxiv.org/abs/2303.12234v1)|**[link](https://github.com/amughrabi/pre-nerf)**|\n", "2303.13497": "|**2023-03-23**|**TriPlaneNet: An Encoder for EG3D Inversion**|Ananta R. Bhattarai et.al.|[2303.13497v1](http://arxiv.org/abs/2303.13497v1)|null|\n", "2303.13472": "|**2023-03-23**|**Plotting Behind the Scenes: Towards Learnable Game Engines**|Willi Menapace et.al.|[2303.13472v1](http://arxiv.org/abs/2303.13472v1)|null|\n", "2303.13450": "|**2023-03-23**|**Set-the-Scene: Global-Local Training for Generating Controllable NeRF Scenes**|Dana Cohen-Bar et.al.|[2303.13450v1](http://arxiv.org/abs/2303.13450v1)|**[link](https://github.com/DanaCohen95/Set-the-Scene)**|\n", "2303.13277": "|**2023-03-25**|**SINE: Semantic-driven Image-based NeRF Editing with Prior-guided Editing Field**|Chong Bao et.al.|[2303.13277v2](http://arxiv.org/abs/2303.13277v2)|null|\n", "2303.13232": "|**2023-03-23**|**Transforming Radiance Field with Lipschitz Network for Photorealistic 3D Scene Stylization**|Zicheng Zhang et.al.|[2303.13232v1](http://arxiv.org/abs/2303.13232v1)|null|\n", "2303.13014": "|**2023-03-23**|**Semantic Ray: Learning a Generalizable Semantic Field with Cross-Reprojection Attention**|Fangfu Liu et.al.|[2303.13014v1](http://arxiv.org/abs/2303.13014v1)|**[link](https://github.com/liuff19/Semantic-Ray)**|\n", "2303.12865": "|**2023-03-22**|**NeRF-GAN Distillation for Efficient 3D-Aware Generation with Convolutions**|Mohamad Shahbazi et.al.|[2303.12865v1](http://arxiv.org/abs/2303.12865v1)|**[link](https://github.com/mshahbazi72/nerf-gan-distillation)**|\n", "2303.14001": "|**2023-03-24**|**Grid-guided Neural Radiance Fields for Large Urban Scenes**|Linning Xu et.al.|[2303.14001v1](http://arxiv.org/abs/2303.14001v1)|null|\n", "2303.13843": "|**2023-03-24**|**CompoNeRF: Text-guided Multi-object Compositional NeRF with Editable 3D Scene Layout**|Yiqi Lin et.al.|[2303.13843v1](http://arxiv.org/abs/2303.13843v1)|null|\n", "2303.13825": "|**2023-03-24**|**HandNeRF: Neural Radiance Fields for Animatable Interacting Hands**|Zhiyang Guo et.al.|[2303.13825v1](http://arxiv.org/abs/2303.13825v1)|null|\n", "2303.13817": "|**2023-03-24**|**ABLE-NeRF: Attention-Based Rendering with Learnable Embeddings for Neural Radiance Field**|Zhe Jun Tang et.al.|[2303.13817v1](http://arxiv.org/abs/2303.13817v1)|**[link](https://github.com/tangzj/able-nerf)**|\n", "2303.13777": "|**2023-03-24**|**GM-NeRF: Learning Generalizable Model-based Neural Radiance Fields from Multi-view Images**|Jianchuan Chen et.al.|[2303.13777v1](http://arxiv.org/abs/2303.13777v1)|null|\n", "2303.13743": "|**2023-03-24**|**TEGLO: High Fidelity Canonical Texture Mapping from Single-View Images**|Vishal Vinod et.al.|[2303.13743v1](http://arxiv.org/abs/2303.13743v1)|null|\n", "2303.13582": "|**2023-03-23**|**SCADE: NeRFs from Space Carving with Ambiguity-Aware Depth Estimates**|Mikaela Angelina Uy et.al.|[2303.13582v1](http://arxiv.org/abs/2303.13582v1)|null|\n", "2303.15427": "|**2023-03-27**|**JAWS: Just A Wild Shot for Cinematic Transfer in Neural Radiance Fields**|Xi Wang et.al.|[2303.15427v1](http://arxiv.org/abs/2303.15427v1)|**[link](https://github.com/robincourant/jaws)**|\n", "2303.15387": "|**2023-03-27**|**Generalizable Neural Voxels for Fast Human Radiance Fields**|Taoran Yi et.al.|[2303.15387v1](http://arxiv.org/abs/2303.15387v1)|null|\n", "2303.15368": "|**2023-03-27**|**NeUDF: Learning Unsigned Distance Fields from Multi-view Images for Reconstructing Non-watertight Models**|Fei Hou et.al.|[2303.15368v1](http://arxiv.org/abs/2303.15368v1)|null|\n", "2303.15012": "|**2023-03-27**|**3D-Aware Multi-Class Image-to-Image Translation with NeRFs**|Senmao Li et.al.|[2303.15012v1](http://arxiv.org/abs/2303.15012v1)|**[link](https://github.com/sen-mao/3di2i-translation)**|\n", "2303.14707": "|**2023-03-26**|**Clean-NeRF: Reformulating NeRF to account for View-Dependent Observations**|Xinhang Liu et.al.|[2303.14707v1](http://arxiv.org/abs/2303.14707v1)|null|\n", "2303.14536": "|**2023-03-25**|**SUDS: Scalable Urban Dynamic Scenes**|Haithem Turki et.al.|[2303.14536v1](http://arxiv.org/abs/2303.14536v1)|null|\n", "2303.14478": "|**2023-03-25**|**DBARF: Deep Bundle-Adjusting Generalizable Neural Radiance Fields**|Yu Chen et.al.|[2303.14478v1](http://arxiv.org/abs/2303.14478v1)|null|\n", "2303.14435": "|**2023-03-25**|**NeRF-DS: Neural Radiance Fields for Dynamic Specular Objects**|Zhiwen Yan et.al.|[2303.14435v1](http://arxiv.org/abs/2303.14435v1)|**[link](https://github.com/jokeryan/nerf-ds)**|\n", "2303.15206": "|**2023-03-24**|**Perceptual Quality Assessment of NeRF and Neural View Synthesis Methods for Front-Facing Views**|Hanxue Liang et.al.|[2303.15206v1](http://arxiv.org/abs/2303.15206v1)|null|\n", "2303.16196": "|**2023-03-28**|**SparseNeRF: Distilling Depth Ranking for Few-shot Novel View Synthesis**|Guangcong Wang et.al.|[2303.16196v1](http://arxiv.org/abs/2303.16196v1)|null|\n", "2303.16184": "|**2023-03-28**|**VMesh: Hybrid Volume-Mesh Representation for Efficient View Synthesis**|Yuan-Chen Guo et.al.|[2303.16184v1](http://arxiv.org/abs/2303.16184v1)|null|\n", "2303.16001": "|**2023-03-30**|**Adaptive Voronoi NeRFs**|Tim Elsner et.al.|[2303.16001v2](http://arxiv.org/abs/2303.16001v2)|null|\n", "2303.15951": "|**2023-03-28**|**F$^{2}$-NeRF: Fast Neural Radiance Field Training with Free Camera Trajectories**|Peng Wang et.al.|[2303.15951v1](http://arxiv.org/abs/2303.15951v1)|**[link](https://github.com/Totoro97/f2-nerf)**|\n", "2303.16485": "|**2023-03-29**|**TriVol: Point Cloud Rendering via Triple Volumes**|Tao Hu et.al.|[2303.16485v1](http://arxiv.org/abs/2303.16485v1)|**[link](https://github.com/dvlab-research/trivol)**|\n", "2303.16482": "|**2023-03-29**|**Point2Pix: Photo-Realistic Point Cloud Rendering via Neural Radiance Fields**|Tao Hu et.al.|[2303.16482v1](http://arxiv.org/abs/2303.16482v1)|null|\n", "2303.16333": "|**2023-03-28**|**Flow supervision for Deformable NeRF**|Chaoyang Wang et.al.|[2303.16333v1](http://arxiv.org/abs/2303.16333v1)|null|\n", "2303.17603": "|**2023-03-30**|**NeRF-Supervised Deep Stereo**|Fabio Tosi et.al.|[2303.17603v1](http://arxiv.org/abs/2303.17603v1)|**[link](https://github.com/fabiotosi92/nerf-supervised-deep-stereo)**|\n", "2303.17368": "|**2023-03-30**|**SynBody: Synthetic Dataset with Layered Human Models for 3D Human Perception and Modeling**|Zhitao Yang et.al.|[2303.17368v1](http://arxiv.org/abs/2303.17368v1)|**[link](https://github.com/openxrlab/xrfeitoria)**|\n", "2303.17147": "|**2023-03-30**|**NeILF++: Inter-Reflectable Light Fields for Geometry and Material Estimation**|Jingyang Zhang et.al.|[2303.17147v1](http://arxiv.org/abs/2303.17147v1)|null|\n", "2303.17094": "|**2023-03-30**|**Enhanced Stable View Synthesis**|Nishant Jain et.al.|[2303.17094v1](http://arxiv.org/abs/2303.17094v1)|null|\n", "2303.17968": "|**2023-03-31**|**VDN-NeRF: Resolving Shape-Radiance Ambiguity via View-Dependence Normalization**|Bingfan Zhu et.al.|[2303.17968v1](http://arxiv.org/abs/2303.17968v1)|**[link](https://github.com/boifz/vdn-nerf)**|\n", "2304.00916": "|**2023-04-06**|**DreamAvatar: Text-and-Shape Guided 3D Human Avatar Generation via Diffusion Models**|Yukang Cao et.al.|[2304.00916v2](http://arxiv.org/abs/2304.00916v2)|null|\n", "2304.00341": "|**2023-04-01**|**JacobiNeRF: NeRF Shaping with Mutual Information Gradients**|Xiaomeng Xu et.al.|[2304.00341v1](http://arxiv.org/abs/2304.00341v1)|**[link](https://github.com/xxm19/jacobinerf)**|\n", "2304.02001": "|**2023-04-04**|**MonoHuman: Animatable Human Neural Field from Monocular Video**|Zhengming Yu et.al.|[2304.02001v1](http://arxiv.org/abs/2304.02001v1)|null|\n", "2304.02061": "|**2023-04-11**|**Generating Continual Human Motion in Diverse 3D Scenes**|Aymen Mir et.al.|[2304.02061v2](http://arxiv.org/abs/2304.02061v2)|null|\n", "2304.03280": "|**2023-04-06**|**LANe: Lighting-Aware Neural Fields for Compositional Scene Synthesis**|Akshay Krishnan et.al.|[2304.03280v1](http://arxiv.org/abs/2304.03280v1)|null|\n", "2304.03266": "|**2023-04-06**|**Neural Fields meet Explicit Geometric Representation for Inverse Rendering of Urban Scenes**|Zian Wang et.al.|[2304.03266v1](http://arxiv.org/abs/2304.03266v1)|null|\n", "2304.02827": "|**2023-04-06**|**DITTO-NeRF: Diffusion-based Iterative Text To Omni-directional 3D Model**|Hoigi Seo et.al.|[2304.02827v1](http://arxiv.org/abs/2304.02827v1)|null|\n", "2304.02736": "|**2023-04-05**|**Image Stabilization for Hololens Camera in Remote Collaboration**|Gowtham Senthil et.al.|[2304.02736v1](http://arxiv.org/abs/2304.02736v1)|null|\n", "2304.03526": "|**2023-04-07**|**Lift3D: Synthesize 3D Training Data by Lifting 2D GAN to 3D Generative Radiance Field**|Leheng Li et.al.|[2304.03526v1](http://arxiv.org/abs/2304.03526v1)|null|\n", "2304.03384": "|**2023-04-06**|**Beyond NeRF Underwater: Learning Neural Reflectance Fields for True Color Correction of Marine Imagery**|Tianyi Zhang et.al.|[2304.03384v1](http://arxiv.org/abs/2304.03384v1)|**[link](https://github.com/tyz1030/neuralsea)**|\n", "2304.04452": "|**2023-04-10**|**Neural Residual Radiance Fields for Streamably Free-Viewpoint Videos**|Liao Wang et.al.|[2304.04452v1](http://arxiv.org/abs/2304.04452v1)|null|\n", "2304.04446": "|**2023-04-10**|**Inferring Fluid Dynamics via Inverse Rendering**|Jinxian Liu et.al.|[2304.04446v1](http://arxiv.org/abs/2304.04446v1)|null|\n", "2304.04395": "|**2023-04-10**|**Instance Neural Radiance Field**|Benran Hu et.al.|[2304.04395v1](http://arxiv.org/abs/2304.04395v1)|**[link](https://github.com/lyclyc52/instance_nerf)**|\n", "2304.04133": "|**2023-04-12**|**NeRF applied to satellite imagery for surface reconstruction**|Federico Semeraro et.al.|[2304.04133v3](http://arxiv.org/abs/2304.04133v3)|**[link](https://github.com/fsemerar/satnerf)**|\n", "2304.04012": "|**2023-04-08**|**PVD-AL: Progressive Volume Distillation with Active Learning for Efficient Conversion Between Different NeRF Architectures**|Shuangkang Fang et.al.|[2304.04012v1](http://arxiv.org/abs/2304.04012v1)|**[link](https://github.com/megvii-research/AAAI2023-PVD)**|\n", "2304.04559": "|**2023-04-07**|**Event-based Camera Tracker by $\\nabla$t NeRF**|Mana Masuda et.al.|[2304.04559v1](http://arxiv.org/abs/2304.04559v1)|null|\n", "2304.05218": "|**2023-04-11**|**Improving Neural Radiance Fields with Depth-aware Optimization for Novel View Synthesis**|Shu Chen et.al.|[2304.05218v1](http://arxiv.org/abs/2304.05218v1)|**[link](https://github.com/xtu-pr-lab/sfmnerf)**|\n", "2304.05097": "|**2023-04-11**|**One-Shot High-Fidelity Talking-Head Synthesis with Deformable Neural Radiance Field**|Weichuang Li et.al.|[2304.05097v1](http://arxiv.org/abs/2304.05097v1)|null|\n", "2304.04962": "|**2023-04-11**|**MRVM-NeRF: Mask-Based Pretraining for Neural Radiance Fields**|Ganlin Yang et.al.|[2304.04962v1](http://arxiv.org/abs/2304.04962v1)|null|\n", "2304.04897": "|**2023-04-10**|**Neural Image-based Avatars: Generalizable Radiance Fields for Human Avatar Modeling**|Youngjoong Kwon et.al.|[2304.04897v1](http://arxiv.org/abs/2304.04897v1)|null|\n", "2304.05620": "|**2023-04-12**|**NutritionVerse-Thin: An Optimized Strategy for Enabling Improved Rendering of 3D Thin Food Models**|Chi-en Amy Tai et.al.|[2304.05620v1](http://arxiv.org/abs/2304.05620v1)|null|\n", "2304.06714": "|**2023-04-17**|**Single-Stage Diffusion NeRF: A Unified Approach to 3D Generation and Reconstruction**|Hansheng Chen et.al.|[2304.06714v2](http://arxiv.org/abs/2304.06714v2)|**[link](https://github.com/Lakonik/SSDNeRF)**|\n", "2304.06706": "|**2023-04-13**|**Zip-NeRF: Anti-Aliased Grid-Based Neural Radiance Fields**|Jonathan T. Barron et.al.|[2304.06706v1](http://arxiv.org/abs/2304.06706v1)|null|\n", "2304.06287": "|**2023-04-13**|**NeRFVS: Neural Radiance Fields for Free View Synthesis via Geometry Scaffolds**|Chen Yang et.al.|[2304.06287v1](http://arxiv.org/abs/2304.06287v1)|null|\n", "2304.06969": "|**2023-04-14**|**UVA: Towards Unified Volumetric Avatar for View Synthesis, Pose rendering, Geometry and Texture Editing**|Jinlong Fan et.al.|[2304.06969v1](http://arxiv.org/abs/2304.06969v1)|null|\n", "2304.08279": "|**2023-04-17**|**MoDA: Modeling Deformable 3D Objects from Casual Videos**|Chaoyue Song et.al.|[2304.08279v1](http://arxiv.org/abs/2304.08279v1)|**[link](https://github.com/chaoyuesong/moda)**|\n", "2304.07979": "|**2023-04-17**|**NeRF-Loc: Visual Localization with Conditional Neural Radiance Field**|Jianlin Liu et.al.|[2304.07979v1](http://arxiv.org/abs/2304.07979v1)|**[link](https://github.com/jenningsl/nerf-loc)**|\n", "2304.07918": "|**2023-04-16**|**Likelihood-Based Generative Radiance Field with Latent Space Energy-Based Model for 3D-Aware Disentangled Image Representation**|Yaxuan Zhu et.al.|[2304.07918v1](http://arxiv.org/abs/2304.07918v1)|null|\n", "2304.07915": "|**2023-04-16**|**CAT-NeRF: Constancy-Aware Tx$^2$Former for Dynamic Body Modeling**|Haidong Zhu et.al.|[2304.07915v1](http://arxiv.org/abs/2304.07915v1)|**[link](https://github.com/haidongz-usc/CAT-NeRF)**|\n", "2304.07743": "|**2023-04-16**|**SeaThru-NeRF: Neural Radiance Fields in Scattering Media**|Deborah Levy et.al.|[2304.07743v1](http://arxiv.org/abs/2304.07743v1)|**[link](https://github.com/deborahLevy130/seathru_NeRF)**|\n", "2304.08971": "|**2023-04-18**|**SurfelNeRF: Neural Surfel Radiance Fields for Online Photorealistic Reconstruction of Indoor Scenes**|Yiming Gao et.al.|[2304.08971v1](http://arxiv.org/abs/2304.08971v1)|null|\n", "2304.08757": "|**2023-04-18**|**NeAI: A Pre-convoluted Representation for Plug-and-Play Neural Ambient Illumination**|Yiyu Zhuang et.al.|[2304.08757v1](http://arxiv.org/abs/2304.08757v1)|null|\n", "2304.09677": "|**2023-04-20**|**Reference-guided Controllable Inpainting of Neural Radiance Fields**|Ashkan Mirzaei et.al.|[2304.09677v2](http://arxiv.org/abs/2304.09677v2)|null|\n", "2304.10537": "|**2023-04-20**|**Learning Neural Duplex Radiance Fields for Real-Time View Synthesis**|Ziyu Wan et.al.|[2304.10537v1](http://arxiv.org/abs/2304.10537v1)|null|\n", "2304.10532": "|**2023-04-21**|**Nerfbusters: Removing Ghostly Artifacts from Casually Captured NeRFs**|Frederik Warburg et.al.|[2304.10532v2](http://arxiv.org/abs/2304.10532v2)|**[link](https://github.com/ethanweber/nerfbusters)**|\n", "2304.10448": "|**2023-04-20**|**ReLight My NeRF: A Dataset for Novel View Synthesis and Relighting of Real World Objects**|Marco Toschi et.al.|[2304.10448v1](http://arxiv.org/abs/2304.10448v1)|null|\n", "2304.10406": "|**2023-04-20**|**LiDAR-NeRF: Novel LiDAR View Synthesis via Neural Radiance Fields**|Tang Tao et.al.|[2304.10406v1](http://arxiv.org/abs/2304.10406v1)|**[link](https://github.com/tangtaogo/lidar-nerf)**|\n", "2304.10250": "|**2023-04-20**|**Revisiting Implicit Neural Representations in Low-Level Vision**|Wentian Xu et.al.|[2304.10250v1](http://arxiv.org/abs/2304.10250v1)|**[link](https://github.com/wentxul/linr)**|\n", "2304.10075": "|**2023-04-20**|**Multiscale Representation for Real-Time Anti-Aliasing Neural Rendering**|Dongting Hu et.al.|[2304.10075v1](http://arxiv.org/abs/2304.10075v1)|null|\n", "2304.10050": "|**2023-04-20**|**Neural Radiance Fields: Past, Present, and Future**|Ansh Mittal et.al.|[2304.10050v1](http://arxiv.org/abs/2304.10050v1)|null|\n", "2304.09987": "|**2023-04-19**|**Tetra-NeRF: Representing Neural Radiance Fields Using Tetrahedra**|Jonas Kulhanek et.al.|[2304.09987v1](http://arxiv.org/abs/2304.09987v1)|**[link](https://github.com/jkulhanek/tetra-nerf)**|\n", "2304.10780": "|**2023-04-21**|**Omni-Line-of-Sight Imaging for Holistic Shape Reconstruction**|Binbin Huang et.al.|[2304.10780v1](http://arxiv.org/abs/2304.10780v1)|null|\n", "2304.10664": "|**2023-04-20**|**A Comparative Neural Radiance Field (NeRF) 3D Analysis of Camera Poses from HoloLens Trajectories and Structure from Motion**|Miriam J\u00e4ger et.al.|[2304.10664v1](http://arxiv.org/abs/2304.10664v1)|null|\n", "2304.12308": "|**2023-04-26**|**Segment Anything in 3D with NeRFs**|Jiazhong Cen et.al.|[2304.12308v2](http://arxiv.org/abs/2304.12308v2)|null|\n", "2304.12294": "|**2023-04-24**|**Explicit Correspondence Matching for Generalizable Neural Radiance Fields**|Yuedong Chen et.al.|[2304.12294v1](http://arxiv.org/abs/2304.12294v1)|**[link](https://github.com/donydchen/matchnerf)**|\n", "2304.11842": "|**2023-04-25**|**Gen-NeRF: Efficient and Generalizable Neural Radiance Fields via Algorithm-Hardware Co-Design**|Yonggan Fu et.al.|[2304.11842v2](http://arxiv.org/abs/2304.11842v2)|null|\n", "2304.11470": "|**2023-04-22**|**3D-IntPhys: Towards More Generalized 3D-grounded Visual Intuitive Physics under Challenging Scenes**|Haotian Xue et.al.|[2304.11470v1](http://arxiv.org/abs/2304.11470v1)|null|\n", "2304.11448": "|**2023-04-22**|**Dehazing-NeRF: Neural Radiance Fields from Hazy Images**|Tian Li et.al.|[2304.11448v1](http://arxiv.org/abs/2304.11448v1)|null|\n", "2304.11342": "|**2023-04-22**|**NaviNeRF: NeRF-based 3D Representation Disentanglement by Latent Semantic Navigation**|Baao Xie et.al.|[2304.11342v1](http://arxiv.org/abs/2304.11342v1)|null|\n", "2304.11241": "|**2023-04-21**|**AutoNeRF: Training Implicit Scene Representations with Autonomous Agents**|Pierre Marza et.al.|[2304.11241v1](http://arxiv.org/abs/2304.11241v1)|null|\n", "2304.12746": "|**2023-04-25**|**Local Implicit Ray Function for Generalizable Radiance Field Representation**|Xin Huang et.al.|[2304.12746v1](http://arxiv.org/abs/2304.12746v1)|null|\n", "2304.12587": "|**2023-04-27**|**MF-NeRF: Memory Efficient NeRF with Mixed-Feature Hash Table**|Yongjae Lee et.al.|[2304.12587v3](http://arxiv.org/abs/2304.12587v3)|**[link](https://github.com/nfyfamr/mf-nerf)**|\n", "2304.12467": "|**2023-04-24**|**Instant-3D: Instant Neural Radiance Field Training Towards On-Device AR/VR 3D Reconstruction**|Sixu Li et.al.|[2304.12467v1](http://arxiv.org/abs/2304.12467v1)|null|\n", "2304.12439": "|**2023-04-24**|**TextMesh: Generation of Realistic 3D Meshes From Text Prompts**|Christina Tsalicoglou et.al.|[2304.12439v1](http://arxiv.org/abs/2304.12439v1)|null|\n", "2304.13518": "|**2023-04-26**|**Super-NeRF: View-consistent Detail Generation for NeRF super-resolution**|Yuqi Han et.al.|[2304.13518v1](http://arxiv.org/abs/2304.13518v1)|null|\n", "2304.13386": "|**2023-04-26**|**VGOS: Voxel Grid Optimization for View Synthesis from Sparse Inputs**|Jiakai Sun et.al.|[2304.13386v1](http://arxiv.org/abs/2304.13386v1)|**[link](https://github.com/sjojok/vgos)**|\n", "2304.14401": "|**2023-04-27**|**ActorsNeRF: Animatable Few-shot Human Rendering with Generalizable NeRFs**|Jiteng Mu et.al.|[2304.14401v1](http://arxiv.org/abs/2304.14401v1)|null|\n", "2304.14301": "|**2023-05-03**|**Combining HoloLens with Instant-NeRFs: Advanced Real-Time 3D Mobile Mapping**|Dennis Haitz et.al.|[2304.14301v2](http://arxiv.org/abs/2304.14301v2)|null|\n", "2304.14070": "|**2023-04-27**|**Compositional 3D Human-Object Neural Animation**|Zhi Hou et.al.|[2304.14070v1](http://arxiv.org/abs/2304.14070v1)|null|\n", "2304.14811": "|**2023-04-28**|**NeRF-LiDAR: Generating Realistic LiDAR Point Clouds with Neural Radiance Fields**|Junge Zhang et.al.|[2304.14811v1](http://arxiv.org/abs/2304.14811v1)|null|\n", "2304.14473": "|**2023-04-27**|**Learning a Diffusion Prior for NeRFs**|Guandao Yang et.al.|[2304.14473v1](http://arxiv.org/abs/2304.14473v1)|null|\n", "2305.00787": "|**2023-05-01**|**GeneFace++: Generalized and Stable Real-Time Audio-Driven 3D Talking Face Generation**|Zhenhui Ye et.al.|[2305.00787v1](http://arxiv.org/abs/2305.00787v1)|null|\n", "2305.00375": "|**2023-04-30**|**Neural Radiance Fields (NeRFs): A Review and Some Recent Developments**|Mohamed Debbagh et.al.|[2305.00375v1](http://arxiv.org/abs/2305.00375v1)|null|\n", "2305.00041": "|**2023-04-28**|**ViP-NeRF: Visibility Prior for Sparse Input Neural Radiance Fields**|Nagabhushan Somraj et.al.|[2305.00041v1](http://arxiv.org/abs/2305.00041v1)|**[link](https://github.com/NagabhushanSN95/ViP-NeRF)**|\n", "2305.01643": "|**2023-05-02**|**Neural LiDAR Fields for Novel View Synthesis**|Shengyu Huang et.al.|[2305.01643v1](http://arxiv.org/abs/2305.01643v1)|null|\n", "2305.01190": "|**2023-05-03**|**LatentAvatar: Learning Latent Expression Code for Expressive Neural Head Avatar**|Yuelang Xu et.al.|[2305.01190v2](http://arxiv.org/abs/2305.01190v2)|null|\n", "2305.01163": "|**2023-05-02**|**Federated Neural Radiance Fields**|Lachlan Holden et.al.|[2305.01163v1](http://arxiv.org/abs/2305.01163v1)|**[link](https://github.com/lachholden/fednerf-pytorch)**|\n", "2305.03049": "|**2023-05-04**|**NeuralEditor: Editing Neural Radiance Fields via Manipulating Point Clouds**|Jun-Kun Chen et.al.|[2305.03049v1](http://arxiv.org/abs/2305.03049v1)|null|\n", "2305.02756": "|**2023-05-04**|**Radiance Field Gradient Scaling for Unbiased Near-Camera Training**|Julien Philip et.al.|[2305.02756v1](http://arxiv.org/abs/2305.02756v1)|**[link](https://github.com/gradient-scaling/gradient-scaling.github.io)**|\n", "2305.02618": "|**2023-05-04**|**Semantic-aware Generation of Multi-view Portrait Drawings**|Biao Ma et.al.|[2305.02618v1](http://arxiv.org/abs/2305.02618v1)|**[link](https://github.com/aiart-hdu/sage)**|\n", "2305.03176": "|**2023-05-04**|**NeRF-QA: Neural Radiance Fields Quality Assessment Database**|Pedro Martin et.al.|[2305.03176v1](http://arxiv.org/abs/2305.03176v1)|null|\n", "2305.04789": "|**2023-05-08**|**AvatarReX: Real-time Expressive Full-body Avatars**|Zerong Zheng et.al.|[2305.04789v1](http://arxiv.org/abs/2305.04789v1)|null|\n", "2305.04296": "|**2023-05-07**|**HashCC: Lightweight Method to Improve the Quality of the Camera-less NeRF Scene Generation**|Jan Olszewski et.al.|[2305.04296v1](http://arxiv.org/abs/2305.04296v1)|null|\n", "2305.04268": "|**2023-05-07**|**Multi-Space Neural Radiance Fields**|Ze-Xin Yin et.al.|[2305.04268v1](http://arxiv.org/abs/2305.04268v1)|null|\n", "2305.05594": "|**2023-05-09**|**PET-NeuS: Positional Encoding Tri-Planes for Neural Surfaces**|Yiqun Wang et.al.|[2305.05594v1](http://arxiv.org/abs/2305.05594v1)|**[link](https://github.com/yiqun-wang/pet-neus)**|\n", "2305.04966": "|**2023-05-08**|**NerfAcc: Efficient Sampling Accelerates NeRFs**|Ruilong Li et.al.|[2305.04966v1](http://arxiv.org/abs/2305.04966v1)|null|\n", "2305.06131": "|**2023-05-10**|**Generative AI meets 3D: A Survey on Text-to-3D in AIGC Era**|Chenghao Li et.al.|[2305.06131v1](http://arxiv.org/abs/2305.06131v1)|null|\n", "2305.06118": "|**2023-05-10**|**NeRF$^\\textbf{2}$: Neural Radio-Frequency Radiance Fields**|Xiaopeng Zhao et.al.|[2305.06118v1](http://arxiv.org/abs/2305.06118v1)|null|\n", "2305.05766": "|**2023-05-09**|**Instant-NeRF: Instant On-Device Neural Radiance Field Training via Algorithm-Accelerator Co-Designed Near-Memory Processing**|Yang Zhao et.al.|[2305.05766v1](http://arxiv.org/abs/2305.05766v1)|null|\n", "2305.07342": "|**2023-05-12**|**BundleRecon: Ray Bundle-Based 3D Neural Reconstruction**|Weikun Zhang et.al.|[2305.07342v1](http://arxiv.org/abs/2305.07342v1)|null|\n", "2305.08851": "|**2023-05-15**|**MV-Map: Offboard HD-Map Generation with Multi-view Consistency**|Ziyang Xie et.al.|[2305.08851v1](http://arxiv.org/abs/2305.08851v1)|**[link](https://github.com/ziyang-xie/mv-map)**|\n", "2305.09761": "|**2023-05-16**|**NerfBridge: Bringing Real-time, Online Neural Radiance Field Training to Robotics**|Javier Yu et.al.|[2305.09761v1](http://arxiv.org/abs/2305.09761v1)|**[link](https://github.com/javieryu/nerf_bridge)**|\n", "2305.11167": "|**2023-05-18**|**MVPSNet: Fast Generalizable Multi-view Photometric Stereo**|Dongxu Zhao et.al.|[2305.11167v1](http://arxiv.org/abs/2305.11167v1)|null|\n", "2305.11031": "|**2023-05-18**|**ConsistentNeRF: Enhancing Neural Radiance Fields with 3D Consistency for Sparse View Synthesis**|Shoukang Hu et.al.|[2305.11031v1](http://arxiv.org/abs/2305.11031v1)|**[link](https://github.com/skhu101/consistentnerf)**|\n", "2305.10579": "|**2023-05-17**|**MultiPlaneNeRF: Neural Radiance Field with Non-Trainable Representation**|Dominik Zimny et.al.|[2305.10579v1](http://arxiv.org/abs/2305.10579v1)|**[link](https://github.com/gmum/multiplanenerf)**|\n", "2305.10503": "|**2023-05-24**|**OR-NeRF: Object Removing from 3D Scenes Guided by Multiview Segmentation with Neural Radiance Fields**|Youtan Yin et.al.|[2305.10503v2](http://arxiv.org/abs/2305.10503v2)|**[link](https://github.com/cuteyyt/or-nerf)**|\n", "2305.11588": "|**2023-05-19**|**Text2NeRF: Text-Driven 3D Scene Generation with Neural Radiance Fields**|Jingbo Zhang et.al.|[2305.11588v1](http://arxiv.org/abs/2305.11588v1)|null|\n", "2305.13307": "|**2023-05-22**|**NeRFuser: Large-Scale Scene Representation by NeRF Fusion**|Jiading Fang et.al.|[2305.13307v1](http://arxiv.org/abs/2305.13307v1)|**[link](https://github.com/ripl/nerfuser)**|\n", "2305.12843": "|**2023-05-22**|**Registering Neural Radiance Fields as 3D Density Images**|Han Jiang et.al.|[2305.12843v1](http://arxiv.org/abs/2305.12843v1)|null|\n", "2305.14093": "|**2023-05-24**|**3D Open-vocabulary Segmentation with Foundation Models**|Kunhao Liu et.al.|[2305.14093v2](http://arxiv.org/abs/2305.14093v2)|**[link](https://github.com/kunhao-liu/3d-ovs)**|\n", "2305.15171": "|**2023-05-31**|**Deceptive-NeRF: Enhancing NeRF Reconstruction using Pseudo-Observations from Diffusion Models**|Xinhang Liu et.al.|[2305.15171v2](http://arxiv.org/abs/2305.15171v2)|null|\n", "2305.15094": "|**2023-05-24**|**InpaintNeRF360: Text-Guided 3D Inpainting on Unbounded Neural Radiance Fields**|Dongqing Wang et.al.|[2305.15094v1](http://arxiv.org/abs/2305.15094v1)|null|\n", "2305.14831": "|**2023-05-24**|**OD-NeRF: Efficient Training of On-the-Fly Dynamic Neural Radiance Fields**|Zhiwen Yan et.al.|[2305.14831v1](http://arxiv.org/abs/2305.14831v1)|null|\n", "2305.16233": "|**2023-05-25**|**Interactive Segment Anything NeRF with Feature Imitation**|Xiaokang Chen et.al.|[2305.16233v1](http://arxiv.org/abs/2305.16233v1)|null|\n", "2305.16213": "|**2023-05-25**|**ProlificDreamer: High-Fidelity and Diverse Text-to-3D Generation with Variational Score Distillation**|Zhengyi Wang et.al.|[2305.16213v1](http://arxiv.org/abs/2305.16213v1)|**[link](https://github.com/thu-ml/prolificdreamer)**|\n", "2305.16914": "|**2023-06-06**|**PlaNeRF: SVD Unsupervised 3D Plane Regularization for NeRF Large-Scale Scene Reconstruction**|Fusang Wang et.al.|[2305.16914v3](http://arxiv.org/abs/2305.16914v3)|null|\n", "2305.16411": "|**2023-05-25**|**ZeroAvatar: Zero-shot 3D Avatar Generation from a Single Image**|Zhenzhen Weng et.al.|[2305.16411v1](http://arxiv.org/abs/2305.16411v1)|null|\n", "2305.18079": "|**2023-05-31**|**Towards a Robust Framework for NeRF Evaluation**|Adrian Azzarelli et.al.|[2305.18079v3](http://arxiv.org/abs/2305.18079v3)|**[link](https://github.com/azzarelli/wape)**|\n", "2305.17916": "|**2023-05-31**|**Volume Feature Rendering for Fast Neural Radiance Field Reconstruction**|Kang Han et.al.|[2305.17916v2](http://arxiv.org/abs/2305.17916v2)|null|\n", "2305.19201": "|**2023-05-30**|**D\u00e4RF: Boosting Radiance Fields from Sparse Inputs with Monocular Depth Adaptation**|Jiuhn Song et.al.|[2305.19201v1](http://arxiv.org/abs/2305.19201v1)|**[link](https://github.com/KU-CVLAB/DaRF)**|\n", "2305.19065": "|**2023-05-30**|**Template-free Articulated Neural Point Clouds for Reposable View Synthesis**|Lukas Uzolas et.al.|[2305.19065v1](http://arxiv.org/abs/2305.19065v1)|**[link](https://github.com/lukasuz/articulated-point-nerf)**|\n", "2305.18766": "|**2023-05-31**|**HiFA: High-fidelity Text-to-3D with Advanced Diffusion Guidance**|Junzhe Zhu et.al.|[2305.18766v2](http://arxiv.org/abs/2305.18766v2)|null|\n", "2306.00783": "|**2023-06-01**|**FDNeRF: Semantics-Driven Face Reconstruction, Prompt Editing and Relighting with Diffusion Models**|Hao Zhang et.al.|[2306.00783v1](http://arxiv.org/abs/2306.00783v1)|**[link](https://github.com/billyxyb/fdnerf)**|\n", "2306.00696": "|**2023-06-01**|**Analyzing the Internals of Neural Radiance Fields**|Lukas Radl et.al.|[2306.00696v1](http://arxiv.org/abs/2306.00696v1)|**[link](https://github.com/r4dl/nerfinternals)**|\n", "2306.00547": "|**2023-06-02**|**AvatarStudio: Text-driven Editing of 3D Dynamic Human Head Avatars**|Mohit Mendiratta et.al.|[2306.00547v2](http://arxiv.org/abs/2306.00547v2)|null|\n", "2306.03000": "|**2023-06-05**|**BeyondPixels: A Comprehensive Review of the Evolution of Neural Radiance Fields**|AKM Shahariar Azad Rabby et.al.|[2306.03000v1](http://arxiv.org/abs/2306.03000v1)|null|\n", "2306.02741": "|**2023-06-05**|**ZIGNeRF: Zero-shot 3D Scene Representation with Invertible Generative Neural Radiance Fields**|Kanghyeok Ko et.al.|[2306.02741v1](http://arxiv.org/abs/2306.02741v1)|null|\n", "2306.03727": "|**2023-06-06**|**Towards Visual Foundational Models of Physical Scenes**|Chethan Parameshwara et.al.|[2306.03727v1](http://arxiv.org/abs/2306.03727v1)|null|\n", "2306.03576": "|**2023-06-06**|**Human 3D Avatar Modeling with Implicit Neural Representation: A Brief Survey**|Mingyang Sun et.al.|[2306.03576v1](http://arxiv.org/abs/2306.03576v1)|null|\n", "2306.03207": "|**2023-06-05**|**H2-Mapping: Real-time Dense Mapping Using Hierarchical Hybrid Representation**|Chenxing Jiang et.al.|[2306.03207v1](http://arxiv.org/abs/2306.03207v1)|**[link](https://github.com/sysu-star/h2-mapping)**|\n", "2306.05410": "|**2023-06-08**|**LU-NeRF: Scene and Pose Estimation by Synchronizing Local Unposed NeRFs**|Zezhou Cheng et.al.|[2306.05410v1](http://arxiv.org/abs/2306.05410v1)|null|\n", "2306.05303": "|**2023-06-08**|**Enhance-NeRF: Multiple Performance Evaluation for Neural Radiance Fields**|Qianqiu Tan et.al.|[2306.05303v1](http://arxiv.org/abs/2306.05303v1)|**[link](https://github.com/tanqianq/enhance-nerf)**|\n", "2306.06093": "|**2023-06-09**|**HyP-NeRF: Learning Improved NeRF Priors using a HyperNetwork**|Bipasha Sen et.al.|[2306.06093v1](http://arxiv.org/abs/2306.06093v1)|null|\n", "2306.06044": "|**2023-06-09**|**GANeRF: Leveraging Discriminators to Optimize Neural Radiance Fields**|Barbara Roessle et.al.|[2306.06044v1](http://arxiv.org/abs/2306.06044v1)|null|\n", "2306.05668": "|**2023-06-09**|**RePaint-NeRF: NeRF Editting via Semantic Masks and Diffusion Models**|Xingchen Zhou et.al.|[2306.05668v1](http://arxiv.org/abs/2306.05668v1)|null|\n", "2306.06388": "|**2023-06-10**|**From NeRFLiX to NeRFLiX++: A General NeRF-Agnostic Restorer Paradigm**|Kun Zhou et.al.|[2306.06388v1](http://arxiv.org/abs/2306.06388v1)|null|\n", "2306.06300": "|**2023-06-15**|**NERFBK: A High-Quality Benchmark for NERF-Based 3D Reconstruction**|Ali Karami et.al.|[2306.06300v2](http://arxiv.org/abs/2306.06300v2)|**[link](https://github.com/3dom-fbk/nerfbk)**|\n", "2306.07581": "|**2023-06-13**|**Binary Radiance Fields**|Seungjoo Shin et.al.|[2306.07581v1](http://arxiv.org/abs/2306.07581v1)|null|\n", "2306.09349": "|**2023-06-16**|**UrbanIR: Large-Scale Urban Scene Inverse Rendering from a Single Video**|Zhi-Hao Lin et.al.|[2306.09349v2](http://arxiv.org/abs/2306.09349v2)|null|\n", "2306.08068": "|**2023-06-13**|**DORSal: Diffusion for Object-centric Representations of Scenes $\\textit{et al.}$**|Allan Jabri et.al.|[2306.08068v1](http://arxiv.org/abs/2306.08068v1)|null|\n", "2306.09551": "|**2023-06-15**|**Edit-DiffNeRF: Editing 3D Neural Radiance Fields using 2D Diffusion Model**|Lu Yu et.al.|[2306.09551v1](http://arxiv.org/abs/2306.09551v1)|null|\n", "2306.11556": "|**2023-06-20**|**NeRF synthesis with shading guidance**|Chenbin Li et.al.|[2306.11556v1](http://arxiv.org/abs/2306.11556v1)|null|\n", "2306.10350": "|**2023-06-24**|**MA-NeRF: Motion-Assisted Neural Radiance Fields for Face Synthesis from Sparse Images**|Weichen Zhang et.al.|[2306.10350v2](http://arxiv.org/abs/2306.10350v2)|null|\n", "2306.12423": "|**2023-06-21**|**Benchmarking and Analyzing 3D-aware Image Synthesis with a Modularized Codebase**|Qiuyu Wang et.al.|[2306.12423v1](http://arxiv.org/abs/2306.12423v1)|**[link](https://github.com/qiuyu96/carver)**|\n", "2306.12422": "|**2023-06-21**|**DreamTime: An Improved Optimization Strategy for Text-to-3D Content Creation**|Yukun Huang et.al.|[2306.12422v1](http://arxiv.org/abs/2306.12422v1)|null|\n", "2306.12760": "|**2023-06-22**|**Blended-NeRF: Zero-Shot Object Generation and Blending in Existing Neural Radiance Fields**|Ori Gordon et.al.|[2306.12760v1](http://arxiv.org/abs/2306.12760v1)|**[link](https://github.com/orig333/Blended-NeRF)**|\n", "2306.12570": "|**2023-06-21**|**Local 3D Editing via 3D Distillation of CLIP Knowledge**|Junha Hyung et.al.|[2306.12570v1](http://arxiv.org/abs/2306.12570v1)|null|\n", "2306.15203": "|**2023-06-27**|**Unsupervised Polychromatic Neural Representation for CT Metal Artifact Reduction**|Qing Wu et.al.|[2306.15203v1](http://arxiv.org/abs/2306.15203v1)|**[link](https://github.com/iwuqing/polyner)**|\n", "2306.16541": "|**2023-06-28**|**Envisioning a Next Generation Extended Reality Conferencing System with Efficient Photorealistic Human Rendering**|Chuanyue Shen et.al.|[2306.16541v1](http://arxiv.org/abs/2306.16541v1)|null|\n", "2306.17723": "|**2023-07-16**|**FlipNeRF: Flipped Reflection Rays for Few-shot Novel View Synthesis**|Seunghyeon Seo et.al.|[2306.17723v2](http://arxiv.org/abs/2306.17723v2)|**[link](https://github.com/shawn615/FlipNeRF)**|\n", "2306.17624": "|**2023-07-03**|**Sphere2Vec: A General-Purpose Location Representation Learning over a Spherical Surface for Large-Scale Geospatial Predictions**|Gengchen Mai et.al.|[2306.17624v2](http://arxiv.org/abs/2306.17624v2)|null|\n", "2307.03441": "|**2023-07-07**|**NOFA: NeRF-based One-shot Facial Avatar Reconstruction**|Wangbo Yu et.al.|[2307.03441v1](http://arxiv.org/abs/2307.03441v1)|null|\n", "2307.03404": "|**2023-07-07**|**RGB-D Mapping and Tracking in a Plenoxel Radiance Field**|Andreas L. Teigen et.al.|[2307.03404v1](http://arxiv.org/abs/2307.03404v1)|**[link](https://github.com/ysus33/rgb-d_plenoxel_mapping_tracking)**|\n", "2307.05087": "|**2023-07-11**|**SAR-NeRF: Neural Radiance Fields for Synthetic Aperture Radar Multi-View Representation**|Zhengxin Lei et.al.|[2307.05087v1](http://arxiv.org/abs/2307.05087v1)|null|\n", "2307.08093": "|**2023-07-16**|**Cross-Ray Neural Radiance Fields for Novel-view Synthesis from Unconstrained Image Collections**|Yifan Yang et.al.|[2307.08093v1](http://arxiv.org/abs/2307.08093v1)|**[link](https://github.com/yifyang993/cr-nerf-pytorch)**|\n", "2307.07729": "|**2023-07-15**|**Improving NeRF with Height Data for Utilization of GIS Data**|Hinata Aoki et.al.|[2307.07729v1](http://arxiv.org/abs/2307.07729v1)|null|\n", "2307.09323": "|**2023-07-18**|**Efficient Region-Aware Neural Radiance Fields for High-Fidelity Talking Portrait Synthesis**|Jiahe Li et.al.|[2307.09323v1](http://arxiv.org/abs/2307.09323v1)|**[link](https://github.com/fictionarry/er-nerf)**|\n", "2307.10135": "|**2023-07-19**|**An Improved NeuMIP with Better Accuracy**|Bowen Xue et.al.|[2307.10135v1](http://arxiv.org/abs/2307.10135v1)|null|\n", "2307.09860": "|**2023-07-19**|**Magic NeRF Lens: Interactive Fusion of Neural Radiance Fields for Virtual Facility Inspection**|Ke Li et.al.|[2307.09860v1](http://arxiv.org/abs/2307.09860v1)|**[link](https://github.com/uhhhci/immersive-ngp)**|\n", "2307.09555": "|**2023-07-14**|**Transient Neural Radiance Fields for Lidar View Synthesis and 3D Reconstruction**|Anagh Malik et.al.|[2307.09555v1](http://arxiv.org/abs/2307.09555v1)|null|\n", "2307.10776": "|**2023-07-20**|**Urban Radiance Field Representation with Deformable Neural Mesh Primitives**|Fan Lu et.al.|[2307.10776v1](http://arxiv.org/abs/2307.10776v1)|null|\n", "2307.10664": "|**2023-07-20**|**Lighting up NeRF via Unsupervised Decomposition and Enhancement**|Haoyuan Wang et.al.|[2307.10664v1](http://arxiv.org/abs/2307.10664v1)|**[link](https://github.com/onpix/LLNeRF)**|\n", "2307.11526": "|**2023-07-29**|**CopyRNeRF: Protecting the CopyRight of Neural Radiance Fields**|Ziyuan Luo et.al.|[2307.11526v2](http://arxiv.org/abs/2307.11526v2)|null|\n", "2307.11418": "|**2023-08-07**|**FaceCLIPNeRF: Text-driven 3D Face Manipulation using Deformable Neural Radiance Fields**|Sungwon Hwang et.al.|[2307.11418v2](http://arxiv.org/abs/2307.11418v2)|null|\n", "2307.11335": "|**2023-07-21**|**Tri-MipRF: Tri-Mip Representation for Efficient Anti-Aliasing Neural Radiance Fields**|Wenbo Hu et.al.|[2307.11335v1](http://arxiv.org/abs/2307.11335v1)|null|\n", "2307.12909": "|**2023-07-24**|**Dyn-E: Local Appearance Editing of Dynamic Neural Radiance Fields**|Shangzhan Zhang et.al.|[2307.12909v1](http://arxiv.org/abs/2307.12909v1)|null|\n", "2307.12718": "|**2023-07-24**|**CarPatch: A Synthetic Benchmark for Radiance Field Evaluation on Vehicle Components**|Davide Di Nucci et.al.|[2307.12718v1](http://arxiv.org/abs/2307.12718v1)|null|\n", "2307.12291": "|**2023-07-23**|**TransHuman: A Transformer-based Human Representation for Generalizable Neural Human Rendering**|Xiao Pan et.al.|[2307.12291v1](http://arxiv.org/abs/2307.12291v1)|null|\n", "2307.13908": "|**2023-07-26**|**Points-to-3D: Bridging the Gap between Sparse Points and Shape-Controllable Text-to-3D Generation**|Chaohui Yu et.al.|[2307.13908v1](http://arxiv.org/abs/2307.13908v1)|null|\n", "2307.15058": "|**2023-07-27**|**MARS: An Instance-aware, Modular and Realistic Simulator for Autonomous Driving**|Zirui Wu et.al.|[2307.15058v1](http://arxiv.org/abs/2307.15058v1)|**[link](https://github.com/open-air-sun/mars)**|\n", "2307.14620": "|**2023-07-27**|**NeRF-Det: Learning Geometry-Aware Volumetric Representation for Multi-View 3D Object Detection**|Chenfeng Xu et.al.|[2307.14620v1](http://arxiv.org/abs/2307.14620v1)|**[link](https://github.com/facebookresearch/nerf-det)**|\n", "2307.15333": "|**2023-07-28**|**Dynamic PlenOctree for Adaptive Sampling Refinement in Explicit NeRF**|Haotian Bai et.al.|[2307.15333v1](http://arxiv.org/abs/2307.15333v1)|null|\n", "2307.15131": "|**2023-07-27**|**Seal-3D: Interactive Pixel-Level Editing for Neural Radiance Fields**|Xiangyu Wang et.al.|[2307.15131v1](http://arxiv.org/abs/2307.15131v1)|**[link](https://github.com/windingwind/seal-3d)**|\n", "2308.00462": "|**2023-08-01**|**Context-Aware Talking-Head Video Editing**|Songlin Yang et.al.|[2308.00462v1](http://arxiv.org/abs/2308.00462v1)|null|\n", "2308.01262": "|**2023-08-02**|**Incorporating Season and Solar Specificity into Renderings made by a NeRF Architecture using Satellite Images**|Michael Gableman et.al.|[2308.01262v1](http://arxiv.org/abs/2308.01262v1)|**[link](https://github.com/enterprisecv-6/season-nerf)**|\n", "2308.00773": "|**2023-08-01**|**High-Fidelity Eye Animatable Neural Radiance Fields for Human Face**|Hengfei Wang et.al.|[2308.00773v1](http://arxiv.org/abs/2308.00773v1)|null|\n", "2308.02191": "|**2023-08-04**|**ES-MVSNet: Efficient Framework for End-to-end Self-supervised Multi-View Stereo**|Qiang Zhou et.al.|[2308.02191v1](http://arxiv.org/abs/2308.02191v1)|null|\n", "2308.03280": "|**2023-08-07**|**Mirror-NeRF: Learning Neural Radiance Fields for Mirrors with Whitted-Style Ray Tracing**|Junyi Zeng et.al.|[2308.03280v1](http://arxiv.org/abs/2308.03280v1)|null|\n", "2308.02908": "|**2023-08-05**|**Where and How: Mitigating Confusion in Neural Radiance Fields from Sparse Inputs**|Yanqi Bao et.al.|[2308.02908v1](http://arxiv.org/abs/2308.02908v1)|**[link](https://github.com/bbbbby-99/wah-nerf)**|\n", "2308.02840": "|**2023-08-05**|**Learning Unified Decompositional and Compositional NeRF for Editable Novel View Synthesis**|Yuxin Wang et.al.|[2308.02840v1](http://arxiv.org/abs/2308.02840v1)|null|\n", "2308.02751": "|**2023-08-05**|**NeRFs: The Search for the Best 3D Representation**|Ravi Ramamoorthi et.al.|[2308.02751v1](http://arxiv.org/abs/2308.02751v1)|null|\n", "2308.04413": "|**2023-08-08**|**Digging into Depth Priors for Outdoor Neural Radiance Fields**|Chen Wang et.al.|[2308.04413v1](http://arxiv.org/abs/2308.04413v1)|null|\n", "2308.03772": "|**2023-07-27**|**Improved Neural Radiance Fields Using Pseudo-depth and Fusion**|Jingliang Li et.al.|[2308.03772v1](http://arxiv.org/abs/2308.03772v1)|null|\n", "2308.04826": "|**2023-08-09**|**WaveNeRF: Wavelet-based Generalizable Neural Radiance Fields**|Muyu Xu et.al.|[2308.04826v1](http://arxiv.org/abs/2308.04826v1)|null|\n", "2308.04669": "|**2023-08-14**|**A General Implicit Framework for Fast NeRF Composition and Rendering**|Xinyu Gao et.al.|[2308.04669v2](http://arxiv.org/abs/2308.04669v2)|null|\n", "2308.05970": "|**2023-08-11**|**Focused Specific Objects NeRF**|Yuesong Li et.al.|[2308.05970v1](http://arxiv.org/abs/2308.05970v1)|null|\n", "2308.05939": "|**2023-08-11**|**VERF: Runtime Monitoring of Pose Estimation with Neural Radiance Fields**|Dominic Maggio et.al.|[2308.05939v1](http://arxiv.org/abs/2308.05939v1)|null|\n", "2308.07118": "|**2023-08-16**|**Neural radiance fields in the industrial and robotics domain: applications, research opportunities and use cases**|Eugen \u0160lapak et.al.|[2308.07118v2](http://arxiv.org/abs/2308.07118v2)|**[link](https://github.com/maftej/iisnerf)**|\n", "2308.07032": "|**2023-08-14**|**S3IM: Stochastic Structural SIMilarity and Its Unreasonable Effectiveness for Neural Fields**|Zeke Xie et.al.|[2308.07032v1](http://arxiv.org/abs/2308.07032v1)|**[link](https://github.com/madaoer/s3im_nerf)**|\n", "2308.08530": "|**2023-08-21**|**Ref-DVGO: Reflection-Aware Direct Voxel Grid Optimization for an Improved Quality-Efficiency Trade-Off in Reflective Scene Reconstruction**|Georgios Kouros et.al.|[2308.08530v3](http://arxiv.org/abs/2308.08530v3)|**[link](https://github.com/gkouros/ref-dvgo)**|\n", "2308.08258": "|**2023-08-16**|**SceNeRFlow: Time-Consistent Reconstruction of General Dynamic Scenes**|Edith Tretschk et.al.|[2308.08258v1](http://arxiv.org/abs/2308.08258v1)|null|\n", "2308.09421": "|**2023-08-18**|**MonoNeRD: NeRF-like Representations for Monocular 3D Object Detection**|Junkai Xu et.al.|[2308.09421v1](http://arxiv.org/abs/2308.09421v1)|**[link](https://github.com/cskkxjk/mononerd)**|\n", "2308.09386": "|**2023-08-18**|**DReg-NeRF: Deep Registration for Neural Radiance Fields**|Yu Chen et.al.|[2308.09386v1](http://arxiv.org/abs/2308.09386v1)|**[link](https://github.com/aibluefisher/dreg-nerf)**|\n", "2308.08947": "|**2023-08-17**|**Watch Your Steps: Local Image and Scene Editing by Text Instructions**|Ashkan Mirzaei et.al.|[2308.08947v1](http://arxiv.org/abs/2308.08947v1)|null|\n", "2308.10902": "|**2023-08-30**|**CamP: Camera Preconditioning for Neural Radiance Fields**|Keunhong Park et.al.|[2308.10902v2](http://arxiv.org/abs/2308.10902v2)|null|\n", "2308.10337": "|**2023-08-20**|**Strata-NeRF : Neural Radiance Fields for Stratified Scenes**|Ankit Dhiman et.al.|[2308.10337v1](http://arxiv.org/abs/2308.10337v1)|null|\n", "2308.10122": "|**2023-08-19**|**HollowNeRF: Pruning Hashgrid-Based NeRFs with Trainable Collision Mitigation**|Xiufeng Xie et.al.|[2308.10122v1](http://arxiv.org/abs/2308.10122v1)|null|\n", "2308.10001": "|**2023-08-19**|**AltNeRF: Learning Robust Neural Radiance Field via Alternating Depth-Pose Optimization**|Kun Wang et.al.|[2308.10001v1](http://arxiv.org/abs/2308.10001v1)|null|\n", "2308.09894": "|**2023-08-19**|**Semantic-Human: Neural Rendering of Humans from Monocular Video with Human Parsing**|Jie Zhang et.al.|[2308.09894v1](http://arxiv.org/abs/2308.09894v1)|null|\n", "2308.11198": "|**2023-08-22**|**Novel-view Synthesis and Pose Estimation for Hand-Object Interaction from Sparse Views**|Wentian Qu et.al.|[2308.11198v1](http://arxiv.org/abs/2308.11198v1)|null|\n", "2308.11130": "|**2023-08-22**|**Efficient View Synthesis with Neural Radiance Distribution Field**|Yushuang Wu et.al.|[2308.11130v1](http://arxiv.org/abs/2308.11130v1)|null|\n", "2308.11974": "|**2023-08-23**|**Blending-NeRF: Text-Driven Localized Editing in Neural Radiance Fields**|Hyeonseop Song et.al.|[2308.11974v1](http://arxiv.org/abs/2308.11974v1)|null|\n", "2308.11951": "|**2023-08-25**|**Pose Modulated Avatars from Video**|Chunjin Song et.al.|[2308.11951v2](http://arxiv.org/abs/2308.11951v2)|null|\n", "2308.11793": "|**2023-08-22**|**Enhancing NeRF akin to Enhancing LLMs: Generalizable NeRF Transformer with Mixture-of-View-Experts**|Wenyan Cong et.al.|[2308.11793v1](http://arxiv.org/abs/2308.11793v1)|**[link](https://github.com/vita-group/gnt-move)**|\n", "2308.11774": "|**2023-08-22**|**SAMSNeRF: Segment Anything Model (SAM) Guides Dynamic Surgical Scene Reconstruction by Neural Radiance Field (NeRF)**|Ange Lou et.al.|[2308.11774v1](http://arxiv.org/abs/2308.11774v1)|null|\n", "2308.12560": "|**2023-08-24**|**NOVA: NOvel View Augmentation for Neural Composition of Dynamic Objects**|Dakshit Agrawal et.al.|[2308.12560v1](http://arxiv.org/abs/2308.12560v1)|**[link](https://github.com/dakshitagrawal/nova)**|\n", "2308.13897": "|**2023-08-26**|**InsertNeRF: Instilling Generalizability into NeRF with HyperNet Modules**|Yanqi Bao et.al.|[2308.13897v1](http://arxiv.org/abs/2308.13897v1)|**[link](https://github.com/bbbbby-99/insertnerf)**|\n", "2308.15049": "|**2023-08-29**|**Pose-Free Neural Radiance Fields via Implicit Pose Regularization**|Jiahui Zhang et.al.|[2308.15049v1](http://arxiv.org/abs/2308.15049v1)|null|\n", "2308.14816": "|**2023-08-28**|**CLNeRF: Continual Learning Meets NeRF**|Zhipeng Cai et.al.|[2308.14816v1](http://arxiv.org/abs/2308.14816v1)|**[link](https://github.com/intellabs/clnerf)**|\n", "2308.16041": "|**2023-08-30**|**From Pixels to Portraits: A Comprehensive Survey of Talking Head Generation Techniques and Applications**|Shreyank N Gowda et.al.|[2308.16041v1](http://arxiv.org/abs/2308.16041v1)|null|\n", "2308.15733": "|**2023-08-30**|**Drone-NeRF: Efficient NeRF Based 3D Scene Reconstruction for Large-Scale Drone Survey**|Zhihao Jia et.al.|[2308.15733v1](http://arxiv.org/abs/2308.15733v1)|null|\n", "2308.15547": "|**2023-08-29**|**Efficient Ray Sampling for Radiance Fields Reconstruction**|Shilei Sun et.al.|[2308.15547v1](http://arxiv.org/abs/2308.15547v1)|null|\n", "2308.16576": "|**2023-09-03**|**GHuNeRF: Generalizable Human NeRF from a Monocular Video**|Chen Li et.al.|[2308.16576v2](http://arxiv.org/abs/2308.16576v2)|null|\n", "2309.00277": "|**2023-09-01**|**SparseSat-NeRF: Dense Depth Supervised Neural Radiance Fields for Sparse Satellite Images**|Lulin Zhang et.al.|[2309.00277v1](http://arxiv.org/abs/2309.00277v1)|**[link](https://github.com/lulinzhang/sps-nerf)**|\n", "2309.00014": "|**2023-09-04**|**Improving NeRF Quality by Progressive Camera Placement for Unrestricted Navigation in Complex Environments**|Georgios Kopanas et.al.|[2309.00014v2](http://arxiv.org/abs/2309.00014v2)|null|\n", "2309.01811": "|**2023-09-06**|**Instant Continual Learning of Neural Radiance Fields**|Ryan Po et.al.|[2309.01811v2](http://arxiv.org/abs/2309.01811v2)|null|\n", "2309.01351": "|**2023-09-04**|**Adv3D: Generating 3D Adversarial Examples in Driving Scenarios with NeRF**|Leheng Li et.al.|[2309.01351v1](http://arxiv.org/abs/2309.01351v1)|null|\n", "2309.03185": "|**2023-09-06**|**Bayes' Rays: Uncertainty Quantification for Neural Radiance Fields**|Lily Goli et.al.|[2309.03185v1](http://arxiv.org/abs/2309.03185v1)|**[link](https://github.com/BayesRays/BayesRays)**|\n", "2309.03160": "|**2023-09-06**|**ResFields: Residual Neural Fields for Spatiotemporal Signals**|Marko Mihajlovic et.al.|[2309.03160v1](http://arxiv.org/abs/2309.03160v1)|**[link](https://github.com/markomih/ResFields)**|\n", "2309.03550": "|**2023-09-07**|**Text2Control3D: Controllable 3D Avatar Generation in Neural Radiance Fields using Geometry-Guided Text-to-Image Diffusion Model**|Sungwon Hwang et.al.|[2309.03550v1](http://arxiv.org/abs/2309.03550v1)|null|\n", "2309.04410": "|**2023-09-08**|**DeformToon3D: Deformable 3D Toonification from Neural Radiance Fields**|Junzhe Zhang et.al.|[2309.04410v1](http://arxiv.org/abs/2309.04410v1)|**[link](https://github.com/junzhezhang/deformtoon3d)**|\n", "2309.03955": "|**2023-09-14**|**SimpleNeRF: Regularizing Sparse Input Neural Radiance Fields with Simpler Solutions**|Nagabhushan Somraj et.al.|[2309.03955v2](http://arxiv.org/abs/2309.03955v2)|null|\n", "2309.03933": "|**2023-09-07**|**BluNF: Blueprint Neural Field**|Robin Courant et.al.|[2309.03933v1](http://arxiv.org/abs/2309.03933v1)|null|\n", "2309.05339": "|**2023-09-11**|**PAg-NeRF: Towards fast and efficient end-to-end panoptic 3D representations for agricultural robotics**|Claus Smitt et.al.|[2309.05339v1](http://arxiv.org/abs/2309.05339v1)|null|\n", "2309.04917": "|**2023-09-10**|**Text-driven Editing of 3D Scenes without Retraining**|Shuangkang Fang et.al.|[2309.04917v1](http://arxiv.org/abs/2309.04917v1)|**[link](https://github.com/Fangkang515/DN2N)**|\n", "2309.04750": "|**2023-09-09**|**Mirror-Aware Neural Humans**|Daniel Ajisafe et.al.|[2309.04750v1](http://arxiv.org/abs/2309.04750v1)|null|\n", "2309.04581": "|**2023-09-08**|**Dynamic Mesh-Aware Radiance Fields**|Yi-Ling Qiao et.al.|[2309.04581v1](http://arxiv.org/abs/2309.04581v1)|null|\n", "2309.06030": "|**2023-09-12**|**Federated Learning for Large-Scale Scene Modeling with Neural Radiance Fields**|Teppei Suzuki et.al.|[2309.06030v1](http://arxiv.org/abs/2309.06030v1)|null|\n", "2309.07125": "|**2023-09-13**|**Text-Guided Generation and Editing of Compositional 3D Avatars**|Hao Zhang et.al.|[2309.07125v1](http://arxiv.org/abs/2309.07125v1)|null|\n", "2309.06802": "|**2023-09-13**|**Dynamic NeRFs for Soccer Scenes**|Sacha Lewin et.al.|[2309.06802v1](http://arxiv.org/abs/2309.06802v1)|null|\n", "2309.07846": "|**2023-09-14**|**MC-NeRF: Muti-Camera Neural Radiance Fields for Muti-Camera Image Acquisition Systems**|Yu Gao et.al.|[2309.07846v1](http://arxiv.org/abs/2309.07846v1)|null|\n", "2309.07752": "|**2023-09-14**|**DT-NeRF: Decomposed Triplane-Hash Neural Radiance Fields for High-Fidelity Talking Portrait Synthesis**|Yaoyu Su et.al.|[2309.07752v1](http://arxiv.org/abs/2309.07752v1)|null|\n", "2309.07668": "|**2023-09-14**|**CoRF : Colorizing Radiance Fields using Knowledge Distillation**|Ankit Dhiman et.al.|[2309.07668v1](http://arxiv.org/abs/2309.07668v1)|null|\n", "2309.08596": "|**2023-09-15**|**Robust e-NeRF: NeRF from Sparse & Noisy Events under Non-Uniform Motion**|Weng Fei Low et.al.|[2309.08596v1](http://arxiv.org/abs/2309.08596v1)|**[link](https://github.com/wengflow/robust-e-nerf)**|\n", "2309.08040": "|**2023-09-14**|**Gradient based Grasp Pose Optimization on a NeRF that Approximates Grasp Success**|Gergely S\u00f3ti et.al.|[2309.08040v1](http://arxiv.org/abs/2309.08040v1)|null|\n", "2309.09502": "|**2023-09-18**|**RenderOcc: Vision-Centric 3D Occupancy Prediction with 2D Rendering Supervision**|Mingjie Pan et.al.|[2309.09502v1](http://arxiv.org/abs/2309.09502v1)|**[link](https://github.com/pmj110119/renderocc)**|\n", "2309.09295": "|**2023-09-17**|**NeRF-VINS: A Real-time Neural Radiance Field Map-based Visual-Inertial Navigation System**|Saimouli Katragadda et.al.|[2309.09295v1](http://arxiv.org/abs/2309.09295v1)|null|\n", "2309.08927": "|**2023-09-16**|**DynaMoN: Motion-Aware Fast And Robust Camera Localization for Dynamic NeRF**|Mert Asim Karaoglu et.al.|[2309.08927v1](http://arxiv.org/abs/2309.08927v1)|null|\n", "2309.10684": "|**2023-09-19**|**Locally Stylized Neural Radiance Fields**|Hong-Wing Pang et.al.|[2309.10684v1](http://arxiv.org/abs/2309.10684v1)|null|\n", "2309.10503": "|**2023-09-19**|**Steganography for Neural Radiance Fields by Backdooring**|Weina Dong et.al.|[2309.10503v1](http://arxiv.org/abs/2309.10503v1)|null|\n", "2309.10011": "|**2023-09-18**|**Instant Photorealistic Style Transfer: A Lightweight and Adaptive Approach**|Rong Liu et.al.|[2309.10011v1](http://arxiv.org/abs/2309.10011v1)|null|\n", "2309.11009": "|**2023-09-21**|**Controllable Dynamic Appearance for Neural 3D Portraits**|ShahRukh Athar et.al.|[2309.11009v2](http://arxiv.org/abs/2309.11009v2)|null|\n", "2309.10987": "|**2023-09-20**|**Spiking NeRF: Making Bio-inspired Neural Networks See through the Real World**|Xingting Yao et.al.|[2309.10987v1](http://arxiv.org/abs/2309.10987v1)|null|\n", "2309.12183": "|**2023-09-21**|**ORTexME: Occlusion-Robust Human Shape and Pose via Temporal Average Texture and Mesh Encoding**|Yu Cheng et.al.|[2309.12183v1](http://arxiv.org/abs/2309.12183v1)|null|\n", "2309.11966": "|**2023-09-21**|**NeuralLabeling: A versatile toolset for labeling vision datasets using Neural Radiance Fields**|Floris Erich et.al.|[2309.11966v1](http://arxiv.org/abs/2309.11966v1)|**[link](https://github.com/FlorisE/neural-labeling)**|\n", "2309.11767": "|**2023-09-21**|**Fast Satellite Tensorial Radiance Field for Multi-date Satellite Imagery of Large Size**|Tongtong Zhang et.al.|[2309.11767v1](http://arxiv.org/abs/2309.11767v1)|null|\n", "2309.11747": "|**2023-09-21**|**MarkNerf:Watermarking for Neural Radiance Field**|Lifeng Chen et.al.|[2309.11747v1](http://arxiv.org/abs/2309.11747v1)|null|\n", "2309.11698": "|**2023-09-21**|**Rendering stable features improves sampling-based localisation with Neural radiance fields**|Boxuan Zhang et.al.|[2309.11698v1](http://arxiv.org/abs/2309.11698v1)|null|\n", "2309.11627": "|**2023-09-20**|**GenLayNeRF: Generalizable Layered Representations with 3D Model Alignment for Multi-Human View Synthesis**|Youssef Abdelkareem et.al.|[2309.11627v1](http://arxiv.org/abs/2309.11627v1)|null|\n", "2309.11525": "|**2023-09-23**|**Light Field Diffusion for Single-View Novel View Synthesis**|Yifeng Xiong et.al.|[2309.11525v2](http://arxiv.org/abs/2309.11525v2)|null|\n", "2309.13039": "|**2023-09-22**|**NeRRF: 3D Reconstruction and View Synthesis for Transparent and Specular Objects with Neural Refractive-Reflective Fields**|Xiaoxue Chen et.al.|[2309.13039v1](http://arxiv.org/abs/2309.13039v1)|**[link](https://github.com/dawning77/nerrf)**|\n", "2309.14293": "|**2023-09-25**|**NAS-NeRF: Generative Neural Architecture Search for Neural Radiance Fields**|Saeejith Nair et.al.|[2309.14293v1](http://arxiv.org/abs/2309.14293v1)|null|\n", "2309.14010": "|**2023-09-25**|**Variational Inference for Scalable 3D Object-centric Learning**|Tianyu Wang et.al.|[2309.14010v1](http://arxiv.org/abs/2309.14010v1)|null|\n", "2309.13607": "|**2023-09-24**|**MM-NeRF: Multimodal-Guided 3D Multi-Style Transfer of Neural Radiance Field**|Zijiang Yang et.al.|[2309.13607v1](http://arxiv.org/abs/2309.13607v1)|null|\n", "2309.13240": "|**2023-09-23**|**NeRF-Enhanced Outpainting for Faithful Field-of-View Extrapolation**|Rui Yu et.al.|[2309.13240v1](http://arxiv.org/abs/2309.13240v1)|null|\n", "2309.14800": "|**2023-09-26**|**3D Density-Gradient based Edge Detection on Neural Radiance Fields (NeRFs) for Geometric Reconstruction**|Miriam J\u00e4ger et.al.|[2309.14800v1](http://arxiv.org/abs/2309.14800v1)|null|\n", "2309.15526": "|**2023-09-27**|**P2I-NET: Mapping Camera Pose to Image via Adversarial Learning for New View Synthesis in Real Indoor Environments**|Xujie Kang et.al.|[2309.15526v1](http://arxiv.org/abs/2309.15526v1)|null|\n", "2309.15329": "|**2023-09-27**|**BASED: Bundle-Adjusting Surgical Endoscopic Dynamic Video Reconstruction using Neural Radiance Fields**|Shreya Saha et.al.|[2309.15329v1](http://arxiv.org/abs/2309.15329v1)|null|\n", "2309.16553": "|**2023-09-28**|**MatrixCity: A Large-scale City Dataset for City-scale Neural Rendering and Beyond**|Yixuan Li et.al.|[2309.16553v1](http://arxiv.org/abs/2309.16553v1)|null|\n", "2309.16364": "|**2023-10-04**|**FG-NeRF: Flow-GAN based Probabilistic Neural Radiance Field for Independence-Assumption-Free Uncertainty Estimation**|Songlin Wei et.al.|[2309.16364v2](http://arxiv.org/abs/2309.16364v2)|null|\n", "2309.16110": "|**2023-09-28**|**Learning Effective NeRFs and SDFs Representations with 3D Generative Adversarial Networks for 3D Object Generation: Technical Report for ICCV 2023 OmniObject3D Challenge**|Zheyuan Yang et.al.|[2309.16110v1](http://arxiv.org/abs/2309.16110v1)|null|\n", "2309.17450": "|**2023-09-29**|**Multi-task View Synthesis with Neural Radiance Fields**|Shuhong Zheng et.al.|[2309.17450v1](http://arxiv.org/abs/2309.17450v1)|**[link](https://github.com/zsh2000/muvienerf)**|\n", "2309.17390": "|**2023-09-29**|**Forward Flow for Novel View Synthesis of Dynamic Scenes**|Xiang Guo et.al.|[2309.17390v1](http://arxiv.org/abs/2309.17390v1)|null|\n", "2309.17128": "|**2023-09-29**|**HAvatar: High-fidelity Head Avatar via Facial Model Conditioned Neural Radiance Field**|Xiaochen Zhao et.al.|[2309.17128v1](http://arxiv.org/abs/2309.17128v1)|null|\n", "2309.16859": "|**2023-09-28**|**Preface: A Data-driven Volumetric Prior for Few-shot Ultra High-resolution Face Synthesis**|Marcel C. B\u00fchler et.al.|[2309.16859v1](http://arxiv.org/abs/2309.16859v1)|null|\n", "2310.01881": "|**2023-10-03**|**Adaptive Multi-NeRF: Exploit Efficient Parallelism in Adaptive Multiple Scale Neural Radiance Field Rendering**|Tong Wang et.al.|[2310.01881v1](http://arxiv.org/abs/2310.01881v1)|null|\n", "2310.01821": "|**2023-10-03**|**MIMO-NeRF: Fast Neural Rendering with Multi-input Multi-output Neural Radiance Fields**|Takuhiro Kaneko et.al.|[2310.01821v1](http://arxiv.org/abs/2310.01821v1)|null|\n", "2310.00874": "|**2023-10-02**|**PC-NeRF: Parent-Child Neural Radiance Fields under Partial Sensor Data Loss in Autonomous Driving Environments**|Xiuzhong Hu et.al.|[2310.00874v1](http://arxiv.org/abs/2310.00874v1)|**[link](https://github.com/biter0088/pc-nerf)**|\n", "2310.00684": "|**2023-10-01**|**How Many Views Are Needed to Reconstruct an Unknown Object Using NeRF?**|Sicong Pan et.al.|[2310.00684v1](http://arxiv.org/abs/2310.00684v1)|**[link](https://github.com/psc0628/nerf-prv)**|\n", "2310.00530": "|**2023-10-01**|**Enabling Neural Radiance Fields (NeRF) for Large-scale Aerial Images -- A Multi-tiling Approaching and the Geometry Assessment of NeRF**|Ningli Xu et.al.|[2310.00530v1](http://arxiv.org/abs/2310.00530v1)|null|\n", "2310.00249": "|**2023-09-30**|**MMPI: a Flexible Radiance Field Representation by Multiple Multi-plane Images Blending**|Yuze He et.al.|[2310.00249v1](http://arxiv.org/abs/2310.00249v1)|null|\n", "2310.02977": "|**2023-10-04**|**T$^3$Bench: Benchmarking Current Progress in Text-to-3D Generation**|Yuze He et.al.|[2310.02977v1](http://arxiv.org/abs/2310.02977v1)|**[link](https://github.com/THU-LYJ-Lab/T3Bench)**|\n", "2310.02712": "|**2023-10-04**|**ED-NeRF: Efficient Text-Guided Editing of 3D Scene using Latent Space NeRF**|Jangho Park et.al.|[2310.02712v1](http://arxiv.org/abs/2310.02712v1)|null|\n", "2310.02687": "|**2023-10-05**|**USB-NeRF: Unrolling Shutter Bundle Adjusted Neural Radiance Fields**|Moyang Li et.al.|[2310.02687v2](http://arxiv.org/abs/2310.02687v2)|null|\n", "2310.02437": "|**2023-10-03**|**EvDNeRF: Reconstructing Event Data with Dynamic Neural Radiance Fields**|Anish Bhattacharya et.al.|[2310.02437v1](http://arxiv.org/abs/2310.02437v1)|**[link](https://github.com/anish-bhattacharya/evdnerf)**|\n", "2310.03704": "|**2023-10-05**|**Drag View: Generalizable Novel View Synthesis with Unposed Imagery**|Zhiwen Fan et.al.|[2310.03704v1](http://arxiv.org/abs/2310.03704v1)|**[link](https://github.com/zhiwenfan/DragView)**|\n", "2310.03578": "|**2023-10-05**|**Targeted Adversarial Attacks on Generalizable Neural Radiance Fields**|Andras Horvath et.al.|[2310.03578v1](http://arxiv.org/abs/2310.03578v1)|null|\n", "2310.03563": "|**2023-10-05**|**BID-NeRF: RGB-D image pose estimation with inverted Neural Radiance Fields**|\u00c1goston Istv\u00e1n Csehi et.al.|[2310.03563v1](http://arxiv.org/abs/2310.03563v1)|null|\n", "2310.03125": "|**2023-10-04**|**Shielding the Unseen: Privacy Protection through Poisoning NeRF with Spatial Deformation**|Yihan Wu et.al.|[2310.03125v1](http://arxiv.org/abs/2310.03125v1)|null|\n", "2310.04152": "|**2023-10-06**|**Improving Neural Radiance Field using Near-Surface Sampling with Point Cloud Generation**|Hye Bin Yoo et.al.|[2310.04152v1](http://arxiv.org/abs/2310.04152v1)|null|\n", "2310.05837": "|**2023-10-09**|**A Real-time Method for Inserting Virtual Objects into Neural Radiance Fields**|Keyang Ye et.al.|[2310.05837v1](http://arxiv.org/abs/2310.05837v1)|null|\n", "2310.05391": "|**2023-10-09**|**Neural Impostor: Editing Neural Radiance Fields with Explicit Shape Manipulation**|Ruiyang Liu et.al.|[2310.05391v1](http://arxiv.org/abs/2310.05391v1)|null|\n", "2310.05134": "|**2023-10-08**|**LocoNeRF: A NeRF-based Approach for Local Structure from Motion for Precise Localization**|Artem Nenashev et.al.|[2310.05134v1](http://arxiv.org/abs/2310.05134v1)|null|\n", "2310.05133": "|**2023-10-08**|**Geometry Aware Field-to-field Transformations for 3D Semantic Segmentation**|Dominik Hollidt et.al.|[2310.05133v1](http://arxiv.org/abs/2310.05133v1)|null|\n", "2310.06275": "|**2023-10-10**|**High-Fidelity 3D Head Avatars Reconstruction through Spatially-Varying Expression Conditioned Neural Radiance Field**|Minghan Qin et.al.|[2310.06275v1](http://arxiv.org/abs/2310.06275v1)|null|\n", "2310.07449": "|**2023-10-12**|**PoRF: Pose Residual Field for Accurate Neural Surface Reconstruction**|Jia-Wang Bian et.al.|[2310.07449v2](http://arxiv.org/abs/2310.07449v2)|null|\n", "2310.07179": "|**2023-10-11**|**rpcPRF: Generalizable MPI Neural Radiance Field for Satellite Camera**|Tongtong Zhang et.al.|[2310.07179v1](http://arxiv.org/abs/2310.07179v1)|null|\n", "2310.06984": "|**2023-10-10**|**Leveraging Neural Radiance Fields for Uncertainty-Aware Visual Localization**|Le Chen et.al.|[2310.06984v1](http://arxiv.org/abs/2310.06984v1)|null|\n", "2310.07916": "|**2023-10-11**|**Dynamic Appearance Particle Neural Radiance Field**|Ancheng Lin et.al.|[2310.07916v1](http://arxiv.org/abs/2310.07916v1)|null|\n", "2310.10650": "|**2023-10-16**|**TraM-NeRF: Tracing Mirror and Near-Perfect Specular Reflections through Neural Radiance Fields**|Leif Van Holland et.al.|[2310.10650v1](http://arxiv.org/abs/2310.10650v1)|**[link](https://github.com/Rubikalubi/TraM-NeRF)**|\n", "2310.10624": "|**2023-10-16**|**DynVideo-E: Harnessing Dynamic NeRF for Large-Scale Motion- and View-Change Human-Centric Video Editing**|Jia-Wei Liu et.al.|[2310.10624v1](http://arxiv.org/abs/2310.10624v1)|null|\n", "2310.10209": "|**2023-10-16**|**Self-supervised Fetal MRI 3D Reconstruction Based on Radiation Diffusion Generation Model**|Junpeng Tan et.al.|[2310.10209v1](http://arxiv.org/abs/2310.10209v1)|null|\n", "2310.09965": "|**2023-10-15**|**ProteusNeRF: Fast Lightweight NeRF Editing using 3D-Aware Image Context**|Binglun Wang et.al.|[2310.09965v1](http://arxiv.org/abs/2310.09965v1)|null|\n", "2310.09892": "|**2023-10-15**|**Active Perception using Neural Radiance Fields**|Siming He et.al.|[2310.09892v1](http://arxiv.org/abs/2310.09892v1)|**[link](https://github.com/grasp-lyrl/active-perception-using-neural-radiance-fields)**|\n", "2310.09776": "|**2023-10-15**|**CBARF: Cascaded Bundle-Adjusting Neural Radiance Fields from Imperfect Camera Poses**|Hongyu Fu et.al.|[2310.09776v1](http://arxiv.org/abs/2310.09776v1)|null|\n", "2310.11864": "|**2023-10-18**|**VQ-NeRF: Neural Reflectance Decomposition and Editing with Vector Quantization**|Hongliang Zhong et.al.|[2310.11864v1](http://arxiv.org/abs/2310.11864v1)|null|\n", "2310.11645": "|**2023-10-18**|**Towards Abdominal 3-D Scene Rendering from Laparoscopy Surgical Videos using NeRFs**|Khoa Tuan Nguyen et.al.|[2310.11645v1](http://arxiv.org/abs/2310.11645v1)|null|\n", "2310.13670": "|**2023-10-20**|**ManifoldNeRF: View-dependent Image Feature Supervision for Few-shot Neural Radiance Fields**|Daiju Kanaoka et.al.|[2310.13670v1](http://arxiv.org/abs/2310.13670v1)|null|\n", "2310.13356": "|**2023-10-20**|**Sync-NeRF: Generalizing Dynamic NeRFs to Unsynchronized Videos**|Seoha Kim et.al.|[2310.13356v1](http://arxiv.org/abs/2310.13356v1)|**[link](https://github.com/seoha-kim/Sync-NeRF)**|\n", "2310.13263": "|**2023-10-20**|**UE4-NeRF:Neural Radiance Field for Real-Time Rendering of Large-Scale Scene**|Jiaming Gu et.al.|[2310.13263v1](http://arxiv.org/abs/2310.13263v1)|null|\n", "2310.14695": "|**2023-10-23**|**CAwa-NeRF: Instant Learning of Compression-Aware NeRF Features**|Omnia Mahmoud et.al.|[2310.14695v1](http://arxiv.org/abs/2310.14695v1)|null|\n", "2310.14487": "|**2023-10-23**|**VQ-NeRF: Vector Quantization Enhances Implicit Neural Representations**|Yiying Yang et.al.|[2310.14487v1](http://arxiv.org/abs/2310.14487v1)|null|\n", "2310.15504": "|**2023-10-24**|**Cross-view Self-localization from Synthesized Scene-graphs**|Ryogo Yamamoto et.al.|[2310.15504v1](http://arxiv.org/abs/2310.15504v1)|null|\n", "2310.16832": "|**2023-10-26**|**LightSpeed: Light and Fast Neural Light Fields on Mobile Devices**|Aarush Gupta et.al.|[2310.16832v2](http://arxiv.org/abs/2310.16832v2)|**[link](https://github.com/lightspeed-r2l/lightspeed)**|\n", "2310.16831": "|**2023-10-28**|**PERF: Panoramic Neural Radiance Field from a Single Panorama**|Guangcong Wang et.al.|[2310.16831v2](http://arxiv.org/abs/2310.16831v2)|**[link](https://github.com/perf-project/PeRF)**|\n", "2310.16383": "|**2023-10-25**|**Open-NeRF: Towards Open Vocabulary NeRF Decomposition**|Hao Zhang et.al.|[2310.16383v1](http://arxiv.org/abs/2310.16383v1)|null|\n", "2310.16255": "|**2023-10-25**|**UAV-Sim: NeRF-based Synthetic Data Generation for UAV-based Perception**|Christopher Maxey et.al.|[2310.16255v1](http://arxiv.org/abs/2310.16255v1)|null|\n", "2310.17075": "|**2023-10-27**|**HyperFields: Towards Zero-Shot Generation of NeRFs from Text**|Sudarshan Babu et.al.|[2310.17075v2](http://arxiv.org/abs/2310.17075v2)|null|\n", "2310.16858": "|**2023-10-25**|**4D-Editor: Interactive Object-level Editing in Dynamic Neural Radiance Fields via 4D Semantic Segmentation**|Dadong Jiang et.al.|[2310.16858v1](http://arxiv.org/abs/2310.16858v1)|null|\n", "2310.17994": "|**2023-10-27**|**ZeroNVS: Zero-Shot 360-Degree View Synthesis from a Single Real Image**|Kyle Sargent et.al.|[2310.17994v1](http://arxiv.org/abs/2310.17994v1)|null|\n", "2310.17880": "|**2023-10-27**|**Reconstructive Latent-Space Neural Radiance Fields for Efficient 3D Scene Representations**|Tristan Aumentado-Armstrong et.al.|[2310.17880v1](http://arxiv.org/abs/2310.17880v1)|null|\n", "2310.18917": "|**2023-11-04**|**TiV-NeRF: Tracking and Mapping via Time-Varying Representation with Dynamic Neural Radiance Fields**|Chengyao Duan et.al.|[2310.18917v2](http://arxiv.org/abs/2310.18917v2)|null|\n", "2310.18846": "|**2023-10-28**|**INCODE: Implicit Neural Conditioning with Prior Knowledge Embeddings**|Amirhossein Kazerouni et.al.|[2310.18846v1](http://arxiv.org/abs/2310.18846v1)|**[link](https://github.com/xmindflow/INCODE)**|\n", "2310.20710": "|**2023-10-31**|**FPO++: Efficient Encoding and Rendering of Dynamic Neural Radiance Fields by Analyzing and Enhancing Fourier PlenOctrees**|Saskia Rabich et.al.|[2310.20710v1](http://arxiv.org/abs/2310.20710v1)|null|\n", "2310.20685": "|**2023-10-31**|**NeRF Revisited: Fixing Quadrature Instability in Volume Rendering**|Mikaela Angelina Uy et.al.|[2310.20685v1](http://arxiv.org/abs/2310.20685v1)|null|\n", "2310.19464": "|**2023-10-30**|**Generative Neural Fields by Mixtures of Neural Implicit Functions**|Tackgeun You et.al.|[2310.19464v1](http://arxiv.org/abs/2310.19464v1)|null|\n", "2311.01065": "|**2023-11-02**|**Novel View Synthesis from a Single RGBD Image for Indoor Scenes**|Congrui Hetang et.al.|[2311.01065v1](http://arxiv.org/abs/2311.01065v1)|null|\n", "2311.01815": "|**2023-11-03**|**Estimating 3D Uncertainty Field: Quantifying Uncertainty for Neural Radiance Fields**|Jianxiong Shen et.al.|[2311.01815v1](http://arxiv.org/abs/2311.01815v1)|null|\n", "2311.01773": "|**2023-11-03**|**PDF: Point Diffusion Implicit Function for Large-scale Scene Neural Representation**|Yuhan Ding et.al.|[2311.01773v1](http://arxiv.org/abs/2311.01773v1)|null|\n", "2311.01659": "|**2023-11-03**|**Efficient Cloud Pipelines for Neural Radiance Fields**|Derek Jacoby et.al.|[2311.01659v1](http://arxiv.org/abs/2311.01659v1)|null|\n", "2311.03140": "|**2023-11-06**|**Animating NeRFs from Texture Space: A Framework for Pose-Dependent Rendering of Human Performances**|Paul Knoll et.al.|[2311.03140v1](http://arxiv.org/abs/2311.03140v1)|null|\n", "2311.02826": "|**2023-11-06**|**InstructPix2NeRF: Instructed 3D Portrait Editing from a Single Image**|Jianhui Li et.al.|[2311.02826v1](http://arxiv.org/abs/2311.02826v1)|**[link](https://github.com/mybabyyh/instructpix2nerf)**|\n", "2311.04154": "|**2023-11-07**|**High-fidelity 3D Reconstruction of Plants using Neural Radiance Field**|Kewei Hu et.al.|[2311.04154v1](http://arxiv.org/abs/2311.04154v1)|null|\n", "2311.03965": "|**2023-11-07**|**Fast Sun-aligned Outdoor Scene Relighting based on TensoRF**|Yeonjin Chang et.al.|[2311.03965v1](http://arxiv.org/abs/2311.03965v1)|null|\n", "2311.03784": "|**2023-11-08**|**UP-NeRF: Unconstrained Pose-Prior-Free Neural Radiance Fields**|Injae Kim et.al.|[2311.03784v2](http://arxiv.org/abs/2311.03784v2)|**[link](https://github.com/mlvlab/upnerf)**|\n", "2311.03484": "|**2023-11-06**|**Osprey: Multi-Session Autonomous Aerial Mapping with LiDAR-based SLAM and Next Best View Planning**|Rowan Border et.al.|[2311.03484v1](http://arxiv.org/abs/2311.03484v1)|null|\n", "2311.04400": "|**2023-11-08**|**LRM: Large Reconstruction Model for Single Image to 3D**|Yicong Hong et.al.|[2311.04400v1](http://arxiv.org/abs/2311.04400v1)|null|\n", "2311.04246": "|**2023-11-07**|**ADFactory: Automated Data Factory for Optical Flow Tasks**|Han Ling et.al.|[2311.04246v1](http://arxiv.org/abs/2311.04246v1)|null|\n", "2311.05521": "|**2023-11-09**|**BakedAvatar: Baking Neural Fields for Real-Time Head Avatar Synthesis**|Hao-Bin Duan et.al.|[2311.05521v1](http://arxiv.org/abs/2311.05521v1)|null|\n", "2311.05461": "|**2023-11-09**|**Control3D: Towards Controllable Text-to-3D Generation**|Yang Chen et.al.|[2311.05461v1](http://arxiv.org/abs/2311.05461v1)|null|\n", "2311.06214": "|**2023-11-10**|**Instant3D: Fast Text-to-3D with Sparse-View Generation and Large Reconstruction Model**|Jiahao Li et.al.|[2311.06214v1](http://arxiv.org/abs/2311.06214v1)|null|\n", "2311.05958": "|**2023-11-10**|**A Neural Height-Map Approach for the Binocular Photometric Stereo Problem**|Fotios Logothetis et.al.|[2311.05958v1](http://arxiv.org/abs/2311.05958v1)|null|\n"}}
\ No newline at end of file
diff --git a/docs/index.md b/docs/index.md
index cb2f68de0f..a228825da5 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -7,7 +7,7 @@ layout: default
 [![Stargazers][stars-shield]][stars-url]
 [![Issues][issues-shield]][issues-url]
 
-## Updated on 2023.11.13
+## Updated on 2023.11.14
 > Usage instructions: [here](./docs/README.md#usage)
 
 ## Kinematic Mapping
@@ -169,6 +169,10 @@ layout: default
 
 | Publish Date | Title | Authors | PDF | Code |
 |:---------|:-----------------------|:---------|:------|:------|
+|**2023-11-10**|**Automated Heterogeneous Low-Bit Quantization of Multi-Model Deep Learning Inference Pipeline**|Jayeeta Mondal et.al.|[2311.05870v1](http://arxiv.org/abs/2311.05870v1)|null|
+|**2023-11-10**|**Watermarking Vision-Language Pre-trained Models for Multi-modal Embedding as a Service**|Yuanmin Tang et.al.|[2311.05863v1](http://arxiv.org/abs/2311.05863v1)|**[link](https://github.com/Pter61/vlpmarker)**|
+|**2023-11-09**|**Cosmological parameter estimation with Genetic Algorithms**|Ricardo Medel-Esquivel et.al.|[2311.05699v1](http://arxiv.org/abs/2311.05699v1)|null|
+|**2023-11-09**|**Multi-Modal Gaze Following in Conversational Scenarios**|Yuqi Hou et.al.|[2311.05669v1](http://arxiv.org/abs/2311.05669v1)|null|
 |**2023-11-09**|**Object-centric Cross-modal Feature Distillation for Event-based Object Detection**|Lei Li et.al.|[2311.05494v1](http://arxiv.org/abs/2311.05494v1)|null|
 |**2023-11-09**|**3DStyle-Diffusion: Pursuing Fine-grained Text-driven 3D Stylization with 2D Diffusion Models**|Haibo Yang et.al.|[2311.05464v1](http://arxiv.org/abs/2311.05464v1)|**[link](https://github.com/yanghb22-fdu/3dstyle-diffusion-official)**|
 |**2023-11-09**|**ControlStyle: Text-Driven Stylized Image Generation Using Diffusion Priors**|Jingwen Chen et.al.|[2311.05463v1](http://arxiv.org/abs/2311.05463v1)|null|
@@ -2108,6 +2112,8 @@ layout: default
 
 | Publish Date | Title | Authors | PDF | Code |
 |:---------|:-----------------------|:---------|:------|:------|
+|**2023-11-10**|**Instant3D: Fast Text-to-3D with Sparse-View Generation and Large Reconstruction Model**|Jiahao Li et.al.|[2311.06214v1](http://arxiv.org/abs/2311.06214v1)|null|
+|**2023-11-10**|**A Neural Height-Map Approach for the Binocular Photometric Stereo Problem**|Fotios Logothetis et.al.|[2311.05958v1](http://arxiv.org/abs/2311.05958v1)|null|
 |**2023-11-09**|**BakedAvatar: Baking Neural Fields for Real-Time Head Avatar Synthesis**|Hao-Bin Duan et.al.|[2311.05521v1](http://arxiv.org/abs/2311.05521v1)|null|
 |**2023-11-09**|**Control3D: Towards Controllable Text-to-3D Generation**|Yang Chen et.al.|[2311.05461v1](http://arxiv.org/abs/2311.05461v1)|null|
 |**2023-11-08**|**LRM: Large Reconstruction Model for Single Image to 3D**|Yicong Hong et.al.|[2311.04400v1](http://arxiv.org/abs/2311.04400v1)|null|