diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8f8947468..a2949b51e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,13 @@
 
 ## Release 515 Entries
 
+### [515.76] 2022-09-20
+
+#### Fixed
+
+- Improved compatibility with new Linux kernel releases
+- Fixed possible excessive GPU power draw on an idle X11 or Wayland desktop when driving high resolutions or refresh rates
+
 ### [515.65.01] 2022-08-02
 
 #### Fixed
diff --git a/README.md b/README.md
index 7cf07ee67..e40cd4c41 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # NVIDIA Linux Open GPU Kernel Module Source
 
 This is the source release of the NVIDIA Linux open GPU kernel modules,
-version 515.65.01.
+version 515.76.
 
 
 ## How to Build
@@ -17,7 +17,7 @@ as root:
 
 Note that the kernel modules built here must be used with gsp.bin
 firmware and user-space NVIDIA GPU driver components from a corresponding
-515.65.01 driver release.  This can be achieved by installing
+515.76 driver release.  This can be achieved by installing
 the NVIDIA GPU driver from the .run file using the `--no-kernel-modules`
 option.  E.g.,
 
@@ -167,7 +167,7 @@ for the target kernel.
 ## Compatible GPUs
 
 The open-gpu-kernel-modules can be used on any Turing or later GPU
-(see the table below). However, in the 515.65.01 release,
+(see the table below). However, in the 515.76 release,
 GeForce and Workstation support is still considered alpha-quality.
 
 To enable use of the open kernel modules on GeForce and Workstation GPUs,
@@ -175,7 +175,7 @@ set the "NVreg_OpenRmEnableUnsupportedGpus" nvidia.ko kernel module
 parameter to 1. For more details, see the NVIDIA GPU driver end user
 README here:
 
-https://us.download.nvidia.com/XFree86/Linux-x86_64/515.65.01/README/kernel_open.html
+https://us.download.nvidia.com/XFree86/Linux-x86_64/515.76/README/kernel_open.html
 
 In the below table, if three IDs are listed, the first is the PCI Device 
 ID, the second is the PCI Subsystem Vendor ID, and the third is the PCI
@@ -643,6 +643,8 @@ Subsystem Device ID.
 | NVIDIA A100-PG509-200                           | 20B0 10DE 1450 |
 | NVIDIA A100-SXM4-80GB                           | 20B2 10DE 1463 |
 | NVIDIA A100-SXM4-80GB                           | 20B2 10DE 147F |
+| NVIDIA A100-SXM4-80GB                           | 20B2 10DE 1622 |
+| NVIDIA A100-SXM4-80GB                           | 20B2 10DE 1623 |
 | NVIDIA PG506-242                                | 20B3 10DE 14A7 |
 | NVIDIA PG506-243                                | 20B3 10DE 14A8 |
 | NVIDIA A100 80GB PCIe                           | 20B5 10DE 1533 |
@@ -743,6 +745,7 @@ Subsystem Device ID.
 | NVIDIA GeForce RTX 3050                         | 2507           |
 | NVIDIA GeForce RTX 3050 OEM                     | 2508           |
 | NVIDIA GeForce RTX 3060 Laptop GPU              | 2520           |
+| NVIDIA GeForce RTX 3060 Laptop GPU              | 2521           |
 | NVIDIA GeForce RTX 3050 Ti Laptop GPU           | 2523           |
 | NVIDIA RTX A2000                                | 2531 1028 151D |
 | NVIDIA RTX A2000                                | 2531 103C 151D |
diff --git a/kernel-open/Kbuild b/kernel-open/Kbuild
index 0b7482e12..42c14d686 100644
--- a/kernel-open/Kbuild
+++ b/kernel-open/Kbuild
@@ -72,7 +72,7 @@ EXTRA_CFLAGS += -I$(src)/common/inc
 EXTRA_CFLAGS += -I$(src)
 EXTRA_CFLAGS += -Wall -MD $(DEFINES) $(INCLUDES) -Wno-cast-qual -Wno-error -Wno-format-extra-args
 EXTRA_CFLAGS += -D__KERNEL__ -DMODULE -DNVRM
-EXTRA_CFLAGS += -DNV_VERSION_STRING=\"515.65.01\"
+EXTRA_CFLAGS += -DNV_VERSION_STRING=\"515.76\"
 
 EXTRA_CFLAGS += -Wno-unused-function
 
@@ -203,9 +203,108 @@ $(obj)/conftest/patches.h: $(NV_CONFTEST_SCRIPT)
 	@mkdir -p $(obj)/conftest
 	@$(NV_CONFTEST_CMD) patch_check > $@
 
-$(obj)/conftest/headers.h: $(NV_CONFTEST_SCRIPT)
-	@mkdir -p $(obj)/conftest
-	@$(NV_CONFTEST_CMD) test_kernel_headers '$(NV_CONFTEST_CFLAGS)' > $@
+
+# Each of these headers is checked for presence with a test #include; a
+# corresponding #define will be generated in conftest/headers.h.
+NV_HEADER_PRESENCE_TESTS = \
+ asm/system.h \
+ drm/drmP.h \
+ drm/drm_auth.h \
+ drm/drm_gem.h \
+ drm/drm_crtc.h \
+ drm/drm_atomic.h \
+ drm/drm_atomic_helper.h \
+ drm/drm_encoder.h \
+ drm/drm_atomic_uapi.h \
+ drm/drm_drv.h \
+ drm/drm_framebuffer.h \
+ drm/drm_connector.h \
+ drm/drm_probe_helper.h \
+ drm/drm_blend.h \
+ drm/drm_fourcc.h \
+ drm/drm_prime.h \
+ drm/drm_plane.h \
+ drm/drm_vblank.h \
+ drm/drm_file.h \
+ drm/drm_ioctl.h \
+ drm/drm_device.h \
+ drm/drm_mode_config.h \
+ dt-bindings/interconnect/tegra_icc_id.h \
+ generated/autoconf.h \
+ generated/compile.h \
+ generated/utsrelease.h \
+ linux/efi.h \
+ linux/kconfig.h \
+ linux/platform/tegra/mc_utils.h \
+ linux/semaphore.h \
+ linux/printk.h \
+ linux/ratelimit.h \
+ linux/prio_tree.h \
+ linux/log2.h \
+ linux/of.h \
+ linux/bug.h \
+ linux/sched/signal.h \
+ linux/sched/task.h \
+ linux/sched/task_stack.h \
+ xen/ioemu.h \
+ linux/fence.h \
+ linux/dma-resv.h \
+ soc/tegra/chip-id.h \
+ soc/tegra/fuse.h \
+ soc/tegra/tegra_bpmp.h \
+ video/nv_internal.h \
+ linux/platform/tegra/dce/dce-client-ipc.h \
+ linux/nvhost.h \
+ linux/nvhost_t194.h \
+ asm/book3s/64/hash-64k.h \
+ asm/set_memory.h \
+ asm/prom.h \
+ asm/powernv.h \
+ linux/atomic.h \
+ asm/barrier.h \
+ asm/opal-api.h \
+ sound/hdaudio.h \
+ asm/pgtable_types.h \
+ linux/stringhash.h \
+ linux/dma-map-ops.h \
+ rdma/peer_mem.h \
+ sound/hda_codec.h \
+ linux/dma-buf.h \
+ linux/time.h \
+ linux/platform_device.h \
+ linux/mutex.h \
+ linux/reset.h \
+ linux/of_platform.h \
+ linux/of_device.h \
+ linux/of_gpio.h \
+ linux/gpio.h \
+ linux/gpio/consumer.h \
+ linux/interconnect.h \
+ linux/pm_runtime.h \
+ linux/clk.h \
+ linux/clk-provider.h \
+ linux/ioasid.h \
+ linux/stdarg.h \
+ linux/iosys-map.h \
+ asm/coco.h
+
+# Filename to store the define for the header in $(1); this is only consumed by
+# the rule below that concatenates all of these together.
+NV_HEADER_PRESENCE_PART = $(addprefix $(obj)/conftest/header_presence/,$(addsuffix .part,$(1)))
+
+# Rule template for header $(1): store test_kernel_header's output in its .part file ($$ defers expansion to $(eval)).
+define NV_HEADER_PRESENCE_CHECK
+ $$(call NV_HEADER_PRESENCE_PART,$(1)): $$(NV_CONFTEST_SCRIPT) $(obj)/conftest/uts_release
+	@mkdir -p $$(dir $$@)
+	@$$(NV_CONFTEST_CMD) test_kernel_header '$$(NV_CONFTEST_CFLAGS)' '$(1)' > $$@
+endef
+
+# Evaluate the rule above for each header in the list.
+$(foreach header,$(NV_HEADER_PRESENCE_TESTS),$(eval $(call NV_HEADER_PRESENCE_CHECK,$(header))))
+
+# Concatenate all of the parts into headers.h.
+$(obj)/conftest/headers.h: $(call NV_HEADER_PRESENCE_PART,$(NV_HEADER_PRESENCE_TESTS))
+	@cat $^ > $@
 
 clean-dirs := $(obj)/conftest
 
diff --git a/kernel-open/common/inc/nv-linux.h b/kernel-open/common/inc/nv-linux.h
index eeba17df5..dcfac7d03 100644
--- a/kernel-open/common/inc/nv-linux.h
+++ b/kernel-open/common/inc/nv-linux.h
@@ -227,6 +227,7 @@ static inline uid_t __kuid_val(uid_t uid)
 #endif
 
 #include <linux/fb.h>               /* fb_info struct                   */
+#include <linux/screen_info.h>      /* screen_info                      */
 
 #if !defined(CONFIG_PCI)
 #warning "Attempting to build driver for a platform with no PCI support!"
diff --git a/kernel-open/common/inc/nv-pgprot.h b/kernel-open/common/inc/nv-pgprot.h
index b56d95611..581e97f3a 100644
--- a/kernel-open/common/inc/nv-pgprot.h
+++ b/kernel-open/common/inc/nv-pgprot.h
@@ -78,13 +78,8 @@ static inline pgprot_t pgprot_modify_writecombine(pgprot_t old_prot)
 
 #define NV_PGPROT_UNCACHED_DEVICE(old_prot)     pgprot_noncached(old_prot)
 #if defined(NVCPU_AARCH64)
-#if defined(NV_MT_DEVICE_GRE_PRESENT)
-#define NV_PROT_WRITE_COMBINED_DEVICE   (PROT_DEFAULT | PTE_PXN | PTE_UXN |   \
-                                         PTE_ATTRINDX(MT_DEVICE_GRE))
-#else
 #define NV_PROT_WRITE_COMBINED_DEVICE   (PROT_DEFAULT | PTE_PXN | PTE_UXN |   \
                                          PTE_ATTRINDX(MT_DEVICE_nGnRE))
-#endif
 #define NV_PGPROT_WRITE_COMBINED_DEVICE(old_prot)                             \
     __pgprot_modify(old_prot, PTE_ATTRINDX_MASK, NV_PROT_WRITE_COMBINED_DEVICE)
 #define NV_PGPROT_WRITE_COMBINED(old_prot)      NV_PGPROT_UNCACHED(old_prot)
diff --git a/kernel-open/common/inc/nv.h b/kernel-open/common/inc/nv.h
index 923967220..568dfdf13 100644
--- a/kernel-open/common/inc/nv.h
+++ b/kernel-open/common/inc/nv.h
@@ -624,27 +624,45 @@ typedef enum
 #define NV_GET_NV_STATE(pGpu) \
     (nv_state_t *)((pGpu) ? (pGpu)->pOsGpuInfo : NULL)
 
-#define IS_REG_OFFSET(nv, offset, length)                                       \
-    (((offset) >= (nv)->regs->cpu_address) &&                                   \
-    (((offset) + ((length)-1)) <=                                               \
-        (nv)->regs->cpu_address + ((nv)->regs->size-1)))
-
-#define IS_FB_OFFSET(nv, offset, length)                                        \
-    (((nv)->fb) && ((offset) >= (nv)->fb->cpu_address) &&                       \
-    (((offset) + ((length)-1)) <= (nv)->fb->cpu_address + ((nv)->fb->size-1)))
-
-#define IS_UD_OFFSET(nv, offset, length)                                        \
-    (((nv)->ud.cpu_address != 0) && ((nv)->ud.size != 0) &&                     \
-    ((offset) >= (nv)->ud.cpu_address) &&                                       \
-    (((offset) + ((length)-1)) <= (nv)->ud.cpu_address + ((nv)->ud.size-1)))
-
-#define IS_IMEM_OFFSET(nv, offset, length)                                      \
-    (((nv)->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address != 0) &&                    \
-     ((nv)->bars[NV_GPU_BAR_INDEX_IMEM].size != 0) &&                           \
-     ((offset) >= (nv)->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address) &&             \
-     (((offset) + ((length) - 1)) <=                                            \
-        (nv)->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address +                         \
-            ((nv)->bars[NV_GPU_BAR_INDEX_IMEM].size - 1)))
+/* Return true if [offset, offset + length - 1] lies within nv->regs; a range
+ * whose last byte wraps past the end of NvU64 is rejected (length must be > 0). */
+static inline NvBool IS_REG_OFFSET(nv_state_t *nv, NvU64 offset, NvU64 length)
+{
+    NvU64 last = offset + (length - 1);
+    return ((offset >= nv->regs->cpu_address) && (last >= offset) &&
+            (last <= (nv->regs->cpu_address + (nv->regs->size - 1))));
+}
+
+/* Return true if nv->fb is set and fully contains [offset, offset + length - 1];
+ * a range whose last byte wraps past the end of NvU64 is rejected. */
+static inline NvBool IS_FB_OFFSET(nv_state_t *nv, NvU64 offset, NvU64 length)
+{
+    NvU64 last = offset + (length - 1);
+    return ((nv->fb) && (offset >= nv->fb->cpu_address) && (last >= offset) &&
+            (last <= (nv->fb->cpu_address + (nv->fb->size - 1))));
+}
+
+/* Return true if nv->ud has a nonzero address and size and fully contains
+ * [offset, offset + length - 1]; ranges that wrap around NvU64 are rejected. */
+static inline NvBool IS_UD_OFFSET(nv_state_t *nv, NvU64 offset, NvU64 length)
+{
+    NvU64 last = offset + (length - 1);
+    return ((nv->ud.cpu_address != 0) && (nv->ud.size != 0) &&
+            (offset >= nv->ud.cpu_address) && (last >= offset) &&
+            (last <= (nv->ud.cpu_address + (nv->ud.size - 1))));
+}
+
+/* Return true if the IMEM BAR entry has a nonzero address and size and fully
+ * contains [offset, offset + length - 1]; NvU64 wraparound is rejected. */
+static inline NvBool IS_IMEM_OFFSET(nv_state_t *nv, NvU64 offset, NvU64 length)
+{
+    NvU64 last = offset + (length - 1);
+    return ((nv->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address != 0) &&
+            (nv->bars[NV_GPU_BAR_INDEX_IMEM].size != 0) &&
+            (offset >= nv->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address) && (last >= offset) &&
+            (last <= (nv->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address +
+                      (nv->bars[NV_GPU_BAR_INDEX_IMEM].size - 1))));
+}
 
 #define NV_RM_MAX_MSIX_LINES  8
 
diff --git a/kernel-open/conftest.sh b/kernel-open/conftest.sh
index ee19e0374..5ec66bf37 100755
--- a/kernel-open/conftest.sh
+++ b/kernel-open/conftest.sh
@@ -55,9 +55,13 @@ append_conftest() {
     done
 }
 
-translate_and_preprocess_header_files() {
-    # Inputs:
-    #   $1: list of relative file paths
+test_header_presence() {
+    #
+    # Determine if the given header file (which may or may not be
+    # present) is provided by the target kernel.
+    #
+    # Input:
+    #   $1: relative file path
     #
     # This routine creates an upper case, underscore version of each of the
     # relative file paths, and uses that as the token to either define or
@@ -73,115 +77,25 @@ translate_and_preprocess_header_files() {
     # strings, without special handling of the beginning or the end of the line.
     TEST_CFLAGS=`echo "-E -M $CFLAGS " | sed -e 's/\( -M[DG]\)* / /g'`
 
-    for file in "$@"; do
-        file_define=NV_`echo $file | tr '/.' '_' | tr '-' '_' | tr 'a-z' 'A-Z'`_PRESENT
+    file="$1"
+    file_define=NV_`echo $file | tr '/.' '_' | tr '-' '_' | tr 'a-z' 'A-Z'`_PRESENT
 
-        CODE="#include <$file>"
+    CODE="#include <$file>"
 
-        if echo "$CODE" | $CC $TEST_CFLAGS - > /dev/null 2>&1; then
-            echo "#define $file_define"
+    if echo "$CODE" | $CC $TEST_CFLAGS - > /dev/null 2>&1; then
+        echo "#define $file_define"
+    else
+        # If preprocessing failed, it could have been because the header
+        # file under test is not present, or because it is present but
+        # depends upon the inclusion of other header files. Attempting
+        # preprocessing again with -MG will ignore a missing header file
+        # but will still fail if the header file is present.
+        if echo "$CODE" | $CC $TEST_CFLAGS -MG - > /dev/null 2>&1; then
+            echo "#undef $file_define"
         else
-            # If preprocessing failed, it could have been because the header
-            # file under test is not present, or because it is present but
-            # depends upon the inclusion of other header files. Attempting
-            # preprocessing again with -MG will ignore a missing header file
-            # but will still fail if the header file is present.
-            if echo "$CODE" | $CC $TEST_CFLAGS -MG - > /dev/null 2>&1; then
-                echo "#undef $file_define"
-            else
-                echo "#define $file_define"
-            fi
+            echo "#define $file_define"
         fi
-    done
-}
-
-test_headers() {
-    #
-    # Determine which header files (of a set that may or may not be
-    # present) are provided by the target kernel.
-    #
-    FILES="asm/system.h"
-    FILES="$FILES drm/drmP.h"
-    FILES="$FILES drm/drm_auth.h"
-    FILES="$FILES drm/drm_gem.h"
-    FILES="$FILES drm/drm_crtc.h"
-    FILES="$FILES drm/drm_atomic.h"
-    FILES="$FILES drm/drm_atomic_helper.h"
-    FILES="$FILES drm/drm_encoder.h"
-    FILES="$FILES drm/drm_atomic_uapi.h"
-    FILES="$FILES drm/drm_drv.h"
-    FILES="$FILES drm/drm_framebuffer.h"
-    FILES="$FILES drm/drm_connector.h"
-    FILES="$FILES drm/drm_probe_helper.h"
-    FILES="$FILES drm/drm_blend.h"
-    FILES="$FILES drm/drm_fourcc.h"
-    FILES="$FILES drm/drm_prime.h"
-    FILES="$FILES drm/drm_plane.h"
-    FILES="$FILES drm/drm_vblank.h"
-    FILES="$FILES drm/drm_file.h"
-    FILES="$FILES drm/drm_ioctl.h"
-    FILES="$FILES drm/drm_device.h"
-    FILES="$FILES drm/drm_mode_config.h"
-    FILES="$FILES dt-bindings/interconnect/tegra_icc_id.h"
-    FILES="$FILES generated/autoconf.h"
-    FILES="$FILES generated/compile.h"
-    FILES="$FILES generated/utsrelease.h"
-    FILES="$FILES linux/efi.h"
-    FILES="$FILES linux/kconfig.h"
-    FILES="$FILES linux/platform/tegra/mc_utils.h"
-    FILES="$FILES linux/semaphore.h"
-    FILES="$FILES linux/printk.h"
-    FILES="$FILES linux/ratelimit.h"
-    FILES="$FILES linux/prio_tree.h"
-    FILES="$FILES linux/log2.h"
-    FILES="$FILES linux/of.h"
-    FILES="$FILES linux/bug.h"
-    FILES="$FILES linux/sched/signal.h"
-    FILES="$FILES linux/sched/task.h"
-    FILES="$FILES linux/sched/task_stack.h"
-    FILES="$FILES xen/ioemu.h"
-    FILES="$FILES linux/fence.h"
-    FILES="$FILES linux/dma-resv.h"
-    FILES="$FILES soc/tegra/chip-id.h"
-    FILES="$FILES soc/tegra/fuse.h"
-    FILES="$FILES soc/tegra/tegra_bpmp.h"
-    FILES="$FILES video/nv_internal.h"
-    FILES="$FILES linux/platform/tegra/dce/dce-client-ipc.h"
-    FILES="$FILES linux/nvhost.h"
-    FILES="$FILES linux/nvhost_t194.h"
-    FILES="$FILES asm/book3s/64/hash-64k.h"
-    FILES="$FILES asm/set_memory.h"
-    FILES="$FILES asm/prom.h"
-    FILES="$FILES asm/powernv.h"
-    FILES="$FILES linux/atomic.h"
-    FILES="$FILES asm/barrier.h"
-    FILES="$FILES asm/opal-api.h"
-    FILES="$FILES sound/hdaudio.h"
-    FILES="$FILES asm/pgtable_types.h"
-    FILES="$FILES linux/stringhash.h"
-    FILES="$FILES linux/dma-map-ops.h"
-    FILES="$FILES rdma/peer_mem.h"
-    FILES="$FILES sound/hda_codec.h"
-    FILES="$FILES linux/dma-buf.h"
-    FILES="$FILES linux/time.h"
-    FILES="$FILES linux/platform_device.h"
-    FILES="$FILES linux/mutex.h"
-    FILES="$FILES linux/reset.h"
-    FILES="$FILES linux/of_platform.h"
-    FILES="$FILES linux/of_device.h"
-    FILES="$FILES linux/of_gpio.h"
-    FILES="$FILES linux/gpio.h"
-    FILES="$FILES linux/gpio/consumer.h"
-    FILES="$FILES linux/interconnect.h"
-    FILES="$FILES linux/pm_runtime.h"
-    FILES="$FILES linux/clk.h"
-    FILES="$FILES linux/clk-provider.h"
-    FILES="$FILES linux/ioasid.h"
-    FILES="$FILES linux/stdarg.h"
-    FILES="$FILES linux/iosys-map.h"
-    FILES="$FILES asm/coco.h"
-
-    translate_and_preprocess_header_files $FILES
+    fi
 }
 
 build_cflags() {
@@ -2420,23 +2334,6 @@ compile_test() {
             compile_check_conftest "$CODE" "NV_PCI_DEV_HAS_ATS_ENABLED" "" "types"
         ;;
 
-        mt_device_gre)
-            #
-            # Determine if MT_DEVICE_GRE flag is present.
-            #
-            # MT_DEVICE_GRE flag is removed by commit 58cc6b72a21274
-            # ("arm64: mm: Remove unused support for Device-GRE memory type") in v5.14-rc1
-            # (2021-06-01).
-            #
-            CODE="
-            #include <asm/memory.h>
-            unsigned int conftest_mt_device_gre(void) {
-                return MT_DEVICE_GRE;
-            }"
-
-            compile_check_conftest "$CODE" "NV_MT_DEVICE_GRE_PRESENT" "" "types"
-        ;;
-
         get_user_pages)
             #
             # Conftest for get_user_pages()
@@ -5366,6 +5263,23 @@ compile_test() {
             compile_check_conftest "$CODE" "NV_GET_TASK_IOPRIO_PRESENT" "" "functions"
         ;;
 
+        num_registered_fb)
+            #
+            # Determine if 'num_registered_fb' variable is present.
+            #
+            # 'num_registered_fb' was removed by commit 5727dcfd8486
+            # ("fbdev: Make registered_fb[] private to fbmem.c") for
+            # v5.20 linux-next (2022-07-27).
+            #
+            CODE="
+            #include <linux/fb.h>
+            int conftest_num_registered_fb(void) {
+                return num_registered_fb;
+            }"
+
+            compile_check_conftest "$CODE" "NV_NUM_REGISTERED_FB_PRESENT" "" "types"
+        ;;
+
         # When adding a new conftest entry, please use the correct format for
         # specifying the relevant upstream Linux kernel commit.
         #
@@ -5764,14 +5678,14 @@ case "$5" in
     ;;
 
 
-    test_kernel_headers)
+    test_kernel_header)
         #
-        # Check for the availability of certain kernel headers
+        # Check for the availability of the given kernel header
         #
 
         CFLAGS=$6
 
-        test_headers
+        test_header_presence "${7}"
 
         for file in conftest*.d; do
             rm -f $file > /dev/null 2>&1
diff --git a/kernel-open/nvidia-drm/nvidia-drm-helper.c b/kernel-open/nvidia-drm/nvidia-drm-helper.c
index 3831180e0..8fc862068 100644
--- a/kernel-open/nvidia-drm/nvidia-drm-helper.c
+++ b/kernel-open/nvidia-drm/nvidia-drm-helper.c
@@ -41,6 +41,19 @@
 #include <drm/drm_atomic_uapi.h>
 #endif
 
+/*
+ * The inclusion of drm_framebuffer.h was removed from drm_crtc.h by commit
+ * 720cf96d8fecde29b72e1101f8a567a0ce99594f ("drm: Drop drm_framebuffer.h from
+ * drm_crtc.h") in linux-next, expected in v5.19-rc7.
+ *
+ * We only need drm_framebuffer.h for drm_framebuffer_put(), and it is always
+ * present (v4.9+) when drm_framebuffer_{put,get}() is present (v4.12+), so it
+ * is safe to unconditionally include it when drm_framebuffer_get() is present.
+ */
+#if defined(NV_DRM_FRAMEBUFFER_GET_PRESENT)
+#include <drm/drm_framebuffer.h>
+#endif
+
 static void __nv_drm_framebuffer_put(struct drm_framebuffer *fb)
 {
 #if defined(NV_DRM_FRAMEBUFFER_GET_PRESENT)
diff --git a/kernel-open/nvidia-modeset/nvidia-modeset-linux.c b/kernel-open/nvidia-modeset/nvidia-modeset-linux.c
index 8440b3c0c..383af3de7 100644
--- a/kernel-open/nvidia-modeset/nvidia-modeset-linux.c
+++ b/kernel-open/nvidia-modeset/nvidia-modeset-linux.c
@@ -59,6 +59,9 @@
 
 #define NVKMS_LOG_PREFIX "nvidia-modeset: "
 
+static bool output_rounding_fix = false;
+module_param_named(output_rounding_fix, output_rounding_fix, bool, 0400);
+
 /* These parameters are used for fault injection tests.  Normally the defaults
  * should be used. */
 MODULE_PARM_DESC(fail_malloc, "Fail the Nth call to nvkms_alloc");
@@ -71,6 +74,10 @@ module_param_named(malloc_verbose, malloc_verbose, bool, 0400);
 
 static atomic_t nvkms_alloc_called_count;
 
+NvBool nvkms_output_rounding_fix(void)
+{
+    return output_rounding_fix;
+}
 
 #define NVKMS_SYNCPT_STUBS_NEEDED
 
diff --git a/kernel-open/nvidia-modeset/nvidia-modeset-os-interface.h b/kernel-open/nvidia-modeset/nvidia-modeset-os-interface.h
index 36685a026..8036811f9 100644
--- a/kernel-open/nvidia-modeset/nvidia-modeset-os-interface.h
+++ b/kernel-open/nvidia-modeset/nvidia-modeset-os-interface.h
@@ -110,6 +110,7 @@ typedef struct {
     } set_maxval;
 } NvKmsSyncPtOpParams;
 
+NvBool nvkms_output_rounding_fix(void);
 
 void   nvkms_call_rm    (void *ops);
 void*  nvkms_alloc      (size_t size,
diff --git a/kernel-open/nvidia-uvm/uvm_channel.c b/kernel-open/nvidia-uvm/uvm_channel.c
index 5eaf3f14e..b1b2177f8 100644
--- a/kernel-open/nvidia-uvm/uvm_channel.c
+++ b/kernel-open/nvidia-uvm/uvm_channel.c
@@ -35,10 +35,6 @@
 #include "nv_uvm_interface.h"
 #include "clb06f.h"
 
-#define UVM_CHANNEL_NUM_GPFIFO_ENTRIES_DEFAULT 1024
-#define UVM_CHANNEL_NUM_GPFIFO_ENTRIES_MIN 32
-#define UVM_CHANNEL_NUM_GPFIFO_ENTRIES_MAX (1024 * 1024)
-
 static unsigned uvm_channel_num_gpfifo_entries = UVM_CHANNEL_NUM_GPFIFO_ENTRIES_DEFAULT;
 
 #define UVM_CHANNEL_GPFIFO_LOC_DEFAULT "auto"
@@ -86,6 +82,12 @@ static NvU32 uvm_channel_update_progress_with_max(uvm_channel_t *channel,
 
     uvm_spin_lock(&channel->pool->lock);
 
+    // Completed value should never exceed the queued value
+    UVM_ASSERT_MSG_RELEASE(completed_value <= channel->tracking_sem.queued_value,
+                           "GPU %s channel %s unexpected completed_value 0x%llx > queued_value 0x%llx\n",
+                           channel->pool->manager->gpu->parent->name, channel->name, completed_value,
+                           channel->tracking_sem.queued_value);
+
     cpu_put = channel->cpu_put;
     gpu_get = channel->gpu_get;
 
@@ -395,6 +397,14 @@ static void uvm_channel_semaphore_release(uvm_push_t *push, NvU64 semaphore_va,
 {
     uvm_gpu_t *gpu = uvm_push_get_gpu(push);
 
+    // We used to skip the membar or use membar GPU for the semaphore release
+    // for a few pushes, but that doesn't provide sufficient ordering guarantees
+    // in some cases (e.g. ga100 with an LCE with PCEs from both HSHUBs) for the
+    // semaphore writes. To be safe, just always use a membar sys for now.
+    // TODO bug 3770539: Optimize membars used by end of push semaphore releases
+    (void)uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_GPU);
+    (void)uvm_push_get_and_reset_flag(push, UVM_PUSH_FLAG_NEXT_MEMBAR_NONE);
+
     if (uvm_channel_is_ce(push->channel))
         gpu->parent->ce_hal->semaphore_release(push, semaphore_va, new_payload);
 
@@ -1562,6 +1572,7 @@ static void uvm_channel_print_info(uvm_channel_t *channel, struct seq_file *s)
     UVM_SEQ_OR_DBG_PRINT(s, "get                %u\n", channel->gpu_get);
     UVM_SEQ_OR_DBG_PRINT(s, "put                %u\n", channel->cpu_put);
     UVM_SEQ_OR_DBG_PRINT(s, "Semaphore GPU VA   0x%llx\n", uvm_channel_tracking_semaphore_get_gpu_va(channel));
+    UVM_SEQ_OR_DBG_PRINT(s, "Semaphore CPU VA   0x%llx\n", (NvU64)(uintptr_t)channel->tracking_sem.semaphore.payload);
 
     uvm_spin_unlock(&channel->pool->lock);
 }
diff --git a/kernel-open/nvidia-uvm/uvm_channel.h b/kernel-open/nvidia-uvm/uvm_channel.h
index 7e5add260..fadc4b3c1 100644
--- a/kernel-open/nvidia-uvm/uvm_channel.h
+++ b/kernel-open/nvidia-uvm/uvm_channel.h
@@ -46,6 +46,21 @@
 // wait for a GPFIFO entry to free up.
 //
 
+#define UVM_CHANNEL_NUM_GPFIFO_ENTRIES_DEFAULT 1024
+#define UVM_CHANNEL_NUM_GPFIFO_ENTRIES_MIN 32
+#define UVM_CHANNEL_NUM_GPFIFO_ENTRIES_MAX (1024 * 1024)
+
+// Semaphore payloads cannot advance too much between calls to
+// uvm_gpu_tracking_semaphore_update_completed_value(). In practice the jumps
+// are bound by gpfifo sizing as we have to update the completed value to
+// reclaim gpfifo entries. Set a limit based on the max gpfifo entries we could
+// ever see.
+//
+// Logically this define belongs to uvm_gpu_semaphore.h but it depends on the
+// channel GPFIFO sizing defined here so it's easiest to just have it here as
+// uvm_channel.h includes uvm_gpu_semaphore.h.
+#define UVM_GPU_SEMAPHORE_MAX_JUMP (2 * UVM_CHANNEL_NUM_GPFIFO_ENTRIES_MAX)
+
 // Channel types
 typedef enum
 {
diff --git a/kernel-open/nvidia-uvm/uvm_channel_test.c b/kernel-open/nvidia-uvm/uvm_channel_test.c
index c7f31d059..2b7d78359 100644
--- a/kernel-open/nvidia-uvm/uvm_channel_test.c
+++ b/kernel-open/nvidia-uvm/uvm_channel_test.c
@@ -151,6 +151,37 @@ static NV_STATUS test_ordering(uvm_va_space_t *va_space)
     return status;
 }
 
+// For each GPU: set channel 0's tracking semaphore payload to completed + 1 and expect NV_ERR_INVALID_STATE to be raised.
+static NV_STATUS test_unexpected_completed_values(uvm_va_space_t *va_space)
+{
+    NV_STATUS status; // NOTE(review): not referenced directly below; confirm the TEST_* macros need it
+    uvm_gpu_t *gpu;
+
+    for_each_va_space_gpu(gpu, va_space) {
+        uvm_channel_t *channel;
+        NvU64 completed_value;
+
+        // The GPU channel manager is destroyed and then re-created after
+        // the test, so this test requires exclusive access to the GPU.
+        TEST_CHECK_RET(uvm_gpu_retained_count(gpu) == 1);
+
+        channel = &gpu->channel_manager->channel_pools[0].channels[0];
+        completed_value = uvm_channel_update_completed_value(channel);
+        uvm_gpu_semaphore_set_payload(&channel->tracking_sem.semaphore, (NvU32)completed_value + 1);
+
+        TEST_CHECK_RET(uvm_global_get_status() == NV_OK);
+        uvm_channel_update_progress_all(channel);
+        TEST_CHECK_RET(uvm_global_reset_fatal_error() == NV_ERR_INVALID_STATE);
+
+        uvm_channel_manager_destroy(gpu->channel_manager);
+        // Destruction will hit the error again, so clear one more time.
+        uvm_global_reset_fatal_error();
+
+        TEST_NV_CHECK_RET(uvm_channel_manager_create(gpu, &gpu->channel_manager));
+    }
+    return NV_OK;
+}
+
 static NV_STATUS uvm_test_rc_for_gpu(uvm_gpu_t *gpu)
 {
     uvm_push_t push;
@@ -712,6 +743,14 @@ NV_STATUS uvm_test_channel_sanity(UVM_TEST_CHANNEL_SANITY_PARAMS *params, struct
 
 
 
+    g_uvm_global.disable_fatal_error_assert = true;
+    uvm_release_asserts_set_global_error_for_tests = true;
+    status = test_unexpected_completed_values(va_space);
+    uvm_release_asserts_set_global_error_for_tests = false;
+    g_uvm_global.disable_fatal_error_assert = false;
+    if (status != NV_OK)
+        goto done;
+
     if (g_uvm_global.num_simulated_devices == 0) {
         status = test_rc(va_space);
         if (status != NV_OK)
diff --git a/kernel-open/nvidia-uvm/uvm_common.c b/kernel-open/nvidia-uvm/uvm_common.c
index f46761eb5..2e38472ef 100644
--- a/kernel-open/nvidia-uvm/uvm_common.c
+++ b/kernel-open/nvidia-uvm/uvm_common.c
@@ -48,6 +48,33 @@ module_param(uvm_enable_builtin_tests, int, S_IRUGO);
 MODULE_PARM_DESC(uvm_enable_builtin_tests,
                  "Enable the UVM built-in tests. (This is a security risk)");
 
+// Default to release asserts being enabled.
+int uvm_release_asserts __read_mostly = 1;
+
+// Make the module param writable so that release asserts can be enabled or
+// disabled at any time by modifying the module parameter.
+module_param(uvm_release_asserts, int, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(uvm_release_asserts, "Enable uvm asserts included in release builds.");
+
+// Default to failed release asserts not dumping stack.
+int uvm_release_asserts_dump_stack __read_mostly = 0;
+
+// Make the module param writable so that dumping the stack can be enabled and
+// disabled at any time by modifying the module parameter.
+module_param(uvm_release_asserts_dump_stack, int, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(uvm_release_asserts_dump_stack, "dump_stack() on failed UVM release asserts.");
+
+// Default to failed release asserts not setting the global UVM error.
+int uvm_release_asserts_set_global_error __read_mostly = 0;
+
+// Make the module param writable so that setting the global fatal error can be
+// enabled and disabled at any time by modifying the module parameter.
+module_param(uvm_release_asserts_set_global_error, int, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(uvm_release_asserts_set_global_error, "Set UVM global fatal error on failed release asserts.");
+
+// A separate flag to enable setting global error, to be used by tests only.
+bool uvm_release_asserts_set_global_error_for_tests __read_mostly = false;
+
 //
 // Convert kernel errno codes to corresponding NV_STATUS
 //
diff --git a/kernel-open/nvidia-uvm/uvm_common.h b/kernel-open/nvidia-uvm/uvm_common.h
index 1b93e2303..f5e320b41 100644
--- a/kernel-open/nvidia-uvm/uvm_common.h
+++ b/kernel-open/nvidia-uvm/uvm_common.h
@@ -80,6 +80,9 @@ bool uvm_debug_prints_enabled(void);
 #define UVM_ASSERT_PRINT(fmt, ...) \
     UVM_PRINT_FUNC_PREFIX(printk, KERN_ERR NVIDIA_UVM_PRETTY_PRINTING_PREFIX, " " fmt, ##__VA_ARGS__)
 
+#define UVM_ASSERT_PRINT_RL(fmt, ...) \
+    UVM_PRINT_FUNC_PREFIX(printk_ratelimited, KERN_ERR NVIDIA_UVM_PRETTY_PRINTING_PREFIX, " " fmt, ##__VA_ARGS__)
+
 #define UVM_ERR_PRINT(fmt, ...) \
     UVM_PRINT_FUNC_PREFIX_CHECK(printk, KERN_ERR NVIDIA_UVM_PRETTY_PRINTING_PREFIX, " " fmt, ##__VA_ARGS__)
 
@@ -146,9 +149,7 @@ void on_uvm_test_fail(void);
 // Unlike on_uvm_test_fail it provides 'panic' coverity semantics
 void on_uvm_assert(void);
 
-// UVM_ASSERT_RELEASE and UVM_ASSERT_MSG_RELEASE are always enabled, even on
-// release builds.
-#define _UVM_ASSERT_MSG_RELEASE(expr, cond, fmt, ...)                                           \
+#define _UVM_ASSERT_MSG(expr, cond, fmt, ...)                                                   \
     do {                                                                                        \
         if (unlikely(!(expr))) {                                                                \
             UVM_ASSERT_PRINT("Assert failed, condition %s not true" fmt, cond, ##__VA_ARGS__);  \
@@ -157,9 +158,6 @@ void on_uvm_assert(void);
         }                                                                                       \
     } while (0)
 
-#define UVM_ASSERT_MSG_RELEASE(expr, fmt, ...)  _UVM_ASSERT_MSG_RELEASE(expr, #expr, ": " fmt, ##__VA_ARGS__)
-#define UVM_ASSERT_RELEASE(expr)                _UVM_ASSERT_MSG_RELEASE(expr, #expr, "\n")
-
 // Prevent function calls in expr and the print argument list from being
 // evaluated.
 #define UVM_ASSERT_MSG_IGNORE(expr, fmt, ...)   \
@@ -170,13 +168,42 @@ void on_uvm_assert(void);
 
 // UVM_ASSERT and UVM_ASSERT_MSG are only enabled on non-release and Coverity builds
 #if UVM_IS_DEBUG() || defined __COVERITY__
-    #define UVM_ASSERT_MSG                  UVM_ASSERT_MSG_RELEASE
-    #define UVM_ASSERT                      UVM_ASSERT_RELEASE
+    #define UVM_ASSERT_MSG(expr, fmt, ...)  _UVM_ASSERT_MSG(expr, #expr, ": " fmt, ##__VA_ARGS__)
+    #define UVM_ASSERT(expr)                _UVM_ASSERT_MSG(expr, #expr, "\n")
 #else
     #define UVM_ASSERT_MSG(expr, fmt, ...)  UVM_ASSERT_MSG_IGNORE(expr, fmt, ##__VA_ARGS__)
     #define UVM_ASSERT(expr)                UVM_ASSERT_MSG_IGNORE(expr, "\n")
 #endif
 
+// UVM_ASSERT_RELEASE and UVM_ASSERT_MSG_RELEASE are always included in the
+// build, even on release builds. They are skipped at runtime if
+// uvm_release_asserts is 0.
+
+// Whether release asserts are enabled and whether they should dump the stack
+// and set the global error.
+extern int uvm_release_asserts;
+extern int uvm_release_asserts_dump_stack;
+extern int uvm_release_asserts_set_global_error;
+extern bool uvm_release_asserts_set_global_error_for_tests;
+
+// Given these are enabled for release builds, we need to be more cautious than
+// in UVM_ASSERT(). Use a ratelimited print and only dump the stack if a module
+// param is enabled.
+#define _UVM_ASSERT_MSG_RELEASE(expr, cond, fmt, ...)                                                   \
+    do {                                                                                                \
+        if (uvm_release_asserts && unlikely(!(expr))) {                                                 \
+            UVM_ASSERT_PRINT_RL("Assert failed, condition %s not true" fmt, cond, ##__VA_ARGS__);       \
+            if (uvm_release_asserts_set_global_error || uvm_release_asserts_set_global_error_for_tests) \
+                uvm_global_set_fatal_error(NV_ERR_INVALID_STATE);                                       \
+            if (uvm_release_asserts_dump_stack)                                                         \
+                dump_stack();                                                                           \
+            on_uvm_assert();                                                                            \
+        }                                                                                               \
+    } while (0)
+
+#define UVM_ASSERT_MSG_RELEASE(expr, fmt, ...)  _UVM_ASSERT_MSG_RELEASE(expr, #expr, ": " fmt, ##__VA_ARGS__)
+#define UVM_ASSERT_RELEASE(expr)                _UVM_ASSERT_MSG_RELEASE(expr, #expr, "\n")
+
 // Provide a short form of UUID's, typically for use in debug printing:
 #define ABBREV_UUID(uuid) (unsigned)(uuid)
 
diff --git a/kernel-open/nvidia-uvm/uvm_gpu_semaphore.c b/kernel-open/nvidia-uvm/uvm_gpu_semaphore.c
index f649de09a..9ae32eeb3 100644
--- a/kernel-open/nvidia-uvm/uvm_gpu_semaphore.c
+++ b/kernel-open/nvidia-uvm/uvm_gpu_semaphore.c
@@ -25,6 +25,7 @@
 #include "uvm_lock.h"
 #include "uvm_global.h"
 #include "uvm_kvmalloc.h"
+#include "uvm_channel.h" // For UVM_GPU_SEMAPHORE_MAX_JUMP
 
 #define UVM_SEMAPHORE_SIZE 4
 #define UVM_SEMAPHORE_PAGE_SIZE PAGE_SIZE
@@ -467,9 +468,16 @@ static NvU64 update_completed_value_locked(uvm_gpu_tracking_semaphore_t *trackin
     // push, it's easily guaranteed because of the small number of GPFIFO
     // entries available per channel (there could be at most as many pending
     // pushes as GPFIFO entries).
-    if (new_sem_value < old_sem_value)
+    if (unlikely(new_sem_value < old_sem_value))
         new_value += 1ULL << 32;
 
+    // Check for unexpected large jumps of the semaphore value
+    UVM_ASSERT_MSG_RELEASE(new_value - old_value <= UVM_GPU_SEMAPHORE_MAX_JUMP,
+                           "GPU %s unexpected semaphore (CPU VA 0x%llx) jump from 0x%llx to 0x%llx\n",
+                           tracking_semaphore->semaphore.page->pool->gpu->parent->name,
+                           (NvU64)(uintptr_t)tracking_semaphore->semaphore.payload,
+                           old_value, new_value);
+
     // Use an atomic write even though the spinlock is held so that the value can
     // be (carefully) read atomically outside of the lock.
     //
diff --git a/kernel-open/nvidia-uvm/uvm_gpu_semaphore_test.c b/kernel-open/nvidia-uvm/uvm_gpu_semaphore_test.c
index 220d0a46b..b8a8d2874 100644
--- a/kernel-open/nvidia-uvm/uvm_gpu_semaphore_test.c
+++ b/kernel-open/nvidia-uvm/uvm_gpu_semaphore_test.c
@@ -27,6 +27,18 @@
 #include "uvm_va_space.h"
 #include "uvm_kvmalloc.h"
 
+static NV_STATUS set_and_test(uvm_gpu_tracking_semaphore_t *tracking_sem, NvU64 new_value)
+{
+    uvm_gpu_semaphore_set_payload(&tracking_sem->semaphore, (NvU32)new_value);
+    TEST_CHECK_RET(uvm_gpu_tracking_semaphore_update_completed_value(tracking_sem) == new_value);
+    TEST_CHECK_RET(uvm_gpu_tracking_semaphore_is_value_completed(tracking_sem, new_value));
+    TEST_CHECK_RET(uvm_gpu_tracking_semaphore_is_value_completed(tracking_sem, new_value - 1));
+    TEST_CHECK_RET(!uvm_gpu_tracking_semaphore_is_value_completed(tracking_sem, new_value + 1));
+    TEST_CHECK_RET(uvm_gpu_tracking_semaphore_is_completed(tracking_sem));
+
+    return NV_OK;
+}
+
 static NV_STATUS add_and_test(uvm_gpu_tracking_semaphore_t *tracking_sem, NvU32 increment_by)
 {
     NvU64 new_value;
@@ -43,13 +55,45 @@ static NV_STATUS add_and_test(uvm_gpu_tracking_semaphore_t *tracking_sem, NvU32
     TEST_CHECK_RET(!uvm_gpu_tracking_semaphore_is_value_completed(tracking_sem, new_value));
     TEST_CHECK_RET(!uvm_gpu_tracking_semaphore_is_completed(tracking_sem));
 
-    uvm_gpu_semaphore_set_payload(&tracking_sem->semaphore, (NvU32)new_value);
-    TEST_CHECK_RET(uvm_gpu_tracking_semaphore_update_completed_value(tracking_sem) == new_value);
+    TEST_NV_CHECK_RET(set_and_test(tracking_sem, new_value));
     TEST_CHECK_RET(uvm_gpu_tracking_semaphore_is_value_completed(tracking_sem, completed));
-    TEST_CHECK_RET(uvm_gpu_tracking_semaphore_is_value_completed(tracking_sem, new_value));
-    TEST_CHECK_RET(uvm_gpu_tracking_semaphore_is_value_completed(tracking_sem, new_value - 1));
-    TEST_CHECK_RET(!uvm_gpu_tracking_semaphore_is_value_completed(tracking_sem, new_value + 1));
-    TEST_CHECK_RET(uvm_gpu_tracking_semaphore_is_completed(tracking_sem));
+
+    return NV_OK;
+}
+
+// Set the current state of the semaphore directly, bypassing
+// UVM_GPU_SEMAPHORE_MAX_JUMP detection.
+static void manual_set(uvm_gpu_tracking_semaphore_t *tracking_sem, NvU64 value)
+{
+    uvm_gpu_semaphore_set_payload(&tracking_sem->semaphore, (NvU32)value);
+    atomic64_set(&tracking_sem->completed_value, value);
+    tracking_sem->queued_value = value;
+}
+
+// Set the starting value and payload and expect a global error
+static NV_STATUS set_and_expect_error(uvm_gpu_tracking_semaphore_t *tracking_sem, NvU64 starting_value, NvU32 payload)
+{
+    manual_set(tracking_sem, starting_value);
+    uvm_gpu_semaphore_set_payload(&tracking_sem->semaphore, payload);
+
+    TEST_CHECK_RET(uvm_global_get_status() == NV_OK);
+    uvm_gpu_tracking_semaphore_update_completed_value(tracking_sem);
+    TEST_CHECK_RET(uvm_global_reset_fatal_error() == NV_ERR_INVALID_STATE);
+
+    return NV_OK;
+}
+
+static NV_STATUS test_invalid_jumps(uvm_gpu_tracking_semaphore_t *tracking_sem)
+{
+    int i;
+    for (i = 0; i < 10; ++i) {
+        NvU64 base = (1ULL<<32) * i;
+        TEST_NV_CHECK_RET(set_and_expect_error(tracking_sem, base, UVM_GPU_SEMAPHORE_MAX_JUMP + 1));
+        TEST_NV_CHECK_RET(set_and_expect_error(tracking_sem, base, UINT_MAX));
+        TEST_NV_CHECK_RET(set_and_expect_error(tracking_sem, base + i + 1, i));
+        TEST_NV_CHECK_RET(set_and_expect_error(tracking_sem, base + UINT_MAX / 2, UINT_MAX / 2 + UVM_GPU_SEMAPHORE_MAX_JUMP + 1));
+        TEST_NV_CHECK_RET(set_and_expect_error(tracking_sem, base + UINT_MAX / 2, UINT_MAX / 2 - i - 1));
+    }
 
     return NV_OK;
 }
@@ -73,11 +117,31 @@ static NV_STATUS test_tracking(uvm_va_space_t *va_space)
         goto done;
 
     for (i = 0; i < 100; ++i) {
-        status = add_and_test(&tracking_sem, UINT_MAX - 1);
+        status = add_and_test(&tracking_sem, UVM_GPU_SEMAPHORE_MAX_JUMP - i);
+        if (status != NV_OK)
+            goto done;
+    }
+
+    // Test wrap-around cases
+    for (i = 0; i < 100; ++i) {
+        // Start with a value right before wrap-around
+        NvU64 starting_value = (1ULL<<32) * (i + 1) - i - 1;
+        manual_set(&tracking_sem, starting_value);
+
+        // And set payload to after wrap-around
+        status = set_and_test(&tracking_sem, (1ULL<<32) * (i + 1) + i);
         if (status != NV_OK)
             goto done;
     }
 
+    g_uvm_global.disable_fatal_error_assert = true;
+    uvm_release_asserts_set_global_error_for_tests = true;
+    status = test_invalid_jumps(&tracking_sem);
+    uvm_release_asserts_set_global_error_for_tests = false;
+    g_uvm_global.disable_fatal_error_assert = false;
+    if (status != NV_OK)
+        goto done;
+
 done:
     uvm_gpu_tracking_semaphore_free(&tracking_sem);
     return status;
diff --git a/kernel-open/nvidia-uvm/uvm_push.h b/kernel-open/nvidia-uvm/uvm_push.h
index 6d8d15021..931006e64 100644
--- a/kernel-open/nvidia-uvm/uvm_push.h
+++ b/kernel-open/nvidia-uvm/uvm_push.h
@@ -52,11 +52,21 @@ typedef enum
     // By default all operations include a membar sys after any transfer and
     // before a semaphore operation.
     // This flag indicates that next operation should use no membar at all.
+    //
+    // For end of push semaphore release, this flag indicates that the push
+    // itself does not need a membar to be used (membar sys is the default). A
+    // membar may still be used, if needed to order the semaphore release
+    // write. See comments in uvm_channel_end_push().
     UVM_PUSH_FLAG_NEXT_MEMBAR_NONE,
 
     // By default all operations include a membar sys after any transfer and
     // before a semaphore operation.
     // This flag indicates that next operation should use a membar gpu instead.
+    //
+    // For end of push semaphore release, this flag indicates that the push
+    // itself only needs a membar gpu (the default is membar sys). A membar sys
+    // may still be used, if needed to order the semaphore release write. See
+    // comments in uvm_channel_end_push().
     UVM_PUSH_FLAG_NEXT_MEMBAR_GPU,
 
     UVM_PUSH_FLAG_COUNT,
diff --git a/kernel-open/nvidia/nv-dmabuf.c b/kernel-open/nvidia/nv-dmabuf.c
index 29894e9d0..84d3146a4 100644
--- a/kernel-open/nvidia/nv-dmabuf.c
+++ b/kernel-open/nvidia/nv-dmabuf.c
@@ -820,8 +820,13 @@ nv_dma_buf_reuse(
         goto cleanup_dmabuf;
     }
 
+
+
+
+
     if (params->index > (priv->total_objects - params->numObjects))
     {
+
         status = NV_ERR_INVALID_ARGUMENT;
         goto unlock_priv;
     }
diff --git a/kernel-open/nvidia/nv-mmap.c b/kernel-open/nvidia/nv-mmap.c
index 5c0f764c1..b62719cda 100644
--- a/kernel-open/nvidia/nv-mmap.c
+++ b/kernel-open/nvidia/nv-mmap.c
@@ -132,6 +132,13 @@ nvidia_vma_access(
     pageIndex = ((addr - vma->vm_start) >> PAGE_SHIFT);
     pageOffset = (addr & ~PAGE_MASK);
 
+
+
+
+
+
+
+
     if (!mmap_context->valid)
     {
         nv_printf(NV_DBG_ERRORS, "NVRM: VM: invalid mmap context\n");
@@ -430,7 +437,7 @@ static int nvidia_mmap_numa(
     const nv_alloc_mapping_context_t *mmap_context)
 {
     NvU64 start, addr;
-    unsigned int pages;
+    NvU64 pages;
     NvU64 i;
 
     pages = NV_VMA_SIZE(vma) >> PAGE_SHIFT;
@@ -509,6 +516,13 @@ int nvidia_mmap_helper(
         NvU64 access_start = mmap_context->access_start;
         NvU64 access_len = mmap_context->access_size;
 
+
+
+
+
+
+
+
         if (IS_REG_OFFSET(nv, access_start, access_len))
         {
             if (nv_encode_caching(&vma->vm_page_prot, NV_MEMORY_UNCACHED,
diff --git a/kernel-open/nvidia/nv.c b/kernel-open/nvidia/nv.c
index babdd2882..b8ce6d5a9 100644
--- a/kernel-open/nvidia/nv.c
+++ b/kernel-open/nvidia/nv.c
@@ -1467,6 +1467,11 @@ static int nv_open_device(nv_state_t *nv, nvidia_stack_t *sp)
         return -ENODEV;
     }
 
+
+
+
+
+
     if ( ! (nv->flags & NV_FLAG_OPEN))
     {
         /* Sanity check: !NV_FLAG_OPEN requires usage_count == 0 */
diff --git a/kernel-open/nvidia/nvidia.Kbuild b/kernel-open/nvidia/nvidia.Kbuild
index 6941c651b..6ca6abae8 100644
--- a/kernel-open/nvidia/nvidia.Kbuild
+++ b/kernel-open/nvidia/nvidia.Kbuild
@@ -219,6 +219,7 @@ NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_dram_clk_to_mc_clk
 NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_get_dram_num_channels
 NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_tegra_dram_types
 NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_pxm_to_node
+NV_CONFTEST_SYMBOL_COMPILE_TESTS += is_export_symbol_present_screen_info
 
 NV_CONFTEST_TYPE_COMPILE_TESTS += file_operations
 NV_CONFTEST_TYPE_COMPILE_TESTS += kuid_t
@@ -242,9 +243,9 @@ NV_CONFTEST_TYPE_COMPILE_TESTS += vmalloc_has_pgprot_t_arg
 NV_CONFTEST_TYPE_COMPILE_TESTS += mm_has_mmap_lock
 NV_CONFTEST_TYPE_COMPILE_TESTS += pci_channel_state
 NV_CONFTEST_TYPE_COMPILE_TESTS += pci_dev_has_ats_enabled
-NV_CONFTEST_TYPE_COMPILE_TESTS += mt_device_gre
 NV_CONFTEST_TYPE_COMPILE_TESTS += remove_memory_has_nid_arg
 NV_CONFTEST_TYPE_COMPILE_TESTS += add_memory_driver_managed_has_mhp_flags_arg
+NV_CONFTEST_TYPE_COMPILE_TESTS += num_registered_fb
 
 NV_CONFTEST_GENERIC_COMPILE_TESTS += dom0_kernel_present
 NV_CONFTEST_GENERIC_COMPILE_TESTS += nvidia_vgpu_kvm_build
diff --git a/kernel-open/nvidia/nvlink_linux.c b/kernel-open/nvidia/nvlink_linux.c
index 6c44d949b..af8a048d0 100644
--- a/kernel-open/nvidia/nvlink_linux.c
+++ b/kernel-open/nvidia/nvlink_linux.c
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2015-2019 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2015-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: MIT
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -207,7 +207,10 @@ static int nvlink_fops_release(struct inode *inode, struct file *filp)
 
     nvlink_print(NVLINK_DBG_INFO, "nvlink driver close\n");
 
-    WARN_ON(private == NULL);
+
+
+
+
 
     mutex_lock(&nvlink_drvctx.lock);
 
diff --git a/kernel-open/nvidia/os-interface.c b/kernel-open/nvidia/os-interface.c
index f8810c338..c9ff6f8da 100644
--- a/kernel-open/nvidia/os-interface.c
+++ b/kernel-open/nvidia/os-interface.c
@@ -1120,31 +1120,58 @@ void NV_API_CALL os_get_screen_info(
     NvU64 consoleBar2Address
 )
 {
-#if defined(CONFIG_FB)
-    int i;
     *pPhysicalAddress = 0;
     *pFbWidth = *pFbHeight = *pFbDepth = *pFbPitch = 0;
 
-    for (i = 0; i < num_registered_fb; i++)
+#if defined(CONFIG_FB) && defined(NV_NUM_REGISTERED_FB_PRESENT)
+    if (num_registered_fb > 0)
     {
-        if (!registered_fb[i])
-            continue;
+        int i;
+
+        for (i = 0; i < num_registered_fb; i++)
+        {
+            if (!registered_fb[i])
+                continue;
+
+            /* Make sure base address is mapped to GPU BAR */
+            if ((registered_fb[i]->fix.smem_start == consoleBar1Address) ||
+                (registered_fb[i]->fix.smem_start == consoleBar2Address))
+            {
+                *pPhysicalAddress = registered_fb[i]->fix.smem_start;
+                *pFbWidth = registered_fb[i]->var.xres;
+                *pFbHeight = registered_fb[i]->var.yres;
+                *pFbDepth = registered_fb[i]->var.bits_per_pixel;
+                *pFbPitch = registered_fb[i]->fix.line_length;
+                break;
+            }
+        }
+    }
+#elif NV_IS_EXPORT_SYMBOL_PRESENT_screen_info
+    /*
+     * If there is no framebuffer console, return a size of 0.
+     *
+     * orig_video_isVGA is set to 1 during early Linux kernel
+     * initialization, and is later set to a value such as
+     * VIDEO_TYPE_VLFB or VIDEO_TYPE_EFI if an fbdev console is used.
+     */
+    if (screen_info.orig_video_isVGA > 1)
+    {
+        NvU64 physAddr = screen_info.lfb_base;
+#if defined(VIDEO_CAPABILITY_64BIT_BASE)
+        physAddr |= (NvU64)screen_info.ext_lfb_base << 32;
+#endif
 
         /* Make sure base address is mapped to GPU BAR */
-        if ((registered_fb[i]->fix.smem_start == consoleBar1Address) ||
-            (registered_fb[i]->fix.smem_start == consoleBar2Address))
+        if ((physAddr == consoleBar1Address) ||
+            (physAddr == consoleBar2Address))
         {
-            *pPhysicalAddress = registered_fb[i]->fix.smem_start;
-            *pFbWidth = registered_fb[i]->var.xres;
-            *pFbHeight = registered_fb[i]->var.yres;
-            *pFbDepth = registered_fb[i]->var.bits_per_pixel;
-            *pFbPitch = registered_fb[i]->fix.line_length;
-            break;
+            *pPhysicalAddress = physAddr;
+            *pFbWidth = screen_info.lfb_width;
+            *pFbHeight = screen_info.lfb_height;
+            *pFbDepth = screen_info.lfb_depth;
+            *pFbPitch = screen_info.lfb_linelength;
         }
     }
-#else
-    *pPhysicalAddress = 0;
-    *pFbWidth = *pFbHeight = *pFbDepth = *pFbPitch = 0;
 #endif
 }
 
diff --git a/src/common/inc/nvBldVer.h b/src/common/inc/nvBldVer.h
index 6009d40c0..3db404ab5 100644
--- a/src/common/inc/nvBldVer.h
+++ b/src/common/inc/nvBldVer.h
@@ -36,25 +36,25 @@
 // and then checked back in. You cannot make changes to these sections without
 // corresponding changes to the buildmeister script
 #ifndef NV_BUILD_BRANCH
-    #define NV_BUILD_BRANCH             r516_87
+    #define NV_BUILD_BRANCH             r515_00
 #endif
 #ifndef NV_PUBLIC_BRANCH
-    #define NV_PUBLIC_BRANCH             r516_87
+    #define NV_PUBLIC_BRANCH             r515_00
 #endif
 
 #if defined(NV_LINUX) || defined(NV_BSD) || defined(NV_SUNOS)
-#define NV_BUILD_BRANCH_VERSION         "rel/gpu_drv/r515/r516_87-317"
-#define NV_BUILD_CHANGELIST_NUM         (31589401)
+#define NV_BUILD_BRANCH_VERSION         "rel/gpu_drv/r515/r515_00-409"
+#define NV_BUILD_CHANGELIST_NUM         (31799928)
 #define NV_BUILD_TYPE                   "Official"
-#define NV_BUILD_NAME                   "rel/gpu_drv/r515/r516_87-317"
-#define NV_LAST_OFFICIAL_CHANGELIST_NUM (31589401)
+#define NV_BUILD_NAME                   "rel/gpu_drv/r515/r515_00-409"
+#define NV_LAST_OFFICIAL_CHANGELIST_NUM (31799928)
 
 #else     /* Windows builds */
-#define NV_BUILD_BRANCH_VERSION         "r516_87-1"
-#define NV_BUILD_CHANGELIST_NUM         (31588177)
+#define NV_BUILD_BRANCH_VERSION         "r515_00-323"
+#define NV_BUILD_CHANGELIST_NUM         (31799928)
 #define NV_BUILD_TYPE                   "Official"
-#define NV_BUILD_NAME                   "516.90"
-#define NV_LAST_OFFICIAL_CHANGELIST_NUM (31588177)
+#define NV_BUILD_NAME                   "517.40"
+#define NV_LAST_OFFICIAL_CHANGELIST_NUM (31799928)
 #define NV_BUILD_BRANCH_BASE_VERSION    R515
 #endif
 // End buildmeister python edited section
diff --git a/src/common/inc/nvUnixVersion.h b/src/common/inc/nvUnixVersion.h
index 14f2d67f3..d4eba52cc 100644
--- a/src/common/inc/nvUnixVersion.h
+++ b/src/common/inc/nvUnixVersion.h
@@ -4,7 +4,7 @@
 #if defined(NV_LINUX) || defined(NV_BSD) || defined(NV_SUNOS) || defined(NV_VMWARE) || defined(NV_QNX) || defined(NV_INTEGRITY) || \
     (defined(RMCFG_FEATURE_PLATFORM_GSP) && RMCFG_FEATURE_PLATFORM_GSP == 1)
 
-#define NV_VERSION_STRING               "515.65.01"
+#define NV_VERSION_STRING               "515.76"
 
 #else
 
diff --git a/src/common/inc/nvlog_defs.h b/src/common/inc/nvlog_defs.h
index e0b3b415f..45e1a3f3a 100644
--- a/src/common/inc/nvlog_defs.h
+++ b/src/common/inc/nvlog_defs.h
@@ -195,6 +195,11 @@ extern NVLOG_LOGGER NvLogLogger;
 #define NVLOG_BUFFER_FLAGS_FORMAT_LIBOS_LOG              1
 #define NVLOG_BUFFER_FLAGS_FORMAT_MEMTRACK               2
 
+// Never deallocate this buffer until RM is unloaded
+#define NVLOG_BUFFER_FLAGS_PRESERVE                     11:11
+#define NVLOG_BUFFER_FLAGS_PRESERVE_NO                  0
+#define NVLOG_BUFFER_FLAGS_PRESERVE_YES                 1
+
 // Buffer GPU index
 #define NVLOG_BUFFER_FLAGS_GPU_INSTANCE              31:24
 
diff --git a/src/common/modeset/timing/nvtiming.h b/src/common/modeset/timing/nvtiming.h
index 70ee491cf..047d7af93 100644
--- a/src/common/modeset/timing/nvtiming.h
+++ b/src/common/modeset/timing/nvtiming.h
@@ -4091,6 +4091,8 @@ typedef struct tagNVT_GAMUT_METADATA
 #define NVT_DPCD_ADDRESS_DOWN_REP_BUFFER_FIELD              0x01400
 #define NVT_DPCD_ADDRESS_UP_REQ_BUFFER_FIELD                0x01600
 #define NVT_DPCD_ADDRESS_DEVICE_SERVICE_IRQ_VECTOR_ESI0     0x02003
+#define NVT_DPCD_ADDRESS_DP_TUNNELING_DEVICE_IEEE_OUI       0xE0000
+#define NVT_DPCD_ADDRESS_DP_TUNNELING_DEVICE_ID_STRING      0xE0003
 #define NVT_DPCD_ADDRESS_DP_TUNNELING_CAPS_SUPPORT_FIELD    0xE000D
 #define NVT_DPCD_ADDRESS_DP_IN_ADAPTER_INFO_FIELD           0xE000E
 #define NVT_DPCD_ADDRESS_USB4_DRIVER_ID_FIELD               0xE000F
@@ -5079,7 +5081,7 @@ typedef struct tagNVT_DPCD_CONFIG
 
 typedef struct tagNVT_DPCD_DP_TUNNELING_CAPS
 {
-    NvU8 dpTunnelingSupport               : 1; // DP Tunneling through USB4 Support
+    NvU8 dpTunneling                      : 1; // DP Tunneling through USB4 Support
     NvU8 reserved                         : 5; // Reserved.
     NvU8 dpPanelReplayTunnelingOptSupport : 1; // Panel Replay Tunneling Optimization Support
     NvU8 dpInBwAllocationModeSupport      : 1; // DP IN Bandwidth Allocation Mode Support
diff --git a/src/common/nvlink/interface/nvlink_lib_ctrl.h b/src/common/nvlink/interface/nvlink_lib_ctrl.h
index e81aed4a8..19bb0168e 100644
--- a/src/common/nvlink/interface/nvlink_lib_ctrl.h
+++ b/src/common/nvlink/interface/nvlink_lib_ctrl.h
@@ -64,7 +64,7 @@
  * Total number of nvlink endpoints core library can have
  *  This is mapped to NVLINK_MAX_SYSTEM_LINK_NUM in drivers/nvlink/interface/nvlink.h
  */
-#define NVLINK_MAX_NVLINK_ENDPOINTS 312
+#define NVLINK_MAX_NVLINK_ENDPOINTS 624
 
 #define NVLINK_VERSION_STRING_LENGTH    64
 
diff --git a/src/common/nvlink/kernel/nvlink/interface/nvlink_ioctl_entry.c b/src/common/nvlink/kernel/nvlink/interface/nvlink_ioctl_entry.c
index 0afb10f8e..a0a07853e 100644
--- a/src/common/nvlink/kernel/nvlink/interface/nvlink_ioctl_entry.c
+++ b/src/common/nvlink/kernel/nvlink/interface/nvlink_ioctl_entry.c
@@ -28,6 +28,7 @@
 #include "../nvlink_ctx.h"
 #include "../nvlink_helper.h"
 #include "nvlink_lock.h"
+#include "nvctassert.h"
 
 #define NVLINK_IOC_GET_BUF(ctrlParams, type) (ctrlParams)->size >= sizeof(type) ? (type *) (ctrlParams)->buf : NULL
 
@@ -3423,6 +3424,8 @@ nvlink_lib_ctrl_get_device_link_states
     NvU32         numLinks  = 0;
     NvU32         i         = 0;
 
+    ct_assert(NVLINK_MAX_SYSTEM_LINK_NUM == NVLINK_MAX_NVLINK_ENDPOINTS);
+
     nvlink_link   **links = (nvlink_link **)nvlink_malloc(
                             sizeof(nvlink_link *) * NVLINK_MAX_SYSTEM_LINK_NUM);
     if (links == NULL)
diff --git a/src/common/uproc/os/libos-v2.0.0/debug/logdecode.c b/src/common/uproc/os/libos-v2.0.0/debug/logdecode.c
index f898adadb..8b0f58175 100644
--- a/src/common/uproc/os/libos-v2.0.0/debug/logdecode.c
+++ b/src/common/uproc/os/libos-v2.0.0/debug/logdecode.c
@@ -1041,24 +1041,41 @@ static NvBool libosCopyLogToNvlog_nowrap(LIBOS_LOG_DECODE_LOG *pLog)
     NvU64 putCopy                      = pLog->physicLogBuffer[0];
     NvU64 putOffset                    = putCopy * sizeof(NvU64) + sizeof(NvU64);
 
-    if (putOffset == pNvLogBuffer->pos)
+    //
+    // If RM was not unloaded, we will reuse a preserved nowrap nvlog buffer with the fresh
+    // physical log buffer. In this case, we fix up all the offsets into the nvlog buffer to be
+    // relative to its preserved position rather than the start.
+    //
+    NvU64 nvlogPos                     = pNvLogBuffer->pos - pLog->preservedNoWrapPos;
+
+    if (putOffset < nvlogPos)
+    {
+        // Buffer put counter unexpectedly reset. Terminate nowrap log collection.
+        return NV_FALSE;
+    }
+
+    if (putOffset == nvlogPos)
     {
         // No new data
         return NV_TRUE;
     }
 
-    if (putOffset > pNvLogBuffer->size - NV_OFFSETOF(LIBOS_LOG_NVLOG_BUFFER, data) - sizeof(NvU64))
+    if (putOffset + pLog->preservedNoWrapPos >
+        pNvLogBuffer->size - NV_OFFSETOF(LIBOS_LOG_NVLOG_BUFFER, data) - sizeof(NvU64))
     {
         // Are we done filling nowrap?
         return NV_FALSE;
     }
 
-    NvU64 len  = putOffset - pNvLogBuffer->pos;
-    NvU8 *pSrc = ((NvU8 *)pLog->physicLogBuffer) + pNvLogBuffer->pos;
+    NvU64 len  = putOffset - nvlogPos;
+    NvU8 *pSrc = ((NvU8 *)pLog->physicLogBuffer) + nvlogPos;
     NvU8 *pDst = pNoWrapBuf->data + pNvLogBuffer->pos;
+
+    pLog->bDidPush = NV_TRUE;
+
     portMemCopy(pDst, len, pSrc, len);
-    pNvLogBuffer->pos            = putOffset; // TODO: usage of NVLOG_BUFFER::pos is sus here, reconsider?
-    *(NvU64 *)(pNoWrapBuf->data) = putCopy;
+    pNvLogBuffer->pos            = putOffset + pLog->preservedNoWrapPos; // TODO: usage of NVLOG_BUFFER::pos is sus here, reconsider?
+    *(NvU64 *)(pNoWrapBuf->data) = putCopy + pLog->preservedNoWrapPos / sizeof(NvU64);
     return NV_TRUE;
 }
 
@@ -1095,6 +1112,46 @@ static void libosExtractLogs_nvlog(LIBOS_LOG_DECODE *logDecode, NvBool bSyncNvLo
     }
 }
 
+void libosPreserveLogs(LIBOS_LOG_DECODE *pLogDecode)
+{
+    NvU64 i;
+    for (i = 0; i < pLogDecode->numLogBuffers; i++)
+    {
+        LIBOS_LOG_DECODE_LOG *pLog = &pLogDecode->log[i];
+
+        if (pLog->bDidPush)
+        {
+            NvHandle hNvlog = pLog->hNvLogNoWrap;
+            NVLOG_BUFFER *pNvLogBuffer = NvLogLogger.pBuffers[hNvlog];
+
+            if (hNvlog == 0 || pNvLogBuffer == NULL)
+                continue;
+
+            pNvLogBuffer->flags |= DRF_DEF(LOG, _BUFFER_FLAGS, _PRESERVE, _YES);
+        }
+    }
+}
+
+static NvBool findPreservedNvlogBuffer(NvU32 tag, NvU32 gpuInstance, NVLOG_BUFFER_HANDLE *pHandle)
+{
+    NVLOG_BUFFER_HANDLE handle = 0;
+    NV_STATUS status = nvlogGetBufferHandleFromTag(tag, &handle);
+
+    if (status != NV_OK)
+        return NV_FALSE;
+
+    NVLOG_BUFFER *pNvLogBuffer = NvLogLogger.pBuffers[handle];
+    if (FLD_TEST_DRF(LOG_BUFFER, _FLAGS, _PRESERVE, _YES, pNvLogBuffer->flags) &&
+        DRF_VAL(LOG, _BUFFER_FLAGS, _GPU_INSTANCE, pNvLogBuffer->flags) == gpuInstance &&
+        (pNvLogBuffer->pos < pNvLogBuffer->size - NV_OFFSETOF(LIBOS_LOG_NVLOG_BUFFER, data) - sizeof(NvU64)))
+    {
+        *pHandle = handle;
+        return NV_TRUE;
+    }
+
+    return NV_FALSE;
+}
+
 #endif // LIBOS_LOG_TO_NVLOG
 
 /**
@@ -1211,39 +1268,60 @@ void libosLogAddLogEx(LIBOS_LOG_DECODE *logDecode, void *buffer, NvU64 bufferSiz
     pLog->hNvLogWrap   = 0;
     pLog->bNvLogNoWrap = NV_FALSE;
 
-    LIBOS_LOG_NVLOG_BUFFER *pNoWrapBuf;
+    pLog->bDidPush             = NV_FALSE;
+    pLog->preservedNoWrapPos   = 0;
 
-    status = nvlogAllocBuffer(
-        bufferSize + NV_OFFSETOF(LIBOS_LOG_NVLOG_BUFFER, data), libosNoWrapBufferFlags,
-        LIBOS_LOG_NVLOG_BUFFER_TAG(logDecode->sourceName, i * 2),
-        &pLog->hNvLogNoWrap);
+    LIBOS_LOG_NVLOG_BUFFER *pNoWrapBuf;
+    NvU32 tag = LIBOS_LOG_NVLOG_BUFFER_TAG(logDecode->sourceName, i * 2);
+    NvBool bFoundPreserved = findPreservedNvlogBuffer(tag, gpuInstance, &pLog->hNvLogNoWrap);
 
-    if (status == NV_OK)
+    if (!bFoundPreserved)
     {
-        pNoWrapBuf = (LIBOS_LOG_NVLOG_BUFFER *)NvLogLogger.pBuffers[pLog->hNvLogNoWrap]->data;
-        if (name)
+        status = nvlogAllocBuffer(
+            bufferSize + NV_OFFSETOF(LIBOS_LOG_NVLOG_BUFFER, data), libosNoWrapBufferFlags,
+            tag,
+            &pLog->hNvLogNoWrap);
+
+        if (status == NV_OK)
         {
-            portStringCopy(
-                pNoWrapBuf->taskPrefix, sizeof pNoWrapBuf->taskPrefix, name, sizeof pNoWrapBuf->taskPrefix);
-        }
+            pNoWrapBuf = (LIBOS_LOG_NVLOG_BUFFER *)NvLogLogger.pBuffers[pLog->hNvLogNoWrap]->data;
+            if (name)
+            {
+                portStringCopy(
+                    pNoWrapBuf->taskPrefix, sizeof pNoWrapBuf->taskPrefix, name, sizeof pNoWrapBuf->taskPrefix);
+            }
 
-        pNoWrapBuf->gpuArch = gpuArch;
-        pNoWrapBuf->gpuImpl = gpuImpl;
+            pNoWrapBuf->gpuArch = gpuArch;
+            pNoWrapBuf->gpuImpl = gpuImpl;
 
-        NvLogLogger.pBuffers[pLog->hNvLogNoWrap]->pos = sizeof(NvU64); // offset to account for put pointer
-        pLog->bNvLogNoWrap                            = NV_TRUE;
+            NvLogLogger.pBuffers[pLog->hNvLogNoWrap]->pos = sizeof(NvU64); // offset to account for put pointer
+            pLog->bNvLogNoWrap                            = NV_TRUE;
+        }
+        else
+        {
+            printf("nvlogAllocBuffer nowrap failed\n");
+        }
     }
     else
     {
-        printf("nvlogAllocBuffer nowrap failed\n");
+        pLog->bNvLogNoWrap = NV_TRUE;
+        pLog->preservedNoWrapPos = NvLogLogger.pBuffers[pLog->hNvLogNoWrap]->pos;
+
+        //
+        // The 0th NvU64 is the last value of the put pointer from the physical log buffer, which
+        // is the number of NvU64 log buffer elements in it plus one.
+        // Subtract one NvU64 from it to avoid an off-by-one error.
+        //
+        if (pLog->preservedNoWrapPos >= sizeof(NvU64))
+            pLog->preservedNoWrapPos -= sizeof(NvU64);
     }
 
     LIBOS_LOG_NVLOG_BUFFER *pWrapBuf;
+    tag = LIBOS_LOG_NVLOG_BUFFER_TAG(logDecode->sourceName, i * 2 + 1);
 
     status = nvlogAllocBuffer(
         bufferSize + NV_OFFSETOF(LIBOS_LOG_NVLOG_BUFFER, data), libosWrapBufferFlags,
-        LIBOS_LOG_NVLOG_BUFFER_TAG(logDecode->sourceName, i * 2 + 1),
-        &pLog->hNvLogWrap);
+        tag, &pLog->hNvLogWrap);
 
     if (status == NV_OK)
     {
@@ -1349,13 +1427,13 @@ void libosLogDestroy(LIBOS_LOG_DECODE *logDecode)
 
         if (pLog->hNvLogNoWrap != 0)
         {
-            nvlogDeallocBuffer(pLog->hNvLogNoWrap);
+            nvlogDeallocBuffer(pLog->hNvLogNoWrap, NV_FALSE);
             pLog->hNvLogNoWrap = 0;
         }
 
         if (pLog->hNvLogWrap != 0)
         {
-            nvlogDeallocBuffer(pLog->hNvLogWrap);
+            nvlogDeallocBuffer(pLog->hNvLogWrap, NV_FALSE);
             pLog->hNvLogWrap = 0;
         }
     }
diff --git a/src/common/uproc/os/libos-v2.0.0/debug/logdecode.h b/src/common/uproc/os/libos-v2.0.0/debug/logdecode.h
index 48e6ea048..51f3b8123 100644
--- a/src/common/uproc/os/libos-v2.0.0/debug/logdecode.h
+++ b/src/common/uproc/os/libos-v2.0.0/debug/logdecode.h
@@ -108,6 +108,9 @@ struct LIBOS_LOG_DECODE_LOG
     NvU32 hNvLogNoWrap;  // No wrap buffer captures first records.
     NvU32 hNvLogWrap;    // Wrap buffer captures last records.
     NvBool bNvLogNoWrap; // NV_TRUE if no wrap buffer not full.
+
+    NvBool bDidPush;     // NV_TRUE if this buffer was ever pushed to
+    NvU64 preservedNoWrapPos; // Position in preserved nvlog buffer
 #endif
 
 #if LIBOS_LOG_DECODE_ENABLE
@@ -170,6 +173,8 @@ void libosLogDestroy(LIBOS_LOG_DECODE *logDecode);
 
 void libosExtractLogs(LIBOS_LOG_DECODE *logDecode, NvBool bSyncNvLog);
 
+void libosPreserveLogs(LIBOS_LOG_DECODE *pLogDecode);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/nvidia-modeset/os-interface/include/nvidia-modeset-os-interface.h b/src/nvidia-modeset/os-interface/include/nvidia-modeset-os-interface.h
index 36685a026..8036811f9 100644
--- a/src/nvidia-modeset/os-interface/include/nvidia-modeset-os-interface.h
+++ b/src/nvidia-modeset/os-interface/include/nvidia-modeset-os-interface.h
@@ -110,6 +110,7 @@ typedef struct {
     } set_maxval;
 } NvKmsSyncPtOpParams;
 
+NvBool nvkms_output_rounding_fix(void);
 
 void   nvkms_call_rm    (void *ops);
 void*  nvkms_alloc      (size_t size,
diff --git a/src/nvidia-modeset/src/nvkms-evo3.c b/src/nvidia-modeset/src/nvkms-evo3.c
index 76458f06d..6cb390ef8 100644
--- a/src/nvidia-modeset/src/nvkms-evo3.c
+++ b/src/nvidia-modeset/src/nvkms-evo3.c
@@ -1288,6 +1288,8 @@ static void EvoSetOCsc0C5(NVDispEvoPtr pDispEvo, const NvU32 head)
 
     const float32_t zeroF32 = NvU32viewAsF32(NV_FLOAT_ZERO);
     const float32_t oneF32 = NvU32viewAsF32(NV_FLOAT_ONE);
+    const float32_t inv2048F32 = f32_div(NvU32viewAsF32(NV_FLOAT_HALF),
+                                         NvU32viewAsF32(NV_FLOAT_1024));
     /* divide satCos by the default setting of 1024 */
     const float32_t satCos = f32_div(i32_to_f32(pHeadState->procAmp.satCos),
                                      NvU32viewAsF32(NV_FLOAT_1024));
@@ -1324,6 +1326,12 @@ static void EvoSetOCsc0C5(NVDispEvoPtr pDispEvo, const NvU32 head)
     ocsc0Matrix = nvMultiply3x4Matrix(&satHueMatrix, &ocsc0Matrix);
     ocsc0Matrix = nvMultiply3x4Matrix(&CrYCbtoRGBMatrix, &ocsc0Matrix);
 
+    if (nvkms_output_rounding_fix()) {
+        ocsc0Matrix.m[0][3] = f32_add(ocsc0Matrix.m[0][3], inv2048F32);
+        ocsc0Matrix.m[1][3] = f32_add(ocsc0Matrix.m[1][3], inv2048F32);
+        ocsc0Matrix.m[2][3] = f32_add(ocsc0Matrix.m[2][3], inv2048F32);
+    }
+
     nvDmaSetStartEvoMethod(pChannel, NVC57D_HEAD_SET_OCSC0COEFFICIENT_C00(head), 12);
     nvDmaSetEvoMethodData(pChannel, DRF_NUM(C57D, _HEAD_SET_OCSC0COEFFICIENT_C00, _VALUE, cscCoefConvertS514(ocsc0Matrix.m[0][0])));
     nvDmaSetEvoMethodData(pChannel, DRF_NUM(C57D, _HEAD_SET_OCSC0COEFFICIENT_C01, _VALUE, cscCoefConvertS514(ocsc0Matrix.m[0][1])));
@@ -1965,11 +1973,13 @@ static inline NvU32 GetMaxPixelsFetchedPerLine(NvU16 inWidth,
 static void SetScalingUsageBoundsOneWindow5(
                                 NVDevEvoPtr pDevEvo, NvU32 window,
                                 const struct NvKmsScalingUsageBounds *pScaling,
+                                NvBool layerUsable,
                                 const NVHwModeViewPortEvo *pViewPort,
                                 NVEvoUpdateState *updateState)
 {
     NVEvoChannelPtr pChannel = pDevEvo->core;
     NvU32 setWindowUsageBounds = NV_EVO3_DEFAULT_WINDOW_USAGE_BOUNDS_C5;
+    NvU32 maxPixelsFetchedPerLine;
 
     nvUpdateUpdateState(pDevEvo, updateState, pChannel);
 
@@ -1981,10 +1991,15 @@ static void SetScalingUsageBoundsOneWindow5(
         DRF_NUM(C57D, _WINDOW_SET_MAX_INPUT_SCALE_FACTOR, _VERTICAL,
                 pScaling->maxVDownscaleFactor));
 
+    if (layerUsable) {
+        maxPixelsFetchedPerLine = GetMaxPixelsFetchedPerLine(pViewPort->in.width,
+                                                   pScaling->maxHDownscaleFactor);
+    } else {
+        maxPixelsFetchedPerLine = 0;
+    }
+
     setWindowUsageBounds |=
-        (DRF_NUM(C57D, _WINDOW_SET_WINDOW_USAGE_BOUNDS, _MAX_PIXELS_FETCHED_PER_LINE,
-                 GetMaxPixelsFetchedPerLine(pViewPort->in.width,
-                 pScaling->maxHDownscaleFactor))) |
+        (DRF_NUM(C57D, _WINDOW_SET_WINDOW_USAGE_BOUNDS, _MAX_PIXELS_FETCHED_PER_LINE,maxPixelsFetchedPerLine)) |
         (pScaling->vTaps >= NV_EVO_SCALER_5TAPS ?
             DRF_DEF(C57D, _WINDOW_SET_WINDOW_USAGE_BOUNDS, _INPUT_SCALER_TAPS, _TAPS_5) :
             DRF_DEF(C57D, _WINDOW_SET_WINDOW_USAGE_BOUNDS, _INPUT_SCALER_TAPS, _TAPS_2)) |
@@ -2056,8 +2071,9 @@ static NvBool EvoSetUsageBoundsC5(NVDevEvoPtr pDevEvo, NvU32 sd, NvU32 head,
     needCoreUpdate = EvoSetUsageBounds3(pDevEvo, sd, head, pUsage, updateState);
 
     for (layer = 0; layer < pDevEvo->head[head].numLayers; layer++) {
-        if (!nvEvoScalingUsageBoundsEqual(&pCurrentUsage->layer[layer].scaling,
-                                          &pUsage->layer[layer].scaling)) {
+        if ((pCurrentUsage->layer[layer].usable != pUsage->layer[layer].usable) ||
+            (!nvEvoScalingUsageBoundsEqual(&pCurrentUsage->layer[layer].scaling,
+                                           &pUsage->layer[layer].scaling))) {
             const NVHwModeViewPortEvo *pViewPort =
                 &pDevEvo->gpus[sd].pDispEvo->headState[head].timings.viewPort;
 
@@ -2066,6 +2082,7 @@ static NvBool EvoSetUsageBoundsC5(NVDevEvoPtr pDevEvo, NvU32 sd, NvU32 head,
                 NV_EVO_CHANNEL_MASK_WINDOW_NUMBER(
                     pDevEvo->head[head].layer[layer]->channelMask),
                 &pUsage->layer[layer].scaling,
+                pUsage->layer[layer].usable,
                 pViewPort,
                 updateState);
             needCoreUpdate = TRUE;
@@ -4383,7 +4400,9 @@ static void EvoSetLUTContextDmaC5(const NVDispEvoRec *pDispEvo,
 
     nvDmaSetStartEvoMethod(pChannel, NVC57D_HEAD_SET_OLUT_CONTROL(head), 1);
     nvDmaSetEvoMethodData(pChannel,
-        DRF_DEF(C57D, _HEAD_SET_OLUT_CONTROL, _INTERPOLATE, _ENABLE) |
+        (!nvkms_output_rounding_fix() ?
+            DRF_DEF(C57D, _HEAD_SET_OLUT_CONTROL, _INTERPOLATE, _ENABLE) :
+            DRF_DEF(C57D, _HEAD_SET_OLUT_CONTROL, _INTERPOLATE, _DISABLE)) |
         DRF_DEF(C57D, _HEAD_SET_OLUT_CONTROL, _MIRROR, _DISABLE) |
         DRF_DEF(C57D, _HEAD_SET_OLUT_CONTROL, _MODE, _DIRECT10) |
         DRF_NUM(C57D, _HEAD_SET_OLUT_CONTROL, _SIZE, NV_LUT_VSS_HEADER_SIZE +
@@ -5180,13 +5199,11 @@ static NvBool EvoSetViewportInOut3(NVDevEvoPtr pDevEvo, const int head,
                                    const NVHwModeViewPortEvo *pViewPortMin,
                                    const NVHwModeViewPortEvo *pViewPort,
                                    const NVHwModeViewPortEvo *pViewPortMax,
-                                   NVEvoUpdateState *updateState,
-                                   NvU32 setWindowUsageBounds)
+                                   NVEvoUpdateState *updateState)
 {
     const NVEvoCapabilitiesPtr pEvoCaps = &pDevEvo->gpus[0].capabilities;
     NVEvoChannelPtr pChannel = pDevEvo->core;
     struct NvKmsScalingUsageBounds scalingUsageBounds = { };
-    NvU32 win;
 
     /* These methods should only apply to a single pDpy */
     nvAssert(pDevEvo->subDevMaskStackDepth > 0);
@@ -5232,31 +5249,6 @@ static NvBool EvoSetViewportInOut3(NVDevEvoPtr pDevEvo, const int head,
         DRF_NUM(C37D, _HEAD_SET_MAX_OUTPUT_SCALE_FACTOR, _VERTICAL,
                 scalingUsageBounds.maxVDownscaleFactor));
 
-    /*
-     * Program MAX_PIXELS_FETCHED_PER_LINE window usage bounds
-     * for each window that’s attached to the head.
-     *
-     * Precomp will clip the post-scaled window to the input viewport, reverse-scale
-     * this cropped size back to the input surface domain, and isohub will fetch
-     * this cropped size. This function assumes that there's no window scaling yet,
-     * so the MAX_PIXELS_FETCHED_PER_LINE will be bounded by the input viewport
-     * width. SetScalingUsageBoundsOneWindow5() will take care of updating
-     * MAX_PIXELS_FETCHED_PER_LINE, if window scaling is enabled later.
-     */
-    setWindowUsageBounds |=
-        DRF_NUM(C37D, _WINDOW_SET_WINDOW_USAGE_BOUNDS, _MAX_PIXELS_FETCHED_PER_LINE,
-                GetMaxPixelsFetchedPerLine(pViewPort->in.width,
-                NV_EVO_SCALE_FACTOR_1X));
-
-    for (win = 0; win < pDevEvo->numWindows; win++) {
-        if (head != pDevEvo->headForWindow[win]) {
-            continue;
-        }
-
-        nvDmaSetStartEvoMethod(pChannel, NVC37D_WINDOW_SET_WINDOW_USAGE_BOUNDS(win), 1);
-        nvDmaSetEvoMethodData(pChannel, setWindowUsageBounds);
-    }
-
     return scalingUsageBounds.vUpscalingAllowed;
 }
 
@@ -5267,10 +5259,11 @@ static void EvoSetViewportInOutC3(NVDevEvoPtr pDevEvo, const int head,
                                   NVEvoUpdateState *updateState)
 {
     NVEvoChannelPtr pChannel = pDevEvo->core;
+    NvU32 win;
+    NvU32 setWindowUsageBounds = NV_EVO3_DEFAULT_WINDOW_USAGE_BOUNDS_C3;
     NvBool verticalUpscalingAllowed =
         EvoSetViewportInOut3(pDevEvo, head, pViewPortMin, pViewPort,
-                             pViewPortMax, updateState,
-                             NV_EVO3_DEFAULT_WINDOW_USAGE_BOUNDS_C3);
+                             pViewPortMax, updateState);
 
     nvDmaSetStartEvoMethod(pChannel,
         NVC37D_HEAD_SET_HEAD_USAGE_BOUNDS(head), 1);
@@ -5280,6 +5273,34 @@ static void EvoSetViewportInOutC3(NVDevEvoPtr pDevEvo, const int head,
         (verticalUpscalingAllowed ?
             DRF_DEF(C37D, _HEAD_SET_HEAD_USAGE_BOUNDS, _UPSCALING_ALLOWED, _TRUE) :
             DRF_DEF(C37D, _HEAD_SET_HEAD_USAGE_BOUNDS, _UPSCALING_ALLOWED, _FALSE)));
+     /*
+      * Program MAX_PIXELS_FETCHED_PER_LINE window usage bounds
+      * for each window that is attached to the head.
+      *
+      * Precomp will clip the post-scaled window to the input viewport, reverse-scale
+      * this cropped size back to the input surface domain, and isohub will fetch
+      * this cropped size. This function assumes that there's no window scaling yet,
+      * so the MAX_PIXELS_FETCHED_PER_LINE will be bounded by the input viewport
+      * width. SetScalingUsageBoundsOneWindow5() will take care of updating
+      * MAX_PIXELS_FETCHED_PER_LINE, if window scaling is enabled later.
+      * On Volta, program each window that is attached to the head. For Turing+,
+      * SetScalingUsageBoundsOneWindow5() will take care of programming window
+      * usage bounds only for the layers/windows in use.
+      */
+
+    setWindowUsageBounds |=
+       DRF_NUM(C37D, _WINDOW_SET_WINDOW_USAGE_BOUNDS, _MAX_PIXELS_FETCHED_PER_LINE,
+               GetMaxPixelsFetchedPerLine(pViewPort->in.width,
+               NV_EVO_SCALE_FACTOR_1X));
+
+    for (win = 0; win < pDevEvo->numWindows; win++) {
+        if (head != pDevEvo->headForWindow[win]) {
+            continue;
+        }
+
+        nvDmaSetStartEvoMethod(pChannel, NVC37D_WINDOW_SET_WINDOW_USAGE_BOUNDS(win), 1);
+        nvDmaSetEvoMethodData(pChannel, setWindowUsageBounds);
+    }
 }
 
 static void EvoSetViewportInOutC5(NVDevEvoPtr pDevEvo, const int head,
@@ -5289,13 +5310,9 @@ static void EvoSetViewportInOutC5(NVDevEvoPtr pDevEvo, const int head,
                                   NVEvoUpdateState *updateState)
 {
     NVEvoChannelPtr pChannel = pDevEvo->core;
-    NvU32 setWindowUsageBounds =
-        (NV_EVO3_DEFAULT_WINDOW_USAGE_BOUNDS_C5 |
-         DRF_DEF(C57D, _WINDOW_SET_WINDOW_USAGE_BOUNDS, _INPUT_SCALER_TAPS, _TAPS_2) |
-         DRF_DEF(C57D, _WINDOW_SET_WINDOW_USAGE_BOUNDS, _UPSCALING_ALLOWED, _FALSE));
     NvU32 verticalUpscalingAllowed =
         EvoSetViewportInOut3(pDevEvo, head, pViewPortMin, pViewPort,
-                             pViewPortMax, updateState, setWindowUsageBounds);
+                             pViewPortMax, updateState);
 
     nvDmaSetStartEvoMethod(pChannel,
         NVC57D_HEAD_SET_HEAD_USAGE_BOUNDS(head), 1);
diff --git a/src/nvidia/arch/nvalloc/unix/include/nv.h b/src/nvidia/arch/nvalloc/unix/include/nv.h
index 1349d634d..96324c7a2 100644
--- a/src/nvidia/arch/nvalloc/unix/include/nv.h
+++ b/src/nvidia/arch/nvalloc/unix/include/nv.h
@@ -619,27 +619,33 @@ typedef enum
 #define NV_GET_NV_STATE(pGpu) \
     (nv_state_t *)((pGpu) ? (pGpu)->pOsGpuInfo : NULL)
 
-#define IS_REG_OFFSET(nv, offset, length)                                       \
-    (((offset) >= (nv)->regs->cpu_address) &&                                   \
-    (((offset) + ((length)-1)) <=                                               \
-        (nv)->regs->cpu_address + ((nv)->regs->size-1)))
-
-#define IS_FB_OFFSET(nv, offset, length)                                        \
-    (((nv)->fb) && ((offset) >= (nv)->fb->cpu_address) &&                       \
-    (((offset) + ((length)-1)) <= (nv)->fb->cpu_address + ((nv)->fb->size-1)))
-
-#define IS_UD_OFFSET(nv, offset, length)                                        \
-    (((nv)->ud.cpu_address != 0) && ((nv)->ud.size != 0) &&                     \
-    ((offset) >= (nv)->ud.cpu_address) &&                                       \
-    (((offset) + ((length)-1)) <= (nv)->ud.cpu_address + ((nv)->ud.size-1)))
-
-#define IS_IMEM_OFFSET(nv, offset, length)                                      \
-    (((nv)->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address != 0) &&                    \
-     ((nv)->bars[NV_GPU_BAR_INDEX_IMEM].size != 0) &&                           \
-     ((offset) >= (nv)->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address) &&             \
-     (((offset) + ((length) - 1)) <=                                            \
-        (nv)->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address +                         \
-            ((nv)->bars[NV_GPU_BAR_INDEX_IMEM].size - 1)))
+/* True when offset .. offset + (length - 1) lies within the nv->regs region (cpu_address, size). */
+static inline NvBool IS_REG_OFFSET(nv_state_t *nv, NvU64 offset, NvU64 length)
+{
+    return ((nv->regs->cpu_address <= offset) && ((nv->regs->cpu_address + (nv->regs->size - 1)) >= (offset + (length - 1))));
+}
+
+/* True when nv->fb is non-NULL and offset .. offset + (length - 1) lies within that region. */
+static inline NvBool IS_FB_OFFSET(nv_state_t *nv, NvU64 offset, NvU64 length)
+{
+    return ((nv->fb) && (nv->fb->cpu_address <= offset) && ((nv->fb->cpu_address + (nv->fb->size - 1)) >= (offset + (length - 1))));
+}
+
+/* True when nv->ud has a nonzero cpu_address and size and contains offset .. offset + (length - 1). */
+static inline NvBool IS_UD_OFFSET(nv_state_t *nv, NvU64 offset, NvU64 length)
+{
+    const NvBool populated = (nv->ud.cpu_address != 0) && (nv->ud.size != 0);
+    return (populated && (nv->ud.cpu_address <= offset) && ((nv->ud.cpu_address + (nv->ud.size - 1)) >= (offset + (length - 1))));
+}
+
+/* True when the IMEM BAR entry has a nonzero cpu_address and size and contains offset .. offset + (length - 1). */
+static inline NvBool IS_IMEM_OFFSET(nv_state_t *nv, NvU64 offset, NvU64 length)
+{
+    const NvBool populated = (nv->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address != 0) &&
+                             (nv->bars[NV_GPU_BAR_INDEX_IMEM].size != 0);
+    return (populated && (nv->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address <= offset) &&
+            ((nv->bars[NV_GPU_BAR_INDEX_IMEM].cpu_address + (nv->bars[NV_GPU_BAR_INDEX_IMEM].size - 1)) >= (offset + (length - 1))));
+}
 
 #define NV_RM_MAX_MSIX_LINES  8
 
diff --git a/src/nvidia/arch/nvalloc/unix/src/osapi.c b/src/nvidia/arch/nvalloc/unix/src/osapi.c
index 13c7171b0..b832852d4 100644
--- a/src/nvidia/arch/nvalloc/unix/src/osapi.c
+++ b/src/nvidia/arch/nvalloc/unix/src/osapi.c
@@ -780,10 +780,8 @@ static NV_STATUS RmAccessRegistry(
             RmStatus = NV_ERR_INVALID_STRING_LENGTH;
             goto done;
         }
-
         // get access to client's parmStr
         RMAPI_PARAM_COPY_INIT(parmStrParamCopy, tmpParmStr, clientParmStrAddress, ParmStrLength, 1);
-        parmStrParamCopy.flags |= RMAPI_PARAM_COPY_FLAGS_ZERO_BUFFER;
         RmStatus = rmapiParamsAcquire(&parmStrParamCopy, NV_TRUE);
         if (RmStatus != NV_OK)
         {
@@ -2026,6 +2024,7 @@ static NV_STATUS RmGetAllocPrivate(
     PMEMORY_DESCRIPTOR pMemDesc;
     NvU32 pageOffset;
     NvU64 pageCount;
+    NvU64 endingOffset;
     RsResourceRef *pResourceRef;
     RmResource *pRmResource;
     void *pMemData;
@@ -2086,8 +2085,9 @@ static NV_STATUS RmGetAllocPrivate(
     if (rmStatus != NV_OK)
         goto done;
 
-    pageCount = ((pageOffset + length) / os_page_size);
-    pageCount += (*pPageIndex + (((pageOffset + length) % os_page_size) ? 1 : 0));
+    endingOffset = pageOffset + length;
+    pageCount = (endingOffset / os_page_size);
+    pageCount += (*pPageIndex + ((endingOffset % os_page_size) ? 1 : 0));
 
     if (pageCount > NV_RM_PAGES_TO_OS_PAGES(pMemDesc->PageCount))
     {
diff --git a/src/nvidia/arch/nvalloc/unix/src/osinit.c b/src/nvidia/arch/nvalloc/unix/src/osinit.c
index 49ede1008..77eac6c14 100644
--- a/src/nvidia/arch/nvalloc/unix/src/osinit.c
+++ b/src/nvidia/arch/nvalloc/unix/src/osinit.c
@@ -362,10 +362,6 @@ osHandleGpuLost
     pmc_boot_0 = NV_PRIV_REG_RD32(nv->regs->map_u, NV_PMC_BOOT_0);
     if (pmc_boot_0 != nvp->pmc_boot_0)
     {
-        RM_API *pRmApi = rmapiGetInterface(RMAPI_GPU_LOCK_INTERNAL);
-        NV2080_CTRL_GPU_GET_OEM_BOARD_INFO_PARAMS *pBoardInfoParams;
-        NV_STATUS status;
-
         //
         // This doesn't support PEX Reset and Recovery yet.
         // This will help to prevent accessing registers of a GPU
@@ -376,24 +372,11 @@ osHandleGpuLost
 
         NV_DEV_PRINTF(NV_DBG_ERRORS, nv, "GPU has fallen off the bus.\n");
 
-        pBoardInfoParams = portMemAllocNonPaged(sizeof(*pBoardInfoParams));
-        if (pBoardInfoParams != NULL)
+        if (pGpu->boardInfo != NULL && pGpu->boardInfo->serialNumber[0] != '\0')
         {
-            portMemSet(pBoardInfoParams, 0, sizeof(*pBoardInfoParams));
-
-            status = pRmApi->Control(pRmApi, nv->rmapi.hClient,
-                                     nv->rmapi.hSubDevice,
-                                     NV2080_CTRL_CMD_GPU_GET_OEM_BOARD_INFO,
-                                     pBoardInfoParams,
-                                     sizeof(*pBoardInfoParams));
-            if (status == NV_OK)
-            {
-                NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
-                              "GPU serial number is %s.\n",
-                              pBoardInfoParams->serialNumber);
-            }
-
-            portMemFree(pBoardInfoParams);
+            NV_DEV_PRINTF(NV_DBG_ERRORS, nv,
+                          "GPU serial number is %s.\n",
+                          pGpu->boardInfo->serialNumber);
         }
 
         gpuSetDisconnectedProperties(pGpu);
diff --git a/src/nvidia/generated/g_gpu_nvoc.h b/src/nvidia/generated/g_gpu_nvoc.h
index 40ecf92fd..41e5e01d6 100644
--- a/src/nvidia/generated/g_gpu_nvoc.h
+++ b/src/nvidia/generated/g_gpu_nvoc.h
@@ -60,6 +60,7 @@ typedef struct GPUATTACHARG GPUATTACHARG;
  * */
 #include "ctrl/ctrl0080/ctrl0080gpu.h" // NV0080_CTRL_GPU_GET_SRIOV_CAPS_PARAMS (form hal)
 #include "ctrl/ctrl2080/ctrl2080internal.h" // NV2080_CTRL_CMD_INTERNAL_MAX_BSPS/NVENCS
+#include "ctrl/ctrl2080/ctrl2080ecc.h"
 #include "ctrl/ctrl2080/ctrl2080nvd.h"
 #include "class/cl2080.h"
 #include "class/cl90cd.h"
diff --git a/src/nvidia/generated/g_kernel_gsp_nvoc.h b/src/nvidia/generated/g_kernel_gsp_nvoc.h
index 0b5d0758a..0657f317c 100644
--- a/src/nvidia/generated/g_kernel_gsp_nvoc.h
+++ b/src/nvidia/generated/g_kernel_gsp_nvoc.h
@@ -301,6 +301,7 @@ struct KernelGsp {
     LIBOS_LOG_DECODE logDecode;
     RM_LIBOS_LOG_MEM rmLibosLogMem[2];
     void *pLogElf;
+    NvBool bInInit;
     MEMORY_DESCRIPTOR *pMemDesc_simAccessBuf;
     SimAccessBuffer *pSimAccessBuf;
     NvP64 pSimAccessBufPriv;
diff --git a/src/nvidia/generated/g_nv_name_released.h b/src/nvidia/generated/g_nv_name_released.h
index 9251e5d5c..dc5ffa1c4 100644
--- a/src/nvidia/generated/g_nv_name_released.h
+++ b/src/nvidia/generated/g_nv_name_released.h
@@ -806,6 +806,8 @@ static const CHIPS_RELEASED sChipsReleased[] = {
     { 0x20B0, 0x1450, 0x10de, "NVIDIA A100-PG509-200" },
     { 0x20B2, 0x1463, 0x10de, "NVIDIA A100-SXM4-80GB" },
     { 0x20B2, 0x147f, 0x10de, "NVIDIA A100-SXM4-80GB" },
+    { 0x20B2, 0x1622, 0x10de, "NVIDIA A100-SXM4-80GB" },
+    { 0x20B2, 0x1623, 0x10de, "NVIDIA A100-SXM4-80GB" },
     { 0x20B3, 0x14a7, 0x10de, "NVIDIA PG506-242" },
     { 0x20B3, 0x14a8, 0x10de, "NVIDIA PG506-243" },
     { 0x20B5, 0x1533, 0x10de, "NVIDIA A100 80GB PCIe" },
@@ -907,6 +909,7 @@ static const CHIPS_RELEASED sChipsReleased[] = {
     { 0x2507, 0x0000, 0x0000, "NVIDIA GeForce RTX 3050" },
     { 0x2508, 0x0000, 0x0000, "NVIDIA GeForce RTX 3050 OEM" },
     { 0x2520, 0x0000, 0x0000, "NVIDIA GeForce RTX 3060 Laptop GPU" },
+    { 0x2521, 0x0000, 0x0000, "NVIDIA GeForce RTX 3060 Laptop GPU" },
     { 0x2523, 0x0000, 0x0000, "NVIDIA GeForce RTX 3050 Ti Laptop GPU" },
     { 0x2531, 0x151d, 0x1028, "NVIDIA RTX A2000" },
     { 0x2531, 0x151d, 0x103c, "NVIDIA RTX A2000" },
diff --git a/src/nvidia/inc/libraries/nvlog/nvlog.h b/src/nvidia/inc/libraries/nvlog/nvlog.h
index 00debf749..c7be92119 100644
--- a/src/nvidia/inc/libraries/nvlog/nvlog.h
+++ b/src/nvidia/inc/libraries/nvlog/nvlog.h
@@ -86,8 +86,9 @@ NV_STATUS nvlogAllocBuffer(NvU32 size, NvU32 flags, NvU32 tag, NVLOG_BUFFER_HAND
  * @brief Deallocate a buffer with the given handle
  *
  * @param[in]   hBuffer     Handle of the buffer to deallocate
+ * @param[in]   bDeallocPreserved Deallocate preserved buffers
  */
-void nvlogDeallocBuffer(NVLOG_BUFFER_HANDLE hBuffer);
+void nvlogDeallocBuffer(NVLOG_BUFFER_HANDLE hBuffer, NvBool bDeallocPreserved);
 
 /**
  * @brief Write to a buffer with the given handle
diff --git a/src/nvidia/kernel/vgpu/nv/rpc.c b/src/nvidia/kernel/vgpu/nv/rpc.c
index 4e4ee5205..c5766045a 100644
--- a/src/nvidia/kernel/vgpu/nv/rpc.c
+++ b/src/nvidia/kernel/vgpu/nv/rpc.c
@@ -265,8 +265,11 @@ static NV_STATUS _issueRpcLarge
     // should not be called in broadcast mode
     NV_ASSERT_OR_RETURN(!gpumgrGetBcEnabledStatus(pGpu), NV_ERR_INVALID_STATE);
 
+    //
     // Copy the initial buffer
-    entryLength = NV_MIN(bufSize, pRpc->maxRpcSize);
+    // Temporary black magic WAR for bug 3594082: reducing the size by 1
+    //
+    entryLength = NV_MIN(bufSize, pRpc->maxRpcSize - 1);
 
     if ((NvU8 *)vgpu_rpc_message_header_v != pBuf8)
         portMemCopy(vgpu_rpc_message_header_v, entryLength, pBuf8, entryLength);
@@ -291,8 +294,11 @@ static NV_STATUS _issueRpcLarge
     remainingSize -= entryLength;
     pBuf8   += entryLength;
 
+    //
     // Copy the remaining buffers
-    entryLength = pRpc->maxRpcSize - sizeof(rpc_message_header_v);
+    // Temporary black magic WAR for bug 3594082: reducing the size by 1
+    //
+    entryLength = pRpc->maxRpcSize - sizeof(rpc_message_header_v) - 1;
     while (remainingSize != 0)
     {
         if (entryLength > remainingSize)
diff --git a/src/nvidia/src/kernel/diagnostics/nvlog.c b/src/nvidia/src/kernel/diagnostics/nvlog.c
index 677d3726e..50638bf72 100644
--- a/src/nvidia/src/kernel/diagnostics/nvlog.c
+++ b/src/nvidia/src/kernel/diagnostics/nvlog.c
@@ -103,7 +103,7 @@ nvlogDestroy()
     tlsShutdown();
     for (i = 0; i < NVLOG_MAX_BUFFERS; i++)
     {
-        nvlogDeallocBuffer(i);
+        nvlogDeallocBuffer(i, NV_TRUE);
     }
     if (NvLogLogger.mainLock != NULL)
     {
@@ -261,7 +261,8 @@ nvlogAllocBuffer
 void
 nvlogDeallocBuffer
 (
-    NVLOG_BUFFER_HANDLE hBuffer
+    NVLOG_BUFFER_HANDLE hBuffer,
+    NvBool bDeallocPreserved
 )
 {
     NVLOG_BUFFER *pBuffer;
@@ -271,6 +272,12 @@ nvlogDeallocBuffer
 
     pBuffer = NvLogLogger.pBuffers[hBuffer];
 
+    if (FLD_TEST_DRF(LOG_BUFFER, _FLAGS, _PRESERVE, _YES, pBuffer->flags) &&
+        !bDeallocPreserved)
+    {
+        return;
+    }
+
     pBuffer->flags = FLD_SET_DRF(LOG_BUFFER, _FLAGS, _DISABLED,
                                  _YES, pBuffer->flags);
 
diff --git a/src/nvidia/src/kernel/gpu/bus/arch/maxwell/kern_bus_gm107.c b/src/nvidia/src/kernel/gpu/bus/arch/maxwell/kern_bus_gm107.c
index 664049cd7..2fab6ca47 100644
--- a/src/nvidia/src/kernel/gpu/bus/arch/maxwell/kern_bus_gm107.c
+++ b/src/nvidia/src/kernel/gpu/bus/arch/maxwell/kern_bus_gm107.c
@@ -2502,15 +2502,19 @@ kbusFlushSingle_GM107
             if (IS_GSP_CLIENT(pGpu))
             {
                 //
-                // on GSP client, we only support PCIE_READ to do flush
-                // a sysmembar flush should call kbusSendSysmembarSingle_HAL explicitly
+                // on GSP client, we should use PCIE_READ to do video memory flush.
+                // A sysmembar flush that touches registers is done through RPC and has
+                // lower efficiency.  For cases where it needs sysmembar, the caller site
+                // should use kbusSendSysmembarSingle_HAL explicitly.
                 //
-                NV_ASSERT_OR_RETURN(0, NV_ERR_INVALID_PATH);
-            }
-            else
-            {
-                return kbusSendSysmembarSingle_HAL(pGpu, pKernelBus);
+                NV_ASSERT(0);
+
+                // This will dump a stack trace to assist debug on certain
+                // platforms.
+                osAssertFailed();
             }
+
+            return kbusSendSysmembarSingle_HAL(pGpu, pKernelBus);
         }
     }
 
diff --git a/src/nvidia/src/kernel/gpu/fifo/kernel_channel.c b/src/nvidia/src/kernel/gpu/fifo/kernel_channel.c
index 8f880eea7..8e0b03ceb 100644
--- a/src/nvidia/src/kernel/gpu/fifo/kernel_channel.c
+++ b/src/nvidia/src/kernel/gpu/fifo/kernel_channel.c
@@ -3750,6 +3750,7 @@ kchannelUpdateWorkSubmitTokenNotifIndex_IMPL
     Memory *pMemory;
     ContextDma *pContextDma;
     NvU32 addressSpace;
+    NvU64 notificationBufferSize;
     NV_STATUS status;
 
     hNotifier = pKernelChannel->hErrorContext;
@@ -3758,6 +3759,8 @@ kchannelUpdateWorkSubmitTokenNotifIndex_IMPL
     NV_CHECK_OR_RETURN(LEVEL_INFO, index != NV_CHANNELGPFIFO_NOTIFICATION_TYPE_ERROR,
                      NV_ERR_INVALID_ARGUMENT);
 
+    notificationBufferSize = (index + 1) * sizeof(NvNotification);
+
     status = deviceGetByInstance(pClient, gpuGetDeviceInstance(pGpu), &pDevice);
     if (status != NV_OK)
         return NV_ERR_INVALID_DEVICE;
@@ -3766,7 +3769,7 @@ kchannelUpdateWorkSubmitTokenNotifIndex_IMPL
     {
         addressSpace = memdescGetAddressSpace(pMemory->pMemDesc);
 
-        NV_CHECK_OR_RETURN(LEVEL_INFO, pMemory->Length >= ((index + 1) * sizeof(NvNotification)),
+        NV_CHECK_OR_RETURN(LEVEL_INFO, pMemory->Length >= notificationBufferSize,
                          NV_ERR_OUT_OF_RANGE);
         switch (addressSpace)
         {
@@ -3784,7 +3787,7 @@ kchannelUpdateWorkSubmitTokenNotifIndex_IMPL
                                          &pDmaMappingInfo),
                     NV_ERR_GENERIC);
 
-                NV_CHECK_OR_RETURN(LEVEL_INFO, pDmaMappingInfo->pMemDesc->Size >= ((index + 1) * sizeof(NvNotification)),
+                NV_CHECK_OR_RETURN(LEVEL_INFO, pDmaMappingInfo->pMemDesc->Size >= notificationBufferSize,
                                  NV_ERR_OUT_OF_RANGE);
                 break;
             }
@@ -3799,7 +3802,7 @@ kchannelUpdateWorkSubmitTokenNotifIndex_IMPL
     }
     else if (NV_OK == ctxdmaGetByHandle(pClient, hNotifier, &pContextDma))
     {
-        NV_CHECK_OR_RETURN(LEVEL_INFO, pContextDma->Limit >= (((index + 1) * sizeof(NvNotification)) - 1),
+        NV_CHECK_OR_RETURN(LEVEL_INFO, pContextDma->Limit >= (notificationBufferSize - 1),
                          NV_ERR_OUT_OF_RANGE);
     }
     else
diff --git a/src/nvidia/src/kernel/gpu/gpu.c b/src/nvidia/src/kernel/gpu/gpu.c
index a8aee3676..275fcb0c5 100644
--- a/src/nvidia/src/kernel/gpu/gpu.c
+++ b/src/nvidia/src/kernel/gpu/gpu.c
@@ -1923,26 +1923,6 @@ gpuStatePreInit_IMPL
         }
     }
 
-    pGpu->boardInfo = portMemAllocNonPaged(sizeof(*pGpu->boardInfo));
-    if (pGpu->boardInfo)
-    {
-        // To avoid potential race of xid reporting with the control, zero it out
-        portMemSet(pGpu->boardInfo, '\0', sizeof(*pGpu->boardInfo));
-
-        RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu);
-
-        if (pRmApi->Control(pRmApi,
-                        pGpu->hInternalClient,
-                        pGpu->hInternalSubdevice,
-                        NV2080_CTRL_CMD_GPU_GET_OEM_BOARD_INFO,
-                        pGpu->boardInfo,
-                        sizeof(*pGpu->boardInfo)) != NV_OK)
-        {
-            portMemFree(pGpu->boardInfo);
-            pGpu->boardInfo = NULL;
-        }
-    }
-
     return rmStatus;
 }
 
@@ -2291,6 +2271,26 @@ gpuStatePostLoad
             goto gpuStatePostLoad_exit;
     }
 
+    pGpu->boardInfo = portMemAllocNonPaged(sizeof(*pGpu->boardInfo));
+    if (pGpu->boardInfo)
+    {
+        // To avoid potential race of xid reporting with the control, zero it out
+        portMemSet(pGpu->boardInfo, '\0', sizeof(*pGpu->boardInfo));
+
+        RM_API *pRmApi = GPU_GET_PHYSICAL_RMAPI(pGpu);
+
+        if(pRmApi->Control(pRmApi,
+                           pGpu->hInternalClient,
+                           pGpu->hInternalSubdevice,
+                           NV2080_CTRL_CMD_GPU_GET_OEM_BOARD_INFO,
+                           pGpu->boardInfo,
+                           sizeof(*pGpu->boardInfo)) != NV_OK)
+        {
+            portMemFree(pGpu->boardInfo);
+            pGpu->boardInfo = NULL;
+        }
+    }
+
 gpuStatePostLoad_exit:
     return rmStatus;
 }
@@ -2326,6 +2326,9 @@ gpuStatePreUnload
     NvU32               curEngDescIdx;
     NV_STATUS           rmStatus = NV_OK;
 
+    portMemFree(pGpu->boardInfo);
+    pGpu->boardInfo = NULL;
+
     engDescriptorList = gpuGetUnloadEngineDescriptors(pGpu);
     numEngDescriptors = gpuGetNumEngDescriptors(pGpu);
 
@@ -2648,9 +2651,6 @@ gpuStateDestroy_IMPL
     _gpuFreeInternalObjects(pGpu);
     gpuDestroyGenericKernelFalconList(pGpu);
 
-    portMemFree(pGpu->boardInfo);
-    pGpu->boardInfo = NULL;
-
     portMemFree(pGpu->gspSupportedEngines);
     pGpu->gspSupportedEngines = NULL;
 
diff --git a/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c b/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c
index 6c1d976c6..53961a1be 100644
--- a/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c
+++ b/src/nvidia/src/kernel/gpu/gsp/kernel_gsp.c
@@ -1047,7 +1047,7 @@ _kgspInitLibosLoggingStructures
 
         //
         // Setup logging memory for each task.
-        // Use MEMDESC_FLAGS_CPU_ONLY -- to early to call memdescMapIommu.
+        // Use MEMDESC_FLAGS_CPU_ONLY -- too early to call memdescMapIommu.
         //
         NV_ASSERT_OK_OR_GOTO(nvStatus,
             memdescCreate(&pLog->pTaskLogDescriptor,
@@ -1258,6 +1258,8 @@ kgspInitRm_IMPL
         return NV_ERR_INVALID_ARGUMENT;
     }
 
+    pKernelGsp->bInInit = NV_TRUE;
+
     // Need to hold the GPU instance lock in order to write to the RPC queue
     NV_ASSERT_OK_OR_GOTO(status,
         rmGpuGroupLockAcquire(pGpu->gpuInstance, GPU_LOCK_GRP_SUBDEVICE,
@@ -1278,7 +1280,7 @@ kgspInitRm_IMPL
     {
         KernelGspVbiosImg *pVbiosImg = NULL;
 
-		// Try and extract a VBIOS image.
+        // Try and extract a VBIOS image.
         status = kgspExtractVbiosFromRom_HAL(pGpu, pKernelGsp, &pVbiosImg);
 
         if (status == NV_OK)
@@ -1403,6 +1405,14 @@ kgspInitRm_IMPL
     NV_CHECK_OK_OR_GOTO(status, LEVEL_ERROR, kgspStartLogPolling(pGpu, pKernelGsp), done);
 
 done:
+    pKernelGsp->bInInit = NV_FALSE;
+
+    if (status != NV_OK)
+    {
+        // Preserve any captured gsp-rm logs
+        libosPreserveLogs(&pKernelGsp->logDecode);
+    }
+
     if (gpusLockedMask != 0)
     {
         rmGpuGroupLockRelease(gpusLockedMask, GPUS_LOCK_FLAGS_NONE);
@@ -1520,7 +1530,7 @@ kgspDumpGspLogs_IMPL
     NvBool bSyncNvLog
 )
 {
-    if (pKernelGsp->pLogElf || bSyncNvLog)
+    if (pKernelGsp->bInInit || pKernelGsp->pLogElf || bSyncNvLog)
         libosExtractLogs(&pKernelGsp->logDecode, bSyncNvLog);
 }
 
diff --git a/src/nvidia/src/kernel/gpu/mem_mgr/mem_mgr.c b/src/nvidia/src/kernel/gpu/mem_mgr/mem_mgr.c
index 005888560..a2c742b58 100644
--- a/src/nvidia/src/kernel/gpu/mem_mgr/mem_mgr.c
+++ b/src/nvidia/src/kernel/gpu/mem_mgr/mem_mgr.c
@@ -1959,6 +1959,7 @@ memmgrFillComprInfo_IMPL
 {
     const MEMORY_SYSTEM_STATIC_CONFIG *pMemorySystemConfig =
         kmemsysGetStaticConfig(pGpu, GPU_GET_KERNEL_MEMORY_SYSTEM(pGpu));
+    NvU32 size;
 
     portMemSet(pComprInfo, 0, sizeof(*pComprInfo));
 
@@ -1969,10 +1970,12 @@ memmgrFillComprInfo_IMPL
 
     NV_ASSERT(compTagStartOffset != ~(NvU32)0);
 
+    size = pageSize * pageCount;
+
     pComprInfo->compPageShift = pMemorySystemConfig->comprPageShift;
     pComprInfo->compTagLineMin = compTagStartOffset;
     pComprInfo->compPageIndexLo = (NvU32)(surfOffset >> pComprInfo->compPageShift);
-    pComprInfo->compPageIndexHi = (NvU32)((surfOffset + pageSize * pageCount - 1) >> pComprInfo->compPageShift);
+    pComprInfo->compPageIndexHi = (NvU32)((surfOffset + size - 1) >> pComprInfo->compPageShift);
     pComprInfo->compTagLineMultiplier = 1;
 
     return NV_OK;
diff --git a/src/nvidia/src/kernel/gpu_mgr/gpu_mgr.c b/src/nvidia/src/kernel/gpu_mgr/gpu_mgr.c
index 5c342d9ce..d77843220 100644
--- a/src/nvidia/src/kernel/gpu_mgr/gpu_mgr.c
+++ b/src/nvidia/src/kernel/gpu_mgr/gpu_mgr.c
@@ -751,6 +751,8 @@ NvBool gpumgrIsDeviceRmFirmwareCapable
         0x2236, // A10   SKU215     Pris-24
         0x2237, // A10G  SKU215     Pris-24
         0x25B6, // A16
+        0x20F5, // A800-80
+        0x20F6, // A800-40
     };
     NvU32 count = NV_ARRAY_ELEMENTS(defaultGspRmGpus);
     NvU32 i;
diff --git a/version.mk b/version.mk
index 0b27afc62..f3d119595 100644
--- a/version.mk
+++ b/version.mk
@@ -1,4 +1,4 @@
-NVIDIA_VERSION = 515.65.01
+NVIDIA_VERSION = 515.76
 
 # This file.
 VERSION_MK_FILE := $(lastword $(MAKEFILE_LIST))