[compat] [server] [client] [test] Global RT DIV: Chunking Support #1385

Open · wants to merge 17 commits into base: main
@@ -758,7 +758,7 @@ protected Optional<PubSubMessage<K, ChangeEvent<V>, VeniceChangeCoordinate>> con
if (messageType.equals(MessageType.PUT)) {
Put put = (Put) message.getValue().payloadUnion;
// Select appropriate deserializers
Lazy deserializerProvider;
Lazy<RecordDeserializer> deserializerProvider;
int readerSchemaId;
if (pubSubTopicPartition.getPubSubTopic().isVersionTopic()) {
deserializerProvider = Lazy.of(() -> storeDeserializerCache.getDeserializer(put.schemaId, put.schemaId));
@@ -783,9 +783,9 @@ protected Optional<PubSubMessage<K, ChangeEvent<V>, VeniceChangeCoordinate>> con
keyBytes,
put.getPutValue(),
message.getOffset(),
deserializerProvider,
readerSchemaId,
compressor);
compressor,
(valueBytes) -> deserializerProvider.get().deserialize(valueBytes));
if (assembledObject == null) {
// bufferAndAssembleRecord may have only buffered records and not returned anything yet because
// it's waiting for more input. In this case, just return an empty optional for now.
@@ -7,6 +7,8 @@
import static com.linkedin.davinci.kafka.consumer.LeaderFollowerStateType.LEADER;
import static com.linkedin.davinci.kafka.consumer.LeaderFollowerStateType.PAUSE_TRANSITION_FROM_STANDBY_TO_LEADER;
import static com.linkedin.davinci.kafka.consumer.LeaderFollowerStateType.STANDBY;
import static com.linkedin.davinci.validation.PartitionTracker.TopicType.REALTIME_TOPIC_TYPE;
import static com.linkedin.davinci.validation.PartitionTracker.TopicType.VERSION_TOPIC_TYPE;
import static com.linkedin.venice.kafka.protocol.enums.ControlMessageType.END_OF_PUSH;
import static com.linkedin.venice.kafka.protocol.enums.ControlMessageType.START_OF_SEGMENT;
import static com.linkedin.venice.pubsub.api.PubSubMessageHeaders.VENICE_LEADER_COMPLETION_STATE_HEADER;
@@ -33,6 +35,7 @@
import com.linkedin.davinci.store.view.VeniceViewWriter;
import com.linkedin.davinci.validation.KafkaDataIntegrityValidator;
import com.linkedin.davinci.validation.PartitionTracker;
import com.linkedin.davinci.validation.PartitionTracker.TopicType;
import com.linkedin.venice.common.VeniceSystemStoreUtils;
import com.linkedin.venice.compression.CompressionStrategy;
import com.linkedin.venice.exceptions.VeniceException;
@@ -1479,7 +1482,7 @@ protected void updateOffsetMetadataInOffsetRecord(PartitionConsumptionState part
upstreamTopic = versionTopic;
}
if (upstreamTopic.isRealTime()) {
offsetRecord.resetUpstreamOffsetMap(partitionConsumptionState.getLatestProcessedUpstreamRTOffsetMap());
offsetRecord.mergeUpstreamOffsets(partitionConsumptionState.getLatestProcessedUpstreamRTOffsetMap());
Contributor @sixpluszero commented on Dec 16, 2024:
Why did you make this change? Personally, I think it makes the offset map we are tracking less explicit.

Contributor (Author) replied:
Because I feel that reset() is not the right word for it. It's not clearing the state or replacing the map entirely. It's updating the existing map with the values of the new map and retaining the values missing from the new map.

I can rename it to updateUpstreamOffsets().
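
For clarity, here is a minimal sketch of the merge semantics described above, using a plain java.util.Map instead of the actual OffsetRecord internals. The class, field, and method bodies are illustrative assumptions, not the PR's implementation; they only contrast "merge" with a true "reset":

import java.util.HashMap;
import java.util.Map;

// Illustrative sketch only (not the real OffsetRecord code).
class UpstreamOffsetTracker {
  private final Map<String, Long> upstreamOffsetMap = new HashMap<>();

  /** Overwrites matching keys from {@code incoming} and retains entries it does not mention. */
  void mergeUpstreamOffsets(Map<String, Long> incoming) {
    upstreamOffsetMap.putAll(incoming);
  }

  /** A true reset: the prior state is dropped entirely and replaced by {@code replacement}. */
  void resetUpstreamOffsetMap(Map<String, Long> replacement) {
    upstreamOffsetMap.clear();
    upstreamOffsetMap.putAll(replacement);
  }
}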

} else {
offsetRecord.setCheckpointUpstreamVersionTopicOffset(
partitionConsumptionState.getLatestProcessedUpstreamVersionTopicOffset());
@@ -1619,7 +1622,6 @@ protected static void checkAndHandleUpstreamOffsetRewind(
int actualSchemaId = ByteUtils.readInt(actualValue, 0);
Put put = (Put) envelope.payloadUnion;
if (actualSchemaId == put.schemaId) {

if (put.putValue.equals(
ByteBuffer.wrap(
actualValue,
@@ -1904,6 +1906,10 @@ protected boolean shouldProcessRecord(PubSubMessage<KafkaKey, KafkaMessageEnvelo
}
}
}
// Global RT DIV messages should be completely ignored when leader is consuming from remote version topic
if (record.getKey().isGlobalRtDiv()) {
Contributor:
Is it possible to have Global RT DIV messages in the remote version topic?
There are a few cases for remote version topic consumption:

  1. NR for the venice push job.
  2. Streaming Reprocessing job.
  3. Data recovery for batch store.
None of these use cases should, in theory, encounter Global RT DIV messages.
Today there is no support for hybrid store data recovery; it is done via re-push.
So can you explain the scenarios you have in mind?

return false;
}
}

if (!Utils.resolveLeaderTopicFromPubSubTopic(pubSubTopicRepository, record.getTopicPartition().getPubSubTopic())
@@ -2281,18 +2287,17 @@ protected Iterable<PubSubMessage<KafkaKey, KafkaMessageEnvelope, Long>> validate
Iterable<PubSubMessage<KafkaKey, KafkaMessageEnvelope, Long>> records,
String kafkaUrl,
PubSubTopicPartition topicPartition) {
PartitionConsumptionState partitionConsumptionState =
partitionConsumptionStateMap.get(topicPartition.getPartitionNumber());
if (partitionConsumptionState == null) {
final PartitionConsumptionState pcs = partitionConsumptionStateMap.get(topicPartition.getPartitionNumber());
if (pcs == null) {
// The partition is likely unsubscribed, will skip these messages.
LOGGER.warn(
"No partition consumption state for store version: {}, partition:{}, will filter out all the messages",
"No partition consumption state for store version: {}, partition: {}, will filter out all the messages",
kafkaVersionTopic,
topicPartition.getPartitionNumber());
return Collections.emptyList();
}
boolean isEndOfPushReceived = partitionConsumptionState.isEndOfPushReceived();
if (!shouldProduceToVersionTopic(partitionConsumptionState)) {
boolean isEndOfPushReceived = pcs.isEndOfPushReceived();
if (!shouldProduceToVersionTopic(pcs)) {
return records;
}
/**
@@ -2302,30 +2307,15 @@ protected Iterable<PubSubMessage<KafkaKey, KafkaMessageEnvelope, Long>> validate
Iterator<PubSubMessage<KafkaKey, KafkaMessageEnvelope, Long>> iter = records.iterator();
while (iter.hasNext()) {
PubSubMessage<KafkaKey, KafkaMessageEnvelope, Long> record = iter.next();
boolean isRealTimeMsg = record.getTopicPartition().getPubSubTopic().isRealTime();
boolean isRealTimeTopic = record.getTopicPartition().getPubSubTopic().isRealTime();
try {
/**
* TODO: An improvement can be made to fail all future versions for fatal DIV exceptions after EOP.
*/
if (!isGlobalRtDivEnabled) {
validateMessage(
PartitionTracker.VERSION_TOPIC,
this.kafkaDataIntegrityValidatorForLeaders,
record,
isEndOfPushReceived,
partitionConsumptionState);
} else {
validateMessage(
PartitionTracker.TopicType.of(
isRealTimeMsg
? PartitionTracker.TopicType.REALTIME_TOPIC_TYPE
: PartitionTracker.TopicType.VERSION_TOPIC_TYPE,
kafkaUrl),
this.kafkaDataIntegrityValidatorForLeaders,
record,
isEndOfPushReceived,
partitionConsumptionState);
}
final TopicType topicType = (isGlobalRtDivEnabled)
? TopicType.of(isRealTimeTopic ? REALTIME_TOPIC_TYPE : VERSION_TOPIC_TYPE, kafkaUrl)
: PartitionTracker.VERSION_TOPIC;
validateMessage(topicType, kafkaDataIntegrityValidatorForLeaders, record, isEndOfPushReceived, pcs);
versionedDIVStats.recordSuccessMsg(storeName, versionNumber);
} catch (FatalDataValidationException e) {
if (!isEndOfPushReceived) {
@@ -2342,7 +2332,7 @@ protected Iterable<PubSubMessage<KafkaKey, KafkaMessageEnvelope, Long>> validate
"Skipping a duplicate record from: {} offset: {} for replica: {}",
record.getTopicPartition(),
record.getOffset(),
partitionConsumptionState.getReplicaId());
pcs.getReplicaId());
iter.remove();
}
}
@@ -12,6 +12,7 @@
import static com.linkedin.venice.ConfigKeys.KAFKA_BOOTSTRAP_SERVERS;
import static com.linkedin.venice.LogMessages.KILLED_JOB_MESSAGE;
import static com.linkedin.venice.kafka.protocol.enums.ControlMessageType.START_OF_SEGMENT;
import static com.linkedin.venice.serialization.avro.AvroProtocolDefinition.GLOBAL_RT_DIV_STATE;
import static com.linkedin.venice.utils.Utils.FATAL_DATA_VALIDATION_ERROR;
import static com.linkedin.venice.utils.Utils.getReplicaId;
import static java.util.Comparator.comparingInt;
@@ -48,6 +49,7 @@
import com.linkedin.venice.common.VeniceSystemStoreType;
import com.linkedin.venice.common.VeniceSystemStoreUtils;
import com.linkedin.venice.compression.CompressionStrategy;
import com.linkedin.venice.compression.NoopCompressor;
import com.linkedin.venice.compression.VeniceCompressor;
import com.linkedin.venice.exceptions.DiskLimitExhaustedException;
import com.linkedin.venice.exceptions.MemoryLimitExhaustedException;
@@ -184,6 +186,8 @@ public abstract class StoreIngestionTask implements Runnable, Closeable {
private static final int MAX_KILL_CHECKING_ATTEMPTS = 10;
private static final int CHUNK_MANIFEST_SCHEMA_ID =
AvroProtocolDefinition.CHUNKED_VALUE_MANIFEST.getCurrentProtocolVersion();
private static final int GLOBAL_RT_DIV_STATE_SCHEMA_ID =
AvroProtocolDefinition.GLOBAL_RT_DIV_STATE.getCurrentProtocolVersion();

protected static final RedundantExceptionFilter REDUNDANT_LOGGING_FILTER =
RedundantExceptionFilter.getRedundantExceptionFilter();
@@ -317,6 +321,7 @@ public abstract class StoreIngestionTask implements Runnable, Closeable {
protected final IngestionNotificationDispatcher ingestionNotificationDispatcher;

protected final ChunkAssembler chunkAssembler;
protected final ChunkAssembler divChunkAssembler;
private final Optional<ObjectCacheBackend> cacheBackend;
private DaVinciRecordTransformer recordTransformer;

@@ -467,6 +472,8 @@ public StoreIngestionTask(
new IngestionNotificationDispatcher(notifiers, kafkaVersionTopic, isCurrentVersion);
this.missingSOPCheckExecutor.execute(() -> waitForStateVersion(kafkaVersionTopic));
this.chunkAssembler = new ChunkAssembler(storeName);
this.divChunkAssembler =
builder.getDivChunkAssembler() != null ? builder.getDivChunkAssembler() : new ChunkAssembler(storeName);
Contributor:
It seems ChunkAssembler is store-specific, so can we really use the instance from the builder, which typically only contains sharable objects?

this.cacheBackend = cacheBackend;

if (recordTransformerFunction != null) {
@@ -1145,6 +1152,12 @@ private int handleSingleMessage(
record.getTopicPartition().getPartitionNumber(),
partitionConsumptionStateMap.get(topicPartition.getPartitionNumber()));
}
} else if (record.getKey().isGlobalRtDiv()) {
// TODO: This is a placeholder for the actual implementation.
if (isGlobalRtDivEnabled) {
processGlobalRtDivMessage(record); // This is a global realtime topic data integrity validator snapshot
}
return 0;
}

// This function may modify the original record in KME and it is unsafe to use the payload from KME directly after
@@ -1189,6 +1202,28 @@ private int handleSingleMessage(
return record.getPayloadSize();
}

void processGlobalRtDivMessage(PubSubMessage<KafkaKey, KafkaMessageEnvelope, Long> record) {
KafkaKey key = record.getKey();
KafkaMessageEnvelope value = record.getValue();
Put put = (Put) value.getPayloadUnion();

Object assembledObject = divChunkAssembler.bufferAndAssembleRecord(
Contributor:
I saw this function uses the in-memory storage engine as temporary storage, which will increase memory usage. That differs from the regular data chunk handling, which minimizes memory usage, and I think that is also what we agreed on during the design review.
cc @lluwm

record.getTopicPartition(),
put.getSchemaId(),
key.getKey(),
put.getPutValue(),
record.getOffset(),
put.getSchemaId(),
new NoopCompressor(),
Contributor:
Shall we always use a specific compression algorithm for Global RT DIV messages?
We know the DIV can be large at times; I wonder whether we could always enable gzip compression to reduce Kafka usage.

(valueBytes) -> GLOBAL_RT_DIV_STATE.getSerializer()
.deserialize(ByteUtils.extractByteArray(valueBytes), GLOBAL_RT_DIV_STATE_SCHEMA_ID));
Contributor:
Shall we update InternalAvroSpecificSerializer to support a ByteBuffer as the input?


if (assembledObject == null) {
return; // the message value only contained one data chunk, so the Global RT DIV cannot yet be fully assembled
}
// TODO: We will add the code to process Global RT DIV message later in here.
}
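
Regarding the ByteBuffer question above: a small illustrative sketch (not part of this PR, and the helper name is hypothetical) of how the copy performed by ByteUtils.extractByteArray() could be skipped when the buffer is fully backed by its array, while keeping the serializer's existing byte[]-based deserialize() call exactly as it appears in this diff:

// Hypothetical helper in StoreIngestionTask; illustrative only.
private Object deserializeGlobalRtDivState(ByteBuffer valueBytes) {
  final byte[] bytes;
  if (valueBytes.hasArray() && valueBytes.arrayOffset() == 0 && valueBytes.position() == 0
      && valueBytes.remaining() == valueBytes.array().length) {
    // Fast path: the buffer spans its entire backing array, so no copy is needed.
    bytes = valueBytes.array();
  } else {
    // Fallback: copy just the readable bytes, as the current code does.
    bytes = ByteUtils.extractByteArray(valueBytes);
  }
  return GLOBAL_RT_DIV_STATE.getSerializer().deserialize(bytes, GLOBAL_RT_DIV_STATE_SCHEMA_ID);
}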

/**
* This function is in charge of producing the consumer records to the writer buffers maintained by {@link StoreBufferService}.
*
@@ -2397,6 +2432,12 @@ protected boolean shouldProcessRecord(PubSubMessage<KafkaKey, KafkaMessageEnvelo
return false;
}

// Global RT DIV messages should only be in version topics, not realtime topics. Skip it and log a warning.
if (record.getKey().isGlobalRtDiv() && record.getTopic().isRealTime()) {
Contributor:
In what scenarios would the RT topic contain Global RT DIV messages?

LOGGER.warn("Skipping Global RT DIV message from realtime topic partition: {}", record.getTopicPartition());
return false;
}

if (partitionConsumptionState.isEndOfPushReceived() && partitionConsumptionState.isBatchOnly()) {
KafkaKey key = record.getKey();
KafkaMessageEnvelope value = record.getValue();
@@ -3786,7 +3827,7 @@ private int processKafkaDataMessage(
private void waitReadyToProcessRecord(PubSubMessage<KafkaKey, KafkaMessageEnvelope, Long> record)
throws InterruptedException {
KafkaMessageEnvelope kafkaValue = record.getValue();
if (record.getKey().isControlMessage() || kafkaValue == null) {
if (record.getKey().isControlMessage() || record.getKey().isGlobalRtDiv() || kafkaValue == null) {
Contributor:
Can Global RT DIV be one type of Control Message?

return;
}

@@ -4561,4 +4602,8 @@ void setVersionRole(PartitionReplicaIngestionContext.VersionRole versionRole) {
protected boolean isDaVinciClient() {
return isDaVinciClient;
}

ChunkAssembler getDivChunkAssembler() {
return this.divChunkAssembler;
}
}
@@ -14,6 +14,7 @@
import com.linkedin.davinci.storage.StorageService;
import com.linkedin.davinci.store.cache.backend.ObjectCacheBackend;
import com.linkedin.davinci.store.view.VeniceViewWriterFactory;
import com.linkedin.davinci.utils.ChunkAssembler;
import com.linkedin.venice.kafka.protocol.state.PartitionState;
import com.linkedin.venice.meta.ReadOnlySchemaRepository;
import com.linkedin.venice.meta.ReadOnlyStoreRepository;
@@ -126,6 +127,7 @@ public static class Builder {
private PubSubTopicRepository pubSubTopicRepository;
private Runnable runnableForKillIngestionTasksForNonCurrentVersions;
private ExecutorService aaWCWorkLoadProcessingThreadPool;
private ChunkAssembler divChunkAssembler;

private interface Setter {
void apply();
@@ -336,5 +338,13 @@ public Builder setAAWCWorkLoadProcessingThreadPool(ExecutorService executorServi
public ExecutorService getAAWCWorkLoadProcessingThreadPool() {
return this.aaWCWorkLoadProcessingThreadPool;
}

public Builder setDivChunkAssembler(ChunkAssembler divChunkAssembler) {
Contributor:
ChunkAssembler has a store name field, so in theory it is not sharable among store ingestion tasks for different stores.
Can you explain your thinking here?

return set(() -> this.divChunkAssembler = divChunkAssembler);
}

public ChunkAssembler getDivChunkAssembler() {
return divChunkAssembler;
}
}
}