From 58f702fe5be8dd543905e96c7eea7bd949313bf1 Mon Sep 17 00:00:00 2001 From: Iraklis Psaroudakis Date: Mon, 8 Dec 2025 19:13:43 +0200 Subject: [PATCH] Move action and commit related stuff to stateless Migrating: * PrimaryTermAndGeneration * ClosedShardService * BlobFile * BatchedCompoundCommit * Blob ranges * ReplicatedContent * VirtualBatchedCompoundCommit * Some action requests and responses * Shard local commit trackers * StaleCompoundCommit * Utils and related functionality and tests. Relates ES-13590 --- .../stateless/src/main/java/module-info.java | 11 +- .../xpack/stateless/StatelessPlugin.java | 178 +++ .../action/FetchShardCommitsInUseAction.java | 215 +++ ...tualBatchedCompoundCommitChunkRequest.java | 119 ++ ...ualBatchedCompoundCommitChunkResponse.java | 58 + .../action/NewCommitNotificationRequest.java | 220 +++ .../action/NewCommitNotificationResponse.java | 69 + .../TransportEnsureDocsSearchableAction.java | 173 +++ .../cache/Lucene90CompoundEntriesReader.java | 66 + .../StatelessSharedBlobCacheService.java | 79 ++ .../AbstractBatchedCompoundCommit.java | 16 + .../commits/BCCHeaderReadExecutor.java | 61 + .../commits/BatchedCompoundCommit.java | 228 +++ .../xpack/stateless/commits/BlobFile.java | 39 + .../stateless/commits/BlobFileRanges.java | 170 +++ .../xpack/stateless/commits/BlobLocation.java | 169 +++ .../stateless/commits/ClosedShardService.java | 65 + .../stateless/commits/CommitBCCResolver.java | 24 + .../IndexEngineLocalReaderListener.java | 21 + .../InternalFilesReplicatedRanges.java | 135 ++ .../stateless/commits/ReplicatedContent.java | 256 ++++ .../commits/ShardLocalCommitsRefs.java | 94 ++ .../commits/ShardLocalCommitsTracker.java | 20 + .../commits/ShardLocalReadersTracker.java | 65 + .../commits/SoftDeleteIndexCommit.java | 59 + .../commits/StaleCompoundCommit.java | 26 + .../commits/StatelessCompoundCommit.java | 815 +++++++++++ .../commits/VirtualBatchedCompoundCommit.java | 1222 +++++++++++++++++ .../engine/NewCommitNotification.java | 52 + .../engine/PrimaryTermAndGeneration.java | 63 + .../xpack/stateless/lucene/FileCacheKey.java | 17 + .../stateless/lucene/StatelessCommitRef.java | 132 ++ .../IndexingShardRecoveryComparator.java | 108 ++ .../utils/TransferableCloseables.java | 46 + .../xpack/stateless/utils/WaitForVersion.java | 54 + .../NewCommitNotificationRequestTests.java | 287 ++++ ...otificationResponseSerializationTests.java | 42 + .../NewCommitNotificationResponseTests.java | 72 + .../Lucene90CompoundEntriesReaderTests.java | 63 + .../commits/BlobFileRangesTestUtils.java | 23 + .../commits/BlobLocationTestUtils.java | 29 + .../stateless/commits/BlobLocationTests.java | 70 + .../InternalFilesReplicatedRangesTests.java | 49 + .../commits/ReplicatedContentTests.java | 287 ++++ ...telessCompoundCommitInternalFileTests.java | 223 +++ .../StatelessCompoundCommitTestUtils.java | 128 ++ .../commits/StatelessCompoundCommitTests.java | 507 +++++++ ...VirtualBatchedCompoundCommitTestUtils.java | 32 + .../engine/PrimaryTermAndGenerationTests.java | 69 + 49 files changed, 7025 insertions(+), 1 deletion(-) create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/action/FetchShardCommitsInUseAction.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/action/GetVirtualBatchedCompoundCommitChunkRequest.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/action/GetVirtualBatchedCompoundCommitChunkResponse.java create mode 100644 
x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/action/NewCommitNotificationRequest.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/action/NewCommitNotificationResponse.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/action/TransportEnsureDocsSearchableAction.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/cache/Lucene90CompoundEntriesReader.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/cache/StatelessSharedBlobCacheService.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/AbstractBatchedCompoundCommit.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/BCCHeaderReadExecutor.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/BatchedCompoundCommit.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/BlobFile.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/BlobFileRanges.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/BlobLocation.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/ClosedShardService.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/CommitBCCResolver.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/IndexEngineLocalReaderListener.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/InternalFilesReplicatedRanges.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/ReplicatedContent.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/ShardLocalCommitsRefs.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/ShardLocalCommitsTracker.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/ShardLocalReadersTracker.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/SoftDeleteIndexCommit.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/StaleCompoundCommit.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/StatelessCompoundCommit.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/VirtualBatchedCompoundCommit.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/engine/NewCommitNotification.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/engine/PrimaryTermAndGeneration.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/lucene/FileCacheKey.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/lucene/StatelessCommitRef.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/utils/IndexingShardRecoveryComparator.java create mode 
100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/utils/TransferableCloseables.java create mode 100644 x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/utils/WaitForVersion.java create mode 100644 x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/action/NewCommitNotificationRequestTests.java create mode 100644 x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/action/NewCommitNotificationResponseSerializationTests.java create mode 100644 x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/action/NewCommitNotificationResponseTests.java create mode 100644 x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/cache/Lucene90CompoundEntriesReaderTests.java create mode 100644 x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/BlobFileRangesTestUtils.java create mode 100644 x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/BlobLocationTestUtils.java create mode 100644 x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/BlobLocationTests.java create mode 100644 x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/InternalFilesReplicatedRangesTests.java create mode 100644 x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/ReplicatedContentTests.java create mode 100644 x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/StatelessCompoundCommitInternalFileTests.java create mode 100644 x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/StatelessCompoundCommitTestUtils.java create mode 100644 x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/StatelessCompoundCommitTests.java create mode 100644 x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/VirtualBatchedCompoundCommitTestUtils.java create mode 100644 x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/engine/PrimaryTermAndGenerationTests.java diff --git a/x-pack/plugin/stateless/src/main/java/module-info.java b/x-pack/plugin/stateless/src/main/java/module-info.java index 4908f3a0efe01..683bfcf6dcd03 100644 --- a/x-pack/plugin/stateless/src/main/java/module-info.java +++ b/x-pack/plugin/stateless/src/main/java/module-info.java @@ -7,10 +7,19 @@ module org.elasticsearch.xpack.stateless { requires org.elasticsearch.base; + requires org.elasticsearch.blobcache; + requires org.elasticsearch.logging; requires org.elasticsearch.server; requires org.elasticsearch.xcore; + requires org.elasticsearch.xcontent; requires org.apache.logging.log4j; - requires org.elasticsearch.logging; + requires org.apache.lucene.core; exports org.elasticsearch.xpack.stateless; + exports org.elasticsearch.xpack.stateless.action; + exports org.elasticsearch.xpack.stateless.cache; + exports org.elasticsearch.xpack.stateless.commits; + exports org.elasticsearch.xpack.stateless.engine; + exports org.elasticsearch.xpack.stateless.lucene; + exports org.elasticsearch.xpack.stateless.utils; } diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/StatelessPlugin.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/StatelessPlugin.java index b9df2a6ff5b1a..c25e1a81ed906 100644 --- a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/StatelessPlugin.java +++ 
b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/StatelessPlugin.java @@ -8,8 +8,11 @@ import org.elasticsearch.cluster.node.DiscoveryNode; import org.elasticsearch.cluster.node.DiscoveryNodeRole; +import org.elasticsearch.cluster.routing.ShardRouting; import org.elasticsearch.common.settings.Setting; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.util.concurrent.EsExecutors; +import org.elasticsearch.core.TimeValue; import org.elasticsearch.license.License; import org.elasticsearch.license.LicensedFeature; import org.elasticsearch.license.XPackLicenseState; @@ -19,6 +22,9 @@ import org.elasticsearch.plugins.ClusterCoordinationPlugin; import org.elasticsearch.plugins.ExtensiblePlugin; import org.elasticsearch.plugins.Plugin; +import org.elasticsearch.repositories.blobstore.BlobStoreRepository; +import org.elasticsearch.threadpool.ExecutorBuilder; +import org.elasticsearch.threadpool.ScalingExecutorBuilder; import org.elasticsearch.xpack.core.XPackPlugin; import java.io.IOException; @@ -54,7 +60,168 @@ public class StatelessPlugin extends Plugin implements ClusterCoordinationPlugin public static final String NAME = "stateless"; + // Thread pool names are defined in the BlobStoreRepository because we need to verify there that no requests are running on other pools. + public static final String SHARD_READ_THREAD_POOL = BlobStoreRepository.STATELESS_SHARD_READ_THREAD_NAME; + public static final String SHARD_READ_THREAD_POOL_SETTING = "stateless." + SHARD_READ_THREAD_POOL + "_thread_pool"; + public static final String TRANSLOG_THREAD_POOL = BlobStoreRepository.STATELESS_TRANSLOG_THREAD_NAME; + public static final String TRANSLOG_THREAD_POOL_SETTING = "stateless." + TRANSLOG_THREAD_POOL + "_thread_pool"; + public static final String SHARD_WRITE_THREAD_POOL = BlobStoreRepository.STATELESS_SHARD_WRITE_THREAD_NAME; + public static final String SHARD_WRITE_THREAD_POOL_SETTING = "stateless." + SHARD_WRITE_THREAD_POOL + "_thread_pool"; + public static final String CLUSTER_STATE_READ_WRITE_THREAD_POOL = BlobStoreRepository.STATELESS_CLUSTER_STATE_READ_WRITE_THREAD_NAME; + public static final String CLUSTER_STATE_READ_WRITE_THREAD_POOL_SETTING = "stateless." + + CLUSTER_STATE_READ_WRITE_THREAD_POOL + + "_thread_pool"; + public static final String GET_VIRTUAL_BATCHED_COMPOUND_COMMIT_CHUNK_THREAD_POOL = "stateless_get_vbcc_chunk"; + public static final String GET_VIRTUAL_BATCHED_COMPOUND_COMMIT_CHUNK_THREAD_POOL_SETTING = "stateless." + + GET_VIRTUAL_BATCHED_COMPOUND_COMMIT_CHUNK_THREAD_POOL + + "_thread_pool"; + public static final String FILL_VIRTUAL_BATCHED_COMPOUND_COMMIT_CACHE_THREAD_POOL = "stateless_fill_vbcc_cache"; + public static final String FILL_VIRTUAL_BATCHED_COMPOUND_COMMIT_CHUNK_THREAD_POOL_SETTING = "stateless." + + FILL_VIRTUAL_BATCHED_COMPOUND_COMMIT_CACHE_THREAD_POOL + + "_thread_pool"; + public static final String PREWARM_THREAD_POOL = BlobStoreRepository.STATELESS_SHARD_PREWARMING_THREAD_NAME; + public static final String PREWARM_THREAD_POOL_SETTING = "stateless." + PREWARM_THREAD_POOL + "_thread_pool"; + public static final String UPLOAD_PREWARM_THREAD_POOL = BlobStoreRepository.STATELESS_SHARD_UPLOAD_PREWARMING_THREAD_NAME; + public static final String UPLOAD_PREWARM_THREAD_POOL_SETTING = "stateless." 
+ UPLOAD_PREWARM_THREAD_POOL + "_thread_pool"; + + /** + * The set of {@link ShardRouting.Role}s that we expect to see in a stateless deployment + */ + public static final Set<ShardRouting.Role> STATELESS_SHARD_ROLES = Set.of(ShardRouting.Role.INDEX_ONLY, ShardRouting.Role.SEARCH_ONLY); + private final boolean enabled; + private final boolean hasIndexRole; + + public static ExecutorBuilder[] statelessExecutorBuilders(Settings settings, boolean hasIndexRole) { + // TODO: Consider modifying these pool counts if we change the object store client connections based on node size. + // Right now we have 10 threads for snapshots, 1 or 8 threads for translog and 20 or 28 threads for shard thread pools. This is to + // attempt to keep the threads below the default client connections limit of 50. This assumption is currently broken by the snapshot + // metadata pool having 50 threads. But we will continue to iterate on these numbers and limits. + + final int processors = EsExecutors.allocatedProcessors(settings); + final int shardReadMaxThreads; + final int translogCoreThreads; + final int translogMaxThreads; + final int shardWriteCoreThreads; + final int shardWriteMaxThreads; + final int clusterStateReadWriteCoreThreads; + final int clusterStateReadWriteMaxThreads; + final int getVirtualBatchedCompoundCommitChunkCoreThreads; + final int getVirtualBatchedCompoundCommitChunkMaxThreads; + final int fillVirtualBatchedCompoundCommitCacheCoreThreads; + final int fillVirtualBatchedCompoundCommitCacheMaxThreads; + final int prewarmMaxThreads; + final int uploadPrewarmCoreThreads; + final int uploadPrewarmMaxThreads; + + if (hasIndexRole) { + shardReadMaxThreads = Math.min(processors * 4, 10); + translogCoreThreads = 2; + translogMaxThreads = Math.min(processors * 2, 8); + shardWriteCoreThreads = 2; + shardWriteMaxThreads = Math.min(processors * 4, 10); + clusterStateReadWriteCoreThreads = 2; + clusterStateReadWriteMaxThreads = 4; + getVirtualBatchedCompoundCommitChunkCoreThreads = 1; + getVirtualBatchedCompoundCommitChunkMaxThreads = Math.min(processors, 4); + fillVirtualBatchedCompoundCommitCacheCoreThreads = 0; + fillVirtualBatchedCompoundCommitCacheMaxThreads = 1; + prewarmMaxThreads = Math.min(processors * 2, 32); + // These threads are used for prewarming the shared blob cache on upload, and are separate from the prewarm thread pool + // in order to avoid any deadlocks between the two (e.g., when two fillgaps compete). Since they are used to prewarm on upload, + // we use the same maximum number of threads as the shard write pool.
+ // these threads use a sizeable thread-local direct buffer which might take a while to GC, so we prefer to keep some idle + // threads around to reduce churn and re-use the existing buffers more + uploadPrewarmMaxThreads = Math.min(processors * 4, 10); + uploadPrewarmCoreThreads = uploadPrewarmMaxThreads / 2; + } else { + shardReadMaxThreads = Math.min(processors * 4, 28); + translogCoreThreads = 0; + translogMaxThreads = 1; + shardWriteCoreThreads = 0; + shardWriteMaxThreads = 1; + clusterStateReadWriteCoreThreads = 0; + clusterStateReadWriteMaxThreads = 1; + getVirtualBatchedCompoundCommitChunkCoreThreads = 0; + getVirtualBatchedCompoundCommitChunkMaxThreads = 1; + prewarmMaxThreads = Math.min(processors * 4, 32); + // these threads use a sizeable thread-local direct buffer which might take a while to GC, so we prefer to keep some idle + // threads around to reduce churn and re-use the existing buffers more + fillVirtualBatchedCompoundCommitCacheCoreThreads = Math.max(processors / 2, 2); + fillVirtualBatchedCompoundCommitCacheMaxThreads = Math.max(processors, 2); + uploadPrewarmCoreThreads = 0; + uploadPrewarmMaxThreads = 1; + } + + return new ExecutorBuilder[] { + new ScalingExecutorBuilder( + SHARD_READ_THREAD_POOL, + 4, + shardReadMaxThreads, + TimeValue.timeValueMinutes(5), + true, + SHARD_READ_THREAD_POOL_SETTING, + EsExecutors.TaskTrackingConfig.builder().trackOngoingTasks().trackExecutionTime(0.3).build() + ), + new ScalingExecutorBuilder( + TRANSLOG_THREAD_POOL, + translogCoreThreads, + translogMaxThreads, + TimeValue.timeValueMinutes(5), + true, + TRANSLOG_THREAD_POOL_SETTING + ), + new ScalingExecutorBuilder( + SHARD_WRITE_THREAD_POOL, + shardWriteCoreThreads, + shardWriteMaxThreads, + TimeValue.timeValueMinutes(5), + true, + SHARD_WRITE_THREAD_POOL_SETTING + ), + new ScalingExecutorBuilder( + CLUSTER_STATE_READ_WRITE_THREAD_POOL, + clusterStateReadWriteCoreThreads, + clusterStateReadWriteMaxThreads, + TimeValue.timeValueMinutes(5), + true, + CLUSTER_STATE_READ_WRITE_THREAD_POOL_SETTING + ), + new ScalingExecutorBuilder( + GET_VIRTUAL_BATCHED_COMPOUND_COMMIT_CHUNK_THREAD_POOL, + getVirtualBatchedCompoundCommitChunkCoreThreads, + getVirtualBatchedCompoundCommitChunkMaxThreads, + TimeValue.timeValueMinutes(5), + true, + GET_VIRTUAL_BATCHED_COMPOUND_COMMIT_CHUNK_THREAD_POOL_SETTING + ), + new ScalingExecutorBuilder( + FILL_VIRTUAL_BATCHED_COMPOUND_COMMIT_CACHE_THREAD_POOL, + fillVirtualBatchedCompoundCommitCacheCoreThreads, + fillVirtualBatchedCompoundCommitCacheMaxThreads, + TimeValue.timeValueMinutes(5), + true, + FILL_VIRTUAL_BATCHED_COMPOUND_COMMIT_CHUNK_THREAD_POOL_SETTING + ), + new ScalingExecutorBuilder( + PREWARM_THREAD_POOL, + // these threads use a sizeable thread-local direct buffer which might take a while to GC, so we prefer to keep some idle + // threads around to reduce churn and re-use the existing buffers more + prewarmMaxThreads / 2, + prewarmMaxThreads, + TimeValue.timeValueMinutes(5), + true, + PREWARM_THREAD_POOL_SETTING + ), + new ScalingExecutorBuilder( + UPLOAD_PREWARM_THREAD_POOL, + uploadPrewarmCoreThreads, + uploadPrewarmMaxThreads, + TimeValue.timeValueMinutes(5), + true, + UPLOAD_PREWARM_THREAD_POOL_SETTING + ) }; + } @Override public List> getSettings() { @@ -106,6 +273,7 @@ public StatelessPlugin(Settings settings) { ); } } + hasIndexRole = DiscoveryNode.hasRole(settings, DiscoveryNodeRole.INDEX_ROLE); } @Override @@ -152,4 +320,14 @@ public void close() throws IOException { public boolean isEnabled() { return enabled; } + + @Override + public 
List<ExecutorBuilder<?>> getExecutorBuilders(Settings settings) { + if (enabled) { + return List.of(statelessExecutorBuilders(settings, hasIndexRole)); + } else { + return super.getExecutorBuilders(settings); + } + } + } diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/action/FetchShardCommitsInUseAction.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/action/FetchShardCommitsInUseAction.java new file mode 100644 index 0000000000000..70f6432231d75 --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/action/FetchShardCommitsInUseAction.java @@ -0,0 +1,215 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.stateless.action; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.elasticsearch.action.ActionRequest; +import org.elasticsearch.action.ActionRequestValidationException; +import org.elasticsearch.action.FailedNodeException; +import org.elasticsearch.action.support.nodes.BaseNodeResponse; +import org.elasticsearch.action.support.nodes.BaseNodesRequest; +import org.elasticsearch.action.support.nodes.BaseNodesResponse; +import org.elasticsearch.cluster.ClusterName; +import org.elasticsearch.cluster.node.DiscoveryNode; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.index.shard.ShardId; +import org.elasticsearch.xpack.stateless.engine.PrimaryTermAndGeneration; + +import java.io.IOException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Objects; +import java.util.Set; + +/** + * TransportFetchShardCommitsInUseAction broadcasts a request to a set of search nodes asking what commits are still in use for a + * particular shard. + * + * {@link FetchShardCommitsInUseAction.Request} is the request made by the index node to the transport layer, and + * {@link FetchShardCommitsInUseAction.Response} is the response returned. {@link FetchShardCommitsInUseAction.NodeRequest} and + * {@link FetchShardCommitsInUseAction.NodeResponse} are the individual search node requests and responses handled in the transport layer. + */ +public class FetchShardCommitsInUseAction { + private static final Logger logger = LogManager.getLogger(FetchShardCommitsInUseAction.class); + + public static class Request extends BaseNodesRequest { + private final ShardId shardId; + + /** + * @param nodesIds The IDs of the nodes to ask for commits-in-use information. + * @param shardId The shardId to identify the particular shard in the index.
+ */ + public Request(String[] nodesIds, ShardId shardId) { + super(nodesIds); + this.shardId = shardId; + } + + public ShardId getShardId() { + return this.shardId; + } + + @Override + public int hashCode() { + return Objects.hash(shardId, nodesIds()); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) return true; + if (obj == null || getClass() != obj.getClass()) return false; + FetchShardCommitsInUseAction.Request that = (FetchShardCommitsInUseAction.Request) obj; + return Objects.equals(this.shardId, that.shardId) && Arrays.equals(this.nodesIds(), that.nodesIds()); + } + } + + /** + * Request object for fetching which commits the search shard copy on a node is actively using for reads. + */ + public static class NodeRequest extends ActionRequest { + private final ShardId shardId; + + /** + * @param request The original index node request handed to the transport layer, which is to be broadcast to individual nodes. + */ + public NodeRequest(Request request) { + super(); + this.shardId = request.getShardId(); + } + + public NodeRequest(StreamInput in) throws IOException { + super(in); + this.shardId = new ShardId(in); + } + + public ShardId getShardId() { + return this.shardId; + } + + @Override + public void writeTo(final StreamOutput out) throws IOException { + super.writeTo(out); + out.writeWriteable(this.shardId); + } + + @Override + public ActionRequestValidationException validate() { + return null; + } + + @Override + public String toString() { + return super.toString() + ", shardId: " + this.shardId; + } + } + + /** + * The combined response containing the combined list of commits-in-use. + * Contains information about which serverless shard commits are still in active use by which search nodes. + */ + public static class Response extends BaseNodesResponse { + + public Response(StreamInput in) throws IOException { + super(in); + } + + public Response( + ClusterName clusterName, + List nodeResponses, + List nodeFailures + ) { + super(clusterName, nodeResponses, nodeFailures); + } + + /** + * Combines the commits-in-use from each node that was called into a single set for the final {@link Response}. + */ + public Set getAllPrimaryTermAndGenerationsInUse() { + Set allPrimaryTermAndGenerationsInUse = new HashSet<>(); + for (FetchShardCommitsInUseAction.NodeResponse nodeResponse : getNodes()) { + allPrimaryTermAndGenerationsInUse.addAll(nodeResponse.primaryTermAndGenerationsInUse); + } + return allPrimaryTermAndGenerationsInUse; + } + + /** + * Serializes the list of {@link NodeResponse} instances into an output stream. + */ + @Override + protected void writeNodesTo(StreamOutput out, List nodeResponses) throws IOException { + out.writeCollection(nodeResponses); + } + + /** + * Deserializes the node response instances from the input stream into a list of {@link NodeResponse}. + */ + @Override + protected List readNodesFrom(StreamInput in) throws IOException { + return in.readCollectionAsList(FetchShardCommitsInUseAction.NodeResponse::new); + } + + @Override + public String toString() { + return "FetchShardCommitsInUseAction.Response{" + "FetchShardCommitsInUseAction.NodeResponse=" + getNodes() + "}"; + } + } + + /** + * The response of a single node receiving a FetchShardCommitsInUseAction transport request. + * Holds a list of serverless commits ({@link PrimaryTermAndGeneration}) actively in use by the search shard on that node. 
+ */ + public static class NodeResponse extends BaseNodeResponse { + private Set primaryTermAndGenerationsInUse; + + public NodeResponse(DiscoveryNode node, Set primaryTermAndGenerationsInUse) { + super(node); + this.primaryTermAndGenerationsInUse = primaryTermAndGenerationsInUse; + } + + public NodeResponse(StreamInput in) throws IOException { + super(in); + this.primaryTermAndGenerationsInUse = in.readCollectionAsSet(PrimaryTermAndGeneration::new); + } + + public Set getPrimaryTermAndGenerationsInUse() { + return primaryTermAndGenerationsInUse; + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + super.writeTo(out); + out.writeCollection(primaryTermAndGenerationsInUse); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + FetchShardCommitsInUseAction.NodeResponse that = (FetchShardCommitsInUseAction.NodeResponse) o; + return Objects.equals(this.getNode(), that.getNode()) + && Objects.equals(this.primaryTermAndGenerationsInUse, that.primaryTermAndGenerationsInUse); + } + + @Override + public int hashCode() { + return Objects.hash(primaryTermAndGenerationsInUse, getNode()); + } + + @Override + public String toString() { + return "FetchShardCommitsInUseAction.NodeResponse{" + + "nodeId=" + + getNode().getId() + + ", primaryTermAndGenerationsInUse=" + + primaryTermAndGenerationsInUse + + "}"; + } + } +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/action/GetVirtualBatchedCompoundCommitChunkRequest.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/action/GetVirtualBatchedCompoundCommitChunkRequest.java new file mode 100644 index 0000000000000..55020c6cf5199 --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/action/GetVirtualBatchedCompoundCommitChunkRequest.java @@ -0,0 +1,119 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.stateless.action; + +import org.elasticsearch.action.ActionRequest; +import org.elasticsearch.action.ActionRequestValidationException; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.index.shard.ShardId; + +import java.io.IOException; + +import static org.elasticsearch.action.ValidateActions.addValidationError; + +public class GetVirtualBatchedCompoundCommitChunkRequest extends ActionRequest { + + private final ShardId shardId; + private final long primaryTerm; + private final long virtualBatchedCompoundCommitGeneration; + private final long offset; + private final int length; + private final String preferredNodeId; + + public GetVirtualBatchedCompoundCommitChunkRequest( + final ShardId shardId, + final long primaryTerm, + final long virtualBatchedCompoundCommitGeneration, + final long offset, + final int length, + final String preferredNodeId + ) { + super(); + this.shardId = shardId; + this.primaryTerm = primaryTerm; + this.virtualBatchedCompoundCommitGeneration = virtualBatchedCompoundCommitGeneration; + this.offset = offset; + this.length = length; + this.preferredNodeId = preferredNodeId; + assert preferredNodeId != null; + assert length > 0 : length; + assert offset >= 0 : offset; + } + + public GetVirtualBatchedCompoundCommitChunkRequest(StreamInput in) throws IOException { + super(in); + shardId = new ShardId(in); + primaryTerm = in.readVLong(); + virtualBatchedCompoundCommitGeneration = in.readVLong(); + offset = in.readVLong(); + length = in.readVInt(); + preferredNodeId = in.readOptionalString(); + } + + @Override + public ActionRequestValidationException validate() { + ActionRequestValidationException validationException = null; + if (shardId == null) { + validationException = addValidationError("shard id is missing", validationException); + } + return validationException; + } + + public void writeTo(StreamOutput out) throws IOException { + super.writeTo(out); + out.writeWriteable(shardId); + out.writeVLong(primaryTerm); + out.writeVLong(virtualBatchedCompoundCommitGeneration); + out.writeVLong(offset); + out.writeVInt(length); + out.writeOptionalString(preferredNodeId); + } + + public ShardId getShardId() { + return shardId; + } + + public long getPrimaryTerm() { + return primaryTerm; + } + + public long getVirtualBatchedCompoundCommitGeneration() { + return virtualBatchedCompoundCommitGeneration; + } + + public long getOffset() { + return offset; + } + + public int getLength() { + return length; + } + + public String getPreferredNodeId() { + return preferredNodeId; + } + + @Override + public String toString() { + return GetVirtualBatchedCompoundCommitChunkRequest.class.getSimpleName() + + "[" + + shardId + + "," + + primaryTerm + + "," + + virtualBatchedCompoundCommitGeneration + + "," + + offset + + "," + + length + + "," + + preferredNodeId + + "]"; + } +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/action/GetVirtualBatchedCompoundCommitChunkResponse.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/action/GetVirtualBatchedCompoundCommitChunkResponse.java new file mode 100644 index 0000000000000..ed18b7d0234ad --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/action/GetVirtualBatchedCompoundCommitChunkResponse.java @@ -0,0 +1,58 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. 
under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.stateless.action; + +import org.elasticsearch.action.ActionResponse; +import org.elasticsearch.common.bytes.ReleasableBytesReference; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; + +import java.io.IOException; + +public class GetVirtualBatchedCompoundCommitChunkResponse extends ActionResponse { + + private final ReleasableBytesReference data; + + public GetVirtualBatchedCompoundCommitChunkResponse(ReleasableBytesReference data) { + assert data.hasReferences(); + this.data = data; // takes ownership of the original ref, no need to .retain() + } + + public GetVirtualBatchedCompoundCommitChunkResponse(StreamInput in) throws IOException { + data = in.readReleasableBytesReference(); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeBytesReference(data); + } + + public ReleasableBytesReference getData() { + return data; + } + + @Override + public void incRef() { + data.incRef(); + } + + @Override + public boolean tryIncRef() { + return data.tryIncRef(); + } + + @Override + public boolean decRef() { + return data.decRef(); + } + + @Override + public boolean hasReferences() { + return data.hasReferences(); + } +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/action/NewCommitNotificationRequest.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/action/NewCommitNotificationRequest.java new file mode 100644 index 0000000000000..eaadf457bd9fe --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/action/NewCommitNotificationRequest.java @@ -0,0 +1,220 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.stateless.action; + +import org.elasticsearch.action.ActionRequestValidationException; +import org.elasticsearch.action.support.broadcast.unpromotable.BroadcastUnpromotableRequest; +import org.elasticsearch.cluster.routing.IndexShardRoutingTable; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.core.Nullable; +import org.elasticsearch.xpack.stateless.commits.StatelessCompoundCommit; +import org.elasticsearch.xpack.stateless.engine.PrimaryTermAndGeneration; + +import java.io.IOException; +import java.util.Objects; + +import static org.elasticsearch.action.ValidateActions.addValidationError; + +/** + * A request sent from an index node to a search node with information about the latest commit for a particular index shard. + */ +public class NewCommitNotificationRequest extends BroadcastUnpromotableRequest { + + /** + * The new compound commit + */ + private final StatelessCompoundCommit compoundCommit; + + /** + * The generation of the BCC that contains the CC. The BCC's primary term is the same as the CC's primary term. + */ + private final long batchedCompoundCommitGeneration; + + /** + * The generation of the latest uploaded BCC. It is null if no upload has happened.
+ */ + @Nullable + private final PrimaryTermAndGeneration latestUploadedBatchedCompoundCommitTermAndGen; + + /** + * The cluster state version on the node at the time the new compound commit was notified + */ + @Nullable + private final Long clusterStateVersion; + + /** + * The id of the node that notified the new compound commit + */ + @Nullable + private final String nodeId; + + /** + * Constructs a request to broadcast a new shard commit to all the unpromotable shards in the index shard routing table. + */ + public NewCommitNotificationRequest( + final IndexShardRoutingTable indexShardRoutingTable, + final StatelessCompoundCommit compoundCommit, + final long batchedCompoundCommitGeneration, + @Nullable final PrimaryTermAndGeneration latestUploadedBatchedCompoundCommitTermAndGen, + final long clusterStateVersion, + final String nodeId + ) { + super(indexShardRoutingTable); + this.compoundCommit = compoundCommit; + this.batchedCompoundCommitGeneration = batchedCompoundCommitGeneration; + this.latestUploadedBatchedCompoundCommitTermAndGen = latestUploadedBatchedCompoundCommitTermAndGen; + this.clusterStateVersion = clusterStateVersion; + this.nodeId = nodeId; + } + + /** + * Constructs the request received over the wire by a search node. + */ + public NewCommitNotificationRequest(final StreamInput in) throws IOException { + super(in); + compoundCommit = StatelessCompoundCommit.readFromTransport(in); + batchedCompoundCommitGeneration = in.readVLong(); + latestUploadedBatchedCompoundCommitTermAndGen = in.readOptionalWriteable(PrimaryTermAndGeneration::new); + clusterStateVersion = in.readVLong(); + nodeId = in.readString(); + } + + public long getTerm() { + return compoundCommit.primaryTerm(); + } + + public long getGeneration() { + return compoundCommit.generation(); + } + + public StatelessCompoundCommit getCompoundCommit() { + return compoundCommit; + } + + public long getBatchedCompoundCommitGeneration() { + return batchedCompoundCommitGeneration; + } + + public PrimaryTermAndGeneration getLatestUploadedBatchedCompoundCommitTermAndGen() { + return latestUploadedBatchedCompoundCommitTermAndGen; + } + + /** + * The cluster state version on the node at the time the new compound commit was notified + */ + @Nullable + public Long getClusterStateVersion() { + return clusterStateVersion; + } + + /** + * The id of the node that notified the new compound commit + */ + @Nullable + public String getNodeId() { + return nodeId; + } + + /** + * Whether the BCC in this request is uploaded + */ + public boolean isUploaded() { + return latestUploadedBatchedCompoundCommitTermAndGen != null + && latestUploadedBatchedCompoundCommitTermAndGen.generation() == batchedCompoundCommitGeneration; + } + + @Override + public ActionRequestValidationException validate() { + ActionRequestValidationException validationException = super.validate(); + if (getGeneration() < batchedCompoundCommitGeneration) { + validationException = addValidationError( + "compound commit generation [" + + compoundCommit.generation() + + "] < batched compound commit generation [" + + batchedCompoundCommitGeneration + + "]", + validationException + ); + } + + if (latestUploadedBatchedCompoundCommitTermAndGen != null) { + if (getTerm() < latestUploadedBatchedCompoundCommitTermAndGen.primaryTerm()) { + validationException = addValidationError( + "batched compound commit primary term [" + + getTerm() + + "] < latest uploaded batched compound commit primary term [" + + latestUploadedBatchedCompoundCommitTermAndGen.primaryTerm() + + "]", + 
validationException + ); + } else if (getTerm() == latestUploadedBatchedCompoundCommitTermAndGen.primaryTerm() + && batchedCompoundCommitGeneration < latestUploadedBatchedCompoundCommitTermAndGen.generation()) { + validationException = addValidationError( + "batched compound commit generation [" + + batchedCompoundCommitGeneration + + "] < latest uploaded batched compound commit generation [" + + latestUploadedBatchedCompoundCommitTermAndGen.generation() + + "]", + validationException + ); + } + } + + return validationException; + } + + @Override + public void writeTo(final StreamOutput out) throws IOException { + super.writeTo(out); + compoundCommit.writeTo(out); + out.writeVLong(batchedCompoundCommitGeneration); + out.writeOptionalWriteable(latestUploadedBatchedCompoundCommitTermAndGen); + out.writeVLong(clusterStateVersion); + out.writeString(nodeId); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + NewCommitNotificationRequest request = (NewCommitNotificationRequest) o; + return batchedCompoundCommitGeneration == request.batchedCompoundCommitGeneration + && Objects.equals(compoundCommit, request.compoundCommit) + && Objects.equals(latestUploadedBatchedCompoundCommitTermAndGen, request.latestUploadedBatchedCompoundCommitTermAndGen) + && Objects.equals(clusterStateVersion, request.clusterStateVersion) + && Objects.equals(nodeId, request.nodeId); + } + + @Override + public int hashCode() { + return Objects.hash( + compoundCommit, + batchedCompoundCommitGeneration, + latestUploadedBatchedCompoundCommitTermAndGen, + clusterStateVersion, + nodeId + ); + } + + @Override + public String toString() { + return "NewCommitNotificationRequest{" + + "compoundCommit=" + + compoundCommit + + ", batchedCompoundCommitGeneration=" + + batchedCompoundCommitGeneration + + ", latestUploadedBatchedCompoundCommitTermAndGen=" + + latestUploadedBatchedCompoundCommitTermAndGen + + ", clusterStateVersion=" + + clusterStateVersion + + ", nodeId=" + + nodeId + + '}'; + } +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/action/NewCommitNotificationResponse.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/action/NewCommitNotificationResponse.java new file mode 100644 index 0000000000000..43a9a30787a72 --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/action/NewCommitNotificationResponse.java @@ -0,0 +1,69 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.stateless.action; + +import org.elasticsearch.action.ActionResponse; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.xpack.stateless.engine.PrimaryTermAndGeneration; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Objects; +import java.util.Set; + +public class NewCommitNotificationResponse extends ActionResponse { + public static final NewCommitNotificationResponse EMPTY = new NewCommitNotificationResponse(Set.of()); + + private final Set primaryTermAndGenerationsInUse; + + public NewCommitNotificationResponse(Set primaryTermAndGenerationsInUse) { + this.primaryTermAndGenerationsInUse = primaryTermAndGenerationsInUse; + } + + public NewCommitNotificationResponse(StreamInput in) throws IOException { + this.primaryTermAndGenerationsInUse = in.readCollectionAsImmutableSet(PrimaryTermAndGeneration::new); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeCollection(primaryTermAndGenerationsInUse); + } + + public Set getPrimaryTermAndGenerationsInUse() { + return primaryTermAndGenerationsInUse; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + NewCommitNotificationResponse that = (NewCommitNotificationResponse) o; + return Objects.equals(primaryTermAndGenerationsInUse, that.primaryTermAndGenerationsInUse); + } + + @Override + public int hashCode() { + return Objects.hash(primaryTermAndGenerationsInUse); + } + + public static NewCommitNotificationResponse combine(List responses) { + var combinedPrimaryTermAndGenerations = new HashSet(); + for (NewCommitNotificationResponse response : responses) { + combinedPrimaryTermAndGenerations.addAll(response.primaryTermAndGenerationsInUse); + } + return new NewCommitNotificationResponse(Collections.unmodifiableSet(combinedPrimaryTermAndGenerations)); + } + + @Override + public String toString() { + return "NewCommitNotificationResponse{" + "primaryTermAndGenerationsInUse=" + primaryTermAndGenerationsInUse + "}"; + } +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/action/TransportEnsureDocsSearchableAction.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/action/TransportEnsureDocsSearchableAction.java new file mode 100644 index 0000000000000..541917c5a4207 --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/action/TransportEnsureDocsSearchableAction.java @@ -0,0 +1,173 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.stateless.action; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.elasticsearch.ElasticsearchException; +import org.elasticsearch.action.ActionListener; +import org.elasticsearch.action.ActionResponse; +import org.elasticsearch.action.NoShardAvailableActionException; +import org.elasticsearch.action.admin.indices.refresh.TransportShardRefreshAction; +import org.elasticsearch.action.support.ActionFilters; +import org.elasticsearch.action.support.ActiveShardCount; +import org.elasticsearch.action.support.replication.BasicReplicationRequest; +import org.elasticsearch.action.support.single.shard.TransportSingleShardAction; +import org.elasticsearch.action.termvectors.EnsureDocsSearchableAction; +import org.elasticsearch.client.internal.OriginSettingClient; +import org.elasticsearch.client.internal.node.NodeClient; +import org.elasticsearch.cluster.ProjectState; +import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver; +import org.elasticsearch.cluster.node.DiscoveryNode; +import org.elasticsearch.cluster.node.DiscoveryNodeRole; +import org.elasticsearch.cluster.project.ProjectResolver; +import org.elasticsearch.cluster.routing.ShardIterator; +import org.elasticsearch.cluster.service.ClusterService; +import org.elasticsearch.common.io.stream.Writeable; +import org.elasticsearch.index.IndexService; +import org.elasticsearch.index.mapper.Uid; +import org.elasticsearch.index.shard.IndexShard; +import org.elasticsearch.index.shard.ShardId; +import org.elasticsearch.indices.IndicesService; +import org.elasticsearch.injection.guice.Inject; +import org.elasticsearch.threadpool.ThreadPool; +import org.elasticsearch.transport.TransportService; + +import java.io.IOException; +import java.util.List; + +import static org.elasticsearch.action.termvectors.EnsureDocsSearchableAction.ENSURE_DOCS_SEARCHABLE_ORIGIN; + +public class TransportEnsureDocsSearchableAction extends TransportSingleShardAction< + EnsureDocsSearchableAction.EnsureDocsSearchableRequest, + ActionResponse.Empty> { + + private static final Logger logger = LogManager.getLogger(TransportEnsureDocsSearchableAction.class); + private final NodeClient client; + private final IndicesService indicesService; + + @Inject + public TransportEnsureDocsSearchableAction( + ClusterService clusterService, + NodeClient client, + TransportService transportService, + IndicesService indicesService, + ThreadPool threadPool, + ActionFilters actionFilters, + ProjectResolver projectResolver, + IndexNameExpressionResolver indexNameExpressionResolver + ) { + super( + EnsureDocsSearchableAction.TYPE.name(), + threadPool, + clusterService, + transportService, + actionFilters, + projectResolver, + indexNameExpressionResolver, + EnsureDocsSearchableAction.EnsureDocsSearchableRequest::new, + threadPool.executor(ThreadPool.Names.GET) + ); + this.client = client; + this.indicesService = indicesService; + } + + @Override + protected boolean isSubAction() { + return true; + } + + @Override + protected Writeable.Reader getResponseReader() { + return in -> ActionResponse.Empty.INSTANCE; + } + + @Override + protected boolean resolveIndex(EnsureDocsSearchableAction.EnsureDocsSearchableRequest request) { + return false; + } + + @Override + protected ShardIterator shards(ProjectState state, InternalRequest request) { + assert DiscoveryNode.isStateless(clusterService.getSettings()) + : EnsureDocsSearchableAction.TYPE.name() + " should only be used in stateless"; + final var 
primaryShard = state.routingTable() + .shardRoutingTable(request.concreteIndex(), request.request().shardId()) + .primaryShard(); + if (primaryShard.active() == false) { + throw new NoShardAvailableActionException(primaryShard.shardId(), "primary shard is not active"); + } + DiscoveryNode node = state.cluster().nodes().get(primaryShard.currentNodeId()); + assert node != null; + return new ShardIterator(primaryShard.shardId(), List.of(primaryShard)); + } + + @Override + protected void asyncShardOperation( + EnsureDocsSearchableAction.EnsureDocsSearchableRequest request, + ShardId shardId, + ActionListener listener + ) throws IOException { + assert DiscoveryNode.isStateless(clusterService.getSettings()) + : EnsureDocsSearchableAction.TYPE.name() + " should only be used in stateless"; + assert DiscoveryNode.hasRole(clusterService.getSettings(), DiscoveryNodeRole.INDEX_ROLE) + : EnsureDocsSearchableAction.TYPE.name() + " should only be executed on a stateless indexing node"; + logger.debug("received request with {} docs", request.docIds().length); + getExecutor(shardId).execute(() -> ActionListener.run(listener, l -> { + final IndexService indexService = indicesService.indexServiceSafe(shardId.getIndex()); + final IndexShard indexShard = indexService.getShard(shardId.id()); + boolean docsFoundInLiveVersionMap = false; + for (String docId : request.docIds()) { + final var docUid = Uid.encodeId(docId); + // There are a couple of limited cases where we may unnecessarily trigger an additional external refresh: + // 1. Asking whether a document is in the live version map may incur a stateless refresh in itself. + // 2. The document may be in the live version map archive, even though it has been refreshed to the search shards. The + // document will be removed from the archive in a subsequent stateless refresh. + // We prefer simplicity to complexity (trying to avoid the unnecessary stateless refresh) for the above limited cases. + boolean docInLiveVersionMap = indexShard.withEngine(engine -> engine.isDocumentInLiveVersionMap(docUid)); + if (docInLiveVersionMap) { + logger.debug("doc id [{}] (uid [{}]) found in live version map of index shard [{}]", docId, docUid, shardId); + docsFoundInLiveVersionMap = true; + break; + } + } + + if (docsFoundInLiveVersionMap) { + logger.debug("refreshing index shard [{}] due to mtv_eds", shardId); + BasicReplicationRequest refreshRequest = new BasicReplicationRequest(shardId); + refreshRequest.waitForActiveShards(ActiveShardCount.NONE); + // We call the transport action (instead of refreshing the index shard) to also update the unpromotable shards. + final var originClient = new OriginSettingClient(client, ENSURE_DOCS_SEARCHABLE_ORIGIN); + originClient.execute(TransportShardRefreshAction.TYPE, refreshRequest, l.delegateFailureAndWrap((ll, r) -> { + // TransportShardRefreshAction.UnpromotableReplicasRefreshProxy.onPrimaryOperationComplete() returns a + // single shard failure if unpromotable(s) failed, with a combined list of (suppressed) exceptions. 
+ if (r.getShardInfo().getFailed() > 0) { + assert r.getShardInfo().getFailed() == 1 + : "expected a single shard failure, got " + r.getShardInfo().getFailed() + " failures"; + throw new ElasticsearchException("failed to refresh [{}]", r.getShardInfo().getFailures()[0].getCause(), shardId); + } + logger.debug("refreshed index shard [{}] due to mtv_eds", shardId); + ll.onResponse(ActionResponse.Empty.INSTANCE); + })); + } else { + // Notice that there cannot be a race between the document(s) being evicted from the live version map due to an + // ongoing refresh and the search shards being updated with the new commit, because the documents are + // guaranteed to be in the live version map archive until the search shards are updated with the new commit. + // Thus, we can safely respond immediately as a no-op. + logger.debug("eds does not require refresh of index shard [{}]", shardId); + l.onResponse(ActionResponse.Empty.INSTANCE); + } + })); + } + + @Override + protected ActionResponse.Empty shardOperation(EnsureDocsSearchableAction.EnsureDocsSearchableRequest request, ShardId shardId) { + throw new UnsupportedOperationException(); + } + +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/cache/Lucene90CompoundEntriesReader.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/cache/Lucene90CompoundEntriesReader.java new file mode 100644 index 0000000000000..3b56806cf3d94 --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/cache/Lucene90CompoundEntriesReader.java @@ -0,0 +1,66 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.stateless.cache; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.util.CollectionUtil; +import org.apache.lucene.util.StringHelper; +import org.elasticsearch.index.store.LuceneFilesExtensions; + +import java.io.IOException; +import java.util.Map; + +/** + * This file is mostly copied from org.apache.lucene.codecs.lucene90.Lucene90CompoundReader + * so that compound segment entries can be parsed in order to prewarm them. + * Currently, it is impossible to reuse the original class because the necessary code has private access. + */ +public class Lucene90CompoundEntriesReader { + + static final String ENTRY_CODEC = "Lucene90CompoundEntries"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + public static Map<String, FileEntry> readEntries(Directory directory, String filename) throws IOException { + assert LuceneFilesExtensions.fromFile(filename) == LuceneFilesExtensions.CFE : filename; + try (var input = directory.openInput(filename, IOContext.READONCE)) { + return Lucene90CompoundEntriesReader.readEntries(input); + } + } + + /** + * This method skips the input validation and only lists the entries in a cfe file. + * Validation is performed later, once the directory is opened for the index engine.
+ */ + public static Map readEntries(DataInput dataInput) throws IOException { + CodecUtil.checkHeader(dataInput, ENTRY_CODEC, VERSION_START, VERSION_CURRENT); + dataInput.skipBytes(StringHelper.ID_LENGTH); + CodecUtil.checkIndexHeaderSuffix(dataInput, ""); + return readMapping(dataInput); + } + + private static Map readMapping(DataInput entriesStream) throws IOException { + final int numEntries = entriesStream.readVInt(); + var mapping = CollectionUtil.newHashMap(numEntries); + for (int i = 0; i < numEntries; i++) { + final String id = entriesStream.readString(); + final FileEntry fileEntry = new FileEntry(entriesStream.readLong(), entriesStream.readLong()); + FileEntry previous = mapping.put(id, fileEntry); + if (previous != null) { + throw new CorruptIndexException("Duplicate cfs entry id=" + id + " in CFS ", entriesStream); + } + } + return mapping; + } + + public record FileEntry(long offset, long length) {} +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/cache/StatelessSharedBlobCacheService.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/cache/StatelessSharedBlobCacheService.java new file mode 100644 index 0000000000000..64729cc0609e1 --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/cache/StatelessSharedBlobCacheService.java @@ -0,0 +1,79 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.stateless.cache; + +import org.elasticsearch.blobcache.BlobCacheMetrics; +import org.elasticsearch.blobcache.shared.SharedBlobCacheService; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.util.concurrent.EsExecutors; +import org.elasticsearch.env.NodeEnvironment; +import org.elasticsearch.threadpool.ThreadPool; +import org.elasticsearch.xpack.stateless.StatelessPlugin; +import org.elasticsearch.xpack.stateless.lucene.FileCacheKey; + +import java.util.concurrent.Executor; +import java.util.function.LongSupplier; + +public class StatelessSharedBlobCacheService extends SharedBlobCacheService { + + // Stateless shared blob cache service populates-and-reads in-thread. And it relies on the cache service to fetch gap bytes + // asynchronously using a CacheBlobReader. 
+ private static final Executor IO_EXECUTOR = EsExecutors.DIRECT_EXECUTOR_SERVICE; + + private final Executor shardReadThreadPoolExecutor; + + public StatelessSharedBlobCacheService( + NodeEnvironment environment, + Settings settings, + ThreadPool threadPool, + BlobCacheMetrics blobCacheMetrics + ) { + super(environment, settings, threadPool, IO_EXECUTOR, blobCacheMetrics); + this.shardReadThreadPoolExecutor = threadPool.executor(StatelessPlugin.SHARD_READ_THREAD_POOL); + } + + // for tests + public StatelessSharedBlobCacheService( + NodeEnvironment environment, + Settings settings, + ThreadPool threadPool, + BlobCacheMetrics blobCacheMetrics, + LongSupplier relativeTimeInNanosSupplier + ) { + super(environment, settings, threadPool, IO_EXECUTOR, blobCacheMetrics, relativeTimeInNanosSupplier); + this.shardReadThreadPoolExecutor = IO_EXECUTOR; + } + + public void assertInvariants() { + assert getRangeSize() >= getRegionSize() : getRangeSize() + " < " + getRegionSize(); + } + + public Executor getShardReadThreadPoolExecutor() { + return shardReadThreadPoolExecutor; + } + + @Override + protected int computeCacheFileRegionSize(long fileLength, int region) { + return getRegionSize(); + } + + @Override + public int getRegion(long position) { + return super.getRegion(position); + } + + @Override + public int getEndingRegion(long position) { + return super.getEndingRegion(position); + } + + @Override + public long getRegionEnd(int region) { + return super.getRegionEnd(region); + } +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/AbstractBatchedCompoundCommit.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/AbstractBatchedCompoundCommit.java new file mode 100644 index 0000000000000..37bff4a0554a9 --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/AbstractBatchedCompoundCommit.java @@ -0,0 +1,16 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.stateless.commits; + +import org.elasticsearch.xpack.stateless.engine.PrimaryTermAndGeneration; + +public interface AbstractBatchedCompoundCommit { + PrimaryTermAndGeneration primaryTermAndGeneration(); + + StatelessCompoundCommit lastCompoundCommit(); +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/BCCHeaderReadExecutor.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/BCCHeaderReadExecutor.java new file mode 100644 index 0000000000000..22b83ea5b6822 --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/BCCHeaderReadExecutor.java @@ -0,0 +1,61 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.stateless.commits; + +import org.elasticsearch.action.ActionListener; +import org.elasticsearch.common.util.concurrent.ThrottledTaskRunner; +import org.elasticsearch.core.Releasable; +import org.elasticsearch.logging.LogManager; +import org.elasticsearch.logging.Logger; +import org.elasticsearch.threadpool.ThreadPool; + +import java.util.concurrent.Executor; + +import static org.elasticsearch.xpack.stateless.StatelessPlugin.SHARD_READ_THREAD_POOL; + +/** + * Executor that limits concurrent BCC header reads to the shard read thread pool size + * to prevent memory exhaustion when processing referenced BCCs during recovery operations. + */ +public class BCCHeaderReadExecutor implements Executor { + private final Logger logger = LogManager.getLogger(BCCHeaderReadExecutor.class); + + private final ThrottledTaskRunner throttledFetchExecutor; + + public BCCHeaderReadExecutor(ThreadPool threadPool) { + this.throttledFetchExecutor = new ThrottledTaskRunner( + BCCHeaderReadExecutor.class.getCanonicalName(), + // With this limit we don't hurt reading performance, but we avoid OOMing if + // the latest BCC references too many BCCs. + threadPool.info(SHARD_READ_THREAD_POOL).getMax(), + threadPool.generic() + ); + } + + @Override + public void execute(Runnable command) { + throttledFetchExecutor.enqueueTask(new ActionListener<>() { + @Override + public void onResponse(Releasable releasable) { + try (releasable) { + command.run(); + } + } + + @Override + public void onFailure(Exception e) { + logger.warn("Failed to read a BCC header", e); + } + + @Override + public String toString() { + return command.toString(); + } + }); + } +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/BatchedCompoundCommit.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/BatchedCompoundCommit.java new file mode 100644 index 0000000000000..8285992eeccc1 --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/BatchedCompoundCommit.java @@ -0,0 +1,228 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.stateless.commits; + +import org.elasticsearch.blobcache.BlobCacheUtils; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.index.shard.ShardId; +import org.elasticsearch.xpack.stateless.engine.PrimaryTermAndGeneration; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * Represents a collection of compound commits stored together within the same blob. 
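A minimal sketch of how the BCCHeaderReadExecutor above might be used to fan out header reads while keeping memory bounded; the thread pool, the collection of referenced BCCs and the readHeader callback are illustrative assumptions:

import java.util.Collection;
import java.util.concurrent.Executor;
import java.util.function.Consumer;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.xpack.stateless.commits.BCCHeaderReadExecutor;
import org.elasticsearch.xpack.stateless.engine.PrimaryTermAndGeneration;

final class HeaderPrewarmSketch {
    static void readAllHeaders(ThreadPool threadPool,
                               Collection<PrimaryTermAndGeneration> referencedBccs,
                               Consumer<PrimaryTermAndGeneration> readHeader) {
        Executor throttled = new BCCHeaderReadExecutor(threadPool);
        for (PrimaryTermAndGeneration bcc : referencedBccs) {
            throttled.execute(() -> readHeader.accept(bcc)); // at most shard-read-pool-size reads run at once
        }
    }
}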
+ */ +public record BatchedCompoundCommit(PrimaryTermAndGeneration primaryTermAndGeneration, List compoundCommits) + implements + AbstractBatchedCompoundCommit { + + public BatchedCompoundCommit { + if (primaryTermAndGeneration == null) { + throw new IllegalArgumentException("Batched compound commits must have a non-null primary term and generation"); + } + + if (compoundCommits == null) { + throw new IllegalArgumentException("Batched compound commits must have a non-null list of compound commits"); + } + + if (compoundCommits.isEmpty()) { + throw new IllegalArgumentException("Batched compound commits must have a non-empty list of compound commits"); + } + + assert compoundCommits.stream().map(StatelessCompoundCommit::primaryTerm).distinct().count() == 1 + : "all compound commits must have the same primary term"; + + assert IntStream.range(0, compoundCommits.size() - 1) + .allMatch( + i -> compoundCommits.get(i).primaryTermAndGeneration().compareTo(compoundCommits.get(i + 1).primaryTermAndGeneration()) < 0 + ) : "the list of compound commits must be sorted by their primary terms and generations"; + + assert compoundCommits.stream().map(StatelessCompoundCommit::shardId).distinct().count() == 1 + : "all compound commits must be for the same shard"; + } + + public ShardId shardId() { + return compoundCommits.getFirst().shardId(); + } + + public int size() { + return compoundCommits.size(); + } + + @Override + public StatelessCompoundCommit lastCompoundCommit() { + return compoundCommits.getLast(); + } + + public Set getAllInternalFiles() { + return compoundCommits.stream().flatMap(commit -> commit.internalFiles().stream()).collect(Collectors.toSet()); + } + + /** + * Reads a maximum of {@code maxBlobLength} bytes of a {@link BatchedCompoundCommit} from the blob store. For that it materializes the + * headers for all the {@link StatelessCompoundCommit} contained in the batched compound commit that are located before the maximum blob + * length. + * + * @param blobName the blob name where the batched compound commit is stored + * @param maxBlobLength the maximum number of bytes to read for the blob (not expected to be in the middle of a header or internal + * replicated range bytes) + * @param blobReader a blob reader + * @param exactBlobLength a flag indicating that the max. blob length is equal to the real blob length in the object store (flag is + * {@code true}) or not (flag is {@code false}) in which case we are OK to not read the blob fully. This flag + * is used in assertions only. 
+ * @return the {@link BatchedCompoundCommit} containing all the commits before {@code maxBlobLength} + * @throws IOException if an I/O exception is thrown while reading the blob + */ + public static BatchedCompoundCommit readFromStore(String blobName, long maxBlobLength, BlobReader blobReader, boolean exactBlobLength) + throws IOException { + PrimaryTermAndGeneration primaryTermAndGeneration = null; + List compoundCommits = new ArrayList<>(); + var bccCommitsIterator = readFromStoreIncrementally(blobName, maxBlobLength, blobReader, exactBlobLength); + while (bccCommitsIterator.hasNext()) { + var compoundCommit = bccCommitsIterator.next(); + // BatchedCompoundCommit uses the first StatelessCompoundCommit primary term and generation + if (primaryTermAndGeneration == null) { + primaryTermAndGeneration = compoundCommit.primaryTermAndGeneration(); + } + compoundCommits.add(compoundCommit); + } + return new BatchedCompoundCommit(primaryTermAndGeneration, Collections.unmodifiableList(compoundCommits)); + } + + /** + * Creates an iterator that incrementally reads {@link StatelessCompoundCommit} headers from a batched compound commit blob + * in the blob store, up to the specified maximum blob length. This method provides lazy loading of compound commits, + * allowing for memory-efficient processing of large batched compound commits without materializing all commits at once. + *
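A minimal sketch of consuming a batched compound commit lazily via the incremental reader described here; it assumes the caller already has a BlobReader backed by the blob store, and the helper class name is illustrative:

import org.elasticsearch.xpack.stateless.commits.BatchedCompoundCommit;
import org.elasticsearch.xpack.stateless.commits.StatelessCompoundCommit;

final class BccIterationSketch {
    static long countCommits(String blobName, long blobLength, BatchedCompoundCommit.BlobReader blobReader) {
        long commits = 0;
        var it = BatchedCompoundCommit.readFromStoreIncrementally(blobName, blobLength, blobReader, true);
        while (it.hasNext()) {
            StatelessCompoundCommit commit = it.next();   // only this commit's header is materialized
            commits++;
        }
        return commits;
    }
}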

+ * The iterator will read and parse compound commit headers sequentially from the blob, stopping when either the maximum + * blob length is reached or all compound commits in the blob have been processed. Each iteration returns a + * {@link StatelessCompoundCommit} containing both the BCC primary term and generation and the compound commit header. + * <p> + * This method is particularly useful when processing large batched compound commits where memory usage needs to be + * controlled, or when only a subset of the compound commits in a blob need to be processed.
+ * + * @param blobName the blob name where the batched compound commit is stored + * @param maxBlobLength the maximum number of bytes to read for the blob (not expected to be in the middle of a header or internal + * replicated range bytes) + * @param blobReader a blob reader + * @param exactBlobLength a flag indicating that the max. blob length is equal to the real blob length in the object store (flag is + * {@code true}) or not (flag is {@code false}) in which case we are OK to not read the blob fully. This flag + * is used in assertions only. + * @return an iterator over {@link StatelessCompoundCommit} objects that lazily reads compound commit headers + * from the blob store up to the specified maximum length + */ + public static Iterator readFromStoreIncrementally( + String blobName, + long maxBlobLength, + BlobReader blobReader, + boolean exactBlobLength + ) { + return new BCCStatelessCompoundCommitsIterator(blobName, maxBlobLength, blobReader, exactBlobLength); + } + + static class BCCStatelessCompoundCommitsIterator implements Iterator { + private final String blobName; + private final long maxBlobLength; + private final BlobReader blobReader; + private final boolean exactBlobLength; + private long offset; + + BCCStatelessCompoundCommitsIterator(String blobName, long maxBlobLength, BlobReader blobReader, boolean exactBlobLength) { + this.blobName = blobName; + this.maxBlobLength = maxBlobLength; + this.blobReader = blobReader; + this.exactBlobLength = exactBlobLength; + } + + @Override + public boolean hasNext() { + assert offset < maxBlobLength || offset == BlobCacheUtils.toPageAlignedSize(maxBlobLength) || exactBlobLength == false + : "offset " + + offset + + " != page-aligned blobLength " + + BlobCacheUtils.toPageAlignedSize(maxBlobLength) + + " with exact blob length flag [true]"; + return offset < maxBlobLength; + } + + @Override + public StatelessCompoundCommit next() { + assert offset == BlobCacheUtils.toPageAlignedSize(offset) : "should only read page-aligned compound commits but got: " + offset; + try (StreamInput streamInput = blobReader.readBlobAtOffset(blobName, offset, maxBlobLength - offset)) { + var compoundCommit = StatelessCompoundCommit.readFromStoreAtOffset( + streamInput, + offset, + ignored -> StatelessCompoundCommit.parseGenerationFromBlobName(blobName) + ); + + assert assertPaddingComposedOfZeros(blobName, maxBlobLength, blobReader, offset, compoundCommit); + offset += BlobCacheUtils.toPageAlignedSize(compoundCommit.sizeInBytes()); + return compoundCommit; + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + } + + private static boolean assertPaddingComposedOfZeros( + String blobName, + long blobLength, + BlobReader blobReader, + long offset, + StatelessCompoundCommit compoundCommit + ) throws IOException { + long compoundCommitSize = compoundCommit.sizeInBytes(); + long compoundCommitSizePageAligned = BlobCacheUtils.toPageAlignedSize(compoundCommitSize); + int padding = Math.toIntExact(compoundCommitSizePageAligned - compoundCommitSize); + assert padding >= 0 : "padding " + padding + " is negative"; + long paddingOffset = offset + compoundCommitSize; + if (padding > 0 && paddingOffset < blobLength) { + try (StreamInput paddingStreamInput = blobReader.readBlobAtOffset(blobName, paddingOffset, padding)) { + byte[] paddingBytes = paddingStreamInput.readNBytes(padding); + byte[] zeroBytes = new byte[padding]; + Arrays.fill(zeroBytes, (byte) 0); + assert Arrays.equals(paddingBytes, zeroBytes); + } + } + return true; + } + + /** + * An 
object that allows reading a blob at a given offset efficiently + * (i.e. issuing a new call to the blob store API instead of reading all the bytes until the requested offset) + */ + @FunctionalInterface + public interface BlobReader { + StreamInput readBlobAtOffset(String blobName, long offset, long length) throws IOException; + } + + public static Set computeReferencedBCCGenerations(StatelessCompoundCommit commit) { + Set primaryTermAndGenerations = new HashSet<>(); + for (BlobLocation blobLocation : commit.commitFiles().values()) { + primaryTermAndGenerations.add(blobLocation.getBatchedCompoundCommitTermAndGeneration()); + } + return Collections.unmodifiableSet(primaryTermAndGenerations); + } + + public static String blobNameFromGeneration(long generation) { + return StatelessCompoundCommit.blobNameFromGeneration(generation); + } +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/BlobFile.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/BlobFile.java new file mode 100644 index 0000000000000..237371f02f2fb --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/BlobFile.java @@ -0,0 +1,39 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.stateless.commits; + +import org.elasticsearch.xpack.stateless.engine.PrimaryTermAndGeneration; + +/** + * Represents a file (typically a {@link BatchedCompoundCommit}) stored in the blobstore. + */ +public record BlobFile(String blobName, PrimaryTermAndGeneration termAndGeneration) { + + public BlobFile { + assert (StatelessCompoundCommit.startsWithBlobPrefix(blobName) == false && termAndGeneration.generation() == -1) + || termAndGeneration.generation() == StatelessCompoundCommit.parseGenerationFromBlobName(blobName) + : "generation mismatch: " + termAndGeneration + " vs " + blobName; + } + + public long primaryTerm() { + return termAndGeneration.primaryTerm(); + } + + /** + * The generation of the blob file in case it is a stateless compound commit file, otherwise it is -1. + */ + public long generation() { + return termAndGeneration.generation(); + } + + @Override + public String toString() { + return "BlobFile{" + "primaryTerm=" + primaryTerm() + ", blobName='" + blobName + '\'' + '}'; + } + +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/BlobFileRanges.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/BlobFileRanges.java new file mode 100644 index 0000000000000..4c39c00f9486c --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/BlobFileRanges.java @@ -0,0 +1,170 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
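A minimal sketch, building on computeReferencedBCCGenerations shown earlier in BatchedCompoundCommit, of mapping a commit to the BCC blob names it depends on; the helper class and method names are illustrative:

import java.util.Set;
import java.util.stream.Collectors;
import org.elasticsearch.xpack.stateless.commits.BatchedCompoundCommit;
import org.elasticsearch.xpack.stateless.commits.StatelessCompoundCommit;

final class ReferencedBlobsSketch {
    static Set<String> referencedBlobNames(StatelessCompoundCommit commit) {
        return BatchedCompoundCommit.computeReferencedBCCGenerations(commit)
            .stream()
            .map(bcc -> BatchedCompoundCommit.blobNameFromGeneration(bcc.generation()))
            .collect(Collectors.toSet());
    }
}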
+ */ + +package org.elasticsearch.xpack.stateless.commits; + +import org.elasticsearch.xpack.stateless.engine.PrimaryTermAndGeneration; + +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.NavigableMap; +import java.util.Objects; +import java.util.Set; +import java.util.TreeMap; + +import static java.util.Collections.unmodifiableNavigableMap; + +/** + * Used to know the position from which to read a file in a blob. + *

+ * This class provides a {@link #getPosition(long, int)} method that takes an absolute position in a blob that we want to read, and returns + * the actual position to read (which may differ). In most cases the method will return a position in the blob where the file is stored in + * its entirety. In case the blob has been optimized to store some ranges of bytes of the file (like the header and footer) in the first + * region of the blob, and if the number of bytes to read does not exceed the length of the range, the {@link #getPosition(long, int)} + * method will return the actual position within the first region that points to the range. + */ +public class BlobFileRanges { + + private final BlobLocation blobLocation; + private final NavigableMap replicatedRanges; + + public BlobFileRanges(BlobLocation blobLocation) { + this(blobLocation, Collections.emptyNavigableMap()); + } + + private BlobFileRanges(BlobLocation blobLocation, NavigableMap replicatedRanges) { + this.blobLocation = Objects.requireNonNull(blobLocation); + this.replicatedRanges = Objects.requireNonNull(replicatedRanges); + } + + public BlobLocation blobLocation() { + return blobLocation; + } + + public String blobName() { + return blobLocation.blobName(); + } + + public PrimaryTermAndGeneration getBatchedCompoundCommitTermAndGeneration() { + return blobLocation.getBatchedCompoundCommitTermAndGeneration(); + } + + public long primaryTerm() { + return blobLocation.primaryTerm(); + } + + public long fileOffset() { + return blobLocation.offset(); + } + + public long fileLength() { + return blobLocation.fileLength(); + } + + /** + * Returns the actual position to read in the blob + * + * @param position the position that we want to start reading from (absolute position from the beginning of the blob) + * @param length the length of bytes to read + * @return the actual position to start reading the blob from (which may differ from {@code position}) + */ + public long getPosition(long position, int length) { + if (replicatedRanges.isEmpty() == false) { + short len = (short) length; + if (length == (int) len) { + // greatest range that is less than or equal to the position to start reading from (or null if there is no such range) + var candidate = replicatedRanges.floorEntry(position); + if (candidate != null) { + return candidate.getValue().getPosition(position, len); + } + } + } + return position; + } + + /** + * Represents a range of {@code length} bytes that is originally stored at {@code position} in a blob and which is also copied at a + * different {@code copy} position within the same blob. + * Note: {@code position} and {@code copy} are absolute offsets starting from the beginning of the blob. 
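For example (illustrative numbers): if a 16-byte footer stored at blob positions [4096, 4112) is also replicated at position 512 within the first region, then getPosition(4096, 16) returns 512 and the read is served from the replicated copy, whereas getPosition(4096, 4096) returns 4096 unchanged because a 4096-byte read does not fit inside the replicated range.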
+ * + * @param position the position at which the original range of bytes starts in the blob + * @param length the length of the range of bytes + * @param copy the position at which a copy of the same bytes exists in the blob + */ + private record ReplicatedByteRange(long position, short length, long copy) { + + /** + * Returns the position to read in the replicated range if the bytes to read are present in the range, otherwise returns {@code pos} + */ + private long getPosition(long pos, short len) { + if (this.position <= pos && pos + len <= this.position + this.length) { + return this.copy + (pos - this.position); + } + return pos; + } + } + + /** + * Computes the {@link BlobFileRanges} for a given set of internal files of a {@link StatelessCompoundCommit} + */ + public static Map computeBlobFileRanges( + boolean useReplicatedRanges, + StatelessCompoundCommit compoundCommit, + long blobOffset, + Set internalFiles + ) { + long replicatedRangesOffset = blobOffset + compoundCommit.headerSizeInBytes(); + long internalFilesOffset = replicatedRangesOffset + compoundCommit.internalFilesReplicatedRanges().dataSizeInBytes(); + + var replicatedRanges = new TreeMap(); + for (var range : compoundCommit.internalFilesReplicatedRanges().replicatedRanges()) { + long position = internalFilesOffset + range.position(); + var previous = replicatedRanges.put(position, new ReplicatedByteRange(position, range.length(), replicatedRangesOffset)); + assert previous == null : "replicated range already exists: " + previous; + replicatedRangesOffset += range.length(); + } + assert assertNoOverlappingReplicatedRanges(replicatedRanges); + + var blobFileRanges = HashMap.newHashMap(internalFiles.size()); + for (var internalFile : internalFiles) { + var blobLocation = compoundCommit.commitFiles().get(internalFile); + assert blobLocation != null : internalFile; + if (useReplicatedRanges == false || replicatedRanges.isEmpty()) { + blobFileRanges.put(internalFile, new BlobFileRanges(blobLocation)); + continue; + } + + var header = replicatedRanges.floorKey(blobLocation.offset()); + var footer = replicatedRanges.floorKey(blobLocation.offset() + blobLocation.fileLength() - 1); + if (header == null || footer == null) { + blobFileRanges.put(internalFile, new BlobFileRanges(blobLocation)); + continue; + } + + blobFileRanges.put( + internalFile, + new BlobFileRanges(blobLocation, unmodifiableNavigableMap(replicatedRanges.subMap(header, true, footer, true))) + ); + } + return blobFileRanges; + } + + private static boolean assertNoOverlappingReplicatedRanges(TreeMap ranges) { + ReplicatedByteRange previous = null; + for (var range : ranges.entrySet()) { + assert previous == null || previous.copy + previous.length <= range.getValue().copy : previous + " vs " + range; + previous = range.getValue(); + } + return true; + } + + public String toString() { + return blobLocation.toString(); + } + +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/BlobLocation.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/BlobLocation.java new file mode 100644 index 0000000000000..05181e4226c67 --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/BlobLocation.java @@ -0,0 +1,169 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. 
Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.stateless.commits; + +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.io.stream.Writeable; +import org.elasticsearch.xcontent.ConstructingObjectParser; +import org.elasticsearch.xcontent.ParseField; +import org.elasticsearch.xcontent.ToXContentObject; +import org.elasticsearch.xcontent.XContentBuilder; +import org.elasticsearch.xcontent.XContentParser; +import org.elasticsearch.xpack.stateless.engine.PrimaryTermAndGeneration; + +import java.io.IOException; +import java.util.Objects; + +import static org.elasticsearch.xcontent.ConstructingObjectParser.constructorArg; + +/** + * Represents a file embedded in a {@code BlobFile} stored in the blobstore. + * + * In order to save costs for uploading multiple files (such as compound commits) + * to the blobstore they are merged into a single {@code BlobFile}. + * + * Each of the original files could be located in {@code BlobFile} using offset and fileLength. + * + * @param blobFile the blob file containing this location + * @param offset the offset inside the blob file where the file is written + * @param fileLength the length of the embedded file + */ +public record BlobLocation(BlobFile blobFile, long offset, long fileLength) implements Writeable, ToXContentObject { + + public BlobLocation { + assert offset >= 0 : "offset " + offset + " < 0"; + assert fileLength > 0 : "fileLength " + fileLength + " <= 0"; + } + + // private access only for deserialization + private BlobLocation(long primaryTerm, String blobName, long offset, long fileLength) { + this( + new BlobFile( + blobName, + new PrimaryTermAndGeneration(primaryTerm, StatelessCompoundCommit.parseGenerationFromBlobName(blobName)) + ), + offset, + fileLength + ); + } + + public long primaryTerm() { + return blobFile.primaryTerm(); + } + + public String blobName() { + return blobFile.blobName(); + } + + /** + * @return parse the generation from the blob name + */ + public long compoundFileGeneration() { + return blobFile.generation(); + } + + public PrimaryTermAndGeneration getBatchedCompoundCommitTermAndGeneration() { + return blobFile.termAndGeneration(); + } + + /** + * Returns true if this BlobLocation fully contains the given BlobLocation. + */ + public boolean contains(BlobLocation blobLocation) { + return Objects.equals(blobLocation.blobFile(), blobFile()) + && blobLocation.offset() >= offset() + && blobLocation.offset() + blobLocation.fileLength() <= offset() + fileLength(); + } + + /** + * This method is used to read BlobLocation from the object store. 
Ancient commits (before xcontent) can still contain blobLength + */ + public static BlobLocation readFromStore(StreamInput streamInput, boolean includesBlobLength) throws IOException { + if (includesBlobLength) { + return readWithBlobLength(streamInput); + } else { + return readWithoutBlobLength(streamInput); + } + } + + public static BlobLocation readFromTransport(StreamInput streamInput) throws IOException { + return readWithoutBlobLength(streamInput); + } + + private static BlobLocation readWithBlobLength(StreamInput streamInput) throws IOException { + long primaryTerm = streamInput.readVLong(); + String blobName = streamInput.readString(); + streamInput.readVLong(); // ignore blobLength + long offset = streamInput.readVLong(); + long length = streamInput.readVLong(); + return new BlobLocation(primaryTerm, blobName, offset, length); + } + + private static BlobLocation readWithoutBlobLength(StreamInput streamInput) throws IOException { + long primaryTerm = streamInput.readVLong(); + String blobName = streamInput.readString(); + long offset = streamInput.readVLong(); + long length = streamInput.readVLong(); + return new BlobLocation(primaryTerm, blobName, offset, length); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeVLong(primaryTerm()); + out.writeString(blobName()); + out.writeVLong(offset); + out.writeVLong(fileLength); + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + return builder.startObject() + .field("primary_term", primaryTerm()) + .field("blob_name", blobName()) + .field("offset", offset) + .field("file_length", fileLength) + .endObject(); + } + + private static final ConstructingObjectParser PARSER = new ConstructingObjectParser<>( + "blob_location", + true, + args -> { + long primaryTerm = (long) args[0]; + String blobName = (String) args[1]; + long offset = (long) args[2]; + long fileLength = (long) args[3]; + return new BlobLocation(primaryTerm, blobName, offset, fileLength); + } + ); + static { + PARSER.declareLong(constructorArg(), new ParseField("primary_term")); + PARSER.declareString(constructorArg(), new ParseField("blob_name")); + PARSER.declareLong(constructorArg(), new ParseField("offset")); + PARSER.declareLong(constructorArg(), new ParseField("file_length")); + } + + public static BlobLocation fromXContent(XContentParser parser) throws IOException { + return PARSER.parse(parser, null); + } + + @Override + public String toString() { + return "BlobLocation{" + + "primaryTerm=" + + primaryTerm() + + ", blobName='" + + blobName() + + ", offset=" + + offset + + ", fileLength=" + + fileLength + + '}'; + } +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/ClosedShardService.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/ClosedShardService.java new file mode 100644 index 0000000000000..3bec13568635a --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/ClosedShardService.java @@ -0,0 +1,65 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
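A minimal sketch of the containment check on the BlobLocation record above; the offsets and lengths are illustrative:

import org.elasticsearch.xpack.stateless.commits.BlobFile;
import org.elasticsearch.xpack.stateless.commits.BlobLocation;

final class BlobLocationSketch {
    static boolean coversSlice(BlobFile blobFile) {
        BlobLocation wholeFile = new BlobLocation(blobFile, 0, 100);  // bytes [0, 100) of the blob
        BlobLocation slice = new BlobLocation(blobFile, 10, 50);      // bytes [10, 60) of the same blob
        return wholeFile.contains(slice);                             // true: same blob, fully enclosed
    }
}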
+ */ + +package org.elasticsearch.xpack.stateless.commits; + +import org.elasticsearch.index.shard.ShardId; +import org.elasticsearch.logging.LogManager; +import org.elasticsearch.logging.Logger; +import org.elasticsearch.xpack.stateless.engine.PrimaryTermAndGeneration; + +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; + +/** + * Extends the lifetime of shard state beyond the lifetime of an IndexShard. + */ +public class ClosedShardService { + private static final Logger logger = LogManager.getLogger(ClosedShardService.class); + + /** + * Active shard reader information, per shard. Readers may continue running after a shard is technically closed. + * + * Essentially a globally available snapshot of any remaining SearchEngine readers at the time an IndexShard is closed. + * All commits for a particular index are cleared at once from the service when all readers are done and the SearchEngine closes. + */ + private final Map> openReadersByShardId = new ConcurrentHashMap<>(); + + /** + * Registers information about what stateless shard commits are still in active use by search operations. + * + * Expected to be called whenever an {@link org.elasticsearch.index.shard.IndexShard} closes with active readers. + */ + public void onShardClose(ShardId shardId, Set openReaders) { + if (openReaders.isEmpty()) { + return; + } + openReadersByShardId.put(shardId, openReaders); + } + + /** + * Clears any information about stateless shard commits in use. All active search operation have finished: the Store cannot be closed + * until all active search operations have exited, releasing the storage state. + * + * Expected to be called whenever an index shard {@link org.elasticsearch.index.store.Store} closes. + */ + public void onStoreClose(ShardId shardId) { + openReadersByShardId.remove(shardId); + } + + /** + * Fetches any stateless commits that are still in active use by readers even though the index shard was closed. Readers can continue + * past shard closure. + * + * @param shardId + * @return A set of stateless commit identifiers. Can be empty. + */ + public Set getPrimaryTermAndGenerations(ShardId shardId) { + return openReadersByShardId.getOrDefault(shardId, Set.of()); + } +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/CommitBCCResolver.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/CommitBCCResolver.java new file mode 100644 index 0000000000000..3a14ec9ef13f5 --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/CommitBCCResolver.java @@ -0,0 +1,24 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.stateless.commits; + +import org.elasticsearch.xpack.stateless.engine.PrimaryTermAndGeneration; + +import java.util.Set; + +public interface CommitBCCResolver { + /** + * Resolves the referenced BCCs used by a specific commit for the given generation. + * + * @param generation the generation to resolve + * @return a set of {@link PrimaryTermAndGeneration} representing BCC dependencies for the specified commit. 
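A minimal sketch of the expected call order around the ClosedShardService above; the service instance, shard id and the set of commits held by readers are assumed to come from the caller:

import java.util.Set;
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.xpack.stateless.commits.ClosedShardService;
import org.elasticsearch.xpack.stateless.engine.PrimaryTermAndGeneration;

final class ClosedShardSketch {
    static Set<PrimaryTermAndGeneration> closeShardThenStore(ClosedShardService service, ShardId shardId,
                                                             Set<PrimaryTermAndGeneration> commitsHeldByReaders) {
        service.onShardClose(shardId, commitsHeldByReaders);              // IndexShard closes while readers still run
        var stillInUse = service.getPrimaryTermAndGenerations(shardId);   // commits that must not be deleted yet
        service.onStoreClose(shardId);                                    // later: Store closes, tracking is cleared
        return stillInUse;
    }
}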
+ * It can return an empty set if the shard is closed or relocated as it's not expected to upload BCCs from that point on, + * otherwise it's guaranteed to return a non-empty set since the {@code generation} must be contained in at least one BCC. + */ + Set resolveReferencedBCCsForCommit(long generation); +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/IndexEngineLocalReaderListener.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/IndexEngineLocalReaderListener.java new file mode 100644 index 0000000000000..cd569a41eec55 --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/IndexEngineLocalReaderListener.java @@ -0,0 +1,21 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.stateless.commits; + +import org.elasticsearch.xpack.stateless.engine.PrimaryTermAndGeneration; + +import java.util.Set; + +public interface IndexEngineLocalReaderListener { + /** + * Listener invoked when one of the local readers, which holds a commit, is closed. + * @param bccHoldingClosedCommit the bcc generation that contains the closed commit + * @param remainingReferencedBCCs set of the remaining held BCCs by other local readers + */ + void onLocalReaderClosed(long bccHoldingClosedCommit, Set remainingReferencedBCCs); +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/InternalFilesReplicatedRanges.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/InternalFilesReplicatedRanges.java new file mode 100644 index 0000000000000..1571e1978e14d --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/InternalFilesReplicatedRanges.java @@ -0,0 +1,135 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.stateless.commits; + +import org.apache.lucene.codecs.CodecUtil; +import org.elasticsearch.blobcache.common.BlobCacheBufferedIndexInput; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.io.stream.Writeable; +import org.elasticsearch.xcontent.ConstructingObjectParser; +import org.elasticsearch.xcontent.ParseField; +import org.elasticsearch.xcontent.ToXContent; +import org.elasticsearch.xcontent.ToXContentFragment; +import org.elasticsearch.xcontent.ToXContentObject; +import org.elasticsearch.xcontent.XContentBuilder; + +import java.io.IOException; +import java.util.List; + +import static org.elasticsearch.xcontent.ConstructingObjectParser.constructorArg; + +/** + * Compound commit consists of 3 main blocks: + * - header + * - replicated content + * - main content + * Replicated content is composed of ranges of bytes of copied from main content that are located beyond the first region. + * Such ranges are used to store the headers and footers of Lucene files which are always accessed when the Lucene index is opened. 
+ * By copying the bytes corresponding to the headers/footers in the first region, + * we minimize the number of requests to the object store that are needed to open a shard in order to speedup recovery and relocation. + * + * This structure is the header describing replicated content. It references the positions of replicated bytes relative to the main content. + */ +public record InternalFilesReplicatedRanges(List replicatedRanges, long dataSizeInBytes) + implements + ToXContentFragment { + + public static final short REPLICATED_CONTENT_HEADER_SIZE = BlobCacheBufferedIndexInput.BUFFER_SIZE; + public static final short REPLICATED_CONTENT_FOOTER_SIZE = (short) CodecUtil.footerLength(); + public static final short REPLICATED_CONTENT_MAX_SINGLE_FILE_SIZE = (short) (REPLICATED_CONTENT_HEADER_SIZE + + REPLICATED_CONTENT_FOOTER_SIZE); + + public static InternalFilesReplicatedRanges EMPTY = new InternalFilesReplicatedRanges(List.of(), 0L); + + public InternalFilesReplicatedRanges { + assert replicatedRanges != null; + assert assertRangesSorted(replicatedRanges); + assert dataSizeInBytes == dataSizeInBytes(replicatedRanges); + } + + public static InternalFilesReplicatedRanges from(List replicatedRanges) { + return replicatedRanges != null && replicatedRanges.isEmpty() == false + ? new InternalFilesReplicatedRanges(replicatedRanges, dataSizeInBytes(replicatedRanges)) + : InternalFilesReplicatedRanges.EMPTY; + } + + private static boolean assertRangesSorted(List replicatedRanges) { + InternalFileReplicatedRange previous = null; + for (InternalFileReplicatedRange range : replicatedRanges) { + assert previous == null || previous.position + previous.length <= range.position : "Ranges are not sorted: " + replicatedRanges; + previous = range; + } + return true; + } + + private static long dataSizeInBytes(List replicatedRanges) { + long size = 0; + for (var range : replicatedRanges) { + size += range.length; + } + return size; + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, ToXContent.Params params) throws IOException { + builder.startArray("internal_files_replicated_ranges"); + for (var r : replicatedRanges) { + r.toXContent(builder, ToXContent.EMPTY_PARAMS); + } + builder.endArray(); + return builder; + } + + public boolean isEmpty() { + return replicatedRanges.isEmpty(); + } + + public record InternalFileReplicatedRange(long position, short length) + implements + Writeable, + ToXContentObject, + Comparable { + + public static final ConstructingObjectParser PARSER = new ConstructingObjectParser<>( + "internal_file_replicated_range", + true, + args -> new InternalFileReplicatedRange((long) args[0], ((Integer) args[1]).shortValue()) + ); + + static { + PARSER.declareLong(constructorArg(), new ParseField("position")); + PARSER.declareInt(constructorArg(), new ParseField("length")); + } + + public InternalFileReplicatedRange { + assert position >= 0 : "Position must be non negative: " + position; + assert length > 0 : "Must replicate non empty content: " + length; + } + + public static InternalFileReplicatedRange fromStream(StreamInput in) throws IOException { + return new InternalFileReplicatedRange(in.readVLong(), in.readShort()); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeVLong(position); + out.writeShort(length); + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + return builder.startObject().field("position", position).field("length", length).endObject(); + } + 
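A minimal sketch of building the replicated-ranges header for a file whose 1024-byte header and 16-byte footer are copied into the first region; the sizes are illustrative:

import java.util.List;
import org.elasticsearch.xpack.stateless.commits.InternalFilesReplicatedRanges;
import org.elasticsearch.xpack.stateless.commits.InternalFilesReplicatedRanges.InternalFileReplicatedRange;

final class ReplicatedRangesSketch {
    static InternalFilesReplicatedRanges headerAndFooter() {
        // a 1024-byte header at position 0 and a 16-byte footer at position 4096, both copied into the first region
        var ranges = List.of(
            new InternalFileReplicatedRange(0L, (short) 1024),
            new InternalFileReplicatedRange(4096L, (short) 16)
        );
        InternalFilesReplicatedRanges replicated = InternalFilesReplicatedRanges.from(ranges);
        assert replicated.dataSizeInBytes() == 1040L;   // total number of replicated bytes
        return replicated;
    }
}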
+ @Override + public int compareTo(InternalFileReplicatedRange o) { + return Long.compare(position, o.position); + } + } +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/ReplicatedContent.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/ReplicatedContent.java new file mode 100644 index 0000000000000..1986ea52a2169 --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/ReplicatedContent.java @@ -0,0 +1,256 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.stateless.commits; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.elasticsearch.common.lucene.store.InputStreamIndexInput; +import org.elasticsearch.core.IOUtils; +import org.elasticsearch.index.store.LuceneFilesExtensions; +import org.elasticsearch.xpack.stateless.cache.Lucene90CompoundEntriesReader; +import org.elasticsearch.xpack.stateless.commits.InternalFilesReplicatedRanges.InternalFileReplicatedRange; +import org.elasticsearch.xpack.stateless.commits.StatelessCompoundCommit.InternalFile; +import org.elasticsearch.xpack.stateless.commits.VirtualBatchedCompoundCommit.InternalDataReader; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.Objects; +import java.util.function.LongPredicate; + +import static org.elasticsearch.xpack.stateless.commits.InternalFilesReplicatedRanges.REPLICATED_CONTENT_FOOTER_SIZE; +import static org.elasticsearch.xpack.stateless.commits.InternalFilesReplicatedRanges.REPLICATED_CONTENT_HEADER_SIZE; +import static org.elasticsearch.xpack.stateless.commits.InternalFilesReplicatedRanges.REPLICATED_CONTENT_MAX_SINGLE_FILE_SIZE; + +/** + * This class creates a replicated content section that can be later added to the VBCC. + * Captured data includes both: + * - internal file replicated ranges (using corresponding readers) + * - header required to interpret them + */ +class ReplicatedContent { + + private static final Logger logger = LogManager.getLogger(ReplicatedContent.class); + + static final ReplicatedContent EMPTY = new ReplicatedContent(); + + static final LongPredicate ALWAYS_REPLICATE = (ignored) -> true; + + private final List ranges = new ArrayList<>(); + private final List readers = new ArrayList<>(); + + private ReplicatedContent() {} + + public static ReplicatedContent create( + boolean useInternalFilesReplicatedContent, + Collection internalFiles, + Directory directory, + LongPredicate shouldReplicate + ) { + if (useInternalFilesReplicatedContent == false) { + return EMPTY; + } + + // It is not possible to know the absolute position in CC since header and replicated content can not be materialized yet. + // So targetContentOffset is the offset of the data after the header and the replicated ranges. 
+ long targetContentOffset = 0; + var content = new ReplicatedContent(); + for (var internalFile : internalFiles) { + long nextContentOffset = targetContentOffset + internalFile.length(); + if (shouldReplicate.test(nextContentOffset)) { + if (internalFile.length() <= REPLICATED_CONTENT_MAX_SINGLE_FILE_SIZE) { + content.append(internalFile.name(), directory, targetContentOffset, 0, (short) internalFile.length(), false); + } else { + boolean isCompoundSegmentsFile = LuceneFilesExtensions.fromFile(internalFile.name()) == LuceneFilesExtensions.CFS; + content.append( + internalFile.name(), + directory, + targetContentOffset, + 0, + REPLICATED_CONTENT_HEADER_SIZE, + isCompoundSegmentsFile + ); + if (isCompoundSegmentsFile) { + var entries = readSortedCompoundEntries(directory, correspondingCfeFilename(internalFile.name())); + for (var entry : entries) { + if (entry.length() <= REPLICATED_CONTENT_MAX_SINGLE_FILE_SIZE) { + content.append( + internalFile.name(), + directory, + targetContentOffset, + entry.offset(), + (short) entry.length(), + false + ); + } else { + content.append( + internalFile.name(), + directory, + targetContentOffset, + entry.offset(), + REPLICATED_CONTENT_HEADER_SIZE, + false + ); + content.append( + internalFile.name(), + directory, + targetContentOffset, + entry.offset() + entry.length() - REPLICATED_CONTENT_FOOTER_SIZE, + REPLICATED_CONTENT_FOOTER_SIZE, + false + ); + } + } + } + content.append( + internalFile.name(), + directory, + targetContentOffset, + internalFile.length() - REPLICATED_CONTENT_FOOTER_SIZE, + REPLICATED_CONTENT_FOOTER_SIZE, + false + ); + } + } + targetContentOffset = nextContentOffset; + } + + assert content.assertContentLength(); + return content; + } + + private boolean assertContentLength() { + var declaredContentLength = ranges.stream().mapToLong(InternalFileReplicatedRange::length).sum(); + var copiedContentLength = readers.stream().mapToLong(InternalFileRangeReader::rangeLength).sum(); + assert declaredContentLength == copiedContentLength; + return true; + } + + private static String correspondingCfeFilename(String cfsFilename) { + assert LuceneFilesExtensions.fromFile(cfsFilename) == LuceneFilesExtensions.CFS; + return cfsFilename.substring(0, cfsFilename.length() - 3) + LuceneFilesExtensions.CFE.getExtension(); + } + + private static List readSortedCompoundEntries(Directory directory, String filename) { + try { + var entries = Lucene90CompoundEntriesReader.readEntries(directory, filename).values(); + return entries.stream().sorted(Comparator.comparingLong(Lucene90CompoundEntriesReader.FileEntry::offset)).toList(); + } catch (IOException e) { + logger.warn(() -> "Failed to parse [" + filename + "] entries", e); + assert false; + return List.of(); + } + } + + private void append( + String filename, + Directory directory, + long internalFileOffset, + long fileOffset, + short length, + boolean forceNewRange + ) { + appendRange(internalFileOffset + fileOffset, length, forceNewRange); + appendReader(filename, directory, fileOffset, length); + } + + private void appendRange(long blobContentOffset, short length, boolean forceNewRange) { + assert ranges.isEmpty() || ranges.getLast().position() <= blobContentOffset; + if (forceNewRange + || ranges.isEmpty() + || ranges.getLast().position() + ranges.getLast().length() < blobContentOffset + || computeMergedLength(ranges.getLast(), blobContentOffset, length) >= Short.MAX_VALUE) { + assert ranges.isEmpty() || ranges.getLast().position() + ranges.getLast().length() <= blobContentOffset; + ranges.add(new 
InternalFileReplicatedRange(blobContentOffset, length)); + } else { + var last = ranges.removeLast(); + ranges.add( + new InternalFileReplicatedRange( + last.position(), + (short) computeMergedLength(last, blobContentOffset, length) // above condition guards from overflow + ) + ); + } + } + + private static long computeMergedLength(InternalFileReplicatedRange range, long otherRangeOffset, short otherRangeLength) { + return Math.max(range.position() + range.length(), otherRangeOffset + otherRangeLength) - range.position(); + } + + private void appendReader(String filename, Directory directory, long fileOffset, short length) { + if (readers.isEmpty() + || Objects.equals(readers.getLast().filename, filename) == false + || readers.getLast().rangeOffset() + readers.getLast().rangeLength() < fileOffset) { + readers.add(new InternalFileRangeReader(filename, directory, fileOffset, length)); + } else { + var last = readers.removeLast(); + readers.add( + new InternalFileRangeReader( + filename, + directory, + last.rangeOffset(), + Math.max(last.rangeOffset() + last.rangeLength(), fileOffset + length) - last.rangeOffset() + ) + ); + } + } + + InternalFilesReplicatedRanges header() { + return InternalFilesReplicatedRanges.from(Collections.unmodifiableList(ranges)); + } + + List readers() { + return Collections.unmodifiableList(readers); + } + + /** + * Internal data range reader for the range of the internal file, that will be replicated in the beginning of a CC + */ + record InternalFileRangeReader(String filename, Directory directory, long rangeOffset, long rangeLength) implements InternalDataReader { + + private InputStream createInputStream(long offset, long length, IOContext context) throws IOException { + long fileLength = rangeOffset + rangeLength; + long fileOffset = rangeOffset + offset; + assert fileOffset < fileLength + : "offset [" + rangeOffset + "+" + offset + "] more than file length [" + rangeOffset + "+" + rangeLength + "]"; + long fileBytesToRead = Math.min(Math.min(length, rangeLength), fileLength - fileOffset); + IndexInput input = directory.openInput(filename, context); + try { + input.seek(fileOffset); + return new InputStreamIndexInput(input, fileBytesToRead) { + @Override + public void close() throws IOException { + IOUtils.close(super::close, input); + } + }; + } catch (IOException e) { + IOUtils.closeWhileHandlingException(input); + throw e; + } + } + + @Override + public InputStream getInputStream(long offset, long length) throws IOException { + var ioContext = filename.startsWith(IndexFileNames.SEGMENTS) ? IOContext.READONCE : IOContext.DEFAULT; + return createInputStream(offset, length, ioContext); + } + + @Override + public InputStream getInputStream() throws IOException { + return createInputStream(0L, Long.MAX_VALUE, IOContext.READONCE); + } + } +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/ShardLocalCommitsRefs.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/ShardLocalCommitsRefs.java new file mode 100644 index 0000000000000..dd0816515bc65 --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/ShardLocalCommitsRefs.java @@ -0,0 +1,94 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.stateless.commits; + +import org.apache.lucene.index.IndexCommit; +import org.elasticsearch.core.Assertions; + +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; + +/** + * Manages acquired {@link IndexCommit} references for a shard, ensuring they remain + * valid even when the underlying {@link org.elasticsearch.index.engine.Engine} changes. + * + *

Once a reference is acquired (e.g., for snapshots), the commit is protected from + * deletion until all references are released, preventing data corruption during + * long-running operations. + */ +public class ShardLocalCommitsRefs { + /** + * Holds the references count for a commit + */ + private final Map acquiredGenerations; + // Index commits internally acquired by the commits listener. We want to track them separately to be able to disregard them + // when checking for externally acquired index commits that haven't been released during testing + private final Map acquiredCommitGenerationsForCommitsListener = Assertions.ENABLED ? new ConcurrentHashMap<>() : null; + + public ShardLocalCommitsRefs() { + this.acquiredGenerations = new ConcurrentHashMap<>(); + } + + // TODO: make package-private ES-13786 + public SoftDeleteIndexCommit incRef(IndexCommit indexCommit) { + return incRef(indexCommit, false); + } + + // TODO: make package-private ES-13786 + public SoftDeleteIndexCommit incRef(IndexCommit indexCommit, boolean acquiredForCommitListener) { + if (Assertions.ENABLED && acquiredForCommitListener) { + incRefGeneration(acquiredCommitGenerationsForCommitsListener, indexCommit.getGeneration()); + } + incRefGeneration(acquiredGenerations, indexCommit.getGeneration()); + return SoftDeleteIndexCommit.wrap(indexCommit, acquiredForCommitListener); + } + + void incRefGeneration(Map counters, long generation) { + counters.merge(generation, 1, Integer::sum); + } + + /** + * Decrements the reference count for the IndexCommit and returns whether it can be deleted. + * + * @param indexCommit the IndexCommit to decrement + * @return {@code true} if the IndexCommit can be safely deleted, {@code false} otherwise + */ + // TODO: make package-private ES-13786 + public boolean decRef(IndexCommit indexCommit) { + assert indexCommit instanceof SoftDeleteIndexCommit; + if (Assertions.ENABLED && ((SoftDeleteIndexCommit) indexCommit).isAcquiredForCommitListener()) { + decRefGeneration(acquiredCommitGenerationsForCommitsListener, indexCommit.getGeneration()); + } + return decRefGeneration(acquiredGenerations, indexCommit.getGeneration()); + } + + private boolean decRefGeneration(Map counters, long generation) { + assert counters.containsKey(generation) : generation; + var refCount = counters.compute(generation, (ignored, value) -> { + assert value != null : "already fully released"; + if (value == 1) { + return null; + } + return value - 1; + }); + assert refCount == null || refCount > 0; + return refCount == null; + } + + // TODO: make package-private ES-13786 + public boolean hasAcquiredIndexCommitsForTesting() { + // We explicitly check only external commits and disregard internal commits acquired by the commits listener + for (var e : acquiredGenerations.entrySet()) { + var commitListenerCount = acquiredCommitGenerationsForCommitsListener.get(e.getKey()); + if (commitListenerCount == null || e.getValue() > commitListenerCount) { + return true; + } + } + return false; + } +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/ShardLocalCommitsTracker.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/ShardLocalCommitsTracker.java new file mode 100644 index 0000000000000..1fe049fcdb3d5 --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/ShardLocalCommitsTracker.java @@ -0,0 +1,20 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. 
under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.stateless.commits; + +/** + * Encapsulates shard-level tracking of acquired commits and open readers to prevent + * premature deletion of referenced blobs during engine operations. + * + *
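A minimal sketch of the acquire/release cycle on the ShardLocalCommitsRefs shown above; the refs instance and the commit are assumed to come from the caller:

import org.apache.lucene.index.IndexCommit;
import org.elasticsearch.xpack.stateless.commits.ShardLocalCommitsRefs;
import org.elasticsearch.xpack.stateless.commits.SoftDeleteIndexCommit;

final class CommitRefsSketch {
    static boolean acquireAndRelease(ShardLocalCommitsRefs refs, IndexCommit commit) {
        SoftDeleteIndexCommit acquired = refs.incRef(commit);   // protects the commit while it is in use
        // ... use the acquired commit, e.g. for a snapshot ...
        return refs.decRef(acquired);                           // true once no references remain: safe to delete
    }
}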

+ * This tracker maintains state that persists across {@link org.elasticsearch.index.shard.IndexShard#resetEngine} + * operations, ensuring that commit references and reader state are preserved to avoid + * deleting blobs that are still in use by active readers or referenced commits. + *

+ */ +public record ShardLocalCommitsTracker(ShardLocalReadersTracker shardLocalReadersTracker, ShardLocalCommitsRefs shardLocalCommitsRefs) {} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/ShardLocalReadersTracker.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/ShardLocalReadersTracker.java new file mode 100644 index 0000000000000..0b27740c445dd --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/ShardLocalReadersTracker.java @@ -0,0 +1,65 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.stateless.commits; + +import org.apache.lucene.index.DirectoryReader; +import org.elasticsearch.xpack.stateless.engine.PrimaryTermAndGeneration; + +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * Tracks all open {@link DirectoryReader} instances at the shard level to ensure their + * underlying commit blobs are not deleted while still in use. + *
<p>
+ * Readers are tied to the {@link org.elasticsearch.index.store.Store} rather than a particular + * engine instance, so they survive across {@link org.elasticsearch.index.shard.IndexShard#resetEngine} resets. + * This allows to close and recreate engines without losing or prematurely deleting + * the blobs still referenced by existing readers. + */ +public class ShardLocalReadersTracker { + // The values of this map are sets of BCCs referenced by the reader. This map is guarded by the openReaders monitor. + private final Map> openReaders = new HashMap<>(); + + private final IndexEngineLocalReaderListener localReaderListener; + + public ShardLocalReadersTracker(IndexEngineLocalReaderListener localReaderListener) { + this.localReaderListener = localReaderListener; + } + + public void trackOpenReader(DirectoryReader directoryReader, Set referencedBCCsForCommit) { + synchronized (openReaders) { + openReaders.put(directoryReader, referencedBCCsForCommit); + } + } + + public void onLocalReaderClosed(DirectoryReader reader) { + Set bccDependencies; + Set remainingReferencedBCCs; + // CHM iterators are weakly consistent, meaning that we're not guaranteed to see new insertions while we compute + // the set of remainingReferencedBCCs, that's why we use a regular HashMap with synchronized. + synchronized (openReaders) { + bccDependencies = openReaders.remove(reader); + assert bccDependencies != null : openReaders + " -> " + reader; + assert bccDependencies.isEmpty() == false; + remainingReferencedBCCs = openReaders.values().stream().flatMap(Collection::stream).collect(Collectors.toSet()); + } + + long bccHoldingCommit = bccDependencies.stream().max(PrimaryTermAndGeneration::compareTo).get().generation(); + localReaderListener.onLocalReaderClosed(bccHoldingCommit, remainingReferencedBCCs); + } + + public Map> getOpenReaders() { + synchronized (openReaders) { + return Map.copyOf(openReaders); + } + } +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/SoftDeleteIndexCommit.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/SoftDeleteIndexCommit.java new file mode 100644 index 0000000000000..4916462949321 --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/SoftDeleteIndexCommit.java @@ -0,0 +1,59 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.stateless.commits; + +import org.apache.lucene.index.IndexCommit; +import org.elasticsearch.common.lucene.FilterIndexCommit; + +import java.util.concurrent.atomic.AtomicBoolean; + +public class SoftDeleteIndexCommit extends FilterIndexCommit { + + private final AtomicBoolean softDelete = new AtomicBoolean(); + private final boolean acquiredForCommitListener; + + private SoftDeleteIndexCommit(IndexCommit in, boolean acquiredForCommitListener) { + super(in); + this.acquiredForCommitListener = acquiredForCommitListener; + } + + public boolean isSoftDeleted() { + return softDelete.get(); + } + + @Override + public void delete() { + softDelete.compareAndSet(false, true); + // Suppress any deletion executed by a wrapped index deletion policy. + // We do not call super.delete() here to avoid deleting the commit immediately, + // the commit is deleted once all references to it are released. 
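+ // Illustrative sketch of how this interacts with ShardLocalCommitsRefs (variable names are assumptions):
+ //   SoftDeleteIndexCommit wrapped = shardLocalCommitsRefs.incRef(indexCommit); // acquire a reference
+ //   wrapped.delete();                                                          // only flags the commit as soft deleted
+ //   boolean deletable = shardLocalCommitsRefs.decRef(wrapped);                 // true once the last reference is released
+ //   if (deletable && wrapped.isSoftDeleted()) { ... }                          // only then may the underlying commit be removed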
+ } + + boolean isAcquiredForCommitListener() { + return acquiredForCommitListener; + } + + @Override + public String toString() { + return "SoftDeleteIndexCommit[" + in.getGeneration() + (isSoftDeleted() ? "](soft deleted)" : "]"); + } + + public static SoftDeleteIndexCommit wrap(IndexCommit commit, boolean acquiredForCommitListener) { + assert commit instanceof SoftDeleteIndexCommit == false : commit.getClass().getName(); + return new SoftDeleteIndexCommit(commit, acquiredForCommitListener); + } + + public static IndexCommit unwrap(IndexCommit commit) { + if (commit instanceof SoftDeleteIndexCommit softDeleteIndexCommit) { + return softDeleteIndexCommit.getIndexCommit(); + } + var error = "[" + commit.getClass().getName() + "] is not an instance of [" + SoftDeleteIndexCommit.class.getName() + ']'; + assert false : error; + throw new IllegalStateException(error); + } +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/StaleCompoundCommit.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/StaleCompoundCommit.java new file mode 100644 index 0000000000000..f4f57f94279ff --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/StaleCompoundCommit.java @@ -0,0 +1,26 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.stateless.commits; + +import org.elasticsearch.common.blobstore.BlobPath; +import org.elasticsearch.index.shard.ShardId; +import org.elasticsearch.xpack.stateless.engine.PrimaryTermAndGeneration; + +public record StaleCompoundCommit(ShardId shardId, PrimaryTermAndGeneration primaryTermAndGeneration, long allocationPrimaryTerm) { + public long primaryTerm() { + return primaryTermAndGeneration.primaryTerm(); + } + + public String fileName() { + return StatelessCompoundCommit.blobNameFromGeneration(primaryTermAndGeneration.generation()); + } + + public String absoluteBlobPath(BlobPath blobPath) { + return blobPath.buildAsString() + fileName(); + } +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/StatelessCompoundCommit.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/StatelessCompoundCommit.java new file mode 100644 index 0000000000000..8f479b348be4c --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/StatelessCompoundCommit.java @@ -0,0 +1,815 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.stateless.commits; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.store.InputStreamDataInput; +import org.apache.lucene.store.OutputStreamDataOutput; +import org.elasticsearch.TransportVersion; +import org.elasticsearch.common.bytes.BytesArray; +import org.elasticsearch.common.bytes.BytesReference; +import org.elasticsearch.common.io.stream.BytesStreamOutput; +import org.elasticsearch.common.io.stream.PositionTrackingOutputStreamStreamOutput; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.io.stream.Writeable; +import org.elasticsearch.common.util.Maps; +import org.elasticsearch.core.Nullable; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.shard.ShardId; +import org.elasticsearch.index.translog.BufferedChecksumStreamInput; +import org.elasticsearch.index.translog.BufferedChecksumStreamOutput; +import org.elasticsearch.xcontent.ConstructingObjectParser; +import org.elasticsearch.xcontent.ParseField; +import org.elasticsearch.xcontent.ToXContent; +import org.elasticsearch.xcontent.ToXContentObject; +import org.elasticsearch.xcontent.XContentBuilder; +import org.elasticsearch.xcontent.XContentParser; +import org.elasticsearch.xcontent.XContentParserConfiguration; +import org.elasticsearch.xcontent.XContentType; +import org.elasticsearch.xpack.stateless.engine.PrimaryTermAndGeneration; + +import java.io.IOException; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.function.Function; +import java.util.stream.Collectors; + +import static org.elasticsearch.xcontent.ConstructingObjectParser.constructorArg; +import static org.elasticsearch.xcontent.ConstructingObjectParser.optionalConstructorArg; + +/** + * Represents a Lucene commit point with additional information required to manage this commit in the object store as well as locally. Such + * objects are uploaded to the object store as binary blobs. + * + * A hollow commit is one that does not have translog and will be recovered with a hollow engine (i.e., not fully ready for ingestion, but + * will be loaded when ingestion first comes). For a hollow commit, the translog recovery start file is set to + * {@link #HOLLOW_TRANSLOG_RECOVERY_START_FILE} and the node ephemeral id is empty. 
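+ *
+ * <p>
+ * For example (illustrative only), a commit built through {@link #newHollowStatelessCompoundCommit} always satisfies:
+ * <pre>{@code
+ * commit.hollow() == true
+ * commit.nodeEphemeralId().isEmpty() == true
+ * commit.translogRecoveryStartFile() == HOLLOW_TRANSLOG_RECOVERY_START_FILE
+ * }</pre>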
+ */ +public record StatelessCompoundCommit( + ShardId shardId, + PrimaryTermAndGeneration primaryTermAndGeneration, + long translogRecoveryStartFile, + String nodeEphemeralId, + // the commit's required files, that are either located in previous CCs (referenced) or inside this CC as additional internal files + Map commitFiles, + // the size of the compound commit including codec, header, checksums, replicated content and all internal files and extra content + long sizeInBytes, + // the additional internal files that are part of this commit + Set internalFiles, + long headerSizeInBytes, + InternalFilesReplicatedRanges internalFilesReplicatedRanges, + // extra content (e.g., replicated referenced files) that is appended after the internal files of this CC + Map extraContent, + boolean hollow, + @Nullable TimestampFieldValueRange timestampFieldValueRange // nullable because not all indices/commits have a @timestamp field +) implements Writeable { + + public static final String TRANSLOG_RECOVERY_START_FILE = "translog_recovery_start_file"; + public static long HOLLOW_TRANSLOG_RECOVERY_START_FILE = Long.MAX_VALUE; + + public StatelessCompoundCommit { + assert commitFiles.keySet().containsAll(internalFiles); + assert hollow + ? (nodeEphemeralId.isEmpty() == (translogRecoveryStartFile == HOLLOW_TRANSLOG_RECOVERY_START_FILE)) + : (nodeEphemeralId.isEmpty() == false && translogRecoveryStartFile != HOLLOW_TRANSLOG_RECOVERY_START_FILE) + : "a hollow (currently " + + hollow + + ") commit must have an empty node ephemeral id (currently " + + nodeEphemeralId + + ") and a translog recovery start file with value " + + HOLLOW_TRANSLOG_RECOVERY_START_FILE + + " (currently " + + translogRecoveryStartFile + + ")"; + assert extraContent != null; + assert extraContent.isEmpty() || hollow : "only hollow commits can currently have extra content (currently " + extraContent + ")"; + } + + /** + * Constructor that sets the hollow flag based on the translog field. + */ + public StatelessCompoundCommit( + ShardId shardId, + PrimaryTermAndGeneration primaryTermAndGeneration, + long translogRecoveryStartFile, + String nodeEphemeralId, + Map commitFiles, + long sizeInBytes, + Set internalFiles, + long headerSizeInBytes, + InternalFilesReplicatedRanges internalFilesReplicatedRanges, + Map extraContent, + @Nullable TimestampFieldValueRange timestampFieldValueRange + ) { + this( + shardId, + primaryTermAndGeneration, + translogRecoveryStartFile, + nodeEphemeralId, + commitFiles, + sizeInBytes, + internalFiles, + headerSizeInBytes, + internalFilesReplicatedRanges, + extraContent, + translogRecoveryStartFile == HOLLOW_TRANSLOG_RECOVERY_START_FILE, + timestampFieldValueRange + ); + } + + /** + * Instantiates a hollow commit. 
+ */ + public static StatelessCompoundCommit newHollowStatelessCompoundCommit( + ShardId shardId, + PrimaryTermAndGeneration primaryTermAndGeneration, + Map commitFiles, + long sizeInBytes, + Set internalFiles, + long headerSizeInBytes, + InternalFilesReplicatedRanges internalFilesReplicatedRanges, + Map extraContent, + @Nullable TimestampFieldValueRange timestampFieldValueRange + ) { + return new StatelessCompoundCommit( + shardId, + primaryTermAndGeneration, + HOLLOW_TRANSLOG_RECOVERY_START_FILE, + "", + commitFiles, + sizeInBytes, + internalFiles, + headerSizeInBytes, + internalFilesReplicatedRanges, + extraContent, + true, + timestampFieldValueRange + ); + } + + public static final String PREFIX = "stateless_commit_"; + + public static boolean isGenerationalFile(String file) { + return file.startsWith("_") && (file.endsWith(".tmp") == false) && IndexFileNames.parseGeneration(file) > 0L; + } + + public PrimaryTermAndGeneration primaryTermAndGeneration() { + return primaryTermAndGeneration; + } + + public long primaryTerm() { + return primaryTermAndGeneration.primaryTerm(); + } + + public long generation() { + return primaryTermAndGeneration.generation(); + } + + public @Nullable TimestampFieldValueRange getTimestampFieldValueRange() { + return timestampFieldValueRange; + } + + @Override + public String toString() { + return "StatelessCompoundCommit{" + + "shardId=" + + shardId + + ", generation=" + + generation() + + ", primaryTerm=" + + primaryTerm() + + ", translogRecoveryStartFile=" + + translogRecoveryStartFile + + ", nodeEphemeralId='" + + nodeEphemeralId + + "', sizeInBytes=" + + sizeInBytes + + "', timestampFieldValueRange=" + + timestampFieldValueRange + + '}'; + } + + public String toShortDescription() { + return '[' + blobNameFromGeneration(generation()) + "][" + primaryTerm() + "][" + generation() + ']' + (hollow() ? 
"[h]" : ""); + } + + public String toLongDescription() { + return shardId + + toShortDescription() + + '[' + + translogRecoveryStartFile + + "][" + + nodeEphemeralId + + "][" + + commitFiles + + "][" + + extraContent + + "][" + + timestampFieldValueRange + + ']'; + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + shardId.writeTo(out); + // For backward compatibility, use different order than PrimaryTermAndGeneration.writeTo(StreamOutput) + out.writeVLong(primaryTermAndGeneration.generation()); + out.writeVLong(primaryTermAndGeneration.primaryTerm()); + out.writeVLong(translogRecoveryStartFile); + out.writeString(nodeEphemeralId); + out.writeMap(commitFiles, StreamOutput::writeString, (o, v) -> v.writeTo(o)); + out.writeVLong(sizeInBytes); + out.writeStringCollection(internalFiles); + out.writeVLong(headerSizeInBytes); + out.writeCollection(internalFilesReplicatedRanges.replicatedRanges()); + out.writeMap(extraContent, StreamOutput::writeString, (o, v) -> v.writeTo(o)); + out.writeOptionalWriteable(timestampFieldValueRange); + } + + public static StatelessCompoundCommit readFromTransport(StreamInput in) throws IOException { + ShardId shardId = new ShardId(in); + PrimaryTermAndGeneration primaryTermAndGeneration = primaryTermAndGeneration(in); + long translogRecoveryStartFile = in.readVLong(); + String nodeEphemeralId = in.readString(); + Map commitFiles = in.readImmutableMap(StreamInput::readString, BlobLocation::readFromTransport); + long sizeInBytes = in.readVLong(); + Set internalFiles = in.readCollectionAsImmutableSet(StreamInput::readString); + long headerSizeInBytes; + InternalFilesReplicatedRanges replicatedRanges; + headerSizeInBytes = in.readVLong(); + replicatedRanges = InternalFilesReplicatedRanges.from( + in.readCollectionAsImmutableList(InternalFilesReplicatedRanges.InternalFileReplicatedRange::fromStream) + ); + Map extraContent = in.readImmutableMap(StreamInput::readString, BlobLocation::readFromTransport); + TimestampFieldValueRange timestampFieldValueRange = in.readOptionalWriteable(TimestampFieldValueRange::new); + return new StatelessCompoundCommit( + shardId, + primaryTermAndGeneration, + translogRecoveryStartFile, + nodeEphemeralId, + commitFiles, + sizeInBytes, + internalFiles, + headerSizeInBytes, + replicatedRanges, + extraContent, + timestampFieldValueRange + ); + } + + private static PrimaryTermAndGeneration primaryTermAndGeneration(StreamInput in) throws IOException { + // For backward compatibility, use a different order than PrimaryTermAndGeneration(StreamInput)) + long generation = in.readVLong(); + long primaryTerm = in.readVLong(); + return new PrimaryTermAndGeneration(primaryTerm, generation); + } + + public Set getInternalFiles() { + return internalFiles; + } + + /** + * Calculates and returns the total size of all the files referenced in this compound commit. + * This method includes the sizes of files stored in other commits, unlike {@link #sizeInBytes()}, + * which only considers the sizes of files unique to this commit and the header + padding. + * + * @return the total size of the files either embedded or referenced in this commit in bytes + */ + public long getAllFilesSizeInBytes() { + long commitFilesSizeInBytes = 0; + for (BlobLocation commitFile : commitFiles.values()) { + commitFilesSizeInBytes += commitFile.fileLength(); + } + return commitFilesSizeInBytes; + } + + /** + * Returns the blob location of the internal files with the minimum offset within the current term and generation. 
+ */ + public BlobLocation getMinInternalFilesOffsetInCurrentGeneration() { + return getInternalFilesBoundaryOffsetInCurrentGeneration(Comparator.naturalOrder()); + } + + /** + * Returns the blob location of the internal files with the maximum offset within the current term and generation. + */ + public BlobLocation getMaxInternalFilesOffsetInCurrentGeneration() { + return getInternalFilesBoundaryOffsetInCurrentGeneration(Comparator.reverseOrder()); + } + + /** + * Returns the "first" blob location of the internal files after comparing all the offsets in the current term and + * generation using the provided comparator. + * + * @param offsetComparator If {@link Comparator#naturalOrder()} (i.e. lower offset first) is used will find the minimum offset; + * otherwise, for reverse order (i.e, highest offset first), finds the maximum offset. + * @return The {@link BlobLocation} with offset at the boundary (lower or upper) of the commit. + */ + private BlobLocation getInternalFilesBoundaryOffsetInCurrentGeneration(Comparator offsetComparator) { + BlobLocation commitBoundary = null; + for (String currentGenFile : internalFiles) { + BlobLocation location = commitFiles.get(currentGenFile); + if (commitBoundary == null) { + commitBoundary = location; + } + + if (offsetComparator.compare(commitBoundary.offset(), location.offset()) > 0) { + commitBoundary = location; + } + } + assert commitBoundary != null : "commit must have at least the segments_N file in the current term and generation"; + return commitBoundary; + } + + /** + * Writes the StatelessCompoundCommit header to the given StreamOutput and returns the number of bytes written + * @return the header size in bytes + */ + // visible for testing + static long writeXContentHeader( + ShardId shardId, + long generation, + long primaryTerm, + String nodeEphemeralId, + long translogRecoveryStartFile, + @Nullable TimestampFieldValueRange timestampFieldValueRange, + Map referencedBlobFiles, + Iterable internalFiles, + InternalFilesReplicatedRanges internalFilesReplicatedRanges, + PositionTrackingOutputStreamStreamOutput positionTracking, + boolean useInternalFilesReplicatedContent, + Iterable extraContent + ) throws IOException { + assert assertSortedBySize(internalFiles) : "internal files must be sorted by size, got " + internalFiles; + assert (translogRecoveryStartFile == HOLLOW_TRANSLOG_RECOVERY_START_FILE && nodeEphemeralId.isEmpty()) + || translogRecoveryStartFile != HOLLOW_TRANSLOG_RECOVERY_START_FILE + : "a hollow commit must have an empty node ephemeral id (currently " + nodeEphemeralId + ")"; + BufferedChecksumStreamOutput out = new BufferedChecksumStreamOutput(positionTracking); + CodecUtil.writeHeader(new OutputStreamDataOutput(out), SHARD_COMMIT_CODEC, CURRENT_VERSION); + long codecSize = positionTracking.position(); + + var bytesStreamOutput = new BytesStreamOutput(); + try (var b = new XContentBuilder(XContentType.SMILE.xContent(), bytesStreamOutput)) { + b.startObject(); + { + shardIdXContent(shardId, b); + b.field("generation", generation); + b.field("primary_term", primaryTerm); + b.field("node_ephemeral_id", nodeEphemeralId); + b.field(TRANSLOG_RECOVERY_START_FILE, translogRecoveryStartFile); + if (timestampFieldValueRange != null) { + // the CC XContentHeader is always serialized under the last version, + // so the timestamp field value range always has the right to be present + b.startObject("timestamp_field_value_range"); + { + b.field("min_millis", timestampFieldValueRange.minMillis); + b.field("max_millis", 
timestampFieldValueRange.maxMillis); + } + b.endObject(); + } + b.startObject("commit_files"); + { + for (Map.Entry e : referencedBlobFiles.entrySet()) { + b.field(e.getKey()); + e.getValue().toXContent(b, ToXContent.EMPTY_PARAMS); + } + } + b.endObject(); + b.startArray("internal_files"); + { + for (InternalFile f : internalFiles) { + f.toXContent(b, ToXContent.EMPTY_PARAMS); + } + } + b.endArray(); + if (useInternalFilesReplicatedContent) { + internalFilesReplicatedRanges.toXContent(b, ToXContent.EMPTY_PARAMS); + } + b.startArray("extra_content"); + { + for (InternalFile f : extraContent) { + f.toXContent(b, ToXContent.EMPTY_PARAMS); + } + } + b.endArray(); + } + b.endObject(); + } + // Write the end marker manually, can't customize XContent to use SmileGenerator.Feature#WRITE_END_MARKER + bytesStreamOutput.write(XContentType.SMILE.xContent().bulkSeparator()); + bytesStreamOutput.flush(); + + BytesReference xContentHeader = bytesStreamOutput.bytes(); + out.writeInt(xContentHeader.length()); + out.writeInt((int) out.getChecksum()); + xContentHeader.writeTo(out); + out.writeInt((int) out.getChecksum()); + out.flush(); + + var headerSize = positionTracking.position(); + assert headerSize >= 0; + assert headerSize == codecSize + 4 + 4 + xContentHeader.length() + 4; + return headerSize; + } + + static final String SHARD_COMMIT_CODEC = "stateless_commit"; + static final int VERSION_WITH_COMMIT_FILES = 0; + static final int VERSION_WITH_BLOB_LENGTH = 1; + static final int VERSION_WITH_XCONTENT_ENCODING = 2; + static final int CURRENT_VERSION = VERSION_WITH_XCONTENT_ENCODING; + + public static StatelessCompoundCommit readFromStore(StreamInput in) throws IOException { + return readFromStoreAtOffset(in, 0, Function.identity()); + } + + private static final Logger logger = LogManager.getLogger(StatelessCompoundCommit.class); + + /** + * Reads the compound commit header from the data store at the specified offset within the input stream. + * It's expected that the input stream is already positioned at the specified offset. + * The {@param offset} parameter is utilized to construct the {@link StatelessCompoundCommit} instance, + * referring to the compound commit at the given offset within the {@link BatchedCompoundCommit}. 
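+ * <p>
+ * At {@link #CURRENT_VERSION} the header layout consumed here is, in order: the codec header, an int holding the
+ * length of the SMILE-encoded header, an int checksum, the SMILE-encoded header bytes, and a final int checksum
+ * (a sketch derived from {@code writeXContentHeader}).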
+ * @param in the input stream to read from + * @param offset the offset within the blob where this compound commit header starts + * @param bccGenSupplier a function that gives the generation of the batched compound commit blob where this compound commit is stored + */ + public static StatelessCompoundCommit readFromStoreAtOffset(StreamInput in, long offset, Function bccGenSupplier) + throws IOException { + try (BufferedChecksumStreamInput input = new BufferedChecksumStreamInput(in, SHARD_COMMIT_CODEC)) { + int version = CodecUtil.checkHeader( + new InputStreamDataInput(input), + SHARD_COMMIT_CODEC, + VERSION_WITH_COMMIT_FILES, + CURRENT_VERSION + ); + if (version < VERSION_WITH_XCONTENT_ENCODING) { + TransportVersion.readVersion(input); + ShardId shardId = new ShardId(input); + long generation = input.readVLong(); + long primaryTerm = input.readVLong(); + String nodeEphemeralId = input.readString(); + + // TODO: remove logging after confirming that no compound commits exist at obsolete versions + logger.info( + "{} with UUID [{}] reading compound commit {} of obsolete version [{}]", + shardId, + shardId.getIndex().getUUID(), + new PrimaryTermAndGeneration(primaryTerm, generation), + version + ); + + Map referencedBlobLocations = input.readMap( + StreamInput::readString, + (is) -> BlobLocation.readFromStore(is, version == VERSION_WITH_BLOB_LENGTH) + ); + List internalFiles = input.readCollectionAsList(InternalFile::new); + long headerSize = input.readLong(); + verifyChecksum(input); + long totalSizeInBytes = headerSize + internalFiles.stream().mapToLong(InternalFile::length).sum(); + return statelessCompoundCommit( + shardId, + generation, + primaryTerm, + 0, + nodeEphemeralId, + referencedBlobLocations, + internalFiles, + InternalFilesReplicatedRanges.EMPTY, + offset, + headerSize, + totalSizeInBytes, + bccGenSupplier, + List.of(), + null // the version here is definitely before the version that introduced the timestamp range in the CC header + ); + } else { + assert version == VERSION_WITH_XCONTENT_ENCODING; + + int xContentLength = input.readInt(); + verifyChecksum(input); + + byte[] bytes = new byte[xContentLength]; + input.readBytes(bytes, 0, bytes.length); + verifyChecksum(input); + + // codec header + serialized header size + checksum + header content + checksum + var headerSize = CodecUtil.headerLength(SHARD_COMMIT_CODEC) + 4 + 4 + xContentLength + 4; + return readXContentHeader(new BytesArray(bytes).streamInput(), headerSize, offset, bccGenSupplier); + } + } catch (Exception e) { + throw new IOException("Failed to read shard commit", e); + } + } + + private static void verifyChecksum(BufferedChecksumStreamInput input) throws IOException { + long actualChecksum = input.getChecksum(); + long expectedChecksum = Integer.toUnsignedLong(input.readInt()); + if (actualChecksum != expectedChecksum) { + throw new CorruptIndexException( + "checksum verification failed - expected: 0x" + + Long.toHexString(expectedChecksum) + + ", got: 0x" + + Long.toHexString(actualChecksum), + input.getSource() + ); + } + } + + private static StatelessCompoundCommit readXContentHeader( + StreamInput is, + long headerSize, + long offset, + Function bccGenSupplier + ) throws IOException { + record XContentStatelessCompoundCommit( + ShardId shardId, + long generation, + long primaryTerm, + long translogRecoveryStartFile, + String nodeEphemeralId, + Map referencedBlobLocations, + List internalFiles, + InternalFilesReplicatedRanges replicatedContentMetadata, + List extraContent, + @Nullable 
TimestampFieldValueRange timestampFieldValueRange + ) { + @SuppressWarnings("unchecked") + private static final ConstructingObjectParser PARSER = new ConstructingObjectParser<>( + "stateless_compound_commit", + true, + args -> { + // args[8] is null if the blob is from before we introduced extra content + final var extraContent = (List) args[8]; + return new XContentStatelessCompoundCommit( + (ShardId) args[0], + (long) args[1], + (long) args[2], + args[3] == null ? 0 : (long) args[3], + (String) args[4], + (Map) args[5], + (List) args[6], + // args[7] is null if the xcontent does not contain replicated ranges + InternalFilesReplicatedRanges.from((List) args[7]), + extraContent == null ? List.of() : extraContent, + (TimestampFieldValueRange) args[9] + ); + } + ); + static { + PARSER.declareObject(constructorArg(), SHARD_ID_PARSER, new ParseField("shard_id")); + PARSER.declareLong(constructorArg(), new ParseField("generation")); + PARSER.declareLong(constructorArg(), new ParseField("primary_term")); + PARSER.declareLong(optionalConstructorArg(), new ParseField(TRANSLOG_RECOVERY_START_FILE)); + PARSER.declareString(constructorArg(), new ParseField("node_ephemeral_id")); + PARSER.declareObject( + constructorArg(), + (p, c) -> p.map(HashMap::new, BlobLocation::fromXContent), + new ParseField("commit_files") + ); + PARSER.declareObjectArray(constructorArg(), InternalFile.PARSER, new ParseField("internal_files")); + PARSER.declareObjectArray( + optionalConstructorArg(), + InternalFilesReplicatedRanges.InternalFileReplicatedRange.PARSER, + new ParseField("internal_files_replicated_ranges") + ); + PARSER.declareObjectArray(optionalConstructorArg(), InternalFile.PARSER, new ParseField("extra_content")); + PARSER.declareObject( + optionalConstructorArg(), + TimestampFieldValueRange.TIMESTAMP_FIELD_VALUE_RANGE_PARSER, + new ParseField("timestamp_field_value_range") + ); + } + } + + try (XContentParser parser = XContentType.SMILE.xContent().createParser(XContentParserConfiguration.EMPTY, is)) { + XContentStatelessCompoundCommit c = XContentStatelessCompoundCommit.PARSER.parse(parser, null); + assert headerSize > 0; + long internalFilesLength = c.internalFiles.stream().mapToLong(InternalFile::length).sum(); + long extraContentLength = c.extraContent.stream().mapToLong(InternalFile::length).sum(); + long totalSizeInBytes = headerSize + c.replicatedContentMetadata.dataSizeInBytes() + internalFilesLength + extraContentLength; + return statelessCompoundCommit( + c.shardId, + c.generation, + c.primaryTerm, + c.translogRecoveryStartFile, + c.nodeEphemeralId, + c.referencedBlobLocations, + c.internalFiles, + c.replicatedContentMetadata, + offset, + headerSize, + totalSizeInBytes, + bccGenSupplier, + c.extraContent, + c.timestampFieldValueRange + ); + } + } + + // visible for testing + static StatelessCompoundCommit statelessCompoundCommit( + ShardId shardId, + long generation, + long primaryTerm, + long translogRecoveryStartFile, + String nodeEphemeralId, + Map referencedBlobLocations, + List internalFiles, + InternalFilesReplicatedRanges replicatedContentRanges, + long internalFilesOffset, + long headerSizeInBytes, + long totalSizeInBytes, + Function bccGenSupplier, + List extraContent, + @Nullable TimestampFieldValueRange timestampFieldValueRange + ) { + PrimaryTermAndGeneration bccTermAndGen = new PrimaryTermAndGeneration(primaryTerm, bccGenSupplier.apply(generation)); + var blobFile = new BlobFile(StatelessCompoundCommit.blobNameFromGeneration(bccTermAndGen.generation()), bccTermAndGen); + final var 
combinedCommitFilesResult = combineCommitFiles( + blobFile, + replicatedContentRanges, + internalFiles, + referencedBlobLocations, + internalFilesOffset, + headerSizeInBytes, + extraContent + ); + + return new StatelessCompoundCommit( + shardId, + new PrimaryTermAndGeneration(primaryTerm, generation), + translogRecoveryStartFile, + nodeEphemeralId, + combinedCommitFilesResult.commitFiles, + totalSizeInBytes, + internalFiles.stream().map(InternalFile::name).collect(Collectors.toSet()), + headerSizeInBytes, + replicatedContentRanges, + combinedCommitFilesResult.extraContent, + timestampFieldValueRange + ); + } + + public record TimestampFieldValueRange(long minMillis, long maxMillis) implements Writeable { + + static final ConstructingObjectParser TIMESTAMP_FIELD_VALUE_RANGE_PARSER = + new ConstructingObjectParser<>( + "timestamp_field_value_range", + args -> new TimestampFieldValueRange((long) args[0], (long) args[1]) + ); + static { + TIMESTAMP_FIELD_VALUE_RANGE_PARSER.declareLong(constructorArg(), new ParseField("min_millis")); + TIMESTAMP_FIELD_VALUE_RANGE_PARSER.declareLong(constructorArg(), new ParseField("max_millis")); + } + + public TimestampFieldValueRange { + if (minMillis > maxMillis) { + throw new IllegalArgumentException("Invalid millis timestamp range [" + minMillis + ", " + maxMillis + "]"); + } + } + + public TimestampFieldValueRange(StreamInput in) throws IOException { + this(in.readLong(), in.readLong()); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeLong(minMillis); + out.writeLong(maxMillis); + } + } + + record CombinedCommitFilesResult( + // all files that are part of this commit, either referenced or internal + Map commitFiles, + // the extra content of the commit + Map extraContent + ) {} + + /** + * This method combines the referenced blob locations with the files uploaded in this commit, and returns the commit files. + * Additionally, calculates the extra content files of the commit and returns them separately. + */ + static CombinedCommitFilesResult combineCommitFiles( + BlobFile blobFile, + InternalFilesReplicatedRanges replicatedContentRanges, + List internalFiles, + Map referencedBlobFiles, + long offset, + long headerSizeInBytes, + List extraContent + ) { + var commitFiles = Maps.newHashMapWithExpectedSize(referencedBlobFiles.size() + internalFiles.size()); + commitFiles.putAll(referencedBlobFiles); + + long currentOffset = offset + headerSizeInBytes + replicatedContentRanges.dataSizeInBytes(); + for (InternalFile internalFile : internalFiles) { + commitFiles.put(internalFile.name(), new BlobLocation(blobFile, currentOffset, internalFile.length())); + currentOffset += internalFile.length(); + } + + Map extraContentMap = extraContent.isEmpty() + ? 
Map.of() + : Maps.newHashMapWithExpectedSize(extraContent.size()); + for (InternalFile extraFile : extraContent) { + extraContentMap.put(extraFile.name(), new BlobLocation(blobFile, currentOffset, extraFile.length())); + currentOffset += extraFile.length(); + } + + return new CombinedCommitFilesResult(Collections.unmodifiableMap(commitFiles), Collections.unmodifiableMap(extraContentMap)); + } + + private static void shardIdXContent(ShardId shardId, XContentBuilder b) throws IOException { + // Can't use Shard#toXContent because it loses index_uuid + b.startObject("shard_id").field("index", shardId.getIndex()).field("id", shardId.id()).endObject(); + } + + private static final ConstructingObjectParser SHARD_ID_PARSER = new ConstructingObjectParser<>( + "shard_id", + args -> new ShardId((Index) args[0], (int) args[1]) + ); + static { + SHARD_ID_PARSER.declareObject(constructorArg(), (p, c) -> Index.fromXContent(p), new ParseField("index")); + SHARD_ID_PARSER.declareInt(constructorArg(), new ParseField("id")); + } + + // Since CC and BCC share the same naming scheme, this method works equally for both of them. + public static boolean startsWithBlobPrefix(String name) { + return name.startsWith(StatelessCompoundCommit.PREFIX); + } + + // Since CC and BCC share the same naming scheme, this method works equally for both of them. + public static String blobNameFromGeneration(long generation) { + assert generation > 0 : generation; + return StatelessCompoundCommit.PREFIX + generation; + } + + // Since CC and BCC share the same naming scheme, this method works equally for both of them. + public static long parseGenerationFromBlobName(String name) { + assert startsWithBlobPrefix(name) : name; + return Long.parseLong(name.substring(name.lastIndexOf('_') + 1)); + } + + private static boolean assertSortedBySize(Iterable files) { + InternalFile previous = null; + for (InternalFile file : files) { + if (previous != null && previous.compareTo(file) >= 0) { + return false; + } + previous = file; + } + return true; + } + + record InternalFile(String name, long length) implements Writeable, ToXContentObject, Comparable { + + private static final ConstructingObjectParser PARSER = new ConstructingObjectParser<>( + "internal_file", + true, + args -> new InternalFile((String) args[0], (long) args[1]) + ); + + static { + PARSER.declareString(constructorArg(), new ParseField("name")); + PARSER.declareLong(constructorArg(), new ParseField("length")); + } + + private InternalFile(StreamInput streamInput) throws IOException { + this(streamInput.readString(), streamInput.readLong()); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeString(name); + out.writeLong(length); + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + return builder.startObject().field("name", name).field("length", length).endObject(); + } + + @Override + public int compareTo(InternalFile o) { + int cmp = Long.compare(length, o.length); + if (cmp != 0) { + return cmp; + } + return name.compareTo(o.name); + } + } +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/VirtualBatchedCompoundCommit.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/VirtualBatchedCompoundCommit.java new file mode 100644 index 0000000000000..f5b206e2f41df --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/commits/VirtualBatchedCompoundCommit.java @@ 
-0,0 +1,1222 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.stateless.commits; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.ResourceNotFoundException; +import org.elasticsearch.blobcache.BlobCacheUtils; +import org.elasticsearch.blobcache.shared.SharedBytes; +import org.elasticsearch.common.io.stream.PositionTrackingOutputStreamStreamOutput; +import org.elasticsearch.common.lucene.store.InputStreamIndexInput; +import org.elasticsearch.common.util.Maps; +import org.elasticsearch.common.util.concurrent.ConcurrentCollections; +import org.elasticsearch.common.util.set.Sets; +import org.elasticsearch.core.AbstractRefCounted; +import org.elasticsearch.core.IOUtils; +import org.elasticsearch.core.Nullable; +import org.elasticsearch.core.Streams; +import org.elasticsearch.core.TimeValue; +import org.elasticsearch.index.seqno.SequenceNumbers; +import org.elasticsearch.index.shard.ShardId; +import org.elasticsearch.index.snapshots.blobstore.SlicedInputStream; +import org.elasticsearch.index.store.LuceneFilesExtensions; +import org.elasticsearch.index.store.Store; +import org.elasticsearch.threadpool.ThreadPool; +import org.elasticsearch.xpack.stateless.StatelessPlugin; +import org.elasticsearch.xpack.stateless.commits.StatelessCompoundCommit.InternalFile; +import org.elasticsearch.xpack.stateless.commits.StatelessCompoundCommit.TimestampFieldValueRange; +import org.elasticsearch.xpack.stateless.engine.PrimaryTermAndGeneration; +import org.elasticsearch.xpack.stateless.lucene.StatelessCommitRef; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.Closeable; +import java.io.FilterInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.UncheckedIOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.NavigableMap; +import java.util.NavigableSet; +import java.util.Objects; +import java.util.Set; +import java.util.TreeSet; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentSkipListMap; +import java.util.concurrent.ConcurrentSkipListSet; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.AtomicReference; +import java.util.function.Function; +import java.util.function.LongPredicate; +import java.util.function.LongSupplier; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static java.util.stream.Collectors.groupingBy; +import static org.elasticsearch.common.io.Streams.limitStream; +import static org.elasticsearch.xpack.stateless.commits.ReplicatedContent.ALWAYS_REPLICATE; +import static org.elasticsearch.xpack.stateless.commits.StatelessCompoundCommit.isGenerationalFile; + +/** + * Represents a collection of non-uploaded compound commits, where multiple commits can be 
added and read, + * ensuring they will all be uploaded as a single blob with fixed offsets within the final batched compound commit. + * + *
<p>
+ * This class uses ref-counting to ensure that readers can have access to the underlying Lucene segments once + * they've acquired a reference through {@link #incRef()}. The acquired reference remains valid until it is + * released using the {@link #decRef()} method. + *
</p>
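+ *
+ * <p>
+ * A minimal read-side sketch (illustrative; assumes a {@code vbcc} reference to an instance of this class):
+ * <pre>{@code
+ * vbcc.incRef();
+ * try {
+ *     // safe to read the pending commits' underlying data here
+ * } finally {
+ *     vbcc.decRef();
+ * }
+ * }</pre>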
+ * + *
<p>
+ * It is expected that after the batched compound commit is written to the store using the + * {@link #getFrozenInputStreamForUpload()} method, the caller should promptly invoke {@link #close()} on the input stream and the VBCC + * instance. This action releases the acquired Lucene commit reference and facilitates the proper release of associated resources. + *
</p>
+ * + *
<p>
+ * This class facilitates the appending of multiple compound commits via + * {@link #appendCommit(StatelessCommitRef, boolean, TimestampFieldValueRange)}. + * When the caller intends to write these commits to the blob store it should use {@link #getFrozenInputStreamForUpload()}. + *
</p>
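+ *
+ * <p>
+ * End-to-end lifecycle sketch (illustrative; variable names and the upload step are assumptions):
+ * <pre>{@code
+ * vbcc.appendCommit(commitRef, useReplicatedContent, timestampRange); // repeated for each new commit
+ * if (vbcc.freeze()) {                                                // no further commits can be appended
+ *     BatchedCompoundCommit bcc = vbcc.getFrozenBatchedCompoundCommit();
+ *     try (var stream = vbcc.getFrozenInputStreamForUpload()) {
+ *         // upload the stream as a single blob, then close the stream and the VBCC
+ *     }
+ *     vbcc.close();
+ * }
+ * }</pre>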
+ * + * */ +public class VirtualBatchedCompoundCommit extends AbstractRefCounted implements Closeable, AbstractBatchedCompoundCommit { + + private static final Logger logger = LogManager.getLogger(VirtualBatchedCompoundCommit.class); + + private static final Logger LOG_TIME_SPENT_READING_DURING_UPLOAD = LogManager.getLogger( + VirtualBatchedCompoundCommit.class.getCanonicalName() + ".time_spent_reading_during_upload" + ); + + private final ShardId shardId; + private final String nodeEphemeralId; + private final Function uploadedBlobLocationsSupplier; + private final NavigableSet pendingCompoundCommits; + // TODO: the internal files should be added to the corresponding BlobReferences + private final Map internalLocations = new ConcurrentHashMap<>(); + // Maps internal data (pending compound commits' headers, files, padding) to their offset in the virtual batched compound commit + private final NavigableMap internalDataReadersByOffset = new ConcurrentSkipListMap<>(); + private final AtomicLong currentOffset = new AtomicLong(); + private final AtomicReference appendingCommitThread = new AtomicReference<>(); + private final BlobFile blobFile; + private final PrimaryTermAndGeneration primaryTermAndGeneration; + private final long creationTimeInMillis; + // VBCC can no longer be appended to once it is frozen + private volatile boolean frozen = false; + + // Tracks search nodes notified that the non-uploaded VBCC's commits are available from the index node. + // Search shards may move to new search nodes before the commits are uploaded and tracking in the BlobReference begins. + // So tracking begins here before a BlobReference is created. + private final Set notifiedSearchNodeIds; + + // Size of a region in cache + private final int cacheRegionSizeInBytes; + + // An estimate of the maximum size of a header in a cache region. + // This is used to avoid adding replicated content for files that are already included in the first region. 
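+ // Worked example with assumed numbers: with a 16 MiB cache region and a commit header that starts 4 MiB into its
+ // region, an estimate of 64 KiB means replicated content is only added for internal-file ranges that would spill
+ // past the region boundary; ranges that already land in the same region as the header are skipped.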
+ private final int estimatedMaxHeaderSizeInBytes; + + public VirtualBatchedCompoundCommit( + ShardId shardId, + String nodeEphemeralId, + long primaryTerm, + long generation, + Function uploadedBlobLocationsSupplier, + LongSupplier timeInMillisSupplier, + int cacheRegionSize, + int estimatedMaxHeaderSizeInBytes + ) { + this.shardId = shardId; + this.nodeEphemeralId = nodeEphemeralId; + this.uploadedBlobLocationsSupplier = uploadedBlobLocationsSupplier; + this.pendingCompoundCommits = new ConcurrentSkipListSet<>(); + this.primaryTermAndGeneration = new PrimaryTermAndGeneration(primaryTerm, generation); + this.blobFile = new BlobFile(StatelessCompoundCommit.blobNameFromGeneration(generation), primaryTermAndGeneration); + this.creationTimeInMillis = timeInMillisSupplier.getAsLong(); + this.notifiedSearchNodeIds = ConcurrentCollections.newConcurrentSet(); + this.cacheRegionSizeInBytes = cacheRegionSize; + if (estimatedMaxHeaderSizeInBytes < 0 || cacheRegionSizeInBytes < estimatedMaxHeaderSizeInBytes) { + throw new IllegalArgumentException( + "Must be 0.0 to " + cacheRegionSizeInBytes + " inclusive but got " + estimatedMaxHeaderSizeInBytes + ); + } + this.estimatedMaxHeaderSizeInBytes = estimatedMaxHeaderSizeInBytes; + } + + public void addNotifiedSearchNodeIds(Collection nodeIds) { + assert frozen == false : "Unable to add notified search nodes ids after the VBCC is finalized"; + notifiedSearchNodeIds.addAll(nodeIds); + } + + public Set getNotifiedSearchNodeIds() { + assert frozen : "Accessing the notified search node id list before the VBCC is finalized"; + return Collections.unmodifiableSet(notifiedSearchNodeIds); + } + + /** + * Freeze the VBCC so that no more CC can be appended. The VBCC is guaranteed to be frozen afterwards. + * No synchronization is needed for this method because its sole caller is itself synchronized + * @return {@code true} if the VBCC is frozen by this thread or + * {@code false} if it is already frozen or concurrently frozen by other threads. + */ + public boolean freeze() { + assert assertCompareAndSetFreezeOrAppendingCommitThread(null, Thread.currentThread()); + try { + assert pendingCompoundCommits.isEmpty() == false : "Cannot freeze an empty virtual batch compound commit"; + if (isFrozen()) { + return false; + } + frozen = true; + logger.debug("VBCC is successfully frozen"); + return true; + } finally { + assert assertCompareAndSetFreezeOrAppendingCommitThread(Thread.currentThread(), null); + } + } + + /** + * Add the specified {@link StatelessCommitRef} as {@link PendingCompoundCommit} + * No synchronization is needed for this method because its sole caller is itself synchronized + * @return {@code true} if the append is successful or {@code false} if the VBCC is frozen and cannot be appended to + */ + public boolean appendCommit( + StatelessCommitRef reference, + boolean useInternalFilesReplicatedContent, + @Nullable TimestampFieldValueRange timestampFieldValueRange + ) { + assert assertCompareAndSetFreezeOrAppendingCommitThread(null, Thread.currentThread()); + try { + return doAppendCommit(reference, useInternalFilesReplicatedContent, timestampFieldValueRange); + } catch (IOException e) { + throw new UncheckedIOException( + "Unable to append commit [" + reference.getPrimaryTerm() + ", " + reference.getGeneration() + "]", + e + ); + } catch (Throwable e) { + // This on purpose catches Throwables, to log potential assertions that may be otherwise masked by test suite timeouts. 
+ logger.warn(shardId + " throwable while appending [" + reference.getPrimaryTerm() + ", " + reference.getGeneration() + "]", e); + throw e; + } finally { + assert assertCompareAndSetFreezeOrAppendingCommitThread(Thread.currentThread(), null); + } + } + + public boolean isFrozen() { + return frozen; + } + + private boolean doAppendCommit( + StatelessCommitRef reference, + boolean useInternalFilesReplicatedContent, + @Nullable TimestampFieldValueRange timestampFieldValueRange + ) throws IOException { + assert primaryTermAndGeneration.primaryTerm() == reference.getPrimaryTerm(); + assert (pendingCompoundCommits.isEmpty() && primaryTermAndGeneration.generation() == reference.getGeneration()) + || (pendingCompoundCommits.isEmpty() == false && primaryTermAndGeneration.generation() < reference.getGeneration()); + assert pendingCompoundCommits.isEmpty() || pendingCompoundCommits.last().getGeneration() < reference.getGeneration(); + + // bail early if VBCC is already frozen to avoid doing any work + if (isFrozen()) { + return false; + } + + final var ccTermAndGen = new PrimaryTermAndGeneration(reference.getPrimaryTerm(), reference.getGeneration()); + final boolean isFirstCommit = ccTermAndGen.equals(primaryTermAndGeneration); + + // Ordered set of compound commit (CC) internal files + var internalFiles = new TreeSet(); + + // Map of compound commit (CC) referenced files + var referencedFiles = new HashMap(); + + var internalFilesSize = 0L; + for (String commitFile : reference.getCommitFiles()) { + boolean isAdditionalFile = reference.getAdditionalFiles().contains(commitFile); + if (isAdditionalFile || (isFirstCommit && isGenerationalFile(commitFile))) { + assert internalLocations.containsKey(commitFile) == false : commitFile; + var fileLength = reference.getDirectory().fileLength(commitFile); + internalFiles.add(new InternalFile(commitFile, fileLength)); + internalFilesSize += fileLength; + } else { + var blobLocation = internalLocations.get(commitFile); + assert blobLocation != null || isGenerationalFile(commitFile) == false : commitFile; + if (blobLocation == null) { + blobLocation = uploadedBlobLocationsSupplier.apply(commitFile); + assert blobLocation != null : commitFile; + assert blobLocation.getBatchedCompoundCommitTermAndGeneration().before(primaryTermAndGeneration); + } + referencedFiles.put(commitFile, blobLocation); + } + } + + var replicatedContent = useInternalFilesReplicatedContent + ? getReplicatedContent(ccTermAndGen, currentOffset.get(), internalFiles, internalFilesSize, reference.getDirectory()) + : ReplicatedContent.EMPTY; + var replicatedContentHeader = replicatedContent.header(); + + // We replicate referenced .si files into hollow commits as extra content appended after the internal files in the compound commit. + final List extraContentFiles = reference.isHollow() ? 
new ArrayList<>() : List.of(); + var extraContentSize = 0L; + if (reference.isHollow()) { + for (var entry : referencedFiles.entrySet()) { + if (Objects.equals(IndexFileNames.getExtension(entry.getKey()), LuceneFilesExtensions.SI.getExtension())) { + extraContentFiles.add(new InternalFile(entry.getKey(), entry.getValue().fileLength())); + extraContentSize += entry.getValue().fileLength(); + } + } + } + + var header = materializeCompoundCommitHeader( + reference, + internalFiles, + replicatedContentHeader, + referencedFiles, + useInternalFilesReplicatedContent, + extraContentFiles, + timestampFieldValueRange + ); + + final long sizeInBytes = header.length + replicatedContentHeader.dataSizeInBytes() + internalFilesSize + extraContentSize; + if (logger.isDebugEnabled()) { + var referencedBlobs = referencedFiles.values().stream().map(location -> location.blobFile().blobName()).distinct().count(); + logger.debug( + """ + {}{} appending commit ({} bytes). References external {} files in {} other CCs and adds + {} internal: {}, and + {} extra content files: {}.""", + shardId, + primaryTermAndGeneration, + sizeInBytes, + referencedFiles.size(), + referencedBlobs, + internalFiles.size(), + internalFiles, + extraContentFiles.size(), + extraContentFiles + ); + } + + // Add padding to the previous CC if it exists + if (pendingCompoundCommits.isEmpty() == false) { + var lastCompoundCommit = pendingCompoundCommits.last(); + long lastCompoundCommitSize = lastCompoundCommit.getSizeInBytes(); + long lastCompoundCommitSizePageAligned = BlobCacheUtils.toPageAlignedSize(lastCompoundCommitSize); + int padding = Math.toIntExact(lastCompoundCommitSizePageAligned - lastCompoundCommitSize); + if (padding > 0) { + lastCompoundCommit.setPadding(padding); + long paddingOffset = currentOffset.get(); + var previousPaddingOffset = internalDataReadersByOffset.put(paddingOffset, new InternalPaddingReader(padding)); + assert previousPaddingOffset == null; + currentOffset.set(paddingOffset + padding); + } + } + + final long headerOffset = currentOffset.get(); + assert headerOffset == BlobCacheUtils.toPageAlignedSize(headerOffset) : "header offset is not page-aligned: " + headerOffset; + var previousHeaderOffset = internalDataReadersByOffset.put(headerOffset, new InternalHeaderReader(header)); + assert previousHeaderOffset == null; + + long replicatedContentOffset = headerOffset + header.length; + for (var replicatedRangeReader : replicatedContent.readers()) { + var previousReplicatedContent = internalDataReadersByOffset.put(replicatedContentOffset, replicatedRangeReader); + assert previousReplicatedContent == null; + replicatedContentOffset += replicatedRangeReader.rangeLength(); + } + assert replicatedContentOffset == headerOffset + header.length + replicatedContentHeader.dataSizeInBytes(); + + long fileOffset = headerOffset + header.length + replicatedContentHeader.dataSizeInBytes(); + + // Map of all compound commit (CC) files with their internal or referenced blob location + final var commitFiles = new HashMap<>(referencedFiles); + + for (var internalFile : internalFiles) { + var fileLength = internalFile.length(); + var blobLocation = new BlobLocation(blobFile, fileOffset, fileLength); + + var previousFile = commitFiles.put(internalFile.name(), blobLocation); + assert previousFile == null : internalFile.name(); + + var previousLocation = internalLocations.put(internalFile.name(), blobLocation); + assert previousLocation == null : internalFile.name(); + + var previousOffset = internalDataReadersByOffset.put( + 
fileOffset, + new InternalFileReader(internalFile.name(), reference.getDirectory()) + ); + assert previousOffset == null : internalFile.name(); + fileOffset += fileLength; + } + currentOffset.set(fileOffset); + + // Extra content files + final Map extraContent = extraContentFiles.isEmpty() + ? Map.of() + : Maps.newHashMapWithExpectedSize(extraContentFiles.size()); + for (var extraFile : extraContentFiles) { + var fileLength = extraFile.length(); + var blobLocation = new BlobLocation(blobFile, fileOffset, fileLength); + + var previousFile = extraContent.put(extraFile.name(), blobLocation); + assert previousFile == null : extraFile.name(); + + var previousOffset = internalDataReadersByOffset.put( + fileOffset, + new InternalFileReader(extraFile.name(), reference.getDirectory()) + ); + assert previousOffset == null : extraFile.name(); + fileOffset += fileLength; + } + currentOffset.set(fileOffset); + + var pendingCompoundCommit = new PendingCompoundCommit( + header.length, + reference, + reference.isHollow() + ? StatelessCompoundCommit.newHollowStatelessCompoundCommit( + shardId, + ccTermAndGen, + Collections.unmodifiableMap(commitFiles), + sizeInBytes, + internalFiles.stream().map(InternalFile::name).collect(Collectors.toUnmodifiableSet()), + header.length, + replicatedContent.header(), + Collections.unmodifiableMap(extraContent), + timestampFieldValueRange + ) + : new StatelessCompoundCommit( + shardId, + ccTermAndGen, + reference.getTranslogRecoveryStartFile(), + nodeEphemeralId, + Collections.unmodifiableMap(commitFiles), + sizeInBytes, + internalFiles.stream().map(InternalFile::name).collect(Collectors.toUnmodifiableSet()), + header.length, + replicatedContent.header(), + Collections.unmodifiableMap(extraContent), + timestampFieldValueRange + ), + Long.parseLong(reference.getIndexCommit().getUserData().get(SequenceNumbers.MAX_SEQ_NO)) + ); + pendingCompoundCommits.add(pendingCompoundCommit); + assert currentOffset.get() == headerOffset + pendingCompoundCommit.getSizeInBytes() + : "current offset " + + currentOffset.get() + + " should be equal to header offset " + + headerOffset + + " plus size of pending compound commit " + + pendingCompoundCommit.getSizeInBytes(); + assert assertInternalConsistency(); + return true; + } + + private boolean assertInternalConsistency() { + final Set allInternalFiles = pendingCompoundCommits.stream() + .flatMap(pc -> pc.getStatelessCompoundCommit().internalFiles().stream()) + .collect(Collectors.toUnmodifiableSet()); + assert allInternalFiles.equals(internalLocations.keySet()) : "all internal files must have internal blobLocations"; + + final var sizeInBytes = pendingCompoundCommits.stream().mapToLong(PendingCompoundCommit::getSizeInBytes).sum(); + assert sizeInBytes == currentOffset.get() : "current offset must be at the end of the VBCC"; + + var it = pendingCompoundCommits.iterator(); + while (it.hasNext()) { + var pendingCompoundCommit = it.next(); + final var cc = pendingCompoundCommit.getStatelessCompoundCommit(); + // Assert that compound commits have padding to be page-aligned, except for the last compound commit + assert it.hasNext() == false || pendingCompoundCommit.getSizeInBytes() == BlobCacheUtils.toPageAlignedSize(cc.sizeInBytes()) + : "intermediate statelessCompoundCommit size in bytes " + + cc.sizeInBytes() + + " plus padding length " + + pendingCompoundCommit.padding + + " should be equal to page-aligned size in bytes " + + BlobCacheUtils.toPageAlignedSize(cc.sizeInBytes()); + assert it.hasNext() || pendingCompoundCommit.padding == 0 : 
"last pending compound commit should not have padding"; + + // Assert that all generational files are contained in the same VBCC (no reference to a previous VBCC or BCC) + for (var commitFile : cc.commitFiles().entrySet()) { + assert isGenerationalFile(commitFile.getKey()) == false + || commitFile.getValue().getBatchedCompoundCommitTermAndGeneration().equals(primaryTermAndGeneration) + : "generational file " + + commitFile.getValue() + + " should be located in BCC " + + primaryTermAndGeneration + + " but got " + + commitFile.getValue().getBatchedCompoundCommitTermAndGeneration(); + } + + assert cc.commitFiles().keySet().containsAll(cc.internalFiles()) + : "internal files " + cc.internalFiles() + " must be part of commit files " + cc.commitFiles().keySet(); + + assert cc.extraContent().isEmpty() || cc.hollow() : "currently only hollow commits can have extra files"; + assert cc.extraContent() + .keySet() + .stream() + .allMatch(filename -> Objects.equals(IndexFileNames.getExtension(filename), LuceneFilesExtensions.SI.getExtension())) + : "currently only segment info ." + + LuceneFilesExtensions.SI.getExtension() + + " files are expected to be in extra content files, and found " + + cc.extraContent().keySet(); + assert cc.commitFiles().keySet().containsAll(cc.extraContent().keySet()) + : "extra content files " + + cc.extraContent().keySet() + + " must be part of commit files " + + cc.commitFiles().keySet() + + " as they currently consist of replicated referenced files"; + assert cc.hollow() == false + || cc.commitFiles() + .entrySet() + .stream() + .filter( + e -> cc.internalFiles().contains(e.getKey()) == false + && Objects.equals(IndexFileNames.getExtension(e.getKey()), LuceneFilesExtensions.SI.getExtension()) + ) + .allMatch(e -> cc.extraContent().containsKey(e.getKey())) + : "hollow commit referenced .si files in " + + cc.commitFiles().keySet() + + " must be part of extra content files " + + cc.extraContent().keySet(); + + // a blob location that refers to the whole CC, so that we can check that internal and extra content files are in these bounds + final var maxBlobLocation = Stream.concat(cc.commitFiles().entrySet().stream(), cc.extraContent().entrySet().stream()) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (value1, value2) -> value2)) + .values() + .stream() + .max((bl1, bl2) -> { + final var bcc = bl1.getBatchedCompoundCommitTermAndGeneration(); + int cmp = bcc.compareTo(bl2.getBatchedCompoundCommitTermAndGeneration()); + if (cmp != 0) { + return cmp; + } + return Long.compare(bl1.offset(), bl2.offset()); + }) + .get(); + final var ccLocation = new BlobLocation( + maxBlobLocation.blobFile(), + maxBlobLocation.offset() + maxBlobLocation.fileLength() - cc.sizeInBytes(), + cc.sizeInBytes() + ); + assert cc.extraContent().values().stream().allMatch(location -> ccLocation.contains(location)) + : "all extra content files " + + cc.extraContent() + + " must be contained within the compound commit file location " + + ccLocation; + assert cc.internalFiles().stream().allMatch(file -> ccLocation.contains(cc.commitFiles().get(file))) + : "all internal files " + + cc.internalFiles() + + " of commit files " + + cc.commitFiles() + + " must be contained within the compound commit file location " + + ccLocation; + assert Sets.difference(cc.commitFiles().keySet(), cc.internalFiles()) + .stream() + .allMatch(file -> ccLocation.contains(cc.commitFiles().get(file)) == false) + : "all referenced files must be located outside the compound commit file location " + + ccLocation + + " 
but got " + + cc.commitFiles() + + " with " + + cc.internalFiles() + + " internal files"; + } + + // Group the internal data readers by class + final Map, List> internalDataReaderGroups = internalDataReadersByOffset.values() + .stream() + .collect(groupingBy(internalHeaderOrFile -> internalHeaderOrFile.getClass())); + assert internalDataReaderGroups.get(InternalHeaderReader.class).size() == pendingCompoundCommits.size() + : "all pending CCs must have header offsets"; + final Set allExtraContentFiles = pendingCompoundCommits.stream() + .flatMap(pc -> pc.getStatelessCompoundCommit().extraContent().keySet().stream()) + .collect(Collectors.toUnmodifiableSet()); + assert Sets.union(allInternalFiles, allExtraContentFiles) + .equals( + Set.copyOf( + internalDataReaderGroups.get(InternalFileReader.class).stream().map(r -> ((InternalFileReader) r).filename).toList() + ) + ) : "all internal and extra content files must have a corresponding InternalFileReader"; + if (internalDataReaderGroups.containsKey(InternalPaddingReader.class)) { + assert internalDataReaderGroups.get(InternalPaddingReader.class).size() < pendingCompoundCommits.size() + : "paddings " + + internalDataReaderGroups.get(InternalPaddingReader.class).size() + + " are more than pending CCs (excluding the last one) " + + (pendingCompoundCommits.size() - 1); + internalDataReaderGroups.get(InternalPaddingReader.class).forEach(reader -> { + assert reader instanceof InternalPaddingReader; + InternalPaddingReader paddingReader = (InternalPaddingReader) reader; + assert paddingReader.padding < SharedBytes.PAGE_SIZE + : "padding " + paddingReader.padding + " is more than page size " + SharedBytes.PAGE_SIZE; + }); + } + return true; + } + + private ReplicatedContent getReplicatedContent( + PrimaryTermAndGeneration commitTermAndGen, + long currentOffset, + TreeSet internalFiles, + long internalFilesSize, + Directory directory + ) { + // Current position of the start of the commit (relative to the start of the region) + int currentPositionInRegion = (int) (currentOffset % cacheRegionSizeInBytes); + + // Approximate position of the end of the header + replicated content (using the estimated max. 
header size as a hint) + int estimatedHeaderEnd = Math.addExact(currentPositionInRegion, estimatedMaxHeaderSizeInBytes); + + // Approximate position of the end of the commit + var estimatedCommitEnd = Math.addExact(estimatedHeaderEnd, internalFilesSize); + + // Check if the header and internal files completely fit into the same region, in which case there is no need for replicated content + if (estimatedCommitEnd < cacheRegionSizeInBytes) { + logger.trace( + "{} skipping content replication: commit {} at offset [{}][{}] with [{}] bytes of internal files " + + "would fit within the cache region of [{}] bytes assuming an approximate header size of [{}] bytes", + shardId, + commitTermAndGen, + currentOffset, + currentPositionInRegion, + internalFilesSize, + cacheRegionSizeInBytes, + estimatedMaxHeaderSizeInBytes + ); + return ReplicatedContent.EMPTY; + } + + LongPredicate shouldReplicate; + if (estimatedMaxHeaderSizeInBytes != 0) { + // Replicate content for internal files that are not in the same region as the header + shouldReplicate = (offset) -> cacheRegionSizeInBytes <= (estimatedHeaderEnd + offset); + } else { + shouldReplicate = ALWAYS_REPLICATE; // Keep previous behavior + } + return ReplicatedContent.create(true, internalFiles, directory, shouldReplicate); + } + + public BatchedCompoundCommit getFrozenBatchedCompoundCommit() { + assert isFrozen() : "Cannot serialize before freeze"; + assert assertInternalConsistency(); + + List compoundCommits = new ArrayList<>(pendingCompoundCommits.size()); + for (PendingCompoundCommit pendingCompoundCommit : pendingCompoundCommits) { + compoundCommits.add(pendingCompoundCommit.getStatelessCompoundCommit()); + } + return new BatchedCompoundCommit(primaryTermAndGeneration, Collections.unmodifiableList(compoundCommits)); + } + + /** + * Generate an InputStream of the serialized VBCC suitable for upload to blob storage. + *
+ * The InputStream is implemented as a {@link SlicedInputStream} that iterates over a set of InputStreams + * representing VBCC metadata and each of the Lucene files included in the VBCC commits. + *
+ * As each Lucene file InputStream is consumed, it maintains a running checksum of the bytes read, which it compares + * to the checksum in the Lucene file footer when the file substream is closed. Each substream is closed before opening + * the next, according to the contract of {@link SlicedInputStream}. + *
+ * If Lucene file corruption is detected when the file is closed, it will throw a CorruptIndexException that propagates + * up through the SlicedInputStream as it is read. + * + * @return an InputStream of the VBCC, which throws CorruptIndexException on Lucene checksum mismatch, in addition to + * general IOExceptions on IO error. + */ + public InputStream getFrozenInputStreamForUpload() { + assert isFrozen() : "Cannot stream before freeze"; + assert assertInternalConsistency(); + return getInputStreamForUpload(); + } + + InputStream getInputStreamForUpload() { + mustIncRef(); + List offsets = internalDataReadersByOffset.navigableKeySet().stream().collect(Collectors.toUnmodifiableList()); + return wrapForLogging(new SlicedInputStream(offsets.size()) { + @Override + protected InputStream openSlice(int slice) throws IOException { + final var offset = offsets.get(slice); + final var reader = internalDataReadersByOffset.get(offset); + return reader.getInputStream(); + } + + @Override + public void close() throws IOException { + if (isClosed() == false) { + try { + super.close(); + } finally { + decRef(); + } + } + } + }); + } + + public InputStream getFrozenInputStreamForUpload(final long offset, final long length) { + assert isFrozen() : "Cannot stream before freeze"; + assert assertInternalConsistency(); + assert hasReferences(); + + mustIncRef(); + final var slices = internalDataReadersByOffset.subMap( + internalDataReadersByOffset.floorKey(offset), + true, + // could have been offset + length - 1, but we avoid an `if` that we'd + // otherwise need to avoid a NPE for the case of getBytesByRange(0, 0). + internalDataReadersByOffset.floorKey(offset + length), + true + ).entrySet().stream().toList(); + + if (slices.isEmpty()) { + return new ByteArrayInputStream(BytesRef.EMPTY_BYTES); + } + return limitStream(wrapForLogging(new SlicedInputStream(slices.size()) { + + final AtomicBoolean closed = new AtomicBoolean(); + + @Override + protected InputStream openSlice(int n) throws IOException { + var slice = slices.get(n); + long skipBytes = Math.max(0L, offset - slice.getKey()); + assert skipBytes == 0 || n == 0 : "can be non-zero only for the first entry, but got: " + skipBytes + " for slice " + n; + if (skipBytes > 0) { + // make sure that we validate the checksum of any file we reach the end of. To do this we need to read from the + // beginning of this file even when we're starting at an offset. We do want to avoid reading skipped bytes if we are + // not reaching the end of the file in this slice to minimize overread, so we only read skipped bytes if the end of the + // first file in this slice is contained in the slice. + final long chunkEnd = offset + length; + final Long higherKey = internalDataReadersByOffset.higherKey(offset); + final long fileEnd = higherKey == null ? getTotalSizeInBytes() : higherKey; + final boolean overread = fileEnd <= chunkEnd; + var stream = overread ? 
slice.getValue().getInputStream() : slice.getValue().getInputStream(skipBytes, Long.MAX_VALUE); + if (overread) { + stream.skipNBytes(skipBytes); + } + assert stream.markSupported(); + return stream; + } else { + var stream = slice.getValue().getInputStream(); + assert stream.markSupported(); + return stream; + } + } + + @Override + public void close() throws IOException { + if (closed.compareAndSet(false, true)) { + try { + super.close(); + } finally { + decRef(); + } + } + } + }), length); + } + + public String getBlobName() { + return blobFile.blobName(); + } + + public ShardId getShardId() { + return shardId; + } + + public PrimaryTermAndGeneration getPrimaryTermAndGeneration() { + return primaryTermAndGeneration; + } + + @Override + public PrimaryTermAndGeneration primaryTermAndGeneration() { + return getPrimaryTermAndGeneration(); + } + + public long getTotalSizeInBytes() { + return currentOffset.get(); + } + + public Map getInternalLocations() { + return internalLocations; + } + + public long getCreationTimeInMillis() { + return creationTimeInMillis; + } + + public StatelessCompoundCommit lastCompoundCommit() { + assert pendingCompoundCommits.isEmpty() == false; + return pendingCompoundCommits.last().getStatelessCompoundCommit(); + } + + public long getMaxGeneration() { + assert pendingCompoundCommits.isEmpty() == false; + return pendingCompoundCommits.last().getGeneration(); + } + + public PendingCompoundCommit getLastPendingCompoundCommit() { + return pendingCompoundCommits.last(); + } + + @Override + public void close() { + decRef(); + } + + @Override + protected void closeInternal() { + IOUtils.closeWhileHandlingException(pendingCompoundCommits); + } + + public List getPendingCompoundCommits() { + return List.copyOf(pendingCompoundCommits); + } + + // TODO: make package-private ES-13786 + public int size() { + return pendingCompoundCommits.size(); + } + + // TODO: make package-private ES-13786 + public Set getPendingCompoundCommitGenerations() { + return pendingCompoundCommits.stream() + .map(PendingCompoundCommit::getStatelessCompoundCommit) + .map(StatelessCompoundCommit::primaryTermAndGeneration) + .collect(Collectors.toSet()); + } + + // visible for testing + public int getTotalPaddingInBytes() { + return pendingCompoundCommits.stream().mapToInt(pendingCompoundCommit -> pendingCompoundCommit.padding).sum(); + } + + private byte[] materializeCompoundCommitHeader( + StatelessCommitRef reference, + Iterable internalFiles, + InternalFilesReplicatedRanges replicatedRanges, + Map referencedFiles, + boolean useInternalFilesReplicatedContent, + Iterable extraContent, + @Nullable TimestampFieldValueRange timestampFieldValueRange + ) throws IOException { + assert getBlobName() != null; + try (ByteArrayOutputStream os = new ByteArrayOutputStream()) { + var positionTrackingOutputStreamStreamOutput = new PositionTrackingOutputStreamStreamOutput(os); + StatelessCompoundCommit.writeXContentHeader( + shardId, + reference.getGeneration(), + reference.getPrimaryTerm(), + reference.isHollow() ? "" : nodeEphemeralId, + reference.getTranslogRecoveryStartFile(), + timestampFieldValueRange, + referencedFiles, + internalFiles, + replicatedRanges, + positionTrackingOutputStreamStreamOutput, + useInternalFilesReplicatedContent, + extraContent + ); + return os.toByteArray(); + } + } + + // TODO: make package-private ES-13786 + public BlobLocation getBlobLocation(String fileName) { + var internalLocation = internalLocations.get(fileName); + return internalLocation == null ? 
uploadedBlobLocationsSupplier.apply(fileName) : internalLocation; + } + + /** + * Get the bytes of the virtual batched compound commit by reading the internal files (headers and internal files of pending + * compound commits) in the given range. + * @param offset the offset in the virtual batched compound commit to start reading internal files + * @param length the length of the range to read + * @param output the output to write the bytes to + * @throws IOException + */ + public void getBytesByRange(final long offset, final long length, final OutputStream output) throws IOException { + assert offset >= 0; + assert length >= 0 : "invalid length " + length; + assert offset + length <= currentOffset.get() : "range [" + offset + ", " + length + "] more than " + currentOffset.get(); + assert ThreadPool.assertCurrentThreadPool( + StatelessPlugin.GET_VIRTUAL_BATCHED_COMPOUND_COMMIT_CHUNK_THREAD_POOL, + StatelessPlugin.SHARD_WRITE_THREAD_POOL, + StatelessPlugin.PREWARM_THREAD_POOL, + StatelessPlugin.UPLOAD_PREWARM_THREAD_POOL + ); + + if (tryIncRef()) { + try { + NavigableMap subMap = internalDataReadersByOffset.subMap( + internalDataReadersByOffset.floorKey(offset), + true, + // could have been offset + length - 1, but we avoid an `if` that we'd + // otherwise need to avoid a NPE for the case of getBytesByRange(0, 0). + internalDataReadersByOffset.floorKey(offset + length), + true + ); + long remainingBytesToRead = length; + for (var entry : subMap.entrySet()) { + if (remainingBytesToRead <= 0) { + break; + } + InternalDataReader internalDataReader = entry.getValue(); + long skipBytes = Math.max(0, offset - entry.getKey()); // can be non-zero only for the first entry + try (var inputStream = internalDataReader.getInputStream(skipBytes, remainingBytesToRead)) { + long bytesRead = Streams.copy(inputStream, output, false); + remainingBytesToRead -= bytesRead; + } + } + assert remainingBytesToRead == 0 : "remaining bytes to read " + remainingBytesToRead; + } finally { + decRef(); + } + } else { + throw buildResourceNotFoundException(shardId, primaryTermAndGeneration); + } + } + + // TODO: make package-private ES-13786 + public boolean assertSameNodeEphemeralId(String id) { + assert id.equals(nodeEphemeralId) : id + " != " + nodeEphemeralId; + return true; + } + + @Override + public String toString() { + return "VirtualBatchedCompoundCommit{" + + "shardId=" + + shardId + + ", primaryTermAndGeneration=" + + primaryTermAndGeneration + + ", size=" + + size() + + ", nodeEphemeralId='" + + nodeEphemeralId + + '\'' + + ", creationTimeInMillis=" + + creationTimeInMillis + + ", frozen=" + + frozen + + '}'; + } + + public static ResourceNotFoundException buildResourceNotFoundException( + ShardId shardId, + PrimaryTermAndGeneration primaryTermAndGeneration + ) { + return new ResourceNotFoundException("BCC for shard " + shardId + " and " + primaryTermAndGeneration + " is already uploaded"); + } + + private boolean assertCompareAndSetFreezeOrAppendingCommitThread(Thread current, Thread updated) { + final Thread witness = appendingCommitThread.compareAndExchange(current, updated); + assert witness == current + : "Unable to set appending commit thread to [" + + updated + + "]: expected thread [" + + current + + "] to be the appending commit thread, but thread " + + witness + + " is already appending a commit to " + + getBlobName(); + return true; + } + + // TODO: make package-private ES-13786 + public static class PendingCompoundCommit implements Closeable, Comparable { + private final int headerSize; + private 
final StatelessCommitRef reference; + private final StatelessCompoundCommit statelessCompoundCommit; + private final long maxSeqNo; + // No need to be volatile because writing is synchronized at higher level in StatelessCommitService + // and reading is dispatched to another thread after a second synchronization + private int padding = 0; + + /** + * Creates a new pending to upload compound commit. Note that the last pending compound commit should not have padding. The + * padding is added to the previous pending compound commit when appending a new pending compound commit. + * @param headerSize the size of materialized compound commit header + * @param reference the lucene commit reference + * @param statelessCompoundCommit the associated compound commit that will be uploaded + */ + PendingCompoundCommit( + int headerSize, + StatelessCommitRef reference, + StatelessCompoundCommit statelessCompoundCommit, + long maxSeqNo + ) { + this.headerSize = headerSize; + this.reference = reference; + this.statelessCompoundCommit = statelessCompoundCommit; + this.maxSeqNo = maxSeqNo; + assert statelessCompoundCommit.hollow() == reference.isHollow() + : "stateless compound commit hollow flag [" + + statelessCompoundCommit.hollow() + + "] does not match a hollow reference [" + + reference.isHollow() + + "]"; + } + + void setPadding(int padding) { + this.padding = padding; + assert padding >= 0 : "padding " + padding + " is negative"; + } + + public long getGeneration() { + return reference.getGeneration(); + } + + // TODO: make package-private ES-13786 + public long getMaxSeqNo() { + return maxSeqNo; + } + + // TODO: make package-private ES-13786 + public StatelessCommitRef getCommitReference() { + return reference; + } + + /** + * the size of the compound commit including codec, header, checksums, all files, and padding + * Note that the last pending compound commit should not have padding. The padding is added to the previous pending compound commit + * when appending a new pending compound commit. + */ + public long getSizeInBytes() { + return statelessCompoundCommit.sizeInBytes() + padding; + } + + public StatelessCompoundCommit getStatelessCompoundCommit() { + return statelessCompoundCommit; + } + + // package-private for testing + long getHeaderSize() { + return headerSize; + } + + @Override + public int compareTo(PendingCompoundCommit o) { + return Long.compare(getGeneration(), o.getGeneration()); + } + + @Override + public void close() throws IOException { + logger.debug( + "{} releasing Lucene commit [term={}, gen={}]", + statelessCompoundCommit.shardId(), + reference.getPrimaryTerm(), + reference.getGeneration() + ); + reference.close(); + } + } + + /** + * Interface for reading internal data from a batched compound commit + */ + interface InternalDataReader { + /** + * Get the {@link InputStream} for reading the internal data. + * @param offset the number of bytes to skip in the internal data before starting to read the internal data. + * @param length the max number of bytes to read. ineffective if larger than the remaining available size of the internal data. + */ + InputStream getInputStream(long offset, long length) throws IOException; + + /** + * Get the {@link InputStream} for reading the entire contents of the contained file. + * @return An input stream that will read the entire contents of the file. 
+ * @throws IOException + */ + InputStream getInputStream() throws IOException; + } + + /** + * Internal data reader for header bytes + */ + private record InternalHeaderReader(byte[] header) implements InternalDataReader { + @Override + public InputStream getInputStream(long offset, long length) throws IOException { + var stream = new ByteArrayInputStream(header); + stream.skipNBytes(offset); + return limitStream(stream, length); + } + + @Override + public InputStream getInputStream() { + return new ByteArrayInputStream(header); + } + } + + /** + * Internal data reader for an internal file + */ + private record InternalFileReader(String filename, Directory directory) implements InternalDataReader { + @Override + public InputStream getInputStream(long offset, long length) throws IOException { + long fileLength = directory.fileLength(filename); + assert offset < fileLength : "offset [" + offset + "] more than file length [" + fileLength + "]"; + long fileBytesToRead = Math.min(length, fileLength - offset); + var ioContext = filename.startsWith(IndexFileNames.SEGMENTS) ? IOContext.READONCE : IOContext.DEFAULT; + IndexInput input = directory.openInput(filename, ioContext); + try { + input.seek(offset); + return new InputStreamIndexInput(input, fileBytesToRead) { + @Override + public void close() throws IOException { + IOUtils.close(super::close, input); + } + }; + } catch (IOException e) { + IOUtils.closeWhileHandlingException(input); + throw e; + } + } + + /** + * Produce an input stream for an index file that updates a running checksum as it is read, and validates it against the Lucene + * footer when it is read to the end. + * @return an input stream for this instance's filename + * @throws IOException on an IO error opening the file (and the stream may throw CorruptIndexException on read) + */ + @Override + public InputStream getInputStream() throws IOException { + Store.VerifyingIndexInput input = new Store.VerifyingIndexInput(directory.openInput(filename, IOContext.READONCE)); + logger.trace("opening validating input for {}", filename); + + return new InputStreamIndexInput(input, input.length()) { + @Override + public int read() throws IOException { + int ret = super.read(); + verifyAtEnd(); + return ret; + } + + @Override + public int read(byte[] b, int off, int len) throws IOException { + int ret = super.read(b, off, len); + verifyAtEnd(); + return ret; + } + + @Override + public void close() throws IOException { + IOUtils.close(super::close, input); + } + + void verifyAtEnd() throws IOException { + if (input.getFilePointer() == input.length()) { + input.verify(); + } + } + }; + } + } + + /** + * Internal data reader for padding bytes + */ + private record InternalPaddingReader(int padding) implements InternalDataReader { + + public InternalPaddingReader { + assert padding <= SharedBytes.PAGE_SIZE : "padding " + padding + " is more than page size " + SharedBytes.PAGE_SIZE; + } + + private static final byte[] PADDING_BYTES; + + static { + byte[] padding = new byte[SharedBytes.PAGE_SIZE]; + Arrays.fill(padding, (byte) 0); + PADDING_BYTES = padding; + } + + @Override + public InputStream getInputStream(long offset, long length) { + assert offset < padding : "offset [" + offset + "] more than padding length [" + padding + "]"; + int paddingBytesToRead = BlobCacheUtils.toIntBytes(Math.min(length, padding - offset)); + return limitStream(new ByteArrayInputStream(PADDING_BYTES), paddingBytesToRead); + } + + @Override + public InputStream getInputStream() { + return getInputStream(0L, 
padding); + } + } + + private InputStream wrapForLogging(InputStream stream) { + if (LOG_TIME_SPENT_READING_DURING_UPLOAD.isDebugEnabled()) { + return new LogTimeSpentReadingInputStream(stream, shardId, primaryTermAndGeneration); + } else { + return stream; + } + } + + /** + * {@link FilterInputStream} that tracks the time spent reading from the delegating input stream. + */ + private static class LogTimeSpentReadingInputStream extends FilterInputStream { + + private final ShardId shardId; + private final PrimaryTermAndGeneration primaryTermAndGeneration; + private long elapsedNanos; + private long bytes; + + LogTimeSpentReadingInputStream(InputStream in, ShardId shardId, PrimaryTermAndGeneration primaryTermAndGeneration) { + super(in); + this.shardId = shardId; + this.primaryTermAndGeneration = primaryTermAndGeneration; + } + + @Override + public int read() throws IOException { + long startTime = System.nanoTime(); + int result = super.read(); + elapsedNanos += (System.nanoTime() - startTime); + if (result != -1) { + bytes += 1L; + } + return result; + } + + @Override + public int read(byte[] b, int off, int len) throws IOException { + long startTime = System.nanoTime(); + var result = super.read(b, off, len); + elapsedNanos += (System.nanoTime() - startTime); + if (result != -1) { + bytes += result; + } + return result; + } + + @Override + public void close() throws IOException { + try { + super.close(); + } finally { + LOG_TIME_SPENT_READING_DURING_UPLOAD.debug( + "{} spent [{}] ms reading [{}] bytes from VBCC {} during upload", + shardId, + TimeValue.nsecToMSec(elapsedNanos), + bytes, + primaryTermAndGeneration + ); + } + } + } +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/engine/NewCommitNotification.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/engine/NewCommitNotification.java new file mode 100644 index 0000000000000..486f5bdb53731 --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/engine/NewCommitNotification.java @@ -0,0 +1,52 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.stateless.engine; + +import org.elasticsearch.core.Nullable; +import org.elasticsearch.xpack.stateless.commits.StatelessCompoundCommit; + +/** + * Notification of a newly created compound commit. + * + * @param compoundCommit The new compound commit + * @param batchedCompoundCommitGeneration The generation of the BCC that contains the CC. The BCC's primary term is the + * same as the CC's primary. + * @param latestUploadedBatchedCompoundCommitTermAndGen The generation of latest uploaded BCC. It is null if no upload has happened. 
+ * @param clusterStateVersion The cluster state version on the node at the time the new compound commit was + * notified + * @param nodeId The id of the node that notified the new compound commit + */ +public record NewCommitNotification( + StatelessCompoundCommit compoundCommit, + long batchedCompoundCommitGeneration, + @Nullable PrimaryTermAndGeneration latestUploadedBatchedCompoundCommitTermAndGen, + long clusterStateVersion, + String nodeId +) { + public boolean isBatchedCompoundCommitUploaded() { + return latestUploadedBatchedCompoundCommitTermAndGen != null + && latestUploadedBatchedCompoundCommitTermAndGen.generation() == batchedCompoundCommitGeneration(); + } + + @Override + public String toString() { + return "NewCommitNotification{" + + "compoundCommit=" + + compoundCommit.toShortDescription() + + ", batchedCompoundCommitGeneration=" + + batchedCompoundCommitGeneration + + ", latestUploadedBatchedCompoundCommitTermAndGen=" + + latestUploadedBatchedCompoundCommitTermAndGen + + ", clusterStateVersion=" + + clusterStateVersion + + ", nodeId='" + + nodeId + + '\'' + + '}'; + } +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/engine/PrimaryTermAndGeneration.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/engine/PrimaryTermAndGeneration.java new file mode 100644 index 0000000000000..8e5c27f565d3f --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/engine/PrimaryTermAndGeneration.java @@ -0,0 +1,63 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.stateless.engine; + +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.io.stream.Writeable; + +import java.io.IOException; +import java.util.Comparator; + +public record PrimaryTermAndGeneration(long primaryTerm, long generation) implements Writeable, Comparable { + + private static final Comparator COMPARATOR = Comparator.comparing(PrimaryTermAndGeneration::primaryTerm) + .thenComparing(PrimaryTermAndGeneration::generation); + + public static final PrimaryTermAndGeneration ZERO = new PrimaryTermAndGeneration(0, 0); + + public PrimaryTermAndGeneration(StreamInput in) throws IOException { + this(in.readVLong(), in.readVLong()); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeVLong(primaryTerm); + out.writeVLong(generation); + } + + @Override + public String toString() { + return "[term=" + primaryTerm + ", gen=" + generation + ']'; + } + + @Override + public int compareTo(PrimaryTermAndGeneration other) { + return COMPARATOR.compare(this, other); + } + + public boolean after(PrimaryTermAndGeneration other) { + return compareTo(other) > 0; + } + + public boolean onOrAfter(PrimaryTermAndGeneration other) { + return compareTo(other) >= 0; + } + + public boolean before(PrimaryTermAndGeneration other) { + return compareTo(other) < 0; + } + + public boolean onOrBefore(PrimaryTermAndGeneration other) { + return compareTo(other) <= 0; + } + + public static PrimaryTermAndGeneration max(PrimaryTermAndGeneration a, PrimaryTermAndGeneration b) { + return a.onOrAfter(b) ? 
a : b; + } +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/lucene/FileCacheKey.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/lucene/FileCacheKey.java new file mode 100644 index 0000000000000..6f96afcef0761 --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/lucene/FileCacheKey.java @@ -0,0 +1,17 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.stateless.lucene; + +import org.elasticsearch.blobcache.shared.SharedBlobCacheService; +import org.elasticsearch.index.shard.ShardId; + +public record FileCacheKey(ShardId shardId, long primaryTerm, String fileName) implements SharedBlobCacheService.KeyBase { + public FileCacheKey { + assert shardId != null; + } +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/lucene/StatelessCommitRef.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/lucene/StatelessCommitRef.java new file mode 100644 index 0000000000000..13af146e23eda --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/lucene/StatelessCommitRef.java @@ -0,0 +1,132 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.stateless.lucene; + +import org.elasticsearch.common.lucene.FilterIndexCommit; +import org.elasticsearch.index.engine.Engine; +import org.elasticsearch.index.shard.ShardId; + +import java.io.Closeable; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Collection; +import java.util.Objects; +import java.util.Set; +import java.util.concurrent.atomic.AtomicBoolean; + +import static org.elasticsearch.xpack.stateless.commits.StatelessCompoundCommit.HOLLOW_TRANSLOG_RECOVERY_START_FILE; + +/** + * StatelessCommitRef is a wrapper around a Lucene commit that contains additional information, like the new files introduced by this commit + * or the starting point of the translog to use to recover the commit. Closing a StatelessCommitRef may trigger the deletion of the + * underlying Lucene commit. + */ +public class StatelessCommitRef extends FilterIndexCommit implements Closeable { + + public static final String TRANSLOG_CARRY_OVER = "translog_carry_over"; + + private final ShardId shardId; + private final Engine.IndexCommitRef indexCommitRef; + private final Set additionalFiles; + private final AtomicBoolean released; + private final long primaryTerm; + // The translog recovery start file is encoded in the commit user data and in the CC header, and is used to pinpoint the starting + // translog compound file number to start scanning from for recovering operations indexed after the commit. It takes a special value + // of {@link #HOLLOW_TRANSLOG_RECOVERY_START_FILE} to indicate that the commit is hollow and has no translog to recover from. + private final long translogRecoveryStartFile; + // The translog release end file is used so that the TranslogReplicator can release any translog files before this one. 
+ private final long translogReleaseEndFile; + private final boolean carryOverTranslog; + + public StatelessCommitRef( + ShardId shardId, + Engine.IndexCommitRef indexCommitRef, + Set additionalFiles, + long primaryTerm, + long translogRecoveryStartFile, + long translogReleaseEndFile + ) { + super(indexCommitRef.getIndexCommit()); + this.shardId = Objects.requireNonNull(shardId); + this.indexCommitRef = indexCommitRef; + this.additionalFiles = Objects.requireNonNull(additionalFiles); + this.primaryTerm = primaryTerm; + this.translogRecoveryStartFile = translogRecoveryStartFile; + this.translogReleaseEndFile = translogReleaseEndFile; + try { + this.carryOverTranslog = indexCommitRef.getIndexCommit().getUserData().containsKey(TRANSLOG_CARRY_OVER); + } catch (IOException e) { + assert false : e; // should never happen, none of the Lucene implementations throw this. + throw new UncheckedIOException(e); + } + this.released = new AtomicBoolean(); + assert translogReleaseEndFile < 0 + || translogRecoveryStartFile == translogReleaseEndFile + || translogRecoveryStartFile == HOLLOW_TRANSLOG_RECOVERY_START_FILE + : "translog start file for cleaning (" + + translogReleaseEndFile + + ") must be the same as translog recovery start file (" + + translogRecoveryStartFile + + ") for non-hollow commits or negative (ineffective)"; + assert translogReleaseEndFile != HOLLOW_TRANSLOG_RECOVERY_START_FILE + : translogReleaseEndFile + " == " + HOLLOW_TRANSLOG_RECOVERY_START_FILE; + } + + public long getPrimaryTerm() { + return primaryTerm; + } + + public Collection getCommitFiles() { + try { + return getFileNames(); + } catch (IOException e) { + assert false : e; // should never happen, none of the Lucene implementations throw this. + throw new UncheckedIOException(e); + } + } + + public Set getAdditionalFiles() { + return additionalFiles; + } + + @Override + public void close() throws IOException { + if (released.compareAndSet(false, true)) { + indexCommitRef.close(); + } + } + + public ShardId getShardId() { + return shardId; + } + + public long getTranslogRecoveryStartFile() { + return translogRecoveryStartFile; + } + + public long getTranslogReleaseEndFile() { + return translogReleaseEndFile; + } + + public boolean isHollow() { + return getTranslogRecoveryStartFile() == HOLLOW_TRANSLOG_RECOVERY_START_FILE; + } + + /** + * Signals that translog data like nodeEphemeralId in this commit should be carried over from the currently recovered commit. + * This is needed when performing a flush during translog replay. + */ + public boolean carryOverTranslog() { + return carryOverTranslog; + } + + @Override + public String toString() { + return "StatelessCommitRef(" + shardId + ',' + primaryTerm + "," + in.toString() + ')'; + } +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/utils/IndexingShardRecoveryComparator.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/utils/IndexingShardRecoveryComparator.java new file mode 100644 index 0000000000000..c9e66274dea2e --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/utils/IndexingShardRecoveryComparator.java @@ -0,0 +1,108 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0.
+ */ + +package org.elasticsearch.xpack.stateless.utils; + +import org.apache.lucene.index.IndexFileNames; +import org.elasticsearch.index.store.LuceneFilesExtensions; +import org.elasticsearch.xpack.stateless.commits.StatelessCompoundCommit; + +import java.util.Comparator; + +/** + * Order commit files in an optimized order for indexing shard recoveries + */ +public class IndexingShardRecoveryComparator implements Comparator { + + @Override + public int compare(String fileName1, String fileName2) { + // The segment_N file is usually the first file Lucene reads, so it is always prewarmed first. + boolean segments1 = fileName1.startsWith(IndexFileNames.SEGMENTS); + boolean segments2 = fileName2.startsWith(IndexFileNames.SEGMENTS); + var compare = Boolean.compare(segments2, segments1); + if (compare != 0) { + return compare; + } + + // Lucene then usually reads segment core info files (.si), so we prioritize them over other types of files. + var si = LuceneFilesExtensions.SI.getExtension(); + boolean segmentInfo1 = IndexFileNames.matchesExtension(fileName1, si); + boolean segmentInfo2 = IndexFileNames.matchesExtension(fileName2, si); + compare = Boolean.compare(segmentInfo2, segmentInfo1); + if (compare != 0) { + return compare; + } + // Special case of two .si files: we sort them by segment names + if (segmentInfo1 && segmentInfo2) { + return IndexFileNames.parseSegmentName(fileName1).compareTo(IndexFileNames.parseSegmentName(fileName2)); + } + + // Lucene usually reads generational files when opening the IndexWriter + var isGenerationalFile1 = StatelessCompoundCommit.isGenerationalFile(fileName1); + var isGenerationalFile2 = StatelessCompoundCommit.isGenerationalFile(fileName2); + compare = Boolean.compare(isGenerationalFile2, isGenerationalFile1); + if (compare != 0) { + return compare; + } + + // Lucene loads a global field map when initializing the IndexWriter, so we want to prewarm .fnm files before other types of + // files. + var fnm = LuceneFilesExtensions.FNM.getExtension(); + boolean fields1 = IndexFileNames.matchesExtension(fileName1, fnm); + boolean fields2 = IndexFileNames.matchesExtension(fileName2, fnm); + compare = Boolean.compare(fields2, fields1); + if (compare != 0) { + return compare; + } + + // Special case of two .fnm files: we sort them by generation in the next step + + // Lucene usually parses segment core info files (.si) in the order they are serialized in the segment_N file. We don't have + // this exact order today (but we could add this information to the compound commit blob in the future) so we use the segment + // names (parsed as longs) to order them. + var segmentName1 = isGenerationalFile1 ? IndexFileNames.parseGeneration(fileName1) : Long.MAX_VALUE; + var segmentName2 = isGenerationalFile2 ? IndexFileNames.parseGeneration(fileName2) : Long.MAX_VALUE; + compare = Long.compare(segmentName1, segmentName2); + if (compare != 0) { + return compare; + } + + // Files belonging to the same segment core are sorted in a pre-defined order (see #getExtensionOrder) + var extension1 = getExtensionOrder(fileName1); + var extension2 = getExtensionOrder(fileName2); + compare = Integer.compare(extension1, extension2); + if (compare != 0) { + return compare; + } + // Natural ordering as last resort + return fileName1.compareTo(fileName2); + } + + private static int getExtensionOrder(String fileName) { + var ext = LuceneFilesExtensions.fromFile(fileName); + assert ext != null || fileName.startsWith(IndexFileNames.SEGMENTS) : fileName; + if (ext == null) { + return 0; + } + // basically the order in which files are accessed when SegmentCoreReaders and SegmentReader are instantiated + return switch (ext) { + case SI -> 0; + case FNM -> 1; + case CFE, CFS -> 2; + case BFM, BFI, DOC, POS, PAY, CMP, LKP, TMD, TIM, TIP -> 3; + case NVM, NVD -> 4; + case FDM, FDT, FDX -> 5; + case TVM, TVD, TVX, TVF -> 6; + case KDM, KDI, KDD, DIM, DII -> 7; + case VEC, VEX, VEM, VEMF, VEMQ, VEQ -> 8; + case LIV -> 9; + case DVM, DVD -> 10; + default -> Integer.MAX_VALUE; + }; + } + +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/utils/TransferableCloseables.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/utils/TransferableCloseables.java new file mode 100644 index 0000000000000..d6109fd46236b --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/utils/TransferableCloseables.java @@ -0,0 +1,46 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.stateless.utils; + +import org.elasticsearch.core.IOUtils; + +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * Encapsulates a common pattern of trying to open a bunch of resources and then transferring ownership elsewhere on success, + * but closing them on failure. + */ +public class TransferableCloseables implements Closeable { + + private boolean transferred = false; + private final List closeables = new ArrayList<>(); + + public T add(T releasable) { + assert transferred == false : "already transferred"; + closeables.add(releasable); + return releasable; + } + + public Closeable transfer() { + assert transferred == false : "already transferred"; + transferred = true; + Collections.reverse(closeables); + return () -> IOUtils.close(closeables); + } + + @Override + public void close() throws IOException { + if (transferred == false) { + IOUtils.close(closeables); + } + } +} diff --git a/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/utils/WaitForVersion.java b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/utils/WaitForVersion.java new file mode 100644 index 0000000000000..3a9e7cf599691 --- /dev/null +++ b/x-pack/plugin/stateless/src/main/java/org/elasticsearch/xpack/stateless/utils/WaitForVersion.java @@ -0,0 +1,54 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements.
Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.stateless.utils; + +import org.elasticsearch.common.util.concurrent.ConcurrentCollections; + +import java.util.Map; +import java.util.concurrent.atomic.AtomicLong; +import java.util.function.LongPredicate; + +/** + * A utility to wait for a version to be available, assuming a monotonically increasing versioning scheme like cluster state. + */ +public class WaitForVersion { + private final Map waiters = ConcurrentCollections.newConcurrentMap(); + private final AtomicLong lastProcessedVersion = new AtomicLong(-1); + + public void waitUntilVersion(long version, Runnable action) { + if (version <= lastProcessedVersion.get()) { + action.run(); + } else { + waiters.put(value -> value <= lastProcessedVersion.get(), action); + long versionProcessedAfter = lastProcessedVersion.get(); + if (version <= versionProcessedAfter) { + retest(versionProcessedAfter); + } + } + } + + public void notifyVersionProcessed(long versionProcessed) { + if (versionProcessed > lastProcessedVersion.get()) { + long result = lastProcessedVersion.accumulateAndGet(versionProcessed, Math::max); + if (result == versionProcessed) { + retest(versionProcessed); + } + } + } + + private void retest(long versionProcessed) { + waiters.keySet().forEach(key -> { + if (key.test(versionProcessed)) { + Runnable action = waiters.remove(key); + if (action != null) { + action.run(); + } + } + }); + } +} diff --git a/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/action/NewCommitNotificationRequestTests.java b/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/action/NewCommitNotificationRequestTests.java new file mode 100644 index 0000000000000..c2bb6b0882a8f --- /dev/null +++ b/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/action/NewCommitNotificationRequestTests.java @@ -0,0 +1,287 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.stateless.action; + +import org.elasticsearch.action.ActionRequestValidationException; +import org.elasticsearch.cluster.routing.IndexShardRoutingTable; +import org.elasticsearch.cluster.routing.ShardRoutingState; +import org.elasticsearch.cluster.routing.TestShardRouting; +import org.elasticsearch.common.io.stream.Writeable; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.shard.ShardId; +import org.elasticsearch.test.AbstractWireSerializingTestCase; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.xpack.stateless.commits.InternalFilesReplicatedRanges; +import org.elasticsearch.xpack.stateless.commits.StatelessCompoundCommit; +import org.elasticsearch.xpack.stateless.engine.PrimaryTermAndGeneration; +import org.junit.Before; + +import java.io.IOException; +import java.util.Map; +import java.util.Set; + +import static org.elasticsearch.xpack.stateless.commits.StatelessCompoundCommitTestUtils.randomCompoundCommit; +import static org.hamcrest.Matchers.allOf; +import static org.hamcrest.Matchers.containsString; +import static org.hamcrest.Matchers.is; +import static org.hamcrest.Matchers.nullValue; + +public class NewCommitNotificationRequestTests extends AbstractWireSerializingTestCase { + + private IndexShardRoutingTable indexShardRoutingTable; + + @Override + @Before + public void setUp() throws Exception { + super.setUp(); + indexShardRoutingTable = randomIndexShardRoutingTable(); + } + + @Override + protected NewCommitNotificationRequest createTestInstance() { + final NewCommitNotificationRequest request = randomRequest(); + assertThat(randomRequest().validate(), nullValue()); + return request; + } + + @Override + protected NewCommitNotificationRequest mutateInstance(NewCommitNotificationRequest instance) throws IOException { + final StatelessCompoundCommit compoundCommit = instance.getCompoundCommit(); + + final var i = between(0, 3); + return switch (i) { + // Mutate CC's primary and generation + case 0 -> { + final var ccTermAndGen = compoundCommit.primaryTermAndGeneration(); + final var newCcTermAndGen = randomValueOtherThan( + ccTermAndGen, + () -> new PrimaryTermAndGeneration( + ccTermAndGen.primaryTerm() + between(-5, 5), + ccTermAndGen.generation() + between(-5, 5) + ) + ); + yield new NewCommitNotificationRequest( + indexShardRoutingTable, + new StatelessCompoundCommit( + compoundCommit.shardId(), + newCcTermAndGen, + compoundCommit.translogRecoveryStartFile(), + compoundCommit.nodeEphemeralId(), + compoundCommit.commitFiles(), + compoundCommit.sizeInBytes(), + compoundCommit.internalFiles(), + compoundCommit.headerSizeInBytes(), + compoundCommit.internalFilesReplicatedRanges(), + compoundCommit.extraContent(), + compoundCommit.timestampFieldValueRange() + ), + newCcTermAndGen.generation(), + instance.getLatestUploadedBatchedCompoundCommitTermAndGen(), + instance.getClusterStateVersion(), + instance.getNodeId() + ); + } + // Mutate latest uploaded BCC's primary and generation + case 1 -> { + final var bccGeneration = instance.getBatchedCompoundCommitGeneration(); + final var uploadedBccTermAndGen = instance.getLatestUploadedBatchedCompoundCommitTermAndGen(); + final var newUploadedBccTermAndGen = randomValueOtherThan( + uploadedBccTermAndGen, + () -> randomFrom( + new PrimaryTermAndGeneration(randomLongBetween(1, compoundCommit.primaryTerm() - 1), randomLongBetween(1, 100)), + new PrimaryTermAndGeneration(compoundCommit.primaryTerm(), randomLongBetween(1, bccGeneration - 1)), + null // for new 
shards where uploads are yet to happen + ) + ); + yield new NewCommitNotificationRequest( + indexShardRoutingTable, + compoundCommit, + bccGeneration, + newUploadedBccTermAndGen, + instance.getClusterStateVersion(), + instance.getNodeId() + ); + } + // Mutate cluster state version + case 2 -> new NewCommitNotificationRequest( + indexShardRoutingTable, + compoundCommit, + instance.getBatchedCompoundCommitGeneration(), + instance.getLatestUploadedBatchedCompoundCommitTermAndGen(), + randomValueOtherThan(instance.getClusterStateVersion(), ESTestCase::randomNonNegativeLong), + instance.getNodeId() + ); + // Mutate node id + case 3 -> new NewCommitNotificationRequest( + indexShardRoutingTable, + compoundCommit, + instance.getBatchedCompoundCommitGeneration(), + instance.getLatestUploadedBatchedCompoundCommitTermAndGen(), + instance.getClusterStateVersion(), + randomValueOtherThan(instance.getNodeId(), ESTestCase::randomIdentifier) + ); + default -> throw new IllegalArgumentException("Unexpected value " + i); + }; + } + + @Override + protected Writeable.Reader instanceReader() { + return NewCommitNotificationRequest::new; + } + + public void testValidationErrors() { + final long primaryTerm = randomLongBetween(1, 42); + final long generation = randomLongBetween(1, 100); + final StatelessCompoundCommit compoundCommit = new StatelessCompoundCommit( + indexShardRoutingTable.shardId(), + new PrimaryTermAndGeneration(primaryTerm, generation), + randomLongBetween(1L, Long.MAX_VALUE - 1L), + randomUUID(), + Map.of(), + randomLongBetween(10, 100), + Set.of(), + randomNonNegativeLong(), + InternalFilesReplicatedRanges.EMPTY, + Map.of(), + null + ); + + final var request1 = new NewCommitNotificationRequest( + indexShardRoutingTable, + compoundCommit, + generation + 1, + new PrimaryTermAndGeneration(primaryTerm, generation + 2), + randomNonNegativeLong(), + randomIdentifier() + ); + + final ActionRequestValidationException validationException1 = request1.validate(); + assertThat( + validationException1.getMessage(), + allOf( + containsString( + "compound commit generation [" + generation + "] < batched compound commit generation [" + (generation + 1) + "]" + ), + containsString( + "batched compound commit generation [" + + (generation + 1) + + "] < latest uploaded batched compound commit generation [" + + (generation + 2) + + "]" + ) + ) + ); + + final var request2 = new NewCommitNotificationRequest( + indexShardRoutingTable, + compoundCommit, + generation, + new PrimaryTermAndGeneration(primaryTerm + 1, generation), + randomNonNegativeLong(), + randomIdentifier() + ); + + final ActionRequestValidationException validationException2 = request2.validate(); + assertThat( + validationException2.getMessage(), + containsString( + "batched compound commit primary term [" + + (primaryTerm) + + "] < latest uploaded batched compound commit primary term [" + + (primaryTerm + 1) + + "]" + ) + ); + } + + public void testIsUpload() { + final long primaryTerm = randomLongBetween(10, 42); + final long generation = randomLongBetween(10, 100); + final long bccGeneration = randomLongBetween(5, generation); + final StatelessCompoundCommit statelessCompoundCommit = randomCompoundCommit( + indexShardRoutingTable.shardId(), + new PrimaryTermAndGeneration(primaryTerm, generation) + ); + final long clusterStateVersion = randomNonNegativeLong(); + final String nodeId = randomIdentifier(); + + var request = new NewCommitNotificationRequest( + indexShardRoutingTable, + statelessCompoundCommit, + bccGeneration, + null, + 
clusterStateVersion, + nodeId + ); + assertThat(request.toString(), request.isUploaded(), is(false)); + + request = new NewCommitNotificationRequest( + indexShardRoutingTable, + statelessCompoundCommit, + bccGeneration, + new PrimaryTermAndGeneration(randomLongBetween(1, primaryTerm), randomLongBetween(1, bccGeneration - 1)), + clusterStateVersion, + nodeId + ); + assertThat(request.toString(), request.isUploaded(), is(false)); + + request = new NewCommitNotificationRequest( + indexShardRoutingTable, + statelessCompoundCommit, + bccGeneration, + new PrimaryTermAndGeneration(primaryTerm, bccGeneration), + clusterStateVersion, + nodeId + ); + assertThat(request.toString(), request.isUploaded(), is(true)); + } + + public static IndexShardRoutingTable randomIndexShardRoutingTable() { + final var shardId = new ShardId(new Index(randomIdentifier(), randomUUID()), between(0, 3)); + final var shardRouting = TestShardRouting.newShardRouting(shardId, null, true, ShardRoutingState.UNASSIGNED); + final var builder = new IndexShardRoutingTable.Builder(shardId); + builder.addShard(shardRouting); + return builder.build(); + } + + private NewCommitNotificationRequest randomRequest() { + if (randomBoolean()) { + return randomRequestWithSingleCC(); + } + final long primaryTerm = randomLongBetween(10, 42); + final long generation = randomLongBetween(10, 100); + final long bccGeneration = randomLongBetween(5, generation); + + return new NewCommitNotificationRequest( + indexShardRoutingTable, + randomCompoundCommit(indexShardRoutingTable.shardId(), new PrimaryTermAndGeneration(primaryTerm, generation)), + bccGeneration, + randomFrom( + new PrimaryTermAndGeneration(primaryTerm - between(1, 9), randomLongBetween(1, 100)), + new PrimaryTermAndGeneration(primaryTerm, bccGeneration - between(0, 4)), + null // for new shards where uploads are yet to happen + ), + randomNonNegativeLong(), + randomIdentifier() + ); + } + + private NewCommitNotificationRequest randomRequestWithSingleCC() { + final long primaryTerm = randomLongBetween(10, 42); + final long generation = randomLongBetween(10, 100); + + return new NewCommitNotificationRequest( + indexShardRoutingTable, + randomCompoundCommit(indexShardRoutingTable.shardId(), new PrimaryTermAndGeneration(primaryTerm, generation)), + generation, + new PrimaryTermAndGeneration(primaryTerm, generation), + randomNonNegativeLong(), + randomIdentifier() + ); + } +} diff --git a/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/action/NewCommitNotificationResponseSerializationTests.java b/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/action/NewCommitNotificationResponseSerializationTests.java new file mode 100644 index 0000000000000..5ead48ba94071 --- /dev/null +++ b/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/action/NewCommitNotificationResponseSerializationTests.java @@ -0,0 +1,42 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.stateless.action; + +import org.elasticsearch.common.io.stream.Writeable; +import org.elasticsearch.test.AbstractWireSerializingTestCase; +import org.elasticsearch.xpack.stateless.engine.PrimaryTermAndGenerationTests; + +import java.io.IOException; +import java.util.stream.Collectors; + +public class NewCommitNotificationResponseSerializationTests extends AbstractWireSerializingTestCase { + + @Override + protected Writeable.Reader instanceReader() { + return NewCommitNotificationResponse::new; + } + + @Override + protected NewCommitNotificationResponse createTestInstance() { + return new NewCommitNotificationResponse(randomSet(0, 10, PrimaryTermAndGenerationTests::randomPrimaryTermAndGeneration)); + } + + @Override + protected NewCommitNotificationResponse mutateInstance(NewCommitNotificationResponse instance) throws IOException { + if (instance.getPrimaryTermAndGenerationsInUse().isEmpty()) { + return new NewCommitNotificationResponse(randomSet(1, 10, PrimaryTermAndGenerationTests::randomPrimaryTermAndGeneration)); + } + + return new NewCommitNotificationResponse( + instance.getPrimaryTermAndGenerationsInUse() + .stream() + .map(PrimaryTermAndGenerationTests::mutatePrimaryTermAndGeneration) + .collect(Collectors.toSet()) + ); + } +} diff --git a/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/action/NewCommitNotificationResponseTests.java b/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/action/NewCommitNotificationResponseTests.java new file mode 100644 index 0000000000000..68792cff9eab3 --- /dev/null +++ b/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/action/NewCommitNotificationResponseTests.java @@ -0,0 +1,72 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.stateless.action; + +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.xpack.stateless.engine.PrimaryTermAndGeneration; + +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; + +import static org.hamcrest.Matchers.equalTo; + +public class NewCommitNotificationResponseTests extends ESTestCase { + + public void testCombineResponses() { + { + var combinedResponses = NewCommitNotificationResponse.combine( + List.of(NewCommitNotificationResponse.EMPTY, NewCommitNotificationResponse.EMPTY) + ); + assertThat(combinedResponses, equalTo(NewCommitNotificationResponse.EMPTY)); + } + + { + var combinedResponses = NewCommitNotificationResponse.combine( + List.of(response(new PrimaryTermAndGeneration(1, 2)), NewCommitNotificationResponse.EMPTY) + ); + assertThat(combinedResponses, equalTo(response(new PrimaryTermAndGeneration(1, 2)))); + } + + { + var combinedResponses = NewCommitNotificationResponse.combine( + List.of(NewCommitNotificationResponse.EMPTY, response(new PrimaryTermAndGeneration(1, 2))) + ); + assertThat(combinedResponses, equalTo(response(new PrimaryTermAndGeneration(1, 2)))); + } + + { + var combinedResponses = NewCommitNotificationResponse.combine( + List.of( + response(new PrimaryTermAndGeneration(1, 2)), + response(new PrimaryTermAndGeneration(1, 2), new PrimaryTermAndGeneration(1, 3)) + ) + ); + assertThat(combinedResponses, equalTo(response(new PrimaryTermAndGeneration(1, 2), new PrimaryTermAndGeneration(1, 3)))); + } + + { + var combinedResponses = NewCommitNotificationResponse.combine( + List.of( + response(new PrimaryTermAndGeneration(2, 3)), + response(new PrimaryTermAndGeneration(1, 2), new PrimaryTermAndGeneration(1, 3)) + ) + ); + assertThat( + combinedResponses, + equalTo( + response(new PrimaryTermAndGeneration(1, 2), new PrimaryTermAndGeneration(1, 3), new PrimaryTermAndGeneration(2, 3)) + ) + ); + } + } + + private NewCommitNotificationResponse response(PrimaryTermAndGeneration... generations) { + return new NewCommitNotificationResponse(Arrays.stream(generations).collect(Collectors.toSet())); + } +} diff --git a/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/cache/Lucene90CompoundEntriesReaderTests.java b/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/cache/Lucene90CompoundEntriesReaderTests.java new file mode 100644 index 0000000000000..502f1b5787bb1 --- /dev/null +++ b/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/cache/Lucene90CompoundEntriesReaderTests.java @@ -0,0 +1,63 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.stateless.cache; + +import org.apache.lucene.document.Field; +import org.apache.lucene.document.IntField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.IOContext; +import org.elasticsearch.common.lucene.Lucene; +import org.elasticsearch.test.ESTestCase; + +import java.io.IOException; +import java.util.List; +import java.util.Set; + +import static java.util.stream.Collectors.toUnmodifiableSet; +import static org.hamcrest.Matchers.equalTo; + +public class Lucene90CompoundEntriesReaderTests extends ESTestCase { + + public void testReadEntries() throws IOException { + var tmpDir = createTempDir(); + + IndexWriterConfig conf = new IndexWriterConfig().setUseCompoundFile(true); + try (Directory directory = FSDirectory.open(tmpDir); IndexWriter writer = new IndexWriter(directory, conf)) { + for (int i = 0; i < randomIntBetween(50, 100); i++) { + writer.addDocument(createDocument()); + } + } + + try (Directory directory = FSDirectory.open(tmpDir)) { + var infos = Lucene.readSegmentInfos(directory); + var si = infos.info(0).info; + + var actualSegmentEntries = Set.of(si.getCodec().compoundFormat().getCompoundReader(directory, si).listAll()); + var parsedSegmentEntries = prependSegmentName( + si.name, + Lucene90CompoundEntriesReader.readEntries(directory.openInput(si.name + ".cfe", IOContext.DEFAULT)).keySet() + ); + + assertThat(parsedSegmentEntries, equalTo(actualSegmentEntries)); + } + + } + + private static List<IndexableField> createDocument() { + return List.of(new TextField("id", randomIdentifier(), Field.Store.YES), new IntField("value", randomInt(), Field.Store.YES)); + } + + private static Set<String> prependSegmentName(String segmentName, Set<String> files) { + return files.stream().map(file -> segmentName + file).collect(toUnmodifiableSet()); + } +} diff --git a/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/BlobFileRangesTestUtils.java b/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/BlobFileRangesTestUtils.java new file mode 100644 index 0000000000000..df57ebbf45bbe --- /dev/null +++ b/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/BlobFileRangesTestUtils.java @@ -0,0 +1,23 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0.
+ */ + +package org.elasticsearch.xpack.stateless.commits; + +import java.util.Map; +import java.util.Set; + +public class BlobFileRangesTestUtils { + + public static Map<String, BlobFileRanges> computeBlobFileRanges( + boolean useReplicatedRanges, + StatelessCompoundCommit compoundCommit, + long blobOffset, + Set<String> internalFiles + ) { + return BlobFileRanges.computeBlobFileRanges(useReplicatedRanges, compoundCommit, blobOffset, internalFiles); + } +} diff --git a/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/BlobLocationTestUtils.java b/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/BlobLocationTestUtils.java new file mode 100644 index 0000000000000..d425d92846fa9 --- /dev/null +++ b/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/BlobLocationTestUtils.java @@ -0,0 +1,29 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + * + * This file was contributed to by generative AI + */ + +package org.elasticsearch.xpack.stateless.commits; + +import org.elasticsearch.xpack.stateless.engine.PrimaryTermAndGeneration; + +public class BlobLocationTestUtils { + + private BlobLocationTestUtils() {} + + public static BlobLocation createBlobLocation(long primaryTerm, long generation, long offset, long fileLength) { + return new BlobLocation( + new BlobFile(StatelessCompoundCommit.PREFIX + generation, new PrimaryTermAndGeneration(primaryTerm, generation)), + offset, + fileLength + ); + } + + public static BlobFileRanges createBlobFileRanges(long primaryTerm, long generation, long offset, long fileLength) { + return new BlobFileRanges(createBlobLocation(primaryTerm, generation, offset, fileLength)); + } +} diff --git a/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/BlobLocationTests.java b/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/BlobLocationTests.java new file mode 100644 index 0000000000000..072eab843fc22 --- /dev/null +++ b/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/BlobLocationTests.java @@ -0,0 +1,70 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0.
+ */ + +package org.elasticsearch.xpack.stateless.commits; + +import org.elasticsearch.common.io.stream.Writeable; +import org.elasticsearch.test.AbstractXContentSerializingTestCase; +import org.elasticsearch.xcontent.XContentParser; + +import java.io.IOException; + +import static org.elasticsearch.xpack.stateless.commits.BlobLocationTestUtils.createBlobLocation; + +public class BlobLocationTests extends AbstractXContentSerializingTestCase<BlobLocation> { + + @Override + protected Writeable.Reader<BlobLocation> instanceReader() { + return BlobLocation::readFromTransport; + } + + @Override + protected BlobLocation createTestInstance() { + return createBlobLocation( + randomLongBetween(1, 10), + randomLongBetween(1, 1000), + randomLongBetween(0, 100), + randomLongBetween(100, 1000) + ); + } + + @Override + protected BlobLocation mutateInstance(BlobLocation instance) throws IOException { + return switch (randomIntBetween(0, 3)) { + case 0 -> createBlobLocation( + randomValueOtherThan(instance.primaryTerm(), () -> randomLongBetween(1, 10)), + instance.compoundFileGeneration(), + instance.offset(), + instance.fileLength() + ); + case 1 -> createBlobLocation( + instance.primaryTerm(), + randomValueOtherThan(instance.compoundFileGeneration(), () -> randomLongBetween(1, 1000)), + instance.offset(), + instance.fileLength() + ); + case 2 -> createBlobLocation( + instance.primaryTerm(), + instance.compoundFileGeneration(), + randomValueOtherThan(instance.offset(), () -> randomLongBetween(0, 100)), + instance.fileLength() + ); + case 3 -> createBlobLocation( + instance.primaryTerm(), + instance.compoundFileGeneration(), + instance.offset(), + randomValueOtherThan(instance.fileLength(), () -> randomLongBetween(100, 1000)) + ); + default -> randomValueOtherThan(instance, this::createTestInstance); + }; + } + + @Override + protected BlobLocation doParseInstance(XContentParser parser) throws IOException { + return BlobLocation.fromXContent(parser); + } +} diff --git a/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/InternalFilesReplicatedRangesTests.java b/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/InternalFilesReplicatedRangesTests.java new file mode 100644 index 0000000000000..643bb1b7f86e6 --- /dev/null +++ b/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/InternalFilesReplicatedRangesTests.java @@ -0,0 +1,49 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0.
+ */ + +package org.elasticsearch.xpack.stateless.commits; + +import org.elasticsearch.common.io.stream.Writeable; +import org.elasticsearch.test.AbstractXContentSerializingTestCase; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.xcontent.XContentParser; +import org.elasticsearch.xpack.stateless.commits.InternalFilesReplicatedRanges.InternalFileReplicatedRange; + +import java.io.IOException; + +public class InternalFilesReplicatedRangesTests extends AbstractXContentSerializingTestCase<InternalFileReplicatedRange> { + + @Override + protected Writeable.Reader<InternalFileReplicatedRange> instanceReader() { + return InternalFileReplicatedRange::fromStream; + } + + @Override + protected InternalFileReplicatedRange createTestInstance() { + return new InternalFileReplicatedRange(randomNonNegativeLong(), (short) randomIntBetween(1, 1024)); + } + + @Override + protected InternalFileReplicatedRange mutateInstance(InternalFileReplicatedRange instance) throws IOException { + return switch (randomInt(1)) { + case 0 -> new InternalFileReplicatedRange( + randomValueOtherThan(instance.position(), ESTestCase::randomNonNegativeLong), + instance.length() + ); + case 1 -> new InternalFileReplicatedRange( + instance.position(), + randomValueOtherThan(instance.length(), () -> (short) randomIntBetween(1, 1024)) + ); + default -> throw new RuntimeException("unreachable"); + }; + } + + @Override + protected InternalFileReplicatedRange doParseInstance(XContentParser parser) throws IOException { + return InternalFileReplicatedRange.PARSER.parse(parser, null); + } +} diff --git a/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/ReplicatedContentTests.java b/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/ReplicatedContentTests.java new file mode 100644 index 0000000000000..08f50461158bb --- /dev/null +++ b/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/ReplicatedContentTests.java @@ -0,0 +1,287 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0.
+ */ + +package org.elasticsearch.xpack.stateless.commits; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.IntField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.NoMergePolicy; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.elasticsearch.common.io.stream.BytesStreamOutput; +import org.elasticsearch.common.lucene.store.BytesReferenceIndexInput; +import org.elasticsearch.core.Streams; +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.xpack.stateless.cache.Lucene90CompoundEntriesReader; +import org.elasticsearch.xpack.stateless.commits.InternalFilesReplicatedRanges.InternalFileReplicatedRange; +import org.elasticsearch.xpack.stateless.commits.ReplicatedContent.InternalFileRangeReader; +import org.elasticsearch.xpack.stateless.commits.StatelessCompoundCommit.InternalFile; +import org.hamcrest.Matchers; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Collection; +import java.util.Comparator; +import java.util.HashSet; +import java.util.List; +import java.util.Objects; +import java.util.stream.Stream; + +import static org.elasticsearch.xpack.stateless.commits.InternalFilesReplicatedRanges.REPLICATED_CONTENT_FOOTER_SIZE; +import static org.elasticsearch.xpack.stateless.commits.InternalFilesReplicatedRanges.REPLICATED_CONTENT_HEADER_SIZE; +import static org.elasticsearch.xpack.stateless.commits.ReplicatedContent.ALWAYS_REPLICATE; +import static org.hamcrest.Matchers.allOf; +import static org.hamcrest.Matchers.anyOf; +import static org.hamcrest.Matchers.contains; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.greaterThan; +import static org.hamcrest.Matchers.greaterThanOrEqualTo; +import static org.hamcrest.Matchers.hasItems; +import static org.hamcrest.Matchers.hasSize; +import static org.hamcrest.Matchers.lessThan; +import static org.hamcrest.Matchers.not; + +public class ReplicatedContentTests extends ESTestCase { + + public void testReplicatesContent() throws IOException { + try (var directory = LuceneTestCase.newDirectory()) { + var smallFile = new InternalFile( + "small-file", + randomLongBetween(1, REPLICATED_CONTENT_HEADER_SIZE + REPLICATED_CONTENT_FOOTER_SIZE) + ); + var bigFile = new InternalFile( + "big-file", + randomLongBetween(REPLICATED_CONTENT_HEADER_SIZE + REPLICATED_CONTENT_FOOTER_SIZE + 1, Long.MAX_VALUE) + ); + var content = ReplicatedContent.create(true, List.of(smallFile, bigFile), directory, ALWAYS_REPLICATE); + + assertThat( + content.header(), + equalTo( + new InternalFilesReplicatedRanges( + List.of( + // first small file is merged with the following big file header + new InternalFileReplicatedRange(0, (short) (smallFile.length() + REPLICATED_CONTENT_HEADER_SIZE)), + new InternalFileReplicatedRange( + smallFile.length() + bigFile.length() - REPLICATED_CONTENT_FOOTER_SIZE, + REPLICATED_CONTENT_FOOTER_SIZE + ) + ), + smallFile.length() + REPLICATED_CONTENT_HEADER_SIZE + REPLICATED_CONTENT_FOOTER_SIZE + ) + ) + ); + assertThat( + content.readers(), + contains( + new InternalFileRangeReader(smallFile.name(), directory, 0, smallFile.length()), + new InternalFileRangeReader(bigFile.name(), directory, 0, REPLICATED_CONTENT_HEADER_SIZE), + new InternalFileRangeReader( + 
bigFile.name(), + directory, + bigFile.length() - REPLICATED_CONTENT_FOOTER_SIZE, + REPLICATED_CONTENT_FOOTER_SIZE + ) + ) + ); + } + } + + public void testReplicatesNonCompoundFileContent() throws IOException { + try (var directory = LuceneTestCase.newDirectory()) { + var conf = new IndexWriterConfig().setUseCompoundFile(false); + try (var writer = new IndexWriter(directory, conf)) { + for (int i = 0; i < randomIntBetween(50, 100); i++) { + writer.addDocument(createDocument()); + } + } + + assertThat(List.of(directory.listAll()), not(hasItems("_0.cfe", "_0.cfs"))); + var internalFiles = createInternalFilesFrom(directory); + var content = ReplicatedContent.create(true, internalFiles, directory, ALWAYS_REPLICATE); + + var smallFilesCount = internalFiles.stream() + .filter(file -> file.length() <= REPLICATED_CONTENT_HEADER_SIZE + REPLICATED_CONTENT_FOOTER_SIZE) + .count(); + var bigFilesCount = internalFiles.stream() + .filter(file -> file.length() > REPLICATED_CONTENT_HEADER_SIZE + REPLICATED_CONTENT_FOOTER_SIZE) + .count(); + + // some ranges are merged, so overall number of ranges should be smaller than total readers + assertThat(content.header().replicatedRanges().size(), lessThan((int) (smallFilesCount + 2 * bigFilesCount))); + assertThat(content.readers(), hasSize((int) (smallFilesCount + 2 * bigFilesCount))); + + verifyReplicatedContent(content); + } + } + + public void testReplicatesCompoundFileContent() throws IOException { + try (var directory = LuceneTestCase.newDirectory()) { + var conf = new IndexWriterConfig().setUseCompoundFile(true); + try (var writer = new IndexWriter(directory, conf)) { + for (int i = 0; i < randomIntBetween(50, 100); i++) { + writer.addDocument(createDocument()); + } + } + + assertThat(List.of(directory.listAll()), hasItems("_0.cfe", "_0.cfs")); + assertThat(directory.fileLength("_0.cfs"), greaterThan(1024L + 16L)); + var internalFiles = createInternalFilesFrom(directory); + var compoundSegmentsFileOffset = fileOffsetIn(internalFiles, "_0.cfs"); + var content = ReplicatedContent.create(true, internalFiles, directory, ALWAYS_REPLICATE); + var compoundEntries = Lucene90CompoundEntriesReader.readEntries(directory, "_0.cfe").values(); + + // max number of ranges that should be replicated + // the number is 2 (header and footer of the top level file, assuming it is > 1024+16 bytes) + ranges of every nested file + var replicatedRangesInCompoundSegment = 2 + (int) compoundEntries.stream() + .mapToLong(entry -> entry.length() > REPLICATED_CONTENT_HEADER_SIZE + REPLICATED_CONTENT_FOOTER_SIZE ? 2 : 1) + .sum(); + var compoundSegmentReaders = content.readers().stream().filter(reader -> Objects.equals(reader.filename(), "_0.cfs")).toList(); + + assertThat( + compoundSegmentReaders.size(), + allOf( + greaterThan(2), // more that a single header and footer + lessThan(replicatedRangesInCompoundSegment) // less than total count since some ranges are merged + ) + ); + + // we replicate first 1024 bytes of every file + // with compound file this means that first entry (or even entries) are likely going to be present in this range. 
+ // this ensures we reuse this range rather than duplicating it + var overlappingEntries = compoundEntries.stream().filter(entry -> entry.offset() < 1024).toList(); + assertThat(overlappingEntries.size(), greaterThanOrEqualTo(1)); + // each of such segments is contained by a single first cfs reader + for (var entry : overlappingEntries) { + var entryReaders = compoundSegmentReaders.stream().filter(reader -> { + var entryStartPosition = entry.offset(); + var entryEndPosition = entry.offset() + entry.length(); + var startOfTheEntryIsInRange = reader.rangeOffset() <= entryStartPosition + && entryStartPosition < reader.rangeOffset() + reader.rangeLength(); + var endOfTheEntryIsInRange = reader.rangeOffset() <= entryEndPosition + && entryEndPosition < reader.rangeOffset() + reader.rangeLength(); + return startOfTheEntryIsInRange || endOfTheEntryIsInRange; + }).toList(); + assertThat(entryReaders, Matchers.<List<InternalFileRangeReader>>allOf(hasSize(1), contains(compoundSegmentReaders.getFirst()))); + } + // all of such entries are mapped to the same header range + var correspondingReplicatedRanges = new HashSet<>(); + correspondingReplicatedRanges.add(findRange(content.header(), compoundSegmentsFileOffset, REPLICATED_CONTENT_HEADER_SIZE)); + for (var entry : overlappingEntries) { + correspondingReplicatedRanges.add(findRange(content.header(), compoundSegmentsFileOffset + entry.offset(), entry.length())); + } + assertThat(correspondingReplicatedRanges, hasSize(1)); + + // In principle, it is possible to have multiple small files prior to CFS file. + // In an unlucky case they could be collapsed (along with cfs header) into small file + // but may not fit completely the nested segment range that overlaps with the compound file header. + // | small files | cfs | + // | 1024 <-- cfs file header range + // | |h|h.f|h..f|h..f|h......f|h.......f|f| <-- actual entries headers and footers + // |-----------------| <-- collapsed adjacent ranges + // ^^^ first cfs entry file that is partially located in cfs header range, + // but can not be completely added to the range due to the overflow + // To ensure this never happens every compound file creates a new range + assertTrue( + "There should be a range starting at a compound segments position", + content.header().replicatedRanges().stream().anyMatch(range -> range.position() == compoundSegmentsFileOffset) + ); + + verifyReplicatedContent(content); + } + } + + public void testHandlesLotsOfSmallFiles() throws IOException { + try (var directory = LuceneTestCase.newDirectory()) { + for (int i = 0; i < randomIntBetween(25, 100); i++) { + var conf = new IndexWriterConfig().setUseCompoundFile(false).setMergePolicy(NoMergePolicy.INSTANCE); + try (var writer = new IndexWriter(directory, conf)) { + for (int j = 0; j < randomIntBetween(3, 5); j++) { + writer.addDocument(createDocument()); + } + } + } + var internalFiles = createInternalFilesFrom(directory); + var content = ReplicatedContent.create(true, internalFiles, directory, ALWAYS_REPLICATE); + + var totalSmallFilesSize = internalFiles.stream() + .filter(file -> file.length() <= REPLICATED_CONTENT_HEADER_SIZE + REPLICATED_CONTENT_FOOTER_SIZE) + .mapToLong(InternalFile::length) + .sum(); + + assertThat(totalSmallFilesSize, greaterThan((long) Short.MAX_VALUE)); + assertThat(content.header().replicatedRanges().size(), greaterThan(1)); + + verifyReplicatedContent(content); + } + } + + private static List<IndexableField> createDocument() { + return List.of(new TextField("id", randomIdentifier(), Field.Store.YES), new IntField("value", randomInt(),
Field.Store.YES)); + } + + private static List<InternalFile> createInternalFilesFrom(Directory directory) throws IOException { + return Stream.of(directory.listAll()) + .filter(filename -> Objects.equals(filename, "write.lock") == false) + .filter(filename -> Objects.equals(filename, "extra0") == false) + .map(filename -> new InternalFile(filename, sizeOfFileUnchecked(filename, directory))) + .sorted(Comparator.comparing(InternalFile::length)) + .toList(); + } + + private static long fileOffsetIn(List<InternalFile> internalFiles, String filename) { + var position = 0L; + for (InternalFile internalFile : internalFiles) { + if (Objects.equals(internalFile.name(), filename)) { + return position; + } + position += internalFile.length(); + } + throw new AssertionError("File [" + filename + "] is not found"); + } + + private static InternalFileReplicatedRange findRange(InternalFilesReplicatedRanges ranges, long position, long length) { + return ranges.replicatedRanges() + .stream() + .filter(range -> range.position() <= position && position + length <= range.position() + range.length()) + .findFirst() + .get(); + } + + private static long sizeOfFileUnchecked(String filename, Directory directory) { + try { + return directory.fileLength(filename); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + private static void verifyReplicatedContent(ReplicatedContent content) throws IOException { + try (var output = new BytesStreamOutput()) { + for (var reader : content.readers()) { + try (var in = reader.getInputStream(0, Long.MAX_VALUE)) { + Streams.copy(in, output, false); + } + } + + var input = new BytesReferenceIndexInput("test", output.bytes()); + assertThat(input.length(), equalTo(content.header().dataSizeInBytes())); + var position = 0L; + for (var range : content.header().replicatedRanges()) { + input.seek(position); + // every range should start as a header or footer + assertThat(CodecUtil.readBEInt(input), anyOf(equalTo(CodecUtil.CODEC_MAGIC), equalTo(CodecUtil.FOOTER_MAGIC))); + position += range.length(); + } + } + } +} diff --git a/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/StatelessCompoundCommitInternalFileTests.java b/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/StatelessCompoundCommitInternalFileTests.java new file mode 100644 index 0000000000000..b15d84bd5ebee --- /dev/null +++ b/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/StatelessCompoundCommitInternalFileTests.java @@ -0,0 +1,223 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0.
+ */ + +package org.elasticsearch.xpack.stateless.commits; + +import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.xpack.stateless.commits.StatelessCompoundCommit.InternalFile; + +import java.util.List; + +import static org.hamcrest.CoreMatchers.equalTo; + +public class StatelessCompoundCommitInternalFileTests extends ESTestCase { + + public void testSortInternalFiles() { + var internalFiles = shuffledList( + List.of( + file("segments_5", 1254), + file("_1.si", 205), + file("_0_1.fnm", 148), + file("_1_1.fnm", 169), + file("_0_1_Lucene90_0.dvd", 89), + file("_0_1_Lucene90_0.dvm", 94), + file("_1_1_Lucene90_0.dvd", 107), + file("_1_1_Lucene90_0.dvm", 75), + file("_1.cfe", 264), + file("_1.cfs", 199), + file("segments_6", 1542), + file("_2.si", 301), + file("_0_2.fnm", 115), + file("_1_2.fnm", 102), + file("_2_1.fnm", 88), + file("_2_1_Lucene90_0.dvd", 69), + file("_2_1_Lucene90_0.dvm", 102), + file("_0_2_Lucene90_0.dvd", 88), + file("_0_2_Lucene90_0.dvm", 111), + file("_1_2_Lucene90_0.dvd", 76), + file("_1_2_Lucene90_0.dvm", 99), + file("_2.cfe", 293), + file("_2.cfs", 265) + ) + ); + assertThat( + internalFiles.stream().sorted().toList(), + equalTo( + List.of( + file("_2_1_Lucene90_0.dvd", 69), + file("_1_1_Lucene90_0.dvm", 75), + file("_1_2_Lucene90_0.dvd", 76), + file("_0_2_Lucene90_0.dvd", 88), + file("_2_1.fnm", 88), + file("_0_1_Lucene90_0.dvd", 89), + file("_0_1_Lucene90_0.dvm", 94), + file("_1_2_Lucene90_0.dvm", 99), + file("_1_2.fnm", 102), + file("_2_1_Lucene90_0.dvm", 102), + file("_1_1_Lucene90_0.dvd", 107), + file("_0_2_Lucene90_0.dvm", 111), + file("_0_2.fnm", 115), + file("_0_1.fnm", 148), + file("_1_1.fnm", 169), + file("_1.cfs", 199), + file("_1.si", 205), + file("_1.cfe", 264), + file("_2.cfs", 265), + file("_2.cfe", 293), + file("_2.si", 301), + file("segments_5", 1254), + file("segments_6", 1542) + ) + ) + ); + } + + public void testMultipleCFESort() { + var unsortedInternalFiles = shuffledList( + List.of( + file("segments_aw2y", 1037), + file("_41mks.si", 398), + file("_41ml3.si", 399), + file("_41ml4.si", 361), + file("_41ml5.si", 361), + file("_41ml6.si", 361), + file("_41ml7.si", 361), + file("_41ml3_1.fnm", 4193), + file("_41ml4_1.fnm", 3865), + file("_41mks_2.fnm", 7174), + file("_41ml3_1_Lucene90_0.dvd", 93), + file("_41ml3_1_Lucene90_0.dvm", 160), + file("_41ml4_1_Lucene90_0.dvd", 75), + file("_41ml4_1_Lucene90_0.dvm", 160), + file("_41mks_2_Lucene90_0.dvd", 293), + file("_41mks_2_Lucene90_0.dvm", 160), + file("_41mks.cfe", 697), + file("_41mks.cfs", 78713), + file("_41ml3.cfe", 595), + file("_41ml3.cfs", 19785), + file("_41ml4.cfe", 595), + file("_41ml4.cfs", 11343), + file("_41ml5.cfe", 595), + file("_41ml5.cfs", 5008), + file("_41ml6.cfe", 595), + file("_41ml6.cfs", 11343), + file("_41ml7.cfe", 595), + file("_41ml7.cfs", 5008) + ) + ); + + assertThat( + unsortedInternalFiles.stream().sorted().toList(), + equalTo( + List.of( + file("_41ml4_1_Lucene90_0.dvd", 75), + file("_41ml3_1_Lucene90_0.dvd", 93), + file("_41mks_2_Lucene90_0.dvm", 160), + file("_41ml3_1_Lucene90_0.dvm", 160), + file("_41ml4_1_Lucene90_0.dvm", 160), + file("_41mks_2_Lucene90_0.dvd", 293), + file("_41ml4.si", 361), + file("_41ml5.si", 361), + file("_41ml6.si", 361), + file("_41ml7.si", 361), + file("_41mks.si", 398), + file("_41ml3.si", 399), + file("_41ml3.cfe", 595), + file("_41ml4.cfe", 595), + file("_41ml5.cfe", 595), + file("_41ml6.cfe", 595), + file("_41ml7.cfe", 595), + file("_41mks.cfe", 697), + file("segments_aw2y", 1037), + file("_41ml4_1.fnm", 3865), + 
file("_41ml3_1.fnm", 4193), + file("_41ml5.cfs", 5008), + file("_41ml7.cfs", 5008), + file("_41mks_2.fnm", 7174), + file("_41ml4.cfs", 11343), + file("_41ml6.cfs", 11343), + file("_41ml3.cfs", 19785), + file("_41mks.cfs", 78713) + ) + ) + ); + } + + public void testCFEAndRegularSegmentsSort() { + var unsortedInternalFiles = shuffledList( + List.of( + file("segments_6n9", 5206), + file("_rks.si", 584), + file("_rsm.si", 361), + file("_rsn.si", 361), + file("_rso.si", 361), + file("_rsp.si", 361), + file("_rks.fnm", 9401), + file("_rsm.cfe", 425), + file("_rsm.cfs", 705461), + file("_rsn.cfe", 425), + file("_rsn.cfs", 144514), + file("_rso.cfe", 425), + file("_rso.cfs", 3964123), + file("_rsp.cfe", 425), + file("_rsp.cfs", 1657860), + file("_rks_ES812Postings_0.doc", 472857214), + file("_rks_ES812Postings_0.tim", 40133762), + file("_rks_ES812Postings_0.tip", 2559177), + file("_rks_ES812Postings_0.tmd", 3739), + file("_rks.fdm", 791), + file("_rks.fdt", 463032785), + file("_rks.fdx", 59089), + file("_rks.kdd", 69843813), + file("_rks.kdi", 307265), + file("_rks.kdm", 548), + file("_rks_Lucene90_0.dvd", 647851632), + file("_rks_Lucene90_0.dvm", 14738) + ) + ); + + assertThat( + unsortedInternalFiles.stream().sorted().toList(), + equalTo( + List.of( + file("_rsm.si", 361), + file("_rsn.si", 361), + file("_rso.si", 361), + file("_rsp.si", 361), + file("_rsm.cfe", 425), + file("_rsn.cfe", 425), + file("_rso.cfe", 425), + file("_rsp.cfe", 425), + file("_rks.kdm", 548), + file("_rks.si", 584), + file("_rks.fdm", 791), + file("_rks_ES812Postings_0.tmd", 3739), + file("segments_6n9", 5206), + file("_rks.fnm", 9401), + file("_rks_Lucene90_0.dvm", 14738), + file("_rks.fdx", 59089), + file("_rsn.cfs", 144514), + file("_rks.kdi", 307265), + file("_rsm.cfs", 705461), + file("_rsp.cfs", 1657860), + file("_rks_ES812Postings_0.tip", 2559177), + file("_rso.cfs", 3964123), + file("_rks_ES812Postings_0.tim", 40133762), + file("_rks.kdd", 69843813), + file("_rks.fdt", 463032785), + file("_rks_ES812Postings_0.doc", 472857214), + file("_rks_Lucene90_0.dvd", 647851632) + ) + ) + ); + + } + + private static InternalFile file(String name, int length) { + return new InternalFile(name, length); + } +} diff --git a/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/StatelessCompoundCommitTestUtils.java b/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/StatelessCompoundCommitTestUtils.java new file mode 100644 index 0000000000000..3323822fd812f --- /dev/null +++ b/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/StatelessCompoundCommitTestUtils.java @@ -0,0 +1,128 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.stateless.commits; + +import org.elasticsearch.common.UUIDs; +import org.elasticsearch.index.shard.ShardId; +import org.elasticsearch.xpack.stateless.commits.StatelessCompoundCommit.TimestampFieldValueRange; +import org.elasticsearch.xpack.stateless.engine.PrimaryTermAndGeneration; + +import java.util.ArrayList; +import java.util.Map; +import java.util.Set; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import static org.apache.lucene.tests.util.LuceneTestCase.rarely; +import static org.elasticsearch.test.ESTestCase.randomAlphaOfLength; +import static org.elasticsearch.test.ESTestCase.randomBoolean; +import static org.elasticsearch.test.ESTestCase.randomFrom; +import static org.elasticsearch.test.ESTestCase.randomIntBetween; +import static org.elasticsearch.test.ESTestCase.randomLongBetween; +import static org.elasticsearch.test.ESTestCase.randomNonEmptySubsetOf; +import static org.elasticsearch.test.ESTestCase.randomValueOtherThanMany; +import static org.elasticsearch.xpack.stateless.commits.BlobLocationTestUtils.createBlobLocation; + +public final class StatelessCompoundCommitTestUtils { + + private StatelessCompoundCommitTestUtils() {} + + public static StatelessCompoundCommit randomCompoundCommit() { + return randomCompoundCommit( + randomShardId(), + new PrimaryTermAndGeneration(randomNonZeroPositiveLong(), randomNonZeroPositiveLong()) + ); + } + + public static StatelessCompoundCommit randomCompoundCommit(ShardId shardId, PrimaryTermAndGeneration termAndGeneration) { + return randomCompoundCommit(shardId, termAndGeneration, randomBoolean()); + } + + public static StatelessCompoundCommit randomCompoundCommit( + ShardId shardId, + PrimaryTermAndGeneration termAndGeneration, + boolean hollow + ) { + Map commitFiles = randomCommitFiles(); + if (hollow) { + return StatelessCompoundCommit.newHollowStatelessCompoundCommit( + shardId, + termAndGeneration, + commitFiles, + randomNonZeroPositiveLong(), + Set.copyOf(randomNonEmptySubsetOf(commitFiles.keySet())), + randomNonZeroPositiveLong(), + randomInternalFilesReplicatedRanges(), + randomCommitFiles(), + randomFrom(randomTimestampFieldValueRange(), null) + ); + } else { + return new StatelessCompoundCommit( + shardId, + termAndGeneration, + randomNonZeroPositiveLong(), + randomNodeEphemeralId(), + commitFiles, + randomNonZeroPositiveLong(), + Set.copyOf(randomNonEmptySubsetOf(commitFiles.keySet())), + randomNonZeroPositiveLong(), + randomInternalFilesReplicatedRanges(), + Map.of(), + randomFrom(randomTimestampFieldValueRange(), null) + ); + } + } + + public static ShardId randomShardId() { + return new ShardId(randomAlphaOfLength(20), UUIDs.randomBase64UUID(), randomIntBetween(0, 25)); + } + + public static Long randomNonZeroPositiveLong() { + return randomLongBetween(1L, Long.MAX_VALUE - 1L); + } + + public static String randomNodeEphemeralId() { + return randomAlphaOfLength(10); + } + + public static TimestampFieldValueRange randomTimestampFieldValueRange() { + long minTimestamp = randomLongBetween(0L, Long.MAX_VALUE - 1L); + return new TimestampFieldValueRange( + minTimestamp, + randomValueOtherThanMany(maxTimestamp -> maxTimestamp < minTimestamp, () -> randomLongBetween(0L, Long.MAX_VALUE)) + ); + } + + public static Map randomCommitFiles() { + final int entries = randomIntBetween(1, 50); + return IntStream.range(0, entries + 1) + .mapToObj(operand -> UUIDs.randomBase64UUID()) + .collect(Collectors.toMap(Function.identity(), s -> { + 
long fileLength = randomLongBetween(100, 1000); + long offset = randomLongBetween(0, 200); + return createBlobLocation(randomNonZeroPositiveLong(), randomLongBetween(1, 1000), offset, fileLength); + })); + } + + public static InternalFilesReplicatedRanges randomInternalFilesReplicatedRanges() { + if (rarely()) { + return InternalFilesReplicatedRanges.EMPTY; + } + int maxNumberOfRanges = randomIntBetween(1, 25); + var replicatedRanges = new ArrayList<InternalFilesReplicatedRanges.InternalFileReplicatedRange>(); + long position = randomNonZeroPositiveLong(); + while (replicatedRanges.size() < maxNumberOfRanges && position < Long.MAX_VALUE) { + position = randomLongBetween(position, Long.MAX_VALUE - 1L); + short length = (short) Math.min(randomLongBetween(1L, Short.MAX_VALUE), Long.MAX_VALUE - position); + replicatedRanges.add(new InternalFilesReplicatedRanges.InternalFileReplicatedRange(position, length)); + position += length; + } + return InternalFilesReplicatedRanges.from(replicatedRanges); + } +} diff --git a/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/StatelessCompoundCommitTests.java b/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/StatelessCompoundCommitTests.java new file mode 100644 index 0000000000000..4cc4dfdf3bc8b --- /dev/null +++ b/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/StatelessCompoundCommitTests.java @@ -0,0 +1,507 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.stateless.commits; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.store.OutputStreamDataOutput; +import org.elasticsearch.TransportVersion; +import org.elasticsearch.common.bytes.BytesReference; +import org.elasticsearch.common.io.stream.ByteArrayStreamInput; +import org.elasticsearch.common.io.stream.BytesStreamOutput; +import org.elasticsearch.common.io.stream.PositionTrackingOutputStreamStreamOutput; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.io.stream.Writeable; +import org.elasticsearch.index.shard.ShardId; +import org.elasticsearch.index.translog.BufferedChecksumStreamOutput; +import org.elasticsearch.test.AbstractWireSerializingTestCase; +import org.elasticsearch.xpack.stateless.commits.StatelessCompoundCommit.TimestampFieldValueRange; +import org.elasticsearch.xpack.stateless.engine.PrimaryTermAndGeneration; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.elasticsearch.xpack.stateless.commits.StatelessCompoundCommit.HOLLOW_TRANSLOG_RECOVERY_START_FILE; +import static org.elasticsearch.xpack.stateless.commits.StatelessCompoundCommitTestUtils.randomCompoundCommit; +import static org.elasticsearch.xpack.stateless.commits.StatelessCompoundCommitTestUtils.randomNodeEphemeralId; +import static org.elasticsearch.xpack.stateless.commits.StatelessCompoundCommitTestUtils.randomNonZeroPositiveLong; +import static org.elasticsearch.xpack.stateless.commits.StatelessCompoundCommitTestUtils.randomShardId; +import static org.elasticsearch.xpack.stateless.commits.StatelessCompoundCommitTestUtils.randomTimestampFieldValueRange; +import static
org.hamcrest.Matchers.containsString; +import static org.hamcrest.Matchers.equalTo; + +public class StatelessCompoundCommitTests extends AbstractWireSerializingTestCase { + + @Override + protected StatelessCompoundCommit createTestInstance() { + return randomCompoundCommit(); + } + + @Override + protected StatelessCompoundCommit mutateInstance(StatelessCompoundCommit instance) throws IOException { + return switch (randomInt(10)) { + case 0 -> new StatelessCompoundCommit( + randomValueOtherThan(instance.shardId(), StatelessCompoundCommitTestUtils::randomShardId), + instance.primaryTermAndGeneration(), + instance.translogRecoveryStartFile(), + instance.nodeEphemeralId(), + instance.commitFiles(), + instance.sizeInBytes(), + instance.internalFiles(), + instance.headerSizeInBytes(), + instance.internalFilesReplicatedRanges(), + instance.extraContent(), + instance.timestampFieldValueRange() + ); + case 1 -> new StatelessCompoundCommit( + instance.shardId(), + randomValueOtherThan( + instance.primaryTermAndGeneration(), + () -> new PrimaryTermAndGeneration(randomNonZeroPositiveLong(), randomNonZeroPositiveLong()) + ), + instance.translogRecoveryStartFile(), + instance.nodeEphemeralId(), + instance.commitFiles(), + instance.sizeInBytes(), + instance.internalFiles(), + instance.headerSizeInBytes(), + instance.internalFilesReplicatedRanges(), + instance.extraContent(), + instance.timestampFieldValueRange() + ); + case 2 -> instance.hollow() ? + // unhollowed commit + new StatelessCompoundCommit( + instance.shardId(), + instance.primaryTermAndGeneration(), + randomValueOtherThan(HOLLOW_TRANSLOG_RECOVERY_START_FILE, StatelessCompoundCommitTestUtils::randomNonZeroPositiveLong), + randomNodeEphemeralId(), + instance.commitFiles(), + instance.sizeInBytes(), + instance.internalFiles(), + instance.headerSizeInBytes(), + instance.internalFilesReplicatedRanges(), + Map.of(), + instance.timestampFieldValueRange() + ) : randomBoolean() ? + // hollowed commit + StatelessCompoundCommit.newHollowStatelessCompoundCommit( + instance.shardId(), + instance.primaryTermAndGeneration(), + instance.commitFiles(), + instance.sizeInBytes(), + instance.internalFiles(), + instance.headerSizeInBytes(), + instance.internalFilesReplicatedRanges(), + instance.extraContent(), + instance.timestampFieldValueRange() + ) : + // different unhollowed commit + new StatelessCompoundCommit( + instance.shardId(), + instance.primaryTermAndGeneration(), + randomValueOtherThan(instance.translogRecoveryStartFile(), StatelessCompoundCommitTestUtils::randomNonZeroPositiveLong), + instance.nodeEphemeralId(), + instance.commitFiles(), + instance.sizeInBytes(), + instance.internalFiles(), + instance.headerSizeInBytes(), + instance.internalFilesReplicatedRanges(), + Map.of(), + instance.timestampFieldValueRange() + ); + case 3 -> instance.hollow() ? + // unhollowed commit + new StatelessCompoundCommit( + instance.shardId(), + instance.primaryTermAndGeneration(), + randomValueOtherThan(HOLLOW_TRANSLOG_RECOVERY_START_FILE, StatelessCompoundCommitTestUtils::randomNonZeroPositiveLong), + randomNodeEphemeralId(), + instance.commitFiles(), + instance.sizeInBytes(), + instance.internalFiles(), + instance.headerSizeInBytes(), + instance.internalFilesReplicatedRanges(), + Map.of(), + instance.timestampFieldValueRange() + ) : randomBoolean() ? 
+ // hollowed commit + StatelessCompoundCommit.newHollowStatelessCompoundCommit( + instance.shardId(), + instance.primaryTermAndGeneration(), + instance.commitFiles(), + instance.sizeInBytes(), + instance.internalFiles(), + instance.headerSizeInBytes(), + instance.internalFilesReplicatedRanges(), + instance.extraContent(), + instance.timestampFieldValueRange() + ) : + // different unhollowed commit + new StatelessCompoundCommit( + instance.shardId(), + instance.primaryTermAndGeneration(), + instance.translogRecoveryStartFile(), + randomValueOtherThan(instance.nodeEphemeralId(), StatelessCompoundCommitTestUtils::randomNodeEphemeralId), + instance.commitFiles(), + instance.sizeInBytes(), + instance.internalFiles(), + instance.headerSizeInBytes(), + instance.internalFilesReplicatedRanges(), + Map.of(), + instance.timestampFieldValueRange() + ); + case 4 -> { + var commitFiles = randomValueOtherThan(instance.commitFiles(), StatelessCompoundCommitTestUtils::randomCommitFiles); + yield new StatelessCompoundCommit( + instance.shardId(), + instance.primaryTermAndGeneration(), + instance.translogRecoveryStartFile(), + instance.nodeEphemeralId(), + commitFiles, + instance.sizeInBytes(), + Set.copyOf(randomSubsetOf(commitFiles.keySet())), + instance.headerSizeInBytes(), + instance.internalFilesReplicatedRanges(), + instance.extraContent(), + instance.timestampFieldValueRange() + ); + } + case 5 -> new StatelessCompoundCommit( + instance.shardId(), + instance.primaryTermAndGeneration(), + instance.translogRecoveryStartFile(), + instance.nodeEphemeralId(), + instance.commitFiles(), + randomValueOtherThan(instance.sizeInBytes(), StatelessCompoundCommitTestUtils::randomNonZeroPositiveLong), + instance.internalFiles(), + instance.headerSizeInBytes(), + instance.internalFilesReplicatedRanges(), + instance.extraContent(), + instance.timestampFieldValueRange() + ); + case 6 -> { + Map commitFiles = instance.commitFiles().isEmpty() + ? 
randomValueOtherThan(Map.of(), StatelessCompoundCommitTestUtils::randomCommitFiles) + : instance.commitFiles(); + yield new StatelessCompoundCommit( + instance.shardId(), + instance.primaryTermAndGeneration(), + instance.translogRecoveryStartFile(), + instance.nodeEphemeralId(), + commitFiles, + instance.sizeInBytes(), + randomValueOtherThan(instance.internalFiles(), () -> Set.copyOf(randomSubsetOf(commitFiles.keySet()))), + instance.headerSizeInBytes(), + instance.internalFilesReplicatedRanges(), + instance.extraContent(), + instance.timestampFieldValueRange() + ); + } + case 7 -> new StatelessCompoundCommit( + instance.shardId(), + instance.primaryTermAndGeneration(), + instance.translogRecoveryStartFile(), + instance.nodeEphemeralId(), + instance.commitFiles(), + instance.sizeInBytes(), + instance.internalFiles(), + randomValueOtherThan(instance.headerSizeInBytes(), StatelessCompoundCommitTestUtils::randomNonZeroPositiveLong), + instance.internalFilesReplicatedRanges(), + instance.extraContent(), + instance.timestampFieldValueRange() + ); + case 8 -> new StatelessCompoundCommit( + instance.shardId(), + instance.primaryTermAndGeneration(), + instance.translogRecoveryStartFile(), + instance.nodeEphemeralId(), + instance.commitFiles(), + instance.sizeInBytes(), + instance.internalFiles(), + instance.headerSizeInBytes(), + randomValueOtherThan( + instance.internalFilesReplicatedRanges(), + StatelessCompoundCommitTestUtils::randomInternalFilesReplicatedRanges + ), + instance.extraContent(), + instance.timestampFieldValueRange() + ); + case 9 -> { + final Map extraContent = randomValueOtherThan( + instance.extraContent(), + () -> rarely() ? Map.of() : StatelessCompoundCommitTestUtils.randomCommitFiles() + ); + yield StatelessCompoundCommit.newHollowStatelessCompoundCommit( + instance.shardId(), + instance.primaryTermAndGeneration(), + instance.commitFiles(), + instance.sizeInBytes(), + instance.internalFiles(), + instance.headerSizeInBytes(), + instance.internalFilesReplicatedRanges(), + extraContent, + instance.timestampFieldValueRange() + ); + } + case 10 -> { + TimestampFieldValueRange newTimestampFieldValueRange; + if (instance.timestampFieldValueRange() == null) { + newTimestampFieldValueRange = StatelessCompoundCommitTestUtils.randomTimestampFieldValueRange(); + } else if (randomBoolean()) { + newTimestampFieldValueRange = null; + } else { + newTimestampFieldValueRange = randomValueOtherThanMany( + timestampFieldValueRange -> timestampFieldValueRange.equals(instance.timestampFieldValueRange()), + StatelessCompoundCommitTestUtils::randomTimestampFieldValueRange + ); + } + yield StatelessCompoundCommit.newHollowStatelessCompoundCommit( + instance.shardId(), + instance.primaryTermAndGeneration(), + instance.commitFiles(), + instance.sizeInBytes(), + instance.internalFiles(), + instance.headerSizeInBytes(), + instance.internalFilesReplicatedRanges(), + instance.extraContent(), + newTimestampFieldValueRange + ); + } + default -> throw new AssertionError("Unexpected value"); + }; + } + + @Override + protected Writeable.Reader instanceReader() { + return StatelessCompoundCommit::readFromTransport; + } + + public void testStoreVersionCompatibility() throws Exception { + StatelessCompoundCommit testInstance = randomCompoundCommit( + randomShardId(), + new PrimaryTermAndGeneration(randomNonZeroPositiveLong(), randomNonZeroPositiveLong()), + // hollow shards were not supported at the time + false + ); + + try (BytesStreamOutput output = new BytesStreamOutput()) { + 
PositionTrackingOutputStreamStreamOutput positionTracking = new PositionTrackingOutputStreamStreamOutput(output); + + Map referencedCommitBlobsWithoutBlobLength = StatelessCompoundCommitTestUtils.randomCommitFiles(); + List internalFiles = new ArrayList<>(); + int internalFileCount = randomIntBetween(1, 10); + for (int i = 0; i < internalFileCount; i++) { + internalFiles.add(new StatelessCompoundCommit.InternalFile("internal_file_" + i, randomLongBetween(100, 1000))); + } + + writeBwcHeader( + positionTracking, + testInstance.shardId(), + testInstance.generation(), + testInstance.primaryTerm(), + testInstance.nodeEphemeralId(), + referencedCommitBlobsWithoutBlobLength, + internalFiles, + randomFrom(StatelessCompoundCommit.VERSION_WITH_COMMIT_FILES, StatelessCompoundCommit.VERSION_WITH_BLOB_LENGTH) + ); + + var headerOffset = positionTracking.position(); + var totalSize = headerOffset + internalFiles.stream().mapToLong(StatelessCompoundCommit.InternalFile::length).sum(); + var expectedCommitFiles = StatelessCompoundCommit.combineCommitFiles( + new BlobFile( + StatelessCompoundCommit.blobNameFromGeneration(testInstance.generation()), + new PrimaryTermAndGeneration(testInstance.primaryTerm(), testInstance.generation()) + ), + InternalFilesReplicatedRanges.EMPTY, + internalFiles, + referencedCommitBlobsWithoutBlobLength, + 0, + headerOffset, + List.of() + ); + // StatelessCompoundCommit.VERSION_WITH_COMMIT_FILES, StatelessCompoundCommit.VERSION_WITH_BLOB_LENGTH do not support + // translogRecoveryVersion or timestamp field value range, so the deserialized value will always be 0 and null, respectively. + StatelessCompoundCommit withOldBlobLengths = new StatelessCompoundCommit( + testInstance.shardId(), + testInstance.primaryTermAndGeneration(), + 0, + testInstance.nodeEphemeralId(), + expectedCommitFiles.commitFiles(), + totalSize, + internalFiles.stream().map(StatelessCompoundCommit.InternalFile::name).collect(Collectors.toSet()), + headerOffset, + InternalFilesReplicatedRanges.EMPTY, + Map.of(), + null // BWC versions do not write any timestamp field value ranges + ); + + try (StreamInput in = output.bytes().streamInput()) { + StatelessCompoundCommit compoundCommit = StatelessCompoundCommit.readFromStore(in); + assertEqualInstances(withOldBlobLengths, compoundCommit); + } + } + } + + // This method is moved from StatelessCompoundCommit since the production code only needs to write commit blobs with current version + private static long writeBwcHeader( + PositionTrackingOutputStreamStreamOutput positionTracking, + ShardId shardId, + long generation, + long primaryTerm, + String nodeEphemeralId, + Map referencedBlobFiles, + List internalFiles, + int version + ) throws IOException { + assert version < StatelessCompoundCommit.VERSION_WITH_XCONTENT_ENCODING; + BufferedChecksumStreamOutput out = new BufferedChecksumStreamOutput(positionTracking); + CodecUtil.writeHeader(new OutputStreamDataOutput(out), StatelessCompoundCommit.SHARD_COMMIT_CODEC, version); + TransportVersion.writeVersion(TransportVersion.current(), out); + out.writeWriteable(shardId); + out.writeVLong(generation); + out.writeVLong(primaryTerm); + out.writeString(nodeEphemeralId); + out.writeMap(referencedBlobFiles, StreamOutput::writeString, (so, v) -> { + final boolean includeBlobLength = version >= StatelessCompoundCommit.VERSION_WITH_BLOB_LENGTH; + so.writeVLong(v.primaryTerm()); + so.writeString(v.blobName()); + if (includeBlobLength) { + so.writeVLong(v.offset() + v.fileLength()); + } + so.writeVLong(v.offset()); + 
so.writeVLong(v.fileLength()); + }); + out.writeCollection(internalFiles); + out.flush(); + // Add 8 bytes for the header size field and 4 bytes for the checksum + var headerSize = positionTracking.position() + 8 + 4; + out.writeLong(headerSize); + out.writeInt((int) out.getChecksum()); + out.flush(); + return headerSize; + } + + public void testStoreCorruption() throws Exception { + StatelessCompoundCommit testInstance = createTestInstance(); + + try (BytesStreamOutput output = new BytesStreamOutput()) { + Map commitFiles = testInstance.commitFiles(); + + StatelessCompoundCommit.writeXContentHeader( + testInstance.shardId(), + testInstance.generation(), + testInstance.primaryTerm(), + testInstance.nodeEphemeralId(), + 0, + testInstance.timestampFieldValueRange(), + commitFiles, + List.of(), + InternalFilesReplicatedRanges.EMPTY, + new PositionTrackingOutputStreamStreamOutput(output), + randomBoolean(), + List.of() + ); + // flip one byte anywhere + byte[] bytes = BytesReference.toBytes(output.bytes()); + int i = randomIntBetween(0, bytes.length - 1); + bytes[i] = (byte) ~bytes[i]; + try (StreamInput in = new ByteArrayStreamInput(bytes)) { + try { + StatelessCompoundCommit.readFromStore(in); + assert false : "Should have thrown"; + } catch (IOException e) { + assertThat(e.getMessage(), containsString("Failed to read shard commit")); + } catch (AssertionError e) { + assertThat(e.getMessage(), containsString("(offset + file) length is greater than blobLength")); + } + } + } + } + + public void testShouldReadHeaderRegardlessFeatureFlagState() throws IOException { + StatelessCompoundCommit testInstance = createTestInstance(); + var writerFeatureFlag = randomBoolean(); + + byte[] bytes; + + try (BytesStreamOutput output = new BytesStreamOutput()) { + StatelessCompoundCommit.writeXContentHeader( + testInstance.shardId(), + testInstance.generation(), + testInstance.primaryTerm(), + testInstance.nodeEphemeralId(), + testInstance.translogRecoveryStartFile(), + testInstance.timestampFieldValueRange(), + testInstance.commitFiles(), + List.of(), + InternalFilesReplicatedRanges.EMPTY, + new PositionTrackingOutputStreamStreamOutput(output), + writerFeatureFlag, + List.of() + ); + bytes = BytesReference.toBytes(output.bytes()); + } + + try (StreamInput in = new ByteArrayStreamInput(bytes)) { + var copy = StatelessCompoundCommit.readFromStore(in); + + assertThat(copy.shardId(), equalTo(testInstance.shardId())); + assertThat(copy.generation(), equalTo(testInstance.generation())); + assertThat(copy.primaryTerm(), equalTo(testInstance.primaryTerm())); + assertThat(copy.nodeEphemeralId(), equalTo(testInstance.nodeEphemeralId())); + assertThat(copy.translogRecoveryStartFile(), equalTo(testInstance.translogRecoveryStartFile())); + assertThat(copy.timestampFieldValueRange(), equalTo(testInstance.timestampFieldValueRange())); + assertThat(copy.commitFiles(), equalTo(testInstance.commitFiles())); + } + } + + public void testGetInternalFilesBoundaryOffsetInCurrentTermWithMixedFiles() { + PrimaryTermAndGeneration previousGeneration = new PrimaryTermAndGeneration(4L, 4L); + PrimaryTermAndGeneration currentGeneration = new PrimaryTermAndGeneration(5L, 5L); + + BlobLocation previousMin = new BlobLocation(new BlobFile(StatelessCompoundCommit.PREFIX + "4", previousGeneration), 50L, 25L); + BlobLocation previousMax = new BlobLocation(new BlobFile(StatelessCompoundCommit.PREFIX + "4", previousGeneration), 400L, 25L); + BlobLocation currentMin = new BlobLocation(new BlobFile(StatelessCompoundCommit.PREFIX + "5", 
currentGeneration), 100L, 50L); + BlobLocation currentMax = new BlobLocation(new BlobFile(StatelessCompoundCommit.PREFIX + "5", currentGeneration), 300L, 50L); + + Map commitFiles = Map.of( + "previousMin", + previousMin, + "previousMax", + previousMax, + "currentMin", + currentMin, + "inBetween", + new BlobLocation(new BlobFile(StatelessCompoundCommit.PREFIX + "5", currentGeneration), 200L, 50L), + "currentMax", + currentMax + ); + + StatelessCompoundCommit commit = new StatelessCompoundCommit( + randomShardId(), + currentGeneration, + 1L, + "node-1", + commitFiles, + 700L, + Set.of("currentMin", "inBetween", "currentMax"), + 50L, + InternalFilesReplicatedRanges.EMPTY, + Map.of(), + randomTimestampFieldValueRange() + ); + + assertThat(commit.getMaxInternalFilesOffsetInCurrentGeneration(), equalTo(currentMax)); + assertThat(commit.getMinInternalFilesOffsetInCurrentGeneration(), equalTo(currentMin)); + } +} diff --git a/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/VirtualBatchedCompoundCommitTestUtils.java b/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/VirtualBatchedCompoundCommitTestUtils.java new file mode 100644 index 0000000000000..d56dccad22d91 --- /dev/null +++ b/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/commits/VirtualBatchedCompoundCommitTestUtils.java @@ -0,0 +1,32 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.stateless.commits; + +import java.io.InputStream; +import java.util.List; + +public class VirtualBatchedCompoundCommitTestUtils { + + private VirtualBatchedCompoundCommitTestUtils() {} + + public static BlobLocation getBlobLocation(VirtualBatchedCompoundCommit target, String fileName) { + return target.getBlobLocation(fileName); + } + + public static List getPendingStatelessCompoundCommits(VirtualBatchedCompoundCommit target) { + return target.getPendingCompoundCommits().stream().map(cc -> cc.getStatelessCompoundCommit()).toList(); + } + + public static long getHeaderSize(VirtualBatchedCompoundCommit.PendingCompoundCommit pendingCompoundCommit) { + return pendingCompoundCommit.getHeaderSize(); + } + + public static InputStream getInputStreamForUpload(VirtualBatchedCompoundCommit vbcc) { + return vbcc.getInputStreamForUpload(); + } +} diff --git a/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/engine/PrimaryTermAndGenerationTests.java b/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/engine/PrimaryTermAndGenerationTests.java new file mode 100644 index 0000000000000..9cf3ecd2a03da --- /dev/null +++ b/x-pack/plugin/stateless/src/test/java/org/elasticsearch/xpack/stateless/engine/PrimaryTermAndGenerationTests.java @@ -0,0 +1,69 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.stateless.engine; + +import org.elasticsearch.common.io.stream.Writeable; +import org.elasticsearch.test.AbstractWireSerializingTestCase; +import org.elasticsearch.test.ESTestCase; + +import java.io.IOException; + +import static org.hamcrest.CoreMatchers.equalTo; +import static org.hamcrest.Matchers.greaterThan; +import static org.hamcrest.Matchers.lessThan; + +public class PrimaryTermAndGenerationTests extends AbstractWireSerializingTestCase<PrimaryTermAndGeneration> { + @Override + protected Writeable.Reader<PrimaryTermAndGeneration> instanceReader() { + return PrimaryTermAndGeneration::new; + } + + @Override + protected PrimaryTermAndGeneration createTestInstance() { + return randomPrimaryTermAndGeneration(); + } + + @Override + protected PrimaryTermAndGeneration mutateInstance(PrimaryTermAndGeneration instance) throws IOException { + return mutatePrimaryTermAndGeneration(instance); + } + + public static PrimaryTermAndGeneration randomPrimaryTermAndGeneration() { + return new PrimaryTermAndGeneration(randomNonNegativeLong(), randomNonNegativeLong()); + } + + public static PrimaryTermAndGeneration mutatePrimaryTermAndGeneration(PrimaryTermAndGeneration instance) { + return switch (randomInt(1)) { + case 0 -> new PrimaryTermAndGeneration( + randomValueOtherThan(instance.primaryTerm(), ESTestCase::randomNonNegativeLong), + instance.generation() + ); + case 1 -> new PrimaryTermAndGeneration( + instance.primaryTerm(), + randomValueOtherThan(instance.generation(), ESTestCase::randomNonNegativeLong) + ); + default -> throw new IllegalArgumentException("Unexpected branch"); + }; + } + + public void testCompareTo() { + var p1 = new PrimaryTermAndGeneration(randomNonNegativeLong(), randomNonNegativeLong()); + var p2 = new PrimaryTermAndGeneration(p1.primaryTerm(), p1.generation()); + assertThat("p1=" + p1 + ", p2=" + p2, p1.compareTo(p2), equalTo(0)); + + p1 = new PrimaryTermAndGeneration(randomNonNegativeLong(), randomNonNegativeInt()); + p2 = new PrimaryTermAndGeneration(p1.primaryTerm(), p1.generation() + randomLongBetween(1, Byte.MAX_VALUE)); + assertThat("p1=" + p1 + ", p2=" + p2, p1.compareTo(p2), lessThan(0)); + assertThat("p1=" + p1 + ", p2=" + p2, p2.compareTo(p1), greaterThan(0)); + + p1 = new PrimaryTermAndGeneration(randomNonNegativeInt(), randomNonNegativeLong()); + p2 = new PrimaryTermAndGeneration(p1.primaryTerm() + randomLongBetween(1, Byte.MAX_VALUE), randomNonNegativeLong()); + assertThat("p1=" + p1 + ", p2=" + p2, p1.compareTo(p2), lessThan(0)); + assertThat("p1=" + p1 + ", p2=" + p2, p2.compareTo(p1), greaterThan(0)); + }
+}
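
Note (not part of the patch itself): the ordering exercised by testCompareTo above is "compare by primary term first, then by generation"; the third assertion block shows the primary term dominating regardless of the generation values. A minimal illustrative sketch of a wire-serializable composite key with that contract follows. The record name, the VLong wire encoding, and the accessor names are assumptions for illustration only, not the production PrimaryTermAndGeneration source.

// Illustrative sketch only: mirrors the compareTo contract exercised by testCompareTo.
// The type name and VLong encoding are assumptions, not the actual class in this patch.
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Writeable;

import java.io.IOException;

record ExamplePrimaryTermAndGeneration(long primaryTerm, long generation)
    implements
        Writeable,
        Comparable<ExamplePrimaryTermAndGeneration> {

    ExamplePrimaryTermAndGeneration(StreamInput in) throws IOException {
        this(in.readVLong(), in.readVLong());
    }

    @Override
    public void writeTo(StreamOutput out) throws IOException {
        out.writeVLong(primaryTerm);
        out.writeVLong(generation);
    }

    @Override
    public int compareTo(ExamplePrimaryTermAndGeneration other) {
        // the primary term dominates; generation only breaks ties within the same term
        int byTerm = Long.compare(primaryTerm, other.primaryTerm);
        return byTerm != 0 ? byTerm : Long.compare(generation, other.generation);
    }
}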