@@ -588,6 +588,7 @@ void VectorReadMain(
588588 const TIntrusivePtr<TKikimrTableMetadata> & mainTableMeta,
589589 const TCoAtomList& mainColumns,
590590 const TKqpStreamLookupSettings& pushdownSettings,
591+ bool withOverlap,
591592 TExprNodePtr& read)
592593{
593594 const bool isCovered = CheckIndexCovering (mainColumns, postingTableMeta);
@@ -606,21 +607,48 @@ void VectorReadMain(
606607 .Columns (postingColumns)
607608 .Settings (isVectorCovered ? settingsNode : settings.BuildNode (ctx, pos))
608609 .Done ().Ptr ();
610+ }
609611
610- read = Build<TKqlStreamLookupTable>(ctx, pos)
611- .Table (mainTable)
612- .LookupKeys (read)
613- .Columns (mainColumns)
614- .Settings (settingsNode)
615- .Done ().Ptr ();
616- } else {
617- read = Build<TKqlStreamLookupTable>(ctx, pos)
618- .Table (postingTable)
619- .LookupKeys (read)
620- .Columns (mainColumns)
621- .Settings (settingsNode)
622- .Done ().Ptr ();
612+ const auto & targetTable = isCovered ? postingTable : mainTable;
613+
614+ if (withOverlap) {
615+ // mainColumns must contain primary key columns for DistinctColumns pushdown
616+ THashSet<TStringBuf> cols;
617+ for (const auto & col: mainColumns) {
618+ cols.insert (col.Value ());
619+ }
620+ TVector<TCoAtom> columnsWithKey;
621+ for (const auto & col: mainTableMeta->KeyColumnNames ) {
622+ if (!cols.contains (col)) {
623+ columnsWithKey.push_back (Build<TCoAtom>(ctx, pos)
624+ .Value (col)
625+ .Done ());
626+ }
627+ }
628+ if (columnsWithKey.size ()) {
629+ for (const auto & col: mainColumns) {
630+ columnsWithKey.push_back (col);
631+ }
632+ read = Build<TKqlStreamLookupTable>(ctx, pos)
633+ .Table (targetTable)
634+ .LookupKeys (read)
635+ .Columns <TCoAtomList>().Add (columnsWithKey).Build ()
636+ .Settings (settingsNode)
637+ .Done ().Ptr ();
638+ read = Build<TCoExtractMembers>(ctx, pos)
639+ .Input (read)
640+ .Members (mainColumns)
641+ .Done ().Ptr ();
642+ return ;
643+ }
623644 }
645+
646+ read = Build<TKqlStreamLookupTable>(ctx, pos)
647+ .Table (targetTable)
648+ .LookupKeys (read)
649+ .Columns (mainColumns)
650+ .Settings (settingsNode)
651+ .Done ().Ptr ();
624652}
625653
626654void VectorTopMain (TExprContext& ctx, const TCoTopBase& top, TExprNodePtr& read) {
@@ -633,6 +661,10 @@ void VectorTopMain(TExprContext& ctx, const TCoTopBase& top, TExprNodePtr& read)
633661 .Done ().Ptr ();
634662}
635663
664+ // FIXME: Most of this rewriting should probably be handled in kqp/opt/physical.
665+ // The logical optimizer should only rewrite it to something like TKqlReadTableVectorIndex.
666+ // This would remove the need for skipping KqpApplyExtractMembersToReadTable based on settings.VectorTopDistinct.
667+
636668TExprBase DoRewriteTopSortOverKMeansTree (
637669 const TReadMatch& match, const TMaybeNode<TCoFlatMap>& flatMap, const TExprBase& lambdaArgs, const TExprBase& lambdaBody, const TCoTopBase& top,
638670 TExprContext& ctx, const TKqpOptimizeContext& kqpCtx,
@@ -697,6 +729,9 @@ TExprBase DoRewriteTopSortOverKMeansTree(
697729
698730 const auto levelTop = kqpCtx.Config ->KMeansTreeSearchTopSize .Get ().GetOrElse (1 );
699731
732+ const auto & kmeansDesc = std::get<NKikimrKqp::TVectorIndexKmeansTreeDescription>(indexDesc.SpecializedIndexDescription );
733+ const bool withOverlap = kmeansDesc.settings ().overlap_clusters () > 1 ;
734+
700735 TKqpStreamLookupSettings settings;
701736 settings.Strategy = EStreamLookupStrategyType::LookupRows;
702737 settings.VectorTopColumn = NTableIndex::NKMeans::CentroidColumn;
@@ -716,7 +751,8 @@ TExprBase DoRewriteTopSortOverKMeansTree(
716751
717752 settings.VectorTopColumn = indexDesc.KeyColumns .back ();
718753 settings.VectorTopLimit = top.Count ().Ptr ();
719- VectorReadMain (ctx, pos, postingTable, postingTableDesc->Metadata , mainTable, tableDesc.Metadata , mainColumns, settings, read);
754+ settings.VectorTopDistinct = true ;
755+ VectorReadMain (ctx, pos, postingTable, postingTableDesc->Metadata , mainTable, tableDesc.Metadata , mainColumns, settings, withOverlap, read);
720756
721757 if (flatMap) {
722758 read = Build<TCoFlatMap>(ctx, flatMap.Cast ().Pos ())
@@ -827,6 +863,9 @@ TExprBase DoRewriteTopSortOverPrefixedKMeansTree(
827863
828864 const auto levelTop = kqpCtx.Config ->KMeansTreeSearchTopSize .Get ().GetOrElse (1 );
829865
866+ const auto & kmeansDesc = std::get<NKikimrKqp::TVectorIndexKmeansTreeDescription>(indexDesc.SpecializedIndexDescription );
867+ const bool withOverlap = kmeansDesc.settings ().overlap_clusters () > 1 ;
868+
830869 TKqpStreamLookupSettings settings;
831870 settings.Strategy = EStreamLookupStrategyType::LookupRows;
832871 settings.VectorTopColumn = NTableIndex::NKMeans::CentroidColumn;
@@ -849,7 +888,8 @@ TExprBase DoRewriteTopSortOverPrefixedKMeansTree(
849888
850889 settings.VectorTopColumn = indexDesc.KeyColumns .back ();
851890 settings.VectorTopLimit = top.Count ().Ptr ();
852- VectorReadMain (ctx, pos, postingTable, postingTableDesc->Metadata , mainTable, tableDesc.Metadata , mainColumns, settings, read);
891+ settings.VectorTopDistinct = true ;
892+ VectorReadMain (ctx, pos, postingTable, postingTableDesc->Metadata , mainTable, tableDesc.Metadata , mainColumns, settings, withOverlap, read);
853893
854894 if (mainLambda) {
855895 read = Build<TCoMap>(ctx, flatMap.Pos ())
0 commit comments