From cb7dfeb910267f093ccf392cf7eb4c89d7d6a21c Mon Sep 17 00:00:00 2001 From: Pranav Manoj <69426276+bossbeagle1509@users.noreply.github.com> Date: Fri, 5 Dec 2025 23:39:29 +0100 Subject: [PATCH 1/3] feat: add support for .b2z, .b2d, .b2e files and update related tests --- src/blosc2/dict_store.py | 12 ++--- src/blosc2/embed_store.py | 11 ++++- src/blosc2/schunk.py | 92 +++++++++++++++++++++++++-------------- tests/test_dict_store.py | 13 ++++++ tests/test_embed_store.py | 18 +++++++- 5 files changed, 107 insertions(+), 39 deletions(-) diff --git a/src/blosc2/dict_store.py b/src/blosc2/dict_store.py index bb104080d..20289acdb 100644 --- a/src/blosc2/dict_store.py +++ b/src/blosc2/dict_store.py @@ -153,7 +153,7 @@ def _init_read_mode(self, dparams: blosc2.DParams | None = None): if "embed.b2e" not in self.offsets: raise FileNotFoundError("Embed file embed.b2e not found in store.") estore_offset = self.offsets["embed.b2e"]["offset"] - schunk = blosc2.open(self.b2z_path, mode="r", offset=estore_offset, dparams=dparams) + schunk = blosc2.blosc2_ext.open(self.b2z_path, mode="r", offset=estore_offset, dparams=dparams) for filepath in self.offsets: if filepath.endswith((".b2nd", ".b2f")): key = "/" + filepath[: -5 if filepath.endswith(".b2nd") else -4] @@ -161,7 +161,7 @@ def _init_read_mode(self, dparams: blosc2.DParams | None = None): else: # .b2d if not os.path.isdir(self.localpath): raise FileNotFoundError(f"Directory {self.localpath} does not exist for reading.") - schunk = blosc2.open(self.estore_path, mode="r", dparams=dparams) + schunk = blosc2.blosc2_ext.open(self.estore_path, mode="r", offset=0, dparams=dparams) self._update_map_tree() self._estore = EmbedStore(_from_schunk=schunk) @@ -267,7 +267,7 @@ def __getitem__(self, key: str) -> blosc2.NDArray | SChunk | C2Array: filepath = self.map_tree[key] if filepath in self.offsets: offset = self.offsets[filepath]["offset"] - return blosc2.open(self.b2z_path, mode="r", offset=offset, dparams=self.dparams) + return blosc2.blosc2_ext.open(self.b2z_path, mode="r", offset=offset, dparams=self.dparams) else: urlpath = os.path.join(self.working_dir, filepath) if os.path.exists(urlpath): @@ -331,7 +331,9 @@ def values(self) -> Iterator[blosc2.NDArray | SChunk | C2Array]: if self.is_zip_store: if filepath in self.offsets: offset = self.offsets[filepath]["offset"] - yield blosc2.open(self.b2z_path, mode="r", offset=offset, dparams=self.dparams) + yield blosc2.blosc2_ext.open( + self.b2z_path, mode="r", offset=offset, dparams=self.dparams + ) else: urlpath = os.path.join(self.working_dir, filepath) yield blosc2.open(urlpath, mode="r" if self.mode == "r" else "a", dparams=self.dparams) @@ -350,7 +352,7 @@ def items(self) -> Iterator[tuple[str, blosc2.NDArray | SChunk | C2Array]]: if self.is_zip_store: if filepath in self.offsets: offset = self.offsets[filepath]["offset"] - yield key, blosc2.open(self.b2z_path, mode="r", offset=offset) + yield key, blosc2.blosc2_ext.open(self.b2z_path, mode="r", offset=offset) else: urlpath = os.path.join(self.working_dir, filepath) yield key, blosc2.open(urlpath, mode="r" if self.mode == "r" else "a") diff --git a/src/blosc2/embed_store.py b/src/blosc2/embed_store.py index 61dd34475..befb04e2e 100644 --- a/src/blosc2/embed_store.py +++ b/src/blosc2/embed_store.py @@ -108,7 +108,7 @@ def __init__( self.storage = storage if mode in ("r", "a") and urlpath: - self._store = blosc2.open(urlpath, mode=mode) + self._store = blosc2.blosc2_ext.open(urlpath, mode=mode, offset=0) self._load_metadata() return @@ -254,6 +254,15 @@ def to_cframe(self) -> bytes: """Serialize embed store to CFrame format.""" return self._store.to_cframe() + def __enter__(self): + """Context manager enter.""" + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Context manager exit.""" + # No need to close anything as SChunk/NDArray handles persistence automatically + return False + def estore_from_cframe(cframe: bytes, copy: bool = False) -> EmbedStore: """ diff --git a/src/blosc2/schunk.py b/src/blosc2/schunk.py index 37564b97a..437ef5a9a 100644 --- a/src/blosc2/schunk.py +++ b/src/blosc2/schunk.py @@ -1470,11 +1470,59 @@ def __dealloc__(self): super().__dealloc__() +def _open_special_store(urlpath, mode, offset, **kwargs): + if urlpath.endswith(".b2z") or urlpath.endswith(".b2d"): + if offset != 0: + raise ValueError("Offset must be 0 for DictStore") + from blosc2.dict_store import DictStore + + return DictStore(urlpath, mode=mode, **kwargs) + elif urlpath.endswith(".b2e"): + if offset != 0: + raise ValueError("Offset must be 0 for EmbedStore") + from blosc2.embed_store import EmbedStore + + return EmbedStore(urlpath, mode=mode, **kwargs) + return None + + +def _set_default_dparams(kwargs): + dparams = kwargs.get("dparams") + if dparams is None: + # Use multiple threads for decompression by default, unless we are in WASM + # (does not support threads). The only drawback for using multiple threads + # is that access time will be slower because of the overhead of spawning threads + # (but could be fixed in the future with more intelligent thread pools). + dparams = ( + blosc2.DParams(nthreads=blosc2.nthreads) if not blosc2.IS_WASM else blosc2.DParams(nthreads=1) + ) + kwargs["dparams"] = dparams + + +def _process_opened_object(res): + meta = getattr(res, "schunk", res).meta + if "proxy-source" in meta: + proxy_src = meta["proxy-source"] + if proxy_src["local_abspath"] is not None: + src = blosc2.open(proxy_src["local_abspath"]) + return blosc2.Proxy(src, _cache=res) + elif proxy_src["urlpath"] is not None: + src = blosc2.C2Array(proxy_src["urlpath"][0], proxy_src["urlpath"][1], proxy_src["urlpath"][2]) + return blosc2.Proxy(src, _cache=res) + elif not proxy_src["caterva2_env"]: + raise RuntimeError("Could not find the source when opening a Proxy") + + if isinstance(res, blosc2.NDArray) and "LazyArray" in res.schunk.meta: + return blosc2._open_lazyarray(res) + else: + return res + + def open( urlpath: str | pathlib.Path | blosc2.URLPath, mode: str = "a", offset: int = 0, **kwargs: dict -) -> blosc2.SChunk | blosc2.NDArray | blosc2.C2Array | blosc2.LazyArray | blosc2.Proxy: - """Open a persistent :ref:`SChunk`, :ref:`NDArray`, a remote :ref:`C2Array` - or a :ref:`Proxy` +) -> blosc2.SChunk | blosc2.NDArray | blosc2.C2Array | blosc2.LazyArray | blosc2.Proxy | Any: + """Open a persistent :ref:`SChunk`, :ref:`NDArray`, a remote :ref:`C2Array`, + a :ref:`Proxy`, a :ref:`DictStore` or an :ref:`EmbedStore`. See the `Notes` section for more info on opening `Proxy` objects. @@ -1510,9 +1558,8 @@ def open( Returns ------- - out: :ref:`SChunk`, :ref:`NDArray` or :ref:`C2Array` - The SChunk or NDArray (if there is a "b2nd" metalayer") - or the C2Array if :paramref:`urlpath` is a :ref:`blosc2.URLPath ` instance. + out: :ref:`SChunk`, :ref:`NDArray`, :ref:`C2Array`, :ref:`DictStore` or :ref:`EmbedStore` + The object found in the path. Notes ----- @@ -1577,34 +1624,15 @@ def open( if isinstance(urlpath, pathlib.PurePath): urlpath = str(urlpath) + + special = _open_special_store(urlpath, mode, offset, **kwargs) + if special is not None: + return special + if not os.path.exists(urlpath): raise FileNotFoundError(f"No such file or directory: {urlpath}") - dparams = kwargs.get("dparams") - if dparams is None: - # Use multiple threads for decompression by default, unless we are in WASM - # (does not support threads). The only drawback for using multiple threads - # is that access time will be slower because of the overhead of spawning threads - # (but could be fixed in the future with more intelligent thread pools). - dparams = ( - blosc2.DParams(nthreads=blosc2.nthreads) if not blosc2.IS_WASM else blosc2.DParams(nthreads=1) - ) - kwargs["dparams"] = dparams + _set_default_dparams(kwargs) res = blosc2_ext.open(urlpath, mode, offset, **kwargs) - meta = getattr(res, "schunk", res).meta - if "proxy-source" in meta: - proxy_src = meta["proxy-source"] - if proxy_src["local_abspath"] is not None: - src = blosc2.open(proxy_src["local_abspath"]) - return blosc2.Proxy(src, _cache=res) - elif proxy_src["urlpath"] is not None: - src = blosc2.C2Array(proxy_src["urlpath"][0], proxy_src["urlpath"][1], proxy_src["urlpath"][2]) - return blosc2.Proxy(src, _cache=res) - elif not proxy_src["caterva2_env"]: - raise RuntimeError("Could not find the source when opening a Proxy") - - if isinstance(res, blosc2.NDArray) and "LazyArray" in res.schunk.meta: - return blosc2._open_lazyarray(res) - else: - return res + return _process_opened_object(res) diff --git a/tests/test_dict_store.py b/tests/test_dict_store.py index 5313c5101..0136eef43 100644 --- a/tests/test_dict_store.py +++ b/tests/test_dict_store.py @@ -436,3 +436,16 @@ def test_get_with_different_types(): finally: if os.path.exists(path): os.remove(path) + + +def test_open_context_manager(populated_dict_store): + """Test opening via blosc2.open as a context manager.""" + dstore_fixture, path = populated_dict_store + # Close the fixture store to ensure data is written to disk + dstore_fixture.close() + + # Test opening via blosc2.open as a context manager + with blosc2.open(path, mode="r") as dstore: + assert isinstance(dstore, DictStore) + assert "/node1" in dstore + assert np.array_equal(dstore["/node1"][:], np.array([1, 2, 3])) diff --git a/tests/test_embed_store.py b/tests/test_embed_store.py index 108706230..e26fbf9c0 100644 --- a/tests/test_embed_store.py +++ b/tests/test_embed_store.py @@ -19,7 +19,7 @@ def cleanup_files(): "test_estore.b2e", "external_node3.b2nd", ] - yield + yield files for f in files: if os.path.exists(f): os.remove(f) @@ -201,3 +201,19 @@ def test_store_and_retrieve_schunk(): assert value.nbytes == len(data) assert value[:] == data assert value.vlmeta["description"] == vlmeta + + +def test_open_context_manager(cleanup_files): + """Test opening via blosc2.open as a context manager.""" + path = "test_embed_open.b2e" + cleanup_files.append(path) + + # Create an EmbedStore + estore = blosc2.EmbedStore(path, mode="w") + estore["/node1"] = np.arange(10) + + # Test opening via blosc2.open as a context manager + with blosc2.open(path, mode="r") as estore_read: + assert isinstance(estore_read, blosc2.EmbedStore) + assert "/node1" in estore_read + assert np.array_equal(estore_read["/node1"][:], np.arange(10)) From 66f85cc49788b8e211bf7fbb4acee0e994a98b46 Mon Sep 17 00:00:00 2001 From: Pranav Manoj <69426276+bossbeagle1509@users.noreply.github.com> Date: Sat, 13 Dec 2025 01:35:19 +0100 Subject: [PATCH 2/3] fix: added support and tests for opening TreeStore --- src/blosc2/schunk.py | 23 +++++++++++++++++++---- tests/test_tree_store.py | 15 +++++++++++++++ 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/src/blosc2/schunk.py b/src/blosc2/schunk.py index 437ef5a9a..f28637361 100644 --- a/src/blosc2/schunk.py +++ b/src/blosc2/schunk.py @@ -1471,12 +1471,18 @@ def __dealloc__(self): def _open_special_store(urlpath, mode, offset, **kwargs): - if urlpath.endswith(".b2z") or urlpath.endswith(".b2d"): + if urlpath.endswith(".b2d"): if offset != 0: raise ValueError("Offset must be 0 for DictStore") from blosc2.dict_store import DictStore return DictStore(urlpath, mode=mode, **kwargs) + elif urlpath.endswith(".b2z"): + if offset != 0: + raise ValueError("Offset must be 0 for TreeStore") + from blosc2.tree_store import TreeStore + + return TreeStore(urlpath, mode=mode, **kwargs) elif urlpath.endswith(".b2e"): if offset != 0: raise ValueError("Offset must be 0 for EmbedStore") @@ -1520,9 +1526,18 @@ def _process_opened_object(res): def open( urlpath: str | pathlib.Path | blosc2.URLPath, mode: str = "a", offset: int = 0, **kwargs: dict -) -> blosc2.SChunk | blosc2.NDArray | blosc2.C2Array | blosc2.LazyArray | blosc2.Proxy | Any: +) -> ( + blosc2.SChunk + | blosc2.NDArray + | blosc2.C2Array + | blosc2.LazyArray + | blosc2.Proxy + | blosc2.DictStore + | blosc2.TreeStore + | blosc2.EmbedStore +): """Open a persistent :ref:`SChunk`, :ref:`NDArray`, a remote :ref:`C2Array`, - a :ref:`Proxy`, a :ref:`DictStore` or an :ref:`EmbedStore`. + a :ref:`Proxy`, a :ref:`DictStore`, :ref:`EmbedStore`, or :ref:`TreeStore`. See the `Notes` section for more info on opening `Proxy` objects. @@ -1558,7 +1573,7 @@ def open( Returns ------- - out: :ref:`SChunk`, :ref:`NDArray`, :ref:`C2Array`, :ref:`DictStore` or :ref:`EmbedStore` + out: :ref:`SChunk`, :ref:`NDArray`, :ref:`C2Array`, :ref:`DictStore`, :ref:`EmbedStore`, or :ref:`TreeStore` The object found in the path. Notes diff --git a/tests/test_tree_store.py b/tests/test_tree_store.py index 068558763..f17bd168c 100644 --- a/tests/test_tree_store.py +++ b/tests/test_tree_store.py @@ -919,3 +919,18 @@ def test_key_normalization(): assert "/group/data2" in tstore os.remove("test_key_normalization.b2z") + + +def test_open_context_manager(populated_tree_store): + """Test opening via blosc2.open as a context manager.""" + tstore_fixture, path = populated_tree_store + if ".b2d" in path: + pytest.skip("This test is only for b2z storage") + # Close the fixture store to ensure data is written to disk + tstore_fixture.close() + + # Test opening via blosc2.open as a context manager + with blosc2.open(path, mode="r") as tstore: + assert isinstance(tstore, TreeStore) + assert "/child0/data" in tstore + assert np.array_equal(tstore["/child0/data"][:], np.array([1, 2, 3])) From 5f1a0b52138e71d5620c88973017a9fb5b84bb53 Mon Sep 17 00:00:00 2001 From: Pranav Manoj <69426276+bossbeagle1509@users.noreply.github.com> Date: Sat, 13 Dec 2025 01:57:10 +0100 Subject: [PATCH 3/3] docs: updated docs to reflect new `blosc2.open` syntax --- doc/reference/dict_store.rst | 4 ++-- doc/reference/embed_store.rst | 4 ++++ doc/reference/tree_store.rst | 4 ++++ examples/dict-store.py | 2 +- examples/embed-store.py | 2 +- examples/tree-store.py | 2 +- 6 files changed, 13 insertions(+), 5 deletions(-) diff --git a/doc/reference/dict_store.rst b/doc/reference/dict_store.rst index d04538bdc..dd0ba6cc1 100644 --- a/doc/reference/dict_store.rst +++ b/doc/reference/dict_store.rst @@ -29,8 +29,8 @@ Quick example arr_ext = blosc2.arange(3, urlpath="n3.b2nd", mode="w") dstore["/dir1/node3"] = arr_ext # external file referenced - # Reopen and read - with blosc2.DictStore("my_dstore.b2z", mode="r") as dstore: + # Reopen and read using blosc2.open + with blosc2.open("my_dstore.b2z", mode="r") as dstore: print(sorted(dstore.keys())) # ['/dir1/node3', '/node1', '/node2'] print(dstore["/node1"][:]) # [1 2 3] diff --git a/doc/reference/embed_store.rst b/doc/reference/embed_store.rst index 60d66d253..261541a51 100644 --- a/doc/reference/embed_store.rst +++ b/doc/reference/embed_store.rst @@ -41,6 +41,10 @@ Quickstart print(list(estore.keys())) # ['/node1', '/node2', '/node3', '/node4'] + # Reopen using blosc2.open + estore = blosc2.open("example_estore.b2e", mode="r") + print(list(estore.keys())) + .. note:: - Embedded arrays (NumPy, NDArray, and SChunk) increase the size of the ``.b2e`` container. - Remote ``C2Array`` nodes only store lightweight references; reading them requires access to the remote source. NDArrays coming from external ``.b2nd`` files are embedded into the store. diff --git a/doc/reference/tree_store.rst b/doc/reference/tree_store.rst index 8d89f951b..29ac41aea 100644 --- a/doc/reference/tree_store.rst +++ b/doc/reference/tree_store.rst @@ -47,6 +47,10 @@ Quick example print(sorted(subtree.keys())) # ['/child1/leaf2', '/child2', '/leaf1'] print(subtree["/child1/leaf2"][:]) # [4 5 6] + # Reopen using blosc2.open + with blosc2.open("my_tree.b2z", mode="r") as tstore: + print(sorted(tstore.keys())) + .. currentmodule:: blosc2 .. autoclass:: TreeStore diff --git a/examples/dict-store.py b/examples/dict-store.py index 5b1483a73..581a814c2 100644 --- a/examples/dict-store.py +++ b/examples/dict-store.py @@ -31,7 +31,7 @@ print("After deletion, keys:", list(dstore.keys())) # Reading back the dstore -with blosc2.DictStore("example_dstore.b2z", mode="a") as dstore2: +with blosc2.open("example_dstore.b2z", mode="a") as dstore2: # Add another node to the dstore dstore2["/dir2/node5"] = np.array([4, 5, 6]) print("Node5 data:", dstore2["/dir2/node5"][:]) diff --git a/examples/embed-store.py b/examples/embed-store.py index b17598222..9fc9bf55e 100644 --- a/examples/embed-store.py +++ b/examples/embed-store.py @@ -35,7 +35,7 @@ # Reading back the tree if persistent: - estore_read = blosc2.EmbedStore(urlpath="example_estore.b2e", mode="a") + estore_read = blosc2.open("example_estore.b2e", mode="a") else: estore_read = blosc2.from_cframe(estore.to_cframe()) diff --git a/examples/tree-store.py b/examples/tree-store.py index a22a64072..1462d44b9 100644 --- a/examples/tree-store.py +++ b/examples/tree-store.py @@ -60,7 +60,7 @@ print("After deleting '/child0/child1', keys:", sorted(tstore.keys())) # Reopen and add another leaf under an existing subtree -with blosc2.TreeStore("example_tree.b2z", mode="a") as tstore2: +with blosc2.open("example_tree.b2z", mode="a") as tstore2: tstore2["/child0/new_leaf"] = np.array([9, 9, 9]) print("Reopened keys:", sorted(tstore2.keys())) # Read via subtree view