Merged
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -35,7 +35,7 @@ jobs:
      run: |
        python -m pip install --upgrade pip
        if [ "${{ matrix.numpy_version }}" = "numpy<2" ]; then
-          pip install ".[test,pandas,spark,test_numpy_pre2]"
+          pip install ".[test,pandas,test_spark_pre2,test_numpy_pre2]"
        else
          pip install ".[test,pandas,spark]"
        fi
16 changes: 8 additions & 8 deletions README.rst
@@ -17,8 +17,8 @@ more quickly via Numpy commands, rather than Python for loops.

This Python implementation of histogrammar has been tested to guarantee compatibility with its Scala implementation.

-Latest Python release: v1.1.1 (Aug 2025).
-Latest update: Aug 2025.
+Latest Python release: v1.1.2 (Sep 2025).
+Latest update: Sep 2025.

References
==========
@@ -38,19 +38,19 @@ Changes
See Changes log `here <https://github.com/histogrammar/histogrammar-python/blob/master/CHANGES.rst>`_.


-Spark 3.X
----------
+Spark
+-----

-With Spark 3.X, based on Scala 2.12 or 2.13, make sure to pick up the correct histogrammar jar files:
+With Spark, make sure to pick up the correct histogrammar jar files. Spark 4.X is based on Scala 2.13; Spark 3.X is based on Scala 2.12 or 2.13.

.. code-block:: python

spark = SparkSession.builder.config("spark.jars.packages", "io.github.histogrammar:histogrammar_2.12:1.0.30,io.github.histogrammar:histogrammar-sparksql_2.12:1.0.30").getOrCreate()
spark = SparkSession.builder.config("spark.jars.packages", "io.github.histogrammar:histogrammar_2.13:1.0.30,io.github.histogrammar:histogrammar-sparksql_2.13:1.0.30").getOrCreate()


-For Scala 2.13, in the string above simply replace "2.12" with "2.13".
+For Scala 2.12, in the string above simply replace "2.13" with "2.12".

-December, 2023
+September, 2025
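
For reference, a minimal sketch of the resulting Scala 2.12 builder line (the same call as above with only the artifact suffix swapped):

.. code-block:: python

    spark = SparkSession.builder.config("spark.jars.packages", "io.github.histogrammar:histogrammar_2.12:1.0.30,io.github.histogrammar:histogrammar-sparksql_2.12:1.0.30").getOrCreate()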


Example notebooks
2 changes: 1 addition & 1 deletion histogrammar/dfinterface/spark_histogrammar.py
@@ -225,7 +225,7 @@ def construct_empty_hist(self, df, features):
        for idx, col in enumerate(revcols):
            # histogram type depends on the data type
            dt = self.var_dtype[col]
-            quant = df[col]
+            quant = f.col(col)
            hist = self.get_hist_bin(hist, features, quant, col, dt)

        return hist
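
A hedged sketch of the distinction (the added line implies pyspark.sql.functions is available as f in this module): f.col(name) builds a free-standing column expression resolved by name at execution time, whereas df[col] returns a Column bound to that particular DataFrame.

    from pyspark.sql import functions as f

    quant = f.col("age")   # unbound column expression, resolved when the plan runs
    # quant = df["age"]    # previous form: a Column tied to this specific DataFrame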
9 changes: 5 additions & 4 deletions histogrammar/notebooks/histogrammar_tutorial_advanced.ipynb
@@ -3,6 +3,7 @@
{
"cell_type": "markdown",
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
},
@@ -118,9 +119,9 @@
"# for spark 2.X, in the jars string, for both jar files change \"_2.12\" into \"_2.11\".\n",
"\n",
"if pyspark_installed:\n",
" scala = '2.12' if int(pyspark_version[0]) >= 3 else '2.11'\n",
" hist_jar = f'io.github.histogrammar:histogrammar_{scala}:1.0.20'\n",
" hist_spark_jar = f'io.github.histogrammar:histogrammar-sparksql_{scala}:1.0.20'\n",
" scala = '2.12' if int(pyspark_version[0]) == 3 else '2.13'\n",
" hist_jar = f'io.github.histogrammar:histogrammar_{scala}:1.0.30'\n",
" hist_spark_jar = f'io.github.histogrammar:histogrammar-sparksql_{scala}:1.0.30'\n",
"\n",
" spark = SparkSession.builder.config(\n",
" \"spark.jars.packages\", f'{hist_spark_jar},{hist_jar}'\n",
@@ -521,7 +522,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
"version": "3.11.11"
},
"nteract": {
"version": "0.15.0"
2 changes: 1 addition & 1 deletion histogrammar/util.py
@@ -247,7 +247,7 @@ def __init__(self, expr, name=None):
            ok = False
        else:
            if isinstance(expr, Column) and self.name is None:
-                self.name = str(expr)[7:-1]
+                self.name = str(expr)[8:-2]
                ok = True
        if not ok:
            raise TypeError(f"quantity ({expr}) must be a string, function, or SparkSQL Column")
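
A minimal sketch of the slice change, assuming str() of a PySpark Column renders as "Column<'age'>" (the updated test expectations below match this):

    s = "Column<'age'>"   # assumed string form of f.col("age")
    s[7:-1]               # -> "'age'"  (old slice: surrounding quotes kept in the name)
    s[8:-2]               # -> "age"    (new slice: quotes stripped)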
2 changes: 1 addition & 1 deletion histogrammar/version.py
@@ -2,7 +2,7 @@

import re

version = "1.1.1"
version = "1.1.2"


def split_version_string(version_string: str) -> tuple[int, int]:
9 changes: 6 additions & 3 deletions pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "histogrammar"
description = "Composable histogram primitives for distributed data reduction"
description = "Histograms for Pandas/Spark/Numpy"
keywords = [
"pandas",
"spark",
@@ -17,7 +17,7 @@ keywords = [
]
readme = "README.rst"
requires-python = ">=3.9"
authors = [{ name = "Jim Pivarski (DIANA-HEP)", email = "pivarski@fnal.gov" }, { name = "Max Baak", email = "maxbaak@gmail.com" }]
authors = [{ name = "Max Baak", email = "maxbaak@gmail.com" }, { name = "Jim Pivarski (DIANA-HEP)", email = "pivarski@fnal.gov" }]
maintainers = [{ name = "Max Baak", email = "maxbaak@gmail.com" }]
license = { type = "Apache Software License v2", file = "LICENSE" }
dependencies = [
@@ -40,7 +40,7 @@ pandas = [
"pandas"
]
spark = [
"pyspark<4; python_version <= '3.11'",
"pyspark",
]
test = [
"ipykernel>=5.1.3",
@@ -55,6 +55,9 @@ test_numpy_pre2 = [
"numpy<2",
"pandas<2",
]
+test_spark_pre2 = [
+    "pyspark<4; python_version <= '3.11'",
+]
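
This new extra pins the pre-Spark-4 environment and is what the updated CI step above installs for the numpy<2 leg of the matrix: pip install ".[test,pandas,test_spark_pre2,test_numpy_pre2]".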

# files to be shipped with the installation, under: histogrammar/test_data and histogrammar/notebooks
# after installation, these can be found with the functions in resources.py
Binary file added tests/jars/histogrammar-sparksql_2.12-1.0.30.jar
Binary file not shown.
Binary file added tests/jars/histogrammar-sparksql_2.13-1.0.30.jar
Binary file not shown.
Binary file added tests/jars/histogrammar_2.12-1.0.30.jar
Binary file not shown.
Binary file added tests/jars/histogrammar_2.13-1.0.30.jar
Binary file not shown.
48 changes: 24 additions & 24 deletions tests/test_spark_histogrammar.py
@@ -21,9 +21,9 @@ def get_spark():

    current_path = Path(__file__).resolve().parent

-    scala = "2.12" if int(pyspark_version[0]) >= 3 else "2.11"
-    hist_spark_jar = current_path / f"jars/histogrammar-sparksql_{scala}-1.0.20.jar"
-    hist_jar = current_path / f"jars/histogrammar_{scala}-1.0.20.jar"
+    scala = "2.12" if int(pyspark_version[0]) == 3 else "2.13"
+    hist_spark_jar = current_path / f"jars/histogrammar-sparksql_{scala}-1.0.30.jar"
+    hist_jar = current_path / f"jars/histogrammar_{scala}-1.0.30.jar"

return (
SparkSession.builder.master("local")
@@ -44,16 +44,16 @@ def spark_co():
@pytest.mark.skipif(not spark_found, reason="spark not found")
@pytest.mark.filterwarnings("ignore:createDataFrame attempted Arrow optimization because")
def test_get_histograms(spark_co):
pytest.age["data"]["name"] = "'age'"
pytest.company["data"]["name"] = "'company'"
pytest.eyesColor["data"]["name"] = "'eyeColor'"
pytest.gender["data"]["name"] = "'gender'"
pytest.isActive["data"]["name"] = "'isActive'"
pytest.latitude["data"]["name"] = "'latitude'"
pytest.longitude["data"]["name"] = "'longitude'"
pytest.transaction["data"]["name"] = "'transaction'"

pytest.latitude_longitude["data"]["name"] = "'latitude:longitude'"
pytest.age["data"]["name"] = "age"
pytest.company["data"]["name"] = "company"
pytest.eyesColor["data"]["name"] = "eyeColor"
pytest.gender["data"]["name"] = "gender"
pytest.isActive["data"]["name"] = "isActive"
pytest.latitude["data"]["name"] = "latitude"
pytest.longitude["data"]["name"] = "longitude"
pytest.transaction["data"]["name"] = "transaction"

pytest.latitude_longitude["data"]["name"] = "latitude:longitude"
pytest.latitude_longitude["data"]["bins:name"] = "unit_func"

spark = spark_co
@@ -104,15 +104,15 @@
@pytest.mark.skipif(not spark_found, reason="spark not found")
@pytest.mark.filterwarnings("ignore:createDataFrame attempted Arrow optimization because")
def test_get_histograms_module(spark_co):
pytest.age["data"]["name"] = "'age'"
pytest.company["data"]["name"] = "'company'"
pytest.eyesColor["data"]["name"] = "'eyeColor'"
pytest.gender["data"]["name"] = "'gender'"
pytest.isActive["data"]["name"] = "'isActive'"
pytest.latitude["data"]["name"] = "'latitude'"
pytest.longitude["data"]["name"] = "'longitude'"

pytest.latitude_longitude["data"]["name"] = "'latitude:longitude'"
pytest.age["data"]["name"] = "age"
pytest.company["data"]["name"] = "company"
pytest.eyesColor["data"]["name"] = "eyeColor"
pytest.gender["data"]["name"] = "gender"
pytest.isActive["data"]["name"] = "isActive"
pytest.latitude["data"]["name"] = "latitude"
pytest.longitude["data"]["name"] = "longitude"

pytest.latitude_longitude["data"]["name"] = "latitude:longitude"
pytest.latitude_longitude["data"]["bins:name"] = "unit_func"

spark = spark_co
@@ -187,7 +187,7 @@ def test_get_histograms_timestamp(spark_co):
"bins": {"108": 9.0, "109": 1.0},
"bins:type": "Count",
"entries": 10.0,
"name": "'dt'",
"name": "dt",
"nanflow": 0.0,
"nanflow:type": "Count",
"origin": 1.2625632e18,
@@ -229,7 +229,7 @@ def test_get_histograms_date(spark_co):
"bins": {"108": 9.0, "109": 1.0},
"bins:type": "Count",
"entries": 10.0,
"name": "'dt'",
"name": "dt",
"nanflow": 0.0,
"nanflow:type": "Count",
"origin": 1.2625632e18,
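
The dropped quotes in these expected names follow from the [8:-2] slice change in histogrammar/util.py above; a minimal check, assuming the same "Column<'dt'>" string form:

    assert "Column<'dt'>"[8:-2] == "dt"   # the old [7:-1] slice yielded "'dt'"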