Merged
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -35,7 +35,7 @@ jobs:
      run: |
        python -m pip install --upgrade pip
        if [ "${{ matrix.numpy_version }}" = "numpy<2" ]; then
-          pip install ".[test,pandas,spark,test_numpy_pre2]"
+          pip install ".[test,pandas,test_spark_pre2,test_numpy_pre2]"
        else
          pip install ".[test,pandas,spark]"
        fi
16 changes: 8 additions & 8 deletions README.rst
@@ -17,8 +17,8 @@ more quickly via Numpy commands, rather than Python for loops.

This Python implementation of histogrammar has been tested to guarantee compatibility with its Scala implementation.

-Latest Python release: v1.1.1 (Aug 2025).
-Latest update: Aug 2025.
+Latest Python release: v1.1.2 (Sep 2025).
+Latest update: Sep 2025.

References
==========
@@ -38,19 +38,19 @@ Changes
See Changes log `here <https://github.com/histogrammar/histogrammar-python/blob/master/CHANGES.rst>`_.


-Spark 3.X
----------
+Spark
+-----

-With Spark 3.X, based on Scala 2.12 or 2.13, make sure to pick up the correct histogrammar jar files:
+With Spark, make sure to pick up the correct histogrammar jar files. Spark 4.X is based on Scala 2.13; Spark 3.X is based on Scala 2.12 or 2.13.

.. code-block:: python

spark = SparkSession.builder.config("spark.jars.packages", "io.github.histogrammar:histogrammar_2.12:1.0.30,io.github.histogrammar:histogrammar-sparksql_2.12:1.0.30").getOrCreate()
spark = SparkSession.builder.config("spark.jars.packages", "io.github.histogrammar:histogrammar_2.13:1.0.30,io.github.histogrammar:histogrammar-sparksql_2.13:1.0.30").getOrCreate()


-For Scala 2.13, in the string above simply replace "2.12" with "2.13".
+For Scala 2.12, in the string above simply replace "2.13" with "2.12".

-December, 2023
+September, 2025
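
For reference, a minimal sketch of the resulting Scala 2.12 builder line (the same call as above with only the artifact suffix swapped):

.. code-block:: python

    spark = SparkSession.builder.config("spark.jars.packages", "io.github.histogrammar:histogrammar_2.12:1.0.30,io.github.histogrammar:histogrammar-sparksql_2.12:1.0.30").getOrCreate()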


Example notebooks
2 changes: 1 addition & 1 deletion histogrammar/dfinterface/spark_histogrammar.py
@@ -225,7 +225,7 @@ def construct_empty_hist(self, df, features):
        for idx, col in enumerate(revcols):
            # histogram type depends on the data type
            dt = self.var_dtype[col]
-            quant = df[col]
+            quant = f.col(col)
            hist = self.get_hist_bin(hist, features, quant, col, dt)

        return hist
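
A hedged sketch of the distinction (the added line implies pyspark.sql.functions is available as f in this module): f.col(name) builds a free-standing column expression resolved by name at execution time, whereas df[col] returns a Column bound to that particular DataFrame.

    from pyspark.sql import functions as f

    quant = f.col("age")   # unbound column expression, resolved when the plan runs
    # quant = df["age"]    # previous form: a Column tied to this specific DataFrame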
9 changes: 5 additions & 4 deletions histogrammar/notebooks/histogrammar_tutorial_advanced.ipynb
@@ -3,6 +3,7 @@
{
"cell_type": "markdown",
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
},
@@ -118,9 +119,9 @@
"# for spark 2.X, in the jars string, for both jar files change \"_2.12\" into \"_2.11\".\n",
"\n",
"if pyspark_installed:\n",
" scala = '2.12' if int(pyspark_version[0]) >= 3 else '2.11'\n",
" hist_jar = f'io.github.histogrammar:histogrammar_{scala}:1.0.20'\n",
" hist_spark_jar = f'io.github.histogrammar:histogrammar-sparksql_{scala}:1.0.20'\n",
" scala = '2.12' if int(pyspark_version[0]) == 3 else '2.13'\n",
" hist_jar = f'io.github.histogrammar:histogrammar_{scala}:1.0.30'\n",
" hist_spark_jar = f'io.github.histogrammar:histogrammar-sparksql_{scala}:1.0.30'\n",
"\n",
" spark = SparkSession.builder.config(\n",
" \"spark.jars.packages\", f'{hist_spark_jar},{hist_jar}'\n",
@@ -521,7 +522,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
"version": "3.11.11"
},
"nteract": {
"version": "0.15.0"
2 changes: 1 addition & 1 deletion histogrammar/util.py
@@ -247,7 +247,7 @@ def __init__(self, expr, name=None):
            ok = False
        else:
            if isinstance(expr, Column) and self.name is None:
-                self.name = str(expr)[7:-1]
+                self.name = str(expr)[8:-2]
                ok = True
        if not ok:
            raise TypeError(f"quantity ({expr}) must be a string, function, or SparkSQL Column")
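
A minimal sketch of the slice change, assuming str() of a PySpark Column renders as "Column<'age'>" (the updated test expectations below match this):

    s = "Column<'age'>"   # assumed string form of f.col("age")
    s[7:-1]               # -> "'age'"  (old slice: surrounding quotes kept in the name)
    s[8:-2]               # -> "age"    (new slice: quotes stripped)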
2 changes: 1 addition & 1 deletion histogrammar/version.py
@@ -2,7 +2,7 @@

import re

version = "1.1.1"
version = "1.1.2"


def split_version_string(version_string: str) -> tuple[int, int]:
9 changes: 6 additions & 3 deletions pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "histogrammar"
description = "Composable histogram primitives for distributed data reduction"
description = "Histograms for Pandas/Spark/Numpy"
keywords = [
"pandas",
"spark",
@@ -17,7 +17,7 @@ keywords = [
]
readme = "README.rst"
requires-python = ">=3.9"
authors = [{ name = "Jim Pivarski (DIANA-HEP)", email = "pivarski@fnal.gov" }, { name = "Max Baak", email = "maxbaak@gmail.com" }]
authors = [{ name = "Max Baak", email = "maxbaak@gmail.com" }, { name = "Jim Pivarski (DIANA-HEP)", email = "pivarski@fnal.gov" }]
maintainers = [{ name = "Max Baak", email = "maxbaak@gmail.com" }]
license = { type = "Apache Software License v2", file = "LICENSE" }
dependencies = [
@@ -40,7 +40,7 @@ pandas = [
"pandas"
]
spark = [
"pyspark<4; python_version <= '3.11'",
"pyspark",
]
test = [
"ipykernel>=5.1.3",
@@ -55,6 +55,9 @@ test_numpy_pre2 = [
"numpy<2",
"pandas<2",
]
+test_spark_pre2 = [
+    "pyspark<4; python_version <= '3.11'",
+]
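
This new extra pins the pre-Spark-4 environment and is what the updated CI step above installs for the numpy<2 leg of the matrix: pip install ".[test,pandas,test_spark_pre2,test_numpy_pre2]".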

# files to be shipped with the installation, under: histogrammar/test_data and histogrammar/notebooks
# after installation, these can be found with the functions in resources.py
Binary file added tests/jars/histogrammar-sparksql_2.12-1.0.30.jar
Binary file not shown.
Binary file added tests/jars/histogrammar-sparksql_2.13-1.0.30.jar
Binary file not shown.
Binary file added tests/jars/histogrammar_2.12-1.0.30.jar
Binary file not shown.
Binary file added tests/jars/histogrammar_2.13-1.0.30.jar
Binary file not shown.
48 changes: 24 additions & 24 deletions tests/test_spark_histogrammar.py
@@ -21,9 +21,9 @@ def get_spark():

    current_path = Path(__file__).resolve().parent

-    scala = "2.12" if int(pyspark_version[0]) >= 3 else "2.11"
-    hist_spark_jar = current_path / f"jars/histogrammar-sparksql_{scala}-1.0.20.jar"
-    hist_jar = current_path / f"jars/histogrammar_{scala}-1.0.20.jar"
+    scala = "2.12" if int(pyspark_version[0]) == 3 else "2.13"
+    hist_spark_jar = current_path / f"jars/histogrammar-sparksql_{scala}-1.0.30.jar"
+    hist_jar = current_path / f"jars/histogrammar_{scala}-1.0.30.jar"

return (
SparkSession.builder.master("local")
@@ -44,16 +44,16 @@ def spark_co():
@pytest.mark.skipif(not spark_found, reason="spark not found")
@pytest.mark.filterwarnings("ignore:createDataFrame attempted Arrow optimization because")
def test_get_histograms(spark_co):
pytest.age["data"]["name"] = "'age'"
pytest.company["data"]["name"] = "'company'"
pytest.eyesColor["data"]["name"] = "'eyeColor'"
pytest.gender["data"]["name"] = "'gender'"
pytest.isActive["data"]["name"] = "'isActive'"
pytest.latitude["data"]["name"] = "'latitude'"
pytest.longitude["data"]["name"] = "'longitude'"
pytest.transaction["data"]["name"] = "'transaction'"

pytest.latitude_longitude["data"]["name"] = "'latitude:longitude'"
pytest.age["data"]["name"] = "age"
pytest.company["data"]["name"] = "company"
pytest.eyesColor["data"]["name"] = "eyeColor"
pytest.gender["data"]["name"] = "gender"
pytest.isActive["data"]["name"] = "isActive"
pytest.latitude["data"]["name"] = "latitude"
pytest.longitude["data"]["name"] = "longitude"
pytest.transaction["data"]["name"] = "transaction"

pytest.latitude_longitude["data"]["name"] = "latitude:longitude"
pytest.latitude_longitude["data"]["bins:name"] = "unit_func"

spark = spark_co
@@ -104,15 +104,15 @@
@pytest.mark.skipif(not spark_found, reason="spark not found")
@pytest.mark.filterwarnings("ignore:createDataFrame attempted Arrow optimization because")
def test_get_histograms_module(spark_co):
pytest.age["data"]["name"] = "'age'"
pytest.company["data"]["name"] = "'company'"
pytest.eyesColor["data"]["name"] = "'eyeColor'"
pytest.gender["data"]["name"] = "'gender'"
pytest.isActive["data"]["name"] = "'isActive'"
pytest.latitude["data"]["name"] = "'latitude'"
pytest.longitude["data"]["name"] = "'longitude'"

pytest.latitude_longitude["data"]["name"] = "'latitude:longitude'"
pytest.age["data"]["name"] = "age"
pytest.company["data"]["name"] = "company"
pytest.eyesColor["data"]["name"] = "eyeColor"
pytest.gender["data"]["name"] = "gender"
pytest.isActive["data"]["name"] = "isActive"
pytest.latitude["data"]["name"] = "latitude"
pytest.longitude["data"]["name"] = "longitude"

pytest.latitude_longitude["data"]["name"] = "latitude:longitude"
pytest.latitude_longitude["data"]["bins:name"] = "unit_func"

spark = spark_co
@@ -187,7 +187,7 @@ def test_get_histograms_timestamp(spark_co):
"bins": {"108": 9.0, "109": 1.0},
"bins:type": "Count",
"entries": 10.0,
"name": "'dt'",
"name": "dt",
"nanflow": 0.0,
"nanflow:type": "Count",
"origin": 1.2625632e18,
@@ -229,7 +229,7 @@ def test_get_histograms_date(spark_co):
"bins": {"108": 9.0, "109": 1.0},
"bins:type": "Count",
"entries": 10.0,
"name": "'dt'",
"name": "dt",
"nanflow": 0.0,
"nanflow:type": "Count",
"origin": 1.2625632e18,
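
The dropped quotes in these expected names follow from the [8:-2] slice change in histogrammar/util.py above; a minimal check, assuming the same "Column<'dt'>" string form:

    assert "Column<'dt'>"[8:-2] == "dt"   # the old [7:-1] slice yielded "'dt'"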