From 0399c24c9fcce998bbce0b9d9bebe25a37f7213c Mon Sep 17 00:00:00 2001 From: opsmithe Date: Mon, 24 Nov 2025 10:19:02 +0100 Subject: [PATCH 1/2] Add safe file handling with QuantifyingException for process and report scripts --- scripts/2-process/gcs_process.py | 3 +++ scripts/2-process/github_process.py | 1 + scripts/3-report/gcs_report.py | 9 +++++++++ scripts/3-report/github_report.py | 2 ++ scripts/shared.py | 11 +++++++++++ 5 files changed, 26 insertions(+) diff --git a/scripts/2-process/gcs_process.py b/scripts/2-process/gcs_process.py index c5d354b7..4d74cb23 100755 --- a/scripts/2-process/gcs_process.py +++ b/scripts/2-process/gcs_process.py @@ -311,6 +311,7 @@ def main(): # Count data file1_count = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv") + shared.safe_open_file(file1_count, "process GCS count data") count_data = pd.read_csv(file1_count, usecols=["TOOL_IDENTIFIER", "COUNT"]) process_product_totals(args, count_data) process_latest_prior_retired_totals(args, count_data) @@ -321,6 +322,7 @@ def main(): file2_language = shared.path_join( PATHS["data_1-fetch"], "gcs_2_count_by_language.csv" ) + shared.safe_open_file(file2_language, "process GCS language data") language_data = pd.read_csv( file2_language, usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"] ) @@ -330,6 +332,7 @@ def main(): file3_country = shared.path_join( PATHS["data_1-fetch"], "gcs_3_count_by_country.csv" ) + shared.safe_open_file(file3_country, "process GCS country data") country_data = pd.read_csv( file3_country, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"] ) diff --git a/scripts/2-process/github_process.py b/scripts/2-process/github_process.py index ae9d261a..76743dd0 100755 --- a/scripts/2-process/github_process.py +++ b/scripts/2-process/github_process.py @@ -178,6 +178,7 @@ def main(): shared.git_fetch_and_merge(args, PATHS["repo"]) file_count = shared.path_join(PATHS["data_1-fetch"], "github_1_count.csv") + shared.safe_open_file(file_count, "process GitHub count data") count_data = pd.read_csv(file_count, usecols=["TOOL_IDENTIFIER", "COUNT"]) process_totals_by_license(args, count_data) process_totals_by_restriction(args, count_data) diff --git a/scripts/3-report/gcs_report.py b/scripts/3-report/gcs_report.py index 105313fa..80b0aa35 100755 --- a/scripts/3-report/gcs_report.py +++ b/scripts/3-report/gcs_report.py @@ -79,6 +79,7 @@ def gcs_intro(args): "gcs_product_totals.csv", ) LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + shared.safe_open_file(file_path, "generate GCS intro") name_label = "CC legal tool product" data = pd.read_csv(file_path, index_col=name_label) total_count = f"{data['Count'].sum():,d}" @@ -110,6 +111,7 @@ def plot_products(args): PATHS["data_2-process"], "gcs_product_totals.csv" ) LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + shared.safe_open_file(file_path, "generate GCS products report") name_label = "CC legal tool product" data = pd.read_csv(file_path, index_col=name_label) data = data[::-1] # reverse order @@ -155,6 +157,7 @@ def plot_tool_status(args): "gcs_status_combined_totals.csv", ) LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + shared.safe_open_file(file_path, "generate GCS tool status report") name_label = "CC legal tool" data = pd.read_csv(file_path, index_col=name_label) data.sort_values(name_label, ascending=False, inplace=True) @@ -198,6 +201,7 @@ def plot_latest_tools(args): "gcs_status_latest_totals.csv", ) LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + shared.safe_open_file(file_path, "generate GCS latest tools report") name_label = "CC legal tool" data = pd.read_csv(file_path, index_col=name_label) data.sort_values(name_label, ascending=False, inplace=True) @@ -240,6 +244,7 @@ def plot_prior_tools(args): PATHS["data_2-process"], "gcs_status_prior_totals.csv" ) LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + shared.safe_open_file(file_path, "generate GCS prior tools report") name_label = "CC legal tool" data = pd.read_csv(file_path, index_col=name_label) data.sort_values(name_label, ascending=False, inplace=True) @@ -285,6 +290,7 @@ def plot_retired_tools(args): "gcs_status_retired_totals.csv", ) LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + shared.safe_open_file(file_path, "generate GCS retired tools report") name_label = "CC legal tool" data = pd.read_csv(file_path, index_col=name_label) data.sort_values(name_label, ascending=False, inplace=True) @@ -330,6 +336,7 @@ def plot_countries_highest_usage(args): PATHS["data_2-process"], "gcs_totals_by_country.csv" ) LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + shared.safe_open_file(file_path, "generate GCS countries report") name_label = "Country" data_label = "Count" data = pd.read_csv(file_path, index_col=name_label) @@ -383,6 +390,7 @@ def plot_languages_highest_usage(args): PATHS["data_2-process"], "gcs_totals_by_language.csv" ) LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + shared.safe_open_file(file_path, "generate GCS languages report") name_label = "Language" data_label = "Count" data = pd.read_csv(file_path, index_col=name_label) @@ -437,6 +445,7 @@ def plot_free_culture(args): "gcs_totals_by_free_cultural.csv", ) LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + shared.safe_open_file(file_path, "generate GCS free culture report") name_label = "Category" data_label = "Count" data = pd.read_csv(file_path, index_col=name_label) diff --git a/scripts/3-report/github_report.py b/scripts/3-report/github_report.py index 7de0189c..8360603a 100755 --- a/scripts/3-report/github_report.py +++ b/scripts/3-report/github_report.py @@ -150,6 +150,7 @@ def plot_totals_by_license_type(args): "github_totals_by_license.csv", ) LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + shared.safe_open_file(file_path, "generate GitHub license report") name_label = "License" data_label = "Count" data = pd.read_csv(file_path, index_col=name_label) @@ -199,6 +200,7 @@ def plot_totals_by_restriction(args): "github_totals_by_restriction.csv", ) LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}") + shared.safe_open_file(file_path, "generate GitHub restriction report") name_label = "Category" data_label = "Count" data = pd.read_csv(file_path, index_col=name_label) diff --git a/scripts/shared.py b/scripts/shared.py index 541988fc..3db8947f 100644 --- a/scripts/shared.py +++ b/scripts/shared.py @@ -236,6 +236,17 @@ def setup(current_file): return logger, paths +def safe_open_file(file_path, operation="read"): + """ + Check file exists, raise QuantifyingException with helpful message if not. + """ + if not os.path.exists(file_path): + raise QuantifyingException( + f"Cannot {operation} file: {file_path} does not exist" + ) + return file_path + + def update_readme( args, section_title, From 5f11dc1dac4685f094706c00da5e8f8cb260d334 Mon Sep 17 00:00:00 2001 From: opsmithe Date: Mon, 24 Nov 2025 10:20:18 +0100 Subject: [PATCH 2/2] Make shared.py executable --- scripts/shared.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 scripts/shared.py diff --git a/scripts/shared.py b/scripts/shared.py old mode 100644 new mode 100755