diff --git a/scripts/1-fetch/arxiv_fetch.py b/scripts/1-fetch/arxiv_fetch.py
index 93249652..3f6bd91d 100755
--- a/scripts/1-fetch/arxiv_fetch.py
+++ b/scripts/1-fetch/arxiv_fetch.py
@@ -1,22 +1,25 @@
 #!/usr/bin/env python
 """
-Fetch ArXiv papers with CC license information and generate count reports.
+Fetch ArXiv papers with CC license information using OAI-PMH API.
+
+This script uses ArXiv's OAI-PMH interface to harvest papers with structured
+license metadata, providing more accurate CC license detection than text-based
+pattern matching. Focuses on recent years where CC licensing is more common.
 """
 # Standard library
 import argparse
 import csv
 import os
-import re
 import sys
 import textwrap
 import time
 import traceback
-import urllib.parse
+import xml.etree.ElementTree as ET  # XML parsing for OAI-PMH responses
 from collections import Counter, defaultdict
+from datetime import datetime  # Date calculations for harvesting ranges
 from operator import itemgetter
 
 # Third-party
-import feedparser
 import requests
 import yaml
 from pygments import highlight
@@ -33,9 +36,13 @@
 LOGGER, PATHS = shared.setup(__file__)
 
 # Constants
-# API Configuration
-BASE_URL = "https://export.arxiv.org/api/query?"
-DEFAULT_FETCH_LIMIT = 800  # Default total papers to fetch
+# API Configuration - Updated to use OAI-PMH for structured license data
+BASE_URL = "https://oaipmh.arxiv.org/oai"
+# Implementation choice: Set to 1000 CC-licensed papers for balanced collection
+# This is NOT an ArXiv API requirement - ArXiv only requires "responsible" use
+# The 3-second delays between requests ensure compliance with OAI-PMH practices
+DEFAULT_FETCH_LIMIT = 1000  # Default total CC-licensed papers to fetch
+DEFAULT_YEARS_BACK = 5  # Default years to look back from current year
 
 # CSV Headers
 HEADER_AUTHOR_BUCKET = ["TOOL_IDENTIFIER", "AUTHOR_BUCKET", "COUNT"]
@@ -48,46 +55,24 @@
 HEADER_COUNT = ["TOOL_IDENTIFIER", "COUNT"]
 HEADER_YEAR = ["TOOL_IDENTIFIER", "YEAR", "COUNT"]
 
-# Search Queries
-SEARCH_QUERIES = [
-    'all:"creative commons"',
-    'all:"CC BY"',
-    'all:"CC-BY"',
-    'all:"CC BY-NC"',
-    'all:"CC-BY-NC"',
-    'all:"CC BY-SA"',
-    'all:"CC-BY-SA"',
-    'all:"CC BY-ND"',
-    'all:"CC-BY-ND"',
-    'all:"CC BY-NC-SA"',
-    'all:"CC-BY-NC-SA"',
-    'all:"CC BY-NC-ND"',
-    'all:"CC-BY-NC-ND"',
-    'all:"CC0"',
-    'all:"CC 0"',
-    'all:"CC-0"',
-]
-
-# Compiled regex patterns for CC license detection
-CC_PATTERNS = [
-    (re.compile(r"\bCC[-\s]?0\b", re.IGNORECASE), "CC0"),
-    (
-        re.compile(r"\bCC[-\s]?BY[-\s]?NC[-\s]?ND\b", re.IGNORECASE),
-        "CC BY-NC-ND",
-    ),
-    (
-        re.compile(r"\bCC[-\s]?BY[-\s]?NC[-\s]?SA\b", re.IGNORECASE),
-        "CC BY-NC-SA",
-    ),
-    (re.compile(r"\bCC[-\s]?BY[-\s]?ND\b", re.IGNORECASE), "CC BY-ND"),
-    (re.compile(r"\bCC[-\s]?BY[-\s]?SA\b", re.IGNORECASE), "CC BY-SA"),
-    (re.compile(r"\bCC[-\s]?BY[-\s]?NC\b", re.IGNORECASE), "CC BY-NC"),
-    (re.compile(r"\bCC[-\s]?BY\b", re.IGNORECASE), "CC BY"),
-    (
-        re.compile(r"\bCREATIVE\s+COMMONS\b", re.IGNORECASE),
-        "UNKNOWN CC legal tool",
-    ),
-]
+# License mapping for structured data from OAI-PMH
+LICENSE_MAPPING = {
+    "http://creativecommons.org/licenses/by/3.0/": "CC BY 3.0",
+    "http://creativecommons.org/licenses/by/4.0/": "CC BY 4.0",
+    "http://creativecommons.org/licenses/by-nc/3.0/": "CC BY-NC 3.0",
+    "http://creativecommons.org/licenses/by-nc/4.0/": "CC BY-NC 4.0",
+    "http://creativecommons.org/licenses/by-nc-nd/3.0/": "CC BY-NC-ND 3.0",
+    "http://creativecommons.org/licenses/by-nc-nd/4.0/": "CC BY-NC-ND 4.0",
+    "http://creativecommons.org/licenses/by-nc-sa/3.0/": "CC BY-NC-SA 3.0",
+    "http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC BY-NC-SA 4.0",
+    "http://creativecommons.org/licenses/by-nd/3.0/": "CC BY-ND 3.0",
+    "http://creativecommons.org/licenses/by-nd/4.0/": "CC BY-ND 4.0",
+    "http://creativecommons.org/licenses/by-sa/3.0/": "CC BY-SA 3.0",
+    "http://creativecommons.org/licenses/by-sa/4.0/": "CC BY-SA 4.0",
+    "http://creativecommons.org/licenses/publicdomain": "CC CERTIFICATION 1.0 US",
+    "http://creativecommons.org/publicdomain/zero/1.0/": "CC0 1.0",
+    "http://creativecommons.org/share-your-work/public-domain/cc0/": "CC0",
+}
 
 # ArXiv Categories - manually curated from ArXiv official taxonomy
 # Source: https://arxiv.org/category_taxonomy
@@ -273,9 +258,9 @@ def parse_arguments():
     """Parse command-line options, returns parsed argument namespace.
 
-    Note: The --limit parameter sets the total number of papers to fetch
-    across all search queries, not per query. ArXiv API recommends
-    maximum of 30000 results per session for optimal performance.
+    Note: The --limit parameter sets the total number of papers to fetch.
+    The --years-back parameter limits harvesting to recent years where
+    CC licensing is more common.
     """
     LOGGER.info("Parsing command-line options")
     parser = argparse.ArgumentParser(description=__doc__)
@@ -284,12 +269,19 @@ def parse_arguments():
         type=int,
         default=DEFAULT_FETCH_LIMIT,
         help=(
-            f"Total limit of papers to fetch across all search queries "
-            f"(default: {DEFAULT_FETCH_LIMIT}). Maximum recommended: 30000. "
-            f"Note: Individual queries limited to 500 results "
-            f"(implementation choice). "
-            f"See ArXiv API documentation: "
-            f"https://info.arxiv.org/help/api/user-manual.html"
+            f"Total limit of papers to fetch "
+            f"(default: {DEFAULT_FETCH_LIMIT}). "
+            f"Note: Uses OAI-PMH API for structured license data."
         ),
     )
+    parser.add_argument(
+        "--years-back",
+        type=int,
+        default=DEFAULT_YEARS_BACK,
+        help=(
+            f"Number of years back from current year to harvest "
+            f"(default: {DEFAULT_YEARS_BACK}). "
+            f"Reduces dataset size and focuses on recent CC-licensed papers."
+        ),
+    )
     parser.add_argument(
@@ -333,99 +325,101 @@ def initialize_all_data_files(args):
     initialize_data_file(FILE_ARXIV_AUTHOR_BUCKET, HEADER_AUTHOR_BUCKET)
 
 
-def normalize_license_text(raw_text):
+def extract_license_from_xml(record_xml):
     """
-    Convert raw license text to standardized CC license identifiers.
-
-    Uses regex patterns to identify CC licenses from paper text.
-    Returns specific license (e.g., "CC BY", "CC0") or "Unknown".
+    Extract CC license information from OAI-PMH XML record.
+    Returns normalized license identifier or specific error indicator.
     """
-    if not raw_text:
-        return "Unknown"
+    try:
+        root = ET.fromstring(record_xml)
 
-    for pattern, license_type in CC_PATTERNS:
-        if pattern.search(raw_text):
-            return license_type
+        # Find license element in arXiv namespace
+        license_element = root.find(".//{http://arxiv.org/OAI/arXiv/}license")
 
-    return "Unknown"
+        if license_element is not None and license_element.text:
+            license_url = license_element.text.strip()
 
+            # Check exact mapping first
+            if license_url in LICENSE_MAPPING:
+                return LICENSE_MAPPING[license_url]
 
-def extract_license_info(entry):
-    """
-    Extract CC license information from ArXiv paper entry.
+            # Validate CC URLs more strictly
+            if "creativecommons.org/licenses/" in license_url.lower():
+                return f"CC (unmapped): {license_url}"
+            elif "creativecommons.org" in license_url.lower():
+                return f"CC (ambiguous): {license_url}"
 
-    Checks rights field first, then summary field for license patterns.
-    Returns normalized license identifier or "Unknown".
-    """
-    # checking through the rights field first then summary
-    if hasattr(entry, "rights") and entry.rights:
-        license_info = normalize_license_text(entry.rights)
-        if license_info != "Unknown":
-            return license_info
-    if hasattr(entry, "summary") and entry.summary:
-        license_info = normalize_license_text(entry.summary)
-        if license_info != "Unknown":
-            return license_info
-    return "Unknown"
-
-
-def extract_category_from_entry(entry):
-    """Extract primary category from ArXiv entry."""
-    if (
-        hasattr(entry, "arxiv_primary_category")
-        and entry.arxiv_primary_category
-    ):
-        return entry.arxiv_primary_category.get("term", "Unknown")
-    if hasattr(entry, "tags") and entry.tags:
-        # Get first category from tags
-        for tag in entry.tags:
-            if hasattr(tag, "term"):
-                return tag.term
-    return "Unknown"
-
-
-def extract_year_from_entry(entry):
-    """Extract publication year from ArXiv entry."""
-    if hasattr(entry, "published") and entry.published:
-        try:
-            return entry.published[:4]  # Extract year from date string
-        except (AttributeError, IndexError) as e:
-            LOGGER.debug(
-                f"Failed to extract year from '{entry.published}': {e}"
-            )
-    return "Unknown"
+            return "Non-CC"
 
+        return "No license field"
 
-def extract_author_count_from_entry(entry):
-    """Extract number of authors from ArXiv entry."""
-    if hasattr(entry, "authors") and entry.authors:
-        try:
-            return len(entry.authors)
-        except Exception as e:
-            LOGGER.debug(f"Failed to count authors from entry.authors: {e}")
-    if hasattr(entry, "author") and entry.author:
-        return 1
-    return "Unknown"
+    except ET.ParseError as e:
+        LOGGER.error(f"XML parsing failed: {e}")
+        return "XML parse error"
+    except Exception as e:
+        LOGGER.error(f"License extraction failed: {e}")
+        return "Extraction error"
 
 
-def bucket_author_count(n):
+def extract_metadata_from_xml(record_xml):
     """
-    Convert author count to predefined buckets for analysis.
+    Extract paper metadata from OAI-PMH XML record.
 
-    Buckets: "1", "2", "3", "4", "5+", "Unknown"
-    Reduces granularity for better statistical analysis.
+    Returns dict with category, year, author_count, and license info.
     """
-    if n == 1:
-        return "1"
-    if n == 2:
-        return "2"
-    if n == 3:
-        return "3"
-    if n == 4:
-        return "4"
-    if n >= 5:
-        return "5+"
-    return "Unknown"
+    try:
+        root = ET.fromstring(record_xml)
+
+        # Extract category (primary category from categories field)
+        categories_elem = root.find(
+            ".//{http://arxiv.org/OAI/arXiv/}categories"
+        )
+        category = "Unknown"
+        if categories_elem is not None and categories_elem.text:
+            # Take first category as primary
+            category = categories_elem.text.strip().split()[0]
+
+        # Extract year from created date
+        created_elem = root.find(".//{http://arxiv.org/OAI/arXiv/}created")
+        year = "Unknown"
+        if created_elem is not None and created_elem.text:
+            try:
+                year = created_elem.text.strip()[:4]  # Extract year
+            except (AttributeError, IndexError) as e:
+                LOGGER.warning(
+                    f"Failed to extract year from '{created_elem.text}': {e}"
+                )
+                year = "Unknown"
+
+        # Extract author count
+        authors = root.findall(".//{http://arxiv.org/OAI/arXiv/}author")
+        author_count = len(authors) if authors else 0
+
+        # Extract license
+        license_info = extract_license_from_xml(record_xml)
+
+        return {
+            "category": category,
+            "year": year,
+            "author_count": author_count,
+            "license": license_info,
+        }
+
+    except Exception as e:
+        LOGGER.error(f"Metadata extraction error: {e}")
+        return {
+            "category": "Unknown",
+            "year": "Unknown",
+            "author_count": 0,
+            "license": "Unknown",
+        }
+
+
+def bucket_author_count(author_count):
+    """Convert author count to buckets: "1", "2", "3", "4", "5+", "Unknown"."""
+    if author_count < 1:
+        # No parsed author elements: report "Unknown" rather than a "0" bucket
+        return "Unknown"
+    if author_count <= 4:
+        return str(author_count)
+    return "5+"
 
 
 def save_count_data(
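A hedged sketch of how the namespace-qualified `find()` calls above behave; the record fragment is hand-written for illustration and is not a verbatim ArXiv response:

```python
import xml.etree.ElementTree as ET

ARXIV_NS = "{http://arxiv.org/OAI/arXiv/}"

# Illustrative fragment only; real OAI-PMH records carry more fields.
record_xml = (
    '<arXiv xmlns="http://arxiv.org/OAI/arXiv/">'
    "<created>2023-05-17</created>"
    "<categories>cs.LG stat.ML</categories>"
    "<license>http://creativecommons.org/licenses/by/4.0/</license>"
    "</arXiv>"
)

root = ET.fromstring(record_xml)
# ElementTree matches namespaced elements via {namespace}tag paths.
license_element = root.find(f".//{ARXIV_NS}license")
print(license_element.text)  # http://creativecommons.org/licenses/by/4.0/
print(root.find(f".//{ARXIV_NS}categories").text.split()[0])  # cs.LG
```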
""" - if n == 1: - return "1" - if n == 2: - return "2" - if n == 3: - return "3" - if n == 4: - return "4" - if n >= 5: - return "5+" - return "Unknown" + try: + root = ET.fromstring(record_xml) + + # Extract category (primary category from categories field) + categories_elem = root.find( + ".//{http://arxiv.org/OAI/arXiv/}categories" + ) + category = "Unknown" + if categories_elem is not None and categories_elem.text: + # Take first category as primary + category = categories_elem.text.strip().split()[0] + + # Extract year from created date + created_elem = root.find(".//{http://arxiv.org/OAI/arXiv/}created") + year = "Unknown" + if created_elem is not None and created_elem.text: + try: + year = created_elem.text.strip()[:4] # Extract year + except (AttributeError, IndexError) as e: + LOGGER.warning( + f"Failed to extract year from '{created_elem.text}': {e}" + ) + year = "Unknown" + + # Extract author count + authors = root.findall(".//{http://arxiv.org/OAI/arXiv/}author") + author_count = len(authors) if authors else 0 + + # Extract license + license_info = extract_license_from_xml(record_xml) + + return { + "category": category, + "year": year, + "author_count": author_count, + "license": license_info, + } + + except Exception as e: + LOGGER.error(f"Metadata extraction error: {e}") + return { + "category": "Unknown", + "year": "Unknown", + "author_count": 0, + "license": "Unknown", + } + + +def bucket_author_count(author_count): + """Convert author count to predefined buckets: "1", "2", "3", "4", "5+".""" + if author_count <= 4: + return str(author_count) + return "5+" def save_count_data( @@ -438,38 +432,42 @@ def save_count_data( # license_counts: {license: count} # category_counts: {license: {category_code: count}} # year_counts: {license: {year: count}} - # author_counts: {license: {author_count(int|None): count}} + # author_counts: {license: {author_count: count}} # Save license counts data = [] - for lic, c in license_counts.items(): - data.append({"TOOL_IDENTIFIER": lic, "COUNT": c}) + for license_name, count in license_counts.items(): + data.append({"TOOL_IDENTIFIER": license_name, "COUNT": count}) data.sort(key=itemgetter("TOOL_IDENTIFIER")) - with open(FILE_ARXIV_COUNT, "w", encoding="utf-8", newline="\n") as fh: - writer = csv.DictWriter(fh, fieldnames=HEADER_COUNT, dialect="unix") + with open( + FILE_ARXIV_COUNT, "w", encoding="utf-8", newline="\n" + ) as file_handle: + writer = csv.DictWriter( + file_handle, fieldnames=HEADER_COUNT, dialect="unix" + ) writer.writeheader() for row in data: writer.writerow(row) # Save category report with labels data = [] - for lic, cats in category_counts.items(): - for code, c in cats.items(): + for license_name, categories in category_counts.items(): + for code, count in categories.items(): label = CATEGORIES.get(code, code) data.append( { - "TOOL_IDENTIFIER": lic, + "TOOL_IDENTIFIER": license_name, "CATEGORY_CODE": code, "CATEGORY_LABEL": label, - "COUNT": c, + "COUNT": count, } ) data.sort(key=itemgetter("TOOL_IDENTIFIER", "CATEGORY_CODE")) with open( FILE_ARXIV_CATEGORY_REPORT, "w", encoding="utf-8", newline="\n" - ) as fh: + ) as file_handle: writer = csv.DictWriter( - fh, fieldnames=HEADER_CATEGORY_REPORT, dialect="unix" + file_handle, fieldnames=HEADER_CATEGORY_REPORT, dialect="unix" ) writer.writeheader() for row in data: @@ -477,34 +475,44 @@ def save_count_data( # Save year counts data = [] - for lic, years in year_counts.items(): - for year, c in years.items(): - data.append({"TOOL_IDENTIFIER": lic, "YEAR": year, "COUNT": c}) + 
for license_name, years in year_counts.items(): + for year, count in years.items(): + data.append( + {"TOOL_IDENTIFIER": license_name, "YEAR": year, "COUNT": count} + ) data.sort(key=itemgetter("TOOL_IDENTIFIER", "YEAR")) - with open(FILE_ARXIV_YEAR, "w", encoding="utf-8", newline="\n") as fh: - writer = csv.DictWriter(fh, fieldnames=HEADER_YEAR, dialect="unix") + with open( + FILE_ARXIV_YEAR, "w", encoding="utf-8", newline="\n" + ) as file_handle: + writer = csv.DictWriter( + file_handle, fieldnames=HEADER_YEAR, dialect="unix" + ) writer.writeheader() for row in data: writer.writerow(row) # Save author buckets summary data = [] - for lic, acs in author_counts.items(): + for license_name, author_count_data in author_counts.items(): # build buckets across licenses bucket_counts = Counter() - for ac, c in acs.items(): - b = bucket_author_count(ac) - bucket_counts[b] += c - for b, c in bucket_counts.items(): + for author_count, count in author_count_data.items(): + bucket = bucket_author_count(author_count) + bucket_counts[bucket] += count + for bucket, count in bucket_counts.items(): data.append( - {"TOOL_IDENTIFIER": lic, "AUTHOR_BUCKET": b, "COUNT": c} + { + "TOOL_IDENTIFIER": license_name, + "AUTHOR_BUCKET": bucket, + "COUNT": count, + } ) data.sort(key=itemgetter("TOOL_IDENTIFIER", "AUTHOR_BUCKET")) with open( FILE_ARXIV_AUTHOR_BUCKET, "w", encoding="utf-8", newline="\n" - ) as fh: + ) as file_handle: writer = csv.DictWriter( - fh, fieldnames=HEADER_AUTHOR_BUCKET, dialect="unix" + file_handle, fieldnames=HEADER_AUTHOR_BUCKET, dialect="unix" ) writer.writeheader() for row in data: @@ -513,16 +521,24 @@ def save_count_data( def query_arxiv(args): """ - Main function to query ArXiv API and collect CC license data. + Main function to query ArXiv OAI-PMH API and collect CC license data. + Uses structured license metadata from OAI-PMH instead of text search. + Harvests papers from recent years to focus on CC-licensed content. 
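All four reports use the same writing convention shown above; a minimal standalone sketch (the output file name is illustrative):

```python
import csv

HEADER_COUNT = ["TOOL_IDENTIFIER", "COUNT"]
rows = [
    {"TOOL_IDENTIFIER": "CC BY 4.0", "COUNT": 42},
    {"TOOL_IDENTIFIER": "CC0 1.0", "COUNT": 7},
]

# The "unix" dialect quotes every field and uses "\n" line endings,
# keeping the CSV output byte-stable across platforms.
with open("example_count.csv", "w", encoding="utf-8", newline="\n") as fh:
    writer = csv.DictWriter(fh, fieldnames=HEADER_COUNT, dialect="unix")
    writer.writeheader()
    writer.writerows(rows)
```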
""" - LOGGER.info("Beginning to fetch results from ArXiv API") + LOGGER.info("Beginning to fetch results from ArXiv OAI-PMH API") session = shared.get_session() - results_per_iteration = 50 + # Calculate date range for harvesting + current_year = datetime.now().year + start_year = current_year - args.years_back + from_date = f"{start_year}-01-01" - search_queries = SEARCH_QUERIES + LOGGER.info( + f"Harvesting papers from {from_date} onwards " + f"({args.years_back} years back)" + ) # Data structures for counting license_counts = defaultdict(int) @@ -531,81 +547,107 @@ def query_arxiv(args): author_counts = defaultdict(lambda: defaultdict(int)) total_fetched = 0 + resumption_token = None - for search_query in search_queries: - if total_fetched >= args.limit: - break - - LOGGER.info(f"Searching for: {search_query}") - papers_found_for_query = 0 - - for start in range( - 0, - min(args.limit - total_fetched, 500), - results_per_iteration, - ): - encoded_query = urllib.parse.quote_plus(search_query) - query = ( - f"search_query={encoded_query}&start={start}" - f"&max_results={results_per_iteration}" - ) + while total_fetched < args.limit: + try: + # Build OAI-PMH request URL + if resumption_token: + # Continue with resumption token + query_params = { + "verb": "ListRecords", + "resumptionToken": resumption_token, + } + else: + # Initial request with date range + query_params = { + "verb": "ListRecords", + "metadataPrefix": "arXiv", + "from": from_date, + } - papers_found_in_batch = 0 + # Make API request + LOGGER.info(f"Fetching batch starting from record {total_fetched}") + response = session.get(BASE_URL, params=query_params, timeout=60) + response.raise_for_status() - try: - LOGGER.info( - f"Fetching results {start} - " - f"{start + results_per_iteration}" + # Parse XML response + root = ET.fromstring(response.content) + + # Check for errors + error_element = root.find( + ".//{http://www.openarchives.org/OAI/2.0/}error" + ) + if error_element is not None: + raise shared.QuantifyingException( + f"OAI-PMH Error: {error_element.text}", 1 ) - response = session.get(BASE_URL + query, timeout=30) - response.raise_for_status() - feed = feedparser.parse(response.content) - for entry in feed.entries: - if total_fetched >= args.limit: - break + # Process records + records = root.findall( + ".//{http://www.openarchives.org/OAI/2.0/}record" + ) + batch_cc_count = 0 - license_info = extract_license_info(entry) + for record in records: + if total_fetched >= args.limit: + break - if license_info != "Unknown": + # Convert record to string for metadata extraction + record_xml = ET.tostring(record, encoding="unicode") + metadata = extract_metadata_from_xml(record_xml) - category = extract_category_from_entry(entry) - year = extract_year_from_entry(entry) - author_count = extract_author_count_from_entry(entry) + # Only process CC-licensed papers + if ( + metadata["license"] != "Unknown" + and metadata["license"].startswith("CC") + ): + license_info = metadata["license"] + category = metadata["category"] + year = metadata["year"] + author_count = metadata["author_count"] - # Count by license - license_counts[license_info] += 1 + # Count by license + license_counts[license_info] += 1 - # Count by category and license - category_counts[license_info][category] += 1 + # Count by category and license + category_counts[license_info][category] += 1 - # Count by year and license - year_counts[license_info][year] += 1 + # Count by year and license + year_counts[license_info][year] += 1 - # Count by author count and 
license - author_counts[license_info][author_count] += 1 + # Count by author count and license + author_counts[license_info][author_count] += 1 - total_fetched += 1 - papers_found_in_batch += 1 - papers_found_for_query += 1 + total_fetched += 1 + batch_cc_count += 1 - # arXiv recommends a 3-seconds delay between consecutive - # api calls for efficiency - time.sleep(3) - except requests.HTTPError as e: - raise shared.QuantifyingException(f"HTTP Error: {e}", 1) - except requests.RequestException as e: - raise shared.QuantifyingException(f"Request Exception: {e}", 1) - except KeyError as e: - raise shared.QuantifyingException(f"KeyError: {e}", 1) + LOGGER.info( + f"Batch completed: {batch_cc_count} CC-licensed papers found" + ) - if papers_found_in_batch == 0: + # Check for resumption token + resumption_element = root.find( + ".//{http://www.openarchives.org/OAI/2.0/}resumptionToken" + ) + if resumption_element is not None and resumption_element.text: + resumption_token = resumption_element.text + LOGGER.info("Continuing with resumption token...") + else: + LOGGER.info("No more records available") break - LOGGER.info( - f"Query '{search_query}' completed: " - f"{papers_found_for_query} papers found" - ) + # OAI-PMH recommends delays between requests + time.sleep(3) + + except requests.HTTPError as e: + raise shared.QuantifyingException(f"HTTP Error: {e}", 1) + except requests.RequestException as e: + raise shared.QuantifyingException(f"Request Exception: {e}", 1) + except ET.ParseError as e: + raise shared.QuantifyingException(f"XML Parse Error: {e}", 1) + except Exception as e: + raise shared.QuantifyingException(f"Unexpected error: {e}", 1) # Save results if args.enable_save: @@ -613,23 +655,37 @@ def query_arxiv(args): license_counts, category_counts, year_counts, author_counts ) - # save provenance + # Save provenance provenance_data = { "total_fetched": total_fetched, - "queries": search_queries, + "from_date": from_date, + "years_back": args.years_back, "limit": args.limit, "quarter": QUARTER, "script": os.path.basename(__file__), + "api_endpoint": BASE_URL, + "method": "OAI-PMH structured license harvesting", } - # write provenance YAML for auditing + # Write provenance YAML for auditing try: - with open(FILE_PROVENANCE, "w", encoding="utf-8", newline="\n") as fh: - yaml.dump(provenance_data, fh, default_flow_style=False, indent=2) + with open( + FILE_PROVENANCE, "w", encoding="utf-8", newline="\n" + ) as file_handle: + yaml.dump( + provenance_data, + file_handle, + default_flow_style=False, + indent=2, + ) except Exception as e: - LOGGER.warning("Failed to write provenance file: %s", e) + LOGGER.error(f"Failed to write provenance file: {e}") + raise shared.QuantifyingException( + f"Provenance file write failed: {e}", 1 + ) - LOGGER.info(f"Total CC licensed papers fetched: {total_fetched}") + LOGGER.info(f"Total papers with CC licenses fetched: {total_fetched}") + LOGGER.info(f"License distribution: {dict(license_counts)}") def main(): diff --git a/sources.md b/sources.md index 2f559bef..d5598a0f 100644 --- a/sources.md +++ b/sources.md @@ -6,21 +6,28 @@ public domain. Below are the sources and their respective information: ## arXiv -**Description:** arXiv is a free distribution service and an open-access archive for scholarly articles in physics, mathematics, computer science, quantitative biology, quantitative finance, statistics, electrical engineering and systems science, and economics. All arXiv articles are available under various open licenses or are in the public domain. 
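Stripped of counting and error handling, the harvesting protocol implemented above reduces to the following hedged sketch of the OAI-PMH `ListRecords`/`resumptionToken` flow (assuming plain `requests` rather than the project's `shared.get_session()`):

```python
import time
import xml.etree.ElementTree as ET

import requests

OAI_NS = "{http://www.openarchives.org/OAI/2.0/}"
BASE_URL = "https://oaipmh.arxiv.org/oai"

params = {
    "verb": "ListRecords",
    "metadataPrefix": "arXiv",
    "from": "2020-01-01",  # lower bound; mirrors the --years-back window
}
while True:
    response = requests.get(BASE_URL, params=params, timeout=60)
    response.raise_for_status()
    root = ET.fromstring(response.content)
    records = root.findall(f".//{OAI_NS}record")
    print(f"fetched a batch of {len(records)} records")
    token = root.find(f".//{OAI_NS}resumptionToken")
    if token is None or not token.text:
        break  # a missing or empty token means the harvest is complete
    # Subsequent requests carry only the verb and the token.
    params = {"verb": "ListRecords", "resumptionToken": token.text}
    time.sleep(3)  # polite delay between requests
```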
diff --git a/sources.md b/sources.md
index 2f559bef..d5598a0f 100644
--- a/sources.md
+++ b/sources.md
@@ -6,21 +6,28 @@
 public domain. Below are the sources and their respective information:
 
 ## arXiv
 
-**Description:** arXiv is a free distribution service and an open-access archive for scholarly articles in physics, mathematics, computer science, quantitative biology, quantitative finance, statistics, electrical engineering and systems science, and economics. All arXiv articles are available under various open licenses or are in the public domain.
+**Description:** arXiv is a free distribution service and an open-access
+archive for scholarly articles in physics, mathematics, computer science,
+quantitative biology, quantitative finance, statistics, electrical engineering
+and systems science, and economics. All arXiv articles are available under
+various open licenses or are in the public domain.
 
 **API documentation link:**
-- [arXiv API User Manual](https://arxiv.org/help/api/user-manual)
-- [arXiv API Reference](https://arxiv.org/help/api)
-- [Base URL](http://export.arxiv.org/api/query)
+- [arXiv OAI-PMH Interface](https://info.arxiv.org/help/oa/index.html)
+- [Base URL (OAI-PMH)](https://oaipmh.arxiv.org/oai)
 - [arXiv Subject Classifications](https://arxiv.org/category_taxonomy)
 - [Terms of Use for arXiv APIs](https://info.arxiv.org/help/api/tou.html)
 
 **API information:**
 - No API key required
 - Query limit: No official limit, but requests should be made responsibly
-- Data available through Atom XML format
-- Supports search by fields: title (ti), author (au), abstract (abs), comment (co), journal reference (jr), subject category (cat), report number (rn), id, all (searches all fields), and submittedDate (date filter)
-- Metadata includes licensing information for each paper
+- **Data format**: OAI-PMH XML format with structured metadata fields
+- **OAI-PMH Interface** (used by `arxiv_fetch.py`):
+  - Structured metadata harvesting with resumption tokens
+  - License information extracted from the `{http://arxiv.org/OAI/arXiv/}license` XML field
+  - Recommended 3-second delays between requests
+  - Supports date-based filtering for bulk harvesting
+- Metadata includes comprehensive licensing information for each paper
 
 ## CC Legal Tools
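The structured license field described above can also be inspected for a single paper via the standard OAI-PMH `GetRecord` verb; a hedged sketch (the arXiv identifier below is an arbitrary example):

```python
import xml.etree.ElementTree as ET

import requests

response = requests.get(
    "https://oaipmh.arxiv.org/oai",
    params={
        "verb": "GetRecord",
        "identifier": "oai:arXiv.org:2101.00001",  # example identifier
        "metadataPrefix": "arXiv",
    },
    timeout=60,
)
response.raise_for_status()
root = ET.fromstring(response.content)
license_element = root.find(".//{http://arxiv.org/OAI/arXiv/}license")
if license_element is not None:
    print(license_element.text)
else:
    print("no license field")
```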