From 7daeeb65f8f748e25ee387b8f16f66e5117e7844 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Thu, 28 May 2026 19:43:22 -0700 Subject: [PATCH 01/15] Bump major version to match releases Signed-off-by: Jono Yang --- CHANGELOG.rst | 2 +- packagedb/tests/testfiles/sbom/package-sbom-expected.json | 2 +- purldb_project/__init__.py | 2 +- setup.cfg | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index eb913ecb..310cae44 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,7 +1,7 @@ Changelog ========= -v6.1.0 +v7.1.0 --------- - Add SBOM API action on packages (`/api/packages//sbom/`) diff --git a/packagedb/tests/testfiles/sbom/package-sbom-expected.json b/packagedb/tests/testfiles/sbom/package-sbom-expected.json index dfdd35c7..9d018e39 100644 --- a/packagedb/tests/testfiles/sbom/package-sbom-expected.json +++ b/packagedb/tests/testfiles/sbom/package-sbom-expected.json @@ -23,7 +23,7 @@ "tools":[ { "name":"PurlDB", - "version":"6.1.0" + "version":"7.1.0" } ] }, diff --git a/purldb_project/__init__.py b/purldb_project/__init__.py index 7a6a79fb..7be6bfe2 100644 --- a/purldb_project/__init__.py +++ b/purldb_project/__init__.py @@ -10,7 +10,7 @@ import os import sys -__version__ = "6.1.0" +__version__ = "7.1.0" def command_line(): diff --git a/setup.cfg b/setup.cfg index ac9c8b8c..827ff26c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = purldb -version = 6.1.0 +version = 7.1.0 license_files = LICENSE AUTHORS.rst From 502364b3a23ced8cde3fa0bbbcc5a3d32648eb1f Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Fri, 29 May 2026 12:11:38 -0700 Subject: [PATCH 02/15] Create command to federate Packages as SBOMs Signed-off-by: Jono Yang --- .../management/commands/federate_packages.py | 4 +- .../management/commands/federate_sboms.py | 122 ++++++++++++++++++ packagedb/sbom.py | 6 +- 3 files changed, 125 insertions(+), 7 deletions(-) create mode 100644 minecode/management/commands/federate_sboms.py diff --git 
a/minecode/management/commands/federate_packages.py b/minecode/management/commands/federate_packages.py index 6790acdb..f9567f50 100644 --- a/minecode/management/commands/federate_packages.py +++ b/minecode/management/commands/federate_packages.py @@ -18,9 +18,7 @@ from minecode_pipelines import pipes from packagedb import models as packagedb_models -""" -Utility command to find license oddities. -""" + logger = logging.getLogger(__name__) logging.basicConfig(stream=sys.stdout) logger.setLevel(logging.INFO) diff --git a/minecode/management/commands/federate_sboms.py b/minecode/management/commands/federate_sboms.py new file mode 100644 index 00000000..3cb18916 --- /dev/null +++ b/minecode/management/commands/federate_sboms.py @@ -0,0 +1,122 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout)
logger.setLevel(logging.INFO)

TRACE = False
if TRACE:
    logger.setLevel(logging.DEBUG)


# Number of SBOM files accumulated for a single repository before they are
# committed and pushed.
PACKAGE_BATCH_SIZE = 1000


def commit_message(commit_batch, total_commit_batch="many"):
    """
    Return the git commit message used when saving a batch of SBOMs.

    `commit_batch` is the number of the batch being committed and
    `total_commit_batch` the total number of batches when known; both are
    only used for display in the subject line.
    """
    from django.conf import settings

    author_name = settings.FEDERATEDCODE_GIT_SERVICE_NAME
    author_email = settings.FEDERATEDCODE_GIT_SERVICE_EMAIL
    tool_name = "pkg:github/aboutcode-org/purldb"

    # The message is assembled line by line so that no source indentation
    # ends up in the message and the Signed-off-by trailer starts at column 0.
    lines = [
        f"Save CycloneDX SBOMs from PurlDB ({commit_batch}/{total_commit_batch})",
        "",
        f"Tool: {tool_name}@v{settings.PURLDB_VERSION}",
        f"Reference: https://{settings.ALLOWED_HOSTS[0]}",
        "",
        f"Signed-off-by: {author_name} <{author_email}>",
        "",
    ]
    return "\n".join(lines)


class Command(VerboseCommand):
    help = (
        "Save and commit CycloneDX SBOMs, generated from PackageDB package data, "
        "to FederatedCode repos."
    )

    def add_arguments(self, parser):
        parser.add_argument(
            "-d",
            "--working-directory",
            type=str,
            required=False,
            help="Directory where FederatedCode repos will be cloned",
        )

    def handle(self, *args, **options):
        """
        Write a CycloneDX SBOM for every Package and commit the files to the
        FederatedCode repository selected for each package purl.

        Files are tracked per repository and pushed once a repository has
        PACKAGE_BATCH_SIZE pending files; leftovers are pushed at the end.
        """
        logger.setLevel(self.get_verbosity(**options))
        working_dir = options.get("working_directory")
        if working_dir:
            working_path = Path(working_dir)
        else:
            working_path = Path(fileutils.get_temp_dir())

        # Clone data and config repo
        data_federation = DataFederation.from_url(
            name="aboutcode-data",
            remote_root_url="https://github.com/aboutcode-data",
        )
        data_cluster = data_federation.get_cluster("cyclonedx16_sboms")

        # TODO: do something more efficient
        # Maps a repo name to (repo, files not yet committed). Tracking the
        # files per repository ensures a file is never committed to a
        # repository other than the one it was written to.
        pending_by_repo = {}
        commit_batch = 1
        packages = packagedb_models.Package.objects.all().iterator(chunk_size=PACKAGE_BATCH_SIZE)
        for package in packages:
            package_repo_name, datafile_path = data_cluster.get_datafile_repo_and_path(
                purl=package.purl
            )
            # Helpers expect a callable taking a single message:
            # `logger.log` would require a level as first argument.
            _, package_repo = federatedcode.get_or_create_repository(
                repo_name=package_repo_name,
                working_path=working_path,
                logger=logger.info,
            )
            sbom_file = pipes.write_package_data_to_file(
                repo=package_repo,
                relative_api_package_metadata_datafile_path=datafile_path,
                package_data=sbom.to_cyclonedx(package),
            )
            _, pending_files = pending_by_repo.setdefault(package_repo_name, (package_repo, []))
            if sbom_file not in pending_files:
                pending_files.append(sbom_file)

            if len(pending_files) >= PACKAGE_BATCH_SIZE:
                self._commit_pending(package_repo_name, package_repo, pending_files, commit_batch)
                commit_batch += 1

        # Flush whatever is left in each repository.
        for repo_name, (repo, pending_files) in pending_by_repo.items():
            if pending_files:
                self._commit_pending(repo_name, repo, pending_files, commit_batch)
                commit_batch += 1

    def _commit_pending(self, repo_name, repo, pending_files, commit_batch):
        """Commit and push `pending_files` to `repo`, then empty the list in place."""
        federatedcode.commit_and_push_changes(
            commit_message=commit_message(commit_batch),
            repo=repo,
            files_to_commit=pending_files,
            logger=logger.info,
        )
        logger.info(f"Committed {len(pending_files)} SBOMs to {repo_name}")
        pending_files.clear()
code-block:: json + + { + "$schema": "http://cyclonedx.org/schema/bom-1.6.schema.json", + "bomFormat": "CycloneDX", + "specVersion": "1.6", + } + Filter by checksum ~~~~~~~~~~~~~~~~~~ From 85bb135862dabaa257ccc6b3b54178fcff4b93f2 Mon Sep 17 00:00:00 2001 From: Jono Yang Date: Fri, 29 May 2026 13:21:17 -0700 Subject: [PATCH 04/15] Update CHANGELOG.rst Signed-off-by: Jono Yang --- CHANGELOG.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 310cae44..9d96f906 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -5,6 +5,7 @@ v7.1.0 --------- - Add SBOM API action on packages (`/api/packages//sbom/`) +- Create command, ``federate_sboms``, to federate CycloneDX 1.6 SBOMs using stored package data v6.0.0 From 661438a9688450cdddd45f995152758f7d80746b Mon Sep 17 00:00:00 2001 From: Chin Yeung Li Date: Mon, 2 Feb 2026 14:32:10 +0800 Subject: [PATCH 05/15] Improve checkpoint loeader with error handling #637 Signed-off-by: Chin Yeung Li --- minecode_pipelines/pipes/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/minecode_pipelines/pipes/__init__.py b/minecode_pipelines/pipes/__init__.py index d8abccb4..33be514d 100644 --- a/minecode_pipelines/pipes/__init__.py +++ b/minecode_pipelines/pipes/__init__.py @@ -70,6 +70,10 @@ def get_checkpoint_from_file(cloned_repo, path): return checkpoint_data or {} except FileNotFoundError: return {} + except FileNotFoundError: + return {} + except json.JSONDecodeError: + return {} def update_checkpoints_in_github(checkpoint, cloned_repo, path, logger=None): From b9ef5bd9c747c30763c71a6d42c358c468ac16d4 Mon Sep 17 00:00:00 2001 From: Chin Yeung Li Date: Tue, 3 Feb 2026 17:42:48 +0800 Subject: [PATCH 06/15] Add `mine_crates` pipeline for Rust crate indexing #637 Signed-off-by: Chin Yeung Li --- minecode_pipelines/pipelines/mine_crates.py | 110 ++++++++++++++++++++ minecode_pipelines/pipes/crates.py | 89 ++++++++++++++++ pyproject-minecode_pipelines.toml | 1 + 3 files changed, 200 
insertions(+) create mode 100644 minecode_pipelines/pipelines/mine_crates.py create mode 100644 minecode_pipelines/pipes/crates.py diff --git a/minecode_pipelines/pipelines/mine_crates.py b/minecode_pipelines/pipelines/mine_crates.py new file mode 100644 index 00000000..16a1e6bc --- /dev/null +++ b/minecode_pipelines/pipelines/mine_crates.py @@ -0,0 +1,110 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. 
class MineCrates(MineCodeBasePipeline):
    """Mine PackageURLs from crates.io-index and publish them to FederatedCode."""

    pipeline_config_repo = "https://github.com/aboutcode-data/minecode-pipelines-config/"
    checkpoint_path = "crates/checkpoints.json"
    append_purls = True

    crates_index_repo_url = "https://github.com/rust-lang/crates.io-index"

    # ISO 8601 timestamp saved by the previous run, or "" when there is none.
    last_checkpoint = ""
    # ISO 8601 timestamp of this run, saved as the next checkpoint.
    current_utc = ""

    @classmethod
    def steps(cls):
        return (
            cls.check_federatedcode_eligibility,
            cls.create_federatedcode_working_dir,
            cls.fetch_federation_config,
            # The run timestamp is taken before the index is cloned: a crate
            # added to the index after the clone then has an update date
            # newer than the saved checkpoint and is picked up by the next run
            # instead of being filtered out as "already seen".
            cls.get_current_utc,
            cls.fetch_checkpoint_and_crates_io_index,
            cls.mine_and_publish_crates_packageurls,
            cls.save_check_point,
            cls.delete_working_dir,
        )

    def get_current_utc(self):
        """Record the current UTC time as an ISO 8601 string in `current_utc`."""
        from datetime import datetime, timezone

        self.current_utc = datetime.now(timezone.utc).isoformat()

    def fetch_checkpoint_and_crates_io_index(self):
        """
        Clone the checkpoint config repo and the crates.io index, load the
        previous checkpoint and create the crates collector.
        """
        self.checkpoint_config_repo = federatedcode.clone_repository(
            repo_url=self.pipeline_config_repo,
            clone_path=self.working_path / "minecode-pipelines-config",
            logger=self.log,
        )
        checkpoint = pipes.get_checkpoint_from_file(
            cloned_repo=self.checkpoint_config_repo,
            path=self.checkpoint_path,
        )
        if checkpoint:
            # Keep the attribute a string even when the key is missing.
            self.last_checkpoint = checkpoint.get("previous_index_date") or ""
            self.log(f"last_checkpoint: {self.last_checkpoint}")

        # Clone the crates.io-index repository
        self.crates_index_repo = federatedcode.clone_repository(
            repo_url=self.crates_index_repo_url,
            clone_path=self.working_path / "crates_index_repo",
            logger=self.log,
        )

        self.crates_collector = crates.CratesCollector(
            repo_location=self.crates_index_repo,
            logger=self.log,
        )

    def mine_and_publish_crates_packageurls(self):
        """Mine crates updated since the last checkpoint and publish their PackageURLs."""
        # NOTE(review): MineApache passes `data_clusters=self.data_clusters`
        # to the same helper; confirm which keyword the helper expects.
        _mine_and_publish_packageurls(
            packageurls=self.crates_collector.get_packages(
                previous_index_date=self.last_checkpoint
            ),
            total_package_count=None,
            data_cluster=self.data_cluster,
            checked_out_repos=self.checked_out_repos,
            working_path=self.working_path,
            append_purls=self.append_purls,
            commit_msg_func=self.commit_message,
            logger=self.log,
        )

    def save_check_point(self):
        """Save this run's timestamp as the checkpoint for the next run."""
        checkpoint = {"previous_index_date": self.current_utc}

        self.log(f"Saving checkpoint: {checkpoint}")
        pipes.update_checkpoints_in_github(
            checkpoint=checkpoint,
            cloned_repo=self.checkpoint_config_repo,
            path=self.checkpoint_path,
            logger=self.log,
        )
TRACE = False
TRACE_DEEP = False

CRATES_API_URL = "https://crates.io/api/v1/crates/"

# Seconds to wait for crates.io before giving up on a single request.
REQUEST_TIMEOUT = 30


class CratesCollector:
    """
    Collect cargo PackageURLs for the crates listed in a crates.io-index
    checkout, using the crates.io API to get the versions of each crate.
    """

    def __init__(
        self,
        repo_location=None,
        logger=None,
    ):
        """
        `repo_location` is the cloned crates.io-index repository; its
        `working_dir` attribute is the directory that is walked.
        `logger` is a callable taking one message; messages are dropped
        when it is not provided.
        """
        if not repo_location:
            raise Exception("repo_location must be set for CratesCollector.")
        self.repo_location = repo_location
        self.logger = logger if logger is not None else (lambda message: None)

    def get_packages(self, previous_index_date=None, logger=None):
        """
        Yield (versionless purl, [versioned purl], []) tuples, one per crate
        version found through the crates.io API.

        `previous_index_date` is an ISO 8601 string; versions last updated
        before it are skipped. `logger` overrides the collector logger for
        this call.
        """
        log = logger or self.logger
        base_dir = self.repo_location.working_dir
        api_base_url = CRATES_API_URL.rstrip("/")

        previous_index_date_parsed = None
        if previous_index_date:
            previous_index_date_parsed = parser.isoparse(previous_index_date)

        for root, dirs, filenames in os.walk(base_dir):
            if root == base_dir:
                # Prune .github and .git in place so os.walk does not descend
                # into them; this does not fail when one of them is absent.
                dirs[:] = [d for d in dirs if d not in (".github", ".git")]
                # Skip README.md and config.json at the top level
                filenames = [f for f in filenames if f not in ("README.md", "config.json")]

            for crate_name in filenames:
                url = f"{api_base_url}/{crate_name}"
                try:
                    response = requests.get(url, timeout=REQUEST_TIMEOUT)
                except requests.RequestException as e:
                    # One failing request must not abort the whole crawl.
                    log(f"Error fetching {crate_name}: {e}")
                    continue
                if response.status_code != 200:
                    log(f"Error fetching {crate_name}: {response.status_code}")
                    continue

                for crate_version_info in response.json().get("versions") or []:
                    package_last_update = crate_version_info.get("updated_at", "")
                    if previous_index_date_parsed and package_last_update:
                        last_update = parser.isoparse(package_last_update)
                        if last_update < previous_index_date_parsed:
                            continue

                    name = crate_version_info.get("crate")
                    version = crate_version_info.get("num")
                    if not name or not version:
                        # Without both there is no meaningful purl to report.
                        continue

                    download_url = "https://crates.io" + crate_version_info.get("dl_path", "")
                    release_date = crate_version_info.get("created_at", "")
                    sha256 = crate_version_info.get("checksum", "")
                    homepage_url = crate_version_info.get("homepage", "")
                    if not homepage_url:
                        homepage_url = crate_version_info.get("repository", "")

                    package_url = PackageURL(
                        type="cargo",
                        name=name,
                        version=str(version),
                    )

                    versionless_purl = PackageURL(
                        type=package_url.type,
                        namespace=package_url.namespace,
                        name=package_url.name,
                    )

                    packaged_data = PackageData(
                        type=package_url.type,
                        namespace=None,
                        name=package_url.name,
                        version=package_url.version,
                        qualifiers=None,
                        download_url=download_url,
                        sha256=sha256,
                        release_date=release_date,
                        repository_homepage_url=homepage_url,
                        repository_download_url=download_url,
                    )

                    yield versionless_purl, [packaged_data.purl], []
class MineApache(MineCodeBasePipeline):
    """Mine PackageURLs from apache.org and publish them to FederatedCode."""

    pipeline_config_repo = "https://github.com/aboutcode-data/minecode-pipelines-config/"
    # Path of the checkpoint file inside the config repository.
    checkpoint_path = "apache/checkpoints.json"

    append_purls = True

    @classmethod
    def steps(cls):
        return (
            cls.check_federatedcode_eligibility,
            cls.create_federatedcode_working_dir,
            cls.fetch_federation_config,
            cls.fetch_checkpoint_config_repo,
            cls.fetch_apache,
            cls.mine_and_publish_apache_packageurls,
            cls.save_check_point,
            cls.delete_working_dir,
        )

    def fetch_checkpoint_config_repo(self):
        """Clone the repository holding the pipeline checkpoints."""
        self.checkpoint_config_repo = federatedcode.clone_repository(
            repo_url=self.pipeline_config_repo,
            clone_path=self.working_path / "minecode-pipelines-config",
            logger=self.log,
        )

    def fetch_apache(self):
        """
        Log the date of the previous sync, if any, and create the collector
        that downloads the Apache file listing and projects metadata.
        """
        checkpoint = pipes.get_checkpoint_from_file(
            cloned_repo=self.checkpoint_config_repo,
            path=self.checkpoint_path,
        )
        # The previous sync date is only reported: every run mines the
        # complete listing.
        last_sync = checkpoint.get("last_sync", "")
        if last_sync:
            self.log(f"last_sync: {last_sync}")
        self.apache_collector = apache.ApacheCollector(
            find_ls_url=apache.FIND_LS_URL,
            project_json=apache.PROJECT_JSON,
            logger=self.log,
        )

    def mine_and_publish_apache_packageurls(self):
        """Mine the Apache archive listing and publish the PackageURLs."""
        # NOTE(review): MineCrates passes `data_cluster=self.data_cluster`
        # to the same helper; confirm which keyword the helper expects.
        _mine_and_publish_packageurls(
            packageurls=self.apache_collector.get_packages(),
            total_package_count=None,
            data_clusters=self.data_clusters,
            checked_out_repos=self.checked_out_repos,
            working_path=self.working_path,
            append_purls=self.append_purls,
            commit_msg_func=self.commit_message,
            logger=self.log,
        )

    def save_check_point(self):
        """Save the time of this sync as the checkpoint."""
        # The timestamp is stored as an ISO 8601 string, as MineCrates does:
        # a datetime object cannot be written to the JSON checkpoint file.
        checkpoint = {"last_sync": datetime.now(timezone.utc).isoformat()}
        self.log(f"Saving checkpoint: {checkpoint}")
        pipes.update_checkpoints_in_github(
            checkpoint=checkpoint,
            cloned_repo=self.checkpoint_config_repo,
            path=self.checkpoint_path,
            logger=self.log,
        )
TRACE = False
TRACE_DEEP = False


FIND_LS_URL = "https://archive.apache.org/dist/zzz/find-ls2.txt.gz"
PROJECT_JSON = "https://projects.apache.org/json/foundation/projects.json"
BASE_URL = "https://archive.apache.org/dist/"
BASE_NAMESPACE = "apache.org/"

# Seconds to wait for a checksum file before giving up.
REQUEST_TIMEOUT = 30


CHECKSUM_EXTS = (
    ".sha256",
    ".sha512",
    ".md5",
    ".sha",
    ".sha1",
)

# only keep downloads with certain extensions for some archives, packages and checksums
ARCHIVE_EXTS = (
    # archives
    ".jar",
    ".zip",
    ".tar.gz",
    ".tgz",
    ".tar.bz2",
    ".war",
    ".tar.xz",
    ".tar",
    # packages
    # not collected: .deb, .rpm, .msi, .exe, .dmg, .nbm
    ".whl",
    ".gem",
    ".nupkg",
)

IGNORED_PATH_CONTAINS = (
    "META/",
    # doc
    "/documentation/",
    "/doc/",
    "-doc.",
    "-doc-",
    "/docs/",
    "-docs.",
    "-docs-",
    "javadoc",
    "fulldoc",
    "apidoc",
    "-manual.",
    "-asdocs.",
    # eclipse p2/update sites are redundant
    "updatesite/",
    "eclipse-update-site",
    "update/eclipse",
    "sling/eclipse",
    "eclipse.site-",
    # large multi-origin binary distributions
    "-distro.",
    "-bin-withdeps.",
    "-bin-with-deps",
    # these are larger distributions with third-parties
    "apache-airavata-distribution",
    "apache-airavata-server",
    "apache-mahout-distribution",
    "/syncope-standalone-",
    "binaries/conda",
    # obscure
    "perl/contrib",
    # index data
    "zzz",
    # doc
    "ant/manual",
    # tmp
    "/tmp/",  # noqa: S108 safe: used only as ignore pattern
)


# TODO: ignore these globs too:

# openoffice/*/binaries is very large
# /*/apache-log4j-*-site.zip


class ApacheCollector:
    """
    Download and process the find-ls file.
    """

    def __init__(
        self,
        find_ls_url=None,
        project_json=None,
        logger=None,
    ):
        """
        Download the file listing at `find_ls_url` and the projects metadata
        at `project_json`, using the module defaults when not provided.
        `logger` is a callable taking one message; messages are dropped
        when it is not provided.
        """
        self.downloads = []
        self.logger = logger if logger is not None else (lambda message: None)

        if not find_ls_url:
            find_ls_url = FIND_LS_URL

        if not project_json:
            project_json = PROJECT_JSON

        find_ls_download = self._fetch_http(find_ls_url)
        project_json_download = self._fetch_http(project_json)
        self.find_ls_location = find_ls_download.path
        self.project_json_location = project_json_download.path

    def __del__(self):
        # Best-effort cleanup: `downloads` is looked up defensively and
        # removal errors are ignored since this runs during finalization.
        for download in getattr(self, "downloads", ()):
            rmtree(download.directory, ignore_errors=True)

    def _fetch_http(self, uri):
        """Download `uri`, remember the download for later cleanup and return it."""
        from scanpipe.pipes.fetch import fetch_http

        fetched = fetch_http(uri)
        self.downloads.append(fetched)
        return fetched

    def _iter_purl_pairs(self, packages):
        """Yield (versionless purl, purl) strings for each package that can be parsed."""
        for package in packages:
            try:
                namespace, name, version, qualifiers = determine_purl_elements(package)
            except ValueError as e:
                self.logger(f"Skipping package: {e}")
                continue

            purl = PackageURL(
                type="sid",
                namespace=namespace,
                name=name,
                version=version,
                qualifiers=qualifiers,
            ).to_string()

            base_purl = PackageURL(
                type="sid",
                namespace=namespace,
                name=name,
            ).to_string()

            yield base_purl, purl

    def get_packages(self):
        """
        Yield (versionless purl, [purls], []) tuples from the find_ls list,
        one per run of consecutive files sharing the same versionless purl.
        """
        from itertools import groupby
        from operator import itemgetter

        txt_path = extract_archives(archive_path=self.find_ls_location)
        packages_data, packages_checksum = get_archives_and_checksum(txt_path)
        updated_packages_list = update_package_data(
            packages_data, packages_checksum, project_json_location=self.project_json_location
        )

        purl_pairs = self._iter_purl_pairs(updated_packages_list)
        for base_purl, pairs in groupby(purl_pairs, key=itemgetter(0)):
            yield base_purl, [purl for _, purl in pairs], []


def _strip_dot_slash(path):
    """
    Return `path` without its leading "./" prefix and leading slashes.

    `str.lstrip("./")` is not used because it strips any run of "." and "/"
    characters and would also eat the dot of a ".name" first segment.
    """
    if path.startswith("./"):
        path = path[2:]
    return path.lstrip("/")


def determine_purl_elements(package):
    """
    Determine and return the namespace, name, version and qualifier based
    on the path info

    Raise a ValueError when neither path parser recognizes the path.
    """
    path = _strip_dot_slash(package.get("filepath") or "")
    parsed_result = parse_apache_path_common(path)
    if parsed_result:
        qualifier = {"file_name": parsed_result["file_name"]}
    else:
        parsed_result = parse_apache_path_complex(path)
        if not parsed_result:
            raise ValueError(f"Cannot determine PackageURL elements from path: {path!r}")
        qualifier = {"download_url": BASE_URL + path}

    namespace = BASE_NAMESPACE + (parsed_result.get("namespace") or "")
    name = parsed_result.get("name")
    version = parsed_result.get("version")
    return namespace, name, version, qualifier


def get_archives_and_checksum(txt_path):
    """
    Return:
    - A list of dictionaries containing the package archive path, size, and release date
    - A list of checksum files

    """
    packages_data = []
    packages_checksum = []
    with open(txt_path, encoding="utf-8") as f:
        for line in f:
            # At most 9 fields: the last one is the path and keeps any
            # whitespace it contains.
            parts = line.strip().split(None, 8)

            if len(parts) < 9:
                continue

            permissions = parts[0]

            # Skip if it's not a file
            if not permissions.startswith("-"):
                continue

            size = parts[4]
            date = f"{parts[5]} {parts[6]} {parts[7]}"
            filepath = parts[8]

            if any(ignored in filepath for ignored in IGNORED_PATH_CONTAINS):
                continue

            if filepath.endswith(CHECKSUM_EXTS):
                packages_checksum.append(filepath)
            elif filepath.endswith(ARCHIVE_EXTS):
                packages_data.append({"filepath": filepath, "size": size, "date": date})

    return packages_data, packages_checksum


def update_package_data(packages_data, packages_checksum, project_json_location):
    """
    Update package metadata with:
    - Project information from
      https://projects.apache.org/json/foundation/projects.json
      (homepage, download page, description).
    - A constructed download URL.

    Checksum values are not collected for now: one request per package
    would likely lead to rate limiting (HTTP 429). `packages_checksum` is
    accepted but unused until this is revisited.
    """
    metadata_fields = {
        "homepage": "repository_homepage_url",
        "download-page": "repository_download_url",
        "description": "description",
    }

    with open(project_json_location, encoding="utf-8") as f:
        data = json.load(f)

    updated_package_data = []
    for package in packages_data:
        package_dict = package.copy()
        path = package["filepath"]
        package_dict["download_url"] = BASE_URL + _strip_dot_slash(path)

        if data:
            # Paths look like "./{package_name}/...": segment 0 is ".".
            segments = path.split("/")
            package_name = segments[1] if len(segments) > 1 else ""
            package_metadata = data.get(package_name, "")
            # In some cases, projects.json uses
            # {package_name}-{subpackage_name} as the key.
            # For example, "directory-fortress" likely refers to
            # files under /directory/fortress*
            if not package_metadata and len(segments) > 2:
                package_metadata = data.get(f"{package_name}-{segments[2]}", "")
            if package_metadata:
                for key, target in metadata_fields.items():
                    value = package_metadata.get(key)
                    if value:
                        package_dict[target] = value

        updated_package_data.append(package_dict)
    return updated_package_data


def get_checksum(url):
    """
    Fetch the checksum file from the given URL and
    return only the hash value.

    Return an empty string when the file is empty.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()

    fields = response.text.split()
    return fields[0] if fields else ""


def extract_archives(archive_path):
    """
    Decompress the gzip file at `archive_path` next to it, dropping the
    last extension, and return the path of the decompressed file.
    """
    txt_path = os.path.splitext(archive_path)[0]

    # Open the gzipped file and write out the decompressed content
    with gzip.open(archive_path, "rb") as f_in, open(txt_path, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
    return txt_path


def parse_apache_path_common(path):
    """
    Return a mapping of namespace, name, version and file_name for a path
    shaped like {namespace}/{name}/{version}/{filename}, or None when the
    path does not follow this layout.
    """
    segments = path.strip().split("/")

    # The minimum required segments for {name}/{version}/{filename} is 3
    if len(segments) < 3:
        return None

    *namespace_segments, name, version, file_name = segments

    # Check if the version segment represents a numeric value (starts with
    # a digit)
    if not (version and version[0].isdigit()):
        return None

    # Ensure the name exists as part of the filename
    if name not in file_name:
        return None

    # namespace consists of all segments from the beginning up to the name
    # segment
    namespace = "/".join(namespace_segments)

    return {"namespace": namespace, "name": name, "version": version, "file_name": file_name}


def parse_apache_path_complex(path):
    segments = path.strip().split("/")

    if len(segments) < 2:
        return None

    path_segments = segments[:-1]
    file_name = segments[-1]

    special_words = {
        "jars",
        "binaries",
        "binary",
        "sources",
        "source",
        "java",
        "bin",
        "dist",
        "old",
    }

    marker_idx = None
    version = ""

    for i, seg in enumerate(path_segments):
        # Look for numeric version groupings
        version_match = re.search(r"(\d+(?:\.\d+)+)", seg)

        is_version = False
        if version_match:
            is_version = True
            if not version:
                version = version_match.group(1)

        # Dynamic check: Matches hardcoded words OR 'rc'
numbers (e.g., rc1, rc2) + # This will completely ignore "release-candidates" + is_special = (seg.lower() in special_words) or bool(re.match(r"^rc\d+$", seg.lower())) + + if (is_version or is_special) and marker_idx is None: + marker_idx = i + + if marker_idx is not None and marker_idx > 0: + name = path_segments[marker_idx - 1] + namespace_segments = path_segments[: marker_idx - 1] + else: + name = path_segments[-1] if path_segments else "" + namespace_segments = path_segments[:-1] if path_segments else [] + + namespace = "/".join(namespace_segments) + + return {"namespace": namespace, "name": name, "version": version, "file_name": file_name} diff --git a/minecode_pipelines/tests/pipes/test_apache.py b/minecode_pipelines/tests/pipes/test_apache.py new file mode 100644 index 00000000..d9e272c9 --- /dev/null +++ b/minecode_pipelines/tests/pipes/test_apache.py @@ -0,0 +1,229 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
+# + +import os + +from commoncode.testcase import FileBasedTesting + +from minecode_pipelines.pipes import apache + + +class ApacheMiscTest(FileBasedTesting): + test_data_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data") + + def test_parse_apache_path_common(self): + paths = [ + "abdera/1.0/apache-abdera-1.0-src.tar.gz", + "accumulo/1.10.1/accumulo-1.10.1-src.tar.gz", + "answer/1.3.0-incubating/apache-answer-1.3.0-incubating-bin-darwin-amd64.tar.gz", + "karaf/cellar/4.0.5/apache-karaf-cellar-4.0.5-src.tar.gz", + "cxf/3.1.9/apache-cxf-3.1.9-src.tar.gz", + "ws/commons/axiom/1_2_2/axiom-1.2.2-bin.zip", + "avalon/framework/jars/avalon-framework-excalibur-test-4.0b1.jar", + "avalon/logkit/v1.2/LogKit-1.2-bin.tar.gz", + ] + expected = [ + { + "namespace": "", + "name": "abdera", + "version": "1.0", + "file_name": "apache-abdera-1.0-src.tar.gz", + }, + { + "namespace": "", + "name": "accumulo", + "version": "1.10.1", + "file_name": "accumulo-1.10.1-src.tar.gz", + }, + { + "namespace": "", + "name": "answer", + "version": "1.3.0-incubating", + "file_name": "apache-answer-1.3.0-incubating-bin-darwin-amd64.tar.gz", + }, + { + "namespace": "karaf", + "name": "cellar", + "version": "4.0.5", + "file_name": "apache-karaf-cellar-4.0.5-src.tar.gz", + }, + { + "namespace": "", + "name": "cxf", + "version": "3.1.9", + "file_name": "apache-cxf-3.1.9-src.tar.gz", + }, + { + "namespace": "ws/commons", + "name": "axiom", + "version": "1_2_2", + "file_name": "axiom-1.2.2-bin.zip", + }, + None, + None, + ] + + for i, p in enumerate(paths): + self.assertEqual(apache.parse_apache_path_common(p), expected[i]) + + def test_parse_apache_path_complex(self): + paths = [ + "avalon/framework/jars/avalon-framework-excalibur-test-4.0b1.jar", + "avalon/logkit/v1.2/LogKit-1.2-bin.tar.gz", + "avalon/merlin/binaries/3.0/avalon-merlin-3.0-dist.zip", + "avalon/merlin/jars/merlin-plugin-1.0.jar", + "avro/avro-1.10.0/java/avro-grpc-1.10.0-sources.jar", + 
"httpd/libapreq/libapreq-1.1.tar.gz", + "airflow/providers/apache_airflow_providers_cncf_kubernetes-10.18.0.tar.gz", + "ace/apache-ace-2.1.0/apache-ace-2.1.0-src.zip", + "avalon/excalibur/v4.0/Excalibur-4.0-bin.tar.gz", + "airflow/providers/2.11/apache_airflow_providers_fab-1.5.4-py3-none-any.whl", + "beam/vendor/beam-vendor-calcite-1_40_0/0.1/apache-beam-f6ec9cb0c167815f942cf70a674f92a04819c83b-source-release.zip", + "groovy/2.5.23/distribution/apache-groovy-binary-2.5.23.zip", + "groovy/2.5.23/sources/apache-groovy-src-2.5.23.zip", + "geronimo/safeguard/safeguard-parent-1.2.1-source-release.zip", + "beam/2.73.0/prism/windows/arm64/apache_beam-v2.73.0-prism-windows-arm64.zip", + "ranger/2.7.0/plugins/hdfs/ranger-2.7.0-hdfs-plugin.tar.gz", + "netbeans/netbeans-maven-archetypes/netbeans-platform-app-archetype/netbeans-platform-app-archetype-1.24/netbeans-platform-app-archetype-1.24-source-release.zip", + "ant/antlibs/antunit/source/apache-ant-antunit-1.5.0-src.tar.bz2", + "ant/antlibs/compress/binaries/apache-ant-compress-1.5-bin.zip", + "asterixdb/asterixdb-0.9.8.1/apache-asterixdb-0.9.8.1-source-release.zip", + "deltacloud/rc1/deltacloud-client-1.1.0.gem", + ] + expected = [ + { + "namespace": "avalon", + "name": "framework", + "version": "", + "file_name": "avalon-framework-excalibur-test-4.0b1.jar", + }, + { + "namespace": "avalon", + "name": "logkit", + "version": "1.2", + "file_name": "LogKit-1.2-bin.tar.gz", + }, + { + "namespace": "avalon", + "name": "merlin", + "version": "3.0", + "file_name": "avalon-merlin-3.0-dist.zip", + }, + { + "namespace": "avalon", + "name": "merlin", + "version": "", + "file_name": "merlin-plugin-1.0.jar", + }, + { + "namespace": "", + "name": "avro", + "version": "1.10.0", + "file_name": "avro-grpc-1.10.0-sources.jar", + }, + { + "namespace": "httpd", + "name": "libapreq", + "version": "", + "file_name": "libapreq-1.1.tar.gz", + }, + { + "namespace": "airflow", + "name": "providers", + "version": "", + "file_name": 
"apache_airflow_providers_cncf_kubernetes-10.18.0.tar.gz", + }, + { + "namespace": "", + "name": "ace", + "version": "2.1.0", + "file_name": "apache-ace-2.1.0-src.zip", + }, + { + "namespace": "avalon", + "name": "excalibur", + "version": "4.0", + "file_name": "Excalibur-4.0-bin.tar.gz", + }, + { + "namespace": "airflow", + "name": "providers", + "version": "2.11", + "file_name": "apache_airflow_providers_fab-1.5.4-py3-none-any.whl", + }, + { + "namespace": "beam/vendor", + "name": "beam-vendor-calcite-1_40_0", + "version": "0.1", + "file_name": "apache-beam-f6ec9cb0c167815f942cf70a674f92a04819c83b-source-release.zip", + }, + { + "namespace": "", + "name": "groovy", + "version": "2.5.23", + "file_name": "apache-groovy-binary-2.5.23.zip", + }, + { + "namespace": "", + "name": "groovy", + "version": "2.5.23", + "file_name": "apache-groovy-src-2.5.23.zip", + }, + { + "namespace": "geronimo", + "name": "safeguard", + "version": "", + "file_name": "safeguard-parent-1.2.1-source-release.zip", + }, + { + "namespace": "", + "name": "beam", + "version": "2.73.0", + "file_name": "apache_beam-v2.73.0-prism-windows-arm64.zip", + }, + { + "namespace": "", + "name": "ranger", + "version": "2.7.0", + "file_name": "ranger-2.7.0-hdfs-plugin.tar.gz", + }, + { + "namespace": "netbeans/netbeans-maven-archetypes", + "name": "netbeans-platform-app-archetype", + "version": "1.24", + "file_name": "netbeans-platform-app-archetype-1.24-source-release.zip", + }, + { + "namespace": "ant/antlibs", + "name": "antunit", + "version": "", + "file_name": "apache-ant-antunit-1.5.0-src.tar.bz2", + }, + { + "namespace": "ant/antlibs", + "name": "compress", + "version": "", + "file_name": "apache-ant-compress-1.5-bin.zip", + }, + { + "namespace": "", + "name": "asterixdb", + "version": "0.9.8.1", + "file_name": "apache-asterixdb-0.9.8.1-source-release.zip", + }, + { + "namespace": "", + "name": "deltacloud", + "version": "", + "file_name": "deltacloud-client-1.1.0.gem", + }, + ] + + for i, p in 
enumerate(paths): + self.assertEqual(apache.parse_apache_path_complex(p), expected[i]) diff --git a/pyproject-minecode_pipelines.toml b/pyproject-minecode_pipelines.toml index 076fe081..05fc3f8f 100644 --- a/pyproject-minecode_pipelines.toml +++ b/pyproject-minecode_pipelines.toml @@ -63,6 +63,7 @@ mine_cran = "minecode_pipelines.pipelines.mine_cran:MineCran" mine_swift = "minecode_pipelines.pipelines.mine_swift:MineSwift" mine_composer = "minecode_pipelines.pipelines.mine_composer:MineComposer" mine_crates = "minecode_pipelines.pipelines.mine_crates:MineCrates" +mine_apache = "minecode_pipelines.pipelines.mine_apache:MineApache" [tool.bumpversion] current_version = "1.0.1" From 093936f42077932e40320ce092ff0d91736feeac Mon Sep 17 00:00:00 2001 From: Chin Yeung Li Date: Mon, 29 Jun 2026 12:59:17 +0800 Subject: [PATCH 09/15] More comments and code enhancements #631 Signed-off-by: Chin Yeung Li --- minecode_pipelines/pipelines/mine_apache.py | 6 +- minecode_pipelines/pipes/apache.py | 25 +- minecode_pipelines/tests/pipes/test_apache.py | 324 ++++++++++-------- 3 files changed, 196 insertions(+), 159 deletions(-) diff --git a/minecode_pipelines/pipelines/mine_apache.py b/minecode_pipelines/pipelines/mine_apache.py index c132090d..5cd1dc11 100644 --- a/minecode_pipelines/pipelines/mine_apache.py +++ b/minecode_pipelines/pipelines/mine_apache.py @@ -66,6 +66,10 @@ def fetch_apache(self): last_sync = checkpoint.get("last_sync", "") if last_sync: self.log(f"last_sync: {last_sync}") + # The "find-ls2.txt.gz" is a compressed metadata manifest file + # generated by the Apache Software Foundation. It contains a + # recursive plaintext directory listing of every single folder and + # file hosted on the Apache distribution archives. 
find_ls_url = "https://archive.apache.org/dist/zzz/find-ls2.txt.gz" project_json = "https://projects.apache.org/json/foundation/projects.json" self.apache_collector = apache.ApacheCollector( @@ -89,7 +93,7 @@ def mine_and_publish_apache_packageurls(self): def save_check_point(self): checkpoint_path = "apache/checkpoints.json" # We use the current timestamp to record when the sync occurred. - now = datetime.now(timezone.utc) + now = datetime.now(timezone.utc).isoformat() checkpoint = {"last_sync": now} self.log(f"Saving checkpoint: {checkpoint}") pipes.update_checkpoints_in_github( diff --git a/minecode_pipelines/pipes/apache.py b/minecode_pipelines/pipes/apache.py index d246a1c5..10c4277d 100644 --- a/minecode_pipelines/pipes/apache.py +++ b/minecode_pipelines/pipes/apache.py @@ -162,7 +162,6 @@ def get_packages(self): size = package.get("size", "") release_date = package.get("date", "") """ - namespace, name, version, qualifiers = determine_purl_elements(package) purl = PackageURL( @@ -334,6 +333,12 @@ def extract_archives(archive_path): def parse_apache_path_common(path): + """ + Parse standard Apache paths following a strict + '{name}/{version}/{filename}' structure. Requires the version segment + to start with a digit and the component name to be a substring of the + filename. + """ segments = path.strip().split("/") # The minimum required segments for {name}/{version}/{filename} is 3 @@ -367,6 +372,14 @@ def parse_apache_path_common(path): def parse_apache_path_complex(path): + """ + Parse non-standard Apache paths by locating a version or keyword + boundary. + + Scans left-to-right for a "marker" segment (a semantic version or words + like 'bin', 'rc1'). The segment right before this marker becomes the + 'name'. Falls back to the parent directory if no marker is found. 
+ """ segments = path.strip().split("/") if len(segments) < 2: @@ -391,8 +404,9 @@ def parse_apache_path_complex(path): version = "" for i, seg in enumerate(path_segments): - # Look for numeric version groupings - version_match = re.search(r"(\d+(?:\.\d+)+)", seg) + # Match standard versions (e.g., 1.2.0) OR release candidates (e.g., rc1, rc1.1) + # Added re.IGNORECASE to safely handle 'RC1' or 'rc1' + version_match = re.search(r"(\d+(?:\.\d+)+|rc\d+(?:\.\d+)*)", seg, re.IGNORECASE) is_version = False if version_match: @@ -400,9 +414,8 @@ def parse_apache_path_complex(path): if not version: version = version_match.group(1) - # Dynamic check: Matches hardcoded words OR 'rc' + numbers (e.g., rc1, rc2) - # This will completely ignore "release-candidates" - is_special = (seg.lower() in special_words) or bool(re.match(r"^rc\d+$", seg.lower())) + # Check only against the hardcoded metadata keywords + is_special = seg.lower() in special_words if (is_version or is_special) and marker_idx is None: marker_idx = i diff --git a/minecode_pipelines/tests/pipes/test_apache.py b/minecode_pipelines/tests/pipes/test_apache.py index d9e272c9..3ff99629 100644 --- a/minecode_pipelines/tests/pipes/test_apache.py +++ b/minecode_pipelines/tests/pipes/test_apache.py @@ -72,158 +72,178 @@ def test_parse_apache_path_common(self): for i, p in enumerate(paths): self.assertEqual(apache.parse_apache_path_common(p), expected[i]) - def test_parse_apache_path_complex(self): - paths = [ - "avalon/framework/jars/avalon-framework-excalibur-test-4.0b1.jar", - "avalon/logkit/v1.2/LogKit-1.2-bin.tar.gz", - "avalon/merlin/binaries/3.0/avalon-merlin-3.0-dist.zip", - "avalon/merlin/jars/merlin-plugin-1.0.jar", - "avro/avro-1.10.0/java/avro-grpc-1.10.0-sources.jar", - "httpd/libapreq/libapreq-1.1.tar.gz", - "airflow/providers/apache_airflow_providers_cncf_kubernetes-10.18.0.tar.gz", - "ace/apache-ace-2.1.0/apache-ace-2.1.0-src.zip", - "avalon/excalibur/v4.0/Excalibur-4.0-bin.tar.gz", - 
"airflow/providers/2.11/apache_airflow_providers_fab-1.5.4-py3-none-any.whl", - "beam/vendor/beam-vendor-calcite-1_40_0/0.1/apache-beam-f6ec9cb0c167815f942cf70a674f92a04819c83b-source-release.zip", - "groovy/2.5.23/distribution/apache-groovy-binary-2.5.23.zip", - "groovy/2.5.23/sources/apache-groovy-src-2.5.23.zip", - "geronimo/safeguard/safeguard-parent-1.2.1-source-release.zip", - "beam/2.73.0/prism/windows/arm64/apache_beam-v2.73.0-prism-windows-arm64.zip", - "ranger/2.7.0/plugins/hdfs/ranger-2.7.0-hdfs-plugin.tar.gz", - "netbeans/netbeans-maven-archetypes/netbeans-platform-app-archetype/netbeans-platform-app-archetype-1.24/netbeans-platform-app-archetype-1.24-source-release.zip", - "ant/antlibs/antunit/source/apache-ant-antunit-1.5.0-src.tar.bz2", - "ant/antlibs/compress/binaries/apache-ant-compress-1.5-bin.zip", - "asterixdb/asterixdb-0.9.8.1/apache-asterixdb-0.9.8.1-source-release.zip", - "deltacloud/rc1/deltacloud-client-1.1.0.gem", + def test_parse_complex_with_special_word_markers(self): + """ + Test paths where parsing boundaries are triggered by keywords like + 'jars', 'binaries', or 'source'. 
+ """ + cases = [ + ( + "avalon/framework/jars/avalon-framework-excalibur-test-4.0b1.jar", + { + "namespace": "avalon", + "name": "framework", + "version": "", + "file_name": "avalon-framework-excalibur-test-4.0b1.jar", + }, + ), + ( + "avalon/merlin/binaries/3.0/avalon-merlin-3.0-dist.zip", + { + "namespace": "avalon", + "name": "merlin", + "version": "3.0", + "file_name": "avalon-merlin-3.0-dist.zip", + }, + ), + ( + "avalon/merlin/jars/merlin-plugin-1.0.jar", + { + "namespace": "avalon", + "name": "merlin", + "version": "", + "file_name": "merlin-plugin-1.0.jar", + }, + ), + ( + "ant/antlibs/antunit/source/apache-ant-antunit-1.5.0-src.tar.bz2", + { + "namespace": "ant/antlibs", + "name": "antunit", + "version": "", + "file_name": "apache-ant-antunit-1.5.0-src.tar.bz2", + }, + ), + ( + "ant/antlibs/compress/binaries/apache-ant-compress-1.5-bin.zip", + { + "namespace": "ant/antlibs", + "name": "compress", + "version": "", + "file_name": "apache-ant-compress-1.5-bin.zip", + }, + ), ] - expected = [ - { - "namespace": "avalon", - "name": "framework", - "version": "", - "file_name": "avalon-framework-excalibur-test-4.0b1.jar", - }, - { - "namespace": "avalon", - "name": "logkit", - "version": "1.2", - "file_name": "LogKit-1.2-bin.tar.gz", - }, - { - "namespace": "avalon", - "name": "merlin", - "version": "3.0", - "file_name": "avalon-merlin-3.0-dist.zip", - }, - { - "namespace": "avalon", - "name": "merlin", - "version": "", - "file_name": "merlin-plugin-1.0.jar", - }, - { - "namespace": "", - "name": "avro", - "version": "1.10.0", - "file_name": "avro-grpc-1.10.0-sources.jar", - }, - { - "namespace": "httpd", - "name": "libapreq", - "version": "", - "file_name": "libapreq-1.1.tar.gz", - }, - { - "namespace": "airflow", - "name": "providers", - "version": "", - "file_name": "apache_airflow_providers_cncf_kubernetes-10.18.0.tar.gz", - }, - { - "namespace": "", - "name": "ace", - "version": "2.1.0", - "file_name": "apache-ace-2.1.0-src.zip", - }, - { - "namespace": 
"avalon", - "name": "excalibur", - "version": "4.0", - "file_name": "Excalibur-4.0-bin.tar.gz", - }, - { - "namespace": "airflow", - "name": "providers", - "version": "2.11", - "file_name": "apache_airflow_providers_fab-1.5.4-py3-none-any.whl", - }, - { - "namespace": "beam/vendor", - "name": "beam-vendor-calcite-1_40_0", - "version": "0.1", - "file_name": "apache-beam-f6ec9cb0c167815f942cf70a674f92a04819c83b-source-release.zip", - }, - { - "namespace": "", - "name": "groovy", - "version": "2.5.23", - "file_name": "apache-groovy-binary-2.5.23.zip", - }, - { - "namespace": "", - "name": "groovy", - "version": "2.5.23", - "file_name": "apache-groovy-src-2.5.23.zip", - }, - { - "namespace": "geronimo", - "name": "safeguard", - "version": "", - "file_name": "safeguard-parent-1.2.1-source-release.zip", - }, - { - "namespace": "", - "name": "beam", - "version": "2.73.0", - "file_name": "apache_beam-v2.73.0-prism-windows-arm64.zip", - }, - { - "namespace": "", - "name": "ranger", - "version": "2.7.0", - "file_name": "ranger-2.7.0-hdfs-plugin.tar.gz", - }, - { - "namespace": "netbeans/netbeans-maven-archetypes", - "name": "netbeans-platform-app-archetype", - "version": "1.24", - "file_name": "netbeans-platform-app-archetype-1.24-source-release.zip", - }, - { - "namespace": "ant/antlibs", - "name": "antunit", - "version": "", - "file_name": "apache-ant-antunit-1.5.0-src.tar.bz2", - }, - { - "namespace": "ant/antlibs", - "name": "compress", - "version": "", - "file_name": "apache-ant-compress-1.5-bin.zip", - }, - { - "namespace": "", - "name": "asterixdb", - "version": "0.9.8.1", - "file_name": "apache-asterixdb-0.9.8.1-source-release.zip", - }, - { - "namespace": "", - "name": "deltacloud", - "version": "", - "file_name": "deltacloud-client-1.1.0.gem", - }, + for path, expected in cases: + self.assertEqual(apache.parse_apache_path_complex(path), expected) + + def test_parse_complex_with_version_markers(self): + """ + Test paths where parsing boundaries are explicitly 
triggered by version strings. + """ + cases = [ + ( + "avalon/logkit/v1.2/LogKit-1.2-bin.tar.gz", + { + "namespace": "avalon", + "name": "logkit", + "version": "1.2", + "file_name": "LogKit-1.2-bin.tar.gz", + }, + ), + ( + "avro/avro-1.10.0/java/avro-grpc-1.10.0-sources.jar", + { + "namespace": "", + "name": "avro", + "version": "1.10.0", + "file_name": "avro-grpc-1.10.0-sources.jar", + }, + ), + ( + "airflow/providers/2.11/apache_airflow_providers_fab-1.5.4-py3-none-any.whl", + { + "namespace": "airflow", + "name": "providers", + "version": "2.11", + "file_name": "apache_airflow_providers_fab-1.5.4-py3-none-any.whl", + }, + ), + ( + "beam/vendor/beam-vendor-calcite-1_40_0/0.1/apache-beam-f6ec9cb0c167815f942cf70a674f92a04819c83b-source-release.zip", + { + "namespace": "beam/vendor", + "name": "beam-vendor-calcite-1_40_0", + "version": "0.1", + "file_name": "apache-beam-f6ec9cb0c167815f942cf70a674f92a04819c83b-source-release.zip", + }, + ), + ( + "groovy/2.5.23/distribution/apache-groovy-binary-2.5.23.zip", + { + "namespace": "", + "name": "groovy", + "version": "2.5.23", + "file_name": "apache-groovy-binary-2.5.23.zip", + }, + ), + ( + "beam/2.73.0/prism/windows/arm64/apache_beam-v2.73.0-prism-windows-arm64.zip", + { + "namespace": "", + "name": "beam", + "version": "2.73.0", + "file_name": "apache_beam-v2.73.0-prism-windows-arm64.zip", + }, + ), + ( + "netbeans/netbeans-maven-archetypes/netbeans-platform-app-archetype/netbeans-platform-app-archetype-1.24/netbeans-platform-app-archetype-1.24-source-release.zip", + { + "namespace": "netbeans/netbeans-maven-archetypes", + "name": "netbeans-platform-app-archetype", + "version": "1.24", + "file_name": "netbeans-platform-app-archetype-1.24-source-release.zip", + }, + ), ] + for path, expected in cases: + self.assertEqual(apache.parse_apache_path_complex(path), expected) - for i, p in enumerate(paths): - self.assertEqual(apache.parse_apache_path_complex(p), expected[i]) + def test_parse_complex_fallback_logic(self): + 
""" + Test no version in path + Only treat the version found in the path as the package version. + A version found in the filename represents the file's own version, + not necessary the package version. + There are cases where a package contains multiple files, each with + its own version. + For instance, + "/namespace/package/1.0.0/john-1.2.3.zip" + "/namespace/package/1.0.0/doo-2.3.zip" + """ + cases = [ + ( + "httpd/libapreq/libapreq-1.1.tar.gz", + { + "namespace": "httpd", + "name": "libapreq", + "version": "", + "file_name": "libapreq-1.1.tar.gz", + }, + ), + ( + "airflow/providers/apache_airflow_providers_cncf_kubernetes-10.18.0.tar.gz", + { + "namespace": "airflow", + "name": "providers", + "version": "", + "file_name": "apache_airflow_providers_cncf_kubernetes-10.18.0.tar.gz", + }, + ), + ] + for path, expected in cases: + self.assertEqual(apache.parse_apache_path_complex(path), expected) + + def test_parse_complex_release_candidate_markers(self): + """ + Test handling for release candidate patterns like 'rc1', 'rc2'. 
+ """ + path = "deltacloud/rc1/deltacloud-client-1.1.0.gem" + expected = { + "namespace": "", + "name": "deltacloud", + "version": "rc1", + "file_name": "deltacloud-client-1.1.0.gem", + } + + self.assertEqual(apache.parse_apache_path_complex(path), expected) From f6a6cd67a68c705ec58ab92a1c7269f11cddeb32 Mon Sep 17 00:00:00 2001 From: Chin Yeung Li Date: Mon, 2 Feb 2026 14:32:10 +0800 Subject: [PATCH 10/15] Improve checkpoint loeader with error handling #637 Signed-off-by: Chin Yeung Li --- minecode_pipelines/pipes/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/minecode_pipelines/pipes/__init__.py b/minecode_pipelines/pipes/__init__.py index 33be514d..9e69c180 100644 --- a/minecode_pipelines/pipes/__init__.py +++ b/minecode_pipelines/pipes/__init__.py @@ -67,13 +67,11 @@ def get_checkpoint_from_file(cloned_repo, path): try: with open(checkpoint_path) as f: checkpoint_data = json.load(f) - return checkpoint_data or {} - except FileNotFoundError: - return {} except FileNotFoundError: return {} except json.JSONDecodeError: return {} + return checkpoint_data def update_checkpoints_in_github(checkpoint, cloned_repo, path, logger=None): From 76e85e4a1ebe7a5a17c24e9fe1f7a7a6d38bb90b Mon Sep 17 00:00:00 2001 From: Chin Yeung Li Date: Mon, 29 Jun 2026 15:04:57 +0800 Subject: [PATCH 11/15] Refine code and tests #631 Signed-off-by: Chin Yeung Li --- minecode_pipelines/pipes/apache.py | 5 +-- minecode_pipelines/tests/pipes/test_apache.py | 32 ++++++++----------- 2 files changed, 15 insertions(+), 22 deletions(-) diff --git a/minecode_pipelines/pipes/apache.py b/minecode_pipelines/pipes/apache.py index 10c4277d..ab839371 100644 --- a/minecode_pipelines/pipes/apache.py +++ b/minecode_pipelines/pipes/apache.py @@ -359,10 +359,6 @@ def parse_apache_path_common(path): # name is the segment before the version name = segments[-3] - # Ensure the name exists as part of the filename - if name not in file_name: - return None - # namespace consists of all 
segments from the beginning up to the name # segment namespace_segments = segments[:-3] @@ -398,6 +394,7 @@ def parse_apache_path_complex(path): "bin", "dist", "old", + "obsolete", } marker_idx = None diff --git a/minecode_pipelines/tests/pipes/test_apache.py b/minecode_pipelines/tests/pipes/test_apache.py index 3ff99629..ad440fc9 100644 --- a/minecode_pipelines/tests/pipes/test_apache.py +++ b/minecode_pipelines/tests/pipes/test_apache.py @@ -27,6 +27,8 @@ def test_parse_apache_path_common(self): "ws/commons/axiom/1_2_2/axiom-1.2.2-bin.zip", "avalon/framework/jars/avalon-framework-excalibur-test-4.0b1.jar", "avalon/logkit/v1.2/LogKit-1.2-bin.tar.gz", + "airflow/providers/2.11/apache_airflow_providers_fab-1.5.4-py3-none-any.whl", + "beam/vendor/beam-vendor-calcite-1_40_0/0.1/apache-beam-f6ec9cb0c167815f942cf70a674f92a04819c83b-source-release.zip", ] expected = [ { @@ -67,6 +69,18 @@ def test_parse_apache_path_common(self): }, None, None, + { + "namespace": "airflow", + "name": "providers", + "version": "2.11", + "file_name": "apache_airflow_providers_fab-1.5.4-py3-none-any.whl", + }, + { + "namespace": "beam/vendor", + "name": "beam-vendor-calcite-1_40_0", + "version": "0.1", + "file_name": "apache-beam-f6ec9cb0c167815f942cf70a674f92a04819c83b-source-release.zip", + }, ] for i, p in enumerate(paths): @@ -150,24 +164,6 @@ def test_parse_complex_with_version_markers(self): "file_name": "avro-grpc-1.10.0-sources.jar", }, ), - ( - "airflow/providers/2.11/apache_airflow_providers_fab-1.5.4-py3-none-any.whl", - { - "namespace": "airflow", - "name": "providers", - "version": "2.11", - "file_name": "apache_airflow_providers_fab-1.5.4-py3-none-any.whl", - }, - ), - ( - "beam/vendor/beam-vendor-calcite-1_40_0/0.1/apache-beam-f6ec9cb0c167815f942cf70a674f92a04819c83b-source-release.zip", - { - "namespace": "beam/vendor", - "name": "beam-vendor-calcite-1_40_0", - "version": "0.1", - "file_name": "apache-beam-f6ec9cb0c167815f942cf70a674f92a04819c83b-source-release.zip", - }, - 
), ( "groovy/2.5.23/distribution/apache-groovy-binary-2.5.23.zip", { From c54d259caeabd3b832efac1032f332174058b056 Mon Sep 17 00:00:00 2001 From: Chin Yeung Li Date: Tue, 30 Jun 2026 16:59:22 +0800 Subject: [PATCH 12/15] Create parent dir if not exist. Signed-off-by: Chin Yeung Li --- minecode_pipelines/pipes/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/minecode_pipelines/pipes/__init__.py b/minecode_pipelines/pipes/__init__.py index 9e69c180..a5c650bb 100644 --- a/minecode_pipelines/pipes/__init__.py +++ b/minecode_pipelines/pipes/__init__.py @@ -92,6 +92,8 @@ def update_checkpoints_file_in_github(checkpoints_file, cloned_repo, path): from scanpipe.pipes.federatedcode import commit_and_push_changes checkpoint_path = os.path.join(cloned_repo.working_dir, path) + # Create the parent directory if it does not exist + os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True) shutil.move(checkpoints_file, checkpoint_path) commit_message = """Update federatedcode purl mining checkpoint""" commit_and_push_changes( From 556312227f57627f77268af4df2ce9cdbb308e49 Mon Sep 17 00:00:00 2001 From: Chin Yeung Li Date: Tue, 30 Jun 2026 17:00:27 +0800 Subject: [PATCH 13/15] Heavily modify the code to follow the structure/template as the mine_npm.py #631 Signed-off-by: Chin Yeung Li --- minecode_pipelines/pipelines/mine_apache.py | 93 +++-- minecode_pipelines/pipes/apache.py | 362 +++++++++++++++----- 2 files changed, 316 insertions(+), 139 deletions(-) diff --git a/minecode_pipelines/pipelines/mine_apache.py b/minecode_pipelines/pipelines/mine_apache.py index 5cd1dc11..1bfba62f 100644 --- a/minecode_pipelines/pipelines/mine_apache.py +++ b/minecode_pipelines/pipelines/mine_apache.py @@ -20,85 +20,84 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/aboutcode-org/scancode.io for support and download.
-from scanpipe.pipes import federatedcode -from minecode_pipelines import pipes +from minecode_pipelines.pipes import apache from minecode_pipelines.pipelines import MineCodeBasePipeline from minecode_pipelines.pipelines import _mine_and_publish_packageurls -from minecode_pipelines.pipes import apache - -from datetime import datetime, timezone class MineApache(MineCodeBasePipeline): """Mine PackageURLs from apache.org and publish them to FederatedCode.""" - pipeline_config_repo = "https://github.com/aboutcode-data/minecode-pipelines-config/" - - append_purls = True + package_batch_size = 5 @classmethod def steps(cls): return ( cls.check_federatedcode_eligibility, cls.create_federatedcode_working_dir, + cls.mine_apache_packages, + cls.get_apache_packages_to_sync, cls.fetch_federation_config, - cls.fetch_checkpoint_config_repo, - cls.fetch_apache, - cls.mine_and_publish_apache_packageurls, - cls.save_check_point, - cls.delete_working_dir, + cls.mine_and_publish_packageurls, + cls.update_mined_checkpoints, + # cls.delete_working_dir, + ) + + def mine_apache_packages(self): + """Mine apache package archive path from the find_ls file or checkpoint.""" + (self.apache_packages, self.apache_packages_metadata, self.config_repo) = ( + apache.mine_apache_packages(logger=self.log) ) - def fetch_checkpoint_config_repo(self): - self.checkpoint_config_repo = federatedcode.clone_repository( - repo_url=self.pipeline_config_repo, - clone_path=self.working_path / "minecode-pipelines-config", + def get_apache_packages_to_sync(self): + """Get apache packages which needs to be synced using checkpoint.""" + self.packages, self.synced_packages = apache.get_apache_packages_to_sync( + packages_file=self.apache_packages, logger=self.log, ) - def fetch_apache(self): - checkpoint_path = "apache/checkpoints.json" - checkpoint = pipes.get_checkpoint_from_file( - cloned_repo=self.checkpoint_config_repo, - path=checkpoint_path, + def packages_count(self): + return len(self.packages) + + def 
mine_packageurls(self): + """Yield Apache PackageURLs for all mined Apache packages.""" + self.packages_mined = [] + yield from apache.mine_and_publish_apache_packageurls( + packages_to_sync=self.packages, + packages_mined=self.packages_mined, + packages_metadata=self.apache_packages_metadata, + logger=self.log, ) - last_sync = checkpoint.get("last_sync", "") - if last_sync: - self.log(f"last_sync: {last_sync}") - # The "find-ls2.txt.gz" is a compressed metadata manifest file - # generated by the Apache Software Foundation. It contains a - # recursive plaintext directory listing of every single folder and - # file hosted on the Apache distribution archives. - find_ls_url = "https://archive.apache.org/dist/zzz/find-ls2.txt.gz" - project_json = "https://projects.apache.org/json/foundation/projects.json" - self.apache_collector = apache.ApacheCollector( - find_ls_url=find_ls_url, - project_json=project_json, + + def save_check_point(self): + apache.save_mined_packages_in_checkpoint( + packages_mined=self.packages_mined, + synced_packages=self.synced_packages, + config_repo=self.config_repo, logger=self.log, ) + self.packages_mined = [] + + def mine_and_publish_packageurls(self): + """Mine and publish PackageURLs.""" - def mine_and_publish_apache_packageurls(self): _mine_and_publish_packageurls( - packageurls=self.apache_collector.get_packages(), - total_package_count=None, + packageurls=self.mine_packageurls(), + total_package_count=self.packages_count(), data_clusters=self.data_clusters, checked_out_repos=self.checked_out_repos, working_path=self.working_path, append_purls=self.append_purls, commit_msg_func=self.commit_message, logger=self.log, + checkpoint_func=self.save_check_point, + checkpoint_on_commit=True, + batch_size=self.package_batch_size, ) - def save_check_point(self): - checkpoint_path = "apache/checkpoints.json" - # We use the current timestamp to record when the sync occurred.
- now = datetime.now(timezone.utc).isoformat() - checkpoint = {"last_sync": now} - self.log(f"Saving checkpoint: {checkpoint}") - pipes.update_checkpoints_in_github( - checkpoint=checkpoint, - cloned_repo=self.checkpoint_config_repo, - path=checkpoint_path, + def update_mined_checkpoints(self): + apache.update_mined_checkpoints( + config_repo=self.config_repo, logger=self.log, ) diff --git a/minecode_pipelines/pipes/apache.py b/minecode_pipelines/pipes/apache.py index ab839371..a39f8bfc 100644 --- a/minecode_pipelines/pipes/apache.py +++ b/minecode_pipelines/pipes/apache.py @@ -7,21 +7,40 @@ # See https://aboutcode.org for more information about nexB OSS projects. # +from datetime import datetime + +from minecode_pipelines.pipes import fetch_checkpoint_from_github +from minecode_pipelines.pipes import update_checkpoints_in_github +from minecode_pipelines.pipes import update_checkpoints_file_in_github +from minecode_pipelines.pipes import get_mined_packages_from_checkpoint +from minecode_pipelines.pipes import update_mined_packages_in_checkpoint +from minecode_pipelines.pipes import MINECODE_PIPELINES_CONFIG_REPO +from minecode_pipelines.pipes import write_packages_json +from minecode_pipelines.pipes import compress_packages_file + +from minecode_pipelines.utils import get_temp_dir + +from packageurl import PackageURL + +from scanpipe.pipes.federatedcode import clone_repository +from scanpipe.pipes.federatedcode import delete_local_clone + +from scanpipe.pipes.fetch import fetch_http + import gzip import shutil import json import os -from shutil import rmtree import re import requests -from packageurl import PackageURL - TRACE = False TRACE_DEEP = False +SID_TYPE = "sid" + FIND_LS_URL = "https://archive.apache.org/dist/zzz/find-ls2.txt.gz" PROJECT_JSON = "https://projects.apache.org/json/foundation/projects.json" @@ -29,6 +48,13 @@ BASE_NAMESPACE = "apache.org/" +PACKAGE_FILE_NAME = "ApachePackages.json" +COMPRESSED_PACKAGE_FILE_NAME = "ApachePackages.json.gz" 
+COMPRESSED_APACHE_PACKAGES_PATH = "apache/" + COMPRESSED_PACKAGE_FILE_NAME +APACHE_CHECKPOINT_PATH = "apache/checkpoints.json" +APACHE_PACKAGES_CHECKPOINT_PATH = "apache/packages_checkpoint.json" +PACKAGE_BATCH_SIZE = 700 + CHECKSUM_EXTS = ( ".sha256", ".sha512", @@ -107,97 +133,11 @@ # /*/apache-log4j-*-site.zip -class ApacheCollector: - """ - Download and process the find-ls file. - """ - - def __init__( - self, - find_ls_url=None, - project_json=None, - logger=None, - ): - self.downloads = [] - - if not find_ls_url: - find_ls_url = FIND_LS_URL - - if not project_json: - project_json = PROJECT_JSON - - find_ls_download = self._fetch_http(find_ls_url) - project_json_download = self._fetch_http(project_json) - self.find_ls_location = find_ls_download.path - self.project_json_location = project_json_download.path - - def __del__(self): - if self.downloads: - for download in self.downloads: - rmtree(download.directory) - - def _fetch_http(self, uri): - from scanpipe.pipes.fetch import fetch_http - - fetched = fetch_http(uri) - self.downloads.append(fetched) - return fetched - - def get_packages(self): - """Yield Package objects from the find_ls list""" - txt_path = extract_archives(archive_path=self.find_ls_location) - packages_data, packages_checksum = get_archives_and_checksum(txt_path) - updated_packages_list = update_package_data( - packages_data, packages_checksum, project_json_location=self.project_json_location - ) - - current_base = None - current_purls = [] - - for package in updated_packages_list: - """ - repository_homepage_url = package.get("repository_homepage_url", "") - repository_download_url = package.get("repository_download_url", "") - download_url = package.get("download_url", "") - size = package.get("size", "") - release_date = package.get("date", "") - """ - namespace, name, version, qualifiers = determine_purl_elements(package) - - purl = PackageURL( - type="sid", - namespace=namespace, - name=name, - version=version, - qualifiers=qualifiers, 
- ).to_string() - - base_purl = PackageURL( - type="sid", - namespace=namespace, - name=name, - ).to_string() - - if current_base is None: - current_base = base_purl - current_purls.append(purl) - elif base_purl == current_base: - current_purls.append(purl) - else: - yield current_base, current_purls, [] - current_base = base_purl - current_purls = [purl] - - if current_base is not None: - yield current_base, current_purls, [] - - -def determine_purl_elements(package): +def determine_purl_elements(path): """ Determine and return the namespace, name, version and qualifier based on the path info """ - path = package.get("filepath").lstrip("./") parsed_result = parse_apache_path_common(path) if parsed_result: namespace = BASE_NAMESPACE + parsed_result.get("namespace") @@ -255,7 +195,7 @@ def get_archives_and_checksum(txt_path): return packages_data, packages_checksum -def update_package_data(packages_data, packages_checksum, project_json_location): +def update_package_data(packages_data, packages_checksum): """ Update package metadata with: - Project information from @@ -266,7 +206,8 @@ def update_package_data(packages_data, packages_checksum, project_json_location) """ updated_package_data = [] data = "" - with open(project_json_location, encoding="utf-8") as f: + project_json_download = fetch_http(PROJECT_JSON) + with open(project_json_download.path, encoding="utf-8") as f: data = json.load(f) for package in packages_data: @@ -290,6 +231,8 @@ def update_package_data(packages_data, packages_checksum, project_json_location) "homepage": "repository_homepage_url", "download-page": "repository_download_url", "description": "description", + "mailing-list": "mailing_list", + "programming-language": "programming_language", }.items(): value = package_metadata.get(key) if value: @@ -427,3 +370,238 @@ def parse_apache_path_complex(path): namespace = "/".join(namespace_segments) return {"namespace": namespace, "name": name, "version": version, "file_name": file_name} + + +def 
mine_apache_packages(logger=None): + """ + Mine apache packages names from "https://archive.apache.org/dist/zzz/find-ls2.txt.gz" + + Apache.org does not provide an index file, so we have no way + to check the index and determine which packages are new and + need to be synced, unlike npm. + + We will use the timestamp to log when the packages were mined. + """ + + config_repo = clone_repository( + repo_url=MINECODE_PIPELINES_CONFIG_REPO, + clone_path=get_temp_dir(), + logger=logger, + ) + + packages, packages_metadata = get_find_ls_archive_paths_and_metadata(logger=logger) + packages_file = write_packages_json( + packages=packages, + name=PACKAGE_FILE_NAME, + ) + compressed_packages_file = packages_file + ".gz" + compress_packages_file( + packages_file=packages_file, + compressed_packages_file=compressed_packages_file, + ) + update_checkpoints_file_in_github( + checkpoints_file=compressed_packages_file, + cloned_repo=config_repo, + path=COMPRESSED_APACHE_PACKAGES_PATH, + ) + + update_apache_checkpoints( + cloned_repo=config_repo, + checkpoint_path=APACHE_CHECKPOINT_PATH, + logger=logger, + ) + + return packages_file, packages_metadata, config_repo + + +def get_find_ls_archive_paths_and_metadata(logger=None): + find_ls_download = fetch_http(FIND_LS_URL) + txt_path = extract_archives(find_ls_download.path) + packages_data, packages_checksum = get_archives_and_checksum(txt_path) + updated_packages_list = update_package_data(packages_data, packages_checksum) + all_package_paths = [] + for package in packages_data: + all_package_paths.append(package.get("filepath")) + if logger: + logger(f"Collected: {len(all_package_paths)} package archive files.") + + return {"packages": all_package_paths}, updated_packages_list + + +def update_apache_checkpoints( + cloned_repo, + checkpoint_path, + state=None, + config_repo=MINECODE_PIPELINES_CONFIG_REPO, + logger=None, +): + checkpoint = fetch_checkpoint_from_github( + config_repo=config_repo, + checkpoint_path=checkpoint_path, + ) + 
if state: + checkpoint["state"] = state + + checkpoint["date"] = str(datetime.now()) + update_checkpoints_in_github( + checkpoint=checkpoint, + cloned_repo=cloned_repo, + path=checkpoint_path, + logger=logger, + ) + + +def get_apache_packages_to_sync(packages_file, logger=None): + packages = load_apache_packages(packages_file) + if logger: + logger(f"# of package archives found from apache.org: {len(packages)}") + + if not packages: + return + + synced_packages = get_mined_packages_from_checkpoint( + config_repo=MINECODE_PIPELINES_CONFIG_REPO, + checkpoint_path=APACHE_PACKAGES_CHECKPOINT_PATH, + ) + packages_to_sync = list(set(packages).difference(set(synced_packages))) + if logger: + logger( + f"Starting initial package mining for {len(packages_to_sync)} packages archives from checkpoint" + ) + + return packages_to_sync, synced_packages + + +def load_apache_packages(packages_file): + with open(packages_file) as f: + packages_data = json.load(f) + + return packages_data.get("packages", []) + + +def mine_and_publish_apache_packageurls( + packages_to_sync, packages_mined, packages_metadata, logger=None +): + if logger: + logger("Starting package mining for a batch of packages") + + handled_base = None + for i, package_path in enumerate(packages_to_sync): + current_base = None + current_purls = [] + purls_and_package_data = [] + + if i > 10: + break + + if not package_path: + continue + + # fetch packageURLs for package + if logger: + logger(f"getting packageURLs for package: {package_path}") + + packages_mined.append(package_path) + + package_path = package_path.lstrip("./") + namespace, name, _version, _qualifiers = determine_purl_elements(package_path) + current_base = PackageURL( + type=SID_TYPE, + namespace=namespace, + name=name, + ).to_string() + + if handled_base and handled_base == current_base: + continue + else: + handled_base = current_base + + for package in packages_metadata: + path = package.get("filepath").lstrip("./") + package_namespace, 
package_name, package_version, package_qualifiers = ( + determine_purl_elements(path) + ) + + base_purl = PackageURL( + type=SID_TYPE, + namespace=package_namespace, + name=package_name, + ).to_string() + + if current_base == base_purl: + purl = PackageURL( + type=SID_TYPE, + namespace=package_namespace, + name=package_name, + version=package_version, + qualifiers=package_qualifiers, + ).to_string() + + if purl not in current_purls: + package_metadata = {} + package_metadata["name"] = package_name + package_metadata["version"] = package_version + package_metadata["repository_homepage_url"] = package.get( + "repository_homepage_url", "" + ) + package_metadata["repository_download_url"] = package.get( + "repository_download_url", "" + ) + package_metadata["description"] = package.get("description", "") + package_metadata["download_url"] = package.get("download_url", "") + package_metadata["size"] = package.get("size", "") + package_metadata["release_date"] = package.get("date", "") + package_metadata["mailing_list"] = package.get("mailing_list", "") + package_metadata["programming_language"] = package.get( + "programming_language", "" + ) + + package_data = (purl, package_metadata) + + current_purls.append(purl) + purls_and_package_data.append(package_data) + + else: + if current_purls: + yield current_base, current_purls, purls_and_package_data + # Reset + current_base = None + current_purls = [] + purls_and_package_data = [] + # packages_metadata should be ordered so that we can + # break the loop once all relevant entries have been + # found. 
+ break + + if current_base is not None: + yield current_base, current_purls, purls_and_package_data + + +def update_mined_checkpoints(config_repo, logger=None): + # Refresh mined packages checkpoint + update_checkpoints_in_github( + checkpoint={"packages_mined": []}, + cloned_repo=config_repo, + path=APACHE_PACKAGES_CHECKPOINT_PATH, + logger=logger, + ) + + if logger: + logger(f"Deleting local clone at: {config_repo.working_dir}") + delete_local_clone(config_repo) + + +def save_mined_packages_in_checkpoint(packages_mined, synced_packages, config_repo, logger=None): + # Update mined packages checkpoint for every batch + # so we can continue mining the other packages after restarting + if logger: + logger(f"Checkpointing processed packages to: {APACHE_PACKAGES_CHECKPOINT_PATH}") + + packages_checkpoint = packages_mined + synced_packages + update_mined_packages_in_checkpoint( + packages=packages_checkpoint, + config_repo=MINECODE_PIPELINES_CONFIG_REPO, + cloned_repo=config_repo, + checkpoint_path=APACHE_PACKAGES_CHECKPOINT_PATH, + logger=logger, + ) From c5022dc6e13b8d130adb44b457dd8d84f6917b68 Mon Sep 17 00:00:00 2001 From: Chin Yeung Li Date: Thu, 2 Jul 2026 07:14:25 +0800 Subject: [PATCH 14/15] Restore previously commented out code #631 Signed-off-by: Chin Yeung Li --- minecode_pipelines/pipelines/mine_apache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/minecode_pipelines/pipelines/mine_apache.py b/minecode_pipelines/pipelines/mine_apache.py index 1bfba62f..1a66be17 100644 --- a/minecode_pipelines/pipelines/mine_apache.py +++ b/minecode_pipelines/pipelines/mine_apache.py @@ -41,7 +41,7 @@ def steps(cls): cls.fetch_federation_config, cls.mine_and_publish_packageurls, cls.update_mined_checkpoints, - # cls.delete_working_dir, + cls.delete_working_dir, ) def mine_apache_packages(self): From 452c2a8f4d3a725fcc19af775b2bc42104123a96 Mon Sep 17 00:00:00 2001 From: Chin Yeung Li Date: Thu, 2 Jul 2026 11:22:55 +0800 Subject: [PATCH 15/15] Simplied 
the code #631 * Only use timestamp to determine what packages need to be indexed Signed-off-by: Chin Yeung Li --- minecode_pipelines/pipelines/mine_apache.py | 33 +---- minecode_pipelines/pipes/apache.py | 152 ++++++++------------ 2 files changed, 65 insertions(+), 120 deletions(-) diff --git a/minecode_pipelines/pipelines/mine_apache.py b/minecode_pipelines/pipelines/mine_apache.py index 1a66be17..8235bc7a 100644 --- a/minecode_pipelines/pipelines/mine_apache.py +++ b/minecode_pipelines/pipelines/mine_apache.py @@ -40,48 +40,36 @@ def steps(cls): cls.get_apache_packages_to_sync, cls.fetch_federation_config, cls.mine_and_publish_packageurls, - cls.update_mined_checkpoints, cls.delete_working_dir, ) def mine_apache_packages(self): - """Mine apache package archive path from the find_ls file or checkpoint.""" - (self.apache_packages, self.apache_packages_metadata, self.config_repo) = ( - apache.mine_apache_packages(logger=self.log) + """Mine apache package archive path from the find_ls file.""" + (self.apache_packages_metadata, self.last_mined_date) = apache.mine_apache_packages( + logger=self.log ) def get_apache_packages_to_sync(self): """Get apache packages which needs to be synced using checkpoint.""" - self.packages, self.synced_packages = apache.get_apache_packages_to_sync( - packages_file=self.apache_packages, + self.packages = apache.get_apache_packages_to_sync( + packages_metadata=self.apache_packages_metadata, + last_mined_date=self.last_mined_date, logger=self.log, ) def packages_count(self): - return len(self.packages) + return len(list(self.mine_packageurls())) def mine_packageurls(self): """Yield npm packageURLs for all mined npm package names.""" - self.packages_mined = [] yield from apache.mine_and_publish_apache_packageurls( packages_to_sync=self.packages, - packages_mined=self.packages_mined, packages_metadata=self.apache_packages_metadata, logger=self.log, ) - def save_check_point(self): - apache.save_mined_packages_in_checkpoint( - 
packages_mined=self.packages_mined, - synced_packages=self.synced_packages, - config_repo=self.config_repo, - logger=self.log, - ) - self.packages_mined = [] - def mine_and_publish_packageurls(self): """Mine and publish PackageURLs.""" - _mine_and_publish_packageurls( packageurls=self.mine_packageurls(), total_package_count=self.packages_count(), @@ -91,13 +79,6 @@ def mine_and_publish_packageurls(self): append_purls=self.append_purls, commit_msg_func=self.commit_message, logger=self.log, - checkpoint_func=self.save_check_point, checkpoint_on_commit=True, batch_size=self.package_batch_size, ) - - def update_mined_checkpoints(self): - apache.update_mined_checkpoints( - config_repo=self.config_repo, - logger=self.log, - ) diff --git a/minecode_pipelines/pipes/apache.py b/minecode_pipelines/pipes/apache.py index a39f8bfc..33087b69 100644 --- a/minecode_pipelines/pipes/apache.py +++ b/minecode_pipelines/pipes/apache.py @@ -7,16 +7,11 @@ # See https://aboutcode.org for more information about nexB OSS projects. 
# -from datetime import datetime +from datetime import datetime, timezone from minecode_pipelines.pipes import fetch_checkpoint_from_github from minecode_pipelines.pipes import update_checkpoints_in_github -from minecode_pipelines.pipes import update_checkpoints_file_in_github -from minecode_pipelines.pipes import get_mined_packages_from_checkpoint -from minecode_pipelines.pipes import update_mined_packages_in_checkpoint from minecode_pipelines.pipes import MINECODE_PIPELINES_CONFIG_REPO -from minecode_pipelines.pipes import write_packages_json -from minecode_pipelines.pipes import compress_packages_file from minecode_pipelines.utils import get_temp_dir @@ -53,7 +48,6 @@ COMPRESSED_APACHE_PACKAGES_PATH = "apache/" + COMPRESSED_PACKAGE_FILE_NAME APACHE_CHECKPOINT_PATH = "apache/checkpoints.json" APACHE_PACKAGES_CHECKPOINT_PATH = "apache/packages_checkpoint.json" -PACKAGE_BATCH_SIZE = 700 CHECKSUM_EXTS = ( ".sha256", @@ -389,49 +383,37 @@ def mine_apache_packages(logger=None): logger=logger, ) - packages, packages_metadata = get_find_ls_archive_paths_and_metadata(logger=logger) - packages_file = write_packages_json( - packages=packages, - name=PACKAGE_FILE_NAME, - ) - compressed_packages_file = packages_file + ".gz" - compress_packages_file( - packages_file=packages_file, - compressed_packages_file=compressed_packages_file, - ) - update_checkpoints_file_in_github( - checkpoints_file=compressed_packages_file, - cloned_repo=config_repo, - path=COMPRESSED_APACHE_PACKAGES_PATH, - ) + packages_metadata = get_find_ls_archive_paths_and_metadata(logger=logger) - update_apache_checkpoints( + last_mined_date = get_and_update_apache_checkpoints( cloned_repo=config_repo, checkpoint_path=APACHE_CHECKPOINT_PATH, logger=logger, ) - return packages_file, packages_metadata, config_repo + delete_local_clone(config_repo) + + return packages_metadata, last_mined_date def get_find_ls_archive_paths_and_metadata(logger=None): + """ + Get the archive paths and metadata from the find-ls 
file. + """ find_ls_download = fetch_http(FIND_LS_URL) txt_path = extract_archives(find_ls_download.path) packages_data, packages_checksum = get_archives_and_checksum(txt_path) - updated_packages_list = update_package_data(packages_data, packages_checksum) - all_package_paths = [] - for package in packages_data: - all_package_paths.append(package.get("filepath")) + packages_metadata = update_package_data(packages_data, packages_checksum) + if logger: - logger(f"Collected: {len(all_package_paths)} package archive files.") + logger(f"Collected: {len(packages_metadata)} package archive files.") - return {"packages": all_package_paths}, updated_packages_list + return packages_metadata -def update_apache_checkpoints( +def get_and_update_apache_checkpoints( cloned_repo, checkpoint_path, - state=None, config_repo=MINECODE_PIPELINES_CONFIG_REPO, logger=None, ): @@ -439,10 +421,14 @@ def update_apache_checkpoints( config_repo=config_repo, checkpoint_path=checkpoint_path, ) - if state: - checkpoint["state"] = state - checkpoint["date"] = str(datetime.now()) + last_mined_date = checkpoint.get("date", "") + if logger: + logger(f"Last mined date from checkpoint: {last_mined_date}") + + now = datetime.now(timezone.utc) + formatted_now = now.strftime("%Y-%m-%d %H:%M UTC") + checkpoint["date"] = formatted_now update_checkpoints_in_github( checkpoint=checkpoint, cloned_repo=cloned_repo, @@ -450,50 +436,60 @@ def update_apache_checkpoints( logger=logger, ) + return last_mined_date -def get_apache_packages_to_sync(packages_file, logger=None): - packages = load_apache_packages(packages_file) - if logger: - logger(f"# of package archives found from apache.org: {len(packages)}") - if not packages: - return +def get_apache_packages_to_sync(packages_metadata, last_mined_date, logger=None): + """ + Get the list of Apache packages that need to be synced based on the + timestamp. 
+ + Was thinking to record all mined archives, but even when processing + only 10 packages it produced about 62k archive paths (all versions + included) totaling 4.2 MB. Scaling this to all ~10,000 Apache packages + would make the checkpoint file far too large. Instead, we will log only + the timestamp indicating when the packages were mined. + """ - synced_packages = get_mined_packages_from_checkpoint( - config_repo=MINECODE_PIPELINES_CONFIG_REPO, - checkpoint_path=APACHE_PACKAGES_CHECKPOINT_PATH, - ) - packages_to_sync = list(set(packages).difference(set(synced_packages))) if logger: - logger( - f"Starting initial package mining for {len(packages_to_sync)} packages archives from checkpoint" - ) - - return packages_to_sync, synced_packages + logger(f"# of package archives found from apache.org: {len(packages_metadata)}") + if not packages_metadata: + return -def load_apache_packages(packages_file): - with open(packages_file) as f: - packages_data = json.load(f) + packages_to_sync = [] + for package in packages_metadata: + path = package.get("filepath") + release_date = package.get("date", "") + if not last_mined_date: + packages_to_sync.append(path) + else: + if release_date: + fmt = "%Y-%m-%d %H:%M UTC" + release_date_format = datetime.strptime(release_date, fmt).replace( + tzinfo=timezone.utc + ) + last_mined_date_format = datetime.strptime(last_mined_date, fmt).replace( + tzinfo=timezone.utc + ) + if release_date_format > last_mined_date_format: + packages_to_sync.append(path) + if logger: + logger(f"Starting initial package mining for {len(packages_to_sync)} packages archives.") - return packages_data.get("packages", []) + return packages_to_sync -def mine_and_publish_apache_packageurls( - packages_to_sync, packages_mined, packages_metadata, logger=None -): +def mine_and_publish_apache_packageurls(packages_to_sync, packages_metadata, logger=None): if logger: logger("Starting package mining for a batch of packages") handled_base = None - for i, package_path in 
enumerate(packages_to_sync): + for package_path in packages_to_sync: current_base = None current_purls = [] purls_and_package_data = [] - if i > 10: - break - if not package_path: continue @@ -501,8 +497,6 @@ def mine_and_publish_apache_packageurls( if logger: logger(f"getting packageURLs for package: {package_path}") - packages_mined.append(package_path) - package_path = package_path.lstrip("./") namespace, name, _version, _qualifiers = determine_purl_elements(package_path) current_base = PackageURL( @@ -575,33 +569,3 @@ def mine_and_publish_apache_packageurls( if current_base is not None: yield current_base, current_purls, purls_and_package_data - - -def update_mined_checkpoints(config_repo, logger=None): - # Refresh mined packages checkpoint - update_checkpoints_in_github( - checkpoint={"packages_mined": []}, - cloned_repo=config_repo, - path=APACHE_PACKAGES_CHECKPOINT_PATH, - logger=logger, - ) - - if logger: - logger(f"Deleting local clone at: {config_repo.working_dir}") - delete_local_clone(config_repo) - - -def save_mined_packages_in_checkpoint(packages_mined, synced_packages, config_repo, logger=None): - # Update mined packages checkpoint for every batch - # so we can continue mining the other packages after restarting - if logger: - logger(f"Checkpointing processed packages to: {APACHE_PACKAGES_CHECKPOINT_PATH}") - - packages_checkpoint = packages_mined + synced_packages - update_mined_packages_in_checkpoint( - packages=packages_checkpoint, - config_repo=MINECODE_PIPELINES_CONFIG_REPO, - cloned_repo=config_repo, - checkpoint_path=APACHE_PACKAGES_CHECKPOINT_PATH, - logger=logger, - )