Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 59 additions & 10 deletions cmd/wppackages/cmd/discover.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ import (
"github.com/roots/wp-packages/internal/wporg"
)

// svnLogChunkSize is the maximum number of revisions requested in a single
// changelog REPORT. Scanning in chunks of this size lets the stored watermark
// move forward in steps that can each be verified, and keeps a long catch-up
// (for example after downtime) below the server's per-response log-item limit.
const svnLogChunkSize = 10_000

var discoverCmd = &cobra.Command{
Use: "discover",
Short: "Discover packages from WordPress.org",
Expand Down Expand Up @@ -239,12 +244,44 @@ func markChangedFromSVNLog(ctx context.Context, client *wporg.Client, src struct
}
}

if lastRev > 0 && lastRev < currentRev {
// First run: no baseline to diff against, so just record HEAD as the
// starting point for the next run.
if lastRev == 0 {
application.Logger.Info("no previous SVN revision stored, skipping changelog (first run)",
"type", src.pkgType, "current_rev", currentRev)
if err := packages.SetMeta(ctx, application.DB, src.metaKey, strconv.FormatInt(currentRev, 10)); err != nil {
return fmt.Errorf("storing revision: %w", err)
}
return nil
}

// Scan forward from the watermark in bounded chunks. The watermark advances
// only to the highest revision the changelog REPORT actually returned —
// never to currentRev, which is read from a different endpoint (the HTML
// listing) that can be ahead of the REPORT replica. Advancing past
// revisions the REPORT never returned would skip those commits permanently
// and strand the affected packages at stale versions. Chunking also keeps
// each response well under the server's log-item limit so a large catch-up
// after downtime can't silently truncate.
cursor := lastRev
for cursor < currentRev {
chunkEnd := cursor + svnLogChunkSize
if chunkEnd > currentRev {
chunkEnd = currentRev
}

application.Logger.Info("fetching SVN changelog",
"type", src.pkgType, "from_rev", lastRev, "to_rev", currentRev)
"type", src.pkgType, "from_rev", cursor+1, "to_rev", chunkEnd)

slugRevisions, err := client.FetchSVNChangedSlugs(ctx, src.url, lastRev+1, currentRev)
slugRevisions, maxRev, err := client.FetchSVNChangedSlugs(ctx, src.url, cursor+1, chunkEnd)
if err != nil {
// Persist whatever we fully scanned before bailing so the next run
// resumes from there instead of redoing work.
if cursor > lastRev {
if setErr := packages.SetMeta(ctx, application.DB, src.metaKey, strconv.FormatInt(cursor, 10)); setErr != nil {
return fmt.Errorf("storing revision: %w", setErr)
}
}
return err
}

Expand All @@ -254,15 +291,27 @@ func markChangedFromSVNLog(ctx context.Context, client *wporg.Client, src struct
return fmt.Errorf("marking changed packages: %w", err)
}
application.Logger.Info("marked changed packages from SVN log",
"type", src.pkgType, "slugs_in_log", len(slugRevisions), "packages_marked", affected)
"type", src.pkgType, "from_rev", cursor+1, "to_rev", chunkEnd,
"slugs_in_log", len(slugRevisions), "packages_marked", affected)
}

// Every revision in the repo is a commit, so the REPORT yields a
// log-item per revision in range; maxRev is therefore the highest
// revision the REPORT replica actually holds within this chunk.
if maxRev <= cursor {
// REPORT endpoint is behind our cursor (replica lag) — nothing new
// available. Stop and retry from the same point next run.
break
}
cursor = maxRev
if maxRev < chunkEnd {
// Replica lagged within this chunk; rescan the remainder next run
// rather than skipping it.
break
}
} else if lastRev == 0 {
application.Logger.Info("no previous SVN revision stored, skipping changelog (first run)",
"type", src.pkgType, "current_rev", currentRev)
}

// Store current revision for next run.
if err := packages.SetMeta(ctx, application.DB, src.metaKey, strconv.FormatInt(currentRev, 10)); err != nil {
if err := packages.SetMeta(ctx, application.DB, src.metaKey, strconv.FormatInt(cursor, 10)); err != nil {
return fmt.Errorf("storing revision: %w", err)
}

Expand Down Expand Up @@ -332,7 +381,7 @@ func runBackfillRevisions(cmd *cobra.Command, args []string) error {
var slugRevisions map[string]int64
var fetchErr error
for attempt := 1; attempt <= 3; attempt++ {
slugRevisions, fetchErr = client.FetchSVNChangedSlugs(ctx, baseURL, toRev, fromRev)
slugRevisions, _, fetchErr = client.FetchSVNChangedSlugs(ctx, baseURL, toRev, fromRev)
if fetchErr == nil {
break
}
Expand Down
32 changes: 22 additions & 10 deletions internal/wporg/svn.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,12 @@ type svnLogItem struct {

// FetchSVNChangedSlugs queries the SVN DAV log between two revisions and returns
// a map of unique top-level slugs (plugin/theme names) to the highest SVN revision
// that touched them within the queried range.
func (c *Client) FetchSVNChangedSlugs(ctx context.Context, baseURL string, fromRev, toRev int64) (map[string]int64, error) {
// that touched them within the queried range, plus the highest revision present
// in the response. The caller must advance its watermark to that returned
// revision (not to an externally-assumed HEAD), because the REPORT endpoint can
// lag behind other endpoints; advancing past revisions the REPORT never returned
// would skip those commits permanently.
func (c *Client) FetchSVNChangedSlugs(ctx context.Context, baseURL string, fromRev, toRev int64) (map[string]int64, int64, error) {
body := fmt.Sprintf(`<?xml version="1.0" encoding="utf-8"?>`+
`<S:log-report xmlns:S="svn:" xmlns:D="DAV:">`+
`<S:start-revision>%d</S:start-revision>`+
Expand All @@ -151,7 +155,7 @@ func (c *Client) FetchSVNChangedSlugs(ctx context.Context, baseURL string, fromR
reqURL := strings.TrimSuffix(baseURL, "/") + "/!svn/bc/0/"
req, err := http.NewRequestWithContext(ctx, "REPORT", reqURL, strings.NewReader(body))
if err != nil {
return nil, fmt.Errorf("creating SVN log request: %w", err)
return nil, 0, fmt.Errorf("creating SVN log request: %w", err)
}
req.Header.Set("Content-Type", "text/xml")
req.Header.Set("User-Agent", UserAgent)
Expand All @@ -161,18 +165,18 @@ func (c *Client) FetchSVNChangedSlugs(ctx context.Context, baseURL string, fromR
davClient := &http.Client{Timeout: 600 * time.Second}
resp, err := davClient.Do(req)
if err != nil {
return nil, fmt.Errorf("fetching SVN log: %w", err)
return nil, 0, fmt.Errorf("fetching SVN log: %w", err)
}
defer func() { _ = resp.Body.Close() }()

if resp.StatusCode != http.StatusOK {
respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 512))
return nil, fmt.Errorf("SVN log returned status %d: %s", resp.StatusCode, string(respBody))
return nil, 0, fmt.Errorf("SVN log returned status %d: %s", resp.StatusCode, string(respBody))
}

data, err := io.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("reading SVN log response: %w", err)
return nil, 0, fmt.Errorf("reading SVN log response: %w", err)
}

return parseSVNLogSlugs(data)
Expand All @@ -190,17 +194,25 @@ func sanitizeXML(data []byte) []byte {
}

// parseSVNLogSlugs extracts unique top-level slugs from SVN log XML and maps
// each slug to the highest revision that touched it.
// each slug to the highest revision that touched it. It also returns the highest
// revision present in the response (across all log-items, regardless of whether a
// path yielded a slug) so callers can advance their watermark to exactly what the
// REPORT covered.
// Paths look like "/plugin-name/trunk/file.php" — we extract "plugin-name".
func parseSVNLogSlugs(data []byte) (map[string]int64, error) {
func parseSVNLogSlugs(data []byte) (map[string]int64, int64, error) {
data = sanitizeXML(data)
var report svnLogReport
if err := xml.Unmarshal(data, &report); err != nil {
return nil, fmt.Errorf("parsing SVN log XML: %w", err)
return nil, 0, fmt.Errorf("parsing SVN log XML: %w", err)
}

slugRevisions := make(map[string]int64)
var maxRev int64
for _, item := range report.Items {
if item.Revision > maxRev {
maxRev = item.Revision
}

allPaths := make([]string, 0, len(item.AddedPaths)+len(item.ModifiedPaths)+len(item.DeletedPaths))
allPaths = append(allPaths, item.AddedPaths...)
allPaths = append(allPaths, item.ModifiedPaths...)
Expand All @@ -215,7 +227,7 @@ func parseSVNLogSlugs(data []byte) (map[string]int64, error) {
}
}

return slugRevisions, nil
return slugRevisions, maxRev, nil
}

// slugFromPath extracts the top-level directory (slug) from an SVN path.
Expand Down
58 changes: 57 additions & 1 deletion internal/wporg/svn_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,11 +118,15 @@ func TestParseSVNLogSlugs(t *testing.T) {
</S:log-item>
</S:log-report>`

slugRevisions, err := parseSVNLogSlugs([]byte(xml))
slugRevisions, maxRev, err := parseSVNLogSlugs([]byte(xml))
if err != nil {
t.Fatalf("unexpected error: %v", err)
}

if maxRev != 101 {
t.Errorf("maxRev = %d, want 101 (highest revision in the response)", maxRev)
}

if len(slugRevisions) != 2 {
t.Fatalf("expected 2 unique slugs, got %d: %v", len(slugRevisions), slugRevisions)
}
Expand All @@ -138,6 +142,58 @@ func TestParseSVNLogSlugs(t *testing.T) {
}
}

// TestParseSVNLogSlugsMaxRevReflectsResponse protects the watermark-advance
// behavior: the max revision reported by parseSVNLogSlugs has to come from the
// log-items actually present in the REPORT body, never from whatever upper
// bound was requested. A caller that advances its watermark only as far as
// this value will rescan any gap left by a short response instead of skipping it.
func TestParseSVNLogSlugsMaxRevReflectsResponse(t *testing.T) {
	const slug = "colissimo-shipping-methods-for-woocommerce"

	// Fixture models a response that ends at revision 205 even though a
	// higher revision may have been requested.
	const response = `<?xml version="1.0" encoding="utf-8"?>
<S:log-report xmlns:S="svn:" xmlns:D="DAV:">
<S:log-item>
<D:version-name>204</D:version-name>
<S:date>2026-05-29T10:00:00.000000Z</S:date>
<S:modified-path node-kind="file">/colissimo-shipping-methods-for-woocommerce/trunk/readme.txt</S:modified-path>
</S:log-item>
<S:log-item>
<D:version-name>205</D:version-name>
<S:date>2026-05-29T10:05:00.000000Z</S:date>
<S:added-path node-kind="dir">/colissimo-shipping-methods-for-woocommerce/tags/2.10.0</S:added-path>
</S:log-item>
</S:log-report>`

	bySlug, highest, err := parseSVNLogSlugs([]byte(response))
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	// The highest revision must be taken from the response itself.
	if highest != 205 {
		t.Errorf("maxRev = %d, want 205 (highest revision in response)", highest)
	}

	// Both log-items touch the same slug; the later revision wins.
	if got := bySlug[slug]; got != 205 {
		t.Errorf("slug revision = %d, want 205", got)
	}
}

// TestParseSVNLogSlugsEmpty checks that a well-formed log-report with no
// log-items parses without error and produces a max revision of 0 and an empty
// slug map, so a caller sees "nothing returned" rather than a revision it
// could mistakenly advance its watermark to.
func TestParseSVNLogSlugsEmpty(t *testing.T) {
	const response = `<?xml version="1.0" encoding="utf-8"?>
<S:log-report xmlns:S="svn:" xmlns:D="DAV:">
</S:log-report>`

	bySlug, highest, err := parseSVNLogSlugs([]byte(response))
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	// No log-items means no revision was observed at all.
	if highest != 0 {
		t.Errorf("maxRev = %d, want 0 for empty response", highest)
	}
	if n := len(bySlug); n != 0 {
		t.Errorf("expected no slugs, got %d", n)
	}
}

func TestSlugFromPath(t *testing.T) {
tests := []struct {
path string
Expand Down
Loading