diff --git a/README.md b/README.md index 73d3014..85a7254 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,22 @@ reader, _ := archives.OpenWithPrefix("pkg.tgz", f, "package/") // files are now accessible without the package/ prefix ``` +### Comparing versions + +The `diff` subpackage compares two archives and produces unified diffs. It classifies each file as added, deleted, or modified, flags binary files separately, and includes line-level diff output for text files. + +```go +import "github.com/git-pkgs/archives/diff" + +result, _ := diff.Compare(oldReader, newReader) +for _, f := range result.Files { + fmt.Printf("%s %s (+%d -%d)\n", f.Type, f.Path, f.LinesAdded, f.LinesDeleted) + if f.Diff != "" { + fmt.Println(f.Diff) + } +} +``` + ## Supported formats - `.zip`, `.jar`, `.whl`, `.nupkg`, `.egg` (ZIP-based) diff --git a/diff/diff.go b/diff/diff.go new file mode 100644 index 0000000..83df0d7 --- /dev/null +++ b/diff/diff.go @@ -0,0 +1,310 @@ +// Package diff provides utilities for comparing package versions. +package diff + +import ( + "bufio" + "bytes" + "fmt" + "io" + "sort" + "strings" + + "github.com/git-pkgs/archives" +) + +// FileDiff represents the diff for a single file. +type FileDiff struct { + Path string `json:"path"` + Type string `json:"type"` // "modified", "added", "deleted", "renamed" + OldPath string `json:"old_path,omitempty"` + Diff string `json:"diff,omitempty"` + IsBinary bool `json:"is_binary,omitempty"` + LinesAdded int `json:"lines_added"` + LinesDeleted int `json:"lines_deleted"` +} + +// CompareResult contains the complete comparison between two versions. +type CompareResult struct { + Files []FileDiff `json:"files"` + TotalAdded int `json:"total_added"` + TotalDeleted int `json:"total_deleted"` + FilesChanged int `json:"files_changed"` + FilesAdded int `json:"files_added"` + FilesDeleted int `json:"files_deleted"` +} + +// Compare generates a diff between two archive readers. 
+func Compare(oldReader, newReader archives.Reader) (*CompareResult, error) { + // Get file listings + oldFiles, err := oldReader.List() + if err != nil { + return nil, fmt.Errorf("listing old archive: %w", err) + } + + newFiles, err := newReader.List() + if err != nil { + return nil, fmt.Errorf("listing new archive: %w", err) + } + + // Create maps for quick lookup + oldMap := make(map[string]archives.FileInfo) + newMap := make(map[string]archives.FileInfo) + + for _, f := range oldFiles { + if !f.IsDir { + oldMap[f.Path] = f + } + } + + for _, f := range newFiles { + if !f.IsDir { + newMap[f.Path] = f + } + } + + result := &CompareResult{ + Files: []FileDiff{}, + } + + // Find all unique paths + allPaths := make(map[string]bool) + for path := range oldMap { + allPaths[path] = true + } + for path := range newMap { + allPaths[path] = true + } + + // Convert to sorted slice + paths := make([]string, 0, len(allPaths)) + for path := range allPaths { + paths = append(paths, path) + } + sort.Strings(paths) + + // Compare each file + for _, path := range paths { + oldExists := oldMap[path] + newExists := newMap[path] + + var fileDiff FileDiff + + if oldExists.Path != "" && newExists.Path == "" { + // File was deleted + fileDiff = FileDiff{ + Path: path, + Type: "deleted", + } + result.FilesDeleted++ + } else if oldExists.Path == "" && newExists.Path != "" { + // File was added + fileDiff = FileDiff{ + Path: path, + Type: "added", + } + result.FilesAdded++ + + // Try to get content for added files + if content, err := readFileContent(newReader, path); err == nil { + if isBinary(content) { + fileDiff.IsBinary = true + } else { + fileDiff.Diff = generateAddedDiff(path, content) + fileDiff.LinesAdded = countLines(content) + } + } + } else { + // File exists in both - check if modified + oldContent, err1 := readFileContent(oldReader, path) + newContent, err2 := readFileContent(newReader, path) + + if err1 != nil || err2 != nil { + continue // Skip files we can't read + } + + 
if bytes.Equal(oldContent, newContent) { + continue // No change + } + + fileDiff = FileDiff{ + Path: path, + Type: "modified", + } + result.FilesChanged++ + + if isBinary(oldContent) || isBinary(newContent) { + fileDiff.IsBinary = true + } else { + diffText, added, deleted := generateUnifiedDiff(path, oldContent, newContent) + fileDiff.Diff = diffText + fileDiff.LinesAdded = added + fileDiff.LinesDeleted = deleted + result.TotalAdded += added + result.TotalDeleted += deleted + } + } + + result.Files = append(result.Files, fileDiff) + } + + return result, nil +} + +// readFileContent reads a file's content from an archive reader. +func readFileContent(reader archives.Reader, path string) ([]byte, error) { + rc, err := reader.Extract(path) + if err != nil { + return nil, err + } + defer func() { _ = rc.Close() }() + + return io.ReadAll(rc) +} + +// isBinary checks if content appears to be binary. +func isBinary(content []byte) bool { + if len(content) == 0 { + return false + } + + // Check first 8KB for null bytes + checkLen := len(content) + if checkLen > 8192 { + checkLen = 8192 + } + + for i := 0; i < checkLen; i++ { + if content[i] == 0 { + return true + } + } + + return false +} + +// generateUnifiedDiff generates a unified diff between two file contents. +// Uses line-based diffing for proper unified diff output. +func generateUnifiedDiff(path string, oldContent, newContent []byte) (string, int, int) { + return generateSimpleDiff(path, oldContent, newContent) +} + +// generateSimpleDiff generates a line-based unified diff. 
// generateSimpleDiff emits at most one hunk: it strips the lines common to the
// start and to the end of both inputs and reports everything in between as
// deleted (old) followed by added (new), surrounded by up to three lines of
// context. Separate changes therefore share one hunk, and unchanged lines
// between them are shown as deleted and re-added.
//
// Lines are compared including their terminator, so adding or removing the
// final newline counts as a change and is rendered with the conventional
// "\ No newline at end of file" marker.
//
// It returns the diff text and the number of added and deleted lines, or
// ("", 0, 0) when both inputs split into identical lines.
func generateSimpleDiff(path string, oldContent, newContent []byte) (string, int, int) {
	const contextLines = 3

	oldLines := splitLines(oldContent)
	newLines := splitLines(newContent)

	shorter := len(oldLines)
	if len(newLines) < shorter {
		shorter = len(newLines)
	}

	// Longest common prefix.
	prefix := 0
	for prefix < shorter && oldLines[prefix] == newLines[prefix] {
		prefix++
	}

	// Longest common suffix that does not overlap the prefix.
	suffix := 0
	for suffix < shorter-prefix &&
		oldLines[len(oldLines)-1-suffix] == newLines[len(newLines)-1-suffix] {
		suffix++
	}

	deleted := oldLines[prefix : len(oldLines)-suffix]
	added := newLines[prefix : len(newLines)-suffix]
	if len(deleted) == 0 && len(added) == 0 {
		return "", 0, 0
	}

	// Context is limited by how many common lines actually exist, and the
	// header must count exactly the lines written below.
	before := contextLines
	if prefix < before {
		before = prefix
	}
	after := contextLines
	if suffix < after {
		after = suffix
	}

	hunkStart := prefix - before // zero-based; identical on both sides
	oldCount := before + len(deleted) + after
	newCount := before + len(added) + after

	// Line numbers are one-based, except that an empty range is addressed by
	// the line preceding it (0 for an empty file).
	oldStartLine, newStartLine := hunkStart+1, hunkStart+1
	if oldCount == 0 {
		oldStartLine = hunkStart
	}
	if newCount == 0 {
		newStartLine = hunkStart
	}

	var buf strings.Builder
	fmt.Fprintf(&buf, "--- a/%s\n", path)
	fmt.Fprintf(&buf, "+++ b/%s\n", path)
	fmt.Fprintf(&buf, "@@ -%d,%d +%d,%d @@\n", oldStartLine, oldCount, newStartLine, newCount)

	for _, line := range oldLines[hunkStart:prefix] {
		writeDiffLine(&buf, ' ', line)
	}
	for _, line := range deleted {
		writeDiffLine(&buf, '-', line)
	}
	for _, line := range added {
		writeDiffLine(&buf, '+', line)
	}
	trailing := oldLines[len(oldLines)-suffix:]
	for _, line := range trailing[:after] {
		writeDiffLine(&buf, ' ', line)
	}

	return buf.String(), len(added), len(deleted)
}

// splitLines splits content into lines that keep their "\n" terminator. A
// trailing newline ends the last line instead of starting an empty one, and
// empty content has no lines.
func splitLines(content []byte) []string {
	if len(content) == 0 {
		return nil
	}
	lines := strings.SplitAfter(string(content), "\n")
	if lines[len(lines)-1] == "" {
		lines = lines[:len(lines)-1]
	}
	return lines
}

// writeDiffLine writes one diff line with the given marker (' ', '-' or '+').
// A line lacking its terminator is completed and followed by the
// "\ No newline at end of file" marker.
func writeDiffLine(buf *strings.Builder, marker byte, line string) {
	buf.WriteByte(marker)
	buf.WriteString(line)
	if !strings.HasSuffix(line, "\n") {
		buf.WriteString("\n\\ No newline at end of file\n")
	}
}

// generateAddedDiff generates a diff for a newly added file: every line of
// content is shown as an addition against /dev/null. The hunk header counts
// exactly the lines emitted; empty content yields the file headers only.
func generateAddedDiff(path string, content []byte) string {
	lines := splitLines(content)

	var buf strings.Builder
	buf.WriteString("--- /dev/null\n")
	fmt.Fprintf(&buf, "+++ b/%s\n", path)
	if len(lines) == 0 {
		return buf.String()
	}

	fmt.Fprintf(&buf, "@@ -0,0 +1,%d @@\n", len(lines))
	for _, line := range lines {
		writeDiffLine(&buf, '+', line)
	}

	return buf.String()
}

// countLines counts the number of lines in content.
+func countLines(content []byte) int { + scanner := bufio.NewScanner(bytes.NewReader(content)) + count := 0 + for scanner.Scan() { + count++ + } + return count +} diff --git a/diff/diff_test.go b/diff/diff_test.go new file mode 100644 index 0000000..c25bea0 --- /dev/null +++ b/diff/diff_test.go @@ -0,0 +1,254 @@ +package diff + +import ( + "archive/tar" + "bytes" + "compress/gzip" + "strings" + "testing" + + "github.com/git-pkgs/archives" +) + +func createTestArchiveWithFiles(files map[string]string) []byte { + buf := new(bytes.Buffer) + gw := gzip.NewWriter(buf) + tw := tar.NewWriter(gw) + + for path, content := range files { + header := &tar.Header{ + Name: path, + Size: int64(len(content)), + Mode: 0644, + } + _ = tw.WriteHeader(header) + _, _ = tw.Write([]byte(content)) + } + + _ = tw.Close() + _ = gw.Close() + return buf.Bytes() +} + +func TestCompare(t *testing.T) { + // Create two test archives + oldFiles := map[string]string{ + "README.md": "# Old Version\n", + "src/main.go": "package main\n\nfunc main() {\n\tprintln(\"old\")\n}\n", + "deleted.txt": "this will be deleted", + } + + newFiles := map[string]string{ + "README.md": "# New Version\n\nWith more content\n", + "src/main.go": "package main\n\nfunc main() {\n\tprintln(\"new\")\n}\n", + "added.txt": "this is new", + } + + oldArchive, err := archives.Open("old.tar.gz", bytes.NewReader(createTestArchiveWithFiles(oldFiles))) + if err != nil { + t.Fatalf("failed to open old archive: %v", err) + } + defer func() { _ = oldArchive.Close() }() + + newArchive, err := archives.Open("new.tar.gz", bytes.NewReader(createTestArchiveWithFiles(newFiles))) + if err != nil { + t.Fatalf("failed to open new archive: %v", err) + } + defer func() { _ = newArchive.Close() }() + + // Compare + result, err := Compare(oldArchive, newArchive) + if err != nil { + t.Fatalf("Compare failed: %v", err) + } + + // Check counts + if result.FilesChanged != 2 { + t.Errorf("FilesChanged = %d, want 2", result.FilesChanged) + } + + if 
result.FilesAdded != 1 { + t.Errorf("FilesAdded = %d, want 1", result.FilesAdded) + } + + if result.FilesDeleted != 1 { + t.Errorf("FilesDeleted = %d, want 1", result.FilesDeleted) + } + + // Check individual files + fileMap := make(map[string]FileDiff) + for _, f := range result.Files { + fileMap[f.Path] = f + } + + // Check deleted file + if f, ok := fileMap["deleted.txt"]; !ok || f.Type != "deleted" { + t.Error("deleted.txt should be marked as deleted") + } + + // Check added file + if f, ok := fileMap["added.txt"]; !ok || f.Type != "added" { + t.Error("added.txt should be marked as added") + } + + // Check modified files + if f, ok := fileMap["README.md"]; !ok || f.Type != "modified" { + t.Error("README.md should be marked as modified") + } + + if f, ok := fileMap["src/main.go"]; !ok || f.Type != "modified" { + t.Error("src/main.go should be marked as modified") + } +} + +func TestIsBinary(t *testing.T) { + tests := []struct { + name string + content []byte + expected bool + }{ + {"empty", []byte{}, false}, + {"text", []byte("hello world"), false}, + {"binary with null", []byte{0x00, 0x01, 0x02}, true}, + {"text with newlines", []byte("line1\nline2\nline3"), false}, + {"json", []byte(`{"key": "value"}`), false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := isBinary(tt.content) + if got != tt.expected { + t.Errorf("isBinary() = %v, want %v", got, tt.expected) + } + }) + } +} + +func TestGenerateUnifiedDiff(t *testing.T) { + oldContent := []byte("line 1\nline 2\nline 3\n") + newContent := []byte("line 1\nline 2 modified\nline 3\n") + + diff, added, deleted := generateUnifiedDiff("test.txt", oldContent, newContent) + + // Log the diff for debugging + t.Logf("Generated diff:\n%s", diff) + t.Logf("Added: %d, Deleted: %d", added, deleted) + + // The diff library might generate optimized diffs + // Check that we have some diff output + if diff == "" { + t.Error("diff should not be empty") + } + + if !strings.Contains(diff, "--- 
a/test.txt") { + t.Error("diff should contain old file marker") + } + + if !strings.Contains(diff, "+++ b/test.txt") { + t.Error("diff should contain new file marker") + } + + // Check that the diff contains the changed content + if !strings.Contains(diff, "line 2") { + t.Error("diff should reference the changed line") + } +} + +func TestGenerateAddedDiff(t *testing.T) { + content := []byte("new file\nwith content\n") + + diff := generateAddedDiff("new.txt", content) + + if !strings.Contains(diff, "--- /dev/null") { + t.Error("diff should indicate new file") + } + + if !strings.Contains(diff, "+++ b/new.txt") { + t.Error("diff should contain new file path") + } + + if !strings.Contains(diff, "+new file") { + t.Error("diff should contain added lines") + } +} + +func TestCountLines(t *testing.T) { + tests := []struct { + name string + content []byte + expected int + }{ + {"empty", []byte{}, 0}, + {"one line", []byte("hello"), 1}, + {"three lines", []byte("line1\nline2\nline3"), 3}, + {"trailing newline", []byte("line1\nline2\n"), 2}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := countLines(tt.content) + if got != tt.expected { + t.Errorf("countLines() = %d, want %d", got, tt.expected) + } + }) + } +} + +func TestCompareIdentical(t *testing.T) { + files := map[string]string{ + "README.md": "# Test\n", + "main.go": "package main\n", + } + + archive1, _ := archives.Open("test1.tar.gz", bytes.NewReader(createTestArchiveWithFiles(files))) + defer func() { _ = archive1.Close() }() + + archive2, _ := archives.Open("test2.tar.gz", bytes.NewReader(createTestArchiveWithFiles(files))) + defer func() { _ = archive2.Close() }() + + result, err := Compare(archive1, archive2) + if err != nil { + t.Fatalf("Compare failed: %v", err) + } + + if len(result.Files) != 0 { + t.Errorf("expected no changes, got %d files", len(result.Files)) + } + + if result.FilesChanged != 0 || result.FilesAdded != 0 || result.FilesDeleted != 0 { + t.Error("expected all 
counts to be zero for identical archives") + } +} + +func TestCompareBinaryFiles(t *testing.T) { + oldFiles := map[string]string{ + "image.png": string([]byte{0x89, 0x50, 0x4E, 0x47, 0x00}), // Binary content + } + + newFiles := map[string]string{ + "image.png": string([]byte{0x89, 0x50, 0x4E, 0x47, 0x01}), // Different binary + } + + oldArchive, _ := archives.Open("old.tar.gz", bytes.NewReader(createTestArchiveWithFiles(oldFiles))) + defer func() { _ = oldArchive.Close() }() + + newArchive, _ := archives.Open("new.tar.gz", bytes.NewReader(createTestArchiveWithFiles(newFiles))) + defer func() { _ = newArchive.Close() }() + + result, err := Compare(oldArchive, newArchive) + if err != nil { + t.Fatalf("Compare failed: %v", err) + } + + if len(result.Files) != 1 { + t.Fatalf("expected 1 file, got %d", len(result.Files)) + } + + if !result.Files[0].IsBinary { + t.Error("file should be marked as binary") + } + + if result.Files[0].Diff != "" { + t.Error("binary files should not have diff content") + } +}