From e964198a0e631ce7b4093f5c621e32115ef467d1 Mon Sep 17 00:00:00 2001 From: Marcus Vorwaller Date: Sun, 22 Mar 2026 04:08:35 -0700 Subject: [PATCH] feat: add semantic diff analysis package and CLI command Implement heuristic-based semantic classification of git diffs into categories (feature, bugfix, refactor, deps, config, test, docs, cleanup) with structured markdown and JSON reporting. Enrich the TaskSemanticDiff description with structured agent instructions. Nightshift-Task: semantic-diff Nightshift-Ref: https://github.com/marcus/nightshift Co-Authored-By: Claude Opus 4.6 (1M context) --- cmd/nightshift/commands/semanticdiff.go | 144 ++++++++ internal/analysis/semanticdiff/classifier.go | 266 ++++++++++++++ internal/analysis/semanticdiff/diff.go | 205 +++++++++++ internal/analysis/semanticdiff/report.go | 245 +++++++++++++ .../analysis/semanticdiff/semanticdiff.go | 85 +++++ .../semanticdiff/semanticdiff_test.go | 325 ++++++++++++++++++ internal/tasks/tasks.go | 16 +- 7 files changed, 1282 insertions(+), 4 deletions(-) create mode 100644 cmd/nightshift/commands/semanticdiff.go create mode 100644 internal/analysis/semanticdiff/classifier.go create mode 100644 internal/analysis/semanticdiff/diff.go create mode 100644 internal/analysis/semanticdiff/report.go create mode 100644 internal/analysis/semanticdiff/semanticdiff.go create mode 100644 internal/analysis/semanticdiff/semanticdiff_test.go diff --git a/cmd/nightshift/commands/semanticdiff.go b/cmd/nightshift/commands/semanticdiff.go new file mode 100644 index 0000000..1f3f259 --- /dev/null +++ b/cmd/nightshift/commands/semanticdiff.go @@ -0,0 +1,144 @@ +package commands + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "time" + + "github.com/spf13/cobra" + + "github.com/marcus/nightshift/internal/analysis/semanticdiff" + "github.com/marcus/nightshift/internal/logging" +) + +var semanticDiffCmd = &cobra.Command{ + Use: "semantic-diff [path]", + Short: "Explain the semantic meaning of code 
changes", + Long: `Analyze git changes between two refs and classify each change by semantic +category (feature, bugfix, refactor, dependency update, config, test, docs, cleanup). + +Produces a structured report grouping changes by category with impact highlights +for API surface, schema, and security-sensitive files. + +Examples: + nightshift semantic-diff # last commit in current repo + nightshift semantic-diff --since 7d # changes in the last 7 days + nightshift semantic-diff --base main --head dev # compare two branches + nightshift semantic-diff --json # output as JSON`, + RunE: func(cmd *cobra.Command, args []string) error { + path, _ := cmd.Flags().GetString("path") + if path == "" && len(args) > 0 { + path = args[0] + } + if path == "" { + var err error + path, err = os.Getwd() + if err != nil { + return fmt.Errorf("getting current directory: %w", err) + } + } + + baseRef, _ := cmd.Flags().GetString("base") + headRef, _ := cmd.Flags().GetString("head") + since, _ := cmd.Flags().GetString("since") + jsonOutput, _ := cmd.Flags().GetBool("json") + save, _ := cmd.Flags().GetString("save") + + return runSemanticDiff(path, baseRef, headRef, since, jsonOutput, save) + }, +} + +func init() { + semanticDiffCmd.Flags().StringP("path", "p", "", "Repository path") + semanticDiffCmd.Flags().String("base", "", "Base ref (commit, branch, or tag)") + semanticDiffCmd.Flags().String("head", "", "Head ref (default: HEAD)") + semanticDiffCmd.Flags().String("since", "", "Analyze changes since duration (e.g. 
7d, 24h, 30d)") + semanticDiffCmd.Flags().Bool("json", false, "Output as JSON") + semanticDiffCmd.Flags().String("save", "", "Save report to file path") + rootCmd.AddCommand(semanticDiffCmd) +} + +func runSemanticDiff(path, baseRef, headRef, since string, jsonOutput bool, savePath string) error { + logger := logging.Component("semantic-diff") + + absPath, err := filepath.Abs(path) + if err != nil { + return fmt.Errorf("resolving path: %w", err) + } + + // Verify git repo + if _, err := os.Stat(filepath.Join(absPath, ".git")); err != nil { + return fmt.Errorf("not a git repository: %s", absPath) + } + + var sinceDur time.Duration + if since != "" { + sinceDur, err = parseDuration(since) + if err != nil { + return fmt.Errorf("parsing --since: %w", err) + } + } + + analyzer := semanticdiff.NewAnalyzer(semanticdiff.Options{ + RepoPath: absPath, + BaseRef: baseRef, + HeadRef: headRef, + Since: sinceDur, + }) + + report, err := analyzer.Run() + if err != nil { + return fmt.Errorf("analysis failed: %w", err) + } + + if jsonOutput { + enc := json.NewEncoder(os.Stdout) + enc.SetIndent("", " ") + return enc.Encode(report) + } + + md := semanticdiff.RenderMarkdown(report) + fmt.Print(md) + + if savePath != "" { + if err := os.WriteFile(savePath, []byte(md), 0o644); err != nil { + logger.Errorf("saving report: %v", err) + } else { + logger.Infof("report saved to %s", savePath) + } + } + + return nil +} + +// parseDuration parses human-friendly durations like "7d", "24h", "30d". +func parseDuration(s string) (time.Duration, error) { + if len(s) == 0 { + return 0, fmt.Errorf("empty duration") + } + + // Handle day suffix which Go's time.ParseDuration doesn't support. + last := s[len(s)-1] + if last == 'd' || last == 'D' { + s = s[:len(s)-1] + "h" + d, err := time.ParseDuration(s) + if err != nil { + return 0, fmt.Errorf("invalid duration: %s", s) + } + return d * 24, nil + } + + // Handle week suffix. 
+ if last == 'w' || last == 'W' { + s = s[:len(s)-1] + "h" + d, err := time.ParseDuration(s) + if err != nil { + return 0, fmt.Errorf("invalid duration: %s", s) + } + return d * 24 * 7, nil + } + + return time.ParseDuration(s) +} diff --git a/internal/analysis/semanticdiff/classifier.go b/internal/analysis/semanticdiff/classifier.go new file mode 100644 index 0000000..03500ef --- /dev/null +++ b/internal/analysis/semanticdiff/classifier.go @@ -0,0 +1,266 @@ +package semanticdiff + +import ( + "path/filepath" + "strings" +) + +// Category represents a semantic category for a change. +type Category string + +const ( + CategoryFeature Category = "feature" + CategoryBugfix Category = "bugfix" + CategoryRefactor Category = "refactor" + CategoryDeps Category = "dependency-update" + CategoryConfig Category = "config-change" + CategoryTest Category = "test-change" + CategoryDocs Category = "docs-change" + CategoryCleanup Category = "cleanup" + CategoryUnknown Category = "uncategorized" +) + +// ClassifiedChange pairs a file change with its semantic category. +type ClassifiedChange struct { + File FileChange + Category Category + Reason string // short explanation of why this category was chosen +} + +// ClassifiedCommit pairs a commit with its primary category. +type ClassifiedCommit struct { + Commit Commit + Category Category + Reason string + Files []ClassifiedChange +} + +// ClassifyChangeSet classifies every commit and file in the change set. +func ClassifyChangeSet(cs *ChangeSet) []ClassifiedCommit { + var result []ClassifiedCommit + for _, c := range cs.Commits { + cc := classifyCommit(c) + result = append(result, cc) + } + return result +} + +// classifyCommit determines the primary category of a commit. 
+func classifyCommit(c Commit) ClassifiedCommit { + var classified []ClassifiedChange + for _, f := range c.Files { + cat, reason := classifyFile(f, c.Subject, c.Body) + classified = append(classified, ClassifiedChange{ + File: f, + Category: cat, + Reason: reason, + }) + } + + primary, reason := classifyFromMessage(c.Subject, c.Body) + if primary == CategoryUnknown && len(classified) > 0 { + primary, reason = majorityCategory(classified) + } + + return ClassifiedCommit{ + Commit: c, + Category: primary, + Reason: reason, + Files: classified, + } +} + +// classifyFile determines the category of a single file change. +func classifyFile(f FileChange, subject, body string) (Category, string) { + path := strings.ToLower(f.Path) + base := strings.ToLower(filepath.Base(f.Path)) + ext := strings.ToLower(filepath.Ext(f.Path)) + + // Test files + if isTestFile(path, base) { + return CategoryTest, "test file path" + } + + // Documentation + if isDocsFile(path, base, ext) { + return CategoryDocs, "documentation file" + } + + // Dependencies + if isDepsFile(base) { + return CategoryDeps, "dependency manifest" + } + + // Configuration + if isConfigFile(path, base, ext) { + return CategoryConfig, "configuration file" + } + + // Fall back to commit message analysis. + cat, reason := classifyFromMessage(subject, body) + if cat != CategoryUnknown { + return cat, reason + } + + // Heuristic: deletion-heavy changes are often cleanup. + if f.Deletions > 0 && f.Additions == 0 { + return CategoryCleanup, "pure deletion" + } + + return CategoryUnknown, "" +} + +// classifyFromMessage determines category from commit message patterns. 
+func classifyFromMessage(subject, body string) (Category, string) { + msg := strings.ToLower(subject + " " + body) + + // Conventional commit prefixes + subjectLower := strings.ToLower(subject) + if strings.HasPrefix(subjectLower, "fix:") || strings.HasPrefix(subjectLower, "fix(") { + return CategoryBugfix, "conventional commit: fix" + } + if strings.HasPrefix(subjectLower, "feat:") || strings.HasPrefix(subjectLower, "feat(") { + return CategoryFeature, "conventional commit: feat" + } + if strings.HasPrefix(subjectLower, "docs:") || strings.HasPrefix(subjectLower, "docs(") { + return CategoryDocs, "conventional commit: docs" + } + if strings.HasPrefix(subjectLower, "test:") || strings.HasPrefix(subjectLower, "test(") { + return CategoryTest, "conventional commit: test" + } + if strings.HasPrefix(subjectLower, "refactor:") || strings.HasPrefix(subjectLower, "refactor(") { + return CategoryRefactor, "conventional commit: refactor" + } + if strings.HasPrefix(subjectLower, "chore:") || strings.HasPrefix(subjectLower, "chore(") { + return CategoryCleanup, "conventional commit: chore" + } + if strings.HasPrefix(subjectLower, "build:") || strings.HasPrefix(subjectLower, "build(") || strings.HasPrefix(subjectLower, "ci:") || strings.HasPrefix(subjectLower, "ci(") { + return CategoryConfig, "conventional commit: build/ci" + } + if strings.HasPrefix(subjectLower, "deps:") || strings.HasPrefix(subjectLower, "deps(") { + return CategoryDeps, "conventional commit: deps" + } + + // Keyword patterns in message + bugKeywords := []string{"fix", "bug", "patch", "hotfix", "resolve", "issue"} + for _, kw := range bugKeywords { + if strings.Contains(msg, kw) { + return CategoryBugfix, "keyword: " + kw + } + } + + featureKeywords := []string{"add ", "new ", "feature", "implement", "introduce"} + for _, kw := range featureKeywords { + if strings.Contains(msg, kw) { + return CategoryFeature, "keyword: " + strings.TrimSpace(kw) + } + } + + refactorKeywords := []string{"refactor", 
"restructure", "reorganize", "simplify", "extract", "rename"} + for _, kw := range refactorKeywords { + if strings.Contains(msg, kw) { + return CategoryRefactor, "keyword: " + kw + } + } + + depKeywords := []string{"bump", "upgrade", "dependency", "dependabot", "go.sum", "go.mod"} + for _, kw := range depKeywords { + if strings.Contains(msg, kw) { + return CategoryDeps, "keyword: " + kw + } + } + + cleanupKeywords := []string{"cleanup", "clean up", "remove unused", "delete", "deprecate", "drop"} + for _, kw := range cleanupKeywords { + if strings.Contains(msg, kw) { + return CategoryCleanup, "keyword: " + kw + } + } + + return CategoryUnknown, "" +} + +func isTestFile(path, base string) bool { + if strings.Contains(path, "_test.go") || strings.Contains(path, "_test.") { + return true + } + if strings.Contains(path, "/test/") || strings.Contains(path, "/tests/") || strings.Contains(path, "/testdata/") { + return true + } + if strings.HasPrefix(path, "test/") || strings.HasPrefix(path, "tests/") || strings.HasPrefix(path, "testdata/") { + return true + } + if strings.HasSuffix(base, ".test.js") || strings.HasSuffix(base, ".test.ts") || strings.HasSuffix(base, ".spec.js") || strings.HasSuffix(base, ".spec.ts") { + return true + } + return false +} + +func isDocsFile(path, base, ext string) bool { + if ext == ".md" || ext == ".rst" || ext == ".txt" || ext == ".adoc" { + return true + } + if strings.Contains(path, "/docs/") || strings.Contains(path, "/doc/") { + return true + } + if base == "readme" || base == "readme.md" || base == "changelog" || base == "changelog.md" || base == "license" || base == "license.md" { + return true + } + return false +} + +func isDepsFile(base string) bool { + depFiles := []string{ + "go.mod", "go.sum", + "package.json", "package-lock.json", "yarn.lock", "pnpm-lock.yaml", + "requirements.txt", "poetry.lock", "pipfile.lock", + "gemfile.lock", "cargo.lock", "cargo.toml", + "composer.lock", + } + for _, df := range depFiles { + if base == df 
{ + return true + } + } + return false +} + +func isConfigFile(path, base, ext string) bool { + if ext == ".yaml" || ext == ".yml" || ext == ".toml" || ext == ".ini" { + if !strings.Contains(path, "/test") && !strings.Contains(path, "/doc") { + return true + } + } + configFiles := []string{ + "dockerfile", ".dockerignore", "docker-compose.yml", "docker-compose.yaml", + "makefile", ".goreleaser.yml", ".goreleaser.yaml", ".golangci.yml", + ".gitignore", ".editorconfig", ".eslintrc", ".prettierrc", + "tsconfig.json", "webpack.config.js", + } + for _, cf := range configFiles { + if base == cf { + return true + } + } + if strings.Contains(path, ".github/") || strings.Contains(path, ".circleci/") { + return true + } + return false +} + +// majorityCategory returns the most common category among classified changes. +func majorityCategory(changes []ClassifiedChange) (Category, string) { + counts := make(map[Category]int) + for _, c := range changes { + counts[c.Category]++ + } + best := CategoryUnknown + bestCount := 0 + for cat, count := range counts { + if count > bestCount { + best = cat + bestCount = count + } + } + return best, "majority of files" +} diff --git a/internal/analysis/semanticdiff/diff.go b/internal/analysis/semanticdiff/diff.go new file mode 100644 index 0000000..377f78b --- /dev/null +++ b/internal/analysis/semanticdiff/diff.go @@ -0,0 +1,205 @@ +// Package semanticdiff extracts git diffs and classifies changes by semantic category. +package semanticdiff + +import ( + "fmt" + "os/exec" + "strconv" + "strings" +) + +// Commit represents a single git commit with metadata. +type Commit struct { + Hash string + Subject string + Body string + Author string + Files []FileChange +} + +// FileChange represents a single file's changes within a commit. 
+type FileChange struct { + Path string + OldPath string // non-empty if renamed + Additions int + Deletions int + IsBinary bool + IsRename bool + IsNew bool + IsDeleted bool + DiffOutput string // raw diff hunk text for this file +} + +// ChangeSet is the full diff extraction result between two refs. +type ChangeSet struct { + BaseRef string + HeadRef string + Commits []Commit + AllFiles []FileChange // aggregated across all commits +} + +// TotalAdditions returns the sum of additions across all files. +func (cs *ChangeSet) TotalAdditions() int { + total := 0 + for _, f := range cs.AllFiles { + total += f.Additions + } + return total +} + +// TotalDeletions returns the sum of deletions across all files. +func (cs *ChangeSet) TotalDeletions() int { + total := 0 + for _, f := range cs.AllFiles { + total += f.Deletions + } + return total +} + +// ExtractDiff extracts a structured ChangeSet between two git refs. +func ExtractDiff(repoPath, baseRef, headRef string) (*ChangeSet, error) { + commits, err := parseCommits(repoPath, baseRef, headRef) + if err != nil { + return nil, fmt.Errorf("parsing commits: %w", err) + } + + allFiles, err := parseNumstat(repoPath, baseRef, headRef) + if err != nil { + return nil, fmt.Errorf("parsing numstat: %w", err) + } + + return &ChangeSet{ + BaseRef: baseRef, + HeadRef: headRef, + Commits: commits, + AllFiles: allFiles, + }, nil +} + +// parseCommits extracts commit metadata between two refs. +func parseCommits(repoPath, baseRef, headRef string) ([]Commit, error) { + // Use a delimiter unlikely to appear in commit messages. 
+ const sep = "---NIGHTSHIFT-SEP---" + format := fmt.Sprintf("%%H%s%%s%s%%b%s%%an%s", sep, sep, sep, sep) + + cmd := exec.Command("git", "log", "--format="+format, baseRef+".."+headRef) + cmd.Dir = repoPath + out, err := cmd.Output() + if err != nil { + return nil, fmt.Errorf("git log: %w", err) + } + + raw := strings.TrimSpace(string(out)) + if raw == "" { + return nil, nil + } + + // Each commit ends with the trailing separator. + entries := strings.Split(raw, sep+"\n") + var commits []Commit + for _, entry := range entries { + entry = strings.TrimSpace(entry) + if entry == "" { + continue + } + parts := strings.SplitN(entry, sep, 4) + if len(parts) < 4 { + continue + } + c := Commit{ + Hash: strings.TrimSpace(parts[0]), + Subject: strings.TrimSpace(parts[1]), + Body: strings.TrimSpace(parts[2]), + Author: strings.TrimSpace(parts[3]), + } + commits = append(commits, c) + } + + // Attach file lists to each commit. + for i := range commits { + files, err := parseCommitFiles(repoPath, commits[i].Hash) + if err != nil { + continue // non-fatal + } + commits[i].Files = files + } + + return commits, nil +} + +// parseCommitFiles returns the file changes for a single commit. +func parseCommitFiles(repoPath, hash string) ([]FileChange, error) { + cmd := exec.Command("git", "diff-tree", "--no-commit-id", "-r", "--numstat", "-M", hash) + cmd.Dir = repoPath + out, err := cmd.Output() + if err != nil { + return nil, err + } + return parseNumstatLines(string(out)) +} + +// parseNumstat runs git diff --numstat between two refs and returns aggregated file changes. +func parseNumstat(repoPath, baseRef, headRef string) ([]FileChange, error) { + cmd := exec.Command("git", "diff", "--numstat", "-M", baseRef+".."+headRef) + cmd.Dir = repoPath + out, err := cmd.Output() + if err != nil { + return nil, fmt.Errorf("git diff --numstat: %w", err) + } + return parseNumstatLines(string(out)) +} + +// parseNumstatLines parses lines of git numstat output into FileChange structs. 
+func parseNumstatLines(output string) ([]FileChange, error) { + var files []FileChange + for _, line := range strings.Split(output, "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + fields := strings.Fields(line) + if len(fields) < 3 { + continue + } + + fc := FileChange{} + + // Binary files show "-" for additions/deletions. + if fields[0] == "-" && fields[1] == "-" { + fc.IsBinary = true + } else { + fc.Additions, _ = strconv.Atoi(fields[0]) + fc.Deletions, _ = strconv.Atoi(fields[1]) + } + + path := fields[2] + // Detect renames: "old => new" or "{prefix/old => prefix/new}" + if len(fields) >= 4 && strings.Contains(line, "=>") { + fc.IsRename = true + fc.OldPath = extractRenamePart(fields[2:], true) + fc.Path = extractRenamePart(fields[2:], false) + } else { + fc.Path = path + } + + files = append(files, fc) + } + return files, nil +} + +// extractRenamePart extracts old or new path from rename notation. +func extractRenamePart(fields []string, wantOld bool) string { + joined := strings.Join(fields, " ") + // Handle {a => b} style + if idx := strings.Index(joined, " => "); idx >= 0 { + if wantOld { + old := joined[:idx] + old = strings.TrimPrefix(old, "{") + return strings.TrimSpace(old) + } + newPart := joined[idx+4:] + newPart = strings.TrimSuffix(newPart, "}") + return strings.TrimSpace(newPart) + } + return joined +} diff --git a/internal/analysis/semanticdiff/report.go b/internal/analysis/semanticdiff/report.go new file mode 100644 index 0000000..ac5125a --- /dev/null +++ b/internal/analysis/semanticdiff/report.go @@ -0,0 +1,245 @@ +package semanticdiff + +import ( + "bytes" + "fmt" + "sort" + "strings" +) + +// Report is the structured result of a semantic diff analysis. 
+type Report struct { + BaseRef string `json:"base_ref"` + HeadRef string `json:"head_ref"` + Summary string `json:"summary"` + Categories []CategoryGroup `json:"categories"` + Highlights []string `json:"highlights"` + Stats DiffStats `json:"stats"` +} + +// CategoryGroup groups changes under one semantic category. +type CategoryGroup struct { + Category Category `json:"category"` + Label string `json:"label"` + Commits []ClassifiedCommit `json:"commits"` + FilesCount int `json:"files_count"` + Additions int `json:"additions"` + Deletions int `json:"deletions"` +} + +// DiffStats contains aggregate statistics. +type DiffStats struct { + TotalCommits int `json:"total_commits"` + TotalFiles int `json:"total_files"` + TotalAdded int `json:"total_added"` + TotalDeleted int `json:"total_deleted"` +} + +// GenerateReport builds a Report from a classified change set. +func GenerateReport(cs *ChangeSet, classified []ClassifiedCommit) *Report { + groups := groupByCategory(classified) + stats := DiffStats{ + TotalCommits: len(cs.Commits), + TotalFiles: len(cs.AllFiles), + TotalAdded: cs.TotalAdditions(), + TotalDeleted: cs.TotalDeletions(), + } + highlights := detectHighlights(cs, classified) + summary := buildSummary(groups, stats) + + return &Report{ + BaseRef: cs.BaseRef, + HeadRef: cs.HeadRef, + Summary: summary, + Categories: groups, + Highlights: highlights, + Stats: stats, + } +} + +// groupByCategory groups classified commits by their primary category. 
+func groupByCategory(classified []ClassifiedCommit) []CategoryGroup { + catMap := make(map[Category]*CategoryGroup) + order := []Category{} + + for _, cc := range classified { + g, ok := catMap[cc.Category] + if !ok { + g = &CategoryGroup{ + Category: cc.Category, + Label: categoryLabel(cc.Category), + } + catMap[cc.Category] = g + order = append(order, cc.Category) + } + g.Commits = append(g.Commits, cc) + for _, f := range cc.Files { + g.FilesCount++ + g.Additions += f.File.Additions + g.Deletions += f.File.Deletions + } + } + + // Sort by priority. + sort.Slice(order, func(i, j int) bool { + return categoryPriority(order[i]) < categoryPriority(order[j]) + }) + + groups := make([]CategoryGroup, 0, len(order)) + for _, cat := range order { + groups = append(groups, *catMap[cat]) + } + return groups +} + +func categoryLabel(c Category) string { + switch c { + case CategoryFeature: + return "New Features" + case CategoryBugfix: + return "Bug Fixes" + case CategoryRefactor: + return "Refactoring" + case CategoryDeps: + return "Dependency Updates" + case CategoryConfig: + return "Configuration Changes" + case CategoryTest: + return "Test Changes" + case CategoryDocs: + return "Documentation" + case CategoryCleanup: + return "Cleanup" + case CategoryUnknown: + return "Other Changes" + default: + return string(c) + } +} + +func categoryPriority(c Category) int { + switch c { + case CategoryFeature: + return 0 + case CategoryBugfix: + return 1 + case CategoryRefactor: + return 2 + case CategoryDeps: + return 3 + case CategoryConfig: + return 4 + case CategoryTest: + return 5 + case CategoryDocs: + return 6 + case CategoryCleanup: + return 7 + default: + return 8 + } +} + +// detectHighlights identifies high-impact changes. 
+func detectHighlights(cs *ChangeSet, classified []ClassifiedCommit) []string { + var highlights []string + + for _, f := range cs.AllFiles { + path := strings.ToLower(f.Path) + + // API surface changes + if strings.Contains(path, "api") || strings.Contains(path, "handler") || strings.Contains(path, "route") { + highlights = append(highlights, fmt.Sprintf("API surface changed: %s (+%d/-%d)", f.Path, f.Additions, f.Deletions)) + } + + // Schema/migration changes + if strings.Contains(path, "migration") || strings.Contains(path, "schema") { + highlights = append(highlights, fmt.Sprintf("Schema/migration changed: %s", f.Path)) + } + + // Security-sensitive files + if strings.Contains(path, "auth") || strings.Contains(path, "security") || strings.Contains(path, "crypto") || strings.Contains(path, "permission") { + highlights = append(highlights, fmt.Sprintf("Security-sensitive file changed: %s", f.Path)) + } + + // Large changes + if f.Additions+f.Deletions > 200 { + highlights = append(highlights, fmt.Sprintf("Large change: %s (+%d/-%d lines)", f.Path, f.Additions, f.Deletions)) + } + } + + return dedup(highlights) +} + +func dedup(items []string) []string { + seen := make(map[string]bool) + var result []string + for _, item := range items { + if !seen[item] { + seen[item] = true + result = append(result, item) + } + } + return result +} + +func buildSummary(groups []CategoryGroup, stats DiffStats) string { + parts := make([]string, 0, len(groups)) + for _, g := range groups { + parts = append(parts, fmt.Sprintf("%d %s", len(g.Commits), strings.ToLower(g.Label))) + } + return fmt.Sprintf("%d commits across %d files: %s. Net change: +%d/-%d lines.", + stats.TotalCommits, stats.TotalFiles, strings.Join(parts, ", "), + stats.TotalAdded, stats.TotalDeleted) +} + +// RenderMarkdown renders the report as a markdown string. 
+func RenderMarkdown(r *Report) string { + var buf bytes.Buffer + + fmt.Fprintf(&buf, "# Semantic Diff: %s..%s\n\n", r.BaseRef, r.HeadRef) + + // TL;DR + buf.WriteString("## TL;DR\n\n") + fmt.Fprintf(&buf, "%s\n\n", r.Summary) + + // Stats + buf.WriteString("## Stats\n\n") + buf.WriteString("| Metric | Value |\n") + buf.WriteString("|--------|-------|\n") + fmt.Fprintf(&buf, "| Commits | %d |\n", r.Stats.TotalCommits) + fmt.Fprintf(&buf, "| Files Changed | %d |\n", r.Stats.TotalFiles) + fmt.Fprintf(&buf, "| Lines Added | +%d |\n", r.Stats.TotalAdded) + fmt.Fprintf(&buf, "| Lines Deleted | -%d |\n\n", r.Stats.TotalDeleted) + + // Highlights + if len(r.Highlights) > 0 { + buf.WriteString("## Impact Highlights\n\n") + for _, h := range r.Highlights { + fmt.Fprintf(&buf, "- %s\n", h) + } + buf.WriteString("\n") + } + + // Categories + for _, g := range r.Categories { + fmt.Fprintf(&buf, "## %s\n\n", g.Label) + fmt.Fprintf(&buf, "*%d commits, %d files, +%d/-%d lines*\n\n", len(g.Commits), g.FilesCount, g.Additions, g.Deletions) + for _, cc := range g.Commits { + fmt.Fprintf(&buf, "- **%s** `%s`\n", cc.Commit.Subject, shortHash(cc.Commit.Hash)) + for _, f := range cc.Files { + fmt.Fprintf(&buf, " - `%s` (+%d/-%d)\n", f.File.Path, f.File.Additions, f.File.Deletions) + } + } + buf.WriteString("\n") + } + + return buf.String() +} + +func shortHash(hash string) string { + if len(hash) > 7 { + return hash[:7] + } + return hash +} diff --git a/internal/analysis/semanticdiff/semanticdiff.go b/internal/analysis/semanticdiff/semanticdiff.go new file mode 100644 index 0000000..7e08fc1 --- /dev/null +++ b/internal/analysis/semanticdiff/semanticdiff.go @@ -0,0 +1,85 @@ +package semanticdiff + +import ( + "fmt" + "os/exec" + "strings" + "time" +) + +// Options configures the semantic diff analysis. 
+type Options struct { + RepoPath string + BaseRef string + HeadRef string + Since time.Duration // alternative to BaseRef: go back N duration from HEAD +} + +// Analyzer performs semantic diff analysis on a git repository. +type Analyzer struct { + opts Options +} + +// NewAnalyzer creates a new semantic diff analyzer. +func NewAnalyzer(opts Options) *Analyzer { + return &Analyzer{opts: opts} +} + +// Run executes the analysis and returns a report. +func (a *Analyzer) Run() (*Report, error) { + baseRef, headRef, err := a.resolveRefs() + if err != nil { + return nil, fmt.Errorf("resolving refs: %w", err) + } + + cs, err := ExtractDiff(a.opts.RepoPath, baseRef, headRef) + if err != nil { + return nil, fmt.Errorf("extracting diff: %w", err) + } + + if len(cs.Commits) == 0 { + return &Report{ + BaseRef: baseRef, + HeadRef: headRef, + Summary: "No changes found between the specified refs.", + }, nil + } + + classified := ClassifyChangeSet(cs) + report := GenerateReport(cs, classified) + return report, nil +} + +// resolveRefs resolves base and head refs, applying defaults. +func (a *Analyzer) resolveRefs() (string, string, error) { + headRef := a.opts.HeadRef + if headRef == "" { + headRef = "HEAD" + } + + baseRef := a.opts.BaseRef + if baseRef == "" && a.opts.Since > 0 { + // Use git rev-list to find the commit closest to the since duration. + sinceTime := time.Now().Add(-a.opts.Since) + cmd := exec.Command("git", "log", "--format=%H", "--after="+sinceTime.Format(time.RFC3339), "--reverse") + cmd.Dir = a.opts.RepoPath + out, err := cmd.Output() + if err != nil { + return "", "", fmt.Errorf("finding base commit from --since: %w", err) + } + lines := strings.Split(strings.TrimSpace(string(out)), "\n") + if len(lines) > 0 && lines[0] != "" { + // Use the parent of the first commit in range, or the commit itself. 
+ baseRef = lines[0] + "~1" + } else { + return "", "", fmt.Errorf("no commits found in the last %s", a.opts.Since) + } + } + + if baseRef == "" { + // Default: compare HEAD against its first parent (last commit). + baseRef = headRef + "~1" + } + + return baseRef, headRef, nil +} diff --git a/internal/analysis/semanticdiff/semanticdiff_test.go b/internal/analysis/semanticdiff/semanticdiff_test.go new file mode 100644 index 0000000..ee9e6c4 --- /dev/null +++ b/internal/analysis/semanticdiff/semanticdiff_test.go @@ -0,0 +1,325 @@ +package semanticdiff + +import ( + "strings" + "testing" +) + +func TestParseNumstatLines(t *testing.T) { + tests := []struct { + name string + input string + want int + wantAdd int + wantDel int + }{ + { + name: "normal file", + input: "10\t5\tsrc/main.go\n", + want: 1, + wantAdd: 10, + wantDel: 5, + }, + { + name: "binary file", + input: "-\t-\tassets/logo.png\n", + want: 1, + wantAdd: 0, + wantDel: 0, + }, + { + name: "multiple files", + input: "10\t5\ta.go\n3\t1\tb.go\n", + want: 2, + wantAdd: 13, + wantDel: 6, + }, + { + name: "empty input", + input: "", + want: 0, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + files, err := parseNumstatLines(tt.input) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(files) != tt.want { + t.Fatalf("got %d files, want %d", len(files), tt.want) + } + totalAdd, totalDel := 0, 0 + for _, f := range files { + totalAdd += f.Additions + totalDel += f.Deletions + } + if totalAdd != tt.wantAdd { + t.Errorf("total additions = %d, want %d", totalAdd, tt.wantAdd) + } + if totalDel != tt.wantDel { + t.Errorf("total deletions = %d, want %d", totalDel, tt.wantDel) + } + }) + } +} + +func TestParseNumstatBinaryFlag(t *testing.T) { + files, err := parseNumstatLines("-\t-\timage.png\n") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(files) != 1 { + t.Fatalf("expected 1 file, got %d", len(files)) + } + if !files[0].IsBinary { + 
+		t.Error("expected IsBinary=true")
+	}
+}
+
+func TestClassifyFromMessage(t *testing.T) {
+	tests := []struct {
+		subject string
+		body    string
+		want    Category
+	}{
+		{"fix: null pointer in handler", "", CategoryBugfix},
+		{"feat: add user profile page", "", CategoryFeature},
+		{"docs: update README", "", CategoryDocs},
+		{"test: add integration tests", "", CategoryTest},
+		{"refactor: extract helper", "", CategoryRefactor},
+		{"chore: clean up old scripts", "", CategoryCleanup},
+		{"ci: update workflow", "", CategoryConfig},
+		{"deps: bump cobra to v1.8", "", CategoryDeps},
+		{"Fix a bug in login", "", CategoryBugfix},
+		{"Add new endpoint", "", CategoryFeature},
+		{"something unrelated", "", CategoryUnknown},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.subject, func(t *testing.T) {
+			got, _ := classifyFromMessage(tt.subject, tt.body)
+			if got != tt.want {
+				t.Errorf("classifyFromMessage(%q) = %s, want %s", tt.subject, got, tt.want)
+			}
+		})
+	}
+}
+
+func TestClassifyFileByPath(t *testing.T) {
+	tests := []struct {
+		path string
+		want Category
+	}{
+		{"internal/foo/foo_test.go", CategoryTest},
+		{"tests/integration.py", CategoryTest},
+		{"README.md", CategoryDocs},
+		{"docs/guide.rst", CategoryDocs},
+		{"go.mod", CategoryDeps},
+		{"package.json", CategoryDeps},
+		{".github/workflows/ci.yml", CategoryConfig},
+		{"Dockerfile", CategoryConfig},
+		{"Makefile", CategoryConfig},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.path, func(t *testing.T) {
+			f := FileChange{Path: tt.path}
+			got, _ := classifyFile(f, "", "")
+			if got != tt.want {
+				t.Errorf("classifyFile(%q) = %s, want %s", tt.path, got, tt.want)
+			}
+		})
+	}
+}
+
+func TestIsTestFile(t *testing.T) {
+	if !isTestFile("cmd/foo_test.go", "foo_test.go") {
+		t.Error("expected _test.go to be detected")
+	}
+	if !isTestFile("tests/main.py", "main.py") {
+		t.Error("expected tests/ directory to be detected")
+	}
+	if !isTestFile("src/app.test.js", "app.test.js") {
+		t.Error("expected .test.js to be detected")
+	}
+	if isTestFile("src/main.go", "main.go") {
+		t.Error("expected non-test file to not be detected")
+	}
+}
+
+func TestIsDocsFile(t *testing.T) {
+	if !isDocsFile("readme.md", "readme.md", ".md") {
+		t.Error("expected .md to be detected")
+	}
+	if !isDocsFile("docs/guide.rst", "guide.rst", ".rst") {
+		t.Error("expected .rst to be detected")
+	}
+	if isDocsFile("src/main.go", "main.go", ".go") {
+		t.Error("expected .go to not be docs")
+	}
+}
+
+func TestIsDepsFile(t *testing.T) {
+	if !isDepsFile("go.mod") {
+		t.Error("expected go.mod to be deps")
+	}
+	if !isDepsFile("package-lock.json") {
+		t.Error("expected package-lock.json to be deps")
+	}
+	if isDepsFile("main.go") {
+		t.Error("expected main.go to not be deps")
+	}
+}
+
+func TestMajorityCategory(t *testing.T) {
+	changes := []ClassifiedChange{
+		{Category: CategoryBugfix},
+		{Category: CategoryBugfix},
+		{Category: CategoryFeature},
+	}
+	got, _ := majorityCategory(changes)
+	if got != CategoryBugfix {
+		t.Errorf("got %s, want bugfix", got)
+	}
+}
+
+func TestGenerateReport(t *testing.T) {
+	cs := &ChangeSet{
+		BaseRef: "abc1234",
+		HeadRef: "def5678",
+		Commits: []Commit{
+			{Hash: "aaa1111", Subject: "feat: add widget", Author: "alice", Files: []FileChange{
+				{Path: "widget.go", Additions: 50, Deletions: 0},
+			}},
+			{Hash: "bbb2222", Subject: "fix: null check", Author: "bob", Files: []FileChange{
+				{Path: "handler.go", Additions: 2, Deletions: 1},
+			}},
+		},
+		AllFiles: []FileChange{
+			{Path: "widget.go", Additions: 50, Deletions: 0},
+			{Path: "handler.go", Additions: 2, Deletions: 1},
+		},
+	}
+
+	classified := ClassifyChangeSet(cs)
+	report := GenerateReport(cs, classified)
+
+	if report.Stats.TotalCommits != 2 {
+		t.Errorf("expected 2 commits, got %d", report.Stats.TotalCommits)
+	}
+	if report.Stats.TotalFiles != 2 {
+		t.Errorf("expected 2 files, got %d", report.Stats.TotalFiles)
+	}
+	if report.Stats.TotalAdded != 52 {
+		t.Errorf("expected 52 additions, got %d", report.Stats.TotalAdded)
+	}
+	if report.Stats.TotalDeleted != 1 {
+		t.Errorf("expected 1 deletion, got %d", report.Stats.TotalDeleted)
+	}
+	if len(report.Categories) == 0 {
+		t.Fatal("expected at least one category group")
+	}
+
+	// Verify markdown rendering.
+	md := RenderMarkdown(report)
+	if !strings.Contains(md, "Semantic Diff") {
+		t.Error("markdown should contain header")
+	}
+	if !strings.Contains(md, "TL;DR") {
+		t.Error("markdown should contain TL;DR section")
+	}
+	if !strings.Contains(md, "widget.go") {
+		t.Error("markdown should reference changed files")
+	}
+}
+
+func TestRenderMarkdownEmpty(t *testing.T) {
+	r := &Report{
+		BaseRef: "aaa",
+		HeadRef: "bbb",
+		Summary: "No changes found between the specified refs.",
+	}
+	md := RenderMarkdown(r)
+	if !strings.Contains(md, "No changes") {
+		t.Error("empty report should show no-changes message")
+	}
+}
+
+func TestDetectHighlights(t *testing.T) {
+	cs := &ChangeSet{
+		AllFiles: []FileChange{
+			{Path: "internal/api/handler.go", Additions: 10, Deletions: 5},
+			{Path: "internal/auth/middleware.go", Additions: 3, Deletions: 2},
+			{Path: "db/migrations/001_init.sql", Additions: 50, Deletions: 0},
+			{Path: "README.md", Additions: 5, Deletions: 2},
+		},
+	}
+	highlights := detectHighlights(cs, nil)
+	found := map[string]bool{"api": false, "auth": false, "migration": false}
+	for _, h := range highlights {
+		if strings.Contains(h, "API surface") {
+			found["api"] = true
+		}
+		if strings.Contains(h, "Security-sensitive") {
+			found["auth"] = true
+		}
+		if strings.Contains(h, "Schema/migration") {
+			found["migration"] = true
+		}
+	}
+	for k, v := range found {
+		if !v {
+			t.Errorf("expected %s highlight to be detected", k)
+		}
+	}
+}
+
+func TestChangeSetTotals(t *testing.T) {
+	cs := &ChangeSet{
+		AllFiles: []FileChange{
+			{Additions: 10, Deletions: 3},
+			{Additions: 5, Deletions: 7},
+		},
+	}
+	if cs.TotalAdditions() != 15 {
+		t.Errorf("TotalAdditions = %d, want 15", cs.TotalAdditions())
+	}
+	if cs.TotalDeletions() != 10 {
+		t.Errorf("TotalDeletions = %d, want 10", cs.TotalDeletions())
+	}
+}
+
+func TestShortHash(t *testing.T) {
+	if shortHash("abc1234567890") != "abc1234" {
+		t.Error("expected 7-char short hash")
+	}
+	if shortHash("abc") != "abc" {
+		t.Error("expected short input returned as-is")
+	}
+}
+
+func TestCategoryLabel(t *testing.T) {
+	if categoryLabel(CategoryFeature) != "New Features" {
+		t.Error("unexpected label for feature")
+	}
+	if categoryLabel(CategoryUnknown) != "Other Changes" {
+		t.Error("unexpected label for unknown")
+	}
+}
+
+func TestBuildSummary(t *testing.T) {
+	groups := []CategoryGroup{
+		{Category: CategoryFeature, Label: "New Features", Commits: []ClassifiedCommit{{}}},
+		{Category: CategoryBugfix, Label: "Bug Fixes", Commits: []ClassifiedCommit{{}, {}}},
+	}
+	stats := DiffStats{TotalCommits: 3, TotalFiles: 5, TotalAdded: 100, TotalDeleted: 20}
+	summary := buildSummary(groups, stats)
+	if !strings.Contains(summary, "3 commits") {
+		t.Error("summary should mention commit count")
+	}
+	if !strings.Contains(summary, "+100/-20") {
+		t.Error("summary should mention line changes")
+	}
+}
diff --git a/internal/tasks/tasks.go b/internal/tasks/tasks.go
index 2c7dabb..5bcace6 100644
--- a/internal/tasks/tasks.go
+++ b/internal/tasks/tasks.go
@@ -392,10 +392,18 @@ Apply safe updates directly, and leave concise follow-ups for anything uncertain
 		DefaultInterval: 72 * time.Hour,
 	},
 	TaskSemanticDiff: {
-		Type:        TaskSemanticDiff,
-		Category:    CategoryAnalysis,
-		Name:        "Semantic Diff Explainer",
-		Description: "Explain the semantic meaning of code changes",
+		Type:     TaskSemanticDiff,
+		Category: CategoryAnalysis,
+		Name:     "Semantic Diff Explainer",
+		Description: `Analyze recent git changes and produce a semantic diff report.
+Run 'nightshift semantic-diff --since 72h' (or the configured interval) on the target repository.
+Classify every commit and changed file into one of: feature, bugfix, refactor, dependency-update,
+config-change, test-change, docs-change, or cleanup. Group the results by category and highlight
+high-impact changes (API surface modifications, schema/migration changes, security-sensitive files,
+and large diffs exceeding 200 lines). Output a structured markdown report with a TL;DR summary,
+aggregate stats (commits, files, lines added/deleted), an impact-highlights section, and per-category
+breakdowns listing each commit with its affected files. When --json is used, return the report as
+structured JSON for downstream tooling.`,
 		CostTier:        CostMedium,
 		RiskLevel:       RiskLow,
 		DefaultInterval: 72 * time.Hour,