diff --git a/src/batch.js b/src/batch.js index 4e6778d7..ba849990 100644 --- a/src/batch.js +++ b/src/batch.js @@ -6,6 +6,7 @@ */ import { complexityData } from './complexity.js'; +import { dataflowData } from './dataflow.js'; import { flowData } from './flow.js'; import { contextData, @@ -36,6 +37,7 @@ export const BATCH_COMMANDS = { impact: { fn: impactAnalysisData, sig: 'file' }, deps: { fn: fileDepsData, sig: 'file' }, flow: { fn: flowData, sig: 'name' }, + dataflow: { fn: dataflowData, sig: 'name' }, complexity: { fn: complexityData, sig: 'dbOnly' }, }; diff --git a/src/builder.js b/src/builder.js index 91ab5db6..a9ae11d4 100644 --- a/src/builder.js +++ b/src/builder.js @@ -435,7 +435,7 @@ export async function buildGraph(rootDir, opts = {}) { if (isFullBuild) { const deletions = - 'PRAGMA foreign_keys = OFF; DELETE FROM node_metrics; DELETE FROM edges; DELETE FROM function_complexity; DELETE FROM nodes; PRAGMA foreign_keys = ON;'; + 'PRAGMA foreign_keys = OFF; DELETE FROM node_metrics; DELETE FROM edges; DELETE FROM function_complexity; DELETE FROM dataflow; DELETE FROM nodes; PRAGMA foreign_keys = ON;'; db.exec( hasEmbeddings ? `${deletions.replace('PRAGMA foreign_keys = ON;', '')} DELETE FROM embeddings; PRAGMA foreign_keys = ON;` @@ -505,11 +505,20 @@ export async function buildGraph(rootDir, opts = {}) { } catch { deleteComplexityForFile = null; } + let deleteDataflowForFile; + try { + deleteDataflowForFile = db.prepare( + 'DELETE FROM dataflow WHERE source_id IN (SELECT id FROM nodes WHERE file = ?) 
OR target_id IN (SELECT id FROM nodes WHERE file = ?)', + ); + } catch { + deleteDataflowForFile = null; + } for (const relPath of removed) { deleteEmbeddingsForFile?.run(relPath); deleteEdgesForFile.run({ f: relPath }); deleteMetricsForFile.run(relPath); deleteComplexityForFile?.run(relPath); + deleteDataflowForFile?.run(relPath, relPath); deleteNodesForFile.run(relPath); } for (const item of parseChanges) { @@ -518,6 +527,7 @@ export async function buildGraph(rootDir, opts = {}) { deleteEdgesForFile.run({ f: relPath }); deleteMetricsForFile.run(relPath); deleteComplexityForFile?.run(relPath); + deleteDataflowForFile?.run(relPath, relPath); deleteNodesForFile.run(relPath); } @@ -1078,6 +1088,18 @@ export async function buildGraph(rootDir, opts = {}) { } _t.complexityMs = performance.now() - _t.complexity0; + // Opt-in dataflow analysis (--dataflow) + if (opts.dataflow) { + _t.dataflow0 = performance.now(); + try { + const { buildDataflowEdges } = await import('./dataflow.js'); + await buildDataflowEdges(db, allSymbols, rootDir, engineOpts); + } catch (err) { + debug(`Dataflow analysis failed: ${err.message}`); + } + _t.dataflowMs = performance.now() - _t.dataflow0; + } + // Release any remaining cached WASM trees for GC for (const [, symbols] of allSymbols) { symbols._tree = null; diff --git a/src/cli.js b/src/cli.js index 90f7470e..8ee3157b 100644 --- a/src/cli.js +++ b/src/cli.js @@ -97,10 +97,11 @@ program .command('build [dir]') .description('Parse repo and build graph in .codegraph/graph.db') .option('--no-incremental', 'Force full rebuild (ignore file hashes)') + .option('--dataflow', 'Extract data flow edges (flows_to, returns, mutates)') .action(async (dir, opts) => { const root = path.resolve(dir || '.'); const engine = program.opts().engine; - await buildGraph(root, { incremental: opts.incremental, engine }); + await buildGraph(root, { incremental: opts.incremental, engine, dataflow: opts.dataflow }); }); program @@ -967,6 +968,41 @@ program }); }); 
+program
+  .command('dataflow <name>')
+  .description('Show data flow for a function: parameters, return consumers, mutations')
+  .option('-d, --db <path>', 'Path to graph.db')
+  .option('-f, --file <file>', 'Scope to file (partial match)')
+  .option('-k, --kind <kind>', 'Filter by symbol kind')
+  .option('-T, --no-tests', 'Exclude test/spec files from results')
+  .option('--include-tests', 'Include test/spec files (overrides excludeTests config)')
+  .option('-j, --json', 'Output as JSON')
+  .option('--ndjson', 'Newline-delimited JSON output')
+  .option('--limit <n>', 'Max results to return')
+  .option('--offset <n>', 'Skip N results (default: 0)')
+  .option('--path <target>', 'Find data flow path to <target>')
+  .option('--impact', 'Show data-dependent blast radius')
+  .option('--depth <n>', 'Max traversal depth', '5')
+  .action(async (name, opts) => {
+    if (opts.kind && !ALL_SYMBOL_KINDS.includes(opts.kind)) {
+      console.error(`Invalid kind "${opts.kind}". Valid: ${ALL_SYMBOL_KINDS.join(', ')}`);
+      process.exit(1);
+    }
+    const { dataflow } = await import('./dataflow.js');
+    dataflow(name, opts.db, {
+      file: opts.file,
+      kind: opts.kind,
+      noTests: resolveNoTests(opts),
+      json: opts.json,
+      ndjson: opts.ndjson,
+      limit: opts.limit ? parseInt(opts.limit, 10) : undefined,
+      offset: opts.offset ? parseInt(opts.offset, 10) : undefined,
+      path: opts.path,
+      impact: opts.impact,
+      depth: opts.depth,
+    });
+  });
+
 program
   .command('complexity [target]')
   .description('Show per-function complexity metrics (cognitive, cyclomatic, nesting depth, MI)')
diff --git a/src/dataflow.js b/src/dataflow.js
new file mode 100644
index 00000000..e0ae266b
--- /dev/null
+++ b/src/dataflow.js
@@ -0,0 +1,1249 @@
+/**
+ * Dataflow analysis — define/use chains and data movement edges.
+ * + * Adds three edge types to track how data moves through functions: + * - flows_to: parameter/variable flows into another function as an argument + * - returns: a call's return value is captured and used in the caller + * - mutates: a parameter-derived value is mutated (e.g. arr.push()) + * + * Opt-in via `build --dataflow`. JS/TS only for MVP. + */ + +import fs from 'node:fs'; +import path from 'node:path'; +import { openReadonlyOrFail } from './db.js'; +import { info } from './logger.js'; +import { paginateResult } from './paginate.js'; +import { isTestFile } from './queries.js'; + +// Methods that mutate their receiver in-place +const MUTATING_METHODS = new Set([ + 'push', + 'pop', + 'shift', + 'unshift', + 'splice', + 'sort', + 'reverse', + 'fill', + 'set', + 'delete', + 'add', + 'clear', +]); + +// JS/TS language IDs that support dataflow extraction +const DATAFLOW_LANG_IDS = new Set(['javascript', 'typescript', 'tsx']); + +// ── AST helpers ────────────────────────────────────────────────────────────── + +function truncate(str, max = 120) { + if (!str) return ''; + return str.length > max ? `${str.slice(0, max)}…` : str; +} + +/** + * Get the name of a function node from the AST. + */ +function functionName(fnNode) { + if (!fnNode) return null; + const t = fnNode.type; + if (t === 'function_declaration') { + const nameNode = fnNode.childForFieldName('name'); + return nameNode ? nameNode.text : null; + } + if (t === 'method_definition') { + const nameNode = fnNode.childForFieldName('name'); + return nameNode ? nameNode.text : null; + } + // arrow_function or function_expression assigned to a variable + if (t === 'arrow_function' || t === 'function_expression') { + const parent = fnNode.parent; + if (parent?.type === 'variable_declarator') { + const nameNode = parent.childForFieldName('name'); + return nameNode ? nameNode.text : null; + } + if (parent?.type === 'pair') { + const keyNode = parent.childForFieldName('key'); + return keyNode ? 
keyNode.text : null; + } + if (parent?.type === 'assignment_expression') { + const left = parent.childForFieldName('left'); + return left ? left.text : null; + } + } + return null; +} + +/** + * Extract parameter names and indices from a formal_parameters node. + * Handles: simple identifiers, destructured objects/arrays, defaults, rest, TS typed params. + */ +function extractParams(paramsNode) { + if (!paramsNode) return []; + const result = []; + let index = 0; + for (const child of paramsNode.namedChildren) { + const names = extractParamNames(child); + for (const name of names) { + result.push({ name, index }); + } + index++; + } + return result; +} + +function extractParamNames(node) { + if (!node) return []; + const t = node.type; + if (t === 'identifier') return [node.text]; + // TS: required_parameter, optional_parameter + if (t === 'required_parameter' || t === 'optional_parameter') { + const pattern = node.childForFieldName('pattern'); + return pattern ? extractParamNames(pattern) : []; + } + if (t === 'assignment_pattern') { + const left = node.childForFieldName('left'); + return left ? 
extractParamNames(left) : []; + } + if (t === 'rest_pattern') { + // rest_pattern → ...identifier + for (const child of node.namedChildren) { + if (child.type === 'identifier') return [child.text]; + } + return []; + } + if (t === 'object_pattern') { + const names = []; + for (const child of node.namedChildren) { + if (child.type === 'shorthand_property_identifier_pattern') { + names.push(child.text); + } else if (child.type === 'pair_pattern') { + const value = child.childForFieldName('value'); + if (value) names.push(...extractParamNames(value)); + } else if (child.type === 'rest_pattern') { + names.push(...extractParamNames(child)); + } + } + return names; + } + if (t === 'array_pattern') { + const names = []; + for (const child of node.namedChildren) { + names.push(...extractParamNames(child)); + } + return names; + } + return []; +} + +/** + * Resolve the name a call expression is calling. + * Handles: `foo()`, `obj.method()`, `obj.nested.method()`. + */ +function resolveCalleeName(callNode) { + const fn = callNode.childForFieldName('function'); + if (!fn) return null; + if (fn.type === 'identifier') return fn.text; + if (fn.type === 'member_expression' || fn.type === 'optional_chain_expression') { + // Handle optional chaining: foo?.bar() or foo?.() + const target = fn.type === 'optional_chain_expression' ? fn.namedChildren[0] : fn; + if (!target) return null; + if (target.type === 'member_expression') { + const prop = target.childForFieldName('property'); + return prop ? prop.text : null; + } + if (target.type === 'identifier') return target.text; + const prop = fn.childForFieldName('property'); + return prop ? prop.text : null; + } + return null; +} + +/** + * Get the receiver (object) of a member expression. 
+ */ +function memberReceiver(memberExpr) { + const obj = memberExpr.childForFieldName('object'); + if (!obj) return null; + if (obj.type === 'identifier') return obj.text; + if (obj.type === 'member_expression') return memberReceiver(obj); + return null; +} + +// ── extractDataflow ────────────────────────────────────────────────────────── + +/** + * Extract dataflow information from a parsed AST. + * + * @param {object} tree - tree-sitter parse tree + * @param {string} filePath - relative file path + * @param {object[]} definitions - symbol definitions from the parser + * @returns {{ parameters, returns, assignments, argFlows, mutations }} + */ +export function extractDataflow(tree, _filePath, _definitions) { + const parameters = []; + const returns = []; + const assignments = []; + const argFlows = []; + const mutations = []; + + // Build a scope stack as we traverse + // Each scope: { funcName, funcNode, params: Map, locals: Map } + const scopeStack = []; + + function currentScope() { + return scopeStack.length > 0 ? scopeStack[scopeStack.length - 1] : null; + } + + function findBinding(name) { + // Search from innermost scope outward + for (let i = scopeStack.length - 1; i >= 0; i--) { + const scope = scopeStack[i]; + if (scope.params.has(name)) + return { type: 'param', index: scope.params.get(name), funcName: scope.funcName }; + if (scope.locals.has(name)) + return { type: 'local', source: scope.locals.get(name), funcName: scope.funcName }; + } + return null; + } + + function enterScope(fnNode) { + const name = functionName(fnNode); + const paramsNode = fnNode.childForFieldName('parameters'); + const paramList = extractParams(paramsNode); + const paramMap = new Map(); + for (const p of paramList) { + paramMap.set(p.name, p.index); + if (name) { + parameters.push({ + funcName: name, + paramName: p.name, + paramIndex: p.index, + line: (paramsNode?.startPosition?.row ?? 
fnNode.startPosition.row) + 1, + }); + } + } + scopeStack.push({ funcName: name, funcNode: fnNode, params: paramMap, locals: new Map() }); + } + + function exitScope() { + scopeStack.pop(); + } + + /** + * Determine confidence for a variable binding flowing as an argument. + */ + function bindingConfidence(binding) { + if (!binding) return 0.5; + if (binding.type === 'param') return 1.0; + if (binding.type === 'local') { + // Local from a call return → 0.9, from destructuring → 0.8 + if (binding.source?.type === 'call_return') return 0.9; + if (binding.source?.type === 'destructured') return 0.8; + return 0.9; + } + return 0.5; + } + + // Recursive AST walk + function visit(node) { + if (!node) return; + const t = node.type; + + // Enter function scopes + if ( + t === 'function_declaration' || + t === 'method_definition' || + t === 'arrow_function' || + t === 'function_expression' || + t === 'function' + ) { + enterScope(node); + // Visit body + for (const child of node.namedChildren) { + visit(child); + } + exitScope(); + return; + } + + // Return statements + if (t === 'return_statement') { + const scope = currentScope(); + if (scope?.funcName) { + const expr = node.namedChildren[0]; + const referencedNames = []; + if (expr) collectIdentifiers(expr, referencedNames); + returns.push({ + funcName: scope.funcName, + expression: truncate(expr ? 
expr.text : ''), + referencedNames, + line: node.startPosition.row + 1, + }); + } + // Still visit children for nested expressions + for (const child of node.namedChildren) { + visit(child); + } + return; + } + + // Variable declarations: track assignments from calls + if (t === 'variable_declarator') { + const nameNode = node.childForFieldName('name'); + const valueNode = node.childForFieldName('value'); + const scope = currentScope(); + + if (nameNode && valueNode && scope) { + // Resolve the call expression from the value (handles await wrapping) + let callExpr = null; + if (valueNode.type === 'call_expression') { + callExpr = valueNode; + } else if (valueNode.type === 'await_expression') { + const awaitChild = valueNode.namedChildren[0]; + if (awaitChild?.type === 'call_expression') callExpr = awaitChild; + } + + if (callExpr) { + const callee = resolveCalleeName(callExpr); + if (callee && scope.funcName) { + // Destructuring: const { a, b } = foo() + if (nameNode.type === 'object_pattern' || nameNode.type === 'array_pattern') { + const names = extractParamNames(nameNode); + for (const n of names) { + assignments.push({ + varName: n, + callerFunc: scope.funcName, + sourceCallName: callee, + expression: truncate(node.text), + line: node.startPosition.row + 1, + }); + scope.locals.set(n, { type: 'destructured', callee }); + } + } else { + // Simple: const x = foo() + assignments.push({ + varName: nameNode.text, + callerFunc: scope.funcName, + sourceCallName: callee, + expression: truncate(node.text), + line: node.startPosition.row + 1, + }); + scope.locals.set(nameNode.text, { type: 'call_return', callee }); + } + } + } + } + // Visit children + for (const child of node.namedChildren) { + visit(child); + } + return; + } + + // Call expressions: track argument flows + if (t === 'call_expression') { + const callee = resolveCalleeName(node); + const argsNode = node.childForFieldName('arguments'); + const scope = currentScope(); + + if (callee && argsNode && 
scope?.funcName) { + let argIndex = 0; + for (const arg of argsNode.namedChildren) { + // Handle spread arguments: foo(...args) + const unwrapped = arg.type === 'spread_element' ? arg.namedChildren[0] : arg; + if (!unwrapped) { + argIndex++; + continue; + } + const argName = unwrapped.type === 'identifier' ? unwrapped.text : null; + const argMember = + unwrapped.type === 'member_expression' ? memberReceiver(unwrapped) : null; + const trackedName = argName || argMember; + + if (trackedName) { + const binding = findBinding(trackedName); + if (binding) { + argFlows.push({ + callerFunc: scope.funcName, + calleeName: callee, + argIndex, + argName: trackedName, + binding, + confidence: bindingConfidence(binding), + expression: truncate(arg.text), + line: node.startPosition.row + 1, + }); + } + } + argIndex++; + } + } + // Visit children (but not arguments again — we handled them) + for (const child of node.namedChildren) { + visit(child); + } + return; + } + + // Assignment expressions: mutation detection + non-declaration call captures + if (t === 'assignment_expression') { + const left = node.childForFieldName('left'); + const right = node.childForFieldName('right'); + const scope = currentScope(); + + if (scope?.funcName) { + // Mutation: obj.prop = value + if (left?.type === 'member_expression') { + const receiver = memberReceiver(left); + if (receiver) { + const binding = findBinding(receiver); + if (binding) { + mutations.push({ + funcName: scope.funcName, + receiverName: receiver, + binding, + mutatingExpr: truncate(node.text), + line: node.startPosition.row + 1, + }); + } + } + } + + // Non-declaration assignment: x = foo() (without const/let/var) + if (left?.type === 'identifier' && right) { + let callExpr = null; + if (right.type === 'call_expression') { + callExpr = right; + } else if (right.type === 'await_expression') { + const awaitChild = right.namedChildren[0]; + if (awaitChild?.type === 'call_expression') callExpr = awaitChild; + } + if (callExpr) { + 
const callee = resolveCalleeName(callExpr); + if (callee) { + assignments.push({ + varName: left.text, + callerFunc: scope.funcName, + sourceCallName: callee, + expression: truncate(node.text), + line: node.startPosition.row + 1, + }); + scope.locals.set(left.text, { type: 'call_return', callee }); + } + } + } + } + + // Visit children + for (const child of node.namedChildren) { + visit(child); + } + return; + } + + // Mutation detection: mutating method calls (push, pop, splice, etc.) + if (t === 'expression_statement') { + const expr = node.namedChildren[0]; + if (expr?.type === 'call_expression') { + const fn = expr.childForFieldName('function'); + if (fn?.type === 'member_expression') { + const prop = fn.childForFieldName('property'); + if (prop && MUTATING_METHODS.has(prop.text)) { + const receiver = memberReceiver(fn); + const scope = currentScope(); + if (receiver && scope?.funcName) { + const binding = findBinding(receiver); + if (binding) { + mutations.push({ + funcName: scope.funcName, + receiverName: receiver, + binding, + mutatingExpr: truncate(expr.text), + line: node.startPosition.row + 1, + }); + } + } + } + } + } + } + + // Default: visit all children + for (const child of node.namedChildren) { + visit(child); + } + } + + visit(tree.rootNode); + + return { parameters, returns, assignments, argFlows, mutations }; +} + +/** + * Collect all identifier names referenced within a node. + */ +function collectIdentifiers(node, out) { + if (node.type === 'identifier') { + out.push(node.text); + return; + } + for (const child of node.namedChildren) { + collectIdentifiers(child, out); + } +} + +// ── buildDataflowEdges ────────────────────────────────────────────────────── + +/** + * Build dataflow edges and insert them into the database. + * Called during graph build when --dataflow is enabled. 
+ * + * @param {object} db - better-sqlite3 database instance + * @param {Map} fileSymbols - map of relPath → symbols + * @param {string} rootDir - absolute root directory + * @param {object} engineOpts - engine options + */ +export async function buildDataflowEdges(db, fileSymbols, rootDir, _engineOpts) { + // Lazily init WASM parsers if needed + let parsers = null; + let extToLang = null; + let needsFallback = false; + + for (const [relPath, symbols] of fileSymbols) { + if (!symbols._tree) { + const ext = path.extname(relPath).toLowerCase(); + if ( + ext === '.js' || + ext === '.ts' || + ext === '.tsx' || + ext === '.jsx' || + ext === '.mjs' || + ext === '.cjs' + ) { + needsFallback = true; + break; + } + } + } + + if (needsFallback) { + const { createParsers, LANGUAGE_REGISTRY } = await import('./parser.js'); + parsers = await createParsers(); + extToLang = new Map(); + for (const entry of LANGUAGE_REGISTRY) { + for (const ext of entry.extensions) { + extToLang.set(ext, entry.id); + } + } + } + + let getParserFn = null; + if (parsers) { + const mod = await import('./parser.js'); + getParserFn = mod.getParser; + } + + const insert = db.prepare( + `INSERT INTO dataflow (source_id, target_id, kind, param_index, expression, line, confidence) + VALUES (?, ?, ?, ?, ?, ?, ?)`, + ); + + const getNodeByNameAndFile = db.prepare( + `SELECT id, name, kind, file, line FROM nodes + WHERE name = ? AND file = ? AND kind IN ('function', 'method')`, + ); + + const getNodeByName = db.prepare( + `SELECT id, name, kind, file, line FROM nodes + WHERE name = ? 
AND kind IN ('function', 'method') + ORDER BY file, line LIMIT 10`, + ); + + let totalEdges = 0; + + const tx = db.transaction(() => { + for (const [relPath, symbols] of fileSymbols) { + const ext = path.extname(relPath).toLowerCase(); + // Only JS/TS for MVP + if ( + ext !== '.js' && + ext !== '.ts' && + ext !== '.tsx' && + ext !== '.jsx' && + ext !== '.mjs' && + ext !== '.cjs' + ) { + continue; + } + + let tree = symbols._tree; + + // WASM fallback if no cached tree + if (!tree) { + if (!extToLang || !getParserFn) continue; + const langId = extToLang.get(ext); + if (!langId || !DATAFLOW_LANG_IDS.has(langId)) continue; + + const absPath = path.join(rootDir, relPath); + let code; + try { + code = fs.readFileSync(absPath, 'utf-8'); + } catch { + continue; + } + + const parser = getParserFn(parsers, absPath); + if (!parser) continue; + + try { + tree = parser.parse(code); + } catch { + continue; + } + } + + const data = extractDataflow(tree, relPath, symbols.definitions); + + // Resolve function names to node IDs in this file first, then globally + function resolveNode(funcName) { + const local = getNodeByNameAndFile.all(funcName, relPath); + if (local.length > 0) return local[0]; + const global = getNodeByName.all(funcName); + return global.length > 0 ? 
global[0] : null; + } + + // flows_to: parameter/variable passed as argument to another function + for (const flow of data.argFlows) { + const sourceNode = resolveNode(flow.callerFunc); + const targetNode = resolveNode(flow.calleeName); + if (sourceNode && targetNode) { + insert.run( + sourceNode.id, + targetNode.id, + 'flows_to', + flow.argIndex, + flow.expression, + flow.line, + flow.confidence, + ); + totalEdges++; + } + } + + // returns: call return value captured in caller + for (const assignment of data.assignments) { + const producerNode = resolveNode(assignment.sourceCallName); + const consumerNode = resolveNode(assignment.callerFunc); + if (producerNode && consumerNode) { + insert.run( + producerNode.id, + consumerNode.id, + 'returns', + null, + assignment.expression, + assignment.line, + 1.0, + ); + totalEdges++; + } + } + + // mutates: parameter-derived value is mutated + for (const mut of data.mutations) { + const mutatorNode = resolveNode(mut.funcName); + if (mutatorNode && mut.binding?.type === 'param') { + // The mutation in this function affects the parameter source + insert.run( + mutatorNode.id, + mutatorNode.id, + 'mutates', + null, + mut.mutatingExpr, + mut.line, + 1.0, + ); + totalEdges++; + } + } + } + }); + + tx(); + info(`Dataflow: ${totalEdges} edges inserted`); +} + +// ── Query functions ───────────────────────────────────────────────────────── + +/** + * Look up node(s) by name with optional file/kind/noTests filtering. + * Similar to findMatchingNodes in queries.js but operates on the dataflow table. + */ +function findNodes(db, name, opts = {}) { + const kinds = opts.kind + ? 
[opts.kind] + : [ + 'function', + 'method', + 'class', + 'interface', + 'type', + 'struct', + 'enum', + 'trait', + 'record', + 'module', + ]; + const placeholders = kinds.map(() => '?').join(', '); + const params = [`%${name}%`, ...kinds]; + + let fileCondition = ''; + if (opts.file) { + fileCondition = ' AND file LIKE ?'; + params.push(`%${opts.file}%`); + } + + const rows = db + .prepare( + `SELECT id, name, kind, file, line FROM nodes + WHERE name LIKE ? AND kind IN (${placeholders})${fileCondition} + ORDER BY file, line`, + ) + .all(...params); + + return opts.noTests ? rows.filter((n) => !isTestFile(n.file)) : rows; +} + +/** + * Check if the dataflow table exists and has data. + */ +function hasDataflowTable(db) { + try { + const row = db.prepare('SELECT COUNT(*) as c FROM dataflow').get(); + return row.c > 0; + } catch { + return false; + } +} + +/** + * Return all dataflow edges for a symbol. + * + * @param {string} name - symbol name (partial match) + * @param {string} [customDbPath] - path to graph.db + * @param {object} [opts] - { noTests, file, kind, limit, offset } + * @returns {{ name, results: object[] }} + */ +export function dataflowData(name, customDbPath, opts = {}) { + const db = openReadonlyOrFail(customDbPath); + const noTests = opts.noTests || false; + + if (!hasDataflowTable(db)) { + db.close(); + return { + name, + results: [], + warning: 'No dataflow data found. Run `codegraph build --dataflow` first.', + }; + } + + const nodes = findNodes(db, name, { noTests, file: opts.file, kind: opts.kind }); + if (nodes.length === 0) { + db.close(); + return { name, results: [] }; + } + + const flowsToOut = db.prepare( + `SELECT d.*, n.name AS target_name, n.kind AS target_kind, n.file AS target_file, n.line AS target_line + FROM dataflow d JOIN nodes n ON d.target_id = n.id + WHERE d.source_id = ? 
AND d.kind = 'flows_to'`, + ); + const flowsToIn = db.prepare( + `SELECT d.*, n.name AS source_name, n.kind AS source_kind, n.file AS source_file, n.line AS source_line + FROM dataflow d JOIN nodes n ON d.source_id = n.id + WHERE d.target_id = ? AND d.kind = 'flows_to'`, + ); + const returnsOut = db.prepare( + `SELECT d.*, n.name AS target_name, n.kind AS target_kind, n.file AS target_file, n.line AS target_line + FROM dataflow d JOIN nodes n ON d.target_id = n.id + WHERE d.source_id = ? AND d.kind = 'returns'`, + ); + const returnsIn = db.prepare( + `SELECT d.*, n.name AS source_name, n.kind AS source_kind, n.file AS source_file, n.line AS source_line + FROM dataflow d JOIN nodes n ON d.source_id = n.id + WHERE d.target_id = ? AND d.kind = 'returns'`, + ); + const mutatesOut = db.prepare( + `SELECT d.*, n.name AS target_name, n.kind AS target_kind, n.file AS target_file, n.line AS target_line + FROM dataflow d JOIN nodes n ON d.target_id = n.id + WHERE d.source_id = ? AND d.kind = 'mutates'`, + ); + const mutatesIn = db.prepare( + `SELECT d.*, n.name AS source_name, n.kind AS source_kind, n.file AS source_file, n.line AS source_line + FROM dataflow d JOIN nodes n ON d.source_id = n.id + WHERE d.target_id = ? 
AND d.kind = 'mutates'`, + ); + + const results = nodes.map((node) => { + const flowsTo = flowsToOut.all(node.id).map((r) => ({ + target: r.target_name, + kind: r.target_kind, + file: r.target_file, + line: r.line, + paramIndex: r.param_index, + expression: r.expression, + confidence: r.confidence, + })); + + const flowsFrom = flowsToIn.all(node.id).map((r) => ({ + source: r.source_name, + kind: r.source_kind, + file: r.source_file, + line: r.line, + paramIndex: r.param_index, + expression: r.expression, + confidence: r.confidence, + })); + + const returnConsumers = returnsOut.all(node.id).map((r) => ({ + consumer: r.target_name, + kind: r.target_kind, + file: r.target_file, + line: r.line, + expression: r.expression, + })); + + const returnedBy = returnsIn.all(node.id).map((r) => ({ + producer: r.source_name, + kind: r.source_kind, + file: r.source_file, + line: r.line, + expression: r.expression, + })); + + const mutatesTargets = mutatesOut.all(node.id).map((r) => ({ + target: r.target_name, + expression: r.expression, + line: r.line, + })); + + const mutatedBy = mutatesIn.all(node.id).map((r) => ({ + source: r.source_name, + expression: r.expression, + line: r.line, + })); + + if (noTests) { + const filter = (arr) => arr.filter((r) => !isTestFile(r.file)); + return { + name: node.name, + kind: node.kind, + file: node.file, + line: node.line, + flowsTo: filter(flowsTo), + flowsFrom: filter(flowsFrom), + returns: returnConsumers.filter((r) => !isTestFile(r.file)), + returnedBy: returnedBy.filter((r) => !isTestFile(r.file)), + mutates: mutatesTargets, + mutatedBy, + }; + } + + return { + name: node.name, + kind: node.kind, + file: node.file, + line: node.line, + flowsTo, + flowsFrom, + returns: returnConsumers, + returnedBy, + mutates: mutatesTargets, + mutatedBy, + }; + }); + + db.close(); + const base = { name, results }; + return paginateResult(base, 'results', { limit: opts.limit, offset: opts.offset }); +} + +/** + * BFS through flows_to + returns edges to 
find how data gets from A to B. + * + * @param {string} from - source symbol name + * @param {string} to - target symbol name + * @param {string} [customDbPath] + * @param {object} [opts] - { noTests, maxDepth, limit, offset } + * @returns {{ from, to, found, hops?, path? }} + */ +export function dataflowPathData(from, to, customDbPath, opts = {}) { + const db = openReadonlyOrFail(customDbPath); + const noTests = opts.noTests || false; + const maxDepth = opts.maxDepth || 10; + + if (!hasDataflowTable(db)) { + db.close(); + return { + from, + to, + found: false, + warning: 'No dataflow data found. Run `codegraph build --dataflow` first.', + }; + } + + const fromNodes = findNodes(db, from, { noTests, file: opts.fromFile, kind: opts.kind }); + if (fromNodes.length === 0) { + db.close(); + return { from, to, found: false, error: `No symbol matching "${from}"` }; + } + + const toNodes = findNodes(db, to, { noTests, file: opts.toFile, kind: opts.kind }); + if (toNodes.length === 0) { + db.close(); + return { from, to, found: false, error: `No symbol matching "${to}"` }; + } + + const sourceNode = fromNodes[0]; + const targetNode = toNodes[0]; + + if (sourceNode.id === targetNode.id) { + db.close(); + return { + from, + to, + found: true, + hops: 0, + path: [ + { + name: sourceNode.name, + kind: sourceNode.kind, + file: sourceNode.file, + line: sourceNode.line, + edgeKind: null, + }, + ], + }; + } + + // BFS through flows_to and returns edges + const neighborStmt = db.prepare( + `SELECT n.id, n.name, n.kind, n.file, n.line, d.kind AS edge_kind, d.expression + FROM dataflow d JOIN nodes n ON d.target_id = n.id + WHERE d.source_id = ? 
AND d.kind IN ('flows_to', 'returns')`, + ); + + const visited = new Set([sourceNode.id]); + const parent = new Map(); + let queue = [sourceNode.id]; + let found = false; + + for (let depth = 1; depth <= maxDepth; depth++) { + const nextQueue = []; + for (const currentId of queue) { + const neighbors = neighborStmt.all(currentId); + for (const n of neighbors) { + if (noTests && isTestFile(n.file)) continue; + if (n.id === targetNode.id) { + if (!found) { + found = true; + parent.set(n.id, { + parentId: currentId, + edgeKind: n.edge_kind, + expression: n.expression, + }); + } + continue; + } + if (!visited.has(n.id)) { + visited.add(n.id); + parent.set(n.id, { + parentId: currentId, + edgeKind: n.edge_kind, + expression: n.expression, + }); + nextQueue.push(n.id); + } + } + } + if (found) break; + queue = nextQueue; + if (queue.length === 0) break; + } + + if (!found) { + db.close(); + return { from, to, found: false }; + } + + // Reconstruct path + const nodeById = db.prepare('SELECT id, name, kind, file, line FROM nodes WHERE id = ?'); + const pathItems = []; + let cur = targetNode.id; + while (cur !== undefined) { + const nodeRow = nodeById.get(cur); + const parentInfo = parent.get(cur); + pathItems.unshift({ + name: nodeRow.name, + kind: nodeRow.kind, + file: nodeRow.file, + line: nodeRow.line, + edgeKind: parentInfo?.edgeKind ?? null, + expression: parentInfo?.expression ?? null, + }); + cur = parentInfo?.parentId; + if (cur === sourceNode.id) { + const srcRow = nodeById.get(cur); + pathItems.unshift({ + name: srcRow.name, + kind: srcRow.kind, + file: srcRow.file, + line: srcRow.line, + edgeKind: null, + expression: null, + }); + break; + } + } + + db.close(); + return { from, to, found: true, hops: pathItems.length - 1, path: pathItems }; +} + +/** + * Forward BFS through returns edges: "if I change this function's return value, what breaks?" 
+ * + * @param {string} name - symbol name + * @param {string} [customDbPath] + * @param {object} [opts] - { noTests, depth, file, kind, limit, offset } + * @returns {{ name, results: object[] }} + */ +export function dataflowImpactData(name, customDbPath, opts = {}) { + const db = openReadonlyOrFail(customDbPath); + const maxDepth = opts.depth || 5; + const noTests = opts.noTests || false; + + if (!hasDataflowTable(db)) { + db.close(); + return { + name, + results: [], + warning: 'No dataflow data found. Run `codegraph build --dataflow` first.', + }; + } + + const nodes = findNodes(db, name, { noTests, file: opts.file, kind: opts.kind }); + if (nodes.length === 0) { + db.close(); + return { name, results: [] }; + } + + // Forward BFS: who consumes this function's return value (directly or transitively)? + const consumersStmt = db.prepare( + `SELECT DISTINCT n.id, n.name, n.kind, n.file, n.line + FROM dataflow d JOIN nodes n ON d.target_id = n.id + WHERE d.source_id = ? AND d.kind = 'returns'`, + ); + + const results = nodes.map((node) => { + const visited = new Set([node.id]); + const levels = {}; + let frontier = [node.id]; + + for (let d = 1; d <= maxDepth; d++) { + const nextFrontier = []; + for (const fid of frontier) { + const consumers = consumersStmt.all(fid); + for (const c of consumers) { + if (!visited.has(c.id) && (!noTests || !isTestFile(c.file))) { + visited.add(c.id); + nextFrontier.push(c.id); + if (!levels[d]) levels[d] = []; + levels[d].push({ name: c.name, kind: c.kind, file: c.file, line: c.line }); + } + } + } + frontier = nextFrontier; + if (frontier.length === 0) break; + } + + return { + name: node.name, + kind: node.kind, + file: node.file, + line: node.line, + levels, + totalAffected: visited.size - 1, + }; + }); + + db.close(); + const base = { name, results }; + return paginateResult(base, 'results', { limit: opts.limit, offset: opts.offset }); +} + +// ── Display formatters ────────────────────────────────────────────────────── + +/** + 
* CLI display for dataflow command. + */ +export function dataflow(name, customDbPath, opts = {}) { + if (opts.path) { + return dataflowPath(name, opts.path, customDbPath, opts); + } + if (opts.impact) { + return dataflowImpact(name, customDbPath, opts); + } + + const data = dataflowData(name, customDbPath, opts); + + if (opts.json) { + console.log(JSON.stringify(data, null, 2)); + return; + } + if (opts.ndjson) { + for (const r of data.results) { + console.log(JSON.stringify(r)); + } + return; + } + + if (data.warning) { + console.log(`⚠ ${data.warning}`); + return; + } + if (data.results.length === 0) { + console.log(`No symbols matching "${name}".`); + return; + } + + for (const r of data.results) { + console.log(`\n${r.kind} ${r.name} (${r.file}:${r.line})`); + console.log('─'.repeat(60)); + + if (r.flowsTo.length > 0) { + console.log('\n Data flows TO:'); + for (const f of r.flowsTo) { + const conf = f.confidence < 1.0 ? ` [${(f.confidence * 100).toFixed(0)}%]` : ''; + console.log(` → ${f.target} (${f.file}:${f.line}) arg[${f.paramIndex}]${conf}`); + } + } + + if (r.flowsFrom.length > 0) { + console.log('\n Data flows FROM:'); + for (const f of r.flowsFrom) { + const conf = f.confidence < 1.0 ? 
` [${(f.confidence * 100).toFixed(0)}%]` : ''; + console.log(` ← ${f.source} (${f.file}:${f.line}) arg[${f.paramIndex}]${conf}`); + } + } + + if (r.returns.length > 0) { + console.log('\n Return value consumed by:'); + for (const c of r.returns) { + console.log(` → ${c.consumer} (${c.file}:${c.line}) ${c.expression}`); + } + } + + if (r.returnedBy.length > 0) { + console.log('\n Uses return value of:'); + for (const p of r.returnedBy) { + console.log(` ← ${p.producer} (${p.file}:${p.line}) ${p.expression}`); + } + } + + if (r.mutates.length > 0) { + console.log('\n Mutates:'); + for (const m of r.mutates) { + console.log(` ✎ ${m.expression} (line ${m.line})`); + } + } + + if (r.mutatedBy.length > 0) { + console.log('\n Mutated by:'); + for (const m of r.mutatedBy) { + console.log(` ✎ ${m.source} — ${m.expression} (line ${m.line})`); + } + } + } +} + +/** + * CLI display for dataflow --path. + */ +function dataflowPath(from, to, customDbPath, opts = {}) { + const data = dataflowPathData(from, to, customDbPath, { + noTests: opts.noTests, + maxDepth: opts.depth ? Number(opts.depth) : 10, + }); + + if (opts.json) { + console.log(JSON.stringify(data, null, 2)); + return; + } + + if (data.warning) { + console.log(`⚠ ${data.warning}`); + return; + } + if (!data.found) { + console.log(data.error || `No data flow path found from "${from}" to "${to}".`); + return; + } + + console.log( + `\nData flow path: ${from} → ${to} (${data.hops} hop${data.hops !== 1 ? 's' : ''})\n`, + ); + for (let i = 0; i < data.path.length; i++) { + const p = data.path[i]; + const prefix = i === 0 ? ' ●' : ` ${'│ '.repeat(i - 1)}├─`; + const edge = p.edgeKind ? ` [${p.edgeKind}]` : ''; + console.log(`${prefix} ${p.name} (${p.file}:${p.line})${edge}`); + } +} + +/** + * CLI display for dataflow --impact. + */ +function dataflowImpact(name, customDbPath, opts = {}) { + const data = dataflowImpactData(name, customDbPath, { + noTests: opts.noTests, + depth: opts.depth ? 
Number(opts.depth) : 5, + file: opts.file, + kind: opts.kind, + limit: opts.limit, + offset: opts.offset, + }); + + if (opts.json) { + console.log(JSON.stringify(data, null, 2)); + return; + } + if (opts.ndjson) { + for (const r of data.results) { + console.log(JSON.stringify(r)); + } + return; + } + + if (data.warning) { + console.log(`⚠ ${data.warning}`); + return; + } + if (data.results.length === 0) { + console.log(`No symbols matching "${name}".`); + return; + } + + for (const r of data.results) { + console.log( + `\n${r.kind} ${r.name} (${r.file}:${r.line}) — ${r.totalAffected} data-dependent consumer${r.totalAffected !== 1 ? 's' : ''}`, + ); + for (const [level, items] of Object.entries(r.levels)) { + console.log(` Level ${level}:`); + for (const item of items) { + console.log(` ${item.name} (${item.file}:${item.line})`); + } + } + } +} diff --git a/src/db.js b/src/db.js index 9a35f322..f3f55fa4 100644 --- a/src/db.js +++ b/src/db.js @@ -144,6 +144,27 @@ export const MIGRATIONS = [ CREATE INDEX IF NOT EXISTS idx_fc_mi ON function_complexity(maintainability_index ASC); `, }, + { + version: 10, + up: ` + CREATE TABLE IF NOT EXISTS dataflow ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + source_id INTEGER NOT NULL, + target_id INTEGER NOT NULL, + kind TEXT NOT NULL, + param_index INTEGER, + expression TEXT, + line INTEGER, + confidence REAL DEFAULT 1.0, + FOREIGN KEY(source_id) REFERENCES nodes(id), + FOREIGN KEY(target_id) REFERENCES nodes(id) + ); + CREATE INDEX IF NOT EXISTS idx_dataflow_source ON dataflow(source_id); + CREATE INDEX IF NOT EXISTS idx_dataflow_target ON dataflow(target_id); + CREATE INDEX IF NOT EXISTS idx_dataflow_kind ON dataflow(kind); + CREATE INDEX IF NOT EXISTS idx_dataflow_source_kind ON dataflow(source_id, kind); + `, + }, ]; export function getBuildMeta(db, key) { diff --git a/src/index.js b/src/index.js index ae8f3f43..7c012b2d 100644 --- a/src/index.js +++ b/src/index.js @@ -46,6 +46,15 @@ export { loadConfig } from './config.js'; 
export { EXTENSIONS, IGNORE_DIRS, normalizePath } from './constants.js'; // Circular dependency detection export { findCycles, formatCycles } from './cycles.js'; +// Dataflow analysis +export { + buildDataflowEdges, + dataflow, + dataflowData, + dataflowImpactData, + dataflowPathData, + extractDataflow, +} from './dataflow.js'; // Database utilities export { findDbPath, diff --git a/src/mcp.js b/src/mcp.js index 158af38b..66cba606 100644 --- a/src/mcp.js +++ b/src/mcp.js @@ -656,6 +656,29 @@ const BASE_TOOLS = [ required: ['base', 'target'], }, }, + { + name: 'dataflow', + description: + 'Show data flow edges: what data flows in/out of a function, return value consumers, mutations. Requires build --dataflow.', + inputSchema: { + type: 'object', + properties: { + name: { type: 'string', description: 'Function/method name (partial match)' }, + mode: { + type: 'string', + enum: ['edges', 'path', 'impact'], + description: 'edges (default), path, or impact', + }, + target: { type: 'string', description: 'Target symbol for path mode' }, + depth: { type: 'number', description: 'Max depth for impact mode', default: 5 }, + file: { type: 'string', description: 'Scope to file (partial match)' }, + kind: { type: 'string', enum: ALL_SYMBOL_KINDS, description: 'Filter by symbol kind' }, + no_tests: { type: 'boolean', description: 'Exclude test files', default: false }, + ...PAGINATION_PROPS, + }, + required: ['name'], + }, + }, { name: 'check', description: @@ -1173,6 +1196,40 @@ export async function startMCPServer(customDbPath, options = {}) { result = args.format === 'mermaid' ? 
branchCompareMermaid(bcData) : bcData; break; } + case 'dataflow': { + const mode = args.mode || 'edges'; + if (mode === 'path') { + if (!args.target) { + result = { error: 'path mode requires a "target" argument' }; + break; + } + const { dataflowPathData } = await import('./dataflow.js'); + result = dataflowPathData(args.name, args.target, dbPath, { + noTests: args.no_tests, + maxDepth: args.depth ?? 10, + }); + } else if (mode === 'impact') { + const { dataflowImpactData } = await import('./dataflow.js'); + result = dataflowImpactData(args.name, dbPath, { + depth: args.depth, + file: args.file, + kind: args.kind, + noTests: args.no_tests, + limit: Math.min(args.limit ?? MCP_DEFAULTS.fn_impact, MCP_MAX_LIMIT), + offset: args.offset ?? 0, + }); + } else { + const { dataflowData } = await import('./dataflow.js'); + result = dataflowData(args.name, dbPath, { + file: args.file, + kind: args.kind, + noTests: args.no_tests, + limit: Math.min(args.limit ?? MCP_DEFAULTS.fn_deps, MCP_MAX_LIMIT), + offset: args.offset ?? 0, + }); + } + break; + } case 'check': { const { checkData } = await import('./check.js'); result = checkData(dbPath, { diff --git a/tests/integration/dataflow.test.js b/tests/integration/dataflow.test.js new file mode 100644 index 00000000..2ba0a70a --- /dev/null +++ b/tests/integration/dataflow.test.js @@ -0,0 +1,291 @@ +/** + * Integration tests for dataflow analysis queries. 
Uses a hand-crafted on-disk temp DB with known dataflow topology:
1.0, + ); +} + +// ─── Fixture DB ──────────────────────────────────────────────────────── + +let tmpDir, dbPath; + +beforeAll(() => { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-dataflow-')); + fs.mkdirSync(path.join(tmpDir, '.codegraph')); + dbPath = path.join(tmpDir, '.codegraph', 'graph.db'); + + const db = new Database(dbPath); + db.pragma('journal_mode = WAL'); + initSchema(db); + + // Nodes + const processData = insertNode(db, 'processData', 'function', 'src/process.js', 10); + const transform = insertNode(db, 'transform', 'function', 'src/transform.js', 5); + const format = insertNode(db, 'format', 'function', 'src/format.js', 1); + const pipeline = insertNode(db, 'pipeline', 'function', 'src/pipeline.js', 1); + const loadData = insertNode(db, 'loadData', 'function', 'src/loader.js', 1); + + // Test file nodes + const testHelper = insertNode(db, 'testProcessData', 'function', 'tests/process.test.js', 5); + + // flows_to: processData → transform (arg 0) + insertDataflow(db, processData, transform, 'flows_to', { + paramIndex: 0, + expression: 'input', + line: 12, + confidence: 1.0, + }); + + // flows_to: processData → format (arg 0) + insertDataflow(db, processData, format, 'flows_to', { + paramIndex: 0, + expression: 'result', + line: 14, + confidence: 0.9, + }); + + // returns: transform → processData (return value captured) + insertDataflow(db, transform, processData, 'returns', { + expression: 'const result = transform(input)', + line: 12, + }); + + // mutates: processData mutates itself (parameter mutation) + insertDataflow(db, processData, processData, 'mutates', { + expression: 'input.items.push(newItem)', + line: 15, + }); + + // flows_to: pipeline → processData (arg 0) + insertDataflow(db, pipeline, processData, 'flows_to', { + paramIndex: 0, + expression: 'raw', + line: 3, + confidence: 1.0, + }); + + // returns: loadData → pipeline + insertDataflow(db, loadData, pipeline, 'returns', { + expression: 'const raw = loadData()', + line: 
2, + }); + + // flows_to from test file + insertDataflow(db, testHelper, processData, 'flows_to', { + paramIndex: 0, + expression: 'testInput', + line: 7, + }); + + db.close(); +}); + +afterAll(() => { + fs.rmSync(tmpDir, { recursive: true, force: true }); +}); + +// ─── dataflowData ────────────────────────────────────────────────────── + +describe('dataflowData', () => { + test('returns flows_to edges for a symbol', () => { + const data = dataflowData('processData', dbPath, { noTests: true }); + expect(data.results).toHaveLength(1); + const r = data.results[0]; + expect(r.name).toBe('processData'); + expect(r.flowsTo).toHaveLength(2); + expect(r.flowsTo).toEqual( + expect.arrayContaining([ + expect.objectContaining({ target: 'transform', paramIndex: 0 }), + expect.objectContaining({ target: 'format', paramIndex: 0, confidence: 0.9 }), + ]), + ); + }); + + test('returns flowsFrom edges', () => { + const data = dataflowData('transform', dbPath); + const r = data.results[0]; + expect(r.flowsFrom).toHaveLength(1); + expect(r.flowsFrom[0].source).toBe('processData'); + }); + + test('returns return-value consumers', () => { + const data = dataflowData('transform', dbPath); + const r = data.results[0]; + expect(r.returns).toHaveLength(1); + expect(r.returns[0].consumer).toBe('processData'); + }); + + test('returns returnedBy edges', () => { + const data = dataflowData('processData', dbPath); + const r = data.results[0]; + expect(r.returnedBy).toHaveLength(1); + expect(r.returnedBy[0].producer).toBe('transform'); + }); + + test('returns mutates edges', () => { + const data = dataflowData('processData', dbPath); + const r = data.results[0]; + expect(r.mutates).toHaveLength(1); + expect(r.mutates[0].expression).toContain('push'); + }); + + test('returns empty results for unknown symbol', () => { + const data = dataflowData('nonExistent', dbPath); + expect(data.results).toHaveLength(0); + }); + + test('--no-tests excludes test file edges', () => { + const data = 
dataflowData('processData', dbPath, { noTests: true }); + const r = data.results[0]; + // testHelper flows_to processData should be excluded + const testFlows = r.flowsFrom.filter((f) => f.file?.includes('test')); + expect(testFlows).toHaveLength(0); + }); + + test('pagination works', () => { + const data = dataflowData('processData', dbPath, { limit: 1, offset: 0 }); + expect(data.results).toHaveLength(1); + }); +}); + +// ─── dataflowPathData ────────────────────────────────────────────────── + +describe('dataflowPathData', () => { + test('finds data flow path between two symbols', () => { + const data = dataflowPathData('processData', 'format', dbPath); + expect(data.found).toBe(true); + expect(data.hops).toBeGreaterThan(0); + expect(data.path).toBeDefined(); + expect(data.path[0].name).toBe('processData'); + expect(data.path[data.path.length - 1].name).toBe('format'); + }); + + test('finds multi-hop path', () => { + const data = dataflowPathData('pipeline', 'transform', dbPath); + expect(data.found).toBe(true); + expect(data.hops).toBeGreaterThanOrEqual(2); + }); + + test('returns found=false when no path exists', () => { + const data = dataflowPathData('format', 'loadData', dbPath); + expect(data.found).toBe(false); + }); + + test('handles self-path', () => { + const data = dataflowPathData('processData', 'processData', dbPath); + expect(data.found).toBe(true); + expect(data.hops).toBe(0); + }); + + test('returns error for unknown symbol', () => { + const data = dataflowPathData('nonExistent', 'format', dbPath); + expect(data.found).toBe(false); + expect(data.error).toBeDefined(); + }); +}); + +// ─── dataflowImpactData ──────────────────────────────────────────────── + +describe('dataflowImpactData', () => { + test('shows return-value-dependent blast radius', () => { + const data = dataflowImpactData('transform', dbPath); + expect(data.results).toHaveLength(1); + const r = data.results[0]; + expect(r.totalAffected).toBeGreaterThan(0); + // transform returns → 
processData + expect(r.levels[1]).toEqual( + expect.arrayContaining([expect.objectContaining({ name: 'processData' })]), + ); + }); + + test('shows transitive impact through return chains', () => { + const data = dataflowImpactData('loadData', dbPath); + const r = data.results[0]; + // loadData returns → pipeline (level 1) + expect(r.levels[1]).toEqual( + expect.arrayContaining([expect.objectContaining({ name: 'pipeline' })]), + ); + }); + + test('returns empty for symbol with no return consumers', () => { + const data = dataflowImpactData('format', dbPath); + const r = data.results[0]; + expect(r.totalAffected).toBe(0); + }); + + test('respects depth limit', () => { + const data = dataflowImpactData('loadData', dbPath, { depth: 1 }); + const r = data.results[0]; + expect(r.levels[2]).toBeUndefined(); + }); +}); + +// ─── Empty dataflow table ────────────────────────────────────────────── + +describe('empty dataflow', () => { + let emptyDbPath; + + beforeAll(() => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-df-empty-')); + fs.mkdirSync(path.join(dir, '.codegraph')); + emptyDbPath = path.join(dir, '.codegraph', 'graph.db'); + const db = new Database(emptyDbPath); + db.pragma('journal_mode = WAL'); + initSchema(db); + // Insert a node but no dataflow edges + insertNode(db, 'lonely', 'function', 'src/lonely.js', 1); + db.close(); + }); + + test('dataflowData returns warning when no dataflow data', () => { + const data = dataflowData('lonely', emptyDbPath); + expect(data.warning).toBeDefined(); + expect(data.results).toHaveLength(0); + }); + + test('dataflowPathData returns warning', () => { + const data = dataflowPathData('lonely', 'lonely', emptyDbPath); + expect(data.warning).toBeDefined(); + }); + + test('dataflowImpactData returns warning', () => { + const data = dataflowImpactData('lonely', emptyDbPath); + expect(data.warning).toBeDefined(); + }); +}); diff --git a/tests/parsers/dataflow-javascript.test.js 
b/tests/parsers/dataflow-javascript.test.js new file mode 100644 index 00000000..7ea83193 --- /dev/null +++ b/tests/parsers/dataflow-javascript.test.js @@ -0,0 +1,369 @@ +/** + * Unit tests for extractDataflow() against parsed JS/TS ASTs. + */ +import { beforeAll, describe, expect, it } from 'vitest'; +import { extractDataflow } from '../../src/dataflow.js'; +import { createParsers } from '../../src/parser.js'; + +describe('extractDataflow — JavaScript', () => { + let parsers; + + beforeAll(async () => { + parsers = await createParsers(); + }); + + function parseAndExtract(code) { + const parser = parsers.get('javascript'); + const tree = parser.parse(code); + return extractDataflow(tree, 'test.js', []); + } + + // ── Parameter extraction ────────────────────────────────────────────── + + describe('parameters', () => { + it('extracts simple parameters', () => { + const data = parseAndExtract(`function add(a, b) { return a + b; }`); + expect(data.parameters).toEqual( + expect.arrayContaining([ + expect.objectContaining({ funcName: 'add', paramName: 'a', paramIndex: 0 }), + expect.objectContaining({ funcName: 'add', paramName: 'b', paramIndex: 1 }), + ]), + ); + }); + + it('extracts destructured object parameters', () => { + const data = parseAndExtract(`function greet({ name, age }) { return name; }`); + expect(data.parameters).toEqual( + expect.arrayContaining([ + expect.objectContaining({ funcName: 'greet', paramName: 'name' }), + expect.objectContaining({ funcName: 'greet', paramName: 'age' }), + ]), + ); + }); + + it('extracts destructured array parameters', () => { + const data = parseAndExtract(`function first([head, tail]) { return head; }`); + expect(data.parameters).toEqual( + expect.arrayContaining([ + expect.objectContaining({ funcName: 'first', paramName: 'head' }), + expect.objectContaining({ funcName: 'first', paramName: 'tail' }), + ]), + ); + }); + + it('extracts default parameters', () => { + const data = parseAndExtract(`function inc(x, step = 1) { 
return x + step; }`); + expect(data.parameters).toEqual( + expect.arrayContaining([ + expect.objectContaining({ funcName: 'inc', paramName: 'x', paramIndex: 0 }), + expect.objectContaining({ funcName: 'inc', paramName: 'step', paramIndex: 1 }), + ]), + ); + }); + + it('extracts rest parameters', () => { + const data = parseAndExtract(`function sum(...nums) { return nums.reduce((a,b) => a+b); }`); + expect(data.parameters).toEqual( + expect.arrayContaining([ + expect.objectContaining({ funcName: 'sum', paramName: 'nums', paramIndex: 0 }), + ]), + ); + }); + + it('extracts arrow function parameters', () => { + const data = parseAndExtract(`const multiply = (x, y) => x * y;`); + expect(data.parameters).toEqual( + expect.arrayContaining([ + expect.objectContaining({ funcName: 'multiply', paramName: 'x', paramIndex: 0 }), + expect.objectContaining({ funcName: 'multiply', paramName: 'y', paramIndex: 1 }), + ]), + ); + }); + }); + + // ── Return statements ───────────────────────────────────────────────── + + describe('returns', () => { + it('captures return expressions', () => { + const data = parseAndExtract(`function double(x) { return x * 2; }`); + expect(data.returns).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + funcName: 'double', + referencedNames: expect.arrayContaining(['x']), + }), + ]), + ); + }); + + it('captures return with call expression', () => { + const data = parseAndExtract(`function process(items) { return items.map(x => x); }`); + expect(data.returns).toHaveLength(1); + expect(data.returns[0].funcName).toBe('process'); + expect(data.returns[0].referencedNames).toContain('items'); + }); + }); + + // ── Assignment from calls ───────────────────────────────────────────── + + describe('assignments', () => { + it('tracks const x = foo()', () => { + const data = parseAndExtract(` + function main() { + const result = compute(); + return result; + } + `); + expect(data.assignments).toEqual( + expect.arrayContaining([ + 
expect.objectContaining({ + varName: 'result', + callerFunc: 'main', + sourceCallName: 'compute', + }), + ]), + ); + }); + + it('tracks destructured assignment from call', () => { + const data = parseAndExtract(` + function load() { + const { data, error } = fetchData(); + return data; + } + `); + expect(data.assignments).toEqual( + expect.arrayContaining([ + expect.objectContaining({ varName: 'data', sourceCallName: 'fetchData' }), + expect.objectContaining({ varName: 'error', sourceCallName: 'fetchData' }), + ]), + ); + }); + }); + + // ── Argument flows ──────────────────────────────────────────────────── + + describe('argFlows', () => { + it('detects parameter passed as argument', () => { + const data = parseAndExtract(` + function process(input) { + transform(input); + } + `); + expect(data.argFlows).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + callerFunc: 'process', + calleeName: 'transform', + argIndex: 0, + argName: 'input', + confidence: 1.0, + }), + ]), + ); + }); + + it('detects variable intermediary with call return source', () => { + const data = parseAndExtract(` + function pipeline() { + const val = getData(); + process(val); + } + `); + expect(data.argFlows).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + callerFunc: 'pipeline', + calleeName: 'process', + argName: 'val', + confidence: 0.9, + }), + ]), + ); + }); + + it('tracks multiple arguments', () => { + const data = parseAndExtract(` + function run(a, b) { + combine(a, b); + } + `); + const flows = data.argFlows.filter((f) => f.calleeName === 'combine'); + expect(flows).toHaveLength(2); + expect(flows[0].argIndex).toBe(0); + expect(flows[1].argIndex).toBe(1); + }); + }); + + // ── Mutation detection ──────────────────────────────────────────────── + + describe('mutations', () => { + it('detects push on parameter', () => { + const data = parseAndExtract(` + function addItem(list, item) { + list.push(item); + } + `); + expect(data.mutations).toEqual( + 
expect.arrayContaining([ + expect.objectContaining({ + funcName: 'addItem', + receiverName: 'list', + }), + ]), + ); + expect(data.mutations[0].mutatingExpr).toContain('push'); + }); + + it('detects property assignment on parameter', () => { + const data = parseAndExtract(` + function setName(obj, name) { + obj.name = name; + } + `); + expect(data.mutations).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + funcName: 'setName', + receiverName: 'obj', + }), + ]), + ); + }); + + it('detects splice mutation', () => { + const data = parseAndExtract(` + function removeFirst(arr) { + arr.splice(0, 1); + } + `); + expect(data.mutations).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + funcName: 'removeFirst', + receiverName: 'arr', + }), + ]), + ); + }); + }); + + // ── Nested scopes ──────────────────────────────────────────────────── + + describe('nested scopes', () => { + it('separates parameters of outer and inner functions', () => { + const data = parseAndExtract(` + function outer(x) { + function inner(y) { + return y; + } + return inner(x); + } + `); + const outerParams = data.parameters.filter((p) => p.funcName === 'outer'); + const innerParams = data.parameters.filter((p) => p.funcName === 'inner'); + expect(outerParams).toHaveLength(1); + expect(outerParams[0].paramName).toBe('x'); + expect(innerParams).toHaveLength(1); + expect(innerParams[0].paramName).toBe('y'); + }); + + it('tracks argument flow from outer to inner function', () => { + const data = parseAndExtract(` + function outer(x) { + function inner(y) { + return y; + } + return inner(x); + } + `); + expect(data.argFlows).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + callerFunc: 'outer', + calleeName: 'inner', + argName: 'x', + }), + ]), + ); + }); + }); + + // ── Arrow implicit returns ──────────────────────────────────────────── + + describe('arrow functions', () => { + it('extracts parameters from arrow expressions', () => { + const data = 
parseAndExtract(`const square = (n) => n * n;`); + expect(data.parameters).toEqual( + expect.arrayContaining([ + expect.objectContaining({ funcName: 'square', paramName: 'n', paramIndex: 0 }), + ]), + ); + }); + }); + + // ── Spread arguments ────────────────────────────────────────────────── + + describe('spread arguments', () => { + it('tracks spread argument flow', () => { + const data = parseAndExtract(` + function forward(items) { + process(...items); + } + `); + expect(data.argFlows).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + callerFunc: 'forward', + calleeName: 'process', + argName: 'items', + }), + ]), + ); + }); + }); + + // ── Non-declaration assignments ─────────────────────────────────────── + + describe('non-declaration assignments', () => { + it('tracks x = foo() without const/let/var', () => { + const data = parseAndExtract(` + function update() { + let result; + result = compute(); + return result; + } + `); + expect(data.assignments).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + varName: 'result', + callerFunc: 'update', + sourceCallName: 'compute', + }), + ]), + ); + }); + }); + + // ── Optional chaining ───────────────────────────────────────────────── + + describe('optional chaining', () => { + it('resolves callee name through optional chain', () => { + const data = parseAndExtract(` + function safeFetch(client) { + client?.fetch(client); + } + `); + expect(data.argFlows).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + callerFunc: 'safeFetch', + calleeName: 'fetch', + argName: 'client', + }), + ]), + ); + }); + }); +}); diff --git a/tests/unit/mcp.test.js b/tests/unit/mcp.test.js index 8a57799e..395878ec 100644 --- a/tests/unit/mcp.test.js +++ b/tests/unit/mcp.test.js @@ -38,6 +38,7 @@ const ALL_TOOL_NAMES = [ 'batch_query', 'triage', 'branch_compare', + 'dataflow', 'check', 'list_repos', ];