Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,450 changes: 1,450 additions & 0 deletions crates/codegraph-core/src/dataflow.rs

Large diffs are not rendered by default.

19 changes: 15 additions & 4 deletions crates/codegraph-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,31 @@ pub mod cycles;
pub mod incremental;
pub mod complexity;
pub mod cfg;
pub mod dataflow;

use napi_derive::napi;
use types::*;

/// Parse a single file and return extracted symbols.
/// Dataflow analysis is extracted only when `include_dataflow` is `Some(true)`; `None` is treated as `false`.
#[napi]
pub fn parse_file(file_path: String, source: String) -> Option<FileSymbols> {
parallel::parse_file(&file_path, &source)
pub fn parse_file(
    file_path: String,
    source: String,
    include_dataflow: Option<bool>,
) -> Option<FileSymbols> {
    // An omitted flag (`None`) means "do not extract dataflow".
    let with_dataflow = include_dataflow.unwrap_or(false);
    parallel::parse_file(file_path.as_str(), source.as_str(), with_dataflow)
}

/// Parse multiple files in parallel and return all extracted symbols.
/// Dataflow analysis is extracted only when `include_dataflow` is `Some(true)`; `None` is treated as `false`.
#[napi]
pub fn parse_files(file_paths: Vec<String>, root_dir: String) -> Vec<FileSymbols> {
parallel::parse_files_parallel(&file_paths, &root_dir)
pub fn parse_files(
    file_paths: Vec<String>,
    root_dir: String,
    include_dataflow: Option<bool>,
) -> Vec<FileSymbols> {
    // An omitted flag (`None`) means "do not extract dataflow".
    let with_dataflow = include_dataflow.unwrap_or(false);
    parallel::parse_files_parallel(file_paths.as_slice(), root_dir.as_str(), with_dataflow)
}

/// Resolve a single import path.
Expand Down
17 changes: 15 additions & 2 deletions crates/codegraph-core/src/parallel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,20 @@ use rayon::prelude::*;
use std::fs;
use tree_sitter::Parser;

use crate::dataflow::extract_dataflow;
use crate::extractors::extract_symbols;
use crate::parser_registry::LanguageKind;
use crate::types::FileSymbols;

/// Parse multiple files in parallel using rayon.
/// Each thread creates its own Parser (cheap; Language objects are Send+Sync).
/// Failed files are silently skipped (matches WASM behavior).
pub fn parse_files_parallel(file_paths: &[String], _root_dir: &str) -> Vec<FileSymbols> {
/// When `include_dataflow` is false, dataflow extraction is skipped for performance.
pub fn parse_files_parallel(
file_paths: &[String],
_root_dir: &str,
include_dataflow: bool,
) -> Vec<FileSymbols> {
file_paths
.par_iter()
.filter_map(|file_path| {
Expand All @@ -24,14 +30,18 @@ pub fn parse_files_parallel(file_paths: &[String], _root_dir: &str) -> Vec<FileS

let tree = parser.parse(&source, None)?;
let mut symbols = extract_symbols(lang, &tree, &source, file_path);
if include_dataflow {
symbols.dataflow = extract_dataflow(&tree, &source, lang.lang_id_str());
}
symbols.line_count = Some(line_count);
Some(symbols)
})
.collect()
}

/// Parse a single file and return its symbols.
pub fn parse_file(file_path: &str, source: &str) -> Option<FileSymbols> {
/// When `include_dataflow` is false, dataflow extraction is skipped for performance.
pub fn parse_file(file_path: &str, source: &str, include_dataflow: bool) -> Option<FileSymbols> {
let lang = LanguageKind::from_extension(file_path)?;
let source_bytes = source.as_bytes();

Expand All @@ -43,6 +53,9 @@ pub fn parse_file(file_path: &str, source: &str) -> Option<FileSymbols> {
let tree = parser.parse(source_bytes, None)?;
let line_count = source_bytes.iter().filter(|&&b| b == b'\n').count() as u32 + 1;
let mut symbols = extract_symbols(lang, &tree, source_bytes, file_path);
if include_dataflow {
symbols.dataflow = extract_dataflow(&tree, source_bytes, lang.lang_id_str());
}
symbols.line_count = Some(line_count);
Some(symbols)
}
18 changes: 18 additions & 0 deletions crates/codegraph-core/src/parser_registry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,24 @@ pub enum LanguageKind {
}

impl LanguageKind {
/// Map this language to the lowercase identifier used when looking up dataflow/cfg rules.
///
/// The returned strings are meant to match the keys of the JS `DATAFLOW_RULES` map in
/// `src/dataflow.js`, so any change here must be mirrored there.
pub fn lang_id_str(&self) -> &'static str {
    // Arms are listed alphabetically by identifier; the match stays exhaustive on purpose so
    // that adding a variant forces an update here.
    match *self {
        Self::CSharp => "csharp",
        Self::Go => "go",
        Self::Hcl => "hcl",
        Self::Java => "java",
        Self::JavaScript => "javascript",
        Self::Php => "php",
        Self::Python => "python",
        Self::Ruby => "ruby",
        Self::Rust => "rust",
        Self::Tsx => "tsx",
        Self::TypeScript => "typescript",
    }
}

/// Determine language from file extension — mirrors `getParser()` in parser.js
pub fn from_extension(file_path: &str) -> Option<Self> {
let path = Path::new(file_path);
Expand Down
83 changes: 83 additions & 0 deletions crates/codegraph-core/src/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,87 @@ pub struct AstNode {
pub receiver: Option<String>,
}

// ─── Dataflow Types ──────────────────────────────────────────────────────

/// One element of `DataflowResult::parameters`: a single parameter record.
///
/// Crosses the N-API boundary as a plain object; the JS side reads the fields under their
/// `js_name` (camelCase) spellings.
#[napi(object)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DataflowParam {
/// Presumably the name of the function that declares the parameter — confirm against the extractor.
#[napi(js_name = "funcName")]
pub func_name: String,
/// Name of the parameter.
#[napi(js_name = "paramName")]
pub param_name: String,
/// Position of the parameter; NOTE(review): looks zero-based by convention — confirm.
#[napi(js_name = "paramIndex")]
pub param_index: u32,
/// Source line of the record; whether it is 0- or 1-based is not visible here.
pub line: u32,
}

/// One element of `DataflowResult::returns`: a single return record.
///
/// Crosses the N-API boundary as a plain object; the JS side reads the fields under their
/// `js_name` (camelCase) spellings.
#[napi(object)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DataflowReturn {
/// Presumably the name of the function containing the return — confirm against the extractor.
#[napi(js_name = "funcName")]
pub func_name: String,
/// Text of the returned expression (the JS normalizer substitutes `''` when it is missing).
pub expression: String,
/// Names referenced by the expression (the JS normalizer substitutes `[]` when missing).
#[napi(js_name = "referencedNames")]
pub referenced_names: Vec<String>,
/// Source line of the record; whether it is 0- or 1-based is not visible here.
pub line: u32,
}

/// One element of `DataflowResult::assignments`: a single assignment record.
///
/// Crosses the N-API boundary as a plain object; the JS side reads the fields under their
/// `js_name` (camelCase) spellings.
#[napi(object)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DataflowAssignment {
/// Name of the variable being assigned.
#[napi(js_name = "varName")]
pub var_name: String,
/// Enclosing function, if any; `None` is normalized to `null` on the JS side.
/// NOTE(review): `None` presumably means module/top-level scope — confirm.
#[napi(js_name = "callerFunc")]
pub caller_func: Option<String>,
/// Presumably the name of the call whose result is assigned — confirm against the extractor.
#[napi(js_name = "sourceCallName")]
pub source_call_name: String,
/// Text of the assigned expression (the JS normalizer substitutes `''` when it is missing).
pub expression: String,
/// Source line of the record; whether it is 0- or 1-based is not visible here.
pub line: u32,
}

/// One element of `DataflowResult::arg_flows`: a single argument-to-callee record.
///
/// Crosses the N-API boundary as a plain object; the JS side reads the fields under their
/// `js_name` (camelCase) spellings.
#[napi(object)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DataflowArgFlow {
/// Enclosing function of the call, if any; `None` is normalized to `null` on the JS side.
#[napi(js_name = "callerFunc")]
pub caller_func: Option<String>,
/// Name of the function being called.
#[napi(js_name = "calleeName")]
pub callee_name: String,
/// Position of the argument; NOTE(review): looks zero-based by convention — confirm.
#[napi(js_name = "argIndex")]
pub arg_index: u32,
/// Name of the argument when it has one; `None` is normalized to `null` on the JS side.
#[napi(js_name = "argName")]
pub arg_name: Option<String>,
/// Binding kind as a string; the JS normalizer reshapes it into `binding: { type }`, or
/// `null` when absent/empty. The set of valid values is not visible here.
#[napi(js_name = "bindingType")]
pub binding_type: Option<String>,
/// Confidence score passed through unchanged to JS; its range is not visible here.
pub confidence: f64,
/// Text of the argument expression (the JS normalizer substitutes `''` when it is missing).
pub expression: String,
/// Source line of the record; whether it is 0- or 1-based is not visible here.
pub line: u32,
}

/// One element of `DataflowResult::mutations`: a single mutation record.
///
/// Crosses the N-API boundary as a plain object; the JS side reads the fields under their
/// `js_name` (camelCase) spellings.
#[napi(object)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DataflowMutation {
/// Enclosing function, if any; `None` is normalized to `null` on the JS side.
#[napi(js_name = "funcName")]
pub func_name: Option<String>,
/// Presumably the name of the value being mutated — confirm against the extractor.
#[napi(js_name = "receiverName")]
pub receiver_name: String,
/// Binding kind as a string; the JS normalizer reshapes it into `binding: { type }`, or
/// `null` when absent/empty. The set of valid values is not visible here.
#[napi(js_name = "bindingType")]
pub binding_type: Option<String>,
/// Text of the mutating expression.
#[napi(js_name = "mutatingExpr")]
pub mutating_expr: String,
/// Source line of the record; whether it is 0- or 1-based is not visible here.
pub line: u32,
}

/// Per-file dataflow data, stored in `FileSymbols::dataflow`.
///
/// The parse entry points assign it from `extract_dataflow` only when dataflow extraction is
/// requested; otherwise the field keeps whatever `extract_symbols` left there (`FileSymbols::new`
/// initializes it to `None`). The JS side consumes it as `symbols.dataflow`.
#[napi(object)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DataflowResult {
/// Parameter records.
pub parameters: Vec<DataflowParam>,
/// Return records.
pub returns: Vec<DataflowReturn>,
/// Assignment records.
pub assignments: Vec<DataflowAssignment>,
/// Argument-flow records (read as `argFlows` on the JS side).
#[napi(js_name = "argFlows")]
pub arg_flows: Vec<DataflowArgFlow>,
/// Mutation records.
pub mutations: Vec<DataflowMutation>,
}

#[napi(object)]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileSymbols {
Expand All @@ -186,6 +267,7 @@ pub struct FileSymbols {
pub exports: Vec<ExportInfo>,
#[napi(js_name = "astNodes")]
pub ast_nodes: Vec<AstNode>,
pub dataflow: Option<DataflowResult>,
pub line_count: Option<u32>,
}

Expand All @@ -199,6 +281,7 @@ impl FileSymbols {
classes: Vec::new(),
exports: Vec::new(),
ast_nodes: Vec::new(),
dataflow: None,
line_count: None,
}
}
Expand Down
8 changes: 6 additions & 2 deletions src/builder.js
Original file line number Diff line number Diff line change
Expand Up @@ -444,7 +444,7 @@ export async function buildGraph(rootDir, opts = {}) {
opts.incremental !== false && config.build && config.build.incremental !== false;

// Engine selection: 'native', 'wasm', or 'auto' (default)
const engineOpts = { engine: opts.engine || 'auto' };
const engineOpts = { engine: opts.engine || 'auto', dataflow: opts.dataflow !== false };
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Dataflow computed unconditionally in CFG-only pending analysis pass

engineOpts.dataflow is set to opts.dataflow !== false (defaults to true) for all parseFilesAuto calls. In the incremental "pending analysis" path at line 551, parseFilesAuto is called inside if (needsCfg || needsDataflow). When only needsCfg is true (e.g. a build that only needs to refresh CFG data), the native engine will traverse every file's full AST and populate symbols.dataflow — work that is immediately discarded because if (needsDataflow) at line 556 is false.

Before this PR, engineOpts had no dataflow key, so !!opts.dataflow resolved to false and the native engine skipped dataflow extraction entirely on this path. Now it is always computed by default, adding non-trivial overhead to incremental CFG-only rebuilds on large codebases.

A targeted fix for the pending-analysis call site:

const analysisOpts = {
  ...engineOpts,
  dataflow: needsDataflow && opts.dataflow !== false,
};
const analysisSymbols = await parseFilesAuto(files, rootDir, analysisOpts);

The main build path (line 711 → allSymbols) is fine as-is because buildDataflowEdges is guarded by if (opts.dataflow !== false) and uses the same symbol map.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in e52c533 — the pending-analysis call site now overrides engineOpts.dataflow to only compute dataflow when needsDataflow is true. CFG-only incremental rebuilds no longer pay the dataflow extraction cost.

const { name: engineName, version: engineVersion } = getActiveEngine(engineOpts);
info(`Using ${engineName} engine${engineVersion ? ` (v${engineVersion})` : ''}`);

Expand Down Expand Up @@ -548,7 +548,11 @@ export async function buildGraph(rootDir, opts = {}) {

if (needsCfg || needsDataflow) {
info('No file changes. Running pending analysis pass...');
const analysisSymbols = await parseFilesAuto(files, rootDir, engineOpts);
const analysisOpts = {
...engineOpts,
dataflow: needsDataflow && opts.dataflow !== false,
};
const analysisSymbols = await parseFilesAuto(files, rootDir, analysisOpts);
if (needsCfg) {
const { buildCFGData } = await import('./cfg.js');
await buildCFGData(db, analysisSymbols, rootDir, engineOpts);
Expand Down
62 changes: 33 additions & 29 deletions src/dataflow.js
Original file line number Diff line number Diff line change
Expand Up @@ -1009,7 +1009,7 @@ export async function buildDataflowEdges(db, fileSymbols, rootDir, _engineOpts)
let needsFallback = false;

for (const [relPath, symbols] of fileSymbols) {
if (!symbols._tree) {
if (!symbols._tree && !symbols.dataflow) {
const ext = path.extname(relPath).toLowerCase();
if (DATAFLOW_EXTENSIONS.has(ext)) {
needsFallback = true;
Expand Down Expand Up @@ -1061,41 +1061,45 @@ export async function buildDataflowEdges(db, fileSymbols, rootDir, _engineOpts)
const ext = path.extname(relPath).toLowerCase();
if (!DATAFLOW_EXTENSIONS.has(ext)) continue;

let tree = symbols._tree;
let langId = symbols._langId;
// Use native dataflow data if available — skip WASM extraction
let data = symbols.dataflow;
if (!data) {
let tree = symbols._tree;
let langId = symbols._langId;

// WASM fallback if no cached tree
if (!tree) {
if (!extToLang || !getParserFn) continue;
langId = extToLang.get(ext);
if (!langId || !DATAFLOW_LANG_IDS.has(langId)) continue;

const absPath = path.join(rootDir, relPath);
let code;
try {
code = fs.readFileSync(absPath, 'utf-8');
} catch {
continue;
}

// WASM fallback if no cached tree
if (!tree) {
if (!extToLang || !getParserFn) continue;
langId = extToLang.get(ext);
if (!langId || !DATAFLOW_LANG_IDS.has(langId)) continue;
const parser = getParserFn(parsers, absPath);
if (!parser) continue;

const absPath = path.join(rootDir, relPath);
let code;
try {
code = fs.readFileSync(absPath, 'utf-8');
} catch {
continue;
try {
tree = parser.parse(code);
} catch {
continue;
}
}

const parser = getParserFn(parsers, absPath);
if (!parser) continue;

try {
tree = parser.parse(code);
} catch {
continue;
if (!langId) {
langId = extToLang ? extToLang.get(ext) : null;
if (!langId) continue;
}
}

if (!langId) {
langId = extToLang ? extToLang.get(ext) : null;
if (!langId) continue;
}

if (!DATAFLOW_RULES.has(langId)) continue;
if (!DATAFLOW_RULES.has(langId)) continue;

const data = extractDataflow(tree, relPath, symbols.definitions, langId);
data = extractDataflow(tree, relPath, symbols.definitions, langId);
}

// Resolve function names to node IDs in this file first, then globally
function resolveNode(funcName) {
Expand Down
44 changes: 42 additions & 2 deletions src/parser.js
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,46 @@ function normalizeNativeSymbols(result) {
text: n.text ?? null,
receiver: n.receiver ?? null,
})),
dataflow: result.dataflow
? {
parameters: (result.dataflow.parameters || []).map((p) => ({
funcName: p.funcName,
paramName: p.paramName,
paramIndex: p.paramIndex,
line: p.line,
})),
returns: (result.dataflow.returns || []).map((r) => ({
funcName: r.funcName,
expression: r.expression ?? '',
referencedNames: r.referencedNames ?? [],
line: r.line,
})),
assignments: (result.dataflow.assignments || []).map((a) => ({
varName: a.varName,
callerFunc: a.callerFunc ?? null,
sourceCallName: a.sourceCallName,
expression: a.expression ?? '',
line: a.line,
})),
argFlows: (result.dataflow.argFlows ?? []).map((f) => ({
callerFunc: f.callerFunc ?? null,
calleeName: f.calleeName,
argIndex: f.argIndex,
argName: f.argName ?? null,
binding: f.bindingType ? { type: f.bindingType } : null,
confidence: f.confidence,
expression: f.expression ?? '',
line: f.line,
})),
mutations: (result.dataflow.mutations || []).map((m) => ({
funcName: m.funcName ?? null,
receiverName: m.receiverName,
binding: m.bindingType ? { type: m.bindingType } : null,
mutatingExpr: m.mutatingExpr,
line: m.line,
})),
}
: null,
};
}

Expand Down Expand Up @@ -400,7 +440,7 @@ export async function parseFileAuto(filePath, source, opts = {}) {
const { native } = resolveEngine(opts);

if (native) {
const result = native.parseFile(filePath, source);
const result = native.parseFile(filePath, source, !!opts.dataflow);
return result ? normalizeNativeSymbols(result) : null;
}

Expand All @@ -423,7 +463,7 @@ export async function parseFilesAuto(filePaths, rootDir, opts = {}) {
const result = new Map();

if (native) {
const nativeResults = native.parseFiles(filePaths, rootDir);
const nativeResults = native.parseFiles(filePaths, rootDir, !!opts.dataflow);
for (const r of nativeResults) {
if (!r) continue;
const relPath = path.relative(rootDir, r.file).split(path.sep).join('/');
Expand Down