diff --git a/crates/codegraph-core/src/extractors/csharp.rs b/crates/codegraph-core/src/extractors/csharp.rs index c378bbc..9d853ec 100644 --- a/crates/codegraph-core/src/extractors/csharp.rs +++ b/crates/codegraph-core/src/extractors/csharp.rs @@ -10,6 +10,7 @@ impl SymbolExtractor for CSharpExtractor { fn extract(&self, tree: &Tree, source: &[u8], file_path: &str) -> FileSymbols { let mut symbols = FileSymbols::new(file_path.to_string()); walk_node(&tree.root_node(), source, &mut symbols); + walk_ast_nodes_with_config(&tree.root_node(), source, &mut symbols.ast_nodes, &CSHARP_AST_CONFIG); symbols } } diff --git a/crates/codegraph-core/src/extractors/go.rs b/crates/codegraph-core/src/extractors/go.rs index 319c519..23d7e1a 100644 --- a/crates/codegraph-core/src/extractors/go.rs +++ b/crates/codegraph-core/src/extractors/go.rs @@ -10,6 +10,7 @@ impl SymbolExtractor for GoExtractor { fn extract(&self, tree: &Tree, source: &[u8], file_path: &str) -> FileSymbols { let mut symbols = FileSymbols::new(file_path.to_string()); walk_node(&tree.root_node(), source, &mut symbols); + walk_ast_nodes_with_config(&tree.root_node(), source, &mut symbols.ast_nodes, &GO_AST_CONFIG); symbols } } diff --git a/crates/codegraph-core/src/extractors/helpers.rs b/crates/codegraph-core/src/extractors/helpers.rs index 414a74e..7419f61 100644 --- a/crates/codegraph-core/src/extractors/helpers.rs +++ b/crates/codegraph-core/src/extractors/helpers.rs @@ -1,5 +1,5 @@ use tree_sitter::Node; -use crate::types::Definition; +use crate::types::{AstNode, Definition}; /// Get the text of a node from the source bytes. pub fn node_text<'a>(node: &Node, source: &'a [u8]) -> &'a str { @@ -88,3 +88,308 @@ pub fn truncate(s: &str, max: usize) -> String { } format!("{}\u{2026}", &s[..end]) } + +// ── AST node extraction (shared across all languages) ──────────────────────── + +/// Max length for the AST `text` field — matches `TEXT_MAX` in `ast.js`. 
+pub const AST_TEXT_MAX: usize = 200; + +/// Language-specific AST node type configuration. +pub struct LangAstConfig { + /// Node types mapping to `"new"` kind (e.g. `new_expression`, `object_creation_expression`) + pub new_types: &'static [&'static str], + /// Node types mapping to `"throw"` kind (e.g. `throw_statement`, `raise_statement`) + pub throw_types: &'static [&'static str], + /// Node types mapping to `"await"` kind (e.g. `await_expression`, `await`) + pub await_types: &'static [&'static str], + /// Node types mapping to `"string"` kind (e.g. `string`, `string_literal`) + pub string_types: &'static [&'static str], + /// Node types mapping to `"regex"` kind (e.g. `regex`) + pub regex_types: &'static [&'static str], + /// Characters to strip from string delimiters when extracting content. + pub quote_chars: &'static [char], + /// Single-char prefixes that can appear before string quotes (e.g. `r`, `b`, `f`, `u` for Python). + /// Multi-char combos like `rb`, `fr` are handled by stripping each char in sequence. 
+ pub string_prefixes: &'static [char], +} + +// ── Per-language configs ───────────────────────────────────────────────────── + +pub const PYTHON_AST_CONFIG: LangAstConfig = LangAstConfig { + new_types: &[], + throw_types: &["raise_statement"], + await_types: &["await"], + string_types: &["string"], + regex_types: &[], + quote_chars: &['\'', '"'], + string_prefixes: &['r', 'b', 'f', 'u', 'R', 'B', 'F', 'U'], +}; + +pub const GO_AST_CONFIG: LangAstConfig = LangAstConfig { + new_types: &[], + throw_types: &[], + await_types: &[], + string_types: &["interpreted_string_literal", "raw_string_literal"], + regex_types: &[], + quote_chars: &['"', '`'], + string_prefixes: &[], +}; + +pub const RUST_AST_CONFIG: LangAstConfig = LangAstConfig { + new_types: &[], + throw_types: &[], + await_types: &["await_expression"], + string_types: &["string_literal", "raw_string_literal"], + regex_types: &[], + quote_chars: &['"'], + string_prefixes: &[], +}; + +pub const JAVA_AST_CONFIG: LangAstConfig = LangAstConfig { + new_types: &["object_creation_expression"], + throw_types: &["throw_statement"], + await_types: &[], + string_types: &["string_literal"], + regex_types: &[], + quote_chars: &['"'], + string_prefixes: &[], +}; + +pub const CSHARP_AST_CONFIG: LangAstConfig = LangAstConfig { + new_types: &["object_creation_expression"], + throw_types: &["throw_statement", "throw_expression"], + await_types: &["await_expression"], + string_types: &["string_literal", "verbatim_string_literal"], + regex_types: &[], + quote_chars: &['"'], + string_prefixes: &[], +}; + +pub const RUBY_AST_CONFIG: LangAstConfig = LangAstConfig { + new_types: &[], + throw_types: &[], + await_types: &[], + string_types: &["string"], + regex_types: &["regex"], + quote_chars: &['\'', '"'], + string_prefixes: &[], +}; + +pub const PHP_AST_CONFIG: LangAstConfig = LangAstConfig { + new_types: &["object_creation_expression"], + throw_types: &["throw_expression"], + await_types: &[], + string_types: &["string", 
"encapsed_string"],
+    regex_types: &[],
+    quote_chars: &['\'', '"'],
+    string_prefixes: &[],
+};
+
+// ── Generic AST node walker ──────────────────────────────────────────────────
+
+/// Node types that represent identifiers across languages.
+const IDENT_TYPES: &[&str] = &[
+    "identifier", "type_identifier", "name", "qualified_name",
+    "scoped_identifier", "qualified_identifier",
+    "member_expression", "member_access_expression",
+    "field_expression", "attribute", "scoped_type_identifier",
+];
+
+/// Node types that represent function/method calls across languages.
+const CALL_TYPES: &[&str] = &[
+    "call_expression", "call", "invocation_expression",
+    "method_invocation", "function_call_expression",
+    "member_call_expression", "scoped_call_expression",
+];
+
+/// Walk the tree collecting AST nodes using language-specific config.
+/// Generic version of `walk_ast_nodes()` in `javascript.rs`.
+///
+/// The node's kind is matched against the lists in `config`; a match appends one
+/// `AstNode` (`"new"`, `"throw"`, `"await"`, `"string"` or `"regex"`) to `ast_nodes`.
+/// Children are always visited afterwards, so nested constructs (a `new` inside a
+/// `throw`, string arguments inside a `new`, ...) are recorded as well.
+///
+/// String literals whose stripped content is shorter than two characters are not
+/// recorded, but their children are still walked.
+pub fn walk_ast_nodes_with_config(
+    node: &Node,
+    source: &[u8],
+    ast_nodes: &mut Vec<AstNode>,
+    config: &LangAstConfig,
+) {
+    let kind = node.kind();
+
+    if config.new_types.contains(&kind) {
+        ast_nodes.push(AstNode {
+            kind: "new".to_string(),
+            name: extract_constructor_name(node, source),
+            line: start_line(node),
+            text: Some(truncate(node_text(node, source), AST_TEXT_MAX)),
+            receiver: None,
+        });
+    } else if config.throw_types.contains(&kind) {
+        ast_nodes.push(AstNode {
+            kind: "throw".to_string(),
+            name: extract_throw_target(node, source, config),
+            line: start_line(node),
+            text: extract_child_expression_text(node, source),
+            receiver: None,
+        });
+    } else if config.await_types.contains(&kind) {
+        ast_nodes.push(AstNode {
+            kind: "await".to_string(),
+            name: extract_awaited_name(node, source),
+            line: start_line(node),
+            text: extract_child_expression_text(node, source),
+            receiver: None,
+        });
+    } else if config.string_types.contains(&kind) {
+        let raw = node_text(node, source);
+        let content = string_literal_content(raw, kind, config);
+        // Empty and one-character strings are not recorded.
+        if content.chars().count() >= 2 {
+            ast_nodes.push(AstNode {
+                kind: "string".to_string(),
+                name: truncate(content, 100),
+                line: start_line(node),
+                text: Some(truncate(raw, AST_TEXT_MAX)),
+                receiver: None,
+            });
+        }
+    } else if config.regex_types.contains(&kind) {
+        let raw = node_text(node, source);
+        let name = if raw.is_empty() { "?".to_string() } else { raw.to_string() };
+        ast_nodes.push(AstNode {
+            kind: "regex".to_string(),
+            name,
+            line: start_line(node),
+            text: Some(truncate(raw, AST_TEXT_MAX)),
+            receiver: None,
+        });
+    }
+
+    // Single recursion point for every branch above (template strings may hold
+    // nested expressions, `throw` may hold a `new`, ...).
+    for child in (0..node.child_count()).filter_map(|i| node.child(i)) {
+        walk_ast_nodes_with_config(&child, source, ast_nodes, config);
+    }
+}
+
+/// Strip prefix modifiers and quote delimiters from a string literal's source text.
+///
+/// Removed from the front, in order: C# verbatim `@`, the language's
+/// `string_prefixes`, a Rust raw-string `r`, `#` (raw string kinds only), then any
+/// `quote_chars`. Removed from the back: `#` (raw string kinds only), then any
+/// `quote_chars`.
+fn string_literal_content<'a>(raw: &'a str, kind: &str, config: &LangAstConfig) -> &'a str {
+    // Only raw string node kinds (e.g. Rust `r#"..."#`) carry `#` delimiters.
+    let is_raw_string = kind.contains("raw_string");
+
+    let mut content = raw
+        .trim_start_matches('@')
+        .trim_start_matches(|c: char| config.string_prefixes.contains(&c))
+        .trim_start_matches('r');
+    if is_raw_string {
+        content = content.trim_start_matches('#');
+    }
+    content = content.trim_start_matches(|c: char| config.quote_chars.contains(&c));
+    if is_raw_string {
+        content = content.trim_end_matches('#');
+    }
+    content.trim_end_matches(|c: char| config.quote_chars.contains(&c))
+}
+
+// ── Name extraction helpers ──────────────────────────────────────────────────
+
+/// Extract constructor name from a `new`/`object_creation_expression` node.
+///
+/// Lookup order: a `type`/`class`/`constructor` field, then the first
+/// identifier-like child, then the node text before the first `(` with `new `
+/// removed.
+fn extract_constructor_name(node: &Node, source: &[u8]) -> String {
+    // Try common field names for the constructed type
+    let by_field = ["type", "class", "constructor"]
+        .iter()
+        .find_map(|field| node.child_by_field_name(field));
+    if let Some(child) = by_field {
+        return node_text(&child, source).to_string();
+    }
+
+    let by_kind = (0..node.child_count())
+        .filter_map(|i| node.child(i))
+        .find(|child| IDENT_TYPES.contains(&child.kind()));
+    if let Some(child) = by_kind {
+        return node_text(&child, source).to_string();
+    }
+
+    let raw = node_text(node, source);
+    raw.split('(')
+        .next()
+        .unwrap_or(raw)
+        .replace("new ", "")
+        .trim()
+        .to_string()
+}
+
+/// Extract name from a throw/raise statement.
+///
+/// The first child that is a constructor expression, a call, or an identifier-like
+/// node supplies the name; if there is none, the truncated statement text is used.
+fn extract_throw_target(node: &Node, source: &[u8], config: &LangAstConfig) -> String {
+    (0..node.child_count())
+        .filter_map(|i| node.child(i))
+        .find_map(|child| {
+            let ck = child.kind();
+            if config.new_types.contains(&ck) {
+                Some(extract_constructor_name(&child, source))
+            } else if CALL_TYPES.contains(&ck) {
+                Some(extract_call_name(&child, source))
+            } else if IDENT_TYPES.contains(&ck) {
+                Some(node_text(&child, source).to_string())
+            } else {
+                None
+            }
+        })
+        .unwrap_or_else(|| truncate(node_text(node, source), AST_TEXT_MAX))
+}
+
+/// Extract name from an await expression.
+fn extract_awaited_name(node: &Node, source: &[u8]) -> String {
+    // The first child that is a call or an identifier-like node names the awaited
+    // value; otherwise fall back to the truncated expression text.
+    (0..node.child_count())
+        .filter_map(|i| node.child(i))
+        .find_map(|child| {
+            let ck = child.kind();
+            if CALL_TYPES.contains(&ck) {
+                Some(extract_call_name(&child, source))
+            } else if IDENT_TYPES.contains(&ck) {
+                Some(node_text(&child, source).to_string())
+            } else {
+                None
+            }
+        })
+        .unwrap_or_else(|| truncate(node_text(node, source), AST_TEXT_MAX))
+}
+
+/// Extract function name from a call node.
+///
+/// Uses the `function`, `method` or `name` field when the grammar provides one;
+/// otherwise the node text before the first `(`. Returns `"?"` when that text is
+/// empty.
+fn extract_call_name(node: &Node, source: &[u8]) -> String {
+    let by_field = ["function", "method", "name"]
+        .iter()
+        .find_map(|field| node.child_by_field_name(field));
+    if let Some(fn_node) = by_field {
+        return node_text(&fn_node, source).to_string();
+    }
+    // `split` always yields at least one item, so the `"?"` fallback has to be
+    // applied to an empty callee explicitly.
+    let callee = node_text(node, source).split('(').next().unwrap_or_default();
+    if callee.is_empty() {
+        "?".to_string()
+    } else {
+        callee.to_string()
+    }
+}
+
+/// Extract expression text from throw/await — skip the keyword child.
+///
+/// Returns the truncated text of the first child whose kind is not one of the
+/// keyword tokens, or the truncated text of the whole node when every child is a
+/// keyword (or there are no children). The result is always `Some`.
+fn extract_child_expression_text(node: &Node, source: &[u8]) -> Option<String> {
+    const KEYWORDS: &[&str] = &["throw", "raise", "await", "new"];
+    let expression = (0..node.child_count())
+        .filter_map(|i| node.child(i))
+        .find(|child| !KEYWORDS.contains(&child.kind()));
+    let text = match expression {
+        Some(child) => node_text(&child, source),
+        None => node_text(node, source),
+    };
+    Some(truncate(text, AST_TEXT_MAX))
+}
diff --git a/crates/codegraph-core/src/extractors/java.rs b/crates/codegraph-core/src/extractors/java.rs
index c6418e1..fd07ac2 100644
--- a/crates/codegraph-core/src/extractors/java.rs
+++ b/crates/codegraph-core/src/extractors/java.rs
@@ -10,6 +10,7 @@ impl SymbolExtractor for JavaExtractor {
     fn extract(&self, tree: &Tree, source: &[u8], file_path: &str) -> FileSymbols {
         let mut symbols = FileSymbols::new(file_path.to_string());
         walk_node(&tree.root_node(), source, &mut symbols);
+        walk_ast_nodes_with_config(&tree.root_node(), source, &mut symbols.ast_nodes, &JAVA_AST_CONFIG);
         symbols
     }
 }
diff --git a/crates/codegraph-core/src/extractors/php.rs b/crates/codegraph-core/src/extractors/php.rs
index 0c5e800..376b61a 100644
--- 
a/crates/codegraph-core/src/extractors/php.rs +++ b/crates/codegraph-core/src/extractors/php.rs @@ -10,6 +10,7 @@ impl SymbolExtractor for PhpExtractor { fn extract(&self, tree: &Tree, source: &[u8], file_path: &str) -> FileSymbols { let mut symbols = FileSymbols::new(file_path.to_string()); walk_node(&tree.root_node(), source, &mut symbols); + walk_ast_nodes_with_config(&tree.root_node(), source, &mut symbols.ast_nodes, &PHP_AST_CONFIG); symbols } } diff --git a/crates/codegraph-core/src/extractors/python.rs b/crates/codegraph-core/src/extractors/python.rs index 57b7947..9dafe8d 100644 --- a/crates/codegraph-core/src/extractors/python.rs +++ b/crates/codegraph-core/src/extractors/python.rs @@ -10,6 +10,7 @@ impl SymbolExtractor for PythonExtractor { fn extract(&self, tree: &Tree, source: &[u8], file_path: &str) -> FileSymbols { let mut symbols = FileSymbols::new(file_path.to_string()); walk_node(&tree.root_node(), source, &mut symbols); + walk_ast_nodes_with_config(&tree.root_node(), source, &mut symbols.ast_nodes, &PYTHON_AST_CONFIG); symbols } } diff --git a/crates/codegraph-core/src/extractors/ruby.rs b/crates/codegraph-core/src/extractors/ruby.rs index a2eb3f0..d8daab1 100644 --- a/crates/codegraph-core/src/extractors/ruby.rs +++ b/crates/codegraph-core/src/extractors/ruby.rs @@ -10,6 +10,7 @@ impl SymbolExtractor for RubyExtractor { fn extract(&self, tree: &Tree, source: &[u8], file_path: &str) -> FileSymbols { let mut symbols = FileSymbols::new(file_path.to_string()); walk_node(&tree.root_node(), source, &mut symbols); + walk_ast_nodes_with_config(&tree.root_node(), source, &mut symbols.ast_nodes, &RUBY_AST_CONFIG); symbols } } diff --git a/crates/codegraph-core/src/extractors/rust_lang.rs b/crates/codegraph-core/src/extractors/rust_lang.rs index c58ec2d..e8a1bb0 100644 --- a/crates/codegraph-core/src/extractors/rust_lang.rs +++ b/crates/codegraph-core/src/extractors/rust_lang.rs @@ -10,6 +10,7 @@ impl SymbolExtractor for RustExtractor { fn extract(&self, 
tree: &Tree, source: &[u8], file_path: &str) -> FileSymbols { let mut symbols = FileSymbols::new(file_path.to_string()); walk_node(&tree.root_node(), source, &mut symbols); + walk_ast_nodes_with_config(&tree.root_node(), source, &mut symbols.ast_nodes, &RUST_AST_CONFIG); symbols } } diff --git a/src/ast.js b/src/ast.js index 59acdf0..e75f5c6 100644 --- a/src/ast.js +++ b/src/ast.js @@ -197,33 +197,33 @@ export async function buildAstNodes(db, fileSymbols, _rootDir, _engineOpts) { } } - // 2. AST walk for JS/TS/TSX — extract new, throw, await, string, regex - const ext = path.extname(relPath).toLowerCase(); - if (WALK_EXTENSIONS.has(ext)) { - if (symbols._tree) { - // WASM path: walk the tree-sitter AST + // 2. Non-call AST nodes (new, throw, await, string, regex) + if (symbols.astNodes?.length) { + // Native path: use pre-extracted AST nodes from Rust (all languages) + for (const n of symbols.astNodes) { + const parentDef = findParentDef(defs, n.line); + let parentNodeId = null; + if (parentDef) { + parentNodeId = + nodeIdMap.get(`${parentDef.name}|${parentDef.kind}|${parentDef.line}`) || null; + } + allRows.push({ + file: relPath, + line: n.line, + kind: n.kind, + name: n.name, + text: n.text || null, + receiver: n.receiver || null, + parentNodeId, + }); + } + } else { + // WASM fallback: walk the tree-sitter AST (JS/TS/TSX only) + const ext = path.extname(relPath).toLowerCase(); + if (WALK_EXTENSIONS.has(ext) && symbols._tree) { const astRows = []; walkAst(symbols._tree.rootNode, defs, relPath, astRows, nodeIdMap); allRows.push(...astRows); - } else if (symbols.astNodes?.length) { - // Native path: use pre-extracted AST nodes from Rust - for (const n of symbols.astNodes) { - const parentDef = findParentDef(defs, n.line); - let parentNodeId = null; - if (parentDef) { - parentNodeId = - nodeIdMap.get(`${parentDef.name}|${parentDef.kind}|${parentDef.line}`) || null; - } - allRows.push({ - file: relPath, - line: n.line, - kind: n.kind, - name: n.name, - text: n.text 
|| null, - receiver: n.receiver || null, - parentNodeId, - }); - } } } } diff --git a/tests/parsers/ast-all-langs.test.js b/tests/parsers/ast-all-langs.test.js new file mode 100644 index 0000000..c07368e --- /dev/null +++ b/tests/parsers/ast-all-langs.test.js @@ -0,0 +1,511 @@ +/** + * Tests for AST node extraction across all languages. + * + * 1. Verifies buildAstNodes accepts native astNodes for non-JS languages + * (tests the JS-side ungate from WALK_EXTENSIONS). + * 2. When native engine is available, verifies each language extractor + * produces astNodes for its supported AST node kinds. + */ + +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import Database from 'better-sqlite3'; +import { afterAll, beforeAll, describe, expect, test } from 'vitest'; +import { buildAstNodes } from '../../src/ast.js'; +import { initSchema } from '../../src/db.js'; +import { loadNative } from '../../src/native.js'; +import { parseFilesAuto } from '../../src/parser.js'; + +// ─── Helpers ────────────────────────────────────────────────────────── + +function createTempDb() { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-ast-lang-')); + fs.mkdirSync(path.join(tmpDir, '.codegraph')); + const dbPath = path.join(tmpDir, '.codegraph', 'graph.db'); + const db = new Database(dbPath); + db.pragma('journal_mode = WAL'); + initSchema(db); + return { tmpDir, db }; +} + +function queryByKind(db, kind) { + return db.prepare('SELECT * FROM ast_nodes WHERE kind = ? 
ORDER BY line').all(kind); +} + +function queryAll(db) { + return db.prepare('SELECT * FROM ast_nodes ORDER BY line').all(); +} + +// ─── JS-side: buildAstNodes accepts astNodes for non-JS files ──────── + +describe('buildAstNodes — non-JS language astNodes', () => { + let tmpDir, db; + + beforeAll(() => { + ({ tmpDir, db } = createTempDb()); + }); + + afterAll(() => { + if (db) db.close(); + if (tmpDir) fs.rmSync(tmpDir, { recursive: true, force: true }); + }); + + test('inserts native astNodes for a .py file', async () => { + // Simulate native engine output with pre-extracted astNodes + const fileSymbols = new Map(); + fileSymbols.set('src/example.py', { + definitions: [{ name: 'process', kind: 'function', line: 1, endLine: 10 }], + calls: [], + astNodes: [ + { + kind: 'throw', + name: 'ValueError', + line: 3, + text: 'ValueError("bad input")', + receiver: null, + }, + { kind: 'string', name: 'bad input', line: 3, text: '"bad input"', receiver: null }, + { kind: 'await', name: 'fetch_data', line: 5, text: 'fetch_data(url)', receiver: null }, + { + kind: 'string', + name: 'https://api.example.com', + line: 6, + text: '"https://api.example.com"', + receiver: null, + }, + ], + }); + + // Insert a node so parent resolution has something to find + db.prepare('INSERT INTO nodes (name, kind, file, line, end_line) VALUES (?, ?, ?, ?, ?)').run( + 'process', + 'function', + 'src/example.py', + 1, + 10, + ); + + await buildAstNodes(db, fileSymbols, tmpDir); + + const all = queryAll(db); + expect(all.length).toBeGreaterThanOrEqual(4); + + const throws = queryByKind(db, 'throw'); + expect(throws.some((n) => n.name === 'ValueError')).toBe(true); + + const strings = queryByKind(db, 'string'); + expect(strings.some((n) => n.name.includes('api.example.com'))).toBe(true); + + const awaits = queryByKind(db, 'await'); + expect(awaits.some((n) => n.name === 'fetch_data')).toBe(true); + }); + + test('inserts native astNodes for a .java file', async () => { + const db2Setup = 
createTempDb(); + const db2 = db2Setup.db; + const tmpDir2 = db2Setup.tmpDir; + + const fileSymbols = new Map(); + fileSymbols.set('src/Main.java', { + definitions: [ + { name: 'Main', kind: 'class', line: 1, endLine: 20 }, + { name: 'Main.run', kind: 'method', line: 3, endLine: 15 }, + ], + calls: [], + astNodes: [ + { kind: 'new', name: 'ArrayList', line: 4, text: 'new ArrayList<>()', receiver: null }, + { + kind: 'throw', + name: 'IllegalArgumentException', + line: 7, + text: 'new IllegalArgumentException("invalid")', + receiver: null, + }, + { kind: 'string', name: 'invalid', line: 7, text: '"invalid"', receiver: null }, + ], + }); + + db2 + .prepare('INSERT INTO nodes (name, kind, file, line, end_line) VALUES (?, ?, ?, ?, ?)') + .run('Main', 'class', 'src/Main.java', 1, 20); + db2 + .prepare('INSERT INTO nodes (name, kind, file, line, end_line) VALUES (?, ?, ?, ?, ?)') + .run('Main.run', 'method', 'src/Main.java', 3, 15); + + await buildAstNodes(db2, fileSymbols, tmpDir2); + + const newNodes = queryByKind(db2, 'new'); + expect(newNodes.some((n) => n.name === 'ArrayList')).toBe(true); + + const throwNodes = queryByKind(db2, 'throw'); + expect(throwNodes.some((n) => n.name === 'IllegalArgumentException')).toBe(true); + + db2.close(); + fs.rmSync(tmpDir2, { recursive: true, force: true }); + }); + + test('all inserted nodes have valid kinds', async () => { + const all = queryAll(db); + const validKinds = new Set(['call', 'new', 'string', 'regex', 'throw', 'await']); + for (const node of all) { + expect(validKinds.has(node.kind)).toBe(true); + } + }); +}); + +// ─── Native engine: multi-language AST extraction ──────────────────── + +const LANG_FIXTURES = { + 'fixture.py': ` +def process(data): + raise ValueError("bad input") + +async def fetch(): + result = await get_data() + url = "https://api.example.com/data" + pattern = r"^[a-z]+\\d{3}$" + greeting = f"hello {data}" + raw_bytes = rb"raw bytes value" + return result +`, + 'fixture.go': ` +package main + 
+import "fmt" + +func main() { + msg := "hello world from go" + raw := \`raw string literal\` + fmt.Println(msg, raw) +} +`, + 'fixture.java': ` +import java.util.ArrayList; + +public class Main { + public void run() { + ArrayList list = new ArrayList<>(); + String msg = "hello from java"; + if (list.isEmpty()) { + throw new IllegalArgumentException("empty list"); + } + } +} +`, + 'fixture.cs': ` +using System; +using System.Threading.Tasks; + +public class Service { + public async Task FetchAsync() { + var result = await GetDataAsync(); + string msg = "hello from csharp"; + if (result == null) { + throw new ArgumentNullException("result"); + } + return msg; + } +} +`, + 'fixture.rb': ` +class Greeter + def greet(name) + msg = "hello from ruby" + pattern = /^[A-Z][a-z]+$/ + puts msg + end +end +`, + 'fixture.rs': ` +use std::collections::HashMap; + +async fn fetch_data(url: &str) -> Result> { + let client = reqwest::get(url).await?; + let msg = "hello from rust"; + let raw = r#"raw string content"#; + Ok(msg.to_string()) +} +`, + 'fixture.php': `isValid()) { + throw new \\InvalidArgumentException("invalid user"); + } + return $user; + } +} +`, +}; + +// Check if native addon supports astNodes for non-JS languages +function nativeSupportsMultiLangAst() { + const native = loadNative(); + if (!native) return false; + try { + const tmpCheck = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-ast-ml-check-')); + const srcCheck = path.join(tmpCheck, 'src'); + fs.mkdirSync(srcCheck, { recursive: true }); + // Test with a Python file that has a string literal + const checkPath = path.join(srcCheck, 'check.py'); + fs.writeFileSync(checkPath, 'msg = "hello world test"'); + const results = native.parseFiles([checkPath], tmpCheck); + const r = results?.[0]; + const hasAst = r?.astNodes?.length > 0 || r?.ast_nodes?.length > 0; + fs.rmSync(tmpCheck, { recursive: true, force: true }); + return hasAst; + } catch { + return false; + } +} + +const canTestMultiLang = 
nativeSupportsMultiLangAst(); + +describe.skipIf(!canTestMultiLang)('native AST nodes — multi-language', () => { + let tmpDir, db; + + beforeAll(async () => { + tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-ast-multilang-')); + const srcDir = path.join(tmpDir, 'src'); + fs.mkdirSync(srcDir, { recursive: true }); + fs.mkdirSync(path.join(tmpDir, '.codegraph')); + + // Write all fixture files + const filePaths = []; + for (const [name, code] of Object.entries(LANG_FIXTURES)) { + const fp = path.join(srcDir, name); + fs.writeFileSync(fp, code); + filePaths.push(fp); + } + + // Parse all files with native engine + const allSymbols = await parseFilesAuto(filePaths, tmpDir, { engine: 'native' }); + + // Create DB + const dbPath = path.join(tmpDir, '.codegraph', 'graph.db'); + db = new Database(dbPath); + db.pragma('journal_mode = WAL'); + initSchema(db); + + // Insert definition nodes for parent resolution + const insertNode = db.prepare( + 'INSERT INTO nodes (name, kind, file, line, end_line) VALUES (?, ?, ?, ?, ?)', + ); + for (const [relPath, symbols] of allSymbols) { + for (const def of symbols.definitions || []) { + insertNode.run(def.name, def.kind, relPath, def.line, def.endLine); + } + } + + // Build AST nodes + await buildAstNodes(db, allSymbols, tmpDir); + }); + + afterAll(() => { + if (db) db.close(); + if (tmpDir) fs.rmSync(tmpDir, { recursive: true, force: true }); + }); + + // ── Python ── + + test('Python: extracts raise as throw', () => { + const throws = db + .prepare("SELECT * FROM ast_nodes WHERE kind = 'throw' AND file LIKE '%fixture.py'") + .all(); + expect(throws.length).toBeGreaterThanOrEqual(1); + expect(throws.some((n) => n.name === 'ValueError')).toBe(true); + }); + + test('Python: extracts await', () => { + const awaits = db + .prepare("SELECT * FROM ast_nodes WHERE kind = 'await' AND file LIKE '%fixture.py'") + .all(); + expect(awaits.length).toBeGreaterThanOrEqual(1); + expect(awaits.some((n) => 
n.name.includes('get_data'))).toBe(true); + }); + + test('Python: extracts string literals', () => { + const strings = db + .prepare("SELECT * FROM ast_nodes WHERE kind = 'string' AND file LIKE '%fixture.py'") + .all(); + expect(strings.length).toBeGreaterThanOrEqual(1); + expect( + strings.some((n) => n.name.includes('bad input') || n.name.includes('api.example.com')), + ).toBe(true); + }); + + test('Python: strips r/f/rb prefixes from string names', () => { + const strings = db + .prepare("SELECT * FROM ast_nodes WHERE kind = 'string' AND file LIKE '%fixture.py'") + .all(); + // r"..." prefix should be stripped — name should not start with 'r' + const rawStr = strings.find((n) => n.name.includes('^[a-z]+')); + expect(rawStr).toBeDefined(); + expect(rawStr.name.startsWith('r')).toBe(false); + // f"..." prefix should be stripped + const fStr = strings.find((n) => n.name.includes('hello')); + expect(fStr).toBeDefined(); + expect(fStr.name.startsWith('f')).toBe(false); + // rb"..." prefix should be stripped + const rbStr = strings.find((n) => n.name.includes('raw bytes')); + expect(rbStr).toBeDefined(); + expect(rbStr.name.startsWith('r')).toBe(false); + expect(rbStr.name.startsWith('b')).toBe(false); + }); + + // ── Go ── + + test('Go: extracts string literals', () => { + const strings = db + .prepare("SELECT * FROM ast_nodes WHERE kind = 'string' AND file LIKE '%fixture.go'") + .all(); + expect(strings.length).toBeGreaterThanOrEqual(1); + expect(strings.some((n) => n.name.includes('hello world'))).toBe(true); + }); + + // ── Java ── + + test('Java: extracts new as kind:new', () => { + const news = db + .prepare("SELECT * FROM ast_nodes WHERE kind = 'new' AND file LIKE '%fixture.java'") + .all(); + expect(news.length).toBeGreaterThanOrEqual(1); + expect(news.some((n) => n.name.includes('ArrayList'))).toBe(true); + }); + + test('Java: extracts throw', () => { + const throws = db + .prepare("SELECT * FROM ast_nodes WHERE kind = 'throw' AND file LIKE '%fixture.java'") 
+ .all(); + expect(throws.length).toBeGreaterThanOrEqual(1); + expect(throws.some((n) => n.name.includes('IllegalArgumentException'))).toBe(true); + }); + + test('Java: extracts string literals', () => { + const strings = db + .prepare("SELECT * FROM ast_nodes WHERE kind = 'string' AND file LIKE '%fixture.java'") + .all(); + expect(strings.length).toBeGreaterThanOrEqual(1); + expect(strings.some((n) => n.name.includes('hello from java'))).toBe(true); + }); + + // ── C# ── + + test('C#: extracts new as kind:new', () => { + const news = db + .prepare("SELECT * FROM ast_nodes WHERE kind = 'new' AND file LIKE '%fixture.cs'") + .all(); + expect(news.length).toBeGreaterThanOrEqual(1); + expect(news.some((n) => n.name.includes('ArgumentNullException'))).toBe(true); + }); + + test('C#: extracts throw', () => { + const throws = db + .prepare("SELECT * FROM ast_nodes WHERE kind = 'throw' AND file LIKE '%fixture.cs'") + .all(); + expect(throws.length).toBeGreaterThanOrEqual(1); + }); + + test('C#: extracts await', () => { + const awaits = db + .prepare("SELECT * FROM ast_nodes WHERE kind = 'await' AND file LIKE '%fixture.cs'") + .all(); + expect(awaits.length).toBeGreaterThanOrEqual(1); + expect(awaits.some((n) => n.name.includes('GetDataAsync'))).toBe(true); + }); + + test('C#: extracts string literals', () => { + const strings = db + .prepare("SELECT * FROM ast_nodes WHERE kind = 'string' AND file LIKE '%fixture.cs'") + .all(); + expect(strings.length).toBeGreaterThanOrEqual(1); + expect(strings.some((n) => n.name.includes('hello from csharp'))).toBe(true); + }); + + // ── Ruby ── + + test('Ruby: extracts string literals', () => { + const strings = db + .prepare("SELECT * FROM ast_nodes WHERE kind = 'string' AND file LIKE '%fixture.rb'") + .all(); + expect(strings.length).toBeGreaterThanOrEqual(1); + expect(strings.some((n) => n.name.includes('hello from ruby'))).toBe(true); + }); + + test('Ruby: extracts regex literals', () => { + const regexes = db + .prepare("SELECT * 
FROM ast_nodes WHERE kind = 'regex' AND file LIKE '%fixture.rb'") + .all(); + expect(regexes.length).toBeGreaterThanOrEqual(1); + expect(regexes.some((n) => n.name.includes('[A-Z]'))).toBe(true); + }); + + // ── Rust ── + + test('Rust: extracts await', () => { + const awaits = db + .prepare("SELECT * FROM ast_nodes WHERE kind = 'await' AND file LIKE '%fixture.rs'") + .all(); + expect(awaits.length).toBeGreaterThanOrEqual(1); + }); + + test('Rust: extracts string literals', () => { + const strings = db + .prepare("SELECT * FROM ast_nodes WHERE kind = 'string' AND file LIKE '%fixture.rs'") + .all(); + expect(strings.length).toBeGreaterThanOrEqual(2); + expect(strings.some((n) => n.name.includes('hello from rust'))).toBe(true); + }); + + test('Rust: extracts raw string literals with trimmed name', () => { + const strings = db + .prepare("SELECT * FROM ast_nodes WHERE kind = 'string' AND file LIKE '%fixture.rs'") + .all(); + const rawStr = strings.find((n) => n.name.includes('raw string content')); + expect(rawStr).toBeDefined(); + // Name should not contain r, #, or quote prefixes + expect(rawStr.name).not.toMatch(/^[r#"]/); + }); + + // ── PHP ── + + test('PHP: extracts new as kind:new', () => { + const news = db + .prepare("SELECT * FROM ast_nodes WHERE kind = 'new' AND file LIKE '%fixture.php'") + .all(); + expect(news.length).toBeGreaterThanOrEqual(1); + expect( + news.some((n) => n.name.includes('User') || n.name.includes('InvalidArgumentException')), + ).toBe(true); + }); + + test('PHP: extracts throw', () => { + const throws = db + .prepare("SELECT * FROM ast_nodes WHERE kind = 'throw' AND file LIKE '%fixture.php'") + .all(); + expect(throws.length).toBeGreaterThanOrEqual(1); + }); + + test('PHP: extracts string literals', () => { + const strings = db + .prepare("SELECT * FROM ast_nodes WHERE kind = 'string' AND file LIKE '%fixture.php'") + .all(); + expect(strings.length).toBeGreaterThanOrEqual(1); + expect( + strings.some((n) => n.name.includes('created 
user') || n.name.includes('invalid user')), + ).toBe(true); + }); + + // ── Cross-language ── + + test('all nodes have valid kinds', () => { + const all = queryAll(db); + const validKinds = new Set(['call', 'new', 'string', 'regex', 'throw', 'await']); + for (const node of all) { + expect(validKinds.has(node.kind)).toBe(true); + } + }); +});