"""Definition extraction (tree-sitter required).
This project requires tree-sitter for symbol/definition extraction. If tree-sitter
is not available, symbol extraction raises a clear error instead of silently
falling back to best-effort heuristics.
"""
import os
from typing import Dict, List, Set, Tuple, Iterable
DefNode = Tuple[str, int, int] # (kind, start_byte, end_byte)
# Language configuration
LANG_MAP = {
".py": "python", ".js": "javascript", ".jsx": "javascript", ".ts": "typescript",
".tsx": "tsx", ".java": "java", ".go": "go", ".rs": "rust", ".c": "c", ".h": "c",
".cpp": "cpp", ".cc": "cpp", ".hpp": "cpp", ".cs": "c_sharp", ".php": "php",
".rb": "ruby", ".swift": "swift", ".kt": "kotlin", ".scala": "scala"
}
DEF_NODES = {
"python": {"function_definition", "class_definition", "async_function_definition"},
"javascript": {"function_declaration", "class_declaration", "method_definition", "arrow_function"},
"typescript": {"function_declaration", "class_declaration", "method_definition", "interface_declaration"},
"tsx": {"function_declaration", "class_declaration", "method_definition", "interface_declaration"},
"java": {"method_declaration", "class_declaration", "interface_declaration", "constructor_declaration"},
"go": {"function_declaration", "method_declaration", "type_declaration"},
"rust": {"function_item", "impl_item", "struct_item", "trait_item"},
"c": {"function_definition", "struct_specifier"},
"cpp": {"function_definition", "class_specifier", "struct_specifier"},
"c_sharp": {"method_declaration", "class_declaration", "interface_declaration"},
"php": {"function_definition", "method_declaration", "class_declaration"},
"ruby": {"method", "class", "module"},
"swift": {"function_declaration", "class_declaration", "protocol_declaration"},
"kotlin": {"function_declaration", "class_declaration"},
"scala": {"function_definition", "class_definition", "trait_definition"}
}
_TS_AVAILABLE = False
_PARSERS = {}
try:
import tree_sitter # noqa: F401
# Prefer tree_sitter_languages when available (historical dependency),
# otherwise fall back to tree_sitter_language_pack (Py>=3.12 friendly).
try:
from tree_sitter_languages import get_parser as _get_parser # type: ignore
except Exception: # pragma: no cover
from tree_sitter_language_pack import get_parser as _get_parser # type: ignore
_TS_AVAILABLE = True
except ImportError:
_TS_AVAILABLE = False
def _require_tree_sitter() -> None:
if _TS_AVAILABLE:
return
raise RuntimeError(
"tree-sitter is required for symbol extraction. "
"Install: pip install tree-sitter-language-pack"
)
[docs]
def available() -> bool:
"""Return True only if tree-sitter + parsers are usable."""
if not _TS_AVAILABLE:
return False
try:
return _get_parser("python") is not None
except Exception:
return False
def _get_parser_for_lang(lang: str):
"""Get parser for language (handles both API versions)."""
if not available():
return None
if lang in _PARSERS:
return _PARSERS[lang]
try:
parser = _get_parser(lang)
_PARSERS[lang] = parser
return parser
except Exception:
_PARSERS[lang] = None
return None
def _node_text(src: bytes, node) -> str:
try:
return src[node.start_byte:node.end_byte].decode("utf-8", errors="replace")
except Exception:
return ""
_IDENTIFIER_TYPES = {
"identifier",
"field_identifier",
"property_identifier",
"type_identifier",
"scoped_identifier",
"namespace_identifier",
"method_identifier",
"constant_identifier",
}
def _iter_descendants(node) -> Iterable:
stack = [node]
while stack:
cur = stack.pop()
yield cur
for child in reversed(getattr(cur, "children", []) or []):
stack.append(child)
def _best_name_for_def(def_node, src: bytes) -> str:
"""Best-effort extraction of a definition name for a tree-sitter def node."""
# Many languages expose a name field directly.
try:
name_node = def_node.child_by_field_name("name")
except Exception:
name_node = None
if name_node is not None:
name = _node_text(src, name_node).strip()
if name:
return name
# C/C++ often expose declarator.
try:
decl = def_node.child_by_field_name("declarator")
except Exception:
decl = None
search_root = decl if decl is not None else def_node
# Heuristic: prefer plain identifiers over type identifiers.
best = ""
best_rank = 10**9
rank_map = {"identifier": 0, "field_identifier": 1, "property_identifier": 2, "method_identifier": 3}
for n in _iter_descendants(search_root):
if not getattr(n, "is_named", False):
continue
t = getattr(n, "type", "")
if t not in _IDENTIFIER_TYPES:
continue
txt = _node_text(src, n).strip()
if not txt:
continue
r = rank_map.get(t, 100)
if r < best_rank:
best = txt
best_rank = r
if best_rank == 0:
break
return best
def extract_named_defs(file_path: str) -> List[Tuple[str, str, int, int]]:
"""Extract named definitions from file.
Returns [(kind, name, start_byte, end_byte)].
"""
_require_tree_sitter()
lang = LANG_MAP.get(os.path.splitext(file_path.lower())[1])
if not lang or lang not in DEF_NODES:
return []
parser = _get_parser_for_lang(lang)
if not parser:
raise RuntimeError(
f"tree-sitter parser for language '{lang}' is not available. "
"Install: pip install tree-sitter-language-pack"
)
try:
with open(file_path, "rb") as f:
src = f.read()
tree = parser.parse(src)
except Exception:
return []
result: List[Tuple[str, str, int, int]] = []
target_types = DEF_NODES[lang]
for node in _iter_descendants(tree.root_node):
if not getattr(node, "is_named", False):
continue
node_type = getattr(node, "type", "")
if "comment" in node_type:
continue
if node_type not in target_types:
continue
name = _best_name_for_def(node, src)
if not name:
continue
result.append((node_type, name, node.start_byte, node.end_byte))
return result