#4415 Knowledge Graph of Python Source File
#4415 Knowledge Graph of Python Source File
https://www.youtube.com/watch?v=uti3tMcqczQ
#4415 Provide a Python application to take as input a Python source file and generate a useful knowledge graph with appropriate entities and relationships
Building a knowledge graph from Python source code involves parsing the file into an Abstract Syntax Tree (AST) and then mapping those nodes into a graph database like Neo4j.
Below is a robust Python application using the ast library for static analysis and py2neo to populate a Neo4j graph. This script identifies key entities (Classes, Functions, Arguments) and their structural relationships.
Prerequisites
You will need a running Neo4j instance and the following library:
pip install py2neo
The Knowledge Graph Generator
import ast
import os
from py2neo import Graph, Node, Relationship
class CodeGraphGenerator:
    """Populate a Neo4j knowledge graph from one Python source file.

    Nodes created: ``File``, ``Class``, ``Function``, ``Argument`` and
    ``Module``. Relationships created: ``DEFINES`` (parent -> class),
    ``CONTAINS`` (parent -> function), ``REQUIRES`` (function -> argument)
    and ``DEPENDS_ON`` (parent -> imported module).
    """

    def __init__(self, uri="bolt://localhost:7687", user="neo4j", password="password"):
        """Connect to Neo4j and wipe the database for a fresh run.

        A failure is printed instead of raised. If the ``Graph`` object could
        not be created, ``self.graph`` stays ``None`` and ``parse_file``
        raises ``RuntimeError``.
        """
        # Define the attribute up front so a failed connection is detectable
        # later instead of surfacing as an AttributeError.
        self.graph = None
        try:
            self.graph = Graph(uri, auth=(user, password))
            self.graph.delete_all()  # Clear graph for a fresh run
        except Exception as e:
            print(f"Connection failed: {e}")

    def parse_file(self, file_path):
        """Parse ``file_path`` and write its structure into the graph.

        Raises:
            RuntimeError: if no graph connection was established.
            OSError: if the file cannot be read.
            SyntaxError: if the file is not valid Python.
        """
        if self.graph is None:
            raise RuntimeError("No Neo4j connection available; cannot populate the graph.")
        # Read bytes so ast.parse applies the source's own encoding
        # declaration rather than the platform's locale default.
        with open(file_path, "rb") as file:
            tree = ast.parse(file.read())
        file_node = Node("File", name=os.path.basename(file_path))
        self.graph.create(file_node)
        self._visit_nodes(tree, file_node)

    @staticmethod
    def _argument_names(args):
        """Return all parameter names of an ``ast.arguments`` in signature order."""
        names = [a.arg for a in getattr(args, "posonlyargs", [])]
        names.extend(a.arg for a in args.args)
        if args.vararg is not None:
            names.append(args.vararg.arg)
        names.extend(a.arg for a in args.kwonlyargs)
        if args.kwarg is not None:
            names.append(args.kwarg.arg)
        return names

    @staticmethod
    def _imported_modules(node):
        """Return the module names an ``Import``/``ImportFrom`` node depends on.

        For ``from x import y`` the dependency is ``x`` (prefixed with one dot
        per relative level), not the imported symbol ``y``.
        """
        if isinstance(node, ast.ImportFrom):
            return ["." * (node.level or 0) + (node.module or "")]
        return [alias.name for alias in node.names]

    def _visit_nodes(self, tree, parent_graph_node):
        """Create graph entities for the direct children of ``tree``.

        Recurses into class and function bodies so nested definitions hang
        off their enclosing class/function node.
        """
        for node in ast.iter_child_nodes(tree):
            # Entity: Classes
            if isinstance(node, ast.ClassDef):
                class_node = Node("Class", name=node.name)
                self.graph.create(class_node)
                self.graph.create(Relationship(parent_graph_node, "DEFINES", class_node))
                self._visit_nodes(node, class_node)
            # Entity: Functions/Methods (sync and async)
            elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                func_node = Node("Function", name=node.name)
                self.graph.create(func_node)
                self.graph.create(Relationship(parent_graph_node, "CONTAINS", func_node))
                # Extract Arguments
                for arg_name in self._argument_names(node.args):
                    arg_node = Node("Argument", name=arg_name)
                    self.graph.create(arg_node)
                    self.graph.create(Relationship(func_node, "REQUIRES", arg_node))
                self._visit_nodes(node, func_node)
            # Entity: Imports
            elif isinstance(node, (ast.Import, ast.ImportFrom)):
                for module_name in self._imported_modules(node):
                    import_node = Node("Module", name=module_name)
                    self.graph.create(import_node)
                    self.graph.create(Relationship(parent_graph_node, "DEPENDS_ON", import_node))
if __name__ == "__main__":
    # Example run: connect with placeholder credentials, then graph one file.
    generator = CodeGraphGenerator(password="your_password")
    generator.parse_file(file_path="your_script.py")
    print("Knowledge Graph populated successfully.")
Schema Design
The application creates a hierarchical and relational view of your code. This is particularly useful for identifying tight coupling or bloated classes.
| Entity (Node) | Relationship | Target Entity | Description |
| File | DEFINES | Class | Top-level structural container. |
| Class | CONTAINS | Function | Maps methods to their parent classes. |
| File | CONTAINS | Function | Identifies global/standalone functions. |
| Function | REQUIRES | Argument | Maps the signature dependencies. |
| File | DEPENDS_ON | Module | Tracks external library usage. |
Extending the Graph
To make the graph more "agent-ready" or useful for impact analysis, consider adding:
- CALLS Relationship: Use ast.Call to link one function node to another when it is invoked.
- Docstring Extraction: Pull ast.get_docstring(node) and store it as a property on the node for LLM context retrieval.
- Variable Tracking: Map ast.Assign to see where specific data structures are modified across functions.
Would you like to explore adding the logic to track function calls (the CALLS relationship) to visualize the execution flow?
Yes
Adding the CALLS relationship requires a second pass or a more sophisticated visitor pattern to ensure that the target functions already exist as nodes in the graph before they are linked.
By using the ast.NodeVisitor class, we can cleanly separate the logic for discovering definitions (Classes/Functions) and the logic for discovering interactions (Calls).
Updated Python Application with Call Tracking
import ast
import os
from py2neo import Graph, Node, Relationship
class CodeGraphGenerator(ast.NodeVisitor):
    """Graph one Python file's classes, functions and intra-file calls.

    A first pass (the ``visit_*`` methods) creates ``Class`` and ``Function``
    nodes and remembers them by name in ``node_cache``; a second pass
    (``_analyze_calls``) adds ``CALLS`` relationships between cached nodes.
    """

    def __init__(self, uri="bolt://localhost:7687", user="neo4j", password="password"):
        """Connect to Neo4j and wipe the database.

        A failure is printed instead of raised. If the ``Graph`` object could
        not be created, ``self.graph`` stays ``None`` and ``parse_file``
        raises ``RuntimeError``.
        """
        # Define the attribute up front so a failed connection does not turn
        # into an AttributeError on first use.
        self.graph = None
        try:
            self.graph = Graph(uri, auth=(user, password))
            self.graph.delete_all()
        except Exception as e:
            print(f"Connection failed: {e}")
        self.node_cache = {}  # Stores created nodes to link relationships
        self.current_scope = None

    def parse_file(self, file_path):
        """Parse ``file_path`` and populate the graph in two passes.

        Raises:
            RuntimeError: if no graph connection was established.
        """
        if self.graph is None:
            raise RuntimeError("No Neo4j connection available; cannot populate the graph.")
        # Bytes let ast.parse honour the file's own encoding declaration.
        with open(file_path, "rb") as file:
            source = file.read()
        tree = ast.parse(source)
        file_node = Node("File", name=os.path.basename(file_path))
        self.graph.create(file_node)
        self.node_cache['module'] = file_node
        self.current_scope = file_node
        # First pass: Define all entities
        self.visit(tree)
        # Second pass: Create "CALLS" relationships
        self._analyze_calls(tree)

    def visit_ClassDef(self, node):
        """Create a ``Class`` node and visit its body with the class as scope."""
        class_node = Node("Class", name=node.name)
        self.graph.create(class_node)
        self.graph.create(Relationship(self.node_cache['module'], "DEFINES", class_node))
        old_scope = self.current_scope
        self.current_scope = class_node
        self.node_cache[node.name] = class_node
        self.generic_visit(node)
        self.current_scope = old_scope

    def visit_FunctionDef(self, node):
        """Create a ``Function`` node attached to the current scope."""
        func_node = Node("Function", name=node.name)
        self.graph.create(func_node)
        # Determine if it's a method or a standalone function.
        # Node.labels is a set-like view, not a mapping, so ask has_label().
        rel_type = "HAS_FUNCTION" if self.current_scope.has_label("Class") else "CONTAINS"
        self.graph.create(Relationship(self.current_scope, rel_type, func_node))
        self.node_cache[node.name] = func_node
        self.generic_visit(node)

    # ``async def`` functions are registered exactly like regular ones.
    visit_AsyncFunctionDef = visit_FunctionDef

    @staticmethod
    def _called_names(func_node):
        """Return the distinct bare names called inside ``func_node``.

        Only ``name(...)`` calls are reported (not ``obj.attr(...)``), in
        first-seen ``ast.walk`` order.
        """
        names = []
        for child in ast.walk(func_node):
            if isinstance(child, ast.Call) and isinstance(child.func, ast.Name):
                if child.func.id not in names:
                    names.append(child.func.id)
        return names

    def _analyze_calls(self, tree):
        """Walks the tree specifically looking for function calls."""
        linked = set()  # (caller name, callee name) pairs already written
        for node in ast.walk(tree):
            if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                continue
            caller_node = self.node_cache.get(node.name)
            if caller_node is None:
                continue
            for callee_name in self._called_names(node):
                callee_node = self.node_cache.get(callee_name)
                pair = (node.name, callee_name)
                # Skip names not defined in this file and pairs already linked,
                # so repeated call sites do not create duplicate edges.
                if callee_node is None or pair in linked:
                    continue
                linked.add(pair)
                self.graph.create(Relationship(caller_node, "CALLS", callee_node))
if __name__ == "__main__":
    # Example run: connect with placeholder credentials, then graph one file.
    generator = CodeGraphGenerator(password="your_password")
    generator.parse_file(file_path="your_script.py")
Key Enhancements
- Relationship Logic: Changed the naming convention for class methods to HAS_FUNCTION for better semantic clarity in the graph.
- Recursive Scope Tracking: Uses a current_scope pointer to correctly attribute functions to their parent classes or modules.
- Call Logic: The _analyze_calls method uses ast.walk to find ast.Call nodes. It cross-references the node_cache to ensure we only draw CALLS lines between functions defined within the parsed file.
Visualizing the Result
Once this is executed, you can run a Cypher query in the Neo4j browser to see the flow:
MATCH (f1:Function)-[r:CALLS]->(f2:Function)
RETURN f1, r, f2
This visualization helps pinpoint "God Functions" (nodes with many outgoing call relationships) or "Critical Dependencies" (nodes with many incoming call relationships).
Would you like to see how to incorporate the extraction of docstrings to populate node properties for better documentation indexing?
Yes
To incorporate docstrings and descriptive metadata, we modify the visitor pattern to extract the ast.get_docstring() attribute. This transforms your graph from a structural map into a semantic knowledge base, which is essential for impact analysis or feeding into an LLM for automated documentation.
Python Knowledge Graph Generator (with Docstrings & Metadata)
import ast
import os
from py2neo import Graph, Node, Relationship
class EnhancedCodeGraph(ast.NodeVisitor):
    """Graph one Python file with docstrings and line numbers as properties.

    The ``visit_*`` methods create ``Class`` and ``Function`` nodes (cached
    by name in ``node_cache``); ``_analyze_calls`` then adds ``CALLS``
    relationships between cached nodes.
    """

    def __init__(self, uri="bolt://localhost:7687", user="neo4j", password="password"):
        """Connect to Neo4j and wipe the database.

        A failure is printed instead of raised. If the ``Graph`` object could
        not be created, ``self.graph`` stays ``None`` and ``parse_file``
        raises ``RuntimeError``.
        """
        # Define the attribute up front so a failed connection does not turn
        # into an AttributeError on first use.
        self.graph = None
        try:
            self.graph = Graph(uri, auth=(user, password))
            self.graph.delete_all()
        except Exception as e:
            print(f"Connection failed: {e}")
        self.node_cache = {}
        self.current_scope = None

    def parse_file(self, file_path):
        """Parse ``file_path`` and populate the graph in two passes.

        Raises:
            RuntimeError: if no graph connection was established.
        """
        if self.graph is None:
            raise RuntimeError("No Neo4j connection available; cannot populate the graph.")
        # Bytes let ast.parse honour the file's own encoding declaration.
        with open(file_path, "rb") as file:
            source = file.read()
        tree = ast.parse(source)
        # File level metadata
        file_node = Node(
            "File",
            name=os.path.basename(file_path),
            path=file_path,
            docstring=ast.get_docstring(tree) or "No module docstring",
        )
        self.graph.create(file_node)
        self.node_cache['module'] = file_node
        self.current_scope = file_node
        self.visit(tree)
        self._analyze_calls(tree)

    def visit_ClassDef(self, node):
        """Create a ``Class`` node (with docstring) and visit its body."""
        # Extract Class metadata
        doc = ast.get_docstring(node) or "N/A"
        class_node = Node("Class", name=node.name, description=doc)
        self.graph.create(class_node)
        # Relationship logic
        self.graph.create(Relationship(self.node_cache['module'], "DEFINES", class_node))
        # Scope management
        old_scope = self.current_scope
        self.current_scope = class_node
        self.node_cache[node.name] = class_node
        self.generic_visit(node)
        self.current_scope = old_scope

    def visit_FunctionDef(self, node):
        """Create a ``Function`` node with docstring and starting line."""
        # Extract Function metadata
        doc = ast.get_docstring(node) or "N/A"
        line_no = node.lineno
        func_node = Node(
            "Function",
            name=node.name,
            description=doc,
            line_start=line_no,
        )
        self.graph.create(func_node)
        # Use HAS_xyz for class methods vs standalone functions.
        # Node.labels is a set-like view, not a mapping, so ask has_label().
        rel_label = "HAS_METHOD" if self.current_scope.has_label("Class") else "HAS_FUNCTION"
        self.graph.create(Relationship(self.current_scope, rel_label, func_node))
        self.node_cache[node.name] = func_node
        self.generic_visit(node)

    # Without this alias async functions never reach node_cache, so the
    # AsyncFunctionDef branch of _analyze_calls could never find a caller.
    visit_AsyncFunctionDef = visit_FunctionDef

    @staticmethod
    def _called_names(func_node):
        """Return the distinct bare names called inside ``func_node``.

        Only ``name(...)`` calls are reported (not ``obj.attr(...)``), in
        first-seen ``ast.walk`` order.
        """
        names = []
        for child in ast.walk(func_node):
            if isinstance(child, ast.Call) and isinstance(child.func, ast.Name):
                if child.func.id not in names:
                    names.append(child.func.id)
        return names

    def _analyze_calls(self, tree):
        """Maps interaction flow between discovered entities."""
        linked = set()  # (caller name, callee name) pairs already written
        for node in ast.walk(tree):
            if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                continue
            caller_node = self.node_cache.get(node.name)
            # Compare against None explicitly rather than relying on the
            # truthiness of a graph node object.
            if caller_node is None:
                continue
            for callee_name in self._called_names(node):
                callee_node = self.node_cache.get(callee_name)
                pair = (node.name, callee_name)
                # Skip names not defined in this file and pairs already linked,
                # so repeated call sites do not create duplicate edges.
                if callee_node is None or pair in linked:
                    continue
                linked.add(pair)
                self.graph.create(Relationship(caller_node, "CALLS", callee_node))
if __name__ == "__main__":
    # Swap in the path of whichever .py file should be analyzed.
    generator = EnhancedCodeGraph(password="your_secure_password")
    generator.parse_file(file_path="your_logic.py")
What This Enables
By adding the description (docstring) and line_start properties, you unlock several powerful use cases:
- Semantic Search: You can query the graph for specific keywords within the description property to find relevant logic across thousands of lines of code.
- Traceability: The line_start property allows you to jump directly from a graph visualization back to the specific line in your IDE.
- Dependency Auditing: You can run Cypher queries to find "orphaned" functions (functions with no incoming CALLS relationships) that might be candidates for deletion.
Visualization Strategy
When you open Neo4j Bloom or the Neo4j Browser, the resulting structure will look like a tree where the File is the root, branching out to Classes and Functions. The CALLS relationships will then create a "web" across those branches, highlighting how different parts of the system interact.
Would you be interested in expanding this to include Import Analysis, which would allow you to visualize how multiple files in a directory relate to one another?
Yes
To expand this into a multi-file System-Wide Knowledge Graph, we need to shift from analyzing a single file to traversing an entire directory. This version handles Import statements and maps relationships between different files, creating a high-level map of your project's architecture.
Directory-Level Knowledge Graph Generator
This script walks through a project folder, creates nodes for every .py file, and establishes DEPENDS_ON relationships when one file imports another.
import ast
import os
from py2neo import Graph, Node, Relationship
class ProjectGraphArch(ast.NodeVisitor):
    """Graph a directory of Python files and the imports between them.

    Every ``.py`` file becomes a ``File`` node. Imports that resolve to a
    file in the project become ``DEPENDS_ON`` relationships; all others
    become ``USES_LIBRARY`` relationships to an ``ExternalModule`` node.
    Classes are attached to their file with ``HAS_CLASS``.
    """

    def __init__(self, uri="bolt://localhost:7687", user="neo4j", password="password"):
        """Connect to Neo4j and wipe the database.

        A failure is printed instead of raised. If the ``Graph`` object could
        not be created, ``self.graph`` stays ``None`` and
        ``analyze_directory`` raises ``RuntimeError``.
        """
        # Define the attribute up front so a failed connection does not turn
        # into an AttributeError on first use.
        self.graph = None
        try:
            self.graph = Graph(uri, auth=(user, password))
            self.graph.delete_all()
        except Exception as e:
            print(f"Connection failed: {e}")
        self.file_nodes = {}  # Map relative file path to Node
        self.current_file_node = None
        self._current_rel_path = None  # relative path of the file being visited
        self._linked_targets = set()  # dependencies already linked for that file

    def analyze_directory(self, root_path):
        """Create File nodes for every ``.py`` under ``root_path``, then link them.

        Raises:
            RuntimeError: if no graph connection was established.
        """
        if self.graph is None:
            raise RuntimeError("No Neo4j connection available; cannot populate the graph.")
        # First Pass: Create all File nodes
        for root, _, files in os.walk(root_path):
            for file in files:
                if file.endswith(".py"):
                    full_path = os.path.join(root, file)
                    rel_path = os.path.relpath(full_path, root_path)
                    file_node = Node("File", name=file, path=rel_path)
                    self.graph.create(file_node)
                    self.file_nodes[rel_path] = file_node
        # Second Pass: Analyze imports and internal structure
        for rel_path, node in self.file_nodes.items():
            self.current_file_node = node
            self._current_rel_path = rel_path
            full_path = os.path.join(root_path, rel_path)
            self._process_file_contents(full_path)

    def _process_file_contents(self, full_path):
        """Parse one file and visit it; unparsable files are reported and skipped."""
        self._linked_targets = set()  # de-duplication is per file
        # Bytes let ast.parse honour the file's own encoding declaration.
        with open(full_path, "rb") as f:
            source = f.read()
        try:
            tree = ast.parse(source)
        except (SyntaxError, ValueError):
            # ValueError also covers undecodable bytes, so one bad file does
            # not abort the whole directory walk.
            print(f"Skipping {full_path} due to syntax error.")
            return
        self.visit(tree)

    def visit_Import(self, node):
        """Link every module named in an ``import a, b`` statement."""
        for alias in node.names:
            self._link_dependency(alias.name)

    def visit_ImportFrom(self, node):
        """Link the module of a ``from x import y`` statement.

        Relative imports are first resolved against the current file's
        directory so they can match project files.
        """
        if node.level:
            target = self._resolve_relative(
                self._current_rel_path or "", node.module, node.level
            )
        else:
            target = node.module
        if target:
            self._link_dependency(target)

    @staticmethod
    def _resolve_relative(current_rel_path, module, level):
        """Turn a relative import into a dotted name rooted at the project.

        Returns ``None`` when the import climbs above the project root or
        resolves to the root itself.
        """
        directory = os.path.dirname(current_rel_path)
        package_parts = [part for part in directory.split(os.sep) if part]
        hops = level - 1  # one dot means "this package", each extra dot goes up
        if hops > len(package_parts):
            return None
        base = package_parts[: len(package_parts) - hops]
        if module:
            base = base + module.split(".")
        return ".".join(base) or None

    @staticmethod
    def _candidate_paths(target_module):
        """Return relative file paths that could define ``target_module``."""
        parts = [part for part in target_module.split(".") if part]
        if not parts:
            return []
        # Built with os.path.join so they compare equal to os.path.relpath keys.
        stem = os.path.join(*parts)
        return [stem + ".py", os.path.join(stem, "__init__.py")]

    def _link_dependency(self, target_module):
        """Link the current file to ``target_module`` (local file or external)."""
        # Heuristic: the import is local if a module file or package
        # __init__.py with the matching relative path was discovered.
        local_path = None
        for candidate in self._candidate_paths(target_module):
            if candidate in self.file_nodes:
                local_path = candidate
                break
        if local_path is not None:
            key = ("DEPENDS_ON", local_path)
        else:
            key = ("USES_LIBRARY", target_module)
        if key in self._linked_targets:
            return  # already linked from this file
        self._linked_targets.add(key)
        if local_path is not None:
            target_node = self.file_nodes[local_path]
            self.graph.create(Relationship(self.current_file_node, "DEPENDS_ON", target_node))
        else:
            # It's an external library (like os, ast, or pip-installed)
            ext_node = Node("ExternalModule", name=target_module)
            self.graph.merge(ext_node, "ExternalModule", "name")
            self.graph.create(Relationship(self.current_file_node, "USES_LIBRARY", ext_node))

    # Re-using the Class visitor from the previous version
    def visit_ClassDef(self, node):
        """Create a ``Class`` node linked to the current file."""
        class_node = Node("Class", name=node.name, docstring=ast.get_docstring(node))
        self.graph.create(class_node)
        self.graph.create(Relationship(self.current_file_node, "HAS_CLASS", class_node))
        self.generic_visit(node)
if __name__ == "__main__":
    # Example run: connect with placeholder credentials, then graph a project tree.
    generator = ProjectGraphArch(password="your_password")
    generator.analyze_directory(root_path="./my_python_project")
    print("Multi-file architecture graph complete.")
Understanding the Multi-File Schema
By aggregating multiple files, your graph schema now includes a higher layer of abstraction:
| Relationship | Source | Target | Use Case |
| DEPENDS_ON | File | File | Visualizes the "import web" of your local project. |
| USES_LIBRARY | File | ExternalModule | Identifies which parts of your code rely on third-party packages. |
| HAS_CLASS | File | Class | Maps the object-oriented structure within the file system. |
Advanced Querying: Finding the "Linchpin"
Once the graph is populated, you can run a Centrality Query to find the most critical file in your project—the one that, if changed, would have the largest impact:
MATCH (dependent:File)-[:DEPENDS_ON]->(core:File)
RETURN core.name, count(dependent) AS IncomingDependencies
ORDER BY IncomingDependencies DESC
This identifies the "core" modules that almost every other file relies on, helping you prioritize testing and careful refactoring for those specific files.
From <https://gemini.google.com/app/e7e769f9ca27cda1> Google Gemini 3.0 (Fast)

Comments
Post a Comment